From a24a75aee125415b092672b3aae3a49cdcdd5a0d Mon Sep 17 00:00:00 2001 From: Fausto Morales Date: Tue, 28 Jan 2020 23:41:39 -0600 Subject: [PATCH 0001/1533] Pad keras.backend.ctc_decode output to consistent shape. --- tensorflow/python/keras/backend.py | 5 ++++- tensorflow/python/keras/backend_test.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index da5b241c52a..e8e2b72f88c 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -5824,10 +5824,13 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1): contains the decoded sequence. If `false`, returns the `top_paths` most probable decoded sequences. + Each decoded sequence has shape (samples, time_steps). Important: blank labels are returned as `-1`. Tensor `(top_paths, )` that contains the log probability of each decoded sequence. """ + input_shape = shape(y_pred) + samples, steps = input_shape[0], input_shape[1] y_pred = math_ops.log(array_ops.transpose(y_pred, perm=[1, 0, 2]) + epsilon()) input_length = math_ops.cast(input_length, dtypes_module.int32) @@ -5842,7 +5845,7 @@ def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1): top_paths=top_paths) decoded_dense = [ sparse_ops.sparse_to_dense( - st.indices, st.dense_shape, st.values, default_value=-1) + st.indices, (samples, steps), st.values, default_value=-1) for st in decoded ] return (decoded_dense, log_prob) diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py index e0eccfc7c44..2dff49e61e6 100644 --- a/tensorflow/python/keras/backend_test.py +++ b/tensorflow/python/keras/backend_test.py @@ -1771,7 +1771,7 @@ class TestCTC(test.TestCase): -3.777835 # output beam 1 ], np.float32)[np.newaxis, :] - decode_truth = [np.array([1, 0]), np.array([0, 1, 0])] + decode_truth = [np.array([1, 0, -1, -1, -1, -1, -1]), np.array([0, 1, 0, -1, -1, -1 ,-1])] beam_width = 2 top_paths = 2 From 5f1ee72e97e09da9171b2c226d4f10f05e89a38a Mon Sep 17 00:00:00 2001 From: Milan Straka Date: Sun, 23 Feb 2020 13:58:03 +0100 Subject: [PATCH 0002/1533] Fix label_smoothing in multidimensional CategoricalCrossentropy. When label smoothing in CategoricalCrossentropy is non-zero, it takes tf.shape(y_true)[1] as the number of classes. However, if the true values and predictions are multidimensional (for example when training a POS tagger where batch elements are sentences composed of words), a wrong value is taken and the training does not work. This fix takes the _last_ dimension as the one containing classes. 
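
As a quick illustration of the failure mode described above — a minimal sketch, not taken from the patch; the 2x5x4 toy batch, tensor names and shapes are assumptions chosen only to mimic the POS-tagging case:

    import tensorflow as tf

    # Hypothetical sequence-labelling batch: 2 sentences, 5 tokens, 4 tags.
    y_true = tf.one_hot(
        tf.random.uniform((2, 5), maxval=4, dtype=tf.int32), depth=4)
    label_smoothing = 0.1

    # Old behaviour: tf.shape(y_true)[1] picks up the time dimension (5),
    # not the number of classes, once y_true is rank 3.
    num_classes_old = tf.cast(tf.shape(y_true)[1], y_true.dtype)   # 5.0
    # Fixed behaviour: the last dimension is the class axis for any rank.
    num_classes_new = tf.cast(tf.shape(y_true)[-1], y_true.dtype)  # 4.0

    # Same smoothing formula as _smooth_labels in losses.py.
    smoothed = (y_true * (1.0 - label_smoothing)
                + label_smoothing / num_classes_new)
    print(tf.reduce_sum(smoothed, axis=-1))  # ~1.0 per token with the fix

With the old axis each token's smoothed target sums to 0.98 rather than 1.0, which is how the wrong class count quietly degrades training for multidimensional targets.
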
--- tensorflow/python/keras/losses.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index 061e31140b7..a1046bb5689 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -1084,7 +1084,7 @@ def categorical_crossentropy(y_true, label_smoothing = ops.convert_to_tensor_v2(label_smoothing, dtype=K.floatx()) def _smooth_labels(): - num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype) + num_classes = math_ops.cast(array_ops.shape(y_true)[-1], y_pred.dtype) return y_true * (1.0 - label_smoothing) + (label_smoothing / num_classes) y_true = smart_cond.smart_cond(label_smoothing, From f3f5afbe7bf3499e8735df0655344e7dc7ae554e Mon Sep 17 00:00:00 2001 From: Janosh Riebesell Date: Wed, 26 Feb 2020 10:38:36 +0000 Subject: [PATCH 0003/1533] docs: add tip to prefer tf.shape(x) over x.shape when writing custom layers/models See #36991 for details. --- tensorflow/python/ops/array_ops.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 50afcfbc6e0..4f03b985b69 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -557,6 +557,14 @@ def shape_v2(input, out_type=dtypes.int32, name=None): >>> a.shape TensorShape([None, None, 10]) + + However, when defining custom layers and models that will be run in graph mode + at some point, prefer `tf.shape(x)` over `x.shape`. `x.shape` is the static shape + of `x` and usually evaluates to `None` in the first dimension during graph + construction (to represent the as yet unknown batch size). This can cause problems in + function calls like `tf.zeros(x.shape[0])` which don't support `None` values. + `tf.shape(x)` on the other hand gives the dynamic shape of `x` which isn't + evaluated until training/predicting begins where the full shape of `x` is known. `tf.shape` and `Tensor.shape` should be identical in eager mode. Within `tf.function` or within a `compat.v1` context, not all dimensions may be From 82a6ba4c50b3880dfd98d4a1706d0c7422ad7639 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cjaketae=E2=80=9D?= Date: Mon, 2 Mar 2020 06:46:35 +0900 Subject: [PATCH 0004/1533] Fix plot_model for PDF --- tensorflow/python/keras/utils/vis_utils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py index 1af6e00cbf9..d09fd30bd7c 100644 --- a/tensorflow/python/keras/utils/vis_utils.py +++ b/tensorflow/python/keras/utils/vis_utils.py @@ -293,8 +293,9 @@ def plot_model(model, # Return the image as a Jupyter Image object, to be displayed in-line. # Note that we cannot easily detect whether the code is running in a # notebook, and thus we always return the Image if Jupyter is available. 
- try: - from IPython import display - return display.Image(filename=to_file) - except ImportError: - pass + if extension != 'pdf': + try: + from IPython import display + return display.Image(filename=to_file) + except ImportError: + pass From aa070f5be6bfc55c25802540da149138696f965f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cjaketae=E2=80=9D?= Date: Mon, 9 Mar 2020 18:49:32 +0900 Subject: [PATCH 0005/1533] Minor refactoring of conditionals --- .../keras/applications/imagenet_utils.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py index 8f698a1a1e4..5c3fdbe20b0 100644 --- a/tensorflow/python/keras/applications/imagenet_utils.py +++ b/tensorflow/python/keras/applications/imagenet_utils.py @@ -66,11 +66,11 @@ PREPROCESS_INPUT_DOC = """ {ret} Raises: - ValueError: In case of unknown `data_format` argument. + ValueError: In case of unknown `mode` or `data_format` argument. """ PREPROCESS_INPUT_MODE_DOC = """ - mode: One of "caffe", "tf" or "torch". + mode: One of "caffe", "tf" or "torch". Defaults to "caffe". - caffe: will convert the images from RGB to BGR, then will zero-center each color channel with respect to the ImageNet dataset, @@ -97,9 +97,12 @@ PREPROCESS_INPUT_RET_DOC_CAFFE = """ @keras_export('keras.applications.imagenet_utils.preprocess_input') def preprocess_input(x, data_format=None, mode='caffe'): """Preprocesses a tensor or Numpy array encoding a batch of images.""" + if mode not in {'caffe', 'tf','torch'}: + raise ValueError('Unknown mode ' + str(mode)) + if data_format is None: data_format = backend.image_data_format() - if data_format not in {'channels_first', 'channels_last'}: + elif data_format not in {'channels_first', 'channels_last'}: raise ValueError('Unknown data_format ' + str(data_format)) if isinstance(x, np.ndarray): @@ -182,8 +185,7 @@ def _preprocess_numpy_input(x, data_format, mode): x /= 127.5 x -= 1. return x - - if mode == 'torch': + elif mode == 'torch': x /= 255. mean = [0.485, 0.456, 0.406] std = [0.229, 0.224, 0.225] @@ -253,8 +255,7 @@ def _preprocess_symbolic_input(x, data_format, mode): x /= 127.5 x -= 1. return x - - if mode == 'torch': + elif mode == 'torch': x /= 255. 
mean = [0.485, 0.456, 0.406] std = [0.229, 0.224, 0.225] @@ -414,10 +415,10 @@ def validate_activation(classifier_activation, weights): return classifier_activation = activations.get(classifier_activation) - if classifier_activation not in [ + if classifier_activation not in { activations.get('softmax'), activations.get(None) - ]: + }: raise ValueError('Only `None` and `softmax` activations are allowed ' 'for the `classifier_activation` argument when using ' 'pretrained weights, with `include_top=True`') From f673a86e1c94944957487c41253ff19b50175a07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cjaketae=E2=80=9D?= Date: Tue, 10 Mar 2020 19:04:35 +0900 Subject: [PATCH 0006/1533] Fixed whitespace --- tensorflow/python/keras/applications/imagenet_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py index 5c3fdbe20b0..4b73b1c9bf9 100644 --- a/tensorflow/python/keras/applications/imagenet_utils.py +++ b/tensorflow/python/keras/applications/imagenet_utils.py @@ -97,7 +97,7 @@ PREPROCESS_INPUT_RET_DOC_CAFFE = """ @keras_export('keras.applications.imagenet_utils.preprocess_input') def preprocess_input(x, data_format=None, mode='caffe'): """Preprocesses a tensor or Numpy array encoding a batch of images.""" - if mode not in {'caffe', 'tf','torch'}: + if mode not in {'caffe', 'tf', 'torch'}: raise ValueError('Unknown mode ' + str(mode)) if data_format is None: From db1e40b9c5ad3c6acc36b0d33e08058787b11725 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cjaketae=E2=80=9D?= Date: Wed, 11 Mar 2020 08:29:05 +0900 Subject: [PATCH 0007/1533] Edited docstrings, added unit test, and fixed minor typos --- tensorflow/python/keras/applications/densenet.py | 4 +++- .../python/keras/applications/imagenet_utils.py | 16 +++++++++++++--- .../keras/applications/imagenet_utils_test.py | 5 +++++ .../keras/applications/inception_resnet_v2.py | 4 +++- .../python/keras/applications/inception_v3.py | 4 +++- .../python/keras/applications/mobilenet.py | 4 +++- .../python/keras/applications/mobilenet_v2.py | 4 +++- tensorflow/python/keras/applications/nasnet.py | 4 +++- tensorflow/python/keras/applications/resnet.py | 4 +++- .../python/keras/applications/resnet_v2.py | 4 +++- tensorflow/python/keras/applications/vgg16.py | 4 +++- tensorflow/python/keras/applications/vgg19.py | 3 ++- 12 files changed, 47 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/keras/applications/densenet.py b/tensorflow/python/keras/applications/densenet.py index 9b11c342536..2dfd1482fbf 100644 --- a/tensorflow/python/keras/applications/densenet.py +++ b/tensorflow/python/keras/applications/densenet.py @@ -393,7 +393,9 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TORCH) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TORCH, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ DOC = """ diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py index 4b73b1c9bf9..5de98527cf4 100644 --- a/tensorflow/python/keras/applications/imagenet_utils.py +++ b/tensorflow/python/keras/applications/imagenet_utils.py @@ -66,7 +66,7 @@ PREPROCESS_INPUT_DOC = """ {ret} Raises: - ValueError: In case of unknown `mode` or `data_format` argument. 
+ {error} """ PREPROCESS_INPUT_MODE_DOC = """ @@ -82,12 +82,21 @@ PREPROCESS_INPUT_MODE_DOC = """ ImageNet dataset. """ +PREPROCESS_INPUT_DEFAULT_ERROR_DOC = """ + ValueError: In case of unknown `mode` or `data_format` argument. + """ + +PREPROCESS_INPUT_ERROR_DOC = """ + ValueError: In case of unknown `data_format` argument. + """ + + PREPROCESS_INPUT_RET_DOC_TF = """ The inputs pixel values are scaled between -1 and 1, sample-wise.""" PREPROCESS_INPUT_RET_DOC_TORCH = """ The input pixels values are scaled between 0 and 1 and each channel is - normalized with respect to the InageNet dataset.""" + normalized with respect to the ImageNet dataset.""" PREPROCESS_INPUT_RET_DOC_CAFFE = """ The images are converted from RGB to BGR, then each color channel is @@ -114,7 +123,8 @@ def preprocess_input(x, data_format=None, mode='caffe'): preprocess_input.__doc__ = PREPROCESS_INPUT_DOC.format( - mode=PREPROCESS_INPUT_MODE_DOC, ret='') + mode=PREPROCESS_INPUT_MODE_DOC, ret='', + error=PREPROCESS_INPUT_DEFAULT_ERROR_DOC) @keras_export('keras.applications.imagenet_utils.decode_predictions') diff --git a/tensorflow/python/keras/applications/imagenet_utils_test.py b/tensorflow/python/keras/applications/imagenet_utils_test.py index f37ae8c188c..7f7698606d7 100644 --- a/tensorflow/python/keras/applications/imagenet_utils_test.py +++ b/tensorflow/python/keras/applications/imagenet_utils_test.py @@ -29,6 +29,11 @@ from tensorflow.python.platform import test class TestImageNetUtils(keras_parameterized.TestCase): def test_preprocess_input(self): + # Test invalid mode check + x = np.random.uniform(0, 255, (10, 10, 3)) + with self.assertRaises(ValueError): + utils.preprocess_input(x, mode='some_unknown_mode') + # Test image batch with float and int image input x = np.random.uniform(0, 255, (2, 10, 10, 3)) xint = x.astype('int32') diff --git a/tensorflow/python/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/applications/inception_resnet_v2.py index 7f338f82597..51b52eaf3d6 100644 --- a/tensorflow/python/keras/applications/inception_resnet_v2.py +++ b/tensorflow/python/keras/applications/inception_resnet_v2.py @@ -412,5 +412,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/inception_v3.py b/tensorflow/python/keras/applications/inception_v3.py index fa44becfe48..6a130b8ba49 100644 --- a/tensorflow/python/keras/applications/inception_v3.py +++ b/tensorflow/python/keras/applications/inception_v3.py @@ -440,5 +440,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py index d935282f98a..806fafd42c4 100644 --- a/tensorflow/python/keras/applications/mobilenet.py +++ b/tensorflow/python/keras/applications/mobilenet.py @@ -473,5 +473,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = 
imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/mobilenet_v2.py b/tensorflow/python/keras/applications/mobilenet_v2.py index bdd21c3da62..7dc0fd907eb 100644 --- a/tensorflow/python/keras/applications/mobilenet_v2.py +++ b/tensorflow/python/keras/applications/mobilenet_v2.py @@ -506,5 +506,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/nasnet.py b/tensorflow/python/keras/applications/nasnet.py index 3da415dbb12..238f3c9c64d 100644 --- a/tensorflow/python/keras/applications/nasnet.py +++ b/tensorflow/python/keras/applications/nasnet.py @@ -819,5 +819,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/resnet.py b/tensorflow/python/keras/applications/resnet.py index 3e33bb04bdd..1713e823b1c 100644 --- a/tensorflow/python/keras/applications/resnet.py +++ b/tensorflow/python/keras/applications/resnet.py @@ -553,7 +553,9 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ DOC = """ diff --git a/tensorflow/python/keras/applications/resnet_v2.py b/tensorflow/python/keras/applications/resnet_v2.py index 2e1ee272c4b..aa52d0047f0 100644 --- a/tensorflow/python/keras/applications/resnet_v2.py +++ b/tensorflow/python/keras/applications/resnet_v2.py @@ -164,7 +164,9 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ DOC = """ diff --git a/tensorflow/python/keras/applications/vgg16.py b/tensorflow/python/keras/applications/vgg16.py index 534d2cff6be..b1f3cbfa52e 100644 --- a/tensorflow/python/keras/applications/vgg16.py +++ b/tensorflow/python/keras/applications/vgg16.py @@ -262,5 +262,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/vgg19.py 
b/tensorflow/python/keras/applications/vgg19.py index 81c90e1ebb4..90605d40b51 100644 --- a/tensorflow/python/keras/applications/vgg19.py +++ b/tensorflow/python/keras/applications/vgg19.py @@ -267,5 +267,6 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE) + mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ From 349e30095e5ba7fbe274de782c8288b545720bd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cjaketae=E2=80=9D?= Date: Fri, 20 Mar 2020 02:01:05 +0900 Subject: [PATCH 0008/1533] Resolve key error and add missing docs --- .../keras/applications/imagenet_utils.py | 7 ++--- .../keras/applications/inception_resnet_v2.py | 3 +- .../python/keras/applications/inception_v3.py | 3 +- .../python/keras/applications/mobilenet.py | 3 +- .../python/keras/applications/mobilenet_v2.py | 30 +++++++++++++++++-- .../python/keras/applications/nasnet.py | 5 ++-- .../python/keras/applications/resnet_v2.py | 2 +- tensorflow/python/keras/applications/vgg16.py | 2 +- tensorflow/python/keras/applications/vgg19.py | 3 +- .../python/keras/applications/xception.py | 4 ++- 10 files changed, 46 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/keras/applications/imagenet_utils.py b/tensorflow/python/keras/applications/imagenet_utils.py index 5de98527cf4..e0650f7c4a8 100644 --- a/tensorflow/python/keras/applications/imagenet_utils.py +++ b/tensorflow/python/keras/applications/imagenet_utils.py @@ -83,13 +83,10 @@ PREPROCESS_INPUT_MODE_DOC = """ """ PREPROCESS_INPUT_DEFAULT_ERROR_DOC = """ - ValueError: In case of unknown `mode` or `data_format` argument. - """ + ValueError: In case of unknown `mode` or `data_format` argument.""" PREPROCESS_INPUT_ERROR_DOC = """ - ValueError: In case of unknown `data_format` argument. - """ - + ValueError: In case of unknown `data_format` argument.""" PREPROCESS_INPUT_RET_DOC_TF = """ The inputs pixel values are scaled between -1 and 1, sample-wise.""" diff --git a/tensorflow/python/keras/applications/inception_resnet_v2.py b/tensorflow/python/keras/applications/inception_resnet_v2.py index 51b52eaf3d6..5c2bd20f782 100644 --- a/tensorflow/python/keras/applications/inception_resnet_v2.py +++ b/tensorflow/python/keras/applications/inception_resnet_v2.py @@ -389,7 +389,8 @@ def preprocess_input(x, data_format=None): Raises ValueError: In case of unknown `data_format` argument. """ - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode='tf') @keras_export('keras.applications.inception_resnet_v2.decode_predictions') diff --git a/tensorflow/python/keras/applications/inception_v3.py b/tensorflow/python/keras/applications/inception_v3.py index 6a130b8ba49..fc38d61a6d0 100644 --- a/tensorflow/python/keras/applications/inception_v3.py +++ b/tensorflow/python/keras/applications/inception_v3.py @@ -417,7 +417,8 @@ def preprocess_input(x, data_format=None): Raises ValueError: In case of unknown `data_format` argument. 
""" - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode='tf') @keras_export('keras.applications.inception_v3.decode_predictions') diff --git a/tensorflow/python/keras/applications/mobilenet.py b/tensorflow/python/keras/applications/mobilenet.py index 806fafd42c4..445d5ac1b52 100644 --- a/tensorflow/python/keras/applications/mobilenet.py +++ b/tensorflow/python/keras/applications/mobilenet.py @@ -450,7 +450,8 @@ def preprocess_input(x, data_format=None): Raises ValueError: In case of unknown `data_format` argument. """ - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode='tf') @keras_export('keras.applications.mobilenet.decode_predictions') diff --git a/tensorflow/python/keras/applications/mobilenet_v2.py b/tensorflow/python/keras/applications/mobilenet_v2.py index 7dc0fd907eb..721fdf252ca 100644 --- a/tensorflow/python/keras/applications/mobilenet_v2.py +++ b/tensorflow/python/keras/applications/mobilenet_v2.py @@ -497,16 +497,42 @@ def _make_divisible(v, divisor, min_value=None): @keras_export('keras.applications.mobilenet_v2.preprocess_input') def preprocess_input(x, data_format=None): - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + """Preprocesses a numpy array encoding a batch of images. + + Arguments + x: A 4D numpy array consists of RGB values within [0, 255]. + + Returns + Preprocessed array. + + Raises + ValueError: In case of unknown `data_format` argument. + """ + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode='tf') @keras_export('keras.applications.mobilenet_v2.decode_predictions') def decode_predictions(preds, top=5): + """Decodes the prediction result from the model. + + Arguments + preds: Numpy tensor encoding a batch of predictions. + top: Integer, how many top-guesses to return. + + Returns + A list of lists of top class prediction tuples + `(class_name, class_description, score)`. + One list of tuples per sample in batch input. + + Raises + ValueError: In case of invalid shape of the `preds` array (must be 2D). + """ return imagenet_utils.decode_predictions(preds, top=top) preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/nasnet.py b/tensorflow/python/keras/applications/nasnet.py index 238f3c9c64d..d7ed2eb3823 100644 --- a/tensorflow/python/keras/applications/nasnet.py +++ b/tensorflow/python/keras/applications/nasnet.py @@ -796,7 +796,8 @@ def preprocess_input(x, data_format=None): Raises ValueError: In case of unknown `data_format` argument. 
""" - return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') + return imagenet_utils.preprocess_input( + x, data_format=data_format, mode='tf') @keras_export('keras.applications.nasnet.decode_predictions') @@ -819,7 +820,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/resnet_v2.py b/tensorflow/python/keras/applications/resnet_v2.py index aa52d0047f0..2c733ba0e29 100644 --- a/tensorflow/python/keras/applications/resnet_v2.py +++ b/tensorflow/python/keras/applications/resnet_v2.py @@ -164,7 +164,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/vgg16.py b/tensorflow/python/keras/applications/vgg16.py index b1f3cbfa52e..3bbfe16515c 100644 --- a/tensorflow/python/keras/applications/vgg16.py +++ b/tensorflow/python/keras/applications/vgg16.py @@ -262,7 +262,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', + mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/vgg19.py b/tensorflow/python/keras/applications/vgg19.py index 90605d40b51..738aa41465d 100644 --- a/tensorflow/python/keras/applications/vgg19.py +++ b/tensorflow/python/keras/applications/vgg19.py @@ -267,6 +267,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_CAFFE, error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ diff --git a/tensorflow/python/keras/applications/xception.py b/tensorflow/python/keras/applications/xception.py index 5ea0f14cc79..b689607b15a 100644 --- a/tensorflow/python/keras/applications/xception.py +++ b/tensorflow/python/keras/applications/xception.py @@ -346,5 +346,7 @@ def decode_predictions(preds, top=5): preprocess_input.__doc__ = imagenet_utils.PREPROCESS_INPUT_DOC.format( - mode='', ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF) + mode='', + ret=imagenet_utils.PREPROCESS_INPUT_RET_DOC_TF, + error=imagenet_utils.PREPROCESS_INPUT_ERROR_DOC) decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__ From 6f042c81d73079d226c10cc21832d4b2e61ca32a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Fri, 20 Mar 2020 07:22:09 +0100 Subject: [PATCH 0009/1533] TFLu: remove -fno-builtin compiler flag The flag may cause performance issues, since it disables special handling and optimizations of standard C library functions. 
--- tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc | 1 - tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc | 1 - 6 files changed, 6 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc index 9494158cd50..aa221174d0c 100644 --- a/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc @@ -40,7 +40,6 @@ $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)/$(SF_BSPS_DEST): $(MAKEFILE_DIR)/downlo -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ diff --git a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc index 878067cf083..3f3e2ce425d 100644 --- a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc @@ -19,7 +19,6 @@ ifeq ($(TARGET), bluepill) -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ diff --git a/tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc b/tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc index 8b24f5beb92..e899cbd0672 100644 --- a/tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/ecm3531_makefile.inc @@ -27,7 +27,6 @@ ifeq ($(TARGET), ecm3531) -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ diff --git a/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc b/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc index 85e5aa7154d..bfeec5e55a2 100644 --- a/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/hexagon_makefile.inc @@ -39,7 +39,6 @@ ifeq ($(TARGET), hexagon) -fdata-sections \ -ffunction-sections \ -fmessage-length=0 \ - -fno-builtin \ -fno-delete-null-pointer-checks \ -fno-exceptions \ -fno-register-global-dtors-with-atexit \ diff --git a/tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc b/tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc index 7336c520b11..9062f25254e 100644 --- a/tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/mcu_riscv_makefile.inc @@ -16,7 +16,6 @@ ifeq ($(TARGET), riscv32_mcu) -DTF_LITE_MCU_DEBUG_LOG \ -DTF_LITE_USE_GLOBAL_ROUND \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ diff --git a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc index 539f4528d06..24b36f119a2 100644 --- a/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/stm32f4_makefile.inc @@ -16,7 +16,6 @@ ifeq ($(TARGET), stm32f4) -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ - -fno-builtin \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ From 
d4d23502bfc20172a34ce36b8129696e1b63bfa7 Mon Sep 17 00:00:00 2001 From: ShengYang1 Date: Sun, 29 Mar 2020 16:23:03 +0800 Subject: [PATCH 0010/1533] Fuse BN and Relu in mkl path --- tensorflow/core/graph/mkl_layout_pass.cc | 56 +++- tensorflow/core/graph/mkl_layout_pass_test.cc | 106 +++++++ .../grappler/optimizers/mkl_remapper_test.cc | 173 +++++++++++ .../core/grappler/optimizers/remapper.cc | 23 +- tensorflow/core/kernels/BUILD | 9 +- .../core/kernels/mkl_fused_batch_norm_op.cc | 272 +++++++++++++----- tensorflow/core/ops/mkl_nn_ops.cc | 42 +++ tensorflow/core/ops/nn_ops.cc | 4 + 8 files changed, 601 insertions(+), 84 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index c27c7aa911b..e5d0fbfbd09 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -268,6 +268,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.dequantize = "Dequantize"; csinfo_.fused_batch_norm = "FusedBatchNorm"; csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad"; + csinfo_.fused_batch_norm_ex = "_FusedBatchNormEx"; csinfo_.fused_batch_norm_v2 = "FusedBatchNormV2"; csinfo_.fused_batch_norm_grad_v2 = "FusedBatchNormGradV2"; csinfo_.fused_batch_norm_v3 = "FusedBatchNormV3"; @@ -294,6 +295,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { "_MklDepthwiseConv2dNativeBackpropInput"; csinfo_.mkl_depthwise_conv2d_grad_filter = "_MklDepthwiseConv2dNativeBackpropFilter"; + csinfo_.mkl_fused_batch_norm_ex = "_MklFusedBatchNormEx"; csinfo_.mkl_fused_conv2d = "_MklFusedConv2D"; csinfo_.mkl_fused_matmul = "_MklFusedMatMul"; csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D"; @@ -476,6 +478,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { {csinfo_.fused_batch_norm_grad_v3, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad_v3), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); +#ifdef ENABLE_MKLDNN_V1 + rinfo_.push_back({csinfo_.fused_batch_norm_ex, + csinfo_.mkl_fused_batch_norm_ex, CopyAttrsAll, + FusedBatchNormExRewrite, kRewriteForLayoutPropagation}); +#endif rinfo_.push_back({csinfo_.fused_conv2d, csinfo_.mkl_fused_conv2d, CopyAttrsFusedConv2D, FusedConv2DRewrite, kRewriteForLayoutPropagation}); @@ -920,6 +927,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { string dequantize; string fused_batch_norm; string fused_batch_norm_grad; + string fused_batch_norm_ex; string fused_batch_norm_v2; string fused_batch_norm_grad_v2; string fused_batch_norm_v3; @@ -944,6 +952,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { string mkl_conv2d_with_bias; string mkl_depthwise_conv2d_grad_input; string mkl_depthwise_conv2d_grad_filter; + string mkl_fused_batch_norm_ex; string mkl_fused_conv2d; string mkl_fused_matmul; string mkl_pad_with_conv2d; @@ -1652,6 +1661,31 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return do_rewrite; } + static bool FusedBatchNormExRewrite(const Node* n) { + CHECK_NOTNULL(n); + + int num_side_inputs; + TF_CHECK_OK(GetNodeAttr(n->def(), "num_side_inputs", &num_side_inputs)); + string activation_mode; + TF_CHECK_OK(GetNodeAttr(n->def(), "activation_mode", &activation_mode)); + + // if the num_side_inputs is not 0, don't rewrite the node. + if (num_side_inputs != 0) { + VLOG(1) << "FusedBatchNormExRewrite: The model sets num_side_inputs" + << "larger than 0 is not optimized by Intel MKL."; + return false; + } + + // if the activation_mode is not 'Relu', don't rewrite the node. 
+ if (activation_mode != "Relu") { + VLOG(1) << "FusedBatchNormExRewrite: Only Relu activation mode is" + << "supported by Intel MKL."; + return false; + } + + return true; + } + static bool FusedConv2DRewrite(const Node* n) { // MKL DNN currently doesn't support all fusions that grappler fuses // together with Conv2D (ex. batchnorm). We rewrite _FusedConv2D only if @@ -2131,9 +2165,6 @@ int MklLayoutRewritePass::SetUpContiguousInputs( // Number of input slots to original op // Input slots are represented by .Input() calls in REGISTER_OP. int old_node_input_slots = old_node->op_def().input_arg_size(); - // Actual number of inputs can be greater than or equal to number - // of Input slots because inputs of type list could be unfolded. - CHECK_GE(old_node_inputs.size(), old_node_input_slots); int nn_slot_idx = 0; // slot index for inputs of new node // Let's copy all inputs (TF tensors) of original node to new node. @@ -2141,13 +2172,14 @@ int MklLayoutRewritePass::SetUpContiguousInputs( for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) { // An input slot could be a single tensor or a list. We need // to handle this case accordingly. - CHECK_LT(iidx, old_node_inputs.size()); const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx); if (ArgIsList(arg)) { std::vector new_node_inputs; - int N = GetTensorListLength(arg, old_node); - GetNodesProducingTFTensorList(old_node_inputs, &iidx, N, - &new_node_inputs); + int tensor_list_length = GetTensorListLength(arg, old_node); + if (tensor_list_length != 0) { + GetNodesProducingTFTensorList(old_node_inputs, &iidx, + tensor_list_length, &new_node_inputs); + } nb->Input(new_node_inputs); nn_slot_idx++; } else { @@ -2180,13 +2212,14 @@ int MklLayoutRewritePass::SetUpContiguousInputs( for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) { // An input slot could be a single tensor or a list. We need // to handle this case accordingly. 
- CHECK_LT(iidx, old_node_inputs.size()); const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx); if (ArgIsList(arg)) { std::vector new_node_inputs; - int N = GetTensorListLength(arg, old_node); - GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N, - &new_node_inputs); + int tensor_list_length = GetTensorListLength(arg, old_node); + if (tensor_list_length != 0) { + GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, + tensor_list_length, &new_node_inputs); + } nb->Input(new_node_inputs); nn_slot_idx++; } else { @@ -3702,6 +3735,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { n->type_string() != csinfo_.pad_with_conv2d && n->type_string() != csinfo_.pad_with_fused_conv2d && n->type_string() != csinfo_.conv2d_grad_filter_with_bias && + n->type_string() != csinfo_.fused_batch_norm_ex && n->type_string() != csinfo_.fused_conv2d && n->type_string() != csinfo_.fused_matmul && !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 02979d3ac2d..e5a50c27f0b 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3108,6 +3108,112 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormV3_Negative) { "B->F:1;C->F:2;D->F:3;E->F:4;F->G:1"); } +#ifdef ENABLE_MKLDNN_V1 +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + InitGraph("node { name: 'A' op: '" #INPUT \ + "'}" \ + "node { name: 'B' op: 'Input'}" \ + "node { name: 'C' op: 'Input'}" \ + "node { name: 'D' op: 'Input'}" \ + "node { name: 'E' op: 'Input'}" \ + "node { name: 'F' op: '_FusedBatchNormEx'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " attr { key: 'U' value { type: DT_FLOAT } }" \ + " attr { key: 'data_format' value { s: 'NCHW' } }" \ + " attr { key: 'epsilon' value { f: 0.0001 } }" \ + " attr { key: 'num_side_inputs' value { i: 0 } }" \ + " attr { key: 'is_training' value { b: true } }" \ + " attr { key: 'activation_mode' value { s: 'Relu' } }" \ + " input: ['A', 'B', 'C', 'D', 'E'] }" \ + "node { name: 'G' op: 'Zeta'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " input: ['A', 'F'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT \ + ");B(Input);C(Input);D(Input);" \ + "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);" \ + "DMT/_4(Const);E(Input);" \ + "F(_MklFusedBatchNormEx);G(Zeta)|A->F;A->G;" \ + "A:control->DMT/_0:control;A:control->DMT/_1:control;" \ + "A:control->DMT/_2:control;A:control->DMT/_3:control;" \ + "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;" \ + "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;" \ + "E->F:4;F->G:1"); \ + } +REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Positive); +#undef REGISTER_TEST + +// Rewrite test for _FusedBatchNormEx Op with side input +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + InitGraph("node { name: 'A' op: '" #INPUT \ + "'}" \ + "node { name: 'B' op: 'Input'}" \ + "node { name: 'C' op: 'Input'}" \ + "node { name: 'D' op: 'Input'}" \ + "node { name: 'E' op: 'Input'}" \ + "node { name: 'F' op: '" #INPUT \ + "'}" \ + "node { name: 'G' op: '_FusedBatchNormEx'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " attr { key: 'U' value { type: DT_FLOAT } }" \ + " attr { key: 'data_format' value { s: 'NCHW' } }" \ + " attr { key: 'epsilon' value { f: 0.0001 } }" \ + " attr { key: 
'num_side_inputs' value { i: 1 } }" \ + " attr { key: 'is_training' value { b: true } }" \ + " attr { key: 'activation_mode' value { s: 'Relu' } }" \ + " input: ['A', 'B', 'C', 'D', 'E', 'F'] }" \ + "node { name: 'H' op: 'Zeta'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " input: ['A', 'G'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT \ + ");B(Input);C(Input);D(Input);E(Input);" \ + "F(" #INPUT \ + ");G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ + "B->G:1;C->G:2;D->G:3;E->G:4;F->G:5;G->H:1"); \ + } +REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative1); +#undef REGISTER_TEST + +// Rewrite test for _FusedBatchNormEx Op with Identity activation +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + InitGraph("node { name: 'A' op: '" #INPUT \ + "'}" \ + "node { name: 'B' op: 'Input'}" \ + "node { name: 'C' op: 'Input'}" \ + "node { name: 'D' op: 'Input'}" \ + "node { name: 'E' op: 'Input'}" \ + "node { name: 'G' op: '_FusedBatchNormEx'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " attr { key: 'U' value { type: DT_FLOAT } }" \ + " attr { key: 'data_format' value { s: 'NCHW' } }" \ + " attr { key: 'epsilon' value { f: 0.0001 } }" \ + " attr { key: 'num_side_inputs' value { i: 1 } }" \ + " attr { key: 'is_training' value { b: true } }" \ + " attr { key: 'activation_mode' value { s: 'Identity' } }" \ + " input: ['A', 'B', 'C', 'D', 'E'] }" \ + "node { name: 'H' op: 'Zeta'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " input: ['A', 'G'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT \ + ");B(Input);C(Input);D(Input);E(Input);" \ + "G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ + "B->G:1;C->G:2;D->G:3;E->G:4;G->H:1"); \ + } +REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative2); +#undef REGISTER_TEST +#endif // ENABLE_MKLDNN_V1 + TEST_F(MklLayoutPassTest, NodeRewrite_QuantizedDepthwiseConv2D_Positive) { InitGraph( "node { name: 'A' op: 'QuantizedUnsignedInt8Input'}" diff --git a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc index 87841316fc1..66cc3418f3a 100644 --- a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/optimizers/remapper.h" #include "tensorflow/core/grappler/utils/grappler_test.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -173,6 +174,178 @@ TEST_F(MklRemapperTest, FuseConv2DWithBiasAndAddNRelu) { test::ExpectTensorNear(tensors_expected[0], tensors[0], 1e-6); } +#ifdef ENABLE_MKLDNN_V1 +TEST_F(MklRemapperTest, FuseBatchNormWithRelu) { + using ::tensorflow::ops::Placeholder; + + for (bool is_training : {true, false}) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + const int num_channels = 24; + + TensorShape channel_shape({num_channels}); + TensorShape empty_shape({0}); + + auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, + ops::Placeholder::Shape({2, 8, 8, num_channels})); + auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); + auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); + auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); + auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); + auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); + + float epsilon = 0.1f; + auto fbn = ops::FusedBatchNormV3( + s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var, + ops::FusedBatchNormV3::IsTraining(is_training) + .Epsilon(epsilon) + .DataFormat("NHWC")); + auto relu = ops::Relu(s.WithOpName("relu"), fbn.y); + auto fetch = ops::Identity(s.WithOpName("fetch"), relu); + + auto input_t = GenerateRandomTensor({2, 8, 8, num_channels}); + auto scale_t = GenerateRandomTensor(channel_shape); + auto offset_t = GenerateRandomTensor(channel_shape); + auto mean_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto var_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + + GrapplerItem item; + item.fetch = {"fetch"}; + item.feed = {{"input", input_t}, + {"scale", scale_t}, + {"offset", offset_t}, + {"mean", mean_t}, + {"var", var_t}}; + TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on CPU. 
+ for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:CPU:0"); + } + + Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "relu") { + EXPECT_EQ(node.op(), "Identity"); + ASSERT_EQ(node.input_size(), 1); + EXPECT_EQ(node.input(0), "fused_batch_norm"); + found++; + } + if (node.name() == "fused_batch_norm") { + EXPECT_EQ(node.op(), "_FusedBatchNormEx"); + ASSERT_EQ(node.input_size(), 5); + EXPECT_EQ(node.input(0), "input_cast"); + EXPECT_EQ(node.input(1), "scale"); + EXPECT_EQ(node.input(2), "offset"); + EXPECT_EQ(node.input(3), "mean"); + EXPECT_EQ(node.input(4), "var"); + + auto attr = node.attr(); + EXPECT_EQ(attr["num_side_inputs"].i(), 0); + EXPECT_EQ(attr["activation_mode"].s(), "Relu"); + found++; + } + } + EXPECT_EQ(found, 2); + } +} + +TEST_F(MklRemapperTest, FuseBatchNormWithAddAndRelu) { + using ::tensorflow::ops::Placeholder; + + for (bool is_training : {true, false}) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + const int num_channels = 24; + + TensorShape input_shape({2, 8, 8, num_channels}); + TensorShape channel_shape({num_channels}); + TensorShape empty_shape({0}); + + auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, + ops::Placeholder::Shape(input_shape)); + auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); + auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); + auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); + auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); + auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); + auto side_input = Placeholder(s.WithOpName("side_input"), DT_FLOAT, + ops::Placeholder::Shape(input_shape)); + auto side_input_cast = + ops::Cast(s.WithOpName("side_input_cast"), side_input, DT_FLOAT); + + float epsilon = 0.1f; + auto fbn = ops::FusedBatchNormV3( + s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var, + ops::FusedBatchNormV3::IsTraining(is_training) + .Epsilon(epsilon) + .DataFormat("NHWC")); + auto add = ops::Add(s.WithOpName("add"), fbn.y, side_input_cast); + auto relu = ops::Relu(s.WithOpName("relu"), add); + auto fetch = ops::Identity(s.WithOpName("fetch"), relu); + + auto input_t = GenerateRandomTensor(input_shape); + auto scale_t = GenerateRandomTensor(channel_shape); + auto offset_t = GenerateRandomTensor(channel_shape); + auto mean_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto var_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto side_input_t = GenerateRandomTensor({2, 8, 8, num_channels}); + + GrapplerItem item; + item.fetch = {"fetch"}; + item.feed = {{"input", input_t}, {"scale", scale_t}, + {"offset", offset_t}, {"mean", mean_t}, + {"var", var_t}, {"side_input", side_input_t}}; + TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on CPU. 
+ for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:CPU:0"); + } + + Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "add") { + EXPECT_EQ(node.op(), "Add"); + ASSERT_EQ(node.input_size(), 2); + EXPECT_EQ(node.input(0), "fused_batch_norm"); + EXPECT_EQ(node.input(1), "side_input_cast"); + found++; + } + if (node.name() == "relu") { + EXPECT_EQ(node.op(), "Relu"); + ASSERT_EQ(node.input_size(), 1); + EXPECT_EQ(node.input(0), "add"); + found++; + } + if (node.name() == "fused_batch_norm") { + EXPECT_EQ(node.op(), "FusedBatchNormV3"); + ASSERT_EQ(node.input_size(), 5); + EXPECT_EQ(node.input(0), "input_cast"); + EXPECT_EQ(node.input(1), "scale"); + EXPECT_EQ(node.input(2), "offset"); + EXPECT_EQ(node.input(3), "mean"); + EXPECT_EQ(node.input(4), "var"); + found++; + } + } + EXPECT_EQ(found, 3); + } +} +#endif // ENABLE_MKLDNN_V1 + } // namespace grappler } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 5b41ad38089..fd8c7a0af12 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -741,24 +741,27 @@ bool FindFusedBatchNormEx(const RemapperContext& ctx, int node_index, [&](const utils::MutableNodeView& fused_batch_norm) -> bool { const auto* fused_batch_norm_node_def = fused_batch_norm.node(); if (!IsFusedBatchNorm(*fused_batch_norm_node_def)) return false; - - // We fuse FusedBatchNorm only on GPU, because on CPU we fuse it with - // contraction (MatMul or Conv2D node). +// We fuse FusedBatchNorm on GPU or MKL CPU. +#ifndef ENABLE_MKLDNN_V1 if (!NodeIsOnGpu(fused_batch_norm_node_def)) return false; +#endif DataType t_dtype = GetDataTypeFromAttr(*fused_batch_norm_node_def, "T"); +#ifndef ENABLE_MKLDNN_V1 if (t_dtype != DT_FLOAT && t_dtype != DT_HALF) return false; +#else + if (t_dtype != DT_FLOAT && t_dtype != DT_BFLOAT16) return false; +#endif // Get the FusedBatchNorm training mode. bool is_training; if (!GetNodeAttr(*fused_batch_norm_node_def, kIsTraining, &is_training) .ok()) return false; - // In training mode we rely on cuDNN for computing FusedBatchNorm with side // inputs and activation, and it has its own limitations. In inference mode // we have a custom CUDA kernel that doesn't not have these constraints. - if (is_training) { + if (is_training && NodeIsOnGpu(fused_batch_norm_node_def)) { // cuDNN only supports NHWC data layout. string data_format; if (!GetNodeAttr(*fused_batch_norm_node_def, kDataFormat, &data_format) @@ -810,6 +813,12 @@ bool FindFusedBatchNormEx(const RemapperContext& ctx, int node_index, // Input to a Relu can be an Add node with FusedBatchNorm as one of the inputs if (IsAdd(*relu_fanin_0_node_def)) { +// Currently no CPU implementation for "FusedBatchNorm + SideInput + +// "" +#ifdef ENABLE_MKLDNN_V1 + return false; +#endif + // Check that only Relu node consumes the output of an Add node. 
if (HasControlFaninOrFanout(*relu_fanin_0_node_view) || !HasAtMostOneFanoutAtPort0(*relu_fanin_0_node_view) || @@ -881,7 +890,11 @@ void CopyFusedBatchNormAttributes(const NodeDef& fused_batch_norm, if (fused_batch_norm.op() != "FusedBatchNorm") { (*attr)["U"] = src_attr.at("U"); } else { +#ifndef ENABLE_MKLDNN_V1 (*attr)["U"] = src_attr.at("T"); +#else + SetAttrValue(DT_FLOAT, &(*attr)["U"]); +#endif } } diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0477d260e10..327ada53ec1 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -8135,7 +8135,14 @@ tf_mkl_kernel_library( tf_mkl_kernel_library( name = "mkl_fused_batch_norm_op", srcs = ["mkl_fused_batch_norm_op.cc"], - deps = NN_DEPS + mkl_deps(), + hdrs = [ + "fused_batch_norm_op.h", + "no_op.h", + ], + deps = NN_DEPS + [ + ":fused_batch_norm_op", + ":no_op", + ] + mkl_deps(), ) tf_cc_test_mkl( diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index 40e4825c0fa..6df02bc3023 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -14,14 +14,16 @@ limitations under the License. ==============================================================================*/ #ifdef INTEL_MKL #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/fused_batch_norm_op.h" +#include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #define GET_FLAG(bn_flag) static_cast(BN_FLAGS::bn_flag) #define IS_SET(cflag) (context_.flags & GET_FLAG(cflag)) @@ -37,11 +39,14 @@ using BatchNormBwdPd = mkldnn::batch_normalization_backward::primitive_desc; namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; +using FusedBNActivationMode = functor::FusedBatchNormActivationMode; + struct MklBatchNormFwdParams { memory::dims src_dims; int depth; float eps; bool training; + FusedBNActivationMode activation_mode; #ifndef ENABLE_MKLDNN_V1 MEMORY_FORMAT src_format; #else @@ -50,14 +55,17 @@ struct MklBatchNormFwdParams { MklBatchNormFwdParams(const memory::dims& src_dims, int depth, float eps, #ifndef ENABLE_MKLDNN_V1 - bool training, MEMORY_FORMAT src_format) + bool training, MEMORY_FORMAT src_format, + FusedBNActivationMode activation_mode) #else - bool training, memory::desc src_md) + bool training, memory::desc src_md, + FusedBNActivationMode activation_mode) #endif // !ENABLE_MKLDNN_V1 : src_dims(src_dims), depth(depth), eps(eps), training(training), + activation_mode(activation_mode), #ifndef ENABLE_MKLDNN_V1 src_format(src_format) { } @@ -90,7 +98,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { // mean_data: output data buffer of means // variance_data: output data buffer of variances void Execute(const T* src_data, const U* weights_data, T* dst_data, - U* mean_data, U* variance_data) { + U* mean_data, U* variance_data, U* workspace_data) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); @@ -104,6 +112,9 @@ class MklFusedBatchNormFwdPrimitive 
: public MklPrimitive { context_.mean_mem->set_data_handle(static_cast(mean_data)); context_.variance_mem->set_data_handle(static_cast(variance_data)); } + if (workspace_data != nullptr) { + context_.ws_mem->set_data_handle(workspace_data); + } #ifdef ENABLE_MKLDNN_V1 // Execute batch-normalization forward primitives. execute_primitives(context_.fwd_primitives, context_.fwd_stream, @@ -123,6 +134,10 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { context_.mean_mem->set_data_handle(DummyData); context_.variance_mem->set_data_handle(DummyData); } + + if (workspace_data != nullptr) { + context_.ws_mem->set_data_handle(DummyData); + } } MEMORY_PRIMITIVE_DESC GetDstPd() const { return context_.dst_mem->GET_DESC; } @@ -158,6 +173,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { std::shared_ptr dst_mem; std::shared_ptr mean_mem; std::shared_ptr variance_mem; + std::shared_ptr ws_mem; // Forward BatchNorm primitive descriptor. std::shared_ptr fwd_pd; @@ -179,6 +195,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { dst_mem(nullptr), mean_mem(nullptr), variance_mem(nullptr), + ws_mem(nullptr), bn_fwd(nullptr), fwd_stream(nullptr) {} }; @@ -192,6 +209,9 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { : prop_kind::forward_scoring; #ifdef ENABLE_MKLDNN_V1 + if (fwdParams.activation_mode == FusedBNActivationMode::kRelu) { + context_.flags |= GET_FLAG(fuse_norm_relu); + } // Memory descriptor auto src_md = fwdParams.src_md; // Create forward BatchNorm descriptor and primitive descriptor. @@ -229,6 +249,13 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { m_dims, U, MEMORY_FORMAT::nc, cpu_engine_, DummyData)); } +#ifdef ENABLE_MKLDNN_V1 + if (IS_SET(fuse_norm_relu)) { + context_.ws_mem.reset(new MEMORY_CONSTRUCTOR( + context_.fwd_pd->workspace_desc(), cpu_engine_, DummyData)); + } +#endif // ENABLE_MKLDNN_V1 + // BatchNorm forward primitive. 
// TODO(intel-tf): Merge all the #ifdefs and simplify code if (!fwdParams.training && !(IS_SET(use_global_stats))) { @@ -258,20 +285,41 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { } else if (IS_SET(use_global_stats)) { #ifdef ENABLE_MKLDNN_V1 if ((IS_SET(use_scale_shift)) && GET_FLAG(use_scale_shift)) { - context_.net_args.push_back( - {{MKLDNN_ARG_SRC, *context_.src_mem}, - {MKLDNN_ARG_MEAN, *context_.mean_mem}, - {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, - {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, - { MKLDNN_ARG_DST, - *context_.dst_mem }}); + if (IS_SET(fuse_norm_relu)) { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + { MKLDNN_ARG_WORKSPACE, + *context_.ws_mem }}); + } else { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, + { MKLDNN_ARG_DST, + *context_.dst_mem }}); + } } else { - context_.net_args.push_back( - {{MKLDNN_ARG_SRC, *context_.src_mem}, - {MKLDNN_ARG_MEAN, *context_.mean_mem}, - {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, - { MKLDNN_ARG_DST, - *context_.dst_mem }}); + if (IS_SET(fuse_norm_relu)) { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + { MKLDNN_ARG_WORKSPACE, + *context_.ws_mem }}); + } else { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + { MKLDNN_ARG_DST, + *context_.dst_mem }}); + } } context_.bn_fwd.reset(new batch_normalization_forward(*context_.fwd_pd)); #else @@ -291,19 +339,40 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { } else { #ifdef ENABLE_MKLDNN_V1 if ((IS_SET(use_scale_shift)) && GET_FLAG(use_scale_shift)) { - context_.net_args.push_back( - {{MKLDNN_ARG_SRC, *context_.src_mem}, - {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, - {MKLDNN_ARG_DST, *context_.dst_mem}, - {MKLDNN_ARG_MEAN, *context_.mean_mem}, - { MKLDNN_ARG_VARIANCE, - *context_.variance_mem }}); + if (IS_SET(fuse_norm_relu)) { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + { MKLDNN_ARG_WORKSPACE, + *context_.ws_mem }}); + } else { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + { MKLDNN_ARG_VARIANCE, + *context_.variance_mem }}); + } } else { - context_.net_args.push_back({{MKLDNN_ARG_SRC, *context_.src_mem}, - {MKLDNN_ARG_DST, *context_.dst_mem}, - {MKLDNN_ARG_MEAN, *context_.mean_mem}, - { MKLDNN_ARG_VARIANCE, - *context_.variance_mem }}); + if (IS_SET(fuse_norm_relu)) { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + { MKLDNN_ARG_WORKSPACE, + *context_.ws_mem }}); + } else { + context_.net_args.push_back({{MKLDNN_ARG_SRC, 
*context_.src_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + { MKLDNN_ARG_VARIANCE, + *context_.variance_mem }}); + } } context_.bn_fwd.reset(new batch_normalization_forward(*context_.fwd_pd)); #else @@ -360,6 +429,7 @@ class MklFusedBatchNormFwdPrimitiveFactory : public MklPrimitiveFactory { key_creator.AddAsKey(fwdParams.depth); key_creator.AddAsKey(fwdParams.eps); key_creator.AddAsKey(fwdParams.training); + key_creator.AddAsKey(fwdParams.activation_mode); key_creator.AddAsKey(typeid(T).name()); key_creator.AddAsKey(typeid(U).name()); return key_creator.GetKey(); @@ -676,7 +746,8 @@ class MklFusedBatchNormBwdPrimitiveFactory : public MklPrimitiveFactory { // Adding a third parameter to the template to support FusedBatchNormV3 // with MKL. This is different from default where the classes are // derived. Moves enabling to compile-time rather than runtime. -template +template class MklFusedBatchNormOp : public OpKernel { public: explicit MklFusedBatchNormOp(OpKernelConstruction* context) @@ -696,6 +767,28 @@ class MklFusedBatchNormOp : public OpKernel { depth_ = 0; mean_values_ = nullptr; variance_values_ = nullptr; + +#ifndef ENABLE_MKLDNN_V1 + OP_REQUIRES(context, !is_batch_norm_ex, + errors::InvalidArgument( + "_MklFusedBatchNormEx is not supported in DNNL 0.x .")); +#endif + if (!is_batch_norm_ex) { + activation_mode_ = FusedBNActivationMode::kIdentity; + } else { + int num_side_inputs; + OP_REQUIRES_OK(context, + context->GetAttr("num_side_inputs", &num_side_inputs)); + // Currently _MKLFusedBatchNormEx do not support "SideInput" + OP_REQUIRES(context, num_side_inputs == 0, + errors::InvalidArgument( + "_MKLFusedBatchNorm do not support side input now.")); + + OP_REQUIRES_OK(context, ParseActivationMode(context, &activation_mode_)); + OP_REQUIRES(context, activation_mode_ == FusedBNActivationMode::kRelu, + errors::InvalidArgument( + "_MKLFusedBatchNorm only support Relu activation")); + } } void Compute(OpKernelContext* context) override { @@ -744,9 +837,12 @@ class MklFusedBatchNormOp : public OpKernel { // Handle the special case: input with 0 element and 0 batch size. Tensor* dst_tensor = nullptr; + TensorShape workspace_tf_shape; if (tf_shape_src.num_elements() == 0) { - HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(), - &dst_tensor); + size_t workspace_bytes = 0; + workspace_tf_shape.AddDim(workspace_bytes); + HandleEmptyInput(context, tf_shape_src, workspace_tf_shape, + scale_tensor.shape(), &dst_tensor); return; } @@ -758,23 +854,16 @@ class MklFusedBatchNormOp : public OpKernel { // Index of output tensor(diff_src). const size_t kDstIndex = 0; - // Allocate 4 output TF tensors. + // Allocate 5 output TF tensors. Tensor* batch_mean_tensor = nullptr; Tensor* batch_variance_tensor = nullptr; Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; Tensor* reserved_space_tensor = nullptr; - AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor, - &batch_variance_tensor, &saved_mean_tensor, - &saved_variance_tensor, &reserved_space_tensor); - - if (is_training_) - SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor); - else - SetMeanVariance(est_mean_tensor, est_variance_tensor); MklDnnData src(&cpu_engine_); MklDnnData weights(&cpu_engine_); + MklDnnData wksp(&cpu_engine_); MEMORY_FORMAT dnn_fmt; MKL_TENSOR_FORMAT mkl_tensor_fmt; @@ -801,6 +890,51 @@ class MklFusedBatchNormOp : public OpKernel { ? 
dnn_shape_src.GetMklLayout() : memory::desc(src_dims, MklDnnType(), dnn_fmt); +#ifdef ENABLE_MKLDNN_V1 + MklBatchNormFwdParams fwdParams(src_dims, depth_, epsilon_, is_training_, + src_md, activation_mode_); +#else + MklBatchNormFwdParams fwdParams( + src_dims, depth_, epsilon_, is_training_, + static_cast(src_md.data.format), activation_mode_); +#endif // ENABLE_MKLDNN_V1 + // Get forward batch-normalization op from the primitive caching pool. + MklFusedBatchNormFwdPrimitive* bn_fwd = + MklFusedBatchNormFwdPrimitiveFactory::Get(fwdParams); + + // Allocate workspace tensor + U* ws_data = nullptr; + if (fwdParams.activation_mode == FusedBNActivationMode::kRelu) { +#ifdef ENABLE_MKLDNN_V1 + MEMORY_PRIMITIVE_DESC workspace_pd = + bn_fwd->GetBatchNormFwdPd()->workspace_desc(); + size_t workspace_bytes = workspace_pd.get_size(); + workspace_tf_shape.AddDim(workspace_bytes); + + AllocateTFOutputs(context, scale_tensor.shape(), workspace_tf_shape, + &batch_mean_tensor, &batch_variance_tensor, + &saved_mean_tensor, &saved_variance_tensor, + &reserved_space_tensor); + if (reserved_space) { + wksp.SetUsrMem(workspace_pd, reserved_space_tensor); + ws_data = static_cast(wksp.GetOpMem().get_data_handle()); + } +#endif // ENABLE_MKLDNN_V1 + } else { + // There is actually no workspace tensor out, so we make a dummy one. + size_t workspace_bytes = 0; + workspace_tf_shape.AddDim(workspace_bytes); + AllocateTFOutputs(context, scale_tensor.shape(), workspace_tf_shape, + &batch_mean_tensor, &batch_variance_tensor, + &saved_mean_tensor, &saved_variance_tensor, + &reserved_space_tensor); + } + + if (is_training_) + SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor); + else + SetMeanVariance(est_mean_tensor, est_variance_tensor); + // MKL-DNN packs scale & shift as "weights": // ...... weights.AllocateBuffer(2 * depth_ * sizeof(U)); @@ -821,18 +955,6 @@ class MklFusedBatchNormOp : public OpKernel { reinterpret_cast(variance_values_), depth_ * sizeof(U)); -#ifdef ENABLE_MKLDNN_V1 - MklBatchNormFwdParams fwdParams(src_dims, depth_, epsilon_, is_training_, - src_md); -#else - MklBatchNormFwdParams fwdParams( - src_dims, depth_, epsilon_, is_training_, - static_cast(src_md.data.format)); -#endif // ENABLE_MKLDNN_V1 - // Get forward batch-normalization op from the primitive caching pool. - MklFusedBatchNormFwdPrimitive* bn_fwd = - MklFusedBatchNormFwdPrimitiveFactory::Get(fwdParams); - // Check if reorder is needed for src. const T* src_data = nullptr; std::shared_ptr bn_fwd_pd = bn_fwd->GetBatchNormFwdPd(); @@ -866,7 +988,7 @@ class MklFusedBatchNormOp : public OpKernel { // Execute bn_fwd->Execute(src_data, weights_op_data, dst_data, mean_op_data, - variance_op_data); + variance_op_data, ws_data); float adjust_factor = 1.0; if (is_training_) { @@ -924,6 +1046,7 @@ class MklFusedBatchNormOp : public OpKernel { U* mean_values_; U* variance_values_; size_t depth_; // Batch normalization is performed for per channel. 
+ FusedBNActivationMode activation_mode_; engine cpu_engine_ = engine(ENGINE_CPU, 0); void ExtractParams(OpKernelContext* context) { @@ -938,6 +1061,7 @@ class MklFusedBatchNormOp : public OpKernel { } void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src, + TensorShape workspace_tf_shape, TensorShape tf_shape_scale, Tensor** dst_tensor) { DCHECK(dst_tensor); @@ -955,12 +1079,14 @@ class MklFusedBatchNormOp : public OpKernel { Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; Tensor* reserved_space_tensor = nullptr; - AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor, - &batch_variance_tensor, &saved_mean_tensor, - &saved_variance_tensor, &reserved_space_tensor); + AllocateTFOutputs(context, tf_shape_scale, workspace_tf_shape, + &batch_mean_tensor, &batch_variance_tensor, + &saved_mean_tensor, &saved_variance_tensor, + &reserved_space_tensor); } void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale, + TensorShape workspace_tf_shape, Tensor** batch_mean_tensor, Tensor** batch_variance_tensor, Tensor** saved_mean_tensor, @@ -1024,21 +1150,15 @@ class MklFusedBatchNormOp : public OpKernel { std::fill_n(saved_variance_data, num_elements, static_cast(0)); // Changes to support reserved_space_3 parameter in FusedBatchNormV3. - // TODO: This parameter functionality is not implemented on CPU. - // It is used to hold intermediate results. So the allocated - // memory is filled with 0s. if (reserved_space) { DCHECK(reserved_space_tensor != nullptr); MklDnnShape mkl_shape_reserved_space; mkl_shape_reserved_space.SetMklTensor(false); AllocateOutputSetMklShape(context, kReservedSpaceIndex, - reserved_space_tensor, tf_shape_scale, + reserved_space_tensor, workspace_tf_shape, mkl_shape_reserved_space); DCHECK((*reserved_space_tensor) != nullptr); - auto saved_reserved_space_data = - (*reserved_space_tensor)->flat().data(); - std::fill_n(saved_reserved_space_data, num_elements, static_cast(0)); } } }; @@ -1367,7 +1487,7 @@ class MklFusedBatchNormGradOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklFusedBatchNormOp); + MklFusedBatchNormOp); TF_CALL_float(REGISTER_MKL_FUSED_BATCHNORM_CPU); TF_CALL_bfloat16(REGISTER_MKL_FUSED_BATCHNORM_CPU); @@ -1380,7 +1500,7 @@ TF_CALL_bfloat16(REGISTER_MKL_FUSED_BATCHNORM_CPU); .TypeConstraint("T") \ .TypeConstraint("U") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklFusedBatchNormOp); + MklFusedBatchNormOp); REGISTER_MKL_FUSED_BATCHNORM_V2_CPU(float, float); REGISTER_MKL_FUSED_BATCHNORM_V2_CPU(bfloat16, float); @@ -1421,12 +1541,30 @@ REGISTER_MKL_FUSED_BATCHNORM_GRAD_V2_CPU(bfloat16, float); .TypeConstraint("T") \ .TypeConstraint("U") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklFusedBatchNormOp); + MklFusedBatchNormOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklFusedBatchNormEx") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("U") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklFusedBatchNormOp); REGISTER_MKL_FUSED_BATCHNORM_V3_CPU(float, float); REGISTER_MKL_FUSED_BATCHNORM_V3_CPU(bfloat16, float); #undef REGISTER_MKL_FUSED_BATCHNORM_V3_CPU +REGISTER_KERNEL_BUILDER(Name("_FusedBatchNormEx") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .TypeConstraint("U"), + NoOp); +REGISTER_KERNEL_BUILDER(Name("_FusedBatchNormEx") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .TypeConstraint("U"), + NoOp); + #define 
REGISTER_MKL_FUSED_BATCHNORM_GRAD_V3_CPU(T, U) \ REGISTER_KERNEL_BUILDER( \ Name("_MklFusedBatchNormGradV3") \ diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc index 3f9cc0df131..90f945d2692 100644 --- a/tensorflow/core/ops/mkl_nn_ops.cc +++ b/tensorflow/core/ops/mkl_nn_ops.cc @@ -1342,6 +1342,48 @@ REGISTER_OP("_MklFusedBatchNormGradV3") R"doc(MKL-DNN implementation of FusedBatchNormGradV3: Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke this operator.)doc"); +REGISTER_OP("_MklFusedBatchNormEx") + .Input("x: T") + .Input("scale: U") + .Input("offset: U") + .Input("mean: U") + .Input("variance: U") + .Input("side_input: num_side_inputs * T") + .Input("mkl_x: uint8") + .Input("mkl_scale: uint8") + .Input("mkl_offset: uint8") + .Input("mkl_mean: uint8") + .Input("mkl_variance: uint8") + .Input("mkl_side_input: num_side_inputs * uint8") + .Output("y: T") + .Output("batch_mean: U") + .Output("batch_variance: U") + .Output("reserve_space_1: U") + .Output("reserve_space_2: U") + .Output("reserve_space_3: U") + .Output("mkl_y: uint8") + .Output("mkl_batch_mean: uint8") + .Output("mkl_batch_variance: uint8") + .Output("mkl_reserve_space_1: uint8") + .Output("mkl_reserve_space_2: uint8") + .Output("mkl_reserve_space_3: uint8") + .Attr("T: {bfloat16, float}") + .Attr("U: {float}") + .Attr("epsilon: float = 0.0001") + .Attr("exponential_avg_factor: float = 1.0") + .Attr(GetConvnetDataFormatAttrString()) + .Attr("num_side_inputs: int >= 0 = 0") + .Attr("activation_mode: string = \"Identity\"") + .Attr("is_training: bool = true") + .SetShapeFn(shape_inference::FusedBatchNormShape) + .Doc(R"doc( +MKL version of FusedBatchNormEx operator. Uses MKL DNN APIs to perform fused +batch normalization and relu. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 2e55bc6cd95..53ee2cfa035 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -238,7 +238,11 @@ REGISTER_OP("_FusedBatchNormEx") .Output("reserve_space_1: U") .Output("reserve_space_2: U") .Output("reserve_space_3: U") +#ifdef ENABLE_MKLDNN_V1 + .Attr("T: {half, float, bfloat16}") +#else .Attr("T: {half, float}") +#endif .Attr("U: {float}") .Attr("epsilon: float = 0.0001") .Attr("exponential_avg_factor: float = 1.0") From d4097df2ace6c1da7dc898119de2f43853bf00be Mon Sep 17 00:00:00 2001 From: blueyi Date: Sun, 29 Mar 2020 22:08:17 +0800 Subject: [PATCH 0011/1533] Fix document error for ApplyFtrl, ApplyFtrlV2 and SparseApplyFtrlV2 --- tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt | 2 +- tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt | 4 ++-- .../core/api_def/base_api/api_def_SparseApplyFtrlV2.pbtxt | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt index 0f49a18a114..f3379461a5f 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyFtrl.pbtxt @@ -65,7 +65,7 @@ END summary: "Update \'*var\' according to the Ftrl-proximal scheme." 
description: < l1 else 0.0 accum = accum_new diff --git a/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt index 3218ab7776c..1eb33005e91 100644 --- a/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ApplyFtrlV2.pbtxt @@ -65,8 +65,8 @@ END summary: "Update \'*var\' according to the Ftrl-proximal scheme." description: < l1 else 0.0 diff --git a/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrlV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrlV2.pbtxt index df924f29636..5300b5570cb 100644 --- a/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrlV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_SparseApplyFtrlV2.pbtxt @@ -72,8 +72,8 @@ END description: < l1 else 0.0 From 056359fee8f26bf93d44605adc2c22330a849074 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 15 Feb 2020 00:41:56 +0000 Subject: [PATCH 0012/1533] Support Keras grouped convolutions --- .../python/keras/layers/convolutional.py | 60 ++++++++++++++++++- .../python/keras/layers/convolutional_test.py | 32 ++++++---- tensorflow/python/ops/nn_ops.py | 10 ++-- 3 files changed, 85 insertions(+), 17 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 2cdecbc15ca..36a1aec5ce1 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -77,6 +77,16 @@ class Conv(Layer): the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. + groups: Integer, the number of channel groups controlling the connections + between inputs and outputs. Input channels and `filters` must both be + divisible by `groups`. For example, + - At `groups=1`, all inputs are convolved to all outputs. + - At `groups=2`, the operation becomes equivalent to having two + convolutional layers side by side, each seeing half the input + channels, and producing half the output channels, and both + subsequently concatenated. + - At `groups=input_channels`, each input channel is convolved with its + own set of filters, of size `input_channels / filters` activation: Activation function to use. If you don't specify anything, no activation is applied. use_bias: Boolean, whether the layer uses a bias. @@ -106,6 +116,7 @@ class Conv(Layer): padding='valid', data_format=None, dilation_rate=1, + groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -127,6 +138,11 @@ class Conv(Layer): if filters is not None and not isinstance(filters, int): filters = int(filters) self.filters = filters + self.groups = groups + if filters and filters % self.groups != 0: + raise ValueError( + 'The number of filters is not divisible by the number groups. ' + '{} % {} = {}'.format(filters, groups, filters % groups)) self.kernel_size = conv_utils.normalize_tuple( kernel_size, rank, 'kernel_size') if not all(self.kernel_size): @@ -154,7 +170,12 @@ class Conv(Layer): def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) input_channel = self._get_input_channel(input_shape) - kernel_shape = self.kernel_size + (input_channel, self.filters) + if input_channel % self.groups != 0: + raise ValueError( + 'The number of input channels is not divisible by the number of ' + 'channel group. 
{} % {} = {}'.format(input_channel, self.groups, + input_channel % self.groups)) + kernel_shape = self.kernel_size + (input_channel // self.groups, self.filters) self.kernel = self.add_weight( name='kernel', @@ -262,6 +283,7 @@ class Conv(Layer): 'padding': self.padding, 'data_format': self.data_format, 'dilation_rate': self.dilation_rate, + 'groups': self.groups, 'activation': activations.serialize(self.activation), 'use_bias': self.use_bias, 'kernel_initializer': initializers.serialize(self.kernel_initializer), @@ -374,6 +396,16 @@ class Conv1D(Conv): 2.1](https://arxiv.org/abs/1609.03499). data_format: A string, one of `channels_last` (default) or `channels_first`. + groups: Integer, the number of channel groups controlling the connections + between inputs and outputs. Input channels and `filters` must both be + divisible by `groups`. For example, + - At `groups=1`, all inputs are convolved to all outputs. + - At `groups=2`, the operation becomes equivalent to having two + convolutional layers side by side, each seeing half the input + channels, and producing half the output channels, and both + subsequently concatenated. + - At `groups=input_channels`, each input channel is convolved with its + own set of filters, of size `input_channels / filters` dilation_rate: an integer or tuple/list of a single integer, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is @@ -420,6 +452,7 @@ class Conv1D(Conv): padding='valid', data_format='channels_last', dilation_rate=1, + groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -438,6 +471,7 @@ class Conv1D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, + groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), @@ -524,6 +558,16 @@ class Conv2D(Conv): all spatial dimensions. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. + groups: Integer, the number of channel groups controlling the connections + between inputs and outputs. Input channels and `filters` must both be + divisible by `groups`. For example, + - At `groups=1`, all inputs are convolved to all outputs. + - At `groups=2`, the operation becomes equivalent to having two + convolutional layers side by side, each seeing half the input + channels, and producing half the output channels, and both + subsequently concatenated. + - At `groups=input_channels`, each input channel is convolved with its + own set of filters, of size `input_channels / filters` activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). @@ -573,6 +617,7 @@ class Conv2D(Conv): padding='valid', data_format=None, dilation_rate=(1, 1), + groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -591,6 +636,7 @@ class Conv2D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, + groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), @@ -662,6 +708,16 @@ class Conv3D(Conv): all spatial dimensions. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. + groups: Integer, the number of channel groups controlling the connections + between inputs and outputs. Input channels and `filters` must both be + divisible by `groups`. 
For example, + - At `groups=1`, all inputs are convolved to all outputs. + - At `groups=2`, the operation becomes equivalent to having two + convolutional layers side by side, each seeing half the input + channels, and producing half the output channels, and both + subsequently concatenated. + - At `groups=input_channels`, each input channel is convolved with its + own set of filters, of size `input_channels / filters` activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). @@ -717,6 +773,7 @@ class Conv3D(Conv): padding='valid', data_format=None, dilation_rate=(1, 1, 1), + groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -735,6 +792,7 @@ class Conv3D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, + groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index 9e2859f166b..d201ae9924d 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -54,11 +54,15 @@ class Conv1DTest(keras_parameterized.TestCase): ('padding_causal', {'padding': 'causal'}, (None, 7, 2)), ('strides', {'strides': 2}, (None, 3, 2)), ('dilation_rate', {'dilation_rate': 2}, (None, 3, 2)), + # Only runs on GPU with CUDA, groups are not supported on CPU. + # https://github.com/tensorflow/tensorflow/issues/29005 + ('group', {'groups': 3, 'filters': 6}, (None, 5, 6), True), ) - def test_conv1d(self, kwargs, expected_output_shape): - kwargs['filters'] = 2 + def test_conv1d(self, kwargs, expected_output_shape, requires_gpu=False): + kwargs['filters'] = kwargs.get('filters', 2) kwargs['kernel_size'] = 3 - self._run_test(kwargs, expected_output_shape) + if not requires_gpu or test.is_gpu_available(cuda_only=True): + self._run_test(kwargs, expected_output_shape) def test_conv1d_regularizers(self): kwargs = { @@ -134,12 +138,15 @@ class Conv2DTest(keras_parameterized.TestCase): ('dilation_rate', {'dilation_rate': (2, 2)}, (None, 3, 2, 2)), # Only runs on GPU with CUDA, channels_first is not supported on CPU. # TODO(b/62340061): Support channels_first on CPU. - ('data_format', {'data_format': 'channels_first'}), + ('data_format', {'data_format': 'channels_first'}, None, True), + # Only runs on GPU with CUDA, groups are not supported on CPU. + # https://github.com/tensorflow/tensorflow/issues/29005 + ('group', {'groups': 3, 'filters': 6}, (None, 5, 4, 6), True), ) - def test_conv2d(self, kwargs, expected_output_shape=None): - kwargs['filters'] = 2 + def test_conv2d(self, kwargs, expected_output_shape=None, requires_gpu=False): + kwargs['filters'] = kwargs.get('filters', 2) kwargs['kernel_size'] = (3, 3) - if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True): + if not requires_gpu or test.is_gpu_available(cuda_only=True): self._run_test(kwargs, expected_output_shape) def test_conv2d_regularizers(self): @@ -207,12 +214,15 @@ class Conv3DTest(keras_parameterized.TestCase): ('dilation_rate', {'dilation_rate': (2, 2, 2)}, (None, 1, 3, 2, 2)), # Only runs on GPU with CUDA, channels_first is not supported on CPU. # TODO(b/62340061): Support channels_first on CPU. 
- ('data_format', {'data_format': 'channels_first'}), + ('data_format', {'data_format': 'channels_first'}, None, True), + # Only runs on GPU with CUDA, groups are not supported on CPU. + # https://github.com/tensorflow/tensorflow/issues/29005 + ('group', {'groups': 3, 'filters': 6}, (None, 3, 5, 4, 6), True), ) - def test_conv3d(self, kwargs, expected_output_shape=None): - kwargs['filters'] = 2 + def test_conv3d(self, kwargs, expected_output_shape=None, requires_gpu=False): + kwargs['filters'] = kwargs.get('filters', 2) kwargs['kernel_size'] = (3, 3, 3) - if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True): + if not requires_gpu or test.is_gpu_available(cuda_only=True): self._run_test(kwargs, expected_output_shape) def test_conv3d_regularizers(self): diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 8df06a8e861..cf280b3a2b8 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1063,12 +1063,12 @@ class Convolution(object): input_channels_dim = tensor_shape.dimension_at_index(input_shape, 1) spatial_dims = range(2, num_spatial_dims + 2) - if not input_channels_dim.is_compatible_with( - filter_shape[num_spatial_dims]): + filter_dim = tensor_shape.dimension_at_index(filter_shape, num_spatial_dims) + if not (input_channels_dim % filter_dim).is_compatible_with(0): raise ValueError( - "number of input channels does not match corresponding dimension of " - "filter, {} != {}".format(input_channels_dim, - filter_shape[num_spatial_dims])) + "number of input channels is not divisible by corresponding " + "dimension of filter, {} % {} != 0".format(input_channels_dim, + filter_dim)) strides, dilation_rate = _get_strides_and_dilation_rate( num_spatial_dims, strides, dilation_rate) From 6b4ddc8a6a04c78689add2e7e5551292ad469bf4 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Fri, 13 Mar 2020 00:35:31 +0000 Subject: [PATCH 0013/1533] Improve error messages --- tensorflow/python/keras/layers/convolutional.py | 16 +++++++++------- tensorflow/python/ops/nn_ops.py | 13 +++++++------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 36a1aec5ce1..72e93c34778 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -139,10 +139,10 @@ class Conv(Layer): filters = int(filters) self.filters = filters self.groups = groups - if filters and filters % self.groups != 0: + if filters is not None and filters % self.groups != 0: raise ValueError( - 'The number of filters is not divisible by the number groups. ' - '{} % {} = {}'.format(filters, groups, filters % groups)) + 'The number of filters must be evenly divisible by the number of ' + 'groups. Received: groups={}, filters={}'.format(groups, filters)) self.kernel_size = conv_utils.normalize_tuple( kernel_size, rank, 'kernel_size') if not all(self.kernel_size): @@ -172,10 +172,12 @@ class Conv(Layer): input_channel = self._get_input_channel(input_shape) if input_channel % self.groups != 0: raise ValueError( - 'The number of input channels is not divisible by the number of ' - 'channel group. {} % {} = {}'.format(input_channel, self.groups, - input_channel % self.groups)) - kernel_shape = self.kernel_size + (input_channel // self.groups, self.filters) + 'The number of input channels must be evenly divisible by the number ' + 'of groups. 
Received groups={}, but the input has {} channels ' + '(full input shape is {}).'.format(self.groups, input_channel, + input_shape)) + kernel_shape = self.kernel_size + (input_channel // self.groups, + self.filters) self.kernel = self.add_weight( name='kernel', diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index cf280b3a2b8..f4ca1f0e6fe 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1066,9 +1066,10 @@ class Convolution(object): filter_dim = tensor_shape.dimension_at_index(filter_shape, num_spatial_dims) if not (input_channels_dim % filter_dim).is_compatible_with(0): raise ValueError( - "number of input channels is not divisible by corresponding " - "dimension of filter, {} % {} != 0".format(input_channels_dim, - filter_dim)) + "The number of input channels is not divisible by the " + "corresponding number of output filters. Received: " + "input channels={}, output filters={}".format(input_channels_dim, + filter_dim)) strides, dilation_rate = _get_strides_and_dilation_rate( num_spatial_dims, strides, dilation_rate) @@ -1857,9 +1858,9 @@ def conv2d_v2(input, # pylint: disable=redefined-builtin Must have `strides[0] = strides[3] = 1`. For the most common case of the same horizontal and vertical strides, `strides = [1, stride, stride, 1]`. - + Usage Example: - + >>> x_in = np.array([[ ... [[2], [1], [2], [0], [1]], ... [[1], [3], [2], [2], [3]], @@ -3278,7 +3279,7 @@ def softmax(logits, axis=None, name=None, dim=None): Tensor. RuntimeError: If a registered conversion function returns an invalid value. - + """ axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim) if axis is None: From b1caec43f12d2b09b1d547d81efe436001179c32 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Fri, 13 Mar 2020 00:46:57 +0000 Subject: [PATCH 0014/1533] Test group convolution error messages --- tensorflow/python/keras/layers/convolutional_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index d201ae9924d..e494a3df3ab 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -285,6 +285,18 @@ class Conv3DTest(keras_parameterized.TestCase): input_shape=(None, 3, None, None, None), input_data=input_data) +class GroupedConvTest(keras_parameterized.TestCase): + @parameterized.named_parameters( + ('Conv1D', keras.layers.Conv1D), + ('Conv2D', keras.layers.Conv2D), + ('Conv3D', keras.layers.Conv3D), + ) + def test_group_conv_incorrect_use(self, layer): + with self.assertRaisesRegexp(ValueError, 'The number of filters'): + layer(16, 3, groups=3) + with self.assertRaisesRegexp(ValueError, 'The number of input channels'): + layer(16, 3, groups=4).build((32, 12, 12, 3)) + @keras_parameterized.run_all_keras_modes class Conv3DTransposeTest(keras_parameterized.TestCase): From ff3aaa368515d7a693e4bf103695f27de28fd8e1 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Fri, 13 Mar 2020 14:08:55 +0000 Subject: [PATCH 0015/1533] Test group convolution --- .../python/keras/layers/convolutional_test.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index e494a3df3ab..5fd52716af7 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -24,6 +24,7 @@ 
import numpy as np from tensorflow.python import keras from tensorflow.python.eager import context from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils from tensorflow.python.platform import test @@ -297,6 +298,25 @@ class GroupedConvTest(keras_parameterized.TestCase): with self.assertRaisesRegexp(ValueError, 'The number of input channels'): layer(16, 3, groups=4).build((32, 12, 12, 3)) + @parameterized.named_parameters( + ('Conv1D', keras.layers.Conv1D, (32, 12, 32)), + ('Conv2D', keras.layers.Conv2D, (32, 12, 12, 32)), + ('Conv3D', keras.layers.Conv3D, (32, 12, 12, 12, 32)), + ) + def test_group_conv(self, layer, input_shape): + if test.is_gpu_available(cuda_only=True): + with self.cached_session(use_gpu=True): + inputs = np.random.uniform(size=input_shape) + + outputs = layer(16, 3, groups=4, kernel_initializer="ones")(inputs) + + input_slices = np.split(inputs, 4, axis=-1) + expected_outputs = array_ops.concat([ + layer(16 // 4, 3, kernel_initializer="ones")(slice) + for slice in input_slices], axis=-1) + + self.assertAllClose(outputs, expected_outputs) + @keras_parameterized.run_all_keras_modes class Conv3DTransposeTest(keras_parameterized.TestCase): From 83c785bc2b7e55d616873db4b5e4f306316ac1c3 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Mon, 6 Apr 2020 17:15:12 +0100 Subject: [PATCH 0016/1533] Allow groups=None --- tensorflow/python/keras/layers/convolutional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 72e93c34778..9c0eedac29d 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -138,7 +138,7 @@ class Conv(Layer): if filters is not None and not isinstance(filters, int): filters = int(filters) self.filters = filters - self.groups = groups + self.groups = groups or 1 if filters is not None and filters % self.groups != 0: raise ValueError( 'The number of filters must be evenly divisible by the number of ' From 91610a726e931e8592a86b3a685f323430927708 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Mon, 6 Apr 2020 17:28:40 +0100 Subject: [PATCH 0017/1533] Use test_util.use_gpu() --- tensorflow/python/keras/layers/convolutional_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index 5fd52716af7..20b6e582402 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -305,7 +305,7 @@ class GroupedConvTest(keras_parameterized.TestCase): ) def test_group_conv(self, layer, input_shape): if test.is_gpu_available(cuda_only=True): - with self.cached_session(use_gpu=True): + with test_util.use_gpu(): inputs = np.random.uniform(size=input_shape) outputs = layer(16, 3, groups=4, kernel_initializer="ones")(inputs) From 52160bc31c19a2fa1be5d8df3f09ccd50ea343d8 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 7 Apr 2020 02:17:10 +0100 Subject: [PATCH 0018/1533] Improve grouped convolution tests --- .../python/keras/layers/convolutional_test.py | 32 +++++++++++++++---- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index 
20b6e582402..2067206435c 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -25,6 +25,8 @@ from tensorflow.python import keras from tensorflow.python.eager import context from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import random_ops from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils from tensorflow.python.platform import test @@ -303,19 +305,35 @@ class GroupedConvTest(keras_parameterized.TestCase): ('Conv2D', keras.layers.Conv2D, (32, 12, 12, 32)), ('Conv3D', keras.layers.Conv3D, (32, 12, 12, 12, 32)), ) - def test_group_conv(self, layer, input_shape): + def test_group_conv(self, layer_cls, input_shape): if test.is_gpu_available(cuda_only=True): with test_util.use_gpu(): - inputs = np.random.uniform(size=input_shape) + inputs = random_ops.random_uniform(shape=input_shape) - outputs = layer(16, 3, groups=4, kernel_initializer="ones")(inputs) + layer = layer_cls(16, 3, groups=4) + layer.build(input_shape) - input_slices = np.split(inputs, 4, axis=-1) + input_slices = array_ops.split(inputs, 4, axis=-1) + weight_slices = array_ops.split(layer.kernel, 4, axis=-1) expected_outputs = array_ops.concat([ - layer(16 // 4, 3, kernel_initializer="ones")(slice) - for slice in input_slices], axis=-1) + nn.convolution_v2(inputs, weights) + for inputs, weights in zip(input_slices, weight_slices)], axis=-1) - self.assertAllClose(outputs, expected_outputs) + self.assertAllClose(layer(inputs), expected_outputs, rtol=1e-5) + + def test_group_conv_depthwise(self): + if test.is_gpu_available(cuda_only=True): + with test_util.use_gpu(): + inputs = random_ops.random_uniform(shape=(3, 27, 27, 32)) + + layer = keras.layers.Conv2D(32, 3, groups=32) + layer.build((3, 27, 27, 32)) + + weights_dw = array_ops.reshape(layer.kernel, [3, 3, 32, 1]) + expected_outputs = nn.depthwise_conv2d( + inputs, weights_dw, strides=[1, 1, 1, 1], padding="VALID") + + self.assertAllClose(layer(inputs), expected_outputs, rtol=1e-5) @keras_parameterized.run_all_keras_modes From 212616f0cae39ac2ccfcc2988eca5fcb3161cdf4 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 7 Apr 2020 09:43:04 +0100 Subject: [PATCH 0019/1533] Update api/golden --- .../tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt | 2 +- .../tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt | 2 +- .../tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt | 2 +- .../api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt | 2 +- .../api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt | 2 +- .../api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt | 2 +- .../tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt | 2 +- .../tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt | 2 +- .../tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt | 2 +- .../api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt | 2 +- .../api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt | 2 +- .../api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt | 2 +- 12 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt index 7748f763576..8d57f80c9f5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt +++ 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt index 2d03874d6b1..a300c10d1eb 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt index e7974563f59..cb11862ca58 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', 
\'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt index f4906272693..d9dc72b1673 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt index 7e60cdfdce3..c82642f939d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], 
" } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt index 3071323b7d1..333a02cc6d4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt index 7748f763576..8d57f80c9f5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt index 2d03874d6b1..a300c10d1eb 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', 
\'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt index e7974563f59..cb11862ca58 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt index f4906272693..d9dc72b1673 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', 
\'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt index 7e60cdfdce3..c82642f939d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt index 3071323b7d1..333a02cc6d4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" From 08cd923c406a20aa019344e081a74f03ad2ebd0f Mon Sep 17 00:00:00 2001 From: ShengYang1 Date: Wed, 8 Apr 2020 14:07:24 +0800 Subject: [PATCH 0020/1533] Copy 
exponential_avg_factor attr in grappler --- tensorflow/core/grappler/optimizers/remapper.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index fd8c7a0af12..6f94f66cb00 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -885,6 +885,7 @@ void CopyFusedBatchNormAttributes(const NodeDef& fused_batch_norm, (*attr)["is_training"] = src_attr.at("is_training"); (*attr)["data_format"] = src_attr.at("data_format"); (*attr)["epsilon"] = src_attr.at("epsilon"); + (*attr)["exponential_avg_factor"] = src_attr.at("exponential_avg_factor"); // FusedBatchNormV2 and V3 have an extra type parameter. if (fused_batch_norm.op() != "FusedBatchNorm") { From 8d80fbfc3677468c530ce49cbc5d69d8e28d28ce Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 11 Mar 2020 13:05:55 +0000 Subject: [PATCH 0021/1533] General case of logistic/tanh, 16-bit -> 16-bit. --- tensorflow/lite/kernels/activations.cc | 68 +++++--- tensorflow/lite/kernels/activations_test.cc | 154 +++++++++++++++--- .../internal/reference/integer_ops/logistic.h | 9 +- .../internal/reference/integer_ops/tanh.h | 10 +- 4 files changed, 190 insertions(+), 51 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 2a236666739..daa4c24f7be 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -430,21 +430,29 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); int input_scale_log2_rounded; - TF_LITE_ENSURE(context, - CheckedLog2(input->params.scale, &input_scale_log2_rounded)); + bool paramScalePOT = + CheckedLog2(input->params.scale, &input_scale_log2_rounded); + + data->input_left_shift = + (15 - kInputIntegerBits) + input_scale_log2_rounded; + paramScalePOT &= + (data->input_left_shift == 0 || data->input_left_shift == 1); + + if (!paramScalePOT) { + // In case of general scale parameter, we need to do a rescaling. + // Magic constant 4096: + // We need to scale down to (-2^3, 2^3) / 3 is kInputIntegerBits/ interval + // from 16-bit (-2^15, 2^15), + // so we need to multiply by + // 2^(15 - kInputIntegerBits) = 2^12 = 4096. + data->input_multiplier = static_cast(input->params.scale * 4096); + } int output_scale_log2_rounded; TF_LITE_ENSURE( context, CheckedLog2(output->params.scale, &output_scale_log2_rounded)); TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded, -kOutputFractionalBits); - - data->input_left_shift = - (15 - kInputIntegerBits) + input_scale_log2_rounded; - // Support for shifts is limited until we have a parameterized version of - // SaturatingRoundingMultiplyByPOT(). 
- TF_LITE_ENSURE(context, data->input_left_shift >= 0); - TF_LITE_ENSURE(context, data->input_left_shift <= 1); } return context->ResizeTensor(context, output, @@ -518,19 +526,28 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); int input_scale_log2_rounded; - TF_LITE_ENSURE(context, - CheckedLog2(input->params.scale, &input_scale_log2_rounded)); + bool paramScalePOT = + CheckedLog2(input->params.scale, &input_scale_log2_rounded); + + data->input_left_shift = + (15 - kInputIntegerBits) + input_scale_log2_rounded; + paramScalePOT &= (data->input_left_shift == 0); + + if (!paramScalePOT) { + // In case of general scale parameter, we need to do a rescaling. + // Magic constant 4096: + // We need to scale down to (-2^3, 2^3) / 3 is kInputIntegerBits/ interval + // from 16-bit (-2^15, 2^15), + // so we need to multiply by + // 2^(15 - kInputIntegerBits) = 2^12 = 4096. + data->input_multiplier = static_cast(input->params.scale * 4096); + } int output_scale_log2_rounded; TF_LITE_ENSURE( context, CheckedLog2(output->params.scale, &output_scale_log2_rounded)); TF_LITE_ENSURE_EQ(context, output_scale_log2_rounded, -kOutputFractionalBits); - - data->input_left_shift = - (15 - kInputIntegerBits) + input_scale_log2_rounded; - // The int16 logistic implementation does not support shifting of the input. - TF_LITE_ENSURE_EQ(context, data->input_left_shift, 0); } return context->ResizeTensor(context, output, @@ -797,13 +814,16 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { case kTfLiteInt16: { TanhParams params; params.input_left_shift = data->input_left_shift; - if (kernel_type == kReference) { + if (kernel_type == kReference || (data->input_multiplier > 0)) { const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); - reference_integer_ops::Tanh(data->input_left_shift, size, - GetTensorData(input), - GetTensorData(output)); + const int16_t* ptr_input_data = GetTensorData(input); + int16_t* ptr_output_data = GetTensorData(output); + + reference_integer_ops::Tanh( + data->input_multiplier, data->input_left_shift, size, + GetTensorData(input), GetTensorData(output)); } else { optimized_ops::Tanh( params, GetTensorShape(input), GetTensorData(input), @@ -872,11 +892,15 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) { } case kTfLiteInt16: { LogisticParams params; - if (kernel_type == kReference) { + if (kernel_type == kReference || (data->input_multiplier > 0)) { const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); - reference_integer_ops::Logistic(size, GetTensorData(input), + int16_t* ptr_output_data = GetTensorData(output); + const int16_t* ptr_input_data = GetTensorData(input); + + reference_integer_ops::Logistic(data->input_multiplier, size, + GetTensorData(input), GetTensorData(output)); } else { optimized_ops::Logistic( diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc index a33d5e65200..2cf5adbcf46 100644 --- a/tensorflow/lite/kernels/activations_test.cc +++ b/tensorflow/lite/kernels/activations_test.cc @@ -764,19 +764,73 @@ TEST_P(TanhOpTest, TanhInt16) { const float kMax = 32767.f / 32768.f; QuantizedActivationsOpModel m( GetRegistration(), BuiltinOperator_TANH, - /*input=*/{TensorType_INT16, {1, 2, 8, 1}, 8 * kMin, 8 * kMax}, - /*output=*/{TensorType_INT16, {1, 2, 8, 1}, kMin, kMax}); - m.SetInput({0, -6, 2, 4, // - -4, -2, 8, 1, // - 7, -8, 3, -5, // - 6, -1, -3, 5}); 
+ /*input=*/{TensorType_INT16, {89}, 8 * kMin, 8 * kMax}, + /*output=*/{TensorType_INT16, {89}, kMin, kMax}); + m.SetInput( + {-8.0000000000, -7.8181818182, -7.6363636364, -7.4545454545, + -7.2727272727, -7.0909090909, -6.9090909091, -6.7272727273, + -6.5454545455, -6.3636363636, -6.1818181818, -6.0000000000, + -5.8181818182, -5.6363636364, -5.4545454545, -5.2727272727, + -5.0909090909, -4.9090909091, -4.7272727273, -4.5454545455, + -4.3636363636, -4.1818181818, -4.0000000000, -3.8181818182, + -3.6363636364, -3.4545454545, -3.2727272727, -3.0909090909, + -2.9090909091, -2.7272727273, -2.5454545455, -2.3636363636, + -2.1818181818, -2.0000000000, -1.8181818182, -1.6363636364, + -1.4545454545, -1.2727272727, -1.0909090909, -0.9090909091, + -0.7272727273, -0.5454545455, -0.3636363636, -0.1818181818, + 0.0000000000, 0.1818181818, 0.3636363636, 0.5454545455, + 0.7272727273, 0.9090909091, 1.0909090909, 1.2727272727, + 1.4545454545, 1.6363636364, 1.8181818182, 2.0000000000, + 2.1818181818, 2.3636363636, 2.5454545455, 2.7272727273, + 2.9090909091, 3.0909090909, 3.2727272727, 3.4545454545, + 3.6363636364, 3.8181818182, 4.0000000000, 4.1818181818, + 4.3636363636, 4.5454545455, 4.7272727273, 4.9090909091, + 5.0909090909, 5.2727272727, 5.4545454545, 5.6363636364, + 5.8181818182, 6.0000000000, 6.1818181818, 6.3636363636, + 6.5454545455, 6.7272727273, 6.9090909091, 7.0909090909, + 7.2727272727, 7.4545454545, 7.6363636364, 7.8181818182, + 8.0000000000}); m.Invoke(); EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear( - {0.0, -0.999987, 0.964027, 0.999329, // - -0.999329, -0.96402, 0.99999, 0.76159, // - 0.999998337, -0.99999, 0.995054754, -0.999909204, // - 0.999999996, -0.76159, -0.995054754, 0.999909204}, + {-0.9999997749, -0.9999996762, -0.9999995342, -0.9999993300, + -0.9999990361, -0.9999986134, -0.9999980053, -0.9999971306, + -0.9999958722, -0.9999940619, -0.9999914578, -0.9999877117, + -0.9999823226, -0.9999745703, -0.9999634183, -0.9999473758, + -0.9999242982, -0.9998911009, -0.9998433469, -0.9997746542, + -0.9996758446, -0.9995337191, -0.9993292997, -0.9990353053, + -0.9986125310, -0.9980046622, -0.9971308601, -0.9958751909, + -0.9940716137, -0.9914827859, -0.9877703933, -0.9824541388, + -0.9748561217, -0.9640275801, -0.9486568273, -0.9269625051, + -0.8965880154, -0.8545351057, -0.7972097087, -0.7206956332, + -0.6213939966, -0.4971057414, -0.3484130125, -0.1798408185, + 0.0000000000, 0.1798408185, 0.3484130125, 0.4971057414, + 0.6213939966, 0.7206956332, 0.7972097087, 0.8545351057, + 0.8965880154, 0.9269625051, 0.9486568273, 0.9640275801, + 0.9748561217, 0.9824541388, 0.9877703933, 0.9914827859, + 0.9940716137, 0.9958751909, 0.9971308601, 0.9980046622, + 0.9986125310, 0.9990353053, 0.9993292997, 0.9995337191, + 0.9996758446, 0.9997746542, 0.9998433469, 0.9998911009, + 0.9999242982, 0.9999473758, 0.9999634183, 0.9999745703, + 0.9999823226, 0.9999877117, 0.9999914578, 0.9999940619, + 0.9999958722, 0.9999971306, 0.9999980053, 0.9999986134, + 0.9999990361, 0.9999993300, 0.9999995342, 0.9999996762, + 0.9999997749}, + kQuantizedToleranceInt16))); +} + +TEST_P(TanhOpTest, TanhInt16General) { + const float kMin = -1; + const float kMax = 32767.f / 32768.f; + QuantizedActivationsOpModel m( + GetRegistration(), BuiltinOperator_TANH, + /*input=*/{TensorType_INT16, {6}, 11 * kMin, 11 * kMax}, + /*output=*/{TensorType_INT16, {5}, kMin, kMax}); + m.SetInput({-10, -4, 0, 6, 7.0909090909, 8}); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + 
{-0.999969, -0.99408, 0, 0.999664, 0.999939, 0.999969}, kQuantizedToleranceInt16))); } @@ -905,20 +959,74 @@ TEST_P(LogisticOpTest, SigmoidInt16) { const float kMax = 32767.f / 32768.f; QuantizedActivationsOpModel m( GetRegistration(), BuiltinOperator_LOGISTIC, - /*input=*/{TensorType_INT16, {1, 2, 6, 1}, 8 * kMin, 8 * kMax}, - /*output=*/{TensorType_INT16, {1, 2, 6, 1}, kMin, kMax}); - m.SetInput({0, -6, 2, 4, // - 3, -2, 8, 1, // - 5, -8, 7, -3}); + /*input=*/{TensorType_INT16, {89}, 8 * kMin, 8 * kMax}, + /*output=*/{TensorType_INT16, {89}, kMin, kMax}); + m.SetInput( + {-10.0000000000, -9.7727272727, -9.5454545455, -9.3181818182, + -9.0909090909, -8.8636363636, -8.6363636364, -8.4090909091, + -8.1818181818, -7.9545454545, -7.7272727273, -7.5000000000, + -7.2727272727, -7.0454545455, -6.8181818182, -6.5909090909, + -6.3636363636, -6.1363636364, -5.9090909091, -5.6818181818, + -5.4545454545, -5.2272727273, -5.0000000000, -4.7727272727, + -4.5454545455, -4.3181818182, -4.0909090909, -3.8636363636, + -3.6363636364, -3.4090909091, -3.1818181818, -2.9545454545, + -2.7272727273, -2.5000000000, -2.2727272727, -2.0454545455, + -1.8181818182, -1.5909090909, -1.3636363636, -1.1363636364, + -0.9090909091, -0.6818181818, -0.4545454545, -0.2272727273, + 0.0000000000, 0.2272727273, 0.4545454545, 0.6818181818, + 0.9090909091, 1.1363636364, 1.3636363636, 1.5909090909, + 1.8181818182, 2.0454545455, 2.2727272727, 2.5000000000, + 2.7272727273, 2.9545454545, 3.1818181818, 3.4090909091, + 3.6363636364, 3.8636363636, 4.0909090909, 4.3181818182, + 4.5454545455, 4.7727272727, 5.0000000000, 5.2272727273, + 5.4545454545, 5.6818181818, 5.9090909091, 6.1363636364, + 6.3636363636, 6.5909090909, 6.8181818182, 7.0454545455, + 7.2727272727, 7.5000000000, 7.7272727273, 7.9545454545, + 8.1818181818, 8.4090909091, 8.6363636364, 8.8636363636, + 9.0909090909, 9.3181818182, 9.5454545455, 9.7727272727, + 10.0000000000}); m.Invoke(); - EXPECT_THAT(m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear( - { - 0.5, 0.002473, 0.880797, 0.982014, // - 0.952574, 0.119203, 0.9995, 0.731059, // - 0.993307, 0.0003535, 0.999089, 0.047426 // - }, - kQuantizedToleranceInt16))); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + {0.0000453979, 0.0000569815, 0.0000715205, 0.0000897689, 0.0001126729, + 0.0001414198, 0.0001774998, 0.0002227827, 0.0002796147, 0.0003509396, + 0.0004404502, 0.0005527786, 0.0006937345, 0.0008706021, 0.0010925128, + 0.0013709094, 0.0017201256, 0.0021581065, 0.0027073042, 0.0033957870, + 0.0042586071, 0.0053394826, 0.0066928509, 0.0083863576, 0.0105038445, + 0.0131488902, 0.0164489307, 0.0205599431, 0.0256715863, 0.0320125562, + 0.0398556989, 0.0495221198, 0.0613831074, 0.0758581800, 0.0934070047, + 0.1145124805, 0.1396521834, 0.1692560327, 0.2036499335, 0.2429886272, + 0.2871859014, 0.3358556241, 0.3882805886, 0.4434251301, 0.5000000000, + 0.5565748699, 0.6117194114, 0.6641443759, 0.7128140986, 0.7570113728, + 0.7963500665, 0.8307439673, 0.8603478166, 0.8854875195, 0.9065929953, + 0.9241418200, 0.9386168926, 0.9504778802, 0.9601443011, 0.9679874438, + 0.9743284137, 0.9794400569, 0.9835510693, 0.9868511098, 0.9894961555, + 0.9916136424, 0.9933071491, 0.9946605174, 0.9957413929, 0.9966042130, + 0.9972926958, 0.9978418935, 0.9982798744, 0.9986290906, 0.9989074872, + 0.9991293979, 0.9993062655, 0.9994472214, 0.9995595498, 0.9996490604, + 0.9997203853, 0.9997772173, 0.9998225002, 0.9998585802, 0.9998873271, + 0.9999102311, 0.9999284795, 0.9999430185, 0.9999546021}, + 
kQuantizedToleranceInt16))); +} + +TEST_P(LogisticOpTest, SigmoidInt16General) { + const float kMin = -1; + const float kMax = 32767.f / 32768.f; + QuantizedActivationsOpModel m( + GetRegistration(), BuiltinOperator_LOGISTIC, + /*input=*/{TensorType_INT16, {8}, 10 * kMin, 10 * kMax}, + /*output=*/{TensorType_INT16, {8}, kMin, kMax}); + m.SetInput({ + 0, -6, 2, 4, // + 3, -2, 10, 1, // + }); + m.Invoke(); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({0.5, 0.00814819, 0.832031, 0.960846, // + 0.916809, 0.167969, 0.999664, 0.689972}, + kQuantizedToleranceInt16))); } TEST(FloatActivationsOpTest, Softmax4D) { diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h index aa626f43f19..39654e542f2 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h @@ -58,12 +58,15 @@ inline void Logistic(int32_t input_zero_point, int32_t input_range_radius, } } -inline void Logistic(int32_t input_size, const int16_t* ptr_input_data, - int16_t* ptr_output_data) { +inline void Logistic(int32_t input_multiplier, int32_t input_size, + const int16_t* ptr_input_data, int16_t* ptr_output_data) { // We use the LUT for sigmoid and take into account, that // tanh(x) = 2*sigmoid(2*x) - 1 + + int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1; + for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) { - int32_t input_data = *ptr_input_data; + int32_t input_data = (*ptr_input_data)*input_data_mul; // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and // we do interpolation on unsigned values. diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h index 8c07c6f6d6c..58e36f955d0 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h @@ -57,12 +57,16 @@ inline void Tanh(int32_t input_zero_point, int32_t input_range_radius, } } -inline void Tanh(int32_t input_left_shift, int32_t input_size, - const int16_t* ptr_input_data, int16_t* ptr_output_data) { +inline void Tanh(int32_t input_multiplier, int32_t input_left_shift, + int32_t input_size, const int16_t* ptr_input_data, + int16_t* ptr_output_data) { // We use the LUT for sigmoid and take into account, that // tanh(x) = 2*sigmoid(2*x) - 1 + + int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1; + for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) { - int32_t input_data = *ptr_input_data; + int32_t input_data = (*ptr_input_data)*input_data_mul; if (input_left_shift == 1) { input_data <<= 1; From c5398ad90fb5aa804269828dc907d8425b4c187f Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Wed, 8 Apr 2020 10:50:26 +0100 Subject: [PATCH 0022/1533] Versioning for activation functions Tanh/Sigmoid. 
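The general-scale int16 Tanh/Logistic kernels introduced by the previous change require a newer runtime, so models using them must report a higher op version. A rough sketch of the intended behaviour, assuming the OpSignature fields already used in op_version.cc (illustration only, not part of the patch):

    // Hypothetical check of the new versioning rule (field names per
    // the surrounding code in op_version.cc).
    OpSignature sig;
    sig.op = BuiltinOperator_TANH;  // likewise for BuiltinOperator_LOGISTIC
    sig.input_types = {TensorType_INT16};
    sig.output_types = {TensorType_INT16};
    // GetBuiltinOperatorVersion(sig) is expected to return 3;
    // int8 inputs keep version 2 and float inputs keep version 1.
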
--- tensorflow/lite/kernels/activations.cc | 6 ------ tensorflow/lite/kernels/register.cc | 4 ++-- tensorflow/lite/toco/tflite/op_version.cc | 2 ++ tensorflow/lite/tools/versioning/op_version.cc | 14 ++++++++++++-- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index daa4c24f7be..cd16bccc062 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -818,9 +818,6 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); - const int16_t* ptr_input_data = GetTensorData(input); - int16_t* ptr_output_data = GetTensorData(output); - reference_integer_ops::Tanh( data->input_multiplier, data->input_left_shift, size, GetTensorData(input), GetTensorData(output)); @@ -896,9 +893,6 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) { const int size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); - int16_t* ptr_output_data = GetTensorData(output); - const int16_t* ptr_input_data = GetTensorData(input); - reference_integer_ops::Logistic(data->input_multiplier, size, GetTensorData(input), GetTensorData(output)); diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index 8c1f6b4a9e7..faa4d818e0a 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -39,10 +39,10 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_RELU6, Register_RELU6(), /* min_version = */ 1, /* max_version = */ 2); AddBuiltin(BuiltinOperator_TANH, Register_TANH(), /* min_version = */ 1, - /* max_version = */ 2); + /* max_version = */ 3); AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC(), /* min_version = */ 1, - /* max_version = */ 2); + /* max_version = */ 3); AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D(), /* min_version = */ 1, /* max_version = */ 2); diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc index 1c699410a3e..ac9d575ab64 100644 --- a/tensorflow/lite/toco/tflite/op_version.cc +++ b/tensorflow/lite/toco/tflite/op_version.cc @@ -158,6 +158,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kSlice, 3}, "1.14.0"}, {{OperatorType::kTanh, 1}, "1.14.0"}, {{OperatorType::kTanh, 2}, "1.14.0"}, + {{OperatorType::kTanh, 3}, kPendingReleaseOpVersion}, {{OperatorType::kOneHot, 1}, "1.11.0"}, {{OperatorType::kCTCBeamSearchDecoder, 1}, "1.11.0"}, {{OperatorType::kUnpack, 1}, "1.11.0"}, @@ -167,6 +168,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kLeakyRelu, 1}, "1.13.1"}, {{OperatorType::kLogistic, 1}, "1.14.0"}, {{OperatorType::kLogistic, 2}, "1.14.0"}, + {{OperatorType::kLogistic, 3}, kPendingReleaseOpVersion}, {{OperatorType::kLogSoftmax, 1}, "1.14.0"}, {{OperatorType::kLogSoftmax, 2}, "1.14.0"}, {{OperatorType::kSquaredDifference, 1}, "1.13.1"}, diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index 7b68e9d698c..7a10dc5637f 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -387,6 +387,18 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 1; + case BuiltinOperator_TANH: + case BuiltinOperator_LOGISTIC: + if (op_sig.input_types.at(0) == TensorType_INT16 && + op_sig.output_types.at(0) == 
TensorType_INT16) { + return 3; + } + + if (op_sig.input_types.at(0) == TensorType_INT8) { + return 2; + } + return 1; + case BuiltinOperator_AVERAGE_POOL_2D: case BuiltinOperator_ADD: case BuiltinOperator_CONCATENATION: @@ -402,8 +414,6 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { case BuiltinOperator_REDUCE_MIN: case BuiltinOperator_RELU6: case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: - case BuiltinOperator_TANH: - case BuiltinOperator_LOGISTIC: case BuiltinOperator_LOG_SOFTMAX: case BuiltinOperator_TOPK_V2: case BuiltinOperator_ARG_MAX: From 56cea6e30cdec9a12832def2ad97634101d8c784 Mon Sep 17 00:00:00 2001 From: JLZ Date: Thu, 9 Apr 2020 10:26:08 +0200 Subject: [PATCH 0023/1533] TFLite makefile build rule for compile label_image example. label_image is not build by default. You must build it expressly using the command ./tensorflow/lite/tools/make/build_aarch64_lib.sh label_image --- tensorflow/lite/tools/make/Makefile | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index 7a77cf2b3f5..186b2ece16a 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -97,6 +97,11 @@ BENCHMARK_PERF_OPTIONS_BINARY_NAME := benchmark_model_performance_options MINIMAL_SRCS := \ tensorflow/lite/examples/minimal/minimal.cc +LABEL_IMAGE_SRCS := \ + tensorflow/lite/examples/label_image/bitmap_helpers.cc \ + tensorflow/lite/examples/label_image/label_image.cc \ + tensorflow/lite/tools/evaluation/utils.cc + # What sources we want to compile, must be kept in sync with the main Bazel # build files. @@ -260,6 +265,7 @@ BENCHMARK_LIB := $(LIBDIR)$(BENCHMARK_LIB_NAME) BENCHMARK_BINARY := $(BINDIR)$(BENCHMARK_BINARY_NAME) BENCHMARK_PERF_OPTIONS_BINARY := $(BINDIR)$(BENCHMARK_PERF_OPTIONS_BINARY_NAME) MINIMAL_BINARY := $(BINDIR)minimal +LABEL_IMAGE_BINARY := $(BINDIR)label_image CXX := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}g++ CC := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}gcc @@ -268,6 +274,9 @@ AR := $(CC_PREFIX)${TARGET_TOOLCHAIN_PREFIX}ar MINIMAL_OBJS := $(addprefix $(OBJDIR), \ $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MINIMAL_SRCS)))) +LABEL_IMAGE_OBJS := $(addprefix $(OBJDIR), \ +$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(LABEL_IMAGE_SRCS)))) + LIB_OBJS := $(addprefix $(OBJDIR), \ $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(patsubst %.cpp,%.o,$(TF_LITE_CC_SRCS))))) @@ -322,6 +331,14 @@ $(MINIMAL_BINARY): $(MINIMAL_OBJS) $(LIB_PATH) minimal: $(MINIMAL_BINARY) +$(LABEL_IMAGE_BINARY): $(LABEL_IMAGE_OBJS) $(LIB_PATH) + @mkdir -p $(dir $@) + $(CXX) $(CXXFLAGS) $(INCLUDES) \ + -o $(LABEL_IMAGE_BINARY) $(LABEL_IMAGE_OBJS) \ + $(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS) + +label_image: $(LABEL_IMAGE_BINARY) + $(BENCHMARK_LIB) : $(LIB_PATH) $(BENCHMARK_LIB_OBJS) @mkdir -p $(dir $@) $(AR) $(ARFLAGS) $(BENCHMARK_LIB) $(LIB_OBJS) $(BENCHMARK_LIB_OBJS) From 5e4ce4f0776772798cbe0036b3b42a4aa416fabe Mon Sep 17 00:00:00 2001 From: Marcin Sielski Date: Mon, 13 Apr 2020 16:46:53 +0200 Subject: [PATCH 0024/1533] Fix a bug related to build TF Lite on RPI Zero. Why: * Enable to build TF Lite on RPI Zero. This change addresses the need by: * Changing compiler from arm-linux-gnueabi- to arm-linux-gnueabihf-. 
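For reference, the armv6 branch changed below is selected through the Makefile variables visible in the hunk; a typical cross-compile invocation would look roughly like the following (sketch only, assuming the arm-linux-gnueabihf- toolchain is on PATH):

    make -f tensorflow/lite/tools/make/Makefile TARGET=rpi TARGET_ARCH=armv6 -j4
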
--- tensorflow/lite/tools/make/targets/rpi_makefile.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/make/targets/rpi_makefile.inc b/tensorflow/lite/tools/make/targets/rpi_makefile.inc index 2225848ae64..71046d08131 100644 --- a/tensorflow/lite/tools/make/targets/rpi_makefile.inc +++ b/tensorflow/lite/tools/make/targets/rpi_makefile.inc @@ -32,7 +32,7 @@ ifeq ($(TARGET),rpi) # TODO(petewarden) In the future, we'll want to use OpenBLAS as a faster # alternative to Eigen on non-NEON ARM hardware like armv6. ifeq ($(TARGET_ARCH), armv6) - TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabi- + TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabihf- CXXFLAGS += \ -march=armv6 \ -mfpu=vfp \ From 7dc937f063d6b67bca070a8d7c0368ec01bfb96c Mon Sep 17 00:00:00 2001 From: Edgar Liberis Date: Wed, 15 Apr 2020 09:39:32 +0100 Subject: [PATCH 0025/1533] Reset best weights in EarlyStopping.on_train_begin EarlyStopping callback instances can be reused in different training loops, but the `on_train_begin` hook forgets to reset the `self.best_weights` state. --- tensorflow/python/keras/callbacks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 2d178bb6409..e0cea492664 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -1524,6 +1524,7 @@ class EarlyStopping(Callback): self.best = self.baseline else: self.best = np.Inf if self.monitor_op == np.less else -np.Inf + self.best_weights = None def on_epoch_end(self, epoch, logs=None): current = self.get_monitor_value(logs) From 966ed1cafc770e81e6a56be3f5715e0fe257b742 Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Thu, 16 Apr 2020 18:41:20 +0800 Subject: [PATCH 0026/1533] Use provided host name/ip instead of localhost if possible --- .../distributed_runtime/rpc/grpc_server_lib.cc | 15 +++++++++++---- .../distributed_runtime/rpc/grpc_server_lib.h | 5 ++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 32083fc272f..7e2c42dabea 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -132,8 +132,9 @@ GrpcServer::~GrpcServer() { void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {} // Look up the port that has been requested for this task in `server_def`. -Status GrpcServer::GetPort(const ServerDef& server_def, int* port) const { +Status GrpcServer::GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const { *port = -1; + *host_name = "localhost"; for (const auto& job : server_def.cluster().job()) { if (job.name() == server_def.job_name()) { auto iter = job.tasks().find(server_def.task_index()); @@ -153,6 +154,10 @@ Status GrpcServer::GetPort(const ServerDef& server_def, int* port) const { "Could not parse port for local server from \"", iter->second, "\"."); } + + if (colon_index != string::npos && !iter->second.substr(0, colon_index).empty()) { + *host_name = iter->second.substr(0, colon_index); + } } break; } @@ -175,7 +180,9 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { // otherwise if 'task_index=-1' the program will abort. 
int requested_port; - TF_RETURN_IF_ERROR(GetPort(server_def_, &requested_port)); + string host_name; + TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name, &requested_port)); + host_name_ = host_name; SessionOptions sess_opts; ConfigProto config = server_def_.default_session_config(); @@ -325,7 +332,7 @@ Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options, task.second); } if (job.name() == *options.job_name && task.first == options.task_index) { - host_port = strings::StrCat("localhost:", bound_port_); + host_port = strings::StrCat(host_name_, ":", bound_port_); } else { host_port = task.second; } @@ -478,7 +485,7 @@ Status GrpcServer::Join() { } const string GrpcServer::target() const { - return strings::StrCat("grpc://localhost:", bound_port_); + return strings::StrCat("grpc://", host_name_, ":", bound_port_); } std::shared_ptr<::grpc::ServerCredentials> GrpcServer::GetServerCredentials( diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index 8e25b8835eb..feb174cde4e 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -104,7 +104,7 @@ class GrpcServer : public ServerInterface { Status UpdateServerDef(const ServerDef& server_def); protected: - virtual Status GetPort(const ServerDef& server_def, int* port) const; + virtual Status GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const; Status Init(const GrpcServerOptions& opts = GrpcServerOptions()); // A subclass can override this method to support secure credentials. @@ -136,6 +136,9 @@ class GrpcServer : public ServerInterface { // The port to which this server is bound. int bound_port_ = 0; + // The host name of this server + string host_name_; + // Guards server configuration, server, and state. mutex mu_; From 7d76b0ac7f76a9f4e1d7005ee9b28fce0f0c0451 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 20 Apr 2020 11:59:57 +0100 Subject: [PATCH 0027/1533] Addressed review comments. --- tensorflow/lite/kernels/activations.cc | 12 ++++++------ .../internal/reference/integer_ops/logistic.h | 2 +- .../kernels/internal/reference/integer_ops/tanh.h | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index cd16bccc062..bf1ff8d50d7 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -430,15 +430,15 @@ TfLiteStatus TanhPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); int input_scale_log2_rounded; - bool paramScalePOT = + bool param_scale_pot = CheckedLog2(input->params.scale, &input_scale_log2_rounded); data->input_left_shift = (15 - kInputIntegerBits) + input_scale_log2_rounded; - paramScalePOT &= + param_scale_pot &= (data->input_left_shift == 0 || data->input_left_shift == 1); - if (!paramScalePOT) { + if (!param_scale_pot) { // In case of general scale parameter, we need to do a rescaling. 
// Magic constant 4096: // We need to scale down to (-2^3, 2^3) / 3 is kInputIntegerBits/ interval @@ -526,14 +526,14 @@ TfLiteStatus SigmoidPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); int input_scale_log2_rounded; - bool paramScalePOT = + bool param_scale_pot = CheckedLog2(input->params.scale, &input_scale_log2_rounded); data->input_left_shift = (15 - kInputIntegerBits) + input_scale_log2_rounded; - paramScalePOT &= (data->input_left_shift == 0); + param_scale_pot &= (data->input_left_shift == 0); - if (!paramScalePOT) { + if (!param_scale_pot) { // In case of general scale parameter, we need to do a rescaling. // Magic constant 4096: // We need to scale down to (-2^3, 2^3) / 3 is kInputIntegerBits/ interval diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h index 39654e542f2..e315683c0cd 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h @@ -66,7 +66,7 @@ inline void Logistic(int32_t input_multiplier, int32_t input_size, int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1; for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) { - int32_t input_data = (*ptr_input_data)*input_data_mul; + int32_t input_data = (*ptr_input_data) * input_data_mul; // Scale by 3/4 to expand range [-8,8]->[-10.7,10.7] and // we do interpolation on unsigned values. diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h index 58e36f955d0..baae65ab30e 100644 --- a/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h +++ b/tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h @@ -66,7 +66,7 @@ inline void Tanh(int32_t input_multiplier, int32_t input_left_shift, int32_t input_data_mul = (input_multiplier > 0) ? input_multiplier : 1; for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++) { - int32_t input_data = (*ptr_input_data)*input_data_mul; + int32_t input_data = (*ptr_input_data) * input_data_mul; if (input_left_shift == 1) { input_data <<= 1; From a4d22c8d01610b4d6c5c5d1017755c04f86429c1 Mon Sep 17 00:00:00 2001 From: "tongxuan.ltx" Date: Mon, 20 Apr 2020 19:09:47 +0800 Subject: [PATCH 0028/1533] Support options(environment variable) to enable grpc reuse port. ReusePort scenario: parent process occupies the port, then share the port through service such as ZooKeeper, and then child process (TensorFlow process) reuse the port. --- .../rpc/grpc_server_lib.cc | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 32083fc272f..aba9fe03d40 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -70,6 +70,18 @@ class NoReusePortOption : public ::grpc::ServerBuilderOption { plugins) override {} }; +// Define an option subclass in order to disable SO_REUSEPORT for the +// server socket. 
+class ReusePortOption : public ::grpc::ServerBuilderOption { + public: + void UpdateArguments(::grpc::ChannelArguments* args) override { + args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 1); + } + + void UpdatePlugins(std::vector>* + plugins) override {} +}; + // static utility function RendezvousMgrInterface* NewRpcRendezvousMgr(const WorkerEnv* env) { return new RpcRendezvousMgr(env); @@ -220,8 +232,14 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { GetServerCredentials(server_def_), &bound_port_); builder.SetMaxMessageSize(std::numeric_limits::max()); - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); + bool reuse_port = false; + ReadBoolFromEnvVar("TF_GRPC_REUSE_PORT", false, &reuse_port) + .IgnoreError(); + auto server_build_option = reuse_port ? + std::unique_ptr<::grpc::ServerBuilderOption>(new ReusePortOption) : + std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption); + builder.SetOption(server_build_option); + // Allow subclasses to specify more args to pass to the gRPC server. MaybeMutateBuilder(&builder); master_impl_ = CreateMaster(&master_env_); From bbe13474e71eb2694be5050e22759d3bc5307026 Mon Sep 17 00:00:00 2001 From: "tongxuan.ltx" Date: Mon, 20 Apr 2020 23:12:36 +0800 Subject: [PATCH 0029/1533] fix typo --- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index aba9fe03d40..6555ded82da 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -70,7 +70,7 @@ class NoReusePortOption : public ::grpc::ServerBuilderOption { plugins) override {} }; -// Define an option subclass in order to disable SO_REUSEPORT for the +// Define an option subclass in order to enable SO_REUSEPORT for the // server socket. class ReusePortOption : public ::grpc::ServerBuilderOption { public: From 8a25f427db3d3dc5c9ddffc775b4c7dd4a96a6f9 Mon Sep 17 00:00:00 2001 From: Teng Lu Date: Fri, 17 Apr 2020 16:36:57 +0800 Subject: [PATCH 0030/1533] Enabe BF16 SoftmaxGrad(Sum), and fix accuracy by accum type. --- tensorflow/core/kernels/reduction_ops.h | 25 ++++++++++++++++++++++++- tensorflow/core/ops/nn_grad.cc | 4 ++++ tensorflow/python/ops/math_ops_test.py | 10 ++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h index 46d8051fff1..8814a2eb467 100644 --- a/tensorflow/core/kernels/reduction_ops.h +++ b/tensorflow/core/kernels/reduction_ops.h @@ -19,9 +19,9 @@ limitations under the License. // Functor definitions for Reduction ops, must be compilable by nvcc. #include -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { namespace functor { @@ -58,6 +58,29 @@ struct ReduceEigenImpl { } }; +// Specialization for BF16 Reducer to fix accuracy. +// TODO: all BF16 Reducer should have specialization to fix accuracy. 
+#define CASTING_SPECIALIZATION(Reducer, ScalarType, IntermediateType) \ + template \ + struct ReduceEigenImpl> { \ + void operator()(const Device& d, OUT_T out, IN_T in, \ + const ReductionAxes& reduction_axes, \ + const Reducer& reducer) { \ + static_assert(std::is_same::value, \ + ""); \ + Reducer intermediate_reducer; \ + auto in_as_intermediate = in.template cast(); \ + out.device(d) = \ + in_as_intermediate.reduce(reduction_axes, intermediate_reducer) \ + .template cast(); \ + } \ + }; + +CASTING_SPECIALIZATION(Eigen::internal::SumReducer, bfloat16, float); +#undef CASTING_SPECIALIZATION + template struct ReduceEigenImpl Date: Thu, 23 Apr 2020 10:13:45 +0800 Subject: [PATCH 0031/1533] fix build break --- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 6555ded82da..b2efed619a4 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -238,7 +238,7 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { auto server_build_option = reuse_port ? std::unique_ptr<::grpc::ServerBuilderOption>(new ReusePortOption) : std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption); - builder.SetOption(server_build_option); + builder.SetOption(std::move(server_build_option)); // Allow subclasses to specify more args to pass to the gRPC server. MaybeMutateBuilder(&builder); From c87915e09e5d9db8eae7b017307fc4b22862acc9 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 25 Apr 2020 13:41:02 +0100 Subject: [PATCH 0032/1533] Fix group convolution docstring --- tensorflow/python/keras/layers/convolutional.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 9c0eedac29d..2ae59d7583f 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -86,7 +86,7 @@ class Conv(Layer): channels, and producing half the output channels, and both subsequently concatenated. - At `groups=input_channels`, each input channel is convolved with its - own set of filters, of size `input_channels / filters` + own set of filters, of size `filters / input_channels` activation: Activation function to use. If you don't specify anything, no activation is applied. use_bias: Boolean, whether the layer uses a bias. @@ -407,7 +407,7 @@ class Conv1D(Conv): channels, and producing half the output channels, and both subsequently concatenated. - At `groups=input_channels`, each input channel is convolved with its - own set of filters, of size `input_channels / filters` + own set of filters, of size `filters / input_channels` dilation_rate: an integer or tuple/list of a single integer, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is @@ -569,7 +569,7 @@ class Conv2D(Conv): channels, and producing half the output channels, and both subsequently concatenated. - At `groups=input_channels`, each input channel is convolved with its - own set of filters, of size `input_channels / filters` + own set of filters, of size `filters / input_channels` activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). 
@@ -719,7 +719,7 @@ class Conv3D(Conv): channels, and producing half the output channels, and both subsequently concatenated. - At `groups=input_channels`, each input channel is convolved with its - own set of filters, of size `input_channels / filters` + own set of filters, of size `filters / input_channels` activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). From 389fba4b3c9337ae2930b605f8cae3891c8b60d7 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 25 Apr 2020 17:46:16 +0100 Subject: [PATCH 0033/1533] Improve group convolution docstring --- .../python/keras/layers/convolutional.py | 60 +++++++------------ 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 2ae59d7583f..ac0d06c011b 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -77,16 +77,11 @@ class Conv(Layer): the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. - groups: Integer, the number of channel groups controlling the connections - between inputs and outputs. Input channels and `filters` must both be - divisible by `groups`. For example, - - At `groups=1`, all inputs are convolved to all outputs. - - At `groups=2`, the operation becomes equivalent to having two - convolutional layers side by side, each seeing half the input - channels, and producing half the output channels, and both - subsequently concatenated. - - At `groups=input_channels`, each input channel is convolved with its - own set of filters, of size `filters / input_channels` + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. + Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied. use_bias: Boolean, whether the layer uses a bias. @@ -398,20 +393,15 @@ class Conv1D(Conv): 2.1](https://arxiv.org/abs/1609.03499). data_format: A string, one of `channels_last` (default) or `channels_first`. - groups: Integer, the number of channel groups controlling the connections - between inputs and outputs. Input channels and `filters` must both be - divisible by `groups`. For example, - - At `groups=1`, all inputs are convolved to all outputs. - - At `groups=2`, the operation becomes equivalent to having two - convolutional layers side by side, each seeing half the input - channels, and producing half the output channels, and both - subsequently concatenated. - - At `groups=input_channels`, each input channel is convolved with its - own set of filters, of size `filters / input_channels` dilation_rate: an integer or tuple/list of a single integer, specifying the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. 
+ Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). @@ -560,16 +550,11 @@ class Conv2D(Conv): all spatial dimensions. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. - groups: Integer, the number of channel groups controlling the connections - between inputs and outputs. Input channels and `filters` must both be - divisible by `groups`. For example, - - At `groups=1`, all inputs are convolved to all outputs. - - At `groups=2`, the operation becomes equivalent to having two - convolutional layers side by side, each seeing half the input - channels, and producing half the output channels, and both - subsequently concatenated. - - At `groups=input_channels`, each input channel is convolved with its - own set of filters, of size `filters / input_channels` + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. + Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). @@ -710,16 +695,11 @@ class Conv3D(Conv): all spatial dimensions. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. - groups: Integer, the number of channel groups controlling the connections - between inputs and outputs. Input channels and `filters` must both be - divisible by `groups`. For example, - - At `groups=1`, all inputs are convolved to all outputs. - - At `groups=2`, the operation becomes equivalent to having two - convolutional layers side by side, each seeing half the input - channels, and producing half the output channels, and both - subsequently concatenated. - - At `groups=input_channels`, each input channel is convolved with its - own set of filters, of size `filters / input_channels` + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. + Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). From a610493e778f2badf8f2674c9933d0807d15b4bb Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 26 Apr 2020 18:36:07 -0700 Subject: [PATCH 0034/1533] Update examples in docstring to use TF 2.x code The examples in docstrings of two APIs, tf.histogram_fixed_width_bins and tf.histogram_fixed_width still used TF 1.x code. This PR updates the docstring to use TF 2.x code in examples. 
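Roughly, the updated examples follow the eager TF 2.x style sketched below (illustration only; the input values and expected results are the ones already used in the docstrings):

    import tensorflow as tf

    value_range = [0.0, 5.0]
    new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]

    # Bins: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
    tf.histogram_fixed_width_bins(new_values, value_range, nbins=5)  # [0, 0, 1, 2, 4, 4]
    tf.histogram_fixed_width(new_values, value_range, nbins=5)       # [2, 1, 1, 0, 2]
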
Signed-off-by: Yong Tang --- tensorflow/python/ops/histogram_ops.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 92f3e7a24ba..009f9f63f48 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -68,10 +68,8 @@ def histogram_fixed_width_bins(values, value_range = [0.0, 5.0] new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - with tf.compat.v1.get_default_session() as sess: - indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) - variables.global_variables_initializer().run() - sess.run(indices) # [0, 0, 1, 2, 4, 4] + indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) + indices # [0, 0, 1, 2, 4, 4] ``` """ with ops.name_scope(name, 'histogram_fixed_width_bins', @@ -137,10 +135,8 @@ def histogram_fixed_width(values, value_range = [0.0, 5.0] new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - with tf.compat.v1.get_default_session() as sess: - hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) - variables.global_variables_initializer().run() - sess.run(hist) => [2, 1, 1, 0, 2] + hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) + hist # [2, 1, 1, 0, 2] ``` """ with ops.name_scope(name, 'histogram_fixed_width', From c7a16159f71bc5beb9a3fc35cc97a9e5b9f94d40 Mon Sep 17 00:00:00 2001 From: Kayou Date: Mon, 27 Apr 2020 14:18:08 +0200 Subject: [PATCH 0035/1533] Update check_cuda_libs.py --- third_party/gpus/check_cuda_libs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/gpus/check_cuda_libs.py b/third_party/gpus/check_cuda_libs.py index b7b36e6466e..728d178afec 100644 --- a/third_party/gpus/check_cuda_libs.py +++ b/third_party/gpus/check_cuda_libs.py @@ -59,7 +59,7 @@ def check_cuda_lib(path, check_soname=True): objdump = which("objdump") if check_soname and objdump is not None and not _is_windows(): # Decode is necessary as in py3 the return type changed from str to bytes - output = subprocess.check_output([objdump, "-p", path]).decode("ascii") + output = subprocess.check_output([objdump, "-p", path]).decode("utf-8") output = [line for line in output.splitlines() if "SONAME" in line] sonames = [line.strip().split(" ")[-1] for line in output] if not any([soname == os.path.basename(path) for soname in sonames]): From 84d354f945a85368b1dcece6c203fc5538ff2fab Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 31 Mar 2020 22:45:32 +0100 Subject: [PATCH 0036/1533] Remove expired forward compatibility horizons --- .../compiler/tests/fused_batchnorm_test.py | 4 -- .../python/keras/layers/normalization.py | 2 - .../python/ops/nn_fused_batchnorm_test.py | 9 +---- tensorflow/python/ops/nn_impl.py | 40 ++++++------------- 4 files changed, 14 insertions(+), 41 deletions(-) diff --git a/tensorflow/compiler/tests/fused_batchnorm_test.py b/tensorflow/compiler/tests/fused_batchnorm_test.py index 6a9076e9be8..a36effe5984 100644 --- a/tensorflow/compiler/tests/fused_batchnorm_test.py +++ b/tensorflow/compiler/tests/fused_batchnorm_test.py @@ -23,7 +23,6 @@ import numpy as np from tensorflow.compiler.tests import test_utils from tensorflow.compiler.tests import xla_test -from tensorflow.python.compat import compat from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import gradient_checker @@ -132,9 +131,6 @@ class FusedBatchNormTest(xla_test.XLATestCase, parameterized.TestCase): def 
_testLearning(self, use_gradient_checker, data_format, exponential_avg_factor): - if not compat.forward_compatible(2020, 3, - 6) and exponential_avg_factor != 1.0: - self.skipTest("running average not available.") channel = 3 x_shape = [2, 2, 6, channel] scale_shape = [channel] diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py index 97da2954b65..e1fb21c76ef 100644 --- a/tensorflow/python/keras/layers/normalization.py +++ b/tensorflow/python/keras/layers/normalization.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.compat import compat from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -547,7 +546,6 @@ class BatchNormalizationBase(Layer): # after fixing graph pattern matching and enabling fused_batch_norm to # take exponential_avg_factor as a tensor input. use_fused_avg_updates = ( - compat.forward_compatible(2020, 3, 6) and ops.executing_eagerly_outside_functions() and isinstance(self.momentum, (float, int)) and device_context.enclosing_tpu_context() is None) diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py index a809b768833..5497325f6c0 100644 --- a/tensorflow/python/ops/nn_fused_batchnorm_test.py +++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py @@ -20,7 +20,6 @@ from __future__ import print_function import numpy as np -from tensorflow.python.compat import compat from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util @@ -380,13 +379,7 @@ class BatchNormalizationTest(test.TestCase): use_gpu_vals = [False] if test.is_gpu_available(cuda_only=True): use_gpu_vals += [True] - factors = [ - 1.0, - ] - if compat.forward_compatible(2020, 3, 6): - factors += [ - 0.6, - ] + factors = [1.0, 0.6] for dtype in [np.float16, np.float32]: for use_gpu in use_gpu_vals: for data_format in ['NHWC', 'NCHW']: diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 8a3a620f765..e7e44a6d490 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -20,7 +20,6 @@ from __future__ import print_function import math -from tensorflow.python.compat import compat from tensorflow.python.distribute import distribution_strategy_context as ds from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -1458,7 +1457,7 @@ def batch_normalization(x, `tf.nn.moments(..., keepdims=False)` during training, or running averages thereof during inference. - See equation 11 in Algorithm 2 of source: + See equation 11 in Algorithm 2 of source: [Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy] (http://arxiv.org/abs/1502.03167). 
@@ -1589,31 +1588,18 @@ def fused_batch_norm( min_epsilon = 1.001e-5 epsilon = epsilon if epsilon > min_epsilon else min_epsilon - if compat.forward_compatible(2020, 3, 6): - y, running_mean, running_var, _, _, _ = gen_nn_ops.fused_batch_norm_v3( - x, - scale, - offset, - mean, - variance, - epsilon=epsilon, - exponential_avg_factor=exponential_avg_factor, - data_format=data_format, - is_training=is_training, - name=name) - return y, running_mean, running_var - else: - y, running_mean, running_var, _, _, _ = gen_nn_ops.fused_batch_norm_v3( - x, - scale, - offset, - mean, - variance, - epsilon=epsilon, - data_format=data_format, - is_training=is_training, - name=name) - return y, running_mean, running_var + y, running_mean, running_var, _, _, _ = gen_nn_ops.fused_batch_norm_v3( + x, + scale, + offset, + mean, + variance, + epsilon=epsilon, + exponential_avg_factor=exponential_avg_factor, + data_format=data_format, + is_training=is_training, + name=name) + return y, running_mean, running_var @tf_export(v1=["nn.batch_norm_with_global_normalization"]) From fe3a4bcf2f7d0be92b6b70de43cd05d61cb0e025 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 28 Apr 2020 09:00:24 -0700 Subject: [PATCH 0037/1533] Update tf.histogram_fixed_width docstring to comform to Python doctest Signed-off-by: Yong Tang --- tensorflow/python/ops/histogram_ops.py | 30 ++++++++++++++------------ 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 009f9f63f48..3ef711a838f 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -63,13 +63,14 @@ def histogram_fixed_width_bins(values, Examples: ```python - # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - nbins = 5 - value_range = [0.0, 5.0] - new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - - indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) - indices # [0, 0, 1, 2, 4, 4] + >>> # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) + ... + >>> nbins = 5 + >>> value_range = [0.0, 5.0] + >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] + >>> indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) + >>> print(indices) + tf.Tensor([0 0 1 2 4 4], shape=(6,), dtype=int32) ``` """ with ops.name_scope(name, 'histogram_fixed_width_bins', @@ -130,13 +131,14 @@ def histogram_fixed_width(values, Examples: ```python - # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - nbins = 5 - value_range = [0.0, 5.0] - new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - - hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) - hist # [2, 1, 1, 0, 2] + >>> # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) + ... 
+ >>> nbins = 5 + >>> value_range = [0.0, 5.0] + >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] + >>> hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) + >>> print(hist) + tf.Tensor([2 1 1 0 2], shape=(5,), dtype=int32) ``` """ with ops.name_scope(name, 'histogram_fixed_width', From 99b5405c610f82ad8958429987eeeea598898655 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 15 Jan 2020 10:44:02 -0800 Subject: [PATCH 0038/1533] Pydoc fixes for StructuredTensor --- .../ops/structured/structured_tensor.py | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/ops/structured/structured_tensor.py b/tensorflow/python/ops/structured/structured_tensor.py index a75364df659..0a24bba1cd9 100644 --- a/tensorflow/python/ops/structured/structured_tensor.py +++ b/tensorflow/python/ops/structured/structured_tensor.py @@ -62,21 +62,22 @@ class StructuredTensor(composite_tensor.CompositeTensor): ```python >>> # A scalar StructuredTensor describing a single person. - >>> s1 = tf.structured.constant({"age": 82, "nicknames": ["Bob", "Bobby"]}) - >>> print s1.shape - () - >>> print s1["age"] - tf.Tensor(82, shape=(), dtype=int32) + >>> s1 = StructuredTensor.from_pyval( + ... {"age": 82, "nicknames": ["Bob", "Bobby"]}) + >>> s1.shape + TensorShape([]) + >>> s1["age"] + >>> # A vector StructuredTensor describing three people. - >>> s2 = stf.struct.constant([ + >>> s2 = StructuredTensor.from_pyval([ ... {"age": 12, "nicknames": ["Josaphine"]}, ... {"age": 82, "nicknames": ["Bob", "Bobby"]}, - ... {"age": 82, "nicknames": ["Elmo"]}]) - >>> print s2.shape - (3,) - >>> print s2[0]["age"] - tf.Tensor(12, shape=(), dtype=int32) + ... {"age": 42, "nicknames": ["Elmo"]}]) + >>> s2.shape + TensorShape([3]) + >>> s2[0]["age"] + ``` ### Field Paths @@ -312,7 +313,7 @@ class StructuredTensor(composite_tensor.CompositeTensor): If `field_name` is a `string`, then it names a field directly owned by this `StructuredTensor`. If this `StructuredTensor` has shape `[D1...DN]`, then the returned tensor will have shape `[D1...DN, V1...VM]`, where the slice - `result[d1...dN]`contains the field value for the structure at + `result[d1...dN]` contains the field value for the structure at `self[d1...dN]`. If `field_name` is a `tuple` of `string`, then it specifies a path to a @@ -458,9 +459,9 @@ class StructuredTensor(composite_tensor.CompositeTensor): Requires that all fields are Eager tensors. - >>> print(StructuredTensor.from_fields( - ... {'a': [1, 2, 3]}, [3]).to_pyval()) - [{b'a': 1}, {b'a': 2}, {b'a': 3}] + >>> StructuredTensor.from_fields( + ... {'a': [1, 2, 3]}, [3]).to_pyval() + [{'a': 1}, {'a': 2}, {'a': 3}] Note that `StructuredTensor.from_pyval(pyval).to_pyval() == pyval`. @@ -496,7 +497,7 @@ class StructuredTensor(composite_tensor.CompositeTensor): def from_pyval(cls, pyval, typespec=None): """Constructs a StructuredTensor from a nested Python structure. - >>> print StructuredTensor.from_pyval( + >>> StructuredTensor.from_pyval( ... 
{'a': [1, 2, 3], 'b': [[4, 5], [6, 7]]}) From 58a378f9f608c942ffe66ba12cc85f8d8fc3e7a4 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 28 Apr 2020 12:49:13 -0700 Subject: [PATCH 0039/1533] Remove `print` in docstring as it causes discrepancy in doctest Signed-off-by: Yong Tang --- tensorflow/python/ops/histogram_ops.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 3ef711a838f..ffdd900ec71 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -68,9 +68,8 @@ def histogram_fixed_width_bins(values, >>> nbins = 5 >>> value_range = [0.0, 5.0] >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - >>> indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) - >>> print(indices) - tf.Tensor([0 0 1 2 4 4], shape=(6,), dtype=int32) + >>> tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) + ``` """ with ops.name_scope(name, 'histogram_fixed_width_bins', @@ -136,9 +135,8 @@ def histogram_fixed_width(values, >>> nbins = 5 >>> value_range = [0.0, 5.0] >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - >>> hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) - >>> print(hist) - tf.Tensor([2 1 1 0 2], shape=(5,), dtype=int32) + >>> tf.histogram_fixed_width(new_values, value_range, nbins=5) + ``` """ with ops.name_scope(name, 'histogram_fixed_width', From 3a8b6ba5c1c8c2111c53490eba3f0c1a07f2494a Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Wed, 29 Apr 2020 10:35:01 +0800 Subject: [PATCH 0040/1533] Edit according to PR comments --- .../core/distributed_runtime/rpc/grpc_server_lib.cc | 8 ++++---- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 7e2c42dabea..2cfdde5f56f 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -132,7 +132,9 @@ GrpcServer::~GrpcServer() { void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {} // Look up the port that has been requested for this task in `server_def`. -Status GrpcServer::GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const { +Status GrpcServer::GetHostAndPort(const ServerDef& server_def, + string* host_name, + int* port) const { *port = -1; *host_name = "localhost"; for (const auto& job : server_def.cluster().job()) { @@ -180,9 +182,7 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { // otherwise if 'task_index=-1' the program will abort. 
  int requested_port;
-  string host_name;
-  TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name, &requested_port));
-  host_name_ = host_name;
+  TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name_, &requested_port));

   SessionOptions sess_opts;
   ConfigProto config = server_def_.default_session_config();
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index feb174cde4e..8ecf0e158bf 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -104,7 +104,9 @@ class GrpcServer : public ServerInterface {
   Status UpdateServerDef(const ServerDef& server_def);

  protected:
-  virtual Status GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const;
+  virtual Status GetHostAndPort(const ServerDef& server_def,
+                                string* host_name,
+                                int* port) const;
   Status Init(const GrpcServerOptions& opts = GrpcServerOptions());

   // A subclass can override this method to support secure credentials.

From 1c1203f4566d085f1ca8fd37c8313bb7b00170b1 Mon Sep 17 00:00:00 2001
From: Ajay P
Date: Wed, 29 Apr 2020 06:10:49 +0000
Subject: [PATCH 0041/1533] Fixed eager mode gradient checkpointing by
 eliminating unnecessary persistence of intermediate activations in memory

---
 tensorflow/python/ops/custom_gradient.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 4040a4db038..a20619f5be7 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -406,14 +406,17 @@ def _graph_mode_decorator(f, args, kwargs):

 def _eager_mode_decorator(f, args, kwargs):
   """Implement custom gradient decorator for eager mode."""
-  with tape_lib.VariableWatcher() as variable_watcher:
-    result, grad_fn = f(*args, **kwargs)
+
+  trainable_vars = []
+  if 'trainable_variables' in kwargs:
+    trainable_vars = kwargs.pop('trainable_variables')
+  result, grad_fn = f(*args, **kwargs)
   all_inputs = list(args) + list(kwargs.values())
   # The variables that grad_fn needs to return gradients for are the set of
   # variables used that are *not* part of the inputs.
variables = [ v.deref() # pylint: disable=g-complex-comprehension - for v in set(v.ref() for v in variable_watcher.watched_variables()) + for v in set(v.ref() for v in trainable_vars) if all(v.deref() is not i for i in all_inputs) ] grad_argspec = tf_inspect.getfullargspec(grad_fn) @@ -483,7 +486,8 @@ def recompute_grad(f): """Inner function closure for calculating gradients.""" current_var_scope = variable_scope.get_variable_scope() - result = f(*args, **kwargs) + with tape_lib.stop_recording(): + result = f(*args, **kwargs) def grad(*dresult, **grad_kwargs): """Gradient function calculation for inner function.""" From c55943eb30056dc94cf22be6a83fd71c47f541c4 Mon Sep 17 00:00:00 2001 From: mshr-h Date: Wed, 29 Apr 2020 21:10:09 +0900 Subject: [PATCH 0042/1533] Fix misspelling --- .../lite/tools/benchmark/benchmark_performance_options.cc | 4 ++-- tensorflow/lite/tools/delegates/external_delegate_provider.cc | 2 +- .../lite/tools/evaluation/evaluation_delegate_provider.h | 2 +- .../tools/evaluation/evaluation_delegate_provider_test.cc | 2 +- tensorflow/lite/tools/make/Makefile | 2 +- tensorflow/lite/tools/optimize/operator_property.h | 2 +- .../lite/tools/optimize/python/modify_model_interface_lib.py | 2 +- tensorflow/lite/tools/versioning/op_version.cc | 2 +- 8 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc index 26fed5e279f..cafef6fa133 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc @@ -334,7 +334,7 @@ void BenchmarkPerformanceOptions::Run() { // profiling listener etc. in each Run() invoke because such listeners may be // reset and become invalid in the next Run(). As a result, we record the // number of externally-added listeners here to prevent they're cleared later. - const int num_external_listners = single_option_run_->NumListeners(); + const int num_external_listeners = single_option_run_->NumListeners(); // Now perform all runs, each with different performance-affecting parameters. for (const auto& run_params : all_run_params_) { @@ -349,7 +349,7 @@ void BenchmarkPerformanceOptions::Run() { // Clear internally created listeners before each run but keep externally // created ones. 
- single_option_run_->RemoveListeners(num_external_listners); + single_option_run_->RemoveListeners(num_external_listeners); all_run_stats_->MarkBenchmarkStart(*single_option_run_params_); single_option_run_->Run(); diff --git a/tensorflow/lite/tools/delegates/external_delegate_provider.cc b/tensorflow/lite/tools/delegates/external_delegate_provider.cc index 95b0e42802f..193860820b1 100644 --- a/tensorflow/lite/tools/delegates/external_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/external_delegate_provider.cc @@ -119,7 +119,7 @@ std::vector ExternalDelegateProvider::CreateFlags( "The library path for the underlying external."), CreateFlag( "external_delegate_options", params, - "Comma-seperated options to be passed to the external delegate")}; + "Comma-separated options to be passed to the external delegate")}; return flags; } diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h index 36f80469a97..9ff20d630ce 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.h @@ -33,7 +33,7 @@ class DelegateProviders { DelegateProviders(); // Initialize delegate-related parameters from commandline arguments and - // returns true if sucessful. + // returns true if successful. bool InitFromCmdlineArgs(int* argc, const char** argv); // Get all parameters from all registered delegate providers. diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider_test.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider_test.cc index c2dfa8d0360..5d0a4dfa7d3 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider_test.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider_test.cc @@ -66,7 +66,7 @@ TEST(EvaluationDelegateProviderTest, GetAllParamsWithTfliteInferenceParams) { TfliteInferenceParams params; params.set_delegate(TfliteInferenceParams::NONE); params.set_num_threads(4); - // The same-meaning parameter in TfliteInferenceParams takes precendence. + // The same-meaning parameter in TfliteInferenceParams takes precedence. tools::ToolParams tool_params = providers.GetAllParams(params); EXPECT_EQ(4, tool_params.Get("num_threads")); EXPECT_EQ(1, argc); diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index ad3832f9962..41f87fb033d 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -246,7 +246,7 @@ BENCHMARK_LIB_SRCS := $(filter-out \ $(BENCHMARK_ALL_SRCS)) # These target-specific makefiles should modify or replace options like -# CXXFLAGS or LIBS to work for a specific targetted architecture. All logic +# CXXFLAGS or LIBS to work for a specific targeted architecture. All logic # based on platforms or architectures should happen within these files, to # keep this main makefile focused on the sources and dependencies. include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc) diff --git a/tensorflow/lite/tools/optimize/operator_property.h b/tensorflow/lite/tools/optimize/operator_property.h index 995595e7878..95b0e5000c3 100644 --- a/tensorflow/lite/tools/optimize/operator_property.h +++ b/tensorflow/lite/tools/optimize/operator_property.h @@ -86,7 +86,7 @@ struct OperatorProperty { bool restrict_same_input_output_scale = false; // Use same min of min and max of max for each group. - // Incompatable with restrict_same_input_output_scale and restricted_value. 
+  // Incompatible with restrict_same_input_output_scale and restricted_value.
   // TODO(jianlijianli): make it compatible with other restrictions when there
   // is a use case.
   std::vector<std::vector<int>> restrict_scale = {};
diff --git a/tensorflow/lite/tools/optimize/python/modify_model_interface_lib.py b/tensorflow/lite/tools/optimize/python/modify_model_interface_lib.py
index 5e4bf99ccdf..782d88cbc9b 100644
--- a/tensorflow/lite/tools/optimize/python/modify_model_interface_lib.py
+++ b/tensorflow/lite/tools/optimize/python/modify_model_interface_lib.py
@@ -74,6 +74,6 @@ def modify_model_interface(input_file, output_file, input_type, output_type):
   # Throw an exception if the return status is an error.
   if status != 0:
     raise RuntimeError(
-        'Error occured when trying to modify the model input type from float '
+        'Error occurred when trying to modify the model input type from float '
         'to {input_type} and output type from float to {output_type}.'.format(
            input_type=input_type, output_type=output_type))
diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc
index 0b892cf847f..e60865b85a8 100644
--- a/tensorflow/lite/tools/versioning/op_version.cc
+++ b/tensorflow/lite/tools/versioning/op_version.cc
@@ -110,7 +110,7 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) {
       if (op_sig.input_types.size() == 2) {
         return 6;
       }
-      // `keep_num_dims` is supported at verison 5.
+      // `keep_num_dims` is supported at version 5.
       if (op_sig.options.fully_connected.keep_num_dims) {
         return 5;
       }

From cfe69e1493f8a85b369a2405572df27b8a547cbc Mon Sep 17 00:00:00 2001
From: Andrew Stevens
Date: Wed, 29 Apr 2020 17:57:49 +0200
Subject: [PATCH 0043/1533] Improvements to QAT support in TFLite conversion.

Warn if redundant back-to-back fake quantization ops are detected, plus
associated improvements.

Suppress premature elimination of Dequant/Quant pairs that are not no-ops
(i.e. applied to values of differing quantization).

FakeQuantWithMinMaxArgsOp is now treated consistently with
FakeQuantWithMinMaxVarsPerChannelOp and FakeQuantWithMinMaxVarsOp with
constant min/max inputs.

The tf_tfl_translate test application now exposes the command-line options
controlling MLIR::PassManager logging.
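For illustration only (this sketch is not part of the change itself): the
back-to-back fake-quant situation the converter now warns about arises when
one layer's output quantization feeds straight into the next layer's input
quantization with a slightly different range. The names and ranges below are
illustrative, loosely mirroring the constants in the new
back2back_fake_quant.pbtxt test, and use only public TF ops:

    import tensorflow as tf

    x = tf.random.uniform([1, 16])
    # Output fake-quant of one layer (illustrative range) ...
    y = tf.quantization.fake_quant_with_min_max_args(
        x, min=-0.002, max=3.25, num_bits=8)
    # ... immediately fake-quantized again by the next layer with a different
    # range, i.e. a requantization rather than a no-op.
    z = tf.quantization.fake_quant_with_min_max_args(
        y, min=-0.002, max=3.16, num_bits=8)

The newly registered pass-manager flags (e.g. --mlir-print-ir-after-all,
assuming the standard MLIR::PassManager options) can be passed to
tf_tfl_translate to inspect how such graphs are rewritten.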
--- .../tests/end2end/back2back_fake_quant.pbtxt | 1186 +++++++++++++++++ .../compiler/mlir/lite/tf_tfl_translate.cc | 3 + .../compiler/mlir/lite/transforms/optimize.cc | 9 + .../mlir/lite/transforms/optimize_patterns.td | 7 +- .../mlir/lite/transforms/prepare_quantize.cc | 53 +- .../mlir/lite/transforms/prepare_tf.cc | 108 +- 6 files changed, 1344 insertions(+), 22 deletions(-) create mode 100644 tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt new file mode 100644 index 00000000000..8ae4c0e8239 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt @@ -0,0 +1,1186 @@ +# RUN: tf_tfl_translate --mlir-elide-elementsattrs-if-larger=10 --emit-builtin-tflite-ops \ +# RUN: --emit-select-tf-ops --tf-inference-type=DT_QUINT8 --tf-input-min-values=0.0 \ +# RUN: --tf-input-max-values=6.283185307179586 --tf-input-arrays=quant_dense_input --tf-input-shapes=1,1 \ +# RUN: --tf-output-arrays=Identity -o %t.tflite %s 2>&1 | FileCheck %s +# RUN: flatbuffer_to_string %t.tflite | FileCheck --check-prefix=RESULT1 %s + +node { + name: "quant_dense_input" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: -1 + } + dim { + size: 1 + } + } + } + } +} +node { + name: "sequential/quant_dense/MatMul/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 1 + } + dim { + size: 16 + } + } + tensor_content: "n\267\313\276@W\337>\227o9>PF\237\275|%}\276\333n\005>\371\005\031?\230\355\235\275\344\211_>\034\222\264=\254\003\345=Q\027*>\225\304\373>Qm6>\270\025\022;N/\020\277" + } + } + } +} +node { + name: "sequential/quant_dense/MatMul/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense/MatMul/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.5634322166442871 + } + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.5968852043151855 + } + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/MatMul/ReadVariableOp" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: 
"sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense/MatMul/kquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense/MatMul/kquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/MatMul/ReadVariableOp" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10455" + } + } +} +node { + name: "sequential/quant_dense/MatMul" + op: "MatMul" + input: "quant_dense_input" + input: "sequential/quant_dense/MatMul/kquant/IdentityN" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "sequential/quant_dense/BiasAdd/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 16 + } + } + tensor_content: "\000\000\000\000L\020\341=\355\223\242\276\000\000\000\000\000\000\000\000\223&<>\206\234d\276\000\000\000\000\323%\241\276\331\305\234>\004\341\216>\3545e\276\032\363O\276\257:u\276\313\223\r>\000\000\000\000" + } + } + } +} +node { + name: "sequential/quant_dense/BiasAdd/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense/BiasAdd/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/BiasAdd" + op: "BiasAdd" + input: "sequential/quant_dense/MatMul" + input: "sequential/quant_dense/BiasAdd/ReadVariableOp" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } +} +node { + name: "sequential/quant_dense/Relu" + op: "Relu" + input: "sequential/quant_dense/BiasAdd" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.0019266719464212656 + } + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 3.249699354171753 + } + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/Relu" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + 
value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense/oquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense/oquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/Relu" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10477" + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.0019266719464212656 + } + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 3.1626083850860596 + } + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/oquant/IdentityN" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_1/iquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_1/iquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense/oquant/IdentityN" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10494" + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 16 + } + dim { + size: 16 + } + } + tensor_content: "\356x\007<\273\351H\275\2506\242\275\261t\332>\"O\'>m\"\010\276(\311b\275\301\233\262=d\327\031\275V\030`>1U\212\276\325\353,=\321\375+=\305\016-=IQC\276\272N\255\276\342\\\241=\253K\230>\221\247\220\274\026[\220=\301W9\276\3041\311>\200\013\255\276@\331\307>\250\246\320=e\305A\276\312\211\200>\252\220x\276?\306\305>jC\221>_h\304>\350\2701>\213&\014\277\327a\251>\010\321\365=\022\346\357>_P\326>\024@\260\276\010\002\203\275%\013;\276\300\270s\2762~\206\2769\341\356<;\364\023\276\'=\225\276\347\327\247\275\037\317\326\276\2379\243>\241+\306\276\252\226\007\276\232\273\010= 
\001s>h.\010=x\rK=\231\255\315\276\001\244>>_\227\304\276\030\255\255\276\t\013\014>\263\031i\276\312\312\204>\262\331\244=\367\331R\276\020h\005>\361\260\322a?\323>\t\236\304\275\240\215\240\275Y\034V>\200\236=<\232\336Y\276\274_\263\276\240\004\334\276\232\003\252>\264\243\273>\254Q\201>_\271\263\276a1\305>\237?\331>\244\031\275>\365K\351\276\212\302\234>\276{\217\275{\211{\276.\016\271\276\337\344\255\276\230\2347\2757O(>puO=\024\177\207\276v:\255>%.\230>\237\211\341=\327\316\037\274\302\205I>\020\234\201\276\027\356(\276J\320*\277\014\226\240=\222\312\323\276J\235\253=\263\201\221>\030\220\027\276\375g\"\277\227E\326\276\206\267{>\253\271\374\276!\235\203>\264\215\342=g\235\267\2767\211\306>x\323i>\244TJ>\306\216]\275\310\230\004\276\340\000B\274\351\324a>$[\266=\267\220\256\275\241n >\003s\235>\340\251\225\276\335jX=N\"@\276N;\271\276Sl6=\234S\007>~\345\025\277\013\020\362\275\377\333 \276@\227\325=\177\375\215=\335\357\332=t\217\027\276D\337\326=}\211\363\276\227\"\322>r\245\267=]\007x=4\372e>\252\235\373\276\030\276\221\275\2763\000\276\202\007\301>H\037Y>\307;\006?QT\241=NY\\<\014\254@>P\031\217\010\327+\276\212\240\305\276\371x7>$\350\037?I\332\262\276\361\307\031\276\275o\206>\355\217\016\276 \036\007>yQ\221=\372\204\207>#\301\214>\344\376\302\275y\024\261\2768\304+\276\260\n \276(\026\254>\300*\034=j\2142=[\023\233>\301\370\304\2766<0=\321\025\237>\264C\220\2765\301\330\276\274\027\252\276\n\331U\276\355\363\302>\214i\305\276\367`\273>\244\352\241\276c\030C\276u\033\251\275r\314\036>\033\352h\275\361\372\374\276\370\356\242\274O\305\030\277\360\264\221>\244 \'\276\223j\006>\334\320E>\274\223\031\277\275\210\326>\330\301o=9\273>>\324\003\314\275\234\204\215\276\321\253|\276\313v\275\276d\256\270>r,\013\27765J=\313s\'\276\035\363{\276\"3\251\275\007\267d<\266\261\256\2766\205g\276~\275\331>\272\317\242>\237\305\034>\312\1770\2769\356\024\277\255|Y\276\374V\271=\320b\177\276\014\023\345\2766\353\226\273\333\033\373\275.[\264\276\tx@\276Dx\273>Q\327\233>\246\265\353>\021\214\315\276\000K\356\272d\207\037\276\331Z\321\276\\>\001?\202\343\031?\000>^\2727\375V\276\266\006[=\367\377\313>;vB\275F\212\022?D\253\246\276\314\207\210>\255\222\211\276\230V\223\273z,\316\276\325D\206\276CR\260> \260-\275\273-*\276i\032d>\266\316\003>\300\322\205\276\232\205\322\276\036\267\205>\372Y\342=r\221k\276(\003k>" + } + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/MatMul/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.6485296487808228 + } + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.6135242581367493 + } + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + 
op: "Identity" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_1/MatMul/ReadVariableOp" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_1/MatMul/kquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_1/MatMul/kquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_1/MatMul/ReadVariableOp" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10513" + } + } +} +node { + name: "sequential/quant_dense_1/MatMul" + op: "MatMul" + input: "sequential/quant_dense_1/iquant/IdentityN" + input: "sequential/quant_dense_1/MatMul/kquant/IdentityN" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "sequential/quant_dense_1/BiasAdd/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 16 + } + } + tensor_content: "Y\226;>\002#==\315\253\207>\032t\021\276\2510O\273\354\026\001\276\000\000\000\000\227\021M>\275D\213>\000\000\000\000\231\354)\276]@\237>\026\327E=p\007\003>\007\340c>\\\241\336\275" + } + } + } +} +node { + name: "sequential/quant_dense_1/BiasAdd/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/BiasAdd/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/BiasAdd" + op: "BiasAdd" + input: "sequential/quant_dense_1/MatMul" + input: "sequential/quant_dense_1/BiasAdd/ReadVariableOp" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } +} +node { + name: "sequential/quant_dense_1/Relu" + op: "Relu" + input: "sequential/quant_dense_1/BiasAdd" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.0019266719464212656 + } + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 2.7181646823883057 + } + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: 
"sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_1/Relu" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_1/oquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_1/oquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_1/Relu" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10535" + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 16 + } + dim { + size: 1 + } + } + tensor_content: "\217\017J?$\023*>\352\265\272\276\304|;\276\2178\277=\275\031\026\277J\017\304\276Iu!?h]\314\276A7;\276\204\221\031\275\323\2259\277\341K\304>Uzs?@)\337>\"\300n\276" + } + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_2/MatMul/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.7017847895622253 + } + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.9041997790336609 + } + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_2/MatMul/ReadVariableOp" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_2/MatMul/kquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_2/MatMul/kquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_2/MatMul/ReadVariableOp" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + 
key: "_gradient_op_type" + value { + s: "CustomGradient-10554" + } + } +} +node { + name: "sequential/quant_dense_2/MatMul" + op: "MatMul" + input: "sequential/quant_dense_1/oquant/IdentityN" + input: "sequential/quant_dense_2/MatMul/kquant/IdentityN" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "transpose_a" + value { + b: false + } + } + attr { + key: "transpose_b" + value { + b: false + } + } +} +node { + name: "sequential/quant_dense_2/BiasAdd/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 1 + } + } + float_val: 0.04556306451559067 + } + } + } +} +node { + name: "sequential/quant_dense_2/BiasAdd/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_2/BiasAdd/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/BiasAdd" + op: "BiasAdd" + input: "sequential/quant_dense_2/MatMul" + input: "sequential/quant_dense_2/BiasAdd/ReadVariableOp" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "data_format" + value { + s: "NHWC" + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: -0.8735198974609375 + } + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + op: "Identity" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 1.0447778701782227 + } + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + op: "Identity" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1/resource" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars" + op: "FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_2/BiasAdd" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars/ReadVariableOp_1" + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "sequential/quant_dense_2/oquant/IdentityN" + op: "IdentityN" + input: "sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars" + input: "sequential/quant_dense_2/BiasAdd" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_gradient_op_type" + value { + s: "CustomGradient-10575" + } + } +} +node { + name: "sequential/output/FakeQuantWithMinMaxArgs" + op: "FakeQuantWithMinMaxArgs" + input: "sequential/quant_dense_2/oquant/IdentityN" + attr { + key: "max" + value { + f: 0.9921875 + } + } + attr { + key: "min" + value { + f: -1.0 + } + } + attr { + key: "narrow_range" + value { + b: false + } + } + attr { + key: "num_bits" + value { + i: 8 + } + } +} +node { + name: "Identity" + op: "Identity" + input: 
"sequential/output/FakeQuantWithMinMaxArgs" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +versions { + producer: 175 +} + +# CHECK: {{.*: warning:}} loc(fused["sequential/quant_dense/oquant/FakeQuantWithMinMaxVars"{{.*\): }}quantizer's output has another quantizer +# CHECK-NEXT: {{.*: warning:}} loc(fused["sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars"{{.*\): }}quantizer's output has another quantizer + +# RESULT1: name: "Identity" +# RESULT1-NEXT: quantization: +# RESULT1-NEXT: scale: [ 0.007523 ], +# RESULT1-NEXT: zero_point: [ 116 ] + +# TODO Actually RESULT1 represents in incomplete implementation +# Currently TF2.2.0-rc3 all but the first fake_quant in +# sequence are dropped. A correct transalation would retain the second as a requantization +# op. + +# CORRECT1:name: "Identity" +# CORRECT1-NEXT: quantization: +# CORRECT1-NEXT: scale: [ 0.007813 ], +# CORRECT1-NEXT: zero_point: [ 128 ] + diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index ab9baefacaf..6b48a27dbe3 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -130,6 +130,7 @@ int main(int argc, char **argv) { // interface. That also means we need to relay the value set in one option to // all its aliases. mlir::registerAsmPrinterCLOptions(); + mlir::registerPassManagerCLOptions(); llvm::cl::ParseCommandLineOptions( argc, argv, "TF GraphDef to TFLite FlatBuffer converter\n"); @@ -174,6 +175,8 @@ int main(int argc, char **argv) { mlir::PassManager pm(&context); + mlir::applyPassManagerCLOptions(pm); + // Set the quantization specifications from the command line flags. mlir::TFL::QuantizationSpecs quant_specs; if (mlir::TFL::ParseInputNodeQuantSpecs(input_arrays, min_values, max_values, diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index a1aedb0af32..c44222bf496 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -206,6 +206,15 @@ DenseElementsAttr GetShape(Value output_val) { llvm::makeArrayRef(shape)); } +bool notFromQuantOpDifferentQuanteUse( Value val, TypeAttr qtype) { + auto val_defn_op = val.getDefiningOp(); + TFL::QuantizeOp q_op = llvm::dyn_cast_or_null(val_defn_op); + if( !q_op) + return true; + + return q_op.qtype() == qtype.getValue(); +} + #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc" // Fuse Add with proceeding FullyConnected. diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 82d9a76fab3..c6c71b106c7 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -29,6 +29,11 @@ def ExtractSingleElementAsFloat : NativeCodeCall< // Checks if the value has only one user. def HasOneUse : Constraint>; + +// Checks value is not produce by a TLF_QUant with +// different quantization attribute + +def NotFromQuantOpDifferentQuant : Constraint>; //===----------------------------------------------------------------------===// // Ternary ops patterns. //===----------------------------------------------------------------------===// @@ -160,7 +165,7 @@ foreach BinaryOp = [TFL_DivOp, TFL_MulOp] in // This pattern applies when the same quantize/dequantize have been used twice // with the same scale. 
We want to remove the redundancy. // TODO(fengliuai): move this to the sanity check of pre-quantize pass. -def : Pat<(TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), (replaceWithValue $in)>; +def eliminate_dq_q_pairs : Pat<(TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), (replaceWithValue $in), [(NotFromQuantOpDifferentQuant $in, $qt)]>; // Constraint that makes sure both operands are the same operands. diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 4f25e434fac..e24bfe883a5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -205,6 +205,7 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { // one is returned directly, we decide to return the quantized result instead, // so this op can be quantized. This is only applied on the returned result // because the error will not be accumulated. + func.walk([&](ReturnOp ret) { int i = 0; for (Value returned : ret.operands()) { @@ -226,11 +227,57 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { func.walk([&](ConcatenationOp concat) { if (concat.output().hasOneUse() && Quantized(*concat.output().user_begin())) { - return; + return; } concat.emitWarning( - "Missing quantization parameter on the output might introduce " - "quantization error!"); + "Missing quantization parameter on the output might introduce " + "quantization error!"); + }); + + // Check for (Quant (Dequant $in), $qA) "qdq" pairs that couldn't be eliminated at this + // point. This only occurs for the pattern + // (Quant (Dequant (Quant $in, $qB)), $qA) $qB != $qA + // where the qdq pair denotes a non-trivial requantiziion of an alreadyquantized value. + // Since this makes little sense (directly quantizing (Quant $in, $qA) would introduce + // less quantization noise) the likley cause is an minor error in constructing + // the original network model that introduced back-to-back Fake Quantization operations. + // Hence: emit a warning. + // N.b. at this point weŕe (teporarility) in the quantization dialect (presuambly + // enalbe re-use in xla etc) quant::*QuantizeCastOp weŕe matching here. + // + func.walk([&](quant::QuantizeCastOp q_op) { + + // If up with end up with + auto dq_op = + dyn_cast_or_null(q_op.getOperand().getDefiningOp()); + if (!dq_op) { + return; + } + auto dq_arg = dq_op.getOperand(); + + if (!dq_arg.hasOneUse() ) { + // The initial quanization is used sompleace else ... so it might be + // reasonable for it to requantized for another purpose. + // TODO: ideally would want to still check whether requanization narrows + // rather than widens the representation + return; + } + + // Invariant: + // isa(dq_arg.getDefiningOp()) --> + // getdq_arg.getType() != q_op.getResult().getType() + // + // as otherwise qdq pair would have been optimized away. 
+ + auto qd_arg_def_q_op = + dyn_cast_or_null(dq_arg.getDefiningOp()); + if(!qd_arg_def_q_op) { + return; + } + + qd_arg_def_q_op.emitWarning() << " quantizer's output has another quantizer (" + << q_op.getLoc() + << ") as consumer - intentional?"; }); } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index a97af8e632e..cc73f67f386 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -81,13 +81,52 @@ class PrepareTFPass : public PassWrapper { bool unfold_batch_matmul_; }; + +template +struct FetchConstantMinMaxInputs { + + using AttrType = DenseFPElementsAttr; + bool operator () (TFFakeQuantOp tf_op, AttrType &min_value, AttrType &max_value) const { + Value min = tf_op.min(), max = tf_op.max(); + ; + // TODO This is likely redundant (Identity elimination rule are in + // prepare_patterns.td. If not, its certainly, incomplete as neither + // IdentityN ops Nor chains of Identiy* (not sooo rare) are handled + if (auto id1 = dyn_cast_or_null(min.getDefiningOp())) + min = id1.input(); + if (auto id2 = dyn_cast_or_null(max.getDefiningOp())) + max = id2.input(); + if (!matchPattern(min, m_Constant(&min_value))) { + return false; + } + if (!matchPattern(max, m_Constant(&max_value))) { + return false; + } + return true; // Succesfully matched and fetched. + } +}; + + +template +struct FetchMinMaxAttrs { + + using AttrType = FloatAttr; + bool operator () (TFFakeQuantOp tf_op, AttrType &min_value, AttrType &max_value) const { + min_value = tf_op.minAttr(); + max_value = tf_op.maxAttr(); + return true; // Succesfully matched and fetched. + } +}; + + // TODO(fengliuai): move this rule to PreparePatterns.td // TODO(fengliuai): reuse the quantization/tensorflow/tf_to_quant pass. // TODO(b/140968741): propagate the sign from the command line. Currently all // the FakeQuant is assumed to targeting UIN8, but per-channel kernel is // actually INT8. // Inserts a "tfl.quantize" and "tfl.dequantize" op pair (QDQs) after the -// "tf.FakeQuantWithMinMaxVarsOp" to be constant folded. Since the constant +// tf.FakeQyantWithMinMax{Vars|VarsPerChannel|Args}Op +// to be constant folded. Since the constant // folding logic will use a "std.constant" op to replace the // "tf.FakeQuantWithMinMaxVarsOp", the "tfl.quantize" op is used to preserve // the quantization parameters as a TypeAttr and "tfl.dequantize" op used to @@ -111,17 +150,33 @@ class PrepareTFPass : public PassWrapper { // | // tf.dequantize // | -template +// +// +// Warns if the (most likely unwanted, currently not quite correctly handled) +// case of back-to-back tf.FakeQuant occurs +// +// tf.FakeQuant* +// | +// tf.FakeQuant* +// +// tf.identity / tf.IdentityN between the tf.FakeQuant* ops +// need no special treatment are already eliminated before the rewrites / check is applied. 
+// + +template struct InsertTFLQuantOpsAfterTFFakeQuantOp : public OpRewritePattern { - using BaseType = InsertTFLQuantOpsAfterTFFakeQuantOp; + using BaseType = InsertTFLQuantOpsAfterTFFakeQuantOp; - explicit InsertTFLQuantOpsAfterTFFakeQuantOp( + explicit InsertTFLQuantOpsAfterTFFakeQuantOp( MLIRContext *ctx) : OpRewritePattern(ctx) {} + FetchMinMax fetchMinMax; + + using FetchAttrType = typename FetchMinMax::AttrType; LogicalResult matchAndRewrite(TFFakeQuantOp tf_op, - PatternRewriter &rewriter) const override { + PatternRewriter &rewriter) const override { // We don't want to insert quantize/dequantize if the quantize op exists. auto res = tf_op.outputs(); if (!res.hasOneUse() || isa(*res.user_begin())) @@ -130,14 +185,11 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp // Extract the min/max constant values from the operands. We also consider // a special case that there are tf.Identity ops between the min/max // constants and the tf.FakeQuantWithMinMaxVarsOp. - Value min = tf_op.min(), max = tf_op.max(); - DenseFPElementsAttr min_value, max_value; - if (auto id1 = dyn_cast_or_null(min.getDefiningOp())) - min = id1.input(); - if (auto id2 = dyn_cast_or_null(max.getDefiningOp())) - max = id2.input(); - if (!matchPattern(min, m_Constant(&min_value))) return failure(); - if (!matchPattern(max, m_Constant(&max_value))) return failure(); + + FetchAttrType min_value, max_value; + if (!fetchMinMax(tf_op, min_value, max_value)) { + return this->failure(); + } int quant_dim = -1; if (PerAxis) { @@ -171,12 +223,28 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp } }; +// +// Three instances of the rule to cover the three different types of +// TF::FakeQuant operators +// using PreparePerTensorFakeQuant = - InsertTFLQuantOpsAfterTFFakeQuantOp; + InsertTFLQuantOpsAfterTFFakeQuantOp + >; using PreparePerChannelFakeQuant = InsertTFLQuantOpsAfterTFFakeQuantOp; + true, + FetchConstantMinMaxInputs + >; + +using PreparePerTensorFakeQuantWithMinMaxArgs = + InsertTFLQuantOpsAfterTFFakeQuantOp + >; + // Templated class for declaring a converter from some TensorFlow convolution // op into its counterpart in TensorFlow Lite. @@ -619,9 +687,13 @@ void PrepareTFPass::runOnFunction() { // This pattern was intented to uses TFL QDQs to preserve the quantization // parameters from the TF Quant ops, thus this pattern should run with the - // first `applyPatternsAndFoldGreedily` method, which would otherwise removes - // the TF FakeQuant ops by the constant folding. - patterns.insert(ctx); + // first `applyPatternsGreedily` method, which would otherwise removes the + // TF FakeQuant ops by the constant folding. + //patterns.insert(ctx); + + patterns.insert(ctx); + // This pattern will try to identify and optimize for dilated convolution. // e.g. 
Patterns like "SpaceToBatchND -> Conv2D -> BatchToSpaceND" will be From cc12353111433a9aa4ad6894aae69dc7a66d1f22 Mon Sep 17 00:00:00 2001 From: Andrew Stevens Date: Wed, 29 Apr 2020 18:14:16 +0200 Subject: [PATCH 0044/1533] Fix: correct typos in prepare_tf.cc --- .../compiler/mlir/lite/transforms/prepare_tf.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index cc73f67f386..51553ea35f4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -179,8 +179,9 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp PatternRewriter &rewriter) const override { // We don't want to insert quantize/dequantize if the quantize op exists. auto res = tf_op.outputs(); - if (!res.hasOneUse() || isa(*res.user_begin())) + if (!res.hasOneUse() || isa(*res.user_begin())) { return failure(); + } // Extract the min/max constant values from the operands. We also consider // a special case that there are tf.Identity ops between the min/max @@ -188,7 +189,7 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp FetchAttrType min_value, max_value; if (!fetchMinMax(tf_op, min_value, max_value)) { - return this->failure(); + return failure(); } int quant_dim = -1; @@ -206,8 +207,10 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp TypeAttr qtype = quant::GetQuantizedTypeAttr( rewriter, res_type, min_value, max_value, quant_dim, num_bits, narrow_range, /*is_signed=*/false); - if (!qtype) failure(); - + if (!qtype) { + return failure(); + } + // Finally, use the quantization parameter to create the quantize and // dequantize ops, and insert them between the tf.FakeQuantWithMinMaxVarsOp // and its users. 
From a97894d97fffe924a806decae523540da313c267 Mon Sep 17 00:00:00 2001 From: Andrew Stevens Date: Wed, 29 Apr 2020 18:16:08 +0200 Subject: [PATCH 0045/1533] Fix: adjust test for .note in tf_tfl_translate warnings --- .../compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt index 8ae4c0e8239..31e2157d360 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/back2back_fake_quant.pbtxt @@ -1167,7 +1167,7 @@ versions { } # CHECK: {{.*: warning:}} loc(fused["sequential/quant_dense/oquant/FakeQuantWithMinMaxVars"{{.*\): }}quantizer's output has another quantizer -# CHECK-NEXT: {{.*: warning:}} loc(fused["sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars"{{.*\): }}quantizer's output has another quantizer +# CHECK: {{.*: warning:}} loc(fused["sequential/quant_dense_2/oquant/FakeQuantWithMinMaxVars"{{.*\): }}quantizer's output has another quantizer # RESULT1: name: "Identity" # RESULT1-NEXT: quantization: From dd98b39d00b037624f2e8cd9966deb3bd116ffee Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 29 Apr 2020 15:56:33 -0700 Subject: [PATCH 0046/1533] Added field values to __repr__, fixed pydoc outputs --- .../ops/structured/structured_tensor.py | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/structured/structured_tensor.py b/tensorflow/python/ops/structured/structured_tensor.py index 0a24bba1cd9..1ca80d314e6 100644 --- a/tensorflow/python/ops/structured/structured_tensor.py +++ b/tensorflow/python/ops/structured/structured_tensor.py @@ -156,11 +156,17 @@ class StructuredTensor(composite_tensor.CompositeTensor): Examples: >>> StructuredTensor.from_fields({'x': 1, 'y': [1, 2, 3]}) - (FILL THIS IN) + >>> StructuredTensor.from_fields({'foo': [1, 2], 'bar': [3, 4]}, ... shape=[2]) - (FILL THIS IN) + """ shape = tensor_shape.as_shape(shape) @@ -432,7 +438,8 @@ class StructuredTensor(composite_tensor.CompositeTensor): def __repr__(self): return '' % (', '.join( - '%r' % k for k in sorted(self._fields)), self._shape) + '%s: %s' % (k, v) for k, v in sorted(self._fields.items())), + self._shape) #============================================================================= # Conversion @@ -499,7 +506,10 @@ class StructuredTensor(composite_tensor.CompositeTensor): >>> StructuredTensor.from_pyval( ... {'a': [1, 2, 3], 'b': [[4, 5], [6, 7]]}) - + }, + shape=())> Note that `StructuredTensor.from_pyval(pyval).to_pyval() == pyval`. @@ -629,7 +639,9 @@ class StructuredTensor(composite_tensor.CompositeTensor): ... [{'foo': 12}, {'foo': 33}, {'foo': 99}]) >>> partition = RowPartition.from_row_lengths([2, 0, 1]) >>> st.partition_outer_dimension(partition) - + }, + shape=(3, None))> Args: row_partition: A `RowPartition`. @@ -652,7 +664,9 @@ class StructuredTensor(composite_tensor.CompositeTensor): >>> st = StructuredTensor.from_pyval( ... 
[[{'foo': 12}, {'foo': 33}], [], [{'foo': 99}]]) >>> st.merge_dims(0, 1) - + Args: outer_axis: `int`: The first dimension in the range of dimensions to @@ -1059,12 +1073,14 @@ def _partition_outer_dimension(value, row_partition): >>> partition = row_partition.RowPartition.from_row_lengths([2, 0, 1]) >>> _partition_outer_dimension(tf.constant([1, 2, 3]), partition) - [[1, 2], [], [3]] + >>> struct_value = StructuredTensor.from_pyval( ... [{'x': 1}, {'x': 2}, {'x': 3}]) >>> _partition_outer_dimension(struct_value, partition) - [[{'x': 1}, {'x': 2}], [], [{'x': 3}]]) + }, + shape=(3, None))> Args: value: Tensor, RaggedTensor, or StructuredTensor From 2c71fe1ff34af5277673db7b67320e6796823e0b Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Mon, 27 Apr 2020 16:50:08 -0700 Subject: [PATCH 0047/1533] Provide NVIDIA CUDA build data in metadata and API This change: First exposes //third_party/gpus:find_cuda_config as a library. Then, it extends gen_build_info.py with find_cuda_config to provide package build information within TensorFlow's API. This is accessible as a dictionary: from tensorflow.python.platform import build_info print(build_info.cuda_build_info) {'cuda_version': '10.2', 'cudnn_version': '7', 'tensorrt_version': None, 'nccl_version': None} Finally, setup.py pulls that into package metadata. The same wheel's long description ends with: TensorFlow 2.1.0 for NVIDIA GPUs was built with these platform and library versions: - NVIDIA CUDA 10.2 - NVIDIA cuDNN 7 - NVIDIA NCCL not enabled - NVIDIA TensorRT not enabled In lieu of NVIDIA CUDA classifiers [1], the same metadata is exposed in the normally-unused "platform" tag: >>> import pkginfo >>> a = pkginfo.Wheel('./tf_nightly_gpu-2.1.0-cp36-cp36m-linux_x86_64.whl') >>> a.platforms ['cuda_version:10.2', 'cudnn_version:7', 'tensorrt_version:None', 'nccl_version:None'] I'm not 100% confident this is the best way to accomplish this. It seems odd to import like this setup.py, even though it works, even in an environment with TensorFlow installed. One caveat for RBE: the contents of genrules still run on the local system, so I had to syncronize my local environment with the RBE environment I used to build TensorFlow. I'm not sure if this is going to require intervention on TensorFlow's current CI. Currently tested only on Linux GPU (Remote Build) for Python 3.6. I'd like to see more tests before merging. [1]: (https://github.com/pypa/trove-classifiers/issues/25), --- tensorflow/tools/build_info/BUILD | 1 + tensorflow/tools/build_info/gen_build_info.py | 44 +++++++++++++--- tensorflow/tools/pip_package/setup.py | 51 ++++++++++++------- third_party/gpus/BUILD | 6 +++ 4 files changed, 77 insertions(+), 25 deletions(-) diff --git a/tensorflow/tools/build_info/BUILD b/tensorflow/tools/build_info/BUILD index 556dd0c86f0..1baa16724fe 100644 --- a/tensorflow/tools/build_info/BUILD +++ b/tensorflow/tools/build_info/BUILD @@ -15,5 +15,6 @@ py_binary( tags = ["no-remote-exec"], deps = [ "@six_archive//:six", + "//third_party/gpus:find_cuda_config", ], ) diff --git a/tensorflow/tools/build_info/gen_build_info.py b/tensorflow/tools/build_info/gen_build_info.py index df9068fb3d1..3180010bb13 100755 --- a/tensorflow/tools/build_info/gen_build_info.py +++ b/tensorflow/tools/build_info/gen_build_info.py @@ -1,4 +1,4 @@ -# Lint as: python2, python3 +# Lint as: python3 # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,9 +19,14 @@ from __future__ import division from __future__ import print_function import argparse +import os +import platform +import sys import six +from third_party.gpus import find_cuda_config + def write_build_info(filename, is_config_cuda, is_config_rocm, key_value_list): """Writes a Python that describes the build. @@ -61,7 +66,31 @@ def write_build_info(filename, is_config_cuda, is_config_rocm, key_value_list): key_value_pair_stmts.append("%s = %r" % (key, value)) key_value_pair_content = "\n".join(key_value_pair_stmts) - contents = """ + # Generate cuda_build_info, a dict describing the CUDA component versions + # used to build TensorFlow. + cuda_build_info = "{}" + if is_config_cuda == "True": + libs = ["_", "cuda", "cudnn"] + if platform.system() == "Linux": + if os.environ.get("TF_NEED_TENSORRT", "0") == "1": + libs.append("tensorrt") + if "TF_NCCL_VERSION" in os.environ: + libs.append("nccl") + # find_cuda_config accepts libraries to inspect as argv from the command + # line. We can work around this restriction by setting argv manually + # before calling find_cuda_config. + backup_argv = sys.argv + sys.argv = libs + cuda = find_cuda_config.find_cuda_config() + cuda_build_info = str({ + "cuda_version": cuda["cuda_version"], + "cudnn_version": cuda["cudnn_version"], + "tensorrt_version": cuda.get("tensorrt_version", None), + "nccl_version": cuda.get("nccl_version", None), + }) + sys.argv = backup_argv + + contents = f""" # Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -76,17 +105,16 @@ def write_build_info(filename, is_config_cuda, is_config_rocm, key_value_list): # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -%s +{module_docstring} from __future__ import absolute_import from __future__ import division from __future__ import print_function -is_rocm_build = %s -is_cuda_build = %s +is_rocm_build = {build_config_rocm_bool} +is_cuda_build = {build_config_cuda_bool} +cuda_build_info = {cuda_build_info} -%s -""" % (module_docstring, build_config_rocm_bool, build_config_cuda_bool, - key_value_pair_content) +""" open(filename, "w").write(contents) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 1c9a37bf652..1e99d659830 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -1,3 +1,4 @@ +# lint as: python3 # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -43,6 +44,8 @@ from setuptools import setup from setuptools.command.install import install as InstallCommandBase from setuptools.dist import Distribution +from tensorflow.python.platform import build_info + DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. 
@@ -82,6 +85,20 @@ REQUIRED_PACKAGES = [ 'scipy == 1.2.2;python_version<"3"', ] +GPU_DESCRIPTION = '' +if build_info.is_cuda_build: + gpu_header = (f'\nTensorFlow {_VERSION} for NVIDIA GPUs was built with these ' + 'platform and library versions:\n\n - ') + cbi = build_info.cuda_build_info + trt_ver = cbi['tensorrt_version'] + nccl_ver = cbi['nccl_version'] + GPU_DESCRIPTION = gpu_header + '\n - '.join([ + 'NVIDIA CUDA ' + cbi['cuda_version'], + 'NVIDIA cuDNN ' + cbi['cudnn_version'], + 'NVIDIA NCCL ' + 'not enabled' if not nccl_ver else nccl_ver, + 'NVIDIA TensorRT ' + 'not enabled' if not trt_ver else trt_ver, + ]) + if sys.byteorder == 'little': # grpcio does not build correctly on big-endian machines due to lack of # BoringSSL support. @@ -117,7 +134,8 @@ CONSOLE_SCRIPTS = [ # even though the command is not removed, just moved to a different wheel. 'tensorboard = tensorboard.main:run_main', 'tf_upgrade_v2 = tensorflow.tools.compatibility.tf_upgrade_v2_main:main', - 'estimator_ckpt_converter = tensorflow_estimator.python.estimator.tools.checkpoint_converter:main', + 'estimator_ckpt_converter = ' + 'tensorflow_estimator.python.estimator.tools.checkpoint_converter:main', ] # pylint: enable=line-too-long @@ -161,11 +179,10 @@ class InstallHeaders(Command): """ description = 'install C/C++ header files' - user_options = [('install-dir=', 'd', - 'directory to install header files to'), - ('force', 'f', - 'force installation (overwrite existing files)'), - ] + user_options = [ + ('install-dir=', 'd', 'directory to install header files to'), + ('force', 'f', 'force installation (overwrite existing files)'), + ] boolean_options = ['force'] @@ -175,8 +192,7 @@ class InstallHeaders(Command): self.outfiles = [] def finalize_options(self): - self.set_undefined_options('install', - ('install_headers', 'install_dir'), + self.set_undefined_options('install', ('install_headers', 'install_dir'), ('force', 'force')) def mkdir_and_copy_file(self, header): @@ -236,9 +252,7 @@ so_lib_paths = [ matches = [] for path in so_lib_paths: - matches.extend( - ['../' + x for x in find_files('*', path) if '.py' not in x] - ) + matches.extend(['../' + x for x in find_files('*', path) if '.py' not in x]) if os.name == 'nt': EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.pyd' @@ -257,17 +271,16 @@ headers = ( list(find_files('*.h', 'tensorflow/stream_executor')) + list(find_files('*.h', 'google/com_google_protobuf/src')) + list(find_files('*.inc', 'google/com_google_protobuf/src')) + - list(find_files('*', 'third_party/eigen3')) + list( - find_files('*.h', 'tensorflow/include/external/com_google_absl')) + - list( - find_files('*.inc', 'tensorflow/include/external/com_google_absl')) - + list(find_files('*', 'tensorflow/include/external/eigen_archive'))) + list(find_files('*', 'third_party/eigen3')) + + list(find_files('*.h', 'tensorflow/include/external/com_google_absl')) + + list(find_files('*.inc', 'tensorflow/include/external/com_google_absl')) + + list(find_files('*', 'tensorflow/include/external/eigen_archive'))) setup( name=project_name, version=_VERSION.replace('-', ''), description=DOCLINES[0], - long_description='\n'.join(DOCLINES[2:]), + long_description='\n'.join(DOCLINES[2:]) + GPU_DESCRIPTION, url='https://www.tensorflow.org/', download_url='https://github.com/tensorflow/tensorflow/tags', author='Google Inc.', @@ -288,6 +301,10 @@ setup( ] + matches, }, zip_safe=False, + # Accessible with importlib.metadata.metadata('tf-pkg-name').items() + platforms=[ + f'{key}:{value}' for key, value in 
build_info.cuda_build_info.items() + ], distclass=BinaryDistribution, cmdclass={ 'install_headers': InstallHeaders, diff --git a/third_party/gpus/BUILD b/third_party/gpus/BUILD index e69de29bb2d..d570c4894ce 100644 --- a/third_party/gpus/BUILD +++ b/third_party/gpus/BUILD @@ -0,0 +1,6 @@ +# Expose find_cuda_config.py as a library so other tools can reference it. +py_library( + name = "find_cuda_config", + srcs = ["find_cuda_config.py"], + visibility = ["//visibility:public"], +) From 019e9fca7be020133ec8bbbfd69aa166602bae47 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Tue, 28 Apr 2020 12:27:04 -0700 Subject: [PATCH 0048/1533] Add NVIDIA CUDA and cuDNN info to tf.config --- tensorflow/python/framework/config.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 5361d7290e8..12feaf3d89d 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.eager import context +from tensorflow.python.platform import build_info from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export @@ -653,3 +654,25 @@ def disable_mlir_bridge(): def disable_mlir_graph_optimization(): """Disables experimental MLIR-Based TensorFlow Compiler Optimizations.""" context.context().enable_mlir_graph_optimization = False + + +@tf_export('config.get_cuda_version_used_to_compile_tf') +def get_cuda_version_used_to_compile_tf(): + """Get the version of NVIDIA CUDA used to compile this TensorFlow release. + + Returns: + String representation of CUDA version number (Major.Minor) if CUDA support + is included, otherwise None. + """ + return build_info.cuda_build_info.get('cuda_version', None) + + +@tf_export('config.get_cudnn_version_used_to_compile_tf') +def get_cudnn_version_used_to_compile_tf(): + """Get the version of NVIDIA cuDNN used to compile this TensorFlow release. + + Returns: + String representation of cuDNN version number (Major only) if cuDNN support + is included, otherwise None. + """ + return build_info.cuda_build_info.get('cudnn_version', None) From df1ee5d5516413642aee6d0e1c8ef08a4eff1902 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Wed, 29 Apr 2020 16:04:35 -0700 Subject: [PATCH 0049/1533] Convert build_info to dict format and expose it. Since this module now generates a dictionary to expose in tf.config, it doesn't make much sense to store only certain values in the build_info dictionary and others as module variables. This obsoletes a lot of code in gen_build_info.py and I've removed it. I also updated all the in-code references I've found to the build_info module. I think this may break whomever used to be using the build_info library, but since it wasn't part of the API, there was no guarantee that it would continue to be available. 
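For illustration, reading the new dictionary-style module looks roughly like this (a sketch only; the
values shown are assumptions, and the exact key set depends on the platform a given package targets):

>>> from tensorflow.python.platform import build_info
>>> build_info.build_info["is_cuda_build"]
True
>>> build_info.build_info.get("cuda_version")
'10.2'
>>> build_info.build_info.get("nccl_version") is None  # NCCL not enabled in this hypothetical build
True

The tf.config.get_build_info() accessor added below is documented to return this same dictionary.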
--- tensorflow/python/framework/config.py | 35 ++++---- .../python/keras/layers/recurrent_v2.py | 4 +- tensorflow/python/platform/build_info_test.py | 4 +- tensorflow/python/platform/self_check.py | 5 +- tensorflow/tensorflow.bzl | 25 +++--- tensorflow/tools/build_info/gen_build_info.py | 83 ++++++------------- tensorflow/tools/pip_package/setup.py | 2 +- 7 files changed, 67 insertions(+), 91 deletions(-) diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 12feaf3d89d..9997b9833a5 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -656,23 +656,26 @@ def disable_mlir_graph_optimization(): context.context().enable_mlir_graph_optimization = False -@tf_export('config.get_cuda_version_used_to_compile_tf') -def get_cuda_version_used_to_compile_tf(): - """Get the version of NVIDIA CUDA used to compile this TensorFlow release. +@tf_export('config.get_build_info()') +def get_build_info(): + """Get a dictionary describing TensorFlow's build environment. + + Values are generated when TensorFlow is compiled, and are static for each + TensorFlow package. This information is limited to a subset of the following + keys based on the platforms targeted by the package: + + - cuda_version + - cudnn_version + - tensorrt_version + - nccl_version + - is_cuda_build + - is_rocm_build + - msvcp_dll_names + - nvcuda_dll_name + - cudart_dll_name + - cudnn_dll_name Returns: - String representation of CUDA version number (Major.Minor) if CUDA support - is included, otherwise None. + A Dictionary describing TensorFlow's build environment. """ return build_info.cuda_build_info.get('cuda_version', None) - - -@tf_export('config.get_cudnn_version_used_to_compile_tf') -def get_cudnn_version_used_to_compile_tf(): - """Get the version of NVIDIA cuDNN used to compile this TensorFlow release. - - Returns: - String representation of cuDNN version number (Major only) if cuDNN support - is included, otherwise None. - """ - return build_info.cuda_build_info.get('cudnn_version', None) diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py index a9d5ef8587c..d5a54a0a8c6 100644 --- a/tensorflow/python/keras/layers/recurrent_v2.py +++ b/tensorflow/python/keras/layers/recurrent_v2.py @@ -601,7 +601,7 @@ def gpu_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major, # (6 * units) bias = array_ops.split(K.flatten(bias), 6) - if build_info.is_cuda_build: + if build_info.build_info["is_cuda_build"]: # Note that the gate order for CuDNN is different from the canonical format. # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need # to be done for kernel, recurrent_kernel, input_bias, recurrent_bias. @@ -1361,7 +1361,7 @@ def gpu_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, # so that mathematically it is same as the canonical LSTM implementation. 
full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0) - if build_info.is_rocm_build: + if build_info.build_info["is_rocm_build"]: # ROCm MIOpen's weight sequence for LSTM is different from both canonical # and Cudnn format # MIOpen: [i, f, o, c] Cudnn/Canonical: [i, f, c, o] diff --git a/tensorflow/python/platform/build_info_test.py b/tensorflow/python/platform/build_info_test.py index f0df0b756cc..81fb5a6e1e3 100644 --- a/tensorflow/python/platform/build_info_test.py +++ b/tensorflow/python/platform/build_info_test.py @@ -25,8 +25,8 @@ from tensorflow.python.platform import test class BuildInfoTest(test.TestCase): def testBuildInfo(self): - self.assertEqual(build_info.is_rocm_build, test.is_built_with_rocm()) - self.assertEqual(build_info.is_cuda_build, test.is_built_with_cuda()) + self.assertEqual(build_info.build_info["is_rocm_build"], test.is_built_with_rocm()) + self.assertEqual(build_info.build_info["is_cuda_build"], test.is_built_with_cuda()) if __name__ == '__main__': diff --git a/tensorflow/python/platform/self_check.py b/tensorflow/python/platform/self_check.py index f6cf7705e13..c10c4108c7d 100644 --- a/tensorflow/python/platform/self_check.py +++ b/tensorflow/python/platform/self_check.py @@ -20,6 +20,7 @@ from __future__ import print_function import os +MSVCP_DLL_NAMES = "msvcp_dll_names" try: from tensorflow.python.platform import build_info @@ -42,9 +43,9 @@ def preload_check(): # we load the Python extension, so that we can raise an actionable error # message if they are not found. import ctypes # pylint: disable=g-import-not-at-top - if hasattr(build_info, "msvcp_dll_names"): + if MSVCP_DLL_NAMES in build_info.build_info: missing = [] - for dll_name in build_info.msvcp_dll_names.split(","): + for dll_name in build_info.build_info[MSVCP_DLL_NAMES].split(","): try: ctypes.WinDLL(dll_name) except OSError: diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index d9229e00306..ada75fef957 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2593,6 +2593,10 @@ def tf_version_info_genrule(name, out): arguments = "--generate \"$@\" --git_tag_override=${GIT_TAG_OVERRIDE:-}", ) +def dict_to_kv(d): + """Convert a dictionary to a space-joined list of key=value pairs.""" + return " " + " ".join(["%s=%s" % (k,v) for k, v in d.items()]) + def tf_py_build_info_genrule(name, out): _local_genrule( name = name, @@ -2600,16 +2604,17 @@ def tf_py_build_info_genrule(name, out): exec_tool = "//tensorflow/tools/build_info:gen_build_info", arguments = "--raw_generate \"$@\" " + - " --is_config_cuda " + if_cuda("True", "False") + - " --is_config_rocm " + if_rocm("True", "False") + - " --key_value " + - if_cuda(" cuda_version_number=${TF_CUDA_VERSION:-} cudnn_version_number=${TF_CUDNN_VERSION:-} ", "") + - if_windows(" msvcp_dll_names=msvcp140.dll,msvcp140_1.dll ", "") + - if_windows_cuda(" ".join([ - "nvcuda_dll_name=nvcuda.dll", - "cudart_dll_name=cudart64_$(echo $${TF_CUDA_VERSION:-} | sed \"s/\\.//\").dll", - "cudnn_dll_name=cudnn64_${TF_CUDNN_VERSION:-}.dll", - ]), ""), + " --key_value" + + " is_rocm_build=" + if_rocm("True", "False") + + " is_cuda_build=" + if_cuda("True", "False") + # TODO(angerson) Can we reliably load CUDA compute capabilities here? 
+ if_windows(dict_to_kv({ + "msvcp_dll_names": "msvcp140.dll,msvcp140_1.dll" + }), "") + if_windows_cuda(dict_to_kv({ + "nvcuda_dll_name": "nvcuda.dll", + "cudart_dll_name": "cudart64_$$(echo $${TF_CUDA_VERSION:-} | sed \"s/\\.//\").dll", + "cudnn_dll_name": "cudnn64_$${TF_CUDNN_VERSION:-}.dll", + }), ""), ) def cc_library_with_android_deps( diff --git a/tensorflow/tools/build_info/gen_build_info.py b/tensorflow/tools/build_info/gen_build_info.py index 3180010bb13..0757a1f57a6 100755 --- a/tensorflow/tools/build_info/gen_build_info.py +++ b/tensorflow/tools/build_info/gen_build_info.py @@ -28,48 +28,28 @@ import six from third_party.gpus import find_cuda_config -def write_build_info(filename, is_config_cuda, is_config_rocm, key_value_list): +def write_build_info(filename, key_value_list): """Writes a Python that describes the build. Args: filename: filename to write to. - is_config_cuda: Whether this build is using CUDA. - is_config_rocm: Whether this build is using ROCm. key_value_list: A list of "key=value" strings that will be added to the - module as additional fields. - - Raises: - ValueError: If `key_value_list` includes the key "is_cuda_build", which - would clash with one of the default fields. + module's "build_info" dictionary as additional entries. """ - module_docstring = "\"\"\"Generates a Python module containing information " - module_docstring += "about the build.\"\"\"" - build_config_rocm_bool = "False" - build_config_cuda_bool = "False" - - if is_config_rocm == "True": - build_config_rocm_bool = "True" - elif is_config_cuda == "True": - build_config_cuda_bool = "True" - - key_value_pair_stmts = [] - if key_value_list: - for arg in key_value_list: - key, value = six.ensure_str(arg).split("=") - if key == "is_cuda_build": - raise ValueError("The key \"is_cuda_build\" cannot be passed as one of " - "the --key_value arguments.") - if key == "is_rocm_build": - raise ValueError("The key \"is_rocm_build\" cannot be passed as one of " - "the --key_value arguments.") - key_value_pair_stmts.append("%s = %r" % (key, value)) - key_value_pair_content = "\n".join(key_value_pair_stmts) + build_info = {} + for arg in key_value_list: + key, value = six.ensure_str(arg).split("=") + if value.lower() == "true": + build_info[key] = True + elif value.lower() == "false": + build_info[key] = False + else: + build_info[key] = value # Generate cuda_build_info, a dict describing the CUDA component versions # used to build TensorFlow. - cuda_build_info = "{}" - if is_config_cuda == "True": + if build_info.get("is_cuda_build", False): libs = ["_", "cuda", "cudnn"] if platform.system() == "Linux": if os.environ.get("TF_NEED_TENSORRT", "0") == "1": @@ -82,16 +62,15 @@ def write_build_info(filename, is_config_cuda, is_config_rocm, key_value_list): backup_argv = sys.argv sys.argv = libs cuda = find_cuda_config.find_cuda_config() - cuda_build_info = str({ - "cuda_version": cuda["cuda_version"], - "cudnn_version": cuda["cudnn_version"], - "tensorrt_version": cuda.get("tensorrt_version", None), - "nccl_version": cuda.get("nccl_version", None), - }) + + build_info["cuda_version"] = cuda["cuda_version"] + build_info["cudnn_version"] = cuda["cudnn_version"] + build_info["tensorrt_version"] = cuda.get("tensorrt_version", None) + build_info["nccl_version"] = cuda.get("nccl_version", None) sys.argv = backup_argv contents = f""" -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -105,15 +84,14 @@ def write_build_info(filename, is_config_cuda, is_config_rocm, key_value_list): # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -{module_docstring} +\"\"\"Auto-generated module providing information about the build.\"\"\"" from __future__ import absolute_import from __future__ import division from __future__ import print_function -is_rocm_build = {build_config_rocm_bool} -is_cuda_build = {build_config_cuda_bool} -cuda_build_info = {cuda_build_info} +from collections import namedtuple +build_info = {build_info} """ open(filename, "w").write(contents) @@ -121,16 +99,6 @@ cuda_build_info = {cuda_build_info} parser = argparse.ArgumentParser( description="""Build info injection into the PIP package.""") -parser.add_argument( - "--is_config_cuda", - type=str, - help="'True' for CUDA GPU builds, 'False' otherwise.") - -parser.add_argument( - "--is_config_rocm", - type=str, - help="'True' for ROCm GPU builds, 'False' otherwise.") - parser.add_argument("--raw_generate", type=str, help="Generate build_info.py") parser.add_argument( @@ -138,10 +106,9 @@ parser.add_argument( args = parser.parse_args() -if (args.raw_generate is not None) and (args.is_config_cuda is not None) and ( - args.is_config_rocm is not None): - write_build_info(args.raw_generate, args.is_config_cuda, args.is_config_rocm, - args.key_value) +if args.raw_generate: + print(args.key_value) + write_build_info(args.raw_generate, args.key_value) else: raise RuntimeError( - "--raw_generate, --is_config_cuda and --is_config_rocm must be used") + "--raw_generate must be used.") diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 1e99d659830..c39bd254442 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -86,7 +86,7 @@ REQUIRED_PACKAGES = [ ] GPU_DESCRIPTION = '' -if build_info.is_cuda_build: +if build_info.build_info['is_cuda_build']: gpu_header = (f'\nTensorFlow {_VERSION} for NVIDIA GPUs was built with these ' 'platform and library versions:\n\n - ') cbi = build_info.cuda_build_info From b044056aa35d07eb63d11e034e2560c350978bb5 Mon Sep 17 00:00:00 2001 From: "ag.ramesh" Date: Thu, 30 Apr 2020 11:52:36 -0700 Subject: [PATCH 0050/1533] Fixed compilation errors with config=mkl --- tensorflow/core/common_runtime/BUILD | 2 ++ tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index c6b0088d8d0..1706fc0c246 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -1028,6 +1028,7 @@ cc_library( hdrs = ["mkl_layout_pass.h"], copts = tf_copts(), deps = [ + ":core_cpu", ":optimization_registry", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -1043,6 +1044,7 @@ cc_library( hdrs = ["mkl_tfconversion_pass.h"], copts = tf_copts(), deps = [ + ":core_cpu", ":optimization_registry", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", diff --git a/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc b/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc index 2d4ae338144..f2339806814 100644 --- 
a/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc +++ b/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc @@ -17,7 +17,7 @@ limitations under the License. #include #include "tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h" -#include "tensorflow/core/common_runyime/mkl_layout_pass.h" +#include "tensorflow/core/common_runtime/mkl_layout_pass.h" #include "tensorflow/core/graph/mkl_graph_util.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/mkl_util.h" From 777482a6269a61218a5e3a4c3468d883573261be Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Thu, 30 Apr 2020 13:14:45 -0700 Subject: [PATCH 0051/1533] Fix incorrect call to tf_export --- tensorflow/python/framework/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/framework/config.py b/tensorflow/python/framework/config.py index 9997b9833a5..c00157f6193 100644 --- a/tensorflow/python/framework/config.py +++ b/tensorflow/python/framework/config.py @@ -656,7 +656,7 @@ def disable_mlir_graph_optimization(): context.context().enable_mlir_graph_optimization = False -@tf_export('config.get_build_info()') +@tf_export('config.get_build_info') def get_build_info(): """Get a dictionary describing TensorFlow's build environment. From 893d49626360eb6091cd4fa29f5b57afc9311dc1 Mon Sep 17 00:00:00 2001 From: "ag.ramesh" Date: Thu, 30 Apr 2020 15:00:36 -0700 Subject: [PATCH 0052/1533] Added missing header file --- tensorflow/core/common_runtime/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index 1706fc0c246..efe637936d0 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -243,6 +243,7 @@ filegroup( "memory_types.h", "mkl_cpu_allocator.h", "mkl_layout_pass.h", + "mkl_tfconversion_pass.h", "optimization_registry.h", "partitioning_utils.h", "placer.h", From cea00c61d8243b2b55bc30ff0d75a865466f52ec Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Wed, 29 Apr 2020 15:32:18 -0700 Subject: [PATCH 0053/1533] Remove unnecessary objc_library target With the current versions of rules_swift + rules_apple we no longer need a shim `objc_library` target to interface with Swift, or to build a `ios_static_framework`. This simplifies building these iOS targets by depending on the `cc_library` directly. 
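Schematically, Swift targets can now depend on the C library directly (swift_library -> cc_library)
instead of going through a shim (swift_library -> objc_library -> cc_library). A minimal BUILD sketch
of the resulting layering, reusing the target names from the diff below (the load path, header list
and srcs glob are illustrative assumptions, not taken from this change):

load("@build_bazel_rules_swift//swift:swift.bzl", "swift_library")

cc_library(
    name = "tensorflow_lite_c",
    hdrs = ["c_api.h", "common.h"],           # illustrative subset of the exported headers
    tags = ["swift_module=TensorFlowLiteC"],  # module name used when importing this target from Swift
)

swift_library(
    name = "TensorFlowLite",
    srcs = glob(["Sources/*.swift"]),
    deps = [":tensorflow_lite_c"],            # no objc_library shim in between
)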
--- tensorflow/lite/experimental/ios/BUILD.apple | 42 +++++-------------- .../lite/experimental/swift/BUILD.apple | 2 +- 2 files changed, 11 insertions(+), 33 deletions(-) diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index faa3f12971c..a53dd328de9 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -22,13 +22,6 @@ genrule( """, ) -TFL_LIBRARY_HDRS = [ - "//tensorflow/lite/delegates/gpu:metal_delegate.h", - "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h", - "//tensorflow/lite/c:c_api.h", - "//tensorflow/lite/c:common.h", -] - TFL_FRAMEWORK_HDRS = [ "//tensorflow/lite/delegates/gpu:metal_delegate.h", ":coreml_delegate.h", @@ -42,19 +35,6 @@ ios_static_framework( hdrs = TFL_FRAMEWORK_HDRS, bundle_name = "TensorFlowLiteC", minimum_os_version = TFL_MINIMUM_OS_VERSION, - deps = [ - ":TensorFlowLiteC", - ], -) - -objc_library( - name = "TensorFlowLiteC", - hdrs = TFL_LIBRARY_HDRS, - module_name = "TensorFlowLiteC", - weak_sdk_frameworks = [ - "Metal", - "CoreML", - ], deps = [ ":tensorflow_lite_c", ], @@ -78,20 +58,18 @@ ios_static_framework( ], ) -# Using this intermediate target is a workaround for a bug in bazel build rules -# involving mixed objc_library & cc_library deps mentioned in (b/74809458). -# When these dependencies are declared directly under the "TensorFlowLiteC" -# target above, the resulting static library incorrectly contains duplicate -# symbols from some ObjC code in the transitive dependencies. -# -# When a new dependency should be added to the TensorFlowLiteC framework, the -# dependency should be added under this target instead. -# When a new header file needs to be exposed, the header should be added to the -# TFL_LIBRARY_HDRS list above. cc_library( name = "tensorflow_lite_c", - hdrs = TFL_LIBRARY_HDRS, - tags = ["nobuilder"], + hdrs = [ + "//tensorflow/lite/c:c_api.h", + "//tensorflow/lite/c:common.h", + "//tensorflow/lite/delegates/gpu:metal_delegate.h", + "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h", + ], + tags = [ + "nobuilder", + "swift_module=TensorFlowLiteC", + ], deps = [ "//tensorflow/lite/c:c_api", "//tensorflow/lite/delegates/gpu:metal_delegate", diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple index 2ce8428b1ce..ea468216d7e 100644 --- a/tensorflow/lite/experimental/swift/BUILD.apple +++ b/tensorflow/lite/experimental/swift/BUILD.apple @@ -17,7 +17,7 @@ swift_library( tags = TFL_DEFAULT_TAGS, visibility = ios_visibility_whitelist(), deps = [ - "//tensorflow/lite/experimental/ios:TensorFlowLiteC", + "//tensorflow/lite/experimental/ios:tensorflow_lite_c", ], ) From 642dc30acaedcdc3daa2558766aa66871bd1714c Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Thu, 30 Apr 2020 17:45:36 -0700 Subject: [PATCH 0054/1533] Fix typos from the latest version --- tensorflow/tensorflow.bzl | 2 +- tensorflow/tools/build_info/gen_build_info.py | 2 +- tensorflow/tools/pip_package/setup.py | 14 ++++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index ada75fef957..501b9f9b088 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2608,7 +2608,7 @@ def tf_py_build_info_genrule(name, out): + " is_rocm_build=" + if_rocm("True", "False") + " is_cuda_build=" + if_cuda("True", "False") # TODO(angerson) Can we reliably load CUDA compute capabilities here? 
- if_windows(dict_to_kv({ + + if_windows(dict_to_kv({ "msvcp_dll_names": "msvcp140.dll,msvcp140_1.dll" }), "") + if_windows_cuda(dict_to_kv({ "nvcuda_dll_name": "nvcuda.dll", diff --git a/tensorflow/tools/build_info/gen_build_info.py b/tensorflow/tools/build_info/gen_build_info.py index 0757a1f57a6..a00fc064fa5 100755 --- a/tensorflow/tools/build_info/gen_build_info.py +++ b/tensorflow/tools/build_info/gen_build_info.py @@ -84,7 +84,7 @@ def write_build_info(filename, key_value_list): # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -\"\"\"Auto-generated module providing information about the build.\"\"\"" +\"\"\"Auto-generated module providing information about the build.\"\"\" from __future__ import absolute_import from __future__ import division from __future__ import print_function diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index c39bd254442..abb00eecdfb 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -85,16 +85,18 @@ REQUIRED_PACKAGES = [ 'scipy == 1.2.2;python_version<"3"', ] +# Generate a footer describing the CUDA technology this release was built +# against. GPU_DESCRIPTION = '' if build_info.build_info['is_cuda_build']: gpu_header = (f'\nTensorFlow {_VERSION} for NVIDIA GPUs was built with these ' 'platform and library versions:\n\n - ') - cbi = build_info.cuda_build_info - trt_ver = cbi['tensorrt_version'] - nccl_ver = cbi['nccl_version'] + bi = build_info.build_info + trt_ver = bi['tensorrt_version'] + nccl_ver = bi['nccl_version'] GPU_DESCRIPTION = gpu_header + '\n - '.join([ - 'NVIDIA CUDA ' + cbi['cuda_version'], - 'NVIDIA cuDNN ' + cbi['cudnn_version'], + 'NVIDIA CUDA ' + bi['cuda_version'], + 'NVIDIA cuDNN ' + bi['cudnn_version'], 'NVIDIA NCCL ' + 'not enabled' if not nccl_ver else nccl_ver, 'NVIDIA TensorRT ' + 'not enabled' if not trt_ver else trt_ver, ]) @@ -303,7 +305,7 @@ setup( zip_safe=False, # Accessible with importlib.metadata.metadata('tf-pkg-name').items() platforms=[ - f'{key}:{value}' for key, value in build_info.cuda_build_info.items() + f'{key}:{value}' for key, value in build_info.build_info.items() ], distclass=BinaryDistribution, cmdclass={ From 441d6983812af97104aa3453b09f3f411117d6c3 Mon Sep 17 00:00:00 2001 From: jacco Date: Tue, 14 Jan 2020 09:52:26 +0100 Subject: [PATCH 0055/1533] Use datamove in conv wrapper --- tensorflow/lite/micro/kernels/arc/conv.cc | 57 +++++-- .../lite/micro/kernels/arc/scratch_buffers.cc | 146 ++++++++++++++++++ .../lite/micro/kernels/arc/scratch_buffers.h | 42 +++++ .../micro/tools/make/targets/arc_makefile.inc | 5 + .../tools/make/third_party_downloads.inc | 4 +- 5 files changed, 235 insertions(+), 19 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/arc/scratch_buffers.cc create mode 100644 tensorflow/lite/micro/kernels/arc/scratch_buffers.h diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 69542e12e90..46be76a407b 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -25,6 +25,9 @@ limitations under the License. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" + +#include "mli_api.h" namespace tflite { namespace ops { @@ -139,7 +142,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorData(im2col), nullptr); } -void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, +TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, @@ -195,24 +198,43 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - mli_point_to_subtsr_cfg substr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg substr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; + // Get first input from batch + mli_point_to_subtsr_cfg subtsr_cfg_in = { {0, 0}, 2, static_cast(mli_in.shape[1]) }; + mli_point_to_subtsr_cfg subtsr_cfg_out = { {0, 0}, 2, static_cast(mli_out.shape[1]) }; + mli_tensor sub_mli_in = { 0 }; + mli_tensor sub_mli_out = { 0 }; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - const int batches = - MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + + mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); + mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); + const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); for (int i = 0; i < batches; i++) { - substr_cfg_in.start_coord[0] = i; - substr_cfg_out.start_coord[0] = i; - mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out); - - mli_krn_conv2d_hwc_sa8_sa8_sa32(&sub_mli_in, &mli_weights, &mli_bias, - &cfg, &sub_mli_out); + mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); + mli_krn_conv2d_hwc_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &cfg, &out_local); + mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + subtsr_cfg_in.start_coord[0]++; + subtsr_cfg_out.start_coord[0]++; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + if (in_is_local) { + in_local.data = sub_mli_in.data; + } + if (out_is_local) { + out_local.data = sub_mli_out.data; + } } } else { ConvParams op_params; @@ -233,6 +255,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); } + return kTfLiteOk; } void EvalFloat(TfLiteContext* context, TfLiteNode* node, @@ -309,7 +332,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* 
node) { nullptr, output); break; case kTfLiteInt8: - EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, + return EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, output, nullptr); break; case kTfLiteUInt8: diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc new file mode 100644 index 00000000000..2ac60dd0f25 --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -0,0 +1,146 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include + +/* by default use all the XY memory, and half of the DCCM because DCCM is also used + * for the data section and the stack. + * the values can be overruled by adding a -D option to the makefile of the application + */ +#ifndef SCRATCH_MEM_X_SIZE +#ifdef core_config_xy_size +#define SCRATCH_MEM_X_SIZE (core_config_xy_size) +#else +#define SCRATCH_MEM_X_SIZE (0) +#endif +#endif + +#ifndef SCRATCH_MEM_Y_SIZE +#ifdef core_config_xy_size +#define SCRATCH_MEM_Y_SIZE (core_config_xy_size) +#else +#define SCRATCH_MEM_Y_SIZE (0) +#endif +#endif + +#ifndef SCRATCH_MEM_Z_SIZE +#ifdef core_config_dccm_size +#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) +#else +#define SCRATCH_MEM_Z_SIZE (0) +#endif +#endif + +namespace { +#pragma Data(".Xdata") + static int8_t scratch_mem_x[SCRATCH_MEM_X_SIZE]; +#pragma Data() + +#pragma Data(".Ydata") + static int8_t scratch_mem_y[SCRATCH_MEM_Y_SIZE]; +#pragma Data() + +#pragma Data(".Zdata") + static int8_t scratch_mem_z[SCRATCH_MEM_Z_SIZE]; +#pragma Data() +} + +static inline +bool inside_arc_dccm(void* p) { +#if core_config_dccm_present + return ((unsigned)p >= core_config_dccm_base) && ((unsigned)p < core_config_dccm_base + core_config_dccm_size); +#else + return false; +#endif +} +static inline +bool inside_arc_xccm(void* p) { +#if core_config_xy + return ((unsigned)p >= core_config_xy_x_base) && ((unsigned)p < core_config_xy_x_base + core_config_xy_size); +#else + return false; +#endif +} +static inline +bool inside_arc_yccm(void* p) { +#if core_config_xy + return ((unsigned)p >= core_config_xy_y_base) && ((unsigned)p < core_config_xy_y_base + core_config_xy_size); +#else + return false; +#endif +} + +static inline +bool inside_arc_ccm(void* p) { + return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p); +} + +TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out) { +#ifdef __Xxy + // Function to assign fast memory from one of 3 scratch buffers. 
+ // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused + mli_tensor* tensors[3] = { weights, in, out }; + uint32_t tensor_sizes[3] = { + mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0), mli_hlp_count_elem_num(tensors[2], 0) }; + bool mem_is_free[3] = { true, true, true }; + int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; + uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; + + for (int i = 0; i < 3; ++i) { + int best_mem_idx = -1; + int best_mem_delta = INT_MAX; + // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. + if (inside_arc_ccm(tensors[i]->data)) continue; + for (int j = 0; j < 3; ++j) { + // Best Fit + if (mem_is_free[j] && tensor_sizes[i] <= scratch_sizes[j] && scratch_sizes[j] - tensor_sizes[i] < best_mem_delta) { + best_mem_idx = j; + best_mem_delta = scratch_sizes[j] - tensor_sizes[i]; + } + } + if (best_mem_idx >= 0) { + tensors[i]->data = static_cast(scratch_mem[best_mem_idx]); + tensors[i]->capacity = scratch_sizes[best_mem_idx]; + mem_is_free[best_mem_idx] = false; + } else { + return kTfLiteError; + } + } + + // Bias is expected to be much smaller than other operands, not affect performance and can be placed + // in the end of some of already used memory bank (to occupy free space of it) + bool is_bias_allocated = inside_arc_ccm(bias->data); + if (!is_bias_allocated) { + uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); + for (int i = 0; i < 3; ++i) { + if (tensors[i]->capacity - tensor_sizes[i] > bias_mem_requirements) { + bias->data = &((char*)tensors[i]->data)[tensor_sizes[i]]; + bias->capacity = bias_mem_requirements; + tensors[i]->capacity = tensor_sizes[i]; + is_bias_allocated = true; + break; + } + } + } + return (is_bias_allocated) ? kTfLiteOk : kTfLiteError; +#else + return kTfLiteOk; +#endif +} diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h new file mode 100644 index 00000000000..198cc5b83cf --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -0,0 +1,42 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ +#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ + +#include "tensorflow/lite/c/common.h" +#include "mli_api.h" + +/** + * @brief Function to allocate scratch buffers for the convolution tensors + * + * @detail This function will update the data pointers in the 4 tensors with pointers + * to scratch buffers in fast local memory. 
+ * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param weights [IO] pointer to the weights tensor + * @param bias [IO] pointer to the bias tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out); + +#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 0f56e5f4641..16e89266614 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -22,6 +22,7 @@ else endif PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS += -tcf_core_config PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map CXXFLAGS += $(PLATFORM_FLAGS) @@ -80,6 +81,10 @@ endif third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc endif # USE_EMBARC_MLI diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index b331db2c80e..69e7910f6c2 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -74,8 +74,8 @@ PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/6316034d421cbbb59756239908d7c9a99075a3bb.zip" -EMBARC_MLI_MD5 := "db0910cf0e07e43f74ae7a31de485d56" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/428cfd6a89f848e403a8b8ca02eab2a897ae8cd3.zip" +EMBARC_MLI_MD5 := "9c6c8f8877fa6dd738d7ab62665b3a6e" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From 9d6f2440471312a44914db75e77dbe91ab532e7e Mon Sep 17 00:00:00 2001 From: jacco Date: Thu, 16 Jan 2020 15:39:33 +0100 Subject: [PATCH 0056/1533] add data move functionality to depthwise, fc, pooling --- .../person_detection_test.cc | 2 + .../lite/micro/kernels/arc/depthwise_conv.cc | 57 +++++++++++++------ .../lite/micro/kernels/arc/fully_connected.cc | 46 +++++++++++---- tensorflow/lite/micro/kernels/arc/pooling.cc | 44 ++++++++++---- .../lite/micro/kernels/arc/scratch_buffers.cc | 44 +++++++++++++- .../lite/micro/kernels/arc/scratch_buffers.h | 16 ++++++ 6 files changed, 168 insertions(+), 41 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index b0979735d4f..cac5596cd83 100644 --- 
a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -28,7 +28,9 @@ limitations under the License. // Create an area of memory to use for input, output, and intermediate arrays. constexpr int tensor_arena_size = 125 * 1024; +#pragma Data(".System") uint8_t tensor_arena[tensor_arena_size]; +#pragma Data() TF_LITE_MICRO_TESTS_BEGIN diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index 6322414f5c6..4cf7b08bda8 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -26,6 +26,9 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" + +#include "mli_api.h" namespace tflite { namespace ops { @@ -131,7 +134,7 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(output)); } -void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, +TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, @@ -186,24 +189,43 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - mli_point_to_subtsr_cfg substr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg substr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; + // Get first input from batch + mli_point_to_subtsr_cfg subtsr_cfg_in = { {0, 0}, 2, static_cast(mli_in.shape[1]) }; + mli_point_to_subtsr_cfg subtsr_cfg_out = { {0, 0}, 2, static_cast(mli_out.shape[1]) }; + mli_tensor sub_mli_in = { 0 }; + mli_tensor sub_mli_out = { 0 }; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - const int batches = - MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + + mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); + mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); + const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); for (int i = 0; i < batches; i++) { - substr_cfg_in.start_coord[0] = i; - substr_cfg_out.start_coord[0] = i; - mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out); - - mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(&sub_mli_in, &mli_weights, - &mli_bias, &cfg, &sub_mli_out); + mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); + 
mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &cfg, &out_local); + mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + subtsr_cfg_in.start_coord[0]++; + subtsr_cfg_out.start_coord[0]++; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + if (in_is_local) { + in_local.data = sub_mli_in.data; + } + if (out_is_local) { + out_local.data = sub_mli_out.data; + } } } else { DepthwiseParams op_params; @@ -230,6 +252,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); } + return kTfLiteOk; } void EvalQuantized(TfLiteContext* context, TfLiteNode* node, @@ -311,7 +334,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { EvalFloat(context, node, params, &data, input, filter, bias, output); break; case kTfLiteInt8: - EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, + return EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, output); break; case kTfLiteUInt8: diff --git a/tensorflow/lite/micro/kernels/arc/fully_connected.cc b/tensorflow/lite/micro/kernels/arc/fully_connected.cc index 57203f10487..9c484718b25 100644 --- a/tensorflow/lite/micro/kernels/arc/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc/fully_connected.cc @@ -24,6 +24,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" + +#include "mli_api.h" + namespace tflite { namespace ops { @@ -95,24 +99,44 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, ConvertToMliTensor(bias, &mli_bias); ConvertToMliTensor(output, &mli_out); - mli_point_to_subtsr_cfg substr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg substr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_in = {{0, 0}, 2, static_cast(mli_in.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_out = {{0, 0}, 2, static_cast(mli_out.shape[1])}; mli_tensor sub_mli_in = {0}; mli_tensor sub_mli_out = {0}; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + + mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); + mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); for (int i = 0; i < batches; i++) { - substr_cfg_in.start_coord[0] = i; - substr_cfg_out.start_coord[0] = i; - mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out); - - mli_krn_fully_connected_sa8_sa8_sa32(&sub_mli_in, 
&mli_weights, &mli_bias, - &sub_mli_out); + mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); + mli_krn_fully_connected_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &out_local); + mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + subtsr_cfg_in.start_coord[0]++; + subtsr_cfg_out.start_coord[0]++; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + if (in_is_local) { + in_local.data = sub_mli_in.data; + } + if (out_is_local) { + out_local.data = sub_mli_out.data; + } } } else { FullyConnectedParams op_params; diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/arc/pooling.cc index 55452013028..ef72a6c0649 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc/pooling.cc @@ -21,6 +21,9 @@ limitations under the License. #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" + +#include "mli_api.h" namespace tflite { namespace ops { @@ -97,7 +100,7 @@ void AverageEvalUint8(TfLiteContext* context, const TfLiteNode* node, GetTensorShape(output), GetTensorData(output)); } -void AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, +TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, const TfLitePoolParams* params, const OpData* data, const TfLiteTensor* input, TfLiteTensor* output) { // Run Average Pooling MLI kernel @@ -128,23 +131,39 @@ void AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - mli_point_to_subtsr_cfg substr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg substr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_in = {{0,0}, 2, static_cast(mli_in.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_out = {{0,0}, 2, static_cast(mli_out.shape[1])}; mli_tensor sub_mli_in = {0}; mli_tensor sub_mli_out = {0}; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_io_tensors(context, &in_local, &out_local)); + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); for (int i = 0; i < batches; i++) { - substr_cfg_in.start_coord[0] = i; - substr_cfg_out.start_coord[0] = i; - mli_hlp_point_to_subtensor(&mli_in, &substr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &substr_cfg_out, &sub_mli_out); - - mli_krn_avepool_hwc_sa8(&sub_mli_in, &cfg, &sub_mli_out); + mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); + mli_krn_avepool_hwc_sa8(&in_local, &cfg, &out_local); + mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + subtsr_cfg_in.start_coord[0]++; + subtsr_cfg_out.start_coord[0]++; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); 
+ if (in_is_local) { + in_local.data = sub_mli_in.data; + } + if (out_is_local) { + out_local.data = sub_mli_out.data; + } } } else { int32_t activation_min, activation_max; @@ -163,6 +182,7 @@ void AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); } + return kTfLiteOk; } void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, @@ -227,7 +247,7 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { AverageEvalUint8(context, node, params, &data, input, output); break; case kTfLiteInt8: - AverageEvalInt8(context, node, params, &data, input, output); + return AverageEvalInt8(context, node, params, &data, input, output); break; default: TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported", diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 2ac60dd0f25..5bcc4752260 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -38,7 +38,9 @@ limitations under the License. #ifndef SCRATCH_MEM_Z_SIZE #ifdef core_config_dccm_size -#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) +// temporary disable the use of dccm scratch mem +//#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) +#define SCRATCH_MEM_Z_SIZE (0) #else #define SCRATCH_MEM_Z_SIZE (0) #endif @@ -144,3 +146,43 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, return kTfLiteOk; #endif } + +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out) { +#ifdef __Xxy + // Function to assign fast memory from one of 3 scratch buffers. + // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused + mli_tensor* tensors[2] = { in, out }; + uint32_t tensor_sizes[2] = { + mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0)}; + bool mem_is_free[3] = { true, true, true }; + int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; + uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; + int num_tensors = 2; + int num_memories = 3; + + + for (int i = 0; i < num_tensors; ++i) { + int best_mem_idx = -1; + int best_mem_delta = INT_MAX; + // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. 
+ if (inside_arc_ccm(tensors[i]->data)) continue; + for (int j = 0; j < num_memories; ++j) { + // Best Fit + if (mem_is_free[j] && tensor_sizes[i] <= scratch_sizes[j] && scratch_sizes[j] - tensor_sizes[i] < best_mem_delta) { + best_mem_idx = j; + best_mem_delta = scratch_sizes[j] - tensor_sizes[i]; + } + } + if (best_mem_idx >= 0) { + tensors[i]->data = static_cast(scratch_mem[best_mem_idx]); + tensors[i]->capacity = scratch_sizes[best_mem_idx]; + mem_is_free[best_mem_idx] = false; + } else { + return kTfLiteError; + } + } +#endif + return kTfLiteOk; +} \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h index 198cc5b83cf..d92ecc02d3a 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -39,4 +39,20 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, mli_tensor* bias, mli_tensor* out); +/** + * @brief Function to allocate scratch buffers for kernels with only input and output buffers + * + * @detail This function will update the data pointers in the 2 tensors with pointers + * to scratch buffers in fast local memory. + * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out); + #endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ From bf8b8ac71ca40917a9ba09933179343f03879edb Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 20 Jan 2020 18:41:26 +0300 Subject: [PATCH 0057/1533] person_detection example: wrap data with named bss section --- .../person_detection_experimental/main_functions.cc | 2 ++ .../person_detection_test.cc | 4 ++-- tensorflow/lite/micro/kernels/arc/scratch_buffers.cc | 12 ++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc index 719f16b2d36..552b52c9c51 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc @@ -42,7 +42,9 @@ TfLiteTensor* input = nullptr; // An area of memory to use for input, output, and intermediate arrays. constexpr int kTensorArenaSize = 125 * 1024; +#pragma Bss(".tensor_arena") static uint8_t tensor_arena[kTensorArenaSize]; +#pragma Bss() } // namespace // The name of this function is important for Arduino compatibility. diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index cac5596cd83..9c7212648cc 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -28,9 +28,9 @@ limitations under the License. // Create an area of memory to use for input, output, and intermediate arrays. 
constexpr int tensor_arena_size = 125 * 1024; -#pragma Data(".System") +#pragma Bss(".tensor_arena") uint8_t tensor_arena[tensor_arena_size]; -#pragma Data() +#pragma Bss() TF_LITE_MICRO_TESTS_BEGIN diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 5bcc4752260..477f4f37b2b 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -47,17 +47,17 @@ limitations under the License. #endif namespace { -#pragma Data(".Xdata") +#pragma Bss(".Xdata") static int8_t scratch_mem_x[SCRATCH_MEM_X_SIZE]; -#pragma Data() +#pragma Bss() -#pragma Data(".Ydata") +#pragma Bss(".Ydata") static int8_t scratch_mem_y[SCRATCH_MEM_Y_SIZE]; -#pragma Data() +#pragma Bss() -#pragma Data(".Zdata") +#pragma Bss(".Zdata") static int8_t scratch_mem_z[SCRATCH_MEM_Z_SIZE]; -#pragma Data() +#pragma Bss() } static inline From d6917614dd5d5d3d58e699ab113b08ff07a1b2d6 Mon Sep 17 00:00:00 2001 From: jacco Date: Mon, 20 Jan 2020 16:56:53 +0100 Subject: [PATCH 0058/1533] add LCF file for ARC target --- .../micro/tools/make/targets/arc/memory.lcf | 49 +++++++++++++++++++ .../micro/tools/make/targets/arc_makefile.inc | 4 +- 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/memory.lcf diff --git a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf new file mode 100644 index 00000000000..1d967bde0fa --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf @@ -0,0 +1,49 @@ + # SYSTEM memory regions indicate where external memory might be located. + # The TCF has no specific knowledge of whether SYSTEM regions contain + # external memory or not. + # CCMWRAP memory regions indicate unusable portions of the address space + # due to CCM memory wrapping into upper addresses beyond its size + + MEMORY { + ICCM0 : ORIGIN = 0x00000000, LENGTH = 0x00080000 + # CCMWRAP0: ORIGIN = 0x00080000, LENGTH = 0x00080000 + # SYSTEM0 : ORIGIN = 0x00100000, LENGTH = 0x00700000 + DCCM : ORIGIN = 0x00800000, LENGTH = 0x00080000 + # CCMWRAP1: ORIGIN = 0x00880000, LENGTH = 0x00080000 + # SYSTEM1 : ORIGIN = 0x00900000, LENGTH = 0x00300000 + XCCM : ORIGIN = 0x00c00000, LENGTH = 0x00010000 + # CCMWRAP2: ORIGIN = 0x00c10000, LENGTH = 0x000f0000 + # SYSTEM2 : ORIGIN = 0x00d00000, LENGTH = 0x00100000 + YCCM : ORIGIN = 0x00e00000, LENGTH = 0x00010000 + # CCMWRAP3: ORIGIN = 0x00e10000, LENGTH = 0x000f0000 + # SYSTEM3 : ORIGIN = 0x00f00000, LENGTH = 0x00100000 + } + SECTIONS { + GROUP BLOCK(4): { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:132): {} + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + .tensor_arena?: {} + } > ICCM0 + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + .protobuf?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > DCCM + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + GROUP BLOCK(4): { + .Ydata? 
: {} + } > YCCM + } + + + diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 16e89266614..09fabd5e2cf 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -23,7 +23,7 @@ endif PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -fslp-vectorize-aggressive -ffunction-sections -fdata-sections PLATFORM_FLAGS += -tcf_core_config - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -default_lcf=$(MAKEFILE_DIR)/targets/arc/memory.lcf CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) @@ -86,6 +86,8 @@ endif MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf + endif # USE_EMBARC_MLI endif From bab1f34a3cb829a900f30178cda321b418909ff1 Mon Sep 17 00:00:00 2001 From: jacco Date: Mon, 20 Jan 2020 17:05:42 +0100 Subject: [PATCH 0059/1533] Update URL to latest MLI lib with optimizations for person detect example --- tensorflow/lite/micro/tools/make/third_party_downloads.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 69e7910f6c2..8c8684ebec6 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -74,8 +74,8 @@ PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/428cfd6a89f848e403a8b8ca02eab2a897ae8cd3.zip" -EMBARC_MLI_MD5 := "9c6c8f8877fa6dd738d7ab62665b3a6e" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/d8702db473472764dcc8d2dff1f68c690d368be3.zip" +EMBARC_MLI_MD5 := "7a798dfe1424971b9ae50cd019e03616" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From 279e034660d296ca3dc3eed1ea604ce61e96a58b Mon Sep 17 00:00:00 2001 From: jacco Date: Wed, 22 Jan 2020 14:46:58 +0100 Subject: [PATCH 0060/1533] fix memory allocation issue for person detect example --- .../lite/micro/kernels/arc/scratch_buffers.cc | 15 ++++++-- .../micro/tools/make/targets/arc/memory.lcf | 35 ++++++++++--------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 477f4f37b2b..4c75a0a0fd4 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -38,9 +38,7 @@ limitations under the License. 
#ifndef SCRATCH_MEM_Z_SIZE #ifdef core_config_dccm_size -// temporary disable the use of dccm scratch mem -//#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) -#define SCRATCH_MEM_Z_SIZE (0) +#define SCRATCH_MEM_Z_SIZE ((core_config_dccm_size) / 2) #else #define SCRATCH_MEM_Z_SIZE (0) #endif @@ -141,6 +139,17 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, } } } + if (!is_bias_allocated) { + uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); + for (int i = 0; i < 3; ++i) { + if (mem_is_free[i]) { + bias->data = static_cast(scratch_mem[i]); + bias->capacity = bias_mem_requirements; + is_bias_allocated = true; + break; + } + } + } return (is_bias_allocated) ? kTfLiteOk : kTfLiteError; #else return kTfLiteOk; diff --git a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf index 1d967bde0fa..00cf0a3050b 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf @@ -5,30 +5,30 @@ # due to CCM memory wrapping into upper addresses beyond its size MEMORY { - ICCM0 : ORIGIN = 0x00000000, LENGTH = 0x00080000 - # CCMWRAP0: ORIGIN = 0x00080000, LENGTH = 0x00080000 - # SYSTEM0 : ORIGIN = 0x00100000, LENGTH = 0x00700000 - DCCM : ORIGIN = 0x00800000, LENGTH = 0x00080000 - # CCMWRAP1: ORIGIN = 0x00880000, LENGTH = 0x00080000 - # SYSTEM1 : ORIGIN = 0x00900000, LENGTH = 0x00300000 - XCCM : ORIGIN = 0x00c00000, LENGTH = 0x00010000 - # CCMWRAP2: ORIGIN = 0x00c10000, LENGTH = 0x000f0000 - # SYSTEM2 : ORIGIN = 0x00d00000, LENGTH = 0x00100000 - YCCM : ORIGIN = 0x00e00000, LENGTH = 0x00010000 - # CCMWRAP3: ORIGIN = 0x00e10000, LENGTH = 0x000f0000 - # SYSTEM3 : ORIGIN = 0x00f00000, LENGTH = 0x00100000 + ICCM0 : ORIGIN = 0x00000000, LENGTH = 0x00010000 + # CCMWRAP0: ORIGIN = 0x00010000, LENGTH = 0x0fff0000 + ICCM1 : ORIGIN = 0x10000000, LENGTH = 0x00080000 + # CCMWRAP1: ORIGIN = 0x10080000, LENGTH = 0x0ff80000 + # SYSTEM0 : ORIGIN = 0x20000000, LENGTH = 0x60000000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00080000 + # CCMWRAP2: ORIGIN = 0x80080000, LENGTH = 0x0ff80000 + XCCM : ORIGIN = 0x90000000, LENGTH = 0x00008000 + # CCMWRAP3: ORIGIN = 0x90008000, LENGTH = 0x0fff8000 + YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00008000 + # CCMWRAP4: ORIGIN = 0xa0008000, LENGTH = 0x0fff8000 + # SYSTEM1 : ORIGIN = 0xb0000000, LENGTH = 0x50000000 } SECTIONS { GROUP BLOCK(4): { .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:132): {} - .text? : { *('.text$crt*') } + .text? : { *('.text$crt*') } * (TEXT): {} * (LIT): {} - .tensor_arena?: {} - } > ICCM0 + .rodata_in_data?:{} + } > ICCM1 GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ + /* _SDA_BASE_ computed implicitly */ .sdata?: {} .sbss?: {} .protobuf?: {} @@ -36,7 +36,8 @@ * (BSS): {} .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > DCCM + .tensor_arena?: {} + } > DCCM GROUP BLOCK(4): { .Xdata? 
: {} } > XCCM From b045244f289aacf22c51c9202b68e9ea311e9554 Mon Sep 17 00:00:00 2001 From: jacco Date: Mon, 10 Feb 2020 10:37:30 +0100 Subject: [PATCH 0061/1533] update MLI lib to performance optimized MLI1.1 pre-release --- tensorflow/lite/micro/tools/make/third_party_downloads.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 8c8684ebec6..6141efedbee 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -74,8 +74,8 @@ PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/d8702db473472764dcc8d2dff1f68c690d368be3.zip" -EMBARC_MLI_MD5 := "7a798dfe1424971b9ae50cd019e03616" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/97c09b81bd1c4d0455de298626c271d75faedba2.zip" +EMBARC_MLI_MD5 := "f7c5555a15e7837806cfaeb22d3c7b50" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From f110cdd8303a2365fafa7c9ffab984d27f7538e5 Mon Sep 17 00:00:00 2001 From: jacco Date: Fri, 6 Mar 2020 15:00:54 +0100 Subject: [PATCH 0062/1533] Add slicing logic for convolution layers in case the tensors don't fit completely in local memory, slicing is used to split the tensors. --- tensorflow/lite/micro/kernels/arc/conv.cc | 61 +++--- .../lite/micro/kernels/arc/depthwise_conv.cc | 63 +++--- .../lite/micro/kernels/arc/fully_connected.cc | 1 + .../lite/micro/kernels/arc/mli_slicers.cc | 93 +++++++++ .../lite/micro/kernels/arc/mli_slicers.h | 56 +++++ tensorflow/lite/micro/kernels/arc/pooling.cc | 3 + .../lite/micro/kernels/arc/scratch_buf_mgr.cc | 192 ++++++++++++++++++ .../lite/micro/kernels/arc/scratch_buf_mgr.h | 75 +++++++ .../lite/micro/kernels/arc/scratch_buffers.cc | 179 +++++----------- .../lite/micro/kernels/arc/scratch_buffers.h | 75 +++---- .../micro/tools/make/targets/arc_makefile.inc | 4 + 11 files changed, 588 insertions(+), 214 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/arc/mli_slicers.cc create mode 100644 tensorflow/lite/micro/kernels/arc/mli_slicers.h create mode 100644 tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc create mode 100644 tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 46be76a407b..8141154147b 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -26,6 +26,8 @@ limitations under the License. 
#include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" #include "mli_api.h" @@ -198,44 +200,51 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - // Get first input from batch - mli_point_to_subtsr_cfg subtsr_cfg_in = { {0, 0}, 2, static_cast(mli_in.shape[1]) }; - mli_point_to_subtsr_cfg subtsr_cfg_out = { {0, 0}, 2, static_cast(mli_out.shape[1]) }; - mli_tensor sub_mli_in = { 0 }; - mli_tensor sub_mli_out = { 0 }; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + const int heightDimension = 1; + int inSliceHeight = 0; + int outSliceHeight = 0; + const int kernelHeight = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); + const int overlap = kernelHeight - cfg.stride_height; // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, &inSliceHeight, &outSliceHeight)); + + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); + + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? 
out_slice.Sub() : &out_local; mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); - const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); - for (int i = 0; i < batches; i++) { - mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); - mli_krn_conv2d_hwc_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &cfg, &out_local); - mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); - subtsr_cfg_in.start_coord[0]++; - subtsr_cfg_out.start_coord[0]++; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - if (in_is_local) { - in_local.data = sub_mli_in.data; - } - if (out_is_local) { - out_local.data = sub_mli_out.data; - } + while (!out_slice.Done()) { + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + mli_krn_conv2d_hwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); } + free_arc_scratch_buffers(); } else { ConvParams op_params; op_params.input_offset = -input->params.zero_point; diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index 4cf7b08bda8..5921c4e4dff 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -27,6 +27,8 @@ limitations under the License. #include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" #include "mli_api.h" @@ -189,44 +191,53 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - // Get first input from batch - mli_point_to_subtsr_cfg subtsr_cfg_in = { {0, 0}, 2, static_cast(mli_in.shape[1]) }; - mli_point_to_subtsr_cfg subtsr_cfg_out = { {0, 0}, 2, static_cast(mli_out.shape[1]) }; - mli_tensor sub_mli_in = { 0 }; - mli_tensor sub_mli_out = { 0 }; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + const int heightDimension = 1; + int inSliceHeight = 0; + int outSliceHeight = 0; + const int kernelHeight = static_cast(mli_weights.shape[KRNL_DW_H_DIM_HWC]); + const int overlap = kernelHeight - cfg.stride_height; // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; // this assumes that output shape is already filled in the tensor struct. 
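  // Worked example (hypothetical values): a kernel of height 3 with stride 1
  // gives overlap = 3 - 1 = 2, i.e. neighbouring input slices must share two
  // rows so that every output row still sees the kernel's full receptive field.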
mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + /* if the tensor is already in local memory, is_local is true */ + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, &inSliceHeight, &outSliceHeight)); + + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); + + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); - const int batches = MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); - for (int i = 0; i < batches; i++) { - mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); - mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &cfg, &out_local); - mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); - subtsr_cfg_in.start_coord[0]++; - subtsr_cfg_out.start_coord[0]++; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - if (in_is_local) { - in_local.data = sub_mli_in.data; - } - if (out_is_local) { - out_local.data = sub_mli_out.data; - } + while (!out_slice.Done()) { + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); } + free_arc_scratch_buffers(); } else { DepthwiseParams op_params; op_params.padding_type = PaddingType::kSame; diff --git a/tensorflow/lite/micro/kernels/arc/fully_connected.cc b/tensorflow/lite/micro/kernels/arc/fully_connected.cc index 9c484718b25..42921037481 100644 --- a/tensorflow/lite/micro/kernels/arc/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc/fully_connected.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc new file mode 100644 index 00000000000..0ae80d1afc3 --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc @@ -0,0 +1,93 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mli_slicers.h" + +#define MAX(A,B) (((A) > (B))? (A): (B)) +#define MIN(A,B) (((A) > (B))? (B): (A)) + +namespace tflite { +namespace ops { +namespace micro { + +TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre, int padding_post, int overlap) + : full_tensor_(full_tensor) + , sliceDim_(slice_dim) + , pad_pre_(padding_pre) + , pad_post_(padding_post) + , overlap_(overlap) + , subtsr_cfg_{ {0, 0}, static_cast(slice_dim + 1), static_cast(slice_size) } + , sub_tensor_{0} + , done_(false){ + + ComputeSubTensor(); +} + +void TensorSlicer::ComputeSubTensor(void) { + // subtsr_cfg_ is used to keep track of the itteration. + // A copy is created to update it with the correct clipping and padding for the current slice + mli_point_to_subtsr_cfg cfg_new = subtsr_cfg_; + // add clipping of first_out_dim_size to not exceed total size in that dimensions + // add padding logic + + // begin and end spans the complete input region including padding areas. + const int begin = (int)subtsr_cfg_.start_coord[1] - pad_pre_; + // end is clipped to the end of the full input region. this is needed for cases where the last slice is smaller than the rest. + const int end = MIN(begin + subtsr_cfg_.first_out_dim_size + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); + // The start coordinate of the subtensor is clipped to zero + cfg_new.start_coord[sliceDim_] = MAX(begin, 0); + // and the stop coordinate is clipped to the size of the full tensor + const int stop_coord = MIN(end, full_tensor_->shape[sliceDim_]); + // compute the size of the subtensor + cfg_new.first_out_dim_size = stop_coord - cfg_new.start_coord[sliceDim_]; + + // compute the padding configuration for the current slice. + actual_padding_pre = cfg_new.start_coord[sliceDim_] - begin; + actual_padding_post = end - stop_coord; + + mli_hlp_point_to_subtensor(full_tensor_, &cfg_new, &sub_tensor_); +} +void TensorSlicer::Next(void){ + // TODO make generic for any number of dimensions. 
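  // Illustrative walk-through (hypothetical shape): for a full tensor with
  // 2 batches and height 6, sliced in steps of first_out_dim_size = 2, the
  // start coordinate advances (batch, height) = (0,0) -> (0,2) -> (0,4) ->
  // (1,0) -> (1,2) -> (1,4), after which the iteration is marked done.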
+ subtsr_cfg_.start_coord[1]+= subtsr_cfg_.first_out_dim_size; + if (subtsr_cfg_.start_coord[1] >= full_tensor_->shape[1]) { + subtsr_cfg_.start_coord[1] = 0; + subtsr_cfg_.start_coord[0]++; + if (subtsr_cfg_.start_coord[0] >= full_tensor_->shape[0]) { + done_ = true; + } + } + if (!done_) ComputeSubTensor(); +} + +bool TensorSlicer::Done(void) { + return done_; +} + +int TensorSlicer::GetPaddingPre(void) { + return actual_padding_pre; +} + +int TensorSlicer::GetPaddingPost(void) { + return actual_padding_post; +} + +mli_tensor* TensorSlicer::Sub(void) { + return &sub_tensor_; +} + +} // namespace micro +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.h b/tensorflow/lite/micro/kernels/arc/mli_slicers.h new file mode 100644 index 00000000000..40f948a07ef --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.h @@ -0,0 +1,56 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_ +#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_ + +#include "mli_api.h" +namespace tflite { +namespace ops { +namespace micro { + +class TensorSlicer { +public: + + TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre = 0, int padding_post = 0, int overlap = 0); + ~TensorSlicer() = default; + + void Next(); + bool Done(); + int GetPaddingPre(); + int GetPaddingPost(); + + mli_tensor *Sub(); + + // Default constructor is deleted + TensorSlicer() = delete; + + +private: + const mli_tensor* full_tensor_; + mli_tensor sub_tensor_; + mli_point_to_subtsr_cfg subtsr_cfg_; + bool done_; + int sliceDim_; + int pad_pre_, pad_post_, overlap_; + int actual_padding_pre, actual_padding_post; + + void ComputeSubTensor(); +}; + +} // namespace micro +} // namespace ops +} // namespace tflite +#endif //TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_ diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/arc/pooling.cc index ef72a6c0649..dab0ad7e314 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc/pooling.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/padding.h" #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" #include "mli_api.h" @@ -154,6 +155,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); mli_krn_avepool_hwc_sa8(&in_local, &cfg, &out_local); mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); + if (i == batches -1) break; subtsr_cfg_in.start_coord[0]++; subtsr_cfg_out.start_coord[0]++; mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); @@ -165,6 +167,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, out_local.data = sub_mli_out.data; } } + free_arc_scratch_buffers(); } else { int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc new file mode 100644 index 00000000000..26f4f45f17f --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc @@ -0,0 +1,192 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" +#include +#define MAX(A,B) (((A) > (B))? (A): (B)) +#define MIN(A,B) (((A) > (B))? (B): (A)) + +namespace tflite { +namespace ops { +namespace micro { + + + +void get_arc_two_buffer_sizes(int requestsize1, int requestsize2, int *grantsize1, int *grantsize2) { + int maxrequest = 0; + int secondrequest = 0; + int maxavailable = 0; + int secondavail = 0; + + // determine the largest requested buffer. + if (requestsize1 > requestsize2) { + maxrequest = requestsize1; + secondrequest = requestsize2; + } else { + maxrequest = requestsize2; + secondrequest = requestsize1; + } + + // find the two largest available buffers. + get_arc_scratch_buffer_two_max_sizes(&maxavailable, &secondavail); + + // in case two buffers are available, the largest buffer can go to the largest request. + if (secondavail > 0) { // this condition can be enhanced to prevent cases where the second buffer is so small that it is better to use one buffer and split it. + if (requestsize1 > requestsize2) { + *grantsize1 = maxavailable; + *grantsize2 = secondavail; + } else { + *grantsize1 = secondavail; + *grantsize2 = maxavailable; + } + } else { + // In case only one buffer is available, + // use only the max buffer, and split it. + // TODO compute optimal split ratio based on request ratio. 
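    // Illustrative example (hypothetical sizes): if only a single 32 KiB bank
    // is still free, requests of 40 KiB and 8 KiB each receive a 16 KiB half
    // of it, and the callers rely on slicing to process their data in chunks
    // that fit the granted capacity.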
+ *grantsize1 = maxavailable / 2; + *grantsize2 = maxavailable / 2; + } +} + +TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out) { +#ifdef __Xxy + + if (!inside_arc_ccm(weights->data)) { + int weights_size = mli_hlp_count_elem_num(weights, 0) * mli_hlp_tensor_element_size(weights); + weights->data = get_arc_scratch_buffer(weights_size); + weights->capacity = weights_size; + if (weights->data == NULL) return kTfLiteError; + } + + if (!inside_arc_ccm(bias->data)) { + uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); + bias->data = get_arc_scratch_buffer(bias_mem_requirements); + bias->capacity = bias_mem_requirements; + if (bias->data == NULL) return kTfLiteError; + } + + int requestSizeIn = 0; + int requestSizeOut = 0; + int grantsizeIn = 0; + int grantsizeOut = 0; + if (!inside_arc_ccm(in->data)) { + // In case the input tensor contains multiple batches, it has rank 4 + // because the mli kernel cannot operate on batches, we need to have the size + // of a single batch. that is why the startRank is 1 in case of input rank 4 + int startRank = in->rank - 3; // tOdo explain + requestSizeIn = mli_hlp_count_elem_num(in, startRank) * mli_hlp_tensor_element_size(in); + } + if (!inside_arc_ccm(out->data)) { + // In case the input tensor contains multiple batches, it has rank 4 + // because the mli kernel cannot operate on batches, we need to have the size + // of a single batch. that is why the startRank is 1 in case of input rank 4 + int startRank = out->rank - 3; + requestSizeOut = mli_hlp_count_elem_num(out, startRank) * mli_hlp_tensor_element_size(out); + } + + get_arc_two_buffer_sizes(requestSizeIn, requestSizeOut, &grantsizeIn, &grantsizeOut); + + if (!inside_arc_ccm(in->data)) { + in->data = get_arc_scratch_buffer(grantsizeIn); + in->capacity = grantsizeIn; + if (in->data == NULL) return kTfLiteError; + } + if (!inside_arc_ccm(out->data)) { + out->data = get_arc_scratch_buffer(grantsizeOut); + out->capacity = grantsizeOut; + if (out->data == NULL) return kTfLiteError; + } + + return kTfLiteOk; +#else + return kTfLiteOk; +#endif +} + +TfLiteStatus arc_scratch_buffer_calc_slice_size_io( + const mli_tensor *in, + const mli_tensor *out, + const int kernelHeight, + const int strideHeight, + int *inSliceHeight, + int *outSliceHeight) { + const int heightDimension = 1; // todo: compute from rank + const int inHeight = in->shape[heightDimension]; + const int outHeight = out->shape[heightDimension]; + const int lineSizeIn = mli_hlp_count_elem_num(in, heightDimension + 1) * mli_hlp_tensor_element_size(in); + const int lineSizeOut = mli_hlp_count_elem_num(out, heightDimension + 1) * mli_hlp_tensor_element_size(out); + int maxLinesIn = 0; + int maxLinesOut = 0; + int maxOutLinesForInput = 0; + bool fit = (in->capacity >= inHeight * lineSizeIn) && (out->capacity >= outHeight * lineSizeOut); + if (fit) { + // in case both tensors completely fit in the capacity, there is no need for slicing + *inSliceHeight = inHeight; + *outSliceHeight = outHeight; + } else { + // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. 
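    // Worked example (hypothetical values): inHeight = 32, lineSizeIn = 1 KiB,
    // in->capacity = 8 KiB, kernelHeight = 3, strideHeight = 1 gives
    // maxLinesIn = 8 and maxOutLinesForInput = (8 - 3 + 1) / 1 = 6; the final
    // slice heights are further limited by how many lines fit in out->capacity.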
+ maxLinesIn = MIN(inHeight, in->capacity / lineSizeIn); + if (maxLinesIn >= inHeight) { + maxOutLinesForInput = outHeight; + } else { + maxOutLinesForInput = (maxLinesIn - kernelHeight + 1) / strideHeight; // TODO add padding exceptions and test by makin fit=false; + } + // Ten compute how many ouput lines fit into the output tensor. + maxLinesOut = MIN(outHeight, out->capacity / lineSizeOut); + // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. + *outSliceHeight = MIN(maxOutLinesForInput, maxLinesOut); + *inSliceHeight = *outSliceHeight * strideHeight; + } + if ((*inSliceHeight > 0) && (*outSliceHeight > 0)) { + return kTfLiteOk; + } else { + return kTfLiteError; + } +} + +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out) { +#ifdef __Xxy + // Function to assign fast memory from one of 3 scratch buffers. + // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused + mli_tensor* tensors[2] = { in, out }; + uint32_t tensor_sizes[2] = { + mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0)}; + int num_tensors = 2; + + + for (int i = 0; i < num_tensors; ++i) { + // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. + if (inside_arc_ccm(tensors[i]->data)) continue; + tensors[i]->data = get_arc_scratch_buffer(tensor_sizes[i]); + tensors[i]->capacity = tensor_sizes[i]; + + if (tensors[i]->data == NULL) { + return kTfLiteError; + } + } +#endif + return kTfLiteOk; +} + +} // namespace micro +} // namespace ops +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h new file mode 100644 index 00000000000..a27df8a5358 --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h @@ -0,0 +1,75 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_ +#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_ + +#include "tensorflow/lite/c/common.h" +#include "mli_api.h" + +namespace tflite { +namespace ops { +namespace micro { + +/** + * @brief Function to allocate scratch buffers for the convolution tensors + * + * @detail This function will update the data pointers in the 4 tensors with pointers + * to scratch buffers in fast local memory. 
+ * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param weights [IO] pointer to the weights tensor + * @param bias [IO] pointer to the bias tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out); + +/** + * @brief Function to allocate scratch buffers for kernels with only input and output buffers + * + * @detail This function will update the data pointers in the 2 tensors with pointers + * to scratch buffers in fast local memory. + * + * @param context [I] pointer to TfLite context (needed for error handling) + * @param in [IO] pointer to the input tensor + * @param output [IO] pointer to the output tensor + * + * @return Tf Lite status code + */ +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out); + +TfLiteStatus arc_scratch_buffer_calc_slice_size_io( + const mli_tensor *in, + const mli_tensor *out, + const int kernelHeight, + const int strideHeight, + int *inSliceHeight, + int *outSliceHeight); + + +} // namespace micro +} // namespace ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_ diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 4c75a0a0fd4..5ef1b445a22 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -15,6 +15,12 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" #include +#define MAX(A,B) (((A) > (B))? (A): (B)) +#define MIN(A,B) (((A) > (B))? (B): (A)) + +namespace tflite { +namespace ops { +namespace micro { /* by default use all the XY memory, and half of the DCCM because DCCM is also used * for the data section and the stack. @@ -58,140 +64,57 @@ namespace { #pragma Bss() } -static inline -bool inside_arc_dccm(void* p) { -#if core_config_dccm_present - return ((unsigned)p >= core_config_dccm_base) && ((unsigned)p < core_config_dccm_base + core_config_dccm_size); -#else - return false; -#endif -} -static inline -bool inside_arc_xccm(void* p) { -#if core_config_xy - return ((unsigned)p >= core_config_xy_x_base) && ((unsigned)p < core_config_xy_x_base + core_config_xy_size); -#else - return false; -#endif -} -static inline -bool inside_arc_yccm(void* p) { -#if core_config_xy - return ((unsigned)p >= core_config_xy_y_base) && ((unsigned)p < core_config_xy_y_base + core_config_xy_size); -#else - return false; -#endif -} +static int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; +static uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; -static inline -bool inside_arc_ccm(void* p) { - return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p); -} -TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* weights, - mli_tensor* bias, - mli_tensor* out) { -#ifdef __Xxy - // Function to assign fast memory from one of 3 scratch buffers. 
- // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused - mli_tensor* tensors[3] = { weights, in, out }; - uint32_t tensor_sizes[3] = { - mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0), mli_hlp_count_elem_num(tensors[2], 0) }; - bool mem_is_free[3] = { true, true, true }; - int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; - uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; - - for (int i = 0; i < 3; ++i) { - int best_mem_idx = -1; - int best_mem_delta = INT_MAX; - // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. - if (inside_arc_ccm(tensors[i]->data)) continue; - for (int j = 0; j < 3; ++j) { - // Best Fit - if (mem_is_free[j] && tensor_sizes[i] <= scratch_sizes[j] && scratch_sizes[j] - tensor_sizes[i] < best_mem_delta) { - best_mem_idx = j; - best_mem_delta = scratch_sizes[j] - tensor_sizes[i]; - } - } - if (best_mem_idx >= 0) { - tensors[i]->data = static_cast(scratch_mem[best_mem_idx]); - tensors[i]->capacity = scratch_sizes[best_mem_idx]; - mem_is_free[best_mem_idx] = false; - } else { - return kTfLiteError; +void *get_arc_scratch_buffer(int size) { + // Function to asign fast memory from one of 3 scratch buffers. + // Best Fit strategy - memory is allocated from that memory bank that leaves the least unused memory. + void *buf = NULL; + int best_mem_idx = -1; + int best_mem_delta = INT_MAX; + // find a local memory that fits the data size. + for (int mem_idx = 0; mem_idx < sizeof(scratch_mem)/sizeof(scratch_mem[0]); ++mem_idx) { + // Best Fit + if ((size <= scratch_sizes[mem_idx]) && (scratch_sizes[mem_idx] - size < best_mem_delta)) { + best_mem_idx = mem_idx; + best_mem_delta = scratch_sizes[mem_idx] - size; } } - - // Bias is expected to be much smaller than other operands, not affect performance and can be placed - // in the end of some of already used memory bank (to occupy free space of it) - bool is_bias_allocated = inside_arc_ccm(bias->data); - if (!is_bias_allocated) { - uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); - for (int i = 0; i < 3; ++i) { - if (tensors[i]->capacity - tensor_sizes[i] > bias_mem_requirements) { - bias->data = &((char*)tensors[i]->data)[tensor_sizes[i]]; - bias->capacity = bias_mem_requirements; - tensors[i]->capacity = tensor_sizes[i]; - is_bias_allocated = true; - break; - } - } + if (best_mem_idx >= 0) { + buf = static_cast(scratch_mem[best_mem_idx]); + scratch_mem[best_mem_idx] += size; + scratch_sizes[best_mem_idx] -= size; } - if (!is_bias_allocated) { - uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); - for (int i = 0; i < 3; ++i) { - if (mem_is_free[i]) { - bias->data = static_cast(scratch_mem[i]); - bias->capacity = bias_mem_requirements; - is_bias_allocated = true; - break; - } - } - } - return (is_bias_allocated) ? kTfLiteOk : kTfLiteError; -#else - return kTfLiteOk; -#endif + return buf; } -TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* out) { -#ifdef __Xxy - // Function to assign fast memory from one of 3 scratch buffers. 
- // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused - mli_tensor* tensors[2] = { in, out }; - uint32_t tensor_sizes[2] = { - mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0)}; - bool mem_is_free[3] = { true, true, true }; - int8_t* scratch_mem[] = {scratch_mem_x, scratch_mem_y, scratch_mem_z}; - uint32_t scratch_sizes[] = {SCRATCH_MEM_X_SIZE, SCRATCH_MEM_Y_SIZE, SCRATCH_MEM_Z_SIZE}; - int num_tensors = 2; - int num_memories = 3; - - - for (int i = 0; i < num_tensors; ++i) { - int best_mem_idx = -1; - int best_mem_delta = INT_MAX; - // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. - if (inside_arc_ccm(tensors[i]->data)) continue; - for (int j = 0; j < num_memories; ++j) { - // Best Fit - if (mem_is_free[j] && tensor_sizes[i] <= scratch_sizes[j] && scratch_sizes[j] - tensor_sizes[i] < best_mem_delta) { - best_mem_idx = j; - best_mem_delta = scratch_sizes[j] - tensor_sizes[i]; - } - } - if (best_mem_idx >= 0) { - tensors[i]->data = static_cast(scratch_mem[best_mem_idx]); - tensors[i]->capacity = scratch_sizes[best_mem_idx]; - mem_is_free[best_mem_idx] = false; - } else { - return kTfLiteError; +void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) { + int maxavailable = 0; + int secondavail = 0; + // find the two largest available buffers. + for (int i = 0; i < 3; i++) { + if (scratch_sizes[i] > maxavailable) { + secondavail = maxavailable; + maxavailable = scratch_sizes[i]; + } else if (scratch_sizes[i] > secondavail) { + secondavail = scratch_sizes[i]; } } -#endif - return kTfLiteOk; -} \ No newline at end of file + *size1 = maxavailable; + *size2 = secondavail; +} + +void free_arc_scratch_buffers(void) { + scratch_mem[0] = scratch_mem_x; + scratch_mem[1] = scratch_mem_y; + scratch_mem[2] = scratch_mem_z; + scratch_sizes[0] = SCRATCH_MEM_X_SIZE; + scratch_sizes[1] = SCRATCH_MEM_Y_SIZE; + scratch_sizes[2] = SCRATCH_MEM_Z_SIZE; +} + +} // namespace micro +} // namespace ops +} // namespace tflite \ No newline at end of file diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h index d92ecc02d3a..52a12c7899d 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -19,40 +19,47 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "mli_api.h" -/** - * @brief Function to allocate scratch buffers for the convolution tensors - * - * @detail This function will update the data pointers in the 4 tensors with pointers - * to scratch buffers in fast local memory. - * - * @param context [I] pointer to TfLite context (needed for error handling) - * @param in [IO] pointer to the input tensor - * @param weights [IO] pointer to the weights tensor - * @param bias [IO] pointer to the bias tensor - * @param output [IO] pointer to the output tensor - * - * @return Tf Lite status code - */ -TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* weights, - mli_tensor* bias, - mli_tensor* out); +namespace tflite { +namespace ops { +namespace micro { -/** - * @brief Function to allocate scratch buffers for kernels with only input and output buffers - * - * @detail This function will update the data pointers in the 2 tensors with pointers - * to scratch buffers in fast local memory. 
- * - * @param context [I] pointer to TfLite context (needed for error handling) - * @param in [IO] pointer to the input tensor - * @param output [IO] pointer to the output tensor - * - * @return Tf Lite status code - */ -TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* out); + +void free_arc_scratch_buffers(void); +void *get_arc_scratch_buffer(int size);// Function to assign fast memory from one of 3 scratch buffers. + +void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2); + +static inline bool inside_arc_dccm(void* p) { +#if core_config_dccm_present + return ((unsigned)p >= core_config_dccm_base) && ((unsigned)p < core_config_dccm_base + core_config_dccm_size); +#else + return false; +#endif +} + +static inline bool inside_arc_xccm(void* p) { +#if core_config_xy + return ((unsigned)p >= core_config_xy_x_base) && ((unsigned)p < core_config_xy_x_base + core_config_xy_size); +#else + return false; +#endif +} + +static inline bool inside_arc_yccm(void* p) { +#if core_config_xy + return ((unsigned)p >= core_config_xy_y_base) && ((unsigned)p < core_config_xy_y_base + core_config_xy_size); +#else + return false; +#endif +} + +static inline +bool inside_arc_ccm(void* p) { + return inside_arc_dccm(p) || inside_arc_xccm(p) || inside_arc_yccm(p); +} + +} // namespace micro +} // namespace ops +} // namespace tflite #endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_ diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 09fabd5e2cf..a1f5546b8f5 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -85,6 +85,10 @@ endif MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf From c2e501e017b31b94c30bc5903bc613a8b0d7e109 Mon Sep 17 00:00:00 2001 From: jacco Date: Wed, 4 Mar 2020 09:58:48 +0100 Subject: [PATCH 0063/1533] Fix for upstream merge conflict the location of the header file was changed in the upstream archive. but the makefile was not updated. 
--- tensorflow/lite/micro/tools/make/targets/arc_makefile.inc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index a1f5546b8f5..5ce2e03bfc3 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -89,6 +89,7 @@ endif MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf From 210253668472888264a9c8f6eef9f58e3d7f3e34 Mon Sep 17 00:00:00 2001 From: jacco Date: Thu, 26 Mar 2020 17:26:19 +0100 Subject: [PATCH 0064/1533] update to new version of MLI needed for slicing --- tensorflow/lite/micro/kernels/arc/conv.cc | 2 +- tensorflow/lite/micro/kernels/arc/depthwise_conv.cc | 2 +- tensorflow/lite/micro/tools/make/third_party_downloads.inc | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 8141154147b..06be9384125 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -238,7 +238,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = in_slice.GetPaddingPost(); mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_conv2d_hwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); + mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); in_slice.Next(); diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index 5921c4e4dff..fe47c7f25e0 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -231,7 +231,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = in_slice.GetPaddingPost(); mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_depthwise_conv2d_hwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); + mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); in_slice.Next(); diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 6141efedbee..ce24ba29542 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -74,8 +74,8 @@ PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/97c09b81bd1c4d0455de298626c271d75faedba2.zip" -EMBARC_MLI_MD5 := "f7c5555a15e7837806cfaeb22d3c7b50" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/4b6c6eed65395dced1564006be8188781af16035.zip" +EMBARC_MLI_MD5 := 
"47167553c17ff8c7cd59fb1afb90c304" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From b4bcc4e5743fbe031406745f2474bb27bc49ba2e Mon Sep 17 00:00:00 2001 From: jacco Date: Fri, 20 Mar 2020 16:32:14 +0100 Subject: [PATCH 0065/1533] add slicing logic for weight slicing in conv kernel for ARC backend --- tensorflow/lite/micro/kernels/arc/conv.cc | 78 ++++++-- .../lite/micro/kernels/arc/depthwise_conv.cc | 2 +- .../lite/micro/kernels/arc/mli_slicers.cc | 74 +++++-- .../lite/micro/kernels/arc/mli_slicers.h | 4 +- tensorflow/lite/micro/kernels/arc/pooling.cc | 48 +++-- .../lite/micro/kernels/arc/scratch_buf_mgr.cc | 184 +++++++++++------- .../lite/micro/kernels/arc/scratch_buf_mgr.h | 6 + .../lite/micro/kernels/arc/scratch_buffers.cc | 18 +- .../lite/micro/kernels/arc/scratch_buffers.h | 1 + 9 files changed, 278 insertions(+), 137 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 06be9384125..9e9a37821e8 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -200,12 +200,18 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } + // for height slicing const int heightDimension = 1; int inSliceHeight = 0; int outSliceHeight = 0; const int kernelHeight = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); const int overlap = kernelHeight - cfg.stride_height; + // for weight slicing (on output channels) + const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. + int sliceChannels = static_cast(mli_weights.shape[weightOutChDimension]); + const int outTensorChDimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; @@ -214,36 +220,68 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, &sliceChannels)); + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ const bool in_is_local = in_local.data == mli_in.data; const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; - /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. 
- on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); - TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); + TensorSlicer w_slice(&mli_weights, weightOutChDimension, sliceChannels); + TensorSlicer b_slice(&mli_bias, weightOutChDimension, sliceChannels); + TensorSlicer out_ch_slice(&mli_out, outTensorChDimension, sliceChannels, 0, 0, 0, true); - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); - mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); + void *inputBufferPtr = NULL; - while (!out_slice.Done()) { - cfg.padding_top = in_slice.GetPaddingPre(); - cfg.padding_bottom = in_slice.GetPaddingPost(); + while (!w_slice.Done()){ + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); - in_slice.Next(); - out_slice.Next(); + /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of + output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and + height dimension. */ + TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + + /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? 
out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + TF_LITE_ENSURE(context, !in_slice.Done()); + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + // if same input copy as previous iteration, skip the copy of input + if (in_slice.Sub()->data != inputBufferPtr) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + inputBufferPtr = in_slice.Sub()->data; + } + mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); + TF_LITE_ENSURE(context, in_slice.Done()); } + free_arc_scratch_buffers(); } else { ConvParams op_params; diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index fe47c7f25e0..00c46c442b7 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -210,7 +210,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const bool in_is_local = in_local.data == mli_in.data; const bool out_is_local = out_local.data == mli_out.data; - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc index 0ae80d1afc3..6c6c89715f8 100644 --- a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc @@ -22,53 +22,89 @@ namespace tflite { namespace ops { namespace micro { -TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre, int padding_post, int overlap) +TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre, int padding_post, int overlap, bool interleave_mode) : full_tensor_(full_tensor) , sliceDim_(slice_dim) , pad_pre_(padding_pre) , pad_post_(padding_post) , overlap_(overlap) - , subtsr_cfg_{ {0, 0}, static_cast(slice_dim + 1), static_cast(slice_size) } + , sub_cfg_{0} , sub_tensor_{0} , done_(false){ + /* In the interleave mode, the slicing happens from the deepest dimension up to the slice_dim + for example in an HWC layout this can mode can be used to slice in the C dimenstion. + in this mode the data is not contiguous in memory anymore */ + if (interleave_mode) { + for (int i = 0; i< full_tensor->rank; i++){ + if (i > slice_dim) { + sub_cfg_.size[i] = 1; + } else if (i == slice_dim) { + sub_cfg_.size[i] = slice_size; + } else { + sub_cfg_.size[i] = full_tensor->shape[i]; + } + } + sub_cfg_.sub_tensor_rank = full_tensor->rank; + + } else { + /* In the not interlevaed mode, the slicing happens from the outer most dimension up to the slice_dim + for example in an HWC layout this mode can be used to slice in the H dimension. 
+ in this mode the data of the slice is still contiguous in memory (if that was the case in the input tensor */ + for (int i = 0; i< full_tensor->rank; i++){ + if (i < slice_dim) { + sub_cfg_.size[i] = 1; + } else if (i == slice_dim) { + sub_cfg_.size[i] = slice_size; + }else { + sub_cfg_.size[i] = full_tensor->shape[i]; + } + } + sub_cfg_.sub_tensor_rank = full_tensor->rank - slice_dim; + } + ComputeSubTensor(); } void TensorSlicer::ComputeSubTensor(void) { - // subtsr_cfg_ is used to keep track of the itteration. + + // subtsr_cfg_ is used to keep track of the iteration. // A copy is created to update it with the correct clipping and padding for the current slice - mli_point_to_subtsr_cfg cfg_new = subtsr_cfg_; - // add clipping of first_out_dim_size to not exceed total size in that dimensions - // add padding logic + mli_sub_tensor_cfg cfg_new = sub_cfg_; // begin and end spans the complete input region including padding areas. - const int begin = (int)subtsr_cfg_.start_coord[1] - pad_pre_; + const int begin = (int)sub_cfg_.offset[sliceDim_] - pad_pre_; // end is clipped to the end of the full input region. this is needed for cases where the last slice is smaller than the rest. - const int end = MIN(begin + subtsr_cfg_.first_out_dim_size + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); + const int end = MIN(begin + sub_cfg_.size[sliceDim_] + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); // The start coordinate of the subtensor is clipped to zero - cfg_new.start_coord[sliceDim_] = MAX(begin, 0); + cfg_new.offset[sliceDim_] = MAX(begin, 0); // and the stop coordinate is clipped to the size of the full tensor const int stop_coord = MIN(end, full_tensor_->shape[sliceDim_]); // compute the size of the subtensor - cfg_new.first_out_dim_size = stop_coord - cfg_new.start_coord[sliceDim_]; + cfg_new.size[sliceDim_] = stop_coord - cfg_new.offset[sliceDim_]; // compute the padding configuration for the current slice. - actual_padding_pre = cfg_new.start_coord[sliceDim_] - begin; + actual_padding_pre = cfg_new.offset[sliceDim_] - begin; actual_padding_post = end - stop_coord; - mli_hlp_point_to_subtensor(full_tensor_, &cfg_new, &sub_tensor_); + mli_hlp_create_subtensor(full_tensor_, &cfg_new, &sub_tensor_); } + void TensorSlicer::Next(void){ - // TODO make generic for any number of dimensions. - subtsr_cfg_.start_coord[1]+= subtsr_cfg_.first_out_dim_size; - if (subtsr_cfg_.start_coord[1] >= full_tensor_->shape[1]) { - subtsr_cfg_.start_coord[1] = 0; - subtsr_cfg_.start_coord[0]++; - if (subtsr_cfg_.start_coord[0] >= full_tensor_->shape[0]) { - done_ = true; + for (int i = full_tensor_->rank - 1; i >= 0; i--) { + sub_cfg_.offset[i] += sub_cfg_.size[i]; + if (sub_cfg_.offset[i] >= full_tensor_->shape[i]){ + // wrap + sub_cfg_.offset[i] = 0; + // and continue to the next dimension, if no next dimension we are done. 
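+      // (the offsets advance like an odometer: the innermost dimension is incremented first and a wrap carries into the next outer dimension)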
+ if (i == 0) done_ = true; + continue; + } else { + // carry is false, so break from the loop + break; } } + if (!done_) ComputeSubTensor(); } diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.h b/tensorflow/lite/micro/kernels/arc/mli_slicers.h index 40f948a07ef..3fc7d911fa5 100644 --- a/tensorflow/lite/micro/kernels/arc/mli_slicers.h +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.h @@ -24,7 +24,7 @@ namespace micro { class TensorSlicer { public: - TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre = 0, int padding_post = 0, int overlap = 0); + TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size, int padding_pre = 0, int padding_post = 0, int overlap = 0, bool interleave_mode = false); ~TensorSlicer() = default; void Next(); @@ -41,7 +41,7 @@ public: private: const mli_tensor* full_tensor_; mli_tensor sub_tensor_; - mli_point_to_subtsr_cfg subtsr_cfg_; + mli_sub_tensor_cfg sub_cfg_; bool done_; int sliceDim_; int pad_pre_, pad_post_, overlap_; diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/arc/pooling.cc index dab0ad7e314..0cfa5363d69 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc/pooling.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" #include "mli_api.h" @@ -139,33 +140,42 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + const int heightDimension = 1; + int inSliceHeight = 0; + int outSliceHeight = 0; + const int overlap = cfg.kernel_height - cfg.stride_height; + // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor in_local = sub_mli_in; mli_tensor out_local = sub_mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_io_tensors(context, &in_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); - const int batches = - MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. 
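+      (consecutive input slices overlap by cfg.kernel_height - cfg.stride_height rows, so every output row still sees all of the input rows it needs)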
+ The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); - for (int i = 0; i < batches; i++) { - mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); - mli_krn_avepool_hwc_sa8(&in_local, &cfg, &out_local); - mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); - if (i == batches -1) break; - subtsr_cfg_in.start_coord[0]++; - subtsr_cfg_out.start_coord[0]++; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - if (in_is_local) { - in_local.data = sub_mli_in.data; - } - if (out_is_local) { - out_local.data = sub_mli_out.data; - } + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + mli_krn_avepool_hwc_sa8(in_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); } free_arc_scratch_buffers(); } else { diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc index 26f4f45f17f..e9adbb37e9e 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc @@ -66,22 +66,128 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, mli_tensor* weights, mli_tensor* bias, mli_tensor* out) { +TfLiteStatus ret_val = kTfLiteOk; #ifdef __Xxy if (!inside_arc_ccm(weights->data)) { int weights_size = mli_hlp_count_elem_num(weights, 0) * mli_hlp_tensor_element_size(weights); + int maxWeightsSize = 0; weights->data = get_arc_scratch_buffer(weights_size); weights->capacity = weights_size; - if (weights->data == NULL) return kTfLiteError; + if (weights->data == NULL) { + get_arc_scratch_buffer_max_size(&maxWeightsSize); + weights->data = get_arc_scratch_buffer(maxWeightsSize); + weights->capacity = maxWeightsSize; + if (maxWeightsSize == 0) ret_val = kTfLiteError; + } + if (weights->data == NULL) ret_val = kTfLiteError; } if (!inside_arc_ccm(bias->data)) { uint32_t bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); bias->data = get_arc_scratch_buffer(bias_mem_requirements); bias->capacity = bias_mem_requirements; - if (bias->data == NULL) return kTfLiteError; + } + if (ret_val == kTfLiteOk) { + ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out); + } + if (bias->data == NULL) { + int maxBiasSize = 0; + get_arc_scratch_buffer_max_size(&maxBiasSize); + bias->data = get_arc_scratch_buffer(maxBiasSize); + bias->capacity = maxBiasSize; + if (maxBiasSize == 0) ret_val = kTfLiteError; + } + if (bias->data == NULL) ret_val = kTfLiteError; + +#endif + return ret_val; +} + +TfLiteStatus arc_scratch_buffer_calc_slice_size_io( + const mli_tensor *in, + const mli_tensor *out, + const int kernelHeight, + const int strideHeight, + const int padding_top, + const int padding_bot, + int *inSliceHeight, + int *outSliceHeight) { + const int heightDimension = 1; // todo: compute from rank + const int inHeight = 
in->shape[heightDimension]; + const int outHeight = out->shape[heightDimension]; + const int lineSizeIn = mli_hlp_count_elem_num(in, heightDimension + 1) * mli_hlp_tensor_element_size(in); + const int lineSizeOut = mli_hlp_count_elem_num(out, heightDimension + 1) * mli_hlp_tensor_element_size(out); + int maxLinesIn = 0; + int maxLinesOut = 0; + int maxOutLinesForInput = 0; + bool fit = (in->capacity >= inHeight * lineSizeIn) && (out->capacity >= outHeight * lineSizeOut); + if (fit) { + // in case both tensors completely fit in the capacity, there is no need for slicing + *inSliceHeight = inHeight; + *outSliceHeight = outHeight; + } else { + // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. + maxLinesIn = MIN(inHeight, in->capacity / lineSizeIn); + if (maxLinesIn >= inHeight) { + maxOutLinesForInput = outHeight; + } else if (2 * maxLinesIn >= inHeight) { + // in this case only two slices are needed, so both could benefit from padding. take the MIN to get the worst case. + maxOutLinesForInput = (maxLinesIn + MIN(padding_top, padding_bot) - kernelHeight + 1) / strideHeight; + } else { + maxOutLinesForInput = (maxLinesIn - kernelHeight + 1) / strideHeight; // TODO add padding exceptions and test by makin fit=false; + } + // Ten compute how many ouput lines fit into the output tensor. + maxLinesOut = MIN(outHeight, out->capacity / lineSizeOut); + // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. + *outSliceHeight = MIN(maxOutLinesForInput, maxLinesOut); + *inSliceHeight = *outSliceHeight * strideHeight; } + if ((*inSliceHeight > 0) && (*outSliceHeight > 0)) { + return kTfLiteOk; + } else { + return kTfLiteError; + } +} + +TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( + const mli_tensor *weights, + const mli_tensor *bias, + int *sliceChannels) { + const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. + const int channels = weights->shape[weightOutChDimension]; + + + const int chSizeW = mli_hlp_count_elem_num(weights, weightOutChDimension + 1) * mli_hlp_tensor_element_size(weights); + const int chSizeB = mli_hlp_count_elem_num(bias, weightOutChDimension + 1) * mli_hlp_tensor_element_size(bias); + int maxChWeights = 0; + int maxChBias = 0; + + bool fit = (weights->capacity >= channels * chSizeW) && (bias->capacity >= channels * chSizeB); + if (fit) { + // in case both tensors completely fit in the capacity, there is no need for slicing + *sliceChannels = channels; + } else { + // First compute how many channels fit into the weights tensor + maxChWeights = MIN(channels, weights->capacity / chSizeW); + // Ten compute how many channels fit into the bias tensor. + maxChBias = MIN(channels, bias->capacity / chSizeB); + // the smallest of the two determines the slice size + *sliceChannels = MIN(maxChWeights, maxChBias); + } + + if (*sliceChannels > 0) { + return kTfLiteOk; + } else { + return kTfLiteError; + } +} + +TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out) { +#ifdef __Xxy int requestSizeIn = 0; int requestSizeOut = 0; int grantsizeIn = 0; @@ -89,8 +195,8 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, if (!inside_arc_ccm(in->data)) { // In case the input tensor contains multiple batches, it has rank 4 // because the mli kernel cannot operate on batches, we need to have the size - // of a single batch. 
that is why the startRank is 1 in case of input rank 4 - int startRank = in->rank - 3; // tOdo explain + // of a single HWC tensor. that is why the startRank is 1 in case of input rank 4 + int startRank = in->rank - 3; requestSizeIn = mli_hlp_count_elem_num(in, startRank) * mli_hlp_tensor_element_size(in); } if (!inside_arc_ccm(out->data)) { @@ -113,76 +219,6 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, out->capacity = grantsizeOut; if (out->data == NULL) return kTfLiteError; } - - return kTfLiteOk; -#else - return kTfLiteOk; -#endif -} - -TfLiteStatus arc_scratch_buffer_calc_slice_size_io( - const mli_tensor *in, - const mli_tensor *out, - const int kernelHeight, - const int strideHeight, - int *inSliceHeight, - int *outSliceHeight) { - const int heightDimension = 1; // todo: compute from rank - const int inHeight = in->shape[heightDimension]; - const int outHeight = out->shape[heightDimension]; - const int lineSizeIn = mli_hlp_count_elem_num(in, heightDimension + 1) * mli_hlp_tensor_element_size(in); - const int lineSizeOut = mli_hlp_count_elem_num(out, heightDimension + 1) * mli_hlp_tensor_element_size(out); - int maxLinesIn = 0; - int maxLinesOut = 0; - int maxOutLinesForInput = 0; - bool fit = (in->capacity >= inHeight * lineSizeIn) && (out->capacity >= outHeight * lineSizeOut); - if (fit) { - // in case both tensors completely fit in the capacity, there is no need for slicing - *inSliceHeight = inHeight; - *outSliceHeight = outHeight; - } else { - // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. - maxLinesIn = MIN(inHeight, in->capacity / lineSizeIn); - if (maxLinesIn >= inHeight) { - maxOutLinesForInput = outHeight; - } else { - maxOutLinesForInput = (maxLinesIn - kernelHeight + 1) / strideHeight; // TODO add padding exceptions and test by makin fit=false; - } - // Ten compute how many ouput lines fit into the output tensor. - maxLinesOut = MIN(outHeight, out->capacity / lineSizeOut); - // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. - *outSliceHeight = MIN(maxOutLinesForInput, maxLinesOut); - *inSliceHeight = *outSliceHeight * strideHeight; - } - if ((*inSliceHeight > 0) && (*outSliceHeight > 0)) { - return kTfLiteOk; - } else { - return kTfLiteError; - } -} - -TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* out) { -#ifdef __Xxy - // Function to assign fast memory from one of 3 scratch buffers. - // Best Fit strategy - memory is asigned to those tensor which leave less memory of bank unused - mli_tensor* tensors[2] = { in, out }; - uint32_t tensor_sizes[2] = { - mli_hlp_count_elem_num(tensors[0], 0), mli_hlp_count_elem_num(tensors[1], 0)}; - int num_tensors = 2; - - - for (int i = 0; i < num_tensors; ++i) { - // only for tensors that are not already located in one of the ccm memories, find a local memory that fits the data size. 
- if (inside_arc_ccm(tensors[i]->data)) continue; - tensors[i]->data = get_arc_scratch_buffer(tensor_sizes[i]); - tensors[i]->capacity = tensor_sizes[i]; - - if (tensors[i]->data == NULL) { - return kTfLiteError; - } - } #endif return kTfLiteOk; } diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h index a27df8a5358..fc348229235 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h @@ -64,9 +64,15 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( const mli_tensor *out, const int kernelHeight, const int strideHeight, + const int padding_top, + const int padding_bot, int *inSliceHeight, int *outSliceHeight); +TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( + const mli_tensor *weights, + const mli_tensor *bias, + int *sliceChannels); } // namespace micro } // namespace ops diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc index 5ef1b445a22..106743cf471 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc @@ -74,8 +74,9 @@ void *get_arc_scratch_buffer(int size) { void *buf = NULL; int best_mem_idx = -1; int best_mem_delta = INT_MAX; + const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); // find a local memory that fits the data size. - for (int mem_idx = 0; mem_idx < sizeof(scratch_mem)/sizeof(scratch_mem[0]); ++mem_idx) { + for (int mem_idx = 0; mem_idx < numMem; ++mem_idx) { // Best Fit if ((size <= scratch_sizes[mem_idx]) && (scratch_sizes[mem_idx] - size < best_mem_delta)) { best_mem_idx = mem_idx; @@ -90,11 +91,24 @@ void *get_arc_scratch_buffer(int size) { return buf; } +void get_arc_scratch_buffer_max_size(int *size) { + int maxavailable = 0; + const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); + // find the largest available buffer. + for (int i = 0; i < numMem; i++) { + if (scratch_sizes[i] > maxavailable) { + maxavailable = scratch_sizes[i]; + } + } + *size = maxavailable; +} + void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) { int maxavailable = 0; int secondavail = 0; + const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]); // find the two largest available buffers. - for (int i = 0; i < 3; i++) { + for (int i = 0; i < numMem; i++) { if (scratch_sizes[i] > maxavailable) { secondavail = maxavailable; maxavailable = scratch_sizes[i]; diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h index 52a12c7899d..927e480da5a 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -27,6 +27,7 @@ namespace micro { void free_arc_scratch_buffers(void); void *get_arc_scratch_buffer(int size);// Function to assign fast memory from one of 3 scratch buffers. 
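+// Function to query the size of the largest available scratch buffer.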
+void get_arc_scratch_buffer_max_size(int *size); void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2); static inline bool inside_arc_dccm(void* p) { From 330c649075978d1718c7b590da38dea640f67698 Mon Sep 17 00:00:00 2001 From: jacco Date: Thu, 26 Mar 2020 17:25:37 +0100 Subject: [PATCH 0066/1533] weight slicing for depthwise and fully connected in ARC backend --- tensorflow/lite/micro/kernels/arc/conv.cc | 39 +-- .../lite/micro/kernels/arc/depthwise_conv.cc | 106 +++++-- .../lite/micro/kernels/arc/fully_connected.cc | 93 ++++-- .../lite/micro/kernels/arc/mli_slicers.cc | 2 +- tensorflow/lite/micro/kernels/arc/pooling.cc | 19 +- .../lite/micro/kernels/arc/scratch_buf_mgr.cc | 288 +++++++++++------- .../lite/micro/kernels/arc/scratch_buf_mgr.h | 71 ++++- .../lite/micro/kernels/arc/scratch_buffers.cc | 14 +- .../lite/micro/kernels/arc/scratch_buffers.h | 2 +- 9 files changed, 434 insertions(+), 200 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/arc/conv.cc index 9e9a37821e8..6cf26c7d6d9 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/arc/conv.cc @@ -201,16 +201,16 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, } // for height slicing - const int heightDimension = 1; - int inSliceHeight = 0; - int outSliceHeight = 0; - const int kernelHeight = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); - const int overlap = kernelHeight - cfg.stride_height; + const int height_dimension = 1; + int in_slice_height = 0; + int out_slice_height = 0; + const int kernel_height = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); + const int overlap = kernel_height - cfg.stride_height; // for weight slicing (on output channels) - const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. - int sliceChannels = static_cast(mli_weights.shape[weightOutChDimension]); - const int outTensorChDimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + const int weight_out_ch_dimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. + int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); + const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. 
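+  // the same slice_channels value is used to slice the weights, the bias and the channel dimension of the output tensor, so each weight slice produces exactly the matching set of output channels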
// Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; @@ -220,8 +220,8 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, &sliceChannels)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); /* is_local indicates that the tensor is already in local memory, so in that case the original tensor can be used, @@ -231,14 +231,15 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const bool w_is_local = weights_local.data == mli_weights.data; const bool b_is_local = bias_local.data == mli_bias.data; - TensorSlicer w_slice(&mli_weights, weightOutChDimension, sliceChannels); - TensorSlicer b_slice(&mli_bias, weightOutChDimension, sliceChannels); - TensorSlicer out_ch_slice(&mli_out, outTensorChDimension, sliceChannels, 0, 0, 0, true); + TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels); + TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - void *inputBufferPtr = NULL; + void *input_buffer_ptr = NULL; + int input_buffer_size = 0; while (!w_slice.Done()){ mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); @@ -249,12 +250,12 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, cfg.padding_top, cfg.padding_bottom, overlap); /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and height dimension. */ - TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension, out_slice_height); /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ mli_tensor *in_ptr = in_is_local ? 
in_slice.Sub() : &in_local; @@ -266,9 +267,10 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = in_slice.GetPaddingPost(); // if same input copy as previous iteration, skip the copy of input - if (in_slice.Sub()->data != inputBufferPtr) { + if ((in_slice.Sub()->data != input_buffer_ptr) || (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - inputBufferPtr = in_slice.Sub()->data; + input_buffer_ptr = in_slice.Sub()->data; + input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); } mli_krn_conv2d_nhwc_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); @@ -282,7 +284,6 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TF_LITE_ENSURE(context, in_slice.Done()); } - free_arc_scratch_buffers(); } else { ConvParams op_params; op_params.input_offset = -input->params.zero_point; diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc index 00c46c442b7..74e48c8c064 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc @@ -191,12 +191,21 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = data->padding.height + data->padding.height_offset; } + // for height slicing const int heightDimension = 1; int inSliceHeight = 0; int outSliceHeight = 0; const int kernelHeight = static_cast(mli_weights.shape[KRNL_DW_H_DIM_HWC]); const int overlap = kernelHeight - cfg.stride_height; + // for weight slicing (on output channels) + const int weight_out_ch_dimension = 3; // HWCN layout for weigths, output channel dimension is the first dimension. + const int bias_out_ch_dimension = 0; // bias has only 1 dimension + const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension]; + const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension]; + int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); + // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; @@ -206,38 +215,83 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - /* if the tensor is already in local memory, is_local is true */ + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ const bool in_is_local = in_local.data == mli_in.data; const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); - /* mli_in tensor contains batches of HWC tensors. 
so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); - TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); - - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; - - mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); - mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); - - while (!out_slice.Done()) { - cfg.padding_top = in_slice.GetPaddingPre(); - cfg.padding_bottom = in_slice.GetPaddingPost(); - - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, &weights_local, &bias_local, &cfg, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); - - in_slice.Next(); - out_slice.Next(); + /* if input channels is not equal to output channels, a channel multiplier is used. + in this case the slice channels needs to be rounded down to a multiple of the input channels */ + if (in_channels != out_channels) { + slice_channels = (slice_channels / in_channels) * in_channels; } - free_arc_scratch_buffers(); + + TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + + mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + + void *input_buffer_ptr = NULL; + int input_buffer_size = 0; + int padding_top = cfg.padding_top; + int padding_bottom = cfg.padding_bottom; + + while (!w_slice.Done()){ + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); + + /* input tensor is alreade sliced in the channel dimension. out_ch_slice.Sub() is the tensor for the amount of + channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and + height dimension. + in_ch_slice.Sub() tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. + because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. + The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) + in chunks of 'sliceHeight' */ + TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, padding_top, padding_bottom, overlap); + + /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of + output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and + height dimension. 
*/ + TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + + /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + TF_LITE_ENSURE(context, !in_slice.Done()); + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + // if same input copy as previous iteration, skip the copy of input + if ((in_slice.Sub()->data != input_buffer_ptr) || (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + input_buffer_ptr = in_slice.Sub()->data; + input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); + } + mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); + in_ch_slice.Next(); + TF_LITE_ENSURE(context, in_slice.Done()); + } + } else { DepthwiseParams op_params; op_params.padding_type = PaddingType::kSame; diff --git a/tensorflow/lite/micro/kernels/arc/fully_connected.cc b/tensorflow/lite/micro/kernels/arc/fully_connected.cc index 42921037481..cc9b95c570a 100644 --- a/tensorflow/lite/micro/kernels/arc/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc/fully_connected.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" #include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" #include "mli_api.h" @@ -100,44 +101,80 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, ConvertToMliTensor(bias, &mli_bias); ConvertToMliTensor(output, &mli_out); - mli_point_to_subtsr_cfg subtsr_cfg_in = {{0, 0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg subtsr_cfg_out = {{0, 0}, 2, static_cast(mli_out.shape[1])}; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + /* The input tensor can have more than 2 dimensions. for the compute this doesn't make any difference + because all the inputs or a batch entry will be used anyway. because the MLI kernel doesn't recognize + the multiple dimensions, the tensor shape is casted to a {batchnum, inputsize} shape. 
*/ + mli_in.shape[0] = mli_out.shape[0]; + mli_in.shape[1] = mli_weights.shape[1]; + mli_in.shape[2] = 0; + mli_in.shape[3] = 0; + mli_in.rank = 2; // Tensors for data in fast (local) memory and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + const int weight_out_dimension = 0; + const int out_tensor_dimension = 1; + const int batch_dimension = 0; + int slice_size = mli_weights.shape[weight_out_dimension]; - mli_mov_tensor_sync(&mli_weights, ©_config, &weights_local); - mli_mov_tensor_sync(&mli_bias, ©_config, &bias_local); + /* allocate the local buffers, and compute the slice size */ + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_dimension, &slice_size)); + int max_out_slice_size = out_local.capacity / mli_hlp_tensor_element_size(&out_local); + if (slice_size > max_out_slice_size) slice_size = max_out_slice_size; - const int batches = - MatchingDim(GetTensorShape(input), 0, GetTensorShape(output), 0); + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; - for (int i = 0; i < batches; i++) { - mli_mov_tensor_sync(&sub_mli_in, ©_config, &in_local); - mli_krn_fully_connected_sa8_sa8_sa32(&in_local, &weights_local, &bias_local, &out_local); - mli_mov_tensor_sync(&out_local, ©_config, &sub_mli_out); - subtsr_cfg_in.start_coord[0]++; - subtsr_cfg_out.start_coord[0]++; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - if (in_is_local) { - in_local.data = sub_mli_in.data; - } - if (out_is_local) { - out_local.data = sub_mli_out.data; + TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size); + TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size); + TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0, true); + + mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + + void *input_buffer_ptr = NULL; + + while (!w_slice.Done()){ + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); + + TensorSlicer in_slice(&mli_in, batch_dimension, 1); + + /* output tensor is alreade sliced in the output size dimension. out_ch_slice.Sub() is the tensor for the amount of + output size of this itteration of the weight slice loop. 
This tensor needs to be further sliced over the batch */ + TensorSlicer out_slice(out_ch_slice.Sub(), batch_dimension, 1); + + /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + + // if same input copy as previous iteration, skip the copy of input + if (in_slice.Sub()->data != input_buffer_ptr) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + input_buffer_ptr = in_slice.Sub()->data; + } + mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); } } else { FullyConnectedParams op_params; diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc index 6c6c89715f8..91bae5caa38 100644 --- a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc +++ b/tensorflow/lite/micro/kernels/arc/mli_slicers.cc @@ -48,7 +48,7 @@ TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int sli sub_cfg_.sub_tensor_rank = full_tensor->rank; } else { - /* In the not interlevaed mode, the slicing happens from the outer most dimension up to the slice_dim + /* In the not interleaved mode, the slicing happens from the outer most dimension up to the slice_dim for example in an HWC layout this mode can be used to slice in the H dimension. in this mode the data of the slice is still contiguous in memory (if that was the case in the input tensor */ for (int i = 0; i< full_tensor->rank; i++){ diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/arc/pooling.cc index 0cfa5363d69..7a26a10e23b 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc/pooling.cc @@ -140,9 +140,9 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - const int heightDimension = 1; - int inSliceHeight = 0; - int outSliceHeight = 0; + const int height_dimension = 1; + int in_slice_height = 0; + int out_slice_height = 0; const int overlap = cfg.kernel_height - cfg.stride_height; // Tensors for data in fast (local) memory and config to copy data from external to local memory @@ -150,19 +150,22 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, mli_tensor out_local = sub_mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_io_tensors(context, &in_local, &out_local)); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors(context, &in_local, &out_local)); bool in_is_local = in_local.data == sub_mli_in.data; bool out_is_local = out_local.data == sub_mli_out.data; - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. 
because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, heightDimension, inSliceHeight, cfg.padding_top, cfg.padding_bottom, overlap); - TensorSlicer out_slice(&mli_out, heightDimension, outSliceHeight); + TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, height_dimension, out_slice_height); + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; @@ -177,7 +180,7 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, in_slice.Next(); out_slice.Next(); } - free_arc_scratch_buffers(); + } else { int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc index e9adbb37e9e..5bd2d6aed22 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc @@ -23,21 +23,19 @@ namespace tflite { namespace ops { namespace micro { - - -void get_arc_two_buffer_sizes(int requestsize1, int requestsize2, int *grantsize1, int *grantsize2) { +static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2, int *grant_size_1, int *grant_size_2) { int maxrequest = 0; int secondrequest = 0; int maxavailable = 0; int secondavail = 0; // determine the largest requested buffer. - if (requestsize1 > requestsize2) { - maxrequest = requestsize1; - secondrequest = requestsize2; + if (request_size_1 > request_size_2) { + maxrequest = request_size_1; + secondrequest = request_size_2; } else { - maxrequest = requestsize2; - secondrequest = requestsize1; + maxrequest = request_size_2; + secondrequest = request_size_1; } // find the two largest available buffers. @@ -45,40 +43,79 @@ void get_arc_two_buffer_sizes(int requestsize1, int requestsize2, int *grantsize // in case two buffers are available, the largest buffer can go to the largest request. if (secondavail > 0) { // this condition can be enhanced to prevent cases where the second buffer is so small that it is better to use one buffer and split it. - if (requestsize1 > requestsize2) { - *grantsize1 = maxavailable; - *grantsize2 = secondavail; + if (request_size_1 > request_size_2) { + *grant_size_1 = maxavailable; + *grant_size_2 = secondavail; } else { - *grantsize1 = secondavail; - *grantsize2 = maxavailable; + *grant_size_1 = secondavail; + *grant_size_2 = maxavailable; } } else { // In case only one buffer is available, // use only the max buffer, and split it. // TODO compute optimal split ratio based on request ratio. 
- *grantsize1 = maxavailable / 2; - *grantsize2 = maxavailable / 2; + *grant_size_1 = maxavailable / 2; + *grant_size_2 = maxavailable / 2; } } +static TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* out) { +#ifdef __Xxy + int request_size_in = 0; + int request_size_out = 0; + int grant_size_in = 0; + int grant_size_out = 0; + if (!inside_arc_ccm(in->data)) { + // In case the input tensor contains multiple batches, it has rank 4 + // because the mli kernel cannot operate on batches, we need to have the size + // of a single HWC tensor. that is why the start_rank is 1 in case of input rank 4 + int start_rank = in->rank - 3; + request_size_in = mli_hlp_count_elem_num(in, start_rank) * mli_hlp_tensor_element_size(in); + } + if (!inside_arc_ccm(out->data)) { + // In case the input tensor contains multiple batches, it has rank 4 + // because the mli kernel cannot operate on batches, we need to have the size + // of a single batch. that is why the start_rank is 1 in case of input rank 4 + int start_rank = out->rank - 3; + request_size_out = mli_hlp_count_elem_num(out, start_rank) * mli_hlp_tensor_element_size(out); + } + + get_arc_two_buffer_sizes(request_size_in, request_size_out, &grant_size_in, &grant_size_out); + + if (!inside_arc_ccm(in->data)) { + in->data = get_arc_scratch_buffer(grant_size_in); + in->capacity = grant_size_in; + if (in->data == NULL) return kTfLiteError; + } + if (!inside_arc_ccm(out->data)) { + out->data = get_arc_scratch_buffer(grant_size_out); + out->capacity = grant_size_out; + if (out->data == NULL) return kTfLiteError; + } +#endif + return kTfLiteOk; +} + TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context, - mli_tensor* in, - mli_tensor* weights, - mli_tensor* bias, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, mli_tensor* out) { TfLiteStatus ret_val = kTfLiteOk; #ifdef __Xxy - + init_arc_scratch_buffers(); if (!inside_arc_ccm(weights->data)) { int weights_size = mli_hlp_count_elem_num(weights, 0) * mli_hlp_tensor_element_size(weights); - int maxWeightsSize = 0; + int max_weights_size = 0; weights->data = get_arc_scratch_buffer(weights_size); weights->capacity = weights_size; if (weights->data == NULL) { - get_arc_scratch_buffer_max_size(&maxWeightsSize); - weights->data = get_arc_scratch_buffer(maxWeightsSize); - weights->capacity = maxWeightsSize; - if (maxWeightsSize == 0) ret_val = kTfLiteError; + get_arc_scratch_buffer_max_size(&max_weights_size); + weights->data = get_arc_scratch_buffer(max_weights_size); + weights->capacity = max_weights_size; + if (max_weights_size == 0) ret_val = kTfLiteError; } if (weights->data == NULL) ret_val = kTfLiteError; } @@ -88,15 +125,92 @@ TfLiteStatus ret_val = kTfLiteOk; bias->data = get_arc_scratch_buffer(bias_mem_requirements); bias->capacity = bias_mem_requirements; } + if (ret_val == kTfLiteOk) { ret_val = get_arc_scratch_buffer_for_io_tensors(context, in, out); } + if (bias->data == NULL) { - int maxBiasSize = 0; - get_arc_scratch_buffer_max_size(&maxBiasSize); - bias->data = get_arc_scratch_buffer(maxBiasSize); - bias->capacity = maxBiasSize; - if (maxBiasSize == 0) ret_val = kTfLiteError; + int max_bias_size = 0; + get_arc_scratch_buffer_max_size(&max_bias_size); + bias->data = get_arc_scratch_buffer(max_bias_size); + bias->capacity = max_bias_size; + if (max_bias_size == 0) ret_val = kTfLiteError; + } + if (bias->data == NULL) ret_val = kTfLiteError; + +#endif + return ret_val; +} + +TfLiteStatus 
get_arc_scratch_buffer_for_fully_connect_tensors(TfLiteContext* context, + mli_tensor* in, + mli_tensor* weights, + mli_tensor* bias, + mli_tensor* out) { +TfLiteStatus ret_val = kTfLiteOk; +#ifdef __Xxy + init_arc_scratch_buffers(); + /* strategy for FC kernels: + first allocate input, because this cannot be sliced. (in case of batch processing, only a single input needs to be allocated) + then weigths & bias because if fully loaded, they can be reused over batches. + then output. + The number of output channels (for weights slicing) depends on size of output and size of weights&bias */ + + if (!inside_arc_ccm(in->data)) { + /* In case the input tensor contains multiple batches, + only count the size if the inner most dimension */ + int size_in = mli_hlp_count_elem_num(in, in->rank - 1) * mli_hlp_tensor_element_size(in); + in->data = get_arc_scratch_buffer(size_in); + in->capacity = size_in; + if (in->data == NULL) { + in->capacity = 0; + ret_val = kTfLiteError; + } + } + + if (!inside_arc_ccm(weights->data)) { + int weights_size = mli_hlp_count_elem_num(weights, 0) * mli_hlp_tensor_element_size(weights); + int max_weights_size = 0; + weights->data = get_arc_scratch_buffer(weights_size); + weights->capacity = weights_size; + if (weights->data == NULL) { + get_arc_scratch_buffer_max_size(&max_weights_size); + weights->data = get_arc_scratch_buffer(max_weights_size); + weights->capacity = max_weights_size; + if (max_weights_size == 0) ret_val = kTfLiteError; + } + if (weights->data == NULL) ret_val = kTfLiteError; + } + + if (!inside_arc_ccm(bias->data)) { + int bias_mem_requirements = mli_hlp_count_elem_num(bias, 0) * mli_hlp_tensor_element_size(bias); + bias->data = get_arc_scratch_buffer(bias_mem_requirements); + bias->capacity = bias_mem_requirements; + } + + if (!inside_arc_ccm(out->data)) { + /* In case the input tensor contains multiple batches, + only count the size if the inner most dimension */ + int out_size = mli_hlp_count_elem_num(out, out->rank - 1) * mli_hlp_tensor_element_size(out); + int max_out_size = 0; + out->data = get_arc_scratch_buffer(out_size); + out->capacity = out_size; + if (out->data == NULL) { + get_arc_scratch_buffer_max_size(&max_out_size); + out->data = get_arc_scratch_buffer(max_out_size); + out->capacity = max_out_size; + if (max_out_size == 0) ret_val = kTfLiteError; + } + if (out->data == NULL) ret_val = kTfLiteError; + } + + if (bias->data == NULL) { + int max_bias_size = 0; + get_arc_scratch_buffer_max_size(&max_bias_size); + bias->data = get_arc_scratch_buffer(max_bias_size); + bias->capacity = max_bias_size; + if (max_bias_size == 0) ret_val = kTfLiteError; } if (bias->data == NULL) ret_val = kTfLiteError; @@ -107,44 +221,44 @@ TfLiteStatus ret_val = kTfLiteOk; TfLiteStatus arc_scratch_buffer_calc_slice_size_io( const mli_tensor *in, const mli_tensor *out, - const int kernelHeight, - const int strideHeight, + const int kernel_height, + const int stride_height, const int padding_top, const int padding_bot, - int *inSliceHeight, - int *outSliceHeight) { - const int heightDimension = 1; // todo: compute from rank - const int inHeight = in->shape[heightDimension]; - const int outHeight = out->shape[heightDimension]; - const int lineSizeIn = mli_hlp_count_elem_num(in, heightDimension + 1) * mli_hlp_tensor_element_size(in); - const int lineSizeOut = mli_hlp_count_elem_num(out, heightDimension + 1) * mli_hlp_tensor_element_size(out); - int maxLinesIn = 0; - int maxLinesOut = 0; - int maxOutLinesForInput = 0; - bool fit = (in->capacity >= inHeight * 
lineSizeIn) && (out->capacity >= outHeight * lineSizeOut); + int *in_slice_height, + int *out_slice_height) { + const int height_dimension = 1; // todo: compute from rank + const int in_height = in->shape[height_dimension]; + const int out_height = out->shape[height_dimension]; + const int line_size_in = mli_hlp_count_elem_num(in, height_dimension + 1) * mli_hlp_tensor_element_size(in); + const int line_size_out = mli_hlp_count_elem_num(out, height_dimension + 1) * mli_hlp_tensor_element_size(out); + int max_lines_in = 0; + int max_lines_out = 0; + int max_out_lines_for_input = 0; + bool fit = (in->capacity >= in_height * line_size_in) && (out->capacity >= out_height * line_size_out); if (fit) { // in case both tensors completely fit in the capacity, there is no need for slicing - *inSliceHeight = inHeight; - *outSliceHeight = outHeight; + *in_slice_height = in_height; + *out_slice_height = out_height; } else { // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. - maxLinesIn = MIN(inHeight, in->capacity / lineSizeIn); - if (maxLinesIn >= inHeight) { - maxOutLinesForInput = outHeight; - } else if (2 * maxLinesIn >= inHeight) { + max_lines_in = MIN(in_height, in->capacity / line_size_in); + if (max_lines_in >= in_height) { + max_out_lines_for_input = out_height; + } else if (2 * max_lines_in >= in_height) { // in this case only two slices are needed, so both could benefit from padding. take the MIN to get the worst case. - maxOutLinesForInput = (maxLinesIn + MIN(padding_top, padding_bot) - kernelHeight + 1) / strideHeight; + max_out_lines_for_input = (max_lines_in + MIN(padding_top, padding_bot) - kernel_height + 1) / stride_height; } else { - maxOutLinesForInput = (maxLinesIn - kernelHeight + 1) / strideHeight; // TODO add padding exceptions and test by makin fit=false; + max_out_lines_for_input = (max_lines_in - kernel_height + 1) / stride_height; // TODO add padding exceptions and test by makin fit=false; } // Ten compute how many ouput lines fit into the output tensor. - maxLinesOut = MIN(outHeight, out->capacity / lineSizeOut); + max_lines_out = MIN(out_height, out->capacity / line_size_out); // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. - *outSliceHeight = MIN(maxOutLinesForInput, maxLinesOut); - *inSliceHeight = *outSliceHeight * strideHeight; + *out_slice_height = MIN(max_out_lines_for_input, max_lines_out); + *in_slice_height = *out_slice_height * stride_height; } - if ((*inSliceHeight > 0) && (*outSliceHeight > 0)) { + if ((*in_slice_height > 0) && (*out_slice_height > 0)) { return kTfLiteOk; } else { return kTfLiteError; @@ -154,73 +268,43 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( const mli_tensor *weights, const mli_tensor *bias, - int *sliceChannels) { - const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. 
@@ -154,73 +268,43 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
 TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
 const mli_tensor *weights,
 const mli_tensor *bias,
- int *sliceChannels) {
- const int weightOutChDimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension.
- const int channels = weights->shape[weightOutChDimension];
+ const int weight_out_ch_dimension,
+ int *slice_channels) {
+ const int channels = weights->shape[weight_out_ch_dimension];
+ const int ch_size_w = (mli_hlp_count_elem_num(weights, 0) / channels) * mli_hlp_tensor_element_size(weights);
+ const int ch_size_b = (mli_hlp_count_elem_num(bias, 0) / channels) * mli_hlp_tensor_element_size(bias);
+ int max_ch_weigths = 0;
+ int max_ch_bias = 0;
-
- const int chSizeW = mli_hlp_count_elem_num(weights, weightOutChDimension + 1) * mli_hlp_tensor_element_size(weights);
- const int chSizeB = mli_hlp_count_elem_num(bias, weightOutChDimension + 1) * mli_hlp_tensor_element_size(bias);
- int maxChWeights = 0;
- int maxChBias = 0;
-
- bool fit = (weights->capacity >= channels * chSizeW) && (bias->capacity >= channels * chSizeB);
+ bool fit = (weights->capacity >= channels * ch_size_w) && (bias->capacity >= channels * ch_size_b);
 if (fit) {
 // in case both tensors completely fit in the capacity, there is no need for slicing
- *sliceChannels = channels;
+ *slice_channels = channels;
 } else {
 // First compute how many channels fit into the weights tensor
- maxChWeights = MIN(channels, weights->capacity / chSizeW);
+ max_ch_weigths = MIN(channels, weights->capacity / ch_size_w);
 // Then compute how many channels fit into the bias tensor.
- maxChBias = MIN(channels, bias->capacity / chSizeB);
+ max_ch_bias = MIN(channels, bias->capacity / ch_size_b);
 // the smallest of the two determines the slice size
- *sliceChannels = MIN(maxChWeights, maxChBias);
+ *slice_channels = MIN(max_ch_weigths, max_ch_bias);
 }
- if (*sliceChannels > 0) {
+ if (*slice_channels > 0) {
 return kTfLiteOk;
 } else {
 return kTfLiteError;
 }
 }
-TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context,
+TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
 mli_tensor* in,
 mli_tensor* out) {
 #ifdef __Xxy
- int requestSizeIn = 0;
- int requestSizeOut = 0;
- int grantsizeIn = 0;
- int grantsizeOut = 0;
- if (!inside_arc_ccm(in->data)) {
- // In case the input tensor contains multiple batches, it has rank 4
- // because the mli kernel cannot operate on batches, we need to have the size
- // of a single HWC tensor. that is why the startRank is 1 in case of input rank 4
- int startRank = in->rank - 3;
- requestSizeIn = mli_hlp_count_elem_num(in, startRank) * mli_hlp_tensor_element_size(in);
- }
- if (!inside_arc_ccm(out->data)) {
- // In case the input tensor contains multiple batches, it has rank 4
- // because the mli kernel cannot operate on batches, we need to have the size
- // of a single batch. that is why the startRank is 1 in case of input rank 4
- int startRank = out->rank - 3;
- requestSizeOut = mli_hlp_count_elem_num(out, startRank) * mli_hlp_tensor_element_size(out);
- }
-
- get_arc_two_buffer_sizes(requestSizeIn, requestSizeOut, &grantsizeIn, &grantsizeOut);
-
- if (!inside_arc_ccm(in->data)) {
- in->data = get_arc_scratch_buffer(grantsizeIn);
- in->capacity = grantsizeIn;
- if (in->data == NULL) return kTfLiteError;
- }
- if (!inside_arc_ccm(out->data)) {
- out->data = get_arc_scratch_buffer(grantsizeOut);
- out->capacity = grantsizeOut;
- if (out->data == NULL) return kTfLiteError;
- }
-#endif
+ init_arc_scratch_buffers();
+ return get_arc_scratch_buffer_for_io_tensors(context, in, out);
+#else
 return kTfLiteOk;
+#endif
 }
 } // namespace micro
diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h
index fc348229235..276f976cf0f 100644
--- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h
+++ b/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h
@@ -38,13 +38,13 @@ namespace micro {
 * @return Tf Lite status code
 */
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
- mli_tensor* in,
- mli_tensor* weights,
- mli_tensor* bias,
+ mli_tensor* in,
+ mli_tensor* weights,
+ mli_tensor* bias,
 mli_tensor* out);

 /**
- * @brief Function to allocate scratch buffers for kernels with only input and output buffers
+ * @brief Function to allocate scratch buffers for pooling kernels with only input and output buffers
 *
 * @detail This function will update the data pointers in the 2 tensors with pointers
 * to scratch buffers in fast local memory.
@@ -55,10 +55,49 @@ TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
 *
 * @return Tf Lite status code
 */
-TfLiteStatus get_arc_scratch_buffer_for_io_tensors(TfLiteContext* context,
+TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
 mli_tensor* in,
 mli_tensor* out);
+/**
+ * @brief Function to allocate scratch buffers for the fully connected tensors
+ *
+ * @detail This function will update the data pointers in the 4 tensors with pointers
+ * to scratch buffers in fast local memory.
+ *
+ * @param context [I] pointer to TfLite context (needed for error handling)
+ * @param in [IO] pointer to the input tensor
+ * @param weights [IO] pointer to the weights tensor
+ * @param bias [IO] pointer to the bias tensor
+ * @param out [IO] pointer to the output tensor
+ *
+ * @return Tf Lite status code
+ */
+TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(TfLiteContext* context,
+ mli_tensor* in,
+ mli_tensor* weights,
+ mli_tensor* bias,
+ mli_tensor* out);
+
+/**
+ * @brief Function to calculate slice size for io tensors
+ *
+ * @detail This function will calculate the slice size in the height dimension
+ * for input and output tensors. It takes into account the kernel size and the padding.
+ * The function will look at the capacity field in the in and out tensors to
+ * determine the available buffer size.
+ *
+ * @param in [I] pointer to the input tensor
+ * @param out [I] pointer to the output tensor
+ * @param kernelHeight [I] size of the kernel in height dimension
+ * @param strideHeight [I] input stride in height dimension
+ * @param padding_top [I] number of lines with zeros at the top
+ * @param padding_bot [I] number of lines with zeros at the bottom
+ * @param in_slice_height [O] slice size in height dimension for the input tensor
+ * @param out_slice_height [O] slice size in height dimension for the output tensor
+ *
+ * @return Tf Lite status code
+ */
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
 const mli_tensor *in,
 const mli_tensor *out,
@@ -66,13 +105,29 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
 const int strideHeight,
 const int padding_top,
 const int padding_bot,
- int *inSliceHeight,
- int *outSliceHeight);
+ int *in_slice_height,
+ int *out_slice_height);
+/**
+ * @brief Function to calculate slice size for weight slicing
+ *
+ * @detail This function will calculate the slice size in the output channel dimension
+ * for weight and bias tensors.
+ * The function will look at the capacity field in the weights and bias tensors to
+ * determine the available buffer size.
+ *
+ * @param weights [I] pointer to the weights tensor
+ * @param bias [I] pointer to the bias tensor
+ * @param weight_out_ch_dimension [I] dimension of the output channels in the weights tensor
+ * @param slice_channels [O] slice size in output channel dimension
+ *
+ * @return Tf Lite status code
+ */
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
 const mli_tensor *weights,
 const mli_tensor *bias,
- int *sliceChannels);
+ const int weight_out_ch_dimension,
+ int *slice_channels);
 } // namespace micro
 } // namespace ops
diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc
index 106743cf471..f36059f82d2 100644
--- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc
+++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc
@@ -74,9 +74,9 @@ void *get_arc_scratch_buffer(int size) {
 void *buf = NULL;
 int best_mem_idx = -1;
 int best_mem_delta = INT_MAX;
- const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]);
+ const int num_mem = sizeof(scratch_mem)/sizeof(scratch_mem[0]);
 // find a local memory that fits the data size.
- for (int mem_idx = 0; mem_idx < numMem; ++mem_idx) {
+ for (int mem_idx = 0; mem_idx < num_mem; ++mem_idx) {
 // Best Fit
 if ((size <= scratch_sizes[mem_idx]) && (scratch_sizes[mem_idx] - size < best_mem_delta)) {
 best_mem_idx = mem_idx;
@@ -93,9 +93,9 @@ void *get_arc_scratch_buffer(int size) {
 void get_arc_scratch_buffer_max_size(int *size) {
 int maxavailable = 0;
- const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]);
+ const int num_mem = sizeof(scratch_mem)/sizeof(scratch_mem[0]);
 // find the largest available buffer.
- for (int i = 0; i < numMem; i++) {
+ for (int i = 0; i < num_mem; i++) {
 if (scratch_sizes[i] > maxavailable) {
 maxavailable = scratch_sizes[i];
 }
@@ -106,9 +106,9 @@ void get_arc_scratch_buffer_max_size(int *size) {
 void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) {
 int maxavailable = 0;
 int secondavail = 0;
- const int numMem = sizeof(scratch_mem)/sizeof(scratch_mem[0]);
+ const int num_mem = sizeof(scratch_mem)/sizeof(scratch_mem[0]);
 // find the two largest available buffers.
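[Editorial note] The best-fit selection in get_arc_scratch_buffer above (and the two-largest-buffer scan that the following hunk lines complete) is simple but easy to misread in diff form. The sketch below is not part of the patch: it mirrors the best-fit loop over a small array of made-up region sizes.

// Editorial sketch (not part of the patch): best-fit selection over a fixed set of
// scratch regions, mirroring get_arc_scratch_buffer. Region sizes are example values.
#include <climits>
#include <cstdio>

int main() {
  int scratch_sizes[] = {16384, 8192, 32768};  // bytes still available per region (assumed)
  const int num_mem = sizeof(scratch_sizes) / sizeof(scratch_sizes[0]);
  const int size = 10000;  // requested allocation in bytes
  int best_mem_idx = -1;
  int best_mem_delta = INT_MAX;
  for (int mem_idx = 0; mem_idx < num_mem; ++mem_idx) {
    // Best fit: the smallest region that still satisfies the request wins.
    if ((size <= scratch_sizes[mem_idx]) && (scratch_sizes[mem_idx] - size < best_mem_delta)) {
      best_mem_idx = mem_idx;
      best_mem_delta = scratch_sizes[mem_idx] - size;
    }
  }
  std::printf("request of %d bytes -> region %d\n", size, best_mem_idx);  // region 0 here
  return 0;
}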
- for (int i = 0; i < numMem; i++) { + for (int i = 0; i < num_mem; i++) { if (scratch_sizes[i] > maxavailable) { secondavail = maxavailable; maxavailable = scratch_sizes[i]; @@ -120,7 +120,7 @@ void get_arc_scratch_buffer_two_max_sizes(int *size1, int *size2) { *size2 = secondavail; } -void free_arc_scratch_buffers(void) { +void init_arc_scratch_buffers(void) { scratch_mem[0] = scratch_mem_x; scratch_mem[1] = scratch_mem_y; scratch_mem[2] = scratch_mem_z; diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h index 927e480da5a..703c164e077 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h +++ b/tensorflow/lite/micro/kernels/arc/scratch_buffers.h @@ -24,7 +24,7 @@ namespace ops { namespace micro { -void free_arc_scratch_buffers(void); +void init_arc_scratch_buffers(void); void *get_arc_scratch_buffer(int size);// Function to assign fast memory from one of 3 scratch buffers. void get_arc_scratch_buffer_max_size(int *size); From 0b15d4264d6cc5695fca35b7f68dcf64e4353bcf Mon Sep 17 00:00:00 2001 From: jacco Date: Fri, 17 Jan 2020 19:30:30 +0300 Subject: [PATCH 0067/1533] Minor fixes to restore 'generate_projects' target functionality --- tensorflow/lite/micro/tools/make/targets/arc_makefile.inc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 5ce2e03bfc3..eb890ef1999 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -95,4 +95,10 @@ endif endif # USE_EMBARC_MLI +# These are microcontroller-specific rules for converting the ELF output +# of the linker into a binary image that can be loaded directly. + +# Not applicable for ARC, leaving it empty. 
+$(BINDIR)%.bin: + endif From e6f9f08acb00745c429baf199486cb8a6e07c08c Mon Sep 17 00:00:00 2001 From: jacco Date: Tue, 21 Jan 2020 20:11:27 +0300 Subject: [PATCH 0068/1533] Initial implementation of TCF and LCF files support for IoTDK and EMSDP platforms --- .../micro/tools/make/helper_functions.inc | 7 + .../tools/make/targets/arc/emsdp/emsdp.lcf | 47 + .../targets/arc/emsdp/emsdp_em11d_dfss.tcf | 4907 +++++++++++++++++ .../tools/make/targets/arc/iotdk/iotdk.lcf | 47 + .../tools/make/targets/arc/iotdk/iotdk.tcf | 4621 ++++++++++++++++ .../micro/tools/make/targets/arc_makefile.inc | 15 + 6 files changed, 9644 insertions(+) create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 09771419843..a7f9bd788e3 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -145,6 +145,13 @@ ifneq ($(TCF_FILE_NAME), ) $(PRJDIR)$(3)/$(1)/$(TCF_FILE_NAME): $(TCF_FILE) @cp $$< $$@ endif + +# Special rule to copy LCF in case the local filesystem file name has been defined +ifneq ($(LCF_FILE), ) +$(PRJDIR)$(3)/$(1)/$(notdir $(LCF_FILE)): $(LCF_FILE) + @cp $$< $$@ +endif + endif endef diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf new file mode 100644 index 00000000000..fc34759d745 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -0,0 +1,47 @@ +# SYSTEM memory regions indicate where external memory might be located. +# The TCF has no specific knowledge of whether SYSTEM regions contain +# external memory or not. +# CCMWRAP memory regions indicate unusable portions of the address space +# due to CCM memory wrapping into upper addresses beyond its size + +MEMORY { + IVT : ORIGIN = 0x00000000, LENGTH = 0x60000000 + ICCM0 : ORIGIN = 0x60000000, LENGTH = 0x00020000 +# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000 +# SYSTEM1 : ORIGIN = 0x70000000, LENGTH = 0x10000000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 +# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 + XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000 +# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000 + YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000 +# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000 + SYSTEM2 : ORIGIN = 0xb0000000, LENGTH = 0x50000000 + } +SECTIONS { + GROUP BLOCK(4): { + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > SYSTEM2 + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + GROUP BLOCK(4): { + .Ydata? 
: {} + } > YCCM + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT + } + + diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf new file mode 100644 index 00000000000..833fa9ca9b9 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf @@ -0,0 +1,4907 @@ + + + + + + + + + + + + + + + + + + + + + + + +# +# option 16/L32/U32 Instructions +# ------ ---------- --------------------- +# +# none -/-/- None +# wlh1 1/1/1 MPYW/U, MPY/U, MPYH/U +# wlh2 2/2/2 MPYW/U, MPY/U, MPYH/U +# wlh3 2/3/3 MPYW/U, MPY/U, MPYH/U +# wlh4 2/4/5 MPYW/U, MPY/U, MPYH/U +# wlh5 5/9/9 MPYW/U, MPY/U, MPYH/U +# +# +-mpy_option none + +# code_protection --- The ARC EM architecture divides the memory into 16 regions, which can be protected individually. This feature adds a 16-bit input to the processor core, one bit per region. When the protect bit is set, the processor disables any load or store to the corresponding region. An attempt to access a protected region raises an EV_ProtV exception. +-code_protection false + +# stack_checking --- Stack checking is a mechanism for checking stack accesses and raising an exception when a stack overflow or underflow is detected. +-stack_checking true + +# unaligned_option --- This enables unaligned loads and stores. +-unaligned_option true + +# intvbase_preset --- This sets the interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE. +-intvbase_preset 0x0 + +# intvbase_preset_s --- This sets the secure interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE_S.This is effective only when 2+2 mode is enabled. +-intvbase_preset_s 0x0 + +# intvbase_ext --- Set this option to drive the upper 22 bits of the interrupt base vector externally, into signal intvbase_in. +-intvbase_ext false + +# nmi_option --- add Non-maskable external exception support +-nmi_option false + +# rgf_impl --- This defines whether the register file is implemented using flip-flops, or with a hard macro. +-rgf_impl flip_flops + +# rgf_num_regs --- This defines the size (in 32b register) of the processor register file. +-rgf_num_regs 32 + +# rgf_wr_ports --- This defines the number of write ports on the register file. +-rgf_wr_ports 2 + +# rgf_num_banks --- Dual register banks are useful if Fast IRQ has been configured, but may be selected even if not. +-rgf_num_banks 2 + +# rgf_banked_regs --- This selects the number of registers that are replicated in the second register-file bank. +-rgf_banked_regs 32 + +# turbo_boost --- This enables the Turbo Boost synthesis option. By enabling this option, the achievable clock frequency is increased, but at the cost of an additional cycle latency on branch instructions. +-turbo_boost false + +# infer_alu_adder --- infer: datapath is described as behavioral code: A + B +# instantiate: datapath is instantiated as a detailed multi-stage code of a carry-lookahead adder. 
It is generally preferable to use the infer option and add directives for your target synthesizer. +-infer_alu_adder infer + +# infer_mpy_wtree --- infer: datapath is described as behavioral code: A * B (applies to only wlh3, wlh4 and wlh5 designs) +# instantiate: datapath is instantiated as a detailed multi-stage code of a Wallace Tree multiplier It is generally preferable to use the infer option and add directives for your target synthesizer. +-infer_mpy_wtree instantiate + +# scantest_ram_bypass_mux --- This mux is used to make logic trapped between flops and memory (aka shadow logic) to be covered by scantest without requiring advanced sequential ATPG on the memory to be applied. Will add delay to functional access time +-scantest_ram_bypass_mux false + +# logic_bist --- This option will OR LBIST_EN with test_mode +-logic_bist false + +# power_domains --- Adds three separate power domains to the core, and propagates power-gate control signals to the top level of the core. Also generates UPF constraints and commands in the low-power scripts +-power_domains false + +# dvfs --- Adds logic to the core to allow dynamic controlling of voltage and frequency and propagates the associated control signals to the top level of core +-dvfs false + +# voltage_domains --- Creates a voltage domain split between RAM and std cell parts to support Ultra Low Voltage on cells and generates UPF constraints +-voltage_domains false + +# mem_bus_option --- The core supports two bus protocols for accessing external memory: AHB & AHB-Lite. AHB-Lite-single means instruction fetch and data access share a single AHB-Lite port. AHB-Lite-dual means separate AHB-Lite port for each initiator if present. +-mem_bus_option AHB + +# mem_bus_reg_interface --- Specifies whether the memory bus interface is registered. +-mem_bus_reg_interface true + +# dmi_burst_option --- This will enable high-throughput burst support on the DMI slave interfaces. By enabling this option, the peak DMI read throughput goes from 1 word per 3 cycles to N words per N+2 cycles, in which N is the AHB burst lengthDMI write throughput goes from 1 word per 3 cycles to 1 word per cycle. +-dmi_burst_option true + +# has_dmp_peripheral --- This option enables the redirection of load/store accesses to one segment (1/16) of the addressable space to a dedicated peripheral bus. This offers high system integration and reduces overall system cost. +-has_dmp_peripheral true + +# per0_base --- This option specifies the memory region assignment for this peripheral aperture +-per0_base 15 + +# per0_limit --- This option specifies the end of this peripheral aperture +-per0_limit 0 + +# per_bus_option --- The core supports one bus protocol for accessing the peripheral space, when enabled: AHB-Lite. +-per_bus_option AHB-Lite + +# per_bus_reg_interface --- Specifies whether the peripheral bus interface is registered. +-per_bus_reg_interface true + +# clock_gating --- This enables the insertion of architectural clock gate elements in the design. By enabling this option, the clocks to various parts of the design will be disabled when the logic they drive is not in use to save power. +-clock_gating false + +# back_compat --- This enables the addition of rst_a input in the clkgate module to support backward compatibility with the older EM and Subsystem releases. +-back_compat true + +# byte_parity --- If parity protection on the CCMs or Cache is configured, this option enables parity protection on a per-byte basis. 
Otherwise, parity is per word basis +-byte_parity false + +# prot_pipelined --- Check the box if CCM memories are configured for ECC, and you want single-bit errors to be corrected, written back to memory, and re-fetched. When unchecked, single bit errors are corrected when read from memory, but the offending memory location itself is not corrected with a writeback, no influence on Cache protection +-prot_pipelined false + +# cct_test_ena --- When ECC is configured, this option enables single bit error injection in CCT RAM models to demonstrate ECC protection on the RAMs. When enabled, the RAM models can only be used in HDL CCT simulation (no xCAM support) and are not intended for use in SoC level integration. +-cct_test_ena false + +# err_prot_ehce --- Enabled enhanced ECC architecture for CCM. Instruction fetch with single bit error is not replayed; ecc cac modules are shared to reduce area and timing opt. +-err_prot_ehce false + + +######## dsp_trig --- com.arc.hardware.dfss.dsp_trig.1_0 ######## + +# Create dsp_trig +-create com.arc.hardware.dfss.dsp_trig.1_0 System.CPUisle.ARCv2EM.dsp_trig + +# dsp_trig --- Command line option for EIA extension component 'dsp_trig'. +-dsp_trig true + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio0 --- com.arc.hardware.dfss.io_gpio0.1_0 ######## + +# Create io_gpio0 +-create com.arc.hardware.dfss.io_gpio0.1_0 System.CPUisle.ARCv2EM.io_gpio0 + +# io_gpio0 --- Command line option for EIA extension component 'io_gpio0'. +-io_gpio0 true + +# io_gpio0_debounce --- Selects the inclusion of Debounce logic +-io_gpio0_debounce 1 + +# io_gpio0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio0_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + +# io_gpio0_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. +-io_gpio0_direction_rst_value 0 + +# io_gpio0_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. +-io_gpio0_output_rst_value 0x0 + + +######## io_i2c_mst0 --- com.arc.hardware.dfss.io_i2c_mst0.1_0 ######## + +# Create io_i2c_mst0 +-create com.arc.hardware.dfss.io_i2c_mst0.1_0 System.CPUisle.ARCv2EM.io_i2c_mst0 + +# io_i2c_mst0 --- Command line option for APEX extension component 'io_i2c_mst0'. +-io_i2c_mst0 true + +# io_i2c_mst0_fs --- RX/TX FIFO size +-io_i2c_mst0_fs 16 + +# io_i2c_mst0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst0_dma_support None + +# io_i2c_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst0_cdc_included 0 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_slv0 --- com.arc.hardware.dfss.io_i2c_slv0.1_0 ######## + +# Create io_i2c_slv0 +-create com.arc.hardware.dfss.io_i2c_slv0.1_0 System.CPUisle.ARCv2EM.io_i2c_slv0 + +# io_i2c_slv0 --- Command line option for APEX extension component 'io_i2c_slv0'. +-io_i2c_slv0 true + +# io_i2c_slv0_fs --- RX/TX FIFO size +-io_i2c_slv0_fs 16 + +# io_i2c_slv0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_slv0_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst0 --- com.arc.hardware.dfss.io_spi_mst0.1_0 ######## + +# Create io_spi_mst0 +-create com.arc.hardware.dfss.io_spi_mst0.1_0 System.CPUisle.ARCv2EM.io_spi_mst0 + +# io_spi_mst0 --- Command line option for APEX extension component 'io_spi_mst0'. +-io_spi_mst0 true + +# io_spi_mst0_fz --- RX/TX FIFO depth +-io_spi_mst0_fs 16 + +# io_spi_mst0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst0_max_xfer_size 16 + +# io_spi_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst0_cdc_included 0 + +# io_spi_mst0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## subsys_bcr --- com.arc.hardware.dfss.subsys_bcr.1_0 ######## + +# Create subsys_bcr +-create com.arc.hardware.dfss.subsys_bcr.1_0 System.CPUisle.ARCv2EM.subsys_bcr + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst1 --- com.arc.hardware.dfss.io_spi_mst1.1_0 ######## + +# Create io_spi_mst1 +-create com.arc.hardware.dfss.io_spi_mst1.1_0 System.CPUisle.ARCv2EM.io_spi_mst1 + +# io_spi_mst1 --- Command line option for APEX extension component 'io_spi_mst1'. +-io_spi_mst1 true + +# io_spi_mst1_fz --- RX/TX FIFO depth +-io_spi_mst1_fs 16 + +# io_spi_mst1_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst1_max_xfer_size 16 + +# io_spi_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst1_cdc_included 0 + +# io_spi_mst1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst1_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst2 --- com.arc.hardware.dfss.io_spi_mst2.1_0 ######## + +# Create io_spi_mst2 +-create com.arc.hardware.dfss.io_spi_mst2.1_0 System.CPUisle.ARCv2EM.io_spi_mst2 + +# io_spi_mst2 --- Command line option for APEX extension component 'io_spi_mst2'. +-io_spi_mst2 true + +# io_spi_mst2_fz --- RX/TX FIFO depth +-io_spi_mst2_fs 16 + +# io_spi_mst2_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst2_max_xfer_size 16 + +# io_spi_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst2_cdc_included 0 + +# io_spi_mst2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst2_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_slv0 --- com.arc.hardware.dfss.io_spi_slv0.1_0 ######## + +# Create io_spi_slv0 +-create com.arc.hardware.dfss.io_spi_slv0.1_0 System.CPUisle.ARCv2EM.io_spi_slv0 + +# io_spi_slv0 --- Command line option for APEX extension component 'io_spi_slv0'. +-io_spi_slv0 true + +# io_spi_slv0_fz --- RX/TX FIFO depth +-io_spi_slv0_fs 16 + +# io_spi_slv0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_slv0_max_xfer_size 16 + +# io_spi_slv0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_slv0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio1 --- com.arc.hardware.dfss.io_gpio1.1_0 ######## + +# Create io_gpio1 +-create com.arc.hardware.dfss.io_gpio1.1_0 System.CPUisle.ARCv2EM.io_gpio1 + +# io_gpio1 --- Command line option for EIA extension component 'io_gpio1'. +-io_gpio1 true + +# io_gpio1_debounce --- Selects the inclusion of Debounce logic +-io_gpio1_debounce 1 + +# io_gpio1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio1_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + +# io_gpio1_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. +-io_gpio1_direction_rst_value 0 + +# io_gpio1_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. +-io_gpio1_output_rst_value 0x0 + + +######## io_gpio2 --- com.arc.hardware.dfss.io_gpio2.1_0 ######## + +# Create io_gpio2 +-create com.arc.hardware.dfss.io_gpio2.1_0 System.CPUisle.ARCv2EM.io_gpio2 + +# io_gpio2 --- Command line option for EIA extension component 'io_gpio2'. +-io_gpio2 true + +# io_gpio2_debounce --- Selects the inclusion of Debounce logic +-io_gpio2_debounce 1 + +# io_gpio2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio2_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + +# io_gpio2_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. +-io_gpio2_direction_rst_value 0 + +# io_gpio2_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. +-io_gpio2_output_rst_value 0x0 + + +######## io_i2c_mst1 --- com.arc.hardware.dfss.io_i2c_mst1.1_0 ######## + +# Create io_i2c_mst1 +-create com.arc.hardware.dfss.io_i2c_mst1.1_0 System.CPUisle.ARCv2EM.io_i2c_mst1 + +# io_i2c_mst1 --- Command line option for APEX extension component 'io_i2c_mst1'. +-io_i2c_mst1 true + +# io_i2c_mst1_fs --- RX/TX FIFO size +-io_i2c_mst1_fs 16 + +# io_i2c_mst1_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst1_dma_support None + +# io_i2c_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst1_cdc_included 0 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_mst2 --- com.arc.hardware.dfss.io_i2c_mst2.1_0 ######## + +# Create io_i2c_mst2 +-create com.arc.hardware.dfss.io_i2c_mst2.1_0 System.CPUisle.ARCv2EM.io_i2c_mst2 + +# io_i2c_mst2 --- Command line option for APEX extension component 'io_i2c_mst2'. +-io_i2c_mst2 true + +# io_i2c_mst2_fs --- RX/TX FIFO size +-io_i2c_mst2_fs 16 + +# io_i2c_mst2_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst2_dma_support None + +# io_i2c_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst2_cdc_included 0 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart0 --- com.arc.hardware.dfss.io_uart0.1_0 ######## + +# Create io_uart0 +-create com.arc.hardware.dfss.io_uart0.1_0 System.CPUisle.ARCv2EM.io_uart0 + +# io_uart0 --- Command line option for EIA extension component 'io_uart0'. +-io_uart0 true + +# io_uart0_fifo_mode --- Set the UART FIFO mode +-io_uart0_fifo_mode 16 + +# io_uart0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart0_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart1 --- com.arc.hardware.dfss.io_uart1.1_0 ######## + +# Create io_uart1 +-create com.arc.hardware.dfss.io_uart1.1_0 System.CPUisle.ARCv2EM.io_uart1 + +# io_uart1 --- Command line option for EIA extension component 'io_uart1'. +-io_uart1 true + +# io_uart1_fifo_mode --- Set the UART FIFO mode +-io_uart1_fifo_mode 16 + +# io_uart1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart1_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart2 --- com.arc.hardware.dfss.io_uart2.1_0 ######## + +# Create io_uart2 +-create com.arc.hardware.dfss.io_uart2.1_0 System.CPUisle.ARCv2EM.io_uart2 + +# io_uart2 --- Command line option for EIA extension component 'io_uart2'. +-io_uart2 true + +# io_uart2_fifo_mode --- Set the UART FIFO mode +-io_uart2_fifo_mode 16 + +# io_uart2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart2_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart3 --- com.arc.hardware.dfss.io_uart3.1_0 ######## + +# Create io_uart3 +-create com.arc.hardware.dfss.io_uart3.1_0 System.CPUisle.ARCv2EM.io_uart3 + +# io_uart3 --- Command line option for EIA extension component 'io_uart3'. +-io_uart3 true + +# io_uart3_fifo_mode --- Set the UART FIFO mode +-io_uart3_fifo_mode 16 + +# io_uart3_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart3_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2s_rx_mst0 --- com.arc.hardware.dfss.io_i2s_rx_mst0.1_0 ######## + +# Create io_i2s_rx_mst0 +-create com.arc.hardware.dfss.io_i2s_rx_mst0.1_0 System.CPUisle.ARCv2EM.io_i2s_rx_mst0 + +# io_i2s_rx_mst0 --- Command line option for APEX extension component 'io_i2s_rx_mst0'. +-io_i2s_rx_mst0 true + +# io_i2s_rx_mst0_fs --- RX FIFO size +-io_i2s_rx_mst0_fs 8 + +# io_i2s_rx_mst0_fw --- RX FIFO width +-io_i2s_rx_mst0_fw 16 + +# io_i2s_rx_mst0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2s_rx_mst0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2s_tx_mst0 --- com.arc.hardware.dfss.io_i2s_tx_mst0.1_0 ######## + +# Create io_i2s_tx_mst0 +-create com.arc.hardware.dfss.io_i2s_tx_mst0.1_0 System.CPUisle.ARCv2EM.io_i2s_tx_mst0 + +# io_i2s_tx_mst0 --- Command line option for APEX extension component 'io_i2s_tx_mst0'. +-io_i2s_tx_mst0 true + +# io_i2s_tx_mst0_fs --- TX FIFO size +-io_i2s_tx_mst0_fs 8 + +# io_i2s_tx_mst0_fw --- TX FIFO width +-io_i2s_tx_mst0_fw 16 + +# io_i2s_tx_mst0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2s_tx_mst0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_pdm_rx0 --- com.arc.hardware.dfss.io_pdm_rx0.1_0 ######## + +# Create io_pdm_rx0 +-create com.arc.hardware.dfss.io_pdm_rx0.1_0 System.CPUisle.ARCv2EM.io_pdm_rx0 + +# io_pdm_rx0 --- Command line option for APEX extension component 'io_pdm_rx0'. +-io_pdm_rx0 true + +# io_pdm_rx0_ch --- Number of Stereo Channels +-io_pdm_rx0_ch 1 + +# io_pdm_rx0_fs --- RX FIFO size +-io_pdm_rx0_fs 16 + +# io_pdm_rx0_ns --- Maximum number of CIC stages +-io_pdm_rx0_ns 4 + +# io_pdm_rx0_ds --- Maximum delay in the COMB filter of the CIC filter +-io_pdm_rx0_ds 2 + +# io_pdm_rx0_dma_support --- Specifies whether the DMA handshake interface is included +-io_pdm_rx0_dma_support Memory-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available. +#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## DCCM --- com.arc.hardware.DCCM.1_0 ######## + +# Create DCCM +-create com.arc.hardware.DCCM.1_0 System.CPUisle.ARCv2EM.DCCM + +# dccm_size --- This defines the size of the Data Closely Coupled Memory (DCCM) in bytes +-dccm_size 131072 + +# dccm_base --- Sets the initial memory region assignment for DCCM +-dccm_base 8 + +# dccm_interleave --- Split DCCM into even/odd memory banks. +-dccm_interleave false + +# dccm_prot --- Specifies the type of protection built for the DCCM. +-dccm_prot None + +# dccm_prot_level --- Specifies the level protection. +-dccm_prot_level Data_Only + +# dccm_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the DCCM +-dccm_prot_exceptions true + +# dccm_sec_lvl --- Specifies the level of secure DCCM. +-dccm_sec_lvl Non_Secure + +# dccm_dmi --- This enables external access through a DMI (direct memory interface) port. +-dccm_dmi true + + +######## DMA Controller --- com.arc.hardware.DMA_Controller.1_0 ######## + +# Create DMA Controller +-create com.arc.hardware.DMA_Controller.1_0 "System.CPUisle.ARCv2EM.DMA Controller" + +# dmac_channels --- This options specifies the number of DMA channels implemented in the DMA controller +-dmac_channels 16 + +# dmac_fifo_depth --- This option specifies the DMA transfer FIFO depth in 32b words. +-dmac_fifo_depth 2 + +# dmac_int_config --- None: the DMA controller cannot raise an interrupt +# Single-External: single done and single error interrupt signal for all DMA channels, and the interrupt signals are routed to a port at the top of the EM logical hierarchy +# Multiple-External: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are routed to ports at the top of the EM logical hierarchy +# Single-Internal: single done and single error interrupt signals for all DMA channels, and the interrupt signals are internal to the EM core +# Multiple-Internal: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are internal to the EM core +-dmac_int_config Multiple-Internal + +# dmac_separate_error_interrupts --- This specifies whether there is a separate error interrupt per DMA channel, or just one. +-dmac_separate_error_interrupts false + +# dmac_registers --- This option defines the number of DMA channels with their registers located in auxiliary space. +-dmac_registers 0 + +# dmac_mem_if --- This option specifies whether the DMA controller system memory interface is integrated into the existing EM system memory interfaces or has its own interface. +-dmac_mem_if integrated + +# dmac_per_if --- Internal vs DW peripheral interface. Specify (in hex) which channels have the DW interface, where bit 0 corresponds to DMA channel 0, bit 1 for DMA channel 1, etc. +# Example: 4 channel DMA controller where -dmac_per_if is set to 0x9 = DMA Channels 0 and 3 configured with the DW req interface, DMA Channels 1 and 2 configured with the internal req interface. 
+-dmac_per_if 0x7e00 + + +######## DSP --- com.arc.hardware.DSP.1_0 ######## + +# Create DSP +-create com.arc.hardware.DSP.1_0 System.CPUisle.ARCv2EM.DSP + +# dsp_complex --- Enable/disable support for single cycle 16b+16b complex instructions and butterfly operations, else 2-cycle complex instructions only without butterfly support +-dsp_complex true + +# dsp_itu --- Enable/disable support for ITU bit-accurate 1 bit fractional shift before accumulation, else 1-bit fractional shift result after accumulation only +-dsp_itu true + +# dsp_divsqrt --- Enable/disable support for divide and square root operations: DIV(U), REM(U), SQRT +-dsp_divsqrt radix2 + +# dsp_accshift --- Select support for accumulator shift operations: no supported, limited shift support only or full shift support and convergent rounding +-dsp_accshift full + +# dsp_impl --- The datapath components may be inferred from Verilog for better area or optimized using carry-save components for better timing +-dsp_impl optimized + + +######## Data Cache --- com.arc.hardware.Data_Cache.1_0 ######## + +# Create Data Cache +-create com.arc.hardware.Data_Cache.1_0 "System.CPUisle.ARCv2EM.Data Cache" + +# dc_size --- This defines the total size of the Data Cache in bytes. +-dc_size 16384 + +# dc_ways --- This defines the number of cache ways. +-dc_ways 2 + +# dc_bsize --- This defines the cache line length in bytes. +-dc_bsize 32 + +# dc_feature_level --- Feature Level, indicates locking and debug feature level 00 = Basic cache, with no locking or debug features 01 = Lock and flush features supported 10 = Lock, flush and advanced debug features supported 11 = Reserved +-dc_feature_level 2 + +# dc_uncached_region --- Enable an uncached region defined by aux reg +-dc_uncached_region false + +# dc_prot --- Specifies the type of protection built for DCACHE. +-dc_prot None + +# dc_prot_level --- Specifies the level of protection. +-dc_prot_level Data_Only + +# dc_prot_exceptions --- Builds exception generation hardware for uncorrectable (fatal) errors detected on DCACHE. +-dc_prot_exceptions true + + +######## Debug Interface --- com.arc.hardware.Debug_Interface.1_0 ######## + +# Create Debug Interface +-create com.arc.hardware.Debug_Interface.1_0 "System.CPUisle.ARCv2EM.Debug Interface" + +# dbg_en_option --- Adds an enable pin to the existing debug interface +-dbg_en_option false + +# secure_debug --- This enables secure debug feature +-secure_debug false + +# scdbg_aux_unlk --- An internal demo module will be included when enable +-scdbg_aux_unlk false + +# dbg_apb_option --- Adds an additional APB debug port alongside the BVCI one +-dbg_apb_option false + + +######## ICCM0 --- com.arc.hardware.ICCM0.1_0 ######## + +# Create ICCM0 +-create com.arc.hardware.ICCM0.1_0 System.CPUisle.ARCv2EM.ICCM0 + +# iccm0_size --- This defines the size of ICCM0 in bytes.This ICCM has 0 wait states. +-iccm0_size 131072 + +# iccm0_base --- Sets the initial memory region assignment for ICCM0 +-iccm0_base 6 + +# iccm0_wide --- Creates ICCM0 as 64b memory to reduce accesses. +-iccm0_wide false + +# iccm0_prot --- Specifies the type of protection built for ICCM0. +-iccm0_prot None + +# iccm0_prot_level --- Specifies the level of protection. +-iccm0_prot_level Data_Only + +# iccm0_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the ICCM0 +-iccm0_prot_exceptions true + +# iccm0_sec_lvl --- Specifies the level of secure ICCM0. 
+-iccm0_sec_lvl Non_Secure + +# iccm0_dmi --- This enables external access through a DMI (direct memory interface) port. +-iccm0_dmi true + + +######## Instruction Cache --- com.arc.hardware.Instruction_Cache.1_0 ######## + +# Create Instruction Cache +-create com.arc.hardware.Instruction_Cache.1_0 "System.CPUisle.ARCv2EM.Instruction Cache" + +# ic_size --- This defines the total size of the instruction cache in bytes. +-ic_size 16384 + +# ic_ways --- This defines the number of cache ways +-ic_ways 2 + +# ic_bsize --- This defines the cache line length in bytes. +-ic_bsize 64 + +# ic_disable_on_reset --- The instruction cache may be enabled immediately after reset, depending on this option. If this option is enabled, the last cache operation is set to failed, and the direct cache-RAM access is enabled. Furthermore, the instruction cache is invalidated all cache lines are invalidated and unlocked, and the tag RAM is cleared. +-ic_disable_on_reset false + +# ic_feature_level --- This defines the feature level of the cache. +-ic_feature_level 1 + +# ic_pwr_opt_level --- This selects power-optimization options in the micro-architecture of the instruction cache. +-ic_pwr_opt_level 0 + +# ic_prot --- Specifies the type of protection built for ICACHE. +-ic_prot None + +# ic_prot_level --- Specifies the level of protection. +-ic_prot_level Data_Only + +# ic_prot_exceptions --- Builds exception generation hardware for uncorrectable (fatal) errors detected on ICACHE. +-ic_prot_exceptions true + + +######## Interrupt Controller --- com.arc.hardware.Interrupt_Controller.1_0 ######## + +# Create Interrupt Controller +-create com.arc.hardware.Interrupt_Controller.1_0 "System.CPUisle.ARCv2EM.Interrupt Controller" + +# number_of_interrupts --- This is the total number of interrupts available to the core. Some interrupts are allocated statically to a specific interrupt line (for example, timer interrupts). For more information on Interrupt and register-file options, see DesignWare ARCv2 ISA Programmers Reference Manual. +-number_of_interrupts 96 + +# number_of_levels --- Priority levels in the interrupt controller. +-number_of_levels 4 + +# external_interrupts --- This is the total number of interrupt pins available for external system components. This parameter must be less than the total number of interrupts. +-external_interrupts 77 + +# firq_option --- This enables the fast-interrupts option, (priority level 0 interrupts), which uses an alternate register bank (if configured) instead of saving the context to memory. +-firq_option true + + +######## JTAG Interface --- com.arc.hardware.JTAG_Interface.1_0 ######## + +# Create JTAG Interface +-create com.arc.hardware.JTAG_Interface.1_0 "System.CPUisle.ARCv2EM.JTAG Interface" + +######## Timer 0 --- com.arc.hardware.Timer_0.1_0 ######## + +# Create Timer 0 +-create com.arc.hardware.Timer_0.1_0 "System.CPUisle.ARCv2EM.Timer 0" + +# timer_0_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 0. +-timer_0_int_level 1 + + +######## Watchdog Timer --- com.arc.hardware.Watchdog_Timer.1_0 ######## + +# Create Watchdog Timer +-create com.arc.hardware.Watchdog_Timer.1_0 "System.CPUisle.ARCv2EM.Watchdog Timer" + +# watchdog_size --- Specifies the bit width of timer's internal counter. +-watchdog_size 32 + +# watchdog_clk --- Specifies whether the timer should be driven from a separate clock. 
+-watchdog_clk false + + +######## Real-time Counter --- com.arc.hardware.Real_time_Counter.1_0 ######## + +# Create Real-time Counter +-create com.arc.hardware.Real_time_Counter.1_0 "System.CPUisle.ARCv2EM.Real-time Counter" + +######## Performance Monitor --- com.arc.hardware.Performance_Monitor.1_0 ######## + +# Create Performance Monitor +-create com.arc.hardware.Performance_Monitor.1_0 "System.CPUisle.ARCv2EM.Performance Monitor" + +# pct_counters --- Number of counters for performance monitoring. +-pct_counters 8 + + +######## SmaRT --- com.arc.hardware.SmaRT.1_0 ######## + +# Create SmaRT +-create com.arc.hardware.SmaRT.1_0 System.CPUisle.ARCv2EM.SmaRT + +# smart_stack_entries --- This specifies the number of entries in the trace buffer. +-smart_stack_entries 8 + +# smart_implementation --- Flip-flop = FF-based design. Memory = memory-based design (provides better density for larger trace buffers). +-smart_implementation flip-flop + + +######## XY --- com.arc.hardware.XY.1_0 ######## + +# Create XY +-create com.arc.hardware.XY.1_0 System.CPUisle.ARCv2EM.XY + +# xy_config --- XY memory configuration: +# One memory: DCCM only. +# Two memories: DCCM + Y. +# Three memories: DCCM + X + Y. +-xy_config dccm_x_y + +# xy_size --- Size of X and Y memories if included. +# X and Y memories both have the same configured size. +-xy_size 16384 + +# xy_interleave --- Split XY memories into odd/even instances to enable single cycle unaligned access. +-xy_interleave false + +# xy_x_base --- Base region for X memory. All accesses to this region will initiate a transfer on the X memory. +-xy_x_base 9 + +# xy_y_base --- Base region for Y memory. All accesses to this region will initiate a transfer on the Y memory. +-xy_y_base 10 + + +######## AGU --- com.arc.hardware.AGU.1_0 ######## + +# Create AGU +-create com.arc.hardware.AGU.1_0 System.CPUisle.ARCv2EM.AGU + +# agu_size --- Predefined configurations of modifiers, address +# pointers and offset registers +#

+# 
+#         address     address                     
+#         pointers    offset regs      modifiers  
+#        ----------- --------------- ------------ 
+# small:     4           2                 4      
+# medium:    8           4                 12     
+# large:     12          8                 24     
+# 
+# +-agu_size large + +# agu_accord --- Enable the accordion stage if operating frequency is critical +-agu_accord true + +# agu_wb_depth --- Write buffer depth +-agu_wb_depth 4 + + +######## Actionpoints --- com.arc.hardware.Actionpoints.1_0 ######## + +# Create Actionpoints +-create com.arc.hardware.Actionpoints.1_0 System.CPUisle.ARCv2EM.Actionpoints + +# num_actionpoints --- This is the number of trigger events available. +-num_actionpoints 8 + +# aps_feature --- Selects Actionpoint feature set +-aps_feature min + + +######## Bit stream --- com.arc.hardware.Bit_stream.1_0 ######## + +# Create Bit stream +-create com.arc.hardware.Bit_stream.1_0 "System.CPUisle.ARCv2EM.Bit stream" + +######## Floating-point unit --- com.arc.hardware.Floating_point_unit.1_0 ######## + +# Create Floating-point unit +-create com.arc.hardware.Floating_point_unit.1_0 "System.CPUisle.ARCv2EM.Floating-point unit" + +# fpu_dp_assist --- This enables double-precision acceleration instructions. +-fpu_dp_assist true + +# fpu_fma_option --- This enables the fused multiply-add & multiply-subtract instructions. +-fpu_fma_option true + +# fpu_mas_cycles --- Make mul/add/sub multicycle to achieve a higher clock speed. +-fpu_mas_cycles 2 + +# fpu_pipe_impl --- FPU pipelined implementation +-fpu_pipe_impl true + +# fpu_div_option --- This enables divide & square-root acceleration +-fpu_div_option true + +# fpu_div_cycles --- Controls div/sqrt implementation. +-fpu_div_cycles 17 + + +######## Memory Protection Unit --- com.arc.hardware.Memory_Protection_Unit.1_0 ######## + +# Create Memory Protection Unit +-create com.arc.hardware.Memory_Protection_Unit.1_0 "System.CPUisle.ARCv2EM.Memory Protection Unit" + +# mpu_num_regions --- Number of configured memory regions. +-mpu_num_regions 16 + +# mpu_32b --- Set the minimal region size to be 32 byte instead of 2KB. +-mpu_32b false + +# mpu_sid_option --- It will enable SID support in Secure Shield +-mpu_sid_option false + + +######## Real-time trace producer --- com.arc.hardware.Real_time_trace_producer.1_0 ######## + +# Create Real-time trace producer +-create com.arc.hardware.Real_time_trace_producer.1_0 "System.CPUisle.ARCv2EM.Real-time trace producer" + +# rtt_feature_level --- 'small' means that program trace only is available. `medium' adds data trace. `full' adds core and aux register trace. +-rtt_feature_level full + + +######## ARCv2EM CCT --- cct.1_0 ######## + +# Create ARCv2EM CCT +-create cct.1_0 "System.ARCv2EM CCT" + +# cct --- +# Option used to add a CCT to the design for command-line builds +# Without this architect can't add this component to a build +# via a cmdline -create command. +# with old scripts. +# +-cct true + +# no_hostlink --- +# This prevents the inclusion of the hostlink library when compiling +# C or C++ programs. The resultant executable, if it contains printfs, +# will print to an internal fixed buffer __mwwrite_buf. +# Other hostlink operations that require debugger assistance, such as file +# opens, will fail. +# +# Hostlink references incur memory cycles at unpredictable times and +# so can perturb cycle-timing results. Without hostlink, +# the debugger will not in any way interfere with the target while it is running. +# Therefore this option is useful for simulation in which you want precisely the +# same cycle timing to occur each time you run, or for accurate power consumption results. 
+# +-cct_no_hostlink false + +# has_subsystem_cct_flow --- +# The above option will check for the presence of subsystem component in the build configuration and suitably modifies the Makefile for the sub-system environment. +# +-has_subsystem_cct_flow false + + +######## BusFabric --- com.arc.hardware.ARCv2MSS.BusFabric.1_0 ######## + +# Create BusFabric +-create com.arc.hardware.ARCv2MSS.BusFabric.1_0 System.BusFabric + +######## ClkCtrl --- com.arc.hardware.ARCv2MSS.ClkCtrl.1_0 ######## + +# Create ClkCtrl +-create com.arc.hardware.ARCv2MSS.ClkCtrl.1_0 System.ClkCtrl + +######## DSP Software --- com.arc.software.dfss.sw_dsp.1_0 ######## + +# Create DSP Software +-create com.arc.software.dfss.sw_dsp.1_0 "System.DSP Software" + +# sw_dsp --- Command line option for Software element 'DSP Software' +-sw_dsp true + + +######## EMSDP_BOARD --- com.arc.hardware.ARCv2MSS.EMSDP_BOARD.1_0 ######## + +# Create EMSDP_BOARD +-create com.arc.hardware.ARCv2MSS.EMSDP_BOARD.1_0 System.EMSDP_BOARD + +# emsdp_sys_freq --- Select the core frequency. +-emsdp_sys_freq 40 + + +######## IO Software --- com.arc.software.dfss.sw_io.1_0 ######## + +# Create IO Software +-create com.arc.software.dfss.sw_io.1_0 "System.IO Software" + +# sw_io --- Command line option for Software element 'IO Software' +-sw_io true + + +######## Implementation --- com.arc.hardware.implementation.1_0 ######## + +# Create Implementation +-create com.arc.hardware.implementation.1_0 System.Implementation + +# ClockSpeed --- Target clock speed of the system +-clock_speed 10 + +# DDR2_clk_Ratio --- DDR2 Clock Vs System Clock Ratio +# 2x +# 3x +# 4x +-ddr2_clk_ratio 3x + +# ClockSkew --- The clock skew for the system +-clock_skew 0.2 + +# HoldMargin --- Margin for hold time checks +-hold_margin 0.05 + +# Floorplan --- Floorplan definition for relative placement of RAMs (at CPU-level) or the placement of the rams and CPU hard cores (at multicore level) +-floorplan em4_sensor + +# JTAGFrequency --- Select the frequency of the JTAG clock Tck (in MHz). +# +# The JTAG clock speed has to be less than 1/2 of the cpu clock otherwise the signals on the BVCI interface are not guaranteed to be valid. +# +# NOTE: The RTL simulations will work when the JTAG clock frequency is set to half the CPU clock, however this may not be the case when simulating at gate level due to delays on the IO pads. +# +# The default is set to 10 MHz so that there is no conflict when simulating with an ARCangel3 at 30MHz. (30 > 10*2) +# +# The speed of simulation can be greatly increased by using a faster JTAG clock, but a dependency will warn if it exceeds 1/2 of the cpu clock. +# +-jtag_tclk 4 + +# execution_trace_level --- +# This traces committed instructions as they execute, and gathers statistics +# visible in the debugger for counting instructions & cycle delays. +# At the "stats" level ony the statistics are gathered and no trace is printed. +# "file" is equivalent to "full", but the results go to a trace .txt file instead. +# +-execution_trace_level stats + +# tb_trace --- +# Enable instruction execution trace. +# This is available to arc_dev licensees (internal developers) only. +# +-tb_trace false + +# zero_based_arcnum --- +# In a multicore build, number ARCs from 0. +# If this is not selected, arcs are numbered from 1. +# (This provides the initial value to the arcnum signal.) 
+# +-zero_based_arcnum true + +# generate_ipxact --- +# Generate ipxact.xml file describing the CPUisle or archipelago frontier +# +-generate_ipxact false + +# ipxact_relative_path_names --- +# Use relative path names for Verilog files in the ipxact. +# Otherwise, absolute path names are used. +# +-ipxact_relative_path_names true + +# optional_encryption --- +# When selected, encrypted RTL output is generated. +# +-optional_encryption false + +# ignore_encrypt_license --- +# When selected, pretend the encryption license is missing. For testing. +# +-ignore_encrypt_license false + +# ignore_clear_license --- +# When selected, pretend the cleartest license is missing. For testing. +# +-ignore_clear_license false + +# OPTION_require_archipelago --- +# When selected, force use of archipelago. This is for testing purposes. +# +-require_archipelago false + + +######## Infrastructure Software --- com.arc.software.dfss.sw_infra.1_0 ######## + +# Create Infrastructure Software +-create com.arc.software.dfss.sw_infra.1_0 "System.Infrastructure Software" + +# sw_infra --- Command line option for Software element 'Infrastructure Software' +-sw_infra true + +# templateName --- Template name +-template_name siss_combo_sensor_dsp + + +######## subsys_infra --- com.arc.hardware.dfss.subsys_infra.1_0 ######## + +# Create subsys_infra +-create com.arc.hardware.dfss.subsys_infra.1_0 System.subsys_infra + +# subsys_infra --- Command line option for EIA glue logic. +-subsys_infra true + +# internal_interrupt --- Connect the IO interrupts internally +-internal_interrupt true + +# internal_dma_handshake --- Connect the DMA handshake signals internally +-internal_dma_handshake true + +# spi_tb_sw_test_mode --- +# This is a secret option, not seen by customers. +# If you check this, the SPI peripheral's testbenches will be set to SW test mode: +# The serial interface of the first SPI master io_spi_mstN peripheral is connected to all SPI slave peripherals io_spi_slvN. +# This is used for testing the SW drivers. +# +-spi_tb_sw_test_mode false + +# i3c_tb_sw_test_mode --- +# This is a secret option, not seen by customers. +# If you check this, the I3C peripheral's testbenches will be set to SW test mode: +# The serial interface of the io_i3cN peripheral is connected to the I2C slave peripherals io_i2c_slv0. +# This is used for testing the SW drivers. +# +-i3c_tb_sw_test_mode false + +# subsys_apex_offset --- Subsystem APEX address offset in the AUX address space. The aperture used by the subsystem is fixed to 0x0010_0000. In general, the APEX address offset must be in the range from 0x0010_0000 to 0xFFF0_0000. However, if your design includes the "UAUX Interface" component, then the APEX address offset must be in the range from 0x0010_0000 to 0x7FF0_0000 to avoid address conflicts with any UAUX components. +-subsys_apex_offset 0x8000_0000 + +# subsys_uaux_offset --- Subsystem UAUX address offset in the UAUX address space. The UAUX address offset must be an integer multiple of 0x0010_0000 in the range from 0x0000_0000 to 0x7FF0_0000. The aperture reserved for the subsystem is fixed to 0x0010_0000. 
+-subsys_uaux_offset 0x10_0000 + + +######## ARC_RTT --- com.arc.hardware.ARC_RTT.1_0 ######## + +# Create ARC_RTT +-create com.arc.hardware.ARC_RTT.1_0 System.ARC_RTT + +# has_nexus_if --- Please select Nexus interface to offload the data from RTT +-has_nexus_if true + +# has_on_chip_mem --- Please select the on-chip memory option to store the trace data in shared memory +-has_on_chip_mem true + +# nexus_data_wdt --- Please select the Nexus Data Width to offload the data from RTT +-nexus_data_wdt 16 + +# internal_memory_size --- Please select internal memory size to capture the trace data +-internal_memory_size 16k + +# ram_type --- Please select Types of internal memories to be inferred for the logic +-ram_type 1_PORT + +# power_domains --- Adds isolation signal inputs/power switch controls for use in UPF flow when configuring power domains. +-rtt_power_domains false + + +######## Tool Configuration --- cgen.1_0 ######## + +# Create Tool Configuration +-create cgen.1_0 "System.Tool Configuration" + +# mwdt_version --- Selects the MetaWare version to be used with the TCF file. +# Change from the default to an older or newer toolset version if you want the TCF file to be used with an older or newer version of the MetaWare tools. +-mwdt_version O-2018.09 + +# code_base_addr --- +# The base address to assign to the executable code segment in the linker command file when there is no ICCM in the build. This value is ignored when there is an ICCM. +# +-code_base_addr 0x0 + +# data_base_addr --- +# The base address to assign to the data segment in the linker command file when the data is not being mapped to a DCCM. This value is ignored when the data segment is mapped to a DCCM, as in that case the base address of the DCCM memory is used. +# +# A value of 0xffffffff means that the data segment will not be mapped to any specific address. +# +-data_base_addr 0xffff_ffff + +# underscores_in_numbers --- Use underscores in hex numbers to improve readability. +-underscores_in_numbers false + +# tcf_rebrand --- Alternate branding of TCF (not used) +-rebrand false + + +]]>
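For orientation only (not part of the generated TCF): the ICCM0, X, and Y memories configured above are placed by memory region number rather than by absolute address (iccm0_base 6, xy_x_base 9, xy_y_base 10). A minimal sketch, assuming the usual ARCv2 EM convention that the 4 GB address space is split into sixteen 256 MB regions so region N starts at N * 0x1000_0000; the GROUP blocks in the linker command file further down map .Xdata/.Ydata and the vector table into these apertures, and the generated memory map should be treated as authoritative.

/* Sketch only -- derived from iccm0_base=6, xy_x_base=9, xy_y_base=10 above.
 * The 256 MB region size is an assumption; verify against the generated
 * linker command file / memory map for this build. */
#define ARC_REGION_SIZE 0x10000000u           /* assumed 256 MB per region     */
#define ICCM0_BASE  (6u  * ARC_REGION_SIZE)   /* 0x60000000, 128 KB ICCM0      */
#define XCCM_BASE   (9u  * ARC_REGION_SIZE)   /* 0x90000000, 16 KB X memory    */
#define YCCM_BASE   (10u * ARC_REGION_SIZE)   /* 0xA0000000, 16 KB Y memory    */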
+
+ + + + + + + + + + + + + + ICCM0 + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > SYSTEM2 + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + GROUP BLOCK(4): { + .Ydata? : {} + } > YCCM + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT + } + +]]> + + + + + + 0x07, sub_opcode => 0x1E , latency_cycles => 8) + +// User extension instruction - dsp_sin +extern long dsp_sin(long); +#pragma intrinsic(dsp_sin, opcode => 0x07, sub_opcode => 0x1F , latency_cycles => 8) + +// User extension instruction - dsp_tan +extern long dsp_tan(long); +#pragma intrinsic(dsp_tan, opcode => 0x07, sub_opcode => 0x22 , latency_cycles => 11) + +// User extension instruction - dsp_acos +extern long dsp_acos(long); +#pragma intrinsic(dsp_acos, opcode => 0x07, sub_opcode => 0x23 , latency_cycles => 31) + +// User extension instruction - dsp_asin +extern long dsp_asin(long); +#pragma intrinsic(dsp_asin, opcode => 0x07, sub_opcode => 0x24 , latency_cycles => 31) + +// User extension instruction - dsp_atan +extern long dsp_atan(long); +#pragma intrinsic(dsp_atan, opcode => 0x07, sub_opcode => 0x25 , latency_cycles => 13) + +// User extension instruction - dsp_sqrt +extern long dsp_sqrt(long); +#pragma intrinsic(dsp_sqrt, opcode => 0x07, sub_opcode => 0x20 , latency_cycles => 31) + +// User extension instruction - dsp_sqrt15 +extern long dsp_sqrt15(long); +#pragma intrinsic(dsp_sqrt15, opcode => 0x07, sub_opcode => 0x21 , latency_cycles => 15) + +#define APEX_COM_ARC_HARDWARE_DFSS_DSP_TRIG_PRESENT 1 +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO0_PRESENT 1 + +// User extension aux register io_gpio0_debounce +#define AR_IO_GPIO0_DEBOUNCE 0x80017048 +#pragma Aux_register(0x80017048, name=>"io_gpio0_debounce") + +// User extension aux register io_gpio0_clken +#define AR_IO_GPIO0_CLKEN 0x80017080 +#pragma Aux_register(0x80017080, name=>"io_gpio0_clken") + +// User extension aux register io_gpio0_swporta_dr +#define AR_IO_GPIO0_SWPORTA_DR 0x80017000 +#pragma Aux_register(0x80017000, name=>"io_gpio0_swporta_dr") + +// User extension aux register io_gpio0_swporta_ddr +#define AR_IO_GPIO0_SWPORTA_DDR 0x80017004 +#pragma Aux_register(0x80017004, name=>"io_gpio0_swporta_ddr") + +// User extension aux register io_gpio0_inten +#define AR_IO_GPIO0_INTEN 0x80017030 +#pragma Aux_register(0x80017030, name=>"io_gpio0_inten") + +// User extension aux register io_gpio0_intmask +#define AR_IO_GPIO0_INTMASK 0x80017034 +#pragma Aux_register(0x80017034, name=>"io_gpio0_intmask") + +// User extension aux register io_gpio0_inttype_level +#define AR_IO_GPIO0_INTTYPE_LEVEL 0x80017038 +#pragma Aux_register(0x80017038, name=>"io_gpio0_inttype_level") + +// User extension aux register io_gpio0_int_polarity +#define AR_IO_GPIO0_INT_POLARITY 0x8001703c +#pragma Aux_register(0x8001703c, name=>"io_gpio0_int_polarity") + +// User extension aux register io_gpio0_intstatus +#define AR_IO_GPIO0_INTSTATUS 0x80017040 +#pragma Aux_register(0x80017040, name=>"io_gpio0_intstatus") + +// User extension aux register io_gpio0_raw_intstatus +#define AR_IO_GPIO0_RAW_INTSTATUS 0x80017044 +#pragma Aux_register(0x80017044, name=>"io_gpio0_raw_intstatus") + +// User extension aux register io_gpio0_porta_eoi +#define AR_IO_GPIO0_PORTA_EOI 0x8001704c +#pragma Aux_register(0x8001704c, name=>"io_gpio0_porta_eoi") + +// User extension 
aux register io_gpio0_ext_porta +#define AR_IO_GPIO0_EXT_PORTA 0x80017050 +#pragma Aux_register(0x80017050, name=>"io_gpio0_ext_porta") + +// User extension aux register io_gpio0_ls_sync +#define AR_IO_GPIO0_LS_SYNC 0x80017060 +#pragma Aux_register(0x80017060, name=>"io_gpio0_ls_sync") + +// User extension aux register io_gpio0_int_bothedge +#define AR_IO_GPIO0_INT_BOTHEDGE 0x80017068 +#pragma Aux_register(0x80017068, name=>"io_gpio0_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST0_PRESENT 1 + +// User extension aux register io_i2c_mst0_clken +#define AR_IO_I2C_MST0_CLKEN 0x800120c0 +#pragma Aux_register(0x800120c0, name=>"io_i2c_mst0_clken") + +// User extension aux register io_i2c_mst0_con +#define AR_IO_I2C_MST0_CON 0x80012000 +#pragma Aux_register(0x80012000, name=>"io_i2c_mst0_con") + +// User extension aux register io_i2c_mst0_tar +#define AR_IO_I2C_MST0_TAR 0x80012004 +#pragma Aux_register(0x80012004, name=>"io_i2c_mst0_tar") + +// User extension aux register io_i2c_mst0_data_cmd +#define AR_IO_I2C_MST0_DATA_CMD 0x80012010 +#pragma Aux_register(0x80012010, name=>"io_i2c_mst0_data_cmd") + +// User extension aux register io_i2c_mst0_ss_scl_hcnt +#define AR_IO_I2C_MST0_SS_SCL_HCNT 0x80012014 +#pragma Aux_register(0x80012014, name=>"io_i2c_mst0_ss_scl_hcnt") + +// User extension aux register io_i2c_mst0_ss_scl_lcnt +#define AR_IO_I2C_MST0_SS_SCL_LCNT 0x80012018 +#pragma Aux_register(0x80012018, name=>"io_i2c_mst0_ss_scl_lcnt") + +// User extension aux register io_i2c_mst0_fs_scl_hcnt +#define AR_IO_I2C_MST0_FS_SCL_HCNT 0x8001201c +#pragma Aux_register(0x8001201c, name=>"io_i2c_mst0_fs_scl_hcnt") + +// User extension aux register io_i2c_mst0_fs_scl_lcnt +#define AR_IO_I2C_MST0_FS_SCL_LCNT 0x80012020 +#pragma Aux_register(0x80012020, name=>"io_i2c_mst0_fs_scl_lcnt") + +// User extension aux register io_i2c_mst0_intr_stat +#define AR_IO_I2C_MST0_INTR_STAT 0x8001202c +#pragma Aux_register(0x8001202c, name=>"io_i2c_mst0_intr_stat") + +// User extension aux register io_i2c_mst0_intr_mask +#define AR_IO_I2C_MST0_INTR_MASK 0x80012030 +#pragma Aux_register(0x80012030, name=>"io_i2c_mst0_intr_mask") + +// User extension aux register io_i2c_mst0_raw_intr_stat +#define AR_IO_I2C_MST0_RAW_INTR_STAT 0x80012034 +#pragma Aux_register(0x80012034, name=>"io_i2c_mst0_raw_intr_stat") + +// User extension aux register io_i2c_mst0_rx_tl +#define AR_IO_I2C_MST0_RX_TL 0x80012038 +#pragma Aux_register(0x80012038, name=>"io_i2c_mst0_rx_tl") + +// User extension aux register io_i2c_mst0_tx_tl +#define AR_IO_I2C_MST0_TX_TL 0x8001203c +#pragma Aux_register(0x8001203c, name=>"io_i2c_mst0_tx_tl") + +// User extension aux register io_i2c_mst0_clr_intr +#define AR_IO_I2C_MST0_CLR_INTR 0x80012040 +#pragma Aux_register(0x80012040, name=>"io_i2c_mst0_clr_intr") + +// User extension aux register io_i2c_mst0_clr_rx_under +#define AR_IO_I2C_MST0_CLR_RX_UNDER 0x80012044 +#pragma Aux_register(0x80012044, name=>"io_i2c_mst0_clr_rx_under") + +// User extension aux register io_i2c_mst0_clr_rx_over +#define AR_IO_I2C_MST0_CLR_RX_OVER 0x80012048 +#pragma Aux_register(0x80012048, name=>"io_i2c_mst0_clr_rx_over") + +// User extension aux register io_i2c_mst0_clr_tx_over +#define AR_IO_I2C_MST0_CLR_TX_OVER 0x8001204c +#pragma Aux_register(0x8001204c, name=>"io_i2c_mst0_clr_tx_over") + +// User extension aux register io_i2c_mst0_clr_tx_abrt +#define AR_IO_I2C_MST0_CLR_TX_ABRT 0x80012054 +#pragma Aux_register(0x80012054, name=>"io_i2c_mst0_clr_tx_abrt") + +// User extension aux register io_i2c_mst0_clr_activity +#define 
AR_IO_I2C_MST0_CLR_ACTIVITY 0x8001205c +#pragma Aux_register(0x8001205c, name=>"io_i2c_mst0_clr_activity") + +// User extension aux register io_i2c_mst0_clr_stop_det +#define AR_IO_I2C_MST0_CLR_STOP_DET 0x80012060 +#pragma Aux_register(0x80012060, name=>"io_i2c_mst0_clr_stop_det") + +// User extension aux register io_i2c_mst0_clr_start_det +#define AR_IO_I2C_MST0_CLR_START_DET 0x80012064 +#pragma Aux_register(0x80012064, name=>"io_i2c_mst0_clr_start_det") + +// User extension aux register io_i2c_mst0_enable +#define AR_IO_I2C_MST0_ENABLE 0x8001206c +#pragma Aux_register(0x8001206c, name=>"io_i2c_mst0_enable") + +// User extension aux register io_i2c_mst0_status +#define AR_IO_I2C_MST0_STATUS 0x80012070 +#pragma Aux_register(0x80012070, name=>"io_i2c_mst0_status") + +// User extension aux register io_i2c_mst0_txflr +#define AR_IO_I2C_MST0_TXFLR 0x80012074 +#pragma Aux_register(0x80012074, name=>"io_i2c_mst0_txflr") + +// User extension aux register io_i2c_mst0_rxflr +#define AR_IO_I2C_MST0_RXFLR 0x80012078 +#pragma Aux_register(0x80012078, name=>"io_i2c_mst0_rxflr") + +// User extension aux register io_i2c_mst0_sda_hold +#define AR_IO_I2C_MST0_SDA_HOLD 0x8001207c +#pragma Aux_register(0x8001207c, name=>"io_i2c_mst0_sda_hold") + +// User extension aux register io_i2c_mst0_tx_abrt_source +#define AR_IO_I2C_MST0_TX_ABRT_SOURCE 0x80012080 +#pragma Aux_register(0x80012080, name=>"io_i2c_mst0_tx_abrt_source") + +// User extension aux register io_i2c_mst0_enable_status +#define AR_IO_I2C_MST0_ENABLE_STATUS 0x8001209c +#pragma Aux_register(0x8001209c, name=>"io_i2c_mst0_enable_status") + +// User extension aux register io_i2c_mst0_fs_spklen +#define AR_IO_I2C_MST0_FS_SPKLEN 0x800120a0 +#pragma Aux_register(0x800120a0, name=>"io_i2c_mst0_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_SLV0_PRESENT 1 + +// User extension aux register io_i2c_slv0_clken +#define AR_IO_I2C_SLV0_CLKEN 0x800130c0 +#pragma Aux_register(0x800130c0, name=>"io_i2c_slv0_clken") + +// User extension aux register io_i2c_slv0_con +#define AR_IO_I2C_SLV0_CON 0x80013000 +#pragma Aux_register(0x80013000, name=>"io_i2c_slv0_con") + +// User extension aux register io_i2c_slv0_sar +#define AR_IO_I2C_SLV0_SAR 0x80013008 +#pragma Aux_register(0x80013008, name=>"io_i2c_slv0_sar") + +// User extension aux register io_i2c_slv0_data_cmd +#define AR_IO_I2C_SLV0_DATA_CMD 0x80013010 +#pragma Aux_register(0x80013010, name=>"io_i2c_slv0_data_cmd") + +// User extension aux register io_i2c_slv0_intr_stat +#define AR_IO_I2C_SLV0_INTR_STAT 0x8001302c +#pragma Aux_register(0x8001302c, name=>"io_i2c_slv0_intr_stat") + +// User extension aux register io_i2c_slv0_intr_mask +#define AR_IO_I2C_SLV0_INTR_MASK 0x80013030 +#pragma Aux_register(0x80013030, name=>"io_i2c_slv0_intr_mask") + +// User extension aux register io_i2c_slv0_raw_intr_stat +#define AR_IO_I2C_SLV0_RAW_INTR_STAT 0x80013034 +#pragma Aux_register(0x80013034, name=>"io_i2c_slv0_raw_intr_stat") + +// User extension aux register io_i2c_slv0_rx_tl +#define AR_IO_I2C_SLV0_RX_TL 0x80013038 +#pragma Aux_register(0x80013038, name=>"io_i2c_slv0_rx_tl") + +// User extension aux register io_i2c_slv0_tx_tl +#define AR_IO_I2C_SLV0_TX_TL 0x8001303c +#pragma Aux_register(0x8001303c, name=>"io_i2c_slv0_tx_tl") + +// User extension aux register io_i2c_slv0_clr_intr +#define AR_IO_I2C_SLV0_CLR_INTR 0x80013040 +#pragma Aux_register(0x80013040, name=>"io_i2c_slv0_clr_intr") + +// User extension aux register io_i2c_slv0_clr_rx_under +#define AR_IO_I2C_SLV0_CLR_RX_UNDER 0x80013044 +#pragma 
Aux_register(0x80013044, name=>"io_i2c_slv0_clr_rx_under") + +// User extension aux register io_i2c_slv0_clr_rx_over +#define AR_IO_I2C_SLV0_CLR_RX_OVER 0x80013048 +#pragma Aux_register(0x80013048, name=>"io_i2c_slv0_clr_rx_over") + +// User extension aux register io_i2c_slv0_clr_tx_over +#define AR_IO_I2C_SLV0_CLR_TX_OVER 0x8001304c +#pragma Aux_register(0x8001304c, name=>"io_i2c_slv0_clr_tx_over") + +// User extension aux register io_i2c_slv0_clr_rd_req +#define AR_IO_I2C_SLV0_CLR_RD_REQ 0x80013050 +#pragma Aux_register(0x80013050, name=>"io_i2c_slv0_clr_rd_req") + +// User extension aux register io_i2c_slv0_clr_tx_abrt +#define AR_IO_I2C_SLV0_CLR_TX_ABRT 0x80013054 +#pragma Aux_register(0x80013054, name=>"io_i2c_slv0_clr_tx_abrt") + +// User extension aux register io_i2c_slv0_clr_rx_done +#define AR_IO_I2C_SLV0_CLR_RX_DONE 0x80013058 +#pragma Aux_register(0x80013058, name=>"io_i2c_slv0_clr_rx_done") + +// User extension aux register io_i2c_slv0_clr_activity +#define AR_IO_I2C_SLV0_CLR_ACTIVITY 0x8001305c +#pragma Aux_register(0x8001305c, name=>"io_i2c_slv0_clr_activity") + +// User extension aux register io_i2c_slv0_clr_stop_det +#define AR_IO_I2C_SLV0_CLR_STOP_DET 0x80013060 +#pragma Aux_register(0x80013060, name=>"io_i2c_slv0_clr_stop_det") + +// User extension aux register io_i2c_slv0_clr_start_det +#define AR_IO_I2C_SLV0_CLR_START_DET 0x80013064 +#pragma Aux_register(0x80013064, name=>"io_i2c_slv0_clr_start_det") + +// User extension aux register io_i2c_slv0_enable +#define AR_IO_I2C_SLV0_ENABLE 0x8001306c +#pragma Aux_register(0x8001306c, name=>"io_i2c_slv0_enable") + +// User extension aux register io_i2c_slv0_status +#define AR_IO_I2C_SLV0_STATUS 0x80013070 +#pragma Aux_register(0x80013070, name=>"io_i2c_slv0_status") + +// User extension aux register io_i2c_slv0_txflr +#define AR_IO_I2C_SLV0_TXFLR 0x80013074 +#pragma Aux_register(0x80013074, name=>"io_i2c_slv0_txflr") + +// User extension aux register io_i2c_slv0_rxflr +#define AR_IO_I2C_SLV0_RXFLR 0x80013078 +#pragma Aux_register(0x80013078, name=>"io_i2c_slv0_rxflr") + +// User extension aux register io_i2c_slv0_sda_hold +#define AR_IO_I2C_SLV0_SDA_HOLD 0x8001307c +#pragma Aux_register(0x8001307c, name=>"io_i2c_slv0_sda_hold") + +// User extension aux register io_i2c_slv0_tx_abrt_source +#define AR_IO_I2C_SLV0_TX_ABRT_SOURCE 0x80013080 +#pragma Aux_register(0x80013080, name=>"io_i2c_slv0_tx_abrt_source") + +// User extension aux register io_i2c_slv0_sda_setup +#define AR_IO_I2C_SLV0_SDA_SETUP 0x80013094 +#pragma Aux_register(0x80013094, name=>"io_i2c_slv0_sda_setup") + +// User extension aux register io_i2c_slv0_enable_status +#define AR_IO_I2C_SLV0_ENABLE_STATUS 0x8001309c +#pragma Aux_register(0x8001309c, name=>"io_i2c_slv0_enable_status") + +// User extension aux register io_i2c_slv0_fs_spklen +#define AR_IO_I2C_SLV0_FS_SPKLEN 0x800130a0 +#pragma Aux_register(0x800130a0, name=>"io_i2c_slv0_fs_spklen") + +// User extension aux register io_i2c_slv0_clr_restart_det +#define AR_IO_I2C_SLV0_CLR_RESTART_DET 0x800130a8 +#pragma Aux_register(0x800130a8, name=>"io_i2c_slv0_clr_restart_det") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST0_PRESENT 1 + +// User extension aux register io_spi_mst0_ctrlr0 +#define AR_IO_SPI_MST0_CTRLR0 0x80010000 +#pragma Aux_register(0x80010000, name=>"io_spi_mst0_ctrlr0") + +// User extension aux register io_spi_mst0_ctrlr1 +#define AR_IO_SPI_MST0_CTRLR1 0x80010001 +#pragma Aux_register(0x80010001, name=>"io_spi_mst0_ctrlr1") + +// User extension aux register io_spi_mst0_spien +#define 
AR_IO_SPI_MST0_SPIEN 0x80010002 +#pragma Aux_register(0x80010002, name=>"io_spi_mst0_spien") + +// User extension aux register io_spi_mst0_ser +#define AR_IO_SPI_MST0_SER 0x80010004 +#pragma Aux_register(0x80010004, name=>"io_spi_mst0_ser") + +// User extension aux register io_spi_mst0_baudr +#define AR_IO_SPI_MST0_BAUDR 0x80010005 +#pragma Aux_register(0x80010005, name=>"io_spi_mst0_baudr") + +// User extension aux register io_spi_mst0_txftlr +#define AR_IO_SPI_MST0_TXFTLR 0x80010006 +#pragma Aux_register(0x80010006, name=>"io_spi_mst0_txftlr") + +// User extension aux register io_spi_mst0_rxftlr +#define AR_IO_SPI_MST0_RXFTLR 0x80010007 +#pragma Aux_register(0x80010007, name=>"io_spi_mst0_rxftlr") + +// User extension aux register io_spi_mst0_txflr +#define AR_IO_SPI_MST0_TXFLR 0x80010008 +#pragma Aux_register(0x80010008, name=>"io_spi_mst0_txflr") + +// User extension aux register io_spi_mst0_rxflr +#define AR_IO_SPI_MST0_RXFLR 0x80010009 +#pragma Aux_register(0x80010009, name=>"io_spi_mst0_rxflr") + +// User extension aux register io_spi_mst0_sr +#define AR_IO_SPI_MST0_SR 0x8001000a +#pragma Aux_register(0x8001000a, name=>"io_spi_mst0_sr") + +// User extension aux register io_spi_mst0_imr +#define AR_IO_SPI_MST0_IMR 0x8001000b +#pragma Aux_register(0x8001000b, name=>"io_spi_mst0_imr") + +// User extension aux register io_spi_mst0_isr +#define AR_IO_SPI_MST0_ISR 0x8001000c +#pragma Aux_register(0x8001000c, name=>"io_spi_mst0_isr") + +// User extension aux register io_spi_mst0_risr +#define AR_IO_SPI_MST0_RISR 0x8001000d +#pragma Aux_register(0x8001000d, name=>"io_spi_mst0_risr") + +// User extension aux register io_spi_mst0_txoicr +#define AR_IO_SPI_MST0_TXOICR 0x8001000e +#pragma Aux_register(0x8001000e, name=>"io_spi_mst0_txoicr") + +// User extension aux register io_spi_mst0_rxoicr +#define AR_IO_SPI_MST0_RXOICR 0x8001000f +#pragma Aux_register(0x8001000f, name=>"io_spi_mst0_rxoicr") + +// User extension aux register io_spi_mst0_rxuicr +#define AR_IO_SPI_MST0_RXUICR 0x80010010 +#pragma Aux_register(0x80010010, name=>"io_spi_mst0_rxuicr") + +// User extension aux register io_spi_mst0_icr +#define AR_IO_SPI_MST0_ICR 0x80010012 +#pragma Aux_register(0x80010012, name=>"io_spi_mst0_icr") + +// User extension aux register io_spi_mst0_clken +#define AR_IO_SPI_MST0_CLKEN 0x80010016 +#pragma Aux_register(0x80010016, name=>"io_spi_mst0_clken") + +// User extension aux register io_spi_mst0_dr +#define AR_IO_SPI_MST0_DR 0x80010018 +#pragma Aux_register(0x80010018, name=>"io_spi_mst0_dr") + +// User extension aux register io_spi_mst0_rx_sample_dly +#define AR_IO_SPI_MST0_RX_SAMPLE_DLY 0x8001003c +#pragma Aux_register(0x8001003c, name=>"io_spi_mst0_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_SUBSYS_BCR_PRESENT 1 + +// User extension aux register SUBSYS_BUILD +#define AR_SUBSYS_BUILD 0xf0 +#pragma Aux_register(0xf0, name=>"SUBSYS_BUILD") + +// User extension aux register SUBSYS_DSP_0_BUILD +#define AR_SUBSYS_DSP_0_BUILD 0xa00 +#pragma Aux_register(0xa00, name=>"SUBSYS_DSP_0_BUILD") + +// User extension aux register SUBSYS_DSP_0_CONFIG +#define AR_SUBSYS_DSP_0_CONFIG 0xa02 +#pragma Aux_register(0xa02, name=>"SUBSYS_DSP_0_CONFIG") + +// User extension aux register SUBSYS_IO_0_BUILD +#define AR_SUBSYS_IO_0_BUILD 0xa04 +#pragma Aux_register(0xa04, name=>"SUBSYS_IO_0_BUILD") + +// User extension aux register SUBSYS_IO_1_BUILD +#define AR_SUBSYS_IO_1_BUILD 0xa05 +#pragma Aux_register(0xa05, name=>"SUBSYS_IO_1_BUILD") + +// User extension aux register SUBSYS_IO_2_BUILD +#define 
AR_SUBSYS_IO_2_BUILD 0xa06 +#pragma Aux_register(0xa06, name=>"SUBSYS_IO_2_BUILD") + +// User extension aux register SUBSYS_UAUX_OFFSET +#define AR_SUBSYS_UAUX_OFFSET 0xa1e +#pragma Aux_register(0xa1e, name=>"SUBSYS_UAUX_OFFSET") + +// User extension aux register SUBSYS_APEX_OFFSET +#define AR_SUBSYS_APEX_OFFSET 0xa1f +#pragma Aux_register(0xa1f, name=>"SUBSYS_APEX_OFFSET") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST1_PRESENT 1 + +// User extension aux register io_spi_mst1_ctrlr0 +#define AR_IO_SPI_MST1_CTRLR0 0x80010100 +#pragma Aux_register(0x80010100, name=>"io_spi_mst1_ctrlr0") + +// User extension aux register io_spi_mst1_ctrlr1 +#define AR_IO_SPI_MST1_CTRLR1 0x80010101 +#pragma Aux_register(0x80010101, name=>"io_spi_mst1_ctrlr1") + +// User extension aux register io_spi_mst1_spien +#define AR_IO_SPI_MST1_SPIEN 0x80010102 +#pragma Aux_register(0x80010102, name=>"io_spi_mst1_spien") + +// User extension aux register io_spi_mst1_ser +#define AR_IO_SPI_MST1_SER 0x80010104 +#pragma Aux_register(0x80010104, name=>"io_spi_mst1_ser") + +// User extension aux register io_spi_mst1_baudr +#define AR_IO_SPI_MST1_BAUDR 0x80010105 +#pragma Aux_register(0x80010105, name=>"io_spi_mst1_baudr") + +// User extension aux register io_spi_mst1_txftlr +#define AR_IO_SPI_MST1_TXFTLR 0x80010106 +#pragma Aux_register(0x80010106, name=>"io_spi_mst1_txftlr") + +// User extension aux register io_spi_mst1_rxftlr +#define AR_IO_SPI_MST1_RXFTLR 0x80010107 +#pragma Aux_register(0x80010107, name=>"io_spi_mst1_rxftlr") + +// User extension aux register io_spi_mst1_txflr +#define AR_IO_SPI_MST1_TXFLR 0x80010108 +#pragma Aux_register(0x80010108, name=>"io_spi_mst1_txflr") + +// User extension aux register io_spi_mst1_rxflr +#define AR_IO_SPI_MST1_RXFLR 0x80010109 +#pragma Aux_register(0x80010109, name=>"io_spi_mst1_rxflr") + +// User extension aux register io_spi_mst1_sr +#define AR_IO_SPI_MST1_SR 0x8001010a +#pragma Aux_register(0x8001010a, name=>"io_spi_mst1_sr") + +// User extension aux register io_spi_mst1_imr +#define AR_IO_SPI_MST1_IMR 0x8001010b +#pragma Aux_register(0x8001010b, name=>"io_spi_mst1_imr") + +// User extension aux register io_spi_mst1_isr +#define AR_IO_SPI_MST1_ISR 0x8001010c +#pragma Aux_register(0x8001010c, name=>"io_spi_mst1_isr") + +// User extension aux register io_spi_mst1_risr +#define AR_IO_SPI_MST1_RISR 0x8001010d +#pragma Aux_register(0x8001010d, name=>"io_spi_mst1_risr") + +// User extension aux register io_spi_mst1_txoicr +#define AR_IO_SPI_MST1_TXOICR 0x8001010e +#pragma Aux_register(0x8001010e, name=>"io_spi_mst1_txoicr") + +// User extension aux register io_spi_mst1_rxoicr +#define AR_IO_SPI_MST1_RXOICR 0x8001010f +#pragma Aux_register(0x8001010f, name=>"io_spi_mst1_rxoicr") + +// User extension aux register io_spi_mst1_rxuicr +#define AR_IO_SPI_MST1_RXUICR 0x80010110 +#pragma Aux_register(0x80010110, name=>"io_spi_mst1_rxuicr") + +// User extension aux register io_spi_mst1_icr +#define AR_IO_SPI_MST1_ICR 0x80010112 +#pragma Aux_register(0x80010112, name=>"io_spi_mst1_icr") + +// User extension aux register io_spi_mst1_clken +#define AR_IO_SPI_MST1_CLKEN 0x80010116 +#pragma Aux_register(0x80010116, name=>"io_spi_mst1_clken") + +// User extension aux register io_spi_mst1_dr +#define AR_IO_SPI_MST1_DR 0x80010118 +#pragma Aux_register(0x80010118, name=>"io_spi_mst1_dr") + +// User extension aux register io_spi_mst1_rx_sample_dly +#define AR_IO_SPI_MST1_RX_SAMPLE_DLY 0x8001013c +#pragma Aux_register(0x8001013c, name=>"io_spi_mst1_rx_sample_dly") +#define 
APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST2_PRESENT 1 + +// User extension aux register io_spi_mst2_ctrlr0 +#define AR_IO_SPI_MST2_CTRLR0 0x80010200 +#pragma Aux_register(0x80010200, name=>"io_spi_mst2_ctrlr0") + +// User extension aux register io_spi_mst2_ctrlr1 +#define AR_IO_SPI_MST2_CTRLR1 0x80010201 +#pragma Aux_register(0x80010201, name=>"io_spi_mst2_ctrlr1") + +// User extension aux register io_spi_mst2_spien +#define AR_IO_SPI_MST2_SPIEN 0x80010202 +#pragma Aux_register(0x80010202, name=>"io_spi_mst2_spien") + +// User extension aux register io_spi_mst2_ser +#define AR_IO_SPI_MST2_SER 0x80010204 +#pragma Aux_register(0x80010204, name=>"io_spi_mst2_ser") + +// User extension aux register io_spi_mst2_baudr +#define AR_IO_SPI_MST2_BAUDR 0x80010205 +#pragma Aux_register(0x80010205, name=>"io_spi_mst2_baudr") + +// User extension aux register io_spi_mst2_txftlr +#define AR_IO_SPI_MST2_TXFTLR 0x80010206 +#pragma Aux_register(0x80010206, name=>"io_spi_mst2_txftlr") + +// User extension aux register io_spi_mst2_rxftlr +#define AR_IO_SPI_MST2_RXFTLR 0x80010207 +#pragma Aux_register(0x80010207, name=>"io_spi_mst2_rxftlr") + +// User extension aux register io_spi_mst2_txflr +#define AR_IO_SPI_MST2_TXFLR 0x80010208 +#pragma Aux_register(0x80010208, name=>"io_spi_mst2_txflr") + +// User extension aux register io_spi_mst2_rxflr +#define AR_IO_SPI_MST2_RXFLR 0x80010209 +#pragma Aux_register(0x80010209, name=>"io_spi_mst2_rxflr") + +// User extension aux register io_spi_mst2_sr +#define AR_IO_SPI_MST2_SR 0x8001020a +#pragma Aux_register(0x8001020a, name=>"io_spi_mst2_sr") + +// User extension aux register io_spi_mst2_imr +#define AR_IO_SPI_MST2_IMR 0x8001020b +#pragma Aux_register(0x8001020b, name=>"io_spi_mst2_imr") + +// User extension aux register io_spi_mst2_isr +#define AR_IO_SPI_MST2_ISR 0x8001020c +#pragma Aux_register(0x8001020c, name=>"io_spi_mst2_isr") + +// User extension aux register io_spi_mst2_risr +#define AR_IO_SPI_MST2_RISR 0x8001020d +#pragma Aux_register(0x8001020d, name=>"io_spi_mst2_risr") + +// User extension aux register io_spi_mst2_txoicr +#define AR_IO_SPI_MST2_TXOICR 0x8001020e +#pragma Aux_register(0x8001020e, name=>"io_spi_mst2_txoicr") + +// User extension aux register io_spi_mst2_rxoicr +#define AR_IO_SPI_MST2_RXOICR 0x8001020f +#pragma Aux_register(0x8001020f, name=>"io_spi_mst2_rxoicr") + +// User extension aux register io_spi_mst2_rxuicr +#define AR_IO_SPI_MST2_RXUICR 0x80010210 +#pragma Aux_register(0x80010210, name=>"io_spi_mst2_rxuicr") + +// User extension aux register io_spi_mst2_icr +#define AR_IO_SPI_MST2_ICR 0x80010212 +#pragma Aux_register(0x80010212, name=>"io_spi_mst2_icr") + +// User extension aux register io_spi_mst2_clken +#define AR_IO_SPI_MST2_CLKEN 0x80010216 +#pragma Aux_register(0x80010216, name=>"io_spi_mst2_clken") + +// User extension aux register io_spi_mst2_dr +#define AR_IO_SPI_MST2_DR 0x80010218 +#pragma Aux_register(0x80010218, name=>"io_spi_mst2_dr") + +// User extension aux register io_spi_mst2_rx_sample_dly +#define AR_IO_SPI_MST2_RX_SAMPLE_DLY 0x8001023c +#pragma Aux_register(0x8001023c, name=>"io_spi_mst2_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_SLV0_PRESENT 1 + +// User extension aux register io_spi_slv0_ctrlr0 +#define AR_IO_SPI_SLV0_CTRLR0 0x80011000 +#pragma Aux_register(0x80011000, name=>"io_spi_slv0_ctrlr0") + +// User extension aux register io_spi_slv0_spien +#define AR_IO_SPI_SLV0_SPIEN 0x80011002 +#pragma Aux_register(0x80011002, name=>"io_spi_slv0_spien") + +// User extension aux register io_spi_slv0_txftlr 
+#define AR_IO_SPI_SLV0_TXFTLR 0x80011006 +#pragma Aux_register(0x80011006, name=>"io_spi_slv0_txftlr") + +// User extension aux register io_spi_slv0_rxftlr +#define AR_IO_SPI_SLV0_RXFTLR 0x80011007 +#pragma Aux_register(0x80011007, name=>"io_spi_slv0_rxftlr") + +// User extension aux register io_spi_slv0_txflr +#define AR_IO_SPI_SLV0_TXFLR 0x80011008 +#pragma Aux_register(0x80011008, name=>"io_spi_slv0_txflr") + +// User extension aux register io_spi_slv0_rxflr +#define AR_IO_SPI_SLV0_RXFLR 0x80011009 +#pragma Aux_register(0x80011009, name=>"io_spi_slv0_rxflr") + +// User extension aux register io_spi_slv0_sr +#define AR_IO_SPI_SLV0_SR 0x8001100a +#pragma Aux_register(0x8001100a, name=>"io_spi_slv0_sr") + +// User extension aux register io_spi_slv0_imr +#define AR_IO_SPI_SLV0_IMR 0x8001100b +#pragma Aux_register(0x8001100b, name=>"io_spi_slv0_imr") + +// User extension aux register io_spi_slv0_isr +#define AR_IO_SPI_SLV0_ISR 0x8001100c +#pragma Aux_register(0x8001100c, name=>"io_spi_slv0_isr") + +// User extension aux register io_spi_slv0_risr +#define AR_IO_SPI_SLV0_RISR 0x8001100d +#pragma Aux_register(0x8001100d, name=>"io_spi_slv0_risr") + +// User extension aux register io_spi_slv0_txoicr +#define AR_IO_SPI_SLV0_TXOICR 0x8001100e +#pragma Aux_register(0x8001100e, name=>"io_spi_slv0_txoicr") + +// User extension aux register io_spi_slv0_rxoicr +#define AR_IO_SPI_SLV0_RXOICR 0x8001100f +#pragma Aux_register(0x8001100f, name=>"io_spi_slv0_rxoicr") + +// User extension aux register io_spi_slv0_rxuicr +#define AR_IO_SPI_SLV0_RXUICR 0x80011010 +#pragma Aux_register(0x80011010, name=>"io_spi_slv0_rxuicr") + +// User extension aux register io_spi_slv0_icr +#define AR_IO_SPI_SLV0_ICR 0x80011012 +#pragma Aux_register(0x80011012, name=>"io_spi_slv0_icr") + +// User extension aux register io_spi_slv0_clken +#define AR_IO_SPI_SLV0_CLKEN 0x80011016 +#pragma Aux_register(0x80011016, name=>"io_spi_slv0_clken") + +// User extension aux register io_spi_slv0_dr +#define AR_IO_SPI_SLV0_DR 0x80011018 +#pragma Aux_register(0x80011018, name=>"io_spi_slv0_dr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO1_PRESENT 1 + +// User extension aux register io_gpio1_debounce +#define AR_IO_GPIO1_DEBOUNCE 0x80017148 +#pragma Aux_register(0x80017148, name=>"io_gpio1_debounce") + +// User extension aux register io_gpio1_clken +#define AR_IO_GPIO1_CLKEN 0x80017180 +#pragma Aux_register(0x80017180, name=>"io_gpio1_clken") + +// User extension aux register io_gpio1_swporta_dr +#define AR_IO_GPIO1_SWPORTA_DR 0x80017100 +#pragma Aux_register(0x80017100, name=>"io_gpio1_swporta_dr") + +// User extension aux register io_gpio1_swporta_ddr +#define AR_IO_GPIO1_SWPORTA_DDR 0x80017104 +#pragma Aux_register(0x80017104, name=>"io_gpio1_swporta_ddr") + +// User extension aux register io_gpio1_inten +#define AR_IO_GPIO1_INTEN 0x80017130 +#pragma Aux_register(0x80017130, name=>"io_gpio1_inten") + +// User extension aux register io_gpio1_intmask +#define AR_IO_GPIO1_INTMASK 0x80017134 +#pragma Aux_register(0x80017134, name=>"io_gpio1_intmask") + +// User extension aux register io_gpio1_inttype_level +#define AR_IO_GPIO1_INTTYPE_LEVEL 0x80017138 +#pragma Aux_register(0x80017138, name=>"io_gpio1_inttype_level") + +// User extension aux register io_gpio1_int_polarity +#define AR_IO_GPIO1_INT_POLARITY 0x8001713c +#pragma Aux_register(0x8001713c, name=>"io_gpio1_int_polarity") + +// User extension aux register io_gpio1_intstatus +#define AR_IO_GPIO1_INTSTATUS 0x80017140 +#pragma Aux_register(0x80017140, name=>"io_gpio1_intstatus") + +// User 
extension aux register io_gpio1_raw_intstatus +#define AR_IO_GPIO1_RAW_INTSTATUS 0x80017144 +#pragma Aux_register(0x80017144, name=>"io_gpio1_raw_intstatus") + +// User extension aux register io_gpio1_porta_eoi +#define AR_IO_GPIO1_PORTA_EOI 0x8001714c +#pragma Aux_register(0x8001714c, name=>"io_gpio1_porta_eoi") + +// User extension aux register io_gpio1_ext_porta +#define AR_IO_GPIO1_EXT_PORTA 0x80017150 +#pragma Aux_register(0x80017150, name=>"io_gpio1_ext_porta") + +// User extension aux register io_gpio1_ls_sync +#define AR_IO_GPIO1_LS_SYNC 0x80017160 +#pragma Aux_register(0x80017160, name=>"io_gpio1_ls_sync") + +// User extension aux register io_gpio1_int_bothedge +#define AR_IO_GPIO1_INT_BOTHEDGE 0x80017168 +#pragma Aux_register(0x80017168, name=>"io_gpio1_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO2_PRESENT 1 + +// User extension aux register io_gpio2_debounce +#define AR_IO_GPIO2_DEBOUNCE 0x80017248 +#pragma Aux_register(0x80017248, name=>"io_gpio2_debounce") + +// User extension aux register io_gpio2_clken +#define AR_IO_GPIO2_CLKEN 0x80017280 +#pragma Aux_register(0x80017280, name=>"io_gpio2_clken") + +// User extension aux register io_gpio2_swporta_dr +#define AR_IO_GPIO2_SWPORTA_DR 0x80017200 +#pragma Aux_register(0x80017200, name=>"io_gpio2_swporta_dr") + +// User extension aux register io_gpio2_swporta_ddr +#define AR_IO_GPIO2_SWPORTA_DDR 0x80017204 +#pragma Aux_register(0x80017204, name=>"io_gpio2_swporta_ddr") + +// User extension aux register io_gpio2_inten +#define AR_IO_GPIO2_INTEN 0x80017230 +#pragma Aux_register(0x80017230, name=>"io_gpio2_inten") + +// User extension aux register io_gpio2_intmask +#define AR_IO_GPIO2_INTMASK 0x80017234 +#pragma Aux_register(0x80017234, name=>"io_gpio2_intmask") + +// User extension aux register io_gpio2_inttype_level +#define AR_IO_GPIO2_INTTYPE_LEVEL 0x80017238 +#pragma Aux_register(0x80017238, name=>"io_gpio2_inttype_level") + +// User extension aux register io_gpio2_int_polarity +#define AR_IO_GPIO2_INT_POLARITY 0x8001723c +#pragma Aux_register(0x8001723c, name=>"io_gpio2_int_polarity") + +// User extension aux register io_gpio2_intstatus +#define AR_IO_GPIO2_INTSTATUS 0x80017240 +#pragma Aux_register(0x80017240, name=>"io_gpio2_intstatus") + +// User extension aux register io_gpio2_raw_intstatus +#define AR_IO_GPIO2_RAW_INTSTATUS 0x80017244 +#pragma Aux_register(0x80017244, name=>"io_gpio2_raw_intstatus") + +// User extension aux register io_gpio2_porta_eoi +#define AR_IO_GPIO2_PORTA_EOI 0x8001724c +#pragma Aux_register(0x8001724c, name=>"io_gpio2_porta_eoi") + +// User extension aux register io_gpio2_ext_porta +#define AR_IO_GPIO2_EXT_PORTA 0x80017250 +#pragma Aux_register(0x80017250, name=>"io_gpio2_ext_porta") + +// User extension aux register io_gpio2_ls_sync +#define AR_IO_GPIO2_LS_SYNC 0x80017260 +#pragma Aux_register(0x80017260, name=>"io_gpio2_ls_sync") + +// User extension aux register io_gpio2_int_bothedge +#define AR_IO_GPIO2_INT_BOTHEDGE 0x80017268 +#pragma Aux_register(0x80017268, name=>"io_gpio2_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST1_PRESENT 1 + +// User extension aux register io_i2c_mst1_clken +#define AR_IO_I2C_MST1_CLKEN 0x800121c0 +#pragma Aux_register(0x800121c0, name=>"io_i2c_mst1_clken") + +// User extension aux register io_i2c_mst1_con +#define AR_IO_I2C_MST1_CON 0x80012100 +#pragma Aux_register(0x80012100, name=>"io_i2c_mst1_con") + +// User extension aux register io_i2c_mst1_tar +#define AR_IO_I2C_MST1_TAR 0x80012104 +#pragma Aux_register(0x80012104, 
name=>"io_i2c_mst1_tar") + +// User extension aux register io_i2c_mst1_data_cmd +#define AR_IO_I2C_MST1_DATA_CMD 0x80012110 +#pragma Aux_register(0x80012110, name=>"io_i2c_mst1_data_cmd") + +// User extension aux register io_i2c_mst1_ss_scl_hcnt +#define AR_IO_I2C_MST1_SS_SCL_HCNT 0x80012114 +#pragma Aux_register(0x80012114, name=>"io_i2c_mst1_ss_scl_hcnt") + +// User extension aux register io_i2c_mst1_ss_scl_lcnt +#define AR_IO_I2C_MST1_SS_SCL_LCNT 0x80012118 +#pragma Aux_register(0x80012118, name=>"io_i2c_mst1_ss_scl_lcnt") + +// User extension aux register io_i2c_mst1_fs_scl_hcnt +#define AR_IO_I2C_MST1_FS_SCL_HCNT 0x8001211c +#pragma Aux_register(0x8001211c, name=>"io_i2c_mst1_fs_scl_hcnt") + +// User extension aux register io_i2c_mst1_fs_scl_lcnt +#define AR_IO_I2C_MST1_FS_SCL_LCNT 0x80012120 +#pragma Aux_register(0x80012120, name=>"io_i2c_mst1_fs_scl_lcnt") + +// User extension aux register io_i2c_mst1_intr_stat +#define AR_IO_I2C_MST1_INTR_STAT 0x8001212c +#pragma Aux_register(0x8001212c, name=>"io_i2c_mst1_intr_stat") + +// User extension aux register io_i2c_mst1_intr_mask +#define AR_IO_I2C_MST1_INTR_MASK 0x80012130 +#pragma Aux_register(0x80012130, name=>"io_i2c_mst1_intr_mask") + +// User extension aux register io_i2c_mst1_raw_intr_stat +#define AR_IO_I2C_MST1_RAW_INTR_STAT 0x80012134 +#pragma Aux_register(0x80012134, name=>"io_i2c_mst1_raw_intr_stat") + +// User extension aux register io_i2c_mst1_rx_tl +#define AR_IO_I2C_MST1_RX_TL 0x80012138 +#pragma Aux_register(0x80012138, name=>"io_i2c_mst1_rx_tl") + +// User extension aux register io_i2c_mst1_tx_tl +#define AR_IO_I2C_MST1_TX_TL 0x8001213c +#pragma Aux_register(0x8001213c, name=>"io_i2c_mst1_tx_tl") + +// User extension aux register io_i2c_mst1_clr_intr +#define AR_IO_I2C_MST1_CLR_INTR 0x80012140 +#pragma Aux_register(0x80012140, name=>"io_i2c_mst1_clr_intr") + +// User extension aux register io_i2c_mst1_clr_rx_under +#define AR_IO_I2C_MST1_CLR_RX_UNDER 0x80012144 +#pragma Aux_register(0x80012144, name=>"io_i2c_mst1_clr_rx_under") + +// User extension aux register io_i2c_mst1_clr_rx_over +#define AR_IO_I2C_MST1_CLR_RX_OVER 0x80012148 +#pragma Aux_register(0x80012148, name=>"io_i2c_mst1_clr_rx_over") + +// User extension aux register io_i2c_mst1_clr_tx_over +#define AR_IO_I2C_MST1_CLR_TX_OVER 0x8001214c +#pragma Aux_register(0x8001214c, name=>"io_i2c_mst1_clr_tx_over") + +// User extension aux register io_i2c_mst1_clr_tx_abrt +#define AR_IO_I2C_MST1_CLR_TX_ABRT 0x80012154 +#pragma Aux_register(0x80012154, name=>"io_i2c_mst1_clr_tx_abrt") + +// User extension aux register io_i2c_mst1_clr_activity +#define AR_IO_I2C_MST1_CLR_ACTIVITY 0x8001215c +#pragma Aux_register(0x8001215c, name=>"io_i2c_mst1_clr_activity") + +// User extension aux register io_i2c_mst1_clr_stop_det +#define AR_IO_I2C_MST1_CLR_STOP_DET 0x80012160 +#pragma Aux_register(0x80012160, name=>"io_i2c_mst1_clr_stop_det") + +// User extension aux register io_i2c_mst1_clr_start_det +#define AR_IO_I2C_MST1_CLR_START_DET 0x80012164 +#pragma Aux_register(0x80012164, name=>"io_i2c_mst1_clr_start_det") + +// User extension aux register io_i2c_mst1_enable +#define AR_IO_I2C_MST1_ENABLE 0x8001216c +#pragma Aux_register(0x8001216c, name=>"io_i2c_mst1_enable") + +// User extension aux register io_i2c_mst1_status +#define AR_IO_I2C_MST1_STATUS 0x80012170 +#pragma Aux_register(0x80012170, name=>"io_i2c_mst1_status") + +// User extension aux register io_i2c_mst1_txflr +#define AR_IO_I2C_MST1_TXFLR 0x80012174 +#pragma Aux_register(0x80012174, name=>"io_i2c_mst1_txflr") + +// User 
extension aux register io_i2c_mst1_rxflr +#define AR_IO_I2C_MST1_RXFLR 0x80012178 +#pragma Aux_register(0x80012178, name=>"io_i2c_mst1_rxflr") + +// User extension aux register io_i2c_mst1_sda_hold +#define AR_IO_I2C_MST1_SDA_HOLD 0x8001217c +#pragma Aux_register(0x8001217c, name=>"io_i2c_mst1_sda_hold") + +// User extension aux register io_i2c_mst1_tx_abrt_source +#define AR_IO_I2C_MST1_TX_ABRT_SOURCE 0x80012180 +#pragma Aux_register(0x80012180, name=>"io_i2c_mst1_tx_abrt_source") + +// User extension aux register io_i2c_mst1_enable_status +#define AR_IO_I2C_MST1_ENABLE_STATUS 0x8001219c +#pragma Aux_register(0x8001219c, name=>"io_i2c_mst1_enable_status") + +// User extension aux register io_i2c_mst1_fs_spklen +#define AR_IO_I2C_MST1_FS_SPKLEN 0x800121a0 +#pragma Aux_register(0x800121a0, name=>"io_i2c_mst1_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST2_PRESENT 1 + +// User extension aux register io_i2c_mst2_clken +#define AR_IO_I2C_MST2_CLKEN 0x800122c0 +#pragma Aux_register(0x800122c0, name=>"io_i2c_mst2_clken") + +// User extension aux register io_i2c_mst2_con +#define AR_IO_I2C_MST2_CON 0x80012200 +#pragma Aux_register(0x80012200, name=>"io_i2c_mst2_con") + +// User extension aux register io_i2c_mst2_tar +#define AR_IO_I2C_MST2_TAR 0x80012204 +#pragma Aux_register(0x80012204, name=>"io_i2c_mst2_tar") + +// User extension aux register io_i2c_mst2_data_cmd +#define AR_IO_I2C_MST2_DATA_CMD 0x80012210 +#pragma Aux_register(0x80012210, name=>"io_i2c_mst2_data_cmd") + +// User extension aux register io_i2c_mst2_ss_scl_hcnt +#define AR_IO_I2C_MST2_SS_SCL_HCNT 0x80012214 +#pragma Aux_register(0x80012214, name=>"io_i2c_mst2_ss_scl_hcnt") + +// User extension aux register io_i2c_mst2_ss_scl_lcnt +#define AR_IO_I2C_MST2_SS_SCL_LCNT 0x80012218 +#pragma Aux_register(0x80012218, name=>"io_i2c_mst2_ss_scl_lcnt") + +// User extension aux register io_i2c_mst2_fs_scl_hcnt +#define AR_IO_I2C_MST2_FS_SCL_HCNT 0x8001221c +#pragma Aux_register(0x8001221c, name=>"io_i2c_mst2_fs_scl_hcnt") + +// User extension aux register io_i2c_mst2_fs_scl_lcnt +#define AR_IO_I2C_MST2_FS_SCL_LCNT 0x80012220 +#pragma Aux_register(0x80012220, name=>"io_i2c_mst2_fs_scl_lcnt") + +// User extension aux register io_i2c_mst2_intr_stat +#define AR_IO_I2C_MST2_INTR_STAT 0x8001222c +#pragma Aux_register(0x8001222c, name=>"io_i2c_mst2_intr_stat") + +// User extension aux register io_i2c_mst2_intr_mask +#define AR_IO_I2C_MST2_INTR_MASK 0x80012230 +#pragma Aux_register(0x80012230, name=>"io_i2c_mst2_intr_mask") + +// User extension aux register io_i2c_mst2_raw_intr_stat +#define AR_IO_I2C_MST2_RAW_INTR_STAT 0x80012234 +#pragma Aux_register(0x80012234, name=>"io_i2c_mst2_raw_intr_stat") + +// User extension aux register io_i2c_mst2_rx_tl +#define AR_IO_I2C_MST2_RX_TL 0x80012238 +#pragma Aux_register(0x80012238, name=>"io_i2c_mst2_rx_tl") + +// User extension aux register io_i2c_mst2_tx_tl +#define AR_IO_I2C_MST2_TX_TL 0x8001223c +#pragma Aux_register(0x8001223c, name=>"io_i2c_mst2_tx_tl") + +// User extension aux register io_i2c_mst2_clr_intr +#define AR_IO_I2C_MST2_CLR_INTR 0x80012240 +#pragma Aux_register(0x80012240, name=>"io_i2c_mst2_clr_intr") + +// User extension aux register io_i2c_mst2_clr_rx_under +#define AR_IO_I2C_MST2_CLR_RX_UNDER 0x80012244 +#pragma Aux_register(0x80012244, name=>"io_i2c_mst2_clr_rx_under") + +// User extension aux register io_i2c_mst2_clr_rx_over +#define AR_IO_I2C_MST2_CLR_RX_OVER 0x80012248 +#pragma Aux_register(0x80012248, name=>"io_i2c_mst2_clr_rx_over") + +// User extension aux register 
io_i2c_mst2_clr_tx_over +#define AR_IO_I2C_MST2_CLR_TX_OVER 0x8001224c +#pragma Aux_register(0x8001224c, name=>"io_i2c_mst2_clr_tx_over") + +// User extension aux register io_i2c_mst2_clr_tx_abrt +#define AR_IO_I2C_MST2_CLR_TX_ABRT 0x80012254 +#pragma Aux_register(0x80012254, name=>"io_i2c_mst2_clr_tx_abrt") + +// User extension aux register io_i2c_mst2_clr_activity +#define AR_IO_I2C_MST2_CLR_ACTIVITY 0x8001225c +#pragma Aux_register(0x8001225c, name=>"io_i2c_mst2_clr_activity") + +// User extension aux register io_i2c_mst2_clr_stop_det +#define AR_IO_I2C_MST2_CLR_STOP_DET 0x80012260 +#pragma Aux_register(0x80012260, name=>"io_i2c_mst2_clr_stop_det") + +// User extension aux register io_i2c_mst2_clr_start_det +#define AR_IO_I2C_MST2_CLR_START_DET 0x80012264 +#pragma Aux_register(0x80012264, name=>"io_i2c_mst2_clr_start_det") + +// User extension aux register io_i2c_mst2_enable +#define AR_IO_I2C_MST2_ENABLE 0x8001226c +#pragma Aux_register(0x8001226c, name=>"io_i2c_mst2_enable") + +// User extension aux register io_i2c_mst2_status +#define AR_IO_I2C_MST2_STATUS 0x80012270 +#pragma Aux_register(0x80012270, name=>"io_i2c_mst2_status") + +// User extension aux register io_i2c_mst2_txflr +#define AR_IO_I2C_MST2_TXFLR 0x80012274 +#pragma Aux_register(0x80012274, name=>"io_i2c_mst2_txflr") + +// User extension aux register io_i2c_mst2_rxflr +#define AR_IO_I2C_MST2_RXFLR 0x80012278 +#pragma Aux_register(0x80012278, name=>"io_i2c_mst2_rxflr") + +// User extension aux register io_i2c_mst2_sda_hold +#define AR_IO_I2C_MST2_SDA_HOLD 0x8001227c +#pragma Aux_register(0x8001227c, name=>"io_i2c_mst2_sda_hold") + +// User extension aux register io_i2c_mst2_tx_abrt_source +#define AR_IO_I2C_MST2_TX_ABRT_SOURCE 0x80012280 +#pragma Aux_register(0x80012280, name=>"io_i2c_mst2_tx_abrt_source") + +// User extension aux register io_i2c_mst2_enable_status +#define AR_IO_I2C_MST2_ENABLE_STATUS 0x8001229c +#pragma Aux_register(0x8001229c, name=>"io_i2c_mst2_enable_status") + +// User extension aux register io_i2c_mst2_fs_spklen +#define AR_IO_I2C_MST2_FS_SPKLEN 0x800122a0 +#pragma Aux_register(0x800122a0, name=>"io_i2c_mst2_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART0_PRESENT 1 + +// User extension aux register io_uart0_clken +#define AR_IO_UART0_CLKEN 0x800140c0 +#pragma Aux_register(0x800140c0, name=>"io_uart0_clken") + +// User extension aux register io_uart0_rbr_thr_dll +#define AR_IO_UART0_RBR_THR_DLL 0x80014000 +#pragma Aux_register(0x80014000, name=>"io_uart0_rbr_thr_dll") + +// User extension aux register io_uart0_ier_dlh +#define AR_IO_UART0_IER_DLH 0x80014004 +#pragma Aux_register(0x80014004, name=>"io_uart0_ier_dlh") + +// User extension aux register io_uart0_iir_fcr +#define AR_IO_UART0_IIR_FCR 0x80014008 +#pragma Aux_register(0x80014008, name=>"io_uart0_iir_fcr") + +// User extension aux register io_uart0_lcr +#define AR_IO_UART0_LCR 0x8001400c +#pragma Aux_register(0x8001400c, name=>"io_uart0_lcr") + +// User extension aux register io_uart0_mcr +#define AR_IO_UART0_MCR 0x80014010 +#pragma Aux_register(0x80014010, name=>"io_uart0_mcr") + +// User extension aux register io_uart0_lsr +#define AR_IO_UART0_LSR 0x80014014 +#pragma Aux_register(0x80014014, name=>"io_uart0_lsr") + +// User extension aux register io_uart0_msr +#define AR_IO_UART0_MSR 0x80014018 +#pragma Aux_register(0x80014018, name=>"io_uart0_msr") + +// User extension aux register io_uart0_usr +#define AR_IO_UART0_USR 0x8001407c +#pragma Aux_register(0x8001407c, name=>"io_uart0_usr") +#define 
APEX_COM_ARC_HARDWARE_DFSS_IO_UART1_PRESENT 1 + +// User extension aux register io_uart1_clken +#define AR_IO_UART1_CLKEN 0x800141c0 +#pragma Aux_register(0x800141c0, name=>"io_uart1_clken") + +// User extension aux register io_uart1_rbr_thr_dll +#define AR_IO_UART1_RBR_THR_DLL 0x80014100 +#pragma Aux_register(0x80014100, name=>"io_uart1_rbr_thr_dll") + +// User extension aux register io_uart1_ier_dlh +#define AR_IO_UART1_IER_DLH 0x80014104 +#pragma Aux_register(0x80014104, name=>"io_uart1_ier_dlh") + +// User extension aux register io_uart1_iir_fcr +#define AR_IO_UART1_IIR_FCR 0x80014108 +#pragma Aux_register(0x80014108, name=>"io_uart1_iir_fcr") + +// User extension aux register io_uart1_lcr +#define AR_IO_UART1_LCR 0x8001410c +#pragma Aux_register(0x8001410c, name=>"io_uart1_lcr") + +// User extension aux register io_uart1_mcr +#define AR_IO_UART1_MCR 0x80014110 +#pragma Aux_register(0x80014110, name=>"io_uart1_mcr") + +// User extension aux register io_uart1_lsr +#define AR_IO_UART1_LSR 0x80014114 +#pragma Aux_register(0x80014114, name=>"io_uart1_lsr") + +// User extension aux register io_uart1_msr +#define AR_IO_UART1_MSR 0x80014118 +#pragma Aux_register(0x80014118, name=>"io_uart1_msr") + +// User extension aux register io_uart1_usr +#define AR_IO_UART1_USR 0x8001417c +#pragma Aux_register(0x8001417c, name=>"io_uart1_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART2_PRESENT 1 + +// User extension aux register io_uart2_clken +#define AR_IO_UART2_CLKEN 0x800142c0 +#pragma Aux_register(0x800142c0, name=>"io_uart2_clken") + +// User extension aux register io_uart2_rbr_thr_dll +#define AR_IO_UART2_RBR_THR_DLL 0x80014200 +#pragma Aux_register(0x80014200, name=>"io_uart2_rbr_thr_dll") + +// User extension aux register io_uart2_ier_dlh +#define AR_IO_UART2_IER_DLH 0x80014204 +#pragma Aux_register(0x80014204, name=>"io_uart2_ier_dlh") + +// User extension aux register io_uart2_iir_fcr +#define AR_IO_UART2_IIR_FCR 0x80014208 +#pragma Aux_register(0x80014208, name=>"io_uart2_iir_fcr") + +// User extension aux register io_uart2_lcr +#define AR_IO_UART2_LCR 0x8001420c +#pragma Aux_register(0x8001420c, name=>"io_uart2_lcr") + +// User extension aux register io_uart2_mcr +#define AR_IO_UART2_MCR 0x80014210 +#pragma Aux_register(0x80014210, name=>"io_uart2_mcr") + +// User extension aux register io_uart2_lsr +#define AR_IO_UART2_LSR 0x80014214 +#pragma Aux_register(0x80014214, name=>"io_uart2_lsr") + +// User extension aux register io_uart2_msr +#define AR_IO_UART2_MSR 0x80014218 +#pragma Aux_register(0x80014218, name=>"io_uart2_msr") + +// User extension aux register io_uart2_usr +#define AR_IO_UART2_USR 0x8001427c +#pragma Aux_register(0x8001427c, name=>"io_uart2_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART3_PRESENT 1 + +// User extension aux register io_uart3_clken +#define AR_IO_UART3_CLKEN 0x800143c0 +#pragma Aux_register(0x800143c0, name=>"io_uart3_clken") + +// User extension aux register io_uart3_rbr_thr_dll +#define AR_IO_UART3_RBR_THR_DLL 0x80014300 +#pragma Aux_register(0x80014300, name=>"io_uart3_rbr_thr_dll") + +// User extension aux register io_uart3_ier_dlh +#define AR_IO_UART3_IER_DLH 0x80014304 +#pragma Aux_register(0x80014304, name=>"io_uart3_ier_dlh") + +// User extension aux register io_uart3_iir_fcr +#define AR_IO_UART3_IIR_FCR 0x80014308 +#pragma Aux_register(0x80014308, name=>"io_uart3_iir_fcr") + +// User extension aux register io_uart3_lcr +#define AR_IO_UART3_LCR 0x8001430c +#pragma Aux_register(0x8001430c, name=>"io_uart3_lcr") + +// User extension aux register 
io_uart3_mcr +#define AR_IO_UART3_MCR 0x80014310 +#pragma Aux_register(0x80014310, name=>"io_uart3_mcr") + +// User extension aux register io_uart3_lsr +#define AR_IO_UART3_LSR 0x80014314 +#pragma Aux_register(0x80014314, name=>"io_uart3_lsr") + +// User extension aux register io_uart3_msr +#define AR_IO_UART3_MSR 0x80014318 +#pragma Aux_register(0x80014318, name=>"io_uart3_msr") + +// User extension aux register io_uart3_usr +#define AR_IO_UART3_USR 0x8001437c +#pragma Aux_register(0x8001437c, name=>"io_uart3_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2S_RX_MST0_PRESENT 1 + +// User extension aux register io_i2s_rx_mst0_ier +#define AR_IO_I2S_RX_MST0_IER 0x8001a000 +#pragma Aux_register(0x8001a000, name=>"io_i2s_rx_mst0_ier") + +// User extension aux register io_i2s_rx_mst0_irer +#define AR_IO_I2S_RX_MST0_IRER 0x8001a004 +#pragma Aux_register(0x8001a004, name=>"io_i2s_rx_mst0_irer") + +// User extension aux register io_i2s_rx_mst0_cer +#define AR_IO_I2S_RX_MST0_CER 0x8001a00c +#pragma Aux_register(0x8001a00c, name=>"io_i2s_rx_mst0_cer") + +// User extension aux register io_i2s_rx_mst0_ccr +#define AR_IO_I2S_RX_MST0_CCR 0x8001a010 +#pragma Aux_register(0x8001a010, name=>"io_i2s_rx_mst0_ccr") + +// User extension aux register io_i2s_rx_mst0_rxffr +#define AR_IO_I2S_RX_MST0_RXFFR 0x8001a014 +#pragma Aux_register(0x8001a014, name=>"io_i2s_rx_mst0_rxffr") + +// User extension aux register io_i2s_rx_mst0_lrbr +#define AR_IO_I2S_RX_MST0_LRBR 0x8001a020 +#pragma Aux_register(0x8001a020, name=>"io_i2s_rx_mst0_lrbr") + +// User extension aux register io_i2s_rx_mst0_rrbr +#define AR_IO_I2S_RX_MST0_RRBR 0x8001a024 +#pragma Aux_register(0x8001a024, name=>"io_i2s_rx_mst0_rrbr") + +// User extension aux register io_i2s_rx_mst0_rer +#define AR_IO_I2S_RX_MST0_RER 0x8001a028 +#pragma Aux_register(0x8001a028, name=>"io_i2s_rx_mst0_rer") + +// User extension aux register io_i2s_rx_mst0_rcr +#define AR_IO_I2S_RX_MST0_RCR 0x8001a030 +#pragma Aux_register(0x8001a030, name=>"io_i2s_rx_mst0_rcr") + +// User extension aux register io_i2s_rx_mst0_isr +#define AR_IO_I2S_RX_MST0_ISR 0x8001a038 +#pragma Aux_register(0x8001a038, name=>"io_i2s_rx_mst0_isr") + +// User extension aux register io_i2s_rx_mst0_imr +#define AR_IO_I2S_RX_MST0_IMR 0x8001a03c +#pragma Aux_register(0x8001a03c, name=>"io_i2s_rx_mst0_imr") + +// User extension aux register io_i2s_rx_mst0_ror +#define AR_IO_I2S_RX_MST0_ROR 0x8001a040 +#pragma Aux_register(0x8001a040, name=>"io_i2s_rx_mst0_ror") + +// User extension aux register io_i2s_rx_mst0_rfcr +#define AR_IO_I2S_RX_MST0_RFCR 0x8001a048 +#pragma Aux_register(0x8001a048, name=>"io_i2s_rx_mst0_rfcr") + +// User extension aux register io_i2s_rx_mst0_rff +#define AR_IO_I2S_RX_MST0_RFF 0x8001a050 +#pragma Aux_register(0x8001a050, name=>"io_i2s_rx_mst0_rff") + +// User extension aux register io_i2s_rx_mst0_rxdma +#define AR_IO_I2S_RX_MST0_RXDMA 0x8001a1c0 +#pragma Aux_register(0x8001a1c0, name=>"io_i2s_rx_mst0_rxdma") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2S_TX_MST0_PRESENT 1 + +// User extension aux register io_i2s_tx_mst0_ier +#define AR_IO_I2S_TX_MST0_IER 0x80019000 +#pragma Aux_register(0x80019000, name=>"io_i2s_tx_mst0_ier") + +// User extension aux register io_i2s_tx_mst0_iter +#define AR_IO_I2S_TX_MST0_ITER 0x80019008 +#pragma Aux_register(0x80019008, name=>"io_i2s_tx_mst0_iter") + +// User extension aux register io_i2s_tx_mst0_cer +#define AR_IO_I2S_TX_MST0_CER 0x8001900c +#pragma Aux_register(0x8001900c, name=>"io_i2s_tx_mst0_cer") + +// User extension aux register io_i2s_tx_mst0_ccr 
+#define AR_IO_I2S_TX_MST0_CCR 0x80019010 +#pragma Aux_register(0x80019010, name=>"io_i2s_tx_mst0_ccr") + +// User extension aux register io_i2s_tx_mst0_txffr +#define AR_IO_I2S_TX_MST0_TXFFR 0x80019018 +#pragma Aux_register(0x80019018, name=>"io_i2s_tx_mst0_txffr") + +// User extension aux register io_i2s_tx_mst0_lthr +#define AR_IO_I2S_TX_MST0_LTHR 0x80019020 +#pragma Aux_register(0x80019020, name=>"io_i2s_tx_mst0_lthr") + +// User extension aux register io_i2s_tx_mst0_rthr +#define AR_IO_I2S_TX_MST0_RTHR 0x80019024 +#pragma Aux_register(0x80019024, name=>"io_i2s_tx_mst0_rthr") + +// User extension aux register io_i2s_tx_mst0_ter +#define AR_IO_I2S_TX_MST0_TER 0x8001902c +#pragma Aux_register(0x8001902c, name=>"io_i2s_tx_mst0_ter") + +// User extension aux register io_i2s_tx_mst0_tcr +#define AR_IO_I2S_TX_MST0_TCR 0x80019034 +#pragma Aux_register(0x80019034, name=>"io_i2s_tx_mst0_tcr") + +// User extension aux register io_i2s_tx_mst0_isr +#define AR_IO_I2S_TX_MST0_ISR 0x80019038 +#pragma Aux_register(0x80019038, name=>"io_i2s_tx_mst0_isr") + +// User extension aux register io_i2s_tx_mst0_imr +#define AR_IO_I2S_TX_MST0_IMR 0x8001903c +#pragma Aux_register(0x8001903c, name=>"io_i2s_tx_mst0_imr") + +// User extension aux register io_i2s_tx_mst0_tor +#define AR_IO_I2S_TX_MST0_TOR 0x80019044 +#pragma Aux_register(0x80019044, name=>"io_i2s_tx_mst0_tor") + +// User extension aux register io_i2s_tx_mst0_tfcr +#define AR_IO_I2S_TX_MST0_TFCR 0x8001904c +#pragma Aux_register(0x8001904c, name=>"io_i2s_tx_mst0_tfcr") + +// User extension aux register io_i2s_tx_mst0_tff +#define AR_IO_I2S_TX_MST0_TFF 0x80019054 +#pragma Aux_register(0x80019054, name=>"io_i2s_tx_mst0_tff") + +// User extension aux register io_i2s_tx_mst0_txdma +#define AR_IO_I2S_TX_MST0_TXDMA 0x800191c8 +#pragma Aux_register(0x800191c8, name=>"io_i2s_tx_mst0_txdma") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_PDM_RX0_PRESENT 1 + +// User extension aux register io_pdm_rx0_pdm_en +#define AR_IO_PDM_RX0_PDM_EN 0x8001b000 +#pragma Aux_register(0x8001b000, name=>"io_pdm_rx0_pdm_en") + +// User extension aux register io_pdm_rx0_pdm_ren +#define AR_IO_PDM_RX0_PDM_REN 0x8001b004 +#pragma Aux_register(0x8001b004, name=>"io_pdm_rx0_pdm_ren") + +// User extension aux register io_pdm_rx0_cer +#define AR_IO_PDM_RX0_CER 0x8001b00c +#pragma Aux_register(0x8001b00c, name=>"io_pdm_rx0_cer") + +// User extension aux register io_pdm_rx0_rxffr +#define AR_IO_PDM_RX0_RXFFR 0x8001b014 +#pragma Aux_register(0x8001b014, name=>"io_pdm_rx0_rxffr") + +// User extension aux register io_pdm_rx0_rer0 +#define AR_IO_PDM_RX0_RER0 0x8001b028 +#pragma Aux_register(0x8001b028, name=>"io_pdm_rx0_rer0") + +// User extension aux register io_pdm_rx0_isr +#define AR_IO_PDM_RX0_ISR 0x8001b038 +#pragma Aux_register(0x8001b038, name=>"io_pdm_rx0_isr") + +// User extension aux register io_pdm_rx0_imr +#define AR_IO_PDM_RX0_IMR 0x8001b03c +#pragma Aux_register(0x8001b03c, name=>"io_pdm_rx0_imr") + +// User extension aux register io_pdm_rx0_ror +#define AR_IO_PDM_RX0_ROR 0x8001b040 +#pragma Aux_register(0x8001b040, name=>"io_pdm_rx0_ror") + +// User extension aux register io_pdm_rx0_rfcr +#define AR_IO_PDM_RX0_RFCR 0x8001b048 +#pragma Aux_register(0x8001b048, name=>"io_pdm_rx0_rfcr") + +// User extension aux register io_pdm_rx0_rxdma +#define AR_IO_PDM_RX0_RXDMA 0x8001b1c0 +#pragma Aux_register(0x8001b1c0, name=>"io_pdm_rx0_rxdma") + +// User extension aux register io_pdm_rx0_pdm_rr +#define AR_IO_PDM_RX0_PDM_RR 0x8001b1d0 +#pragma Aux_register(0x8001b1d0, name=>"io_pdm_rx0_pdm_rr") + 
+// User extension aux register io_pdm_rx0_cic_n +#define AR_IO_PDM_RX0_CIC_N 0x8001b1d4 +#pragma Aux_register(0x8001b1d4, name=>"io_pdm_rx0_cic_n") + +// User extension aux register io_pdm_rx0_cic_d +#define AR_IO_PDM_RX0_CIC_D 0x8001b1d8 +#pragma Aux_register(0x8001b1d8, name=>"io_pdm_rx0_cic_d") + +// User extension aux register io_pdm_rx0_dcrc +#define AR_IO_PDM_RX0_DCRC 0x8001b1dc +#pragma Aux_register(0x8001b1dc, name=>"io_pdm_rx0_dcrc") + +// User extension aux register io_pdm_rx0_brc_b0 +#define AR_IO_PDM_RX0_BRC_B0 0x8001b1e0 +#pragma Aux_register(0x8001b1e0, name=>"io_pdm_rx0_brc_b0") + +// User extension aux register io_pdm_rx0_brc_clp +#define AR_IO_PDM_RX0_BRC_CLP 0x8001b1f0 +#pragma Aux_register(0x8001b1f0, name=>"io_pdm_rx0_brc_clp") +#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_PRESENT 1 + +// User extension aux register fpu_build +#define AR_FPU_BUILD 0xc8 +#pragma Aux_register(0xc8, name=>"fpu_build") + +// User extension aux register fpu_ctrl +#define AR_FPU_CTRL 0x300 +#pragma Aux_register(0x300, name=>"fpu_ctrl") + +// User extension aux register fpu_status +#define AR_FPU_STATUS 0x301 +#pragma Aux_register(0x301, name=>"fpu_status") + +// User extension instruction fsmadd +extern int fsmadd(int,int); +#pragma intrinsic(fsmadd,opcode=>6,sub_opcode=>5, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsmsub +extern int fsmsub(int,int); +#pragma intrinsic(fsmsub,opcode=>6,sub_opcode=>6, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsmul +extern int fsmul(int,int); +#pragma intrinsic(fsmul,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsadd +extern int fsadd(int,int); +#pragma intrinsic(fsadd,opcode=>6,sub_opcode=>1, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fssub +extern int fssub(int,int); +#pragma intrinsic(fssub,opcode=>6,sub_opcode=>2, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fcvt32 +extern int fcvt32(int,int); +#pragma intrinsic(fcvt32,opcode=>6,sub_opcode=>8, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsdiv +extern int fsdiv(int,int); +#pragma intrinsic(fsdiv,opcode=>6,sub_opcode=>7, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmp +extern int fscmp(int,int); +#pragma intrinsic(fscmp,opcode=>6,sub_opcode=>3, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmp +extern int fscmp_f(int,int); +#pragma intrinsic(fscmp_f,opcode=>6,sub_opcode=>3, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmpf +extern int fscmpf(int,int); +#pragma intrinsic(fscmpf,opcode=>6,sub_opcode=>4, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmpf +extern int 
fscmpf_f(int,int); +#pragma intrinsic(fscmpf_f,opcode=>6,sub_opcode=>4, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fssqrt +extern int fssqrt(int); +#pragma intrinsic(fssqrt,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") +#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_DP_ASSIST_PRESENT 1 + +// User extension aux register aux_dpfp1l +#define AR_AUX_DPFP1L 0x302 +#pragma Aux_register(0x302, name=>"aux_dpfp1l") + +// User extension aux register aux_dpfp1h +#define AR_AUX_DPFP1H 0x303 +#pragma Aux_register(0x303, name=>"aux_dpfp1h") + +// User extension aux register aux_dpfp2l +#define AR_AUX_DPFP2L 0x304 +#pragma Aux_register(0x304, name=>"aux_dpfp2l") + +// User extension aux register aux_dpfp2h +#define AR_AUX_DPFP2H 0x305 +#pragma Aux_register(0x305, name=>"aux_dpfp2h") + +// User extension instruction dmulh11 +extern int dmulh11(int,int); +#pragma intrinsic(dmulh11,opcode=>6,sub_opcode=>48,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh11 +extern int dmulh11_f(int,int); +#pragma intrinsic(dmulh11_f,opcode=>6,sub_opcode=>48, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh12 +extern int dmulh12(int,int); +#pragma intrinsic(dmulh12,opcode=>6,sub_opcode=>49,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh12 +extern int dmulh12_f(int,int); +#pragma intrinsic(dmulh12_f,opcode=>6,sub_opcode=>49, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh21 +extern int dmulh21(int,int); +#pragma intrinsic(dmulh21,opcode=>6,sub_opcode=>50,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh21 +extern int dmulh21_f(int,int); +#pragma intrinsic(dmulh21_f,opcode=>6,sub_opcode=>50, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh22 +extern int dmulh22(int,int); +#pragma intrinsic(dmulh22,opcode=>6,sub_opcode=>51,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh22 +extern int dmulh22_f(int,int); +#pragma intrinsic(dmulh22_f,opcode=>6,sub_opcode=>51, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh11 +extern int daddh11(int,int); +#pragma 
intrinsic(daddh11,opcode=>6,sub_opcode=>52,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh11 +extern int daddh11_f(int,int); +#pragma intrinsic(daddh11_f,opcode=>6,sub_opcode=>52, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh12 +extern int daddh12(int,int); +#pragma intrinsic(daddh12,opcode=>6,sub_opcode=>53,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh12 +extern int daddh12_f(int,int); +#pragma intrinsic(daddh12_f,opcode=>6,sub_opcode=>53, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh21 +extern int daddh21(int,int); +#pragma intrinsic(daddh21,opcode=>6,sub_opcode=>54,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh21 +extern int daddh21_f(int,int); +#pragma intrinsic(daddh21_f,opcode=>6,sub_opcode=>54, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh22 +extern int daddh22(int,int); +#pragma intrinsic(daddh22,opcode=>6,sub_opcode=>55,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh22 +extern int daddh22_f(int,int); +#pragma intrinsic(daddh22_f,opcode=>6,sub_opcode=>55, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh11 +extern int dsubh11(int,int); +#pragma intrinsic(dsubh11,opcode=>6,sub_opcode=>56,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh11 +extern int dsubh11_f(int,int); +#pragma intrinsic(dsubh11_f,opcode=>6,sub_opcode=>56, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh12 +extern int dsubh12(int,int); +#pragma intrinsic(dsubh12,opcode=>6,sub_opcode=>57,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh12 +extern int dsubh12_f(int,int); +#pragma intrinsic(dsubh12_f,opcode=>6,sub_opcode=>57, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; 
auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh21 +extern int dsubh21(int,int); +#pragma intrinsic(dsubh21,opcode=>6,sub_opcode=>58,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh21 +extern int dsubh21_f(int,int); +#pragma intrinsic(dsubh21_f,opcode=>6,sub_opcode=>58, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh22 +extern int dsubh22(int,int); +#pragma intrinsic(dsubh22,opcode=>6,sub_opcode=>59,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh22 +extern int dsubh22_f(int,int); +#pragma intrinsic(dsubh22_f,opcode=>6,sub_opcode=>59, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dexcl1 +extern int dexcl1(int,int); +#pragma intrinsic(dexcl1,opcode=>6,sub_opcode=>60, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dexcl2 +extern int dexcl2(int,int); +#pragma intrinsic(dexcl2,opcode=>6,sub_opcode=>61, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + + +#endif + + +]]> + + + + +
+ diff --git a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf new file mode 100644 index 00000000000..da39ae911ff --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf @@ -0,0 +1,47 @@ +# SYSTEM memory regions indicate where external memory might be located. +# The TCF has no specific knowledge of whether SYSTEM regions contain +# external memory or not. +# CCMWRAP memory regions indicate unusable portions of the address space +# due to CCM memory wrapping into upper addresses beyond its size + +MEMORY { +# SYSTEM0 : ORIGIN = 0x00000000, LENGTH = 0x20000000 + ICCM0 : ORIGIN = 0x20000000, LENGTH = 0x00040000 +# CCMWRAP0: ORIGIN = 0x20040000, LENGTH = 0x0ffc0000 +# SYSTEM1 : ORIGIN = 0x30000000, LENGTH = 0x50000000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 +# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 +# SYSTEM2 : ORIGIN = 0x90000000, LENGTH = 0x30000000 + XCCM : ORIGIN = 0xc0000000, LENGTH = 0x00008000 +# CCMWRAP2: ORIGIN = 0xc0008000, LENGTH = 0x0fff8000 +# SYSTEM3 : ORIGIN = 0xd0000000, LENGTH = 0x10000000 + YCCM : ORIGIN = 0xe0000000, LENGTH = 0x00008000 +# CCMWRAP3: ORIGIN = 0xe0008000, LENGTH = 0x0fff8000 +# SYSTEM4 : ORIGIN = 0xf0000000, LENGTH = 0x10000000 + } +SECTIONS { + GROUP: { + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP: { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32768): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > DCCM + GROUP: { + .Xdata? : {} + } > XCCM + GROUP: { + .Ydata? : {} + } > YCCM + GROUP BIND(0x0): { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:684): {} = FILL(0xa5a5a5a5,4) + } + } diff --git a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf new file mode 100644 index 00000000000..004215a2f6a --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf @@ -0,0 +1,4621 @@ + + + + + + + + + + + + + + + + + + + + 10*2) +# +# The speed of simulation can be greatly increased by using a faster JTAG clock, but a dependency will warn if it exceeds 1/2 of the cpu clock. +# +-jtag_tclk 4 + +# execution_trace_level --- +# This traces committed instructions as they execute, and gathers statistics +# visible in the debugger for counting instructions & cycle delays. +# At the "stats" level ony the statistics are gathered and no trace is printed. +# "file" is equivalent to "full", but the results go to a trace .txt file instead. +# +-execution_trace_level stats + +# generate_ipxact --- +# Generate ipxact.xml file describing the CPUisle or archipelago frontier +# +-generate_ipxact false + +# ipxact_relative_path_names --- +# Use relative path names for Verilog files in the ipxact. +# Otherwise, absolute path names are used. +# +-ipxact_relative_path_names true + +# optional_encryption --- +# When selected, encrypted RTL output is generated. +# +-optional_encryption false + +# ignore_encrypt_license --- +# When selected, pretend the encryption license is missing. For testing. +# +-ignore_encrypt_license false + +# ignore_clear_license --- +# When selected, pretend the cleartest license is missing. For testing. 
+# +-ignore_clear_license false + + +######## Tool Configuration --- cgen.1_0 ######## + +# Create Tool Configuration +-create cgen.1_0 "System.Tool Configuration" + +# mwdt_version --- Selects the MetaWare version to be used with the TCF file. +# Change from the default to an older or newer toolset version if you want the TCF file to be used with an older or newer version of the MetaWare tools. +-mwdt_version K-2015.09 + +# code_base_addr --- +# The base address to assign to the executable code segment in the linker command file when there is no ICCM in the build. This value is ignored when there is an ICCM. +# +-code_base_addr 0 + +# data_base_addr --- +# The base address to assign to the data segment in the linker command file when the data is not being mapped to a DCCM. This value is ignored when the data segment is mapped to a DCCM, as in that case the base address of the DCCM memory is used. +# +# A value of 0xffffffff means that the data segment will not be mapped to any specific address. +# +-data_base_addr 4294967295 + + +######## IO Software --- com.arc.software.dfss.sw_io.1_0 ######## + +# Create IO Software +-create com.arc.software.dfss.sw_io.1_0 "System.IO Software" + +# sw_io --- Command line option for Software element 'IO Software' +-sw_io true + + +######## DSP Software --- com.arc.software.dfss.sw_dsp.1_0 ######## + +# Create DSP Software +-create com.arc.software.dfss.sw_dsp.1_0 "System.DSP Software" + +# sw_dsp --- Command line option for Software element 'DSP Software' +-sw_dsp true + + +######## Infrastructure Software --- com.arc.software.dfss.sw_infra.1_0 ######## + +# Create Infrastructure Software +-create com.arc.software.dfss.sw_infra.1_0 "System.Infrastructure Software" + +# sw_infra --- Command line option for Software element 'Infrastructure Software' +-sw_infra true + + +######## CPUisle --- com.arc.hardware.CPU_isle.1_0 ######## + +# Create CPUisle +-create com.arc.hardware.CPU_isle.1_0 System.CPUisle + +# unique_name --- verilog module modifier prefix +-unique_name "" + +# ArcNum --- The processor number as read back in the ARCNUM field of the IDENTITY register. +-arc_num 1 + +# instances --- +# The number of instantiations of this core. +# +-instances 1 + +# CPUFloorplan --- Floorplan giving relative placement of the RAMs for the given configuration of ARCv2HS or ARCv2EM in this CPUisle +-cpu_floorplan em9d_xyccm + +# userCPUFloorplanPath --- Pathname of user floorplan for the CPU when using a hierarchical implementation +-usercpufloorplan_path "" + +# pinLocationConstraintsFile --- Pathname+filename of the physical pin location constraints file or just "side1" (all pins on l.h.s) or "side2" (pins on top only) or "side3" (pins on r.h.s. only) or "side4" (pins on bottom only) to get a template file generated +-pin_location_constraints_file "" + + +######## ARCv2EM --- com.arc.hardware.ARCv2EM.1_0 ######## + +# Create ARCv2EM +-create com.arc.hardware.ARCv2EM.1_0 System.CPUisle.ARCv2EM + +# arcv2em --- Description to follow +-arcv2em true + +# def_div2ref --- This specifies the clock division factor at reset. It is used for mss clock controller to generate core clock, and the value N means core is running at (1/N) x ref_clk. +-def_div2ref 1 + +# addr_size --- This defines the address bus width (in bits). +-addr_size 32 + +# pc_size --- This defines the program counter (in bits). +-pc_size 32 + +# lpc_size --- This defines the size of the loop counter (in bits). +-lpc_size 32 + +# halt_on_reset --- This defines whether the core is halted initially on reset. 
+-halt_on_reset true + +# byte_order --- This defines the endianness of the core. +-byte_order little + +# code_density_option --- This reduces the size of program memory by adding instructions that condense commonly used instruction patterns with some marginal increase in processor gate count. The added instructions are ENTER_S, LEAVE_S, JLI_S, BI, BIH. +-code_density_option true + +# bitscan_option --- This adds instructions for efficient search of bits within a 32 bit word, including normalize (NORM, NORMH, NORMW) and find first or last set bit (FFS, FLS) instructions. +-bitscan_option true + +# shift_option --- The Shift ISA option adds variable and multi-length shift rotation instructions: (0) No shift/rotation instructions (1) ASR16, ASR8, LSR8, LSL8, ROL8, ROR8 (2) ASRM, ASLM, LSRM, RORM (3) ASR16, ASR8, LSR8, LSL8, ROL8, ROR8, ASRM, ASLM, LSRM, RORM +-shift_option 3 + +# swap_option --- This adds two instructions used to swap half-words or bytes in a 32b word. Useful for converting between little to big endianess and vice-versa. +-swap_option true + +# div_rem_option --- The DIV/REM option adds non-blocking multi-cycle implementation of integer divide/remainder functions. Added instructions are DIV, DIVU (integer divide), REM and REMU (integer divide remainder).radix2 takes 33 cycles. radix4_enhanced takes 3 to 19 cycles per operation. +-div_rem_option none + +# mpy_option --- The Multiplier ISA option allows selection between several multiplier configurations to tradeoff performance with silicon area. +# For select multiply options, when the DIV/REM option is also selected, some datapath resources will be shared between the multiply and divide pipeline to minimize total area. +# +# Cycle count (16-bit, lower 32-bit or upper 32-bit) for the different configurations is as follows: +#
+#
+# option  16/L32/U32  Instructions
+# ------  ----------  ---------------------
+# none    -/-/-       None
+# wlh1    1/1/1       MPYW/U, MPY/U, MPYH/U
+# wlh2    2/2/2       MPYW/U, MPY/U, MPYH/U
+# wlh3    2/3/3       MPYW/U, MPY/U, MPYH/U
+# wlh4    2/4/5       MPYW/U, MPY/U, MPYH/U
+# wlh5    5/9/9       MPYW/U, MPY/U, MPYH/U
+#
+# +-mpy_option none + +# code_protection --- The ARC EM architecture divides the memory into 16 regions, which can be protected individually. This feature adds a 16-bit input to the processor core, one bit per region. When the protect bit is set, the processor disables any load or store to the corresponding region. An attempt to access a protected region raises an EV_ProtV exception. +-code_protection true + +# stack_checking --- Stack checking is a mechanism for checking stack accesses and raising an exception when a stack overflow or underflow is detected. +-stack_checking true + +# unaligned_option --- This enables unaligned loads and stores. +-unaligned_option true + +# intvbase_preset --- This sets the interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE. +-intvbase_preset 0 + +# rgf_impl --- This defines whether the register file is implemented using flip-flops, or with a hard macro. +-rgf_impl flip_flops + +# rgf_num_regs --- This defines the size (in 32b register) of the processor register file. +-rgf_num_regs 32 + +# rgf_wr_ports --- This defines the number of write ports on the register file. +-rgf_wr_ports 2 + +# rgf_num_banks --- Dual register banks are useful if Fast IRQ has been configured, but may be selected even if not. +-rgf_num_banks 2 + +# rgf_banked_regs --- This selects the number of registers that are replicated in the second register-file bank. +-rgf_banked_regs 32 + +# turbo_boost --- This enables the Turbo Boost synthesis option. By enabling this option, the achievable clock frequency is increased, but at the cost of an additional cycle latency on branch instructions. +-turbo_boost false + +# infer_alu_adder --- infer: datapath is described as behavioral code: A + B +# instantiate: datapath is instantiated as a detailed multi-stage code of a carry-lookahead adder. It is generally preferable to use the infer option and add directives for your target synthesizer. +-infer_alu_adder infer + +# infer_mpy_wtree --- infer: datapath is described as behavioral code: A * B (applies to only wlh3, wlh4 and wlh5 designs) +# instantiate: datapath is instantiated as a detailed multi-stage code of a Wallace Tree multiplier It is generally preferable to use the infer option and add directives for your target synthesizer. +-infer_mpy_wtree instantiate + +# power_domains --- Adds three separate power domains to the core, and propagates power-gate control signals to the top level of the core. Also generates UPF constraints and commands in the low-power scripts +-power_domains true + +# dvfs --- Adds logic to the core to allow dynamic controlling of voltage and frequency and propagates the associated control signals to the top level of core +-dvfs true + +# voltage_domains --- Creates a voltage domain split between RAM and std cell parts to support Ultra Low Voltage on cells and generates UPF constraints +-voltage_domains false + +# mem_bus_option --- The core supports three bus protocols for accessing external memory: AHB & AHB-Lite. AHB-Lite-single means instruction fetch and data access share a single AHB-Lite port. AHB-Lite-dual means separate AHB-Lite port for each initiator. +-mem_bus_option AHB-Lite-dual + +# mem_bus_reg_interface --- Specifies whether the memory bus interface is registered. 
+-mem_bus_reg_interface true + +# dmi_burst_option --- This will enable high-throughput burst support on the DMI slave interfaces. By enabling this option, the peak DMI read throughput goes from 1 word per 3 cycles to N words per N+2 cycles, in which N is the AHB burst lengthDMI write throughput goes from 1 word per 3 cycles to 1 word per cycle. +-dmi_burst_option false + +# has_dmp_peripheral --- This option enables the redirection of load/store accesses to one segment (1/16) of the addressable space to a dedicated peripheral bus. This offers high system integration and reduces overall system cost. +-has_dmp_peripheral false + +# per_bus_option --- The core supports one bus protocol for accessing the peripheral space, when enabled: AHB-Lite. +-per_bus_option AHB-Lite + +# per_bus_reg_interface --- Specifies whether the peripheral bus interface is registered. +-per_bus_reg_interface false + +# clock_gating --- This enables the insertion of architectural clock gate elements in the design. By enabling this option, the clocks to various parts of the design will be disabled when the logic they drive is not in use to save power. +-clock_gating true + +# byte_parity --- If parity protection on the CCMs is configured, this option is used to enable parity protection on a per-byte basis. Otherwise, parity will be per word basis +-byte_parity false + +# prot_pipelined --- Check the box if CCM memories are configured for ECC, and you want single-bit errors to be corrected, written back to memory, and re-fetched. When unchecked, single bit errors are corrected when read from memory, but the offending memory location itself is not corrected with a writeback +-prot_pipelined false + +# cct_test_ena --- When ECC is configured, this option enables automatic generation of error conditions in relevant testbench memories to exercise error detection and correction features +-cct_test_ena false + + +######## AGU --- com.arc.hardware.AGU.1_0 ######## + +# Create AGU +-create com.arc.hardware.AGU.1_0 System.CPUisle.ARCv2EM.AGU + +# agu_size --- Predefined configurations of modifiers, address +# pointers and offset registers +#
+#
+#          address     address
+#          pointers    offset regs     modifiers
+#         ----------- --------------- ------------
+# small:      4            2               4
+# medium:     8            4              12
+# large:     12            8              24
+#
+# +-agu_size small + +# agu_accord --- Enable the accordion stage if operating frequency is critical +-agu_accord true + +# agu_wb_depth --- Write buffer depth +-agu_wb_depth 2 + + +######## DSP --- com.arc.hardware.DSP.1_0 ######## + +# Create DSP +-create com.arc.hardware.DSP.1_0 System.CPUisle.ARCv2EM.DSP + +# dsp_complex --- Enable/disable support for single cycle 16b+16b complex instructions and butterfly operations, else 2-cycle complex instructions only without butterfly support +-dsp_complex true + +# dsp_itu --- Enable/disable support for ITU bit-accurate 1 bit fractional shift before accumulation, else 1-bit fractional shift result after accumulation only +-dsp_itu true + +# dsp_divsqrt --- Enable/disable support for divide and square root operations: DIV(U), REM(U), SQRT +-dsp_divsqrt radix2 + +# dsp_accshift --- Select support for accumulator shift operations: no supported, limited shift support only or full shift support and convergent rounding +-dsp_accshift full + +# dsp_impl --- The datapath components may be inferred from Verilog for better area or optimized using carry-save components for better timing +-dsp_impl optimized + + +######## Interrupt Controller --- com.arc.hardware.Interrupt_Controller.1_0 ######## + +# Create Interrupt Controller +-create com.arc.hardware.Interrupt_Controller.1_0 "System.CPUisle.ARCv2EM.Interrupt Controller" + +# number_of_interrupts --- This is the total number of interrupts available to the core. Some interrupts are allocated statically to a specific interrupt line (for example, timer interrupts). For more information on Interrupt and register-file options, see DesignWare ARCv2 ISA Programmers Reference Manual. +-number_of_interrupts 95 + +# number_of_levels --- Priority levels in the interrupt controller. +-number_of_levels 4 + +# external_interrupts --- This is the total number of interrupt pins available for external system components. This parameter must be less than the total number of interrupts. +-external_interrupts 60 + +# firq_option --- This enables the fast-interrupts option, (priority level 0 interrupts), which uses an alternate register bank (if configured) instead of saving the context to memory. +-firq_option true + + +######## Timer 0 --- com.arc.hardware.Timer_0.1_0 ######## + +# Create Timer 0 +-create com.arc.hardware.Timer_0.1_0 "System.CPUisle.ARCv2EM.Timer 0" + +# timer_0_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 0. +-timer_0_int_level 1 + + +######## Timer 1 --- com.arc.hardware.Timer_1.1_0 ######## + +# Create Timer 1 +-create com.arc.hardware.Timer_1.1_0 "System.CPUisle.ARCv2EM.Timer 1" + +# timer_1_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 1. +-timer_1_int_level 0 + + +######## Watchdog Timer --- com.arc.hardware.Watchdog_Timer.1_0 ######## + +# Create Watchdog Timer +-create com.arc.hardware.Watchdog_Timer.1_0 "System.CPUisle.ARCv2EM.Watchdog Timer" + +# watchdog_size --- Specifies the bit width of the internal counter used within the timer. +-watchdog_size 16 + +# watchdog_clk --- Specifies whether the timer should be driven from a separate clock. 
+-watchdog_clk true + + +######## Data Memory Initiator --- com.arc.hardware.Data_Memory_Initiator.1_0 ######## + +# Create Data Memory Initiator +-create com.arc.hardware.Data_Memory_Initiator.1_0 "System.CPUisle.ARCv2EM.Data Memory Initiator" + +######## Instruction Fetch Queue --- com.arc.hardware.Instruction_Fetch_Queue.1_0 ######## + +# Create Instruction Fetch Queue +-create com.arc.hardware.Instruction_Fetch_Queue.1_0 "System.CPUisle.ARCv2EM.Instruction Fetch Queue" + +# ifqueue_size --- This defines the number of entires in the Instruction Fetch Queue. +-ifqueue_size 4 + +# ifqueue_burst_size --- This sets the burst size for bus data transfers (in 32-bit words). It cannot exceed the number of entries. +-ifqueue_burst_size 2 + + +######## DCCM --- com.arc.hardware.DCCM.1_0 ######## + +# Create DCCM +-create com.arc.hardware.DCCM.1_0 System.CPUisle.ARCv2EM.DCCM + +# dccm_size --- This defines the size of the Data Closely Coupled Memory (DCCM) in bytes +-dccm_size 131072 + +# dccm_base --- Sets the initial memory region assignment for DCCM +-dccm_base 8 + +# dccm_interleave --- Split DCCM into even/odd memory banks. +-dccm_interleave false + +# dccm_prot --- Specifies the type of protection built for the DCCM. +-dccm_prot None + +# dccm_prot_level --- Specifies the level protection. +-dccm_prot_level Data_Only + +# dccm_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the DCCM +-dccm_prot_exceptions true + +# dccm_dmi --- This enables external access through a DMI (direct memory interface) port. +-dccm_dmi true + + +######## ICCM0 --- com.arc.hardware.ICCM0.1_0 ######## + +# Create ICCM0 +-create com.arc.hardware.ICCM0.1_0 System.CPUisle.ARCv2EM.ICCM0 + +# iccm0_size --- This defines the size of ICCM0 in bytes.This ICCM has 0 wait states. +-iccm0_size 262144 + +# iccm0_base --- Sets the initial memory region assignment for ICCM0 +-iccm0_base 2 + +# iccm0_wide --- Creates ICCM0 as 64b memory to reduce accesses. +-iccm0_wide true + +# iccm0_prot --- Specifies the type of protection built for ICCM0. +-iccm0_prot None + +# iccm0_prot_level --- Specifies the level of protection. +-iccm0_prot_level Data_Only + +# iccm0_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the ICCM0 +-iccm0_prot_exceptions true + +# iccm0_dmi --- This enables external access through a DMI (direct memory interface) port. +-iccm0_dmi true + + +######## XY --- com.arc.hardware.XY.1_0 ######## + +# Create XY +-create com.arc.hardware.XY.1_0 System.CPUisle.ARCv2EM.XY + +# xy_config --- XY memory configuration: +# One memory: DCCM only. +# Two memories: DCCM + Y. +# Three memories: DCCM + X + Y. +-xy_config dccm_x_y + +# xy_size --- Size of X and Y memories if included. +# X and Y memories both have the same configured size. +-xy_size 32768 + +# xy_interleave --- Split XY memories into odd/even instances to enable single cycle unaligned access. +-xy_interleave true + +# xy_x_base --- Base region for X memory. All accesses to this region will initiate a transfer on the X memory. +-xy_x_base 12 + +# xy_y_base --- Base region for Y memory. All accesses to this region will initiate a transfer on the Y memory. 
+-xy_y_base 14 + + +######## DMA Controller --- com.arc.hardware.DMA_Controller.1_0 ######## + +# Create DMA Controller +-create com.arc.hardware.DMA_Controller.1_0 "System.CPUisle.ARCv2EM.DMA Controller" + +# dmac_channels --- This options specifies the number of DMA channels implemented in the DMA controller +-dmac_channels 16 + +# dmac_fifo_depth --- This option specifies the DMA transfer FIFO depth in 32b words. +-dmac_fifo_depth 4 + +# dmac_int_config --- None: the DMA controller cannot raise an interrupt +# Single-External: single done and single error interrupt signal for all DMA channels, and the interrupt signals are routed to a port at the top of the EM logical hierarchy +# Multiple-External: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are routed to ports at the top of the EM logical hierarchy +# Single-Internal: single done and single error interrupt signals for all DMA channels, and the interrupt signals are internal to the EM core +# Multiple-Internal: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are internal to the EM core +-dmac_int_config Multiple-Internal + +# dmac_registers --- This option defines the number of DMA channels with their registers located in auxiliary space. +-dmac_registers 16 + +# dmac_mem_if --- This option specifies whether the DMA controller system memory interface is integrated into the existing EM system memory interfaces or has its own interface. +-dmac_mem_if separate + + +######## JTAG Interface --- com.arc.hardware.JTAG_Interface.1_0 ######## + +# Create JTAG Interface +-create com.arc.hardware.JTAG_Interface.1_0 "System.CPUisle.ARCv2EM.JTAG Interface" + +######## Debug Interface --- com.arc.hardware.Debug_Interface.1_0 ######## + +# Create Debug Interface +-create com.arc.hardware.Debug_Interface.1_0 "System.CPUisle.ARCv2EM.Debug Interface" + +######## Actionpoints --- com.arc.hardware.Actionpoints.1_0 ######## + +# Create Actionpoints +-create com.arc.hardware.Actionpoints.1_0 System.CPUisle.ARCv2EM.Actionpoints + +# num_actionpoints --- This is the number of trigger events available. +-num_actionpoints 8 + +# aps_feature --- Selects Actionpoint feature set +-aps_feature min + + +######## SmaRT --- com.arc.hardware.SmaRT.1_0 ######## + +# Create SmaRT +-create com.arc.hardware.SmaRT.1_0 System.CPUisle.ARCv2EM.SmaRT + +# smart_stack_entries --- This specifies the number of entries in the trace buffer. +-smart_stack_entries 64 + +# smart_implementation --- Flip-flop = FF-based design. Memory = memory-based design (provides better density for larger trace buffers). +-smart_implementation memory + + +######## Memory Protection Unit --- com.arc.hardware.Memory_Protection_Unit.1_0 ######## + +# Create Memory Protection Unit +-create com.arc.hardware.Memory_Protection_Unit.1_0 "System.CPUisle.ARCv2EM.Memory Protection Unit" + +# mpu_num_regions --- Number of configured memory regions. +-mpu_num_regions 16 + +# mpu_32b --- Set the minimal region size to be 32 byte instead of 2KB. +-mpu_32b false + + +######## Floating-point unit --- com.arc.hardware.Floating_point_unit.1_0 ######## + +# Create Floating-point unit +-create com.arc.hardware.Floating_point_unit.1_0 "System.CPUisle.ARCv2EM.Floating-point unit" + +# fpu_dp_assist --- This enables double-precision acceleration instructions. +-fpu_dp_assist true + +# fpu_fma_option --- This enables the fused multiply-add & multiply-subtract instructions. 
+-fpu_fma_option true + +# fpu_mas_cycles --- Make mul/add/sub multicycle to achieve a higher clock speed. +-fpu_mas_cycles 2 + +# fpu_div_option --- This enables divide & square-root acceleration +-fpu_div_option true + +# fpu_div_cycles --- "inferred" option infers DSP datapath elements from verilog operators for better area and "optimized" option selects hardware for better timing +-fpu_div_cycles 17 + + +######## Performance Monitor --- com.arc.hardware.Performance_Monitor.1_0 ######## + +# Create Performance Monitor +-create com.arc.hardware.Performance_Monitor.1_0 "System.CPUisle.ARCv2EM.Performance Monitor" + +# pct_counters --- Number of counters for performance monitoring. +-pct_counters 8 + + +######## dsp_trig --- com.arc.hardware.dfss.dsp_trig.1_0 ######## + +# Create dsp_trig +-create com.arc.hardware.dfss.dsp_trig.1_0 System.CPUisle.ARCv2EM.dsp_trig + +# dsp_trig --- Command line option for EIA extension component 'dsp_trig'. +-dsp_trig true + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_4b0 --- com.arc.hardware.dfss.io_gpio_4b0.1_0 ######## + +# Create io_gpio_4b0 +-create com.arc.hardware.dfss.io_gpio_4b0.1_0 System.CPUisle.ARCv2EM.io_gpio_4b0 + +# io_gpio_4b0 --- Command line option for EIA extension component 'io_gpio_4b0'. +-io_gpio_4b0 true + +# io_gpio_4b0_debounce --- Selects the inclusion of Debounce logic +-io_gpio_4b0_debounce 1 + +# io_gpio_4b0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_4b0_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_4b1 --- com.arc.hardware.dfss.io_gpio_4b1.1_0 ######## + +# Create io_gpio_4b1 +-create com.arc.hardware.dfss.io_gpio_4b1.1_0 System.CPUisle.ARCv2EM.io_gpio_4b1 + +# io_gpio_4b1 --- Command line option for EIA extension component 'io_gpio_4b1'. +-io_gpio_4b1 true + +# io_gpio_4b1_debounce --- Selects the inclusion of Debounce logic +-io_gpio_4b1_debounce 1 + +# io_gpio_4b1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_4b1_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_4b2 --- com.arc.hardware.dfss.io_gpio_4b2.1_0 ######## + +# Create io_gpio_4b2 +-create com.arc.hardware.dfss.io_gpio_4b2.1_0 System.CPUisle.ARCv2EM.io_gpio_4b2 + +# io_gpio_4b2 --- Command line option for EIA extension component 'io_gpio_4b2'. +-io_gpio_4b2 true + +# io_gpio_4b2_debounce --- Selects the inclusion of Debounce logic +-io_gpio_4b2_debounce 1 + +# io_gpio_4b2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_4b2_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_8b0 --- com.arc.hardware.dfss.io_gpio_8b0.1_0 ######## + +# Create io_gpio_8b0 +-create com.arc.hardware.dfss.io_gpio_8b0.1_0 System.CPUisle.ARCv2EM.io_gpio_8b0 + +# io_gpio_8b0 --- Command line option for EIA extension component 'io_gpio_8b0'. +-io_gpio_8b0 true + +# io_gpio_8b0_debounce --- Selects the inclusion of Debounce logic +-io_gpio_8b0_debounce 1 + +# io_gpio_8b0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_8b0_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_8b1 --- com.arc.hardware.dfss.io_gpio_8b1.1_0 ######## + +# Create io_gpio_8b1 +-create com.arc.hardware.dfss.io_gpio_8b1.1_0 System.CPUisle.ARCv2EM.io_gpio_8b1 + +# io_gpio_8b1 --- Command line option for EIA extension component 'io_gpio_8b1'. +-io_gpio_8b1 true + +# io_gpio_8b1_debounce --- Selects the inclusion of Debounce logic +-io_gpio_8b1_debounce 1 + +# io_gpio_8b1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_8b1_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_8b2 --- com.arc.hardware.dfss.io_gpio_8b2.1_0 ######## + +# Create io_gpio_8b2 +-create com.arc.hardware.dfss.io_gpio_8b2.1_0 System.CPUisle.ARCv2EM.io_gpio_8b2 + +# io_gpio_8b2 --- Command line option for EIA extension component 'io_gpio_8b2'. +-io_gpio_8b2 true + +# io_gpio_8b2_debounce --- Selects the inclusion of Debounce logic +-io_gpio_8b2_debounce 1 + +# io_gpio_8b2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_8b2_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_gpio_8b3 --- com.arc.hardware.dfss.io_gpio_8b3.1_0 ######## + +# Create io_gpio_8b3 +-create com.arc.hardware.dfss.io_gpio_8b3.1_0 System.CPUisle.ARCv2EM.io_gpio_8b3 + +# io_gpio_8b3 --- Command line option for EIA extension component 'io_gpio_8b3'. +-io_gpio_8b3 true + +# io_gpio_8b3_debounce --- Selects the inclusion of Debounce logic +-io_gpio_8b3_debounce 1 + +# io_gpio_8b3_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal +-io_gpio_8b3_readback_sync 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_mst0 --- com.arc.hardware.dfss.io_i2c_mst0.1_0 ######## + +# Create io_i2c_mst0 +-create com.arc.hardware.dfss.io_i2c_mst0.1_0 System.CPUisle.ARCv2EM.io_i2c_mst0 + +# io_i2c_mst0 --- Command line option for APEX extension component 'io_i2c_mst0'. +-io_i2c_mst0 true + +# io_i2c_mst0_fs --- RX/TX FIFO size +-io_i2c_mst0_fs 16 + +# io_i2c_mst0_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst0_dma_support None + +# io_i2c_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst0_cdc_included 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_mst1 --- com.arc.hardware.dfss.io_i2c_mst1.1_0 ######## + +# Create io_i2c_mst1 +-create com.arc.hardware.dfss.io_i2c_mst1.1_0 System.CPUisle.ARCv2EM.io_i2c_mst1 + +# io_i2c_mst1 --- Command line option for APEX extension component 'io_i2c_mst1'. +-io_i2c_mst1 true + +# io_i2c_mst1_fs --- RX/TX FIFO size +-io_i2c_mst1_fs 16 + +# io_i2c_mst1_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst1_dma_support None + +# io_i2c_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst1_cdc_included 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_i2c_mst2 --- com.arc.hardware.dfss.io_i2c_mst2.1_0 ######## + +# Create io_i2c_mst2 +-create com.arc.hardware.dfss.io_i2c_mst2.1_0 System.CPUisle.ARCv2EM.io_i2c_mst2 + +# io_i2c_mst2 --- Command line option for APEX extension component 'io_i2c_mst2'. +-io_i2c_mst2 true + +# io_i2c_mst2_fs --- RX/TX FIFO size +-io_i2c_mst2_fs 16 + +# io_i2c_mst2_dma_support --- Specifies whether the DMA handshake interface is included +-io_i2c_mst2_dma_support None + +# io_i2c_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. +-io_i2c_mst2_cdc_included 1 + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst0 --- com.arc.hardware.dfss.io_spi_mst0.1_0 ######## + +# Create io_spi_mst0 +-create com.arc.hardware.dfss.io_spi_mst0.1_0 System.CPUisle.ARCv2EM.io_spi_mst0 + +# io_spi_mst0 --- Command line option for APEX extension component 'io_spi_mst0'. +-io_spi_mst0 true + +# io_spi_mst0_fz --- RX/TX FIFO depth +-io_spi_mst0_fs 16 + +# io_spi_mst0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst0_max_xfer_size 16 + +# io_spi_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst0_cdc_included 1 + +# io_spi_mst0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst0_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst1 --- com.arc.hardware.dfss.io_spi_mst1.1_0 ######## + +# Create io_spi_mst1 +-create com.arc.hardware.dfss.io_spi_mst1.1_0 System.CPUisle.ARCv2EM.io_spi_mst1 + +# io_spi_mst1 --- Command line option for APEX extension component 'io_spi_mst1'. +-io_spi_mst1 true + +# io_spi_mst1_fz --- RX/TX FIFO depth +-io_spi_mst1_fs 16 + +# io_spi_mst1_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst1_max_xfer_size 16 + +# io_spi_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst1_cdc_included 1 + +# io_spi_mst1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst1_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_mst2 --- com.arc.hardware.dfss.io_spi_mst2.1_0 ######## + +# Create io_spi_mst2 +-create com.arc.hardware.dfss.io_spi_mst2.1_0 System.CPUisle.ARCv2EM.io_spi_mst2 + +# io_spi_mst2 --- Command line option for APEX extension component 'io_spi_mst2'. +-io_spi_mst2 true + +# io_spi_mst2_fz --- RX/TX FIFO depth +-io_spi_mst2_fs 16 + +# io_spi_mst2_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_mst2_max_xfer_size 16 + +# io_spi_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. +-io_spi_mst2_cdc_included 1 + +# io_spi_mst2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_mst2_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_spi_slv0 --- com.arc.hardware.dfss.io_spi_slv0.1_0 ######## + +# Create io_spi_slv0 +-create com.arc.hardware.dfss.io_spi_slv0.1_0 System.CPUisle.ARCv2EM.io_spi_slv0 + +# io_spi_slv0 --- Command line option for APEX extension component 'io_spi_slv0'. +-io_spi_slv0 true + +# io_spi_slv0_fz --- RX/TX FIFO depth +-io_spi_slv0_fs 16 + +# io_spi_slv0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. +-io_spi_slv0_max_xfer_size 16 + +# io_spi_slv0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_spi_slv0_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart0 --- com.arc.hardware.dfss.io_uart0.1_0 ######## + +# Create io_uart0 +-create com.arc.hardware.dfss.io_uart0.1_0 System.CPUisle.ARCv2EM.io_uart0 + +# io_uart0 --- Command line option for EIA extension component 'io_uart0'. +-io_uart0 true + +# io_uart0_fifo_mode --- Set the UART FIFO mode +-io_uart0_fifo_mode 16 + +# io_uart0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart0_dma_support None + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart1 --- com.arc.hardware.dfss.io_uart1.1_0 ######## + +# Create io_uart1 +-create com.arc.hardware.dfss.io_uart1.1_0 System.CPUisle.ARCv2EM.io_uart1 + +# io_uart1 --- Command line option for EIA extension component 'io_uart1'. +-io_uart1 true + +# io_uart1_fifo_mode --- Set the UART FIFO mode +-io_uart1_fifo_mode 16 + +# io_uart1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart1_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart2 --- com.arc.hardware.dfss.io_uart2.1_0 ######## + +# Create io_uart2 +-create com.arc.hardware.dfss.io_uart2.1_0 System.CPUisle.ARCv2EM.io_uart2 + +# io_uart2 --- Command line option for EIA extension component 'io_uart2'. +-io_uart2 true + +# io_uart2_fifo_mode --- Set the UART FIFO mode +-io_uart2_fifo_mode 16 + +# io_uart2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart2_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_uart3 --- com.arc.hardware.dfss.io_uart3.1_0 ######## + +# Create io_uart3 +-create com.arc.hardware.dfss.io_uart3.1_0 System.CPUisle.ARCv2EM.io_uart3 + +# io_uart3 --- Command line option for EIA extension component 'io_uart3'. +-io_uart3 true + +# io_uart3_fifo_mode --- Set the UART FIFO mode +-io_uart3_fifo_mode 16 + +# io_uart3_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. +-io_uart3_dma_support Aux-Based + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_creg_mst0 --- com.arc.hardware.dfss.io_creg_mst0.1_0 ######## + +# Create io_creg_mst0 +-create com.arc.hardware.dfss.io_creg_mst0.1_0 System.CPUisle.ARCv2EM.io_creg_mst0 + +# io_creg_mst0 --- Command line option for EIA extension component 'io_creg_mst0'. +-io_creg_mst0 true + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## io_creg_slv0 --- com.arc.hardware.dfss.io_creg_slv0.1_0 ######## + +# Create io_creg_slv0 +-create com.arc.hardware.dfss.io_creg_slv0.1_0 System.CPUisle.ARCv2EM.io_creg_slv0 + +# io_creg_slv0 --- Command line option for EIA extension component 'io_creg_slv0'. +-io_creg_slv0 true + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## subsys_bcr --- com.arc.hardware.dfss.subsys_bcr.1_0 ######## + +# Create subsys_bcr +-create com.arc.hardware.dfss.subsys_bcr.1_0 System.CPUisle.ARCv2EM.subsys_bcr + +# assign_xpubit --- +# +# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. +#

+# By default an extension is not assigned a bit in this register. This means the extension is always available.
+#

+# If you wish to assign an XPU bit number, select this option. +# +# +-assign_xpubit false + +# xpubit --- +# The XPU bit number for this extension. +# +-xpubit 0 + + +######## subsys_infra --- com.arc.hardware.dfss.subsys_infra.1_0 ######## + +# Create subsys_infra +-create com.arc.hardware.dfss.subsys_infra.1_0 System.subsys_infra + +# subsys_infra --- Command line option for EIA glue logic. +-subsys_infra true + +# internal_interrupt --- Connect the IO interrupts internally +-internal_interrupt true + +# internal_dma_handshake --- Connect the DMA handshake signals internally +-internal_dma_handshake true + + +######## ARConnect --- com.arc.hardware.ARConnect.1_0 ######## + +# Create ARConnect +-create com.arc.hardware.ARConnect.1_0 System.ARConnect + +# mcip_def_div2ref --- This specifies the clock division factor at reset. It is used for mss clock controller to generate ARConnect clock, and the value N means ARConnect is running at (1/N) x ref_clk. +-mcip_def_div2ref 1 + +# mcip_has_intrpt --- This specifies whether the Inter-core Interrupt Unit exists +-mcip_has_intrpt false + +# mcip_has_sema --- This specifies whether the Inter-core Semaphore Unit exists +-mcip_has_sema false + +# mcip_sema_num --- This specifies the number of semaphores in the Inter-core Semaphores Unit +-mcip_sema_num 16 + +# mcip_has_msg_sram --- This specifies whether the Inter-core Message Unit exists +-mcip_has_msg_sram false + +# mcip_msg_sram_size --- This specifies the bytes of SRAM in the Inter-core Message Unit +-mcip_msg_sram_size 512 + +# mcip_msg_1cycle --- True: The access path to message SRAM is 1 clock cycle; False: The access path to message SRAM 1.5 cycles. Note: The 1.5 cycles path use clock negetive edge for SRAM, but can acheive higher frequency. No performance difference caused by the value of this option +-mcip_msg_1cycle false + +# mcip_has_debug --- This specifies whether the Inter-core Debug Unit exists +-mcip_has_debug false + +# mcip_has_grtc --- This specifies whether the Global Real-Time Counter Unit exists +-mcip_has_grtc false + +# mcip_has_pmu --- This specifies whether the external Power Management Unit exists +-mcip_has_pmu true + +# mcip_power_domains --- This specifies whether the ARConnect Power Domain Management Unit exists +-mcip_power_domains true + +# mcip_llm_size --- This specifies the KBytes of SRAM in the Low Latency Memory Unit +-mcip_llm_size 32 + +# mcip_llm_base --- This specifies the default memory region of Low Latency Memory Unit +-mcip_llm_base 2 + +# mcip_llm_ecc --- This specifies the ECC mode of SRAM in Low Latency Memory Unit. none = No checking; parity = Parity only; SECDED = single-error correction and double-error detection (SECDED) +-mcip_llm_ecc SECDED + +# mcip_idu_cirq_num --- This specifies the number of common interrupts supported by IDU +-mcip_idu_cirq_num 4 + +# mcip_bsu_dbw --- This specifies the data bus width of Bus Slave Unit +-mcip_bsu_dbw 64 + +# mcip_bsu_type --- This specifies the bus protocol of Bus Slave Unit +-mcip_bsu_type AXI + + +]]> + + + + + + + + + + + + + + + ICCM0 + + GROUP: { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32768): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} + } > DCCM + GROUP: { + .Xdata? : {} + } > XCCM + GROUP: { + .Ydata? 
: {} + } > YCCM + GROUP BIND(0x0): { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:684): {} = FILL(0xa5a5a5a5,4) + } + } + +]]> + + + + + + 0x07, sub_opcode => 0x1E , latency_cycles => 8) + +// User extension instruction - dsp_sin +extern long dsp_sin(long); +#pragma intrinsic(dsp_sin, opcode => 0x07, sub_opcode => 0x1F , latency_cycles => 8) + +// User extension instruction - dsp_tan +extern long dsp_tan(long); +#pragma intrinsic(dsp_tan, opcode => 0x07, sub_opcode => 0x22 , latency_cycles => 11) + +// User extension instruction - dsp_acos +extern long dsp_acos(long); +#pragma intrinsic(dsp_acos, opcode => 0x07, sub_opcode => 0x23 , latency_cycles => 31) + +// User extension instruction - dsp_asin +extern long dsp_asin(long); +#pragma intrinsic(dsp_asin, opcode => 0x07, sub_opcode => 0x24 , latency_cycles => 31) + +// User extension instruction - dsp_atan +extern long dsp_atan(long); +#pragma intrinsic(dsp_atan, opcode => 0x07, sub_opcode => 0x25 , latency_cycles => 13) + +// User extension instruction - dsp_sqrt +extern long dsp_sqrt(long); +#pragma intrinsic(dsp_sqrt, opcode => 0x07, sub_opcode => 0x20 , latency_cycles => 31) + +// User extension instruction - dsp_sqrt15 +extern long dsp_sqrt15(long); +#pragma intrinsic(dsp_sqrt15, opcode => 0x07, sub_opcode => 0x21 , latency_cycles => 15) + +#define APEX_COM_ARC_HARDWARE_DFSS_DSP_TRIG_PRESENT 1 +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B0_IO_GPIO_4B0_PRESENT 1 + +// User extension aux register io_gpio_4b0_debounce +#define AR_IO_GPIO_4B0_DEBOUNCE 0x80017c48 +#pragma Aux_register(0x80017c48, name=>"io_gpio_4b0_debounce") + +// User extension aux register io_gpio_4b0_clken +#define AR_IO_GPIO_4B0_CLKEN 0x80017c80 +#pragma Aux_register(0x80017c80, name=>"io_gpio_4b0_clken") + +// User extension aux register io_gpio_4b0_swporta_dr +#define AR_IO_GPIO_4B0_SWPORTA_DR 0x80017c00 +#pragma Aux_register(0x80017c00, name=>"io_gpio_4b0_swporta_dr") + +// User extension aux register io_gpio_4b0_swporta_ddr +#define AR_IO_GPIO_4B0_SWPORTA_DDR 0x80017c04 +#pragma Aux_register(0x80017c04, name=>"io_gpio_4b0_swporta_ddr") + +// User extension aux register io_gpio_4b0_inten +#define AR_IO_GPIO_4B0_INTEN 0x80017c30 +#pragma Aux_register(0x80017c30, name=>"io_gpio_4b0_inten") + +// User extension aux register io_gpio_4b0_intmask +#define AR_IO_GPIO_4B0_INTMASK 0x80017c34 +#pragma Aux_register(0x80017c34, name=>"io_gpio_4b0_intmask") + +// User extension aux register io_gpio_4b0_inttype_level +#define AR_IO_GPIO_4B0_INTTYPE_LEVEL 0x80017c38 +#pragma Aux_register(0x80017c38, name=>"io_gpio_4b0_inttype_level") + +// User extension aux register io_gpio_4b0_int_polarity +#define AR_IO_GPIO_4B0_INT_POLARITY 0x80017c3c +#pragma Aux_register(0x80017c3c, name=>"io_gpio_4b0_int_polarity") + +// User extension aux register io_gpio_4b0_intstatus +#define AR_IO_GPIO_4B0_INTSTATUS 0x80017c40 +#pragma Aux_register(0x80017c40, name=>"io_gpio_4b0_intstatus") + +// User extension aux register io_gpio_4b0_raw_intstatus +#define AR_IO_GPIO_4B0_RAW_INTSTATUS 0x80017c44 +#pragma Aux_register(0x80017c44, name=>"io_gpio_4b0_raw_intstatus") + +// User extension aux register io_gpio_4b0_porta_eoi +#define AR_IO_GPIO_4B0_PORTA_EOI 0x80017c4c +#pragma Aux_register(0x80017c4c, name=>"io_gpio_4b0_porta_eoi") + +// User extension aux register io_gpio_4b0_ext_porta +#define AR_IO_GPIO_4B0_EXT_PORTA 0x80017c50 +#pragma Aux_register(0x80017c50, name=>"io_gpio_4b0_ext_porta") + +// User extension aux register io_gpio_4b0_ls_sync +#define AR_IO_GPIO_4B0_LS_SYNC 0x80017c60 +#pragma 
Aux_register(0x80017c60, name=>"io_gpio_4b0_ls_sync") + +// User extension aux register io_gpio_4b0_int_bothedge +#define AR_IO_GPIO_4B0_INT_BOTHEDGE 0x80017c68 +#pragma Aux_register(0x80017c68, name=>"io_gpio_4b0_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B1_IO_GPIO_4B1_PRESENT 1 + +// User extension aux register io_gpio_4b1_debounce +#define AR_IO_GPIO_4B1_DEBOUNCE 0x80017d48 +#pragma Aux_register(0x80017d48, name=>"io_gpio_4b1_debounce") + +// User extension aux register io_gpio_4b1_clken +#define AR_IO_GPIO_4B1_CLKEN 0x80017d80 +#pragma Aux_register(0x80017d80, name=>"io_gpio_4b1_clken") + +// User extension aux register io_gpio_4b1_swporta_dr +#define AR_IO_GPIO_4B1_SWPORTA_DR 0x80017d00 +#pragma Aux_register(0x80017d00, name=>"io_gpio_4b1_swporta_dr") + +// User extension aux register io_gpio_4b1_swporta_ddr +#define AR_IO_GPIO_4B1_SWPORTA_DDR 0x80017d04 +#pragma Aux_register(0x80017d04, name=>"io_gpio_4b1_swporta_ddr") + +// User extension aux register io_gpio_4b1_inten +#define AR_IO_GPIO_4B1_INTEN 0x80017d30 +#pragma Aux_register(0x80017d30, name=>"io_gpio_4b1_inten") + +// User extension aux register io_gpio_4b1_intmask +#define AR_IO_GPIO_4B1_INTMASK 0x80017d34 +#pragma Aux_register(0x80017d34, name=>"io_gpio_4b1_intmask") + +// User extension aux register io_gpio_4b1_inttype_level +#define AR_IO_GPIO_4B1_INTTYPE_LEVEL 0x80017d38 +#pragma Aux_register(0x80017d38, name=>"io_gpio_4b1_inttype_level") + +// User extension aux register io_gpio_4b1_int_polarity +#define AR_IO_GPIO_4B1_INT_POLARITY 0x80017d3c +#pragma Aux_register(0x80017d3c, name=>"io_gpio_4b1_int_polarity") + +// User extension aux register io_gpio_4b1_intstatus +#define AR_IO_GPIO_4B1_INTSTATUS 0x80017d40 +#pragma Aux_register(0x80017d40, name=>"io_gpio_4b1_intstatus") + +// User extension aux register io_gpio_4b1_raw_intstatus +#define AR_IO_GPIO_4B1_RAW_INTSTATUS 0x80017d44 +#pragma Aux_register(0x80017d44, name=>"io_gpio_4b1_raw_intstatus") + +// User extension aux register io_gpio_4b1_porta_eoi +#define AR_IO_GPIO_4B1_PORTA_EOI 0x80017d4c +#pragma Aux_register(0x80017d4c, name=>"io_gpio_4b1_porta_eoi") + +// User extension aux register io_gpio_4b1_ext_porta +#define AR_IO_GPIO_4B1_EXT_PORTA 0x80017d50 +#pragma Aux_register(0x80017d50, name=>"io_gpio_4b1_ext_porta") + +// User extension aux register io_gpio_4b1_ls_sync +#define AR_IO_GPIO_4B1_LS_SYNC 0x80017d60 +#pragma Aux_register(0x80017d60, name=>"io_gpio_4b1_ls_sync") + +// User extension aux register io_gpio_4b1_int_bothedge +#define AR_IO_GPIO_4B1_INT_BOTHEDGE 0x80017d68 +#pragma Aux_register(0x80017d68, name=>"io_gpio_4b1_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B2_IO_GPIO_4B2_PRESENT 1 + +// User extension aux register io_gpio_4b2_debounce +#define AR_IO_GPIO_4B2_DEBOUNCE 0x80017e48 +#pragma Aux_register(0x80017e48, name=>"io_gpio_4b2_debounce") + +// User extension aux register io_gpio_4b2_clken +#define AR_IO_GPIO_4B2_CLKEN 0x80017e80 +#pragma Aux_register(0x80017e80, name=>"io_gpio_4b2_clken") + +// User extension aux register io_gpio_4b2_swporta_dr +#define AR_IO_GPIO_4B2_SWPORTA_DR 0x80017e00 +#pragma Aux_register(0x80017e00, name=>"io_gpio_4b2_swporta_dr") + +// User extension aux register io_gpio_4b2_swporta_ddr +#define AR_IO_GPIO_4B2_SWPORTA_DDR 0x80017e04 +#pragma Aux_register(0x80017e04, name=>"io_gpio_4b2_swporta_ddr") + +// User extension aux register io_gpio_4b2_inten +#define AR_IO_GPIO_4B2_INTEN 0x80017e30 +#pragma Aux_register(0x80017e30, name=>"io_gpio_4b2_inten") + +// User extension aux register 
io_gpio_4b2_intmask +#define AR_IO_GPIO_4B2_INTMASK 0x80017e34 +#pragma Aux_register(0x80017e34, name=>"io_gpio_4b2_intmask") + +// User extension aux register io_gpio_4b2_inttype_level +#define AR_IO_GPIO_4B2_INTTYPE_LEVEL 0x80017e38 +#pragma Aux_register(0x80017e38, name=>"io_gpio_4b2_inttype_level") + +// User extension aux register io_gpio_4b2_int_polarity +#define AR_IO_GPIO_4B2_INT_POLARITY 0x80017e3c +#pragma Aux_register(0x80017e3c, name=>"io_gpio_4b2_int_polarity") + +// User extension aux register io_gpio_4b2_intstatus +#define AR_IO_GPIO_4B2_INTSTATUS 0x80017e40 +#pragma Aux_register(0x80017e40, name=>"io_gpio_4b2_intstatus") + +// User extension aux register io_gpio_4b2_raw_intstatus +#define AR_IO_GPIO_4B2_RAW_INTSTATUS 0x80017e44 +#pragma Aux_register(0x80017e44, name=>"io_gpio_4b2_raw_intstatus") + +// User extension aux register io_gpio_4b2_porta_eoi +#define AR_IO_GPIO_4B2_PORTA_EOI 0x80017e4c +#pragma Aux_register(0x80017e4c, name=>"io_gpio_4b2_porta_eoi") + +// User extension aux register io_gpio_4b2_ext_porta +#define AR_IO_GPIO_4B2_EXT_PORTA 0x80017e50 +#pragma Aux_register(0x80017e50, name=>"io_gpio_4b2_ext_porta") + +// User extension aux register io_gpio_4b2_ls_sync +#define AR_IO_GPIO_4B2_LS_SYNC 0x80017e60 +#pragma Aux_register(0x80017e60, name=>"io_gpio_4b2_ls_sync") + +// User extension aux register io_gpio_4b2_int_bothedge +#define AR_IO_GPIO_4B2_INT_BOTHEDGE 0x80017e68 +#pragma Aux_register(0x80017e68, name=>"io_gpio_4b2_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B0_IO_GPIO_8B0_PRESENT 1 + +// User extension aux register io_gpio_8b0_debounce +#define AR_IO_GPIO_8B0_DEBOUNCE 0x80017848 +#pragma Aux_register(0x80017848, name=>"io_gpio_8b0_debounce") + +// User extension aux register io_gpio_8b0_clken +#define AR_IO_GPIO_8B0_CLKEN 0x80017880 +#pragma Aux_register(0x80017880, name=>"io_gpio_8b0_clken") + +// User extension aux register io_gpio_8b0_swporta_dr +#define AR_IO_GPIO_8B0_SWPORTA_DR 0x80017800 +#pragma Aux_register(0x80017800, name=>"io_gpio_8b0_swporta_dr") + +// User extension aux register io_gpio_8b0_swporta_ddr +#define AR_IO_GPIO_8B0_SWPORTA_DDR 0x80017804 +#pragma Aux_register(0x80017804, name=>"io_gpio_8b0_swporta_ddr") + +// User extension aux register io_gpio_8b0_inten +#define AR_IO_GPIO_8B0_INTEN 0x80017830 +#pragma Aux_register(0x80017830, name=>"io_gpio_8b0_inten") + +// User extension aux register io_gpio_8b0_intmask +#define AR_IO_GPIO_8B0_INTMASK 0x80017834 +#pragma Aux_register(0x80017834, name=>"io_gpio_8b0_intmask") + +// User extension aux register io_gpio_8b0_inttype_level +#define AR_IO_GPIO_8B0_INTTYPE_LEVEL 0x80017838 +#pragma Aux_register(0x80017838, name=>"io_gpio_8b0_inttype_level") + +// User extension aux register io_gpio_8b0_int_polarity +#define AR_IO_GPIO_8B0_INT_POLARITY 0x8001783c +#pragma Aux_register(0x8001783c, name=>"io_gpio_8b0_int_polarity") + +// User extension aux register io_gpio_8b0_intstatus +#define AR_IO_GPIO_8B0_INTSTATUS 0x80017840 +#pragma Aux_register(0x80017840, name=>"io_gpio_8b0_intstatus") + +// User extension aux register io_gpio_8b0_raw_intstatus +#define AR_IO_GPIO_8B0_RAW_INTSTATUS 0x80017844 +#pragma Aux_register(0x80017844, name=>"io_gpio_8b0_raw_intstatus") + +// User extension aux register io_gpio_8b0_porta_eoi +#define AR_IO_GPIO_8B0_PORTA_EOI 0x8001784c +#pragma Aux_register(0x8001784c, name=>"io_gpio_8b0_porta_eoi") + +// User extension aux register io_gpio_8b0_ext_porta +#define AR_IO_GPIO_8B0_EXT_PORTA 0x80017850 +#pragma Aux_register(0x80017850, 
name=>"io_gpio_8b0_ext_porta") + +// User extension aux register io_gpio_8b0_ls_sync +#define AR_IO_GPIO_8B0_LS_SYNC 0x80017860 +#pragma Aux_register(0x80017860, name=>"io_gpio_8b0_ls_sync") + +// User extension aux register io_gpio_8b0_int_bothedge +#define AR_IO_GPIO_8B0_INT_BOTHEDGE 0x80017868 +#pragma Aux_register(0x80017868, name=>"io_gpio_8b0_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B1_IO_GPIO_8B1_PRESENT 1 + +// User extension aux register io_gpio_8b1_debounce +#define AR_IO_GPIO_8B1_DEBOUNCE 0x80017948 +#pragma Aux_register(0x80017948, name=>"io_gpio_8b1_debounce") + +// User extension aux register io_gpio_8b1_clken +#define AR_IO_GPIO_8B1_CLKEN 0x80017980 +#pragma Aux_register(0x80017980, name=>"io_gpio_8b1_clken") + +// User extension aux register io_gpio_8b1_swporta_dr +#define AR_IO_GPIO_8B1_SWPORTA_DR 0x80017900 +#pragma Aux_register(0x80017900, name=>"io_gpio_8b1_swporta_dr") + +// User extension aux register io_gpio_8b1_swporta_ddr +#define AR_IO_GPIO_8B1_SWPORTA_DDR 0x80017904 +#pragma Aux_register(0x80017904, name=>"io_gpio_8b1_swporta_ddr") + +// User extension aux register io_gpio_8b1_inten +#define AR_IO_GPIO_8B1_INTEN 0x80017930 +#pragma Aux_register(0x80017930, name=>"io_gpio_8b1_inten") + +// User extension aux register io_gpio_8b1_intmask +#define AR_IO_GPIO_8B1_INTMASK 0x80017934 +#pragma Aux_register(0x80017934, name=>"io_gpio_8b1_intmask") + +// User extension aux register io_gpio_8b1_inttype_level +#define AR_IO_GPIO_8B1_INTTYPE_LEVEL 0x80017938 +#pragma Aux_register(0x80017938, name=>"io_gpio_8b1_inttype_level") + +// User extension aux register io_gpio_8b1_int_polarity +#define AR_IO_GPIO_8B1_INT_POLARITY 0x8001793c +#pragma Aux_register(0x8001793c, name=>"io_gpio_8b1_int_polarity") + +// User extension aux register io_gpio_8b1_intstatus +#define AR_IO_GPIO_8B1_INTSTATUS 0x80017940 +#pragma Aux_register(0x80017940, name=>"io_gpio_8b1_intstatus") + +// User extension aux register io_gpio_8b1_raw_intstatus +#define AR_IO_GPIO_8B1_RAW_INTSTATUS 0x80017944 +#pragma Aux_register(0x80017944, name=>"io_gpio_8b1_raw_intstatus") + +// User extension aux register io_gpio_8b1_porta_eoi +#define AR_IO_GPIO_8B1_PORTA_EOI 0x8001794c +#pragma Aux_register(0x8001794c, name=>"io_gpio_8b1_porta_eoi") + +// User extension aux register io_gpio_8b1_ext_porta +#define AR_IO_GPIO_8B1_EXT_PORTA 0x80017950 +#pragma Aux_register(0x80017950, name=>"io_gpio_8b1_ext_porta") + +// User extension aux register io_gpio_8b1_ls_sync +#define AR_IO_GPIO_8B1_LS_SYNC 0x80017960 +#pragma Aux_register(0x80017960, name=>"io_gpio_8b1_ls_sync") + +// User extension aux register io_gpio_8b1_int_bothedge +#define AR_IO_GPIO_8B1_INT_BOTHEDGE 0x80017968 +#pragma Aux_register(0x80017968, name=>"io_gpio_8b1_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B2_IO_GPIO_8B2_PRESENT 1 + +// User extension aux register io_gpio_8b2_debounce +#define AR_IO_GPIO_8B2_DEBOUNCE 0x80017a48 +#pragma Aux_register(0x80017a48, name=>"io_gpio_8b2_debounce") + +// User extension aux register io_gpio_8b2_clken +#define AR_IO_GPIO_8B2_CLKEN 0x80017a80 +#pragma Aux_register(0x80017a80, name=>"io_gpio_8b2_clken") + +// User extension aux register io_gpio_8b2_swporta_dr +#define AR_IO_GPIO_8B2_SWPORTA_DR 0x80017a00 +#pragma Aux_register(0x80017a00, name=>"io_gpio_8b2_swporta_dr") + +// User extension aux register io_gpio_8b2_swporta_ddr +#define AR_IO_GPIO_8B2_SWPORTA_DDR 0x80017a04 +#pragma Aux_register(0x80017a04, name=>"io_gpio_8b2_swporta_ddr") + +// User extension aux register io_gpio_8b2_inten 
+#define AR_IO_GPIO_8B2_INTEN 0x80017a30 +#pragma Aux_register(0x80017a30, name=>"io_gpio_8b2_inten") + +// User extension aux register io_gpio_8b2_intmask +#define AR_IO_GPIO_8B2_INTMASK 0x80017a34 +#pragma Aux_register(0x80017a34, name=>"io_gpio_8b2_intmask") + +// User extension aux register io_gpio_8b2_inttype_level +#define AR_IO_GPIO_8B2_INTTYPE_LEVEL 0x80017a38 +#pragma Aux_register(0x80017a38, name=>"io_gpio_8b2_inttype_level") + +// User extension aux register io_gpio_8b2_int_polarity +#define AR_IO_GPIO_8B2_INT_POLARITY 0x80017a3c +#pragma Aux_register(0x80017a3c, name=>"io_gpio_8b2_int_polarity") + +// User extension aux register io_gpio_8b2_intstatus +#define AR_IO_GPIO_8B2_INTSTATUS 0x80017a40 +#pragma Aux_register(0x80017a40, name=>"io_gpio_8b2_intstatus") + +// User extension aux register io_gpio_8b2_raw_intstatus +#define AR_IO_GPIO_8B2_RAW_INTSTATUS 0x80017a44 +#pragma Aux_register(0x80017a44, name=>"io_gpio_8b2_raw_intstatus") + +// User extension aux register io_gpio_8b2_porta_eoi +#define AR_IO_GPIO_8B2_PORTA_EOI 0x80017a4c +#pragma Aux_register(0x80017a4c, name=>"io_gpio_8b2_porta_eoi") + +// User extension aux register io_gpio_8b2_ext_porta +#define AR_IO_GPIO_8B2_EXT_PORTA 0x80017a50 +#pragma Aux_register(0x80017a50, name=>"io_gpio_8b2_ext_porta") + +// User extension aux register io_gpio_8b2_ls_sync +#define AR_IO_GPIO_8B2_LS_SYNC 0x80017a60 +#pragma Aux_register(0x80017a60, name=>"io_gpio_8b2_ls_sync") + +// User extension aux register io_gpio_8b2_int_bothedge +#define AR_IO_GPIO_8B2_INT_BOTHEDGE 0x80017a68 +#pragma Aux_register(0x80017a68, name=>"io_gpio_8b2_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B3_IO_GPIO_8B3_PRESENT 1 + +// User extension aux register io_gpio_8b3_debounce +#define AR_IO_GPIO_8B3_DEBOUNCE 0x80017b48 +#pragma Aux_register(0x80017b48, name=>"io_gpio_8b3_debounce") + +// User extension aux register io_gpio_8b3_clken +#define AR_IO_GPIO_8B3_CLKEN 0x80017b80 +#pragma Aux_register(0x80017b80, name=>"io_gpio_8b3_clken") + +// User extension aux register io_gpio_8b3_swporta_dr +#define AR_IO_GPIO_8B3_SWPORTA_DR 0x80017b00 +#pragma Aux_register(0x80017b00, name=>"io_gpio_8b3_swporta_dr") + +// User extension aux register io_gpio_8b3_swporta_ddr +#define AR_IO_GPIO_8B3_SWPORTA_DDR 0x80017b04 +#pragma Aux_register(0x80017b04, name=>"io_gpio_8b3_swporta_ddr") + +// User extension aux register io_gpio_8b3_inten +#define AR_IO_GPIO_8B3_INTEN 0x80017b30 +#pragma Aux_register(0x80017b30, name=>"io_gpio_8b3_inten") + +// User extension aux register io_gpio_8b3_intmask +#define AR_IO_GPIO_8B3_INTMASK 0x80017b34 +#pragma Aux_register(0x80017b34, name=>"io_gpio_8b3_intmask") + +// User extension aux register io_gpio_8b3_inttype_level +#define AR_IO_GPIO_8B3_INTTYPE_LEVEL 0x80017b38 +#pragma Aux_register(0x80017b38, name=>"io_gpio_8b3_inttype_level") + +// User extension aux register io_gpio_8b3_int_polarity +#define AR_IO_GPIO_8B3_INT_POLARITY 0x80017b3c +#pragma Aux_register(0x80017b3c, name=>"io_gpio_8b3_int_polarity") + +// User extension aux register io_gpio_8b3_intstatus +#define AR_IO_GPIO_8B3_INTSTATUS 0x80017b40 +#pragma Aux_register(0x80017b40, name=>"io_gpio_8b3_intstatus") + +// User extension aux register io_gpio_8b3_raw_intstatus +#define AR_IO_GPIO_8B3_RAW_INTSTATUS 0x80017b44 +#pragma Aux_register(0x80017b44, name=>"io_gpio_8b3_raw_intstatus") + +// User extension aux register io_gpio_8b3_porta_eoi +#define AR_IO_GPIO_8B3_PORTA_EOI 0x80017b4c +#pragma Aux_register(0x80017b4c, name=>"io_gpio_8b3_porta_eoi") + +// User extension 
aux register io_gpio_8b3_ext_porta +#define AR_IO_GPIO_8B3_EXT_PORTA 0x80017b50 +#pragma Aux_register(0x80017b50, name=>"io_gpio_8b3_ext_porta") + +// User extension aux register io_gpio_8b3_ls_sync +#define AR_IO_GPIO_8B3_LS_SYNC 0x80017b60 +#pragma Aux_register(0x80017b60, name=>"io_gpio_8b3_ls_sync") + +// User extension aux register io_gpio_8b3_int_bothedge +#define AR_IO_GPIO_8B3_INT_BOTHEDGE 0x80017b68 +#pragma Aux_register(0x80017b68, name=>"io_gpio_8b3_int_bothedge") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST0_IO_I2C_MST0_PRESENT 1 + +// User extension aux register io_i2c_mst0_clken +#define AR_IO_I2C_MST0_CLKEN 0x800120c0 +#pragma Aux_register(0x800120c0, name=>"io_i2c_mst0_clken") + +// User extension aux register io_i2c_mst0_con +#define AR_IO_I2C_MST0_CON 0x80012000 +#pragma Aux_register(0x80012000, name=>"io_i2c_mst0_con") + +// User extension aux register io_i2c_mst0_tar +#define AR_IO_I2C_MST0_TAR 0x80012004 +#pragma Aux_register(0x80012004, name=>"io_i2c_mst0_tar") + +// User extension aux register io_i2c_mst0_data_cmd +#define AR_IO_I2C_MST0_DATA_CMD 0x80012010 +#pragma Aux_register(0x80012010, name=>"io_i2c_mst0_data_cmd") + +// User extension aux register io_i2c_mst0_ss_scl_hcnt +#define AR_IO_I2C_MST0_SS_SCL_HCNT 0x80012014 +#pragma Aux_register(0x80012014, name=>"io_i2c_mst0_ss_scl_hcnt") + +// User extension aux register io_i2c_mst0_ss_scl_lcnt +#define AR_IO_I2C_MST0_SS_SCL_LCNT 0x80012018 +#pragma Aux_register(0x80012018, name=>"io_i2c_mst0_ss_scl_lcnt") + +// User extension aux register io_i2c_mst0_fs_scl_hcnt +#define AR_IO_I2C_MST0_FS_SCL_HCNT 0x8001201c +#pragma Aux_register(0x8001201c, name=>"io_i2c_mst0_fs_scl_hcnt") + +// User extension aux register io_i2c_mst0_fs_scl_lcnt +#define AR_IO_I2C_MST0_FS_SCL_LCNT 0x80012020 +#pragma Aux_register(0x80012020, name=>"io_i2c_mst0_fs_scl_lcnt") + +// User extension aux register io_i2c_mst0_intr_stat +#define AR_IO_I2C_MST0_INTR_STAT 0x8001202c +#pragma Aux_register(0x8001202c, name=>"io_i2c_mst0_intr_stat") + +// User extension aux register io_i2c_mst0_intr_mask +#define AR_IO_I2C_MST0_INTR_MASK 0x80012030 +#pragma Aux_register(0x80012030, name=>"io_i2c_mst0_intr_mask") + +// User extension aux register io_i2c_mst0_raw_intr_stat +#define AR_IO_I2C_MST0_RAW_INTR_STAT 0x80012034 +#pragma Aux_register(0x80012034, name=>"io_i2c_mst0_raw_intr_stat") + +// User extension aux register io_i2c_mst0_rx_tl +#define AR_IO_I2C_MST0_RX_TL 0x80012038 +#pragma Aux_register(0x80012038, name=>"io_i2c_mst0_rx_tl") + +// User extension aux register io_i2c_mst0_tx_tl +#define AR_IO_I2C_MST0_TX_TL 0x8001203c +#pragma Aux_register(0x8001203c, name=>"io_i2c_mst0_tx_tl") + +// User extension aux register io_i2c_mst0_clr_intr +#define AR_IO_I2C_MST0_CLR_INTR 0x80012040 +#pragma Aux_register(0x80012040, name=>"io_i2c_mst0_clr_intr") + +// User extension aux register io_i2c_mst0_clr_rx_under +#define AR_IO_I2C_MST0_CLR_RX_UNDER 0x80012044 +#pragma Aux_register(0x80012044, name=>"io_i2c_mst0_clr_rx_under") + +// User extension aux register io_i2c_mst0_clr_rx_over +#define AR_IO_I2C_MST0_CLR_RX_OVER 0x80012048 +#pragma Aux_register(0x80012048, name=>"io_i2c_mst0_clr_rx_over") + +// User extension aux register io_i2c_mst0_clr_tx_over +#define AR_IO_I2C_MST0_CLR_TX_OVER 0x8001204c +#pragma Aux_register(0x8001204c, name=>"io_i2c_mst0_clr_tx_over") + +// User extension aux register io_i2c_mst0_clr_tx_abrt +#define AR_IO_I2C_MST0_CLR_TX_ABRT 0x80012054 +#pragma Aux_register(0x80012054, name=>"io_i2c_mst0_clr_tx_abrt") + +// User extension aux 
register io_i2c_mst0_clr_activity +#define AR_IO_I2C_MST0_CLR_ACTIVITY 0x8001205c +#pragma Aux_register(0x8001205c, name=>"io_i2c_mst0_clr_activity") + +// User extension aux register io_i2c_mst0_clr_stop_det +#define AR_IO_I2C_MST0_CLR_STOP_DET 0x80012060 +#pragma Aux_register(0x80012060, name=>"io_i2c_mst0_clr_stop_det") + +// User extension aux register io_i2c_mst0_clr_start_det +#define AR_IO_I2C_MST0_CLR_START_DET 0x80012064 +#pragma Aux_register(0x80012064, name=>"io_i2c_mst0_clr_start_det") + +// User extension aux register io_i2c_mst0_enable +#define AR_IO_I2C_MST0_ENABLE 0x8001206c +#pragma Aux_register(0x8001206c, name=>"io_i2c_mst0_enable") + +// User extension aux register io_i2c_mst0_status +#define AR_IO_I2C_MST0_STATUS 0x80012070 +#pragma Aux_register(0x80012070, name=>"io_i2c_mst0_status") + +// User extension aux register io_i2c_mst0_txflr +#define AR_IO_I2C_MST0_TXFLR 0x80012074 +#pragma Aux_register(0x80012074, name=>"io_i2c_mst0_txflr") + +// User extension aux register io_i2c_mst0_rxflr +#define AR_IO_I2C_MST0_RXFLR 0x80012078 +#pragma Aux_register(0x80012078, name=>"io_i2c_mst0_rxflr") + +// User extension aux register io_i2c_mst0_sda_hold +#define AR_IO_I2C_MST0_SDA_HOLD 0x8001207c +#pragma Aux_register(0x8001207c, name=>"io_i2c_mst0_sda_hold") + +// User extension aux register io_i2c_mst0_tx_abrt_source +#define AR_IO_I2C_MST0_TX_ABRT_SOURCE 0x80012080 +#pragma Aux_register(0x80012080, name=>"io_i2c_mst0_tx_abrt_source") + +// User extension aux register io_i2c_mst0_enable_status +#define AR_IO_I2C_MST0_ENABLE_STATUS 0x8001209c +#pragma Aux_register(0x8001209c, name=>"io_i2c_mst0_enable_status") + +// User extension aux register io_i2c_mst0_fs_spklen +#define AR_IO_I2C_MST0_FS_SPKLEN 0x800120a0 +#pragma Aux_register(0x800120a0, name=>"io_i2c_mst0_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST1_IO_I2C_MST1_PRESENT 1 + +// User extension aux register io_i2c_mst1_clken +#define AR_IO_I2C_MST1_CLKEN 0x800121c0 +#pragma Aux_register(0x800121c0, name=>"io_i2c_mst1_clken") + +// User extension aux register io_i2c_mst1_con +#define AR_IO_I2C_MST1_CON 0x80012100 +#pragma Aux_register(0x80012100, name=>"io_i2c_mst1_con") + +// User extension aux register io_i2c_mst1_tar +#define AR_IO_I2C_MST1_TAR 0x80012104 +#pragma Aux_register(0x80012104, name=>"io_i2c_mst1_tar") + +// User extension aux register io_i2c_mst1_data_cmd +#define AR_IO_I2C_MST1_DATA_CMD 0x80012110 +#pragma Aux_register(0x80012110, name=>"io_i2c_mst1_data_cmd") + +// User extension aux register io_i2c_mst1_ss_scl_hcnt +#define AR_IO_I2C_MST1_SS_SCL_HCNT 0x80012114 +#pragma Aux_register(0x80012114, name=>"io_i2c_mst1_ss_scl_hcnt") + +// User extension aux register io_i2c_mst1_ss_scl_lcnt +#define AR_IO_I2C_MST1_SS_SCL_LCNT 0x80012118 +#pragma Aux_register(0x80012118, name=>"io_i2c_mst1_ss_scl_lcnt") + +// User extension aux register io_i2c_mst1_fs_scl_hcnt +#define AR_IO_I2C_MST1_FS_SCL_HCNT 0x8001211c +#pragma Aux_register(0x8001211c, name=>"io_i2c_mst1_fs_scl_hcnt") + +// User extension aux register io_i2c_mst1_fs_scl_lcnt +#define AR_IO_I2C_MST1_FS_SCL_LCNT 0x80012120 +#pragma Aux_register(0x80012120, name=>"io_i2c_mst1_fs_scl_lcnt") + +// User extension aux register io_i2c_mst1_intr_stat +#define AR_IO_I2C_MST1_INTR_STAT 0x8001212c +#pragma Aux_register(0x8001212c, name=>"io_i2c_mst1_intr_stat") + +// User extension aux register io_i2c_mst1_intr_mask +#define AR_IO_I2C_MST1_INTR_MASK 0x80012130 +#pragma Aux_register(0x80012130, name=>"io_i2c_mst1_intr_mask") + +// User extension aux register 
io_i2c_mst1_raw_intr_stat +#define AR_IO_I2C_MST1_RAW_INTR_STAT 0x80012134 +#pragma Aux_register(0x80012134, name=>"io_i2c_mst1_raw_intr_stat") + +// User extension aux register io_i2c_mst1_rx_tl +#define AR_IO_I2C_MST1_RX_TL 0x80012138 +#pragma Aux_register(0x80012138, name=>"io_i2c_mst1_rx_tl") + +// User extension aux register io_i2c_mst1_tx_tl +#define AR_IO_I2C_MST1_TX_TL 0x8001213c +#pragma Aux_register(0x8001213c, name=>"io_i2c_mst1_tx_tl") + +// User extension aux register io_i2c_mst1_clr_intr +#define AR_IO_I2C_MST1_CLR_INTR 0x80012140 +#pragma Aux_register(0x80012140, name=>"io_i2c_mst1_clr_intr") + +// User extension aux register io_i2c_mst1_clr_rx_under +#define AR_IO_I2C_MST1_CLR_RX_UNDER 0x80012144 +#pragma Aux_register(0x80012144, name=>"io_i2c_mst1_clr_rx_under") + +// User extension aux register io_i2c_mst1_clr_rx_over +#define AR_IO_I2C_MST1_CLR_RX_OVER 0x80012148 +#pragma Aux_register(0x80012148, name=>"io_i2c_mst1_clr_rx_over") + +// User extension aux register io_i2c_mst1_clr_tx_over +#define AR_IO_I2C_MST1_CLR_TX_OVER 0x8001214c +#pragma Aux_register(0x8001214c, name=>"io_i2c_mst1_clr_tx_over") + +// User extension aux register io_i2c_mst1_clr_tx_abrt +#define AR_IO_I2C_MST1_CLR_TX_ABRT 0x80012154 +#pragma Aux_register(0x80012154, name=>"io_i2c_mst1_clr_tx_abrt") + +// User extension aux register io_i2c_mst1_clr_activity +#define AR_IO_I2C_MST1_CLR_ACTIVITY 0x8001215c +#pragma Aux_register(0x8001215c, name=>"io_i2c_mst1_clr_activity") + +// User extension aux register io_i2c_mst1_clr_stop_det +#define AR_IO_I2C_MST1_CLR_STOP_DET 0x80012160 +#pragma Aux_register(0x80012160, name=>"io_i2c_mst1_clr_stop_det") + +// User extension aux register io_i2c_mst1_clr_start_det +#define AR_IO_I2C_MST1_CLR_START_DET 0x80012164 +#pragma Aux_register(0x80012164, name=>"io_i2c_mst1_clr_start_det") + +// User extension aux register io_i2c_mst1_enable +#define AR_IO_I2C_MST1_ENABLE 0x8001216c +#pragma Aux_register(0x8001216c, name=>"io_i2c_mst1_enable") + +// User extension aux register io_i2c_mst1_status +#define AR_IO_I2C_MST1_STATUS 0x80012170 +#pragma Aux_register(0x80012170, name=>"io_i2c_mst1_status") + +// User extension aux register io_i2c_mst1_txflr +#define AR_IO_I2C_MST1_TXFLR 0x80012174 +#pragma Aux_register(0x80012174, name=>"io_i2c_mst1_txflr") + +// User extension aux register io_i2c_mst1_rxflr +#define AR_IO_I2C_MST1_RXFLR 0x80012178 +#pragma Aux_register(0x80012178, name=>"io_i2c_mst1_rxflr") + +// User extension aux register io_i2c_mst1_sda_hold +#define AR_IO_I2C_MST1_SDA_HOLD 0x8001217c +#pragma Aux_register(0x8001217c, name=>"io_i2c_mst1_sda_hold") + +// User extension aux register io_i2c_mst1_tx_abrt_source +#define AR_IO_I2C_MST1_TX_ABRT_SOURCE 0x80012180 +#pragma Aux_register(0x80012180, name=>"io_i2c_mst1_tx_abrt_source") + +// User extension aux register io_i2c_mst1_enable_status +#define AR_IO_I2C_MST1_ENABLE_STATUS 0x8001219c +#pragma Aux_register(0x8001219c, name=>"io_i2c_mst1_enable_status") + +// User extension aux register io_i2c_mst1_fs_spklen +#define AR_IO_I2C_MST1_FS_SPKLEN 0x800121a0 +#pragma Aux_register(0x800121a0, name=>"io_i2c_mst1_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST2_IO_I2C_MST2_PRESENT 1 + +// User extension aux register io_i2c_mst2_clken +#define AR_IO_I2C_MST2_CLKEN 0x800122c0 +#pragma Aux_register(0x800122c0, name=>"io_i2c_mst2_clken") + +// User extension aux register io_i2c_mst2_con +#define AR_IO_I2C_MST2_CON 0x80012200 +#pragma Aux_register(0x80012200, name=>"io_i2c_mst2_con") + +// User extension aux register 
io_i2c_mst2_tar +#define AR_IO_I2C_MST2_TAR 0x80012204 +#pragma Aux_register(0x80012204, name=>"io_i2c_mst2_tar") + +// User extension aux register io_i2c_mst2_data_cmd +#define AR_IO_I2C_MST2_DATA_CMD 0x80012210 +#pragma Aux_register(0x80012210, name=>"io_i2c_mst2_data_cmd") + +// User extension aux register io_i2c_mst2_ss_scl_hcnt +#define AR_IO_I2C_MST2_SS_SCL_HCNT 0x80012214 +#pragma Aux_register(0x80012214, name=>"io_i2c_mst2_ss_scl_hcnt") + +// User extension aux register io_i2c_mst2_ss_scl_lcnt +#define AR_IO_I2C_MST2_SS_SCL_LCNT 0x80012218 +#pragma Aux_register(0x80012218, name=>"io_i2c_mst2_ss_scl_lcnt") + +// User extension aux register io_i2c_mst2_fs_scl_hcnt +#define AR_IO_I2C_MST2_FS_SCL_HCNT 0x8001221c +#pragma Aux_register(0x8001221c, name=>"io_i2c_mst2_fs_scl_hcnt") + +// User extension aux register io_i2c_mst2_fs_scl_lcnt +#define AR_IO_I2C_MST2_FS_SCL_LCNT 0x80012220 +#pragma Aux_register(0x80012220, name=>"io_i2c_mst2_fs_scl_lcnt") + +// User extension aux register io_i2c_mst2_intr_stat +#define AR_IO_I2C_MST2_INTR_STAT 0x8001222c +#pragma Aux_register(0x8001222c, name=>"io_i2c_mst2_intr_stat") + +// User extension aux register io_i2c_mst2_intr_mask +#define AR_IO_I2C_MST2_INTR_MASK 0x80012230 +#pragma Aux_register(0x80012230, name=>"io_i2c_mst2_intr_mask") + +// User extension aux register io_i2c_mst2_raw_intr_stat +#define AR_IO_I2C_MST2_RAW_INTR_STAT 0x80012234 +#pragma Aux_register(0x80012234, name=>"io_i2c_mst2_raw_intr_stat") + +// User extension aux register io_i2c_mst2_rx_tl +#define AR_IO_I2C_MST2_RX_TL 0x80012238 +#pragma Aux_register(0x80012238, name=>"io_i2c_mst2_rx_tl") + +// User extension aux register io_i2c_mst2_tx_tl +#define AR_IO_I2C_MST2_TX_TL 0x8001223c +#pragma Aux_register(0x8001223c, name=>"io_i2c_mst2_tx_tl") + +// User extension aux register io_i2c_mst2_clr_intr +#define AR_IO_I2C_MST2_CLR_INTR 0x80012240 +#pragma Aux_register(0x80012240, name=>"io_i2c_mst2_clr_intr") + +// User extension aux register io_i2c_mst2_clr_rx_under +#define AR_IO_I2C_MST2_CLR_RX_UNDER 0x80012244 +#pragma Aux_register(0x80012244, name=>"io_i2c_mst2_clr_rx_under") + +// User extension aux register io_i2c_mst2_clr_rx_over +#define AR_IO_I2C_MST2_CLR_RX_OVER 0x80012248 +#pragma Aux_register(0x80012248, name=>"io_i2c_mst2_clr_rx_over") + +// User extension aux register io_i2c_mst2_clr_tx_over +#define AR_IO_I2C_MST2_CLR_TX_OVER 0x8001224c +#pragma Aux_register(0x8001224c, name=>"io_i2c_mst2_clr_tx_over") + +// User extension aux register io_i2c_mst2_clr_tx_abrt +#define AR_IO_I2C_MST2_CLR_TX_ABRT 0x80012254 +#pragma Aux_register(0x80012254, name=>"io_i2c_mst2_clr_tx_abrt") + +// User extension aux register io_i2c_mst2_clr_activity +#define AR_IO_I2C_MST2_CLR_ACTIVITY 0x8001225c +#pragma Aux_register(0x8001225c, name=>"io_i2c_mst2_clr_activity") + +// User extension aux register io_i2c_mst2_clr_stop_det +#define AR_IO_I2C_MST2_CLR_STOP_DET 0x80012260 +#pragma Aux_register(0x80012260, name=>"io_i2c_mst2_clr_stop_det") + +// User extension aux register io_i2c_mst2_clr_start_det +#define AR_IO_I2C_MST2_CLR_START_DET 0x80012264 +#pragma Aux_register(0x80012264, name=>"io_i2c_mst2_clr_start_det") + +// User extension aux register io_i2c_mst2_enable +#define AR_IO_I2C_MST2_ENABLE 0x8001226c +#pragma Aux_register(0x8001226c, name=>"io_i2c_mst2_enable") + +// User extension aux register io_i2c_mst2_status +#define AR_IO_I2C_MST2_STATUS 0x80012270 +#pragma Aux_register(0x80012270, name=>"io_i2c_mst2_status") + +// User extension aux register io_i2c_mst2_txflr +#define 
AR_IO_I2C_MST2_TXFLR 0x80012274 +#pragma Aux_register(0x80012274, name=>"io_i2c_mst2_txflr") + +// User extension aux register io_i2c_mst2_rxflr +#define AR_IO_I2C_MST2_RXFLR 0x80012278 +#pragma Aux_register(0x80012278, name=>"io_i2c_mst2_rxflr") + +// User extension aux register io_i2c_mst2_sda_hold +#define AR_IO_I2C_MST2_SDA_HOLD 0x8001227c +#pragma Aux_register(0x8001227c, name=>"io_i2c_mst2_sda_hold") + +// User extension aux register io_i2c_mst2_tx_abrt_source +#define AR_IO_I2C_MST2_TX_ABRT_SOURCE 0x80012280 +#pragma Aux_register(0x80012280, name=>"io_i2c_mst2_tx_abrt_source") + +// User extension aux register io_i2c_mst2_enable_status +#define AR_IO_I2C_MST2_ENABLE_STATUS 0x8001229c +#pragma Aux_register(0x8001229c, name=>"io_i2c_mst2_enable_status") + +// User extension aux register io_i2c_mst2_fs_spklen +#define AR_IO_I2C_MST2_FS_SPKLEN 0x800122a0 +#pragma Aux_register(0x800122a0, name=>"io_i2c_mst2_fs_spklen") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST0_IO_SPI_MST0_PRESENT 1 + +// User extension aux register io_spi_mst0_ctrlr0 +#define AR_IO_SPI_MST0_CTRLR0 0x80010000 +#pragma Aux_register(0x80010000, name=>"io_spi_mst0_ctrlr0") + +// User extension aux register io_spi_mst0_ctrlr1 +#define AR_IO_SPI_MST0_CTRLR1 0x80010001 +#pragma Aux_register(0x80010001, name=>"io_spi_mst0_ctrlr1") + +// User extension aux register io_spi_mst0_spien +#define AR_IO_SPI_MST0_SPIEN 0x80010002 +#pragma Aux_register(0x80010002, name=>"io_spi_mst0_spien") + +// User extension aux register io_spi_mst0_ser +#define AR_IO_SPI_MST0_SER 0x80010004 +#pragma Aux_register(0x80010004, name=>"io_spi_mst0_ser") + +// User extension aux register io_spi_mst0_baudr +#define AR_IO_SPI_MST0_BAUDR 0x80010005 +#pragma Aux_register(0x80010005, name=>"io_spi_mst0_baudr") + +// User extension aux register io_spi_mst0_txftlr +#define AR_IO_SPI_MST0_TXFTLR 0x80010006 +#pragma Aux_register(0x80010006, name=>"io_spi_mst0_txftlr") + +// User extension aux register io_spi_mst0_rxftlr +#define AR_IO_SPI_MST0_RXFTLR 0x80010007 +#pragma Aux_register(0x80010007, name=>"io_spi_mst0_rxftlr") + +// User extension aux register io_spi_mst0_txflr +#define AR_IO_SPI_MST0_TXFLR 0x80010008 +#pragma Aux_register(0x80010008, name=>"io_spi_mst0_txflr") + +// User extension aux register io_spi_mst0_rxflr +#define AR_IO_SPI_MST0_RXFLR 0x80010009 +#pragma Aux_register(0x80010009, name=>"io_spi_mst0_rxflr") + +// User extension aux register io_spi_mst0_sr +#define AR_IO_SPI_MST0_SR 0x8001000a +#pragma Aux_register(0x8001000a, name=>"io_spi_mst0_sr") + +// User extension aux register io_spi_mst0_imr +#define AR_IO_SPI_MST0_IMR 0x8001000b +#pragma Aux_register(0x8001000b, name=>"io_spi_mst0_imr") + +// User extension aux register io_spi_mst0_isr +#define AR_IO_SPI_MST0_ISR 0x8001000c +#pragma Aux_register(0x8001000c, name=>"io_spi_mst0_isr") + +// User extension aux register io_spi_mst0_risr +#define AR_IO_SPI_MST0_RISR 0x8001000d +#pragma Aux_register(0x8001000d, name=>"io_spi_mst0_risr") + +// User extension aux register io_spi_mst0_txoicr +#define AR_IO_SPI_MST0_TXOICR 0x8001000e +#pragma Aux_register(0x8001000e, name=>"io_spi_mst0_txoicr") + +// User extension aux register io_spi_mst0_rxoicr +#define AR_IO_SPI_MST0_RXOICR 0x8001000f +#pragma Aux_register(0x8001000f, name=>"io_spi_mst0_rxoicr") + +// User extension aux register io_spi_mst0_rxuicr +#define AR_IO_SPI_MST0_RXUICR 0x80010010 +#pragma Aux_register(0x80010010, name=>"io_spi_mst0_rxuicr") + +// User extension aux register io_spi_mst0_icr +#define AR_IO_SPI_MST0_ICR 0x80010012 
+#pragma Aux_register(0x80010012, name=>"io_spi_mst0_icr") + +// User extension aux register io_spi_mst0_clken +#define AR_IO_SPI_MST0_CLKEN 0x80010016 +#pragma Aux_register(0x80010016, name=>"io_spi_mst0_clken") + +// User extension aux register io_spi_mst0_dr +#define AR_IO_SPI_MST0_DR 0x80010018 +#pragma Aux_register(0x80010018, name=>"io_spi_mst0_dr") + +// User extension aux register io_spi_mst0_rx_sample_dly +#define AR_IO_SPI_MST0_RX_SAMPLE_DLY 0x8001003c +#pragma Aux_register(0x8001003c, name=>"io_spi_mst0_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST1_IO_SPI_MST1_PRESENT 1 + +// User extension aux register io_spi_mst1_ctrlr0 +#define AR_IO_SPI_MST1_CTRLR0 0x80010100 +#pragma Aux_register(0x80010100, name=>"io_spi_mst1_ctrlr0") + +// User extension aux register io_spi_mst1_ctrlr1 +#define AR_IO_SPI_MST1_CTRLR1 0x80010101 +#pragma Aux_register(0x80010101, name=>"io_spi_mst1_ctrlr1") + +// User extension aux register io_spi_mst1_spien +#define AR_IO_SPI_MST1_SPIEN 0x80010102 +#pragma Aux_register(0x80010102, name=>"io_spi_mst1_spien") + +// User extension aux register io_spi_mst1_ser +#define AR_IO_SPI_MST1_SER 0x80010104 +#pragma Aux_register(0x80010104, name=>"io_spi_mst1_ser") + +// User extension aux register io_spi_mst1_baudr +#define AR_IO_SPI_MST1_BAUDR 0x80010105 +#pragma Aux_register(0x80010105, name=>"io_spi_mst1_baudr") + +// User extension aux register io_spi_mst1_txftlr +#define AR_IO_SPI_MST1_TXFTLR 0x80010106 +#pragma Aux_register(0x80010106, name=>"io_spi_mst1_txftlr") + +// User extension aux register io_spi_mst1_rxftlr +#define AR_IO_SPI_MST1_RXFTLR 0x80010107 +#pragma Aux_register(0x80010107, name=>"io_spi_mst1_rxftlr") + +// User extension aux register io_spi_mst1_txflr +#define AR_IO_SPI_MST1_TXFLR 0x80010108 +#pragma Aux_register(0x80010108, name=>"io_spi_mst1_txflr") + +// User extension aux register io_spi_mst1_rxflr +#define AR_IO_SPI_MST1_RXFLR 0x80010109 +#pragma Aux_register(0x80010109, name=>"io_spi_mst1_rxflr") + +// User extension aux register io_spi_mst1_sr +#define AR_IO_SPI_MST1_SR 0x8001010a +#pragma Aux_register(0x8001010a, name=>"io_spi_mst1_sr") + +// User extension aux register io_spi_mst1_imr +#define AR_IO_SPI_MST1_IMR 0x8001010b +#pragma Aux_register(0x8001010b, name=>"io_spi_mst1_imr") + +// User extension aux register io_spi_mst1_isr +#define AR_IO_SPI_MST1_ISR 0x8001010c +#pragma Aux_register(0x8001010c, name=>"io_spi_mst1_isr") + +// User extension aux register io_spi_mst1_risr +#define AR_IO_SPI_MST1_RISR 0x8001010d +#pragma Aux_register(0x8001010d, name=>"io_spi_mst1_risr") + +// User extension aux register io_spi_mst1_txoicr +#define AR_IO_SPI_MST1_TXOICR 0x8001010e +#pragma Aux_register(0x8001010e, name=>"io_spi_mst1_txoicr") + +// User extension aux register io_spi_mst1_rxoicr +#define AR_IO_SPI_MST1_RXOICR 0x8001010f +#pragma Aux_register(0x8001010f, name=>"io_spi_mst1_rxoicr") + +// User extension aux register io_spi_mst1_rxuicr +#define AR_IO_SPI_MST1_RXUICR 0x80010110 +#pragma Aux_register(0x80010110, name=>"io_spi_mst1_rxuicr") + +// User extension aux register io_spi_mst1_icr +#define AR_IO_SPI_MST1_ICR 0x80010112 +#pragma Aux_register(0x80010112, name=>"io_spi_mst1_icr") + +// User extension aux register io_spi_mst1_clken +#define AR_IO_SPI_MST1_CLKEN 0x80010116 +#pragma Aux_register(0x80010116, name=>"io_spi_mst1_clken") + +// User extension aux register io_spi_mst1_dr +#define AR_IO_SPI_MST1_DR 0x80010118 +#pragma Aux_register(0x80010118, name=>"io_spi_mst1_dr") + +// User extension aux register 
io_spi_mst1_rx_sample_dly +#define AR_IO_SPI_MST1_RX_SAMPLE_DLY 0x8001013c +#pragma Aux_register(0x8001013c, name=>"io_spi_mst1_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST2_IO_SPI_MST2_PRESENT 1 + +// User extension aux register io_spi_mst2_ctrlr0 +#define AR_IO_SPI_MST2_CTRLR0 0x80010200 +#pragma Aux_register(0x80010200, name=>"io_spi_mst2_ctrlr0") + +// User extension aux register io_spi_mst2_ctrlr1 +#define AR_IO_SPI_MST2_CTRLR1 0x80010201 +#pragma Aux_register(0x80010201, name=>"io_spi_mst2_ctrlr1") + +// User extension aux register io_spi_mst2_spien +#define AR_IO_SPI_MST2_SPIEN 0x80010202 +#pragma Aux_register(0x80010202, name=>"io_spi_mst2_spien") + +// User extension aux register io_spi_mst2_ser +#define AR_IO_SPI_MST2_SER 0x80010204 +#pragma Aux_register(0x80010204, name=>"io_spi_mst2_ser") + +// User extension aux register io_spi_mst2_baudr +#define AR_IO_SPI_MST2_BAUDR 0x80010205 +#pragma Aux_register(0x80010205, name=>"io_spi_mst2_baudr") + +// User extension aux register io_spi_mst2_txftlr +#define AR_IO_SPI_MST2_TXFTLR 0x80010206 +#pragma Aux_register(0x80010206, name=>"io_spi_mst2_txftlr") + +// User extension aux register io_spi_mst2_rxftlr +#define AR_IO_SPI_MST2_RXFTLR 0x80010207 +#pragma Aux_register(0x80010207, name=>"io_spi_mst2_rxftlr") + +// User extension aux register io_spi_mst2_txflr +#define AR_IO_SPI_MST2_TXFLR 0x80010208 +#pragma Aux_register(0x80010208, name=>"io_spi_mst2_txflr") + +// User extension aux register io_spi_mst2_rxflr +#define AR_IO_SPI_MST2_RXFLR 0x80010209 +#pragma Aux_register(0x80010209, name=>"io_spi_mst2_rxflr") + +// User extension aux register io_spi_mst2_sr +#define AR_IO_SPI_MST2_SR 0x8001020a +#pragma Aux_register(0x8001020a, name=>"io_spi_mst2_sr") + +// User extension aux register io_spi_mst2_imr +#define AR_IO_SPI_MST2_IMR 0x8001020b +#pragma Aux_register(0x8001020b, name=>"io_spi_mst2_imr") + +// User extension aux register io_spi_mst2_isr +#define AR_IO_SPI_MST2_ISR 0x8001020c +#pragma Aux_register(0x8001020c, name=>"io_spi_mst2_isr") + +// User extension aux register io_spi_mst2_risr +#define AR_IO_SPI_MST2_RISR 0x8001020d +#pragma Aux_register(0x8001020d, name=>"io_spi_mst2_risr") + +// User extension aux register io_spi_mst2_txoicr +#define AR_IO_SPI_MST2_TXOICR 0x8001020e +#pragma Aux_register(0x8001020e, name=>"io_spi_mst2_txoicr") + +// User extension aux register io_spi_mst2_rxoicr +#define AR_IO_SPI_MST2_RXOICR 0x8001020f +#pragma Aux_register(0x8001020f, name=>"io_spi_mst2_rxoicr") + +// User extension aux register io_spi_mst2_rxuicr +#define AR_IO_SPI_MST2_RXUICR 0x80010210 +#pragma Aux_register(0x80010210, name=>"io_spi_mst2_rxuicr") + +// User extension aux register io_spi_mst2_icr +#define AR_IO_SPI_MST2_ICR 0x80010212 +#pragma Aux_register(0x80010212, name=>"io_spi_mst2_icr") + +// User extension aux register io_spi_mst2_clken +#define AR_IO_SPI_MST2_CLKEN 0x80010216 +#pragma Aux_register(0x80010216, name=>"io_spi_mst2_clken") + +// User extension aux register io_spi_mst2_dr +#define AR_IO_SPI_MST2_DR 0x80010218 +#pragma Aux_register(0x80010218, name=>"io_spi_mst2_dr") + +// User extension aux register io_spi_mst2_rx_sample_dly +#define AR_IO_SPI_MST2_RX_SAMPLE_DLY 0x8001023c +#pragma Aux_register(0x8001023c, name=>"io_spi_mst2_rx_sample_dly") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_SLV0_IO_SPI_SLV0_PRESENT 1 + +// User extension aux register io_spi_slv0_ctrlr0 +#define AR_IO_SPI_SLV0_CTRLR0 0x80011000 +#pragma Aux_register(0x80011000, name=>"io_spi_slv0_ctrlr0") + +// User extension aux register 
io_spi_slv0_spien +#define AR_IO_SPI_SLV0_SPIEN 0x80011002 +#pragma Aux_register(0x80011002, name=>"io_spi_slv0_spien") + +// User extension aux register io_spi_slv0_txftlr +#define AR_IO_SPI_SLV0_TXFTLR 0x80011006 +#pragma Aux_register(0x80011006, name=>"io_spi_slv0_txftlr") + +// User extension aux register io_spi_slv0_rxftlr +#define AR_IO_SPI_SLV0_RXFTLR 0x80011007 +#pragma Aux_register(0x80011007, name=>"io_spi_slv0_rxftlr") + +// User extension aux register io_spi_slv0_txflr +#define AR_IO_SPI_SLV0_TXFLR 0x80011008 +#pragma Aux_register(0x80011008, name=>"io_spi_slv0_txflr") + +// User extension aux register io_spi_slv0_rxflr +#define AR_IO_SPI_SLV0_RXFLR 0x80011009 +#pragma Aux_register(0x80011009, name=>"io_spi_slv0_rxflr") + +// User extension aux register io_spi_slv0_sr +#define AR_IO_SPI_SLV0_SR 0x8001100a +#pragma Aux_register(0x8001100a, name=>"io_spi_slv0_sr") + +// User extension aux register io_spi_slv0_imr +#define AR_IO_SPI_SLV0_IMR 0x8001100b +#pragma Aux_register(0x8001100b, name=>"io_spi_slv0_imr") + +// User extension aux register io_spi_slv0_isr +#define AR_IO_SPI_SLV0_ISR 0x8001100c +#pragma Aux_register(0x8001100c, name=>"io_spi_slv0_isr") + +// User extension aux register io_spi_slv0_risr +#define AR_IO_SPI_SLV0_RISR 0x8001100d +#pragma Aux_register(0x8001100d, name=>"io_spi_slv0_risr") + +// User extension aux register io_spi_slv0_txoicr +#define AR_IO_SPI_SLV0_TXOICR 0x8001100e +#pragma Aux_register(0x8001100e, name=>"io_spi_slv0_txoicr") + +// User extension aux register io_spi_slv0_rxoicr +#define AR_IO_SPI_SLV0_RXOICR 0x8001100f +#pragma Aux_register(0x8001100f, name=>"io_spi_slv0_rxoicr") + +// User extension aux register io_spi_slv0_rxuicr +#define AR_IO_SPI_SLV0_RXUICR 0x80011010 +#pragma Aux_register(0x80011010, name=>"io_spi_slv0_rxuicr") + +// User extension aux register io_spi_slv0_icr +#define AR_IO_SPI_SLV0_ICR 0x80011012 +#pragma Aux_register(0x80011012, name=>"io_spi_slv0_icr") + +// User extension aux register io_spi_slv0_clken +#define AR_IO_SPI_SLV0_CLKEN 0x80011016 +#pragma Aux_register(0x80011016, name=>"io_spi_slv0_clken") + +// User extension aux register io_spi_slv0_dr +#define AR_IO_SPI_SLV0_DR 0x80011018 +#pragma Aux_register(0x80011018, name=>"io_spi_slv0_dr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART0_IO_UART0_PRESENT 1 + +// User extension aux register io_uart0_clken +#define AR_IO_UART0_CLKEN 0x800140c0 +#pragma Aux_register(0x800140c0, name=>"io_uart0_clken") + +// User extension aux register io_uart0_rbr_thr_dll +#define AR_IO_UART0_RBR_THR_DLL 0x80014000 +#pragma Aux_register(0x80014000, name=>"io_uart0_rbr_thr_dll") + +// User extension aux register io_uart0_ier_dlh +#define AR_IO_UART0_IER_DLH 0x80014004 +#pragma Aux_register(0x80014004, name=>"io_uart0_ier_dlh") + +// User extension aux register io_uart0_iir_fcr +#define AR_IO_UART0_IIR_FCR 0x80014008 +#pragma Aux_register(0x80014008, name=>"io_uart0_iir_fcr") + +// User extension aux register io_uart0_lcr +#define AR_IO_UART0_LCR 0x8001400c +#pragma Aux_register(0x8001400c, name=>"io_uart0_lcr") + +// User extension aux register io_uart0_mcr +#define AR_IO_UART0_MCR 0x80014010 +#pragma Aux_register(0x80014010, name=>"io_uart0_mcr") + +// User extension aux register io_uart0_lsr +#define AR_IO_UART0_LSR 0x80014014 +#pragma Aux_register(0x80014014, name=>"io_uart0_lsr") + +// User extension aux register io_uart0_msr +#define AR_IO_UART0_MSR 0x80014018 +#pragma Aux_register(0x80014018, name=>"io_uart0_msr") + +// User extension aux register io_uart0_usr +#define AR_IO_UART0_USR 
0x8001407c +#pragma Aux_register(0x8001407c, name=>"io_uart0_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART1_IO_UART1_PRESENT 1 + +// User extension aux register io_uart1_clken +#define AR_IO_UART1_CLKEN 0x800141c0 +#pragma Aux_register(0x800141c0, name=>"io_uart1_clken") + +// User extension aux register io_uart1_rbr_thr_dll +#define AR_IO_UART1_RBR_THR_DLL 0x80014100 +#pragma Aux_register(0x80014100, name=>"io_uart1_rbr_thr_dll") + +// User extension aux register io_uart1_ier_dlh +#define AR_IO_UART1_IER_DLH 0x80014104 +#pragma Aux_register(0x80014104, name=>"io_uart1_ier_dlh") + +// User extension aux register io_uart1_iir_fcr +#define AR_IO_UART1_IIR_FCR 0x80014108 +#pragma Aux_register(0x80014108, name=>"io_uart1_iir_fcr") + +// User extension aux register io_uart1_lcr +#define AR_IO_UART1_LCR 0x8001410c +#pragma Aux_register(0x8001410c, name=>"io_uart1_lcr") + +// User extension aux register io_uart1_mcr +#define AR_IO_UART1_MCR 0x80014110 +#pragma Aux_register(0x80014110, name=>"io_uart1_mcr") + +// User extension aux register io_uart1_lsr +#define AR_IO_UART1_LSR 0x80014114 +#pragma Aux_register(0x80014114, name=>"io_uart1_lsr") + +// User extension aux register io_uart1_msr +#define AR_IO_UART1_MSR 0x80014118 +#pragma Aux_register(0x80014118, name=>"io_uart1_msr") + +// User extension aux register io_uart1_usr +#define AR_IO_UART1_USR 0x8001417c +#pragma Aux_register(0x8001417c, name=>"io_uart1_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART2_IO_UART2_PRESENT 1 + +// User extension aux register io_uart2_clken +#define AR_IO_UART2_CLKEN 0x800142c0 +#pragma Aux_register(0x800142c0, name=>"io_uart2_clken") + +// User extension aux register io_uart2_rbr_thr_dll +#define AR_IO_UART2_RBR_THR_DLL 0x80014200 +#pragma Aux_register(0x80014200, name=>"io_uart2_rbr_thr_dll") + +// User extension aux register io_uart2_ier_dlh +#define AR_IO_UART2_IER_DLH 0x80014204 +#pragma Aux_register(0x80014204, name=>"io_uart2_ier_dlh") + +// User extension aux register io_uart2_iir_fcr +#define AR_IO_UART2_IIR_FCR 0x80014208 +#pragma Aux_register(0x80014208, name=>"io_uart2_iir_fcr") + +// User extension aux register io_uart2_lcr +#define AR_IO_UART2_LCR 0x8001420c +#pragma Aux_register(0x8001420c, name=>"io_uart2_lcr") + +// User extension aux register io_uart2_mcr +#define AR_IO_UART2_MCR 0x80014210 +#pragma Aux_register(0x80014210, name=>"io_uart2_mcr") + +// User extension aux register io_uart2_lsr +#define AR_IO_UART2_LSR 0x80014214 +#pragma Aux_register(0x80014214, name=>"io_uart2_lsr") + +// User extension aux register io_uart2_msr +#define AR_IO_UART2_MSR 0x80014218 +#pragma Aux_register(0x80014218, name=>"io_uart2_msr") + +// User extension aux register io_uart2_usr +#define AR_IO_UART2_USR 0x8001427c +#pragma Aux_register(0x8001427c, name=>"io_uart2_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART3_IO_UART3_PRESENT 1 + +// User extension aux register io_uart3_clken +#define AR_IO_UART3_CLKEN 0x800143c0 +#pragma Aux_register(0x800143c0, name=>"io_uart3_clken") + +// User extension aux register io_uart3_rbr_thr_dll +#define AR_IO_UART3_RBR_THR_DLL 0x80014300 +#pragma Aux_register(0x80014300, name=>"io_uart3_rbr_thr_dll") + +// User extension aux register io_uart3_ier_dlh +#define AR_IO_UART3_IER_DLH 0x80014304 +#pragma Aux_register(0x80014304, name=>"io_uart3_ier_dlh") + +// User extension aux register io_uart3_iir_fcr +#define AR_IO_UART3_IIR_FCR 0x80014308 +#pragma Aux_register(0x80014308, name=>"io_uart3_iir_fcr") + +// User extension aux register io_uart3_lcr +#define AR_IO_UART3_LCR 
0x8001430c +#pragma Aux_register(0x8001430c, name=>"io_uart3_lcr") + +// User extension aux register io_uart3_mcr +#define AR_IO_UART3_MCR 0x80014310 +#pragma Aux_register(0x80014310, name=>"io_uart3_mcr") + +// User extension aux register io_uart3_lsr +#define AR_IO_UART3_LSR 0x80014314 +#pragma Aux_register(0x80014314, name=>"io_uart3_lsr") + +// User extension aux register io_uart3_msr +#define AR_IO_UART3_MSR 0x80014318 +#pragma Aux_register(0x80014318, name=>"io_uart3_msr") + +// User extension aux register io_uart3_usr +#define AR_IO_UART3_USR 0x8001437c +#pragma Aux_register(0x8001437c, name=>"io_uart3_usr") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_CREG_MST0_IO_CREG_MST0_PRESENT 1 + +// User extension aux register io_creg_mst0_ctrl +#define AR_IO_CREG_MST0_CTRL 0x80018000 +#pragma Aux_register(0x80018000, name=>"io_creg_mst0_ctrl") +#define APEX_COM_ARC_HARDWARE_DFSS_IO_CREG_SLV0_IO_CREG_SLV0_PRESENT 1 + +// User extension aux register io_creg_slv0_obsr +#define AR_IO_CREG_SLV0_OBSR 0x80018080 +#pragma Aux_register(0x80018080, name=>"io_creg_slv0_obsr") +#define APEX_COM_ARC_HARDWARE_DFSS_SUBSYS_BCR_SUBSYS_BCR_PRESENT 1 + +// User extension aux register SUBSYS_BUILD +#define AR_SUBSYS_BUILD 0xf0 +#pragma Aux_register(0xf0, name=>"SUBSYS_BUILD") + +// User extension aux register SUBSYS_DSP_0_BUILD +#define AR_SUBSYS_DSP_0_BUILD 0xa00 +#pragma Aux_register(0xa00, name=>"SUBSYS_DSP_0_BUILD") + +// User extension aux register SUBSYS_DSP_0_CONFIG +#define AR_SUBSYS_DSP_0_CONFIG 0xa02 +#pragma Aux_register(0xa02, name=>"SUBSYS_DSP_0_CONFIG") + +// User extension aux register SUBSYS_IO_0_BUILD +#define AR_SUBSYS_IO_0_BUILD 0xa04 +#pragma Aux_register(0xa04, name=>"SUBSYS_IO_0_BUILD") + +// User extension aux register SUBSYS_IO_1_BUILD +#define AR_SUBSYS_IO_1_BUILD 0xa05 +#pragma Aux_register(0xa05, name=>"SUBSYS_IO_1_BUILD") +#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_PRESENT 1 + +// User extension aux register fpu_build +#define AR_FPU_BUILD 0xc8 +#pragma Aux_register(0xc8, name=>"fpu_build") + +// User extension aux register fpu_ctrl +#define AR_FPU_CTRL 0x300 +#pragma Aux_register(0x300, name=>"fpu_ctrl") + +// User extension aux register fpu_status +#define AR_FPU_STATUS 0x301 +#pragma Aux_register(0x301, name=>"fpu_status") + +// User extension instruction fsmadd +extern long fsmadd(long,long); +#pragma intrinsic(fsmadd,opcode=>6,sub_opcode=>5, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsmsub +extern long fsmsub(long,long); +#pragma intrinsic(fsmsub,opcode=>6,sub_opcode=>6, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsmul +extern long fsmul(long,long); +#pragma intrinsic(fsmul,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsadd +extern long fsadd(long,long); +#pragma intrinsic(fsadd,opcode=>6,sub_opcode=>1, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fssub +extern long fssub(long,long); +#pragma intrinsic(fssub,opcode=>6,sub_opcode=>2, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fcvt32 +extern long fcvt32(long,long); +#pragma intrinsic(fcvt32,opcode=>6,sub_opcode=>8, 
effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fsdiv +extern long fsdiv(long,long); +#pragma intrinsic(fsdiv,opcode=>6,sub_opcode=>7, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmp +extern long fscmp(long,long); +#pragma intrinsic(fscmp,opcode=>6,sub_opcode=>3, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmp +extern long fscmp_f(long,long); +#pragma intrinsic(fscmp_f,opcode=>6,sub_opcode=>3, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmpf +extern long fscmpf(long,long); +#pragma intrinsic(fscmpf,opcode=>6,sub_opcode=>4, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fscmpf +extern long fscmpf_f(long,long); +#pragma intrinsic(fscmpf_f,opcode=>6,sub_opcode=>4, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") + +// User extension instruction fssqrt +extern long fssqrt(long); +#pragma intrinsic(fssqrt,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") +#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_DP_ASSIST_PRESENT 1 + +// User extension aux register aux_dpfp1l +#define AR_AUX_DPFP1L 0x302 +#pragma Aux_register(0x302, name=>"aux_dpfp1l") + +// User extension aux register aux_dpfp1h +#define AR_AUX_DPFP1H 0x303 +#pragma Aux_register(0x303, name=>"aux_dpfp1h") + +// User extension aux register aux_dpfp2l +#define AR_AUX_DPFP2L 0x304 +#pragma Aux_register(0x304, name=>"aux_dpfp2l") + +// User extension aux register aux_dpfp2h +#define AR_AUX_DPFP2H 0x305 +#pragma Aux_register(0x305, name=>"aux_dpfp2h") + +// User extension instruction dmulh11 +extern long dmulh11(long,long); +#pragma intrinsic(dmulh11,opcode=>6,sub_opcode=>48,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh11 +extern long dmulh11_f(long,long); +#pragma intrinsic(dmulh11_f,opcode=>6,sub_opcode=>48, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh12 +extern long dmulh12(long,long); +#pragma intrinsic(dmulh12,opcode=>6,sub_opcode=>49,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh12 +extern long dmulh12_f(long,long); +#pragma intrinsic(dmulh12_f,opcode=>6,sub_opcode=>49, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh21 +extern long dmulh21(long,long); +#pragma intrinsic(dmulh21,opcode=>6,sub_opcode=>50,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; 
auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh21 +extern long dmulh21_f(long,long); +#pragma intrinsic(dmulh21_f,opcode=>6,sub_opcode=>50, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh22 +extern long dmulh22(long,long); +#pragma intrinsic(dmulh22,opcode=>6,sub_opcode=>51,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dmulh22 +extern long dmulh22_f(long,long); +#pragma intrinsic(dmulh22_f,opcode=>6,sub_opcode=>51, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh11 +extern long daddh11(long,long); +#pragma intrinsic(daddh11,opcode=>6,sub_opcode=>52,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh11 +extern long daddh11_f(long,long); +#pragma intrinsic(daddh11_f,opcode=>6,sub_opcode=>52, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh12 +extern long daddh12(long,long); +#pragma intrinsic(daddh12,opcode=>6,sub_opcode=>53,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh12 +extern long daddh12_f(long,long); +#pragma intrinsic(daddh12_f,opcode=>6,sub_opcode=>53, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh21 +extern long daddh21(long,long); +#pragma intrinsic(daddh21,opcode=>6,sub_opcode=>54,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh21 +extern long daddh21_f(long,long); +#pragma intrinsic(daddh21_f,opcode=>6,sub_opcode=>54, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh22 +extern long daddh22(long,long); +#pragma intrinsic(daddh22,opcode=>6,sub_opcode=>55,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction daddh22 +extern long daddh22_f(long,long); +#pragma intrinsic(daddh22_f,opcode=>6,sub_opcode=>55, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh11 
+extern long dsubh11(long,long); +#pragma intrinsic(dsubh11,opcode=>6,sub_opcode=>56,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh11 +extern long dsubh11_f(long,long); +#pragma intrinsic(dsubh11_f,opcode=>6,sub_opcode=>56, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh12 +extern long dsubh12(long,long); +#pragma intrinsic(dsubh12,opcode=>6,sub_opcode=>57,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh12 +extern long dsubh12_f(long,long); +#pragma intrinsic(dsubh12_f,opcode=>6,sub_opcode=>57, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh21 +extern long dsubh21(long,long); +#pragma intrinsic(dsubh21,opcode=>6,sub_opcode=>58,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh21 +extern long dsubh21_f(long,long); +#pragma intrinsic(dsubh21_f,opcode=>6,sub_opcode=>58, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh22 +extern long dsubh22(long,long); +#pragma intrinsic(dsubh22,opcode=>6,sub_opcode=>59,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dsubh22 +extern long dsubh22_f(long,long); +#pragma intrinsic(dsubh22_f,opcode=>6,sub_opcode=>59, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dexcl1 +extern long dexcl1(long,long); +#pragma intrinsic(dexcl1,opcode=>6,sub_opcode=>60, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + +// User extension instruction dexcl2 +extern long dexcl2(long,long); +#pragma intrinsic(dexcl2,opcode=>6,sub_opcode=>61, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") + + +#endif + + +]]> + + + + + diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index eb890ef1999..d6b6d604ac7 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -5,6 +5,16 @@ ifeq ($(TARGET_ARCH), arc) AR_TOOL = arac CXX_TOOL = ccac +ifeq ($(TARGET), iotdk) + TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/iotdk/iotdk.tcf + LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/iotdk/iotdk.lcf +endif + +ifeq ($(TARGET), emsdp) + 
TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf + LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf +endif + ifneq ($(TCF_FILE), ) TARGET = $(basename $(notdir $(TCF_FILE))) else @@ -25,6 +35,11 @@ endif PLATFORM_FLAGS += -tcf_core_config PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -default_lcf=$(MAKEFILE_DIR)/targets/arc/memory.lcf +ifneq ($(LCF_FILE), ) + PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) + THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) +endif + CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) CCFLAGS += $(PLATFORM_FLAGS) From ced5b5bebb526e3e08804f4ccf49b530b9098c31 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 11 Mar 2020 14:47:28 +0300 Subject: [PATCH 0069/1533] Updated LCF for EMSDP and fixes for arc build process --- .../micro/tools/make/download_and_extract.sh | 2 +- .../tools/make/targets/arc/emsdp/emsdp.lcf | 51 ++++++++++++------- .../micro/tools/make/targets/arc_makefile.inc | 2 +- 3 files changed, 35 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 2248031f6d1..4a75b6b24cd 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -90,7 +90,7 @@ patch_cifar10_dataset() { } build_embarc_mli() { - gmake -j 4 -C ${1}/lib/make TCF_FILE=${2} + make -j 4 -C ${1}/lib/make TCF_FILE=${2} } # Main function handling the download, verify, extract, and patch process. diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf index fc34759d745..d2d1b4220f8 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -5,43 +5,58 @@ # due to CCM memory wrapping into upper addresses beyond its size MEMORY { - IVT : ORIGIN = 0x00000000, LENGTH = 0x60000000 - ICCM0 : ORIGIN = 0x60000000, LENGTH = 0x00020000 + PSRAM : ORIGIN = 0x10000000, LENGTH = 0x01000000 + SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000 + IVT : ORIGIN = 0x60000000, LENGTH = 0x400 + ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400) # CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000 -# SYSTEM1 : ORIGIN = 0x70000000, LENGTH = 0x10000000 DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 # CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000 # CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000 YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000 # CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000 - SYSTEM2 : ORIGIN = 0xb0000000, LENGTH = 0x50000000 } + SECTIONS { - GROUP BLOCK(4): { - .text? : { *('.text$crt*') } - * (TEXT): {} - * (LIT): {} - } > ICCM0 + + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ .sdata?: {} .sbss?: {} * (DATA): {} * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > SYSTEM2 + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:16K): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} + } > DCCM + + GROUP BLOCK(4): { + .rodata_in_data? 
: {} + } > PSRAM + GROUP BLOCK(4): { .Xdata? : {} - } > XCCM + } > XCCM + GROUP BLOCK(4): { .Ydata? : {} - } > YCCM - GROUP BLOCK(4) : { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) - } > IVT + } > YCCM + + GROUP BLOCK(4): { + .Zdata? : {} + } > DCCM + + } diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index d6b6d604ac7..29ad5f5347a 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -31,7 +31,7 @@ else TCF_FILE_NAME = $(TCF_FILE) endif - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections PLATFORM_FLAGS += -tcf_core_config PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -default_lcf=$(MAKEFILE_DIR)/targets/arc/memory.lcf From 503f98f88c2d8a7a636ef4ed920e059196ac9b09 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 30 Mar 2020 18:08:12 +0300 Subject: [PATCH 0070/1533] ARC EMSDP board specific debug log --- tensorflow/lite/micro/emsdp/debug_log.cc | 108 ++++++++++++++++++ .../tools/make/targets/arc/emsdp/emsdp.lcf | 36 +++--- 2 files changed, 127 insertions(+), 17 deletions(-) create mode 100644 tensorflow/lite/micro/emsdp/debug_log.cc diff --git a/tensorflow/lite/micro/emsdp/debug_log.cc b/tensorflow/lite/micro/emsdp/debug_log.cc new file mode 100644 index 00000000000..7d932939a0b --- /dev/null +++ b/tensorflow/lite/micro/emsdp/debug_log.cc @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/debug_log.h" + +#include <cstdint> +#include <cstdio> +#include <cstring> + +// Print to debug console by default. Define one of the following to extend the set of destinations: // EMSDP_LOG_TO_MEMORY +// : fill .debug_log memory region (data section) with passed chars. +// EMSDP_LOG_TO_HOST +// : Use hostlink to print output log. +// EMSDP_LOG_TO_UART +// : use default debug UART (out to FTDI channel 0). The same USB Port is used for JTAG. +#define EMSDP_LOG_TO_UART + + +// For simplicity we assume U-boot has already initialized the debug console during +// application loading (or on reset). Hence we use only the status and data registers +// to organize a blocking loop for printing characters. No input and no IRQ handling. +// See the embarc_osp repository for a full EMSDP UART driver. +// TODO: Consider U-Boot API to do it in a less "hacky" way.
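+// A minimal usage sketch (the call shown is an arbitrary example, assuming the
+// default EMSDP_LOG_TO_UART destination): a call such as
+//   DebugLog("hello from EMSDP\r\n");
+// enters DebugLog() at the bottom of this file, which dispatches to the blocking
+// DbgUartSendStr() loop below and returns only once the UART has accepted every
+// character.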
+void DbgUartSendStr(const char* s) { +#define EMSDP_DBG_UART_BASE (0xF0004000U) +#define DW_UART_CPR_FIFO_STAT (1<<10) +#define DW_UART_USR_TFNF (0x02) +#define DW_UART_LSR_TXD_EMPTY (0x20) + + typedef volatile struct dw_uart_reg { + uint32_t DATA; /*!< data in/out and DLL */ + uint32_t RES1[4]; + uint32_t LSR; /*!< Line Status Register */ + uint32_t RES2[25]; + uint32_t USR; /*!< UART status register */ + uint32_t RES3[29]; + uint32_t CPR; /*!< Component parameter register */ + } DW_UART_REG; + + DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); + const char* src = s; + while (*src) { + // Check uart status to send char + bool uart_is_ready = false; + if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT) + uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0); + else + uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0); + + // Send char if uart is ready. + if (uart_is_ready) + uart_reg_ptr->DATA = *src++; + } +} + +// Simple character dump to a pre-allocated memory region. +// The memory region is filled as a ring buffer and can be +// viewed/read with a debugger afterward. +void LogToMem(const char* s) { + constexpr int kDebugLogMemChars = 2 * 1024; + static int cursor = 0; +#pragma Bss(".debug_log") + volatile static char debug_log_mem[kDebugLogMemChars]; +#pragma Bss() + + const char* src = s; + while (*src) { + debug_log_mem[cursor] = *src++; + cursor = (cursor < kDebugLogMemChars - 1) ? cursor + 1 : 0; + } + debug_log_mem[cursor] = '^'; +} + + +extern "C" void DebugLog(const char* s) { +#ifndef TF_LITE_STRIP_ERROR_STRINGS + +#if defined EMSDP_LOG_TO_UART + DbgUartSendStr(s); +#endif + +#if defined EMSDP_LOG_TO_MEMORY +#warning "EMSDP_LOG_TO_MEMORY is defined. View .debug_log memory region for stdout" + LogToMem(s); +#endif + +#if defined EMSDP_LOG_TO_HOST +#warning "EMSDP_LOG_TO_HOST is defined. Ensure hostlib is linked." + fprintf(stderr, "%s", s); +#endif + +#endif // TF_LITE_STRIP_ERROR_STRINGS +} + + diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf index d2d1b4220f8..d17c807e250 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -5,7 +5,7 @@ # due to CCM memory wrapping into upper addresses beyond its size MEMORY { - PSRAM : ORIGIN = 0x10000000, LENGTH = 0x01000000 + PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400 SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000 IVT : ORIGIN = 0x60000000, LENGTH = 0x400 ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400) @@ -31,19 +31,11 @@ SECTIONS { } > ICCM0 GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:16K): {} + .Zdata? : {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} } > DCCM - - GROUP BLOCK(4): { - .rodata_in_data? : {} - } > PSRAM - + GROUP BLOCK(4): { .Xdata? : {} } > XCCM @@ -53,10 +45,20 @@ SECTIONS { } > YCCM GROUP BLOCK(4): { - .Zdata? : {} - } > DCCM - - - } + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + } > PSRAM + + GROUP BLOCK(4): { + .rodata_in_data? : {} + } > PSRAM + + GROUP BLOCK(4): { + .debug_log? 
: {} + } > SRAM +} From 2d8e1a45ec34649d216566514d7c062ae985023a Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 1 Apr 2020 17:33:16 +0300 Subject: [PATCH 0071/1533] ARC EMSDB Board integration: Project generation --- .../micro/tools/make/helper_functions.inc | 32 ++-- .../tools/make/targets/arc/emsdp/uboot.env | Bin 0 -> 4096 bytes .../tools/make/targets/emsdp_makefile.inc | 155 ++++++++++++++++++ .../make/templates/arc/arc_app_makefile.tpl | 134 +++++++++++++++ 4 files changed, 307 insertions(+), 14 deletions(-) create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/emsdp/uboot.env create mode 100644 tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc create mode 100644 tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index a7f9bd788e3..0c398be2118 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -130,31 +130,35 @@ endef define generate_arc_project ifeq ($(TARGET_ARCH), arc) -$(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/Makefile.tpl + +$(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl @mkdir -p $$(dir $$@) @sed -E 's#\%\{SRCS\}\%#$(4)#g' $$< | \ - sed -E '1 i\CC = ccac\nCXX = ccac\nLD = ccac\n' | \ + sed -E 's#\%\{CC\}\%#$(CC_TOOL)#g' | \ + sed -E 's#\%\{CXX\}\%#$(CXX_TOOL)#g' | \ + sed -E 's#\%\{LD\}\%#$(LD_TOOL)#g' | \ sed -E 's#\%\{EXECUTABLE\}\%#$(3).elf#g' | \ sed -E 's#\%\{LINKER_FLAGS\}\%#$(6)#g' | \ sed -E 's#\%\{CXX_FLAGS\}\%#$(7)#g' | \ - sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' > $$@ + sed -E 's#\%\{CC_FLAGS\}\%#$(8)#g' | \ + sed -E 's#\%\{EXTRA_APP_SETTINGS\}\%#$(ARC_EXTRA_APP_SETTINGS)#g' | \ + sed -E 's#\%\{EXTRA_APP_RULES\}\%#$(ARC_EXTRA_APP_RULES)#g' | \ + sed -E 's#\%\{BIN_DEPEND\}\%#$(ARC_BIN_DEPEND)#g' | \ + sed -E 's#\%\{BIN_RULE\}\%#$(ARC_BIN_RULE)#g' | \ + sed -E 's#\%\{EXTRA_RM_TARGETS\}\%#$(ARC_EXTRA_RM_TARGETS)#g' | \ + sed -E 's#\%\{APP_RUN_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \ + sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \ + sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@ -# Special rule to copy TCF in case the local filesystem file name has been defined -ifneq ($(TCF_FILE_NAME), ) -$(PRJDIR)$(3)/$(1)/$(TCF_FILE_NAME): $(TCF_FILE) - @cp $$< $$@ -endif - -# Special rule to copy LCF in case the local filesystem file name has been defined -ifneq ($(LCF_FILE), ) -$(PRJDIR)$(3)/$(1)/$(notdir $(LCF_FILE)): $(LCF_FILE) - @cp $$< $$@ -endif +$(foreach var,$(ARC_TARGET_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var)))) endif endef + + + # Creates a set of rules to build a standalone Arduino project for an # executable, including all of the source and header files required in a # separate folder and a simple makefile. 
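For reference, a sketch of the kind of fragment this substitution is expected to leave in a generated standalone project Makefile. The values shown are illustrative only and assume the EMSDP settings used elsewhere in this series (CC_TOOL, CXX_TOOL and LD_TOOL all set to ccac, with emsdp_em11d_dfss.tcf and emsdp.lcf as the TCF/LCF files); the real output depends on the flags passed into generate_arc_project:

# Illustrative fragment of a generated project Makefile after placeholder substitution
CC = ccac        # substituted for %{CC}%
CXX = ccac       # substituted for %{CXX}%
LD = ccac        # substituted for %{LD}%
LDFLAGS += -tcf=emsdp_em11d_dfss.tcf -Hnocopyr -m -Hldopt=-Coutput=memory.map emsdp.lcf  # from %{LINKER_FLAGS}%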
diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/uboot.env b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/uboot.env new file mode 100644 index 0000000000000000000000000000000000000000..c336b6c8733f90b8fbaab75fc705f787ad141607 GIT binary patch literal 4096 zcmeIuOKQU~5C&i$Wfmp06q}LLM+gW?4^X=4y2yGYqOxUVIrKr+y;jfB(=?N$l+ZwX zfYC}b9{*y#kI(n}MvW$^Y^4yFHDb}kn00GppL~~Xw}Qz8oXR0818u7T;pB}9WlUOc z?lkXwuxlEm!UneuF*c|dDYkbSkgD%1`shs7vCAn2%8&w`bx1sK<=0Jcv zXuZL|?k0>@dTGIhaadCR&ztmOaS$oTlE|vYcg4yk`B2_&N~lrHDk0$XF0P=y-quh8 n&gS)WKD~l-6o_X99~NK%7GMDuU;!3j0Ty5Z7GMDu_?N&JGvbmg literal 0 HcmV?d00001 diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc new file mode 100644 index 00000000000..c7286329651 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc @@ -0,0 +1,155 @@ +# Settings for arc processors +ifeq ($(TARGET), emsdp) + + TARGET_ARCH = arc + + CC_TOOL = ccac + AR_TOOL = arac + CXX_TOOL = ccac + LD_TOOL = ccac + + DLR = $$$$ + ARC_EXTRA_APP_SETTINGS = \ + BIN_DIR = .$(DLR)\(PS\)bin\n\ + BIN_FILE = $(DLR)\(BIN_DIR\)$(DLR)\(PS\)app.elf\n + + ARC_EXTRA_APP_RULES = \ + $(DLR)\(BIN_FILE\): $(DLR)\(BIN_DIR\) $(DLR)\(OUT_NAME\)\ + \n\t\@$(DLR)\(CP\) $(DLR)\(OUT_NAME\) $(DLR)\(BIN_FILE\)\ + \n\t\@$(DLR)\(CP\) uboot.env $(DLR)\(BIN_DIR\)$(DLR)\(PS)uboot.env\ + \n \ + \n$(DLR)\(BIN_DIR\):\ + \n\t\@$(DLR)\(MKDIR\) $(DLR)\(BIN_DIR\)\ + + + ARC_EXTRA_RM_TARGETS = $(DLR)\(BIN_DIR\) + + ARC_BIN_DEPEND = $(DLR)\(BIN_DIR\) $(DLR)\(BIN_FILE\) + ARC_BIN_RULE = \t@echo Copy content of $(DLR)\(BIN_DIR\) into the root of SD card and follow instructions + + ARC_APP_RUN_CMD = mdb -run -digilent -nooptions $(DLR)\(DBG_ARGS) + ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS) + ARC_EXTRA_EXECUTE_RULES = + + + + TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf + LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf + + MAKE_PROJECT_FILES += emsdp_em11d_dfss.tcf emsdp.lcf uboot.env + + ARC_TARGET_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp $(PWD)/$(MAKEFILE_DIR)/targets/arc + # ARC_TARGET_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp + +# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. +# This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. + TCF_FILE_NAME = $(notdir $(TCF_FILE)) + + THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) + + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS += -tcf_core_config + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map + +# DMITRYZ: I think we need to move it to target specific LCF file. + PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) + # THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) + + CXXFLAGS += $(PLATFORM_FLAGS) + CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) + CCFLAGS += $(PLATFORM_FLAGS) + CCFLAGS:=$(filter-out -std=c11,$(CCFLAGS)) + LDFLAGS += $(PLATFORM_LDFLAGS) + + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + + # DMITRYZ: Here we need to check tags on "no_embarc_mli". 
+ USE_EMBARC_MLI ?= true + +ifeq ($(USE_EMBARC_MLI), true) + ALL_TAGS += arc + +ifeq ($(PRE_COMPILED_MLI),true) + $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) + + MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include + MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a + + THIRD_PARTY_CC_HDRS += \ + third_party/embarc_osp/LICENSE +else + MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) + + $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +endif + + THIRD_PARTY_CC_HDRS += $(MLI_LIB) + GENERATED_PROJECT_LIBS += $(MLI_LIB) + + INCLUDES += \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api + + GENERATED_PROJECT_INCLUDES += \ + -I. \ + -I./third_party/$(MLI_INCLUDE_FOLDER) \ + -I./third_party/$(MLI_INCLUDE_FOLDER)/api + + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf + +endif # USE_EMBARC_MLI + +# We overwrite project generator to exclude everything not relevant to ARC platform +define generate_microlite_projects +$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +endef + +# Copy rule generator to do file copyes with changing paths in generated project +# Arguments are: +# 1 - Path files in generated project. 
+# 2 - Path files in the source repo +# Used in helper_functions.inc for arc projects to copy files +define path_changing_copy_file +$(1)/%: $(2)/% + @mkdir -p $$(dir $$@) + @cp $$< $$@ +endef + +$(foreach var,$(ARC_TARGET_DIRS),$(eval $(call copy_arc_project_file,$(PRJDIR)$(3)/$(1),$(var)))) + +# These are microcontroller-specific rules for converting the ELF output +# of the linker into a binary image that can be loaded directly. + +# Not applicable for ARC, leaving it empty. +$(BINDIR)%.bin: + +endif diff --git a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl new file mode 100644 index 00000000000..5bbcb7d3f71 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl @@ -0,0 +1,134 @@ +#============================================================= +# OS-specific definitions +#============================================================= +COMMA=, +OPEN_PAREN=( +CLOSE_PAREN=) +BACKSLASH=\$(nullstring) +ifneq ($(ComSpec)$(COMSPEC),) + O_SYS=Windows + RM=del /F /Q + MKDIR=mkdir + CP=copy /Y + TYPE=type + PS=$(BACKSLASH) + Q= + coQ=\$(nullstring) + fix_platform_path = $(subst /,$(PS), $(1)) + DEV_NULL = nul +else + O_SYS=Unix + RM=rm -rf + MKDIR=mkdir -p + CP=cp + TYPE=cat + PS=/ + Q=$(BACKSLASH) + coQ= + fix_platform_path=$(1) + DEV_NULL=/dev/null +endif + +# Note: Windows escaping rules are very cumbersome. +# Initially we tried to use Q=^, but its effect depends on the context and (it seems) on the Windows version. +# An especially ugly detail is that inside quoted strings the quotes themselves remain. +# Batch has special parameter expansion syntax to remove quotes, +# but many tools themselves remove quotes (unless escaped with backslash). +# In the end we found that for our use cases we do not need to escape any symbols, only prepend backslashes before quotes.
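+# A rough usage example for the quote macro defined just below (hypothetical
+# input): on Windows, where Q is empty, $(call quote,say "hello") should expand
+# to say \"hello\", i.e. only the double quotes get a backslash prepended.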
+ +quote=$(subst %,$(Q)%, \ + $(subst &,$(Q)&, \ + $(subst <,$(Q)<, \ + $(subst >,$(Q)>, \ + $(subst |,$(Q)|, \ + $(subst ',$(Q)', \ + $(subst $(COMMA),$(Q)$(COMMA), \ + $(subst =,$(Q)=, \ + $(subst $(OPEN_PAREN),$(Q)$(OPEN_PAREN), \ + $(subst $(CLOSE_PAREN),$(Q)$(CLOSE_PAREN), \ + $(subst !,$(Q)!, \ + $(subst ",$(BACKSLASH)", \ + $(subst $(Q),$(Q)$(Q), \ + $(1) ))))))))))))) + +#============================================================= +# Toolchain definitions +#============================================================= +CC = %{CC}% +CXX = %{CXX}% +LD = %{LD}% + + +#============================================================= +# Applications settings +#============================================================= +OUT_NAME = %{EXECUTABLE}% + +DBG_ARGS ?= + +RUN_ARGS ?= + +CXXFLAGS += %{CXX_FLAGS}% + +CCFLAGS += %{CC_FLAGS}% + +LDFLAGS += %{LINKER_FLAGS}% + +%{EXTRA_APP_SETTINGS}% + + +#============================================================= +# Files and directories +#============================================================= +SRCS := \ +%{SRCS}% + +OBJS := \ +$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS))) + + +#============================================================= +# Common rules +#============================================================= +.PHONY: all app flash clean run debug + +%.o: %.cc + $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + +%.o: %.c + $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ + +$(OUT_NAME): $(OBJS) + $(LD) $(CXXFLAGS) -o $@ -Ccrossref $(OBJS) $(LDFLAGS) + +%{EXTRA_APP_RULES}% + + +#================================================================= +# Global rules +#================================================================= +all: $(OUT_NAME) + +app: $(OUT_NAME) + +flash: %{BIN_DEPEND}% +%{BIN_RULE}% + +clean: + -@$(RM) $(call fix_platform_path,$(OBJS)) + -@$(RM) $(OUT_NAME) %{EXTRA_RM_TARGETS}% + +#================================================================= +# Execution rules +#================================================================= + +APP_RUN := %{APP_RUN_CMD}% +APP_DEBUG := %{APP_DEBUG_CMD}% + +run: $(OUT_NAME) + $(APP_RUN) $(OUT_NAME) $(RUN_ARGS) + +debug: $(OUT_NAME) + $(APP_DEBUG) $(OUT_NAME) $(RUN_ARGS) + +%{EXTRA_EXECUTE_RULES}% From 1977bd0442998f7a1d8724d54e5a892d9df0daba Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 2 Apr 2020 15:52:03 +0300 Subject: [PATCH 0072/1533] Update project generation for custom ARC target (*.tcf) --- .../micro/tools/make/helper_functions.inc | 2 +- .../micro/tools/make/targets/arc_makefile.inc | 85 ++++++++++++++----- .../tools/make/targets/emsdp_makefile.inc | 15 ++-- 3 files changed, 71 insertions(+), 31 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 0c398be2118..0e21e02bc07 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -151,7 +151,7 @@ $(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_ sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@ -$(foreach var,$(ARC_TARGET_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var)))) +$(foreach var,$(ARC_TARGET_FILES_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var)))) endif endef diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 29ad5f5347a..e6505cd187b 100644 --- 
a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -1,19 +1,12 @@ -# Settings for arc processors +# Settings for not pre-defined ARC processors. +# User need to specify ARC target with Tool Configuration File (*.tcf). +# Path to this file must be passed through TCF_FILE variable. +# Otherwise, default em7d_voice_audio configuration is used + ifeq ($(TARGET_ARCH), arc) - CC_TOOL = ccac - AR_TOOL = arac - CXX_TOOL = ccac - -ifeq ($(TARGET), iotdk) - TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/iotdk/iotdk.tcf - LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/iotdk/iotdk.lcf -endif - -ifeq ($(TARGET), emsdp) - TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf - LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf -endif +# Known target are specifyed with their own make configurations. +ifeq ($(filter $(TARGET), emsdp iotdk),) ifneq ($(TCF_FILE), ) TARGET = $(basename $(notdir $(TCF_FILE))) @@ -26,30 +19,61 @@ endif # This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. ifneq (,$(findstring .tcf,$(TCF_FILE))) TCF_FILE_NAME = $(notdir $(TCF_FILE)) - THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) + ARC_TARGET_FILES_DIRS := $(dir $(TCF_FILE)) + MAKE_PROJECT_FILES += $(TCF_FILE_NAME) else TCF_FILE_NAME = $(TCF_FILE) endif - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - PLATFORM_FLAGS += -tcf_core_config - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -default_lcf=$(MAKEFILE_DIR)/targets/arc/memory.lcf + CC_TOOL = ccac + AR_TOOL = arac + CXX_TOOL = ccac + LD_TOOL = ccac + + # TODO: Move this to a common arc/arc_common.inc file to share this with other targets + DLR = $$$$ + ARC_EXTRA_APP_SETTINGS = + ARC_EXTRA_APP_RULES = + + ARC_EXTRA_RM_TARGETS = + + ARC_BIN_DEPEND = + ARC_BIN_RULE = \t$(DLR)\(error Flash rule isnt defined for this ARC target\) + + ARC_APP_RUN_CMD = mdb -run -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS) + ARC_APP_DEBUG_CMD = mdb -OK -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS) + ARC_EXTRA_EXECUTE_RULES = + + + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) + PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS += -tcf_core_config + + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) + PLATFORM_LDFLAGS = -Hnocopyr -m -Hldopt=-Coutput=memory.map ifneq ($(LCF_FILE), ) PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) - THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) + MAKE_PROJECT_FILES += $(notdir $(LCF_FILE)) +ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir LCF_FILE)),) + ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) +endif endif CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) CCFLAGS += $(PLATFORM_FLAGS) + CCFLAGS:=$(filter-out -std=c11,$(CCFLAGS)) LDFLAGS += $(PLATFORM_LDFLAGS) + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + # TODO: Move/organize embarc_mli usage in an implied way (see ext_libs/cmsis.inc for example USE_EMBARC_MLI ?= true ifeq ($(USE_EMBARC_MLI), true) + # TODO: To understand why it's done here. The same is performed in the higher level MakeFile. 
ALL_TAGS += arc ifeq ($(PRE_COMPILED_MLI),true) @@ -110,10 +134,29 @@ endif endif # USE_EMBARC_MLI +# We overwrite project generator to exclude everything not relevant to ARC platform +define generate_microlite_projects +$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +endef + +# Copy rule generator to do file copyes with changing paths in generated project +# Arguments are: +# 1 - Path files in generated project. +# 2 - Path files in the source repo +# Used in helper_functions.inc for arc projects to copy files +define path_changing_copy_file +$(1)/%: $(2)/% + @mkdir -p $$(dir $$@) + @cp $$< $$@ +endef + # These are microcontroller-specific rules for converting the ELF output # of the linker into a binary image that can be loaded directly. - # Not applicable for ARC, leaving it empty. $(BINDIR)%.bin: -endif + +endif # ifeq ($(filter $(TARGET),$(ARC_PREDEFINED_TARGETS)),) +endif # ifeq ($(TARGET_ARCH), arc) + diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc index c7286329651..aeeb7fc178f 100644 --- a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc @@ -21,7 +21,6 @@ ifeq ($(TARGET), emsdp) \n$(DLR)\(BIN_DIR\):\ \n\t\@$(DLR)\(MKDIR\) $(DLR)\(BIN_DIR\)\ - ARC_EXTRA_RM_TARGETS = $(DLR)\(BIN_DIR\) ARC_BIN_DEPEND = $(DLR)\(BIN_DIR\) $(DLR)\(BIN_FILE\) @@ -31,21 +30,19 @@ ifeq ($(TARGET), emsdp) ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS) ARC_EXTRA_EXECUTE_RULES = - - TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf MAKE_PROJECT_FILES += emsdp_em11d_dfss.tcf emsdp.lcf uboot.env - ARC_TARGET_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp $(PWD)/$(MAKEFILE_DIR)/targets/arc - # ARC_TARGET_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp + ARC_TARGET_FILES_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp +# TODO: LESS TCF/LCF Variables # The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. # This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. TCF_FILE_NAME = $(notdir $(TCF_FILE)) - THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) +# THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections PLATFORM_FLAGS += -tcf_core_config @@ -53,7 +50,7 @@ ifeq ($(TARGET), emsdp) # DMITRYZ: I think we need to move it to target specific LCF file. 
PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) - # THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) +# THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) @@ -133,7 +130,7 @@ $(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(T $(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) endef -# Copy rule generator to do file copyes with changing paths in generated project +# Copy rule generator to do file copyes changing paths in generated project # Arguments are: # 1 - Path files in generated project. # 2 - Path files in the source repo @@ -144,7 +141,7 @@ $(1)/%: $(2)/% @cp $$< $$@ endef -$(foreach var,$(ARC_TARGET_DIRS),$(eval $(call copy_arc_project_file,$(PRJDIR)$(3)/$(1),$(var)))) + # These are microcontroller-specific rules for converting the ELF output # of the linker into a binary image that can be loaded directly. From 984457fd69a2615db8f2d1e5c5848b3b3c7ef27f Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 3 Apr 2020 11:41:58 +0300 Subject: [PATCH 0073/1533] Update platform flags and debug command template --- .../micro/tools/make/helper_functions.inc | 2 +- .../micro/tools/make/targets/arc_makefile.inc | 10 +++++---- .../tools/make/targets/emsdp_makefile.inc | 22 +++++++++++++------ .../make/templates/arc/arc_app_makefile.tpl | 6 +++-- 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 0e21e02bc07..8d321d42490 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -147,7 +147,7 @@ $(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_ sed -E 's#\%\{BIN_RULE\}\%#$(ARC_BIN_RULE)#g' | \ sed -E 's#\%\{EXTRA_RM_TARGETS\}\%#$(ARC_EXTRA_RM_TARGETS)#g' | \ sed -E 's#\%\{APP_RUN_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \ - sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_RUN_CMD)#g' | \ + sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_DEBUG_CMD)#g' | \ sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@ diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index e6505cd187b..1b30e6ac6d0 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -46,12 +46,14 @@ endif ARC_EXTRA_EXECUTE_RULES = - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) - PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - PLATFORM_FLAGS += -tcf_core_config + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config + PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + + # Use compact CRT. 
It requires pre-defined heap size + PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) - PLATFORM_LDFLAGS = -Hnocopyr -m -Hldopt=-Coutput=memory.map + PLATFORM_LDFLAGS += -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K ifneq ($(LCF_FILE), ) PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) MAKE_PROJECT_FILES += $(notdir $(LCF_FILE)) diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc index aeeb7fc178f..86e9d9e7379 100644 --- a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc @@ -33,24 +33,32 @@ ifeq ($(TARGET), emsdp) TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf - MAKE_PROJECT_FILES += emsdp_em11d_dfss.tcf emsdp.lcf uboot.env + MAKE_PROJECT_FILES += $(notdir $(TCF_FILE)) $(notdir $(LCF_FILE)) uboot.env - ARC_TARGET_FILES_DIRS := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp + ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE)) +ifneq ($(dir $(TCF_FILE)), $(dir $(LCF_FILE))) + ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) +endif # TODO: LESS TCF/LCF Variables # The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. # This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. TCF_FILE_NAME = $(notdir $(TCF_FILE)) -# THIRD_PARTY_CC_HDRS += $(TCF_FILE_NAME) + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config + PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + + # Use compact CRT. It requires pre-defined heap size + PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset + + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -O3 -Hpurge -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - PLATFORM_FLAGS += -tcf_core_config - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map + # for default EMSD configuration we can use defaul em9d rt libs + # for better performance runime should be rebuilt for emsdp configuration + PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio # DMITRYZ: I think we need to move it to target specific LCF file. 
PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) -# THIRD_PARTY_CC_HDRS += $(notdir $(LCF_FILE)) CXXFLAGS += $(PLATFORM_FLAGS) CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) diff --git a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl index 5bbcb7d3f71..f79d04b26d1 100644 --- a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl +++ b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl @@ -68,6 +68,8 @@ DBG_ARGS ?= RUN_ARGS ?= +EXT_CFLAGS ?= + CXXFLAGS += %{CXX_FLAGS}% CCFLAGS += %{CC_FLAGS}% @@ -93,10 +95,10 @@ $(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(SRCS))) .PHONY: all app flash clean run debug %.o: %.cc - $(CXX) $(CXXFLAGS) $(INCLUDES) -c $< -o $@ + $(CXX) $(CXXFLAGS) $(EXT_CFLAGS) $(INCLUDES) -c $< -o $@ %.o: %.c - $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ + $(CC) $(CCFLAGS) $(EXT_CFLAGS) $(INCLUDES) -c $< -o $@ $(OUT_NAME): $(OBJS) $(LD) $(CXXFLAGS) -o $@ -Ccrossref $(OBJS) $(LDFLAGS) From 7c15ad0e98c1ba9234117fb160c082ef11108b46 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 3 Apr 2020 16:10:34 +0300 Subject: [PATCH 0074/1533] ARC platform common make parts was moved to a separate file --- .../tools/make/targets/arc/arc_common.inc | 185 ++++++++++++++++++ .../micro/tools/make/targets/arc_makefile.inc | 151 +------------- .../tools/make/targets/emsdp_makefile.inc | 148 ++------------ 3 files changed, 207 insertions(+), 277 deletions(-) create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc new file mode 100644 index 00000000000..e20887abb07 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -0,0 +1,185 @@ +# Common Settings for ARC platform and it's projects. +# Might be reused across different targets + +ifeq ($(TARGET_ARCH), arc) + + DLR := $$$$ + + # List of folders to search project files for copy with path changing + # For instance, TCF and LCF files are copyed into the root of generated project + ARC_TARGET_FILES_DIRS ?= + + # For the following variables see arc_app_makefile.tpl for usage + + # Additional text into application settings section of arc makefile project + ARC_EXTRA_APP_SETTINGS ?= + + # Additional text into application general rules of arc makefile project + ARC_EXTRA_APP_RULES ?= + + # additional arguments for RM command of "clean" target rule ("make clean" command) + ARC_EXTRA_RM_TARGETS ?= + + # Dependencies of "flash" target rule ("make flash" command) + ARC_BIN_DEPEND ?= + + # Commands in "flash" target rule ("make flash" command) + ARC_BIN_RULE ?= \t$(DLR)\(error Flash rule isnt defined for this ARC target\) + + # Command to run app on "make run" command of generated project + ARC_APP_RUN_CMD ?= + + # Command to run app on "make debug" command of generated project + ARC_APP_DEBUG_CMD ?= + + # Additional text into application execution rules of arc makefile project + ARC_EXTRA_EXECUTE_RULES ?= + +# We overwrite project generator to exclude everything not relevant to ARC platform. 
+# ARC targets doesn't can't work with mbed, keil or other architecture specific development tools +# Basic make project is updated to be applicable for general ARC platform +define generate_microlite_projects +$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +endef + +# Copy rule generator to do file copyes with changing paths in generated project +# Arguments are: +# 1 - Path files in generated project. +# 2 - Path files in the source repo +# Used in helper_functions.inc for arc projects to copy files +define path_changing_copy_file +$(1)/%: $(2)/% + @mkdir -p $$(dir $$@) + @cp $$< $$@ +endef + +# These are microcontroller-specific rules for converting the ELF output +# of the linker into a binary image that can be loaded directly. +# Not applicable for ARC, leaving it empty. +$(BINDIR)%.bin: + + +ifeq ($(ARC_TOOLCHAIN), mwdt) + CC_TOOL := ccac + AR_TOOL := arac + CXX_TOOL := ccac + LD_TOOL := ccac + + ARC_APP_RUN_CMD = mdb -run -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS\) + ARC_APP_DEBUG_CMD = mdb -OK -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS\) + + # The variable TCF_FILE stores path to Tool Configuration File (*.tcf). + # This file is used by MWDT toolchain to properly compile/run code + TCF_FILE ?= + + LCF_FILE ?= + +# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), +# this variable is used later to add the option to the linker/compiler flags. +# This condition also handles the case when the user/makefile specifies +# the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. +ifneq (,$(findstring .tcf,$(TCF_FILE))) + TCF_FILE_NAME = $(notdir $(TCF_FILE)) + ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE)) + MAKE_PROJECT_FILES += $(TCF_FILE_NAME) +else + TCF_FILE_NAME = $(TCF_FILE) +endif + + PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config + + PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + + # Use compact CRT. It requires pre-defined heap size + PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset + + PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) + + PLATFORM_LDFLAGS += -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K + +ifneq ($(LCF_FILE), ) + PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) + MAKE_PROJECT_FILES += $(notdir $(LCF_FILE)) +ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(LCF_FILE))),) + ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) +endif +endif + + CXXFLAGS := $(filter-out -std=c++11,$(CXXFLAGS)) + CCFLAGS := $(filter-out -std=c11,$(CCFLAGS)) + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + + CXXFLAGS += $(PLATFORM_FLAGS) + CCFLAGS += $(PLATFORM_FLAGS) + LDFLAGS += $(PLATFORM_LDFLAGS) + + + # TODO: Move/organize embarc_mli usage in an implied way (see ext_libs/cmsis.inc for example + USE_EMBARC_MLI ?= true + +ifeq ($(USE_EMBARC_MLI), true) + # TODO: To understand why it's done here. The same is performed in the higher level MakeFile. 
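  # (For context: adding the "arc" tag lets the source specialization step pick up
  #  the ARC-specific kernel implementations under tensorflow/lite/micro/kernels/arc/
  #  in place of the reference ones.)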
+ ALL_TAGS += arc + +ifeq ($(PRE_COMPILED_MLI),true) + $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) + + MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include + MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a + + THIRD_PARTY_CC_HDRS += \ + third_party/embarc_osp/LICENSE +else + MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) + + $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +endif + + THIRD_PARTY_CC_HDRS += $(MLI_LIB) + GENERATED_PROJECT_LIBS += $(MLI_LIB) + + INCLUDES += \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api + + GENERATED_PROJECT_INCLUDES += \ + -I. \ + -I./third_party/$(MLI_INCLUDE_FOLDER) \ + -I./third_party/$(MLI_INCLUDE_FOLDER)/api + + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf + +endif # USE_EMBARC_MLI + +endif # ARC_TOOLCHAIN +endif # TARGET_ARCH + diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 1b30e6ac6d0..87d1b736807 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -8,157 +8,18 @@ ifeq ($(TARGET_ARCH), arc) # Known target are specifyed with their own make configurations. ifeq ($(filter $(TARGET), emsdp iotdk),) +ARC_TOOLCHAIN := mwdt + ifneq ($(TCF_FILE), ) TARGET = $(basename $(notdir $(TCF_FILE))) else + $(warning TCF_FILE variable is not specifyed. Use default em7d_voice_audio configuration) TARGET = em7d_voice_audio TCF_FILE = em7d_voice_audio endif -# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. -# This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. 
-ifneq (,$(findstring .tcf,$(TCF_FILE))) - TCF_FILE_NAME = $(notdir $(TCF_FILE)) - ARC_TARGET_FILES_DIRS := $(dir $(TCF_FILE)) - MAKE_PROJECT_FILES += $(TCF_FILE_NAME) -else - TCF_FILE_NAME = $(TCF_FILE) -endif +include $(MAKEFILE_DIR)/targets/arc/arc_common.inc - CC_TOOL = ccac - AR_TOOL = arac - CXX_TOOL = ccac - LD_TOOL = ccac - - # TODO: Move this to a common arc/arc_common.inc file to share this with other targets - DLR = $$$$ - ARC_EXTRA_APP_SETTINGS = - - ARC_EXTRA_APP_RULES = - - ARC_EXTRA_RM_TARGETS = - - ARC_BIN_DEPEND = - ARC_BIN_RULE = \t$(DLR)\(error Flash rule isnt defined for this ARC target\) - - ARC_APP_RUN_CMD = mdb -run -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS) - ARC_APP_DEBUG_CMD = mdb -OK -jit -tcf=$(TCF_FILE_NAME) $(DLR)\(DBG_ARGS) - ARC_EXTRA_EXECUTE_RULES = - - - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config - PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - - # Use compact CRT. It requires pre-defined heap size - PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset - - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) - PLATFORM_LDFLAGS += -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K -ifneq ($(LCF_FILE), ) - PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) - MAKE_PROJECT_FILES += $(notdir $(LCF_FILE)) -ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir LCF_FILE)),) - ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) -endif -endif - - CXXFLAGS += $(PLATFORM_FLAGS) - CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) - CCFLAGS += $(PLATFORM_FLAGS) - CCFLAGS:=$(filter-out -std=c11,$(CCFLAGS)) - LDFLAGS += $(PLATFORM_LDFLAGS) - - - MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) - - # TODO: Move/organize embarc_mli usage in an implied way (see ext_libs/cmsis.inc for example - USE_EMBARC_MLI ?= true - -ifeq ($(USE_EMBARC_MLI), true) - # TODO: To understand why it's done here. The same is performed in the higher level MakeFile. - ALL_TAGS += arc - -ifeq ($(PRE_COMPILED_MLI),true) - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else - MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) - - $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) - - MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include - MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a - MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_LIB_DIR)/LICENSE -endif - - THIRD_PARTY_CC_HDRS += $(MLI_LIB) - GENERATED_PROJECT_LIBS += $(MLI_LIB) - - INCLUDES += \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api - - GENERATED_PROJECT_INCLUDES += \ - -I. 
\ - -I./third_party/$(MLI_INCLUDE_FOLDER) \ - -I./third_party/$(MLI_INCLUDE_FOLDER)/api - - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf - -endif # USE_EMBARC_MLI - -# We overwrite project generator to exclude everything not relevant to ARC platform -define generate_microlite_projects -$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) -$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) -endef - -# Copy rule generator to do file copyes with changing paths in generated project -# Arguments are: -# 1 - Path files in generated project. -# 2 - Path files in the source repo -# Used in helper_functions.inc for arc projects to copy files -define path_changing_copy_file -$(1)/%: $(2)/% - @mkdir -p $$(dir $$@) - @cp $$< $$@ -endef - -# These are microcontroller-specific rules for converting the ELF output -# of the linker into a binary image that can be loaded directly. -# Not applicable for ARC, leaving it empty. -$(BINDIR)%.bin: - - -endif # ifeq ($(filter $(TARGET),$(ARC_PREDEFINED_TARGETS)),) -endif # ifeq ($(TARGET_ARCH), arc) +endif # $(TARGET) +endif # $(TARGET_ARCH)... 
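To make the intent of the slimmed-down arc_makefile.inc above concrete: for a custom (not pre-defined) ARC core, the build is driven entirely by TCF_FILE, with the target name derived from the file name and the toolchain and project-generation rules coming from the shared arc_common.inc. A minimal invocation sketch follows; the .tcf/.lcf paths are placeholders and the generate_hello_world_make_project goal is assumed from the standard TFLM project-generation flow, neither is taken from this patch.

# Generate a standalone make project for a custom ARC core described by a TCF file.
# Paths are illustrative; LCF_FILE may be omitted to use the default linker setup.
make -f tensorflow/lite/micro/tools/make/Makefile \
    TARGET_ARCH=arc \
    TCF_FILE=/path/to/custom_core.tcf \
    LCF_FILE=/path/to/custom_core.lcf \
    generate_hello_world_make_project

Leaving TCF_FILE unset falls back to the default em7d_voice_audio configuration, now with an explicit warning.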
diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc index 86e9d9e7379..9901fd82b07 100644 --- a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc @@ -1,14 +1,16 @@ # Settings for arc processors ifeq ($(TARGET), emsdp) - TARGET_ARCH = arc + TARGET_ARCH := arc + ARC_TOOLCHAIN := mwdt + + TCF_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf + LCF_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf + UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env + UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE)) + +include $(MAKEFILE_DIR)/targets/arc/arc_common.inc - CC_TOOL = ccac - AR_TOOL = arac - CXX_TOOL = ccac - LD_TOOL = ccac - - DLR = $$$$ ARC_EXTRA_APP_SETTINGS = \ BIN_DIR = .$(DLR)\(PS\)bin\n\ BIN_FILE = $(DLR)\(BIN_DIR\)$(DLR)\(PS\)app.elf\n @@ -16,7 +18,7 @@ ifeq ($(TARGET), emsdp) ARC_EXTRA_APP_RULES = \ $(DLR)\(BIN_FILE\): $(DLR)\(BIN_DIR\) $(DLR)\(OUT_NAME\)\ \n\t\@$(DLR)\(CP\) $(DLR)\(OUT_NAME\) $(DLR)\(BIN_FILE\)\ - \n\t\@$(DLR)\(CP\) uboot.env $(DLR)\(BIN_DIR\)$(DLR)\(PS)uboot.env\ + \n\t\@$(DLR)\(CP\) $(UBOOT_FILE_NAME) $(DLR)\(BIN_DIR\)$(DLR)\(PS\)$(UBOOT_FILE_NAME)\ \n \ \n$(DLR)\(BIN_DIR\):\ \n\t\@$(DLR)\(MKDIR\) $(DLR)\(BIN_DIR\)\ @@ -26,135 +28,17 @@ ifeq ($(TARGET), emsdp) ARC_BIN_DEPEND = $(DLR)\(BIN_DIR\) $(DLR)\(BIN_FILE\) ARC_BIN_RULE = \t@echo Copy content of $(DLR)\(BIN_DIR\) into the root of SD card and follow instructions - ARC_APP_RUN_CMD = mdb -run -digilent -nooptions $(DLR)\(DBG_ARGS) - ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS) + ARC_APP_RUN_CMD = mdb -run -digilent -nooptions $(DLR)\(DBG_ARGS\) + ARC_APP_DEBUG_CMD = mdb -OK -digilent -nooptions $(DLR)\(DBG_ARGS\) ARC_EXTRA_EXECUTE_RULES = - TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf - LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf - - MAKE_PROJECT_FILES += $(notdir $(TCF_FILE)) $(notdir $(LCF_FILE)) uboot.env - - ARC_TARGET_FILES_DIRS = $(dir $(TCF_FILE)) -ifneq ($(dir $(TCF_FILE)), $(dir $(LCF_FILE))) - ARC_TARGET_FILES_DIRS += $(dir $(LCF_FILE)) + MAKE_PROJECT_FILES += $(UBOOT_FILE_NAME) +ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(UBOOT_FILE))),) + ARC_TARGET_FILES_DIRS += $(dir $(UBOOT_FILE)) endif -# TODO: LESS TCF/LCF Variables -# The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), this variable is used later to add the option to the linker/compiler flags. -# This condition also handles the case when the user/makefile specifies the configuration bundled with MWDT (usually without .tcf extension) and that doesn't require copying. - TCF_FILE_NAME = $(notdir $(TCF_FILE)) - - PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config - PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections - - # Use compact CRT. It requires pre-defined heap size - PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset - - PLATFORM_LDFLAGS = -tcf=$(TCF_FILE_NAME) -Hnocopyr -m -Hldopt=-Coutput=memory.map -Hheap=2K - # for default EMSD configuration we can use defaul em9d rt libs - # for better performance runime should be rebuilt for emsdp configuration + # for better performance runtime should be built for emsdp configuration PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio -# DMITRYZ: I think we need to move it to target specific LCF file. 
- PLATFORM_LDFLAGS += $(notdir $(LCF_FILE)) - - CXXFLAGS += $(PLATFORM_FLAGS) - CXXFLAGS:=$(filter-out -std=c++11,$(CXXFLAGS)) - CCFLAGS += $(PLATFORM_FLAGS) - CCFLAGS:=$(filter-out -std=c11,$(CCFLAGS)) - LDFLAGS += $(PLATFORM_LDFLAGS) - - MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) - - # DMITRYZ: Here we need to check tags on "no_embarc_mli". - USE_EMBARC_MLI ?= true - -ifeq ($(USE_EMBARC_MLI), true) - ALL_TAGS += arc - -ifeq ($(PRE_COMPILED_MLI),true) - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else - MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) - - $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) - - MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include - MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a - MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_LIB_DIR)/LICENSE -endif - - THIRD_PARTY_CC_HDRS += $(MLI_LIB) - GENERATED_PROJECT_LIBS += $(MLI_LIB) - - INCLUDES += \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api - - GENERATED_PROJECT_INCLUDES += \ - -I. \ - -I./third_party/$(MLI_INCLUDE_FOLDER) \ - -I./third_party/$(MLI_INCLUDE_FOLDER)/api - - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf - -endif # USE_EMBARC_MLI - -# We overwrite project generator to exclude everything not relevant to ARC platform -define generate_microlite_projects -$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) -$(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) -endef - -# Copy rule generator to do 
file copyes changing paths in generated project -# Arguments are: -# 1 - Path files in generated project. -# 2 - Path files in the source repo -# Used in helper_functions.inc for arc projects to copy files -define path_changing_copy_file -$(1)/%: $(2)/% - @mkdir -p $$(dir $$@) - @cp $$< $$@ -endef - - - -# These are microcontroller-specific rules for converting the ELF output -# of the linker into a binary image that can be loaded directly. - -# Not applicable for ARC, leaving it empty. -$(BINDIR)%.bin: - endif From 2226b67dc3bb0a55b30a6599a94454715afba102 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Tue, 7 Apr 2020 12:53:32 +0300 Subject: [PATCH 0075/1533] changed EMSDP to ARC_EMSDP and other minor fixes regarding guidline --- .../micro/{emsdp => arc_emsdp}/debug_log.cc | 82 ++++++++++--------- .../micro/tools/make/download_and_extract.sh | 3 +- .../tools/make/targets/arc/arc_common.inc | 22 ++++- ...dp_makefile.inc => arc_emsdp_makefile.inc} | 20 ++++- .../micro/tools/make/targets/arc_makefile.inc | 21 ++++- .../make/templates/arc/arc_app_makefile.tpl | 22 ----- 6 files changed, 97 insertions(+), 73 deletions(-) rename tensorflow/lite/micro/{emsdp => arc_emsdp}/debug_log.cc (55%) rename tensorflow/lite/micro/tools/make/targets/{emsdp_makefile.inc => arc_emsdp_makefile.inc} (66%) diff --git a/tensorflow/lite/micro/emsdp/debug_log.cc b/tensorflow/lite/micro/arc_emsdp/debug_log.cc similarity index 55% rename from tensorflow/lite/micro/emsdp/debug_log.cc rename to tensorflow/lite/micro/arc_emsdp/debug_log.cc index 7d932939a0b..57eea6a5579 100644 --- a/tensorflow/lite/micro/emsdp/debug_log.cc +++ b/tensorflow/lite/micro/arc_emsdp/debug_log.cc @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,16 +23,20 @@ limitations under the License. // EMSDP_LOG_TO_MEMORY // : fill .debug_log memory region (data section) with passed chars. // EMSDP_LOG_TO_HOST -// : Use hostlink to print output log. +// : Use MetaWare HostLink to print output log. Requires Synopsys MetaWare debugger // EMSDP_LOG_TO_UART // : use default debug UART (out to FTDI channel 0). The same USB Port is used for JTAG. #define EMSDP_LOG_TO_UART +// Memory size for symbols dump in EMSDP_LOG_TO_MEMORY destination +#define EMSDP_LOG_TO_MEMORY_SIZE (2 * 1024) -// For simplicity we assume U-boot has already initialized debug console durion -// application loading (or on reset). Hence we use only status and data registers + +// For simplicity we assume U-boot has already initialized debug console during +// application loading (or on reset). Hence, we use only status and data registers // to organize blocking loop for printing symbols. No input and no IRQ handling. // See embarc_osp repository for full EMSDP uart driver. +// (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp) // TODO: Consider U-Boot API to do it in a less "hacky" way. 
void DbgUartSendStr(const char* s) { #define EMSDP_DBG_UART_BASE (0xF0004000U) @@ -40,48 +44,48 @@ void DbgUartSendStr(const char* s) { #define DW_UART_USR_TFNF (0x02) #define DW_UART_LSR_TXD_EMPTY (0x20) - typedef volatile struct dw_uart_reg { - uint32_t DATA; /*!< data in/out and DLL */ - uint32_t RES1[4]; - uint32_t LSR; /*!< Line Status Register */ - uint32_t RES2[25]; - uint32_t USR; /*!< UART status register */ - uint32_t RES3[29]; - uint32_t CPR; /*!< Component parameter register */ - } DW_UART_REG; + typedef volatile struct dw_uart_reg { + uint32_t DATA; /*!< data in/out and DLL */ + uint32_t RES1[4]; + uint32_t LSR; /*!< Line Status Register */ + uint32_t RES2[25]; + uint32_t USR; /*!< UART status register */ + uint32_t RES3[29]; + uint32_t CPR; /*!< Component parameter register */ + } DW_UART_REG; - DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); - const char* src = s; - while (*src) { - // Check uart status to send char - bool uart_is_ready = false; - if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT) - uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0); - else - uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0); + DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); + const char* src = s; + while (*src) { + // Check uart status to send char + bool uart_is_ready = false; + if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT) + uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0); + else + uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0); - // Send char if uart is ready. - if (uart_is_ready) - uart_reg_ptr->DATA = *src++; - } + // Send char if uart is ready. + if (uart_is_ready) + uart_reg_ptr->DATA = *src++; + } } -// Simple symbols dump to a pre-allocated memory region. +// Simple dump of symbols to a pre-allocated memory region. +// When total log exceeds memory region size, cursor is moved to its begining. // The memory region can be viewed afterward with debugger. // It can be viewed/read with debugger afterward. void LogToMem(const char* s) { - constexpr int kDebugLogMemChars = 2 * 1024; - static int cursor = 0; + static int cursor = 0; #pragma Bss(".debug_log") - volatile static char debug_log_mem[kDebugLogMemChars]; + volatile static char debug_log_mem[EMSDP_LOG_TO_MEMORY_SIZE]; #pragma Bss() - const char* src = s; - while (*src) { - debug_log_mem[cursor] = *src++; - cursor = (cursor < kDebugLogMemChars) ? cursor + 1 : 0; - } - debug_log_mem[cursor] = '^'; + const char* src = s; + while (*src) { + debug_log_mem[cursor] = *src++; + cursor = (cursor < EMSDP_LOG_TO_MEMORY_SIZE) ? cursor + 1 : 0; + } + debug_log_mem[cursor] = '^'; } @@ -89,17 +93,17 @@ extern "C" void DebugLog(const char* s) { #ifndef TF_LITE_STRIP_ERROR_STRINGS #if defined EMSDP_LOG_TO_UART - DbgUartSendStr(s); + DbgUartSendStr(s); #endif #if defined EMSDP_LOG_TO_MEMORY #warning "EMSDP_LOG_TO_MEMORY is defined. View .debug_log memory region for stdout" - LogToMem(s); + LogToMem(s); #endif #if defined EMSDP_LOG_TO_HOST #warning "EMSDP_LOG_TO_HOST is defined. Ensure hostlib is linked." 
- fprintf(stderr, "%s", s); + fprintf(stderr, "%s", s); #endif #endif // TF_LITE_STRIP_ERROR_STRINGS diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 4a75b6b24cd..5b06e4e819a 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -170,7 +170,8 @@ download_and_extract() { elif [[ ${action} == "patch_cifar10_dataset" ]]; then patch_cifar10_dataset ${dir} elif [[ ${action} == "build_embarc_mli" ]]; then - build_embarc_mli ${dir} ${action_param1} + cp ${action_param1} ${dir}/hw/arc.tcf + build_embarc_mli ${dir} ../../hw/arc.tcf elif [[ ${action} ]]; then echo "Unknown action '${action}'" exit 1 diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index e20887abb07..50bb5c96799 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -1,4 +1,18 @@ -# Common Settings for ARC platform and it's projects. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Common Settings for ARC platform and its projects. # Might be reused across different targets ifeq ($(TARGET_ARCH), arc) @@ -6,7 +20,7 @@ ifeq ($(TARGET_ARCH), arc) DLR := $$$$ # List of folders to search project files for copy with path changing - # For instance, TCF and LCF files are copyed into the root of generated project + # For instance, TCF and LCF files are copied into the root of generated project ARC_TARGET_FILES_DIRS ?= # For the following variables see arc_app_makefile.tpl for usage @@ -36,14 +50,14 @@ ifeq ($(TARGET_ARCH), arc) ARC_EXTRA_EXECUTE_RULES ?= # We overwrite project generator to exclude everything not relevant to ARC platform. -# ARC targets doesn't can't work with mbed, keil or other architecture specific development tools +# ARC targets cannot work with non-ARC development tools. # Basic make project is updated to be applicable for general ARC platform define generate_microlite_projects $(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) $(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) endef -# Copy rule generator to do file copyes with changing paths in generated project +# Copy rule generator to do file copies with changing paths in generated project # Arguments are: # 1 - Path files in generated project. 
# 2 - Path files in the source repo diff --git a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc similarity index 66% rename from tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc rename to tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc index 9901fd82b07..a84dd15e4e8 100644 --- a/tensorflow/lite/micro/tools/make/targets/emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc @@ -1,5 +1,19 @@ -# Settings for arc processors -ifeq ($(TARGET), emsdp) +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Settings for EMSDP target (ARC processor) +ifeq ($(TARGET), arc_emsdp) TARGET_ARCH := arc ARC_TOOLCHAIN := mwdt @@ -37,7 +51,7 @@ ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(UBOOT_FILE))),) ARC_TARGET_FILES_DIRS += $(dir $(UBOOT_FILE)) endif - # for default EMSD configuration we can use defaul em9d rt libs + # for default EMSD configuration we can use default em9d rt libs # for better performance runtime should be built for emsdp configuration PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index 87d1b736807..db474a54b2d 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -1,19 +1,32 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # Settings for not pre-defined ARC processors. # User need to specify ARC target with Tool Configuration File (*.tcf). # Path to this file must be passed through TCF_FILE variable. # Otherwise, default em7d_voice_audio configuration is used - ifeq ($(TARGET_ARCH), arc) -# Known target are specifyed with their own make configurations. -ifeq ($(filter $(TARGET), emsdp iotdk),) +# Known target are specified with their own make configurations. +ifeq ($(filter $(TARGET), arc_emsdp arc_iotdk),) ARC_TOOLCHAIN := mwdt ifneq ($(TCF_FILE), ) TARGET = $(basename $(notdir $(TCF_FILE))) else - $(warning TCF_FILE variable is not specifyed. Use default em7d_voice_audio configuration) + $(warning TCF_FILE variable is not specified. 
Use default em7d_voice_audio configuration) TARGET = em7d_voice_audio TCF_FILE = em7d_voice_audio endif diff --git a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl index f79d04b26d1..a1a3ab71028 100644 --- a/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl +++ b/tensorflow/lite/micro/tools/make/templates/arc/arc_app_makefile.tpl @@ -29,28 +29,6 @@ else DEV_NULL=/dev/null endif -# Note: Windows escaping rules is very combersome -# initially I tried to use Q=^, but this depends on the context and (looks like) on Win version. -# Also expecially ugly thing is that in quoted strings the quotes the same are remain. -# Batch has special parameter expansion syntax to remove quotes, -# but many tools themselves remove quotes (unless escaped with backslash) -# So finally we've found that in our use cases we may not escaping any symbols but prepend backslashes before quotes. - -quote=$(subst %,$(Q)%, \ - $(subst &,$(Q)&, \ - $(subst <,$(Q)<, \ - $(subst >,$(Q)>, \ - $(subst |,$(Q)|, \ - $(subst ',$(Q)', \ - $(subst $(COMMA),$(Q)$(COMMA), \ - $(subst =,$(Q)=, \ - $(subst $(OPEN_PAREN),$(Q)$(OPEN_PAREN), \ - $(subst $(CLOSE_PAREN),$(Q)$(CLOSE_PAREN), \ - $(subst !,$(Q)!, \ - $(subst ",$(BACKSLASH)", \ - $(subst $(Q),$(Q)$(Q), \ - $(1) ))))))))))))) - #============================================================= # Toolchain definitions #============================================================= From a7dcdb21f69ca8a5078ad855044e76fefa4f0199 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 8 Apr 2020 15:11:41 +0300 Subject: [PATCH 0076/1533] Move out of function ARC EMSDP UART related constatnts --- tensorflow/lite/micro/arc_emsdp/debug_log.cc | 33 +++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/arc_emsdp/debug_log.cc b/tensorflow/lite/micro/arc_emsdp/debug_log.cc index 57eea6a5579..b3b25f88ac1 100644 --- a/tensorflow/lite/micro/arc_emsdp/debug_log.cc +++ b/tensorflow/lite/micro/arc_emsdp/debug_log.cc @@ -31,6 +31,24 @@ limitations under the License. // Memory size for symbols dump in EMSDP_LOG_TO_MEMORY destination #define EMSDP_LOG_TO_MEMORY_SIZE (2 * 1024) +// EMSDP Debug UART related defines (registers and bits) +#define EMSDP_DBG_UART_BASE (0xF0004000U) +#define DW_UART_CPR_FIFO_STAT (1 << 10) +#define DW_UART_USR_TFNF (0x02) +#define DW_UART_LSR_TXD_EMPTY (0x20) + +// EMSDP UART registers map (only necessairy fields) +typedef volatile struct dw_uart_reg { + uint32_t DATA; /* data in/out and DLL */ + uint32_t RES1[4]; + uint32_t LSR; /* Line Status Register */ + uint32_t RES2[25]; + uint32_t USR; /* UART status register */ + uint32_t RES3[29]; + uint32_t CPR; /* Component parameter register */ +} DW_UART_REG; + + // For simplicity we assume U-boot has already initialized debug console during // application loading (or on reset). Hence, we use only status and data registers @@ -39,21 +57,6 @@ limitations under the License. // (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp) // TODO: Consider U-Boot API to do it in a less "hacky" way. 
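// The send loop below uses the usual DesignWare UART polling scheme: if the
// component parameter register reports a FIFO (DW_UART_CPR_FIFO_STAT), wait for
// the "transmit FIFO not full" bit in USR; otherwise wait for the "transmit
// holding register empty" bit in LSR. Only when the selected ready bit is set
// is the next character written to the DATA register.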
void DbgUartSendStr(const char* s) { -#define EMSDP_DBG_UART_BASE (0xF0004000U) -#define DW_UART_CPR_FIFO_STAT (1<<10) -#define DW_UART_USR_TFNF (0x02) -#define DW_UART_LSR_TXD_EMPTY (0x20) - - typedef volatile struct dw_uart_reg { - uint32_t DATA; /*!< data in/out and DLL */ - uint32_t RES1[4]; - uint32_t LSR; /*!< Line Status Register */ - uint32_t RES2[25]; - uint32_t USR; /*!< UART status register */ - uint32_t RES3[29]; - uint32_t CPR; /*!< Component parameter register */ - } DW_UART_REG; - DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); const char* src = s; while (*src) { From 105eac5030a346febc615202a4841330f2779c0b Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 8 Apr 2020 17:40:54 +0300 Subject: [PATCH 0077/1533] Include new parameters of generate_project for arc --- tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 50bb5c96799..67be50d4854 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -53,7 +53,7 @@ ifeq ($(TARGET_ARCH), arc) # ARC targets cannot work with non-ARC development tools. # Basic make project is updated to be applicable for general ARC platform define generate_microlite_projects -$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) +$(call generate_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES),$(TARGET_TOOLCHAIN_ROOT),$(TARGET_TOOLCHAIN_PREFIX)) $(call generate_arc_project,make,$(MAKE_PROJECT_FILES),$(1),$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(2),$(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(MICROLITE_TEST_HDRS) $(3),$(LDFLAGS) $(GENERATED_PROJECT_LIBS),$(CXXFLAGS) $(GENERATED_PROJECT_INCLUDES), $(CCFLAGS) $(GENERATED_PROJECT_INCLUDES)) endef From e85244f2c3833f63653a92081e75f3cb2412ccc3 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 9 Apr 2020 15:12:31 +0300 Subject: [PATCH 0078/1533] Fix arc target list and build for built-in arc configurations --- tensorflow/lite/micro/tools/make/download_and_extract.sh | 8 ++++++-- tensorflow/lite/micro/tools/make/targets/arc_makefile.inc | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 5b06e4e819a..3ab7c3ba7bd 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -170,8 +170,12 @@ download_and_extract() { elif [[ ${action} == "patch_cifar10_dataset" ]]; then patch_cifar10_dataset ${dir} elif [[ ${action} == "build_embarc_mli" ]]; then - cp ${action_param1} ${dir}/hw/arc.tcf - build_embarc_mli ${dir} ../../hw/arc.tcf + if [[ "${action_param1}" == *.tcf ]]; then + cp ${action_param1} ${dir}/hw/arc.tcf + build_embarc_mli ${dir} ../../hw/arc.tcf + else + build_embarc_mli ${dir} ${action_param1} + fi elif [[ ${action} ]]; 
then echo "Unknown action '${action}'" exit 1 diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index db474a54b2d..d379eea86f1 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -19,7 +19,7 @@ ifeq ($(TARGET_ARCH), arc) # Known target are specified with their own make configurations. -ifeq ($(filter $(TARGET), arc_emsdp arc_iotdk),) +ifeq ($(filter $(TARGET), arc_emsdp),) ARC_TOOLCHAIN := mwdt From 3006c316b64077a6bad64f42cb5e879351072b29 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 13 Apr 2020 11:22:46 +0300 Subject: [PATCH 0079/1533] embARC MLI related code as an external library which might be turned-off --- .../micro/kernels/{arc => embarc_mli}/conv.cc | 8 +-- .../{arc => embarc_mli}/depthwise_conv.cc | 8 +-- .../{arc => embarc_mli}/fully_connected.cc | 9 ++- .../{arc => embarc_mli}/mli_slicers.cc | 0 .../kernels/{arc => embarc_mli}/mli_slicers.h | 0 .../{arc => embarc_mli}/mli_tf_utils.h | 0 .../kernels/{arc => embarc_mli}/pooling.cc | 8 +-- .../{arc => embarc_mli}/scratch_buf_mgr.cc | 4 +- .../{arc => embarc_mli}/scratch_buf_mgr.h | 0 .../{arc => embarc_mli}/scratch_buffers.cc | 2 +- .../{arc => embarc_mli}/scratch_buffers.h | 0 .../micro/tools/make/ext_libs/embarc_mli.inc | 67 +++++++++++++++++++ .../tools/make/targets/arc/arc_common.inc | 63 ----------------- 13 files changed, 86 insertions(+), 83 deletions(-) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/conv.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/depthwise_conv.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/fully_connected.cc (97%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/mli_slicers.cc (100%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/mli_slicers.h (100%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/mli_tf_utils.h (100%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/pooling.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/scratch_buf_mgr.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/scratch_buf_mgr.h (100%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/scratch_buffers.cc (98%) rename tensorflow/lite/micro/kernels/{arc => embarc_mli}/scratch_buffers.h (100%) create mode 100644 tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc diff --git a/tensorflow/lite/micro/kernels/arc/conv.cc b/tensorflow/lite/micro/kernels/embarc_mli/conv.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/conv.cc rename to tensorflow/lite/micro/kernels/embarc_mli/conv.cc index 6cf26c7d6d9..b124b17f66d 100644 --- a/tensorflow/lite/micro/kernels/arc/conv.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/conv.cc @@ -24,10 +24,10 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc b/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/depthwise_conv.cc rename to tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc index 74e48c8c064..0ad2a9fe6c6 100644 --- a/tensorflow/lite/micro/kernels/arc/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc @@ -25,10 +25,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/arc/fully_connected.cc b/tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc similarity index 97% rename from tensorflow/lite/micro/kernels/arc/fully_connected.cc rename to tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc index cc9b95c570a..8088634f8de 100644 --- a/tensorflow/lite/micro/kernels/arc/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc @@ -23,14 +23,13 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" #include "mli_api.h" - namespace tflite { namespace ops { namespace micro { diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.cc b/tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc similarity index 100% rename from tensorflow/lite/micro/kernels/arc/mli_slicers.cc rename to tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc diff --git a/tensorflow/lite/micro/kernels/arc/mli_slicers.h b/tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h similarity index 100% rename from tensorflow/lite/micro/kernels/arc/mli_slicers.h rename to tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h diff --git a/tensorflow/lite/micro/kernels/arc/mli_tf_utils.h b/tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h similarity index 100% rename from tensorflow/lite/micro/kernels/arc/mli_tf_utils.h rename to tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h diff --git a/tensorflow/lite/micro/kernels/arc/pooling.cc b/tensorflow/lite/micro/kernels/embarc_mli/pooling.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/pooling.cc rename to tensorflow/lite/micro/kernels/embarc_mli/pooling.cc index 7a26a10e23b..a147171a859 100644 --- a/tensorflow/lite/micro/kernels/arc/pooling.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/pooling.cc @@ -20,10 +20,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/arc/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc similarity index 98% rename from tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc rename to tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc index 5bd2d6aed22..8d00e28714c 100644 --- a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
 ==============================================================================*/
-#include "tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h"
-#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h"
+#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h"
+#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h"
 #include
 #define MAX(A,B) (((A) > (B))? (A): (B))
 #define MIN(A,B) (((A) > (B))? (B): (A))
diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h
similarity index 100%
rename from tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h
rename to tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h
diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc
similarity index 98%
rename from tensorflow/lite/micro/kernels/arc/scratch_buffers.cc
rename to tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc
index f36059f82d2..689c490569e 100644
--- a/tensorflow/lite/micro/kernels/arc/scratch_buffers.cc
+++ b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/lite/micro/kernels/arc/scratch_buffers.h"
+#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h"
 #include
 #define MAX(A,B) (((A) > (B))? (A): (B))
 #define MIN(A,B) (((A) > (B))? (B): (A))
diff --git a/tensorflow/lite/micro/kernels/arc/scratch_buffers.h b/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h
similarity index 100%
rename from tensorflow/lite/micro/kernels/arc/scratch_buffers.h
rename to tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h
diff --git a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc
new file mode 100644
index 00000000000..851a5d43378
--- /dev/null
+++ b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc
@@ -0,0 +1,67 @@
+ifeq ($(TARGET_ARCH), arc)
+
+# The embARC MLI library is used by default for the ARC platform whenever possible.
+# To use the TFLM reference implementations instead, it must be explicitly turned off
+# by passing the 'no_embarc_mli' tag (make -f TAGS=no_embarc_mli ...)
+ifeq ($(filter no_embarc_mli,$(ALL_TAGS)),)
+
+
+ALL_TAGS += embarc_mli
+
+ifeq ($(PRE_COMPILED_MLI),true)
+  # TODO: Replace with proper embarc_mli pre-builts.
+  $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,))
+
+  MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include
+  MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a
+
+  THIRD_PARTY_CC_HDRS += \
+    third_party/embarc_osp/LICENSE
+else
+  MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME))
+
+  $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE)))
+
+  MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include
+  MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a
+  MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a
+
+  THIRD_PARTY_CC_HDRS += \
+    third_party/$(MLI_LIB_DIR)/LICENSE
+endif
+
+  THIRD_PARTY_CC_HDRS += $(MLI_LIB)
+  GENERATED_PROJECT_LIBS += $(MLI_LIB)
+
+  INCLUDES += \
+    -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \
+    -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api
+
+  GENERATED_PROJECT_INCLUDES += \
+    -I. 
\ + -I./third_party/$(MLI_INCLUDE_FOLDER) \ + -I./third_party/$(MLI_INCLUDE_FOLDER)/api + + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h + +endif # no_embarc_mli +endif # TARGET_ARCH diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 67be50d4854..4a9a5ccdfc3 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -129,70 +129,7 @@ endif LDFLAGS += $(PLATFORM_LDFLAGS) - # TODO: Move/organize embarc_mli usage in an implied way (see ext_libs/cmsis.inc for example - USE_EMBARC_MLI ?= true -ifeq ($(USE_EMBARC_MLI), true) - # TODO: To understand why it's done here. The same is performed in the higher level MakeFile. - ALL_TAGS += arc - -ifeq ($(PRE_COMPILED_MLI),true) - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else - MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) - - $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) - - MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include - MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a - MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_LIB_DIR)/LICENSE -endif - - THIRD_PARTY_CC_HDRS += $(MLI_LIB) - GENERATED_PROJECT_LIBS += $(MLI_LIB) - - INCLUDES += \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api - - GENERATED_PROJECT_INCLUDES += \ - -I. 
\ - -I./third_party/$(MLI_INCLUDE_FOLDER) \ - -I./third_party/$(MLI_INCLUDE_FOLDER)/api - - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buffers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buffers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/scratch_buf_mgr.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_slicers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc/mli_slicers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc/mli_tf_utils.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/tools/make/targets/arc/memory.lcf - -endif # USE_EMBARC_MLI endif # ARC_TOOLCHAIN endif # TARGET_ARCH From 03bec25ed962226e59d9d4a8b23a55540ab33ca9 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 13 Apr 2020 14:06:35 +0300 Subject: [PATCH 0080/1533] Additional tests for embARC MLI specific slicing (initial mock version) --- .../kernels/embarc_mli/conv_slicing_test.cc | 629 ++++++++++ .../embarc_mli/depthwise_conv_slicing_test.cc | 768 ++++++++++++ .../fully_connected_slicing_test.cc | 938 ++++++++++++++ .../embarc_mli/pooling_slicing_test.cc | 1116 +++++++++++++++++ .../micro/tools/make/ext_libs/embarc_mli.inc | 11 +- 5 files changed, 3461 insertions(+), 1 deletion(-) create mode 100644 tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc create mode 100644 tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc create mode 100644 tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc create mode 100644 tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc new file mode 100644 index 00000000000..a1f155ecc56 --- /dev/null +++ b/tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc @@ -0,0 +1,629 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/micro/micro_utils.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +// Common inputs and outputs. +static const int kInputElements = 16; +static const int kInputShape[] = {4, 2, 2, 4, 1}; +static const float kInputData[] = {1, 1, 1, 1, 2, 2, 2, 2, + 1, 2, 3, 4, 1, 2, 3, 4}; +static const int kFilterElements = 12; +static const int kFilterShape[] = {4, 3, 2, 2, 1}; +static const float kFilterData[] = {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1}; +static const int kBiasElements = 3; +static const int kBiasShape[] = {1, 3}; +static const float kBiasData[] = {1, 2, 3}; +static const int kOutputElements = 12; +static const int kOutputShape[] = {4, 2, 1, 2, 3}; +static const float kGoldenData[] = {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3}; + +static TfLiteConvParams common_conv_params = { + kTfLitePaddingValid, // padding + 2, // stride_width + 2, // stride_height + kTfLiteActNone, // activation + 1, // dilation_width_factor + 1, // dilation_height_factor +}; + +template +TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, + const T* expected_output_data, T* output_data, + int output_length, + TfLiteConvParams* conv_params, + float tolerance = 1e-5) { + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_CONV_2D, 1); + + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + const char* init_data = reinterpret_cast(conv_params); + size_t init_data_size = 0; + void* user_data = nullptr; + + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(conv_params); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TfLiteStatus return_val = registration->invoke(&context, &node); + if (return_val != kTfLiteOk) { + return return_val; + } + + if (registration->free) { + registration->free(&context, user_data); + } + + for (int i = 0; i < output_length; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], + tolerance); + } + return kTfLiteOk; +} + +void TestConvFloat(const int* input_dims_data, const float* input_data, + const int* filter_dims_data, const float* filter_data, + const int* bias_dims_data, const float* bias_data, + const int* output_dims_data, + const float* expected_output_data, float* output_data, + 
TfLiteConvParams* conv_params) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(filter_data, filter_dims, "filter_tensor"), + CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + ValidateConvGoldens(tensors, tensors_size, expected_output_data, + output_data, output_dims_count, conv_params)); +} + +void TestConvQuantizedPerLayer( + const int* input_dims_data, const float* input_data, + uint8_t* input_quantized, float input_scale, const int* filter_dims_data, + const float* filter_data, uint8_t* filter_quantized, float filter_scale, + const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, + const int* output_dims_data, const float* expected_output_data, + uint8_t* expected_output_quantized, uint8_t* output_data, + float output_scale, TfLiteConvParams* conv_params) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + tflite::AsymmetricQuantize(expected_output_data, expected_output_quantized, + output_dims_count, output_scale, 128); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, 128, "input_tensor"), + CreateQuantizedTensor(filter_data, filter_quantized, filter_dims, + filter_scale, 128, "filter_tensor"), + CreateQuantizedBiasTensor(bias_data, bias_quantized, bias_dims, + input_scale, filter_scale, "bias_tensor"), + CreateQuantizedTensor(output_data, output_dims, output_scale, 128, + "output_tensor")}; + + // TODO(njeff): Affine Quantization Params should be set on tensor creation. 
+ float filter_scales[] = {1, filter_scale}; + int filter_zero_points[] = {1, 128}; + TfLiteAffineQuantization filter_quant = { + FloatArrayFromFloats(filter_scales), + IntArrayFromInts(filter_zero_points)}; + tensors[1].quantization = {kTfLiteAffineQuantization, &filter_quant}; + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + ValidateConvGoldens(tensors, tensors_size, expected_output_quantized, + output_data, output_dims_count, conv_params)); +} + +void TestConvQuantizedPerChannel( + const int* input_dims_data, const float* input_data, + int8_t* input_quantized, float input_scale, int input_zero_point, + const int* filter_dims_data, const float* filter_data, + int8_t* filter_data_quantized, const int* bias_dims_data, + const float* bias_data, int32_t* bias_data_quantized, float* bias_scales, + int* bias_zero_points, const int* output_dims_data, + const float* expected_output_data, int8_t* expected_output_data_quantized, + int8_t* output_data, float output_scale, int output_zero_point, + TfLiteConvParams* conv_params) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + int filter_zero_points[5]; + float filter_scales[5]; + TfLiteAffineQuantization filter_quant; + TfLiteAffineQuantization bias_quant; + TfLiteTensor input_tensor = + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point, "input_tensor"); + TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor( + filter_data, filter_data_quantized, filter_dims, filter_scales, + filter_zero_points, &filter_quant, 0 /* quantized dimension */, + "filter_tensor"); + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( + bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1], + bias_scales, bias_zero_points, &bias_quant, 0 /* quantized dimension */, + "bias_tensor"); + TfLiteTensor output_tensor = + CreateQuantizedTensor(output_data, output_dims, output_scale, + output_zero_point, "output_tensor"); + + // TODO(njeff): Affine Quantization Params should be set on tensor creation. 
+ float input_scales[] = {1, input_scale}; + int input_zero_points[] = {1, input_zero_point}; + TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales), + IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + float output_scales[] = {1, output_scale}; + int output_zero_points[] = {1, output_zero_point}; + TfLiteAffineQuantization output_quant = { + FloatArrayFromFloats(output_scales), + IntArrayFromInts(output_zero_points)}; + output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + tflite::AsymmetricQuantize(expected_output_data, + expected_output_data_quantized, output_dims_count, + output_scale, output_zero_point); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + ValidateConvGoldens(tensors, tensors_size, expected_output_data_quantized, + output_data, output_dims_count, conv_params, + 1.0 /* tolerance */)); +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleTestFloat) { + float output_data[tflite::testing::kOutputElements]; + + tflite::testing::TestConvFloat( + tflite::testing::kInputShape, tflite::testing::kInputData, + tflite::testing::kFilterShape, tflite::testing::kFilterData, + tflite::testing::kBiasShape, tflite::testing::kBiasData, + tflite::testing::kOutputShape, tflite::testing::kGoldenData, output_data, + &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(InputAndFilterSameWidthHeight) { + const int output_dims_count = 2; + float output_data[output_dims_count]; + + const int kFilterShape[] = {4, 1, 2, 4, 1}; + const float filter_values[] = {1, 2, 3, 4, -1, -1, 1, 1}; + const int kBiasShape[] = {1, 1}; + const float bias_values[] = {0}; + const int kOutputShape[] = {4, 2, 1, 1, 1}; + const float expected_output[] = {10, 34}; + + tflite::testing::TestConvFloat( + tflite::testing::kInputShape, tflite::testing::kInputData, kFilterShape, + filter_values, kBiasShape, bias_values, kOutputShape, expected_output, + output_data, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantized) { + const int output_dims_count = 12; + uint8_t output_data[output_dims_count]; + + const float input_scale = 0.5f; + const float filter_scale = 0.5f; + const float output_scale = 1.0f; + + uint8_t input_quantized[tflite::testing::kInputElements]; + uint8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + uint8_t golden_quantized[tflite::testing::kOutputElements]; + + tflite::testing::TestConvQuantizedPerLayer( + tflite::testing::kInputShape, tflite::testing::kInputData, + input_quantized, input_scale, tflite::testing::kFilterShape, + tflite::testing::kFilterData, filter_quantized, filter_scale, + tflite::testing::kBiasShape, tflite::testing::kBiasData, bias_quantized, + tflite::testing::kOutputShape, tflite::testing::kGoldenData, + golden_quantized, output_data, output_scale, + &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + + const float input_scale = 0.5f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t 
input_quantized[tflite::testing::kInputElements]; + int8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + int8_t golden_quantized[tflite::testing::kOutputElements]; + int zero_points[tflite::testing::kBiasElements + 1]; + float scales[tflite::testing::kBiasElements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInputShape, tflite::testing::kInputData, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilterShape, tflite::testing::kFilterData, + filter_quantized, tflite::testing::kBiasShape, tflite::testing::kBiasData, + bias_quantized, scales, zero_points, tflite::testing::kOutputShape, + tflite::testing::kGoldenData, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelRelu6) { + // conv params: + // padding, stride_, dilation_, activation + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + + const float bias_values[] = {1, 2, -3}; + const float golden_data[] = {6, 2, 0, 6, 2, 0, 6, 4, 0, 6, 4, 0}; + + const float input_scale = 0.023529f; + const float output_scale = 0.023529f; + const int input_zero_point = -128; + const int output_zero_point = -128; + + int8_t input_quantized[tflite::testing::kInputElements]; + int8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + int8_t golden_quantized[tflite::testing::kOutputElements]; + int zero_points[tflite::testing::kBiasElements + 1]; + float scales[tflite::testing::kBiasElements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInputShape, tflite::testing::kInputData, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilterShape, tflite::testing::kFilterData, + filter_quantized, tflite::testing::kBiasShape, bias_values, + bias_quantized, scales, zero_points, tflite::testing::kOutputShape, + golden_data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) { + // conv params: + // padding, stride_, activation, dilation_ + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, + kTfLiteActNone, 1, 1}; + const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] + const int kInputElements = + kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; + float kInputData[/* kInputElements */] = {1, 1, 1, 1, 2, 2, 2, 2, + 1, 2, 3, 4, 1, 2, 3, 4}; + const int kFilterShape[] = {4, 3, 1, 1, 4}; + const int kFilterElements = + kFilterShape[1] * kFilterShape[2] * kFilterShape[3] * kFilterShape[4]; + float kFilterData[/* kFilterElements */] = {1, 2, 3, 4, -1, 1, + -1, 1, -1, -1, 1, 1}; + const int kBiasElements = kFilterShape[1]; + const int kBiasShape[] = {1, kBiasElements}; + float kBiasData[/* kBiasElements */] = {1, 2, 3}; + const int kOutputShape[] = {4, 1, 2, 2, kBiasElements}; + const int kOutputElements = 4 * 3; + int8_t output_data[kOutputElements]; + const float kGoldenData[/* kOutputElements */] = {11, 2, 3, 21, 2, 3, + 31, 4, 7, 31, 4, 7}; + + const float input_scale = 0.5f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[kInputElements]; + int8_t filter_quantized[kFilterElements]; + int32_t 
bias_quantized[kBiasElements]; + int8_t golden_quantized[kOutputElements]; + int zero_points[kBiasElements + 1]; + float scales[kBiasElements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + kInputShape, kInputData, input_quantized, input_scale, input_zero_point, + kFilterShape, kFilterData, filter_quantized, kBiasShape, kBiasData, + bias_quantized, scales, zero_points, kOutputShape, kGoldenData, + golden_quantized, output_data, output_scale, output_zero_point, + &conv_params); +} + +TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) { + // conv params: + // padding, stride_, dilation_, activation + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; + const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] + const int kInputElements = + kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; + float kInputData[/* kInputElements */] = {1, 1, 1, 1, 2, 2, 2, 2, + 1, 2, 3, 4, 1, 2, 3, 4}; + const int kFilterShape[] = {4, 3, 1, 1, 4}; + const int kFilterElements = + kFilterShape[1] * kFilterShape[2] * kFilterShape[3] * kFilterShape[4]; + float kFilterData[/* kFilterElements */] = {1, 2, 3, 4, -1, 1, + -1, 1, -1, -1, 1, 1}; + const int kBiasElements = kFilterShape[1]; + const int kBiasShape[] = {1, kBiasElements}; + float kBiasData[/* kBiasElements */] = {1, 2, -3}; + const int kOutputShape[] = {4, 1, 2, 2, kBiasElements}; + const int kOutputElements = 4 * 3; + int8_t output_data[kOutputElements]; + const float kGoldenData[/* kOutputElements */] = {6, 2, 0, 6, 2, 0, + 6, 4, 1, 6, 4, 1}; + + const float input_scale = 0.023529f; + const float output_scale = 0.023529f; + const int input_zero_point = -128; + const int output_zero_point = -128; + + int8_t input_quantized[kInputElements]; + int8_t filter_quantized[kFilterElements]; + int32_t bias_quantized[kBiasElements]; + int8_t golden_quantized[kOutputElements]; + int zero_points[kBiasElements + 1]; + float scales[kBiasElements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + kInputShape, kInputData, input_quantized, input_scale, input_zero_point, + kFilterShape, kFilterData, filter_quantized, kBiasShape, kBiasData, + bias_quantized, scales, zero_points, kOutputShape, kGoldenData, + golden_quantized, output_data, output_scale, output_zero_point, + &conv_params); +} + +TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + + const float input_scale = 0.5f; + const float output_scale = 1.0f; + + int8_t input_quantized[tflite::testing::kInputElements]; + int8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + int8_t golden_quantized[tflite::testing::kOutputElements]; + int zero_points[tflite::testing::kBiasElements + 1]; + float scales[tflite::testing::kBiasElements + 1]; + + TfLiteIntArray* input_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kInputShape); + TfLiteIntArray* filter_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kFilterShape); + TfLiteIntArray* bias_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kBiasShape); + TfLiteIntArray* output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShape); + + int filter_zero_points[5]; + float filter_scales[5]; + TfLiteAffineQuantization filter_quant; + TfLiteAffineQuantization bias_quant; + TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( + tflite::testing::kInputData, input_quantized, input_dims, input_scale, 0, + 
"input_tensor"); + TfLiteTensor filter_tensor = + tflite::testing::CreateSymmetricPerChannelQuantizedTensor( + tflite::testing::kFilterData, filter_quantized, filter_dims, + filter_scales, filter_zero_points, &filter_quant, + 0 /* quantized dimension */, "filter_tensor"); + TfLiteTensor bias_tensor = + tflite::testing::CreatePerChannelQuantizedBiasTensor( + tflite::testing::kBiasData, bias_quantized, bias_dims, input_scale, + &filter_scales[1], scales, zero_points, &bias_quant, 0, + "bias_tensor"); + TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( + output_data, output_dims, output_scale, 0 /* quantized dimension */, + "output_tensor"); + + float input_scales[] = {1, input_scale}; + int input_zero_points[] = {1, 128}; + TfLiteAffineQuantization input_quant = { + tflite::testing::FloatArrayFromFloats(input_scales), + tflite::testing::IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + tflite::AsymmetricQuantize(tflite::testing::kGoldenData, golden_quantized, + output_dims_count, output_scale, 0); + + // Set filter quant to mismatched dimension. + TfLiteAffineQuantization* quant = reinterpret_cast( + filter_tensor.quantization.params); + + // Choose arbitrary incorrect scale and zero point sizes which are neither 1 + // (for broadcast case) nor the quantized dimension size. + quant->scale->size = 2; + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, + tflite::testing::ValidateConvGoldens( + tensors, tensors_size, golden_quantized, output_data, + output_dims_count, &tflite::testing::common_conv_params)); + + // Set scale back to correct dimension, and make zero point array too short. + quant->scale->size = tflite::testing::kFilterShape[0]; + quant->zero_point->size = 2; + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, + tflite::testing::ValidateConvGoldens( + tensors, tensors_size, golden_quantized, output_data, + output_dims_count, &tflite::testing::common_conv_params)); +} + +TF_LITE_MICRO_TEST(BroadcastPerLayerQuantizationToPerChannelShouldMatchGolden) { + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + + const float input_scale = 1.0f; + const float filter_scale = 1.0f; + const float output_scale = 1.0f; + + int8_t input_quantized[tflite::testing::kInputElements]; + int8_t filter_quantized[tflite::testing::kFilterElements]; + int32_t bias_quantized[tflite::testing::kBiasElements]; + int8_t golden_quantized[tflite::testing::kOutputElements]; + + TfLiteIntArray* input_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kInputShape); + TfLiteIntArray* filter_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kFilterShape); + TfLiteIntArray* bias_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kBiasShape); + TfLiteIntArray* output_dims = + tflite::testing::IntArrayFromInts(tflite::testing::kOutputShape); + + // Create per-layer quantized int8 input tensor. 
+ TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( + tflite::testing::kInputData, input_quantized, input_dims, input_scale, 0, + "input_tensor"); + int input_zero_points[2] = {1, 0}; + float input_scales[2] = {1, input_scale}; + TfLiteAffineQuantization input_quant = { + tflite::testing::FloatArrayFromFloats(input_scales), + tflite::testing::IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + // Create per-layer quantized int8 filter tensor. + TfLiteTensor filter_tensor = tflite::testing::CreateQuantizedTensor( + tflite::testing::kFilterData, filter_quantized, filter_dims, filter_scale, + 0, "filter_tensor"); + int filter_zero_points[2] = {1, 0}; + float filter_scales[2] = {1, filter_scale}; + TfLiteAffineQuantization filter_quant = { + tflite::testing::FloatArrayFromFloats(filter_scales), + tflite::testing::IntArrayFromInts(filter_zero_points)}; + filter_tensor.quantization = {kTfLiteAffineQuantization, &filter_quant}; + + // Create per-layer quantized int32 bias tensor. + tflite::SymmetricQuantize(tflite::testing::kBiasData, bias_quantized, + tflite::testing::kBiasElements, + input_scale * output_scale); + TfLiteTensor bias_tensor = tflite::testing::CreateInt32Tensor( + bias_quantized, bias_dims, "bias_tensor"); + + int bias_zero_points[2] = {1, 0}; + float bias_scales[2] = {1, input_scale * filter_scale}; + TfLiteAffineQuantization bias_quant = { + tflite::testing::FloatArrayFromFloats(bias_scales), + tflite::testing::IntArrayFromInts(bias_zero_points)}; + bias_tensor.quantization = {kTfLiteAffineQuantization, &bias_quant}; + + // Create per-layer quantized int8 output tensor. + TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( + output_data, output_dims, output_scale, 0 /* quantized dimension */, + "output_tensor"); + int output_zero_points[2] = {1, 0}; + float output_scales[2] = {1, output_scale}; + TfLiteAffineQuantization output_quant = { + tflite::testing::FloatArrayFromFloats(output_scales), + tflite::testing::IntArrayFromInts(output_zero_points)}; + output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + tflite::AsymmetricQuantize(tflite::testing::kGoldenData, golden_quantized, + output_dims_count, output_scale, 0); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, tflite::testing::ValidateConvGoldens( + tensors, tensors_size, golden_quantized, output_data, + output_dims_count, &tflite::testing::common_conv_params)); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc new file mode 100644 index 00000000000..8b79885a8a8 --- /dev/null +++ b/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc @@ -0,0 +1,768 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/micro/testing/micro_test.h"
+#include "tensorflow/lite/micro/testing/test_utils.h"
+
+namespace tflite {
+namespace testing {
+namespace {
+
+constexpr int kMaxFilterChannels = 64;
+constexpr int kMaxBiasChannels = 64;
+
+// Index of the output tensor in context->tensors, specific to
+// DepthwiseConv.
+constexpr int kOutputTensorIndex = 3;
+
+// Creates a DepthwiseConv operator, calls it with the provided input tensors
+// and some default parameters, and compares the output with
+// expected_output_data.
+//
+// The tensors parameter contains both the input tensors as well as a
+// preallocated output tensor into which the output is stored.
+template <typename T>
+TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data,
+                                          int output_length,
+                                          TfLiteFusedActivation activation,
+                                          float tolerance, int tensors_size,
+                                          TfLiteTensor* tensors) {
+  TfLiteContext context;
+  PopulateContext(tensors, tensors_size, micro_test::reporter, &context);
+
+  ::tflite::ops::micro::AllOpsResolver resolver;
+  const TfLiteRegistration* registration =
+      resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1);
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration);
+
+  int input_depth = tensors[0].dims->data[3];
+  int output_depth = tensors[1].dims->data[3];
+  int depth_mul = output_depth / input_depth;
+  TfLiteDepthwiseConvParams builtin_data;
+  builtin_data.padding = kTfLitePaddingValid;
+  builtin_data.activation = activation;
+  builtin_data.stride_height = 1;
+  builtin_data.stride_width = 1;
+  builtin_data.dilation_height_factor = 1;
+  builtin_data.dilation_width_factor = 1;
+  builtin_data.depth_multiplier = depth_mul;
+
+  const char* init_data = reinterpret_cast<const char*>(&builtin_data);
+  size_t init_data_size = 0;
+  void* user_data = nullptr;
+  if (registration->init) {
+    user_data = registration->init(&context, init_data, init_data_size);
+  }
+  int inputs_array_data[] = {3, 0, 1, 2};
+  TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data);
+  int outputs_array_data[] = {1, 3};
+  TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data);
+  int temporaries_array_data[] = {0};
+  TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data);
+
+  TfLiteNode node;
+  node.inputs = inputs_array;
+  node.outputs = outputs_array;
+  node.temporaries = temporaries_array;
+  node.user_data = user_data;
+  node.builtin_data = reinterpret_cast<void*>(&builtin_data);
+  node.custom_initial_data = nullptr;
+  node.custom_initial_data_size = 0;
+  node.delegate = nullptr;
+  if (registration->prepare) {
+    TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node));
+  }
+  TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke);
+  TfLiteStatus invoke_status = registration->invoke(&context, &node);
+  if (invoke_status != kTfLiteOk) {
+    return invoke_status;
+  }
+
+  if 
(registration->free) { + registration->free(&context, user_data); + } + + const T* output_data = tflite::GetTensorData(&tensors[kOutputTensorIndex]); + for (int i = 0; i < output_length; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], + tolerance); + } + return kTfLiteOk; +} + +void TestDepthwiseConvFloat(const int* input_dims_data, const float* input_data, + const int* filter_dims_data, + const float* filter_data, const int* bias_dims_data, + const float* bias_data, + const float* expected_output_data, + const int* output_dims_data, + TfLiteFusedActivation activation, + float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(filter_data, filter_dims, "filter_tensor"), + CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + ValidateDepthwiseConvGoldens(expected_output_data, output_dims_count, + activation, 1e-5, tensors_size, tensors); +} + +void TestDepthwiseConvQuantizedPerLayer( + const int* input_dims_data, const float* input_data, + uint8_t* input_quantized, float input_scale, int input_zero_point, + const int* filter_dims_data, const float* filter_data, + uint8_t* filter_quantized, float filter_scale, int filter_zero_point, + const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, + const float* golden, uint8_t* golden_quantized, const int* output_dims_data, + uint8_t* output_data, float output_scale, int output_zero_point, + TfLiteFusedActivation activation) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + tflite::testing::CreateQuantizedTensor(input_data, input_quantized, + input_dims, input_scale, + input_zero_point, "input_tensor"), + tflite::testing::CreateQuantizedTensor( + filter_data, filter_quantized, filter_dims, filter_scale, + filter_zero_point, "filter_tensor"), + tflite::testing::CreateQuantizedBiasTensor(bias_data, bias_quantized, + bias_dims, input_scale, + filter_scale, "bias_tensor"), + tflite::testing::CreateQuantizedTensor(output_data, output_dims, + output_scale, output_zero_point, + "output_tensor"), + }; + + // TODO(njeff): Affine Quantization Params should be set on tensor creation. 
+ float filter_scales[] = {1, filter_scale}; + int filter_zero_points[] = {1, 128}; + TfLiteAffineQuantization filter_quant = { + FloatArrayFromFloats(filter_scales), + IntArrayFromInts(filter_zero_points)}; + tensors[1].quantization = {kTfLiteAffineQuantization, &filter_quant}; + + float bias_scales[] = {1, filter_scale * input_scale}; + int bias_zero_points[] = {1, 128}; + TfLiteAffineQuantization bias_quant = {FloatArrayFromFloats(bias_scales), + IntArrayFromInts(bias_zero_points)}; + tensors[2].quantization = {kTfLiteAffineQuantization, &bias_quant}; + + AsymmetricQuantize(golden, golden_quantized, output_dims_count, output_scale, + output_zero_point); + ValidateDepthwiseConvGoldens(golden_quantized, output_dims_count, activation, + 1.0, tensors_size, tensors); +} + +void TestDepthwiseConvQuantizedPerChannel( + const int* input_dims_data, const float* input_data, + int8_t* input_quantized, float input_scale, int input_zero_point, + const int* filter_dims_data, const float* filter_data, + int8_t* filter_data_quantized, const int* bias_dims_data, + const float* bias_data, int32_t* bias_data_quantized, + const int* output_dims_data, const float* expected_output_data, + int8_t* expected_output_data_quantized, int8_t* output_data, + float output_scale, int output_zero_point, + TfLiteFusedActivation activation) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + int filter_zero_points[kMaxFilterChannels]; + float filter_scales[kMaxFilterChannels]; + int bias_zero_points[kMaxBiasChannels]; + float bias_scales[kMaxBiasChannels]; + TfLiteAffineQuantization filter_quant; + TfLiteAffineQuantization bias_quant; + TfLiteTensor input_tensor = + CreateQuantizedTensor(input_data, input_quantized, input_dims, + input_scale, input_zero_point, "input_tensor"); + TfLiteTensor filter_tensor = CreateSymmetricPerChannelQuantizedTensor( + filter_data, filter_data_quantized, filter_dims, filter_scales, + filter_zero_points, &filter_quant, 3 /* quantized dimension */, + "filter_tensor"); + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( + bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1], + bias_scales, bias_zero_points, &bias_quant, 3 /* quantized dimension */, + "bias_tensor"); + TfLiteTensor output_tensor = + CreateQuantizedTensor(output_data, output_dims, output_scale, + input_zero_point, "output_tensor"); + + // TODO(njeff): Affine Quantization Params should be set on tensor creation. 
+ float input_scales[] = {1, input_scale}; + int input_zero_points[] = {1, input_zero_point}; + TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales), + IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + float output_scales[] = {1, output_scale}; + int output_zero_points[] = {1, output_zero_point}; + TfLiteAffineQuantization output_quant = { + FloatArrayFromFloats(output_scales), + IntArrayFromInts(output_zero_points)}; + output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + AsymmetricQuantize(expected_output_data, expected_output_data_quantized, + output_dims_count, output_scale, output_zero_point); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, ValidateDepthwiseConvGoldens(expected_output_data_quantized, + output_dims_count, activation, + 1.0, tensors_size, tensors)); +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleTest) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 71, -34, 99, -20, 91, -26, 127, -4, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + float output_data[output_dims_count]; + tflite::testing::TestDepthwiseConvFloat( + input_shape, input_values, filter_shape, filter_values, bias_shape, + bias_values, golden, output_shape, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantized) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 71, -34, 99, -20, 91, -26, 127, -4, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + + const float input_scale = 0.5f; + const int input_zero_point = 128; + const float filter_scale = 0.5f; + const int filter_zero_point = 128; + const float output_scale = 1.0f; + const int output_zero_point = 128; + + uint8_t input_quantized[input_elements]; + uint8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + uint8_t golden_quantized[output_elements]; + uint8_t output_data[output_elements]; + + tflite::testing::TestDepthwiseConvQuantizedPerLayer( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, filter_scale, + filter_zero_point, bias_shape, bias_values, bias_quantized, golden, + golden_quantized, output_shape, output_data, output_scale, + output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(SimpleTestRelu) { + 
const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + const float golden_relu[] = {71, 0, 99, 0, 91, 0, 127, 0}; + float output_data[output_dims_count]; + + tflite::testing::TestDepthwiseConvFloat( + input_shape, input_values, filter_shape, filter_values, bias_shape, + bias_values, golden_relu, output_shape, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestReluQuantized) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + const float golden_relu[] = {71, 0, 99, 0, 91, 0, 127, 0}; + + const float input_scale = 0.5f; + const int input_zero_point = 128; + const float filter_scale = 0.5f; + const int filter_zero_point = 128; + const float output_scale = 1.0f; + const int output_zero_point = 128; + + uint8_t input_quantized[input_elements]; + uint8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + uint8_t golden_quantized[output_elements]; + uint8_t output_data[output_elements]; + + tflite::testing::TestDepthwiseConvQuantizedPerLayer( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, filter_scale, + filter_zero_point, bias_shape, bias_values, bias_quantized, golden_relu, + golden_quantized, output_shape, output_data, output_scale, + output_zero_point, kTfLiteActRelu); +} + +TF_LITE_MICRO_TEST(SimpleTestOptimizedFilterWidth) { + const int input_elements = 12; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const float bias_values[] = {1, 2, 3, 4}; + const int output_dims_count = 9; + const int input_shape[] = {4, 1, 1, 9, 1}; + const int filter_shape[] = {4, 2, 1, 8, 1}; + const int bias_shape[] = {1, 1}; + const float goldens[] = { + 92, 56, 12, 22, 33, 72, 44, 20, 5, + }; + const int output_shape[] = {4, 1, 1, 9, 1}; + + const float input_scale = 1.0f; + const int input_zero_point = 128; + const float filter_scale = 0.5f; + const int filter_zero_point = 128; + const float output_scale = 1.0f; + const int output_zero_point = 128; + + uint8_t input_quantized[input_elements]; + uint8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + uint8_t golden_quantized[output_dims_count]; + uint8_t output_data[output_dims_count]; + + tflite::testing::TestDepthwiseConvQuantizedPerLayer( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, 
filter_values, filter_quantized, filter_scale, + filter_zero_point, bias_shape, bias_values, bias_quantized, goldens, + golden_quantized, output_shape, output_data, output_scale, + output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 71, -34, 99, -20, 91, -26, 127, -4, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + int8_t output_data[output_dims_count]; + + const float input_scale = 0.5; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelDepthMultiplier1) { + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 8; + const int filter_shape[] = {4, 1, 2, 2, 2}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12}; + const int bias_elements = 2; + const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 4; + const float bias_values[] = {1, 2}; + const float golden[] = { + -103, + 127, + -128, + 127, + }; + const int output_shape[] = {4, 1, 2, 1, 2}; + const int output_dims_count = 4; + int8_t output_data[output_dims_count]; + + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(TestQuantizedPerChannelDepthMultiplier1Relu6) { + const int input_elements = 24; + const int input_shape[] = {4, 1, 3, 2, 4}; + const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {0, 1, 8, -2, -1, 2, -10, 0, + -1, 3, -18, 0, 0, 4, 20, -3}; + const int bias_elements = 4; + const int 
bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 0, 6, 3, 0, 0, 6, 3, 0, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + int8_t output_data[output_elements]; + float output_float[output_elements]; + + const float input_scale = 0.023529f; + const float output_scale = 0.023529f; + const int input_zero_point = -128; + const int output_zero_point = -128; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvFloat( + input_shape, input_values, filter_shape, filter_values, bias_shape, + bias_values, golden, output_shape, kTfLiteActRelu6, output_float); + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActRelu6); +} + +TF_LITE_MICRO_TEST(TestQuantizedPerChannelCompareWithFloat) { + const int input_dims[] = {4, 1, 2, 3, 2}; + const float input_data[] = {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4}; + const int filter_dims[] = {4, 1, 2, 2, 4}; + const float filter_data[] = {1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 5, 6, 3, 4, 1, 2}; + const int bias_dims[] = {4, 1, 1, 1, 4}; + const float bias_data[] = {3, -2, 4, 6}; + const int output_dims[] = {4, 1, 1, 2, 4}; + const float golden[] = {43, 48, 18, 22, 3, -4, -28, -36}; + + const int input_size = 12; + const int filter_size = 16; + const int output_size = 8; + const int bias_size = 4; + int8_t input_quantized[input_size]; + int8_t filter_quantized[filter_size]; + int32_t bias_quantized[bias_size]; + int8_t golden_quantized[output_size]; + int zero_points[bias_size + 1]; + float scales[bias_size + 1]; + int8_t output_data[output_size]; + float output_float[output_size]; + + const float input_scale = 0.5; + const float output_scale = 1.0; + const int input_zero_point = 0; + const int output_zero_point = 0; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_dims, input_data, input_quantized, input_scale, input_zero_point, + filter_dims, filter_data, filter_quantized, bias_dims, bias_data, + bias_quantized, output_dims, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); + + tflite::testing::TestDepthwiseConvFloat( + input_dims, input_data, filter_dims, filter_data, bias_dims, bias_data, + golden, output_dims, kTfLiteActNone, output_float); +} + +TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { + const int input_shape[] = {4, 1, 2, 3, 2}; + const float input_data[] = {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4}; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_data[] = {1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 5, 6, 3, 4, 1, 2}; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const float bias_data[] = {3, -2, 4, 6}; + const int output_shape[] = {4, 1, 1, 2, 4}; + const float golden[] = {43, 48, 18, 22, 3, -4, -28, -36}; + + const int input_size = 12; + const int filter_size = 16; + const int output_size = 8; + const int bias_size = 4; + int8_t input_quantized[input_size]; + int8_t filter_quantized[filter_size]; + int32_t bias_quantized[bias_size]; + int8_t golden_quantized[output_size]; + int 
zero_points[bias_size + 1];
+  float scales[bias_size + 1];
+  int8_t output_data[output_size];
+  float output_float[output_size];
+
+  const float input_scale = 0.5;
+  const float output_scale = 1.0;
+  const int input_zero_point = 0;
+  const int output_zero_point = 0;
+
+  TfLiteIntArray* input_dims = tflite::testing::IntArrayFromInts(input_shape);
+  TfLiteIntArray* filter_dims = tflite::testing::IntArrayFromInts(filter_shape);
+  TfLiteIntArray* bias_dims = tflite::testing::IntArrayFromInts(bias_shape);
+  TfLiteIntArray* output_dims = tflite::testing::IntArrayFromInts(output_shape);
+
+  int filter_zero_points[5];
+  float filter_scales[5];
+  TfLiteAffineQuantization filter_quant;
+  TfLiteAffineQuantization bias_quant;
+  TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor(
+      input_data, input_quantized, input_dims, input_scale, input_zero_point,
+      "input_tensor");
+  TfLiteTensor filter_tensor =
+      tflite::testing::CreateSymmetricPerChannelQuantizedTensor(
+          filter_data, filter_quantized, filter_dims, filter_scales,
+          filter_zero_points, &filter_quant, 0 /* quantized dimension */,
+          "filter_tensor");
+  TfLiteTensor bias_tensor =
+      tflite::testing::CreatePerChannelQuantizedBiasTensor(
+          bias_data, bias_quantized, bias_dims, input_scale, &filter_scales[1],
+          scales, zero_points, &bias_quant, 0, "bias_tensor");
+  TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor(
+      output_data, output_dims, output_scale, output_zero_point,
+      "output_tensor");
+
+  float input_scales[] = {1, input_scale};
+  int input_zero_points[] = {1, input_zero_point};
+  TfLiteAffineQuantization input_quant = {
+      tflite::testing::FloatArrayFromFloats(input_scales),
+      tflite::testing::IntArrayFromInts(input_zero_points)};
+  input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant};
+
+  constexpr int inputs_size = 3;
+  constexpr int outputs_size = 1;
+  constexpr int tensors_size = inputs_size + outputs_size;
+  TfLiteTensor tensors[tensors_size] = {
+      input_tensor,
+      filter_tensor,
+      bias_tensor,
+      output_tensor,
+  };
+
+  // Set filter quant to mismatched dimension.
+  TfLiteAffineQuantization* quant = reinterpret_cast<TfLiteAffineQuantization*>(
+      filter_tensor.quantization.params);
+  quant->scale->size = 2;
+  TF_LITE_MICRO_EXPECT_EQ(
+      kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens(
+                        golden_quantized, output_size, kTfLiteActNone, 1e-5,
+                        tensors_size, tensors));
+
+  // Set scale back to correct dimension, and make zero point array too short.
+ quant->scale->size = filter_shape[0]; + quant->zero_point->size = 2; + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( + golden_quantized, output_size, kTfLiteActNone, 1e-5, + tensors_size, tensors)); +} + +TF_LITE_MICRO_TEST(PerChannelBroadcastQuantizationParams) { + const float input_scale = 1.0f; + const float filter_scale = 1.0f; + const float output_scale = 1.0f; + + const int input_elements = 12; + const int input_shape[] = {4, 1, 3, 2, 2}; + const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; + const int filter_elements = 16; + const int filter_shape[] = {4, 1, 2, 2, 4}; + const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, + 5, 6, 7, 8, 13, -14, 15, -16}; + const int bias_elements = 4; + const int bias_shape[] = {4, 1, 1, 1, 4}; + const int output_elements = 8; + const float bias_values[] = {1, 2, 3, 4}; + const float golden[] = { + 71, -34, 99, -20, 91, -26, 127, -4, + }; + const int output_shape[] = {4, 1, 2, 1, 4}; + const int output_dims_count = 8; + int8_t output_data[output_dims_count]; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + + TfLiteIntArray* input_dims = tflite::testing::IntArrayFromInts(input_shape); + TfLiteIntArray* filter_dims = tflite::testing::IntArrayFromInts(filter_shape); + TfLiteIntArray* bias_dims = tflite::testing::IntArrayFromInts(bias_shape); + TfLiteIntArray* output_dims = tflite::testing::IntArrayFromInts(output_shape); + + // Create per-layer quantized int8 input tensor. + TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( + input_values, input_quantized, input_dims, input_scale, 0, + "input_tensor"); + int input_zero_points[2] = {1, 0}; + float input_scales[2] = {1, input_scale}; + TfLiteAffineQuantization input_quant = { + tflite::testing::FloatArrayFromFloats(input_scales), + tflite::testing::IntArrayFromInts(input_zero_points)}; + input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; + + // Create per-layer quantized int8 filter tensor. + TfLiteTensor filter_tensor = tflite::testing::CreateQuantizedTensor( + filter_values, filter_quantized, filter_dims, filter_scale, 0, + "filter_tensor"); + int filter_zero_points[2] = {1, 0}; + float filter_scales[2] = {1, filter_scale}; + TfLiteAffineQuantization filter_quant = { + tflite::testing::FloatArrayFromFloats(filter_scales), + tflite::testing::IntArrayFromInts(filter_zero_points)}; + filter_tensor.quantization = {kTfLiteAffineQuantization, &filter_quant}; + + // Create per-layer quantized int32 bias tensor. + tflite::SymmetricQuantize(bias_values, bias_quantized, bias_elements, + input_scale * output_scale); + TfLiteTensor bias_tensor = tflite::testing::CreateInt32Tensor( + bias_quantized, bias_dims, "bias_tensor"); + + int bias_zero_points[2] = {1, 0}; + float bias_scales[2] = {1, input_scale * filter_scale}; + TfLiteAffineQuantization bias_quant = { + tflite::testing::FloatArrayFromFloats(bias_scales), + tflite::testing::IntArrayFromInts(bias_zero_points)}; + bias_tensor.quantization = {kTfLiteAffineQuantization, &bias_quant}; + + // Create per-layer quantized int8 output tensor. 
+ TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( + output_data, output_dims, output_scale, 0, "output_tensor"); + int output_zero_points[2] = {1, 0}; + float output_scales[2] = {1, output_scale}; + TfLiteAffineQuantization output_quant = { + tflite::testing::FloatArrayFromFloats(output_scales), + tflite::testing::IntArrayFromInts(output_zero_points)}; + output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + input_tensor, + filter_tensor, + bias_tensor, + output_tensor, + }; + + tflite::AsymmetricQuantize(golden, golden_quantized, output_dims_count, + output_scale, 0); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, tflite::testing::ValidateDepthwiseConvGoldens( + golden_quantized, output_dims_count, kTfLiteActNone, 1e-5, + tensors_size, tensors)); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc b/tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc new file mode 100644 index 00000000000..539c7ecc3a4 --- /dev/null +++ b/tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc @@ -0,0 +1,938 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +void TestFullyConnectedFloat( + const int* input_dims_data, const float* input_data, + const int* weights_dims_data, const float* weights_data, + const int* bias_dims_data, const float* bias_data, + const float* expected_output_data, const int* output_dims_data, + TfLiteFusedActivation activation, float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(weights_data, weights_dims, "weights_tensor"), + CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLiteFullyConnectedParams builtin_data = { + activation, + kTfLiteFullyConnectedWeightsFormatDefault, + }; + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], 1e-5f); + } +} + +template +void TestFullyConnectedQuantized( + const int* input_dims_data, const T* input_data, const float input_min, + const float input_max, const int* weights_dims_data, const T* weights_data, + const float weights_min, const float weights_max, const int* bias_dims_data, + const int32_t* bias_data, const float 
bias_scale, + const T* expected_output_data, const int* output_dims_data, + const float output_min, const float output_max, + TfLiteFusedActivation activation, T* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data); + TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 3; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min, + input_max), + CreateQuantizedTensor(weights_data, weights_dims, "weights_tensor", + weights_min, weights_max), + CreateQuantized32Tensor(bias_data, bias_dims, "bias_tensor", bias_scale), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 4); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLiteFullyConnectedParams builtin_data = { + activation, + kTfLiteFullyConnectedWeightsFormatDefault, + }; + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + + int inputs_array_data[] = {3, 0, 1, 2}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 3}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_EQ(expected_output_data[i], output_data[i]); + } +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleTest) { + const int input_dims_data[] = {2, 2, 10}; + const float input_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }; + const int weights_dims_data[] = {2, 3, 10}; + const float weights_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }; + const int bias_dims_data[] = {1, 3}; + const float bias_data[] = {1, 2, 3}; + const float expected_output_data[] = { + 24, 25, 26, 58, 59, 60, + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + float 
output_data[output_dims_count]; + tflite::testing::TestFullyConnectedFloat( + input_dims_data, input_data, weights_dims_data, weights_data, + bias_dims_data, bias_data, expected_output_data, output_dims_data, + kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest2) { + const int input_dims_data[] = {2, 2, 2}; + const float input_data[] = { + 1, 2, // b = 0 + 2, 1, // b = 1 + }; + const int weights_dims_data[] = {2, 1, 2}; + const float weights_data[] = { + 2, 4, // u = 0 + }; + const int bias_dims_data[] = {1, 1}; + const float bias_data[] = {1}; + const float expected_output_data[] = { + 11, + 9, + }; + const int output_dims_data[] = {2, 2, 1}; + + const int output_dims_count = 6; + float output_data[output_dims_count]; + tflite::testing::TestFullyConnectedFloat( + input_dims_data, input_data, weights_dims_data, weights_data, + bias_dims_data, bias_data, expected_output_data, output_dims_data, + kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestRelu) { + const int input_dims_data[] = {2, 2, 10}; + const float input_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }; + const int weights_dims_data[] = {2, 3, 10}; + const float weights_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }; + const int bias_dims_data[] = {1, 3}; + const float bias_data[] = {1, -2, 3}; + const float expected_output_data[] = { + 24, 0, 26, 58, 0, 60, + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + float output_data[output_dims_count]; + tflite::testing::TestFullyConnectedFloat( + input_dims_data, input_data, weights_dims_data, weights_data, + bias_dims_data, bias_data, expected_output_data, output_dims_data, + kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -63.5f; + const float weights_max = 64.0f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {2, 2, 10}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, 
weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +// TODO(b/138811455): Fix code duplication in micro tests +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -64.0f; + const float weights_max = 63.5f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {2, 2, 10}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), 
F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8Relu) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -63.5f; + const float weights_max = 64.0f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {2, 2, 10}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(-1, weights_min, weights_max), F2Q(-2, weights_min, weights_max), + F2Q(-3, weights_min, weights_max), F2Q(-4, weights_min, weights_max), + F2Q(-5, weights_min, weights_max), F2Q(-6, weights_min, weights_max), + F2Q(-7, weights_min, weights_max), F2Q(-8, weights_min, weights_max), + F2Q(-9, weights_min, weights_max), F2Q(-10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(0, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(0, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(0, output_min, output_max), F2Q(60, output_min, output_max), 
+ }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8Relu) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -64.0f; + const float weights_max = 63.5f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {2, 2, 10}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(-1, weights_min, weights_max), F2QS(-2, weights_min, weights_max), + F2QS(-3, weights_min, weights_max), F2QS(-4, weights_min, weights_max), + F2QS(-5, weights_min, weights_max), F2QS(-6, weights_min, weights_max), + F2QS(-7, weights_min, weights_max), F2QS(-8, weights_min, weights_max), + F2QS(-9, weights_min, weights_max), F2QS(-10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(0, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(0, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(0, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActRelu, output_data); +} + 
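+// A note on the helpers used by the quantized tests above and below: F2Q,
+// F2QS and F2Q32 come from micro/testing/test_utils.h and map float values
+// into the quantized domain. As a rough sketch (the exact rounding and
+// clamping live in that header), scale = (max - min) / 255 and the zero
+// point is chosen so that [min, max] spans the full integer range. For
+// example, with input_min = -63.5f and input_max = 64.0f the scale is 0.5;
+// the uint8 zero point is 127, so F2Q(1, -63.5f, 64.0f) == 129, while the
+// int8 zero point is -1, so F2QS(1, -63.5f, 64.0f) == 1. F2Q32(v, scale)
+// quantizes a bias roughly as v / scale.
+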
+TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8OutputMultiplierGreaterThan1) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -127.0f; + const float input_max = 128.0f; + const float weights_min = -127.0f; + const float weights_max = 128.0f; + const float bias_scale = 1.0f; + const float output_min = -63.5f; + const float output_max = 64.0f; + + const int input_dims_data[] = {2, 2, 10}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8OutputMultiplierGreaterThan1) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -127.0f; + const float input_max = 128.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -63.5f; + const float output_max = 64.0f; + + const int input_dims_data[] = {2, 2, 10}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), 
F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest4DInput) { + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const float input_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }; + const int weights_dims_data[] = {2, 3, 10}; + const float weights_data[] = { + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }; + const int bias_dims_data[] = {1, 3}; + const float bias_data[] = {1, 2, 3}; + const float expected_output_data[] = { + 24, 25, 26, 58, 59, 60, // Expected results. 
+ }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + float output_data[output_dims_count]; + tflite::testing::TestFullyConnectedFloat( + input_dims_data, input_data, weights_dims_data, weights_data, + bias_dims_data, bias_data, expected_output_data, output_dims_data, + kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedUInt8) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -63.5f; + const float input_max = 64.0f; + const float weights_min = -63.5f; + const float weights_max = 64.0f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -63.5f; + const float 
input_max = 64.0f; + const float weights_min = -64.0f; + const float weights_max = 63.5f; + const float bias_scale = 0.25f; + const float output_min = -127.0f; + const float output_max = 128.0f; + + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST( + SimpleTest4DInputQuantizedUInt8OutputMultiplierGreaterThan1) { + using tflite::testing::F2Q; + using tflite::testing::F2Q32; + + const float input_min = -127.0f; + const float input_max = 128.0f; + const float weights_min = -127.0f; + const float weights_max = 128.0f; + const float bias_scale = 1.0f; + const float output_min = -63.5f; + const float output_max = 64.0f; + + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const uint8_t input_data[] = { + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, 
input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), + F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), + F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), + F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), + F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), + F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), + F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const uint8_t weights_data[] = { + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), + F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), + F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), + F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), + F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const uint8_t expected_output_data[] = { + F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), + F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), + F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + uint8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8OutputMultiplierGreaterThan1) { + using tflite::testing::F2Q32; + using tflite::testing::F2QS; + + const float input_min = -127.0f; + const float input_max = 128.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -63.5f; + const float output_max = 64.0f; + + const int input_dims_data[] = {4, 1, 1, 5, 1}; + const int8_t input_data[] = { + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), + F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), + F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), + F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), + F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), + F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), + F2QS(9, input_min, input_max), F2QS(-10, 
input_min, input_max), + }; + const int weights_dims_data[] = {2, 3, 10}; + const int8_t weights_data[] = { + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), + F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), + F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), + F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), + F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), + }; + const int bias_dims_data[] = {1, 3}; + const int32_t bias_data[] = { + F2Q32(1, bias_scale), + F2Q32(2, bias_scale), + F2Q32(3, bias_scale), + }; + const int8_t expected_output_data[] = { + F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), + F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), + F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), + }; + const int output_dims_data[] = {2, 2, 3}; + + const int output_dims_count = 6; + int8_t output_data[output_dims_count]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data, input_data, input_min, input_max, weights_dims_data, + weights_data, weights_min, weights_max, bias_dims_data, bias_data, + bias_scale, expected_output_data, output_dims_data, output_min, + output_max, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc new file mode 100644 index 00000000000..8bfeb718a1b --- /dev/null +++ b/tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc @@ -0,0 +1,1116 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +void TestAveragePoolingFloat(std::initializer_list input_dims_data, + std::initializer_list input_data, + const int filter_height, const int filter_width, + const int stride_height, const int stride_width, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, + TfLitePadding padding, + TfLiteFusedActivation activation, + float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLitePoolParams builtin_data = {padding, stride_width, stride_height, + filter_width, filter_height, activation}; + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5f); + } +} + +template +void TestAveragePoolingQuantized( + std::initializer_list input_dims_data, + std::initializer_list input_data, const float input_min, + const float input_max, const int filter_height, const int filter_width, + const int stride_height, const int stride_width, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, float output_min, + float output_max, TfLitePadding padding, TfLiteFusedActivation activation, + T* output_data) { + 
static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed."); + + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min, + input_max), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLitePoolParams builtin_data = {padding, stride_width, stride_height, + filter_width, filter_height, activation}; + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5f); + } +} + +void TestMaxPoolFloat(std::initializer_list input_dims_data, + std::initializer_list input_data, int filter_width, + int filter_height, int stride_width, int stride_height, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, + TfLitePadding padding, TfLiteFusedActivation activation, + float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); 
+ + TfLitePoolParams builtin_data = { + padding, stride_width, stride_height, + filter_width, filter_height, activation, + }; + + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5f); + } +} + +template +void TestMaxPoolQuantized(std::initializer_list input_dims_data, + std::initializer_list input_data, float input_min, + float input_max, int filter_width, int filter_height, + int stride_width, int stride_height, + std::initializer_list expected_output_data, + float output_min, float output_max, + std::initializer_list output_dims_data, + TfLitePadding padding, + TfLiteFusedActivation activation, T* output_data) { + static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed."); + + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min, + input_max), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + TfLitePoolParams builtin_data = { + padding, stride_width, stride_height, + filter_width, filter_height, activation, + }; + + const char* init_data = reinterpret_cast(&builtin_data); + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + int temporaries_array_data[] = {0}; + TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); + + TfLiteNode node; + node.inputs = 
inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = reinterpret_cast(&builtin_data); + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]); + } +} + +} // namespace + +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestFloat) { + float output_data[2]; + tflite::testing::TestAveragePoolingFloat({4, 1, 2, 4, 1}, // Input shape + { // Input values + 0., 6., 2., 4., 3., 2., 10., 7.}, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 2.75, + 5.75, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestUint8) { + using tflite::testing::F2Q; + + const float input_min = -15.9375; + const float input_max = 15.9375; + const float output_min = -15.9375; + const float output_max = 15.9375; + uint8_t output_data[2]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0., input_min, input_max), + F2Q(-6., input_min, input_max), + F2Q(2., input_min, input_max), + F2Q(4., input_min, input_max), + F2Q(3., input_min, input_max), + F2Q(2., input_min, input_max), + F2Q(-10., input_min, input_max), + F2Q(7., input_min, input_max), + }, + input_min, input_max, // input quantization range + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + F2Q(0., output_min, output_max), + F2Q(0.75, output_min, output_max), + }, + {4, 1, 1, 2, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2ActNone) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[2]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 2, 2, // stride height, stride width + { // Output values + F2QS(-0.25, output_min, output_max), F2QS(0.75, output_min, output_max)}, + {4, 1, 1, 2, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride1Stride2Relu) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t 
output_data[3]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 2, 1, // stride height, stride width + { // Output values + F2QS(0., output_min, output_max), F2QS(0., output_min, output_max), + F2QS(0.75, output_min, output_max)}, + {4, 1, 1, 3, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2Stride1Relu1) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[2]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 2, // stride height, stride width + { // Output values + F2QS(-0.25, output_min, output_max), F2QS(0.75, output_min, output_max)}, + {4, 1, 1, 2, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActRelu1, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2Relu6) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[2]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(3., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(8., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 2, 2, // stride height, stride width + { // Output values + F2QS(0.5, output_min, output_max), F2QS(6., output_min, output_max)}, + {4, 1, 1, 2, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActRelu6, output_data); +} + +TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) { + using tflite::testing::F2QS; + + const float input_min = -15.9375; + const float input_max = 15.8130; + const float output_min = -15.9375; + const float output_max = 15.8130; + int8_t output_data[8]; + tflite::testing::TestAveragePoolingQuantized( + {4, 1, 2, 4, 1}, // Input shape + { // Input values + F2QS(3., input_min, input_max), F2QS(-6., input_min, input_max), + F2QS(8., input_min, input_max), F2QS(4., input_min, input_max), + F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), + F2QS(10., input_min, input_max), F2QS(7., input_min, input_max)}, + input_min, input_max, // input quantization range + 2, 2, // filter 
height, filter width + 1, 1, // stride height, stride width + { // Output values + F2QS(0.5, output_min, output_max), F2QS(3.5, output_min, output_max), + F2QS(7.25, output_min, output_max), F2QS(5.5, output_min, output_max), + F2QS(2.5, output_min, output_max), F2QS(6., output_min, output_max), + F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)}, + {4, 1, 2, 4, 1}, // Output shape + output_min, output_max, // output quantization range + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) { + float output_data[2]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { // Input values + 0, 6, 2, 4, 3, 2, 10, 7}, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 6, + 10, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu) { + float output_data[2]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + -1, -6, 2, 4, // + -3, -2, 10.5, 7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 0.0, + 10.5, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu1) { + float output_data[2]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + -2.75, -6, 0.2, 0.4, // + -3, -2, -0.3, 0.7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + -1.0, + 0.7, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu1, + output_data); + + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + -2.75, -6, -2, -4, // + -3, -2, 10, -7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + -1.0, + 1.0, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu1, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu6) { + float output_data[2]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + -1.5, -6, 12, 4, // + -3, -2, 10, 7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 0.0, + 6.0, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, + output_data); + + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + 0, 4.5, 12, 4, // + 3, 2, 10, 7, // + }, + 2, 2, // filter width, filter height + 2, 2, // stride width, stride height + { + // Output values + 4.5, + 6.0, + }, + {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestPaddingSameStride1) { + float output_data[8]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + 0, 6, 2, 4, // + 3, 2, 10, 7, // + }, + 2, 2, // filter width, filter height + 1, 1, // stride width, stride height + { + // Output values + 6, 10, 10, 7, // + 3, 10, 10, 7, // + }, + {4, 1, 2, 4, 1}, // Output shape + kTfLitePaddingSame, kTfLiteActNone, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestPaddingValidStride1) { + float output_data[3]; + tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape + { + // Input values + 0, 6, 2, 4, // + 3, 
2, 10, 7, // + }, + 2, 2, // filter width, filter height + 1, 1, // stride width, stride height + { + // Output values + 6, + 10, + 10, + }, + {4, 1, 1, 3, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, + output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestUInt8ActNone) { + using tflite::testing::F2Q; + + uint8_t output_data[2]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(6, output_min, output_max), F2Q(10, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu) { + using tflite::testing::F2Q; + + uint8_t output_data[2]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(-1.5, input_min, input_max), + F2Q(-6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(-3, input_min, input_max), + F2Q(-2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(0, output_min, output_max), F2Q(10, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu1) { + using tflite::testing::F2Q; + + uint8_t output_data[2]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(-1.7, input_min, input_max), + F2Q(-6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(-3, input_min, input_max), + F2Q(-2, input_min, input_max), + F2Q(-10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(-1.0, output_min, output_max), F2Q(1.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu1, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu6) { + using tflite::testing::F2Q; + + uint8_t output_data[8]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input 
shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(-6, input_min, input_max), + F2Q(12, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(-3, input_min, input_max), + F2Q(-2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(0.0, output_min, output_max), F2Q(6.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, output_data); + + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(4.5, input_min, input_max), + F2Q(12, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2Q(4.5, output_min, output_max), F2Q(6.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingSameStride1) { + using tflite::testing::F2Q; + + uint8_t output_data[8]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + { + // Output values + F2Q(6, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(7, output_min, output_max), + F2Q(3, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(7, output_min, output_max), + }, + output_min, output_max, {4, 1, 2, 4, 1}, // Output shape + kTfLitePaddingSame, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingValidStride1) { + using tflite::testing::F2Q; + + uint8_t output_data[3]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2Q(0, input_min, input_max), + F2Q(6, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(4, input_min, input_max), + F2Q(3, input_min, input_max), + F2Q(2, input_min, input_max), + F2Q(10, input_min, input_max), + F2Q(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + { + // Output values + F2Q(6, output_min, output_max), + F2Q(10, output_min, output_max), + F2Q(10, output_min, output_max), + }, + output_min, output_max, {4, 1, 1, 3, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(SimpleMaxPoolTestInt8ActNone) { + using tflite::testing::F2QS; + + int8_t output_data[2]; + float 
input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(3, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(6, output_min, output_max), F2QS(10, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu) { + using tflite::testing::F2QS; + + int8_t output_data[2]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(-1.5, input_min, input_max), + F2QS(-6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(-3, input_min, input_max), + F2QS(-2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(0, output_min, output_max), F2QS(10, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu1) { + using tflite::testing::F2QS; + + int8_t output_data[2]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(-1.7, input_min, input_max), + F2QS(-6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(-3, input_min, input_max), + F2QS(-2, input_min, input_max), + F2QS(-10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(-1.0, output_min, output_max), F2QS(1.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu1, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu6) { + using tflite::testing::F2QS; + + int8_t output_data[8]; + float input_min = -15.9375; + float input_max = 15.9375; + float output_min = -15.9375; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 2; + int stride_height = 2; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(-6, input_min, input_max), + F2QS(12, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(-3, input_min, input_max), + F2QS(-2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, 
filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(0.0, output_min, output_max), F2QS(6.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, output_data); + + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(4.5, input_min, input_max), + F2QS(12, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(3, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + {// Output values + F2QS(4.5, output_min, output_max), F2QS(6.0, output_min, output_max)}, + output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActRelu6, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingSameStride1) { + using tflite::testing::F2QS; + + int8_t output_data[8]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(3, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + { + // Output values + F2QS(6, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(7, output_min, output_max), + F2QS(3, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(7, output_min, output_max), + }, + output_min, output_max, {4, 1, 2, 4, 1}, // Output shape + kTfLitePaddingSame, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingValidStride1) { + using tflite::testing::F2QS; + + int8_t output_data[3]; + float input_min = 0; + float input_max = 15.9375; + float output_min = 0; + float output_max = 15.9375; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + tflite::testing::TestMaxPoolQuantized( + {4, 1, 2, 4, 1}, // Input shape + { + // Input values + F2QS(0, input_min, input_max), + F2QS(6, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(4, input_min, input_max), + F2QS(3, input_min, input_max), + F2QS(2, input_min, input_max), + F2QS(10, input_min, input_max), + F2QS(7, input_min, input_max), + }, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + { + // Output values + F2QS(6, output_min, output_max), + F2QS(10, output_min, output_max), + F2QS(10, output_min, output_max), + }, + output_min, output_max, {4, 1, 1, 3, 1}, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc index 851a5d43378..0cba07d9d27 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc @@ -1,6 +1,6 @@ ifeq ($(TARGET_ARCH), arc) -# embarc_mli Library is used by 
default for ARC platform whenever it's possible. +# embarc_mli Library is used by default for ARC platform whenever it is possible. # To use TFLM reference implementation it should be intentionally turned off # by passing 'no_embarc_mli' tag (make -f TAGS=no_embarc_mli ...) ifeq ($(filter no_embarc_mli,$(ALL_TAGS)),) @@ -63,5 +63,14 @@ endif MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h + + MICROLITE_TEST_SRCS += $(wildcard tensorflow/lite/micro/kernels/embarc_mli/*test.cc) + + EMBARC_MLI_TESTS := conv depthwise_conv pooling fully_connected + EMBARC_MLI_TESTS += $(foreach TEST,$(EMBARC_MLI_TESTS), $(TEST)_slicing) + +generate_embarc_mli_test_projects: $(foreach TEST,$(EMBARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project) + + endif # no_embarc_mli endif # TARGET_ARCH From fc83b7fedb4f8727ac63c9e8b4c3bc7e8e75643c Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 15 Apr 2020 13:26:08 +0300 Subject: [PATCH 0081/1533] embARC MLI related code is present in arc_mli --- .../kernels/{embarc_mli => arc_mli}/conv.cc | 8 +- .../conv_slicing_test.cc | 0 .../{embarc_mli => arc_mli}/depthwise_conv.cc | 8 +- .../depthwise_conv_slicing_test.cc | 0 .../fully_connected.cc | 8 +- .../fully_connected_slicing_test.cc | 0 .../{embarc_mli => arc_mli}/mli_slicers.cc | 0 .../{embarc_mli => arc_mli}/mli_slicers.h | 0 .../{embarc_mli => arc_mli}/mli_tf_utils.h | 0 .../{embarc_mli => arc_mli}/pooling.cc | 8 +- .../pooling_slicing_test.cc | 0 .../scratch_buf_mgr.cc | 4 +- .../{embarc_mli => arc_mli}/scratch_buf_mgr.h | 0 .../scratch_buffers.cc | 2 +- .../{embarc_mli => arc_mli}/scratch_buffers.h | 0 .../micro/tools/make/ext_libs/arc_mli.inc | 92 +++++++++++++++++++ .../micro/tools/make/ext_libs/embarc_mli.inc | 76 --------------- 17 files changed, 111 insertions(+), 95 deletions(-) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/conv.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/conv_slicing_test.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/depthwise_conv.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/depthwise_conv_slicing_test.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/fully_connected.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/fully_connected_slicing_test.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/mli_slicers.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/mli_slicers.h (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/mli_tf_utils.h (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/pooling.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/pooling_slicing_test.cc (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/scratch_buf_mgr.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/scratch_buf_mgr.h (100%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/scratch_buffers.cc (98%) rename tensorflow/lite/micro/kernels/{embarc_mli => arc_mli}/scratch_buffers.h (100%) create mode 100644 tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc delete mode 100644 tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/conv.cc rename to 
tensorflow/lite/micro/kernels/arc_mli/conv.cc index b124b17f66d..d02f081434f 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -24,10 +24,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/conv_slicing_test.cc rename to tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc rename to tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 0ad2a9fe6c6..049347cc7a1 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -25,10 +25,10 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/depthwise_conv_slicing_test.cc rename to tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc rename to tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 8088634f8de..61fa0ff397f 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -23,10 +23,10 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/fully_connected_slicing_test.cc rename to tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc rename to tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h rename to tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h diff --git a/tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h b/tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h rename to tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h diff --git a/tensorflow/lite/micro/kernels/embarc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/pooling.cc rename to tensorflow/lite/micro/kernels/arc_mli/pooling.cc index a147171a859..ced5c4a21b8 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -20,10 +20,10 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" #include "mli_api.h" diff --git a/tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/pooling_slicing_test.cc rename to tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc diff --git a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc rename to tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc index 8d00e28714c..d030d04170c 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" #include #define MAX(A,B) (((A) > (B))? (A): (B)) #define MIN(A,B) (((A) > (B))? (B): (A)) diff --git a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h rename to tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h diff --git a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc similarity index 98% rename from tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc rename to tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc index 689c490569e..a770e4ccd66 100644 --- a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" #include #define MAX(A,B) (((A) > (B))? (A): (B)) #define MIN(A,B) (((A) > (B))? 
(B): (A)) diff --git a/tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h similarity index 100% rename from tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h rename to tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h diff --git a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc new file mode 100644 index 00000000000..3b8fa04d536 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc @@ -0,0 +1,92 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Settings for embARC MLI library for ARC platform. + +ifeq ($(TARGET_ARCH), arc) + +# MLI Library is used by default for ARC platform whenever it is possible. +# To use TFLM reference implementation MLI should be intentionally turned off +# by passing 'no_arc_mli' tag (make -f TAGS=no_arc_mli ...) +ifeq ($(filter no_arc_mli,$(ALL_TAGS)),) + + +ALL_TAGS += arc_mli + +ifeq ($(PRE_COMPILED_MLI),true) + # TODO: Replace with proper arc_mli pre-builts. + $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) + + MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include + MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a + + THIRD_PARTY_CC_HDRS += \ + third_party/embarc_osp/LICENSE +else + MLI_LIB_DIR = arc_mli_$(basename $(TCF_FILE_NAME)) + + $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +endif + + THIRD_PARTY_CC_HDRS += $(MLI_LIB) + GENERATED_PROJECT_LIBS += $(MLI_LIB) + + INCLUDES += \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ + -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api + + GENERATED_PROJECT_INCLUDES += \ + -I. 
\ + -I./third_party/$(MLI_INCLUDE_FOLDER) \ + -I./third_party/$(MLI_INCLUDE_FOLDER)/api + + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ + third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ + third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h + + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h + MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc + MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h + + + MICROLITE_TEST_SRCS += $(wildcard tensorflow/lite/micro/kernels/arc_mli/*test.cc) + + ARC_MLI_TESTS := conv depthwise_conv pooling fully_connected + ARC_MLI_TESTS += $(foreach TEST,$(ARC_MLI_TESTS), $(TEST)_slicing) + +generate_arc_mli_test_projects: $(foreach TEST,$(ARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project) + + +endif # no_embarc_mli +endif # TARGET_ARCH diff --git a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc deleted file mode 100644 index 0cba07d9d27..00000000000 --- a/tensorflow/lite/micro/tools/make/ext_libs/embarc_mli.inc +++ /dev/null @@ -1,76 +0,0 @@ -ifeq ($(TARGET_ARCH), arc) - -# embarc_mli Library is used by default for ARC platform whenever it is possible. -# To use TFLM reference implementation it should be intentionally turned off -# by passing 'no_embarc_mli' tag (make -f TAGS=no_embarc_mli ...) -ifeq ($(filter no_embarc_mli,$(ALL_TAGS)),) - - -ALL_TAGS += embarc_mli - -ifeq ($(PRE_COMPILED_MLI),true) - # TODO: Replace with proper embarc_mli pre-builts. - $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else - MLI_LIB_DIR = embarc_mli_$(basename $(TCF_FILE_NAME)) - - $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) - - MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include - MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/libmli.a - MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/libmli.a - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_LIB_DIR)/LICENSE -endif - - THIRD_PARTY_CC_HDRS += $(MLI_LIB) - GENERATED_PROJECT_LIBS += $(MLI_LIB) - - INCLUDES += \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER) \ - -I$(MAKEFILE_DIR)/downloads/$(MLI_INCLUDE_FOLDER)/api - - GENERATED_PROJECT_INCLUDES += \ - -I. 
\ - -I./third_party/$(MLI_INCLUDE_FOLDER) \ - -I./third_party/$(MLI_INCLUDE_FOLDER)/api - - - THIRD_PARTY_CC_HDRS += \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_config.h \ - third_party/$(MLI_INCLUDE_FOLDER)/mli_types.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_helpers_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_kernels_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_avepool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_depthwise_conv2d_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_krn_maxpool_spec_api.h \ - third_party/$(MLI_INCLUDE_FOLDER)/api/mli_mov_api.h - - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buffers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/scratch_buf_mgr.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.h - MICROLITE_CC_SRCS += tensorflow/lite/micro/kernels/embarc_mli/mli_slicers.cc - MICROLITE_CC_HDRS += tensorflow/lite/micro/kernels/embarc_mli/mli_tf_utils.h - - - MICROLITE_TEST_SRCS += $(wildcard tensorflow/lite/micro/kernels/embarc_mli/*test.cc) - - EMBARC_MLI_TESTS := conv depthwise_conv pooling fully_connected - EMBARC_MLI_TESTS += $(foreach TEST,$(EMBARC_MLI_TESTS), $(TEST)_slicing) - -generate_embarc_mli_test_projects: $(foreach TEST,$(EMBARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project) - - -endif # no_embarc_mli -endif # TARGET_ARCH From 1196bed72bcedb8abc72a3da70c7ba58af03395f Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 16 Apr 2020 12:15:40 +0300 Subject: [PATCH 0082/1533] Merge latest updates from reference kernelse inside wrappers of arc_mli + fix minor bugs in kernel tests --- tensorflow/lite/micro/kernels/arc_mli/conv.cc | 180 +++++--- .../micro/kernels/arc_mli/depthwise_conv.cc | 389 ++++++++++-------- .../micro/kernels/arc_mli/fully_connected.cc | 49 ++- tensorflow/lite/micro/kernels/conv_test.cc | 4 +- tensorflow/lite/micro/kernels/pooling_test.cc | 2 +- 5 files changed, 361 insertions(+), 263 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc index d02f081434f..b9be93ceb11 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/conv.h" -#include "mli_api.h" // NOLINT +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -24,12 +24,10 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/kernels/padding.h" -#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" - -#include "mli_api.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" namespace tflite { namespace ops { @@ -42,9 +40,11 @@ constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; constexpr int kMaxChannels = 256; -// This file has 2 implementation of Conv. +// Conv is quantized along dimension 0: +// https://www.tensorflow.org/lite/performance/quantization_spec +constexpr int kConvQuantizedDimension = 0; -const int kTensorNotAllocated = -1; +// This file has 2 implementation of Conv. struct OpData { TfLitePaddingValues padding; @@ -101,13 +101,15 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + int output_channels = filter->dims->data[kConvQuantizedDimension]; TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams( context, input, filter, bias, output, params->activation, &data->output_multiplier, &data->output_shift, &data->output_activation_min, &data->output_activation_max, data->per_channel_output_multiplier, - reinterpret_cast(data->per_channel_output_shift))); + reinterpret_cast(data->per_channel_output_shift), + output_channels)); } return kTfLiteOk; } @@ -144,12 +146,10 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorData(im2col), nullptr); } -TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, - TfLiteConvParams* params, OpData* data, - const TfLiteTensor* input, - const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output, - TfLiteTensor* im2col) { +TfLiteStatus EvalMliQuantizedPerChannel( + TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params, + OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { // Run Conv MLI kernel // MLI optimized version only supports int8 dataype and dilation factor of 1 if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) && @@ -204,24 +204,36 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const int height_dimension = 1; int in_slice_height = 0; int out_slice_height = 0; - const int kernel_height = static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); + const int kernel_height = + static_cast(mli_weights.shape[KRNL_H_DIM_HWC]); const int overlap = kernel_height - cfg.stride_height; // for weight slicing (on output channels) - const int weight_out_ch_dimension = 0; // NHWC layout for weigths, output channel dimension is the first dimension. - int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); - const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + const int weight_out_ch_dimension = + 0; // NHWC layout for weigths, output channel dimension is the first + // dimension. 
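// Illustrative sketch, not part of this patch: the height slicing set up
// here (kernel_height, overlap, and the slice-size calculation that follows)
// comes from simple window arithmetic. Ignoring padding, producing out_rows
// output rows requires
//
//   int InputRowsForOutputRows(int out_rows, int kernel_height,
//                              int stride_height) {
//     return (out_rows - 1) * stride_height + kernel_height;
//   }
//
// input rows, so consecutive input slices overlap by
// kernel_height - stride_height rows (the `overlap` value above). For
// example, a 3-row output slice of a 3x3 kernel at stride 1 needs
// (3 - 1) * 1 + 3 = 5 input rows and re-reads the last 2 rows of the
// previous slice.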
+ int slice_channels = + static_cast(mli_weights.shape[weight_out_ch_dimension]); + const int out_tensor_ch_dimension = + 3; // Batch-Height-Width-Channel layout means last dimension is output + // channels. - // Tensors for data in fast (local) memory and config to copy data from external to local memory + // Tensors for data in fast (local) memory and config to copy data from + // external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; mli_tensor in_local = mli_in; mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors( + context, &in_local, &weights_local, &bias_local, &out_local)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( + &in_local, &out_local, kernel_height, cfg.stride_height, + cfg.padding_top, cfg.padding_bottom, &in_slice_height, + &out_slice_height)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( + &weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); /* is_local indicates that the tensor is already in local memory, so in that case the original tensor can be used, @@ -233,33 +245,40 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels); TensorSlicer b_slice(&mli_bias, weight_out_ch_dimension, slice_channels); - TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, + 0, 0, 0, true); - mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; - mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - void *input_buffer_ptr = NULL; + void* input_buffer_ptr = NULL; int input_buffer_size = 0; - while (!w_slice.Done()){ + while (!w_slice.Done()) { mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, cfg.padding_top, cfg.padding_bottom, overlap); + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional + tensor. because the mli kernel will process one HWC tensor at a time, the + 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. + on top of that there could be a need to also slice in the Height + dimension. 
for that the sliceHeight has been calculated. The tensor slicer + is configured that it will completely slice the nBatch dimension (0) and + slice the height dimension (1) in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, + cfg.padding_top, cfg.padding_bottom, overlap); - /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of - output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and - height dimension. */ - TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension, out_slice_height); + /* output tensor is alreade sliced in the output channel dimension. + out_ch_slice.Sub() is the tensor for the amount of output channels of this + itteration of the weight slice loop. This tensor needs to be further + sliced over the batch and height dimension. */ + TensorSlicer out_slice(out_ch_slice.Sub(), height_dimension, + out_slice_height); - /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + /* setup the pointers to the local or remote tensor to make the code + * inside the loop easier. */ + mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local; while (!out_slice.Done()) { TF_LITE_ENSURE(context, !in_slice.Done()); @@ -267,7 +286,8 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, cfg.padding_bottom = in_slice.GetPaddingPost(); // if same input copy as previous iteration, skip the copy of input - if ((in_slice.Sub()->data != input_buffer_ptr) || (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { + if ((in_slice.Sub()->data != input_buffer_ptr) || + (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); input_buffer_ptr = in_slice.Sub()->data; input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); @@ -283,26 +303,37 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, out_ch_slice.Next(); TF_LITE_ENSURE(context, in_slice.Done()); } - - } else { - ConvParams op_params; - op_params.input_offset = -input->params.zero_point; - op_params.output_offset = output->params.zero_point; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.dilation_height_factor = params->dilation_height_factor; - op_params.dilation_width_factor = params->dilation_width_factor; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - - reference_integer_ops::ConvPerChannel( - op_params, data->per_channel_output_multiplier, - data->per_channel_output_shift, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); } + + return kTfLiteOk; +} + +TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, + TfLiteConvParams* params, OpData* data, + const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* bias, + TfLiteTensor* output) { + ConvParams op_params; + op_params.input_offset = -input->params.zero_point; + op_params.output_offset = output->params.zero_point; + 
op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; + + reference_integer_ops::ConvPerChannel( + op_params, data->per_channel_output_multiplier, + data->per_channel_output_shift, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); + return kTfLiteOk; } @@ -352,6 +383,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpData data; // All per-channel quantized tensors need valid zero point and scale arrays. + bool mli_is_applicable = false; if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, filter->quantization.type, kTfLiteAffineQuantization); @@ -362,26 +394,38 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, affine_quantization); TF_LITE_ENSURE(context, affine_quantization->scale); TF_LITE_ENSURE(context, affine_quantization->zero_point); - // Conv is quantized along dimension 0: - // https://www.tensorflow.org/lite/performance/quantization_spec - TF_LITE_ENSURE_EQ(context, filter->dims->data[0], - affine_quantization->scale->size); - TF_LITE_ENSURE_EQ(context, filter->dims->data[0], + + TF_LITE_ENSURE(context, + affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kConvQuantizedDimension]); + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); + mli_is_applicable = + ((filter->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && + (params->dilation_width_factor == 1) && + (params->dilation_height_factor == 1) && + (affine_quantization->scale->size == + filter->dims->data[kConvQuantizedDimension])); } TF_LITE_ENSURE_STATUS(CalculateOpData( context, node, params, input_width, input_height, filter_width, filter_height, output_width, output_height, input->type, &data)); - switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: EvalFloat(context, node, params, &data, input, filter, bias, nullptr, nullptr, output); break; case kTfLiteInt8: - return EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, - output, nullptr); + if (mli_is_applicable) { + return EvalMliQuantizedPerChannel(context, node, params, &data, input, + filter, bias, output); + + } else { + return EvalQuantizedPerChannel(context, node, params, &data, input, + filter, bias, output); + } break; case kTfLiteUInt8: EvalQuantized(context, node, params, &data, input, filter, bias, nullptr, diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 049347cc7a1..9860235b2fb 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -15,7 +15,7 @@ limitations under the License. 
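// Illustrative note, not part of this patch: both wrappers index their
// per-channel quantization parameters along the dimension defined by the
// TFLite quantization spec, and that dimension differs between the two ops
// touched by this series. Assumed filter layouts:
//   CONV_2D            filter [out_ch, kh, kw, in_ch]  -> quantized dim 0
//   DEPTHWISE_CONV_2D  filter [1, kh, kw, out_ch]      -> quantized dim 3
//
//   int PerChannelScaleCount(const TfLiteTensor* filter, bool is_depthwise) {
//     // Mirrors kConvQuantizedDimension / kDepthwiseConvQuantizedDimension.
//     const int quantized_dimension = is_depthwise ? 3 : 0;
//     return filter->dims->data[quantized_dimension];
//   }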
#include "tensorflow/lite/kernels/internal/reference/integer_ops/depthwise_conv.h" -#include "mli_api.h" // NOLINT +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -30,8 +30,6 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" -#include "mli_api.h" - namespace tflite { namespace ops { namespace micro { @@ -44,6 +42,10 @@ constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; constexpr int kMaxChannels = 256; +// Depthwise conv is quantized along dimension 3: +// https://www.tensorflow.org/lite/performance/quantization_spec +constexpr int kDepthwiseConvQuantizedDimension = 3; + struct OpData { TfLitePaddingValues padding; // The scaling factor from input to output (aka the 'real multiplier') can @@ -85,6 +87,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + int num_channels = filter->dims->data[kDepthwiseConvQuantizedDimension]; // Ensure filter and bias channel count does not exceed space reserved for // quantization metadata. @@ -101,7 +104,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, &data->output_multiplier, &data->output_shift, &data->output_activation_min, &data->output_activation_max, data->per_channel_output_multiplier, - reinterpret_cast(data->per_channel_output_shift))); + reinterpret_cast(data->per_channel_output_shift), num_channels)); } return kTfLiteOk; } @@ -136,187 +139,201 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(output)); } -TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, +TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { // Run Depthwise Conv MLI kernel // MLI optimized version only supports int8 dataype and dilation factor of 1 - if ((input->type == kTfLiteInt8) && (params->dilation_width_factor == 1) && - (params->dilation_height_factor == 1)) { - mli_tensor mli_in = {0}; - mli_tensor mli_weights = {0}; - mli_tensor mli_bias = {0}; - mli_tensor mli_out = {0}; - mli_conv2d_cfg cfg = {}; + mli_tensor mli_in = {0}; + mli_tensor mli_weights = {0}; + mli_tensor mli_bias = {0}; + mli_tensor mli_out = {0}; + mli_conv2d_cfg cfg = {}; - // reuse space allocated for OpData parameters - mli_weights.el_params.asym.scale.pi16 = - (int16_t*)data->per_channel_output_multiplier; - mli_bias.el_params.asym.scale.pi16 = - (int16_t*)data->per_channel_output_shift; + // reuse space allocated for OpData parameters + mli_weights.el_params.asym.scale.pi16 = + (int16_t*)data->per_channel_output_multiplier; + mli_bias.el_params.asym.scale.pi16 = + (int16_t*)data->per_channel_output_shift; - int16_t filter_zero_point = 0; - int16_t bias_zero_point = 0; - mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point; - mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point; + int16_t filter_zero_point = 0; + int16_t bias_zero_point = 0; + mli_weights.el_params.asym.zero_point.pi16 = &filter_zero_point; + mli_bias.el_params.asym.zero_point.pi16 = &bias_zero_point; - ConvertToMliTensor(input, &mli_in); - 
ConvertToMliTensorPerChannel(filter, &mli_weights); - ConvertToMliTensorPerChannel(bias, &mli_bias); - ConvertToMliTensor(output, &mli_out); - - if (params->activation == kTfLiteActRelu) { - cfg.relu.type = MLI_RELU_GEN; - } else if (params->activation == kTfLiteActRelu6) { - cfg.relu.type = MLI_RELU_6; - } else if (params->activation == kTfLiteActRelu1) { - cfg.relu.type = MLI_RELU_1; - } else { - cfg.relu.type = MLI_RELU_NONE; - } - - cfg.stride_width = params->stride_width; - cfg.stride_height = params->stride_height; - if (params->padding == kTfLitePaddingValid) { - cfg.padding_left = 0; - cfg.padding_right = 0; - cfg.padding_top = 0; - cfg.padding_bottom = 0; - } else { - cfg.padding_left = data->padding.width; - cfg.padding_right = data->padding.width + data->padding.width_offset; - cfg.padding_top = data->padding.height; - cfg.padding_bottom = data->padding.height + data->padding.height_offset; - } - - // for height slicing - const int heightDimension = 1; - int inSliceHeight = 0; - int outSliceHeight = 0; - const int kernelHeight = static_cast(mli_weights.shape[KRNL_DW_H_DIM_HWC]); - const int overlap = kernelHeight - cfg.stride_height; - - // for weight slicing (on output channels) - const int weight_out_ch_dimension = 3; // HWCN layout for weigths, output channel dimension is the first dimension. - const int bias_out_ch_dimension = 0; // bias has only 1 dimension - const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. - const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension]; - const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension]; - int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); - - // Tensors for data in fast (local) memory and config to copy data from external to local memory - mli_tensor weights_local = mli_weights; - mli_tensor bias_local = mli_bias; - mli_tensor in_local = mli_in; - mli_tensor out_local = mli_out; // this assumes that output shape is already filled in the tensor struct. - mli_mov_cfg_t copy_config; - mli_mov_cfg_for_copy(©_config); - - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - /* is_local indicates that the tensor is already in local memory, - so in that case the original tensor can be used, - and there is no need to copy it to the local tensor*/ - const bool in_is_local = in_local.data == mli_in.data; - const bool out_is_local = out_local.data == mli_out.data; - const bool w_is_local = weights_local.data == mli_weights.data; - const bool b_is_local = bias_local.data == mli_bias.data; - - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); - - /* if input channels is not equal to output channels, a channel multiplier is used. 
- in this case the slice channels needs to be rounded down to a multiple of the input channels */ - if (in_channels != out_channels) { - slice_channels = (slice_channels / in_channels) * in_channels; - } - - TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, 0, 0, true); - TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); - TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); - TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); - - mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; - mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - - void *input_buffer_ptr = NULL; - int input_buffer_size = 0; - int padding_top = cfg.padding_top; - int padding_bottom = cfg.padding_bottom; - - while (!w_slice.Done()){ - mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); - mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - - /* input tensor is alreade sliced in the channel dimension. out_ch_slice.Sub() is the tensor for the amount of - channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and - height dimension. - in_ch_slice.Sub() tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, padding_top, padding_bottom, overlap); - - /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of - output channels of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch and - height dimension. */ - TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); - - /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? 
out_slice.Sub() : &out_local; - - while (!out_slice.Done()) { - TF_LITE_ENSURE(context, !in_slice.Done()); - cfg.padding_top = in_slice.GetPaddingPre(); - cfg.padding_bottom = in_slice.GetPaddingPost(); - - // if same input copy as previous iteration, skip the copy of input - if ((in_slice.Sub()->data != input_buffer_ptr) || (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - input_buffer_ptr = in_slice.Sub()->data; - input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); - } - mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); - - in_slice.Next(); - out_slice.Next(); - } - w_slice.Next(); - b_slice.Next(); - out_ch_slice.Next(); - in_ch_slice.Next(); - TF_LITE_ENSURE(context, in_slice.Done()); - } + ConvertToMliTensor(input, &mli_in); + ConvertToMliTensorPerChannel(filter, &mli_weights); + ConvertToMliTensorPerChannel(bias, &mli_bias); + ConvertToMliTensor(output, &mli_out); + if (params->activation == kTfLiteActRelu) { + cfg.relu.type = MLI_RELU_GEN; + } else if (params->activation == kTfLiteActRelu6) { + cfg.relu.type = MLI_RELU_6; + } else if (params->activation == kTfLiteActRelu1) { + cfg.relu.type = MLI_RELU_1; } else { - DepthwiseParams op_params; - op_params.padding_type = PaddingType::kSame; - op_params.padding_values.width = data->padding.width; - op_params.padding_values.height = data->padding.height; - op_params.stride_width = params->stride_width; - op_params.stride_height = params->stride_height; - op_params.dilation_width_factor = params->dilation_width_factor; - op_params.dilation_height_factor = params->dilation_height_factor; - op_params.depth_multiplier = params->depth_multiplier; - op_params.input_offset = -input->params.zero_point; - op_params.weights_offset = 0; - op_params.output_offset = output->params.zero_point; - // TODO(b/130439627): Use calculated value for clamping. - op_params.quantized_activation_min = std::numeric_limits::min(); - op_params.quantized_activation_max = std::numeric_limits::max(); - - reference_integer_ops::DepthwiseConvPerChannel( - op_params, data->per_channel_output_multiplier, - data->per_channel_output_shift, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); + cfg.relu.type = MLI_RELU_NONE; } + + cfg.stride_width = params->stride_width; + cfg.stride_height = params->stride_height; + if (params->padding == kTfLitePaddingValid) { + cfg.padding_left = 0; + cfg.padding_right = 0; + cfg.padding_top = 0; + cfg.padding_bottom = 0; + } else { + cfg.padding_left = data->padding.width; + cfg.padding_right = data->padding.width + data->padding.width_offset; + cfg.padding_top = data->padding.height; + cfg.padding_bottom = data->padding.height + data->padding.height_offset; + } + + // for height slicing + const int heightDimension = 1; + int inSliceHeight = 0; + int outSliceHeight = 0; + const int kernelHeight = static_cast(mli_weights.shape[KRNL_DW_H_DIM_HWC]); + const int overlap = kernelHeight - cfg.stride_height; + + // for weight slicing (on output channels) + const int weight_out_ch_dimension = 3; // HWCN layout for weigths, output channel dimension is the first dimension. 
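// A minimal standalone sketch of the slice_channels rounding used a little
// further below: when a depthwise channel multiplier is in effect
// (out_channels != in_channels), every weight slice has to cover whole groups
// of input channels, so the slice size is rounded down to a multiple of the
// input channel count. The numbers here are hypothetical, not values taken
// from this kernel.

    #include <cstdio>

    int main() {
      const int in_channels = 8;   // channels of the input feature map
      int slice_channels = 20;     // how many output channels fit in local memory
      // Round down so each slice holds complete channel-multiplier groups.
      slice_channels = (slice_channels / in_channels) * in_channels;  // -> 16
      std::printf("slice_channels = %d\n", slice_channels);
      return 0;
    }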
+ const int bias_out_ch_dimension = 0; // bias has only 1 dimension + const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension]; + const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension]; + int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); + + // Tensors for data in fast (local) memory and config to copy data from external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; // this assumes that output shape is already filled in the tensor struct. + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_conv_tensors( + context, &in_local, &weights_local, &bias_local, &out_local)); + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; + + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( + &in_local, &out_local, kernelHeight, cfg.stride_height, cfg.padding_top, + cfg.padding_bottom, &inSliceHeight, &outSliceHeight)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( + &weights_local, &bias_local, weight_out_ch_dimension, &slice_channels)); + + /* if input channels is not equal to output channels, a channel multiplier + is used. in this case the slice channels needs to be rounded down to a + multiple of the input channels */ + if (in_channels != out_channels) { + slice_channels = (slice_channels / in_channels) * in_channels; + } + + TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + + mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + + void *input_buffer_ptr = NULL; + int input_buffer_size = 0; + int padding_top = cfg.padding_top; + int padding_bottom = cfg.padding_bottom; + + while (!w_slice.Done()){ + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); + + /* input tensor is alreade sliced in the channel dimension. + out_ch_slice.Sub() is the tensor for the amount of channels of this + itteration of the weight slice loop. This tensor needs to be further + sliced over the batch and height dimension. in_ch_slice.Sub() tensor + contains batches of HWC tensors. so it is a 4 dimensional tensor. because + the mli kernel will process one HWC tensor at a time, the 4 dimensional + tensor needs to be sliced into nBatch 3 dimensional tensors. on top of + that there could be a need to also slice in the Height dimension. for that + the sliceHeight has been calculated. 
The tensor slicer is configured that + it will completely slice the nBatch dimension (0) and slice the height + dimension (1) in chunks of 'sliceHeight' */ + TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, padding_top, padding_bottom, overlap); + + /* output tensor is alreade sliced in the output channel dimension. + out_ch_slice.Sub() is the tensor for the amount of output channels of this + itteration of the weight slice loop. This tensor needs to be further + sliced over the batch and height dimension. */ + TensorSlicer out_slice(out_ch_slice.Sub(), heightDimension, outSliceHeight); + + /* setup the pointers to the local or remote tensor to make the code + * inside the loop easier. */ + mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor *out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + TF_LITE_ENSURE(context, !in_slice.Done()); + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + // if same input copy as previous iteration, skip the copy of input + if ((in_slice.Sub()->data != input_buffer_ptr) || + (mli_hlp_count_elem_num(in_slice.Sub(), 0) != input_buffer_size)) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + input_buffer_ptr = in_slice.Sub()->data; + input_buffer_size = mli_hlp_count_elem_num(in_slice.Sub(), 0); + } + mli_krn_depthwise_conv2d_hwcn_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); + in_ch_slice.Next(); + TF_LITE_ENSURE(context, in_slice.Done()); + } + return kTfLiteOk; +} + +TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, + OpData* data, const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* bias, + TfLiteTensor* output) { + DepthwiseParams op_params; + op_params.padding_type = PaddingType::kSame; + op_params.padding_values.width = data->padding.width; + op_params.padding_values.height = data->padding.height; + op_params.stride_width = params->stride_width; + op_params.stride_height = params->stride_height; + op_params.dilation_width_factor = params->dilation_width_factor; + op_params.dilation_height_factor = params->dilation_height_factor; + op_params.depth_multiplier = params->depth_multiplier; + op_params.input_offset = -input->params.zero_point; + op_params.weights_offset = 0; + op_params.output_offset = output->params.zero_point; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; + + reference_integer_ops::DepthwiseConvPerChannel( + op_params, data->per_channel_output_multiplier, + data->per_channel_output_shift, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); return kTfLiteOk; } @@ -373,6 +390,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpData data; // All per-channel quantized tensors need valid zero point and scale arrays. 
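// For context: a per-tensor quantized filter carries a single scale, while a
// per-channel quantized filter carries one scale per channel along the
// quantized dimension; the TF_LITE_ENSURE added below accepts either form.
// A small self-contained sketch of the same validation on plain ints
// (hypothetical values, not read from any tensor):

    #include <cassert>

    int main() {
      const int channels = 32;         // filter->dims->data[quantized_dimension]
      const int num_scales = 32;       // affine_quantization->scale->size
      const int num_zero_points = 32;  // affine_quantization->zero_point->size
      const bool per_tensor = (num_scales == 1);
      const bool per_channel = (num_scales == channels);
      assert(per_tensor || per_channel);
      assert(num_scales == num_zero_points);
      return 0;
    }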
+ bool mli_is_applicable = false; if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, filter->quantization.type, kTfLiteAffineQuantization); @@ -383,12 +401,18 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, affine_quantization); TF_LITE_ENSURE(context, affine_quantization->scale); TF_LITE_ENSURE(context, affine_quantization->zero_point); - // Depthwise conv is quantized along dimension 3: - // https://www.tensorflow.org/lite/performance/quantization_spec - TF_LITE_ENSURE_EQ(context, filter->dims->data[3], - affine_quantization->scale->size); - TF_LITE_ENSURE_EQ(context, filter->dims->data[3], + TF_LITE_ENSURE( + context, affine_quantization->scale->size == 1 || + affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension]); + TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); + mli_is_applicable = + ((filter->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && + (params->dilation_width_factor == 1) && + (params->dilation_height_factor == 1) && + (affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension])); } TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, @@ -399,8 +423,13 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { EvalFloat(context, node, params, &data, input, filter, bias, output); break; case kTfLiteInt8: - return EvalQuantizedPerChannel(context, node, params, &data, input, filter, bias, - output); + if (mli_is_applicable) { + return EvalMliQuantizedPerChannel(context, node, params, &data, input, + filter, bias, output); + } else { + return EvalQuantizedPerChannel(context, node, params, &data, input, + filter, bias, output); + } break; case kTfLiteUInt8: EvalQuantized(context, node, params, &data, input, filter, bias, output); diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 61fa0ff397f..185217d0c6a 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/fully_connected.h" -#include "mli_api.h" // NOLINT +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -28,8 +28,6 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" -#include "mli_api.h" - namespace tflite { namespace ops { namespace micro { @@ -77,6 +75,37 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, } // namespace +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + OpData* data = nullptr; + TfLiteStatus status = context->AllocatePersistentBuffer( + context, sizeof(OpData), reinterpret_cast(&data)); + if (status != kTfLiteOk || data == nullptr) { + return nullptr; + } + return data; +} + +TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + OpData* data = reinterpret_cast(node->user_data); + auto* params = + reinterpret_cast(node->builtin_data); + + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + const TfLiteTensor* filter = GetInput(context, node, kWeightsTensor); + const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + TF_LITE_ENSURE_EQ(context, input->type, output->type); + TF_LITE_ENSURE_MSG(context, input->type == filter->type, + "Hybrid models are not supported on TFLite Micro."); + + TfLiteType data_type = input->type; + TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input, + filter, bias, output, data)); + + return kTfLiteOk; +} + TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, TfLiteFullyConnectedParams* params, OpData* data, const TfLiteTensor* input, @@ -263,13 +292,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TfLiteType data_type = input->type; - OpData local_data_object; - OpData* data = &local_data_object; - TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, data_type, input, - filter, bias, output, data)); + OpData* data = reinterpret_cast(node->user_data); - switch (filter->type) { // Already know in/out types are same. + // Checks in Prepare ensure input, output and filter types are all the same. 
+ switch (input->type) { case kTfLiteFloat32: return EvalFloat(context, node, params, data, input, filter, bias, output); @@ -292,15 +318,14 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace fully_connected TfLiteRegistration* Register_FULLY_CONNECTED() { - static TfLiteRegistration r = {/*init=*/nullptr, + static TfLiteRegistration r = {/*init=*/fully_connected::Init, /*free=*/nullptr, - /*prepare=*/nullptr, + /*prepare=*/fully_connected::Prepare, /*invoke=*/fully_connected::Eval, /*profiling_string=*/nullptr, /*builtin_code=*/0, /*custom_name=*/nullptr, /*version=*/0}; - return &r; } diff --git a/tensorflow/lite/micro/kernels/conv_test.cc b/tensorflow/lite/micro/kernels/conv_test.cc index 4cc2a80c3ea..8a3eb30630d 100644 --- a/tensorflow/lite/micro/kernels/conv_test.cc +++ b/tensorflow/lite/micro/kernels/conv_test.cc @@ -409,8 +409,8 @@ TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) { TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) { // conv params: - // padding, stride_, dilation_, activation - TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; + // padding, stride_, activation, dilation_ + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6, 1, 1}; const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] const int kInputElements = kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; diff --git a/tensorflow/lite/micro/kernels/pooling_test.cc b/tensorflow/lite/micro/kernels/pooling_test.cc index 8bfeb718a1b..96dff421d53 100644 --- a/tensorflow/lite/micro/kernels/pooling_test.cc +++ b/tensorflow/lite/micro/kernels/pooling_test.cc @@ -496,7 +496,7 @@ TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) { F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)}, {4, 1, 2, 4, 1}, // Output shape output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActNone, output_data); + kTfLitePaddingSame, kTfLiteActNone, output_data); } TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) { From 273948c6aaf8424e8adf33d6f3fcba6c9fa935e2 Mon Sep 17 00:00:00 2001 From: Daria Zhuravleva Date: Tue, 14 Apr 2020 12:10:11 +0300 Subject: [PATCH 0083/1533] Common wrapper for average and max pooling --- .../lite/micro/kernels/arc_mli/pooling.cc | 267 ++++++++++-------- 1 file changed, 145 insertions(+), 122 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index ced5c4a21b8..7f87d4849ff 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -14,7 +14,6 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/kernels/internal/reference/pooling.h" -#include "mli_api.h" // NOLINT #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" @@ -41,6 +40,8 @@ struct OpData { TfLitePaddingValues padding; }; +typedef enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 } MliPoolingType; + TfLiteStatus CalculateOpData(const TfLiteContext* context, const TfLitePoolParams* params, const TfLiteTensor* input, @@ -81,110 +82,111 @@ void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node, GetTensorShape(output), GetTensorData(output)); } -void AverageEvalUint8(TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); +//Prepare MLI tensors and run Average or Max Pooling +TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, + const OpData* data, const TfLiteTensor* input, + TfLiteTensor* output, const MliPoolingType pooling_type) { + mli_tensor mli_in = {0}; + mli_tensor mli_out = {0}; + mli_pool_cfg cfg = {0}; - PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; - reference_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); + ConvertToMliTensor(input, &mli_in); + ConvertToMliTensor(output, &mli_out); + + cfg.kernel_width = params->filter_width; + cfg.kernel_height = params->filter_height; + cfg.stride_width = params->stride_width; + cfg.stride_height = params->stride_height; + + if (params->padding == kTfLitePaddingValid) { + cfg.padding_left = 0; + cfg.padding_right = 0; + cfg.padding_top = 0; + cfg.padding_bottom = 0; + } else { + cfg.padding_left = data->padding.width; + cfg.padding_right = data->padding.width + data->padding.width_offset; + cfg.padding_top = data->padding.height; + cfg.padding_bottom = data->padding.height + data->padding.height_offset; + } + + mli_point_to_subtsr_cfg subtsr_cfg_in = { + {0, 0}, 2, static_cast(mli_in.shape[1])}; + mli_point_to_subtsr_cfg subtsr_cfg_out = { + {0, 0}, 2, static_cast(mli_out.shape[1])}; + mli_tensor sub_mli_in = {0}; + mli_tensor sub_mli_out = {0}; + mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); + mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); + + const int height_dimension = 1; + int in_slice_height = 0; + int out_slice_height = 0; + const int overlap = cfg.kernel_height - cfg.stride_height; + + // Tensors for data in fast (local) memory and config to copy data from + // external to local memory + mli_tensor in_local = sub_mli_in; + mli_tensor out_local = sub_mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors( + context, &in_local, &out_local)); 
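// The data->padding values consumed above come from the usual SAME-padding
// arithmetic: pad just enough that the output grid covers the input at the
// given stride, splitting the total between the leading and trailing side
// (the trailing side receives the extra pixel when the total is odd, which is
// what the padding_bottom = padding + offset lines express). A simplified
// standalone sketch, ignoring dilation, with hypothetical sizes:

    #include <algorithm>
    #include <cstdio>

    int main() {
      const int in_size = 7, stride = 2, kernel_size = 3;
      const int out_size = (in_size + stride - 1) / stride;  // ceil(in / stride)
      const int total_pad =
          std::max(0, (out_size - 1) * stride + kernel_size - in_size);
      const int pad_before = total_pad / 2;           // e.g. padding_top
      const int pad_after = total_pad - pad_before;   // e.g. padding_bottom
      std::printf("top=%d bottom=%d\n", pad_before, pad_after);  // top=1 bottom=1
      return 0;
    }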
+ bool in_is_local = in_local.data == sub_mli_in.data; + bool out_is_local = out_local.data == sub_mli_out.data; + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( + &in_local, &out_local, cfg.kernel_height, cfg.stride_height, + cfg.padding_top, cfg.padding_bottom, &in_slice_height, + &out_slice_height)); + + /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional + tensor. because the mli kernel will process one HWC tensor at a time, the 4 + dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. on + top of that there could be a need to also slice in the Height dimension. + for that the sliceHeight has been calculated. The tensor slicer is + configured that it will completely slice the nBatch dimension (0) and slice + the height dimension (1) in chunks of 'sliceHeight' */ + TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, + cfg.padding_top, cfg.padding_bottom, overlap); + TensorSlicer out_slice(&mli_out, height_dimension, out_slice_height); + + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ + mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor* out_ptr = out_is_local ? out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + cfg.padding_top = in_slice.GetPaddingPre(); + cfg.padding_bottom = in_slice.GetPaddingPost(); + + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + if (pooling_type == AveragePooling) + mli_krn_avepool_hwc_sa8(in_ptr, &cfg, out_ptr); + else if (pooling_type == MaxPooling) + mli_krn_maxpool_hwc_sa8(in_ptr, &cfg, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + return kTfLiteOk; } -TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node, + const TfLitePoolParams* params, const OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { + TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); // Run Average Pooling MLI kernel // MLI optimized version only supports int8 dataype and no fused Relu // TODO: subject to add mli_saturate kernel if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) { - mli_tensor mli_in = {0}; - mli_tensor mli_out = {0}; - mli_pool_cfg cfg = {0}; - - ConvertToMliTensor(input, &mli_in); - ConvertToMliTensor(output, &mli_out); - - cfg.kernel_width = params->filter_width; - cfg.kernel_height = params->filter_height; - cfg.stride_width = params->stride_width; - cfg.stride_height = params->stride_height; - - if (params->padding == kTfLitePaddingValid) { - cfg.padding_left = 0; - cfg.padding_right = 0; - cfg.padding_top = 0; - cfg.padding_bottom = 0; - } else { - cfg.padding_left = data->padding.width; - cfg.padding_right = data->padding.width + data->padding.width_offset; - cfg.padding_top = data->padding.height; - cfg.padding_bottom = data->padding.height + data->padding.height_offset; - } - - mli_point_to_subtsr_cfg subtsr_cfg_in = {{0,0}, 2, static_cast(mli_in.shape[1])}; - mli_point_to_subtsr_cfg subtsr_cfg_out = {{0,0}, 2, static_cast(mli_out.shape[1])}; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - 
mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - - const int height_dimension = 1; - int in_slice_height = 0; - int out_slice_height = 0; - const int overlap = cfg.kernel_height - cfg.stride_height; - - // Tensors for data in fast (local) memory and config to copy data from external to local memory - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; - mli_mov_cfg_t copy_config; - mli_mov_cfg_for_copy(©_config); - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors(context, &in_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io(&in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, &out_slice_height)); - - /* mli_in tensor contains batches of HWC tensors. so it is a 4 dimensional tensor. - because the mli kernel will process one HWC tensor at a time, the 4 dimensional tensor needs to be sliced into nBatch 3 dimensional tensors. - on top of that there could be a need to also slice in the Height dimension. for that the sliceHeight has been calculated. - The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) - in chunks of 'sliceHeight' */ - TensorSlicer in_slice(&mli_in, height_dimension, in_slice_height, cfg.padding_top, cfg.padding_bottom, overlap); - TensorSlicer out_slice(&mli_out, height_dimension, out_slice_height); - - /* is_local indicates that the tensor is already in local memory, - so in that case the original tensor can be used, - and there is no need to copy it to the local tensor*/ - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? 
out_slice.Sub() : &out_local; - - while (!out_slice.Done()) { - cfg.padding_top = in_slice.GetPaddingPre(); - cfg.padding_bottom = in_slice.GetPaddingPost(); - - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - mli_krn_avepool_hwc_sa8(in_ptr, &cfg, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); - - in_slice.Next(); - out_slice.Next(); - } - + EvalMli(context, params, data, input, output, AveragePooling); } else { int32_t activation_min, activation_max; (void)CalculateActivationRangeQuantized(context, params->activation, output, &activation_min, &activation_max); + PoolParams op_params; op_params.stride_height = params->stride_height; op_params.stride_width = params->stride_width; @@ -194,11 +196,17 @@ TfLiteStatus AverageEvalInt8(TfLiteContext* context, const TfLiteNode* node, op_params.padding_values.width = data->padding.width; op_params.quantized_activation_min = activation_min; op_params.quantized_activation_max = activation_max; - reference_integer_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); + + if (input->type == kTfLiteUInt8) { + reference_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } else { + reference_integer_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } } - return kTfLiteOk; } void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, @@ -222,29 +230,45 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(output)); } -void MaxEvalQuantizedUInt8(TfLiteContext* context, TfLiteNode* node, - TfLitePoolParams* params, OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); +void MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLitePoolParams* params, OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { + TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); + + // Run Max Pooling MLI kernel + // MLI optimized version only supports int8 dataype and no fused Relu + // TODO: subject to add mli_saturate kernel + if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) { + EvalMli(context, params, data, input, output, MaxPooling); + } else { + int32_t activation_min, activation_max; + (void)CalculateActivationRangeQuantized(context, params->activation, output, + &activation_min, &activation_max); - tflite::PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; - reference_ops::MaxPool(op_params, GetTensorShape(input), - GetTensorData(input), GetTensorShape(output), - GetTensorData(output)); + tflite::PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + 
op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = activation_min; + op_params.quantized_activation_max = activation_max; + + if (input->type == kTfLiteUInt8) { + reference_ops::MaxPool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } else { + reference_integer_ops::MaxPool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + } + } } - } // namespace + TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->builtin_data); OpData data; @@ -254,16 +278,14 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_STATUS(CalculateOpData(context, params, input, output, &data)); - // Inputs and outputs share the same type, guarenteed by the converter. + // Inputs and outputs share the same type, guaranteed by the converter. switch (input->type) { case kTfLiteFloat32: AverageEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: - AverageEvalUint8(context, node, params, &data, input, output); - break; case kTfLiteInt8: - return AverageEvalInt8(context, node, params, &data, input, output); + AverageEvalQuantized(context, node, params, &data, input, output); break; default: TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported", @@ -287,7 +309,8 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { MaxEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: - MaxEvalQuantizedUInt8(context, node, params, &data, input, output); + case kTfLiteInt8: + MaxEvalQuantized(context, node, params, &data, input, output); break; default: TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", From 8ed89130aa4c3da812790a73dae465881428863f Mon Sep 17 00:00:00 2001 From: Daria Zhuravleva Date: Wed, 15 Apr 2020 15:10:52 +0300 Subject: [PATCH 0084/1533] Refactoring --- tensorflow/lite/micro/kernels/arc_mli/pooling.cc | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index 7f87d4849ff..7b68e314277 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/kernels/internal/reference/pooling.h" +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" @@ -24,7 +25,6 @@ limitations under the License. 
#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" -#include "mli_api.h" namespace tflite { namespace ops { @@ -40,7 +40,7 @@ struct OpData { TfLitePaddingValues padding; }; -typedef enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 } MliPoolingType; +enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 }; TfLiteStatus CalculateOpData(const TfLiteContext* context, const TfLitePoolParams* params, @@ -111,9 +111,15 @@ TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, } mli_point_to_subtsr_cfg subtsr_cfg_in = { - {0, 0}, 2, static_cast(mli_in.shape[1])}; + .start_coord = {0, 0}, + .coord_num = 2, + .first_out_dim_size = static_cast(mli_in.shape[1]), + }; mli_point_to_subtsr_cfg subtsr_cfg_out = { - {0, 0}, 2, static_cast(mli_out.shape[1])}; + .start_coord = {0, 0}, + .coord_num = 2, + .first_out_dim_size = static_cast(mli_out.shape[1]), + }; mli_tensor sub_mli_in = {0}; mli_tensor sub_mli_out = {0}; mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); From 51522a108d0ee14a665752f3f65e534235925a41 Mon Sep 17 00:00:00 2001 From: Daria Zhuravleva Date: Wed, 15 Apr 2020 21:46:00 +0300 Subject: [PATCH 0085/1533] Removed sub_tensors --- .../lite/micro/kernels/arc_mli/pooling.cc | 25 ++++--------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index 7b68e314277..2c3875b58eb 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -109,22 +109,7 @@ TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, cfg.padding_top = data->padding.height; cfg.padding_bottom = data->padding.height + data->padding.height_offset; } - - mli_point_to_subtsr_cfg subtsr_cfg_in = { - .start_coord = {0, 0}, - .coord_num = 2, - .first_out_dim_size = static_cast(mli_in.shape[1]), - }; - mli_point_to_subtsr_cfg subtsr_cfg_out = { - .start_coord = {0, 0}, - .coord_num = 2, - .first_out_dim_size = static_cast(mli_out.shape[1]), - }; - mli_tensor sub_mli_in = {0}; - mli_tensor sub_mli_out = {0}; - mli_hlp_point_to_subtensor(&mli_in, &subtsr_cfg_in, &sub_mli_in); - mli_hlp_point_to_subtensor(&mli_out, &subtsr_cfg_out, &sub_mli_out); - + const int height_dimension = 1; int in_slice_height = 0; int out_slice_height = 0; @@ -132,14 +117,14 @@ TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, // Tensors for data in fast (local) memory and config to copy data from // external to local memory - mli_tensor in_local = sub_mli_in; - mli_tensor out_local = sub_mli_out; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_pooling_tensors( context, &in_local, &out_local)); - bool in_is_local = in_local.data == sub_mli_in.data; - bool out_is_local = out_local.data == sub_mli_out.data; + bool in_is_local = in_local.data == mli_in.data; + bool out_is_local = out_local.data == mli_out.data; TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_io( &in_local, &out_local, cfg.kernel_height, cfg.stride_height, cfg.padding_top, cfg.padding_bottom, &in_slice_height, From 99d489c7efa85b121b99393a53c3c07ac356c641 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 20 Apr 2020 17:09:56 +0300 Subject: [PATCH 0086/1533] Option to remove kernels implementation beside ARC MLI 
--- tensorflow/lite/micro/kernels/arc_mli/conv.cc | 104 ++++--- .../micro/kernels/arc_mli/depthwise_conv.cc | 108 +++++-- .../micro/kernels/arc_mli/fully_connected.cc | 290 ++++++++++-------- .../lite/micro/kernels/arc_mli/pooling.cc | 172 +++++++---- .../micro/tools/make/ext_libs/arc_mli.inc | 8 + 5 files changed, 427 insertions(+), 255 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc index b9be93ceb11..4a2676821d9 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2019-2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -44,8 +44,6 @@ constexpr int kMaxChannels = 256; // https://www.tensorflow.org/lite/performance/quantization_spec constexpr int kConvQuantizedDimension = 0; -// This file has 2 implementation of Conv. - struct OpData { TfLitePaddingValues padding; // The scaling factor from input to output (aka the 'real multiplier') can @@ -76,11 +74,31 @@ inline PaddingType RuntimePaddingType(TfLitePadding padding) { } } + +bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, + const TfLiteTensor* filter, const TfLiteTensor* bias, + const TfLiteConvParams* params) { + const auto* affine_quantization = + reinterpret_cast(filter->quantization.params); + // MLI optimized version only supports int8 dataype, dilation factor of 1 and + // per-axis quantization of weights (no broadcasting/per-tensor) + bool ret_val = (filter->type == kTfLiteInt8) && + (input->type == kTfLiteInt8) && + (bias->type == kTfLiteInt32) && + (params->dilation_width_factor == 1) && + (params->dilation_height_factor == 1) && + (affine_quantization->scale->size == + filter->dims->data[kConvQuantizedDimension]) && + affine_quantization->scale->size <= (kMaxChannels * 2); + return ret_val; +} + + TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, TfLiteConvParams* params, int width, int height, int filter_width, int filter_height, int out_width, int out_height, const TfLiteType data_type, - OpData* data) { + bool mli_is_applicable, OpData* data) { bool has_bias = node->inputs->size == 3; // Check number of inputs/outputs TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); @@ -95,7 +113,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. 
- if (data_type != kTfLiteFloat32) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) + if (data_type != kTfLiteFloat32 && !mli_is_applicable) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); const TfLiteTensor* bias = @@ -111,14 +130,16 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, reinterpret_cast(data->per_channel_output_shift), output_channels)); } +#endif return kTfLiteOk; } -void EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* im2col, - TfLiteTensor* hwcn_weights, TfLiteTensor* output) { +TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteConvParams* params, OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* im2col, + TfLiteTensor* hwcn_weights, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) const int32_t input_offset = -input->params.zero_point; const int32_t filter_offset = -filter->params.zero_point; const int32_t output_offset = output->params.zero_point; @@ -144,6 +165,12 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorShape(im2col), GetTensorData(im2col), nullptr); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus EvalMliQuantizedPerChannel( @@ -209,14 +236,13 @@ TfLiteStatus EvalMliQuantizedPerChannel( const int overlap = kernel_height - cfg.stride_height; // for weight slicing (on output channels) - const int weight_out_ch_dimension = - 0; // NHWC layout for weigths, output channel dimension is the first - // dimension. + // NHWC layout for weigths, output channel dimension is the first dimension. + const int weight_out_ch_dimension = 0; int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); - const int out_tensor_ch_dimension = - 3; // Batch-Height-Width-Channel layout means last dimension is output - // channels. + // Batch-Height-Width-Channel layout means last dimension is output channels. 
+ const int out_tensor_ch_dimension = 3; + // Tensors for data in fast (local) memory and config to copy data from // external to local memory @@ -304,7 +330,6 @@ TfLiteStatus EvalMliQuantizedPerChannel( TF_LITE_ENSURE(context, in_slice.Done()); } } - return kTfLiteOk; } @@ -314,6 +339,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) ConvParams op_params; op_params.input_offset = -input->params.zero_point; op_params.output_offset = output->params.zero_point; @@ -333,15 +359,20 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Node configuration is not supported by ARC MLI Library."); + return kTfLiteError; +#endif } -void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* im2col, - TfLiteTensor* hwcn_weights, TfLiteTensor* output) { +TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, + TfLiteConvParams* params, OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* im2col, + TfLiteTensor* hwcn_weights, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float output_activation_min, output_activation_max; CalculateActivationRange(params->activation, &output_activation_min, &output_activation_max); @@ -363,6 +394,12 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorShape(im2col), GetTensorData(im2col)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { @@ -383,7 +420,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpData data; // All per-channel quantized tensors need valid zero point and scale arrays. - bool mli_is_applicable = false; if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, filter->quantization.type, kTfLiteAffineQuantization); @@ -401,26 +437,22 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { filter->dims->data[kConvQuantizedDimension]); TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); - mli_is_applicable = - ((filter->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && - (params->dilation_width_factor == 1) && - (params->dilation_height_factor == 1) && - (affine_quantization->scale->size == - filter->dims->data[kConvQuantizedDimension])); } + bool mli_is_applicable = IsMliApplicable(context, input, filter, bias, params); + TF_LITE_ENSURE_STATUS( + CalculateOpData(context, node, params, input_width, input_height, + filter_width, filter_height, output_width, output_height, + input->type, mli_is_applicable, &data)); - TF_LITE_ENSURE_STATUS(CalculateOpData( - context, node, params, input_width, input_height, filter_width, - filter_height, output_width, output_height, input->type, &data)); switch (input->type) { // Already know in/out types are same. 
case kTfLiteFloat32: - EvalFloat(context, node, params, &data, input, filter, bias, nullptr, + return EvalFloat(context, node, params, &data, input, filter, bias, nullptr, nullptr, output); break; case kTfLiteInt8: if (mli_is_applicable) { return EvalMliQuantizedPerChannel(context, node, params, &data, input, - filter, bias, output); + filter, bias, output); } else { return EvalQuantizedPerChannel(context, node, params, &data, input, @@ -428,7 +460,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; case kTfLiteUInt8: - EvalQuantized(context, node, params, &data, input, filter, bias, nullptr, + return EvalQuantized(context, node, params, &data, input, filter, bias, nullptr, nullptr, output); break; default: diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 9860235b2fb..081a40b23b5 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017-2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -64,10 +64,30 @@ struct OpData { int32_t output_activation_max; }; +bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, + const TfLiteTensor* filter, const TfLiteTensor* bias, + const TfLiteDepthwiseConvParams* params) { + const auto* affine_quantization = + reinterpret_cast(filter->quantization.params); + // MLI optimized version only supports int8 dataype, dilation factor of 1 and + // per-axis quantization of weights (no broadcasting/per-tensor) + bool ret_val = (filter->type == kTfLiteInt8) && + (input->type == kTfLiteInt8) && + (bias->type == kTfLiteInt32) && + (params->dilation_width_factor == 1) && + (params->dilation_height_factor == 1) && + (affine_quantization->scale->size == + filter->dims->data[kDepthwiseConvQuantizedDimension]) && + affine_quantization->scale->size <= (kMaxChannels * 2); + return ret_val; +} + + TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, TfLiteDepthwiseConvParams* params, int width, int height, int filter_width, int filter_height, - const TfLiteType data_type, OpData* data) { + const TfLiteType data_type, bool mli_is_applicable, + OpData* data) { bool has_bias = node->inputs->size == 3; // Check number of inputs/outputs TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); @@ -81,7 +101,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. 
- if (data_type != kTfLiteFloat32) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) + if (data_type != kTfLiteFloat32 && !mli_is_applicable) { const TfLiteTensor* input = GetInput(context, node, kInputTensor); const TfLiteTensor* filter = GetInput(context, node, kFilterTensor); const TfLiteTensor* bias = @@ -106,15 +127,17 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteNode* node, data->per_channel_output_multiplier, reinterpret_cast(data->per_channel_output_shift), num_channels)); } +#endif return kTfLiteOk; } } // namespace -void EvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output) { +TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float output_activation_min, output_activation_max; CalculateActivationRange(params->activation, &output_activation_min, &output_activation_max); @@ -137,6 +160,12 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, @@ -145,7 +174,6 @@ TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { // Run Depthwise Conv MLI kernel - // MLI optimized version only supports int8 dataype and dilation factor of 1 mli_tensor mli_in = {0}; mli_tensor mli_weights = {0}; mli_tensor mli_bias = {0}; @@ -200,18 +228,23 @@ TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node const int overlap = kernelHeight - cfg.stride_height; // for weight slicing (on output channels) - const int weight_out_ch_dimension = 3; // HWCN layout for weigths, output channel dimension is the first dimension. - const int bias_out_ch_dimension = 0; // bias has only 1 dimension - const int out_tensor_ch_dimension = 3; // Batch-Height-Width-Channel layout means last dimension is output channels. + // HWCN layout for weigths, output channel dimension is the first dimension. + const int weight_out_ch_dimension = 3; + // bias has only 1 dimension + const int bias_out_ch_dimension = 0; + // Batch-Height-Width-Channel layout means last dimension is output channels. + const int out_tensor_ch_dimension = 3; const int32_t in_channels = mli_in.shape[out_tensor_ch_dimension]; const int32_t out_channels = mli_out.shape[out_tensor_ch_dimension]; int slice_channels = static_cast(mli_weights.shape[weight_out_ch_dimension]); - // Tensors for data in fast (local) memory and config to copy data from external to local memory + // Tensors for data in fast (local) memory + // and config to copy data from external to local memory mli_tensor weights_local = mli_weights; mli_tensor bias_local = mli_bias; mli_tensor in_local = mli_in; - mli_tensor out_local = mli_out; // this assumes that output shape is already filled in the tensor struct. 
+ mli_tensor out_local = mli_out; // this assumes that output shape + // is already filled in the tensor struct. mli_mov_cfg_t copy_config; mli_mov_cfg_for_copy(©_config); @@ -238,10 +271,13 @@ TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node slice_channels = (slice_channels / in_channels) * in_channels; } - TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, 0, 0, 0, true); TensorSlicer b_slice(&mli_bias, bias_out_ch_dimension, slice_channels); - TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); - TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, 0, 0, 0, true); + TensorSlicer w_slice(&mli_weights, weight_out_ch_dimension, slice_channels, + 0, 0, 0, true); + TensorSlicer out_ch_slice(&mli_out, out_tensor_ch_dimension, slice_channels, + 0, 0, 0, true); + TensorSlicer in_ch_slice(&mli_in, out_tensor_ch_dimension, slice_channels, + 0, 0, 0, true); mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; @@ -266,7 +302,8 @@ TfLiteStatus EvalMliQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node the sliceHeight has been calculated. The tensor slicer is configured that it will completely slice the nBatch dimension (0) and slice the height dimension (1) in chunks of 'sliceHeight' */ - TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, padding_top, padding_bottom, overlap); + TensorSlicer in_slice(in_ch_slice.Sub(), heightDimension, inSliceHeight, + padding_top, padding_bottom, overlap); /* output tensor is alreade sliced in the output channel dimension. out_ch_slice.Sub() is the tensor for the amount of output channels of this @@ -312,6 +349,7 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) DepthwiseParams op_params; op_params.padding_type = PaddingType::kSame; op_params.padding_values.width = data->padding.width; @@ -335,12 +373,18 @@ TfLiteStatus EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Node configuration is not supported by ARC MLI Library."); + return kTfLiteError; +#endif } -void EvalQuantized(TfLiteContext* context, TfLiteNode* node, - TfLiteDepthwiseConvParams* params, OpData* data, - const TfLiteTensor* input, const TfLiteTensor* filter, - const TfLiteTensor* bias, TfLiteTensor* output) { +TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLiteDepthwiseConvParams* params, OpData* data, + const TfLiteTensor* input, const TfLiteTensor* filter, + const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) const int32_t input_offset = -input->params.zero_point; const int32_t filter_offset = -filter->params.zero_point; const int32_t output_offset = output->params.zero_point; @@ -369,6 +413,12 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus Eval(TfLiteContext* 
context, TfLiteNode* node) { @@ -390,7 +440,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpData data; // All per-channel quantized tensors need valid zero point and scale arrays. - bool mli_is_applicable = false; if (input->type == kTfLiteInt8) { TF_LITE_ENSURE_EQ(context, filter->quantization.type, kTfLiteAffineQuantization); @@ -407,20 +456,16 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { filter->dims->data[kDepthwiseConvQuantizedDimension]); TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, affine_quantization->zero_point->size); - mli_is_applicable = - ((filter->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && - (params->dilation_width_factor == 1) && - (params->dilation_height_factor == 1) && - (affine_quantization->scale->size == - filter->dims->data[kDepthwiseConvQuantizedDimension])); } + bool mli_is_applicable = IsMliApplicable(context, input, filter, bias, params); TF_LITE_ENSURE_STATUS(CalculateOpData(context, node, params, width, height, filter_width, filter_height, data_type, - &data)); + mli_is_applicable, &data)); switch (input->type) { // Already know in/out types are same. case kTfLiteFloat32: - EvalFloat(context, node, params, &data, input, filter, bias, output); + return EvalFloat(context, node, params, &data, input, filter, bias, + output); break; case kTfLiteInt8: if (mli_is_applicable) { @@ -432,7 +477,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; case kTfLiteUInt8: - EvalQuantized(context, node, params, &data, input, filter, bias, output); + return EvalQuantized(context, node, params, &data, input, filter, bias, + output); break; default: TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 185217d0c6a..70d1fda4c2b 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2017-2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/fully_connected.h" -#include "mli_api.h" +#include "mli_api.h" #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" @@ -23,10 +23,10 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" -#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" -#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" #include "tensorflow/lite/micro/kernels/arc_mli/mli_slicers.h" +#include "tensorflow/lite/micro/kernels/arc_mli/mli_tf_utils.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" +#include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" namespace tflite { namespace ops { @@ -52,6 +52,18 @@ constexpr int kWeightsTensor = 1; constexpr int kBiasTensor = 2; constexpr int kOutputTensor = 0; +bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, + const TfLiteTensor* filter, const TfLiteTensor* bias, + const TfLiteFullyConnectedParams* params) { + // MLI optimized version only supports int8 dataype and no fused Relu and + // symmetric per-tensor quantization of weights (not per-axis) + bool ret_val = (filter->type == kTfLiteInt8) && + (input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && + (params->activation == kTfLiteActNone) && + (filter->params.zero_point == 0); + return ret_val; +} + TfLiteStatus CalculateOpData(TfLiteContext* context, TfLiteFullyConnectedParams* params, TfLiteType data_type, const TfLiteTensor* input, @@ -59,7 +71,9 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, const TfLiteTensor* bias, TfLiteTensor* output, OpData* data) { TfLiteStatus status = kTfLiteOk; - if (data_type != kTfLiteFloat32) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) + if (data_type != kTfLiteFloat32 && + !IsMliApplicable(context, input, filter, bias, params)) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); @@ -70,6 +84,7 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, context, params->activation, output, &data->output_activation_min, &data->output_activation_max)); } +#endif return status; } @@ -95,6 +110,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* bias = GetOptionalInputTensor(context, node, kBiasTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + TF_LITE_ENSURE(context, data != nullptr); TF_LITE_ENSURE_EQ(context, input->type, output->type); TF_LITE_ENSURE_MSG(context, input->type == filter->type, "Hybrid models are not supported on TFLite Micro."); @@ -106,122 +122,135 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, + TfLiteFullyConnectedParams* params, + OpData* data, const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* bias, + TfLiteTensor* output) { + mli_tensor mli_in = {0}; + mli_tensor mli_weights = {0}; + mli_tensor mli_bias = {0}; + mli_tensor mli_out = {0}; + + ConvertToMliTensor(input, &mli_in); + ConvertToMliTensor(filter, &mli_weights); + ConvertToMliTensor(bias, &mli_bias); + ConvertToMliTensor(output, &mli_out); + + /* The input tensor can have more than 2 dimensions. for the compute this + doesn't make any difference because all the inputs or a batch entry will + be used anyway. because the MLI kernel doesn't recognize the multiple + dimensions, the tensor shape is casted to a {batchnum, inputsize} shape. 
*/ + mli_in.shape[0] = mli_out.shape[0]; + mli_in.shape[1] = mli_weights.shape[1]; + mli_in.shape[2] = 0; + mli_in.shape[3] = 0; + mli_in.rank = 2; + + // Tensors for data in fast (local) memory and config to copy data from + // external to local memory + mli_tensor weights_local = mli_weights; + mli_tensor bias_local = mli_bias; + mli_tensor in_local = mli_in; + mli_tensor out_local = mli_out; + mli_mov_cfg_t copy_config; + mli_mov_cfg_for_copy(©_config); + const int weight_out_dimension = 0; + const int out_tensor_dimension = 1; + const int batch_dimension = 0; + int slice_size = mli_weights.shape[weight_out_dimension]; + + /* allocate the local buffers, and compute the slice size */ + TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors( + context, &in_local, &weights_local, &bias_local, &out_local)); + TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights( + &weights_local, &bias_local, weight_out_dimension, &slice_size)); + int max_out_slice_size = + out_local.capacity / mli_hlp_tensor_element_size(&out_local); + if (slice_size > max_out_slice_size) slice_size = max_out_slice_size; + + /* is_local indicates that the tensor is already in local memory, + so in that case the original tensor can be used, + and there is no need to copy it to the local tensor*/ + const bool in_is_local = in_local.data == mli_in.data; + const bool out_is_local = out_local.data == mli_out.data; + const bool w_is_local = weights_local.data == mli_weights.data; + const bool b_is_local = bias_local.data == mli_bias.data; + + TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size); + TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size); + TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0, + true); + + mli_tensor* w_ptr = w_is_local ? w_slice.Sub() : &weights_local; + mli_tensor* b_ptr = b_is_local ? b_slice.Sub() : &bias_local; + + void* input_buffer_ptr = NULL; + + while (!w_slice.Done()) { + mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); + mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); + + TensorSlicer in_slice(&mli_in, batch_dimension, 1); + + /* output tensor is alreade sliced in the output size dimension. + out_ch_slice.Sub() is the tensor for the amount of output size of this + itteration of the weight slice loop. This tensor needs to be further + sliced over the batch */ + TensorSlicer out_slice(out_ch_slice.Sub(), batch_dimension, 1); + + /* setup the pointers to the local or remote tensor to make the code + * inside the loop easier. */ + mli_tensor* in_ptr = in_is_local ? in_slice.Sub() : &in_local; + mli_tensor* out_ptr = out_is_local ? 
out_slice.Sub() : &out_local; + + while (!out_slice.Done()) { + // if same input copy as previous iteration, skip the copy of input + if (in_slice.Sub()->data != input_buffer_ptr) { + mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); + input_buffer_ptr = in_slice.Sub()->data; + } + mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr); + mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); + + in_slice.Next(); + out_slice.Next(); + } + w_slice.Next(); + b_slice.Next(); + out_ch_slice.Next(); + } + return kTfLiteOk; +} + TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, TfLiteFullyConnectedParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { - // Run Fully Connected MLI kernel - // MLI optimized version only supports int8 dataype and no fused Relu - // TODO: subject to add mli_saturate kernel - // work around for issue #35318, mli fully connect kernel only supports - // zeropoint == 0 for weights. this check can be removed once issue #35318 is - // resolved. - if ((filter->params.zero_point == 0) && - (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone)) { - mli_tensor mli_in = {0}; - mli_tensor mli_weights = {0}; - mli_tensor mli_bias = {0}; - mli_tensor mli_out = {0}; +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) + FullyConnectedParams op_params; + op_params.input_offset = -input->params.zero_point; + op_params.weights_offset = -filter->params.zero_point; + op_params.output_offset = output->params.zero_point; + op_params.output_multiplier = data->output_multiplier; + // TODO(b/138810107): Figure out whether output shift should be inverted + op_params.output_shift = -data->output_shift; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; - ConvertToMliTensor(input, &mli_in); - ConvertToMliTensor(filter, &mli_weights); - ConvertToMliTensor(bias, &mli_bias); - ConvertToMliTensor(output, &mli_out); - - /* The input tensor can have more than 2 dimensions. for the compute this doesn't make any difference - because all the inputs or a batch entry will be used anyway. because the MLI kernel doesn't recognize - the multiple dimensions, the tensor shape is casted to a {batchnum, inputsize} shape. 
*/ - mli_in.shape[0] = mli_out.shape[0]; - mli_in.shape[1] = mli_weights.shape[1]; - mli_in.shape[2] = 0; - mli_in.shape[3] = 0; - mli_in.rank = 2; - - // Tensors for data in fast (local) memory and config to copy data from external to local memory - mli_tensor weights_local = mli_weights; - mli_tensor bias_local = mli_bias; - mli_tensor in_local = mli_in; - mli_tensor out_local = mli_out; - mli_mov_cfg_t copy_config; - mli_mov_cfg_for_copy(©_config); - const int weight_out_dimension = 0; - const int out_tensor_dimension = 1; - const int batch_dimension = 0; - int slice_size = mli_weights.shape[weight_out_dimension]; - - /* allocate the local buffers, and compute the slice size */ - TF_LITE_ENSURE_STATUS(get_arc_scratch_buffer_for_fully_connect_tensors(context, &in_local, &weights_local, &bias_local, &out_local)); - TF_LITE_ENSURE_STATUS(arc_scratch_buffer_calc_slice_size_weights(&weights_local, &bias_local, weight_out_dimension, &slice_size)); - int max_out_slice_size = out_local.capacity / mli_hlp_tensor_element_size(&out_local); - if (slice_size > max_out_slice_size) slice_size = max_out_slice_size; - - /* is_local indicates that the tensor is already in local memory, - so in that case the original tensor can be used, - and there is no need to copy it to the local tensor*/ - const bool in_is_local = in_local.data == mli_in.data; - const bool out_is_local = out_local.data == mli_out.data; - const bool w_is_local = weights_local.data == mli_weights.data; - const bool b_is_local = bias_local.data == mli_bias.data; - - TensorSlicer w_slice(&mli_weights, weight_out_dimension, slice_size); - TensorSlicer b_slice(&mli_bias, weight_out_dimension, slice_size); - TensorSlicer out_ch_slice(&mli_out, out_tensor_dimension, slice_size, 0, 0, 0, true); - - mli_tensor *w_ptr = w_is_local ? w_slice.Sub() : &weights_local; - mli_tensor *b_ptr = b_is_local ? b_slice.Sub() : &bias_local; - - void *input_buffer_ptr = NULL; - - while (!w_slice.Done()){ - mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); - mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - - TensorSlicer in_slice(&mli_in, batch_dimension, 1); - - /* output tensor is alreade sliced in the output size dimension. out_ch_slice.Sub() is the tensor for the amount of - output size of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch */ - TensorSlicer out_slice(out_ch_slice.Sub(), batch_dimension, 1); - - /* setup the pointers to the local or remote tensor to make the code inside the loop easier. */ - mli_tensor *in_ptr = in_is_local ? in_slice.Sub() : &in_local; - mli_tensor *out_ptr = out_is_local ? 
out_slice.Sub() : &out_local; - - while (!out_slice.Done()) { - - // if same input copy as previous iteration, skip the copy of input - if (in_slice.Sub()->data != input_buffer_ptr) { - mli_mov_tensor_sync(in_slice.Sub(), ©_config, in_ptr); - input_buffer_ptr = in_slice.Sub()->data; - } - mli_krn_fully_connected_sa8_sa8_sa32(in_ptr, w_ptr, b_ptr, out_ptr); - mli_mov_tensor_sync(out_ptr, ©_config, out_slice.Sub()); - - in_slice.Next(); - out_slice.Next(); - } - w_slice.Next(); - b_slice.Next(); - out_ch_slice.Next(); - } - } else { - FullyConnectedParams op_params; - op_params.input_offset = -input->params.zero_point; - op_params.weights_offset = -filter->params.zero_point; - op_params.output_offset = output->params.zero_point; - op_params.output_multiplier = data->output_multiplier; - // TODO(b/138810107): Figure out whether output shift should be inverted - op_params.output_shift = -data->output_shift; - op_params.quantized_activation_min = data->output_activation_min; - op_params.quantized_activation_max = data->output_activation_max; - - reference_integer_ops::FullyConnected( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output)); - } + reference_integer_ops::FullyConnected( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(filter), GetTensorData(filter), + GetTensorShape(bias), GetTensorData(bias), + GetTensorShape(output), GetTensorData(output)); return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Node configuration is not supported by ARC MLI Library."); + return kTfLiteError; +#endif } TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, @@ -229,6 +258,7 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) const int32_t input_offset = -input->params.zero_point; const int32_t filter_offset = -filter->params.zero_point; const int32_t output_offset = output->params.zero_point; @@ -261,14 +291,20 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, TfLiteTypeGetName(output->type), output->type); return kTfLiteError; } - return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, TfLiteFullyConnectedParams* params, OpData* data, const TfLiteTensor* input, const TfLiteTensor* filter, const TfLiteTensor* bias, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float output_activation_min, output_activation_max; CalculateActivationRange(params->activation, &output_activation_min, &output_activation_max); @@ -281,6 +317,12 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { @@ -293,6 +335,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, kOutputTensor); OpData* data = 
reinterpret_cast(node->user_data); + TF_LITE_ENSURE(context, data != nullptr); // Checks in Prepare ensure input, output and filter types are all the same. switch (input->type) { @@ -300,12 +343,17 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return EvalFloat(context, node, params, data, input, filter, bias, output); case kTfLiteInt8: - return EvalQuantizedInt8(context, node, params, data, input, filter, bias, - output); + if (IsMliApplicable(context, input, filter, bias, params)) { + return EvalMliQuantizedInt8(context, node, params, data, input, filter, + bias, output); + } else { + return EvalQuantizedInt8(context, node, params, data, input, filter, + bias, output); + } - case kTfLiteUInt8: - return EvalQuantized(context, node, params, data, input, filter, bias, - output); + case kTfLiteUInt8: + return EvalQuantized(context, node, params, data, input, filter, bias, + output); default: TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index 2c3875b58eb..79deacc23d9 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2019-2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -42,6 +42,15 @@ struct OpData { enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 }; + +bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, + const TfLitePoolParams* params) { + // MLI optimized version only supports int8 dataype and no fused Relu + // TODO: subject to add mli_saturate kernel + return (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone); +} + + TfLiteStatus CalculateOpData(const TfLiteContext* context, const TfLitePoolParams* params, const TfLiteTensor* input, @@ -61,9 +70,11 @@ TfLiteStatus CalculateOpData(const TfLiteContext* context, return kTfLiteOk; } -void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +TfLiteStatus AverageEvalFloat(TfLiteContext* context, + const TfLiteNode* node, + const TfLitePoolParams* params, const OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float activation_min, activation_max; CalculateActivationRange(params->activation, &activation_min, &activation_max); @@ -80,6 +91,13 @@ void AverageEvalFloat(const TfLiteContext* context, const TfLiteNode* node, reference_ops::AveragePool( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } //Prepare MLI tensors and run Average or Max Pooling @@ -164,45 +182,49 @@ TfLiteStatus EvalMli(TfLiteContext* context, const TfLitePoolParams* params, return kTfLiteOk; } -void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node, - const TfLitePoolParams* params, const OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +TfLiteStatus AverageEvalQuantized(TfLiteContext* context, + const TfLiteNode* node, + const 
TfLitePoolParams* params, + const OpData* data, const TfLiteTensor* input, + TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); - // Run Average Pooling MLI kernel - // MLI optimized version only supports int8 dataype and no fused Relu - // TODO: subject to add mli_saturate kernel - if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) { - EvalMli(context, params, data, input, output, AveragePooling); + int32_t activation_min, activation_max; + (void)CalculateActivationRangeQuantized(context, params->activation, output, + &activation_min, &activation_max); + PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = activation_min; + op_params.quantized_activation_max = activation_max; + + if (input->type == kTfLiteUInt8) { + reference_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); } else { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); - - PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; - - if (input->type == kTfLiteUInt8) { - reference_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - } else { - reference_integer_ops::AveragePool( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - } + reference_integer_ops::AveragePool( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); } + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG( + context, + "Node configuration or type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } -void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, - TfLitePoolParams* params, OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +TfLiteStatus MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, + TfLitePoolParams* params, OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) float activation_min, activation_max; CalculateActivationRange(params->activation, &activation_min, &activation_max); @@ -219,43 +241,50 @@ void MaxEvalFloat(TfLiteContext* context, TfLiteNode* node, reference_ops::MaxPool(op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } -void MaxEvalQuantized(TfLiteContext* context, 
TfLiteNode* node, - TfLitePoolParams* params, OpData* data, - const TfLiteTensor* input, TfLiteTensor* output) { +TfLiteStatus MaxEvalQuantized(TfLiteContext* context, TfLiteNode* node, + TfLitePoolParams* params, OpData* data, + const TfLiteTensor* input, TfLiteTensor* output) { +#if !defined(TF_LITE_STRIP_REFERENCE_IMPL) TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); - - // Run Max Pooling MLI kernel - // MLI optimized version only supports int8 dataype and no fused Relu - // TODO: subject to add mli_saturate kernel - if (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone) { - EvalMli(context, params, data, input, output, MaxPooling); - } else { - int32_t activation_min, activation_max; - (void)CalculateActivationRangeQuantized(context, params->activation, output, - &activation_min, &activation_max); + int32_t activation_min, activation_max; + (void)CalculateActivationRangeQuantized(context, params->activation, output, + &activation_min, &activation_max); - tflite::PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data->padding.height; - op_params.padding_values.width = data->padding.width; - op_params.quantized_activation_min = activation_min; - op_params.quantized_activation_max = activation_max; + tflite::PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data->padding.height; + op_params.padding_values.width = data->padding.width; + op_params.quantized_activation_min = activation_min; + op_params.quantized_activation_max = activation_max; - if (input->type == kTfLiteUInt8) { + if (input->type == kTfLiteUInt8) { reference_ops::MaxPool( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - } else { + } else { reference_integer_ops::MaxPool( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(output), GetTensorData(output)); - } } + return kTfLiteOk; +#else + TF_LITE_KERNEL_LOG(context, + "Node configuration or type %s (%d) is not supported by ARC MLI Library.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; +#endif } } // namespace @@ -272,11 +301,16 @@ TfLiteStatus AverageEval(TfLiteContext* context, TfLiteNode* node) { // Inputs and outputs share the same type, guaranteed by the converter. 
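+  // As with the other ARC MLI kernels, int8 pooling with no fused activation
+  // (see IsMliApplicable above) is routed to the MLI kernel via EvalMli();
+  // all other configurations fall back to the reference kernels, which report
+  // an error when TF_LITE_STRIP_REFERENCE_IMPL is defined.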
switch (input->type) { case kTfLiteFloat32: - AverageEvalFloat(context, node, params, &data, input, output); + return AverageEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: case kTfLiteInt8: - AverageEvalQuantized(context, node, params, &data, input, output); + if (IsMliApplicable(context, input, params)) { + return EvalMli(context, params, &data, input, output, AveragePooling); + } else { + return AverageEvalQuantized(context, node, params, &data, input, + output); + } break; default: TF_LITE_KERNEL_LOG(context, "Input type %s is not currently supported", @@ -297,11 +331,15 @@ TfLiteStatus MaxEval(TfLiteContext* context, TfLiteNode* node) { switch (input->type) { case kTfLiteFloat32: - MaxEvalFloat(context, node, params, &data, input, output); + return MaxEvalFloat(context, node, params, &data, input, output); break; case kTfLiteUInt8: case kTfLiteInt8: - MaxEvalQuantized(context, node, params, &data, input, output); + if (IsMliApplicable(context, input, params)) { + return EvalMli(context, params, &data, input, output, MaxPooling); + } else { + return MaxEvalQuantized(context, node, params, &data, input, output); + } break; default: TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.", diff --git a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc index 3b8fa04d536..ee3cc8113c1 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc @@ -86,6 +86,14 @@ endif ARC_MLI_TESTS += $(foreach TEST,$(ARC_MLI_TESTS), $(TEST)_slicing) generate_arc_mli_test_projects: $(foreach TEST,$(ARC_MLI_TESTS), generate_kernel_$(TEST)_test_make_project) + + ARC_EXTRA_APP_SETTINGS += \ + \nMLI_ONLY ?= false\n\ + \nifeq \($(DLR)\(MLI_ONLY\), true\)\ + \nCCFLAGS += -DTF_LITE_STRIP_REFERENCE_IMPL\ + \nCXXFLAGS += -DTF_LITE_STRIP_REFERENCE_IMPL\ + \nendif\n + endif # no_embarc_mli From 2621bf4ee40a7d14db48b63ead3fca2589552670 Mon Sep 17 00:00:00 2001 From: naumkin Date: Sun, 26 Apr 2020 23:49:42 -0700 Subject: [PATCH 0087/1533] Data movement tests added --- .../kernels/arc_mli/conv_slicing_test.cc | 784 +++++------- .../arc_mli/depthwise_conv_slicing_test.cc | 836 +++++------- .../arc_mli/fully_connected_slicing_test.cc | 1074 ++++------------ .../kernels/arc_mli/pooling_slicing_test.cc | 1140 ++++------------- 4 files changed, 1167 insertions(+), 2667 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc index a1f155ecc56..27e30856f6c 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc @@ -24,25 +24,114 @@ namespace tflite { namespace testing { namespace { -// Common inputs and outputs. -static const int kInputElements = 16; -static const int kInputShape[] = {4, 2, 2, 4, 1}; -static const float kInputData[] = {1, 1, 1, 1, 2, 2, 2, 2, - 1, 2, 3, 4, 1, 2, 3, 4}; -static const int kFilterElements = 12; -static const int kFilterShape[] = {4, 3, 2, 2, 1}; -static const float kFilterData[] = {1, 2, 3, 4, -1, 1, -1, 1, -1, -1, 1, 1}; -static const int kBiasElements = 3; -static const int kBiasShape[] = {1, 3}; -static const float kBiasData[] = {1, 2, 3}; -static const int kOutputElements = 12; -static const int kOutputShape[] = {4, 2, 1, 2, 3}; -static const float kGoldenData[] = {18, 2, 5, 18, 2, 5, 17, 4, 3, 37, 4, 3}; +// Common inputs and outputs 1. 
+static const int kInput1Elements = 20; +static const int kInput1Shape[] = {4, 1, 5, 2, 2}; +static const float kInput1Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; +static const int kFilter1Elements = 36; +static const int kFilter1Shape[] = {4, 2, 3, 3, 2}; +static const float kFilter1Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; +static const int kBias1Elements = 2; +static const int kBias1Shape[] = {1, 2}; +static const float kBias1Data[] = {2, 2}; +static const int kOutput1Elements = 20; +static const int kOutput1Shape[] = {4, 1, 5, 2, 2}; +static const float kGolden1Data[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + +// Common inputs and outputs 2. +static const int kInput2Elements = 80; +static const int kInput2Shape[] = {4, 1, 20, 2, 2}; +static const float kInput2Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; +static const int kFilter2Elements = 36; +static const int kFilter2Shape[] = {4, 2, 3, 3, 2}; +static const float kFilter2Data[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; +static const int kBias2Elements = 2; +static const int kBias2Shape[] = {1, 2}; +static const float kBias2Data[] = {2, 2}; +static const int kOutput2Elements = 80; +static const int kOutput2Shape[] = {4, 1, 20, 2, 2}; +static const float kGolden2Data[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + +// Common inputs and outputs 3. +static const int kInput3Elements = 40; +static const int kInput3Shape[] = {4, 1, 2, 2, 10}; +static const float kInput3Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const int kFilter3Elements = 90; +static const int kFilter3Shape[] = {4, 1, 3, 3, 10}; // 1 3 3 10 +static const float kFilter3Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const int kBias3Elements = 1; +static const int kBias3Shape[] = {1, 1}; +static const float kBias3Data[] = {1}; +static const int kOutput3Elements = 4; +static const int kOutput3Shape[] = {4, 1, 2, 2, 1}; // 2 2 1 +static const float kGolden3Data[] = {41, 41, 41, 41}; + +// Common inputs and outputs 4. 
+static const int kInput4Elements = 80; +static const int kInput4Shape[] = {4, 1, 4, 2, 10}; +static const float kInput4Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const int kFilter4Elements = 90; +static const int kFilter4Shape[] = {4, 1, 3, 3, 10}; +static const float kFilter4Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; +static const int kBias4Elements = 1; +static const int kBias4Shape[] = {1, 1}; +static const float kBias4Data[] = {1}; +static const int kOutput4Elements = 8; +static const int kOutput4Shape[] = {4, 1, 4, 2, 1}; +static const float kGolden4Data[] = {41, 41, 61, 61, 61, 61, 41, 41}; static TfLiteConvParams common_conv_params = { - kTfLitePaddingValid, // padding - 2, // stride_width - 2, // stride_height + kTfLitePaddingSame, // padding + 1, // stride_width + 1, // stride_height kTfLiteActNone, // activation 1, // dilation_width_factor 1, // dilation_height_factor @@ -109,77 +198,6 @@ TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, return kTfLiteOk; } -void TestConvFloat(const int* input_dims_data, const float* input_data, - const int* filter_dims_data, const float* filter_data, - const int* bias_dims_data, const float* bias_data, - const int* output_dims_data, - const float* expected_output_data, float* output_data, - TfLiteConvParams* conv_params) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(filter_data, filter_dims, "filter_tensor"), - CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, - ValidateConvGoldens(tensors, tensors_size, expected_output_data, - output_data, output_dims_count, conv_params)); -} - -void TestConvQuantizedPerLayer( - const int* input_dims_data, const float* input_data, - uint8_t* input_quantized, float input_scale, const int* filter_dims_data, - const float* filter_data, uint8_t* filter_quantized, float filter_scale, - const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, - const int* output_dims_data, const float* expected_output_data, - uint8_t* expected_output_quantized, uint8_t* output_data, - float output_scale, TfLiteConvParams* conv_params) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - 
tflite::AsymmetricQuantize(expected_output_data, expected_output_quantized, - output_dims_count, output_scale, 128); - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateQuantizedTensor(input_data, input_quantized, input_dims, - input_scale, 128, "input_tensor"), - CreateQuantizedTensor(filter_data, filter_quantized, filter_dims, - filter_scale, 128, "filter_tensor"), - CreateQuantizedBiasTensor(bias_data, bias_quantized, bias_dims, - input_scale, filter_scale, "bias_tensor"), - CreateQuantizedTensor(output_data, output_dims, output_scale, 128, - "output_tensor")}; - - // TODO(njeff): Affine Quantization Params should be set on tensor creation. - float filter_scales[] = {1, filter_scale}; - int filter_zero_points[] = {1, 128}; - TfLiteAffineQuantization filter_quant = { - FloatArrayFromFloats(filter_scales), - IntArrayFromInts(filter_zero_points)}; - tensors[1].quantization = {kTfLiteAffineQuantization, &filter_quant}; - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, - ValidateConvGoldens(tensors, tensors_size, expected_output_quantized, - output_data, output_dims_count, conv_params)); -} - void TestConvQuantizedPerChannel( const int* input_dims_data, const float* input_data, int8_t* input_quantized, float input_scale, int input_zero_point, @@ -207,6 +225,20 @@ void TestConvQuantizedPerChannel( filter_data, filter_data_quantized, filter_dims, filter_scales, filter_zero_points, &filter_quant, 0 /* quantized dimension */, "filter_tensor"); + + // DN: to replace scales and quantized data to avoid second quantization + int channel_count = filter_dims->data[0]; + float true_filter_scales[5] = {1.0, 1.0, 1.0, 1.0, 1.0}; + true_filter_scales[0] = static_cast(channel_count); + TfLiteAffineQuantization *to_change = (TfLiteAffineQuantization *)filter_tensor.quantization.params; + to_change->scale = FloatArrayFromFloats(true_filter_scales); + + int filter_size = filter_tensor.bytes; + for(int i = 0; i < filter_size; ++i) { + filter_tensor.data.int8[i] = filter_data[i]; + } + + TfLiteTensor bias_tensor = CreatePerChannelQuantizedBiasTensor( bias_data, bias_data_quantized, bias_dims, input_scale, &filter_scales[1], bias_scales, bias_zero_points, &bias_quant, 0 /* quantized dimension */, @@ -255,375 +287,223 @@ void TestConvQuantizedPerChannel( TF_LITE_MICRO_TESTS_BEGIN -TF_LITE_MICRO_TEST(SimpleTestFloat) { - float output_data[tflite::testing::kOutputElements]; - - tflite::testing::TestConvFloat( - tflite::testing::kInputShape, tflite::testing::kInputData, - tflite::testing::kFilterShape, tflite::testing::kFilterData, - tflite::testing::kBiasShape, tflite::testing::kBiasData, - tflite::testing::kOutputShape, tflite::testing::kGoldenData, output_data, - &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(InputAndFilterSameWidthHeight) { - const int output_dims_count = 2; - float output_data[output_dims_count]; - - const int kFilterShape[] = {4, 1, 2, 4, 1}; - const float filter_values[] = {1, 2, 3, 4, -1, -1, 1, 1}; - const int kBiasShape[] = {1, 1}; - const float bias_values[] = {0}; - const int kOutputShape[] = {4, 2, 1, 1, 1}; - const float expected_output[] = {10, 34}; - - tflite::testing::TestConvFloat( - tflite::testing::kInputShape, tflite::testing::kInputData, kFilterShape, - filter_values, kBiasShape, bias_values, kOutputShape, expected_output, - output_data, &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantized) { - const int 
output_dims_count = 12; - uint8_t output_data[output_dims_count]; - - const float input_scale = 0.5f; - const float filter_scale = 0.5f; - const float output_scale = 1.0f; - - uint8_t input_quantized[tflite::testing::kInputElements]; - uint8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - uint8_t golden_quantized[tflite::testing::kOutputElements]; - - tflite::testing::TestConvQuantizedPerLayer( - tflite::testing::kInputShape, tflite::testing::kInputData, - input_quantized, input_scale, tflite::testing::kFilterShape, - tflite::testing::kFilterData, filter_quantized, filter_scale, - tflite::testing::kBiasShape, tflite::testing::kBiasData, bias_quantized, - tflite::testing::kOutputShape, tflite::testing::kGoldenData, - golden_quantized, output_data, output_scale, - &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { - const int output_dims_count = 12; - int8_t output_data[output_dims_count]; - - const float input_scale = 0.5f; - const float output_scale = 1.0f; - const int input_zero_point = 0; - const int output_zero_point = 0; - - int8_t input_quantized[tflite::testing::kInputElements]; - int8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; - - tflite::testing::TestConvQuantizedPerChannel( - tflite::testing::kInputShape, tflite::testing::kInputData, - input_quantized, input_scale, input_zero_point, - tflite::testing::kFilterShape, tflite::testing::kFilterData, - filter_quantized, tflite::testing::kBiasShape, tflite::testing::kBiasData, - bias_quantized, scales, zero_points, tflite::testing::kOutputShape, - tflite::testing::kGoldenData, golden_quantized, output_data, output_scale, - output_zero_point, &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelRelu6) { - // conv params: - // padding, stride_, dilation_, activation - TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; - const int output_dims_count = 12; - int8_t output_data[output_dims_count]; - - const float bias_values[] = {1, 2, -3}; - const float golden_data[] = {6, 2, 0, 6, 2, 0, 6, 4, 0, 6, 4, 0}; - - const float input_scale = 0.023529f; - const float output_scale = 0.023529f; - const int input_zero_point = -128; - const int output_zero_point = -128; - - int8_t input_quantized[tflite::testing::kInputElements]; - int8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; - - tflite::testing::TestConvQuantizedPerChannel( - tflite::testing::kInputShape, tflite::testing::kInputData, - input_quantized, input_scale, input_zero_point, - tflite::testing::kFilterShape, tflite::testing::kFilterData, - filter_quantized, tflite::testing::kBiasShape, bias_values, - bias_quantized, scales, zero_points, tflite::testing::kOutputShape, - golden_data, golden_quantized, output_data, output_scale, - output_zero_point, &tflite::testing::common_conv_params); -} - -TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) { - // conv params: - // padding, stride_, activation, dilation_ - TfLiteConvParams conv_params = 
{kTfLitePaddingValid, 1, 1, - kTfLiteActNone, 1, 1}; - const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] - const int kInputElements = - kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; - float kInputData[/* kInputElements */] = {1, 1, 1, 1, 2, 2, 2, 2, - 1, 2, 3, 4, 1, 2, 3, 4}; - const int kFilterShape[] = {4, 3, 1, 1, 4}; - const int kFilterElements = - kFilterShape[1] * kFilterShape[2] * kFilterShape[3] * kFilterShape[4]; - float kFilterData[/* kFilterElements */] = {1, 2, 3, 4, -1, 1, - -1, 1, -1, -1, 1, 1}; - const int kBiasElements = kFilterShape[1]; - const int kBiasShape[] = {1, kBiasElements}; - float kBiasData[/* kBiasElements */] = {1, 2, 3}; - const int kOutputShape[] = {4, 1, 2, 2, kBiasElements}; - const int kOutputElements = 4 * 3; - int8_t output_data[kOutputElements]; - const float kGoldenData[/* kOutputElements */] = {11, 2, 3, 21, 2, 3, - 31, 4, 7, 31, 4, 7}; - - const float input_scale = 0.5f; - const float output_scale = 1.0f; - const int input_zero_point = 0; - const int output_zero_point = 0; - - int8_t input_quantized[kInputElements]; - int8_t filter_quantized[kFilterElements]; - int32_t bias_quantized[kBiasElements]; - int8_t golden_quantized[kOutputElements]; - int zero_points[kBiasElements + 1]; - float scales[kBiasElements + 1]; - - tflite::testing::TestConvQuantizedPerChannel( - kInputShape, kInputData, input_quantized, input_scale, input_zero_point, - kFilterShape, kFilterData, filter_quantized, kBiasShape, kBiasData, - bias_quantized, scales, zero_points, kOutputShape, kGoldenData, - golden_quantized, output_data, output_scale, output_zero_point, - &conv_params); -} - -TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) { - // conv params: - // padding, stride_, dilation_, activation - TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6}; - const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] - const int kInputElements = - kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; - float kInputData[/* kInputElements */] = {1, 1, 1, 1, 2, 2, 2, 2, - 1, 2, 3, 4, 1, 2, 3, 4}; - const int kFilterShape[] = {4, 3, 1, 1, 4}; - const int kFilterElements = - kFilterShape[1] * kFilterShape[2] * kFilterShape[3] * kFilterShape[4]; - float kFilterData[/* kFilterElements */] = {1, 2, 3, 4, -1, 1, - -1, 1, -1, -1, 1, 1}; - const int kBiasElements = kFilterShape[1]; - const int kBiasShape[] = {1, kBiasElements}; - float kBiasData[/* kBiasElements */] = {1, 2, -3}; - const int kOutputShape[] = {4, 1, 2, 2, kBiasElements}; - const int kOutputElements = 4 * 3; - int8_t output_data[kOutputElements]; - const float kGoldenData[/* kOutputElements */] = {6, 2, 0, 6, 2, 0, - 6, 4, 1, 6, 4, 1}; - - const float input_scale = 0.023529f; - const float output_scale = 0.023529f; - const int input_zero_point = -128; - const int output_zero_point = -128; - - int8_t input_quantized[kInputElements]; - int8_t filter_quantized[kFilterElements]; - int32_t bias_quantized[kBiasElements]; - int8_t golden_quantized[kOutputElements]; - int zero_points[kBiasElements + 1]; - float scales[kBiasElements + 1]; - - tflite::testing::TestConvQuantizedPerChannel( - kInputShape, kInputData, input_quantized, input_scale, input_zero_point, - kFilterShape, kFilterData, filter_quantized, kBiasShape, kBiasData, - bias_quantized, scales, zero_points, kOutputShape, kGoldenData, - golden_quantized, output_data, output_scale, output_zero_point, - &conv_params); -} - -TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { - const int 
output_dims_count = 12; - int8_t output_data[output_dims_count]; - - const float input_scale = 0.5f; - const float output_scale = 1.0f; - - int8_t input_quantized[tflite::testing::kInputElements]; - int8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; - int zero_points[tflite::testing::kBiasElements + 1]; - float scales[tflite::testing::kBiasElements + 1]; - - TfLiteIntArray* input_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kInputShape); - TfLiteIntArray* filter_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kFilterShape); - TfLiteIntArray* bias_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kBiasShape); - TfLiteIntArray* output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShape); - - int filter_zero_points[5]; - float filter_scales[5]; - TfLiteAffineQuantization filter_quant; - TfLiteAffineQuantization bias_quant; - TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( - tflite::testing::kInputData, input_quantized, input_dims, input_scale, 0, - "input_tensor"); - TfLiteTensor filter_tensor = - tflite::testing::CreateSymmetricPerChannelQuantizedTensor( - tflite::testing::kFilterData, filter_quantized, filter_dims, - filter_scales, filter_zero_points, &filter_quant, - 0 /* quantized dimension */, "filter_tensor"); - TfLiteTensor bias_tensor = - tflite::testing::CreatePerChannelQuantizedBiasTensor( - tflite::testing::kBiasData, bias_quantized, bias_dims, input_scale, - &filter_scales[1], scales, zero_points, &bias_quant, 0, - "bias_tensor"); - TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( - output_data, output_dims, output_scale, 0 /* quantized dimension */, - "output_tensor"); - - float input_scales[] = {1, input_scale}; - int input_zero_points[] = {1, 128}; - TfLiteAffineQuantization input_quant = { - tflite::testing::FloatArrayFromFloats(input_scales), - tflite::testing::IntArrayFromInts(input_zero_points)}; - input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - input_tensor, - filter_tensor, - bias_tensor, - output_tensor, - }; - - tflite::AsymmetricQuantize(tflite::testing::kGoldenData, golden_quantized, - output_dims_count, output_scale, 0); - - // Set filter quant to mismatched dimension. - TfLiteAffineQuantization* quant = reinterpret_cast( - filter_tensor.quantization.params); - - // Choose arbitrary incorrect scale and zero point sizes which are neither 1 - // (for broadcast case) nor the quantized dimension size. - quant->scale->size = 2; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, - tflite::testing::ValidateConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_dims_count, &tflite::testing::common_conv_params)); - - // Set scale back to correct dimension, and make zero point array too short. 
- quant->scale->size = tflite::testing::kFilterShape[0]; - quant->zero_point->size = 2; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, - tflite::testing::ValidateConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_dims_count, &tflite::testing::common_conv_params)); -} - -TF_LITE_MICRO_TEST(BroadcastPerLayerQuantizationToPerChannelShouldMatchGolden) { - const int output_dims_count = 12; - int8_t output_data[output_dims_count]; - +// Test group 1 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel1) { + const int output_dims_count = 20; const float input_scale = 1.0f; - const float filter_scale = 1.0f; const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; - int8_t input_quantized[tflite::testing::kInputElements]; - int8_t filter_quantized[tflite::testing::kFilterElements]; - int32_t bias_quantized[tflite::testing::kBiasElements]; - int8_t golden_quantized[tflite::testing::kOutputElements]; + int8_t input_quantized[tflite::testing::kInput1Elements]; + int8_t filter_quantized[tflite::testing::kFilter1Elements]; + int32_t bias_quantized[tflite::testing::kBias1Elements]; + int8_t golden_quantized[tflite::testing::kOutput1Elements]; + int8_t output_data[output_dims_count]; - TfLiteIntArray* input_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kInputShape); - TfLiteIntArray* filter_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kFilterShape); - TfLiteIntArray* bias_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kBiasShape); - TfLiteIntArray* output_dims = - tflite::testing::IntArrayFromInts(tflite::testing::kOutputShape); + int zero_points[tflite::testing::kBias1Elements + 1]; + float scales[tflite::testing::kBias1Elements + 1]; - // Create per-layer quantized int8 input tensor. - TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( - tflite::testing::kInputData, input_quantized, input_dims, input_scale, 0, - "input_tensor"); - int input_zero_points[2] = {1, 0}; - float input_scales[2] = {1, input_scale}; - TfLiteAffineQuantization input_quant = { - tflite::testing::FloatArrayFromFloats(input_scales), - tflite::testing::IntArrayFromInts(input_zero_points)}; - input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; - - // Create per-layer quantized int8 filter tensor. - TfLiteTensor filter_tensor = tflite::testing::CreateQuantizedTensor( - tflite::testing::kFilterData, filter_quantized, filter_dims, filter_scale, - 0, "filter_tensor"); - int filter_zero_points[2] = {1, 0}; - float filter_scales[2] = {1, filter_scale}; - TfLiteAffineQuantization filter_quant = { - tflite::testing::FloatArrayFromFloats(filter_scales), - tflite::testing::IntArrayFromInts(filter_zero_points)}; - filter_tensor.quantization = {kTfLiteAffineQuantization, &filter_quant}; - - // Create per-layer quantized int32 bias tensor. - tflite::SymmetricQuantize(tflite::testing::kBiasData, bias_quantized, - tflite::testing::kBiasElements, - input_scale * output_scale); - TfLiteTensor bias_tensor = tflite::testing::CreateInt32Tensor( - bias_quantized, bias_dims, "bias_tensor"); - - int bias_zero_points[2] = {1, 0}; - float bias_scales[2] = {1, input_scale * filter_scale}; - TfLiteAffineQuantization bias_quant = { - tflite::testing::FloatArrayFromFloats(bias_scales), - tflite::testing::IntArrayFromInts(bias_zero_points)}; - bias_tensor.quantization = {kTfLiteAffineQuantization, &bias_quant}; - - // Create per-layer quantized int8 output tensor. 
- TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( - output_data, output_dims, output_scale, 0 /* quantized dimension */, - "output_tensor"); - int output_zero_points[2] = {1, 0}; - float output_scales[2] = {1, output_scale}; - TfLiteAffineQuantization output_quant = { - tflite::testing::FloatArrayFromFloats(output_scales), - tflite::testing::IntArrayFromInts(output_zero_points)}; - output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - input_tensor, - filter_tensor, - bias_tensor, - output_tensor, - }; - - tflite::AsymmetricQuantize(tflite::testing::kGoldenData, golden_quantized, - output_dims_count, output_scale, 0); - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, tflite::testing::ValidateConvGoldens( - tensors, tensors_size, golden_quantized, output_data, - output_dims_count, &tflite::testing::common_conv_params)); + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput1Shape, tflite::testing::kInput1Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter1Shape, tflite::testing::kFilter1Data, + filter_quantized, tflite::testing::kBias1Shape, tflite::testing::kBias1Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput1Shape, + tflite::testing::kGolden1Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); } +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel1) { + const int output_dims_count = 20; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + +#pragma Bss(".Xdata") + static int8_t input_quantized[tflite::testing::kInput1Elements]; + static int8_t filter_quantized[tflite::testing::kFilter1Elements]; + static int32_t bias_quantized[tflite::testing::kBias1Elements]; + static int8_t output_data[output_dims_count]; +#pragma Bss() + + int8_t golden_quantized[tflite::testing::kOutput1Elements]; + int zero_points[tflite::testing::kBias1Elements + 1]; + float scales[tflite::testing::kBias1Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput1Shape, tflite::testing::kInput1Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter1Shape, tflite::testing::kFilter1Data, + filter_quantized, tflite::testing::kBias1Shape, tflite::testing::kBias1Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput1Shape, + tflite::testing::kGolden1Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +// Test group 2 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel2) { + const int output_dims_count = 80; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[tflite::testing::kInput2Elements]; + int8_t filter_quantized[tflite::testing::kFilter2Elements]; + int32_t bias_quantized[tflite::testing::kBias2Elements]; + int8_t golden_quantized[tflite::testing::kOutput2Elements]; + int8_t output_data[output_dims_count]; + + int zero_points[tflite::testing::kBias2Elements + 1]; + float scales[tflite::testing::kBias2Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput2Shape, tflite::testing::kInput2Data, + input_quantized, input_scale, 
input_zero_point, + tflite::testing::kFilter2Shape, tflite::testing::kFilter2Data, + filter_quantized, tflite::testing::kBias2Shape, tflite::testing::kBias2Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput2Shape, + tflite::testing::kGolden2Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel2) { + const int output_dims_count = 80; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + +#pragma Bss(".Xdata") + static int8_t input_quantized[tflite::testing::kInput2Elements]; + static int8_t filter_quantized[tflite::testing::kFilter2Elements]; + static int32_t bias_quantized[tflite::testing::kBias2Elements]; + static int8_t output_data[output_dims_count]; +#pragma Bss() + + int8_t golden_quantized[tflite::testing::kOutput2Elements]; + int zero_points[tflite::testing::kBias2Elements + 1]; + float scales[tflite::testing::kBias2Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput2Shape, tflite::testing::kInput2Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter2Shape, tflite::testing::kFilter2Data, + filter_quantized, tflite::testing::kBias2Shape, tflite::testing::kBias2Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput2Shape, + tflite::testing::kGolden2Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +// Test group 3 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel3) { + const int output_dims_count = 4; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[tflite::testing::kInput3Elements]; + int8_t filter_quantized[tflite::testing::kFilter3Elements]; + int32_t bias_quantized[tflite::testing::kBias3Elements]; + int8_t golden_quantized[tflite::testing::kOutput3Elements]; + int8_t output_data[output_dims_count]; + + int zero_points[tflite::testing::kBias3Elements + 1]; + float scales[tflite::testing::kBias3Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput3Shape, tflite::testing::kInput3Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter3Shape, tflite::testing::kFilter3Data, + filter_quantized, tflite::testing::kBias3Shape, tflite::testing::kBias3Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput3Shape, + tflite::testing::kGolden3Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel3) { + const int output_dims_count = 4; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + +#pragma Bss(".Xdata") + static int8_t input_quantized[tflite::testing::kInput3Elements]; + static int8_t filter_quantized[tflite::testing::kFilter3Elements]; + static int32_t bias_quantized[tflite::testing::kBias3Elements]; + static int8_t output_data[output_dims_count]; +#pragma Bss() + + int8_t golden_quantized[tflite::testing::kOutput3Elements]; + int zero_points[tflite::testing::kBias3Elements + 1]; + float scales[tflite::testing::kBias3Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput3Shape, tflite::testing::kInput3Data, + 
input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter3Shape, tflite::testing::kFilter3Data, + filter_quantized, tflite::testing::kBias3Shape, tflite::testing::kBias3Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput3Shape, + tflite::testing::kGolden3Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +// Test group 4 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel4) { + const int output_dims_count = 8; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[tflite::testing::kInput4Elements]; + int8_t filter_quantized[tflite::testing::kFilter4Elements]; + int32_t bias_quantized[tflite::testing::kBias4Elements]; + int8_t golden_quantized[tflite::testing::kOutput4Elements]; + int8_t output_data[output_dims_count]; + + int zero_points[tflite::testing::kBias4Elements + 1]; + float scales[tflite::testing::kBias4Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput4Shape, tflite::testing::kInput4Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter4Shape, tflite::testing::kFilter4Data, + filter_quantized, tflite::testing::kBias4Shape, tflite::testing::kBias4Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput4Shape, + tflite::testing::kGolden4Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} + +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel4) { + const int output_dims_count = 8; + const float input_scale = 1.0f; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + +#pragma Bss(".Xdata") + static int8_t input_quantized[tflite::testing::kInput4Elements]; + static int8_t filter_quantized[tflite::testing::kFilter4Elements]; + static int32_t bias_quantized[tflite::testing::kBias4Elements]; + static int8_t output_data[output_dims_count]; +#pragma Bss() + + int8_t golden_quantized[tflite::testing::kOutput4Elements]; + int zero_points[tflite::testing::kBias4Elements + 1]; + float scales[tflite::testing::kBias4Elements + 1]; + + tflite::testing::TestConvQuantizedPerChannel( + tflite::testing::kInput4Shape, tflite::testing::kInput4Data, + input_quantized, input_scale, input_zero_point, + tflite::testing::kFilter4Shape, tflite::testing::kFilter4Data, + filter_quantized, tflite::testing::kBias4Shape, tflite::testing::kBias4Data, + bias_quantized, scales, zero_points, tflite::testing::kOutput4Shape, + tflite::testing::kGolden4Data, golden_quantized, output_data, output_scale, + output_zero_point, &tflite::testing::common_conv_params); +} TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc index 8b79885a8a8..fb9dd46c1e4 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc @@ -106,87 +106,6 @@ TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data, return kTfLiteOk; } -void TestDepthwiseConvFloat(const int* input_dims_data, const float* input_data, - const int* filter_dims_data, - const float* filter_data, const int* bias_dims_data, - const float* bias_data, - const float* expected_output_data, - const int* output_dims_data, - TfLiteFusedActivation activation, - 
float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(filter_data, filter_dims, "filter_tensor"), - CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - ValidateDepthwiseConvGoldens(expected_output_data, output_dims_count, - activation, 1e-5, tensors_size, tensors); -} - -void TestDepthwiseConvQuantizedPerLayer( - const int* input_dims_data, const float* input_data, - uint8_t* input_quantized, float input_scale, int input_zero_point, - const int* filter_dims_data, const float* filter_data, - uint8_t* filter_quantized, float filter_scale, int filter_zero_point, - const int* bias_dims_data, const float* bias_data, int32_t* bias_quantized, - const float* golden, uint8_t* golden_quantized, const int* output_dims_data, - uint8_t* output_data, float output_scale, int output_zero_point, - TfLiteFusedActivation activation) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* filter_dims = IntArrayFromInts(filter_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - tflite::testing::CreateQuantizedTensor(input_data, input_quantized, - input_dims, input_scale, - input_zero_point, "input_tensor"), - tflite::testing::CreateQuantizedTensor( - filter_data, filter_quantized, filter_dims, filter_scale, - filter_zero_point, "filter_tensor"), - tflite::testing::CreateQuantizedBiasTensor(bias_data, bias_quantized, - bias_dims, input_scale, - filter_scale, "bias_tensor"), - tflite::testing::CreateQuantizedTensor(output_data, output_dims, - output_scale, output_zero_point, - "output_tensor"), - }; - - // TODO(njeff): Affine Quantization Params should be set on tensor creation. 
- float filter_scales[] = {1, filter_scale}; - int filter_zero_points[] = {1, 128}; - TfLiteAffineQuantization filter_quant = { - FloatArrayFromFloats(filter_scales), - IntArrayFromInts(filter_zero_points)}; - tensors[1].quantization = {kTfLiteAffineQuantization, &filter_quant}; - - float bias_scales[] = {1, filter_scale * input_scale}; - int bias_zero_points[] = {1, 128}; - TfLiteAffineQuantization bias_quant = {FloatArrayFromFloats(bias_scales), - IntArrayFromInts(bias_zero_points)}; - tensors[2].quantization = {kTfLiteAffineQuantization, &bias_quant}; - - AsymmetricQuantize(golden, golden_quantized, output_dims_count, output_scale, - output_zero_point); - ValidateDepthwiseConvGoldens(golden_quantized, output_dims_count, activation, - 1.0, tensors_size, tensors); -} - void TestDepthwiseConvQuantizedPerChannel( const int* input_dims_data, const float* input_data, int8_t* input_quantized, float input_scale, int input_zero_point, @@ -263,183 +182,29 @@ void TestDepthwiseConvQuantizedPerChannel( TF_LITE_MICRO_TESTS_BEGIN -TF_LITE_MICRO_TEST(SimpleTest) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 71, -34, 99, -20, 91, -26, 127, -4, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; - const int output_dims_count = 8; - float output_data[output_dims_count]; - tflite::testing::TestDepthwiseConvFloat( - input_shape, input_values, filter_shape, filter_values, bias_shape, - bias_values, golden, output_shape, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantized) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 71, -34, 99, -20, 91, -26, 127, -4, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; - - const float input_scale = 0.5f; - const int input_zero_point = 128; - const float filter_scale = 0.5f; - const int filter_zero_point = 128; - const float output_scale = 1.0f; - const int output_zero_point = 128; - - uint8_t input_quantized[input_elements]; - uint8_t filter_quantized[filter_elements]; - int32_t bias_quantized[bias_elements]; - uint8_t golden_quantized[output_elements]; - uint8_t output_data[output_elements]; - - tflite::testing::TestDepthwiseConvQuantizedPerLayer( - input_shape, input_values, input_quantized, input_scale, input_zero_point, - filter_shape, filter_values, filter_quantized, filter_scale, - filter_zero_point, bias_shape, bias_values, bias_quantized, golden, - golden_quantized, output_shape, output_data, output_scale, - output_zero_point, kTfLiteActNone); -} - -TF_LITE_MICRO_TEST(SimpleTestRelu) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - 
const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const int output_shape[] = {4, 1, 2, 1, 4}; - const int output_dims_count = 8; - const float golden_relu[] = {71, 0, 99, 0, 91, 0, 127, 0}; - float output_data[output_dims_count]; - - tflite::testing::TestDepthwiseConvFloat( - input_shape, input_values, filter_shape, filter_values, bias_shape, - bias_values, golden_relu, output_shape, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestReluQuantized) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const int output_shape[] = {4, 1, 2, 1, 4}; - const int output_dims_count = 8; - const float golden_relu[] = {71, 0, 99, 0, 91, 0, 127, 0}; - - const float input_scale = 0.5f; - const int input_zero_point = 128; - const float filter_scale = 0.5f; - const int filter_zero_point = 128; - const float output_scale = 1.0f; - const int output_zero_point = 128; - - uint8_t input_quantized[input_elements]; - uint8_t filter_quantized[filter_elements]; - int32_t bias_quantized[bias_elements]; - uint8_t golden_quantized[output_elements]; - uint8_t output_data[output_elements]; - - tflite::testing::TestDepthwiseConvQuantizedPerLayer( - input_shape, input_values, input_quantized, input_scale, input_zero_point, - filter_shape, filter_values, filter_quantized, filter_scale, - filter_zero_point, bias_shape, bias_values, bias_quantized, golden_relu, - golden_quantized, output_shape, output_data, output_scale, - output_zero_point, kTfLiteActRelu); -} - -TF_LITE_MICRO_TEST(SimpleTestOptimizedFilterWidth) { - const int input_elements = 12; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const float bias_values[] = {1, 2, 3, 4}; - const int output_dims_count = 9; - const int input_shape[] = {4, 1, 1, 9, 1}; - const int filter_shape[] = {4, 2, 1, 8, 1}; - const int bias_shape[] = {1, 1}; - const float goldens[] = { - 92, 56, 12, 22, 33, 72, 44, 20, 5, - }; - const int output_shape[] = {4, 1, 1, 9, 1}; - - const float input_scale = 1.0f; - const int input_zero_point = 128; - const float filter_scale = 0.5f; - const int filter_zero_point = 128; - const float output_scale = 1.0f; - const int output_zero_point = 128; - - uint8_t input_quantized[input_elements]; - uint8_t filter_quantized[filter_elements]; - int32_t bias_quantized[bias_elements]; - uint8_t golden_quantized[output_dims_count]; - uint8_t output_data[output_dims_count]; - - tflite::testing::TestDepthwiseConvQuantizedPerLayer( - input_shape, input_values, input_quantized, input_scale, input_zero_point, - filter_shape, filter_values, filter_quantized, filter_scale, - filter_zero_point, bias_shape, bias_values, bias_quantized, goldens, - golden_quantized, output_shape, output_data, output_scale, - 
output_zero_point, kTfLiteActNone); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 71, -34, 99, -20, 91, -26, 127, -4, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; - const int output_dims_count = 8; +// Test group 1 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel1) { + const int input_elements = 20; + const int input_shape[] = {4, 1, 5, 2, 2}; + const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + const int filter_elements = 36; + const int filter_shape[] = {4, 2, 3, 3, 2}; + const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; + const int bias_elements = 2; + const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 20; + const float bias_values[] = {2, 2}; + const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + const int output_shape[] = {4, 1, 5, 2, 2}; + const int output_dims_count = 20; int8_t output_data[output_dims_count]; - const float input_scale = 0.5; + const float input_scale = 1.0; const float output_scale = 1.0f; const int input_zero_point = 0; const int output_zero_point = 0; @@ -458,28 +223,188 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { output_scale, output_zero_point, kTfLiteActNone); } -TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelDepthMultiplier1) { - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 8; - const int filter_shape[] = {4, 1, 2, 2, 2}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12}; +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel1) { + const int input_elements = 20; + const int input_shape[] = {4, 1, 5, 2, 2}; + const int filter_elements = 36; + const int filter_shape[] = {4, 2, 3, 3, 2}; const int bias_elements = 2; const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 20; + const int output_shape[] = {4, 1, 5, 2, 2}; + const int output_dims_count = 20; + +#pragma Bss(".Zdata") + const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; + const float bias_values[] = {2, 2}; + int8_t output_data[output_dims_count]; +#pragma Bss() + + const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, 
input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +// Test group 2 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel2) { + const int input_elements = 80; + const int input_shape[] = {4, 1, 20, 2, 2}; + const float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + const int filter_elements = 36; + const int filter_shape[] = {4, 2, 3, 3, 2}; + const float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; + const int bias_elements = 2; + const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 80; + const float bias_values[] = {2, 2}; + const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 34, 34, 34, 34}; + const int output_shape[] = {4, 1, 20, 2, 2}; + const int output_dims_count = 80; + int8_t output_data[output_dims_count]; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel2) { + const int input_elements = 80; + const int input_shape[] = {4, 1, 20, 2, 2}; + const int filter_elements = 36; + const int filter_shape[] = {4, 2, 3, 3, 2}; + const int bias_elements = 2; + const int bias_shape[] = {4, 1, 1, 1, 2}; + const int output_elements = 80; + const int output_shape[] = {4, 1, 20, 2, 2}; + const int output_dims_count = 80; + +#pragma Bss(".Zdata") + float input_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}; + float filter_values[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2}; + float bias_values[] = {2, 2}; + int8_t output_data[output_dims_count]; +#pragma Bss() + + const float golden[] = {34, 34, 34, 34, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 
50, 50, 50, 50, 34, 34, 34, 34}; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} + +// Test group 3 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel3) { + const int input_elements = 40; + const int input_shape[] = {4, 1, 2, 2, 10}; + const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int filter_elements = 90; + const int filter_shape[] = {4, 1, 3, 3, 10}; + const float filter_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int bias_elements = 1; + const int bias_shape[] = {4, 1, 1, 1, 1}; const int output_elements = 4; - const float bias_values[] = {1, 2}; - const float golden[] = { - -103, - 127, - -128, - 127, - }; - const int output_shape[] = {4, 1, 2, 1, 2}; + const float bias_values[] = {1}; + const float golden[] = {41, 41, 41, 41}; + const int output_shape[] = {4, 1, 2, 2, 1}; const int output_dims_count = 4; int8_t output_data[output_dims_count]; - const float input_scale = 1.0f; + const float input_scale = 1.0; const float output_scale = 1.0f; const int input_zero_point = 0; const int output_zero_point = 0; @@ -498,30 +423,41 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannelDepthMultiplier1) { output_scale, output_zero_point, kTfLiteActNone); } -TF_LITE_MICRO_TEST(TestQuantizedPerChannelDepthMultiplier1Relu6) { - const int input_elements = 24; - const int input_shape[] = {4, 1, 3, 2, 4}; - const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {0, 1, 8, -2, -1, 2, -10, 0, - -1, 3, -18, 0, 0, 4, 20, -3}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 0, 6, 3, 0, 0, 6, 3, 0, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; - int8_t output_data[output_elements]; - float output_float[output_elements]; +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel3) { + const int input_elements = 40; + const int input_shape[] = {4, 1, 2, 2, 10}; + const int filter_elements = 90; + const int filter_shape[] = {4, 1, 3, 3, 10}; + const int bias_elements = 1; + const int bias_shape[] = {4, 1, 1, 1, 1}; + const int output_elements = 4; + const int output_shape[] = {4, 1, 2, 2, 1}; + const int output_dims_count = 4; - const float input_scale = 0.023529f; - const float output_scale = 0.023529f; - const int input_zero_point = -128; - const int output_zero_point = -128; +#pragma 
Bss(".Zdata") + float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + float filter_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + float bias_values[] = {1}; + int8_t output_data[output_dims_count]; +#pragma Bss() + + const float golden[] = {41, 41, 41, 41}; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; int8_t input_quantized[input_elements]; int8_t filter_quantized[filter_elements]; @@ -530,239 +466,115 @@ TF_LITE_MICRO_TEST(TestQuantizedPerChannelDepthMultiplier1Relu6) { int zero_points[bias_elements + 1]; float scales[bias_elements + 1]; - tflite::testing::TestDepthwiseConvFloat( - input_shape, input_values, filter_shape, filter_values, bias_shape, - bias_values, golden, output_shape, kTfLiteActRelu6, output_float); - tflite::testing::TestDepthwiseConvQuantizedPerChannel( input_shape, input_values, input_quantized, input_scale, input_zero_point, filter_shape, filter_values, filter_quantized, bias_shape, bias_values, bias_quantized, output_shape, golden, golden_quantized, output_data, - output_scale, output_zero_point, kTfLiteActRelu6); -} - -TF_LITE_MICRO_TEST(TestQuantizedPerChannelCompareWithFloat) { - const int input_dims[] = {4, 1, 2, 3, 2}; - const float input_data[] = {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4}; - const int filter_dims[] = {4, 1, 2, 2, 4}; - const float filter_data[] = {1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 5, 6, 3, 4, 1, 2}; - const int bias_dims[] = {4, 1, 1, 1, 4}; - const float bias_data[] = {3, -2, 4, 6}; - const int output_dims[] = {4, 1, 1, 2, 4}; - const float golden[] = {43, 48, 18, 22, 3, -4, -28, -36}; - - const int input_size = 12; - const int filter_size = 16; - const int output_size = 8; - const int bias_size = 4; - int8_t input_quantized[input_size]; - int8_t filter_quantized[filter_size]; - int32_t bias_quantized[bias_size]; - int8_t golden_quantized[output_size]; - int zero_points[bias_size + 1]; - float scales[bias_size + 1]; - int8_t output_data[output_size]; - float output_float[output_size]; - - const float input_scale = 0.5; - const float output_scale = 1.0; - const int input_zero_point = 0; - const int output_zero_point = 0; - - tflite::testing::TestDepthwiseConvQuantizedPerChannel( - input_dims, input_data, input_quantized, input_scale, input_zero_point, - filter_dims, filter_data, filter_quantized, bias_dims, bias_data, - bias_quantized, output_dims, golden, golden_quantized, output_data, output_scale, output_zero_point, kTfLiteActNone); - - tflite::testing::TestDepthwiseConvFloat( - input_dims, input_data, filter_dims, filter_data, bias_dims, bias_data, - golden, output_dims, kTfLiteActNone, output_float); } -TF_LITE_MICRO_TEST(FilterDimsNotMatchingAffineQuantization) { - const int input_shape[] = {4, 1, 2, 3, 2}; - const float input_data[] = {3, 2, 1, -1, -2, -3, 4, 3, 2, -2, -3, -4}; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_data[] = {1, 2, 3, 4, 3, 4, 5, 6, 7, 8, 5, 6, 3, 4, 1, 2}; - const int bias_shape[] = {4, 1, 1, 1, 4}; - const float bias_data[] = {3, -2, 4, 6}; - const int output_shape[] = {4, 1, 1, 2, 4}; - const float golden[] = {43, 48, 18, 22, 3, -4, -28, -36}; - - const 
int input_size = 12; - const int filter_size = 16; - const int output_size = 8; - const int bias_size = 4; - int8_t input_quantized[input_size]; - int8_t filter_quantized[filter_size]; - int32_t bias_quantized[bias_size]; - int8_t golden_quantized[output_size]; - int zero_points[bias_size + 1]; - float scales[bias_size + 1]; - int8_t output_data[output_size]; - float output_float[output_size]; - - const float input_scale = 0.5; - const float output_scale = 1.0; - const int input_zero_point = 0; - const int output_zero_point = 0; - - TfLiteIntArray* input_dims = tflite::testing::IntArrayFromInts(input_shape); - TfLiteIntArray* filter_dims = tflite::testing::IntArrayFromInts(filter_shape); - TfLiteIntArray* bias_dims = tflite::testing::IntArrayFromInts(bias_shape); - TfLiteIntArray* output_dims = tflite::testing::IntArrayFromInts(output_shape); - - int filter_zero_points[5]; - float filter_scales[5]; - TfLiteAffineQuantization filter_quant; - TfLiteAffineQuantization bias_quant; - TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( - input_data, input_quantized, input_dims, input_scale, input_zero_point, - "input_tensor"); - TfLiteTensor filter_tensor = - tflite::testing::CreateSymmetricPerChannelQuantizedTensor( - filter_data, filter_quantized, filter_dims, filter_scales, - filter_zero_points, &filter_quant, 0 /* quantized dimension */, - "filter_tensor"); - TfLiteTensor bias_tensor = - tflite::testing::CreatePerChannelQuantizedBiasTensor( - bias_data, bias_quantized, bias_dims, input_scale, &filter_scales[1], - scales, zero_points, &bias_quant, 0, "bias_tensor"); - TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( - output_data, output_dims, output_scale, output_zero_point, - "output_tensor"); - - float input_scales[] = {1, input_scale}; - int input_zero_points[] = {1, input_zero_point}; - TfLiteAffineQuantization input_quant = { - tflite::testing::FloatArrayFromFloats(input_scales), - tflite::testing::IntArrayFromInts(input_zero_points)}; - input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - input_tensor, - filter_tensor, - bias_tensor, - output_tensor, - }; - - // Set filter quant to mismatched dimension. - TfLiteAffineQuantization* quant = reinterpret_cast<TfLiteAffineQuantization*>( - filter_tensor.quantization.params); - quant->scale->size = 2; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( - golden_quantized, output_size, kTfLiteActNone, 1e-5, - tensors_size, tensors)); - - // Set scale back to correct dimension, and make zero point array too short.
- quant->scale->size = filter_shape[0]; - quant->zero_point->size = 2; - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, tflite::testing::ValidateDepthwiseConvGoldens( - golden_quantized, output_size, kTfLiteActNone, 1e-5, - tensors_size, tensors)); -} - -TF_LITE_MICRO_TEST(PerChannelBroadcastQuantizationParams) { - const float input_scale = 1.0f; - const float filter_scale = 1.0f; - const float output_scale = 1.0f; - - const int input_elements = 12; - const int input_shape[] = {4, 1, 3, 2, 2}; - const float input_values[] = {1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12}; - const int filter_elements = 16; - const int filter_shape[] = {4, 1, 2, 2, 4}; - const float filter_values[] = {1, 2, 3, 4, -9, 10, -11, 12, - 5, 6, 7, 8, 13, -14, 15, -16}; - const int bias_elements = 4; - const int bias_shape[] = {4, 1, 1, 1, 4}; +// Test group 4 +TF_LITE_MICRO_TEST(SystemTestQuantizedPerChannel4) { + const int input_elements = 80; + const int input_shape[] = {4, 1, 4, 2, 10}; + const float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int filter_elements = 90; + const int filter_shape[] = {4, 1, 3, 3, 10}; + const float filter_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int bias_elements = 1; + const int bias_shape[] = {4, 1, 1, 1, 1}; const int output_elements = 8; - const float bias_values[] = {1, 2, 3, 4}; - const float golden[] = { - 71, -34, 99, -20, 91, -26, 127, -4, - }; - const int output_shape[] = {4, 1, 2, 1, 4}; + const float bias_values[] = {1}; + const float golden[] = {41, 41, 61, 61, 61, 61, 41, 41}; + const int output_shape[] = {4, 1, 4, 2, 1}; const int output_dims_count = 8; int8_t output_data[output_dims_count]; + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + int8_t input_quantized[input_elements]; int8_t filter_quantized[filter_elements]; int32_t bias_quantized[bias_elements]; int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; - TfLiteIntArray* input_dims = tflite::testing::IntArrayFromInts(input_shape); - TfLiteIntArray* filter_dims = tflite::testing::IntArrayFromInts(filter_shape); - TfLiteIntArray* bias_dims = tflite::testing::IntArrayFromInts(bias_shape); - TfLiteIntArray* output_dims = tflite::testing::IntArrayFromInts(output_shape); - - // Create per-layer quantized int8 input tensor. - TfLiteTensor input_tensor = tflite::testing::CreateQuantizedTensor( - input_values, input_quantized, input_dims, input_scale, 0, - "input_tensor"); - int input_zero_points[2] = {1, 0}; - float input_scales[2] = {1, input_scale}; - TfLiteAffineQuantization input_quant = { - tflite::testing::FloatArrayFromFloats(input_scales), - tflite::testing::IntArrayFromInts(input_zero_points)}; - input_tensor.quantization = {kTfLiteAffineQuantization, &input_quant}; - - // Create per-layer quantized int8 filter tensor. 
- TfLiteTensor filter_tensor = tflite::testing::CreateQuantizedTensor( - filter_values, filter_quantized, filter_dims, filter_scale, 0, - "filter_tensor"); - int filter_zero_points[2] = {1, 0}; - float filter_scales[2] = {1, filter_scale}; - TfLiteAffineQuantization filter_quant = { - tflite::testing::FloatArrayFromFloats(filter_scales), - tflite::testing::IntArrayFromInts(filter_zero_points)}; - filter_tensor.quantization = {kTfLiteAffineQuantization, &filter_quant}; - - // Create per-layer quantized int32 bias tensor. - tflite::SymmetricQuantize(bias_values, bias_quantized, bias_elements, - input_scale * output_scale); - TfLiteTensor bias_tensor = tflite::testing::CreateInt32Tensor( - bias_quantized, bias_dims, "bias_tensor"); - - int bias_zero_points[2] = {1, 0}; - float bias_scales[2] = {1, input_scale * filter_scale}; - TfLiteAffineQuantization bias_quant = { - tflite::testing::FloatArrayFromFloats(bias_scales), - tflite::testing::IntArrayFromInts(bias_zero_points)}; - bias_tensor.quantization = {kTfLiteAffineQuantization, &bias_quant}; - - // Create per-layer quantized int8 output tensor. - TfLiteTensor output_tensor = tflite::testing::CreateQuantizedTensor( - output_data, output_dims, output_scale, 0, "output_tensor"); - int output_zero_points[2] = {1, 0}; - float output_scales[2] = {1, output_scale}; - TfLiteAffineQuantization output_quant = { - tflite::testing::FloatArrayFromFloats(output_scales), - tflite::testing::IntArrayFromInts(output_zero_points)}; - output_tensor.quantization = {kTfLiteAffineQuantization, &output_quant}; - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - input_tensor, - filter_tensor, - bias_tensor, - output_tensor, - }; - - tflite::AsymmetricQuantize(golden, golden_quantized, output_dims_count, - output_scale, 0); - - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, tflite::testing::ValidateDepthwiseConvGoldens( - golden_quantized, output_dims_count, kTfLiteActNone, 1e-5, - tensors_size, tensors)); + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); } +TF_LITE_MICRO_TEST(LocalTestQuantizedPerChannel4) { + const int input_elements = 80; + const int input_shape[] = {4, 1, 4, 2, 10}; + const int filter_elements = 90; + const int filter_shape[] = {4, 1, 3, 3, 10}; + const int bias_elements = 1; + const int bias_shape[] = {4, 1, 1, 1, 1}; + const int output_elements = 8; + const int output_shape[] = {4, 1, 4, 2, 1}; + const int output_dims_count = 8; + +#pragma Bss(".Zdata") + float input_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + float filter_values[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + float bias_values[] = {1}; + int8_t output_data[output_dims_count]; +#pragma Bss() + + const float golden[] = {41, 41, 61, 
61, 61, 61, 41, 41}; + + const float input_scale = 1.0; + const float output_scale = 1.0f; + const int input_zero_point = 0; + const int output_zero_point = 0; + + int8_t input_quantized[input_elements]; + int8_t filter_quantized[filter_elements]; + int32_t bias_quantized[bias_elements]; + int8_t golden_quantized[output_elements]; + int zero_points[bias_elements + 1]; + float scales[bias_elements + 1]; + + tflite::testing::TestDepthwiseConvQuantizedPerChannel( + input_shape, input_values, input_quantized, input_scale, input_zero_point, + filter_shape, filter_values, filter_quantized, bias_shape, bias_values, + bias_quantized, output_shape, golden, golden_quantized, output_data, + output_scale, output_zero_point, kTfLiteActNone); +} TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc index 539c7ecc3a4..78cb2873c54 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc @@ -25,74 +25,6 @@ namespace tflite { namespace testing { namespace { -void TestFullyConnectedFloat( - const int* input_dims_data, const float* input_data, - const int* weights_dims_data, const float* weights_data, - const int* bias_dims_data, const float* bias_data, - const float* expected_output_data, const int* output_dims_data, - TfLiteFusedActivation activation, float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); - TfLiteIntArray* weights_dims = IntArrayFromInts(weights_dims_data); - TfLiteIntArray* bias_dims = IntArrayFromInts(bias_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 3; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(weights_data, weights_dims, "weights_tensor"), - CreateFloatTensor(bias_data, bias_dims, "bias_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - TfLiteContext context; - PopulateContext(tensors, tensors_size, micro_test::reporter, &context); - ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - - TfLiteFullyConnectedParams builtin_data = { - activation, - kTfLiteFullyConnectedWeightsFormatDefault, - }; - const char* init_data = reinterpret_cast<const char*>(&builtin_data); - size_t init_data_size = 0; - void* user_data = nullptr; - if (registration->init) { - user_data = registration->init(&context, init_data, init_data_size); - } - int inputs_array_data[] = {3, 0, 1, 2}; - TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); - int outputs_array_data[] = {1, 3}; - TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); - int temporaries_array_data[] = {0}; - TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); - - TfLiteNode node; - node.inputs = inputs_array; - node.outputs = outputs_array; - node.temporaries = temporaries_array; - node.user_data = user_data; - node.builtin_data = reinterpret_cast<void*>(&builtin_data); - node.custom_initial_data = nullptr; - node.custom_initial_data_size = 0; - node.delegate = nullptr; - if
(registration->prepare) { - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); - } - TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); - if (registration->free) { - registration->free(&context, user_data); - } - for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], 1e-5f); - } -} - template <typename T> void TestFullyConnectedQuantized( const int* input_dims_data, const T* input_data, const float input_min, @@ -121,6 +53,10 @@ void TestFullyConnectedQuantized( output_min, output_max), }; + tensors[0].params.zero_point = 0; + tensors[1].params.zero_point = 0; + tensors[3].params.zero_point = 0; + TfLiteContext context; PopulateContext(tensors, tensors_size, micro_test::reporter, &context); @@ -176,466 +112,23 @@ void TestFullyConnectedQuantized( TF_LITE_MICRO_TESTS_BEGIN -TF_LITE_MICRO_TEST(SimpleTest) { - const int input_dims_data[] = {2, 2, 10}; - const float input_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 - 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 - }; - const int weights_dims_data[] = {2, 3, 10}; - const float weights_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 - }; - const int bias_dims_data[] = {1, 3}; - const float bias_data[] = {1, 2, 3}; - const float expected_output_data[] = { - 24, 25, 26, 58, 59, 60, - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - float output_data[output_dims_count]; - tflite::testing::TestFullyConnectedFloat( - input_dims_data, input_data, weights_dims_data, weights_data, - bias_dims_data, bias_data, expected_output_data, output_dims_data, - kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTest2) { - const int input_dims_data[] = {2, 2, 2}; - const float input_data[] = { - 1, 2, // b = 0 - 2, 1, // b = 1 - }; - const int weights_dims_data[] = {2, 1, 2}; - const float weights_data[] = { - 2, 4, // u = 0 - }; - const int bias_dims_data[] = {1, 1}; - const float bias_data[] = {1}; - const float expected_output_data[] = { - 11, - 9, - }; - const int output_dims_data[] = {2, 2, 1}; - - const int output_dims_count = 6; - float output_data[output_dims_count]; - tflite::testing::TestFullyConnectedFloat( - input_dims_data, input_data, weights_dims_data, weights_data, - bias_dims_data, bias_data, expected_output_data, output_dims_data, - kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestRelu) { - const int input_dims_data[] = {2, 2, 10}; - const float input_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 - 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 - }; - const int weights_dims_data[] = {2, 3, 10}; - const float weights_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 - -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, // u = 1 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 - }; - const int bias_dims_data[] = {1, 3}; - const float bias_data[] = {1, -2, 3}; - const float expected_output_data[] = { - 24, 0, 26, 58, 0, 60, - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - float output_data[output_dims_count]; - tflite::testing::TestFullyConnectedFloat( - input_dims_data, input_data, weights_dims_data, weights_data, - bias_dims_data, bias_data, expected_output_data, output_dims_data, - kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8) { - using tflite::testing::F2Q; - using
tflite::testing::F2Q32; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -63.5f; - const float weights_max = 64.0f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {2, 2, 10}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -// TODO(b/138811455): Fix code duplication in micro tests -TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -64.0f; - const float weights_max = 63.5f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {2, 2, 10}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, 
input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - int8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8Relu) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -63.5f; - const float weights_max = 64.0f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {2, 2, 10}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, 
input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(-1, weights_min, weights_max), F2Q(-2, weights_min, weights_max), - F2Q(-3, weights_min, weights_max), F2Q(-4, weights_min, weights_max), - F2Q(-5, weights_min, weights_max), F2Q(-6, weights_min, weights_max), - F2Q(-7, weights_min, weights_max), F2Q(-8, weights_min, weights_max), - F2Q(-9, weights_min, weights_max), F2Q(-10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(0, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(0, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(0, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8Relu) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -64.0f; - const float weights_max = 63.5f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {2, 2, 10}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - 
F2QS(-1, weights_min, weights_max), F2QS(-2, weights_min, weights_max), - F2QS(-3, weights_min, weights_max), F2QS(-4, weights_min, weights_max), - F2QS(-5, weights_min, weights_max), F2QS(-6, weights_min, weights_max), - F2QS(-7, weights_min, weights_max), F2QS(-8, weights_min, weights_max), - F2QS(-9, weights_min, weights_max), F2QS(-10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(0, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(0, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(0, output_min, output_max), F2QS(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - int8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedUInt8OutputMultiplierGreaterThan1) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -127.0f; - const float input_max = 128.0f; - const float weights_min = -127.0f; - const float weights_max = 128.0f; - const float bias_scale = 1.0f; - const float output_min = -63.5f; - const float output_max = 64.0f; - - const int input_dims_data[] = {2, 2, 10}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, 
weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8OutputMultiplierGreaterThan1) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -127.0f; - const float input_max = 128.0f; +// Test group 1 +TF_LITE_MICRO_TEST(SystemSimpleTestQuantized1) { + const float input_min = -128.0f; + const float input_max = 127.0f; const float weights_min = -128.0f; const float weights_max = 127.0f; const float bias_scale = 1.0f; - const float output_min = -63.5f; - const float output_max = 64.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; const int input_dims_data[] = {2, 2, 10}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; + const int8_t input_data[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, 
weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; + const int8_t weights_data[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), - }; + const int32_t bias_data[] = {1,1,1}; + const int8_t expected_output_data[] = {41,41,41,41,41,41}; const int output_dims_data[] = {2, 2, 3}; const int output_dims_count = 6; @@ -647,292 +140,273 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedInt8OutputMultiplierGreaterThan1) { output_max, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(SimpleTest4DInput) { - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const float input_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 - 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 - }; - const int weights_dims_data[] = {2, 3, 10}; - const float weights_data[] = { - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 - }; - const int bias_dims_data[] = {1, 3}; - const float bias_data[] = {1, 2, 3}; - const float expected_output_data[] = { - 24, 25, 26, 58, 59, 60, // Expected results. - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - float output_data[output_dims_count]; - tflite::testing::TestFullyConnectedFloat( - input_dims_data, input_data, weights_dims_data, weights_data, - bias_dims_data, bias_data, expected_output_data, output_dims_data, - kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedUInt8) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -63.5f; - const float weights_max = 64.0f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, 
weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -63.5f; - const float input_max = 64.0f; - const float weights_min = -64.0f; - const float weights_max = 63.5f; - const float bias_scale = 0.25f; - const float output_min = -127.0f; - const float output_max = 128.0f; - - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, 
weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const int output_dims_count = 6; - int8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST( - SimpleTest4DInputQuantizedUInt8OutputMultiplierGreaterThan1) { - using tflite::testing::F2Q; - using tflite::testing::F2Q32; - - const float input_min = -127.0f; - const float input_max = 128.0f; - const float weights_min = -127.0f; - const float weights_max = 128.0f; - const float bias_scale = 1.0f; - const float output_min = -63.5f; - const float output_max = 64.0f; - - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const uint8_t input_data[] = { - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(8, input_min, input_max), - F2Q(-9, input_min, input_max), F2Q(-10, input_min, input_max), - F2Q(1, input_min, input_max), F2Q(2, input_min, input_max), - F2Q(3, input_min, input_max), F2Q(4, input_min, input_max), - F2Q(5, input_min, input_max), F2Q(6, input_min, input_max), - F2Q(7, input_min, input_max), F2Q(-8, input_min, input_max), - F2Q(9, input_min, input_max), F2Q(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const uint8_t weights_data[] = { - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - F2Q(1, weights_min, weights_max), F2Q(2, weights_min, weights_max), - F2Q(3, weights_min, weights_max), F2Q(4, weights_min, weights_max), - F2Q(5, weights_min, weights_max), F2Q(6, weights_min, weights_max), - F2Q(7, weights_min, weights_max), F2Q(8, weights_min, weights_max), - F2Q(9, weights_min, weights_max), F2Q(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const uint8_t expected_output_data[] = { - F2Q(24, output_min, output_max), F2Q(25, output_min, output_max), - F2Q(26, output_min, output_max), F2Q(58, output_min, output_max), - F2Q(59, output_min, output_max), F2Q(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; - - const 
int output_dims_count = 6; - uint8_t output_data[output_dims_count]; - tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleTest4DInputQuantizedInt8OutputMultiplierGreaterThan1) { - using tflite::testing::F2Q32; - using tflite::testing::F2QS; - - const float input_min = -127.0f; - const float input_max = 128.0f; +TF_LITE_MICRO_TEST(LocalSimpleTestQuantized1) { + const float input_min = -128.0f; + const float input_max = 127.0f; const float weights_min = -128.0f; const float weights_max = 127.0f; const float bias_scale = 1.0f; - const float output_min = -63.5f; - const float output_max = 64.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; - const int input_dims_data[] = {4, 1, 1, 5, 1}; - const int8_t input_data[] = { - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(8, input_min, input_max), - F2QS(-9, input_min, input_max), F2QS(-10, input_min, input_max), - F2QS(1, input_min, input_max), F2QS(2, input_min, input_max), - F2QS(3, input_min, input_max), F2QS(4, input_min, input_max), - F2QS(5, input_min, input_max), F2QS(6, input_min, input_max), - F2QS(7, input_min, input_max), F2QS(-8, input_min, input_max), - F2QS(9, input_min, input_max), F2QS(-10, input_min, input_max), - }; - const int weights_dims_data[] = {2, 3, 10}; - const int8_t weights_data[] = { - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - F2QS(1, weights_min, weights_max), F2QS(2, weights_min, weights_max), - F2QS(3, weights_min, weights_max), F2QS(4, weights_min, weights_max), - F2QS(5, weights_min, weights_max), F2QS(6, weights_min, weights_max), - F2QS(7, weights_min, weights_max), F2QS(8, weights_min, weights_max), - F2QS(9, weights_min, weights_max), F2QS(10, weights_min, weights_max), - }; - const int bias_dims_data[] = {1, 3}; - const int32_t bias_data[] = { - F2Q32(1, bias_scale), - F2Q32(2, bias_scale), - F2Q32(3, bias_scale), - }; - const int8_t expected_output_data[] = { - F2QS(24, output_min, output_max), F2QS(25, output_min, output_max), - F2QS(26, output_min, output_max), F2QS(58, output_min, output_max), - F2QS(59, output_min, output_max), F2QS(60, output_min, output_max), - }; - const int output_dims_data[] = {2, 2, 3}; + const int input_dims_data_local[] = {2, 2, 10}; + const int weights_dims_data_local[] = {2, 3, 10}; + const int bias_dims_data_local[] = {1, 3}; + const int output_dims_data_local[] = {2, 2, 3}; const int output_dims_count = 6; - int8_t 
output_data[output_dims_count]; + +#pragma Bss(".Zdata") + const int8_t input_data_local[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; + const int8_t weights_data_local[] = {2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2}; + const int32_t bias_data_local[] = {1,1,1}; + int8_t output_data_local[output_dims_count]; +#pragma Bss() + + const int8_t expected_output_data[] = {41,41,41,41,41,41}; + tflite::testing::TestFullyConnectedQuantized( - input_dims_data, input_data, input_min, input_max, weights_dims_data, - weights_data, weights_min, weights_max, bias_dims_data, bias_data, - bias_scale, expected_output_data, output_dims_data, output_min, - output_max, kTfLiteActNone, output_data); + input_dims_data_local, input_data_local, input_min, input_max, weights_dims_data_local, + weights_data_local, weights_min, weights_max, bias_dims_data_local, bias_data_local, + bias_scale, expected_output_data, output_dims_data_local, output_min, + output_max, kTfLiteActNone, output_data_local); +} + +// Test group 2 +TF_LITE_MICRO_TEST(SystemSimpleTestQuantized2) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_2[] = {2, 10, 4}; + const int8_t input_data_2[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int weights_dims_data_2[] = {2, 6, 4}; + const int8_t weights_data_2[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2}; + const int bias_dims_data_2[] = {1, 6}; + const int32_t bias_data_2[] = {1,1,1,1,1,1}; + const int8_t expected_output_data_2[] = {17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17, + 17,17,17,17,17,17,17,17,17,17}; + const int output_dims_data_2[] = {2, 10, 6}; + + const int output_dims_count_2 = 60; + int8_t output_data_2[output_dims_count_2]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_2, input_data_2, input_min, input_max, weights_dims_data_2, + weights_data_2, weights_min, weights_max, bias_dims_data_2, bias_data_2, + bias_scale, expected_output_data_2, output_dims_data_2, output_min, + output_max, kTfLiteActNone, output_data_2); +} + +TF_LITE_MICRO_TEST(LocalSimpleTestQuantized2) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_local_2[] = {2, 10, 4}; + const int weights_dims_data_local_2[] = {2, 6, 4}; + const int bias_dims_data_local_2[] = {1, 6}; + const int output_dims_data_local_2[] = {2, 10, 6}; + + const int output_dims_count_local_2 = 60; + +#pragma Bss(".Zdata") + const int8_t input_data_local_2[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int8_t weights_data_local_2[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2}; + const int32_t bias_data_local_2[] = {1,1,1,1,1,1}; + int8_t output_data_local_2[output_dims_count_local_2]; +#pragma Bss() + + const int8_t expected_output_data_local_2[] = {41,41,41,41,41,41}; + + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_local_2, input_data_local_2, input_min, input_max, 
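// Note: the Local* variants of these slicing tests use the same shapes and
// constant data as the System* variants; the only difference is that their
// buffers are wrapped in #pragma Bss(".Zdata") blocks, which presumably places
// them in the target's fast local data memory so the ARC MLI kernels are
// exercised with operands in local memory as well as in default system memory.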
weights_dims_data_local_2, + weights_data_local_2, weights_min, weights_max, bias_dims_data_local_2, bias_data_local_2, + bias_scale, expected_output_data_local_2, output_dims_data_local_2, output_min, + output_max, kTfLiteActNone, output_data_local_2); +} + +// Test group 3 +TF_LITE_MICRO_TEST(SystemSimpleTestQuantized3) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_3[] = {2, 2, 5}; + const int8_t input_data_3[] = {2,2,2,2,2,2,2,2,2,2}; + const int weights_dims_data_3[] = {2, 10, 5}; + const int8_t weights_data_3[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int bias_dims_data_3[] = {1, 10}; + const int32_t bias_data_3[] = {1,1,1,1,1,1,1,1,1,1}; + const int8_t expected_output_data_3[] = {21,21,21,21,21,21,21,21,21,21, + 21,21,21,21,21,21,21,21,21,21}; + const int output_dims_data_3[] = {2, 2, 10}; + + const int output_dims_count_3 = 20; + int8_t output_data_3[output_dims_count_3]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_3, input_data_3, input_min, input_max, weights_dims_data_3, + weights_data_3, weights_min, weights_max, bias_dims_data_3, bias_data_3, + bias_scale, expected_output_data_3, output_dims_data_3, output_min, + output_max, kTfLiteActNone, output_data_3); +} + +TF_LITE_MICRO_TEST(LocalSimpleTestQuantized3) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_local_3[] = {2, 2, 5}; + const int weights_dims_data_local_3[] = {2, 10, 5}; + const int bias_dims_data_local_3[] = {1, 10}; + const int output_dims_data_local_3[] = {2, 2, 10}; + + const int output_dims_count_local_3 = 20; + +#pragma Bss(".Zdata") + static int8_t input_data_local_3[10]; + static int8_t weights_data_local_3[50]; + static int32_t bias_data_local_3[10]; + static int8_t output_data_local_3[output_dims_count_local_3]; +#pragma Bss() + + for(int i = 0; i < 10; ++i) { + input_data_local_3[i] = 2; + } + + for(int i = 0; i < 50; ++i) { + weights_data_local_3[i] = 2; + } + + for(int i = 0; i < 10; ++i) { + bias_data_local_3[i] = 1; + } + + for(int i = 0; i < 20; ++i) { + output_data_local_3[i] = 0; + } + + const int8_t expected_output_data_local_3[] = {21,21,21,21,21,21,21,21,21,21, + 21,21,21,21,21,21,21,21,21,21}; + + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_local_3, input_data_local_3, input_min, input_max, weights_dims_data_local_3, + weights_data_local_3, weights_min, weights_max, bias_dims_data_local_3, bias_data_local_3, + bias_scale, expected_output_data_local_3, output_dims_data_local_3, output_min, + output_max, kTfLiteActNone, output_data_local_3); +} + +// Test group 4 +TF_LITE_MICRO_TEST(SystemSimpleTestQuantized4) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_4[] = {2, 5, 10}; + const int8_t input_data_4[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 
2,2,2,2,2,2,2,2,2,2}; + const int weights_dims_data_4[] = {2, 5, 10}; + const int8_t weights_data_4[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int bias_dims_data_4[] = {1, 5}; + const int32_t bias_data_4[] = {1,1,1,1,1}; + const int8_t expected_output_data_4[] = {41,41,41,41,41,41,41,41,41,41, + 41,41,41,41,41,41,41,41,41,41, + 41,41,41,41,41}; + const int output_dims_data_4[] = {2, 5, 5}; + + const int output_dims_count_4 = 25; + int8_t output_data_4[output_dims_count_4]; + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_4, input_data_4, input_min, input_max, weights_dims_data_4, + weights_data_4, weights_min, weights_max, bias_dims_data_4, bias_data_4, + bias_scale, expected_output_data_4, output_dims_data_4, output_min, + output_max, kTfLiteActNone, output_data_4); +} + +TF_LITE_MICRO_TEST(LocalSimpleTestQuantized4) { + const float input_min = -128.0f; + const float input_max = 127.0f; + const float weights_min = -128.0f; + const float weights_max = 127.0f; + const float bias_scale = 1.0f; + const float output_min = -128.0f; + const float output_max = 127.0f; + + const int input_dims_data_local_4[] = {2, 5, 10}; + const int weights_dims_data_local_4[] = {2, 5, 10}; + const int bias_dims_data_local_4[] = {1, 5}; + const int output_dims_data_local_4[] = {2, 5, 5}; + + const int output_dims_count_local_4 = 25; + +#pragma Bss(".Zdata") + const int8_t input_data_local_4[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int8_t weights_data_local_4[] = {2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2}; + const int32_t bias_data_local_4[] = {1,1,1,1,1}; + int8_t output_data_local_4[output_dims_count_local_4]; +#pragma Bss() + + const int8_t expected_output_data_local_4[] = {41,41,41,41,41,41,41,41,41,41, + 41,41,41,41,41,41,41,41,41,41, + 41,41,41,41,41}; + + tflite::testing::TestFullyConnectedQuantized( + input_dims_data_local_4, input_data_local_4, input_min, input_max, weights_dims_data_local_4, + weights_data_local_4, weights_min, weights_max, bias_dims_data_local_4, bias_data_local_4, + bias_scale, expected_output_data_local_4, output_dims_data_local_4, output_min, + output_max, kTfLiteActNone, output_data_local_4); } TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc index 8bfeb718a1b..63737a41791 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc @@ -25,89 +25,20 @@ namespace tflite { namespace testing { namespace { -void TestAveragePoolingFloat(std::initializer_list input_dims_data, - std::initializer_list input_data, - const int filter_height, const int filter_width, - const int stride_height, const int stride_width, - std::initializer_list expected_output_data, - std::initializer_list output_dims_data, - TfLitePadding padding, - TfLiteFusedActivation activation, - float* output_data) { - TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 1; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor 
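// For reference, the golden values in the fully-connected slicing test groups
// above follow directly from the constant data: every input and weight element
// is 2 and every bias element is 1, so each output is 2 * 2 * accum_depth + 1.
// A minimal sketch of that arithmetic (illustrative only, not part of the patch):
//   int expected_fc_output(int accum_depth) { return 2 * 2 * accum_depth + 1; }
// giving 41 for depth 10 (groups 1 and 4), 17 for depth 4 (group 2) and 21 for
// depth 5 (group 3). The [-128, 127] quantization range makes the int8 scale
// exactly 1, so these expected values are represented without rounding error.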
tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - TfLiteContext context; - PopulateContext(tensors, tensors_size, micro_test::reporter, &context); - - ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - - TfLitePoolParams builtin_data = {padding, stride_width, stride_height, - filter_width, filter_height, activation}; - const char* init_data = reinterpret_cast(&builtin_data); - size_t init_data_size = 0; - void* user_data = nullptr; - if (registration->init) { - user_data = registration->init(&context, init_data, init_data_size); - } - int inputs_array_data[] = {1, 0}; - TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); - int outputs_array_data[] = {1, 1}; - TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); - int temporaries_array_data[] = {0}; - TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); - - TfLiteNode node; - node.inputs = inputs_array; - node.outputs = outputs_array; - node.temporaries = temporaries_array; - node.user_data = user_data; - node.builtin_data = reinterpret_cast(&builtin_data); - node.custom_initial_data = nullptr; - node.custom_initial_data_size = 0; - node.delegate = nullptr; - - if (registration->prepare) { - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); - } - TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); - if (registration->free) { - registration->free(&context, user_data); - } - - for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], - 1e-5f); - } -} - template void TestAveragePoolingQuantized( - std::initializer_list input_dims_data, - std::initializer_list input_data, const float input_min, + const int* input_dims_data, + const T* input_data, const float input_min, const float input_max, const int filter_height, const int filter_width, const int stride_height, const int stride_width, - std::initializer_list expected_output_data, - std::initializer_list output_dims_data, float output_min, + const T* expected_output_data, + const int* output_dims_data, float output_min, float output_max, TfLitePadding padding, TfLiteFusedActivation activation, T* output_data) { static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed."); - TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); const int output_dims_count = ElementCount(*output_dims); constexpr int inputs_size = 1; @@ -163,94 +94,25 @@ void TestAveragePoolingQuantized( } for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], - 1e-5f); - } -} - -void TestMaxPoolFloat(std::initializer_list input_dims_data, - std::initializer_list input_data, int filter_width, - int filter_height, int stride_width, int stride_height, - std::initializer_list expected_output_data, - std::initializer_list output_dims_data, - TfLitePadding padding, TfLiteFusedActivation activation, - float* output_data) { - TfLiteIntArray* input_dims = 
IntArrayFromInitializer(input_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); - const int output_dims_count = ElementCount(*output_dims); - - constexpr int inputs_size = 1; - constexpr int outputs_size = 1; - constexpr int tensors_size = inputs_size + outputs_size; - TfLiteTensor tensors[tensors_size] = { - CreateFloatTensor(input_data, input_dims, "input_tensor"), - CreateFloatTensor(output_data, output_dims, "output_tensor"), - }; - - TfLiteContext context; - PopulateContext(tensors, tensors_size, micro_test::reporter, &context); - - ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - - TfLitePoolParams builtin_data = { - padding, stride_width, stride_height, - filter_width, filter_height, activation, - }; - - const char* init_data = reinterpret_cast(&builtin_data); - size_t init_data_size = 0; - void* user_data = nullptr; - if (registration->init) { - user_data = registration->init(&context, init_data, init_data_size); - } - - int inputs_array_data[] = {1, 0}; - TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); - int outputs_array_data[] = {1, 1}; - TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); - int temporaries_array_data[] = {0}; - TfLiteIntArray* temporaries_array = IntArrayFromInts(temporaries_array_data); - - TfLiteNode node; - node.inputs = inputs_array; - node.outputs = outputs_array; - node.temporaries = temporaries_array; - node.user_data = user_data; - node.builtin_data = reinterpret_cast(&builtin_data); - node.custom_initial_data = nullptr; - node.custom_initial_data_size = 0; - node.delegate = nullptr; - if (registration->prepare) { - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); - } - TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); - if (registration->free) { - registration->free(&context, user_data); - } - for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data[i], output_data[i], 1e-5f); } } template -void TestMaxPoolQuantized(std::initializer_list input_dims_data, - std::initializer_list input_data, float input_min, +void TestMaxPoolQuantized(const int* input_dims_data, + const T* input_data, float input_min, float input_max, int filter_width, int filter_height, int stride_width, int stride_height, - std::initializer_list expected_output_data, + const T* expected_output_data, float output_min, float output_max, - std::initializer_list output_dims_data, + const int* output_dims_data, TfLitePadding padding, TfLiteFusedActivation activation, T* output_data) { static_assert(sizeof(T) == 1, "Only int8/uint8 data types allowed."); - TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); - TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); const int output_dims_count = ElementCount(*output_dims); constexpr int inputs_size = 1; @@ -308,7 +170,7 @@ void TestMaxPoolQuantized(std::initializer_list input_dims_data, registration->free(&context, user_data); } for (int i = 0; i < output_dims_count; ++i) { - TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], 
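// In the pooling test helpers, std::initializer_list parameters are replaced
// with plain const pointers (const int*, const T*), matching the
// fully-connected tests above; this lets callers pass named arrays, including
// the ones the Local* tests place in ".Zdata" via #pragma Bss, rather than
// building initializer lists at the call site.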
output_data[i]); + TF_LITE_MICRO_EXPECT_EQ(expected_output_data[i], output_data[i]); } } @@ -319,797 +181,269 @@ void TestMaxPoolQuantized(std::initializer_list input_dims_data, TF_LITE_MICRO_TESTS_BEGIN -TF_LITE_MICRO_TEST(SimpleAveragePoolTestFloat) { - float output_data[2]; - tflite::testing::TestAveragePoolingFloat({4, 1, 2, 4, 1}, // Input shape - { // Input values - 0., 6., 2., 4., 3., 2., 10., 7.}, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 2.75, - 5.75, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActNone, - output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestUint8) { - using tflite::testing::F2Q; - - const float input_min = -15.9375; - const float input_max = 15.9375; - const float output_min = -15.9375; - const float output_max = 15.9375; - uint8_t output_data[2]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0., input_min, input_max), - F2Q(-6., input_min, input_max), - F2Q(2., input_min, input_max), - F2Q(4., input_min, input_max), - F2Q(3., input_min, input_max), - F2Q(2., input_min, input_max), - F2Q(-10., input_min, input_max), - F2Q(7., input_min, input_max), - }, - input_min, input_max, // input quantization range - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - F2Q(0., output_min, output_max), - F2Q(0.75, output_min, output_max), - }, - {4, 1, 1, 2, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2ActNone) { +TF_LITE_MICRO_TEST(SystemAveragePoolTestInt1) { using tflite::testing::F2QS; - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; - int8_t output_data[2]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, - input_min, input_max, // input quantization range - 2, 2, // filter height, filter width - 2, 2, // stride height, stride width - { // Output values - F2QS(-0.25, output_min, output_max), F2QS(0.75, output_min, output_max)}, - {4, 1, 1, 2, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride1Stride2Relu) { - using tflite::testing::F2QS; - - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; int8_t output_data[3]; + + const int kInput1Shape[] = {4, 1, 2, 4, 1}; + const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput1Shape[] = {4, 1, 1, 3, 1}; + const int8_t kGolden1Data[] = {1, 1, 1}; + tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., 
input_min, input_max), F2QS(2., input_min, input_max), - F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, - input_min, input_max, // input quantization range - 2, 2, // filter height, filter width - 2, 1, // stride height, stride width - { // Output values - F2QS(0., output_min, output_max), F2QS(0., output_min, output_max), - F2QS(0.75, output_min, output_max)}, - {4, 1, 1, 3, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2Stride1Relu1) { - using tflite::testing::F2QS; - - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; - int8_t output_data[2]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(0., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(2., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(-10., input_min, input_max), F2QS(7., input_min, input_max)}, - input_min, input_max, // input quantization range - 2, 2, // filter height, filter width - 1, 2, // stride height, stride width - { // Output values - F2QS(-0.25, output_min, output_max), F2QS(0.75, output_min, output_max)}, - {4, 1, 1, 2, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActRelu1, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingValidStride2Relu6) { - using tflite::testing::F2QS; - - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; - int8_t output_data[2]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(3., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(8., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(10., input_min, input_max), F2QS(7., input_min, input_max)}, - input_min, input_max, // input quantization range - 2, 2, // filter height, filter width - 2, 2, // stride height, stride width - { // Output values - F2QS(0.5, output_min, output_max), F2QS(6., output_min, output_max)}, - {4, 1, 1, 2, 1}, // Output shape - output_min, output_max, // output quantization range - kTfLitePaddingValid, kTfLiteActRelu6, output_data); -} - -TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) { - using tflite::testing::F2QS; - - const float input_min = -15.9375; - const float input_max = 15.8130; - const float output_min = -15.9375; - const float output_max = 15.8130; - int8_t output_data[8]; - tflite::testing::TestAveragePoolingQuantized( - {4, 1, 2, 4, 1}, // Input shape - { // Input values - F2QS(3., input_min, input_max), F2QS(-6., input_min, input_max), - F2QS(8., input_min, input_max), F2QS(4., input_min, input_max), - F2QS(3., input_min, input_max), F2QS(2., input_min, input_max), - F2QS(10., input_min, input_max), F2QS(7., input_min, input_max)}, + kInput1Shape, // Input shape + kInput1Data, input_min, input_max, // input quantization range 2, 2, // filter height, filter width 1, 1, // stride height, stride width - { // Output values - F2QS(0.5, output_min, output_max), F2QS(3.5, output_min, output_max), - F2QS(7.25, output_min, output_max), F2QS(5.5, output_min, 
output_max), - F2QS(2.5, output_min, output_max), F2QS(6., output_min, output_max), - F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)}, - {4, 1, 2, 4, 1}, // Output shape + kGolden1Data, + kOutput1Shape, // Output shape output_min, output_max, // output quantization range kTfLitePaddingValid, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) { - float output_data[2]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { // Input values - 0, 6, 2, 4, 3, 2, 10, 7}, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 6, - 10, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActNone, - output_data); -} -TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu) { - float output_data[2]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - -1, -6, 2, 4, // - -3, -2, 10.5, 7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 0.0, - 10.5, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu, - output_data); -} +TF_LITE_MICRO_TEST(LocalAveragePoolTestInt1) { + using tflite::testing::F2QS; -TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu1) { - float output_data[2]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - -2.75, -6, 0.2, 0.4, // - -3, -2, -0.3, 0.7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - -1.0, - 0.7, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu1, - output_data); + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int8_t output_data[3]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - -2.75, -6, -2, -4, // - -3, -2, 10, -7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - -1.0, - 1.0, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu1, - output_data); -} +#pragma Bss(".Zdata") + const int kInput1Shape[] = {4, 1, 2, 4, 1}; + const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput1Shape[] = {4, 1, 1, 3, 1}; + const int8_t kGolden1Data[] = {1, 1, 1}; +#pragma Bss() -TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloatRelu6) { - float output_data[2]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - -1.5, -6, 12, 4, // - -3, -2, 10, 7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 0.0, - 6.0, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, - output_data); - - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - 0, 4.5, 12, 4, // - 3, 2, 10, 7, // - }, - 2, 2, // filter width, filter height - 2, 2, // stride width, stride height - { - // Output values - 4.5, - 6.0, - }, - {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, - output_data); -} - -TF_LITE_MICRO_TEST(SimpleMaxPoolTestPaddingSameStride1) { - float output_data[8]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - 0, 6, 2, 4, // - 3, 2, 10, 7, // - }, - 2, 2, // filter width, filter height - 1, 1, // stride width, stride height - { - // Output values - 6, 10, 10, 7, // - 3, 10, 10, 7, // - }, - {4, 
1, 2, 4, 1}, // Output shape - kTfLitePaddingSame, kTfLiteActNone, - output_data); -} - -TF_LITE_MICRO_TEST(SimpleMaxPoolTestPaddingValidStride1) { - float output_data[3]; - tflite::testing::TestMaxPoolFloat({4, 1, 2, 4, 1}, // Input shape - { - // Input values - 0, 6, 2, 4, // - 3, 2, 10, 7, // - }, - 2, 2, // filter width, filter height - 1, 1, // stride width, stride height - { - // Output values - 6, - 10, - 10, - }, - {4, 1, 1, 3, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActNone, - output_data); -} - -TF_LITE_MICRO_TEST(SimpleMaxPoolTestUInt8ActNone) { - using tflite::testing::F2Q; - - uint8_t output_data[2]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(3, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(6, output_min, output_max), F2Q(10, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + tflite::testing::TestAveragePoolingQuantized( + kInput1Shape, // Input shape + kInput1Data, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 1, // stride height, stride width + kGolden1Data, + kOutput1Shape, // Output shape + output_min, output_max, // output quantization range kTfLitePaddingValid, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu) { - using tflite::testing::F2Q; +// Test group AVG 2 +TF_LITE_MICRO_TEST(SystemAveragePoolTestInt2) { + using tflite::testing::F2QS; - uint8_t output_data[2]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(-1.5, input_min, input_max), - F2Q(-6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(-3, input_min, input_max), - F2Q(-2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(0, output_min, output_max), F2Q(10, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu, output_data); -} + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int8_t output_data[45]; -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu1) { - using tflite::testing::F2Q; + const int kInput2Shape[] = {4, 1, 6, 10, 1}; + const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput2Shape[] = {4, 1, 5, 9, 1}; + const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1}; - uint8_t output_data[2]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(-1.7, input_min, input_max), - F2Q(-6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(-3, input_min, input_max), - F2Q(-2, input_min, input_max), - F2Q(-10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(-1.0, output_min, output_max), F2Q(1.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu1, output_data); -} -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu6) { - using tflite::testing::F2Q; - - uint8_t output_data[8]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(-6, input_min, input_max), - F2Q(12, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(-3, input_min, input_max), - F2Q(-2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(0.0, output_min, output_max), F2Q(6.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, output_data); - - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(4.5, input_min, input_max), - F2Q(12, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(3, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2Q(4.5, output_min, output_max), F2Q(6.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingSameStride1) { - using tflite::testing::F2Q; - - uint8_t output_data[8]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 1; - int stride_height = 1; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(3, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - { - // Output values - F2Q(6, output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(7, output_min, output_max), - F2Q(3, output_min, output_max), - F2Q(10, 
output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(7, output_min, output_max), - }, - output_min, output_max, {4, 1, 2, 4, 1}, // Output shape - kTfLitePaddingSame, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingValidStride1) { - using tflite::testing::F2Q; - - uint8_t output_data[3]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 1; - int stride_height = 1; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2Q(0, input_min, input_max), - F2Q(6, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(4, input_min, input_max), - F2Q(3, input_min, input_max), - F2Q(2, input_min, input_max), - F2Q(10, input_min, input_max), - F2Q(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - { - // Output values - F2Q(6, output_min, output_max), - F2Q(10, output_min, output_max), - F2Q(10, output_min, output_max), - }, - output_min, output_max, {4, 1, 1, 3, 1}, // Output shape + tflite::testing::TestAveragePoolingQuantized( + kInput2Shape, // Input shape + kInput2Data, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 1, // stride height, stride width + kGolden2Data, + kOutput2Shape, // Output shape + output_min, output_max, // output quantization range kTfLitePaddingValid, kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(SimpleMaxPoolTestInt8ActNone) { +TF_LITE_MICRO_TEST(LocalAveragePoolTestInt2) { using tflite::testing::F2QS; - int8_t output_data[2]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(3, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(6, output_min, output_max), F2QS(10, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int8_t output_data[45]; + +#pragma Bss(".Zdata") + const int kInput2Shape[] = {4, 1, 6, 10, 1}; + const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput2Shape[] = {4, 1, 5, 9, 1}; + const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1}; +#pragma Bss() + + tflite::testing::TestAveragePoolingQuantized( + kInput2Shape, // Input shape + kInput2Data, + input_min, input_max, // input quantization range + 2, 2, // filter height, filter width + 1, 1, // stride height, stride width + kGolden2Data, + kOutput2Shape, // Output shape + output_min, output_max, // output quantization range kTfLitePaddingValid, 
kTfLiteActNone, output_data); } -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu) { - using tflite::testing::F2QS; - - int8_t output_data[2]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(-1.5, input_min, input_max), - F2QS(-6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(-3, input_min, input_max), - F2QS(-2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(0, output_min, output_max), F2QS(10, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu1) { - using tflite::testing::F2QS; - - int8_t output_data[2]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(-1.7, input_min, input_max), - F2QS(-6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(-3, input_min, input_max), - F2QS(-2, input_min, input_max), - F2QS(-10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(-1.0, output_min, output_max), F2QS(1.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu1, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8ActRelu6) { - using tflite::testing::F2QS; - - int8_t output_data[8]; - float input_min = -15.9375; - float input_max = 15.9375; - float output_min = -15.9375; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 2; - int stride_height = 2; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(-6, input_min, input_max), - F2QS(12, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(-3, input_min, input_max), - F2QS(-2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(0.0, output_min, output_max), F2QS(6.0, output_min, output_max)}, - output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, output_data); - - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(4.5, input_min, input_max), - F2QS(12, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(3, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - {// Output values - F2QS(4.5, output_min, output_max), F2QS(6.0, output_min, output_max)}, - 
output_min, output_max, {4, 1, 1, 2, 1}, // Output shape - kTfLitePaddingValid, kTfLiteActRelu6, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingSameStride1) { - using tflite::testing::F2QS; - - int8_t output_data[8]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; - int filter_width = 2; - int filter_height = 2; - int stride_width = 1; - int stride_height = 1; - tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(3, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, - input_min, input_max, filter_width, filter_height, stride_width, - stride_height, - { - // Output values - F2QS(6, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(7, output_min, output_max), - F2QS(3, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(7, output_min, output_max), - }, - output_min, output_max, {4, 1, 2, 4, 1}, // Output shape - kTfLitePaddingSame, kTfLiteActNone, output_data); -} - -TF_LITE_MICRO_TEST(MaxPoolTestUInt8PaddingValidStride1) { +// Test group MAX 1 +TF_LITE_MICRO_TEST(SystemMaxPoolTestInt1) { using tflite::testing::F2QS; int8_t output_data[3]; - float input_min = 0; - float input_max = 15.9375; - float output_min = 0; - float output_max = 15.9375; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; int filter_width = 2; int filter_height = 2; int stride_width = 1; int stride_height = 1; + + const int kInput1Shape[] = {4, 1, 2, 4, 1}; + const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput1Shape[] = {4, 1, 1, 3, 1}; + const int8_t kGolden1Data[] = {1, 1, 1}; + tflite::testing::TestMaxPoolQuantized( - {4, 1, 2, 4, 1}, // Input shape - { - // Input values - F2QS(0, input_min, input_max), - F2QS(6, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(4, input_min, input_max), - F2QS(3, input_min, input_max), - F2QS(2, input_min, input_max), - F2QS(10, input_min, input_max), - F2QS(7, input_min, input_max), - }, + kInput1Shape, // Input shape + kInput1Data, input_min, input_max, filter_width, filter_height, stride_width, stride_height, - { - // Output values - F2QS(6, output_min, output_max), - F2QS(10, output_min, output_max), - F2QS(10, output_min, output_max), - }, - output_min, output_max, {4, 1, 1, 3, 1}, // Output shape + kGolden1Data, + output_min, output_max, kOutput1Shape, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(LocalMaxPoolTestInt1) { + using tflite::testing::F2QS; + + int8_t output_data[3]; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + +#pragma Bss(".Zdata") + const int kInput1Shape[] = {4, 1, 2, 4, 1}; + const int8_t kInput1Data[] = {1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput1Shape[] = {4, 1, 1, 3, 1}; + const int8_t kGolden1Data[] = {1, 1, 1}; +#pragma Bss() + + tflite::testing::TestMaxPoolQuantized( + kInput1Shape, // Input shape + kInput1Data, + input_min, input_max, filter_width, filter_height, stride_width, + 
stride_height, + kGolden1Data, + output_min, output_max, kOutput1Shape, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + + +// Test group MAX 2 +TF_LITE_MICRO_TEST(SystemMaxPoolTestInt2) { + using tflite::testing::F2QS; + + int8_t output_data[45]; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + + const int kInput2Shape[] = {4, 1, 6, 10, 1}; + const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput2Shape[] = {4, 1, 5, 9, 1}; + const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1}; + + tflite::testing::TestMaxPoolQuantized( + kInput2Shape, // Input shape + kInput2Data, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + kGolden2Data, + output_min, output_max, kOutput2Shape, // Output shape + kTfLitePaddingValid, kTfLiteActNone, output_data); +} + +TF_LITE_MICRO_TEST(LocalMaxPoolTestInt2) { + using tflite::testing::F2QS; + + int8_t output_data[45]; + const float input_min = -128; + const float input_max = 127; + const float output_min = -128; + const float output_max = 127; + int filter_width = 2; + int filter_height = 2; + int stride_width = 1; + int stride_height = 1; + + #pragma Bss(".Zdata") + const int kInput2Shape[] = {4, 1, 6, 10, 1}; + const int8_t kInput2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + const int kOutput2Shape[] = {4, 1, 5, 9, 1}; + const int8_t kGolden2Data[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1}; +#pragma Bss() + + tflite::testing::TestMaxPoolQuantized( + kInput2Shape, // Input shape + kInput2Data, + input_min, input_max, filter_width, filter_height, stride_width, + stride_height, + kGolden2Data, + output_min, output_max, kOutput2Shape, // Output shape kTfLitePaddingValid, kTfLiteActNone, output_data); } From 9996df4d7c3cbd8fadf342f27df4ae3d225b56b0 Mon Sep 17 00:00:00 2001 From: jacco Date: Wed, 29 Apr 2020 12:37:40 +0200 Subject: [PATCH 0088/1533] Small fix in mli slicing code for fully connect kernel --- tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 70d1fda4c2b..89eae356f51 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -158,7 +158,7 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, TfLiteNode* node, mli_mov_cfg_for_copy(©_config); const int weight_out_dimension = 0; const int out_tensor_dimension = 1; - const int batch_dimension = 0; + const int input_size_dimension = 1; int slice_size = mli_weights.shape[weight_out_dimension]; /* allocate the local buffers, and compute the slice size */ @@ -192,13 +192,14 @@ TfLiteStatus EvalMliQuantizedInt8(TfLiteContext* context, 
TfLiteNode* node, mli_mov_tensor_sync(w_slice.Sub(), ©_config, w_ptr); mli_mov_tensor_sync(b_slice.Sub(), ©_config, b_ptr); - TensorSlicer in_slice(&mli_in, batch_dimension, 1); + // Slice the input over the batches (one at a time with the size of a complete input) + TensorSlicer in_slice(&mli_in, input_size_dimension, mli_in.shape[input_size_dimension]); /* output tensor is alreade sliced in the output size dimension. out_ch_slice.Sub() is the tensor for the amount of output size of this itteration of the weight slice loop. This tensor needs to be further sliced over the batch */ - TensorSlicer out_slice(out_ch_slice.Sub(), batch_dimension, 1); + TensorSlicer out_slice(out_ch_slice.Sub(), out_tensor_dimension, slice_size); /* setup the pointers to the local or remote tensor to make the code * inside the loop easier. */ From 21e7a9fffa8461f670abe50d2ef6a1724597d352 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 23 Apr 2020 14:09:21 +0300 Subject: [PATCH 0089/1533] Updated embARC MLI version for downloading + Package with pre-built libraries for various platforms --- .../micro/tools/make/ext_libs/arc_mli.inc | 26 +++++++++++-------- .../tools/make/targets/arc/arc_common.inc | 2 ++ .../tools/make/targets/arc_emsdp_makefile.inc | 3 +++ .../tools/make/third_party_downloads.inc | 8 +++--- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc index ee3cc8113c1..a95b4550417 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc @@ -21,19 +21,9 @@ ifeq ($(TARGET_ARCH), arc) # by passing 'no_arc_mli' tag (make -f TAGS=no_arc_mli ...) ifeq ($(filter no_arc_mli,$(ALL_TAGS)),) - ALL_TAGS += arc_mli -ifeq ($(PRE_COMPILED_MLI),true) - # TODO: Replace with proper arc_mli pre-builts. 
- $(eval $(call add_third_party_download,$(EMBARC_OSP_URL),$(EMBARC_OSP_MD5),embarc_osp,)) - - MLI_INCLUDE_FOLDER = embarc_osp/library/embarc_mli/include - MLI_LIB = third_party/embarc_osp/library/embarc_mli/lib/arcem9d/libmli_iotdk.a - - THIRD_PARTY_CC_HDRS += \ - third_party/embarc_osp/LICENSE -else +ifeq ($(BUILD_ARC_MLI),true) MLI_LIB_DIR = arc_mli_$(basename $(TCF_FILE_NAME)) $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) @@ -44,6 +34,20 @@ else THIRD_PARTY_CC_HDRS += \ third_party/$(MLI_LIB_DIR)/LICENSE +else +ifneq ($(ARC_MLI_PRE_COMPILED_TARGET),) + MLI_LIB_DIR = arc_mli_package + $(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),)) + + MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include + MLI_LIB = third_party/$(MLI_LIB_DIR)/bin/$(ARC_MLI_PRE_COMPILED_TARGET)/release/libmli.a + MICROLITE_LIBS += $(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/bin/$(ARC_MLI_PRE_COMPILED_TARGET)/release/libmli.a + + THIRD_PARTY_CC_HDRS += \ + third_party/$(MLI_LIB_DIR)/LICENSE +else +$(error Target for pre compiled ARC MLI library is not defined) +endif endif THIRD_PARTY_CC_HDRS += $(MLI_LIB) diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 4a9a5ccdfc3..9462c3852f2 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -89,6 +89,8 @@ ifeq ($(ARC_TOOLCHAIN), mwdt) LCF_FILE ?= + BUILD_ARC_MLI ?= true + # The variable TCF_FILE_NAME stores the TCF file name (including .tcf extension), # this variable is used later to add the option to the linker/compiler flags. # This condition also handles the case when the user/makefile specifies diff --git a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc index a84dd15e4e8..b81bcea0eb8 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc @@ -23,6 +23,9 @@ ifeq ($(TARGET), arc_emsdp) UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE)) + BUILD_ARC_MLI := false + ARC_MLI_PRE_COMPILED_TARGET := emsdp_em11d_em9d_dfss + include $(MAKEFILE_DIR)/targets/arc/arc_common.inc ARC_EXTRA_APP_SETTINGS = \ diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index ce24ba29542..db420b7fd1b 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -71,11 +71,11 @@ PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab" PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip" PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" -EMBARC_OSP_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp/archive/embarc_mli.zip" -EMBARC_OSP_MD5 := "9eaf7b3a1ed05872a03da9796672a776" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/7026ad09bb7f967324eb29e069f776bc44a08886.zip" +EMBARC_MLI_MD5 := "7eebd730db79c6834399f87e509115fb" -EMBARC_MLI_URL := 
"https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/4b6c6eed65395dced1564006be8188781af16035.zip" -EMBARC_MLI_MD5 := "47167553c17ff8c7cd59fb1afb90c304" +EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC1/embARC_MLI_package.zip" +EMBARC_MLI_PRE_COMPILED_MD5 := "b85b8b89446757735342795367e37d22" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From f9b6799aadacfc19032994bbb1c4eba67e53c598 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 24 Apr 2020 13:31:42 +0300 Subject: [PATCH 0090/1533] Fixes in project generation for ARC specific projects --- tensorflow/lite/micro/tools/make/helper_functions.inc | 2 ++ .../lite/micro/tools/make/targets/arc/arc_common.inc | 2 +- .../lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf | 4 ++-- .../lite/micro/tools/make/targets/arc_emsdp_makefile.inc | 7 +++++-- tensorflow/lite/micro/tools/make/targets/arc_makefile.inc | 2 ++ .../lite/micro/tools/make/templates/arc/README_ARC.md.tpl | 2 ++ .../micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl | 2 ++ 7 files changed, 16 insertions(+), 5 deletions(-) create mode 100644 tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl create mode 100644 tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 8d321d42490..1cf9afa8794 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -150,6 +150,8 @@ $(PRJDIR)$(3)/$(1)/Makefile: tensorflow/lite/micro/tools/make/templates/arc/arc_ sed -E 's#\%\{APP_DEBUG_CMD\}\%#$(ARC_APP_DEBUG_CMD)#g' | \ sed -E 's#\%\{EXTRA_EXECUTE_RULES\}\%#$(ARC_EXTRA_EXECUTE_RULES)#g' > $$@ +$(PRJDIR)$(3)/$(1)/%: tensorflow/lite/micro/tools/make/templates/arc/%.tpl + @cp $$< $$@ $(foreach var,$(ARC_TARGET_FILES_DIRS),$(eval $(call path_changing_copy_file,$(PRJDIR)$(3)/$(1),$(var)))) diff --git a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc index 9462c3852f2..596f219d3d1 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc/arc_common.inc @@ -105,7 +105,7 @@ endif PLATFORM_FLAGS = -tcf=$(TCF_FILE_NAME) -tcf_core_config - PLATFORM_FLAGS += -Hnocopyr -O3 -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections + PLATFORM_FLAGS += -Hnocopyr -Hpurge -Hdense_prologue -Hon=Long_enums -fslp-vectorize-aggressive -ffunction-sections -fdata-sections # Use compact CRT. It requires pre-defined heap size PLATFORM_FLAGS += -Hcl -Hcrt_fast_memcpy -Hcrt_fast_memset diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf index d17c807e250..c13dea5c6a0 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -28,11 +28,11 @@ SECTIONS { .text? : { *('.text$crt*') } * (TEXT): {} * (LIT): {} - } > ICCM0 + } > SRAM GROUP BLOCK(4): { .Zdata? : {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32K): {} .heap? 
ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} } > DCCM diff --git a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc index b81bcea0eb8..211437bd9f4 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc @@ -54,8 +54,11 @@ ifeq ($(filter $(ARC_TARGET_FILES_DIRS), $(dir $(UBOOT_FILE))),) ARC_TARGET_FILES_DIRS += $(dir $(UBOOT_FILE)) endif - # for default EMSD configuration we can use default em9d rt libs + MAKE_PROJECT_FILES := $(filter-out README_MAKE.md, $(MAKE_PROJECT_FILES)) README_ARC_EMSDP.md + + # for default EMSDP configuration we can use em9d_va rt libs # for better performance runtime should be built for emsdp configuration - PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio + # No hostlink library for smaller codesize purpose + PLATFORM_LDFLAGS += -Hlib=em9d_voice_audio -Hhostlib= endif diff --git a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc index d379eea86f1..9f5442b4c6c 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_makefile.inc @@ -33,6 +33,8 @@ endif include $(MAKEFILE_DIR)/targets/arc/arc_common.inc +MAKE_PROJECT_FILES := $(filter-out README_MAKE.md, $(MAKE_PROJECT_FILES)) README_ARC.md + endif # $(TARGET) endif # $(TARGET_ARCH)... diff --git a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl new file mode 100644 index 00000000000..b722b9c441d --- /dev/null +++ b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl @@ -0,0 +1,2 @@ +# Mock Project Readme for common ARC target + diff --git a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl new file mode 100644 index 00000000000..b3d9257f4d2 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl @@ -0,0 +1,2 @@ +# Mock Project Readme for ARC EMSDP target + From 0fece983977cbf914a3a413005b8de7648963735 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 24 Apr 2020 17:45:52 +0300 Subject: [PATCH 0091/1533] ARC EMSDP specific patch of generated projects for examples --- .../micro_speech/arc_emsdp/Makefile.inc | 22 +++++++ .../person_detection/arc_emsdp/Makefile.inc | 19 ++++++ .../person_detection/arc_emsdp/emsdp.lcf | 61 ++++++++++++++++++ .../arc_emsdp/Makefile.inc | 16 +++++ .../arc_emsdp/emsdp.lcf | 63 +++++++++++++++++++ 5 files changed, 181 insertions(+) create mode 100644 tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc create mode 100644 tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc create mode 100644 tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc create mode 100644 tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf diff --git a/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc new file mode 100644 index 00000000000..7fe4906cdf9 --- /dev/null +++ b/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc @@ -0,0 +1,22 @@ +ifeq ($(TARGET), arc_emsdp) + + MICRO_SPEECH_HDRS += \ + 
micro_speech_patch.txt + + MICRO_SPEECH_TEST_HDRS += \ + micro_speech_patch.txt + + MICRO_SPEECH_MOCK_HDRS += \ + micro_speech_patch.txt + +%/micro_speech_patch.txt: %/emsdp.lcf %/Makefile + @cp tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf $< + @echo emsdp.lcf > $@ + @sed -E -i 's#-Hheap=[^ ]*#\-Hheap=16K \-Hstack=16K#g' $(word 2, $^) + @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\ + CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\ + CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\ + $(word 2, $^) + @echo Makefile >> $@ + +endif diff --git a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc new file mode 100644 index 00000000000..cb7ba57ecb1 --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc @@ -0,0 +1,19 @@ +ifeq ($(TARGET), arc_emsdp) + + person_detection_HDRS += \ + person_detection_patch.txt + + person_detection_TEST_HDRS += \ + person_detection_patch.txt + + +%/person_detection_patch.txt: %/emsdp.lcf %/Makefile + @cp tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf $< + @echo emsdp.lcf > $@ + @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\ + CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\ + CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\ + $(word 2, $^) + @echo Makefile >> $@ + +endif diff --git a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf new file mode 100644 index 00000000000..34ed267652c --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf @@ -0,0 +1,61 @@ +# SYSTEM memory regions indicate where external memory might be located. +# The TCF has no specific knowledge of whether SYSTEM regions contain +# external memory or not. +# CCMWRAP memory regions indicate unusable portions of the address space +# due to CCM memory wrapping into upper addresses beyond its size + +MEMORY { + PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400 + SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000 + IVT : ORIGIN = 0x60000000, LENGTH = 0x400 + ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400) +# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 +# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 + XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000 +# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000 + YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000 +# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000 + } + +SECTIONS { + + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT + + GROUP BLOCK(4): { + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP BLOCK(4): { + .rodata_in_data? : {} + } > PSRAM + + GROUP BLOCK(4): { + .debug_log? : {} + } > SRAM + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .Zdata? : {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} + } > DCCM + + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + + GROUP BLOCK(4): { + .Ydata? 
: {} + } > YCCM +} + + diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc new file mode 100644 index 00000000000..94d73f903ed --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc @@ -0,0 +1,16 @@ +ifeq ($(TARGET), arc_emsdp) + + person_detection_HDRS += \ + person_detection_int8_patch.txt + + person_detection_TEST_HDRS += \ + person_detection_int8_patch.txt + + +%/person_detection_int8_patch.txt: %/emsdp.lcf %/Makefile + @cp tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf $< + @echo emsdp.lcf > $@ + @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= true#' $(word 2, $^) + @echo Makefile > $@ + +endif diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf new file mode 100644 index 00000000000..98b7e1d911f --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf @@ -0,0 +1,63 @@ +# SYSTEM memory regions indicate where external memory might be located. +# The TCF has no specific knowledge of whether SYSTEM regions contain +# external memory or not. +# CCMWRAP memory regions indicate unusable portions of the address space +# due to CCM memory wrapping into upper addresses beyond its size + +MEMORY { + PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400 + SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000 + IVT : ORIGIN = 0x60000000, LENGTH = 0x400 + ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400) +# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000 + DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 +# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 + XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000 +# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000 + YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000 +# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000 + } + +SECTIONS { + + GROUP BLOCK(4) : { + .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) + } > IVT + + GROUP BLOCK(4): { + .text? : { *('.text$crt*') } + * (TEXT): {} + * (LIT): {} + } > ICCM0 + + GROUP BLOCK(4): { + .rodata_in_data? : {} + } > PSRAM + + GROUP BLOCK(4): { + /* _SDA_BASE_ computed implicitly */ + .sdata?: {} + .sbss?: {} + * (DATA): {} + * (BSS): {} + .debug_log? : {} + } > SRAM + + GROUP BLOCK(4): { +# TODO: Move tensor arena to DCCM when it will be possible +# .tensor_arena? : {} + .Zdata? : {} + .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} + .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} + } > DCCM + + GROUP BLOCK(4): { + .Xdata? : {} + } > XCCM + + GROUP BLOCK(4): { + .Ydata? 
: {} + } > YCCM +} + + From afef62b9764bc08289006e3a1ea60cffa9c55888 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Wed, 29 Apr 2020 14:42:14 +0300 Subject: [PATCH 0092/1533] ARC: Move shared lcf + Cleanup and comments --- .../micro_speech/arc_emsdp/Makefile.inc | 8 +- .../person_detection/arc_emsdp/Makefile.inc | 7 +- .../arc_emsdp/Makefile.inc | 5 + .../arc_emsdp/emsdp.lcf | 8 +- .../tools/make/targets/arc/emsdp/emsdp.lcf | 15 +- .../make/targets/arc/emsdp/emsdp_v2.lcf} | 7 +- .../tools/make/targets/arc/iotdk/iotdk.lcf | 47 - .../tools/make/targets/arc/iotdk/iotdk.tcf | 4621 ----------------- .../micro/tools/make/targets/arc/memory.lcf | 50 - 9 files changed, 39 insertions(+), 4729 deletions(-) rename tensorflow/lite/micro/{examples/person_detection/arc_emsdp/emsdp.lcf => tools/make/targets/arc/emsdp/emsdp_v2.lcf} (90%) delete mode 100644 tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf delete mode 100644 tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf delete mode 100644 tensorflow/lite/micro/tools/make/targets/arc/memory.lcf diff --git a/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc index 7fe4906cdf9..850263f0eb9 100644 --- a/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/arc_emsdp/Makefile.inc @@ -1,5 +1,11 @@ ifeq ($(TARGET), arc_emsdp) +# Patch of arc make project to adjust it specifically for micro speech example. +# In particular: +# - Extend Heap and stack size for application needs +# - Use Linker command file with better usage of fast memory +# - In case project was generated with MLI usage, reduce scratch buffers. + MICRO_SPEECH_HDRS += \ micro_speech_patch.txt @@ -10,7 +16,7 @@ ifeq ($(TARGET), arc_emsdp) micro_speech_patch.txt %/micro_speech_patch.txt: %/emsdp.lcf %/Makefile - @cp tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf $< + @cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $< @echo emsdp.lcf > $@ @sed -E -i 's#-Hheap=[^ ]*#\-Hheap=16K \-Hstack=16K#g' $(word 2, $^) @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\ diff --git a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc index cb7ba57ecb1..29a09466e83 100644 --- a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc +++ b/tensorflow/lite/micro/examples/person_detection/arc_emsdp/Makefile.inc @@ -1,5 +1,10 @@ ifeq ($(TARGET), arc_emsdp) +# Patch of arc make project to adjust it specifically +# for person detection example. In particular: +# - Use Linker command file with better usage of fast memory +# - In case project was generated with MLI usage, reduce scratch buffers. 
+ person_detection_HDRS += \ person_detection_patch.txt @@ -8,7 +13,7 @@ ifeq ($(TARGET), arc_emsdp) %/person_detection_patch.txt: %/emsdp.lcf %/Makefile - @cp tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf $< + @cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $< @echo emsdp.lcf > $@ @sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\ CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\ diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc index 94d73f903ed..c00f9b89953 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/Makefile.inc @@ -1,5 +1,10 @@ ifeq ($(TARGET), arc_emsdp) +# Patch of arc make project to adjust it specifically +# for experimental person detection example. In particular: +# - Use Linker command file with better usage of fast memory +# - Stripout TFLM reference code by default. + person_detection_HDRS += \ person_detection_int8_patch.txt diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf index 98b7e1d911f..2d7954217d3 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf @@ -1,6 +1,8 @@ -# SYSTEM memory regions indicate where external memory might be located. -# The TCF has no specific knowledge of whether SYSTEM regions contain -# external memory or not. +# Difference with common EMSDP LCF file (to reduce data access time): +# - move data from external PSRAM to on-chip memory +# - move text from SRAM to ICCM +# - TODO: Move tensor arena to DCCM to reduce data flow between fast and extrnal memory +# # CCMWRAP memory regions indicate unusable portions of the address space # due to CCM memory wrapping into upper addresses beyond its size diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf index c13dea5c6a0..b01b4835071 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp.lcf @@ -1,6 +1,15 @@ -# SYSTEM memory regions indicate where external memory might be located. -# The TCF has no specific knowledge of whether SYSTEM regions contain -# external memory or not. +# Common EMSDP LCF File for applications +# +# external SRAM memory is used for code, because some TFLM applications includes the whole +# set of supported kernels which doesn't fit to ICCM0. +# It could slow performance a bit. Smaller applications can use ICCM0 instead. +# +# External PSRAM is used for potentially big sections. In particular: +# - rodata_in data which typically includes protobuf with model. +# - other .data which typically includes tensor arena. 
+# +# stack and heap are kept in DCCM which is the closest memory to the core + # CCMWRAP memory regions indicate unusable portions of the address space # due to CCM memory wrapping into upper addresses beyond its size diff --git a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf similarity index 90% rename from tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf rename to tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf index 34ed267652c..a379fe69e21 100644 --- a/tensorflow/lite/micro/examples/person_detection/arc_emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf @@ -1,6 +1,7 @@ -# SYSTEM memory regions indicate where external memory might be located. -# The TCF has no specific knowledge of whether SYSTEM regions contain -# external memory or not. +# Difference with common EMSDP LCF file (to reduce data access time): +# - move data from external PSRAM to DCCM +# - move text from SRAM to ICCM +# # CCMWRAP memory regions indicate unusable portions of the address space # due to CCM memory wrapping into upper addresses beyond its size diff --git a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf deleted file mode 100644 index da39ae911ff..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.lcf +++ /dev/null @@ -1,47 +0,0 @@ -# SYSTEM memory regions indicate where external memory might be located. -# The TCF has no specific knowledge of whether SYSTEM regions contain -# external memory or not. -# CCMWRAP memory regions indicate unusable portions of the address space -# due to CCM memory wrapping into upper addresses beyond its size - -MEMORY { -# SYSTEM0 : ORIGIN = 0x00000000, LENGTH = 0x20000000 - ICCM0 : ORIGIN = 0x20000000, LENGTH = 0x00040000 -# CCMWRAP0: ORIGIN = 0x20040000, LENGTH = 0x0ffc0000 -# SYSTEM1 : ORIGIN = 0x30000000, LENGTH = 0x50000000 - DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000 -# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000 -# SYSTEM2 : ORIGIN = 0x90000000, LENGTH = 0x30000000 - XCCM : ORIGIN = 0xc0000000, LENGTH = 0x00008000 -# CCMWRAP2: ORIGIN = 0xc0008000, LENGTH = 0x0fff8000 -# SYSTEM3 : ORIGIN = 0xd0000000, LENGTH = 0x10000000 - YCCM : ORIGIN = 0xe0000000, LENGTH = 0x00008000 -# CCMWRAP3: ORIGIN = 0xe0008000, LENGTH = 0x0fff8000 -# SYSTEM4 : ORIGIN = 0xf0000000, LENGTH = 0x10000000 - } -SECTIONS { - GROUP: { - .text? : { *('.text$crt*') } - * (TEXT): {} - * (LIT): {} - } > ICCM0 - - GROUP: { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32768): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > DCCM - GROUP: { - .Xdata? : {} - } > XCCM - GROUP: { - .Ydata? : {} - } > YCCM - GROUP BIND(0x0): { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:684): {} = FILL(0xa5a5a5a5,4) - } - } diff --git a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf b/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf deleted file mode 100644 index 004215a2f6a..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/arc/iotdk/iotdk.tcf +++ /dev/null @@ -1,4621 +0,0 @@ - - - - - - - - - - - - - - - - - - - - 10*2) -# -# The speed of simulation can be greatly increased by using a faster JTAG clock, but a dependency will warn if it exceeds 1/2 of the cpu clock. 
-# --jtag_tclk 4 - -# execution_trace_level --- -# This traces committed instructions as they execute, and gathers statistics -# visible in the debugger for counting instructions & cycle delays. -# At the "stats" level ony the statistics are gathered and no trace is printed. -# "file" is equivalent to "full", but the results go to a trace .txt file instead. -# --execution_trace_level stats - -# generate_ipxact --- -# Generate ipxact.xml file describing the CPUisle or archipelago frontier -# --generate_ipxact false - -# ipxact_relative_path_names --- -# Use relative path names for Verilog files in the ipxact. -# Otherwise, absolute path names are used. -# --ipxact_relative_path_names true - -# optional_encryption --- -# When selected, encrypted RTL output is generated. -# --optional_encryption false - -# ignore_encrypt_license --- -# When selected, pretend the encryption license is missing. For testing. -# --ignore_encrypt_license false - -# ignore_clear_license --- -# When selected, pretend the cleartest license is missing. For testing. -# --ignore_clear_license false - - -######## Tool Configuration --- cgen.1_0 ######## - -# Create Tool Configuration --create cgen.1_0 "System.Tool Configuration" - -# mwdt_version --- Selects the MetaWare version to be used with the TCF file. -# Change from the default to an older or newer toolset version if you want the TCF file to be used with an older or newer version of the MetaWare tools. --mwdt_version K-2015.09 - -# code_base_addr --- -# The base address to assign to the executable code segment in the linker command file when there is no ICCM in the build. This value is ignored when there is an ICCM. -# --code_base_addr 0 - -# data_base_addr --- -# The base address to assign to the data segment in the linker command file when the data is not being mapped to a DCCM. This value is ignored when the data segment is mapped to a DCCM, as in that case the base address of the DCCM memory is used. -# -# A value of 0xffffffff means that the data segment will not be mapped to any specific address. -# --data_base_addr 4294967295 - - -######## IO Software --- com.arc.software.dfss.sw_io.1_0 ######## - -# Create IO Software --create com.arc.software.dfss.sw_io.1_0 "System.IO Software" - -# sw_io --- Command line option for Software element 'IO Software' --sw_io true - - -######## DSP Software --- com.arc.software.dfss.sw_dsp.1_0 ######## - -# Create DSP Software --create com.arc.software.dfss.sw_dsp.1_0 "System.DSP Software" - -# sw_dsp --- Command line option for Software element 'DSP Software' --sw_dsp true - - -######## Infrastructure Software --- com.arc.software.dfss.sw_infra.1_0 ######## - -# Create Infrastructure Software --create com.arc.software.dfss.sw_infra.1_0 "System.Infrastructure Software" - -# sw_infra --- Command line option for Software element 'Infrastructure Software' --sw_infra true - - -######## CPUisle --- com.arc.hardware.CPU_isle.1_0 ######## - -# Create CPUisle --create com.arc.hardware.CPU_isle.1_0 System.CPUisle - -# unique_name --- verilog module modifier prefix --unique_name "" - -# ArcNum --- The processor number as read back in the ARCNUM field of the IDENTITY register. --arc_num 1 - -# instances --- -# The number of instantiations of this core. 
-# --instances 1 - -# CPUFloorplan --- Floorplan giving relative placement of the RAMs for the given configuration of ARCv2HS or ARCv2EM in this CPUisle --cpu_floorplan em9d_xyccm - -# userCPUFloorplanPath --- Pathname of user floorplan for the CPU when using a hierarchical implementation --usercpufloorplan_path "" - -# pinLocationConstraintsFile --- Pathname+filename of the physical pin location constraints file or just "side1" (all pins on l.h.s) or "side2" (pins on top only) or "side3" (pins on r.h.s. only) or "side4" (pins on bottom only) to get a template file generated --pin_location_constraints_file "" - - -######## ARCv2EM --- com.arc.hardware.ARCv2EM.1_0 ######## - -# Create ARCv2EM --create com.arc.hardware.ARCv2EM.1_0 System.CPUisle.ARCv2EM - -# arcv2em --- Description to follow --arcv2em true - -# def_div2ref --- This specifies the clock division factor at reset. It is used for mss clock controller to generate core clock, and the value N means core is running at (1/N) x ref_clk. --def_div2ref 1 - -# addr_size --- This defines the address bus width (in bits). --addr_size 32 - -# pc_size --- This defines the program counter (in bits). --pc_size 32 - -# lpc_size --- This defines the size of the loop counter (in bits). --lpc_size 32 - -# halt_on_reset --- This defines whether the core is halted initially on reset. --halt_on_reset true - -# byte_order --- This defines the endianness of the core. --byte_order little - -# code_density_option --- This reduces the size of program memory by adding instructions that condense commonly used instruction patterns with some marginal increase in processor gate count. The added instructions are ENTER_S, LEAVE_S, JLI_S, BI, BIH. --code_density_option true - -# bitscan_option --- This adds instructions for efficient search of bits within a 32 bit word, including normalize (NORM, NORMH, NORMW) and find first or last set bit (FFS, FLS) instructions. --bitscan_option true - -# shift_option --- The Shift ISA option adds variable and multi-length shift rotation instructions: (0) No shift/rotation instructions (1) ASR16, ASR8, LSR8, LSL8, ROL8, ROR8 (2) ASRM, ASLM, LSRM, RORM (3) ASR16, ASR8, LSR8, LSL8, ROL8, ROR8, ASRM, ASLM, LSRM, RORM --shift_option 3 - -# swap_option --- This adds two instructions used to swap half-words or bytes in a 32b word. Useful for converting between little to big endianess and vice-versa. --swap_option true - -# div_rem_option --- The DIV/REM option adds non-blocking multi-cycle implementation of integer divide/remainder functions. Added instructions are DIV, DIVU (integer divide), REM and REMU (integer divide remainder).radix2 takes 33 cycles. radix4_enhanced takes 3 to 19 cycles per operation. --div_rem_option none - -# mpy_option --- The Multiplier ISA option allows selection between several multiplier configurations to tradeoff performance with silicon area. -# For select multiply options, when the DIV/REM option is also selected, some datapath resources will be shared between the multiply and divide pipeline to minimize total area. -# -# Cycle count (16-bit, lower 32-bit or upper 32-bit) for the different configurations is as follows: -#

-# 
-# option  16/L32/U32  Instructions
-# ------  ----------  ---------------------
-#       
-# none	  -/-/-     None
-# wlh1	  1/1/1     MPYW/U, MPY/U, MPYH/U
-# wlh2	  2/2/2     MPYW/U, MPY/U, MPYH/U
-# wlh3	  2/3/3     MPYW/U, MPY/U, MPYH/U
-# wlh4	  2/4/5     MPYW/U, MPY/U, MPYH/U
-# wlh5	  5/9/9     MPYW/U, MPY/U, MPYH/U
-# 
-# --mpy_option none - -# code_protection --- The ARC EM architecture divides the memory into 16 regions, which can be protected individually. This feature adds a 16-bit input to the processor core, one bit per region. When the protect bit is set, the processor disables any load or store to the corresponding region. An attempt to access a protected region raises an EV_ProtV exception. --code_protection true - -# stack_checking --- Stack checking is a mechanism for checking stack accesses and raising an exception when a stack overflow or underflow is detected. --stack_checking true - -# unaligned_option --- This enables unaligned loads and stores. --unaligned_option true - -# intvbase_preset --- This sets the interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE. --intvbase_preset 0 - -# rgf_impl --- This defines whether the register file is implemented using flip-flops, or with a hard macro. --rgf_impl flip_flops - -# rgf_num_regs --- This defines the size (in 32b register) of the processor register file. --rgf_num_regs 32 - -# rgf_wr_ports --- This defines the number of write ports on the register file. --rgf_wr_ports 2 - -# rgf_num_banks --- Dual register banks are useful if Fast IRQ has been configured, but may be selected even if not. --rgf_num_banks 2 - -# rgf_banked_regs --- This selects the number of registers that are replicated in the second register-file bank. --rgf_banked_regs 32 - -# turbo_boost --- This enables the Turbo Boost synthesis option. By enabling this option, the achievable clock frequency is increased, but at the cost of an additional cycle latency on branch instructions. --turbo_boost false - -# infer_alu_adder --- infer: datapath is described as behavioral code: A + B -# instantiate: datapath is instantiated as a detailed multi-stage code of a carry-lookahead adder. It is generally preferable to use the infer option and add directives for your target synthesizer. --infer_alu_adder infer - -# infer_mpy_wtree --- infer: datapath is described as behavioral code: A * B (applies to only wlh3, wlh4 and wlh5 designs) -# instantiate: datapath is instantiated as a detailed multi-stage code of a Wallace Tree multiplier It is generally preferable to use the infer option and add directives for your target synthesizer. --infer_mpy_wtree instantiate - -# power_domains --- Adds three separate power domains to the core, and propagates power-gate control signals to the top level of the core. Also generates UPF constraints and commands in the low-power scripts --power_domains true - -# dvfs --- Adds logic to the core to allow dynamic controlling of voltage and frequency and propagates the associated control signals to the top level of core --dvfs true - -# voltage_domains --- Creates a voltage domain split between RAM and std cell parts to support Ultra Low Voltage on cells and generates UPF constraints --voltage_domains false - -# mem_bus_option --- The core supports three bus protocols for accessing external memory: AHB & AHB-Lite. AHB-Lite-single means instruction fetch and data access share a single AHB-Lite port. AHB-Lite-dual means separate AHB-Lite port for each initiator. --mem_bus_option AHB-Lite-dual - -# mem_bus_reg_interface --- Specifies whether the memory bus interface is registered. 
--mem_bus_reg_interface true - -# dmi_burst_option --- This will enable high-throughput burst support on the DMI slave interfaces. By enabling this option, the peak DMI read throughput goes from 1 word per 3 cycles to N words per N+2 cycles, in which N is the AHB burst lengthDMI write throughput goes from 1 word per 3 cycles to 1 word per cycle. --dmi_burst_option false - -# has_dmp_peripheral --- This option enables the redirection of load/store accesses to one segment (1/16) of the addressable space to a dedicated peripheral bus. This offers high system integration and reduces overall system cost. --has_dmp_peripheral false - -# per_bus_option --- The core supports one bus protocol for accessing the peripheral space, when enabled: AHB-Lite. --per_bus_option AHB-Lite - -# per_bus_reg_interface --- Specifies whether the peripheral bus interface is registered. --per_bus_reg_interface false - -# clock_gating --- This enables the insertion of architectural clock gate elements in the design. By enabling this option, the clocks to various parts of the design will be disabled when the logic they drive is not in use to save power. --clock_gating true - -# byte_parity --- If parity protection on the CCMs is configured, this option is used to enable parity protection on a per-byte basis. Otherwise, parity will be per word basis --byte_parity false - -# prot_pipelined --- Check the box if CCM memories are configured for ECC, and you want single-bit errors to be corrected, written back to memory, and re-fetched. When unchecked, single bit errors are corrected when read from memory, but the offending memory location itself is not corrected with a writeback --prot_pipelined false - -# cct_test_ena --- When ECC is configured, this option enables automatic generation of error conditions in relevant testbench memories to exercise error detection and correction features --cct_test_ena false - - -######## AGU --- com.arc.hardware.AGU.1_0 ######## - -# Create AGU --create com.arc.hardware.AGU.1_0 System.CPUisle.ARCv2EM.AGU - -# agu_size --- Predefined configurations of modifiers, address -# pointers and offset registers -#
-# 
-#         address     address                     
-#         pointers    offset regs      modifiers  
-#        ----------- --------------- ------------ 
-# small:     4           2                 4      
-# medium:    8           4                 12     
-# large:     12          8                 24     
-# 
-# --agu_size small - -# agu_accord --- Enable the accordion stage if operating frequency is critical --agu_accord true - -# agu_wb_depth --- Write buffer depth --agu_wb_depth 2 - - -######## DSP --- com.arc.hardware.DSP.1_0 ######## - -# Create DSP --create com.arc.hardware.DSP.1_0 System.CPUisle.ARCv2EM.DSP - -# dsp_complex --- Enable/disable support for single cycle 16b+16b complex instructions and butterfly operations, else 2-cycle complex instructions only without butterfly support --dsp_complex true - -# dsp_itu --- Enable/disable support for ITU bit-accurate 1 bit fractional shift before accumulation, else 1-bit fractional shift result after accumulation only --dsp_itu true - -# dsp_divsqrt --- Enable/disable support for divide and square root operations: DIV(U), REM(U), SQRT --dsp_divsqrt radix2 - -# dsp_accshift --- Select support for accumulator shift operations: no supported, limited shift support only or full shift support and convergent rounding --dsp_accshift full - -# dsp_impl --- The datapath components may be inferred from Verilog for better area or optimized using carry-save components for better timing --dsp_impl optimized - - -######## Interrupt Controller --- com.arc.hardware.Interrupt_Controller.1_0 ######## - -# Create Interrupt Controller --create com.arc.hardware.Interrupt_Controller.1_0 "System.CPUisle.ARCv2EM.Interrupt Controller" - -# number_of_interrupts --- This is the total number of interrupts available to the core. Some interrupts are allocated statically to a specific interrupt line (for example, timer interrupts). For more information on Interrupt and register-file options, see DesignWare ARCv2 ISA Programmers Reference Manual. --number_of_interrupts 95 - -# number_of_levels --- Priority levels in the interrupt controller. --number_of_levels 4 - -# external_interrupts --- This is the total number of interrupt pins available for external system components. This parameter must be less than the total number of interrupts. --external_interrupts 60 - -# firq_option --- This enables the fast-interrupts option, (priority level 0 interrupts), which uses an alternate register bank (if configured) instead of saving the context to memory. --firq_option true - - -######## Timer 0 --- com.arc.hardware.Timer_0.1_0 ######## - -# Create Timer 0 --create com.arc.hardware.Timer_0.1_0 "System.CPUisle.ARCv2EM.Timer 0" - -# timer_0_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 0. --timer_0_int_level 1 - - -######## Timer 1 --- com.arc.hardware.Timer_1.1_0 ######## - -# Create Timer 1 --create com.arc.hardware.Timer_1.1_0 "System.CPUisle.ARCv2EM.Timer 1" - -# timer_1_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 1. --timer_1_int_level 0 - - -######## Watchdog Timer --- com.arc.hardware.Watchdog_Timer.1_0 ######## - -# Create Watchdog Timer --create com.arc.hardware.Watchdog_Timer.1_0 "System.CPUisle.ARCv2EM.Watchdog Timer" - -# watchdog_size --- Specifies the bit width of the internal counter used within the timer. --watchdog_size 16 - -# watchdog_clk --- Specifies whether the timer should be driven from a separate clock. 
--watchdog_clk true - - -######## Data Memory Initiator --- com.arc.hardware.Data_Memory_Initiator.1_0 ######## - -# Create Data Memory Initiator --create com.arc.hardware.Data_Memory_Initiator.1_0 "System.CPUisle.ARCv2EM.Data Memory Initiator" - -######## Instruction Fetch Queue --- com.arc.hardware.Instruction_Fetch_Queue.1_0 ######## - -# Create Instruction Fetch Queue --create com.arc.hardware.Instruction_Fetch_Queue.1_0 "System.CPUisle.ARCv2EM.Instruction Fetch Queue" - -# ifqueue_size --- This defines the number of entries in the Instruction Fetch Queue. --ifqueue_size 4 - -# ifqueue_burst_size --- This sets the burst size for bus data transfers (in 32-bit words). It cannot exceed the number of entries. --ifqueue_burst_size 2 - - -######## DCCM --- com.arc.hardware.DCCM.1_0 ######## - -# Create DCCM --create com.arc.hardware.DCCM.1_0 System.CPUisle.ARCv2EM.DCCM - -# dccm_size --- This defines the size of the Data Closely Coupled Memory (DCCM) in bytes --dccm_size 131072 - -# dccm_base --- Sets the initial memory region assignment for DCCM --dccm_base 8 - -# dccm_interleave --- Split DCCM into even/odd memory banks. --dccm_interleave false - -# dccm_prot --- Specifies the type of protection built for the DCCM. --dccm_prot None - -# dccm_prot_level --- Specifies the level of protection. --dccm_prot_level Data_Only - -# dccm_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the DCCM --dccm_prot_exceptions true - -# dccm_dmi --- This enables external access through a DMI (direct memory interface) port. --dccm_dmi true - - -######## ICCM0 --- com.arc.hardware.ICCM0.1_0 ######## - -# Create ICCM0 --create com.arc.hardware.ICCM0.1_0 System.CPUisle.ARCv2EM.ICCM0 - -# iccm0_size --- This defines the size of ICCM0 in bytes. This ICCM has 0 wait states. --iccm0_size 262144 - -# iccm0_base --- Sets the initial memory region assignment for ICCM0 --iccm0_base 2 - -# iccm0_wide --- Creates ICCM0 as 64b memory to reduce accesses. --iccm0_wide true - -# iccm0_prot --- Specifies the type of protection built for ICCM0. --iccm0_prot None - -# iccm0_prot_level --- Specifies the level of protection. --iccm0_prot_level Data_Only - -# iccm0_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the ICCM0 --iccm0_prot_exceptions true - -# iccm0_dmi --- This enables external access through a DMI (direct memory interface) port. --iccm0_dmi true - - -######## XY --- com.arc.hardware.XY.1_0 ######## - -# Create XY --create com.arc.hardware.XY.1_0 System.CPUisle.ARCv2EM.XY - -# xy_config --- XY memory configuration: -# One memory: DCCM only. -# Two memories: DCCM + Y. -# Three memories: DCCM + X + Y. --xy_config dccm_x_y - -# xy_size --- Size of X and Y memories if included. -# X and Y memories both have the same configured size. --xy_size 32768 - -# xy_interleave --- Split XY memories into odd/even instances to enable single cycle unaligned access. --xy_interleave true - -# xy_x_base --- Base region for X memory. All accesses to this region will initiate a transfer on the X memory. --xy_x_base 12 - -# xy_y_base --- Base region for Y memory. All accesses to this region will initiate a transfer on the Y memory. 
--xy_y_base 14 - - -######## DMA Controller --- com.arc.hardware.DMA_Controller.1_0 ######## - -# Create DMA Controller --create com.arc.hardware.DMA_Controller.1_0 "System.CPUisle.ARCv2EM.DMA Controller" - -# dmac_channels --- This option specifies the number of DMA channels implemented in the DMA controller --dmac_channels 16 - -# dmac_fifo_depth --- This option specifies the DMA transfer FIFO depth in 32b words. --dmac_fifo_depth 4 - -# dmac_int_config --- None: the DMA controller cannot raise an interrupt -# Single-External: single done and single error interrupt signal for all DMA channels, and the interrupt signals are routed to a port at the top of the EM logical hierarchy -# Multiple-External: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are routed to ports at the top of the EM logical hierarchy -# Single-Internal: single done and single error interrupt signals for all DMA channels, and the interrupt signals are internal to the EM core -# Multiple-Internal: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are internal to the EM core --dmac_int_config Multiple-Internal - -# dmac_registers --- This option defines the number of DMA channels with their registers located in auxiliary space. --dmac_registers 16 - -# dmac_mem_if --- This option specifies whether the DMA controller system memory interface is integrated into the existing EM system memory interfaces or has its own interface. --dmac_mem_if separate - - -######## JTAG Interface --- com.arc.hardware.JTAG_Interface.1_0 ######## - -# Create JTAG Interface --create com.arc.hardware.JTAG_Interface.1_0 "System.CPUisle.ARCv2EM.JTAG Interface" - -######## Debug Interface --- com.arc.hardware.Debug_Interface.1_0 ######## - -# Create Debug Interface --create com.arc.hardware.Debug_Interface.1_0 "System.CPUisle.ARCv2EM.Debug Interface" - -######## Actionpoints --- com.arc.hardware.Actionpoints.1_0 ######## - -# Create Actionpoints --create com.arc.hardware.Actionpoints.1_0 System.CPUisle.ARCv2EM.Actionpoints - -# num_actionpoints --- This is the number of trigger events available. --num_actionpoints 8 - -# aps_feature --- Selects the Actionpoint feature set --aps_feature min - - -######## SmaRT --- com.arc.hardware.SmaRT.1_0 ######## - -# Create SmaRT --create com.arc.hardware.SmaRT.1_0 System.CPUisle.ARCv2EM.SmaRT - -# smart_stack_entries --- This specifies the number of entries in the trace buffer. --smart_stack_entries 64 - -# smart_implementation --- Flip-flop = FF-based design. Memory = memory-based design (provides better density for larger trace buffers). --smart_implementation memory - - -######## Memory Protection Unit --- com.arc.hardware.Memory_Protection_Unit.1_0 ######## - -# Create Memory Protection Unit --create com.arc.hardware.Memory_Protection_Unit.1_0 "System.CPUisle.ARCv2EM.Memory Protection Unit" - -# mpu_num_regions --- Number of configured memory regions. --mpu_num_regions 16 - -# mpu_32b --- Set the minimal region size to be 32 bytes instead of 2KB. --mpu_32b false - - -######## Floating-point unit --- com.arc.hardware.Floating_point_unit.1_0 ######## - -# Create Floating-point unit --create com.arc.hardware.Floating_point_unit.1_0 "System.CPUisle.ARCv2EM.Floating-point unit" - -# fpu_dp_assist --- This enables double-precision acceleration instructions. --fpu_dp_assist true - -# fpu_fma_option --- This enables the fused multiply-add & multiply-subtract instructions. 
--fpu_fma_option true - -# fpu_mas_cycles --- Make mul/add/sub multicycle to achieve a higher clock speed. --fpu_mas_cycles 2 - -# fpu_div_option --- This enables divide & square-root acceleration --fpu_div_option true - -# fpu_div_cycles --- "inferred" option infers DSP datapath elements from verilog operators for better area and "optimized" option selects hardware for better timing --fpu_div_cycles 17 - - -######## Performance Monitor --- com.arc.hardware.Performance_Monitor.1_0 ######## - -# Create Performance Monitor --create com.arc.hardware.Performance_Monitor.1_0 "System.CPUisle.ARCv2EM.Performance Monitor" - -# pct_counters --- Number of counters for performance monitoring. --pct_counters 8 - - -######## dsp_trig --- com.arc.hardware.dfss.dsp_trig.1_0 ######## - -# Create dsp_trig --create com.arc.hardware.dfss.dsp_trig.1_0 System.CPUisle.ARCv2EM.dsp_trig - -# dsp_trig --- Command line option for EIA extension component 'dsp_trig'. --dsp_trig true - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_4b0 --- com.arc.hardware.dfss.io_gpio_4b0.1_0 ######## - -# Create io_gpio_4b0 --create com.arc.hardware.dfss.io_gpio_4b0.1_0 System.CPUisle.ARCv2EM.io_gpio_4b0 - -# io_gpio_4b0 --- Command line option for EIA extension component 'io_gpio_4b0'. --io_gpio_4b0 true - -# io_gpio_4b0_debounce --- Selects the inclusion of Debounce logic --io_gpio_4b0_debounce 1 - -# io_gpio_4b0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_4b0_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_4b1 --- com.arc.hardware.dfss.io_gpio_4b1.1_0 ######## - -# Create io_gpio_4b1 --create com.arc.hardware.dfss.io_gpio_4b1.1_0 System.CPUisle.ARCv2EM.io_gpio_4b1 - -# io_gpio_4b1 --- Command line option for EIA extension component 'io_gpio_4b1'. --io_gpio_4b1 true - -# io_gpio_4b1_debounce --- Selects the inclusion of Debounce logic --io_gpio_4b1_debounce 1 - -# io_gpio_4b1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_4b1_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_4b2 --- com.arc.hardware.dfss.io_gpio_4b2.1_0 ######## - -# Create io_gpio_4b2 --create com.arc.hardware.dfss.io_gpio_4b2.1_0 System.CPUisle.ARCv2EM.io_gpio_4b2 - -# io_gpio_4b2 --- Command line option for EIA extension component 'io_gpio_4b2'. --io_gpio_4b2 true - -# io_gpio_4b2_debounce --- Selects the inclusion of Debounce logic --io_gpio_4b2_debounce 1 - -# io_gpio_4b2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_4b2_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_8b0 --- com.arc.hardware.dfss.io_gpio_8b0.1_0 ######## - -# Create io_gpio_8b0 --create com.arc.hardware.dfss.io_gpio_8b0.1_0 System.CPUisle.ARCv2EM.io_gpio_8b0 - -# io_gpio_8b0 --- Command line option for EIA extension component 'io_gpio_8b0'. --io_gpio_8b0 true - -# io_gpio_8b0_debounce --- Selects the inclusion of Debounce logic --io_gpio_8b0_debounce 1 - -# io_gpio_8b0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_8b0_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_8b1 --- com.arc.hardware.dfss.io_gpio_8b1.1_0 ######## - -# Create io_gpio_8b1 --create com.arc.hardware.dfss.io_gpio_8b1.1_0 System.CPUisle.ARCv2EM.io_gpio_8b1 - -# io_gpio_8b1 --- Command line option for EIA extension component 'io_gpio_8b1'. --io_gpio_8b1 true - -# io_gpio_8b1_debounce --- Selects the inclusion of Debounce logic --io_gpio_8b1_debounce 1 - -# io_gpio_8b1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_8b1_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_8b2 --- com.arc.hardware.dfss.io_gpio_8b2.1_0 ######## - -# Create io_gpio_8b2 --create com.arc.hardware.dfss.io_gpio_8b2.1_0 System.CPUisle.ARCv2EM.io_gpio_8b2 - -# io_gpio_8b2 --- Command line option for EIA extension component 'io_gpio_8b2'. --io_gpio_8b2 true - -# io_gpio_8b2_debounce --- Selects the inclusion of Debounce logic --io_gpio_8b2_debounce 1 - -# io_gpio_8b2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_8b2_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio_8b3 --- com.arc.hardware.dfss.io_gpio_8b3.1_0 ######## - -# Create io_gpio_8b3 --create com.arc.hardware.dfss.io_gpio_8b3.1_0 System.CPUisle.ARCv2EM.io_gpio_8b3 - -# io_gpio_8b3 --- Command line option for EIA extension component 'io_gpio_8b3'. --io_gpio_8b3 true - -# io_gpio_8b3_debounce --- Selects the inclusion of Debounce logic --io_gpio_8b3_debounce 1 - -# io_gpio_8b3_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio_8b3_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_mst0 --- com.arc.hardware.dfss.io_i2c_mst0.1_0 ######## - -# Create io_i2c_mst0 --create com.arc.hardware.dfss.io_i2c_mst0.1_0 System.CPUisle.ARCv2EM.io_i2c_mst0 - -# io_i2c_mst0 --- Command line option for APEX extension component 'io_i2c_mst0'. --io_i2c_mst0 true - -# io_i2c_mst0_fs --- RX/TX FIFO size --io_i2c_mst0_fs 16 - -# io_i2c_mst0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst0_dma_support None - -# io_i2c_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst0_cdc_included 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_mst1 --- com.arc.hardware.dfss.io_i2c_mst1.1_0 ######## - -# Create io_i2c_mst1 --create com.arc.hardware.dfss.io_i2c_mst1.1_0 System.CPUisle.ARCv2EM.io_i2c_mst1 - -# io_i2c_mst1 --- Command line option for APEX extension component 'io_i2c_mst1'. --io_i2c_mst1 true - -# io_i2c_mst1_fs --- RX/TX FIFO size --io_i2c_mst1_fs 16 - -# io_i2c_mst1_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst1_dma_support None - -# io_i2c_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst1_cdc_included 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_mst2 --- com.arc.hardware.dfss.io_i2c_mst2.1_0 ######## - -# Create io_i2c_mst2 --create com.arc.hardware.dfss.io_i2c_mst2.1_0 System.CPUisle.ARCv2EM.io_i2c_mst2 - -# io_i2c_mst2 --- Command line option for APEX extension component 'io_i2c_mst2'. --io_i2c_mst2 true - -# io_i2c_mst2_fs --- RX/TX FIFO size --io_i2c_mst2_fs 16 - -# io_i2c_mst2_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst2_dma_support None - -# io_i2c_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst2_cdc_included 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst0 --- com.arc.hardware.dfss.io_spi_mst0.1_0 ######## - -# Create io_spi_mst0 --create com.arc.hardware.dfss.io_spi_mst0.1_0 System.CPUisle.ARCv2EM.io_spi_mst0 - -# io_spi_mst0 --- Command line option for APEX extension component 'io_spi_mst0'. --io_spi_mst0 true - -# io_spi_mst0_fz --- RX/TX FIFO depth --io_spi_mst0_fs 16 - -# io_spi_mst0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst0_max_xfer_size 16 - -# io_spi_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst0_cdc_included 1 - -# io_spi_mst0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst0_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst1 --- com.arc.hardware.dfss.io_spi_mst1.1_0 ######## - -# Create io_spi_mst1 --create com.arc.hardware.dfss.io_spi_mst1.1_0 System.CPUisle.ARCv2EM.io_spi_mst1 - -# io_spi_mst1 --- Command line option for APEX extension component 'io_spi_mst1'. --io_spi_mst1 true - -# io_spi_mst1_fz --- RX/TX FIFO depth --io_spi_mst1_fs 16 - -# io_spi_mst1_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst1_max_xfer_size 16 - -# io_spi_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst1_cdc_included 1 - -# io_spi_mst1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst1_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst2 --- com.arc.hardware.dfss.io_spi_mst2.1_0 ######## - -# Create io_spi_mst2 --create com.arc.hardware.dfss.io_spi_mst2.1_0 System.CPUisle.ARCv2EM.io_spi_mst2 - -# io_spi_mst2 --- Command line option for APEX extension component 'io_spi_mst2'. --io_spi_mst2 true - -# io_spi_mst2_fz --- RX/TX FIFO depth --io_spi_mst2_fs 16 - -# io_spi_mst2_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst2_max_xfer_size 16 - -# io_spi_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst2_cdc_included 1 - -# io_spi_mst2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst2_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_slv0 --- com.arc.hardware.dfss.io_spi_slv0.1_0 ######## - -# Create io_spi_slv0 --create com.arc.hardware.dfss.io_spi_slv0.1_0 System.CPUisle.ARCv2EM.io_spi_slv0 - -# io_spi_slv0 --- Command line option for APEX extension component 'io_spi_slv0'. --io_spi_slv0 true - -# io_spi_slv0_fz --- RX/TX FIFO depth --io_spi_slv0_fs 16 - -# io_spi_slv0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_slv0_max_xfer_size 16 - -# io_spi_slv0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_slv0_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart0 --- com.arc.hardware.dfss.io_uart0.1_0 ######## - -# Create io_uart0 --create com.arc.hardware.dfss.io_uart0.1_0 System.CPUisle.ARCv2EM.io_uart0 - -# io_uart0 --- Command line option for EIA extension component 'io_uart0'. --io_uart0 true - -# io_uart0_fifo_mode --- Set the UART FIFO mode --io_uart0_fifo_mode 16 - -# io_uart0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart0_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart1 --- com.arc.hardware.dfss.io_uart1.1_0 ######## - -# Create io_uart1 --create com.arc.hardware.dfss.io_uart1.1_0 System.CPUisle.ARCv2EM.io_uart1 - -# io_uart1 --- Command line option for EIA extension component 'io_uart1'. --io_uart1 true - -# io_uart1_fifo_mode --- Set the UART FIFO mode --io_uart1_fifo_mode 16 - -# io_uart1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart1_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart2 --- com.arc.hardware.dfss.io_uart2.1_0 ######## - -# Create io_uart2 --create com.arc.hardware.dfss.io_uart2.1_0 System.CPUisle.ARCv2EM.io_uart2 - -# io_uart2 --- Command line option for EIA extension component 'io_uart2'. --io_uart2 true - -# io_uart2_fifo_mode --- Set the UART FIFO mode --io_uart2_fifo_mode 16 - -# io_uart2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart2_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart3 --- com.arc.hardware.dfss.io_uart3.1_0 ######## - -# Create io_uart3 --create com.arc.hardware.dfss.io_uart3.1_0 System.CPUisle.ARCv2EM.io_uart3 - -# io_uart3 --- Command line option for EIA extension component 'io_uart3'. --io_uart3 true - -# io_uart3_fifo_mode --- Set the UART FIFO mode --io_uart3_fifo_mode 16 - -# io_uart3_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart3_dma_support Aux-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_creg_mst0 --- com.arc.hardware.dfss.io_creg_mst0.1_0 ######## - -# Create io_creg_mst0 --create com.arc.hardware.dfss.io_creg_mst0.1_0 System.CPUisle.ARCv2EM.io_creg_mst0 - -# io_creg_mst0 --- Command line option for EIA extension component 'io_creg_mst0'. --io_creg_mst0 true - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_creg_slv0 --- com.arc.hardware.dfss.io_creg_slv0.1_0 ######## - -# Create io_creg_slv0 --create com.arc.hardware.dfss.io_creg_slv0.1_0 System.CPUisle.ARCv2EM.io_creg_slv0 - -# io_creg_slv0 --- Command line option for EIA extension component 'io_creg_slv0'. --io_creg_slv0 true - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## subsys_bcr --- com.arc.hardware.dfss.subsys_bcr.1_0 ######## - -# Create subsys_bcr --create com.arc.hardware.dfss.subsys_bcr.1_0 System.CPUisle.ARCv2EM.subsys_bcr - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## subsys_infra --- com.arc.hardware.dfss.subsys_infra.1_0 ######## - -# Create subsys_infra --create com.arc.hardware.dfss.subsys_infra.1_0 System.subsys_infra - -# subsys_infra --- Command line option for EIA glue logic. --subsys_infra true - -# internal_interrupt --- Connect the IO interrupts internally --internal_interrupt true - -# internal_dma_handshake --- Connect the DMA handshake signals internally --internal_dma_handshake true - - -######## ARConnect --- com.arc.hardware.ARConnect.1_0 ######## - -# Create ARConnect --create com.arc.hardware.ARConnect.1_0 System.ARConnect - -# mcip_def_div2ref --- This specifies the clock division factor at reset. It is used for mss clock controller to generate ARConnect clock, and the value N means ARConnect is running at (1/N) x ref_clk. --mcip_def_div2ref 1 - -# mcip_has_intrpt --- This specifies whether the Inter-core Interrupt Unit exists --mcip_has_intrpt false - -# mcip_has_sema --- This specifies whether the Inter-core Semaphore Unit exists --mcip_has_sema false - -# mcip_sema_num --- This specifies the number of semaphores in the Inter-core Semaphores Unit --mcip_sema_num 16 - -# mcip_has_msg_sram --- This specifies whether the Inter-core Message Unit exists --mcip_has_msg_sram false - -# mcip_msg_sram_size --- This specifies the bytes of SRAM in the Inter-core Message Unit --mcip_msg_sram_size 512 - -# mcip_msg_1cycle --- True: The access path to message SRAM is 1 clock cycle; False: The access path to message SRAM 1.5 cycles. Note: The 1.5 cycles path use clock negetive edge for SRAM, but can acheive higher frequency. No performance difference caused by the value of this option --mcip_msg_1cycle false - -# mcip_has_debug --- This specifies whether the Inter-core Debug Unit exists --mcip_has_debug false - -# mcip_has_grtc --- This specifies whether the Global Real-Time Counter Unit exists --mcip_has_grtc false - -# mcip_has_pmu --- This specifies whether the external Power Management Unit exists --mcip_has_pmu true - -# mcip_power_domains --- This specifies whether the ARConnect Power Domain Management Unit exists --mcip_power_domains true - -# mcip_llm_size --- This specifies the KBytes of SRAM in the Low Latency Memory Unit --mcip_llm_size 32 - -# mcip_llm_base --- This specifies the default memory region of Low Latency Memory Unit --mcip_llm_base 2 - -# mcip_llm_ecc --- This specifies the ECC mode of SRAM in Low Latency Memory Unit. none = No checking; parity = Parity only; SECDED = single-error correction and double-error detection (SECDED) --mcip_llm_ecc SECDED - -# mcip_idu_cirq_num --- This specifies the number of common interrupts supported by IDU --mcip_idu_cirq_num 4 - -# mcip_bsu_dbw --- This specifies the data bus width of Bus Slave Unit --mcip_bsu_dbw 64 - -# mcip_bsu_type --- This specifies the bus protocol of Bus Slave Unit --mcip_bsu_type AXI - - -]]> - - - - - - - - - - - - - - - ICCM0 - - GROUP: { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:32768): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > DCCM - GROUP: { - .Xdata? : {} - } > XCCM - GROUP: { - .Ydata? 
: {} - } > YCCM - GROUP BIND(0x0): { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:684): {} = FILL(0xa5a5a5a5,4) - } - } - -]]> - - - - - - 0x07, sub_opcode => 0x1E , latency_cycles => 8) - -// User extension instruction - dsp_sin -extern long dsp_sin(long); -#pragma intrinsic(dsp_sin, opcode => 0x07, sub_opcode => 0x1F , latency_cycles => 8) - -// User extension instruction - dsp_tan -extern long dsp_tan(long); -#pragma intrinsic(dsp_tan, opcode => 0x07, sub_opcode => 0x22 , latency_cycles => 11) - -// User extension instruction - dsp_acos -extern long dsp_acos(long); -#pragma intrinsic(dsp_acos, opcode => 0x07, sub_opcode => 0x23 , latency_cycles => 31) - -// User extension instruction - dsp_asin -extern long dsp_asin(long); -#pragma intrinsic(dsp_asin, opcode => 0x07, sub_opcode => 0x24 , latency_cycles => 31) - -// User extension instruction - dsp_atan -extern long dsp_atan(long); -#pragma intrinsic(dsp_atan, opcode => 0x07, sub_opcode => 0x25 , latency_cycles => 13) - -// User extension instruction - dsp_sqrt -extern long dsp_sqrt(long); -#pragma intrinsic(dsp_sqrt, opcode => 0x07, sub_opcode => 0x20 , latency_cycles => 31) - -// User extension instruction - dsp_sqrt15 -extern long dsp_sqrt15(long); -#pragma intrinsic(dsp_sqrt15, opcode => 0x07, sub_opcode => 0x21 , latency_cycles => 15) - -#define APEX_COM_ARC_HARDWARE_DFSS_DSP_TRIG_PRESENT 1 -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B0_IO_GPIO_4B0_PRESENT 1 - -// User extension aux register io_gpio_4b0_debounce -#define AR_IO_GPIO_4B0_DEBOUNCE 0x80017c48 -#pragma Aux_register(0x80017c48, name=>"io_gpio_4b0_debounce") - -// User extension aux register io_gpio_4b0_clken -#define AR_IO_GPIO_4B0_CLKEN 0x80017c80 -#pragma Aux_register(0x80017c80, name=>"io_gpio_4b0_clken") - -// User extension aux register io_gpio_4b0_swporta_dr -#define AR_IO_GPIO_4B0_SWPORTA_DR 0x80017c00 -#pragma Aux_register(0x80017c00, name=>"io_gpio_4b0_swporta_dr") - -// User extension aux register io_gpio_4b0_swporta_ddr -#define AR_IO_GPIO_4B0_SWPORTA_DDR 0x80017c04 -#pragma Aux_register(0x80017c04, name=>"io_gpio_4b0_swporta_ddr") - -// User extension aux register io_gpio_4b0_inten -#define AR_IO_GPIO_4B0_INTEN 0x80017c30 -#pragma Aux_register(0x80017c30, name=>"io_gpio_4b0_inten") - -// User extension aux register io_gpio_4b0_intmask -#define AR_IO_GPIO_4B0_INTMASK 0x80017c34 -#pragma Aux_register(0x80017c34, name=>"io_gpio_4b0_intmask") - -// User extension aux register io_gpio_4b0_inttype_level -#define AR_IO_GPIO_4B0_INTTYPE_LEVEL 0x80017c38 -#pragma Aux_register(0x80017c38, name=>"io_gpio_4b0_inttype_level") - -// User extension aux register io_gpio_4b0_int_polarity -#define AR_IO_GPIO_4B0_INT_POLARITY 0x80017c3c -#pragma Aux_register(0x80017c3c, name=>"io_gpio_4b0_int_polarity") - -// User extension aux register io_gpio_4b0_intstatus -#define AR_IO_GPIO_4B0_INTSTATUS 0x80017c40 -#pragma Aux_register(0x80017c40, name=>"io_gpio_4b0_intstatus") - -// User extension aux register io_gpio_4b0_raw_intstatus -#define AR_IO_GPIO_4B0_RAW_INTSTATUS 0x80017c44 -#pragma Aux_register(0x80017c44, name=>"io_gpio_4b0_raw_intstatus") - -// User extension aux register io_gpio_4b0_porta_eoi -#define AR_IO_GPIO_4B0_PORTA_EOI 0x80017c4c -#pragma Aux_register(0x80017c4c, name=>"io_gpio_4b0_porta_eoi") - -// User extension aux register io_gpio_4b0_ext_porta -#define AR_IO_GPIO_4B0_EXT_PORTA 0x80017c50 -#pragma Aux_register(0x80017c50, name=>"io_gpio_4b0_ext_porta") - -// User extension aux register io_gpio_4b0_ls_sync -#define AR_IO_GPIO_4B0_LS_SYNC 0x80017c60 -#pragma 
Aux_register(0x80017c60, name=>"io_gpio_4b0_ls_sync") - -// User extension aux register io_gpio_4b0_int_bothedge -#define AR_IO_GPIO_4B0_INT_BOTHEDGE 0x80017c68 -#pragma Aux_register(0x80017c68, name=>"io_gpio_4b0_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B1_IO_GPIO_4B1_PRESENT 1 - -// User extension aux register io_gpio_4b1_debounce -#define AR_IO_GPIO_4B1_DEBOUNCE 0x80017d48 -#pragma Aux_register(0x80017d48, name=>"io_gpio_4b1_debounce") - -// User extension aux register io_gpio_4b1_clken -#define AR_IO_GPIO_4B1_CLKEN 0x80017d80 -#pragma Aux_register(0x80017d80, name=>"io_gpio_4b1_clken") - -// User extension aux register io_gpio_4b1_swporta_dr -#define AR_IO_GPIO_4B1_SWPORTA_DR 0x80017d00 -#pragma Aux_register(0x80017d00, name=>"io_gpio_4b1_swporta_dr") - -// User extension aux register io_gpio_4b1_swporta_ddr -#define AR_IO_GPIO_4B1_SWPORTA_DDR 0x80017d04 -#pragma Aux_register(0x80017d04, name=>"io_gpio_4b1_swporta_ddr") - -// User extension aux register io_gpio_4b1_inten -#define AR_IO_GPIO_4B1_INTEN 0x80017d30 -#pragma Aux_register(0x80017d30, name=>"io_gpio_4b1_inten") - -// User extension aux register io_gpio_4b1_intmask -#define AR_IO_GPIO_4B1_INTMASK 0x80017d34 -#pragma Aux_register(0x80017d34, name=>"io_gpio_4b1_intmask") - -// User extension aux register io_gpio_4b1_inttype_level -#define AR_IO_GPIO_4B1_INTTYPE_LEVEL 0x80017d38 -#pragma Aux_register(0x80017d38, name=>"io_gpio_4b1_inttype_level") - -// User extension aux register io_gpio_4b1_int_polarity -#define AR_IO_GPIO_4B1_INT_POLARITY 0x80017d3c -#pragma Aux_register(0x80017d3c, name=>"io_gpio_4b1_int_polarity") - -// User extension aux register io_gpio_4b1_intstatus -#define AR_IO_GPIO_4B1_INTSTATUS 0x80017d40 -#pragma Aux_register(0x80017d40, name=>"io_gpio_4b1_intstatus") - -// User extension aux register io_gpio_4b1_raw_intstatus -#define AR_IO_GPIO_4B1_RAW_INTSTATUS 0x80017d44 -#pragma Aux_register(0x80017d44, name=>"io_gpio_4b1_raw_intstatus") - -// User extension aux register io_gpio_4b1_porta_eoi -#define AR_IO_GPIO_4B1_PORTA_EOI 0x80017d4c -#pragma Aux_register(0x80017d4c, name=>"io_gpio_4b1_porta_eoi") - -// User extension aux register io_gpio_4b1_ext_porta -#define AR_IO_GPIO_4B1_EXT_PORTA 0x80017d50 -#pragma Aux_register(0x80017d50, name=>"io_gpio_4b1_ext_porta") - -// User extension aux register io_gpio_4b1_ls_sync -#define AR_IO_GPIO_4B1_LS_SYNC 0x80017d60 -#pragma Aux_register(0x80017d60, name=>"io_gpio_4b1_ls_sync") - -// User extension aux register io_gpio_4b1_int_bothedge -#define AR_IO_GPIO_4B1_INT_BOTHEDGE 0x80017d68 -#pragma Aux_register(0x80017d68, name=>"io_gpio_4b1_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_4B2_IO_GPIO_4B2_PRESENT 1 - -// User extension aux register io_gpio_4b2_debounce -#define AR_IO_GPIO_4B2_DEBOUNCE 0x80017e48 -#pragma Aux_register(0x80017e48, name=>"io_gpio_4b2_debounce") - -// User extension aux register io_gpio_4b2_clken -#define AR_IO_GPIO_4B2_CLKEN 0x80017e80 -#pragma Aux_register(0x80017e80, name=>"io_gpio_4b2_clken") - -// User extension aux register io_gpio_4b2_swporta_dr -#define AR_IO_GPIO_4B2_SWPORTA_DR 0x80017e00 -#pragma Aux_register(0x80017e00, name=>"io_gpio_4b2_swporta_dr") - -// User extension aux register io_gpio_4b2_swporta_ddr -#define AR_IO_GPIO_4B2_SWPORTA_DDR 0x80017e04 -#pragma Aux_register(0x80017e04, name=>"io_gpio_4b2_swporta_ddr") - -// User extension aux register io_gpio_4b2_inten -#define AR_IO_GPIO_4B2_INTEN 0x80017e30 -#pragma Aux_register(0x80017e30, name=>"io_gpio_4b2_inten") - -// User extension aux register 
io_gpio_4b2_intmask -#define AR_IO_GPIO_4B2_INTMASK 0x80017e34 -#pragma Aux_register(0x80017e34, name=>"io_gpio_4b2_intmask") - -// User extension aux register io_gpio_4b2_inttype_level -#define AR_IO_GPIO_4B2_INTTYPE_LEVEL 0x80017e38 -#pragma Aux_register(0x80017e38, name=>"io_gpio_4b2_inttype_level") - -// User extension aux register io_gpio_4b2_int_polarity -#define AR_IO_GPIO_4B2_INT_POLARITY 0x80017e3c -#pragma Aux_register(0x80017e3c, name=>"io_gpio_4b2_int_polarity") - -// User extension aux register io_gpio_4b2_intstatus -#define AR_IO_GPIO_4B2_INTSTATUS 0x80017e40 -#pragma Aux_register(0x80017e40, name=>"io_gpio_4b2_intstatus") - -// User extension aux register io_gpio_4b2_raw_intstatus -#define AR_IO_GPIO_4B2_RAW_INTSTATUS 0x80017e44 -#pragma Aux_register(0x80017e44, name=>"io_gpio_4b2_raw_intstatus") - -// User extension aux register io_gpio_4b2_porta_eoi -#define AR_IO_GPIO_4B2_PORTA_EOI 0x80017e4c -#pragma Aux_register(0x80017e4c, name=>"io_gpio_4b2_porta_eoi") - -// User extension aux register io_gpio_4b2_ext_porta -#define AR_IO_GPIO_4B2_EXT_PORTA 0x80017e50 -#pragma Aux_register(0x80017e50, name=>"io_gpio_4b2_ext_porta") - -// User extension aux register io_gpio_4b2_ls_sync -#define AR_IO_GPIO_4B2_LS_SYNC 0x80017e60 -#pragma Aux_register(0x80017e60, name=>"io_gpio_4b2_ls_sync") - -// User extension aux register io_gpio_4b2_int_bothedge -#define AR_IO_GPIO_4B2_INT_BOTHEDGE 0x80017e68 -#pragma Aux_register(0x80017e68, name=>"io_gpio_4b2_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B0_IO_GPIO_8B0_PRESENT 1 - -// User extension aux register io_gpio_8b0_debounce -#define AR_IO_GPIO_8B0_DEBOUNCE 0x80017848 -#pragma Aux_register(0x80017848, name=>"io_gpio_8b0_debounce") - -// User extension aux register io_gpio_8b0_clken -#define AR_IO_GPIO_8B0_CLKEN 0x80017880 -#pragma Aux_register(0x80017880, name=>"io_gpio_8b0_clken") - -// User extension aux register io_gpio_8b0_swporta_dr -#define AR_IO_GPIO_8B0_SWPORTA_DR 0x80017800 -#pragma Aux_register(0x80017800, name=>"io_gpio_8b0_swporta_dr") - -// User extension aux register io_gpio_8b0_swporta_ddr -#define AR_IO_GPIO_8B0_SWPORTA_DDR 0x80017804 -#pragma Aux_register(0x80017804, name=>"io_gpio_8b0_swporta_ddr") - -// User extension aux register io_gpio_8b0_inten -#define AR_IO_GPIO_8B0_INTEN 0x80017830 -#pragma Aux_register(0x80017830, name=>"io_gpio_8b0_inten") - -// User extension aux register io_gpio_8b0_intmask -#define AR_IO_GPIO_8B0_INTMASK 0x80017834 -#pragma Aux_register(0x80017834, name=>"io_gpio_8b0_intmask") - -// User extension aux register io_gpio_8b0_inttype_level -#define AR_IO_GPIO_8B0_INTTYPE_LEVEL 0x80017838 -#pragma Aux_register(0x80017838, name=>"io_gpio_8b0_inttype_level") - -// User extension aux register io_gpio_8b0_int_polarity -#define AR_IO_GPIO_8B0_INT_POLARITY 0x8001783c -#pragma Aux_register(0x8001783c, name=>"io_gpio_8b0_int_polarity") - -// User extension aux register io_gpio_8b0_intstatus -#define AR_IO_GPIO_8B0_INTSTATUS 0x80017840 -#pragma Aux_register(0x80017840, name=>"io_gpio_8b0_intstatus") - -// User extension aux register io_gpio_8b0_raw_intstatus -#define AR_IO_GPIO_8B0_RAW_INTSTATUS 0x80017844 -#pragma Aux_register(0x80017844, name=>"io_gpio_8b0_raw_intstatus") - -// User extension aux register io_gpio_8b0_porta_eoi -#define AR_IO_GPIO_8B0_PORTA_EOI 0x8001784c -#pragma Aux_register(0x8001784c, name=>"io_gpio_8b0_porta_eoi") - -// User extension aux register io_gpio_8b0_ext_porta -#define AR_IO_GPIO_8B0_EXT_PORTA 0x80017850 -#pragma Aux_register(0x80017850, 
name=>"io_gpio_8b0_ext_porta") - -// User extension aux register io_gpio_8b0_ls_sync -#define AR_IO_GPIO_8B0_LS_SYNC 0x80017860 -#pragma Aux_register(0x80017860, name=>"io_gpio_8b0_ls_sync") - -// User extension aux register io_gpio_8b0_int_bothedge -#define AR_IO_GPIO_8B0_INT_BOTHEDGE 0x80017868 -#pragma Aux_register(0x80017868, name=>"io_gpio_8b0_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B1_IO_GPIO_8B1_PRESENT 1 - -// User extension aux register io_gpio_8b1_debounce -#define AR_IO_GPIO_8B1_DEBOUNCE 0x80017948 -#pragma Aux_register(0x80017948, name=>"io_gpio_8b1_debounce") - -// User extension aux register io_gpio_8b1_clken -#define AR_IO_GPIO_8B1_CLKEN 0x80017980 -#pragma Aux_register(0x80017980, name=>"io_gpio_8b1_clken") - -// User extension aux register io_gpio_8b1_swporta_dr -#define AR_IO_GPIO_8B1_SWPORTA_DR 0x80017900 -#pragma Aux_register(0x80017900, name=>"io_gpio_8b1_swporta_dr") - -// User extension aux register io_gpio_8b1_swporta_ddr -#define AR_IO_GPIO_8B1_SWPORTA_DDR 0x80017904 -#pragma Aux_register(0x80017904, name=>"io_gpio_8b1_swporta_ddr") - -// User extension aux register io_gpio_8b1_inten -#define AR_IO_GPIO_8B1_INTEN 0x80017930 -#pragma Aux_register(0x80017930, name=>"io_gpio_8b1_inten") - -// User extension aux register io_gpio_8b1_intmask -#define AR_IO_GPIO_8B1_INTMASK 0x80017934 -#pragma Aux_register(0x80017934, name=>"io_gpio_8b1_intmask") - -// User extension aux register io_gpio_8b1_inttype_level -#define AR_IO_GPIO_8B1_INTTYPE_LEVEL 0x80017938 -#pragma Aux_register(0x80017938, name=>"io_gpio_8b1_inttype_level") - -// User extension aux register io_gpio_8b1_int_polarity -#define AR_IO_GPIO_8B1_INT_POLARITY 0x8001793c -#pragma Aux_register(0x8001793c, name=>"io_gpio_8b1_int_polarity") - -// User extension aux register io_gpio_8b1_intstatus -#define AR_IO_GPIO_8B1_INTSTATUS 0x80017940 -#pragma Aux_register(0x80017940, name=>"io_gpio_8b1_intstatus") - -// User extension aux register io_gpio_8b1_raw_intstatus -#define AR_IO_GPIO_8B1_RAW_INTSTATUS 0x80017944 -#pragma Aux_register(0x80017944, name=>"io_gpio_8b1_raw_intstatus") - -// User extension aux register io_gpio_8b1_porta_eoi -#define AR_IO_GPIO_8B1_PORTA_EOI 0x8001794c -#pragma Aux_register(0x8001794c, name=>"io_gpio_8b1_porta_eoi") - -// User extension aux register io_gpio_8b1_ext_porta -#define AR_IO_GPIO_8B1_EXT_PORTA 0x80017950 -#pragma Aux_register(0x80017950, name=>"io_gpio_8b1_ext_porta") - -// User extension aux register io_gpio_8b1_ls_sync -#define AR_IO_GPIO_8B1_LS_SYNC 0x80017960 -#pragma Aux_register(0x80017960, name=>"io_gpio_8b1_ls_sync") - -// User extension aux register io_gpio_8b1_int_bothedge -#define AR_IO_GPIO_8B1_INT_BOTHEDGE 0x80017968 -#pragma Aux_register(0x80017968, name=>"io_gpio_8b1_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B2_IO_GPIO_8B2_PRESENT 1 - -// User extension aux register io_gpio_8b2_debounce -#define AR_IO_GPIO_8B2_DEBOUNCE 0x80017a48 -#pragma Aux_register(0x80017a48, name=>"io_gpio_8b2_debounce") - -// User extension aux register io_gpio_8b2_clken -#define AR_IO_GPIO_8B2_CLKEN 0x80017a80 -#pragma Aux_register(0x80017a80, name=>"io_gpio_8b2_clken") - -// User extension aux register io_gpio_8b2_swporta_dr -#define AR_IO_GPIO_8B2_SWPORTA_DR 0x80017a00 -#pragma Aux_register(0x80017a00, name=>"io_gpio_8b2_swporta_dr") - -// User extension aux register io_gpio_8b2_swporta_ddr -#define AR_IO_GPIO_8B2_SWPORTA_DDR 0x80017a04 -#pragma Aux_register(0x80017a04, name=>"io_gpio_8b2_swporta_ddr") - -// User extension aux register io_gpio_8b2_inten 
-#define AR_IO_GPIO_8B2_INTEN 0x80017a30 -#pragma Aux_register(0x80017a30, name=>"io_gpio_8b2_inten") - -// User extension aux register io_gpio_8b2_intmask -#define AR_IO_GPIO_8B2_INTMASK 0x80017a34 -#pragma Aux_register(0x80017a34, name=>"io_gpio_8b2_intmask") - -// User extension aux register io_gpio_8b2_inttype_level -#define AR_IO_GPIO_8B2_INTTYPE_LEVEL 0x80017a38 -#pragma Aux_register(0x80017a38, name=>"io_gpio_8b2_inttype_level") - -// User extension aux register io_gpio_8b2_int_polarity -#define AR_IO_GPIO_8B2_INT_POLARITY 0x80017a3c -#pragma Aux_register(0x80017a3c, name=>"io_gpio_8b2_int_polarity") - -// User extension aux register io_gpio_8b2_intstatus -#define AR_IO_GPIO_8B2_INTSTATUS 0x80017a40 -#pragma Aux_register(0x80017a40, name=>"io_gpio_8b2_intstatus") - -// User extension aux register io_gpio_8b2_raw_intstatus -#define AR_IO_GPIO_8B2_RAW_INTSTATUS 0x80017a44 -#pragma Aux_register(0x80017a44, name=>"io_gpio_8b2_raw_intstatus") - -// User extension aux register io_gpio_8b2_porta_eoi -#define AR_IO_GPIO_8B2_PORTA_EOI 0x80017a4c -#pragma Aux_register(0x80017a4c, name=>"io_gpio_8b2_porta_eoi") - -// User extension aux register io_gpio_8b2_ext_porta -#define AR_IO_GPIO_8B2_EXT_PORTA 0x80017a50 -#pragma Aux_register(0x80017a50, name=>"io_gpio_8b2_ext_porta") - -// User extension aux register io_gpio_8b2_ls_sync -#define AR_IO_GPIO_8B2_LS_SYNC 0x80017a60 -#pragma Aux_register(0x80017a60, name=>"io_gpio_8b2_ls_sync") - -// User extension aux register io_gpio_8b2_int_bothedge -#define AR_IO_GPIO_8B2_INT_BOTHEDGE 0x80017a68 -#pragma Aux_register(0x80017a68, name=>"io_gpio_8b2_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO_8B3_IO_GPIO_8B3_PRESENT 1 - -// User extension aux register io_gpio_8b3_debounce -#define AR_IO_GPIO_8B3_DEBOUNCE 0x80017b48 -#pragma Aux_register(0x80017b48, name=>"io_gpio_8b3_debounce") - -// User extension aux register io_gpio_8b3_clken -#define AR_IO_GPIO_8B3_CLKEN 0x80017b80 -#pragma Aux_register(0x80017b80, name=>"io_gpio_8b3_clken") - -// User extension aux register io_gpio_8b3_swporta_dr -#define AR_IO_GPIO_8B3_SWPORTA_DR 0x80017b00 -#pragma Aux_register(0x80017b00, name=>"io_gpio_8b3_swporta_dr") - -// User extension aux register io_gpio_8b3_swporta_ddr -#define AR_IO_GPIO_8B3_SWPORTA_DDR 0x80017b04 -#pragma Aux_register(0x80017b04, name=>"io_gpio_8b3_swporta_ddr") - -// User extension aux register io_gpio_8b3_inten -#define AR_IO_GPIO_8B3_INTEN 0x80017b30 -#pragma Aux_register(0x80017b30, name=>"io_gpio_8b3_inten") - -// User extension aux register io_gpio_8b3_intmask -#define AR_IO_GPIO_8B3_INTMASK 0x80017b34 -#pragma Aux_register(0x80017b34, name=>"io_gpio_8b3_intmask") - -// User extension aux register io_gpio_8b3_inttype_level -#define AR_IO_GPIO_8B3_INTTYPE_LEVEL 0x80017b38 -#pragma Aux_register(0x80017b38, name=>"io_gpio_8b3_inttype_level") - -// User extension aux register io_gpio_8b3_int_polarity -#define AR_IO_GPIO_8B3_INT_POLARITY 0x80017b3c -#pragma Aux_register(0x80017b3c, name=>"io_gpio_8b3_int_polarity") - -// User extension aux register io_gpio_8b3_intstatus -#define AR_IO_GPIO_8B3_INTSTATUS 0x80017b40 -#pragma Aux_register(0x80017b40, name=>"io_gpio_8b3_intstatus") - -// User extension aux register io_gpio_8b3_raw_intstatus -#define AR_IO_GPIO_8B3_RAW_INTSTATUS 0x80017b44 -#pragma Aux_register(0x80017b44, name=>"io_gpio_8b3_raw_intstatus") - -// User extension aux register io_gpio_8b3_porta_eoi -#define AR_IO_GPIO_8B3_PORTA_EOI 0x80017b4c -#pragma Aux_register(0x80017b4c, name=>"io_gpio_8b3_porta_eoi") - -// User extension 
aux register io_gpio_8b3_ext_porta -#define AR_IO_GPIO_8B3_EXT_PORTA 0x80017b50 -#pragma Aux_register(0x80017b50, name=>"io_gpio_8b3_ext_porta") - -// User extension aux register io_gpio_8b3_ls_sync -#define AR_IO_GPIO_8B3_LS_SYNC 0x80017b60 -#pragma Aux_register(0x80017b60, name=>"io_gpio_8b3_ls_sync") - -// User extension aux register io_gpio_8b3_int_bothedge -#define AR_IO_GPIO_8B3_INT_BOTHEDGE 0x80017b68 -#pragma Aux_register(0x80017b68, name=>"io_gpio_8b3_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST0_IO_I2C_MST0_PRESENT 1 - -// User extension aux register io_i2c_mst0_clken -#define AR_IO_I2C_MST0_CLKEN 0x800120c0 -#pragma Aux_register(0x800120c0, name=>"io_i2c_mst0_clken") - -// User extension aux register io_i2c_mst0_con -#define AR_IO_I2C_MST0_CON 0x80012000 -#pragma Aux_register(0x80012000, name=>"io_i2c_mst0_con") - -// User extension aux register io_i2c_mst0_tar -#define AR_IO_I2C_MST0_TAR 0x80012004 -#pragma Aux_register(0x80012004, name=>"io_i2c_mst0_tar") - -// User extension aux register io_i2c_mst0_data_cmd -#define AR_IO_I2C_MST0_DATA_CMD 0x80012010 -#pragma Aux_register(0x80012010, name=>"io_i2c_mst0_data_cmd") - -// User extension aux register io_i2c_mst0_ss_scl_hcnt -#define AR_IO_I2C_MST0_SS_SCL_HCNT 0x80012014 -#pragma Aux_register(0x80012014, name=>"io_i2c_mst0_ss_scl_hcnt") - -// User extension aux register io_i2c_mst0_ss_scl_lcnt -#define AR_IO_I2C_MST0_SS_SCL_LCNT 0x80012018 -#pragma Aux_register(0x80012018, name=>"io_i2c_mst0_ss_scl_lcnt") - -// User extension aux register io_i2c_mst0_fs_scl_hcnt -#define AR_IO_I2C_MST0_FS_SCL_HCNT 0x8001201c -#pragma Aux_register(0x8001201c, name=>"io_i2c_mst0_fs_scl_hcnt") - -// User extension aux register io_i2c_mst0_fs_scl_lcnt -#define AR_IO_I2C_MST0_FS_SCL_LCNT 0x80012020 -#pragma Aux_register(0x80012020, name=>"io_i2c_mst0_fs_scl_lcnt") - -// User extension aux register io_i2c_mst0_intr_stat -#define AR_IO_I2C_MST0_INTR_STAT 0x8001202c -#pragma Aux_register(0x8001202c, name=>"io_i2c_mst0_intr_stat") - -// User extension aux register io_i2c_mst0_intr_mask -#define AR_IO_I2C_MST0_INTR_MASK 0x80012030 -#pragma Aux_register(0x80012030, name=>"io_i2c_mst0_intr_mask") - -// User extension aux register io_i2c_mst0_raw_intr_stat -#define AR_IO_I2C_MST0_RAW_INTR_STAT 0x80012034 -#pragma Aux_register(0x80012034, name=>"io_i2c_mst0_raw_intr_stat") - -// User extension aux register io_i2c_mst0_rx_tl -#define AR_IO_I2C_MST0_RX_TL 0x80012038 -#pragma Aux_register(0x80012038, name=>"io_i2c_mst0_rx_tl") - -// User extension aux register io_i2c_mst0_tx_tl -#define AR_IO_I2C_MST0_TX_TL 0x8001203c -#pragma Aux_register(0x8001203c, name=>"io_i2c_mst0_tx_tl") - -// User extension aux register io_i2c_mst0_clr_intr -#define AR_IO_I2C_MST0_CLR_INTR 0x80012040 -#pragma Aux_register(0x80012040, name=>"io_i2c_mst0_clr_intr") - -// User extension aux register io_i2c_mst0_clr_rx_under -#define AR_IO_I2C_MST0_CLR_RX_UNDER 0x80012044 -#pragma Aux_register(0x80012044, name=>"io_i2c_mst0_clr_rx_under") - -// User extension aux register io_i2c_mst0_clr_rx_over -#define AR_IO_I2C_MST0_CLR_RX_OVER 0x80012048 -#pragma Aux_register(0x80012048, name=>"io_i2c_mst0_clr_rx_over") - -// User extension aux register io_i2c_mst0_clr_tx_over -#define AR_IO_I2C_MST0_CLR_TX_OVER 0x8001204c -#pragma Aux_register(0x8001204c, name=>"io_i2c_mst0_clr_tx_over") - -// User extension aux register io_i2c_mst0_clr_tx_abrt -#define AR_IO_I2C_MST0_CLR_TX_ABRT 0x80012054 -#pragma Aux_register(0x80012054, name=>"io_i2c_mst0_clr_tx_abrt") - -// User extension aux 
register io_i2c_mst0_clr_activity -#define AR_IO_I2C_MST0_CLR_ACTIVITY 0x8001205c -#pragma Aux_register(0x8001205c, name=>"io_i2c_mst0_clr_activity") - -// User extension aux register io_i2c_mst0_clr_stop_det -#define AR_IO_I2C_MST0_CLR_STOP_DET 0x80012060 -#pragma Aux_register(0x80012060, name=>"io_i2c_mst0_clr_stop_det") - -// User extension aux register io_i2c_mst0_clr_start_det -#define AR_IO_I2C_MST0_CLR_START_DET 0x80012064 -#pragma Aux_register(0x80012064, name=>"io_i2c_mst0_clr_start_det") - -// User extension aux register io_i2c_mst0_enable -#define AR_IO_I2C_MST0_ENABLE 0x8001206c -#pragma Aux_register(0x8001206c, name=>"io_i2c_mst0_enable") - -// User extension aux register io_i2c_mst0_status -#define AR_IO_I2C_MST0_STATUS 0x80012070 -#pragma Aux_register(0x80012070, name=>"io_i2c_mst0_status") - -// User extension aux register io_i2c_mst0_txflr -#define AR_IO_I2C_MST0_TXFLR 0x80012074 -#pragma Aux_register(0x80012074, name=>"io_i2c_mst0_txflr") - -// User extension aux register io_i2c_mst0_rxflr -#define AR_IO_I2C_MST0_RXFLR 0x80012078 -#pragma Aux_register(0x80012078, name=>"io_i2c_mst0_rxflr") - -// User extension aux register io_i2c_mst0_sda_hold -#define AR_IO_I2C_MST0_SDA_HOLD 0x8001207c -#pragma Aux_register(0x8001207c, name=>"io_i2c_mst0_sda_hold") - -// User extension aux register io_i2c_mst0_tx_abrt_source -#define AR_IO_I2C_MST0_TX_ABRT_SOURCE 0x80012080 -#pragma Aux_register(0x80012080, name=>"io_i2c_mst0_tx_abrt_source") - -// User extension aux register io_i2c_mst0_enable_status -#define AR_IO_I2C_MST0_ENABLE_STATUS 0x8001209c -#pragma Aux_register(0x8001209c, name=>"io_i2c_mst0_enable_status") - -// User extension aux register io_i2c_mst0_fs_spklen -#define AR_IO_I2C_MST0_FS_SPKLEN 0x800120a0 -#pragma Aux_register(0x800120a0, name=>"io_i2c_mst0_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST1_IO_I2C_MST1_PRESENT 1 - -// User extension aux register io_i2c_mst1_clken -#define AR_IO_I2C_MST1_CLKEN 0x800121c0 -#pragma Aux_register(0x800121c0, name=>"io_i2c_mst1_clken") - -// User extension aux register io_i2c_mst1_con -#define AR_IO_I2C_MST1_CON 0x80012100 -#pragma Aux_register(0x80012100, name=>"io_i2c_mst1_con") - -// User extension aux register io_i2c_mst1_tar -#define AR_IO_I2C_MST1_TAR 0x80012104 -#pragma Aux_register(0x80012104, name=>"io_i2c_mst1_tar") - -// User extension aux register io_i2c_mst1_data_cmd -#define AR_IO_I2C_MST1_DATA_CMD 0x80012110 -#pragma Aux_register(0x80012110, name=>"io_i2c_mst1_data_cmd") - -// User extension aux register io_i2c_mst1_ss_scl_hcnt -#define AR_IO_I2C_MST1_SS_SCL_HCNT 0x80012114 -#pragma Aux_register(0x80012114, name=>"io_i2c_mst1_ss_scl_hcnt") - -// User extension aux register io_i2c_mst1_ss_scl_lcnt -#define AR_IO_I2C_MST1_SS_SCL_LCNT 0x80012118 -#pragma Aux_register(0x80012118, name=>"io_i2c_mst1_ss_scl_lcnt") - -// User extension aux register io_i2c_mst1_fs_scl_hcnt -#define AR_IO_I2C_MST1_FS_SCL_HCNT 0x8001211c -#pragma Aux_register(0x8001211c, name=>"io_i2c_mst1_fs_scl_hcnt") - -// User extension aux register io_i2c_mst1_fs_scl_lcnt -#define AR_IO_I2C_MST1_FS_SCL_LCNT 0x80012120 -#pragma Aux_register(0x80012120, name=>"io_i2c_mst1_fs_scl_lcnt") - -// User extension aux register io_i2c_mst1_intr_stat -#define AR_IO_I2C_MST1_INTR_STAT 0x8001212c -#pragma Aux_register(0x8001212c, name=>"io_i2c_mst1_intr_stat") - -// User extension aux register io_i2c_mst1_intr_mask -#define AR_IO_I2C_MST1_INTR_MASK 0x80012130 -#pragma Aux_register(0x80012130, name=>"io_i2c_mst1_intr_mask") - -// User extension aux register 
io_i2c_mst1_raw_intr_stat -#define AR_IO_I2C_MST1_RAW_INTR_STAT 0x80012134 -#pragma Aux_register(0x80012134, name=>"io_i2c_mst1_raw_intr_stat") - -// User extension aux register io_i2c_mst1_rx_tl -#define AR_IO_I2C_MST1_RX_TL 0x80012138 -#pragma Aux_register(0x80012138, name=>"io_i2c_mst1_rx_tl") - -// User extension aux register io_i2c_mst1_tx_tl -#define AR_IO_I2C_MST1_TX_TL 0x8001213c -#pragma Aux_register(0x8001213c, name=>"io_i2c_mst1_tx_tl") - -// User extension aux register io_i2c_mst1_clr_intr -#define AR_IO_I2C_MST1_CLR_INTR 0x80012140 -#pragma Aux_register(0x80012140, name=>"io_i2c_mst1_clr_intr") - -// User extension aux register io_i2c_mst1_clr_rx_under -#define AR_IO_I2C_MST1_CLR_RX_UNDER 0x80012144 -#pragma Aux_register(0x80012144, name=>"io_i2c_mst1_clr_rx_under") - -// User extension aux register io_i2c_mst1_clr_rx_over -#define AR_IO_I2C_MST1_CLR_RX_OVER 0x80012148 -#pragma Aux_register(0x80012148, name=>"io_i2c_mst1_clr_rx_over") - -// User extension aux register io_i2c_mst1_clr_tx_over -#define AR_IO_I2C_MST1_CLR_TX_OVER 0x8001214c -#pragma Aux_register(0x8001214c, name=>"io_i2c_mst1_clr_tx_over") - -// User extension aux register io_i2c_mst1_clr_tx_abrt -#define AR_IO_I2C_MST1_CLR_TX_ABRT 0x80012154 -#pragma Aux_register(0x80012154, name=>"io_i2c_mst1_clr_tx_abrt") - -// User extension aux register io_i2c_mst1_clr_activity -#define AR_IO_I2C_MST1_CLR_ACTIVITY 0x8001215c -#pragma Aux_register(0x8001215c, name=>"io_i2c_mst1_clr_activity") - -// User extension aux register io_i2c_mst1_clr_stop_det -#define AR_IO_I2C_MST1_CLR_STOP_DET 0x80012160 -#pragma Aux_register(0x80012160, name=>"io_i2c_mst1_clr_stop_det") - -// User extension aux register io_i2c_mst1_clr_start_det -#define AR_IO_I2C_MST1_CLR_START_DET 0x80012164 -#pragma Aux_register(0x80012164, name=>"io_i2c_mst1_clr_start_det") - -// User extension aux register io_i2c_mst1_enable -#define AR_IO_I2C_MST1_ENABLE 0x8001216c -#pragma Aux_register(0x8001216c, name=>"io_i2c_mst1_enable") - -// User extension aux register io_i2c_mst1_status -#define AR_IO_I2C_MST1_STATUS 0x80012170 -#pragma Aux_register(0x80012170, name=>"io_i2c_mst1_status") - -// User extension aux register io_i2c_mst1_txflr -#define AR_IO_I2C_MST1_TXFLR 0x80012174 -#pragma Aux_register(0x80012174, name=>"io_i2c_mst1_txflr") - -// User extension aux register io_i2c_mst1_rxflr -#define AR_IO_I2C_MST1_RXFLR 0x80012178 -#pragma Aux_register(0x80012178, name=>"io_i2c_mst1_rxflr") - -// User extension aux register io_i2c_mst1_sda_hold -#define AR_IO_I2C_MST1_SDA_HOLD 0x8001217c -#pragma Aux_register(0x8001217c, name=>"io_i2c_mst1_sda_hold") - -// User extension aux register io_i2c_mst1_tx_abrt_source -#define AR_IO_I2C_MST1_TX_ABRT_SOURCE 0x80012180 -#pragma Aux_register(0x80012180, name=>"io_i2c_mst1_tx_abrt_source") - -// User extension aux register io_i2c_mst1_enable_status -#define AR_IO_I2C_MST1_ENABLE_STATUS 0x8001219c -#pragma Aux_register(0x8001219c, name=>"io_i2c_mst1_enable_status") - -// User extension aux register io_i2c_mst1_fs_spklen -#define AR_IO_I2C_MST1_FS_SPKLEN 0x800121a0 -#pragma Aux_register(0x800121a0, name=>"io_i2c_mst1_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST2_IO_I2C_MST2_PRESENT 1 - -// User extension aux register io_i2c_mst2_clken -#define AR_IO_I2C_MST2_CLKEN 0x800122c0 -#pragma Aux_register(0x800122c0, name=>"io_i2c_mst2_clken") - -// User extension aux register io_i2c_mst2_con -#define AR_IO_I2C_MST2_CON 0x80012200 -#pragma Aux_register(0x80012200, name=>"io_i2c_mst2_con") - -// User extension aux register 
io_i2c_mst2_tar -#define AR_IO_I2C_MST2_TAR 0x80012204 -#pragma Aux_register(0x80012204, name=>"io_i2c_mst2_tar") - -// User extension aux register io_i2c_mst2_data_cmd -#define AR_IO_I2C_MST2_DATA_CMD 0x80012210 -#pragma Aux_register(0x80012210, name=>"io_i2c_mst2_data_cmd") - -// User extension aux register io_i2c_mst2_ss_scl_hcnt -#define AR_IO_I2C_MST2_SS_SCL_HCNT 0x80012214 -#pragma Aux_register(0x80012214, name=>"io_i2c_mst2_ss_scl_hcnt") - -// User extension aux register io_i2c_mst2_ss_scl_lcnt -#define AR_IO_I2C_MST2_SS_SCL_LCNT 0x80012218 -#pragma Aux_register(0x80012218, name=>"io_i2c_mst2_ss_scl_lcnt") - -// User extension aux register io_i2c_mst2_fs_scl_hcnt -#define AR_IO_I2C_MST2_FS_SCL_HCNT 0x8001221c -#pragma Aux_register(0x8001221c, name=>"io_i2c_mst2_fs_scl_hcnt") - -// User extension aux register io_i2c_mst2_fs_scl_lcnt -#define AR_IO_I2C_MST2_FS_SCL_LCNT 0x80012220 -#pragma Aux_register(0x80012220, name=>"io_i2c_mst2_fs_scl_lcnt") - -// User extension aux register io_i2c_mst2_intr_stat -#define AR_IO_I2C_MST2_INTR_STAT 0x8001222c -#pragma Aux_register(0x8001222c, name=>"io_i2c_mst2_intr_stat") - -// User extension aux register io_i2c_mst2_intr_mask -#define AR_IO_I2C_MST2_INTR_MASK 0x80012230 -#pragma Aux_register(0x80012230, name=>"io_i2c_mst2_intr_mask") - -// User extension aux register io_i2c_mst2_raw_intr_stat -#define AR_IO_I2C_MST2_RAW_INTR_STAT 0x80012234 -#pragma Aux_register(0x80012234, name=>"io_i2c_mst2_raw_intr_stat") - -// User extension aux register io_i2c_mst2_rx_tl -#define AR_IO_I2C_MST2_RX_TL 0x80012238 -#pragma Aux_register(0x80012238, name=>"io_i2c_mst2_rx_tl") - -// User extension aux register io_i2c_mst2_tx_tl -#define AR_IO_I2C_MST2_TX_TL 0x8001223c -#pragma Aux_register(0x8001223c, name=>"io_i2c_mst2_tx_tl") - -// User extension aux register io_i2c_mst2_clr_intr -#define AR_IO_I2C_MST2_CLR_INTR 0x80012240 -#pragma Aux_register(0x80012240, name=>"io_i2c_mst2_clr_intr") - -// User extension aux register io_i2c_mst2_clr_rx_under -#define AR_IO_I2C_MST2_CLR_RX_UNDER 0x80012244 -#pragma Aux_register(0x80012244, name=>"io_i2c_mst2_clr_rx_under") - -// User extension aux register io_i2c_mst2_clr_rx_over -#define AR_IO_I2C_MST2_CLR_RX_OVER 0x80012248 -#pragma Aux_register(0x80012248, name=>"io_i2c_mst2_clr_rx_over") - -// User extension aux register io_i2c_mst2_clr_tx_over -#define AR_IO_I2C_MST2_CLR_TX_OVER 0x8001224c -#pragma Aux_register(0x8001224c, name=>"io_i2c_mst2_clr_tx_over") - -// User extension aux register io_i2c_mst2_clr_tx_abrt -#define AR_IO_I2C_MST2_CLR_TX_ABRT 0x80012254 -#pragma Aux_register(0x80012254, name=>"io_i2c_mst2_clr_tx_abrt") - -// User extension aux register io_i2c_mst2_clr_activity -#define AR_IO_I2C_MST2_CLR_ACTIVITY 0x8001225c -#pragma Aux_register(0x8001225c, name=>"io_i2c_mst2_clr_activity") - -// User extension aux register io_i2c_mst2_clr_stop_det -#define AR_IO_I2C_MST2_CLR_STOP_DET 0x80012260 -#pragma Aux_register(0x80012260, name=>"io_i2c_mst2_clr_stop_det") - -// User extension aux register io_i2c_mst2_clr_start_det -#define AR_IO_I2C_MST2_CLR_START_DET 0x80012264 -#pragma Aux_register(0x80012264, name=>"io_i2c_mst2_clr_start_det") - -// User extension aux register io_i2c_mst2_enable -#define AR_IO_I2C_MST2_ENABLE 0x8001226c -#pragma Aux_register(0x8001226c, name=>"io_i2c_mst2_enable") - -// User extension aux register io_i2c_mst2_status -#define AR_IO_I2C_MST2_STATUS 0x80012270 -#pragma Aux_register(0x80012270, name=>"io_i2c_mst2_status") - -// User extension aux register io_i2c_mst2_txflr -#define 
AR_IO_I2C_MST2_TXFLR 0x80012274 -#pragma Aux_register(0x80012274, name=>"io_i2c_mst2_txflr") - -// User extension aux register io_i2c_mst2_rxflr -#define AR_IO_I2C_MST2_RXFLR 0x80012278 -#pragma Aux_register(0x80012278, name=>"io_i2c_mst2_rxflr") - -// User extension aux register io_i2c_mst2_sda_hold -#define AR_IO_I2C_MST2_SDA_HOLD 0x8001227c -#pragma Aux_register(0x8001227c, name=>"io_i2c_mst2_sda_hold") - -// User extension aux register io_i2c_mst2_tx_abrt_source -#define AR_IO_I2C_MST2_TX_ABRT_SOURCE 0x80012280 -#pragma Aux_register(0x80012280, name=>"io_i2c_mst2_tx_abrt_source") - -// User extension aux register io_i2c_mst2_enable_status -#define AR_IO_I2C_MST2_ENABLE_STATUS 0x8001229c -#pragma Aux_register(0x8001229c, name=>"io_i2c_mst2_enable_status") - -// User extension aux register io_i2c_mst2_fs_spklen -#define AR_IO_I2C_MST2_FS_SPKLEN 0x800122a0 -#pragma Aux_register(0x800122a0, name=>"io_i2c_mst2_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST0_IO_SPI_MST0_PRESENT 1 - -// User extension aux register io_spi_mst0_ctrlr0 -#define AR_IO_SPI_MST0_CTRLR0 0x80010000 -#pragma Aux_register(0x80010000, name=>"io_spi_mst0_ctrlr0") - -// User extension aux register io_spi_mst0_ctrlr1 -#define AR_IO_SPI_MST0_CTRLR1 0x80010001 -#pragma Aux_register(0x80010001, name=>"io_spi_mst0_ctrlr1") - -// User extension aux register io_spi_mst0_spien -#define AR_IO_SPI_MST0_SPIEN 0x80010002 -#pragma Aux_register(0x80010002, name=>"io_spi_mst0_spien") - -// User extension aux register io_spi_mst0_ser -#define AR_IO_SPI_MST0_SER 0x80010004 -#pragma Aux_register(0x80010004, name=>"io_spi_mst0_ser") - -// User extension aux register io_spi_mst0_baudr -#define AR_IO_SPI_MST0_BAUDR 0x80010005 -#pragma Aux_register(0x80010005, name=>"io_spi_mst0_baudr") - -// User extension aux register io_spi_mst0_txftlr -#define AR_IO_SPI_MST0_TXFTLR 0x80010006 -#pragma Aux_register(0x80010006, name=>"io_spi_mst0_txftlr") - -// User extension aux register io_spi_mst0_rxftlr -#define AR_IO_SPI_MST0_RXFTLR 0x80010007 -#pragma Aux_register(0x80010007, name=>"io_spi_mst0_rxftlr") - -// User extension aux register io_spi_mst0_txflr -#define AR_IO_SPI_MST0_TXFLR 0x80010008 -#pragma Aux_register(0x80010008, name=>"io_spi_mst0_txflr") - -// User extension aux register io_spi_mst0_rxflr -#define AR_IO_SPI_MST0_RXFLR 0x80010009 -#pragma Aux_register(0x80010009, name=>"io_spi_mst0_rxflr") - -// User extension aux register io_spi_mst0_sr -#define AR_IO_SPI_MST0_SR 0x8001000a -#pragma Aux_register(0x8001000a, name=>"io_spi_mst0_sr") - -// User extension aux register io_spi_mst0_imr -#define AR_IO_SPI_MST0_IMR 0x8001000b -#pragma Aux_register(0x8001000b, name=>"io_spi_mst0_imr") - -// User extension aux register io_spi_mst0_isr -#define AR_IO_SPI_MST0_ISR 0x8001000c -#pragma Aux_register(0x8001000c, name=>"io_spi_mst0_isr") - -// User extension aux register io_spi_mst0_risr -#define AR_IO_SPI_MST0_RISR 0x8001000d -#pragma Aux_register(0x8001000d, name=>"io_spi_mst0_risr") - -// User extension aux register io_spi_mst0_txoicr -#define AR_IO_SPI_MST0_TXOICR 0x8001000e -#pragma Aux_register(0x8001000e, name=>"io_spi_mst0_txoicr") - -// User extension aux register io_spi_mst0_rxoicr -#define AR_IO_SPI_MST0_RXOICR 0x8001000f -#pragma Aux_register(0x8001000f, name=>"io_spi_mst0_rxoicr") - -// User extension aux register io_spi_mst0_rxuicr -#define AR_IO_SPI_MST0_RXUICR 0x80010010 -#pragma Aux_register(0x80010010, name=>"io_spi_mst0_rxuicr") - -// User extension aux register io_spi_mst0_icr -#define AR_IO_SPI_MST0_ICR 0x80010012 
-#pragma Aux_register(0x80010012, name=>"io_spi_mst0_icr") - -// User extension aux register io_spi_mst0_clken -#define AR_IO_SPI_MST0_CLKEN 0x80010016 -#pragma Aux_register(0x80010016, name=>"io_spi_mst0_clken") - -// User extension aux register io_spi_mst0_dr -#define AR_IO_SPI_MST0_DR 0x80010018 -#pragma Aux_register(0x80010018, name=>"io_spi_mst0_dr") - -// User extension aux register io_spi_mst0_rx_sample_dly -#define AR_IO_SPI_MST0_RX_SAMPLE_DLY 0x8001003c -#pragma Aux_register(0x8001003c, name=>"io_spi_mst0_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST1_IO_SPI_MST1_PRESENT 1 - -// User extension aux register io_spi_mst1_ctrlr0 -#define AR_IO_SPI_MST1_CTRLR0 0x80010100 -#pragma Aux_register(0x80010100, name=>"io_spi_mst1_ctrlr0") - -// User extension aux register io_spi_mst1_ctrlr1 -#define AR_IO_SPI_MST1_CTRLR1 0x80010101 -#pragma Aux_register(0x80010101, name=>"io_spi_mst1_ctrlr1") - -// User extension aux register io_spi_mst1_spien -#define AR_IO_SPI_MST1_SPIEN 0x80010102 -#pragma Aux_register(0x80010102, name=>"io_spi_mst1_spien") - -// User extension aux register io_spi_mst1_ser -#define AR_IO_SPI_MST1_SER 0x80010104 -#pragma Aux_register(0x80010104, name=>"io_spi_mst1_ser") - -// User extension aux register io_spi_mst1_baudr -#define AR_IO_SPI_MST1_BAUDR 0x80010105 -#pragma Aux_register(0x80010105, name=>"io_spi_mst1_baudr") - -// User extension aux register io_spi_mst1_txftlr -#define AR_IO_SPI_MST1_TXFTLR 0x80010106 -#pragma Aux_register(0x80010106, name=>"io_spi_mst1_txftlr") - -// User extension aux register io_spi_mst1_rxftlr -#define AR_IO_SPI_MST1_RXFTLR 0x80010107 -#pragma Aux_register(0x80010107, name=>"io_spi_mst1_rxftlr") - -// User extension aux register io_spi_mst1_txflr -#define AR_IO_SPI_MST1_TXFLR 0x80010108 -#pragma Aux_register(0x80010108, name=>"io_spi_mst1_txflr") - -// User extension aux register io_spi_mst1_rxflr -#define AR_IO_SPI_MST1_RXFLR 0x80010109 -#pragma Aux_register(0x80010109, name=>"io_spi_mst1_rxflr") - -// User extension aux register io_spi_mst1_sr -#define AR_IO_SPI_MST1_SR 0x8001010a -#pragma Aux_register(0x8001010a, name=>"io_spi_mst1_sr") - -// User extension aux register io_spi_mst1_imr -#define AR_IO_SPI_MST1_IMR 0x8001010b -#pragma Aux_register(0x8001010b, name=>"io_spi_mst1_imr") - -// User extension aux register io_spi_mst1_isr -#define AR_IO_SPI_MST1_ISR 0x8001010c -#pragma Aux_register(0x8001010c, name=>"io_spi_mst1_isr") - -// User extension aux register io_spi_mst1_risr -#define AR_IO_SPI_MST1_RISR 0x8001010d -#pragma Aux_register(0x8001010d, name=>"io_spi_mst1_risr") - -// User extension aux register io_spi_mst1_txoicr -#define AR_IO_SPI_MST1_TXOICR 0x8001010e -#pragma Aux_register(0x8001010e, name=>"io_spi_mst1_txoicr") - -// User extension aux register io_spi_mst1_rxoicr -#define AR_IO_SPI_MST1_RXOICR 0x8001010f -#pragma Aux_register(0x8001010f, name=>"io_spi_mst1_rxoicr") - -// User extension aux register io_spi_mst1_rxuicr -#define AR_IO_SPI_MST1_RXUICR 0x80010110 -#pragma Aux_register(0x80010110, name=>"io_spi_mst1_rxuicr") - -// User extension aux register io_spi_mst1_icr -#define AR_IO_SPI_MST1_ICR 0x80010112 -#pragma Aux_register(0x80010112, name=>"io_spi_mst1_icr") - -// User extension aux register io_spi_mst1_clken -#define AR_IO_SPI_MST1_CLKEN 0x80010116 -#pragma Aux_register(0x80010116, name=>"io_spi_mst1_clken") - -// User extension aux register io_spi_mst1_dr -#define AR_IO_SPI_MST1_DR 0x80010118 -#pragma Aux_register(0x80010118, name=>"io_spi_mst1_dr") - -// User extension aux register 
io_spi_mst1_rx_sample_dly -#define AR_IO_SPI_MST1_RX_SAMPLE_DLY 0x8001013c -#pragma Aux_register(0x8001013c, name=>"io_spi_mst1_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST2_IO_SPI_MST2_PRESENT 1 - -// User extension aux register io_spi_mst2_ctrlr0 -#define AR_IO_SPI_MST2_CTRLR0 0x80010200 -#pragma Aux_register(0x80010200, name=>"io_spi_mst2_ctrlr0") - -// User extension aux register io_spi_mst2_ctrlr1 -#define AR_IO_SPI_MST2_CTRLR1 0x80010201 -#pragma Aux_register(0x80010201, name=>"io_spi_mst2_ctrlr1") - -// User extension aux register io_spi_mst2_spien -#define AR_IO_SPI_MST2_SPIEN 0x80010202 -#pragma Aux_register(0x80010202, name=>"io_spi_mst2_spien") - -// User extension aux register io_spi_mst2_ser -#define AR_IO_SPI_MST2_SER 0x80010204 -#pragma Aux_register(0x80010204, name=>"io_spi_mst2_ser") - -// User extension aux register io_spi_mst2_baudr -#define AR_IO_SPI_MST2_BAUDR 0x80010205 -#pragma Aux_register(0x80010205, name=>"io_spi_mst2_baudr") - -// User extension aux register io_spi_mst2_txftlr -#define AR_IO_SPI_MST2_TXFTLR 0x80010206 -#pragma Aux_register(0x80010206, name=>"io_spi_mst2_txftlr") - -// User extension aux register io_spi_mst2_rxftlr -#define AR_IO_SPI_MST2_RXFTLR 0x80010207 -#pragma Aux_register(0x80010207, name=>"io_spi_mst2_rxftlr") - -// User extension aux register io_spi_mst2_txflr -#define AR_IO_SPI_MST2_TXFLR 0x80010208 -#pragma Aux_register(0x80010208, name=>"io_spi_mst2_txflr") - -// User extension aux register io_spi_mst2_rxflr -#define AR_IO_SPI_MST2_RXFLR 0x80010209 -#pragma Aux_register(0x80010209, name=>"io_spi_mst2_rxflr") - -// User extension aux register io_spi_mst2_sr -#define AR_IO_SPI_MST2_SR 0x8001020a -#pragma Aux_register(0x8001020a, name=>"io_spi_mst2_sr") - -// User extension aux register io_spi_mst2_imr -#define AR_IO_SPI_MST2_IMR 0x8001020b -#pragma Aux_register(0x8001020b, name=>"io_spi_mst2_imr") - -// User extension aux register io_spi_mst2_isr -#define AR_IO_SPI_MST2_ISR 0x8001020c -#pragma Aux_register(0x8001020c, name=>"io_spi_mst2_isr") - -// User extension aux register io_spi_mst2_risr -#define AR_IO_SPI_MST2_RISR 0x8001020d -#pragma Aux_register(0x8001020d, name=>"io_spi_mst2_risr") - -// User extension aux register io_spi_mst2_txoicr -#define AR_IO_SPI_MST2_TXOICR 0x8001020e -#pragma Aux_register(0x8001020e, name=>"io_spi_mst2_txoicr") - -// User extension aux register io_spi_mst2_rxoicr -#define AR_IO_SPI_MST2_RXOICR 0x8001020f -#pragma Aux_register(0x8001020f, name=>"io_spi_mst2_rxoicr") - -// User extension aux register io_spi_mst2_rxuicr -#define AR_IO_SPI_MST2_RXUICR 0x80010210 -#pragma Aux_register(0x80010210, name=>"io_spi_mst2_rxuicr") - -// User extension aux register io_spi_mst2_icr -#define AR_IO_SPI_MST2_ICR 0x80010212 -#pragma Aux_register(0x80010212, name=>"io_spi_mst2_icr") - -// User extension aux register io_spi_mst2_clken -#define AR_IO_SPI_MST2_CLKEN 0x80010216 -#pragma Aux_register(0x80010216, name=>"io_spi_mst2_clken") - -// User extension aux register io_spi_mst2_dr -#define AR_IO_SPI_MST2_DR 0x80010218 -#pragma Aux_register(0x80010218, name=>"io_spi_mst2_dr") - -// User extension aux register io_spi_mst2_rx_sample_dly -#define AR_IO_SPI_MST2_RX_SAMPLE_DLY 0x8001023c -#pragma Aux_register(0x8001023c, name=>"io_spi_mst2_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_SLV0_IO_SPI_SLV0_PRESENT 1 - -// User extension aux register io_spi_slv0_ctrlr0 -#define AR_IO_SPI_SLV0_CTRLR0 0x80011000 -#pragma Aux_register(0x80011000, name=>"io_spi_slv0_ctrlr0") - -// User extension aux register 
io_spi_slv0_spien -#define AR_IO_SPI_SLV0_SPIEN 0x80011002 -#pragma Aux_register(0x80011002, name=>"io_spi_slv0_spien") - -// User extension aux register io_spi_slv0_txftlr -#define AR_IO_SPI_SLV0_TXFTLR 0x80011006 -#pragma Aux_register(0x80011006, name=>"io_spi_slv0_txftlr") - -// User extension aux register io_spi_slv0_rxftlr -#define AR_IO_SPI_SLV0_RXFTLR 0x80011007 -#pragma Aux_register(0x80011007, name=>"io_spi_slv0_rxftlr") - -// User extension aux register io_spi_slv0_txflr -#define AR_IO_SPI_SLV0_TXFLR 0x80011008 -#pragma Aux_register(0x80011008, name=>"io_spi_slv0_txflr") - -// User extension aux register io_spi_slv0_rxflr -#define AR_IO_SPI_SLV0_RXFLR 0x80011009 -#pragma Aux_register(0x80011009, name=>"io_spi_slv0_rxflr") - -// User extension aux register io_spi_slv0_sr -#define AR_IO_SPI_SLV0_SR 0x8001100a -#pragma Aux_register(0x8001100a, name=>"io_spi_slv0_sr") - -// User extension aux register io_spi_slv0_imr -#define AR_IO_SPI_SLV0_IMR 0x8001100b -#pragma Aux_register(0x8001100b, name=>"io_spi_slv0_imr") - -// User extension aux register io_spi_slv0_isr -#define AR_IO_SPI_SLV0_ISR 0x8001100c -#pragma Aux_register(0x8001100c, name=>"io_spi_slv0_isr") - -// User extension aux register io_spi_slv0_risr -#define AR_IO_SPI_SLV0_RISR 0x8001100d -#pragma Aux_register(0x8001100d, name=>"io_spi_slv0_risr") - -// User extension aux register io_spi_slv0_txoicr -#define AR_IO_SPI_SLV0_TXOICR 0x8001100e -#pragma Aux_register(0x8001100e, name=>"io_spi_slv0_txoicr") - -// User extension aux register io_spi_slv0_rxoicr -#define AR_IO_SPI_SLV0_RXOICR 0x8001100f -#pragma Aux_register(0x8001100f, name=>"io_spi_slv0_rxoicr") - -// User extension aux register io_spi_slv0_rxuicr -#define AR_IO_SPI_SLV0_RXUICR 0x80011010 -#pragma Aux_register(0x80011010, name=>"io_spi_slv0_rxuicr") - -// User extension aux register io_spi_slv0_icr -#define AR_IO_SPI_SLV0_ICR 0x80011012 -#pragma Aux_register(0x80011012, name=>"io_spi_slv0_icr") - -// User extension aux register io_spi_slv0_clken -#define AR_IO_SPI_SLV0_CLKEN 0x80011016 -#pragma Aux_register(0x80011016, name=>"io_spi_slv0_clken") - -// User extension aux register io_spi_slv0_dr -#define AR_IO_SPI_SLV0_DR 0x80011018 -#pragma Aux_register(0x80011018, name=>"io_spi_slv0_dr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART0_IO_UART0_PRESENT 1 - -// User extension aux register io_uart0_clken -#define AR_IO_UART0_CLKEN 0x800140c0 -#pragma Aux_register(0x800140c0, name=>"io_uart0_clken") - -// User extension aux register io_uart0_rbr_thr_dll -#define AR_IO_UART0_RBR_THR_DLL 0x80014000 -#pragma Aux_register(0x80014000, name=>"io_uart0_rbr_thr_dll") - -// User extension aux register io_uart0_ier_dlh -#define AR_IO_UART0_IER_DLH 0x80014004 -#pragma Aux_register(0x80014004, name=>"io_uart0_ier_dlh") - -// User extension aux register io_uart0_iir_fcr -#define AR_IO_UART0_IIR_FCR 0x80014008 -#pragma Aux_register(0x80014008, name=>"io_uart0_iir_fcr") - -// User extension aux register io_uart0_lcr -#define AR_IO_UART0_LCR 0x8001400c -#pragma Aux_register(0x8001400c, name=>"io_uart0_lcr") - -// User extension aux register io_uart0_mcr -#define AR_IO_UART0_MCR 0x80014010 -#pragma Aux_register(0x80014010, name=>"io_uart0_mcr") - -// User extension aux register io_uart0_lsr -#define AR_IO_UART0_LSR 0x80014014 -#pragma Aux_register(0x80014014, name=>"io_uart0_lsr") - -// User extension aux register io_uart0_msr -#define AR_IO_UART0_MSR 0x80014018 -#pragma Aux_register(0x80014018, name=>"io_uart0_msr") - -// User extension aux register io_uart0_usr -#define AR_IO_UART0_USR 
0x8001407c -#pragma Aux_register(0x8001407c, name=>"io_uart0_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART1_IO_UART1_PRESENT 1 - -// User extension aux register io_uart1_clken -#define AR_IO_UART1_CLKEN 0x800141c0 -#pragma Aux_register(0x800141c0, name=>"io_uart1_clken") - -// User extension aux register io_uart1_rbr_thr_dll -#define AR_IO_UART1_RBR_THR_DLL 0x80014100 -#pragma Aux_register(0x80014100, name=>"io_uart1_rbr_thr_dll") - -// User extension aux register io_uart1_ier_dlh -#define AR_IO_UART1_IER_DLH 0x80014104 -#pragma Aux_register(0x80014104, name=>"io_uart1_ier_dlh") - -// User extension aux register io_uart1_iir_fcr -#define AR_IO_UART1_IIR_FCR 0x80014108 -#pragma Aux_register(0x80014108, name=>"io_uart1_iir_fcr") - -// User extension aux register io_uart1_lcr -#define AR_IO_UART1_LCR 0x8001410c -#pragma Aux_register(0x8001410c, name=>"io_uart1_lcr") - -// User extension aux register io_uart1_mcr -#define AR_IO_UART1_MCR 0x80014110 -#pragma Aux_register(0x80014110, name=>"io_uart1_mcr") - -// User extension aux register io_uart1_lsr -#define AR_IO_UART1_LSR 0x80014114 -#pragma Aux_register(0x80014114, name=>"io_uart1_lsr") - -// User extension aux register io_uart1_msr -#define AR_IO_UART1_MSR 0x80014118 -#pragma Aux_register(0x80014118, name=>"io_uart1_msr") - -// User extension aux register io_uart1_usr -#define AR_IO_UART1_USR 0x8001417c -#pragma Aux_register(0x8001417c, name=>"io_uart1_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART2_IO_UART2_PRESENT 1 - -// User extension aux register io_uart2_clken -#define AR_IO_UART2_CLKEN 0x800142c0 -#pragma Aux_register(0x800142c0, name=>"io_uart2_clken") - -// User extension aux register io_uart2_rbr_thr_dll -#define AR_IO_UART2_RBR_THR_DLL 0x80014200 -#pragma Aux_register(0x80014200, name=>"io_uart2_rbr_thr_dll") - -// User extension aux register io_uart2_ier_dlh -#define AR_IO_UART2_IER_DLH 0x80014204 -#pragma Aux_register(0x80014204, name=>"io_uart2_ier_dlh") - -// User extension aux register io_uart2_iir_fcr -#define AR_IO_UART2_IIR_FCR 0x80014208 -#pragma Aux_register(0x80014208, name=>"io_uart2_iir_fcr") - -// User extension aux register io_uart2_lcr -#define AR_IO_UART2_LCR 0x8001420c -#pragma Aux_register(0x8001420c, name=>"io_uart2_lcr") - -// User extension aux register io_uart2_mcr -#define AR_IO_UART2_MCR 0x80014210 -#pragma Aux_register(0x80014210, name=>"io_uart2_mcr") - -// User extension aux register io_uart2_lsr -#define AR_IO_UART2_LSR 0x80014214 -#pragma Aux_register(0x80014214, name=>"io_uart2_lsr") - -// User extension aux register io_uart2_msr -#define AR_IO_UART2_MSR 0x80014218 -#pragma Aux_register(0x80014218, name=>"io_uart2_msr") - -// User extension aux register io_uart2_usr -#define AR_IO_UART2_USR 0x8001427c -#pragma Aux_register(0x8001427c, name=>"io_uart2_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART3_IO_UART3_PRESENT 1 - -// User extension aux register io_uart3_clken -#define AR_IO_UART3_CLKEN 0x800143c0 -#pragma Aux_register(0x800143c0, name=>"io_uart3_clken") - -// User extension aux register io_uart3_rbr_thr_dll -#define AR_IO_UART3_RBR_THR_DLL 0x80014300 -#pragma Aux_register(0x80014300, name=>"io_uart3_rbr_thr_dll") - -// User extension aux register io_uart3_ier_dlh -#define AR_IO_UART3_IER_DLH 0x80014304 -#pragma Aux_register(0x80014304, name=>"io_uart3_ier_dlh") - -// User extension aux register io_uart3_iir_fcr -#define AR_IO_UART3_IIR_FCR 0x80014308 -#pragma Aux_register(0x80014308, name=>"io_uart3_iir_fcr") - -// User extension aux register io_uart3_lcr -#define AR_IO_UART3_LCR 
0x8001430c -#pragma Aux_register(0x8001430c, name=>"io_uart3_lcr") - -// User extension aux register io_uart3_mcr -#define AR_IO_UART3_MCR 0x80014310 -#pragma Aux_register(0x80014310, name=>"io_uart3_mcr") - -// User extension aux register io_uart3_lsr -#define AR_IO_UART3_LSR 0x80014314 -#pragma Aux_register(0x80014314, name=>"io_uart3_lsr") - -// User extension aux register io_uart3_msr -#define AR_IO_UART3_MSR 0x80014318 -#pragma Aux_register(0x80014318, name=>"io_uart3_msr") - -// User extension aux register io_uart3_usr -#define AR_IO_UART3_USR 0x8001437c -#pragma Aux_register(0x8001437c, name=>"io_uart3_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_CREG_MST0_IO_CREG_MST0_PRESENT 1 - -// User extension aux register io_creg_mst0_ctrl -#define AR_IO_CREG_MST0_CTRL 0x80018000 -#pragma Aux_register(0x80018000, name=>"io_creg_mst0_ctrl") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_CREG_SLV0_IO_CREG_SLV0_PRESENT 1 - -// User extension aux register io_creg_slv0_obsr -#define AR_IO_CREG_SLV0_OBSR 0x80018080 -#pragma Aux_register(0x80018080, name=>"io_creg_slv0_obsr") -#define APEX_COM_ARC_HARDWARE_DFSS_SUBSYS_BCR_SUBSYS_BCR_PRESENT 1 - -// User extension aux register SUBSYS_BUILD -#define AR_SUBSYS_BUILD 0xf0 -#pragma Aux_register(0xf0, name=>"SUBSYS_BUILD") - -// User extension aux register SUBSYS_DSP_0_BUILD -#define AR_SUBSYS_DSP_0_BUILD 0xa00 -#pragma Aux_register(0xa00, name=>"SUBSYS_DSP_0_BUILD") - -// User extension aux register SUBSYS_DSP_0_CONFIG -#define AR_SUBSYS_DSP_0_CONFIG 0xa02 -#pragma Aux_register(0xa02, name=>"SUBSYS_DSP_0_CONFIG") - -// User extension aux register SUBSYS_IO_0_BUILD -#define AR_SUBSYS_IO_0_BUILD 0xa04 -#pragma Aux_register(0xa04, name=>"SUBSYS_IO_0_BUILD") - -// User extension aux register SUBSYS_IO_1_BUILD -#define AR_SUBSYS_IO_1_BUILD 0xa05 -#pragma Aux_register(0xa05, name=>"SUBSYS_IO_1_BUILD") -#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_PRESENT 1 - -// User extension aux register fpu_build -#define AR_FPU_BUILD 0xc8 -#pragma Aux_register(0xc8, name=>"fpu_build") - -// User extension aux register fpu_ctrl -#define AR_FPU_CTRL 0x300 -#pragma Aux_register(0x300, name=>"fpu_ctrl") - -// User extension aux register fpu_status -#define AR_FPU_STATUS 0x301 -#pragma Aux_register(0x301, name=>"fpu_status") - -// User extension instruction fsmadd -extern long fsmadd(long,long); -#pragma intrinsic(fsmadd,opcode=>6,sub_opcode=>5, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsmsub -extern long fsmsub(long,long); -#pragma intrinsic(fsmsub,opcode=>6,sub_opcode=>6, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsmul -extern long fsmul(long,long); -#pragma intrinsic(fsmul,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsadd -extern long fsadd(long,long); -#pragma intrinsic(fsadd,opcode=>6,sub_opcode=>1, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fssub -extern long fssub(long,long); -#pragma intrinsic(fssub,opcode=>6,sub_opcode=>2, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fcvt32 -extern long fcvt32(long,long); -#pragma intrinsic(fcvt32,opcode=>6,sub_opcode=>8, 
effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsdiv -extern long fsdiv(long,long); -#pragma intrinsic(fsdiv,opcode=>6,sub_opcode=>7, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmp -extern long fscmp(long,long); -#pragma intrinsic(fscmp,opcode=>6,sub_opcode=>3, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmp -extern long fscmp_f(long,long); -#pragma intrinsic(fscmp_f,opcode=>6,sub_opcode=>3, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmpf -extern long fscmpf(long,long); -#pragma intrinsic(fscmpf,opcode=>6,sub_opcode=>4, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmpf -extern long fscmpf_f(long,long); -#pragma intrinsic(fscmpf_f,opcode=>6,sub_opcode=>4, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fssqrt -extern long fssqrt(long); -#pragma intrinsic(fssqrt,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") -#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_DP_ASSIST_PRESENT 1 - -// User extension aux register aux_dpfp1l -#define AR_AUX_DPFP1L 0x302 -#pragma Aux_register(0x302, name=>"aux_dpfp1l") - -// User extension aux register aux_dpfp1h -#define AR_AUX_DPFP1H 0x303 -#pragma Aux_register(0x303, name=>"aux_dpfp1h") - -// User extension aux register aux_dpfp2l -#define AR_AUX_DPFP2L 0x304 -#pragma Aux_register(0x304, name=>"aux_dpfp2l") - -// User extension aux register aux_dpfp2h -#define AR_AUX_DPFP2H 0x305 -#pragma Aux_register(0x305, name=>"aux_dpfp2h") - -// User extension instruction dmulh11 -extern long dmulh11(long,long); -#pragma intrinsic(dmulh11,opcode=>6,sub_opcode=>48,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh11 -extern long dmulh11_f(long,long); -#pragma intrinsic(dmulh11_f,opcode=>6,sub_opcode=>48, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh12 -extern long dmulh12(long,long); -#pragma intrinsic(dmulh12,opcode=>6,sub_opcode=>49,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh12 -extern long dmulh12_f(long,long); -#pragma intrinsic(dmulh12_f,opcode=>6,sub_opcode=>49, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh21 -extern long dmulh21(long,long); -#pragma intrinsic(dmulh21,opcode=>6,sub_opcode=>50,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; 
auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh21 -extern long dmulh21_f(long,long); -#pragma intrinsic(dmulh21_f,opcode=>6,sub_opcode=>50, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh22 -extern long dmulh22(long,long); -#pragma intrinsic(dmulh22,opcode=>6,sub_opcode=>51,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh22 -extern long dmulh22_f(long,long); -#pragma intrinsic(dmulh22_f,opcode=>6,sub_opcode=>51, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh11 -extern long daddh11(long,long); -#pragma intrinsic(daddh11,opcode=>6,sub_opcode=>52,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh11 -extern long daddh11_f(long,long); -#pragma intrinsic(daddh11_f,opcode=>6,sub_opcode=>52, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh12 -extern long daddh12(long,long); -#pragma intrinsic(daddh12,opcode=>6,sub_opcode=>53,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh12 -extern long daddh12_f(long,long); -#pragma intrinsic(daddh12_f,opcode=>6,sub_opcode=>53, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh21 -extern long daddh21(long,long); -#pragma intrinsic(daddh21,opcode=>6,sub_opcode=>54,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh21 -extern long daddh21_f(long,long); -#pragma intrinsic(daddh21_f,opcode=>6,sub_opcode=>54, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh22 -extern long daddh22(long,long); -#pragma intrinsic(daddh22,opcode=>6,sub_opcode=>55,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh22 -extern long daddh22_f(long,long); -#pragma intrinsic(daddh22_f,opcode=>6,sub_opcode=>55, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh11 
-extern long dsubh11(long,long); -#pragma intrinsic(dsubh11,opcode=>6,sub_opcode=>56,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh11 -extern long dsubh11_f(long,long); -#pragma intrinsic(dsubh11_f,opcode=>6,sub_opcode=>56, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh12 -extern long dsubh12(long,long); -#pragma intrinsic(dsubh12,opcode=>6,sub_opcode=>57,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh12 -extern long dsubh12_f(long,long); -#pragma intrinsic(dsubh12_f,opcode=>6,sub_opcode=>57, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh21 -extern long dsubh21(long,long); -#pragma intrinsic(dsubh21,opcode=>6,sub_opcode=>58,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh21 -extern long dsubh21_f(long,long); -#pragma intrinsic(dsubh21_f,opcode=>6,sub_opcode=>58, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh22 -extern long dsubh22(long,long); -#pragma intrinsic(dsubh22,opcode=>6,sub_opcode=>59,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh22 -extern long dsubh22_f(long,long); -#pragma intrinsic(dsubh22_f,opcode=>6,sub_opcode=>59, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dexcl1 -extern long dexcl1(long,long); -#pragma intrinsic(dexcl1,opcode=>6,sub_opcode=>60, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dexcl2 -extern long dexcl2(long,long); -#pragma intrinsic(dexcl2,opcode=>6,sub_opcode=>61, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - - -#endif - - -]]> - - - - - diff --git a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf b/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf deleted file mode 100644 index 00cf0a3050b..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/arc/memory.lcf +++ /dev/null @@ -1,50 +0,0 @@ - # SYSTEM memory regions indicate where external memory might be located. - # The TCF has no specific knowledge of whether SYSTEM regions contain - # external memory or not. 
- # CCMWRAP memory regions indicate unusable portions of the address space - # due to CCM memory wrapping into upper addresses beyond its size - - MEMORY { - ICCM0 : ORIGIN = 0x00000000, LENGTH = 0x00010000 - # CCMWRAP0: ORIGIN = 0x00010000, LENGTH = 0x0fff0000 - ICCM1 : ORIGIN = 0x10000000, LENGTH = 0x00080000 - # CCMWRAP1: ORIGIN = 0x10080000, LENGTH = 0x0ff80000 - # SYSTEM0 : ORIGIN = 0x20000000, LENGTH = 0x60000000 - DCCM : ORIGIN = 0x80000000, LENGTH = 0x00080000 - # CCMWRAP2: ORIGIN = 0x80080000, LENGTH = 0x0ff80000 - XCCM : ORIGIN = 0x90000000, LENGTH = 0x00008000 - # CCMWRAP3: ORIGIN = 0x90008000, LENGTH = 0x0fff8000 - YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00008000 - # CCMWRAP4: ORIGIN = 0xa0008000, LENGTH = 0x0fff8000 - # SYSTEM1 : ORIGIN = 0xb0000000, LENGTH = 0x50000000 - } - SECTIONS { - GROUP BLOCK(4): { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:132): {} - .text? : { *('.text$crt*') } - * (TEXT): {} - * (LIT): {} - .rodata_in_data?:{} - } > ICCM1 - - GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - .protobuf?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - .tensor_arena?: {} - } > DCCM - GROUP BLOCK(4): { - .Xdata? : {} - } > XCCM - GROUP BLOCK(4): { - .Ydata? : {} - } > YCCM - } - - - From 5b2f6d322cb4943548935b0fc52b528e18c4ad7d Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Thu, 30 Apr 2020 10:56:08 +0300 Subject: [PATCH 0093/1533] Cases with channel multiplier for DW conv (int8) temporarily fallback to reference code --- tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 081a40b23b5..2aad76bc042 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -69,8 +69,14 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, const TfLiteDepthwiseConvParams* params) { const auto* affine_quantization = reinterpret_cast(filter->quantization.params); + const int in_ch = SizeOfDimension(input, 3); + const int filters_num = SizeOfDimension(filter, 3); + // MLI optimized version only supports int8 dataype, dilation factor of 1 and // per-axis quantization of weights (no broadcasting/per-tensor) + // TODO: ((in_ch == filters_num) || (in_ch == 1)) is a forbidding of + // channel multiplier logic for multichannel input. 
+ // To be removed after it will be supported in MLI bool ret_val = (filter->type == kTfLiteInt8) && (input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && @@ -78,6 +84,7 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, (params->dilation_height_factor == 1) && (affine_quantization->scale->size == filter->dims->data[kDepthwiseConvQuantizedDimension]) && + ((in_ch == filters_num) || (in_ch == 1)) && affine_quantization->scale->size <= (kMaxChannels * 2); return ret_val; } From ea1a6715ef2fc136b06986cdade85f6a084855be Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 1 May 2020 13:46:45 +0300 Subject: [PATCH 0094/1533] ARC related documentation in readme files --- .../lite/micro/examples/hello_world/README.md | 45 ++++ .../micro/examples/micro_speech/README.md | 51 +++++ .../micro/examples/person_detection/README.md | 52 +++++ .../person_detection_experimental/README.md | 54 +++++ .../lite/micro/kernels/arc_mli/README.md | 57 +++++ .../micro/tools/make/targets/arc/README.md | 214 ++++++++++++++++++ .../make/templates/arc/README_ARC.md.tpl | 45 +++- .../templates/arc/README_ARC_EMSDP.md.tpl | 48 +++- 8 files changed, 564 insertions(+), 2 deletions(-) create mode 100644 tensorflow/lite/micro/kernels/arc_mli/README.md create mode 100644 tensorflow/lite/micro/tools/make/targets/arc/README.md diff --git a/tensorflow/lite/micro/examples/hello_world/README.md b/tensorflow/lite/micro/examples/hello_world/README.md index 3f3fef67f28..a0a2b678157 100644 --- a/tensorflow/lite/micro/examples/hello_world/README.md +++ b/tensorflow/lite/micro/examples/hello_world/README.md @@ -15,6 +15,7 @@ animation. ## Table of contents - [Understand the model](#understand-the-model) +- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) - [Deploy to Arduino](#deploy-to-arduino) - [Deploy to ESP32](#deploy-to-esp32) - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) @@ -30,6 +31,50 @@ Walk through this tutorial to understand what the model does, how it works, and how it was converted for use with TensorFlow Lite for Microcontrollers. +## Deploy to ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) board. General information and instructions on using the board with TensorFlow Lite Micro can be found in the common [ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +### Initial Setup + +Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project + +### Build and Run Example + +For more detailed information on building and running examples see the appropriate sections of general descriptions of the [ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). In the directory with generated project you can also find a *README_ARC_EMSDP.md* file with instructions and options on building and running. Here we only briefly mention main steps which are typically enough to get it started. + +1. 
You need to [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) and open a serial connection. + +2. Go to the generated example project directory + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make + +3. Build the example using + + make app + +4. To generate artefacts for self-booting the example from the board, use + + make flash + +5. To run the application from the board using a microSD card: + * Copy the content of the created /bin folder into the root of the microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) + * Plug the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside the RST button, push the CFG button. + +6. If you have the MetaWare Debugger installed in your environment: + * To run the application from the console using it, type `make run`. + * To stop the execution, type `Ctrl+C` in the console several times. + +In both cases (steps 5 and 6) you will see the application output in the serial terminal. + ## Deploy to Arduino The following instructions will help you build and deploy this sample diff --git a/tensorflow/lite/micro/examples/micro_speech/README.md b/tensorflow/lite/micro/examples/micro_speech/README.md index 7ccaa806366..ba55a7d8493 100644 --- a/tensorflow/lite/micro/examples/micro_speech/README.md +++ b/tensorflow/lite/micro/examples/micro_speech/README.md @@ -16,6 +16,7 @@ kilobytes of Flash. ## Table of contents +- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp) - [Deploy to Arduino](#deploy-to-arduino) - [Deploy to ESP32](#deploy-to-esp32) - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) @@ -25,6 +26,56 @@ kilobytes of Flash. - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) - [Train your own model](#train-your-own-model) +## Deploy to ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) board. General information and instructions on using the board with TensorFlow Lite Micro can be found in the common [ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +This example is quantized with a symmetric uint8 scheme. As noted in [kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md), embARC MLI supports optimized kernels for int8 quantization only. Therefore, this example will only use TFLM reference kernels. + +The ARC EM SDP board contains a rich set of extension interfaces. You can choose any compatible microphone and modify [audio_provider.cc](/tensorflow/lite/micro/examples/micro_speech/audio_provider.cc) file accordingly to use input from your specific microphone. By default, results of running this example are printed to the console. If you would like to instead implement some target-specific actions, you need to modify [command_responder.cc](/tensorflow/lite/micro/examples/micro_speech/command_responder.cc) accordingly. + +The reference implementations of these files are used by default on the EM SDP. + +### Initial setup + +Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) to get and install all required tools for work with ARC EM SDP.
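Note: the build and run steps below assume a serial terminal is already attached to the board. As a minimal sketch (the device path and baud rate here are assumptions; the exact settings are listed in the ARC targets description), the connection can be opened with a generic terminal program:

    # Assumed device path and baud rate; adjust to match your host and the ARC targets description.
    screen /dev/ttyUSB0 115200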
+ +### Generate Example Project + +As the default example doesn’t provide any output without real audio, it is recommended to get started with the example for mock data. The project for ARC EM SDP platform can be generated with the following command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_micro_speech_mock_make_project + +### Build and Run Example + +For more detailed information on building and running examples see the appropriate sections of general descriptions of the [ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). In the directory with generated project you can also find a *README_ARC_EMSDP.md* file with instructions and options on building and running. Here we only briefly mention main steps which are typically enough to get it started. + +1. You need to [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) and open a serial connection. + +2. Go to the generated example project directory + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/micro_speech_mock/make + +3. Build the example using + + make app + +4. To generate artefacts for self-booting the example from the board, use + + make flash + +5. To run the application from the board using a microSD card: + * Copy the content of the created /bin folder into the root of the microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) + * Plug the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside the RST button, push the CFG button. + +6. If you have the MetaWare Debugger installed in your environment: + * To run the application from the console using it, type `make run`. + * To stop the execution, type `Ctrl+C` in the console several times. + +In both cases (steps 5 and 6) you will see the application output in the serial terminal. + ## Deploy to Arduino The following instructions will help you build and deploy this sample diff --git a/tensorflow/lite/micro/examples/person_detection/README.md b/tensorflow/lite/micro/examples/person_detection/README.md index 5ee7bda9914..ae47c4be0ff 100644 --- a/tensorflow/lite/micro/examples/person_detection/README.md +++ b/tensorflow/lite/micro/examples/person_detection/README.md @@ -6,6 +6,7 @@ run on systems with small amounts of memory such as microcontrollers and DSPs. ## Table of contents - [Getting started](#getting-started) +- [Running on ARC EM SDP](#running-on-arc-em-sdp) - [Running on Arduino](#running-on-arduino) - [Running on ESP32](#running-on-esp32) - [Running on SparkFun Edge](#running-on-sparkfun-edge) @@ -13,6 +14,57 @@ run on systems with small amounts of memory such as microcontrollers and DSPs. - [Debugging image capture](#debugging-image-capture) - [Training your own model](#training-your-own-model) + +## Running on ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) board. General information and instructions on using the board with TensorFlow Lite Micro can be found in the common [ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +This example is quantized with a symmetric uint8 scheme. As noted in [kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md), embARC MLI supports optimized kernels for int8 quantization only.
Therefore, this example will only use TFLM reference kernels. + +The ARC EM SDP board contains a rich set of extension interfaces. +You can choose any compatible camera and modify [image_provider.cc](/tensorflow/lite/micro/examples/person_detection/image_provider.cc) file accordingly to use input from your specific camera. By default, results of running this example are printed to the console. If you would like to instead implement some target-specific actions, you need to modify [detection_responder.cc](/tensorflow/lite/micro/examples/person_detection/detection_responder.cc) accordingly. + +The reference implementations of these files are used by default on the EM SDP. + +### Initial setup + +Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project + +### Build and Run Example + +For more detailed information on building and running examples see the appropriate sections of general descriptions of the [ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). In the directory with generated project you can also find a *README_ARC_EMSDP.md* file with instructions and options on building and running. Here we only briefly mention main steps which are typically enough to get it started. + +1. You need to [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) and open a serial connection. + +2. Go to the generated example project directory + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make + +3. Build the example using + + make app + +4. To generate artefacts for self-booting the example from the board, use + + make flash + +5. To run the application from the board using a microSD card: + * Copy the content of the created /bin folder into the root of the microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) + * Plug the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside the RST button, push the CFG button. + +6. If you have the MetaWare Debugger installed in your environment: + * To run the application from the console using it, type `make run`. + * To stop the execution, type `Ctrl+C` in the console several times. + +In both cases (steps 5 and 6) you will see the application output in the serial terminal. + ## Running on Arduino The following instructions will help you build and deploy this sample diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/README.md b/tensorflow/lite/micro/examples/person_detection_experimental/README.md index d8aaa9ba383..af0186fb276 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/README.md +++ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md @@ -7,12 +7,66 @@ This uses the experimental int8 quantized version of the person detection model.
## Table of contents - [Getting started](#getting-started) +- [Running on ARC EM SDP](#running-on-arc-em-sdp) - [Running on Arduino](#running-on-arduino) - [Running on SparkFun Edge](#running-on-sparkfun-edge) - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) - [Debugging image capture](#debugging-image-capture) - [Training your own model](#training-your-own-model) + +## Running on ARC EM SDP + +The following instructions will help you to build and deploy this example to +[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) board. General information and instructions on using the board with TensorFlow Lite Micro can be found in the common [ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md). + +This example uses asymmetric int8 quantization and can therefore leverage optimized int8 kernels from the embARC MLI library + +The ARC EM SDP board contains a rich set of extension interfaces. +You can choose any compatible camera and modify [image_provider.cc](/tensorflow/lite/micro/examples/person_detection_experimental/image_provider.cc) file accordingly to use input from your specific camera. By default, results of running this example are printed to the console. If you would like to instead implement some target-specific actions, you need to modify [detection_responder.cc](/tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.cc) accordingly. + +The reference implementations of these files are used by default on the EM SDP. + +### Initial setup + +Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) to get and install all required tools for work with ARC EM SDP. + +### Generate Example Project + +The example project for ARC EM SDP platform can be generated with the following command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project + +### Build and Run Example + +For more detailed information on building and running examples see the appropriate sections of general descriptions of the [ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP). In the directory with generated project you can also find a *README_ARC_EMSDP.md* file with instructions and options on building and running. Here we only briefly mention main steps which are typically enough to get it started. + +1. You need to [connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board) and open an serial connection. + +2. Go to the generated example project director + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make` + +3. Build the example using + + make app + +4. To generate artefacts for self-boot of example from the board use + + make flash + +5. To run application from the board using microSD card: + * Copy the content of the created /bin folder into the root of microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) + * Plug in the microSD card into the J11 connector. + * Push the RST button. If a red LED is lit beside RST button, push the CFG button. + +6. If you have the MetaWare Debugger installed in your environment: + * To run application from the console using it type `make run`. 
+ * To stop the execution type `Ctrl+C` in the console several times. + +In both cases (step 5 and 6) you will see the application output in the serial terminal. + + ## Running on Arduino The following instructions will help you build and deploy this sample diff --git a/tensorflow/lite/micro/kernels/arc_mli/README.md b/tensorflow/lite/micro/kernels/arc_mli/README.md new file mode 100644 index 00000000000..2b2e194e757 --- /dev/null +++ b/tensorflow/lite/micro/kernels/arc_mli/README.md @@ -0,0 +1,57 @@ +# EmbARC MLI Library Based Optimizations of TensorFlow Lite Micro Kernels for ARC Platforms. + +This folder contains kernel implementations which use optimized [embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli). It allows acceleration of inference operations which use int8 (asymmetric quantization). + +## Usage + +embARC MLI Library is used by default to speed up execution of some kernels for asymmetrically quantized layers. This means that usual project generation for ARC specific target implies usage of embARC MLI. + +For example: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project + +In case MLI implementation can’t be used, kernels in this folder fallback to TFLM reference implementations. For applications which may not benefit from MLI library, projects can be generated without these implementations by adding `TAGS=no_arc_mli` in the command line, which can reduce overall code size: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_int8_make_project + +For ARC EM SDP board, a pre-compiled MLI library is downloaded and used in the application. For a custom target ARC-based platform, MLI sources are downloaded and compiled during project generation phase. To build library from sources for ARC EM SDP platform, add `BUILD_ARC_MLI=true` option to make command: + + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project. + +If an application exclusively uses accelerated MLI kernel implementations, one can strip out TFLM reference kernel implementations to reduce code size of application. Build application with `MLI_ONLY=true` option in generated project (after the project was built): + + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make + + make app MLI_ONLY=true + +if you try this and application execution fails, then most probably MLI can’t be used for some nodes and you need to revert to using TFLM reference kernels. + + +## Limitations + +Currently, the MLI Library provides optimized implementation only for int8 (asymmetric) versions of the following kernels: +1. Convolution 2D – Per axis quantization only, `dilation_ratio==1` +2. Depthwise Convolution 2D – Per axis quantization only, `dilation_ratio==1` +3. Average Pooling +4. Max Pooling +5. Fully Connected + +Currently only [/tensorflow/lite/micro/examples/person_detection_experimental](/tensorflow/lite/micro/examples/person_detection_experimental) is quantized using this specification. Other examples can be executed on ARC-based targets, but will only use reference kernels. + + +## Scratch Buffers and Slicing + +The following information applies only for ARC EM SDP and other targets with XY memory. embARC MLI uses specific optimizations which assumes node operands are in XY memory and/or DCCM (Data Closely Coupled Memory). 
As operands might be quite big and may not fit in available XY memory, special slicing logic is applied which allows kernel calculations to be split into multiple parts. For this reason, internal static buffers are allocated in these X, Y and DCCM memory banks and used to execute sub-calculations. + +All this is performed automatically and invisible to the user. Half of the DCCM memory bank and the full XY banks are occupied for MLI specific needs. If the user needs space in XY memory for other tasks, these arrays can be reduced by setting specific sizes. For this, add the following option to build command replacing **** with required values: + + EXT_CFLAGS=”-DSCRATCH_MEM_Z_SIZE= -DSCRATCH_MEM_X_SIZE= -DSCRATCH_MEM_Y_SIZE=” + +For example, to reduce sizes of arrays placed in DCCM and XCCM to 32k and 8k respectively, use next command: + + make app EXT_CFLAGS=”-DSCRATCH_MEM_Z_SIZE=32*1024 -DSCRATCH_MEM_X_SIZE=8*1024” + + +## License + +TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package. diff --git a/tensorflow/lite/micro/tools/make/targets/arc/README.md b/tensorflow/lite/micro/tools/make/targets/arc/README.md new file mode 100644 index 00000000000..8d20a4681ff --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/arc/README.md @@ -0,0 +1,214 @@ +# Building TensorFlow Lite for Microcontrollers for Synopsys DesignWare ARC EM/HS Processors + +This document contains the general information on building and running TensorFlow Lite Micro for targets based on the Synopsys ARC EM/HS Processors. + +## Table of Contents + +- [Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit) +- [ARC EM Software Development Platform (ARC EM SDP)](#ARC-EM-Software-Development-Platform-ARC-EM-SDP) +- [Custom ARC EM or HS Platform](#Custom-ARC-EMHS-Platform) + + +## Install the Synopsys DesignWare ARC MetaWare Development Toolkit + +The Synopsys DesignWare ARC MetaWare Development Toolkit (MWDT) is required to build and run Tensorflow Lite Micro applications for all ARC EM/HS targets. + +To license MWDT, please see further details [here](https://www.synopsys.com/dw/ipdir.php?ds=sw_metaware) + +To request an evaluation version of MWDT, please use the [Synopsys Eval Portal](https://eval.synopsys.com/) and follow the link for the MetaWare Development Toolkit (Important: Do not confuse this with MetaWare EV Development Toolkit or MetaWare Lite options also available on this page) + +Run the downloaded installer and follow the instructions to set up the toolchain on your platform. + +TensorFlow Lite for Microcontrollers builds are divided into two phases: Application Project Generation and Application Project Building/Running. The former phase requires \*nix environment while the latter does not. + +For basic project generation targeting [ARC EM Software Development Platform](#ARC-EM-Software-Development-Platform-ARC-EM-SDP), MetaWare is NOT required for the Project Generation Phase. 
However, it is required in case the following: +- For project generation for custom (not EM SDP) targets +- To build microlib target library with all required TFLM objects for external use + +Please consider the above when choosing whether to install Windows or Linux or both versions of MWDT + + +## ARC EM Software Development Platform (ARC EM SDP) + +This section describes how to deploy on an [ARC EM SDP board](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform) + +### Initial Setup + +To use the EM SDP, you need the following hardware and software: + +#### ARC EM SDP +More information on the platform, including ordering information, can be found [here](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform). + +#### MetaWare Development Toolkit +See [Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit) section for instructions on toolchain installation. + +#### Digilent Adept 2 System Software Package +If you wish to use the MetaWare Debugger to debug your code, you need to also install the Digilent Adept 2 software, which includes the necessary drivers for connecting to the targets. This is available from oficial [Digilent site](https://reference.digilentinc.com/reference/software/adept/start?redirect=1#software_downloads). You should install the “System” component, and Runtime. Utilities and SDK are NOT required. + +Digilent installation is NOT required if you plan to deploy to EM SDP via the SD card instead of using the debugger. + +#### Make Tool +A `'make'` tool is required for both phases of deploying Tensorflow Lite Micro applications on ARC EM SDP: +1. Application project generation +2. Working with generated application (build and run) + +For the first phase you need an environment and make tool compatible with Tensorflow Lite for Micro build system. At the moment of this writing, this requires make >=3.82 and a *nix-like environment which supports shell and native commands for file manipulations. MWDT toolkit is not required for this phase. + +For the second phase, requirements are less strict. The gmake version delivered with MetaWare Development Toolkit is sufficient. There are no shell and *nix command dependencies, so Windows can be used + + +#### Serial Terminal Emulation Application +The Debug UART port of the EM SDP is used to print application output. The USB connection provides both the debug channel and RS232 transport. You can use any terminal emulation program (like [PuTTY](https://www.putty.org/)) to view UART output from the EM SDP. + +#### microSD Card +If you want to self-boot your application (start it independently from a debugger connection), you also need a microSD card with a minimum size of 512 MB and a way to write to the card from your development host + +### Connect the Board + +1. Make sure Boot switches of the board (S3) are configured in the next way: + +| Switch # | Switch position | +| :-------: | :----------------: | +| 1 | Low (0) | +| 2 | Low (0) | +| 3 | High (1) | +| 4 | Low (0) | + + +2. Connect the power supply included in the product package to the ARC EM SDP. +3. Connect the USB cable to connector J10 on the ARC EM SDP (near the RST and CFG buttons) and to an available USB port on your development host. +4. Determine the COM port assigned to the USB Serial Port (on Windows, using Device Manager is an easy way to do this) +5. 
Execute the serial terminal application you installed in the previous step and open the serial connection with the early defined COM port (speed 115200 baud; 8 bits; 1 stop bit; no parity). +6. Push the CFG button on the board. After a few seconds you should see the boot log in the terminal which begins as follows: + +``` +U-Boot + +CPU: ARC EM11D v5.0 at 40 MHz +Subsys:ARC Data Fusion IP Subsystem +Model: snps,emsdp +Board: ARC EM Software Development Platform v1.0 +… +``` + +### Generate Application Project for ARC EM SDP + +Before building an example or test application, you need to generate a TFLM project for this application from TensorFlow sources and external dependencies. To generate it for ARC EM SDP board you need to set `TARGET=arc_emsdp` on the make command line. For instance, to build the Person Detect test application, use a shell to execute the following command from the root directory of the TensorFlow repo: + + make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET=arc_emsdp + +The application project will be generated into *tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_test_int8/make* + +Info on generating and building example applications for EM SDP (*tensorflow/lite/micro/examples*) can be found in the appropriate readme file placed in the same directory with the examples. In general, it’s the same process which described in this Readme. + +The [embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli) is used by default to speed up execution of some kernels for asymmetrically quantized layers. Kernels which use MLI-based implementations are kept in the *tensorflow/lite/micro/kernels/arc_mli* folder. For applications which may not benefit from MLI library, the project can be generated without these implementations by adding `TAGS=no_arc_mli` in the command line. This can reduce code size when the optimized kernels are not required. + +For more options on embARC MLI usage see [kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md). + +### Build the Application + +You may need to adjust the following commands in order to use the appropriate make tool available in your environment (ie: `make` or `gmake`) + +1. Open command shell and change the working directory to the location which contains the generated project, as described in the previous section + +2. Clean previous build artifacts (optional) + + make clean + +3. Build application + + make app + +### Run the Application on the Board Using MetaWare Debugger + +In case you do not have access to the MetaWare Debugger or have chosen not to install the Digilent drivers, you can skip to the next section. + +To run the application from the console, use the following command: + + make run + +If application runs in an infinite loop, type `Ctrl+C` several times to exit the debugger. + +To run the application in the GUI debugger, use the following command: + + make debug + +In both cases you will see the application output in the serial terminal. + +### Run the Application on the Board from the microSD Card + +1. Use the following command in the same command shell you used for building the application, as described in the previous step + + make flash + +2. Copy the content of the created *./bin* folder into the root of microSD card. Note that the card must be formatted as FAT32 with default cluster size (but less than 32 Kbytes) +3. Plug in the microSD card into the J11 connector. +4. 
Push the RST button. If a red LED is lit beside RST button, push the CFG button. + +You will see the application output in the serial terminal. + + + +## Custom ARC EM/HS Platform + +This section describes how to deploy on a Custom ARC EM/HS platform defined only by a TCF (Tool Configuration File, created at CPU configuration time) and optional LCF (Linker Command File). In this case, the real hardware is unknown, and applications can be run only in the nSIM simulator included with the MetaWare toolkit + +### Initial Setup + +To with custom ARC EM/HS platform, you need the following : +* Synopsys MetaWare Development Toolkit version 2019.12 or higher +* Make tool (make or gmake) + +See [Install the Synopsys DesignWare ARC MetaWare Development Toolkit](#install-the-synopsys-designware-arc-metaWare-development-toolkit) section for instructions on toolchain installation. +See [MetaWare Development Toolkit](#MetaWare-Development-Toolkit) and [Make Tool](#Make-Tool) sections for instructions on toolchain installation and comments about make versions. + +### Generate Application Project + +Before building the application itself, you need to generate the project for this application from TensorFlow sources and external dependencies. To generate it for a custom TCF you need to set the following variables in the make command line: +* TARGET_ARCH=arc +* TCF_FILE= +* (optional) LCF_FILE= + +If you don’t supply an external LCF, the one embedded in the TCF will be used instead + +For instance, to build **Person Detection** test application, use the following command from the root directory of the TensorFlow repo: + + make -f tensorflow/lite/micro/tools/make/Makefile generate_person_detection_test_int8_make_project TARGET_ARCH=arc TCF_FILE= LCF_FILE= + +The application project will be generated into *tensorflow/lite/micro/tools/make/gen/_arc/prj/person_detection_test_int8/make* + +The [embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli) is used by default to speed up execution of some kernels for asymmetrically quantized layers. Kernels which use MLI-based implementations are kept in the *tensorflow/lite/micro/kernels/arc_mli* folder. For applications which may not benefit from MLI library, the project can be generated without these implementations by adding `TAGS=no_arc_mli` in the command line. This can reduce code size when the optimized kernels are not required. + +For more options on embARC MLI usage see [kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md). + +### Build the Application + +You may need to adjust the following commands in order to use the appropriate make tool available in your environment (ie: `make` or `gmake`) + +1. Open command shell and change the working directory to the location which contains the generated project, as described in the previous section + +2. Clean previous build artifacts (optional) + + make clean + +3. Build application + + make app + +### Run the Application with MetaWare Debugger on the nSim Simulator. + +To run application from the console, use the following command: + + make run + +If application runs in an infinite loop, type `Ctrl+C` several times to exit the debugger. + +To run the application in the GUI debugger, use the following command: + + make debug + +You will see the application output in the same console where you ran it. 
+ +## License + +TensorFlow's code is covered by the Apache2 License included in the repository, and third-party dependencies are covered by their respective licenses, in the third_party folder of this package. diff --git a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl index b722b9c441d..0ddaf3e0a81 100644 --- a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl +++ b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC.md.tpl @@ -1,2 +1,45 @@ -# Mock Project Readme for common ARC target +# TensorFlow Lite Micro ARC Make Project +This folder has been autogenerated by TensorFlow, and contains sources, headers, and project files needed to build a single TensorFlow Lite Micro application using make tool and a Synopsys DesignWare ARC processor compatible toolchain, specifically the ARC MetaWare Development Toolkit (MWDT). + +This project has been generated for a target defined by TCF file only (Tool Configuration File). The real target board is unspecified, and applications can be run only in the nSIM simulator included with MWDT. + +See +[tensorflow/lite/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro) +for details on how projects like this can be generated from the main source tree. + +## Usage + +See [Custom ARC EM/HS Platform](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#Custom-ARC-EMHS-Platform) section for more detailed information on requirements and usage of this project. + +The Makefile contains all the information on building and running the project. One can modify it to satisfy specific needs. Next actions are available out of the box. You may need to adjust the following commands in order to use the appropriate make tool available in your environment, ie: `make` or `gmake` + +1. Build the application. + + make app + +2. Build the application passing additional flags to compiler. + + make app EXT_CFLAGS=[additional compiler flags] + +3. Build the application and stripout TFLM reference kernel fallback implementations in order to reduce code size. This only has an effect in case the project was generated with MLI support. See more info in [EmbARC MLI Library Based Optimizations](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/kernels/arc_mli/README.md). `false` is the default value. + + make app MLI_ONLY=[true|false] + +4. Delete all artifacts created during build. + + make clean + +5. Run the application with the nSIM simulator in console mode. + + make run + +6. Run the application with the nSIM simulator, but using the MetaWare Debugger GUI for further execution/debugging capabilities. + + make debug + + + +## License + +TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package. diff --git a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl index b3d9257f4d2..9d2801ed6b7 100644 --- a/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl +++ b/tensorflow/lite/micro/tools/make/templates/arc/README_ARC_EMSDP.md.tpl @@ -1,2 +1,48 @@ -# Mock Project Readme for ARC EMSDP target +# TensorFlow Lite Micro ARC Make Project for EM SDP Board. 
+This folder has been autogenerated by TensorFlow, and contains source, header, and project files needed to build a single TensorFlow Lite Micro target using make tool and and a Synopsys DesignWare ARC processor compatible toolchain, specifically the ARC MetaWare Development Toolkit (MWDT). + +This project has been generated for the ARC EM Software Development Platform (EM SDP). The built application can be run only on this platform. + +See +[tensorflow/lite/micro](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro) +for details on how projects like this can be generated from the main source tree. + +## Usage + +See [ARC EM Software Development Platform](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP) section for more detailed information on requirements and usage of this project. + +The Makefile contains all the information on building and running the project. One can modify it to satisfy specific needs. Next actions are available out of the box. You may need to adjust the following commands in order to use the appropriate make tool available in your environment, ie: `make` or `gmake`: + +1. Build the application. + + make app + +2. Build the application passing additional flags to compiler. + + make app EXT_CFLAGS=[additional compiler flags] + +3. Build the application and stripout TFLM reference kernel fallback implementations in order to reduce code size. This only has an effect in case the project was generated with MLI support. See more info in [EmbARC MLI Library Based Optimizations](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/kernels/arc_mli/README.md). `false` is the default value. + + make app MLI_ONLY=[true|false] + +4. Delete all artifacts created during build. + + make clean + +5. Run the application with the nSIM simulator in console mode. + + make run + +6. Load the application and open MetaWare Debugger GUI for further execution/debugging. + + make debug + +7. Generate necessary artefacts for self-booting execution from flash. See [reference to Run the application on the board from the micro SD card](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/tools/make/targets/arc/README.md#Run-the-Application-on-the-Board-from-the-microSD-Card). + + make flash + + +## License + +TensorFlow's code is covered by the Apache2 License included in the repository, and third party dependencies are covered by their respective licenses, in the third_party folder of this package. 
From bf687433298f29af2ee7fc1068329b50ed310693 Mon Sep 17 00:00:00 2001 From: Austin Anderson Date: Fri, 1 May 2020 11:56:12 -0700 Subject: [PATCH 0095/1533] Remove extra debug print statement --- tensorflow/tools/build_info/gen_build_info.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/tools/build_info/gen_build_info.py b/tensorflow/tools/build_info/gen_build_info.py index a00fc064fa5..42f62f5579f 100755 --- a/tensorflow/tools/build_info/gen_build_info.py +++ b/tensorflow/tools/build_info/gen_build_info.py @@ -107,7 +107,6 @@ parser.add_argument( args = parser.parse_args() if args.raw_generate: - print(args.key_value) write_build_info(args.raw_generate, args.key_value) else: raise RuntimeError( From 754e0d967f131165badc7d28b41bf6ad3f7c9132 Mon Sep 17 00:00:00 2001 From: Rishit Dagli <39672672+Rishit-dagli@users.noreply.github.com> Date: Sat, 2 May 2020 09:25:13 +0530 Subject: [PATCH 0096/1533] Added in resources section Added Coursera course Machine Learning with TensorFlow on GCP --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 27032043e07..d1bc88b8dbc 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,7 @@ Build Type | Status * [Getting Started with TensorFlow 2 from Coursera](https://www.coursera.org/learn/getting-started-with-tensor-flow2) * [Intro to TensorFlow for Deep Learning from Udacity](https://www.udacity.com/course/intro-to-tensorflow-for-deep-learning--ud187) * [Introduction to TensorFlow Lite from Udacity](https://www.udacity.com/course/intro-to-tensorflow-lite--ud190) +* [Machine Learning with TensorFLow on GCP](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp) * [TensorFlow Blog](https://blog.tensorflow.org) * [Learn ML with TensorFlow](https://www.tensorflow.org/resources/learn-ml) * [TensorFlow Twitter](https://twitter.com/tensorflow) From 87fc8379200e1faf536d74281c062820e33be75e Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Sat, 2 May 2020 14:50:25 -0400 Subject: [PATCH 0097/1533] [Lite] data: Fix memory leak --- .../lite/python/interpreter_wrapper/interpreter_wrapper.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc index bd78d56172e..313de20595d 100644 --- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -592,6 +592,7 @@ PyObject* InterpreterWrapper::GetTensor(int i) const { size_t size_of_type; if (GetSizeOfType(nullptr, tensor->type, &size_of_type) != kTfLiteOk) { PyErr_SetString(PyExc_ValueError, "Unknown tensor type."); + free(data) return nullptr; } sparse_buffer_dims[0] = tensor->bytes / size_of_type; From 74b9f9dcc9e7bfaf1a72ddab5a6711d748e6fbf8 Mon Sep 17 00:00:00 2001 From: Marcin Sielski Date: Sun, 3 May 2020 13:31:57 +0200 Subject: [PATCH 0098/1533] Cross and native compilation of TFLite for RPI Why: * Describe correct cross and native compilation process for RPI. This change addresses the need by: * Updates in instruction for cross compilation. * Alignement text style across whole instruction. 
--- tensorflow/lite/g3doc/guide/build_rpi.md | 104 ++++++++++++----------- 1 file changed, 53 insertions(+), 51 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/build_rpi.md b/tensorflow/lite/g3doc/guide/build_rpi.md index 1e04ee77a0e..a1724258118 100644 --- a/tensorflow/lite/g3doc/guide/build_rpi.md +++ b/tensorflow/lite/g3doc/guide/build_rpi.md @@ -5,87 +5,89 @@ Raspberry Pi. If you just want to start using TensorFlow Lite to execute your models, the fastest option is to install the TensorFlow Lite runtime package as shown in the [Python quickstart](python.md). -Note: This page shows how to compile only the C++ static library for +**Note:** This page shows how to compile only the C++ static library for TensorFlow Lite. Alternative install options include: [install just the Python interpreter API](python.md) (for inferencing only); [install the full TensorFlow package from pip](https://www.tensorflow.org/install/pip); or [build the full TensorFlow package]( https://www.tensorflow.org/install/source_rpi). - ## Cross-compile for Raspberry Pi -This has been tested on Ubuntu 16.04.3 64bit and TensorFlow devel docker image +Instruction has been tested on Ubuntu 16.04.3 64-bit PC (AMD64) and TensorFlow devel +docker image [tensorflow/tensorflow:nightly-devel](https://hub.docker.com/r/tensorflow/tensorflow/tags/). -To cross compile TensorFlow Lite, first install the toolchain and libs: +To cross compile TensorFlow Lite follow the steps: -```bash -sudo apt-get update -sudo apt-get install crossbuild-essential-armhf -# The following is only needed for Pi Zero build. -sudo apt-get install crossbuild-essential-armel -``` +1. Clone official Raspberry Pi cross-compilation toolchain: -If you are using Docker, you may not use `sudo`. + ```bash + git clone --depth 1 https://github.com/raspberrypi/tools.git rpi_tools + ``` -Now git-clone the TensorFlow repository -(`https://github.com/tensorflow/tensorflow`)—if you're using the TensorFlow -Docker image, the repo is already provided in `/tensorflow_src/`—and then run -this script at the root of the TensorFlow repository to download all the +2. Clone TensorFlow repository: + + ```bash + git clone --depth 1 https://github.com/tensorflow/tensorflow.git tensorflow_src + + ``` + + **Note:** If you're using the TensorFlow Docker image, the repo is already provided in `/tensorflow_src/`. + +3. Run following script at the root of the TensorFlow repository to download all the build dependencies: -```bash -./tensorflow/lite/tools/make/download_dependencies.sh -``` + ```bash + cd tensor_src && ./tensorflow/lite/tools/make/download_dependencies.sh + ``` -Note that you only need to do this once. + **Note:** You only need to do this once. -You should then be able to compile: +4. To build ARMv7 binary for Raspberry Pi 2, 3 and 4 execute: -To build ARMv7 binary for Raspberry Pi 2, 3 and 4: + ```bash + PATH=$PATH:../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/ ./tensorflow/lite/tools/make/build_rpi_lib.sh + ``` -```bash -./tensorflow/lite/tools/make/build_rpi_lib.sh -``` + **Note:** This should compile a static library in: + `tensorflow/lite/tools/make/gen/rpi_armv7l/lib/libtensorflow-lite.a`. -This should compile a static library in: -`tensorflow/lite/tools/make/gen/rpi_armv7l/lib/libtensorflow-lite.a`. +5. 
To build ARMv6 binary for Raspberry Pi Zero execute: -To build ARMv6 binary for Raspberry Pi Zero: + ```bash + PATH=$PATH:../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/ ./tensorflow/lite/tools/make/build_rpi_lib.sh TARGET_ARCH=armv6 + ``` -```bash -./tensorflow/lite/tools/make/build_rpi_lib.sh TARGET_ARCH=armv6 -``` - -This should compile a static library in: -`tensorflow/lite/tools/make/gen/rpi_armv6/lib/libtensorflow-lite.a`. + **Note:** This should compile a static library in: + `tensorflow/lite/tools/make/gen/rpi_armv6/lib/libtensorflow-lite.a`. ## Compile natively on Raspberry Pi -This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1). +Instruction has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1): -Log in to your Raspberry Pi and install the toolchain: +To natively compile TensorFlow Lite follow the steps: -```bash -sudo apt-get install build-essential -``` +1. Log in to your Raspberry Pi and install the toolchain: -Now git-clone the TensorFlow repository -(`https://github.com/tensorflow/tensorflow`) and run this at the root of -the repository: + ```bash + sudo apt-get install build-essential + ``` -```bash -./tensorflow/lite/tools/make/download_dependencies.sh -``` +2. Run following script at the root of the TensorFlow repository to download all the +build dependencies: -Note that you only need to do this once. + ```bash + cd tensor_src && ./tensorflow/lite/tools/make/download_dependencies.sh + ``` -You should then be able to compile: + **Note:** You only need to do this once. -```bash -./tensorflow/lite/tools/make/build_rpi_lib.sh -``` +3. You should then be able to compile TensorFlow Lite with: -This should compile a static library in: -`tensorflow/lite/tools/make/gen/lib/rpi_armv7/libtensorflow-lite.a`. + ```bash + ./tensorflow/lite/tools/make/build_rpi_lib.sh + ``` + + **Note:** This should compile a static library in: + `tensorflow/lite/tools/make/gen/lib/rpi_armv7/libtensorflow-lite.a`. From eeafd8091221d332649cd4df50f679e3b406f88c Mon Sep 17 00:00:00 2001 From: Hahn Anselm Date: Sun, 3 May 2020 20:15:11 +0200 Subject: [PATCH 0099/1533] Removing unreachable return --- tensorflow/lite/python/op_hint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py index 29683718016..c7f49bdf4b6 100644 --- a/tensorflow/lite/python/op_hint.py +++ b/tensorflow/lite/python/op_hint.py @@ -1168,7 +1168,6 @@ def _get_correct_mapping(original_index, nodes): return node_indices[-1] else: return original_index - return original_index def _convert_op_hints_to_stubs_helper( From a744818876ab362ad4112b625f40b2a0dbdafb12 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 3 May 2020 22:36:08 +0000 Subject: [PATCH 0100/1533] Fix ValueError with tf.keras.metrics.Recall and float64 keras backend This PR fixes the issue raised in 36790 where tf.keras.metrics.Recall causes ValueError when the backend of the keras is float64: This PR cast the value to the dtype of var as var.assign_add is being called. This PR fixes 36790. 
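
A minimal repro sketch of the failure this addresses, using only the public
tf.keras API (the shapes and values below are illustrative, not taken from
the original issue):

    import numpy as np
    import tensorflow as tf

    tf.keras.backend.set_floatx('float64')

    recall = tf.keras.metrics.Recall()
    y_true = np.array([[1.0], [0.0], [1.0], [1.0]])
    y_pred = np.array([[0.9], [0.2], [0.4], [0.8]])

    # Without the cast to the variable dtype, the confusion-matrix update is
    # computed in float32, and assign_add into the float64 metric variables
    # raises a ValueError.
    recall.update_state(y_true, y_pred)
    print(recall.result().numpy())
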
Signed-off-by: Yong Tang --- tensorflow/python/keras/utils/metrics_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/utils/metrics_utils.py b/tensorflow/python/keras/utils/metrics_utils.py index 58fff40564d..5cb6fc5f9f8 100644 --- a/tensorflow/python/keras/utils/metrics_utils.py +++ b/tensorflow/python/keras/utils/metrics_utils.py @@ -422,9 +422,9 @@ def update_confusion_matrix_variables(variables_to_update, def weighted_assign_add(label, pred, weights, var): label_and_pred = math_ops.cast( - math_ops.logical_and(label, pred), dtype=dtypes.float32) + math_ops.logical_and(label, pred), dtype=var.dtype) if weights is not None: - label_and_pred *= weights + label_and_pred *= math_ops.cast(weights, dtype=var.dtype) return var.assign_add(math_ops.reduce_sum(label_and_pred, 1)) loop_vars = { From 8dadd8304f3f15ecdc4d780c39b371390ab14fb7 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 3 May 2020 22:39:51 +0000 Subject: [PATCH 0101/1533] Add test case for tf.keras.metrics.Recall() and float64 keras backend. Signed-off-by: Yong Tang --- tensorflow/python/keras/metrics_test.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py index ba1b76bab32..29fc36b5fc6 100644 --- a/tensorflow/python/keras/metrics_test.py +++ b/tensorflow/python/keras/metrics_test.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import test_util +from tensorflow.python.keras import backend from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import layers @@ -2174,6 +2175,23 @@ class ResetStatesTest(keras_parameterized.TestCase): self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1) self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1) + def test_reset_states_recall_float64(self): + # Test case for GitHub issue 36790. + try: + backend.set_floatx('float64') + r_obj = metrics.Recall() + model = _get_model([r_obj]) + x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4)))) + y = np.concatenate((np.ones((50, 1)), np.ones((50, 1)))) + model.evaluate(x, y) + self.assertEqual(self.evaluate(r_obj.true_positives), 50.) + self.assertEqual(self.evaluate(r_obj.false_negatives), 50.) + model.evaluate(x, y) + self.assertEqual(self.evaluate(r_obj.true_positives), 50.) + self.assertEqual(self.evaluate(r_obj.false_negatives), 50.) + finally: + backend.set_floatx('float32') + if __name__ == '__main__': test.main() From ee5854dd2c32085f0a9af4e5aa312970e22bf479 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Fri, 1 May 2020 16:25:43 +0100 Subject: [PATCH 0102/1533] Added 16x8 LeakyRelu kernel. 
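
With the int16 kernel in place, a model containing LeakyReLU can be exercised
through the experimental 16x8 (int16 activations, int8 weights) conversion
path. A rough end-to-end sketch, assuming the converter exposes the
EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8 ops set; the model
and representative data below are placeholders:

    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(8,)),
        tf.keras.layers.Dense(8),
        tf.keras.layers.LeakyReLU(alpha=0.1),
    ])

    def representative_data():
        # Placeholder calibration data for full-integer quantization.
        for _ in range(10):
            yield [np.random.rand(1, 8).astype(np.float32)]

    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_data
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.EXPERIMENTAL_TFLITE_BUILTINS_ACTIVATIONS_INT16_WEIGHTS_INT8
    ]
    tflite_model = converter.convert()
    # The LEAKY_RELU op in the result should now take int16 activations
    # rather than falling back to float.
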
--- tensorflow/lite/kernels/activations.cc | 45 ++++++++++--------- tensorflow/lite/kernels/activations_test.cc | 37 ++++++++++++--- .../lite/tools/optimize/operator_property.cc | 4 ++ 3 files changed, 59 insertions(+), 27 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 4d52b5c7446..0fcf6baf1bd 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -364,7 +364,8 @@ TfLiteStatus LeakyReluPrepare(TfLiteContext* context, TfLiteNode* node) { LeakyReluOpData* data = reinterpret_cast(node->user_data); - if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) { + if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 || + output->type == kTfLiteInt16) { const auto* params = reinterpret_cast(node->builtin_data); @@ -1183,6 +1184,22 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { } } +template +void QuantizeLeakyRelu(const TfLiteTensor* input, TfLiteTensor* output, + const LeakyReluOpData* data) { + LeakyReluParams op_params; + + op_params.input_offset = input->params.zero_point; + op_params.output_offset = output->params.zero_point; + op_params.output_multiplier_alpha = data->output_multiplier_alpha; + op_params.output_shift_alpha = data->output_shift_alpha; + op_params.output_multiplier_identity = data->output_multiplier_identity; + op_params.output_shift_identity = data->output_shift_identity; + reference_ops::QuantizeLeakyRelu( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); +} + TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); @@ -1201,33 +1218,21 @@ TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } break; case kTfLiteUInt8: { - op_params.input_offset = input->params.zero_point; - op_params.output_offset = output->params.zero_point; - op_params.output_multiplier_alpha = data->output_multiplier_alpha; - op_params.output_shift_alpha = data->output_shift_alpha; - op_params.output_multiplier_identity = data->output_multiplier_identity; - op_params.output_shift_identity = data->output_shift_identity; - reference_ops::QuantizeLeakyRelu( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); + QuantizeLeakyRelu(input, output, data); return kTfLiteOk; } break; case kTfLiteInt8: { - op_params.input_offset = input->params.zero_point; - op_params.output_offset = output->params.zero_point; - op_params.output_multiplier_alpha = data->output_multiplier_alpha; - op_params.output_shift_alpha = data->output_shift_alpha; - op_params.output_multiplier_identity = data->output_multiplier_identity; - op_params.output_shift_identity = data->output_shift_identity; - reference_ops::QuantizeLeakyRelu( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); + QuantizeLeakyRelu(input, output, data); + return kTfLiteOk; + } break; + case kTfLiteInt16: { + QuantizeLeakyRelu(input, output, data); return kTfLiteOk; } break; default: TF_LITE_KERNEL_LOG( context, - "Only float32, int8 and uint8 is supported currently, got %s.", + "Only float32, int8, int16 and uint8 is supported currently, got %s.", TfLiteTypeGetName(input->type)); return kTfLiteError; } diff --git a/tensorflow/lite/kernels/activations_test.cc 
b/tensorflow/lite/kernels/activations_test.cc index b4711216524..3a1b8644818 100644 --- a/tensorflow/lite/kernels/activations_test.cc +++ b/tensorflow/lite/kernels/activations_test.cc @@ -108,10 +108,20 @@ class BaseActivationsOpModel : public SingleOpModel { BaseActivationsOpModel(TensorData input, float alpha) { input_ = AddInput(input); // The output scale and input scale might be different. - if (input.type == TensorType_UINT8 || input.type == TensorType_INT8) { + if (input.type == TensorType_UINT8 || input.type == TensorType_INT8 || + input.type == TensorType_INT16) { auto output_min = (input.min >= 0) ? input.min : input.min * alpha; auto output_max = (input.max >= 0) ? input.max : input.max * alpha; - output_ = AddOutput({input.type, {}, output_min, output_max}); + if (input.type == TensorType_INT16) { + output_ = AddOutput({TensorType_INT16, + {}, + output_min, + output_max, + 1.0f / (std::numeric_limits::max() + 1), + 0}); + } else { + output_ = AddOutput({input.type, {}, output_min, output_max}); + } } else { output_ = AddOutput({input.type, {}}); } @@ -504,14 +514,15 @@ TEST(QuantizedActivationsOpTest, LeakyReluUint8) { kQuantizedTolerance * 8))); } -TEST(QuantizedActivationsOpTest, LeakyReluInt8) { +template +void QuantizedActivationsOpTestLeakyRelu() { const float kMin = -1; const float kMax = 127.f / 128.f; QuantizedActivationsOpModel m( - /*input=*/{TensorType_INT8, {5, 5}, 5 * kMin, 5 * kMax}, 0.1); + /*input=*/{tensor_type, {5, 5}, 5 * kMin, 5 * kMax}, 0.1); - m.SetInput({ + m.SetInput({ -5.0f, -4.6f, -4.2f, -3.8f, -3.4f, // Row 1 -3.0f, -2.6f, -2.2f, -1.8f, -1.4f, // Row 2 -1.0f, -0.6f, -0.2f, 0.2f, 0.6f, // Row 3 @@ -519,7 +530,11 @@ TEST(QuantizedActivationsOpTest, LeakyReluInt8) { 3.0f, 3.4f, 3.8f, 4.2f, 4.6f, // Row 5 }); m.Invoke(); - EXPECT_THAT(m.GetDequantizedOutput(), + + float kTestQuantizedTolerance = tensor_type == TensorType_INT16 ? + kQuantizedToleranceInt16 : kQuantizedTolerance * 5; + + EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear( { -0.50f, -0.46f, -0.42f, -0.38f, -0.34f, // Row 1 @@ -528,7 +543,15 @@ TEST(QuantizedActivationsOpTest, LeakyReluInt8) { 1.00f, 1.40f, 1.80f, 2.20f, 2.60f, // Row 4 3.00f, 3.40f, 3.80f, 4.20f, 4.60f, // Row 5 }, - kQuantizedTolerance * 5))); + kTestQuantizedTolerance))); +} + +TEST(QuantizedActivationsOpTest, LeakyReluInt8) { + QuantizedActivationsOpTestLeakyRelu(); +} + +TEST(QuantizedActivationsOpTest, LeakyReluInt16) { + QuantizedActivationsOpTestLeakyRelu(); } TEST(QuantizedActivationsOpTest, Relu1Int8) { diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 71fdad87bd2..def732a8eb3 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -816,6 +816,10 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.version = 2; break; case BuiltinOperator_LEAKY_RELU: + property.inputs = {{0, {}}}; + property.outputs = {{0, {}}}; + property.version = 2; + break; case BuiltinOperator_RELU: case BuiltinOperator_RELU6: property.inputs = {{0, {}}}; From 9acac9150e55ca7e54c2db34cf4f00de4a0bd7a9 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 4 May 2020 11:42:29 +0100 Subject: [PATCH 0103/1533] Updated versioning for LeakyRelu. 
--- tensorflow/lite/toco/tflite/op_version.cc | 1 + tensorflow/lite/tools/versioning/op_version.cc | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc index 1b259b796b2..ca4cd801921 100644 --- a/tensorflow/lite/toco/tflite/op_version.cc +++ b/tensorflow/lite/toco/tflite/op_version.cc @@ -170,6 +170,7 @@ string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kUnpack, 3}, "2.2.0"}, {{OperatorType::kUnpack, 4}, kPendingReleaseOpVersion}, {{OperatorType::kLeakyRelu, 1}, "1.13.1"}, + {{OperatorType::kLeakyRelu, 2}, kPendingReleaseOpVersion}, {{OperatorType::kLogistic, 1}, "1.14.0"}, {{OperatorType::kLogistic, 2}, "1.14.0"}, {{OperatorType::kLogSoftmax, 1}, "1.14.0"}, diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index 0b892cf847f..dc61aef33d8 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -426,6 +426,12 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 1; + case BuiltinOperator_LEAKY_RELU: + if (op_sig.input_types.at(0) == TensorType_INT16) { + return 2; + } + return 1; + case BuiltinOperator_ADD: case BuiltinOperator_CONCATENATION: case BuiltinOperator_PAD: From e4281619f160dae0e04fc71c634da609c11ec52e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 4 May 2020 02:57:56 +0000 Subject: [PATCH 0104/1533] Add cardinality calculation for Dataset.unbatch() when possible This PR tries to address the issue raised in 39136 where cardinality of Dataset.unbatch() was always UNKNOWN, even if it might be known in certain situations. This PR add the cardinality calculation in case the input cardinality is known and the leading dim of the output shape is known. This PR fixes 39136. 
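
A small sketch of the intended behaviour against the public tf.data API (the
concrete sizes are only for illustration):

    import tensorflow as tf

    batched = tf.data.Dataset.range(8).batch(2, drop_remainder=True)
    print(tf.data.experimental.cardinality(batched).numpy())  # 4

    # With a statically known leading dimension, unbatch() can now report
    # 4 * 2 = 8 instead of UNKNOWN_CARDINALITY.
    print(tf.data.experimental.cardinality(batched.unbatch()).numpy())

    # When the batch size is not statically known (drop_remainder=False),
    # the cardinality remains unknown, as before.
    ragged = tf.data.Dataset.range(8).batch(2, drop_remainder=False).unbatch()
    print(tf.data.experimental.cardinality(ragged).numpy() ==
          tf.data.experimental.UNKNOWN_CARDINALITY)
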
Signed-off-by: Yong Tang --- .../data/experimental/unbatch_dataset_op.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc index 111afa218df..9b2434d2046 100644 --- a/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc @@ -38,8 +38,14 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel { explicit Dataset(OpKernelContext* ctx, DatasetBase* input) : DatasetBase(DatasetContext(ctx)), input_(input) { input_->Ref(); + known_batch_size_ = -1; for (const PartialTensorShape& shape : input->output_shapes()) { if (!shape.unknown_rank()) { + if (known_batch_size_ < 0) { + if (shape.dim_size(0) >= 0) { + known_batch_size_ = shape.dim_size(0); + } + } gtl::InlinedVector partial_dim_sizes; for (int i = 1; i < shape.dims(); ++i) { partial_dim_sizes.push_back(shape.dim_size(i)); @@ -69,6 +75,17 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel { string DebugString() const override { return "UnbatchDatasetOp::Dataset"; } + int64 Cardinality() const override { + int64 n = input_->Cardinality(); + if (n == kInfiniteCardinality || n == kUnknownCardinality) { + return n; + } + if (known_batch_size_ > 0) { + return n * known_batch_size_; + } + return kUnknownCardinality; + } + Status CheckExternalState() const override { return input_->CheckExternalState(); } @@ -222,6 +239,7 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel { const DatasetBase* const input_; std::vector shapes_; + int64 known_batch_size_; }; }; From 8700af511ce3a6d4d2e200c8146c4a43e3933ee8 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 4 May 2020 03:00:25 +0000 Subject: [PATCH 0105/1533] Add test case for cardinality with Dataset.unbatch() Signed-off-by: Yong Tang --- .../data/experimental/kernel_tests/cardinality_test.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py index ffc98b917d2..34147f3a479 100644 --- a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py @@ -134,6 +134,15 @@ def _test_combinations(): lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).take(2), cardinality.UNKNOWN), ("Take4", lambda: dataset_ops.Dataset.range(5).repeat().take(2), 2), + ("Unbatch1", + lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=True).unbatch(), 4), + ("Unbatch2", + lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=False).unbatch(), cardinality.UNKNOWN), + ("Unbatch3", + lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=True).filter(lambda _: True).unbatch(), + cardinality.UNKNOWN), + ("Unbatch4", lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=True).repeat().unbatch(), + cardinality.INFINITE), ("Window1", lambda: dataset_ops.Dataset.range(5).window( size=2, shift=2, drop_remainder=True), 2), ("Window2", lambda: dataset_ops.Dataset.range(5).window( From 9cfeae817a1af13c53fec4e524c905132ce23c55 Mon Sep 17 00:00:00 2001 From: Hahn Anselm Date: Mon, 4 May 2020 19:35:41 +0200 Subject: [PATCH 0106/1533] Rearange the return --- tensorflow/lite/python/op_hint.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py index 
c7f49bdf4b6..159fcaa2bf3 100644 --- a/tensorflow/lite/python/op_hint.py +++ b/tensorflow/lite/python/op_hint.py @@ -1166,8 +1166,7 @@ def _get_correct_mapping(original_index, nodes): node_indices = nodes.keys() node_indices = sorted(node_indices) return node_indices[-1] - else: - return original_index + return original_index def _convert_op_hints_to_stubs_helper( From b37043c7dd809413c515b15a714d89a5759b78c9 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 4 May 2020 10:42:55 -0700 Subject: [PATCH 0107/1533] Fix tests that use __repr__ --- .../ops/structured/structured_tensor.py | 2 +- .../ops/structured/structured_tensor_test.py | 19 +++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/ops/structured/structured_tensor.py b/tensorflow/python/ops/structured/structured_tensor.py index 1ca80d314e6..f82f4fbf73d 100644 --- a/tensorflow/python/ops/structured/structured_tensor.py +++ b/tensorflow/python/ops/structured/structured_tensor.py @@ -438,7 +438,7 @@ class StructuredTensor(composite_tensor.CompositeTensor): def __repr__(self): return '' % (', '.join( - '%s: %s' % (k, v) for k, v in sorted(self._fields.items())), + '"%s": %s' % (k, v) for k, v in sorted(self._fields.items())), self._shape) #============================================================================= diff --git a/tensorflow/python/ops/structured/structured_tensor_test.py b/tensorflow/python/ops/structured/structured_tensor_test.py index e2d6a161641..a892dc56d17 100644 --- a/tensorflow/python/ops/structured/structured_tensor_test.py +++ b/tensorflow/python/ops/structured/structured_tensor_test.py @@ -900,14 +900,25 @@ class StructuredTensorTest(test_util.TensorFlowTestCase, st = StructuredTensor.from_pyval({"a": 5, "b": {"c": [1, 2, 3]}}) self.assertAllEqual(st.field_value(("a",)), 5) self.assertAllEqual(st.field_value(("b", "c")), [1, 2, 3]) - with self.assertRaisesRegexp(KeyError, - r"Field path \('a', 'b'\) not found in .*"): + expected = "Field path \(.*a.*,.*b.*\) not found in .*" + with self.assertRaisesRegexp(KeyError, expected): st.field_value(("a", "b")) def testRepr(self): st = StructuredTensor.from_pyval({"a": 5, "b": {"c": [1, 2, 3]}}) - self.assertEqual( - repr(st), "") + if context.executing_eagerly(): + expected = ('}, shape=())>') + else: + expected = ('}, shape=())>') + self.assertEqual(repr(st), expected) if __name__ == "__main__": From b9579f96bd07d3016285128e1e2466540b47bf01 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Fri, 27 Mar 2020 14:20:09 -0700 Subject: [PATCH 0108/1533] Vectorize transpose --- tensorflow/core/kernels/conv_2d_gpu.h | 91 +++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 31abe9dfead..90d85e6f04e 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -210,6 +210,57 @@ __global__ void ShuffleInTensor3Simple(int nthreads, } } +constexpr int kUnroll = 4; + +template +__global__ void ShuffleInTensor3SimpleVector(int nthreads, + const T* __restrict__ input, + Dimension<3> input_dims, + T* __restrict__ output) { + Dimension<3> output_dims; + output_dims[sp0] = input_dims[0]; + output_dims[sp1] = input_dims[1]; + output_dims[sp2] = input_dims[2]; + + const int stride = blockDim.x * gridDim.x * kUnroll; + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + T buf[kUnroll]; + + int output_index; + for (output_index = tid * kUnroll; output_index + kUnroll - 1 < nthreads; 
+ output_index += stride) { +#pragma unroll + for (int i = 0; i < kUnroll; i++) { + int output_index_i = output_index + i; + Index<3> output_tensor_index = FlatToTensorIndex(output_index_i, + output_dims); + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index_i = TensorIndexToFlat(input_tensor_index, input_dims); + buf[i] = maybe_conj::run(ldg(input + input_index_i)); + } + float2 *out = reinterpret_cast(output + output_index); + *out = *reinterpret_cast(buf); + } + + for(; output_index < nthreads; output_index++) { + Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); + + Index<3> input_tensor_index; + input_tensor_index[0] = output_tensor_index[sp0]; + input_tensor_index[1] = output_tensor_index[sp1]; + input_tensor_index[2] = output_tensor_index[sp2]; + + int input_index = TensorIndexToFlat(input_tensor_index, input_dims); + + output[output_index] = + maybe_conj::run(ldg(input + input_index)); + } +} + // Use shared memory tiles to swap dimension-1 and dimension-2 of a 3D tensor, // where dimensions are zero-based: output[i][j][k] = input[i][k][j]. // @@ -1008,10 +1059,42 @@ struct SwapDimension0And2InTensor3 { static_cast(combined_dims[2])}; size_t total_size = combined_dims[0] * combined_dims[1] * combined_dims[2]; GpuLaunchConfig config = GetGpuLaunchConfig(total_size, d); - TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, - config.block_count, config.thread_per_block, 0, - d.stream(), config.virtual_thread_count, in, - input_dims, out)); + + auto out_ptr = reinterpret_cast(out); + bool aligned = out_ptr % 16 == 0; + + bool use_vector = false; + bool use_custom_config = false; + if (input_dims[0] <= 128 && input_dims[2] <= 128 || + input_dims[0] * input_dims[1] <= 128 || + input_dims[1] * input_dims[2] <= 8) { + use_vector = true; + use_custom_config = true; + } else if (input_dims[1] * input_dims[2] <= 16384) { + use_vector = true; + } + + if (sizeof(T) == 2 && aligned && use_vector) { + int block_count; + if (use_custom_config) { + block_count = (total_size + config.thread_per_block - 1) / + config.thread_per_block; + } else { + block_count = config.block_count; + } + + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3SimpleVector, + block_count, + config.thread_per_block / kUnroll, + 0, d.stream(), total_size, + in, input_dims, out)); + } else { + TF_CHECK_OK(GpuLaunchKernel(ShuffleInTensor3Simple, + config.block_count, config.thread_per_block, + 0, d.stream(), config.virtual_thread_count, + in, input_dims, out)); + } } }; From 11a16a341d4ada66805d2941905de43756c01dee Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 4 May 2020 14:04:48 -0700 Subject: [PATCH 0109/1533] Fix lint error --- tensorflow/python/ops/structured/structured_tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/structured/structured_tensor.py b/tensorflow/python/ops/structured/structured_tensor.py index f82f4fbf73d..5811294b176 100644 --- a/tensorflow/python/ops/structured/structured_tensor.py +++ b/tensorflow/python/ops/structured/structured_tensor.py @@ -439,7 +439,7 @@ class StructuredTensor(composite_tensor.CompositeTensor): def __repr__(self): return '' % (', '.join( '"%s": %s' % (k, v) for k, v in sorted(self._fields.items())), - self._shape) + self._shape) #============================================================================= # Conversion From 
8723024fdd62bd4cbcf791a222ef5c16398528bb Mon Sep 17 00:00:00 2001 From: Andrew Stevens Date: Mon, 4 May 2020 23:53:39 +0200 Subject: [PATCH 0110/1533] Fixed: formatting/merge oops noted in PR review. --- tensorflow/compiler/mlir/lite/tf_tfl_translate.cc | 2 -- tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc | 2 +- tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc | 3 --- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 4e37386401a..4d410577532 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -175,8 +175,6 @@ int main(int argc, char **argv) { if (!module.ok()) return kTrFailure; mlir::PassManager pm(&context); - applyPassManagerCLOptions(pm); - mlir::applyPassManagerCLOptions(pm); // Set the quantization specifications from the command line flags. diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index e24bfe883a5..54040d63503 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -227,7 +227,7 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { func.walk([&](ConcatenationOp concat) { if (concat.output().hasOneUse() && Quantized(*concat.output().user_begin())) { - return; + return; } concat.emitWarning( "Missing quantization parameter on the output might introduce " diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 51553ea35f4..80e48fd1c83 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -692,9 +692,6 @@ void PrepareTFPass::runOnFunction() { // parameters from the TF Quant ops, thus this pattern should run with the // first `applyPatternsGreedily` method, which would otherwise removes the // TF FakeQuant ops by the constant folding. 
-  //patterns.insert(ctx);
-  patterns.insert(ctx);

From 2560a7c41fb874b83bd9b1c337b59fada9f78285 Mon Sep 17 00:00:00 2001
From: Andrew Stevens
Date: Mon, 4 May 2020 23:55:17 +0200
Subject: [PATCH 0111/1533] Fix: ignore shape for quantization == comparison

---
 .../compiler/mlir/lite/transforms/optimize.cc | 19 +++++++++++++++++--
 .../mlir/lite/transforms/optimize_patterns.td |  2 +-
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc
index c44222bf496..36b786a521a 100644
--- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc
@@ -206,13 +206,28 @@ DenseElementsAttr GetShape(Value output_val) {
       llvm::makeArrayRef(shape));
 }
 
-bool notFromQuantOpDifferentQuanteUse( Value val, TypeAttr qtype) {
+static Type getShapeStrippedType(TypeAttr type_attr)
+{
+  auto type = type_attr.getValue();
+  auto shaped_type = type.dyn_cast<ShapedType>();
+  if (shaped_type) {
+    return shaped_type.getElementType();
+  } else {
+    return type;
+  }
+}
+
+bool NotFromQuantOpDifferentQuant(Value val, TypeAttr qtype_attr) {
   auto val_defn_op = val.getDefiningOp();
   TFL::QuantizeOp q_op = llvm::dyn_cast_or_null<TFL::QuantizeOp>(val_defn_op);
   if( !q_op) return true;
-  return q_op.qtype() == qtype.getValue();
+  // Ignore shape details - we're really only trying to
+  // check if quantization is the same.
+  auto stripped_src_qtype = getShapeStrippedType(q_op.qtypeAttr());
+  auto stripped_qtype = getShapeStrippedType(qtype_attr);
+  return stripped_src_qtype == stripped_qtype;
 }
 
 #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc"
diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td
index c6c71b106c7..1c9a61f8519 100644
--- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td
+++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td
@@ -33,7 +33,7 @@ def HasOneUse : Constraint<CPred<"$0.hasOneUse()">>;
 
 // Checks value is not produce by a TLF_QUant with
 // different quantization attribute
-def NotFromQuantOpDifferentQuant : Constraint<CPred<"notFromQuantOpDifferentQuanteUse($0, $1)">>;
+def NotFromQuantOpDifferentQuant : Constraint<CPred<"NotFromQuantOpDifferentQuant($0, $1)">>;
 //===----------------------------------------------------------------------===//
 // Ternary ops patterns.
 //===----------------------------------------------------------------------===//

From 0916c404a081488945ea28d12de129e63829f5d1 Mon Sep 17 00:00:00 2001
From: Lukas Geiger
Date: Mon, 4 May 2020 23:58:45 +0100
Subject: [PATCH 0112/1533] Mark QuantizationSpec methods that don't change member data as constant

---
 .../compiler/mlir/lite/quantization/quantization_config.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h
index 5b1c73e7887..ea59f49f5b7 100644
--- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h
+++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h
@@ -84,7 +84,7 @@ struct QuantizationSpecs {
   bool RunWeightQuantization() const { return weight_quantization; }
 
   // Whether this inference type represents a signed storage type.
-  bool IsSignedInferenceType() {
+  bool IsSignedInferenceType() const {
     switch (inference_type) {
       case tensorflow::DT_QUINT8:
       case tensorflow::DT_QUINT16:
@@ -96,7 +96,7 @@ struct QuantizationSpecs {
 
   // Gets the width of this quantization type. Returns 0 if it isn't a
   // quantization type.
-  int64_t GetQuantizationTypeWidth() {
+  int64_t GetQuantizationTypeWidth() const {
     switch (inference_type) {
       case tensorflow::DT_QINT8:
       case tensorflow::DT_QUINT8:

From a5ae7f124b27f97a35de9b3778f98e2bcc62a10f Mon Sep 17 00:00:00 2001
From: Lukas Geiger
Date: Tue, 5 May 2020 00:03:05 +0100
Subject: [PATCH 0113/1533] Respect inference type in DefaultQuantParamsPass

---
 tensorflow/compiler/mlir/lite/tf_tfl_passes.cc |  3 ++-
 .../lite/transforms/default_quant_params.cc    | 18 ++++++++++++------
 .../compiler/mlir/lite/transforms/passes.h     |  2 +-
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc
index 5eefa821c6b..8dbc84d1a9f 100644
--- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc
+++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc
@@ -48,7 +48,8 @@ void AddQuantizationPasses(const mlir::TFL::QuantizationSpecs& quant_specs,
       quant_specs.default_ranges.second.hasValue()) {
     pass_manager->addPass(mlir::TFL::CreateDefaultQuantParamsPass(
         quant_specs.default_ranges.first.getValueOr(0.0),
-        quant_specs.default_ranges.second.getValueOr(0.0)));
+        quant_specs.default_ranges.second.getValueOr(0.0),
+        quant_specs.IsSignedInferenceType()));
     pass_manager->addPass(mlir::TFL::CreateQuantizePass());
     pass_manager->addPass(
         mlir::TFL::CreatePostQuantizePass(emit_quant_adaptor_ops));
diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc
index a1602baced5..c23ae9fcfab 100644
--- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc
@@ -46,8 +46,11 @@ namespace {
 class DefaultQuantParamsPass
     : public PassWrapper<DefaultQuantParamsPass, FunctionPass> {
  public:
-  explicit DefaultQuantParamsPass(double default_min, double default_max)
-      : default_min_(default_min), default_max_(default_max) {}
+  explicit DefaultQuantParamsPass(double default_min, double default_max,
+                                  bool is_signed)
+      : default_min_(default_min),
+        default_max_(default_max),
+        is_signed_(is_signed) {}
 
   void runOnFunction() override;
 
@@ -82,6 +85,7 @@ class DefaultQuantParamsPass
 
   double default_min_;
   double default_max_;
+  bool is_signed_;
   quant::QuantParams default_quant_params_;
 };
 }  // namespace
@@ -214,15 +218,16 @@ quant::QuantParams DefaultQuantParamsPass::GetDefaultQuantParams(
     default_quant_params_ = quant::fakeQuantAttrsToType(
         builder.getUnknownLoc(),
         /*numBits=*/8, default_min_, default_max_, /*narrowRange=*/false,
-        builder.getF32Type());
+        builder.getF32Type(), is_signed_);
   }
   return default_quant_params_;
 }
 
 // Creates an instance of the default quant parameters pass.
 std::unique_ptr<OperationPass<FuncOp>> CreateDefaultQuantParamsPass(
-    double default_min, double default_max) {
-  return absl::make_unique<DefaultQuantParamsPass>(default_min, default_max);
+    double default_min, double default_max, bool is_signed) {
+  return absl::make_unique<DefaultQuantParamsPass>(default_min, default_max,
+                                                   is_signed);
 }
 
 // Registers this pass with default values, only for test
@@ -230,7 +235,8 @@ static PassRegistration<DefaultQuantParamsPass> pass(
     "tfl-default-quant",
     "Apply quantization with default quantization parameter", [] {
       return CreateDefaultQuantParamsPass(/*default_min=*/-1.0,
-                                          /*default_max=*/1.0);
+                                          /*default_max=*/1.0,
+                                          /*is_signed=*/false);
     });
 
 }  // namespace TFL
diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h
index 959c17e317a..105c9394fb4 100644
--- a/tensorflow/compiler/mlir/lite/transforms/passes.h
+++ b/tensorflow/compiler/mlir/lite/transforms/passes.h
@@ -76,7 +76,7 @@ std::unique_ptr<OperationPass<FuncOp>> CreateOptimizeFunctionalOpsPass();
 
 // Creates an instance of the TensorFlow Lite dialect pass to add default
 // quantization parameters.
 std::unique_ptr<OperationPass<FuncOp>> CreateDefaultQuantParamsPass(
-    double default_min, double default_max);
+    double default_min, double default_max, bool is_signed);
 
 // Creates an instance of the TensorFlow Lite dialect pass to convert dense
 // tensor to sparse format.

From ba2e0157d89f84216183532b89b2441568832131 Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Tue, 5 May 2020 04:19:02 +0000
Subject: [PATCH 0114/1533] Update based on review feedback.

Signed-off-by: Yong Tang
---
 .../data/experimental/unbatch_dataset_op.cc   | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
index 9b2434d2046..e813de70931 100644
--- a/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
@@ -38,13 +38,11 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
     explicit Dataset(OpKernelContext* ctx, DatasetBase* input)
         : DatasetBase(DatasetContext(ctx)), input_(input) {
       input_->Ref();
-      known_batch_size_ = -1;
+      batch_size_ = -1;
       for (const PartialTensorShape& shape : input->output_shapes()) {
         if (!shape.unknown_rank()) {
-          if (known_batch_size_ < 0) {
-            if (shape.dim_size(0) >= 0) {
-              known_batch_size_ = shape.dim_size(0);
-            }
+          if (batch_size_ < 0 && shape.dim_size(0) >= 0) {
+            batch_size_ = shape.dim_size(0);
           }
           gtl::InlinedVector<int64, 4> partial_dim_sizes;
           for (int i = 1; i < shape.dims(); ++i) {
@@ -80,8 +78,8 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
       if (n == kInfiniteCardinality || n == kUnknownCardinality) {
        return n;
      }
-      if (known_batch_size_ > 0) {
-        return n * known_batch_size_;
+      if (batch_size_ > 0) {
+        return n * batch_size_;
      }
      return kUnknownCardinality;
    }
@@ -239,7 +237,8 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
 
    const DatasetBase* const input_;
    std::vector<PartialTensorShape> shapes_;
-    int64 known_batch_size_;
+    // batch_size_ may or may not be known, with -1 as unknown
+    int64 batch_size_;
  };
};

From e0533e171fa35107118f570b9104fdc38678b3fd Mon Sep 17 00:00:00 2001
From: Yong Tang
Date: Tue, 5 May 2020 04:19:32 +0000
Subject: [PATCH 0115/1533] Add additional test case where only the second batch size is known (from the review comment)

Signed-off-by: Yong Tang
---
 .../data/experimental/kernel_tests/cardinality_test.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git
a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py index 34147f3a479..da40986f80d 100644 --- a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py @@ -143,6 +143,12 @@ def _test_combinations(): cardinality.UNKNOWN), ("Unbatch4", lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=True).repeat().unbatch(), cardinality.INFINITE), + ("Unbatch5", + lambda: dataset_ops.Dataset.zip(( + dataset_ops.Dataset.range(4).batch(2, drop_remainder=False), + dataset_ops.Dataset.range(5).batch(2, drop_remainder=True), + )).unbatch(), + 4), ("Window1", lambda: dataset_ops.Dataset.range(5).window( size=2, shift=2, drop_remainder=True), 2), ("Window2", lambda: dataset_ops.Dataset.range(5).window( From ce9ede9c45bf366ac297a5d5411ab99c7da9334d Mon Sep 17 00:00:00 2001 From: "ag.ramesh" Date: Tue, 5 May 2020 09:25:39 -0700 Subject: [PATCH 0116/1533] Fixed circular dependency when building on Windows. --- tensorflow/core/common_runtime/BUILD | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index 8b85b291af7..0fc0eaac3fc 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -1026,10 +1026,13 @@ cc_library( cc_library( name = "mkl_layout_pass", srcs = ["mkl_layout_pass.cc"], - hdrs = ["mkl_layout_pass.h"], + hdrs = [ + "mkl_layout_pass.h", + "//tensorflow/core/graph:mkl_graph_util_header", + ], copts = tf_copts(), deps = [ - ":core_cpu", + ":function", ":optimization_registry", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -1042,10 +1045,13 @@ cc_library( cc_library( name = "mkl_tfconversion_pass", srcs = ["mkl_tfconversion_pass.cc"], - hdrs = ["mkl_tfconversion_pass.h"], + hdrs = [ + "mkl_tfconversion_pass.h", + "//tensorflow/core/graph:mkl_graph_util_header", + ], copts = tf_copts(), deps = [ - ":core_cpu", + ":function", ":optimization_registry", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", From 9a99741b51ccee8f33ae10de9d7f884c671a94bc Mon Sep 17 00:00:00 2001 From: sshiddib Date: Thu, 30 Apr 2020 16:49:09 -0700 Subject: [PATCH 0117/1533] Fix for UT failure in graph_runner_test --- tensorflow/core/common_runtime/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index 1ed2d9d9459..3c3dea50c5c 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -2282,7 +2282,7 @@ tf_cc_test( "//tensorflow/cc:cc_ops", "//tensorflow/cc:scope", "//tensorflow/core/kernels:cwise_op", - ] + if_mkl([":mkl_array_ops_op_lib"]), + ] + if_mkl(["//tensorflow/core:mkl_array_ops_op_lib"]), ) tf_cc_test( From fe3c1035c22eebad69d5fbe85e987f66d5a40a2c Mon Sep 17 00:00:00 2001 From: "tongxuan.ltx" Date: Sat, 2 May 2020 08:11:06 +0000 Subject: [PATCH 0118/1533] Check return status of reading environment variable --- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index b2efed619a4..ccaf0af213b 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ 
b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -233,8 +233,11 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) {
   builder.SetMaxMessageSize(std::numeric_limits<int32>::max());
 
   bool reuse_port = false;
-  ReadBoolFromEnvVar("TF_GRPC_REUSE_PORT", false, &reuse_port)
-      .IgnoreError();
+  const Status status = ReadBoolFromEnvVar("TF_GRPC_REUSE_PORT", false,
+                                           &reuse_port);
+  if (!status.ok()) {
+    LOG(ERROR) << status.error_message();
+  }
   auto server_build_option = reuse_port
       ? std::unique_ptr<::grpc::ServerBuilderOption>(new ReusePortOption)
       : std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption);

From 23395e662a859c53dbfc3ba79a5da1ed6aa8099a Mon Sep 17 00:00:00 2001
From: Elena Zhelezina
Date: Wed, 6 May 2020 11:47:01 +0100
Subject: [PATCH 0119/1533] Corrected test for LeakyRelu.

---
 tensorflow/lite/kernels/activations_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index 3a1b8644818..870adc507e2 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -115,9 +115,9 @@ class BaseActivationsOpModel : public SingleOpModel {
     if (input.type == TensorType_INT16) {
       output_ = AddOutput({TensorType_INT16,
                            {},
-                           output_min,
-                           output_max,
-                           1.0f / (std::numeric_limits<int16_t>::max() + 1),
+                           0,
+                           0,
+                           output_max / (std::numeric_limits<int16_t>::max()),
                            0});
     } else {
       output_ = AddOutput({input.type, {}, output_min, output_max});

From c7828e73f7f6e7b2b0e43d9b04800147615e25a0 Mon Sep 17 00:00:00 2001
From: Gaurav Singh
Date: Wed, 6 May 2020 09:25:09 -0400
Subject: [PATCH 0120/1533] Fix build errors

---
 .../lite/python/interpreter_wrapper/interpreter_wrapper.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index 313de20595d..a414e26adb0 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -592,7 +592,7 @@ PyObject* InterpreterWrapper::GetTensor(int i) const {
     size_t size_of_type;
     if (GetSizeOfType(nullptr, tensor->type, &size_of_type) != kTfLiteOk) {
       PyErr_SetString(PyExc_ValueError, "Unknown tensor type.");
-      free(data)
+      free(data);
       return nullptr;
     }
     sparse_buffer_dims[0] = tensor->bytes / size_of_type;

From 9c36f4b4266a13501ebf131ded0fb5639c29ede7 Mon Sep 17 00:00:00 2001
From: Dmitry Zakharov
Date: Wed, 6 May 2020 19:28:16 +0300
Subject: [PATCH 0121/1533] EM SDP tcf file is removed (to be downloaded with MLI package) + minor fixes in Readmes

---
 .../micro/examples/micro_speech/README.md     |    2 +-
 .../micro/examples/person_detection/README.md |    4 +-
 .../person_detection_experimental/README.md   |    2 +-
 .../micro/tools/make/ext_libs/arc_mli.inc     |    4 +-
 .../targets/arc/emsdp/emsdp_em11d_dfss.tcf    | 4907 -----------------
 .../tools/make/targets/arc_emsdp_makefile.inc |   17 +-
 .../tools/make/third_party_downloads.inc      |    2 +-
 7 files changed, 20 insertions(+), 4918 deletions(-)
 delete mode 100644 tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf

diff --git a/tensorflow/lite/micro/examples/micro_speech/README.md b/tensorflow/lite/micro/examples/micro_speech/README.md
index ba55a7d8493..3ab8ad24338 100644
--- a/tensorflow/lite/micro/examples/micro_speech/README.md
+++ b/tensorflow/lite/micro/examples/micro_speech/README.md
@@ -55,7 +55,7 @@ For more detailed
information on building and running examples see the appropria 2. Go to the generated example project director - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/micro_speech_mock/make 3. Build the example using diff --git a/tensorflow/lite/micro/examples/person_detection/README.md b/tensorflow/lite/micro/examples/person_detection/README.md index ae47c4be0ff..d736d6f7cd5 100644 --- a/tensorflow/lite/micro/examples/person_detection/README.md +++ b/tensorflow/lite/micro/examples/person_detection/README.md @@ -34,7 +34,7 @@ Follow the instructions on the [ARC EM SDP Initial Setup](/tensorflow/lite/micro The example project for ARC EM SDP platform can be generated with the following command: - make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project ` + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project ### Build and Run Example @@ -44,7 +44,7 @@ For more detailed information on building and running examples see the appropria 2. Go to the generated example project director - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make 3. Build the example using diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/README.md b/tensorflow/lite/micro/examples/person_detection_experimental/README.md index af0186fb276..19a39ddd9a5 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/README.md +++ b/tensorflow/lite/micro/examples/person_detection_experimental/README.md @@ -45,7 +45,7 @@ For more detailed information on building and running examples see the appropria 2. Go to the generated example project director - cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make` + cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make 3. 
Build the example using diff --git a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc index a95b4550417..5dbb91dd368 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/arc_mli.inc @@ -24,7 +24,7 @@ ifeq ($(filter no_arc_mli,$(ALL_TAGS)),) ALL_TAGS += arc_mli ifeq ($(BUILD_ARC_MLI),true) - MLI_LIB_DIR = arc_mli_$(basename $(TCF_FILE_NAME)) + MLI_LIB_DIR ?= arc_mli_$(basename $(TCF_FILE_NAME)) $(eval $(call add_third_party_download,$(EMBARC_MLI_URL),$(EMBARC_MLI_MD5),$(MLI_LIB_DIR),build_embarc_mli,$(TCF_FILE))) @@ -36,7 +36,7 @@ ifeq ($(BUILD_ARC_MLI),true) third_party/$(MLI_LIB_DIR)/LICENSE else ifneq ($(ARC_MLI_PRE_COMPILED_TARGET),) - MLI_LIB_DIR = arc_mli_package + MLI_LIB_DIR ?= arc_mli_package $(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),)) MLI_INCLUDE_FOLDER = $(MLI_LIB_DIR)/include diff --git a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf b/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf deleted file mode 100644 index 833fa9ca9b9..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_em11d_dfss.tcf +++ /dev/null @@ -1,4907 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - -# -# option 16/L32/U32 Instructions -# ------ ---------- --------------------- -# -# none -/-/- None -# wlh1 1/1/1 MPYW/U, MPY/U, MPYH/U -# wlh2 2/2/2 MPYW/U, MPY/U, MPYH/U -# wlh3 2/3/3 MPYW/U, MPY/U, MPYH/U -# wlh4 2/4/5 MPYW/U, MPY/U, MPYH/U -# wlh5 5/9/9 MPYW/U, MPY/U, MPYH/U -# -# --mpy_option none - -# code_protection --- The ARC EM architecture divides the memory into 16 regions, which can be protected individually. This feature adds a 16-bit input to the processor core, one bit per region. When the protect bit is set, the processor disables any load or store to the corresponding region. An attempt to access a protected region raises an EV_ProtV exception. --code_protection false - -# stack_checking --- Stack checking is a mechanism for checking stack accesses and raising an exception when a stack overflow or underflow is detected. --stack_checking true - -# unaligned_option --- This enables unaligned loads and stores. --unaligned_option true - -# intvbase_preset --- This sets the interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE. --intvbase_preset 0x0 - -# intvbase_preset_s --- This sets the secure interrupt vector base configuration register, VECBASE_AC_BUILD. The vector base address is aligned to a 1KB boundary, so the required address value should be divided by 1K (i.e. do not include the lower 10 bits). On reset, this register is loaded into the interrupt vector base address register, INT_VECTOR_BASE_S.This is effective only when 2+2 mode is enabled. --intvbase_preset_s 0x0 - -# intvbase_ext --- Set this option to drive the upper 22 bits of the interrupt base vector externally, into signal intvbase_in. --intvbase_ext false - -# nmi_option --- add Non-maskable external exception support --nmi_option false - -# rgf_impl --- This defines whether the register file is implemented using flip-flops, or with a hard macro. 
--rgf_impl flip_flops - -# rgf_num_regs --- This defines the size (in 32b register) of the processor register file. --rgf_num_regs 32 - -# rgf_wr_ports --- This defines the number of write ports on the register file. --rgf_wr_ports 2 - -# rgf_num_banks --- Dual register banks are useful if Fast IRQ has been configured, but may be selected even if not. --rgf_num_banks 2 - -# rgf_banked_regs --- This selects the number of registers that are replicated in the second register-file bank. --rgf_banked_regs 32 - -# turbo_boost --- This enables the Turbo Boost synthesis option. By enabling this option, the achievable clock frequency is increased, but at the cost of an additional cycle latency on branch instructions. --turbo_boost false - -# infer_alu_adder --- infer: datapath is described as behavioral code: A + B -# instantiate: datapath is instantiated as a detailed multi-stage code of a carry-lookahead adder. It is generally preferable to use the infer option and add directives for your target synthesizer. --infer_alu_adder infer - -# infer_mpy_wtree --- infer: datapath is described as behavioral code: A * B (applies to only wlh3, wlh4 and wlh5 designs) -# instantiate: datapath is instantiated as a detailed multi-stage code of a Wallace Tree multiplier It is generally preferable to use the infer option and add directives for your target synthesizer. --infer_mpy_wtree instantiate - -# scantest_ram_bypass_mux --- This mux is used to make logic trapped between flops and memory (aka shadow logic) to be covered by scantest without requiring advanced sequential ATPG on the memory to be applied. Will add delay to functional access time --scantest_ram_bypass_mux false - -# logic_bist --- This option will OR LBIST_EN with test_mode --logic_bist false - -# power_domains --- Adds three separate power domains to the core, and propagates power-gate control signals to the top level of the core. Also generates UPF constraints and commands in the low-power scripts --power_domains false - -# dvfs --- Adds logic to the core to allow dynamic controlling of voltage and frequency and propagates the associated control signals to the top level of core --dvfs false - -# voltage_domains --- Creates a voltage domain split between RAM and std cell parts to support Ultra Low Voltage on cells and generates UPF constraints --voltage_domains false - -# mem_bus_option --- The core supports two bus protocols for accessing external memory: AHB & AHB-Lite. AHB-Lite-single means instruction fetch and data access share a single AHB-Lite port. AHB-Lite-dual means separate AHB-Lite port for each initiator if present. --mem_bus_option AHB - -# mem_bus_reg_interface --- Specifies whether the memory bus interface is registered. --mem_bus_reg_interface true - -# dmi_burst_option --- This will enable high-throughput burst support on the DMI slave interfaces. By enabling this option, the peak DMI read throughput goes from 1 word per 3 cycles to N words per N+2 cycles, in which N is the AHB burst lengthDMI write throughput goes from 1 word per 3 cycles to 1 word per cycle. --dmi_burst_option true - -# has_dmp_peripheral --- This option enables the redirection of load/store accesses to one segment (1/16) of the addressable space to a dedicated peripheral bus. This offers high system integration and reduces overall system cost. 
--has_dmp_peripheral true - -# per0_base --- This option specifies the memory region assignment for this peripheral aperture --per0_base 15 - -# per0_limit --- This option specifies the end of this peripheral aperture --per0_limit 0 - -# per_bus_option --- The core supports one bus protocol for accessing the peripheral space, when enabled: AHB-Lite. --per_bus_option AHB-Lite - -# per_bus_reg_interface --- Specifies whether the peripheral bus interface is registered. --per_bus_reg_interface true - -# clock_gating --- This enables the insertion of architectural clock gate elements in the design. By enabling this option, the clocks to various parts of the design will be disabled when the logic they drive is not in use to save power. --clock_gating false - -# back_compat --- This enables the addition of rst_a input in the clkgate module to support backward compatibility with the older EM and Subsystem releases. --back_compat true - -# byte_parity --- If parity protection on the CCMs or Cache is configured, this option enables parity protection on a per-byte basis. Otherwise, parity is per word basis --byte_parity false - -# prot_pipelined --- Check the box if CCM memories are configured for ECC, and you want single-bit errors to be corrected, written back to memory, and re-fetched. When unchecked, single bit errors are corrected when read from memory, but the offending memory location itself is not corrected with a writeback, no influence on Cache protection --prot_pipelined false - -# cct_test_ena --- When ECC is configured, this option enables single bit error injection in CCT RAM models to demonstrate ECC protection on the RAMs. When enabled, the RAM models can only be used in HDL CCT simulation (no xCAM support) and are not intended for use in SoC level integration. --cct_test_ena false - -# err_prot_ehce --- Enabled enhanced ECC architecture for CCM. Instruction fetch with single bit error is not replayed; ecc cac modules are shared to reduce area and timing opt. --err_prot_ehce false - - -######## dsp_trig --- com.arc.hardware.dfss.dsp_trig.1_0 ######## - -# Create dsp_trig --create com.arc.hardware.dfss.dsp_trig.1_0 System.CPUisle.ARCv2EM.dsp_trig - -# dsp_trig --- Command line option for EIA extension component 'dsp_trig'. --dsp_trig true - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio0 --- com.arc.hardware.dfss.io_gpio0.1_0 ######## - -# Create io_gpio0 --create com.arc.hardware.dfss.io_gpio0.1_0 System.CPUisle.ARCv2EM.io_gpio0 - -# io_gpio0 --- Command line option for EIA extension component 'io_gpio0'. --io_gpio0 true - -# io_gpio0_debounce --- Selects the inclusion of Debounce logic --io_gpio0_debounce 1 - -# io_gpio0_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio0_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - -# io_gpio0_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. --io_gpio0_direction_rst_value 0 - -# io_gpio0_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. --io_gpio0_output_rst_value 0x0 - - -######## io_i2c_mst0 --- com.arc.hardware.dfss.io_i2c_mst0.1_0 ######## - -# Create io_i2c_mst0 --create com.arc.hardware.dfss.io_i2c_mst0.1_0 System.CPUisle.ARCv2EM.io_i2c_mst0 - -# io_i2c_mst0 --- Command line option for APEX extension component 'io_i2c_mst0'. --io_i2c_mst0 true - -# io_i2c_mst0_fs --- RX/TX FIFO size --io_i2c_mst0_fs 16 - -# io_i2c_mst0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst0_dma_support None - -# io_i2c_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst0_cdc_included 0 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_slv0 --- com.arc.hardware.dfss.io_i2c_slv0.1_0 ######## - -# Create io_i2c_slv0 --create com.arc.hardware.dfss.io_i2c_slv0.1_0 System.CPUisle.ARCv2EM.io_i2c_slv0 - -# io_i2c_slv0 --- Command line option for APEX extension component 'io_i2c_slv0'. --io_i2c_slv0 true - -# io_i2c_slv0_fs --- RX/TX FIFO size --io_i2c_slv0_fs 16 - -# io_i2c_slv0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_slv0_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst0 --- com.arc.hardware.dfss.io_spi_mst0.1_0 ######## - -# Create io_spi_mst0 --create com.arc.hardware.dfss.io_spi_mst0.1_0 System.CPUisle.ARCv2EM.io_spi_mst0 - -# io_spi_mst0 --- Command line option for APEX extension component 'io_spi_mst0'. --io_spi_mst0 true - -# io_spi_mst0_fz --- RX/TX FIFO depth --io_spi_mst0_fs 16 - -# io_spi_mst0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst0_max_xfer_size 16 - -# io_spi_mst0_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst0_cdc_included 0 - -# io_spi_mst0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## subsys_bcr --- com.arc.hardware.dfss.subsys_bcr.1_0 ######## - -# Create subsys_bcr --create com.arc.hardware.dfss.subsys_bcr.1_0 System.CPUisle.ARCv2EM.subsys_bcr - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst1 --- com.arc.hardware.dfss.io_spi_mst1.1_0 ######## - -# Create io_spi_mst1 --create com.arc.hardware.dfss.io_spi_mst1.1_0 System.CPUisle.ARCv2EM.io_spi_mst1 - -# io_spi_mst1 --- Command line option for APEX extension component 'io_spi_mst1'. --io_spi_mst1 true - -# io_spi_mst1_fz --- RX/TX FIFO depth --io_spi_mst1_fs 16 - -# io_spi_mst1_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst1_max_xfer_size 16 - -# io_spi_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst1_cdc_included 0 - -# io_spi_mst1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst1_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_mst2 --- com.arc.hardware.dfss.io_spi_mst2.1_0 ######## - -# Create io_spi_mst2 --create com.arc.hardware.dfss.io_spi_mst2.1_0 System.CPUisle.ARCv2EM.io_spi_mst2 - -# io_spi_mst2 --- Command line option for APEX extension component 'io_spi_mst2'. --io_spi_mst2 true - -# io_spi_mst2_fz --- RX/TX FIFO depth --io_spi_mst2_fs 16 - -# io_spi_mst2_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_mst2_max_xfer_size 16 - -# io_spi_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the peripheral clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than or equal to the peripheral clock frequency. --io_spi_mst2_cdc_included 0 - -# io_spi_mst2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_mst2_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_spi_slv0 --- com.arc.hardware.dfss.io_spi_slv0.1_0 ######## - -# Create io_spi_slv0 --create com.arc.hardware.dfss.io_spi_slv0.1_0 System.CPUisle.ARCv2EM.io_spi_slv0 - -# io_spi_slv0 --- Command line option for APEX extension component 'io_spi_slv0'. --io_spi_slv0 true - -# io_spi_slv0_fz --- RX/TX FIFO depth --io_spi_slv0_fs 16 - -# io_spi_slv0_max_xfer_size --- This defines the maximum number of bits per word at the serial data port, which determines the FIFO width. --io_spi_slv0_max_xfer_size 16 - -# io_spi_slv0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_spi_slv0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_gpio1 --- com.arc.hardware.dfss.io_gpio1.1_0 ######## - -# Create io_gpio1 --create com.arc.hardware.dfss.io_gpio1.1_0 System.CPUisle.ARCv2EM.io_gpio1 - -# io_gpio1 --- Command line option for EIA extension component 'io_gpio1'. --io_gpio1 true - -# io_gpio1_debounce --- Selects the inclusion of Debounce logic --io_gpio1_debounce 1 - -# io_gpio1_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio1_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - -# io_gpio1_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. --io_gpio1_direction_rst_value 0 - -# io_gpio1_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. --io_gpio1_output_rst_value 0x0 - - -######## io_gpio2 --- com.arc.hardware.dfss.io_gpio2.1_0 ######## - -# Create io_gpio2 --create com.arc.hardware.dfss.io_gpio2.1_0 System.CPUisle.ARCv2EM.io_gpio2 - -# io_gpio2 --- Command line option for EIA extension component 'io_gpio2'. --io_gpio2 true - -# io_gpio2_debounce --- Selects the inclusion of Debounce logic --io_gpio2_debounce 1 - -# io_gpio2_readback_sync --- Selects the inclusion of metastability registers on the read back path when reading the external 'ext_porta' signal --io_gpio2_readback_sync 1 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - -# io_gpio2_direction_rst_value --- Reset value of the SWPORTA_DDR register, which determines the direction (input/output) of the GPIO interface. 0: input, 1: output. --io_gpio2_direction_rst_value 0 - -# io_gpio2_output_rst_value --- Reset value of the SWPORTA_DR register, which determines the reset value of the GPIO output ports. Bits corresponding to input ports are ignored. --io_gpio2_output_rst_value 0x0 - - -######## io_i2c_mst1 --- com.arc.hardware.dfss.io_i2c_mst1.1_0 ######## - -# Create io_i2c_mst1 --create com.arc.hardware.dfss.io_i2c_mst1.1_0 System.CPUisle.ARCv2EM.io_i2c_mst1 - -# io_i2c_mst1 --- Command line option for APEX extension component 'io_i2c_mst1'. --io_i2c_mst1 true - -# io_i2c_mst1_fs --- RX/TX FIFO size --io_i2c_mst1_fs 16 - -# io_i2c_mst1_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst1_dma_support None - -# io_i2c_mst1_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst1_cdc_included 0 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2c_mst2 --- com.arc.hardware.dfss.io_i2c_mst2.1_0 ######## - -# Create io_i2c_mst2 --create com.arc.hardware.dfss.io_i2c_mst2.1_0 System.CPUisle.ARCv2EM.io_i2c_mst2 - -# io_i2c_mst2 --- Command line option for APEX extension component 'io_i2c_mst2'. --io_i2c_mst2 true - -# io_i2c_mst2_fs --- RX/TX FIFO size --io_i2c_mst2_fs 16 - -# io_i2c_mst2_dma_support --- Specifies whether the DMA handshake interface is included --io_i2c_mst2_dma_support None - -# io_i2c_mst2_cdc_included --- Selects whether a clock-domain crossing (CDC) is included between the core clock and the serial clock. If no CDC is present, both clocks must be synchronous. Otherwise the core clock frequency may be higher than, lower than or equal to the serial clock frequency. --io_i2c_mst2_cdc_included 0 - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart0 --- com.arc.hardware.dfss.io_uart0.1_0 ######## - -# Create io_uart0 --create com.arc.hardware.dfss.io_uart0.1_0 System.CPUisle.ARCv2EM.io_uart0 - -# io_uart0 --- Command line option for EIA extension component 'io_uart0'. --io_uart0 true - -# io_uart0_fifo_mode --- Set the UART FIFO mode --io_uart0_fifo_mode 16 - -# io_uart0_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart0_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart1 --- com.arc.hardware.dfss.io_uart1.1_0 ######## - -# Create io_uart1 --create com.arc.hardware.dfss.io_uart1.1_0 System.CPUisle.ARCv2EM.io_uart1 - -# io_uart1 --- Command line option for EIA extension component 'io_uart1'. --io_uart1 true - -# io_uart1_fifo_mode --- Set the UART FIFO mode --io_uart1_fifo_mode 16 - -# io_uart1_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart1_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart2 --- com.arc.hardware.dfss.io_uart2.1_0 ######## - -# Create io_uart2 --create com.arc.hardware.dfss.io_uart2.1_0 System.CPUisle.ARCv2EM.io_uart2 - -# io_uart2 --- Command line option for EIA extension component 'io_uart2'. --io_uart2 true - -# io_uart2_fifo_mode --- Set the UART FIFO mode --io_uart2_fifo_mode 16 - -# io_uart2_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart2_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_uart3 --- com.arc.hardware.dfss.io_uart3.1_0 ######## - -# Create io_uart3 --create com.arc.hardware.dfss.io_uart3.1_0 System.CPUisle.ARCv2EM.io_uart3 - -# io_uart3 --- Command line option for EIA extension component 'io_uart3'. --io_uart3 true - -# io_uart3_fifo_mode --- Set the UART FIFO mode --io_uart3_fifo_mode 16 - -# io_uart3_dma_support --- Selects whether support for the ARC EM DMA is included and whether the handshake interface should be connected to a memory-based or to an Aux-based DMA channel. --io_uart3_dma_support None - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2s_rx_mst0 --- com.arc.hardware.dfss.io_i2s_rx_mst0.1_0 ######## - -# Create io_i2s_rx_mst0 --create com.arc.hardware.dfss.io_i2s_rx_mst0.1_0 System.CPUisle.ARCv2EM.io_i2s_rx_mst0 - -# io_i2s_rx_mst0 --- Command line option for APEX extension component 'io_i2s_rx_mst0'. --io_i2s_rx_mst0 true - -# io_i2s_rx_mst0_fs --- RX FIFO size --io_i2s_rx_mst0_fs 8 - -# io_i2s_rx_mst0_fw --- RX FIFO width --io_i2s_rx_mst0_fw 16 - -# io_i2s_rx_mst0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2s_rx_mst0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_i2s_tx_mst0 --- com.arc.hardware.dfss.io_i2s_tx_mst0.1_0 ######## - -# Create io_i2s_tx_mst0 --create com.arc.hardware.dfss.io_i2s_tx_mst0.1_0 System.CPUisle.ARCv2EM.io_i2s_tx_mst0 - -# io_i2s_tx_mst0 --- Command line option for APEX extension component 'io_i2s_tx_mst0'. --io_i2s_tx_mst0 true - -# io_i2s_tx_mst0_fs --- TX FIFO size --io_i2s_tx_mst0_fs 8 - -# io_i2s_tx_mst0_fw --- TX FIFO width --io_i2s_tx_mst0_fw 16 - -# io_i2s_tx_mst0_dma_support --- Specifies whether the DMA handshake interface is included --io_i2s_tx_mst0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## io_pdm_rx0 --- com.arc.hardware.dfss.io_pdm_rx0.1_0 ######## - -# Create io_pdm_rx0 --create com.arc.hardware.dfss.io_pdm_rx0.1_0 System.CPUisle.ARCv2EM.io_pdm_rx0 - -# io_pdm_rx0 --- Command line option for APEX extension component 'io_pdm_rx0'. --io_pdm_rx0 true - -# io_pdm_rx0_ch --- Number of Stereo Channels --io_pdm_rx0_ch 1 - -# io_pdm_rx0_fs --- RX FIFO size --io_pdm_rx0_fs 16 - -# io_pdm_rx0_ns --- Maximum number of CIC stages --io_pdm_rx0_ns 4 - -# io_pdm_rx0_ds --- Maximum delay in the COMB filter of the CIC filter --io_pdm_rx0_ds 2 - -# io_pdm_rx0_dma_support --- Specifies whether the DMA handshake interface is included --io_pdm_rx0_dma_support Memory-Based - -# assign_xpubit --- -# -# The User Mode Extension Enable register (XPU) controls user-mode access to extension instructions and state. Each extension group is assigned a bit within the XPU register, and this bit may be programmed to enable or disable user-mode access to the extensions within that group. -#

-# By default an extension is not assigned a bit in this register. This means the extension is always available. -#

-# If you wish to assign an XPU bit number, select this option. -# -# --assign_xpubit false - -# xpubit --- -# The XPU bit number for this extension. -# --xpubit 0 - - -######## DCCM --- com.arc.hardware.DCCM.1_0 ######## - -# Create DCCM --create com.arc.hardware.DCCM.1_0 System.CPUisle.ARCv2EM.DCCM - -# dccm_size --- This defines the size of the Data Closely Coupled Memory (DCCM) in bytes --dccm_size 131072 - -# dccm_base --- Sets the initial memory region assignment for DCCM --dccm_base 8 - -# dccm_interleave --- Split DCCM into even/odd memory banks. --dccm_interleave false - -# dccm_prot --- Specifies the type of protection built for the DCCM. --dccm_prot None - -# dccm_prot_level --- Specifies the level protection. --dccm_prot_level Data_Only - -# dccm_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the DCCM --dccm_prot_exceptions true - -# dccm_sec_lvl --- Specifies the level of secure DCCM. --dccm_sec_lvl Non_Secure - -# dccm_dmi --- This enables external access through a DMI (direct memory interface) port. --dccm_dmi true - - -######## DMA Controller --- com.arc.hardware.DMA_Controller.1_0 ######## - -# Create DMA Controller --create com.arc.hardware.DMA_Controller.1_0 "System.CPUisle.ARCv2EM.DMA Controller" - -# dmac_channels --- This options specifies the number of DMA channels implemented in the DMA controller --dmac_channels 16 - -# dmac_fifo_depth --- This option specifies the DMA transfer FIFO depth in 32b words. --dmac_fifo_depth 2 - -# dmac_int_config --- None: the DMA controller cannot raise an interrupt -# Single-External: single done and single error interrupt signal for all DMA channels, and the interrupt signals are routed to a port at the top of the EM logical hierarchy -# Multiple-External: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are routed to ports at the top of the EM logical hierarchy -# Single-Internal: single done and single error interrupt signals for all DMA channels, and the interrupt signals are internal to the EM core -# Multiple-Internal: each DMA channel can be configured to raise separate (per-channel) done and error interrupts, and the interrupt signals are internal to the EM core --dmac_int_config Multiple-Internal - -# dmac_separate_error_interrupts --- This specifies whether there is a separate error interrupt per DMA channel, or just one. --dmac_separate_error_interrupts false - -# dmac_registers --- This option defines the number of DMA channels with their registers located in auxiliary space. --dmac_registers 0 - -# dmac_mem_if --- This option specifies whether the DMA controller system memory interface is integrated into the existing EM system memory interfaces or has its own interface. --dmac_mem_if integrated - -# dmac_per_if --- Internal vs DW peripheral interface. Specify (in hex) which channels have the DW interface, where bit 0 corresponds to DMA channel 0, bit 1 for DMA channel 1, etc. -# Example: 4 channel DMA controller where -dmac_per_if is set to 0x9 = DMA Channels 0 and 3 configured with the DW req interface, DMA Channels 1 and 2 configured with the internal req interface. 
--dmac_per_if 0x7e00 - - -######## DSP --- com.arc.hardware.DSP.1_0 ######## - -# Create DSP --create com.arc.hardware.DSP.1_0 System.CPUisle.ARCv2EM.DSP - -# dsp_complex --- Enable/disable support for single cycle 16b+16b complex instructions and butterfly operations, else 2-cycle complex instructions only without butterfly support --dsp_complex true - -# dsp_itu --- Enable/disable support for ITU bit-accurate 1 bit fractional shift before accumulation, else 1-bit fractional shift result after accumulation only --dsp_itu true - -# dsp_divsqrt --- Enable/disable support for divide and square root operations: DIV(U), REM(U), SQRT --dsp_divsqrt radix2 - -# dsp_accshift --- Select support for accumulator shift operations: no supported, limited shift support only or full shift support and convergent rounding --dsp_accshift full - -# dsp_impl --- The datapath components may be inferred from Verilog for better area or optimized using carry-save components for better timing --dsp_impl optimized - - -######## Data Cache --- com.arc.hardware.Data_Cache.1_0 ######## - -# Create Data Cache --create com.arc.hardware.Data_Cache.1_0 "System.CPUisle.ARCv2EM.Data Cache" - -# dc_size --- This defines the total size of the Data Cache in bytes. --dc_size 16384 - -# dc_ways --- This defines the number of cache ways. --dc_ways 2 - -# dc_bsize --- This defines the cache line length in bytes. --dc_bsize 32 - -# dc_feature_level --- Feature Level, indicates locking and debug feature level 00 = Basic cache, with no locking or debug features 01 = Lock and flush features supported 10 = Lock, flush and advanced debug features supported 11 = Reserved --dc_feature_level 2 - -# dc_uncached_region --- Enable an uncached region defined by aux reg --dc_uncached_region false - -# dc_prot --- Specifies the type of protection built for DCACHE. --dc_prot None - -# dc_prot_level --- Specifies the level of protection. --dc_prot_level Data_Only - -# dc_prot_exceptions --- Builds exception generation hardware for uncorrectable (fatal) errors detected on DCACHE. --dc_prot_exceptions true - - -######## Debug Interface --- com.arc.hardware.Debug_Interface.1_0 ######## - -# Create Debug Interface --create com.arc.hardware.Debug_Interface.1_0 "System.CPUisle.ARCv2EM.Debug Interface" - -# dbg_en_option --- Adds an enable pin to the existing debug interface --dbg_en_option false - -# secure_debug --- This enables secure debug feature --secure_debug false - -# scdbg_aux_unlk --- An internal demo module will be included when enable --scdbg_aux_unlk false - -# dbg_apb_option --- Adds an additional APB debug port alongside the BVCI one --dbg_apb_option false - - -######## ICCM0 --- com.arc.hardware.ICCM0.1_0 ######## - -# Create ICCM0 --create com.arc.hardware.ICCM0.1_0 System.CPUisle.ARCv2EM.ICCM0 - -# iccm0_size --- This defines the size of ICCM0 in bytes.This ICCM has 0 wait states. --iccm0_size 131072 - -# iccm0_base --- Sets the initial memory region assignment for ICCM0 --iccm0_base 6 - -# iccm0_wide --- Creates ICCM0 as 64b memory to reduce accesses. --iccm0_wide false - -# iccm0_prot --- Specifies the type of protection built for ICCM0. --iccm0_prot None - -# iccm0_prot_level --- Specifies the level of protection. --iccm0_prot_level Data_Only - -# iccm0_prot_exceptions --- When the core is configured with ECC or Parity, cause exception generation hardware to be created for uncorrectable errors detected on the ICCM0 --iccm0_prot_exceptions true - -# iccm0_sec_lvl --- Specifies the level of secure ICCM0. 
--iccm0_sec_lvl Non_Secure - -# iccm0_dmi --- This enables external access through a DMI (direct memory interface) port. --iccm0_dmi true - - -######## Instruction Cache --- com.arc.hardware.Instruction_Cache.1_0 ######## - -# Create Instruction Cache --create com.arc.hardware.Instruction_Cache.1_0 "System.CPUisle.ARCv2EM.Instruction Cache" - -# ic_size --- This defines the total size of the instruction cache in bytes. --ic_size 16384 - -# ic_ways --- This defines the number of cache ways --ic_ways 2 - -# ic_bsize --- This defines the cache line length in bytes. --ic_bsize 64 - -# ic_disable_on_reset --- The instruction cache may be enabled immediately after reset, depending on this option. If this option is enabled, the last cache operation is set to failed, and the direct cache-RAM access is enabled. Furthermore, the instruction cache is invalidated all cache lines are invalidated and unlocked, and the tag RAM is cleared. --ic_disable_on_reset false - -# ic_feature_level --- This defines the feature level of the cache. --ic_feature_level 1 - -# ic_pwr_opt_level --- This selects power-optimization options in the micro-architecture of the instruction cache. --ic_pwr_opt_level 0 - -# ic_prot --- Specifies the type of protection built for ICACHE. --ic_prot None - -# ic_prot_level --- Specifies the level of protection. --ic_prot_level Data_Only - -# ic_prot_exceptions --- Builds exception generation hardware for uncorrectable (fatal) errors detected on ICACHE. --ic_prot_exceptions true - - -######## Interrupt Controller --- com.arc.hardware.Interrupt_Controller.1_0 ######## - -# Create Interrupt Controller --create com.arc.hardware.Interrupt_Controller.1_0 "System.CPUisle.ARCv2EM.Interrupt Controller" - -# number_of_interrupts --- This is the total number of interrupts available to the core. Some interrupts are allocated statically to a specific interrupt line (for example, timer interrupts). For more information on Interrupt and register-file options, see DesignWare ARCv2 ISA Programmers Reference Manual. --number_of_interrupts 96 - -# number_of_levels --- Priority levels in the interrupt controller. --number_of_levels 4 - -# external_interrupts --- This is the total number of interrupt pins available for external system components. This parameter must be less than the total number of interrupts. --external_interrupts 77 - -# firq_option --- This enables the fast-interrupts option, (priority level 0 interrupts), which uses an alternate register bank (if configured) instead of saving the context to memory. --firq_option true - - -######## JTAG Interface --- com.arc.hardware.JTAG_Interface.1_0 ######## - -# Create JTAG Interface --create com.arc.hardware.JTAG_Interface.1_0 "System.CPUisle.ARCv2EM.JTAG Interface" - -######## Timer 0 --- com.arc.hardware.Timer_0.1_0 ######## - -# Create Timer 0 --create com.arc.hardware.Timer_0.1_0 "System.CPUisle.ARCv2EM.Timer 0" - -# timer_0_int_level --- This sets the interrupt level (and implicitly the priority: level 0 is highest) of timer 0. --timer_0_int_level 1 - - -######## Watchdog Timer --- com.arc.hardware.Watchdog_Timer.1_0 ######## - -# Create Watchdog Timer --create com.arc.hardware.Watchdog_Timer.1_0 "System.CPUisle.ARCv2EM.Watchdog Timer" - -# watchdog_size --- Specifies the bit width of timer's internal counter. --watchdog_size 32 - -# watchdog_clk --- Specifies whether the timer should be driven from a separate clock. 
--watchdog_clk false - - -######## Real-time Counter --- com.arc.hardware.Real_time_Counter.1_0 ######## - -# Create Real-time Counter --create com.arc.hardware.Real_time_Counter.1_0 "System.CPUisle.ARCv2EM.Real-time Counter" - -######## Performance Monitor --- com.arc.hardware.Performance_Monitor.1_0 ######## - -# Create Performance Monitor --create com.arc.hardware.Performance_Monitor.1_0 "System.CPUisle.ARCv2EM.Performance Monitor" - -# pct_counters --- Number of counters for performance monitoring. --pct_counters 8 - - -######## SmaRT --- com.arc.hardware.SmaRT.1_0 ######## - -# Create SmaRT --create com.arc.hardware.SmaRT.1_0 System.CPUisle.ARCv2EM.SmaRT - -# smart_stack_entries --- This specifies the number of entries in the trace buffer. --smart_stack_entries 8 - -# smart_implementation --- Flip-flop = FF-based design. Memory = memory-based design (provides better density for larger trace buffers). --smart_implementation flip-flop - - -######## XY --- com.arc.hardware.XY.1_0 ######## - -# Create XY --create com.arc.hardware.XY.1_0 System.CPUisle.ARCv2EM.XY - -# xy_config --- XY memory configuration: -# One memory: DCCM only. -# Two memories: DCCM + Y. -# Three memories: DCCM + X + Y. --xy_config dccm_x_y - -# xy_size --- Size of X and Y memories if included. -# X and Y memories both have the same configured size. --xy_size 16384 - -# xy_interleave --- Split XY memories into odd/even instances to enable single cycle unaligned access. --xy_interleave false - -# xy_x_base --- Base region for X memory. All accesses to this region will initiate a transfer on the X memory. --xy_x_base 9 - -# xy_y_base --- Base region for Y memory. All accesses to this region will initiate a transfer on the Y memory. --xy_y_base 10 - - -######## AGU --- com.arc.hardware.AGU.1_0 ######## - -# Create AGU --create com.arc.hardware.AGU.1_0 System.CPUisle.ARCv2EM.AGU - -# agu_size --- Predefined configurations of modifiers, address -# pointers and offset registers -#

-# 
-#         address     address                     
-#         pointers    offset regs      modifiers  
-#        ----------- --------------- ------------ 
-# small:     4           2                 4      
-# medium:    8           4                 12     
-# large:     12          8                 24     
-# 
-# --agu_size large - -# agu_accord --- Enable the accordion stage if operating frequency is critical --agu_accord true - -# agu_wb_depth --- Write buffer depth --agu_wb_depth 4 - - -######## Actionpoints --- com.arc.hardware.Actionpoints.1_0 ######## - -# Create Actionpoints --create com.arc.hardware.Actionpoints.1_0 System.CPUisle.ARCv2EM.Actionpoints - -# num_actionpoints --- This is the number of trigger events available. --num_actionpoints 8 - -# aps_feature --- Selects Actionpoint feature set --aps_feature min - - -######## Bit stream --- com.arc.hardware.Bit_stream.1_0 ######## - -# Create Bit stream --create com.arc.hardware.Bit_stream.1_0 "System.CPUisle.ARCv2EM.Bit stream" - -######## Floating-point unit --- com.arc.hardware.Floating_point_unit.1_0 ######## - -# Create Floating-point unit --create com.arc.hardware.Floating_point_unit.1_0 "System.CPUisle.ARCv2EM.Floating-point unit" - -# fpu_dp_assist --- This enables double-precision acceleration instructions. --fpu_dp_assist true - -# fpu_fma_option --- This enables the fused multiply-add & multiply-subtract instructions. --fpu_fma_option true - -# fpu_mas_cycles --- Make mul/add/sub multicycle to achieve a higher clock speed. --fpu_mas_cycles 2 - -# fpu_pipe_impl --- FPU pipelined implementation --fpu_pipe_impl true - -# fpu_div_option --- This enables divide & square-root acceleration --fpu_div_option true - -# fpu_div_cycles --- Controls div/sqrt implementation. --fpu_div_cycles 17 - - -######## Memory Protection Unit --- com.arc.hardware.Memory_Protection_Unit.1_0 ######## - -# Create Memory Protection Unit --create com.arc.hardware.Memory_Protection_Unit.1_0 "System.CPUisle.ARCv2EM.Memory Protection Unit" - -# mpu_num_regions --- Number of configured memory regions. --mpu_num_regions 16 - -# mpu_32b --- Set the minimal region size to be 32 byte instead of 2KB. --mpu_32b false - -# mpu_sid_option --- It will enable SID support in Secure Shield --mpu_sid_option false - - -######## Real-time trace producer --- com.arc.hardware.Real_time_trace_producer.1_0 ######## - -# Create Real-time trace producer --create com.arc.hardware.Real_time_trace_producer.1_0 "System.CPUisle.ARCv2EM.Real-time trace producer" - -# rtt_feature_level --- 'small' means that program trace only is available. `medium' adds data trace. `full' adds core and aux register trace. --rtt_feature_level full - - -######## ARCv2EM CCT --- cct.1_0 ######## - -# Create ARCv2EM CCT --create cct.1_0 "System.ARCv2EM CCT" - -# cct --- -# Option used to add a CCT to the design for command-line builds -# Without this architect can't add this component to a build -# via a cmdline -create command. -# with old scripts. -# --cct true - -# no_hostlink --- -# This prevents the inclusion of the hostlink library when compiling -# C or C++ programs. The resultant executable, if it contains printfs, -# will print to an internal fixed buffer __mwwrite_buf. -# Other hostlink operations that require debugger assistance, such as file -# opens, will fail. -# -# Hostlink references incur memory cycles at unpredictable times and -# so can perturb cycle-timing results. Without hostlink, -# the debugger will not in any way interfere with the target while it is running. -# Therefore this option is useful for simulation in which you want precisely the -# same cycle timing to occur each time you run, or for accurate power consumption results. 
-# --cct_no_hostlink false - -# has_subsystem_cct_flow --- -# The above option will check for the presence of subsystem component in the build configuration and suitably modifies the Makefile for the sub-system environment. -# --has_subsystem_cct_flow false - - -######## BusFabric --- com.arc.hardware.ARCv2MSS.BusFabric.1_0 ######## - -# Create BusFabric --create com.arc.hardware.ARCv2MSS.BusFabric.1_0 System.BusFabric - -######## ClkCtrl --- com.arc.hardware.ARCv2MSS.ClkCtrl.1_0 ######## - -# Create ClkCtrl --create com.arc.hardware.ARCv2MSS.ClkCtrl.1_0 System.ClkCtrl - -######## DSP Software --- com.arc.software.dfss.sw_dsp.1_0 ######## - -# Create DSP Software --create com.arc.software.dfss.sw_dsp.1_0 "System.DSP Software" - -# sw_dsp --- Command line option for Software element 'DSP Software' --sw_dsp true - - -######## EMSDP_BOARD --- com.arc.hardware.ARCv2MSS.EMSDP_BOARD.1_0 ######## - -# Create EMSDP_BOARD --create com.arc.hardware.ARCv2MSS.EMSDP_BOARD.1_0 System.EMSDP_BOARD - -# emsdp_sys_freq --- Select the core frequency. --emsdp_sys_freq 40 - - -######## IO Software --- com.arc.software.dfss.sw_io.1_0 ######## - -# Create IO Software --create com.arc.software.dfss.sw_io.1_0 "System.IO Software" - -# sw_io --- Command line option for Software element 'IO Software' --sw_io true - - -######## Implementation --- com.arc.hardware.implementation.1_0 ######## - -# Create Implementation --create com.arc.hardware.implementation.1_0 System.Implementation - -# ClockSpeed --- Target clock speed of the system --clock_speed 10 - -# DDR2_clk_Ratio --- DDR2 Clock Vs System Clock Ratio -# 2x -# 3x -# 4x --ddr2_clk_ratio 3x - -# ClockSkew --- The clock skew for the system --clock_skew 0.2 - -# HoldMargin --- Margin for hold time checks --hold_margin 0.05 - -# Floorplan --- Floorplan definition for relative placement of RAMs (at CPU-level) or the placement of the rams and CPU hard cores (at multicore level) --floorplan em4_sensor - -# JTAGFrequency --- Select the frequency of the JTAG clock Tck (in MHz). -# -# The JTAG clock speed has to be less than 1/2 of the cpu clock otherwise the signals on the BVCI interface are not guaranteed to be valid. -# -# NOTE: The RTL simulations will work when the JTAG clock frequency is set to half the CPU clock, however this may not be the case when simulating at gate level due to delays on the IO pads. -# -# The default is set to 10 MHz so that there is no conflict when simulating with an ARCangel3 at 30MHz. (30 > 10*2) -# -# The speed of simulation can be greatly increased by using a faster JTAG clock, but a dependency will warn if it exceeds 1/2 of the cpu clock. -# --jtag_tclk 4 - -# execution_trace_level --- -# This traces committed instructions as they execute, and gathers statistics -# visible in the debugger for counting instructions & cycle delays. -# At the "stats" level ony the statistics are gathered and no trace is printed. -# "file" is equivalent to "full", but the results go to a trace .txt file instead. -# --execution_trace_level stats - -# tb_trace --- -# Enable instruction execution trace. -# This is available to arc_dev licensees (internal developers) only. -# --tb_trace false - -# zero_based_arcnum --- -# In a multicore build, number ARCs from 0. -# If this is not selected, arcs are numbered from 1. -# (This provides the initial value to the arcnum signal.) 
-# --zero_based_arcnum true - -# generate_ipxact --- -# Generate ipxact.xml file describing the CPUisle or archipelago frontier -# --generate_ipxact false - -# ipxact_relative_path_names --- -# Use relative path names for Verilog files in the ipxact. -# Otherwise, absolute path names are used. -# --ipxact_relative_path_names true - -# optional_encryption --- -# When selected, encrypted RTL output is generated. -# --optional_encryption false - -# ignore_encrypt_license --- -# When selected, pretend the encryption license is missing. For testing. -# --ignore_encrypt_license false - -# ignore_clear_license --- -# When selected, pretend the cleartest license is missing. For testing. -# --ignore_clear_license false - -# OPTION_require_archipelago --- -# When selected, force use of archipelago. This is for testing purposes. -# --require_archipelago false - - -######## Infrastructure Software --- com.arc.software.dfss.sw_infra.1_0 ######## - -# Create Infrastructure Software --create com.arc.software.dfss.sw_infra.1_0 "System.Infrastructure Software" - -# sw_infra --- Command line option for Software element 'Infrastructure Software' --sw_infra true - -# templateName --- Template name --template_name siss_combo_sensor_dsp - - -######## subsys_infra --- com.arc.hardware.dfss.subsys_infra.1_0 ######## - -# Create subsys_infra --create com.arc.hardware.dfss.subsys_infra.1_0 System.subsys_infra - -# subsys_infra --- Command line option for EIA glue logic. --subsys_infra true - -# internal_interrupt --- Connect the IO interrupts internally --internal_interrupt true - -# internal_dma_handshake --- Connect the DMA handshake signals internally --internal_dma_handshake true - -# spi_tb_sw_test_mode --- -# This is a secret option, not seen by customers. -# If you check this, the SPI peripheral's testbenches will be set to SW test mode: -# The serial interface of the first SPI master io_spi_mstN peripheral is connected to all SPI slave peripherals io_spi_slvN. -# This is used for testing the SW drivers. -# --spi_tb_sw_test_mode false - -# i3c_tb_sw_test_mode --- -# This is a secret option, not seen by customers. -# If you check this, the I3C peripheral's testbenches will be set to SW test mode: -# The serial interface of the io_i3cN peripheral is connected to the I2C slave peripherals io_i2c_slv0. -# This is used for testing the SW drivers. -# --i3c_tb_sw_test_mode false - -# subsys_apex_offset --- Subsystem APEX address offset in the AUX address space. The aperture used by the subsystem is fixed to 0x0010_0000. In general, the APEX address offset must be in the range from 0x0010_0000 to 0xFFF0_0000. However, if your design includes the "UAUX Interface" component, then the APEX address offset must be in the range from 0x0010_0000 to 0x7FF0_0000 to avoid address conflicts with any UAUX components. --subsys_apex_offset 0x8000_0000 - -# subsys_uaux_offset --- Subsystem UAUX address offset in the UAUX address space. The UAUX address offset must be an integer multiple of 0x0010_0000 in the range from 0x0000_0000 to 0x7FF0_0000. The aperture reserved for the subsystem is fixed to 0x0010_0000. 
--subsys_uaux_offset 0x10_0000 - - -######## ARC_RTT --- com.arc.hardware.ARC_RTT.1_0 ######## - -# Create ARC_RTT --create com.arc.hardware.ARC_RTT.1_0 System.ARC_RTT - -# has_nexus_if --- Please select Nexus interface to offload the data from RTT --has_nexus_if true - -# has_on_chip_mem --- Please select the on-chip memory option to store the trace data in shared memory --has_on_chip_mem true - -# nexus_data_wdt --- Please select the Nexus Data Width to offload the data from RTT --nexus_data_wdt 16 - -# internal_memory_size --- Please select internal memory size to capture the trace data --internal_memory_size 16k - -# ram_type --- Please select Types of internal memories to be inferred for the logic --ram_type 1_PORT - -# power_domains --- Adds isolation signal inputs/power switch controls for use in UPF flow when configuring power domains. --rtt_power_domains false - - -######## Tool Configuration --- cgen.1_0 ######## - -# Create Tool Configuration --create cgen.1_0 "System.Tool Configuration" - -# mwdt_version --- Selects the MetaWare version to be used with the TCF file. -# Change from the default to an older or newer toolset version if you want the TCF file to be used with an older or newer version of the MetaWare tools. --mwdt_version O-2018.09 - -# code_base_addr --- -# The base address to assign to the executable code segment in the linker command file when there is no ICCM in the build. This value is ignored when there is an ICCM. -# --code_base_addr 0x0 - -# data_base_addr --- -# The base address to assign to the data segment in the linker command file when the data is not being mapped to a DCCM. This value is ignored when the data segment is mapped to a DCCM, as in that case the base address of the DCCM memory is used. -# -# A value of 0xffffffff means that the data segment will not be mapped to any specific address. -# --data_base_addr 0xffff_ffff - -# underscores_in_numbers --- Use underscores in hex numbers to improve readability. --underscores_in_numbers false - -# tcf_rebrand --- Alternate branding of TCF (not used) --rebrand false - - -]]>
-
- - - - - - - - - - - - - - ICCM0 - - GROUP BLOCK(4): { - /* _SDA_BASE_ computed implicitly */ - .sdata?: {} - .sbss?: {} - * (DATA): {} - * (BSS): {} - .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:65536): {} - .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:0): {} - } > SYSTEM2 - GROUP BLOCK(4): { - .Xdata? : {} - } > XCCM - GROUP BLOCK(4): { - .Ydata? : {} - } > YCCM - GROUP BLOCK(4) : { - .vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4) - } > IVT - } - -]]> - - - - - - 0x07, sub_opcode => 0x1E , latency_cycles => 8) - -// User extension instruction - dsp_sin -extern long dsp_sin(long); -#pragma intrinsic(dsp_sin, opcode => 0x07, sub_opcode => 0x1F , latency_cycles => 8) - -// User extension instruction - dsp_tan -extern long dsp_tan(long); -#pragma intrinsic(dsp_tan, opcode => 0x07, sub_opcode => 0x22 , latency_cycles => 11) - -// User extension instruction - dsp_acos -extern long dsp_acos(long); -#pragma intrinsic(dsp_acos, opcode => 0x07, sub_opcode => 0x23 , latency_cycles => 31) - -// User extension instruction - dsp_asin -extern long dsp_asin(long); -#pragma intrinsic(dsp_asin, opcode => 0x07, sub_opcode => 0x24 , latency_cycles => 31) - -// User extension instruction - dsp_atan -extern long dsp_atan(long); -#pragma intrinsic(dsp_atan, opcode => 0x07, sub_opcode => 0x25 , latency_cycles => 13) - -// User extension instruction - dsp_sqrt -extern long dsp_sqrt(long); -#pragma intrinsic(dsp_sqrt, opcode => 0x07, sub_opcode => 0x20 , latency_cycles => 31) - -// User extension instruction - dsp_sqrt15 -extern long dsp_sqrt15(long); -#pragma intrinsic(dsp_sqrt15, opcode => 0x07, sub_opcode => 0x21 , latency_cycles => 15) - -#define APEX_COM_ARC_HARDWARE_DFSS_DSP_TRIG_PRESENT 1 -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO0_PRESENT 1 - -// User extension aux register io_gpio0_debounce -#define AR_IO_GPIO0_DEBOUNCE 0x80017048 -#pragma Aux_register(0x80017048, name=>"io_gpio0_debounce") - -// User extension aux register io_gpio0_clken -#define AR_IO_GPIO0_CLKEN 0x80017080 -#pragma Aux_register(0x80017080, name=>"io_gpio0_clken") - -// User extension aux register io_gpio0_swporta_dr -#define AR_IO_GPIO0_SWPORTA_DR 0x80017000 -#pragma Aux_register(0x80017000, name=>"io_gpio0_swporta_dr") - -// User extension aux register io_gpio0_swporta_ddr -#define AR_IO_GPIO0_SWPORTA_DDR 0x80017004 -#pragma Aux_register(0x80017004, name=>"io_gpio0_swporta_ddr") - -// User extension aux register io_gpio0_inten -#define AR_IO_GPIO0_INTEN 0x80017030 -#pragma Aux_register(0x80017030, name=>"io_gpio0_inten") - -// User extension aux register io_gpio0_intmask -#define AR_IO_GPIO0_INTMASK 0x80017034 -#pragma Aux_register(0x80017034, name=>"io_gpio0_intmask") - -// User extension aux register io_gpio0_inttype_level -#define AR_IO_GPIO0_INTTYPE_LEVEL 0x80017038 -#pragma Aux_register(0x80017038, name=>"io_gpio0_inttype_level") - -// User extension aux register io_gpio0_int_polarity -#define AR_IO_GPIO0_INT_POLARITY 0x8001703c -#pragma Aux_register(0x8001703c, name=>"io_gpio0_int_polarity") - -// User extension aux register io_gpio0_intstatus -#define AR_IO_GPIO0_INTSTATUS 0x80017040 -#pragma Aux_register(0x80017040, name=>"io_gpio0_intstatus") - -// User extension aux register io_gpio0_raw_intstatus -#define AR_IO_GPIO0_RAW_INTSTATUS 0x80017044 -#pragma Aux_register(0x80017044, name=>"io_gpio0_raw_intstatus") - -// User extension aux register io_gpio0_porta_eoi -#define AR_IO_GPIO0_PORTA_EOI 0x8001704c -#pragma Aux_register(0x8001704c, name=>"io_gpio0_porta_eoi") - -// User extension 
aux register io_gpio0_ext_porta -#define AR_IO_GPIO0_EXT_PORTA 0x80017050 -#pragma Aux_register(0x80017050, name=>"io_gpio0_ext_porta") - -// User extension aux register io_gpio0_ls_sync -#define AR_IO_GPIO0_LS_SYNC 0x80017060 -#pragma Aux_register(0x80017060, name=>"io_gpio0_ls_sync") - -// User extension aux register io_gpio0_int_bothedge -#define AR_IO_GPIO0_INT_BOTHEDGE 0x80017068 -#pragma Aux_register(0x80017068, name=>"io_gpio0_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST0_PRESENT 1 - -// User extension aux register io_i2c_mst0_clken -#define AR_IO_I2C_MST0_CLKEN 0x800120c0 -#pragma Aux_register(0x800120c0, name=>"io_i2c_mst0_clken") - -// User extension aux register io_i2c_mst0_con -#define AR_IO_I2C_MST0_CON 0x80012000 -#pragma Aux_register(0x80012000, name=>"io_i2c_mst0_con") - -// User extension aux register io_i2c_mst0_tar -#define AR_IO_I2C_MST0_TAR 0x80012004 -#pragma Aux_register(0x80012004, name=>"io_i2c_mst0_tar") - -// User extension aux register io_i2c_mst0_data_cmd -#define AR_IO_I2C_MST0_DATA_CMD 0x80012010 -#pragma Aux_register(0x80012010, name=>"io_i2c_mst0_data_cmd") - -// User extension aux register io_i2c_mst0_ss_scl_hcnt -#define AR_IO_I2C_MST0_SS_SCL_HCNT 0x80012014 -#pragma Aux_register(0x80012014, name=>"io_i2c_mst0_ss_scl_hcnt") - -// User extension aux register io_i2c_mst0_ss_scl_lcnt -#define AR_IO_I2C_MST0_SS_SCL_LCNT 0x80012018 -#pragma Aux_register(0x80012018, name=>"io_i2c_mst0_ss_scl_lcnt") - -// User extension aux register io_i2c_mst0_fs_scl_hcnt -#define AR_IO_I2C_MST0_FS_SCL_HCNT 0x8001201c -#pragma Aux_register(0x8001201c, name=>"io_i2c_mst0_fs_scl_hcnt") - -// User extension aux register io_i2c_mst0_fs_scl_lcnt -#define AR_IO_I2C_MST0_FS_SCL_LCNT 0x80012020 -#pragma Aux_register(0x80012020, name=>"io_i2c_mst0_fs_scl_lcnt") - -// User extension aux register io_i2c_mst0_intr_stat -#define AR_IO_I2C_MST0_INTR_STAT 0x8001202c -#pragma Aux_register(0x8001202c, name=>"io_i2c_mst0_intr_stat") - -// User extension aux register io_i2c_mst0_intr_mask -#define AR_IO_I2C_MST0_INTR_MASK 0x80012030 -#pragma Aux_register(0x80012030, name=>"io_i2c_mst0_intr_mask") - -// User extension aux register io_i2c_mst0_raw_intr_stat -#define AR_IO_I2C_MST0_RAW_INTR_STAT 0x80012034 -#pragma Aux_register(0x80012034, name=>"io_i2c_mst0_raw_intr_stat") - -// User extension aux register io_i2c_mst0_rx_tl -#define AR_IO_I2C_MST0_RX_TL 0x80012038 -#pragma Aux_register(0x80012038, name=>"io_i2c_mst0_rx_tl") - -// User extension aux register io_i2c_mst0_tx_tl -#define AR_IO_I2C_MST0_TX_TL 0x8001203c -#pragma Aux_register(0x8001203c, name=>"io_i2c_mst0_tx_tl") - -// User extension aux register io_i2c_mst0_clr_intr -#define AR_IO_I2C_MST0_CLR_INTR 0x80012040 -#pragma Aux_register(0x80012040, name=>"io_i2c_mst0_clr_intr") - -// User extension aux register io_i2c_mst0_clr_rx_under -#define AR_IO_I2C_MST0_CLR_RX_UNDER 0x80012044 -#pragma Aux_register(0x80012044, name=>"io_i2c_mst0_clr_rx_under") - -// User extension aux register io_i2c_mst0_clr_rx_over -#define AR_IO_I2C_MST0_CLR_RX_OVER 0x80012048 -#pragma Aux_register(0x80012048, name=>"io_i2c_mst0_clr_rx_over") - -// User extension aux register io_i2c_mst0_clr_tx_over -#define AR_IO_I2C_MST0_CLR_TX_OVER 0x8001204c -#pragma Aux_register(0x8001204c, name=>"io_i2c_mst0_clr_tx_over") - -// User extension aux register io_i2c_mst0_clr_tx_abrt -#define AR_IO_I2C_MST0_CLR_TX_ABRT 0x80012054 -#pragma Aux_register(0x80012054, name=>"io_i2c_mst0_clr_tx_abrt") - -// User extension aux register io_i2c_mst0_clr_activity -#define 
AR_IO_I2C_MST0_CLR_ACTIVITY 0x8001205c -#pragma Aux_register(0x8001205c, name=>"io_i2c_mst0_clr_activity") - -// User extension aux register io_i2c_mst0_clr_stop_det -#define AR_IO_I2C_MST0_CLR_STOP_DET 0x80012060 -#pragma Aux_register(0x80012060, name=>"io_i2c_mst0_clr_stop_det") - -// User extension aux register io_i2c_mst0_clr_start_det -#define AR_IO_I2C_MST0_CLR_START_DET 0x80012064 -#pragma Aux_register(0x80012064, name=>"io_i2c_mst0_clr_start_det") - -// User extension aux register io_i2c_mst0_enable -#define AR_IO_I2C_MST0_ENABLE 0x8001206c -#pragma Aux_register(0x8001206c, name=>"io_i2c_mst0_enable") - -// User extension aux register io_i2c_mst0_status -#define AR_IO_I2C_MST0_STATUS 0x80012070 -#pragma Aux_register(0x80012070, name=>"io_i2c_mst0_status") - -// User extension aux register io_i2c_mst0_txflr -#define AR_IO_I2C_MST0_TXFLR 0x80012074 -#pragma Aux_register(0x80012074, name=>"io_i2c_mst0_txflr") - -// User extension aux register io_i2c_mst0_rxflr -#define AR_IO_I2C_MST0_RXFLR 0x80012078 -#pragma Aux_register(0x80012078, name=>"io_i2c_mst0_rxflr") - -// User extension aux register io_i2c_mst0_sda_hold -#define AR_IO_I2C_MST0_SDA_HOLD 0x8001207c -#pragma Aux_register(0x8001207c, name=>"io_i2c_mst0_sda_hold") - -// User extension aux register io_i2c_mst0_tx_abrt_source -#define AR_IO_I2C_MST0_TX_ABRT_SOURCE 0x80012080 -#pragma Aux_register(0x80012080, name=>"io_i2c_mst0_tx_abrt_source") - -// User extension aux register io_i2c_mst0_enable_status -#define AR_IO_I2C_MST0_ENABLE_STATUS 0x8001209c -#pragma Aux_register(0x8001209c, name=>"io_i2c_mst0_enable_status") - -// User extension aux register io_i2c_mst0_fs_spklen -#define AR_IO_I2C_MST0_FS_SPKLEN 0x800120a0 -#pragma Aux_register(0x800120a0, name=>"io_i2c_mst0_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_SLV0_PRESENT 1 - -// User extension aux register io_i2c_slv0_clken -#define AR_IO_I2C_SLV0_CLKEN 0x800130c0 -#pragma Aux_register(0x800130c0, name=>"io_i2c_slv0_clken") - -// User extension aux register io_i2c_slv0_con -#define AR_IO_I2C_SLV0_CON 0x80013000 -#pragma Aux_register(0x80013000, name=>"io_i2c_slv0_con") - -// User extension aux register io_i2c_slv0_sar -#define AR_IO_I2C_SLV0_SAR 0x80013008 -#pragma Aux_register(0x80013008, name=>"io_i2c_slv0_sar") - -// User extension aux register io_i2c_slv0_data_cmd -#define AR_IO_I2C_SLV0_DATA_CMD 0x80013010 -#pragma Aux_register(0x80013010, name=>"io_i2c_slv0_data_cmd") - -// User extension aux register io_i2c_slv0_intr_stat -#define AR_IO_I2C_SLV0_INTR_STAT 0x8001302c -#pragma Aux_register(0x8001302c, name=>"io_i2c_slv0_intr_stat") - -// User extension aux register io_i2c_slv0_intr_mask -#define AR_IO_I2C_SLV0_INTR_MASK 0x80013030 -#pragma Aux_register(0x80013030, name=>"io_i2c_slv0_intr_mask") - -// User extension aux register io_i2c_slv0_raw_intr_stat -#define AR_IO_I2C_SLV0_RAW_INTR_STAT 0x80013034 -#pragma Aux_register(0x80013034, name=>"io_i2c_slv0_raw_intr_stat") - -// User extension aux register io_i2c_slv0_rx_tl -#define AR_IO_I2C_SLV0_RX_TL 0x80013038 -#pragma Aux_register(0x80013038, name=>"io_i2c_slv0_rx_tl") - -// User extension aux register io_i2c_slv0_tx_tl -#define AR_IO_I2C_SLV0_TX_TL 0x8001303c -#pragma Aux_register(0x8001303c, name=>"io_i2c_slv0_tx_tl") - -// User extension aux register io_i2c_slv0_clr_intr -#define AR_IO_I2C_SLV0_CLR_INTR 0x80013040 -#pragma Aux_register(0x80013040, name=>"io_i2c_slv0_clr_intr") - -// User extension aux register io_i2c_slv0_clr_rx_under -#define AR_IO_I2C_SLV0_CLR_RX_UNDER 0x80013044 -#pragma 
Aux_register(0x80013044, name=>"io_i2c_slv0_clr_rx_under") - -// User extension aux register io_i2c_slv0_clr_rx_over -#define AR_IO_I2C_SLV0_CLR_RX_OVER 0x80013048 -#pragma Aux_register(0x80013048, name=>"io_i2c_slv0_clr_rx_over") - -// User extension aux register io_i2c_slv0_clr_tx_over -#define AR_IO_I2C_SLV0_CLR_TX_OVER 0x8001304c -#pragma Aux_register(0x8001304c, name=>"io_i2c_slv0_clr_tx_over") - -// User extension aux register io_i2c_slv0_clr_rd_req -#define AR_IO_I2C_SLV0_CLR_RD_REQ 0x80013050 -#pragma Aux_register(0x80013050, name=>"io_i2c_slv0_clr_rd_req") - -// User extension aux register io_i2c_slv0_clr_tx_abrt -#define AR_IO_I2C_SLV0_CLR_TX_ABRT 0x80013054 -#pragma Aux_register(0x80013054, name=>"io_i2c_slv0_clr_tx_abrt") - -// User extension aux register io_i2c_slv0_clr_rx_done -#define AR_IO_I2C_SLV0_CLR_RX_DONE 0x80013058 -#pragma Aux_register(0x80013058, name=>"io_i2c_slv0_clr_rx_done") - -// User extension aux register io_i2c_slv0_clr_activity -#define AR_IO_I2C_SLV0_CLR_ACTIVITY 0x8001305c -#pragma Aux_register(0x8001305c, name=>"io_i2c_slv0_clr_activity") - -// User extension aux register io_i2c_slv0_clr_stop_det -#define AR_IO_I2C_SLV0_CLR_STOP_DET 0x80013060 -#pragma Aux_register(0x80013060, name=>"io_i2c_slv0_clr_stop_det") - -// User extension aux register io_i2c_slv0_clr_start_det -#define AR_IO_I2C_SLV0_CLR_START_DET 0x80013064 -#pragma Aux_register(0x80013064, name=>"io_i2c_slv0_clr_start_det") - -// User extension aux register io_i2c_slv0_enable -#define AR_IO_I2C_SLV0_ENABLE 0x8001306c -#pragma Aux_register(0x8001306c, name=>"io_i2c_slv0_enable") - -// User extension aux register io_i2c_slv0_status -#define AR_IO_I2C_SLV0_STATUS 0x80013070 -#pragma Aux_register(0x80013070, name=>"io_i2c_slv0_status") - -// User extension aux register io_i2c_slv0_txflr -#define AR_IO_I2C_SLV0_TXFLR 0x80013074 -#pragma Aux_register(0x80013074, name=>"io_i2c_slv0_txflr") - -// User extension aux register io_i2c_slv0_rxflr -#define AR_IO_I2C_SLV0_RXFLR 0x80013078 -#pragma Aux_register(0x80013078, name=>"io_i2c_slv0_rxflr") - -// User extension aux register io_i2c_slv0_sda_hold -#define AR_IO_I2C_SLV0_SDA_HOLD 0x8001307c -#pragma Aux_register(0x8001307c, name=>"io_i2c_slv0_sda_hold") - -// User extension aux register io_i2c_slv0_tx_abrt_source -#define AR_IO_I2C_SLV0_TX_ABRT_SOURCE 0x80013080 -#pragma Aux_register(0x80013080, name=>"io_i2c_slv0_tx_abrt_source") - -// User extension aux register io_i2c_slv0_sda_setup -#define AR_IO_I2C_SLV0_SDA_SETUP 0x80013094 -#pragma Aux_register(0x80013094, name=>"io_i2c_slv0_sda_setup") - -// User extension aux register io_i2c_slv0_enable_status -#define AR_IO_I2C_SLV0_ENABLE_STATUS 0x8001309c -#pragma Aux_register(0x8001309c, name=>"io_i2c_slv0_enable_status") - -// User extension aux register io_i2c_slv0_fs_spklen -#define AR_IO_I2C_SLV0_FS_SPKLEN 0x800130a0 -#pragma Aux_register(0x800130a0, name=>"io_i2c_slv0_fs_spklen") - -// User extension aux register io_i2c_slv0_clr_restart_det -#define AR_IO_I2C_SLV0_CLR_RESTART_DET 0x800130a8 -#pragma Aux_register(0x800130a8, name=>"io_i2c_slv0_clr_restart_det") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST0_PRESENT 1 - -// User extension aux register io_spi_mst0_ctrlr0 -#define AR_IO_SPI_MST0_CTRLR0 0x80010000 -#pragma Aux_register(0x80010000, name=>"io_spi_mst0_ctrlr0") - -// User extension aux register io_spi_mst0_ctrlr1 -#define AR_IO_SPI_MST0_CTRLR1 0x80010001 -#pragma Aux_register(0x80010001, name=>"io_spi_mst0_ctrlr1") - -// User extension aux register io_spi_mst0_spien -#define 
AR_IO_SPI_MST0_SPIEN 0x80010002 -#pragma Aux_register(0x80010002, name=>"io_spi_mst0_spien") - -// User extension aux register io_spi_mst0_ser -#define AR_IO_SPI_MST0_SER 0x80010004 -#pragma Aux_register(0x80010004, name=>"io_spi_mst0_ser") - -// User extension aux register io_spi_mst0_baudr -#define AR_IO_SPI_MST0_BAUDR 0x80010005 -#pragma Aux_register(0x80010005, name=>"io_spi_mst0_baudr") - -// User extension aux register io_spi_mst0_txftlr -#define AR_IO_SPI_MST0_TXFTLR 0x80010006 -#pragma Aux_register(0x80010006, name=>"io_spi_mst0_txftlr") - -// User extension aux register io_spi_mst0_rxftlr -#define AR_IO_SPI_MST0_RXFTLR 0x80010007 -#pragma Aux_register(0x80010007, name=>"io_spi_mst0_rxftlr") - -// User extension aux register io_spi_mst0_txflr -#define AR_IO_SPI_MST0_TXFLR 0x80010008 -#pragma Aux_register(0x80010008, name=>"io_spi_mst0_txflr") - -// User extension aux register io_spi_mst0_rxflr -#define AR_IO_SPI_MST0_RXFLR 0x80010009 -#pragma Aux_register(0x80010009, name=>"io_spi_mst0_rxflr") - -// User extension aux register io_spi_mst0_sr -#define AR_IO_SPI_MST0_SR 0x8001000a -#pragma Aux_register(0x8001000a, name=>"io_spi_mst0_sr") - -// User extension aux register io_spi_mst0_imr -#define AR_IO_SPI_MST0_IMR 0x8001000b -#pragma Aux_register(0x8001000b, name=>"io_spi_mst0_imr") - -// User extension aux register io_spi_mst0_isr -#define AR_IO_SPI_MST0_ISR 0x8001000c -#pragma Aux_register(0x8001000c, name=>"io_spi_mst0_isr") - -// User extension aux register io_spi_mst0_risr -#define AR_IO_SPI_MST0_RISR 0x8001000d -#pragma Aux_register(0x8001000d, name=>"io_spi_mst0_risr") - -// User extension aux register io_spi_mst0_txoicr -#define AR_IO_SPI_MST0_TXOICR 0x8001000e -#pragma Aux_register(0x8001000e, name=>"io_spi_mst0_txoicr") - -// User extension aux register io_spi_mst0_rxoicr -#define AR_IO_SPI_MST0_RXOICR 0x8001000f -#pragma Aux_register(0x8001000f, name=>"io_spi_mst0_rxoicr") - -// User extension aux register io_spi_mst0_rxuicr -#define AR_IO_SPI_MST0_RXUICR 0x80010010 -#pragma Aux_register(0x80010010, name=>"io_spi_mst0_rxuicr") - -// User extension aux register io_spi_mst0_icr -#define AR_IO_SPI_MST0_ICR 0x80010012 -#pragma Aux_register(0x80010012, name=>"io_spi_mst0_icr") - -// User extension aux register io_spi_mst0_clken -#define AR_IO_SPI_MST0_CLKEN 0x80010016 -#pragma Aux_register(0x80010016, name=>"io_spi_mst0_clken") - -// User extension aux register io_spi_mst0_dr -#define AR_IO_SPI_MST0_DR 0x80010018 -#pragma Aux_register(0x80010018, name=>"io_spi_mst0_dr") - -// User extension aux register io_spi_mst0_rx_sample_dly -#define AR_IO_SPI_MST0_RX_SAMPLE_DLY 0x8001003c -#pragma Aux_register(0x8001003c, name=>"io_spi_mst0_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_SUBSYS_BCR_PRESENT 1 - -// User extension aux register SUBSYS_BUILD -#define AR_SUBSYS_BUILD 0xf0 -#pragma Aux_register(0xf0, name=>"SUBSYS_BUILD") - -// User extension aux register SUBSYS_DSP_0_BUILD -#define AR_SUBSYS_DSP_0_BUILD 0xa00 -#pragma Aux_register(0xa00, name=>"SUBSYS_DSP_0_BUILD") - -// User extension aux register SUBSYS_DSP_0_CONFIG -#define AR_SUBSYS_DSP_0_CONFIG 0xa02 -#pragma Aux_register(0xa02, name=>"SUBSYS_DSP_0_CONFIG") - -// User extension aux register SUBSYS_IO_0_BUILD -#define AR_SUBSYS_IO_0_BUILD 0xa04 -#pragma Aux_register(0xa04, name=>"SUBSYS_IO_0_BUILD") - -// User extension aux register SUBSYS_IO_1_BUILD -#define AR_SUBSYS_IO_1_BUILD 0xa05 -#pragma Aux_register(0xa05, name=>"SUBSYS_IO_1_BUILD") - -// User extension aux register SUBSYS_IO_2_BUILD -#define 
AR_SUBSYS_IO_2_BUILD 0xa06 -#pragma Aux_register(0xa06, name=>"SUBSYS_IO_2_BUILD") - -// User extension aux register SUBSYS_UAUX_OFFSET -#define AR_SUBSYS_UAUX_OFFSET 0xa1e -#pragma Aux_register(0xa1e, name=>"SUBSYS_UAUX_OFFSET") - -// User extension aux register SUBSYS_APEX_OFFSET -#define AR_SUBSYS_APEX_OFFSET 0xa1f -#pragma Aux_register(0xa1f, name=>"SUBSYS_APEX_OFFSET") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST1_PRESENT 1 - -// User extension aux register io_spi_mst1_ctrlr0 -#define AR_IO_SPI_MST1_CTRLR0 0x80010100 -#pragma Aux_register(0x80010100, name=>"io_spi_mst1_ctrlr0") - -// User extension aux register io_spi_mst1_ctrlr1 -#define AR_IO_SPI_MST1_CTRLR1 0x80010101 -#pragma Aux_register(0x80010101, name=>"io_spi_mst1_ctrlr1") - -// User extension aux register io_spi_mst1_spien -#define AR_IO_SPI_MST1_SPIEN 0x80010102 -#pragma Aux_register(0x80010102, name=>"io_spi_mst1_spien") - -// User extension aux register io_spi_mst1_ser -#define AR_IO_SPI_MST1_SER 0x80010104 -#pragma Aux_register(0x80010104, name=>"io_spi_mst1_ser") - -// User extension aux register io_spi_mst1_baudr -#define AR_IO_SPI_MST1_BAUDR 0x80010105 -#pragma Aux_register(0x80010105, name=>"io_spi_mst1_baudr") - -// User extension aux register io_spi_mst1_txftlr -#define AR_IO_SPI_MST1_TXFTLR 0x80010106 -#pragma Aux_register(0x80010106, name=>"io_spi_mst1_txftlr") - -// User extension aux register io_spi_mst1_rxftlr -#define AR_IO_SPI_MST1_RXFTLR 0x80010107 -#pragma Aux_register(0x80010107, name=>"io_spi_mst1_rxftlr") - -// User extension aux register io_spi_mst1_txflr -#define AR_IO_SPI_MST1_TXFLR 0x80010108 -#pragma Aux_register(0x80010108, name=>"io_spi_mst1_txflr") - -// User extension aux register io_spi_mst1_rxflr -#define AR_IO_SPI_MST1_RXFLR 0x80010109 -#pragma Aux_register(0x80010109, name=>"io_spi_mst1_rxflr") - -// User extension aux register io_spi_mst1_sr -#define AR_IO_SPI_MST1_SR 0x8001010a -#pragma Aux_register(0x8001010a, name=>"io_spi_mst1_sr") - -// User extension aux register io_spi_mst1_imr -#define AR_IO_SPI_MST1_IMR 0x8001010b -#pragma Aux_register(0x8001010b, name=>"io_spi_mst1_imr") - -// User extension aux register io_spi_mst1_isr -#define AR_IO_SPI_MST1_ISR 0x8001010c -#pragma Aux_register(0x8001010c, name=>"io_spi_mst1_isr") - -// User extension aux register io_spi_mst1_risr -#define AR_IO_SPI_MST1_RISR 0x8001010d -#pragma Aux_register(0x8001010d, name=>"io_spi_mst1_risr") - -// User extension aux register io_spi_mst1_txoicr -#define AR_IO_SPI_MST1_TXOICR 0x8001010e -#pragma Aux_register(0x8001010e, name=>"io_spi_mst1_txoicr") - -// User extension aux register io_spi_mst1_rxoicr -#define AR_IO_SPI_MST1_RXOICR 0x8001010f -#pragma Aux_register(0x8001010f, name=>"io_spi_mst1_rxoicr") - -// User extension aux register io_spi_mst1_rxuicr -#define AR_IO_SPI_MST1_RXUICR 0x80010110 -#pragma Aux_register(0x80010110, name=>"io_spi_mst1_rxuicr") - -// User extension aux register io_spi_mst1_icr -#define AR_IO_SPI_MST1_ICR 0x80010112 -#pragma Aux_register(0x80010112, name=>"io_spi_mst1_icr") - -// User extension aux register io_spi_mst1_clken -#define AR_IO_SPI_MST1_CLKEN 0x80010116 -#pragma Aux_register(0x80010116, name=>"io_spi_mst1_clken") - -// User extension aux register io_spi_mst1_dr -#define AR_IO_SPI_MST1_DR 0x80010118 -#pragma Aux_register(0x80010118, name=>"io_spi_mst1_dr") - -// User extension aux register io_spi_mst1_rx_sample_dly -#define AR_IO_SPI_MST1_RX_SAMPLE_DLY 0x8001013c -#pragma Aux_register(0x8001013c, name=>"io_spi_mst1_rx_sample_dly") -#define 
APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_MST2_PRESENT 1 - -// User extension aux register io_spi_mst2_ctrlr0 -#define AR_IO_SPI_MST2_CTRLR0 0x80010200 -#pragma Aux_register(0x80010200, name=>"io_spi_mst2_ctrlr0") - -// User extension aux register io_spi_mst2_ctrlr1 -#define AR_IO_SPI_MST2_CTRLR1 0x80010201 -#pragma Aux_register(0x80010201, name=>"io_spi_mst2_ctrlr1") - -// User extension aux register io_spi_mst2_spien -#define AR_IO_SPI_MST2_SPIEN 0x80010202 -#pragma Aux_register(0x80010202, name=>"io_spi_mst2_spien") - -// User extension aux register io_spi_mst2_ser -#define AR_IO_SPI_MST2_SER 0x80010204 -#pragma Aux_register(0x80010204, name=>"io_spi_mst2_ser") - -// User extension aux register io_spi_mst2_baudr -#define AR_IO_SPI_MST2_BAUDR 0x80010205 -#pragma Aux_register(0x80010205, name=>"io_spi_mst2_baudr") - -// User extension aux register io_spi_mst2_txftlr -#define AR_IO_SPI_MST2_TXFTLR 0x80010206 -#pragma Aux_register(0x80010206, name=>"io_spi_mst2_txftlr") - -// User extension aux register io_spi_mst2_rxftlr -#define AR_IO_SPI_MST2_RXFTLR 0x80010207 -#pragma Aux_register(0x80010207, name=>"io_spi_mst2_rxftlr") - -// User extension aux register io_spi_mst2_txflr -#define AR_IO_SPI_MST2_TXFLR 0x80010208 -#pragma Aux_register(0x80010208, name=>"io_spi_mst2_txflr") - -// User extension aux register io_spi_mst2_rxflr -#define AR_IO_SPI_MST2_RXFLR 0x80010209 -#pragma Aux_register(0x80010209, name=>"io_spi_mst2_rxflr") - -// User extension aux register io_spi_mst2_sr -#define AR_IO_SPI_MST2_SR 0x8001020a -#pragma Aux_register(0x8001020a, name=>"io_spi_mst2_sr") - -// User extension aux register io_spi_mst2_imr -#define AR_IO_SPI_MST2_IMR 0x8001020b -#pragma Aux_register(0x8001020b, name=>"io_spi_mst2_imr") - -// User extension aux register io_spi_mst2_isr -#define AR_IO_SPI_MST2_ISR 0x8001020c -#pragma Aux_register(0x8001020c, name=>"io_spi_mst2_isr") - -// User extension aux register io_spi_mst2_risr -#define AR_IO_SPI_MST2_RISR 0x8001020d -#pragma Aux_register(0x8001020d, name=>"io_spi_mst2_risr") - -// User extension aux register io_spi_mst2_txoicr -#define AR_IO_SPI_MST2_TXOICR 0x8001020e -#pragma Aux_register(0x8001020e, name=>"io_spi_mst2_txoicr") - -// User extension aux register io_spi_mst2_rxoicr -#define AR_IO_SPI_MST2_RXOICR 0x8001020f -#pragma Aux_register(0x8001020f, name=>"io_spi_mst2_rxoicr") - -// User extension aux register io_spi_mst2_rxuicr -#define AR_IO_SPI_MST2_RXUICR 0x80010210 -#pragma Aux_register(0x80010210, name=>"io_spi_mst2_rxuicr") - -// User extension aux register io_spi_mst2_icr -#define AR_IO_SPI_MST2_ICR 0x80010212 -#pragma Aux_register(0x80010212, name=>"io_spi_mst2_icr") - -// User extension aux register io_spi_mst2_clken -#define AR_IO_SPI_MST2_CLKEN 0x80010216 -#pragma Aux_register(0x80010216, name=>"io_spi_mst2_clken") - -// User extension aux register io_spi_mst2_dr -#define AR_IO_SPI_MST2_DR 0x80010218 -#pragma Aux_register(0x80010218, name=>"io_spi_mst2_dr") - -// User extension aux register io_spi_mst2_rx_sample_dly -#define AR_IO_SPI_MST2_RX_SAMPLE_DLY 0x8001023c -#pragma Aux_register(0x8001023c, name=>"io_spi_mst2_rx_sample_dly") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_SPI_SLV0_PRESENT 1 - -// User extension aux register io_spi_slv0_ctrlr0 -#define AR_IO_SPI_SLV0_CTRLR0 0x80011000 -#pragma Aux_register(0x80011000, name=>"io_spi_slv0_ctrlr0") - -// User extension aux register io_spi_slv0_spien -#define AR_IO_SPI_SLV0_SPIEN 0x80011002 -#pragma Aux_register(0x80011002, name=>"io_spi_slv0_spien") - -// User extension aux register io_spi_slv0_txftlr 
-#define AR_IO_SPI_SLV0_TXFTLR 0x80011006 -#pragma Aux_register(0x80011006, name=>"io_spi_slv0_txftlr") - -// User extension aux register io_spi_slv0_rxftlr -#define AR_IO_SPI_SLV0_RXFTLR 0x80011007 -#pragma Aux_register(0x80011007, name=>"io_spi_slv0_rxftlr") - -// User extension aux register io_spi_slv0_txflr -#define AR_IO_SPI_SLV0_TXFLR 0x80011008 -#pragma Aux_register(0x80011008, name=>"io_spi_slv0_txflr") - -// User extension aux register io_spi_slv0_rxflr -#define AR_IO_SPI_SLV0_RXFLR 0x80011009 -#pragma Aux_register(0x80011009, name=>"io_spi_slv0_rxflr") - -// User extension aux register io_spi_slv0_sr -#define AR_IO_SPI_SLV0_SR 0x8001100a -#pragma Aux_register(0x8001100a, name=>"io_spi_slv0_sr") - -// User extension aux register io_spi_slv0_imr -#define AR_IO_SPI_SLV0_IMR 0x8001100b -#pragma Aux_register(0x8001100b, name=>"io_spi_slv0_imr") - -// User extension aux register io_spi_slv0_isr -#define AR_IO_SPI_SLV0_ISR 0x8001100c -#pragma Aux_register(0x8001100c, name=>"io_spi_slv0_isr") - -// User extension aux register io_spi_slv0_risr -#define AR_IO_SPI_SLV0_RISR 0x8001100d -#pragma Aux_register(0x8001100d, name=>"io_spi_slv0_risr") - -// User extension aux register io_spi_slv0_txoicr -#define AR_IO_SPI_SLV0_TXOICR 0x8001100e -#pragma Aux_register(0x8001100e, name=>"io_spi_slv0_txoicr") - -// User extension aux register io_spi_slv0_rxoicr -#define AR_IO_SPI_SLV0_RXOICR 0x8001100f -#pragma Aux_register(0x8001100f, name=>"io_spi_slv0_rxoicr") - -// User extension aux register io_spi_slv0_rxuicr -#define AR_IO_SPI_SLV0_RXUICR 0x80011010 -#pragma Aux_register(0x80011010, name=>"io_spi_slv0_rxuicr") - -// User extension aux register io_spi_slv0_icr -#define AR_IO_SPI_SLV0_ICR 0x80011012 -#pragma Aux_register(0x80011012, name=>"io_spi_slv0_icr") - -// User extension aux register io_spi_slv0_clken -#define AR_IO_SPI_SLV0_CLKEN 0x80011016 -#pragma Aux_register(0x80011016, name=>"io_spi_slv0_clken") - -// User extension aux register io_spi_slv0_dr -#define AR_IO_SPI_SLV0_DR 0x80011018 -#pragma Aux_register(0x80011018, name=>"io_spi_slv0_dr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO1_PRESENT 1 - -// User extension aux register io_gpio1_debounce -#define AR_IO_GPIO1_DEBOUNCE 0x80017148 -#pragma Aux_register(0x80017148, name=>"io_gpio1_debounce") - -// User extension aux register io_gpio1_clken -#define AR_IO_GPIO1_CLKEN 0x80017180 -#pragma Aux_register(0x80017180, name=>"io_gpio1_clken") - -// User extension aux register io_gpio1_swporta_dr -#define AR_IO_GPIO1_SWPORTA_DR 0x80017100 -#pragma Aux_register(0x80017100, name=>"io_gpio1_swporta_dr") - -// User extension aux register io_gpio1_swporta_ddr -#define AR_IO_GPIO1_SWPORTA_DDR 0x80017104 -#pragma Aux_register(0x80017104, name=>"io_gpio1_swporta_ddr") - -// User extension aux register io_gpio1_inten -#define AR_IO_GPIO1_INTEN 0x80017130 -#pragma Aux_register(0x80017130, name=>"io_gpio1_inten") - -// User extension aux register io_gpio1_intmask -#define AR_IO_GPIO1_INTMASK 0x80017134 -#pragma Aux_register(0x80017134, name=>"io_gpio1_intmask") - -// User extension aux register io_gpio1_inttype_level -#define AR_IO_GPIO1_INTTYPE_LEVEL 0x80017138 -#pragma Aux_register(0x80017138, name=>"io_gpio1_inttype_level") - -// User extension aux register io_gpio1_int_polarity -#define AR_IO_GPIO1_INT_POLARITY 0x8001713c -#pragma Aux_register(0x8001713c, name=>"io_gpio1_int_polarity") - -// User extension aux register io_gpio1_intstatus -#define AR_IO_GPIO1_INTSTATUS 0x80017140 -#pragma Aux_register(0x80017140, name=>"io_gpio1_intstatus") - -// User 
extension aux register io_gpio1_raw_intstatus -#define AR_IO_GPIO1_RAW_INTSTATUS 0x80017144 -#pragma Aux_register(0x80017144, name=>"io_gpio1_raw_intstatus") - -// User extension aux register io_gpio1_porta_eoi -#define AR_IO_GPIO1_PORTA_EOI 0x8001714c -#pragma Aux_register(0x8001714c, name=>"io_gpio1_porta_eoi") - -// User extension aux register io_gpio1_ext_porta -#define AR_IO_GPIO1_EXT_PORTA 0x80017150 -#pragma Aux_register(0x80017150, name=>"io_gpio1_ext_porta") - -// User extension aux register io_gpio1_ls_sync -#define AR_IO_GPIO1_LS_SYNC 0x80017160 -#pragma Aux_register(0x80017160, name=>"io_gpio1_ls_sync") - -// User extension aux register io_gpio1_int_bothedge -#define AR_IO_GPIO1_INT_BOTHEDGE 0x80017168 -#pragma Aux_register(0x80017168, name=>"io_gpio1_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_GPIO2_PRESENT 1 - -// User extension aux register io_gpio2_debounce -#define AR_IO_GPIO2_DEBOUNCE 0x80017248 -#pragma Aux_register(0x80017248, name=>"io_gpio2_debounce") - -// User extension aux register io_gpio2_clken -#define AR_IO_GPIO2_CLKEN 0x80017280 -#pragma Aux_register(0x80017280, name=>"io_gpio2_clken") - -// User extension aux register io_gpio2_swporta_dr -#define AR_IO_GPIO2_SWPORTA_DR 0x80017200 -#pragma Aux_register(0x80017200, name=>"io_gpio2_swporta_dr") - -// User extension aux register io_gpio2_swporta_ddr -#define AR_IO_GPIO2_SWPORTA_DDR 0x80017204 -#pragma Aux_register(0x80017204, name=>"io_gpio2_swporta_ddr") - -// User extension aux register io_gpio2_inten -#define AR_IO_GPIO2_INTEN 0x80017230 -#pragma Aux_register(0x80017230, name=>"io_gpio2_inten") - -// User extension aux register io_gpio2_intmask -#define AR_IO_GPIO2_INTMASK 0x80017234 -#pragma Aux_register(0x80017234, name=>"io_gpio2_intmask") - -// User extension aux register io_gpio2_inttype_level -#define AR_IO_GPIO2_INTTYPE_LEVEL 0x80017238 -#pragma Aux_register(0x80017238, name=>"io_gpio2_inttype_level") - -// User extension aux register io_gpio2_int_polarity -#define AR_IO_GPIO2_INT_POLARITY 0x8001723c -#pragma Aux_register(0x8001723c, name=>"io_gpio2_int_polarity") - -// User extension aux register io_gpio2_intstatus -#define AR_IO_GPIO2_INTSTATUS 0x80017240 -#pragma Aux_register(0x80017240, name=>"io_gpio2_intstatus") - -// User extension aux register io_gpio2_raw_intstatus -#define AR_IO_GPIO2_RAW_INTSTATUS 0x80017244 -#pragma Aux_register(0x80017244, name=>"io_gpio2_raw_intstatus") - -// User extension aux register io_gpio2_porta_eoi -#define AR_IO_GPIO2_PORTA_EOI 0x8001724c -#pragma Aux_register(0x8001724c, name=>"io_gpio2_porta_eoi") - -// User extension aux register io_gpio2_ext_porta -#define AR_IO_GPIO2_EXT_PORTA 0x80017250 -#pragma Aux_register(0x80017250, name=>"io_gpio2_ext_porta") - -// User extension aux register io_gpio2_ls_sync -#define AR_IO_GPIO2_LS_SYNC 0x80017260 -#pragma Aux_register(0x80017260, name=>"io_gpio2_ls_sync") - -// User extension aux register io_gpio2_int_bothedge -#define AR_IO_GPIO2_INT_BOTHEDGE 0x80017268 -#pragma Aux_register(0x80017268, name=>"io_gpio2_int_bothedge") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST1_PRESENT 1 - -// User extension aux register io_i2c_mst1_clken -#define AR_IO_I2C_MST1_CLKEN 0x800121c0 -#pragma Aux_register(0x800121c0, name=>"io_i2c_mst1_clken") - -// User extension aux register io_i2c_mst1_con -#define AR_IO_I2C_MST1_CON 0x80012100 -#pragma Aux_register(0x80012100, name=>"io_i2c_mst1_con") - -// User extension aux register io_i2c_mst1_tar -#define AR_IO_I2C_MST1_TAR 0x80012104 -#pragma Aux_register(0x80012104, 
name=>"io_i2c_mst1_tar") - -// User extension aux register io_i2c_mst1_data_cmd -#define AR_IO_I2C_MST1_DATA_CMD 0x80012110 -#pragma Aux_register(0x80012110, name=>"io_i2c_mst1_data_cmd") - -// User extension aux register io_i2c_mst1_ss_scl_hcnt -#define AR_IO_I2C_MST1_SS_SCL_HCNT 0x80012114 -#pragma Aux_register(0x80012114, name=>"io_i2c_mst1_ss_scl_hcnt") - -// User extension aux register io_i2c_mst1_ss_scl_lcnt -#define AR_IO_I2C_MST1_SS_SCL_LCNT 0x80012118 -#pragma Aux_register(0x80012118, name=>"io_i2c_mst1_ss_scl_lcnt") - -// User extension aux register io_i2c_mst1_fs_scl_hcnt -#define AR_IO_I2C_MST1_FS_SCL_HCNT 0x8001211c -#pragma Aux_register(0x8001211c, name=>"io_i2c_mst1_fs_scl_hcnt") - -// User extension aux register io_i2c_mst1_fs_scl_lcnt -#define AR_IO_I2C_MST1_FS_SCL_LCNT 0x80012120 -#pragma Aux_register(0x80012120, name=>"io_i2c_mst1_fs_scl_lcnt") - -// User extension aux register io_i2c_mst1_intr_stat -#define AR_IO_I2C_MST1_INTR_STAT 0x8001212c -#pragma Aux_register(0x8001212c, name=>"io_i2c_mst1_intr_stat") - -// User extension aux register io_i2c_mst1_intr_mask -#define AR_IO_I2C_MST1_INTR_MASK 0x80012130 -#pragma Aux_register(0x80012130, name=>"io_i2c_mst1_intr_mask") - -// User extension aux register io_i2c_mst1_raw_intr_stat -#define AR_IO_I2C_MST1_RAW_INTR_STAT 0x80012134 -#pragma Aux_register(0x80012134, name=>"io_i2c_mst1_raw_intr_stat") - -// User extension aux register io_i2c_mst1_rx_tl -#define AR_IO_I2C_MST1_RX_TL 0x80012138 -#pragma Aux_register(0x80012138, name=>"io_i2c_mst1_rx_tl") - -// User extension aux register io_i2c_mst1_tx_tl -#define AR_IO_I2C_MST1_TX_TL 0x8001213c -#pragma Aux_register(0x8001213c, name=>"io_i2c_mst1_tx_tl") - -// User extension aux register io_i2c_mst1_clr_intr -#define AR_IO_I2C_MST1_CLR_INTR 0x80012140 -#pragma Aux_register(0x80012140, name=>"io_i2c_mst1_clr_intr") - -// User extension aux register io_i2c_mst1_clr_rx_under -#define AR_IO_I2C_MST1_CLR_RX_UNDER 0x80012144 -#pragma Aux_register(0x80012144, name=>"io_i2c_mst1_clr_rx_under") - -// User extension aux register io_i2c_mst1_clr_rx_over -#define AR_IO_I2C_MST1_CLR_RX_OVER 0x80012148 -#pragma Aux_register(0x80012148, name=>"io_i2c_mst1_clr_rx_over") - -// User extension aux register io_i2c_mst1_clr_tx_over -#define AR_IO_I2C_MST1_CLR_TX_OVER 0x8001214c -#pragma Aux_register(0x8001214c, name=>"io_i2c_mst1_clr_tx_over") - -// User extension aux register io_i2c_mst1_clr_tx_abrt -#define AR_IO_I2C_MST1_CLR_TX_ABRT 0x80012154 -#pragma Aux_register(0x80012154, name=>"io_i2c_mst1_clr_tx_abrt") - -// User extension aux register io_i2c_mst1_clr_activity -#define AR_IO_I2C_MST1_CLR_ACTIVITY 0x8001215c -#pragma Aux_register(0x8001215c, name=>"io_i2c_mst1_clr_activity") - -// User extension aux register io_i2c_mst1_clr_stop_det -#define AR_IO_I2C_MST1_CLR_STOP_DET 0x80012160 -#pragma Aux_register(0x80012160, name=>"io_i2c_mst1_clr_stop_det") - -// User extension aux register io_i2c_mst1_clr_start_det -#define AR_IO_I2C_MST1_CLR_START_DET 0x80012164 -#pragma Aux_register(0x80012164, name=>"io_i2c_mst1_clr_start_det") - -// User extension aux register io_i2c_mst1_enable -#define AR_IO_I2C_MST1_ENABLE 0x8001216c -#pragma Aux_register(0x8001216c, name=>"io_i2c_mst1_enable") - -// User extension aux register io_i2c_mst1_status -#define AR_IO_I2C_MST1_STATUS 0x80012170 -#pragma Aux_register(0x80012170, name=>"io_i2c_mst1_status") - -// User extension aux register io_i2c_mst1_txflr -#define AR_IO_I2C_MST1_TXFLR 0x80012174 -#pragma Aux_register(0x80012174, name=>"io_i2c_mst1_txflr") - -// User 
extension aux register io_i2c_mst1_rxflr -#define AR_IO_I2C_MST1_RXFLR 0x80012178 -#pragma Aux_register(0x80012178, name=>"io_i2c_mst1_rxflr") - -// User extension aux register io_i2c_mst1_sda_hold -#define AR_IO_I2C_MST1_SDA_HOLD 0x8001217c -#pragma Aux_register(0x8001217c, name=>"io_i2c_mst1_sda_hold") - -// User extension aux register io_i2c_mst1_tx_abrt_source -#define AR_IO_I2C_MST1_TX_ABRT_SOURCE 0x80012180 -#pragma Aux_register(0x80012180, name=>"io_i2c_mst1_tx_abrt_source") - -// User extension aux register io_i2c_mst1_enable_status -#define AR_IO_I2C_MST1_ENABLE_STATUS 0x8001219c -#pragma Aux_register(0x8001219c, name=>"io_i2c_mst1_enable_status") - -// User extension aux register io_i2c_mst1_fs_spklen -#define AR_IO_I2C_MST1_FS_SPKLEN 0x800121a0 -#pragma Aux_register(0x800121a0, name=>"io_i2c_mst1_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2C_MST2_PRESENT 1 - -// User extension aux register io_i2c_mst2_clken -#define AR_IO_I2C_MST2_CLKEN 0x800122c0 -#pragma Aux_register(0x800122c0, name=>"io_i2c_mst2_clken") - -// User extension aux register io_i2c_mst2_con -#define AR_IO_I2C_MST2_CON 0x80012200 -#pragma Aux_register(0x80012200, name=>"io_i2c_mst2_con") - -// User extension aux register io_i2c_mst2_tar -#define AR_IO_I2C_MST2_TAR 0x80012204 -#pragma Aux_register(0x80012204, name=>"io_i2c_mst2_tar") - -// User extension aux register io_i2c_mst2_data_cmd -#define AR_IO_I2C_MST2_DATA_CMD 0x80012210 -#pragma Aux_register(0x80012210, name=>"io_i2c_mst2_data_cmd") - -// User extension aux register io_i2c_mst2_ss_scl_hcnt -#define AR_IO_I2C_MST2_SS_SCL_HCNT 0x80012214 -#pragma Aux_register(0x80012214, name=>"io_i2c_mst2_ss_scl_hcnt") - -// User extension aux register io_i2c_mst2_ss_scl_lcnt -#define AR_IO_I2C_MST2_SS_SCL_LCNT 0x80012218 -#pragma Aux_register(0x80012218, name=>"io_i2c_mst2_ss_scl_lcnt") - -// User extension aux register io_i2c_mst2_fs_scl_hcnt -#define AR_IO_I2C_MST2_FS_SCL_HCNT 0x8001221c -#pragma Aux_register(0x8001221c, name=>"io_i2c_mst2_fs_scl_hcnt") - -// User extension aux register io_i2c_mst2_fs_scl_lcnt -#define AR_IO_I2C_MST2_FS_SCL_LCNT 0x80012220 -#pragma Aux_register(0x80012220, name=>"io_i2c_mst2_fs_scl_lcnt") - -// User extension aux register io_i2c_mst2_intr_stat -#define AR_IO_I2C_MST2_INTR_STAT 0x8001222c -#pragma Aux_register(0x8001222c, name=>"io_i2c_mst2_intr_stat") - -// User extension aux register io_i2c_mst2_intr_mask -#define AR_IO_I2C_MST2_INTR_MASK 0x80012230 -#pragma Aux_register(0x80012230, name=>"io_i2c_mst2_intr_mask") - -// User extension aux register io_i2c_mst2_raw_intr_stat -#define AR_IO_I2C_MST2_RAW_INTR_STAT 0x80012234 -#pragma Aux_register(0x80012234, name=>"io_i2c_mst2_raw_intr_stat") - -// User extension aux register io_i2c_mst2_rx_tl -#define AR_IO_I2C_MST2_RX_TL 0x80012238 -#pragma Aux_register(0x80012238, name=>"io_i2c_mst2_rx_tl") - -// User extension aux register io_i2c_mst2_tx_tl -#define AR_IO_I2C_MST2_TX_TL 0x8001223c -#pragma Aux_register(0x8001223c, name=>"io_i2c_mst2_tx_tl") - -// User extension aux register io_i2c_mst2_clr_intr -#define AR_IO_I2C_MST2_CLR_INTR 0x80012240 -#pragma Aux_register(0x80012240, name=>"io_i2c_mst2_clr_intr") - -// User extension aux register io_i2c_mst2_clr_rx_under -#define AR_IO_I2C_MST2_CLR_RX_UNDER 0x80012244 -#pragma Aux_register(0x80012244, name=>"io_i2c_mst2_clr_rx_under") - -// User extension aux register io_i2c_mst2_clr_rx_over -#define AR_IO_I2C_MST2_CLR_RX_OVER 0x80012248 -#pragma Aux_register(0x80012248, name=>"io_i2c_mst2_clr_rx_over") - -// User extension aux register 
io_i2c_mst2_clr_tx_over -#define AR_IO_I2C_MST2_CLR_TX_OVER 0x8001224c -#pragma Aux_register(0x8001224c, name=>"io_i2c_mst2_clr_tx_over") - -// User extension aux register io_i2c_mst2_clr_tx_abrt -#define AR_IO_I2C_MST2_CLR_TX_ABRT 0x80012254 -#pragma Aux_register(0x80012254, name=>"io_i2c_mst2_clr_tx_abrt") - -// User extension aux register io_i2c_mst2_clr_activity -#define AR_IO_I2C_MST2_CLR_ACTIVITY 0x8001225c -#pragma Aux_register(0x8001225c, name=>"io_i2c_mst2_clr_activity") - -// User extension aux register io_i2c_mst2_clr_stop_det -#define AR_IO_I2C_MST2_CLR_STOP_DET 0x80012260 -#pragma Aux_register(0x80012260, name=>"io_i2c_mst2_clr_stop_det") - -// User extension aux register io_i2c_mst2_clr_start_det -#define AR_IO_I2C_MST2_CLR_START_DET 0x80012264 -#pragma Aux_register(0x80012264, name=>"io_i2c_mst2_clr_start_det") - -// User extension aux register io_i2c_mst2_enable -#define AR_IO_I2C_MST2_ENABLE 0x8001226c -#pragma Aux_register(0x8001226c, name=>"io_i2c_mst2_enable") - -// User extension aux register io_i2c_mst2_status -#define AR_IO_I2C_MST2_STATUS 0x80012270 -#pragma Aux_register(0x80012270, name=>"io_i2c_mst2_status") - -// User extension aux register io_i2c_mst2_txflr -#define AR_IO_I2C_MST2_TXFLR 0x80012274 -#pragma Aux_register(0x80012274, name=>"io_i2c_mst2_txflr") - -// User extension aux register io_i2c_mst2_rxflr -#define AR_IO_I2C_MST2_RXFLR 0x80012278 -#pragma Aux_register(0x80012278, name=>"io_i2c_mst2_rxflr") - -// User extension aux register io_i2c_mst2_sda_hold -#define AR_IO_I2C_MST2_SDA_HOLD 0x8001227c -#pragma Aux_register(0x8001227c, name=>"io_i2c_mst2_sda_hold") - -// User extension aux register io_i2c_mst2_tx_abrt_source -#define AR_IO_I2C_MST2_TX_ABRT_SOURCE 0x80012280 -#pragma Aux_register(0x80012280, name=>"io_i2c_mst2_tx_abrt_source") - -// User extension aux register io_i2c_mst2_enable_status -#define AR_IO_I2C_MST2_ENABLE_STATUS 0x8001229c -#pragma Aux_register(0x8001229c, name=>"io_i2c_mst2_enable_status") - -// User extension aux register io_i2c_mst2_fs_spklen -#define AR_IO_I2C_MST2_FS_SPKLEN 0x800122a0 -#pragma Aux_register(0x800122a0, name=>"io_i2c_mst2_fs_spklen") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART0_PRESENT 1 - -// User extension aux register io_uart0_clken -#define AR_IO_UART0_CLKEN 0x800140c0 -#pragma Aux_register(0x800140c0, name=>"io_uart0_clken") - -// User extension aux register io_uart0_rbr_thr_dll -#define AR_IO_UART0_RBR_THR_DLL 0x80014000 -#pragma Aux_register(0x80014000, name=>"io_uart0_rbr_thr_dll") - -// User extension aux register io_uart0_ier_dlh -#define AR_IO_UART0_IER_DLH 0x80014004 -#pragma Aux_register(0x80014004, name=>"io_uart0_ier_dlh") - -// User extension aux register io_uart0_iir_fcr -#define AR_IO_UART0_IIR_FCR 0x80014008 -#pragma Aux_register(0x80014008, name=>"io_uart0_iir_fcr") - -// User extension aux register io_uart0_lcr -#define AR_IO_UART0_LCR 0x8001400c -#pragma Aux_register(0x8001400c, name=>"io_uart0_lcr") - -// User extension aux register io_uart0_mcr -#define AR_IO_UART0_MCR 0x80014010 -#pragma Aux_register(0x80014010, name=>"io_uart0_mcr") - -// User extension aux register io_uart0_lsr -#define AR_IO_UART0_LSR 0x80014014 -#pragma Aux_register(0x80014014, name=>"io_uart0_lsr") - -// User extension aux register io_uart0_msr -#define AR_IO_UART0_MSR 0x80014018 -#pragma Aux_register(0x80014018, name=>"io_uart0_msr") - -// User extension aux register io_uart0_usr -#define AR_IO_UART0_USR 0x8001407c -#pragma Aux_register(0x8001407c, name=>"io_uart0_usr") -#define 
APEX_COM_ARC_HARDWARE_DFSS_IO_UART1_PRESENT 1 - -// User extension aux register io_uart1_clken -#define AR_IO_UART1_CLKEN 0x800141c0 -#pragma Aux_register(0x800141c0, name=>"io_uart1_clken") - -// User extension aux register io_uart1_rbr_thr_dll -#define AR_IO_UART1_RBR_THR_DLL 0x80014100 -#pragma Aux_register(0x80014100, name=>"io_uart1_rbr_thr_dll") - -// User extension aux register io_uart1_ier_dlh -#define AR_IO_UART1_IER_DLH 0x80014104 -#pragma Aux_register(0x80014104, name=>"io_uart1_ier_dlh") - -// User extension aux register io_uart1_iir_fcr -#define AR_IO_UART1_IIR_FCR 0x80014108 -#pragma Aux_register(0x80014108, name=>"io_uart1_iir_fcr") - -// User extension aux register io_uart1_lcr -#define AR_IO_UART1_LCR 0x8001410c -#pragma Aux_register(0x8001410c, name=>"io_uart1_lcr") - -// User extension aux register io_uart1_mcr -#define AR_IO_UART1_MCR 0x80014110 -#pragma Aux_register(0x80014110, name=>"io_uart1_mcr") - -// User extension aux register io_uart1_lsr -#define AR_IO_UART1_LSR 0x80014114 -#pragma Aux_register(0x80014114, name=>"io_uart1_lsr") - -// User extension aux register io_uart1_msr -#define AR_IO_UART1_MSR 0x80014118 -#pragma Aux_register(0x80014118, name=>"io_uart1_msr") - -// User extension aux register io_uart1_usr -#define AR_IO_UART1_USR 0x8001417c -#pragma Aux_register(0x8001417c, name=>"io_uart1_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART2_PRESENT 1 - -// User extension aux register io_uart2_clken -#define AR_IO_UART2_CLKEN 0x800142c0 -#pragma Aux_register(0x800142c0, name=>"io_uart2_clken") - -// User extension aux register io_uart2_rbr_thr_dll -#define AR_IO_UART2_RBR_THR_DLL 0x80014200 -#pragma Aux_register(0x80014200, name=>"io_uart2_rbr_thr_dll") - -// User extension aux register io_uart2_ier_dlh -#define AR_IO_UART2_IER_DLH 0x80014204 -#pragma Aux_register(0x80014204, name=>"io_uart2_ier_dlh") - -// User extension aux register io_uart2_iir_fcr -#define AR_IO_UART2_IIR_FCR 0x80014208 -#pragma Aux_register(0x80014208, name=>"io_uart2_iir_fcr") - -// User extension aux register io_uart2_lcr -#define AR_IO_UART2_LCR 0x8001420c -#pragma Aux_register(0x8001420c, name=>"io_uart2_lcr") - -// User extension aux register io_uart2_mcr -#define AR_IO_UART2_MCR 0x80014210 -#pragma Aux_register(0x80014210, name=>"io_uart2_mcr") - -// User extension aux register io_uart2_lsr -#define AR_IO_UART2_LSR 0x80014214 -#pragma Aux_register(0x80014214, name=>"io_uart2_lsr") - -// User extension aux register io_uart2_msr -#define AR_IO_UART2_MSR 0x80014218 -#pragma Aux_register(0x80014218, name=>"io_uart2_msr") - -// User extension aux register io_uart2_usr -#define AR_IO_UART2_USR 0x8001427c -#pragma Aux_register(0x8001427c, name=>"io_uart2_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_UART3_PRESENT 1 - -// User extension aux register io_uart3_clken -#define AR_IO_UART3_CLKEN 0x800143c0 -#pragma Aux_register(0x800143c0, name=>"io_uart3_clken") - -// User extension aux register io_uart3_rbr_thr_dll -#define AR_IO_UART3_RBR_THR_DLL 0x80014300 -#pragma Aux_register(0x80014300, name=>"io_uart3_rbr_thr_dll") - -// User extension aux register io_uart3_ier_dlh -#define AR_IO_UART3_IER_DLH 0x80014304 -#pragma Aux_register(0x80014304, name=>"io_uart3_ier_dlh") - -// User extension aux register io_uart3_iir_fcr -#define AR_IO_UART3_IIR_FCR 0x80014308 -#pragma Aux_register(0x80014308, name=>"io_uart3_iir_fcr") - -// User extension aux register io_uart3_lcr -#define AR_IO_UART3_LCR 0x8001430c -#pragma Aux_register(0x8001430c, name=>"io_uart3_lcr") - -// User extension aux register 
io_uart3_mcr -#define AR_IO_UART3_MCR 0x80014310 -#pragma Aux_register(0x80014310, name=>"io_uart3_mcr") - -// User extension aux register io_uart3_lsr -#define AR_IO_UART3_LSR 0x80014314 -#pragma Aux_register(0x80014314, name=>"io_uart3_lsr") - -// User extension aux register io_uart3_msr -#define AR_IO_UART3_MSR 0x80014318 -#pragma Aux_register(0x80014318, name=>"io_uart3_msr") - -// User extension aux register io_uart3_usr -#define AR_IO_UART3_USR 0x8001437c -#pragma Aux_register(0x8001437c, name=>"io_uart3_usr") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2S_RX_MST0_PRESENT 1 - -// User extension aux register io_i2s_rx_mst0_ier -#define AR_IO_I2S_RX_MST0_IER 0x8001a000 -#pragma Aux_register(0x8001a000, name=>"io_i2s_rx_mst0_ier") - -// User extension aux register io_i2s_rx_mst0_irer -#define AR_IO_I2S_RX_MST0_IRER 0x8001a004 -#pragma Aux_register(0x8001a004, name=>"io_i2s_rx_mst0_irer") - -// User extension aux register io_i2s_rx_mst0_cer -#define AR_IO_I2S_RX_MST0_CER 0x8001a00c -#pragma Aux_register(0x8001a00c, name=>"io_i2s_rx_mst0_cer") - -// User extension aux register io_i2s_rx_mst0_ccr -#define AR_IO_I2S_RX_MST0_CCR 0x8001a010 -#pragma Aux_register(0x8001a010, name=>"io_i2s_rx_mst0_ccr") - -// User extension aux register io_i2s_rx_mst0_rxffr -#define AR_IO_I2S_RX_MST0_RXFFR 0x8001a014 -#pragma Aux_register(0x8001a014, name=>"io_i2s_rx_mst0_rxffr") - -// User extension aux register io_i2s_rx_mst0_lrbr -#define AR_IO_I2S_RX_MST0_LRBR 0x8001a020 -#pragma Aux_register(0x8001a020, name=>"io_i2s_rx_mst0_lrbr") - -// User extension aux register io_i2s_rx_mst0_rrbr -#define AR_IO_I2S_RX_MST0_RRBR 0x8001a024 -#pragma Aux_register(0x8001a024, name=>"io_i2s_rx_mst0_rrbr") - -// User extension aux register io_i2s_rx_mst0_rer -#define AR_IO_I2S_RX_MST0_RER 0x8001a028 -#pragma Aux_register(0x8001a028, name=>"io_i2s_rx_mst0_rer") - -// User extension aux register io_i2s_rx_mst0_rcr -#define AR_IO_I2S_RX_MST0_RCR 0x8001a030 -#pragma Aux_register(0x8001a030, name=>"io_i2s_rx_mst0_rcr") - -// User extension aux register io_i2s_rx_mst0_isr -#define AR_IO_I2S_RX_MST0_ISR 0x8001a038 -#pragma Aux_register(0x8001a038, name=>"io_i2s_rx_mst0_isr") - -// User extension aux register io_i2s_rx_mst0_imr -#define AR_IO_I2S_RX_MST0_IMR 0x8001a03c -#pragma Aux_register(0x8001a03c, name=>"io_i2s_rx_mst0_imr") - -// User extension aux register io_i2s_rx_mst0_ror -#define AR_IO_I2S_RX_MST0_ROR 0x8001a040 -#pragma Aux_register(0x8001a040, name=>"io_i2s_rx_mst0_ror") - -// User extension aux register io_i2s_rx_mst0_rfcr -#define AR_IO_I2S_RX_MST0_RFCR 0x8001a048 -#pragma Aux_register(0x8001a048, name=>"io_i2s_rx_mst0_rfcr") - -// User extension aux register io_i2s_rx_mst0_rff -#define AR_IO_I2S_RX_MST0_RFF 0x8001a050 -#pragma Aux_register(0x8001a050, name=>"io_i2s_rx_mst0_rff") - -// User extension aux register io_i2s_rx_mst0_rxdma -#define AR_IO_I2S_RX_MST0_RXDMA 0x8001a1c0 -#pragma Aux_register(0x8001a1c0, name=>"io_i2s_rx_mst0_rxdma") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_I2S_TX_MST0_PRESENT 1 - -// User extension aux register io_i2s_tx_mst0_ier -#define AR_IO_I2S_TX_MST0_IER 0x80019000 -#pragma Aux_register(0x80019000, name=>"io_i2s_tx_mst0_ier") - -// User extension aux register io_i2s_tx_mst0_iter -#define AR_IO_I2S_TX_MST0_ITER 0x80019008 -#pragma Aux_register(0x80019008, name=>"io_i2s_tx_mst0_iter") - -// User extension aux register io_i2s_tx_mst0_cer -#define AR_IO_I2S_TX_MST0_CER 0x8001900c -#pragma Aux_register(0x8001900c, name=>"io_i2s_tx_mst0_cer") - -// User extension aux register io_i2s_tx_mst0_ccr 
-#define AR_IO_I2S_TX_MST0_CCR 0x80019010 -#pragma Aux_register(0x80019010, name=>"io_i2s_tx_mst0_ccr") - -// User extension aux register io_i2s_tx_mst0_txffr -#define AR_IO_I2S_TX_MST0_TXFFR 0x80019018 -#pragma Aux_register(0x80019018, name=>"io_i2s_tx_mst0_txffr") - -// User extension aux register io_i2s_tx_mst0_lthr -#define AR_IO_I2S_TX_MST0_LTHR 0x80019020 -#pragma Aux_register(0x80019020, name=>"io_i2s_tx_mst0_lthr") - -// User extension aux register io_i2s_tx_mst0_rthr -#define AR_IO_I2S_TX_MST0_RTHR 0x80019024 -#pragma Aux_register(0x80019024, name=>"io_i2s_tx_mst0_rthr") - -// User extension aux register io_i2s_tx_mst0_ter -#define AR_IO_I2S_TX_MST0_TER 0x8001902c -#pragma Aux_register(0x8001902c, name=>"io_i2s_tx_mst0_ter") - -// User extension aux register io_i2s_tx_mst0_tcr -#define AR_IO_I2S_TX_MST0_TCR 0x80019034 -#pragma Aux_register(0x80019034, name=>"io_i2s_tx_mst0_tcr") - -// User extension aux register io_i2s_tx_mst0_isr -#define AR_IO_I2S_TX_MST0_ISR 0x80019038 -#pragma Aux_register(0x80019038, name=>"io_i2s_tx_mst0_isr") - -// User extension aux register io_i2s_tx_mst0_imr -#define AR_IO_I2S_TX_MST0_IMR 0x8001903c -#pragma Aux_register(0x8001903c, name=>"io_i2s_tx_mst0_imr") - -// User extension aux register io_i2s_tx_mst0_tor -#define AR_IO_I2S_TX_MST0_TOR 0x80019044 -#pragma Aux_register(0x80019044, name=>"io_i2s_tx_mst0_tor") - -// User extension aux register io_i2s_tx_mst0_tfcr -#define AR_IO_I2S_TX_MST0_TFCR 0x8001904c -#pragma Aux_register(0x8001904c, name=>"io_i2s_tx_mst0_tfcr") - -// User extension aux register io_i2s_tx_mst0_tff -#define AR_IO_I2S_TX_MST0_TFF 0x80019054 -#pragma Aux_register(0x80019054, name=>"io_i2s_tx_mst0_tff") - -// User extension aux register io_i2s_tx_mst0_txdma -#define AR_IO_I2S_TX_MST0_TXDMA 0x800191c8 -#pragma Aux_register(0x800191c8, name=>"io_i2s_tx_mst0_txdma") -#define APEX_COM_ARC_HARDWARE_DFSS_IO_PDM_RX0_PRESENT 1 - -// User extension aux register io_pdm_rx0_pdm_en -#define AR_IO_PDM_RX0_PDM_EN 0x8001b000 -#pragma Aux_register(0x8001b000, name=>"io_pdm_rx0_pdm_en") - -// User extension aux register io_pdm_rx0_pdm_ren -#define AR_IO_PDM_RX0_PDM_REN 0x8001b004 -#pragma Aux_register(0x8001b004, name=>"io_pdm_rx0_pdm_ren") - -// User extension aux register io_pdm_rx0_cer -#define AR_IO_PDM_RX0_CER 0x8001b00c -#pragma Aux_register(0x8001b00c, name=>"io_pdm_rx0_cer") - -// User extension aux register io_pdm_rx0_rxffr -#define AR_IO_PDM_RX0_RXFFR 0x8001b014 -#pragma Aux_register(0x8001b014, name=>"io_pdm_rx0_rxffr") - -// User extension aux register io_pdm_rx0_rer0 -#define AR_IO_PDM_RX0_RER0 0x8001b028 -#pragma Aux_register(0x8001b028, name=>"io_pdm_rx0_rer0") - -// User extension aux register io_pdm_rx0_isr -#define AR_IO_PDM_RX0_ISR 0x8001b038 -#pragma Aux_register(0x8001b038, name=>"io_pdm_rx0_isr") - -// User extension aux register io_pdm_rx0_imr -#define AR_IO_PDM_RX0_IMR 0x8001b03c -#pragma Aux_register(0x8001b03c, name=>"io_pdm_rx0_imr") - -// User extension aux register io_pdm_rx0_ror -#define AR_IO_PDM_RX0_ROR 0x8001b040 -#pragma Aux_register(0x8001b040, name=>"io_pdm_rx0_ror") - -// User extension aux register io_pdm_rx0_rfcr -#define AR_IO_PDM_RX0_RFCR 0x8001b048 -#pragma Aux_register(0x8001b048, name=>"io_pdm_rx0_rfcr") - -// User extension aux register io_pdm_rx0_rxdma -#define AR_IO_PDM_RX0_RXDMA 0x8001b1c0 -#pragma Aux_register(0x8001b1c0, name=>"io_pdm_rx0_rxdma") - -// User extension aux register io_pdm_rx0_pdm_rr -#define AR_IO_PDM_RX0_PDM_RR 0x8001b1d0 -#pragma Aux_register(0x8001b1d0, name=>"io_pdm_rx0_pdm_rr") - 
-// User extension aux register io_pdm_rx0_cic_n -#define AR_IO_PDM_RX0_CIC_N 0x8001b1d4 -#pragma Aux_register(0x8001b1d4, name=>"io_pdm_rx0_cic_n") - -// User extension aux register io_pdm_rx0_cic_d -#define AR_IO_PDM_RX0_CIC_D 0x8001b1d8 -#pragma Aux_register(0x8001b1d8, name=>"io_pdm_rx0_cic_d") - -// User extension aux register io_pdm_rx0_dcrc -#define AR_IO_PDM_RX0_DCRC 0x8001b1dc -#pragma Aux_register(0x8001b1dc, name=>"io_pdm_rx0_dcrc") - -// User extension aux register io_pdm_rx0_brc_b0 -#define AR_IO_PDM_RX0_BRC_B0 0x8001b1e0 -#pragma Aux_register(0x8001b1e0, name=>"io_pdm_rx0_brc_b0") - -// User extension aux register io_pdm_rx0_brc_clp -#define AR_IO_PDM_RX0_BRC_CLP 0x8001b1f0 -#pragma Aux_register(0x8001b1f0, name=>"io_pdm_rx0_brc_clp") -#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_PRESENT 1 - -// User extension aux register fpu_build -#define AR_FPU_BUILD 0xc8 -#pragma Aux_register(0xc8, name=>"fpu_build") - -// User extension aux register fpu_ctrl -#define AR_FPU_CTRL 0x300 -#pragma Aux_register(0x300, name=>"fpu_ctrl") - -// User extension aux register fpu_status -#define AR_FPU_STATUS 0x301 -#pragma Aux_register(0x301, name=>"fpu_status") - -// User extension instruction fsmadd -extern int fsmadd(int,int); -#pragma intrinsic(fsmadd,opcode=>6,sub_opcode=>5, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsmsub -extern int fsmsub(int,int); -#pragma intrinsic(fsmsub,opcode=>6,sub_opcode=>6, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsmul -extern int fsmul(int,int); -#pragma intrinsic(fsmul,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsadd -extern int fsadd(int,int); -#pragma intrinsic(fsadd,opcode=>6,sub_opcode=>1, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fssub -extern int fssub(int,int); -#pragma intrinsic(fssub,opcode=>6,sub_opcode=>2, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fcvt32 -extern int fcvt32(int,int); -#pragma intrinsic(fcvt32,opcode=>6,sub_opcode=>8, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fsdiv -extern int fsdiv(int,int); -#pragma intrinsic(fsdiv,opcode=>6,sub_opcode=>7, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmp -extern int fscmp(int,int); -#pragma intrinsic(fscmp,opcode=>6,sub_opcode=>3, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmp -extern int fscmp_f(int,int); -#pragma intrinsic(fscmp_f,opcode=>6,sub_opcode=>3, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmpf -extern int fscmpf(int,int); -#pragma intrinsic(fscmpf,opcode=>6,sub_opcode=>4, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fscmpf -extern int 
fscmpf_f(int,int); -#pragma intrinsic(fscmpf_f,opcode=>6,sub_opcode=>4, set_flags => 1, flags => "zncv", effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") - -// User extension instruction fssqrt -extern int fssqrt(int); -#pragma intrinsic(fssqrt,opcode=>6,sub_opcode=>0, effects=>"auxreg=0xc8:is_read:is_written; auxreg=0x300:is_read:is_written; auxreg=0x301:is_read:is_written") -#define APEX_COM_ARC_HARDWARE_FLOATING_POINT_UNIT_FPU_DP_ASSIST_PRESENT 1 - -// User extension aux register aux_dpfp1l -#define AR_AUX_DPFP1L 0x302 -#pragma Aux_register(0x302, name=>"aux_dpfp1l") - -// User extension aux register aux_dpfp1h -#define AR_AUX_DPFP1H 0x303 -#pragma Aux_register(0x303, name=>"aux_dpfp1h") - -// User extension aux register aux_dpfp2l -#define AR_AUX_DPFP2L 0x304 -#pragma Aux_register(0x304, name=>"aux_dpfp2l") - -// User extension aux register aux_dpfp2h -#define AR_AUX_DPFP2H 0x305 -#pragma Aux_register(0x305, name=>"aux_dpfp2h") - -// User extension instruction dmulh11 -extern int dmulh11(int,int); -#pragma intrinsic(dmulh11,opcode=>6,sub_opcode=>48,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh11 -extern int dmulh11_f(int,int); -#pragma intrinsic(dmulh11_f,opcode=>6,sub_opcode=>48, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh12 -extern int dmulh12(int,int); -#pragma intrinsic(dmulh12,opcode=>6,sub_opcode=>49,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh12 -extern int dmulh12_f(int,int); -#pragma intrinsic(dmulh12_f,opcode=>6,sub_opcode=>49, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh21 -extern int dmulh21(int,int); -#pragma intrinsic(dmulh21,opcode=>6,sub_opcode=>50,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh21 -extern int dmulh21_f(int,int); -#pragma intrinsic(dmulh21_f,opcode=>6,sub_opcode=>50, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh22 -extern int dmulh22(int,int); -#pragma intrinsic(dmulh22,opcode=>6,sub_opcode=>51,blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dmulh22 -extern int dmulh22_f(int,int); -#pragma intrinsic(dmulh22_f,opcode=>6,sub_opcode=>51, set_flags => 1, flags => "zncv",blocking_cycles=> 7, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh11 -extern int daddh11(int,int); -#pragma 
intrinsic(daddh11,opcode=>6,sub_opcode=>52,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh11 -extern int daddh11_f(int,int); -#pragma intrinsic(daddh11_f,opcode=>6,sub_opcode=>52, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh12 -extern int daddh12(int,int); -#pragma intrinsic(daddh12,opcode=>6,sub_opcode=>53,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh12 -extern int daddh12_f(int,int); -#pragma intrinsic(daddh12_f,opcode=>6,sub_opcode=>53, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh21 -extern int daddh21(int,int); -#pragma intrinsic(daddh21,opcode=>6,sub_opcode=>54,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh21 -extern int daddh21_f(int,int); -#pragma intrinsic(daddh21_f,opcode=>6,sub_opcode=>54, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh22 -extern int daddh22(int,int); -#pragma intrinsic(daddh22,opcode=>6,sub_opcode=>55,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction daddh22 -extern int daddh22_f(int,int); -#pragma intrinsic(daddh22_f,opcode=>6,sub_opcode=>55, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh11 -extern int dsubh11(int,int); -#pragma intrinsic(dsubh11,opcode=>6,sub_opcode=>56,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh11 -extern int dsubh11_f(int,int); -#pragma intrinsic(dsubh11_f,opcode=>6,sub_opcode=>56, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh12 -extern int dsubh12(int,int); -#pragma intrinsic(dsubh12,opcode=>6,sub_opcode=>57,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh12 -extern int dsubh12_f(int,int); -#pragma intrinsic(dsubh12_f,opcode=>6,sub_opcode=>57, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; 
auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh21 -extern int dsubh21(int,int); -#pragma intrinsic(dsubh21,opcode=>6,sub_opcode=>58,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh21 -extern int dsubh21_f(int,int); -#pragma intrinsic(dsubh21_f,opcode=>6,sub_opcode=>58, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh22 -extern int dsubh22(int,int); -#pragma intrinsic(dsubh22,opcode=>6,sub_opcode=>59,blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dsubh22 -extern int dsubh22_f(int,int); -#pragma intrinsic(dsubh22_f,opcode=>6,sub_opcode=>59, set_flags => 1, flags => "zncv",blocking_cycles=> 5, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dexcl1 -extern int dexcl1(int,int); -#pragma intrinsic(dexcl1,opcode=>6,sub_opcode=>60, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - -// User extension instruction dexcl2 -extern int dexcl2(int,int); -#pragma intrinsic(dexcl2,opcode=>6,sub_opcode=>61, effects=>"auxreg=0x302:is_read:is_written; auxreg=0x303:is_read:is_written; auxreg=0x304:is_read:is_written; auxreg=0x305:is_read:is_written") - - -#endif - - -]]> - - - - -
- diff --git a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc index 211437bd9f4..405b9698cca 100644 --- a/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/arc_emsdp_makefile.inc @@ -18,14 +18,23 @@ ifeq ($(TARGET), arc_emsdp) TARGET_ARCH := arc ARC_TOOLCHAIN := mwdt - TCF_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp_em11d_dfss.tcf - LCF_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf - UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env - UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE)) BUILD_ARC_MLI := false ARC_MLI_PRE_COMPILED_TARGET := emsdp_em11d_em9d_dfss +ifneq ($(filter no_arc_mli,$(ALL_TAGS)),) + MLI_LIB_DIR = arc_mli_package + $(eval $(call add_third_party_download,$(EMBARC_MLI_PRE_COMPILED_URL),$(EMBARC_MLI_PRE_COMPILED_MD5),$(MLI_LIB_DIR),)) +else ifeq ($(BUILD_ARC_MLI), true) + MLI_LIB_DIR = arc_mli_$(ARC_MLI_PRE_COMPILED_TARGET) +endif + + TCF_FILE = $(PWD)/$(MAKEFILE_DIR)/downloads/$(MLI_LIB_DIR)/hw/emsdp_em11d_em9d_dfss.tcf + LCF_FILE = $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/emsdp.lcf + UBOOT_FILE := $(PWD)/$(MAKEFILE_DIR)/targets/arc/emsdp/uboot.env + UBOOT_FILE_NAME := $(notdir $(UBOOT_FILE)) + + include $(MAKEFILE_DIR)/targets/arc/arc_common.inc ARC_EXTRA_APP_SETTINGS = \ diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index db420b7fd1b..d90f8548f31 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -75,7 +75,7 @@ EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embar EMBARC_MLI_MD5 := "7eebd730db79c6834399f87e509115fb" EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC1/embARC_MLI_package.zip" -EMBARC_MLI_PRE_COMPILED_MD5 := "b85b8b89446757735342795367e37d22" +EMBARC_MLI_PRE_COMPILED_MD5 := "a66d6afff8daeb40bd3a99c42de048ab" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From deaffc11a2cdf5a70032d401ebf1b8029a3332b2 Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Wed, 6 May 2020 10:31:04 -0700 Subject: [PATCH 0122/1533] Add weak frameworks back --- tensorflow/lite/experimental/swift/BUILD.apple | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple index ea468216d7e..50130fc194a 100644 --- a/tensorflow/lite/experimental/swift/BUILD.apple +++ b/tensorflow/lite/experimental/swift/BUILD.apple @@ -13,6 +13,10 @@ package( swift_library( name = "TensorFlowLite", srcs = glob(["Sources/*.swift"]), + linkopts = [ + "-Wl,-weak_framework,CoreML", + "-Wl,-weak_framework,Metal", + ], module_name = "TensorFlowLite", tags = TFL_DEFAULT_TAGS, visibility = ios_visibility_whitelist(), From 4756483d5cc8a9ec3ff5057e60fc73f1c4f38e6d Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Tue, 5 May 2020 13:14:11 +0200 Subject: [PATCH 0123/1533] Enable Squeeze Op conversion without squeeze_dim attribute in explicit batch mode. 
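With empty squeeze_dims the converter previously rejected the node outright. After this change, in explicit batch mode with a fully static input shape the converter squeezes every size-1 dimension itself; implicit batch mode and dynamic shapes still return an Unimplemented error. As an illustrative sketch only (not part of this patch; the input signature, shapes, and function name below are made up), such a node typically comes from calling tf.squeeze without an explicit axis:

    import tensorflow as tf

    @tf.function(input_signature=[tf.TensorSpec([8, 2, 1, 3], tf.float32)])
    def squeeze_all_singletons(x):
      # No `axis` argument, so the SqueezeOp carries an empty squeeze_dims
      # attribute; with the static shape [8, 2, 1, 3] the result is [8, 2, 3].
      return tf.squeeze(x)

When every dimension is known at conversion time, the new code path can mark each singleton dimension for removal without help from the TF graph.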
---
 .../tf2tensorrt/convert/convert_nodes.cc      | 75 +++++++++++--------
 .../tf2tensorrt/convert/convert_nodes.h       |  8 +-
 .../tf2tensorrt/convert/convert_nodes_test.cc | 10 ++-
 3 files changed, 52 insertions(+), 41 deletions(-)

diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 4aec6a2512c..8bef4450cf3 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -2413,26 +2413,19 @@ Status ConvertExpandDims(OpConverterParams* params) {
 }
 
 Status Converter::SqueezeTensor(nvinfer1::ITensor* input,
-                                const std::vector<int>& trt_axes,
+                                std::vector<int>* input_dims,
                                 nvinfer1::ITensor** output) {
-  const nvinfer1::Dims dims = input->getDimensions();
-  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
-  // Mark axes to remove by setting them to 0.
-  for (int axis : trt_axes) {
-    input_dims[axis] = 0;
-  }
-
 #if IS_TRT_VERSION_GE(6, 0, 0, 0)
   // If the remaining dimensions of a squeeze operation have dynamic sizes, we
   // need to use TRT ops to build the result shape for the squeeze operation.
   // This is because IShuffleLayer::setReshapeDimensions treats -1 as a special
   // value.
-  if (absl::c_any_of(input_dims, [](int i) { return i == -1; })) {
+  if (absl::c_any_of(*input_dims, [](int i) { return i == -1; })) {
     nvinfer1::ITensor* shape = network()->addShape(*input)->getOutput(0);
     std::vector<nvinfer1::ITensor*> concat_inputs;
-    for (int i = 0; i < input_dims.size(); i++) {
+    for (int i = 0; i < input_dims->size(); i++) {
       // If input dim wasn't set to 0 earlier, we include it in new shape.
-      if (input_dims[i] != 0) {
+      if (input_dims->at(i) != 0) {
         concat_inputs.push_back(
             network()
                 ->addSlice(*shape, {1, {i}}, {1, {1}}, {1, {1}})
@@ -2452,11 +2445,12 @@ Status Converter::SqueezeTensor(nvinfer1::ITensor* input,
   }
 #endif
   // Remove all dims which are equal to 0.
-  input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0),
-                   input_dims.end());
+  input_dims->erase(std::remove(input_dims->begin(), input_dims->end(), 0),
+                    input_dims->end());
   // Reshape tensor.
   nvinfer1::Dims new_dims;
+  VLOG(2) << "input_dims" << input_dims;
   TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(*input_dims, &new_dims));
   TF_RETURN_IF_ERROR(PrepareTensorForShape(TRT_TensorOrWeights(input), new_dims,
                                            /*validation_only=*/false, output));
   return Status::OK();
@@ -2475,31 +2469,48 @@ Status ConvertSqueeze(OpConverterParams* params) {
   TFAttrs attrs(node_def);
   auto squeeze_dims = attrs.get<std::vector<int64>>("squeeze_dims");
   if (squeeze_dims.empty()) {
-    return errors::Unimplemented(
-        "Squeeze is only implemented for explicit dims, at ", node_def.name());
-  }
-  std::vector<int> trt_axes;
-  trt_axes.reserve(squeeze_dims.size());
-  for (int tf_axis : squeeze_dims) {
-    // If the axis is valid, then convert it to TRT axis, otherwise abort
-    // conversion.
-    int trt_axis;
-    TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),
-                                   params->use_implicit_batch, &trt_axis));
-    // Make sure target dimension is size 1 or unknown size (-1)
-    if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) {
-      return errors::InvalidArgument(
-          "Dimension ", tf_axis, " with size ", input_dims[trt_axis],
-          " cannot be squeezed because it must be size 1, at ",
+    if (params->use_implicit_batch || !HasStaticShape(dims)) {
+      return errors::Unimplemented(
+          "Squeeze is not implemented for empty squeeze_dims, at ",
           node_def.name());
+    } else {
+      // In explicit batch mode with a static input shape, we squeeze all
+      // singleton dimensions.
+      for (int& dim : input_dims) {
+        if (dim == 1) {
+          // Mark it for removal by setting it to 0.
+          dim = 0;
+        }
+      }
+    }
+  } else {
+    std::vector<int> trt_axes;
+    trt_axes.reserve(squeeze_dims.size());
+    for (int tf_axis : squeeze_dims) {
+      // If the axis is valid, then convert it to TRT axis, otherwise abort
+      // conversion.
+      int trt_axis;
+      TF_RETURN_IF_ERROR(ConvertAxis(tf_axis, dims.nbDims, node_def.name(),
+                                     params->use_implicit_batch, &trt_axis));
+      // Make sure target dimension is size 1 or unknown size (-1)
+      if (input_dims[trt_axis] != -1 && input_dims[trt_axis] != 1) {
+        return errors::InvalidArgument(
+            "Dimension ", tf_axis, " with size ", input_dims[trt_axis],
+            " cannot be squeezed because it must be size 1, at ",
+            node_def.name());
+      }
+      trt_axes.push_back(trt_axis);
+    }
+    // Mark axes to remove by setting them to 0.
+    for (int axis : trt_axes) {
+      input_dims[axis] = 0;
   }
   if (params->validation_only) return Status::OK();
 
   nvinfer1::ITensor* output_tensor = nullptr;
   TF_RETURN_IF_ERROR(params->converter->SqueezeTensor(
-      input_tensor.tensor(), trt_axes, &output_tensor));
+      input_tensor.tensor(), &input_dims, &output_tensor));
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return Status::OK();
 }
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
index 8608c8226ee..2092aecd657 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h
@@ -529,11 +529,9 @@ class Converter {
   // Helper function to add a squeeze op to the network.
   //
-  // The trt_axes argument lists those axes that need to be squeezed. Each axis
-  // in the list is numbered according to TRT convention (see ConvertAxis for
-  // details).
-  Status SqueezeTensor(nvinfer1::ITensor* input,
-                       const std::vector<int>& trt_axes,
+  // The input_dims argument stores the TRT dimensions of the input tensor,
+  // where the dimensions to be squeezed are replaced by 0.
+ Status SqueezeTensor(nvinfer1::ITensor* input, std::vector* input_dims, nvinfer1::ITensor** output); // Creates an IConstantLayer using 'weights' whose dimensions are specified by diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 1f30b837450..73726f53e45 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -3129,11 +3129,13 @@ TEST_P(ParameterizedOpConverterTest, ConvertSqueeze) { TestParamBase{ {1, 2, 1, 3}, // input dims {}, // input partial dims - {2, 1, 3}, // expected output dims + {2, 3}, // expected output dims {}, // axis - Status{ - error::UNIMPLEMENTED, - "Squeeze is only implemented for explicit dims, at my_squeeze"}}, + trt_mode == TrtTestMode::kExplicitBatch + ? Status::OK() + : Status{error::UNIMPLEMENTED, + "Squeeze is not implemented for empty squeeze_dims, at " + "my_squeeze"}}, TestParamBase{{1, 2, 1, 3}, {}, {2, 1, 3}, From cd8b64f7fbdc2bc8700b71dadb4c51744a752095 Mon Sep 17 00:00:00 2001 From: Bas Aarts Date: Wed, 6 May 2020 15:10:33 -0700 Subject: [PATCH 0124/1533] add -DDEBUG_BUILD to dbg profile this prevents issue https://github.com/tensorflow/tensorflow/issues/37498. The optimized AWS SDK can only be built in release mode. Fall back in debug mode See: https://github.com/TileDB-Inc/TileDB/issues/1351 --- .bazelrc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.bazelrc b/.bazelrc index cf15d0976b1..224238d7c0b 100644 --- a/.bazelrc +++ b/.bazelrc @@ -163,6 +163,8 @@ build:cuda_clang --action_env TF_CUDA_CLANG=1 build:dbg --config=opt -c dbg # for now, disable arm_neon. see: https://github.com/tensorflow/tensorflow/issues/33360 build:dbg --cxxopt -DTF_LITE_DISABLE_X86_NEON +# AWS SDK must be compiled in release mode. see: https://github.com/tensorflow/tensorflow/issues/37498 +build:dbg --copt -DDEBUG_BUILD build:tensorrt --action_env TF_NEED_TENSORRT=1 From 00165602f78b33bc07f9bb8134472bbeceac23cf Mon Sep 17 00:00:00 2001 From: Ajay P Date: Thu, 7 May 2020 00:54:02 +0000 Subject: [PATCH 0125/1533] Removed API change and added tests. --- .../python/keras/integration_test/BUILD | 10 ++ .../gradient_checkpoint_test.py | 160 ++++++++++++++++++ tensorflow/python/ops/custom_gradient.py | 9 +- 3 files changed, 173 insertions(+), 6 deletions(-) create mode 100644 tensorflow/python/keras/integration_test/gradient_checkpoint_test.py diff --git a/tensorflow/python/keras/integration_test/BUILD b/tensorflow/python/keras/integration_test/BUILD index 01c405a86ae..f92f9d14685 100644 --- a/tensorflow/python/keras/integration_test/BUILD +++ b/tensorflow/python/keras/integration_test/BUILD @@ -70,3 +70,13 @@ tf_py_test( "//tensorflow/python:extra_py_tests_deps", ], ) + +tf_py_test( + name = "gradient_checkpoint_test", + srcs = ["gradient_checkpoint_test.py"], + python_version = "PY3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python:extra_py_tests_deps", + ], +) diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py new file mode 100644 index 00000000000..df23c3abff5 --- /dev/null +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -0,0 +1,160 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.keras import layers, optimizers + + +def _get_big_cnn_model(img_dim, n_channels, num_partitions, + blocks_per_partition): + """Creates a test model whose activations are significantly larger than model size.""" + model = tf.keras.Sequential() + model.add(layers.Input(shape=(img_dim, img_dim, n_channels))) + for _ in range(num_partitions): + for _ in range(blocks_per_partition): + model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Flatten()) + model.add(layers.Dense(32, activation=tf.nn.relu)) + model.add(layers.Dense(10)) + return model + + +def _get_split_cnn_model(img_dim, n_channels, num_partitions, + blocks_per_partition): + """Creates a test model that is split into `num_partitions` smaller models""" + models = [tf.keras.Sequential() for _ in range(num_partitions)] + models[0].add(layers.Input(shape=(img_dim, img_dim, n_channels))) + for i in range(num_partitions): + model = models[i] + if i > 0: + last_shape = models[i - 1].layers[-1].output_shape + model.add(layers.Input(shape=last_shape[1:])) + for _ in range(blocks_per_partition): + model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + models[-1].add(layers.Flatten()) + models[-1].add(layers.Dense(32, activation=tf.nn.relu)) + models[-1].add(layers.Dense(10)) + return models + + +def _compute_loss(logits, labels): + return tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, + labels=labels)) + + +def _limit_gpu_memory(): + """Helper function to limit GPU memory for testing """ + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + try: + tf.config.experimental.set_virtual_device_configuration( + gpus[0], [ + tf.config.experimental.VirtualDeviceConfiguration( + memory_limit=1024) + ]) + except RuntimeError as e: + print(e) + + +def _get_dummy_data(img_dim, n_channels, batch_size): + inputs = tf.ones([batch_size, img_dim, img_dim, n_channels]) + labels = tf.ones([batch_size], dtype=tf.int64) + return inputs, labels + + +def _train_no_recompute(n_steps): + """Trains a single large model without gradient checkpointing.""" + _limit_gpu_memory() + img_dim, n_channels, batch_size = 256, 1, 4 + x, y = _get_dummy_data(img_dim, n_channels, batch_size) + model = 
_get_big_cnn_model(img_dim, + n_channels, + num_partitions=3, + blocks_per_partition=2) + optimizer = optimizers.SGD() + losses = [] + tr_vars = model.trainable_variables + for _ in range(n_steps): + with tf.GradientTape() as tape: + logits = model(x) + loss = _compute_loss(logits, y) + losses.append(loss) + grads = tape.gradient(loss, tr_vars) # tr_vars + optimizer.apply_gradients(zip(grads, tr_vars)) + del grads + return losses + + +def _train_with_recompute(n_steps): + """Trains a single large model with gradient checkpointing using tf.recompute_grad.""" + _limit_gpu_memory() + img_dim, n_channels, batch_size = 256, 1, 4 + x, y = _get_dummy_data(img_dim, n_channels, batch_size) + # This model is the same model as _get_big_cnn_model but split into 3 parts. + models = _get_split_cnn_model(img_dim, + n_channels, + num_partitions=3, + blocks_per_partition=2) + model1, model2, model3 = models + # Apply gradient checkpointing to the submodels using tf.recompute_grad. + model1_re = tf.recompute_grad(model1) + model2_re = tf.recompute_grad(model2) + model3_re = tf.recompute_grad(model3) + optimizer = optimizers.SGD() + tr_vars = model1.trainable_variables + model2.trainable_variables + model3.trainable_variables + losses = [] + for _ in range(n_steps): + with tf.GradientTape() as tape: + logits1 = model1_re(x) + logits2 = model2_re(logits1) + logits3 = model3_re(logits2) + loss = _compute_loss(logits3, y) + losses.append(loss) + grads = tape.gradient(loss, tr_vars) # tr_vars + optimizer.apply_gradients(zip(grads, tr_vars)) + del grads + return losses + + +class GradientCheckpointTest(tf.test.TestCase): + + def test_raises_oom_exception(self): + with self.assertRaises(Exception) as context: + _train_no_recompute(1) + self.assertTrue( + context.exception.__class__.__name__ == 'ResourceExhaustedError') + + def test_does_not_raise_oom_exception(self): + n_step = 2 + losses = _train_with_recompute(n_step) + self.assertTrue(len(losses) == n_step) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index a20619f5be7..a5013062936 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -406,17 +406,14 @@ def _graph_mode_decorator(f, args, kwargs): def _eager_mode_decorator(f, args, kwargs): """Implement custom gradient decorator for eager mode.""" - - trainable_vars = [] - if 'trainable_variables' in kwargs: - trainable_vars = kwargs.pop('trainable_variables') - result, grad_fn = f(*args, **kwargs) + with tape_lib.VariableWatcher() as variable_watcher: + result, grad_fn = f(*args, **kwargs) all_inputs = list(args) + list(kwargs.values()) # The variables that grad_fn needs to return gradients for are the set of # variables used that are *not* part of the inputs. 
variables = [ v.deref() # pylint: disable=g-complex-comprehension - for v in set(v.ref() for v in trainable_vars) + for v in set(v.ref() for v in variable_watcher.watched_variables()) if all(v.deref() is not i for i in all_inputs) ] grad_argspec = tf_inspect.getfullargspec(grad_fn) From d98ec8ae65c33bdbb8268e89bb5be9732526dd57 Mon Sep 17 00:00:00 2001 From: Ehsan Toosi Date: Thu, 26 Mar 2020 10:31:00 +0100 Subject: [PATCH 0126/1533] [XLA] Adapting HLO-to-LHLO-legalization to use Buffer Assignment --- .../mlir/xla/tests/hlo-legalize-to-lhlo.mlir | 53 +++-- .../xla/transforms/hlo_legalize_to_lhlo.cc | 189 +++++------------- .../compiler/mlir/xla/transforms/rewriters.h | 7 +- .../xla/service/mlir_gpu/kernel_lowering.cc | 35 +--- 4 files changed, 89 insertions(+), 195 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 262533bbf08..53296b257ae 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -1,4 +1,4 @@ -// RUN: xla-opt -hlo-legalize-to-lhlo %s -o - | FileCheck %s --dump-input-on-failure +// RUN: xla-opt -hlo-legalize-to-lhlo -buffer-placement %s -o - | FileCheck %s --dump-input-on-failure // CHECK-LABEL: func @attrs func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { @@ -13,33 +13,42 @@ func @attrs_copy(%operand: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +func @return_func(%arg0: tensor<4xf32>) -> tensor<4xf32> { + return %arg0 : tensor<4xf32> +} +// CHECK: (%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[TYPE]]) +// CHECK-NEXT: "xla_lhlo.copy"(%[[ARG0]], %[[RESULT]]) : ([[TYPE]], [[TYPE]]) -> () +// CHECK-NEXT: "xla_lhlo.terminator"() : () -> () + +// ----- + // CHECK-LABEL: func @func_op_long func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> { - // CHECK: (%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) - // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[SUB_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[MIN_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> - // CHECK-NEXT: %[[MAX_RESULT:.*]] = alloc() {temp = true} : memref<4xf32> %1 = xla_hlo.maximum %arg0, %arg1 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.maximum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) %2 = xla_hlo.add %arg0, %1 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) %3 = xla_hlo.minimum %arg0, %arg1 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.minimum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) %4 = xla_hlo.subtract %arg1, %3 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.subtract"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) %5 = xla_hlo.multiply %2, %4 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) - // CHECK-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> - // CHECK-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> - // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () - // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> return %5 : tensor<4xf32> - // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () } +// CHECK: 
(%[[NEW_ARG0:.*]]: memref<4xf32>, %[[NEW_ARG1:.*]]: memref<4xf32>, %[[RESULT:.*]]: memref<4xf32>) +// CHECK-NEXT: %[[MAX_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.maximum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MAX_RESULT]]) +// CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.add"(%[[NEW_ARG0]], %[[MAX_RESULT]], %[[ADD_RESULT]]) +// CHECK-NEXT: dealloc %[[MAX_RESULT]] : memref<4xf32> +// CHECK-NEXT: %[[MIN_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.minimum"(%[[NEW_ARG0]], %[[NEW_ARG1]], %[[MIN_RESULT]]) +// CHECK-NEXT: %[[SUB_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.subtract"(%[[NEW_ARG1]], %[[MIN_RESULT]], %[[SUB_RESULT]]) +// CHECK-NEXT: dealloc %[[MIN_RESULT]] : memref<4xf32> +// CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %[[SUB_RESULT]], %[[MUL_RESULT]]) +// CHECK-NEXT: dealloc %[[SUB_RESULT]] : memref<4xf32> +// CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) : (memref<4xf32>, memref<4xf32>) -> () +// CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<4xf32> +// CHECK-NEXT: "xla_lhlo.terminator"() : () -> () // ----- @@ -47,20 +56,20 @@ func @func_op_long(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32> func @fusion(%multiplier: memref<2x2xf32>, %summand_1: memref<2x2xf32>, %summand_2: memref<2x2xf32>, %result: memref<2x2xf32>) { // CHECK: (%{{.*}}: {{.*}}, {{.*}}: {{.*}}, {{.*}}: {{.*}}, %[[RESULT:.*]]: {{.*}}) - // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() {temp = true} : memref<2x2xf32> - // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() {temp = true} : memref<2x2xf32> + // CHECK-NEXT: %[[ADD_RESULT:.*]] = alloc() : memref<2x2xf32> %tensor_summand_1 = tensor_load %summand_1 : memref<2x2xf32> %tensor_summand_2 = tensor_load %summand_2 : memref<2x2xf32> %sum = "xla_hlo.add"(%tensor_summand_1, %tensor_summand_2) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK-NEXT: "xla_lhlo.add"(%{{.*}}, %{{.*}}, %[[ADD_RESULT]]) + // CHECK-NEXT: %[[MUL_RESULT:.*]] = alloc() : memref<2x2xf32> %tensor_multiplier = tensor_load %multiplier : memref<2x2xf32> %tensor_result = "xla_hlo.multiply"(%sum, %tensor_multiplier) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK-NEXT: "xla_lhlo.multiply"(%[[ADD_RESULT]], %{{.*}}, %[[MUL_RESULT]]) + // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<2x2xf32> // CHECK-NEXT: "xla_lhlo.copy"(%[[MUL_RESULT]], %[[RESULT]]) tensor_store %tensor_result, %result : memref<2x2xf32> - // CHECK-NEXT: dealloc %[[ADD_RESULT]] : memref<2x2xf32> // CHECK-NEXT: dealloc %[[MUL_RESULT]] : memref<2x2xf32> // CHECK-NEXT: "xla_lhlo.terminator"() : () -> () "xla_lhlo.terminator"() : () -> () diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index aa29241048b..756a38fc660 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -27,6 +27,7 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" @@ -39,16 +40,10 @@ namespace xla_hlo { namespace { constexpr StringRef kTempBufferAttr = "temp"; - -/// Returns DeallocOp to ensure that CopyOp is not inserted after dealloc. -Operation* FindInsertionPointForCopy(Value value) { - for (const auto& user : value.getUsers()) { - if (auto dealloc = dyn_cast(user)) { - return user; - } - } - return nullptr; -} +template +using BaseOpConversion = BufferAssignmentOpConversionPattern; +using StdReturnOpConverter = NonVoidToVoidReturnOpConverter< + mlir::ReturnOp, xla_lhlo::TerminatorOp, xla_lhlo::CopyOp>; Value InsertDynamicAllocAndDealloc(Location loc, Value result, Value shape_operand, @@ -92,8 +87,9 @@ Value InsertDynamicAllocAndDealloc(Location loc, Value result, return alloc; } -Value InsertAllocAndDealloc(Location loc, Value result, - ConversionPatternRewriter* rewriter) { +Value InsertAlloc(Location loc, OpResult result, + BufferAssignmentPlacer* bufferAssignment, + ConversionPatternRewriter* rewriter) { auto result_type = result.getType().dyn_cast(); if (!result_type || !result_type.hasStaticShape()) { result.getDefiningOp()->emitOpError() @@ -101,31 +97,21 @@ Value InsertAllocAndDealloc(Location loc, Value result, } auto memref_type = MemRefType::get(result_type.getShape(), result_type.getElementType()); - - Operation* op = result.getDefiningOp(); - auto block = op->getBlock(); - - OpBuilder allocBuilder(op); - allocBuilder.setInsertionPointToStart(block); // Inserting at the beginning - auto alloc = allocBuilder.create(loc, memref_type); - - alloc.setAttr(kTempBufferAttr, rewriter->getBoolAttr(true)); - - allocBuilder.setInsertionPoint(block, std::prev(block->end())); - allocBuilder.create(loc, alloc); - + OpBuilder::InsertionGuard guard(*rewriter); + rewriter->restoreInsertionPoint( + bufferAssignment->computeAllocPosition(result)); + auto alloc = rewriter->create(loc, memref_type); return alloc; } template -class HloToLhloOpConverter : public ConversionPattern { +class HloToLhloOpConverter : public BaseOpConversion { public: - explicit HloToLhloOpConverter(MLIRContext* context) - : ConversionPattern(HloOpTy::getOperationName(), 1, context) {} - + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + HloOpTy hloOp, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { + Operation* op = hloOp.getOperation(); const auto& original_results = op->getResults(); SmallVector buffer_args(operands.begin(), operands.end()); for (auto result : llvm::enumerate(original_results)) { @@ -135,8 +121,8 @@ class HloToLhloOpConverter : public ConversionPattern { return failure(); } if (resultType.hasStaticShape()) { - buffer_args.push_back( - InsertAllocAndDealloc(op->getLoc(), result.value(), &rewriter)); + buffer_args.push_back(InsertAlloc(op->getLoc(), result.value(), + this->bufferAssignment, &rewriter)); } else { SmallVector results_shape; auto shape_type_op = dyn_cast(op); @@ -156,9 +142,9 @@ class HloToLhloOpConverter : public ConversionPattern { }; struct HloToLhloDynamicBroadcastInDimOpConverter - : public OpConversionPattern { +: public BaseOpConversion { public: - using 
OpConversionPattern::OpConversionPattern; + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( xla_hlo::DynamicBroadcastInDimOp op, ArrayRef operands, @@ -175,10 +161,9 @@ struct HloToLhloDynamicBroadcastInDimOpConverter } }; -struct HloToLhloReduceOpConverter - : public OpConversionPattern { +struct HloToLhloReduceOpConverter : public BaseOpConversion { public: - using OpConversionPattern::OpConversionPattern; + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( xla_hlo::ReduceOp op, ArrayRef operands, @@ -194,7 +179,8 @@ struct HloToLhloReduceOpConverter const auto& original_results = op.getResults(); SmallVector buffer_args(operands.begin(), operands.end()); for (auto result : original_results) { - buffer_args.push_back(InsertAllocAndDealloc(loc, result, &rewriter)); + buffer_args.push_back( + InsertAlloc(loc, result, this->bufferAssignment, &rewriter)); } auto new_op = rewriter.create( loc, llvm::None, buffer_args, op.getAttrs()); @@ -230,12 +216,12 @@ struct HloToLhloReduceOpConverter } }; -class HloToLhloTensorLoadOpConverter : public ConversionPattern { +class HloToLhloTensorLoadOpConverter + : public BaseOpConversion { public: - explicit HloToLhloTensorLoadOpConverter(MLIRContext* context) - : ConversionPattern(TensorLoadOp::getOperationName(), 1, context) {} + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + mlir::TensorLoadOp op, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { rewriter.replaceOp(op, operands); return success(); @@ -243,13 +229,13 @@ class HloToLhloTensorLoadOpConverter : public ConversionPattern { }; // TODO(b/137624192): Rewrite into a copy and elide copy if possible. -class HloToLhloTensorStoreOpConverter : public ConversionPattern { +class HloToLhloTensorStoreOpConverter + : public BaseOpConversion { public: - explicit HloToLhloTensorStoreOpConverter(MLIRContext* context) - : ConversionPattern(TensorStoreOp::getOperationName(), 1, context) {} + using BaseOpConversion::BaseOpConversion; LogicalResult matchAndRewrite( - Operation* op, ArrayRef operands, + mlir::TensorStoreOp op, ArrayRef operands, ConversionPatternRewriter& rewriter) const final { rewriter.replaceOpWithNewOp( op, llvm::None, operands.front(), operands.back()); @@ -291,7 +277,6 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () // "xla_lhlo.multiply"(%0, %arg0, %arg3) : // (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> () -// dealloc %0 : memref<2x2xf32> // "xla_lhlo.terminator"() : () -> () // }) : () -> () // return @@ -313,14 +298,13 @@ class HloToLhloTensorStoreOpConverter : public ConversionPattern { // %arg1: memref<4xf32>, // %arg2: memref<4xf32>) { // %0 = alloc() : memref<4xf32> -// %1 = alloc() : memref<4xf32> + // "xla_lhlo.maximum"(%arg0, %arg1, %0) : // (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> () +// %1 = alloc() : memref<4xf32> // "xla_lhlo.add"(%arg0, %0, %1) : // (memref<4xf32>, memref<4xf32>, memref<4xf32>) -> () // "xla_lhlo.copy"(%1, %arg2) : (memref<4xf32>, memref<4xf32>) -> () -// dealloc %0 : memref<4xf32> -// dealloc %1 : memref<4xf32> // "xla_lhlo.terminator"() : () -> () // } @@ -346,101 +330,25 @@ struct HloLegalizeToLhlo }); auto module = getOperation(); - populateHLOToLHLOConversionPattern(module.getContext(), &patterns); - - // Do partial conversion so we can have unknown ops in tests. 
- if (failed(applyPartialConversion(module, target, patterns, nullptr))) { - signalPassFailure(); - } + BufferAssignmentTypeConverter converter; + module.walk([&](FuncOp func) { + BufferAssignmentPlacer bufferAssignment(func); + OwningRewritePatternList patterns; + populateHLOToLHLOConversionPattern(func.getContext(), &bufferAssignment, + &converter, &patterns); + return WalkResult( + applyPartialConversion(func, target, patterns, &converter)); + }); } }; - -Type ConvertType(Type t) { - if (auto tensorType = t.dyn_cast()) { - return MemRefType::get(tensorType.getShape(), tensorType.getElementType()); - } - return t; -} - } // namespace -/// Transforms FuncOp arguments and results from tensors to buffers. Tensor -/// results are converted to memrefs and appended to the argument list. -class HloToLhloFuncOpConverter : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - FuncOp funcOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - if (funcOp.getBody().getBlocks().size() > 1) { - funcOp.emitOpError() << "tensor to buffer conversion expects a single " - "block in the region containing the operation"; - return failure(); - } - - auto funcType = funcOp.getType(); - - TypeConverter::SignatureConversion conversion(funcType.getNumInputs()); - for (auto argType : llvm::enumerate(funcType.getInputs())) { - conversion.addInputs(argType.index(), ConvertType(argType.value())); - } - for (auto resType : funcType.getResults()) { - conversion.addInputs(ConvertType(resType)); - } - rewriter.updateRootInPlace(funcOp, [&] { - funcOp.setType( - rewriter.getFunctionType(conversion.getConvertedTypes(), llvm::None)); - rewriter.applySignatureConversion(&funcOp.getBody(), conversion); - }); - return success(); - } -}; - -/// Transforms ReturnOp to LhloTerminator. CopyOp is inserted to copy each -/// result to the corresponding buffer argument. 
-class StdToLhloReturnOpConverter : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - mlir::ReturnOp returnOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - auto numReturnValues = returnOp.getNumOperands(); - auto funcOp = returnOp.getParentOfType(); - auto numFuncArgs = funcOp.getNumArguments(); - auto loc = returnOp.getLoc(); - - for (auto operand : llvm::enumerate(operands)) { - auto returnArgNumber = numFuncArgs - numReturnValues + operand.index(); - auto dstBuffer = funcOp.getArgument(returnArgNumber); - if (dstBuffer == operand.value()) { - continue; - } - - auto dealloc = FindInsertionPointForCopy(operand.value()); - - if (dealloc == nullptr) { - returnOp.emitOpError() - << "Missing dealloc for operand " << operand.index(); - return failure(); - } - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPoint(dealloc); - rewriter.create(loc, llvm::None, operand.value(), - funcOp.getArgument(returnArgNumber)); - } - rewriter.replaceOpWithNewOp(returnOp); - return success(); - } -}; - -void populateHLOToLHLOConversionPattern(MLIRContext* context, - OwningRewritePatternList* patterns) { +void populateHLOToLHLOConversionPattern( + MLIRContext* context, BufferAssignmentPlacer* bufferAssignment, + TypeConverter* converter, OwningRewritePatternList* patterns) { // clang-format off patterns->insert< HloToLhloDynamicBroadcastInDimOpConverter, - HloToLhloFuncOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -472,8 +380,9 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, HloToLhloReduceOpConverter, HloToLhloTensorLoadOpConverter, HloToLhloTensorStoreOpConverter, - StdToLhloReturnOpConverter - >(context); + FunctionAndBlockSignatureConverter, + StdReturnOpConverter + >(context, bufferAssignment, converter); // clang-format on } diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index ad81cda19b9..e4f5c9347af 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -23,6 +23,7 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // from @llvm-project namespace mlir { +class BufferAssignmentPlacer; namespace xla_hlo { // Collection of rewrite patterns for lowering a general dot product. @@ -38,9 +39,9 @@ void PopulateXlaToStdPatterns(OwningRewritePatternList *patterns, MLIRContext *ctx); // Collection of rewrite patterns for lowering of HLO to LHLO dialect. -void populateHLOToLHLOConversionPattern(MLIRContext *context, - OwningRewritePatternList *patterns); - +void populateHLOToLHLOConversionPattern( + MLIRContext* context, BufferAssignmentPlacer* bufferAssignment, + TypeConverter* converter, OwningRewritePatternList* patterns); // Collection of rewrite patterns for lowering of HLO to Linalg dialect. void populateHLOToLinalgConversionPattern(MLIRContext *context, OwningRewritePatternList *patterns); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 33d3690d4ab..c806e95730b 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -45,6 +45,7 @@ limitations under the License. 
#include "mlir/IR/Region.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/LoopUtils.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project @@ -60,34 +61,6 @@ namespace { using ::mlir::xla_lhlo::FusionOp; -// Following are some small transformations that are required to clean up code -// after lowering from linalg to loops. - -// A simple pass that applies lowering of HLO to LHLO only within LHLO ops that -// contain regions with HLO ops, e.g. FusionOp, ReduceOp, SelectAndScatterOp. -// This is needed, as these ops are not closed from above and hence nested pass -// managers can not be applied. -struct NestedHloRegionsConverter - : public mlir::PassWrapper { - void runOnFunction() override { - auto& ctx = getContext(); - mlir::OwningRewritePatternList patterns; - mlir::ConversionTarget target(ctx); - target.addLegalDialect<::mlir::xla_lhlo::XlaLhloDialect>(); - ::mlir::xla_hlo::populateHLOToLHLOConversionPattern(&ctx, &patterns); - - getFunction().walk([&](mlir::Operation* op) { - if (op->getNumRegions() == 0) { - return; - } - if (failed(applyPartialConversion(op, target, patterns, nullptr))) { - signalPassFailure(); - } - }); - } -}; - // Replaces a FusionOp by the operations contained in its region. struct FusionOpRemover : public mlir::PassWrapper { @@ -436,8 +409,10 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, tiling_for_unrolling.append(tile_sizes.begin(), tile_sizes.end()); } - // First, lower bodies of LHLO operations that contain HLO ops. - pm.addPass(absl::make_unique()); + // Legalize from HLO to LHLO. + pm.addPass(::mlir::xla_hlo::createLegalizeToLhloPass()); + // Moving `AllocOp`s and inserting missing `DeallocOp`s + pm.addPass(::mlir::createBufferPlacementPass()); // Next, we can strip the outer fusion operation. pm.addPass(absl::make_unique()); // Remove unnecessary LHLO copies. From 75d1fdd15ded19724c975dfd02fe08b47f5fef11 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 7 May 2020 05:37:10 -0700 Subject: [PATCH 0127/1533] Let the test use the new ruy::ReferenceMul. PiperOrigin-RevId: 310340723 Change-Id: I3302cb48344bd0ca490aee367b6483366436f675 --- tensorflow/lite/kernels/BUILD | 4 ++-- tensorflow/lite/kernels/cpu_backend_gemm_test.cc | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 5b6fe4b5b21..6f6d111fd77 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -357,9 +357,9 @@ cc_test( ":cpu_backend_context", ":cpu_backend_gemm", "@com_google_googletest//:gtest", - # ruy's reference path provides the reference implementation + # ruy:reference_mul provides the reference implementation # that this test compares against. - "@ruy//ruy", + "@ruy//ruy:reference_mul", ], ) diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc index 7f148dfa9f1..110eb3a07ef 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc +++ b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include #include -#include "ruy/ruy.h" // from @ruy +#include "ruy/reference_mul.h" // from @ruy #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm_params.h" @@ -353,8 +353,7 @@ void ReferenceGemm( ruy::MulParams ruy_mul_params; cpu_backend_gemm::detail::MakeRuyMulParams(params, &ruy_mul_params); - ruy::Mul(ruy_lhs, ruy_rhs, ruy_mul_params, - context->ruy_context(), &ruy_dst); + ruy::ReferenceMul(ruy_lhs, ruy_rhs, ruy_mul_params, &ruy_dst); } template Date: Thu, 7 May 2020 14:47:57 +0000 Subject: [PATCH 0128/1533] Add "//tensorflow/compiler/mlir/tensorflow:graph_optimization_pass_registration" to tensorflow/python/_pywrap_mlir.so This commit adds build target //tensorflow/compiler/mlir/tensorflow:graph_optimization_pass_registration to tensorflow/python/_pywrap_mlir.so, so that graph optimization could be available in tf-nightly pip wheel. In the past while graph_optimization_pass_registration is available, it is not packaged with tf-nightly so it is not possible to access graph optimization if installed from pip wheel. In order to enable graph optimization pass for pip wheel, graph_optimization_pass_registration target has to be included in "somewhere" that will be loaded when pip installed tensorflow is imported. A natural place would be libtensorflow_framework.so which is the core .so always loaded with `import tensorflow`. It is possible to include graph_optimization_pass_registration to target libtensorflow_framework.so, see last attempt on this route: https://github.com/tensorflow/tensorflow/pull/39231 However, this caused many test failures like: ``` : CommandLine Error: Option 'help-list' registered more than once! LLVM ERROR: inconsistency in registered CommandLine options ``` The reason is that many tests as a binary also have a copy of the LLVM in its binary, thus causing multiple copies (one in binary, another one in libtensorflow_framework.so where many tests depends on). Because there are so many tests, it is really hard to make all the needed changes without break somewhere else. This commit takes a different approach to include graph_optimization_pass_registration to tensorflow/python/_pywrap_mlir.so. This shared object is dedicated to mlir related APIs. The current exposed one is `tf.mlir.experimental.convert_graph_def`. Because this tensorflow/python/_pywrap_mlir.so already depends on llvm, place graph_optimization_pass will avoid multiple copies of llvm in multiple locations. This tensorflow/python/_pywrap_mlir.so is also loaded with `import tensorflow` as part of the python binding. Ideally it probably would be still preferrale to get the graph_optimization_pass into libtensorflow_framework.so. That will be investigated further later. Signed-off-by: Yong Tang --- tensorflow/compiler/mlir/python/BUILD | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tensorflow/compiler/mlir/python/BUILD b/tensorflow/compiler/mlir/python/BUILD index 666f89ac72f..1189a926383 100644 --- a/tensorflow/compiler/mlir/python/BUILD +++ b/tensorflow/compiler/mlir/python/BUILD @@ -12,6 +12,22 @@ cc_library( "//tensorflow/c:tf_status_helper", "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", "//tensorflow/compiler/mlir/tensorflow:error_util", + # (yongtang) The graph_optimization_pass_registration needs to be part + # of a shared object that will be loaded whenever `import tensorflow` + # is run. The natural place is libtensorflow_framework.so. 
+ # While adding graph_optimization_pass_registration to + # libtensorflow_framework.so is possible with some modification in + # dependency, many tests will fail due to multiple copies of LLVM. + # See https://github.com/tensorflow/tensorflow/pull/39231 for details. + # Alternatively, we place graph_optimization_pass_registration here + # because: + # - tensorflow/python/_pywrap_mlir.so already depends on LLVM anyway + # - tensorflow/python/_pywrap_mlir.so always loaded as part of python + # binding + # TODO: It might be still preferrable to place graph_optimization_pass + # as part of the libtensorflow_framework.so, as it is the central + # place for core related components. + "//tensorflow/compiler/mlir/tensorflow:graph_optimization_pass_registration", "//tensorflow/compiler/mlir/tensorflow:import_utils", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", From 0cc3e612bd54af061c21f00e3de1767260283c1f Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Thu, 7 May 2020 09:05:20 -0700 Subject: [PATCH 0129/1533] [XLA] Add a memory space propagation pass. PiperOrigin-RevId: 310371097 Change-Id: I7b58fc0c2a67d69a4b68136acc12e4ca9f16c464 --- tensorflow/compiler/xla/service/BUILD | 23 ++ .../xla/service/memory_space_propagation.cc | 67 ++++++ .../xla/service/memory_space_propagation.h | 46 ++++ .../service/memory_space_propagation_test.cc | 203 ++++++++++++++++++ 4 files changed, 339 insertions(+) create mode 100644 tensorflow/compiler/xla/service/memory_space_propagation.cc create mode 100644 tensorflow/compiler/xla/service/memory_space_propagation.h create mode 100644 tensorflow/compiler/xla/service/memory_space_propagation_test.cc diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 799d5654840..348200051ef 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -3234,6 +3234,29 @@ tf_cc_test( ], ) +cc_library( + name = "memory_space_propagation", + srcs = ["memory_space_propagation.cc"], + hdrs = ["memory_space_propagation.h"], + deps = [ + ":hlo", + ":hlo_dataflow_analysis", + ":hlo_pass", + ], +) + +tf_cc_test( + name = "memory_space_propagation_test", + srcs = ["memory_space_propagation_test.cc"], + deps = [ + ":hlo_parser", + ":memory_space_propagation", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "hlo_dce", srcs = ["hlo_dce.cc"], diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.cc b/tensorflow/compiler/xla/service/memory_space_propagation.cc new file mode 100644 index 00000000000..80eb4017477 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_propagation.cc @@ -0,0 +1,67 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_propagation.h" + +namespace xla { + +StatusOr MemorySpacePropagation::Run(HloModule* module) { + bool modified = false; + TF_ASSIGN_OR_RETURN(auto dataflow_analysis, + HloDataflowAnalysis::Run(*module)); + dataflow_analysis_ = std::move(dataflow_analysis); + + for (HloComputation* computation : module->MakeNonfusionComputations()) { + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kFusion) { + // Propagate the operand subshapes. + for (int operand_idx = 0; operand_idx < instruction->operand_count(); + ++operand_idx) { + modified |= + PropagateSubshapes(instruction->operand(operand_idx)->shape(), + instruction->fused_parameter(operand_idx)); + } + + // Propagate output subshapes. + modified |= PropagateSubshapes(instruction->shape(), + instruction->fused_expression_root()); + } + } + } + return modified; +} + +bool MemorySpacePropagation::PropagateSubshapes( + const Shape& caller_shape, const HloInstruction* callee_instruction) const { + bool modified = false; + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(caller_shape)) { + int64 memory_space = indexed_shape.shape.layout().memory_space(); + const HloValue& value = dataflow_analysis_->GetUniqueValueAt( + callee_instruction, indexed_shape.index); + + for (const HloPosition& position : value.positions()) { + Shape* shape = ShapeUtil::GetMutableSubshape( + position.instruction->mutable_shape(), position.index); + if (shape->layout().memory_space() != memory_space) { + shape->mutable_layout()->set_memory_space(memory_space); + modified = true; + } + } + } + return modified; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_propagation.h b/tensorflow/compiler/xla/service/memory_space_propagation.h new file mode 100644 index 00000000000..65a1dfd14a6 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_propagation.h @@ -0,0 +1,46 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_ + +#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// This is a legalization pass that propagates the memory space in the layout to +// the fusion computations. 
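+//
+// For example (illustrative): if a fusion operand lives in memory space S(1),
+// e.g. s32[6]{0:T(128)S(1)}, the corresponding parameter inside the fused
+// computation is rewritten to carry the same S(1) memory space, so the fusion
+// body stays consistent with its callers.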
+class MemorySpacePropagation : public HloModulePass {
+ public:
+  ~MemorySpacePropagation() override = default;
+  absl::string_view name() const override { return "memory-space-propagation"; }
+  StatusOr Run(HloModule* module) override;
+
+ private:
+  // Given the caller shape (operand or output) and its corresponding
+  // instruction in the fused computation (parameter or root), propagates the
+  // memory space to all the subshapes in the callee side. Returns true if the
+  // module is modified.
+  bool PropagateSubshapes(const Shape& caller_shape,
+                          const HloInstruction* callee_instruction) const;
+
+  std::unique_ptr dataflow_analysis_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_PROPAGATION_H_
diff --git a/tensorflow/compiler/xla/service/memory_space_propagation_test.cc b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc
new file mode 100644
index 00000000000..8d74958f6aa
--- /dev/null
+++ b/tensorflow/compiler/xla/service/memory_space_propagation_test.cc
@@ -0,0 +1,203 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/memory_space_propagation.h"
+
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class MemorySpacePropagationTest : public HloTestBase {
+ public:
+  MemorySpacePropagationTest()
+      : HloTestBase(),
+        verifier_(/*layout_sensitive=*/false, /*allow_mixed_precision*/ false) {
+  }
+
+  Status Verify(HloModule* module) { return verifier_.Run(module).status(); }
+
+ private:
+  HloVerifier verifier_;
+};
+
+TEST_F(MemorySpacePropagationTest, NoMemorySpace) {
+  absl::string_view hlo_string = R"(
+  HloModule NoMemorySpace
+
+  %fused_computation {
+    %param_1.3 = s32[1]{0:T(128)} parameter(1)
+    %constant.2 = s32[]{:T(128)} constant(-2147483648)
+    %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5
+    %param_2.3 = s32[5]{0:T(128)} parameter(2)
+    %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0
+    %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3)
+    %param_0.1 = s32[6]{0:T(128)} parameter(0)
+    ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1)
+  }
+
+  ENTRY %entry {
+    %param0 = s32[6]{0:T(128)} parameter(0)
+    %param1 = s32[1]{0:T(128)} parameter(1)
+    %param2 = s32[5]{0:T(128)} parameter(2)
+    %arg0 = s32[6]{0:T(128)} copy(%param0)
+    %arg1 = s32[1]{0:T(128)} copy(%param1)
+    %arg2 = s32[5]{0:T(128)} copy(%param2)
+    %fusion = s32[6]{0:T(128)} fusion(s32[6]{0:T(128)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)} %arg2), kind=kLoop, calls=%fused_computation
+    ROOT %root = s32[6]{0:T(128)} copy(%fusion)
+  }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto
module, + ParseAndReturnVerifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_FALSE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_ASSERT_OK_AND_ASSIGN(auto ref, ParseAndReturnVerifiedModule(hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, NonTupleOutput) { + absl::string_view hlo_string = R"( + HloModule NonTupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + ROOT %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule NonTupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} parameter(0) + ROOT %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = s32[6]{0:T(128)S(1)} fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + ROOT %root = s32[6]{0:T(128)} copy(%fusion) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +TEST_F(MemorySpacePropagationTest, TupleOutput) { + absl::string_view hlo_string = R"( + HloModule TupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), 
padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)} parameter(0) + %add.0 = s32[6]{0:T(128)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + %multiply.0 = s32[6]{0:T(128)} multiply(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %tuple = (s32[6]{0:T(128)}, s32[6]{0:T(128)}) tuple(%add.0, %multiply.0) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + %gte0 = s32[6]{0:T(128)S(1)} get-tuple-element(%fusion), index=0 + %gte1 = s32[6]{0:T(128)} get-tuple-element(%fusion), index=1 + ROOT %root = s32[6]{0:T(128)} add(%gte0, %gte1) + } + )"; + absl::string_view expected_hlo_string = R"( + HloModule TupleOutput + + %fused_computation { + %param_1.3 = s32[1]{0:T(128)} parameter(1) + %constant.2 = s32[]{:T(128)} constant(-2147483648) + %pad.2 = s32[6]{0:T(128)} pad(s32[1]{0:T(128)} %param_1.3, s32[]{:T(128)} %constant.2), padding=0_5 + %param_2.3 = s32[5]{0:T(128)S(1)} parameter(2) + %pad.3 = s32[6]{0:T(128)} pad(s32[5]{0:T(128)} %param_2.3, s32[]{:T(128)} %constant.2), padding=1_0 + %maximum.1 = s32[6]{0:T(128)} maximum(s32[6]{0:T(128)} %pad.2, s32[6]{0:T(128)} %pad.3) + %param_0.1 = s32[6]{0:T(128)S(1)} parameter(0) + %add.0 = s32[6]{0:T(128)S(1)} add(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + %multiply.0 = s32[6]{0:T(128)} multiply(s32[6]{0:T(128)} %maximum.1, s32[6]{0:T(128)} %param_0.1) + ROOT %tuple = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) tuple(%add.0, %multiply.0) + } + + ENTRY %entry { + %param0 = s32[6]{0:T(128)} parameter(0) + %param1 = s32[1]{0:T(128)} parameter(1) + %param2 = s32[5]{0:T(128)} parameter(2) + %arg0 = s32[6]{0:T(128)S(1)} copy(%param0) + %arg1 = s32[1]{0:T(128)} copy(%param1) + %arg2 = s32[5]{0:T(128)S(1)} copy(%param2) + %fusion = (s32[6]{0:T(128)S(1)}, s32[6]{0:T(128)}) fusion(s32[6]{0:T(128)S(1)} %arg0, s32[1]{0:T(128)} %arg1, s32[5]{0:T(128)S(1)} %arg2), kind=kLoop, calls=%fused_computation + %gte0 = s32[6]{0:T(128)S(1)} get-tuple-element(%fusion), index=0 + %gte1 = s32[6]{0:T(128)} get-tuple-element(%fusion), index=1 + ROOT %root = s32[6]{0:T(128)} add(%gte0, %gte1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(hlo_string)); + MemorySpacePropagation memory_space_propagation; + EXPECT_TRUE(memory_space_propagation.Run(module.get()).ValueOrDie()); + TF_EXPECT_OK(Verify(module.get())); + TF_ASSERT_OK_AND_ASSIGN(auto ref, + ParseAndReturnVerifiedModule(expected_hlo_string)); + EXPECT_EQ(module->Hash(), ref->Hash()); +} + +} // namespace +} // namespace xla From 178b6171492285252e176960ca33de645bf8d0ee Mon Sep 17 00:00:00 2001 From: Keith Smiley Date: Thu, 7 May 2020 09:13:46 -0700 Subject: [PATCH 0130/1533] Add to the cc_library too --- tensorflow/lite/experimental/ios/BUILD.apple | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index a53dd328de9..8e7b32eba91 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -66,6 +66,10 @@ cc_library( 
"//tensorflow/lite/delegates/gpu:metal_delegate.h", "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h", ], + linkopts = [ + "-Wl,-weak_framework,CoreML", + "-Wl,-weak_framework,Metal", + ], tags = [ "nobuilder", "swift_module=TensorFlowLiteC", From 96f4a930dbdd1b5b3c73d262851bb0b867ea0117 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 09:24:28 -0700 Subject: [PATCH 0131/1533] Add named size and count methods for arg, result and var methods to AOT models. PiperOrigin-RevId: 310375046 Change-Id: I3fd5c7fbdcfe141449a0a4d6827f6e5fe14b4e0b --- tensorflow/compiler/aot/codegen.cc | 21 ++++++++ tensorflow/compiler/aot/codegen_test_h.golden | 48 +++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index c9a36b88795..e4df3090046 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -131,6 +131,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, TF_RETURN_IF_ERROR(XLATypeToCpp(shape.element_type(), &type)); std::vector dim_vars; string dim_sizes, indices; + int count = 1; if (shape.rank() == 0 || (shape.dimensions_size() == 1 && shape.dimensions(0) == 1)) { dim_sizes = "[1]"; @@ -140,6 +141,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, dim_vars.push_back(absl::StrCat("size_t dim", dim)); dim_sizes += absl::StrCat("[", shape.dimensions(dim), "]"); indices += absl::StrCat("[dim", dim, "]"); + count *= shape.dimensions(dim); } } rewrites->push_back({"{{I}}", absl::StrCat(i)}); @@ -147,6 +149,7 @@ Status AddRewritesForShape(int i, const xla::Shape& shape, rewrites->push_back({"{{DIM_VARS}}", absl::StrJoin(dim_vars, ", ")}); rewrites->push_back({"{{DIM_SIZES}}", dim_sizes}); rewrites->push_back({"{{INDICES}}", indices}); + rewrites->push_back({"{{COUNT}}", absl::StrCat(count)}); return Status::OK(); } @@ -199,6 +202,12 @@ Status GenArgMethods(const tf2xla::Config& config, return (*static_cast( arg_data({{I}}))){{INDICES}}; } + int arg{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int arg{{NAME}}_count() const { + return {{COUNT}}; + } )"; *methods += RewriteWithName(absl::StrCat(i), code, rewrites); if (!config.feed(i).name().empty()) { @@ -246,6 +255,12 @@ Status GenResultMethods(const tf2xla::Config& config, return (*static_cast( result_data({{I}}))){{INDICES}}; } + int result{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int result{{NAME}}_count() const { + return {{COUNT}}; + } )"; *methods += RewriteWithName(absl::StrCat(i), code, rewrites); if (!config.fetch(i).name().empty()) { @@ -281,6 +296,12 @@ Status GenVariableMethods(const tf2xla::Config& config, return (*static_cast( arg_data({{I}}))){{INDICES}}; } + int var_{{NAME}}_size() const { + return {{COUNT}} * sizeof({{TYPE}}); + } + int var_{{NAME}}_count() const { + return {{COUNT}}; + } )"; const tf2xla::Variable& var = config.variable(i - config.feed_size()); rewrites.emplace_back("{{MAYBE_CONST}}", var.readonly() ? 
"const " : ""); diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index af58ca233f0..d011279dbb7 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -138,6 +138,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(0)))[dim0][dim1]; } + int arg0_size() const { + return 2 * sizeof(float); + } + int arg0_count() const { + return 2; + } void set_arg_myfeed_data(const void* data) { set_arg_data(0, data); @@ -156,6 +162,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(0)))[dim0][dim1]; } + int arg_myfeed_size() const { + return 2 * sizeof(float); + } + int arg_myfeed_count() const { + return 2; + } void set_arg1_data(const void* data) { set_arg_data(1, data); @@ -174,6 +186,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(1)))[dim0][dim1]; } + int arg1_size() const { + return 12 * sizeof(tensorflow::int64); + } + int arg1_count() const { + return 12; + } // Result methods for managing output buffers. Buffers are in row-major order. // Must only be called after a successful Run call. There is a set of methods @@ -204,6 +222,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( result_data(0)))[dim0][dim1]; } + int result0_size() const { + return 30 * sizeof(tensorflow::uint32); + } + int result0_count() const { + return 30; + } tensorflow::uint32* result_myfetch_data() { return static_cast(result_data(0)); @@ -219,6 +243,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( result_data(0)))[dim0][dim1]; } + int result_myfetch_size() const { + return 30 * sizeof(tensorflow::uint32); + } + int result_myfetch_count() const { + return 30; + } // Methods for managing variable buffers. Buffers are in row-major order. // @@ -261,6 +291,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(2)))[0]; } + int var_myvar_readonly_size() const { + return 1 * sizeof(float); + } + int var_myvar_readonly_count() const { + return 1; + } void set_var_myvar_data(float* data) { set_arg_data(3, data); @@ -279,6 +315,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(3)))[0]; } + int var_myvar_size() const { + return 1 * sizeof(float); + } + int var_myvar_count() const { + return 1; + } void set_var_myvar2_data(tensorflow::int32* data) { set_arg_data(4, data); @@ -297,6 +339,12 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { return (*static_cast( arg_data(4)))[dim0]; } + int var_myvar2_size() const { + return 5 * sizeof(tensorflow::int32); + } + int var_myvar2_count() const { + return 5; + } private: // Number of buffers for the compiled computation. From 1d4b4a6706b263377ccab18d94161c9ef6ca0133 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Thu, 7 May 2020 09:26:23 -0700 Subject: [PATCH 0132/1533] Export the only one function of a saved model only when it matches with exported_names argument. 
PiperOrigin-RevId: 310375397 Change-Id: I93fb94f1c2e2d77e39dc4269206438f48cad0e46 --- .../python/saved_model_to_tfl_flatbuffer.cc | 4 +++ .../compiler/mlir/lite/tf_tfl_translate.cc | 5 +++ .../mlir/lite/tf_to_tfl_flatbuffer.cc | 2 +- tensorflow/compiler/mlir/python/mlir.cc | 2 +- .../mlir/tensorflow/translate/import_model.cc | 36 +++++++++++++------ .../mlir/tensorflow/translate/import_model.h | 1 + .../tensorflow/translate/tf_mlir_translate.cc | 5 +-- .../tensorflow/translate/tf_mlir_translate.h | 3 +- .../compiler/mlir/tf_mlir_translate_main.cc | 16 ++++----- tensorflow/lite/python/lite.py | 6 +++- tensorflow/lite/python/lite_v2_test.py | 16 +++------ 11 files changed, 59 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index c338b723a4a..51fcbb97360 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -146,6 +146,10 @@ Status ConvertSavedModelToTFLiteFlatBuffer( saved_model_exported_names.begin(), saved_model_exported_names.end()); absl::Span exported_names(exported_names_in_vector); + if (exported_names.size() != 1) { + return errors::Unimplemented("Only support a single exported name."); + } + TF_ASSIGN_OR_RETURN(auto module, ImportSavedModel(model_flags.saved_model_dir(), model_flags.saved_model_version(), tags, diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 4bc9d9e0c2d..fce1333a491 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -160,6 +160,11 @@ int main(int argc, char **argv) { absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); absl::Span exported_names(exported_names_vector); + if (exported_names.size() != 1) { + llvm::errs() << "There should be only one exported name"; + return kTrFailure; + } + module = tensorflow::ImportSavedModel(input_file_name, saved_model_version, tags, exported_names, &context); } else { diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index b9ec67736d9..62f64ab63b4 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -174,7 +174,7 @@ StatusOr ImportSavedModel( return module; } else if (saved_model_version == 1) { auto module = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, context); + input_filename, tags, exported_names, context); if (!module) return tensorflow::errors::InvalidArgument("fail to open input file"); diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index d0f6e015922..f22fb519a64 100644 --- a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -112,7 +112,7 @@ std::string ExperimentalConvertSavedModelV1ToMlir( // Convert the SavedModelBundle to an MLIR module. 
mlir::MLIRContext context; - auto module_or = ConvertSavedModelV1ToMlir(bundle, &context); + auto module_or = ConvertSavedModelV1ToMlir(bundle, {}, &context); if (!module_or.status().ok()) { Set_TF_Status_from_Status(status, module_or.status()); return "// error"; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 49be3da912a..3bb1446213b 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -40,6 +40,7 @@ limitations under the License. #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" @@ -57,6 +58,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/jit/shape_inference_helpers.h" #include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" @@ -65,6 +67,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -2428,8 +2431,8 @@ class SavedModelObjectGraphImporter : public ImporterBase { // Main entry point: converts all functions in the given meta graph to an MLIR // Module. static StatusOr Convert( - SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, - absl::Span exported_names, bool add_default_attributes); + SavedModelV2Bundle* saved_model, absl::Span exported_names, + mlir::MLIRContext* context, bool add_default_attributes); private: explicit SavedModelObjectGraphImporter( @@ -3129,8 +3132,8 @@ Status CreateSavedModelIR( } StatusOr SavedModelObjectGraphImporter::Convert( - SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, - absl::Span exported_names, bool add_default_attributes) { + SavedModelV2Bundle* saved_model, absl::Span exported_names, + mlir::MLIRContext* context, bool add_default_attributes) { GraphDebugInfo dummy_debug_info; const GraphDebugInfo& debug_info = saved_model->debug_info() ? *saved_model->debug_info() : dummy_debug_info; @@ -3207,17 +3210,20 @@ class SavedModelSignatureDefImporter { public: // Main entry point: converts all functions (specified by SignatureDefs) in // the given meta graph to an MLIR Module. 
- static StatusOr Convert(const SavedModelBundle& bundle, - mlir::MLIRContext* context) { - SavedModelSignatureDefImporter importer(bundle, context); + static StatusOr Convert( + const SavedModelBundle& bundle, absl::Span exported_names, + mlir::MLIRContext* context) { + SavedModelSignatureDefImporter importer(bundle, exported_names, context); return importer.ConvertSignatures(); } private: SavedModelSignatureDefImporter(const SavedModelBundle& bundle, + absl::Span exported_names, mlir::MLIRContext* context) : bundle_(bundle), + exported_names_(exported_names), module_(mlir::ModuleOp::create(mlir::UnknownLoc::get(context))) {} // Converts the SavedModel to the SavedModel dialect. Creates an MLIR function @@ -3250,6 +3256,7 @@ class SavedModelSignatureDefImporter { const std::vector>& inputs); const SavedModelBundle& bundle_; + absl::Span exported_names_; mlir::OwningModuleRef module_; }; @@ -3265,6 +3272,9 @@ SavedModelSignatureDefImporter::ConvertSignatures() { GraphDebugInfo debug_info; if (bundle_.debug_info != nullptr) debug_info = *bundle_.debug_info; + llvm::StringSet<> exported_name_set; + exported_name_set.insert(exported_names_.begin(), exported_names_.end()); + for (const auto& key_and_signature_def : signatures) { const std::string& sig_def_key = key_and_signature_def.first; const SignatureDef& signature_def = key_and_signature_def.second; @@ -3274,6 +3284,10 @@ SavedModelSignatureDefImporter::ConvertSignatures() { if (sig_def_key == "__saved_model_init_op") { continue; } + if (!exported_name_set.empty() && + exported_name_set.count(sig_def_key) == 0) { + continue; + } TF_RETURN_IF_ERROR(ConvertSignature(graphdef, sig_def_key, signature_def, debug_info, flib_def)); @@ -3556,12 +3570,14 @@ StatusOr ConvertSavedModelToMlir( SavedModelV2Bundle* saved_model, mlir::MLIRContext* context, absl::Span exported_names, bool add_default_attributes) { return SavedModelObjectGraphImporter::Convert( - saved_model, context, exported_names, add_default_attributes); + saved_model, exported_names, context, add_default_attributes); } StatusOr ConvertSavedModelV1ToMlir( - const SavedModelBundle& saved_model, mlir::MLIRContext* context) { - return SavedModelSignatureDefImporter::Convert(saved_model, context); + const SavedModelBundle& saved_model, absl::Span exported_names, + mlir::MLIRContext* context) { + return SavedModelSignatureDefImporter::Convert(saved_model, exported_names, + context); } std::string MlirModuleToString(mlir::ModuleOp module, bool show_debug_info) { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h index 8603eadb487..bdb72345201 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.h @@ -55,6 +55,7 @@ stream_executor::port::StatusOr ConvertSavedModelToMlir( // expressed with tf_executor dialect. stream_executor::port::StatusOr ConvertSavedModelV1ToMlir(const SavedModelBundle& saved_model, + absl::Span exported_names, mlir::MLIRContext* context); // Serialize a MLIR module to a string. 
diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 2c7f84d8268..6ada0fec4e2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -141,7 +141,8 @@ mlir::OwningModuleRef SavedModelObjectGraphToMlirImport( mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, - const std::unordered_set& tags, mlir::MLIRContext* context) { + const std::unordered_set& tags, + absl::Span exported_names, mlir::MLIRContext* context) { tensorflow::SavedModelBundle bundle; tensorflow::SessionOptions session_options; // Force saved model states to be restored to CPU. @@ -155,7 +156,7 @@ mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( return nullptr; } - auto module_or = ConvertSavedModelV1ToMlir(bundle, context); + auto module_or = ConvertSavedModelV1ToMlir(bundle, exported_names, context); if (!module_or.status().ok()) { LOG(ERROR) << "SavedModel V1 import failed: " << module_or.status(); return nullptr; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index f498864c8aa..490b7c7d8f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -64,7 +64,8 @@ mlir::OwningModuleRef SavedModelObjectGraphToMlirImport( // given MLIR `context`. mlir::OwningModuleRef SavedModelSignatureDefsToMlirImport( absl::string_view saved_model_dir, - const std::unordered_set& tags, mlir::MLIRContext* context); + const std::unordered_set& tags, + absl::Span exported_names, mlir::MLIRContext* context); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index 62b862f5e21..2e1528e0d60 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -104,26 +104,24 @@ int main(int argc, char** argv) { return 1; } + std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); + std::vector exported_names_vector = + absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); + absl::Span exported_names(exported_names_vector); + if (import_saved_model_object_graph) { - std::unordered_set tags = - absl::StrSplit(saved_model_tags, ','); - std::vector exported_names = - absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); mlir::MLIRContext context; auto module = tensorflow::SavedModelObjectGraphToMlirImport( - input_filename, tags, absl::Span(exported_names), - &context); + input_filename, tags, exported_names, &context); if (!module) return 1; module->print(output->os()); } else if (import_saved_model_signature_defs) { - std::unordered_set tags = - absl::StrSplit(saved_model_tags, ','); mlir::MLIRContext context; auto module = tensorflow::SavedModelSignatureDefsToMlirImport( - input_filename, tags, &context); + input_filename, tags, exported_names, &context); if (!module) return 1; module->print(output->os()); diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index b2d58ec8746..61daa699f5a 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -401,7 +401,8 @@ class TFLiteConverterBase(object): if not self._contains_function_with_implements_attr(saved_model_proto): 
self.saved_model_dir = None else: - self._saved_model_exported_names = [] + if not self._saved_model_exported_names: + self._saved_model_exported_names = [] self._saved_model_version = saved_model_proto.saved_model_schema_version if self._saved_model_version not in [1, 2]: raise ValueError( @@ -761,6 +762,9 @@ class TFLiteConverterV2(TFLiteFrozenGraphConverterV2): if not signature_keys: signature_keys = saved_model.signatures + if len(signature_keys) != 1: + raise ValueError("Only support a single signature key.") + funcs = [] for key in signature_keys: if key not in saved_model.signatures: diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py index 59f326d4b9f..5470e332b3d 100644 --- a/tensorflow/lite/python/lite_v2_test.py +++ b/tensorflow/lite/python/lite_v2_test.py @@ -469,15 +469,10 @@ class FromSavedModelTest(lite_v2_test_util.ModelTest): save_dir = os.path.join(self.get_temp_dir(), 'saved_model') save(root, save_dir, {'add': add_func, 'sub': sub_func}) - # Ensure the converter generates. - converter = lite.TFLiteConverterV2.from_saved_model(save_dir) - self.assertLen(converter._funcs, 2) - # Try converting multiple functions. with self.assertRaises(ValueError) as error: - _ = converter.convert() - self.assertIn('This converter can only convert a single ConcreteFunction', - str(error.exception)) + _ = lite.TFLiteConverterV2.from_saved_model(save_dir) + self.assertIn('Only support a single signature key.', str(error.exception)) @test_util.run_v2_only def testNoConcreteFunctionModel(self): @@ -487,12 +482,9 @@ class FromSavedModelTest(lite_v2_test_util.ModelTest): save_dir = os.path.join(self.get_temp_dir(), 'saved_model') save(root, save_dir) - converter = lite.TFLiteConverterV2.from_saved_model(save_dir) - self.assertLen(converter._funcs, 0) - with self.assertRaises(ValueError) as error: - _ = converter.convert() - self.assertIn('No ConcreteFunction is specified.', str(error.exception)) + _ = lite.TFLiteConverterV2.from_saved_model(save_dir) + self.assertIn('Only support a single signature key.', str(error.exception)) @test_util.run_v2_only def testKerasSequentialModel(self): From 70e9708e234c86632e1f24b45e058cb87f8abe29 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Thu, 7 May 2020 09:32:16 -0700 Subject: [PATCH 0133/1533] Enable XRT cache to be shared among multiple GPU devices. Allow XRT GPU work with multi-threaded based replication, where a single process see all the available devices. PiperOrigin-RevId: 310376508 Change-Id: I25715feaf74ceca421ba8939405f58a0bf68ee59 --- .../compiler/xla/executable_run_options.h | 1 + .../compiler/xrt/kernels/xrt_compile_ops.cc | 17 ++++---- .../compiler/xrt/kernels/xrt_execute_op.cc | 37 +++++++++++------- tensorflow/compiler/xrt/xrt.proto | 21 ++++++++-- tensorflow/compiler/xrt/xrt_device.cc | 39 ++++++++++++++++++- tensorflow/compiler/xrt/xrt_device.h | 4 ++ 6 files changed, 91 insertions(+), 28 deletions(-) diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 43ee0fdd820..8ae8c418d5d 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -50,6 +50,7 @@ class RunId { public: // Creates a new, unique RunId. 
RunId(); + explicit RunId(int64 value) : data_(value) {} RunId(const RunId&) = default; RunId& operator=(const RunId&) = default; diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc index 1bcd8561e61..ba6e6a093d6 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc @@ -158,7 +158,7 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx, argument_layout_ptrs[i] = &argument_layouts[i]; } xla::ExecutableBuildOptions build_options; - build_options.set_device_ordinal(client->default_device_ordinal()); + build_options.set_device_ordinal(device_ref.device_ordinal()); build_options.set_num_replicas(num_replicas); build_options.set_result_layout(xla::Shape(config.program_shape().result())); build_options.set_device_allocator(device_ref.backend()->memory_allocator()); @@ -206,7 +206,8 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) { OP_REQUIRES_OK(ctx, CompilationCacheKey(computation_proto, &key)); // Process-wide cache of XLA executables. - auto cache_or = GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0); + auto cache_or = XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + ctx, /*max_number_of_entries=*/0); OP_REQUIRES_OK(ctx, cache_or.status()); auto cache = cache_or.ConsumeValueOrDie(); @@ -259,15 +260,11 @@ void XRTReleaseCompilationRefOp::Compute(OpKernelContext* ctx) { VLOG(1) << "XRTReleaseCompilationRefOp::Compute"; auto timed = monitoring::MakeTimed(xrt_metrics::GetReleaseCompilationCell()); - ResourceMgr* rm; - OP_REQUIRES_OK(ctx, XRTGenericDeviceAccessor::GetResourceManager(ctx, &rm)); - // Process-wide cache of XLA executables. - XRTCompilationCache* cache; - OP_REQUIRES_OK(ctx, rm->Lookup( - rm->default_container(), - kXRTCompilationCacheResourceName, &cache)); - core::ScopedUnref cache_unref(cache); + auto cache_or = XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + ctx, /*max_number_of_entries=*/0); + OP_REQUIRES_OK(ctx, cache_or.status()); + auto cache = cache_or.ConsumeValueOrDie(); const Tensor& keys_tensor = ctx->input(0); auto flat_keys = keys_tensor.flat(); diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index b641f333e8b..d39b37387f2 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -149,13 +149,17 @@ xla::StatusOr GetChainedOpInputs( xla::StatusOr> RunExecutable( OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed, int replica_id) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { VLOG(2) << "Executing computation."; xla::ExecutableRunOptions run_options; run_options.set_stream(stream); run_options.set_allocator(device_ref->backend()->memory_allocator()); run_options.set_intra_op_thread_pool(&context->eigen_cpu_device()); run_options.set_rng_seed(rng_seed); + if (config.run_id() != 0) { + run_options.set_run_id(xla::RunId(config.run_id())); + } if (executable->executable() ->module_config() .has_static_device_assignment()) { @@ -164,8 +168,11 @@ xla::StatusOr> RunExecutable( } xla::GpuExecutableRunOptions gpu_options; std::vector gpu_global_ids; - if (replica_id >= 0) { - gpu_global_ids.emplace_back(replica_id); + if (config.local_replica_mapping_size() > 0) { + gpu_global_ids.reserve(config.local_replica_mapping_size()); + 
for (auto& gid : config.local_replica_mapping()) { + gpu_global_ids.emplace_back(xla::GlobalDeviceId(gid)); + } gpu_options.set_gpu_global_device_ids(gpu_global_ids); } std::shared_ptr nccl_factory = GetNcclUniqueIdFactory(); @@ -222,10 +229,11 @@ xla::StatusOr> ExecuteComputation( OpKernelContext* context, XRTMemoryManager* memory_manager, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, - se::Stream* stream, int rng_seed, int replica_id) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { auto runfn = [&]() { return RunExecutable(context, device_ref, executable, input_buffers, stream, - rng_seed, replica_id); + rng_seed, config); }; // We pass zero as requested_free_size as there is no simple way to get the @@ -241,14 +249,15 @@ xla::StatusOr> ExecuteComputation( XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const std::vector& input_coords, bool release_inputs, - se::Stream* stream, int rng_seed, int replica_id) { + se::Stream* stream, int rng_seed, + const xrt::CommonExecutionConfig& config) { XRTMemoryManager::WorkingSet working_set(memory_manager); TF_ASSIGN_OR_RETURN(InputBuffers input_buffers, GetInputBuffers(&working_set, device_ref->backend(), input_coords, release_inputs)); return ExecuteComputation(context, memory_manager.get(), device_ref, executable, input_buffers, stream, rng_seed, - replica_id); + config); } // XRTExecuteOp @@ -297,8 +306,9 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { bool release_inputs = config_proto.release_input_handles(); bool release_compilation = config_proto.release_compilation_handle(); - TF_ASSIGN_OR_RETURN( - auto cache, GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0)); + TF_ASSIGN_OR_RETURN(auto cache, + XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + context, /*max_number_of_entries=*/0)); // We are guaranteed that the underlying device object won't be deleted out // from under us, while the ScopedRef is live. class XRTGenericDeviceAccessor::ScopedRef device_ref; @@ -330,7 +340,7 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) { RefPtr output_tuple, ExecuteComputation(context, memory_manager, &device_ref, executable, input_coords, release_inputs, stream, rng_seed, - config_proto.replica_id())); + config_proto.common_config())); return CreateExecuteOutput(context, memory_manager.get(), std::move(output_tuple), @@ -379,8 +389,9 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { xrt::XRTChainedExecuteConfig config; TF_RET_CHECK(ParseFromTString(execution_config.scalar()(), &config)); - TF_ASSIGN_OR_RETURN( - auto cache, GetOrCreateCompilationCache(rm, /*max_number_of_entries=*/0)); + TF_ASSIGN_OR_RETURN(auto cache, + XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + context, /*max_number_of_entries=*/0)); // We are guaranteed that the underlying device object won't be deleted out // from under us, while the ScopedRef is live. 
class XRTGenericDeviceAccessor::ScopedRef device_ref; @@ -408,7 +419,7 @@ Status XRTExecuteChainedOp::DoWork(OpKernelContext* context) { return ExecuteComputation(context, memory_manager.get(), &device_ref, executable, input_buffers, stream, rng_seed, - config.replica_id()); + config.common_config()); }; return ExecuteChained(context, memory_manager, device_ref.backend(), diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto index 1cbd851f7ef..9a351732c4b 100644 --- a/tensorflow/compiler/xrt/xrt.proto +++ b/tensorflow/compiler/xrt/xrt.proto @@ -111,6 +111,17 @@ message XLATupleNode { repeated XLATupleNode tuples = 3; } +message CommonExecutionConfig { + // The replica index this execute is driving. + int32 replica_id = 1; + // Mapping local device ordinals to global replica IDs. + // local_replica_mapping[LOCAL_DEVICE_ORDINAL] = GLOBAL_REPLICA_ID + repeated int32 local_replica_mapping = 2; + // The execution run ID used to correlate different XRT execute operations + // happeining in parallel from different threads. + int64 run_id = 3; +} + // Options for an XLA execution. message XRTExecutionConfig { // Local device to run on. This is present because the execute Op @@ -133,8 +144,9 @@ message XRTExecutionConfig { // a single tuple allocation the execution will return a vector of // allocations, one for each of the first-level elements of the result tuple. bool return_exploded_tuple = 7; - // The replica index this execute is driving. - int32 replica_id = 8; + reserved 8; + // The common configuration for XRT execute operations. + CommonExecutionConfig common_config = 9; } message XRTChainedExecuteConfig { @@ -145,8 +157,9 @@ message XRTChainedExecuteConfig { // Optional key to disambiguate between executions. This is only needed if // multiple host send/recvs may be outstanding concurrently with executions. string execution_instance_key = 3; - // The replica index this execute is driving. - int32 replica_id = 4; + reserved 4; + // The common configuration for XRT execute operations. + CommonExecutionConfig common_config = 5; } // A single chained execute operation. An operation can either be a device data diff --git a/tensorflow/compiler/xrt/xrt_device.cc b/tensorflow/compiler/xrt/xrt_device.cc index 1b5557d556d..46954572c5d 100644 --- a/tensorflow/compiler/xrt/xrt_device.cc +++ b/tensorflow/compiler/xrt/xrt_device.cc @@ -17,19 +17,56 @@ limitations under the License. 
#include "tensorflow/compiler/xrt/xrt_device.h" +#include + #include "tensorflow/compiler/jit/xla_device.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/mutex.h" namespace tensorflow { +namespace { + +class ResourceMgrArena { + public: + static ResourceMgrArena* Get() { + static ResourceMgrArena* arena = new ResourceMgrArena(); + return arena; + } + + ResourceMgr* GetResourceMgr(const std::string& platform_name) { + mutex_lock lock(mutex_); + auto it = resource_managers_.find(platform_name); + if (it == resource_managers_.end()) { + it = resource_managers_.emplace(platform_name, new ResourceMgr()).first; + } + return it->second; + } + + private: + mutex mutex_; + std::map resource_managers_; +}; + +} // namespace /*static*/ Status XRTGenericDeviceAccessor::GetResourceManager( OpKernelContext* ctx, ResourceMgr** rm) { - *rm = ctx->resource_manager(); + const XlaDevice::Metadata* metadata; + TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata)); + *rm = ResourceMgrArena::Get()->GetResourceMgr(metadata->platform()->Name()); return Status::OK(); } +/* static */ xla::StatusOr> +XRTGenericDeviceAccessor::GetOrCreateCompilationCache( + OpKernelContext* ctx, int64 max_number_of_entries) { + ResourceMgr* rm; + TF_RETURN_IF_ERROR(GetResourceManager(ctx, &rm)); + return tensorflow::GetOrCreateCompilationCache(rm, max_number_of_entries); +} + /*static*/ Status XRTGenericDeviceAccessor::InitScopedRef( OpKernelContext* ctx, int device_ordinal, ScopedRef* scoped_ref) { const XlaDevice::Metadata* metadata; diff --git a/tensorflow/compiler/xrt/xrt_device.h b/tensorflow/compiler/xrt/xrt_device.h index 5ebee7641f0..02fab315830 100644 --- a/tensorflow/compiler/xrt/xrt_device.h +++ b/tensorflow/compiler/xrt/xrt_device.h @@ -19,6 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XRT_XRT_DEVICE_H_ #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xrt/xrt_compilation_cache.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -31,6 +32,9 @@ class XRTGenericDeviceAccessor { public: static Status GetResourceManager(OpKernelContext* ctx, ResourceMgr** rm); + static xla::StatusOr> GetOrCreateCompilationCache( + OpKernelContext* ctx, int64 max_number_of_entries); + // We use a ScopedRef pattern here even though it's not strictly necessary, // just so that templated uses of this and the TPU accessor class will be as // similar as possible. From a8c163702f16226398a90cb8c3aa2301e8930310 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 09:57:21 -0700 Subject: [PATCH 0134/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310381503 Change-Id: Ia9b0b33e4bf5452944c734b7d648347fb8e4d041 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index b537cc30190..f2459cc9334 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. 
Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 60ffd3d4d6c27aa43c5ca6893e1a29ed654d1334 Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Thu, 7 May 2020 10:20:18 -0700 Subject: [PATCH 0135/1533] Fix control dependency issue causing shape_assert test to fail. Control dependencies needed to be inserted to guard strided slice from indexing outside the actual rank of the Tensor. PiperOrigin-RevId: 310386681 Change-Id: I8c86a43b28a6010ce0214dfe5f09e63da8065a47 --- .../python/kernel_tests/check_ops_test.py | 2 -- tensorflow/python/ops/check_ops.py | 20 ++++++++++++++----- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py index 47f392d7438..37ee8d38f53 100644 --- a/tensorflow/python/kernel_tests/check_ops_test.py +++ b/tensorflow/python/kernel_tests/check_ops_test.py @@ -1688,8 +1688,6 @@ class AssertShapesTest(test.TestCase): rank_three_shapes, array_ops.constant(1), correct_rank=3, actual_rank=0) def test_raises_dynamic_incorrect_rank(self): - self.skipTest("b/134600611") - x_value = 5 rank_two_shapes = [(1, 1), (1, 3), ("a", "b"), (None, None)] with ops.Graph().as_default(): diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 3085e05eaf6..cefca5defae 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -1845,7 +1845,12 @@ def assert_shapes(shapes, data=None, summarize=None, message=None, name=None): 'Specified by tensor %s dimension %d' % (tensor_name(specified_by_y), specified_at_dim)) - actual_size = sizes.actual_sizes[tensor_dim] + # This is extremely subtle. If actual_sizes is dynamic, we must + # make sure a control dependency is inserted here so that this slice + # can not execute until the rank is asserted to be enough for the + # slice to not fail. + with ops.control_dependencies(rank_assertions): + actual_size = sizes.actual_sizes[tensor_dim] if _has_known_value(actual_size) and _has_known_value(specified_size): if int(actual_size) != int(specified_size): raise ValueError( @@ -1871,12 +1876,17 @@ def assert_shapes(shapes, data=None, summarize=None, message=None, name=None): size_assertions.append( control_flow_ops.Assert(condition, data_, summarize=summarize)) else: - size = sizes.actual_sizes[tensor_dim] + # Not sure if actual_sizes is a constant, but for safety, guard + # on rank. See explanation above about actual_sizes need for safety. + with ops.control_dependencies(rank_assertions): + size = sizes.actual_sizes[tensor_dim] size_specifications[size_symbol] = (size, sizes.x, tensor_dim) - with ops.control_dependencies(rank_assertions): - shapes_assertion = control_flow_ops.group(size_assertions) - return shapes_assertion + # Ensure both assertions actually occur. + with ops.control_dependencies(rank_assertions): + shapes_assertion = control_flow_ops.group(size_assertions) + + return shapes_assertion # pylint: disable=line-too-long From 62d50aa8e28209c701e8919d315ed0cb3ca804de Mon Sep 17 00:00:00 2001 From: Lucy Fox Date: Thu, 7 May 2020 10:27:22 -0700 Subject: [PATCH 0136/1533] Emit error messages for all missing legalizations in TF to XLA full legalization pass. A full legalization conversion stops after the first failed conversion encountered. 
For building the TF to XLA bridge, it is useful for this pass to continue through and emit information about all of the missing ops. Instead, use the Partial conversion mode to get the full set of operations that are not legalizable. The "full" conversion succeeds if this set is empty. This does not change the behavior when the full legalization pass succeeds. However, if the conversion fails, the outputted error message is now much more useful. For the sake of demonstrating what this might look like with a large model, I've run this on Transformer with the Unary op lowerings removed. Resulting error message output: Before this change: ``` Compilation failure: MLIR TF to XLA legalization failed-:64:11: error: failed to legalize operation 'tf.Rsqrt' -:64:11: note: see current operation: %37 = "tf.Rsqrt"(%33) : (tensor) -> tensor ``` After this change (default case): ``` Compilation failure: MLIR TF to XLA legalization failed-:4:3: error: The following operations cannot be legalized: tf.Rsqrt (count: 217); tf.SoftmaxCrossEntropyWithLogits (count: 1); tf.Sqrt (count: 370). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc. -:4:3: error: Emitting more detail about one op that failed to legalize... -:251:12: error: 'tf.Rsqrt' op is not legalizable -:251:12: note: see current operation: %224 = "tf.Rsqrt"(%220) : (tensor) -> tensor ``` After this change (verbose case, with logging set to 1): ``` Compilation failure: MLIR TF to XLA legalization failed-:4:3: error: The following operations cannot be legalized: tf.Rsqrt (count: 217); tf.SoftmaxCrossEntropyWithLogits (count: 1); tf.Sqrt (count: 370). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc. -:4:3: error: Emitting more detail about one of each type of op that failed to legalize... -:1769:13: error: 'tf.Rsqrt' op is not legalizable -:1769:13: note: see current operation: %1742 = "tf.Rsqrt"(%1738) : (tensor) -> tensor -:3308:24: error: 'tf.SoftmaxCrossEntropyWithLogits' op is not legalizable -:3308:24: note: see current operation: %loss, %backprop = "tf.SoftmaxCrossEntropyWithLogits"(%3495, %3503) : (tensor<768x33708xf32>, tensor<768x33708xf32>) -> (tensor<768xf32>, tensor<768x33708xf32>) -:6944:13: error: 'tf.Sqrt' op is not legalizable -:6944:13: note: see current operation: %7319 = "tf.Sqrt"(%7318) : (tensor) -> tensor ``` PiperOrigin-RevId: 310388184 Change-Id: Idea12490be1ace18cfb89bda51f5f948bd2d691c --- .../tests/legalize-tf-full-conversion.mlir | 21 ++++++- .../mlir/xla/transforms/legalize_tf.cc | 57 ++++++++++++++++++- 2 files changed, 74 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir index d2b4d269fef..0660af4ed1c 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-full-conversion.mlir @@ -1,22 +1,24 @@ // RUN: tf-opt %s -xla-legalize-tf -split-input-file -verify-diagnostics +// expected-error@below{{The following operations cannot be legalized: tf.NoOp (count: 1); tf_executor.fetch (count: 1); tf_executor.graph (count: 1); tf_executor.island (count: 1); tf_executor.yield (count: 1). 
These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} +// expected-error@below{{Emitting more detail about one op that failed to legalize...}} func @tf_executor_graph_op() { - // expected-error@+1 {{failed to legalize operation 'tf_executor.graph'}} tf_executor.graph { %0 = tf_executor.island { + // expected-error@+1 {{'tf.NoOp' op is not legalizable}} "tf.NoOp"() {} : () -> () tf_executor.yield } tf_executor.fetch } return - } // ----- +// expected-error@below{{The following operations cannot be legalized: tf.OpA (count: 1). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} func @tf_unknown_op(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // expected-error@+1 {{failed to legalize operation 'tf.OpA'}} + // expected-error@+1 {{'tf.OpA' op is not legalizable}} %0 = "tf.OpA"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> } @@ -27,3 +29,16 @@ func @tf_known_op(%arg0: tensor<2xi32>) -> tensor<2xi32> { %0 = "tf.Add"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> return %0: tensor<2xi32> } + +// ----- + +// expected-error@below{{The following operations cannot be legalized: tf.OpA (count: 1); tf.OpB (count: 2). These legalization failure(s) may be due to missing TF to HLO lowerings and/or unsupported attributes, etc.}} +// expected-error@below{{Emitting more detail about one op that failed to legalize...}} +func @tf_unknown_known_mix(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // expected-error@+1 {{'tf.OpA' op is not legalizable}} + %0 = "tf.OpA"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %1 = "tf.OpB"(%0, %0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %2 = "tf.Add"(%1, %1) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + %3 = "tf.OpB"(%2, %2) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> + return %2: tensor<2xi32> +} diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index fb03c9b82e5..de808bc8ad2 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -25,6 +25,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project @@ -4785,6 +4786,51 @@ class ConvertQrOp : public OpRewritePattern { } }; +// Emits debug information which includes the number of ops of each type which +// failed to legalize. +void EmitLegalizationErrors(Operation *op, + const DenseSet &nonlegalized_ops) { + // Track the legalization failures by mapping op name to information about + // that failure: the number of unlegalized occurances of the op, and one + // example operation that failed. + std::map> op_name_to_error_info; + DenseSet error_ops; + for (Operation *nonlegalized_op : nonlegalized_ops) { + // Increment count of this legalization failure. + StringRef op_name = nonlegalized_op->getName().getStringRef(); + // If this emplace is successful, it's the first time we've encountered + // this op type. Initialize count to 0 so that after increment, it is 1. 
+ auto insertion_result = op_name_to_error_info.emplace( + op_name, std::make_pair(0, nonlegalized_op)); + ++insertion_result.first->second.first; + } + std::vector error_messages; + error_messages.reserve(op_name_to_error_info.size()); + for (const auto &op_info : op_name_to_error_info) { + error_messages.push_back( + llvm::formatv("{0} (count: {1})", op_info.first, op_info.second.first)); + } + Location loc = op->getLoc(); + emitError(loc) << "The following operations cannot be legalized: " + << llvm::join(error_messages, "; ") + << ". These legalization failure(s) may be due to missing TF " + "to HLO lowerings and/or unsupported attributes, etc."; + // Emit more information about the missing ops. This error message + // contains useful details beyond the op name (input and output shapes, + // attributes, etc.). + if (!VLOG_IS_ON(1) && nonlegalized_ops.size() != 1) { + emitError(loc) + << "Emitting more detail about one op that failed to legalize..."; + } else if (VLOG_IS_ON(1)) { + emitError(loc) << "Emitting more detail about one of each type of op " + "that failed to legalize..."; + } + for (const auto &op_info : op_name_to_error_info) { + op_info.second.second->emitOpError() << "is not legalizable"; + if (!VLOG_IS_ON(1)) break; + } +} + // Performs the lowering to XLA dialect. void LegalizeTF::runOnFunction() { if (failed(legalizeTF(getFunction(), allow_partial_conversion_))) @@ -4841,7 +4887,16 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { if (!allow_partial_conversion) { // Fully qualify ReturnOp here as xla_hlo dialect also defines a ReturnOp. target.addLegalOp(); - return applyFullConversion(op, target, patterns); + DenseSet nonlegalized_ops; + LogicalResult result = applyPartialConversion( + op, target, patterns, /*converter=*/nullptr, &nonlegalized_ops); + // In order to enforce that the conversion result is fully converted, + // fail if there are any nonlegalized ops in the set. + if (failed(result) || !nonlegalized_ops.empty()) { + EmitLegalizationErrors(op, nonlegalized_ops); + return failure(); + } + return result; } return applyPartialConversion(op, target, patterns); From d7120e1adebc780cb0b53ac4808182098fa762d2 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Thu, 7 May 2020 10:28:31 -0700 Subject: [PATCH 0137/1533] Delete duplicate installation of patchelf. 
PiperOrigin-RevId: 310388388 Change-Id: If0be9f1f03a4004b593fa79dec5860c03d83cecc --- ...ile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython index 54bb4b3773f..353d946261d 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython @@ -17,7 +17,6 @@ RUN apt-get update && apt-get install -y \ flex \ g++ \ make \ - patchelf \ rpm2cpio \ unar \ wget \ From 11f9a39e5de0c1958f5d8904bde3222516ffb635 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sun, 3 May 2020 13:41:33 +0200 Subject: [PATCH 0138/1533] Test ConvertUnary in dynamic shape mode --- .../tf2tensorrt/convert/convert_nodes_test.cc | 220 +++++++----------- 1 file changed, 80 insertions(+), 140 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 1f30b837450..1c5aa92189a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include +#include #include #include #include @@ -66,6 +67,7 @@ namespace convert { using absl::StrCat; using ::testing::ElementsAre; using ::testing::ElementsAreArray; +using ::testing::FloatNear; using ::testing::Matcher; using ::testing::NanSensitiveFloatNear; @@ -216,6 +218,21 @@ void ExpectTrtDimsEqualsArray(const std::vector& lhs, << " actual: " << DebugString(rhs); } +Matcher> ArrayFloatNear(const std::vector& values, + float max_abs_error = 1e-5, + bool nan_sensitive = false) { + std::vector> matchers; + matchers.reserve(values.size()); + for (const float& v : values) { + if (nan_sensitive) { + matchers.emplace_back(NanSensitiveFloatNear(v, max_abs_error)); + } else { + matchers.emplace_back(FloatNear(v, max_abs_error)); + } + } + return ElementsAreArray(matchers); +} + template void ExpectArrayNear(const std::vector& lhs, absl::Span rhs) { ASSERT_EQ(lhs.size(), rhs.size()); @@ -5112,135 +5129,54 @@ TEST_F(OpConverterTest, ConvertGather) { TestConvertGather(this); } -TEST_F(OpConverterTest, ConvertUnary) { +template +NodeDef CreateUnaryOp() { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); + return T(s.WithOpName("my_unary"), input).operation.node()->def(); +} + +TEST_P(ParameterizedOpConverterTest, ConvertUnary) { + const auto& spec = GetParam(); + const TrtTestMode trt_mode = std::get<0>(spec); + const DataType tf_dtype = std::get<1>(spec); + TrtPrecisionMode converter_precision = std::get<2>(spec); { // Input is weights, should fail. - Reset(); - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto neg = ops::Neg(s.WithOpName("my_unary"), input); - const NodeDef& node_def = neg.operation.node()->def(); + Reset(converter_precision, trt_mode); + const NodeDef node_def = CreateUnaryOp(); AddTestWeights("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "The input \"x\" for Neg must be a tensor, at my_unary"); } - - // Get nodedef for unary layer. 
- auto get_unary_nodedef = [](string op_name) -> NodeDef { - Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - if (op_name == "Abs") { - auto unary = ops::Abs(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acos") { - auto unary = ops::Acos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Acosh") { - auto unary = ops::Acosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asin") { - auto unary = ops::Asin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Asinh") { - auto unary = ops::Asinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atan") { - auto unary = ops::Atan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Atanh") { - auto unary = ops::Atanh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Ceil") { - auto unary = ops::Ceil(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cos") { - auto unary = ops::Cos(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Cosh") { - auto unary = ops::Cosh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Exp") { - auto unary = ops::Exp(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Floor") { - auto unary = ops::Floor(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Log") { - auto unary = ops::Log(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Neg") { - auto unary = ops::Neg(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Reciprocal") { - auto unary = ops::Reciprocal(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Rsqrt") { - auto unary = ops::Rsqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sin") { - auto unary = ops::Sin(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sinh") { - auto unary = ops::Sinh(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Sqrt") { - auto unary = ops::Sqrt(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } else if (op_name == "Tan") { - auto unary = ops::Tan(s.WithOpName("my_unary"), input); - return unary.operation.node()->def(); - } - EXPECT_TRUE(false); - return NodeDef(); - }; - // Get expected output for unary layer. 
- auto get_unary_output = [](string op_name, float input) -> float { - if (op_name == "Abs") { - return std::abs(input); - } else if (op_name == "Acos") { - return std::acos(input); - } else if (op_name == "Acosh") { - return std::acosh(input); - } else if (op_name == "Asin") { - return std::asin(input); - } else if (op_name == "Asinh") { - return std::asinh(input); - } else if (op_name == "Atan") { - return std::atan(input); - } else if (op_name == "Atanh") { - return std::atanh(input); - } else if (op_name == "Ceil") { - return std::ceil(input); - } else if (op_name == "Cos") { - return std::cos(input); - } else if (op_name == "Cosh") { - return std::cosh(input); - } else if (op_name == "Exp") { - return std::exp(input); - } else if (op_name == "Floor") { - return std::floor(input); - } else if (op_name == "Log") { - return std::log(input); - } else if (op_name == "Neg") { - return -input; - } else if (op_name == "Reciprocal") { - return 1.0 / input; - } else if (op_name == "Rsqrt") { - return 1.0 / std::sqrt(input); - } else if (op_name == "Sin") { - return std::sin(input); - } else if (op_name == "Sinh") { - return std::sinh(input); - } else if (op_name == "Sqrt") { - return std::sqrt(input); - } else if (op_name == "Tan") { - return std::tan(input); - } - EXPECT_TRUE(false); - return 0; - }; - + using OpFunc = std::function; + using ValFunc = float (*)(float); + std::map> op_map; +#define ADD_OP(name, op, compute) \ + op_map[name] = \ + std::make_pair(CreateUnaryOp, static_cast(compute)) + ADD_OP("Abs", ops::Abs, std::abs); + ADD_OP("Acos", ops::Acos, std::acos); + ADD_OP("Acosh", ops::Acosh, std::acosh); + ADD_OP("Asin", ops::Asin, std::asin); + ADD_OP("Asinh", ops::Asinh, std::asinh); + ADD_OP("Atan", ops::Atan, std::atan); + ADD_OP("Atanh", ops::Atanh, std::atanh); + ADD_OP("Ceil", ops::Ceil, std::ceil); + ADD_OP("Cos", ops::Cos, std::cos); + ADD_OP("Cosh", ops::Cosh, std::cosh); + ADD_OP("Exp", ops::Exp, std::exp); + ADD_OP("Floor", ops::Floor, std::floor); + ADD_OP("Log", ops::Log, std::log); + ADD_OP("Neg", ops::Neg, [](float x) { return -x; }); + ADD_OP("Reciprocal", ops::Reciprocal, [](float x) { return 1.0f / x; }); + ADD_OP("Rsqrt", ops::Rsqrt, [](float x) { return 1.0f / std::sqrt(x); }); + ADD_OP("Sin", ops::Sin, std::sin); + ADD_OP("Sinh", ops::Sinh, std::sinh); + ADD_OP("Sqrt", ops::Sqrt, std::sqrt); + ADD_OP("Tan", ops::Tan, std::tan); +#undef ADD_OP // Get list of ops to test. std::vector ops_to_test; // Add all ops supported by ConvertUnary. @@ -5251,26 +5187,30 @@ TEST_F(OpConverterTest, ConvertUnary) { } // Add other unary ops to test. ops_to_test.push_back("Rsqrt"); - // Ok. 
+ // Prepare test parameters + auto p = TestParamBase{ + {1, 1, 2, 3}, // input dims + {}, // input partial dims + {1, 1, 2, 3}, // expected output dims + }; for (const string& op_name : ops_to_test) { - Reset(); - NodeDef node_def = get_unary_nodedef(op_name); - AddTestTensor("input", {1, 2, 3}); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_unary", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions()); - - const std::vector input = {-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; - const DataVec input_data{{"input", AsTensor(input)}}; - DataVec output_data{{"my_unary", ConstructTensor(6)}}; - BuildAndRun(input_data, &output_data); - for (int i = 0; i < input.size(); ++i) { - const float expected_output = get_unary_output(op_name, input[i]); - EXPECT_THAT(GetSpanForData(output_data[0])[i], - NanSensitiveFloatNear(expected_output, 0.0001)); + SCOPED_TRACE(op_name); + Reset(converter_precision, trt_mode); + if (!op_map.count(op_name)) { + FAIL() << "Unary op test map does not contain op " << op_name; } + NodeDef node_def = op_map[op_name].first(); + + AddTestTensor("input", p.input_dims, TfDataTypeToTrt(tf_dtype), trt_mode); + RunValidationAndConversion(node_def, Status::OK(), "my_unary", + p.expected_output_dims); + + std::vector input_values{-0.9f, 0.6f, 0.0f, -3.5f, 100.0f, 2.9f}; + std::vector output; + std::transform(input_values.begin(), input_values.end(), + std::back_inserter(output), op_map[op_name].second); + InstantiateBuildAndRun(tf_dtype, "my_unary", this, p, input_values, + ArrayFloatNear(output, 0.0001, true)); } } From 9091f79f06bb8a2cf63a20101ed858105e9c641a Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Thu, 7 May 2020 11:10:21 -0700 Subject: [PATCH 0139/1533] Use persistent buffer in quantize, dequantize and xtensa_hifimini/quantize. PiperOrigin-RevId: 310397757 Change-Id: I6ec93be5d94d765385316324a2702ea6903853c0 --- tensorflow/lite/micro/kernels/dequantize.cc | 51 ++++++++++++++----- tensorflow/lite/micro/kernels/quantize.cc | 43 ++++++++++++---- .../micro/kernels/xtensa_hifimini/quantize.cc | 31 ++++++----- 3 files changed, 88 insertions(+), 37 deletions(-) diff --git a/tensorflow/lite/micro/kernels/dequantize.cc b/tensorflow/lite/micro/kernels/dequantize.cc index 37fb8ffc3c6..4b87c0eb04c 100644 --- a/tensorflow/lite/micro/kernels/dequantize.cc +++ b/tensorflow/lite/micro/kernels/dequantize.cc @@ -28,7 +28,27 @@ namespace ops { namespace micro { namespace dequantize { +struct OpData { + // The scaling factor from input to output (aka the 'real multiplier') can + // be represented as a fixed point multiplier plus a left shift. 
+ int32_t output_multiplier; + int output_shift; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) == + kTfLiteError) { + return nullptr; + } + return data; +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + OpData* data = static_cast(node->user_data); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -42,10 +62,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE( context, output->type == kTfLiteFloat32 || output->type == kTfLiteInt32); + if (output->type == kTfLiteInt32) { + const double effective_output_scale = + static_cast(input->params.scale) / + static_cast(output->params.scale); + QuantizeMultiplier(effective_output_scale, &data->output_multiplier, + &data->output_shift); + } return kTfLiteOk; } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + OpData* data = static_cast(node->user_data); + const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); @@ -76,28 +106,21 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } } else if (output->type == kTfLiteInt32) { - int32_t output_multiplier; - int output_shift; - const double effective_output_scale = - static_cast(input->params.scale) / - static_cast(output->params.scale); - QuantizeMultiplier(effective_output_scale, &output_multiplier, - &output_shift); int flat_size = MatchingFlatSize(GetTensorShape(input), GetTensorShape(output)); switch (input->type) { case kTfLiteInt16: { reference_ops::Requantize( - GetTensorData(input), flat_size, output_multiplier, - output_shift, input->params.zero_point, output->params.zero_point, - GetTensorData(output)); + GetTensorData(input), flat_size, data->output_multiplier, + data->output_shift, input->params.zero_point, + output->params.zero_point, GetTensorData(output)); break; } case kTfLiteInt8: { reference_ops::Requantize( - GetTensorData(input), flat_size, output_multiplier, - output_shift, input->params.zero_point, output->params.zero_point, - GetTensorData(output)); + GetTensorData(input), flat_size, data->output_multiplier, + data->output_shift, input->params.zero_point, + output->params.zero_point, GetTensorData(output)); break; } default: @@ -119,7 +142,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace dequantize TfLiteRegistration* Register_DEQUANTIZE() { - static TfLiteRegistration r = {/*init=*/nullptr, + static TfLiteRegistration r = {/*init=*/dequantize::Init, /*free=*/nullptr, /*prepare=*/dequantize::Prepare, /*invoke=*/dequantize::Eval, diff --git a/tensorflow/lite/micro/kernels/quantize.cc b/tensorflow/lite/micro/kernels/quantize.cc index d40471df948..b5bba83beb8 100644 --- a/tensorflow/lite/micro/kernels/quantize.cc +++ b/tensorflow/lite/micro/kernels/quantize.cc @@ -26,7 +26,27 @@ namespace ops { namespace micro { namespace quantize { +struct OpData { + // The scaling factor from input to output (aka the 'real multiplier') can + // be represented as a fixed point multiplier plus a left shift. 
+ int32_t output_multiplier; + int output_shift; +}; + +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) == + kTfLiteError) { + return nullptr; + } + return data; +} + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + OpData* data = static_cast(node->user_data); + TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); @@ -48,10 +68,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 || output->type == kTfLiteInt8); + if (input->type == kTfLiteInt16 && output->type == kTfLiteInt8) { + double effective_scale = + static_cast(input->params.scale / output->params.scale); + + QuantizeMultiplier(effective_scale, &data->output_multiplier, + &data->output_shift); + } return kTfLiteOk; } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + OpData* data = static_cast(node->user_data); + const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); @@ -79,17 +109,12 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } } else if (input->type == kTfLiteInt16) { size_t size = ElementCount(*input->dims); - int32_t output_multiplier; - int output_shift; - double effective_scale = - static_cast(input->params.scale / output->params.scale); switch (output->type) { case kTfLiteInt8: - QuantizeMultiplier(effective_scale, &output_multiplier, &output_shift); reference_ops::Requantize( - GetTensorData(input), size, output_multiplier, - output_shift, input->params.zero_point, output->params.zero_point, - GetTensorData(output)); + GetTensorData(input), size, data->output_multiplier, + data->output_shift, input->params.zero_point, + output->params.zero_point, GetTensorData(output)); break; default: TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", @@ -113,7 +138,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // AffineQuantize takes scale and zero point and quantizes the float value to // quantized output, in int8 or uint8 format. 
TfLiteRegistration* Register_QUANTIZE() { - static TfLiteRegistration r = {/*init=*/nullptr, + static TfLiteRegistration r = {/*init=*/quantize::Init, /*free=*/nullptr, /*prepare=*/quantize::Prepare, /*invoke=*/quantize::Eval, diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/quantize.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/quantize.cc index 2177bf1c363..29b2544a625 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/quantize.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/quantize.cc @@ -108,22 +108,24 @@ struct OpData { int scale_multiplier = 0; }; -// This size will work for both the hotword (1) and ambient music (1): -constexpr int kMaxOpDataSize = 2; -static int op_data_counter = 0; -static OpData kStaticOpData[kMaxOpDataSize]; - -void Free(TfLiteContext* context, void* buffer) { op_data_counter = 0; } +void* Init(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) == + kTfLiteError) { + return nullptr; + } + return data; +} TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { + TFLITE_DCHECK(node->user_data != nullptr); + auto* op_data = static_cast(node->user_data); + TfLiteTensor* output = GetOutput(context, node, 0); const TfLiteTensor* input = GetInput(context, node, 0); - // TODO(b/132070898): Use statically slotted OpData structures until a - // scratch memory API is ready. - OpData* op_data = &kStaticOpData[op_data_counter++]; - node->user_data = op_data; - + // TODO(b/155682734): Fix dangerous input/output scale ratio assumptions. op_data->scale_multiplier = xtensa::hifimini::CreateQConstantForInt24( 0, input->params.scale / output->params.scale); @@ -131,7 +133,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* op_data = reinterpret_cast(node->user_data); + TFLITE_DCHECK(node->user_data != nullptr); + auto* op_data = static_cast(node->user_data); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); @@ -159,8 +162,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { // AffineQuantize takes scale and zero point and quantizes the float value to // quantized output, in int8 or uint8 format. 
TfLiteRegistration* Register_QUANTIZE() { - static TfLiteRegistration r = {/*init=*/nullptr, - /*free=*/quantize::Free, + static TfLiteRegistration r = {/*init=*/quantize::Init, + /*free=*/nullptr, /*prepare=*/quantize::Prepare, /*invoke=*/quantize::Eval, /*profiling_string=*/nullptr, From 9cc74059d9084cf00a9de0f111ec8dab5741c4de Mon Sep 17 00:00:00 2001 From: rposts Date: Thu, 7 May 2020 18:37:19 +0000 Subject: [PATCH 0140/1533] Fix bogomips extraction on s390x arch --- tensorflow/core/platform/profile_utils/cpu_utils.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc index 587c97875a0..b22123a804a 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.cc +++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc @@ -88,6 +88,8 @@ static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr; defined(__ppc__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)) retval = sscanf(line.c_str(), "clock : %lfMHz", &cpu_freq); freq_factor = 1.0; +#elif defined(__s390x__) + retval = sscanf(line.c_str(), "bogomips per cpu: %lf", &cpu_freq); #else retval = sscanf(line.c_str(), "bogomips : %lf", &cpu_freq); #endif From 4dcd17de832310608777eb66d78bc374acb6953b Mon Sep 17 00:00:00 2001 From: Hanhan Wang Date: Thu, 7 May 2020 11:41:45 -0700 Subject: [PATCH 0141/1533] Fix output shape of tf.GatherV2 in legalize-tf.mlir According to the documentation, the output shape is: `params.shape[:axis] + indices.shape[batch_dims:] + params.shape[axis + 1:]` In this case, the variables are: params.shape = [16, 2, 3] indices.shape = [16, 5] axis = 2 batch_dims = 1 Thus, the output shape is `[16, 2] + [5] + [] = [16, 2, 5]`. See https://www.tensorflow.org/api_docs/python/tf/gather for more details. PiperOrigin-RevId: 310404487 Change-Id: I67ff267ddf2878145ccd51c3c4b2fd01c0adff37 --- .../compiler/mlir/tensorflow/ir/tf_generated_ops.td | 4 ++-- tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir | 8 ++++---- tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 2b3dd529c3b..7a3c9617e2e 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -3256,8 +3256,8 @@ Gather slices from `params` axis `axis` according to `indices`. let description = [{ `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -Produces an output tensor with shape `params.shape[:axis] + indices.shape + -params.shape[axis + 1:]` where: +Produces an output tensor with shape `params.shape[:axis] + +indices.shape[batch_dims:] + params.shape[axis + 1:]` where: ```python # Scalar indices (output is rank(params) - 1).
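The shape rule worked through in the commit message above can be checked directly against the public API. A minimal sketch, assuming a TensorFlow build where `tf.gather` accepts the `axis` and `batch_dims` keyword arguments; the zero-filled `params` and `indices` tensors are placeholders, since only the shapes matter here:

```python
import tensorflow as tf

params = tf.zeros([16, 2, 3])                # params.shape  = [16, 2, 3]
indices = tf.zeros([16, 5], dtype=tf.int32)  # indices.shape = [16, 5]

# params.shape[:axis] + indices.shape[batch_dims:] + params.shape[axis + 1:]
#        [16, 2]      +          [5]               +           []
out = tf.gather(params, indices, axis=2, batch_dims=1)
print(out.shape)  # (16, 2, 5)
```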
diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index e15101a165e..72a9faea49a 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -3720,11 +3720,11 @@ func @unsorted_segment_max(%data: tensor<8x?x64xf32>, %segment_ids : tensor, %arg1: tensor<16x5xi32>) -> tensor<16x2x5x3xf32> { - // CHECK: "xla_hlo.torch_index_select"(%arg0, %arg1) {batch_dims = 1 : i64, dim = 2 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>) -> tensor<16x2x5x3xf32> +func @gather_v2(%arg0: tensor<16x2x3xf32>, %arg1: tensor<16x5xi32>) -> tensor<16x2x5xf32> { + // CHECK: "xla_hlo.torch_index_select"(%arg0, %arg1) {batch_dims = 1 : i64, dim = 2 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>) -> tensor<16x2x5xf32> %0 = "tf.Const"() { value = dense<[-1]> : tensor<1xi32> } : () -> tensor<1xi32> - %1 = "tf.GatherV2"(%arg0, %arg1, %0) {batch_dims = -1 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>, tensor<1xi32>) -> tensor<16x2x5x3xf32> - return %1 : tensor<16x2x5x3xf32> + %1 = "tf.GatherV2"(%arg0, %arg1, %0) {batch_dims = -1 : i64} : (tensor<16x2x3xf32>, tensor<16x5xi32>, tensor<1xi32>) -> tensor<16x2x5xf32> + return %1 : tensor<16x2x5xf32> } // CHECK-LABEL: @gather_v2_dynamic diff --git a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt index c6104da4a64..7f2a8a1cf1a 100644 --- a/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_GatherV2.pbtxt @@ -30,8 +30,8 @@ END summary: "Gather slices from `params` axis `axis` according to `indices`." description: < Date: Thu, 7 May 2020 12:15:58 -0700 Subject: [PATCH 0142/1533] add __cudaInitModule to cudart_stub.cc this fixes https://github.com/tensorflow/tensorflow/issues/39280 --- tensorflow/stream_executor/cuda/cudart_stub.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc index 5ee106a65fd..1d7a4e378ba 100644 --- a/tensorflow/stream_executor/cuda/cudart_stub.cc +++ b/tensorflow/stream_executor/cuda/cudart_stub.cc @@ -131,6 +131,13 @@ extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( return func_ptr(gridDim, blockDim, sharedMem, stream); } +extern char CUDARTAPI __cudaInitModule(void **fatCubinHandle) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **fatCubinHandle); + static auto func_ptr = LoadSymbol("__cudaInitModule"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(fatCubinHandle); +} + #if CUDART_VERSION >= 10010 extern void CUDARTAPI __cudaRegisterFatBinaryEnd(void **fatCubinHandle) { using FuncPtr = void(CUDARTAPI *)(void **fatCubinHandle); From 3d0dac26f57b6235fddaee21705ec4a51eeedd4a Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Thu, 7 May 2020 13:09:51 -0700 Subject: [PATCH 0143/1533] Use subTest to improve error reporting on test failures in cwise_ops_test.py. 
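The mechanism this change applies throughout the test file is `unittest.TestCase.subTest`, which `tf.test.TestCase` ultimately inherits from the standard library. A minimal, self-contained sketch (a hypothetical test, not taken from this patch) of why it improves error reporting: every failing parameter combination is reported with its labels, instead of the loop aborting at the first failure.

```python
import unittest


class SubTestExample(unittest.TestCase):

  def test_many_combinations(self):
    for dtype in ("float16", "float32", "int32"):
      for use_gpu in (False, True):
        # On failure, the report names the combination, e.g.
        # (dtype='int32', use_gpu=True), and the remaining cases still run.
        with self.subTest(dtype=dtype, use_gpu=use_gpu):
          self.assertIsInstance(dtype, str)


if __name__ == "__main__":
  unittest.main()
```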
PiperOrigin-RevId: 310422205 Change-Id: I67ba3406fbf7df3ceef59ed3bd084d406cef23bd --- .../python/kernel_tests/cwise_ops_test.py | 480 ++++++++++-------- 1 file changed, 262 insertions(+), 218 deletions(-) diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py index 303d2791d07..8c84bde1431 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_test.py @@ -97,23 +97,27 @@ class ComparisonOpTest(test.TestCase): for t in dtypes: for x in data: for y in data: - self.assertEqual(self._compareScalar(math_ops.less, x, y, t), x < y) - self.assertEqual( - self._compareScalar(math_ops.less_equal, x, y, t), x <= y) - self.assertEqual( - self._compareScalar(math_ops.greater, x, y, t), x > y) - self.assertEqual( - self._compareScalar(math_ops.greater_equal, x, y, t), x >= y) - self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y) - self.assertEqual( - self._compareScalar(math_ops.not_equal, x, y, t), x != y) + with self.subTest(t=t, x=x, y=y): + self.assertEqual(self._compareScalar(math_ops.less, x, y, t), x < y) + self.assertEqual( + self._compareScalar(math_ops.less_equal, x, y, t), x <= y) + self.assertEqual( + self._compareScalar(math_ops.greater, x, y, t), x > y) + self.assertEqual( + self._compareScalar(math_ops.greater_equal, x, y, t), x >= y) + self.assertEqual( + self._compareScalar(math_ops.equal, x, y, t), x == y) + self.assertEqual( + self._compareScalar(math_ops.not_equal, x, y, t), x != y) data = [-1, 0, 1, -1j, 1j, 1 + 1j, 1 - 1j] for t in [np.complex64, np.complex128]: for x in data: for y in data: - self.assertEqual(self._compareScalar(math_ops.equal, x, y, t), x == y) - self.assertEqual( - self._compareScalar(math_ops.not_equal, x, y, t), x != y) + with self.subTest(t=t, x=x, y=y): + self.assertEqual( + self._compareScalar(math_ops.equal, x, y, t), x == y) + self.assertEqual( + self._compareScalar(math_ops.not_equal, x, y, t), x != y) def _compare(self, x, y, np_func, tf_func): np_ans = np_func(x, y) @@ -126,22 +130,24 @@ class ComparisonOpTest(test.TestCase): x = np.linspace(-15, 15, 6).reshape(1, 3, 2) y = np.linspace(20, -10, 6).reshape(1, 3, 2) for t in [np.float16, np.float32, np.float64, np.int32, np.int64]: - xt = x.astype(t) - yt = y.astype(t) - self._compare(xt, yt, np.less, math_ops.less) - self._compare(xt, yt, np.less_equal, math_ops.less_equal) - self._compare(xt, yt, np.greater, math_ops.greater) - self._compare(xt, yt, np.greater_equal, math_ops.greater_equal) - self._compare(xt, yt, np.equal, math_ops.equal) - self._compare(xt, yt, np.not_equal, math_ops.not_equal) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + self._compare(xt, yt, np.less, math_ops.less) + self._compare(xt, yt, np.less_equal, math_ops.less_equal) + self._compare(xt, yt, np.greater, math_ops.greater) + self._compare(xt, yt, np.greater_equal, math_ops.greater_equal) + self._compare(xt, yt, np.equal, math_ops.equal) + self._compare(xt, yt, np.not_equal, math_ops.not_equal) # Complex types do not support ordering but do support equality tests. 
for t in [np.complex64, np.complex128]: - xt = x.astype(t) - xt -= 1j * xt - yt = y.astype(t) - yt -= 1j * yt - self._compare(xt, yt, np.equal, math_ops.equal) - self._compare(xt, yt, np.not_equal, math_ops.not_equal) + with self.subTest(t=t): + xt = x.astype(t) + xt -= 1j * xt + yt = y.astype(t) + yt -= 1j * yt + self._compare(xt, yt, np.equal, math_ops.equal) + self._compare(xt, yt, np.not_equal, math_ops.not_equal) def _compareBCast(self, xs, ys, dtype, np_func, tf_func): x = np.linspace(-15, 15, np.prod(xs)).astype(dtype).reshape(xs) @@ -178,7 +184,8 @@ class ComparisonOpTest(test.TestCase): for (xs, ys) in shapes: for dtype in dtypes: - self._compareBCast(xs, ys, dtype, np_func, tf_func) + with self.subTest(xs=xs, ys=ys, dtype=dtype): + self._compareBCast(xs, ys, dtype, np_func, tf_func) def testBCastLess(self): self._testBCastByFunc(np.less, math_ops.less) @@ -209,10 +216,11 @@ class ComparisonOpTest(test.TestCase): y = np.arange(0, 10).reshape([5, 2]) for t in dtypes: for f in funcs: - with self.assertRaisesRegexp( - (ValueError, errors.InvalidArgumentError), - "Incompatible shapes|Dimensions must be equal"): - f(x.astype(t), y.astype(t)) + with self.subTest(t=t, f=f): + with self.assertRaisesRegexp( + (ValueError, errors.InvalidArgumentError), + "Incompatible shapes|Dimensions must be equal"): + f(x.astype(t), y.astype(t)) class LogicalOpTest(test.TestCase): @@ -241,23 +249,27 @@ class LogicalOpTest(test.TestCase): data = [np.array([True]), np.array([False])] for use_gpu in [True, False]: for x in data: - self._not(x, use_gpu) + with self.subTest(use_gpu=use_gpu, x=x): + self._not(x, use_gpu) for x in data: for y in data: - self._compareBinary(x, y, np.logical_and, math_ops.logical_and, - use_gpu) - self._compareBinary(x, y, np.logical_or, math_ops.logical_or, use_gpu) - self._compareBinary(x, y, np.logical_xor, math_ops.logical_xor, - use_gpu) + with self.subTest(use_gpu=use_gpu, x=x, y=y): + self._compareBinary(x, y, np.logical_and, math_ops.logical_and, + use_gpu) + self._compareBinary(x, y, np.logical_or, math_ops.logical_or, + use_gpu) + self._compareBinary(x, y, np.logical_xor, math_ops.logical_xor, + use_gpu) def testTensor(self): x = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2) y = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2) for use_gpu in [True, False]: - self._not(x, use_gpu) - self._compareBinary(x, y, np.logical_and, math_ops.logical_and, use_gpu) - self._compareBinary(x, y, np.logical_or, math_ops.logical_or, use_gpu) - self._compareBinary(x, y, np.logical_xor, math_ops.logical_xor, use_gpu) + with self.subTest(use_gpu=use_gpu): + self._not(x, use_gpu) + self._compareBinary(x, y, np.logical_and, math_ops.logical_and, use_gpu) + self._compareBinary(x, y, np.logical_or, math_ops.logical_or, use_gpu) + self._compareBinary(x, y, np.logical_xor, math_ops.logical_xor, use_gpu) def testBCast(self): shapes = [ @@ -277,18 +289,22 @@ class LogicalOpTest(test.TestCase): x = np.random.randint(0, 2, np.prod(xs)).astype(np.bool).reshape(xs) y = np.random.randint(0, 2, np.prod(ys)).astype(np.bool).reshape(ys) for use_gpu in [True, False]: - self._compareBinary(x, y, np.logical_and, math_ops.logical_and, use_gpu) - self._compareBinary(x, y, np.logical_or, math_ops.logical_or, use_gpu) - self._compareBinary(x, y, np.logical_xor, math_ops.logical_xor, use_gpu) + with self.subTest(xs=xs, ys=ys, use_gpu=use_gpu): + self._compareBinary(x, y, np.logical_and, math_ops.logical_and, + use_gpu) + self._compareBinary(x, y, np.logical_or, math_ops.logical_or, 
use_gpu) + self._compareBinary(x, y, np.logical_xor, math_ops.logical_xor, + use_gpu) @test_util.run_deprecated_v1 def testShapeMismatch(self): x = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2) y = np.random.randint(0, 2, 6).astype(np.bool).reshape(3, 2, 1) for f in [math_ops.logical_and, math_ops.logical_or, math_ops.logical_xor]: - with self.assertRaisesWithPredicateMatch( - ValueError, lambda e: "Dimensions must" in str(e)): - f(x, y) + with self.subTest(f=f): + with self.assertRaisesWithPredicateMatch( + ValueError, lambda e: "Dimensions must" in str(e)): + f(x, y) @test_util.run_deprecated_v1 def testUsingAsPythonValueFails(self): @@ -389,11 +405,12 @@ class SelectOpTest(test.TestCase): np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64, np.complex128 ]: - xt = x.astype(t) - yt = y.astype(t) - self._compare(fn, c, xt, yt, use_gpu=False) - if t in [np.float16, np.float32, np.float64]: - self._compare(fn, c, xt, yt, use_gpu=True) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + self._compare(fn, c, xt, yt, use_gpu=False) + if t in [np.float16, np.float32, np.float64]: + self._compare(fn, c, xt, yt, use_gpu=True) def testScalar(self): self._testScalar(array_ops.where) @@ -404,11 +421,12 @@ class SelectOpTest(test.TestCase): np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64, np.complex128 ]: - xt = x.astype(t) - yt = y.astype(t) - self._compare(fn, c, xt, yt, use_gpu=False) - if t in [np.float16, np.float32, np.float64]: - self._compare(fn, c, xt, yt, use_gpu=True) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + self._compare(fn, c, xt, yt, use_gpu=False) + if t in [np.float16, np.float32, np.float64]: + self._compare(fn, c, xt, yt, use_gpu=True) def testScalarBroadcast(self): c = True @@ -450,11 +468,12 @@ class SelectOpTest(test.TestCase): np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64, np.complex128 ]: - xt = x.astype(t) - yt = y.astype(t) - self._compare(fn, c, xt, yt, use_gpu=False) - if t in [np.float16, np.float32, np.float64]: - self._compare(fn, c, xt, yt, use_gpu=True) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + self._compare(fn, c, xt, yt, use_gpu=False) + if t in [np.float16, np.float32, np.float64]: + self._compare(fn, c, xt, yt, use_gpu=True) def testBasic(self): self._testBasic(array_ops.where) @@ -465,11 +484,12 @@ class SelectOpTest(test.TestCase): np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64, np.complex128 ]: - xt = x.astype(t) - yt = y.astype(t) - self._compare(fn, c, xt, yt, use_gpu=False) - if t in [np.float16, np.float32, np.float64]: - self._compare(fn, c, xt, yt, use_gpu=True) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + self._compare(fn, c, xt, yt, use_gpu=False) + if t in [np.float16, np.float32, np.float64]: + self._compare(fn, c, xt, yt, use_gpu=True) def testBasicBroadcast(self): c0 = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2) @@ -478,53 +498,55 @@ class SelectOpTest(test.TestCase): c3 = np.random.randint(0, 2, 1).astype(np.bool).reshape(1, 1, 1) for c in [c0, c1, c2, c3]: # where_v2 only - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 1, 1) * 100 - self._testBasicBroadcast(array_ops.where_v2, c, x, y) - self._testBasicBroadcast(array_ops.where_v2, c, y, x) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 3, 1) * 100 - self._testBasicBroadcast(array_ops.where_v2, c, x, y) - self._testBasicBroadcast(array_ops.where_v2, c, y, x) - x = 
np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 1, 2) * 100 - self._testBasicBroadcast(array_ops.where_v2, c, x, y) - self._testBasicBroadcast(array_ops.where_v2, c, y, x) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 1) * 100 - self._testBasicBroadcast(array_ops.where_v2, c, x, y) - self._testBasicBroadcast(array_ops.where_v2, c, y, x) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1) * 100 - self._testBasicBroadcast(array_ops.where_v2, c, x, y) - self._testBasicBroadcast(array_ops.where_v2, c, y, x) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 2) * 100 - self._testBasicBroadcast(array_ops.where_v2, c, x, y) - self._testBasicBroadcast(array_ops.where_v2, c, y, x) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(3, 2) * 100 - self._testBasicBroadcast(array_ops.where_v2, c, x, y) - self._testBasicBroadcast(array_ops.where_v2, c, y, x) + with self.subTest(c=c): + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 1, 1) * 100 + self._testBasicBroadcast(array_ops.where_v2, c, x, y) + self._testBasicBroadcast(array_ops.where_v2, c, y, x) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 3, 1) * 100 + self._testBasicBroadcast(array_ops.where_v2, c, x, y) + self._testBasicBroadcast(array_ops.where_v2, c, y, x) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 1, 2) * 100 + self._testBasicBroadcast(array_ops.where_v2, c, x, y) + self._testBasicBroadcast(array_ops.where_v2, c, y, x) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 1) * 100 + self._testBasicBroadcast(array_ops.where_v2, c, x, y) + self._testBasicBroadcast(array_ops.where_v2, c, y, x) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1) * 100 + self._testBasicBroadcast(array_ops.where_v2, c, x, y) + self._testBasicBroadcast(array_ops.where_v2, c, y, x) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 2) * 100 + self._testBasicBroadcast(array_ops.where_v2, c, x, y) + self._testBasicBroadcast(array_ops.where_v2, c, y, x) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(3, 2) * 100 + self._testBasicBroadcast(array_ops.where_v2, c, x, y) + self._testBasicBroadcast(array_ops.where_v2, c, y, x) def _testGradients(self, fn): c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2) x = np.random.rand(1, 3, 2) * 100 y = np.random.rand(1, 3, 2) * 100 for t in [np.float16, np.float32, np.float64]: - xt = x.astype(t) - yt = y.astype(t) - if t == np.float16: - # Compare fp16 theoretical gradients to fp32 numerical gradients, - # since fp16 numerical gradients are too imprecise unless great - # care is taken with choosing the inputs and the delta. This is - # a weaker check (in particular, it does not test the op itself, - # only its gradient), but it's much better than nothing. - self._compareGradientX(fn, c, xt, yt, np.float) - self._compareGradientY(fn, c, xt, yt, np.float) - else: - self._compareGradientX(fn, c, xt, yt) - self._compareGradientY(fn, c, xt, yt) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + if t == np.float16: + # Compare fp16 theoretical gradients to fp32 numerical gradients, + # since fp16 numerical gradients are too imprecise unless great + # care is taken with choosing the inputs and the delta. This is + # a weaker check (in particular, it does not test the op itself, + # only its gradient), but it's much better than nothing. 
+ self._compareGradientX(fn, c, xt, yt, np.float) + self._compareGradientY(fn, c, xt, yt, np.float) + else: + self._compareGradientX(fn, c, xt, yt) + self._compareGradientY(fn, c, xt, yt) @test_util.run_deprecated_v1 def testGradients(self): @@ -536,27 +558,28 @@ class SelectOpTest(test.TestCase): c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2) for t in [np.float32, np.float64]: # where_v2 only - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 1, 1) * 100 - self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 3, 1) * 100 - self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 1, 2) * 100 - self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 1) * 100 - self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1) * 100 - self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(1, 2) * 100 - self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) - x = np.random.rand(1, 3, 2) * 100 - y = np.random.rand(3, 2) * 100 - self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) + with self.subTest(t=t): + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 1, 1) * 100 + self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 3, 1) * 100 + self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 1, 2) * 100 + self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 1) * 100 + self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1) * 100 + self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(1, 2) * 100 + self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) + x = np.random.rand(1, 3, 2) * 100 + y = np.random.rand(3, 2) * 100 + self._compareGradientX(array_ops.where_v2, c, x.astype(t), y.astype(t)) def _testShapeMismatch(self, fn): c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2) @@ -566,10 +589,11 @@ class SelectOpTest(test.TestCase): np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64, np.complex128 ]: - xt = x.astype(t) - yt = y.astype(t) - with self.assertRaises(ValueError): - fn(c, xt, yt) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + with self.assertRaises(ValueError): + fn(c, xt, yt) @test_util.run_deprecated_v1 def testShapeMismatch(self): @@ -597,9 +621,10 @@ class SelectOpTest(test.TestCase): for c in False, True: for a in 7.0, np.nan: for b in 5.0, np.nan: - x = fn(c, a, b).eval() - y = a if c else b - self.assertEqual(np.isnan(x), np.isnan(y)) + with self.subTest(c=c, a=a, b=b): + x = fn(c, a, b).eval() + y = a if c else b + self.assertEqual(np.isnan(x), np.isnan(y)) @test_util.run_deprecated_v1 def testNan(self): @@ -677,11 +702,12 @@ class BatchSelectOpTest(test.TestCase): np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64, np.complex128 ]: - xt = x.astype(t) - yt = y.astype(t) - 
self._compare(c, xt, yt, use_gpu=False) - if t in [np.float16, np.float32, np.float64]: - self._compare(c, xt, yt, use_gpu=True) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + self._compare(c, xt, yt, use_gpu=False) + if t in [np.float16, np.float32, np.float64]: + self._compare(c, xt, yt, use_gpu=True) @test_util.run_deprecated_v1 def testGradients(self): @@ -689,19 +715,20 @@ class BatchSelectOpTest(test.TestCase): x = np.random.rand(16, 2, 8) * 100 y = np.random.rand(16, 2, 8) * 100 for t in [np.float16, np.float32, np.float64]: - xt = x.astype(t) - yt = y.astype(t) - if t == np.float16: - # Compare fp16 theoretical gradients to fp32 numerical gradients, - # since fp16 numerical gradients are too imprecise unless great - # care is taken with choosing the inputs and the delta. This is - # a weaker check (in particular, it does not test the op itself, - # only its gradient), but it's much better than nothing. - self._compareGradientX(c, xt, yt, np.float) - self._compareGradientY(c, xt, yt, np.float) - else: - self._compareGradientX(c, xt, yt) - self._compareGradientY(c, xt, yt) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + if t == np.float16: + # Compare fp16 theoretical gradients to fp32 numerical gradients, + # since fp16 numerical gradients are too imprecise unless great + # care is taken with choosing the inputs and the delta. This is + # a weaker check (in particular, it does not test the op itself, + # only its gradient), but it's much better than nothing. + self._compareGradientX(c, xt, yt, np.float) + self._compareGradientY(c, xt, yt, np.float) + else: + self._compareGradientX(c, xt, yt) + self._compareGradientY(c, xt, yt) @test_util.run_deprecated_v1 def testShapeMismatch(self): @@ -712,10 +739,11 @@ class BatchSelectOpTest(test.TestCase): np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64, np.complex128 ]: - xt = x.astype(t) - yt = y.astype(t) - with self.assertRaises(ValueError): - array_ops.where(c, xt, yt) + with self.subTest(t=t): + xt = x.astype(t) + yt = y.astype(t) + with self.assertRaises(ValueError): + array_ops.where(c, xt, yt) class MinMaxOpTest(test.TestCase): @@ -735,23 +763,26 @@ class MinMaxOpTest(test.TestCase): y = np.random.rand(1, 3, 2) * 100. for t in [np.float16, np.float32, np.float64, np.uint8, np.int16, np.int32, np.int64]: - self._compare(x.astype(t), y.astype(t), use_gpu=False) - self._compare(x.astype(t), y.astype(t), use_gpu=True) + with self.subTest(t=t): + self._compare(x.astype(t), y.astype(t), use_gpu=False) + self._compare(x.astype(t), y.astype(t), use_gpu=True) def testDifferentShapes(self): x = np.random.rand(1, 3, 2) * 100. y = np.random.rand(2) * 100. # should broadcast for t in [np.float16, np.float32, np.float64, np.int32, np.int64]: - self._compare(x.astype(t), y.astype(t), use_gpu=False) - self._compare(x.astype(t), y.astype(t), use_gpu=True) + with self.subTest(t=t): + self._compare(x.astype(t), y.astype(t), use_gpu=False) + self._compare(x.astype(t), y.astype(t), use_gpu=True) def testScalar(self): x = np.random.rand(1, 3, 2) * 100. y = np.random.rand(1).item() * 100. 
# should broadcast # dropped np.float64, int64 because TF automatically converts to 32 bit for t in [np.float32, np.int32]: - self._compare(x.astype(t), t(y), use_gpu=False) - self._compare(x.astype(t), t(y), use_gpu=True) + with self.subTest(t=t): + self._compare(x.astype(t), t(y), use_gpu=False) + self._compare(x.astype(t), t(y), use_gpu=True) def _compareGradientX(self, func, x, y): with self.cached_session(): @@ -841,13 +872,15 @@ class MathOpsOverloadTest(test.TestCase): ] for dtype in dtypes: for np_func, tf_func in funcs: - if dtype in (dtypes_lib.complex64, - dtypes_lib.complex128) and tf_func == _FLOORDIV: - continue # floordiv makes no sense for complex - self._compareBinary(10, 5, dtype, np_func, tf_func) + with self.subTest(dtype=dtype, np_func=np_func, tf_func=tf_func): + if dtype in (dtypes_lib.complex64, + dtypes_lib.complex128) and tf_func == _FLOORDIV: + continue # floordiv makes no sense for complex + self._compareBinary(10, 5, dtype, np_func, tf_func) # Mod only works for int32 and int64. for dtype in [dtypes_lib.int32, dtypes_lib.int64]: - self._compareBinary(10, 3, dtype, np.mod, _MOD) + with self.subTest(dtype=dtype): + self._compareBinary(10, 3, dtype, np.mod, _MOD) def testOverloadComparisons(self): dtypes = [ @@ -865,18 +898,20 @@ class MathOpsOverloadTest(test.TestCase): ] for dtype in dtypes: for np_func, tf_func in funcs: - self._compareBinary(10, 5, dtype, np_func, tf_func) + with self.subTest(dtype=dtype, np_func=np_func, tf_func=tf_func): + self._compareBinary(10, 5, dtype, np_func, tf_func) logical_funcs = [(np.logical_and, _AND), (np.logical_or, _OR), (np.logical_xor, _XOR), (np.equal, math_ops.equal), (np.not_equal, math_ops.not_equal)] for np_func, tf_func in logical_funcs: - self._compareBinary(True, False, dtypes_lib.bool, np_func, tf_func) - self._compareBinary(True, True, dtypes_lib.bool, np_func, tf_func) - self._compareBinary(False, False, dtypes_lib.bool, np_func, tf_func) - self._compareBinary(False, True, dtypes_lib.bool, np_func, tf_func) - self._compareBinary([True, True, False, False], - [True, False, True, False], dtypes_lib.bool, np_func, - tf_func) + with self.subTest(np_func=np_func, tf_func=tf_func): + self._compareBinary(True, False, dtypes_lib.bool, np_func, tf_func) + self._compareBinary(True, True, dtypes_lib.bool, np_func, tf_func) + self._compareBinary(False, False, dtypes_lib.bool, np_func, tf_func) + self._compareBinary(False, True, dtypes_lib.bool, np_func, tf_func) + self._compareBinary([True, True, False, False], + [True, False, True, False], dtypes_lib.bool, + np_func, tf_func) self._compareUnary(True, dtypes_lib.bool, np.logical_not, _INV) self._compareUnary(False, dtypes_lib.bool, np.logical_not, _INV) self._compareUnary([True, False], dtypes_lib.bool, np.logical_not, _INV) @@ -924,16 +959,17 @@ class IsFiniteInfNanTest(test.TestCase): # It is not accurate for very large arguments, so we test for # fi.max/100 instead of fi.max here. 
for value in [fi.min, -2, -1, 0, fi.tiny, 1, 2, 1000, fi.max / 100]: - x = np.full((size,), value, dtype=dtype) - np_y = np.sqrt(x) - np_nan = np.isnan(np_y) - with test_util.use_gpu(): - tf_y = math_ops.sqrt(x) - tf_nan = math_ops.is_nan(tf_y) - if value < 0: - self.assertAllEqual(np_nan, self.evaluate(tf_nan)) - else: - self.assertAllCloseAccordingToType(np_y, self.evaluate(tf_y)) + with self.subTest(dtype=dtype, size=size, value=value): + x = np.full((size,), value, dtype=dtype) + np_y = np.sqrt(x) + np_nan = np.isnan(np_y) + with test_util.use_gpu(): + tf_y = math_ops.sqrt(x) + tf_nan = math_ops.is_nan(tf_y) + if value < 0: + self.assertAllEqual(np_nan, self.evaluate(tf_nan)) + else: + self.assertAllCloseAccordingToType(np_y, self.evaluate(tf_y)) class RoundingTest(test.TestCase): @@ -978,7 +1014,8 @@ class RoundingTest(test.TestCase): def testTypes(self): self.skipTest("b/131162241") for dtype in [np.float16, np.float32, np.float64]: - self._testDtype(dtype) + with self.subTest(dtype=dtype): + self._testDtype(dtype) class ComplexMakeRealImagTest(test.TestCase): @@ -999,19 +1036,21 @@ class ComplexMakeRealImagTest(test.TestCase): real = (np.arange(-3, 3) / 4.).reshape([1, 3, 2]).astype(np.float32) imag = (np.arange(-3, 3) / 5.).reshape([1, 3, 2]).astype(np.float32) for use_gpu in [False, True]: - self._compareMake(real, imag, use_gpu) - self._compareMake(real, 12.0, use_gpu) - self._compareMake(23.0, imag, use_gpu) + with self.subTest(use_gpu=use_gpu): + self._compareMake(real, imag, use_gpu) + self._compareMake(real, 12.0, use_gpu) + self._compareMake(23.0, imag, use_gpu) def testRealImagNumericType(self): for use_gpu in [True, False]: for value in [1., 1j, 1. + 1j]: - np_real, np_imag = np.real(value), np.imag(value) - with test_util.device(use_gpu=use_gpu): - tf_real = math_ops.real(value) - tf_imag = math_ops.imag(value) - self.assertAllEqual(np_real, self.evaluate(tf_real)) - self.assertAllEqual(np_imag, self.evaluate(tf_imag)) + with self.subTest(use_gpu=use_gpu, value=value): + np_real, np_imag = np.real(value), np.imag(value) + with test_util.device(use_gpu=use_gpu): + tf_real = math_ops.real(value) + tf_imag = math_ops.imag(value) + self.assertAllEqual(np_real, self.evaluate(tf_real)) + self.assertAllEqual(np_imag, self.evaluate(tf_imag)) def _compareRealImag(self, cplx, use_gpu): np_real, np_imag = np.real(cplx), np.imag(cplx) @@ -1079,9 +1118,10 @@ class ComplexMakeRealImagTest(test.TestCase): def testRealReal(self): for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32, dtypes_lib.float64): - x = array_ops.placeholder(dtype) - y = math_ops.real(x) - self.assertEqual(x, y) + with self.subTest(dtype=dtype): + x = array_ops.placeholder(dtype) + y = math_ops.real(x) + self.assertEqual(x, y) def _compareConj(self, cplx, use_gpu): np_ans = np.conj(cplx) @@ -1110,9 +1150,10 @@ class ComplexMakeRealImagTest(test.TestCase): def testConjReal(self): for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64): - x = array_ops.placeholder(dtype) - y = math_ops.conj(x) - self.assertEqual(x, y) + with self.subTest(dtype=dtype): + x = array_ops.placeholder(dtype) + y = math_ops.conj(x) + self.assertEqual(x, y) @test_util.run_deprecated_v1 def testConjString(self): @@ -1146,10 +1187,11 @@ class ComplexMakeRealImagTest(test.TestCase): epsilon = 1e-3 with self.cached_session(): for args in [(x_, 0.), (0., x_)]: - z = math_ops.reduce_sum(math_ops.abs(math_ops.complex(*args))) - jacob_t, jacob_n = 
gradient_checker.compute_gradient( - x_, list(x.shape), z, [1], x_init_value=x, delta=epsilon) - self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon) + with self.subTest(args=args): + z = math_ops.reduce_sum(math_ops.abs(math_ops.complex(*args))) + jacob_t, jacob_n = gradient_checker.compute_gradient( + x_, list(x.shape), z, [1], x_init_value=x, delta=epsilon) + self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon) @test_util.run_deprecated_v1 def testGradient(self): @@ -1208,7 +1250,8 @@ class PolyvalTest(test.TestCase): np.int32, np.float32, np.float64, np.complex64, np.complex128 ]: for degree in range(5): - self._runtest(dtype, degree) + with self.subTest(dtype=dtype, degree=degree): + self._runtest(dtype, degree) def testBroadcast(self): dtype = np.float32 @@ -1216,15 +1259,16 @@ class PolyvalTest(test.TestCase): shapes = [(1,), (2, 1), (1, 2), (2, 2)] for x_shape in shapes: for coeff_shape in shapes: - x = np.random.rand(*x_shape).astype(dtype) - coeffs = [ - np.random.rand(*coeff_shape).astype(dtype) - for _ in range(degree + 1) - ] - np_val = np.polyval(coeffs, x) - with self.cached_session(): - tf_val = math_ops.polyval(coeffs, x) - self.assertAllClose(np_val, self.evaluate(tf_val)) + with self.subTest(x_shape=x_shape, coeff_shape=coeff_shape): + x = np.random.rand(*x_shape).astype(dtype) + coeffs = [ + np.random.rand(*coeff_shape).astype(dtype) + for _ in range(degree + 1) + ] + np_val = np.polyval(coeffs, x) + with self.cached_session(): + tf_val = math_ops.polyval(coeffs, x) + self.assertAllClose(np_val, self.evaluate(tf_val)) def testEmpty(self): x = np.random.rand(2, 2).astype(np.float32) From 2d529fbf9de3678ec85bdaebf6ff321b49288522 Mon Sep 17 00:00:00 2001 From: Andrew Cavanaugh Date: Thu, 7 May 2020 16:31:22 -0400 Subject: [PATCH 0144/1533] Prcandidate1 (#2) * Initial commit of XCORE port --- .../lite/micro/testing/test_xcore_binary.sh | 47 +++++++++++++++++++ .../micro/tools/make/download_and_extract.sh | 3 ++ .../tools/make/targets/xcore_makefile.inc | 24 ++++++++++ tensorflow/lite/micro/xcore/README.md | 32 +++++++++++++ tensorflow/lite/micro/xcore/debug_log.cc | 17 +++++++ 5 files changed, 123 insertions(+) create mode 100755 tensorflow/lite/micro/testing/test_xcore_binary.sh create mode 100644 tensorflow/lite/micro/tools/make/targets/xcore_makefile.inc create mode 100644 tensorflow/lite/micro/xcore/README.md create mode 100644 tensorflow/lite/micro/xcore/debug_log.cc diff --git a/tensorflow/lite/micro/testing/test_xcore_binary.sh b/tensorflow/lite/micro/testing/test_xcore_binary.sh new file mode 100755 index 00000000000..e059968c885 --- /dev/null +++ b/tensorflow/lite/micro/testing/test_xcore_binary.sh @@ -0,0 +1,47 @@ +#!/bin/bash -e +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Tests an XS3 binary by executing it using the XSIM simulator and parsing +# the log output. 
+# +# First argument is the binary location. +# Second argument is a regular expression that's required to be in the output +# logs for the test to pass. + +declare -r ROOT_DIR=`pwd` +declare -r TEST_TMPDIR=/tmp/test_xcore_binary/ +declare -r MICRO_LOG_PATH=${TEST_TMPDIR}/$1 +declare -r MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt +declare -r XCORE_ +mkdir -p ${MICRO_LOG_PATH} + +# Get the location of this script file as an absolute path +SCRIPT_PATH="`dirname \"$BASH_SOURCE\"`" +SCRIPT_PATH="`( cd \"$SCRIPT_PATH\" && pwd )`" +XSIM_FLAGS="" + + +xsim $1 ${XSIM_FLAGS} 2>&1 | tee ${MICRO_LOG_FILENAME} + +if grep -q "$2" ${MICRO_LOG_FILENAME} +then + echo "$1: PASS" + exit 0 +else + echo "$1: FAIL - '$2' not found in logs." + exit 1 +fi + diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 2248031f6d1..5a673985cdd 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -137,6 +137,9 @@ download_and_extract() { exit 1 fi + # delete anything after the '?' in a url that might confound f + url=$(echo "${url}" | sed "s/\?.*//") + if [[ "${url}" == *gz ]]; then tar -C "${dir}" --strip-components=1 -xzf ${tempfile} elif [[ "${url}" == *tar.xz ]]; then diff --git a/tensorflow/lite/micro/tools/make/targets/xcore_makefile.inc b/tensorflow/lite/micro/tools/make/targets/xcore_makefile.inc new file mode 100644 index 00000000000..9a0f7463688 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/xcore_makefile.inc @@ -0,0 +1,24 @@ +# Settings for XMOS XS3 based processors (xcore.ai, ...) + +#IMPORTANT: to set up environment variables correctly run the following from the top tensorflow directory: +# $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" clean clean_downloads test +# $ pushd tensorflow/lite/micro/tools/make/downloads/xtimecomposer/xTIMEcomposer/15.0.0/ && source SetEnv && popd +# $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test + +ifeq ($(TARGET), xcore) + XTIME_URL := "https://www.xmos.com/download/Tools-15---Linux-64%2815.0.0_rc1%29.tgz?key=132D-9DC9-E913-0229-ECE6-D5AB-F511-2B19" + XTIME_MD5 := "8f6543c8ac4af7583edf75e62df322a2" + $(eval $(call add_third_party_download,$(XTIME_URL),$(XTIME_MD5),xtimecomposer)) + PLATFORM_FLAGS = -target=XU316-1024-FB265-C32 -mcmodel=large -Os -DXCORE -Wno-xcore-fptrgroup -report + CXX_TOOL := xcc + CC_TOOL := xcc + AR_TOOL := xmosar + override CXXFLAGS := -std=c++11 -g -DTF_LITE_STATIC_MEMORY -DNDEBUG + override CXXFLAGS += $(PLATFORM_FLAGS) + override CCFLAGS := -g -DTF_LITE_STATIC_MEMORY -DNDEBUG + override CCFLAGS += $(PLATFORM_FLAGS) + TARGET_ARCH := xcore + #TARGET_TOOLCHAIN_PREFIX := tensorflow/lite/micro/tools/make/downloads/xtimecomposer/bin/ + TEST_SCRIPT := tensorflow/lite/micro/testing/test_xcore_binary.sh + #GCC_XCORE := $(MAKEFILE_DIR)/downloads/xtimecomposer/bin/ +endif diff --git a/tensorflow/lite/micro/xcore/README.md b/tensorflow/lite/micro/xcore/README.md new file mode 100644 index 00000000000..bc217dce2fd --- /dev/null +++ b/tensorflow/lite/micro/xcore/README.md @@ -0,0 +1,32 @@ +# Quickstart to install tools and run unit tests: + + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" clean clean_downloads && make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test_greedy_memory_planner_test || true && pushd tensorflow/lite/micro/tools/make/downloads/xtimecomposer/xTIMEcomposer/15.0.0/ && source SetEnv && 
popd && make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test + +(add -jN to the final make command to run builds / tests in N parallel threads) + +# Background information: + +* To start from a fresh repo (this will also remove non-xcore builds and downloads): +``` + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" clean clean_downloads +``` +* To force xcore.ai tools download from a clean repo: +``` + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test_greedy_memory_planner_test +``` +(this will fail to build the test, but if it succeeds because you already have tools it will exit quickly) + +* To set up environment variables correctly run the following from the top tensorflow directory: +``` + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test + $ pushd ./tensorflow/lite/micro/tools/make/downloads/xtimecomposer/xTIMEcomposer/15.0.0/ && source SetEnv && popd + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test +``` +* Assuming tools are already set up the following are the most commonly used commands: +``` + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" build + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" < name_of_example i.e. hello_world_test > +``` + + diff --git a/tensorflow/lite/micro/xcore/debug_log.cc b/tensorflow/lite/micro/xcore/debug_log.cc new file mode 100644 index 00000000000..95ef8df6b05 --- /dev/null +++ b/tensorflow/lite/micro/xcore/debug_log.cc @@ -0,0 +1,17 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/debug_log.h" + + +#include +extern "C" void DebugLog(const char* s) { printf("%s",s); } From 2acaff3d89dc3b2b98c4e0ea89b8b5b86588298f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 13:23:53 -0700 Subject: [PATCH 0145/1533] Branch TransformLandmarks operation to TransformLandmarksV2. 
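
A hedged conversion-side sketch of how a graph containing this custom op typically reaches the GPU delegate: the op survives TFLite conversion only when custom ops are allowed, and the delegate then claims it purely by the "TransformLandmarksV2" registration name added below. The saved-model path is hypothetical, and producing a graph that contains this op in the first place is assumed to be done by external tooling.

```
import tensorflow as tf

# Hypothetical model directory; assumed to already contain the custom op.
converter = tf.lite.TFLiteConverter.from_saved_model("/path/to/landmark_model")
# Without this flag conversion fails on ops TFLite does not support natively;
# with it, the op is serialized under its custom name for a delegate to claim.
converter.allow_custom_ops = True
tflite_model = converter.convert()

with open("landmark_model.tflite", "wb") as f:
  f.write(tflite_model)
```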
PiperOrigin-RevId: 310424827 Change-Id: I4e8771dfb8060438ed23ba97d7177633ad6aba76 --- .../delegates/gpu/common/model_builder.cc | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 6f0a13bc1bd..c536e09d6b5 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2421,6 +2421,40 @@ class TransformLandmarksOperationParser : public TFLiteOperationParser { private: }; +class TransformLandmarksV2OperationParser : public TFLiteOperationParser { + public: + absl::Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, + /*runtime_inputs=*/2, /*outputs=*/1)); + return absl::OkStatus(); + } + + absl::Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader* reader) final { + Node* node = graph->NewNode(); + RETURN_IF_ERROR(reader->AddInput(node, 0)); // data + RETURN_IF_ERROR(reader->AddInput(node, 1)); // bbox + RETURN_IF_ERROR(reader->AddOutputs(node)); + std::string op_name = "transform_landmarks_v2"; + node->operation.type = op_name; + BHWC output_shape; + RETURN_IF_ERROR( + ParseCustomAttributes(op_name, tflite_node->custom_initial_data, + tflite_node->custom_initial_data_size, + &(node->operation.attributes), &output_shape)); + + auto output_value = graph->FindOutputs(node->id)[0]; + + output_value->tensor.shape = graph->FindInputs(node->id)[0]->tensor.shape; + return absl::OkStatus(); + } + + private: +}; + class Landmarks2TransformMatrixOperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, @@ -2672,6 +2706,9 @@ std::unique_ptr NewOperationParser( if (custom_name == "TransformLandmarks") { return std::make_unique(); } + if (custom_name == "TransformLandmarksV2") { + return std::make_unique(); + } if (custom_name == "Landmarks2TransformMatrix") { return std::make_unique(); } From 8fef4ea5745a1df32d9df193751558d23862655e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 13:28:41 -0700 Subject: [PATCH 0146/1533] [XLA] Correct WhileLoopInvariantCodeMotion log messages. 
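
For readers unfamiliar with the pass being logged here: loop-invariant code motion hoists computations whose operands do not change across iterations out of the loop body so they run once. A conceptual sketch in plain Python (not XLA's HLO or C++, just an illustration of the transformation the corrected log messages refer to):

```
def before(xs, a, b):
  total = 0
  for x in xs:
    k = a * b        # invariant: recomputed on every iteration
    total += x * k
  return total


def after(xs, a, b):
  k = a * b          # hoisted: computed once, outside the loop
  total = 0
  for x in xs:
    total += x * k
  return total
```

The patch itself only renames the VLOG strings so they report WhileLoopInvariantCodeMotion rather than WhileLoopConstantSinking; the pass behavior is unchanged.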
PiperOrigin-RevId: 310425877 Change-Id: I582ca2b53515d932c7bde72d4ec91986b1482c43 --- .../xla/service/while_loop_invariant_code_motion.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc index 2d33184b7d0..1111811d3a3 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc @@ -300,7 +300,7 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( } StatusOr WhileLoopInvariantCodeMotion::Run(HloModule* module) { - VLOG(2) << "HLO module before WhileLoopConstantSinking:"; + VLOG(2) << "HLO module before WhileLoopInvariantCodeMotion:"; XLA_VLOG_LINES(2, module->ToString()); bool changed = false; @@ -332,10 +332,10 @@ StatusOr WhileLoopInvariantCodeMotion::Run(HloModule* module) { } if (changed) { - VLOG(2) << "HLO module after WhileLoopConstantSinking:"; + VLOG(2) << "HLO module after WhileLoopInvariantCodeMotion:"; XLA_VLOG_LINES(2, module->ToString()); } else { - VLOG(2) << "HLO module unchanged after WhileLoopConstantSinking"; + VLOG(2) << "HLO module unchanged after WhileLoopInvariantCodeMotion"; } return changed; From d25465937101e2c5092a16efe0b27a72c13f4293 Mon Sep 17 00:00:00 2001 From: Qiao Zhang Date: Thu, 7 May 2020 13:36:37 -0700 Subject: [PATCH 0147/1533] Internal changes only. PiperOrigin-RevId: 310427329 Change-Id: Ica8470b3f73b63e0052c65f40a30c0398624ce44 --- tensorflow/python/eager/benchmarks_test.py | 132 ++++++++------------- 1 file changed, 48 insertions(+), 84 deletions(-) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 9630ce01ce9..f2f13279927 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -110,14 +110,12 @@ def run_benchmark(func, num_iters, execution_mode=None): class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): def __init__(self): - # TODO(b/153054118): Add tf.RandomUniform - if not context.is_tfrt_enabled(): - # used for multiply benchmarks - self._m_2 = random_ops.random_uniform([2]) + # used for multiply benchmarks + self._m_2 = random_ops.random_uniform([2]) - # used for matmul benchmarks - self._m_2_by_2 = random_ops.random_uniform((2, 2)) - self._m_100_by_784 = random_ops.random_uniform((100, 784)) + # used for matmul benchmarks + self._m_2_by_2 = random_ops.random_uniform((2, 2)) + self._m_100_by_784 = random_ops.random_uniform((100, 784)) self._num_iters_2_by_2 = 30000 self._num_iters_100_by_784 = 30000 @@ -319,17 +317,16 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): func = lambda: math_ops.multiply(m, m) self._run(func, num_iters) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("numpy() not supported") def benchmark_np_multiply(self): self._benchmark_np_multiply(self._m_2, 30000) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_multiply_CPU(self): with context.device(CPU): m = self._m_2.cpu() self._benchmark_tf_multiply(m, 30000) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_tf_multiply_GPU(self): if not context.num_gpus(): return @@ -337,13 +334,12 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): m = self._m_2.gpu() self._benchmark_tf_multiply(m, 30000) - 
@test_util.disable_tfrt("random ops not supported") def benchmark_tf_multiply_op_CPU(self): with context.device(CPU): m = self._m_2.cpu() self._benchmark_tf_multiply_op(m, 30000) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_tf_multiply_op_GPU(self): if not context.num_gpus(): return @@ -351,7 +347,6 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): m = self._m_2.gpu() self._benchmark_tf_multiply_op(m, 30000) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_identity(self): m = self._m_2 self._run(lambda: gen_array_ops.identity(m), 30000) @@ -360,7 +355,6 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): def benchmark_slowpath_tf_identity(self): self._run(lambda: gen_array_ops.identity(1), 30000) - @test_util.disable_tfrt("random ops not supported") def benchmark_tfe_py_execute_identity(self): m = self._m_2 ctx_handle = context.context()._handle @@ -498,19 +492,17 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(m.value, num_iters) # Benchmarks for A^2, A of dimension 2 by 2. - @test_util.disable_tfrt("random ops not supported") def benchmark_np_matmul_2_by_2(self): self._benchmark_np_matmul( self._m_2_by_2, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_matmul_2_by_2_CPU(self): with context.device(CPU): m = self._m_2_by_2.cpu() self._benchmark_tf_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("async not supported") def benchmark_tf_matmul_2_by_2_CPU_async(self): with context.device(CPU): m = self._m_2_by_2.cpu() @@ -520,35 +512,32 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): num_iters=self._num_iters_2_by_2, execution_mode=context.ASYNC) - @test_util.disable_tfrt("random ops not supported") def benchmark_gen_math_ops_matmul_2_by_2_CPU(self): with context.device(CPU): m = self._m_2_by_2.cpu() self._benchmark_gen_math_ops_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_tfe_py_fastpath_execute_matmul_2_by_2_CPU(self): with context.device(CPU): m = self._m_2_by_2.cpu() self._benchmark_tfe_py_fastpath_execute_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_tfe_py_execute_matmul_2_by_2_CPU(self): with context.device(CPU): m = self._m_2_by_2.cpu() self._benchmark_tfe_py_execute_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("Mutex corrupt: waiting writer with no waiters") def benchmark_defun_matmul_2_by_2_CPU(self): with context.device(CPU): m = self._m_2_by_2.cpu() self._benchmark_defun_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("async not supported") def benchmark_defun_matmul_2_by_2_CPU_async(self): with context.device(CPU): m = self._m_2_by_2.cpu() @@ -558,14 +547,14 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): num_iters=self._num_iters_2_by_2, execution_mode=context.ASYNC) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("Mutex corrupt: waiting writer with no waiters") def 
benchmark_defun_matmul_forward_backward_2_by_2_CPU(self): with context.device(CPU): m = self._m_2_by_2.cpu() self._benchmark_defun_matmul_forward_backward( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("async not supported") def benchmark_defun_matmul_forward_backward_2_by_2_CPU_async(self): with context.device(CPU): m = self._m_2_by_2.cpu() @@ -575,7 +564,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): num_iters=self._num_iters_2_by_2, execution_mode=context.ASYNC) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_tf_matmul_2_by_2_GPU(self): if not context.num_gpus(): return @@ -584,7 +573,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_tf_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("async not supported") def benchmark_tf_matmul_2_by_2_GPU_async(self): if not context.num_gpus(): return @@ -596,7 +585,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): num_iters=self._num_iters_2_by_2, execution_mode=context.ASYNC) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_gen_math_ops_matmul_2_by_2_GPU(self): if not context.num_gpus(): return @@ -605,7 +594,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_gen_math_ops_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_tfe_py_execute_matmul_2_by_2_GPU(self): if not context.num_gpus(): return @@ -614,7 +603,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_tfe_py_execute_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_defun_matmul_2_by_2_GPU(self): if not context.num_gpus(): return @@ -623,7 +612,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_defun_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("async not supported") def benchmark_defun_matmul_2_by_2_GPU_async(self): if not context.num_gpus(): return @@ -635,28 +624,26 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): num_iters=self._num_iters_2_by_2, execution_mode=context.ASYNC) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("function not supported") def benchmark_nested_defun_matmul_2_by_2(self): m = self._m_2_by_2.cpu() self._benchmark_nested_defun_matmul( m, transpose_b=False, num_iters=self._num_iters_2_by_2) # Benchmarks for AA.T, A of dimension 100 by 784. 
- @test_util.disable_tfrt("random ops not supported") def benchmark_np_matmul_100_by_784(self): self._benchmark_np_matmul( self._m_100_by_784, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_matmul_100_by_784_CPU(self): with context.device(CPU): m = self._m_100_by_784.cpu() self._benchmark_tf_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("async not supported") def benchmark_tf_matmul_100_by_784_CPU_async(self): with context.device(CPU): m = self._m_100_by_784.cpu() @@ -666,35 +653,33 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): num_iters=self._num_iters_100_by_784, execution_mode=context.ASYNC) - @test_util.disable_tfrt("random ops not supported") def benchmark_gen_math_ops_matmul_100_by_784_CPU(self): with context.device(CPU): m = self._m_100_by_784.cpu() self._benchmark_gen_math_ops_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") def benchmark_tfe_py_fastpath_execute_matmul_100_by_784_CPU(self): with context.device(CPU): m = self._m_100_by_784.cpu() self._benchmark_tfe_py_fastpath_execute_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_tfe_py_execute_matmul_100_by_784_CPU(self): with context.device(CPU): m = self._m_100_by_784.cpu() self._benchmark_tfe_py_execute_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("function not supported") def benchmark_defun_matmul_100_by_784_CPU(self): with context.device(CPU): m = self._m_100_by_784.cpu() self._benchmark_defun_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_tf_matmul_100_by_784_GPU(self): if not context.num_gpus(): return @@ -703,7 +688,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_tf_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("async not supported") def benchmark_tf_matmul_100_by_784_GPU_async(self): if not context.num_gpus(): return @@ -715,7 +700,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): num_iters=self._num_iters_100_by_784, execution_mode=context.ASYNC) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_gen_math_ops_matmul_100_by_784_GPU(self): if not context.num_gpus(): return @@ -724,7 +709,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_gen_math_ops_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_tfe_py_execute_matmul_100_by_784_GPU(self): if not context.num_gpus(): return @@ -733,7 +718,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_tfe_py_execute_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def 
benchmark_defun_matmul_100_by_784_GPU(self): if not context.num_gpus(): return @@ -742,7 +727,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_defun_matmul( m, transpose_b=True, num_iters=self._num_iters_100_by_784) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_nested_defun_matmul_100_by_784(self): m = self._m_100_by_784.gpu() self._benchmark_nested_defun_matmul( @@ -815,35 +800,35 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): func() self._run(func, 3000) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_forwardprop_matmul_256_by_2096_CPU(self): self._benchmark_forwardprop_matmul_CPU(shape=(256, 2096)) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_forwardprop_in_defun_matmul_256_by_2096_CPU(self): self._benchmark_forwardprop_in_defun_matmul_CPU(shape=(256, 2096)) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_forwardprop_in_defun_of_defun_matmul_256_by_2096_CPU(self): self._benchmark_forwardprop_in_defun_of_defun_matmul_CPU(shape=(256, 2096)) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_forwardprop_of_defun_matmul_256_by_2096_CPU(self): self._benchmark_forwardprop_of_defun_matmul_CPU(shape=(256, 2096)) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_forwardprop_matmul_100_by_784_CPU(self): self._benchmark_forwardprop_matmul_CPU(shape=(100, 784)) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_forwardprop_in_defun_matmul_100_by_784_CPU(self): self._benchmark_forwardprop_in_defun_matmul_CPU(shape=(100, 784)) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_forwardprop_in_defun_of_defun_matmul_100_by_784_CPU(self): self._benchmark_forwardprop_in_defun_of_defun_matmul_CPU(shape=(100, 784)) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("defun not supported") def benchmark_forwardprop_of_defun_matmul_100_by_784_CPU(self): self._benchmark_forwardprop_of_defun_matmul_CPU(shape=(100, 784)) @@ -988,25 +973,20 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): func = lambda: array_ops.zeros_like(m) self._run(func, 3000) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_zeros_like_CPU(self): self._benchmark_tf_zeros_like(self._m_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_zeros_like_GPU(self): self._benchmark_tf_zeros_like(self._m_2_by_2, device=GPU) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_zeros_like_variable_CPU(self): m = resource_variable_ops.ResourceVariable(self._m_2_by_2) self._benchmark_tf_zeros_like(m) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_zeros_like_variable_GPU(self): m = resource_variable_ops.ResourceVariable(self._m_2_by_2) self._benchmark_tf_zeros_like(m, device=GPU) - @test_util.disable_tfrt("random ops not supported") def _benchmark_tf_random_uniform_2_by_2(self, shape=(2, 2), dtype=dtypes.int32, @@ -1018,30 +998,24 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(func, 
num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_random_uniform_2_by_2_integer_CPU(self): self._benchmark_tf_random_uniform_2_by_2() - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_random_uniform_2_by_2_integer_GPU(self): self._benchmark_tf_random_uniform_2_by_2(device=GPU) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_random_uniform_2_by_2_float_CPU(self): self._benchmark_tf_random_uniform_2_by_2(dtype=dtypes.float32) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_random_uniform_2_by_2_float_GPU(self): self._benchmark_tf_random_uniform_2_by_2( dtype=dtypes.float32, device=GPU) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_random_uniform_2_by_2_default_setting_CPU(self): with context.device(CPU): func = lambda: random_ops.random_uniform((2, 2)) self._run(func, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_random_uniform_2_by_2_default_setting_GPU(self): with context.device(GPU): func = lambda: random_ops.random_uniform((2, 2)) @@ -1063,19 +1037,15 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(func, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_dropout_scalar_rate_2_by_2_CPU(self): self._benchmark_tf_dropout_2_by_2(is_rate_tensor=False) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_dropout_scalar_rate_2_by_2_GPU(self): self._benchmark_tf_dropout_2_by_2(is_rate_tensor=False, device=GPU) - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_dropout_2_by_2_CPU(self): self._benchmark_tf_dropout_2_by_2() - @test_util.disable_tfrt("random ops not supported") def benchmark_tf_dropout_2_by_2_GPU(self): self._benchmark_tf_dropout_2_by_2(device=GPU) @@ -1088,25 +1058,25 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): func = lambda: array_ops.transpose(m, perm, conjugate) self._run(func, num_iters, execution_mode=execution_mode) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("ConvertToEagerTensorUncached error") def benchmark_tf_transpose_2_by_2_CPU(self): with context.device(CPU): m = self._m_2_by_2.cpu() self._benchmark_transpose(m, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_tf_transpose_2_by_2_GPU(self): with context.device(GPU): m = self._m_2_by_2.gpu() self._benchmark_transpose(m, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("ConvertToEagerTensorUncached error") def benchmark_tf_transpose_variable_2_by_2_CPU(self): with context.device(CPU): m = resource_variable_ops.ResourceVariable(self._m_2_by_2) self._benchmark_transpose(m, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("Cannot convert array to EagerTensor of dtype int32") def benchmark_tf_transpose_variable_2_by_2_GPU(self): with context.device(GPU): m = resource_variable_ops.ResourceVariable(self._m_2_by_2) @@ -1164,26 +1134,23 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): return defined(t1=t, t2=t, t3=t, t4=t, t5=t, t6=t, t7=t, t8=t) self._run(signature_computation, 30000) - @test_util.disable_tfrt("random ops not supported") def benchmark_matmul_read_variable_op_2_by_2_CPU(self): 
with context.device(CPU): m = resource_variable_ops.ResourceVariable(self._m_2_by_2) self._benchmark_matmul_read_variable(m, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_matmul_read_variable_op_with_tape_2_by_2_CPU(self): with context.device(CPU): m = resource_variable_ops.ResourceVariable(self._m_2_by_2) self._benchmark_matmul_read_variable_with_tape( m, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_read_variable_op_2_by_2_CPU(self): with context.device(CPU): m = resource_variable_ops.ResourceVariable(self._m_2_by_2) self._benchmark_read_variable(m, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_read_variable_op_2_by_2_GPU(self): if not context.num_gpus(): return @@ -1191,14 +1158,13 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): m = resource_variable_ops.ResourceVariable(self._m_2_by_2.gpu()) self._benchmark_read_variable(m, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") def benchmark_read_variable_op_with_tape_2_by_2_CPU(self): with context.device(CPU): m = resource_variable_ops.ResourceVariable(self._m_2_by_2) self._benchmark_read_variable_with_tape( m, num_iters=self._num_iters_2_by_2) - @test_util.disable_tfrt("random ops not supported") + @test_util.disable_tfrt("copy to GPU not supported") def benchmark_read_variable_op_with_tape_2_by_2_GPU(self): if not context.num_gpus(): return @@ -1228,7 +1194,6 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(scan, 100) - @test_util.disable_tfrt("add not supported, only add_v2") def benchmark_fastpath_conversion_type_inference(self): c = constant_op.constant(1., dtype=dtypes.float32) @@ -1268,7 +1233,6 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): xs = [[[np.linspace(0, 1, 21).tolist()] * 20] * 20] self._run(lambda: constant_op.constant(xs, dtype=dtypes.float64), 10000) - @test_util.disable_tfrt("tf.fill not supported") def benchmark_list_of_zeros_to_np_array(self): values = [] for _ in range(1000): @@ -1286,11 +1250,11 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): resources.append(resource_variable_ops.ResourceVariable(self._m_2)) self._run(lambda: add_all(resources), num_iters) - @test_util.disable_tfrt("Random uniform needs fallback") + @test_util.disable_tfrt("funtion not supported") def benchmarkFunctionWithFiveResourceInputs(self): self._benchmarkFunctionWithResourceInputs(5, 1000) - @test_util.disable_tfrt("Random uniform needs fallback") + @test_util.disable_tfrt("funtion not supported") def benchmarkFunctionWithFiveHundredResourceInputs(self): self._benchmarkFunctionWithResourceInputs(500, 100) @@ -1325,15 +1289,15 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): with context.device(CPU): self._run(benchmark_fn, 10) - @test_util.disable_tfrt("VarHandleOp needs fallback") + @test_util.disable_tfrt("funtion not supported") def benchmarkTenThousandResourceReadsInCondInInnerFunc(self): self._benchmarkResourceReadsInCondInInnerFunc(10000) - @test_util.disable_tfrt("VarHandleOp needs fallback") + @test_util.disable_tfrt("funtion not supported") def benchmarkHundredResourceReadsInCondInInnerFunc(self): self._benchmarkResourceReadsInCondInInnerFunc(100) - @test_util.disable_tfrt("VarHandleOp needs fallback") + @test_util.disable_tfrt("funtion not supported") def 
benchmarkTenResourceReadsInCondInInnerFunc(self): self._benchmarkResourceReadsInCondInInnerFunc(10) From 662f05dea1db0af3ad9a241cc10b552cfe07d15f Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Thu, 7 May 2020 13:39:41 -0700 Subject: [PATCH 0148/1533] Use multiple async executor to launch collectives in eager instead of tf.function This avoid the expensive tracing cost of tf.function. Note that tf.function strips the devices of inputs, so we have to capture the input tensors thus there's no way to cache the tf.function This makes _get_next_as_optional cheap enough to be enabled across the board. It can also remove the between step gap caused by reading metrics. PiperOrigin-RevId: 310427983 Change-Id: Ib336be3b7855828de4fa3ad505b8cf86026ead5a --- .../python/distribute/cross_device_ops.py | 68 +++++++++++-------- .../distribute/cross_device_ops_test.py | 62 +++++++++++++++++ .../python/distribute/cross_device_utils.py | 37 ++++++---- 3 files changed, 126 insertions(+), 41 deletions(-) diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 0a662908323..8c8970f4aeb 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import collections +import threading import enum import six @@ -31,7 +32,7 @@ from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import tpu_values from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import context -from tensorflow.python.eager import def_function +from tensorflow.python.eager import executor from tensorflow.python.framework import kernels from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util @@ -948,6 +949,20 @@ class CollectiveAllReduce(CrossDeviceOps): self._collective_keys = (collective_keys or cross_device_utils.CollectiveKeys()) self._communication = communication + # In a multi threaded eager program we need to ensure different groups of + # collectives don't interleave each other, otherwise there will be deadlock. + self._lock = threading.Lock() + + # Collective ops requires all devices to participate and is blocking. In + # eager, we need one async executor for each device to be able to launch + # them altogether. Note that async doesn't imply concurrency. Within an + # async executor operations are still executed sequentially. In graph or + # function building, the executors are not used. + self._executors = [] + for _ in range(self._num_gpus_per_worker or 1): + # If num_gpus_per_worker is zero, we assume there's only one device (CPU). + self._executors.append(executor.new_executor(enable_async=True)) + super(CollectiveAllReduce, self).__init__() @property @@ -1059,33 +1074,26 @@ class CollectiveAllReduce(CrossDeviceOps): "num_workers = %d, communication_hint = %s, num_packs = %d" % (batch_size, self._num_workers, communication, len(packs)), 10) - def batch_fn(): - """Wrapper function around batched all-reduce calls.""" - reduced_values = [] - for pack in packs: - # By placing all CollectiveReduce ops in a pack under single name scope, - # we ensure they will be picked up by the `ScopedAllocator` grappler - # optimizer and packed into a single all-reduce. 
- with ops.name_scope("allreduce"): - for per_replica in pack: - # Add control dependencies per device from the last gradients to the - # current set, in order to serialize NCCL launches. - if (communication == CollectiveCommunication.NCCL.value and - reduced_values): - control_inputs = [g for g in reduced_values[-1]] - else: - control_inputs = None - reduced_values.append( - cross_device_utils.build_collective_reduce( - per_replica.values, self._num_workers, - self._collective_keys, "Add", "Id", communication, - control_inputs)) - return reduced_values + reduced_values = [] + for pack in packs: + # By placing all CollectiveReduce ops in a pack under single name scope, + # we ensure they will be picked up by the `ScopedAllocator` grappler + # optimizer and packed into a single all-reduce. + with self._lock, ops.name_scope("allreduce"): + for per_replica in pack: + # Add control dependencies per device from the last gradients to the + # current set, in order to serialize NCCL launches. + if (communication == CollectiveCommunication.NCCL.value and + reduced_values): + control_inputs = list(reduced_values[-1]) + else: + control_inputs = None + reduced_values.append( + cross_device_utils.build_collective_reduce( + per_replica.values, self._num_workers, + self._collective_keys, "Add", "Id", communication, + control_inputs, executors=self._executors)) - if context.executing_eagerly(): - batch_fn = def_function.function(batch_fn) - - reduced_values = batch_fn() mirrored = [] # Reverse the order of reduced value to recover the order in the input. for value in reversed(reduced_values): @@ -1134,6 +1142,12 @@ class CollectiveAllReduce(CrossDeviceOps): mirrored.append(value_lib.regroup(value, wrap_class=value_lib.Mirrored)) return mirrored + def __deepcopy__(self, memo): + # distribute_coordinator deep-copies the strategy object, so + # CollectiveAllReduce needs to support deep copy as well. + return CollectiveAllReduce(self._num_workers, self._num_gpus_per_worker, + self._collective_keys, self._communication) + def choose_the_best(devices, session_config=None): """Find the best CrossDeviceOps locally given a `tf.compat.v1.ConfigProto`. 
diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index e1aa2bea97c..09de4306199 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -19,6 +19,9 @@ from __future__ import division from __future__ import print_function import itertools +import os +import threading +import time from absl.testing import parameterized import numpy as np @@ -39,6 +42,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import kernels from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import collective_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables @@ -835,6 +839,64 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, variable_length=variable_length, local_mode=True) + @combinations.generate( + combinations.combine( + required_gpus=2, + mode="eager", + communication=[ + CollectiveCommunication.NCCL, CollectiveCommunication.RING + ])) + def testEagerMultiThread(self, communication): + collective, devices, _ = self._get_test_objects( + None, + None, + num_gpus=2, + communication=communication, + use_strategy_object=False, + local_mode=True) + + # We would like to simulate the following sequence: + # thread-0 device0 device1 + # thread-1 device0 device1 + # If the kernel launch sequence is as-is the program will deadlock since + # NCCL requires the launch order to be same on each device. + v0 = _make_per_replica([1.0 for _ in devices], devices) + v1 = _make_per_replica([2.0 for _ in devices], devices) + + # Add a delay to collective_ops.all_reduce according to the input tensors + # index in `sequence.` + sequence = [v0.values[0], v1.values[0], v1.values[1], v0.values[1]] + all_reduce = collective_ops.all_reduce + + def delayed_all_reduce(input_tensor, *args, **kwargs): + for idx, v in enumerate(sequence): + if input_tensor is v: + time.sleep(idx) + break + return all_reduce(input_tensor, *args, **kwargs) + + with test.mock.patch.object(collective_ops, "all_reduce", + delayed_all_reduce): + # We only use NCCL for batch reduce with two or more values, so we use two + # values here. + + def thread_fn(): + reduced = collective.batch_reduce(reduce_util.ReduceOp.SUM, [(v0, v0), + (v0, v0)]) + self.assertAllEqual(reduced[0].values, [2.0, 2.0]) + self.assertAllEqual(reduced[1].values, [2.0, 2.0]) + + t = threading.Thread(target=thread_fn) + t.start() + reduced = collective.batch_reduce(reduce_util.ReduceOp.SUM, [(v1, v1), + (v1, v1)]) + self.assertAllEqual(reduced[0].values, [4.0, 4.0]) + self.assertAllEqual(reduced[1].values, [4.0, 4.0]) + t.join() + if __name__ == "__main__": + # Set default inter op thread pool size to one to ensure we don't exhaust the + # thread pool with the additional executors to run collectives in eager. 
+ os.environ["TF_NUM_INTEROP_THREADS"] = "1" test.main() diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py index f9917385b59..d7be93ae2c4 100644 --- a/tensorflow/python/distribute/cross_device_utils.py +++ b/tensorflow/python/distribute/cross_device_utils.py @@ -337,10 +337,12 @@ def build_collective_reduce(input_tensors, reduction_op='Add', unary_op='Id', communication_hint='AUTO', - control_inputs=None): + control_inputs=None, + executors=None): """Build a subgraph that does one full all-reduce, using the collective Op. - This method must be called in graph mode or inside a tf.function. + If called in eager mode, it's required to supply a list of async executors for + each input Tensor. Args: input_tensors: tensors within a single worker graph that are to be reduced @@ -355,6 +357,7 @@ def build_collective_reduce(input_tensors, implementation. control_inputs: if not None, add control edges between control_inputs and (index-wise) corresponding collective_reduce tensors + executors: a list of async executor. Required for eager execution. Returns: An array of final tensors, one per device, computed by the full reduction. @@ -362,9 +365,11 @@ def build_collective_reduce(input_tensors, Raises: ValueError: There must be at least two tensors over all the workers. """ - assert not context.executing_eagerly(), ( - 'build_collective_reduce can only be called in graph mode or inside ' - 'tf.function') + if context.executing_eagerly(): + if (not executors or len(executors) != len(input_tensors) or + not all(e.is_async() for e in executors)): + raise ValueError( + 'collectives requires async executors for each device in eager mode') group_size = len(input_tensors) * num_workers if group_size < 2: @@ -375,15 +380,19 @@ def build_collective_reduce(input_tensors, out_tensors = [] for idx, input_tensor in enumerate(input_tensors): - with ops.device(input_tensor.device): - with ops.control_dependencies( - _control_input(input_tensors, control_inputs, idx)): - out_tensor = collective_ops.all_reduce(input_tensor, group_size, - group_key, instance_key, - reduction_op, unary_op, - subdiv_offsets, - communication_hint) - out_tensors.append(out_tensor) + if context.executing_eagerly(): + executor_scope = context.executor_scope(executors[idx]) + else: + executor_scope = ops.NullContextmanager() + with executor_scope, \ + ops.device(input_tensor.device), \ + ops.control_dependencies( + _control_input(input_tensors, control_inputs, idx)): + out_tensor = collective_ops.all_reduce(input_tensor, group_size, + group_key, instance_key, + reduction_op, unary_op, + subdiv_offsets, communication_hint) + out_tensors.append(out_tensor) return out_tensors From 69ed1e9a24424fa296b22176165528eec2888c11 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 7 May 2020 13:40:41 -0700 Subject: [PATCH 0149/1533] Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/49f1aeb60d9f759859fce0d16aa5d1ecc7168d51 PiperOrigin-RevId: 310428269 Change-Id: If1048213ab92969db2c6b41f84435ecf2d47fa8e --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 56f36a7b004..77e2538a107 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -250,11 +250,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), - sha256 = "d96aa8eda6dbf80e313c992a59e9e9451f420a6b9f58ef30aa41bffdc9df2f1b", # SHARED_EIGEN_SHA - strip_prefix = "eigen-1e41406c362788057b3adcd9a25b73f43e6e6492", + sha256 = "2c7c0aec4271dfca6b8a7707e2112f67c4cb3bdf7c89c0e98d3fcd39707c4468", # SHARED_EIGEN_SHA + strip_prefix = "eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/1e41406c362788057b3adcd9a25b73f43e6e6492/eigen-1e41406c362788057b3adcd9a25b73f43e6e6492.tar.gz", - "https://gitlab.com/libeigen/eigen/-/archive/1e41406c362788057b3adcd9a25b73f43e6e6492/eigen-1e41406c362788057b3adcd9a25b73f43e6e6492.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/49f1aeb60d9f759859fce0d16aa5d1ecc7168d51/eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51.tar.gz", + "https://gitlab.com/libeigen/eigen/-/archive/49f1aeb60d9f759859fce0d16aa5d1ecc7168d51/eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51.tar.gz", ], ) From 5bf0bab331720d2b1cd1ff862b646bdef45b7206 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Thu, 7 May 2020 20:55:49 +0000 Subject: [PATCH 0150/1533] [ROCm] Fix for ROCm CSB breakage on 200507 The following PR/commit introduces a build error on the ROCm platform https://github.com/tensorflow/tensorflow/pull/38802 The error is caused by a call to the `CsrgemmBufferSize` routine which only exists on the CUDA side. The call to it was not guarded by the same #if block that guards the function declaration + definition. Adding the missing #if block fixes the issue. This PR also adds some explicit `GOOGLE_CUDA &&` and `|| TENSORFLOW_USE_ROCM` conditions to some `#if` to make things clear. --- tensorflow/core/kernels/cuda_sparse.h | 10 +++++----- tensorflow/core/kernels/sparse/mat_mul_op.cc | 16 +++++----------- .../core/kernels/sparse/sparse_mat_mul_op.cc | 16 +++++++++------- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/tensorflow/core/kernels/cuda_sparse.h b/tensorflow/core/kernels/cuda_sparse.h index eb69469b615..2d41cc72421 100644 --- a/tensorflow/core/kernels/cuda_sparse.h +++ b/tensorflow/core/kernels/cuda_sparse.h @@ -259,7 +259,7 @@ class GpuSparse { // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-coo2csr. Status Coo2csr(const int* cooRowInd, int nnz, int m, int* csrRowPtr) const; -#if CUDA_VERSION < 10020 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10020)) || TENSORFLOW_USE_ROCM // Sparse-dense matrix multiplication C = alpha * op(A) * op(B) + beta * C, // where A is a sparse matrix in CSR format, B and C are dense tall // matrices. 
This routine allows transposition of matrix B, which @@ -311,7 +311,7 @@ class GpuSparse { // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrmv_mergepath // // **NOTE** This is an in-place operation for data in y. -#if CUDA_VERSION < 10020 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10020)) || TENSORFLOW_USE_ROCM template Status Csrmv(gpusparseOperation_t transA, int m, int n, int nnz, const Scalar* alpha_host, const gpusparseMatDescr_t descrA, @@ -366,7 +366,7 @@ class GpuSparse { Scalar* csrSortedValC, int* csrSortedRowPtrC, int* csrSortedColIndC, void* workspace); -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) // Computes sparse-sparse matrix multiplication of matrices // stored in CSR format. This is part zero: calculate required workspace // size. @@ -383,7 +383,7 @@ class GpuSparse { // output. csrSortedRowPtrC must be preallocated on device with // m + 1 entries. See: // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgemm. -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM Status CsrgemmNnz(gpusparseOperation_t transA, gpusparseOperation_t transB, int m, int k, int n, const gpusparseMatDescr_t descrA, int nnzA, const int* csrSortedRowPtrA, @@ -408,7 +408,7 @@ class GpuSparse { // addition. csrValC and csrColIndC must be allocated on the device // with nnzTotalDevHostPtr entries (as calculated by CsrgemmNnz). See: // http://docs.nvidia.com/cuda/cusparse/index.html#cusparse-lt-t-gt-csrgemm. -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM template Status Csrgemm(gpusparseOperation_t transA, gpusparseOperation_t transB, int m, int k, int n, const gpusparseMatDescr_t descrA, diff --git a/tensorflow/core/kernels/sparse/mat_mul_op.cc b/tensorflow/core/kernels/sparse/mat_mul_op.cc index a0834800446..50fa0ec88ea 100644 --- a/tensorflow/core/kernels/sparse/mat_mul_op.cc +++ b/tensorflow/core/kernels/sparse/mat_mul_op.cc @@ -728,12 +728,14 @@ namespace { template struct GPUDataType; +// GPUDataType templates are currently not instantiated in the ROCm flow +// So leaving out the #elif TENSORFLOW_USE_ROCM blocks for now +// hipblas library is not (yet) being pulled in via rocm_configure.bzl +// so cannot reference tyeps from hipblas headers here template <> struct GPUDataType { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_R_16F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_R_16F; #endif }; @@ -741,8 +743,6 @@ template <> struct GPUDataType { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_R_32F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_R_32F; #endif }; @@ -750,8 +750,6 @@ template <> struct GPUDataType> { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_C_32F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_C_32F; #endif }; @@ -759,8 +757,6 @@ template <> struct GPUDataType { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_R_64F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_R_64F; #endif }; @@ -768,8 +764,6 @@ template <> struct GPUDataType> { #if GOOGLE_CUDA static constexpr cudaDataType_t type = CUDA_C_64F; -#elif TENSORFLOW_USE_ROCM - static constexpr hipblasDataType_t type = HIPBLAS_C_64F; #endif }; @@ -957,7 +951,7 @@ class CSRSparseMatrixMatVec { const int n = a.dense_shape_host(1); const int nnz = a.values.size(); DCHECK_EQ(nnz, a.col_ind.size()); -#if 
CUDA_VERSION >= 10020 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10020) TF_RETURN_IF_ERROR(cuda_sparse.Csrmv(transA_, m, n, nnz, &alpha, a.values.data(), a.row_ptr.data(), a.col_ind.data(), x, &beta, y)); diff --git a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc index 7325d5f6873..fb652e13d15 100644 --- a/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc +++ b/tensorflow/core/kernels/sparse/sparse_mat_mul_op.cc @@ -417,7 +417,7 @@ class CSRSparseMatMulGPUOp : public OpKernel { } auto b_input_dense_shape = b_input_matrix->dense_shape().vec(); -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) size_t maxWorkspaceSize = 0; for (int i = 0; i < batch_size; ++i) { // Calculate maximum workspace size over batch. @@ -558,7 +558,7 @@ struct CSRSparseSparseMatrixMatMul initialized_(false), transpose_a_(transpose_a), adjoint_a_(adjoint_a), -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM transpose_b_(transpose_b) { #else transpose_b_(transpose_b), @@ -573,7 +573,7 @@ struct CSRSparseSparseMatrixMatMul : GPUSPARSE(OPERATION_NON_TRANSPOSE); } -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) ~CSRSparseSparseMatrixMatMul() { if (initialized_) { cusparseDestroyCsrgemm2Info(info_); @@ -591,7 +591,7 @@ struct CSRSparseSparseMatrixMatMul TF_RETURN_IF_ERROR(descrA_.Initialize()); TF_RETURN_IF_ERROR(descrB_.Initialize()); TF_RETURN_IF_ERROR(descrC_.Initialize()); -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) TF_RETURN_IF_GPUSPARSE_ERROR(cusparseCreateCsrgemm2Info(&info_)); #endif initialized_ = true; @@ -600,6 +600,7 @@ struct CSRSparseSparseMatrixMatMul Status GetWorkspaceSize(const ConstCSRComponent& a, const ConstCSRComponent& b, size_t* bufferSize) { +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) DCHECK(initialized_); const int m = a.dense_shape_host(a.dense_shape_host.size() - (transpose_a_ ? 1 : 2)); @@ -621,6 +622,7 @@ struct CSRSparseSparseMatrixMatMul m, n, k, descrA_.descr(), nnzA, a.row_ptr.data(), a.col_ind.data(), descrB_.descr(), nnzB, b.row_ptr.data(), b.col_ind.data(), info_, bufferSize)); +#endif return Status::OK(); } @@ -650,7 +652,7 @@ struct CSRSparseSparseMatrixMatMul *output_nnz = -1; -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM TF_RETURN_IF_ERROR(cuda_sparse_.CsrgemmNnz( transA_, transB_, m, n, k, descrA_.descr(), nnzA, a.row_ptr.data(), a.col_ind.data(), descrB_.descr(), nnzB, b.row_ptr.data(), @@ -693,7 +695,7 @@ struct CSRSparseSparseMatrixMatMul b.dense_shape_host(b.dense_shape_host.size() - (transpose_b_ ? 2 : 1)); DCHECK_EQ(n, c->dense_shape_host(c->dense_shape_host.size() - 1)); -#if CUDA_VERSION < 10000 +#if (GOOGLE_CUDA && (CUDA_VERSION < 10000)) || TENSORFLOW_USE_ROCM TF_RETURN_IF_ERROR(cuda_sparse_.Csrgemm( transA_, transB_, m, k, n, descrA_.descr(), nnzA, a.values.data(), a.row_ptr.data(), a.col_ind.data(), descrB_.descr(), nnzB, @@ -732,7 +734,7 @@ struct CSRSparseSparseMatrixMatMul GpuSparseMatrixDescriptor descrC_; gpusparseOperation_t transA_; gpusparseOperation_t transB_; -#if CUDA_VERSION >= 10000 +#if GOOGLE_CUDA && (CUDA_VERSION >= 10000) csrgemm2Info_t info_; #endif }; From 917e8d35e54061bb5e51b496895572978f79cb15 Mon Sep 17 00:00:00 2001 From: Chenkai Kuang Date: Thu, 7 May 2020 14:06:49 -0700 Subject: [PATCH 0151/1533] Use strategy default device (if not None) as the destination for strategy.reduce. 
This fixes a bug where strategy.reduce is not called within a strategy.scope in MultiworkerMirroredStrategy, and the strategy incorrectly tries to reduce values to "/job:localhost/replica:0/task:0/device:CPU:0", which is not a valid device in MultiworkerMirroredStrategy. PiperOrigin-RevId: 310433243 Change-Id: Ifa365629366a79620c6b43e06375ab5747040275 --- tensorflow/python/distribute/BUILD | 22 +++++++ .../python/distribute/distribute_lib.py | 5 +- .../python/distribute/strategy_common_test.py | 65 +++++++++++++++++++ 3 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 tensorflow/python/distribute/strategy_common_test.py diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 5dccb47fb19..add23bef4b9 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1647,3 +1647,25 @@ py_test( "@absl_py//absl/testing:parameterized", ], ) + +cuda_py_test( + name = "strategy_common_test", + srcs = ["strategy_common_test.py"], + tags = [ + "multi_and_single_gpu", + # TODO(b/155301154): Enable this test on multi-gpu guitar once multi process + # runner can run on guitar. + "noguitar", + ], + xla_enable_strict_auto_jit = True, + deps = [ + ":combinations", + ":reduce_util", + ":strategy_combinations", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:dtypes", + "//tensorflow/python/eager:def_function", + "@absl_py//absl/testing:parameterized", + ], +) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index d17a594cb5e..6baa15f59c1 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1912,9 +1912,8 @@ class StrategyExtendedV2(object): def _reduce(self, reduce_op, value): # Default implementation until we have an implementation for each strategy. - return self._local_results( - self.reduce_to(reduce_op, value, - device_util.current() or "/device:CPU:0"))[0] + dst = device_util.current() or self._default_device or "/device:CPU:0" + return self._local_results(self.reduce_to(reduce_op, value, dst))[0] def reduce_to(self, reduce_op, value, destinations, experimental_hints=None): """Combine (via e.g. sum or mean) values across replicas. diff --git a/tensorflow/python/distribute/strategy_common_test.py b/tensorflow/python/distribute/strategy_common_test.py new file mode 100644 index 00000000000..c277310b6a0 --- /dev/null +++ b/tensorflow/python/distribute/strategy_common_test.py @@ -0,0 +1,65 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for common methods in strategy classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.eager import def_function +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class StrategyReduceTest(test.TestCase, parameterized.TestCase): + + @combinations.generate( + combinations.combine( + strategy=[strategy_combinations.multi_worker_mirrored_two_workers] + + strategy_combinations.strategies_minus_tpu, + mode=['eager'])) + def testSimpleReduce(self, strategy): + + def fn_eager(): + + def replica_fn(): + return array_ops.ones((), dtypes.float32) + + per_replica_value = strategy.run(replica_fn) + return strategy.reduce( + reduce_util.ReduceOp.SUM, value=per_replica_value, axis=None) + + fn_graph = def_function.function(fn_eager) + + # Run reduce under the strategy scope to explicitly enter + # strategy default_device scope. + with strategy.scope(): + self.assertEqual(fn_eager().numpy(), 1.0 * strategy.num_replicas_in_sync) + self.assertEqual(fn_graph().numpy(), 1.0 * strategy.num_replicas_in_sync) + + # Run reduce without a strategy scope to implicitly enter + # strategy default_device scope. + self.assertEqual(fn_eager().numpy(), 1.0 * strategy.num_replicas_in_sync) + self.assertEqual(fn_graph().numpy(), 1.0 * strategy.num_replicas_in_sync) + + +if __name__ == '__main__': + combinations.main() From a5df6056fb68c10199ecf03e138587c743c92009 Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Thu, 7 May 2020 14:11:02 -0700 Subject: [PATCH 0152/1533] Remove multi_worker_callback_tf1_test for lack of resource to continue maintenance with the focus on tf2/eager. PiperOrigin-RevId: 310434045 Change-Id: Iad8c619220a4219dba38abe8ff771aa67bd9d240 --- tensorflow/python/keras/distribute/BUILD | 24 - .../multi_worker_callback_tf1_test.py | 597 ------------------ 2 files changed, 621 deletions(-) delete mode 100644 tensorflow/python/keras/distribute/multi_worker_callback_tf1_test.py diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index d379e4f3349..a7b1caa3b02 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -362,30 +362,6 @@ cuda_py_test( ], ) -cuda_py_test( - name = "multi_worker_callback_tf1_test", - srcs = ["multi_worker_callback_tf1_test.py"], - # TODO(b/132384649): Enable for guitar and oss tests. 
- shard_count = 24, - tags = [ - "manual", - "no_oss", - "noguitar", - "notap", - ], - deps = [ - ":distribute", - ":multi_worker_testing_utils", - "//tensorflow/python:platform", - "//tensorflow/python/distribute:collective_all_reduce_strategy", - "//tensorflow/python/distribute:combinations", - "//tensorflow/python/distribute:distribute_config", - "//tensorflow/python/distribute:distribute_coordinator", - "//tensorflow/python/distribute:multi_worker_test_base", - "//tensorflow/python/keras", - ], -) - py_test( name = "multi_worker_callback_tf2_test", srcs = ["multi_worker_callback_tf2_test.py"], diff --git a/tensorflow/python/keras/distribute/multi_worker_callback_tf1_test.py b/tensorflow/python/keras/distribute/multi_worker_callback_tf1_test.py deleted file mode 100644 index 95a235e7b33..00000000000 --- a/tensorflow/python/keras/distribute/multi_worker_callback_tf1_test.py +++ /dev/null @@ -1,597 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for Keras callbacks in multi-worker training with TF1.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import os -import sys -import tempfile -import threading - -from absl.testing import parameterized - -from tensorflow.python import keras -from tensorflow.python.distribute import collective_all_reduce_strategy as collective_strategy -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import distribute_coordinator as dc -from tensorflow.python.distribute import mirrored_strategy -from tensorflow.python.distribute import multi_worker_test_base as test_base -from tensorflow.python.distribute import multi_worker_util -from tensorflow.python.keras import backend as K -from tensorflow.python.keras import callbacks -from tensorflow.python.keras import testing_utils -from tensorflow.python.keras.distribute import multi_worker_testing_utils -from tensorflow.python.keras.distribute import multi_worker_training_state as training_state -from tensorflow.python.platform import test - - -def get_strategy_object(strategy_cls): - if strategy_cls == mirrored_strategy.MirroredStrategy: - return strategy_cls(mirrored_strategy.all_local_devices()) - else: - # CollectiveAllReduceStrategy and ParameterServerStrategy. 
- return strategy_cls() - - -def generate_callback_test_function(custom_callable): - """Generic template for callback tests using mnist synthetic dataset.""" - - @combinations.generate( - combinations.combine( - mode=['graph'], - strategy_cls=[collective_strategy.CollectiveAllReduceStrategy], - required_gpus=[0, 1], - file_format=['h5', 'tf'])) - def test_template(self, strategy_cls, file_format): - num_workers = 2 - num_epoch = 2 - - cluster_spec = test_base.create_cluster_spec(num_workers=num_workers) - self._barrier = dc._Barrier(2) - - def _independent_worker_fn(*args, **kwargs): # pylint: disable=unused-argument - """Simulates an Independent Worker inside of a thread.""" - with test.mock.patch.object(dc, '_run_std_server', - self._make_mock_run_std_server()): - strategy = get_strategy_object(strategy_cls) - batch_size = 64 - steps = 2 - train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset( - batch_size, steps) - with strategy.scope(): - model = multi_worker_testing_utils.get_mnist_model((28, 28, 1)) - - custom_callable( - model, - self, - train_ds, - num_epoch, - steps, - strategy, - saving_filepath=kwargs['saving_filepath'], - barrier=kwargs['barrier'], - threading_local=kwargs['threading_local']) - - # Pass saving_filepath from the parent thread to ensure every worker has the - # same filepath to save. - saving_filepath = os.path.join(self.get_temp_dir(), - 'checkpoint.' + file_format) - barrier = dc._Barrier(2) - threading_local = threading.local() - threads = self.run_multiple_tasks_in_threads( - _independent_worker_fn, - cluster_spec, - saving_filepath=saving_filepath, - barrier=barrier, - threading_local=threading_local) - self.assertFalse(training_state.checkpoint_exists(saving_filepath)) - - threads_to_join = [] - strategy = get_strategy_object(strategy_cls) - if strategy.extended.experimental_between_graph: - for ts in threads.values(): - threads_to_join.extend(ts) - else: - threads_to_join = [threads['worker'][0]] - self.join_independent_workers(threads_to_join) - - return test_template - - -class KerasMultiWorkerCallbackTest(test_base.IndependentWorkerTestBase, - parameterized.TestCase): - """KerasMultiWorkerCallbackTest for TF1. - - TODO(rchao): Migrate all tests in this class to - `multi_worker_callback_tf2_test`. - """ - - # The callables of the actual testing content to be run go below. - @staticmethod - def callableForTestChiefOnlyCallback(model, test_obj, train_ds, num_epoch, - steps, strategy, saving_filepath, - **kwargs): - - class ChiefOnly(keras.callbacks.Callback): - - def __init__(self): - self._chief_worker_only = True - self.filtered_correctly = True - - def on_train_begin(self, logs): - if not multi_worker_util.is_chief(): - # Non-chief workers shouldn't run this callback. - self.filtered_correctly = False - - cb = ChiefOnly() - model.fit( - x=train_ds, epochs=num_epoch, steps_per_epoch=steps, callbacks=[cb]) - - test_obj.assertTrue(cb.filtered_correctly) - - @staticmethod - def callableForTestModelCheckpointSavesOnChiefButNotOtherwise( - model, test_obj, train_ds, num_epoch, steps, strategy, saving_filepath, - **kwargs): - - extension = os.path.splitext(saving_filepath)[1] - - # Incorporate type/index information and thread id in saving_filepath to - # ensure every worker has a unique path. Note that in normal use case the - # saving_filepath will be the same for all workers, but we use different - # ones here just to test out chief saves checkpoint but non-chief doesn't. 
- - saving_filepath = os.path.join( - test_obj.get_temp_dir(), 'checkpoint_%s_%d%s' % - (test_base.get_task_type(), test_base.get_task_index(), extension)) - - # The saving_filepath shouldn't exist at the beginning (as it's unique). - test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[callbacks.ModelCheckpoint(filepath=saving_filepath)]) - - # If it's chief, the model should be saved; if not, the model shouldn't. - test_obj.assertEqual( - training_state.checkpoint_exists(saving_filepath), test_base.is_chief()) - - @staticmethod - def initialFitting(test_obj, model, train_ds, num_epoch, steps, - saving_filepath): - # The saving_filepath shouldn't exist at the beginning. - test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[ - callbacks.ModelCheckpoint( - filepath=saving_filepath, save_weights_only=True) - ]) - - # The saving_filepath should exist after fitting with callback. Both chief - # and non-chief worker should both see it exists (which was saved only by - # chief). - test_obj.assertTrue(training_state.checkpoint_exists(saving_filepath)) - - history_after_one_more_epoch = model.fit( - x=train_ds, epochs=1, steps_per_epoch=steps) - - # The saving_filepath should continue to exist (if it did) after fitting - # without callback. - test_obj.assertTrue(training_state.checkpoint_exists(saving_filepath)) - - return saving_filepath, history_after_one_more_epoch - - @staticmethod - def callableForTestLoadWeightFromModelCheckpoint(model, test_obj, train_ds, - num_epoch, steps, strategy, - saving_filepath, **kwargs): - filepaths = [] - real_mkstemp = tempfile.mkstemp - def mocked_mkstemp(): - # Only non-chief should call tempfile.mkstemp() inside fit() in sync - # training. - assert not test_base.is_chief() - file_handle, temp_file_name = real_mkstemp() - extension = os.path.splitext(saving_filepath)[1] - temp_filepath = temp_file_name + extension - filepaths.append(temp_filepath) - return file_handle, temp_file_name - - # Mock tempfile.mkstemp() so the filepaths can be stored and verified later. - with test.mock.patch.object(tempfile, 'mkstemp', mocked_mkstemp): - saving_filepath, history_after_one_more_epoch = \ - KerasMultiWorkerCallbackTest.initialFitting( - test_obj, model, train_ds, num_epoch, steps, saving_filepath) - - with strategy.scope(): - model.load_weights(saving_filepath) - - history_after_loading_weight_and_one_more_epoch = model.fit( - x=train_ds, epochs=1, steps_per_epoch=steps) - - test_obj.assertAllClose( - history_after_one_more_epoch.history, - history_after_loading_weight_and_one_more_epoch.history, - rtol=5e-5) - - # Verify the temp files are indeed removed (no trace left behind). - for filepath in filepaths: - assert not training_state.checkpoint_exists(filepath) - - @staticmethod - def callableForTestModelRestoreCallback(model, test_obj, train_ds, num_epoch, - steps, strategy, saving_filepath, - **kwargs): - - saving_filepath, history_after_one_more_epoch = \ - KerasMultiWorkerCallbackTest.initialFitting( - test_obj, model, train_ds, num_epoch, steps, saving_filepath) - - # The model should get restored to the weights previously saved, by - # adding a ModelCheckpoint callback (which results in a - # _ModelRestoreCallback being added), with load_weights_on_restart=True. 
- history_after_model_restoring_and_one_more_epoch = model.fit( - x=train_ds, - epochs=1, - steps_per_epoch=steps, - callbacks=[ - callbacks.ModelCheckpoint( - filepath=saving_filepath, - save_weights_only=True, - load_weights_on_restart=True) - ]) - - # Asserting the history one epoch after initial fitting and one epoch after - # restoring are closed. - test_obj.assertAllClose( - history_after_one_more_epoch.history, - history_after_model_restoring_and_one_more_epoch.history, - rtol=5e-5) - - history_one_more_epoch_without_model_restoring = model.fit( - x=train_ds, epochs=1, steps_per_epoch=steps) - - # Ensuring training for another epoch gives different result. - test_obj.assertNotAllClose( - history_after_model_restoring_and_one_more_epoch.history, - history_one_more_epoch_without_model_restoring.history, - rtol=5e-5) - - @staticmethod - def callableForTestBackupModelRemoved(model, test_obj, train_ds, num_epoch, - steps, strategy, saving_filepath, - **kwargs): - - # `barrier` object needs to be passed in from parent - # thread so both threads refer to the same object. - barrier = kwargs['barrier'] - - num_epoch = 3 - - # Testing the backup filepath `multi_worker_training_state` uses. - _, backup_filepath = training_state._get_backup_filepath(saving_filepath) - - # The backup_filepath shouldn't exist at the beginning. - test_obj.assertFalse(training_state.checkpoint_exists(backup_filepath)) - - # Callback to verify that the backup file exists in the middle of training. - class BackupFilepathVerifyingCallback(callbacks.Callback): - - def on_epoch_begin(self, epoch, logs=None): - if epoch > 1: - # Asserting that after the first two epochs, the backup file should - # exist. - test_obj.assertTrue(training_state.checkpoint_exists(backup_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[ - callbacks.ModelCheckpoint( - filepath=saving_filepath, save_weights_only=True), - BackupFilepathVerifyingCallback() - ]) - - # Sync on the two threads so we make sure the backup file is removed before - # we move on. - barrier.wait() - - # The back up file should not exist at successful exit of `model.fit()`. - test_obj.assertFalse(training_state.checkpoint_exists(backup_filepath)) - - @staticmethod - def callableForTestBackupModelNotRemovedIfInterrupted(model, test_obj, - train_ds, num_epoch, - steps, strategy, - saving_filepath, - **kwargs): - - # `barrier` object needs to be passed in from parent - # thread so both threads refer to the same object. - barrier = kwargs['barrier'] - - num_epoch = 4 - - # Testing the backup filepath `multi_worker_training_state` uses. - _, backup_filepath = training_state._get_backup_filepath(saving_filepath) - - # The backup_filepath shouldn't exist at the beginning. - test_obj.assertFalse(training_state.checkpoint_exists(backup_filepath)) - - # Callback to interrupt in the middle of training. - class InterruptingCallback(callbacks.Callback): - - def on_epoch_begin(self, epoch, logs=None): - if epoch == 2: - raise RuntimeError('Interrupting!') - - try: - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[ - callbacks.ModelCheckpoint( - filepath=saving_filepath, save_weights_only=True), - InterruptingCallback() - ]) - except RuntimeError as e: - if 'Interrupting!' not in e.message: - raise - - # Sync on the two threads. - barrier.wait() - - # The back up file should exist after interruption of `model.fit()`. 
- test_obj.assertTrue(training_state.checkpoint_exists(backup_filepath)) - - @staticmethod - def callableForTestUnmatchedModelFile(model, test_obj, train_ds, num_epoch, - steps, strategy, saving_filepath, - **kwargs): - - # The saving_filepath shouldn't exist at the beginning. - test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath)) - - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=[ - callbacks.ModelCheckpoint( - filepath=saving_filepath, save_weights_only=True) - ]) - - (train_ds, _), (_, _) = testing_utils.get_test_data( - train_samples=10, test_samples=10, input_shape=(3,), num_classes=2) - - # Switch to a model of different structure. - with strategy.scope(): - model = keras.models.Sequential() - model.add(keras.layers.Dense(5, input_dim=3, activation='relu')) - model.add(keras.layers.Dense(2, activation='softmax')) - model.compile( - loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc']) - - test_obj.assertTrue(training_state.checkpoint_exists(saving_filepath)) - - if saving_filepath.endswith('.tf'): - test_obj.skipTest('Loading mismatched TF checkpoint would cause Fatal ' - 'Python error: Aborted. Skipping.') - - # Unmatched format. Should raise ValueError. - with test_obj.assertRaisesRegexp(ValueError, 'Error loading file from'): - model.fit( - x=train_ds, - epochs=num_epoch, - batch_size=8, - callbacks=[ - callbacks.ModelCheckpoint( - filepath=saving_filepath, - save_weights_only=True, - load_weights_on_restart=True) - ]) - - @staticmethod - def callableForTestReduceLROnPlateau(model, test_obj, train_ds, num_epoch, - steps, strategy, saving_filepath, - **kwargs): - - cbks = [ - callbacks.ReduceLROnPlateau( - monitor='loss', - factor=0.1, - min_delta=1, - patience=1, - cooldown=5, - verbose=1) - ] - - # It is expected that the learning rate would drop by `factor` within - # 3 epochs with `min_delta=1`. - model.fit(x=train_ds, epochs=3, steps_per_epoch=steps, callbacks=cbks) - test_obj.assertAllClose( - float(K.get_value(model.optimizer.lr)), 0.0001, atol=1e-8) - - # It is expected that the learning rate would drop by another `factor` - # within 3 epochs with `min_delta=1`. - model.fit(x=train_ds, epochs=3, steps_per_epoch=steps, callbacks=cbks) - test_obj.assertAllClose( - float(K.get_value(model.optimizer.lr)), 0.00001, atol=1e-8) - - @staticmethod - def callableForTestEarlyStopping(model, test_obj, train_ds, num_epoch, steps, - strategy, saving_filepath, **kwargs): - - class EpochCounterCallback(callbacks.Callback): - - def on_epoch_begin(self, epoch, logs): - self.last_epoch = epoch - - epoch_counter_cbk = EpochCounterCallback() - cbks = [ - callbacks.EarlyStopping( - monitor='loss', min_delta=0.05, patience=1, verbose=1), - epoch_counter_cbk - ] - - # Empirically, it is expected that `model.fit()` would terminate around the - # 22th epoch. Asserting that it should have been stopped before the 50th - # epoch to avoid flakiness and be more predictable. - model.fit(x=train_ds, epochs=100, steps_per_epoch=steps, callbacks=cbks) - test_obj.assertLess(epoch_counter_cbk.last_epoch, 50) - - @staticmethod - def callableForTestLearningRateScheduler(model, test_obj, train_ds, num_epoch, - steps, strategy, saving_filepath, - **kwargs): - - cbks = [ - callbacks.LearningRateScheduler( - schedule=lambda x: 1. / (1. + x), verbose=1) - ] - - # It is expected that with `epochs=2`, the learning rate would drop to - # 1 / (1 + 2) = 0.5. 
- model.fit(x=train_ds, epochs=2, steps_per_epoch=steps, callbacks=cbks) - test_obj.assertAllClose( - float(K.get_value(model.optimizer.lr)), 0.5, atol=1e-8) - - # It is expected that with `epochs=4`, the learning rate would drop to - # 1 / (1 + 4) = 0.25. - model.fit(x=train_ds, epochs=4, steps_per_epoch=steps, callbacks=cbks) - test_obj.assertAllClose( - float(K.get_value(model.optimizer.lr)), 0.25, atol=1e-8) - - # pylint: disable=g-doc-args - @staticmethod - def callableForTestIntermediateDirForFTAreRemoved(model, test_obj, train_ds, - num_epoch, steps, strategy, - saving_filepath, **kwargs): - """Testing that the temporary directory are removed. - - Some temporary directories are created for the purpose of fault tolerance. - This test ensures that such directories should have been removed at the time - `model.fit()` finishes successfully. - """ - - # `threading_local` and `barrier` objects have to be passed in from parent - # thread so both threads refer to the same object. - threading_local = kwargs['threading_local'] - barrier = kwargs['barrier'] - - # Two threads will each has one copy of `temp_dirs_supposed_to_be_removed` - # list. - threading_local.temp_dirs_supposed_to_be_removed = [] - - callbacks_list = [ - callbacks.ModelCheckpoint( - filepath=saving_filepath, - save_weights_only=True, - load_weights_on_restart=True), - ] - - # Keep the references to the real function objects. - real_os_path_join = os.path.join - real_tempfile_mkdtemp = tempfile.mkdtemp - - # Make a `os.path.join` wrapper, which will be patched onto the real - # function, so the temporary directories can be tracked. - def wrapper_os_path_join(path, *paths): - join_result = real_os_path_join(path, *paths) - if len(paths) == 1 and paths[0] == 'backup': - threading_local.temp_dirs_supposed_to_be_removed.append(join_result) - return join_result - - # Likewise for `tempfile.mkdtemp`. - def wrapper_tempfile_mkdtemp(): - result = real_tempfile_mkdtemp() - threading_local.temp_dirs_supposed_to_be_removed.append(result) - return result - - # Now the two threads must sync here: if they are out of sync, one thread - # can go ahead and patch `os.path.join` while the other has not even - # assigned the real `os.path.join` to `real_os_path_join`. If this happened, - # the "real" `os.path.join` the slower thread would see is actually the - # wrapper of the other. - barrier.wait() - - # Note that `os.path.join` will respect the second patch (there are two - # patches because of the two threads). Both threads will refer to the same - # copy of `wrapper_os_path_join` because of the `barrier` preceding - # `model.fit()`. Likewise for `wrapper_tempfile_mkdtemp`. - os.path.join = wrapper_os_path_join - tempfile.mkdtemp = wrapper_tempfile_mkdtemp - - barrier.wait() - model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=callbacks_list) - - # Sync before un-patching to prevent either thread from accessing the real - # functions. Also to make sure `model.fit()` is done on both threads (so we - # can safely assert the directories are removed). - barrier.wait() - os.path.join = real_os_path_join - tempfile.mkdtemp = real_tempfile_mkdtemp - - # There should be directory (names) that are supposed to be removed. - test_obj.assertTrue(threading_local.temp_dirs_supposed_to_be_removed) - for temp_dir_supposed_to_be_removed in ( - threading_local.temp_dirs_supposed_to_be_removed): - # They should have been removed and thus don't exist. 
- test_obj.assertFalse(os.path.exists(temp_dir_supposed_to_be_removed)) - - # The actual testing methods go here. - test_chief_only_callback = generate_callback_test_function( - callableForTestChiefOnlyCallback.__func__) - test_model_checkpoint_saves_on_chief_but_not_otherwise = \ - generate_callback_test_function( - callableForTestModelCheckpointSavesOnChiefButNotOtherwise.__func__) - test_load_weight_from_model_checkpoint = generate_callback_test_function( - callableForTestLoadWeightFromModelCheckpoint.__func__) - test_model_restore_callback = generate_callback_test_function( - callableForTestModelRestoreCallback.__func__) - test_unmatched_model_file = generate_callback_test_function( - callableForTestUnmatchedModelFile.__func__) - test_reduce_lr_on_plateau = generate_callback_test_function( - callableForTestReduceLROnPlateau.__func__) - test_early_stopping = generate_callback_test_function( - callableForTestEarlyStopping.__func__) - test_learning_rate_scheduler = generate_callback_test_function( - callableForTestLearningRateScheduler.__func__) - test_intermediate_dir_for_ft_are_removed = generate_callback_test_function( - callableForTestIntermediateDirForFTAreRemoved.__func__) - test_backup_model_removed = generate_callback_test_function( - callableForTestBackupModelRemoved.__func__) - test_backup_model_not_removed_if_interrupted = \ - generate_callback_test_function( - callableForTestBackupModelNotRemovedIfInterrupted.__func__) - - -if __name__ == '__main__': - with test.mock.patch.object(sys, 'exit', os._exit): - test.main() From e87a0f09d52e70280f3307e1fdc7bba622f7d9ec Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 7 May 2020 15:11:19 -0700 Subject: [PATCH 0153/1533] Whitelist a few more unary and binary TensorFlow ops for the fallback path Enabled the passing tests. Also, fixed the lowering for TensorFlow Round op which is not directly map to the HLO Round op. This is correctly handled by the fallback path. 
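Background on the Round change: `tf.round` breaks ties to the nearest even value (banker's rounding), while the HLO `round_nearest_afz` op breaks ties away from zero, so a one-to-one lowering gives wrong results for inputs such as 0.5 or 2.5. A small sketch of the difference follows; the NumPy expression is just one way to emulate round-half-away-from-zero and is not taken from the patch.

# Sketch: tf.round rounds half-way cases to even; round-half-away-from-zero
# (the RoundNearestAfz semantics) gives different answers on ties.
import numpy as np
import tensorflow as tf

x = np.array([0.5, 1.5, 2.5, -0.5, -2.5], dtype=np.float32)
print(tf.round(x).numpy())                        # [ 0.  2.  2. -0. -2.]
print(np.copysign(np.floor(np.abs(x) + 0.5), x))  # [ 1.  2.  3. -1. -3.]
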
PiperOrigin-RevId: 310445636 Change-Id: I4b1b869cbd6eddf141f84b18901de10c40a8d346 --- .../compiler/mlir/xla/tests/legalize-tf.mlir | 7 --- .../xla/transforms/legalize_tf_patterns.td | 1 - .../xla/transforms/legalize_tf_with_tf2xla.cc | 51 +++++++++++++++---- tensorflow/compiler/tests/binary_ops_test.py | 2 - tensorflow/compiler/tests/unary_ops_test.py | 7 ++- 5 files changed, 47 insertions(+), 21 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 72a9faea49a..aef9b17d3db 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -2205,13 +2205,6 @@ func @sin_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { return %0 : tensor<*xf32> } -// CHECK-LABEL: func @round -func @round(%arg0: tensor<2xf32>) -> tensor<2xf32> { - // CHECK: "xla_hlo.round_nearest_afz"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - %0 = "tf.Round"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - return %0 : tensor<2xf32> -} - // CHECK-LABEL: func @rsqrt func @rsqrt(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK: "xla_hlo.rsqrt"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index b2a7c1e7f62..d53dbdcaaeb 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -543,7 +543,6 @@ foreach Mapping = [ [TF_LogicalNotOp, HLO_NotOp], [TF_NegOp, HLO_NegOp], [TF_RealOp, HLO_RealOp], - [TF_RoundOp, HLO_RoundOp], [TF_RsqrtOp, HLO_RsqrtOp], [TF_SinOp, HLO_SinOp], [TF_SqrtOp, HLO_SqrtOp], diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index 551462572f1..bc2a8b606da 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -83,31 +83,48 @@ static bool IsOpWhitelisted(Operation* op) { // clang-format off static llvm::SmallDenseSet ops = { TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), - TypeID::get(), TypeID::get(), - TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -119,18 +136,34 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), - TypeID::get(), TypeID::get(), + TypeID::get(), 
TypeID::get(), + TypeID::get(), TypeID::get(), - TypeID::get() + TypeID::get(), + TypeID::get(), + TypeID::get() }; // clang-format on diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 92ea1cfaf87..9d48758928e 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -73,8 +73,6 @@ class BinaryOpsTest(xla_test.XLATestCase): self.assertAllCloseAccordingToType( result[i], expected[i], rtol=rtol, atol=atol) - @test_util.disable_mlir_bridge( - "F16 type is not supported in CreateDenseElementsAttrFromLiteral") def testFloatOps(self): for dtype in self.float_types: if dtype == dtypes.bfloat16.as_numpy_dtype: diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 0c4c7bacdf3..dc11c24b6d2 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -186,8 +186,6 @@ class UnaryOpsTest(xla_test.XLATestCase): self._assertOpOutputMatchesExpected( math_ops.cos, x, expected=np.cos(x), rtol=tol, atol=1e-5) - @test_util.disable_mlir_bridge( - "TODO(b/153812660): Handle tf.Softmax compilation") def testFloatOps(self): for dtype in self.float_types: x = np.arange(-0.90, 0.90, 0.25) @@ -514,6 +512,11 @@ class UnaryOpsTest(xla_test.XLATestCase): ], dtype=dtype)) + @test_util.disable_mlir_bridge( + "TODO(b/153812660): Handle tf.QuantizeAndDequantize compilation") + def testQuantizeAndDequantize(self): + for dtype in self.float_types: + def quantize_and_dequantize_v2(x): return array_ops.quantize_and_dequantize_v2( x, -127, 127, signed_input=True, num_bits=8) From f77d548c962dffd06b88dbb3a3ea6947c791a558 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 15:27:10 -0700 Subject: [PATCH 0154/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310448486 Change-Id: I89d5c142dcab5049a33237dc781c92dff57b04ae --- tensorflow/go/op/wrappers.go | 48 ++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index f2459cc9334..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -1950,8 +1950,8 @@ func GatherV2BatchDims(value int64) GatherV2Attr { // Gather slices from `params` axis `axis` according to `indices`. // // `indices` must be an integer tensor of any dimension (usually 0-D or 1-D). -// Produces an output tensor with shape `params.shape[:axis] + indices.shape + -// params.shape[axis + 1:]` where: +// Produces an output tensor with shape `params.shape[:axis] + +// indices.shape[batch_dims:] + params.shape[axis + 1:]` where: // // ```python // # Scalar indices (output is rank(params) - 1). @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 0e517a6fc0dc40926f68f4b7bf9c0337e3bc55a9 Mon Sep 17 00:00:00 2001 From: Kibeom Kim Date: Thu, 7 May 2020 16:03:23 -0700 Subject: [PATCH 0155/1533] Implement fast deferred-decoding Python stack trace class. 
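The point of deferring decoding: at capture time the class stores, per frame, only a reference to the frame's code object and the last-executed instruction offset (up to a fixed depth of 10 frames); file names, line numbers and function names are resolved only when ToString() is called. A rough Python analogue of that idea is sketched below — the name CheapStackTrace and its methods are invented for illustration, and the line-number handling is simplified (the real implementation is the C++ StackTrace class in the diff below, which maps the stored instruction offset back to an exact line with PyCode_Addr2Line):

    import sys

    class CheapStackTrace:
        """Capture lightweight frame references now; format them only on demand."""

        def __init__(self, limit=10):
            frame = sys._getframe(1)
            self._frames = []
            while frame is not None and len(self._frames) < limit:
                # Store only the code object and the current instruction offset,
                # mirroring the (PyCodeObject*, f_lasti) pairs kept by the C++ class.
                self._frames.append((frame.f_code, frame.f_lasti))
                frame = frame.f_back

        def to_string(self):
            # Deferred decoding: file and function names are resolved only here.
            lines = []
            for code, _lasti in reversed(self._frames):
                # Simplification: report the function's first line instead of
                # mapping _lasti back to the exact executing line.
                lines.append('  File "%s", line %d, in %s'
                             % (code.co_filename, code.co_firstlineno, code.co_name))
            return "\n".join(lines)

    if __name__ == "__main__":
        print(CheapStackTrace().to_string())
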
PiperOrigin-RevId: 310454564 Change-Id: Ia49ff70aa874393cb8321046887eed6646bcd1e6 --- tensorflow/python/BUILD | 38 +++++++ tensorflow/python/eager/BUILD | 1 + tensorflow/python/eager/benchmarks_test.py | 5 + tensorflow/python/lib/core/py_util.h | 10 ++ tensorflow/python/util/stack_trace.cc | 34 ++++++ tensorflow/python/util/stack_trace.h | 101 ++++++++++++++++++ .../util/stack_trace_binding_for_test.cc | 29 +++++ tensorflow/python/util/stack_trace_test.py | 49 +++++++++ 8 files changed, 267 insertions(+) create mode 100644 tensorflow/python/util/stack_trace.cc create mode 100644 tensorflow/python/util/stack_trace.h create mode 100644 tensorflow/python/util/stack_trace_binding_for_test.cc create mode 100644 tensorflow/python/util/stack_trace_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 4729ce9d743..324788e6aa7 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1028,6 +1028,7 @@ cc_library( deps = [ "//tensorflow/core:lib", "//tensorflow/core:script_ops_op_lib", + "//tensorflow/core/platform:logging", "//third_party/python_runtime:headers", ], ) @@ -5548,6 +5549,43 @@ tf_py_test( ], ) +cc_library( + name = "stack_trace", + srcs = ["util/stack_trace.cc"], + hdrs = ["util/stack_trace.h"], + deps = [ + ":py_util", + "//third_party/python_runtime:headers", # buildcleaner: keep + "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/types:optional", + ], +) + +pybind_extension( + name = "_stack_trace_binding_for_test", + testonly = True, + srcs = ["util/stack_trace_binding_for_test.cc"], + # TODO(b/138203821): change to "util._tf_stack" once the bug is fixed. + module_name = "_stack_trace_binding_for_test", + deps = [ + ":stack_trace", + "//tensorflow/core:test", + "//third_party/python_runtime:headers", # buildcleaner: keep + "@pybind11", + ], +) + +tf_py_test( + name = "stack_trace_test", + srcs = ["util/stack_trace_test.py"], + python_version = "PY3", + deps = [ + ":_stack_trace_binding_for_test", + ":client_testlib", + ], +) + py_library( name = "util", srcs = glob( diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index c08cb8cc1c3..056f35f90dd 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -637,6 +637,7 @@ cuda_py_test( ":function", ":remote", ":test", + "//tensorflow/python:_stack_trace_binding_for_test", "//tensorflow/python:math_ops", "//tensorflow/python:pywrap_tfe", "//tensorflow/python:random_ops", diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index f2f13279927..8a07c11349a 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -38,6 +38,7 @@ import numpy as np import six from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.python import _stack_trace_binding_for_test from tensorflow.python import pywrap_tfe from tensorflow.python.eager import backprop # pylint: disable=unused-import from tensorflow.python.eager import benchmarks_test_base @@ -1301,6 +1302,10 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): def benchmarkTenResourceReadsInCondInInnerFunc(self): self._benchmarkResourceReadsInCondInInnerFunc(10) + def benchmarkPythonStackTrace_1000_times(self): + self._run(lambda: _stack_trace_binding_for_test.stack_trace_n_times(1000), + 1) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/lib/core/py_util.h 
b/tensorflow/python/lib/core/py_util.h index a9f39d39461..1f3998152f8 100644 --- a/tensorflow/python/lib/core/py_util.h +++ b/tensorflow/python/lib/core/py_util.h @@ -16,12 +16,22 @@ limitations under the License. #ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_ #define TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_ +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { + // Fetch the exception message as a string. An exception must be set // (PyErr_Occurred() must be true). string PyExceptionFetch(); + +// Assert that Python GIL is held. +inline void DCheckPyGilState() { +#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 4 + DCHECK(PyGILState_Check()); +#endif +} + } // end namespace tensorflow #endif // TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_ diff --git a/tensorflow/python/util/stack_trace.cc b/tensorflow/python/util/stack_trace.cc new file mode 100644 index 00000000000..b4d6e3ca866 --- /dev/null +++ b/tensorflow/python/util/stack_trace.cc @@ -0,0 +1,34 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/python/util/stack_trace.h" + +namespace tensorflow { +std::string StackTrace::ToString() const { + DCheckPyGilState(); + + std::ostringstream result; + for (int i = size_ - 1; i >= 0; --i) { + result << " File \"" << PyUnicode_AsUTF8(code_objs_[i]->co_filename) + << "\", line " + << PyCode_Addr2Line(code_objs_[i], last_instructions_[i]) << ", in " + << PyUnicode_AsUTF8(code_objs_[i]->co_name) + << "\n \n"; + // TODO(kkb): Add source code line. See tf_stack.cc's + // FrameSummary::line() function. + } + return result.str(); +} +} // namespace tensorflow diff --git a/tensorflow/python/util/stack_trace.h b/tensorflow/python/util/stack_trace.h new file mode 100644 index 00000000000..b04104c4618 --- /dev/null +++ b/tensorflow/python/util/stack_trace.h @@ -0,0 +1,101 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_ +#define TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_ + +#include +#include + +#include +#include +#include + +#include "absl/base/attributes.h" +#include "absl/base/optimization.h" +#include "tensorflow/python/lib/core/py_util.h" + +namespace tensorflow { + +// A class for capturing Python stack trace. 
+class StackTrace final { + public: + static constexpr int kMaxDepth = 10; + + StackTrace() : size_(0) {} + + // Returns `StackTrace` object that captures the current Python stack trace. + // Python GIL must be aquiared beforehand. + ABSL_MUST_USE_RESULT + ABSL_ATTRIBUTE_HOT + static inline StackTrace Capture() { + DCheckPyGilState(); + + StackTrace result; + const PyFrameObject* frame = PyThreadState_GET()->frame; + int i = 0; + for (; i < kMaxDepth && frame != nullptr; frame = frame->f_back, ++i) { + PyCodeObject* code_obj = frame->f_code; + DCHECK(frame->f_trace == nullptr); + DCHECK(code_obj != nullptr); + + Py_INCREF(code_obj); + result.code_objs_[i] = code_obj; + result.last_instructions_[i] = frame->f_lasti; + } + result.size_ = i; + return result; + } + + ABSL_ATTRIBUTE_HOT + ~StackTrace() { + DCheckPyGilState(); + for (int i = 0; i < size_; ++i) Py_DECREF(code_objs_[i]); + } + + StackTrace(StackTrace&& other) { + code_objs_ = other.code_objs_; + last_instructions_ = other.last_instructions_; + size_ = other.size_; + other.size_ = 0; + } + + ABSL_ATTRIBUTE_HOT + StackTrace& operator=(StackTrace&& other) { + this->~StackTrace(); + code_objs_ = other.code_objs_; + last_instructions_ = other.last_instructions_; + size_ = other.size_; + other.size_ = 0; + return *this; + } + + // Returns string representation of the captured stack trace. + std::string ToString() const; + + // TODO(kkb): Implement structured stack trace object getter. + + private: + std::array code_objs_; + std::array last_instructions_; + int size_; + + StackTrace(const StackTrace&) = delete; + StackTrace& operator=(const StackTrace&) = delete; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_ diff --git a/tensorflow/python/util/stack_trace_binding_for_test.cc b/tensorflow/python/util/stack_trace_binding_for_test.cc new file mode 100644 index 00000000000..37290017acd --- /dev/null +++ b/tensorflow/python/util/stack_trace_binding_for_test.cc @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/python/util/stack_trace.h" + +PYBIND11_MODULE(_stack_trace_binding_for_test, m) { + m.def("to_string", + []() { return tensorflow::StackTrace::Capture().ToString(); }); + m.def("stack_trace_n_times", [](int n) { + for (int i = 0; i < n; ++i) { + auto stack_trace = tensorflow::StackTrace::Capture(); + tensorflow::testing::DoNotOptimize(stack_trace); + } + }); +} diff --git a/tensorflow/python/util/stack_trace_test.py b/tensorflow/python/util/stack_trace_test.py new file mode 100644 index 00000000000..6d6aee8f819 --- /dev/null +++ b/tensorflow/python/util/stack_trace_test.py @@ -0,0 +1,49 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for fast Python stack trace utility.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import traceback + +from tensorflow.python import _stack_trace_binding_for_test +from tensorflow.python.platform import test + + +# Our stack tracing doesn't have source code line yet, so erase for now. +def erase_line(frame_summary): + return [ + frame_summary.filename, frame_summary.lineno, frame_summary.name, + '' + ] + + +class StackTraceTest(test.TestCase): + + def testStackTrace(self): + our_stack = _stack_trace_binding_for_test.to_string() + true_stack = traceback.extract_stack(limit=10) + true_stack = [erase_line(fs) for fs in true_stack] + true_stack[-1][1] -= 1 # true_stack capturing was one line below. + true_stack = ''.join(traceback.format_list(true_stack)) + + self.assertEqual(our_stack, true_stack) + + +if __name__ == '__main__': + test.main() From 64a321145898e92031b72281d38f4903ec09afab Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Thu, 7 May 2020 16:19:50 -0700 Subject: [PATCH 0156/1533] Add util function to determine CPU host device from TPU device. Associated CPU host device is always of the same job, worker, replica and task but is of type CPU and id 0. 
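The mapping is a pure rename of the device component: job, replica and task are preserved, the device type becomes CPU and the device id becomes 0, as exercised by the tests below. A minimal Python sketch of the same transformation — the function name cpu_host_for_tpu_device is invented here, and the string handling is simplified compared to the DeviceNameUtils parsing used by the C++ GetCPUHostForTPUDevice:

    def cpu_host_for_tpu_device(tpu_device):
        parts = tpu_device.split("/")
        if not parts[-1].startswith("device:TPU:"):
            raise ValueError("'%s' is not a TPU device" % tpu_device)
        # Keep job/replica/task; swap the trailing device component for CPU id 0.
        return "/".join(parts[:-1] + ["device:CPU:0"])

    assert (cpu_host_for_tpu_device("/job:worker/replica:0/task:1/device:TPU:1")
            == "/job:worker/replica:0/task:1/device:CPU:0")
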
PiperOrigin-RevId: 310457752 Change-Id: Iafb366943c1347bfaceb01e60b53122cc4b31479 --- .../utils/tpu_rewrite_device_util.cc | 23 +++++++++++ .../utils/tpu_rewrite_device_util.h | 11 ++++++ .../utils/tpu_rewrite_device_util_test.cc | 39 +++++++++++++++++++ 3 files changed, 73 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 6cf2781e48d..ddbcc91e834 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -447,4 +447,27 @@ std::string GetDeviceAliasForLogicalCore(int core_index) { return llvm::formatv("{0}_{1}", kTPUReplicatedCore, core_index).str(); } +StatusOr GetCPUHostForTPUDevice(llvm::StringRef tpu_device) { + Device device; + if (!DeviceNameUtils::ParseFullName(tpu_device.str(), &device)) + return errors::InvalidArgument("'", tpu_device.str(), + "' is not a valid device"); + + device.type = DEVICE_CPU; + device.id = 0; + return DeviceNameUtils::ParsedNameToString(device); +} + +StatusOr> GetCPUHostsForTPUDevices( + llvm::ArrayRef tpu_devices) { + llvm::SmallVector cpu_devices; + cpu_devices.reserve(tpu_devices.size()); + for (const auto& tpu_device : tpu_devices) { + TF_ASSIGN_OR_RETURN(cpu_devices.emplace_back(), + GetCPUHostForTPUDevice(tpu_device)); + } + + return cpu_devices; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index dd296a13f4b..47ce7f14ea8 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -216,6 +216,17 @@ StatusOr GetTPUCompilationAndExecutionDevices( // logical core. std::string GetDeviceAliasForLogicalCore(int core_index); +// Finds associated CPU host device for given TPU device. This assumes a +// matching CPU host device exists based on TPU device name. An error will be +// returned if the TPU device name is invalid. +StatusOr GetCPUHostForTPUDevice(llvm::StringRef tpu_device); + +// Finds associated CPU host devices for given TPU devices. This assumes a +// matching CPU host device exist based on each TPU device name. An error will +// be returned if a TPU device name is invalid. 
+StatusOr> GetCPUHostsForTPUDevices( + llvm::ArrayRef tpu_devices); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_REWRITE_DEVICE_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 87319f2adeb..57e123a5f9a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -552,5 +552,44 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(computation_device_2.replica_device_ids(1), 3); } +struct ParameterizedCPUHostForTPUDeviceTest + : ::testing::TestWithParam> {}; + +TEST_P(ParameterizedCPUHostForTPUDeviceTest, CPUHostForTPUDevice) { + auto status_or_device = GetCPUHostForTPUDevice(std::get<0>(GetParam())); + TF_ASSERT_OK(status_or_device.status()); + EXPECT_EQ(status_or_device.ValueOrDie(), std::get<1>(GetParam())); +} + +INSTANTIATE_TEST_SUITE_P( + CPUHostForTPUDevice, ParameterizedCPUHostForTPUDeviceTest, + ::testing::Values( + std::make_tuple("/job:worker/replica:0/task:0/device:TPU:0", + "/job:worker/replica:0/task:0/device:CPU:0"), + std::make_tuple("/job:worker/replica:0/task:1/device:TPU:1", + "/job:worker/replica:0/task:1/device:CPU:0"))); + +TEST(TPURewriteDeviceUtilTest, CPUHostForTPUDeviceInvalidDevice) { + auto status_or_device = GetCPUHostForTPUDevice("bad_device"); + ASSERT_FALSE(status_or_device.ok()); +} + +TEST(TPURewriteDeviceUtilTest, CPUHostsForTPUDevices) { + auto status_or_devices = + GetCPUHostsForTPUDevices({"/job:worker/replica:0/task:0/device:TPU:0", + "/job:worker/replica:0/task:1/device:TPU:1"}); + TF_ASSERT_OK(status_or_devices.status()); + const auto& devices = status_or_devices.ValueOrDie(); + ASSERT_EQ(devices.size(), 2); + EXPECT_EQ(devices[0], "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(devices[1], "/job:worker/replica:0/task:1/device:CPU:0"); +} + +TEST(TPURewriteDeviceUtilTest, CPUHostsForTPUDevicesInvalidDevice) { + auto status_or_devices = GetCPUHostsForTPUDevices( + {"/job:worker/replica:0/task:0/device:TPU:0", "bad_device"}); + ASSERT_FALSE(status_or_devices.ok()); +} + } // anonymous namespace } // namespace tensorflow From 0f4beb0759004c9427c48ce1209471c13080006d Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Thu, 7 May 2020 09:47:07 +0900 Subject: [PATCH 0157/1533] Static link GPU OpenGL tests This change fixes issue #39025 --- .../lite/delegates/gpu/gl/kernels/BUILD | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD index d2ef617a8e2..700a553a125 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD @@ -73,6 +73,7 @@ cc_library( cc_test( name = "add_test", srcs = ["add_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -102,6 +103,7 @@ cc_library( cc_test( name = "concat_test", srcs = ["concat_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -136,6 +138,7 @@ cc_library( cc_test( name = "conv_test", srcs = ["conv_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -176,6 +179,7 @@ cc_library( cc_test( name = "depthwise_conv_test", srcs = ["depthwise_conv_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + 
[ "notap", "tflite_not_portable_ios", @@ -205,6 +209,7 @@ cc_library( cc_test( name = "elementwise_test", srcs = ["elementwise_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -235,6 +240,7 @@ cc_library( cc_test( name = "fully_connected_test", srcs = ["fully_connected_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -263,6 +269,7 @@ cc_library( cc_test( name = "lstm_test", srcs = ["lstm_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -292,6 +299,7 @@ cc_library( cc_test( name = "max_unpooling_test", srcs = ["max_unpooling_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -322,6 +330,7 @@ cc_library( cc_test( name = "mean_test", srcs = ["mean_test.cc"], + linkstatic = True, tags = [ "notap", "tflite_not_portable_ios", @@ -351,6 +360,7 @@ cc_library( cc_test( name = "mul_test", srcs = ["mul_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -380,6 +390,7 @@ cc_library( cc_test( name = "pad_test", srcs = ["pad_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -409,6 +420,7 @@ cc_library( cc_test( name = "pooling_test", srcs = ["pooling_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -440,6 +452,7 @@ cc_library( cc_test( name = "prelu_test", srcs = ["prelu_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -471,6 +484,7 @@ cc_library( cc_test( name = "quantize_and_dequantize_test", srcs = ["quantize_and_dequantize_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -501,6 +515,7 @@ cc_library( cc_test( name = "relu_test", srcs = ["relu_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -529,6 +544,7 @@ cc_library( cc_test( name = "reshape_test", srcs = ["reshape_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -558,6 +574,7 @@ cc_library( cc_test( name = "slice_test", srcs = ["slice_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -589,6 +606,7 @@ cc_library( cc_test( name = "softmax_test", srcs = ["softmax_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -618,6 +636,7 @@ cc_library( cc_test( name = "space_to_depth_test", srcs = ["space_to_depth_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -679,6 +698,7 @@ cc_library( cc_test( name = "transpose_conv_test", srcs = ["transpose_conv_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", @@ -708,6 +728,7 @@ cc_library( cc_test( name = "resize_test", srcs = ["resize_test.cc"], + linkstatic = True, tags = tf_gpu_tests_tags() + [ "notap", "tflite_not_portable_ios", From 74833a04e032766a27890ff882d669d9e484a497 Mon Sep 17 00:00:00 2001 From: leslie-fang-intel Date: Fri, 8 May 2020 07:34:54 +0800 Subject: [PATCH 0158/1533] fix dequantize op regression issue --- tensorflow/core/kernels/dequantize_op.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/dequantize_op.cc b/tensorflow/core/kernels/dequantize_op.cc index 
0f5a7019b1f..3b38daf0067 100644 --- a/tensorflow/core/kernels/dequantize_op.cc +++ b/tensorflow/core/kernels/dequantize_op.cc @@ -61,7 +61,9 @@ class DequantizeOp : public OpKernel { " is '" + DataTypeString(ctx->output_type(0)) + "'")); + need_cast_ = true; if (ctx->output_type(0) == DT_FLOAT) { + need_cast_ = false; OP_REQUIRES(ctx, (mode_string == "MIN_COMBINED" || mode_string == "MIN_FIRST" || mode_string == "SCALED"), @@ -98,8 +100,9 @@ class DequantizeOp : public OpKernel { } Tensor* output = nullptr; - Tensor float_output = tensorflow::Tensor(DT_FLOAT, input.shape()); OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output)); + Tensor float_output = + need_cast_ ? tensorflow::Tensor(DT_FLOAT, input.shape()) : *output; if (num_slices == 1) { const float min_range = input_min_tensor.flat()(0); const float max_range = input_max_tensor.flat()(0); @@ -128,10 +131,12 @@ class DequantizeOp : public OpKernel { max_ranges(i), output_tensor.template chip<1>(i)); } } - S* out_ptr = output->flat().data(); - float* in_ptr = float_output.flat().data(); - for (int64 i = 0; i < float_output.NumElements(); ++i) { - out_ptr[i] = static_cast(in_ptr[i]); + if (need_cast_) { + S* out_ptr = output->flat().data(); + float* in_ptr = float_output.flat().data(); + for (int64 i = 0; i < float_output.NumElements(); ++i) { + out_ptr[i] = static_cast(in_ptr[i]); + } } } @@ -219,6 +224,7 @@ class DequantizeOp : public OpKernel { int mode_; int axis_; bool narrow_range_; + bool need_cast_; }; REGISTER_KERNEL_BUILDER(Name("Dequantize") From 9ab73c7bd10158e9ba4570ad331497254b05aa75 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Thu, 7 May 2020 16:39:46 -0700 Subject: [PATCH 0159/1533] Implement fast deferred-decoding Python stack trace class. PiperOrigin-RevId: 310461070 Change-Id: I1e3131e9088ed106aabf47f51a06d2c86e29e7e7 --- tensorflow/python/BUILD | 38 ------- tensorflow/python/eager/BUILD | 1 - tensorflow/python/eager/benchmarks_test.py | 5 - tensorflow/python/lib/core/py_util.h | 10 -- tensorflow/python/util/stack_trace.cc | 34 ------ tensorflow/python/util/stack_trace.h | 101 ------------------ .../util/stack_trace_binding_for_test.cc | 29 ----- tensorflow/python/util/stack_trace_test.py | 49 --------- 8 files changed, 267 deletions(-) delete mode 100644 tensorflow/python/util/stack_trace.cc delete mode 100644 tensorflow/python/util/stack_trace.h delete mode 100644 tensorflow/python/util/stack_trace_binding_for_test.cc delete mode 100644 tensorflow/python/util/stack_trace_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 324788e6aa7..4729ce9d743 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -1028,7 +1028,6 @@ cc_library( deps = [ "//tensorflow/core:lib", "//tensorflow/core:script_ops_op_lib", - "//tensorflow/core/platform:logging", "//third_party/python_runtime:headers", ], ) @@ -5549,43 +5548,6 @@ tf_py_test( ], ) -cc_library( - name = "stack_trace", - srcs = ["util/stack_trace.cc"], - hdrs = ["util/stack_trace.h"], - deps = [ - ":py_util", - "//third_party/python_runtime:headers", # buildcleaner: keep - "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/synchronization", - "@com_google_absl//absl/types:optional", - ], -) - -pybind_extension( - name = "_stack_trace_binding_for_test", - testonly = True, - srcs = ["util/stack_trace_binding_for_test.cc"], - # TODO(b/138203821): change to "util._tf_stack" once the bug is fixed. 
- module_name = "_stack_trace_binding_for_test", - deps = [ - ":stack_trace", - "//tensorflow/core:test", - "//third_party/python_runtime:headers", # buildcleaner: keep - "@pybind11", - ], -) - -tf_py_test( - name = "stack_trace_test", - srcs = ["util/stack_trace_test.py"], - python_version = "PY3", - deps = [ - ":_stack_trace_binding_for_test", - ":client_testlib", - ], -) - py_library( name = "util", srcs = glob( diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index 056f35f90dd..c08cb8cc1c3 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -637,7 +637,6 @@ cuda_py_test( ":function", ":remote", ":test", - "//tensorflow/python:_stack_trace_binding_for_test", "//tensorflow/python:math_ops", "//tensorflow/python:pywrap_tfe", "//tensorflow/python:random_ops", diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 8a07c11349a..f2f13279927 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -38,7 +38,6 @@ import numpy as np import six from six.moves import xrange # pylint: disable=redefined-builtin -from tensorflow.python import _stack_trace_binding_for_test from tensorflow.python import pywrap_tfe from tensorflow.python.eager import backprop # pylint: disable=unused-import from tensorflow.python.eager import benchmarks_test_base @@ -1302,10 +1301,6 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): def benchmarkTenResourceReadsInCondInInnerFunc(self): self._benchmarkResourceReadsInCondInInnerFunc(10) - def benchmarkPythonStackTrace_1000_times(self): - self._run(lambda: _stack_trace_binding_for_test.stack_trace_n_times(1000), - 1) - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/lib/core/py_util.h b/tensorflow/python/lib/core/py_util.h index 1f3998152f8..a9f39d39461 100644 --- a/tensorflow/python/lib/core/py_util.h +++ b/tensorflow/python/lib/core/py_util.h @@ -16,22 +16,12 @@ limitations under the License. #ifndef TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_ #define TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_ -#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { - // Fetch the exception message as a string. An exception must be set // (PyErr_Occurred() must be true). string PyExceptionFetch(); - -// Assert that Python GIL is held. -inline void DCheckPyGilState() { -#if PY_MAJOR_VERSION >= 3 && PY_MINOR_VERSION >= 4 - DCHECK(PyGILState_Check()); -#endif -} - } // end namespace tensorflow #endif // TENSORFLOW_PYTHON_LIB_CORE_PY_UTIL_H_ diff --git a/tensorflow/python/util/stack_trace.cc b/tensorflow/python/util/stack_trace.cc deleted file mode 100644 index b4d6e3ca866..00000000000 --- a/tensorflow/python/util/stack_trace.cc +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/python/util/stack_trace.h" - -namespace tensorflow { -std::string StackTrace::ToString() const { - DCheckPyGilState(); - - std::ostringstream result; - for (int i = size_ - 1; i >= 0; --i) { - result << " File \"" << PyUnicode_AsUTF8(code_objs_[i]->co_filename) - << "\", line " - << PyCode_Addr2Line(code_objs_[i], last_instructions_[i]) << ", in " - << PyUnicode_AsUTF8(code_objs_[i]->co_name) - << "\n \n"; - // TODO(kkb): Add source code line. See tf_stack.cc's - // FrameSummary::line() function. - } - return result.str(); -} -} // namespace tensorflow diff --git a/tensorflow/python/util/stack_trace.h b/tensorflow/python/util/stack_trace.h deleted file mode 100644 index b04104c4618..00000000000 --- a/tensorflow/python/util/stack_trace.h +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_ -#define TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_ - -#include -#include - -#include -#include -#include - -#include "absl/base/attributes.h" -#include "absl/base/optimization.h" -#include "tensorflow/python/lib/core/py_util.h" - -namespace tensorflow { - -// A class for capturing Python stack trace. -class StackTrace final { - public: - static constexpr int kMaxDepth = 10; - - StackTrace() : size_(0) {} - - // Returns `StackTrace` object that captures the current Python stack trace. - // Python GIL must be aquiared beforehand. - ABSL_MUST_USE_RESULT - ABSL_ATTRIBUTE_HOT - static inline StackTrace Capture() { - DCheckPyGilState(); - - StackTrace result; - const PyFrameObject* frame = PyThreadState_GET()->frame; - int i = 0; - for (; i < kMaxDepth && frame != nullptr; frame = frame->f_back, ++i) { - PyCodeObject* code_obj = frame->f_code; - DCHECK(frame->f_trace == nullptr); - DCHECK(code_obj != nullptr); - - Py_INCREF(code_obj); - result.code_objs_[i] = code_obj; - result.last_instructions_[i] = frame->f_lasti; - } - result.size_ = i; - return result; - } - - ABSL_ATTRIBUTE_HOT - ~StackTrace() { - DCheckPyGilState(); - for (int i = 0; i < size_; ++i) Py_DECREF(code_objs_[i]); - } - - StackTrace(StackTrace&& other) { - code_objs_ = other.code_objs_; - last_instructions_ = other.last_instructions_; - size_ = other.size_; - other.size_ = 0; - } - - ABSL_ATTRIBUTE_HOT - StackTrace& operator=(StackTrace&& other) { - this->~StackTrace(); - code_objs_ = other.code_objs_; - last_instructions_ = other.last_instructions_; - size_ = other.size_; - other.size_ = 0; - return *this; - } - - // Returns string representation of the captured stack trace. - std::string ToString() const; - - // TODO(kkb): Implement structured stack trace object getter. 
- - private: - std::array code_objs_; - std::array last_instructions_; - int size_; - - StackTrace(const StackTrace&) = delete; - StackTrace& operator=(const StackTrace&) = delete; -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_PYTHON_UTIL_STACK_TRACE_H_ diff --git a/tensorflow/python/util/stack_trace_binding_for_test.cc b/tensorflow/python/util/stack_trace_binding_for_test.cc deleted file mode 100644 index 37290017acd..00000000000 --- a/tensorflow/python/util/stack_trace_binding_for_test.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "pybind11/pybind11.h" -#include "tensorflow/core/platform/test_benchmark.h" -#include "tensorflow/python/util/stack_trace.h" - -PYBIND11_MODULE(_stack_trace_binding_for_test, m) { - m.def("to_string", - []() { return tensorflow::StackTrace::Capture().ToString(); }); - m.def("stack_trace_n_times", [](int n) { - for (int i = 0; i < n; ++i) { - auto stack_trace = tensorflow::StackTrace::Capture(); - tensorflow::testing::DoNotOptimize(stack_trace); - } - }); -} diff --git a/tensorflow/python/util/stack_trace_test.py b/tensorflow/python/util/stack_trace_test.py deleted file mode 100644 index 6d6aee8f819..00000000000 --- a/tensorflow/python/util/stack_trace_test.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -"""Tests for fast Python stack trace utility.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import traceback - -from tensorflow.python import _stack_trace_binding_for_test -from tensorflow.python.platform import test - - -# Our stack tracing doesn't have source code line yet, so erase for now. -def erase_line(frame_summary): - return [ - frame_summary.filename, frame_summary.lineno, frame_summary.name, - '' - ] - - -class StackTraceTest(test.TestCase): - - def testStackTrace(self): - our_stack = _stack_trace_binding_for_test.to_string() - true_stack = traceback.extract_stack(limit=10) - true_stack = [erase_line(fs) for fs in true_stack] - true_stack[-1][1] -= 1 # true_stack capturing was one line below. 
- true_stack = ''.join(traceback.format_list(true_stack)) - - self.assertEqual(our_stack, true_stack) - - -if __name__ == '__main__': - test.main() From 9242d0c50952ba79627d457254961f7935f29b21 Mon Sep 17 00:00:00 2001 From: Scott Wegner Date: Thu, 7 May 2020 16:41:51 -0700 Subject: [PATCH 0160/1533] Fix documentation compatibility tag formatting. PiperOrigin-RevId: 310461418 Change-Id: I824ac86b0519d7429bd68189fbf088c30484a681 --- tensorflow/python/framework/ops.py | 20 +++++++++++-------- .../python/keras/optimizer_v2/rmsprop.py | 13 +++++++----- tensorflow/python/training/adam.py | 13 +++++++----- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index f43663d5396..9b8f7cf4fde 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -6261,10 +6261,12 @@ def add_to_collection(name, value): Args: name: The key for the collection. For example, the `GraphKeys` class contains many standard names for collections. - value: The value to add to the collection. @compatibility(eager) - Collections are only supported in eager when variables are created inside - an EagerVariableStore (e.g. as part of a layer or template). - @end_compatibility + value: The value to add to the collection. + + @compatibility(eager) + Collections are only supported in eager when variables are created inside + an EagerVariableStore (e.g. as part of a layer or template). + @end_compatibility """ get_default_graph().add_to_collection(name, value) @@ -6279,10 +6281,12 @@ def add_to_collections(names, value): Args: names: The key for the collections. The `GraphKeys` class contains many standard names for collections. - value: The value to add to the collections. @compatibility(eager) - Collections are only supported in eager when variables are created inside - an EagerVariableStore (e.g. as part of a layer or template). - @end_compatibility + value: The value to add to the collections. + + @compatibility(eager) + Collections are only supported in eager when variables are created inside + an EagerVariableStore (e.g. as part of a layer or template). + @end_compatibility """ get_default_graph().add_to_collections(names, value) diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py index 5de5e59b385..d1deaf34f45 100644 --- a/tensorflow/python/keras/optimizer_v2/rmsprop.py +++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py @@ -121,16 +121,19 @@ class RMSprop(optimizer_v2.OptimizerV2): Setting this to `True` may help with training, but is slightly more expensive in terms of computation and memory. Defaults to `False`. name: Optional name prefix for the operations created when applying - gradients. Defaults to "RMSprop". @compatibility(eager) When eager - execution is enabled, `learning_rate`, `decay`, `momentum`, and - `epsilon` can each be a callable that takes no arguments and returns the - actual value to use. This can be useful for changing these values across - different invocations of optimizer functions. @end_compatibility + gradients. Defaults to "RMSprop". **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by norm; `clipvalue` is clip gradients by value, `decay` is included for backward compatibility to allow time inverse decay of learning rate. `lr` is included for backward compatibility, recommended to use `learning_rate` instead. 
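
For context on the ClipByValue trait change in PATCH 0164 above: the op's clip bounds may be scalars that broadcast against the input, so its operands do not all share one shape even though the result shape matches the input — which is why SameOperandsAndResultShape was the wrong trait. A quick illustration through the public tf.clip_by_value API (TF 2.x eager execution assumed):

    import tensorflow as tf

    t = tf.constant([[1.0, 5.0], [3.0, 8.0]])  # shape (2, 2)
    # Scalar bounds are valid even though they do not match the input's shape.
    clipped = tf.clip_by_value(t, clip_value_min=2.0, clip_value_max=6.0)
    print(clipped.numpy())  # [[2. 5.] [3. 6.]]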
+ + @compatibility(eager) + When eager execution is enabled, `learning_rate`, `decay`, `momentum`, and + `epsilon` can each be a callable that takes no arguments and returns the + actual value to use. This can be useful for changing these values across + different invocations of optimizer functions. + @end_compatibility """ super(RMSprop, self).__init__(name, **kwargs) self._set_hyper("learning_rate", kwargs.get("lr", learning_rate)) diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py index 615ac587c21..93bacbdc0bb 100644 --- a/tensorflow/python/training/adam.py +++ b/tensorflow/python/training/adam.py @@ -92,11 +92,14 @@ class AdamOptimizer(optimizer.Optimizer): Section 2.1), not the epsilon in Algorithm 1 of the paper. use_locking: If True use locks for update operations. name: Optional name for the operations created when applying gradients. - Defaults to "Adam". @compatibility(eager) When eager execution is - enabled, `learning_rate`, `beta1`, `beta2`, and `epsilon` can each be a - callable that takes no arguments and returns the actual value to use. - This can be useful for changing these values across different - invocations of optimizer functions. @end_compatibility + Defaults to "Adam". + + @compatibility(eager) + When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and + `epsilon` can each be a callable that takes no arguments and returns the + actual value to use. This can be useful for changing these values across + different invocations of optimizer functions. + @end_compatibility """ super(AdamOptimizer, self).__init__(use_locking, name) self._lr = learning_rate From 797ac7bf875ac1924fd7c8e27acb7fc74c091b7c Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Thu, 7 May 2020 16:45:04 -0700 Subject: [PATCH 0161/1533] Disable a test from sanitizers. PiperOrigin-RevId: 310461926 Change-Id: I744cd0fd8d3b47e1ef4445b3ff9bd48ea7ff76fe --- tensorflow/python/keras/distribute/BUILD | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index a7b1caa3b02..87625446e2f 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -430,6 +430,11 @@ py_test( srcs = ["multi_worker_tutorial_test.py"], python_version = "PY3", shard_count = 5, + tags = [ + "noasan", + "nomsan", + "notsan", + ], # TODO(b/156029134) deps = [ "//tensorflow/python:platform", "//tensorflow/python/data/ops:dataset_ops", From 79acb0824b8bbb1fb887d2ff625f2f170d80fe1f Mon Sep 17 00:00:00 2001 From: Rachel Lim Date: Thu, 7 May 2020 16:51:02 -0700 Subject: [PATCH 0162/1533] [tf.data] Bug fix: make_csv_dataset should not modify mutable parameters passed into it Fixes #39186. 
PiperOrigin-RevId: 310462826 Change-Id: I8b8dd6f16f9fab6b1e02410dc6e8c91f748772f6 --- .../kernel_tests/csv_dataset_test.py | 11 ++++++++-- .../python/data/experimental/ops/readers.py | 20 ++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py index 941ca209848..13948305aea 100644 --- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py @@ -41,9 +41,9 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def _setup_files(self, inputs, linebreak='\n', compression_type=None): filenames = [] - for i, ip in enumerate(inputs): + for i, file_rows in enumerate(inputs): fn = os.path.join(self.get_temp_dir(), 'temp_%d.csv' % i) - contents = linebreak.join(ip).encode('utf-8') + contents = linebreak.join(file_rows).encode('utf-8') if compression_type is None: with open(fn, 'wb') as f: f.write(contents) @@ -580,6 +580,13 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]], record_defaults=record_defaults) + def testCsvDataset_immutableParams(self): + inputs = [['a,b,c', '1,2,3', '4,5,6']] + filenames = self._setup_files(inputs) + select_cols = ['a', 'c'] + _ = readers.make_csv_dataset( + filenames, batch_size=1, select_columns=select_cols) + self.assertAllEqual(select_cols, ['a', 'c']) if __name__ == '__main__': test.main() diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index 8795a206bb1..b8f4c34f40e 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -183,24 +183,30 @@ def _get_sorted_col_indices(select_columns, column_names): """Transforms select_columns argument into sorted column indices.""" names_to_indices = {n: i for i, n in enumerate(column_names)} num_cols = len(column_names) - for i, v in enumerate(select_columns): + + results = [] + for v in select_columns: + # If value is already an int, check if it's valid. if isinstance(v, int): if v < 0 or v >= num_cols: raise ValueError( "Column index %d specified in select_columns out of valid range." % v) - continue - if v not in names_to_indices: + results.append(v) + # Otherwise, check that it's a valid column name and convert to the + # the relevant column index. + elif v not in names_to_indices: raise ValueError( "Value '%s' specified in select_columns not a valid column index or " "name." % v) - select_columns[i] = names_to_indices[v] + else: + results.append(names_to_indices[v]) # Sort and ensure there are no duplicates - result = sorted(set(select_columns)) - if len(result) != len(select_columns): + results = sorted(set(results)) + if len(results) != len(select_columns): raise ValueError("select_columns contains duplicate columns") - return result + return results def _maybe_shuffle_and_repeat( From 491d6e42ce118a0c0156ce93eca67541069c617f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 16:56:39 -0700 Subject: [PATCH 0163/1533] This will break once kwargs has no key of 'partitioner'. 
PiperOrigin-RevId: 310463715 Change-Id: I45fcd670172a8d96f509fbf3e4ba64556386b058 --- tensorflow/python/tpu/tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py index c70a26f2b4d..28eba69b7da 100644 --- a/tensorflow/python/tpu/tpu.py +++ b/tensorflow/python/tpu/tpu.py @@ -1353,7 +1353,7 @@ def split_compile_and_replicate(computation, def custom_getter(getter, name, *args, **kwargs): """Variables on TPU have a few restrictions.""" - partitioner = kwargs["partitioner"] + partitioner = kwargs.get("partitioner", None) if partitioner is not None: kwargs["partitioner"] = None logging.warning( From 4d94fe13faca561ad0c334e707632356ad071acb Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 7 May 2020 17:04:56 -0700 Subject: [PATCH 0164/1533] Enable MLIR bridge for ops that are already supported * Removed SameOperandsAndResultShape from ClipByValue op as the op has multiple operands and UnchangedShape function doesn't mean all shapes are equal. PiperOrigin-RevId: 310465081 Change-Id: Id56c0d67e1b70638c57df5f4d1e5da1875529064 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 23 ++++++++++++++++++- .../xla/transforms/legalize_tf_with_tf2xla.cc | 2 ++ tensorflow/compiler/tests/BUILD | 2 ++ tensorflow/compiler/tests/binary_ops_test.py | 1 - tensorflow/compiler/tests/concat_ops_test.py | 2 ++ tensorflow/compiler/tests/ternary_ops_test.py | 6 +++++ tensorflow/compiler/tests/xla_ops_test.py | 1 - 7 files changed, 34 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 7a3c9617e2e..a33b339220f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1217,7 +1217,7 @@ that are not a number (NaN) or infinity (Inf). Otherwise, passes `tensor` as-is. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect, SameOperandsAndResultType]> { +def TF_ClipByValueOp : TF_Op<"ClipByValue", [NoSideEffect]> { let summary = "Clips tensor values to a specified min and max."; let description = [{ @@ -1682,6 +1682,27 @@ Given an input tensor, this function computes hyperbolic cosine of every TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CrossOp : TF_Op<"Cross", [NoSideEffect]> { + let summary = "Compute the pairwise cross product."; + + let description = [{ +`a` and `b` must be the same shape; they can either be simple 3-element vectors, +or any shape where the innermost dimension is 3. In the latter case, each pair +of corresponding 3-element vectors is cross-multiplied independently. 
+ }]; + + let arguments = (ins + TF_IntOrFpTensor:$a, + TF_IntOrFpTensor:$b + ); + + let results = (outs + TF_IntOrFpTensor:$product + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [AllTypesMatch<["input", "output"]>, NoSideEffect]> { let summary = "An Op to sum inputs across replicated TPU instances."; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index bc2a8b606da..be6ba167419 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -100,8 +100,10 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index cd22b527444..5cca1e69f53 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -470,6 +470,7 @@ tf_xla_py_test( name = "concat_ops_test", size = "medium", srcs = ["concat_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "many_xla_args", @@ -1342,6 +1343,7 @@ tf_xla_py_test( name = "ternary_ops_test", size = "medium", srcs = ["ternary_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 9d48758928e..bd0131920f9 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -1511,7 +1511,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([1, 0], dtype=np.int32), expected=np.array([[1 + 1j, 3 + 3j], [2 - 2j, 4 - 4j]], dtype=dtype)) - @test_util.disable_mlir_bridge("Enable tf.Cross Compilation") def testCross(self): for dtype in self.float_types: self._testBinary( diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py index 10dd2d6542c..f35ded924d5 100644 --- a/tensorflow/compiler/tests/concat_ops_test.py +++ b/tensorflow/compiler/tests/concat_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gradients_impl @@ -293,6 +294,7 @@ class ConcatTest(xla_test.XLATestCase): # The purpose of this is to ensure that XLA on GPU will not run out of memory # with too many arguments. 
+ @test_util.disable_mlir_bridge("TODO(b/153895138): Debug.") def testConcatLargeNumberOfTensors(self): if "CPU" in self.device: self.skipTest("This test can time out on CPU, so we will just allow " diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index a2a47f19a6e..ff3558729e5 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -24,6 +24,7 @@ import scipy.special as sps from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops @@ -47,6 +48,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): {'start': 1, 'end': 2, 'num': 1}, {'start': 1, 'end': 4, 'num': 3}, {'start': 0, 'end': 41, 'num': 42}) + @test_util.disable_mlir_bridge('Requires dynamic shape handling') def testLinspace(self, start, end, num): expected = np.linspace(start, end, num, dtype=np.float32) result = self._testTernary( @@ -74,6 +76,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): np.int32(2), expected=np.array([1, 3, 5], dtype=np.int32)) + @test_util.disable_mlir_bridge('TODO(b/155949336)') def testSelect(self): for dtype in self.numeric_types: self._testTernary( @@ -179,6 +182,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): np.array([8, 9], dtype=dtype), expected=np.array([[7, 9], [8, 7], [8, 9]], dtype=dtype)) + @test_util.disable_mlir_bridge('TODO(b/155097273)') def testSlice(self): for dtype in self.numeric_types: self._testTernary( @@ -211,6 +215,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): upper, expected=np.minimum(np.maximum(x, lower), upper)) + @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetaincSanity(self): # This operation is only supported for float32 and float64. for dtype in self.numeric_types & {np.float32, np.float64}: @@ -248,6 +253,7 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): 'atol': 2e-4 }, ) + @test_util.disable_mlir_bridge('Enable tf.Betainc Compilation') def testBetainc(self, sigma, rtol, atol): # This operation is only supported for float32 and float64. for dtype in self.numeric_types & {np.float32, np.float64}: diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 3b304df9024..b01c5aea4fa 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -72,7 +72,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): np.array([7, 11], dtype=dtype)), expected=np.array([[8, 13], [10, 15]], dtype=dtype)) - @test_util.disable_mlir_bridge('Not supported yet') def testBroadcast(self): for dtype in self.numeric_types: v = np.arange(4, dtype=np.int32).astype(dtype).reshape([2, 2]) From 804dd8af35e48acbe9443c881efc002272de860f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 17:16:24 -0700 Subject: [PATCH 0165/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 310466885 Change-Id: I263a0919eccf5a29b687c62a1f70afb57e337ac3 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 1a0391276725f75265a526a5d5ae10e8e09022cb Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Thu, 7 May 2020 17:21:16 -0700 Subject: [PATCH 0166/1533] [Executor] Optimize `PropagatorState::FindOrCreateChildFrame()`. At present, the `PropagatorState` must look up the "frame_name" and "parallel_iterations" attributes on the `Enter` node's `NodeDef` each time it propagates its outputs. This information is mostly cached already in the `ImmutableExecutorState`. This change makes the following optimizations: 1. Cache a `FrameInfo*` for each enter node and provide O(1) lookup using an `std::vector`. 2. Add `FrameInfo::parallel_iterations`. 3. Perform `frame_name` and `parallel_iterations` resolution once at `ImmutableExecutorState` construction time. 4. Avoid building and storing `std::string FrameState::frame_name`, which is only used for verbose logging. Instead use the `uint64 FrameState::frame_id` as the key in all data structures, to optimize lookup. This is safe because we already depend on a lack of collisions between frame IDs (since the frame IDs are used in rendezvous keys when tensors are sent between devices). This change also modifies the "executor_test.cc" loop microbenchmarks to cover different numbers of loop variables (which, in the lowered case, directly translates to the number of "Enter" ops). 
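For illustration, a small Python sketch of the new frame keying under point 4 (hypothetical helper names; the real implementation is the C++ in propagator_state.cc below and uses TensorFlow's Hash64/Hash64Combine rather than the stand-in mixer here):

def hash_combine(a, b):
    # Stand-in 64-bit mixer, not TensorFlow's Hash64Combine.
    return (a * 0x9E3779B97F4A7C15 + b) & 0xFFFFFFFFFFFFFFFF

def child_frame_id(parent_frame_id, iter_num, frame_name):
    # Old key: the string "<parent_name>;<iter_num>;<enter_frame_name>", rebuilt
    # from node attributes on every propagation. New key: a fixed-width integer
    # combined from values that are already cached, used directly in the map.
    name_hash = hash(frame_name) & 0xFFFFFFFFFFFFFFFF
    return hash_combine(parent_frame_id, hash_combine(iter_num, name_hash))

outstanding_frames = {}  # frame_id -> frame state, replacing the name-keyed map
outstanding_frames[child_frame_id(0, 3, "while_ctx")] = "child frame state"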
PiperOrigin-RevId: 310467528 Change-Id: Id70d284e99fd7537156a3a5e10da827aea4791f9 --- tensorflow/core/common_runtime/BUILD | 1 + .../core/common_runtime/executor_test.cc | 90 ++++++++++++++----- .../immutable_executor_state.cc | 41 +++++++-- .../common_runtime/immutable_executor_state.h | 36 +++++--- .../core/common_runtime/propagator_state.cc | 64 +++++++------ .../core/common_runtime/propagator_state.h | 9 +- 6 files changed, 158 insertions(+), 83 deletions(-) diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index eb506d29571..76a3c276e2d 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -1212,6 +1212,7 @@ cc_library( ":propagator_debug_utils", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/platform:hash", "//tensorflow/core/profiler/lib:traceme", ], ) diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc index 9a1b7cff813..dd65b5dce1d 100644 --- a/tensorflow/core/common_runtime/executor_test.cc +++ b/tensorflow/core/common_runtime/executor_test.cc @@ -549,7 +549,8 @@ BENCHMARK(BM_FeedInputFetchOutput); // // ...using the functional `WhileOp` (if `lower` is false) or the // `Switch`/`Merge`-style of control flow (if `lower` is true). -static void BM_WhileLoopHelper(int iters, int loop_iters, bool lower) { +static void BM_WhileLoopHelper(int iters, int loop_iters, int loop_vars, + bool lower) { testing::StopTiming(); std::unique_ptr graph(new Graph(OpRegistry::Global())); @@ -558,20 +559,44 @@ static void BM_WhileLoopHelper(int iters, int loop_iters, bool lower) { // Define the loop body as a function: `x = x + 1`. const Tensor one_t = test::AsScalar(1); + + std::vector args; + args.reserve(loop_vars); + args.push_back("x: int32"); + for (int i = 1; i < loop_vars; ++i) { + args.push_back(strings::StrCat("x", i, ": int32")); + } + + std::vector body_rets; + body_rets.reserve(loop_vars); + body_rets.push_back("y: int32"); + for (int i = 1; i < loop_vars; ++i) { + body_rets.push_back(strings::StrCat("y", i, ": int32")); + } + + std::vector body_nodes; + body_nodes.reserve(1 + loop_vars); + body_nodes.push_back( + {{"one"}, "Const", {}, {{"value", one_t}, {"dtype", DT_INT32}}}); + body_nodes.push_back({{"y"}, "Add", {"x", "one"}, {{"T", DT_INT32}}}); + for (int i = 1; i < loop_vars; ++i) { + body_nodes.push_back({{strings::StrCat("y", i)}, + "Identity", + {strings::StrCat("x", i)}, + {{"T", DT_INT32}}}); + } + *f_lib_proto.add_function() = FunctionDefHelper::Define( // Name "XPlusOne", // Args - {"x: int32"}, + args, // Return values - {"y: int32"}, + body_rets, // Attr def {}, // Nodes - { - {{"one"}, "Const", {}, {{"value", one_t}, {"dtype", DT_INT32}}}, - {{"y"}, "Add", {"x", "one"}, {{"T", DT_INT32}}}, - }); + body_nodes); // Define the loop condition as a function: `x < loop_iters`. 
const Tensor loop_iters_t = test::AsScalar(loop_iters); @@ -579,7 +604,7 @@ static void BM_WhileLoopHelper(int iters, int loop_iters, bool lower) { // Name "LessThanOrEqualToN", // Args - {"x: int32"}, + args, // Return values {"z: bool"}, // Attr def @@ -594,7 +619,12 @@ static void BM_WhileLoopHelper(int iters, int loop_iters, bool lower) { TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto)); auto a = ops::Const(root.WithOpName("A"), 0, {}); Node* while_node; - std::vector inputs({NodeBuilder::NodeOut(a.node())}); + std::vector inputs; + std::vector input_types(loop_vars, DT_INT32); + inputs.reserve(loop_vars); + for (int i = 0; i < loop_vars; ++i) { + inputs.push_back(NodeBuilder::NodeOut(a.node())); + } AttrValue int32_attr; int32_attr.set_type(DT_INT32); AttrValue cond_func; @@ -604,7 +634,7 @@ static void BM_WhileLoopHelper(int iters, int loop_iters, bool lower) { TF_ASSERT_OK( NodeBuilder("while", "While", &root.graph()->flib_def()) .Input(inputs) - .Attr("T", {DT_INT32}) + .Attr("T", input_types) .Attr("cond", cond_func) .Attr("body", body_func) .Attr("parallel_iterations", 100) @@ -635,21 +665,33 @@ static void BM_WhileLoopHelper(int iters, int loop_iters, bool lower) { test::Benchmark("cpu", graph.release()).Run(iters); } -static void BM_LoweredWhileLoop(int iters, int loop_iters) { - BM_WhileLoopHelper(iters, loop_iters, /* lower= */ true); +static void BM_LoweredWhileLoop(int iters, int loop_iters, int loop_vars) { + BM_WhileLoopHelper(iters, loop_iters, loop_vars, /* lower= */ true); } -BENCHMARK(BM_LoweredWhileLoop)->Arg(0); -BENCHMARK(BM_LoweredWhileLoop)->Arg(1); -BENCHMARK(BM_LoweredWhileLoop)->Arg(10); -BENCHMARK(BM_LoweredWhileLoop)->Arg(100); -BENCHMARK(BM_LoweredWhileLoop)->Arg(1000); +BENCHMARK(BM_LoweredWhileLoop) + ->ArgPair(0, 1) + ->ArgPair(1, 1) + ->ArgPair(10, 1) + ->ArgPair(100, 1) + ->ArgPair(1000, 1) + ->ArgPair(0, 100) + ->ArgPair(1, 100) + ->ArgPair(10, 100) + ->ArgPair(100, 100) + ->ArgPair(1000, 100); -static void BM_FunctionalWhileLoop(int iters, int loop_iters) { - BM_WhileLoopHelper(iters, loop_iters, /* lower= */ false); +static void BM_FunctionalWhileLoop(int iters, int loop_iters, int loop_vars) { + BM_WhileLoopHelper(iters, loop_iters, loop_vars, /* lower= */ false); } -BENCHMARK(BM_FunctionalWhileLoop)->Arg(0); -BENCHMARK(BM_FunctionalWhileLoop)->Arg(1); -BENCHMARK(BM_FunctionalWhileLoop)->Arg(10); -BENCHMARK(BM_FunctionalWhileLoop)->Arg(100); -BENCHMARK(BM_FunctionalWhileLoop)->Arg(1000); +BENCHMARK(BM_FunctionalWhileLoop) + ->ArgPair(0, 1) + ->ArgPair(1, 1) + ->ArgPair(10, 1) + ->ArgPair(100, 1) + ->ArgPair(1000, 1) + ->ArgPair(0, 100) + ->ArgPair(1, 100) + ->ArgPair(10, 100) + ->ArgPair(100, 100) + ->ArgPair(1000, 100); } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/immutable_executor_state.cc b/tensorflow/core/common_runtime/immutable_executor_state.cc index a98d9f0feaa..03d12a0e98a 100644 --- a/tensorflow/core/common_runtime/immutable_executor_state.cc +++ b/tensorflow/core/common_runtime/immutable_executor_state.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "tensorflow/core/graph/edgeset.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_node_util.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" namespace tensorflow { @@ -39,9 +40,6 @@ ImmutableExecutorState::~ImmutableExecutorState() { params_.delete_kernel(item->kernel); } } - for (auto fiter : frame_info_) { - delete fiter.second; - } } namespace { @@ -71,11 +69,16 @@ void GetMaxPendingCounts(const Node* n, size_t* max_pending, ImmutableExecutorState::FrameInfo* ImmutableExecutorState::EnsureFrameInfo( const string& fname) { - auto slot = &frame_info_[fname]; - if (*slot == nullptr) { - *slot = new FrameInfo; + auto iter = frame_info_.find(fname); + if (iter != frame_info_.end()) { + return iter->second.get(); + } else { + auto frame_info = absl::make_unique(fname); + absl::string_view fname_view = frame_info->name; + auto emplace_result = + frame_info_.emplace(fname_view, std::move(frame_info)); + return emplace_result.first->second.get(); } - return *slot; } Status ImmutableExecutorState::Initialize(const Graph& graph) { @@ -89,7 +92,7 @@ Status ImmutableExecutorState::Initialize(const Graph& graph) { EnsureFrameInfo(it)->nodes = absl::make_unique>(); } - root_frame_info_ = frame_info_[""]; + root_frame_info_ = frame_info_[""].get(); pending_ids_.resize(gview_.num_nodes()); @@ -157,6 +160,28 @@ Status ImmutableExecutorState::Initialize(const Graph& graph) { TF_RETURN_IF_ERROR( GetNodeAttr(n->attrs(), "is_constant", &is_constant_enter)); item->is_constant_enter = is_constant_enter; + + string frame_name; + TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "frame_name", &frame_name)); + FrameInfo* frame_info = frame_info_[frame_name].get(); + + int parallel_iterations; + TF_RETURN_IF_ERROR( + GetNodeAttr(n->attrs(), "parallel_iterations", ¶llel_iterations)); + + if (frame_info->parallel_iterations == -1) { + frame_info->parallel_iterations = parallel_iterations; + } else if (frame_info->parallel_iterations != parallel_iterations) { + LOG(WARNING) << "Loop frame \"" << frame_name + << "\" had two different values for parallel_iterations: " + << frame_info->parallel_iterations << " vs. " + << parallel_iterations << "."; + } + + if (enter_frame_info_.size() <= id) { + enter_frame_info_.resize(id + 1); + } + enter_frame_info_[id] = frame_info; } else { item->is_constant_enter = false; } diff --git a/tensorflow/core/common_runtime/immutable_executor_state.h b/tensorflow/core/common_runtime/immutable_executor_state.h index 50c98939ea8..a35edfe227c 100644 --- a/tensorflow/core/common_runtime/immutable_executor_state.h +++ b/tensorflow/core/common_runtime/immutable_executor_state.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/common_runtime/graph_view.h" #include "tensorflow/core/common_runtime/local_executor_params.h" #include "tensorflow/core/common_runtime/pending_counts.h" @@ -41,11 +42,16 @@ class Graph; class ImmutableExecutorState { public: struct FrameInfo { - FrameInfo() - : input_count(0), + explicit FrameInfo(string name) + : name(std::move(name)), + input_count(0), total_inputs(0), pending_counts(nullptr), - nodes(nullptr) {} + nodes(nullptr), + parallel_iterations(-1) {} + + // The name of the frame. + string name; // The total number of inputs to a frame. int input_count; @@ -63,6 +69,9 @@ class ImmutableExecutorState { // The nodes in a frame. Used only for debugging. 
std::unique_ptr> nodes; + + // The number of iterations of this frame that can execute concurrently. + int32 parallel_iterations; }; explicit ImmutableExecutorState(const LocalExecutorParams& p) @@ -83,17 +92,13 @@ class ImmutableExecutorState { } const std::vector& root_nodes() const { return root_nodes_; } - const FrameInfo* get_frame_info(const string& frame_name) const { - auto it_frame_info = frame_info_.find(frame_name); - if (it_frame_info == frame_info_.end()) { - return nullptr; - } else { - return it_frame_info->second; - } - } - const FrameInfo& get_root_frame_info() const { return *root_frame_info_; } + const FrameInfo& get_enter_frame_info(const NodeItem& node_item) const { + DCHECK(node_item.is_enter); + return *enter_frame_info_[node_item.node_id]; + } + bool requires_control_flow_support() const { return requires_control_flow_; } // Copies the pending counts for nodes in this graph to the given array. @@ -135,9 +140,14 @@ class ImmutableExecutorState { // Mapping from frame name to static information about the frame. // TODO(yuanbyu): We could cache it along with the graph so to avoid // the overhead of constructing it for each executor instance. - gtl::FlatMap frame_info_; + absl::flat_hash_map> + frame_info_; const FrameInfo* root_frame_info_; // Not owned. + // If the graph contains any "Enter" or "RefEnter" nodes, this vector maps + // dense node IDs to the corresponding FrameInfo. + std::vector enter_frame_info_; + // If `requires_control_flow_` is false, this points to an array of initial // pending counts for the nodes in the graph, indexed by node ID. std::unique_ptr[]> atomic_pending_counts_; diff --git a/tensorflow/core/common_runtime/propagator_state.cc b/tensorflow/core/common_runtime/propagator_state.cc index 4fd5e0f97d9..30529dec742 100644 --- a/tensorflow/core/common_runtime/propagator_state.cc +++ b/tensorflow/core/common_runtime/propagator_state.cc @@ -16,9 +16,11 @@ limitations under the License. #include "tensorflow/core/common_runtime/propagator_state.h" #include "tensorflow/core/common_runtime/graph_view.h" +#include "tensorflow/core/common_runtime/immutable_executor_state.h" #include "tensorflow/core/common_runtime/propagator_debug_utils.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/hash.h" #include "tensorflow/core/profiler/lib/traceme.h" namespace tensorflow { @@ -33,14 +35,14 @@ PropagatorState::PropagatorState(const ImmutableExecutorState& immutable_state, // We assume root_frame_->frame_name.empty(). root_frame_ = new FrameState(immutable_state_, 1); root_frame_->frame_id = 0; // must be 0 - root_frame_->InitializeFrameInfo(root_frame_->frame_name); + root_frame_->InitializeFrameInfo(immutable_state_.get_root_frame_info()); // Initialize iteration 0. root_frame_->SetIteration( 0, new PropagatorState::IterationState(0, root_frame_->pending_counts, root_frame_->total_input_tensors)); - outstanding_frames_.insert({root_frame_->frame_name, root_frame_}); + outstanding_frames_.emplace(root_frame_->frame_id, root_frame_); } PropagatorState::~PropagatorState() { @@ -224,16 +226,16 @@ void PropagatorState::FindOrCreateChildFrame(FrameState* frame, const NodeItem& node_item, FrameState** child) { // Get the child frame name. 
- AttrSlice attrs(node_item.kernel->def()); - const string& enter_name = GetNodeAttrString(attrs, "frame_name"); - DCHECK(!enter_name.empty()) << "Could not find \"frame_name\" attr in node " - << node_item.kernel->name(); - const string child_name = strings::StrCat( - frame->frame_name, ";", iter_state->iter_num, ";", enter_name); + const ImmutableExecutorState::FrameInfo& frame_info = + immutable_state_.get_enter_frame_info(node_item); + + const uint64 child_id = Hash64Combine( + frame->frame_id, + Hash64Combine(iter_state->iter_num, Hash64(frame_info.name))); { - mutex_lock executor_lock(mu_); - auto it = outstanding_frames_.find(child_name); + tf_shared_lock executor_lock(mu_); + auto it = outstanding_frames_.find(child_id); if (it != outstanding_frames_.end()) { *child = it->second; return; @@ -242,20 +244,18 @@ void PropagatorState::FindOrCreateChildFrame(FrameState* frame, // Need to create a new frame instance. // Note that this new frame instance is created without any locks. - if (vlog_) VLOG(2) << "Create frame: " << child_name; + if (vlog_) { + const string child_name = strings::StrCat( + frame->frame_name, ";", iter_state->iter_num, ";", frame_info.name); + VLOG(2) << "Create frame: " << child_name << " id: " << child_id; + } - int parallel_iters; - bool found_parallel_iters = - TryGetNodeAttr(attrs, "parallel_iterations", ¶llel_iters); - DCHECK(found_parallel_iters) - << "Could not find \"parallel_iterations\" attr in node " - << node_item.kernel->name(); - FrameState* temp = new FrameState(immutable_state_, parallel_iters); - temp->frame_name = child_name; - temp->frame_id = Hash64(child_name); + FrameState* temp = + new FrameState(immutable_state_, frame_info.parallel_iterations); + temp->frame_id = child_id; temp->parent_frame = frame; temp->parent_iter = iter_state; - temp->InitializeFrameInfo(enter_name); + temp->InitializeFrameInfo(frame_info); // Initialize iteration 0. { @@ -266,13 +266,13 @@ void PropagatorState::FindOrCreateChildFrame(FrameState* frame, { mutex_lock executor_lock(mu_); - auto it = outstanding_frames_.find(child_name); + auto it = outstanding_frames_.find(child_id); if (it != outstanding_frames_.end()) { *child = it->second; } else { mutex_lock frame_lock(frame->mu); iter_state->outstanding_frame_count++; - outstanding_frames_[child_name] = temp; + outstanding_frames_[child_id] = temp; *child = temp; temp = nullptr; } @@ -349,11 +349,10 @@ void PropagatorState::DeleteFrame(FrameState* frame, TaggedNodeSeq* ready) { } // Delete the frame. 
- const string& frame_name = frame->frame_name; - if (vlog_) VLOG(2) << "Delete frame " << frame_name; + if (vlog_) VLOG(2) << "Delete frame " << frame->frame_id; { mutex_lock executor_lock(mu_); - outstanding_frames_.erase(frame_name); + outstanding_frames_.erase(frame->frame_id); } delete frame; } @@ -655,14 +654,11 @@ bool PropagatorState::FrameState::CleanupIterations(IterationState* iter_state, } void PropagatorState::FrameState::InitializeFrameInfo( - const string& enter_name) { - const ImmutableExecutorState::FrameInfo* finfo = - immutable_state.get_frame_info(enter_name); - DCHECK_NE(finfo, nullptr); - pending_counts = finfo->pending_counts.get(); - total_input_tensors = finfo->total_inputs; - num_pending_inputs = finfo->input_count; - nodes = finfo->nodes.get(); + const ImmutableExecutorState::FrameInfo& finfo) { + pending_counts = finfo.pending_counts.get(); + total_input_tensors = finfo.total_inputs; + num_pending_inputs = finfo.input_count; + nodes = finfo.nodes.get(); } void PropagatorState::FrameState::SetIteration(int64 iter, diff --git a/tensorflow/core/common_runtime/propagator_state.h b/tensorflow/core/common_runtime/propagator_state.h index 459e28a83ee..d61adeff5c4 100644 --- a/tensorflow/core/common_runtime/propagator_state.h +++ b/tensorflow/core/common_runtime/propagator_state.h @@ -279,7 +279,7 @@ class PropagatorState { // during structured traversal: parent_frame->mu < mu. mutex mu; - void InitializeFrameInfo(const string& enter_name); + void InitializeFrameInfo(const ImmutableExecutorState::FrameInfo& finfo); inline IterationState* GetIteration(int64 iter) TF_EXCLUSIVE_LOCKS_REQUIRED(mu) { @@ -447,12 +447,13 @@ class PropagatorState { // The root frame in which the execution of this step is started. FrameState* root_frame_; - // Mapping from frame name to outstanding frames. A new frame is created + // Mapping from frame ID to outstanding frames. A new frame is created // at some iteration of an active frame. So the unique key for the new - // child frame is composed of the name of the parent frame, the iteration + // child frame is a hash composed of the ID of the parent frame, the iteration // number at which the parent frame is creating the new frame, and the // name of the new frame from nodedef. 
- gtl::FlatMap outstanding_frames_ TF_GUARDED_BY(mu_); + absl::flat_hash_map outstanding_frames_ + TF_GUARDED_BY(mu_); TF_DISALLOW_COPY_AND_ASSIGN(PropagatorState); }; From cab22e9d46cc653798a802eb57380eebd0cc7e16 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 7 May 2020 17:28:51 -0700 Subject: [PATCH 0167/1533] Don't concatenate empty tensors PiperOrigin-RevId: 310468853 Change-Id: I30c5006d172c49d1203e92eda7c040f80ca4a1ed --- tensorflow/core/kernels/list_kernels.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h index 855506e9d8a..37fc1b3ae08 100644 --- a/tensorflow/core/kernels/list_kernels.h +++ b/tensorflow/core/kernels/list_kernels.h @@ -435,8 +435,10 @@ class TensorListConcat : public OpKernel { for (int i = 0; i < tensor_list->tensors().size(); i++) { const Tensor& element_tensor = tensor_list->tensors()[i]; if (element_tensor.dtype() != DT_INVALID) { - inputs_flat.emplace_back(new typename TTypes::ConstMatrix( - element_tensor.shaped({1, element_tensor.NumElements()}))); + if (element_tensor.NumElements() > 0) { + inputs_flat.emplace_back(new typename TTypes::ConstMatrix( + element_tensor.shaped({1, element_tensor.NumElements()}))); + } } else { AllocatorAttributes attr; if (element_dtype_ == DT_VARIANT) { From f8972015fb81588c994fcea65330924f564b9bf4 Mon Sep 17 00:00:00 2001 From: josh meyer Date: Thu, 7 May 2020 17:31:34 -0700 Subject: [PATCH 0168/1533] better error msg for incorrect WAV format chunk --- tensorflow/core/lib/wav/wav_io.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc index d318059e8f6..fd0f796d93c 100644 --- a/tensorflow/core/lib/wav/wav_io.cc +++ b/tensorflow/core/lib/wav/wav_io.cc @@ -235,7 +235,7 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string, ReadValue(wav_string, &format_chunk_size, &offset)); if ((format_chunk_size != 16) && (format_chunk_size != 18)) { return errors::InvalidArgument( - "Bad file size for WAV: Expected 16 or 18, but got", format_chunk_size); + "Bad format chunk size for WAV: Expected 16 or 18, but got", format_chunk_size); } uint16 audio_format; TF_RETURN_IF_ERROR(ReadValue(wav_string, &audio_format, &offset)); From acfe8165248aab4d5c6d77c06f9b8449473ed86c Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 7 May 2020 17:54:15 -0700 Subject: [PATCH 0169/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/910871532101 PiperOrigin-RevId: 310472224 Change-Id: I4afb11f72d186f62025ce0ccd36679c15de3e67f --- tensorflow/workspace.bzl | 4 ++-- third_party/mlir/BUILD | 31 +++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 77e2538a107..85c0ca29fe9 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "307cfdf5338641e3a895857ef02dc9da35cd0eb6" - LLVM_SHA256 = "5e75125ecadee4f91e07c20bf6612d740913a677348fd33c7264ee8fe7d12b17" + LLVM_COMMIT = "91087153210132a4c2d3cf19a4526d8f395cb5a4" + LLVM_SHA256 = "b2e2314ce2d4a7f0da436063c922d716171415d1b5e85889235d9eab1ecb98c1" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 8074eb5e290..925fad7414f 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -657,6 +657,7 @@ gentbl( td_file = "include/mlir/Dialect/Shape/IR/ShapeOps.td", td_srcs = [ ":StdOpsTdFiles", + "include/mlir/Dialect/Shape/IR/ShapeBase.td", "include/mlir/Interfaces/InferTypeOpInterface.td", ], ) @@ -715,24 +716,35 @@ cc_library( ], ) +gentbl( + name = "StandardOpsTransformsPassIncGen", + strip_include_prefix = "include", + tbl_outs = [( + "-gen-pass-decls", + "include/mlir/Dialect/StandardOps/Transforms/Passes.h.inc", + )], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/StandardOps/Transforms/Passes.td", + td_srcs = [":PassBaseTdFiles"], +) + cc_library( name = "StandardOpsTransforms", - srcs = glob( - [ - "lib/Dialect/StandardOps/Transforms/*.cpp", - "lib/Dialect/StandardOps/Transforms/*.h", - ], - ), - hdrs = glob([ - "include/mlir/Dialect/StandardOps/Transforms/*.h", + srcs = glob([ + "lib/Dialect/StandardOps/Transforms/*.cpp", + "lib/Dialect/StandardOps/Transforms/*.h", ]), + hdrs = glob(["include/mlir/Dialect/StandardOps/Transforms/*.h"]), includes = ["include"], deps = [ ":Analysis", ":ControlFlowInterfaces", ":IR", + ":Pass", ":StandardOps", + ":StandardOpsTransformsPassIncGen", ":Support", + ":Transforms", "@llvm-project//llvm:support", ], ) @@ -2471,6 +2483,7 @@ cc_library( ":NVVMDialect", ":Parser", ":Pass", + ":StandardOpsTransforms", ":StandardToSPIRVConversions", ":StandardToStandard", ":Support", @@ -2568,6 +2581,8 @@ cc_library( ":SPIRVPassIncGen", ":Shape", ":StandardOps", + ":StandardOpsTransforms", + ":StandardOpsTransformsPassIncGen", ":StandardToSPIRVConversions", ":StandardToStandard", ":Transforms", From 3e31be22cfcd22e7fced904df4c5be014b7d24b7 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Thu, 7 May 2020 18:46:23 -0700 Subject: [PATCH 0170/1533] Make regularizers API more consistent. API change (backwards compatible): rename argument `l` in aliases `l1()` and `l2()` to `l1` and `l2` respectively. `l` argument name still works and takes priority over the default value, if passed by name. API addition: add `L1` and `L2` classes (previously named `l1` and `l2`). Aliases `l1` and `l2` still work. Not an actual API change, but nevertheless reflect in golden API files: refactor `l1` and `l2` functions into classes `L1` and `L2`. This makes them consistent with the rest of the API and removes the need for deserialization special cases. 
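As a usage-level illustration of the compatibility claims above (assumes a TF 2.x eager environment with this commit applied; not part of the change itself):

import tensorflow as tf

reg_new = tf.keras.regularizers.L2(0.01)    # new class spelling
reg_old = tf.keras.regularizers.l2(l=0.01)  # old alias and old `l` keyword still accepted
layer = tf.keras.layers.Dense(3, kernel_regularizer='l2')  # string identifier unchanged

x = tf.ones((2, 2))
print(float(reg_new(x)), float(reg_old(x)))  # both: 0.01 * reduce_sum(square(x)) = 0.04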
PiperOrigin-RevId: 310478018 Change-Id: I46d433a2149a97615ca311932c070ab23e8adae2 --- .../python/keras/layers/serialization_test.py | 8 +- tensorflow/python/keras/regularizers.py | 115 +++++++++++------- .../keras/tests/add_loss_correctness_test.py | 2 +- .../python/keras/utils/generic_utils_test.py | 6 +- .../tensorflow.keras.regularizers.-l1.pbtxt | 18 +++ .../tensorflow.keras.regularizers.-l2.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.l1.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.l2.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.pbtxt | 24 ++-- .../tensorflow.keras.regularizers.-l1.pbtxt | 18 +++ .../tensorflow.keras.regularizers.-l2.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.l1.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.l2.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.pbtxt | 24 ++-- 14 files changed, 258 insertions(+), 65 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py index b18a0fbd8cc..920881c6a3e 100644 --- a/tensorflow/python/keras/layers/serialization_test.py +++ b/tensorflow/python/keras/layers/serialization_test.py @@ -53,7 +53,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): new_layer = keras.layers.deserialize(config) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) if tf2.enabled(): self.assertEqual(new_layer.kernel_initializer.__class__, keras.initializers.OnesV2) @@ -88,7 +88,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): config, custom_objects={'SerializableInt': SerializableInt}) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) if tf2.enabled(): self.assertEqual(new_layer.kernel_initializer.__class__, keras.initializers.OnesV2) @@ -116,7 +116,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): self.assertEqual(new_layer.beta_initializer.__class__, keras.initializers.Zeros) self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) @parameterized.parameters( [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization]) @@ -135,7 +135,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): self.assertEqual(new_layer.beta_initializer.__class__, keras.initializers.Zeros) self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) @parameterized.parameters([rnn_v1.LSTM, rnn_v2.LSTM]) def test_serialize_deserialize_lstm(self, layer): diff --git a/tensorflow/python/keras/regularizers.py 
b/tensorflow/python/keras/regularizers.py index 973d916f7e0..48efd413267 100644 --- a/tensorflow/python/keras/regularizers.py +++ b/tensorflow/python/keras/regularizers.py @@ -14,13 +14,14 @@ # ============================================================================== """Built-in regularizers. """ +# pylint: disable=invalid-name from __future__ import absolute_import from __future__ import division from __future__ import print_function import six -from tensorflow.python.keras import backend as K +from tensorflow.python.keras import backend from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object from tensorflow.python.keras.utils.generic_utils import serialize_keras_object from tensorflow.python.ops import math_ops @@ -60,8 +61,8 @@ class Regularizer(object): >>> layer = tf.keras.layers.Dense( ... 5, input_dim=5, ... kernel_initializer='ones', - ... kernel_regularizer=tf.keras.regularizers.l1(0.01), - ... activity_regularizer=tf.keras.regularizers.l2(0.01)) + ... kernel_regularizer=tf.keras.regularizers.L1(0.01), + ... activity_regularizer=tf.keras.regularizers.L2(0.01)) >>> tensor = tf.ones(shape=(5, 5)) * 2.0 >>> out = layer(tensor) @@ -73,9 +74,9 @@ class Regularizer(object): ## Available penalties ```python - tf.keras.regularizers.l1(0.3) # L1 Regularization Penalty - tf.keras.regularizers.l2(0.1) # L2 Regularization Penalty - tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01) # L1 + L2 penalties + tf.keras.regularizers.L1(0.3) # L1 Regularization Penalty + tf.keras.regularizers.L2(0.1) # L2 Regularization Penalty + tf.keras.regularizers.L1L2(l1=0.01, l2=0.01) # L1 + L2 penalties ``` ## Directly calling a regularizer @@ -84,7 +85,7 @@ class Regularizer(object): as if it is a one-argument function. E.g. - >>> regularizer = tf.keras.regularizers.l2(2.) + >>> regularizer = tf.keras.regularizers.L2(2.) >>> tensor = tf.ones(shape=(5, 5)) >>> regularizer(tensor) @@ -194,7 +195,7 @@ class Regularizer(object): @keras_export('keras.regularizers.L1L2') class L1L2(Regularizer): - r"""A regularizer that applies both L1 and L2 regularization penalties. + """A regularizer that applies both L1 and L2 regularization penalties. The L1 regularization penalty is computed as: `loss = l1 * reduce_sum(abs(x))` @@ -202,19 +203,23 @@ class L1L2(Regularizer): The L2 regularization penalty is computed as `loss = l2 * reduce_sum(square(x))` + L1L2 may be passed to a layer as a string identifier: + + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1_l2') + + In this case, the default values used are `l1=0.01` and `l2=0.01`. + Attributes: l1: Float; L1 regularization factor. l2: Float; L2 regularization factor. """ def __init__(self, l1=0., l2=0.): # pylint: disable=redefined-outer-name - self.l1 = K.cast_to_floatx(l1) - self.l2 = K.cast_to_floatx(l2) + self.l1 = backend.cast_to_floatx(l1) + self.l2 = backend.cast_to_floatx(l2) def __call__(self, x): - if not self.l1 and not self.l2: - return K.constant(0.) - regularization = 0. + regularization = backend.constant(0.) if self.l1: regularization += self.l1 * math_ops.reduce_sum(math_ops.abs(x)) if self.l2: @@ -225,39 +230,64 @@ class L1L2(Regularizer): return {'l1': float(self.l1), 'l2': float(self.l2)} -# Aliases. - - -@keras_export('keras.regularizers.l1') -def l1(l=0.01): - r"""Create a regularizer that applies an L1 regularization penalty. +@keras_export('keras.regularizers.L1', 'keras.regularizers.l1') +class L1(Regularizer): + """A regularizer that applies a L1 regularization penalty. 
The L1 regularization penalty is computed as: - `loss = l * reduce_sum(abs(x))` + `loss = l1 * reduce_sum(abs(x))` - Arguments: - l: Float; L1 regularization factor. + L1 may be passed to a layer as a string identifier: - Returns: - An L1 Regularizer with the given regularization factor. + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1') + + In this case, the default value used is `l1=0.01`. + + Attributes: + l1: Float; L1 regularization factor. """ - return L1L2(l1=l) + + def __init__(self, l1=0.01, **kwargs): # pylint: disable=redefined-outer-name + l1 = kwargs.pop('l', l1) # Backwards compatibility + if kwargs: + raise TypeError('Argument(s) not recognized: %s' % (kwargs,)) + self.l1 = backend.cast_to_floatx(l1) + + def __call__(self, x): + return self.l1 * math_ops.reduce_sum(math_ops.abs(x)) + + def get_config(self): + return {'l1': float(self.l1)} -@keras_export('keras.regularizers.l2') -def l2(l=0.01): - r"""Create a regularizer that applies an L2 regularization penalty. +@keras_export('keras.regularizers.L2', 'keras.regularizers.l2') +class L2(Regularizer): + """A regularizer that applies a L2 regularization penalty. The L2 regularization penalty is computed as: - `loss = l * reduce_sum(square(x))` + `loss = l2 * reduce_sum(square(x))` - Arguments: - l: Float; L2 regularization factor. + L2 may be passed to a layer as a string identifier: - Returns: - An L2 Regularizer with the given regularization factor. + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l2') + + In this case, the default value used is `l2=0.01`. + + Attributes: + l2: Float; L2 regularization factor. """ - return L1L2(l2=l) + + def __init__(self, l2=0.01, **kwargs): # pylint: disable=redefined-outer-name + l2 = kwargs.pop('l', l2) # Backwards compatibility + if kwargs: + raise TypeError('Argument(s) not recognized: %s' % (kwargs,)) + self.l2 = backend.cast_to_floatx(l2) + + def __call__(self, x): + return self.l2 * math_ops.reduce_sum(math_ops.square(x)) + + def get_config(self): + return {'l2': float(self.l2)} @keras_export('keras.regularizers.l1_l2') @@ -280,6 +310,11 @@ def l1_l2(l1=0.01, l2=0.01): # pylint: disable=redefined-outer-name return L1L2(l1=l1, l2=l2) +# Deserialization aliases. +l1 = L1 +l2 = L2 + + @keras_export('keras.regularizers.serialize') def serialize(regularizer): return serialize_keras_object(regularizer) @@ -287,6 +322,10 @@ def serialize(regularizer): @keras_export('keras.regularizers.deserialize') def deserialize(config, custom_objects=None): + if config == 'l1_l2': + # Special case necessary since the defaults used for "l1_l2" (string) + # differ from those of the L1L2 class. + return L1L2(l1=0.01, l2=0.01) return deserialize_keras_object( config, module_objects=globals(), @@ -296,18 +335,12 @@ def deserialize(config, custom_objects=None): @keras_export('keras.regularizers.get') def get(identifier): + """Retrieve a regularizer instance from a config or identifier.""" if identifier is None: return None if isinstance(identifier, dict): return deserialize(identifier) elif isinstance(identifier, six.string_types): - identifier = str(identifier) - # We have to special-case functions that return classes. - # TODO(omalleyt): Turn these into classes or class aliases. - special_cases = ['l1', 'l2', 'l1_l2'] - if identifier in special_cases: - # Treat like a class. 
- return deserialize({'class_name': identifier, 'config': {}}) return deserialize(str(identifier)) elif callable(identifier): return identifier diff --git a/tensorflow/python/keras/tests/add_loss_correctness_test.py b/tensorflow/python/keras/tests/add_loss_correctness_test.py index 323a2626c15..a19eec75ffb 100644 --- a/tensorflow/python/keras/tests/add_loss_correctness_test.py +++ b/tensorflow/python/keras/tests/add_loss_correctness_test.py @@ -288,7 +288,7 @@ class TestAddLossCorrectness(keras_parameterized.TestCase): model_layers, input_shape=(10,)) x = np.ones((10, 10), 'float32') - y = np.ones((10, 1), 'float32') + y = np.zeros((10, 1), 'float32') optimizer = RMSPropOptimizer(learning_rate=0.001) model.compile( diff --git a/tensorflow/python/keras/utils/generic_utils_test.py b/tensorflow/python/keras/utils/generic_utils_test.py index 334758871fa..ddaa60c3c24 100644 --- a/tensorflow/python/keras/utils/generic_utils_test.py +++ b/tensorflow/python/keras/utils/generic_utils_test.py @@ -201,7 +201,7 @@ class SerializeKerasObjectTest(test.TestCase): config, custom_objects={'SerializableInt': SerializableInt}) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) self.assertEqual(new_layer.units.__class__, SerializableInt) self.assertEqual(new_layer.units, 3) @@ -253,7 +253,7 @@ class SerializeKerasObjectTest(test.TestCase): self.assertEqual(new_layer.name, 'SerializableNestedInt') self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) self.assertEqual(new_layer.units.__class__, SerializableNestedInt) self.assertEqual(new_layer.units, 3) self.assertEqual(new_layer.units.int_obj.__class__, SerializableInt) @@ -293,7 +293,7 @@ class SerializeKerasObjectTest(test.TestCase): 'SerializableNestedInt': SerializableNestedInt }) self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L1L2) + self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L2) self.assertIsInstance(new_layer.units, SerializableNestedInt) self.assertEqual(new_layer.units, 3) self.assertIs(new_layer.units.fn, serializable_fn) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt new file mode 100644 index 00000000000..5cb133ca85d --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt new file mode 100644 index 00000000000..c5b706d1d2f --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L2" +tf_class { + is_instance: "" + is_instance: "" 
+ is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt new file mode 100644 index 00000000000..eb769a0dc44 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt new file mode 100644 index 00000000000..fda5c76ecd2 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt index bb10d41d704..96a4b193b1b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt @@ -1,13 +1,29 @@ path: "tensorflow.keras.regularizers" tf_module { + member { + name: "L1" + mtype: "" + } member { name: "L1L2" mtype: "" } + member { + name: "L2" + mtype: "" + } member { name: "Regularizer" mtype: "" } + member { + name: "l1" + mtype: "" + } + member { + name: "l2" + mtype: "" + } member_method { name: "deserialize" argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -16,18 +32,10 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "l1" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "l1_l2" argspec: "args=[\'l1\', \'l2\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.01\'], " } - member_method { - name: "l2" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "serialize" argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt new file mode 100644 index 
00000000000..5cb133ca85d --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt new file mode 100644 index 00000000000..c5b706d1d2f --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt new file mode 100644 index 00000000000..eb769a0dc44 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt new file mode 100644 index 00000000000..fda5c76ecd2 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt index bb10d41d704..96a4b193b1b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt @@ -1,13 +1,29 @@ path: "tensorflow.keras.regularizers" tf_module { + member { + name: "L1" + mtype: "" + } member { name: "L1L2" mtype: "" } + member { + name: "L2" + mtype: "" + } member { 
name: "Regularizer" mtype: "" } + member { + name: "l1" + mtype: "" + } + member { + name: "l2" + mtype: "" + } member_method { name: "deserialize" argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -16,18 +32,10 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "l1" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "l1_l2" argspec: "args=[\'l1\', \'l2\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.01\'], " } - member_method { - name: "l2" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "serialize" argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" From c3b61201ebd9edf645cc27c66acc30f1a46b964a Mon Sep 17 00:00:00 2001 From: Yash Katariya Date: Thu, 7 May 2020 19:51:24 -0700 Subject: [PATCH 0171/1533] Try https and see if that works. PiperOrigin-RevId: 310485120 Change-Id: I0a17e6e02142aea6b6386be0e20b66ac586950a4 --- .../lite/g3doc/tutorials/model_maker_image_classification.ipynb | 2 +- .../lite/g3doc/tutorials/model_maker_text_classification.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb b/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb index 64a03f0fc85..ee46795f5c8 100644 --- a/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb +++ b/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb @@ -101,7 +101,7 @@ }, "outputs": [], "source": [ - "!pip install git+git://github.com/tensorflow/examples.git#egg=tensorflow-examples[model_maker]" + "!pip install git+https://github.com/tensorflow/examples.git#egg=tensorflow-examples[model_maker]" ] }, { diff --git a/tensorflow/lite/g3doc/tutorials/model_maker_text_classification.ipynb b/tensorflow/lite/g3doc/tutorials/model_maker_text_classification.ipynb index 4c6a8a72154..8261d6c9e34 100644 --- a/tensorflow/lite/g3doc/tutorials/model_maker_text_classification.ipynb +++ b/tensorflow/lite/g3doc/tutorials/model_maker_text_classification.ipynb @@ -101,7 +101,7 @@ }, "outputs": [], "source": [ - "!pip install git+git://github.com/tensorflow/examples.git#egg=tensorflow-examples[model_maker]" + "!pip install git+https://github.com/tensorflow/examples.git#egg=tensorflow-examples[model_maker]" ] }, { From 310e8548beafd6f2d532626c5e4857e6f89c390f Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Thu, 7 May 2020 20:22:08 -0700 Subject: [PATCH 0172/1533] CalculateOutputShape for concatenation of BHWDC tensors. 
PiperOrigin-RevId: 310488010 Change-Id: Ib8fcdcbcb8c62a858ca3338f358c0d4a7683c141 --- .../lite/delegates/gpu/common/operations.cc | 56 +++++++++++++++++++ .../lite/delegates/gpu/common/operations.h | 6 ++ 2 files changed, 62 insertions(+) diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc index 28ce67b1ce3..3924f91f952 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.cc +++ b/tensorflow/lite/delegates/gpu/common/operations.cc @@ -562,6 +562,62 @@ absl::Status CalculateOutputShape(const std::vector& input, return absl::OkStatus(); } +absl::Status CalculateOutputShape(const std::vector& input, + const ConcatAttributes& attr, + BHWDC* output_shape) { + BHWDC new_shape = input[0]; + switch (attr.axis) { + case Axis::CHANNELS: + for (int i = 1; i < input.size(); ++i) { + if (input[i].h != new_shape.h || input[i].w != new_shape.w || + input[i].d != new_shape.d) { + return absl::InvalidArgumentError( + "Height, Width and Depth must be the same when concatenating " + "by channels axis"); + } + new_shape.c += input[i].c; + } + break; + case Axis::HEIGHT: + for (int i = 1; i < input.size(); ++i) { + if (input[i].w != new_shape.w || input[i].c != new_shape.c || + input[i].d != new_shape.d) { + return absl::InvalidArgumentError( + "Width, Depth and Channels must be the same when concatenating " + "by height axis"); + } + new_shape.h += input[i].h; + } + break; + case Axis::WIDTH: + for (int i = 1; i < input.size(); ++i) { + if (input[i].h != new_shape.h || input[i].c != new_shape.c || + input[i].d != new_shape.d) { + return absl::InvalidArgumentError( + "Height, Depth and Channels must be the same when concatenating " + "by width axis"); + } + new_shape.w += input[i].w; + } + break; + case Axis::DEPTH: + for (int i = 1; i < input.size(); ++i) { + if (input[i].w != new_shape.w || input[i].h != new_shape.h || + input[i].c != new_shape.c) { + return absl::InvalidArgumentError( + "Width, Height and Channels must be the same when concatenating " + "by depth axis"); + } + new_shape.d += input[i].d; + } + break; + default: + return absl::InvalidArgumentError("Invalid axis"); + } + *output_shape = new_shape; + return absl::OkStatus(); +} + Padding2D CalculateSamePadding(const BHWC& input, const Convolution2DAttributes& attr) { return MakeSamePadding(input, attr); diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h index 4eb41dfe1a3..f8bfc77f610 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.h +++ b/tensorflow/lite/delegates/gpu/common/operations.h @@ -206,6 +206,12 @@ absl::Status CalculateOutputShape(const std::vector& input, const ConcatAttributes& attr, BHWC* output_shape); +// @return shape of a tensor after Concat operation is applied to the given +// input. +absl::Status CalculateOutputShape(const std::vector& input, + const ConcatAttributes& attr, + BHWDC* output_shape); + // @return padding for pooling operation to make sure output keep the same shape // as the given input. Padding2D CalculateSamePadding(const BHWC& input, From 392d3a0ab81d16e8815385175e3daa9d023cca67 Mon Sep 17 00:00:00 2001 From: Reed Wanderman-Milne Date: Thu, 7 May 2020 20:34:24 -0700 Subject: [PATCH 0173/1533] Fix enable_v2_dtype_behavior() doctest failure. 
The issue is policy.py has the following in its docstring: ``` >>> # Optionaly set policy back to float32 if any other models use float32 >>> tf.keras.mixed_precision.experimental.set_policy('float32') ``` This sets global state that affects enable_v2_dtype_behavior() if run after policy.py. Specifically, the enable_v2_dtype_behavior() docstring calls disable_v2_dtype_behavior(). When disabled, the default global policy is None instead of float32. However, if the global policy is explicitly set to float32, as in the policy.py docstring, it will still be float32 even when v2 dtype behavior is disabled. To fix, I removed the parts of the docstring that disable V2 behavior. This is an inelegant fix, so I'll try to think of a better solution later. PiperOrigin-RevId: 310489347 Change-Id: I9186016f21874114b1d67aebe47c95e13c354cae --- tensorflow/python/keras/engine/base_layer_utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index 586efda4680..c5e00d8e38e 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -676,16 +676,8 @@ def enable_v2_dtype_behavior(): float32) instead of None. In addition, layers will automatically cast floating-point inputs to the layer's dtype. - >>> tf.compat.v1.keras.layers.disable_v2_dtype_behavior() >>> x = tf.ones((4, 4, 4, 4), dtype='float64') >>> layer = tf.keras.layers.Conv2D(filters=4, kernel_size=2) - >>> print(layer.dtype) # None since V2 behavior is disabled - None - >>> y = layer(x) # Doesn't cast inputs since V2 dtype behavior is disabled - >>> print(y.dtype.name) - float64 - >>> tf.compat.v1.keras.layers.enable_v2_dtype_behavior() - >>> layer = tf.keras.layers.Conv2D(filters=4, kernel_size=2) >>> print(layer.dtype) # float32 since V2 dtype behavior is enabled float32 >>> y = layer(x) # Layer casts inputs since V2 dtype behavior is enabled From 5e8a44c3d2f92711befc890714daabb0e0ebbe2b Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 7 May 2020 20:52:29 -0700 Subject: [PATCH 0174/1533] Internal change PiperOrigin-RevId: 310491314 Change-Id: I05e44ae5a38912dff33539df5a0e220fc19315ee --- .../python/keras/layers/serialization_test.py | 8 +- tensorflow/python/keras/regularizers.py | 115 +++++++----------- .../keras/tests/add_loss_correctness_test.py | 2 +- .../python/keras/utils/generic_utils_test.py | 6 +- .../tensorflow.keras.regularizers.-l1.pbtxt | 18 --- .../tensorflow.keras.regularizers.-l2.pbtxt | 18 --- .../v1/tensorflow.keras.regularizers.l1.pbtxt | 18 --- .../v1/tensorflow.keras.regularizers.l2.pbtxt | 18 --- .../v1/tensorflow.keras.regularizers.pbtxt | 24 ++-- .../tensorflow.keras.regularizers.-l1.pbtxt | 18 --- .../tensorflow.keras.regularizers.-l2.pbtxt | 18 --- .../v2/tensorflow.keras.regularizers.l1.pbtxt | 18 --- .../v2/tensorflow.keras.regularizers.l2.pbtxt | 18 --- .../v2/tensorflow.keras.regularizers.pbtxt | 24 ++-- 14 files changed, 65 insertions(+), 258 deletions(-) delete mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt delete mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt delete mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt delete mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py index 920881c6a3e..b18a0fbd8cc 100644 --- a/tensorflow/python/keras/layers/serialization_test.py +++ b/tensorflow/python/keras/layers/serialization_test.py @@ -53,7 +53,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): new_layer = keras.layers.deserialize(config) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L2) + keras.regularizers.L1L2) if tf2.enabled(): self.assertEqual(new_layer.kernel_initializer.__class__, keras.initializers.OnesV2) @@ -88,7 +88,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): config, custom_objects={'SerializableInt': SerializableInt}) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L2) + keras.regularizers.L1L2) if tf2.enabled(): self.assertEqual(new_layer.kernel_initializer.__class__, keras.initializers.OnesV2) @@ -116,7 +116,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): self.assertEqual(new_layer.beta_initializer.__class__, keras.initializers.Zeros) self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L2) + keras.regularizers.L1L2) @parameterized.parameters( [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization]) @@ -135,7 +135,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): self.assertEqual(new_layer.beta_initializer.__class__, keras.initializers.Zeros) self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L2) + keras.regularizers.L1L2) @parameterized.parameters([rnn_v1.LSTM, rnn_v2.LSTM]) def 
test_serialize_deserialize_lstm(self, layer): diff --git a/tensorflow/python/keras/regularizers.py b/tensorflow/python/keras/regularizers.py index 48efd413267..973d916f7e0 100644 --- a/tensorflow/python/keras/regularizers.py +++ b/tensorflow/python/keras/regularizers.py @@ -14,14 +14,13 @@ # ============================================================================== """Built-in regularizers. """ -# pylint: disable=invalid-name from __future__ import absolute_import from __future__ import division from __future__ import print_function import six -from tensorflow.python.keras import backend +from tensorflow.python.keras import backend as K from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object from tensorflow.python.keras.utils.generic_utils import serialize_keras_object from tensorflow.python.ops import math_ops @@ -61,8 +60,8 @@ class Regularizer(object): >>> layer = tf.keras.layers.Dense( ... 5, input_dim=5, ... kernel_initializer='ones', - ... kernel_regularizer=tf.keras.regularizers.L1(0.01), - ... activity_regularizer=tf.keras.regularizers.L2(0.01)) + ... kernel_regularizer=tf.keras.regularizers.l1(0.01), + ... activity_regularizer=tf.keras.regularizers.l2(0.01)) >>> tensor = tf.ones(shape=(5, 5)) * 2.0 >>> out = layer(tensor) @@ -74,9 +73,9 @@ class Regularizer(object): ## Available penalties ```python - tf.keras.regularizers.L1(0.3) # L1 Regularization Penalty - tf.keras.regularizers.L2(0.1) # L2 Regularization Penalty - tf.keras.regularizers.L1L2(l1=0.01, l2=0.01) # L1 + L2 penalties + tf.keras.regularizers.l1(0.3) # L1 Regularization Penalty + tf.keras.regularizers.l2(0.1) # L2 Regularization Penalty + tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01) # L1 + L2 penalties ``` ## Directly calling a regularizer @@ -85,7 +84,7 @@ class Regularizer(object): as if it is a one-argument function. E.g. - >>> regularizer = tf.keras.regularizers.L2(2.) + >>> regularizer = tf.keras.regularizers.l2(2.) >>> tensor = tf.ones(shape=(5, 5)) >>> regularizer(tensor) @@ -195,7 +194,7 @@ class Regularizer(object): @keras_export('keras.regularizers.L1L2') class L1L2(Regularizer): - """A regularizer that applies both L1 and L2 regularization penalties. + r"""A regularizer that applies both L1 and L2 regularization penalties. The L1 regularization penalty is computed as: `loss = l1 * reduce_sum(abs(x))` @@ -203,23 +202,19 @@ class L1L2(Regularizer): The L2 regularization penalty is computed as `loss = l2 * reduce_sum(square(x))` - L1L2 may be passed to a layer as a string identifier: - - >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1_l2') - - In this case, the default values used are `l1=0.01` and `l2=0.01`. - Attributes: l1: Float; L1 regularization factor. l2: Float; L2 regularization factor. """ def __init__(self, l1=0., l2=0.): # pylint: disable=redefined-outer-name - self.l1 = backend.cast_to_floatx(l1) - self.l2 = backend.cast_to_floatx(l2) + self.l1 = K.cast_to_floatx(l1) + self.l2 = K.cast_to_floatx(l2) def __call__(self, x): - regularization = backend.constant(0.) + if not self.l1 and not self.l2: + return K.constant(0.) + regularization = 0. if self.l1: regularization += self.l1 * math_ops.reduce_sum(math_ops.abs(x)) if self.l2: @@ -230,64 +225,39 @@ class L1L2(Regularizer): return {'l1': float(self.l1), 'l2': float(self.l2)} -@keras_export('keras.regularizers.L1', 'keras.regularizers.l1') -class L1(Regularizer): - """A regularizer that applies a L1 regularization penalty. +# Aliases. 
+ + +@keras_export('keras.regularizers.l1') +def l1(l=0.01): + r"""Create a regularizer that applies an L1 regularization penalty. The L1 regularization penalty is computed as: - `loss = l1 * reduce_sum(abs(x))` + `loss = l * reduce_sum(abs(x))` - L1 may be passed to a layer as a string identifier: + Arguments: + l: Float; L1 regularization factor. - >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1') - - In this case, the default value used is `l1=0.01`. - - Attributes: - l1: Float; L1 regularization factor. + Returns: + An L1 Regularizer with the given regularization factor. """ - - def __init__(self, l1=0.01, **kwargs): # pylint: disable=redefined-outer-name - l1 = kwargs.pop('l', l1) # Backwards compatibility - if kwargs: - raise TypeError('Argument(s) not recognized: %s' % (kwargs,)) - self.l1 = backend.cast_to_floatx(l1) - - def __call__(self, x): - return self.l1 * math_ops.reduce_sum(math_ops.abs(x)) - - def get_config(self): - return {'l1': float(self.l1)} + return L1L2(l1=l) -@keras_export('keras.regularizers.L2', 'keras.regularizers.l2') -class L2(Regularizer): - """A regularizer that applies a L2 regularization penalty. +@keras_export('keras.regularizers.l2') +def l2(l=0.01): + r"""Create a regularizer that applies an L2 regularization penalty. The L2 regularization penalty is computed as: - `loss = l2 * reduce_sum(square(x))` + `loss = l * reduce_sum(square(x))` - L2 may be passed to a layer as a string identifier: + Arguments: + l: Float; L2 regularization factor. - >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l2') - - In this case, the default value used is `l2=0.01`. - - Attributes: - l2: Float; L2 regularization factor. + Returns: + An L2 Regularizer with the given regularization factor. """ - - def __init__(self, l2=0.01, **kwargs): # pylint: disable=redefined-outer-name - l2 = kwargs.pop('l', l2) # Backwards compatibility - if kwargs: - raise TypeError('Argument(s) not recognized: %s' % (kwargs,)) - self.l2 = backend.cast_to_floatx(l2) - - def __call__(self, x): - return self.l2 * math_ops.reduce_sum(math_ops.square(x)) - - def get_config(self): - return {'l2': float(self.l2)} + return L1L2(l2=l) @keras_export('keras.regularizers.l1_l2') @@ -310,11 +280,6 @@ def l1_l2(l1=0.01, l2=0.01): # pylint: disable=redefined-outer-name return L1L2(l1=l1, l2=l2) -# Deserialization aliases. -l1 = L1 -l2 = L2 - - @keras_export('keras.regularizers.serialize') def serialize(regularizer): return serialize_keras_object(regularizer) @@ -322,10 +287,6 @@ def serialize(regularizer): @keras_export('keras.regularizers.deserialize') def deserialize(config, custom_objects=None): - if config == 'l1_l2': - # Special case necessary since the defaults used for "l1_l2" (string) - # differ from those of the L1L2 class. - return L1L2(l1=0.01, l2=0.01) return deserialize_keras_object( config, module_objects=globals(), @@ -335,12 +296,18 @@ def deserialize(config, custom_objects=None): @keras_export('keras.regularizers.get') def get(identifier): - """Retrieve a regularizer instance from a config or identifier.""" if identifier is None: return None if isinstance(identifier, dict): return deserialize(identifier) elif isinstance(identifier, six.string_types): + identifier = str(identifier) + # We have to special-case functions that return classes. + # TODO(omalleyt): Turn these into classes or class aliases. + special_cases = ['l1', 'l2', 'l1_l2'] + if identifier in special_cases: + # Treat like a class. 
+ return deserialize({'class_name': identifier, 'config': {}}) return deserialize(str(identifier)) elif callable(identifier): return identifier diff --git a/tensorflow/python/keras/tests/add_loss_correctness_test.py b/tensorflow/python/keras/tests/add_loss_correctness_test.py index a19eec75ffb..323a2626c15 100644 --- a/tensorflow/python/keras/tests/add_loss_correctness_test.py +++ b/tensorflow/python/keras/tests/add_loss_correctness_test.py @@ -288,7 +288,7 @@ class TestAddLossCorrectness(keras_parameterized.TestCase): model_layers, input_shape=(10,)) x = np.ones((10, 10), 'float32') - y = np.zeros((10, 1), 'float32') + y = np.ones((10, 1), 'float32') optimizer = RMSPropOptimizer(learning_rate=0.001) model.compile( diff --git a/tensorflow/python/keras/utils/generic_utils_test.py b/tensorflow/python/keras/utils/generic_utils_test.py index ddaa60c3c24..334758871fa 100644 --- a/tensorflow/python/keras/utils/generic_utils_test.py +++ b/tensorflow/python/keras/utils/generic_utils_test.py @@ -201,7 +201,7 @@ class SerializeKerasObjectTest(test.TestCase): config, custom_objects={'SerializableInt': SerializableInt}) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L2) + keras.regularizers.L1L2) self.assertEqual(new_layer.units.__class__, SerializableInt) self.assertEqual(new_layer.units, 3) @@ -253,7 +253,7 @@ class SerializeKerasObjectTest(test.TestCase): self.assertEqual(new_layer.name, 'SerializableNestedInt') self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L2) + keras.regularizers.L1L2) self.assertEqual(new_layer.units.__class__, SerializableNestedInt) self.assertEqual(new_layer.units, 3) self.assertEqual(new_layer.units.int_obj.__class__, SerializableInt) @@ -293,7 +293,7 @@ class SerializeKerasObjectTest(test.TestCase): 'SerializableNestedInt': SerializableNestedInt }) self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L2) + self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L1L2) self.assertIsInstance(new_layer.units, SerializableNestedInt) self.assertEqual(new_layer.units, 3) self.assertIs(new_layer.units.fn, serializable_fn) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt deleted file mode 100644 index 5cb133ca85d..00000000000 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt +++ /dev/null @@ -1,18 +0,0 @@ -path: "tensorflow.keras.regularizers.L1" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt deleted file mode 100644 index c5b706d1d2f..00000000000 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt +++ /dev/null @@ -1,18 +0,0 @@ -path: "tensorflow.keras.regularizers.L2" -tf_class { - is_instance: "" - 
is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt deleted file mode 100644 index eb769a0dc44..00000000000 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt +++ /dev/null @@ -1,18 +0,0 @@ -path: "tensorflow.keras.regularizers.l1" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt deleted file mode 100644 index fda5c76ecd2..00000000000 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt +++ /dev/null @@ -1,18 +0,0 @@ -path: "tensorflow.keras.regularizers.l2" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt index 96a4b193b1b..bb10d41d704 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt @@ -1,29 +1,13 @@ path: "tensorflow.keras.regularizers" tf_module { - member { - name: "L1" - mtype: "" - } member { name: "L1L2" mtype: "" } - member { - name: "L2" - mtype: "" - } member { name: "Regularizer" mtype: "" } - member { - name: "l1" - mtype: "" - } - member { - name: "l2" - mtype: "" - } member_method { name: "deserialize" argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -32,10 +16,18 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "l1" + argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " + } member_method { name: "l1_l2" argspec: "args=[\'l1\', \'l2\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.01\'], " } + member_method { + name: "l2" + argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " + } member_method { name: "serialize" argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt deleted file 
mode 100644 index 5cb133ca85d..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt +++ /dev/null @@ -1,18 +0,0 @@ -path: "tensorflow.keras.regularizers.L1" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt deleted file mode 100644 index c5b706d1d2f..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt +++ /dev/null @@ -1,18 +0,0 @@ -path: "tensorflow.keras.regularizers.L2" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt deleted file mode 100644 index eb769a0dc44..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt +++ /dev/null @@ -1,18 +0,0 @@ -path: "tensorflow.keras.regularizers.l1" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt deleted file mode 100644 index fda5c76ecd2..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt +++ /dev/null @@ -1,18 +0,0 @@ -path: "tensorflow.keras.regularizers.l2" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - member_method { - name: "__init__" - argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt index 96a4b193b1b..bb10d41d704 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt @@ -1,29 +1,13 @@ path: "tensorflow.keras.regularizers" tf_module { - member { - name: "L1" - mtype: "" - } member { name: "L1L2" mtype: "" } - member { - name: "L2" 
- mtype: "" - } member { name: "Regularizer" mtype: "" } - member { - name: "l1" - mtype: "" - } - member { - name: "l2" - mtype: "" - } member_method { name: "deserialize" argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -32,10 +16,18 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "l1" + argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " + } member_method { name: "l1_l2" argspec: "args=[\'l1\', \'l2\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.01\'], " } + member_method { + name: "l2" + argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " + } member_method { name: "serialize" argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" From fc8be36183799a9946cd8cec64fa993bd7131b18 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Thu, 7 May 2020 21:21:58 -0700 Subject: [PATCH 0175/1533] Add op sanity checks to the following TFLite ops: GreaterOp HardSwishOp LeakyReluOp LessEqualOp LessOp LocalResponseNormalizationOp LogSoftmaxOp LogisticOp MatrixDiagOp MatrixSetDiagOp MaxPool2DOp MaxPoolingWithArgMax2DOp MaxUnpooling2DOp MeanOp NotEqualOp NumericVerifyOp OneHotOp PReluOp PackOp PadOp PadV2Op QConstOp PiperOrigin-RevId: 310494530 Change-Id: Id3da36a47d1febbd2743de6ed5c195a9b2f5ff64 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 213 +++++++++++++------ tensorflow/compiler/mlir/lite/tests/ops.mlir | 29 +-- 2 files changed, 160 insertions(+), 82 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index d103c07b986..966c56a1464 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -247,7 +247,14 @@ class TFL_TFTypesWithSameBits : Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; -class TFL_OperandIsNoneOrHasRankLessThanOrEqualTo : +class TFL_TFOperandTypesWithSameBits : + And<[ + Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # i # ")).isa()">, + CPred<"getElementTypeOrSelf($_op.getOperand(" # i # ")).isUnsignedInteger(" # num # ")">]>, + Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, + CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; + +class TFL_OperandIsNoneOrHasRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", Or<[ CPred<"$_op.getOperand(" # n # ").getType().isa()">, @@ -255,13 +262,13 @@ class TFL_OperandIsNoneOrHasRankLessThanOrEqualTo : CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() <= " # m>]>>; -class TFL_OperandHasRankLessThanOrEqualTo : +class TFL_OperandHasRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", Or<[TFL_OperandIsUnrankedPred, CPred<"$_op.getOperand(" # n # ").getType().cast().getRank() <= " # m>]>>; -class TFL_OperandHasRankGreaterThanOrEqualTo : +class TFL_OperandHasRankAtLeast : PredOpTrait<"operand " # n # " is at least " # m # "-D", Or<[TFL_OperandIsUnrankedPred, CPred<"$_op.getOperand(" # n # @@ -300,6 +307,18 @@ class TFL_TCresVTEtIsSameAsOp : And<[ "quant::QuantizedType::castToStorageType(" "getElementTypeOrSelf($_op.getOperand(" # j # ")))">]>]>]>; +// This is a quantization-aware version of TCresVTEtIsSameAsOp +class TFL_TCopVTEtAreSameAt : Or<[ + TCopVTEtAreSameAt<[i, j]>, + 
TFL_TFOperandTypesWithSameBits, + And<[ + SubstLeaves<"$_self", "getElementTypeOrSelf($_op.getOperand(" # j # "))", + quant_QuantizedType.predicate>, + CPred<"quant::QuantizedType::castToStorageType(" + "getElementTypeOrSelf($_op.getOperand(" # i # "))) == " + "quant::QuantizedType::castToStorageType(" + "getElementTypeOrSelf($_op.getOperand(" # j # ")))">]>]>; + //===----------------------------------------------------------------------===// // TFL op common constraints. //===----------------------------------------------------------------------===// @@ -963,7 +982,11 @@ def TFL_ScatterNdOp : TFL_Op<"scatter_nd", [ // Same type check of lhs and rhs is handled by the ResultsBroadcastableShape trait. def TFL_LessEqualOp : TFL_Op<"less_equal", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Less_equal operator"; let description = [{ @@ -971,8 +994,8 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [ }]; let arguments = ( - ins TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$lhs, - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QI8, QUI8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -985,9 +1008,12 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [ let hasOptions = 0; } -def TFL_LocalResponseNormalizationOp : TFL_Op<"local_response_normalization", - [NoSideEffect]> { - let summary = "Local Response Normalization."; +def TFL_LocalResponseNormalizationOp : TFL_Op<"local_response_normalization", [ + TFL_OperandHasRank<0, 4>, + SameOperandsAndResultShape, + SameOperandsAndResultType, + NoSideEffect]> { + let summary = "Local Response Normalization."; let description = [{ The 4-D `input` tensor is treated as a 3-D array of 1-D vectors (along the last @@ -1004,7 +1030,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag }]; let arguments = (ins - TFL_TensorOf<[F32, QI8, QUI8]>:$input, + TFL_FpTensor:$input, I32Attr:$radius, F32Attr:$bias, F32Attr:$alpha, @@ -1012,7 +1038,7 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag ); let results = (outs - TFL_TensorOf<[F32, QI8, QUI8]>:$output + TFL_FpTensor:$output ); let hasOptions = 1; @@ -1048,7 +1074,7 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ NoSideEffect, TFL_OperandHasAtleastRank<0, 1>, PredOpTrait<"operand and result must have the same element type", - TCresVTEtIsSameAsOp<0, 0>>]> { + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = [{ Returns a tensor with the provided diagonal and everything else padded with zeros. 
}]; @@ -1061,17 +1087,21 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$diagonal + TFL_TensorOf<[F32, I8, I16, I32, I64, TFL_Uint8, QUI8, QI8, TFL_Quint8]>:$diagonal ); let results = (outs - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, TFL_Uint8, QUI8, QI8, TFL_Quint8]>:$output ); let hasOptions = 0; } -def TFL_MatrixSetDiagOp : TFL_Op<"matrix_set_diag", [NoSideEffect]> { +def TFL_MatrixSetDiagOp : TFL_Op<"matrix_set_diag", [ + TFL_OperandHasAtleastRank<0, 2>, + PredOpTrait<"input and result must have the same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect]> { let summary = [{ Returns a batched matrix tensor with new batched diagonal values. }]; @@ -1083,12 +1113,12 @@ innermost matrices. These will be overwritten by the values in `diagonal`. }]; let arguments = (ins - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$input, - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$diagonal + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$input, + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$diagonal ); let results = (outs - TensorOf<[F32, I32, I64, I8, QI8, QI16, QUI8, TFL_Uint8, TFL_Quint8]>:$output + TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QI16, QUI8, TFL_Quint8]>:$result ); let hasOptions = 0; @@ -1206,7 +1236,12 @@ larger than 0. } def TFL_NotEqualOp : TFL_Op<"not_equal", [ - ResultsBroadcastableShape, Commutative, NoSideEffect, NoQuantizableResult]> { + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + BinaryOpSameElementTypeConstraint, + ResultsBroadcastableShape, + Commutative, + NoSideEffect, + NoQuantizableResult]> { let summary = "Not_equal operator"; let description = [{ @@ -1214,8 +1249,8 @@ def TFL_NotEqualOp : TFL_Op<"not_equal", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[I1, F32, I32, I64, QUI8, QI8, TFL_Quint8, TFL_Str]>:$lhs, + TFL_TensorOf<[I1, F32, I32, I64, QUI8, QI8, TFL_Quint8, TFL_Str]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -1284,7 +1319,7 @@ def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", PredOpTrait<"value and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 1>>, TFL_OperandHasRank<0, 1>, - TFL_OperandHasRankGreaterThanOrEqualTo<1, 2> + TFL_OperandHasRankAtLeast<1, 2> ]> { let summary = "Embedding lookup operator"; @@ -1502,7 +1537,11 @@ def TFL_FloorModOp : TFL_Op<"floor_mod", [ } def TFL_GreaterOp : TFL_Op<"greater", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Greater operator"; let description = [{ @@ -1510,10 +1549,10 @@ def TFL_GreaterOp : TFL_Op<"greater", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$rhs); - let results = (outs AnyTensor:$output); + let results = (outs TFL_BoolTensor:$output); let builders = [TFL_ComparisonBinaryBuilder]; @@ -1523,8 +1562,9 @@ def TFL_GreaterOp : TFL_Op<"greater", [ } def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, - SameOperandsAndResultShape, - TFL_GpuTargetOp]> { + SameOperandsAndResultShape, + SameOperandsAndResultType, + TFL_GpuTargetOp]> { let 
summary = "Hardswish activation function."; let description = [{ Computes hard-swish activation function @@ -1563,29 +1603,34 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect, let customOption = "L2NormOptions"; } -def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [NoSideEffect, SameOperandsAndResultType]> { +def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [ + SameOperandsAndResultShape, + NoSideEffect, + SameOperandsAndResultType]> { let summary = "Leaky Relu operator"; - // TODO(jpienaar): Add type restriction. This op is only defined for - // restricted (floating point) types. let description = [{ Element-wise Leaky ReLU operator x -> x >= 0 ? x : (alpha * x) }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$input, // Slope of the activation function at x < 0. F32Attr:$alpha ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$output); let hasOptions = 0b1; } def TFL_LessOp : TFL_Op<"less", [ - ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult]> { + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + NoSideEffect, + NoQuantizableResult]> { let summary = "Less operator"; let description = [{ @@ -1593,8 +1638,8 @@ def TFL_LessOp : TFL_Op<"less", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$lhs, + TFL_TensorOf<[F32, I32, I64, QUI8, QI8, TFL_Quint8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -1655,6 +1700,8 @@ def TFL_LogicalOrOp : TFL_Op<"logical_or", [NoSideEffect]> { def TFL_LogisticOp: TFL_Op<"logistic", [ NoSideEffect, + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultShape, // zero_point = 0 // scale = 1. / (max_value + 1) @@ -1667,9 +1714,9 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ Computes element-wise Sigmoid of input }]; - let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$x); - let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, QUI16]>:$y); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$y); } def TFL_LogOp: TFL_Op<"log", [ @@ -1690,10 +1737,10 @@ def TFL_LogOp: TFL_Op<"log", [ let hasFolder = 1; } -// TODO(b/130643170): Adds some constraint for the input/output element types. 
def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ NoSideEffect, SameOperandsAndResultShape, + SameOperandsAndResultType, // zero_point = max_value // scale = -log_softmax_output_min / (max_value + 1) FixedResultScale>, @@ -1706,9 +1753,9 @@ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ input - log(reduce_sum(exp(input), dim)) }]; - let arguments = (ins AnyTensor:$input); + let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$input); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, TFL_Quint8]>:$output); let hasOptions = 1; } @@ -1727,6 +1774,9 @@ def MaxPoolOperandAndResultConstraints : PredOpTrait<"MaxPool2D operand and " TFL_TCresVTEtIsSameAsOp<0, 0>]>>; def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ + TFL_OperandHasRank<0, 4>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, MaxPoolOperandAndResultConstraints, SameOperandsAndResultsScale, @@ -1741,7 +1791,7 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ }]; let arguments = ( - ins AnyTensor:$input, + ins TFL_TensorOf<[F32, QUI8, QI8, QI16, TFL_Quint8]>:$input, TFL_PaddingAttr:$padding, I32Attr:$stride_w, I32Attr:$stride_h, @@ -1750,7 +1800,7 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ TFL_AFAttr:$fused_activation_function ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8, QI16, TFL_Quint8]>:$output); let hasOptions = 1; @@ -1782,7 +1832,11 @@ def TFL_MaximumOp : TFL_Op<"maximum", [ let hasOptions = 0; } -def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect, TFL_GpuTargetOp]> { +def TFL_MeanOp : TFL_Op<"mean", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + TFL_GpuTargetOp]> { let summary = "Mean operator"; let description = [{ @@ -1794,13 +1848,13 @@ def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect, TFL_GpuTargetOp]> { }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, TFL_TensorOf<[I32, I64]>:$axis, BoolAttr:$keep_dims ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, TFL_Uint8]>:$output); + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Uint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -1821,14 +1875,14 @@ def TFL_OneHotOp : TFL_Op<"one_hot", [NoSideEffect]> { let arguments = (ins TFL_TensorOf<[I32, I64]>:$indices, TFL_I32Tensor:$depth, - TFL_TensorOf<[F32, I32, I64, I1]>:$on_value, - TFL_TensorOf<[F32, I32, I64, I1]>:$off_value, + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$on_value, + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$off_value, I32Attr:$axis ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I1]>:$output + TFL_TensorOf<[F32, I32, I64, I1, I8, UI8]>:$output ); let hasOptions = 1; @@ -2032,7 +2086,11 @@ def TFL_NegOp: TFL_Op<"neg", [NoSideEffect, SameOperandsAndResultType]> { let hasFolder = 1; } -def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { +def TFL_PackOp : TFL_Op<"pack", [ + PredOpTrait<"values and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect, + SameOperandsAndResultsScale]> { let summary = "Packs a list of tensors along a dimension into one tensor"; let description = [{ @@ -2063,14 +2121,14 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { }]; let arguments = (ins - TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$values, + 
TFL_VariadicTensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8, QI16, TFL_Quint8]>:$values, - I32Attr:$values_count, + Confined:$values_count, I32Attr:$axis ); let results = (outs - TFL_TensorOf<[F32, I8, I16, I32, I64, QI8, QUI8, QI16]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QI8, QUI8, QI16, TFL_Quint8]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -2081,8 +2139,11 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect, SameOperandsAndResultsScale]> { } def TFL_PadOp : TFL_Op<"pad", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 4>, TFL_OperandHasRank<1, 2>, TFL_OperandRankEquals1DimOfOperand<0, 1>, TFL_GpuTargetOp]> { @@ -2113,22 +2174,25 @@ def TFL_PadOp : TFL_Op<"pad", [ ``` }]; - let arguments = (ins TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + let arguments = (ins TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32OrI64Tensor:$padding); - let results = (outs TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; } def TFL_PadV2Op : TFL_Op<"padv2", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, NoSideEffect, SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 4>, TFL_OperandHasRank<1, 2>, TFL_OperandHasRank<2, 0>, TFL_OperandRankEquals1DimOfOperand<0, 1>, PredOpTrait<"input and constant value operands must have same element type", - TCopVTEtAreSameAt<[0, 2]>>]> { + TFL_TCopVTEtAreSameAt<0, 2>>]> { let summary = "Padding operator v2"; let description = [{ @@ -2159,11 +2223,11 @@ def TFL_PadV2Op : TFL_Op<"padv2", [ }]; let arguments = ( - ins TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32OrI64Tensor:$padding, - TFL_TensorOf<[F32, I8, I32, I64]>:$constant_values); + TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, TFL_Quint8]>:$constant_values); - let results = (outs TFL_TensorOf<[F32, I8, I32, I64, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, I64, UI8, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; } @@ -2191,9 +2255,22 @@ def TFL_PowOp : TFL_Op<"pow", [ResultsBroadcastableShape, let builders = [TFL_BroadcastableBinaryBuilder]; } -def TFL_PReluOp : TFL_Op<"prelu", [NoSideEffect, - TFL_GpuTargetOp, - SameOperandsAndResultsScale]> { +def TFL_PReluOp : TFL_Op<"prelu", [ + NoSideEffect, + ResultsBroadcastableShape, + TFL_GpuTargetOp, + TFL_OperandHasRankAtMost<0, 4>, + TFL_OperandHasRankAtMost<1, 4>, + BinaryOpSameElementTypeConstraint, + PredOpTrait<"input and output must have the same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + PredOpTrait<"'alpha' should have one less rank than 'input'.", + Or<[TFL_OperandIsUnrankedPred<0>, + TFL_OperandIsUnrankedPred<1>, + CPred<"$_op.getOperand(0).getType().cast().getRank() == " + "$_op.getOperand(1).getType().cast().getRank() " + "+ 1">]>>, + SameOperandsAndResultsScale]> { let summary = "Parameterized Relu operator"; let description = [{ @@ -2206,11 +2283,11 @@ def TFL_PReluOp : TFL_Op<"prelu", [NoSideEffect, }]; let arguments = ( - ins TFL_TensorOf<[F32, QUI8]>:$input, - TFL_TensorOf<[F32, QUI8]>:$alpha + ins TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$input, + TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$alpha ); - let results = (outs TFL_TensorOf<[F32, QUI8]>:$output); + let results = (outs 
TFL_TensorOf<[F32, QI8, QUI8, TFL_Quint8]>:$output); let verifier = [{ return Verify(*this); }]; } @@ -2887,7 +2964,7 @@ def TFL_DepthToSpaceOp: TFL_Op<"depth_to_space", [ SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, - TFL_OperandHasRankLessThanOrEqualTo<0, 4> + TFL_OperandHasRankAtMost<0, 4> ]> { let summary = "DepthToSpace operator"; @@ -3224,7 +3301,7 @@ def TFL_QConstOp : Op:$output); let builders = [OpBuilder< "OpBuilder &, OperationState &state, TypeAttr qtype, Attribute value", @@ -3849,7 +3926,7 @@ def TFL_NumericVerifyOp : Op:$input, + TFL_TensorOf<[QI8, QUI8, QI16, F16, TFL_Quint8]>:$input, TFL_TensorOf<[F32]>:$ref, // Attributes diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index b1d1d81af37..f42e06350e5 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -573,7 +573,7 @@ func @testLogistic(tensor<1x2x3x4x5xf32>) -> tensor<1x2x3x4x5xf32> { // test invalid Logistic input func @testLogisticWithWrongInputType(tensor) -> tensor { ^bb0(%arg0: tensor): - // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of 32-bit float or QI8 type or QUI8 type or QI16 type or QUI16 type values}} + // expected-error @+1 {{'tfl.logistic' op operand #0 must be tensor of 32-bit float or QI8 type or QUI8 type or QI16 type or TFLite quint8 type values, but got 'tensor'}} %0 = "tfl.logistic"(%arg0): (tensor) -> tensor return %0#0 : tensor } @@ -1252,10 +1252,10 @@ func @testOneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, % // ----- -func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xi8> { - // expected-error @+1 {{'tfl.one_hot' op result #0 must be tensor of 32-bit float or 32-bit signless integer or 64-bit signless integer or 1-bit signless integer values}} - %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi8> - return %0 : tensor<*xi8> +func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor<*xi16> { + // expected-error @+1 {{'tfl.one_hot' op result #0 must be tensor of 32-bit float or 32-bit signless integer or 64-bit signless integer or 1-bit signless integer or 8-bit signless integer or 8-bit unsigned integer values, but got 'tensor<*xi16>'}} + %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi16> + return %0 : tensor<*xi16> } // ----- @@ -1489,7 +1489,8 @@ func @testEmbeddingLookupValueAndResultElementTypeTraitFailed(%arg0 : tensor>) -> tensor<1x56x56x192x!quant.uniform> { +func @testWrongQuantizedLocalResponseNormalization(%arg0 : tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> { + // expected-error @+1 {{'tfl.local_response_normalization' op operand #0 must be tensor of 32-bit float values, but got 'tensor<1x56x56x192x!quant.uniform>'}} %0 = "tfl.local_response_normalization"(%arg0) {alpha = 9.99999974E-5 : f32, beta = 5.000000e-01 : f32, bias = 2.000000e+00 : f32, radius = 5 : i32} : (tensor<1x56x56x192x!quant.uniform>) -> tensor<1x56x56x192x!quant.uniform> return %0 : tensor<1x56x56x192x!quant.uniform> } @@ -1523,32 +1524,32 @@ func @testDepthToSpaceInvalidOutputType(%arg0: tensor<1x1x1x4xf32>) -> tensor<1x // ----- -func @testPReluWrongOutputRank(%arg0: tensor<10x10x10x10xf32>, %arg1: 
tensor<1x1x10xf32>) -> tensor<10x10x10xf32> { - // expected-error @+1 {{'input' and 'output' should have the same rank}} - %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<10x10x10x10xf32>, tensor<1x1x10xf32>) -> tensor<10x10x10xf32> - return %0 : tensor<10x10x10xf32> +func @testPReluWrongOutputRank(%arg0: tensor<10x10x10x10xf32>, %arg1: tensor<10x10x10x10xf32>) -> tensor<10x10xf32> { + // expected-error @+1 {{'tfl.prelu' op result type '10x10' not broadcast compatible with broadcasted operands's shapes '10x10x10x10'}} + %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<10x10x10x10xf32>, tensor<10x10x10x10xf32>) -> tensor<10x10xf32> + return %0 : tensor<10x10xf32> } // ----- func @testPReluWrongOutputShape(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<2x3x4xf32>) -> tensor<1x2x3x5xf32> { - // expected-error @+1 {{'input' and 'output' should have the same shape}} + // expected-error @+1 {{'tfl.prelu' op result type '1x2x3x5' not broadcast compatible with broadcasted operands's shapes '1x2x3x4'}} %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<1x2x3x4xf32>, tensor<2x3x4xf32>) -> tensor<1x2x3x5xf32> return %0 : tensor<1x2x3x5xf32> } // ----- -func @testPReluWrongAlphaRank(%arg0: tensor<7x3x2x14xf32>, %arg1: tensor<2x7x3x2x14xf32>) -> tensor<7x3x2x14xf32> { +func @testPReluWrongAlphaRank(%arg0: tensor<7x3x2x14xf32>, %arg1: tensor<7x3x2x14xf32>) -> tensor<7x3x2x14xf32> { // expected-error @+1 {{'alpha' should have one less rank than 'input'.}} - %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<7x3x2x14xf32>, tensor<2x7x3x2x14xf32>) -> tensor<7x3x2x14xf32> + %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<7x3x2x14xf32>, tensor<7x3x2x14xf32>) -> tensor<7x3x2x14xf32> return %0 : tensor<7x3x2x14xf32> } // ----- func @testPReluInvalidBroadcast(%arg0: tensor<15x14x2x14xf32>, %arg1: tensor<1x1x3xf32>) -> tensor<15x14x2x14xf32> { - // expected-error @+1 {{'alpha' is not broadcastable at dimension 2.}} + // expected-error @+1 {{'tfl.prelu' op operands don't have broadcast-compatible shapes}} %0 = "tfl.prelu"(%arg0, %arg1) : (tensor<15x14x2x14xf32>, tensor<1x1x3xf32>) -> tensor<15x14x2x14xf32> return %0 : tensor<15x14x2x14xf32> } From e5c4725babbd67fc3958fd4dfee97c59e273535f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 21:57:33 -0700 Subject: [PATCH 0176/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310498231 Change-Id: I476256e02f1d651ac8b29ba84507170a4afd8ae6 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From b25fb1fe32094b60f5a53ad5f986ad65a9f05919 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Thu, 7 May 2020 22:48:51 -0700 Subject: [PATCH 0177/1533] Enable Keras/RNN model via conversion routes that accept a keras model. 
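For context, the user-facing flow is unchanged; a minimal sketch using the public v2 API (the Sequential model below is a hypothetical placeholder modeled on the updated lite_v2_test.py cases):

    import tensorflow as tf

    # Hypothetical Keras RNN model; the explicit Input layer pins the batch size.
    model = tf.keras.models.Sequential([
        tf.keras.layers.Input(batch_size=1, shape=(10, 10), name='input'),
        tf.keras.layers.LSTM(units=10, return_sequences=True),
    ])

    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    tflite_model = converter.convert()

With this change, from_keras_model returns a TFLiteKerasModelConverterV2 that serializes the model to a temporary SavedModel and converts through the SavedModel route, which is what enables the Keras RNN/LSTM cases above.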
PiperOrigin-RevId: 310505146 Change-Id: Ic873c76a546e98038174f1fa9926a9d31f80e1fe --- tensorflow/lite/build_def.bzl | 1 + tensorflow/lite/python/BUILD | 1 + tensorflow/lite/python/lite.py | 301 ++++++++++++++++++------- tensorflow/lite/python/lite_test.py | 10 +- tensorflow/lite/python/lite_v2_test.py | 14 +- 5 files changed, 236 insertions(+), 91 deletions(-) diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index f6cdb981328..4af4bd4aae8 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -702,6 +702,7 @@ def gen_model_coverage_test(src, model_name, data, failure_type, tags, size = "m "//tensorflow/lite/python:lite", "//tensorflow/python:client_testlib", ] + flex_dep(target_op_sets), + timeout = "long", ) def if_tflite_experimental_runtime(if_eager, if_non_eager, if_none = []): diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index 7248792523e..c1f37c81b7f 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -157,6 +157,7 @@ py_test( name = "lite_v2_test", srcs = ["lite_v2_test.py"], python_version = "PY3", + shard_count = 4, srcs_version = "PY2AND3", tags = [ "no_windows", diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 61daa699f5a..ce59c56a1d0 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -20,6 +20,8 @@ from __future__ import division from __future__ import print_function import enum +import shutil +import tempfile import warnings from absl import logging @@ -413,7 +415,7 @@ class TFLiteConverterBase(object): class TFLiteConverterBaseV2(TFLiteConverterBase): """Converter subclass to share functionality between V2 converters.""" - def _convert(self, graph_def, input_tensors, output_tensors): + def convert(self, graph_def, input_tensors, output_tensors): """Converts a TensorFlow GraphDef based on instance variables. Args: @@ -570,7 +572,115 @@ class TFLiteSavedModelConverterV2(TFLiteConverterBaseV2): graph.get_tensor_by_name(signature_def.outputs[key].name) for key in signature_def.outputs ] - return self._convert(meta_graph.graph_def, input_tensors, output_tensors) + return super(TFLiteSavedModelConverterV2, + self).convert(meta_graph.graph_def, input_tensors, + output_tensors) + + +class TFLiteKerasModelConverterV2(TFLiteConverterBaseV2): + """Converts the given Keras model into TensorFlow Lite model.""" + + def __init__(self, keras_model, trackable_obj=None): + """Constructor for TFLiteConverter. + + Args: + keras_model: tf.Keras.Model. + trackable_obj: tf.AutoTrackable object associated with `funcs`. A + reference to this object needs to be maintained so that Variables do not + get garbage collected since functions have a weak reference to + Variables. This is only required when the tf.AutoTrackable object is not + maintained by the user (e.g. `from_saved_model`). + """ + super(TFLiteKerasModelConverterV2, self).__init__() + self._keras_model = keras_model + self._trackable_obj = trackable_obj + + def convert(self): + """Converts a keras model based on instance variables. + + Returns: + The converted data in serialized format. + + Raises: + ValueError: + Multiple concrete functions are specified. + Input shape is not specified. + Invalid quantization parameters. 
+ """ + temp_dir = tempfile.mkdtemp() + try: + self._keras_model.save(temp_dir, save_format="tf") + self.saved_model_dir = temp_dir + self._saved_model_tags = set([_tag_constants.SERVING]) + self._saved_model_exported_names = [ + _signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + ] + self._parse_saved_model_args() + if self.saved_model_dir: + graph = _ops.Graph() + saved_model = _loader_impl.SavedModelLoader(self.saved_model_dir) + saved_model.load_graph(graph, tags=self._saved_model_tags) + meta_graph = saved_model.get_meta_graph_def_from_tags( + self._saved_model_tags) + signature_def = meta_graph.signature_def[ + _signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] + input_tensors = [ + graph.get_tensor_by_name(signature_def.inputs[key].name) + for key in signature_def.inputs + ] + output_tensors = [ + graph.get_tensor_by_name(signature_def.outputs[key].name) + for key in signature_def.outputs + ] + self._trackable_obj = _load(self.saved_model_dir, + self._saved_model_tags) + return super(TFLiteKerasModelConverterV2, + self).convert(meta_graph.graph_def, input_tensors, + output_tensors) + finally: + shutil.rmtree(temp_dir, True) + + input_signature = None + # If the model's call is not a `tf.function`, then we need to first get its + # input signature from `model_input_signature` method. We can't directly + # call `trace_model_call` because otherwise the batch dimension is set + # to None. + # Once we have better support for dynamic shapes, we can remove this. + if not isinstance(self._keras_model.call, _def_function.Function): + # Pass `keep_original_batch_size=True` will ensure that we get an input + # signature including the batch dimension specified by the user. + input_signature = _saving_utils.model_input_signature( + self._keras_model, keep_original_batch_size=True) + + func = _saving_utils.trace_model_call(self._keras_model, input_signature) + concrete_func = func.get_concrete_function() + self._funcs = [concrete_func] + + frozen_func, graph_def = ( + _convert_to_constants.convert_variables_to_constants_v2_as_graph( + self._funcs[0], lower_control_flow=False)) + + input_tensors = [ + tensor for tensor in frozen_func.inputs + if tensor.dtype != _dtypes.resource + ] + output_tensors = frozen_func.outputs + + # Run a Grappler pass. + grappler_config = self._grappler_config() + # Skip running grappler when there are no optimizers to run. If not, + # grappler will run with the default optimizer set and it will lead to + # causing an unexpected behavior. + if grappler_config.graph_options.rewrite_options.optimizers: + graph_def = _run_graph_optimizations( + graph_def, + input_tensors, + output_tensors, + config=grappler_config, + graph=frozen_func.graph) + + return super(TFLiteKerasModelConverterV2, + self).convert(graph_def, input_tensors, output_tensors) class TFLiteFrozenGraphConverterV2(TFLiteConverterBaseV2): @@ -638,7 +748,8 @@ class TFLiteFrozenGraphConverterV2(TFLiteConverterBaseV2): config=grappler_config, graph=frozen_func.graph) - return self._convert(graph_def, input_tensors, output_tensors) + return super(TFLiteFrozenGraphConverterV2, + self).convert(graph_def, input_tensors, output_tensors) @_tf_export("lite.TFLiteConverter", v1=[]) @@ -790,21 +901,7 @@ class TFLiteConverterV2(TFLiteFrozenGraphConverterV2): Returns: TFLiteConverter object. """ - input_signature = None - # If the model's call is not a `tf.function`, then we need to first get its - # input signature from `model_input_signature` method. 
We can't directly - # call `trace_model_call` because otherwise the batch dimension is set - # to None. - # Once we have better support for dynamic shapes, we can remove this. - if not isinstance(model.call, _def_function.Function): - # Pass `keep_original_batch_size=True` will ensure that we get an input - # signature including the batch dimension specified by the user. - input_signature = _saving_utils.model_input_signature( - model, keep_original_batch_size=True) - - func = _saving_utils.trace_model_call(model, input_signature) - concrete_func = func.get_concrete_function() - return cls([concrete_func]) + return TFLiteKerasModelConverterV2(model) # pylint: disable=useless-super-delegation def convert(self): @@ -964,7 +1061,7 @@ class TFLiteConverterBaseV1(TFLiteConverterBase): raise ValueError("std_dev and mean must be defined when inference_type " "or inference_input_type is QUANTIZED_UINT8 or INT8.") - def _convert(self): + def convert(self): """Converts a TensorFlow GraphDef based on instance variables. Returns: @@ -1247,8 +1344,86 @@ class TFLiteSavedModelConverter(TFLiteConverterBaseV1): self._output_tensors = result[2] self._parse_saved_model_args() + +class TFLiteKerasModelConverter(TFLiteConverterBaseV1): + """Converts the given SavedModel into TensorFlow Lite model.""" + + def __init__(self, + model_file, + input_arrays=None, + input_shapes=None, + output_arrays=None, + custom_objects=None): + """Constructor for TFLiteConverter. + + Args: + model_file: Full filepath of HDF5 file containing the tf.keras model. + input_arrays: List of input tensors to freeze graph with. Uses input + arrays from SignatureDef when none are provided. (default None) + input_shapes: Dict of strings representing input tensor names to list of + integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}). + Automatically determined when input shapes is None (e.g., {"foo" : + None}). (default None) + output_arrays: List of output tensors to freeze graph with. Uses output + arrays from SignatureDef when none are provided. (default None) + custom_objects: Dict mapping names (strings) to custom classes or + functions to be considered during model deserialization. (default None) + + Raises: + ValueError: Invalid arguments. + """ + super(TFLiteKerasModelConverter, + self).__init__(experimental_debug_info_func=None) + # Handles Keras when Eager mode is enabled. + if context.executing_eagerly(): + if input_arrays or output_arrays: + raise ValueError("`input_arrays` and `output_arrays` are unsupported " + "with Eager mode. If your model requires any of these " + "parameters, please use disable_eager_execution().") + + _keras.backend.set_learning_phase(False) + keras_model = _keras.models.load_model(model_file, custom_objects) + + function = _saving_utils.trace_model_call(keras_model) + concrete_func = function.get_concrete_function() + + frozen_func = _convert_to_constants.convert_variables_to_constants_v2( + concrete_func, lower_control_flow=False) + _set_tensor_shapes(frozen_func.inputs, input_shapes) + self._keras_model = keras_model + self._graph_def = frozen_func.graph.as_graph_def() + self._input_tensors = frozen_func.inputs + self._output_tensors = frozen_func.outputs + self._debug_info_func = _build_debug_info_func(frozen_func.graph) + return + + # Handles Keras when Eager mode is disabled. 
+ _keras.backend.clear_session() + _keras.backend.set_learning_phase(False) + keras_model = _keras.models.load_model(model_file, custom_objects) + sess = _keras.backend.get_session() + + # Get input and output tensors. + if input_arrays: + input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays) + else: + input_tensors = keras_model.inputs + + if output_arrays: + output_tensors = _get_tensors_from_tensor_names(sess.graph, output_arrays) + else: + output_tensors = keras_model.outputs + _set_tensor_shapes(input_tensors, input_shapes) + + graph_def = _freeze_graph(sess, input_tensors, output_tensors) + self._keras_model = keras_model + self._graph_def = graph_def + self._input_tensors = input_tensors + self._output_tensors = output_tensors + self._debug_info_func = _build_debug_info_func(sess.graph) + def convert(self): - """Converts a TensorFlow GraphDef based on instance variables. + """Converts a Keras model based on instance variables. Returns: The converted data in serialized format. Either a TFLite Flatbuffer or a @@ -1259,7 +1434,28 @@ class TFLiteSavedModelConverter(TFLiteConverterBaseV1): Input shape is not specified. None value for dimension in input_tensor. """ - return self._convert() + temp_dir = tempfile.mkdtemp() + try: + self._keras_model.save(temp_dir, save_format="tf") + tag_set = set([_tag_constants.SERVING]) + signature_key = _signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY + result = _freeze_saved_model(temp_dir, None, None, None, tag_set, + signature_key) + + self.saved_model_dir = temp_dir + self._saved_model_tags = tag_set + self._saved_model_exported_names = [signature_key] + self._parse_saved_model_args() + if self.saved_model_dir: + self._graph_def = result[0] + self._input_tensors = result[1] + self._output_tensors = result[2] + self._debug_info_func = _build_debug_info_func(result[3]) + return super(TFLiteKerasModelConverter, self).convert() + finally: + shutil.rmtree(temp_dir, True) + + return super(TFLiteKerasModelConverter, self).convert() class TFLiteFrozenGraphConverter(TFLiteConverterBaseV1): @@ -1308,20 +1504,6 @@ class TFLiteFrozenGraphConverter(TFLiteConverterBaseV1): self._input_arrays_with_shape = input_arrays_with_shape self._output_arrays = output_arrays - def convert(self): - """Converts a TensorFlow GraphDef based on instance variables. - - Returns: - The converted data in serialized format. Either a TFLite Flatbuffer or a - Graphviz graph depending on value in `output_format`. - - Raises: - ValueError: - Input shape is not specified. - None value for dimension in input_tensor. - """ - return self._convert() - @_tf_export(v1=["lite.TFLiteConverter"]) class TFLiteConverter(TFLiteFrozenGraphConverter): @@ -1649,53 +1831,8 @@ class TFLiteConverter(TFLiteFrozenGraphConverter): Returns: TFLiteConverter class. """ - # Handles Keras when Eager mode is enabled. - if context.executing_eagerly(): - if input_arrays or output_arrays: - raise ValueError("`input_arrays` and `output_arrays` are unsupported " - "with Eager mode. 
If your model requires any of these " - "parameters, please use disable_eager_execution().") - - _keras.backend.set_learning_phase(False) - keras_model = _keras.models.load_model(model_file, custom_objects) - - function = _saving_utils.trace_model_call(keras_model) - concrete_func = function.get_concrete_function() - - frozen_func = _convert_to_constants.convert_variables_to_constants_v2( - concrete_func, lower_control_flow=False) - _set_tensor_shapes(frozen_func.inputs, input_shapes) - return cls( - frozen_func.graph.as_graph_def(), - frozen_func.inputs, - frozen_func.outputs, - experimental_debug_info_func=_build_debug_info_func( - frozen_func.graph)) - - # Handles Keras when Eager mode is disabled. - _keras.backend.clear_session() - _keras.backend.set_learning_phase(False) - keras_model = _keras.models.load_model(model_file, custom_objects) - sess = _keras.backend.get_session() - - # Get input and output tensors. - if input_arrays: - input_tensors = _get_tensors_from_tensor_names(sess.graph, input_arrays) - else: - input_tensors = keras_model.inputs - - if output_arrays: - output_tensors = _get_tensors_from_tensor_names(sess.graph, output_arrays) - else: - output_tensors = keras_model.outputs - _set_tensor_shapes(input_tensors, input_shapes) - - graph_def = _freeze_graph(sess, input_tensors, output_tensors) - return cls( - graph_def, - input_tensors, - output_tensors, - experimental_debug_info_func=_build_debug_info_func(sess.graph)) + return TFLiteKerasModelConverter(model_file, input_arrays, input_shapes, + output_arrays, custom_objects) # pylint: disable=useless-super-delegation def convert(self): diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 530c514eb96..9ddd09edca6 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -1895,7 +1895,7 @@ class FromKerasFile(TestModels, parameterized.TestCase): input_details = interpreter.get_input_details() self.assertLen(input_details, 1) - self.assertEqual('dense_input', input_details[0]['name']) + self.assertEndsWith(input_details[0]['name'], 'dense_input') self.assertEqual(np.float32, input_details[0]['dtype']) self.assertTrue(([1, 3] == input_details[0]['shape']).all()) self.assertEqual((0., 0.), input_details[0]['quantization']) @@ -1990,7 +1990,7 @@ class FromKerasFile(TestModels, parameterized.TestCase): input_details = interpreter.get_input_details() self.assertLen(input_details, 1) - self.assertEqual('dense_input', input_details[0]['name']) + self.assertEndsWith(input_details[0]['name'], 'dense_input') self.assertTrue(([2, 3] == input_details[0]['shape']).all()) def testSequentialModelOutputArray(self): @@ -2109,12 +2109,12 @@ class FromKerasFile(TestModels, parameterized.TestCase): input_details = interpreter.get_input_details() self.assertLen(input_details, 2) - self.assertEqual('input_a', input_details[0]['name']) + self.assertEndsWith(input_details[0]['name'], 'input_a') self.assertEqual(np.float32, input_details[0]['dtype']) self.assertTrue(([1, 3] == input_details[0]['shape']).all()) self.assertEqual((0., 0.), input_details[0]['quantization']) - self.assertEqual('input_b', input_details[1]['name']) + self.assertEndsWith(input_details[1]['name'], 'input_b') self.assertEqual(np.float32, input_details[1]['dtype']) self.assertTrue(([1, 3] == input_details[1]['shape']).all()) self.assertEqual((0., 0.), input_details[1]['quantization']) @@ -2165,7 +2165,7 @@ class FromKerasFile(TestModels, parameterized.TestCase): input_details = 
interpreter.get_input_details() self.assertLen(input_details, 1) - self.assertEqual('dense_input', input_details[0]['name']) + self.assertEndsWith(input_details[0]['name'], 'dense_input') self.assertEqual(np.float32, input_details[0]['dtype']) self.assertTrue(([1, 3] == input_details[0]['shape']).all()) self.assertEqual((0., 0.), input_details[0]['quantization']) diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py index 5470e332b3d..4768892f359 100644 --- a/tensorflow/lite/python/lite_v2_test.py +++ b/tensorflow/lite/python/lite_v2_test.py @@ -213,9 +213,11 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): self.units = units def build(self, input_shape): - self.w = self.add_weight(shape=(input_shape[-1], self.units), - initializer='random_normal', - trainable=True) + self.w = self.add_weight( + 'weight', + shape=(input_shape[-1], self.units), + initializer='random_normal', + trainable=True) self.min_var = self.add_weight( 'min', initializer=tf.keras.initializers.Constant(-6.0), @@ -748,7 +750,10 @@ class ControlFlowTest(lite_v2_test_util.ModelTest): input_data = tf.constant( np.array(np.random.random_sample((1, 10, 10)), dtype=np.float32)) rnn_obj = rnn_layer(units=10, input_shape=(10, 10)) - model = tf.keras.models.Sequential([rnn_obj]) + model = tf.keras.models.Sequential([ + tf.keras.layers.Input(batch_size=1, shape=(10, 10), name='input'), + rnn_obj, + ]) # Convert model. converter = lite.TFLiteConverterV2.from_keras_model(model) @@ -787,6 +792,7 @@ class ControlFlowTest(lite_v2_test_util.ModelTest): input_data = tf.constant( np.array(np.random.random_sample((1, 10, 10)), dtype=np.float32)) model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Input(batch_size=1, shape=(10, 10), name='input')) model.add( tf.keras.layers.Bidirectional( recurrent_v2.LSTM(units=10, return_sequences=True), From 0ea9a77e77e6a94cd2dc2baa84f502affd738970 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 7 May 2020 23:46:49 -0700 Subject: [PATCH 0178/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310510145 Change-Id: I11cac456a5b72142784980e456d85f0bdd890b9d --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 6047d50555e2b00433000f659caadbc00c83d42d Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Fri, 8 May 2020 00:03:02 -0700 Subject: [PATCH 0179/1533] Add i386 to ObjC podspec PiperOrigin-RevId: 310511492 Change-Id: Ie1dbb853d7b4bc055647b1f8b66af181534cfc07 --- .../lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec | 2 +- tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec | 2 +- .../lite/experimental/objc/TensorFlowLiteObjC.podspec.template | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec index 21194bbb455..e039fb57114 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC-nightly.podspec @@ -33,7 +33,7 @@ Pod::Spec.new do |s| 'HEADER_SEARCH_PATHS' => '"${PODS_TARGET_SRCROOT}" ' + '"${PODS_TARGET_SRCROOT}/' + objc_dir + 'apis"', - 'VALID_ARCHS' => 'x86_64 armv7 arm64', + 'VALID_ARCHS' => 'i386 x86_64 armv7 arm64', } s.test_spec 'Tests' do |ts| diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec index 4b04c5e65f2..c673cfad759 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec @@ -33,7 +33,7 @@ Pod::Spec.new do |s| 'HEADER_SEARCH_PATHS' => '"${PODS_TARGET_SRCROOT}" ' + '"${PODS_TARGET_SRCROOT}/' + objc_dir + 'apis"', - 'VALID_ARCHS' => 'x86_64 armv7 arm64', + 'VALID_ARCHS' => 'i386 x86_64 armv7 arm64', } s.test_spec 'Tests' do |ts| diff --git a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template index 47e60f33c47..fc9e10e4a2c 100644 --- a/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template +++ b/tensorflow/lite/experimental/objc/TensorFlowLiteObjC.podspec.template @@ -33,7 +33,7 @@ Pod::Spec.new do |s| 'HEADER_SEARCH_PATHS' => '"${PODS_TARGET_SRCROOT}" ' + '"${PODS_TARGET_SRCROOT}/' + objc_dir + 'apis"', - 'VALID_ARCHS' => 'x86_64 armv7 arm64', + 'VALID_ARCHS' => 'i386 x86_64 armv7 arm64', } s.test_spec 'Tests' do |ts| From 682d67e1fe24746cdc66240cb760f7ba2db5a75e Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 8 May 2020 01:01:32 -0700 Subject: [PATCH 0180/1533] Add a kernel generator tool. The tool takes ops defined in the TF dialect and creates cubin. 
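A rough invocation sketch (flag names are the ones registered in the new tf_to_cubin.cc; the output file name and the TF-dialect input string are hypothetical placeholders):

    tf_to_cubin --arch=75 --tile_sizes=16,64 --output=tanh.cubin '<TF-dialect function for the op>'

--arch takes the bare compute capability (e.g. 75 for sm_75), --tile_sizes defaults to 16,64, and --same_shape/--unroll_factors are optional comma-separated lists. The gen_kernel_image_hdr macro in the new build_defs.bzl drives this binary once per configured GPU architecture and packs the resulting cubins into a fatbin-backed C header via fatbinary and bin2c.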
PiperOrigin-RevId: 310517625 Change-Id: I9cfe0d69eee9bf5c6c72791d109c1da582e72c73 --- .../compiler/mlir/tools/kernel_gen/BUILD | 49 ++++ .../mlir/tools/kernel_gen/build_defs.bzl | 96 +++++++ .../mlir/tools/kernel_gen/cubin_creator.cc | 264 ++++++++++++++++++ .../mlir/tools/kernel_gen/cubin_creator.h | 41 +++ .../mlir/tools/kernel_gen/tf_to_cubin.cc | 118 ++++++++ tensorflow/stream_executor/gpu/BUILD | 1 + 6 files changed, 569 insertions(+) create mode 100644 tensorflow/compiler/mlir/tools/kernel_gen/BUILD create mode 100644 tensorflow/compiler/mlir/tools/kernel_gen/build_defs.bzl create mode 100644 tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc create mode 100644 tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h create mode 100644 tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD new file mode 100644 index 00000000000..d4269c336e9 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -0,0 +1,49 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") +load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") + +licenses(["notice"]) + +cc_library( + name = "cubin_creator", + srcs = ["cubin_creator.cc"], + hdrs = ["cubin_creator.h"], + copts = if_cuda(["-DGOOGLE_CUDA=1"]), + deps = [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:GPUDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:TargetNVVMIR", + "@llvm-project//mlir:Transforms", + "//tensorflow/compiler/mlir/xla:hlo", + "//tensorflow/compiler/mlir/xla:lhlo", + "//tensorflow/compiler/mlir/xla:xla_legalize_tf", + "//tensorflow/compiler/mlir/xla:xla_materialize_broadcasts", # buildcleaner: keep + "//tensorflow/compiler/mlir/xla:xla_unfuse_batch_norm", # buildcleaner: keep + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service/gpu:stream_executor_util", + "//tensorflow/compiler/xla/service/gpu:target_constants", + "//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend", + "//tensorflow/compiler/xla/service/mlir_gpu:kernel_lowering", + "//tensorflow/core:cuda_libdevice_path", + "//tensorflow/core:lib", + ] + if_cuda(["//tensorflow/stream_executor/gpu:asm_compiler"]), +) + +tf_cc_binary( + name = "tf_to_cubin", + srcs = ["tf_to_cubin.cc"], + deps = [ + ":cubin_creator", + "//tensorflow/core:framework_internal", + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ], +) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/build_defs.bzl b/tensorflow/compiler/mlir/tools/kernel_gen/build_defs.bzl new file mode 100644 index 00000000000..cec9968e65b --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/build_defs.bzl @@ -0,0 +1,96 @@ +load("//third_party/gpus/cuda:build_defs.bzl", "cuda_gpu_select_list") + +def _lookup_file(filegroup, path): + """Extracts file at (relative) path in filegroup.""" + for file in filegroup.files.to_list(): + if file.path.endswith(path): + return file + return None + +def _gen_kernel_image_hdr_impl(ctx): + if not ctx.attr.gpu_archs: + fail("No GPU architecture specified, use --config=cuda or similar.") + + name = ctx.attr.name + tile_sizes = ctx.attr.tile_size.replace("x", ",") + same_shape = [] + if 
ctx.attr.same_shape: + same_shape.append("--same_shape=%s" % ctx.attr.same_shape) + + cubins = [] + images = [] + for arch in ctx.attr.gpu_archs: + filename = "%s.%s.cubin" % (name, arch) + cubin = ctx.actions.declare_file(filename) + ctx.actions.run( + outputs = [cubin], + executable = ctx.executable._tool, + arguments = same_shape + [ + "--tile_sizes=%s" % tile_sizes, + "--arch=%s" % arch.split("_")[1], + "--output=%s" % cubin.path, + ctx.attr.op, + ], + mnemonic = "compile", + ) + cubins.append(cubin) + images.append("--image=profile=%s,file=%s" % (arch, cubin.path)) + + # Generate fatbin file from all cubins. + fatbin = ctx.actions.declare_file("%s.fatbin" % name) + ctx.actions.run( + outputs = [fatbin], + inputs = cubins, + executable = _lookup_file(ctx.attr._cuda_root, "bin/fatbinary"), + arguments = [ + "--64", + "--cmdline=--compile-only", + "--link", + "--compress-all", + "--create=%s" % fatbin.path, + ] + images, + mnemonic = "fatbinary", + ) + + bin2c = _lookup_file(ctx.attr._cuda_root, "bin/bin2c") + ctx.actions.run_shell( + outputs = [ctx.outputs.out], + inputs = [fatbin], + tools = [bin2c], + command = "%s --static --const --type=int --name=%s %s 1> %s" % + (bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path), + mnemonic = "bin2c", + ) + +_gen_kernel_image_hdr = rule( + implementation = _gen_kernel_image_hdr_impl, + output_to_genfiles = True, + attrs = { + "op": attr.string(mandatory = True), + "tile_size": attr.string(mandatory = True), + "same_shape": attr.string(), + "out": attr.output(mandatory = True), + "symbol": attr.string(mandatory = True), + "gpu_archs": attr.string_list(mandatory = True), + "_cuda_root": attr.label( + default = Label("//third_party/gpus/cuda:cuda_root"), + ), + "_tool": attr.label( + executable = True, + default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_cubin"), + cfg = "host", + ), + }, +) + +def gen_kernel_image_hdr(name, op, tile_size, same_shape = None): + """Generates a C header with fatbin data from a Tensorflow op.""" + _gen_kernel_image_hdr( + name = name, + op = op, + tile_size = tile_size, + same_shape = same_shape, + out = "include/tfrt/gpu/ops/tf/%s.h" % name, + symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""), + gpu_archs = cuda_gpu_select_list("sm_{}"), + ) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc new file mode 100644 index 00000000000..46af4e4c94c --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -0,0 +1,264 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +//===- cubin_creator.cc -----------------------------------------*- C++ -*-===// +// +// This file implements the function to compile a TF kernel function to a cubin. 
+// +//===----------------------------------------------------------------------===// +#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" + +#include +#include +#include + +#include "absl/memory/memory.h" +#include "absl/strings/escaping.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Debug.h" +#include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Target/NVVMIR.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" +#include "tensorflow/compiler/xla/service/gpu/target_constants.h" +#include "tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.h" +#include "tensorflow/core/platform/cuda_libdevice_path.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/path.h" +#if GOOGLE_CUDA +#include "tensorflow/stream_executor/gpu/asm_compiler.h" +#endif + +namespace { +using tensorflow::Status; +using xla::InternalError; +using xla::StatusOr; + +StatusOr GetLibdeviceDir( + const xla::HloModuleConfig& hlo_module_config) { + for (const string& cuda_root : tensorflow::CandidateCudaRoots( + hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) { + string libdevice_dir = + tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); + VLOG(2) << "Looking for libdevice at " << libdevice_dir; + if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { + VLOG(2) << "Found libdevice dir " << libdevice_dir; + return libdevice_dir; + } + } + return InternalError( + "Can't find libdevice directory ${CUDA_DIR}/nvvm/libdevice"); +} + +struct MaterializeBroadcastsPass + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::ConversionTarget conversionTarget(getContext()); + mlir::OwningRewritePatternList conversionPatterns; + + // Consider the xla_hlo dialect legal for tests. + conversionTarget.addLegalDialect(); + // The conversion uses helpers from the Standard dialect. 
+ conversionTarget.addLegalDialect(); + + mlir::xla_hlo::SetupMaterializeBroadcastsLegality(&getContext(), + &conversionTarget); + mlir::xla_hlo::PopulateMaterializeBroadcastsPatterns(&getContext(), + &conversionPatterns); + + if (failed(applyPartialConversion(getFunction(), conversionTarget, + conversionPatterns))) { + return signalPassFailure(); + } + } +}; + +struct UnfuseBatchNormPass + : public mlir::PassWrapper { + void runOnFunction() override { + mlir::OwningRewritePatternList patterns; + mlir::xla_hlo::PopulateUnfuseBatchNormPatterns(&getContext(), &patterns); + mlir::applyPatternsAndFoldGreedily(getOperation(), patterns); + } +}; + +Status LowerTfOpToLhloWithDynamicShapes(mlir::ModuleOp module) { + mlir::PassManager pm(module.getContext()); + auto enable_if_vlog_is_on = [](mlir::Pass* pass, mlir::Operation* op) { + return VLOG_IS_ON(1); + }; + pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, + /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + /*printModuleScope=*/false, + /*printAfterOnlyOnChange=*/false, llvm::dbgs()); + pm.addNestedPass(mlir::xla_hlo::createLegalizeTFPass(false)); + pm.addNestedPass( + absl::make_unique()); + pm.addNestedPass(absl::make_unique()); + pm.addPass(mlir::xla_hlo::createLegalizeToLhloPass()); + pm.addNestedPass(mlir::xla_lhlo::createLhloCopyRemovalPass()); + + if (failed(pm.run(module))) { + return InternalError("Lowering TF to LHLO failed."); + } + return Status::OK(); +} + +struct PropagateStaticKnowledge + : public mlir::PassWrapper> { + explicit PropagateStaticKnowledge(mlir::FunctionType type, + llvm::ArrayRef same_shape_) + : func_type(type), same_shape(same_shape_) {} + + void runOnOperation() override { + // We know due to tensorflow ABI that the offset is always 0 and that the + // innermost stride is always 1. To make this visible to the compiler, + // we insert constants into the code and replace usages accordingly. + // We do not change the signature so that we keep a somewhat stable ABI + // that is easy to undertand by tools. + mlir::LLVM::LLVMFuncOp func = getOperation(); + mlir::OpBuilder b(func.getBody()); + auto index_type = func.getArgument(3).getType(); + mlir::Value one = b.create( + func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1)); + mlir::Value zero = b.create( + func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0)); + unsigned arg_pos = 0; + std::vector positions; + for (mlir::Type arg_type : func_type.getInputs()) { + positions.push_back(arg_pos); + func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); + arg_pos += 3 + arg_type.cast().getRank() * 2; + func.getArgument(arg_pos - 1).replaceAllUsesWith(one); + } + + // If we have knowledge that some arguments have the same shape, we + // can use that here. Simply replace usages of the shape parameters within + // the function body to a single shape parameter. + if (!same_shape.empty()) { + int first = same_shape.front(); + int first_offset = positions.at(first); + mlir::ShapedType first_type = + func_type.getInput(first).cast(); + unsigned rank = first_type.getRank(); + for (int same : same_shape.drop_front(1)) { + unsigned same_offset = positions.at(same); + auto same_type = func_type.getInput(same).cast(); + if (same_type.getRank() != rank) { + func.emitOpError() << "same shape constraints on arguments with " + "non-matching shapes: #" + << first << " and #" << same; + signalPassFailure(); + } + + for (int i = 0; i < 2 * rank; ++i) { + // Replace uses for second arg data with first arg. 
+ auto same_arg = func.getArgument(same_offset + 3 + i); + auto first_arg = func.getArgument(first_offset + 3 + i); + same_arg.replaceAllUsesWith(first_arg); + } + } + } + } + + mlir::FunctionType func_type; + llvm::ArrayRef same_shape; +}; + +Status PropagateStaticShapeKnowledgeToKernel( + mlir::ModuleOp module, llvm::ArrayRef same_shape) { + // Grab the original signature from the single function. + auto func = *module.getBody()->op_begin(); + + mlir::PassManager pm(module.getContext()); + auto enable_if_vlog_is_on = [](mlir::Pass*, mlir::Operation*) { + return VLOG_IS_ON(1); + }; + pm.enableIRPrinting(/*shouldPrintBeforePass=*/{}, + /*shouldPrintAfterPass=*/enable_if_vlog_is_on, + /*printModuleScope=*/false, + /*printAfterOnlyOnChange=*/false, llvm::dbgs()); + auto& kernel_pm = pm.nest<::mlir::gpu::GPUModuleOp>(); + kernel_pm.addNestedPass( + absl::make_unique(func.getType(), same_shape)); + + if (failed(pm.run(module))) { + return InternalError("Static knowledge propagation failed."); + } + return Status::OK(); +} +} // namespace + +StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( + llvm::StringRef tf_code, std::pair compute_capability, + llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { + mlir::MLIRContext context; + context.allowUnregisteredDialects(); // TODO(b/152572127) + mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); + + TF_RETURN_IF_ERROR(LowerTfOpToLhloWithDynamicShapes(module.get())); + TF_RETURN_IF_ERROR( + xla::mlir_gpu::LowerLHLOToGPU(module.get(), tile_sizes, unroll_factors, + /*collapseParallelLoops=*/false)); + TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); + TF_RETURN_IF_ERROR( + PropagateStaticShapeKnowledgeToKernel(module.get(), same_shape)); + + mlir::OwningModuleRef kernel_module = + xla::mlir_gpu::ExtractKernelModule(*module).ValueOrDie(); + auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); + if (!llvmModule) { + return InternalError("Could not translate MLIR module to NVVM"); + } + + llvmModule->setModuleIdentifier("acme"); + llvmModule->setDataLayout(xla::gpu::nvptx::kDataLayout); + + xla::HloModuleConfig config; + config.set_debug_options(xla::GetDebugOptionsFromFlags()); + + TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config)); + TF_ASSIGN_OR_RETURN(std::string ptx, xla::gpu::nvptx::CompileToPtx( + llvmModule.get(), compute_capability, + config, libdevice_dir)); + VLOG(1) << ptx; + +#if GOOGLE_CUDA + return tensorflow::se::CompileGpuAsm( + std::get<0>(compute_capability), std::get<1>(compute_capability), + ptx.c_str(), xla::gpu::PtxOptsFromConfig(config)); +#else + return InternalError( + "GOOGLE_CUDA not defined. Did you specify --config=cuda ?"); +#endif +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h new file mode 100644 index 00000000000..c8746330c49 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +//===- cubin_creator.h ------------------------------------------*- C++ -*-===// +// +// This file declares the function to compile a TF kernel function to a cubin. +// +//===----------------------------------------------------------------------===// +#ifndef TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace tensorflow { +namespace kernel_gen { +xla::StatusOr> GenerateCubinForTfCode( + llvm::StringRef tf_code, std::pair compute_capability = {7, 5}, + llvm::ArrayRef tile_sizes = {16, 64}, + llvm::ArrayRef same_shape = {}, + llvm::ArrayRef unroll_factors = {}); +} // namespace kernel_gen +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_CUBIN_CREATOR_H_ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc new file mode 100644 index 00000000000..d39edd89e34 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc @@ -0,0 +1,118 @@ +// Copyright 2020 The TensorFlow Runtime Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//===- tf_to_cubin.cc -------------------------------------------*- C++ -*-===// +// +// This file implements the entry point to compile a tf op to a cubin file. +// +//===----------------------------------------------------------------------===// +#include +#include +#include + +#include "absl/strings/numbers.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/command_line_flags.h" + +namespace { +bool ParseStringList(std::string string_list, std::vector* result) { + result->clear(); + uint32 item; + auto items = absl::StrSplit(string_list, ','); + for (const auto& item_str : items) { + if (!absl::SimpleAtoi(item_str, &item)) { + LOG(ERROR) << "Expected token " << item_str << " to be an integer"; + return false; + } + result->push_back(item); + } + return true; +} +} // namespace + +int main(int argc, char** argv) { + std::string output_file = "foo.bin"; + int32 architecture = 50; + std::vector tile_sizes; + std::vector unroll_factors; + std::vector same_shape; + + auto parse_tile_sizes = [&tile_sizes](std::string tile_sizes_str) { + if (!ParseStringList(tile_sizes_str, &tile_sizes)) { + return false; + } + // Initialize with the default. 
+ if (tile_sizes.empty()) { + tile_sizes.push_back(16); + tile_sizes.push_back(64); + } + return true; + }; + + auto parse_unroll_factors = + [&unroll_factors](std::string unroll_factors_str) { + return ParseStringList(unroll_factors_str, &unroll_factors); + }; + + auto parse_same_shape = [&same_shape](std::string same_shape_str) { + return ParseStringList(same_shape_str, &same_shape); + }; + + std::vector flag_list = { + tensorflow::Flag("output", &output_file, "output file"), + tensorflow::Flag("arch", &architecture, + "target architecture (e.g. 50 for sm_50)"), + tensorflow::Flag("tile_sizes", parse_tile_sizes, "16,64", + "tile sizes to use"), + tensorflow::Flag("unroll_factors", parse_unroll_factors, "", + "factors to unroll by, separated by commas"), + tensorflow::Flag("same_shape", parse_same_shape, "", + "arguments with same shape, separated by commas"), + }; + bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); + tensorflow::port::InitMain("usage", &argc, &argv); + if (!parse_ok) { + return 1; + } + + std::pair compute_capability(architecture / 10, + architecture % 10); + + auto cubin = tensorflow::kernel_gen::GenerateCubinForTfCode( + argv[1], compute_capability, tile_sizes, same_shape, unroll_factors); + + if (!cubin.ok()) { + LOG(ERROR) << cubin.status(); + return 1; + } + + std::vector cubin_data = cubin.ConsumeValueOrDie(); + + auto status = tensorflow::WriteStringToFile( + tensorflow::Env::Default(), output_file, + absl::string_view{reinterpret_cast(cubin_data.data()), + cubin_data.size()}); + + if (!status.ok()) { + LOG(ERROR) << status; + return 1; + } + + return 0; +} diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD index 5cb1642083e..8766234e40d 100644 --- a/tensorflow/stream_executor/gpu/BUILD +++ b/tensorflow/stream_executor/gpu/BUILD @@ -222,6 +222,7 @@ cc_library( hdrs = if_gpu_is_configured(["asm_compiler.h"]), copts = tf_copts(), visibility = [ + "//tensorflow/compiler/mlir/tools/kernel_gen:__subpackages__", "//tensorflow/compiler/xla/service/gpu:__subpackages__", "//tensorflow/compiler/xla/service/mlir_gpu:__subpackages__", "//tensorflow/core/kernels:__subpackages__", From 40eb54166ec56e83e381878c543b96dc29c9c952 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 02:03:06 -0700 Subject: [PATCH 0181/1533] Update GraphDef version to 395. PiperOrigin-RevId: 310522784 Change-Id: I0ee8133d77a258bfb43638a76048d2f6ec217a35 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 4df199d935b..915af0fc054 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 394 // Updated: 2020/5/7 +#define TF_GRAPH_DEF_VERSION 395 // Updated: 2020/5/8 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 12c148369a8250427fea28127fb2b48a7b46ff65 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 8 May 2020 02:03:17 -0700 Subject: [PATCH 0182/1533] compat: Update forward compatibility horizon to 2020-05-08 PiperOrigin-RevId: 310522811 Change-Id: Ia3577a7377f6886d6dfb16d0a922bfeaa61dd281 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 51597fb0596..0feb78072e0 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 7) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 8) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 966cba56f3c6b4c6f2b8f9ca19f89de23c0d8fa2 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 8 May 2020 03:06:20 -0700 Subject: [PATCH 0183/1533] Internal BUILD file change. PiperOrigin-RevId: 310528908 Change-Id: Ibb8563e67665ebf5e758056f47ac7a9c8e2bba84 --- tensorflow/compiler/mlir/xla/BUILD | 1 - tensorflow/compiler/xla/BUILD | 1 - tensorflow/stream_executor/gpu/BUILD | 1 - 3 files changed, 3 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index e4309d5eef0..e0e93c3b195 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -23,7 +23,6 @@ package_group( "//tensorflow/compiler/xla/...", "//third_party/iree/...", "//third_party/mlir_edge/...", - "//third_party/tf_runtime/tools/tf_kernel_gen/...", ], ) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 1350f9e3e0b..0193bea9d6d 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -17,7 +17,6 @@ package_group( "//tensorflow/compiler/...", "//tensorflow/python/tpu/...", "//third_party/py/jax/...", - "//third_party/tf_runtime/tools/tf_kernel_gen/...", ], ) diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD index 8766234e40d..9744fc82593 100644 --- a/tensorflow/stream_executor/gpu/BUILD +++ b/tensorflow/stream_executor/gpu/BUILD @@ -227,7 +227,6 @@ cc_library( "//tensorflow/compiler/xla/service/mlir_gpu:__subpackages__", "//tensorflow/core/kernels:__subpackages__", "//tensorflow/stream_executor:__subpackages__", - "//third_party/tf_runtime/tools/tf_kernel_gen:__subpackages__", ], deps = if_gpu_is_configured([ ":gpu_asm_opts", From 00fee732f23b80ea9acfb502dd3a43cc27d0e1a6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 04:34:06 -0700 Subject: [PATCH 0184/1533] [XLA][MLIR] Lower complex operations to std dialect. Lower complex operations `ComplexOp`, `RealOp`, and `ImagOp` to their equivalents in the standard dialect, `CreateComplexOp`, `ReOp`, and `ImOp`. 
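As a minimal sketch of the resulting lowering (based on the lhlo-legalize-to-linalg.mlir tests added in this change; types abbreviated), an element-wise op such as

    "xla_lhlo.imag"(%cplx, %imag) : (memref<2x2xcomplex<f32>>, memref<2x2xf32>) -> ()

is converted to a linalg.generic whose scalar body uses the standard dialect equivalent, e.g. `%0 = im %in : complex<f32>`; `real` maps to `re` and `complex` to `create_complex` in the same way.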
PiperOrigin-RevId: 310536400 Change-Id: I5c375af5e183bdc58de34bcc16c10416ea7a4ba7 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 70 +++++++++---------- .../compiler/mlir/xla/ir/hlo_ops_base.td | 18 ++--- tensorflow/compiler/mlir/xla/ir/lhlo_ops.td | 37 +++++----- .../xla/tests/lhlo-legalize-to-linalg.mlir | 42 +++++++++++ .../xla/transforms/map_xla_to_scalar_op.h | 22 ++++++ .../xla/transforms/xla_legalize_to_linalg.cc | 9 ++- 6 files changed, 130 insertions(+), 68 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index dabf03d3c9f..c2eef4a90b2 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -95,6 +95,7 @@ def HLO_CreateTokenOp : HLO_Op<"create_token", [NoSideEffect]> { // XLA unary elementwise op definitions. //===----------------------------------------------------------------------===// // See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions + class HLO_UnaryElementwiseOp traits, Type TensorType>: HLO_Op { @@ -161,6 +162,16 @@ def HLO_Expm1Op: HLO_UnaryElementwiseOp<"exponential_minus_one", def HLO_FloorOp: HLO_UnaryElementwiseOp<"floor", [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_FloorOp; +def HLO_ImagOp: HLO_Op< + "imag", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_ImagOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value val">]; + + let arguments = (ins HLO_ComplexTensor); + let results = (outs HLO_FpTensor); + let hasFolder = 1; +} + def HLO_IsFiniteOp: HLO_UnaryElementwiseOp<"is_finite", [NoSideEffect, SameOperandsAndResultShape], HLO_Tensor>, BASE_HLO_IsFiniteOp { @@ -188,6 +199,16 @@ def HLO_PopulationCountOp: HLO_UnaryElementwiseOp<"popcnt", [NoSideEffect, SameOperandsAndResultType], HLO_IntTensor>, BASE_HLO_PopulationCountOp; +def HLO_RealOp: HLO_Op< + "real", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_RealOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value val">]; + + let arguments = (ins HLO_ComplexTensor); + let results = (outs HLO_FpTensor); + let hasFolder = 1; +} + def HLO_RoundOp: HLO_UnaryElementwiseOp<"round_nearest_afz", [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_RoundOp; @@ -212,47 +233,11 @@ def HLO_TanhOp: HLO_UnaryElementwiseOp<"tanh", [ResultsAreFloatLike, NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor>, BASE_HLO_TanhOp; -//===----------------------------------------------------------------------===// -// XLA complex unary elementwise op definitions. 
-//===----------------------------------------------------------------------===// -// See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions - -def HLO_ComplexOp: HLO_Op<"complex", - [NoSideEffect, SameOperandsElementType, SameOperandsAndResultShape]>, - BASE_HLO_ComplexOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value lhs, Value rhs">]; - - let arguments = (ins HLO_FpTensor:$lhs, HLO_FpTensor:$rhs); - let results = (outs HLO_ComplexTensor); - let hasFolder = 1; -} - -def HLO_ImagOp: HLO_Op< - "imag", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_ImagOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value val">]; - - let arguments = (ins HLO_ComplexTensor); - let results = (outs HLO_FpTensor); - let hasFolder = 1; -} - -def HLO_RealOp: HLO_Op< - "real", [NoSideEffect, SameOperandsAndResultShape]>, BASE_HLO_RealOp { - let builders = [OpBuilder< - "OpBuilder &, OperationState &tblgen_state, Value val">]; - - let arguments = (ins HLO_ComplexTensor); - let results = (outs HLO_FpTensor); - let hasFolder = 1; -} - //===----------------------------------------------------------------------===// // XLA binary elementwise op definitions. //===----------------------------------------------------------------------===// - // See https://www.tensorflow.org/xla/operation_semantics#element-wise_binary_arithmetic_operations + class HLO_BinaryElementwiseOp traits> : HLO_Op { let arguments = (ins @@ -293,6 +278,17 @@ def HLO_AddOp : HLO_BinaryElementwiseOp<"add", def HLO_Atan2Op : HLO_BinaryElementwiseOp<"atan2", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_Atan2Op; +def HLO_ComplexOp: HLO_Op<"complex", + [NoSideEffect, SameOperandsElementType, SameOperandsAndResultShape]>, + BASE_HLO_ComplexOp { + let builders = [OpBuilder< + "OpBuilder &, OperationState &tblgen_state, Value lhs, Value rhs">]; + + let arguments = (ins HLO_FpTensor:$lhs, HLO_FpTensor:$rhs); + let results = (outs HLO_ComplexTensor); + let hasFolder = 1; +} + def HLO_DivOp : HLO_BinaryElementwiseOp<"divide", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_DivOp { } diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index c087ffd1f40..b5de675f13f 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -150,15 +150,6 @@ class BASE_HLO_ClzOp { }]; } -class BASE_HLO_ComplexOp { - string summary = "Complex operator"; - - string description = [{ - Performs element-wise conversion of a pair of real and imaginary values to - a complex value. - }]; -} - class BASE_HLO_ConvertOp { string summary = "Convert operator"; @@ -400,6 +391,15 @@ class BASE_HLO_AddOp { }]; } +class BASE_HLO_ComplexOp { + string summary = "Complex operator"; + + string description = [{ + Performs element-wise conversion of a pair of real and imaginary values to + a complex value. 
+ }]; +} + class BASE_HLO_DivOp { string summary = "Division operator"; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 3abd117f570..d7e838a6f2b 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -92,10 +92,20 @@ def LHLO_CosOp: LHLO_UnaryElementwiseOp<"cosine">, BASE_HLO_CosOp; def LHLO_ExpOp: LHLO_UnaryElementwiseOp<"exponential">, BASE_HLO_ExpOp; +def LHLO_ImagOp: LHLO_Op<"imag", [SameOperandsShape]>, BASE_HLO_ImagOp { + let arguments = (ins Arg:$input, + Arg:$output); +} + def LHLO_LogOp: LHLO_UnaryElementwiseOp<"log">, BASE_HLO_LogOp; def LHLO_NegOp: LHLO_UnaryElementwiseOp<"negate">, BASE_HLO_NegOp; +def LHLO_RealOp: LHLO_Op<"real", [SameOperandsShape]>, BASE_HLO_RealOp { + let arguments = (ins Arg:$input, + Arg:$output); +} + def LHLO_RsqrtOp: LHLO_UnaryElementwiseOp<"rsqrt">, BASE_HLO_RsqrtOp; def LHLO_SqrtOp: LHLO_UnaryElementwiseOp<"sqrt">, BASE_HLO_SqrtOp; @@ -106,27 +116,6 @@ def LHLO_SinOp: LHLO_UnaryElementwiseOp<"sine">, BASE_HLO_SinOp; def LHLO_TanhOp: LHLO_UnaryElementwiseOp<"tanh">, BASE_HLO_TanhOp; -//===----------------------------------------------------------------------===// -// XLA complex unary elementwise op definitions. -//===----------------------------------------------------------------------===// -// See https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions - -def LHLO_ComplexOp: LHLO_Op<"complex", [SameOperandsShape]>, BASE_HLO_ComplexOp { - let arguments = (ins Arg:$lhs, - Arg:$rhs, - Arg:$output); -} - -def LHLO_ImagOp: LHLO_Op<"imag", [SameOperandsShape]>, BASE_HLO_ImagOp { - let arguments = (ins Arg:$input, - Arg:$output); -} - -def LHLO_RealOp: LHLO_Op<"real", [SameOperandsShape]>, BASE_HLO_RealOp { - let arguments = (ins Arg:$input, - Arg:$output); -} - //===----------------------------------------------------------------------===// // XLA binary elementwise op definitions. 
//===----------------------------------------------------------------------===// @@ -144,6 +133,12 @@ class LHLO_BinaryElementwiseOp traits> : def LHLO_AddOp : LHLO_BinaryElementwiseOp<"add", []>, BASE_HLO_AddOp; +def LHLO_ComplexOp: LHLO_Op<"complex", [SameOperandsShape]>, BASE_HLO_ComplexOp { + let arguments = (ins Arg:$lhs, + Arg:$rhs, + Arg:$output); +} + def LHLO_DivOp : LHLO_BinaryElementwiseOp<"divide", []>, BASE_HLO_DivOp; def LHLO_MaxOp : LHLO_BinaryElementwiseOp<"maximum", []>, BASE_HLO_MaxOp; diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index 0fc30ed4901..3605e8e7e4c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -523,6 +523,48 @@ func @tanh(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { // CHECK-NEXT: %[[RESULT:.*]] = tanh %[[OPERAND_IN]] : f32 // CHECK-NEXT: linalg.yield %[[RESULT]] : f32 +// ----- + +// CHECK-LABEL: func @complex +func @complex(%real: memref<2x2xf32>, + %imag: memref<2x2xf32>, + %cplx: memref<2x2xcomplex>) { + "xla_lhlo.complex"(%real, %imag, %cplx) + : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xcomplex>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[RE:.*]]: f32, %[[IM:.*]]: f32, %[[CP:.*]]: complex): +// CHECK-NEXT: %[[RESULT:.*]] = create_complex %[[RE]], %[[IM]] : complex +// CHECK-NEXT: linalg.yield %[[RESULT]] : complex + +// ----- + +// CHECK-LABEL: func @real +func @real(%cplx: memref<2x2xcomplex>, + %real: memref<2x2xf32>) { + "xla_lhlo.real"(%cplx, %real) + : (memref<2x2xcomplex>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex, %[[REAL_OUT:.*]]: f32): +// CHECK-NEXT: %[[REAL:.*]] = re %[[CPLX_IN:.*]] : complex +// CHECK-NEXT: linalg.yield %[[REAL]] : f32 + +// ----- + +// CHECK-LABEL: func @imag +func @imag(%cplx: memref<2x2xcomplex>, + %imag: memref<2x2xf32>) { + "xla_lhlo.imag"(%cplx, %imag) + : (memref<2x2xcomplex>, memref<2x2xf32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex, %[[IMAG_OUT:.*]]: f32): +// CHECK-NEXT: %[[IMAG:.*]] = im %[[CPLX_IN:.*]] : complex +// CHECK-NEXT: linalg.yield %[[IMAG]] : f32 // ----- diff --git a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h index dceb73efb33..982ec4f4593 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h @@ -227,6 +227,28 @@ inline Value MapLhloOpToStdScalarOp( loc, result_types, args, b); } +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, + b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, b); +} + +template <> +inline Value MapLhloOpToStdScalarOp( + Location loc, ArrayRef result_types, ArrayRef args, + OpBuilder* b) { + return MapLhloOpToStdScalarOpImpl{}(loc, result_types, args, b); +} + template <> inline Value MapLhloOpToStdScalarOp( Location loc, ArrayRef result_types, ArrayRef args, diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc 
b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index 9cce6799288..799a20aa693 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -84,7 +84,8 @@ class PointwiseToLinalgConverter : public OpConversionPattern { emitError(loc, "lhlo to linalg conversion expects ranked args"); return failure(); } - if (!argType.getElementType().isSignlessIntOrFloat()) { + auto elemTy = argType.getElementType(); + if (!elemTy.isSignlessIntOrFloat() && !elemTy.template isa()) { return failure(); } @@ -618,17 +619,20 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, // TODO(ataei): Remove this pattern, CopyOp is folded away. PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, @@ -716,16 +720,19 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, + PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, PointwiseToLinalgConverter, From 2f2d1a3a2f4e154017c4ab1aff0a18473ecaa494 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 04:34:34 -0700 Subject: [PATCH 0185/1533] Minor docstring change: delete "normal". PiperOrigin-RevId: 310536445 Change-Id: I11abd556a0775964bde428d4518b066396de14d7 --- tensorflow/python/keras/layers/preprocessing/normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/layers/preprocessing/normalization.py b/tensorflow/python/keras/layers/preprocessing/normalization.py index b087a2101c7..cf9600a63ab 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization.py @@ -41,7 +41,7 @@ _VARIANCE_NAME = 'variance' class Normalization(CombinerPreprocessingLayer): """Feature-wise normalization of the data. - This layer will coerce its inputs into a normal distribution centered around + This layer will coerce its inputs into a distribution centered around 0 with standard deviation 1. It accomplishes this by precomputing the mean and variance of the data, and calling (input-mean)/sqrt(var) at runtime. From 00636f5858515a8cdf32fa2c0b0caf7e45ae99fe Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 8 May 2020 07:39:15 -0700 Subject: [PATCH 0186/1533] [XLA:CPU] Fuse reduce-window I think this was simply an oversight, benchmarks are neutral to positive. 
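As a minimal sketch of what this enables (mirroring the FuseReduceWindow test added in this change), a producer feeding a reduce-window, e.g.

    c = f32[50,60] multiply(a, b)
    r = f32[50,60] reduce-window(c, init), window={size=2x3 pad=0_1x1_1}, to_apply=add

is now emitted as a single loop fusion on the CPU backend rather than materializing the intermediate `multiply`; to support this, the elemental reduce-window emitter takes a generator for the initial value instead of loading it from a materialized operand buffer.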
PiperOrigin-RevId: 310555511 Change-Id: I51676c3a3965b6b02a26b4821cb6c26f356c939c --- .../xla/service/cpu/cpu_instruction_fusion.cc | 1 + .../cpu/cpu_instruction_fusion_test.cc | 28 +++++++++++++++++++ .../xla/service/cpu/elemental_ir_emitter.cc | 3 +- .../compiler/xla/service/cpu/ir_emitter.cc | 7 +++-- .../compiler/xla/service/cpu/ir_emitter.h | 1 + .../cpu/tests/tree_reduction_rewriter_test.cc | 10 +++++-- 6 files changed, 45 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 97e0a518499..4ce30b7d046 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -36,6 +36,7 @@ bool CanBeLoopFused(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kGather || hlo.opcode() == HloOpcode::kIota || hlo.opcode() == HloOpcode::kPad || hlo.opcode() == HloOpcode::kReduce || + hlo.opcode() == HloOpcode::kReduceWindow || hlo.opcode() == HloOpcode::kReshape || hlo.opcode() == HloOpcode::kReverse || hlo.opcode() == HloOpcode::kSlice || diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc index aa1fa09a6dc..1505cfc5b8e 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc @@ -945,6 +945,34 @@ ENTRY main { EXPECT_TRUE(fused_something); EXPECT_THAT(module->entry_computation()->root_instruction(), op::Fusion()); } + +TEST_F(InstructionFusionTest, FuseReduceWindow) { + absl::string_view module_string = R"( +HloModule module + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY main { + a = f32[50,60]{1,0} parameter(0) + b = f32[50,60]{1,0} parameter(1) + c = f32[50,60]{1,0} multiply(a, b) + init = f32[] constant(0) + ROOT r = f32[50,60] reduce-window(c, init), window={size=2x3 pad=0_1x1_1}, + to_apply=add +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN(bool fused_something, + CpuInstructionFusion().Run(module.get())); + EXPECT_TRUE(fused_something); + EXPECT_THAT(module->entry_computation()->root_instruction(), op::Fusion()); +} } // namespace } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index e21ca01c803..43a8368386a 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -125,7 +125,8 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( return [this, hlo, &operand_to_generator](const IrArray::Index& index) { return ir_emitter_->EmitElementalReduceWindow( Cast(hlo), - operand_to_generator.at(hlo->operand(0)), index); + operand_to_generator.at(hlo->operand(0)), + operand_to_generator.at(hlo->operand(1)), index); }; case HloOpcode::kConvolution: return [this, hlo, &operand_to_generator](const IrArray::Index& index) { diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index c19fa779b60..b3c385d7c7f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -705,6 +705,7 @@ llvm::Value* IrEmitter::EmitElementalMap( StatusOr 
IrEmitter::EmitElementalReduceWindow( const HloReduceWindowInstruction* reduce_window, const llvm_ir::ElementGenerator& input_generator, + const llvm_ir::ElementGenerator& initial_value_generator, const llvm_ir::IrArray::Index& index) { const HloInstruction* operand = reduce_window->operand(0); const Window& window = reduce_window->window(); @@ -716,8 +717,10 @@ StatusOr IrEmitter::EmitElementalReduceWindow( llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), "reduce_window_accumulator_address", &b_, MinimumAlignmentForPrimitiveType(operand_element_type)); - Store(Load(GetEmittedValueFor(reduce_window->operand(1))), - accumulator_address); + TF_ASSIGN_OR_RETURN( + llvm::Value* const initial_value, + initial_value_generator(llvm_ir::IrArray::Index(index.GetType()))); + Store(initial_value, accumulator_address); llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_); std::vector window_size; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index cc5aa3f37fc..229fc7f47c5 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -122,6 +122,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, StatusOr EmitElementalReduceWindow( const HloReduceWindowInstruction* reduce_window, const llvm_ir::ElementGenerator& input_generator, + const llvm_ir::ElementGenerator& initial_value_generator, const llvm_ir::IrArray::Index& index); // Emit code to emit the element at `index` for a convolution instruction. StatusOr EmitElementalConvolution( diff --git a/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc b/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc index bcb7da0e6cf..59b40f1a67e 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc @@ -53,12 +53,18 @@ ENTRY main { MatchOptimizedHlo(hlo_text, R"( +; CHECK-LABEL: %fused_computation (param_0.1: f32[32]) -> f32[] { +; CHECK-NEXT: %param_0.1 = f32[32]{0} parameter(0) +; CHECK-NEXT: %zero.1 = f32[] constant(0) +; CHECK-NEXT: %reduce-window.2 = f32[1]{0} reduce-window(%param_0.1, %zero.1), window={size=32 stride=32}, to_apply=%add +; CHECK-NEXT: ROOT %reshape.1 = f32[] reshape(%reduce-window.2) +; CHECK-NEXT: } + ; CHECK-LABEL: ENTRY %main (input: f32[1000]) -> f32[] { ; CHECK-NEXT: %input = f32[1000]{0} parameter(0) ; CHECK-NEXT: %zero = f32[] constant(0) ; CHECK-NEXT: %reduce-window = f32[32]{0} reduce-window(%input, %zero) -; CHECK-NEXT: %reduce-window.1 = f32[1]{0} reduce-window(%reduce-window, %zero), window={size=32 stride=32}, to_apply=%add -; CHECK-NEXT: ROOT %bitcast = f32[] bitcast(%reduce-window.1) +; CHECK-NEXT: ROOT %fusion = f32[] fusion(%reduce-window), kind=kLoop, calls=%fused_computation )"); } From dad62f51c9a20d715ce447851c72f9c510959c83 Mon Sep 17 00:00:00 2001 From: Andrew Cavanaugh Date: Fri, 8 May 2020 10:53:06 -0400 Subject: [PATCH 0187/1533] Update download_and_extract.sh --- tensorflow/lite/micro/tools/make/download_and_extract.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 5a673985cdd..dca6a309583 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -137,7 +137,7 @@ download_and_extract() { exit 1 fi - 
# delete anything after the '?' in a url that might confound f + # delete anything after the '?' in a url that may mask true file extension url=$(echo "${url}" | sed "s/\?.*//") if [[ "${url}" == *gz ]]; then From cd2f4d16282d36c47547ba72d762ad967ce1d024 Mon Sep 17 00:00:00 2001 From: Bas Aarts Date: Fri, 8 May 2020 07:58:05 -0700 Subject: [PATCH 0188/1533] address comments on commit fc58d58923534e461d735a9a8b460d2dc8691ae5 --- tensorflow/stream_executor/cuda/cudart_stub.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc index 1d7a4e378ba..3b9e0f2937b 100644 --- a/tensorflow/stream_executor/cuda/cudart_stub.cc +++ b/tensorflow/stream_executor/cuda/cudart_stub.cc @@ -132,9 +132,9 @@ extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( } extern char CUDARTAPI __cudaInitModule(void **fatCubinHandle) { - using FuncPtr = cudaError_t(CUDARTAPI *)(void **fatCubinHandle); + using FuncPtr = char(CUDARTAPI *)(void **fatCubinHandle); static auto func_ptr = LoadSymbol("__cudaInitModule"); - if (!func_ptr) return GetSymbolNotFoundError(); + if (!func_ptr) return 0; return func_ptr(fatCubinHandle); } From b6c57a6e4b2e17256abeb7ba993e7cb130dd188b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 08:30:25 -0700 Subject: [PATCH 0189/1533] [XLA:CPU] Fuse reduce-window I think this was simply an oversight, benchmarks are neutral to positive. PiperOrigin-RevId: 310561614 Change-Id: Ic26be583cca8e11a5136769ef1eb974858432d25 --- .../xla/service/cpu/cpu_instruction_fusion.cc | 1 - .../cpu/cpu_instruction_fusion_test.cc | 28 ------------------- .../xla/service/cpu/elemental_ir_emitter.cc | 3 +- .../compiler/xla/service/cpu/ir_emitter.cc | 7 ++--- .../compiler/xla/service/cpu/ir_emitter.h | 1 - .../cpu/tests/tree_reduction_rewriter_test.cc | 10 ++----- 6 files changed, 5 insertions(+), 45 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 4ce30b7d046..97e0a518499 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -36,7 +36,6 @@ bool CanBeLoopFused(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kGather || hlo.opcode() == HloOpcode::kIota || hlo.opcode() == HloOpcode::kPad || hlo.opcode() == HloOpcode::kReduce || - hlo.opcode() == HloOpcode::kReduceWindow || hlo.opcode() == HloOpcode::kReshape || hlo.opcode() == HloOpcode::kReverse || hlo.opcode() == HloOpcode::kSlice || diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc index 1505cfc5b8e..aa1fa09a6dc 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc @@ -945,34 +945,6 @@ ENTRY main { EXPECT_TRUE(fused_something); EXPECT_THAT(module->entry_computation()->root_instruction(), op::Fusion()); } - -TEST_F(InstructionFusionTest, FuseReduceWindow) { - absl::string_view module_string = R"( -HloModule module - -add { - lhs = f32[] parameter(0) - rhs = f32[] parameter(1) - ROOT add = f32[] add(lhs, rhs) -} - -ENTRY main { - a = f32[50,60]{1,0} parameter(0) - b = f32[50,60]{1,0} parameter(1) - c = f32[50,60]{1,0} multiply(a, b) - init = f32[] constant(0) - ROOT r = f32[50,60] reduce-window(c, init), 
window={size=2x3 pad=0_1x1_1}, - to_apply=add -} -)"; - - TF_ASSERT_OK_AND_ASSIGN(auto module, - ParseAndReturnVerifiedModule(module_string)); - TF_ASSERT_OK_AND_ASSIGN(bool fused_something, - CpuInstructionFusion().Run(module.get())); - EXPECT_TRUE(fused_something); - EXPECT_THAT(module->entry_computation()->root_instruction(), op::Fusion()); -} } // namespace } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index 43a8368386a..e21ca01c803 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -125,8 +125,7 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( return [this, hlo, &operand_to_generator](const IrArray::Index& index) { return ir_emitter_->EmitElementalReduceWindow( Cast(hlo), - operand_to_generator.at(hlo->operand(0)), - operand_to_generator.at(hlo->operand(1)), index); + operand_to_generator.at(hlo->operand(0)), index); }; case HloOpcode::kConvolution: return [this, hlo, &operand_to_generator](const IrArray::Index& index) { diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index b3c385d7c7f..c19fa779b60 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -705,7 +705,6 @@ llvm::Value* IrEmitter::EmitElementalMap( StatusOr IrEmitter::EmitElementalReduceWindow( const HloReduceWindowInstruction* reduce_window, const llvm_ir::ElementGenerator& input_generator, - const llvm_ir::ElementGenerator& initial_value_generator, const llvm_ir::IrArray::Index& index) { const HloInstruction* operand = reduce_window->operand(0); const Window& window = reduce_window->window(); @@ -717,10 +716,8 @@ StatusOr IrEmitter::EmitElementalReduceWindow( llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), "reduce_window_accumulator_address", &b_, MinimumAlignmentForPrimitiveType(operand_element_type)); - TF_ASSIGN_OR_RETURN( - llvm::Value* const initial_value, - initial_value_generator(llvm_ir::IrArray::Index(index.GetType()))); - Store(initial_value, accumulator_address); + Store(Load(GetEmittedValueFor(reduce_window->operand(1))), + accumulator_address); llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_); std::vector window_size; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 229fc7f47c5..cc5aa3f37fc 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -122,7 +122,6 @@ class IrEmitter : public DfsHloVisitorWithDefault, StatusOr EmitElementalReduceWindow( const HloReduceWindowInstruction* reduce_window, const llvm_ir::ElementGenerator& input_generator, - const llvm_ir::ElementGenerator& initial_value_generator, const llvm_ir::IrArray::Index& index); // Emit code to emit the element at `index` for a convolution instruction. 
StatusOr EmitElementalConvolution( diff --git a/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc b/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc index 59b40f1a67e..bcb7da0e6cf 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/tree_reduction_rewriter_test.cc @@ -53,18 +53,12 @@ ENTRY main { MatchOptimizedHlo(hlo_text, R"( -; CHECK-LABEL: %fused_computation (param_0.1: f32[32]) -> f32[] { -; CHECK-NEXT: %param_0.1 = f32[32]{0} parameter(0) -; CHECK-NEXT: %zero.1 = f32[] constant(0) -; CHECK-NEXT: %reduce-window.2 = f32[1]{0} reduce-window(%param_0.1, %zero.1), window={size=32 stride=32}, to_apply=%add -; CHECK-NEXT: ROOT %reshape.1 = f32[] reshape(%reduce-window.2) -; CHECK-NEXT: } - ; CHECK-LABEL: ENTRY %main (input: f32[1000]) -> f32[] { ; CHECK-NEXT: %input = f32[1000]{0} parameter(0) ; CHECK-NEXT: %zero = f32[] constant(0) ; CHECK-NEXT: %reduce-window = f32[32]{0} reduce-window(%input, %zero) -; CHECK-NEXT: ROOT %fusion = f32[] fusion(%reduce-window), kind=kLoop, calls=%fused_computation +; CHECK-NEXT: %reduce-window.1 = f32[1]{0} reduce-window(%reduce-window, %zero), window={size=32 stride=32}, to_apply=%add +; CHECK-NEXT: ROOT %bitcast = f32[] bitcast(%reduce-window.1) )"); } From 175ec5e02e4f7bc1662c6a6b0bde2c50292ba638 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 8 May 2020 19:01:07 +0300 Subject: [PATCH 0190/1533] arc_mli slicing: Got rid of hand-written MIN/MAX macro --- .../lite/micro/kernels/arc_mli/mli_slicers.cc | 10 +++++----- .../micro/kernels/arc_mli/scratch_buf_mgr.cc | 19 ++++++++++--------- .../micro/kernels/arc_mli/scratch_buffers.cc | 3 +-- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc index 91bae5caa38..11065f00646 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/mli_slicers.cc @@ -15,8 +15,8 @@ limitations under the License. #include "mli_slicers.h" -#define MAX(A,B) (((A) > (B))? (A): (B)) -#define MIN(A,B) (((A) > (B))? (B): (A)) +#include + namespace tflite { namespace ops { @@ -75,11 +75,11 @@ void TensorSlicer::ComputeSubTensor(void) { // begin and end spans the complete input region including padding areas. const int begin = (int)sub_cfg_.offset[sliceDim_] - pad_pre_; // end is clipped to the end of the full input region. this is needed for cases where the last slice is smaller than the rest. 
- const int end = MIN(begin + sub_cfg_.size[sliceDim_] + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); + const int end = std::min(begin + sub_cfg_.size[sliceDim_] + overlap_, full_tensor_->shape[sliceDim_] + pad_post_); // The start coordinate of the subtensor is clipped to zero - cfg_new.offset[sliceDim_] = MAX(begin, 0); + cfg_new.offset[sliceDim_] = std::max(begin, 0); // and the stop coordinate is clipped to the size of the full tensor - const int stop_coord = MIN(end, full_tensor_->shape[sliceDim_]); + const int stop_coord = std::min(end, static_cast(full_tensor_->shape[sliceDim_])); // compute the size of the subtensor cfg_new.size[sliceDim_] = stop_coord - cfg_new.offset[sliceDim_]; diff --git a/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc index d030d04170c..097908e30ab 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc @@ -15,9 +15,10 @@ limitations under the License. #include "tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.h" #include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" + +#include #include -#define MAX(A,B) (((A) > (B))? (A): (B)) -#define MIN(A,B) (((A) > (B))? (B): (A)) + namespace tflite { namespace ops { @@ -242,19 +243,19 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( *out_slice_height = out_height; } else { // First compute how many lines fit into the input tensor, and compute how many output lines can be computed with that. - max_lines_in = MIN(in_height, in->capacity / line_size_in); + max_lines_in = std::min(in_height, static_cast(in->capacity) / line_size_in); if (max_lines_in >= in_height) { max_out_lines_for_input = out_height; } else if (2 * max_lines_in >= in_height) { // in this case only two slices are needed, so both could benefit from padding. take the MIN to get the worst case. - max_out_lines_for_input = (max_lines_in + MIN(padding_top, padding_bot) - kernel_height + 1) / stride_height; + max_out_lines_for_input = (max_lines_in + std::min(padding_top, padding_bot) - kernel_height + 1) / stride_height; } else { max_out_lines_for_input = (max_lines_in - kernel_height + 1) / stride_height; // TODO add padding exceptions and test by makin fit=false; } // Ten compute how many ouput lines fit into the output tensor. - max_lines_out = MIN(out_height, out->capacity / line_size_out); + max_lines_out = std::min(out_height, static_cast(out->capacity) / line_size_out); // the smallest of the two determines the slice height for the output, and the derived sliceheight for the input. - *out_slice_height = MIN(max_out_lines_for_input, max_lines_out); + *out_slice_height = std::min(max_out_lines_for_input, max_lines_out); *in_slice_height = *out_slice_height * stride_height; } @@ -282,11 +283,11 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_weights( *slice_channels = channels; } else { // First compute how many channels fit into the weights tensor - max_ch_weigths = MIN(channels, weights->capacity / ch_size_w); + max_ch_weigths = std::min(channels, static_cast(weights->capacity) / ch_size_w); // Ten compute how many channels fit into the bias tensor. 
- max_ch_bias = MIN(channels, bias->capacity / ch_size_b); + max_ch_bias = std::min(channels, static_cast(bias->capacity) / ch_size_b); // the smallest of the two determines the slice size - *slice_channels = MIN(max_ch_weigths, max_ch_bias); + *slice_channels = std::min(max_ch_weigths, max_ch_bias); } if (*slice_channels > 0) { diff --git a/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc index a770e4ccd66..6b56770f1f7 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.cc @@ -14,9 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/micro/kernels/arc_mli/scratch_buffers.h" + #include -#define MAX(A,B) (((A) > (B))? (A): (B)) -#define MIN(A,B) (((A) > (B))? (B): (A)) namespace tflite { namespace ops { From 57d2ac3f07068e18f98f84b1f8667f919c8a7e5e Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Fri, 8 May 2020 09:08:24 -0700 Subject: [PATCH 0191/1533] Reuse the rendezvous provided by the OpKernelContext for PartitionedCallOp. This will allow send/recv across different tf.functions. PiperOrigin-RevId: 310567623 Change-Id: I64cc5c3c44fa6c64684570e6f20808c6b193172b --- tensorflow/core/kernels/partitioned_function_ops.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc index a85f3f449fd..3045fd050d5 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.cc +++ b/tensorflow/core/kernels/partitioned_function_ops.cc @@ -245,6 +245,7 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle, run_opts.source_device = lib->device() == nullptr ? "" : lib->device()->name(); run_opts.allow_dead_tensors = true; + run_opts.rendezvous = ctx->rendezvous(); std::vector* rets = new std::vector; const string& func_name = func_->name(); From 3345a4083e5e6124b7cdfe308469b3eec8763360 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 09:17:12 -0700 Subject: [PATCH 0192/1533] Fold iOS-specific targets into the portable targets. PiperOrigin-RevId: 310569018 Change-Id: Ic1bc71bbd6dd0f10c6d5442c5a30e587852a1eab --- tensorflow/cc/saved_model/BUILD | 1 - tensorflow/core/BUILD | 58 ++++++++------------------------- 2 files changed, 14 insertions(+), 45 deletions(-) diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 882b4032f76..a20cc9c9945 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -4,7 +4,6 @@ load( "//tensorflow:tensorflow.bzl", "if_android", - "if_ios", "if_mobile", "if_not_mobile", "tf_cc_test", diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index b4bec2a6907..fb57be9d6c7 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1417,54 +1417,12 @@ cc_library( ], ) -# Native library support for iOS applications. 
-# -# bazel build --config=ios_x86_64 \ -# :ios_tensorflow_lib -cc_library( - name = "ios_tensorflow_lib", - srcs = if_ios([ - ":portable_op_registrations_and_gradients", - "//tensorflow/core/kernels:android_core_ops", - "//tensorflow/core/kernels:android_extended_ops", - ]), - copts = tf_copts() + tf_opts_nortti_if_lite_protos() + ["-Os"], - visibility = ["//visibility:public"], - deps = [ - ":portable_tensorflow_lib_lite", - ":protos_all_cc_impl", - "//third_party/eigen3", - "//third_party/fft2d:fft2d_headers", - "@com_google_protobuf//:protobuf", - "@fft2d", - "@gemmlowp", - ], - alwayslink = 1, -) - alias( name = "ios_tensorflow_lib_lite", actual = ":portable_tensorflow_lib_lite", visibility = ["//visibility:public"], ) -cc_library( - name = "ios_tensorflow_test_lib", - testonly = 1, - srcs = if_ios([":android_test_srcs"]), - copts = tf_copts() + ["-Os"], - tags = [ - "manual", - "notap", - ], - visibility = ["//visibility:public"], - deps = [ - ":ios_tensorflow_lib", - "//tensorflow/core/platform/default/build_config:gtest", - "//third_party/eigen3", - ], -) - # Full TensorFlow library with operator support. Use this unless reducing # binary size (by packaging a reduced operator set) is a concern. alias( @@ -1473,10 +1431,16 @@ alias( visibility = ["//visibility:public"], ) +alias( + name = "ios_tensorflow_lib", + actual = ":portable_tensorflow_lib", + visibility = ["//visibility:public"], +) + cc_library( name = "portable_tensorflow_lib", srcs = if_mobile([":portable_op_registrations_and_gradients"]), - copts = tf_copts() + tf_opts_nortti_if_lite_protos(), + copts = tf_copts() + tf_opts_nortti_if_lite_protos() + if_ios(["-Os"]), features = tf_features_nomodules_if_mobile(), tags = [ "manual", @@ -1559,6 +1523,12 @@ alias( visibility = ["//visibility:public"], ) +alias( + name = "ios_tensorflow_test_lib", + actual = ":portable_tensorflow_test_lib", + visibility = ["//visibility:public"], +) + cc_library( name = "portable_tensorflow_test_lib", testonly = 1, @@ -1569,7 +1539,7 @@ cc_library( "//tensorflow/core/framework:android_test_hdrs", "//tensorflow/core/util:android_test_hdrs", ], - copts = tf_copts(android_optimization_level_override = None), + copts = tf_copts(android_optimization_level_override = None) + if_ios(["-Os"]), features = tf_features_nomodules_if_mobile() + tf_opts_nortti_if_lite_protos(), tags = [ "manual", From 56a99d6c28fd0ff238bd834bd32c2d3fc894b86c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 09:28:28 -0700 Subject: [PATCH 0193/1533] configure.py: Simplify bazel version check Use `bazel --version` instead of `bazel --batch version`. `bazel --batch` fails on Windows with a "command line too long" error, if user's environment variable is too large. Example: https://buildkite.com/bazel/bazelisk-plus-incompatible-flags/builds/496#f5fbd8db-7421-43f4-a018-555af9856be4 `bazel --version` can also print the Bazel version without starting Bazel server, it's even faster than `bazel --batch version`. PiperOrigin-RevId: 310570885 Change-Id: Iafc4c90f0ff57610e5f77bee230e81e78d9f1289 --- configure.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configure.py b/configure.py index ac9ed0c4d88..a003265f3c9 100644 --- a/configure.py +++ b/configure.py @@ -479,13 +479,13 @@ def check_bazel_version(min_version, max_version): if which('bazel') is None: print('Cannot find bazel. 
Please install bazel.') sys.exit(0) - curr_version = run_shell( - ['bazel', '--batch', '--bazelrc=/dev/null', 'version']) - for line in curr_version.split('\n'): - if 'Build label: ' in line: - curr_version = line.split('Build label: ')[1] - break + stderr = open(os.devnull, 'wb') + curr_version = run_shell(['bazel', '--version'], + allow_non_zero = True, + stderr = stderr) + if curr_version.startswith('bazel '): + curr_version = curr_version.split('bazel ')[1] min_version_int = convert_version_to_int(min_version) curr_version_int = convert_version_to_int(curr_version) From 4cb881647952314124219f5753edd0fca7b06a74 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Fri, 8 May 2020 09:37:45 -0700 Subject: [PATCH 0194/1533] [Executor] Avoid unnecessary `NodeItem::input_type()` calls in `PrepareInputs()`. We currently unconditionally read the input type of all inputs in order to handle the increasingly rare (and deprecated!) reference-typed input case. This change caches whether there are any ref-typed inputs to a node in a bit in the fixed-length part of a `NodeItem`, and only evaluates the input type if that bit is true. In addition, this change avoids calling `InlinedVector::clear()` before calling `InlinedVector::resize()`. Because we overwrite all of the values in the vector, there is no need to clear it before resizing. In some cases this can avoid a free/malloc of the underlying vector storage. PiperOrigin-RevId: 310572347 Change-Id: Ie5be8eb32fd4eba522f3b661cf9f5099d5263c6f --- tensorflow/core/common_runtime/executor.cc | 24 ++++++++++++-------- tensorflow/core/common_runtime/graph_view.cc | 2 ++ tensorflow/core/common_runtime/graph_view.h | 2 ++ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 1f2a364258f..74de6b28d3f 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -811,16 +811,14 @@ template Status ExecutorState::PrepareInputs( const NodeItem& item, Entry* first_input, TensorValueVec* inputs, AllocatorAttributeVec* input_alloc_attrs, bool* is_input_dead) { - inputs->clear(); inputs->resize(item.num_inputs); - input_alloc_attrs->clear(); input_alloc_attrs->resize(item.num_inputs); *is_input_dead = false; - bool is_merge = item.is_merge; for (int i = 0; i < item.num_inputs; ++i) { - const bool expect_ref = IsRefType(item.input_type(i)); + const bool expect_ref = TF_PREDICT_FALSE(item.is_any_input_ref_typed) && + IsRefType(item.input_type(i)); Entry* entry = first_input + i; (*input_alloc_attrs)[i] = entry->alloc_attr; @@ -830,7 +828,10 @@ Status ExecutorState::PrepareInputs( switch (entry->state) { case Entry::State::NO_VALUE: { // Only merge and transfer nodes can have no-value inputs. 
- if (!is_merge) { + inp->mutex_if_ref = nullptr; + if (item.is_merge) { + inp->tensor = nullptr; + } else { DCHECK(item.is_transfer_node) << item.kernel->name() << " - input " << i; entry->state = Entry::State::HAS_CONST_TENSOR; @@ -846,17 +847,18 @@ Status ExecutorState::PrepareInputs( } case Entry::State::HAS_VALUE: { - if (expect_ref) { + if (TF_PREDICT_FALSE(expect_ref)) { return AttachDef( errors::InvalidArgument(i, "-th input expects a ref type"), item.kernel->def()); } + inp->mutex_if_ref = nullptr; inp->tensor = entry->val.get(); break; } case Entry::State::HAS_CONST_TENSOR: { - if (expect_ref) { + if (TF_PREDICT_FALSE(expect_ref)) { return AttachDef( errors::InvalidArgument(i, "-th input expects a ref type"), item.kernel->def()); @@ -865,6 +867,7 @@ Status ExecutorState::PrepareInputs( // stores a non-const `Tensor*`, and relies on the `OpKernelContext` // accessors making dynamic checks that prevent using an immutable // tensor as a mutable tensor. + inp->mutex_if_ref = nullptr; inp->tensor = const_cast(entry->const_tensor); break; } @@ -872,8 +875,8 @@ Status ExecutorState::PrepareInputs( case Entry::State::HAS_REF_TENSOR: { { tf_shared_lock ml(*entry->ref_tensor.mu); - if (!entry->ref_tensor.tensor->IsInitialized() && - !item.is_initialization_op) { + if (TF_PREDICT_FALSE(!entry->ref_tensor.tensor->IsInitialized() && + !item.is_initialization_op)) { return AttachDef(errors::FailedPrecondition( "Attempting to use uninitialized value ", item.kernel->requested_input(i)), @@ -896,12 +899,13 @@ Status ExecutorState::PrepareInputs( } entry->state = Entry::State::HAS_VALUE; + inp->mutex_if_ref = nullptr; inp->tensor = entry->val.get(); // The dtype of entry->ref_tensor.tensor could have been changed by // another operation that ran after the operation that "produced" it // executed, so re-validate that the type of the dereferenced tensor // matches the expected input type. - if (item.input_type(i) != inp->tensor->dtype()) { + if (TF_PREDICT_FALSE(item.input_type(i) != inp->tensor->dtype())) { return AttachDef( errors::InvalidArgument( i, "-th input expects type ", diff --git a/tensorflow/core/common_runtime/graph_view.cc b/tensorflow/core/common_runtime/graph_view.cc index 7db0781551d..7a63e06814a 100644 --- a/tensorflow/core/common_runtime/graph_view.cc +++ b/tensorflow/core/common_runtime/graph_view.cc @@ -191,9 +191,11 @@ char* GraphView::InitializeNode(char* ptr, const Node* n) { DCHECK_LT(DataType_MAX, 255); // Must fit in uint8 uint8* input_types = item->input_type_base(); + item->is_any_input_ref_typed = false; for (int i = 0; i < num_inputs; i++) { input_types[i] = static_cast(n->input_type(i)); DCHECK_EQ(item->input_type(i), n->input_type(i)); + item->is_any_input_ref_typed |= IsRefType(n->input_type(i)); } // Check ScopedAllocatorAttrs and forward_from. Also assign output_types. diff --git a/tensorflow/core/common_runtime/graph_view.h b/tensorflow/core/common_runtime/graph_view.h index 6d31555ed9a..38eb3e33bcb 100644 --- a/tensorflow/core/common_runtime/graph_view.h +++ b/tensorflow/core/common_runtime/graph_view.h @@ -81,6 +81,8 @@ struct NodeItem { // of any output edge is a // merge or control trigger // node. + bool is_any_input_ref_typed : 1; // True iff any IsRefType(dt) for dt in this + // node's input types. // The kernel for this node. 
OpKernel* kernel = nullptr; From 631d5bc1335041d07f8ed0a1f0ab61c427eeedd2 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 09:41:54 -0700 Subject: [PATCH 0195/1533] Replace ptr_util.h with absl/memory/memory.h in the profiler PiperOrigin-RevId: 310573036 Change-Id: I38e99e02b5c164f21660a80b6336cbe842a9d2a7 --- tensorflow/core/profiler/lib/BUILD | 3 +-- tensorflow/core/profiler/lib/profiler_session.cc | 4 ++-- tensorflow/core/profiler/rpc/BUILD | 3 +-- tensorflow/core/profiler/rpc/profiler_server.cc | 1 - tensorflow/core/profiler/rpc/profiler_service_impl.cc | 4 ++-- 5 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 33486685fb8..b3028c717bf 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -47,12 +47,11 @@ cc_library( deps = [ "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:framework", "//tensorflow/core/platform", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler:profiler_options_proto_cc", - "//tensorflow/core/util:ptr_util", + "@com_google_absl//absl/memory", ] + if_not_android([ ":profiler_utils", "//tensorflow/core/profiler/internal:profiler_factory", diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc index b907f74179c..d7976b85351 100644 --- a/tensorflow/core/profiler/lib/profiler_session.cc +++ b/tensorflow/core/profiler/lib/profiler_session.cc @@ -15,13 +15,13 @@ limitations under the License. #include "tensorflow/core/profiler/lib/profiler_session.h" +#include "absl/memory/memory.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/platform.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/env_var.h" -#include "tensorflow/core/util/ptr_util.h" #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/profiler/internal/profiler_factory.h" @@ -44,7 +44,7 @@ ProfileOptions GetOptions(const ProfileOptions& opts) { /*static*/ std::unique_ptr ProfilerSession::Create( const ProfileOptions& options) { - return WrapUnique(new ProfilerSession(options)); + return absl::WrapUnique(new ProfilerSession(options)); } /*static*/ std::unique_ptr ProfilerSession::Create() { diff --git a/tensorflow/core/profiler/rpc/BUILD b/tensorflow/core/profiler/rpc/BUILD index d8af53fe8f9..b5b631fe8bb 100644 --- a/tensorflow/core/profiler/rpc/BUILD +++ b/tensorflow/core/profiler/rpc/BUILD @@ -14,13 +14,13 @@ cc_library( ["//tensorflow_serving/model_servers:__pkg__"], ), deps = [ - "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/profiler:profiler_service_proto_cc", "//tensorflow/core/profiler/convert:xplane_to_profile_response", "//tensorflow/core/profiler/lib:profiler_session_headers", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", tf_grpc_cc_dependency(), ], @@ -36,7 +36,6 @@ cc_library( ], deps = [ ":profiler_service_impl", - "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/profiler:profiler_service_proto_cc", "@com_google_absl//absl/strings", diff --git a/tensorflow/core/profiler/rpc/profiler_server.cc b/tensorflow/core/profiler/rpc/profiler_server.cc index 
36f0f9efad9..4d2f3c38c65 100644 --- a/tensorflow/core/profiler/rpc/profiler_server.cc +++ b/tensorflow/core/profiler/rpc/profiler_server.cc @@ -22,7 +22,6 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/profiler/rpc/profiler_service_impl.h" -#include "tensorflow/core/util/ptr_util.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc index 8f1be23594a..35aaa26bd1e 100644 --- a/tensorflow/core/profiler/rpc/profiler_service_impl.cc +++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc @@ -17,6 +17,7 @@ limitations under the License. #include "grpcpp/support/status.h" #include "absl/container/flat_hash_set.h" +#include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" @@ -27,7 +28,6 @@ limitations under the License. #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/lib/profiler_session.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" -#include "tensorflow/core/util/ptr_util.h" namespace tensorflow { namespace { @@ -81,7 +81,7 @@ class ProfilerServiceImpl : public grpc::ProfilerService::Service { } // namespace std::unique_ptr CreateProfilerService() { - return MakeUnique(); + return absl::make_unique(); } } // namespace tensorflow From 5909adaa16902adc29a6132798c3559308d3572d Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Fri, 8 May 2020 09:57:47 -0700 Subject: [PATCH 0196/1533] Adding a FromBuffer method to construct tensorflow::cc::Tensors from user provided buffers. This also unblocks adding unit tests to tensorflow::cc::Tensor. PiperOrigin-RevId: 310575934 Change-Id: I2da02d2f26cb3337e3e2c207bf2718c0f6c0b48a --- tensorflow/cc/experimental/base/public/BUILD | 1 + .../cc/experimental/base/public/tensor.h | 70 +++++- tensorflow/cc/experimental/base/tests/BUILD | 21 ++ .../cc/experimental/base/tests/tensor_test.cc | 212 ++++++++++++++++++ 4 files changed, 297 insertions(+), 7 deletions(-) create mode 100644 tensorflow/cc/experimental/base/tests/BUILD create mode 100644 tensorflow/cc/experimental/base/tests/tensor_test.cc diff --git a/tensorflow/cc/experimental/base/public/BUILD b/tensorflow/cc/experimental/base/public/BUILD index 4249d7918c8..93acf1bd319 100644 --- a/tensorflow/cc/experimental/base/public/BUILD +++ b/tensorflow/cc/experimental/base/public/BUILD @@ -57,6 +57,7 @@ cc_library( "tensor.h", ], deps = [ + ":status", "//tensorflow/c:tf_datatype", "//tensorflow/c:tf_tensor", ], diff --git a/tensorflow/cc/experimental/base/public/tensor.h b/tensorflow/cc/experimental/base/public/tensor.h index 1afdbcad50c..26b0e5dc55e 100644 --- a/tensorflow/cc/experimental/base/public/tensor.h +++ b/tensorflow/cc/experimental/base/public/tensor.h @@ -19,10 +19,13 @@ limitations under the License. #include #include +#include #include +#include #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_tensor.h" +#include "tensorflow/cc/experimental/base/public/status.h" namespace tensorflow { namespace cc { @@ -30,19 +33,38 @@ namespace cc { // Tensor represents an n-dimensional array of values. 
 class Tensor {
  public:
-  // TODO(bmzhao): Add a factory function that constructs a Tensor from a char
-  // buffer, with an options struct (to specify the buffer's layout, device?,
-  // whether to create a TFRT or TF tensor, whether we should take ownership of
-  // the memory, etc). This requires extending TF_NewTensor with an options
-  // struct:
-  // https://github.com/tensorflow/tensorflow/blob/3c520614a3c056d56afdc79b59979b9b0087f8b9/tensorflow/c/tf_tensor.h#L77-L80
+  using DeleterCallback = std::function<void(void*, size_t)>;
+
+  // Constructs a Tensor from user provided buffer.
+  //
+  // Params:
+  //  dtype - The dtype of the tensor's data.
+  //  shape - A shape vector, where each element corresponds to the size of
+  //          the tensor's corresponding dimension.
+  //  data - Pointer to a buffer of memory to construct a Tensor out of.
+  //  len - The length (in bytes) of `data`
+  //  deleter - A std::function to be called when the Tensor no longer needs the
+  //            memory in `data`. This can be used to free `data`, or
+  //            perhaps decrement a refcount associated with `data`, etc.
+  //  status - Set to OK on success and an error on failure.
+  // Returns:
+  // If an error occurred, status->ok() will be false, and the returned
+  // Tensor must not be used.
+  // TODO(bmzhao): Add Runtime as an argument to this function so we can swap to
+  // a TFRT backed tensor.
+  // TODO(bmzhao): Add benchmarks on overhead for this function; we can
+  // consider using int64_t* + length rather than vector.
+  static Tensor FromBuffer(TF_DataType dtype, const std::vector<int64_t>& shape,
+                           void* data, size_t len, DeleterCallback deleter,
+                           Status* status);
 
   // TODO(bmzhao): In the case we construct a tensor from non-owned memory,
   // we should offer a way to deep copy the tensor into a new tensor, which
   // owns the underlying memory. This could be a .deepcopy()/clone() method.
 
   // TODO(bmzhao): In the future, we want to relax the non-copyability
-  // constraint. To do so, we can add a C API function that acts like CopyFrom:
+  // constraint. To do so, we can add a C API function that acts like
+  // CopyFrom:
   // https://github.com/tensorflow/tensorflow/blob/08931c1e3e9eb2e26230502d678408e66730826c/tensorflow/core/framework/tensor.h#L301-L311
 
   // Tensor is movable, but not copyable
@@ -85,6 +107,16 @@ class Tensor {
   // This object retains ownership of the pointer.
   TF_Tensor* GetTFTensor() const { return tensor_.get(); }
 
+  struct DeleterStruct {
+    std::function<void(void*, size_t)> deleter;
+  };
+
+  static void DeleterFunction(void* memory, size_t len, void* deleter_struct) {
+    DeleterStruct* deleter = reinterpret_cast<DeleterStruct*>(deleter_struct);
+    deleter->deleter(memory, len);
+    delete deleter;
+  }
+
   struct TFTensorDeleter {
     void operator()(TF_Tensor* p) const { TF_DeleteTensor(p); }
   };
@@ -111,6 +143,30 @@ inline size_t Tensor::num_bytes() const {
   return TF_TensorByteSize(tensor_.get());
 }
 
+inline Tensor Tensor::FromBuffer(TF_DataType dtype,
+                                 const std::vector<int64_t>& shape, void* data,
+                                 size_t len, DeleterCallback deleter,
+                                 Status* status) {
+  // Credit to apassos@ for this technique:
+  // Despite the fact that our API takes a std::function deleter, we are able
+  // to maintain ABI stability because:
+  // 1. Only a function pointer is sent across the C API (&DeleterFunction)
+  // 2. DeleterFunction is defined in the same build artifact that constructed
+  //    the std::function (so there isn't confusion about std::function ABI).
+  // Note that 2. is satisfied by the fact that this is a header-only API, where
+  // the function implementations are inline.
+ + DeleterStruct* deleter_struct = new DeleterStruct{deleter}; + TF_Tensor* tensor = TF_NewTensor(dtype, shape.data(), shape.size(), data, len, + &DeleterFunction, deleter_struct); + if (tensor == nullptr) { + status->SetStatus(TF_INVALID_ARGUMENT, + "Failed to create tensor for input buffer"); + return Tensor(nullptr); + } + return Tensor(tensor); +} + } // namespace cc } // namespace tensorflow diff --git a/tensorflow/cc/experimental/base/tests/BUILD b/tensorflow/cc/experimental/base/tests/BUILD new file mode 100644 index 00000000000..a2b634a70f4 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/BUILD @@ -0,0 +1,21 @@ +# Tests for the C++ header-only base types. +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + licenses = ["notice"], # Apache 2.0 +) + +tf_cc_test( + name = "tensor_test", + srcs = [ + "tensor_test.cc", + ], + deps = [ + "//tensorflow/c:tf_datatype", + "//tensorflow/cc/experimental/base/public:status", + "//tensorflow/cc/experimental/base/public:tensor", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/cc/experimental/base/tests/tensor_test.cc b/tensorflow/cc/experimental/base/tests/tensor_test.cc new file mode 100644 index 00000000000..86a50bac5cd --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensor_test.cc @@ -0,0 +1,212 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/cc/experimental/base/public/tensor.h" + +#include + +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +// Each of the following struct types have two members: a kDType that +// corresponds to a TF_Datatype enum value, and a typedef "type" +// of its corresponding C++ type. 
These types allow us to write Dtype-agnostic
+// tests via GoogleTest's TypedTests:
+// https://github.com/google/googletest/blob/e589a337170554c48bc658cc857cf15080c9eacc/googletest/docs/advanced.md#typed-tests
+struct FloatType {
+  using type = float;
+  static constexpr TF_DataType kDType = TF_FLOAT;
+};
+
+struct DoubleType {
+  using type = double;
+  static constexpr TF_DataType kDType = TF_DOUBLE;
+};
+
+struct Int32Type {
+  using type = int32_t;
+  static constexpr TF_DataType kDType = TF_INT32;
+};
+
+struct UINT8Type {
+  using type = uint8_t;
+  static constexpr TF_DataType kDType = TF_UINT8;
+};
+
+struct INT8Type {
+  using type = int8_t;
+  static constexpr TF_DataType kDType = TF_INT8;
+};
+
+struct INT64Type {
+  using type = int64_t;
+  static constexpr TF_DataType kDType = TF_INT64;
+};
+
+struct UINT16Type {
+  using type = uint16_t;
+  static constexpr TF_DataType kDType = TF_UINT16;
+};
+
+struct UINT32Type {
+  using type = uint32_t;
+  static constexpr TF_DataType kDType = TF_UINT32;
+};
+
+struct UINT64Type {
+  using type = uint64_t;
+  static constexpr TF_DataType kDType = TF_UINT64;
+};
+
+using SimpleTypes =
+    ::testing::Types<FloatType, DoubleType, Int32Type, UINT8Type, INT8Type,
+                     INT64Type, UINT16Type, UINT32Type, UINT64Type>;
+
+template <typename T>
+class ConstructScalarTensorTest : public ::testing::Test {};
+TYPED_TEST_SUITE(ConstructScalarTensorTest, SimpleTypes);
+
+// This test constructs a scalar tensor for each of the types in "SimpleTypes",
+// and verifies the expected dimensions, dtype, value, number of bytes, and
+// number of elements.
+TYPED_TEST(ConstructScalarTensorTest, ValidTensorAttributesAfterConstruction) {
+  cc::Status status;
+  TF_DataType dtype = TypeParam::kDType;
+  typename TypeParam::type value = 42;
+  cc::Tensor tensor =
+      cc::Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{},
+                             /*data=*/&value,
+                             /*len=*/sizeof(value),
+                             /*deleter=*/[](void*, size_t) {}, &status);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  EXPECT_EQ(tensor.dims(), 0);
+  EXPECT_EQ(tensor.dtype(), dtype);
+  EXPECT_EQ(*reinterpret_cast<typename TypeParam::type*>(tensor.data()), 42);
+  EXPECT_EQ(tensor.num_bytes(), sizeof(typename TypeParam::type));
+  EXPECT_EQ(tensor.num_elements(), 1);
+}
+
+template <typename T>
+class Construct1DTensorTest : public ::testing::Test {};
+TYPED_TEST_SUITE(Construct1DTensorTest, SimpleTypes);
+
+// This test constructs a 1D tensor for each of the types in "SimpleTypes",
+// and verifies the expected dimensions, dtype, value, number of bytes, and
+// number of elements.
+TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) {
+  cc::Status status;
+  TF_DataType dtype = TypeParam::kDType;
+  // This is our 1D tensor of varying dtype.
+  std::vector<typename TypeParam::type> value = {42, 100, 0, 1, 4, 29};
+  // Shape is Rank 1 vector.
+  std::vector<int64_t> shape;
+  shape.push_back(value.size());
+
+  cc::Tensor tensor = cc::Tensor::FromBuffer(
+      /*dtype=*/dtype, /*shape=*/shape,
+      /*data=*/value.data(),
+      /*len=*/value.size() * sizeof(typename TypeParam::type),
+      /*deleter=*/[](void*, size_t) {}, &status);
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  EXPECT_EQ(tensor.dims(), 1);
+  EXPECT_EQ(tensor.dtype(), dtype);
+  gtl::ArraySlice<typename TypeParam::type> tensor_view(
+      reinterpret_cast<typename TypeParam::type*>(tensor.data()), value.size());
+  EXPECT_EQ(tensor_view[0], 42);
+  EXPECT_EQ(tensor_view[1], 100);
+  EXPECT_EQ(tensor_view[2], 0);
+  EXPECT_EQ(tensor_view[3], 1);
+  EXPECT_EQ(tensor_view[4], 4);
+  EXPECT_EQ(tensor_view[5], 29);
+
+  EXPECT_EQ(tensor.num_bytes(),
+            value.size() * sizeof(typename TypeParam::type));
+  EXPECT_EQ(tensor.num_elements(), value.size());
+}
+
+template <typename T>
+class Construct2DTensorTest : public ::testing::Test {};
+TYPED_TEST_SUITE(Construct2DTensorTest, SimpleTypes);
+
+// This test constructs a 2D tensor for each of the types in "SimpleTypes",
+// and verifies the expected dimensions, dtype, value, number of bytes, and
+// number of elements.
+TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) {
+  cc::Status status;
+  TF_DataType dtype = TypeParam::kDType;
+  // This is our 1D tensor of varying dtype.
+  std::vector<typename TypeParam::type> value = {42, 100, 0, 1, 4, 29};
+  // Shape is Rank 2 vector with shape 2 x 3.
+  std::vector<int64_t> shape({2, 3});
+
+  cc::Tensor tensor = cc::Tensor::FromBuffer(
+      /*dtype=*/dtype, /*shape=*/shape,
+      /*data=*/value.data(),
+      /*len=*/value.size() * sizeof(typename TypeParam::type),
+      /*deleter=*/[](void*, size_t) {}, &status);
+
+  ASSERT_TRUE(status.ok()) << status.message();
+
+  EXPECT_EQ(tensor.dims(), 2);
+  EXPECT_EQ(tensor.dtype(), dtype);
+  gtl::ArraySlice<typename TypeParam::type> tensor_view(
+      reinterpret_cast<typename TypeParam::type*>(tensor.data()), value.size());
+  EXPECT_EQ(tensor_view[0], 42);
+  EXPECT_EQ(tensor_view[1], 100);
+  EXPECT_EQ(tensor_view[2], 0);
+  EXPECT_EQ(tensor_view[3], 1);
+  EXPECT_EQ(tensor_view[4], 4);
+  EXPECT_EQ(tensor_view[5], 29);
+
+  EXPECT_EQ(tensor.num_bytes(),
+            value.size() * sizeof(typename TypeParam::type));
+  EXPECT_EQ(tensor.num_elements(), value.size());
+}
+
+TEST(CPPTensorAPI, ConstructTensorFromBuffer) {
+  bool done = false;
+  cc::Status status;
+  std::vector<int32_t> data_vector({12, 14, 20, 18, 39, 42, 100});
+  {
+    // data_vector is a rank 1 tensor.
+    std::vector<int64_t> shape;
+    shape.push_back(data_vector.size());
+
+    cc::Tensor::DeleterCallback callback = [&done](void* data, size_t len) {
+      done = true;
+    };
+
+    cc::Tensor tensor =
+        cc::Tensor::FromBuffer(/*dtype=*/TF_INT32, /*shape=*/shape,
+                               /*data=*/data_vector.data(),
+                               /*len=*/data_vector.size() * sizeof(int32_t),
+                               /*deleter=*/callback, &status);
+    ASSERT_TRUE(status.ok()) << status.message();
+  }
+  // At this point, tensor has been destroyed, and the deleter callback should
+  // have run.
+ EXPECT_TRUE(done); +} + +} // namespace +} // namespace tensorflow From cd69c497e5299cdf44743273c95e6b895c140f2e Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 10:01:05 -0700 Subject: [PATCH 0197/1533] Replace str_util.h with absl/strings in profiler PiperOrigin-RevId: 310576543 Change-Id: Ie4311fe44e525fc45d8751dd73e4013270ad4e71 --- tensorflow/core/profiler/convert/BUILD | 1 + .../profiler/convert/xplane_to_step_events.cc | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 387efc831ff..15a1ad03be3 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -310,6 +310,7 @@ cc_library( "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:trace_utils", "//tensorflow/core/profiler/utils:xplane_schema", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/convert/xplane_to_step_events.cc b/tensorflow/core/profiler/convert/xplane_to_step_events.cc index 78bd3dbee0f..c7dcd6266d2 100644 --- a/tensorflow/core/profiler/convert/xplane_to_step_events.cc +++ b/tensorflow/core/profiler/convert/xplane_to_step_events.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_step_events.h" -#include "tensorflow/core/lib/strings/str_util.h" +#include "absl/strings/match.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/trace_utils.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" @@ -25,19 +25,19 @@ namespace profiler { namespace { inline bool IsExplicitHostStepMarker(absl::string_view event_name) { - return (str_util::StartsWith(event_name, "train") || - str_util::StartsWith(event_name, "test") || - str_util::StartsWith(event_name, "TraceContext")) && - !str_util::StrContains(event_name, "/"); + return (absl::StartsWith(event_name, "train") || + absl::StartsWith(event_name, "test") || + absl::StartsWith(event_name, "TraceContext")) && + !absl::StrContains(event_name, "/"); } // Returns true if the given event_name should be considered as real computation // on CPU. inline bool IsRealCpuCompute(absl::string_view event_name) { - bool not_real = str_util::StartsWith(event_name, "EagerExecute") || - str_util::StartsWith(event_name, "EagerLocalExecute") || - str_util::StartsWith(event_name, "EagerKernelExecute") || - str_util::StartsWith(event_name, "FunctionRun") || + bool not_real = absl::StartsWith(event_name, "EagerExecute") || + absl::StartsWith(event_name, "EagerLocalExecute") || + absl::StartsWith(event_name, "EagerKernelExecute") || + absl::StartsWith(event_name, "FunctionRun") || IsExplicitHostStepMarker(event_name); return !not_real; } From 8e43239e14abd39f683e32346523c16fac172850 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 10:09:46 -0700 Subject: [PATCH 0198/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 310578363 Change-Id: I018eb4be6727c3e0868fbb796eedad09f5fc8ffe --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From b265dd5fb3be0b7a860b5419c0a2285f9693ae6d Mon Sep 17 00:00:00 2001 From: Saurabh Saxena Date: Fri, 8 May 2020 10:12:12 -0700 Subject: [PATCH 0199/1533] Set the AssignedDevice of the lowered control flow nodes (Enter, Switch, Merge, NextIteration, Exit) to be the same as the assigned device of the input loop variable if that is available during lowering. PiperOrigin-RevId: 310578865 Change-Id: I7118c26054be2d8fb239c3ed03b9a3e5c4685ef6 --- .../core/common_runtime/lower_while_op.cc | 57 +++++++----- .../common_runtime/lower_while_op_test.cc | 93 +++++++++++++++++++ 2 files changed, 125 insertions(+), 25 deletions(-) diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc index e9d322721f2..a28959703e5 100644 --- a/tensorflow/core/common_runtime/lower_while_op.cc +++ b/tensorflow/core/common_runtime/lower_while_op.cc @@ -238,12 +238,14 @@ Status LowerWhileHelper::CreateEnterNodes() { TF_RETURN_IF_ERROR(while_op_->input_edges(&edges)); for (const Edge* edge : edges) { Node* enter_node; - NodeBuilder builder = NodeBuilder(NewName("enter"), "Enter", - graph_->op_registry(), &debug_info_) - .Input(NodeOut(edge->src(), edge->src_output())) - .Attr("frame_name", name_) - .Attr("parallel_iterations", parallel_iterations_) - .Device(while_op_->requested_device()); + NodeBuilder builder = + NodeBuilder(NewName("enter"), "Enter", graph_->op_registry(), + &debug_info_) + .Input(NodeOut(edge->src(), edge->src_output())) + .Attr("frame_name", name_) + .Attr("parallel_iterations", parallel_iterations_) + .Device(edge->src()->requested_device()) + .AssignedDevice(edge->src()->assigned_device_name()); if (IsResource(edge->dst_input())) { builder.Attr("is_constant", true); } @@ -282,7 +284,8 @@ Status LowerWhileHelper::CreateMergeNodes() { NodeBuilder(NewName("merge"), "Merge", graph_->op_registry(), &debug_info_) .Input({NodeOut(enter_node, 0), NodeOut(enter_node, 0)}) - .Device(while_op_->requested_device()) + .Device(enter_node->requested_device()) + .AssignedDevice(enter_node->assigned_device_name()) .Finalize(graph_, &merge_node)); merge_nodes_.emplace_back(merge_node); } @@ -323,21 +326,19 @@ Status LowerWhileHelper::CreateSwitchNodes() { TF_RETURN_IF_ERROR(while_op_->input_node(i, &input_node)); op_name = strings::StrCat(input_node->name(), "_switch"); } + Node* merge_node = merge_nodes_[op_input_output_to_lowered_node_[i]]; Node* switch_node; string op_type = "Switch"; - if (IsRefType( - merge_nodes_[op_input_output_to_lowered_node_[i]]->output_type( - 0))) { + if (IsRefType(merge_node->output_type(0))) { op_type = "RefSwitch"; } - TF_RETURN_IF_ERROR( - 
NodeBuilder(NewName(op_name), op_type, graph_->op_registry(), - &debug_info_) - .Input( - NodeOut(merge_nodes_[op_input_output_to_lowered_node_[i]], 0)) - .Input(NodeOut(loop_cond_node_, 0)) - .Device(while_op_->requested_device()) - .Finalize(graph_, &switch_node)); + TF_RETURN_IF_ERROR(NodeBuilder(NewName(op_name), op_type, + graph_->op_registry(), &debug_info_) + .Input(NodeOut(merge_node, 0)) + .Input(NodeOut(loop_cond_node_, 0)) + .Device(merge_node->requested_device()) + .AssignedDevice(merge_node->assigned_device_name()) + .Finalize(graph_, &switch_node)); switch_nodes_.emplace_back(switch_node); } return Status::OK(); @@ -392,7 +393,10 @@ Status LowerWhileHelper::CreateExitNodes() { &debug_info_) .Input(NodeOut(switch_nodes_[op_input_output_to_lowered_node_[i]], 0)) - .Device(while_op_->requested_device()) + .Device(switch_nodes_[op_input_output_to_lowered_node_[i]] + ->requested_device()) + .AssignedDevice(switch_nodes_[op_input_output_to_lowered_node_[i]] + ->assigned_device_name()) .Finalize(graph_, &exit_node)); exit_nodes_.emplace_back(exit_node); outputs.emplace_back(NodeOut(exit_node, 0)); @@ -440,12 +444,15 @@ Status LowerWhileHelper::CreateNextIterationNodes() { if (IsResource(i)) { continue; } - TF_RETURN_IF_ERROR(NodeBuilder(NewName("next_iteration"), "NextIteration", - graph_->op_registry(), &debug_info_) - .Input(NodeOut(body_call_node_, i)) - .ControlInput(body_call_node_) - .Device(while_op_->requested_device()) - .Finalize(graph_, &next_iteration)); + TF_RETURN_IF_ERROR( + NodeBuilder(NewName("next_iteration"), "NextIteration", + graph_->op_registry(), &debug_info_) + .Input(NodeOut(body_call_node_, i)) + .ControlInput(body_call_node_) + .Device(while_op_->requested_device()) + .AssignedDevice(merge_nodes_[op_input_output_to_lowered_node_[i]] + ->assigned_device_name()) + .Finalize(graph_, &next_iteration)); next_iterations_nodes_.emplace_back(next_iteration); } return Status::OK(); diff --git a/tensorflow/core/common_runtime/lower_while_op_test.cc b/tensorflow/core/common_runtime/lower_while_op_test.cc index 0fc005cfb6f..65b9b523444 100644 --- a/tensorflow/core/common_runtime/lower_while_op_test.cc +++ b/tensorflow/core/common_runtime/lower_while_op_test.cc @@ -169,6 +169,99 @@ TEST(LowerWhileOpTest, Simple) { } } +TEST(LowerWhileOpTest, ForwardAssignedInputDevice) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + // Add test functions for cond and body. 
+ FunctionDefLibrary f_lib_proto; + *f_lib_proto.add_function() = test::function::XTimesTwo(); + *f_lib_proto.add_function() = test::function::LessThanOrEqualToN(8); + + TF_ASSERT_OK(graph->AddFunctionLibrary(f_lib_proto)); + auto type = DT_FLOAT; + Node* placeholder; + TF_CHECK_OK(NodeBuilder("placed_node", "Placeholder") + .Attr("dtype", type) + .Finalize(graph.get(), &placeholder)); + const string assigned_device_name = "/job:localhost/replica:0/task:0/gpu:0"; + placeholder->set_assigned_device_name(assigned_device_name); + Node* while_node; + std::vector inputs({NodeBuilder::NodeOut(placeholder)}); + AttrValue cond_func; + cond_func.mutable_func()->set_name("LessThanOrEqualToN"); + AttrValue body_func; + body_func.mutable_func()->set_name("XTimesTwo"); + TF_ASSERT_OK( + NodeBuilder("while", "While", &graph->flib_def()) + .Input(inputs) + .Attr("T", {type}) + .Attr("cond", cond_func) + .Attr("body", body_func) + .Attr("parallel_iterations", 100) + .Attr(LowerFunctionalOpsPass::kLowerUsingSwitchMergeAttr, true) + .Finalize(graph.get(), &while_node)); + TF_ASSERT_OK(Rewrite(&graph)); + + const Node* placeholder_node = nullptr; + for (const auto* op : graph->op_nodes()) { + if (op->name() == "placed_node") { + placeholder_node = op; + } + } + ASSERT_NE(placeholder_node, nullptr); + // Verify the assigned device of the Enter node. + int enter_consumers = 0; + const Node* enter_node = nullptr; + for (const Node* consumer : placeholder_node->out_nodes()) { + if (consumer->type_string() == "Enter") { + enter_consumers += 1; + enter_node = consumer; + ASSERT_EQ(consumer->assigned_device_name(), assigned_device_name); + } + } + ASSERT_EQ(enter_consumers, 1); + // Verify the assigned device of the Merge node. + int merge_consumers = 0; + const Node* merge_node = nullptr; + for (const Node* consumer : enter_node->out_nodes()) { + if (consumer->type_string() == "Merge") { + merge_consumers += 1; + merge_node = consumer; + ASSERT_EQ(consumer->assigned_device_name(), assigned_device_name); + } + } + ASSERT_EQ(merge_consumers, 1); + // Verify the assigned device of the NextIteration node. + int next_iteration_consumers = 0; + for (const Node* consumer : merge_node->in_nodes()) { + if (consumer->type_string() == "NextIteration") { + next_iteration_consumers += 1; + ASSERT_EQ(consumer->assigned_device_name(), assigned_device_name); + } + } + ASSERT_EQ(next_iteration_consumers, 1); + // Verify the assigned device of the Switch node. + int switch_consumers = 0; + const Node* switch_node = nullptr; + for (const Node* consumer : merge_node->out_nodes()) { + if (consumer->type_string() == "Switch") { + switch_consumers += 1; + switch_node = consumer; + ASSERT_EQ(consumer->assigned_device_name(), assigned_device_name); + } + } + ASSERT_EQ(switch_consumers, 1); + // Verify the assigned device of the Exit node. + int exit_consumers = 0; + for (const Node* consumer : switch_node->out_nodes()) { + if (consumer->type_string() == "Exit") { + exit_consumers += 1; + ASSERT_EQ(consumer->assigned_device_name(), assigned_device_name); + } + } + ASSERT_EQ(exit_consumers, 1); +} + TEST(LowerWhileOpTest, MultipleInputs) { std::unique_ptr graph(new Graph(OpRegistry::Global())); From 35fc29e5d1a6ffa29fa6207378c997233a69c9cb Mon Sep 17 00:00:00 2001 From: Parker Schuh Date: Fri, 8 May 2020 10:25:20 -0700 Subject: [PATCH 0200/1533] Internal visibility addition. 
PiperOrigin-RevId: 310581614 Change-Id: Ib4e7a80999211cec4be399f8c8925039c126acb9 --- tensorflow/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index f2018220a56..bf3af3c31b4 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -517,6 +517,7 @@ package_group( "//perftools/accelerators/xprof/api/...", "//third_party/py/autograph/...", "//third_party/swift/tensorflow/x10/...", + "//third_party/swift/tensorflow_apis/...", "//tensorflow/...", "//tensorflow_estimator/python/estimator/...", "//tensorflow_models/official/...", From b08e6cd85aba13749accab67f9f94f00621ecb9c Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 10:28:51 -0700 Subject: [PATCH 0201/1533] Remove use of gtl/cleanup.h and hash.h in cupti_tracer PiperOrigin-RevId: 310582340 Change-Id: I1c087a126947c1eced849deb3f8a5689ebe08d08 --- .../profiler/internal/gpu/cupti_tracer.cc | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 1110e103d57..3db6a8d029d 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -18,8 +18,6 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/node_hash_map.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/gtl/cleanup.h" -#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -286,19 +284,14 @@ void CUPTIAPI FreeCuptiActivityBuffer(CUcontext context, uint32_t stream_id, << reinterpret_cast(buffer) << std::dec << " size: " << size << " valid_size: " << valid_size; - // Ensure buffer is free when this function returns. - auto buffer_cleanup = - gtl::MakeCleanup([buffer] { port::AlignedFree(buffer); }); + if (valid_size > 0) { + VLOG(3) << "Activity profile for stream " << stream_id; - if (valid_size <= 0) { - return; + CuptiTracer *cupti_tracer = CuptiTracer::GetCuptiTracerSingleton(); + cupti_tracer->ProcessActivityBuffer(context, stream_id, buffer, valid_size) + .IgnoreError(); } - - VLOG(3) << "Activity profile for stream " << stream_id; - - CuptiTracer *cupti_tracer = CuptiTracer::GetCuptiTracerSingleton(); - cupti_tracer->ProcessActivityBuffer(context, stream_id, buffer, valid_size) - .IgnoreError(); + port::AlignedFree(buffer); } void AddKernelEventUponApiExit(CuptiTraceCollector *collector, uint32 device_id, @@ -984,7 +977,7 @@ class CudaEventRecorder { using StreamKey = std::pair; absl::node_hash_map context_infos_; - absl::flat_hash_map> stream_infos_; + absl::flat_hash_map stream_infos_; }; // This hook uses cuda events to measure device side activities. From 546319f28a6e4640dfc9ab8cc8f0615cbebdaef1 Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Fri, 8 May 2020 10:36:54 -0700 Subject: [PATCH 0202/1533] Add skip_if_error context manager in test_util.py to conveniently skip errors that are not related to what is being tested. Fix lingering test flakiness. 
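Example usage (a sketch only; the error message string and the `run_training` helper below are
illustrative assumptions, not part of this change — the diff itself wraps
`multi_process_runner.run(...)` in the same way with `errors_impl.UnavailableError`):

    def test_multi_worker_training(self):
      # Skip, rather than fail, when the cluster is unreachable; any error
      # that does not match the given type/message is re-raised as usual.
      with test_util.skip_if_error(self, errors_impl.UnavailableError,
                                   'failed to connect to all addresses'):
        run_training()  # hypothetical flaky multi-worker routine under test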
PiperOrigin-RevId: 310583994 Change-Id: I15925753b6faf9dc5bf3603231b248aa02965c19 --- tensorflow/python/distribute/BUILD | 1 + .../distribute/multi_process_runner_test.py | 2 +- .../multi_worker_continuous_run_test.py | 17 ++++-- tensorflow/python/framework/test_util.py | 32 ++++++++++ tensorflow/python/framework/test_util_test.py | 61 +++++++++++++++++++ .../distribute/multi_worker_tutorial_test.py | 11 ++-- 6 files changed, 113 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index add23bef4b9..a7e62a2dc7c 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -452,6 +452,7 @@ cuda_py_test( "//tensorflow/python:array_ops", "//tensorflow/python:errors", "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", "//tensorflow/python:math_ops", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/eager:context", diff --git a/tensorflow/python/distribute/multi_process_runner_test.py b/tensorflow/python/distribute/multi_process_runner_test.py index 47c3a744419..1413777d0bc 100644 --- a/tensorflow/python/distribute/multi_process_runner_test.py +++ b/tensorflow/python/distribute/multi_process_runner_test.py @@ -160,7 +160,7 @@ class MultiProcessRunnerTest(test.TestCase): for i in range(0, 10): print( 'index {}, iteration {}'.format(self._worker_idx(), i), flush=True) - time.sleep(1) + time.sleep(5) mpr = multi_process_runner.MultiProcessRunner( proc_func, diff --git a/tensorflow/python/distribute/multi_worker_continuous_run_test.py b/tensorflow/python/distribute/multi_worker_continuous_run_test.py index 90484a12423..437255c1015 100644 --- a/tensorflow/python/distribute/multi_worker_continuous_run_test.py +++ b/tensorflow/python/distribute/multi_worker_continuous_run_test.py @@ -33,10 +33,13 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import test from tensorflow.python.framework import config +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import variable_scope + NUM_WORKERS = 5 @@ -84,9 +87,10 @@ class MultiWorkerContinuousRunTest(test.TestCase, parameterized.TestCase): for _ in range(20): worker_step_fn(worker_id) - multi_process_runner.run( - worker_fn, - cluster_spec=test_base.create_cluster_spec(num_workers=NUM_WORKERS)) + with test_util.skip_if_error(self, errors_impl.UnavailableError): + multi_process_runner.run( + worker_fn, + cluster_spec=test_base.create_cluster_spec(num_workers=NUM_WORKERS)) @combinations.generate(combinations.combine(mode=['eager'])) def testVariableInitializationWithChangingShape(self, mode): @@ -116,9 +120,10 @@ class MultiWorkerContinuousRunTest(test.TestCase, parameterized.TestCase): for i in range(20): worker_step_fn(worker_id, num_dims=(i + 1)) - multi_process_runner.run( - worker_fn, - cluster_spec=test_base.create_cluster_spec(num_workers=NUM_WORKERS)) + with test_util.skip_if_error(self, errors_impl.UnavailableError): + multi_process_runner.run( + worker_fn, + cluster_spec=test_base.create_cluster_spec(num_workers=NUM_WORKERS)) if __name__ == '__main__': diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index fa2a4f63e6c..d5bbd889166 100644 --- a/tensorflow/python/framework/test_util.py +++ 
b/tensorflow/python/framework/test_util.py @@ -460,6 +460,38 @@ def skip_if(condition): return real_skip_if +@contextlib.contextmanager +def skip_if_error(test_obj, error_type, messages=None): + """Context manager to skip cases not considered failures by the tests. + + Note that this does not work if used in setUpClass/tearDownClass. + Usage in setUp/tearDown works fine just like regular test methods. + + Args: + test_obj: A test object provided as `self` in the test methods; this object + is usually an instance of `unittest.TestCase`'s subclass and should have + `skipTest` method. + error_type: The error type to skip. Note that if `messages` are given, both + `error_type` and `messages` need to match for the test to be skipped. + messages: Optional, a string or list of strings. If `None`, the test will be + skipped if `error_type` matches what is raised; otherwise, the test is + skipped if any of the `messages` is contained in the message of the error + raised, and `error_type` matches the error raised. + + Yields: + Nothing. + """ + if messages: + messages = nest.flatten(messages) + try: + yield + except error_type as e: + if not messages or any([message in str(e) for message in messages]): + test_obj.skipTest("Skipping error: {}".format(str(e))) + else: + raise + + def enable_c_shapes(fn): """No-op. TODO(b/74620627): Remove this.""" return fn diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py index b5cb903c666..2bd75c3919e 100644 --- a/tensorflow/python/framework/test_util_test.py +++ b/tensorflow/python/framework/test_util_test.py @@ -22,6 +22,7 @@ import collections import copy import random import threading +import unittest import weakref from absl.testing import parameterized @@ -808,6 +809,66 @@ class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase): self.assertEqual(tested_codepaths, set(["present", "future"])) +class SkipTestTest(test_util.TensorFlowTestCase): + + def _verify_test_in_set_up_or_tear_down(self): + with self.assertRaises(unittest.SkipTest): + with test_util.skip_if_error(self, ValueError, + ["foo bar", "test message"]): + raise ValueError("test message") + try: + with self.assertRaisesRegexp(ValueError, "foo bar"): + with test_util.skip_if_error(self, ValueError, "test message"): + raise ValueError("foo bar") + except unittest.SkipTest: + raise RuntimeError("Test is not supposed to skip.") + + def setUp(self): + super(SkipTestTest, self).setUp() + self._verify_test_in_set_up_or_tear_down() + + def tearDown(self): + super(SkipTestTest, self).tearDown() + self._verify_test_in_set_up_or_tear_down() + + def test_skip_if_error_should_skip(self): + with self.assertRaises(unittest.SkipTest): + with test_util.skip_if_error(self, ValueError, "test message"): + raise ValueError("test message") + + def test_skip_if_error_should_skip_with_list(self): + with self.assertRaises(unittest.SkipTest): + with test_util.skip_if_error(self, ValueError, + ["foo bar", "test message"]): + raise ValueError("test message") + + def test_skip_if_error_should_skip_without_expected_message(self): + with self.assertRaises(unittest.SkipTest): + with test_util.skip_if_error(self, ValueError): + raise ValueError("test message") + + def test_skip_if_error_should_skip_without_error_message(self): + with self.assertRaises(unittest.SkipTest): + with test_util.skip_if_error(self, ValueError): + raise ValueError() + + def test_skip_if_error_should_raise_message_mismatch(self): + try: + with self.assertRaisesRegexp(ValueError, "foo 
bar"): + with test_util.skip_if_error(self, ValueError, "test message"): + raise ValueError("foo bar") + except unittest.SkipTest: + raise RuntimeError("Test is not supposed to skip.") + + def test_skip_if_error_should_raise_no_message(self): + try: + with self.assertRaisesRegexp(ValueError, ""): + with test_util.skip_if_error(self, ValueError, "test message"): + raise ValueError() + except unittest.SkipTest: + raise RuntimeError("Test is not supposed to skip.") + + # Its own test case to reproduce variable sharing issues which only pop up when # setUp() is overridden and super() is not called. class GraphAndEagerNoVariableSharing(test_util.TensorFlowTestCase): diff --git a/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py b/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py index 0a9c5547f5a..1a46bcd7499 100644 --- a/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py +++ b/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py @@ -28,6 +28,8 @@ from tensorflow.python.distribute import collective_all_reduce_strategy from tensorflow.python.distribute import combinations from tensorflow.python.distribute import multi_process_runner from tensorflow.python.distribute import multi_worker_test_base +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import test_util from tensorflow.python.keras.datasets import mnist from tensorflow.python.keras.optimizer_v2 import gradient_descent from tensorflow.python.platform import test @@ -122,10 +124,11 @@ class MultiWorkerTutorialTest(parameterized.TestCase, test.TestCase): steps_per_epoch=70, callbacks=callbacks) - mpr_result = multi_process_runner.run( - proc_func, - multi_worker_test_base.create_cluster_spec(num_workers=num_workers), - list_stdout=True) + with test_util.skip_if_error(self, errors_impl.UnavailableError): + mpr_result = multi_process_runner.run( + proc_func, + multi_worker_test_base.create_cluster_spec(num_workers=num_workers), + list_stdout=True) def extract_accuracy(worker_id, input_string): match = re.match( From a37a3569f4faece52d56fffb9aef3757cf6d03f1 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Fri, 8 May 2020 11:02:28 -0700 Subject: [PATCH 0203/1533] Add a nested tf.function with control flow test. 
PiperOrigin-RevId: 310589571 Change-Id: Icb71cd7f50d77fe4b67ba21bedf415cdc8ff24bd --- .../custom_training_loop_models_test.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tensorflow/python/distribute/custom_training_loop_models_test.py b/tensorflow/python/distribute/custom_training_loop_models_test.py index 3c748bd7364..48f2af0349a 100644 --- a/tensorflow/python/distribute/custom_training_loop_models_test.py +++ b/tensorflow/python/distribute/custom_training_loop_models_test.py @@ -378,6 +378,46 @@ class KerasModelsTest(test.TestCase, parameterized.TestCase): for model_v, model2_v in zip(model.variables, model2.variables): self.assertAllClose(model_v.numpy(), model2_v.numpy()) + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.all_strategies, mode=["eager"])) + def test_nested_tf_functions_with_control_flow(self, distribution): + inputs = np.random.random((10, 3)).astype(np.float32) + targets = np.ones((10, 4), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)).repeat() + dataset = dataset.batch(10, drop_remainder=True) + input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) + + def get_model(): + x = keras.layers.Input(shape=(3,), name="input") + y = keras.layers.Dense(4, name="dense")(x) + model = keras.Model(x, y) + return model + + with distribution.scope(): + model = get_model() + optimizer = keras.optimizer_v2.gradient_descent.SGD(0.1, momentum=0.01) + + @def_function.function + def train_step(iterator): + + def step_fn(inputs): + images, targets = inputs + with backprop.GradientTape() as tape: + outputs = model(images) + loss = math_ops.reduce_sum(outputs - targets) + grads = tape.gradient(loss, model.variables) + optimizer.apply_gradients(zip(grads, model.variables)) + + distribution.run(step_fn, args=(next(iterator),)) + + @def_function.function + def train_steps(iterator): + for _ in math_ops.range(10): + train_step(iterator) + + train_steps(input_iterator) + @combinations.generate( combinations.combine( distribution=strategy_combinations.all_strategies, From 54fa7e441969792fa03f897cbd1fdbaf2e4d18e4 Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Fri, 8 May 2020 11:07:04 -0700 Subject: [PATCH 0204/1533] Resubmitting the change to make iterator deleter not on host memory PiperOrigin-RevId: 310590461 Change-Id: Iea22ee212637252f66ccc9bc1e96dffe8b3cc3a7 --- tensorflow/compiler/jit/xla_device_ops.h | 10 ++++------ tensorflow/core/kernels/data/dataset_utils.h | 5 ++++- tensorflow/core/kernels/data/iterator_ops.cc | 6 ++---- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h index 34ff0c55615..17e4226405a 100644 --- a/tensorflow/compiler/jit/xla_device_ops.h +++ b/tensorflow/compiler/jit/xla_device_ops.h @@ -180,12 +180,10 @@ class XlaAssignVariableOp : public OpKernel { data::MakeIteratorOp); \ REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE), \ data::AnonymousIteratorHandleOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("AnonymousIteratorV2").Device(DEVICE).HostMemory("deleter"), \ - data::AnonymousIteratorHandleOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("DeleteIterator").Device(DEVICE).HostMemory("deleter"), \ - data::DeleteIteratorOp); \ + REGISTER_KERNEL_BUILDER(Name("AnonymousIteratorV2").Device(DEVICE), \ + data::AnonymousIteratorHandleOp); \ + REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE), \ + data::DeleteIteratorOp); \ 
REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE), \ data::IteratorGetNextOp); \ REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE), \ diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h index d8ae7190a7f..70ca70176e8 100644 --- a/tensorflow/core/kernels/data/dataset_utils.h +++ b/tensorflow/core/kernels/data/dataset_utils.h @@ -63,7 +63,10 @@ class AnonymousResourceOp : public OpKernel { if (create_deleter_) { Tensor* deleter_t; - OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &deleter_t)); + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK( + ctx, ctx->allocate_output(1, TensorShape({}), &deleter_t, attr)); deleter_t->scalar()() = ResourceDeleter(handle, ctx->resource_manager()); } diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index a2e8ca13192..9fb3c5fb46e 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -1102,9 +1102,8 @@ REGISTER_KERNEL_BUILDER( MakeIteratorOp); REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE_CPU).Priority(2), DeleteIteratorOp); -REGISTER_KERNEL_BUILDER( - Name("DeleteIterator").Device(DEVICE_GPU).HostMemory("deleter").Priority(1), - DeleteIteratorOp); +REGISTER_KERNEL_BUILDER(Name("DeleteIterator").Device(DEVICE_GPU).Priority(1), + DeleteIteratorOp); REGISTER_KERNEL_BUILDER( Name("AnonymousIterator").Device(DEVICE_CPU).Priority(2), AnonymousIteratorHandleOp); @@ -1116,7 +1115,6 @@ REGISTER_KERNEL_BUILDER( AnonymousIteratorHandleOp); REGISTER_KERNEL_BUILDER(Name("AnonymousIteratorV2") .Device(DEVICE_GPU) - .HostMemory("deleter") .Priority(1), AnonymousIteratorHandleOp); REGISTER_KERNEL_BUILDER(Name("DatasetToSingleElement").Device(DEVICE_CPU), From ae3c619cf71bc39e52cbd0e0c75bef1bf4143be5 Mon Sep 17 00:00:00 2001 From: Jing Dong Date: Fri, 8 May 2020 11:08:50 -0700 Subject: [PATCH 0205/1533] Move c_api_tfrt to core/tfrt/eager (NFC) c_api_tfrt is an implementation of the C API, so it should not be in c/eager/. Move it to core/tfrt/eager to mirror the setup for the current TF runtime directory core/common_runtime/eager. PiperOrigin-RevId: 310590751 Change-Id: I6840756c321c29eec2a6b648c3484ec4fc8bd46e --- tensorflow/c/eager/BUILD | 2 -- tensorflow/c/eager/c_api.cc | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 3d3fc7065a4..d3059df1bef 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -16,7 +16,6 @@ load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", ) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") package( licenses = ["notice"], # Apache 2.0 @@ -609,7 +608,6 @@ filegroup( ], exclude = [ "c_api_experimental.cc", - "*c_api_tfrt*", "*test*", "*dlpack*", ], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 9651a47d6ac..73c2f7824b2 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -38,7 +38,7 @@ limitations under the License. 
#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/tf_tensor_internal.h" #ifdef PLATFORM_GOOGLE -#include "tensorflow/c/eager/c_api_tfrt.h" +#include "tensorflow/core/tfrt/eager/c_api_tfrt.h" #endif #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/context.h" From 820d915c56d419feaeb3b4421b191009886befad Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Fri, 8 May 2020 11:28:48 -0700 Subject: [PATCH 0206/1533] Reduce space required for TFLM by dropping two non-needed pointers from MicroInterpreter. PiperOrigin-RevId: 310594944 Change-Id: I779def7c2e627bec64556a752981db98b6070884 --- tensorflow/lite/micro/micro_interpreter.cc | 10 ++++------ tensorflow/lite/micro/micro_interpreter.h | 4 +--- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index c5d35407648..2d774d0a139 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -87,8 +87,6 @@ MicroInterpreter::MicroInterpreter(const Model* model, return; } subgraph_ = (*subgraphs)[0]; - tensors_ = subgraph_->tensors(); - operators_ = subgraph_->operators(); context_.impl_ = static_cast(&context_helper_); context_.ReportError = context_helper_.ReportOpError; @@ -112,7 +110,7 @@ MicroInterpreter::MicroInterpreter(const Model* model, MicroInterpreter::~MicroInterpreter() { if (node_and_registrations_ != nullptr) { - for (size_t i = 0; i < operators_->size(); ++i) { + for (size_t i = 0; i < subgraph_->operators()->size(); ++i) { TfLiteNode* node = &(node_and_registrations_[i].node); const TfLiteRegistration* registration = node_and_registrations_[i].registration; @@ -171,7 +169,7 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { context_.RequestScratchBufferInArena = nullptr; context_.GetScratchBuffer = nullptr; - for (size_t i = 0; i < operators_->size(); ++i) { + for (size_t i = 0; i < subgraph_->operators()->size(); ++i) { context_helper_.SetNodeIndex(i); auto* node = &(node_and_registrations_[i].node); auto* registration = node_and_registrations_[i].registration; @@ -195,7 +193,7 @@ TfLiteStatus MicroInterpreter::AllocateTensors() { // in Prepare stage. context_.RequestScratchBufferInArena = context_helper_.RequestScratchBufferInArena; - for (size_t i = 0; i < operators_->size(); ++i) { + for (size_t i = 0; i < subgraph_->operators()->size(); ++i) { // Set node idx to annotate the lifetime for scratch buffers. context_helper_.SetNodeIndex(i); auto* node = &(node_and_registrations_[i].node); @@ -237,7 +235,7 @@ TfLiteStatus MicroInterpreter::Invoke() { TF_LITE_ENSURE_OK(&context_, AllocateTensors()); } - for (size_t i = 0; i < operators_->size(); ++i) { + for (size_t i = 0; i < subgraph_->operators()->size(); ++i) { auto* node = &(node_and_registrations_[i].node); auto* registration = node_and_registrations_[i].registration; diff --git a/tensorflow/lite/micro/micro_interpreter.h b/tensorflow/lite/micro/micro_interpreter.h index b2046128c78..15f53b681a6 100644 --- a/tensorflow/lite/micro/micro_interpreter.h +++ b/tensorflow/lite/micro/micro_interpreter.h @@ -132,7 +132,7 @@ class MicroInterpreter { TfLiteStatus initialization_status() const { return initialization_status_; } - size_t operators_size() const { return operators_->size(); } + size_t operators_size() const { return subgraph_->operators()->size(); } // For debugging only. 
const NodeAndRegistration node_and_registration(int node_index) const { @@ -163,8 +163,6 @@ class MicroInterpreter { bool tensors_allocated_; TfLiteStatus initialization_status_; - const flatbuffers::Vector>* tensors_; - const flatbuffers::Vector>* operators_; const SubGraph* subgraph_; internal::ContextHelper context_helper_; From 85fb70ad19b47d3166a3e70893968075103efd91 Mon Sep 17 00:00:00 2001 From: Geeta Chavan Date: Fri, 8 May 2020 11:34:03 -0700 Subject: [PATCH 0207/1533] Merge release notes and version updates to master PiperOrigin-RevId: 310596034 Change-Id: Icdd7dabb0fc2f6e56c4ce7c7a9be70cdc643cc68 --- RELEASE.md | 144 ++++++++++++++++++++++++++ tensorflow/tensorflow.bzl | 2 +- tensorflow/tools/pip_package/setup.py | 4 +- 3 files changed, 147 insertions(+), 3 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index b5d088821e4..6c8921cf492 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,147 @@ +# Release 2.2.0 + +TensorFlow 2.2 discontinues support for Python 2, [previously announced](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ) as following [Python 2's EOL on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). + +Coinciding with this change, new releases of [TensorFlow's Docker images](https://hub.docker.com/r/tensorflow/tensorflow/) provide Python 3 exclusively. Because all images now use Python 3, Docker tags containing `-py3` will no longer be provided and existing `-py3` tags like `latest-py3` will not be updated. + +## Major Features and Improvements + +* Replaced the scalar type for string tensors from `std::string` to `tensorflow::tstring` which is now ABI stable. +* A new Profiler for TF 2 for CPU/GPU/TPU. It offers both device and host performance analysis, including input pipeline and TF Ops. Optimization advisory is provided whenever possible. Please see [this tutorial](https://www.tensorflow.org/tensorboard/tensorboard_profiling_keras) and [guide](https://www.tensorflow.org/guide/profiler) for usage guidelines. +* Export C++ functions to Python using `pybind11` as opposed to `SWIG` as a part of our [deprecation of swig efforts](https://github.com/tensorflow/community/blob/master/rfcs/20190208-pybind11.md). +* `tf.distribute`: + * Support added for global sync `BatchNormalization` by using the newly added `tf.keras.layers.experimental.SyncBatchNormalization` layer. This layer will sync `BatchNormalization` statistics every step across all replicas taking part in sync training. + * Performance improvements for GPU multi-worker distributed training using `tf.distribute.experimental.MultiWorkerMirroredStrategy` + * Update NVIDIA `NCCL` to `2.5.7-1` for better performance and performance tuning. Please see [nccl developer guide](https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/env.html) for more information on this. + * Support gradient `allreduce` in `float16`. See this [example](https://github.com/tensorflow/models/blob/master/official/staging/training/grad_utils.py) usage. + * Experimental support of [all reduce gradient packing](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/CollectiveHints) to allow overlapping gradient aggregation with backward path computation. + * Deprecated `experimental_run_v2` method for distribution strategies and renamed the method `run` as it is no longer experimental. + * Add CompositeTensor support for DistributedIterators. This should help prevent unnecessary function retracing and memory leaks. 
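The `tf.distribute` items above are easiest to see in a short custom-training-loop sketch. This is a minimal illustration, not code from the release itself; the model, loss, and dataset are placeholder choices, and it assumes a default `MirroredStrategy`:

    import numpy as np
    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
      # Variables are created under the strategy scope, before `run` is called.
      model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])
      optimizer = tf.keras.optimizers.SGD(0.1)

    dataset = tf.data.Dataset.from_tensor_slices(
        (np.random.random((16, 3)).astype(np.float32),
         np.ones((16, 4), np.float32))).batch(8)
    dist_dataset = strategy.experimental_distribute_dataset(dataset)

    @tf.function
    def train_step(dist_inputs):

      def step_fn(inputs):
        features, labels = inputs
        with tf.GradientTape() as tape:
          loss = tf.reduce_mean(tf.square(model(features) - labels))
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))
        return loss

      # `Strategy.run` is the renamed, no-longer-experimental `experimental_run_v2`.
      per_replica_losses = strategy.run(step_fn, args=(dist_inputs,))
      return strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)

    for batch in dist_dataset:
      train_step(batch)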
+* `tf.keras`: + * `Model.fit` major improvements: + * You can now use custom training logic with `Model.fit` by overriding `Model.train_step`. + * Easily write state-of-the-art training loops without worrying about all of the features `Model.fit` handles for you (distribution strategies, callbacks, data formats, looping logic, etc) + * See the default [`Model.train_step`](https://github.com/tensorflow/tensorflow/blob/1381fc8e15e22402417b98e3881dfd409998daea/tensorflow/python/keras/engine/training.py#L540) for an example of what this function should look like. Same applies for validation and inference via `Model.test_step` and `Model.predict_step`. + * SavedModel uses its own `Model._saved_model_inputs_spec` attr now instead of + relying on `Model.inputs` and `Model.input_names`, which are no longer set for subclass Models. + This attr is set in eager, `tf.function`, and graph modes. This gets rid of the need for users to + manually call `Model._set_inputs` when using Custom Training Loops(CTLs). + * Dynamic shapes are supported for generators by calling the Model on the first batch we "peek" from the generator. + This used to happen implicitly in `Model._standardize_user_data`. Long-term, a solution where the + `DataAdapter` doesn't need to call the Model is probably preferable. + * The SavedModel format now supports all Keras built-in layers (including metrics, preprocessing layers, and stateful RNN layers) + * Update Keras batch normalization layer to use the running mean and average computation in the `fused_batch_norm`. You should see significant performance improvements when using `fused_batch_norm` in Eager mode. + +* `tf.lite`: + * Enable TFLite experimental new converter by default. +* XLA + * XLA now builds and works on windows. All prebuilt packages come with XLA available. + * XLA can be [enabled for a `tf.function`](https://www.tensorflow.org/xla#explicit_compilation_with_tffunction +) with “compile or throw exception” semantics on CPU and GPU. + +## Breaking Changes +* `tf.keras`: + * In `tf.keras.applications` the name of the "top" layer has been standardized to "predictions". This is only a problem if your code relies on the exact name of the layer. + * Huber loss function has been updated to be consistent with other Keras losses. It now computes mean over the last axis of per-sample losses before applying the reduction function. +* AutoGraph no longer converts functions passed to `tf.py_function`, `tf.py_func` and `tf.numpy_function`. +* Deprecating `XLA_CPU` and `XLA_GPU` devices with this release. +* Increasing the minimum bazel version to build TF to 2.0.0 to use Bazel's `cc_experimental_shared_library`. +* Keras compile/fit behavior for functional and subclassed models have been unified. Model properties such as `metrics`, `metrics_names` will now be available only after **training/evaluating the model on actual data** for functional models. `metrics` will **now include** model `loss` and output losses.`loss_functions` property has been removed from the model. This was an undocumented property that was accidentally public and has now been removed. + +## Known Caveats +* The current TensorFlow release now **requires** [gast](https://pypi.org/project/gast/) version 0.3.3. + +## Bug Fixes and Other Changes +* `tf.data`: + * Removed `autotune_algorithm` from experimental optimization options. +* TF Core: + * `tf.constant` always creates CPU tensors irrespective of the current device context. 
+ * Eager `TensorHandles` maintain a list of mirrors for any copies to local or remote devices. This avoids any redundant copies due to op execution. + * For `tf.Tensor` & `tf.Variable`, `.experimental_ref()` is no longer experimental and is available as simply `.ref()`. + * `pfor/vectorized_map`: Added support for vectorizing 56 more ops. Vectorizing `tf.cond` is also supported now. + * Set as much partial shape as we can infer statically within the gradient impl of the gather op. + * Gradient of `tf.while_loop` emits `StatelessWhile` op if `cond` and body functions are stateless. This allows multiple gradients while ops to run in parallel under distribution strategy. + * Speed up `GradientTape` in eager mode by auto-generating list of op inputs/outputs which are unused and hence not cached for gradient functions. + * Support `back_prop=False` in `while_v2` but mark it as deprecated. + * Improve error message when attempting to use `None` in data-dependent control flow. + * Add `RaggedTensor.numpy()`. + * Update `RaggedTensor.__getitem__` to preserve uniform dimensions & allow indexing into uniform dimensions. + * Update `tf.expand_dims` to always insert the new dimension as a non-ragged dimension. + * Update `tf.embedding_lookup` to use `partition_strategy` and `max_norm` when `ids` is ragged. + * Allow `batch_dims==rank(indices)` in `tf.gather`. + * Add support for bfloat16 in `tf.print`. +* `tf.distribute`: + * Support `embedding_column` with variable-length input features for `MultiWorkerMirroredStrategy`. +* `tf.keras`: + * Added `experimental_aggregate_gradients` argument to `tf.keras.optimizer.Optimizer.apply_gradients`. This allows custom gradient aggregation and processing aggregated gradients in custom training loop. + * Allow `pathlib.Path` paths for loading models via Keras API. +* `tf.function`/AutoGraph: + * AutoGraph is now available in `ReplicaContext.merge_call`, `Strategy.extended.update` and `Strategy.extended.update_non_slot`. + * Experimental support for shape invariants has been enabled in `tf.function`. See the API docs for `tf.autograph.experimental.set_loop_options` for additonal info. + * AutoGraph error messages now exclude frames corresponding to APIs internal to AutoGraph. + * Improve shape inference for `tf.function` input arguments to unlock more Grappler optimizations in TensorFlow 2.x. + * Improve automatic control dependency management of resources by allowing resource reads to occur in parallel and synchronizing only on writes. + * Fix execution order of multiple stateful calls to `experimental_run_v2` in `tf.function`. + * You can now iterate over `RaggedTensors` using a for loop inside `tf.function`. +* `tf.lite`: + * Migrated the `tf.lite` C inference API out of experimental into lite/c. + * Add an option to disallow `NNAPI` CPU / partial acceleration on Android 10 + * TFLite Android AARs now include the C headers and APIs are required to use TFLite from native code. + * Refactors the delegate and delegate kernel sources to allow usage in the linter. + * Limit delegated ops to actually supported ones if a device name is specified or `NNAPI` CPU Fallback is disabled. + * TFLite now supports `tf.math.reciprocal1` op by lowering to `tf.div op`. + * TFLite's unpack op now supports boolean tensor inputs. + * Microcontroller and embedded code moved from experimental to main TensorFlow Lite folder + * Check for large TFLite tensors. + * Fix GPU delegate crash with C++17. + * Add 5D support to TFLite `strided_slice`. 
+ * Fix error in delegation of `DEPTH_TO_SPACE` to `NNAPI` causing op not to be accelerated. + * Fix segmentation fault when running a model with LSTM nodes using `NNAPI` Delegate + * Fix `NNAPI` delegate failure when an operand for Maximum/Minimum operation is a scalar. + * Fix `NNAPI` delegate failure when Axis input for reduce operation is a scalar. + * Expose option to limit the number of partitions that will be delegated to `NNAPI`. + * If a target accelerator is specified, use its feature level to determine operations to delegate instead of SDK version. +* `tf.random`: + * Various random number generation improvements: + * Add a fast path for default `random_uniform` + * `random_seed` documentation improvement. + * `RandomBinomial` broadcasts and appends the sample shape to the left rather than the right. + * Added `tf.random.stateless_binomial`, `tf.random.stateless_gamma`, `tf.random.stateless_poisson` + * `tf.random.stateless_uniform` now supports unbounded sampling of `int` types. +* Math and Linear Algebra: + * Add `tf.linalg.LinearOperatorTridiag`. + * Add `LinearOperatorBlockLowerTriangular` + * Add broadcasting support to tf.linalg.triangular_solve[#26204](https://github.com/tensorflow/tensorflow/issues/26204), tf.math.invert_permutation. + * Add `tf.math.sobol_sample` op. + * Add `tf.math.xlog1py`. + * Add `tf.math.special.{dawsn,expi,fresnel_cos,fresnel_sin,spence}`. + * Add a Modified Discrete Cosine Transform (MDCT) and its inverse to `tf.signal`. +* TPU Enhancements: + * Refactor `TpuClusterResolver` to move shared logic to a separate pip package. + * Support configuring TPU software version from cloud tpu client. + * Allowed TPU embedding weight decay factor to be multiplied by learning rate. +* XLA Support: + * Add standalone XLA AOT runtime target + relevant .cc sources to pip package. + * Add check for memory alignment to MemoryAllocation::MemoryAllocation() on 32-bit ARM. This ensures a deterministic early exit instead of a hard to debug bus error later. + * `saved_model_cli aot_compile_cpu` allows you to compile saved models to XLA header+object files and include them in your C++ programs. + * Enable `Igamma`, `Igammac` for XLA. +* Deterministic Op Functionality: + * XLA reduction emitter is deterministic when the environment variable `TF_DETERMINISTIC_OPS` is set to "true" or "1". This extends deterministic `tf.nn.bias_add` back-prop functionality (and therefore also deterministic back-prop of bias-addition in Keras layers) to include when XLA JIT complilation is enabled. + * Fix problem, when running on a CUDA GPU and when either environment variable `TF_DETERMINSTIC_OPS` or environment variable `TF_CUDNN_DETERMINISTIC` is set to "true" or "1", in which some layer configurations led to an exception with the message "No algorithm worked!" +* Tracing and Debugging: + * Add source, destination name to `_send` traceme to allow easier debugging. + * Add traceme event to `fastpathexecute`. +* Other: + * Fix an issue with AUC.reset_states for multi-label AUC [#35852](https://github.com/tensorflow/tensorflow/issues/35852) + * Fix the TF upgrade script to not delete files when there is a parsing error and the output mode is `in-place`. + * Move `tensorflow/core:framework/*_pyclif` rules to `tensorflow/core/framework:*_pyclif`. 
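As a concrete illustration of the `Model.fit` customization described under `tf.keras` above, the following is a minimal sketch, not taken from the release notes; the model, loss, and data are placeholders. It overrides `Model.train_step` while still letting `Model.fit` drive callbacks, distribution, and looping, and it passes the new `experimental_aggregate_gradients` argument explicitly (`True` is the default behavior):

    import tensorflow as tf

    class CustomModel(tf.keras.Model):

      def train_step(self, data):
        x, y = data
        with tf.GradientTape() as tape:
          y_pred = self(x, training=True)
          loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
        grads = tape.gradient(loss, self.trainable_variables)
        # Keep the built-in cross-replica gradient aggregation (the default);
        # pass False here to aggregate gradients yourself.
        self.optimizer.apply_gradients(
            zip(grads, self.trainable_variables),
            experimental_aggregate_gradients=True)
        self.compiled_metrics.update_state(y, y_pred)
        return {m.name: m.result() for m in self.metrics}

    inputs = tf.keras.Input(shape=(3,))
    outputs = tf.keras.layers.Dense(1)(inputs)
    model = CustomModel(inputs, outputs)
    model.compile(optimizer="sgd", loss="mse", metrics=["mae"])
    model.fit(tf.random.normal((32, 3)), tf.random.normal((32, 1)), epochs=1)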
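The XLA bullet above ("compile or throw exception" semantics) corresponds to the `experimental_compile` argument of `tf.function`. A tiny sketch, with a placeholder function:

    import tensorflow as tf

    @tf.function(experimental_compile=True)  # XLA-compile this function or raise
    def relu_square(x):
      return tf.math.square(tf.nn.relu(x))

    print(relu_square(tf.constant([-1.0, 2.0, 3.0])))  # [0. 4. 9.]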
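For the shape-invariant support mentioned under `tf.function`/AutoGraph above, a minimal sketch (mirroring the pattern in the `set_loop_options` API docs; the function itself is a made-up example). The annotation lets a loop-carried tensor grow along its first dimension inside a converted loop:

    import tensorflow as tf

    @tf.function
    def append_zeros(n):
      v = tf.constant([0])
      for _ in tf.range(n):
        tf.autograph.experimental.set_loop_options(
            shape_invariants=[(v, tf.TensorShape([None]))])
        v = tf.concat([v, [0]], axis=0)
      return v

    print(append_zeros(tf.constant(3)))  # tf.Tensor([0 0 0 0], shape=(4,), dtype=int32)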
+ +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +372046933, 8bitmp3, aaronhma, Abin Shahab, Aditya Patwardhan, Agoniii, Ahti Kitsik, Alan Yee, Albin Joy, Alex Hoffman, Alexander Grund, Alexandre E. Eichenberger, Amit Kumar Jaiswal, amoitra, Andrew Anderson, Angus-Luo, Anthony Barbier, Anton Kachatkou, Anuj Rawat, archis, Arpan-Dhatt, Arvind Sundararajan, Ashutosh Hathidara, autoih, Bairen Yi, Balint Cristian, Bas Aarts, BashirSbaiti, Basit Ayantunde, Ben Barsdell, Benjamin Gaillard, boron, Brett Koonce, Bryan Cutler, Christian Goll, Christian Sachs, Clayne Robison, comet, Daniel Falbel, Daria Zhuravleva, darsh8200, David Truby, Dayananda-V, deepakm, Denis Khalikov, Devansh Singh, Dheeraj R Reddy, Diederik Van Liere, Diego Caballero, Dominic Jack, dothinking, Douman, Drake Gens, Duncan Riach, Ehsan Toosi, ekuznetsov139, Elena Zhelezina, elzino, Ending2015a, Eric Schweitz, Erik Zettel, Ethan Saadia, Eugene Kuznetsov, Evgeniy Zheltonozhskiy, Ewout Ter Hoeven, exfalso, FAIJUL, Fangjun Kuang, Fei Hu, Frank Laub, Frederic Bastien, Fredrik Knutsson, frreiss, Frédéric Rechtenstein, fsx950223, Gaurav Singh, gbaned, George Grzegorz Pawelczak, George Sterpu, Gian Marco Iodice, Giorgio Arena, Hans Gaiser, Hans Pabst, Haoyu Wu, Harry Slatyer, hsahovic, Hugo, Hugo Sjöberg, IrinaM21, jacco, Jake Tae, Jean-Denis Lesage, Jean-Michel Gorius, Jeff Daily, Jens Elofsson, Jerry Shih, jerryyin, Jin Mingjian, Jinjing Zhou, JKIsaacLee, jojimonv, Jonathan Dekhtiar, Jose Ignacio Gomez, Joseph-Rance, Judd, Julian Gross, Kaixi Hou, Kaustubh Maske Patil, Keunwoo Choi, Kevin Hanselman, Khor Chean Wei, Kilaru Yasaswi Sri Chandra Gandhi, Koan-Sin Tan, Koki Ibukuro, Kristian Holsheimer, kurileo, Lakshay Tokas, Lee Netherton, leike666666, Leslie-Fang-Intel, Li, Guizi, LIUJIAN435, Lukas Geiger, Lyo Nguyen, madisetti, Maher Jendoubi, Mahmoud Abuzaina, Manuel Freiberger, Marcel Koester, Marco Jacopo Ferrarotti, Markus Franke, marload, Mbah-Javis, mbhuiyan, Meng Zhang, Michael Liao, MichaelKonobeev, Michal Tarnowski, Milan Straka, minoring, Mohamed Nour Abouelseoud, MoussaMM, Mrinal Jain, mrTsjolder, Måns Nilsson, Namrata Bhave, Nicholas Gao, Niels Ole Salscheider, nikochiko, Niranjan Hasabnis, Nishidha Panpaliya, nmostafa, Noah Trenaman, nuka137, Officium, Owen L - Sfe, Pallavi G, Paul Andrey, Peng Sun, Peng Wu, Phil Pearl, PhilipMay, pingsutw, Pooya Davoodi, PragmaTwice, pshiko, Qwerty71, R Gomathi, Rahul Huilgol, Richard Xiao, Rick Wierenga, Roberto Rosmaninho, ruchit2801, Rushabh Vasani, Sami, Sana Damani, Sarvesh Dubey, Sasan Jafarnejad, Sergii Khomenko, Shane Smiskol, Shaochen Shi, sharkdtu, Shawn Presser, ShengYang1, Shreyash Patodia, Shyam Sundar Dhanabalan, Siju Samuel, Somyajit Chakraborty Sam, Srihari Humbarwadi, srinivasan.narayanamoorthy, Srishti Yadav, Steph-En-M, Stephan Uphoff, Stephen Mugisha, SumanSudhir, Taehun Kim, Tamas Bela Feher, TengLu, Tetragramm, Thierry Herrmann, Tian Jin, tigertang, Tom Carchrae, Tom Forbes, Trent Lo, Victor Peng, vijayphoenix, Vincent Abriou, Vishal Bhola, Vishnuvardhan Janapati, vladbataev, VoVAllen, Wallyss Lima, Wen-Heng (Jack) Chung, wenxizhu, William D. 
Irons, William Zhang, Xiaoming (Jason) Cui, Xiaoquan Kong, Xinan Jiang, Yasir Modak, Yasuhiro Matsumoto, Yaxun (Sam) Liu, Yong Tang, Ytyt-Yt, yuan, Yuan Mingshuai, Yuan Tang, Yuki Ueda, Yusup, zhangshijin, zhuwenxi + # Release 2.0.1 ## Bug Fixes and Other Changes diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index d9229e00306..61d6656ec80 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -58,7 +58,7 @@ load( # not contain rc or alpha, only numbers. # Also update tensorflow/core/public/version.h # and tensorflow/tools/pip_package/setup.py -VERSION = "2.1.0" +VERSION = "2.2.0" VERSION_MAJOR = VERSION.split(".")[0] # Sanitize a dependency so that it works correctly from code that includes diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 622480102a9..f61e00c01d5 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -50,7 +50,7 @@ DOCLINES = __doc__.split('\n') # result for pip. # Also update tensorflow/tensorflow.bzl and # tensorflow/core/public/version.h -_VERSION = '2.1.0' +_VERSION = '2.2.0' REQUIRED_PACKAGES = [ 'absl-py >= 0.7.0', @@ -65,7 +65,7 @@ REQUIRED_PACKAGES = [ 'opt_einsum >= 2.3.2', 'protobuf >= 3.9.2', 'tensorboard >= 2.2.0, < 2.3.0', - 'tensorflow_estimator >= 2.1.0, < 2.2.0', + 'tensorflow_estimator >= 2.2.0, < 2.3.0', 'termcolor >= 1.1.0', 'wrapt >= 1.11.1', # python3 requires wheel 0.26 From ddd57e10ac2ddb8695ced57f58c3be5aa66c80cd Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Fri, 8 May 2020 11:43:47 -0700 Subject: [PATCH 0208/1533] Create Ubuntu 16 scripts and configs to resurrect libtensorflow releases. PiperOrigin-RevId: 310597955 Change-Id: Iab4a87796e5bd143f3a7ea790a4ee2322c6bac87 --- tensorflow/opensource_only.files | 2 ++ .../ubuntu_16/libtensorflow/cpu/build.sh | 29 ++++++++++++++++++ .../ubuntu_16/libtensorflow/gpu/build.sh | 30 +++++++++++++++++++ 3 files changed, 61 insertions(+) create mode 100644 tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/cpu/build.sh create mode 100644 tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/gpu/build.sh diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index d4df3df079e..9ca7bb4fe28 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -343,6 +343,8 @@ tensorflow/tools/ci_build/release/ubuntu_16/gpu_py37_full/pip_v1.sh tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nightly_release.sh tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/nonpip.sh tensorflow/tools/ci_build/release/ubuntu_16/gpu_py38_full/pip.sh +tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/cpu/build.sh +tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/gpu/build.sh tensorflow/tools/ci_build/release/ubuntu_16/sanity/build.sh tensorflow/tools/ci_build/release/ubuntu_16/tpu_py37_full/nonpip.sh tensorflow/tools/ci_build/release/windows/cpu_libtensorflow/nightly.bat diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/cpu/build.sh b/tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/cpu/build.sh new file mode 100644 index 00000000000..abb85c18711 --- /dev/null +++ b/tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/cpu/build.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +set -e + +# Source the external common scripts. +source tensorflow/tools/ci_build/release/common.sh + + +# Install latest bazel +install_bazelisk +which bazel + +# Install realpath +sudo apt-get install realpath + +./tensorflow/tools/ci_build/linux/libtensorflow.sh diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/gpu/build.sh b/tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/gpu/build.sh new file mode 100644 index 00000000000..c399ed2680f --- /dev/null +++ b/tensorflow/tools/ci_build/release/ubuntu_16/libtensorflow/gpu/build.sh @@ -0,0 +1,30 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +set -e + +# Source the external common scripts. +source tensorflow/tools/ci_build/release/common.sh + + +# Install latest bazel +install_bazelisk +which bazel + +# Install realpath +sudo apt-get install realpath + +export TF_NEED_CUDA=1 + +./tensorflow/tools/ci_build/linux/libtensorflow.sh From 6ef6c8d5e5c1661295d9e64c1ecf16d688ebb256 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 11:44:28 -0700 Subject: [PATCH 0209/1533] Change the logging from warning to error when there is an exception at session close. PiperOrigin-RevId: 310598070 Change-Id: Id50e116527814ed945aaeb575d811d362afe502a --- tensorflow/python/training/monitored_session.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py index d77278e98f4..ab63f4237da 100644 --- a/tensorflow/python/training/monitored_session.py +++ b/tensorflow/python/training/monitored_session.py @@ -1189,7 +1189,7 @@ class _WrappedSession(object): try: self._sess.close() except _PREEMPTION_ERRORS as e: - logging.warning( + logging.error( 'An error occurred when attempting to close the ' 'session. This may be due to a preemption in a ' 'connected worker or parameter server. Error: %s', e) From 825b22ac19ada8969502587d3d7b7a3ab17d87fb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 11:46:19 -0700 Subject: [PATCH 0210/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 310598397 Change-Id: If465adb72db9054bb2a3e43e2d58422070d0e90c --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 0a3199d1bd15793555a2e1c9bc7653b40f698799 Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Fri, 8 May 2020 12:03:15 -0700 Subject: [PATCH 0211/1533] Add a comment to note that the structs are "semi-ABI stable" in TFL common.h. PiperOrigin-RevId: 310601795 Change-Id: I8922a1e1f66e194429111758c6e50f50da6d2f69 --- tensorflow/lite/c/common.h | 3 +++ tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index 12ddf9945fd..9657c7e564c 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -29,6 +29,9 @@ limitations under the License. // TfLiteDelegate - allows delegation of nodes to alternative backends. // // Some abstractions in this file are created and managed by Interpreter. +// +// NOTE: The order of values in these structs are "semi-ABI stable". New values +// should be added only to the end of structs and never reordered. #ifndef TENSORFLOW_LITE_C_COMMON_H_ #define TENSORFLOW_LITE_C_COMMON_H_ diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h index 12ddf9945fd..9657c7e564c 100644 --- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h +++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h @@ -29,6 +29,9 @@ limitations under the License. // TfLiteDelegate - allows delegation of nodes to alternative backends. // // Some abstractions in this file are created and managed by Interpreter. +// +// NOTE: The order of values in these structs are "semi-ABI stable". New values +// should be added only to the end of structs and never reordered. #ifndef TENSORFLOW_LITE_C_COMMON_H_ #define TENSORFLOW_LITE_C_COMMON_H_ From 497c869b070b33ee082f9b92040f0403dce2ccd7 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 7 May 2020 08:31:14 -0700 Subject: [PATCH 0212/1533] Fix the copy bug. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 7 +++++-- .../compiler/xla/service/gpu/kernel_mapping_scheme.h | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index ec5f10bd2e8..7084736ac3c 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2016,7 +2016,9 @@ void IrEmitterUnnested::EmitTile( // True iff all threads always execute all instructions in the tiling // dimension X. 
- bool x_tile_fits = mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0; + bool x_tile_fits = + mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0 && + mapping_scheme.GetRowContiguous(); // The outer loop below is simply doing: // @@ -2731,7 +2733,8 @@ void IrEmitterUnnested::EmitHlo021Tile( /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, /*indexing_order=*/kLinearIndexingX, - /*vector_size=*/1); + /*vector_size=*/1, + /*row_contiguous=*/false); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 5e15d0767a1..d9f80172bcb 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -90,13 +90,14 @@ class KernelMappingScheme { KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, int64 num_threads_x, IndexingOrder indexing_order, - int vector_size) + int vector_size, bool row_contiguous = false) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), indexing_order_(indexing_order), - vector_size_(vector_size) { + vector_size_(vector_size), + row_contiguous_(row_contiguous) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); @@ -134,6 +135,7 @@ class KernelMappingScheme { IndexingOrder GetIndexingOrder() const { return indexing_order_; } int GetVectorSize() const { return vector_size_; } + bool GetRowContiguous() const {return row_contiguous_; } private: // The number of elements in each dimension. @@ -159,6 +161,7 @@ class KernelMappingScheme { // to trigger vectorized loads on GPUs while keeping memory // coalescing. const int vector_size_; + const bool row_contiguous_; }; // Information to support the code generation for a tiled reduction kernel. From 696f2a8bd7dc0b80d94382c4cb42aa4ccdc2527c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 12:46:14 -0700 Subject: [PATCH 0213/1533] Branch LandmarksToTransformMatrix operation to LandmarksToTransformMatrixV2. 
PiperOrigin-RevId: 310610035 Change-Id: I5a284b3b539ddff85ba15c6c08967cab600d3dc8 --- .../delegates/gpu/common/model_builder.cc | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index c536e09d6b5..46856a70a7c 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2485,6 +2485,37 @@ class Landmarks2TransformMatrixOperationParser : public TFLiteOperationParser { } }; +class Landmarks2TransformMatrixV2OperationParser + : public TFLiteOperationParser { + public: + absl::Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + return CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, + /*outputs=*/1); + } + + absl::Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, + GraphFloat32* graph, ObjectReader* reader) final { + Node* node = graph->NewNode(); + RETURN_IF_ERROR(reader->AddInput(node, 0)); // landmarks + RETURN_IF_ERROR(reader->AddOutputs(node)); // transform matrix + + const std::string op_name = "landmarks_to_transform_matrix_v2"; + node->operation.type = op_name; + BHWC output_shape; + RETURN_IF_ERROR( + ParseCustomAttributes(op_name, tflite_node->custom_initial_data, + tflite_node->custom_initial_data_size, + &(node->operation.attributes), &output_shape)); + + auto output_value = graph->FindOutputs(node->id)[0]; + output_value->tensor.shape = output_shape; + return absl::OkStatus(); + } +}; + class AlignmentPointsToTransformMatrixOperationParser : public TFLiteOperationParser { public: @@ -2712,6 +2743,9 @@ std::unique_ptr NewOperationParser( if (custom_name == "Landmarks2TransformMatrix") { return std::make_unique(); } + if (custom_name == "Landmarks2TransformMatrixV2") { + return std::make_unique(); + } if (custom_name == "AlignmentPointsToTransformMatrix") { return std::make_unique< AlignmentPointsToTransformMatrixOperationParser>(); From 7e6ea2114888f5b23070ec50622f4d46faa72e4c Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Fri, 8 May 2020 12:47:17 -0700 Subject: [PATCH 0214/1533] Support running a function with packed input handles through C APIs. Introduce a C API TFE_CreatePackedTensorHandle which creates a TFE_TensorHandle referring to multiple TFE_TensorHandles. 
PiperOrigin-RevId: 310610230 Change-Id: Icc0ffd5c58ad7780eca38d552c1a2f4617f04891 --- tensorflow/c/eager/c_api_experimental.cc | 19 ++ tensorflow/c/eager/c_api_experimental.h | 8 + tensorflow/c/eager/c_api_remote_test.cc | 186 ++++++++++++++++++ tensorflow/c/eager/c_api_test.cc | 49 +---- tensorflow/c/eager/c_api_test_util.cc | 51 +++++ tensorflow/c/eager/c_api_test_util.h | 5 + tensorflow/core/common_runtime/eager/BUILD | 21 ++ .../core/common_runtime/eager/context.cc | 15 +- .../core/common_runtime/eager/context.h | 3 + .../core/common_runtime/eager/context_test.cc | 10 + .../core/common_runtime/eager/execute.cc | 9 + .../core/common_runtime/eager/execute_node.cc | 94 +++++++-- .../core/common_runtime/eager/execute_node.h | 15 ++ .../common_runtime/eager/execute_node_test.cc | 126 ++++++++++++ .../common_runtime/eager/kernel_and_device.cc | 5 +- .../common_runtime/eager/kernel_and_device.h | 7 +- .../common_runtime/eager/tensor_handle.cc | 15 +- .../core/common_runtime/eager/tensor_handle.h | 4 + .../eager/tensor_handle_test.cc | 3 +- .../replicate_per_replica_nodes.cc | 3 + .../eager/eager_service_impl_test.cc | 8 +- 21 files changed, 586 insertions(+), 70 deletions(-) create mode 100644 tensorflow/core/common_runtime/eager/execute_node_test.cc diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index 820650e315f..dd9e5e111d9 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/core/common_runtime/composite_device.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/lib/monitoring/counter.h" @@ -638,3 +639,21 @@ TFE_TensorHandle* TFE_NewTensorHandleFromTensor(TFE_Context* ctx, TF_Tensor* t, return tensorflow::wrap( tensorflow::unwrap(ctx)->CreateLocalHandle(t->tensor)); } + +TFE_TensorHandle* TFE_CreatePackedTensorHandle(TFE_Context* ctx, + TFE_TensorHandle** handles, + int* num_handles, + TF_Status* status) { + std::vector tensor_handles; + tensor_handles.reserve(*num_handles); + for (int i = 0; i < *num_handles; ++i) { + tensor_handles.push_back( + tensorflow::TensorHandleFromInterface(tensorflow::unwrap(handles[i]))); + } + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + tensorflow::TensorHandle* handle = nullptr; + status->status = tensorflow::TensorHandle::CreatePackedHandle( + std::move(tensor_handles), context, &handle); + return tensorflow::wrap(handle); +} diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 33adce40da0..584f7222111 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -541,6 +541,14 @@ TF_CAPI_EXPORT extern TF_Tensor* TFE_AllocateHostTensor(TFE_Context* ctx, TF_CAPI_EXPORT TFE_TensorHandle* TFE_NewTensorHandleFromTensor( TFE_Context* ctx, TF_Tensor* t, TF_Status* status); +// Create a packed TensorHandle with the given list of TensorHandles. +// If `handles` are on the same device, assign the same device to the packed +// handle; if `handles` are on different deivces, assign a CompositeDevice to +// it. 
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_CreatePackedTensorHandle( + TFE_Context* ctx, TFE_TensorHandle** handles, int* num_handles, + TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 0f988b1456d..12c63675c87 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -351,6 +351,192 @@ TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncOrdering) { /*heavy_load_on_streaming_rpc=*/true); } +// Add the values of three variables on three different tasks. +string AddVariablesFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'AddVariablesFunction'" + " input_arg {" + " name: 'var'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'sum'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:0/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'read1'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:1/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'read2'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:2/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add1'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read1:value:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add2'" + " op: 'Add'" + " input: 'add1:z:0'" + " input: 'read2:value:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'sum'" + " value: 'add2:z:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +TEST(CAPI, TestFunctionWithPackedInput) { + tensorflow::ServerDef server_def = GetServerDef(3); + + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(/*enable=*/true)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + const char task0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + // Create one variable per task. 
+ TFE_TensorHandle* h0 = TestVariable(ctx, 1.0, task0_name); + TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task1_name); + TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task2_name); + + // Pack 3 variable handles into one TFE_TensorHandle. + int num_replicas = 3; + std::vector handles = {h0, h1, h2}; + TFE_TensorHandle* packed_handle = + TFE_CreatePackedTensorHandle(ctx, handles.data(), &num_replicas, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + EXPECT_EQ(TFE_TensorHandleDataType(packed_handle), TF_RESOURCE); + EXPECT_EQ(TFE_TensorHandleNumDims(packed_handle, status), 0); + EXPECT_EQ(TFE_TensorHandleNumElements(packed_handle, status), 1); + + const string composite_device_name = + "/job:localhost/replica:0/task:0/device:COMPOSITE:0"; + EXPECT_EQ(TFE_TensorHandleDeviceName(packed_handle, status), + composite_device_name); + EXPECT_EQ(TFE_TensorHandleBackingDeviceName(packed_handle, status), + composite_device_name); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // Register and run a function which returns the sum of 3 variables. + const string function_def = AddVariablesFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "AddVariablesFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, packed_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(packed_handle); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + EXPECT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(sum, 6.0); + + TFE_DeleteTensorHandle(h0); + TFE_DeleteTensorHandle(h1); + TFE_DeleteTensorHandle(h2); + + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteExecutor(executor); + TFE_ContextRemoveFunction(ctx, "AddVariablesFunction", status); + TFE_DeleteContext(ctx); + + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 3160cb0e585..548bf1337bb 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -1132,51 +1132,6 @@ void BM_ExecuteFunction(int iters, int async) { } BENCHMARK(BM_ExecuteFunction)->Arg(0)->Arg(1); -TFE_TensorHandle* CreateVariable(TFE_Context* ctx, float value, - TF_Status* status) { - // Create the variable handle. 
- TFE_Op* op = TFE_NewOp(ctx, "VarHandleOp", status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op, "dtype", TF_FLOAT); - TFE_OpSetAttrShape(op, "shape", {}, 0, status); - TFE_OpSetAttrString(op, "container", "", 0); - TFE_OpSetAttrString(op, "shared_name", "", 0); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_TensorHandle* var_handle = nullptr; - int num_retvals = 1; - TFE_Execute(op, &var_handle, &num_retvals, status); - TFE_DeleteOp(op); - if (TF_GetCode(status) != TF_OK) return nullptr; - CHECK_EQ(1, num_retvals); - - // Assign 'value' to it. - op = TFE_NewOp(ctx, "AssignVariableOp", status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op, "dtype", TF_FLOAT); - TFE_OpAddInput(op, var_handle, status); - - // Convert 'value' to a TF_Tensor then a TFE_TensorHandle. - std::unique_ptr t( - TF_AllocateTensor(TF_FLOAT, nullptr, 0, sizeof(value)), TF_DeleteTensor); - memcpy(TF_TensorData(t.get()), &value, TF_TensorByteSize(t.get())); - - std::unique_ptr - value_handle(TFE_NewTensorHandle(t.get(), status), - TFE_DeleteTensorHandle); - if (TF_GetCode(status) != TF_OK) return nullptr; - - TFE_OpAddInput(op, value_handle.get(), status); - if (TF_GetCode(status) != TF_OK) return nullptr; - - num_retvals = 0; - TFE_Execute(op, nullptr, &num_retvals, status); - TFE_DeleteOp(op); - if (TF_GetCode(status) != TF_OK) return nullptr; - CHECK_EQ(0, num_retvals); - - return var_handle; -} - TEST(CAPI, Variables) { // Variables use resource handles, so this is really a test for resource // tensor handling. @@ -1186,7 +1141,7 @@ TEST(CAPI, Variables) { ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); - TFE_TensorHandle* var_handle = CreateVariable(ctx, 12.0, status); + TFE_TensorHandle* var_handle = TestVariable(ctx, 12.0); ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); @@ -1227,7 +1182,7 @@ void BM_ReadVariable(int iters) { CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_DeleteContextOptions(opts); - TFE_TensorHandle* var_handle = CreateVariable(ctx, 5.0, status); + TFE_TensorHandle* var_handle = TestVariable(ctx, 5.0); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index e67e17963b3..bbdc4c8f410 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -133,6 +133,57 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2(TFE_Context* ctx) { return th; } +TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, + const tensorflow::string& device_name) { + TF_Status* status = TF_NewStatus(); + // Create the variable handle. + TFE_Op* op = TFE_NewOp(ctx, "VarHandleOp", status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpSetAttrShape(op, "shape", {}, 0, status); + TFE_OpSetAttrString(op, "container", "", 0); + TFE_OpSetAttrString(op, "shared_name", "", 0); + if (!device_name.empty()) { + TFE_OpSetDevice(op, device_name.c_str(), status); + } + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_TensorHandle* var_handle = nullptr; + int num_retvals = 1; + TFE_Execute(op, &var_handle, &num_retvals, status); + TFE_DeleteOp(op); + if (TF_GetCode(status) != TF_OK) return nullptr; + CHECK_EQ(1, num_retvals); + + // Assign 'value' to it. 
+ op = TFE_NewOp(ctx, "AssignVariableOp", status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpAddInput(op, var_handle, status); + + // Convert 'value' to a TF_Tensor then a TFE_TensorHandle. + std::unique_ptr t( + TF_AllocateTensor(TF_FLOAT, nullptr, 0, sizeof(value)), TF_DeleteTensor); + memcpy(TF_TensorData(t.get()), &value, TF_TensorByteSize(t.get())); + + std::unique_ptr + value_handle(TFE_NewTensorHandle(t.get(), status), + TFE_DeleteTensorHandle); + if (TF_GetCode(status) != TF_OK) return nullptr; + + TFE_OpAddInput(op, value_handle.get(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + + num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + TFE_DeleteOp(op); + if (TF_GetCode(status) != TF_OK) return nullptr; + CHECK_EQ(0, num_retvals); + + TF_DeleteStatus(status); + + return var_handle; +} + TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { TF_Status* status = TF_NewStatus(); diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index 11ae6d1181b..4c43f8d5833 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -42,6 +42,11 @@ TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2(TFE_Context* ctx); // Return a tensor handle containing a 3x2 matrix of floats TFE_TensorHandle* TestMatrixTensorHandle3X2(TFE_Context* ctx); +// Return a variable handle referring to a variable with the given initial value +// on the given device. +TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, + const tensorflow::string& device_name = ""); + // Return an add op multiplying `a` by `b`. TFE_Op* AddOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index 695342d5e7a..2b2313d91ff 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -305,6 +305,7 @@ tf_cuda_library( visibility = ["//tensorflow:internal"], deps = [ ":attr_builder", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:optional", @@ -369,6 +370,7 @@ cc_library( ":eager_operation", ":kernel_and_device", ":tensor_handle", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/strings", @@ -396,6 +398,24 @@ cc_library( }) + if_mkl([":mkl_eager_op_rewrite"]), ) +tf_cc_test( + name = "execute_node_test", + srcs = ["execute_node_test.cc"], + deps = [ + ":context", + ":core", + ":execute", + ":kernel_and_device", + ":tensor_handle", + "//tensorflow/core:core_cpu_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/memory", + ], +) + cc_library( name = "mkl_eager_op_rewrite", srcs = ["mkl_eager_op_rewrite.cc"], @@ -466,6 +486,7 @@ cc_library( ":eager_operation", ":kernel_and_device", ":tensor_handle", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/strings", diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index 4e5bc934c38..35780077aa8 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ 
b/tensorflow/core/common_runtime/eager/context.cc @@ -853,6 +853,18 @@ Status EagerContext::FindDeviceFromName(const char* device_name, return status; } +Status EagerContext::FindCompositeDeviceFromName( + const char* device_name, CompositeDevice** device) const { + tf_shared_lock l(composite_devices_mu_); + for (const auto& d : composite_devices_) { + if (d.second->name() == device_name) { + *device = d.second.get(); + return Status::OK(); + } + } + return errors::NotFound("Unknown composite device: ", device_name); +} + Status EagerContext::FindCustomDeviceFromName(const string& device_name, CustomDevice** dev) const { auto dev_it = custom_devices_.find(device_name); @@ -904,8 +916,7 @@ Status EagerContext::FindOrCreateCompositeDevice( composite_devices_.size(), &s); TF_RETURN_IF_ERROR(s); *composite_device = device.get(); - // TODO(b/145922293): Add the composite device to the device set of pflr in - // order to make placer recognize it. + pflr_->AddCompositeDevice(*composite_device); composite_devices_.emplace(hash_key, std::move(device)); return Status::OK(); } diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index aad318886a9..c5404773ba6 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -483,6 +483,9 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { Status FindDeviceFromName(const char* device_name, Device** device) const; + Status FindCompositeDeviceFromName(const char* device_name, + CompositeDevice** device) const; + Status FindCustomDeviceFromName(const string& device_name, CustomDevice** dev) const; diff --git a/tensorflow/core/common_runtime/eager/context_test.cc b/tensorflow/core/common_runtime/eager/context_test.cc index 9154a288a84..f83e3f0b45d 100644 --- a/tensorflow/core/common_runtime/eager/context_test.cc +++ b/tensorflow/core/common_runtime/eager/context_test.cc @@ -180,6 +180,10 @@ TEST_F(EagerContextTest, CompositeDevice) { &composite_device_0)); EXPECT_EQ(composite_device_0->name(), "/job:worker/replica:0/task:0/device:COMPOSITE:0"); + CompositeDevice* device = nullptr; + TF_EXPECT_OK(context()->FindCompositeDeviceFromName( + "/job:worker/replica:0/task:0/device:COMPOSITE:0", &device)); + EXPECT_EQ(device, composite_device_0); CompositeDevice* composite_device_1 = nullptr; TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, &composite_device_1)); @@ -190,6 +194,12 @@ TEST_F(EagerContextTest, CompositeDevice) { &composite_device_2)); EXPECT_EQ(composite_device_2->name(), "/job:worker/replica:0/task:0/device:COMPOSITE:1"); + TF_EXPECT_OK(context()->FindCompositeDeviceFromName( + "/job:worker/replica:0/task:0/device:COMPOSITE:1", &device)); + EXPECT_EQ(device, composite_device_2); + + EXPECT_TRUE(errors::IsNotFound(context()->FindCompositeDeviceFromName( + "/job:worker/replica:0/task:0/device:COMPOSITE:2", &device))); } } // namespace diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 95d85bfbcc9..35dd9990054 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -367,6 +367,7 @@ Status GetOrCreateKernelAndDevice( Fprint128 cache_key = op->MutableAttrs()->CacheKey(op->DeviceName()); std::vector input_dev_ptrs; + absl::flat_hash_map*> composite_devices; std::unordered_map input_resource_variable_dtypes_and_shapes; // We can eliminate some overhead by running simple 
functions using regular @@ -410,6 +411,13 @@ Status GetOrCreateKernelAndDevice( Device* input_device; TF_RETURN_IF_ERROR(GetDeviceForInput(ctx, input, &input_device)); input_dev_ptrs.push_back(input_device); + CompositeDevice* composite_device = nullptr; + if (ctx.FindCompositeDeviceFromName(input_device->name().c_str(), + &composite_device) + .ok()) { + composite_devices[input_device->name()] = + composite_device->underlying_devices(); + } cache_key = FingerprintCat128(cache_key, Fingerprint128(input_device->name())); @@ -520,6 +528,7 @@ Status GetOrCreateKernelAndDevice( #endif // IS_MOBILE_PLATFORM kernel.reset(new KernelAndDeviceFunc( flr, ctx.pflr(), std::move(input_dev_ptrs), + std::move(composite_devices), std::move(input_resource_variable_dtypes_and_shapes), runner, ctx.GetCollectiveExecutorHandle(), ctx.HostCPU(), op->Name(), [&ctx](const int64 step_id) { return ctx.CreateRendezvous(step_id); }, diff --git a/tensorflow/core/common_runtime/eager/execute_node.cc b/tensorflow/core/common_runtime/eager/execute_node.cc index b7bebd4ba11..3197d3e0ac7 100644 --- a/tensorflow/core/common_runtime/eager/execute_node.cc +++ b/tensorflow/core/common_runtime/eager/execute_node.cc @@ -17,6 +17,51 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" namespace tensorflow { + +#if !defined(IS_MOBILE_PLATFORM) +bool ExecuteNodeArgs::IsRemote(EagerContext* ctx, Device* input_device, + TensorHandle* handle) { + uint64 context_view_id = ctx->GetContextViewId(); + if (handle->Type() == TensorHandle::REMOTE || + handle->HasRemoteMirror(input_device, context_view_id)) { + if (!has_remote_inputs_) { + has_remote_inputs_ = true; + } + return true; + } + return false; +} +#endif // IS_MOBILE_PLATFORM + +Status ExecuteNodeArgs::InitPackedHandle(const int index, EagerContext* ctx, + Device* input_device, + TensorHandle* packed_handle) { + int num_handles = packed_handle->NumPackedHandles(); + packed_args_.emplace(index, gtl::InlinedVector(num_handles)); + TensorValue* packed_arg_flat = &(packed_args_[index][0]); + for (int i = 0; i < num_handles; ++i) { + TensorHandle* h = nullptr; + TF_RETURN_IF_ERROR(packed_handle->ExtractPackedHandle(i, &h)); + // We have validated that h->device() is not a CustomDevice when + // constructing a pack TensorHandle. 
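+    // TensorValue() below fails when the sub-handle has no tensor on this
+    // host: remote sub-handles are skipped here (they are serialized for the
+    // remote device later), while nested packed handles are rejected.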
+ const Status status = + h->TensorValue(absl::get(h->device()), &packed_arg_flat[i]); + if (!status.ok()) { +#if !defined(IS_MOBILE_PLATFORM) + if (IsRemote(ctx, input_device, h)) { + continue; + } +#endif // IS_MOBILE_PLATFORM + if (h->Type() == TensorHandle::PACKED) { + return errors::InvalidArgument( + "Nested packed handles are not supported"); + } + return status; + } + } + return Status::OK(); +} + Status ExecuteNodeArgs::Init( EagerContext* ctx, const gtl::InlinedVector& op_inputs, const core::RefCountPtr& kernel) { @@ -35,16 +80,17 @@ Status ExecuteNodeArgs::Init( Status s = in->TensorValue(ctx->CanonicalDevice(d), &tensor_args_flat[i]); if (!s.ok()) { #if !defined(IS_MOBILE_PLATFORM) - uint64 context_view_id = ctx->GetContextViewId(); - if (in->Type() == TensorHandle::REMOTE || - in->HasRemoteMirror(d, context_view_id)) { - if (!has_remote_inputs_) { - has_remote_inputs_ = true; - } + if (IsRemote(ctx, d, in)) { continue; } #endif - return s; + if (in->Type() != TensorHandle::PACKED) { + return s; + } + if (!has_packed_inputs_) { + has_packed_inputs_ = true; + } + TF_RETURN_IF_ERROR(InitPackedHandle(i, ctx, d, in)); } } } @@ -54,24 +100,44 @@ Status ExecuteNodeArgs::Init( serialize_remote_handle_ = [ctx, &op_inputs](const FunctionArgIndex& index, eager::RemoteTensorHandle* handle) -> Status { - if (index.sub_index >= 0) { - return errors::InvalidArgument("Got unexpected sub_index ", - index.sub_index, " for argument ", - index.index); + TensorHandle* h = op_inputs[index.index]; + if (op_inputs[index.index]->Type() == TensorHandle::PACKED) { + TF_RETURN_IF_ERROR( + op_inputs[index.index]->ExtractPackedHandle(index.sub_index, &h)); } - VariantDevice variant_device = op_inputs[index.index]->device(); + VariantDevice variant_device = h->device(); if (VariantDeviceIsCustom(variant_device)) { return errors::Internal( "Custom devices and remote execution are currently not supported " "together."); } Device* device = absl::get(variant_device); - return ctx->RemoteMgr()->SerializeRemoteTensorHandle( - op_inputs[index.index], handle, device, device->name()); + return ctx->RemoteMgr()->SerializeRemoteTensorHandle(h, handle, device, + device->name()); }; } #endif // !IS_MOBILE_PLATFORM return Status::OK(); } +Status ExecuteNodeArgs::GetLocalArg(const FunctionArgIndex& index, + Tensor* val) const { + Status s = EagerKernelArgs::GetLocalArg(index, val); + if (s.ok()) { + return Status::OK(); + } + if (packed_args_.contains(index.index)) { + Tensor* arg = packed_args_.at(index.index).at(index.sub_index).tensor; + if (arg) { + *val = *arg; + return Status::OK(); + } else { + return errors::NotFound("Argument (", index.index, ",", index.sub_index, + ") has no local tensor."); + } + } else { + return s; + } +} + } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/execute_node.h b/tensorflow/core/common_runtime/eager/execute_node.h index d416f58bbcd..7924471066e 100644 --- a/tensorflow/core/common_runtime/eager/execute_node.h +++ b/tensorflow/core/common_runtime/eager/execute_node.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include #include +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/platform.h" // clang-format on @@ -54,6 +55,8 @@ class ExecuteNodeArgs : public EagerKernelArgs { const absl::InlinedVector& op_inputs, const core::RefCountPtr& kernel); + Status GetLocalArg(const FunctionArgIndex& index, Tensor* val) const override; + bool HasRemoteOrPackedInputs() const override { return has_remote_inputs_ || has_packed_inputs_; }; @@ -66,8 +69,20 @@ class ExecuteNodeArgs : public EagerKernelArgs { #endif // IS_MOBILE_PLATFORM private: +#if !defined(IS_MOBILE_PLATFORM) + // Returns whether `handle` is a remote handle or has a remote mirror on + // `input_device` + bool IsRemote(EagerContext* ctx, Device* input_device, TensorHandle* handle); +#endif // IS_MOBILE_PLATFORM + + // Initialize a packed TensorHandle which is the `index`-th argument. + Status InitPackedHandle(const int index, EagerContext* ctx, + Device* input_device, TensorHandle* packed_handle); + bool has_remote_inputs_ = false; bool has_packed_inputs_ = false; + // Maps from the index of a packed arg to a list of sub-args. + absl::flat_hash_map> packed_args_; #if !defined(IS_MOBILE_PLATFORM) std::function serialize_remote_handle_; diff --git a/tensorflow/core/common_runtime/eager/execute_node_test.cc b/tensorflow/core/common_runtime/eager/execute_node_test.cc new file mode 100644 index 00000000000..970307de851 --- /dev/null +++ b/tensorflow/core/common_runtime/eager/execute_node_test.cc @@ -0,0 +1,126 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/common_runtime/eager/execute_node.h" + +#include "tensorflow/core/common_runtime/composite_device.h" +#include "tensorflow/core/common_runtime/device_mgr.h" +#include "tensorflow/core/common_runtime/eager/context.h" +#include "tensorflow/core/common_runtime/eager/kernel_and_device.h" +#include "tensorflow/core/common_runtime/eager/tensor_handle.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +class TestKernelAndDeviceFunc final : public KernelAndDeviceFunc { + public: + TestKernelAndDeviceFunc(std::vector input_devices, + Device* host_cpu_device) + : KernelAndDeviceFunc( + /*flr=*/nullptr, /*pflr=*/nullptr, /*input_devices=*/{}, + /*composite_devices=*/{}, /*input_resource_dtypes_and_shapes=*/{}, + /*runner=*/nullptr, /*collective_executor=*/nullptr, + host_cpu_device, /*name=*/"", + /*rendezvous_creator=*/nullptr, /*get_op_id=*/nullptr), + test_input_devices_(std::move(input_devices)) {} + + Device* InputDevice(int i) const override { return test_input_devices_[i]; } + + private: + std::vector test_input_devices_; +}; + +TEST(ExecuteNodeTest, ExecuteNodeArgs) { + StaticDeviceMgr device_mgr( + DeviceFactory::NewDevice("CPU", {}, "/job:localhost/replica:0/task:0")); + Device* device0 = device_mgr.ListDevices().at(0); + StaticDeviceMgr remote_device_mgr( + DeviceFactory::NewDevice("CPU", {}, "/job:localhost/replica:0/task:1")); + Device* device1 = remote_device_mgr.ListDevices().at(0); + + Status s; + std::unique_ptr composite_device = + CompositeDevice::MakeDevice({device0->name(), device1->name()}, + /*unique_device_id=*/0, &s); + TF_ASSERT_OK(s); + + auto ctx = new EagerContext( + SessionOptions(), + tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, + tensorflow::ContextMirroringPolicy::MIRRORING_NONE, false, false, + &device_mgr, false, nullptr, nullptr, nullptr); + + DataType dtype = DT_FLOAT; + Tensor t0(dtype, TensorShape({})); + // Create two local TensorHandles + t0.scalar()() = {1.0f}; + TensorHandle* h0 = + TensorHandle::CreateLocalHandle(std::move(t0), device0, device0, ctx); + Tensor t1(dtype, TensorShape({})); + t1.scalar()() = {2.0f}; + TensorHandle* h1 = + TensorHandle::CreateLocalHandle(std::move(t1), device0, device0, ctx); + // Create two remote TensorHandles + TensorHandle* h2 = TensorHandle::CreateLazyRemoteHandle( + /*op_id=*/1, /*output_num=*/0, dtype, device1, ctx); + TensorHandle* h3 = TensorHandle::CreateLazyRemoteHandle( + /*op_id=*/2, /*output_num=*/1, dtype, device1, ctx); + // Create a packed TensorHandle + TensorHandle* packed_h = nullptr; + TF_ASSERT_OK(TensorHandle::CreatePackedHandle({h1, h2}, ctx, &packed_h)); + + // LOCAL, PACKED, REMOTE + absl::InlinedVector inputs = {h0, packed_h, h3}; + + std::vector input_devices; + for (auto* h : inputs) { + input_devices.push_back(absl::get(h->DeviceOrHostCPU(*ctx))); + } + const core::RefCountPtr kernel( + new TestKernelAndDeviceFunc(std::move(input_devices), device0)); + + ExecuteNodeArgs args(inputs.size()); + TF_EXPECT_OK(args.Init(ctx, inputs, kernel)); + EXPECT_TRUE(args.HasRemoteOrPackedInputs()); + Tensor local0; + TF_EXPECT_OK(args.GetLocalArg(FunctionArgIndex(0), &local0)); + EXPECT_EQ(local0.flat().size(), 1); + EXPECT_EQ(local0.flat()(0), 1.0); + Tensor local1; + TF_EXPECT_OK(args.GetLocalArg(FunctionArgIndex(1, 0), &local1)); + 
EXPECT_EQ(local1.flat().size(), 1); + EXPECT_EQ(local1.flat()(0), 2.0); + eager::RemoteTensorHandle remote0; + TF_EXPECT_OK(args.GetRemoteArg(FunctionArgIndex(1, 1), &remote0)); + EXPECT_EQ(remote0.op_id(), 1); + EXPECT_EQ(remote0.output_num(), 0); + eager::RemoteTensorHandle remote1; + TF_EXPECT_OK(args.GetRemoteArg(FunctionArgIndex(2), &remote1)); + EXPECT_EQ(remote1.op_id(), 2); + EXPECT_EQ(remote1.output_num(), 1); + + h0->Unref(); + h1->Unref(); + h2->Unref(); + h3->Unref(); + packed_h->Unref(); + ctx->Unref(); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 98d71959e2d..3c586e8188a 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -158,6 +158,7 @@ Status KernelAndDeviceFunc::InstantiateFunc(const NodeDef& ndef, for (const Device* device : input_devices_) { options.input_devices.push_back(device->name()); } + options.composite_devices = composite_devices_; options.input_resource_dtypes_and_shapes = input_resource_dtypes_and_shapes_; const auto& it = ndef.attr().find("executor_type"); @@ -425,7 +426,9 @@ Device* KernelAndDeviceOp::InputDevice(int i) const { } Device* KernelAndDeviceFunc::InputDevice(int i) const { - if (input_dtypes_[i] == DT_RESOURCE) { + if ((input_dtypes_[i] == DT_RESOURCE) && + (composite_devices_.find(input_devices_[i]->name()) == + composite_devices_.end())) { return host_cpu_device_; } else { return input_devices_[i]; diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h index a740b898262..d2c54322513 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.h +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/core/platform/platform.h" // clang-format on +#include "absl/container/flat_hash_map.h" #include "absl/types/optional.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" @@ -241,7 +242,7 @@ class KernelAndDeviceOp final : public KernelAndDevice { // Represents a multi-device function. Functions can also be run using // various function-calling kernels including CallOp and PartitionedCallOp. // In such cases, KernelAndDeviceOp is used. -class KernelAndDeviceFunc final : public KernelAndDevice { +class KernelAndDeviceFunc : public KernelAndDevice { public: // `flr` can be nullptr. // `pflr` must not be nullptr. @@ -249,6 +250,7 @@ class KernelAndDeviceFunc final : public KernelAndDevice { KernelAndDeviceFunc( FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr, std::vector input_devices, + absl::flat_hash_map*> composite_devices, std::unordered_map input_resource_dtypes_and_shapes, std::function)>* runner, @@ -261,6 +263,7 @@ class KernelAndDeviceFunc final : public KernelAndDevice { pflr_(pflr), handle_(kInvalidHandle), input_devices_(std::move(input_devices)), + composite_devices_(std::move(composite_devices)), input_resource_dtypes_and_shapes_( std::move(input_resource_dtypes_and_shapes)), name_(name), @@ -320,6 +323,8 @@ class KernelAndDeviceFunc final : public KernelAndDevice { // CPU devices are not null. Resource handles' devices are actual backing // devices. std::vector input_devices_; + // Maps from a CompositeDevice name to a list of physical device names. 
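+  // Handed to the ProcessFunctionLibraryRuntime via InstantiateOptions when
+  // the function is instantiated, and consulted in InputDevice() so that a
+  // resource input placed on a composite device keeps that device instead
+  // of falling back to the host CPU.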
+ absl::flat_hash_map*> composite_devices_; std::unordered_map input_resource_dtypes_and_shapes_; diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index 1d7b4ea5d6c..eef46b691ce 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -124,6 +124,10 @@ string TensorHandle::PackedTensorHandleData::DebugString() const { return debug_str; } +int TensorHandle::PackedTensorHandleData::NumPackedHandles() const { + return handles_.size(); +} + Status TensorHandle::PackedTensorHandleData::ExtractPackedHandle( const int index, TensorHandle** handle) const { if (index < 0 || index >= handles_.size()) { @@ -185,6 +189,13 @@ Status TensorHandle::GetResourceAllowedDevices(std::vector* result) { return GetResourceHandleInfoImpl(get_resource_info); } +int TensorHandle::NumPackedHandles() const { + if (Type() != PACKED) { + return 0; + } + return absl::get(data_).NumPackedHandles(); +} + Status TensorHandle::ExtractPackedHandle(const int index, TensorHandle** handle) const { if (Type() != PACKED) { @@ -315,8 +326,8 @@ Status TensorHandle::CreatePackedHandle(std::vector&& handles, return errors::InvalidArgument( "CustomDevice is not supported for packing."); } else { - devices.push_back( - absl::get(handle->DeviceOrHostCPU(*ctx))->name()); + devices.push_back(handle->op_device() ? handle->op_device()->name() + : ctx->HostCPU()->name()); } } diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 7908f39d4b4..45e7a3815a8 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -231,6 +231,8 @@ class TensorHandle : public AbstractTensorHandleInterface, std::vector* result); Status GetResourceAllowedDevices(std::vector* result); + // Returns the number of packed handles. 0 if the handle type is not PACKED. + int NumPackedHandles() const; // It's called on a packed TensorHandle. Extract a handle with the given // index. Status ExtractPackedHandle(const int index, TensorHandle** handle) const; @@ -316,6 +318,8 @@ class TensorHandle : public AbstractTensorHandleInterface, void Poison(Status status); string DebugString() const; + // Number of packed handles. + int NumPackedHandles() const; // Extract a handle on the given index. 
Status ExtractPackedHandle(const int index, TensorHandle** handle) const; diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc index c823b6aa9b0..2bcde7dce5b 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc @@ -164,6 +164,7 @@ TEST_F(PackedTensorHandleTest, PackedHandle) { h2->Unref(); h3->Unref(); + EXPECT_EQ(packed_handle->NumPackedHandles(), 4); EXPECT_EQ(packed_handle->Type(), TensorHandle::PACKED); EXPECT_EQ(packed_handle->dtype, dtype); TensorShape packed_shape; @@ -185,7 +186,7 @@ TEST_F(PackedTensorHandleTest, PackedHandle) { const std::vector expected_handle_types = { TensorHandle::LOCAL, TensorHandle::LOCAL, TensorHandle::REMOTE, TensorHandle::REMOTE}; - for (int i = 0; i < 4; ++i) { + for (int i = 0; i < packed_handle->NumPackedHandles(); ++i) { TensorHandle* h = nullptr; TF_ASSERT_OK(packed_handle->ExtractPackedHandle(i, &h)); EXPECT_EQ(absl::get(h->device()), ListDevices().at(i)); diff --git a/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc b/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc index 3609a5e7e1f..cfbcde82ce2 100644 --- a/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc +++ b/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc @@ -195,6 +195,9 @@ Status ReplicatePerReplicaNodesInFunctionGraph( for (Node* n : graph->op_nodes()) { if (composite_device_names.find(n->assigned_device_name()) != composite_device_names.end()) { + // TODO(b/145922293): Validate that an _Arg node assigned to a + // CompositeDevice should have an attribute indicating that the _Arg node + // represents a packed input. composite_device_to_cluster_nodes[n->assigned_device_name()].push_back(n); } } diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 92e92f47356..9930bb86e6b 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -728,7 +728,9 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncTest) { core::RefCountPtr kernel = nullptr; const int64 op_id = 2; kernel.reset(new KernelAndDeviceFunc( - flr, eager_pflr_.get(), std::move(input_dev_ptrs), {}, /*runner=*/nullptr, + flr, eager_pflr_.get(), std::move(input_dev_ptrs), + /*composite_devices=*/{}, /*input_resource_dtypes_and_shapes=*/{}, + /*runner=*/nullptr, /*collective_executor=*/nullptr, local_device, fdef_.signature().name(), [ctx](const int64 step_id) { return ctx->CreateRendezvous(step_id); }, [=]() { return op_id; })); @@ -773,7 +775,9 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncAsyncTest) { core::RefCountPtr kernel = nullptr; const int64 op_id = 2; kernel.reset(new KernelAndDeviceFunc( - flr, eager_pflr_.get(), std::move(input_dev_ptrs), {}, /*runner=*/nullptr, + flr, eager_pflr_.get(), std::move(input_dev_ptrs), + /*composite_devices=*/{}, /*input_resource_dtypes_and_shapes=*/{}, + /*runner=*/nullptr, /*collective_executor=*/nullptr, local_device, fdef_.signature().name(), [ctx](const int64 step_id) { return ctx->CreateRendezvous(step_id); }, [=]() { return op_id; })); From 738a28685bc1a5714ee2ea40d431156f526c3e0b Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Fri, 8 May 2020 13:10:15 -0700 Subject: [PATCH 0215/1533] Enabling DNNL SGEMM and removing all code related to MKL 
matmuls. --- .../core/common_runtime/mkl_layout_pass.cc | 12 ++- tensorflow/core/kernels/mkl_matmul_op.cc | 82 ++----------------- tensorflow/core/ops/math_ops.cc | 2 +- 3 files changed, 18 insertions(+), 78 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 2941845a604..55355363106 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -499,7 +499,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CopyAttrsAll, LrnGradRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.matmul, mkl_op_registry::GetMklOpName(csinfo_.matmul), - CopyAttrsAll, AlwaysRewrite, kRewriteForOpNameChange}); + CopyAttrsAll, MatMulRewrite, kRewriteForOpNameChange}); rinfo_.push_back( {csinfo_.leakyrelu, mkl_op_registry::GetMklOpName(csinfo_.leakyrelu), CopyAttrsAll, LeakyReluRewrite, kRewriteForLayoutPropagation}); @@ -1473,6 +1473,16 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return false; } + static bool MatMulRewrite(const Node* n) { + DataType T; + GetNodeAttr(n->def(), "T", &T); + if ((T == DT_FLOAT) || (T == DT_BFLOAT16)) { + VLOG(2) << "Rewriting MatMul to _MklMatMul"; + return true; + } + return false; + } + static bool DequantizeRewrite(const Node* n) { DCHECK(n); Node* input = nullptr; diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc index 3a7c864d10e..83785af8910 100644 --- a/tensorflow/core/kernels/mkl_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_matmul_op.cc @@ -31,13 +31,7 @@ limitations under the License. #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/kernels/mkl_matmul_ops_common.h" #include "tensorflow/core/util/mkl_util.h" - -// This header file is part of MKL ML, need equivalent file in MKL DNN -#ifndef INTEL_MKL_DNN_ONLY -#include "mkl_cblas.h" -#endif - -#include "mkldnn.h" +#include "mkldnn.hpp" namespace tensorflow { @@ -157,21 +151,11 @@ class MklMatMulOp : public OpKernel { // 1.0 and 0.0 respectively. const float alpha = 1.0f; const float beta = 0.0f; -#if defined(INTEL_MKL_DNN_ONLY) - const char* const ftrans[] = {"N", "T", "C"}; - int index_transa = transa ? 1 : 0; - int index_transb = transb ? 1 : 0; - VLOG(2) << "MKL DNN SGEMM called"; - // MKL DNN only supports the Fortran api and requires column major while - // Tensorflow uses row major so we reverse the order A and B - mkldnn_sgemm(ftrans[index_transb], ftrans[index_transa], &n, &m, &k, &alpha, - b, &ldb, a, &lda, &beta, c, &ldc); -#else - // MKL ML binary uses CBLAS API - cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, - transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b, - ldb, beta, c, ldc); -#endif + char char_transa = transa ? 'T' : 'N'; + char char_transb = transb ? 'T' : 'N'; + VLOG(2) << "MKL DNN SGEMM CALLED"; + dnnl_sgemm(char_transa, char_transb, m, n, k, alpha, + a, lda, b, ldb, beta, c, ldc); } #ifdef ENABLE_INTEL_MKL_BFLOAT16 @@ -205,53 +189,6 @@ class MklMatMulOp : public OpKernel { FloatToBFloat16(c_float.flat().data(), c, c_float.NumElements()); } #endif // ENABLE_INTEL_MKL_BFLOAT16 - -// MKL-DNN only supports SGEMM and bfloat16-GEMM. -#ifndef INTEL_MKL_DNN_ONLY - - // Matrix-Matrix Multiplication with FP64 tensors. For detailed info about - // parameters, look at FP32 function description. 
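For reference, a minimal standalone sketch of how the dnnl_sgemm entry point used above can be called on its own. It assumes the oneDNN 1.x C API exposed through "mkldnn.hpp", and the matrix sizes and values are made up purely for illustration. Unlike the removed mkldnn_sgemm path, dnnl_sgemm accepts row-major matrices directly, so no operand reversal or CBLAS layout flag is needed.

    #include "mkldnn.hpp"  // pulls in the dnnl_sgemm declaration

    // Illustrative only: C (m x n) = 1.0 * A (m x k) * B (k x n) + 0.0 * C,
    // all buffers row-major, 'N' meaning "not transposed".
    void ExampleDnnlSgemm() {
      const int m = 2, n = 2, k = 3;
      float a[6] = {1, 2, 3, 4, 5, 6};  // 2 x 3
      float b[6] = {1, 0, 0, 1, 1, 1};  // 3 x 2
      float c[4] = {};                  // 2 x 2 result
      dnnl_sgemm('N', 'N', m, n, k, /*alpha=*/1.0f, a, /*lda=*/k, b,
                 /*ldb=*/n, /*beta=*/0.0f, c, /*ldc=*/n);
    }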
- void MklBlasGemm(OpKernelContext* ctx, bool transa, bool transb, const int m, - const int n, const int k, const double* a, const int lda, - const double* b, const int ldb, double* c, const int ldc) { - const double alpha = 1.0; - const double beta = 0.0; - cblas_dgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, - transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b, - ldb, beta, c, ldc); - } - - // Matrix-Matrix Multiplication with Complex64 (std::complex) tensors. - // For detailed info about parameters, look at FP32 function description. - void MklBlasGemm(OpKernelContext* ctx, bool transa, bool transb, const int m, - const int n, const int k, const complex64* a, const int lda, - const complex64* b, const int ldb, complex64* c, - int const ldc) { - const MKL_Complex8 alpha = {1.0f, 0.0f}; - const MKL_Complex8 beta = {0.0f, 0.0f}; - cblas_cgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, - transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb, &beta, - reinterpret_cast(c), ldc); - } - - // Matrix-Matrix Multiplication with Complex128 (std::complex) - // tensors. For detailed info about parameters, look at FP32 function - // description. - void MklBlasGemm(OpKernelContext* ctx, bool transa, bool transb, const int m, - const int n, const int k, const complex128* a, const int lda, - const complex128* b, const int ldb, complex128* c, - const int ldc) { - const MKL_Complex16 alpha = {1.0, 0.0}; - const MKL_Complex16 beta = {0.0, 0.0}; - cblas_zgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, - transb ? CblasTrans : CblasNoTrans, m, n, k, &alpha, - reinterpret_cast(a), lda, - reinterpret_cast(b), ldb, &beta, - reinterpret_cast(c), ldc); - } -#endif // !INTEL_MKL_DNN_ONLY }; #define REGISTER_CPU(T) \ @@ -269,13 +206,6 @@ TF_CALL_float(REGISTER_CPU); #ifdef ENABLE_INTEL_MKL_BFLOAT16 TF_CALL_bfloat16(REGISTER_CPU); #endif // ENABLE_INTEL_MKL_BFLOAT16 - -#ifndef INTEL_MKL_DNN_ONLY -TF_CALL_double(REGISTER_CPU); -TF_CALL_complex64(REGISTER_CPU); -TF_CALL_complex128(REGISTER_CPU); -#endif // !INTEL_MKL_DNN_ONLY #endif // ENABLE_MKL - } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 7ac003379d4..d00731f223a 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -936,7 +936,7 @@ REGISTER_OP("_MklMatMul") .Output("product: T") .Attr("transpose_a: bool = false") .Attr("transpose_b: bool = false") - .Attr("T: {bfloat16, float, double, complex64, complex128}") + .Attr("T: {bfloat16, float}") .SetShapeFn(shape_inference::MatMulShape); #endif // INTEL_MKL From 0a97e70d0cc51afd659364fe203f8c72eded6bb3 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 13:21:22 -0700 Subject: [PATCH 0216/1533] Use protobuf.h instead of proto_serialization.h in profiler PiperOrigin-RevId: 310616754 Change-Id: Ib1e5bc13a027530d167baf5c6369a5e582c25009 --- tensorflow/core/profiler/convert/xplane_to_tf_functions.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc b/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc index 15cc98df9fb..f768d3b7ae6 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc +++ b/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc @@ -21,8 +21,8 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" #include "tensorflow/core/lib/gtl/map_util.h" -#include "tensorflow/core/lib/strings/proto_serialization.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" @@ -243,9 +243,9 @@ class TfFunctionExecutions { } // namespace -std::string DebugString(const TfFunctionDb tf_function_db) { +std::string DebugString(const TfFunctionDb& tf_function_db) { std::string str; - ::tensorflow::protobuf::TextFormat::PrintToString(tf_function_db, &str); + protobuf::TextFormat::PrintToString(tf_function_db, &str); return str; } From 0c5b1b8ab20f18b5f7329cd9bec58ca419da063f Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 13:23:40 -0700 Subject: [PATCH 0217/1533] Replace lib/core/{errors,status}.h with platform/ headers in profiler PiperOrigin-RevId: 310617182 Change-Id: I7faa6f4020278117dd902bedb5f25c37016286ba --- .../core/profiler/convert/xplane_to_profile_response.cc | 2 +- tensorflow/core/profiler/internal/cpu/host_tracer.cc | 4 ++-- tensorflow/core/profiler/internal/cpu/python_tracer.cc | 4 ++-- tensorflow/core/profiler/internal/gpu/cupti_tracer.cc | 2 +- tensorflow/core/profiler/internal/gpu/cupti_tracer.h | 4 ++-- tensorflow/core/profiler/internal/gpu/device_tracer.cc | 2 +- tensorflow/core/profiler/internal/profiler_interface.h | 2 +- tensorflow/core/profiler/lib/profiler_session.cc | 2 +- tensorflow/core/profiler/lib/profiler_session.h | 2 +- tensorflow/core/profiler/rpc/client/capture_profile.cc | 4 ++-- tensorflow/core/profiler/rpc/client/capture_profile.h | 2 +- tensorflow/core/profiler/rpc/client/save_profile.cc | 2 +- tensorflow/core/profiler/rpc/client/save_profile.h | 2 +- tensorflow/core/profiler/rpc/profiler_service_impl.cc | 2 +- tensorflow/core/profiler/utils/tfstreamz_utils.cc | 1 - 15 files changed, 18 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc index b0259bb8865..74dd3435a4b 100644 --- a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc +++ b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" #include "absl/container/flat_hash_set.h" -#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/human_readable_json.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc index 753d8c53b9c..30b87c84fa2 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc @@ -17,9 +17,9 @@ limitations under the License. 
#include "absl/strings/str_split.h" #include "tensorflow/core/framework/step_stats.pb.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" diff --git a/tensorflow/core/profiler/internal/cpu/python_tracer.cc b/tensorflow/core/profiler/internal/cpu/python_tracer.cc index aa259f53cfa..103db6e0c71 100644 --- a/tensorflow/core/profiler/internal/cpu/python_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/python_tracer.cc @@ -17,10 +17,10 @@ limitations under the License. #include "absl/strings/str_split.h" #include "tensorflow/core/framework/step_stats.pb.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 3db6a8d029d..9119c3d5d0b 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -17,8 +17,8 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/node_hash_map.h" -#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mem.h" diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h index c6e0c50b093..e236afc5c41 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h @@ -21,9 +21,9 @@ limitations under the License. #include "absl/container/node_hash_set.h" #include "absl/types/optional.h" #include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/gpu/cupti_interface.h" diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 3fb502dcde2..ac6662c8432 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -27,9 +27,9 @@ limitations under the License. 
#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "tensorflow/core/framework/step_stats.pb.h" -#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/abi.h" #include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h index 2605e834f09..79dfc7af2b2 100644 --- a/tensorflow/core/profiler/internal/profiler_interface.h +++ b/tensorflow/core/profiler/internal/profiler_interface.h @@ -15,7 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_ #define TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_ -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/protobuf/config.pb.h" diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc index d7976b85351..497ee76b2af 100644 --- a/tensorflow/core/profiler/lib/profiler_session.cc +++ b/tensorflow/core/profiler/lib/profiler_session.cc @@ -16,8 +16,8 @@ limitations under the License. #include "tensorflow/core/profiler/lib/profiler_session.h" #include "absl/memory/memory.h" -#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/platform.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h index 1c20876d9d0..83d0683f740 100644 --- a/tensorflow/core/profiler/lib/profiler_session.h +++ b/tensorflow/core/profiler/lib/profiler_session.h @@ -18,8 +18,8 @@ limitations under the License. #include #include -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/profiler_options.pb.h" diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.cc b/tensorflow/core/profiler/rpc/client/capture_profile.cc index ebc74c9252c..0d42c4f7bcf 100644 --- a/tensorflow/core/profiler/rpc/client/capture_profile.cc +++ b/tensorflow/core/profiler/rpc/client/capture_profile.cc @@ -21,9 +21,9 @@ limitations under the License. 
#include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_split.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/profiler_analysis.grpc.pb.h" #include "tensorflow/core/profiler/rpc/client/save_profile.h" #include "tensorflow/core/util/events_writer.h" diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.h b/tensorflow/core/profiler/rpc/client/capture_profile.h index 404912ef716..1bde73f66f1 100644 --- a/tensorflow/core/profiler/rpc/client/capture_profile.h +++ b/tensorflow/core/profiler/rpc/client/capture_profile.h @@ -17,7 +17,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_ #define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_CAPTURE_PROFILE_H_ -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/profiler_service.grpc.pb.h" diff --git a/tensorflow/core/profiler/rpc/client/save_profile.cc b/tensorflow/core/profiler/rpc/client/save_profile.cc index ab2e494871c..47d69765da4 100644 --- a/tensorflow/core/profiler/rpc/client/save_profile.cc +++ b/tensorflow/core/profiler/rpc/client/save_profile.cc @@ -24,9 +24,9 @@ limitations under the License. #include "absl/strings/strip.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/io/compression.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/profiler/protobuf/trace_events.pb.h" // Windows.h #defines ERROR, but it is also used in diff --git a/tensorflow/core/profiler/rpc/client/save_profile.h b/tensorflow/core/profiler/rpc/client/save_profile.h index d9070f06c71..6d8bfba5e63 100644 --- a/tensorflow/core/profiler/rpc/client/save_profile.h +++ b/tensorflow/core/profiler/rpc/client/save_profile.h @@ -16,7 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_SAVE_PROFILE_H_ #define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_SAVE_PROFILE_H_ -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/profiler_service.pb.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc index 35aaa26bd1e..555f4c3366a 100644 --- a/tensorflow/core/profiler/rpc/profiler_service_impl.cc +++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc @@ -21,9 +21,9 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" -#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/lib/profiler_session.h" diff --git a/tensorflow/core/profiler/utils/tfstreamz_utils.cc b/tensorflow/core/profiler/utils/tfstreamz_utils.cc index 5fef494fc3b..493420fef96 100644 --- a/tensorflow/core/profiler/utils/tfstreamz_utils.cc +++ b/tensorflow/core/profiler/utils/tfstreamz_utils.cc @@ -21,7 +21,6 @@ limitations under the License. #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "absl/strings/substitute.h" -#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/monitoring/collected_metrics.h" #include "tensorflow/core/lib/monitoring/collection_registry.h" From d0b0e9c2043b22ed855b61095d48dc9c936ca44c Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Fri, 8 May 2020 13:32:39 -0700 Subject: [PATCH 0218/1533] enabling mkldnn threadpool build options --- .bazelrc | 5 + tensorflow/tensorflow.bzl | 8 +- tensorflow/workspace.bzl | 11 ++ third_party/mkl/build_defs.bzl | 1 + third_party/mkl_dnn/BUILD | 8 ++ third_party/mkl_dnn/build_defs.bzl | 16 +++ third_party/mkl_dnn/mkldnn_threadpool.BUILD | 133 ++++++++++++++++++++ 7 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 third_party/mkl_dnn/mkldnn_threadpool.BUILD diff --git a/.bazelrc b/.bazelrc index cf15d0976b1..2efdbad2e5f 100644 --- a/.bazelrc +++ b/.bazelrc @@ -143,6 +143,11 @@ build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt +build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true +build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 +build:mkl_threadpool --define=build_with_mkldnn_threadpool=true +build:mkl_threadpool -c opt + # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. build:using_cuda --define=using_cuda=true diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index d9229e00306..ed780092ce1 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -48,6 +48,7 @@ load( "//third_party/mkl_dnn:build_defs.bzl", "if_mkl_open_source_only", "if_mkl_v1_open_source_only", + "if_mkldnn_threadpool", ) load( "//third_party/ngraph:build_defs.bzl", @@ -327,6 +328,9 @@ def tf_copts( if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_mkl_v1_open_source_only(["-DENABLE_MKLDNN_V1"]) + + if_mkldnn_threadpool(["-DENABLE_MKLDNN_THREADPOOL"]) + + if_mkldnn_threadpool(["-DENABLE_MKLDNN_V1"]) + + if_mkldnn_threadpool(["-DINTEL_MKL_DNN_ONLY"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_ngraph(["-DINTEL_NGRAPH=1"]) + if_android_arm(["-mfpu=neon"]) + @@ -348,7 +352,9 @@ def tf_copts( ) def tf_openmp_copts(): - return if_mkl_lnx_x64(["-fopenmp"]) + # TODO(intel-mkl): Remove -fopenmp for threadpool after removing all + # omp pragmas in tensorflow/core. 
+ return if_mkl_lnx_x64(["-fopenmp"]) + if_mkldnn_threadpool(["-fopenmp"]) def tfe_xla_copts(): return select({ diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 56f36a7b004..ab895dd6a99 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -232,6 +232,17 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) + tf_http_archive( + name = "mkl_dnn_tp", + build_file = clean_dep("//third_party/mkl_dnn:mkldnn_threadpool.BUILD"), + sha256 = "54737bcb4dc1961d32ee75da3ecc529fa48198f8b2ca863a079e19a9c4adb70f", + strip_prefix = "oneDNN-1.4", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/oneapi-src/oneDNN/archive/v1.4.tar.gz", + "https://github.com/oneapi-src/oneDNN/archive/v1.4.tar.gz", + ], + ) + tf_http_archive( name = "com_google_absl", build_file = clean_dep("//third_party:com_google_absl.BUILD"), diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl index 4b8fb83eb09..f69d27dd094 100644 --- a/third_party/mkl/build_defs.bzl +++ b/third_party/mkl/build_defs.bzl @@ -107,6 +107,7 @@ def mkl_deps(): return select({ "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_only": ["@mkl_dnn"], "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_v1_only": ["@mkl_dnn_v1//:mkl_dnn"], + "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": ["@mkl_dnn_tp//:mkl_dnn"], "@org_tensorflow//third_party/mkl:build_with_mkl_ml_only": ["@org_tensorflow//third_party/mkl:intel_binary_blob"], "@org_tensorflow//third_party/mkl:build_with_mkl": [ "@org_tensorflow//third_party/mkl:intel_binary_blob", diff --git a/third_party/mkl_dnn/BUILD b/third_party/mkl_dnn/BUILD index 774e5b0e2c0..9e617e0055a 100644 --- a/third_party/mkl_dnn/BUILD +++ b/third_party/mkl_dnn/BUILD @@ -27,6 +27,14 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "build_with_mkldnn_threadpool", + define_values = { + "build_with_mkl": "true", + "build_with_mkldnn_threadpool": "true", + }, + visibility = ["//visibility:public"], +) bzl_library( name = "build_defs_bzl", srcs = ["build_defs.bzl"], diff --git a/third_party/mkl_dnn/build_defs.bzl b/third_party/mkl_dnn/build_defs.bzl index af05333c947..5778d136e9b 100644 --- a/third_party/mkl_dnn/build_defs.bzl +++ b/third_party/mkl_dnn/build_defs.bzl @@ -29,3 +29,19 @@ def if_mkl_v1_open_source_only(if_true, if_false = []): "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_v1_only": if_true, "//conditions:default": if_false, }) + +def if_mkldnn_threadpool(if_true, if_false = []): + """Returns `if_true` if MKL-DNN v1.x is used. + + Shorthand for select()'ing on whether we're building with + MKL-DNN v1.x open source library only, without depending on MKL binary form. + + Returns a select statement which evaluates to if_true if we're building + with MKL-DNN v1.x open source library only. Otherwise, the + select statement evaluates to if_false. 
+ + """ + return select({ + "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": if_true, + "//conditions:default": if_false, + }) diff --git a/third_party/mkl_dnn/mkldnn_threadpool.BUILD b/third_party/mkl_dnn/mkldnn_threadpool.BUILD new file mode 100644 index 00000000000..35175b7f90f --- /dev/null +++ b/third_party/mkl_dnn/mkldnn_threadpool.BUILD @@ -0,0 +1,133 @@ +exports_files(["LICENSE"]) + +load( + "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", + "if_mkl_open_source_only", + "if_mkldnn_threadpool", +) +load( + "@org_tensorflow//third_party:common.bzl", + "template_rule", +) + +config_setting( + name = "clang_linux_x86_64", + values = { + "cpu": "k8", + "define": "using_clang=true", + }, +) + +template_rule( + name = "dnnl_config_h", + src = "include/dnnl_config.h.in", + out = "include/dnnl_config.h", + substitutions = { + "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL", + "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL", + "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", + }, +) +# Create the file mkldnn_version.h with MKL-DNN version numbers. +# Currently, the version numbers are hard coded here. If MKL-DNN is upgraded then +# the version numbers have to be updated manually. The version numbers can be +# obtained from the PROJECT_VERSION settings in CMakeLists.txt. The variable is +# set to "version_major.version_minor.version_patch". The git hash version can +# be set to NA. +# TODO(agramesh1) Automatically get the version numbers from CMakeLists.txt. + +template_rule( + name = "dnnl_version_h", + src = "include/dnnl_version.h.in", + out = "include/dnnl_version.h", + substitutions = { + "@DNNL_VERSION_MAJOR@": "1", + "@DNNL_VERSION_MINOR@": "2", + "@DNNL_VERSION_PATCH@": "0", + "@DNNL_VERSION_HASH@": "N/A", + }, +) + +cc_library( + name = "mkl_dnn", + srcs = glob([ + "src/common/*.cpp", + "src/common/*.hpp", + "src/cpu/*.cpp", + "src/cpu/*.hpp", + "src/cpu/**/*.cpp", + "src/cpu/**/*.hpp", + "src/cpu/xbyak/*.h", + ]) + if_mkldnn_threadpool([ + ":dnnl_config_h", + ]) + [":dnnl_version_h"], + hdrs = glob(["include/*"]), + copts = [ + "-fexceptions", + "-DUSE_MKL", + "-DUSE_CBLAS", + ] + if_mkl_open_source_only([ + "-UUSE_MKL", + "-UUSE_CBLAS", + ]) + if_mkldnn_threadpool([ + "-UUSE_MKL", + "-UUSE_CBLAS", + ]) + select({ + "@org_tensorflow//tensorflow:linux_x86_64": ["-fopenmp-simd"], + # TODO(ibiryukov): enable openmp with clang by including libomp as a + # dependency. 
+ ":clang_linux_x86_64": [], + "//conditions:default": [], + }), + includes = [ + "include", + "src", + "src/common", + "src/cpu", + "src/cpu/gemm", + "src/cpu/xbyak", + ], + visibility = ["//visibility:public"], + deps = select({ + "@org_tensorflow//tensorflow:linux_x86_64": [ + "@mkl_linux//:mkl_headers", + "@mkl_linux//:mkl_libs_linux", + ], + "@org_tensorflow//tensorflow:macos": [ + "@mkl_darwin//:mkl_headers", + "@mkl_darwin//:mkl_libs_darwin", + ], + "@org_tensorflow//tensorflow:windows": [ + "@mkl_windows//:mkl_headers", + "@mkl_windows//:mkl_libs_windows", + ], + "//conditions:default": [], + }), +) + +cc_library( + name = "mkldnn_single_threaded", + srcs = glob([ + "src/common/*.cpp", + "src/common/*.hpp", + "src/cpu/*.cpp", + "src/cpu/*.hpp", + "src/cpu/**/*.cpp", + "src/cpu/**/*.hpp", + "src/cpu/xbyak/*.h", + ]) + [":dnnl_config_h"], + hdrs = glob(["include/*"]), + copts = [ + "-fexceptions", + "-DMKLDNN_THR=MKLDNN_THR_SEQ", # Disables threading. + ], + includes = [ + "include", + "src", + "src/common", + "src/cpu", + "src/cpu/gemm", + "src/cpu/xbyak", + ], + visibility = ["//visibility:public"], +) From f1ddb29ce910e714f2211ca9b14e47f0cf31f05e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 8 May 2020 14:03:59 -0700 Subject: [PATCH 0219/1533] Add a test --- .../compiler/xla/service/gpu/tests/BUILD | 14 +++++ .../service/gpu/tests/gpu_copy_alone_test.cc | 61 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index e04dba418d9..dae63be683f 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -235,6 +235,20 @@ tf_cc_test( ], ) +tf_cc_test( + name = "gpu_copy_alone_test", + srcs = [ + "gpu_copy_alone_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "gpu_ftz_test", srcs = ["gpu_ftz_test.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc new file mode 100644 index 00000000000..ac3111aaae1 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +namespace xla { +namespace gpu { + +namespace { + +// WARNING: This tests must be alone in its file! Otherwise, the +// error isn't caught. 
We expect and CUDA_ERROR_ILLEGAL_ADDRESS to be +// thrown with the old buggy code. +class CopyAloneNoOptTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test MultiOutputStore contain a MOF fusion and XLA optimizer pass doesn't like this. + debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(CopyAloneNoOptTest, CopyTranspose) { + const char* hlo_text = R"( +HloModule mod +ENTRY main { + %param = f32[8,32,32,32,16]{4,3,2,1,0} parameter(0) + ROOT %copy = f32[8,32,32,32,16]{3,2,1,4,0} copy(f32[8,32,32,32,16]{4,3,2,1,0} %param) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); + + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2 +)"); + +} + +} // namespace +} // namespace gpu +} // namespace xla From 3d42daf076d7caa8ad07182aa10dab4f90a1e45d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 8 May 2020 14:28:44 -0700 Subject: [PATCH 0220/1533] NFC: rename a variable --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- .../compiler/xla/service/gpu/kernel_mapping_scheme.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 7084736ac3c..a78ffc8dd1a 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2734,7 +2734,7 @@ void IrEmitterUnnested::EmitHlo021Tile( /*num_threads_x=*/kWarpSize, /*indexing_order=*/kLinearIndexingX, /*vector_size=*/1, - /*row_contiguous=*/false); + /*is_row_contiguous=*/false); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index d9f80172bcb..99fa60a24c2 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -90,14 +90,14 @@ class KernelMappingScheme { KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, int64 num_threads_x, IndexingOrder indexing_order, - int vector_size, bool row_contiguous = false) + int vector_size, bool is_row_contiguous = false) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), indexing_order_(indexing_order), vector_size_(vector_size), - row_contiguous_(row_contiguous) { + is_row_contiguous_(is_row_contiguous) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); @@ -135,7 +135,7 @@ class KernelMappingScheme { IndexingOrder GetIndexingOrder() const { return indexing_order_; } int GetVectorSize() const { return vector_size_; } - bool GetRowContiguous() const {return row_contiguous_; } + bool GetRowContiguous() const {return is_row_contiguous_; } private: // The number of elements in each dimension. 
@@ -161,7 +161,7 @@ class KernelMappingScheme { // to trigger vectorized loads on GPUs while keeping memory // coalescing. const int vector_size_; - const bool row_contiguous_; + const bool is_row_contiguous_; }; // Information to support the code generation for a tiled reduction kernel. From f596266023da216b24115b28195c26ea6caddd0c Mon Sep 17 00:00:00 2001 From: Saurabh Saxena Date: Fri, 8 May 2020 14:29:27 -0700 Subject: [PATCH 0221/1533] Fix requested_device of NextIteration node. PiperOrigin-RevId: 310629600 Change-Id: Ic097448918a59b7ae42683a5f8d2014f97e22447 --- tensorflow/core/common_runtime/BUILD | 2 + .../core/common_runtime/lower_while_op.cc | 17 +-- .../common_runtime/lower_while_op_test.cc | 141 ++++++++++++++++++ 3 files changed, 151 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index 76a3c276e2d..484bdeee3bd 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -2511,10 +2511,12 @@ tf_cc_test( "//tensorflow/core:framework", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/common_runtime/lower_while_op.cc b/tensorflow/core/common_runtime/lower_while_op.cc index a28959703e5..90fdc886c50 100644 --- a/tensorflow/core/common_runtime/lower_while_op.cc +++ b/tensorflow/core/common_runtime/lower_while_op.cc @@ -444,15 +444,14 @@ Status LowerWhileHelper::CreateNextIterationNodes() { if (IsResource(i)) { continue; } - TF_RETURN_IF_ERROR( - NodeBuilder(NewName("next_iteration"), "NextIteration", - graph_->op_registry(), &debug_info_) - .Input(NodeOut(body_call_node_, i)) - .ControlInput(body_call_node_) - .Device(while_op_->requested_device()) - .AssignedDevice(merge_nodes_[op_input_output_to_lowered_node_[i]] - ->assigned_device_name()) - .Finalize(graph_, &next_iteration)); + Node* merge_node = merge_nodes_[op_input_output_to_lowered_node_[i]]; + TF_RETURN_IF_ERROR(NodeBuilder(NewName("next_iteration"), "NextIteration", + graph_->op_registry(), &debug_info_) + .Input(NodeOut(body_call_node_, i)) + .ControlInput(body_call_node_) + .Device(merge_node->requested_device()) + .AssignedDevice(merge_node->assigned_device_name()) + .Finalize(graph_, &next_iteration)); next_iterations_nodes_.emplace_back(next_iteration); } return Status::OK(); diff --git a/tensorflow/core/common_runtime/lower_while_op_test.cc b/tensorflow/core/common_runtime/lower_while_op_test.cc index 65b9b523444..9d7870f891d 100644 --- a/tensorflow/core/common_runtime/lower_while_op_test.cc +++ b/tensorflow/core/common_runtime/lower_while_op_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "absl/strings/match.h" #include "tensorflow/cc/client/client_session.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/ops/array_ops.h" @@ -25,6 +26,7 @@ limitations under the License. 
#include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/graph/graph_def_builder.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/strings/str_util.h" @@ -262,6 +264,145 @@ TEST(LowerWhileOpTest, ForwardAssignedInputDevice) { ASSERT_EQ(exit_consumers, 1); } +TEST(LowerWhileOpTest, ForwardRequestedInputDevice) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + // Add test functions for cond and body. + FunctionDefLibrary f_lib_proto; + *f_lib_proto.add_function() = test::function::XTimesTwo(); + *f_lib_proto.add_function() = test::function::LessThanOrEqualToN(8); + + TF_ASSERT_OK(graph->AddFunctionLibrary(f_lib_proto)); + auto type = DT_FLOAT; + // We will place the loop var on the gpu:0. + const string gpu_0_device = "/job:localhost/replica:0/task:0/gpu:0"; + // We will place loop's control input on the gpu:1. + const string gpu_1_device = "/job:localhost/replica:0/task:0/gpu:1"; + // We will place While op on gpu:2. + const string gpu_2_device = "/job:localhost/replica:0/task:0/gpu:2"; + Node* gpu_0_ph; + TF_CHECK_OK(NodeBuilder("placed_node", "Placeholder") + .Attr("dtype", type) + .Device(gpu_0_device) + .Finalize(graph.get(), &gpu_0_ph)); + Node* control_in; + // Add a control input to the While op to trigger the creation of a + // LoopExecuted node. + TF_CHECK_OK(NodeBuilder("control_in", "Placeholder") + .Attr("dtype", type) + .Device(gpu_1_device) + .Finalize(graph.get(), &control_in)); + Node* while_node; + std::vector inputs({NodeBuilder::NodeOut(gpu_0_ph)}); + AttrValue cond_func; + cond_func.mutable_func()->set_name("LessThanOrEqualToN"); + AttrValue body_func; + body_func.mutable_func()->set_name("XTimesTwo"); + TF_ASSERT_OK( + NodeBuilder("while", "While", &graph->flib_def()) + .Input(inputs) + .ControlInput(control_in) + .Device(gpu_2_device) + .Attr("T", {type}) + .Attr("cond", cond_func) + .Attr("body", body_func) + .Attr("parallel_iterations", 100) + .Attr(LowerFunctionalOpsPass::kLowerUsingSwitchMergeAttr, true) + .Finalize(graph.get(), &while_node)); + + // Create an empty Const node with control dep from the While op. + // This triggers the creation of a LoopExecuted node. + Node* control_out; + TensorProto proto; + proto.set_dtype(DT_FLOAT); + TensorShape empty_shape({0}); + empty_shape.AsProto(proto.mutable_tensor_shape()); + TF_ASSERT_OK(NodeBuilder("control_out", "Const") + .ControlInput(while_node) + .Attr("dtype", DT_FLOAT) + .Attr("value", proto) + .Finalize(graph.get(), &control_out)); + + TF_ASSERT_OK(Rewrite(&graph)); + + const Node* placeholder_node = nullptr; + for (const auto* op : graph->op_nodes()) { + if (op->name() == "placed_node") { + placeholder_node = op; + } + } + ASSERT_NE(placeholder_node, nullptr); + // Verify the requested device of the Enter node. + int enter_consumers = 0; + const Node* enter_node = nullptr; + for (const Node* consumer : placeholder_node->out_nodes()) { + if (consumer->type_string() == "Enter") { + enter_consumers += 1; + enter_node = consumer; + ASSERT_EQ(consumer->requested_device(), gpu_0_device); + } + } + ASSERT_EQ(enter_consumers, 1); + // Verify the requested device of the Merge node. 
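+  // (Each loop variable is lowered to Enter -> Merge -> Switch, with
+  // NextIteration feeding back into Merge and Exit consuming Switch; every
+  // one of these nodes should pick up the same requested device, gpu:0.)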
+ int merge_consumers = 0; + const Node* merge_node = nullptr; + for (const Node* consumer : enter_node->out_nodes()) { + if (consumer->type_string() == "Merge") { + merge_consumers += 1; + merge_node = consumer; + ASSERT_EQ(consumer->requested_device(), gpu_0_device); + } + } + ASSERT_EQ(merge_consumers, 1); + // Verify the requested device of the NextIteration node. + int next_iteration_consumers = 0; + for (const Node* consumer : merge_node->in_nodes()) { + if (consumer->type_string() == "NextIteration") { + next_iteration_consumers += 1; + ASSERT_EQ(consumer->requested_device(), gpu_0_device); + } + } + ASSERT_EQ(next_iteration_consumers, 1); + // Verify the requested device of the Switch node. + int switch_consumers = 0; + const Node* switch_node = nullptr; + for (const Node* consumer : merge_node->out_nodes()) { + if (consumer->type_string() == "Switch") { + switch_consumers += 1; + switch_node = consumer; + ASSERT_EQ(consumer->requested_device(), gpu_0_device); + } + } + ASSERT_EQ(switch_consumers, 1); + // Verify the requested device of the Exit node. + int exit_consumers = 0; + for (const Node* consumer : switch_node->out_nodes()) { + if (consumer->type_string() == "Exit") { + exit_consumers += 1; + ASSERT_EQ(consumer->requested_device(), gpu_0_device); + } + } + ASSERT_EQ(exit_consumers, 1); + // Verify the requested device of LoopControlInputs. + const Node* loop_control_inputs_node = nullptr; + for (const auto* op : graph->op_nodes()) { + if (absl::StrContains(op->name(), "LoopControlInputs")) { + loop_control_inputs_node = op; + } + } + ASSERT_NE(loop_control_inputs_node, nullptr); + ASSERT_EQ(loop_control_inputs_node->requested_device(), gpu_2_device); + // Verify the requested device of LoopExecuted. + const Node* loop_executed_node = nullptr; + for (const auto* op : graph->op_nodes()) { + if (absl::StrContains(op->name(), "LoopExecuted")) { + loop_executed_node = op; + } + } + ASSERT_NE(loop_executed_node, nullptr); + ASSERT_EQ(loop_executed_node->requested_device(), gpu_2_device); +} + TEST(LowerWhileOpTest, MultipleInputs) { std::unique_ptr graph(new Graph(OpRegistry::Global())); From fe40ddcc5e8dd3425d55b7d44fbdba3d83ea8108 Mon Sep 17 00:00:00 2001 From: Suharsh Sivakumar Date: Fri, 8 May 2020 14:33:59 -0700 Subject: [PATCH 0222/1533] Add support for quantizing prelu, now that its a builtin operator. It has two inputs. First input must match output scale. PiperOrigin-RevId: 310630492 Change-Id: I7212cec0b0fa5cc26fd1545935c86bd534c0f260 --- tensorflow/lite/testing/op_tests/prelu.py | 26 +++++++++++++++++-- .../lite/tools/optimize/operator_property.cc | 6 +++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/testing/op_tests/prelu.py b/tensorflow/lite/testing/op_tests/prelu.py index f927c7a8b00..480736a76fe 100644 --- a/tensorflow/lite/testing/op_tests/prelu.py +++ b/tensorflow/lite/testing/op_tests/prelu.py @@ -35,12 +35,33 @@ def make_prelu_tests(options): # channel. "input_shape": [[1, 10, 10, 3], [3, 3, 3, 3]], "shared_axes": [[1, 2], [1]], + "fully_quantize": [False], + "input_range": [(-10, 10)], }, { # 2D-3D example. Share the 2nd axis. "input_shape": [[20, 20], [20, 20, 20]], "shared_axes": [[1]], - } + "fully_quantize": [False], + "input_range": [(-10, 10)], + }, + # Quantized cases. + { + # The canonical case for image processing is having a 4D `input` + # (NHWC)and `shared_axes`=[1, 2], so the alpha parameter is per + # channel. 
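+        # With fully_quantize=True these cases exercise the int8 path, where
+        # the quantized PReLU op must use the same scale for its first input
+        # and its output.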
+ "input_shape": [[1, 10, 10, 3], [3, 3, 3, 3]], + "shared_axes": [[1, 2], [1]], + "fully_quantize": [True], + "input_range": [(-10, 10)], + }, + { + # 2D-3D example. Share the 2nd axis. + "input_shape": [[20, 20], [20, 20, 20]], + "shared_axes": [[1]], + "fully_quantize": [True], + "input_range": [(-10, 10)], + }, ] def build_graph(parameters): @@ -64,7 +85,8 @@ def make_prelu_tests(options): for dim in range(1, len(input_shape)): alpha_shape.append(1 if dim in shared_axes else input_shape[dim]) - alpha_values = create_tensor_data(np.float32, alpha_shape) + alpha_values = create_tensor_data( + np.float32, alpha_shape, min_value=-1, max_value=1) # There should be only 1 trainable variable tensor. variables = tf.compat.v1.all_variables() diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 71fdad87bd2..94093ef8698 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -815,6 +815,12 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.outputs = {{0, {}}}; property.version = 2; break; + case BuiltinOperator_PRELU: + property.inputs = {{0, {}}, {1, {}}}; + property.outputs = {{0, {}}}; + property.restrict_same_input_output_scale = true; + property.version = 1; + break; case BuiltinOperator_LEAKY_RELU: case BuiltinOperator_RELU: case BuiltinOperator_RELU6: From cc6145c190622a5e64f7a914acbaaf739a915a72 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 14:43:56 -0700 Subject: [PATCH 0223/1533] Cleanup headers in capture_profile.cc PiperOrigin-RevId: 310632408 Change-Id: Ie1f2a3d8c295344fe30bd12221f8d78524766ef6 --- tensorflow/core/profiler/rpc/client/BUILD | 1 - tensorflow/core/profiler/rpc/client/capture_profile.cc | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/core/profiler/rpc/client/BUILD b/tensorflow/core/profiler/rpc/client/BUILD index 43ebb35230c..7a98c55a4a0 100644 --- a/tensorflow/core/profiler/rpc/client/BUILD +++ b/tensorflow/core/profiler/rpc/client/BUILD @@ -11,7 +11,6 @@ cc_library( visibility = ["//tensorflow/python/profiler/internal:__pkg__"], deps = [ ":save_profile", - "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/profiler:profiler_analysis_proto_cc", "//tensorflow/core/profiler:profiler_service_proto_cc", diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.cc b/tensorflow/core/profiler/rpc/client/capture_profile.cc index 0d42c4f7bcf..5335d18da3e 100644 --- a/tensorflow/core/profiler/rpc/client/capture_profile.cc +++ b/tensorflow/core/profiler/rpc/client/capture_profile.cc @@ -21,12 +21,11 @@ limitations under the License. #include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_split.h" -#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/profiler_analysis.grpc.pb.h" #include "tensorflow/core/profiler/rpc/client/save_profile.h" -#include "tensorflow/core/util/events_writer.h" namespace tensorflow { namespace profiler { From c9e079ac5b2e7c12d28452e509fa6a6a725e5fc9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 14:57:14 -0700 Subject: [PATCH 0224/1533] pfor: change converters for Equal and NotEqual to handle incompatible_shape_error attribute. 
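
With tensor equality enabled, `x == y` and `x != y` lower to Equal/NotEqual
with `incompatible_shape_error=False`, so the converters can no longer assert
that the attribute is true; they now simply forward it to the vectorized op.
A rough sketch of the kind of comparison this makes vectorizable (illustrative
only; it uses the public `tf.vectorized_map` entry point rather than the
internal pfor test harness exercised below):

    import tensorflow as tf

    x = tf.random.uniform([7, 3, 5])
    y = tf.random.uniform([3, 5])

    def loop_fn(x_i):
      # With tensor equality on, == / != return elementwise bool tensors built
      # from Equal/NotEqual ops that carry incompatible_shape_error=False.
      return tf.convert_to_tensor(x_i == y), tf.convert_to_tensor(x_i != y)

    eq, ne = tf.vectorized_map(loop_fn, x)  # both results have shape [7, 3, 5]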
PiperOrigin-RevId: 310634898 Change-Id: Ifcd7741c438d571c1024c77ff90c71e10267fa04 --- .../python/ops/parallel_for/math_test.py | 128 ++++++++++-------- tensorflow/python/ops/parallel_for/pfor.py | 8 +- 2 files changed, 73 insertions(+), 63 deletions(-) diff --git a/tensorflow/python/ops/parallel_for/math_test.py b/tensorflow/python/ops/parallel_for/math_test.py index 773195283d6..8e18b9968fe 100644 --- a/tensorflow/python/ops/parallel_for/math_test.py +++ b/tensorflow/python/ops/parallel_for/math_test.py @@ -23,6 +23,7 @@ from absl.testing import parameterized from tensorflow.python.eager import backprop from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops as framework_ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops @@ -150,72 +151,81 @@ class MathTest(PForTestCase, parameterized.TestCase): self._test_loop_fn(loop_fn, 3) def test_binary_cwise_ops(self): - logical_ops = [ - math_ops.logical_and, math_ops.logical_or, math_ops.logical_xor - ] + # Enable tensor equality to test `equal` and `not_equal` ops below. + default_equality = framework_ops.Tensor._USE_EQUALITY + framework_ops.enable_tensor_equality() + try: + logical_ops = [ + math_ops.logical_and, math_ops.logical_or, math_ops.logical_xor + ] - # Wrapper functions restricting the range of inputs of zeta and polygamma. - def safe_polygamma(x, y): - return math_ops.polygamma( - math_ops.round(clip_ops.clip_by_value(y, 1, 10)), x * x + 1) + # Wrapper functions restricting the range of inputs of zeta and polygamma. + def safe_polygamma(x, y): + return math_ops.polygamma( + math_ops.round(clip_ops.clip_by_value(y, 1, 10)), x * x + 1) - def safe_zeta(x, y): - return math_ops.zeta(x * x + 1, y * y) + def safe_zeta(x, y): + return math_ops.zeta(x * x + 1, y * y) - float_ops = [ - math_ops.add, - math_ops.add_v2, - math_ops.atan2, - math_ops.complex, - math_ops.div, - math_ops.divide, - math_ops.div_no_nan, - math_ops.equal, - math_ops.floor_mod, - math_ops.greater, - math_ops.greater_equal, - math_ops.igamma, - math_ops.igammac, - math_ops.igamma_grad_a, - math_ops.less, - math_ops.less_equal, - math_ops.maximum, - math_ops.minimum, - math_ops.mod, - math_ops.multiply, - math_ops.not_equal, - math_ops.pow, - math_ops.squared_difference, - math_ops.subtract, - math_ops.truncate_mod, - safe_polygamma, - safe_zeta, - ] - # FloorDiv fails on XLA due floor's discontinuities exacerbating small - # division differences. 
- if not test_util.is_xla_enabled(): - float_ops += [math_ops.floor_div] - for op in logical_ops + float_ops: - x = random_ops.random_uniform([7, 3, 5]) - y = random_ops.random_uniform([3, 5]) - if op in logical_ops: - x = x > 0 - y = y > 0 + float_ops = [ + math_ops.add, + math_ops.add_v2, + math_ops.atan2, + math_ops.complex, + math_ops.div, + math_ops.divide, + math_ops.div_no_nan, + math_ops.equal, + lambda x, y: framework_ops.convert_to_tensor(x == y), + lambda x, y: framework_ops.convert_to_tensor(x != y), + math_ops.floor_mod, + math_ops.greater, + math_ops.greater_equal, + math_ops.igamma, + math_ops.igammac, + math_ops.igamma_grad_a, + math_ops.less, + math_ops.less_equal, + math_ops.maximum, + math_ops.minimum, + math_ops.mod, + math_ops.multiply, + math_ops.not_equal, + math_ops.pow, + math_ops.squared_difference, + math_ops.subtract, + math_ops.truncate_mod, + safe_polygamma, + safe_zeta, + ] + # FloorDiv fails on XLA due floor's discontinuities exacerbating small + # division differences. + if not test_util.is_xla_enabled(): + float_ops += [math_ops.floor_div] + for op in logical_ops + float_ops: + x = random_ops.random_uniform([7, 3, 5]) + y = random_ops.random_uniform([3, 5]) + if op in logical_ops: + x = x > 0 + y = y > 0 - output_dtypes = [] + output_dtypes = [] - # pylint: disable=cell-var-from-loop - def loop_fn(i): - x1 = array_ops.gather(x, i) - y1 = array_ops.gather(y, i) - outputs = [op(x, y), op(x1, y), op(x, y1), op(x1, y1), op(x1, x1)] - del output_dtypes[:] - output_dtypes.extend(t.dtype for t in outputs) - return outputs + # pylint: disable=cell-var-from-loop + def loop_fn(i): + x1 = array_ops.gather(x, i) + y1 = array_ops.gather(y, i) + outputs = [op(x, y), op(x1, y), op(x, y1), op(x1, y1), op(x1, x1)] + del output_dtypes[:] + output_dtypes.extend(t.dtype for t in outputs) + return outputs - # pylint: enable=cell-var-from-loop + # pylint: enable=cell-var-from-loop - self._test_loop_fn(loop_fn, 3) + self._test_loop_fn(loop_fn, 3) + finally: + if not default_equality: + framework_ops.disable_tensor_equality() def test_approximate_equal(self): x = random_ops.random_uniform([3, 5]) diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index bece477e754..c4621758702 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -2784,8 +2784,8 @@ def _convert_equal(pfor_input): x = pfor_input.input(0)[0] y = pfor_input.input(1)[0] incompatible_shape_error = pfor_input.get_attr("incompatible_shape_error") - assert incompatible_shape_error - return wrap(math_ops.equal(x, y), True) + return wrap(gen_math_ops.equal( + x, y, incompatible_shape_error=incompatible_shape_error), True) @RegisterPFor("NotEqual") @@ -2794,8 +2794,8 @@ def _convert_not_equal(pfor_input): x = pfor_input.input(0)[0] y = pfor_input.input(1)[0] incompatible_shape_error = pfor_input.get_attr("incompatible_shape_error") - assert incompatible_shape_error - return wrap(math_ops.not_equal(x, y), True) + return wrap(gen_math_ops.not_equal( + x, y, incompatible_shape_error=incompatible_shape_error), True) @RegisterPFor("ApproximateEqual") From aa48faeb0873abbfbf693c930d822e21b10c064d Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Fri, 8 May 2020 15:06:55 -0700 Subject: [PATCH 0225/1533] Replace processing of `tf_device.launch` with `tf_device.cluster` in TPUExtractOutsideCompilation pass. 
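
The pass now walks `tf_device.cluster` ops (the form the TPU computation takes
earlier in the bridge, before it is rewritten into launch/execute ops) when
looking for `_xla_outside_compilation` clusters. For context, those attributes
typically originate from user code along these lines (a loose sketch; the
resolver/strategy setup is assumed and exact API names differ slightly between
TF releases):

    import tensorflow as tf

    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()  # assumed configured
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

    @tf.function
    def step(x):
      y = tf.matmul(x, x)  # compiled into the TPU cluster

      def host_call(t):
        tf.print("mean:", tf.reduce_mean(t))  # runs on the host CPU
        return t

      # Ops created here are tagged with _xla_outside_compilation; this pass
      # extracts them into their own tf_device.parallel_execute region.
      y = tf.tpu.outside_compilation(host_call, y)
      return y + 1.0

    # step would be invoked via strategy.run (experimental_run_v2 in older
    # releases) so that it executes in a TPU replicate context.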
PiperOrigin-RevId: 310636668 Change-Id: I8a9fbf2ea402d86c4d42b4e210539fc3136c97d4 --- .../tpu_extract_outside_compilation.mlir | 49 +++++++++---------- .../tpu_extract_outside_compilation.cc | 49 ++++++++++--------- 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index b2e8f116827..3cb693ee571 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -3,12 +3,12 @@ // Tests that missing `_xla_outside_compilation` attribute value results in an error. func @missing_outside_compilation_attribute() -> () { - "tf_device.launch"() ( { + "tf_device.cluster"() ( { "tf.A"() : () -> () // expected-error@+1 {{attribute '_xla_outside_compilation' is empty}} "tf.B"() {_xla_outside_compilation = ""} : () -> () tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } @@ -18,11 +18,11 @@ func @missing_outside_compilation_attribute() -> () { // CHECK-LABEL: func @no_outside_compilation func @no_outside_compilation() -> tensor { - %0 = "tf_device.launch"() ( { + %0 = "tf_device.cluster"() ( { %1 = "tf.A"() : () -> tensor %2 = "tf.B"(%1) : (tensor) -> tensor tf_device.return %2 : tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor return %0 : tensor } @@ -36,16 +36,15 @@ func @nodep_single_outside_compilation() -> () { // CHECK-NEXT: "tf_device.launch" // CHECK-NEXT: "tf.B" // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf_device.launch" + // CHECK: "tf_device.cluster" // CHECK-NEXT: "tf.A" - // CHECK: device = "tpu0" - // CHECK-SAME: launch_attr = "launch_attr" - "tf_device.launch"() ( { + // CHECK: cluster_attr = "cluster_attr" + "tf_device.cluster"() ( { "tf.A"() : () -> () "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.C"() : () -> () tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } @@ -59,19 +58,18 @@ func @nodep_single_cluster_multiple_ops_outside_compilation() -> () { // CHECK-NEXT: "tf.C" // CHECK-NEXT: "tf.D" // CHECK-NOT: _xla_outside_compilation - // CHECK: "tf_device.launch" + // CHECK: "tf_device.cluster" // CHECK-NEXT: "tf.A" // CHECK-NEXT: "tf.E" - // CHECK: device = "tpu0" - // CHECK-SAME: launch_attr = "launch_attr" - "tf_device.launch"() ( { + // CHECK: cluster_attr = "cluster_attr" + "tf_device.cluster"() ( { "tf.A"() : () -> () "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.C"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.D"() {_xla_outside_compilation = "cluster1"} : () -> () "tf.E"() : () -> () tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } @@ -80,15 +78,16 @@ func @nodep_single_cluster_multiple_ops_outside_compilation() -> () { // CHECK-LABEL: func @nodep_multiple_outside_compilation func @nodep_multiple_outside_compilation() -> () { // CHECK: "tf_device.parallel_execute" - // CHECK-COUNT-3: "tf_device.launch" - "tf_device.launch"() ( { + // CHECK-COUNT-2: "tf_device.launch" + // CHECK: "tf_device.cluster" + "tf_device.cluster"() ( { "tf.A"() : () -> () "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () 
"tf.C"() : () -> () "tf.D"() {_xla_outside_compilation = "cluster2"} : () -> () "tf.E"() : () -> () tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + }) {cluster_attr = "cluster_attr"} : () -> () return } @@ -100,17 +99,17 @@ func @single_tpu_return_single_outside_compilation(%arg0: tensor) -> tens // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[TPU_LAUNCH_OUTPUT:[0-9]*]] = "tf_device.launch" + // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster" // CHECK: tf_device.return - // CHECK: tf_device.return %[[TPU_LAUNCH_OUTPUT]] + // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { - %2 = "tf_device.launch"() ( { + %2 = "tf_device.cluster"() ( { "tf.A"() : () -> () "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () %3 = "tf.C"() : () -> tensor tf_device.return %3 : tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor tf_device.return %2 : tensor } @@ -125,17 +124,17 @@ func @multiple_tpu_return_single_outside_compilation(%arg0: tensor) -> te // CHECK: %[[REPLICATE:[0-9]*]]:4 = tf_device.replicate // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute" // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[TPU_LAUNCH_OUTPUT:[0-9]*]]:2 = "tf_device.launch" + // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]]:2 = "tf_device.cluster" // CHECK: tf_device.return - // CHECK: tf_device.return %[[TPU_LAUNCH_OUTPUT]] + // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] %1:4 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { - %2, %3 = "tf_device.launch"() ( { + %2, %3 = "tf_device.cluster"() ( { %4 = "tf.A"() : () -> tensor "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () %5 = "tf.C"() : () -> tensor tf_device.return %4, %5 : tensor, tensor - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> (tensor, tensor) + }) {cluster_attr = "cluster_attr"} : () -> (tensor, tensor) tf_device.return %2, %3 : tensor, tensor } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index 4e20cd9d64b..4281b85bd7f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -34,7 +34,7 @@ constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; constexpr char kDeviceAttr[] = "device"; // Mapping for `_xla_outside_compilation` attribute to ops of a cluster. -using ClusterMap = +using OutsideClusterMap = llvm::SmallDenseMap, 8>; // This pass extracts a CPU computation cluster with `_xla_outside_compilation` @@ -51,7 +51,8 @@ struct TPUExtractOutsideCompilation // Collects and clusters ops in `block` with the same `_xla_outside_compilation` // attribute into `clusters` This returns an error if a // `_xla_outside_compilation` attribute of an op is empty. 
-LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters) { +LogicalResult CollectAndGroupOutsideClusterOps(Block* block, + OutsideClusterMap* clusters) { for (Operation& op : *block) { if (auto attr = op.getAttrOfType(kXlaOutsideCompilationAttr)) { if (attr.getValue().empty()) @@ -67,7 +68,7 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters) { } // Moves `cluster_ops` to associated `launch_op` body. -void MoveClusterOpsToLaunchOp( +void MoveOutsideClusterOpsToLaunchOp( tf_device::LaunchOp launch_op, const llvm::SmallVector& cluster_ops) { MLIRContext* context = launch_op.getContext(); @@ -84,8 +85,8 @@ void MoveClusterOpsToLaunchOp( } // Creates a `tf_device::LaunchOp` to wrap cluster ops. -tf_device::LaunchOp CreateLaunchOpForCluster(OpBuilder* builder, - Operation* last_cluster_op) { +tf_device::LaunchOp CreateLaunchOpForOutsideCluster( + OpBuilder* builder, Operation* last_cluster_op) { // TODO(b/154363171): Set the CPU device. // An empty string placeholder is used for the device as that will be later // populated with the device of the associated TPUReplicateMetadata op. @@ -117,14 +118,14 @@ void PropagateParallelExecuteReturnToReplicate( // Creates a `parallel_execute` op in place of launch with 'clusters` and // 'launch` as regions. -void CreateParallelExecuteFromClusters(tf_device::LaunchOp launch, - const ClusterMap& clusters) { - OpBuilder builder(launch); +void CreateParallelExecuteFromOutsideClusters( + tf_device::ClusterOp tpu_cluster, const OutsideClusterMap& clusters) { + OpBuilder builder(tpu_cluster); // Create parallel_execute regions. The original TPU cluster computation // is the extra region. int num_regions = 1 + clusters.size(); auto parallel_execute_op = builder.create( - launch.getLoc(), num_regions, launch.results().getTypes()); + tpu_cluster.getLoc(), num_regions, tpu_cluster.results().getTypes()); // Move outside compilation clusters to parallel_execute regions. for (const auto& cluster : llvm::enumerate(clusters)) { @@ -134,21 +135,23 @@ void CreateParallelExecuteFromClusters(tf_device::LaunchOp launch, parallel_execute_op.GetRegionBlockWithIndex(cluster.index()); builder.setInsertionPointToEnd(&outside_block); tf_device::LaunchOp launch_op = - CreateLaunchOpForCluster(&builder, cluster_ops.back()); - MoveClusterOpsToLaunchOp(launch_op, cluster_ops); + CreateLaunchOpForOutsideCluster(&builder, cluster_ops.back()); + MoveOutsideClusterOpsToLaunchOp(launch_op, cluster_ops); builder.setInsertionPointToEnd(&outside_block); // TODO(b/154363171): Handle returns from OutsideCompiled parallel_execute // regions either through communication with TPU parallel_execute regions // or modifying parallel_execute returns. - builder.create(launch.getLoc(), ArrayRef{}); + builder.create(tpu_cluster.getLoc(), + ArrayRef{}); } // Move the launch body to last parallel_execute block. 
Block& inside_block = parallel_execute_op.GetRegionBlockWithIndex(num_regions - 1); builder.setInsertionPointToEnd(&inside_block); - builder.create(launch.getLoc(), launch.getResults()); - launch.getOperation()->moveBefore(inside_block.getTerminator()); + builder.create(tpu_cluster.getLoc(), + tpu_cluster.getResults()); + tpu_cluster.getOperation()->moveBefore(inside_block.getTerminator()); PropagateParallelExecuteReturnToReplicate(parallel_execute_op); // TODO(b/154363171): Handle returns from OutsideCompiled parallel_execute @@ -157,17 +160,19 @@ void CreateParallelExecuteFromClusters(tf_device::LaunchOp launch, } void TPUExtractOutsideCompilation::runOnFunction() { - auto extract_result = getFunction().walk([&](tf_device::LaunchOp launch) { - ClusterMap clusters; - if (failed(CollectAndGroupClusterOps(&launch.GetBody(), &clusters))) - return WalkResult::interrupt(); + auto extract_result = + getFunction().walk([&](tf_device::ClusterOp tpu_cluster) { + OutsideClusterMap clusters; + if (failed(CollectAndGroupOutsideClusterOps(&tpu_cluster.GetBody(), + &clusters))) + return WalkResult::interrupt(); - if (clusters.empty()) return WalkResult::advance(); + if (clusters.empty()) return WalkResult::advance(); - CreateParallelExecuteFromClusters(launch, clusters); + CreateParallelExecuteFromOutsideClusters(tpu_cluster, clusters); - return WalkResult::advance(); - }); + return WalkResult::advance(); + }); if (extract_result.wasInterrupted()) return signalPassFailure(); } From 4f9bc2565f02b872dd889ea432dd48d76c6e5ef2 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 15:39:07 -0700 Subject: [PATCH 0226/1533] Cleanup headers in save_profile.cc PiperOrigin-RevId: 310642299 Change-Id: I0c0f5afa63cc68301eee615ddee717c4870fe5e4 --- tensorflow/core/profiler/rpc/client/BUILD | 2 +- .../core/profiler/rpc/client/save_profile.cc | 25 +++++++++++++------ .../core/profiler/rpc/client/save_profile.h | 3 +++ 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/profiler/rpc/client/BUILD b/tensorflow/core/profiler/rpc/client/BUILD index 7a98c55a4a0..bde5708065e 100644 --- a/tensorflow/core/profiler/rpc/client/BUILD +++ b/tensorflow/core/profiler/rpc/client/BUILD @@ -27,8 +27,8 @@ cc_library( deps = [ "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core/profiler:profiler_service_proto_cc", - "//tensorflow/core/profiler/protobuf:trace_events_proto_cc", "@com_google_absl//absl/strings", "@com_google_absl//absl/time", ], diff --git a/tensorflow/core/profiler/rpc/client/save_profile.cc b/tensorflow/core/profiler/rpc/client/save_profile.cc index 47d69765da4..e328bf1dae4 100644 --- a/tensorflow/core/profiler/rpc/client/save_profile.cc +++ b/tensorflow/core/profiler/rpc/client/save_profile.cc @@ -15,20 +15,28 @@ limitations under the License. 
#include "tensorflow/core/profiler/rpc/client/save_profile.h" -#include -#include +#include +#include +#include +#include +#include #include #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "absl/strings/strip.h" #include "absl/time/clock.h" #include "absl/time/time.h" -#include "tensorflow/core/lib/io/compression.h" +#include "tensorflow/core/lib/io/zlib_compression_options.h" +#include "tensorflow/core/lib/io/zlib_outputbuffer.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/protobuf.h" -#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" +#include "tensorflow/core/platform/file_system.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/profiler/profiler_service.pb.h" + // Windows.h #defines ERROR, but it is also used in // tensorflow/core/util/event.proto #undef ERROR @@ -56,9 +64,9 @@ string ProfilerJoinPathImpl(std::initializer_list paths) { path = absl::StripPrefix(path, kPathSep); if (absl::EndsWith(result, kPathSep)) { - strings::StrAppend(&result, path); + absl::StrAppend(&result, path); } else { - strings::StrAppend(&result, kPathSep, path); + absl::StrAppend(&result, kPathSep, path); } } @@ -75,7 +83,8 @@ string ProfilerJoinPath(const T&... args) { constexpr char kProtoTraceFileName[] = "trace"; constexpr char kTfStatsHelperSuffix[] = "tf_stats_helper_result"; -Status DumpToolDataToLogDirectory(StringPiece run_dir, const string& host, +Status DumpToolDataToLogDirectory(absl::string_view run_dir, + absl::string_view host, const ProfileToolData& tool, std::ostream* os) { // Don't save the intermediate results for combining the per host tool data. diff --git a/tensorflow/core/profiler/rpc/client/save_profile.h b/tensorflow/core/profiler/rpc/client/save_profile.h index 6d8bfba5e63..2e8fc96390a 100644 --- a/tensorflow/core/profiler/rpc/client/save_profile.h +++ b/tensorflow/core/profiler/rpc/client/save_profile.h @@ -16,7 +16,10 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_PROFILER_RPC_CLIENT_SAVE_PROFILE_H_ #define TENSORFLOW_CORE_PROFILER_RPC_CLIENT_SAVE_PROFILE_H_ +#include + #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/profiler_service.pb.h" namespace tensorflow { From bca497ef47c8ba6cafe2ce38360c3fcdf2ff12a8 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Fri, 8 May 2020 15:41:25 -0700 Subject: [PATCH 0227/1533] Convert speech training to produce an int8-quantized model PiperOrigin-RevId: 310642639 Change-Id: I9178e0d17079dd4ef4727f018b1a32f6b06fc91c --- .../train/train_micro_speech_model.ipynb | 2021 +---------------- 1 file changed, 1 insertion(+), 2020 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb b/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb index 40f56f8012b..2a64ecd7078 100644 --- a/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb +++ b/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb @@ -1,2020 +1 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "train_micro_speech_model.ipynb", - "provenance": [], - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU" - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "pO4-CY_TCZZS", - "colab_type": "text" - }, - "source": [ - "# Train a Simple Audio Recognition Model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BaFfr7DHRmGF", - "colab_type": "text" - }, - "source": [ - "This notebook demonstrates how to train a 20 kB [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) model to recognize keywords in speech.\n", - "\n", - "The model created in this notebook is used in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) example for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview).\n", - "\n", - "\n", - " \n", - " \n", - "
\n", - " Run in Google Colab\n", - " \n", - " View source on GitHub\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XaVtYN4nlCft", - "colab_type": "text" - }, - "source": [ - "**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and set **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.\n", - "\n", - "## Configure Defaults\n", - "\n", - "**MODIFY** the following constants for your specific use case." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ludfxbNIaegy", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# A comma-delimited list of the words you want to train for.\n", - "# The options are: yes,no,up,down,left,right,on,off,stop,go\n", - "# All the other words will be used to train an \"unknown\" label and silent\n", - "# audio data with no spoken words will be used to train a \"silence\" label.\n", - "WANTED_WORDS = \"yes,no\"\n", - "\n", - "# The number of steps and learning rates can be specified as comma-separated\n", - "# lists to define the rate at each stage. For example,\n", - "# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001\n", - "# will run 12,000 training loops in total, with a rate of 0.001 for the first\n", - "# 8,000, and 0.0001 for the final 3,000.\n", - "TRAINING_STEPS = \"12000,3000\"\n", - "LEARNING_RATE = \"0.001,0.0001\"\n", - "\n", - "# Calculate the total number of steps, which is used to identify the checkpoint\n", - "# file name.\n", - "TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(\",\"))))\n", - "\n", - "# Print the configuration to confirm it\n", - "!echo \"Training these words:\" $WANTED_WORDS\n", - "!echo \"Training steps in each stage:\" $TRAINING_STEPS\n", - "!echo \"Learning rate in each stage:\" $LEARNING_RATE\n", - "!echo \"Total number of training steps:\" $TOTAL_STEPS" - ], - "execution_count": 1, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Training these words: yes,no\n", - "Training steps in each stage: 12000,3000\n", - "Learning rate in each stage: 0.001,0.0001\n", - "Total number of training steps: 15000\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gCgeOpvY9pAi", - "colab_type": "text" - }, - "source": [ - "**DO NOT MODIFY** the following constants as they include filepaths used in this notebook and data that is shared during training and inference." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Nd1iM1o2ymvA", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Calculate the percentage of 'silence' and 'unknown' training samples required\n", - "# to ensure that we have equal number of samples for each label.\n", - "number_of_labels = WANTED_WORDS.count(',') + 1\n", - "number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label\n", - "equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))\n", - "SILENT_PERCENTAGE = equal_percentage_of_training_samples\n", - "UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples\n", - "\n", - "# Constants which are shared during training and inference\n", - "PREPROCESS = 'micro'\n", - "WINDOW_STRIDE ='20'\n", - "MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,\n", - " # low_latency_conv, low_latency_svdf, tiny_embedding_conv\n", - "QUANTIZE = '1' # For booleans, we provide 1 or 0 (instead of True or False)\n", - "\n", - "# Constants used during training only\n", - "VERBOSITY = 'WARN'\n", - "EVAL_STEP_INTERVAL = '1000'\n", - "SAVE_STEP_INTERVAL = '5000'\n", - "\n", - "# Constants for training directories and filepaths\n", - "DATASET_DIR = 'dataset/'\n", - "LOGS_DIR = 'logs/'\n", - "TRAIN_DIR = 'train/' # for training checkpoints and other files.\n", - "\n", - "# Constants for inference directories and filepaths\n", - "import os\n", - "MODELS_DIR = 'models/'\n", - "os.mkdir(MODELS_DIR)\n", - "MODEL_TF = MODELS_DIR + 'model.pb'\n", - "MODEL_TFLITE = MODELS_DIR + 'model.tflite'\n", - "MODEL_TFLITE_MICRO = MODELS_DIR + 'model.cc'" - ], - "execution_count": 2, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6rLYpvtg9P4o", - "colab_type": "text" - }, - "source": [ - "## Setup Environment\n", - "\n", - "Install Dependencies" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "ed_XpUrU5DvY", - "colab_type": "code", - "colab": {} - }, - "source": [ - "%tensorflow_version 1.x\n", - "import tensorflow as tf" - ], - "execution_count": 3, - "outputs": [ - { - "output_type": "stream", - "text": [ - "TensorFlow 1.x selected.\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "T9Ty5mR58E4i", - "colab_type": "text" - }, - "source": [ - "**DELETE** any old data from previous runs\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "APGx0fEh7hFF", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}" - ], - "execution_count": 5, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GfEUlfFBizio", - "colab_type": "text" - }, - "source": [ - "Clone the TensorFlow Github Repository, which contains the relevant code required to run this tutorial." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "yZArmzT85SLq", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!git clone -q https://github.com/tensorflow/tensorflow" - ], - "execution_count": 6, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nS9swHLSi7Bi", - "colab_type": "text" - }, - "source": [ - "Load TensorBoard to visualize the accuracy and loss as training proceeds.\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "q4qF1VxP3UE4", - "colab_type": "code", - "colab": {} - }, - "source": [ - "%load_ext tensorboard\n", - "%tensorboard --logdir {LOGS_DIR}" - ], - "execution_count": 7, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x1J96Ron-O4R", - "colab_type": "text" - }, - "source": [ - "## Training\n", - "\n", - "The following script downloads the dataset and begin training." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "VJsEZx6lynbY", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!python tensorflow/tensorflow/examples/speech_commands/train.py \\\n", - "--data_dir={DATASET_DIR} \\\n", - "--wanted_words={WANTED_WORDS} \\\n", - "--silence_percentage={SILENT_PERCENTAGE} \\\n", - "--unknown_percentage={UNKNOWN_PERCENTAGE} \\\n", - "--preprocess={PREPROCESS} \\\n", - "--window_stride={WINDOW_STRIDE} \\\n", - "--model_architecture={MODEL_ARCHITECTURE} \\\n", - "--quantize={QUANTIZE} \\\n", - "--how_many_training_steps={TRAINING_STEPS} \\\n", - "--learning_rate={LEARNING_RATE} \\\n", - "--train_dir={TRAIN_DIR} \\\n", - "--summaries_dir={LOGS_DIR} \\\n", - "--verbosity={VERBOSITY} \\\n", - "--eval_step_interval={EVAL_STEP_INTERVAL} \\\n", - "--save_step_interval={SAVE_STEP_INTERVAL} \\" - ], - "execution_count": 8, - "outputs": [ - { - "output_type": "stream", - "text": [ - "..\n", - "..\n", - "..\n", - "..\n", - "WARNING:tensorflow:Confusion Matrix:\n", - " [[205 0 0 1]\n", - " [ 3 162 13 28]\n", - " [ 3 9 401 6]\n", - " [ 2 22 6 375]]\n", - "W0402 00:25:28.115174 139938153863040 train.py:320] Confusion Matrix:\n", - " [[205 0 0 1]\n", - " [ 3 162 13 28]\n", - " [ 3 9 401 6]\n", - " [ 2 22 6 375]]\n", - "WARNING:tensorflow:Final test accuracy = 92.5% (N=1236)\n", - "W0402 00:25:28.115574 139938153863040 train.py:322] Final test accuracy = 92.5% (N=1236)\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XQUJLrdS-ftl", - "colab_type": "text" - }, - "source": [ - "## Generate a TensorFlow Model for Inference\n", - "\n", - "Combine relevant training results (graph, weights, etc) into a single file for inference. This process is known as freezing a model and the resulting model is known as a frozen model/graph, as it cannot be further re-trained after this process." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "xyc3_eLh9sAg", - "colab_type": "code", - "colab": {} - }, - "source": [ - "!python tensorflow/tensorflow/examples/speech_commands/freeze.py \\\n", - "--wanted_words=$WANTED_WORDS \\\n", - "--window_stride_ms=$WINDOW_STRIDE \\\n", - "--preprocess=$PREPROCESS \\\n", - "--model_architecture=$MODEL_ARCHITECTURE \\\n", - "--quantize=$QUANTIZE \\\n", - "--start_checkpoint=$TRAIN_DIR$MODEL_ARCHITECTURE'.ckpt-'$TOTAL_STEPS \\\n", - "--output_file=$MODEL_TF \\" - ], - "execution_count": 9, - "outputs": [ - { - "output_type": "stream", - "text": [ - "..\n", - "..\n", - "..\n", - "..\n", - "INFO:tensorflow:Restoring parameters from /content/train/tiny_conv.ckpt-15000\n", - "I0402 00:25:47.086113 140352379615104 saver.py:1284] Restoring parameters from /content/train/tiny_conv.ckpt-15000\n", - "INFO:tensorflow:Froze 12 variables.\n", - "I0402 00:25:47.663757 140352379615104 graph_util_impl.py:334] Froze 12 variables.\n", - "INFO:tensorflow:Converted 12 variables to const ops.\n", - "I0402 00:25:47.665771 140352379615104 graph_util_impl.py:394] Converted 12 variables to const ops.\n", - "INFO:tensorflow:Saved frozen graph to /content/models/model.pb\n", - "I0402 00:25:47.667117 140352379615104 freeze.py:186] Saved frozen graph to /content/models/model.pb\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_DBGDxVI-nKG", - "colab_type": "text" - }, - "source": [ - "## Generate a TensorFlow Lite Model\n", - "\n", - "Convert the frozen graph into a TensorFlow Lite model, which is fully quantized for use with embedded devices.\n", - "\n", - "The following cell will also print the model size, which will be under 20 kilobytes." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "lBj_AyCh1cC0", - "colab_type": "code", - "colab": {} - }, - "source": [ - "input_tensor = 'Reshape_2'\n", - "output_tensor = 'labels_softmax'\n", - "\n", - "converter = tf.lite.TFLiteConverter.from_frozen_graph(\n", - " MODEL_TF, [input_tensor], [output_tensor])\n", - "converter.inference_type = tf.uint8\n", - "converter.quantized_input_stats = {input_tensor: (0.0, 9.8077)} # (mean, standard deviation)\n", - "tflite_model = converter.convert()\n", - "\n", - "tflite_model_size = open(MODEL_TFLITE, \"wb\").write(tflite_model)\n", - "print(\"Model is %d bytes\" % tflite_model_size)\n" - ], - "execution_count": 10, - "outputs": [ - { - "output_type": "stream", - "text": [ - "Model is 18288 bytes\n" - ], - "name": "stdout" - } - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dt6Zqbxu-wIi", - "colab_type": "text" - }, - "source": [ - "## Generate a TensorFlow Lite for MicroControllers Model\n", - "Convert the TensorFlow Lite model into a C source file that can be loaded by TensorFlow Lite for Microcontrollers." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "XohZOTjR8ZyE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Install xxd if it is not available\n", - "!apt-get update && apt-get -qq install xxd\n", - "# Convert to a C source file\n", - "!xxd -i {MODEL_TFLITE} > {MODEL_TFLITE_MICRO}\n", - "# Update variable names\n", - "REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')\n", - "!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}" - ], - "execution_count": 11, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2pQnN0i_-0L2", - "colab_type": "text" - }, - "source": [ - "## Deploy to a Microcontroller\n", - "\n", - "Follow the instructions in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) README.md for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview) to deploy this model on a specific microcontroller.\n", - "\n", - "**Reference Model:** If you have not modified this notebook, you can follow the instructions as is, to deploy the model. Refer to the [`micro_speech/train/models`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/models) directory to access the models generated in this notebook. \n", - "\n", - "**New Model:** If you have generated a new model to identify different words: (i) Update `kCategoryCount` and `kCategoryLabels` in [`micro_speech/micro_features/micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h) and (ii) Update the values assigned to the variables defined in [`micro_speech/micro_features/model.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc) with values displayed after running the following cell." 
- ] - }, - { - "cell_type": "code", - "metadata": { - "id": "eoYyh0VU8pca", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# Print the C source file\n", - "!cat {MODEL_TFLITE_MICRO}" - ], - "execution_count": 12, - "outputs": [ - { - "output_type": "stream", - "text": [ - "unsigned char g_model[] = {\n", - " 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x12, 0x00,\n", - " 0x1c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,\n", - " 0x00, 0x00, 0x18, 0x00, 0x12, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,\n", - " 0x1c, 0x47, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,\n", - " 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n", - " 0xc0, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00,\n", - " 0x0f, 0x00, 0x00, 0x00, 0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e,\n", - " 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x00, 0x0a, 0x00, 0x00, 0x00,\n", - " 0x60, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00,\n", - " 0x3c, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00,\n", - " 0x20, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,\n", - " 0x04, 0x00, 0x00, 0x00, 0x0e, 0xba, 0xff, 0xff, 0x38, 0x00, 0x00, 0x00,\n", - " 0xbc, 0xb9, 0xff, 0xff, 0xc0, 0xb9, 0xff, 0xff, 0x1e, 0xba, 0xff, 0xff,\n", - " 0xe0, 0x01, 0x00, 0x00, 0xcc, 0xb9, 0xff, 0xff, 0xd0, 0xb9, 0xff, 0xff,\n", - " 0x2e, 0xba, 0xff, 0xff, 0x60, 0x03, 0x00, 0x00, 0x36, 0xba, 0xff, 0xff,\n", - " 0x7c, 0x06, 0x00, 0x00, 0x3e, 0xba, 0xff, 0xff, 0x68, 0x45, 0x00, 0x00,\n", - " 0xec, 0xb9, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e,\n", - " 0x30, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00,\n", - " 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,\n", - " 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74,\n", - " 0x69, 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00,\n", - " 0x10, 0xfa, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,\n", - " 0x08, 0x00, 0x00, 0x00, 0x2c, 0x45, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n", - " 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,\n", - " 0x08, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x9c, 0x44, 0x00, 0x00,\n", - " 0x8c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0xdc, 0x01, 0x00, 0x00,\n", - " 0x68, 0x01, 0x00, 0x00, 0x3c, 0x02, 0x00, 0x00, 0x50, 0x05, 0x00, 0x00,\n", - " 0x8e, 0xbb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00,\n", - " 0x08, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,\n", - " 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00,\n", - " 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,\n", - " 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f, 0x32, 0x00, 0x00, 0x00,\n", - " 0x94, 0xfa, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,\n", - " 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n", - " 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n", - " 0xc6, 0xd0, 0xd0, 0x3d, 0x01, 0x00, 0x00, 0x00, 0xf5, 0xff, 0xcf, 0x41,\n", - " 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xbc, 0xff, 0xff,\n", - " 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,\n", - " 0x1c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n", - " 0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,\n", - " 0x08, 0x00, 0x00, 0x00, 0x04, 
0x00, 0x00, 0x00, 0x52, 0x65, 0x6c, 0x75,\n", - " 0x00, 0x00, 0x00, 0x00, 0x04, 0xfb, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00,\n", - " 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n", - " 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n", - " 0x01, 0x00, 0x00, 0x00, 0x09, 0xf5, 0x83, 0x3d, 0x01, 0x00, 0x00, 0x00,\n", - " 0x14, 0x71, 0x83, 0x41, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n", - " 0x72, 0xbc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02, 0x10, 0x00, 0x00, 0x00,\n", - " 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,\n", - " 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,\n", - " 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00,\n", - " 0x64, 0xbc, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n", - " 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n", - " 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2d, 0x95, 0x98, 0x38,\n", - " 0x20, 0x00, 0x00, 0x00, 0x27, 0xff, 0xff, 0xff, 0x97, 0xff, 0xff, 0xff,\n", - " 0x58, 0x00, 0x00, 0x00, 0x66, 0xff, 0xff, 0xff, 0x13, 0xff, 0xff, 0xff,\n", - " 0x72, 0xfe, 0xff, 0xff, 0x5d, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,\n", - " 0xea, 0xbc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00,\n", - " 0x05, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,\n", - " 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n", - " 0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,\n", - " 0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0xec, 0xfb, 0xff, 0xff,\n", - " 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,\n", - " 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n", - " 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,\n", - " 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x7f, 0x3f, 0x01, 0x00, 0x00, 0x00,\n", - " 0x00, 0x00, 0x00, 0x00, 0x5a, 0xbd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03,\n", - " 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,\n", - " 0x1c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n", - " 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,\n", - " 0x31, 0x00, 0x00, 0x00, 0x54, 0xfc, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00,\n", - " 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n", - " 0x01, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n", - " 0x01, 0x00, 0x00, 0x00, 0x9c, 0xd2, 0xb5, 0x3d, 0x01, 0x00, 0x00, 0x00,\n", - " 0x48, 0x18, 0x1f, 0x41, 0x01, 0x00, 0x00, 0x00, 0x4a, 0x21, 0x4b, 0xc1,\n", - " 0xc2, 0xbd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00,\n", - " 0x03, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n", - " 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,\n", - " 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00,\n", - " 0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73, 0x5f, 0x71, 0x75, 0x61, 0x6e,\n", - " 0x74, 0x2f, 0x46, 0x61, 0x6b, 0x65, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57,\n", - " 0x69, 0x74, 0x68, 0x4d, 0x69, 0x6e, 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72,\n", - " 0x73, 0x00, 0x00, 0x00, 0xe4, 0xfc, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n", - " 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n", - " 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n", - " 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x8a, 0x0f, 
0x3b, 0x3a,\n", - " 0x01, 0x00, 0x00, 0x00, 0xfc, 0x0b, 0xb4, 0x3d, 0x01, 0x00, 0x00, 0x00,\n", - " 0xd9, 0x26, 0xbf, 0xbd, 0x80, 0x02, 0x00, 0x00, 0x60, 0x38, 0xab, 0xcb,\n", - " 0xfa, 0x7e, 0xa2, 0x55, 0x6e, 0x87, 0xa5, 0x9b, 0xb4, 0x66, 0x5c, 0x6f,\n", - " 0xae, 0xdb, 0xcd, 0xb6, 0xc2, 0x60, 0xa9, 0x7d, 0xd4, 0xac, 0xa6, 0x90,\n", - " 0x87, 0x6b, 0x50, 0x95, 0xde, 0xcd, 0xaa, 0xa1, 0x9c, 0x65, 0xb5, 0x6d,\n", - " 0xb0, 0xa5, 0xa5, 0x7f, 0x73, 0x95, 0x63, 0x81, 0x7a, 0xc6, 0xaf, 0x82,\n", - " 0x69, 0x89, 0xc3, 0x3c, 0x47, 0x73, 0x89, 0x4f, 0x33, 0xbc, 0x85, 0x5d,\n", - " 0x69, 0x11, 0x5b, 0xb9, 0xf1, 0x95, 0x8f, 0x5c, 0x7c, 0x59, 0x6c, 0xa0,\n", - " 0xa5, 0x7c, 0x5a, 0x7c, 0xb5, 0xa9, 0x7e, 0xa1, 0xb8, 0x65, 0xb3, 0x86,\n", - " 0xc1, 0x9f, 0x5c, 0x86, 0x7f, 0x74, 0x52, 0xa8, 0xc9, 0xc5, 0x71, 0x96,\n", - " 0x7a, 0x65, 0xc7, 0x69, 0x94, 0xa7, 0x65, 0x68, 0x69, 0x8d, 0x6d, 0x9e,\n", - " 0x59, 0xd4, 0x75, 0x7a, 0x4f, 0x70, 0xca, 0x48, 0x25, 0x8a, 0x69, 0x4d,\n", - " 0x2a, 0xa6, 0x76, 0x69, 0x6a, 0x02, 0x3b, 0xa2, 0xea, 0xc2, 0x73, 0x6b,\n", - " 0x86, 0x4d, 0x3a, 0xa2, 0xa2, 0x88, 0x4e, 0x6c, 0xb3, 0x83, 0x39, 0x93,\n", - " 0xa6, 0x85, 0xb8, 0x7a, 0xa8, 0x7d, 0x2e, 0x7b, 0x7f, 0x69, 0x56, 0xb5,\n", - " 0xbb, 0xae, 0x23, 0x78, 0x67, 0x5c, 0xd2, 0x82, 0x7d, 0x96, 0x46, 0x74,\n", - " 0x70, 0x72, 0x6a, 0x90, 0x43, 0xce, 0x44, 0x75, 0x4a, 0x58, 0xc7, 0x5c,\n", - " 0x34, 0x84, 0x46, 0x4b, 0x41, 0x6c, 0x62, 0x83, 0x7e, 0x01, 0x9b, 0x9b,\n", - " 0xeb, 0xf7, 0x58, 0x6f, 0x8a, 0x43, 0xb3, 0x9f, 0x9c, 0x9e, 0x55, 0xa8,\n", - " 0xaa, 0x84, 0x8f, 0x8f, 0xb0, 0x9e, 0xc8, 0x81, 0xb6, 0x80, 0xa0, 0x81,\n", - " 0x86, 0x73, 0x5d, 0xdc, 0xb9, 0xae, 0xa2, 0x6c, 0x46, 0x67, 0xfa, 0x79,\n", - " 0x89, 0xaf, 0xa0, 0x74, 0x76, 0x85, 0x72, 0xb1, 0x2a, 0xbb, 0xa0, 0x6d,\n", - " 0x4f, 0x50, 0xc9, 0x5d, 0x2f, 0xaa, 0x9c, 0x63, 0x3f, 0x59, 0x63, 0x90,\n", - " 0x73, 0x1e, 0xb3, 0x94, 0xcd, 0xff, 0x3c, 0x63, 0x9b, 0x59, 0xc5, 0xa2,\n", - " 0x9f, 0x9a, 0x53, 0xab, 0xb0, 0x74, 0xb2, 0x6f, 0x8a, 0xa7, 0xd5, 0x8d,\n", - " 0xb8, 0x7e, 0x9e, 0x78, 0x84, 0x61, 0x66, 0xe7, 0xa7, 0x9f, 0xb7, 0x45,\n", - " 0x24, 0x61, 0xfd, 0x69, 0x87, 0xb8, 0xb2, 0x7a, 0x7c, 0x58, 0x64, 0xa3,\n", - " 0x07, 0xa9, 0xaf, 0x69, 0x49, 0x2f, 0xc2, 0x46, 0x3b, 0xaf, 0x9a, 0x70,\n", - " 0x6b, 0x25, 0x5f, 0x9d, 0x82, 0x33, 0xa1, 0x54, 0xae, 0xff, 0x31, 0x5d,\n", - " 0xaf, 0x51, 0xb2, 0x82, 0x9c, 0xa9, 0x5b, 0x8c, 0xab, 0x75, 0xb3, 0x32,\n", - " 0x42, 0xbd, 0xcd, 0x77, 0xb6, 0x67, 0x9a, 0x5f, 0x6c, 0x71, 0x6e, 0xc2,\n", - " 0xac, 0x97, 0x9f, 0x4b, 0x21, 0x6a, 0xfc, 0x77, 0x83, 0xa1, 0xa3, 0x6a,\n", - " 0x7a, 0x6d, 0x5e, 0x87, 0x02, 0xa6, 0x8f, 0x7f, 0x5c, 0x2e, 0xc1, 0x51,\n", - " 0x4a, 0xa7, 0x96, 0x79, 0x83, 0x2e, 0x5a, 0x84, 0x82, 0x5c, 0x61, 0x3a,\n", - " 0x4a, 0xff, 0x2a, 0x51, 0xa4, 0x6b, 0x82, 0x5e, 0x67, 0xb3, 0x71, 0x80,\n", - " 0xad, 0x62, 0x59, 0x40, 0x26, 0xd7, 0xcf, 0x68, 0xab, 0x7c, 0x6a, 0x69,\n", - " 0x5b, 0x7c, 0x84, 0xbc, 0x95, 0x68, 0x77, 0x63, 0x3f, 0x85, 0xed, 0x7b,\n", - " 0x71, 0xa0, 0x76, 0x90, 0x8c, 0x6c, 0x61, 0x81, 0x16, 0x74, 0x72, 0x94,\n", - " 0x74, 0x37, 0xb5, 0x3d, 0x55, 0x96, 0x86, 0xad, 0x87, 0x39, 0x59, 0x88,\n", - " 0x5b, 0x65, 0x60, 0x33, 0x33, 0xe6, 0x2b, 0x4a, 0xb6, 0x82, 0x50, 0x56,\n", - " 0x51, 0x97, 0x71, 0x83, 0xa6, 0x60, 0x57, 0x51, 0x58, 0xe4, 0xd0, 0x87,\n", - " 0xa1, 0x78, 0x4c, 0x67, 0x72, 0x74, 0x86, 0xc6, 0x60, 0x47, 0x50, 0x96,\n", - " 0x67, 0x96, 0xdd, 0x7d, 0x63, 0x85, 0x5e, 0x98, 0xa2, 0x64, 0x5f, 0x8a,\n", - " 0x3b, 0x40, 0x54, 0xcb, 0xa0, 0x61, 0xa7, 0x44, 0x5f, 0x6d, 0x57, 0xb3,\n", - " 0xb9, 0x2e, 
0x61, 0x8e, 0x54, 0x78, 0x85, 0x58, 0x43, 0xb0, 0x27, 0x5d,\n", - " 0x8a, 0x7c, 0x8a, 0x58, 0x40, 0x83, 0x82, 0x9b, 0x6c, 0x60, 0x6b, 0x72,\n", - " 0x7f, 0xde, 0xc9, 0x7d, 0x6f, 0x5f, 0x90, 0x7e, 0x7e, 0x7e, 0x8b, 0xe5,\n", - " 0x51, 0x37, 0x7a, 0xa9, 0xa2, 0xc5, 0xd3, 0x81, 0x32, 0x4b, 0x80, 0xa9,\n", - " 0xc5, 0x76, 0x56, 0x99, 0x33, 0x19, 0x72, 0xe6, 0xdb, 0x90, 0xa8, 0x50,\n", - " 0x65, 0x44, 0x77, 0xdb, 0xc7, 0x48, 0x65, 0x8d, 0x3d, 0x7f, 0xa2, 0x7c,\n", - " 0x53, 0x55, 0x26, 0x49, 0x5d, 0x7d, 0xa2, 0x6d, 0x3b, 0x5b, 0x87, 0x64,\n", - " 0x3a, 0x5b, 0x8d, 0x93, 0x7a, 0xb4, 0xca, 0x6d, 0x16, 0x5a, 0x99, 0x82,\n", - " 0x8d, 0x6a, 0x92, 0xa0, 0x39, 0x2c, 0x95, 0xc8, 0xb8, 0xf5, 0xc8, 0x66,\n", - " 0x2a, 0x45, 0x84, 0x9c, 0xc7, 0x8e, 0x61, 0x7b, 0x43, 0x28, 0x86, 0xff,\n", - " 0xd2, 0xc8, 0x9c, 0x46, 0x65, 0x33, 0x82, 0xd8, 0xcb, 0x73, 0x63, 0x80,\n", - " 0xda, 0xc0, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00,\n", - " 0x02, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00,\n", - " 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xa0, 0x0f, 0x00, 0x00,\n", - " 0x31, 0x00, 0x00, 0x00, 0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73, 0x5f,\n", - " 0x71, 0x75, 0x61, 0x6e, 0x74, 0x5f, 0x31, 0x2f, 0x46, 0x61, 0x6b, 0x65,\n", - " 0x51, 0x75, 0x61, 0x6e, 0x74, 0x57, 0x69, 0x74, 0x68, 0x4d, 0x69, 0x6e,\n", - " 0x4d, 0x61, 0x78, 0x56, 0x61, 0x72, 0x73, 0x2f, 0x74, 0x72, 0x61, 0x6e,\n", - " 0x73, 0x70, 0x6f, 0x73, 0x65, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00,\n", - " 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00,\n", - " 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,\n", - " 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00,\n", - " 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x87, 0xff, 0xdb, 0x39,\n", - " 0x01, 0x00, 0x00, 0x00, 0xd8, 0xb2, 0x5d, 0x3d, 0x01, 0x00, 0x00, 0x00,\n", - " 0x37, 0xdc, 0x56, 0xbd, 0x80, 0x3e, 0x00, 0x00, 0x67, 0x6d, 0x74, 0x77,\n", - " 0x35, 0x66, 0x87, 0x95, 0x8e, 0x82, 0x5e, 0x70, 0x6e, 0xa7, 0x60, 0x64,\n", - " 0x86, 0x5e, 0x93, 0x7a, 0x76, 0x74, 0x71, 0x8c, 0x61, 0x71, 0x60, 0x8b,\n", - " 0x83, 0x48, 0x8b, 0x5f, 0x95, 0x99, 0x5b, 0x59, 0x49, 0x44, 0x79, 0x62,\n", - " 0x8e, 0x77, 0x71, 0x89, 0x64, 0x46, 0x8f, 0x8e, 0x80, 0x73, 0x71, 0x81,\n", - " 0x85, 0x4a, 0x73, 0x57, 0x66, 0x58, 0x75, 0x93, 0x99, 0x58, 0x8a, 0x7b,\n", - " 0x87, 0x81, 0xa1, 0x46, 0x79, 0x6c, 0x83, 0x7a, 0x92, 0x74, 0x6f, 0x6b,\n", - " 0x79, 0x77, 0x97, 0x8a, 0x95, 0x75, 0xa2, 0x49, 0x80, 0x4e, 0x7f, 0x6d,\n", - " 0xaa, 0xac, 0x6c, 0x5d, 0x57, 0x82, 0x97, 0x77, 0x6f, 0x75, 0x95, 0x73,\n", - " 0x7e, 0x51, 0x9f, 0x5b, 0x54, 0x92, 0x60, 0x72, 0x80, 0x6a, 0x92, 0x83,\n", - " 0x9b, 0x85, 0x7b, 0x4d, 0x55, 0x4d, 0xb2, 0x7d, 0x65, 0x95, 0x76, 0x42,\n", - " 0x61, 0x49, 0xa2, 0x73, 0x9f, 0x7d, 0x7c, 0x54, 0x51, 0x76, 0xa1, 0x7f,\n", - " 0x86, 0x69, 0x98, 0x59, 0x6d, 0x84, 0x9f, 0x7b, 0x86, 0x79, 0x88, 0x55,\n", - " 0x9c, 0x72, 0x95, 0x8a, 0x91, 0x7a, 0x77, 0x95, 0x7b, 0x87, 0x87, 0x85,\n", - " 0x95, 0x72, 0x77, 0x59, 0x7c, 0x80, 0x90, 0x8f, 0x8a, 0x62, 0x76, 0x9f,\n", - " 0x64, 0x84, 0x71, 0x7e, 0x7c, 0x66, 0x8e, 0x94, 0x6e, 0xaa, 0x77, 0x5c,\n", - " 0x6b, 0x63, 0x68, 0x82, 0x89, 0x46, 0x61, 0x74, 0x8e, 0x85, 0x6b, 0x57,\n", - " 0x74, 0x50, 0x87, 0x66, 0x87, 0x98, 0x59, 0x7d, 0xa2, 0x59, 0x75, 0x64,\n", - " 0x72, 0x8c, 0x6a, 0x92, 0x8c, 0x56, 0x88, 0x7a, 0x6e, 0x77, 0x9c, 0x82,\n", - " 0x7e, 0x5a, 0x91, 0x80, 0x9c, 0x9e, 0x60, 0x8b, 0x6d, 0x76, 0x8d, 0x68,\n", - " 0x6c, 0x70, 0x6f, 0x8b, 0x61, 0x6e, 0x86, 
0x78, 0x81, 0x81, 0x77, 0x79,\n", - " 0x76, 0x69, 0x7d, 0x7b, 0x96, 0x8b, 0x95, 0x91, 0xa2, 0x7b, 0x86, 0x8d,\n", - " 0x8b, 0x89, 0x86, 0x5a, 0x5c, 0x4d, 0x96, 0x80, 0x81, 0x55, 0x80, 0x80,\n", - " 0x7a, 0x76, 0x99, 0x98, 0x61, 0x95, 0x5a, 0x78, 0x5a, 0x6c, 0x89, 0x81,\n", - " 0x98, 0x77, 0x62, 0x77, 0x93, 0x4d, 0x9f, 0x77, 0x72, 0x87, 0x95, 0x71,\n", - " 0x65, 0x72, 0xac, 0x8c, 0xa2, 0x89, 0x90, 0x7b, 0x67, 0x60, 0x8a, 0xb3,\n", - " 0x72, 0x8f, 0x5c, 0x82, 0x74, 0x76, 0x7c, 0x85, 0x78, 0x6b, 0x97, 0x6d,\n", - " 0x86, 0x82, 0x76, 0x84, 0x89, 0x89, 0x7f, 0x6a, 0x7a, 0x7f, 0x6c, 0x77,\n", - " 0x80, 0x35, 0x7d, 0x66, 0x96, 0x7e, 0x88, 0x55, 0x6b, 0x55, 0x7c, 0xa7,\n", - " 0x7f, 0x9f, 0x64, 0x8b, 0xa0, 0x81, 0x80, 0x97, 0xaf, 0x7a, 0x7d, 0x61,\n", - " 0x7a, 0x77, 0x6f, 0x8c, 0x5e, 0x69, 0x6b, 0x94, 0x70, 0x6a, 0x66, 0x5d,\n", - " 0x78, 0x6e, 0x76, 0x64, 0xa0, 0x73, 0x8f, 0xa2, 0x9d, 0x50, 0x8e, 0x52,\n", - " 0x51, 0x85, 0x78, 0x83, 0x8f, 0x94, 0x83, 0x7c, 0x9c, 0x64, 0x59, 0x7d,\n", - " 0x66, 0x6a, 0x73, 0x80, 0x6a, 0x9b, 0x92, 0x7e, 0x7a, 0x78, 0x7d, 0xa0,\n", - " 0x8a, 0x9b, 0x61, 0x9e, 0x6c, 0x64, 0x6c, 0x8e, 0x86, 0x75, 0x8a, 0x95,\n", - " 0x8e, 0x89, 0x87, 0x8a, 0x5d, 0x8b, 0x82, 0x7c, 0x60, 0x63, 0x85, 0x85,\n", - " 0x63, 0x96, 0xa3, 0x7f, 0x93, 0x78, 0x8c, 0x86, 0x7b, 0x78, 0x8e, 0x71,\n", - " 0x72, 0x8b, 0x8a, 0x5e, 0x8d, 0x75, 0x78, 0xa3, 0x84, 0x67, 0xa7, 0x54,\n", - " 0x6c, 0x80, 0x8e, 0xa8, 0x83, 0x51, 0x6e, 0x9f, 0x8b, 0x86, 0x75, 0x95,\n", - " 0x7f, 0x7a, 0x80, 0x81, 0x8d, 0x9c, 0x83, 0x8a, 0x7b, 0x8a, 0x74, 0x6f,\n", - " 0x8d, 0x96, 0x5b, 0x9c, 0x8d, 0x7b, 0x83, 0x79, 0x7f, 0x65, 0x7e, 0x87,\n", - " 0x7c, 0x5d, 0x71, 0x97, 0x77, 0x44, 0x9a, 0x7f, 0xaa, 0x56, 0x75, 0x5f,\n", - " 0x7c, 0x51, 0x8c, 0x90, 0x84, 0x9a, 0x49, 0x5d, 0x86, 0x52, 0x94, 0x95,\n", - " 0x5b, 0x86, 0x66, 0x7d, 0x51, 0x4f, 0x7a, 0x91, 0x6d, 0x6e, 0x72, 0x70,\n", - " 0x83, 0x4f, 0x9b, 0x9a, 0x8a, 0x77, 0x6a, 0xa1, 0x71, 0x60, 0x61, 0x98,\n", - " 0x67, 0x4e, 0x7a, 0x8a, 0x53, 0x6b, 0x99, 0xa0, 0x91, 0x46, 0x8a, 0x8b,\n", - " 0x47, 0x78, 0xa9, 0x7b, 0x71, 0x6c, 0x81, 0x68, 0x53, 0x73, 0xaf, 0x70,\n", - " 0x62, 0x6d, 0x69, 0x97, 0x70, 0x83, 0x5f, 0x7f, 0x81, 0x87, 0x65, 0x93,\n", - " 0x67, 0x87, 0x70, 0x82, 0x79, 0x9e, 0x80, 0x77, 0x6c, 0x80, 0x92, 0x81,\n", - " 0x8d, 0x8c, 0x89, 0x8b, 0x4e, 0x91, 0x77, 0x84, 0x99, 0x8c, 0x71, 0x88,\n", - " 0x57, 0x7a, 0x9a, 0x8c, 0x82, 0x9b, 0x97, 0x72, 0x69, 0xac, 0x7c, 0x62,\n", - " 0x85, 0x7d, 0x76, 0x7f, 0x59, 0x85, 0x68, 0x63, 0x94, 0x8b, 0x7b, 0x92,\n", - " 0x7b, 0x6f, 0x77, 0x98, 0x66, 0x78, 0x74, 0x99, 0x85, 0x8c, 0x94, 0x89,\n", - " 0x6c, 0x77, 0x89, 0x80, 0x79, 0x8a, 0xa6, 0x95, 0xa9, 0x86, 0x6f, 0x95,\n", - " 0x90, 0x69, 0x98, 0x85, 0xa0, 0x7f, 0x56, 0xab, 0x6f, 0x5a, 0x94, 0x8b,\n", - " 0x5a, 0x72, 0x61, 0x83, 0x54, 0x70, 0x8d, 0x8d, 0x9c, 0x5e, 0x36, 0x9b,\n", - " 0x84, 0x32, 0x6e, 0x84, 0x79, 0x72, 0x64, 0x95, 0x83, 0x58, 0x67, 0x6c,\n", - " 0x9e, 0x8d, 0x6e, 0x9e, 0x4f, 0x78, 0x71, 0x85, 0x75, 0x60, 0x4d, 0x7d,\n", - " 0x64, 0x89, 0x8e, 0x89, 0x6e, 0x92, 0x53, 0x7c, 0x86, 0x8f, 0xa9, 0xb0,\n", - " 0x8e, 0x5e, 0x76, 0x96, 0x65, 0x7c, 0x8a, 0x89, 0x75, 0x8f, 0x65, 0x94,\n", - " 0x6c, 0x6c, 0x8d, 0x6d, 0x66, 0x6a, 0x62, 0x98, 0x53, 0x8f, 0x67, 0x76,\n", - " 0x80, 0x89, 0x66, 0x60, 0x55, 0x81, 0x85, 0x61, 0x75, 0x78, 0x80, 0x92,\n", - " 0x6f, 0x79, 0x66, 0x64, 0x99, 0xa7, 0x88, 0xa1, 0x86, 0x6b, 0x94, 0x88,\n", - " 0x77, 0x83, 0x8f, 0x61, 0x72, 0x7c, 0x6f, 0x8f, 0x61, 0x56, 0x8a, 0x7b,\n", - " 0x66, 0x8b, 0x98, 0x9d, 0x82, 0x65, 0x77, 0x98, 0x55, 0x83, 0x7a, 0x8c,\n", 
- " 0x74, 0x79, 0x6e, 0x85, 0x82, 0x9a, 0x7d, 0x8d, 0x76, 0x72, 0x64, 0x81,\n", - " 0x9a, 0x8d, 0x9f, 0x7b, 0x7c, 0x7b, 0x7b, 0x84, 0x90, 0x6b, 0xa4, 0x84,\n", - " 0x98, 0x6f, 0x81, 0xb8, 0x6f, 0x6c, 0x87, 0x6d, 0x8c, 0x72, 0x53, 0x85,\n", - " 0x59, 0x4d, 0x9c, 0x94, 0x7d, 0x6f, 0x4f, 0x82, 0x5d, 0x71, 0x6e, 0x78,\n", - " 0x61, 0x61, 0x34, 0x71, 0x6a, 0x5a, 0x73, 0xa3, 0x89, 0x65, 0x4d, 0x80,\n", - " 0x5c, 0x51, 0x81, 0x8e, 0x6c, 0x53, 0x4a, 0x95, 0x3b, 0x72, 0xa7, 0x86,\n", - " 0x7f, 0x75, 0x61, 0xa3, 0x85, 0x6c, 0x99, 0x88, 0x7c, 0x64, 0x7a, 0x8d,\n", - " 0x81, 0x7b, 0x6a, 0x7b, 0x8f, 0x74, 0x6d, 0xae, 0x42, 0x67, 0x88, 0xa1,\n", - " 0x90, 0x4d, 0x7c, 0x7b, 0x62, 0x55, 0x9a, 0x80, 0x4d, 0x76, 0x5c, 0x88,\n", - " 0x60, 0x86, 0x6f, 0x65, 0x67, 0x77, 0x8a, 0x97, 0x99, 0x7c, 0x89, 0x78,\n", - " 0x92, 0xa7, 0x6a, 0x7f, 0x8e, 0x88, 0x9d, 0xa1, 0x7b, 0xb0, 0x69, 0x8c,\n", - " 0x7e, 0x51, 0x76, 0x84, 0x7d, 0x91, 0x7a, 0x88, 0x7b, 0x88, 0x92, 0x79,\n", - " 0x6d, 0x82, 0x6c, 0x8a, 0x99, 0x62, 0x82, 0x9d, 0x99, 0x97, 0x78, 0x6a,\n", - " 0x6e, 0x83, 0x64, 0x7d, 0x8c, 0x78, 0x7c, 0x7a, 0x7d, 0x7b, 0x77, 0x84,\n", - " 0x76, 0x57, 0x63, 0x85, 0x97, 0x94, 0x80, 0x92, 0x88, 0x73, 0x91, 0x91,\n", - " 0x8f, 0x6d, 0x99, 0x86, 0x91, 0x7f, 0x8b, 0x87, 0x98, 0x62, 0x84, 0x70,\n", - " 0x97, 0x7b, 0x2e, 0x9b, 0x6e, 0x2a, 0xa4, 0x9c, 0x79, 0x88, 0x54, 0x81,\n", - " 0x4f, 0x41, 0xa0, 0x85, 0xaf, 0x9a, 0x47, 0x5a, 0x7d, 0x62, 0x7a, 0x84,\n", - " 0x81, 0x6e, 0x41, 0xb4, 0x60, 0x47, 0x8f, 0x98, 0x6c, 0x3c, 0x3b, 0x73,\n", - " 0x59, 0x55, 0x7c, 0xb0, 0x6e, 0x5f, 0x61, 0x97, 0x73, 0x59, 0x9f, 0x92,\n", - " 0x89, 0x5c, 0x70, 0x96, 0x5c, 0x7c, 0x7c, 0x64, 0x7e, 0x54, 0x5c, 0x94,\n", - " 0x56, 0x73, 0x8d, 0x95, 0x59, 0x83, 0x6c, 0x99, 0x6e, 0x5e, 0x7a, 0x99,\n", - " 0x83, 0x93, 0x88, 0x76, 0x5a, 0x5a, 0xa5, 0x95, 0x5d, 0x63, 0x8f, 0x6e,\n", - " 0x74, 0x65, 0x85, 0x86, 0x98, 0x83, 0x7b, 0x8a, 0x5c, 0x5e, 0x7f, 0x88,\n", - " 0x78, 0x68, 0x8f, 0x9f, 0x94, 0x8d, 0x74, 0x7b, 0x6a, 0x91, 0x7a, 0x9a,\n", - " 0x70, 0x67, 0xb2, 0x92, 0x75, 0x4e, 0x74, 0xa3, 0x68, 0x74, 0x91, 0x80,\n", - " 0x55, 0x8e, 0x88, 0x73, 0x70, 0x81, 0xa1, 0xb8, 0x96, 0x48, 0x67, 0xb2,\n", - " 0x76, 0xa1, 0x98, 0xa9, 0x61, 0x6c, 0x5f, 0x98, 0x84, 0x92, 0xa9, 0x83,\n", - " 0x9e, 0x74, 0x7b, 0xa2, 0x6f, 0x72, 0x95, 0xa3, 0xb9, 0x80, 0x81, 0x7b,\n", - " 0x65, 0x6b, 0x96, 0x8b, 0xae, 0x79, 0x2b, 0x86, 0x5c, 0x2c, 0x8b, 0xa3,\n", - " 0x84, 0x74, 0x53, 0x7c, 0x54, 0x4a, 0x65, 0x89, 0xa6, 0x89, 0x47, 0x77,\n", - " 0x50, 0x6d, 0x8b, 0x94, 0x8a, 0x61, 0x32, 0x7c, 0x6f, 0x47, 0x78, 0xa2,\n", - " 0x9f, 0x42, 0x42, 0x71, 0x78, 0x76, 0x9e, 0x88, 0x70, 0x70, 0x56, 0x8a,\n", - " 0x83, 0x95, 0xa7, 0x9d, 0x9d, 0x88, 0x9a, 0x92, 0x48, 0x63, 0xaf, 0x91,\n", - " 0x6c, 0x75, 0x5d, 0x5e, 0x83, 0x86, 0xaa, 0x6f, 0x79, 0x84, 0x67, 0x79,\n", - " 0x63, 0x69, 0x8e, 0x81, 0x6a, 0x96, 0x8d, 0x86, 0x7b, 0x9f, 0xaa, 0x8e,\n", - " 0x63, 0x89, 0x9a, 0x7a, 0x5e, 0x7c, 0x87, 0x83, 0x81, 0x64, 0x7e, 0x59,\n", - " 0x6d, 0x5c, 0xa4, 0x72, 0x78, 0x85, 0x9b, 0x79, 0x85, 0x7d, 0x9c, 0x7d,\n", - " 0x9c, 0x5c, 0x66, 0x75, 0x66, 0x72, 0xb4, 0x7c, 0x83, 0x9e, 0x90, 0xae,\n", - " 0x69, 0x71, 0xb0, 0x84, 0x86, 0x50, 0x66, 0xab, 0x75, 0x96, 0xa8, 0x6c,\n", - " 0x87, 0x7b, 0x7e, 0x7c, 0x60, 0x55, 0x96, 0xb0, 0x6a, 0x79, 0x42, 0x9c,\n", - " 0x97, 0xa8, 0xb2, 0x9a, 0xa0, 0x84, 0x68, 0x90, 0x90, 0x98, 0x67, 0x9c,\n", - " 0xa3, 0x81, 0x71, 0xaa, 0x93, 0x6a, 0x84, 0x8c, 0x77, 0x79, 0x4d, 0x82,\n", - " 0x45, 0x1e, 0x7b, 0x94, 0x86, 0x86, 0x26, 0x82, 0x41, 0x6f, 0x8b, 0x86,\n", - " 0xa4, 0x80, 0x38, 0x71, 0x5e, 
0x5b, 0x9a, 0x73, 0x86, 0x60, 0x5a, 0x9d,\n", - " 0x7b, 0x53, 0x89, 0xa0, 0x99, 0x76, 0x57, 0x81, 0x76, 0x5a, 0x9e, 0x85,\n", - " 0x5a, 0x7b, 0x56, 0x74, 0x71, 0x6a, 0x9c, 0x68, 0x7e, 0x76, 0x7d, 0x7f,\n", - " 0x52, 0x71, 0x85, 0xa2, 0x96, 0x63, 0x73, 0x7c, 0x7a, 0x97, 0x9f, 0x7c,\n", - " 0x77, 0x77, 0x59, 0x6b, 0x62, 0x77, 0xbc, 0x6b, 0x7c, 0x79, 0x75, 0x90,\n", - " 0x67, 0x82, 0x92, 0x9c, 0x81, 0x92, 0x84, 0x7a, 0x72, 0x5b, 0x86, 0x82,\n", - " 0x87, 0x73, 0x87, 0x7c, 0x57, 0x76, 0xa6, 0x7d, 0x7d, 0x94, 0x6a, 0x67,\n", - " 0x76, 0x89, 0x9a, 0x6d, 0x7d, 0xa4, 0x6d, 0x7e, 0x74, 0x7e, 0x8f, 0xad,\n", - " 0x99, 0x55, 0x5c, 0x82, 0x75, 0x9e, 0xae, 0x76, 0x6b, 0x93, 0x5d, 0x92,\n", - " 0x6e, 0x54, 0x88, 0x8f, 0x6a, 0x72, 0x64, 0x93, 0x6e, 0x63, 0x8c, 0xa7,\n", - " 0xa6, 0x7a, 0x57, 0x9f, 0x94, 0x91, 0xbd, 0xa4, 0x92, 0x7a, 0x68, 0x9d,\n", - " 0x7d, 0x6b, 0x6b, 0xbc, 0xad, 0x7a, 0x73, 0x92, 0x7b, 0x6d, 0x91, 0x6a,\n", - " 0x66, 0x8d, 0x34, 0x9b, 0x75, 0x3b, 0x93, 0x78, 0x88, 0x58, 0x1a, 0x7f,\n", - " 0x52, 0x61, 0xa3, 0xb1, 0x9c, 0x60, 0x1d, 0x90, 0x7b, 0x37, 0x9f, 0x84,\n", - " 0xa3, 0x6c, 0x2e, 0xac, 0x73, 0x62, 0x92, 0x9a, 0x94, 0x6b, 0x5c, 0x82,\n", - " 0x5f, 0x4c, 0x9a, 0x8c, 0x76, 0x69, 0x77, 0x5f, 0x5d, 0x91, 0x80, 0x9a,\n", - " 0x60, 0x4c, 0x7b, 0x57, 0x67, 0x6b, 0x92, 0x93, 0x64, 0x91, 0x55, 0x75,\n", - " 0x41, 0x82, 0x78, 0x68, 0xa2, 0x55, 0x6a, 0x69, 0x59, 0x70, 0x8a, 0x7b,\n", - " 0x70, 0x6e, 0x63, 0x83, 0x7f, 0xa4, 0x80, 0x85, 0x86, 0x93, 0x7e, 0x6f,\n", - " 0x7b, 0x94, 0xa4, 0xa7, 0x97, 0x7a, 0x87, 0x64, 0x4a, 0x97, 0x94, 0x6a,\n", - " 0x96, 0x73, 0x5e, 0x79, 0x6a, 0x99, 0x86, 0xa0, 0x93, 0xac, 0x79, 0x76,\n", - " 0x7f, 0x7b, 0xa7, 0x75, 0x8a, 0x71, 0x53, 0x87, 0x93, 0x7f, 0x9e, 0x7b,\n", - " 0x81, 0x70, 0x68, 0x8b, 0x8c, 0x9c, 0xaf, 0xa7, 0x6a, 0x9b, 0x49, 0x6d,\n", - " 0x67, 0x80, 0x8b, 0x86, 0x9f, 0x80, 0x74, 0x7a, 0x96, 0x74, 0xc8, 0x9d,\n", - " 0xa4, 0x74, 0x71, 0x6c, 0x75, 0x6a, 0x9a, 0x95, 0x97, 0x8c, 0x6e, 0x8a,\n", - " 0x85, 0x62, 0x5f, 0x7e, 0x9e, 0x6b, 0x48, 0x93, 0x44, 0x37, 0x83, 0xa2,\n", - " 0x97, 0x72, 0x25, 0x79, 0x32, 0x39, 0x68, 0x8f, 0x93, 0x61, 0x2b, 0x96,\n", - " 0x94, 0x43, 0x82, 0x6e, 0x8f, 0x6d, 0x53, 0x9b, 0x65, 0x50, 0x70, 0x9d,\n", - " 0x7d, 0x53, 0x3b, 0x86, 0x77, 0x6c, 0xa6, 0x90, 0x6b, 0x3e, 0x7b, 0x7a,\n", - " 0x50, 0x81, 0xb4, 0x76, 0xa5, 0x74, 0x8b, 0x73, 0x79, 0x69, 0xa8, 0x9a,\n", - " 0x82, 0x4a, 0x5e, 0x6c, 0x8d, 0x66, 0xa3, 0x80, 0x8d, 0x74, 0x5b, 0x7c,\n", - " 0x77, 0xaa, 0x82, 0x69, 0x5e, 0x7d, 0x7f, 0x63, 0xa3, 0x8c, 0xb3, 0x9a,\n", - " 0x81, 0x8f, 0x7b, 0x77, 0x60, 0x89, 0x6a, 0x82, 0x5a, 0x7a, 0x71, 0x61,\n", - " 0x93, 0x73, 0x8b, 0xb0, 0xa2, 0x92, 0x7c, 0x84, 0x8b, 0x72, 0x91, 0x8d,\n", - " 0x91, 0x80, 0x6c, 0x75, 0x7a, 0xb3, 0x95, 0x5e, 0xa5, 0x5d, 0x54, 0x8b,\n", - " 0x63, 0x91, 0xa7, 0x68, 0x96, 0x4c, 0x5a, 0x86, 0x76, 0x82, 0xb6, 0xa0,\n", - " 0x68, 0x6b, 0x53, 0x76, 0x60, 0x65, 0x90, 0xaf, 0x82, 0x66, 0x80, 0x7b,\n", - " 0x84, 0xa0, 0xb0, 0xb8, 0x81, 0x6e, 0x81, 0x8a, 0x74, 0x6e, 0x97, 0xa8,\n", - " 0x89, 0x7b, 0x7b, 0x6e, 0x63, 0x74, 0x5a, 0x7b, 0x7e, 0x84, 0x40, 0x95,\n", - " 0x73, 0x3c, 0x7c, 0x72, 0x9b, 0x92, 0x27, 0x87, 0x69, 0x5b, 0x99, 0x8a,\n", - " 0xa8, 0x65, 0x36, 0x8f, 0x86, 0x3e, 0xa1, 0x79, 0x9f, 0x4d, 0x41, 0xc5,\n", - " 0x8c, 0x6a, 0x7e, 0x7f, 0x68, 0x49, 0x5c, 0x91, 0x50, 0x6a, 0x8c, 0x81,\n", - " 0x75, 0x4c, 0x6a, 0x74, 0x8a, 0x87, 0xa0, 0x93, 0x7e, 0x6d, 0x52, 0x79,\n", - " 0x86, 0x6a, 0x68, 0x6c, 0x83, 0x67, 0x79, 0x73, 0x6f, 0x72, 0x97, 0x84,\n", - " 0x8b, 0x78, 0x64, 0x69, 0x8f, 0x92, 0x86, 0x61, 0x5d, 0x85, 
0x70, 0x64,\n", - " 0x7d, 0xa3, 0x92, 0xa0, 0x72, 0x71, 0x5d, 0x63, 0x7c, 0x70, 0xaf, 0x6f,\n", - " 0x93, 0x6a, 0x7e, 0x7f, 0x64, 0xab, 0x85, 0x73, 0x8f, 0x8a, 0x7e, 0x5f,\n", - " 0x7a, 0x6f, 0xaa, 0x71, 0x97, 0x7d, 0x60, 0x7c, 0x48, 0x69, 0xa9, 0xaa,\n", - " 0x98, 0x7c, 0x61, 0x85, 0x66, 0x97, 0xa2, 0x73, 0x74, 0x65, 0x52, 0x67,\n", - " 0x79, 0x8a, 0x79, 0x71, 0x85, 0x6e, 0x6d, 0x67, 0x5e, 0x7f, 0xb9, 0x93,\n", - " 0x96, 0x53, 0x69, 0x6e, 0x7f, 0x8f, 0xab, 0x93, 0xa9, 0x70, 0x6e, 0x71,\n", - " 0x7e, 0x87, 0x98, 0x7a, 0xae, 0x90, 0x64, 0x88, 0x8a, 0x4f, 0x6d, 0x9e,\n", - " 0xac, 0x7e, 0x31, 0x92, 0x50, 0x26, 0x95, 0xb2, 0x90, 0x99, 0x0c, 0x84,\n", - " 0x40, 0x4f, 0x8f, 0x76, 0xa4, 0x46, 0x4c, 0x9d, 0x8b, 0x57, 0x81, 0x79,\n", - " 0x7b, 0x47, 0x4d, 0x9c, 0x5f, 0x3b, 0x6f, 0x90, 0x7a, 0x3f, 0x66, 0x9d,\n", - " 0x6c, 0x45, 0x8b, 0x71, 0x79, 0x62, 0x72, 0x78, 0x93, 0x95, 0x7e, 0x86,\n", - " 0x7a, 0x6b, 0x77, 0x74, 0x6b, 0x86, 0xa4, 0x7e, 0x84, 0x48, 0x78, 0x75,\n", - " 0x6e, 0x8b, 0x8e, 0x56, 0x69, 0x7b, 0x59, 0x68, 0x5d, 0x77, 0x69, 0x66,\n", - " 0x67, 0x9f, 0x75, 0x7b, 0x76, 0x64, 0xc1, 0x78, 0x7d, 0x74, 0x82, 0x73,\n", - " 0x73, 0x90, 0xb8, 0x82, 0x7e, 0x70, 0x7b, 0x7a, 0x64, 0xa1, 0x7e, 0x85,\n", - " 0x83, 0x81, 0x60, 0x7b, 0x91, 0x82, 0x6f, 0x95, 0xa0, 0x86, 0x6d, 0x88,\n", - " 0x75, 0x8d, 0x94, 0x90, 0x76, 0x6d, 0x6e, 0x79, 0x64, 0x74, 0xa8, 0xb1,\n", - " 0x92, 0x6e, 0x61, 0x79, 0x74, 0x91, 0x95, 0x74, 0x65, 0x74, 0x5e, 0x7f,\n", - " 0x8b, 0x60, 0x9b, 0x9f, 0x74, 0x77, 0x4c, 0x66, 0x7c, 0x80, 0x97, 0x98,\n", - " 0x9d, 0x86, 0x55, 0x8a, 0x8a, 0x79, 0x8c, 0x82, 0xb0, 0x7d, 0x63, 0x8c,\n", - " 0x5d, 0x5b, 0x82, 0x58, 0x84, 0x56, 0x51, 0x92, 0x75, 0x24, 0x97, 0x92,\n", - " 0x75, 0x6e, 0x19, 0x8e, 0x47, 0x3e, 0x7b, 0x7b, 0x87, 0x6b, 0x3f, 0xa9,\n", - " 0x59, 0x40, 0x86, 0x74, 0x69, 0x4a, 0x2d, 0xad, 0x91, 0x62, 0xb2, 0xa9,\n", - " 0x74, 0x6c, 0x47, 0x94, 0x51, 0x75, 0xb2, 0x6f, 0x75, 0x4b, 0x60, 0xa2,\n", - " 0x8e, 0x6a, 0xa4, 0x79, 0x6f, 0x57, 0x80, 0x8c, 0x6c, 0x8e, 0x9e, 0x74,\n", - " 0x70, 0x5f, 0x66, 0x80, 0x80, 0x89, 0xb5, 0x8a, 0x7a, 0x96, 0x87, 0x7a,\n", - " 0x7b, 0x85, 0x90, 0x79, 0x59, 0x6d, 0x77, 0x8c, 0x8f, 0x82, 0xb3, 0x9c,\n", - " 0x6a, 0x6a, 0x6b, 0x70, 0x77, 0x89, 0x96, 0x86, 0x94, 0x72, 0x7e, 0x72,\n", - " 0xa9, 0x93, 0x8d, 0x7a, 0x6d, 0x8f, 0x66, 0x72, 0x9a, 0x91, 0x9e, 0x98,\n", - " 0xa0, 0x8b, 0x50, 0x76, 0x5c, 0x74, 0xbc, 0x9a, 0x98, 0x73, 0x80, 0x7d,\n", - " 0x73, 0x7c, 0xc0, 0x8b, 0x86, 0x7a, 0x66, 0x86, 0x83, 0x72, 0x8f, 0x96,\n", - " 0x98, 0x56, 0x45, 0x7b, 0x77, 0x92, 0xac, 0x8a, 0xae, 0x43, 0x33, 0x73,\n", - " 0x78, 0x83, 0x98, 0x84, 0x86, 0x78, 0x54, 0x7e, 0x70, 0x5f, 0xa6, 0xa1,\n", - " 0x94, 0x81, 0x73, 0x8d, 0x83, 0x5b, 0x88, 0x71, 0xb2, 0x91, 0x50, 0x99,\n", - " 0x6b, 0x47, 0x72, 0x92, 0x87, 0x6d, 0x07, 0x99, 0x57, 0x3d, 0x8d, 0x83,\n", - " 0x9d, 0x49, 0x40, 0x9d, 0x5c, 0x57, 0x95, 0x73, 0x6e, 0x4b, 0x49, 0xab,\n", - " 0x97, 0x58, 0x8b, 0x7a, 0x7a, 0x48, 0x47, 0x8b, 0x7e, 0x5d, 0xa9, 0x6d,\n", - " 0x8a, 0x3f, 0x60, 0x82, 0x86, 0x98, 0xa9, 0x7c, 0x74, 0x59, 0x9b, 0x80,\n", - " 0x4e, 0x75, 0x9c, 0x5e, 0x75, 0x8c, 0x67, 0x7e, 0x78, 0x75, 0x87, 0x6c,\n", - " 0x79, 0x73, 0x63, 0x77, 0x6e, 0x7a, 0x8d, 0x73, 0x4e, 0x72, 0x4a, 0x7c,\n", - " 0x8f, 0x79, 0x70, 0x7a, 0x70, 0x73, 0x7b, 0x7a, 0x62, 0xa1, 0x7b, 0x63,\n", - " 0x9a, 0x89, 0x76, 0x64, 0x84, 0x7d, 0x9c, 0x94, 0xb0, 0x7f, 0x6c, 0x7b,\n", - " 0x8d, 0x89, 0x89, 0x7b, 0x9d, 0x99, 0x64, 0x8b, 0x5c, 0x88, 0xa6, 0x8e,\n", - " 0x81, 0x86, 0x7e, 0x85, 0x73, 0x72, 0xad, 0x5d, 0x5f, 0x7e, 0x63, 0x74,\n", - " 0x64, 0xa1, 
0x9c, 0x83, 0x7c, 0x83, 0x7b, 0x7b, 0x71, 0xa0, 0x9e, 0xaf,\n", - " 0x89, 0x79, 0x4c, 0x7c, 0x8c, 0x78, 0x91, 0x87, 0x8a, 0x87, 0x5e, 0x85,\n", - " 0x7b, 0x61, 0x9c, 0x88, 0xa5, 0x8d, 0x7c, 0x9c, 0x6b, 0x47, 0x95, 0x85,\n", - " 0x81, 0x80, 0x59, 0xb2, 0x4f, 0x3d, 0xae, 0x8c, 0x8d, 0x71, 0x11, 0x95,\n", - " 0x31, 0x65, 0x9d, 0xa0, 0x8e, 0x64, 0x42, 0xb9, 0x6a, 0x5c, 0x91, 0x82,\n", - " 0x91, 0x50, 0x33, 0xb2, 0x7a, 0x54, 0xac, 0x88, 0x92, 0x61, 0x4e, 0xad,\n", - " 0x65, 0x5c, 0x91, 0xb0, 0x72, 0x65, 0x4a, 0x79, 0x68, 0x77, 0x75, 0x5f,\n", - " 0x79, 0x6d, 0x6f, 0x7c, 0x4d, 0x71, 0xb8, 0x78, 0x8a, 0x87, 0x6e, 0x72,\n", - " 0x7d, 0x79, 0x87, 0x80, 0x5a, 0x78, 0x77, 0x78, 0x80, 0x8f, 0x8c, 0x56,\n", - " 0x7a, 0x8b, 0x62, 0x82, 0x5a, 0x96, 0x82, 0x68, 0x71, 0x5d, 0x75, 0x65,\n", - " 0x93, 0xb5, 0x71, 0x82, 0x82, 0x8a, 0x4b, 0x7c, 0x62, 0x6f, 0xc1, 0x86,\n", - " 0x9d, 0x90, 0x63, 0x71, 0x86, 0x9e, 0x9f, 0x77, 0x90, 0x97, 0x68, 0x81,\n", - " 0x5a, 0x8c, 0xab, 0x5e, 0x81, 0x76, 0x83, 0x79, 0x8f, 0xa1, 0x89, 0x79,\n", - " 0x81, 0x8a, 0x7e, 0x6c, 0x65, 0x79, 0xc7, 0x89, 0x92, 0x68, 0x78, 0x70,\n", - " 0x65, 0x96, 0x9e, 0x82, 0x7d, 0x5f, 0x7b, 0x77, 0x72, 0x84, 0x7e, 0x92,\n", - " 0x97, 0x7b, 0x6e, 0x67, 0x81, 0xa1, 0x9a, 0xab, 0x8d, 0x78, 0x61, 0x78,\n", - " 0x52, 0x66, 0xaa, 0x77, 0x75, 0xa3, 0x5e, 0xa0, 0x51, 0x40, 0x68, 0xb0,\n", - " 0x9a, 0x93, 0x11, 0x82, 0x69, 0x48, 0x9c, 0x77, 0x8d, 0x62, 0x36, 0xac,\n", - " 0x6c, 0x4c, 0xa3, 0xab, 0x8f, 0x32, 0x4f, 0xa9, 0x80, 0x68, 0xab, 0x7a,\n", - " 0x90, 0x61, 0x5c, 0xa5, 0x84, 0x4c, 0x8c, 0x7a, 0x95, 0x54, 0x72, 0xa0,\n", - " 0x66, 0x85, 0xb3, 0x91, 0x69, 0x64, 0x68, 0x56, 0x66, 0x8d, 0xa0, 0x9f,\n", - " 0x7a, 0x88, 0x5d, 0x7d, 0x48, 0x80, 0x7f, 0x7c, 0x7c, 0x99, 0x65, 0x81,\n", - " 0x73, 0x8b, 0x8c, 0x61, 0x44, 0x60, 0x53, 0x8e, 0x64, 0x80, 0x9c, 0x74,\n", - " 0x5d, 0x70, 0x8f, 0x5a, 0x68, 0x7a, 0x82, 0xa1, 0x75, 0x7b, 0x83, 0x60,\n", - " 0x75, 0x5e, 0xa2, 0x94, 0x6a, 0x88, 0x78, 0x71, 0x95, 0x70, 0x8b, 0x86,\n", - " 0x7e, 0x94, 0x5f, 0x65, 0x5f, 0xb1, 0x97, 0x99, 0x94, 0x84, 0x88, 0x7d,\n", - " 0x50, 0x8c, 0xaa, 0x81, 0x7b, 0x7c, 0x77, 0x65, 0x5e, 0x91, 0x9c, 0x89,\n", - " 0x8c, 0x85, 0x75, 0x62, 0x7b, 0x78, 0xc3, 0x7a, 0x62, 0x8c, 0x66, 0x6f,\n", - " 0x79, 0x7a, 0x9c, 0x6d, 0x7c, 0x6b, 0x5c, 0x7d, 0x6d, 0x54, 0x93, 0x87,\n", - " 0x7a, 0x7a, 0x50, 0x85, 0x60, 0x56, 0x5e, 0x6b, 0x90, 0x7c, 0x52, 0xa5,\n", - " 0x54, 0x42, 0x7b, 0x75, 0x83, 0x8c, 0x2c, 0xa6, 0x6f, 0x62, 0x78, 0x78,\n", - " 0x86, 0x36, 0x4b, 0xaa, 0x86, 0x54, 0x92, 0x8d, 0x7f, 0x53, 0x37, 0xbe,\n", - " 0x86, 0x7a, 0x90, 0x7e, 0x8e, 0x50, 0x58, 0xa6, 0x82, 0x58, 0x73, 0x74,\n", - " 0x66, 0x5c, 0x6a, 0x7f, 0xa2, 0x69, 0xbd, 0xa9, 0x74, 0x76, 0x75, 0x6f,\n", - " 0x45, 0x6c, 0xa5, 0x79, 0x82, 0x67, 0x56, 0x7c, 0x7f, 0x81, 0x67, 0x6d,\n", - " 0x81, 0x87, 0x71, 0x69, 0x69, 0x81, 0x85, 0x84, 0x5a, 0x8c, 0x5f, 0x73,\n", - " 0x80, 0x9c, 0x9e, 0x90, 0x77, 0xa0, 0x9c, 0x6c, 0x73, 0x8a, 0x84, 0x72,\n", - " 0x87, 0xa1, 0x67, 0x64, 0x5d, 0x9b, 0x9d, 0x9b, 0x97, 0x83, 0x5f, 0x61,\n", - " 0x77, 0x91, 0xa0, 0x8f, 0x8a, 0x6c, 0x45, 0x5f, 0x6d, 0xa6, 0x9b, 0x76,\n", - " 0x86, 0x93, 0x91, 0x7d, 0x54, 0x61, 0xa4, 0x6a, 0x5b, 0x69, 0x5f, 0x6d,\n", - " 0x83, 0xaf, 0xa0, 0x78, 0x9d, 0x62, 0x65, 0x69, 0x5f, 0x78, 0xbf, 0x91,\n", - " 0x7b, 0x7b, 0x52, 0x5d, 0x70, 0x78, 0xa9, 0x87, 0x93, 0x74, 0x61, 0x74,\n", - " 0x8c, 0x61, 0x97, 0x86, 0x9b, 0x7c, 0x7d, 0x75, 0x4b, 0x64, 0xa7, 0x81,\n", - " 0x8a, 0x9c, 0x29, 0xa2, 0x5f, 0x38, 0x6a, 0xb0, 0x82, 0x53, 0x1a, 0xa7,\n", - " 0x38, 0x47, 0x97, 0x90, 0x8d, 0x41, 0x25, 
0xa7, 0x65, 0x63, 0x8b, 0x79,\n", - " 0x8f, 0x3e, 0x21, 0xd0, 0x5e, 0x5d, 0x9d, 0x68, 0x75, 0x3e, 0x68, 0xb6,\n", - " 0x6a, 0x50, 0x9a, 0x71, 0x81, 0x45, 0x6d, 0x9a, 0x7f, 0x86, 0x9c, 0x63,\n", - " 0x7d, 0x74, 0x69, 0x7d, 0x5a, 0x6a, 0x8d, 0x72, 0x6b, 0x69, 0x4c, 0x6f,\n", - " 0x7c, 0x8e, 0xa6, 0x83, 0x70, 0x65, 0x5f, 0x78, 0x69, 0x67, 0x7f, 0x8d,\n", - " 0x58, 0x76, 0x4a, 0x85, 0x80, 0x89, 0x9f, 0x91, 0x52, 0x62, 0x72, 0x60,\n", - " 0x7b, 0x5c, 0x77, 0x6f, 0x9d, 0xa4, 0x98, 0x70, 0x6f, 0xad, 0x94, 0x9f,\n", - " 0x7b, 0x89, 0x74, 0x7e, 0x5d, 0x8d, 0xab, 0x98, 0x8f, 0x90, 0x82, 0x84,\n", - " 0x60, 0x7c, 0xb7, 0x8e, 0x79, 0x83, 0x56, 0x86, 0x87, 0x79, 0x95, 0x75,\n", - " 0x78, 0x71, 0x58, 0x73, 0x87, 0x5d, 0xc6, 0x9f, 0x75, 0x61, 0x4f, 0x71,\n", - " 0x91, 0x88, 0xb3, 0x8c, 0x7d, 0x7c, 0x6a, 0x75, 0x6d, 0x66, 0x8e, 0x94,\n", - " 0x96, 0x74, 0x59, 0x6f, 0x6d, 0x65, 0xb0, 0x8e, 0x7b, 0x89, 0x7a, 0x6a,\n", - " 0x7d, 0x57, 0x82, 0x7a, 0x61, 0x9f, 0x50, 0xab, 0x57, 0x46, 0x86, 0x8d,\n", - " 0xa3, 0x96, 0x18, 0xab, 0x51, 0x6e, 0xb3, 0x7e, 0x90, 0x6d, 0x6d, 0xc0,\n", - " 0x54, 0x35, 0x96, 0x84, 0x8e, 0x49, 0x28, 0xe4, 0x81, 0x5f, 0x9b, 0x87,\n", - " 0x8c, 0x33, 0x56, 0xb4, 0x61, 0x5e, 0x8b, 0x81, 0x99, 0x61, 0x6b, 0x96,\n", - " 0x75, 0x82, 0x9e, 0x7c, 0x90, 0x63, 0x64, 0x6b, 0x55, 0x6e, 0xb6, 0x7f,\n", - " 0x5f, 0x55, 0x65, 0x60, 0x35, 0x8a, 0x85, 0x91, 0x4d, 0x62, 0x90, 0x90,\n", - " 0x57, 0x5a, 0x9f, 0x7b, 0x4c, 0x86, 0x73, 0x83, 0x4a, 0x6d, 0xb0, 0x67,\n", - " 0x65, 0x89, 0x54, 0x68, 0x89, 0x7b, 0x72, 0x4f, 0x7a, 0x93, 0x61, 0x7e,\n", - " 0x79, 0x89, 0x8f, 0x9c, 0x7b, 0x70, 0x48, 0x67, 0x82, 0x75, 0xaa, 0x92,\n", - " 0x9a, 0x8f, 0x79, 0x8c, 0x64, 0x94, 0x98, 0x83, 0x7c, 0x8f, 0x5c, 0x77,\n", - " 0x70, 0x90, 0x91, 0x88, 0x7d, 0x51, 0x5d, 0x5d, 0x8b, 0x9f, 0xbc, 0x78,\n", - " 0x9e, 0x73, 0x67, 0x6d, 0x82, 0x8d, 0xc9, 0x86, 0x96, 0x6a, 0x5d, 0x79,\n", - " 0x7e, 0x6b, 0xb2, 0x79, 0x88, 0x85, 0x65, 0x73, 0x75, 0x6b, 0x9e, 0x7f,\n", - " 0x8e, 0x94, 0x8e, 0x7d, 0x74, 0x61, 0x97, 0x56, 0x97, 0x6b, 0x30, 0xb6,\n", - " 0x5f, 0x5a, 0xaa, 0xa5, 0x85, 0x5d, 0x01, 0xbc, 0x79, 0x63, 0x6e, 0x82,\n", - " 0x72, 0x26, 0x4f, 0xc8, 0x98, 0x56, 0x85, 0x9a, 0x81, 0x1f, 0x48, 0xcf,\n", - " 0x84, 0x74, 0x75, 0x87, 0xae, 0x43, 0x6f, 0xdf, 0x6a, 0x4e, 0x97, 0x5d,\n", - " 0x8f, 0x37, 0x55, 0x89, 0x7d, 0x82, 0xb1, 0x89, 0x6d, 0x52, 0x65, 0x8b,\n", - " 0x71, 0x87, 0x8d, 0x6a, 0x99, 0x5d, 0x65, 0x78, 0x67, 0x8d, 0x7b, 0x51,\n", - " 0x60, 0x8a, 0x59, 0x72, 0x78, 0x93, 0x88, 0x75, 0x46, 0x60, 0x6e, 0x79,\n", - " 0x7b, 0x9d, 0x9c, 0x8c, 0x5c, 0x7c, 0x69, 0x71, 0x60, 0x6f, 0xb0, 0x7d,\n", - " 0x4c, 0x5e, 0x88, 0x77, 0x74, 0x6a, 0x6f, 0x9a, 0xa2, 0x83, 0x48, 0x5a,\n", - " 0x6e, 0xa2, 0x8b, 0x7a, 0x65, 0x5b, 0x4b, 0x80, 0x5b, 0x8f, 0xaf, 0x8e,\n", - " 0x93, 0x4a, 0x59, 0x6e, 0x5e, 0x89, 0x91, 0x87, 0x73, 0x6a, 0x47, 0x6c,\n", - " 0x6c, 0x81, 0xad, 0x5a, 0x76, 0x51, 0x51, 0x6c, 0x80, 0x92, 0x9d, 0xae,\n", - " 0x90, 0x71, 0x6c, 0x7a, 0x7c, 0x84, 0xa7, 0x7d, 0x82, 0x7c, 0x80, 0x59,\n", - " 0x7d, 0x86, 0xa9, 0x94, 0x8e, 0x7b, 0x7c, 0x67, 0x67, 0x66, 0x8f, 0x49,\n", - " 0x5d, 0xa4, 0x4a, 0xbc, 0x5a, 0x34, 0xa7, 0xaa, 0x9e, 0x86, 0x17, 0xc0,\n", - " 0x53, 0x67, 0x76, 0xae, 0x8d, 0x37, 0x4a, 0xd6, 0x76, 0x69, 0x95, 0x7a,\n", - " 0x8a, 0x0e, 0x3f, 0xe8, 0x60, 0x4d, 0x9e, 0x90, 0xad, 0x44, 0x46, 0xc5,\n", - " 0x4c, 0x6e, 0x72, 0x8c, 0x89, 0x49, 0x51, 0xa0, 0x60, 0x84, 0x84, 0x9d,\n", - " 0xa4, 0x5a, 0x84, 0x8d, 0x69, 0x6a, 0x97, 0x78, 0x72, 0x66, 0x72, 0x9b,\n", - " 0x74, 0x7a, 0x95, 0x7c, 0x7a, 0x6e, 0x74, 0x7f, 0x65, 0x94, 0x77, 0x7e,\n", 
- " 0x85, 0x6d, 0x65, 0x7b, 0x63, 0x7b, 0x87, 0x49, 0x80, 0x74, 0x74, 0x85,\n", - " 0x6e, 0x78, 0xad, 0x66, 0x8a, 0x65, 0x54, 0x7c, 0x4e, 0x62, 0x97, 0x7f,\n", - " 0x82, 0x6c, 0x58, 0x79, 0x91, 0x94, 0xb3, 0x7a, 0x88, 0x82, 0x60, 0x7f,\n", - " 0x8c, 0xa7, 0x7b, 0x93, 0x77, 0x49, 0x6f, 0x6f, 0x5a, 0x8d, 0x93, 0x8b,\n", - " 0x87, 0x59, 0x7d, 0x5e, 0x83, 0x7e, 0x8c, 0x7a, 0x91, 0x4e, 0x6f, 0x89,\n", - " 0x8a, 0x87, 0x8b, 0x85, 0x8e, 0x43, 0x63, 0x8d, 0x90, 0x6c, 0xa5, 0x73,\n", - " 0x8a, 0x78, 0x5f, 0x73, 0x88, 0x57, 0x9e, 0x8f, 0x7f, 0x91, 0x70, 0x77,\n", - " 0x8a, 0x76, 0xa2, 0x77, 0x53, 0x86, 0x51, 0xd8, 0xa9, 0x5b, 0x9b, 0x96,\n", - " 0x7c, 0x71, 0x01, 0xd4, 0x56, 0x4a, 0x95, 0xab, 0x91, 0x54, 0x45, 0xe5,\n", - " 0x74, 0x4f, 0x87, 0x6a, 0xa2, 0x3e, 0x47, 0xff, 0x91, 0x4d, 0x94, 0x97,\n", - " 0x6d, 0x74, 0x77, 0xe0, 0x5d, 0x4e, 0x5f, 0x73, 0x70, 0x3a, 0x68, 0xb2,\n", - " 0x78, 0x61, 0x8c, 0x77, 0xa8, 0x57, 0x8c, 0x99, 0x23, 0x5a, 0x84, 0x78,\n", - " 0x9b, 0x7f, 0x5e, 0xa0, 0x49, 0x84, 0x83, 0x94, 0x99, 0x4d, 0x8d, 0x9a,\n", - " 0x86, 0x90, 0x9b, 0x51, 0x75, 0x73, 0x78, 0x89, 0x59, 0x64, 0x78, 0x91,\n", - " 0x72, 0x9c, 0x72, 0x7e, 0x65, 0x6a, 0x80, 0xaa, 0x94, 0x65, 0x6d, 0x87,\n", - " 0x73, 0x93, 0x97, 0x7d, 0x99, 0x63, 0x75, 0x89, 0x67, 0xa1, 0x90, 0x7f,\n", - " 0x88, 0x65, 0x6d, 0x8f, 0x7d, 0x62, 0x91, 0xa7, 0x8b, 0x73, 0x51, 0x88,\n", - " 0x66, 0x66, 0x99, 0xa7, 0x7c, 0x54, 0x82, 0x67, 0x64, 0x8a, 0x95, 0x7c,\n", - " 0x8a, 0x5d, 0x5e, 0x68, 0x4b, 0x75, 0x92, 0x7a, 0x9f, 0x66, 0x71, 0x8d,\n", - " 0x76, 0x72, 0x8e, 0x77, 0x76, 0x8c, 0x5b, 0x88, 0x9a, 0x92, 0x7c, 0x74,\n", - " 0x95, 0xaa, 0x71, 0x77, 0x97, 0x93, 0x9e, 0x62, 0x96, 0x6a, 0x49, 0xd8,\n", - " 0x81, 0x99, 0xae, 0x87, 0x6c, 0x76, 0x3e, 0xd9, 0x6e, 0x95, 0xa3, 0x86,\n", - " 0x60, 0x6c, 0x5c, 0xbe, 0x98, 0x8a, 0x99, 0x7c, 0x47, 0x45, 0x69, 0xeb,\n", - " 0x9d, 0x7d, 0xbb, 0x90, 0x66, 0x69, 0x70, 0xc6, 0x7b, 0x59, 0x9e, 0x87,\n", - " 0x58, 0x76, 0x7c, 0xae, 0x72, 0x7d, 0x9f, 0x92, 0x82, 0x58, 0x51, 0x7a,\n", - " 0x5d, 0x77, 0xa8, 0x7c, 0x56, 0x68, 0x88, 0x8a, 0x7e, 0x8a, 0x98, 0x68,\n", - " 0x64, 0x79, 0x6e, 0x7a, 0x60, 0x96, 0x98, 0x60, 0x60, 0x71, 0x60, 0x8e,\n", - " 0x7c, 0x8c, 0x92, 0x92, 0x77, 0x80, 0x90, 0x91, 0x81, 0x82, 0x9c, 0x80,\n", - " 0x61, 0x7f, 0x5a, 0x8e, 0x88, 0x7c, 0x8e, 0x79, 0x69, 0x8e, 0x4e, 0x7e,\n", - " 0x84, 0x9e, 0x67, 0x72, 0x5c, 0x78, 0x7b, 0x8c, 0x65, 0x7d, 0x8e, 0xa4,\n", - " 0x5e, 0x7a, 0x5c, 0x97, 0x6a, 0x81, 0xab, 0x85, 0x4d, 0x73, 0x83, 0x96,\n", - " 0x8b, 0x7d, 0xa6, 0x69, 0x74, 0x86, 0x73, 0x79, 0x52, 0x8c, 0xa0, 0x86,\n", - " 0x64, 0x7b, 0x84, 0x77, 0x87, 0x93, 0x7d, 0x6d, 0x98, 0x6d, 0x88, 0x5f,\n", - " 0x7c, 0x84, 0x92, 0x82, 0x81, 0x76, 0x85, 0x77, 0x98, 0x85, 0x88, 0x68,\n", - " 0x7d, 0x71, 0x3c, 0xf1, 0x83, 0x86, 0xa2, 0xb3, 0x6e, 0x77, 0x53, 0xe8,\n", - " 0xa8, 0xc7, 0xb3, 0x83, 0x93, 0x83, 0x63, 0xe8, 0x94, 0xb3, 0x86, 0x6e,\n", - " 0x75, 0x5d, 0x54, 0xf0, 0x89, 0xa7, 0x94, 0xb1, 0x7e, 0x91, 0x9a, 0xb8,\n", - " 0x91, 0x7e, 0x99, 0x50, 0x71, 0x82, 0x8a, 0x91, 0x7a, 0x8a, 0x8b, 0x80,\n", - " 0x64, 0x6a, 0x5f, 0xbe, 0x5d, 0x96, 0xb1, 0x82, 0x45, 0x71, 0x8b, 0x95,\n", - " 0x7c, 0x9b, 0x89, 0x6d, 0x5b, 0x73, 0x81, 0x90, 0x76, 0xab, 0xa6, 0x88,\n", - " 0x62, 0x7d, 0x75, 0x99, 0x7a, 0x8b, 0x6e, 0x9b, 0x83, 0x89, 0x99, 0x93,\n", - " 0x81, 0x9e, 0x8a, 0x76, 0x75, 0x7d, 0x6c, 0x93, 0x68, 0x7a, 0x8d, 0x78,\n", - " 0x88, 0x93, 0x66, 0xa5, 0x6c, 0xae, 0xb1, 0x83, 0x72, 0x8f, 0x6b, 0x7b,\n", - " 0x79, 0x9b, 0x98, 0x7c, 0x82, 0x84, 0x7d, 0x7d, 0x71, 0x7c, 0xb0, 0x81,\n", - " 0x74, 0x89, 0x72, 0x89, 0x98, 
0xa0, 0x7d, 0x62, 0x2f, 0x50, 0x7d, 0x8b,\n", - " 0x4c, 0x83, 0x87, 0x89, 0x57, 0x9e, 0x92, 0x8c, 0x81, 0x7e, 0xb9, 0x95,\n", - " 0x7f, 0x76, 0x8e, 0x90, 0x9d, 0x68, 0x78, 0x95, 0x7d, 0xab, 0x84, 0x8a,\n", - " 0x64, 0x9f, 0x80, 0x94, 0x8d, 0x89, 0x76, 0x8e, 0x6f, 0x8b, 0x75, 0x7d,\n", - " 0x89, 0x74, 0x67, 0x8a, 0x7d, 0x63, 0x79, 0x6d, 0x79, 0x8a, 0x78, 0x7f,\n", - " 0x7a, 0x9b, 0x70, 0x70, 0x84, 0x86, 0x80, 0x95, 0x5a, 0x77, 0x80, 0x91,\n", - " 0x9c, 0x92, 0x76, 0x81, 0x69, 0x89, 0x78, 0xa5, 0x7a, 0x8d, 0x86, 0x64,\n", - " 0x8f, 0x8d, 0x7d, 0xa1, 0x8c, 0x7b, 0x77, 0x7e, 0x80, 0x93, 0x86, 0x68,\n", - " 0x90, 0x9c, 0x71, 0x8c, 0x68, 0x52, 0x85, 0x88, 0x89, 0x92, 0x64, 0x8f,\n", - " 0x74, 0x64, 0x7c, 0x88, 0x8d, 0x97, 0x77, 0x97, 0x91, 0xac, 0x74, 0x7f,\n", - " 0x60, 0x7e, 0x6e, 0x70, 0x86, 0x83, 0x7f, 0x81, 0x6f, 0x94, 0x62, 0xa4,\n", - " 0x86, 0x7d, 0x90, 0x7c, 0x89, 0x63, 0x7b, 0x89, 0x75, 0xa1, 0x67, 0x69,\n", - " 0xa6, 0x76, 0x69, 0x9c, 0x71, 0x79, 0x76, 0x7a, 0x8e, 0x78, 0x94, 0x75,\n", - " 0x5a, 0x76, 0x6b, 0x91, 0x84, 0x75, 0x72, 0x93, 0x79, 0x7e, 0x75, 0x9a,\n", - " 0x6f, 0x7a, 0x7b, 0x80, 0x5f, 0x90, 0x74, 0x7d, 0x9b, 0x76, 0x70, 0x89,\n", - " 0x8f, 0x5f, 0x7f, 0x9c, 0x93, 0x6d, 0x81, 0x7f, 0x8d, 0x7d, 0x74, 0x5d,\n", - " 0x75, 0x88, 0x7b, 0x91, 0x75, 0x6b, 0x7f, 0x8c, 0x71, 0x74, 0x87, 0x88,\n", - " 0x83, 0x75, 0x77, 0x96, 0x7f, 0x67, 0x7d, 0x95, 0x81, 0x5c, 0x71, 0x5c,\n", - " 0x6e, 0x75, 0x86, 0x92, 0x5d, 0x7a, 0x77, 0x9f, 0x6e, 0x79, 0x68, 0x60,\n", - " 0x94, 0x88, 0x88, 0x88, 0x79, 0x7e, 0x8a, 0x6d, 0x84, 0xa7, 0x5b, 0x8e,\n", - " 0x67, 0x9c, 0x7e, 0x75, 0x82, 0x96, 0x7c, 0x7b, 0x72, 0x85, 0x8c, 0xa3,\n", - " 0x96, 0x5b, 0x93, 0x67, 0x7e, 0x9f, 0x71, 0x82, 0x79, 0x8c, 0x93, 0x9d,\n", - " 0x6b, 0x90, 0x8a, 0x8a, 0x55, 0x82, 0x94, 0x74, 0x7d, 0xaa, 0x81, 0x78,\n", - " 0x8a, 0x8d, 0x83, 0x7b, 0x97, 0x92, 0x68, 0x64, 0x8c, 0x5d, 0x78, 0x9b,\n", - " 0x73, 0x95, 0x78, 0x77, 0x6f, 0x61, 0x7c, 0x9d, 0x85, 0x6e, 0x84, 0x4c,\n", - " 0x87, 0x57, 0x93, 0x68, 0x8e, 0x77, 0x78, 0x72, 0x87, 0x91, 0x5f, 0x7e,\n", - " 0xa6, 0x75, 0x66, 0x86, 0x7a, 0x7d, 0x70, 0x6f, 0x87, 0x8b, 0x74, 0x85,\n", - " 0x7d, 0x8b, 0x7f, 0x70, 0x7e, 0x82, 0x84, 0x75, 0x89, 0xa6, 0x7b, 0x7a,\n", - " 0xa5, 0x69, 0x73, 0x74, 0x82, 0x65, 0x8f, 0x98, 0x7b, 0x77, 0x84, 0x92,\n", - " 0x73, 0x8a, 0xa1, 0x93, 0x80, 0x81, 0x72, 0x8a, 0x6b, 0x75, 0x8f, 0x98,\n", - " 0x73, 0x74, 0x6f, 0x70, 0x51, 0x6a, 0x84, 0x9e, 0x78, 0x9b, 0x8c, 0x81,\n", - " 0x7e, 0x75, 0x80, 0x88, 0x73, 0x4e, 0x71, 0x74, 0x8c, 0x74, 0x6a, 0x84,\n", - " 0x7f, 0x6b, 0x78, 0xab, 0x77, 0xa2, 0x98, 0x93, 0x77, 0x75, 0x72, 0x5c,\n", - " 0x60, 0x74, 0x84, 0x67, 0x83, 0x7d, 0x7f, 0x7c, 0x5c, 0x72, 0x70, 0x7f,\n", - " 0x6c, 0x84, 0x90, 0xab, 0x97, 0x7f, 0x6b, 0x82, 0x7f, 0x78, 0x73, 0x7d,\n", - " 0x8f, 0x8e, 0x8a, 0x8f, 0x8d, 0xa3, 0x74, 0x6e, 0x5e, 0x8c, 0x94, 0x86,\n", - " 0x57, 0xb0, 0x79, 0xa8, 0x7b, 0x8d, 0x83, 0x77, 0x89, 0xb6, 0x60, 0x9d,\n", - " 0x77, 0x59, 0x72, 0x4d, 0x6f, 0x94, 0x71, 0x75, 0x61, 0x96, 0x86, 0x5d,\n", - " 0x84, 0x68, 0x86, 0x82, 0x8d, 0x70, 0x9a, 0x86, 0x73, 0x64, 0x74, 0x7d,\n", - " 0x80, 0x5a, 0x64, 0x81, 0xa1, 0x71, 0x77, 0x65, 0xa3, 0x76, 0xa3, 0x9d,\n", - " 0x73, 0x7b, 0x8f, 0x7b, 0x79, 0x7d, 0x6c, 0x85, 0x8e, 0x75, 0x65, 0x6a,\n", - " 0x87, 0x70, 0x68, 0x8e, 0x76, 0x5d, 0x66, 0x7c, 0x83, 0x83, 0x7e, 0x89,\n", - " 0x59, 0x8c, 0x75, 0x59, 0x87, 0x7e, 0x7f, 0x90, 0x6b, 0x7b, 0x7e, 0x6d,\n", - " 0x6e, 0x86, 0x69, 0x92, 0x83, 0x8f, 0x8a, 0x60, 0x78, 0x75, 0x61, 0x91,\n", - " 0x73, 0x66, 0x86, 0x86, 0x9f, 0x6f, 0x7b, 0x9a, 0x7c, 0x54, 
0x75, 0x8e,\n", - " 0x7e, 0x72, 0x8e, 0x98, 0x94, 0x5f, 0x71, 0x7c, 0x95, 0x9f, 0x8e, 0x83,\n", - " 0x96, 0x4b, 0x8d, 0x84, 0x81, 0x7d, 0x70, 0x84, 0x70, 0x53, 0x8d, 0x84,\n", - " 0x5a, 0x91, 0x88, 0x9a, 0x8f, 0x69, 0x8b, 0x52, 0x85, 0x89, 0x6e, 0x99,\n", - " 0x79, 0x89, 0x9a, 0x82, 0x6e, 0x8b, 0x65, 0x62, 0x80, 0xa8, 0x8f, 0x8a,\n", - " 0x71, 0x61, 0x7e, 0x7d, 0x7e, 0xaa, 0x7f, 0xa0, 0x5e, 0x67, 0x90, 0x86,\n", - " 0x6d, 0xac, 0x74, 0x50, 0x61, 0x91, 0x7d, 0x69, 0x8b, 0x7f, 0x81, 0x7a,\n", - " 0x93, 0x8c, 0x72, 0x64, 0x98, 0x88, 0x91, 0x83, 0x69, 0x6d, 0x78, 0x7a,\n", - " 0x68, 0x7c, 0x76, 0x81, 0xa7, 0x88, 0x8f, 0x79, 0x7d, 0x6c, 0x8a, 0x60,\n", - " 0x88, 0x6d, 0x79, 0x9d, 0x80, 0x82, 0x66, 0x7d, 0x7e, 0x96, 0x78, 0x70,\n", - " 0x9b, 0x70, 0x7e, 0x90, 0x77, 0x94, 0x7b, 0x89, 0x78, 0x84, 0x74, 0x6d,\n", - " 0x7d, 0xa7, 0x75, 0x97, 0x85, 0x83, 0x86, 0x65, 0x75, 0x9a, 0x7c, 0x68,\n", - " 0x87, 0x82, 0x75, 0x68, 0x4c, 0x8a, 0x68, 0x93, 0x7d, 0x88, 0x84, 0x72,\n", - " 0x58, 0x81, 0x5d, 0x83, 0x89, 0x63, 0x83, 0x7d, 0x8e, 0x75, 0x8c, 0x88,\n", - " 0x7f, 0x57, 0x8c, 0x8f, 0xa6, 0x71, 0x8a, 0x95, 0x88, 0x51, 0x74, 0x8a,\n", - " 0x8a, 0x98, 0x72, 0x80, 0x8a, 0x52, 0x90, 0x66, 0x54, 0x8e, 0x7f, 0x94,\n", - " 0x81, 0x49, 0x84, 0x70, 0x5c, 0x93, 0x89, 0x6d, 0x82, 0x7f, 0x70, 0x5d,\n", - " 0x87, 0x8a, 0x71, 0x70, 0x6f, 0xa1, 0x90, 0x9f, 0x74, 0x7c, 0x8c, 0x8b,\n", - " 0x72, 0xbf, 0x89, 0x90, 0x5c, 0x8c, 0x75, 0x72, 0x6f, 0xb2, 0x84, 0x6d,\n", - " 0x61, 0x80, 0x7d, 0x7a, 0x66, 0xaa, 0x75, 0x71, 0x89, 0x6d, 0x69, 0x72,\n", - " 0x73, 0x98, 0x8c, 0x78, 0x5a, 0x8e, 0x8c, 0x81, 0x55, 0x81, 0x96, 0x67,\n", - " 0x6f, 0x71, 0x74, 0x7d, 0x8e, 0x66, 0x9a, 0x67, 0xaa, 0x81, 0x90, 0x79,\n", - " 0x89, 0x59, 0x86, 0x66, 0x8f, 0x7d, 0x7e, 0xa2, 0xa4, 0x99, 0x68, 0x7a,\n", - " 0x8c, 0x73, 0x85, 0x77, 0x8b, 0x74, 0x75, 0x66, 0xaa, 0x98, 0x59, 0x8b,\n", - " 0x91, 0x6c, 0x76, 0x73, 0x87, 0xa4, 0x82, 0x82, 0x63, 0x70, 0x7e, 0x73,\n", - " 0x96, 0x97, 0x6f, 0x86, 0x81, 0x6f, 0x83, 0x82, 0x7b, 0x82, 0xa3, 0xa7,\n", - " 0x95, 0x77, 0x84, 0x65, 0x9b, 0x94, 0x6e, 0xb0, 0x75, 0x66, 0x78, 0x82,\n", - " 0x9c, 0x7a, 0x5f, 0xab, 0x99, 0x2f, 0x7f, 0x68, 0xa4, 0x69, 0x8f, 0x9a,\n", - " 0x91, 0x56, 0x6e, 0x75, 0x63, 0x9b, 0x9e, 0x97, 0x95, 0x68, 0x80, 0x6a,\n", - " 0x40, 0x95, 0x53, 0x72, 0x6f, 0x6b, 0x91, 0x78, 0x7f, 0x93, 0x70, 0x8d,\n", - " 0x62, 0x83, 0x7e, 0x64, 0x5b, 0xaa, 0x70, 0x6c, 0x7e, 0x9c, 0x88, 0x76,\n", - " 0x60, 0x70, 0x66, 0x69, 0x84, 0x97, 0x9d, 0x63, 0x5e, 0x9a, 0x7e, 0x52,\n", - " 0x58, 0xb8, 0x95, 0x7c, 0x4d, 0x96, 0x8f, 0x70, 0x71, 0xbf, 0x83, 0x83,\n", - " 0x9e, 0x70, 0x6f, 0x57, 0x70, 0x9a, 0x8d, 0x6e, 0x98, 0x5a, 0x69, 0x6f,\n", - " 0x90, 0x71, 0x8a, 0x5d, 0x8e, 0x6e, 0x69, 0x7a, 0x90, 0x86, 0x89, 0x88,\n", - " 0xb6, 0x77, 0x84, 0x79, 0x76, 0x86, 0x86, 0x7c, 0xbf, 0x6d, 0x5c, 0x90,\n", - " 0xa1, 0x93, 0x72, 0x63, 0x9a, 0x82, 0x7b, 0x61, 0x91, 0x76, 0x82, 0x96,\n", - " 0xb9, 0x80, 0x77, 0x7f, 0xa0, 0x73, 0x61, 0x80, 0x83, 0xc1, 0x92, 0x67,\n", - " 0x7c, 0x81, 0x90, 0x67, 0x8b, 0xbe, 0x81, 0x91, 0x6c, 0x7e, 0x8d, 0x6c,\n", - " 0x62, 0x83, 0x7e, 0x72, 0x64, 0x8a, 0x83, 0x82, 0xaa, 0x8c, 0x74, 0xab,\n", - " 0x79, 0x85, 0x91, 0x79, 0x90, 0x68, 0x5c, 0x9a, 0x7c, 0x36, 0x80, 0x6e,\n", - " 0x93, 0x76, 0x5e, 0xa0, 0xa5, 0x63, 0x73, 0x7e, 0x8d, 0x94, 0x63, 0x99,\n", - " 0x8f, 0x6a, 0x7f, 0x57, 0x57, 0x6f, 0x6d, 0x86, 0x8e, 0x6b, 0x8d, 0x53,\n", - " 0x94, 0xba, 0x84, 0x6f, 0x5a, 0x7b, 0x8c, 0x5f, 0x73, 0x93, 0x8b, 0x87,\n", - " 0x6f, 0x9e, 0x8a, 0x87, 0x62, 0x97, 0x86, 0x7c, 0x69, 0xab, 0xa1, 0x95,\n", - " 0x42, 0x8c, 
0x8b, 0x66, 0x68, 0x99, 0xa8, 0x74, 0x80, 0xa5, 0x7d, 0x82,\n", - " 0x55, 0xb3, 0x6f, 0x81, 0xa8, 0x9a, 0x80, 0x67, 0x62, 0x7f, 0x78, 0x93,\n", - " 0x90, 0x83, 0x83, 0x7b, 0x77, 0x73, 0x8c, 0x56, 0xa7, 0x85, 0x7b, 0x71,\n", - " 0x8f, 0x5d, 0x92, 0x69, 0xbe, 0x5e, 0x7f, 0x7f, 0x8e, 0x71, 0x84, 0x75,\n", - " 0x95, 0x69, 0x88, 0x6b, 0x96, 0x85, 0x78, 0x39, 0xc2, 0x86, 0x7c, 0x99,\n", - " 0xa1, 0x94, 0x6b, 0x86, 0xb5, 0x5e, 0x7e, 0x6e, 0x81, 0x95, 0x6a, 0x88,\n", - " 0x7b, 0x92, 0x8f, 0x68, 0x97, 0x77, 0x84, 0x73, 0x68, 0x96, 0x5a, 0x92,\n", - " 0x66, 0x74, 0x74, 0x6c, 0x7d, 0x81, 0x6c, 0x93, 0x7f, 0x72, 0x86, 0x74,\n", - " 0xbf, 0x8f, 0x53, 0xa4, 0x89, 0x76, 0xa0, 0x87, 0x97, 0x6a, 0x6b, 0xb1,\n", - " 0x91, 0x50, 0x74, 0x68, 0xa3, 0x60, 0x8d, 0xbc, 0xc1, 0x3e, 0x62, 0x59,\n", - " 0x71, 0x72, 0x6d, 0x80, 0x9f, 0x52, 0x82, 0x6b, 0x5d, 0x7f, 0x74, 0x7e,\n", - " 0x74, 0x84, 0x8a, 0x59, 0x5c, 0x85, 0x6d, 0x9c, 0x75, 0x9a, 0x88, 0x89,\n", - " 0x81, 0x9f, 0x81, 0x88, 0x6a, 0x94, 0x84, 0x5f, 0x6b, 0x9b, 0x83, 0x4f,\n", - " 0x7e, 0xca, 0x99, 0x6d, 0x45, 0x7f, 0x87, 0x71, 0x69, 0xad, 0x95, 0x53,\n", - " 0x6e, 0x9b, 0x90, 0x73, 0x5d, 0xb0, 0x8d, 0x67, 0x83, 0x82, 0xa3, 0x70,\n", - " 0x70, 0x92, 0x82, 0x9a, 0x8a, 0x69, 0x6a, 0x6e, 0x7f, 0x89, 0xa4, 0x76,\n", - " 0x97, 0x62, 0x94, 0x80, 0x87, 0x55, 0x80, 0x76, 0xb3, 0x7e, 0x7e, 0x71,\n", - " 0x94, 0x88, 0x8e, 0x74, 0xb6, 0x4d, 0x7b, 0x73, 0x90, 0x86, 0x7c, 0x66,\n", - " 0xb5, 0x80, 0x7f, 0x84, 0x87, 0x82, 0x67, 0x83, 0x97, 0x91, 0x8a, 0x78,\n", - " 0x8b, 0x83, 0x5d, 0x84, 0x82, 0x9f, 0x8c, 0x91, 0x84, 0x8b, 0x6a, 0x68,\n", - " 0x86, 0x82, 0x73, 0x77, 0x7b, 0x83, 0x6a, 0x84, 0x92, 0x93, 0x90, 0x8b,\n", - " 0x4c, 0x94, 0x98, 0x76, 0xb8, 0x7b, 0xa0, 0xa2, 0x7d, 0x3e, 0x95, 0x88,\n", - " 0xa3, 0x6f, 0x5e, 0xc8, 0x9a, 0x52, 0x81, 0x86, 0xa3, 0x79, 0x88, 0xc3,\n", - " 0xbd, 0x54, 0x6c, 0x5e, 0x83, 0x8a, 0x98, 0x88, 0x92, 0x66, 0x73, 0x5b,\n", - " 0x6c, 0x7f, 0x6e, 0x97, 0x8d, 0x58, 0x89, 0x6e, 0x65, 0x7a, 0x7d, 0x7c,\n", - " 0x7e, 0x89, 0x94, 0x89, 0x55, 0xb8, 0x8f, 0x82, 0x6c, 0x9c, 0x96, 0x5e,\n", - " 0x6f, 0xb2, 0x70, 0x76, 0x95, 0xc8, 0x86, 0x78, 0x49, 0xac, 0x7e, 0x6c,\n", - " 0x68, 0xb6, 0xaf, 0x89, 0x68, 0xa5, 0x72, 0x85, 0x69, 0x9c, 0x94, 0x84,\n", - " 0xa4, 0x97, 0x91, 0x61, 0x7a, 0xa3, 0x8f, 0x8e, 0x93, 0x80, 0x8d, 0x76,\n", - " 0x74, 0x84, 0x9b, 0x79, 0x97, 0x4e, 0x67, 0x87, 0x9b, 0x69, 0x85, 0x7d,\n", - " 0xb2, 0x68, 0x76, 0x63, 0xa2, 0x86, 0x97, 0x7f, 0xb5, 0x63, 0x79, 0x76,\n", - " 0x8a, 0x7c, 0x7c, 0x91, 0xb1, 0x42, 0x7d, 0x7a, 0x8c, 0x8e, 0x72, 0xab,\n", - " 0xb8, 0x76, 0xab, 0x81, 0x98, 0x85, 0x56, 0x98, 0x84, 0x9f, 0x70, 0x86,\n", - " 0x76, 0x88, 0x70, 0x8d, 0x71, 0x7b, 0x7a, 0x8d, 0x76, 0x75, 0x62, 0x80,\n", - " 0x81, 0x94, 0x82, 0x6e, 0x57, 0x8d, 0xaf, 0x84, 0xbf, 0x85, 0x82, 0xa7,\n", - " 0x80, 0x89, 0x95, 0x81, 0x91, 0x49, 0x72, 0xa1, 0xa7, 0x3f, 0x72, 0x8b,\n", - " 0x99, 0x72, 0x86, 0xb2, 0xc3, 0x61, 0x55, 0x77, 0x86, 0x77, 0x83, 0xa7,\n", - " 0x95, 0x5a, 0x68, 0x68, 0x6a, 0x63, 0x6a, 0x77, 0x93, 0x7c, 0x88, 0x62,\n", - " 0x79, 0x84, 0x8b, 0x82, 0x58, 0x8f, 0x9c, 0x56, 0x77, 0xb1, 0x65, 0x8c,\n", - " 0x76, 0x91, 0x83, 0x5b, 0x62, 0x91, 0x87, 0x68, 0x71, 0xb0, 0x87, 0x64,\n", - " 0x62, 0x91, 0x94, 0x58, 0x7f, 0xac, 0xa3, 0x84, 0x75, 0xaa, 0xa3, 0x4d,\n", - " 0x7a, 0xc2, 0x84, 0x8a, 0x6d, 0xa2, 0x76, 0x74, 0x8c, 0x9e, 0x7c, 0x71,\n", - " 0x86, 0x70, 0x6d, 0x79, 0x9a, 0x74, 0xb0, 0x8d, 0xa5, 0x7e, 0x6b, 0x63,\n", - " 0x96, 0x74, 0x99, 0x76, 0xd0, 0x62, 0x85, 0x9d, 0x8f, 0x6d, 0x83, 0x88,\n", - " 0xb0, 0x62, 0x9b, 0x87, 0x91, 0x82, 0x7a, 
0x90, 0x9c, 0x61, 0x6d, 0x97,\n", - " 0x84, 0x7c, 0x74, 0x8e, 0x8b, 0x75, 0x9a, 0x7e, 0x7c, 0x7d, 0x96, 0x81,\n", - " 0x94, 0x69, 0x83, 0x6f, 0x8e, 0x7c, 0x7b, 0x7a, 0x73, 0x98, 0x74, 0x9e,\n", - " 0x72, 0x8c, 0x5f, 0x7d, 0x99, 0x79, 0x5b, 0x73, 0x65, 0x78, 0xa5, 0x7d,\n", - " 0xa2, 0x98, 0x91, 0x91, 0x87, 0x7b, 0x8c, 0x82, 0xb8, 0x6b, 0x82, 0xba,\n", - " 0xa5, 0x3f, 0x83, 0x7a, 0x9b, 0x73, 0x93, 0xa1, 0xbe, 0x55, 0x6b, 0x75,\n", - " 0x94, 0x7d, 0x9c, 0xa1, 0x82, 0x50, 0x75, 0x5a, 0x88, 0x6e, 0x72, 0x7f,\n", - " 0x99, 0x64, 0x72, 0x49, 0x69, 0x79, 0x6d, 0x94, 0x73, 0x79, 0x80, 0x6f,\n", - " 0x72, 0xbc, 0x9d, 0x71, 0x7a, 0x9d, 0x8a, 0x55, 0x74, 0xaa, 0xa1, 0x85,\n", - " 0x7e, 0xc4, 0xa0, 0x7e, 0x50, 0x99, 0x68, 0x8c, 0x8a, 0xb0, 0x99, 0x6c,\n", - " 0x6d, 0xaf, 0x7b, 0x7b, 0x79, 0xba, 0x8a, 0x7a, 0x9d, 0x8b, 0x67, 0x87,\n", - " 0x76, 0xa9, 0x7f, 0x7e, 0x8b, 0x7b, 0x87, 0x84, 0x82, 0x74, 0xa3, 0x91,\n", - " 0x9a, 0x6a, 0x93, 0x7e, 0x87, 0x5b, 0x95, 0x89, 0xbb, 0x5d, 0x74, 0x6c,\n", - " 0x88, 0x7e, 0x81, 0x7e, 0xb6, 0x6b, 0x91, 0x92, 0x83, 0x78, 0x79, 0x95,\n", - " 0x90, 0x5e, 0x68, 0x8f, 0xa8, 0x92, 0x66, 0x8e, 0x6b, 0x8c, 0x86, 0x80,\n", - " 0x7e, 0x7e, 0x70, 0x84, 0x7d, 0x71, 0x67, 0x94, 0x71, 0x69, 0x84, 0x8f,\n", - " 0x6c, 0x72, 0x85, 0x83, 0x69, 0x76, 0x57, 0x62, 0x83, 0x96, 0x83, 0x77,\n", - " 0x64, 0x5f, 0xae, 0x7c, 0xa7, 0x88, 0x91, 0x8c, 0x9e, 0x7f, 0xa8, 0x8a,\n", - " 0x93, 0x6f, 0x58, 0xae, 0xb4, 0x4b, 0x7f, 0x64, 0x9f, 0x5a, 0x9e, 0xb6,\n", - " 0xa6, 0x6b, 0x79, 0x84, 0x6b, 0x7c, 0x8b, 0x94, 0x85, 0x60, 0x6b, 0x55,\n", - " 0x79, 0x68, 0x77, 0x75, 0x85, 0x5c, 0x91, 0x5e, 0x5a, 0x71, 0x68, 0x7b,\n", - " 0x73, 0x91, 0x6c, 0x6e, 0x71, 0x8b, 0x76, 0x86, 0x99, 0xb8, 0x91, 0x68,\n", - " 0x51, 0xa7, 0x6f, 0x7a, 0x8a, 0xc3, 0x8e, 0x65, 0x64, 0x9e, 0x80, 0x78,\n", - " 0x6c, 0xc5, 0xa2, 0x75, 0x71, 0xa5, 0x96, 0x4f, 0x70, 0xa4, 0x7a, 0x7c,\n", - " 0x8c, 0x80, 0x89, 0x97, 0x9a, 0x9a, 0x85, 0x89, 0x92, 0x8f, 0x81, 0x6f,\n", - " 0x82, 0x6a, 0xb8, 0x74, 0x8f, 0x51, 0x7b, 0x8b, 0x8c, 0x55, 0x7e, 0x8c,\n", - " 0xb2, 0x41, 0x85, 0x77, 0x9c, 0x73, 0x75, 0x8d, 0x9f, 0x64, 0x92, 0x77,\n", - " 0xa0, 0x87, 0x5f, 0x71, 0x85, 0x68, 0x8a, 0x78, 0x91, 0x78, 0x75, 0x7a,\n", - " 0x81, 0x67, 0x96, 0x64, 0x96, 0x85, 0x7a, 0x7e, 0x83, 0x74, 0x82, 0x8f,\n", - " 0x98, 0x75, 0x77, 0x84, 0x7e, 0x88, 0x94, 0x7d, 0x79, 0x8c, 0x47, 0x79,\n", - " 0x96, 0x7f, 0x8e, 0x90, 0x50, 0x7f, 0xa3, 0x77, 0xa8, 0x7f, 0x65, 0x9f,\n", - " 0xb9, 0x4c, 0xa7, 0x7f, 0xaa, 0x6e, 0xa2, 0xb0, 0xb8, 0x51, 0x6b, 0x74,\n", - " 0xaa, 0x63, 0x6c, 0xa3, 0xb6, 0x5e, 0x74, 0x6a, 0x75, 0x69, 0x87, 0x7f,\n", - " 0x9d, 0x71, 0x73, 0x72, 0x70, 0x57, 0x5a, 0x7e, 0x8b, 0x64, 0x9a, 0x4d,\n", - " 0x97, 0x81, 0x7b, 0x75, 0x6e, 0x92, 0x5f, 0x67, 0x7e, 0xaa, 0x90, 0x7a,\n", - " 0x92, 0xae, 0x92, 0x68, 0x79, 0x9d, 0x4f, 0x6c, 0x79, 0xb4, 0x9c, 0x58,\n", - " 0x86, 0x8e, 0x62, 0x72, 0x71, 0xc1, 0xac, 0x7d, 0x7a, 0x94, 0x8f, 0x7b,\n", - " 0x88, 0xa8, 0x8d, 0x82, 0x75, 0x9b, 0x5f, 0x83, 0x82, 0xb3, 0x7a, 0x93,\n", - " 0x94, 0x76, 0x70, 0x7e, 0x72, 0x7e, 0x8f, 0x8c, 0xa7, 0x53, 0x72, 0x77,\n", - " 0x7a, 0x64, 0xa8, 0x83, 0xc5, 0x56, 0x71, 0x7b, 0x96, 0x73, 0x7c, 0x73,\n", - " 0x93, 0x49, 0x83, 0x99, 0xa2, 0x83, 0x74, 0x79, 0xa4, 0x61, 0x8e, 0x84,\n", - " 0x7a, 0x7d, 0x56, 0x98, 0x97, 0x6d, 0x87, 0x8c, 0x7a, 0x77, 0x6a, 0x67,\n", - " 0x8a, 0x6f, 0xa2, 0x82, 0x8d, 0x85, 0x6d, 0x8f, 0x7e, 0x74, 0x72, 0x74,\n", - " 0x91, 0x75, 0x58, 0x7f, 0x9e, 0x7c, 0x9c, 0x75, 0x61, 0x6f, 0x85, 0x7b,\n", - " 0xbe, 0x84, 0x85, 0x9b, 0x8c, 0x3b, 0x9a, 0x90, 0xab, 0x77, 0x8e, 0xa2,\n", 
- " 0xbd, 0x55, 0x96, 0x70, 0xa8, 0x78, 0x98, 0x9c, 0xc3, 0x67, 0x6e, 0x81,\n", - " 0x70, 0x75, 0x96, 0x9c, 0x8a, 0x5b, 0x73, 0x54, 0x69, 0x6c, 0x5d, 0x82,\n", - " 0x99, 0x5b, 0x8c, 0x6d, 0x87, 0x80, 0x67, 0x86, 0x88, 0x7c, 0x70, 0x6b,\n", - " 0x75, 0xab, 0x8e, 0x79, 0x90, 0x91, 0xaf, 0x67, 0x5c, 0xa1, 0x5c, 0x6f,\n", - " 0x75, 0xa1, 0x95, 0x5f, 0x82, 0x8f, 0x78, 0x5d, 0x7c, 0xb8, 0x8a, 0x8a,\n", - " 0x6a, 0x98, 0x6e, 0x51, 0x6b, 0xaa, 0x7d, 0x7c, 0x80, 0x94, 0x79, 0x6d,\n", - " 0xaa, 0x8a, 0x7e, 0x77, 0xa4, 0x78, 0xa5, 0x6d, 0x7c, 0x75, 0xa8, 0x6f,\n", - " 0xa6, 0x51, 0x8e, 0x80, 0x96, 0x5b, 0x9d, 0x7b, 0xb8, 0x4e, 0x6c, 0x87,\n", - " 0x95, 0x7c, 0x78, 0x71, 0xb0, 0x5a, 0x99, 0xa0, 0x90, 0x87, 0x65, 0x8b,\n", - " 0x98, 0x68, 0x92, 0x76, 0x82, 0x77, 0x6a, 0x8a, 0x91, 0x84, 0x87, 0x8b,\n", - " 0x87, 0x84, 0x7a, 0x81, 0x77, 0x55, 0x8e, 0x86, 0x7a, 0x74, 0x65, 0x88,\n", - " 0x62, 0x51, 0xa1, 0x91, 0x88, 0x76, 0x5f, 0x89, 0x9f, 0x86, 0x66, 0x67,\n", - " 0x64, 0x75, 0x9e, 0x74, 0xc1, 0x80, 0x58, 0xa9, 0x8f, 0x5e, 0x94, 0x88,\n", - " 0xaf, 0x6f, 0x6c, 0xa4, 0xa1, 0x4d, 0x68, 0x66, 0xc2, 0x6e, 0x89, 0x9b,\n", - " 0xa3, 0x5a, 0x63, 0x5b, 0x9c, 0x7a, 0x93, 0x76, 0x9d, 0x6d, 0x71, 0x5d,\n", - " 0x80, 0x66, 0x79, 0x80, 0x7c, 0x65, 0x74, 0x64, 0x88, 0x90, 0x79, 0x89,\n", - " 0x72, 0x88, 0x67, 0x75, 0x6a, 0x96, 0x56, 0x67, 0x88, 0xa1, 0x8c, 0x6c,\n", - " 0x55, 0xb2, 0x8a, 0x71, 0x88, 0xdc, 0x7a, 0x72, 0x94, 0x9d, 0x7c, 0x76,\n", - " 0x6a, 0xaa, 0xa8, 0x7f, 0x80, 0xa0, 0x6b, 0x6f, 0x84, 0xe0, 0x68, 0x93,\n", - " 0xa6, 0x99, 0x69, 0x68, 0x93, 0xa0, 0x93, 0x6b, 0x87, 0x8b, 0x80, 0x90,\n", - " 0x90, 0x89, 0x8f, 0x7f, 0xaf, 0x6f, 0x82, 0x6d, 0x94, 0x70, 0x97, 0x8f,\n", - " 0xb0, 0x40, 0x9b, 0x67, 0x78, 0x86, 0x90, 0x8b, 0xa7, 0x51, 0x7f, 0x79,\n", - " 0x90, 0x71, 0x6d, 0x80, 0x95, 0x63, 0x7d, 0x87, 0xa0, 0x7e, 0x7b, 0x85,\n", - " 0x8e, 0x6d, 0xa1, 0x76, 0x70, 0x7b, 0x66, 0x87, 0x90, 0x7a, 0x86, 0x88,\n", - " 0x89, 0x87, 0x6a, 0x91, 0x78, 0x74, 0x76, 0x8d, 0x7e, 0x86, 0x63, 0x90,\n", - " 0x98, 0x7d, 0x4a, 0x85, 0x4f, 0x9d, 0xa2, 0x7c, 0xb4, 0x88, 0x78, 0xb5,\n", - " 0x8f, 0x3f, 0xa7, 0x7d, 0xa4, 0x7c, 0x60, 0x9c, 0xa8, 0x41, 0x6b, 0x7f,\n", - " 0xa2, 0x7f, 0x68, 0xaa, 0xb4, 0x73, 0x56, 0x62, 0x87, 0x72, 0xa5, 0x7c,\n", - " 0x97, 0x69, 0x58, 0x6b, 0x89, 0x57, 0x51, 0x80, 0x92, 0x7a, 0x7c, 0x4c,\n", - " 0x7c, 0x7b, 0x69, 0x5f, 0x90, 0x77, 0x78, 0x67, 0x7a, 0xad, 0x79, 0x5c,\n", - " 0x9c, 0xbf, 0xa6, 0x64, 0x53, 0xb3, 0x5e, 0x59, 0x86, 0xb9, 0x94, 0x65,\n", - " 0x70, 0x9d, 0x7a, 0x80, 0x7c, 0xae, 0x9c, 0x7b, 0x66, 0xae, 0x83, 0x5f,\n", - " 0x81, 0xc5, 0x8b, 0x7e, 0x9b, 0x89, 0x84, 0x7f, 0x7c, 0xa5, 0x5c, 0x89,\n", - " 0x8a, 0x75, 0x99, 0x6d, 0x8e, 0x90, 0x9f, 0x81, 0x81, 0x6b, 0x87, 0x76,\n", - " 0x92, 0x6f, 0xab, 0x95, 0x95, 0x4c, 0x97, 0x72, 0x80, 0x87, 0x83, 0x87,\n", - " 0xa3, 0x59, 0xad, 0x74, 0x93, 0x7f, 0x77, 0x78, 0x8d, 0x66, 0x9b, 0x7a,\n", - " 0x7d, 0x95, 0x64, 0x7f, 0x6d, 0x5c, 0x8e, 0x94, 0x92, 0x82, 0x60, 0x8d,\n", - " 0x75, 0x55, 0x8c, 0x8b, 0x8f, 0x86, 0x7d, 0x7c, 0x74, 0x57, 0x78, 0x9d,\n", - " 0x71, 0x65, 0x66, 0x7f, 0xaa, 0x92, 0x66, 0x81, 0x5a, 0x71, 0xa6, 0x78,\n", - " 0x9d, 0x8a, 0x5a, 0x8a, 0x91, 0x59, 0xb7, 0x5c, 0xc3, 0x73, 0x89, 0x9d,\n", - " 0xa7, 0x62, 0x77, 0x72, 0x9f, 0x92, 0x6a, 0x9f, 0xaa, 0x71, 0x6b, 0x5e,\n", - " 0x7d, 0x73, 0x8d, 0x89, 0xba, 0x61, 0x73, 0x6e, 0x71, 0x8a, 0x79, 0x7c,\n", - " 0x94, 0x76, 0x76, 0x65, 0x81, 0x6f, 0x4e, 0x75, 0x6e, 0x8b, 0x7d, 0x50,\n", - " 0x56, 0xb8, 0x72, 0x67, 0x93, 0xc6, 0x88, 0x6f, 0x57, 0xb7, 0x80, 0x4c,\n", - " 0x97, 0xc4, 0xb6, 0x71, 0x72, 
0x9e, 0x6f, 0x72, 0x8d, 0xa5, 0x8f, 0x89,\n", - " 0x74, 0xae, 0x78, 0x70, 0x6e, 0xbb, 0x8f, 0x73, 0x74, 0x8b, 0x5e, 0x86,\n", - " 0x8b, 0x8a, 0x72, 0x71, 0x84, 0x84, 0x77, 0xa3, 0xa6, 0x73, 0xa4, 0x7e,\n", - " 0xab, 0x5d, 0x75, 0x96, 0x94, 0x5f, 0x8b, 0x74, 0x9c, 0x63, 0x8d, 0x81,\n", - " 0x80, 0x6a, 0x91, 0x88, 0x93, 0x53, 0x80, 0x75, 0x79, 0x8d, 0x78, 0x74,\n", - " 0x7c, 0x73, 0xb2, 0x89, 0x8e, 0xab, 0x75, 0x6c, 0x7a, 0x79, 0x99, 0x77,\n", - " 0x7d, 0x89, 0x5a, 0x81, 0x7c, 0x75, 0x6a, 0x7e, 0x8c, 0x83, 0x78, 0x8e,\n", - " 0x62, 0x76, 0x77, 0x6b, 0x79, 0x66, 0x6e, 0x82, 0xa1, 0x8d, 0x52, 0x79,\n", - " 0x70, 0x7d, 0xa9, 0x6a, 0x95, 0x7f, 0x59, 0x94, 0x8f, 0x73, 0xb7, 0x85,\n", - " 0xb3, 0x80, 0x77, 0x9f, 0xb8, 0x4d, 0x82, 0x7c, 0xa0, 0xa4, 0x7b, 0x8c,\n", - " 0xa9, 0x78, 0x62, 0x6b, 0x8a, 0x93, 0x80, 0x68, 0x9b, 0x6d, 0x6b, 0x7b,\n", - " 0x84, 0x8f, 0x86, 0x70, 0x70, 0x73, 0x84, 0x4f, 0x7c, 0x75, 0x64, 0x8d,\n", - " 0x6e, 0x81, 0x7c, 0x72, 0x81, 0xb0, 0x74, 0x65, 0xa7, 0xae, 0x80, 0x70,\n", - " 0x5e, 0xa4, 0x58, 0x54, 0x8e, 0xa7, 0x96, 0x65, 0x66, 0x8b, 0x6c, 0x5d,\n", - " 0x6b, 0xbe, 0x94, 0x79, 0x80, 0xa1, 0x91, 0x78, 0x6d, 0xc2, 0x82, 0x85,\n", - " 0x81, 0x7d, 0x88, 0x79, 0x93, 0x96, 0x7f, 0x7e, 0x7d, 0x92, 0x75, 0xa2,\n", - " 0x9f, 0x7b, 0x92, 0x77, 0x8a, 0x7c, 0x80, 0x8b, 0x9b, 0x64, 0xa5, 0x74,\n", - " 0xa1, 0x74, 0x7f, 0x7e, 0x85, 0x78, 0x9c, 0x86, 0x9f, 0x62, 0x8f, 0x7f,\n", - " 0x8a, 0x90, 0x6d, 0x7d, 0x93, 0x61, 0x9d, 0x81, 0x9b, 0x99, 0x69, 0x87,\n", - " 0x74, 0x7d, 0x8e, 0x8e, 0x7b, 0x7c, 0x6a, 0x71, 0x7d, 0x7f, 0x74, 0x74,\n", - " 0x7b, 0x65, 0x6e, 0x91, 0x7c, 0x6e, 0x80, 0x8c, 0x8a, 0x6c, 0x6b, 0x76,\n", - " 0xad, 0x94, 0x64, 0x81, 0x69, 0x7b, 0xac, 0x76, 0x9f, 0x71, 0x85, 0x85,\n", - " 0x8b, 0x66, 0xb5, 0x87, 0xb3, 0x63, 0x8b, 0x95, 0x8e, 0x50, 0x91, 0x77,\n", - " 0xa1, 0x99, 0x64, 0x81, 0xb3, 0x63, 0x6e, 0x7a, 0x7f, 0x73, 0x7a, 0x7b,\n", - " 0x93, 0x6d, 0x75, 0x75, 0x7c, 0x7b, 0x59, 0x7c, 0x7c, 0x68, 0x67, 0x78,\n", - " 0x79, 0x75, 0x53, 0x86, 0x84, 0x84, 0x91, 0x71, 0x85, 0xb1, 0x84, 0x64,\n", - " 0x88, 0xc0, 0x94, 0x5f, 0x6f, 0x9b, 0x69, 0x67, 0x97, 0x94, 0x88, 0x6a,\n", - " 0x7e, 0x94, 0x9e, 0x7f, 0x81, 0x9c, 0xa7, 0x7f, 0x7a, 0xa2, 0x63, 0x69,\n", - " 0x82, 0xc2, 0x5e, 0x8d, 0x7c, 0x89, 0x63, 0x93, 0x84, 0xb8, 0x76, 0x89,\n", - " 0x96, 0x87, 0x79, 0x88, 0xa6, 0x8e, 0x9b, 0x93, 0x9c, 0x5d, 0x92, 0x92,\n", - " 0x82, 0x5e, 0x85, 0x88, 0xad, 0x73, 0xa4, 0x6f, 0x74, 0x8e, 0x77, 0x89,\n", - " 0x9b, 0x6e, 0x82, 0x76, 0x93, 0xae, 0x82, 0x87, 0x76, 0x6f, 0x80, 0x76,\n", - " 0x95, 0x8e, 0x5e, 0x85, 0x7b, 0x68, 0x7f, 0x7c, 0x82, 0x94, 0x80, 0x91,\n", - " 0x77, 0x71, 0x7c, 0x94, 0x80, 0x62, 0x65, 0x7c, 0x5e, 0x70, 0x76, 0x75,\n", - " 0x7b, 0x60, 0x5f, 0x69, 0xb3, 0x6e, 0x95, 0x9d, 0x5a, 0x5b, 0x9e, 0x6e,\n", - " 0xa6, 0x80, 0x5d, 0xa5, 0x83, 0x5b, 0xa4, 0x80, 0xb3, 0x79, 0x83, 0xb6,\n", - " 0xa3, 0x73, 0x84, 0x67, 0x8d, 0x8f, 0x9d, 0x78, 0xb8, 0x8a, 0x7b, 0x6c,\n", - " 0x85, 0x87, 0x6d, 0x75, 0xae, 0x75, 0x53, 0x71, 0x6b, 0x87, 0x67, 0x7b,\n", - " 0x7f, 0x86, 0x58, 0x73, 0x7d, 0x87, 0x5d, 0x7f, 0x7d, 0x63, 0x92, 0x65,\n", - " 0x7a, 0x9c, 0x6f, 0x87, 0x81, 0xa9, 0x91, 0x54, 0x66, 0x8e, 0x58, 0x6d,\n", - " 0x92, 0xc2, 0xa9, 0x7b, 0x6e, 0x96, 0x7c, 0x60, 0x7e, 0xa8, 0x85, 0x94,\n", - " 0x90, 0x8b, 0x77, 0x79, 0x77, 0xa7, 0x8f, 0x83, 0x80, 0x99, 0x8c, 0x80,\n", - " 0x93, 0x9c, 0x73, 0x9e, 0x75, 0x90, 0x67, 0x74, 0x99, 0x98, 0x7e, 0x76,\n", - " 0x9f, 0x82, 0x90, 0x95, 0x9d, 0x5f, 0x95, 0x98, 0x8c, 0x5f, 0x77, 0x83,\n", - " 0x7b, 0x72, 0x85, 0x7c, 0x97, 0x74, 0x81, 0x80, 0x8d, 0x89, 
    [... several hundred removed lines of hexadecimal byte values (the old notebook's hex dump of the quantized TensorFlow Lite model, printed as a C array) elided from this deleted output cell; the dump ends as follows ...]
- "};\n",
- "unsigned int g_model_len = 18288;\n"
- ],
- "name": "stdout"
- }
- ]
- }
- ]
-}
\ No newline at end of file
+{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"train_micro_speech_model.ipynb","provenance":[{"file_id":"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb","timestamp":1587690382292}],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"pO4-CY_TCZZS","colab_type":"text"},"source":["# Train a Simple Audio Recognition Model"]},{"cell_type":"markdown","metadata":{"id":"BaFfr7DHRmGF","colab_type":"text"},"source":["This notebook demonstrates how to train a 20 kB [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) model to recognize keywords in speech.\n","\n","The model created in this notebook is used in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) example for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview).\n","\n","\n"," \n"," \n","
\n"," Run in Google Colab\n"," \n"," View source on GitHub\n","
\n"]},{"cell_type":"markdown","metadata":{"id":"XaVtYN4nlCft","colab_type":"text"},"source":["**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and set **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.\n","\n","## Configure Defaults\n","\n","**MODIFY** the following constants for your specific use case."]},{"cell_type":"code","metadata":{"id":"ludfxbNIaegy","colab_type":"code","outputId":"1667d949-267c-4588-fe25-c0674d1dd074","executionInfo":{"status":"ok","timestamp":1588895159583,"user_tz":420,"elapsed":3711,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}},"colab":{"base_uri":"https://localhost:8080/","height":85}},"source":["# A comma-delimited list of the words you want to train for.\n","# The options are: yes,no,up,down,left,right,on,off,stop,go\n","# All the other words will be used to train an \"unknown\" label and silent\n","# audio data with no spoken words will be used to train a \"silence\" label.\n","WANTED_WORDS = \"yes,no\"\n","\n","# The number of steps and learning rates can be specified as comma-separated\n","# lists to define the rate at each stage. For example,\n","# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001\n","# will run 12,000 training loops in total, with a rate of 0.001 for the first\n","# 8,000, and 0.0001 for the final 3,000.\n","TRAINING_STEPS = \"12000,3000\"\n","LEARNING_RATE = \"0.001,0.0001\"\n","\n","# Calculate the total number of steps, which is used to identify the checkpoint\n","# file name.\n","TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(\",\"))))\n","\n","# Print the configuration to confirm it\n","!echo \"Training these words:\" $WANTED_WORDS\n","!echo \"Training steps in each stage:\" $TRAINING_STEPS\n","!echo \"Learning rate in each stage:\" $LEARNING_RATE\n","!echo \"Total number of training steps:\" $TOTAL_STEPS"],"execution_count":1,"outputs":[{"output_type":"stream","text":["Training these words: yes,no\n","Training steps in each stage: 12000,3000\n","Learning rate in each stage: 0.001,0.0001\n","Total number of training steps: 15000\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"gCgeOpvY9pAi","colab_type":"text"},"source":["**DO NOT MODIFY** the following constants as they include filepaths used in this notebook and data that is shared during training and inference."]},{"cell_type":"code","metadata":{"id":"Nd1iM1o2ymvA","colab_type":"code","colab":{}},"source":["# Calculate the percentage of 'silence' and 'unknown' training samples required\n","# to ensure that we have equal number of samples for each label.\n","number_of_labels = WANTED_WORDS.count(',') + 1\n","number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label\n","equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))\n","SILENT_PERCENTAGE = equal_percentage_of_training_samples\n","UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples\n","\n","# Constants which are shared during training and inference\n","PREPROCESS = 'micro'\n","WINDOW_STRIDE =20\n","MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,\n"," # low_latency_conv, low_latency_svdf, tiny_embedding_conv\n","\n","# Constants used during training only\n","VERBOSITY = 'WARN'\n","EVAL_STEP_INTERVAL = 
'1000'\n","SAVE_STEP_INTERVAL = '1000'\n","\n","# Constants for training directories and filepaths\n","DATASET_DIR = 'dataset/'\n","LOGS_DIR = 'logs/'\n","TRAIN_DIR = 'train/' # for training checkpoints and other files.\n","\n","# Constants for inference directories and filepaths\n","import os\n","MODELS_DIR = 'models'\n","if not os.path.exists(MODELS_DIR):\n"," os.mkdir(MODELS_DIR)\n","MODEL_TF = os.path.join(MODELS_DIR, 'model.pb')\n","MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')\n","FLOAT_MODEL_TFLITE = os.path.join(MODELS_DIR, 'float_model.tflite')\n","MODEL_TFLITE_MICRO = os.path.join(MODELS_DIR, 'model.cc')\n","SAVED_MODEL = os.path.join(MODELS_DIR, 'saved_model')\n","\n","QUANT_INPUT_MIN = 0.0\n","QUANT_INPUT_MAX = 9.8077\n","QUANT_INPUT_RANGE = QUANT_INPUT_MAX - QUANT_INPUT_MIN"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6rLYpvtg9P4o","colab_type":"text"},"source":["## Setup Environment\n","\n","Install Dependencies"]},{"cell_type":"code","metadata":{"id":"ed_XpUrU5DvY","colab_type":"code","colab":{}},"source":["%tensorflow_version 1.x\n","import tensorflow as tf"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"T9Ty5mR58E4i","colab_type":"text"},"source":["**DELETE** any old data from previous runs\n"]},{"cell_type":"code","metadata":{"id":"APGx0fEh7hFF","colab_type":"code","colab":{}},"source":["!rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"GfEUlfFBizio","colab_type":"text"},"source":["Clone the TensorFlow Github Repository, which contains the relevant code required to run this tutorial."]},{"cell_type":"code","metadata":{"id":"yZArmzT85SLq","colab_type":"code","colab":{}},"source":["!git clone -q --depth 1 https://github.com/tensorflow/tensorflow"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"nS9swHLSi7Bi","colab_type":"text"},"source":["Load TensorBoard to visualize the accuracy and loss as training proceeds.\n"]},{"cell_type":"code","metadata":{"id":"q4qF1VxP3UE4","colab_type":"code","colab":{}},"source":["%load_ext tensorboard\n","%tensorboard --logdir {LOGS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"x1J96Ron-O4R","colab_type":"text"},"source":["## Training\n","\n","The following script downloads the dataset and begin training."]},{"cell_type":"code","metadata":{"id":"VJsEZx6lynbY","colab_type":"code","colab":{}},"source":["!python tensorflow/tensorflow/examples/speech_commands/train.py \\\n","--data_dir={DATASET_DIR} \\\n","--wanted_words={WANTED_WORDS} \\\n","--silence_percentage={SILENT_PERCENTAGE} \\\n","--unknown_percentage={UNKNOWN_PERCENTAGE} \\\n","--preprocess={PREPROCESS} \\\n","--window_stride={WINDOW_STRIDE} \\\n","--model_architecture={MODEL_ARCHITECTURE} \\\n","--how_many_training_steps={TRAINING_STEPS} \\\n","--learning_rate={LEARNING_RATE} \\\n","--train_dir={TRAIN_DIR} \\\n","--summaries_dir={LOGS_DIR} \\\n","--verbosity={VERBOSITY} \\\n","--eval_step_interval={EVAL_STEP_INTERVAL} \\\n","--save_step_interval={SAVE_STEP_INTERVAL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XQUJLrdS-ftl","colab_type":"text"},"source":["## Generate a TensorFlow Model for Inference\n","\n","Combine relevant training results (graph, weights, etc) into a single file for inference. 
This process is known as freezing a model and the resulting model is known as a frozen model/graph, as it cannot be further re-trained after this process."]},{"cell_type":"code","metadata":{"id":"xyc3_eLh9sAg","colab_type":"code","colab":{}},"source":["!rm -rf {SAVED_MODEL}\n","!python tensorflow/tensorflow/examples/speech_commands/freeze.py \\\n","--wanted_words=$WANTED_WORDS \\\n","--window_stride_ms=$WINDOW_STRIDE \\\n","--preprocess=$PREPROCESS \\\n","--model_architecture=$MODEL_ARCHITECTURE \\\n","--start_checkpoint=$TRAIN_DIR$MODEL_ARCHITECTURE'.ckpt-'$TOTAL_STEPS \\\n","--save_format=saved_model \\\n","--output_file={SAVED_MODEL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_DBGDxVI-nKG","colab_type":"text"},"source":["## Generate a TensorFlow Lite Model\n","\n","Convert the frozen graph into a TensorFlow Lite model, which is fully quantized for use with embedded devices.\n","\n","The following cell will also print the model size, which will be under 20 kilobytes."]},{"cell_type":"code","metadata":{"id":"RIitkqvGWmre","colab_type":"code","colab":{}},"source":["import sys\n","# We add this path so we can import the speech processing modules.\n","sys.path.append(\"/content/tensorflow/tensorflow/examples/speech_commands/\")\n","import input_data\n","import models\n","import numpy as np"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"kzqECqMxgBh4","colab_type":"code","colab":{}},"source":["SAMPLE_RATE = 16000\n","CLIP_DURATION_MS = 1000\n","WINDOW_SIZE_MS = 30.0\n","FEATURE_BIN_COUNT = 40\n","BACKGROUND_FREQUENCY = 0.8\n","BACKGROUND_VOLUME_RANGE = 0.1\n","TIME_SHIFT_MS = 100.0\n","\n","DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'\n","VALIDATION_PERCENTAGE = 10\n","TESTING_PERCENTAGE = 10"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"rNQdAplJV1fz","colab_type":"code","colab":{}},"source":["model_settings = models.prepare_model_settings(\n"," len(input_data.prepare_words_list(WANTED_WORDS.split(','))),\n"," SAMPLE_RATE, CLIP_DURATION_MS, WINDOW_SIZE_MS,\n"," WINDOW_STRIDE, FEATURE_BIN_COUNT, PREPROCESS)\n","audio_processor = input_data.AudioProcessor(\n"," DATA_URL, DATASET_DIR,\n"," SILENT_PERCENTAGE, UNKNOWN_PERCENTAGE,\n"," WANTED_WORDS.split(','), VALIDATION_PERCENTAGE,\n"," TESTING_PERCENTAGE, model_settings, LOGS_DIR)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBj_AyCh1cC0","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," float_converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," float_tflite_model = float_converter.convert()\n"," float_tflite_model_size = open(FLOAT_MODEL_TFLITE, \"wb\").write(float_tflite_model)\n"," print(\"Float model is %d bytes\" % float_tflite_model_size)\n","\n"," converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," converter.optimizations = [tf.lite.Optimize.DEFAULT]\n"," converter.quantized_input_stats = {\"Reshape_1\": (QUANT_INPUT_MIN, QUANT_INPUT_MAX)}\n"," def representative_dataset_gen():\n"," for i in range(100):\n"," data, _ = audio_processor.get_data(1, i*1, model_settings,\n"," BACKGROUND_FREQUENCY, \n"," BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS,\n"," 'testing',\n"," sess)\n"," flattened_data = np.array(data.flatten(), dtype=np.float32).reshape(1, 1960)\n"," yield [flattened_data]\n"," converter.representative_dataset = representative_dataset_gen\n"," tflite_model = converter.convert()\n"," tflite_model_size = 
open(MODEL_TFLITE, \"wb\").write(tflite_model)\n"," print(\"Quantized model is %d bytes\" % tflite_model_size)\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"EeLiDZTbLkzv","colab_type":"text"},"source":["## Testing the TensorFlow Lite Model's Accuracy\n","\n","Verify that the model we've exported is still accurate, using the TF Lite Python API and our test set."]},{"cell_type":"code","metadata":{"id":"wQsEteKRLryJ","colab_type":"code","outputId":"d4a7c3eb-3d74-40e6-9eb5-7d2ffc5e3b6d","executionInfo":{"status":"ok","timestamp":1588901109389,"user_tz":420,"elapsed":9673,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}},"colab":{"base_uri":"https://localhost:8080/","height":51}},"source":["with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","float_interpreter = tf.lite.Interpreter(FLOAT_MODEL_TFLITE)\n","float_interpreter.allocate_tensors()\n","\n","float_input_index = float_interpreter.get_input_details()[0][\"index\"]\n","\n","float_output_index = float_interpreter.get_output_details()[0][\"index\"]\n","float_model_output = float_interpreter.tensor(float_output_index)\n","\n","float_correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," float_interpreter.set_tensor(float_input_index, flattened_input)\n"," float_interpreter.invoke()\n"," top_prediction = float_model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," float_correct_predictions += 1\n","\n","print('Float accuracy is %f%% (N=%d)' % ((float_correct_predictions * 100) / len(test_data), len(test_data)))\n","\n","interpreter = tf.lite.Interpreter(MODEL_TFLITE)\n","interpreter.allocate_tensors()\n","\n","input_index = interpreter.get_input_details()[0][\"index\"]\n","\n","output_index = interpreter.get_output_details()[0][\"index\"]\n","model_output = interpreter.tensor(output_index)\n","\n","with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," interpreter.set_tensor(input_index, flattened_input)\n"," interpreter.invoke()\n"," top_prediction = model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," correct_predictions += 1\n","\n","print('Quantized accuracy is %f%% (N=%d)' % ((correct_predictions * 100) / len(test_data), len(test_data)))\n"],"execution_count":15,"outputs":[{"output_type":"stream","text":["Float accuracy is 91.343042% (N=1236)\n","Quantized accuracy is 90.857605% (N=1236)\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"dt6Zqbxu-wIi","colab_type":"text"},"source":["## Generate a TensorFlow Lite for Microcontrollers Model\n","\n","Convert the TensorFlow Lite model into a C source file that can be loaded by TensorFlow Lite for 
Microcontrollers."]},{"cell_type":"code","metadata":{"id":"XohZOTjR8ZyE","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":442},"outputId":"415d733c-86c4-4f19-9aa0-edc4112e6efb","executionInfo":{"status":"ok","timestamp":1588901187730,"user_tz":420,"elapsed":11964,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}}},"source":["# Install xxd if it is not available\n","!apt-get update && apt-get -qq install xxd\n","# Convert to a C source file\n","!xxd -i {MODEL_TFLITE} > {MODEL_TFLITE_MICRO}\n","# Update variable names\n","REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')\n","!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}"],"execution_count":16,"outputs":[{"output_type":"stream","text":["Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n","Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n","Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]\n","Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease\n","Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n","Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n","Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Release\n","Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n","Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n","Get:10 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease [15.4 kB]\n","Get:11 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [908 kB]\n","Get:12 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [844 kB]\n","Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]\n","Get:16 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic/main Sources [1,814 kB]\n","Get:17 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [1,376 kB]\n","Get:18 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [1,205 kB]\n","Get:19 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic/main amd64 Packages [875 kB]\n","Fetched 7,294 kB in 3s (2,429 kB/s)\n","Reading package lists... Done\n","Selecting previously unselected package xxd.\n","(Reading database ... 144429 files and directories currently installed.)\n","Preparing to unpack .../xxd_2%3a8.0.1453-1ubuntu1.3_amd64.deb ...\n","Unpacking xxd (2:8.0.1453-1ubuntu1.3) ...\n","Setting up xxd (2:8.0.1453-1ubuntu1.3) ...\n","Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"2pQnN0i_-0L2","colab_type":"text"},"source":["## Deploy to a Microcontroller\n","\n","Follow the instructions in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) README.md for [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers/overview) to deploy this model on a specific microcontroller.\n","\n","**Reference Model:** If you have not modified this notebook, you can follow the instructions as-is to deploy the model. 
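If you would like to copy the generated artifacts off the Colab runtime first, one option is the Colab download helper (a minimal sketch, assuming you are running this notebook in Colab):\n","\n","```python\n","# Optional helper (assumes a Colab runtime): download the generated models to your machine.\n","from google.colab import files\n","files.download(MODEL_TFLITE)        # models/model.tflite\n","files.download(MODEL_TFLITE_MICRO)  # models/model.cc\n","```\n","\n","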
Refer to the [`micro_speech/train/models`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/models) directory to access the models generated in this notebook. \n","\n","**New Model:** If you have generated a new model to identify different words: (i) Update `kCategoryCount` and `kCategoryLabels` in [`micro_speech/micro_features/micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h) and (ii) Update the values assigned to the variables defined in [`micro_speech/micro_features/model.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc) with values displayed after running the following cell."]},{"cell_type":"code","metadata":{"id":"eoYyh0VU8pca","colab_type":"code","outputId":"dbaba37d-8a8d-4e11-d780-478971d9ee95","colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"status":"ok","timestamp":1588901241295,"user_tz":420,"elapsed":1288,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}}},"source":["# Print the C source file\n","!cat {MODEL_TFLITE_MICRO}"],"execution_count":17,"outputs":[{"output_type":"stream","text":["unsigned char g_model[] = {\n"," 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x12, 0x00,\n"," 0x1c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,\n"," 0x00, 0x00, 0x18, 0x00, 0x12, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,\n"," 0x64, 0x49, 0x00, 0x00, 0x34, 0x42, 0x00, 0x00, 0x1c, 0x42, 0x00, 0x00,\n"," 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,\n"," 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74,\n"," 0x69, 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0xd4, 0x41, 0x00, 0x00, 0xb4, 0x41, 0x00, 0x00,\n"," 0x24, 0x03, 0x00, 0x00, 0xf4, 0x02, 0x00, 0x00, 0xec, 0x02, 0x00, 0x00,\n"," 0xe4, 0x02, 0x00, 0x00, 0xc4, 0x02, 0x00, 0x00, 0xbc, 0x02, 0x00, 0x00,\n"," 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0xee, 0xbc, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,\n"," 0x05, 0x00, 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00,\n"," 0xd0, 0xb9, 0xff, 0xff, 0xd4, 0xb9, 0xff, 0xff, 0x0a, 0xbd, 0xff, 0xff,\n"," 0x04, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, 0xd4, 0x3e, 0x2e, 0xa6,\n"," 0xd9, 0x4c, 0x23, 0x25, 0xd3, 0x2f, 0x09, 0xcb, 0xf6, 0x04, 0xc4, 0x1d,\n"," 0xe5, 0x46, 0xf2, 0xcf, 0xd5, 0x53, 0x0c, 0x2b, 0x28, 0x06, 0xf8, 0xe9,\n"," 0xe1, 0xdb, 0xdd, 0xf0, 0xbe, 0x0c, 0xfc, 0xa5, 0xb9, 0x1b, 0xca, 0x13,\n"," 0x0d, 0xed, 0x0b, 0xd3, 0xff, 0xc8, 0x0d, 0xee, 0x04, 0xfe, 0xe1, 0x08,\n"," 0xd9, 0xec, 0x26, 0x06, 0x0c, 0xcb, 0x1b, 0xc3, 0xf8, 0x81, 0xd5, 0xbc,\n"," 0xc8, 0x48, 0xe6, 0x46, 0x0e, 0x34, 0x09, 0x0c, 0xea, 0x23, 0xe0, 0x14,\n"," 0x17, 0xf5, 0xe0, 0x07, 0xe2, 0x3a, 0xaa, 0xea, 0x05, 0x5f, 0x26, 0x31,\n"," 0x4e, 0xf6, 0xce, 0xe6, 0x0b, 0xed, 0xa7, 0xea, 0xbe, 0x08, 0xa4, 0x1b,\n"," 0xd0, 0x50, 0x11, 0x2a, 0x16, 0xd3, 0xca, 0x11, 0xeb, 0xd8, 0xcb, 0xeb,\n"," 0xfc, 0xee, 0xa5, 0x12, 0xda, 0x19, 0xfd, 0x1e, 0x1e, 0xc1, 0xc8, 0xe7,\n"," 0xfc, 0x99, 0xae, 0xca, 0xe9, 0x57, 
0x19, 0xe8, 0x1e, 0xff, 0xc4, 0xef,\n"," 0xdc, 0x0d, 0x25, 0xef, 0x1c, 0xef, 0x2e, 0xed, 0xf3, 0x39, 0xd6, 0x76,\n"," 0xe5, 0x4b, 0xb2, 0x2d, 0x4a, 0xf0, 0xf5, 0xcb, 0xc7, 0xf4, 0xbe, 0xea,\n"," 0xcb, 0xed, 0xce, 0x0a, 0xa4, 0x69, 0x1a, 0x34, 0x0a, 0xdc, 0xca, 0x37,\n"," 0xd4, 0xdf, 0x34, 0xe6, 0xf1, 0xd2, 0xb9, 0x1d, 0xb1, 0x42, 0xa3, 0x3a,\n"," 0x0f, 0xc0, 0xc3, 0x0a, 0xcf, 0xc4, 0xe7, 0xd2, 0xfa, 0x62, 0x14, 0x18,\n"," 0x49, 0xe1, 0x07, 0xe2, 0xec, 0x29, 0x4c, 0xd0, 0x53, 0xda, 0xdb, 0xe8,\n"," 0xf9, 0x2f, 0x0e, 0xf6, 0x17, 0x2a, 0x23, 0x29, 0x7d, 0xec, 0x04, 0x2b,\n"," 0x27, 0xf8, 0xb2, 0xdc, 0xbf, 0xec, 0xec, 0xb0, 0xe4, 0x62, 0x01, 0x42,\n"," 0x28, 0xe2, 0x13, 0xe7, 0x13, 0xf3, 0xd3, 0xe1, 0xf7, 0xc3, 0xee, 0xf9,\n"," 0xc4, 0x62, 0xfc, 0x58, 0x12, 0xc5, 0x02, 0x19, 0xe3, 0xe1, 0xf0, 0xe8,\n"," 0xc4, 0x5e, 0xf9, 0xf3, 0x31, 0xce, 0xf0, 0xc0, 0xf8, 0x2e, 0x34, 0x37,\n"," 0x7f, 0xc7, 0xa1, 0xdf, 0xf3, 0x31, 0xf8, 0xed, 0x27, 0x11, 0xc9, 0x19,\n"," 0x72, 0xf3, 0x18, 0x1b, 0x2b, 0xe6, 0xef, 0xd8, 0xd1, 0xd4, 0x14, 0xf8,\n"," 0xd5, 0x51, 0x40, 0x42, 0x2d, 0xe5, 0x0b, 0x94, 0x03, 0xf4, 0xde, 0xdf,\n"," 0xf1, 0xc0, 0x08, 0xf9, 0xc4, 0x71, 0xf5, 0x75, 0x20, 0xc8, 0xf9, 0xcb,\n"," 0xe0, 0x0c, 0x81, 0xf5, 0xc2, 0x6f, 0x25, 0xe3, 0x15, 0xca, 0x40, 0xac,\n"," 0xe6, 0x37, 0x60, 0xb4, 0x30, 0xb8, 0x19, 0xdb, 0xf1, 0x22, 0x56, 0xfe,\n"," 0x02, 0xf7, 0xfb, 0x0e, 0x68, 0xe6, 0x5e, 0x81, 0x15, 0xe4, 0xc5, 0xd9,\n"," 0xc3, 0xbd, 0x42, 0xe5, 0xbe, 0x2f, 0xde, 0x3d, 0x04, 0xe3, 0x4a, 0x97,\n"," 0xdb, 0xf6, 0xb1, 0xdf, 0xe5, 0xb2, 0x4b, 0xf2, 0xbc, 0x5e, 0x22, 0x7f,\n"," 0xfd, 0xd7, 0x37, 0xda, 0xd2, 0x1a, 0x22, 0xf8, 0xbf, 0x69, 0x1b, 0x22,\n"," 0x07, 0xcc, 0x11, 0xa3, 0xf8, 0x2c, 0x35, 0xdf, 0x60, 0xc8, 0xc9, 0xd9,\n"," 0xeb, 0x0c, 0x4e, 0x2e, 0x28, 0xe4, 0x44, 0x02, 0x7f, 0xda, 0x62, 0x25,\n"," 0x14, 0xe6, 0xbd, 0xe1, 0xcf, 0x9c, 0x50, 0x17, 0xff, 0x1e, 0xc3, 0x3c,\n"," 0x25, 0xde, 0x4c, 0x14, 0xf7, 0xfc, 0x02, 0xe1, 0xdd, 0xd3, 0x3d, 0xf8,\n"," 0xef, 0x49, 0x0c, 0x7b, 0x0a, 0xff, 0x24, 0x34, 0xfe, 0x2b, 0x14, 0x0b,\n"," 0xb6, 0x4f, 0xc5, 0x23, 0xe6, 0xe2, 0x12, 0x9f, 0xeb, 0x21, 0xc9, 0x45,\n"," 0x35, 0xcc, 0xbf, 0xea, 0x01, 0xf4, 0xe0, 0x15, 0x0e, 0xe8, 0x9d, 0xff,\n"," 0x54, 0xc7, 0xec, 0x27, 0x32, 0xed, 0xe3, 0xef, 0xd6, 0xa7, 0xf5, 0xea,\n"," 0xfa, 0x09, 0xc3, 0x32, 0x1d, 0xfd, 0x05, 0x19, 0x03, 0xf6, 0x05, 0xe9,\n"," 0xed, 0xe6, 0x05, 0x64, 0xf0, 0x35, 0xdc, 0x61, 0x12, 0x1d, 0x20, 0x3c,\n"," 0x0f, 0x33, 0xf8, 0x12, 0xa1, 0x1c, 0x81, 0x1d, 0xdc, 0xe1, 0x0a, 0x99,\n"," 0xd1, 0xf7, 0x9f, 0xc9, 0x1b, 0xd8, 0x32, 0xf2, 0xee, 0xb3, 0xaf, 0x0f,\n"," 0x01, 0xdd, 0x49, 0xf8, 0x7c, 0xa6, 0xbd, 0xac, 0x36, 0xeb, 0x0f, 0x01,\n"," 0xdb, 0xca, 0xb8, 0xb8, 0xf8, 0xf6, 0xf9, 0x27, 0x32, 0xf8, 0xde, 0xef,\n"," 0x19, 0xff, 0xf9, 0xf7, 0xf3, 0xde, 0xc7, 0x93, 0xfb, 0x1e, 0x1d, 0x50,\n"," 0xf3, 0x31, 0xc5, 0x00, 0x18, 0x27, 0xb8, 0x1a, 0x9e, 0xdf, 0xd0, 0x2c,\n"," 0xce, 0xe0, 0xa3, 0xa9, 0x9d, 0xb8, 0xaf, 0x67, 0x13, 0xd3, 0x19, 0xf7,\n"," 0xed, 0x81, 0xb1, 0x3d, 0xe9, 0xd5, 0x00, 0xf4, 0x45, 0x93, 0xcd, 0x62,\n"," 0x1e, 0xd6, 0x3a, 0x08, 0xd9, 0xb9, 0xd2, 0x1e, 0xeb, 0xe9, 0xbb, 0x1e,\n"," 0x1f, 0xf9, 0xe0, 0x20, 0xf6, 0xf2, 0x30, 0xf9, 0xfe, 0xfb, 0xe9, 0x66,\n"," 0xeb, 0xf5, 0x13, 0x40, 0xcf, 0x2d, 0xce, 0x0f, 0xe9, 0x06, 0x9a, 0x0c,\n"," 0x64, 0xbc, 0xff, 0xff, 0x9a, 0xbf, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x31, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0xbc, 0xff, 0xff,\n"," 0x88, 0xbc, 0xff, 0xff, 0xbe, 0xbf, 0xff, 0xff, 
0x04, 0x00, 0x00, 0x00,\n"," 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, 0xfe, 0xff, 0xff,\n"," 0x78, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x43, 0xfd, 0xff, 0xff, 0xa9, 0xff, 0xff, 0xff, 0x97, 0xfc, 0xff, 0xff,\n"," 0xea, 0xbf, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x3e, 0x00, 0x00,\n"," 0xf5, 0xf9, 0xff, 0x08, 0xea, 0x05, 0x0f, 0x0c, 0xf3, 0x0e, 0xf6, 0x0f,\n"," 0xfa, 0x01, 0x11, 0xf1, 0xf6, 0xea, 0xfc, 0x0f, 0xfc, 0xf1, 0xdd, 0x0e,\n"," 0x1c, 0xef, 0xe6, 0xff, 0x05, 0xe8, 0x03, 0x11, 0xf6, 0xf1, 0x11, 0x0c,\n"," 0xd7, 0x08, 0xf5, 0x30, 0xd9, 0x10, 0x14, 0x11, 0x10, 0x17, 0xee, 0x23,\n"," 0x0c, 0xeb, 0x00, 0x06, 0xf6, 0xf7, 0x18, 0x0e, 0x18, 0x13, 0xfe, 0xfa,\n"," 0xf3, 0xdd, 0xfa, 0xfb, 0x01, 0xfd, 0xe5, 0xe4, 0x00, 0x0d, 0xfe, 0x09,\n"," 0xe9, 0x0a, 0x10, 0x1d, 0xf8, 0xf4, 0x0a, 0x1a, 0x10, 0x12, 0x18, 0xf1,\n"," 0xfc, 0x1d, 0x00, 0x25, 0xd8, 0x08, 0xf8, 0xff, 0x06, 0x19, 0xf5, 0x0f,\n"," 0x1c, 0x17, 0x0c, 0x16, 0xf3, 0x29, 0x20, 0x32, 0xfe, 0x19, 0xfb, 0x02,\n"," 0x04, 0x15, 0xf3, 0x2b, 0x06, 0x14, 0x0e, 0xde, 0x04, 0x0e, 0xfc, 0x2d,\n"," 0x1b, 0xdb, 0xec, 0xee, 0x00, 0xf6, 0x01, 0x33, 0x02, 0xe7, 0x06, 0xdd,\n"," 0xf9, 0x03, 0x13, 0x03, 0xf8, 0xec, 0x14, 0xe4, 0x0f, 0xfa, 0xd4, 0x22,\n"," 0x00, 0x11, 0x09, 0x02, 0x0e, 0xf4, 0x05, 0xfb, 0x04, 0x15, 0x04, 0x03,\n"," 0xff, 0x0f, 0x09, 0xf2, 0xeb, 0xfc, 0x06, 0x00, 0xe5, 0x0a, 0xf2, 0xfc,\n"," 0xfd, 0x12, 0xee, 0xe9, 0xf2, 0xfd, 0xf9, 0xf3, 0xce, 0x0f, 0xe9, 0xee,\n"," 0xff, 0x14, 0x15, 0x0b, 0xcb, 0x03, 0xf2, 0x1b, 0xdb, 0x09, 0x1d, 0x07,\n"," 0xd8, 0xde, 0xe6, 0x13, 0xd8, 0xf0, 0xe6, 0x00, 0xe7, 0xec, 0xd3, 0x00,\n"," 0xc5, 0x25, 0xdb, 0x0a, 0xde, 0x1f, 0xd9, 0x11, 0xc1, 0x06, 0x01, 0x2e,\n"," 0x09, 0x19, 0x09, 0x0f, 0xbe, 0x00, 0xf7, 0x08, 0x10, 0x12, 0xff, 0x10,\n"," 0xf4, 0x05, 0xdf, 0x16, 0xe7, 0xe6, 0xef, 0xf4, 0xdd, 0x18, 0x18, 0x16,\n"," 0xeb, 0x1a, 0xd7, 0xdb, 0xee, 0x15, 0xf1, 0x1e, 0xfc, 0x02, 0xfe, 0x0a,\n"," 0xed, 0x17, 0x1c, 0x39, 0x01, 0xde, 0x06, 0xf3, 0xdb, 0x27, 0xfc, 0x1e,\n"," 0xe4, 0x01, 0x03, 0x1d, 0xc5, 0x0d, 0xea, 0x0b, 0xfe, 0x05, 0xfc, 0x10,\n"," 0xc2, 0x06, 0x0a, 0x51, 0xf4, 0xd8, 0xe8, 0x03, 0xcd, 0x1a, 0xe7, 0x13,\n"," 0xfb, 0xfd, 0xe2, 0x2a, 0xf7, 0x0d, 0xea, 0x29, 0xfc, 0xea, 0x1c, 0x08,\n"," 0x0a, 0x13, 0xfc, 0xf8, 0x15, 0xf3, 0x06, 0xe9, 0x1d, 0x0c, 0x1c, 0x14,\n"," 0xdc, 0x17, 0x16, 0xff, 0x00, 0x06, 0x0c, 0xfe, 0x0c, 0x0a, 0xe6, 0x18,\n"," 0xef, 0xd6, 0x1d, 0xee, 0xd2, 0x1c, 0xfe, 0x0d, 0xec, 0xfc, 0xe8, 0x02,\n"," 0xf8, 0x13, 0xf9, 0x17, 0x08, 0xf8, 0xf9, 0x06, 0x04, 0x07, 0xcf, 0x07,\n"," 0xfb, 0xde, 0xf2, 0x0c, 0xe4, 0xf2, 0x1d, 0xdd, 0xd7, 0xfd, 0xec, 0xfd,\n"," 0xd8, 0xd9, 0x0a, 0xf5, 0xf4, 0x02, 0x1f, 0x0e, 0xf8, 0x1a, 0xe0, 0x06,\n"," 0x0a, 0x23, 0xf6, 0x1f, 0xea, 0x07, 0xde, 0x00, 0xf5, 0x10, 0xe7, 0x06,\n"," 0xf3, 0xe1, 0x0a, 0x2a, 0xf0, 0x00, 0x18, 0x09, 0xe8, 0xd6, 0xec, 0x00,\n"," 0xef, 0x1c, 0xf2, 0x07, 0xf1, 0xf5, 0x16, 0x13, 0xdf, 0x0f, 0xdd, 0x1b,\n"," 0x10, 0xdb, 0xfb, 0x07, 0xda, 0x17, 0xdf, 0x28, 0xf5, 0xe9, 0x07, 0x0b,\n"," 0x02, 0xf4, 0xf0, 0x0e, 0xda, 0x1e, 0x1d, 0xff, 0xde, 0x0e, 0x1e, 0x24,\n"," 0xf5, 0xfc, 0x08, 0x1f, 0xff, 0x12, 0x09, 0x18, 0x20, 0xd8, 0x08, 0xf0,\n"," 0xef, 0x07, 0x02, 0x19, 0xe8, 0xf3, 0x02, 0x03, 0xdf, 0x22, 0x0e, 0x04,\n"," 0x0d, 0xf9, 0xea, 0x1c, 0xf1, 0x17, 0x08, 0x02, 0x0b, 0x02, 0x00, 0x22,\n"," 0xf0, 0x0e, 0xdf, 0x07, 0xea, 0x01, 0xf3, 0xef, 0xfb, 0xff, 0x07, 0xfd,\n"," 0xf7, 0xf2, 0x14, 0x1e, 0x17, 0xe7, 0x12, 0xf8, 0xee, 0xfc, 0x09, 0xe0,\n"," 0x08, 0xd5, 0x07, 0xff, 0x11, 0xf7, 0xee, 0x14, 0xfd, 0xe0, 
0xda, 0x03,\n"," 0xd5, 0xcd, 0x04, 0xe5, 0xea, 0xde, 0xf7, 0x02, 0x0b, 0xfb, 0x03, 0x10,\n"," 0xf7, 0xcf, 0x0c, 0xfb, 0xee, 0x06, 0x0a, 0x12, 0x0e, 0xd7, 0xfb, 0x06,\n"," 0xf6, 0xe0, 0xfb, 0xf1, 0xec, 0xf6, 0x13, 0xf6, 0x0a, 0xea, 0x24, 0x0a,\n"," 0xfd, 0xe6, 0xf8, 0x19, 0x06, 0xe2, 0x05, 0x20, 0x08, 0xe3, 0xd8, 0x05,\n"," 0x00, 0xcd, 0xeb, 0x0f, 0xfd, 0xec, 0xf6, 0xfc, 0xe1, 0xf8, 0xf4, 0xfe,\n"," 0xdf, 0x10, 0xf8, 0x0d, 0xf3, 0xf9, 0x06, 0x06, 0xd5, 0xfb, 0x16, 0x18,\n"," 0x00, 0xfe, 0xf9, 0x17, 0x12, 0xe2, 0xfb, 0xf8, 0xe5, 0x06, 0x29, 0xdf,\n"," 0xfb, 0xfd, 0x08, 0x11, 0xf8, 0x10, 0x13, 0x03, 0xe1, 0xf9, 0xf8, 0xfd,\n"," 0x06, 0xf2, 0x11, 0xff, 0xf8, 0xfe, 0x12, 0xf5, 0xf2, 0xe1, 0x26, 0x0b,\n"," 0xe9, 0xfe, 0x04, 0xf1, 0xeb, 0xfd, 0x0c, 0x26, 0xfd, 0xfb, 0x12, 0xf8,\n"," 0xfd, 0x01, 0x03, 0x05, 0x09, 0x27, 0x28, 0xff, 0x0f, 0x0a, 0xe9, 0xff,\n"," 0x00, 0xec, 0xf7, 0xf4, 0x04, 0x03, 0x08, 0x10, 0xfe, 0xf3, 0x1f, 0xf5,\n"," 0xf0, 0xff, 0x0a, 0x20, 0x0c, 0xd4, 0xef, 0xdb, 0xf5, 0xf4, 0x1a, 0x02,\n"," 0xfe, 0xda, 0x04, 0xe4, 0x0b, 0xd9, 0x1a, 0xee, 0xfd, 0xc6, 0xf8, 0x0d,\n"," 0xec, 0xfe, 0x19, 0xe1, 0x1f, 0xc5, 0x1d, 0x02, 0xf6, 0xd6, 0x04, 0xe6,\n"," 0x06, 0xe4, 0x0c, 0xf0, 0x31, 0xe8, 0xe2, 0xec, 0x1d, 0xe8, 0x0f, 0x02,\n"," 0x2d, 0xe8, 0xf1, 0xf7, 0x0f, 0xf9, 0x13, 0xfd, 0x1f, 0xd8, 0x24, 0x17,\n"," 0xfb, 0xf8, 0x01, 0xe3, 0x14, 0xaf, 0x14, 0x01, 0x1c, 0xe5, 0x10, 0xf2,\n"," 0x16, 0xd3, 0xed, 0xe3, 0x15, 0x02, 0x27, 0xeb, 0x1e, 0x12, 0x19, 0xff,\n"," 0x16, 0xeb, 0x13, 0x11, 0xfa, 0x14, 0xf4, 0x02, 0x11, 0x08, 0xfc, 0xf9,\n"," 0x07, 0xdc, 0x1c, 0xeb, 0x16, 0xf0, 0x1c, 0x06, 0x08, 0xfa, 0xf9, 0x11,\n"," 0xee, 0x07, 0xf3, 0x06, 0xfd, 0xfd, 0x19, 0xf9, 0xf1, 0xe2, 0x1f, 0xf2,\n"," 0x0f, 0xe9, 0x0c, 0xfb, 0x1d, 0x03, 0x02, 0xe2, 0x1c, 0x11, 0xfb, 0xf7,\n"," 0x04, 0x04, 0x18, 0xe7, 0x27, 0xe2, 0xfc, 0xf5, 0x06, 0x00, 0x08, 0xfd,\n"," 0x15, 0xdb, 0x16, 0xfe, 0x04, 0x08, 0xf8, 0xff, 0xfb, 0xeb, 0xeb, 0xfe,\n"," 0xed, 0xf4, 0xf0, 0xe4, 0xfe, 0x22, 0x09, 0x02, 0x21, 0xc8, 0x0b, 0xe4,\n"," 0xf4, 0xf2, 0x04, 0x02, 0xef, 0xce, 0x13, 0x07, 0xfa, 0xe0, 0xff, 0xf1,\n"," 0xfe, 0xd5, 0xfc, 0xdc, 0x0f, 0xf2, 0x05, 0x10, 0x00, 0xd4, 0x24, 0xea,\n"," 0x1e, 0xe3, 0x2a, 0x18, 0xf3, 0xd2, 0x01, 0xe0, 0x0e, 0xdb, 0x2a, 0xeb,\n"," 0x02, 0xdd, 0xec, 0xd7, 0x12, 0xec, 0x31, 0xfc, 0x25, 0xd9, 0x04, 0x08,\n"," 0x15, 0xd0, 0xe8, 0x14, 0x18, 0xf9, 0xfa, 0xf6, 0x24, 0xea, 0x0a, 0x06,\n"," 0x02, 0xfb, 0x05, 0xea, 0x02, 0xf0, 0x04, 0xf1, 0x1f, 0x13, 0x04, 0x17,\n"," 0x14, 0xf0, 0x0d, 0x10, 0x03, 0x05, 0x26, 0xec, 0xfe, 0xe8, 0x19, 0xe9,\n"," 0x0a, 0xee, 0xe4, 0x04, 0x2a, 0xec, 0x1b, 0x06, 0x05, 0xff, 0xd7, 0xf5,\n"," 0x1c, 0x0c, 0x20, 0xfe, 0xe3, 0xe1, 0x11, 0xdc, 0x2b, 0x03, 0x04, 0x1d,\n"," 0x1a, 0xd4, 0x1d, 0xea, 0x06, 0x04, 0x04, 0x1a, 0x1e, 0xef, 0x00, 0xe0,\n"," 0x1e, 0xf8, 0x0c, 0xfe, 0x12, 0xd8, 0x0b, 0xe5, 0xf2, 0x03, 0x21, 0x06,\n"," 0x01, 0x22, 0xef, 0xf3, 0xfb, 0xfb, 0x25, 0x17, 0x08, 0xeb, 0xf3, 0xec,\n"," 0xf4, 0x06, 0x21, 0xec, 0xe3, 0xe3, 0xe4, 0xe5, 0xf9, 0xe8, 0x0d, 0xec,\n"," 0x1c, 0xc3, 0x0b, 0xdf, 0x12, 0x05, 0xe6, 0xdd, 0xde, 0xc5, 0xe6, 0xea,\n"," 0x1a, 0xf1, 0x0f, 0xe3, 0x11, 0xcf, 0xea, 0xe5, 0xfe, 0xf6, 0x02, 0x0b,\n"," 0x0e, 0xd5, 0x03, 0xd6, 0x11, 0x02, 0x2d, 0xfc, 0xed, 0xec, 0xee, 0xfa,\n"," 0xf8, 0xf2, 0x01, 0x0e, 0x19, 0xf1, 0x14, 0x03, 0x1a, 0xf3, 0x0c, 0xf9,\n"," 0xf5, 0xf4, 0xf2, 0xdf, 0xf0, 0xd6, 0x32, 0xf6, 0x18, 0x06, 0xf3, 0x01,\n"," 0x02, 0xe8, 0x09, 0x14, 0xff, 0x0f, 0x23, 0x26, 0x05, 0xf3, 0x08, 0xf3,\n"," 0x16, 0xfb, 0xed, 0x0d, 0x13, 0xe8, 0x25, 0xf1, 0xe9, 0xf2, 0xf5, 
0x0c,\n"," 0x19, 0xf0, 0x1f, 0xfa, 0x00, 0xe4, 0xfe, 0x22, 0xf2, 0xd5, 0x14, 0xe9,\n"," 0x06, 0xe9, 0xfe, 0x13, 0x07, 0x08, 0x00, 0xfd, 0x16, 0xdb, 0xe0, 0x12,\n"," 0x07, 0x14, 0x09, 0x1c, 0x17, 0x10, 0x20, 0xd3, 0xfd, 0xe9, 0x25, 0xfb,\n"," 0x19, 0xd8, 0x0b, 0xf9, 0xf3, 0xde, 0xfe, 0x21, 0x12, 0xec, 0xf4, 0xe4,\n"," 0xf7, 0xff, 0x21, 0xef, 0x26, 0x0f, 0xf9, 0xee, 0xe6, 0x03, 0x2f, 0xf7,\n"," 0x0e, 0x10, 0xfa, 0x08, 0x0b, 0xfa, 0xe9, 0xff, 0xf9, 0xdd, 0x01, 0xe3,\n"," 0xfb, 0x01, 0xfc, 0xf4, 0x1a, 0xb9, 0xf6, 0xd5, 0x1b, 0x01, 0xfd, 0xe2,\n"," 0x03, 0xd2, 0x11, 0xf5, 0x10, 0xd9, 0x07, 0x07, 0xe1, 0xc1, 0xff, 0xd4,\n"," 0x10, 0xef, 0x23, 0x10, 0x01, 0xba, 0x09, 0xd1, 0xfd, 0xe3, 0x0d, 0xe3,\n"," 0x00, 0xcf, 0x03, 0xcd, 0xfd, 0xf9, 0xfe, 0xe9, 0x07, 0xe4, 0x04, 0xfc,\n"," 0xf1, 0x00, 0x21, 0x01, 0xf6, 0x01, 0xda, 0x14, 0xe8, 0xd9, 0x14, 0x05,\n"," 0x08, 0x01, 0x26, 0xf8, 0xfb, 0xc1, 0x2c, 0x1a, 0x06, 0xed, 0xef, 0xf5,\n"," 0xf1, 0x00, 0x0e, 0x19, 0x1f, 0x08, 0xff, 0x0c, 0x04, 0xf6, 0x25, 0x17,\n"," 0x1a, 0x0b, 0xeb, 0xe6, 0x0f, 0x10, 0x13, 0x14, 0x12, 0xfa, 0x22, 0xee,\n"," 0xe6, 0x0b, 0x2d, 0xf9, 0x1e, 0xf0, 0x04, 0x09, 0x00, 0x0f, 0x2f, 0x05,\n"," 0xe8, 0xf9, 0x03, 0xd7, 0x02, 0xea, 0x1f, 0xfd, 0x22, 0xed, 0xf1, 0xed,\n"," 0xfe, 0xdc, 0x0d, 0x0e, 0x0c, 0xf0, 0x19, 0xf1, 0x09, 0xe0, 0x2c, 0xfb,\n"," 0x02, 0xdc, 0xf3, 0xd9, 0x32, 0xf7, 0x09, 0xe3, 0x09, 0x17, 0x03, 0xf3,\n"," 0x08, 0x01, 0x1b, 0xfa, 0x06, 0xfa, 0x1f, 0x15, 0x16, 0xe7, 0x16, 0xfe,\n"," 0xfe, 0xf4, 0xe0, 0xe2, 0x12, 0x21, 0xfa, 0x15, 0x00, 0xcb, 0x07, 0xb6,\n"," 0x1b, 0xf2, 0x34, 0xfa, 0xfd, 0xba, 0x19, 0xd4, 0x2c, 0xde, 0xf2, 0x1c,\n"," 0x0c, 0xc5, 0xef, 0xe4, 0x0a, 0xfb, 0x03, 0x03, 0xf2, 0xcd, 0x01, 0xe0,\n"," 0xf2, 0xf6, 0xf5, 0x0a, 0xf6, 0xc5, 0x0d, 0xe2, 0x09, 0xdc, 0x00, 0x05,\n"," 0x10, 0xe1, 0x14, 0xf7, 0x02, 0x08, 0x14, 0x12, 0xf5, 0xf8, 0x1c, 0xe9,\n"," 0xf5, 0xf1, 0x26, 0xd8, 0x16, 0x06, 0x00, 0xf8, 0xf4, 0xe0, 0x32, 0x03,\n"," 0x07, 0x15, 0xea, 0x10, 0xf2, 0xfa, 0x17, 0x1f, 0x07, 0x07, 0x17, 0x06,\n"," 0x06, 0xe7, 0x05, 0xfe, 0xe5, 0x1b, 0x16, 0xff, 0xf8, 0xfe, 0x2c, 0xf8,\n"," 0x00, 0x03, 0xf3, 0xf3, 0xf3, 0xf0, 0xfb, 0xdf, 0x02, 0xe5, 0x16, 0xed,\n"," 0xf9, 0x01, 0x23, 0x03, 0x16, 0xe6, 0xfe, 0xeb, 0x00, 0xf0, 0x27, 0x1b,\n"," 0xeb, 0xee, 0x03, 0xe9, 0x02, 0xd8, 0x2f, 0xe4, 0x0d, 0xde, 0x14, 0xe3,\n"," 0xfd, 0xf6, 0x13, 0x06, 0x10, 0xf4, 0xeb, 0xe5, 0x19, 0xf0, 0x17, 0xea,\n"," 0x15, 0x0d, 0xe4, 0x0b, 0x31, 0xf3, 0x13, 0x1b, 0xf9, 0xe0, 0x0b, 0xfc,\n"," 0x09, 0x03, 0x26, 0xe6, 0xeb, 0xd1, 0xd9, 0xc8, 0x00, 0xf7, 0x26, 0x0a,\n"," 0x08, 0xd4, 0xe3, 0xd6, 0x1b, 0x06, 0x1a, 0xed, 0xf4, 0xee, 0xfd, 0xe7,\n"," 0x14, 0xe1, 0x06, 0x11, 0xf9, 0xaa, 0xf6, 0xd7, 0x0c, 0xdf, 0x25, 0x17,\n"," 0x11, 0xd8, 0xfa, 0x08, 0x0e, 0xed, 0x29, 0x0c, 0xec, 0xeb, 0x0b, 0x02,\n"," 0xf3, 0xfb, 0x19, 0x1c, 0x13, 0x11, 0x10, 0xeb, 0x0d, 0xef, 0x11, 0xff,\n"," 0x14, 0xe4, 0xd9, 0x02, 0xed, 0xe6, 0x23, 0xdf, 0xfb, 0xf4, 0xef, 0xee,\n"," 0xf9, 0xf2, 0x24, 0x04, 0x03, 0x02, 0x0b, 0x0e, 0xed, 0x08, 0x19, 0xf9,\n"," 0xf2, 0x02, 0xf4, 0x02, 0xf0, 0x1b, 0x03, 0x08, 0xf7, 0xe7, 0xf9, 0xf3,\n"," 0xf7, 0x15, 0x11, 0x18, 0x18, 0x0e, 0x13, 0x13, 0x0d, 0x0e, 0x0e, 0x06,\n"," 0xfb, 0xe8, 0x13, 0x09, 0x07, 0xf2, 0x24, 0x0c, 0x22, 0xf8, 0x08, 0xef,\n"," 0xee, 0xec, 0x25, 0x09, 0x17, 0xde, 0xfb, 0xdd, 0x0d, 0xd0, 0x3c, 0x29,\n"," 0x13, 0xf5, 0xeb, 0xeb, 0xfc, 0xd2, 0x33, 0xf9, 0x05, 0xe0, 0x15, 0x04,\n"," 0x08, 0xfd, 0x14, 0x14, 0xfe, 0x0a, 0xee, 0xe7, 0x14, 0xfb, 0x15, 0xef,\n"," 0x07, 0xdf, 0x12, 0x14, 0x00, 0xf0, 0xff, 0x03, 0xf9, 0xe5, 0xf7, 0xcf,\n"," 
0x07, 0xeb, 0x0b, 0xd8, 0xf4, 0xce, 0xe1, 0xaf, 0x20, 0x0b, 0xfa, 0x09,\n"," 0xf6, 0xbf, 0x18, 0xe9, 0x06, 0xcc, 0x03, 0xf4, 0x0e, 0xb8, 0x08, 0xd0,\n"," 0x07, 0xe9, 0x10, 0x17, 0x0a, 0xcf, 0x21, 0xf7, 0x03, 0xf9, 0x26, 0xe0,\n"," 0x04, 0xe8, 0x0c, 0xff, 0x0b, 0xfe, 0x16, 0x16, 0xfe, 0xda, 0x17, 0x04,\n"," 0xfd, 0x0b, 0x15, 0x0d, 0xf8, 0x08, 0xf9, 0xf3, 0x00, 0xe8, 0x07, 0x0a,\n"," 0xf4, 0xf9, 0x0e, 0xdc, 0xfb, 0xe3, 0xfe, 0x09, 0xff, 0x07, 0xfa, 0xfd,\n"," 0xe6, 0x05, 0xf9, 0x0e, 0xf2, 0xef, 0xfe, 0xf6, 0x04, 0xee, 0x2d, 0x0e,\n"," 0x04, 0xe7, 0xec, 0xfb, 0xf1, 0x08, 0x17, 0x04, 0xf9, 0xf9, 0x15, 0xff,\n"," 0x00, 0xfc, 0x23, 0xf6, 0x00, 0x1a, 0xf4, 0x1c, 0x02, 0x04, 0x1e, 0x11,\n"," 0x00, 0xee, 0xf3, 0xe6, 0xed, 0xfa, 0x24, 0xe0, 0xfb, 0xe7, 0x10, 0xd7,\n"," 0xdc, 0xf5, 0x4c, 0xf3, 0x19, 0x01, 0xf9, 0xef, 0x00, 0xee, 0x13, 0xeb,\n"," 0xf9, 0xd7, 0x0b, 0xf1, 0xef, 0x05, 0x45, 0xf7, 0x01, 0x0b, 0xf3, 0xfa,\n"," 0x0d, 0x10, 0x18, 0x1c, 0xf5, 0xf5, 0x0a, 0xef, 0x0c, 0x19, 0x06, 0xf8,\n"," 0x06, 0xf1, 0x29, 0xd0, 0x0c, 0x07, 0x17, 0xf7, 0x18, 0xb0, 0x26, 0xcf,\n"," 0x16, 0x01, 0x03, 0xf4, 0xf0, 0xc8, 0x04, 0xe8, 0x1a, 0xf4, 0x0f, 0xeb,\n"," 0x0e, 0xb6, 0x00, 0xd3, 0x04, 0xf8, 0x26, 0xf8, 0x1a, 0xa8, 0xf9, 0xcb,\n"," 0x04, 0xeb, 0x22, 0x0a, 0x0d, 0xcd, 0xeb, 0xea, 0x03, 0xe2, 0x09, 0xed,\n"," 0x0b, 0xe3, 0x09, 0xf1, 0xf1, 0xec, 0x21, 0xee, 0x0e, 0xf4, 0x1c, 0x04,\n"," 0xee, 0xfb, 0x0d, 0x1a, 0xfc, 0xf4, 0xfe, 0xef, 0x06, 0xe0, 0x13, 0x0e,\n"," 0xfd, 0x05, 0x0b, 0x1d, 0xfd, 0xf6, 0x09, 0x1b, 0x04, 0x27, 0xf5, 0x0e,\n"," 0xf0, 0xed, 0x1e, 0xf7, 0xea, 0xfa, 0x1a, 0xf9, 0xe5, 0x07, 0x15, 0x0e,\n"," 0x00, 0xea, 0xfa, 0xe9, 0xf7, 0xec, 0x31, 0xec, 0x04, 0x09, 0x10, 0xec,\n"," 0xfd, 0xe4, 0x27, 0x00, 0x0c, 0xdc, 0xdc, 0xde, 0xed, 0xe9, 0x1f, 0xe4,\n"," 0xfa, 0x02, 0xd9, 0xfe, 0x06, 0xf1, 0x15, 0xee, 0xf1, 0xf3, 0x14, 0xe2,\n"," 0x00, 0xdb, 0x28, 0x17, 0x09, 0xdc, 0xfe, 0xea, 0xfc, 0x14, 0x20, 0x13,\n"," 0xf9, 0xed, 0xf1, 0xe8, 0xfd, 0x04, 0x3a, 0xfd, 0x00, 0x15, 0xf1, 0xee,\n"," 0x10, 0xe3, 0x0b, 0x20, 0x10, 0xeb, 0x10, 0xc3, 0x14, 0xf8, 0x03, 0x0b,\n"," 0x11, 0xc3, 0x27, 0xc5, 0x2d, 0xdb, 0x15, 0x0e, 0xf5, 0xce, 0xfa, 0xd8,\n"," 0x1c, 0xf0, 0x20, 0x04, 0xec, 0xc4, 0xf9, 0xda, 0x1c, 0xd9, 0x01, 0x05,\n"," 0x1f, 0xbb, 0xf8, 0xff, 0xef, 0x06, 0x10, 0xe3, 0x02, 0xe6, 0xdb, 0xee,\n"," 0x02, 0xfe, 0xfc, 0x15, 0xfe, 0xf0, 0xdb, 0xfb, 0xf5, 0xfc, 0x16, 0x02,\n"," 0xed, 0x01, 0x12, 0xe2, 0x06, 0xeb, 0x10, 0x16, 0x03, 0xed, 0x1a, 0x07,\n"," 0xf0, 0xe4, 0x29, 0xf5, 0xfa, 0xe1, 0x07, 0xe8, 0xf8, 0xfd, 0xf5, 0x03,\n"," 0xfc, 0x18, 0x03, 0xe2, 0x00, 0xf7, 0x13, 0xf9, 0xe4, 0x10, 0x25, 0xfc,\n"," 0x0e, 0x1f, 0x1c, 0x12, 0x1e, 0xfd, 0x01, 0xf9, 0xef, 0x1d, 0x17, 0x1b,\n"," 0x04, 0xfd, 0x25, 0x12, 0xf5, 0x20, 0x0a, 0x02, 0x03, 0xff, 0xe6, 0xe5,\n"," 0xf4, 0x05, 0x42, 0x1a, 0x0b, 0xdc, 0xfd, 0xed, 0xf3, 0xd0, 0x43, 0xf3,\n"," 0x10, 0x09, 0x0a, 0xed, 0xff, 0xe2, 0x1b, 0x1d, 0x08, 0xe4, 0xfe, 0xf7,\n"," 0xff, 0xf9, 0x2e, 0xfa, 0xf8, 0xe7, 0xe7, 0xeb, 0xfd, 0xfe, 0x30, 0x06,\n"," 0x00, 0x1d, 0x12, 0xf4, 0x0d, 0xf4, 0x1c, 0xed, 0x01, 0xd2, 0x17, 0xb3,\n"," 0x0c, 0x0c, 0xf4, 0x1e, 0x26, 0xd8, 0xf7, 0xbd, 0x24, 0xe7, 0x11, 0x12,\n"," 0xf9, 0xb9, 0xf6, 0xde, 0x3c, 0xf7, 0xfe, 0x0c, 0x16, 0xc5, 0x14, 0xcd,\n"," 0x24, 0x06, 0xfa, 0x21, 0x03, 0xcb, 0xf7, 0xf0, 0xfc, 0xff, 0xfe, 0xf8,\n"," 0x0a, 0xed, 0xdf, 0xe4, 0x0f, 0x19, 0x10, 0x0f, 0xf9, 0xf9, 0x11, 0xf3,\n"," 0xf1, 0xf1, 0x33, 0xdc, 0x02, 0xd6, 0xde, 0xe0, 0xf9, 0xec, 0xfe, 0x09,\n"," 0xfc, 0xd4, 0xeb, 0x0b, 0xec, 0xe3, 0x10, 0x0e, 0x0d, 0x13, 0x00, 0xe6,\n"," 0xf2, 0xf2, 
0x12, 0xec, 0x05, 0xf7, 0xff, 0x03, 0x02, 0x0f, 0x0c, 0x00,\n"," 0xf3, 0xfc, 0x02, 0xd9, 0xf0, 0x02, 0xef, 0xfa, 0x06, 0xda, 0x0a, 0xe4,\n"," 0xf6, 0x10, 0x14, 0x03, 0x12, 0xe6, 0x25, 0x09, 0x06, 0xf1, 0x26, 0x04,\n"," 0xfa, 0xe1, 0xdd, 0xfa, 0xef, 0x06, 0x11, 0xfd, 0xf9, 0xf8, 0xfd, 0xe8,\n"," 0xf8, 0x0b, 0x24, 0x22, 0xf9, 0xd1, 0x1a, 0xfe, 0xf0, 0xed, 0x3c, 0xfd,\n"," 0xf6, 0xfc, 0xe4, 0xf6, 0xf1, 0x05, 0x25, 0xf9, 0xee, 0x1b, 0x0d, 0xe2,\n"," 0xf8, 0xff, 0x2b, 0x16, 0xf6, 0xf4, 0x27, 0xe0, 0x02, 0x05, 0x0a, 0x11,\n"," 0x1d, 0xd1, 0xfb, 0xcb, 0x17, 0xf3, 0x23, 0xf9, 0x17, 0xb7, 0xec, 0x9f,\n"," 0x1d, 0xf2, 0x0f, 0x27, 0x10, 0xc5, 0xfa, 0xdf, 0x21, 0xe7, 0x0e, 0x01,\n"," 0x06, 0xb1, 0x02, 0xe2, 0x0e, 0xf8, 0x07, 0x04, 0x1a, 0xc1, 0x04, 0xed,\n"," 0xfe, 0xf6, 0x0c, 0x1c, 0x1d, 0xe1, 0xe5, 0xed, 0x03, 0xd7, 0xfb, 0x28,\n"," 0x00, 0xdf, 0xe9, 0xcd, 0xef, 0x04, 0x20, 0xe9, 0x10, 0xde, 0x00, 0xee,\n"," 0xf3, 0xd0, 0x02, 0x09, 0x0b, 0x0e, 0xee, 0xf8, 0xea, 0xf3, 0x31, 0x0d,\n"," 0xf7, 0x1e, 0x0f, 0xe9, 0xe9, 0xff, 0x16, 0xda, 0x12, 0xf3, 0xec, 0x1c,\n"," 0xfd, 0x04, 0x0a, 0x09, 0x01, 0xed, 0xf9, 0x0d, 0xf9, 0x12, 0xfc, 0x08,\n"," 0xfa, 0xd6, 0x12, 0x0b, 0x02, 0xff, 0xfe, 0x06, 0x0f, 0xe3, 0xf0, 0xdb,\n"," 0xf0, 0xf5, 0x0f, 0x17, 0x0d, 0xe4, 0x2a, 0xf4, 0x13, 0xe9, 0x3a, 0x0c,\n"," 0x04, 0x11, 0xee, 0xf0, 0xf1, 0xf5, 0x31, 0x04, 0xf2, 0x04, 0x14, 0x02,\n"," 0xfd, 0xe7, 0x2a, 0xf6, 0xff, 0x17, 0xed, 0xea, 0xe1, 0xf9, 0x27, 0x20,\n"," 0x0b, 0xe6, 0x1f, 0xfe, 0x00, 0xf9, 0x10, 0x05, 0x04, 0x0e, 0xf0, 0xf7,\n"," 0x18, 0x17, 0x13, 0xf0, 0x21, 0xcd, 0xf9, 0xcd, 0x13, 0xfb, 0x05, 0xe6,\n"," 0x1b, 0xba, 0xf5, 0xb2, 0x2b, 0xd4, 0x19, 0x18, 0xf4, 0xc8, 0xee, 0xce,\n"," 0x31, 0xf4, 0xec, 0x2d, 0xfa, 0xc0, 0xeb, 0xe9, 0x0e, 0xe0, 0x2f, 0xfe,\n"," 0x17, 0xd1, 0x09, 0xfc, 0xf6, 0xdc, 0xf1, 0x00, 0x11, 0xd2, 0xf4, 0xe4,\n"," 0xfc, 0x0f, 0x02, 0x27, 0x0e, 0xdd, 0x19, 0x08, 0x03, 0xf8, 0x1f, 0xeb,\n"," 0xfa, 0x0d, 0xf1, 0x11, 0x0c, 0xe4, 0x31, 0x07, 0x02, 0xe7, 0xec, 0xf0,\n"," 0xe7, 0x02, 0x1b, 0xf0, 0xf8, 0x22, 0xfa, 0xe2, 0xfd, 0xf2, 0x13, 0x17,\n"," 0x0d, 0xf3, 0xfc, 0x01, 0xe4, 0xe2, 0x01, 0x09, 0xf4, 0xf1, 0x0c, 0x0d,\n"," 0x00, 0xf9, 0xfa, 0x07, 0x0c, 0xf4, 0xf5, 0xe9, 0xfa, 0x2f, 0x3d, 0x11,\n"," 0xef, 0x0b, 0x12, 0x04, 0xed, 0xfb, 0x17, 0x0e, 0x0d, 0xfb, 0xfb, 0xe1,\n"," 0x0e, 0xf0, 0x22, 0x13, 0x07, 0xed, 0xee, 0xda, 0xf2, 0xe8, 0x48, 0x07,\n"," 0xfc, 0xd2, 0xe3, 0xf0, 0xfa, 0xf9, 0x10, 0x0c, 0xe7, 0xeb, 0x01, 0xd3,\n"," 0xfb, 0xff, 0x3b, 0xf9, 0xf8, 0xef, 0xe9, 0xea, 0xe3, 0x01, 0x03, 0x04,\n"," 0xfb, 0xf9, 0x1a, 0x1e, 0x18, 0xf4, 0x05, 0x22, 0x21, 0xc9, 0x0c, 0xbf,\n"," 0x27, 0xfb, 0x06, 0x1d, 0x17, 0xce, 0x0e, 0xb7, 0x3c, 0xfa, 0xea, 0x0f,\n"," 0x12, 0xa6, 0xff, 0xd6, 0x25, 0xd4, 0x1e, 0xe4, 0x12, 0xaf, 0xdd, 0xd6,\n"," 0x2c, 0xfc, 0x08, 0xf5, 0x0e, 0xbb, 0x0a, 0xe2, 0x06, 0xfc, 0x27, 0x2e,\n"," 0x0f, 0xc7, 0xf8, 0x00, 0x00, 0x04, 0x1c, 0x0b, 0x0e, 0x04, 0x17, 0x11,\n"," 0x06, 0x0c, 0x17, 0x13, 0xfb, 0xf3, 0xe0, 0xe7, 0x06, 0xdf, 0x0b, 0x11,\n"," 0x01, 0xfb, 0xef, 0x05, 0xf3, 0xc7, 0x01, 0xfc, 0xfc, 0x0b, 0x04, 0x00,\n"," 0x04, 0x13, 0x25, 0x2a, 0x05, 0xfb, 0x24, 0xf1, 0xe6, 0xfd, 0x19, 0x09,\n"," 0x01, 0xe0, 0xf2, 0xf5, 0x03, 0xfd, 0xfe, 0x06, 0x08, 0xe9, 0xde, 0x1a,\n"," 0xfd, 0x17, 0x1b, 0x11, 0x0c, 0xf7, 0x0c, 0xf6, 0xfb, 0xf6, 0x29, 0x1b,\n"," 0x1e, 0x00, 0xea, 0xe2, 0xfe, 0xeb, 0x1d, 0x22, 0xff, 0x15, 0xec, 0xcd,\n"," 0xef, 0xc4, 0x18, 0x15, 0xed, 0xed, 0x08, 0xeb, 0xf8, 0xe4, 0x35, 0x08,\n"," 0x0b, 0xe4, 0x13, 0xf4, 0xf6, 0xff, 0x12, 0xfc, 0xfc, 0x05, 0x0b, 0xf6,\n"," 0xeb, 0x07, 0x0d, 0x0f, 
0xf8, 0x21, 0xf0, 0xe1, 0x1e, 0xf2, 0xf1, 0xfe,\n"," 0x2b, 0xe6, 0x2a, 0xd2, 0x15, 0xf1, 0x02, 0xfc, 0x22, 0xce, 0xe2, 0xbc,\n"," 0x35, 0xf9, 0x1e, 0x1c, 0x17, 0xaf, 0xf7, 0xfa, 0x2a, 0xea, 0x13, 0xfe,\n"," 0x08, 0xbe, 0x1b, 0xcf, 0x19, 0x16, 0x00, 0x1b, 0x1c, 0xbe, 0xe9, 0xee,\n"," 0x05, 0xe6, 0xec, 0x03, 0x26, 0xd2, 0xec, 0x0c, 0xf7, 0xeb, 0xf8, 0xf8,\n"," 0x1f, 0xde, 0xf3, 0xdd, 0x0f, 0x01, 0x26, 0xf9, 0x00, 0xf0, 0xe9, 0xe0,\n"," 0x0f, 0xc3, 0x0b, 0xe9, 0x01, 0xee, 0x03, 0xd8, 0xf4, 0xee, 0x29, 0x14,\n"," 0xf2, 0xfe, 0xf1, 0x09, 0xfc, 0x09, 0x0e, 0xfe, 0x06, 0x04, 0xfb, 0x07,\n"," 0xf0, 0xfe, 0x24, 0xfa, 0xf7, 0xf9, 0x0b, 0xfa, 0xf1, 0xf3, 0x1c, 0xf9,\n"," 0x05, 0xdb, 0x09, 0xf9, 0x10, 0xf5, 0x17, 0x2d, 0x09, 0xf9, 0xf3, 0x06,\n"," 0xfd, 0xe4, 0x07, 0xf6, 0xff, 0xfb, 0xfe, 0xf7, 0xfb, 0xf2, 0x22, 0xfe,\n"," 0xfb, 0xfb, 0x12, 0xe4, 0xf0, 0xec, 0x2a, 0x1c, 0xf8, 0xfa, 0x01, 0xd9,\n"," 0xef, 0x00, 0x1d, 0x06, 0xf8, 0xff, 0x05, 0x0b, 0xf4, 0x00, 0x38, 0x16,\n"," 0xf3, 0xf5, 0x1e, 0x07, 0xde, 0x0b, 0x32, 0x25, 0xfe, 0x03, 0x0d, 0x0a,\n"," 0x1f, 0x05, 0x28, 0x01, 0x19, 0xd3, 0xff, 0xc2, 0x0a, 0x01, 0xf6, 0x1e,\n"," 0x24, 0xda, 0xf9, 0xb2, 0x4f, 0xef, 0xf9, 0x13, 0xf5, 0xd2, 0xd7, 0xe6,\n"," 0x37, 0xf4, 0x02, 0x09, 0x05, 0xa3, 0xf7, 0xd9, 0x14, 0xf2, 0x0b, 0x05,\n"," 0x36, 0xbd, 0x0c, 0x17, 0xfc, 0xfa, 0x22, 0x27, 0x1f, 0xc2, 0xf6, 0xf3,\n"," 0xff, 0xe6, 0x25, 0x17, 0x08, 0xd0, 0x04, 0x1a, 0xfb, 0xff, 0x08, 0x24,\n"," 0xf1, 0xf3, 0x15, 0xf4, 0xf6, 0xf2, 0x12, 0xe5, 0x01, 0xd8, 0xec, 0x17,\n"," 0x00, 0xd9, 0x08, 0x11, 0x04, 0x11, 0x02, 0xe9, 0xea, 0xe9, 0x20, 0xf4,\n"," 0x12, 0xe7, 0xe3, 0x00, 0xfe, 0x10, 0x1d, 0xeb, 0xfe, 0xe6, 0xd6, 0x05,\n"," 0xfa, 0xf3, 0x14, 0x19, 0x03, 0xdc, 0x0e, 0xe3, 0xf7, 0xfd, 0x31, 0xf3,\n"," 0x05, 0x11, 0xf5, 0xe3, 0x01, 0x05, 0x2c, 0x03, 0x15, 0xdf, 0x21, 0x0e,\n"," 0xe7, 0xfb, 0x09, 0x0c, 0xfb, 0xf9, 0x1b, 0xdc, 0xe3, 0xf3, 0x14, 0xdb,\n"," 0x02, 0xe8, 0x0a, 0xfd, 0xf7, 0xf9, 0x05, 0xdb, 0xfb, 0xe7, 0xf2, 0xfe,\n"," 0xf5, 0xe5, 0x10, 0xdd, 0x00, 0xf0, 0xe0, 0xf5, 0xf0, 0x04, 0x19, 0x24,\n"," 0xff, 0xe4, 0xf0, 0xf0, 0x23, 0x19, 0x17, 0xf6, 0x11, 0xdd, 0xdf, 0xde,\n"," 0x2a, 0xee, 0x0a, 0xfb, 0x2b, 0xc5, 0x05, 0xb4, 0x51, 0xf3, 0x09, 0x10,\n"," 0x0a, 0xb3, 0xfd, 0xe6, 0x48, 0xdf, 0x14, 0x0b, 0x1b, 0xcc, 0xd9, 0xfa,\n"," 0x15, 0xe5, 0xff, 0x24, 0x30, 0xbf, 0x05, 0x02, 0x09, 0x14, 0x25, 0x18,\n"," 0x2d, 0xc2, 0xfe, 0xf5, 0x0a, 0x17, 0xfd, 0x03, 0x15, 0xd3, 0x21, 0x11,\n"," 0x10, 0xe5, 0x02, 0xe3, 0xf7, 0x06, 0x15, 0xfa, 0xf5, 0xd3, 0x17, 0x02,\n"," 0xf9, 0x05, 0x16, 0xe0, 0x16, 0xd4, 0x0c, 0xe9, 0xf4, 0xfd, 0x28, 0x15,\n"," 0x04, 0xe2, 0x03, 0xfd, 0xf6, 0xf5, 0xfb, 0xf8, 0xf4, 0xf1, 0x10, 0xe6,\n"," 0x02, 0xfe, 0x03, 0xca, 0xe8, 0x05, 0x14, 0x02, 0xf9, 0xdc, 0xef, 0xf7,\n"," 0x09, 0x0f, 0x1e, 0x11, 0xfb, 0xfb, 0x13, 0x23, 0xf8, 0x06, 0x14, 0x12,\n"," 0x1b, 0x13, 0x2a, 0xf4, 0x04, 0xe5, 0x24, 0x1c, 0x03, 0xf8, 0x01, 0xd3,\n"," 0xe4, 0xd0, 0x3d, 0xe7, 0x0c, 0xde, 0xf1, 0xe3, 0xf1, 0xe8, 0x12, 0xf1,\n"," 0x10, 0xdb, 0xe5, 0xd3, 0xe5, 0xf7, 0x0f, 0xeb, 0xf9, 0xee, 0x18, 0xe5,\n"," 0xe9, 0x13, 0x18, 0x26, 0x14, 0x00, 0xfc, 0xf7, 0x2b, 0x0f, 0x05, 0xf5,\n"," 0x39, 0xd3, 0xf1, 0xd8, 0x29, 0xf4, 0x0f, 0x15, 0x14, 0xbc, 0x00, 0xc9,\n"," 0x3f, 0xe1, 0x05, 0x11, 0x23, 0xb4, 0xe3, 0xf6, 0x51, 0xde, 0x26, 0xf6,\n"," 0x27, 0xb3, 0xf7, 0xdd, 0x2d, 0xf1, 0x10, 0x09, 0x3d, 0xcd, 0xea, 0xf1,\n"," 0x0c, 0x0e, 0xfe, 0x21, 0x24, 0xd6, 0xf9, 0x08, 0xff, 0xee, 0x12, 0x08,\n"," 0xfd, 0xe8, 0x19, 0xeb, 0x0b, 0xeb, 0x0f, 0x23, 0x0e, 0xd1, 0xfe, 0xf1,\n"," 0xf3, 0xd7, 0xf7, 0x1f, 0xff, 0xe5, 
0xfe, 0x12, 0x05, 0xee, 0x13, 0x20,\n"," 0x22, 0xdd, 0x03, 0x19, 0x08, 0xee, 0xfd, 0x01, 0x12, 0x1a, 0xfc, 0x0c,\n"," 0xf5, 0xf4, 0xfd, 0xef, 0x05, 0xe8, 0x17, 0x08, 0xf2, 0xea, 0x08, 0x13,\n"," 0x03, 0xff, 0xf0, 0xe9, 0xfe, 0xff, 0x22, 0xfb, 0xff, 0xee, 0x0c, 0xfb,\n"," 0xff, 0x06, 0x27, 0x01, 0x08, 0xe3, 0x0c, 0xf1, 0x06, 0xe4, 0x19, 0x0d,\n"," 0x0e, 0xe1, 0xdc, 0xe8, 0xdb, 0xed, 0x2a, 0x0a, 0x06, 0xfd, 0x0e, 0xfb,\n"," 0xfb, 0x06, 0x25, 0x27, 0xfc, 0xf2, 0xf5, 0xf6, 0xef, 0xf7, 0x35, 0xf2,\n"," 0xe9, 0xea, 0x05, 0xf1, 0xdf, 0x06, 0x16, 0xf2, 0xfe, 0xde, 0xf0, 0x05,\n"," 0x2c, 0x25, 0x0a, 0x15, 0x0e, 0xc2, 0x03, 0xad, 0x3a, 0xee, 0x09, 0x27,\n"," 0x31, 0xb8, 0x20, 0xb5, 0x53, 0xd7, 0x09, 0xea, 0x0b, 0xc9, 0x04, 0xf9,\n"," 0x61, 0xda, 0xde, 0x19, 0x2d, 0xc3, 0xe7, 0xd4, 0x1b, 0xe7, 0xf9, 0x0f,\n"," 0x43, 0xc2, 0xff, 0xe6, 0x0c, 0xef, 0x13, 0xf3, 0x1b, 0xe0, 0x0b, 0x08,\n"," 0x05, 0x03, 0x09, 0x03, 0x23, 0xf4, 0xe8, 0xf5, 0x15, 0xfe, 0xee, 0xe8,\n"," 0x06, 0xe1, 0xe8, 0xf0, 0x20, 0xb3, 0xf4, 0x02, 0x06, 0xe4, 0xfa, 0x14,\n"," 0x02, 0xef, 0x13, 0x16, 0x08, 0x0f, 0x0e, 0x22, 0x0b, 0xed, 0xf3, 0x1b,\n"," 0x1d, 0x01, 0x22, 0xec, 0x01, 0xe0, 0xf5, 0x18, 0x0c, 0xd5, 0xff, 0x0e,\n"," 0x09, 0x06, 0x0b, 0xf1, 0x12, 0xe2, 0xe4, 0xd5, 0x07, 0xfb, 0xfc, 0xfe,\n"," 0xf7, 0xf7, 0x04, 0x02, 0xfe, 0xee, 0x05, 0x06, 0x04, 0xd9, 0x00, 0x06,\n"," 0xfb, 0x01, 0x28, 0x06, 0x09, 0xfe, 0x1c, 0xd7, 0xf9, 0xdc, 0x1a, 0xf3,\n"," 0xf6, 0xc9, 0xfd, 0xfe, 0x06, 0xdc, 0x09, 0xf6, 0xfe, 0xe7, 0x18, 0xf9,\n"," 0xf7, 0xe4, 0x24, 0xf5, 0xe9, 0x0a, 0x08, 0xf0, 0xf1, 0x08, 0x2c, 0xfd,\n"," 0xf9, 0xe4, 0xf9, 0x03, 0x38, 0x05, 0x0d, 0xf6, 0x1e, 0xda, 0xfc, 0xb9,\n"," 0x58, 0x01, 0xff, 0xf5, 0x33, 0xb4, 0xf7, 0xb7, 0x72, 0x12, 0x14, 0xf7,\n"," 0xff, 0xd5, 0x06, 0xda, 0x61, 0xd0, 0x06, 0x05, 0x1e, 0xca, 0x0a, 0xfa,\n"," 0x30, 0xcf, 0xfa, 0xf2, 0x31, 0xd2, 0x0d, 0xcd, 0x2f, 0xd8, 0x13, 0x13,\n"," 0x2c, 0xcc, 0x08, 0xd6, 0x23, 0xd9, 0x12, 0x11, 0x18, 0xfa, 0x0c, 0xe3,\n"," 0x18, 0xef, 0xef, 0x00, 0x26, 0xf0, 0xf3, 0xe7, 0x1e, 0xc9, 0x0e, 0x26,\n"," 0x04, 0xeb, 0xf0, 0x0a, 0x26, 0xc9, 0xf6, 0xfb, 0x0c, 0xf1, 0x11, 0x00,\n"," 0x18, 0xec, 0x10, 0x07, 0x0e, 0x06, 0xde, 0xed, 0x0b, 0xd8, 0x13, 0xfe,\n"," 0x05, 0xfc, 0x00, 0xd0, 0x13, 0x07, 0x1f, 0xf2, 0x11, 0x13, 0x0a, 0x1d,\n"," 0x10, 0xf8, 0xfd, 0x06, 0x02, 0x06, 0xf5, 0xdf, 0x10, 0xfa, 0x11, 0xe0,\n"," 0xf7, 0xf5, 0xf9, 0xe8, 0x0d, 0xda, 0x02, 0xf3, 0xf2, 0xef, 0x0c, 0xe9,\n"," 0xfc, 0xc3, 0x18, 0x12, 0xea, 0xfb, 0x08, 0x0f, 0xf7, 0xdf, 0x23, 0x08,\n"," 0x03, 0xeb, 0xe9, 0x1e, 0xf2, 0xe2, 0x13, 0xea, 0x01, 0xf2, 0xec, 0xe8,\n"," 0xed, 0x0d, 0x15, 0xfc, 0x0f, 0xfd, 0x03, 0xfd, 0x61, 0xee, 0x12, 0xe4,\n"," 0x01, 0xd0, 0x0d, 0xc4, 0x4a, 0x10, 0x07, 0x1d, 0x2e, 0xab, 0xe3, 0xa9,\n"," 0x7f, 0xf8, 0x1f, 0xe3, 0x00, 0xe5, 0xe6, 0xcd, 0x6c, 0xc4, 0x2a, 0xfb,\n"," 0x18, 0xd8, 0xf7, 0xb7, 0x49, 0xf7, 0x19, 0xe2, 0x2e, 0xe3, 0xf5, 0xfd,\n"," 0x33, 0xfa, 0x0b, 0xfd, 0x0a, 0xdc, 0xf0, 0x0c, 0x34, 0xd0, 0x02, 0xf4,\n"," 0x22, 0xe4, 0xf8, 0xe3, 0x2f, 0xe4, 0x11, 0xe5, 0x0e, 0x0c, 0x1e, 0xe6,\n"," 0x21, 0xe8, 0x10, 0xfa, 0x07, 0xfa, 0xef, 0x03, 0x01, 0xde, 0x02, 0x08,\n"," 0x0d, 0xdc, 0x17, 0x00, 0x01, 0xe1, 0x1c, 0x0e, 0xfc, 0x02, 0x04, 0xe8,\n"," 0x07, 0xee, 0x06, 0xff, 0x09, 0xcd, 0x1a, 0xd1, 0x18, 0x2c, 0xff, 0xf4,\n"," 0xf4, 0xee, 0x19, 0xec, 0x1b, 0xf4, 0x09, 0x0e, 0x02, 0xee, 0x15, 0xe3,\n"," 0x0f, 0xe4, 0x02, 0x08, 0xfb, 0x15, 0x09, 0xf1, 0x01, 0xcd, 0x22, 0x19,\n"," 0xee, 0x04, 0x1f, 0xd7, 0x0c, 0xd5, 0x10, 0xea, 0x0c, 0x06, 0x14, 0xd1,\n"," 0xef, 0xef, 0x22, 0x22, 0xf1, 0xf1, 0xfc, 0x0d, 
0xf7, 0x00, 0x0e, 0x07,\n"," 0xf4, 0x0d, 0x12, 0x01, 0xde, 0x1d, 0x04, 0xe5, 0x03, 0x15, 0xe8, 0xda,\n"," 0x62, 0x0f, 0x1a, 0xeb, 0x13, 0xd1, 0x09, 0xe7, 0x79, 0x25, 0xfb, 0xff,\n"," 0x43, 0xa8, 0xef, 0xa4, 0x61, 0xfe, 0x15, 0x16, 0x28, 0xbc, 0x07, 0xd6,\n"," 0x59, 0xd3, 0x00, 0xf0, 0x18, 0xcb, 0x05, 0xca, 0x2f, 0x08, 0xf4, 0x2d,\n"," 0x1f, 0xe5, 0x07, 0xfb, 0x1c, 0x0e, 0x26, 0xf3, 0x3c, 0xd1, 0xe7, 0xf7,\n"," 0x0f, 0xf2, 0xfc, 0x24, 0x3a, 0xf4, 0xfa, 0xfc, 0x09, 0xe1, 0x0e, 0x00,\n"," 0x06, 0xe2, 0x04, 0xe8, 0x15, 0xdd, 0xf6, 0x06, 0x21, 0xe5, 0xfb, 0xe7,\n"," 0xfe, 0xed, 0xfb, 0x14, 0x1c, 0xdd, 0xf8, 0xf6, 0x26, 0x02, 0x02, 0xf1,\n"," 0xf7, 0xd3, 0x13, 0xeb, 0x18, 0x03, 0x12, 0xf4, 0xe5, 0xf0, 0xef, 0xe9,\n"," 0x2c, 0x0d, 0xe3, 0x19, 0x12, 0xc8, 0xdd, 0xee, 0x08, 0x0b, 0xee, 0x19,\n"," 0xf9, 0xf3, 0xf4, 0xf9, 0x0a, 0xfd, 0xf2, 0x0e, 0x15, 0xf8, 0xd6, 0x03,\n"," 0x1f, 0xe9, 0xfd, 0x04, 0x15, 0x1f, 0x21, 0xe1, 0x0c, 0xf8, 0xec, 0xf4,\n"," 0xee, 0x0c, 0xef, 0xfd, 0x0a, 0xf4, 0x06, 0x14, 0x10, 0xe1, 0xdd, 0x0b,\n"," 0x0b, 0x05, 0x0e, 0x0f, 0x01, 0xf7, 0xfd, 0xe0, 0xe2, 0x26, 0x28, 0x26,\n"," 0x10, 0x00, 0xe8, 0xfd, 0xfa, 0xec, 0xf7, 0x14, 0x08, 0xff, 0xf7, 0x0c,\n"," 0x06, 0x09, 0xf3, 0x0b, 0xf3, 0xfe, 0xec, 0xfd, 0x1a, 0xf8, 0xf1, 0xdb,\n"," 0xfe, 0x0f, 0xff, 0x0b, 0x17, 0x1f, 0xfb, 0xe7, 0x0c, 0x13, 0x10, 0xf6,\n"," 0x04, 0x11, 0xf3, 0xfd, 0xec, 0xd0, 0xf3, 0xfa, 0x01, 0xfe, 0x03, 0x07,\n"," 0x0d, 0xde, 0xf8, 0x05, 0xee, 0xf0, 0xff, 0x08, 0xff, 0xf0, 0x1d, 0x05,\n"," 0x14, 0xea, 0xfe, 0x04, 0xf1, 0x0e, 0x19, 0xfb, 0x1a, 0xff, 0xef, 0xf2,\n"," 0x02, 0xf5, 0xe7, 0x0e, 0xe4, 0x1d, 0xfa, 0x14, 0xf0, 0xde, 0xf0, 0xe4,\n"," 0xf6, 0x04, 0x07, 0xe6, 0xf1, 0x1b, 0xff, 0xfb, 0x16, 0x02, 0x01, 0x10,\n"," 0x08, 0x14, 0x08, 0x03, 0xf7, 0x01, 0x02, 0xf6, 0xf9, 0xe7, 0xe9, 0xf5,\n"," 0x05, 0x14, 0xfc, 0xe1, 0xfb, 0x20, 0x03, 0x18, 0xfa, 0xe9, 0xf0, 0x1d,\n"," 0xf9, 0xf0, 0xfb, 0xed, 0x0a, 0xd9, 0xf4, 0xeb, 0xed, 0x05, 0xf7, 0x0b,\n"," 0x0f, 0xf0, 0x0a, 0x07, 0xee, 0xdd, 0x17, 0x08, 0xfb, 0x1c, 0xf4, 0x23,\n"," 0xfd, 0x0f, 0x07, 0xdf, 0x03, 0x1f, 0xed, 0xf1, 0xfd, 0xfb, 0xdc, 0x0a,\n"," 0x18, 0xf9, 0x00, 0xea, 0xf7, 0xe8, 0xf6, 0x07, 0xee, 0xf8, 0xec, 0xf7,\n"," 0x04, 0x0e, 0x0f, 0x00, 0x18, 0xfc, 0x09, 0x1a, 0xfb, 0x00, 0xe5, 0xff,\n"," 0x0f, 0x08, 0xeb, 0xfc, 0x0f, 0xe6, 0x14, 0x03, 0xf6, 0xfc, 0x0f, 0xfc,\n"," 0x0b, 0xf2, 0x1c, 0x06, 0xf9, 0x09, 0xf9, 0xdf, 0x14, 0xfb, 0xd6, 0xeb,\n"," 0xfb, 0xeb, 0x0d, 0x0b, 0x15, 0xe6, 0xf6, 0x04, 0x17, 0xfc, 0x10, 0xf4,\n"," 0x05, 0xf7, 0xf7, 0xf2, 0xf9, 0xf0, 0xfc, 0x10, 0x08, 0x0d, 0xe1, 0x0c,\n"," 0x06, 0x12, 0xf1, 0xfd, 0x10, 0x2a, 0xfb, 0xec, 0x0c, 0x05, 0x0b, 0x18,\n"," 0x2b, 0x0c, 0x08, 0xeb, 0x22, 0xfb, 0xfe, 0x07, 0x08, 0x17, 0x0d, 0xed,\n"," 0xe8, 0xf2, 0x0d, 0xdf, 0x14, 0xf5, 0xed, 0xe3, 0x00, 0x06, 0xfb, 0x15,\n"," 0x01, 0x03, 0xf9, 0xfe, 0x08, 0x14, 0x01, 0xf3, 0xe4, 0xfb, 0xfe, 0xde,\n"," 0x0f, 0xe8, 0xff, 0xf1, 0x03, 0xe5, 0x18, 0xff, 0xfd, 0x02, 0x10, 0xec,\n"," 0xfb, 0xf5, 0x12, 0x06, 0x0c, 0xde, 0x0f, 0x0e, 0x03, 0xf1, 0xf9, 0x02,\n"," 0xfa, 0x01, 0x07, 0xf3, 0x02, 0x0f, 0x03, 0x13, 0xf4, 0xee, 0x0a, 0x04,\n"," 0x0f, 0x1c, 0x1a, 0x03, 0x08, 0x06, 0xf6, 0x16, 0xff, 0xec, 0x14, 0xfe,\n"," 0x09, 0xf5, 0x06, 0x1d, 0xf3, 0xf0, 0x22, 0xf7, 0x28, 0xe3, 0x09, 0x28,\n"," 0xf2, 0x1a, 0x1c, 0x0e, 0x1a, 0xd5, 0xf6, 0xdd, 0x03, 0xce, 0xff, 0x03,\n"," 0xf5, 0xf2, 0x14, 0x02, 0x11, 0xd2, 0x08, 0xfa, 0xf2, 0xf7, 0xf6, 0xef,\n"," 0xf8, 0xea, 0xf3, 0xf7, 0xe7, 0x0e, 0x03, 0xf5, 0x07, 0x04, 0x21, 0xf5,\n"," 0xec, 0xf6, 0xf1, 0x0f, 0x09, 0x0a, 0x06, 0x03, 0x14, 0xee, 
0x03, 0x26,\n"," 0x01, 0x0a, 0x09, 0xf8, 0x0a, 0x17, 0xf6, 0x19, 0x1c, 0xfc, 0x0f, 0xf1,\n"," 0xf8, 0x06, 0xf7, 0xd9, 0x0b, 0x0e, 0x04, 0xda, 0x03, 0xe8, 0x15, 0x0a,\n"," 0x35, 0xfe, 0x03, 0xe5, 0x07, 0xfc, 0x11, 0xfa, 0xfc, 0xf4, 0xe9, 0x06,\n"," 0xfd, 0xe4, 0x15, 0x07, 0x10, 0xef, 0xf6, 0xfc, 0x13, 0x14, 0x08, 0x09,\n"," 0x12, 0xe6, 0xfb, 0xe1, 0x17, 0x04, 0xf8, 0xfc, 0xfc, 0xf1, 0xf3, 0xee,\n"," 0x27, 0x0d, 0xf7, 0xfd, 0x0a, 0xf7, 0x14, 0x00, 0x0d, 0xff, 0xf3, 0x0a,\n"," 0xf9, 0x01, 0x04, 0xfd, 0xf2, 0xf4, 0x13, 0x16, 0xfb, 0x09, 0xe4, 0xef,\n"," 0xf8, 0xf1, 0x10, 0xff, 0x14, 0xfa, 0xda, 0xf6, 0xff, 0xff, 0xfb, 0x10,\n"," 0x0b, 0x08, 0x0d, 0xf8, 0x04, 0x10, 0xf8, 0xf2, 0x10, 0x00, 0x16, 0x0b,\n"," 0x00, 0x00, 0x14, 0x0b, 0xee, 0xf7, 0x0e, 0x0b, 0xf8, 0xed, 0xf6, 0x0f,\n"," 0xff, 0xc1, 0xfc, 0x04, 0xf6, 0x0a, 0xfa, 0x01, 0xe3, 0xdc, 0x05, 0x07,\n"," 0x00, 0x27, 0x01, 0x06, 0xe1, 0xeb, 0x25, 0x05, 0xf1, 0x22, 0x17, 0x1a,\n"," 0x0a, 0xff, 0x15, 0x18, 0xf3, 0x0f, 0x01, 0x19, 0xfd, 0x0e, 0xec, 0x08,\n"," 0xfa, 0xfd, 0x0f, 0xeb, 0x09, 0x0e, 0xe2, 0x23, 0x07, 0xfa, 0xef, 0xfe,\n"," 0xe9, 0xfc, 0x27, 0x0d, 0x08, 0xf9, 0x0d, 0xf8, 0x1f, 0x15, 0x15, 0xd7,\n"," 0x1d, 0x1a, 0x0e, 0x12, 0x10, 0x23, 0x0d, 0xef, 0xf4, 0x04, 0xff, 0xec,\n"," 0x05, 0xfc, 0x05, 0x07, 0xf0, 0x0c, 0xfb, 0xf9, 0x07, 0xf4, 0x01, 0x0b,\n"," 0xf5, 0x02, 0x14, 0xfa, 0xe3, 0xee, 0xe5, 0x08, 0xea, 0x11, 0x08, 0x0f,\n"," 0xfc, 0xfc, 0xf4, 0xfb, 0xf6, 0x37, 0x0f, 0xea, 0xfe, 0xfe, 0xf6, 0xf5,\n"," 0x11, 0x27, 0xed, 0xe9, 0xfb, 0x09, 0xfb, 0x05, 0xeb, 0xf8, 0x00, 0xf0,\n"," 0xf1, 0x0c, 0x2b, 0x07, 0xe3, 0x0d, 0x27, 0xdc, 0x06, 0x22, 0xf3, 0x02,\n"," 0xf9, 0x0a, 0x07, 0x24, 0xfe, 0x0a, 0x17, 0x1a, 0x07, 0xf7, 0xee, 0xf3,\n"," 0x14, 0x0c, 0x04, 0x08, 0xf2, 0xec, 0xf7, 0x1d, 0xf1, 0xef, 0xf8, 0xef,\n"," 0x19, 0xe8, 0x1d, 0x1a, 0xe1, 0xd8, 0x0c, 0xee, 0xe7, 0x17, 0x16, 0xe4,\n"," 0xf4, 0xe8, 0x26, 0x08, 0x05, 0x24, 0x06, 0x0b, 0xf7, 0xe8, 0x27, 0x17,\n"," 0xe5, 0xe7, 0xeb, 0xe8, 0x0d, 0xe2, 0xf7, 0x11, 0xfd, 0xdb, 0xf9, 0x17,\n"," 0xfc, 0x15, 0x0f, 0x17, 0xe6, 0xeb, 0xf4, 0xf9, 0x03, 0x19, 0xe0, 0x1e,\n"," 0x09, 0xed, 0xfe, 0xf7, 0x2a, 0x26, 0x12, 0x1a, 0xed, 0xe9, 0x0b, 0xf5,\n"," 0x15, 0x20, 0x1c, 0x07, 0x07, 0xf7, 0x0a, 0x0d, 0x0f, 0x1e, 0x1a, 0xe6,\n"," 0x0f, 0x24, 0x03, 0x1b, 0x20, 0xfc, 0x13, 0x04, 0x0c, 0x03, 0xfe, 0xea,\n"," 0x00, 0x07, 0xec, 0x0f, 0xde, 0x16, 0x19, 0x07, 0xe7, 0xe5, 0x15, 0xfd,\n"," 0xd4, 0x1a, 0xfb, 0x01, 0x07, 0xdb, 0x04, 0xfe, 0xda, 0x20, 0xf9, 0x0f,\n"," 0xce, 0xf6, 0x19, 0x14, 0xe6, 0x2f, 0xed, 0x0b, 0x02, 0xfb, 0xd8, 0xf8,\n"," 0xec, 0x1f, 0x03, 0xfe, 0x14, 0x1e, 0xfd, 0x00, 0xff, 0x13, 0xf4, 0xfb,\n"," 0x01, 0x08, 0xd7, 0x03, 0x03, 0xe0, 0x03, 0xef, 0xfe, 0x0a, 0xe3, 0x05,\n"," 0x03, 0x0b, 0x1e, 0xf0, 0xf1, 0x16, 0x18, 0x01, 0xfb, 0xe5, 0xf5, 0xdc,\n"," 0x03, 0xed, 0x02, 0xff, 0x0b, 0x1a, 0xf7, 0x24, 0xf9, 0xda, 0x1a, 0xe7,\n"," 0x05, 0x1d, 0xf8, 0xf1, 0xf6, 0xf2, 0xd6, 0xf0, 0xfb, 0x16, 0xf1, 0x10,\n"," 0x17, 0xf5, 0x08, 0x09, 0xf7, 0xfa, 0xed, 0x02, 0x09, 0xfc, 0xf1, 0xf2,\n"," 0xfd, 0xea, 0xfc, 0x01, 0x07, 0x06, 0x09, 0x06, 0x08, 0xfb, 0xea, 0x0c,\n"," 0x03, 0x1e, 0x0b, 0x2b, 0xe3, 0xf1, 0x0b, 0xe4, 0x1b, 0x27, 0xea, 0x1c,\n"," 0x0b, 0xfb, 0x01, 0x04, 0x1c, 0x26, 0xf2, 0xf2, 0xf6, 0xf2, 0xfb, 0xfb,\n"," 0x05, 0x2c, 0xef, 0xe9, 0xfb, 0x05, 0x10, 0x0b, 0x08, 0x05, 0x1c, 0xf1,\n"," 0xd2, 0x07, 0x0b, 0xe0, 0xf9, 0x03, 0xe7, 0xf3, 0xfa, 0x12, 0xee, 0xf3,\n"," 0xe0, 0xf8, 0x0e, 0xf0, 0xf1, 0x30, 0x17, 0x01, 0x00, 0xe0, 0x1a, 0xfe,\n"," 0xde, 0x2c, 0x03, 0x05, 0x00, 0xe5, 0xf7, 0x02, 0xfb, 0x34, 0xdd, 
0x08,\n"," 0x09, 0x06, 0x1f, 0x0a, 0x00, 0x14, 0xec, 0xdd, 0xf7, 0xf0, 0xdb, 0xe9,\n"," 0xf8, 0x14, 0xff, 0xee, 0xf5, 0xf9, 0x12, 0x01, 0x0c, 0xf7, 0xfd, 0x23,\n"," 0xff, 0x0d, 0x19, 0x12, 0xfa, 0xf6, 0xf9, 0xfe, 0xe6, 0x00, 0x21, 0x0b,\n"," 0xf8, 0xfd, 0x15, 0xfb, 0xee, 0xf2, 0xfe, 0x0a, 0x12, 0x1d, 0x09, 0xee,\n"," 0xf4, 0xc4, 0xff, 0xe7, 0xfd, 0x2a, 0x22, 0x00, 0xe9, 0xff, 0xea, 0xf1,\n"," 0xfb, 0x15, 0xe0, 0x19, 0xde, 0xe6, 0xf1, 0x00, 0xee, 0xfd, 0xf5, 0x0a,\n"," 0x00, 0xfd, 0x0a, 0x0d, 0xf4, 0xf9, 0xf2, 0xe6, 0x02, 0x15, 0x1c, 0x00,\n"," 0xee, 0xfb, 0xfe, 0xed, 0xf0, 0x3e, 0xff, 0x2f, 0xf6, 0xf7, 0xf7, 0xda,\n"," 0x11, 0x22, 0x15, 0x26, 0xfc, 0xfe, 0xfb, 0xfc, 0xf6, 0x2f, 0x02, 0x14,\n"," 0x18, 0xe9, 0x14, 0x19, 0x14, 0x22, 0x02, 0xfd, 0xff, 0x1a, 0x13, 0xf9,\n"," 0xfd, 0x08, 0x06, 0xeb, 0xeb, 0x1e, 0xf0, 0xf6, 0xf4, 0x01, 0xf9, 0x0f,\n"," 0xe5, 0x03, 0xf4, 0xea, 0x02, 0xe0, 0x04, 0x09, 0xe2, 0x2d, 0xf7, 0x16,\n"," 0x04, 0xde, 0xd8, 0xf2, 0xe2, 0x46, 0xe3, 0x08, 0xe8, 0x0d, 0xf6, 0xfc,\n"," 0xfb, 0x2b, 0xf6, 0x0d, 0xe4, 0x01, 0xfa, 0x03, 0xeb, 0x28, 0x03, 0x24,\n"," 0x1d, 0xf3, 0xff, 0xe9, 0xe7, 0x19, 0x1a, 0xe3, 0x04, 0xf7, 0xed, 0xfd,\n"," 0x02, 0x04, 0x14, 0x09, 0x09, 0x1c, 0x0b, 0x08, 0x09, 0xe8, 0x0b, 0xef,\n"," 0x04, 0x02, 0xfe, 0x19, 0xfc, 0xf4, 0x08, 0xf8, 0xef, 0xd4, 0x04, 0x13,\n"," 0xf6, 0x1c, 0x16, 0x0b, 0xe1, 0xc3, 0xe0, 0xc7, 0x0f, 0x40, 0x12, 0xff,\n"," 0xdf, 0x02, 0xf5, 0xf2, 0xfd, 0x0a, 0xfa, 0x12, 0xef, 0xe6, 0xfb, 0x0c,\n"," 0xfa, 0x0d, 0xfa, 0x18, 0xed, 0xfe, 0x21, 0xf9, 0xed, 0xf3, 0x00, 0x1f,\n"," 0xfc, 0x08, 0x1d, 0x20, 0xdd, 0x14, 0xf8, 0x0e, 0x15, 0x40, 0xeb, 0x30,\n"," 0xdb, 0x09, 0xfc, 0xf1, 0xee, 0x1d, 0x0d, 0x3a, 0x02, 0x0c, 0x0d, 0xf3,\n"," 0x2b, 0x2c, 0x0e, 0x0a, 0x04, 0xf6, 0xfe, 0xe6, 0x17, 0x21, 0xee, 0x0a,\n"," 0x11, 0x05, 0xf4, 0x19, 0x05, 0x2b, 0xe7, 0xfa, 0xfa, 0x25, 0x08, 0xd8,\n"," 0xdd, 0xf6, 0xf6, 0x22, 0xf0, 0xfa, 0x06, 0xdf, 0xe5, 0xe1, 0x09, 0xf2,\n"," 0xfc, 0x2d, 0x07, 0xfa, 0xf2, 0xe8, 0xf7, 0xee, 0xf7, 0x46, 0x03, 0xfb,\n"," 0xe9, 0xf7, 0x07, 0x01, 0x1b, 0x23, 0xf3, 0x09, 0xff, 0x07, 0xfa, 0xeb,\n"," 0xfb, 0x38, 0x05, 0xf1, 0xed, 0xf9, 0x13, 0xfd, 0xf9, 0x16, 0x04, 0x12,\n"," 0x00, 0x06, 0xf1, 0xf2, 0x0c, 0xfe, 0xf4, 0xd7, 0x08, 0x15, 0xe2, 0x11,\n"," 0x14, 0x0c, 0x02, 0xeb, 0x06, 0x21, 0x00, 0x0c, 0x14, 0x0a, 0x24, 0xfe,\n"," 0xda, 0xdb, 0x0f, 0x0a, 0xf5, 0x3a, 0x11, 0xe3, 0xed, 0xcc, 0xfb, 0xbb,\n"," 0x12, 0x27, 0x0a, 0x02, 0xe8, 0x00, 0xfe, 0xf2, 0xfe, 0x1c, 0x05, 0xfb,\n"," 0xf9, 0x0c, 0xf8, 0x1c, 0xe9, 0xfa, 0xe5, 0x10, 0xdc, 0xea, 0xdb, 0xfd,\n"," 0xe4, 0x0a, 0xe9, 0xf5, 0xe9, 0x01, 0x2a, 0x19, 0xf9, 0x10, 0xfc, 0xff,\n"," 0x06, 0x27, 0x0a, 0x4c, 0xe9, 0x03, 0xf4, 0x10, 0x25, 0x48, 0xef, 0x3f,\n"," 0xfe, 0x00, 0xf9, 0x0a, 0x21, 0x2d, 0x08, 0x18, 0x0a, 0xed, 0x06, 0xe4,\n"," 0x2d, 0x13, 0x09, 0x0c, 0x0c, 0x0f, 0x11, 0x06, 0x18, 0x18, 0xf0, 0xff,\n"," 0xf2, 0x1e, 0xf8, 0x13, 0xe6, 0xf3, 0xea, 0x1e, 0xf5, 0x18, 0xfb, 0x1c,\n"," 0xe2, 0xdb, 0x13, 0xf8, 0x03, 0x35, 0xfc, 0xf8, 0xed, 0xf1, 0x05, 0xf6,\n"," 0x0b, 0x3c, 0xfe, 0x06, 0xe1, 0x0f, 0x03, 0x07, 0x11, 0x29, 0x16, 0x0e,\n"," 0xec, 0x01, 0xf3, 0xf3, 0x11, 0x29, 0x07, 0x04, 0x15, 0x11, 0x10, 0xf0,\n"," 0x04, 0x11, 0xf2, 0x22, 0x08, 0x0b, 0xff, 0xe8, 0x08, 0xf5, 0x00, 0xe1,\n"," 0x01, 0x09, 0x04, 0xfd, 0x03, 0xea, 0x06, 0xf6, 0x01, 0x08, 0xed, 0x0d,\n"," 0xfe, 0x0f, 0x07, 0x00, 0xe3, 0xd8, 0x02, 0x1e, 0xf3, 0x3d, 0x35, 0x0f,\n"," 0xcb, 0xe2, 0x13, 0xd6, 0x0c, 0x4e, 0x16, 0xe3, 0xe0, 0xf2, 0xf4, 0xf4,\n"," 0xf5, 0x28, 0xf8, 0xf8, 0xe8, 0x05, 0xe8, 0x12, 0xf9, 0x04, 0xee, 0x0e,\n"," 
[... large embedded binary payload: a hexadecimal byte array (thousands of `0x..` entries with notebook-JSON line separators), elided here for readability; no human-readable content in this span ...]
0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0xc9, 0x01, 0x00, 0x00, 0x59, 0xfe, 0xff, 0xff,\n"," 0x8f, 0xfe, 0xff, 0xff, 0x50, 0x01, 0x00, 0x00, 0x60, 0xfb, 0xff, 0xff,\n"," 0x0f, 0x00, 0x00, 0x00, 0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e,\n"," 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x58, 0xfa, 0xff, 0xff, 0xbc, 0x01, 0x00, 0x00,\n"," 0xb0, 0x01, 0x00, 0x00, 0xa4, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x78, 0x01, 0x00, 0x00, 0x18, 0x01, 0x00, 0x00,\n"," 0xb4, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0xaa, 0xfe, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,\n"," 0xce, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x03, 0x00, 0x00, 0x00,\n"," 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x1a, 0xff, 0xff, 0xff, 0x00, 0x00, 0x80, 0x3f, 0x01, 0x00, 0x00, 0x00,\n"," 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00,\n"," 0x07, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08,\n"," 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x28, 0xfc, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x16, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x10, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x38, 0x00, 0x00, 0x00,\n"," 0x2c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x1a, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, 0x14, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x02, 0x00, 0x00, 0x00,\n"," 0x38, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,\n"," 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00,\n"," 0x10, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0a, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0xac, 0x04, 0x00, 0x00, 0x44, 0x04, 0x00, 0x00, 0xc4, 0x03, 0x00, 0x00,\n"," 0x4c, 0x03, 0x00, 0x00, 0xd0, 0x02, 0x00, 0x00, 0x90, 0x02, 0x00, 0x00,\n"," 0x20, 0x02, 0x00, 0x00, 0xb4, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00,\n"," 0x6c, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 
0x00,\n"," 0xd4, 0xff, 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,\n"," 0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,\n"," 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f, 0x31, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa8, 0x07, 0x00, 0x00,\n"," 0xf2, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x4c, 0x00, 0x00, 0x00,\n"," 0x07, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xe4, 0xfb, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,\n"," 0x13, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,\n"," 0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x5f, 0x69, 0x6e, 0x74, 0x38, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x08, 0x00, 0x07, 0x00, 0x0c, 0x00,\n"," 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,\n"," 0xb4, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00,\n"," 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00,\n"," 0x12, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0xd6, 0x72, 0xec, 0x39, 0x57, 0x66, 0x72, 0x3a,\n"," 0x1e, 0xe6, 0x14, 0x3a, 0x27, 0x15, 0x3a, 0x39, 0x33, 0xb7, 0x25, 0x3a,\n"," 0xf6, 0x03, 0x80, 0x3a, 0xd2, 0x73, 0x28, 0x39, 0x79, 0xbb, 0x5c, 0x3a,\n"," 0x12, 0x00, 0x00, 0x00, 0x66, 0x69, 0x72, 0x73, 0x74, 0x5f, 0x77, 0x65,\n"," 0x69, 0x67, 0x68, 0x74, 0x73, 0x2f, 0x72, 0x65, 0x61, 0x64, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x32, 0xfd, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x09, 0x54, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x24, 0xfd, 0xff, 0xff,\n"," 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x4a, 0xb2, 0xf3, 0x39, 0x1f, 0x00, 0x00, 0x00, 0x66, 0x69, 0x6e, 0x61,\n"," 0x6c, 0x5f, 0x66, 0x63, 0x5f, 0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73,\n"," 0x2f, 0x72, 0x65, 0x61, 0x64, 0x2f, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70,\n"," 0x6f, 0x73, 0x65, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xa0, 0x0f, 0x00, 0x00, 0x9a, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x58, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 
0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xbb, 0xb0, 0xba, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0xd8, 0x1c, 0x35, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x3b, 0xcf, 0x3e, 0xc1, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,\n"," 0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x06, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,\n"," 0x2c, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x0f, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f,\n"," 0x32, 0x2f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x42, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x5c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x14, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x50, 0x50, 0xd0, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,\n"," 0x61, 0x70, 0x65, 0x5f, 0x32, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0xba, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x60, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x8c, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x50, 0x50, 0xd0, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,\n"," 0x61, 0x70, 0x65, 0x5f, 0x31, 0x5f, 0x69, 0x6e, 0x74, 0x38, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa8, 0x07, 0x00, 0x00,\n"," 0x2e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x60, 0x00, 0x00, 0x00,\n"," 0x09, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x0c, 0x00, 0x14, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,\n"," 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,\n"," 0xbd, 0xad, 0x93, 0x3d, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x1a, 0x93, 0x41,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0xaa, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,\n"," 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x9c, 0xff, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0xc4, 0x94, 
0x0c, 0x38, 0x0b, 0x00, 0x00, 0x00, 0x4d, 0x61, 0x74, 0x4d,\n"," 0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00,\n"," 0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x02, 0xa4, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,\n"," 0x8c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x50, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x7c, 0x67, 0x40, 0x38, 0x32, 0x3f, 0xc5, 0x38, 0x5e, 0x53, 0x72, 0x38,\n"," 0x90, 0x6b, 0x97, 0x37, 0xd6, 0xd8, 0x86, 0x38, 0xc2, 0x56, 0xd0, 0x38,\n"," 0xf3, 0x12, 0x89, 0x37, 0x92, 0x9d, 0xb3, 0x38, 0x0b, 0x00, 0x00, 0x00,\n"," 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x70, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xca, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x06, 0x02, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x72, 0xe6, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x19,\n"," 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x06, 0x00, 0x05, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x00, 0x16, 0x0a, 0x00, 0x0e, 0x00, 0x07, 0x00,\n"," 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x0c, 0x00, 0x07, 0x00,\n"," 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,\n"," 0x03, 0x00, 0x00, 0x00\n","};\n","unsigned int g_model_len = 18952;\n"],"name":"stdout"}]}]} \ No newline at end of file From b8d991c9b4d4ba8c9156d4958c83504791ff9929 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 8 May 2020 15:45:44 -0700 Subject: [PATCH 0228/1533] [tf.lite] Fix issue with direct ByteBuffer inputs and dynamic graphs In these graphs, the input tensor pointers may get "refreshed" during invocation. This refresh is fine if the original pointer came from the arena, but if it comes from something like the direct ByteBuffer raw address, the input data will be lost. Avoid this by simply using memcpy from the direct ByteBuffer. This is still quite fast, but avoids the hack where we simply inject the direct ByteBuffer address as the tensor buffer pointer. A longer term solution will formally allow providing "custom" allocated regions to tensor inputs, but until then, do the safe thing. 
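To make the failure mode concrete, here is a minimal, self-contained C++ sketch. It is not TensorFlow Lite source, and every name in it (ToyTensor, ToyRuntime, user_buffer, Invoke) is hypothetical. It models a runtime that re-points its input tensor at the arena when invoked: a caller pointer injected as the tensor buffer is silently dropped by that refresh, while memcpy'ing the caller's bytes into the already-allocated tensor buffer survives it, which is the approach this patch takes in tensor_jni.cc.

// toy_tensor_refresh.cc -- illustrative sketch only, NOT TensorFlow Lite code.
// Build: g++ -std=c++11 toy_tensor_refresh.cc && ./a.out
#include <cstdio>
#include <cstring>
#include <vector>

// Hypothetical stand-ins for a runtime tensor and its arena.
struct ToyTensor {
  float* data = nullptr;  // normally points into the runtime's arena
  size_t bytes = 0;
};

struct ToyRuntime {
  std::vector<float> arena = std::vector<float>(4, 0.0f);
  ToyTensor input;

  ToyRuntime() {
    input.data = arena.data();
    input.bytes = arena.size() * sizeof(float);
  }

  // Dynamic-shape graphs may re-plan allocations when invoked, which
  // "refreshes" tensor pointers back into the arena before running.
  float Invoke() {
    input.data = arena.data();  // any injected external pointer is dropped here
    float sum = 0.0f;           // pretend the graph just sums its input
    for (size_t i = 0; i < input.bytes / sizeof(float); ++i) sum += input.data[i];
    return sum;
  }
};

int main() {
  float user_buffer[4] = {2.0f, 2.0f, 2.0f, 2.0f};  // stands in for a direct ByteBuffer

  ToyRuntime a;
  a.input.data = user_buffer;  // unsafe: inject the caller's pointer as the tensor buffer
  std::printf("pointer injection: sum = %.1f (user data lost)\n", a.Invoke());

  ToyRuntime b;
  std::memcpy(b.input.data, user_buffer, b.input.bytes);  // safe: copy the bytes
  std::printf("memcpy:            sum = %.1f (user data survives)\n", b.Invoke());
  return 0;
}

Run under these assumptions, the first call reports a sum of 0.0 because the injected pointer is overwritten before the graph reads anything, while the second reports 8.0 because the copied bytes live in the buffer the runtime actually reads.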
PiperOrigin-RevId: 310643333 Change-Id: I05dfebd24617ebb1af7eb281ff9e530b01669093 --- tensorflow/lite/java/BUILD | 1 + .../main/java/org/tensorflow/lite/Tensor.java | 2 +- .../lite/java/src/main/native/tensor_jni.cc | 16 ++++++- .../org/tensorflow/lite/InterpreterTest.java | 45 +++++++++++++++++- tensorflow/lite/testdata/dynamic_shapes.bin | Bin 0 -> 5264 bytes 5 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 tensorflow/lite/testdata/dynamic_shapes.bin diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 49c2136ffb4..2fcb4b631be 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -240,6 +240,7 @@ java_test( data = [ "src/testdata/add.bin", "src/testdata/add_unknown_dimensions.bin", + "//tensorflow/lite:testdata/dynamic_shapes.bin", "//tensorflow/lite:testdata/multi_add.bin", "//tensorflow/lite:testdata/multi_add_flex.bin", ], diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java index 89a2a6a0639..cc9a6a451ac 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java @@ -196,7 +196,7 @@ public final class Tensor { } private void setTo(Buffer src) { - // Note that we attempt to use zero-copy optimization for direct, native-ordered buffers. + // Note that we attempt to use a direct memcpy optimization for direct, native-ordered buffers. // There are no base Buffer#order() or Buffer#put() methods, so again we have to ugly cast. if (src instanceof ByteBuffer) { ByteBuffer srcBuffer = (ByteBuffer) src; diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.cc b/tensorflow/lite/java/src/main/native/tensor_jni.cc index 99be71ba37d..dfa4e22162a 100644 --- a/tensorflow/lite/java/src/main/native/tensor_jni.cc +++ b/tensorflow/lite/java/src/main/native/tensor_jni.cc @@ -402,14 +402,26 @@ JNIEXPORT void JNICALL Java_org_tensorflow_lite_Tensor_writeDirectBuffer( TfLiteTensor* tensor = GetTensorFromHandle(env, handle); if (tensor == nullptr) return; - char* src_data_raw = static_cast(env->GetDirectBufferAddress(src)); + void* src_data_raw = env->GetDirectBufferAddress(src); if (!src_data_raw) { ThrowException(env, kIllegalArgumentException, "Input ByteBuffer is not a direct buffer"); return; } - tensor->data.raw = src_data_raw; + if (!tensor->data.data) { + ThrowException(env, kIllegalArgumentException, + "Internal error: Tensor hasn't been allocated."); + return; + } + + // Historically, we would simply overwrite the tensor buffer pointer with + // the direct Buffer address. However, that is generally unsafe, and + // specifically wrong if the graph happens to have dynamic shapes where + // arena-allocated input buffers will be refreshed during invocation. + // TODO(b/156094015): Explore whether this is actually faster than + // using ByteBuffer.put(ByteBuffer). 
+ memcpy(tensor->data.data, src_data_raw, tensor->bytes); } JNIEXPORT void JNICALL diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java index cd782c7f5aa..6b6799eaad9 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java @@ -40,6 +40,8 @@ public final class InterpreterTest { "tensorflow/lite/testdata/multi_add_flex.bin"; private static final String UNKNOWN_DIMS_MODEL_PATH = "tensorflow/lite/java/src/testdata/add_unknown_dimensions.bin"; + private static final String DYNAMIC_SHAPES_MODEL_PATH = + "tensorflow/lite/testdata/dynamic_shapes.bin"; private static final ByteBuffer MODEL_BUFFER = TestUtils.getTestFileAsBuffer(MODEL_PATH); private static final ByteBuffer MULTIPLE_INPUTS_MODEL_BUFFER = @@ -48,6 +50,8 @@ public final class InterpreterTest { TestUtils.getTestFileAsBuffer(FLEX_MODEL_PATH); private static final ByteBuffer UNKNOWN_DIMS_MODEL_PATH_BUFFER = TestUtils.getTestFileAsBuffer(UNKNOWN_DIMS_MODEL_PATH); + private static final ByteBuffer DYNAMIC_SHAPES_MODEL_BUFFER = + TestUtils.getTestFileAsBuffer(DYNAMIC_SHAPES_MODEL_PATH); @Test public void testInterpreter() throws Exception { @@ -434,7 +438,7 @@ public final class InterpreterTest { interpreter.close(); } - /** Smoke test validating that flex model loading fails when the flex delegate is not linked. */ + // Smoke test validating that flex model loading fails when the flex delegate is not linked. @Test public void testFlexModel() throws Exception { try { @@ -573,6 +577,45 @@ public final class InterpreterTest { } } + private static FloatBuffer fill(FloatBuffer buffer, float value) { + while (buffer.hasRemaining()) { + buffer.put(value); + } + buffer.rewind(); + return buffer; + } + + // Regression test case to ensure that graphs with dynamically computed shapes work properly. + // Historically, direct ByteBuffer addresses would overwrite the arena-allocated tensor input + // pointers. Normally this works fine, but for dynamic graphs, the original input tensor pointers + // may be "restored" at invocation time by the arena allocator, resetting the direct ByteBuffer + // address and leading to stale input data being used. + @Test + public void testDynamicShapesWithDirectBufferInputs() { + try (Interpreter interpreter = new Interpreter(DYNAMIC_SHAPES_MODEL_BUFFER)) { + ByteBuffer input0 = + ByteBuffer.allocateDirect(8 * 42 * 1024 * 4).order(ByteOrder.nativeOrder()); + ByteBuffer input1 = + ByteBuffer.allocateDirect(1 * 90 * 1024 * 4).order(ByteOrder.nativeOrder()); + ByteBuffer input2 = ByteBuffer.allocateDirect(1 * 4).order(ByteOrder.nativeOrder()); + Object[] inputs = {input0, input1, input2}; + + fill(input0.asFloatBuffer(), 2.0f); + fill(input1.asFloatBuffer(), 0.5f); + // Note that the value of this input dictates the shape of the output. 
+ fill(input2.asFloatBuffer(), 1.0f); + + FloatBuffer output = FloatBuffer.allocate(8 * 1 * 1024); + Map outputs = new HashMap<>(); + outputs.put(0, output); + + interpreter.runForMultipleInputsOutputs(inputs, outputs); + + FloatBuffer expected = fill(FloatBuffer.allocate(8 * 1 * 1024), 2.0f); + assertThat(output.array()).usingTolerance(0.1f).containsExactly(expected.array()).inOrder(); + } + } + private static native long getNativeHandleForDelegate(); private static native long getNativeHandleForInvalidDelegate(); diff --git a/tensorflow/lite/testdata/dynamic_shapes.bin b/tensorflow/lite/testdata/dynamic_shapes.bin new file mode 100644 index 0000000000000000000000000000000000000000..268d457131a9b0e664378214bd588a908228edc9 GIT binary patch literal 5264 zcmaKwUyM~(6~?!Sg;K;3=|3FXYaK1tDFeg5Nx?X#qf#9j!Vp6UalGwZ!;N?D&^vc< zXbfo@V~A;pF+BLtG(0qf7}5|jF{B}8Vt62AYz&zg9*FUw52nG2wIbHB$KP+CyDw+B zw1<3i_St*w^?hrtz4tkn?hwM>Zx3$@VRh&ZOT+T8GIWJC^gcvS2%iM=;1byTiNJUW z3*bqx8?#}s6AXdfpQ968Wa2c~0Iq#DgcD#8T<@X{c7Pk73E?Ez4VHqLRm{c50`qTy z_rVQt9sCu%4gLbIfvaF1{1ME7OW-1y1sA}1FayqlGvG9M9ZW+(2;nUFG`Pi|onU?? ze(npQcLihj5)(K;p!4^Hun)|B3M@s>l6*eqF;8a8e288EMHsKvM;lZ1W_7$WdbrY< ztWMO!TYt+krjyqYHm={eY5hhTt&7mZuHk3)l%AfbW8bWdt*2)d*}q*2Q}Ze~27U@i z6QeoC%hftf?1=$k`@lT041!tW83yNxrx%<>=m3~mMn9M%ww>S-@eP90I6Xtb2f!?` z4ud%oHVo$QZ|2G-yV zH~?0H3+S(b3+Uh3ieX19h5=$Yf&PbDG4v3_Ecyq)dGv1pGsMs>5P1S|-CzcxgWxne zH-r8E$}(;h+{L^>7xVeEce5(^H@wf562X5!O7Q8zqk9_Mu30#D4kh{5-ZwP2BpIqsBkmp8a@?iN;W%SU*WOKCq ze6xDE+;q)ptM7y#{vGa{G46YfSHDR^{f7N7eM4I{cz+!9=XrYA=Y6-bte3l5U2C@0 zr0%4La)gnIA2r9z)3HBsUxWLtocn%loS$-ji?PI~ude{g7%sK=)bmk%+Ko@uMmKVI zZsHpzo;TrBUy9K&eQWn+@F?j0PnPZAZme?HzB8r==xgV=Q7NW)5$7zN>#3V-pvWng zoIQYL98SXNT2hNO7yxn>*OGN->4#>cT%V+{6(UUI#^Lyee`MKXWh`_?N;B&3sdFBqK>KYzD8wi zisd&^AFbBMs?S#@ixZ~C3s1Ys)75ci9O>*U^z4~@q0zjZ7d5GU-FsQKmK@Uj-U|Mx zKjnMtKeOx)b8}x$F?)Xud_Y~eJ?C@jzxm!WH_jabcZ>ag`VUrWwTaQ6Of+g^eZ!UV zkB{uB)TZc9wMd#Spy`cVQ^%Qdm9eD5b>Q_ya6GTwQO9P`w1K;IQj_cBZkjme!me_2 z*Ho=j!yvJ%;m-9Zm%N_`=IfqZ=i~DBF63*`9LZZ;i)l}L_nd66McT&i{Dcc~v!>|U z?pU!Vm*3B_WM5*;wr9W>fbm>moZAv(R;&8Wh2!mgxA<)9izgoA^6PI0KGu^%njQS# zoM-G^*d8T5&-k>zlTUNG7C8S;x&LoDwm^O8khh6*d5!nuc#U&U{=V+u z-5JlQRq@--tS^JTmGe@{wVKvpC#O1m_M6jhNNv=+hL-(#*7_>`yF`kqc zKE1+7z8*$g4lV}AarHLjX80?^S?N6#PR<8yjvot+S2+r-c~Kzn_7E)y#n{~ z)_&{dxdhCi-{sEFucfg*&Jl8}!S{xBa}SDhE&g@1)tdN?fARj?>}O28_QpB>2I$y- z51Jd}SPA4Cz*c^F9aE<q4)NV#zq;kMEthwBe~{N|i_fQ8eC`Ex$!FVnPnTs`ytam-{i>M5JJ2c(H-O5Zv%p} z%sa)|BUj>kEkBFmT33hudO)YYZL!w!iv@hWSMX&%-)Z@BzUxR|CE#9Pyl%EdANm<$ z&Qq1tp0^AKxc3?Ci$?RQ}}b5^sTQ`|bUM#gQwkH+eHPyWn7 z`^|&f?=#L6i2W9`^GXHTe!GYVy6? z$A?X`dZ<>JysarP=Vr^Fnw--*!=fLb#oaZ!Qd{CkF*|P>Bn{TbDQp*N=D!gB4>H*7 AG5`Po literal 0 HcmV?d00001 From fe3db61ad23fdb13faa0407da2768bfb3fd2b300 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 15:51:46 -0700 Subject: [PATCH 0229/1533] Merge portable full/lite proto libraries. 
PiperOrigin-RevId: 310644217 Change-Id: I1589760a126a265f28b0be379b9b36ded3ddb3c9 --- tensorflow/compiler/tf2xla/BUILD | 2 +- tensorflow/core/BUILD | 14 +++++--------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index a5332385994..c2ad1255a35 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -81,7 +81,7 @@ tf_portable_proto_library( name = "portable_tf2xla_proto", config_string = "allow_all:true", header_outs = ["//tensorflow/compiler/tf2xla/tf2xla.proto.h"], - portable_deps = ["//tensorflow/core:portable_proto_lib_full_runtime"], + portable_deps = ["//tensorflow/core:portable_proto_lib"], proto_deps = [ ":tf2xla_proto", "//tensorflow/core:protos_all", diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index fb57be9d6c7..28ef943f021 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -83,7 +83,6 @@ load( "tf_gen_op_libs", "tf_genrule_cmd_append_to_srcs", "tf_opts_nortti_if_lite_protos", - "tf_opts_nortti_if_mobile", "tf_portable_full_lite_protos", "transitive_hdrs", ) @@ -100,9 +99,6 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu") -# buildifier: disable=same-origin-load -# Placeholder: load("//tensorflow:tensorflow.bzl", "tf_portable_proto_lib") - # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_monitoring_deps") @@ -3087,6 +3083,11 @@ alias( actual = "//tensorflow/core/platform:cuda_libdevice_path", ) +# Normalize CORE_PROTO_SRCS to generate valid output file names. +PORTABLE_PROTO_HEADERS_OUT = tf_android_core_proto_headers(CORE_PROTO_SRCS) + [ + "//google/protobuf/any.proto.h", +] + transitive_hdrs( name = "headers", visibility = ["//tensorflow:__subpackages__"], @@ -3099,8 +3100,3 @@ transitive_hdrs( "//tensorflow/core/platform:platform_strings", ], ) - -# Normalize CORE_PROTO_SRCS to generate valid output file names. -PORTABLE_PROTO_HEADERS_OUT = tf_android_core_proto_headers(CORE_PROTO_SRCS) + [ - "//google/protobuf/any.proto.h", -] From 110026283f097763f02e69e865dd3fc652a30c5f Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Fri, 8 May 2020 15:53:55 -0700 Subject: [PATCH 0230/1533] Fix RandomZoom layer for outputing 0 values. PiperOrigin-RevId: 310644590 Change-Id: I19569a6fde0b4c232d549072a1f5755a6c2ff268 --- .../preprocessing/image_preprocessing.py | 348 +++++++++--------- .../preprocessing/image_preprocessing_test.py | 341 ++++++++++++++--- 2 files changed, 474 insertions(+), 215 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py index 4f909b648b6..137fb2cb410 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py @@ -51,13 +51,27 @@ _RESIZE_METHODS = { 'mitchellcubic': ResizeMethod.MITCHELLCUBIC } +# Visually the data format should be NWHC instead of NHWC. +H_AXIS = 2 +W_AXIS = 1 + + +def check_fill_mode_and_interpolation(fill_mode, interpolation): + if fill_mode not in {'reflect', 'wrap', 'constant'}: + raise NotImplementedError( + 'Unknown `fill_mode` {}. Only `reflect`, `wrap` and ' + '`constant` are supported.'.format(fill_mode)) + if interpolation not in {'nearest', 'bilinear'}: + raise NotImplementedError('Unknown `interpolation` {}. 
Only `nearest` and ' + '`bilinear` are supported.'.format(interpolation)) + @keras_export('keras.layers.experimental.preprocessing.Resizing') class Resizing(Layer): """Image resizing layer. Resize the batched image input to target height and width. The input should - be a 4-D tensor in the format of NHWC. + be a 4-D tensor, channels_last format. Arguments: height: Integer, the height of the output shape. @@ -84,14 +98,14 @@ class Resizing(Layer): def call(self, inputs): outputs = image_ops.resize_images_v2( images=inputs, - size=[self.target_height, self.target_width], + size=[self.target_width, self.target_height], method=self._interpolation_method) return outputs def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], self.target_height, self.target_width, input_shape[3]]) + [input_shape[0], self.target_width, self.target_height, input_shape[3]]) def get_config(self): config = { @@ -109,11 +123,11 @@ class CenterCrop(Layer): Input shape: 4D tensor with shape: - `(samples, height, width, channels)`, data_format='channels_last'. + `(samples, width, height, channels)`, data_format='channels_last'. Output shape: 4D tensor with shape: - `(samples, target_height, target_width, channels)`. + `(samples, target_width, target_height, channels)`. If the input height/width is even and the target height/width is odd (or inversely), the input image is left-padded by 1 pixel. @@ -132,9 +146,8 @@ class CenterCrop(Layer): def call(self, inputs): inputs_shape = array_ops.shape(inputs) - h_axis, w_axis = 1, 2 - img_hd = inputs_shape[h_axis] - img_wd = inputs_shape[w_axis] + img_hd = inputs_shape[H_AXIS] + img_wd = inputs_shape[W_AXIS] img_hd_diff = img_hd - self.target_height img_wd_diff = img_wd - self.target_width checks = [] @@ -151,16 +164,16 @@ class CenterCrop(Layer): with ops.control_dependencies(checks): bbox_h_start = math_ops.cast(img_hd_diff / 2, dtypes.int32) bbox_w_start = math_ops.cast(img_wd_diff / 2, dtypes.int32) - bbox_begin = array_ops.stack([0, bbox_h_start, bbox_w_start, 0]) + bbox_begin = array_ops.stack([0, bbox_w_start, bbox_h_start, 0]) bbox_size = array_ops.stack( - [-1, self.target_height, self.target_width, -1]) + [-1, self.target_width, self.target_height, -1]) outputs = array_ops.slice(inputs, bbox_begin, bbox_size) return outputs def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], self.target_height, self.target_width, input_shape[3]]) + [input_shape[0], self.target_width, self.target_height, input_shape[3]]) def get_config(self): config = { @@ -184,11 +197,11 @@ class RandomCrop(Layer): Input shape: 4D tensor with shape: - `(samples, height, width, channels)`, data_format='channels_last'. + `(samples, width, height, channels)`, data_format='channels_last'. Output shape: 4D tensor with shape: - `(samples, target_height, target_width, channels)`. + `(samples, target_width, target_height, channels)`. Arguments: height: Integer, the height of the output shape. 
@@ -213,10 +226,10 @@ class RandomCrop(Layer): """Cropped inputs with stateless random ops.""" input_shape = array_ops.shape(inputs) crop_size = array_ops.stack( - [input_shape[0], self.height, self.width, input_shape[3]]) + [input_shape[0], self.width, self.height, input_shape[3]]) check = control_flow_ops.Assert( math_ops.reduce_all(input_shape >= crop_size), - [self.height, self.width]) + [self.width, self.height]) input_shape = control_flow_ops.with_dependencies([check], input_shape) limit = input_shape - crop_size + 1 offset = stateless_random_ops.stateless_random_uniform( @@ -230,9 +243,9 @@ class RandomCrop(Layer): def resize_and_center_cropped_inputs(): """Deterministically resize to shorter side and center crop.""" input_shape = array_ops.shape(inputs) - input_height_t = input_shape[1] - input_width_t = input_shape[2] - ratio_cond = (input_height_t / input_width_t > 1.) + input_height_t = input_shape[H_AXIS] + input_width_t = input_shape[W_AXIS] + ratio_cond = (input_height_t / input_width_t > (self.height / self.width)) # pylint: disable=g-long-lambda resized_height = tf_utils.smart_cond( ratio_cond, @@ -244,14 +257,14 @@ class RandomCrop(Layer): input_width_t.dtype)) # pylint: enable=g-long-lambda resized_inputs = image_ops.resize_images_v2( - images=inputs, size=array_ops.stack([resized_height, resized_width])) + images=inputs, size=array_ops.stack([resized_width, resized_height])) img_hd_diff = resized_height - self.height img_wd_diff = resized_width - self.width bbox_h_start = math_ops.cast(img_hd_diff / 2, dtypes.int32) bbox_w_start = math_ops.cast(img_wd_diff / 2, dtypes.int32) - bbox_begin = array_ops.stack([0, bbox_h_start, bbox_w_start, 0]) - bbox_size = array_ops.stack([-1, self.height, self.width, -1]) + bbox_begin = array_ops.stack([0, bbox_w_start, bbox_h_start, 0]) + bbox_size = array_ops.stack([-1, self.width, self.height, -1]) outputs = array_ops.slice(resized_inputs, bbox_begin, bbox_size) return outputs @@ -259,14 +272,14 @@ class RandomCrop(Layer): resize_and_center_cropped_inputs) original_shape = inputs.shape.as_list() batch_size, num_channels = original_shape[0], original_shape[3] - output_shape = [batch_size] + [self.height, self.width] + [num_channels] + output_shape = [batch_size] + [self.width, self.height] + [num_channels] output.set_shape(output_shape) return output def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], self.height, self.width, input_shape[3]]) + [input_shape[0], self.width, self.height, input_shape[3]]) def get_config(self): config = { @@ -332,11 +345,11 @@ class RandomFlip(Layer): Input shape: 4D tensor with shape: - `(samples, height, width, channels)`, data_format='channels_last'. + `(samples, width, height, channels)`, data_format='channels_last'. Output shape: 4D tensor with shape: - `(samples, height, width, channels)`, data_format='channels_last'. + `(samples, width, height, channels)`, data_format='channels_last'. Attributes: mode: String indicating which flip mode to use. Can be "horizontal", @@ -407,17 +420,24 @@ class RandomTranslation(Layer): """Randomly translate each image during training. Arguments: - height_factor: a positive float represented as fraction of value, or a tuple - of size 2 representing lower and upper bound for shifting vertically. When - represented as a single float, this value is used for both the upper and - lower bound. 
For instance, `height_factor=(0.2, 0.3)` results in an output - height varying in the range `[original - 20%, original + 30%]`. - `height_factor=0.2` results in an output height varying in the range - `[original - 20%, original + 20%]`. - width_factor: a positive float represented as fraction of value, or a tuple + height_factor: a float represented as fraction of value, or a tuple + of size 2 representing lower and upper bound for shifting vertically. + A negative value means shifting image up, while a positive value + means shifting image down. When represented as a single positive float, + this value is used for both the upper and lower bound. For instance, + `height_factor=(-0.2, 0.3)` results in an output shifted by a random + amount in the range [-20%, +30%]. + `height_factor=0.2` results in an output height shifted by a random + amount in the range [-20%, +20%]. + width_factor: a float represented as fraction of value, or a tuple of size 2 representing lower and upper bound for shifting horizontally. - When represented as a single float, this value is used for both the upper - and lower bound. + A negative value means shifting image left, while a positive value + means shifting image right. When represented as a single positive float, + this value is used for both the upper and lower bound. For instance, + `width_factor=(-0.2, 0.3)` results in an output shifted left by 20%, and + shifted right by 30%. + `width_factor=0.2` results in an output height shifted left or right + by 20%. fill_mode: Points outside the boundaries of the input are filled according to the given mode (one of `{'constant', 'reflect', 'wrap'}`). - *reflect*: `(d c b a | a b c d | d c b a)` @@ -432,16 +452,16 @@ class RandomTranslation(Layer): name: A string, the name of the layer. Input shape: - 4D tensor with shape: `(samples, height, width, channels)`, + 4D tensor with shape: `(samples, width, height, channels)`, data_format='channels_last'. Output shape: - 4D tensor with shape: `(samples, height, width, channels)`, + 4D tensor with shape: `(samples, width, height, channels)`, data_format='channels_last'. Raise: - ValueError: if lower bound is not between [0, 1], or upper bound is - negative. + ValueError: if either bound is not between [0, 1], or upper bound is + less than lower bound. """ def __init__(self, @@ -454,38 +474,34 @@ class RandomTranslation(Layer): **kwargs): self.height_factor = height_factor if isinstance(height_factor, (tuple, list)): - self.height_lower = abs(height_factor[0]) + self.height_lower = height_factor[0] self.height_upper = height_factor[1] else: - self.height_lower = self.height_upper = height_factor - if self.height_upper < 0.: - raise ValueError('`height_factor` cannot have negative values as upper ' - 'bound, got {}'.format(height_factor)) + self.height_lower = -height_factor + self.height_upper = height_factor + if self.height_upper < self.height_lower: + raise ValueError('`height_factor` cannot have upper bound less than ' + 'lower bound, got {}'.format(height_factor)) if abs(self.height_lower) > 1. 
or abs(self.height_upper) > 1.: raise ValueError('`height_factor` must have values between [-1, 1], ' 'got {}'.format(height_factor)) self.width_factor = width_factor if isinstance(width_factor, (tuple, list)): - self.width_lower = abs(width_factor[0]) + self.width_lower = width_factor[0] self.width_upper = width_factor[1] else: - self.width_lower = self.width_upper = width_factor - if self.width_upper < 0.: - raise ValueError('`width_factor` cannot have negative values as upper ' - 'bound, got {}'.format(width_factor)) + self.width_lower = -width_factor + self.width_upper = width_factor + if self.width_upper < self.width_lower: + raise ValueError('`width_factor` cannot have upper bound less than ' + 'lower bound, got {}'.format(width_factor)) if abs(self.width_lower) > 1. or abs(self.width_upper) > 1.: raise ValueError('`width_factor` must have values between [-1, 1], ' 'got {}'.format(width_factor)) - if fill_mode not in {'reflect', 'wrap', 'constant'}: - raise NotImplementedError( - 'Unknown `fill_mode` {}. Only `reflect`, `wrap` and ' - '`constant` are supported.'.format(fill_mode)) - if interpolation not in {'nearest', 'bilinear'}: - raise NotImplementedError( - 'Unknown `interpolation` {}. Only `nearest` and ' - '`bilinear` are supported.'.format(interpolation)) + check_fill_mode_and_interpolation(fill_mode, interpolation) + self.fill_mode = fill_mode self.interpolation = interpolation self.seed = seed @@ -501,22 +517,24 @@ class RandomTranslation(Layer): """Translated inputs with random ops.""" inputs_shape = array_ops.shape(inputs) batch_size = inputs_shape[0] - h_axis, w_axis = 1, 2 + h_axis, w_axis = H_AXIS, W_AXIS img_hd = math_ops.cast(inputs_shape[h_axis], dtypes.float32) img_wd = math_ops.cast(inputs_shape[w_axis], dtypes.float32) height_translate = self._rng.uniform( shape=[batch_size, 1], - minval=-self.height_lower, - maxval=self.height_upper) + minval=self.height_lower, + maxval=self.height_upper, + dtype=dtypes.float32) height_translate = height_translate * img_hd width_translate = self._rng.uniform( shape=[batch_size, 1], - minval=-self.width_lower, - maxval=self.width_upper) + minval=self.width_lower, + maxval=self.width_upper, + dtype=dtypes.float32) width_translate = width_translate * img_wd translations = math_ops.cast( - array_ops.concat([height_translate, width_translate], axis=1), - dtype=inputs.dtype) + array_ops.concat([width_translate, height_translate], axis=1), + dtype=dtypes.float32) return transform( inputs, get_translation_matrix(translations), @@ -585,8 +603,8 @@ def transform(images, """Applies the given transform(s) to the image(s). Args: - images: A tensor of shape (num_images, num_rows, num_columns, num_channels) - (NHWC), (num_rows, num_columns, num_channels) (HWC), or (num_rows, + images: A tensor of shape (num_images, num_rows, num_columns, num_channels), + (num_rows, num_columns, num_channels) (HWC), or (num_rows, num_columns) (HW). The rank must be statically known (the shape is not `TensorShape(None)`. transforms: Projective transform matrix/matrices. A vector of length 8 or @@ -713,9 +731,15 @@ class RandomRotation(Layer): `(samples, height, width, channels)`, data_format='channels_last'. Attributes: - factor: a positive float represented as fraction of 2pi, or a tuple of size + factor: a float represented as fraction of 2pi, or a tuple of size 2 representing lower and upper bound for rotating clockwise and - counter-clockwise. When represented as a single float, lower = upper. + counter-clockwise. 
A positive values means rotating counter clock-wise, + while a negative value means clock-wise. When represented as a single + float, this value is used for both the upper and lower bound. For + instance, `factor=(-0.2, 0.3)` results in an output + rotation by a random amount in the range `[-20% * 2pi, 30% * 2pi]`. + `factor=0.2` results in an output rotating by a random amount in the range + `[-20% * 2pi, 20% * 2pi]`. fill_mode: Points outside the boundaries of the input are filled according to the given mode (one of `{'constant', 'reflect', 'wrap'}`). - *reflect*: `(d c b a | a b c d | d c b a)` @@ -736,8 +760,8 @@ class RandomRotation(Layer): data_format='channels_last'. Raise: - ValueError: if lower bound is not between [0, 1], or upper bound is - negative. + ValueError: if either bound is not between [0, 1], or upper bound is + less than lower bound. """ def __init__(self, @@ -752,18 +776,12 @@ class RandomRotation(Layer): self.lower = factor[0] self.upper = factor[1] else: - self.lower = self.upper = factor - if self.lower < 0. or self.upper < 0.: + self.lower = -factor + self.upper = factor + if self.upper < self.lower: raise ValueError('Factor cannot have negative values, ' 'got {}'.format(factor)) - if fill_mode not in {'reflect', 'wrap', 'constant'}: - raise NotImplementedError( - 'Unknown `fill_mode` {}. Only `reflect`, `wrap` and ' - '`constant` are supported.'.format(fill_mode)) - if interpolation not in {'nearest', 'bilinear'}: - raise NotImplementedError( - 'Unknown `interpolation` {}. Only `nearest` and ' - '`bilinear` are supported.'.format(interpolation)) + check_fill_mode_and_interpolation(fill_mode, interpolation) self.fill_mode = fill_mode self.interpolation = interpolation self.seed = seed @@ -779,13 +797,12 @@ class RandomRotation(Layer): """Rotated inputs with random ops.""" inputs_shape = array_ops.shape(inputs) batch_size = inputs_shape[0] - h_axis, w_axis = 1, 2 - img_hd = math_ops.cast(inputs_shape[h_axis], dtypes.float32) - img_wd = math_ops.cast(inputs_shape[w_axis], dtypes.float32) + img_hd = math_ops.cast(inputs_shape[H_AXIS], dtypes.float32) + img_wd = math_ops.cast(inputs_shape[W_AXIS], dtypes.float32) min_angle = self.lower * 2. * np.pi max_angle = self.upper * 2. * np.pi angles = self._rng.uniform( - shape=[batch_size], minval=-min_angle, maxval=max_angle) + shape=[batch_size], minval=min_angle, maxval=max_angle) return transform( inputs, get_rotation_matrix(angles, img_hd, img_wd), @@ -815,16 +832,23 @@ class RandomZoom(Layer): """Randomly zoom each image during training. Arguments: - height_factor: a positive float represented as fraction of value, or a tuple - of size 2 representing lower and upper bound for zooming horizontally. - When represented as a single float, this value is used for both the - upper and lower bound. For instance, `height_factor=(0.2, 0.3)` result in - an output zoom varying in the range `[original * 20%, original * 30%]`. - width_factor: a positive float represented as fraction of value, or a tuple + height_factor: a float represented as fraction of value, or a tuple of size 2 representing lower and upper bound for zooming vertically. When represented as a single float, this value is used for both the - upper and lower bound. For instance, `width_factor=(0.2, 0.3)` result in - an output zoom varying in the range `[original * 20%, original * 30%]`. + upper and lower bound. A positive value means zooming out, while a + negative value means zooming in. 
+ For instance, `height_factor=(0.2, 0.3)` result in an output zoomed out + by a random amount in the range [+20%, +30%]. + `height_factor=(-0.3, -0.2)` result in an output zoomed in by a random + amount in the range [+20%, +30%]. + width_factor: a float represented as fraction of value, or a tuple + of size 2 representing lower and upper bound for zooming horizontally. + When represented as a single float, this value is used for both the + upper and lower bound. + For instance, `width_factor=(0.2, 0.3)` result in an output zooming out + between 20% to 30%. + `width_factor=(-0.3, -0.2)` result in an output zooming in between 20% + to 30%. fill_mode: Points outside the boundaries of the input are filled according to the given mode (one of `{'constant', 'reflect', 'wrap'}`). - *reflect*: `(d c b a | a b c d | d c b a)` @@ -863,35 +887,27 @@ class RandomZoom(Layer): self.height_lower = height_factor[0] self.height_upper = height_factor[1] else: - self.height_lower = self.height_upper = height_factor - if self.height_lower < 0. or self.height_upper < 0.: - raise ValueError('`height_factor` cannot have negative values, ' + self.height_lower = -height_factor + self.height_upper = height_factor + + if abs(self.height_lower) > 1. or abs(self.height_upper) > 1.: + raise ValueError('`height_factor` must have values between [-1, 1], ' 'got {}'.format(height_factor)) - if self.height_lower > self.height_upper: - raise ValueError('`height_factor` cannot have lower bound larger than ' - 'upper bound, got {}.'.format(height_factor)) self.width_factor = width_factor if isinstance(width_factor, (tuple, list)): self.width_lower = width_factor[0] self.width_upper = width_factor[1] else: - self.width_lower = self.width_upper = width_factor - if self.width_lower < 0. or self.width_upper < 0.: - raise ValueError('`width_factor` cannot have negative values, ' - 'got {}'.format(width_factor)) - if self.width_lower > self.width_upper: - raise ValueError('`width_factor` cannot have lower bound larger than ' - 'upper bound, got {}.'.format(width_factor)) + self.width_lower = -width_factor + self.width_upper = width_factor + + if self.width_lower < -1. or self.width_upper < -1.: + raise ValueError('`width_factor` must have values larger than -1, ' + 'got {}'.format(width_factor)) + + check_fill_mode_and_interpolation(fill_mode, interpolation) - if fill_mode not in {'reflect', 'wrap', 'constant'}: - raise NotImplementedError( - 'Unknown `fill_mode` {}. Only `reflect`, `wrap` and ' - '`constant` are supported.'.format(fill_mode)) - if interpolation not in {'nearest', 'bilinear'}: - raise NotImplementedError( - 'Unknown `interpolation` {}. Only `nearest` and ' - '`bilinear` are supported.'.format(interpolation)) self.fill_mode = fill_mode self.interpolation = interpolation self.seed = seed @@ -907,22 +923,19 @@ class RandomZoom(Layer): """Zoomed inputs with random ops.""" inputs_shape = array_ops.shape(inputs) batch_size = inputs_shape[0] - h_axis, w_axis = 1, 2 - img_hd = math_ops.cast(inputs_shape[h_axis], dtypes.float32) - img_wd = math_ops.cast(inputs_shape[w_axis], dtypes.float32) + img_hd = math_ops.cast(inputs_shape[H_AXIS], dtypes.float32) + img_wd = math_ops.cast(inputs_shape[W_AXIS], dtypes.float32) height_zoom = self._rng.uniform( shape=[batch_size, 1], - minval=-self.height_lower, - maxval=self.height_upper) - height_zoom = height_zoom * img_hd + minval=1. + self.height_lower, + maxval=1. 
+ self.height_upper) width_zoom = self._rng.uniform( shape=[batch_size, 1], - minval=-self.width_lower, - maxval=self.width_upper) - width_zoom = width_zoom * img_wd + minval=1. + self.width_lower, + maxval=1. + self.width_upper) zooms = math_ops.cast( - array_ops.concat([height_zoom, width_zoom], axis=1), - dtype=inputs.dtype) + array_ops.concat([width_zoom, height_zoom], axis=1), + dtype=dtypes.float32) return transform( inputs, get_zoom_matrix(zooms, img_hd, img_wd), fill_mode=self.fill_mode, @@ -974,8 +987,8 @@ def get_zoom_matrix(zooms, image_height, image_width, name=None): # [0 0 1]] # where the last entry is implicit. # Zoom matrices are always float32. - x_offset = ((image_height + 1.) / 2.0) * (zooms[:, 0, None] - 1.) - y_offset = ((image_width + 1.) / 2.0) * (zooms[:, 1, None] - 1.) + x_offset = ((image_width - 1.) / 2.0) * (1.0 - zooms[:, 0, None]) + y_offset = ((image_height - 1.) / 2.0) * (1.0 - zooms[:, 1, None]) return array_ops.concat( values=[ zooms[:, 0, None], @@ -1073,11 +1086,11 @@ class RandomHeight(Layer): factor: A positive float (fraction of original height), or a tuple of size 2 representing lower and upper bound for resizing vertically. When represented as a single float, this value is used for both the upper and - lower bound. For instance, `factor=(0.2, 0.3)` results in an output height - varying in the range `[original + 20%, original + 30%]`. `factor=(-0.2, - 0.3)` results in an output height varying in the range `[original - 20%, - original + 30%]`. `factor=0.2` results in an output height varying in the - range `[original - 20%, original + 20%]`. + lower bound. For instance, `factor=(0.2, 0.3)` results in an output with + height changed by a random amount in the range `[20%, 30%]`. + `factor=(-0.2, 0.3)` results in an output with height changed by a random + amount in the range `[-20%, +30%]. `factor=0.2` results in an output with + height changed by a random amount in the range `[-20%, +20%]`. interpolation: String, the interpolation method. Defaults to `bilinear`. Supports `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic` @@ -1085,10 +1098,10 @@ class RandomHeight(Layer): name: A string, the name of the layer. Input shape: - 4D tensor with shape: `(samples, height, width, channels)` + 4D tensor with shape: `(samples, width, height, channels)` (data_format='channels_last'). Output shape: - 4D tensor with shape: `(samples, random_height, width, channels)`. + 4D tensor with shape: `(samples, width, random_height, channels)`. """ def __init__(self, @@ -1099,12 +1112,17 @@ class RandomHeight(Layer): **kwargs): self.factor = factor if isinstance(factor, (tuple, list)): - self.height_lower = -factor[0] + self.height_lower = factor[0] self.height_upper = factor[1] else: - self.height_lower = self.height_upper = factor - if self.height_lower > 1.: - raise ValueError('`factor` cannot have abs lower bound larger than 1.0, ' + self.height_lower = -factor + self.height_upper = factor + + if self.height_upper < self.height_lower: + raise ValueError('`factor` cannot have upper bound less than ' + 'lower bound, got {}'.format(factor)) + if self.height_lower < -1. 
or self.height_upper < -1.: + raise ValueError('`factor` must have values larger than -1, ' 'got {}'.format(factor)) self.interpolation = interpolation self._interpolation_method = get_interpolation(interpolation) @@ -1120,19 +1138,18 @@ class RandomHeight(Layer): def random_height_inputs(): """Inputs height-adjusted with random ops.""" inputs_shape = array_ops.shape(inputs) - h_axis, w_axis = 1, 2 - img_hd = math_ops.cast(inputs_shape[h_axis], dtypes.float32) - img_wd = inputs_shape[w_axis] + img_hd = math_ops.cast(inputs_shape[H_AXIS], dtypes.float32) + img_wd = inputs_shape[W_AXIS] height_factor = self._rng.uniform( shape=[], - minval=(1.0 - self.height_lower), + minval=(1.0 + self.height_lower), maxval=(1.0 + self.height_upper)) adjusted_height = math_ops.cast(height_factor * img_hd, dtypes.int32) - adjusted_size = array_ops.stack([adjusted_height, img_wd]) + adjusted_size = array_ops.stack([img_wd, adjusted_height]) output = image_ops.resize_images_v2( images=inputs, size=adjusted_size, method=self._interpolation_method) original_shape = inputs.shape.as_list() - output_shape = [original_shape[0]] + [None] + original_shape[2:4] + output_shape = original_shape[0:2] + [None] + [original_shape[3]] output.set_shape(output_shape) return output @@ -1141,7 +1158,7 @@ class RandomHeight(Layer): def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], None, input_shape[2], input_shape[3]]) + [input_shape[0], input_shape[1], None, input_shape[3]]) def get_config(self): config = { @@ -1163,14 +1180,14 @@ class RandomWidth(Layer): By default, this layer is inactive during inference. Arguments: - factor: A positive float (fraction of original width), or a tuple of - size 2 representing lower and upper bound for resizing horizontally. When + factor: A positive float (fraction of original height), or a tuple of size 2 + representing lower and upper bound for resizing vertically. When represented as a single float, this value is used for both the upper and - lower bound. For instance, `factor=(0.2, 0.3)` results in an output width - varying in the range `[original + 20%, original + 30%]`. `factor=(-0.2, - 0.3)` results in an output width varying in the range `[original - 20%, - original + 30%]`. `factor=0.2` results in an output width varying in the - range `[original - 20%, original + 20%]`. + lower bound. For instance, `factor=(0.2, 0.3)` results in an output with + width changed by a random amount in the range `[20%, 30%]`. + `factor=(-0.2, 0.3)` results in an output with width changed by a random + amount in the range `[-20%, +30%]. `factor=0.2` results in an output with + width changed by a random amount in the range `[-20%, +20%]`. interpolation: String, the interpolation method. Defaults to `bilinear`. Supports `bilinear`, `nearest`, `bicubic`, `area`, `lanczos3`, `lanczos5`, `gaussian`, `mitchellcubic` @@ -1179,11 +1196,11 @@ class RandomWidth(Layer): Input shape: 4D tensor with shape: - `(samples, height, width, channels)` (data_format='channels_last'). + `(samples, width, height, channels)` (data_format='channels_last'). Output shape: 4D tensor with shape: - `(samples, random_height, width, channels)`. + `(samples, random_width, height, channels)`. 
""" def __init__(self, @@ -1194,12 +1211,16 @@ class RandomWidth(Layer): **kwargs): self.factor = factor if isinstance(factor, (tuple, list)): - self.width_lower = -factor[0] + self.width_lower = factor[0] self.width_upper = factor[1] else: - self.width_lower = self.width_upper = factor - if self.width_lower > 1.: - raise ValueError('`factor` cannot have abs lower bound larger than 1.0, ' + self.width_lower = -factor + self.width_upper = factor + if self.width_upper < self.width_lower: + raise ValueError('`factor` cannot have upper bound less than ' + 'lower bound, got {}'.format(factor)) + if self.width_lower < -1. or self.width_upper < -1.: + raise ValueError('`factor` must have values larger than -1, ' 'got {}'.format(factor)) self.interpolation = interpolation self._interpolation_method = get_interpolation(interpolation) @@ -1215,19 +1236,18 @@ class RandomWidth(Layer): def random_width_inputs(): """Inputs width-adjusted with random ops.""" inputs_shape = array_ops.shape(inputs) - h_axis, w_axis = 1, 2 - img_hd = inputs_shape[h_axis] - img_wd = math_ops.cast(inputs_shape[w_axis], dtypes.float32) + img_hd = inputs_shape[H_AXIS] + img_wd = math_ops.cast(inputs_shape[W_AXIS], dtypes.float32) width_factor = self._rng.uniform( shape=[], - minval=(1.0 - self.width_lower), + minval=(1.0 + self.width_lower), maxval=(1.0 + self.width_upper)) adjusted_width = math_ops.cast(width_factor * img_wd, dtypes.int32) - adjusted_size = array_ops.stack([img_hd, adjusted_width]) + adjusted_size = array_ops.stack([adjusted_width, img_hd]) output = image_ops.resize_images_v2( images=inputs, size=adjusted_size, method=self._interpolation_method) original_shape = inputs.shape.as_list() - output_shape = original_shape[0:2] + [None] + [original_shape[3]] + output_shape = [original_shape[0]] + [None] + original_shape[2:4] output.set_shape(output_shape) return output @@ -1236,7 +1256,7 @@ class RandomWidth(Layer): def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], input_shape[1], None, input_shape[3]]) + [input_shape[0], None, input_shape[2], input_shape[3]]) def get_config(self): config = { diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index a741ee1c069..95630a5b853 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -48,8 +48,8 @@ class ResizingTest(keras_parameterized.TestCase): testing_utils.layer_test( image_preprocessing.Resizing, kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), - expected_output_shape=(None, expected_height, expected_width, + input_shape=(num_samples, orig_width, orig_height, channels), + expected_output_shape=(None, expected_width, expected_height, channels)) @parameterized.named_parameters( @@ -74,6 +74,40 @@ class ResizingTest(keras_parameterized.TestCase): with CustomObjectScope({'Resizing': image_preprocessing.Resizing}): self._run_test(kwargs, expected_height, expected_width) + def test_down_sampling_numeric(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 16), (1, 4, 4, 1)).astype(dtype) + layer = image_preprocessing.Resizing( + height=2, width=2, interpolation='nearest') + output_image = layer(input_image) + # pyformat: disable + expected_output = 
np.asarray([ + [5, 7], + [13, 15] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 2, 2, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_up_sampling_numeric(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 4), (1, 2, 2, 1)).astype(dtype) + layer = image_preprocessing.Resizing( + height=4, width=4, interpolation='nearest') + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([ + [0, 0, 1, 1], + [0, 0, 1, 1], + [2, 2, 3, 3], + [2, 2, 3, 3] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 4, 4, 1)) + self.assertAllEqual(expected_output, output_image) + @parameterized.named_parameters( ('reshape_bilinear_10_by_4', {'interpolation': 'bilinear'}, 10, 4)) def test_reshaping(self, kwargs, expected_height, expected_width): @@ -92,13 +126,13 @@ class ResizingTest(keras_parameterized.TestCase): def get_numpy_center_crop(images, expected_height, expected_width): - orig_height = images.shape[1] - orig_width = images.shape[2] + orig_height = images.shape[2] + orig_width = images.shape[1] height_start = int((orig_height - expected_height) / 2) width_start = int((orig_width - expected_width) / 2) height_end = height_start + expected_height width_end = width_start + expected_width - return images[:, height_start:height_end, width_start:width_end, :] + return images[:, width_start:width_end, height_start:height_end, :] @keras_parameterized.run_all_keras_modes(always_skip_v1=True) @@ -112,17 +146,17 @@ class CenterCropTest(keras_parameterized.TestCase): channels = 3 kwargs = {'height': expected_height, 'width': expected_width} input_images = np.random.random( - (num_samples, orig_height, orig_width, channels)).astype(np.float32) + (num_samples, orig_width, orig_height, channels)).astype(np.float32) expected_output = get_numpy_center_crop( input_images, expected_height, expected_width) with tf_test_util.use_gpu(): testing_utils.layer_test( image_preprocessing.CenterCrop, kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), + input_shape=(num_samples, orig_width, orig_height, channels), input_data=input_images, expected_output=expected_output, - expected_output_shape=(None, expected_height, expected_width, + expected_output_shape=(None, expected_width, expected_height, channels)) @parameterized.named_parameters( @@ -176,8 +210,8 @@ class RandomCropTest(keras_parameterized.TestCase): testing_utils.layer_test( image_preprocessing.RandomCrop, kwargs=kwargs, - input_shape=(num_samples, orig_height, orig_width, channels), - expected_output_shape=(None, expected_height, expected_width, + input_shape=(num_samples, orig_width, orig_height, channels), + expected_output_shape=(None, expected_width, expected_height, channels)) @parameterized.named_parameters( @@ -199,16 +233,16 @@ class RandomCropTest(keras_parameterized.TestCase): height, width = 3, 4 height_offset = np.random.randint(low=0, high=3) width_offset = np.random.randint(low=0, high=5) - mock_offset = [0, height_offset, width_offset, 0] + mock_offset = [0, width_offset, height_offset, 0] with test.mock.patch.object( stateless_random_ops, 'stateless_random_uniform', return_value=mock_offset): with tf_test_util.use_gpu(): layer = image_preprocessing.RandomCrop(height, width) - inp = np.random.random((12, 5, 8, 3)) + inp = np.random.random((12, 8, 5, 3)) actual_output = layer(inp, training=1) - expected_output = inp[:, 
height_offset:(height_offset + height), - width_offset:(width_offset + width), :] + expected_output = inp[:, width_offset:(width_offset + width), + height_offset:(height_offset + height), :] self.assertAllClose(expected_output, actual_output) @parameterized.named_parameters( @@ -223,7 +257,7 @@ class RandomCropTest(keras_parameterized.TestCase): with CustomObjectScope({'RandomCrop': image_preprocessing.RandomCrop}): self._run_test(expected_height, expected_width) - def test_predicting_with_mock_longer_height(self): + def test_predicting_with_mock_longer_width(self): np.random.seed(1337) height, width = 3, 3 inp = np.random.random((12, 10, 6, 3)) @@ -235,16 +269,15 @@ class RandomCropTest(keras_parameterized.TestCase): expected_output = resized_inp[:, 1:4, :, :] self.assertAllClose(expected_output, actual_output) - def test_predicting_with_mock_longer_width(self): + def test_predicting_with_mock_longer_height(self): np.random.seed(1337) height, width = 4, 6 inp = np.random.random((12, 8, 16, 3)) with tf_test_util.use_gpu(): layer = image_preprocessing.RandomCrop(height, width) actual_output = layer(inp, training=0) - resized_inp = image_ops.resize_images_v2( - inp, size=[4, 8]) - expected_output = resized_inp[:, :, 1:7, :] + resized_inp = image_ops.resize_images_v2(inp, size=[6, 12]) + expected_output = resized_inp[:, :, 4:8, :] self.assertAllClose(expected_output, actual_output) def test_config_with_custom_name(self): @@ -475,21 +508,129 @@ class RandomTranslationTest(keras_parameterized.TestCase): @parameterized.named_parameters( ('random_translate_4_by_6', .4, .6), ('random_translate_3_by_2', .3, .2), - ('random_translate_tuple_factor', (.5, .4), (.2, .3))) + ('random_translate_tuple_factor', (-.5, .4), (.2, .3))) def test_random_translation(self, height_factor, width_factor): self._run_test(height_factor, width_factor) - def test_random_translation_negative_lower(self): - mock_offset = np.random.random((12, 1)) - with test.mock.patch.object( - gen_stateful_random_ops, 'stateful_uniform', return_value=mock_offset): - with self.cached_session(use_gpu=True): - layer = image_preprocessing.RandomTranslation((-0.2, .3), .4) - layer_2 = image_preprocessing.RandomTranslation((0.2, .3), .4) - inp = np.random.random((12, 5, 8, 3)).astype(np.float32) - actual_output = layer(inp, training=1) - actual_output_2 = layer_2(inp, training=1) - self.assertAllClose(actual_output, actual_output_2) + def test_random_translation_up_numeric_reflect(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) + # Shifting by -.2 * 5 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=(-.2, -.2), width_factor=0.) + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([ + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + [20, 21, 22, 23, 24] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_up_numeric_constant(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) + # Shifting by -.2 * 5 = 1 pixel. 
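        # [Editor's aside - not part of the original patch] A brief note on what
        # these numeric tests pin down, based on the expected_output arrays in
        # this hunk: with the default 'reflect' fill (previous test) the vacated
        # bottom row repeats the edge row [20, 21, 22, 23, 24], whereas with
        # fill_mode='constant' (the layer constructed just below) that same row
        # is expected to be all zeros.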
+ layer = image_preprocessing.RandomTranslation( + height_factor=(-.2, -.2), width_factor=0., fill_mode='constant') + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([ + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19], + [20, 21, 22, 23, 24], + [0, 0, 0, 0, 0] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_down_numeric_reflect(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) + # Shifting by .2 * 5 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=(.2, .2), width_factor=0.) + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([ + [0, 1, 2, 3, 4], + [0, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_down_numeric_constant(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) + # Shifting by -.2 * 5 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=(.2, .2), width_factor=0., fill_mode='constant') + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([ + [0, 0, 0, 0, 0], + [0, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [10, 11, 12, 13, 14], + [15, 16, 17, 18, 19] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_left_numeric_reflect(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) + # Shifting by .2 * 5 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=0., width_factor=(-.2, -.2)) + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([ + [1, 2, 3, 4, 4], + [6, 7, 8, 9, 9], + [11, 12, 13, 14, 14], + [16, 17, 18, 19, 19], + [21, 22, 23, 24, 24] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_translation_left_numeric_constant(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (1, 5, 5, 1)).astype(dtype) + # Shifting by -.2 * 5 = 1 pixel. 
+ layer = image_preprocessing.RandomTranslation( + height_factor=0., width_factor=(-.2, -.2), fill_mode='constant') + output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([ + [1, 2, 3, 4, 0], + [6, 7, 8, 9, 0], + [11, 12, 13, 14, 0], + [16, 17, 18, 19, 0], + [21, 22, 23, 24, 0] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) def test_random_translation_inference(self): with CustomObjectScope( @@ -768,7 +909,7 @@ class RandomRotationTest(keras_parameterized.TestCase): @parameterized.named_parameters(('random_rotate_4', .4), ('random_rotate_3', .3), - ('random_rotate_tuple_factor', (.5, .4))) + ('random_rotate_tuple_factor', (-.5, .4))) def test_random_rotation(self, factor): self._run_test(factor) @@ -808,22 +949,55 @@ class RandomZoomTest(keras_parameterized.TestCase): expected_output_shape=(None, orig_height, orig_width, channels)) @parameterized.named_parameters( - ('random_zoom_4_by_6', .4, .6), ('random_zoom_2_by_3', .2, .3), - ('random_zoom_tuple_factor', (.4, .5), (.2, .3))) + ('random_zoom_4_by_6', -.4, -.6), ('random_zoom_2_by_3', -.2, -.3), + ('random_zoom_tuple_factor', (-.4, -.5), (-.2, -.3))) def test_random_zoom_in(self, height_factor, width_factor): self._run_test(height_factor, width_factor) @parameterized.named_parameters( - ('random_zoom_4_by_6', 1.4, 1.6), ('random_zoom_2_by_3', 1.2, 1.3), - ('random_zoom_tuple_factor', (1.4, 1.5), (1.2, 1.3))) + ('random_zoom_4_by_6', .4, .6), ('random_zoom_2_by_3', .2, .3), + ('random_zoom_tuple_factor', (.4, .5), (.2, .3))) def test_random_zoom_out(self, height_factor, width_factor): self._run_test(height_factor, width_factor) - def test_random_zoom_invalid_factor(self): - with self.assertRaises(ValueError): - image_preprocessing.RandomZoom((.5, .4), .2) - with self.assertRaises(ValueError): - image_preprocessing.RandomZoom(.2, (.5, .4)) + def test_random_zoom_in_numeric(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype) + layer = image_preprocessing.RandomZoom((-.5, -.5), (-.5, -.5), + interpolation='nearest') + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([ + [6, 7, 7, 8, 8], + [11, 12, 12, 13, 13], + [11, 12, 12, 13, 13], + [16, 17, 17, 18, 18], + [16, 17, 17, 18, 18] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_zoom_out_numeric(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype) + layer = image_preprocessing.RandomZoom((.5, .5), (.5, .5), + fill_mode='constant', + interpolation='nearest') + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([ + [0, 0, 0, 0, 0], + [0, 6, 7, 9, 0], + [0, 11, 12, 14, 0], + [0, 21, 22, 24, 0], + [0, 0, 0, 0, 0] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) def test_random_zoom_inference(self): with CustomObjectScope( @@ -853,15 +1027,15 @@ class RandomHeightTest(keras_parameterized.TestCase): orig_width = 8 channels = 3 with tf_test_util.use_gpu(): - img = np.random.random((num_samples, orig_height, orig_width, 
channels)) + img = np.random.random((num_samples, orig_width, orig_height, channels)) layer = image_preprocessing.RandomHeight(factor) img_out = layer(img, training=True) self.assertEqual(img_out.shape[0], 2) - self.assertEqual(img_out.shape[2], 8) + self.assertEqual(img_out.shape[1], 8) self.assertEqual(img_out.shape[3], 3) @parameterized.named_parameters(('random_height_4_by_6', (.4, .6)), - ('random_height_3_by_2', (.3, 1.2)), + ('random_height_3_by_2', (-.3, .2)), ('random_height_3', .3)) def test_random_height_basic(self, factor): self._run_test(factor) @@ -872,10 +1046,42 @@ class RandomHeightTest(keras_parameterized.TestCase): with test.mock.patch.object( gen_stateful_random_ops, 'stateful_uniform', return_value=mock_factor): with tf_test_util.use_gpu(): - img = np.random.random((12, 5, 8, 3)) + img = np.random.random((12, 8, 5, 3)) layer = image_preprocessing.RandomHeight(.4) img_out = layer(img, training=True) - self.assertEqual(img_out.shape[1], 3) + self.assertEqual(img_out.shape[2], 3) + + def test_random_height_longer_numeric(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 6), (3, 2, 1)).astype(dtype) + layer = image_preprocessing.RandomHeight(factor=(1., 1.)) + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([ + [0, 0.25, 0.75, 1], + [2, 2.25, 2.75, 3], + [4, 4.25, 4.75, 5] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 3, 4, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_height_shorter_numeric(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 8), (2, 4, 1)).astype(dtype) + layer = image_preprocessing.RandomHeight( + factor=(-.5, -.5), interpolation='nearest') + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([ + [1, 3], + [5, 7] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 2, 2, 1)) + self.assertAllEqual(expected_output, output_image) def test_random_height_invalid_factor(self): with self.assertRaises(ValueError): @@ -908,15 +1114,15 @@ class RandomWidthTest(keras_parameterized.TestCase): orig_width = 8 channels = 3 with tf_test_util.use_gpu(): - img = np.random.random((num_samples, orig_height, orig_width, channels)) + img = np.random.random((num_samples, orig_width, orig_height, channels)) layer = image_preprocessing.RandomWidth(factor) img_out = layer(img, training=True) self.assertEqual(img_out.shape[0], 2) - self.assertEqual(img_out.shape[1], 5) + self.assertEqual(img_out.shape[2], 5) self.assertEqual(img_out.shape[3], 3) @parameterized.named_parameters(('random_width_4_by_6', (.4, .6)), - ('random_width_3_by_2', (.3, 1.2)), + ('random_width_3_by_2', (-.3, .2)), ('random_width_3', .3)) def test_random_width_basic(self, factor): self._run_test(factor) @@ -927,10 +1133,43 @@ class RandomWidthTest(keras_parameterized.TestCase): with test.mock.patch.object( gen_stateful_random_ops, 'stateful_uniform', return_value=mock_factor): with tf_test_util.use_gpu(): - img = np.random.random((12, 8, 5, 3)) + img = np.random.random((12, 5, 8, 3)) layer = image_preprocessing.RandomWidth(.4) img_out = layer(img, training=True) - self.assertEqual(img_out.shape[2], 3) + self.assertEqual(img_out.shape[1], 3) + + def test_random_width_longer_numeric(self): + for dtype in (np.int64, np.float32): + with 
tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 6), (2, 3, 1)).astype(dtype) + layer = image_preprocessing.RandomWidth(factor=(1., 1.)) + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([ + [0, 1, 2], + [0.75, 1.75, 2.75], + [2.25, 3.25, 4.25], + [3, 4, 5] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 4, 3, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_width_shorter_numeric(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 8), (4, 2, 1)).astype(dtype) + layer = image_preprocessing.RandomWidth( + factor=(-.5, -.5), interpolation='nearest') + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([ + [2, 3], + [6, 7] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 2, 2, 1)) + self.assertAllEqual(expected_output, output_image) def test_random_width_invalid_factor(self): with self.assertRaises(ValueError): From 40835281112e42fd34af2aea4d3788d15b8e795a Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Fri, 8 May 2020 16:08:20 -0700 Subject: [PATCH 0231/1533] [Executor] Fix segfault when using verbose logging in the executor module. Selective vlog-enabling for the executor module would trigger a crash in `SimplePropagatorState::DumpState()` (called when the executor executes a kernel that produces an error status). Because `SimplePropagatorState` is in a separate module, and selective vlogging might not be enabled for that module as well, the necessary state was not created, leading to a null pointer dereference. The fix is to share the executor's vlog setting with the propagators, so that if vlogging is enabled for the executor, `SimplePropagatorState` will create the necessary debugging structures (i.e. the `SimplePropgagatorState::active_` vector). PiperOrigin-RevId: 310646985 Change-Id: Ic4220bfdb7e0800e1ad8a5eb6d468db5886367ad --- tensorflow/core/common_runtime/executor.cc | 2 +- tensorflow/core/common_runtime/propagator_state.cc | 4 ++-- tensorflow/core/common_runtime/propagator_state.h | 3 ++- tensorflow/core/common_runtime/simple_propagator_state.cc | 8 ++++---- tensorflow/core/common_runtime/simple_propagator_state.h | 5 +++-- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 74de6b28d3f..447a9e0ae77 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -403,7 +403,7 @@ ExecutorState::ExecutorState( runner_(args.runner), sync_on_finish_(args.sync_on_finish), run_all_kernels_inline_(args.run_all_kernels_inline), - propagator_(immutable_state, step_id_), + propagator_(immutable_state, step_id_, vlog_), num_outstanding_ops_(0) { if (args.user_intra_op_threadpool != nullptr) { Device* device = immutable_state_.params().device; diff --git a/tensorflow/core/common_runtime/propagator_state.cc b/tensorflow/core/common_runtime/propagator_state.cc index 30529dec742..a6639b1132e 100644 --- a/tensorflow/core/common_runtime/propagator_state.cc +++ b/tensorflow/core/common_runtime/propagator_state.cc @@ -26,10 +26,10 @@ limitations under the License. 
namespace tensorflow { PropagatorState::PropagatorState(const ImmutableExecutorState& immutable_state, - int64 step_id) + int64 step_id, bool vlog) : immutable_state_(immutable_state), step_id_(step_id), - vlog_(VLOG_IS_ON(1)) { + vlog_(vlog || VLOG_IS_ON(1)) { // We start the entire execution in iteration 0 of the root frame // so let us create the root frame and the state for iteration 0. // We assume root_frame_->frame_name.empty(). diff --git a/tensorflow/core/common_runtime/propagator_state.h b/tensorflow/core/common_runtime/propagator_state.h index d61adeff5c4..167519ccc73 100644 --- a/tensorflow/core/common_runtime/propagator_state.h +++ b/tensorflow/core/common_runtime/propagator_state.h @@ -45,7 +45,8 @@ typedef gtl::InlinedVector AllocatorAttributeVec; // adding them to a `TaggedNodeSeq`. class PropagatorState { public: - PropagatorState(const ImmutableExecutorState& immutable_state, int64 step_id); + PropagatorState(const ImmutableExecutorState& immutable_state, int64 step_id, + bool vlog); ~PropagatorState(); private: diff --git a/tensorflow/core/common_runtime/simple_propagator_state.cc b/tensorflow/core/common_runtime/simple_propagator_state.cc index 48fac96dd3d..01322cc3514 100644 --- a/tensorflow/core/common_runtime/simple_propagator_state.cc +++ b/tensorflow/core/common_runtime/simple_propagator_state.cc @@ -23,16 +23,16 @@ limitations under the License. namespace tensorflow { SimplePropagatorState::SimplePropagatorState( - const ImmutableExecutorState& immutable_state, int64 step_id) + const ImmutableExecutorState& immutable_state, int64 step_id, bool vlog) : SimplePropagatorState(immutable_state, step_id, - immutable_state.get_root_frame_info()) {} + immutable_state.get_root_frame_info(), vlog) {} SimplePropagatorState::SimplePropagatorState( const ImmutableExecutorState& immutable_state, int64 step_id, - const ImmutableExecutorState::FrameInfo& finfo) + const ImmutableExecutorState::FrameInfo& finfo, bool vlog) : immutable_state_(immutable_state), step_id_(step_id), - vlog_(VLOG_IS_ON(1)), + vlog_(vlog || VLOG_IS_ON(1)), input_tensors_(finfo.total_inputs), pending_( new std::atomic[immutable_state.graph_view().num_nodes()]), diff --git a/tensorflow/core/common_runtime/simple_propagator_state.h b/tensorflow/core/common_runtime/simple_propagator_state.h index 1aee4c7ff2f..024341e5048 100644 --- a/tensorflow/core/common_runtime/simple_propagator_state.h +++ b/tensorflow/core/common_runtime/simple_propagator_state.h @@ -47,7 +47,7 @@ namespace tensorflow { class SimplePropagatorState { public: SimplePropagatorState(const ImmutableExecutorState& immutable_state, - int64 step_id); + int64 step_id, bool vlog); ~SimplePropagatorState(); // A `TaggedNode` corresponds to a single invocation of a node's kernel, @@ -157,7 +157,8 @@ class SimplePropagatorState { private: SimplePropagatorState(const ImmutableExecutorState& immutable_state_, int64 step_id, - const ImmutableExecutorState::FrameInfo& finfo); + const ImmutableExecutorState::FrameInfo& finfo, + bool vlog); const ImmutableExecutorState& immutable_state_; const int64 step_id_; From 31f56e0d9a3213768fd9342c0c19290048000b3e Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Fri, 8 May 2020 16:19:32 -0700 Subject: [PATCH 0232/1533] Remove the special handling of TRTEngineOp in saved_model loader. 
PiperOrigin-RevId: 310648600 Change-Id: Id13d89b4aae66155322b57c2b53a3ac802550c9b --- tensorflow/python/saved_model/function_deserialization.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py index b36a1f27456..aeca90bdfd1 100644 --- a/tensorflow/python/saved_model/function_deserialization.py +++ b/tensorflow/python/saved_model/function_deserialization.py @@ -332,11 +332,6 @@ def load_function_def_library(library, load_shared_name_suffix=None): functions[fdef.signature.name] = func renamed_functions[func.name] = func - if any(op.type == "TRTEngineOp" for op in func_graph.get_operations()): - # TODO(b/150708051): Remove this hack once TensorRT SavedModel integration - # is fixed. Currently it's leaking memory to maintain bug compatibility - # with previous behavior. - func.add_to_graph(ops.get_default_graph()) return functions From 68952b2608d0b5bd614adb33b753288f99be66f9 Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Sat, 9 May 2020 05:16:03 +0530 Subject: [PATCH 0233/1533] Added POC for tf_program Signed-off-by: Shraiysh Vaishay --- tensorflow/python/tf_program/BUILD | 22 + tensorflow/python/tf_program/mlir_gen.py | 452 ++++++++++++++++++ .../python/tf_program/mlir_wrapper/BUILD | 36 ++ .../python/tf_program/mlir_wrapper/attrs.cc | 25 + .../tf_program/mlir_wrapper/basic_classes.cc | 49 ++ .../tf_program/mlir_wrapper/builders.cc | 51 ++ .../tf_program/mlir_wrapper/mlir_util.h | 25 + .../tf_program/mlir_wrapper/mlir_wrapper.cc | 42 ++ .../python/tf_program/mlir_wrapper/ops.cc | 194 ++++++++ .../python/tf_program/mlir_wrapper/types.cc | 48 ++ tensorflow/python/tf_program/pywrap_tfd.py | 149 ++++++ tensorflow/python/tf_program/tests/BUILD | 31 ++ .../tf_program/tests/filecheck_wrapper.cc | 36 ++ .../python/tf_program/tests/mlir_gen_test.py | 228 +++++++++ 14 files changed, 1388 insertions(+) create mode 100644 tensorflow/python/tf_program/BUILD create mode 100644 tensorflow/python/tf_program/mlir_gen.py create mode 100644 tensorflow/python/tf_program/mlir_wrapper/BUILD create mode 100644 tensorflow/python/tf_program/mlir_wrapper/attrs.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/basic_classes.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/builders.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/mlir_util.h create mode 100644 tensorflow/python/tf_program/mlir_wrapper/mlir_wrapper.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/ops.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/types.cc create mode 100644 tensorflow/python/tf_program/pywrap_tfd.py create mode 100644 tensorflow/python/tf_program/tests/BUILD create mode 100644 tensorflow/python/tf_program/tests/filecheck_wrapper.cc create mode 100644 tensorflow/python/tf_program/tests/mlir_gen_test.py diff --git a/tensorflow/python/tf_program/BUILD b/tensorflow/python/tf_program/BUILD new file mode 100644 index 00000000000..69eb9e7c031 --- /dev/null +++ b/tensorflow/python/tf_program/BUILD @@ -0,0 +1,22 @@ +package(licenses = ["notice"]) + +py_library( + name = "pywrap_tfd", + srcs = ["pywrap_tfd.py"], + deps = [ + "//tensorflow/python/tf_program/mlir_wrapper", + ], +) + +py_library( + name = "mlir_gen", + srcs = ["mlir_gen.py"], + visibility = ["//visibility:public"], + deps = [ + ":pywrap_tfd", + "//tensorflow/python/autograph/pyct", + "//tensorflow/python/autograph/pyct/static_analysis", + "//tensorflow/python/types", + 
"@gast_archive//:gast", + ], +) diff --git a/tensorflow/python/tf_program/mlir_gen.py b/tensorflow/python/tf_program/mlir_gen.py new file mode 100644 index 00000000000..74622d44424 --- /dev/null +++ b/tensorflow/python/tf_program/mlir_gen.py @@ -0,0 +1,452 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +'''mlir_gen: Generate mlir code from python code''' + +# pylint: disable=invalid-name +# pylint: disable=missing-function-docstring + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast as ast +import tensorflow.python.tf_program.pywrap_tfd as tfp +from tensorflow.python.autograph.pyct import anno +from tensorflow.python.autograph.pyct import cfg +from tensorflow.python.autograph.pyct import inspect_utils +from tensorflow.python.autograph.pyct import naming +from tensorflow.python.autograph.pyct import parser +from tensorflow.python.autograph.pyct import qual_names +from tensorflow.python.autograph.pyct import transformer +from tensorflow.python.autograph.pyct.static_analysis import activity +from tensorflow.python.autograph.pyct.static_analysis import annos +from tensorflow.python.autograph.pyct.static_analysis import liveness +from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions +from tensorflow.python.autograph.pyct.static_analysis import reaching_fndefs +from tensorflow.python.types import core + +class SymbolTable: + ''' + Symbol Table for python code + ''' + def __init__(self): + self.symbols = [] + self.enter_scope() + + def enter_scope(self): + ''' + Enter a new scope - at function level + ''' + self.symbols.append({'types': {}, 'symbols': {}}) + self.curr_table = self.symbols[len(self.symbols)-1] + + def insert_symbol(self, name, value): + self.curr_table['symbols'][name] = value + self.curr_table['types'][name] = value.getType() + return value + + def insert_type(self, name, type_): + self.curr_table['types'][name] = type_ + + def exit_scope(self): + self.symbols.pop() + self.curr_table = self.symbols[len(self.symbols)-1] + + def lookup(self, name): + curr_idx = len(self.symbols)-1 + while curr_idx >= 0 and (not name in self.symbols[curr_idx]['symbols']): + curr_idx -= 1 + if curr_idx < 0: + return None + return self.symbols[curr_idx]['symbols'][name] + + def lookup_type(self, name): + curr_idx = len(self.symbols)-1 + while curr_idx >= 0 and (not name in self.symbols[curr_idx]['types']): + curr_idx -= 1 + if curr_idx < 0: + return None + return self.symbols[curr_idx]['types'][name] + + def __repr__(self): + s = '\n'.join(' ' * idx * 2 + str(table) + for idx, table in enumerate(self.symbols)) + return s + +class ProcessType(ast.NodeVisitor): + ''' + Visit a node and return processed type + Currently only visits annotations and gives their type + ''' + def __init__(self, prog, ctx): + self.prog = prog + self.ctx = ctx + + def 
visit_Attribute(self, node): + # Supported: core.Tensor + value = self.visit(node.value) + if value is None or not hasattr(value, node.attr): + raise AttributeError(str(type(value)) + ' has no attribute ' + node.attr) + attr = getattr(value, node.attr) + + if attr == core.Tensor: + return tfp.UnrankedTensorType.get(tfp.IntegerType.get(32, self.prog.ctx)) + return attr + + def visit_Name(self, node): + if node.id == 'int': + return tfp.IntegerType.get(32, self.prog.ctx) + if node.id == 'bool': + return tfp.IntegerType.get(1, self.prog.ctx) + if node.id in self.ctx.info.namespace: + return self.ctx.info.namespace[node.id] + +class MLIRGen(ast.NodeVisitor): + ''' + Visit the AST and generate MLIR code + Requires liveness, reading_definitions + ''' + def __init__(self, ctx): + self.ctx = ctx + self.symbol_table = SymbolTable() + self.prog = tfp.TFProgram() + self.opbuilder = None + + def visit_block(self, block): + return [self.visit(item) for item in block] + + def process_type(self, node): + return ProcessType(self.prog, self.ctx).visit(node) + + def visit_Assign(self, node): + value = self.visit(node.value) + if isinstance(value, tuple): + # If it is a tuple of values, assign one to each in targets + # TODO: This currently is assuming that all elts in targets[0] are Name + # objects. This might not be always True. + for key, val in zip(node.targets[0].elts, value): + self.symbol_table.insert_symbol(key.id, val) + else: + self.symbol_table.insert_symbol(node.targets[0].id, value) + + def visit_BinOp(self, node): + left = self.visit(node.left) + right = self.visit(node.right) + if isinstance(node.op, ast.Sub): + return tfp.Tf_SubOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), left, + right).getResult(0) + if isinstance(node.op, ast.Add): + return tfp.Tf_AddV2Op.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), left, + right).getResult(0) + + def visit_BoolOp(self, node): + values = [self.visit(value) for value in node.values] + if isinstance(node.op, ast.Or): + return tfp.OrOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), values).getResult(0) + if isinstance(node.op, ast.And): + return tfp.AndOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), values).getResult(0) + + def visit_Call(self, node): + func = self.visit(node.func) + args = [self.visit(arg) for arg in node.args] + callop = tfp.Tf_LegacyCallOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), + func.getType().getResults(), args, func.getName()) + if callop.getNumResults() == 1: + return callop[0] + return tuple(callop.getResult(idx) for idx in range(callop.getNumResults())) + + def visit_Compare(self, node): + left = self.visit(node.left) + opb = self.opbuilder + for op, right in zip(node.ops, node.comparators): + if isinstance(op, ast.Eq): + left = tfp.Tf_EqualOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.Lt): + left = tfp.Tf_LessOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.LtE): + left = tfp.Tf_LessEqualOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.Gt): + left = tfp.Tf_GreaterOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.GtE): + left = tfp.Tf_GreaterEqualOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.NotEq): + left = tfp.Tf_NotEqualOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) 
+ else: + print(op) + raise NotImplementedError("CompareOp operator not recognized") + return left + + def visit_Constant(self, node): + opb = self.opbuilder + value = None + if isinstance(node.value, int): + value = tfp.Tf_ConstOp.create( + opb, opb.getUnknownLoc(), tfp.IntegerAttr.get(tfp.IntegerType.get( + 32, self.prog.ctx), node.value)).getResult(0) + return value + + def visit_FunctionDef(self, node): + # Cache the current builder + cache_builder = self.opbuilder + inputs, outputs = [], [] + + for arg in node.args.args: + inputs.append(self.process_type(arg.annotation)) + + if node.returns: + outputs = [self.process_type(node.returns)] + + currfunc = self.prog.add_function( + self.ctx.namer.new_symbol(node.name, []), + self.prog.get_function_type(inputs, outputs)) + + # Add the function to symbol table and enter new scope + self.symbol_table.insert_symbol(node.name, currfunc) + self.symbol_table.enter_scope() + + # Add arguments to symbol table + for arg, value in zip(node.args.args, currfunc.getArguments()): + self.symbol_table.insert_symbol(arg.id, value) + self.opbuilder = tfp.OpBuilder(currfunc.getBody()) + + self.visit_block(node.body) + self.symbol_table.exit_scope() + self.opbuilder = cache_builder + + def visit_If(self, node): + cond = self.visit(node.test) + + # Create ifop + body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE) + orelse_scope = anno.getanno(node, annos.NodeAnno.ORELSE_SCOPE) + modified_in_cond = list(body_scope.modified | orelse_scope.modified) + outputs = [self.symbol_table.lookup_type(str(var)) + for var in modified_in_cond] + ifop = tfp.IfOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), cond, outputs) + + # Cache the builder + cache_builder = self.opbuilder + + # Visit body + self.opbuilder = tfp.OpBuilder(ifop.getRegion(0)) + # Enter scope to avoid values generated inside the region to come in symbol table + self.symbol_table.enter_scope() + for stmt in node.body: + self.visit(stmt) + retvals = [self.symbol_table.lookup(str(varname)) + for varname in modified_in_cond] + tfp.ReturnOp.create(self.opbuilder, self.opbuilder.getUnknownLoc(), retvals) + self.symbol_table.exit_scope() + + # Visit orelse + self.opbuilder = tfp.OpBuilder(ifop.getRegion(1)) + self.symbol_table.enter_scope() + for stmt in node.orelse: + self.visit(stmt) + retvals = [self.symbol_table.lookup(str(varname)) + for varname in modified_in_cond] + tfp.ReturnOp.create(self.opbuilder, self.opbuilder.getUnknownLoc(), retvals) + self.symbol_table.exit_scope() + + # Reset builder and enter return values in symbol table + self.opbuilder = cache_builder + for idx, var in enumerate(modified_in_cond): + self.symbol_table.insert_symbol(str(var), ifop.getResult(idx)) + + if ifop.getNumResults() == 1: + return ifop.getResult(0) + + return tuple(ifop.getResult(i) for i in range(ifop.getNumResults())) + + def visit_Name(self, node): + if self.symbol_table.lookup(node.id): + return self.symbol_table.lookup(node.id) + raise NotImplementedError('Symbol not found' + node.id) + + def visit_Return(self, node): + opb = self.opbuilder + value = self.visit(node.value) + if isinstance(value, tuple): + # For more than one return values + return tfp.ReturnOp.create(opb, opb.getUnknownLoc(), list(value)) + return tfp.ReturnOp.create(opb, opb.getUnknownLoc(), [value]) + + def visit_Tuple(self, node): + return tuple(self.visit(elt) for elt in node.elts) + + def visit_UnaryOp(self, node): + operand = self.visit(node.operand) + if isinstance(node.op, ast.USub): + return tfp.Tf_NegOp.create( + 
self.opbuilder, self.opbuilder.getUnknownLoc(), operand).getResult(0) + + def _get_basic_loop_vars(self, modified, live_in, live_out): + # [This is directly from + # tensorflow/python/autograph/converters/control_flow.py] + # The loop variables corresponding to simple symbols (e.g. `x`). + basic_loop_vars = [] + for s in modified: + if s.is_composite(): + # TODO: Raise an error when this happens for a TF loop. + continue + # Variables not live into or out of the loop are considered local to the + # loop. + if s not in live_in and s not in live_out: + continue + basic_loop_vars.append(s) + return frozenset(basic_loop_vars) + + def _get_composite_loop_vars(self, modified, live_in): + # [This is directly from + # tensorflow/python/autograph/converters/control_flow.py] + # The loop variables corresponding to composite symbols (e.g. `self.x`). + composite_loop_vars = [] + for s in modified: + if not s.is_composite(): + continue + # Mutations made to objects created inside the loop will appear as writes + # to composite symbols. Because these mutations appear as modifications + # made to composite symbols, we check whether the composite's parent is + # actually live into the loop. + # Example: + # while cond: + # x = Foo() + # x.foo = 2 * x.foo # x.foo is live into the loop, but x is not. + # + # Note that some parents might not be symbols - for example, in x['foo'], + # 'foo' is a parent, but it's a literal, not a symbol. We don't check the + # liveness of literals. + support_set_symbols = tuple( + sss for sss in s.support_set if sss.is_symbol()) + if not all(sss in live_in for sss in support_set_symbols): + continue + composite_loop_vars.append(s) + return frozenset(composite_loop_vars) + + def _get_loop_vars(self, node, modified): + # [This is directly from python/autograph/converters/control_flow.py] + body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE) + defined_in = anno.getanno(node, anno.Static.DEFINED_VARS_IN) + live_in = anno.getanno(node, anno.Static.LIVE_VARS_IN) + live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT) + reserved_symbols = body_scope.referenced + + basic_loop_vars = self._get_basic_loop_vars(modified, live_in, live_out) + composite_loop_vars = self._get_composite_loop_vars(modified, live_in) + loop_vars = tuple(basic_loop_vars | composite_loop_vars) + + # Variable that are used or defined inside the loop, but not defined + # before entering the loop. Only simple variables must be defined. The + # composite ones will be implicitly checked at runtime. 
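    # [Editor's aside - not part of the original patch] A hedged worked example
    # of the sets computed in this helper, assuming a loop such as:
    #     i = 0
    #     while i < n:
    #         i = i + 1
    #         tmp = 2 * i      # tmp is not used after the loop
    # Here `modified` is {i, tmp}; only `i` is live into/out of the loop, so
    # basic_loop_vars == {i}. Since `i` is also defined before the loop
    # (`defined_in` contains it), undefined_lives computed just below is empty.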
+ undefined_lives = basic_loop_vars - defined_in + + return loop_vars, reserved_symbols, undefined_lives + + def visit_While(self, node): + + # Create a new WhileOp + # `inputs` are initial values for loop variables + body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE) + loop_vars, _, _ = self._get_loop_vars(node, body_scope.modified) + inputs = [self.symbol_table.lookup(str(name)) + for name in loop_vars] + types = [input_.getType() for input_ in inputs] + while_op = tfp.WhileOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), inputs, types) + + # cache the current builder + cache_builder = self.opbuilder + + # Process cond + self.symbol_table.enter_scope() + for input_, type_ in zip(loop_vars, types): + self.symbol_table.insert_symbol( + str(input_), while_op.getRegion(0).front().addArgument(type_)) + self.opbuilder = tfp.OpBuilder(while_op.getRegion(0)) + tfp.ReturnOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), [self.visit(node.test)]) + self.symbol_table.exit_scope() + + # Process body + self.symbol_table.enter_scope() + for input_, type_ in zip(loop_vars, types): + self.symbol_table.insert_symbol( + str(input_), while_op.getRegion(1).front().addArgument(type_)) + self.opbuilder = tfp.OpBuilder(while_op.getRegion(1)) + self.visit_block(node.body) + tfp.ReturnOp.create(self.opbuilder, self.opbuilder.getUnknownLoc(), [ + self.symbol_table.lookup(str(name)) for name in loop_vars]) + self.symbol_table.exit_scope() + + # Enter new values as symbols + for idx, var in enumerate(loop_vars): + self.symbol_table.insert_symbol(str(var), while_op.getResult(idx)) + + # Restore builder + self.opbuilder = cache_builder + +def mlir_gen_internal(node, entity_info): + ''' + Returns mlir module for unprocessed node `node` + ''' + namer = naming.Namer({}) + graphs = cfg.build(node) + ctx = transformer.Context(entity_info, namer, None) + node = qual_names.resolve(node) + node = activity.resolve(node, ctx) + node = reaching_definitions.resolve(node, ctx, graphs) + node = reaching_fndefs.resolve(node, ctx, graphs) + node = liveness.resolve(node, ctx, graphs) + mlir_generator = MLIRGen(ctx) + mlir_generator.visit(node) + return mlir_generator.prog + +def mlir_gen(func): + ''' + Parse a function and return TFProgram + ''' + node, source = parser.parse_entity(func, future_features=()) + entity_info = transformer.EntityInfo( + name=func.__name__, + source_code=source, + source_file=None, + future_features=(), + namespace=inspect_utils.getnamespace(func)) + return mlir_gen_internal(node, entity_info) + +def mlir_gen_from_source(source=None, src_file=None): + if source is None: + source = open(src_file).read() + node = ast.parse(source) + entity_info = transformer.EntityInfo( + name="mlir_module", + source_code=source, + source_file=None, + future_features=(), + namespace={}) + return mlir_gen_internal(node, entity_info) diff --git a/tensorflow/python/tf_program/mlir_wrapper/BUILD b/tensorflow/python/tf_program/mlir_wrapper/BUILD new file mode 100644 index 00000000000..ec600dcf67e --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/BUILD @@ -0,0 +1,36 @@ +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +package(licenses = ["notice"]) + +tf_python_pybind_extension( + name = "mlir_wrapper", + srcs = ["mlir_wrapper.cc"], + module_name = "mlir_wrapper", + visibility = ["//visibility:public"], + deps = [ + ":mlir_util", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + 
"@llvm-project//mlir:StandardOps", + "@pybind11", + ], +) + +cc_library( + name = "mlir_util", + srcs = [ + "attrs.cc", + "basic_classes.cc", + "builders.cc", + "ops.cc", + "types.cc", + ], + hdrs = ["mlir_util.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@pybind11", + ], +) diff --git a/tensorflow/python/tf_program/mlir_wrapper/attrs.cc b/tensorflow/python/tf_program/mlir_wrapper/attrs.cc new file mode 100644 index 00000000000..16ccc27ef2b --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/attrs.cc @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Types.h" +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_attrs(py::module& m) { + py::class_(m, "Attribute"); + py::class_(m, "IntegerAttr") + .def("get", + py::overload_cast(&mlir::IntegerAttr::get)); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/basic_classes.cc b/tensorflow/python/tf_program/mlir_wrapper/basic_classes.cc new file mode 100644 index 00000000000..dabd012c0e7 --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/basic_classes.cc @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/Support/FileCheck.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Region.h" +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_basic_classes(py::module& m) { + py::class_(m, "MLIRContext").def(py::init<>()); + + py::class_(m, "Location"); + + py::class_(m, "UnknownLoc") + .def("get", &mlir::UnknownLoc::get); + + py::class_(m, "Region") + .def("back", &mlir::Region::back, py::return_value_policy::reference) + .def("front", &mlir::Region::front, py::return_value_policy::reference) + .def("add_block", [](mlir::Region& r) { r.push_back(new mlir::Block); }) + .def("push_back", &mlir::Region::push_back) + .def("size", [](mlir::Region& r) { return r.getBlocks().size(); }) + .def("front", &mlir::Region::front, py::return_value_policy::reference); + py::class_(m, "Block_Iterator"); + py::class_(m, "Block") + .def("new", ([]() { return new mlir::Block; }), + py::return_value_policy::reference) + .def("end", &mlir::Block::end) + .def("addArgument", &mlir::Block::addArgument); + + py::class_(m, "Value").def("getType", &mlir::Value::getType); + py::class_(m, "OpResult"); + py::class_(m, "BlockArgument"); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/builders.cc b/tensorflow/python/tf_program/mlir_wrapper/builders.cc new file mode 100644 index 00000000000..0e83c4d7e5e --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/builders.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/IR/Builders.h" + +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_builders(py::module& m) { + py::class_(m, "Builder") + .def(py::init()) + .def("getFunctionType", + [](mlir::Builder& b, std::vector inputs, + std::vector outputs) { + return b.getFunctionType(llvm::ArrayRef(inputs), + llvm::ArrayRef(outputs)); + }); + py::class_(m, "OpBuilder") + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def("getUnknownLoc", &mlir::OpBuilder::getUnknownLoc) + .def("setInsertionPoint", + py::overload_cast( + &mlir::OpBuilder::setInsertionPoint)) + .def("saveInsertionPoint", &mlir::OpBuilder::saveInsertionPoint) + .def("restoreInsertionPoint", &mlir::OpBuilder::restoreInsertionPoint) + .def( + "createOperation", + [](mlir::OpBuilder& opb, mlir::OperationState& state) { + return opb.createOperation(state); + }, + py::return_value_policy::reference) + .def("getContext", &mlir::OpBuilder::getContext, + py::return_value_policy::reference); + + py::class_(m, "OpBuilder_InsertionPoint") + .def("getBlock", &mlir::OpBuilder::InsertPoint::getBlock); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/mlir_util.h b/tensorflow/python/tf_program/mlir_wrapper/mlir_util.h new file mode 100644 index 00000000000..66350ad978a --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/mlir_util.h @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +void init_basic_classes(py::module& m); +void init_types(py::module& m); +void init_builders(py::module& m); +void init_ops(py::module& m); +void init_attrs(py::module& m); diff --git a/tensorflow/python/tf_program/mlir_wrapper/mlir_wrapper.cc b/tensorflow/python/tf_program/mlir_wrapper/mlir_wrapper.cc new file mode 100644 index 00000000000..55b0425c445 --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/mlir_wrapper.cc @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" + +void init_basic_classes(py::module& m); +void init_types(py::module& m); +void init_builders(py::module& m); +void init_ops(py::module& m); +void init_attrs(py::module& m); + +PYBIND11_MODULE(mlir_wrapper, m) { + m.def("registerDialects", []() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + }); + + init_basic_classes(m); + init_types(m); + init_builders(m); + init_ops(m); + init_attrs(m); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/ops.cc b/tensorflow/python/tf_program/mlir_wrapper/ops.cc new file mode 100644 index 00000000000..0391e31b9c2 --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/ops.cc @@ -0,0 +1,194 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" + +#include "mlir/IR/Function.h" +#include "mlir/IR/Operation.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_ops(py::module& m) { + py::class_>( + m, "Operation") + .def("getRegion", &mlir::Operation::getRegion, + py::return_value_policy::reference) + .def("getResult", &mlir::Operation::getResult) + .def("dump", &mlir::Operation::dump) + .def("getNumResults", &mlir::Operation::getNumResults); + + py::class_(m, "OperationState") + .def(py::init([](mlir::Location loc, std::string name) { + return mlir::OperationState(loc, llvm::StringRef(name)); + })) + .def("addTypes", + [](mlir::OperationState& state, std::vector tys) { + state.addTypes(mlir::ArrayRef(tys)); + }) + .def("addOperands", + [](mlir::OperationState& os, std::vector ops) { + os.addOperands(mlir::ArrayRef(ops)); + }) + .def("addRegion", py::overload_cast<>(&mlir::OperationState::addRegion), + py::return_value_policy::reference); + + py::class_(m, "ModuleOp") + .def("create", + [](mlir::Location loc) { return mlir::ModuleOp::create(loc); }) + .def("push_back", + [](mlir::ModuleOp& m, mlir::FuncOp f) { m.push_back(f); }) + .def("dump", &mlir::ModuleOp::dump) + .def("getAsStr", [](mlir::ModuleOp& m) { + std::string str; + llvm::raw_string_ostream os(str); + m.print(os); + return os.str(); + }); + + py::class_(m, "FuncOp") + .def("create", + [](mlir::Location location, std::string name, + mlir::FunctionType type) { + auto func = mlir::FuncOp::create(location, name, type); + func.addEntryBlock(); + return func; + }) + .def( + "getBody", + [](mlir::FuncOp& f) -> mlir::Region& { return f.getBody(); }, + py::return_value_policy::reference) 
+ .def("getArguments", + [](mlir::FuncOp& f) { return f.getArguments().vec(); }) + .def("getName", [](mlir::FuncOp& f) { return f.getName().str(); }) + .def("getType", &mlir::FuncOp::getType); + + py::class_(m, "ReturnOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + std::vector values) -> mlir::Operation* { + return opb + .create(loc, + mlir::ArrayRef(values)) + .getOperation(); + }); + + // mlir::TF::AddOp + py::class_(m, "Tf_AddV2Op") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + py::class_(m, "Tf_AnyOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value input, + mlir::Value reduction_indices, + bool keep_dims = false) -> mlir::Operation* { + return opb + .create(loc, opb.getI1Type(), input, + reduction_indices, keep_dims) + .getOperation(); + }); + + // mlir::TF::ConstOp + py::class_(m, "Tf_ConstOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + mlir::Attribute value) -> mlir::Operation* { + return opb.create(loc, value).getOperation(); + }); + + // mlir::TF::EqualOp + py::class_(m, "Tf_EqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb + .create(loc, x, y, opb.getBoolAttr(true)) + .getOperation(); + }); + + // mlir::TF::GreaterEqualOp + py::class_(m, "Tf_GreaterEqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y) + .getOperation(); + }); + + // mlir::TF::GreaterOp + py::class_(m, "Tf_GreaterOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::LegacyCallOp + py::class_(m, "Tf_LegacyCallOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + std::vector output, std::vector args, + std::string f) -> mlir::Operation* { + return opb + .create( + loc, mlir::ArrayRef(output), + mlir::ArrayRef(args), mlir::StringRef(f)) + .getOperation(); + }); + + // mlir::TF::LessEqualOp + py::class_(m, "Tf_LessEqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::LessOp + py::class_(m, "Tf_LessOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::NegOp + py::class_(m, "Tf_NegOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + mlir::Value x) -> mlir::Operation* { + return opb.create(loc, x).getOperation(); + }); + + py::class_(m, "Tf_NotEqualOp") + .def("create", [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) { + return opb + .create( + loc, x, y, mlir::BoolAttr::get(true, opb.getContext())) + .getOperation(); + }); + + // mlir::TF::SubOp + py::class_(m, "Tf_SubOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/types.cc b/tensorflow/python/tf_program/mlir_wrapper/types.cc new file mode 100644 index 00000000000..461d10bd160 --- /dev/null +++ 
b/tensorflow/python/tf_program/mlir_wrapper/types.cc @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/StandardTypes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_types(py::module& m) { + // Type + py::class_ Type(m, "Type"); + Type.def("getKind", &mlir::Type::getKind); + + // Type Enums + py::enum_(Type, "StandardTypes_Kind") + .value("BF16", mlir::StandardTypes::BF16); + + // Type Sub-classes + py::class_(m, "FunctionType") + .def("getResults", + [](mlir::FunctionType& ft) { return ft.getResults().vec(); }); + + py::class_(m, "FloatType") + .def("get", &mlir::FloatType::get); + + py::class_(m, "IntegerType") + .def("get", py::overload_cast( + &mlir::IntegerType::get)); + + py::class_(m, "UnrankedTensorType") + .def("get", &mlir::UnrankedTensorType::get); + + py::class_(m, "RankedTensorType") + .def("get", [](std::vector shape, mlir::Type ty) { + return mlir::RankedTensorType::get(mlir::ArrayRef(shape), ty); + }); +} diff --git a/tensorflow/python/tf_program/pywrap_tfd.py b/tensorflow/python/tf_program/pywrap_tfd.py new file mode 100644 index 00000000000..fa9415206cf --- /dev/null +++ b/tensorflow/python/tf_program/pywrap_tfd.py @@ -0,0 +1,149 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +''' +Intermediate between python bindings for MLIR and mlir generation for tensorflow +program. 
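+A rough usage sketch (illustrative only; the argument values are placeholders
+and real programs are normally produced by mlir_gen rather than written by
+hand):
+
+  prog = TFProgram()
+  func_type = prog.get_function_type([], [])
+  func = prog.add_function("main", func_type)
+  # ... create an OpBuilder and emit ops into func.getBody() ...
+  print(str(prog))  # textual MLIR for the module
+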
This passes most of the MLIR classes through as-is, but adds a few new +operations and the basic structure for a TensorFlow program +''' + +# pylint: disable=invalid-name + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.tf_program.mlir_wrapper import mlir_wrapper as mlir + +# Class Definitions +OpBuilder = mlir.OpBuilder +Block = mlir.Block + +# Types +Type = mlir.Type +IntegerType = mlir.IntegerType +FloatType = mlir.FloatType +RankedTensorType = mlir.RankedTensorType +UnrankedTensorType = mlir.UnrankedTensorType +IntegerAttr = mlir.IntegerAttr + +# Standard Ops +ReturnOp = mlir.ReturnOp + +# TF Dialect Ops +Tf_AnyOp = mlir.Tf_AnyOp +Tf_AddV2Op = mlir.Tf_AddV2Op +Tf_ConstOp = mlir.Tf_ConstOp +Tf_EqualOp = mlir.Tf_EqualOp +Tf_GreaterEqualOp = mlir.Tf_GreaterEqualOp +Tf_GreaterOp = mlir.Tf_GreaterOp +Tf_LegacyCallOp = mlir.Tf_LegacyCallOp +Tf_LessEqualOp = mlir.Tf_LessEqualOp +Tf_LessOp = mlir.Tf_LessOp +Tf_NegOp = mlir.Tf_NegOp +Tf_NotEqualOp = mlir.Tf_NotEqualOp +Tf_SubOp = mlir.Tf_SubOp + +class IfOp: + ''' + tfp.if(cond) ({body}, {orelse}) : type + If `cond` is true, `body` is executed, otherwise `orelse` is executed + ''' + @classmethod + def create(cls, opb, loc, cond, outputs): + state = mlir.OperationState(loc, "tfp.If") + state.addOperands([cond]) + state.addTypes(outputs) + state.addRegion().push_back(Block.new()) # body region + state.addRegion().push_back(Block.new()) # orelse region + return opb.createOperation(state) + +class OrOp: + ''' + tfp.Or(ops...) + This is like tf.Any, except that the first dimension is opened into `ops`. + Returns a tensor of 1-bit integers which is "Logical OR" of the corresponding + elements in ops... + ''' + @classmethod + def create(cls, opb, loc, values): + state = mlir.OperationState(loc, "tfp.Or") + state.addTypes([ + UnrankedTensorType.get(IntegerType.get(1, opb.getContext()))]) + state.addOperands(values) + return opb.createOperation(state) + +class AndOp: + ''' + tfp.And(ops...) + This is like tf.All, except that the first dimension is opened to `ops`. + Returns a tensor of 1-bit integers which is "Logical AND" of the corresponding + elements in ops...
+ ''' + @classmethod + def create(cls, opb, loc, values): + state = mlir.OperationState(loc, "tfp.And") + state.addTypes([ + UnrankedTensorType.get(IntegerType.get(1, opb.getContext()))]) + state.addOperands(values) + return opb.createOperation(state) + +class WhileOp: + ''' + tfp.While(init-vals, { + ^bb1(cond-args): + cond-region + return cond + }, { + ^bb1(body-args): + body-region + }) + As long as `cond-region` returns a "true"-like value, the body-region + is executed and the arguments are replaced by its return values for the next + iteration + ''' + @classmethod + def create(cls, opb, loc, inputs, outputs): + state = mlir.OperationState(loc, "tfp.While") + state.addOperands(inputs) + state.addTypes(outputs) + state.addRegion().push_back(Block.new()) # cond region + state.addRegion().push_back(Block.new()) # body region + return opb.createOperation(state) + +class TFProgram: + ''' + Python wrap for a Tensorflow Program (essentially an mlir Module) + ''' + def __init__(self): + mlir.registerDialects() + self.ctx = mlir.MLIRContext() + self.builder = mlir.Builder(self.ctx) + self.module = mlir.ModuleOp.create(mlir.UnknownLoc.get(self.ctx)) + self.curr_func = None + + def add_function(self, name, func_type): + self.curr_func = mlir.FuncOp.create( + mlir.UnknownLoc.get(self.ctx), name, func_type) + self.module.push_back(self.curr_func) + return self.curr_func + + def get_function_type(self, inputs, outputs): + return self.builder.getFunctionType(inputs, outputs) + + def dump(self): + self.module.dump() + + def __str__(self): + return self.module.getAsStr() diff --git a/tensorflow/python/tf_program/tests/BUILD b/tensorflow/python/tf_program/tests/BUILD new file mode 100644 index 00000000000..6b62d5528ec --- /dev/null +++ b/tensorflow/python/tf_program/tests/BUILD @@ -0,0 +1,31 @@ +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +package(licenses = ["notice"]) + +tf_python_pybind_extension( + name = "filecheck_wrapper", + srcs = ["filecheck_wrapper.cc"], + module_name = "filecheck_wrapper", + deps = [ + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:support", + "@pybind11", + ], +) + +py_test( + name = "mlir_gen_test", + size = "small", + testonly = True, + srcs = ["mlir_gen_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + tags = ["no_pip"], + deps = [ + ":filecheck_wrapper", + "//tensorflow/python:client_testlib", + "//tensorflow/python/tf_program:mlir_gen", + "//tensorflow/python/types", + ], +) diff --git a/tensorflow/python/tf_program/tests/filecheck_wrapper.cc b/tensorflow/python/tf_program/tests/filecheck_wrapper.cc new file mode 100644 index 00000000000..4d3d4af4f11 --- /dev/null +++ b/tensorflow/python/tf_program/tests/filecheck_wrapper.cc @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "llvm/Support/FileCheck.h" +#include "llvm/Support/SourceMgr.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" + +PYBIND11_MODULE(filecheck_wrapper, m) { + m.def("check", [](std::string input, std::string check) { + llvm::FileCheckRequest fcr; + llvm::FileCheck fc(fcr); + llvm::SourceMgr SM = llvm::SourceMgr(); + SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input), + llvm::SMLoc()); + SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(check), + llvm::SMLoc()); + llvm::Regex regex = fc.buildCheckPrefixRegex(); + fc.readCheckFile(SM, llvm::StringRef(check), regex); + return fc.checkInput(SM, llvm::StringRef(input)); + }); +} diff --git a/tensorflow/python/tf_program/tests/mlir_gen_test.py b/tensorflow/python/tf_program/tests/mlir_gen_test.py new file mode 100644 index 00000000000..664d561fb6a --- /dev/null +++ b/tensorflow/python/tf_program/tests/mlir_gen_test.py @@ -0,0 +1,228 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +'''Tests for `mlir_gen` module''' + +# pylint: disable=missing-function-docstring +# pylint: disable=invalid-name + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.platform import test +from tensorflow.python.types import core +from tensorflow.python.tf_program.mlir_gen import mlir_gen + +import tensorflow.python.tf_program.tests.filecheck_wrapper as fw + +class MLIRGenTestBase(test.TestCase): + + def _check_code(self, mlir_code, exp_mlir_code): + return self.assertTrue(fw.check(str(mlir_code), exp_mlir_code)) + +class MLIRGenTest(MLIRGenTestBase): + '''MLIR Generation Tests for Tensorflow Program''' + + def test_simple(self): + + def test_fn(): + pass + + mlir_code = mlir_gen(test_fn) + mlir_code_exp = r''' + CHECK-LABEL: @test_fn + ''' + self._check_code(mlir_code, mlir_code_exp) + + def test_argument(self): + + def test_fn(x: core.Tensor) -> core.Tensor: + return x + + mlir_code = mlir_gen(test_fn) + mlir_code_exp = r''' + CHECK-LABEL: @test_fn(%arg0: tensor<*xi32>) -> tensor<*xi32> { + CHECK-NEXT: return %arg0 : tensor<*xi32> + ''' + self._check_code(mlir_code, mlir_code_exp) + + def test_constant(self): + def test_fn()->int: + return 23 + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn() -> i32 + CHECK: %[[r0:[0-9]+]] = "tf.Const"() {value = dense<23> : tensor} : () -> tensor + CHECK: return %[[r0]] : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_BoolOp(self): + def test_fn(x: bool, y: bool)->bool: + return x or y or x and x and y + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn(%arg0: i1, %arg1: i1) -> i1 + CHECK: 
%[[r0:[0-9]+]] = "tfp.And"(%arg0, %arg0, %arg1) : (i1, i1, i1) -> tensor<*xi1> + CHECK: %[[r1:[0-9]+]] = "tfp.Or"(%arg0, %arg1, %[[r0]]) : (i1, i1, tensor<*xi1>) -> tensor<*xi1> + return %[[r1]] : tensor<*xi1> + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_Call(self): + def test_fn(): + def f1(): + return 23 + def f2(): + return f1() + f2() + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn() + CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @f2} : () -> () + CHECK: } + CHECK-LABEL: func @f1() { + CHECK: %[[r0:[0-9]+]] = "tf.Const"() {value = dense<23> : tensor} : () -> tensor + CHECK: return %[[r0]] : tensor + CHECK: } + CHECK-LABEL: func @f2() { + CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @f1} : () -> () + } + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_Compare(self): + def test_fn(x: core.Tensor, y: core.Tensor, z: core.Tensor): + return x > y < z + + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor<*xi32>) + CHECK: %[[r0:[0-9]+]] = "tf.Greater"(%arg0, %arg1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> + CHECK: %[[r1:[0-9]+]] = "tf.Less"(%[[r0]], %arg2) : (tensor<*xi1>, tensor<*xi32>) -> tensor<*xi1> + CHECK: return %[[r1]] : tensor<*xi1> + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_Assign_BinOp(self): + def test_fn()->int: + y = 12 + 23 - 24 + return y + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn() -> i32 + CHECK: %[[r0:[0-9]+]] = "tf.AddV2"(%{{[0-9]+}}, %{{[0-9]+}}) : (tensor, tensor) -> tensor + CHECK: %[[r1:[0-9]+]] = "tf.Sub"(%{{[0-9]+}}, %{{[0-9]+}}) : (tensor, tensor) -> tensor + CHECK: return %[[r1]] : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_if(self): + def test_fn(x: core.Tensor)->int: + res = 0 + if x > 0: + res = 1 + elif x < 0: + res = -1 + else: + res = 0 + return res + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn(%arg0: tensor<*xi32>) -> i32 + + CHECK: %[[r1:[0-9]+]] = "tf.Greater"(%arg0, %{{[0-9]+}}) : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK-NEXT: %[[r2:[0-9]+]] = "tfp.If"(%[[r1]]) ( { + CHECK: return %{{[0-9]+}} : tensor + CHECK-NEXT: }, { + CHECK: %[[r3:[0-9]+]] = "tf.Less"(%arg0, %{{[0-9]+}}) : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK: %[[r4:[0-9]+]] = "tfp.If"(%[[r3]]) ( { + CHECK: %[[r5:[0-9]+]] = "tf.Neg"(%{{[0-9]+}}) : (tensor) -> tensor + CHECK: return %[[r5]] : tensor + CHECK-NEXT: }, { + CHECK: return %{{[0-9]+}} : tensor + CHECK-NEXT: }) : (tensor<*xi1>) -> tensor + CHECK: return %[[r4]] : tensor + CHECK-NEXT: }) : (tensor<*xi1>) -> tensor + CHECK-NEXT: return %[[r2]] : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_while(self): + def test_fn(x: core.Tensor)->core.Tensor: + s = 0 + while x > 0: + s = s + x + return s + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn(%arg0: tensor<*xi32>) -> tensor<*xi32> + + CHECK: %[[r1:[0-9]+]] = "tfp.While"(%0) ( { + CHECK-NEXT: ^{{[^ ]+}}(%arg1: tensor): + CHECK: %[[r2:[0-9]+]] = "tf.Greater"(%arg0, %{{[0-9]+}}) : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK-NEXT: return %[[r2]] : tensor<*xi1> + CHECK-NEXT: }, { + CHECK-NEXT: ^{{[^ ]+}}(%arg1: tensor): + CHECK: %[[r3:[0-9]+]] = "tf.AddV2"(%arg1, %arg0) : (tensor, tensor<*xi32>) -> tensor<*xi32> + CHECK-NEXT: return %[[r3]] : tensor<*xi32> + CHECK-NEXT: }) : (tensor) 
-> tensor + CHECK-NEXT: return %[[r1]] : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_fibonacci(self): + def test_fn(x: core.Tensor)->core.Tensor: + res, idx = 0, 2 + a, b = 0, 1 + if x == 0 or x == 1: + res = x + else: + while idx <= x: + res = a + b + a = b + b = res + idx = idx + 1 + return res + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: @test_fn(%arg0: tensor<*xi32>) -> tensor<*xi32> + CHECK: %[[r5:[0-9]+]] = "tf.Equal"(%arg0, %{{[0-9]+}}) {incompatible_shape_error = true} : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK: %[[r7:[0-9]+]] = "tf.Equal"(%arg0, %{{[0-9]+}}) {incompatible_shape_error = true} : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK: %[[r8:[0-9]+]] = "tfp.Or"(%[[r5]], %[[r7]]) : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> + + CHECK: %[[r9:[0-9]+]]:4 = "tfp.If"(%[[r8]]) ( { + CHECK-NEXT: return %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32> + CHECK-NEXT: }, { + CHECK-NEXT: %[[r10:[0-9]+]]:4 = "tfp.While"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) ( { + CHECK-NEXT: ^{{[^ ]*}}(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): + CHECK-NEXT: %[[r11:[0-9]+]] = "tf.LessEqual"(%arg{{[0-9]+}}, %arg{{[0-9]+}}) : (tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>) -> tensor<*xi1> + CHECK-NEXT: return %[[r11]] : tensor<*xi1> + CHECK-NEXT: }, { + CHECK-NEXT: ^{{[^ ]*}}(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): + CHECK-NEXT: %[[r12:[0-9]+]] = "tf.AddV2"(%arg{{[0-9]+}}, %arg{{[0-9]+}}) : (tensor, tensor) -> tensor + CHECK: %[[r13:[0-9]+]] = "tf.AddV2"(%arg{{[0-9]+}}, %{{[0-9]+}}) : (tensor, tensor) -> tensor + CHECK-NEXT: return %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : tensor, tensor, tensor, tensor + CHECK-NEXT: }) : (tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor) + CHECK-NEXT: return %[[r10]]#{{[0-9]+}}, %[[r10]]#{{[0-9]+}}, %[[r10]]#{{[0-9]+}}, %[[r10]]#{{[0-9]+}} : tensor, tensor, tensor, tensor + CHECK-NEXT: }) : (tensor<*xi1>) -> (tensor, tensor, tensor, tensor) + CHECK-NEXT: return %[[r9]]#{{[0-9]+}} : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + +if __name__ == '__main__': + test.main() From 8efd27dceb967e0ad49f5a7dfea1c4c49af4b84c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 16:49:25 -0700 Subject: [PATCH 0234/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310653370 Change-Id: I486c302aa28530ed7147217e9912aace835b6d43 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From ebd34b3dc11919155974fdef3ead2df87ad30844 Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Fri, 8 May 2020 16:58:02 -0700 Subject: [PATCH 0235/1533] Use subTest to improve error reporting on linear algebra ops. e.g. 
cholesky_op_test.py, linalg_ops_test.py, lu_ops_test.py PiperOrigin-RevId: 310654829 Change-Id: I82ba700cf02fb8122a43eabda530c6f60574e372 --- .../python/kernel_tests/cholesky_op_test.py | 18 ++-- .../python/kernel_tests/linalg_ops_test.py | 82 ++++++++++--------- tensorflow/python/kernel_tests/lu_op_test.py | 63 +++++++------- 3 files changed, 89 insertions(+), 74 deletions(-) diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index e17a029c5ff..7d5f7715eb1 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -114,12 +114,14 @@ class CholeskyOpTest(test.TestCase): def testBasic(self): data = np.array([[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]) for dtype in (np.float32, np.float64): - self._verifyCholesky(data.astype(dtype)) + with self.subTest(dtype=dtype): + self._verifyCholesky(data.astype(dtype)) for dtype in (np.complex64, np.complex128): - complex_data = np.tril(1j * data, -1).astype(dtype) - complex_data += np.triu(-1j * data, 1).astype(dtype) - complex_data += data - self._verifyCholesky(complex_data) + with self.subTest(dtype=dtype): + complex_data = np.tril(1j * data, -1).astype(dtype) + complex_data += np.triu(-1j * data, 1).astype(dtype) + complex_data += data + self._verifyCholesky(complex_data) def testBatch(self): simple_array = np.array([[[1., 0.], [0., 5.]]]) # shape (1, 2, 2) @@ -131,13 +133,15 @@ class CholeskyOpTest(test.TestCase): # Generate random positive-definite matrices. matrices = np.random.rand(10, 5, 5) for i in xrange(10): - matrices[i] = np.dot(matrices[i].T, matrices[i]) + with self.subTest(i=i): + matrices[i] = np.dot(matrices[i].T, matrices[i]) self._verifyCholesky(matrices) # Generate random complex valued positive-definite matrices. 
matrices = np.random.rand(10, 5, 5) + 1j * np.random.rand(10, 5, 5) for i in xrange(10): - matrices[i] = np.dot(matrices[i].T.conj(), matrices[i]) + with self.subTest(i=i): + matrices[i] = np.dot(matrices[i].T.conj(), matrices[i]) self._verifyCholesky(matrices) @test_util.run_deprecated_v1 diff --git a/tensorflow/python/kernel_tests/linalg_ops_test.py b/tensorflow/python/kernel_tests/linalg_ops_test.py index 20cd128783e..916d9a4b8c8 100644 --- a/tensorflow/python/kernel_tests/linalg_ops_test.py +++ b/tensorflow/python/kernel_tests/linalg_ops_test.py @@ -66,10 +66,11 @@ class CholeskySolveTest(test.TestCase): _RandomPDMatrix(n, self.rng)]).astype(np_type) chol = linalg_ops.cholesky(array) for k in range(1, 3): - rhs = self.rng.randn(2, n, k).astype(np_type) - x = linalg_ops.cholesky_solve(chol, rhs) - self.assertAllClose( - rhs, math_ops.matmul(array, x).eval(), atol=atol) + with self.subTest(n=n, np_type=np_type, atol=atol, k=k): + rhs = self.rng.randn(2, n, k).astype(np_type) + x = linalg_ops.cholesky_solve(chol, rhs) + self.assertAllClose( + rhs, math_ops.matmul(array, x).eval(), atol=atol) class LogdetTest(test.TestCase): @@ -82,24 +83,26 @@ class LogdetTest(test.TestCase): for n in range(1, 6): for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5), (np.complex64, 0.05), (np.complex128, 1e-5)]: - matrix = _RandomPDMatrix(n, self.rng, np_dtype) - _, logdet_np = np.linalg.slogdet(matrix) - with self.session(use_gpu=True): - # Create 2 x n x n matrix - # matrix = np.array( - # [_RandomPDMatrix(n, self.rng, np_dtype), - # _RandomPDMatrix(n, self.rng, np_dtype)]).astype(np_dtype) - logdet_tf = linalg.logdet(matrix) - self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol) + with self.subTest(n=n, np_dtype=np_dtype, atol=atol): + matrix = _RandomPDMatrix(n, self.rng, np_dtype) + _, logdet_np = np.linalg.slogdet(matrix) + with self.session(use_gpu=True): + # Create 2 x n x n matrix + # matrix = np.array( + # [_RandomPDMatrix(n, self.rng, np_dtype), + # _RandomPDMatrix(n, self.rng, np_dtype)]).astype(np_dtype) + logdet_tf = linalg.logdet(matrix) + self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol) def test_works_with_underflow_case(self): for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5), (np.complex64, 0.05), (np.complex128, 1e-5)]: - matrix = (np.eye(20) * 1e-6).astype(np_dtype) - _, logdet_np = np.linalg.slogdet(matrix) - with self.session(use_gpu=True): - logdet_tf = linalg.logdet(matrix) - self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol) + with self.subTest(np_dtype=np_dtype, atol=atol): + matrix = (np.eye(20) * 1e-6).astype(np_dtype) + _, logdet_np = np.linalg.slogdet(matrix) + with self.session(use_gpu=True): + logdet_tf = linalg.logdet(matrix) + self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol) class SlogdetTest(test.TestCase): @@ -112,7 +115,20 @@ class SlogdetTest(test.TestCase): for n in range(1, 6): for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5), (np.complex64, 0.05), (np.complex128, 1e-5)]: - matrix = _RandomPDMatrix(n, self.rng, np_dtype) + with self.subTest(n=n, np_dtype=np_dtype, atol=atol): + matrix = _RandomPDMatrix(n, self.rng, np_dtype) + sign_np, log_abs_det_np = np.linalg.slogdet(matrix) + with self.session(use_gpu=True): + sign_tf, log_abs_det_tf = linalg.slogdet(matrix) + self.assertAllClose( + log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol) + self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol) + + def test_works_with_underflow_case(self): + for 
np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5), + (np.complex64, 0.05), (np.complex128, 1e-5)]: + with self.subTest(np_dtype=np_dtype, atol=atol): + matrix = (np.eye(20) * 1e-6).astype(np_dtype) sign_np, log_abs_det_np = np.linalg.slogdet(matrix) with self.session(use_gpu=True): sign_tf, log_abs_det_tf = linalg.slogdet(matrix) @@ -120,30 +136,20 @@ class SlogdetTest(test.TestCase): log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol) self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol) - def test_works_with_underflow_case(self): - for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5), - (np.complex64, 0.05), (np.complex128, 1e-5)]: - matrix = (np.eye(20) * 1e-6).astype(np_dtype) - sign_np, log_abs_det_np = np.linalg.slogdet(matrix) - with self.session(use_gpu=True): - sign_tf, log_abs_det_tf = linalg.slogdet(matrix) - self.assertAllClose( - log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol) - self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol) - class AdjointTest(test.TestCase): def test_compare_to_numpy(self): for dtype in np.float64, np.float64, np.complex64, np.complex128: - matrix_np = np.array([[1 + 1j, 2 + 2j, 3 + 3j], [4 + 4j, 5 + 5j, - 6 + 6j]]).astype(dtype) - expected_transposed = np.conj(matrix_np.T) - with self.session(): - matrix = ops.convert_to_tensor(matrix_np) - transposed = linalg.adjoint(matrix) - self.assertEqual((3, 2), transposed.get_shape()) - self.assertAllEqual(expected_transposed, self.evaluate(transposed)) + with self.subTest(dtype=dtype): + matrix_np = np.array([[1 + 1j, 2 + 2j, 3 + 3j], [4 + 4j, 5 + 5j, + 6 + 6j]]).astype(dtype) + expected_transposed = np.conj(matrix_np.T) + with self.session(): + matrix = ops.convert_to_tensor(matrix_np) + transposed = linalg.adjoint(matrix) + self.assertEqual((3, 2), transposed.get_shape()) + self.assertAllEqual(expected_transposed, self.evaluate(transposed)) class EyeTest(parameterized.TestCase, test.TestCase): diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py index 1c0280c3ce6..7935b66f4af 100644 --- a/tensorflow/python/kernel_tests/lu_op_test.py +++ b/tensorflow/python/kernel_tests/lu_op_test.py @@ -128,14 +128,16 @@ class LuOpTest(test.TestCase): for dtype in (np.float32, np.float64): for output_idx_type in (dtypes.int32, dtypes.int64): - self._verifyLu(data.astype(dtype), output_idx_type=output_idx_type) + with self.subTest(dtype=dtype, output_idx_type=output_idx_type): + self._verifyLu(data.astype(dtype), output_idx_type=output_idx_type) for dtype in (np.complex64, np.complex128): for output_idx_type in (dtypes.int32, dtypes.int64): - complex_data = np.tril(1j * data, -1).astype(dtype) - complex_data += np.triu(-1j * data, 1).astype(dtype) - complex_data += data - self._verifyLu(complex_data, output_idx_type=output_idx_type) + with self.subTest(dtype=dtype, output_idx_type=output_idx_type): + complex_data = np.tril(1j * data, -1).astype(dtype) + complex_data += np.triu(-1j * data, 1).astype(dtype) + complex_data += data + self._verifyLu(complex_data, output_idx_type=output_idx_type) def testPivoting(self): # This matrix triggers partial pivoting because the first diagonal entry @@ -144,38 +146,41 @@ class LuOpTest(test.TestCase): self._verifyLu(data.astype(np.float32)) for dtype in (np.float32, np.float64): - self._verifyLu(data.astype(dtype)) - _, p = linalg_ops.lu(data) - p_val = self.evaluate([p]) - # Make sure p_val is not the identity permutation. 
- self.assertNotAllClose(np.arange(3), p_val) + with self.subTest(dtype=dtype): + self._verifyLu(data.astype(dtype)) + _, p = linalg_ops.lu(data) + p_val = self.evaluate([p]) + # Make sure p_val is not the identity permutation. + self.assertNotAllClose(np.arange(3), p_val) for dtype in (np.complex64, np.complex128): - complex_data = np.tril(1j * data, -1).astype(dtype) - complex_data += np.triu(-1j * data, 1).astype(dtype) - complex_data += data - self._verifyLu(complex_data) - _, p = linalg_ops.lu(data) - p_val = self.evaluate([p]) - # Make sure p_val is not the identity permutation. - self.assertNotAllClose(np.arange(3), p_val) + with self.subTest(dtype=dtype): + complex_data = np.tril(1j * data, -1).astype(dtype) + complex_data += np.triu(-1j * data, 1).astype(dtype) + complex_data += data + self._verifyLu(complex_data) + _, p = linalg_ops.lu(data) + p_val = self.evaluate([p]) + # Make sure p_val is not the identity permutation. + self.assertNotAllClose(np.arange(3), p_val) def testInvalidMatrix(self): # LU factorization gives an error when the input is singular. # Note: A singular matrix may return without error but it won't be a valid # factorization. for dtype in self.float_types: - with self.assertRaises(errors.InvalidArgumentError): - self.evaluate( - linalg_ops.lu( - np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]], - dtype=dtype))) - with self.assertRaises(errors.InvalidArgumentError): - self.evaluate( - linalg_ops.lu( - np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]], - [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]], - dtype=dtype))) + with self.subTest(dtype=dtype): + with self.assertRaises(errors.InvalidArgumentError): + self.evaluate( + linalg_ops.lu( + np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]], + dtype=dtype))) + with self.assertRaises(errors.InvalidArgumentError): + self.evaluate( + linalg_ops.lu( + np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]], + [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]], + dtype=dtype))) def testBatch(self): simple_array = np.array([[[1., -1.], [2., 5.]]]) # shape (1, 2, 2) From 4204c5f8565c9b5f7027c8d1b7ba7897754a297e Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Fri, 8 May 2020 17:09:34 -0700 Subject: [PATCH 0236/1533] Remove `_enable_legacy_iterators` flag in TPUStrategy. PiperOrigin-RevId: 310656609 Change-Id: I0d4eaa8139689df1b6f644c87b1cd4028831dd52 --- tensorflow/python/distribute/tpu_strategy.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 82a4a803510..6e51b84a1d1 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -140,10 +140,6 @@ class TPUStrategy(distribute_lib.Strategy): distribute_lib.distribution_strategy_replica_gauge.get_cell( "num_replicas_per_worker").set(self.extended.num_replicas_per_host) - # TODO(b/155193424): Enable OwnedMultiDeviceIterator on TPU Pod. - if self.extended.num_hosts > 1: - self._enable_legacy_iterators = True - # TODO(cjfj): Modify `_call_for_each_replica` in `TPUExtended` such that this # can use the default implementation. # This implementation runs a single step. It does not use infeed or outfeed. From 837b493f3cdcad50d126865a0a3a528c24cc6fca Mon Sep 17 00:00:00 2001 From: Kibeom Kim Date: Fri, 8 May 2020 17:13:23 -0700 Subject: [PATCH 0237/1533] Implement Numpy to tensor conversion for TFRT. 
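
The new AbstractContextInterface::CreateTensor overload takes a raw host
buffer plus a releaser callback, so the Python layer can hand a NumPy-owned
buffer to the runtime and have it released later. A rough illustration of the
intended call pattern (the helper name, the 2x3 float shape, and the empty
releaser below are invented for this sketch, not part of the change):

  #include <cstddef>
  #include <cstdint>
  #include "tensorflow/c/eager/context_interface.h"

  // Wrap a caller-owned buffer in a tensor; `data` must stay alive until the
  // releaser runs.
  tensorflow::AbstractTensorInterface* WrapBuffer(
      tensorflow::AbstractContextInterface* ctx, void* data, size_t len) {
    static const int64_t dims[] = {2, 3};
    auto releaser = [](void* data, size_t len, void* arg) {
      // Free or unref `data` here.
    };
    return ctx->CreateTensor(tensorflow::DT_FLOAT, dims, /*num_dims=*/2, data,
                             len, /*convert_string=*/false, releaser,
                             /*memory_releaser_arg=*/nullptr);
  }
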
PiperOrigin-RevId: 310657168 Change-Id: I3133a28194f41586f377d688dc64bff52f120d33 --- tensorflow/c/eager/context_interface.h | 14 +++++ .../core/common_runtime/eager/context.cc | 23 ++++++++ .../core/common_runtime/eager/context.h | 5 ++ tensorflow/python/BUILD | 2 + tensorflow/python/client/tf_session_helper.cc | 7 ++- tensorflow/python/lib/core/ndarray_tensor.cc | 58 ++++++++++++++----- tensorflow/python/lib/core/ndarray_tensor.h | 20 ++++--- tensorflow/python/lib/core/py_seq_tensor.cc | 12 ++-- 8 files changed, 112 insertions(+), 29 deletions(-) diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/context_interface.h index 9377bf0be12..d21ab45e579 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/context_interface.h @@ -59,6 +59,20 @@ class AbstractContextInterface { virtual AbstractTensorInterface* CreateTensor( DataType dtype, absl::Span dim_sizes) = 0; + typedef void (*MemoryReleaser)(void* data, size_t len, void* arg); + + // Create a tensor instance from the given data buffer and description. + // `memory_releaser` will be called on destruction, and it's responsible for + // cleaning up the underlying buffer. `convert_string` indicates whether it + // has to handle tstring conversion. Expected to be removed once tstring + // migration is done. + virtual AbstractTensorInterface* CreateTensor(DataType dtype, + const int64_t* dims, + int num_dims, void* data, + size_t len, bool convert_string, + MemoryReleaser memory_releaser, + void* memory_releaser_arg) = 0; + // Create a handle to wrap and manage a Tensor virtual AbstractTensorHandleInterface* CreateLocalHandle( AbstractTensorInterface* t) = 0; diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index 35780077aa8..b8dfe92aac6 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/core/platform/platform.h" // clang-format on +#include "tensorflow/c/tf_tensor.h" #include "tensorflow/c/tf_tensor_internal.h" #include "tensorflow/c/eager/operation_interface.h" #include "tensorflow/c/eager/tensor_handle_interface.h" @@ -168,6 +169,28 @@ AbstractTensorInterface* EagerContext::CreateTensor( return new TensorInterface(Tensor(dtype, TensorShape(dim_sizes))); } +AbstractTensorInterface* EagerContext::CreateTensor( + DataType dtype, const int64_t* dims, int num_dims, void* data, size_t len, + bool convert_string, MemoryReleaser memory_releaser, + void* memory_releaser_arg) { + TF_Tensor* tensor_wrapper = + TF_NewTensor(static_cast(dtype), dims, num_dims, data, len, + memory_releaser, memory_releaser_arg); + + if (convert_string) { + tensorflow::Tensor tensor; + Status status = TF_TensorToTensor(tensor_wrapper, &tensor); + TF_DeleteTensor(tensor_wrapper); + if (!status.ok()) return nullptr; + return new TensorInterface(std::move(tensor)); + } else { + AbstractTensorInterface* result = nullptr; + std::swap(result, tensor_wrapper->tensor); + TF_DeleteTensor(tensor_wrapper); + return result; + } +} + std::unique_ptr EagerContext::LoadSavedModelAPI( const std::string& directory, const absl::optional>& tags, diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index c5404773ba6..683425919d1 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -173,6 +173,11 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { AbstractTensorInterface* CreateTensor( DataType dtype, absl::Span dim_sizes) override; + AbstractTensorInterface* CreateTensor(DataType dtype, const int64_t* dims, + int num_dims, void* data, size_t len, + bool convert_string, + MemoryReleaser memory_releaser, + void* memory_releaser_arg) override; AbstractTensorHandleInterface* CreateLocalHandle( AbstractTensorInterface* t) override; diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 4729ce9d743..0b046ea8d61 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -996,6 +996,8 @@ cc_library( "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/c:tf_status_helper", + "//tensorflow/c:tf_tensor_internal", + "//tensorflow/c/eager:tfe_context_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", ], diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc index 78a1613c86c..cb960fd599a 100644 --- a/tensorflow/python/client/tf_session_helper.cc +++ b/tensorflow/python/client/tf_session_helper.cc @@ -89,7 +89,8 @@ void TF_Run_wrapper_helper(TF_DeprecatedSession* session, const char* handle, input_names.push_back(key_string); inputs_safe.emplace_back(make_safe(static_cast(nullptr))); - s = PyArrayToTF_Tensor(value, &inputs_safe.back()); + s = NdarrayToTensor(nullptr /*ctx*/, value, &inputs_safe.back(), + true /*convert_to_string*/); if (!s.ok()) { Set_TF_Status_from_Status(out_status, s); return; @@ -367,7 +368,7 @@ void TF_SessionRun_wrapper_helper(TF_Session* session, const char* handle, // cleaned up properly. // // Memory management: - // PyArrayToTF_Tensor() creates a new ndarray PyObject from the input + // NdarrayToTensor() creates a new ndarray PyObject from the input // ndarray. We manage the new ndarray's lifetime in order to keep the // underlying data buffer alive (the new ndarray also guarantees a contiguous // data buffer). 
The new ndarray's data buffer is used to create the @@ -382,7 +383,7 @@ void TF_SessionRun_wrapper_helper(TF_Session* session, const char* handle, std::vector input_vals_safe; for (PyObject* ndarray : input_ndarrays) { input_vals_safe.emplace_back(make_safe(static_cast(nullptr))); - s = PyArrayToTF_Tensor(ndarray, &input_vals_safe.back()); + s = NdarrayToTensor(nullptr, ndarray, &input_vals_safe.back(), true); if (!s.ok()) { Set_TF_Status_from_Status(out_status, s); return; diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc index 2f9972c81bf..2afd2888e8f 100644 --- a/tensorflow/python/lib/core/ndarray_tensor.cc +++ b/tensorflow/python/lib/core/ndarray_tensor.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include "tensorflow/c/eager/tfe_context_internal.h" +#include "tensorflow/c/tf_tensor_internal.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" @@ -488,8 +490,9 @@ Status TF_TensorToPyArray(Safe_TF_TensorPtr tensor, PyObject** out_ndarray) { return Status::OK(); } -Status PyArrayToTF_Tensor(PyObject* ndarray, Safe_TF_TensorPtr* out_tensor) { - DCHECK(out_tensor != nullptr); +Status NdarrayToTensor(TFE_Context* ctx, PyObject* ndarray, + Safe_TF_TensorPtr* ret, bool convert_string) { + DCHECK(ret != nullptr); // Make sure we dereference this array object in case of error, etc. Safe_PyObjectPtr array_safe(make_safe( @@ -515,26 +518,52 @@ Status PyArrayToTF_Tensor(PyObject* ndarray, Safe_TF_TensorPtr* out_tensor) { if (dtype == TF_RESOURCE) { size_t size = PyArray_NBYTES(array); array_safe.release(); - *out_tensor = make_safe(TF_NewTensor(dtype, {}, 0, PyArray_DATA(array), - size, &DelayedNumpyDecref, array)); + + if (ctx) { + *ret = make_safe(new TF_Tensor{tensorflow::unwrap(ctx)->CreateTensor( + static_cast(dtype), {}, 0, PyArray_DATA(array), + size, convert_string, &DelayedNumpyDecref, array)}); + } else { + *ret = make_safe(TF_NewTensor(dtype, {}, 0, PyArray_DATA(array), size, + &DelayedNumpyDecref, array)); + } } else if (dtype != TF_STRING) { size_t size = PyArray_NBYTES(array); array_safe.release(); - *out_tensor = make_safe(TF_NewTensor(dtype, dims.data(), dims.size(), - PyArray_DATA(array), size, - &DelayedNumpyDecref, array)); + if (ctx) { + *ret = make_safe(new TF_Tensor{tensorflow::unwrap(ctx)->CreateTensor( + static_cast(dtype), dims.data(), dims.size(), + PyArray_DATA(array), size, convert_string, &DelayedNumpyDecref, + array)}); + } else { + *ret = make_safe(TF_NewTensor(dtype, dims.data(), dims.size(), + PyArray_DATA(array), size, + &DelayedNumpyDecref, array)); + } + } else { size_t size = 0; void* encoded = nullptr; TF_RETURN_IF_ERROR(EncodePyBytesArray(array, nelems, &size, &encoded)); - *out_tensor = make_safe(TF_NewTensor( - dtype, dims.data(), dims.size(), encoded, size, - [](void* data, size_t len, void* arg) { - delete[] reinterpret_cast(data); - }, - nullptr)); + if (ctx) { + *ret = make_safe(new TF_Tensor{tensorflow::unwrap(ctx)->CreateTensor( + static_cast(dtype), dims.data(), dims.size(), + encoded, size, convert_string, + [](void* data, size_t len, void* arg) { + delete[] reinterpret_cast(data); + }, + nullptr)}); + } else { + *ret = make_safe(TF_NewTensor( + dtype, dims.data(), dims.size(), encoded, size, + [](void* data, size_t len, void* arg) { + delete[] reinterpret_cast(data); + }, + nullptr)); + } } + return Status::OK(); } @@ -543,7 +572,8 @@ TF_Tensor* TF_TensorFromTensor(const 
tensorflow::Tensor& src, Status* status); Status NdarrayToTensor(PyObject* obj, Tensor* ret) { Safe_TF_TensorPtr tf_tensor = make_safe(static_cast(nullptr)); - Status s = PyArrayToTF_Tensor(obj, &tf_tensor); + Status s = NdarrayToTensor(nullptr /*ctx*/, obj, &tf_tensor, + false /*convert_string*/); if (!s.ok()) { return s; } diff --git a/tensorflow/python/lib/core/ndarray_tensor.h b/tensorflow/python/lib/core/ndarray_tensor.h index c5cd24cff2d..38c098417d5 100644 --- a/tensorflow/python/lib/core/ndarray_tensor.h +++ b/tensorflow/python/lib/core/ndarray_tensor.h @@ -28,15 +28,21 @@ Status TF_TensorToMaybeAliasedPyArray(Safe_TF_TensorPtr tensor, Status TF_TensorToPyArray(Safe_TF_TensorPtr tensor, PyObject** out_ndarray); -// Converts the given numpy ndarray to a (safe) TF_Tensor. The returned -// TF_Tensor in `out_tensor` may have its own Python reference to `ndarray`s -// data. After `out_tensor` is destroyed, this reference must (eventually) be -// decremented via ClearDecrefCache(). -// -// `out_tensor` must be non-null. Caller retains ownership of `ndarray`. -Status PyArrayToTF_Tensor(PyObject* ndarray, Safe_TF_TensorPtr* out_tensor); +// Creates a tensor in 'ret' from the input `ndarray`. The returned TF_Tensor +// in `ret` may have its own Python reference to `ndarray`s data. After `ret` +// is destroyed, this reference must (eventually) be decremented via +// ClearDecrefCache(). +// `convert_string` indicates whether it has to handle tstring conversion. +// Expected to be removed once tstring migration is done. +ABSL_MUST_USE_RESULT +Status NdarrayToTensor(TFE_Context* ctx, PyObject* ndarray, + Safe_TF_TensorPtr* ret, bool convert_string); // Creates a tensor in 'ret' from the input Ndarray. +// TODO(kkb): This is an old conversion function that does not support TFRT. +// Currently it's used for session, py_func, and an internal project. Migrate +// them. +ABSL_MUST_USE_RESULT Status NdarrayToTensor(PyObject* obj, Tensor* ret); // Creates a numpy array in 'ret' which either aliases the content of 't' or has diff --git a/tensorflow/python/lib/core/py_seq_tensor.cc b/tensorflow/python/lib/core/py_seq_tensor.cc index ecf4a92f0e7..22829f546b1 100644 --- a/tensorflow/python/lib/core/py_seq_tensor.cc +++ b/tensorflow/python/lib/core/py_seq_tensor.cc @@ -681,9 +681,11 @@ typedef Converter BoolConverter; // The two may share underlying storage so changes to one may reflect in the // other. 
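// A minimal call pattern for the NdarrayToTensor overload added in this
// change (sketch only; `ctx` and `obj` are assumed valid, and the caller is
// still responsible for the ClearDecrefCache bookkeeping described in
// ndarray_tensor.h):
//   Safe_TF_TensorPtr t = make_safe(static_cast<TF_Tensor*>(nullptr));
//   Status s = NdarrayToTensor(ctx, obj, &t, /*convert_string=*/true);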
TFE_TensorHandle* NumpyToTFE_TensorHandle(TFE_Context* ctx, PyObject* obj) { - tensorflow::Tensor tensor; - tensorflow::Status status = tensorflow::NdarrayToTensor(obj, &tensor); - if (!status.ok()) { + Safe_TF_TensorPtr tf_tensor = make_safe(static_cast(nullptr)); + Status status = tensorflow::NdarrayToTensor(ctx, obj, &tf_tensor, + true /*convert_string*/); + + if (TF_PREDICT_FALSE(!status.ok())) { PyErr_SetString(PyExc_ValueError, tensorflow::strings::StrCat( "Failed to convert a NumPy array to a Tensor (", @@ -692,8 +694,8 @@ TFE_TensorHandle* NumpyToTFE_TensorHandle(TFE_Context* ctx, PyObject* obj) { return nullptr; } - TensorInterface t(std::move(tensor)); - return tensorflow::wrap(tensorflow::unwrap(ctx)->CreateLocalHandle(&t)); + return tensorflow::wrap( + tensorflow::unwrap(ctx)->CreateLocalHandle(tf_tensor->tensor)); } } // namespace From 2b4f4efe00177e9bad7d7ced61474c7a0cc87e5d Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 17:29:23 -0700 Subject: [PATCH 0238/1533] Remove timing code from profiler PiperOrigin-RevId: 310659150 Change-Id: I6e84b2f9195dda37d4d6677097aadc70a46c7616 --- tensorflow/core/profiler/utils/event_span.cc | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tensorflow/core/profiler/utils/event_span.cc b/tensorflow/core/profiler/utils/event_span.cc index 1c64f7bf6bb..9768331b88f 100644 --- a/tensorflow/core/profiler/utils/event_span.cc +++ b/tensorflow/core/profiler/utils/event_span.cc @@ -14,9 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/utils/event_span.h" -#include // NOLINT -#include -#include // NOLINT #include #include "absl/container/flat_hash_map.h" @@ -269,10 +266,7 @@ void CombineStepEvents(const StepEvents& src, StepEvents* dst) { // Converts from overlapped step-events to non-overlapped step-events. StepEvents ToNonOverlappedStepEvents(const StepEvents& overlapped_step_events) { - auto start_time = std::chrono::steady_clock::now(); StepEvents non_overlapped_step_events; - - // We could parallelize the following loop if necessary. for (const auto& step_events : overlapped_step_events) { const auto& step_id = step_events.first; const auto& step_details = step_events.second; @@ -281,12 +275,6 @@ StepEvents ToNonOverlappedStepEvents(const StepEvents& overlapped_step_events) { *non_overlapped_step_events[step_id].MutableEvents() = ToNonOverlappedEvents(step_details.Events()); } - auto end_time = std::chrono::steady_clock::now(); - auto elapsed_time_us = std::chrono::duration_cast( - end_time - start_time); - double elapsed_time_ms = elapsed_time_us.count() / 1000.0; - LOG(INFO) << "Generation of step-events took " << elapsed_time_ms << " ms" - << std::endl; return non_overlapped_step_events; } From 4385e797a91c73762901d6c07f19def50db20014 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 8 May 2020 17:29:27 -0700 Subject: [PATCH 0239/1533] Remove unused header PiperOrigin-RevId: 310659171 Change-Id: I3c435ba0d734cce16c684c39ef0a5793d33ccad6 --- tensorflow/core/profiler/utils/xplane_visitor.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/profiler/utils/xplane_visitor.h b/tensorflow/core/profiler/utils/xplane_visitor.h index 52aa60bb2e6..8cd805c5cdb 100644 --- a/tensorflow/core/profiler/utils/xplane_visitor.h +++ b/tensorflow/core/profiler/utils/xplane_visitor.h @@ -16,7 +16,6 @@ limitations under the License. 
#define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_VISITOR_H_ #include -#include #include #include "absl/container/flat_hash_map.h" From d2fdc7b0129e5ccc1a78553cc98c361d805893a9 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Fri, 8 May 2020 17:44:10 -0700 Subject: [PATCH 0240/1533] Import and export constants of complex types for HLO PiperOrigin-RevId: 310660936 Change-Id: I732ec0d8f16a71b0529408a677f6c144ce299228 --- tensorflow/compiler/mlir/xla/hlo_utils.cc | 4 ++++ tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc | 2 ++ .../compiler/mlir/xla/tests/translate/export.mlir | 6 ++++++ .../compiler/mlir/xla/tests/translate/import.hlotxt | 10 +++++++--- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/hlo_utils.cc b/tensorflow/compiler/mlir/xla/hlo_utils.cc index c685cc296fd..dc801f64ede 100644 --- a/tensorflow/compiler/mlir/xla/hlo_utils.cc +++ b/tensorflow/compiler/mlir/xla/hlo_utils.cc @@ -139,6 +139,10 @@ StatusOr CreateDenseElementsAttrFromLiteral( return CreateDenseAttrFromLiteral(type, literal); case PrimitiveType::U64: return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::C64: + return CreateDenseAttrFromLiteral(type, literal); + case PrimitiveType::C128: + return CreateDenseAttrFromLiteral(type, literal); default: return tensorflow::errors::Internal( absl::StrCat("Unsupported type: ", PrimitiveType_Name(element_type))); diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index a1fb6b559e3..21668b7e059 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -933,6 +933,8 @@ StatusOr CreateLiteralFromAttr(ElementsAttr attr) { ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U16, uint16) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U32, uint32) ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::U64, uint64) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::C64, std::complex) + ELEMENTS_ATTR_TO_LITERAL(xla::PrimitiveType::C128, std::complex) case xla::PrimitiveType::F16: { llvm::SmallVector values; values.reserve(attr.getNumElements()); diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index 3650307ea94..ed06863cbf4 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -294,6 +294,12 @@ func @main() { // CHECK: f16[4] constant({1, -4, -65504, 0.015625} %cst_8 = constant dense<[1.0e+00, -4.0e+00, -65504.0e+00, 1.5625e-02]> : tensor<4xf16> + // CHECK: c64[] constant((1, 0)) + %cst_9 = constant dense<(1.000000e+00,0.000000e+00)> : tensor> + + // CHECK: c128[] constant((1, 0)) + %cst_10 = constant dense<(1.000000e+00,0.000000e+00)> : tensor> + return } diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index 75471e3a090..207a8f2eabc 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -212,10 +212,14 @@ add { // CHECK: dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xbf16> %constant.3 = bf16[4] constant({1, 2, 3, 4}) + // CHECK: dense<(1.000000e+00,0.000000e+00)> : tensor> + %constant.4 = c64[] constant((1, 0)) + + // CHECK: dense<(1.000000e+00,0.000000e+00)> : tensor> + %constant.5 = c128[] constant((1, 0)) + // CHECK: dense<[1.000000e+00, -4.000000e+00, -6.550400e+04, 
1.562500e-02]> : tensor<4xf16> - ROOT %constant.4 = f16[4] constant({1, -4, -65504, 0.015625}) - - + ROOT %constant.6 = f16[4] constant({1, -4, -65504, 0.015625}) } // TODO(b/129422361) Potentially update when copy, reshape, and conv have actual From 4ece5df0d86d83875ccf89d549fa7836b8981b2d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 19:14:57 -0700 Subject: [PATCH 0241/1533] Enable memory_space verification for fusion input parameters. PiperOrigin-RevId: 310670198 Change-Id: I915172aded7a78e8a6f2203859b586493e83ab43 --- tensorflow/compiler/xla/service/hlo_verifier.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 0911af10f38..91f51ea33cf 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -697,11 +697,7 @@ Status ShapeVerifier::HandleFusion(HloInstruction* fusion) { } for (HloInstruction* fused_param : fused_parameters) { int64 param_no = fused_param->parameter_number(); - // Since fusion buffers aren't materialized, fusion parameters will not have - // the same memory space as the fusion operand. - if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape(), - /*minor_to_major_only=*/false, - /*ignore_memory_space=*/true)) { + if (!ShapesSame(fused_param->shape(), fusion->operand(param_no)->shape())) { return InternalError( "Shape mismatch between parameter number %d and its operand in " "%s.", From 3528e494a24379873d1cc35022a3d88b5d771654 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 8 May 2020 19:32:09 -0700 Subject: [PATCH 0242/1533] Add missing ASSERT_EQ on status after API call in c_api_unified_experimental_test PiperOrigin-RevId: 310671562 Change-Id: Id0b07d6889340d631f6144f8fa6dd3f3309b3776 --- tensorflow/c/eager/c_api_unified_experimental_test.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 170b82333d8..8c9aa97ea8f 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -131,6 +131,7 @@ TEST(UnifedCAPI, TestBasicGraph) { string fn_name = "double"; TF_AbstractFunction* func = TF_ExecutionContextToFunction( graph_ctx, fn_name.c_str(), 1, placeholder_t, 1, output_t, status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); TF_DeleteAbstractTensor(placeholder_t); TF_DeleteAbstractTensor(output_t); From 431d009ecbb7d3b128a5cb4298261d4531a95b32 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 19:35:23 -0700 Subject: [PATCH 0243/1533] Split table management off into a table_utils file. 
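
IndexLookup previously owned the MutableHashTable plumbing directly (export,
insert, clear, OOV bucket replacement, plus the V1 session-run shims in
index_lookup_v1). That logic now lives in a reusable table_utils.TableHandler
so other lookup-style preprocessing layers can share it. A rough usage sketch
of the new wrapper, assuming an eager (TF2) context; the table configuration
and vocabulary below are illustrative only, not part of this change:

    import numpy as np
    from tensorflow.python.framework import dtypes
    from tensorflow.python.keras.layers.preprocessing import table_utils
    from tensorflow.python.ops import lookup_ops

    table = lookup_ops.MutableHashTable(
        key_dtype=dtypes.string, value_dtype=dtypes.int64, default_value=-1)
    handler = table_utils.TableHandler(table, oov_tokens=[1])
    handler.insert(np.array(["earth", "wind", "fire"]),
                   np.arange(2, 5, dtype=np.int64))
    # Dense, sparse and ragged inputs all route through lookup();
    # out-of-vocabulary terms hash into the configured OOV tokens
    # instead of surfacing the table's default value.
    print(handler.lookup(np.array([["earth", "michigan"]])))  # [[2, 1]]
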
PiperOrigin-RevId: 310671808 Change-Id: Ifd6b18aff3e7873225887e03dfa171e7577a1cae --- .../python/keras/layers/preprocessing/BUILD | 39 +++ .../layers/preprocessing/index_lookup.py | 217 ++++------------ .../layers/preprocessing/index_lookup_test.py | 8 +- .../layers/preprocessing/index_lookup_v1.py | 39 +-- .../keras/layers/preprocessing/table_utils.py | 192 ++++++++++++++ .../layers/preprocessing/table_utils_test.py | 243 ++++++++++++++++++ 6 files changed, 524 insertions(+), 214 deletions(-) create mode 100644 tensorflow/python/keras/layers/preprocessing/table_utils.py create mode 100644 tensorflow/python/keras/layers/preprocessing/table_utils_test.py diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 78b00d6c16e..501c99fe890 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -110,6 +110,7 @@ py_library( ], srcs_version = "PY2AND3", deps = [ + ":table_utils", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", "//tensorflow/python:dtypes", @@ -145,6 +146,30 @@ py_library( ], ) +py_library( + name = "table_utils", + srcs = [ + "table_utils.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:array_ops", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:lookup_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:string_ops", + "//tensorflow/python:tensor_shape", + "//tensorflow/python:tensor_spec", + "//tensorflow/python:util", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/keras:backend", + "//tensorflow/python/keras/engine:base_preprocessing_layer", + "//tensorflow/python/ops/ragged", + ], +) + py_library( name = "text_vectorization", srcs = [ @@ -412,6 +437,20 @@ distribute_py_test( ], ) +tf_py_test( + name = "table_utils_test", + srcs = ["table_utils_test.py"], + python_version = "PY3", + deps = [ + ":table_utils", + "//tensorflow/python:client_testlib", + "//tensorflow/python/keras", + "//tensorflow/python/keras/utils:generic_utils", + "//tensorflow/python/ops/ragged:ragged_string_ops", + "@absl_py//absl/testing:parameterized", + ], +) + tf_py_test( name = "text_vectorization_test", size = "medium", diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py index 812eeca7ea3..d6c8a07c8ba 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py @@ -24,17 +24,11 @@ import operator import numpy as np from tensorflow.python.framework import dtypes -from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.keras.engine import base_preprocessing_layer -from tensorflow.python.ops import array_ops +from tensorflow.python.keras.layers.preprocessing import table_utils from tensorflow.python.ops import lookup_ops -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import string_ops -from tensorflow.python.ops.ragged import ragged_functional_ops -from tensorflow.python.ops.ragged import ragged_tensor -from tensorflow.python.platform import gfile from tensorflow.python.util import compat # The string tokens in the extracted vocabulary @@ -100,23 +94,29 @@ class 
IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): reserve_zero=True, mask_zero=False, **kwargs): - allowed_dtypes = [dtypes.string, dtypes.int64] + invert = False + if invert: + allowed_dtypes = [dtypes.int32, dtypes.int64] + else: + allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64] + if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: - raise ValueError( - "TextVectorization may only have a dtype of string or int64.") - elif "dtype" not in kwargs: - kwargs["dtype"] = dtypes.string + raise ValueError("TextVectorization may only have a dtype in %s." % + allowed_dtypes) + + if "dtype" not in kwargs: + kwargs["dtype"] = dtypes.int64 if invert else dtypes.string # If max_tokens is set, the value must be greater than 1 - otherwise we # are creating a 0-element vocab, which doesn't make sense. if max_tokens is not None and max_tokens <= 1: - raise ValueError("max_tokens must be greater than 1.") + raise ValueError("If set, max_tokens must be greater than 1.") - # For now, limit the num_oov_tokens to one. if num_oov_tokens < 0: raise ValueError("num_oov_tokens must be greater than 0. You passed %s" % num_oov_tokens) + self.invert = invert self.max_tokens = max_tokens self.num_oov_tokens = num_oov_tokens self.reserve_zero = reserve_zero @@ -167,91 +167,24 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # counting code in the Model object doesn't throw an attribute error. tracked_table.shape = tensor_shape.TensorShape((0,)) - self._inverse_table = None + if self.num_oov_tokens <= 1: + oov_tokens = None + else: + oov_start = 1 if reserve_zero else 0 + oov_tokens = list(range(oov_start, self._reserved_values)) + + self._table_handler = table_utils.TableHandler( + table=self._table, + oov_tokens=oov_tokens, + use_v1_apis=self._use_v1_apis()) if vocabulary is not None: if isinstance(vocabulary, str): - vocabulary = self._get_vocabulary_from_file(vocabulary) + vocabulary = table_utils.get_vocabulary_from_file(vocabulary) + table_utils.validate_vocabulary_is_unique(vocabulary) - vocabulary_set = set(vocabulary) - if len(vocabulary) != len(vocabulary_set): - repeated_items = [ - item for item, count in collections.Counter(vocabulary).items() - if count > 1 - ] - raise ValueError("The passed vocabulary has at least one repeated " - "term. Please uniquify your dataset before passing " - "it to IndexLookup(). The repeated terms are %s" % - repeated_items) self.set_vocabulary(vocabulary) - def _get_vocabulary_from_file(self, vocabulary_path): - vocab = [] - with gfile.GFile(vocabulary_path, "r") as reader: - while True: - # Get the next line, and break if it is None. - text = reader.readline() - if not text: - break - - # Convert the raw text into UTF8 and strip whitespace. - if isinstance(text, str): - token = text - elif isinstance(text, bytes): - token = text.decode("utf-8", "ignore") - token = token.strip() - vocab.append(token) - return vocab - - def _get_table_data(self): - keys, values = self._table.export() - return (keys.numpy(), values.numpy()) - - def vocab_size(self): - return self._table.size().numpy() - - def _clear_table(self): - keys, _ = self._table.export() - self._table.remove(keys) - if self._inverse_table: - keys, _ = self._inverse_table.export() - self._inverse_table.remove(keys) - - def _insert_table_data(self, keys, values): - if len(values) != len(keys): - raise RuntimeError("Size mismatch between values and key arrays. " - "Keys had size %s, values had size %s." 
% - (len(keys), len(values))) - self._table.insert(keys, values) - if self._inverse_table: - self._inverse_table.insert(values, keys) - - def _initialize_inverse_table(self): - keys, values = self._table.export() - self._inverse_table.insert(values, keys) - - def _to_numpy(self, preprocessed_data): - """Converts preprocessed inputs into numpy arrays.""" - if isinstance(preprocessed_data, np.ndarray): - return preprocessed_data - return np.array(preprocessed_data.to_list()) - # End of V1/V2 shim points. - - def _assert_same_type(self, expected_type, values, value_name): - if dtypes.as_dtype(expected_type) != dtypes.as_dtype(values.dtype): - raise RuntimeError("Expected %s type %s, got %s" % - (value_name, expected_type, values.dtype)) - - def _convert_to_ndarray(self, x, dtype=None): - array = np.array(x) if isinstance(x, (list, tuple)) else x - if dtype not in (None, dtypes.string): - # If the dtype is an integer, we do permissive casting. This allows - # users to examine int32 data if the dtype is int64 without trouble. - np_dtype = dtypes.as_dtype(dtype).as_numpy_dtype - if np.can_cast(array.dtype, np_dtype): - array = array.astype(np_dtype, casting="safe") - return array - def compute_output_shape(self, input_shape): return input_shape @@ -281,10 +214,10 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): super(IndexLookup, self).adapt(data, reset_state) def get_vocabulary(self): - if self.vocab_size() == 0: + if self._table_handler.vocab_size() == 0: return [] - keys, values = self._get_table_data() + keys, values = self._table_handler.data() # This is required because the MutableHashTable doesn't preserve insertion # order, but we rely on the order of the array to assign indices. if self.dtype == dtypes.string: @@ -292,6 +225,9 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): else: return [x for _, x in sorted(zip(values, keys))] + def vocab_size(self): + return self._table_handler.vocab_size() + def get_config(self): config = { "max_tokens": self.max_tokens, @@ -329,7 +265,7 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): ValueError: If there are too many inputs, the inputs do not match, or input data is missing. """ - current_table_size = self.vocab_size() + current_table_size = self._table_handler.vocab_size() total_vocab_size = len(vocab) + (current_table_size if append else 0) if self.max_tokens is not None and total_vocab_size > self._max_elements: raise ValueError( @@ -338,93 +274,28 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): "token(s) are automatically added to the number of tokens." 
% (total_vocab_size, self.max_tokens)) - start_index = self._reserved_values + (self.vocab_size() if append else 0) + start_index = self._reserved_values + (current_table_size if append else 0) values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64) - vocab = self._convert_to_ndarray(vocab, self.dtype) - self._assert_same_type(self.dtype, vocab, "vocab") + vocab = table_utils.convert_to_ndarray(vocab, self.dtype) + table_utils.assert_same_type(self.dtype, vocab, "vocab") - values = self._convert_to_ndarray(values, self._output_dtype) - self._assert_same_type(self._output_dtype, values, "values") + values = table_utils.convert_to_ndarray(values, self._output_dtype) + table_utils.assert_same_type(self._output_dtype, values, "values") - if not append and self.vocab_size() > 0: - self._clear_table() - self._insert_table_data(vocab, values) + if not append and current_table_size > 0: + self._table_handler.clear() + self._table_handler.insert(vocab, values) def _set_state_variables(self, updates): if not self.built: raise RuntimeError("_set_state_variables() must be called after build().") self.set_vocabulary(updates[_VOCAB_NAME]) - def __call__(self, inputs, invert=False, **kwargs): - if invert and not self._inverse_table: - # If the user wants to perform an inverse lookup, we need to build an - # inverse lookup table and initialize it to have the inverse of the - # forward table's vocabulary. - self._inverse_table = lookup_ops.MutableHashTable( - key_dtype=self._output_dtype, - value_dtype=self.dtype, - default_value="", - name=(self._name + "_inverse_index_table")) + def call(self, inputs): + return self._table_handler.lookup(inputs) - tracked_inverse_table = self._add_trackable( - self._inverse_table, trainable=False) - # This is a workaround for summary() on this layer. Because the table is - # not mutable during training, the effective number of parameters (and so - # the weight shape) is 0; we add this as an attr so that the parameter - # counting code in the Model object doesn't throw an attribute error. - tracked_inverse_table.shape = tensor_shape.TensorShape((0,)) - - # This is a workaround for saving not working yet for MutableHashTables. - # By replacing the existing function call by an explicit failure, we - # can provide a more user-friendly error message. - def fail(_): - raise NotImplementedError( - "Saving is not yet supported for IndexLookup layers.") - - self._inverse_table._list_extra_dependencies_for_serialization = fail # pylint: disable=protected-access - self._initialize_inverse_table() - - return super(IndexLookup, self).__call__(inputs, invert=invert, **kwargs) - - def replace_oov_buckets(self, inputs, lookups): - if self.num_oov_tokens <= 1: - return lookups - - if inputs.dtype.is_integer: - inputs = string_ops.as_string(inputs) - hashed_inputs = string_ops.string_to_hash_bucket_fast( - inputs, num_buckets=self.num_oov_tokens) - if self.reserve_zero: - hashed_inputs = math_ops.add(hashed_inputs, 1) - return array_ops.where(math_ops.equal(lookups, -1), hashed_inputs, lookups) - - def call(self, inputs, invert=False): - table = self._inverse_table if invert else self._table - # The table lookup ops don't natively support ragged tensors, so if we have - # a RT we need to use map_flat_values to look up every element. 
- if ragged_tensor.is_ragged(inputs): - indexed_data = ragged_functional_ops.map_flat_values(table.lookup, inputs) - if not invert: - indexed_data = ragged_functional_ops.map_flat_values( - self.replace_oov_buckets, inputs, indexed_data) - elif isinstance( - inputs, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): - if not invert: - values = self.replace_oov_buckets(inputs.values, - table.lookup(inputs.values)) - indexed_data = sparse_tensor.SparseTensor(inputs.indices, values, - inputs.dense_shape) - else: - indexed_data = table.lookup(inputs) - if not invert: - indexed_data = self.replace_oov_buckets(inputs, indexed_data) - # (b/149446477): output does not preserve input shape. - indexed_data.set_shape(inputs.shape) - - # Composite tensors can pass tensor values through, which will cause - # errors if this is the only layer in the model. To fix this, pass - # the output through an identity op. - return array_ops.identity(indexed_data) + def _use_v1_apis(self): + return False class _IndexLookupAccumulator( diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py index 54305b3d6d7..3c5b5757ec2 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py @@ -261,7 +261,7 @@ class CategoricalEncodingMultiOOVTest( vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) input_array = sparse_tensor.SparseTensor( indices=[[0, 0], [1, 2]], - values=np.array([13, 132], dtype=np.int64), + values=np.array([13, 133], dtype=np.int64), dense_shape=[3, 4]) expected_indices = [[0, 0], [1, 2]] @@ -295,7 +295,7 @@ class CategoricalEncodingMultiOOVTest( def test_ragged_int_input_multi_bucket(self): vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) - input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 132]], + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]], dtype=np.int64) expected_output = [[3, 4, 6], [6, 5, 3, 2]] @@ -560,7 +560,7 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, class InverseLookupOutputTest(keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): - def test_inverse_output(self): + def DISABLE_test_inverse_output(self): vocab_data = ["earth", "wind", "and", "fire"] input_array = np.array([["earth", "wind", "and", "fire"], ["fire", "and", "earth", "michigan"]]) @@ -579,7 +579,7 @@ class InverseLookupOutputTest(keras_parameterized.TestCase, self.assertAllEqual(expected_ints, int_outputs) self.assertAllEqual(expected_strings, string_outputs) - def test_inverse_output_serialization(self): + def DISABLE_test_inverse_output_serialization(self): vocab_data = ["earth", "wind", "and", "fire"] input_array = np.array([["earth", "wind", "and", "fire"], ["fire", "and", "earth", "michigan"]]) diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_v1.py index c6e0b6ed286..47fea11dd57 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_v1.py @@ -18,12 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np -from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine import base_preprocessing_layer_v1 from tensorflow.python.keras.layers.preprocessing 
import index_lookup -from tensorflow.python.ops.ragged import ragged_tensor_value class IndexLookup(index_lookup.IndexLookup, @@ -59,37 +56,5 @@ class IndexLookup(index_lookup.IndexLookup, this option is set, reserve_zero must also be set. Defaults to False. """ - def _get_table_data(self): - keys, values = self._table.export() - np_keys = K.get_session().run(keys) - np_values = K.get_session().run(values) - return (np_keys, np_values) - - def vocab_size(self): - return K.get_session().run(self._table.size()) - - def _clear_table(self): - keys, _ = self._table.export() - K.get_session().run(self._table.remove(keys)) - if self._inverse_table: - keys, _ = self._inverse_table.export() - K.get_session().run(self._inverse_table.remove(keys)) - - def _insert_table_data(self, keys, values): - K.get_session().run(self._table.insert(keys, values)) - if self._inverse_table: - K.get_session().run(self._inverse_table.insert(values, keys)) - - def _initialize_inverse_table(self): - keys, values = self._table.export() - K.get_session().run(self._inverse_table.insert(values, keys)) - - def _to_numpy(self, data): - """Converts preprocessed inputs into numpy arrays.""" - if isinstance(data, np.ndarray): - return data - session = K.get_session() - data = session.run(data) - if isinstance(data, ragged_tensor_value.RaggedTensorValue): - data = np.array(data.to_list()) - return data + def _use_v1_apis(self): + return True diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py new file mode 100644 index 00000000000..88e9d95e2ed --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/table_utils.py @@ -0,0 +1,192 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Utilities for working with tf.lookup tables in Keras.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import numpy as np + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.keras import backend as K +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import string_ops +from tensorflow.python.ops.ragged import ragged_functional_ops +from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import gfile + + +class TableHandler(object): + """Wrapper object that holds a lookup table and provides accessors.""" + + def __init__(self, table, oov_tokens=None, use_v1_apis=False): + self.table = table + self.use_v1_apis = use_v1_apis + if oov_tokens is None: + self.oov_tokens = oov_tokens + else: + if not isinstance(oov_tokens, (list, tuple, np.ndarray)): + oov_tokens = [oov_tokens] + self.oov_tokens = math_ops.cast(oov_tokens, table._value_dtype) # pylint: disable=protected-access + + def data(self): + keys, values = self.table.export() + return (self._eval(keys), self._eval(values)) + + def vocab_size(self): + return self._eval(self.table.size()) + + def clear(self): + keys, _ = self.table.export() + self._run(self.table.remove(keys)) + + def insert(self, keys, values): + if len(values) != len(keys): + raise RuntimeError("Size mismatch between values and key arrays. " + "Keys had size %s, values had size %s." % + (len(keys), len(values))) + self._run(self.table.insert(keys, values)) + + def _replace_oov_buckets(self, inputs, lookups): + """Replace the default OOV value with one of the OOV bucket values.""" + if self.oov_tokens is None: + return lookups + + num_oov_elements = self.oov_tokens.shape.num_elements() + if inputs.dtype.is_integer: + oov_indices = math_ops.floormod(inputs, num_oov_elements) + else: + oov_indices = string_ops.string_to_hash_bucket_fast( + inputs, num_buckets=num_oov_elements) + + oov_values = array_ops.gather(self.oov_tokens, oov_indices) + oov_locations = math_ops.equal(lookups, self.table._default_value) # pylint: disable=protected-access + + return array_ops.where(oov_locations, oov_values, lookups) + + def _ragged_lookup(self, inputs): + """Perform a table lookup on a ragged tensor.""" + # The table lookup ops don't natively support ragged tensors, so if we have + # a RT we need to use map_flat_values to look up every element. + indexed_data = ragged_functional_ops.map_flat_values( + self.table.lookup, inputs) + indexed_data = ragged_functional_ops.map_flat_values( + self._replace_oov_buckets, inputs, indexed_data) + # Composite tensors can pass tensor values through, which will cause + # errors if all operations in the TF graph do so. We can break this chain + # with an identity here. + return array_ops.identity(indexed_data) + + def _sparse_lookup(self, inputs): + """Perform a table lookup on a sparse tensor.""" + values = self.table.lookup(inputs.values) + values = self._replace_oov_buckets(inputs.values, values) + indexed_data = sparse_tensor.SparseTensor(inputs.indices, values, + inputs.dense_shape) + # Composite tensors can pass tensor values through, which will cause + # errors if all operations in the TF graph do so. We can break this chain + # with an identity here. 
+ return array_ops.identity(indexed_data) + + def _tensor_lookup(self, inputs): + """Perform a table lookup on a tf.tensor.""" + values = self.table.lookup(inputs) + indexed_data = self._replace_oov_buckets(inputs, values) + # (b/149446477): output does not preserve input shape. + indexed_data.set_shape(inputs.shape) + return indexed_data + + def lookup(self, inputs): + """Perform a table lookup.""" + # Sparse tensors don't play nicely with tensor conversion, so we handle + # them before attempting to convert lists or arrays to tensors. + if isinstance( + inputs, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)): + return self._sparse_lookup(inputs) + + # Try to convert lists/arrays to tensors or RaggedTensors. + inputs = ragged_tensor.convert_to_tensor_or_ragged_tensor(inputs) + + # Run the lookup operation on the converted tensor. + if ragged_tensor.is_ragged(inputs): + return self._ragged_lookup(inputs) + else: + return self._tensor_lookup(inputs) + + def _eval(self, tensor): + if self.use_v1_apis: + return K.get_session().run(tensor) + else: + return tensor.numpy() + + def _run(self, op): + if self.use_v1_apis: + K.get_session().run(op) + + +def get_vocabulary_from_file(vocabulary_path, encoding="utf-8"): + """Read a vocabulary in from a file.""" + vocab = [] + with gfile.GFile(vocabulary_path, "r") as reader: + while True: + # Get the next line, and break if it is None. + text = reader.readline() + if not text: + break + + # Convert the raw text and strip whitespace. + if isinstance(text, str): + token = text + elif isinstance(text, bytes): + token = text.decode(encoding, "ignore") + token = token.strip() + vocab.append(token) + return vocab + + +def validate_vocabulary_is_unique(vocabulary): + """Validate that a vocabulary contains no repeated tokens.""" + vocabulary_set = set(vocabulary) + if len(vocabulary) != len(vocabulary_set): + repeated_items = [ + item for item, count in collections.Counter(vocabulary).items() + if count > 1 + ] + raise ValueError("The passed vocabulary has at least one repeated " + "term. Please uniquify your dataset. The repeated terms " + "are %s" % repeated_items) + + +def assert_same_type(expected_type, values, value_name): + """Assert that 'values' is of type 'expected_type'.""" + if dtypes.as_dtype(expected_type) != dtypes.as_dtype(values.dtype): + raise RuntimeError("Expected %s type %s, got %s" % + (value_name, expected_type, values.dtype)) + + +def convert_to_ndarray(x, dtype=None): + """Convert 'x' to a numpy array.""" + array = np.array(x) if isinstance(x, (list, tuple)) else x + if dtype not in (None, dtypes.string): + # If the dtype is an integer, we do permissive casting. This allows + # users to examine int32 data if the dtype is int64 without trouble. + np_dtype = dtypes.as_dtype(dtype).as_numpy_dtype + if np.can_cast(array.dtype, np_dtype): + array = array.astype(np_dtype, casting="safe") + return array + diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils_test.py b/tensorflow/python/keras/layers/preprocessing/table_utils_test.py new file mode 100644 index 00000000000..60a891f6ba8 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/table_utils_test.py @@ -0,0 +1,243 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Keras lookup table utils.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.keras.layers.preprocessing import table_utils +from tensorflow.python.ops import lookup_ops +from tensorflow.python.ops.ragged import ragged_factory_ops +from tensorflow.python.platform import test + + +def get_table(dtype=dtypes.string, oov_tokens=None): + table = lookup_ops.MutableHashTable( + key_dtype=dtype, + value_dtype=dtypes.int64, + default_value=-7, + name="index_table") + return table_utils.TableHandler( + table, oov_tokens, use_v1_apis=(not context.executing_eagerly())) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingInputTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_string_input(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=["fire", "michigan"], + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + table = get_table(oov_tokens=[1]) + table.insert(vocab_data, range(2, len(vocab_data) + 2)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + table = get_table(dtype=dtypes.int64, oov_tokens=[1]) + table.insert(vocab_data, range(2, len(vocab_data) + 2)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_string_input(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = ragged_factory_ops.constant( + [["earth", "wind", "fire"], ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + table = get_table(oov_tokens=[1]) + table.insert(vocab_data, range(2, len(vocab_data) + 2)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_output, output_data) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 
11, 13], [13, 12, 10, 42]], + dtype=np.int64) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + table = get_table(dtype=dtypes.int64, oov_tokens=[1]) + table.insert(vocab_data, range(2, len(vocab_data) + 2)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_output, output_data) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingMultiOOVTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_string_input_multi_bucket(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], values=["fire", "ohio"], dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [6, 2] + expected_dense_shape = [3, 4] + + table = get_table(oov_tokens=[1, 2]) + table.insert(vocab_data, range(3, len(vocab_data) + 3)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_sparse_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 132], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [6, 1] + expected_dense_shape = [3, 4] + + table = get_table(dtype=dtypes.int64, oov_tokens=[1, 2]) + table.insert(vocab_data, range(3, len(vocab_data) + 3)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_string_input_multi_bucket(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = ragged_factory_ops.constant([["earth", "wind", "fire"], + ["fire", "and", "earth", + "ohio"]]) + expected_output = [[3, 4, 6], [6, 5, 3, 2]] + + table = get_table(oov_tokens=[1, 2]) + table.insert(vocab_data, range(3, len(vocab_data) + 3)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_output, output_data) + + def test_ragged_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 132]], + dtype=np.int64) + expected_output = [[3, 4, 6], [6, 5, 3, 1]] + + table = get_table(dtype=dtypes.int64, oov_tokens=[1, 2]) + table.insert(vocab_data, range(3, len(vocab_data) + 3)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_output, output_data) + + def test_tensor_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = np.array([[13, 132], [13, 133]], dtype=np.int64) + expected_values = [[6, 1], [6, 2]] + + table = get_table(dtype=dtypes.int64, oov_tokens=[1, 2]) + table.insert(vocab_data, range(3, len(vocab_data) + 3)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_values, output_data) + + def test_tensor_string_input_multi_bucket(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = [["earth", "wind", "fire", "michigan"], + ["fire", "and", "earth", "ohio"]] + expected_output = [[3, 4, 6, 1], [6, 5, 3, 2]] + + table = get_table(oov_tokens=[1, 2]) + table.insert(vocab_data, range(3, len(vocab_data) + 3)) + output_data = table.lookup(input_array) + + 
self.assertAllEqual(expected_output, output_data) + + +@keras_parameterized.run_all_keras_modes +class IndexLookupOutputTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_int_output_default_lookup_value(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, -7]] + + table = get_table(oov_tokens=None) + table.insert(vocab_data, range(1, len(vocab_data) + 1)) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_output, output_data) + + def test_output_shape(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + + table = get_table() + table.insert(vocab_data, range(1, len(vocab_data) + 1)) + output_data = table.lookup(input_array) + + self.assertAllEqual(input_array.shape[1:], output_data.shape[1:]) + + def test_int_output_no_reserved_zero_default_lookup_value(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[0, 1, 2, 3], [3, 2, 0, -7]] + + table = get_table(oov_tokens=None) + table.insert(vocab_data, range(len(vocab_data))) + output_data = table.lookup(input_array) + + self.assertAllEqual(expected_output, output_data) + + +if __name__ == "__main__": + test.main() From ef2112e41529aa792e082b2b109426a657bafdff Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 19:46:20 -0700 Subject: [PATCH 0244/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310672610 Change-Id: Ie475d7e5ba154a275b8c4cc4997d50ed9b70d93b --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 4909933889c01eb8bf691b9e3aea89f3c156f944 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Fri, 8 May 2020 19:49:18 -0700 Subject: [PATCH 0245/1533] Introduce some common constants for TPU. 
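
tpu_defs.h collects the TPU device and attribute names (DEVICE_TPU_NODE,
DEVICE_TPU_SYSTEM, DEVICE_TPU_XLA_JIT, TPU_FAST_MEM_ATTR, ...) in one place,
and tpu_init_mode.h/.cc records which initialization API first touched the
TPU system, rejecting later attempts to switch between the global and regular
modes. A minimal sketch of how a caller might use it; the wrapper function
name below is illustrative and not part of this change:

    #include "tensorflow/core/tpu/tpu_init_mode.h"

    namespace tensorflow {

    // Hypothetical caller: record that the explicit, host-driven API is
    // initializing the TPU system before any in-graph init op runs.
    Status MarkGlobalTpuInit() {
      Status status = SetTPUInitMode(TPUInitMode::kGlobal);
      if (!status.ok()) {
        // Another path already initialized with a different mode;
        // kGlobal <-> kRegular transitions are rejected.
        return status;
      }
      // GetTPUInitMode() now reports TPUInitMode::kGlobal.
      return Status::OK();
    }

    }  // namespace tensorflow
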
PiperOrigin-RevId: 310672767 Change-Id: I04794ac10cd6b4d03c5dc0221a17bd35ee5e650f --- tensorflow/core/tpu/BUILD | 19 ++++++++ tensorflow/core/tpu/tpu_defs.cc | 28 ++++++++++++ tensorflow/core/tpu/tpu_defs.h | 48 ++++++++++++++++++++ tensorflow/core/tpu/tpu_init_mode.cc | 66 ++++++++++++++++++++++++++++ tensorflow/core/tpu/tpu_init_mode.h | 47 ++++++++++++++++++++ 5 files changed, 208 insertions(+) create mode 100644 tensorflow/core/tpu/tpu_defs.cc create mode 100644 tensorflow/core/tpu/tpu_defs.h create mode 100644 tensorflow/core/tpu/tpu_init_mode.cc create mode 100644 tensorflow/core/tpu/tpu_init_mode.h diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index 6184f52d240..43b2d93b917 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -1,6 +1,10 @@ # Description: Utilities for TPU Operations package( + default_visibility = [ + "//tensorflow/core/tpu:__subpackages__", + "//tensorflow/stream_executor/tpu:__subpackages__", + ], licenses = ["notice"], # Apache 2.0 ) @@ -32,3 +36,18 @@ cc_library( "//tensorflow/core/protobuf/tpu:tpu_embedding_output_layout_proto_cc", ], ) + +cc_library( + name = "tpu_defs", + srcs = ["tpu_defs.cc"], + hdrs = ["tpu_defs.h"], +) + +cc_library( + name = "tpu_init_mode", + srcs = ["tpu_init_mode.cc"], + hdrs = ["tpu_init_mode.h"], + deps = [ + "//tensorflow/core:lib", + ], +) diff --git a/tensorflow/core/tpu/tpu_defs.cc b/tensorflow/core/tpu/tpu_defs.cc new file mode 100644 index 00000000000..dc370ea2ba7 --- /dev/null +++ b/tensorflow/core/tpu/tpu_defs.cc @@ -0,0 +1,28 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/tpu/tpu_defs.h" + +namespace tensorflow { + +const char* const DEVICE_TPU_NODE = "TPU"; +const char* const TPU_FAST_MEM_ATTR = "_TPU_FAST_MEM"; +const char* const DEVICE_TPU_REPLICATED_CORE = "TPU_REPLICATED_CORE"; +const char* const DEVICE_TPU_SYSTEM = "TPU_SYSTEM"; +const char* const DEVICE_TPU_XLA_JIT = "XLA_TPU_JIT"; +const char* const TPUREPLICATE_MIRRORED_VAR_INDICES_ATTR = + "_mirrored_variable_indices"; + +} // namespace tensorflow diff --git a/tensorflow/core/tpu/tpu_defs.h b/tensorflow/core/tpu/tpu_defs.h new file mode 100644 index 00000000000..b2a6e3ce303 --- /dev/null +++ b/tensorflow/core/tpu/tpu_defs.h @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// Common definitions related to TPUs. + +#ifndef TENSORFLOW_CORE_TPU_TPU_DEFS_H_ +#define TENSORFLOW_CORE_TPU_TPU_DEFS_H_ + +namespace tensorflow { + +// Name of the TPU device, which corresponds to a single core. +extern const char* const DEVICE_TPU_NODE; // "TPU"; + +// The TPU_REPLICATED_CORE device is a virtual device corresponding to one core +// of a replicated TPU computation. Only valid within the body of a +// TPUReplicate computation. +extern const char* const DEVICE_TPU_REPLICATED_CORE; + +extern const char* const DEVICE_TPU_SYSTEM; // "TPU_SYSTEM"; + +// Name of the XLA_TPU_JIT compilation device, which is an internal device to +// compile graphs for TPU. Not registered as a device; no operators can be +// assigned to this device by a user. +extern const char* const DEVICE_TPU_XLA_JIT; // "XLA_TPU_JIT"; + +// Attribute used internally to pass "is_mirrored_variable" attribute on +// TPUReplicatedInput nodes to _TPUReplicate. +extern const char* const TPUREPLICATE_MIRRORED_VAR_INDICES_ATTR; + +// Attribute used internally to annoate ops which might consume TPU FastMem +// variable. +extern const char* const TPU_FAST_MEM_ATTR; // "_TPU_FAST_MEM" + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_DEFS_H_ diff --git a/tensorflow/core/tpu/tpu_init_mode.cc b/tensorflow/core/tpu/tpu_init_mode.cc new file mode 100644 index 00000000000..42952df29d8 --- /dev/null +++ b/tensorflow/core/tpu/tpu_init_mode.cc @@ -0,0 +1,66 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/tpu/tpu_init_mode.h" + +#include + +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +namespace { + +mutex init_mode_mutex(LINKER_INITIALIZED); +TPUInitMode init_mode TF_GUARDED_BY(init_mode_mutex); + +} // namespace + +namespace test { + +void ForceSetTPUInitMode(const TPUInitMode mode) { + mutex_lock l(init_mode_mutex); + init_mode = mode; +} + +} // namespace test + +Status SetTPUInitMode(const TPUInitMode mode) { + if (mode == TPUInitMode::kNone) { + return errors::InvalidArgument("State cannot be set to: ", + static_cast(mode)); + } + { + mutex_lock l(init_mode_mutex); + if (init_mode != TPUInitMode::kNone && mode != init_mode) { + return errors::FailedPrecondition( + "TPUInit already attempted with mode: ", static_cast(init_mode), + " and cannot be changed to: ", static_cast(mode), + ". You are most probably trying to initialize the TPU system, both " + "using the explicit API and using an initialization Op within the " + "graph; please choose one. 
"); + } + init_mode = mode; + } + return Status::OK(); +} + +TPUInitMode GetTPUInitMode() { + mutex_lock l(init_mode_mutex); + return init_mode; +} + +} // namespace tensorflow diff --git a/tensorflow/core/tpu/tpu_init_mode.h b/tensorflow/core/tpu/tpu_init_mode.h new file mode 100644 index 00000000000..73ca68ad8a0 --- /dev/null +++ b/tensorflow/core/tpu/tpu_init_mode.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_INIT_MODE_H_ +#define TENSORFLOW_CORE_TPU_TPU_INIT_MODE_H_ + +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +enum class TPUInitMode : int { kNone, kGlobal, kRegular }; + +// Sets the TPU initialization mode appropriately. +// +// Requires that mode is not kNone, and mode doesn't transition kGlobal +// <-> kRegular. +// +// IMPLEMENTATION DETAILS: +// Used internally to record the current mode and type of API used for TPU +// initialization in a global static variable. +Status SetTPUInitMode(TPUInitMode mode); + +// Returns the current TPUInitMode. +TPUInitMode GetTPUInitMode(); + +namespace test { + +// Forces the tpu init mode to be changed. 
+void ForceSetTPUInitMode(TPUInitMode mode); + +} // namespace test + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_INIT_MODE_H_ From 5e5b33c233a3067199f42af6dd5476155735dc01 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Fri, 8 May 2020 19:50:57 -0700 Subject: [PATCH 0246/1533] Allow kernels to take different scales for prelu PiperOrigin-RevId: 310672881 Change-Id: Ibb3044112cf3136892e1b509d18e2585a67384db --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 3 +- tensorflow/lite/kernels/activations.cc | 41 +++++++++++-------- .../lite/kernels/internal/reference/prelu.h | 8 ++-- tensorflow/lite/kernels/internal/types.h | 6 ++- tensorflow/lite/micro/kernels/prelu.cc | 26 ++++++++---- tensorflow/lite/micro/kernels/prelu_test.cc | 12 +++--- tensorflow/lite/testing/op_tests/prelu.py | 2 +- .../lite/tools/optimize/operator_property.cc | 2 +- 8 files changed, 58 insertions(+), 42 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 966c56a1464..13b8ae83e34 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -2269,8 +2269,7 @@ def TFL_PReluOp : TFL_Op<"prelu", [ TFL_OperandIsUnrankedPred<1>, CPred<"$_op.getOperand(0).getType().cast().getRank() == " "$_op.getOperand(1).getType().cast().getRank() " - "+ 1">]>>, - SameOperandsAndResultsScale]> { + "+ 1">]>>]> { let summary = "Parameterized Relu operator"; let description = [{ diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 3343985e4f2..84420b8eb9f 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -84,8 +84,10 @@ struct LeakyReluOpData : public OpData { }; struct PreluOpData : public OpData { - int32_t output_multiplier = 0; - int output_shift = 0; + int32_t output_multiplier_1 = 0; + int32_t output_shift_1 = 0; + int32_t output_multiplier_2 = 0; + int32_t output_shift_2 = 0; }; struct HardSwishData { @@ -664,7 +666,6 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 || output->type == kTfLiteInt16) { - // This scale check is actually needed for quantized path: // prelu(x) = x if x >= 0 else x * alpha. // So if we translate that for quantized computation: // @@ -676,19 +677,19 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { // ouput_q = (input_q - input_zp) * input_scale / output_scale + output_q // else: // output_q = (input_q - input_zp) * (alpha_q - alpha_zp) * input_scale - // * alpha_scale / output_scale +output_q + // * alpha_scale / output_scale + output_q // - // So we have two float values which we need to translate into multiplier - // shift languages. - // For simplicity & efficiency, if we make sure input_scale - // & output_scale are the same, we only need to translate the latter one - // into multiplier & shift format. - TF_LITE_ENSURE(context, - std::abs(input->params.scale - output->params.scale) < 1e-4); - double real_multiplier = + // So for input_q - input_zp >= 0: + // output real multiplier 1 is input_scale / output_scale; + // for input_q - input_zp < 0: + // output real multiplier 2 is input_scale * alpha_scale/ output_scale. 
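A worked example with illustrative scales (these numbers are not from the patch): for input_scale = 0.5, alpha_scale = 0.25 and output_scale = 1.0, real multiplier 1 is 0.5 / 1.0 = 0.5 and real multiplier 2 is 0.5 * 0.25 / 1.0 = 0.125; the code below then encodes each of the two real multipliers separately as a quantized multiplier plus shift.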
+ double real_multiplier_1 = input->params.scale / output->params.scale; + double real_multiplier_2 = input->params.scale * alpha->params.scale / output->params.scale; - QuantizeMultiplierSmallerThanOneExp( - real_multiplier, &data->output_multiplier, &data->output_shift); + QuantizeMultiplier(real_multiplier_1, &data->output_multiplier_1, + &data->output_shift_1); + QuantizeMultiplier(real_multiplier_2, &data->output_multiplier_2, + &data->output_shift_2); } // PRelu (parameteric Relu) shares the same alpha value on "shared axis". @@ -1171,8 +1172,10 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { op_params.input_offset = -input->params.zero_point; op_params.alpha_offset = -alpha->params.zero_point; op_params.output_offset = output->params.zero_point; - op_params.output_multiplier = data->output_multiplier; - op_params.output_shift = data->output_shift; + op_params.output_multiplier_1 = data->output_multiplier_1; + op_params.output_shift_1 = data->output_shift_1; + op_params.output_multiplier_2 = data->output_multiplier_2; + op_params.output_shift_2 = data->output_shift_2; reference_ops::BroadcastPrelu4DSlow( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(alpha), GetTensorData(alpha), @@ -1184,8 +1187,10 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { op_params.input_offset = -input->params.zero_point; op_params.alpha_offset = -alpha->params.zero_point; op_params.output_offset = output->params.zero_point; - op_params.output_multiplier = data->output_multiplier; - op_params.output_shift = data->output_shift; + op_params.output_multiplier_1 = data->output_multiplier_1; + op_params.output_shift_1 = data->output_shift_1; + op_params.output_multiplier_2 = data->output_multiplier_2; + op_params.output_shift_2 = data->output_shift_2; reference_ops::BroadcastPrelu4DSlow( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(alpha), GetTensorData(alpha), diff --git a/tensorflow/lite/kernels/internal/reference/prelu.h b/tensorflow/lite/kernels/internal/reference/prelu.h index d3d7d78a4a4..50d9ad24dd9 100644 --- a/tensorflow/lite/kernels/internal/reference/prelu.h +++ b/tensorflow/lite/kernels/internal/reference/prelu.h @@ -48,14 +48,16 @@ inline void BroadcastPrelu4DSlow( params.input_offset + input_data[input_index]; int32 output_value; if (input_value >= 0) { - output_value = input_value; + output_value = MultiplyByQuantizedMultiplier( + input_value, params.output_multiplier_1, params.output_shift_1); } else { auto alpha_index = SubscriptToIndex(desc2, b, y, x, c); const int32 alpha_value = params.alpha_offset + alpha_data[alpha_index]; + output_value = MultiplyByQuantizedMultiplier( - input_value * alpha_value, params.output_multiplier, - params.output_shift); + input_value * alpha_value, params.output_multiplier_2, + params.output_shift_2); } output_value += params.output_offset; diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h index cbdedd88901..52d74d1eca4 100644 --- a/tensorflow/lite/kernels/internal/types.h +++ b/tensorflow/lite/kernels/internal/types.h @@ -972,8 +972,10 @@ struct PreluParams { int32 input_offset; int32 alpha_offset; int32 output_offset; - int32 output_multiplier; - int output_shift; + int32 output_multiplier_1; + int32 output_shift_1; + int32 output_multiplier_2; + int32 output_shift_2; }; struct PoolParams { diff --git a/tensorflow/lite/micro/kernels/prelu.cc b/tensorflow/lite/micro/kernels/prelu.cc index a20d2c88225..2c575269cca 100644 --- 
a/tensorflow/lite/micro/kernels/prelu.cc +++ b/tensorflow/lite/micro/kernels/prelu.cc @@ -64,14 +64,20 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, 0); const TfLiteTensor* alpha = GetInput(context, node, 1); TfLiteTensor* output = GetOutput(context, node, 0); - int32_t output_multiplier = 0; - int output_shift = 0; + int32_t output_multiplier_1 = 0; + int output_shift_1 = 0; + int32_t output_multiplier_2 = 0; + int output_shift_2 = 0; if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) { - double real_multiplier = static_cast(input->params.scale) * - static_cast(alpha->params.scale) / - static_cast(output->params.scale); - QuantizeMultiplierSmallerThanOneExp(real_multiplier, &output_multiplier, - &output_shift); + double real_multiplier_1 = static_cast(input->params.scale) * + static_cast(output->params.scale); + double real_multiplier_2 = static_cast(input->params.scale) * + static_cast(alpha->params.scale) / + static_cast(output->params.scale); + QuantizeMultiplier(real_multiplier_1, &output_multiplier_1, + &output_shift_1); + QuantizeMultiplier(real_multiplier_2, &output_multiplier_2, + &output_shift_2); } switch (input->type) { case kTfLiteFloat32: { @@ -86,8 +92,10 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { op_params.input_offset = -input->params.zero_point; op_params.alpha_offset = -alpha->params.zero_point; op_params.output_offset = output->params.zero_point; - op_params.output_multiplier = output_multiplier; - op_params.output_shift = output_shift; + op_params.output_multiplier_1 = output_multiplier_1; + op_params.output_shift_1 = output_shift_1; + op_params.output_multiplier_2 = output_multiplier_2; + op_params.output_shift_2 = output_shift_2; reference_ops::BroadcastPrelu4DSlow( op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(alpha), GetTensorData(alpha), diff --git a/tensorflow/lite/micro/kernels/prelu_test.cc b/tensorflow/lite/micro/kernels/prelu_test.cc index 4b35dac5849..d6c851a2726 100644 --- a/tensorflow/lite/micro/kernels/prelu_test.cc +++ b/tensorflow/lite/micro/kernels/prelu_test.cc @@ -154,14 +154,14 @@ TF_LITE_MICRO_TESTS_BEGIN TF_LITE_MICRO_TEST(FloatPreluActivationsOpTest) { const int output_dims_count = 12; float output_data[output_dims_count]; - tflite::testing::TestPreluFloat({4, 1, 2, 2, 3}, // input shape + tflite::testing::TestPreluFloat({1, 2, 2, 3}, // input shape { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 1.0f, 1.0f, 1.0f, // Row 1, Column 2 -1.0f, -1.0f, -1.0f, // Row 2, Column 1 -2.0f, -2.0f, -2.0f, // Row 1, Column 2 }, - {3, 1, 1, 3}, // alpha shape + {1, 1, 1, 3}, // alpha shape {0.0f, 1.0f, 2.0f}, // alpha values { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 @@ -169,7 +169,7 @@ TF_LITE_MICRO_TEST(FloatPreluActivationsOpTest) { 0.0f, -1.0f, -2.0f, // Row 2, Column 1 0.0f, -2.0f, -4.0f, // Row 1, Column 2 }, - {4, 1, 2, 2, 3}, // output shape + {1, 2, 2, 3}, // output shape output_data); } @@ -182,13 +182,13 @@ TF_LITE_MICRO_TEST(QuantizedPreluActivationsOpTest) { const int output_dims_count = 12; uint8_t output_data[output_dims_count]; tflite::testing::TestPreluQuantized( - {4, 1, 2, 2, 3}, // input shape + {1, 2, 2, 3}, // input shape {F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(-1.0f, kMin, kMax), F2Q(-1.0f, kMin, kMax), F2Q(-1.0f, kMin, kMax), F2Q(-0.25f, kMin, kMax), F2Q(-0.25f, kMin, kMax), F2Q(-0.25f, kMin, kMax)}, - kMin, 
kMax, {3, 1, 1, 3}, // alpha shape + kMin, kMax, {1, 1, 1, 3}, // alpha shape {F2Q(0.0f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(-0.5f, kMin, kMax)}, kMin, kMax, {F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), @@ -196,7 +196,7 @@ TF_LITE_MICRO_TEST(QuantizedPreluActivationsOpTest) { F2Q(0.0f, kMin, kMax), F2Q(-0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(-0.125f, kMin, kMax), F2Q(0.125f, kMin, kMax)}, - {4, 1, 2, 2, 3}, // output shape + {1, 2, 2, 3}, // output shape kMin, kMax, output_data); } diff --git a/tensorflow/lite/testing/op_tests/prelu.py b/tensorflow/lite/testing/op_tests/prelu.py index 480736a76fe..bc5875739ed 100644 --- a/tensorflow/lite/testing/op_tests/prelu.py +++ b/tensorflow/lite/testing/op_tests/prelu.py @@ -86,7 +86,7 @@ def make_prelu_tests(options): alpha_shape.append(1 if dim in shared_axes else input_shape[dim]) alpha_values = create_tensor_data( - np.float32, alpha_shape, min_value=-1, max_value=1) + np.float32, alpha_shape, min_value=-5, max_value=5) # There should be only 1 trainable variable tensor. variables = tf.compat.v1.all_variables() diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 94093ef8698..e3dbd0b839f 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -818,7 +818,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, case BuiltinOperator_PRELU: property.inputs = {{0, {}}, {1, {}}}; property.outputs = {{0, {}}}; - property.restrict_same_input_output_scale = true; + property.restrict_same_input_output_scale = false; property.version = 1; break; case BuiltinOperator_LEAKY_RELU: From 7772deb5a0261fcc04b28baa9d47dca1f5a3306f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 21:45:57 -0700 Subject: [PATCH 0247/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310681869 Change-Id: Ief592b57d27c6746a1d5b8ec8c0b1281a94e9697 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 2e6dd1948e0d2e3f39b954e3fda16922165ef05d Mon Sep 17 00:00:00 2001 From: Robert David Date: Fri, 8 May 2020 21:50:36 -0700 Subject: [PATCH 0248/1533] Remove redundant negations in HiFi mini's fully_connected implementation. 
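Background for this cleanup (the convention is assumed here, not introduced by the patch): TFLite's QuantizeMultiplier returns a Q31 significand together with a signed left-shift amount, so that real_multiplier is approximately quantized_multiplier * 2^(shift - 31). Because the shift is already signed, the exponent can be stored and consumed directly rather than negated on write and negated again on read.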
PiperOrigin-RevId: 310682194 Change-Id: I9d661cc2360d4ee2e81380d8a9fe5335d3afa625 --- .../micro/kernels/xtensa_hifimini/fully_connected.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/fully_connected.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/fully_connected.cc index c8da67b5af8..c8bba633de7 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/fully_connected.cc @@ -152,10 +152,8 @@ TfLiteStatus CalculateOpData(TfLiteContext* context, double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, input, filter, bias, output, &real_multiplier)); - int exponent; - xtensa::hifimini::QuantizeMultiplier(real_multiplier, - &data->output_multiplier, &exponent); - data->output_shift = -exponent; + xtensa::hifimini::QuantizeMultiplier( + real_multiplier, &data->output_multiplier, &data->output_shift); return CalculateActivationRangeQuantized(context, activation, output, &data->output_activation_min, &data->output_activation_max); @@ -200,8 +198,7 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, op_params.weights_offset = -filter->params.zero_point; op_params.output_offset = output->params.zero_point; op_params.output_multiplier = data.output_multiplier; - // TODO(b/138810107): Figure out whether output shift should be inverted - op_params.output_shift = -data.output_shift; + op_params.output_shift = data.output_shift; op_params.quantized_activation_min = data.output_activation_min; op_params.quantized_activation_max = data.output_activation_max; From b695497e0b11a645b0898d9a7108d085754a3c8c Mon Sep 17 00:00:00 2001 From: Robert David Date: Fri, 8 May 2020 21:51:07 -0700 Subject: [PATCH 0249/1533] Code cleanup: Use the combined left-or-right shift instruction. The Xtensa compiler probably already did this optimization, as there is absolutely no difference in the generated binary. PiperOrigin-RevId: 310682234 Change-Id: I87f93d737563acc80f7b708fccd4af6ddd467a93 --- .../lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h index 918192c4d8f..2ed3e45ece1 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h @@ -65,11 +65,7 @@ inline ae_q56s MultiplyByQuantizedMultiplier(ae_p24x2s x_24x2, ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2); // Shift right if shift amount is positive, left if shift amount is negative. - if (shift_amount >= 0) { - result_56 = AE_Q56S_SRA(result_56, shift_amount); - } else { - result_56 = AE_Q56S_SLA(result_56, -shift_amount); - } + result_56 = AE_SLAASQ56S(result_56, shift_amount); // Round off the bottom 16 bits. // Q48.0 / 2^16 -> Q32.0 aligned to 48 bits. From 5274341cbe1ded351ce5d87f3a2a0c0631ed86f3 Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Fri, 8 May 2020 22:00:26 -0700 Subject: [PATCH 0250/1533] Make tf.While work with ConcreteFunction. 
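A minimal sketch of the usage this change is meant to enable (assumed caller code, not part of the patch; functional_ops.While, tf.function and get_concrete_function are existing TensorFlow APIs):

import tensorflow as tf
from tensorflow.python.ops import functional_ops

@tf.function(input_signature=[tf.TensorSpec([], tf.int32)])
def cond(i):
  return i < 10

@tf.function(input_signature=[tf.TensorSpec([], tf.int32)])
def body(i):
  return i + 1

# Previously While expected Defun-style functions; with this change, concrete
# functions (whose .inputs list also contains captured tensors) should work,
# since the helper strips captured inputs before comparing signatures.
out = functional_ops.While([tf.constant(0)],
                           cond.get_concrete_function(),
                           body.get_concrete_function())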
PiperOrigin-RevId: 310682925 Change-Id: Ib6ce9845cb8901653f8298f540341e89a5019b91 --- tensorflow/python/ops/functional_ops.py | 39 ++++++++++++++++++------- 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index fe66e8ccdfb..8ec925824de 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -863,11 +863,24 @@ def Gradient(inputs, f, name=None): return symbolic_gradient(input=inputs, Tout=tlist, f=f, name=name) +def _GetInputDtypes(func): + """Returns the input dtypes of func, excluding dtypes for captured inputs.""" + if isinstance(func, function._DefinedFunction): # pylint: disable=protected-access + return func.declared_input_types + + # We assume that `func` is a ConcreteFunction here, but we are not able to + # verify since importing eager function library will cause cyclic dependence. + # + # ConcreteFunction.inputs includes captured inputs. + num_non_captured_inputs = len(func.inputs) - len(func.captured_inputs) + inputs_without_captured = func.inputs[:num_non_captured_inputs] + return [t.dtype for t in inputs_without_captured] + + def _LoopBodyCaptureWrapper(func): """Returns a wrapper for `func` that handles loop-carried captured inputs.""" - @function.Defun( - *func.declared_input_types, func_name="%s_Wrapper" % func.name) + @function.Defun(*_GetInputDtypes(func), func_name="%s_Wrapper" % func.name) def Wrapper(*args): """A wrapper that handles loop-carried captured inputs.""" result = func(*args) @@ -877,11 +890,11 @@ def _LoopBodyCaptureWrapper(func): if isinstance(result, ops.Operation): return extra_args # Unary functions return a single Tensor value. - elif not isinstance(result, tuple): + elif not isinstance(result, (list, tuple)): return (result,) + extra_args # N-ary functions return a tuple of Tensors. else: - return result + extra_args + return result + type(result)(extra_args) return Wrapper @@ -917,19 +930,23 @@ def While(input_, cond, body, name=None, hostmem=None): raise ValueError("While op 'cond' argument must be a function " "without implicitly captured inputs.") - if cond.declared_input_types != body.declared_input_types: + cond_input_types = _GetInputDtypes(cond) + body_input_types = _GetInputDtypes(body) + + if cond_input_types != body_input_types: raise ValueError( "While op 'cond' and 'body' signatures do not match. 
%r vs %r" % - (cond.declared_input_types, body.declared_input_types)) + (cond_input_types, body_input_types)) if body.captured_inputs: - cond_dtypes = list( - body.declared_input_types) + [t.dtype for t in body.captured_inputs] + cond_dtypes = list(body_input_types) + [ + t.dtype for t in body.captured_inputs + ] @function.Defun(*cond_dtypes, func_name="%s_Wrapper" % cond.name) def CondWrapper(*args): """A wrapper that handles loop-carried captured inputs.""" - return cond(*args[:len(body.declared_input_types)]) + return cond(*args[:len(body_input_types)]) ret = gen_functional_ops._while( input_ + body.captured_inputs, @@ -1184,8 +1201,8 @@ def partitioned_call(args, if hasattr(f, "graph"): _set_read_only_resource_inputs_attr(op, f.graph) if hasattr(f.graph, "collective_manager_ids_used"): - ops.set_int_list_attr( - op, acd.COLLECTIVE_MANAGER_IDS, f.graph.collective_manager_ids_used) + ops.set_int_list_attr(op, acd.COLLECTIVE_MANAGER_IDS, + f.graph.collective_manager_ids_used) return outputs if outputs else op From 331da14e5fbebb4323e92a6c330d0e58c09610ff Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 8 May 2020 22:03:31 -0700 Subject: [PATCH 0251/1533] Add NDEBUG to default make build. PiperOrigin-RevId: 310683267 Change-Id: Ifbced7b6db5ad1981c95203a45bb1dd9992a4589 --- tensorflow/lite/micro/tools/make/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 8599a27df52..1331163a410 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -86,8 +86,8 @@ else ifeq ($(BUILD_TYPE), release) CXXFLAGS += -DNDEBUG -O3 -DTF_LITE_STRIP_ERROR_STRINGS CCFLAGS += -DNDEBUG -O3 -DTF_LITE_STRIP_ERROR_STRINGS else - CXXFLAGS += -O3 - CCFLAGS += -O3 + CXXFLAGS += -DNDEBUG -O3 + CCFLAGS += -DNDEBUG -O3 endif # This library is the main target for this makefile. It will contain a minimal From e3bb5eee1a6bed31cbe473c1b992afab08218f01 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 8 May 2020 22:06:54 -0700 Subject: [PATCH 0252/1533] Allow Op version of 0 to match the first registered Op of that type. 
PiperOrigin-RevId: 310683602 Change-Id: I3a9d5e6d19439f989f0603437b28721d09dbaa3d --- .../lite/micro/micro_mutable_op_resolver.h | 2 + .../micro/micro_mutable_op_resolver_test.cc | 37 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index ac304352a57..2f6d4d27823 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -40,6 +40,7 @@ class MicroOpResolver : public OpResolver { const TfLiteRegistration& registration = registrations_[i]; if ((registration.builtin_code == op) && (registration.version == MicroOpResolverAnyVersion() || + version == MicroOpResolverAnyVersion() || registration.version == version)) { return ®istration; } @@ -53,6 +54,7 @@ class MicroOpResolver : public OpResolver { if ((registration.builtin_code == BuiltinOperator_CUSTOM) && (strcmp(registration.custom_name, op) == 0) && (registration.version == MicroOpResolverAnyVersion() || + version == MicroOpResolverAnyVersion() || registration.version == version)) { return ®istration; } diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc index 0619591523a..cf39994acec 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc +++ b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc @@ -137,4 +137,41 @@ TF_LITE_MICRO_TEST(TestZeroVersionRegistration) { TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); } +TF_LITE_MICRO_TEST(TestZeroModelVersion) { + using tflite::MicroOpResolver; + using tflite::OpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + MicroOpResolver<2> micro_op_resolver; + micro_op_resolver.AddCustom("mock_custom", &r, 1, 2); + TF_LITE_MICRO_EXPECT_EQ(2, micro_op_resolver.GetRegistrationLength()); + OpResolver* resolver = µ_op_resolver; + + // If the Op version in the model is 0, we should always get the first + // registration. + const TfLiteRegistration* registration = resolver->FindOp("mock_custom", 0); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + TF_LITE_MICRO_EXPECT_EQ(1, registration->version); + TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); + + // If a non-zero version is requested, the correct version'd op should be + // returned. TODO(b/151245712): Realistically, we are better off removing + // these version checks altogether. 
+ for (int i = 1; i <= 2; ++i) { + registration = resolver->FindOp("mock_custom", i); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + TF_LITE_MICRO_EXPECT_EQ(i, registration->version); + TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); + } + + registration = resolver->FindOp("mock_custom", 42); + TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); +} + TF_LITE_MICRO_TESTS_END From 44bfdefeaa6d1450a50aa27f18eb22d371773408 Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Fri, 8 May 2020 22:24:13 -0700 Subject: [PATCH 0253/1533] Internal change PiperOrigin-RevId: 310685005 Change-Id: I0dc135ad2be1c275ba7e1103c704e05a5ade5da6 --- tensorflow/compiler/mlir/tfrt/BUILD | 27 ++++++ .../mlir/tfrt/saved_model/saved_model.cc | 96 +++++++++++++++++++ .../mlir/tfrt/saved_model/saved_model.h | 75 +++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc create mode 100644 tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 88e214f601b..78787245bd6 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -128,6 +128,33 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "saved_model", + srcs = [ + "saved_model/saved_model.cc", + ], + hdrs = [ + "saved_model/saved_model.h", + ], + deps = [ + ":tf_to_corert", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "//tensorflow/core/platform:status", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@tf_runtime//:core_runtime", + "@tf_runtime//:hostcontext", + "@tf_runtime//:mlirtobef", + "@tf_runtime//:support", + "@tf_runtime//:tensor", + ], +) + cc_library( name = "compatibility_analysis", srcs = [ diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc new file mode 100644 index 00000000000..8a187cf43a8 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc @@ -0,0 +1,96 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h" + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" +#include "tfrt/bef_converter/mlir_to_bef.h" +#include "tfrt/core_runtime/core_runtime.h" +#include "tfrt/core_runtime/op_handler.h" +#include "tfrt/host_context/host_context.h" +#include "tfrt/tensor/dense_host_tensor_view.h" + +namespace tensorflow { + +void MapFunctionGlobalTensorCapturesFromTFSavedModelMLIR( + mlir::ModuleOp module, + llvm::function_ref global_tensors)> + map_fn) { + // Create global_tensors for each functions. + mlir::SymbolTable symbol_table(module); + module.walk([&symbol_table, map_fn](mlir::FuncOp func) { + // Use the exported name as the function name, and skip non-exported + // functions. + auto func_names = mlir::tf_saved_model::GetExportedNames(func); + if (func_names.empty()) return; + + // Here we walk through each arguments and find out the variables used by + // this function. + llvm::SmallVector global_tensors; + for (unsigned i = 0, e = func.getNumArguments(); i != e; ++i) { + if (auto variable = + mlir::tf_saved_model::LookupBoundInput(func, i, symbol_table)) { + global_tensors.push_back(variable); + } + } + + for (auto func_name : func_names) map_fn(func_name, global_tensors); + }); +} + +Status CompileTFSavedModelMLIRToBEF(const TFRTSavedModelCompileOptions& options, + mlir::ModuleOp module, + tfrt::AlignedBuffer<8>* bef_buffer) { + VLOG(1) << "TF Dialect: " << tensorflow::MlirModuleToString(module); + + // Lower MLIR TF Dialect to MLIR TFRT CoreRT dialect. + mlir::PassManager pm(module.getContext()); + + tensorflow::CoreRTPipelineOptions pass_options; + if (!options.default_device.empty()) { + pass_options.default_device = options.default_device; + } + if (!options.force_data_format.empty()) { + pass_options.force_data_format = options.force_data_format; + } + pass_options.enable_optimizer = options.enable_optimizer; + tensorflow::CreateTFExecutorToCoreRTPipeline(pm, pass_options); + + if (mlir::failed(pm.run(module))) + return tensorflow::errors::Internal( + "failed to lower TF Dialect to CoreRT dialect."); + + VLOG(1) << "TFRT Dialect: " << tensorflow::MlirModuleToString(module); + + auto bef = + tfrt::ConvertMLIRToBEF(module, /* disable_optional_sections = */ true); + if (bef.empty()) + return tensorflow::errors::Internal("failed to convert MLIR to BEF."); + + assert(bef_buffer); + bef_buffer->assign(bef.begin(), bef.end()); + + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h new file mode 100644 index 00000000000..de24ea20958 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h @@ -0,0 +1,75 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/core/platform/status.h" +#include "tfrt/core_runtime/tensor_handle.h" +#include "tfrt/support/aligned_buffer.h" + +namespace tfrt { +class CoreRuntime; +} + +namespace mlir { +class ModuleOp; +} + +namespace tensorflow { + +struct TFRTSavedModelCompileOptions { + // TODO(tf-runtime-team): Ideally, compiler should make the decision where + // to place the variable. + std::string variable_device = "cpu"; + std::string default_device = "cpu"; + + // Enable compiler optimization in TFRT dialect. + bool enable_optimizer = true; + + // Force data format for all layout sensitive operations, eg. setting it to + // "NHWC" will changes all data format in the graph to "NHWC" by inserting + // or removing related tf.Transpose op. Currently the supported formats are + // "NHWC" and "NCHW". + // + // TODO(tf-runtime-team): Ideally compiler should figure out whether the + // data format should be changed, instead of controlled by users. + std::string force_data_format; +}; + +// Map captured global tensors for each function. +void MapFunctionGlobalTensorCapturesFromTFSavedModelMLIR( + mlir::ModuleOp module, + llvm::function_ref< + void(llvm::StringRef func_name, + llvm::ArrayRef captures)> + map_fn); + +// Compile MLIR in TF saved model dialect into BEF. +Status CompileTFSavedModelMLIRToBEF(const TFRTSavedModelCompileOptions& options, + mlir::ModuleOp module, + tfrt::AlignedBuffer<8>* bef_buffer); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ From c14c618d85aea3fce9df87368f8a5e18c81073d1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 8 May 2020 23:45:40 -0700 Subject: [PATCH 0254/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310690982 Change-Id: Icb0f880ee70de2fb8ff29e5ab56491c70c2e9e7c --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. 
Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 68f6f46a144b400a3ec3cf532dd53fe5edefd16d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 9 May 2020 01:46:13 -0700 Subject: [PATCH 0255/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310698171 Change-Id: Ib49db3d8b7206dfb8313dc90ccb8c9a762fa7558 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From ad4dca653bc0febf562bdcf6b4dbf88919d9e4ba Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 9 May 2020 02:02:42 -0700 Subject: [PATCH 0256/1533] compat: Update forward compatibility horizon to 2020-05-09 PiperOrigin-RevId: 310698996 Change-Id: Iff9db143d389ef1e654c9fba14e2ad935c97c199 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 0feb78072e0..f0700514b36 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 8) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 9) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From efc69b7b037d0a098b9842cc5171dc57e53cfa50 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 9 May 2020 02:02:43 -0700 Subject: [PATCH 0257/1533] Update GraphDef version to 396. PiperOrigin-RevId: 310698997 Change-Id: Ia20369474c102a9b009cec2cacebc91f722f4a5e --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 915af0fc054..94395f4c2a5 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. 
#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 395 // Updated: 2020/5/8 +#define TF_GRAPH_DEF_VERSION 396 // Updated: 2020/5/9 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 728dc291a9378872304f45857838b96666f8c741 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 9 May 2020 03:45:58 -0700 Subject: [PATCH 0258/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310704760 Change-Id: I1339bbb0fe5ad42e3a4ffcb69228a84d590e5d2a --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 3f37c0e264bda211a5c16d6bc7d97f40e2d8d68c Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Sat, 9 May 2020 17:27:08 +0530 Subject: [PATCH 0259/1533] Added no_oss_py2 tag and changed srcs_version for tests to PY3 Signed-off-by: Shraiysh Vaishay --- tensorflow/python/tf_program/tests/BUILD | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/tf_program/tests/BUILD b/tensorflow/python/tf_program/tests/BUILD index 6b62d5528ec..2d36e4b295f 100644 --- a/tensorflow/python/tf_program/tests/BUILD +++ b/tensorflow/python/tf_program/tests/BUILD @@ -20,8 +20,11 @@ py_test( testonly = True, srcs = ["mlir_gen_test.py"], python_version = "PY3", - srcs_version = "PY2AND3", - tags = ["no_pip"], + srcs_version = "PY3", + tags = [ + "no_oss_py2", + "no_pip", + ], deps = [ ":filecheck_wrapper", "//tensorflow/python:client_testlib", From f87b2f87f62d424941ea6c26a33a7e85be1bbc6e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 9 May 2020 05:47:36 -0700 Subject: [PATCH 0260/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310711617 Change-Id: Ief6657ce25434e893087726dd71fcb0fb762f8eb --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. 
Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 6bb6011359cb19272c128f0e805e435e6f9a5187 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Sat, 9 May 2020 07:02:39 -0700 Subject: [PATCH 0261/1533] Enable MLIR saved model import by default in TFLiteConverterV2's saved model API PiperOrigin-RevId: 310715565 Change-Id: I5d517be4c51457f7203f79cf8649a2f71eefe310 --- tensorflow/lite/python/lite.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index ce59c56a1d0..99be58f4376 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -386,8 +386,13 @@ class TFLiteConverterBase(object): return True return False - def _parse_saved_model_args(self): - """Parses SavedModel arguments from the given Keras/RNN SavedModel.""" + def _parse_saved_model_args(self, always_enable_saved_model_import=False): + """Parses SavedModel arguments from the given Keras/RNN SavedModel. + + Args: + always_enable_saved_model_import: Bool. When the value is true, it enables + MLIR saved model import path regardless of checking the conditions. + """ if not self.experimental_new_converter: self.saved_model_dir = None return @@ -400,16 +405,17 @@ class TFLiteConverterBase(object): # frozen graph def path. self.saved_model_dir = None return - if not self._contains_function_with_implements_attr(saved_model_proto): + if (not always_enable_saved_model_import and + not self._contains_function_with_implements_attr(saved_model_proto)): self.saved_model_dir = None - else: - if not self._saved_model_exported_names: - self._saved_model_exported_names = [] - self._saved_model_version = saved_model_proto.saved_model_schema_version - if self._saved_model_version not in [1, 2]: - raise ValueError( - "SavedModel file format({0}) is not supported".format( - self._saved_model_version)) + return + + if not self._saved_model_exported_names: + self._saved_model_exported_names = [] + self._saved_model_version = saved_model_proto.saved_model_schema_version + if self._saved_model_version not in [1, 2]: + raise ValueError("SavedModel file format({0}) is not supported".format( + self._saved_model_version)) class TFLiteConverterBaseV2(TFLiteConverterBase): @@ -542,7 +548,7 @@ class TFLiteSavedModelConverterV2(TFLiteConverterBaseV2): self._saved_model_tags = saved_model_tags self._saved_model_exported_names = saved_model_exported_names self._trackable_obj = trackable_obj - self._parse_saved_model_args() + self._parse_saved_model_args(always_enable_saved_model_import=True) def convert(self): """Converts a TensorFlow GraphDef based on instance variables. 
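The patch above makes the TFLiteConverterV2 SavedModel path go through the MLIR-based SavedModel importer whenever the new converter is in use. As a rough sketch of how that code path is exercised from user code (not part of the patch; the SavedModel path and the explicit `experimental_new_converter` assignment are illustrative, since the flag is already on by default in builds that include this change):

    import tensorflow as tf

    # Convert a SavedModel with the V2 converter API; with this change the
    # MLIR SavedModel importer handles the model when the new converter is
    # enabled.
    converter = tf.lite.TFLiteConverter.from_saved_model("/tmp/my_saved_model")
    converter.experimental_new_converter = True  # explicit here for clarity
    tflite_model = converter.convert()

    # Write the flatbuffer produced by the converter to disk.
    with open("/tmp/model.tflite", "wb") as f:
        f.write(tflite_model)
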
From de5b0cfd434c7a9f848ab100b70a6be16e48280b Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Sat, 9 May 2020 12:19:01 -0700 Subject: [PATCH 0262/1533] [XLA] Introducing AllGather HLO and a decomposition pass PiperOrigin-RevId: 310732059 Change-Id: I637b2622464f60410f8085e810fb2ae3ee74a66c --- tensorflow/compiler/xla/client/xla_builder.cc | 43 +++++ tensorflow/compiler/xla/client/xla_builder.h | 16 ++ .../compiler/xla/client/xla_builder_test.cc | 12 ++ tensorflow/compiler/xla/service/BUILD | 36 ++++ .../xla/service/all_gather_decomposer.cc | 154 +++++++++++++++++ .../xla/service/all_gather_decomposer.h | 51 ++++++ .../xla/service/all_gather_decomposer_test.cc | 161 ++++++++++++++++++ .../compiler/xla/service/bfloat16_support.cc | 1 + .../compiler/xla/service/dfs_hlo_visitor.h | 1 + .../service/dfs_hlo_visitor_with_default.h | 3 + .../compiler/xla/service/hlo_cost_analysis.cc | 4 + .../compiler/xla/service/hlo_cost_analysis.h | 1 + tensorflow/compiler/xla/service/hlo_dce.cc | 8 +- .../compiler/xla/service/hlo_graph_dumper.cc | 1 + .../compiler/xla/service/hlo_instruction.cc | 31 ++++ .../compiler/xla/service/hlo_instruction.h | 10 ++ .../compiler/xla/service/hlo_instructions.cc | 45 +++++ .../compiler/xla/service/hlo_instructions.h | 32 ++++ tensorflow/compiler/xla/service/hlo_opcode.h | 1 + tensorflow/compiler/xla/service/hlo_parser.cc | 29 ++++ .../compiler/xla/service/hlo_parser_test.cc | 37 ++++ .../compiler/xla/service/hlo_verifier.cc | 34 ++++ .../compiler/xla/service/hlo_verifier.h | 1 + .../xla/service/instruction_fusion.cc | 1 + .../compiler/xla/service/layout_assignment.cc | 1 + .../compiler/xla/service/shape_inference.cc | 11 ++ .../compiler/xla/service/shape_inference.h | 6 + 27 files changed, 726 insertions(+), 5 deletions(-) create mode 100644 tensorflow/compiler/xla/service/all_gather_decomposer.cc create mode 100644 tensorflow/compiler/xla/service/all_gather_decomposer.h create mode 100644 tensorflow/compiler/xla/service/all_gather_decomposer_test.cc diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index a779086f1d5..0b146a43e64 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -2203,6 +2203,39 @@ XlaOp XlaBuilder::BatchNormGrad(XlaOp operand, XlaOp scale, XlaOp batch_mean, }); } +XlaOp XlaBuilder::AllGather(XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout) { + return ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); + + TF_ASSIGN_OR_RETURN(Shape inferred_shape, + ShapeInference::InferAllGatherShape( + *operand_shape, all_gather_dimension, shard_count)); + if (layout) { + *inferred_shape.mutable_layout() = *layout; + instr.set_constrain_layout(true); + } + *instr.mutable_shape() = inferred_shape.ToProto(); + + instr.add_dimensions(all_gather_dimension); + for (const ReplicaGroup& group : replica_groups) { + *instr.add_replica_groups() = group; + } + if (channel_id.has_value()) { + instr.set_channel_id(channel_id->handle()); + } + + TF_ASSIGN_OR_RETURN( + auto all_gather, + AddInstruction(std::move(instr), HloOpcode::kAllGather, {operand})); + return all_gather; + }); +} + XlaOp XlaBuilder::CrossReplicaSum( XlaOp operand, absl::Span replica_groups) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -3470,6 +3503,16 @@ XlaOp 
ReduceWindowWithGeneralPadding( base_dilations, window_dilations, padding); } +XlaOp AllGather(const XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout) { + return operand.builder()->AllGather(operand, all_gather_dimension, + shard_count, replica_groups, channel_id, + layout); +} + XlaOp CrossReplicaSum(const XlaOp operand, absl::Span replica_groups) { return operand.builder()->CrossReplicaSum(operand, replica_groups); diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 2ab4c575862..bfb97d7721f 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -549,6 +549,12 @@ class XlaBuilder { XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups = {}); + XlaOp AllGather( + XlaOp operand, int64 all_gather_dimension, int64 shard_count, + absl::Span replica_groups = {}, + const absl::optional& channel_id = absl::nullopt, + const absl::optional& layout = absl::nullopt); + XlaOp AllReduce( XlaOp operand, const XlaComputation& computation, absl::Span replica_groups = {}, @@ -992,6 +998,11 @@ class XlaBuilder { absl::Span> padding); friend XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups); + friend XlaOp AllGather(XlaOp operand, int64 all_gather_dimension, + int64 shard_count, + absl::Span replica_groups, + const absl::optional& channel_id, + const absl::optional& layout); friend XlaOp AllReduce(XlaOp operand, const XlaComputation& computation, absl::Span replica_groups, const absl::optional& channel_id, @@ -1771,6 +1782,11 @@ XlaOp ReduceWindowWithGeneralPadding( XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups = {}); +XlaOp AllGather(XlaOp operand, int64 all_gather_dimension, int64 shard_count, + absl::Span replica_groups = {}, + const absl::optional& channel_id = absl::nullopt, + const absl::optional& layout = absl::nullopt); + // Enqueues an operation that do an AllReduce of the operand cross cores. Here // AllReduce means doing a reduction on the input operand cross cores and then // broadcasting the reduction result to those cores. 
The reduction function is diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 1fa839b2014..e1733cd179c 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -381,6 +381,18 @@ TEST_F(XlaBuilderTest, Transpose) { EXPECT_THAT(root, op::Transpose(op::Parameter())); } +TEST_F(XlaBuilderTest, AllGather) { + XlaBuilder b(TestName()); + auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x"); + AllGather(x, /*all_gather_dimension=*/1, /*shard_count=*/4); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + + EXPECT_EQ(root->opcode(), HloOpcode::kAllGather); + EXPECT_TRUE( + ShapeUtil::Equal(root->shape(), ShapeUtil::MakeShape(F32, {4, 64}))); +} + TEST_F(XlaBuilderTest, AllToAll) { XlaBuilder b(TestName()); auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {4, 16}), "x"); diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 348200051ef..499c4e25828 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -2397,6 +2397,42 @@ tf_cc_test( ], ) +cc_library( + name = "all_gather_decomposer", + srcs = ["all_gather_decomposer.cc"], + hdrs = ["all_gather_decomposer.h"], + deps = [ + ":hlo", + ":hlo_casting_utils", + ":hlo_pass", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + ], +) + +tf_cc_test( + name = "all_gather_decomposer_test", + srcs = ["all_gather_decomposer_test.cc"], + deps = [ + ":all_gather_decomposer", + ":hlo", + ":hlo_matchers", + ":hlo_parser", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + cc_library( name = "tuple_simplifier", srcs = ["tuple_simplifier.cc"], diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer.cc b/tensorflow/compiler/xla/service/all_gather_decomposer.cc new file mode 100644 index 00000000000..ad63218eca8 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer.cc @@ -0,0 +1,154 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/all_gather_decomposer.h" + +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +// Creates a computation of x + y. +HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module) { + HloComputation::Builder sum_b("add"); + auto x = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(type, {}), "x")); + auto y = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(type, {}), "y")); + if (type == PRED) { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kOr, x, y)); + } else { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kAdd, x, y)); + } + HloComputation* reduction = module->AddEmbeddedComputation(sum_b.Build()); + return reduction; +} + +Status DecomposeAllGather(HloAllGatherInstruction* ag, int64 partition_count, + HloComputation* comp) { + auto zero = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(ag->shape().element_type()))); + zero = comp->AddInstruction( + HloInstruction::CreateBroadcast(ag->shape(), zero, {})); + auto zero_index = comp->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector start_indices(ag->shape().rank(), zero_index); + auto shard_id_from_subgroup = [&](HloInstruction* replica_or_global_id) { + if (ag->replica_groups().empty()) { + return replica_or_global_id; + } + if (ag->replica_groups().size() == 1) { + // Whether the group is {1, 2, ..., N - 1}. + bool trivial_group = true; + for (int64 i = 0; i < ag->replica_groups()[0].replica_ids_size(); ++i) { + if (ag->replica_groups()[0].replica_ids(i) != i) { + trivial_group = false; + break; + } + } + if (trivial_group) { + CHECK_EQ(partition_count, ag->replica_groups()[0].replica_ids_size()); + return replica_or_global_id; + } + } + // Create a table of shard IDs for each replica_or_global_id, then slice it + // using replica_or_global_id. 
+ std::vector shard_ids(ag->replica_groups().size() * + ag->replica_groups()[0].replica_ids_size()); + for (const auto& group : ag->replica_groups()) { + for (int64 i = 0; i < group.replica_ids_size(); ++i) { + shard_ids[group.replica_ids(i)] = i; + } + } + auto id_table = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(shard_ids))); + auto shard_id = comp->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1}), id_table, {replica_or_global_id}, {1})); + shard_id = comp->AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), shard_id)); + return shard_id; + }; + HloInstruction* shard_id; + if (ag->channel_id().has_value()) { + if (ag->use_global_device_ids()) { + auto pid = comp->AddInstruction(HloInstruction::CreatePartitionId()); + auto rid = comp->AddInstruction(HloInstruction::CreateReplicaId()); + auto pcount = comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(partition_count))); + auto global_id = comp->AddInstruction(HloInstruction::CreateBinary( + pid->shape(), HloOpcode::kAdd, pid, + comp->AddInstruction(HloInstruction::CreateBinary( + pid->shape(), HloOpcode::kMultiply, rid, pcount)))); + shard_id = shard_id_from_subgroup(global_id); + } else { + TF_RET_CHECK(!ag->replica_groups().empty()); + TF_RET_CHECK(ag->replica_groups()[0].replica_ids_size() == 1); + shard_id = comp->AddInstruction(HloInstruction::CreatePartitionId()); + } + } else { + shard_id = shard_id_from_subgroup( + comp->AddInstruction(HloInstruction::CreateReplicaId())); + } + start_indices[ag->all_gather_dimension()] = + comp->AddInstruction(HloInstruction::CreateBinary( + shard_id->shape(), HloOpcode::kMultiply, shard_id, + comp->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(ag->operand(0)->shape().dimensions( + ag->all_gather_dimension())))))); + auto dus = comp->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + zero->shape(), zero, ag->mutable_operand(0), start_indices)); + auto ar = comp->AddInstruction(HloInstruction::CreateAllReduce( + dus->shape(), {dus}, + MakeBinaryAdd(dus->shape().element_type(), comp->parent()), + ag->replica_groups(), + /*constrain_layout=*/ag->constrain_layout(), ag->channel_id(), + ag->use_global_device_ids())); + TF_RETURN_IF_ERROR(ag->ReplaceAllUsesWith(ar)); + TF_RETURN_IF_ERROR(comp->RemoveInstructionAndUnusedOperands(ag)); + return Status::OK(); +} + +StatusOr AllGatherDecomposer::Run(HloModule* module) { + bool changed = false; + for (auto comp : module->MakeNonfusionComputations()) { + for (auto hlo : comp->MakeInstructionPostOrder()) { + if (hlo->opcode() != HloOpcode::kAllGather) { + continue; + } + auto ag = Cast(hlo); + if (should_decompose_(*ag)) { + TF_RETURN_IF_ERROR(DecomposeAllGather(ag, partition_count_, comp)); + changed = true; + } + } + } + return changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer.h b/tensorflow/compiler/xla/service/all_gather_decomposer.h new file mode 100644 index 00000000000..d1983e37383 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer.h @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ + +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// AllGatherDecomposer is a pass which converts unsupported all-gathers into +// dynamic-update-slices and all-reduces. +class AllGatherDecomposer : public HloModulePass { + public: + AllGatherDecomposer( + std::function should_decompose, + int64 partition_count) + : should_decompose_(std::move(should_decompose)), + partition_count_(partition_count) {} + explicit AllGatherDecomposer(int64 partition_count) + : should_decompose_( + [](const HloAllGatherInstruction& ag) { return true; }), + partition_count_(partition_count) {} + absl::string_view name() const override { return "all_gather_decomposer"; } + + // Run AllGatherDecomposer pass on computations in 'module'. + // Returns whether the 'module' was changed. + StatusOr Run(HloModule* module) override; + + private: + std::function should_decompose_; + int64 partition_count_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_ALL_GATHER_DECOMPOSER_H_ diff --git a/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc b/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc new file mode 100644 index 00000000000..ebcd66ffa07 --- /dev/null +++ b/tensorflow/compiler/xla/service/all_gather_decomposer_test.cc @@ -0,0 +1,161 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/all_gather_decomposer.h" + +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using ::testing::AllOf; +namespace op = xla::testing::opcode_matchers; +using AllGatherDecomposerTest = HloTestBase; + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGather) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={}, dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer(/*partition_count=*/4); + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::ReplicaId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossPartitionAllGather) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={{0}}, channel_id=1, + dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer(/*partition_count=*/4); + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::PartitionId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithTrivialGroup) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), replica_groups={{0,1,2,3}}, + dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer(/*partition_count=*/4); + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT( + module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), op::Constant(), + op::Multiply(op::ReplicaId(), op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithSubgroups) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), + replica_groups={{2,1,0,3}, {4,6,7,5}}, dimensions={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + 
ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer(/*partition_count=*/4); + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + auto id = + AllOf(op::Shape("s32[]"), + op::Reshape(op::DynamicSlice(op::Constant(), op::ReplicaId()))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), + op::Constant(), op::Multiply(id, op::Constant())))); +} + +TEST_F(AllGatherDecomposerTest, CrossReplicaAllGatherWithSubgroupsGlobalIds) { + const string module_str = R"( +HloModule module + +ENTRY entry { + param0 = f32[10,20] parameter(0) + ROOT ag = f32[10,80] all-gather(param0), + replica_groups={{2,1,0,3}, {4,6,7,5}}, dimensions={1}, channel_id=1, + use_global_device_ids=true +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnUnverifiedModule((module_str))); + AllGatherDecomposer decomposer(/*partition_count=*/4); + TF_ASSERT_OK_AND_ASSIGN(bool changed, decomposer.Run(module.get())); + EXPECT_TRUE(changed); + LOG(ERROR) << module->ToString(); + auto global_id = + op::Add(op::PartitionId(), op::Multiply(op::ReplicaId(), op::Constant())); + auto id = AllOf(op::Shape("s32[]"), + op::Reshape(op::DynamicSlice(op::Constant(), global_id))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(op::Constant()), op::Parameter(0), + op::Constant(), op::Multiply(id, op::Constant())))); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/bfloat16_support.cc b/tensorflow/compiler/xla/service/bfloat16_support.cc index abb695fa486..30d764225c2 100644 --- a/tensorflow/compiler/xla/service/bfloat16_support.cc +++ b/tensorflow/compiler/xla/service/bfloat16_support.cc @@ -79,6 +79,7 @@ bool BFloat16Support::EffectiveOperandPrecisionIsOutputPrecision( const HloInstruction& hlo, int64 operand_index) { switch (hlo.opcode()) { case HloOpcode::kAbs: + case HloOpcode::kAllGather: case HloOpcode::kAllToAll: case HloOpcode::kBroadcast: case HloOpcode::kClamp: diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index cadea620ec6..caea9d9095a 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -116,6 +116,7 @@ class DfsHloVisitorBase { virtual Status HandleFft(HloInstructionPtr fft) = 0; virtual Status HandleTriangularSolve(HloInstructionPtr hlo) = 0; virtual Status HandleCholesky(HloInstructionPtr hlo) = 0; + virtual Status HandleAllGather(HloInstructionPtr hlo) = 0; virtual Status HandleAllReduce(HloInstructionPtr hlo) = 0; virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0; virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index baa9240fb56..9cd220245ba 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -98,6 +98,9 @@ class DfsHloVisitorWithDefaultBase Status HandleCholesky(HloInstructionPtr hlo) override { return DefaultAction(hlo); } + Status HandleAllGather(HloInstructionPtr crs) override { + return DefaultAction(crs); + } Status HandleAllReduce(HloInstructionPtr crs) override { return DefaultAction(crs); } diff --git 
a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 94a4df43cf4..32a9038b15a 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -707,6 +707,10 @@ Status HloCostAnalysis::HandleCholesky(const HloInstruction* hlo) { return Status::OK(); } +Status HloCostAnalysis::HandleAllGather(const HloInstruction* hlo) { + return Status::OK(); +} + Status HloCostAnalysis::HandleAllReduce(const HloInstruction* crs) { // We assume 2 replicas, so that each output element is the sum of two input // elements. diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 915c4dcbe84..9fdb42185fb 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -76,6 +76,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleFft(const HloInstruction* fft) override; Status HandleTriangularSolve(const HloInstruction* hlo) override; Status HandleCholesky(const HloInstruction* hlo) override; + Status HandleAllGather(const HloInstruction* hlo) override; Status HandleAllReduce(const HloInstruction* crs) override; Status HandleAllToAll(const HloInstruction* hlo) override; Status HandleCollectivePermute(const HloInstruction* hlo) override; diff --git a/tensorflow/compiler/xla/service/hlo_dce.cc b/tensorflow/compiler/xla/service/hlo_dce.cc index b8e3f83b515..900b557b4dc 100644 --- a/tensorflow/compiler/xla/service/hlo_dce.cc +++ b/tensorflow/compiler/xla/service/hlo_dce.cc @@ -47,16 +47,14 @@ StatusOr HloDCE::RunOnComputation( // computation's instruction while simultaneously removing instructions. std::vector dead_roots; for (auto* instruction : computation->instructions()) { + auto maybe_collective_op = DynCast(instruction); if (instruction != computation->root_instruction() && instruction->user_count() == 0 && computation->IsSafelyRemovable(instruction) && (!instruction->HasSideEffect() || (remove_cross_partition_collective_ops && - ((instruction->opcode() == HloOpcode::kAllReduce && - !Cast(instruction)->constrain_layout()) || - (instruction->opcode() == HloOpcode::kAllToAll && - !Cast(instruction)->constrain_layout()) || - instruction->opcode() == HloOpcode::kCollectivePermute)))) { + (maybe_collective_op != nullptr && + !maybe_collective_op->constrain_layout())))) { dead_roots.push_back(instruction); } } diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 47a455ac3f4..cd2a61d7eff 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1057,6 +1057,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kGetDimensionSize: case HloOpcode::kSetDimensionSize: return kGray; + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 27fac19587e..9e9c8b0913b 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -388,6 +388,24 @@ StatusOr> HloInstruction::CreateFromProto( proto.outfeed_config()); break; } + case HloOpcode::kAllGather: { + absl::optional channel_id; + if (proto.channel_id() > 0) { + channel_id = 
proto.channel_id(); + } + + TF_RET_CHECK(proto.dimensions_size() == 1) + << "AllGather cannot have more than 1 all-gather dimensions"; + TF_RET_CHECK(all_operands().size() == 1) + << "AllGather must have a single operand"; + int64 all_gather_dimension = proto.dimensions(0); + instruction = CreateAllGather( + shape, operands(0), all_gather_dimension, + std::vector(proto.replica_groups().begin(), + proto.replica_groups().end()), + proto.constrain_layout(), channel_id, proto.use_global_device_ids()); + break; + } case HloOpcode::kAllReduce: { TF_RET_CHECK(proto.called_computation_ids_size() == 1) << "AllReduce should have 1 called computation but sees " @@ -929,6 +947,15 @@ HloInstruction::CreateReducePrecision(const Shape& shape, shape, operand, exponent_bits, mantissa_bits); } +/* static */ std::unique_ptr HloInstruction::CreateAllGather( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids) { + return absl::make_unique( + shape, operand, all_gather_dimension, replica_groups, constrain_layout, + channel_id, use_global_device_ids); +} + /* static */ std::unique_ptr HloInstruction::CreateAllReduce( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, @@ -1518,6 +1545,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kParameter: case HloOpcode::kGetTupleElement: case HloOpcode::kReducePrecision: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: @@ -1997,6 +2025,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kReducePrecision: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: @@ -2851,6 +2880,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleConvolution(this); case HloOpcode::kFft: return visitor->HandleFft(this); + case HloOpcode::kAllGather: + return visitor->HandleAllGather(this); case HloOpcode::kAllReduce: return visitor->HandleAllReduce(this); case HloOpcode::kAllToAll: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 923138862a7..8be7a034877 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -618,6 +618,16 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, const int exponent_bits, const int mantissa_bits); + // Creates an all-gather op, which concats the operands of all participants + // along all_gather_dimension. The replica_groups, channel_id, and + // use_global_device_ids arguments are identical to those in all-reduce, + // except that the order of the group members determines the concatenation + // order of inputs from different participants. + static std::unique_ptr CreateAllGather( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids); + // Creates a cross replica reduction op. // // `reduction_computation`: the reduction function. 
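The comment on CreateAllGather above describes all-gather as concatenating every participant's operand along all_gather_dimension, in replica-group order, and the AllGatherDecomposer pass introduced earlier lowers it to a dynamic-update-slice into a zero buffer followed by an all-reduce. The following NumPy sketch is an editorial illustration of why the two formulations agree, using the f32[10,20] -> f32[10,80] shapes from the tests above; the helper names are made up and are not part of the XLA code base.

import numpy as np

def reference_all_gather(shards, dim):
    # Concatenate each participant's operand along `dim`, in the order the
    # participants appear in the replica group.
    return np.concatenate(shards, axis=dim)

def decomposed_all_gather(shards, dim):
    # Each participant writes its shard into a zero buffer at offset
    # participant_id * shard_size (broadcast + dynamic-update-slice); an
    # elementwise sum over participants (all-reduce) then reproduces the
    # concatenation, because the written regions are disjoint.
    shard_size = shards[0].shape[dim]
    out_shape = list(shards[0].shape)
    out_shape[dim] = shard_size * len(shards)
    padded = []
    for pid, shard in enumerate(shards):
        buf = np.zeros(out_shape, dtype=shard.dtype)
        index = [slice(None)] * shard.ndim
        index[dim] = slice(pid * shard_size, (pid + 1) * shard_size)
        buf[tuple(index)] = shard
        padded.append(buf)
    return np.sum(padded, axis=0)

shards = [np.full((10, 20), p, np.float32) for p in range(4)]
assert reference_all_gather(shards, 1).shape == (10, 80)
assert np.array_equal(reference_all_gather(shards, 1),
                      decomposed_all_gather(shards, 1))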
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index eb821d40e78..d5bdd674563 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -556,6 +556,51 @@ bool HloCollectiveInstruction::IdenticalSlowPath( }); } +HloAllGatherInstruction::HloAllGatherInstruction( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids) + : HloCollectiveInstruction(HloOpcode::kAllGather, shape, {operand}, + replica_groups, constrain_layout, channel_id), + all_gather_dimension_(all_gather_dimension), + use_global_device_ids_(use_global_device_ids) {} + +std::vector HloAllGatherInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector result = + HloCollectiveInstruction::ExtraAttributesToStringImpl(options); + result.push_back(StrCat("dimensions={", all_gather_dimension_, "}")); + if (use_global_device_ids_) { + result.push_back("use_global_device_ids=true"); + } + return result; +} + +std::unique_ptr +HloAllGatherInstruction::CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* /*context*/) const { + return absl::make_unique( + shape, new_operands[0], all_gather_dimension(), replica_groups(), + constrain_layout(), channel_id(), use_global_device_ids()); +} + +HloInstructionProto HloAllGatherInstruction::ToProto() const { + HloInstructionProto proto = HloCollectiveInstruction::ToProto(); + proto.add_dimensions(all_gather_dimension_); + return proto; +} + +bool HloAllGatherInstruction::IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const { + const auto& casted_other = static_cast(other); + return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + all_gather_dimension_ == casted_other.all_gather_dimension() && + use_global_device_ids() == casted_other.use_global_device_ids(); +} + HloAllReduceInstruction::HloAllReduceInstruction( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index eecd02d891e..ae78d365cfa 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -348,6 +348,38 @@ class HloCollectiveInstruction : public HloChannelInstruction { bool constrain_layout_; }; +class HloAllGatherInstruction : public HloCollectiveInstruction { + public: + explicit HloAllGatherInstruction( + const Shape& shape, HloInstruction* operand, int64 all_gather_dimension, + const std::vector& replica_groups, bool constrain_layout, + const absl::optional& channel_id, bool use_global_device_ids); + // Same as HloAllReduceInstruction::use_global_device_ids. + bool use_global_device_ids() const { return use_global_device_ids_; } + + // The dimension on which data from different participants are concatenated. 
+ int64 all_gather_dimension() const { return all_gather_dimension_; } + + protected: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + HloInstructionProto ToProto() const override; + + private: + bool IdenticalSlowPath( + const HloInstruction& other, + const std::function& + eq_computations) const override; + + // Implementation for non-common logic of CloneWithNewOperands. + std::unique_ptr CloneWithNewOperandsImpl( + const Shape& shape, absl::Span new_operands, + HloCloneContext* context) const override; + + int64 all_gather_dimension_; + bool use_global_device_ids_; +}; + class HloAllReduceInstruction : public HloCollectiveInstruction { public: explicit HloAllReduceInstruction( diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 2d66237de59..664fa10a990 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -48,6 +48,7 @@ namespace xla { V(kAdd, "add", 2) \ V(kAddDependency, "add-dependency", 2) \ V(kAfterAll, "after-all", kHloOpcodeIsVariadic) \ + V(kAllGather, "all-gather", 1) \ V(kAllReduce, "all-reduce", kHloOpcodeIsVariadic) \ V(kAllToAll, "all-to-all", kHloOpcodeIsVariadic) \ V(kAtan2, "atan2", 2) \ diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index a9c3cacc4c4..2a90c95850c 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -850,6 +850,35 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, HloInstruction::CreateBitcastConvert(shape, operands[0])); break; } + case HloOpcode::kAllGather: { + optional>> tmp_groups; + optional> replica_group_ids; + optional channel_id; + optional> dimensions; + optional constrain_layout; + optional use_global_device_ids; + attrs["replica_groups"] = {/*required=*/false, + AttrTy::kBracedInt64ListList, &tmp_groups}; + attrs["channel_id"] = {/*required=*/false, AttrTy::kInt64, &channel_id}; + attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, + &dimensions}; + attrs["constrain_layout"] = {/*required=*/false, AttrTy::kBool, + &constrain_layout}; + attrs["use_global_device_ids"] = {/*required=*/false, AttrTy::kBool, + &use_global_device_ids}; + if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { + return false; + } + std::vector replica_groups; + if (tmp_groups) { + replica_groups = CreateReplicaGroups(*tmp_groups); + } + instruction = builder->AddInstruction(HloInstruction::CreateAllGather( + shape, operands[0], dimensions->at(0), replica_groups, + constrain_layout ? *constrain_layout : false, channel_id, + use_global_device_ids ? 
*use_global_device_ids : false)); + break; + } case HloOpcode::kAllReduce: { optional>> tmp_groups; optional to_apply; diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 7e66b4e648d..e18014a3071 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -1480,6 +1480,43 @@ ENTRY CRS { )" }, +// all-gather +{ +"AllGather", +R"(HloModule AllGather + +ENTRY AllGather { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,128]{0,1} all-gather(input), replica_groups={}, dimensions={1} +} + +)" +}, +// all-gather with constrained layout +{ +"AllGatherWithLayout", +R"(HloModule AllGather + +ENTRY AllGather { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,128]{0,1} all-gather(input), replica_groups={}, constrain_layout=true, dimensions={1} +} + +)" +}, +// all-gather with subgroups +{ +"AllGatherWithSubgroups", +R"(HloModule AllGatherWithSubgroups + +ENTRY AllGatherWithSubgroups { + input = f32[128,32]{0,1} parameter(0) + ROOT ag = f32[128,64]{0,1} all-gather(input), replica_groups={{0,1},{2,3}}, dimensions={1} +} + +)", +/*replica_count=*/4, +}, // all-to-all { "AllToAll", diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 91f51ea33cf..360c8e50d55 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -236,6 +236,40 @@ static Status CheckReplicaGroups(HloInstruction* hlo) { return Status::OK(); } +Status ShapeVerifier::HandleAllGather(HloInstruction* hlo) { + auto ag = Cast(hlo); + TF_RETURN_IF_ERROR(CheckReplicaGroups(ag)); + TF_RET_CHECK(ag->all_gather_dimension() >= 0); + TF_RET_CHECK(ag->all_gather_dimension() < ag->shape().rank()); + TF_RET_CHECK(ag->all_gather_dimension() < ag->operand(0)->shape().rank()); + if (ag->use_global_device_ids() && ag->replica_groups().empty()) { + return InternalError( + "Replica group must be specified when use_global_device_ids is true"); + } + + int64 shard_count = CeilOfRatio( + ag->shape().dimensions(ag->all_gather_dimension()), + ag->operand(0)->shape().dimensions(ag->all_gather_dimension())); + if (ag->channel_id().has_value()) { + if (ag->use_global_device_ids()) { + TF_RET_CHECK(shard_count == ag->replica_groups()[0].replica_ids_size()); + } else { + if (ag->replica_groups().empty() || + ag->replica_groups()[0].replica_ids_size() != 1) { + return InternalError( + "Replica group size must be 1 when use_global_device_ids is " + "false if the all-gather is also cross-partition"); + } + } + } else if (!ag->replica_groups().empty()) { + // Cross-replica all-gather: shard count is subgroup size. 
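+    // For example, an f32[10,20] operand all-gathered into f32[10,80] along
+    // dimension 1 gives shard_count = CeilOfRatio(80, 20) = 4, which must
+    // equal the size of each replica group (e.g. {0,1,2,3}).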
+ TF_RET_CHECK(shard_count == ag->replica_groups()[0].replica_ids_size()); + } + return CheckShape(ag, ShapeInference::InferAllGatherShape( + ag->operand(0)->shape(), ag->all_gather_dimension(), + shard_count)); +} + Status ShapeVerifier::HandleAllReduce(HloInstruction* crs) { TF_RETURN_IF_ERROR(CheckReplicaGroups(crs)); diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 2e83361a591..7a2d3dc2e6c 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -56,6 +56,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleFft(HloInstruction* fft) override; Status HandleCholesky(HloInstruction* hlo) override; Status HandleTriangularSolve(HloInstruction* hlo) override; + Status HandleAllGather(HloInstruction* hlo) override; Status HandleAllReduce(HloInstruction* crs) override; Status HandleAllToAll(HloInstruction* hlo) override; Status HandleCollectivePermute(HloInstruction* hlo) override; diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 99242c9ca21..1bc3d24274c 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -145,6 +145,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kCholesky: case HloOpcode::kConditional: case HloOpcode::kConvolution: + case HloOpcode::kAllGather: case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index a67c677bd03..84654bf3213 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -2179,6 +2179,7 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kConditional: case HloOpcode::kConvert: case HloOpcode::kCos: + case HloOpcode::kAllGather: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: case HloOpcode::kDivide: diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index f3c8eec1751..8d6ef9faba9 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -1999,6 +1999,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return a; } +/* static */ StatusOr ShapeInference::InferAllGatherShape( + const Shape& operand_shape, int64 all_gather_dimension, int64 shard_count) { + TF_RET_CHECK(all_gather_dimension > 0); + TF_RET_CHECK(all_gather_dimension < operand_shape.rank()); + TF_RET_CHECK(shard_count > 0); + auto shape = operand_shape; + shape.set_dimensions(all_gather_dimension, + shard_count * shape.dimensions(all_gather_dimension)); + return shape; +} + /* static */ StatusOr ShapeInference::InferAllReduceShape( absl::Span operand_shapes) { for (const Shape* operand_shape : operand_shapes) { diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h index 2e96a77aa22..2cb5930d098 100644 --- a/tensorflow/compiler/xla/service/shape_inference.h +++ b/tensorflow/compiler/xla/service/shape_inference.h @@ -123,6 +123,12 @@ class ShapeInference { // Infers the shape produced by the given triangular solve operation. 
static StatusOr InferCholeskyShape(const Shape& a); + // Infers the shape produced by an all-gather with the given operand shape, + // concat dimension, and shard count. + static StatusOr InferAllGatherShape(const Shape& operand_shape, + int64 all_gather_dimension, + int64 shard_count); + // Infers the shape produced by a cross replica sum with the given operand // shapes. static StatusOr InferAllReduceShape( From a2a26ef378bfdf08bca4cf7ae936fc8a740eaaab Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 9 May 2020 15:46:30 -0700 Subject: [PATCH 0263/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310743414 Change-Id: I5c6792bf46e7944880e58ccd40fd36b99b5a0c30 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 264439782babb9eaf72a19623af64433723d2181 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Sat, 9 May 2020 16:18:38 -0700 Subject: [PATCH 0264/1533] Fix the myth from height vs width. The cause is due to ImageProjectiveTransformV2 accepts transform matrix with the first row on width and the second row on height change. PiperOrigin-RevId: 310745160 Change-Id: I58eaf26fccb836304a5cc2b0f543d94dcf3da1e4 --- .../preprocessing/image_preprocessing.py | 71 +++++----- .../preprocessing/image_preprocessing_test.py | 124 ++++++++++++------ 2 files changed, 116 insertions(+), 79 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py index 137fb2cb410..05a6e84e6cc 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py @@ -51,9 +51,8 @@ _RESIZE_METHODS = { 'mitchellcubic': ResizeMethod.MITCHELLCUBIC } -# Visually the data format should be NWHC instead of NHWC. -H_AXIS = 2 -W_AXIS = 1 +H_AXIS = 1 +W_AXIS = 2 def check_fill_mode_and_interpolation(fill_mode, interpolation): @@ -71,7 +70,7 @@ class Resizing(Layer): """Image resizing layer. 
Resize the batched image input to target height and width. The input should - be a 4-D tensor, channels_last format. + be a 4-D tensor in the format of NHWC. Arguments: height: Integer, the height of the output shape. @@ -98,14 +97,14 @@ class Resizing(Layer): def call(self, inputs): outputs = image_ops.resize_images_v2( images=inputs, - size=[self.target_width, self.target_height], + size=[self.target_height, self.target_width], method=self._interpolation_method) return outputs def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], self.target_width, self.target_height, input_shape[3]]) + [input_shape[0], self.target_height, self.target_width, input_shape[3]]) def get_config(self): config = { @@ -123,11 +122,11 @@ class CenterCrop(Layer): Input shape: 4D tensor with shape: - `(samples, width, height, channels)`, data_format='channels_last'. + `(samples, height, width, channels)`, data_format='channels_last'. Output shape: 4D tensor with shape: - `(samples, target_width, target_height, channels)`. + `(samples, target_height, target_width, channels)`. If the input height/width is even and the target height/width is odd (or inversely), the input image is left-padded by 1 pixel. @@ -164,16 +163,16 @@ class CenterCrop(Layer): with ops.control_dependencies(checks): bbox_h_start = math_ops.cast(img_hd_diff / 2, dtypes.int32) bbox_w_start = math_ops.cast(img_wd_diff / 2, dtypes.int32) - bbox_begin = array_ops.stack([0, bbox_w_start, bbox_h_start, 0]) + bbox_begin = array_ops.stack([0, bbox_h_start, bbox_w_start, 0]) bbox_size = array_ops.stack( - [-1, self.target_width, self.target_height, -1]) + [-1, self.target_height, self.target_width, -1]) outputs = array_ops.slice(inputs, bbox_begin, bbox_size) return outputs def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], self.target_width, self.target_height, input_shape[3]]) + [input_shape[0], self.target_height, self.target_width, input_shape[3]]) def get_config(self): config = { @@ -197,11 +196,11 @@ class RandomCrop(Layer): Input shape: 4D tensor with shape: - `(samples, width, height, channels)`, data_format='channels_last'. + `(samples, height, width, channels)`, data_format='channels_last'. Output shape: 4D tensor with shape: - `(samples, target_width, target_height, channels)`. + `(samples, target_height, target_width, channels)`. Arguments: height: Integer, the height of the output shape. 
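The layers changed above all take NHWC input, with height on axis 1 and width on axis 2, and size arguments ordered as (height, width). A minimal sketch of that convention, assuming the public tf.image.resize wrapper over resize_images_v2 and plain NumPy slicing for the center crop; it is illustrative only and not part of the patch.

import numpy as np
import tensorflow as tf

batch = np.random.random((2, 10, 8, 3)).astype("float32")  # (N, H=10, W=8, C)

# tf.image.resize takes size=(height, width) for NHWC input.
resized = tf.image.resize(batch, size=(6, 4))
assert resized.shape == (2, 6, 4, 3)

# A center crop to height=4, width=4 slices axis 1 (height) and axis 2
# (width), mirroring the bbox_begin/bbox_size stacks in CenterCrop above.
h_start = (10 - 4) // 2
w_start = (8 - 4) // 2
cropped = batch[:, h_start:h_start + 4, w_start:w_start + 4, :]
assert cropped.shape == (2, 4, 4, 3)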
@@ -226,10 +225,10 @@ class RandomCrop(Layer): """Cropped inputs with stateless random ops.""" input_shape = array_ops.shape(inputs) crop_size = array_ops.stack( - [input_shape[0], self.width, self.height, input_shape[3]]) + [input_shape[0], self.height, self.width, input_shape[3]]) check = control_flow_ops.Assert( math_ops.reduce_all(input_shape >= crop_size), - [self.width, self.height]) + [self.height, self.width]) input_shape = control_flow_ops.with_dependencies([check], input_shape) limit = input_shape - crop_size + 1 offset = stateless_random_ops.stateless_random_uniform( @@ -257,14 +256,14 @@ class RandomCrop(Layer): input_width_t.dtype)) # pylint: enable=g-long-lambda resized_inputs = image_ops.resize_images_v2( - images=inputs, size=array_ops.stack([resized_width, resized_height])) + images=inputs, size=array_ops.stack([resized_height, resized_width])) img_hd_diff = resized_height - self.height img_wd_diff = resized_width - self.width bbox_h_start = math_ops.cast(img_hd_diff / 2, dtypes.int32) bbox_w_start = math_ops.cast(img_wd_diff / 2, dtypes.int32) - bbox_begin = array_ops.stack([0, bbox_w_start, bbox_h_start, 0]) - bbox_size = array_ops.stack([-1, self.width, self.height, -1]) + bbox_begin = array_ops.stack([0, bbox_h_start, bbox_w_start, 0]) + bbox_size = array_ops.stack([-1, self.height, self.width, -1]) outputs = array_ops.slice(resized_inputs, bbox_begin, bbox_size) return outputs @@ -272,14 +271,14 @@ class RandomCrop(Layer): resize_and_center_cropped_inputs) original_shape = inputs.shape.as_list() batch_size, num_channels = original_shape[0], original_shape[3] - output_shape = [batch_size] + [self.width, self.height] + [num_channels] + output_shape = [batch_size] + [self.height, self.width] + [num_channels] output.set_shape(output_shape) return output def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], self.width, self.height, input_shape[3]]) + [input_shape[0], self.height, self.width, input_shape[3]]) def get_config(self): config = { @@ -345,11 +344,11 @@ class RandomFlip(Layer): Input shape: 4D tensor with shape: - `(samples, width, height, channels)`, data_format='channels_last'. + `(samples, height, width, channels)`, data_format='channels_last'. Output shape: 4D tensor with shape: - `(samples, width, height, channels)`, data_format='channels_last'. + `(samples, height, width, channels)`, data_format='channels_last'. Attributes: mode: String indicating which flip mode to use. Can be "horizontal", @@ -452,11 +451,11 @@ class RandomTranslation(Layer): name: A string, the name of the layer. Input shape: - 4D tensor with shape: `(samples, width, height, channels)`, + 4D tensor with shape: `(samples, height, width, channels)`, data_format='channels_last'. Output shape: - 4D tensor with shape: `(samples, width, height, channels)`, + 4D tensor with shape: `(samples, height, width, channels)`, data_format='channels_last'. Raise: @@ -603,8 +602,8 @@ def transform(images, """Applies the given transform(s) to the image(s). Args: - images: A tensor of shape (num_images, num_rows, num_columns, num_channels), - (num_rows, num_columns, num_channels) (HWC), or (num_rows, + images: A tensor of shape (num_images, num_rows, num_columns, num_channels) + (NHWC), (num_rows, num_columns, num_channels) (HWC), or (num_rows, num_columns) (HW). The rank must be statically known (the shape is not `TensorShape(None)`. transforms: Projective transform matrix/matrices. 
A vector of length 8 or @@ -1098,10 +1097,10 @@ class RandomHeight(Layer): name: A string, the name of the layer. Input shape: - 4D tensor with shape: `(samples, width, height, channels)` + 4D tensor with shape: `(samples, height, width, channels)` (data_format='channels_last'). Output shape: - 4D tensor with shape: `(samples, width, random_height, channels)`. + 4D tensor with shape: `(samples, random_height, width, channels)`. """ def __init__(self, @@ -1145,11 +1144,11 @@ class RandomHeight(Layer): minval=(1.0 + self.height_lower), maxval=(1.0 + self.height_upper)) adjusted_height = math_ops.cast(height_factor * img_hd, dtypes.int32) - adjusted_size = array_ops.stack([img_wd, adjusted_height]) + adjusted_size = array_ops.stack([adjusted_height, img_wd]) output = image_ops.resize_images_v2( images=inputs, size=adjusted_size, method=self._interpolation_method) original_shape = inputs.shape.as_list() - output_shape = original_shape[0:2] + [None] + [original_shape[3]] + output_shape = [original_shape[0]] + [None] + original_shape[2:4] output.set_shape(output_shape) return output @@ -1158,7 +1157,7 @@ class RandomHeight(Layer): def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], input_shape[1], None, input_shape[3]]) + [input_shape[0], None, input_shape[2], input_shape[3]]) def get_config(self): config = { @@ -1196,11 +1195,11 @@ class RandomWidth(Layer): Input shape: 4D tensor with shape: - `(samples, width, height, channels)` (data_format='channels_last'). + `(samples, height, width, channels)` (data_format='channels_last'). Output shape: 4D tensor with shape: - `(samples, random_width, height, channels)`. + `(samples, height, random_width, channels)`. 
""" def __init__(self, @@ -1243,11 +1242,11 @@ class RandomWidth(Layer): minval=(1.0 + self.width_lower), maxval=(1.0 + self.width_upper)) adjusted_width = math_ops.cast(width_factor * img_wd, dtypes.int32) - adjusted_size = array_ops.stack([adjusted_width, img_hd]) + adjusted_size = array_ops.stack([img_hd, adjusted_width]) output = image_ops.resize_images_v2( images=inputs, size=adjusted_size, method=self._interpolation_method) original_shape = inputs.shape.as_list() - output_shape = [original_shape[0]] + [None] + original_shape[2:4] + output_shape = original_shape[0:2] + [None] + [original_shape[3]] output.set_shape(output_shape) return output @@ -1256,7 +1255,7 @@ class RandomWidth(Layer): def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape).as_list() return tensor_shape.TensorShape( - [input_shape[0], None, input_shape[2], input_shape[3]]) + [input_shape[0], input_shape[1], None, input_shape[3]]) def get_config(self): config = { diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 95630a5b853..28c9955c9dd 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -48,8 +48,8 @@ class ResizingTest(keras_parameterized.TestCase): testing_utils.layer_test( image_preprocessing.Resizing, kwargs=kwargs, - input_shape=(num_samples, orig_width, orig_height, channels), - expected_output_shape=(None, expected_width, expected_height, + input_shape=(num_samples, orig_height, orig_width, channels), + expected_output_shape=(None, expected_height, expected_width, channels)) @parameterized.named_parameters( @@ -126,13 +126,13 @@ class ResizingTest(keras_parameterized.TestCase): def get_numpy_center_crop(images, expected_height, expected_width): - orig_height = images.shape[2] - orig_width = images.shape[1] + orig_height = images.shape[1] + orig_width = images.shape[2] height_start = int((orig_height - expected_height) / 2) width_start = int((orig_width - expected_width) / 2) height_end = height_start + expected_height width_end = width_start + expected_width - return images[:, width_start:width_end, height_start:height_end, :] + return images[:, height_start:height_end, width_start:width_end, :] @keras_parameterized.run_all_keras_modes(always_skip_v1=True) @@ -146,17 +146,17 @@ class CenterCropTest(keras_parameterized.TestCase): channels = 3 kwargs = {'height': expected_height, 'width': expected_width} input_images = np.random.random( - (num_samples, orig_width, orig_height, channels)).astype(np.float32) + (num_samples, orig_height, orig_width, channels)).astype(np.float32) expected_output = get_numpy_center_crop( input_images, expected_height, expected_width) with tf_test_util.use_gpu(): testing_utils.layer_test( image_preprocessing.CenterCrop, kwargs=kwargs, - input_shape=(num_samples, orig_width, orig_height, channels), + input_shape=(num_samples, orig_height, orig_width, channels), input_data=input_images, expected_output=expected_output, - expected_output_shape=(None, expected_width, expected_height, + expected_output_shape=(None, expected_height, expected_width, channels)) @parameterized.named_parameters( @@ -210,8 +210,8 @@ class RandomCropTest(keras_parameterized.TestCase): testing_utils.layer_test( image_preprocessing.RandomCrop, kwargs=kwargs, - input_shape=(num_samples, orig_width, orig_height, channels), - 
expected_output_shape=(None, expected_width, expected_height, + input_shape=(num_samples, orig_height, orig_width, channels), + expected_output_shape=(None, expected_height, expected_width, channels)) @parameterized.named_parameters( @@ -233,16 +233,16 @@ class RandomCropTest(keras_parameterized.TestCase): height, width = 3, 4 height_offset = np.random.randint(low=0, high=3) width_offset = np.random.randint(low=0, high=5) - mock_offset = [0, width_offset, height_offset, 0] + mock_offset = [0, height_offset, width_offset, 0] with test.mock.patch.object( stateless_random_ops, 'stateless_random_uniform', return_value=mock_offset): with tf_test_util.use_gpu(): layer = image_preprocessing.RandomCrop(height, width) - inp = np.random.random((12, 8, 5, 3)) + inp = np.random.random((12, 5, 8, 3)) actual_output = layer(inp, training=1) - expected_output = inp[:, width_offset:(width_offset + width), - height_offset:(height_offset + height), :] + expected_output = inp[:, height_offset:(height_offset + height), + width_offset:(width_offset + width), :] self.assertAllClose(expected_output, actual_output) @parameterized.named_parameters( @@ -257,7 +257,22 @@ class RandomCropTest(keras_parameterized.TestCase): with CustomObjectScope({'RandomCrop': image_preprocessing.RandomCrop}): self._run_test(expected_height, expected_width) - def test_predicting_with_mock_longer_width(self): + def test_random_crop_full_height(self): + self._run_test(5, 2) + + def test_random_crop_full_width(self): + self._run_test(3, 8) + + def test_random_crop_full(self): + np.random.seed(1337) + height, width = 8, 16 + inp = np.random.random((12, 8, 16, 3)) + with tf_test_util.use_gpu(): + layer = image_preprocessing.RandomCrop(height, width) + actual_output = layer(inp, training=0) + self.assertAllClose(inp, actual_output) + + def test_predicting_with_mock_longer_height(self): np.random.seed(1337) height, width = 3, 3 inp = np.random.random((12, 10, 6, 3)) @@ -269,15 +284,15 @@ class RandomCropTest(keras_parameterized.TestCase): expected_output = resized_inp[:, 1:4, :, :] self.assertAllClose(expected_output, actual_output) - def test_predicting_with_mock_longer_height(self): + def test_predicting_with_mock_longer_width(self): np.random.seed(1337) height, width = 4, 6 inp = np.random.random((12, 8, 16, 3)) with tf_test_util.use_gpu(): layer = image_preprocessing.RandomCrop(height, width) actual_output = layer(inp, training=0) - resized_inp = image_ops.resize_images_v2(inp, size=[6, 12]) - expected_output = resized_inp[:, :, 4:8, :] + resized_inp = image_ops.resize_images_v2(inp, size=[4, 8]) + expected_output = resized_inp[:, :, 1:7, :] self.assertAllClose(expected_output, actual_output) def test_config_with_custom_name(self): @@ -572,6 +587,29 @@ class RandomTranslationTest(keras_parameterized.TestCase): expected_output = np.reshape(expected_output, (1, 5, 5, 1)) self.assertAllEqual(expected_output, output_image) + def test_random_translation_asymmetric_size_numeric_reflect(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 16), (1, 8, 2, 1)).astype(dtype) + # Shifting by .5 * 8 = 1 pixel. + layer = image_preprocessing.RandomTranslation( + height_factor=(.5, .5), width_factor=0.) 
+ output_image = layer(input_image) + # pyformat: disable + expected_output = np.asarray([ + [6, 7], + [4, 5], + [2, 3], + [0, 1], + [0, 1], + [2, 3], + [4, 5], + [6, 7], + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 8, 2, 1)) + self.assertAllEqual(expected_output, output_image) + def test_random_translation_down_numeric_constant(self): for dtype in (np.int64, np.float32): with tf_test_util.use_gpu(): @@ -1027,11 +1065,11 @@ class RandomHeightTest(keras_parameterized.TestCase): orig_width = 8 channels = 3 with tf_test_util.use_gpu(): - img = np.random.random((num_samples, orig_width, orig_height, channels)) + img = np.random.random((num_samples, orig_height, orig_width, channels)) layer = image_preprocessing.RandomHeight(factor) img_out = layer(img, training=True) self.assertEqual(img_out.shape[0], 2) - self.assertEqual(img_out.shape[1], 8) + self.assertEqual(img_out.shape[2], 8) self.assertEqual(img_out.shape[3], 3) @parameterized.named_parameters(('random_height_4_by_6', (.4, .6)), @@ -1046,38 +1084,39 @@ class RandomHeightTest(keras_parameterized.TestCase): with test.mock.patch.object( gen_stateful_random_ops, 'stateful_uniform', return_value=mock_factor): with tf_test_util.use_gpu(): - img = np.random.random((12, 8, 5, 3)) + img = np.random.random((12, 5, 8, 3)) layer = image_preprocessing.RandomHeight(.4) img_out = layer(img, training=True) - self.assertEqual(img_out.shape[2], 3) + self.assertEqual(img_out.shape[1], 3) def test_random_height_longer_numeric(self): for dtype in (np.int64, np.float32): with tf_test_util.use_gpu(): - input_image = np.reshape(np.arange(0, 6), (3, 2, 1)).astype(dtype) + input_image = np.reshape(np.arange(0, 6), (2, 3, 1)).astype(dtype) layer = image_preprocessing.RandomHeight(factor=(1., 1.)) output_image = layer(np.expand_dims(input_image, axis=0)) # pyformat: disable expected_output = np.asarray([ - [0, 0.25, 0.75, 1], - [2, 2.25, 2.75, 3], - [4, 4.25, 4.75, 5] + [0, 1, 2], + [0.75, 1.75, 2.75], + [2.25, 3.25, 4.25], + [3, 4, 5] ]).astype(dtype) # pyformat: enable - expected_output = np.reshape(expected_output, (1, 3, 4, 1)) + expected_output = np.reshape(expected_output, (1, 4, 3, 1)) self.assertAllEqual(expected_output, output_image) def test_random_height_shorter_numeric(self): for dtype in (np.int64, np.float32): with tf_test_util.use_gpu(): - input_image = np.reshape(np.arange(0, 8), (2, 4, 1)).astype(dtype) + input_image = np.reshape(np.arange(0, 8), (4, 2, 1)).astype(dtype) layer = image_preprocessing.RandomHeight( factor=(-.5, -.5), interpolation='nearest') output_image = layer(np.expand_dims(input_image, axis=0)) # pyformat: disable expected_output = np.asarray([ - [1, 3], - [5, 7] + [2, 3], + [6, 7] ]).astype(dtype) # pyformat: enable expected_output = np.reshape(expected_output, (1, 2, 2, 1)) @@ -1114,11 +1153,11 @@ class RandomWidthTest(keras_parameterized.TestCase): orig_width = 8 channels = 3 with tf_test_util.use_gpu(): - img = np.random.random((num_samples, orig_width, orig_height, channels)) + img = np.random.random((num_samples, orig_height, orig_width, channels)) layer = image_preprocessing.RandomWidth(factor) img_out = layer(img, training=True) self.assertEqual(img_out.shape[0], 2) - self.assertEqual(img_out.shape[2], 5) + self.assertEqual(img_out.shape[1], 5) self.assertEqual(img_out.shape[3], 3) @parameterized.named_parameters(('random_width_4_by_6', (.4, .6)), @@ -1133,39 +1172,38 @@ class RandomWidthTest(keras_parameterized.TestCase): with test.mock.patch.object( 
gen_stateful_random_ops, 'stateful_uniform', return_value=mock_factor): with tf_test_util.use_gpu(): - img = np.random.random((12, 5, 8, 3)) + img = np.random.random((12, 8, 5, 3)) layer = image_preprocessing.RandomWidth(.4) img_out = layer(img, training=True) - self.assertEqual(img_out.shape[1], 3) + self.assertEqual(img_out.shape[2], 3) def test_random_width_longer_numeric(self): for dtype in (np.int64, np.float32): with tf_test_util.use_gpu(): - input_image = np.reshape(np.arange(0, 6), (2, 3, 1)).astype(dtype) + input_image = np.reshape(np.arange(0, 6), (3, 2, 1)).astype(dtype) layer = image_preprocessing.RandomWidth(factor=(1., 1.)) output_image = layer(np.expand_dims(input_image, axis=0)) # pyformat: disable expected_output = np.asarray([ - [0, 1, 2], - [0.75, 1.75, 2.75], - [2.25, 3.25, 4.25], - [3, 4, 5] + [0, 0.25, 0.75, 1], + [2, 2.25, 2.75, 3], + [4, 4.25, 4.75, 5] ]).astype(dtype) # pyformat: enable - expected_output = np.reshape(expected_output, (1, 4, 3, 1)) + expected_output = np.reshape(expected_output, (1, 3, 4, 1)) self.assertAllEqual(expected_output, output_image) def test_random_width_shorter_numeric(self): for dtype in (np.int64, np.float32): with tf_test_util.use_gpu(): - input_image = np.reshape(np.arange(0, 8), (4, 2, 1)).astype(dtype) + input_image = np.reshape(np.arange(0, 8), (2, 4, 1)).astype(dtype) layer = image_preprocessing.RandomWidth( factor=(-.5, -.5), interpolation='nearest') output_image = layer(np.expand_dims(input_image, axis=0)) # pyformat: disable expected_output = np.asarray([ - [2, 3], - [6, 7] + [1, 3], + [5, 7] ]).astype(dtype) # pyformat: enable expected_output = np.reshape(expected_output, (1, 2, 2, 1)) From 38d2ef6be9bc000dba1ef5a50cf62d7e03d21c7e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 9 May 2020 17:46:40 -0700 Subject: [PATCH 0265/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310749869 Change-Id: I139fa89fc58578773e4a6b7138aaa94e08831a72 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From c4ab2d38bcd48b5ae9cca7fe94ac11a8a28683fa Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 10 May 2020 03:37:15 +0000 Subject: [PATCH 0266/1533] Add mlir_graph_optimization_pass.h header to pip wheel This PR adds mlir_graph_optimization_pass.h header to tf-nightly pip wheel. mlir_graph_optimization_pass.h is a header file that allows to register mlir based graph optimizaton (either part of the tensorflow, or externally registered). However, it is not part of the pip install so it is not possible to register with installed version of tensorflow. This PR adds the header file to be part of the pip install. This PR is related to 39231 Signed-off-by: Yong Tang --- tensorflow/tools/pip_package/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 70d88f294bc..2c3734d2fc2 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -28,6 +28,7 @@ transitive_hdrs( deps = [ "//tensorflow/c/experimental:network", "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", From 1b49bd1f2a9e87671f6541897099f3c17cf8a2f6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 9 May 2020 21:47:36 -0700 Subject: [PATCH 0267/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310762888 Change-Id: Id1d2fe3d14ab288584b0df1c787f669bf2a92182 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From e1bed0f7af858d4d40ad6501994f819895330803 Mon Sep 17 00:00:00 2001 From: Shunya Ueta Date: Sun, 10 May 2020 18:01:42 +0900 Subject: [PATCH 0268/1533] Remove Python2 badge in raspberry Pi into README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 27032043e07..a6de2984dd8 100644 --- a/README.md +++ b/README.md @@ -112,8 +112,8 @@ Build Type | Status **Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/) **Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) **Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) -**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv6l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) -**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv7l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) +**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) +**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) ### Community Supported Builds From 4ac1e3206c5adfd7c7477bbbe347cc47aff21836 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 02:02:48 -0700 Subject: [PATCH 0269/1533] Update GraphDef version to 397. 
PiperOrigin-RevId: 310776300 Change-Id: I10b72e3c13e4976d6c297c6c92695b9ddb5ee689 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 94395f4c2a5..b7f2b76d6ba 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 396 // Updated: 2020/5/9 +#define TF_GRAPH_DEF_VERSION 397 // Updated: 2020/5/10 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From dc5547788083933462b26b32b09e90de7970e02d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 02:02:56 -0700 Subject: [PATCH 0270/1533] compat: Update forward compatibility horizon to 2020-05-10 PiperOrigin-RevId: 310776306 Change-Id: If593a9e9239523601c3742f53226b8bb1a1d245c --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index f0700514b36..17f559eaf17 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 9) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 10) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 7e08dcef2cbc8e4cbf8a652b612c847e3186f345 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 03:46:37 -0700 Subject: [PATCH 0271/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310781767 Change-Id: Iee363e35f9774992684f35e968846dd733811ada --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From d7162002663f6824ab5edc600f9031b91c5ba708 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Sun, 10 May 2020 04:47:55 -0700 Subject: [PATCH 0272/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/c5e0967e4cf0 PiperOrigin-RevId: 310785048 Change-Id: I2d93edb9c66c4262c985fa088f88ad22e3e6cada --- .../compiler/mlir/lite/flatbuffer_export.cc | 2 +- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 2 +- .../tensorflow/transforms/shape_inference.cc | 4 ++-- .../mlir/tensorflow/translate/import_model.cc | 13 +++++++------ tensorflow/compiler/mlir/xla/ir/chlo_ops.cc | 18 +++++++----------- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 2 +- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 5 ++--- .../transforms/test_infer_shaped_type_pass.cc | 3 ++- 8 files changed, 23 insertions(+), 26 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index e9192388070..6a631b1433d 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -1020,7 +1020,7 @@ Optional> Translator::BuildOperator( if (!inst->getMutableAttrDict().getAttrs().empty()) { os << " {"; bool first = true; - for (auto& named_attr : inst->getMutableAttrDict().getDictionary()) { + for (auto& named_attr : inst->getAttrDictionary()) { os << (!first ? ", " : ""); first = false; named_attr.first.print(os); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 3a4e9e5985e..edba135e0f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -984,7 +984,7 @@ void ConstOp::build(OpBuilder &builder, OperationState &result, Type type, LogicalResult ConstOp::inferReturnTypes( MLIRContext *context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { for (NamedAttribute named_attr : attributes) { if (named_attr.first.strref() != "value") continue; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 789088bd585..efe82c4268b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -323,8 +323,8 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, Operation* op = infer_ti.getOperation(); SmallVector inferred; LogicalResult res = infer_ti.inferReturnTypes( - op->getContext(), op->getLoc(), op->getOperands(), op->getAttrs(), - op->getRegions(), inferred); + op->getContext(), op->getLoc(), op->getOperands(), + op->getAttrDictionary(), op->getRegions(), inferred); if (failed(res)) { op->emitOpError("failed to refine type as inference failed"); return false; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 3bb1446213b..a613ce1f920 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -112,6 +112,7 @@ static inline absl::string_view StringRefToView(llvm::StringRef ref) { } namespace tensorflow { +using mlir::NamedAttrList; using mlir::TensorType; using mlir::TF::VarHandleOp; using mlir::tf_saved_model::GlobalTensorOp; @@ -309,9 +310,9 @@ class ImporterBase { // AttrValue {name : foo, attrs : {k1 : bar, k2 : rfc}}, it will convert it to // a 
list of MLIR Attributes: [{base_name : foo}, {base_name.k1 : bar}, // {base_name.k2 : rfc}}. - Status ConvertFunctionCallAttribute( - const std::string& base_name, const AttrValue& value, - llvm::SmallVector* attributes); + Status ConvertFunctionCallAttribute(const std::string& base_name, + const AttrValue& value, + NamedAttrList* attributes); // Helper to create either a tf_executor operation or a TF operation wrapped // in an island. When convert_to_legacy_call is true, converts the operation @@ -1092,9 +1093,9 @@ StatusOr ImporterBase::ConvertSubtypes( return subtypes; } -Status ImporterBase::ConvertFunctionCallAttribute( - const std::string& base_name, const AttrValue& value, - llvm::SmallVector* attributes) { +Status ImporterBase::ConvertFunctionCallAttribute(const std::string& base_name, + const AttrValue& value, + NamedAttrList* attributes) { TF_ASSIGN_OR_RETURN(auto func_attr, ConvertFunctionCallName(value.func().name())); attributes->push_back(builder_.getNamedAttr(base_name, func_attr)); diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc index bc6842a617e..5322668aa2e 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.cc @@ -97,16 +97,12 @@ static Type GetBroadcastType(Type x, Type y, Type element_type, LogicalResult InferBroadcastBinaryOpReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, Type element_type, + DictionaryAttr attributes, Type element_type, SmallVectorImpl& inferedReturnShapes) { // Find broadcast_dimensions. - DenseIntElementsAttr broadcast_dimensions; - for (auto attr : attributes) { - if (attr.first == "broadcast_dimensions") { - broadcast_dimensions = attr.second.dyn_cast(); - break; - } - } + DenseIntElementsAttr broadcast_dimensions = + attributes.get("broadcast_dimensions") + .dyn_cast_or_null(); ShapedType lhs_type = operands[0].getType().dyn_cast(); ShapedType rhs_type = operands[1].getType().dyn_cast(); @@ -168,7 +164,7 @@ LogicalResult ReifyBroadcastBinaryOpReturnTypeShapes( LogicalResult BroadcastComplexOp::inferReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { ShapedType lhs_type = operands[0].getType().dyn_cast(); if (!lhs_type) { @@ -191,7 +187,7 @@ LogicalResult BroadcastComplexOp::reifyReturnTypeShapes( LogicalResult BroadcastCompareOp::inferReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { Type element_type = IntegerType::get(1, context); return InferBroadcastBinaryOpReturnTypeComponents(context, location, operands, @@ -211,7 +207,7 @@ LogicalResult BroadcastCompareOp::reifyReturnTypeShapes( #define BROADCAST_INFER_SHAPE_TYPE_OP_DEFS(Op) \ LogicalResult Op::inferReturnTypeComponents( \ MLIRContext* context, Optional location, ValueRange operands, \ - ArrayRef attributes, RegionRange regions, \ + DictionaryAttr attributes, RegionRange regions, \ SmallVectorImpl& inferedReturnShapes) { \ return InferBroadcastBinaryOpReturnTypeComponents( \ context, location, operands, attributes, /*element_type=*/nullptr, \ diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index cb7372a762c..874f4a1587e 100644 --- 
a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -1240,7 +1240,7 @@ static LogicalResult Verify(SelectOp op) { // the return type based on operand type. LogicalResult SelectOp::inferReturnTypes( MLIRContext*, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferredReturnTypes) { auto x_type = operands[1].getType(); auto y_type = operands[2].getType(); diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index c2eef4a90b2..917a50f74ea 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -104,8 +104,7 @@ class HLO_UnaryElementwiseOp traits, let extraClassDeclaration = [{ static LogicalResult inferReturnTypeComponents( MLIRContext* context, Optional location, - ValueRange operands, ArrayRef attributes, - RegionRange regions, + ValueRange operands, DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { return failure(); } @@ -254,7 +253,7 @@ class HLO_BinaryElementwiseOp traits> : let extraClassDeclaration = [{ static LogicalResult inferReturnTypeComponents( MLIRContext* context, Optional location, ValueRange operands, - ArrayRef attributes, RegionRange regions, + DictionaryAttr attributes, RegionRange regions, SmallVectorImpl& inferedReturnShapes) { return failure(); } diff --git a/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc b/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc index 8976bd5b7d2..71441656c08 100644 --- a/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc +++ b/tensorflow/compiler/mlir/xla/transforms/test_infer_shaped_type_pass.cc @@ -38,7 +38,8 @@ struct InferReturnTypeComponentsPattern : public RewritePattern { SmallVector components; if (failed(defining_op_int.inferReturnTypeComponents( op->getContext(), op->getLoc(), defining_op->getOperands(), - defining_op->getAttrs(), defining_op->getRegions(), components))) { + defining_op->getAttrDictionary(), defining_op->getRegions(), + components))) { return failure(); } From 06b3c80de804b4485fa9638fba033e49dfb67ef2 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Sun, 10 May 2020 05:37:52 -0700 Subject: [PATCH 0273/1533] Bump open source llvm revision to c5e0967e4cf0f1337bec772949e6cede4c01354b PiperOrigin-RevId: 310787873 Change-Id: I005afa67af527700821db03f35e18d74ac79ad44 --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 85c0ca29fe9..5b479c54b8b 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "91087153210132a4c2d3cf19a4526d8f395cb5a4" - LLVM_SHA256 = "b2e2314ce2d4a7f0da436063c922d716171415d1b5e85889235d9eab1ecb98c1" + LLVM_COMMIT = "c5e0967e4cf0f1337bec772949e6cede4c01354b" + LLVM_SHA256 = "5d8dbddd78fbc1c08825b178aff0a0f04722d83280eb93be55a174391f1885ce" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From b8681699f0e792f870b2113a4af64b3ad5c8615d Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Sun, 10 May 2020 07:46:57 -0700 Subject: [PATCH 0274/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310794700 Change-Id: Ieafdee5e921be05da223963899fae1a6eb657366 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 1fd83f35e2179f24a02344c659737777ec9f4a90 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 09:46:28 -0700 Subject: [PATCH 0275/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310801764 Change-Id: Ifb8b452e62a46ec0c023a6118fd58ca403e26d81 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From cca2943a3ab9dc57399a4ce3b857a9fa540a27cc Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Sun, 10 May 2020 11:14:38 -0700 Subject: [PATCH 0276/1533] Fix for Micro make patching error on MacOS PiperOrigin-RevId: 310806870 Change-Id: I83f559284ea947e424a75682f2d970c5dbb14e12 --- tensorflow/lite/micro/tools/make/download_and_extract.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 2248031f6d1..a403019d192 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -86,7 +86,7 @@ patch_kissfft() { # CIFAR10 test dataset. patch_cifar10_dataset() { xxd -l 30730 -i ${1}/test_batch.bin ${1}/../../../../examples/image_recognition_experimental/first_10_cifar_images.h - sed -i "s/unsigned char/const unsigned char/g" ${1}/../../../../examples/image_recognition_experimental/first_10_cifar_images.h + sed -i -E "s/unsigned char/const unsigned char/g" ${1}/../../../../examples/image_recognition_experimental/first_10_cifar_images.h } build_embarc_mli() { From 28492e5b5bb79740f770419018467c55c2fe9bae Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 11:47:03 -0700 Subject: [PATCH 0277/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310808359 Change-Id: I9f404e9aa158de8264ca3b4cc33a352d8db82ed1 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 85fcfd9191cb4fdefa3ed8690bddd4b8eda0836d Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 10 May 2020 18:52:18 +0000 Subject: [PATCH 0278/1533] Update protobuf-java to 3.9.2 This PR updates protobuf-java to 3.9.2, to match C++ version in tensorflow/workspace.bzl (3.9.2), and to fix the issue raised in 39381. Signed-off-by: Yong Tang --- tensorflow/java/maven/proto/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml index ce1acc20b00..aa4a9bb4618 100644 --- a/tensorflow/java/maven/proto/pom.xml +++ b/tensorflow/java/maven/proto/pom.xml @@ -16,7 +16,7 @@ com.google.protobuf protobuf-java - 3.5.1 + 3.9.2 From 4ce786df351817f47e9574192ef66358e76ccf29 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Sun, 10 May 2020 14:54:10 -0700 Subject: [PATCH 0279/1533] Refactor ConvertToTensorProto to avoid some duplication Share implementation to populate tensors unless the element type requires special handling. PiperOrigin-RevId: 310818801 Change-Id: I27b4d9111578e9ecbec663853aad9ed85e46defc --- .../mlir/tensorflow/utils/convert_tensor.cc | 213 +++++++----------- 1 file changed, 81 insertions(+), 132 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index fcfef565952..b492945fe8b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/tstring.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -207,12 +208,11 @@ mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type) { // Converts an MLIR dense string elements attribute to a TensorFlow tensor // proto. -Status ConvertStringElementsAttr(const DenseStringElementsAttr attr, - TensorProto* output_tensor) { - for (const auto& val : attr.getRawStringData()) { - output_tensor->add_string_val(val.data(), val.size()); - } - return Status::OK(); +void ConvertStringElementsAttr( + const DenseStringElementsAttr attr, + protobuf::RepeatedPtrField* output) { + for (const auto& val : attr.getRawStringData()) + output->Add({val.data(), val.size()}); } // Converts an MLIR opaque elements attribute to a TensorFlow tensor proto. @@ -226,139 +226,80 @@ Status ConvertOpaqueElementsAttr(const ElementsAttr attr, return InvalidArgument("Unexpected elements attribute type from MLIR."); } -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the double_val field updated. 
-Status ConvertDoubleElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_double_val(elts.getSplatValue()); - } else { - for (auto value : elts.getValues()) - output_tensor->add_double_val(value); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the float_val field updated. -Status ConvertFloatElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_float_val(elts.getSplatValue()); - } else { - for (auto value : elts.getValues()) - output_tensor->add_float_val(value); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the half_val field updated. -Status ConvertHalfElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_half_val( - (*elts.begin()).bitcastToAPInt().getSExtValue()); - } else { - for (const auto& value : elts.getFloatValues()) - output_tensor->add_half_val(value.bitcastToAPInt().getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the int_val field updated. -Status ConvertIntElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_int_val((*elts.begin()).getSExtValue()); - } else { - for (const auto& val : elts) - output_tensor->add_int_val(val.getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -Status ConvertBfloat16ElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - auto elts = attr.dyn_cast(); - if (!elts) { - return ConvertOpaqueElementsAttr(attr, output_tensor); - } - - // Bfloat16 is internally represented as `double` in MLIR. - if (elts.isSplat()) { - double v = elts.getSplatValue(); - bfloat16 bf16_val = static_cast(v); - output_tensor->add_half_val(absl::bit_cast(bf16_val)); +// Converts an MLIR elements attribute and adds it to specified repeated field. +template +void ConvertElementsAttr(const mlir::DenseElementsAttr attr, + protobuf::RepeatedField* output) { + if (attr.isSplat()) { + output->Add(attr.getSplatValue()); } else { - for (auto v : elts.getValues()) { + for (auto value : attr.getValues()) output->Add(value); + } +} + +// Converts an MLIR elements attribute containing half values and adds it to +// specified repeated field. +void ConvertHalfElementsAttr(const DenseFPElementsAttr attr, + protobuf::RepeatedField* output_tensor) { + if (attr.isSplat()) { + output_tensor->Add((*attr.begin()).bitcastToAPInt().getSExtValue()); + } else { + for (const llvm::APFloat value : attr.getFloatValues()) + output_tensor->Add(value.bitcastToAPInt().getSExtValue()); + } +} + +// Converts an MLIR elements attribute containing int values and adds it to +// specified repeated field. 
+void ConvertIntElementsAttr(const mlir::DenseIntElementsAttr attr, + protobuf::RepeatedField* output) { + if (attr.isSplat()) { + output->Add((*attr.begin()).getSExtValue()); + } else { + for (const llvm::APInt val : attr) output->Add(val.getSExtValue()); + } +} + +void ConvertBfloat16ElementsAttr(const mlir::DenseFPElementsAttr attr, + protobuf::RepeatedField* output) { + // Bfloat16 is internally represented as `double` in MLIR. + if (attr.isSplat()) { + double v = attr.getSplatValue(); + bfloat16 bf16_val = static_cast(v); + output->Add(absl::bit_cast(bf16_val)); + } else { + for (auto v : attr.getValues()) { bfloat16 bf16_val = static_cast(v); - output_tensor->add_half_val(absl::bit_cast(bf16_val)); + output->Add(absl::bit_cast(bf16_val)); } } - - return Status::OK(); } -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the int64_val field updated. -Status ConvertInt64ElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_int64_val((*elts.begin()).getSExtValue()); - } else { - for (const auto& val : elts) - output_tensor->add_int64_val(val.getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with bool_val field updated. -Status ConvertBoolElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - for (const auto& val : elts) { - output_tensor->add_bool_val(val.getBoolValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -Status ConvertToTensorProto(const ElementsAttr attr, - TensorProto* output_tensor) { +Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { auto type = attr.getType(); auto shape = type.getShape(); DataType output_dtype; TF_RETURN_IF_ERROR(ConvertToDataType(type, &output_dtype)); - output_tensor->set_dtype(output_dtype); - ConvertToTensorShapeProto(shape, output_tensor->mutable_tensor_shape()); + output->set_dtype(output_dtype); + ConvertToTensorShapeProto(shape, output->mutable_tensor_shape()); + + if (attr.isa()) + return ConvertOpaqueElementsAttr(attr.cast(), output); + + auto dense_attr = attr.dyn_cast(); + if (!dense_attr) return errors::InvalidArgument("Unsupported elements attr"); switch (output_dtype) { case DT_FLOAT: - return ConvertFloatElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_float_val()); + break; case DT_HALF: - // Handles both DenseFPElementsAttr and OpaqueElementsAttr. 
- return ConvertHalfElementsAttr(attr, output_tensor); + ConvertHalfElementsAttr(dense_attr.cast(), + output->mutable_half_val()); + break; case DT_DOUBLE: - return ConvertDoubleElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_double_val()); + break; case DT_QUINT8: case DT_UINT8: case DT_INT8: @@ -366,20 +307,28 @@ Status ConvertToTensorProto(const ElementsAttr attr, case DT_UINT16: case DT_INT16: case DT_INT32: - return ConvertIntElementsAttr(attr, output_tensor); + ConvertIntElementsAttr(dense_attr.cast(), + output->mutable_int_val()); + break; case DT_INT64: - return ConvertInt64ElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_int64_val()); + break; case DT_BOOL: - return ConvertBoolElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_bool_val()); + break; case DT_BFLOAT16: - return ConvertBfloat16ElementsAttr(attr, output_tensor); + ConvertBfloat16ElementsAttr(dense_attr.cast(), + output->mutable_half_val()); + break; case DT_STRING: - return ConvertStringElementsAttr(attr.cast(), - output_tensor); + ConvertStringElementsAttr(dense_attr.cast(), + output->mutable_string_val()); + break; default: - return ConvertOpaqueElementsAttr(attr.cast(), - output_tensor); + return errors::Unimplemented(absl::StrCat("Unimplemented data type ", + DataTypeString(output_dtype))); } + return Status::OK(); } Status ConvertToTensor(const mlir::ElementsAttr attr, Tensor* output_tensor) { From 02b34b853deb653d293e0c1c3d5d2e0b1453445c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 15:27:57 -0700 Subject: [PATCH 0280/1533] Refactor ConvertToTensorProto to avoid some duplication Share implementation to populate tensors unless the element type requires special handling. PiperOrigin-RevId: 310820768 Change-Id: Ibdf11da1e9e41b3f2ddb10a43563da244f044b62 --- .../mlir/tensorflow/utils/convert_tensor.cc | 191 +++++++++++------- 1 file changed, 121 insertions(+), 70 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index b492945fe8b..fcfef565952 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -37,7 +37,6 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/tstring.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -208,11 +207,12 @@ mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type) { // Converts an MLIR dense string elements attribute to a TensorFlow tensor // proto. -void ConvertStringElementsAttr( - const DenseStringElementsAttr attr, - protobuf::RepeatedPtrField* output) { - for (const auto& val : attr.getRawStringData()) - output->Add({val.data(), val.size()}); +Status ConvertStringElementsAttr(const DenseStringElementsAttr attr, + TensorProto* output_tensor) { + for (const auto& val : attr.getRawStringData()) { + output_tensor->add_string_val(val.data(), val.size()); + } + return Status::OK(); } // Converts an MLIR opaque elements attribute to a TensorFlow tensor proto. 
@@ -226,80 +226,139 @@ Status ConvertOpaqueElementsAttr(const ElementsAttr attr, return InvalidArgument("Unexpected elements attribute type from MLIR."); } -// Converts an MLIR elements attribute and adds it to specified repeated field. -template -void ConvertElementsAttr(const mlir::DenseElementsAttr attr, - protobuf::RepeatedField* output) { - if (attr.isSplat()) { - output->Add(attr.getSplatValue()); - } else { - for (auto value : attr.getValues()) output->Add(value); +// Converts an MLIR elements attribute to a TensorFlow tensor proto +// with the double_val field updated. +Status ConvertDoubleElementsAttr(const ElementsAttr attr, + TensorProto* output_tensor) { + if (auto elts = attr.dyn_cast()) { + if (elts.isSplat()) { + output_tensor->add_double_val(elts.getSplatValue()); + } else { + for (auto value : elts.getValues()) + output_tensor->add_double_val(value); + } + return Status::OK(); } + return ConvertOpaqueElementsAttr(attr, output_tensor); } -// Converts an MLIR elements attribute containing half values and adds it to -// specified repeated field. -void ConvertHalfElementsAttr(const DenseFPElementsAttr attr, - protobuf::RepeatedField* output_tensor) { - if (attr.isSplat()) { - output_tensor->Add((*attr.begin()).bitcastToAPInt().getSExtValue()); - } else { - for (const llvm::APFloat value : attr.getFloatValues()) - output_tensor->Add(value.bitcastToAPInt().getSExtValue()); +// Converts an MLIR elements attribute to a TensorFlow tensor proto +// with the float_val field updated. +Status ConvertFloatElementsAttr(const ElementsAttr attr, + TensorProto* output_tensor) { + if (auto elts = attr.dyn_cast()) { + if (elts.isSplat()) { + output_tensor->add_float_val(elts.getSplatValue()); + } else { + for (auto value : elts.getValues()) + output_tensor->add_float_val(value); + } + return Status::OK(); } + return ConvertOpaqueElementsAttr(attr, output_tensor); } -// Converts an MLIR elements attribute containing int values and adds it to -// specified repeated field. -void ConvertIntElementsAttr(const mlir::DenseIntElementsAttr attr, - protobuf::RepeatedField* output) { - if (attr.isSplat()) { - output->Add((*attr.begin()).getSExtValue()); - } else { - for (const llvm::APInt val : attr) output->Add(val.getSExtValue()); +// Converts an MLIR elements attribute to a TensorFlow tensor proto +// with the half_val field updated. +Status ConvertHalfElementsAttr(const ElementsAttr attr, + TensorProto* output_tensor) { + if (auto elts = attr.dyn_cast()) { + if (elts.isSplat()) { + output_tensor->add_half_val( + (*elts.begin()).bitcastToAPInt().getSExtValue()); + } else { + for (const auto& value : elts.getFloatValues()) + output_tensor->add_half_val(value.bitcastToAPInt().getSExtValue()); + } + return Status::OK(); } + return ConvertOpaqueElementsAttr(attr, output_tensor); } -void ConvertBfloat16ElementsAttr(const mlir::DenseFPElementsAttr attr, - protobuf::RepeatedField* output) { +// Converts an MLIR elements attribute to a TensorFlow tensor proto +// with the int_val field updated. 
+Status ConvertIntElementsAttr(const mlir::ElementsAttr attr, + TensorProto* output_tensor) { + if (auto elts = attr.dyn_cast()) { + if (elts.isSplat()) { + output_tensor->add_int_val((*elts.begin()).getSExtValue()); + } else { + for (const auto& val : elts) + output_tensor->add_int_val(val.getSExtValue()); + } + return Status::OK(); + } + return ConvertOpaqueElementsAttr(attr, output_tensor); +} + +Status ConvertBfloat16ElementsAttr(const mlir::ElementsAttr attr, + TensorProto* output_tensor) { + auto elts = attr.dyn_cast(); + if (!elts) { + return ConvertOpaqueElementsAttr(attr, output_tensor); + } + // Bfloat16 is internally represented as `double` in MLIR. - if (attr.isSplat()) { - double v = attr.getSplatValue(); + if (elts.isSplat()) { + double v = elts.getSplatValue(); bfloat16 bf16_val = static_cast(v); - output->Add(absl::bit_cast(bf16_val)); + output_tensor->add_half_val(absl::bit_cast(bf16_val)); } else { - for (auto v : attr.getValues()) { + for (auto v : elts.getValues()) { bfloat16 bf16_val = static_cast(v); - output->Add(absl::bit_cast(bf16_val)); + output_tensor->add_half_val(absl::bit_cast(bf16_val)); } } + + return Status::OK(); } -Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { +// Converts an MLIR elements attribute to a TensorFlow tensor proto +// with the int64_val field updated. +Status ConvertInt64ElementsAttr(const mlir::ElementsAttr attr, + TensorProto* output_tensor) { + if (auto elts = attr.dyn_cast()) { + if (elts.isSplat()) { + output_tensor->add_int64_val((*elts.begin()).getSExtValue()); + } else { + for (const auto& val : elts) + output_tensor->add_int64_val(val.getSExtValue()); + } + return Status::OK(); + } + return ConvertOpaqueElementsAttr(attr, output_tensor); +} + +// Converts an MLIR elements attribute to a TensorFlow tensor proto +// with bool_val field updated. +Status ConvertBoolElementsAttr(const mlir::ElementsAttr attr, + TensorProto* output_tensor) { + if (auto elts = attr.dyn_cast()) { + for (const auto& val : elts) { + output_tensor->add_bool_val(val.getBoolValue()); + } + return Status::OK(); + } + return ConvertOpaqueElementsAttr(attr, output_tensor); +} + +Status ConvertToTensorProto(const ElementsAttr attr, + TensorProto* output_tensor) { auto type = attr.getType(); auto shape = type.getShape(); DataType output_dtype; TF_RETURN_IF_ERROR(ConvertToDataType(type, &output_dtype)); - output->set_dtype(output_dtype); - ConvertToTensorShapeProto(shape, output->mutable_tensor_shape()); - - if (attr.isa()) - return ConvertOpaqueElementsAttr(attr.cast(), output); - - auto dense_attr = attr.dyn_cast(); - if (!dense_attr) return errors::InvalidArgument("Unsupported elements attr"); + output_tensor->set_dtype(output_dtype); + ConvertToTensorShapeProto(shape, output_tensor->mutable_tensor_shape()); switch (output_dtype) { case DT_FLOAT: - ConvertElementsAttr(dense_attr, output->mutable_float_val()); - break; + return ConvertFloatElementsAttr(attr, output_tensor); case DT_HALF: - ConvertHalfElementsAttr(dense_attr.cast(), - output->mutable_half_val()); - break; + // Handles both DenseFPElementsAttr and OpaqueElementsAttr. 
+ return ConvertHalfElementsAttr(attr, output_tensor); case DT_DOUBLE: - ConvertElementsAttr(dense_attr, output->mutable_double_val()); - break; + return ConvertDoubleElementsAttr(attr, output_tensor); case DT_QUINT8: case DT_UINT8: case DT_INT8: @@ -307,28 +366,20 @@ Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { case DT_UINT16: case DT_INT16: case DT_INT32: - ConvertIntElementsAttr(dense_attr.cast(), - output->mutable_int_val()); - break; + return ConvertIntElementsAttr(attr, output_tensor); case DT_INT64: - ConvertElementsAttr(dense_attr, output->mutable_int64_val()); - break; + return ConvertInt64ElementsAttr(attr, output_tensor); case DT_BOOL: - ConvertElementsAttr(dense_attr, output->mutable_bool_val()); - break; + return ConvertBoolElementsAttr(attr, output_tensor); case DT_BFLOAT16: - ConvertBfloat16ElementsAttr(dense_attr.cast(), - output->mutable_half_val()); - break; + return ConvertBfloat16ElementsAttr(attr, output_tensor); case DT_STRING: - ConvertStringElementsAttr(dense_attr.cast(), - output->mutable_string_val()); - break; + return ConvertStringElementsAttr(attr.cast(), + output_tensor); default: - return errors::Unimplemented(absl::StrCat("Unimplemented data type ", - DataTypeString(output_dtype))); + return ConvertOpaqueElementsAttr(attr.cast(), + output_tensor); } - return Status::OK(); } Status ConvertToTensor(const mlir::ElementsAttr attr, Tensor* output_tensor) { From f8ffa8a8dd98df48610d64e7f2a994438468d8e5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 15:45:51 -0700 Subject: [PATCH 0281/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310821618 Change-Id: I61f2d8179e2a9adc812c464db8a789f9427026d6 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From b1e6a4eceb8aaf7c8417628c429a98e98ed733dc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 17:45:56 -0700 Subject: [PATCH 0282/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 310828877 Change-Id: If9332da49b46a5d23f9f65fdb82994b627aee0c6 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 9f320410a2cc0de6e3cdc8d372bfa79676f85058 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 19:26:58 -0700 Subject: [PATCH 0283/1533] Support align_corners and half_pixel_centers for resize ops in NNAPI delegate. PiperOrigin-RevId: 310835481 Change-Id: I6538e64b453bc3b633a5656a8130ed2139781a94 --- .../delegates/nnapi/acceleration_test_list.cc | 8 ++-- .../lite/delegates/nnapi/nnapi_delegate.cc | 47 ++++++++++++------- .../lite/kernels/resize_bilinear_test.cc | 8 ---- 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc index 01d56fb2102..cc9e049123e 100644 --- a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc +++ b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc @@ -300,13 +300,15 @@ VariedShapeSpec/ReshapeOpTest/RegularShapes/1 VariedShapeSpec/ReshapeOpTest/WithStretchDimension/1 # resize_bilinear_test +// align_corners & half_pixel_centers are not implemented in NNAPI before API 30 +ResizeBilinearOpTest/ResizeBilinearOpTest.+HalfPixelCenters.*,30 // Only models with constant size tensor are accelerated ResizeBilinearOpTest/ResizeBilinearOpTest/.+/0,29 # resize_nearest_neighbor_test -// align_corners & half_pixel_centers are not implemented in NNAPI. --ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+AlignCorners.*,29 --ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+HalfPixelCenters.*,29 +// align_corners & half_pixel_centers are not implemented in NNAPI before API 30 +ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+AlignCorners.*,30 +ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+HalfPixelCenters.*,30 // Only models with constant size tensor are accelerated ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest/.+/0,29 diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 67e038e962e..e790d423434 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -1648,13 +1648,14 @@ bool NNAPIDelegateKernel::Validate( } auto builtin = reinterpret_cast(node->builtin_data); - Expect(!builtin->align_corners, - NNAPIValidationFailureType::kUnsupportedOperandValue, - "NNAPI does not support align_corners == true.", &val_ctx); - // TODO(b/147696142): Update when NNAPI delegate can support TF2 behavior. 
- Expect(!builtin->half_pixel_centers, - NNAPIValidationFailureType::kUnsupportedOperandValue, - "NNAPI does not support half_pixel_centers == true.", &val_ctx); + if (android_sdk_version <= kMinSdkVersionForNNAPI12) { + Expect(!builtin->align_corners, + NNAPIValidationFailureType::kUnsupportedOperandValue, + "NNAPI does not support align_corners == true.", &val_ctx); + Expect(!builtin->half_pixel_centers, + NNAPIValidationFailureType::kUnsupportedOperandValue, + "NNAPI does not support half_pixel_centers == true.", &val_ctx); + } if (android_sdk_version < kMinSdkVersionForNNAPI12) { Expect(input.type == kTfLiteFloat32, NNAPIValidationFailureType::kUnsupportedInputType, @@ -1668,14 +1669,14 @@ bool NNAPIDelegateKernel::Validate( ExpectIsFloatOrQuant8Operator(context, node, &val_ctx); auto builtin = reinterpret_cast( node->builtin_data); - // TODO(b/149823713): Update when NNAPI delegate can support align_corners - // & half_pixel_centers. - Expect(!builtin->align_corners, - NNAPIValidationFailureType::kUnsupportedOperandValue, - "NNAPI does not support align_corners == true.", &val_ctx); - Expect(!builtin->half_pixel_centers, - NNAPIValidationFailureType::kUnsupportedOperandValue, - "NNAPI does not support half_pixel_centers == true.", &val_ctx); + if (android_sdk_version <= kMinSdkVersionForNNAPI12) { + Expect(!builtin->align_corners, + NNAPIValidationFailureType::kUnsupportedOperandValue, + "NNAPI does not support align_corners == true.", &val_ctx); + Expect(!builtin->half_pixel_centers, + NNAPIValidationFailureType::kUnsupportedOperandValue, + "NNAPI does not support half_pixel_centers == true.", &val_ctx); + } } break; case kTfLiteBuiltinSqueeze: { ExpectOpVersion(version, 1, &val_ctx); @@ -2436,6 +2437,14 @@ TfLiteStatus NNAPIDelegateKernel::Map( const int output_width = output.dims->data[2]; mapping_args.builder->AddScalarInt32Operand(output_width); mapping_args.builder->AddScalarInt32Operand(output_height); + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + if (builtin->align_corners == true || + builtin->half_pixel_centers == true) { + mapping_args.builder->AddScalarBoolOperand(false); // Use NHWC format + mapping_args.builder->AddScalarBoolOperand(builtin->align_corners); + mapping_args.builder->AddScalarBoolOperand(builtin->half_pixel_centers); + } *nn_op_type = ANEURALNETWORKS_RESIZE_BILINEAR; } break; case kTfLiteBuiltinResizeNearestNeighbor: { @@ -2445,7 +2454,13 @@ TfLiteStatus NNAPIDelegateKernel::Map( mapping_args.builder->AddScalarInt32Operand(new_shape.data.i32[1]); mapping_args.builder->AddScalarInt32Operand(new_shape.data.i32[0]); mapping_args.builder->AddScalarBoolOperand(false); // Use NHWC format - + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + if (builtin->align_corners == true || + builtin->half_pixel_centers == true) { + mapping_args.builder->AddScalarBoolOperand(builtin->align_corners); + mapping_args.builder->AddScalarBoolOperand(builtin->half_pixel_centers); + } *nn_op_type = ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR; } break; case kTfLiteBuiltinSqueeze: { diff --git a/tensorflow/lite/kernels/resize_bilinear_test.cc b/tensorflow/lite/kernels/resize_bilinear_test.cc index 5cbba026010..d4d414ae29c 100644 --- a/tensorflow/lite/kernels/resize_bilinear_test.cc +++ b/tensorflow/lite/kernels/resize_bilinear_test.cc @@ -190,10 +190,6 @@ TEST_P(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) { TEST_P(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches_HalfPixelCenters) { - // TODO(b/147696142): Update when 
NNAPI delegate can support TF2 behavior. - if (SingleOpModel::GetForceUseNnapi()) { - return; - } ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3}, GetParam(), /**half_pixel_centers**/ true); m.SetInput({ @@ -253,10 +249,6 @@ TEST_P(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatchesUInt8) { TEST_P(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatchesUInt8_HalfPixelCenters) { - // TODO(b/147696142): Update when NNAPI delegate can support TF2 behavior. - if (SingleOpModel::GetForceUseNnapi()) { - return; - } ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}}, {3, 3}, GetParam(), /**half_pixel_centers**/ true); m.SetInput({ From ed400dea16540c75d4e659494adc25a5ffc8bc9c Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Sun, 10 May 2020 19:37:06 -0700 Subject: [PATCH 0284/1533] Log the rendezvous address for debugging purpose. This is very helpful for debugging blocking issues. PiperOrigin-RevId: 310836009 Change-Id: I68f7b349de0c07b5e09518f2e5a2983eb4d16302 --- tensorflow/core/kernels/sendrecv_ops.cc | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 93830515040..3d94fe1b6a5 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -92,14 +92,16 @@ void SendOp::Compute(OpKernelContext* ctx) { FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); if (frame_iter == FrameAndIter(0, 0)) { // Use the cached rendezvous key. - VLOG(2) << "Send " << parsed_key_.buf_; + VLOG(2) << "Send " << parsed_key_.buf_ << " using " + << reinterpret_cast(ctx->rendezvous()); ctx->SetStatus(ctx->rendezvous()->Send(parsed_key_, args, ctx->input(0), ctx->is_input_dead())); return; } else { Rendezvous::ParsedKey in_loop_parsed; GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_); - VLOG(2) << "Send " << in_loop_parsed.buf_; + VLOG(2) << "Send " << in_loop_parsed.buf_ << " using " + << reinterpret_cast(ctx->rendezvous()); OP_REQUIRES_OK(ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed)); @@ -200,13 +202,15 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); if (frame_iter == FrameAndIter(0, 0)) { - VLOG(2) << "Recv " << parsed_key_.buf_; + VLOG(2) << "Recv " << parsed_key_.buf_ << " using " + << reinterpret_cast(ctx->rendezvous()); ctx->rendezvous()->RecvAsync(parsed_key_, args, make_recv_callback(ctx, std::move(done))); } else { Rendezvous::ParsedKey in_loop_parsed; GetRendezvousKey(key_prefix_, frame_iter, &in_loop_parsed.buf_); - VLOG(2) << "Recv " << in_loop_parsed.buf_; + VLOG(2) << "Recv " << in_loop_parsed.buf_ << " using " + << reinterpret_cast(ctx->rendezvous()); OP_REQUIRES_OK_ASYNC( ctx, Rendezvous::ParseKey(in_loop_parsed.buf_, &in_loop_parsed), done); ctx->rendezvous()->RecvAsync(in_loop_parsed, args, From 7e6348104ecec8880a97b90a0bb089653a18b7db Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 19:45:52 -0700 Subject: [PATCH 0285/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 310836559 Change-Id: I2ed2607edc69fbd23a47c58c6276cfb7f67230ff --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From b7d7b6d7dbc56d83eafcf2e44eb5066e18ebdadb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 21:46:42 -0700 Subject: [PATCH 0286/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310846764 Change-Id: I9c41426656112ebdb45549584f356485bccd2e06 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. 
// // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From c6d2369174a69d3f873100f02788a50c396395ea Mon Sep 17 00:00:00 2001 From: marload Date: Mon, 11 May 2020 15:37:57 +0900 Subject: [PATCH 0287/1533] Refactoring: Format String -> Format Method --- configure.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/configure.py b/configure.py index a003265f3c9..ca2ff597a31 100644 --- a/configure.py +++ b/configure.py @@ -144,7 +144,7 @@ def write_to_bazelrc(line): def write_action_env_to_bazelrc(var_name, var): - write_to_bazelrc('build --action_env %s="%s"' % (var_name, str(var))) + write_to_bazelrc('build --action_env {}="{}"'.format(var_name, str(var))) def run_shell(cmd, allow_non_zero=False, stderr=None): @@ -205,7 +205,7 @@ def setup_python(environ_cp): # Get PYTHON_BIN_PATH, default is the current running python. default_python_bin_path = sys.executable ask_python_bin_path = ('Please specify the location of python. 
[Default is ' - '%s]: ') % default_python_bin_path + '{}]: ').format(default_python_bin_path) while True: python_bin_path = get_from_env_or_user_or_default(environ_cp, 'PYTHON_BIN_PATH', @@ -215,9 +215,9 @@ def setup_python(environ_cp): if os.path.isfile(python_bin_path) and os.access(python_bin_path, os.X_OK): break elif not os.path.exists(python_bin_path): - print('Invalid python path: %s cannot be found.' % python_bin_path) + print('Invalid python path: {} cannot be found.'.format(python_bin_path)) else: - print('%s is not executable. Is it the python binary?' % python_bin_path) + print('{} is not executable. Is it the python binary?'.format(python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = '' # Convert python path to Windows style before checking lib and version @@ -236,7 +236,7 @@ def setup_python(environ_cp): default_python_lib_path = python_lib_paths[0] python_lib_path = get_input( 'Please input the desired Python library path to use. ' - 'Default is [%s]\n' % python_lib_paths[0]) + 'Default is [{}]\n'.format(python_lib_paths[0])) if not python_lib_path: python_lib_path = default_python_lib_path environ_cp['PYTHON_LIB_PATH'] = python_lib_path @@ -252,7 +252,7 @@ def setup_python(environ_cp): # Set-up env variables used by python_configure.bzl write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path) write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path) - write_to_bazelrc('build --python_path=\"%s"' % python_bin_path) + write_to_bazelrc('build --python_path=\"{}"'.format(python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = python_bin_path # If choosen python_lib_path is from a path specified in the PYTHONPATH @@ -266,7 +266,7 @@ def setup_python(environ_cp): with open( os.path.join(_TF_WORKSPACE_ROOT, 'tools', 'python_bin_path.sh'), 'w') as f: - f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path) + f.write('export PYTHON_BIN_PATH="{}"'.format(python_bin_path)) def reset_tf_configure_bazelrc(): @@ -320,11 +320,11 @@ def get_var(environ_cp, Raise the error to avoid infinitely looping. """ if not question: - question = 'Do you wish to build TensorFlow with %s support?' % query_item + question = 'Do you wish to build TensorFlow with {} support?'.format(query_item) if not yes_reply: - yes_reply = '%s support will be enabled for TensorFlow.' % query_item + yes_reply = '{} support will be enabled for TensorFlow.'.format(query_item) if not no_reply: - no_reply = 'No %s' % yes_reply + no_reply = 'No {}'.format(yes_reply) yes_reply += '\n' no_reply += '\n' @@ -368,7 +368,7 @@ def get_var(environ_cp, print(no_reply) var = False else: - print('Invalid selection: %s' % user_input_origin) + print('Invalid selection: {}'.format(user_input_origin)) return var From ca55b85c00ba06d772e6a285b118ed060cc36062 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 10 May 2020 23:46:24 -0700 Subject: [PATCH 0288/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310855894 Change-Id: I69968fa127faeb5ec180e552cc2e2082a5d15dcf --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. 
Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 6725f80c0618952a0d139ea56f33c38a0686ced7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 02:02:57 -0700 Subject: [PATCH 0289/1533] Update GraphDef version to 398. PiperOrigin-RevId: 310869430 Change-Id: I2e60eb43eeacea7286a932c94f809a3bfb117501 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index b7f2b76d6ba..db436f3e511 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 397 // Updated: 2020/5/10 +#define TF_GRAPH_DEF_VERSION 398 // Updated: 2020/5/11 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From aa37de465e6336a42bff68ffbe1bcb40d43b0c75 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 02:02:59 -0700 Subject: [PATCH 0290/1533] compat: Update forward compatibility horizon to 2020-05-11 PiperOrigin-RevId: 310869439 Change-Id: I7ed313dbac4466b6471cbc6c670282c9ae392257 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 17f559eaf17..a7cf5b25363 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 10) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 11) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From b5ab47572a6bd97c5555c3ac77187f457e59447e Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Mon, 11 May 2020 02:05:32 -0700 Subject: [PATCH 0291/1533] Use int32_t and uint32_t instead of int32 and uint32. Also change string to std::string. 
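Background for the rename: TensorFlow's platform headers have historically provided `int32`/`uint32` typedefs and a `string` alias for `std::string`; the `<cstdint>` spellings make the integer widths explicit and drop the dependency on those aliases. The sketch below illustrates the substitution pattern only and is not part of the patch; the variable names simply echo the ones touched in tf_to_cubin.cc.

```
// Standalone illustration of the type substitutions applied in this change.
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main() {
  int32_t architecture = 50;                 // was the TF alias: int32
  std::vector<uint32_t> tile_sizes{16, 64};  // was: std::vector of uint32
  std::string output_file = "foo.bin";       // was the TF alias: string
  std::cout << architecture << " " << tile_sizes.size() << " " << output_file
            << "\n";
  return 0;
}
```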
PiperOrigin-RevId: 310869780 Change-Id: I5a5d295ae259626121247726404891749e72b74f --- .../mlir/tools/kernel_gen/cubin_creator.cc | 4 ++-- .../mlir/tools/kernel_gen/tf_to_cubin.cc | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc index 46af4e4c94c..45d10214a42 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -64,9 +64,9 @@ using xla::StatusOr; StatusOr GetLibdeviceDir( const xla::HloModuleConfig& hlo_module_config) { - for (const string& cuda_root : tensorflow::CandidateCudaRoots( + for (const std::string& cuda_root : tensorflow::CandidateCudaRoots( hlo_module_config.debug_options().xla_gpu_cuda_data_dir())) { - string libdevice_dir = + std::string libdevice_dir = tensorflow::io::JoinPath(cuda_root, "nvvm", "libdevice"); VLOG(2) << "Looking for libdevice at " << libdevice_dir; if (tensorflow::Env::Default()->IsDirectory(libdevice_dir).ok()) { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc index d39edd89e34..c9b447f5cad 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc @@ -31,9 +31,9 @@ #include "tensorflow/core/util/command_line_flags.h" namespace { -bool ParseStringList(std::string string_list, std::vector* result) { +bool ParseStringList(std::string string_list, std::vector* result) { result->clear(); - uint32 item; + uint32_t item; auto items = absl::StrSplit(string_list, ','); for (const auto& item_str : items) { if (!absl::SimpleAtoi(item_str, &item)) { @@ -48,10 +48,10 @@ bool ParseStringList(std::string string_list, std::vector* result) { int main(int argc, char** argv) { std::string output_file = "foo.bin"; - int32 architecture = 50; - std::vector tile_sizes; - std::vector unroll_factors; - std::vector same_shape; + int32_t architecture = 50; + std::vector tile_sizes; + std::vector unroll_factors; + std::vector same_shape; auto parse_tile_sizes = [&tile_sizes](std::string tile_sizes_str) { if (!ParseStringList(tile_sizes_str, &tile_sizes)) { @@ -91,8 +91,8 @@ int main(int argc, char** argv) { return 1; } - std::pair compute_capability(architecture / 10, - architecture % 10); + std::pair compute_capability(architecture / 10, + architecture % 10); auto cubin = tensorflow::kernel_gen::GenerateCubinForTfCode( argv[1], compute_capability, tile_sizes, same_shape, unroll_factors); From f208ff6827e17fe773cf59192abaaa3f90bd16ad Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Mon, 11 May 2020 17:14:11 +0800 Subject: [PATCH 0292/1533] [tflite] reformat/cleanup label_image readme.md --- .../lite/examples/label_image/README.md | 124 +++++++++++++----- 1 file changed, 88 insertions(+), 36 deletions(-) diff --git a/tensorflow/lite/examples/label_image/README.md b/tensorflow/lite/examples/label_image/README.md index 09e9e77b86a..9d37c153361 100644 --- a/tensorflow/lite/examples/label_image/README.md +++ b/tensorflow/lite/examples/label_image/README.md @@ -90,48 +90,100 @@ adb push tensorflow/lite/examples/label_image/testdata/grace_hopper.bmp /data/l adb push /tmp/labels.txt /data/local/tmp ``` -Run it, `adb shell "/data/local/tmp/label_image \ -m -/data/local/tmp/mobilenet_v1_1.0_224.tflite \ -i -/data/local/tmp/grace_hopper.bmp \ -l /data/local/tmp/labels.txt"` then you -should see 
something like the followings: `Loaded model -/data/local/tmp/mobilenet_v1_1.0_224.tflite resolved reporter INFO: Initialized -TensorFlow Lite runtime. invoked average time: 25.03 ms 0.907071: 653 military -uniform 0.0372416: 907 Windsor tie 0.00733753: 466 bulletproof vest 0.00592852: -458 bow tie 0.00414091: 514 cornet` +Run it, +``` +adb shell "/data/local/tmp/label_image \ + -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ + -i /data/local/tmp/grace_hopper.bmp \ + -l /data/local/tmp/labels.txt" +``` +then you should see something like the followings: +``` +Loaded model /data/local/tmp/mobilenet_v1_1.0_224.tflite +resolved reporter +INFO: Initialized +TensorFlow Lite runtime. +invoked +average time: 25.03 ms +0.907071: 653 military uniform +0.0372416: 907 Windsor tie +0.00733753: 466 bulletproof vest +0.00592852: 458 bow tie +0.00414091: 514 cornet +``` -Run the model with NNAPI delegate (`-a 1`), `adb shell -"/data/local/tmp/label_image \ -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ --i /data/local/tmp/grace_hopper.bmp \ -l /data/local/tmp/labels.txt -a 1 -f 1"` -then you should see something like the followings: `Loaded model -/data/local/tmp/mobilenet_v1_1.0_224.tflite resolved reporter INFO: Initialized -TensorFlow Lite runtime. INFO: Created TensorFlow Lite delegate for NNAPI. -Applied NNAPI delegate. invoked average time:10.348 ms 0.905401: 653 military -uniform 0.0379589: 907 Windsor tie 0.00735866: 466 bulletproof vest 0.00605307: -458 bow tie 0.00422573: 514 cornet` +Run the model with NNAPI delegate (`-a 1`), +``` +adb shell "/data/local/tmp/label_image \ + -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ + -i /data/local/tmp/grace_hopper.bmp \ + -l /data/local/tmp/labels.txt -a 1 -f 1" +``` +then you should see something like the followings: +``` +Loaded model /data/local/tmp/mobilenet_v1_1.0_224.tflite +resolved reporter +INFO: Initialized +TensorFlow Lite runtime. +INFO: Created TensorFlow Lite delegate for NNAPI. +Applied NNAPI delegate. +invoked +average time:10.348 ms +0.905401: 653 military uniform +0.0379589: 907 Windsor tie +0.00735866: 466 bulletproof vest +0.00605307: 458 bow tie +0.00422573: 514 cornet +``` To run a model with the Hexagon Delegate, assuming we have followed the [Hexagon Delegate Guide](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/hexagon_delegate.md) -and installed Hexagon libraries in `/data/local/tmp`. Run it `adb shell -"/data/local/tmp/label_image \ -m -/data/local/tmp/mobilenet_v1_1.0_224_quant.tflite \ -i -/data/local/tmp/grace_hopper.bmp \ -l /data/local/tmp/labels.txt -j 1"` then you -should see something like the followings: ``` Loaded model -/data/local/tmp/mobilenet_v1_1.0_224_quant.tflite resolved reporter INFO: -Initialized TensorFlow Lite runtime. INFO: Created TensorFlow Lite delegate for -Hexagon. INFO: Hexagon delegate: 31 nodes delegated out of 31 nodes. +and installed Hexagon libraries in `/data/local/tmp`. Run it +``` +adb shell \ + "/data/local/tmp/label_image \ + -m /data/local/tmp/mobilenet_v1_1.0_224_quant.tflite \ + -i /data/local/tmp/grace_hopper.bmp \ + -l /data/local/tmp/labels.txt -j 1" +``` +then you should see something like the followings: +``` +Loaded model /data/local/tmp/mobilenet_v1_1.0_224_quant.tflite +resolved reporter +INFO: Initialized TensorFlow Lite runtime. +loaded libcdsprpc.so +INFO: Created TensorFlow Lite delegate for Hexagon. +INFO: Hexagon delegate: 31 nodes delegated out of 31 nodes with 1 partitions. 
-remote_handle_control available and used Applied Hexagon delegate.invoked -average time: 8.307 ms 0.729412: 653 military uniform 0.0980392: 907 Windsor tie -0.0313726: 466 bulletproof vest 0.0313726: 458 bow tie 0.0117647: 700 panpipe +Applied Hexagon delegate.invoked +average time: 4.231 ms +0.639216: 458 bow tie +0.329412: 653 military uniform +0.00784314: 835 suit +0.00784314: 611 jersey +0.00392157: 514 cornet ``` -Run the model with the XNNPACK delegate (`-x 1`), `adb shell -"/data/local/tmp/label_image \ -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ --i /data/local/tmp/grace_hopper.bmp \ -l /data/local/tmp/labels.txt -x 1"` then -you should see something like the followings: `Loaded model -/data/local/tmp/mobilenet_v1_1.0_224.tflite resolved reporter INFO: Initialized -TensorFlow Lite runtime. Applied XNNPACK delegate.invoked average time: 11.0237 -ms 0.90707: 653 military uniform 0.0372418: 907 Windsor tie 0.0073376: 466 -bulletproof vest 0.00592856: 458 bow tie 0.00414093: 514 cornet` +Run the model with the XNNPACK delegate (`-x 1`), +``` +adb shell \ + "/data/local/tmp/label_image \ + -m /data/local/tmp/mobilenet_v1_1.0_224.tflite \ + -i /data/local/tmp/grace_hopper.bmp \ + -l /data/local/tmp/labels.txt -x 1" +``` +then you should see something like the followings: +``` +Loaded model /data/local/tmp/mobilenet_v1_1.0_224.tflite +resolved reporter +INFO: Initialized TensorFlow Lite runtime. +Applied XNNPACK delegate.invoked +average time: 17.33 ms +0.90707: 653 military uniform +0.0372418: 907 Windsor tie +0.0073376: 466 bulletproof vest +0.00592857: 458 bow tie +0.00414093: 514 cornet +``` See the `label_image.cc` source code for other command line options. From 7cb0b5767c549df17a52173ef33ec7d2487d25e2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 02:32:17 -0700 Subject: [PATCH 0293/1533] Tolerate differences equal to `tolerated` threshold in MinMaxApproximatelyEqual. PiperOrigin-RevId: 310872715 Change-Id: I5b56efad6c31efa144a72f3a30843a98fec0a6f1 --- .../lite/toco/graph_transformations/hardcode_min_max.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc index 9816cc1df6a..171d522daa7 100644 --- a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc +++ b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc @@ -271,8 +271,8 @@ bool MinMaxApproximatelyEqual(const MinMax& minmax1, const MinMax& minmax2) { const double magnitude = std::min(minmax1.max - minmax1.min, minmax2.max - minmax2.min); const double tolerated = 1e-6 * magnitude; - return std::abs(minmax1.min - minmax2.min) < tolerated && - std::abs(minmax1.max - minmax2.max) < tolerated; + return std::abs(minmax1.min - minmax2.min) <= tolerated && + std::abs(minmax1.max - minmax2.max) <= tolerated; } // Propagates MinMax from any of the listed arrays, to all others. 
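One concrete case the `<=` comparison above now accepts: two identical zero-width ranges (min == max) give magnitude 0 and therefore tolerated == 0, so the strict `0 < 0` test reports them as not approximately equal. The standalone sketch below reproduces just that boundary behaviour; it is an illustration, not the TOCO source (MinMax is reduced to a bare struct and the inclusive/strict choice is exposed as a flag).

```
// Boundary behaviour of the tolerance comparison; illustration only.
#include <algorithm>
#include <cmath>
#include <iostream>

struct MinMax { double min; double max; };

// Same arithmetic as MinMaxApproximatelyEqual; `inclusive` picks <= over <.
bool ApproximatelyEqual(const MinMax& a, const MinMax& b, bool inclusive) {
  const double magnitude = std::min(a.max - a.min, b.max - b.min);
  const double tolerated = 1e-6 * magnitude;
  const double diff_min = std::abs(a.min - b.min);
  const double diff_max = std::abs(a.max - b.max);
  return inclusive ? (diff_min <= tolerated && diff_max <= tolerated)
                   : (diff_min < tolerated && diff_max < tolerated);
}

int main() {
  const MinMax a{2.5, 2.5};
  const MinMax b{2.5, 2.5};  // identical zero-width range
  std::cout << "strict <     : " << ApproximatelyEqual(a, b, false) << "\n"   // 0
            << "inclusive <= : " << ApproximatelyEqual(a, b, true) << "\n";   // 1
  return 0;
}
```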
From f8867620dcc60433b9a83a5af5b96276e83127d6 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Mon, 11 May 2020 12:36:37 +0300 Subject: [PATCH 0294/1533] Explanatory comments in slicing tests files (ARC specific) + URL to the latest embarc_MLI library --- tensorflow/lite/micro/kernels/arc_mli/README.md | 2 +- .../lite/micro/kernels/arc_mli/conv_slicing_test.cc | 9 +++++++++ .../micro/kernels/arc_mli/depthwise_conv_slicing_test.cc | 9 +++++++++ .../kernels/arc_mli/fully_connected_slicing_test.cc | 9 +++++++++ .../lite/micro/kernels/arc_mli/pooling_slicing_test.cc | 8 ++++++++ .../lite/micro/tools/make/third_party_downloads.inc | 8 ++++---- 6 files changed, 40 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/micro/kernels/arc_mli/README.md b/tensorflow/lite/micro/kernels/arc_mli/README.md index 2b2e194e757..33e46ca871d 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/README.md +++ b/tensorflow/lite/micro/kernels/arc_mli/README.md @@ -16,7 +16,7 @@ In case MLI implementation can’t be used, kernels in this folder fallback to T For ARC EM SDP board, a pre-compiled MLI library is downloaded and used in the application. For a custom target ARC-based platform, MLI sources are downloaded and compiled during project generation phase. To build library from sources for ARC EM SDP platform, add `BUILD_ARC_MLI=true` option to make command: - make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project. + make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project If an application exclusively uses accelerated MLI kernel implementations, one can strip out TFLM reference kernel implementations to reduce code size of application. Build application with `MLI_ONLY=true` option in generated project (after the project was built): diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc index 27e30856f6c..9eb9d6499dd 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc @@ -13,6 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// This test checks that slicing logic doesn`t affect result of convolution +// kernel +// +// This test doesn`t replace default convolution test +// (tensorflow/lite/micro/kernels/conv_test.cc). It is added to the whole +// testset only in case MLI for ARC platform is used during generation (which is +// handled in arc_mli.inc). So such tests won`t be generated for other +// platforms. + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/micro/kernels/all_ops_resolver.h" diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc index fb9dd46c1e4..e6a87ff82e6 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc @@ -13,6 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +// This test checks that slicing logic doesn`t affect result of depthwise +// convolution kernel +// +// This test doesn`t replace default depthwise convolution test +// (tensorflow/lite/micro/kernels/depthwise_conv_test.cc). It is added to the +// whole testset only in case MLI for ARC platform is used during generation +// (which is handled in arc_mli.inc). So such tests won`t be generated for other +// platforms. + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc index 78cb2873c54..0bd264a5f1b 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected_slicing_test.cc @@ -13,6 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// This test checks that slicing logic doesn`t affect result of fully +// connected kernel +// +// This test doesn`t replace default fully connected test +// (tensorflow/lite/micro/kernels/fully_connected_test.cc). It is added to the +// whole testset only in case MLI for ARC platform is used during generation +// (which is handled in arc_mli.inc). So such tests won`t be generated for other +// platforms. + #include #include "tensorflow/lite/c/builtin_op_data.h" diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc index 63737a41791..381420f1f7d 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc @@ -13,6 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// This test checks that slicing logic doesn`t affect result of pooling kernels +// +// This test doesn`t replace default pooling test +// (tensorflow/lite/micro/kernels/pooling.cc). It is added to the +// whole testset only in case MLI for ARC platform is used during generation +// (which is handled in arc_mli.inc). So such tests won`t be generated for other +// platforms. 
+ #include #include "tensorflow/lite/c/builtin_op_data.h" diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index d90f8548f31..91f3f1b5263 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -71,11 +71,11 @@ PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab" PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip" PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" -EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/7026ad09bb7f967324eb29e069f776bc44a08886.zip" -EMBARC_MLI_MD5 := "7eebd730db79c6834399f87e509115fb" +EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/58284867ca52d1f43b25045e8601999d7359d986.zip" +EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef" -EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC1/embARC_MLI_package.zip" -EMBARC_MLI_PRE_COMPILED_MD5 := "a66d6afff8daeb40bd3a99c42de048ab" +EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC2/embARC_MLI_package.zip" +EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6" XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From 476ec938b253a9479de09aab88dceec6f0a304ed Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 02:59:02 -0700 Subject: [PATCH 0295/1533] Install auditwheel in the multi-python container and move patchelf installation into the combined package installation step. PiperOrigin-RevId: 310875761 Change-Id: I6591396e1a2b0d99fc85123746702dd331b8fdcb --- ...10.1-cudnn7-ubuntu16.04-manylinux2010-multipython | 12 ++++-------- .../install/install_pip_packages_by_version.sh | 1 + 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython index 353d946261d..2e520f62cde 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython @@ -55,17 +55,16 @@ RUN /install/install_bootstrap_deb_packages.sh COPY install/install_deb_packages.sh /install/ RUN /install/install_deb_packages.sh -# Install patchelf to facilitate the creation of manylinux2010 whls. -COPY install/install_patchelf.sh /install/ -RUN /install/install_patchelf.sh - -# Install additional dependencies to build Python from source. 
+# Install additional packages needed for this image: +# - dependencies to build Python from source +# - patchelf, as it is required by auditwheel RUN apt-get update && apt-get install -y \ libncurses5-dev \ libgdbm-dev \ libnss3-dev \ libreadline-dev \ libffi-dev \ + patchelf \ && \ rm -rf /var/lib/apt/lists/* @@ -86,9 +85,6 @@ RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.5" RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.6" RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7" -# Install auditwheel to create manylinux2010 compliant binaries -RUN pip3 install auditwheel - ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d" COPY install/install_latest_clang.sh /install/ RUN /install/install_latest_clang.sh diff --git a/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh b/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh index d9953db3b5a..81e5f2b6406 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh @@ -26,6 +26,7 @@ if [[ ! -x "$(which "${PIP}")" ]]; then fi PACKAGES=( + "auditwheel" "wheel" "setuptools" "virtualenv" From 54ba3f51472968b92c94d5c809869cab03d203ac Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Mon, 11 May 2020 04:24:20 -0700 Subject: [PATCH 0296/1533] Bump open source LLVM revision to f40fc7b8d60e6897fd9514a46a26f4b84f57577a PiperOrigin-RevId: 310886191 Change-Id: Ibee0560c691301ed3de92e40c2c5ef22b107210d --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 5b479c54b8b..ea25bfe6bc7 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "c5e0967e4cf0f1337bec772949e6cede4c01354b" - LLVM_SHA256 = "5d8dbddd78fbc1c08825b178aff0a0f04722d83280eb93be55a174391f1885ce" + LLVM_COMMIT = "f40fc7b8d60e6897fd9514a46a26f4b84f57577a" + LLVM_SHA256 = "8f5ea1c26fc922b1b0752f6bd63b31d4137e04630828cb3a4f94a9bbdbcf575d" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From edabfee6e0a2847966ff80060e91fcfdde452428 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 06:31:02 -0700 Subject: [PATCH 0297/1533] Increase the minimum support iOS version from 10 to 11. 
PiperOrigin-RevId: 310899980 Change-Id: I40bcece83fe070b8eeb950c500c473b62f03221f --- tensorflow/lite/delegates/gpu/BUILD | 2 +- tensorflow/lite/delegates/gpu/metal/BUILD | 10 ++--- .../lite/delegates/gpu/metal/kernels/BUILD | 38 +++++++++---------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index 099f653a1b8..2581232bc2b 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -167,7 +167,7 @@ ios_static_framework( "metal_delegate.h", "metal_delegate_internal.h", ], - minimum_os_version = "10.0", + minimum_os_version = "11.0", deps = [":metal_delegate"], ) diff --git a/tensorflow/lite/delegates/gpu/metal/BUILD b/tensorflow/lite/delegates/gpu/metal/BUILD index 192c787b0db..4db8f3d071d 100644 --- a/tensorflow/lite/delegates/gpu/metal/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/BUILD @@ -80,7 +80,7 @@ objc_library( ios_unit_test( name = "common_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -117,7 +117,7 @@ objc_library( ios_unit_test( name = "compiled_model_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -205,7 +205,7 @@ objc_library( ios_unit_test( name = "inference_context_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -235,7 +235,7 @@ ios_application( "iphone", ], infoplists = ["Info.plist"], - minimum_os_version = "10.0", + minimum_os_version = "11.0", provisioning_profile = "//tensorflow/lite/delegates/gpu/metal:provisioning_profile.mobileprovision", tags = tf_gpu_tests_tags() + [ "local", @@ -267,7 +267,7 @@ objc_library( ios_unit_test( name = "ComponentsTests", - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + ["notap"], test_host = ":TestApplication", diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index a1052b8adf4..657e9b53a59 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -71,7 +71,7 @@ objc_library( ios_unit_test( name = "add_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -109,7 +109,7 @@ objc_library( ios_unit_test( name = "concat_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -151,7 +151,7 @@ objc_library( ios_unit_test( name = "conv_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -213,7 +213,7 @@ objc_library( ios_unit_test( name = "depthwise_conv_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -253,7 +253,7 @@ objc_library( ios_unit_test( name = "elementwise_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = 
tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -293,7 +293,7 @@ objc_library( ios_unit_test( name = "fully_connected_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -332,7 +332,7 @@ objc_library( ios_unit_test( name = "max_unpooling_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -371,7 +371,7 @@ objc_library( ios_unit_test( name = "mean_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = [ "notap", @@ -450,7 +450,7 @@ objc_library( ios_unit_test( name = "padding_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -490,7 +490,7 @@ objc_library( ios_unit_test( name = "pooling_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -530,7 +530,7 @@ objc_library( ios_unit_test( name = "prelu_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -569,7 +569,7 @@ objc_library( ios_unit_test( name = "relu_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -608,7 +608,7 @@ objc_library( ios_unit_test( name = "resize_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -648,7 +648,7 @@ objc_library( ios_unit_test( name = "reshape_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -687,7 +687,7 @@ objc_library( ios_unit_test( name = "slice_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -727,7 +727,7 @@ objc_library( ios_unit_test( name = "softmax_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -764,7 +764,7 @@ objc_library( ios_unit_test( name = "space_to_depth_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -804,7 +804,7 @@ objc_library( ios_unit_test( name = "transpose_conv_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", @@ -885,7 +885,7 @@ objc_library( ios_unit_test( name = "winograd_test", testonly = 1, - minimum_os_version = "10.0", + minimum_os_version = "11.0", runner = tflite_ios_lab_runner("IOS_LATEST"), tags = tf_gpu_tests_tags() + [ "notap", From 3354e9eb51820f106c863a01a94b8a18cebdd72a Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 11 May 2020 07:01:48 -0700 Subject: [PATCH 0298/1533] [XLA:CPU/GPU] Merge the elemental 
implementations of Reduce/ReduceWindow from CPU&GPU Maintaining one of them is more than enough. This takes Reduce from CPU because it has more features and ReduceWindow from GPU because it's tested in a fusion context. This should have no impact on features or performance. PiperOrigin-RevId: 310904046 Change-Id: I8cc3ceeb50200978d747be70fa05d09e897b6de7 --- .../xla/service/cpu/elemental_ir_emitter.cc | 22 -- .../xla/service/cpu/elemental_ir_emitter.h | 6 + .../compiler/xla/service/cpu/ir_emitter.cc | 190 --------------- .../compiler/xla/service/cpu/ir_emitter.h | 13 +- .../xla/service/elemental_ir_emitter.cc | 222 ++++++++++++++++++ .../xla/service/elemental_ir_emitter.h | 20 ++ .../xla/service/gpu/elemental_ir_emitter.cc | 139 ----------- .../xla/service/gpu/elemental_ir_emitter.h | 11 + 8 files changed, 261 insertions(+), 362 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index e21ca01c803..ccd17bb791d 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -121,12 +121,6 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( return ir_emitter_->EmitElementalMap(*Cast(hlo), operands, llvm_ir::IrName(hlo)); }; - case HloOpcode::kReduceWindow: - return [this, hlo, &operand_to_generator](const IrArray::Index& index) { - return ir_emitter_->EmitElementalReduceWindow( - Cast(hlo), - operand_to_generator.at(hlo->operand(0)), index); - }; case HloOpcode::kConvolution: return [this, hlo, &operand_to_generator](const IrArray::Index& index) { return ir_emitter_->EmitElementalConvolution( @@ -134,22 +128,6 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( operand_to_generator.at(hlo->operand(0)), operand_to_generator.at(hlo->operand(1)), index); }; - case HloOpcode::kReduce: - return [this, hlo, &operand_to_generator](const IrArray::Index& index) { - auto reduce_instr = Cast(hlo); - std::vector input_generators; - for (const HloInstruction* instr : reduce_instr->inputs()) { - input_generators.push_back(operand_to_generator.at(instr)); - } - - std::vector initial_value_generators; - for (const HloInstruction* instr : reduce_instr->init_values()) { - initial_value_generators.push_back(operand_to_generator.at(instr)); - } - return ir_emitter_->EmitElementalReduce( - reduce_instr, std::move(input_generators), - std::move(initial_value_generators), index); - }; default: return ElementalIrEmitter::MakeElementGenerator(hlo, operand_to_generator); diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h index e3fba9306b7..5c9f6677ab3 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.h @@ -44,6 +44,12 @@ class CpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitTanh(PrimitiveType prim_type, llvm::Value* value) override; + StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view name) override { + return ir_emitter_->EmitThreadLocalCall(callee, parameters, name); + } + IrEmitter* ir_emitter_; }; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index c19fa779b60..78d859cb34a 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ 
-702,94 +702,6 @@ llvm::Value* IrEmitter::EmitElementalMap( elemental_operands, name); } -StatusOr IrEmitter::EmitElementalReduceWindow( - const HloReduceWindowInstruction* reduce_window, - const llvm_ir::ElementGenerator& input_generator, - const llvm_ir::IrArray::Index& index) { - const HloInstruction* operand = reduce_window->operand(0); - const Window& window = reduce_window->window(); - - // We fold inputs into the accumulator and initialize it to - // the initial value on the reduce_window. - PrimitiveType operand_element_type = operand->shape().element_type(); - llvm::Value* accumulator_address = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "reduce_window_accumulator_address", &b_, - MinimumAlignmentForPrimitiveType(operand_element_type)); - Store(Load(GetEmittedValueFor(reduce_window->operand(1))), - accumulator_address); - - llvm_ir::ForLoopNest loops(IrName(reduce_window, "inner"), &b_); - std::vector window_size; - for (const auto& dim : window.dimensions()) { - window_size.push_back(dim.size()); - } - const llvm_ir::IrArray::Index window_index = loops.AddLoopsForShape( - ShapeUtil::MakeShape(operand_element_type, window_size), "window"); - CHECK_EQ(window_index.size(), index.size()); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - std::vector input_multi_index(index.size()); - llvm::Value* in_bounds_condition = nullptr; - for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* strided_index = - NSWMul(index[i], b_.getInt64(window.dimensions(i).stride())); - input_multi_index[i] = NSWSub( - NSWAdd(strided_index, - NSWMul(window_index[i], - b_.getInt64(window.dimensions(i).window_dilation()))), - b_.getInt64(window.dimensions(i).padding_low())); - - // We need to verify that we are not in the dilated base area. - llvm::Value* dilation_condition = - ICmpEQ(SRem(input_multi_index[i], - b_.getInt64(window.dimensions(i).base_dilation())), - b_.getInt64(0)); - if (in_bounds_condition == nullptr) { - in_bounds_condition = dilation_condition; - } else { - in_bounds_condition = And(in_bounds_condition, dilation_condition); - } - - // Apply base dilation to the index. - input_multi_index[i] = - SDiv(input_multi_index[i], - b_.getInt64(window.dimensions(i).base_dilation())); - - // We need to check if 0 <= input_multi_index[i] < bound, as otherwise we - // are in the padding so that we can skip the computation. That is - // equivalent to input_multi_index[i] < bound as an *unsigned* comparison, - // since a negative value will wrap to a large positive value. - llvm::Value* index_condition = - ICmpULT(input_multi_index[i], - b_.getInt64(ShapeUtil::GetDimension(operand->shape(), i))); - if (in_bounds_condition == nullptr) { - in_bounds_condition = index_condition; - } else { - in_bounds_condition = And(in_bounds_condition, index_condition); - } - } - CHECK(in_bounds_condition != nullptr); - - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds_condition, "in-bounds", &b_); - SetToFirstInsertPoint(if_data.true_block, &b_); - - // We are not in the padding, so carry out the computation. 
- llvm_ir::IrArray::Index input_index(input_multi_index, operand->shape(), - b_.getInt64Ty()); - TF_ASSIGN_OR_RETURN(llvm::Value* const input_value, - input_generator(input_index)); - llvm::Value* result = EmitScalarReturningThreadLocalCall( - *reduce_window->to_apply(), {Load(accumulator_address), input_value}, - "reducer_function"); - Store(result, accumulator_address); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return Load(accumulator_address); -} - Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { // Pseudo code for reduce window: // @@ -2099,108 +2011,6 @@ StatusOr IrEmitter::EmitVectorizedReduce( return true; } -StatusOr IrEmitter::EmitElementalReduce( - const HloReduceInstruction* reduce, - std::vector input_generators, - std::vector initial_value_generators, - const llvm_ir::IrArray::Index& index) { - const Shape& out_shape = reduce->shape(); - bool is_variadic = !out_shape.IsArray(); - int accumulators_count = 1; - if (is_variadic) { - CHECK(out_shape.IsTuple()); - accumulators_count = out_shape.tuple_shapes_size(); - } - - absl::Span reduced_dimensions(reduce->dimensions()); - - std::vector accumulator_addrs; - std::vector accumulator_types; - for (int i = 0; i < accumulators_count; i++) { - const Shape& element_shape = - is_variadic ? out_shape.tuple_shapes(i) : out_shape; - PrimitiveType accumulator_type = element_shape.element_type(); - llvm::Type* accumulator_llvm_type = - llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); - accumulator_types.push_back(accumulator_llvm_type); - - // Initialize an accumulator with init_value. - llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry( - accumulator_llvm_type, "accumulator_" + std::to_string(i), &b_, - MinimumAlignmentForPrimitiveType(accumulator_type)); - TF_ASSIGN_OR_RETURN( - llvm::Value* const init_value, - initial_value_generators[i](llvm_ir::IrArray::Index(index.GetType()))); - Store(init_value, accumulator_addr); - accumulator_addrs.push_back(accumulator_addr); - } - - // The enclosing loops go over all the target elements. Now we have to compute - // the actual target element. For this, we build a new loop nest to iterate - // over all the reduction dimensions in the argument. - // AddLoopsForShapeOnDimensions will return an Index where induction Value*s - // are placed for each dimension in dimensions, and all the rest are nullptrs. - llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_); - const HloInstruction* arg = reduce->operand(0); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions(arg->shape(), reduced_dimensions, - "reduction_dim"); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - // Build a full index for the input argument, using input_multi_index as the - // base. In input_multi_index only the reduction dimensions are filled in. We - // fill in the rest of the dimensions with induction Value*s taken from - // 'index' which iterates over the target array. See the high-level - // description in the XLA documentation for details. 
- llvm_ir::IrArray::Index::const_iterator it = index.begin(); - - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = *it++; - } - } - CHECK(index.end() == it); - llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), - b_.getInt64Ty()); - - std::vector reduction_operands; - for (llvm::Value* accum : accumulator_addrs) { - llvm::Value* accum_value = Load(accum); - reduction_operands.push_back(accum_value); - } - - for (int i = 0; i < accumulators_count; i++) { - TF_ASSIGN_OR_RETURN(llvm::Value* const input_element, - input_generators[i](input_index)); - reduction_operands.push_back(input_element); - } - - std::vector results = EmitThreadLocalCall( - *reduce->to_apply(), reduction_operands, "reduce_function"); - - CHECK(results.size() == accumulators_count); - for (int i = 0; i < accumulators_count; i++) { - Store(results[i], accumulator_addrs[i]); - } - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - - if (is_variadic) { - // Emit a structure, as that what the LoopEmitter expects. - llvm::Value* returned_structure = llvm::UndefValue::get( - llvm::StructType::get(b_.getContext(), accumulator_types)); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* accumulator_value = Load(accumulator_addrs[i]); - returned_structure = - b_.CreateInsertValue(returned_structure, accumulator_value, i); - } - return returned_structure; - } else { - CHECK_EQ(accumulator_addrs.size(), 1); - return Load(accumulator_addrs[0]); - } -} - Status IrEmitter::HandleReduce(HloInstruction* reduce) { auto arg = reduce->mutable_operand(0); auto init_value = reduce->mutable_operand(1); diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index cc5aa3f37fc..c5e05db40bd 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -58,6 +58,8 @@ namespace cpu { // functions. class IrEmitter : public DfsHloVisitorWithDefault, public IrBuilderMixin { + friend class CpuElementalIrEmitter; + public: using GeneratorForOperandIrArrays = std::function()>; @@ -118,23 +120,12 @@ class IrEmitter : public DfsHloVisitorWithDefault, const HloMapInstruction& map_instr, absl::Span elemental_operands, absl::string_view name); - // Emit code to emit the element at `index` for a reduce window instruction. - StatusOr EmitElementalReduceWindow( - const HloReduceWindowInstruction* reduce_window, - const llvm_ir::ElementGenerator& input_generator, - const llvm_ir::IrArray::Index& index); // Emit code to emit the element at `index` for a convolution instruction. StatusOr EmitElementalConvolution( const HloConvolutionInstruction* convolution, const llvm_ir::ElementGenerator& input_generator, const llvm_ir::ElementGenerator& kernel_generator, const llvm_ir::IrArray::Index& index); - // Emit code to emit the element at `index` for a reduce instruction. 
- StatusOr EmitElementalReduce( - const HloReduceInstruction* reduce, - std::vector input_generators, - std::vector initial_value_generator, - const llvm_ir::IrArray::Index& index); protected: // diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 30300b8c195..b4ea18a8a1e 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -2422,6 +2422,28 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( -> StatusOr { return EmitElementalDot(hlo, operand_to_generator, dot_result_index); }; + case HloOpcode::kReduceWindow: + return [this, hlo, &operand_to_generator](const IrArray::Index& index) { + return EmitElementalReduceWindow( + Cast(hlo), + operand_to_generator.at(hlo->operand(0)), + operand_to_generator.at(hlo->operand(1)), index); + }; + case HloOpcode::kReduce: + return [this, hlo, &operand_to_generator](const IrArray::Index& index) { + auto reduce_instr = Cast(hlo); + std::vector input_generators; + for (const HloInstruction* instr : reduce_instr->inputs()) { + input_generators.push_back(operand_to_generator.at(instr)); + } + + std::vector initial_value_generators; + for (const HloInstruction* instr : reduce_instr->init_values()) { + initial_value_generators.push_back(operand_to_generator.at(instr)); + } + return EmitElementalReduce(reduce_instr, std::move(input_generators), + std::move(initial_value_generators), index); + }; default: return [hlo](const IrArray::Index& index) { return Unimplemented("Unhandled opcode for elemental IR emission: %s", @@ -2451,4 +2473,204 @@ llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op, return complex; } +StatusOr ElementalIrEmitter::EmitElementalReduceWindow( + const HloReduceWindowInstruction* reduce_window, + const llvm_ir::ElementGenerator& input_generator, + const llvm_ir::ElementGenerator& initial_value_generator, + const llvm_ir::IrArray::Index& index) { + // Pseudocode: + // for each index I in output + // value = init_value + // for each index W in window + // for each dimension i from 0 to rank - 1 + // (input index I)[i] = O[i] * stride[i] + W[i] - pad_low[i] + // if I in bounds of input + // value = function(value, input[I]) + // output[O] = value + const HloInstruction* operand = reduce_window->operand(0); + const Window& window = reduce_window->window(); + + PrimitiveType operand_element_type = operand->shape().element_type(); + llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), + "reduce_window_accum_ptr", b_); + { + TF_ASSIGN_OR_RETURN( + llvm::Value* const init_value, + initial_value_generator(llvm_ir::IrArray::Index(index.GetType()))); + Store(init_value, accum_ptr); + } + + llvm::Type* index_type = index.GetType(); + auto index_typed_const = [&](uint64 c) -> llvm::Constant* { + return index.GetConstantWithIndexType(c); + }; + + llvm_ir::ForLoopNest loops(IrName(reduce_window), b_, index_type); + std::vector window_size; + for (const auto& dim : window.dimensions()) { + window_size.push_back(dim.size()); + } + const IrArray::Index window_index = loops.AddLoopsForShape( + ShapeUtil::MakeShape(operand_element_type, window_size), "window"); + CHECK_EQ(window_index.size(), index.size()); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_); + + std::vector input_multi_index(index.size()); + llvm::Value* in_bounds = b_->getInt1(true); + for (size_t i = 0; 
i < index.size(); ++i) { + llvm::Value* stridden_index = + NSWMul(index[i], index_typed_const(window.dimensions(i).stride())); + input_multi_index[i] = NSWSub( + NSWAdd( + stridden_index, + NSWMul(window_index[i], + index_typed_const(window.dimensions(i).window_dilation()))), + index_typed_const(window.dimensions(i).padding_low())); + + // We need to verify that we are not in the dilated base area. + llvm::Value* dilation_condition = + ICmpEQ(SRem(input_multi_index[i], + index_typed_const(window.dimensions(i).base_dilation())), + index_typed_const(0)); + in_bounds = And(in_bounds, dilation_condition); + + // Apply base dilation to the index. + input_multi_index[i] = + SDiv(input_multi_index[i], + index_typed_const(window.dimensions(i).base_dilation())); + + // We must check whether 0 <= input_multi_index[i] < bound, as + // otherwise we are in the pad and so can skip the computation. This + // comparison is equivalent to the unsigned comparison + // input_multi_index[i] < bound, as a negative value wraps to a large + // positive value. + in_bounds = And(in_bounds, + ICmpULT(input_multi_index[i], + index_typed_const(operand->shape().dimensions(i)))); + } + + llvm_ir::LlvmIfData if_data = + llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_); + SetToFirstInsertPoint(if_data.true_block, b_); + + // We are not in pad, so do the computation. + IrArray::Index input_index(input_multi_index, operand->shape(), index_type); + TF_ASSIGN_OR_RETURN(llvm::Value * input_value, input_generator(input_index)); + TF_ASSIGN_OR_RETURN( + std::vector accum_values, + EmitThreadLocalCall(*reduce_window->to_apply(), + {Load(accum_ptr), input_value}, "reducer_function")); + CHECK_EQ(accum_values.size(), 1); + Store(accum_values[0], accum_ptr); + + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_); + return Load(accum_ptr); +} + +StatusOr ElementalIrEmitter::EmitElementalReduce( + const HloReduceInstruction* reduce, + std::vector input_generators, + std::vector initial_value_generators, + const llvm_ir::IrArray::Index& index) { + const Shape& out_shape = reduce->shape(); + bool is_variadic = !out_shape.IsArray(); + int accumulators_count = 1; + if (is_variadic) { + CHECK(out_shape.IsTuple()); + accumulators_count = out_shape.tuple_shapes_size(); + } + + absl::Span reduced_dimensions(reduce->dimensions()); + + std::vector accumulator_addrs; + std::vector accumulator_types; + llvm::Type* index_type = index.GetType(); + for (int i = 0; i < accumulators_count; i++) { + const Shape& element_shape = + is_variadic ? out_shape.tuple_shapes(i) : out_shape; + PrimitiveType accumulator_type = element_shape.element_type(); + llvm::Type* accumulator_llvm_type = + llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); + accumulator_types.push_back(accumulator_llvm_type); + + // Initialize an accumulator with init_value. + llvm::AllocaInst* accumulator_addr = llvm_ir::EmitAllocaAtFunctionEntry( + accumulator_llvm_type, "accumulator_" + std::to_string(i), b()); + TF_ASSIGN_OR_RETURN( + llvm::Value* const init_value, + initial_value_generators[i](llvm_ir::IrArray::Index(index_type))); + Store(init_value, accumulator_addr); + accumulator_addrs.push_back(accumulator_addr); + } + + // The enclosing loops go over all the target elements. Now we have to compute + // the actual target element. For this, we build a new loop nest to iterate + // over all the reduction dimensions in the argument. 
+ // AddLoopsForShapeOnDimensions will return an Index where induction Value*s + // are placed for each dimension in dimensions, and all the rest are nullptrs. + llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), b(), index_type); + const HloInstruction* arg = reduce->operand(0); + std::vector input_multi_index = + loops.AddLoopsForShapeOnDimensions(arg->shape(), reduced_dimensions, + "reduction_dim"); + + SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b()); + + // Build a full index for the input argument, using input_multi_index as the + // base. In input_multi_index only the reduction dimensions are filled in. We + // fill in the rest of the dimensions with induction Value*s taken from + // 'index' which iterates over the target array. See the high-level + // description in the XLA documentation for details. + auto it = index.begin(); + + for (auto& i : input_multi_index) { + if (i == nullptr) { + i = *it++; + } + } + CHECK(index.end() == it); + llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), + index_type); + + std::vector reduction_operands; + for (llvm::Value* accum : accumulator_addrs) { + llvm::Value* accum_value = Load(accum); + reduction_operands.push_back(accum_value); + } + + for (int i = 0; i < accumulators_count; i++) { + TF_ASSIGN_OR_RETURN(llvm::Value* const input_element, + input_generators[i](input_index)); + reduction_operands.push_back(input_element); + } + + TF_ASSIGN_OR_RETURN( + std::vector results, + EmitThreadLocalCall(*reduce->to_apply(), reduction_operands, + "reduce_function")); + + CHECK(results.size() == accumulators_count); + for (int i = 0; i < accumulators_count; i++) { + Store(results[i], accumulator_addrs[i]); + } + SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b()); + + if (is_variadic) { + // Emit a structure, as that what the LoopEmitter expects. + llvm::Value* returned_structure = llvm::UndefValue::get( + llvm::StructType::get(b()->getContext(), accumulator_types)); + for (int i = 0; i < accumulators_count; i++) { + llvm::Value* accumulator_value = Load(accumulator_addrs[i]); + returned_structure = + b()->CreateInsertValue(returned_structure, accumulator_value, i); + } + return returned_structure; + } else { + CHECK_EQ(accumulator_addrs.size(), 1); + return Load(accumulator_addrs[0]); + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index 94e8f1d6400..270ec358f5e 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -17,12 +17,16 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_SERVICE_ELEMENTAL_IR_EMITTER_H_ #include +#include +#include "absl/strings/string_view.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/llvm_ir/ir_array.h" #include "tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h" #include "tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h" #include "tensorflow/compiler/xla/statusor.h" @@ -220,6 +224,22 @@ class ElementalIrEmitter : public IrBuilderMixin { const HloToElementGeneratorMap& operand_to_generator, const llvm_ir::IrArray::Index& dot_result_index); + virtual StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view name) = 0; + + StatusOr EmitElementalReduceWindow( + const HloReduceWindowInstruction* reduce_window, + const llvm_ir::ElementGenerator& input_generator, + const llvm_ir::ElementGenerator& initial_value_generator, + const llvm_ir::IrArray::Index& index); + + StatusOr EmitElementalReduce( + const HloReduceInstruction* reduce, + std::vector input_generators, + std::vector initial_value_generators, + const llvm_ir::IrArray::Index& index); + llvm::IRBuilder<>* const b_; llvm::Module* module_; diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index c6df786fb51..0a44cd8cc69 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -323,145 +323,6 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( } return compute_nested_(*hlo->to_apply(), operand_elements); }; - case HloOpcode::kReduceWindow: - // Pseudocode: - // for each index I in output - // value = init_value - // for each index W in window - // for each dimension i from 0 to rank - 1 - // (input index I)[i] = O[i] * stride[i] + W[i] - pad_low[i] - // if I in bounds of input - // value = function(value, input[I]) - // output[O] = value - return [=, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - const HloInstruction* operand = hlo->operand(0); - const Window& window = hlo->window(); - - PrimitiveType operand_element_type = operand->shape().element_type(); - llvm::Value* accum_ptr = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(operand_element_type, module_), - "reduce_window_accum_ptr", b_); - { - TF_ASSIGN_OR_RETURN(llvm::Value * init_value, - operand_to_generator.at(hlo->operand(1))( - IrArray::Index(index.GetType()))); - Store(init_value, accum_ptr); - } - - llvm::Type* index_type = index.GetType(); - auto index_typed_const = [&](uint64 c) -> llvm::Constant* { - return index.GetConstantWithIndexType(c); - }; - - llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type); - std::vector window_size; - for (const auto& dim : window.dimensions()) { - window_size.push_back(dim.size()); - } - const IrArray::Index window_index = loops.AddLoopsForShape( - ShapeUtil::MakeShape(operand_element_type, window_size), "window"); - CHECK_EQ(window_index.size(), index.size()); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_); - - std::vector input_multi_index(index.size()); - llvm::Value* in_bounds = b_->getInt1(true); - for (size_t i = 0; i < index.size(); ++i) { - llvm::Value* stridden_index 
= NSWMul( - index[i], index_typed_const(window.dimensions(i).stride())); - input_multi_index[i] = NSWSub( - NSWAdd(stridden_index, - NSWMul(window_index[i], - index_typed_const( - window.dimensions(i).window_dilation()))), - index_typed_const(window.dimensions(i).padding_low())); - - // We need to verify that we are not in the dilated base area. - llvm::Value* dilation_condition = ICmpEQ( - SRem(input_multi_index[i], - index_typed_const(window.dimensions(i).base_dilation())), - index_typed_const(0)); - in_bounds = And(in_bounds, dilation_condition); - - // Apply base dilation to the index. - input_multi_index[i] = - SDiv(input_multi_index[i], - index_typed_const(window.dimensions(i).base_dilation())); - - // We must check whether 0 <= input_multi_index[i] < bound, as - // otherwise we are in the pad and so can skip the computation. This - // comparison is equivalent to the unsigned comparison - // input_multi_index[i] < bound, as a negative value wraps to a large - // positive value. - in_bounds = - And(in_bounds, - ICmpULT(input_multi_index[i], - index_typed_const(operand->shape().dimensions(i)))); - } - - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", b_); - SetToFirstInsertPoint(if_data.true_block, b_); - - // We are not in pad, so do the computation. - IrArray::Index input_index(input_multi_index, operand->shape(), - index_type); - TF_ASSIGN_OR_RETURN(llvm::Value * input_value, - operand_to_generator.at(operand)(input_index)); - TF_ASSIGN_OR_RETURN( - llvm::Value * accum_value, - compute_nested_(*hlo->to_apply(), {Load(accum_ptr), input_value})); - Store(accum_value, accum_ptr); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b_); - return Load(accum_ptr); - }; - case HloOpcode::kReduce: - // TODO(b/118332391): This should be supported. - CHECK_EQ(hlo->operand_count(), 2) << "Did not expect variadic reduce"; - return [=, &operand_to_generator]( - const IrArray::Index& output_index) -> StatusOr { - const HloInstruction* operand = hlo->operand(0); - llvm::Value* accum_ptr = - b()->CreateAlloca(llvm_ir::PrimitiveTypeToIrType( - hlo->shape().element_type(), module_)); - llvm::Type* index_type = output_index.GetType(); - TF_ASSIGN_OR_RETURN(llvm::Value * init_value, - operand_to_generator.at(hlo->operand(1))( - IrArray::Index(index_type))); - b()->CreateStore(init_value, accum_ptr); - - llvm_ir::ForLoopNest loops(IrName(hlo), b_, index_type); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions( - operand->shape(), hlo->dimensions(), "reduction_dim"); - if (!ShapeUtil::IsScalar(hlo->shape())) { - // Here only input_multi_index[hlo->dimensions()] are non-null, so we - // must set the rest. 
- size_t j = 0; - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = output_index[j++]; - } - } - CHECK_EQ(output_index.size(), j); - } - llvm_ir::IrArray::Index input_index( - input_multi_index, hlo->operand(0)->shape(), index_type); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b()); - TF_ASSIGN_OR_RETURN( - llvm::Value * input_value, - operand_to_generator.at(hlo->operand(0))(input_index)); - TF_ASSIGN_OR_RETURN( - llvm::Value * accum_value, - compute_nested_(*hlo->to_apply(), - {b()->CreateLoad(accum_ptr), input_value})); - b()->CreateStore(accum_value, accum_ptr); - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), b()); - return b()->CreateLoad(accum_ptr); - }; default: return ElementalIrEmitter::MakeElementGenerator(hlo, operand_to_generator); diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index c8a58a21980..c846a6fa939 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -92,6 +92,17 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitComplexAbs(PrimitiveType prim_type, llvm::Value* value) override; + StatusOr> EmitThreadLocalCall( + const HloComputation& callee, absl::Span parameters, + absl::string_view) override { + // TODO(b/118332391): Supported variadic return values. + auto result = compute_nested_(callee, parameters); + if (!result.ok()) { + return result.status(); + } + return std::vector{result.ValueOrDie()}; + } + llvm::Value* EmitThreadId() override; private: From 3ba00b04c16ed75cfa97237adf8089c711f030ea Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 07:02:46 -0700 Subject: [PATCH 0299/1533] Shard a slow test PiperOrigin-RevId: 310904207 Change-Id: Ic3fd8eb46d70482ca283c7bed6869274d8353acc --- tensorflow/compiler/tests/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 5cca1e69f53..515e8aaf06c 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -1386,6 +1386,7 @@ tf_xla_py_test( size = "medium", srcs = ["fused_batchnorm_test.py"], python_version = "PY3", + shard_count = 5, tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], From d32ec0bf0bc2f3910e32526e6b7968e1ede68e21 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 07:46:45 -0700 Subject: [PATCH 0300/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310910059 Change-Id: Id6a6ac5962fe7ebd00947d299e0dbf6272278af5 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. 
Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From d5c1743ddef5ce653645dc85fd7437c044df9e7a Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 11 May 2020 07:47:39 -0700 Subject: [PATCH 0301/1533] [XLA:CPU/GPU] Merge the emission of elemental kMap There's not a lot of duplication here, but no need to have it twice. PiperOrigin-RevId: 310910166 Change-Id: I6dfff87d56f4cc1788344300e826975cc38fe452 --- tensorflow/compiler/xla/service/BUILD | 1 + .../xla/service/cpu/elemental_ir_emitter.cc | 12 --------- .../compiler/xla/service/cpu/ir_emitter.cc | 7 ----- .../compiler/xla/service/cpu/ir_emitter.h | 5 ---- .../xla/service/elemental_ir_emitter.cc | 26 +++++++++++++++++++ .../xla/service/elemental_ir_emitter.h | 5 ++++ .../xla/service/gpu/elemental_ir_emitter.cc | 24 ----------------- .../xla/service/gpu/elemental_ir_emitter.h | 4 --- 8 files changed, 32 insertions(+), 52 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 499c4e25828..3349528ebc2 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -3846,6 +3846,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm-project//llvm:core", "@llvm-project//llvm:transform_utils", ], diff --git a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc index ccd17bb791d..05364a4492b 100644 --- a/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/elemental_ir_emitter.cc @@ -109,18 +109,6 @@ llvm_ir::ElementGenerator CpuElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, const HloToElementGeneratorMap& operand_to_generator) { switch (hlo->opcode()) { - case HloOpcode::kMap: - return [this, hlo, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - std::vector operands; - for (int i = 0; i < hlo->operand_count(); i++) { - TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, - operand_to_generator.at(hlo->operand(i))(index)); - operands.push_back(operand_value); - } - return ir_emitter_->EmitElementalMap(*Cast(hlo), - operands, llvm_ir::IrName(hlo)); - }; case HloOpcode::kConvolution: return [this, hlo, &operand_to_generator](const IrArray::Index& index) { return ir_emitter_->EmitElementalConvolution( diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 78d859cb34a..2b715bfa17a 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -695,13 +695,6 @@ Status IrEmitter::HandleTuple(HloInstruction* tuple) { return Status::OK(); } -llvm::Value* IrEmitter::EmitElementalMap( - const HloMapInstruction& map_instr, - absl::Span elemental_operands, absl::string_view name) { - return EmitScalarReturningThreadLocalCall(*map_instr.to_apply(), - elemental_operands, name); -} - Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { // Pseudo code for reduce window: // diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 
c5e05db40bd..24524c67b11 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -115,11 +115,6 @@ class IrEmitter : public DfsHloVisitorWithDefault, // Emit an LLVM global variable for every constant buffer allocation. Status EmitConstantGlobals(); - // Emit code to map one element according to `map_instr`. - llvm::Value* EmitElementalMap( - const HloMapInstruction& map_instr, - absl::Span elemental_operands, - absl::string_view name); // Emit code to emit the element at `index` for a convolution instruction. StatusOr EmitElementalConvolution( const HloConvolutionInstruction* convolution, diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index b4ea18a8a1e..8cb660de46c 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -2422,6 +2422,21 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( -> StatusOr { return EmitElementalDot(hlo, operand_to_generator, dot_result_index); }; + case HloOpcode::kMap: + return [this, hlo, &operand_to_generator]( + const IrArray::Index& index) -> StatusOr { + std::vector operands; + for (int i = 0; i < hlo->operand_count(); i++) { + TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, + operand_to_generator.at(hlo->operand(i))(index)); + operands.push_back(operand_value); + } + std::vector input_generators; + for (const HloInstruction* instr : hlo->operands()) { + input_generators.push_back(operand_to_generator.at(instr)); + } + return EmitElementalMap(Cast(hlo), operands); + }; case HloOpcode::kReduceWindow: return [this, hlo, &operand_to_generator](const IrArray::Index& index) { return EmitElementalReduceWindow( @@ -2473,6 +2488,17 @@ llvm::Value* ElementalIrEmitter::EmitComposeComplex(const HloInstruction* op, return complex; } +StatusOr ElementalIrEmitter::EmitElementalMap( + const HloMapInstruction* map_instr, + absl::Span elemental_operands) { + TF_ASSIGN_OR_RETURN( + std::vector values, + EmitThreadLocalCall(*map_instr->to_apply(), elemental_operands, + llvm_ir::IrName(map_instr))); + CHECK_EQ(values.size(), 1); + return values[0]; +} + StatusOr ElementalIrEmitter::EmitElementalReduceWindow( const HloReduceWindowInstruction* reduce_window, const llvm_ir::ElementGenerator& input_generator, diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index 270ec358f5e..06a9d7b194c 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include "absl/strings/string_view.h" +#include "absl/types/span.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" @@ -228,6 +229,10 @@ class ElementalIrEmitter : public IrBuilderMixin { const HloComputation& callee, absl::Span parameters, absl::string_view name) = 0; + StatusOr EmitElementalMap( + const HloMapInstruction* map_instr, + absl::Span elemental_operands); + StatusOr EmitElementalReduceWindow( const HloReduceWindowInstruction* reduce_window, const llvm_ir::ElementGenerator& input_generator, diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index 0a44cd8cc69..1be0b1b4e7b 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -305,29 +305,5 @@ llvm::Value* GpuElementalIrEmitter::EmitThreadId() { return NSWAdd(NSWMul(block_id, threads_per_block), thread_id_in_block); } -llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( - const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) { - switch (hlo->opcode()) { - case HloOpcode::kMap: - return [=, &operand_to_generator]( - const IrArray::Index& index) -> StatusOr { - TF_RET_CHECK(!hlo->operands().empty()) - << "Zero operand map not implemented in GPU backend."; - TF_RET_CHECK(hlo->to_apply()->num_parameters() > 0); - std::vector operand_elements; - for (HloInstruction* operand : hlo->operands()) { - TF_ASSIGN_OR_RETURN(llvm::Value * value, - operand_to_generator.at(operand)(index)); - operand_elements.push_back(value); - } - return compute_nested_(*hlo->to_apply(), operand_elements); - }; - default: - return ElementalIrEmitter::MakeElementGenerator(hlo, - operand_to_generator); - } -} - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index c846a6fa939..3c4e9f7c1e6 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -47,10 +47,6 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { llvm::Module* module, llvm::IRBuilder<>* b, NestedComputer compute_nested); - llvm_ir::ElementGenerator MakeElementGenerator( - const HloInstruction* hlo, - const HloToElementGeneratorMap& operand_to_generator) override; - protected: StatusOr EmitFloatBinaryOp(const HloInstruction* op, llvm::Value* lhs_value, From 1631fa86e742f31b8adc1fafbc39bc66f5c9f375 Mon Sep 17 00:00:00 2001 From: Mingxing Tan Date: Mon, 11 May 2020 08:36:12 -0700 Subject: [PATCH 0302/1533] Delete the useless print. PiperOrigin-RevId: 310918371 Change-Id: Ie88d67e1aee503b38049656579edd84b1dccff0b --- tensorflow/core/profiler/internal/tfprof_stats.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/profiler/internal/tfprof_stats.cc b/tensorflow/core/profiler/internal/tfprof_stats.cc index 22b3bdc2042..56e6e2bcba3 100644 --- a/tensorflow/core/profiler/internal/tfprof_stats.cc +++ b/tensorflow/core/profiler/internal/tfprof_stats.cc @@ -58,7 +58,6 @@ TFStats::TFStats(std::unique_ptr graph, ckpt_reader_(std::move(ckpt_reader)) { CHECK(graph) << "Must at least have GraphDef"; - absl::PrintF("Parsing Inputs...\n"); AddGraph(std::move(graph)); if (run_meta && run_meta->has_step_stats()) { AddRunMeta(0, std::move(run_meta)); From 1514dd40257720e1f2d4fd65000c0d2e5ca7d02c Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 11 May 2020 08:42:41 -0700 Subject: [PATCH 0303/1533] Internal change only. PiperOrigin-RevId: 310919530 Change-Id: I001957cc435c019c3611d728c8954cfe610ea6c8 --- tensorflow/core/BUILD | 4 ++-- tensorflow/core/platform/build_config.bzl | 2 ++ tensorflow/core/platform/default/build_config.bzl | 5 ++++- tensorflow/tensorflow.bzl | 4 ++-- 4 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 28ef943f021..7f34bfaa186 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -112,6 +112,7 @@ load( "tf_kernel_tests_linkstatic", "tf_lib_proto_parsing_deps", "tf_portable_deps_no_runtime", + "tf_portable_proto_lib", "tf_proto_library", "tf_proto_library_cc", "tf_protos_all_impl", @@ -1378,10 +1379,9 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - ":protos_all_cc_impl", "//tensorflow/core/util:stats_calculator_portable", "//tensorflow/core:mobile_additional_lib_deps", - ] + tf_portable_deps_no_runtime(), + ] + tf_portable_proto_lib() + tf_portable_deps_no_runtime(), alwayslink = 1, ) diff --git a/tensorflow/core/platform/build_config.bzl b/tensorflow/core/platform/build_config.bzl index f0613cdc069..ab452562245 100644 --- a/tensorflow/core/platform/build_config.bzl +++ b/tensorflow/core/platform/build_config.bzl @@ -26,6 +26,7 @@ load( _tf_platform_alias = "tf_platform_alias", _tf_platform_deps = "tf_platform_deps", _tf_portable_deps_no_runtime = "tf_portable_deps_no_runtime", + _tf_portable_proto_lib = "tf_portable_proto_lib", _tf_proto_library = "tf_proto_library", _tf_proto_library_cc = "tf_proto_library_cc", _tf_proto_library_py = "tf_proto_library_py", @@ -65,6 +66,7 @@ tf_lib_proto_parsing_deps = _tf_lib_proto_parsing_deps tf_logging_deps = _tf_logging_deps tf_platform_alias = _tf_platform_alias tf_platform_deps = _tf_platform_deps +tf_portable_proto_lib = _tf_portable_proto_lib tf_portable_deps_no_runtime = _tf_portable_deps_no_runtime tf_proto_library = _tf_proto_library tf_proto_library_cc = _tf_proto_library_cc diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 18a8285ece1..fd6e78addce 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -727,6 +727,9 @@ def tf_protobuf_deps(): otherwise = [clean_dep("@com_google_protobuf//:protobuf_headers")], ) +def tf_portable_proto_lib(): + return ["//tensorflow/core:protos_all_cc_impl"] + def tf_protobuf_compiler_deps(): return if_static( [ @@ -764,7 +767,7 @@ def tf_portable_deps_no_runtime(): "@nsync//:nsync_cpp", "@com_googlesource_code_re2//:re2", "@farmhash_archive//:farmhash", - ] + tf_protobuf_deps() + ] def tf_google_mobile_srcs_no_runtime(): return [] diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 61d6656ec80..f56330b428a 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -193,10 +193,10 @@ def if_macos(a, otherwise = []): "//conditions:default": otherwise, }) -def if_ios(a): +def if_ios(a, otherwise = []): return select({ clean_dep("//tensorflow:ios"): a, - "//conditions:default": [], + "//conditions:default": otherwise, }) def if_ios_x86_64(a): From a785878c0f533c51f6353d8dbecab8f19331a712 Mon Sep 17 00:00:00 2001 From: Alex Stark Date: Mon, 11 May 2020 09:01:42 -0700 Subject: [PATCH 0304/1533] Build clean up (OVIC). 
PiperOrigin-RevId: 310923311 Change-Id: I93755c4c44dabc91651c7c173806fb44957d6bc9 --- tensorflow/lite/java/ovic/BUILD | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/lite/java/ovic/BUILD b/tensorflow/lite/java/ovic/BUILD index 947fbee1a45..e64bd3036ac 100644 --- a/tensorflow/lite/java/ovic/BUILD +++ b/tensorflow/lite/java/ovic/BUILD @@ -58,7 +58,6 @@ android_library( deps = [ "//tensorflow/lite/java:tensorflowlite", "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper", - "@org_checkerframework_qual", ], ) @@ -75,7 +74,6 @@ java_library( "//tensorflow/lite/java:tensorflowlite_java", "//tensorflow/lite/java/src/main/native", "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper", - "@org_checkerframework_qual", ], ) @@ -114,7 +112,6 @@ android_library( deps = [ "//tensorflow/lite/java:tensorflowlite", "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper", - "@org_checkerframework_qual", ], ) @@ -131,6 +128,5 @@ java_library( "//tensorflow/lite/java:tensorflowlite_java", "//tensorflow/lite/java/src/main/native", "//tensorflow/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper", - "@org_checkerframework_qual", ], ) From fea2279d1091b268df6ce4ff422f330e481efed7 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Mon, 11 May 2020 09:18:02 -0700 Subject: [PATCH 0305/1533] Support MirrorPad in hexagon PiperOrigin-RevId: 310926466 Change-Id: I71ba751663f2229aaa0093c8441c5d017832a3ce --- .../experimental/delegates/hexagon/README.md | 1 + .../delegates/hexagon/builders/BUILD | 2 + .../hexagon/builders/mirror_pad_builder.cc | 112 +++++++++++++++ .../hexagon/builders/mirror_pad_builder.h | 49 +++++++ .../delegates/hexagon/builders/op_builder.cc | 2 + .../delegates/hexagon/builders/op_factory.h | 1 + .../delegates/hexagon/builders/tests/BUILD | 1 + .../hexagon/builders/tests/mirror_pad_test.cc | 127 ++++++++++++++++++ .../experimental/delegates/hexagon/utils.cc | 12 ++ 9 files changed, 307 insertions(+) create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.cc create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.h create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/tests/mirror_pad_test.cc diff --git a/tensorflow/lite/experimental/delegates/hexagon/README.md b/tensorflow/lite/experimental/delegates/hexagon/README.md index 5cf71fdb5bf..07f1a92bdec 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/README.md +++ b/tensorflow/lite/experimental/delegates/hexagon/README.md @@ -80,6 +80,7 @@ are verified in `IsNodeSupportedByHexagon`: * L2Normalization (without any activation) * Logistic (aka Sigmoid) * MaxPool2D (without any activation) (b/129276536) +* MirrorPad * Mul (without any activation) (b/129276536) * Neg * Pad: Only supports 0 padding (b/139277813) diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD index ae8ffe293e9..ff764984de9 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD @@ -19,6 +19,7 @@ cc_library( "hardswish_builder.cc", "l2_normalization_builder.cc", "matmul_builder.cc", + "mirror_pad_builder.cc", "neg_op_builder.cc", "op_builder.cc", "pad_builder.cc", @@ -45,6 +46,7 @@ cc_library( "hardswish_builder.h", "l2_normalization_builder.h", "matmul_builder.h", + "mirror_pad_builder.h", "neg_op_builder.h", 
"op_builder.h", "pad_builder.h", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.cc new file mode 100644 index 00000000000..2a04088f4f3 --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.cc @@ -0,0 +1,112 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.h" + +#include + +#include + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/hexagon_nn.h" +#include "tensorflow/lite/kernels/kernel_util.h" + +namespace tflite { +namespace delegates { +namespace hexagon { +TfLiteStatus MirrorPadOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + TfLiteContext* context) { + static int quant_bound_shape[] = {1, 1, 1, 1}; + int tensor_id; + + // Input data tensor. + tensor_id = inputs->data[0]; + const auto& input_tensor = context->tensors[tensor_id]; + AddInput(graph_builder_->GetHexagonTensorId(tensor_id)); + + // Padding tensor. + // Should be a constant. + tensor_id = inputs->data[1]; + const auto& padding_tensor = context->tensors[tensor_id]; + if (padding_tensor.dims->size != 2 || padding_tensor.dims->data[0] > 4 || + padding_tensor.dims->data[1] != 2) { + TF_LITE_KERNEL_LOG(context, "Invalid padding tensor shape"); + return kTfLiteError; + } + paddings_shape_ = {1, 1, 4, 2}; + std::vector padding_data(8, 0); + // Hexagon always expects padding data for each dimension in order {b, h, w, + // d}. This start value ensures we pad the non-relevant dimensions with 0. + int padding_data_start = 8 - padding_tensor.dims->data[0] * 2; + for (int i = 0; i < padding_tensor.dims->data[0] * 2; ++i) { + padding_data[padding_data_start + i] = padding_tensor.data.i32[i]; + } + auto* const_padding_node = graph_builder_->AddConstNodeWithData( + paddings_shape_.data(), reinterpret_cast(padding_data.data()), + padding_data.size() * sizeof(padding_data[0])); + AddInput(TensorID(const_padding_node->GetID(), 0)); + // Padding type. + const TfLiteMirrorPaddingParams* params = + reinterpret_cast(builtin_data_); + if (params->mode == kTfLiteMirrorPaddingReflect) { + SetPaddingType(NN_PAD_MIRROR_REFLECT); + } else if (params->mode == kTfLiteMirrorPaddingSymmetric) { + SetPaddingType(NN_PAD_MIRROR_SYMMETRIC); + } + + // Min/max values for input tensor. 
+ TF_LITE_ENSURE_STATUS( + ComputeMinAndMaxQuantValues(input_tensor, &input_min_, &input_max_)); + auto* input_min_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&input_min_), + sizeof(input_min_)); + auto* input_max_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&input_max_), + sizeof(input_max_)); + AddInput(TensorID(input_min_const->GetID(), 0)); + AddInput(TensorID(input_max_const->GetID(), 0)); + + // Hexagon outputs for this node. + int output_batch_size, output_height_size, output_width_size, + output_depth_size; + GetDims(&output_batch_size, &output_height_size, &output_width_size, + &output_depth_size, context->tensors[outputs->data[0]].dims); + node_output_ = AddOutput(sizeof(uint8_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + + return kTfLiteOk; +} + +TfLiteStatus MirrorPadOpBuilder::RegisterOutputs(const TfLiteIntArray* outputs, + TfLiteContext* context) { + // Should be only 1 output. + graph_builder_->AddTensorWithID(outputs->data[0], node_output_.first, + node_output_.second); + return kTfLiteOk; +} + +MirrorPadOpBuilder::~MirrorPadOpBuilder() {} + +OpBuilder* CreateMirrorPadBuilder(GraphBuilder* graph_builder, int op_type) { + return new MirrorPadOpBuilder(graph_builder, op_type); +} + +} // namespace hexagon +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.h b/tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.h new file mode 100644 index 00000000000..6fcb2606701 --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/mirror_pad_builder.h @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIRROR_PAD_BUILDER_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIRROR_PAD_BUILDER_H_ + +#include + +#include "tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h" + +namespace tflite { +namespace delegates { +namespace hexagon { + +class MirrorPadOpBuilder : public OpBuilder { + public: + explicit MirrorPadOpBuilder(GraphBuilder* graph_builder, int op_type) + : OpBuilder(graph_builder, op_type) {} + TfLiteStatus PopulateSubGraph(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + TfLiteContext* context) override; + + TfLiteStatus RegisterOutputs(const TfLiteIntArray* outputs, + TfLiteContext* context) override; + + ~MirrorPadOpBuilder() override; + + private: + TensorID node_output_; + float input_min_, input_max_; + std::vector paddings_shape_; +}; + +} // namespace hexagon +} // namespace delegates +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIRROR_PAD_BUILDER_H_ diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc index e20127ac6c1..c7432e64c79 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc @@ -43,6 +43,8 @@ OpBuilder* GraphBuilder::CreateOpBuilderFromTfLiteOp(int op_type) { return CreateReduceBuilder(this, OP_QuantizedSum_8to32); case kTfLiteBuiltinPad: return CreatePadBuilder(this, OP_QuantizedPad_8); + case kTfLiteBuiltinMirrorPad: + return CreateMirrorPadBuilder(this, OP_MirrorPad_8); case kTfLiteBuiltinFullyConnected: return CreateMatMulBuilder(this, OP_QuantizedMatMul_8x8to32); case kTfLiteBuiltinAveragePool2d: diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h b/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h index e7236fb0e00..0beb88cc68e 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h @@ -35,6 +35,7 @@ OpBuilder* CreatePool2DBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateReshapeBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateSoftmaxBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateReduceBuilder(GraphBuilder* graph_builder, int op_type); +OpBuilder* CreateMirrorPadBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreatePadBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateResizeNearestNeighborBuilder(GraphBuilder* graph_builder, int op_type); diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD index b1df59c4098..47a78dca6ac 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD @@ -30,6 +30,7 @@ hexagon_op_tests( "conv_test.cc", "l2_norm_test.cc", "matmul_test.cc", + "mirror_pad_test.cc", "mul_test.cc", "neg_test.cc", "pad_test.cc", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/mirror_pad_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/mirror_pad_test.cc new file mode 100644 index 00000000000..4caf96ac8ce --- /dev/null +++ 
b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/mirror_pad_test.cc @@ -0,0 +1,127 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include "tensorflow/lite/experimental/delegates/hexagon/builders/tests/hexagon_delegate_op_model.h" + +namespace tflite { +using testing::ElementsAreArray; + +template +class MirrorPadOpModel : public SingleOpModelWithHexagon { + public: + MirrorPadOpModel(const TensorData& input, + std::initializer_list paddings_shape, + std::initializer_list paddings, + const TensorData& output, const tflite::MirrorPadMode mode) { + input_id_ = AddInput(input); + padding_matrix_id_ = + AddConstInput(TensorType_INT32, paddings, paddings_shape); + output_id_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MIRROR_PAD, BuiltinOptions_MirrorPadOptions, + CreateMirrorPadOptions(builder_, mode).Union()); + BuildInterpreter({GetShape(input_id_), GetShape(padding_matrix_id_)}); + } + + int input_tensor_id() { return input_id_; } + + std::vector GetOutput() { return ExtractVector(output_id_); } + + protected: + int input_id_; + int padding_matrix_id_; + int output_id_; +}; + +TEST(MirrorPadTest, EmptyPad_UInt8) { + MirrorPadOpModel model( + {TensorType_UINT8, {2, 3}, -1.0, 1.0}, {2, 2}, {0, 0, 0, 0}, + {TensorType_UINT8, {}, -1.0, 1.0}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.ApplyDelegateAndInvoke(); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6})); +} + +TEST(MirrorPadTest, PadBothSides_Symmetric_Int8) { + MirrorPadOpModel model({TensorType_INT8, {2, 3}, -1.0, 1.0}, {2, 2}, + {1, 1, 1, 1}, {TensorType_INT8, {}, -1.0, 1.0}, + tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.ApplyDelegateAndInvoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 1, 2, 3, 3, 1, 1, 2, 3, 3, + 4, 4, 5, 6, 6, 4, 4, 5, 6, 6})); +} + +TEST(MirrorPadTest, PadBothSides_Reflect_UInt8) { + MirrorPadOpModel model( + {TensorType_UINT8, {2, 3}, -1.0, 1.0}, {2, 2}, {1, 1, 1, 1}, + {TensorType_UINT8, {}, -1.0, 1.0}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.ApplyDelegateAndInvoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({5, 4, 5, 6, 5, 2, 1, 2, 3, 2, + 5, 4, 5, 6, 5, 2, 1, 2, 3, 2})); +} + +TEST(MirrorPadTest, PadOneSide_left_Reflect_Int8) { + MirrorPadOpModel model({TensorType_INT8, {2, 3}, -1.0, 1.0}, {2, 2}, + {1, 0, 1, 0}, {TensorType_INT8, {}, -1.0, 1.0}, + tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.ApplyDelegateAndInvoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({5, 4, 5, 6, 2, 1, 2, 3, 5, 4, 5, 6})); +} + +TEST(MirrorPadTest, PadOneSide_right_Symmetric_UInt8) { + MirrorPadOpModel model( + {TensorType_UINT8, {2, 3}, -1.0, 1.0}, {2, 2}, {0, 
1, 0, 1}, + {TensorType_UINT8, {}, -1.0, 1.0}, tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.ApplyDelegateAndInvoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({1, 2, 3, 3, 4, 5, 6, 6, 4, 5, 6, 6})); +} + +TEST(MirrorPadTest, Pad_1D_Reflect_Int8) { + MirrorPadOpModel model({TensorType_INT8, {3}, -1.0, 1.0}, {1, 2}, + {0, 2}, {TensorType_INT8, {}, -1.0, 1.0}, + tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3}); + model.ApplyDelegateAndInvoke(); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 2, 1})); +} + +TEST(MirrorPadTest, Pad_1D_Symmetric_UInt8) { + MirrorPadOpModel model({TensorType_UINT8, {3}, -1.0, 1.0}, {1, 2}, + {0, 2}, {TensorType_UINT8, {}, -1.0, 1.0}, + tflite::MirrorPadMode_SYMMETRIC); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3}); + model.ApplyDelegateAndInvoke(); + EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 3, 2})); +} + +TEST(MirrorPadTest, PadBothSides_Reflect_Whole_UInt8) { + MirrorPadOpModel model( + {TensorType_UINT8, {2, 3}, -1.0, 1.0}, {2, 2}, {1, 1, 2, 2}, + {TensorType_UINT8, {}, -1.0, 1.0}, tflite::MirrorPadMode_REFLECT); + model.PopulateTensor(model.input_tensor_id(), {1, 2, 3, 4, 5, 6}); + model.ApplyDelegateAndInvoke(); + EXPECT_THAT(model.GetOutput(), + ElementsAreArray({6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1, + 6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1})); +} + +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index df7d7424e37..d9d14804b49 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -80,6 +80,7 @@ bool CheckOpVersion(const TfLiteRegistration* registration) { case kTfLiteBuiltinL2Normalization: case kTfLiteBuiltinLogistic: case kTfLiteBuiltinMaxPool2d: + case kTfLiteBuiltinMirrorPad: case kTfLiteBuiltinMul: case kTfLiteBuiltinPad: case kTfLiteBuiltinQuantize: @@ -159,6 +160,17 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, // causes an unexpected shift in dequantized values. return false; } + case kTfLiteBuiltinMirrorPad: { + if (!InputsWithCorrectTypes( + node, context, {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteInt32}}) || + !IsConstantTensor(GetInput(context, node, 1))) + return false; + const TfLiteMirrorPaddingParams* params = + reinterpret_cast( + node->builtin_data); + return params->mode == kTfLiteMirrorPaddingReflect || + params->mode == kTfLiteMirrorPaddingSymmetric; + } case kTfLiteBuiltinPad: { // TODO(b/139277813): Currently we only support padding with the default // of 0. Add support for user-defined constant if required. From 47b75bbbec382facd58062c6d59b8e7c2e1560be Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 11 May 2020 09:22:01 -0700 Subject: [PATCH 0306/1533] Remove the special handling of TRTEngineOp in saved_model loader. 
PiperOrigin-RevId: 310927351 Change-Id: I9307315f0c52019a7e39d13e94efb49858c7f9ce --- tensorflow/python/saved_model/function_deserialization.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py index aeca90bdfd1..b36a1f27456 100644 --- a/tensorflow/python/saved_model/function_deserialization.py +++ b/tensorflow/python/saved_model/function_deserialization.py @@ -332,6 +332,11 @@ def load_function_def_library(library, load_shared_name_suffix=None): functions[fdef.signature.name] = func renamed_functions[func.name] = func + if any(op.type == "TRTEngineOp" for op in func_graph.get_operations()): + # TODO(b/150708051): Remove this hack once TensorRT SavedModel integration + # is fixed. Currently it's leaking memory to maintain bug compatibility + # with previous behavior. + func.add_to_graph(ops.get_default_graph()) return functions From 12f310ccab666d0d659b0bf99b0051ca74fe59c0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 09:46:39 -0700 Subject: [PATCH 0307/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 310932304 Change-Id: Ie1198dc6b08cb1227fb24a4a5984b89841eb96ed --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 90e1344ab2f335e2e2b300110d493e5293ba16d1 Mon Sep 17 00:00:00 2001 From: Stefano Galarraga Date: Mon, 11 May 2020 09:54:40 -0700 Subject: [PATCH 0308/1533] Fix ANEURALNETWORKS_BAD_DATA execution failures with sum/max/min/reduce operations with scalar inputs. 
PiperOrigin-RevId: 310933989 Change-Id: I17db703e275564fb5f2e2f50e41b30096dc5c6f4 --- tensorflow/lite/delegates/nnapi/nnapi_delegate.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index e790d423434..002c29915c6 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -3769,7 +3769,8 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, } } else if (reg->builtin_code == kTfLiteBuiltinMaximum || reg->builtin_code == kTfLiteBuiltinMinimum) { - const TfLiteTensor& operand_tensor = context->tensors[input_pos]; + const TfLiteTensor& operand_tensor = + context->tensors[node->inputs->data[input_pos]]; if (operand_tensor.dims->size == 0) { int tensor_index; @@ -3814,7 +3815,8 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, reg->builtin_code == kTfLiteBuiltinSum) && (input_pos == 1)) { // The axis needs, be converted to a tensor if specified as scalar - const TfLiteTensor& axis_tensor = context->tensors[1]; + const TfLiteTensor& axis_tensor = + context->tensors[node->inputs->data[input_pos]]; if (axis_tensor.dims->size == 0) { TF_LITE_ENSURE_STATUS( builder.AddVectorInt32Operand(axis_tensor.data.i32, 1)); From eb75c470f909daa72fec6f397ba4892d9d178842 Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Mon, 11 May 2020 10:18:23 -0700 Subject: [PATCH 0309/1533] adding version number. --- third_party/mkl_dnn/mkldnn_threadpool.BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/mkl_dnn/mkldnn_threadpool.BUILD b/third_party/mkl_dnn/mkldnn_threadpool.BUILD index 35175b7f90f..7209b8a62d0 100644 --- a/third_party/mkl_dnn/mkldnn_threadpool.BUILD +++ b/third_party/mkl_dnn/mkldnn_threadpool.BUILD @@ -42,7 +42,7 @@ template_rule( out = "include/dnnl_version.h", substitutions = { "@DNNL_VERSION_MAJOR@": "1", - "@DNNL_VERSION_MINOR@": "2", + "@DNNL_VERSION_MINOR@": "4", "@DNNL_VERSION_PATCH@": "0", "@DNNL_VERSION_HASH@": "N/A", }, From 1e5884b2eccc5584df0784ff24e4c4210ae60e6d Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 11 May 2020 10:23:56 -0700 Subject: [PATCH 0310/1533] IWYU in profiler/utils PiperOrigin-RevId: 310940374 Change-Id: Ide3196d2384db3d47997887e6246f36d1abe8cc8 --- tensorflow/core/profiler/utils/BUILD | 26 +++++++++++--- tensorflow/core/profiler/utils/cost_utils.cc | 15 ++++++++ tensorflow/core/profiler/utils/cost_utils.h | 10 +++--- .../core/profiler/utils/derived_timeline.cc | 12 +++++++ .../core/profiler/utils/derived_timeline.h | 6 ++++ .../profiler/utils/derived_timeline_test.cc | 4 ++- tensorflow/core/profiler/utils/errors.cc | 2 ++ tensorflow/core/profiler/utils/event_span.cc | 8 +++++ tensorflow/core/profiler/utils/event_span.h | 3 +- .../core/profiler/utils/group_events.cc | 14 +++++++- tensorflow/core/profiler/utils/group_events.h | 7 ++++ .../core/profiler/utils/group_events_test.cc | 3 ++ .../profiler/utils/hardware_type_utils.cc | 1 + .../core/profiler/utils/kernel_stats_utils.cc | 12 ++++--- .../profiler/utils/op_metrics_db_utils.cc | 11 ++++-- tensorflow/core/profiler/utils/op_utils.cc | 10 ++++-- tensorflow/core/profiler/utils/op_utils.h | 3 -- tensorflow/core/profiler/utils/tf_op_utils.cc | 5 ++- tensorflow/core/profiler/utils/tf_op_utils.h | 2 +- .../core/profiler/utils/tf_op_utils_test.cc | 1 + .../core/profiler/utils/tf_xplane_visitor.h | 1 + 
.../core/profiler/utils/tfstreamz_utils.cc | 36 ++++++++++++------- .../core/profiler/utils/tfstreamz_utils.h | 6 ++-- tensorflow/core/profiler/utils/timespan.h | 3 ++ .../core/profiler/utils/xplane_builder.cc | 10 +++++- .../core/profiler/utils/xplane_builder.h | 21 ++++++----- .../profiler/utils/xplane_builder_test.cc | 4 +++ .../core/profiler/utils/xplane_schema.cc | 3 ++ .../core/profiler/utils/xplane_schema.h | 3 +- .../core/profiler/utils/xplane_utils.cc | 9 +++++ tensorflow/core/profiler/utils/xplane_utils.h | 1 + .../core/profiler/utils/xplane_utils_test.cc | 5 +++ .../core/profiler/utils/xplane_visitor.cc | 9 +++++ .../core/profiler/utils/xplane_visitor.h | 5 ++- 34 files changed, 217 insertions(+), 54 deletions(-) diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index ad26dcc5774..7f72781e54c 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -30,6 +30,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", ], @@ -83,7 +84,6 @@ cc_library( hdrs = ["tf_op_utils.h"], deps = [ "//tensorflow/core:regexp_internal", - "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/strings", ], ) @@ -96,6 +96,7 @@ tf_cc_test( ":tf_op_utils", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", ], ) @@ -156,6 +157,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "@com_google_absl//absl/strings", ], ) @@ -170,7 +172,6 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", - "@com_google_absl//absl/types:span", ], ) @@ -196,7 +197,6 @@ tf_cc_test( name = "xplane_utils_test", srcs = ["xplane_utils_test.cc"], deps = [ - ":time_utils", ":xplane_builder", ":xplane_utils", ":xplane_visitor", @@ -205,6 +205,8 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -232,6 +234,7 @@ cc_library( deps = [ ":xplane_schema", ":xplane_visitor", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", ], ) @@ -243,9 +246,11 @@ cc_library( deps = [ ":tf_op_utils", ":tf_xplane_visitor", + ":xplane_builder", ":xplane_schema", ":xplane_utils", ":xplane_visitor", + "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@com_google_absl//absl/container:flat_hash_map", @@ -263,10 +268,13 @@ tf_cc_test( ":xplane_builder", ":xplane_schema", ":xplane_utils", + ":xplane_visitor", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/types:optional", ], ) @@ -281,10 +289,13 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler/costs:cost_estimator", + "//tensorflow/core/grappler/costs:op_context", "//tensorflow/core/grappler/costs:op_level_cost_estimator", "//tensorflow/core/grappler/costs:op_performance_data_cc", - 
"//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -296,6 +307,7 @@ cc_library( ":group_events", ":tf_op_utils", ":tf_xplane_visitor", + ":time_utils", ":timespan", ":trace_utils", ":xplane_builder", @@ -305,8 +317,10 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -321,6 +335,8 @@ tf_cc_test( ":xplane_builder", ":xplane_schema", ":xplane_utils", + ":xplane_visitor", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", @@ -347,10 +363,10 @@ cc_library( ":xplane_builder", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/framework:protos_all_cc", "//tensorflow/core/profiler/protobuf:tfstreamz_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/core/profiler/utils/cost_utils.cc b/tensorflow/core/profiler/utils/cost_utils.cc index 754aa655af3..a94f09bb79c 100644 --- a/tensorflow/core/profiler/utils/cost_utils.cc +++ b/tensorflow/core/profiler/utils/cost_utils.cc @@ -15,12 +15,27 @@ limitations under the License. #include "tensorflow/core/profiler/utils/cost_utils.h" +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/strings/numbers.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/strings/strip.h" +#include "absl/types/optional.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" +#include "tensorflow/core/grappler/costs/op_context.h" #include "tensorflow/core/grappler/costs/op_performance_data.pb.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/cost_utils.h b/tensorflow/core/profiler/utils/cost_utils.h index f1095556c2b..a778bca5330 100644 --- a/tensorflow/core/profiler/utils/cost_utils.h +++ b/tensorflow/core/profiler/utils/cost_utils.h @@ -15,12 +15,13 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_PROFILER_UTILS_COST_UTILS_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_COST_UTILS_H_ -#include +#include -#include "absl/strings/string_view.h" +#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/grappler/costs/cost_estimator.h" #include "tensorflow/core/grappler/costs/op_level_cost_estimator.h" #include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { @@ -46,7 +47,8 @@ class TfOpRoofLineCostEstimator OpRoofLineStats Predict(const XEventVisitor& event); private: - std::set unsupported_ops_; // summary for unsupported ops. + absl::flat_hash_set + unsupported_ops_; // summary for unsupported ops. TF_DISALLOW_COPY_AND_ASSIGN(TfOpRoofLineCostEstimator); }; diff --git a/tensorflow/core/profiler/utils/derived_timeline.cc b/tensorflow/core/profiler/utils/derived_timeline.cc index c99d8e82cb7..112c0977763 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.cc +++ b/tensorflow/core/profiler/utils/derived_timeline.cc @@ -14,15 +14,27 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/utils/derived_timeline.h" +#include +#include +#include + +#include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" +#include "absl/strings/str_cat.h" #include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/group_events.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" +#include "tensorflow/core/profiler/utils/time_utils.h" #include "tensorflow/core/profiler/utils/timespan.h" #include "tensorflow/core/profiler/utils/trace_utils.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" diff --git a/tensorflow/core/profiler/utils/derived_timeline.h b/tensorflow/core/profiler/utils/derived_timeline.h index 61b62bdc8da..cd4da7996c5 100644 --- a/tensorflow/core/profiler/utils/derived_timeline.h +++ b/tensorflow/core/profiler/utils/derived_timeline.h @@ -15,7 +15,13 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_DERIVED_TIMELINE_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_DERIVED_TIMELINE_H_ +#include +#include + +#include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/group_events.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" diff --git a/tensorflow/core/profiler/utils/derived_timeline_test.cc b/tensorflow/core/profiler/utils/derived_timeline_test.cc index f3e6b66f087..76a0188480a 100644 --- a/tensorflow/core/profiler/utils/derived_timeline_test.cc +++ b/tensorflow/core/profiler/utils/derived_timeline_test.cc @@ -15,8 +15,9 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/derived_timeline.h" -#include "absl/strings/match.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/group_events.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" @@ -24,6 +25,7 @@ limitations under the License. #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/errors.cc b/tensorflow/core/profiler/utils/errors.cc index d829ee06709..9c678e98a43 100644 --- a/tensorflow/core/profiler/utils/errors.cc +++ b/tensorflow/core/profiler/utils/errors.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/core/profiler/utils/errors.h" +#include "absl/strings/string_view.h" + namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/event_span.cc b/tensorflow/core/profiler/utils/event_span.cc index 9768331b88f..5e0413c4ba2 100644 --- a/tensorflow/core/profiler/utils/event_span.cc +++ b/tensorflow/core/profiler/utils/event_span.cc @@ -14,11 +14,19 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/utils/event_span.h" +#include +#include #include +#include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/utils/timespan.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/event_span.h b/tensorflow/core/profiler/utils/event_span.h index 36b31722968..1adc6a75d82 100644 --- a/tensorflow/core/profiler/utils/event_span.h +++ b/tensorflow/core/profiler/utils/event_span.h @@ -16,10 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_EVENT_SPAN_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_EVENT_SPAN_H_ +#include #include #include "absl/container/flat_hash_map.h" -#include "tensorflow/core/platform/logging.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/utils/timespan.h" diff --git a/tensorflow/core/profiler/utils/group_events.cc b/tensorflow/core/profiler/utils/group_events.cc index 60d12c0862d..42961492225 100644 --- a/tensorflow/core/profiler/utils/group_events.cc +++ b/tensorflow/core/profiler/utils/group_events.cc @@ -15,13 +15,25 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/group_events.h" -#include +#include +#include +#include +#include +#include +#include +#include +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" +#include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" diff --git a/tensorflow/core/profiler/utils/group_events.h b/tensorflow/core/profiler/utils/group_events.h index 1140f2dab8d..4b6fc58e3b8 100644 --- a/tensorflow/core/profiler/utils/group_events.h +++ b/tensorflow/core/profiler/utils/group_events.h @@ -16,9 +16,16 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_GROUP_EVENTS_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_GROUP_EVENTS_H_ +#include #include +#include +#include #include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" diff --git a/tensorflow/core/profiler/utils/group_events_test.cc b/tensorflow/core/profiler/utils/group_events_test.cc index 6b6a0d2a19d..11996ba4068 100644 --- a/tensorflow/core/profiler/utils/group_events_test.cc +++ b/tensorflow/core/profiler/utils/group_events_test.cc @@ -16,12 +16,15 @@ limitations under the License. #include "tensorflow/core/profiler/utils/group_events.h" #include "absl/container/flat_hash_map.h" +#include "absl/types/optional.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/hardware_type_utils.cc b/tensorflow/core/profiler/utils/hardware_type_utils.cc index 75896c03851..e2a4004555b 100644 --- a/tensorflow/core/profiler/utils/hardware_type_utils.cc +++ b/tensorflow/core/profiler/utils/hardware_type_utils.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/kernel_stats_utils.cc b/tensorflow/core/profiler/utils/kernel_stats_utils.cc index 14038d5c177..c40c3a89c9c 100644 --- a/tensorflow/core/profiler/utils/kernel_stats_utils.cc +++ b/tensorflow/core/profiler/utils/kernel_stats_utils.cc @@ -15,15 +15,17 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/kernel_stats_utils.h" +#include +#include #include #include #include "absl/strings/match.h" #include "absl/strings/numbers.h" -#include "absl/strings/str_cat.h" #include "absl/strings/str_split.h" #include "absl/strings/string_view.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" namespace tensorflow { @@ -34,15 +36,15 @@ void ParseKernelLaunchParams(absl::string_view xstat_kernel_details, const std::vector params = absl::StrSplit(xstat_kernel_details, absl::ByAnyChar(":\n")); - constexpr uint32_t kNumDimensions = 3; - for (uint32_t dim = 0; dim < kNumDimensions; ++dim) { + constexpr uint32 kNumDimensions = 3; + for (uint32 dim = 0; dim < kNumDimensions; ++dim) { kernel->add_block_dim(1); kernel->add_grid_dim(1); } // Process value pairs. - for (uint32_t ii = 0; ii < params.size(); ii += 2) { - uint32_t value = 0; + for (uint32 ii = 0; ii < params.size(); ii += 2) { + uint32 value = 0; if (params[ii] == "registers_per_thread" && absl::SimpleAtoi(params[ii + 1], &value)) { kernel->set_registers_per_thread(value); diff --git a/tensorflow/core/profiler/utils/op_metrics_db_utils.cc b/tensorflow/core/profiler/utils/op_metrics_db_utils.cc index 06307d6d102..863d2f79819 100644 --- a/tensorflow/core/profiler/utils/op_metrics_db_utils.cc +++ b/tensorflow/core/profiler/utils/op_metrics_db_utils.cc @@ -15,8 +15,13 @@ limitations under the License. #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" +#include +#include + +#include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/utils/math_utils.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" @@ -40,7 +45,7 @@ class DeviceTfOpMetricsDbBuilder : public OpMetricsDbBuilder { /*hlo_module_id=*/0, tf_op_name); if (tf_op_metrics->category().empty()) { tf_op_metrics->set_category( - tf_op_type == kUnknownOp ? "Unknown" : string(tf_op_type)); + tf_op_type == kUnknownOp ? "Unknown" : std::string(tf_op_type)); } tf_op_metrics->set_is_eager(device_op_metrics.is_eager()); // The occurrences of a TF-op is the maximum among the occurrences of all @@ -89,8 +94,8 @@ uint64 IdleTimePs(const OpMetricsDb& metrics_db) { void AddIdleOp(OpMetricsDb* db) { uint64 idle_time_ps = IdleTimePs(*db); OpMetrics* metrics = db->add_metrics_db(); - metrics->set_name(string(kIdle)); - metrics->set_category(string(kIdle)); + metrics->set_name(std::string(kIdle)); + metrics->set_category(std::string(kIdle)); metrics->set_occurrences(0); metrics->set_time_ps(idle_time_ps); metrics->set_self_time_ps(idle_time_ps); diff --git a/tensorflow/core/profiler/utils/op_utils.cc b/tensorflow/core/profiler/utils/op_utils.cc index 74ce13def0a..921e0617902 100644 --- a/tensorflow/core/profiler/utils/op_utils.cc +++ b/tensorflow/core/profiler/utils/op_utils.cc @@ -15,8 +15,14 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/op_utils.h" +#include +#include + +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/utils/tf_op_utils.h" namespace tensorflow { namespace profiler { @@ -69,9 +75,9 @@ void DeviceOpMetricsDbBuilder::EnterOp(uint64 program_id, OpMetrics* op_metrics = LookupOrInsertNewOpMetrics(program_id, name); if (op_metrics->category().empty()) op_metrics->set_category(category == kUnknownOp ? "unknown" - : string(category)); + : std::string(category)); if (op_metrics->provenance().empty()) - op_metrics->set_provenance(string(provenance)); + op_metrics->set_provenance(std::string(provenance)); op_metrics->set_is_eager(op_metrics->is_eager() || is_eager); op_metrics->set_occurrences(op_metrics->occurrences() + occurrences); op_metrics->set_time_ps(op_metrics->time_ps() + time_ps); diff --git a/tensorflow/core/profiler/utils/op_utils.h b/tensorflow/core/profiler/utils/op_utils.h index 8aaa0f4f5c2..f94328d1b8d 100644 --- a/tensorflow/core/profiler/utils/op_utils.h +++ b/tensorflow/core/profiler/utils/op_utils.h @@ -16,13 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_OP_UTILS_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_OP_UTILS_H_ -#include - #include "absl/strings/string_view.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" -#include "tensorflow/core/profiler/utils/tf_op_utils.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/tf_op_utils.cc b/tensorflow/core/profiler/utils/tf_op_utils.cc index 5a4204440a3..630a74c4e47 100644 --- a/tensorflow/core/profiler/utils/tf_op_utils.cc +++ b/tensorflow/core/profiler/utils/tf_op_utils.cc @@ -15,11 +15,14 @@ limitations under the License. #include "tensorflow/core/profiler/utils/tf_op_utils.h" +#include +#include + #include "absl/strings/ascii.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_split.h" -#include "absl/strings/strip.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/regexp.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/utils/tf_op_utils.h b/tensorflow/core/profiler/utils/tf_op_utils.h index d1ac69e2976..b8af9463d51 100644 --- a/tensorflow/core/profiler/utils/tf_op_utils.h +++ b/tensorflow/core/profiler/utils/tf_op_utils.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_TF_OP_UTILS_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_TF_OP_UTILS_H_ +#include #include -#include "absl/base/attributes.h" #include "absl/strings/match.h" #include "absl/strings/string_view.h" diff --git a/tensorflow/core/profiler/utils/tf_op_utils_test.cc b/tensorflow/core/profiler/utils/tf_op_utils_test.cc index fa5169557d1..136dbee2430 100644 --- a/tensorflow/core/profiler/utils/tf_op_utils_test.cc +++ b/tensorflow/core/profiler/utils/tf_op_utils_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/tf_op_utils.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/utils/tf_xplane_visitor.h b/tensorflow/core/profiler/utils/tf_xplane_visitor.h index 33a170f8efd..17a7b94ef92 100644 --- a/tensorflow/core/profiler/utils/tf_xplane_visitor.h +++ b/tensorflow/core/profiler/utils/tf_xplane_visitor.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_TF_XPLANE_VISITOR_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_TF_XPLANE_VISITOR_H_ +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" diff --git a/tensorflow/core/profiler/utils/tfstreamz_utils.cc b/tensorflow/core/profiler/utils/tfstreamz_utils.cc index 493420fef96..f4cbaa84100 100644 --- a/tensorflow/core/profiler/utils/tfstreamz_utils.cc +++ b/tensorflow/core/profiler/utils/tfstreamz_utils.cc @@ -14,36 +14,46 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/utils/tfstreamz_utils.h" +#include #include +#include +#include +#include #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" -#include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "absl/strings/substitute.h" -#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/framework/summary.pb.h" #include "tensorflow/core/lib/monitoring/collected_metrics.h" -#include "tensorflow/core/lib/monitoring/collection_registry.h" -#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/lib/monitoring/metric_def.h" +#include "tensorflow/core/lib/monitoring/types.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/tfstreamz.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" namespace tensorflow { namespace profiler { namespace { -string ConstructXStatName(const string& name, const monitoring::Point& point) { + +std::string ConstructXStatName(absl::string_view name, + const monitoring::Point& point) { if (point.labels.empty()) { - return name; + return std::string(name); } return absl::Substitute( "$0{$1}", name, - absl::StrJoin(point.labels, ", ", - [](string* out, const monitoring::Point::Label& label) { - absl::StrAppend(out, label.name, "=", label.value); - })); + absl::StrJoin( + point.labels, ", ", + [](std::string* out, const monitoring::Point::Label& label) { + absl::StrAppend(out, label.name, "=", label.value); + })); } -string SerializePercentile(const monitoring::Percentiles& percentiles) { +std::string SerializePercentile(const monitoring::Percentiles& percentiles) { tfstreamz::Percentiles output; output.set_unit_of_measure( static_cast(percentiles.unit_of_measure)); @@ -81,11 +91,11 @@ Status SerializeToXPlane(const std::vector& snapshots, xevent.SetEndTimestampNs(snapshot.end_time_ns); auto& metric_descriptor_map = snapshot.metrics->metric_descriptor_map; for (const auto& point_set : snapshot.metrics->point_set_map) { - const string& metric_name = point_set.first; + const std::string& metric_name = point_set.first; // Each metrics have multiple points corresponding to different labels. for (const auto& point : point_set.second->points) { // Generates one KPI metric for each point. 
- string stat_name = ConstructXStatName(metric_name, *point); + std::string stat_name = ConstructXStatName(metric_name, *point); auto* metadata = xplane.GetOrCreateStatMetadata(stat_name); auto it = metric_descriptor_map.find(metric_name); if (it != metric_descriptor_map.end()) { diff --git a/tensorflow/core/profiler/utils/tfstreamz_utils.h b/tensorflow/core/profiler/utils/tfstreamz_utils.h index ae8e4079bcb..1ab21ed1b5e 100644 --- a/tensorflow/core/profiler/utils/tfstreamz_utils.h +++ b/tensorflow/core/profiler/utils/tfstreamz_utils.h @@ -15,11 +15,13 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_TFSTREAMZ_UTILS_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_TFSTREAMZ_UTILS_H_ +#include +#include + #include "tensorflow/core/lib/monitoring/collected_metrics.h" -#include "tensorflow/core/lib/monitoring/collection_registry.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" -#include "tensorflow/core/profiler/utils/xplane_builder.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/timespan.h b/tensorflow/core/profiler/utils/timespan.h index bccbeaa796f..82775af1415 100644 --- a/tensorflow/core/profiler/utils/timespan.h +++ b/tensorflow/core/profiler/utils/timespan.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_TIMESPAN_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_TIMESPAN_H_ +#include +#include + #include "absl/strings/str_cat.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/core/profiler/utils/xplane_builder.cc b/tensorflow/core/profiler/utils/xplane_builder.cc index 9e66a15cc36..f923f3982f4 100644 --- a/tensorflow/core/profiler/utils/xplane_builder.cc +++ b/tensorflow/core/profiler/utils/xplane_builder.cc @@ -14,6 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/utils/xplane_builder.h" +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/time_utils.h" namespace tensorflow { @@ -54,7 +62,7 @@ XEventMetadata* XPlaneBuilder::GetOrCreateEventMetadata( return metadata; } -XEventMetadata* XPlaneBuilder::GetOrCreateEventMetadata(string&& name) { +XEventMetadata* XPlaneBuilder::GetOrCreateEventMetadata(std::string&& name) { XEventMetadata*& metadata = event_metadata_by_name_[name]; if (metadata == nullptr) { metadata = diff --git a/tensorflow/core/profiler/utils/xplane_builder.h b/tensorflow/core/profiler/utils/xplane_builder.h index 803cc7b89c2..b0d743a0caf 100644 --- a/tensorflow/core/profiler/utils/xplane_builder.h +++ b/tensorflow/core/profiler/utils/xplane_builder.h @@ -15,10 +15,15 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_BUILDER_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_BUILDER_H_ +#include + +#include +#include + #include "absl/container/flat_hash_map.h" #include "absl/strings/numbers.h" #include "absl/strings/string_view.h" -#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/time_utils.h" @@ -53,12 +58,12 @@ class XStatsBuilder { void AddStatValue(const XStatMetadata& metadata, absl::string_view value, bool is_bytes = false) { if (is_bytes) { - AddStat(metadata)->set_bytes_value(string(value)); + AddStat(metadata)->set_bytes_value(std::string(value)); } else { - AddStat(metadata)->set_str_value(string(value)); + AddStat(metadata)->set_str_value(std::string(value)); } } - void AddStatValue(const XStatMetadata& metadata, string&& value, + void AddStatValue(const XStatMetadata& metadata, std::string&& value, bool is_bytes = false) { if (is_bytes) { AddStat(metadata)->set_bytes_value(std::move(value)); @@ -160,7 +165,7 @@ class XLineBuilder { int64 NumEvents() { return line_->events_size(); } - void SetName(absl::string_view name) { line_->set_name(string(name)); } + void SetName(absl::string_view name) { line_->set_name(std::string(name)); } void SetNameIfEmpty(absl::string_view name) { if (line_->name().empty()) SetName(name); @@ -205,7 +210,7 @@ class XPlaneBuilder : public XStatsBuilder { int64 Id() { return plane_->id(); } void SetId(int64 id) { plane_->set_id(id); } - void SetName(absl::string_view name) { plane_->set_name(string(name)); } + void SetName(absl::string_view name) { plane_->set_name(std::string(name)); } void ReserveLines(size_t num_lines) { plane_->mutable_lines()->Reserve(num_lines); @@ -222,7 +227,7 @@ class XPlaneBuilder : public XStatsBuilder { XEventMetadata* GetOrCreateEventMetadata(int64 metadata_id); XEventMetadata* GetOrCreateEventMetadata(absl::string_view name); - XEventMetadata* GetOrCreateEventMetadata(string&& name); + XEventMetadata* GetOrCreateEventMetadata(std::string&& name); inline XEventMetadata* GetOrCreateEventMetadata(const char* name) { return GetOrCreateEventMetadata(absl::string_view(name)); } @@ -251,7 +256,7 @@ void XStatsBuilder::AddStat(const XStatMetadata& key, const XStat& stat, if (stat.value_case() == XStat::kRefValue) { const auto& stat_metadata_map = src.stat_metadata(); const auto it = stat_metadata_map.find(stat.ref_value()); - if (ABSL_PREDICT_FALSE(it == stat_metadata_map.end())) { + if (TF_PREDICT_FALSE(it == stat_metadata_map.end())) { // the reference value in stat is not found in XStatMetadata from src. return; } diff --git a/tensorflow/core/profiler/utils/xplane_builder_test.cc b/tensorflow/core/profiler/utils/xplane_builder_test.cc index cb8749703a2..e55e01d8233 100644 --- a/tensorflow/core/profiler/utils/xplane_builder_test.cc +++ b/tensorflow/core/profiler/utils/xplane_builder_test.cc @@ -14,7 +14,11 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/profiler/utils/xplane_builder.h" +#include + +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 51bc4d03810..f8ff31b078a 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -17,7 +17,10 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index 97e54a7fc2f..31ff90155f5 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -16,11 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_SCHEMA_H_ -#include "absl/strings/match.h" #include "absl/strings/string_view.h" #include "absl/types/optional.h" -#include "absl/types/span.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/xplane_utils.cc b/tensorflow/core/profiler/utils/xplane_utils.cc index b2cc1fd46a5..7f5221c5391 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.cc +++ b/tensorflow/core/profiler/utils/xplane_utils.cc @@ -14,12 +14,21 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include +#include +#include +#include + #include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/timespan.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/utils/xplane_utils.h b/tensorflow/core/profiler/utils/xplane_utils.h index 4f0a8b82646..49087c49cd8 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.h +++ b/tensorflow/core/profiler/utils/xplane_utils.h @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" diff --git a/tensorflow/core/profiler/utils/xplane_utils_test.cc b/tensorflow/core/profiler/utils/xplane_utils_test.cc index b9b15b2e8a9..04e06fcb05b 100644 --- a/tensorflow/core/profiler/utils/xplane_utils_test.cc +++ b/tensorflow/core/profiler/utils/xplane_utils_test.cc @@ -15,9 +15,14 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/xplane_utils.h" +#include + #include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" diff --git a/tensorflow/core/profiler/utils/xplane_visitor.cc b/tensorflow/core/profiler/utils/xplane_visitor.cc index ab97271a69a..42068b7c61a 100644 --- a/tensorflow/core/profiler/utils/xplane_visitor.cc +++ b/tensorflow/core/profiler/utils/xplane_visitor.cc @@ -14,7 +14,16 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/utils/xplane_visitor.h" +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/utils/xplane_visitor.h b/tensorflow/core/profiler/utils/xplane_visitor.h index 8cd805c5cdb..4120a2821ca 100644 --- a/tensorflow/core/profiler/utils/xplane_visitor.h +++ b/tensorflow/core/profiler/utils/xplane_visitor.h @@ -15,8 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_VISITOR_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_VISITOR_H_ +#include + #include -#include +#include +#include #include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" From d2568b7da935e93c84e7a8ab8be47e19ca8e35ab Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 11 May 2020 10:29:15 -0700 Subject: [PATCH 0311/1533] Update tfrt hash in tf PiperOrigin-RevId: 310941537 Change-Id: Ifee117aec55fc8008ece037d65e4525a56d215fc --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index ea25bfe6bc7..9a41d947922 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -162,8 +162,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): print("path_prefix was specified to tf_workspace but is no longer used " + "and will be removed in the future.") - TFRT_COMMIT = "0bad623e8d99ace05f7f60e9e7f8b53ec813d66a" - TFRT_SHA256 = "d002429866d2d824a80dcf6c1602a15398412bc01324200d371c55b13b9a4b27" + TFRT_COMMIT = "d8e16ab39a26772027c1846386a9a0fef613e470" + TFRT_SHA256 = "d51a6fecae1319d225cc84614c96370ea3eb4e8824b79b7da6bf0e5caeb8fbdd" TFRT_URLS = [ "http://mirror.tensorflow.org/github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), "https://github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), From 4da3e08cd5d5627352c462ef4ceee59fee36dd4f Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 11 May 2020 10:29:22 -0700 Subject: [PATCH 0312/1533] Fix typo in the test names: UnifedCAPI->UnifiedCAPI PiperOrigin-RevId: 310941575 Change-Id: Ie6b16eb317bd15e98de54652568f25827cf147df --- .../c/eager/c_api_unified_experimental_test.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc 
b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 8c9aa97ea8f..bd99189852e 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -29,7 +29,7 @@ using tensorflow::string; namespace tensorflow { namespace { -TEST(UnifedCAPI, TestBasicEager) { +TEST(UnifiedCAPI, TestBasicEager) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -81,7 +81,7 @@ TEST(UnifedCAPI, TestBasicEager) { TF_DeleteExecutionContext(ctx); } -TEST(UnifedCAPI, TestBasicGraph) { +TEST(UnifiedCAPI, TestBasicGraph) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); @@ -185,7 +185,7 @@ TEST(UnifedCAPI, TestBasicGraph) { TF_DeleteExecutionContext(eager_execution_ctx); } -TEST(UnifedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { +TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -201,7 +201,7 @@ TEST(UnifedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { TF_DeleteExecutionContext(ctx); } -TEST(UnifedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { +TEST(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); @@ -222,7 +222,7 @@ TEST(UnifedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { +TEST(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); @@ -243,7 +243,7 @@ TEST(UnifedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifedCAPI, TestExecutingEagerOpInGraphModeRaises) { +TEST(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { // Build an Eager context. 
std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -289,7 +289,7 @@ TEST(UnifedCAPI, TestExecutingEagerOpInGraphModeRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifedCAPI, TestExecutingGraphOpInEagerModeRaises) { +TEST(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); From 0975574df38cecd6f5643d0c188342cef96b463e Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Mon, 11 May 2020 10:46:01 -0700 Subject: [PATCH 0313/1533] Minor changes --- tensorflow/core/kernels/conv_2d_gpu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/conv_2d_gpu.h b/tensorflow/core/kernels/conv_2d_gpu.h index 90d85e6f04e..297016160ad 100644 --- a/tensorflow/core/kernels/conv_2d_gpu.h +++ b/tensorflow/core/kernels/conv_2d_gpu.h @@ -210,7 +210,7 @@ __global__ void ShuffleInTensor3Simple(int nthreads, } } -constexpr int kUnroll = 4; +static constexpr int kUnroll = 4; template __global__ void ShuffleInTensor3SimpleVector(int nthreads, @@ -246,7 +246,7 @@ __global__ void ShuffleInTensor3SimpleVector(int nthreads, *out = *reinterpret_cast(buf); } - for(; output_index < nthreads; output_index++) { + for (; output_index < nthreads; ++output_index) { Index<3> output_tensor_index = FlatToTensorIndex(output_index, output_dims); Index<3> input_tensor_index; From 764e3a790eea85cbf8e275ef504c76335a3236f0 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 11 May 2020 17:44:32 +0000 Subject: [PATCH 0314/1533] Add uint32/uint64 support for tf.tile This PR tries to address the issue raised in 39405 where there is no uint32/uint64 support for tf.tile. The related kernel impl for uint32 and uint64 has been added in this PR. This PR fixes 39405 Signed-off-by: Yong Tang --- tensorflow/core/kernels/BUILD | 2 ++ .../core/kernels/tile_functor_cpu_uint32.cc | 29 +++++++++++++++++++ .../core/kernels/tile_functor_cpu_uint64.cc | 29 +++++++++++++++++++ tensorflow/core/kernels/tile_ops.cc | 6 ++++ 4 files changed, 66 insertions(+) create mode 100644 tensorflow/core/kernels/tile_functor_cpu_uint32.cc create mode 100644 tensorflow/core/kernels/tile_functor_cpu_uint64.cc diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 5f85fe99018..4a1b9318f29 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1337,6 +1337,8 @@ tf_kernel_library( "tile_functor_cpu_int32.cc", "tile_functor_cpu_int64.cc", "tile_functor_cpu_int8.cc", + "tile_functor_cpu_uint32.cc", + "tile_functor_cpu_uint64.cc", "tile_functor_cpu_tstring.cc", "tile_functor_cpu_uint8.cc", "tile_functor_sycl.cc", diff --git a/tensorflow/core/kernels/tile_functor_cpu_uint32.cc b/tensorflow/core/kernels/tile_functor_cpu_uint32.cc new file mode 100644 index 00000000000..4dd44eeea0f --- /dev/null +++ b/tensorflow/core/kernels/tile_functor_cpu_uint32.cc @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/tile_functor_cpu.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template struct Tile; +template struct Tile; + +} // end namespace functor +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/tile_functor_cpu_uint64.cc b/tensorflow/core/kernels/tile_functor_cpu_uint64.cc new file mode 100644 index 00000000000..ec1eb7b0946 --- /dev/null +++ b/tensorflow/core/kernels/tile_functor_cpu_uint64.cc @@ -0,0 +1,29 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/tile_functor_cpu.h" + +namespace tensorflow { +namespace functor { + +typedef Eigen::ThreadPoolDevice CPUDevice; + +template struct Tile; +template struct Tile; + +} // end namespace functor +} // end namespace tensorflow diff --git a/tensorflow/core/kernels/tile_ops.cc b/tensorflow/core/kernels/tile_ops.cc index cd047ed9d4a..75c34fb1bf7 100644 --- a/tensorflow/core/kernels/tile_ops.cc +++ b/tensorflow/core/kernels/tile_ops.cc @@ -139,6 +139,8 @@ TF_CALL_uint8(DECLARE_TYPE); TF_CALL_int32(DECLARE_TYPE); TF_CALL_int16(DECLARE_TYPE); TF_CALL_int64(DECLARE_TYPE); +TF_CALL_uint32(DECLARE_TYPE); +TF_CALL_uint64(DECLARE_TYPE); TF_CALL_half(DECLARE_TYPE); TF_CALL_complex64(DECLARE_TYPE); TF_CALL_complex128(DECLARE_TYPE); @@ -240,6 +242,8 @@ class TileOp : public OpKernel { TF_CALL_int32(HANDLE_TYPE_NAME); TF_CALL_int16(HANDLE_TYPE_NAME); TF_CALL_int64(HANDLE_TYPE_NAME); + TF_CALL_uint32(HANDLE_TYPE_NAME); + TF_CALL_uint64(HANDLE_TYPE_NAME); TF_CALL_half(HANDLE_TYPE_NAME); TF_CALL_tstring(HANDLE_TYPE_NAME); // when DEVICE=CPUDevice. 
TF_CALL_complex64(HANDLE_TYPE_NAME); @@ -319,6 +323,8 @@ TF_CALL_int8(HANDLE_TYPE_NAME_CPU); TF_CALL_int32(HANDLE_TYPE_NAME_CPU); TF_CALL_int16(HANDLE_TYPE_NAME_CPU); TF_CALL_int64(HANDLE_TYPE_NAME_CPU); +TF_CALL_uint32(HANDLE_TYPE_NAME_CPU); +TF_CALL_uint64(HANDLE_TYPE_NAME_CPU); TF_CALL_half(HANDLE_TYPE_NAME_CPU); TF_CALL_complex64(HANDLE_TYPE_NAME_CPU); TF_CALL_complex128(HANDLE_TYPE_NAME_CPU); From c65b6f9356d9232f1edd5be4aafe5b8f377a6fd9 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 11 May 2020 16:48:53 +0000 Subject: [PATCH 0315/1533] Add test case for uint32/uint64 support of tf.tile Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/shape_ops_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py index 7dde89c9818..6c2f2e236f2 100644 --- a/tensorflow/python/kernel_tests/shape_ops_test.py +++ b/tensorflow/python/kernel_tests/shape_ops_test.py @@ -500,6 +500,8 @@ class TileTest(test.TestCase, parameterized.TestCase): "int16": (dtypes.int16, int), "int32": (dtypes.int32, int), "int64": (dtypes.int64, int), + "uint32": (dtypes.uint32, int), + "uint64": (dtypes.uint64, int), bytes: (dtypes.string, bytes) } for dtype_np, (dtype_tf, cast) in types_to_test.items(): From f4de9245768c8935b158ce3ccb4cff654101c31d Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Mon, 11 May 2020 10:53:05 -0700 Subject: [PATCH 0316/1533] Export the `keras.preprocessing.image.smart_resize` utility. PiperOrigin-RevId: 310947004 Change-Id: I518259ac4e88bd82301dc9aeccd868aca8a83e11 --- tensorflow/python/keras/preprocessing/image.py | 3 ++- .../api/golden/v2/tensorflow.keras.preprocessing.image.pbtxt | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py index 3af573fa036..aa4801504cc 100644 --- a/tensorflow/python/keras/preprocessing/image.py +++ b/tensorflow/python/keras/preprocessing/image.py @@ -49,6 +49,7 @@ random_brightness = image.random_brightness apply_affine_transform = image.apply_affine_transform +@keras_export('keras.preprocessing.image.smart_resize', v1=[]) def smart_resize(x, size, interpolation='bilinear'): """Resize images to a target size without aspect ratio distortion. @@ -65,7 +66,7 @@ def smart_resize(x, size, interpolation='bilinear'): ``` However, if you do this, you distort the aspect ratio of your images, since - in general they do not all have the same aspect ratio. This is + in general they do not all have the same aspect ratio as `size`. This is fine in many cases, but not always (e.g. for GANs this can be a problem). 
Note that passing the argument `preserve_aspect_ratio=True` to `resize` diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.pbtxt index 0b49aa9f3d4..e59c78cc496 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.preprocessing.image.pbtxt @@ -68,4 +68,8 @@ tf_module { name: "save_img" argspec: "args=[\'path\', \'x\', \'data_format\', \'file_format\', \'scale\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'True\'], " } + member_method { + name: "smart_resize" + argspec: "args=[\'x\', \'size\', \'interpolation\'], varargs=None, keywords=None, defaults=[\'bilinear\'], " + } } From e4939f779e05d564ff8107312fb4ebadba14a026 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 10:57:33 -0700 Subject: [PATCH 0317/1533] legalize operation 'tf.AllToAll'. Legalization target is xla_hlo.all_to_all. PiperOrigin-RevId: 310947977 Change-Id: Ib906830860821b590f1bc55e497b4ffbb193e1da --- .../mlir/tensorflow/ir/tf_generated_ops.td | 38 +++++++++++++++++++ .../compiler/mlir/xla/tests/legalize-tf.mlir | 15 ++++++++ .../xla/transforms/legalize_tf_patterns.td | 7 ++++ 3 files changed, 60 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index a33b339220f..07ee70dafa3 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -192,6 +192,44 @@ retained with length 1. let verifier = [{ return Verify(*this); }]; } +def TF_AllToAllOp : TF_Op<"AllToAll", [NoSideEffect]> { + let summary = "An Op to exchange data across TPU replicas."; + + let description = [{ +On each replica, the input is split into `split_count` blocks along +`split_dimension` and send to the other replicas given group_assignment. After +receiving `split_count` - 1 blocks from other replicas, we concatenate the +blocks along `concat_dimension` as the output. 
+ +For example, suppose there are 2 TPU replicas: +replica 0 receives input: `[[A, B]]` +replica 1 receives input: `[[C, D]]` + +group_assignment=`[[0, 1]]` +concat_dimension=0 +split_dimension=1 +split_count=2 + +replica 0's output: `[[A], [C]]` +replica 1's output: `[[B], [D]]` + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + I32Tensor:$group_assignment, + + I64Attr:$concat_dimension, + I64Attr:$split_dimension, + I64Attr:$split_count + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I1, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_AngleOp : TF_Op<"Angle", [NoSideEffect, SameOperandsAndResultShape]> { let summary = "Returns the argument of a complex number."; diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index aef9b17d3db..61bf5de18cb 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -4096,6 +4096,21 @@ func @xla_dynamic_update_slice2(%arg0: tensor<4xf32>, %arg1: tensor<2xf32>, %arg return %0 : tensor<4xf32> } +//===----------------------------------------------------------------------===// +// AllToAll op legalizations. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: func @alltoall_basic +func @alltoall_basic(%input: tensor<10xf32>) -> tensor<10xf32> { + %group_assignment = "tf.Const" () { + value = dense<[[0, 2, 4, 6], [1, 3, 5, 7], [3, 5, 6, 8]]> : tensor<3x4xi32> + } : () -> tensor<3x4xi32> + %result = "tf.AllToAll"(%input, %group_assignment) {T = f32, concat_dimension = 1 : i64, split_count = 2 : i64, split_dimension = 0 : i64} : (tensor<10xf32>, tensor<3x4xi32>) -> tensor<10xf32> + // CHECK: xla_hlo.all_to_all + // CHECK-SAME: replica_groups = dense<{{\[}}[0, 2, 4, 6], [1, 3, 5, 7], [3, 5, 6, 8]]> : tensor<3x4xi64> + return %result : tensor<10xf32> +} + //===----------------------------------------------------------------------===// // Cumsum op legalizations. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index d53dbdcaaeb..036bbf65bc6 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -273,6 +273,13 @@ def : Pat<(TF_CrossReplicaSumOp $input, (TF_ConstOp $group_assignment)), (HLO_CrossReplicaSumOp $input, (CastElementsToI64Elements $group_assignment))>; +//===----------------------------------------------------------------------===// +// All2All op patterns. +//===----------------------------------------------------------------------===// + +def : Pat<(TF_AllToAllOp AnyRankedTensor:$input, (TF_ConstOp $group_assignment), I64Attr:$concat_dimension, $split_dimension, $split_count), + (HLO_AllToAllOp $input, $split_dimension, $concat_dimension, $split_count, (CastElementsToI64Elements $group_assignment))>; + //===----------------------------------------------------------------------===// // FFT op patterns. 
//===----------------------------------------------------------------------===// From 04d51a661f1b6ded834f0ba39aeae5e6eb00dd3d Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 11 May 2020 11:08:51 -0700 Subject: [PATCH 0318/1533] bump notebook versions PiperOrigin-RevId: 310950572 Change-Id: Iab721a49a009b66f636aa9a337a1667d0f26c4ab --- .../lite/g3doc/tutorials/model_maker_image_classification.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb b/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb index ee46795f5c8..464a5d1b5ef 100644 --- a/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb +++ b/tensorflow/lite/g3doc/tutorials/model_maker_image_classification.ipynb @@ -49,7 +49,7 @@ "metadata": { "colab_type": "text", "id": "nDABAblytltI" - }, + }, "source": [ "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n", " \u003ctd\u003e\n", From 7ac062b354fcf6ab29b6e2e539641cb3cca25815 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 11:42:05 -0700 Subject: [PATCH 0319/1533] [Profiler] Add optimization advices for tf-function. PiperOrigin-RevId: 310957738 Change-Id: I97988752045bb4037098ebd6f4c63f0a65b2cf64 --- tensorflow/core/profiler/convert/BUILD | 4 ++ .../op_stats_to_input_pipeline_analysis.cc | 5 +- .../convert/op_stats_to_overview_page.cc | 47 ++++++++++++++++--- .../convert/op_stats_to_overview_page.h | 9 ++++ .../convert/xplane_to_tf_functions.cc | 21 +++++++++ .../convert/xplane_to_tf_functions_test.cc | 8 ++++ .../profiler/protobuf/overview_page.proto | 3 ++ .../core/profiler/protobuf/tf_function.proto | 3 ++ tensorflow/core/profiler/utils/BUILD | 8 ++++ tensorflow/core/profiler/utils/html_utils.h | 36 ++++++++++++++ 10 files changed, 133 insertions(+), 11 deletions(-) create mode 100644 tensorflow/core/profiler/utils/html_utils.h diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 15a1ad03be3..e033dbb1ba6 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -92,7 +92,9 @@ cc_library( "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:overview_page_proto_cc", + "//tensorflow/core/profiler/protobuf:tf_function_proto_cc", "//tensorflow/core/profiler/utils:errors", + "//tensorflow/core/profiler/utils:html_utils", "//tensorflow/core/profiler/utils:math_utils", "//tensorflow/core/profiler/utils:op_metrics_db_utils", "//tensorflow/core/profiler/utils:time_utils", @@ -118,6 +120,7 @@ cc_library( "//tensorflow/core/profiler/protobuf:steps_db_proto_cc", "//tensorflow/core/profiler/utils:errors", "//tensorflow/core/profiler/utils:event_span", + "//tensorflow/core/profiler/utils:html_utils", "//tensorflow/core/profiler/utils:math_utils", "//tensorflow/core/profiler/utils:tf_op_utils", "//tensorflow/core/profiler/utils:time_utils", @@ -390,6 +393,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core/profiler/protobuf:tf_function_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "//tensorflow/core/profiler/utils:math_utils", "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:timespan", "//tensorflow/core/profiler/utils:xplane_schema", diff --git a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc 
b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc index ca2a6c28875..14b6cebc424 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc @@ -38,6 +38,7 @@ limitations under the License. #include "tensorflow/core/profiler/protobuf/steps_db.pb.h" #include "tensorflow/core/profiler/utils/errors.h" #include "tensorflow/core/profiler/utils/event_span.h" +#include "tensorflow/core/profiler/utils/html_utils.h" #include "tensorflow/core/profiler/utils/math_utils.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" #include "tensorflow/core/profiler/utils/time_utils.h" @@ -327,10 +328,6 @@ InputOpDetails ConvertOpMetricsToInputOpDetails(const OpMetrics& op_metrics, return details; } -string AnchorElement(absl::string_view url, absl::string_view text) { - return absl::StrCat("", text, ""); -} - // Returns the ratio of the host-to-device time in each step to the step-time. double RatioOfHostToDeviceTimeToStepTime( const OpMetricsDb& host_tf_metrics_db, diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc index e19690a6606..69f076f0296 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -30,7 +30,9 @@ limitations under the License. #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/overview_page.pb.h" +#include "tensorflow/core/profiler/protobuf/tf_function.pb.h" #include "tensorflow/core/profiler/utils/errors.h" +#include "tensorflow/core/profiler/utils/html_utils.h" #include "tensorflow/core/profiler/utils/math_utils.h" #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" #include "tensorflow/core/profiler/utils/time_utils.h" @@ -44,16 +46,17 @@ namespace { // statement of suggestion will be made. constexpr double kLowPrecisionPercentThreshold = 10; +struct TfFunctionInfo { + absl::string_view function_name; + double expensive_call_percent; +}; + OverviewPageTip MakeOverviewPageTip(const string& text) { OverviewPageTip tip; tip.set_link(text); return tip; } -string AnchorElement(const string& url, const string& text) { - return absl::StrCat("", text, ""); -} - // Makes a recommendation for looking up a document. // doc_url is expected to be already be escaped suitably for use in an HTML // attribute. 
@@ -125,10 +128,12 @@ void SetCommonRecommendation(const string& input_classification, const string& input_statement, const string& output_statement, HardwareType hardware_type, + const string& tf_function_statement_html, OverviewPageRecommendation* re) { re->set_bottleneck(input_classification); re->set_statement(input_statement); re->set_output_statement(output_statement); + re->set_tf_function_statement_html(tf_function_statement_html); ComputeHostTips(re); ComputeDeviceTips(hardware_type, re); ComputeDocumentationTips(re); @@ -245,6 +250,33 @@ OverviewPageRunEnvironment ComputeRunEnvironment( return re; } +std::string TfFunctionRecommendationHtml(const TfFunctionDb& tf_function_db) { + std::vector candidates; + for (const auto& name_fun : tf_function_db.tf_functions()) { + const auto& fun = name_fun.second; + if (fun.expensive_call_percent() >= kTfFunctionReportThresholdInPercent) { + candidates.push_back({name_fun.first, fun.expensive_call_percent()}); + } + } + if (candidates.empty()) return ""; + auto cmp = [](const TfFunctionInfo& a, const TfFunctionInfo& b) { + return a.expensive_call_percent > b.expensive_call_percent; + }; + // Sorts candidates in descending order of expensive_call_percent. + absl::c_sort(candidates, cmp); + std::string expensive_functions = ""; + auto num_functions_shown = std::min(3ul, candidates.size()); + for (auto i = 0; i < num_functions_shown; i++) { + if (i > 0) absl::StrAppend(&expensive_functions, ", "); + absl::StrAppend(&expensive_functions, "\"", candidates[i].function_name, + "\""); + } + if (candidates.size() > num_functions_shown) + absl::StrAppend(&expensive_functions, " and more"); + return absl::StrCat("Expensive tf-functions detected (", expensive_functions, + ") due to either retracing or eager execution."); +} + OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, HardwareType hardware_type) { OverviewPage overview_page; @@ -258,9 +290,10 @@ OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, overview_page.input_analysis().step_details()); *overview_page.mutable_recommendation() = ComputeGenericRecommendation( bottleneck, op_stats.device_op_metrics_db().precision_stats()); - SetCommonRecommendation(bottleneck.input_classification(), - bottleneck.input_statement(), "", hardware_type, - overview_page.mutable_recommendation()); + SetCommonRecommendation( + bottleneck.input_classification(), bottleneck.input_statement(), "", + hardware_type, TfFunctionRecommendationHtml(op_stats.tf_function_db()), + overview_page.mutable_recommendation()); return overview_page; } diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.h b/tensorflow/core/profiler/convert/op_stats_to_overview_page.h index e6d12708e9f..ba04f1b41b7 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.h +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.h @@ -29,10 +29,16 @@ limitations under the License. namespace tensorflow { namespace profiler { +// Reports tf-function optimization opportunity in the Overview Page if the +// expensive-call-time percentage is over this threshold for at least one of +// the tf-functions profiled. 
+const double kTfFunctionReportThresholdInPercent = 20; + void SetCommonRecommendation(const string& input_classification, const string& input_statement, const string& output_statement, HardwareType hardware_type, + const string& tf_function_statement_html, OverviewPageRecommendation* re); OverviewPageRecommendation ComputeGenericRecommendation( @@ -47,6 +53,9 @@ OverviewPageRunEnvironment ComputeRunEnvironment( OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, HardwareType hardware_type); +// Returns a html which provides tf-function related recommendation. +std::string TfFunctionRecommendationHtml(const TfFunctionDb& tf_function_db); + void SetRemarks(const OpStats& op_stats, OverviewPageAnalysis* analysis); } // namespace profiler diff --git a/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc b/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc index f768d3b7ae6..c49a0750c10 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc +++ b/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/math_utils.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/timespan.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" @@ -54,6 +55,21 @@ std::pair Decode( DCHECK(false); } +double ComputeExpensiveCallPercent(const TfFunction& tf_function) { + // Computes the expensiveness in terms of time (rather than count). + uint64 total_call_time_ps = 0; + uint64 expensive_call_time_ps = 0; + for (const auto& mode_metrics : tf_function.metrics()) { + const auto mode = mode_metrics.first; + const auto& metrics = mode_metrics.second; + total_call_time_ps += metrics.self_time_ps(); + if (mode == TRACED_MODE || mode == EAGER_MODE) { + expensive_call_time_ps += metrics.self_time_ps(); + } + } + return SafeDivide(100.0 * expensive_call_time_ps, total_call_time_ps); +} + // Each invocation of a tf-function creates an ActivationRecord. struct ActivationRecord { std::string function_name; // name of the tf-function. @@ -133,6 +149,7 @@ void CombineTfFunction(const TfFunction& src, TfFunction* dst) { CombineTfFunctionMetrics(src_metrics, dst_metrics); } } + dst->set_expensive_call_percent(ComputeExpensiveCallPercent(*dst)); } // Execution history of all tf-functions invoked. 
@@ -210,6 +227,10 @@ class TfFunctionExecutions { metrics->set_count(metrics->count() + 1); metrics->set_self_time_ps(metrics->self_time_ps() + self_time_ps); } + for (auto& name_fun : *result.mutable_tf_functions()) { + TfFunction& fun = name_fun.second; + fun.set_expensive_call_percent(ComputeExpensiveCallPercent(fun)); + } return result; } diff --git a/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc b/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc index 253ef1a74f9..3ac23fc82f9 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc @@ -33,6 +33,8 @@ const absl::string_view kTracedXla = "traced-xla"; const absl::string_view kNotTracedNonXla = "notTraced-nonXla"; const absl::string_view kNotTracedXla = "notTraced-xla"; +constexpr double kMaxError = 0.001; + TfFunctionDb ConvertXSpaceToTfFunctionDb(const XSpace& space) { TfFunctionDb result; const XPlane* host_plane = FindPlaneWithName(space, kHostThreads); @@ -75,6 +77,8 @@ TEST(ConvertXPlaneToTfFunctions, CombineTwoThreads) { tf_function_db.tf_functions().at(kFunctionName); EXPECT_EQ(tf_function.total_tracing_count(), 4); EXPECT_EQ(tf_function.compiler(), MIXED_COMPILER); + EXPECT_NEAR(tf_function.expensive_call_percent(), 90, kMaxError); + const auto& metrics = tf_function.metrics(); EXPECT_EQ(metrics.size(), 2); EXPECT_EQ(metrics.count(TRACED_MODE), 1); @@ -108,6 +112,7 @@ TEST(ConvertXPlaneToTfFunctions, NestedFunctions) { tf_function_db.tf_functions().at(kOuterFunctionName); EXPECT_EQ(outer.total_tracing_count(), 1); EXPECT_EQ(outer.compiler(), OTHER_COMPILER); + EXPECT_NEAR(outer.expensive_call_percent(), 100, kMaxError); const auto& outer_metrics = outer.metrics(); EXPECT_EQ(outer_metrics.size(), 1); EXPECT_EQ(outer_metrics.count(TRACED_MODE), 1); @@ -118,6 +123,7 @@ TEST(ConvertXPlaneToTfFunctions, NestedFunctions) { tf_function_db.tf_functions().at(kInnerFunctionName); EXPECT_EQ(inner.total_tracing_count(), 0); EXPECT_EQ(inner.compiler(), XLA_COMPILER); + EXPECT_NEAR(inner.expensive_call_percent(), 0, kMaxError); const auto& inner_metrics = inner.metrics(); EXPECT_EQ(inner_metrics.size(), 1); EXPECT_EQ(inner_metrics.count(NOT_TRACED_MODE), 1); @@ -148,6 +154,7 @@ TEST(ConvertXPlaneToTfFunctions, EagerPlusConcrete) { tf_function_db.tf_functions().at(kEagerFunctionName); EXPECT_EQ(eager.total_tracing_count(), 0); EXPECT_EQ(eager.compiler(), INVALID_COMPILER); + EXPECT_NEAR(eager.expensive_call_percent(), 100, kMaxError); const auto& eager_metrics = eager.metrics(); EXPECT_EQ(eager_metrics.size(), 1); EXPECT_EQ(eager_metrics.count(EAGER_MODE), 1); @@ -158,6 +165,7 @@ TEST(ConvertXPlaneToTfFunctions, EagerPlusConcrete) { tf_function_db.tf_functions().at(kConcreteFunctionName); EXPECT_EQ(concrete.total_tracing_count(), 0); EXPECT_EQ(concrete.compiler(), INVALID_COMPILER); + EXPECT_NEAR(concrete.expensive_call_percent(), 0, kMaxError); const auto& concrete_metrics = concrete.metrics(); EXPECT_EQ(concrete_metrics.size(), 1); EXPECT_EQ(concrete_metrics.count(CONCRETE_MODE), 1); diff --git a/tensorflow/core/profiler/protobuf/overview_page.proto b/tensorflow/core/profiler/protobuf/overview_page.proto index 8c83dbd0871..018aa759cc5 100644 --- a/tensorflow/core/profiler/protobuf/overview_page.proto +++ b/tensorflow/core/profiler/protobuf/overview_page.proto @@ -84,6 +84,9 @@ message OverviewPageRecommendation { // A statement for output that recommends the next steps for investigating the // bottleneck. 
 string output_statement = 9;
+  // A statement that recommends the next steps for investigating tf-function
+  // related bottleneck (it is a html so that it can link to other tools/docs.
+  string tf_function_statement_html = 10;
   // A list of tips for improving host performance.
   repeated OverviewPageTip host_tips = 3;
   // A list of tips for improving device performance.
diff --git a/tensorflow/core/profiler/protobuf/tf_function.proto b/tensorflow/core/profiler/protobuf/tf_function.proto
index fe07c00c8d3..1f5e1530475 100644
--- a/tensorflow/core/profiler/protobuf/tf_function.proto
+++ b/tensorflow/core/profiler/protobuf/tf_function.proto
@@ -49,6 +49,9 @@ message TfFunction {
   int64 total_tracing_count = 2;
   // Compiler used to compile this function.
   TfFunctionCompiler compiler = 3;
+  // Percentage of time spent in the expensive calls to this function in the
+  // profiled period.
+  double expensive_call_percent = 4;
 }

 // Statistics for all tf-functions.
diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD
index 7f72781e54c..ca20236d63b 100644
--- a/tensorflow/core/profiler/utils/BUILD
+++ b/tensorflow/core/profiler/utils/BUILD
@@ -51,6 +51,14 @@ cc_library(
     hdrs = ["math_utils.h"],
 )

+cc_library(
+    name = "html_utils",
+    hdrs = ["html_utils.h"],
+    deps = [
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 cc_library(
     name = "op_metrics_db_utils",
     srcs = ["op_metrics_db_utils.cc"],
diff --git a/tensorflow/core/profiler/utils/html_utils.h b/tensorflow/core/profiler/utils/html_utils.h
new file mode 100644
index 00000000000..215d9f51d5b
--- /dev/null
+++ b/tensorflow/core/profiler/utils/html_utils.h
@@ -0,0 +1,36 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PROFILER_UTILS_HTML_UTILS_H_
+#define TENSORFLOW_CORE_PROFILER_UTILS_HTML_UTILS_H_
+
+#include <string>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+
+namespace tensorflow {
+namespace profiler {
+
+// Creates a html that links to the given url with the given text.
+inline std::string AnchorElement(absl::string_view url,
+                                 absl::string_view text) {
+  return absl::StrCat("<a href=\"", url, "\" target=\"_blank\">", text, "</a>");
+}
+
+}  // namespace profiler
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PROFILER_UTILS_HTML_UTILS_H_
From 73b3a13b8864aaac09e5741eb317348174e37fec Mon Sep 17 00:00:00 2001
From: Smit Hinsu
Date: Mon, 11 May 2020 11:42:17 -0700
Subject: [PATCH 0320/1533] Refactor ConvertToTensorProto to avoid some duplication

Share implementation to populate tensors unless the element type requires special handling.
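For illustration only, a minimal self-contained sketch of the shared-helper idea: the helper names, the RepeatedField stand-in, and the example values below are assumptions made for this sketch, not the actual TensorFlow code; the real signatures are in the diff that follows.

// Sketch only: one templated helper appends values of any plain element type
// to the matching repeated field, so only special types (e.g. half/bfloat16)
// need dedicated conversion paths.
#include <cstdint>
#include <iostream>
#include <vector>

// Stand-in for proto2's RepeatedField<T>, assumed for this sketch.
template <typename T>
using RepeatedField = std::vector<T>;

// Shared implementation reused for float, double, int64_t, bool, ...
template <typename T>
void ConvertElements(const std::vector<T>& values, RepeatedField<T>* output) {
  for (const T& value : values) output->push_back(value);
}

// Special case: 16-bit float payloads are stored bit-cast into an int field.
void ConvertHalfElements(const std::vector<uint16_t>& bits,
                         RepeatedField<int32_t>* output) {
  for (uint16_t b : bits) output->push_back(static_cast<int32_t>(b));
}

int main() {
  RepeatedField<double> double_val;
  ConvertElements<double>({1.0, 2.5, -3.0}, &double_val);
  std::cout << double_val.size() << " doubles converted\n";  // prints "3 doubles converted"
  return 0;
}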
Rolls forward wrongly reverted commit PiperOrigin-RevId: 310957792 Change-Id: I59608881e859b8c26045206073cab6462bf1ef7b --- .../mlir/tensorflow/utils/convert_tensor.cc | 213 +++++++----------- 1 file changed, 81 insertions(+), 132 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index fcfef565952..b492945fe8b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/tstring.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -207,12 +208,11 @@ mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type) { // Converts an MLIR dense string elements attribute to a TensorFlow tensor // proto. -Status ConvertStringElementsAttr(const DenseStringElementsAttr attr, - TensorProto* output_tensor) { - for (const auto& val : attr.getRawStringData()) { - output_tensor->add_string_val(val.data(), val.size()); - } - return Status::OK(); +void ConvertStringElementsAttr( + const DenseStringElementsAttr attr, + protobuf::RepeatedPtrField* output) { + for (const auto& val : attr.getRawStringData()) + output->Add({val.data(), val.size()}); } // Converts an MLIR opaque elements attribute to a TensorFlow tensor proto. @@ -226,139 +226,80 @@ Status ConvertOpaqueElementsAttr(const ElementsAttr attr, return InvalidArgument("Unexpected elements attribute type from MLIR."); } -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the double_val field updated. -Status ConvertDoubleElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_double_val(elts.getSplatValue()); - } else { - for (auto value : elts.getValues()) - output_tensor->add_double_val(value); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the float_val field updated. -Status ConvertFloatElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_float_val(elts.getSplatValue()); - } else { - for (auto value : elts.getValues()) - output_tensor->add_float_val(value); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the half_val field updated. -Status ConvertHalfElementsAttr(const ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_half_val( - (*elts.begin()).bitcastToAPInt().getSExtValue()); - } else { - for (const auto& value : elts.getFloatValues()) - output_tensor->add_half_val(value.bitcastToAPInt().getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the int_val field updated. 
-Status ConvertIntElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_int_val((*elts.begin()).getSExtValue()); - } else { - for (const auto& val : elts) - output_tensor->add_int_val(val.getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -Status ConvertBfloat16ElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - auto elts = attr.dyn_cast(); - if (!elts) { - return ConvertOpaqueElementsAttr(attr, output_tensor); - } - - // Bfloat16 is internally represented as `double` in MLIR. - if (elts.isSplat()) { - double v = elts.getSplatValue(); - bfloat16 bf16_val = static_cast(v); - output_tensor->add_half_val(absl::bit_cast(bf16_val)); +// Converts an MLIR elements attribute and adds it to specified repeated field. +template +void ConvertElementsAttr(const mlir::DenseElementsAttr attr, + protobuf::RepeatedField* output) { + if (attr.isSplat()) { + output->Add(attr.getSplatValue()); } else { - for (auto v : elts.getValues()) { + for (auto value : attr.getValues()) output->Add(value); + } +} + +// Converts an MLIR elements attribute containing half values and adds it to +// specified repeated field. +void ConvertHalfElementsAttr(const DenseFPElementsAttr attr, + protobuf::RepeatedField* output_tensor) { + if (attr.isSplat()) { + output_tensor->Add((*attr.begin()).bitcastToAPInt().getSExtValue()); + } else { + for (const llvm::APFloat value : attr.getFloatValues()) + output_tensor->Add(value.bitcastToAPInt().getSExtValue()); + } +} + +// Converts an MLIR elements attribute containing int values and adds it to +// specified repeated field. +void ConvertIntElementsAttr(const mlir::DenseIntElementsAttr attr, + protobuf::RepeatedField* output) { + if (attr.isSplat()) { + output->Add((*attr.begin()).getSExtValue()); + } else { + for (const llvm::APInt val : attr) output->Add(val.getSExtValue()); + } +} + +void ConvertBfloat16ElementsAttr(const mlir::DenseFPElementsAttr attr, + protobuf::RepeatedField* output) { + // Bfloat16 is internally represented as `double` in MLIR. + if (attr.isSplat()) { + double v = attr.getSplatValue(); + bfloat16 bf16_val = static_cast(v); + output->Add(absl::bit_cast(bf16_val)); + } else { + for (auto v : attr.getValues()) { bfloat16 bf16_val = static_cast(v); - output_tensor->add_half_val(absl::bit_cast(bf16_val)); + output->Add(absl::bit_cast(bf16_val)); } } - - return Status::OK(); } -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with the int64_val field updated. -Status ConvertInt64ElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - if (elts.isSplat()) { - output_tensor->add_int64_val((*elts.begin()).getSExtValue()); - } else { - for (const auto& val : elts) - output_tensor->add_int64_val(val.getSExtValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -// Converts an MLIR elements attribute to a TensorFlow tensor proto -// with bool_val field updated. 
-Status ConvertBoolElementsAttr(const mlir::ElementsAttr attr, - TensorProto* output_tensor) { - if (auto elts = attr.dyn_cast()) { - for (const auto& val : elts) { - output_tensor->add_bool_val(val.getBoolValue()); - } - return Status::OK(); - } - return ConvertOpaqueElementsAttr(attr, output_tensor); -} - -Status ConvertToTensorProto(const ElementsAttr attr, - TensorProto* output_tensor) { +Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { auto type = attr.getType(); auto shape = type.getShape(); DataType output_dtype; TF_RETURN_IF_ERROR(ConvertToDataType(type, &output_dtype)); - output_tensor->set_dtype(output_dtype); - ConvertToTensorShapeProto(shape, output_tensor->mutable_tensor_shape()); + output->set_dtype(output_dtype); + ConvertToTensorShapeProto(shape, output->mutable_tensor_shape()); + + if (attr.isa()) + return ConvertOpaqueElementsAttr(attr.cast(), output); + + auto dense_attr = attr.dyn_cast(); + if (!dense_attr) return errors::InvalidArgument("Unsupported elements attr"); switch (output_dtype) { case DT_FLOAT: - return ConvertFloatElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_float_val()); + break; case DT_HALF: - // Handles both DenseFPElementsAttr and OpaqueElementsAttr. - return ConvertHalfElementsAttr(attr, output_tensor); + ConvertHalfElementsAttr(dense_attr.cast(), + output->mutable_half_val()); + break; case DT_DOUBLE: - return ConvertDoubleElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_double_val()); + break; case DT_QUINT8: case DT_UINT8: case DT_INT8: @@ -366,20 +307,28 @@ Status ConvertToTensorProto(const ElementsAttr attr, case DT_UINT16: case DT_INT16: case DT_INT32: - return ConvertIntElementsAttr(attr, output_tensor); + ConvertIntElementsAttr(dense_attr.cast(), + output->mutable_int_val()); + break; case DT_INT64: - return ConvertInt64ElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_int64_val()); + break; case DT_BOOL: - return ConvertBoolElementsAttr(attr, output_tensor); + ConvertElementsAttr(dense_attr, output->mutable_bool_val()); + break; case DT_BFLOAT16: - return ConvertBfloat16ElementsAttr(attr, output_tensor); + ConvertBfloat16ElementsAttr(dense_attr.cast(), + output->mutable_half_val()); + break; case DT_STRING: - return ConvertStringElementsAttr(attr.cast(), - output_tensor); + ConvertStringElementsAttr(dense_attr.cast(), + output->mutable_string_val()); + break; default: - return ConvertOpaqueElementsAttr(attr.cast(), - output_tensor); + return errors::Unimplemented(absl::StrCat("Unimplemented data type ", + DataTypeString(output_dtype))); } + return Status::OK(); } Status ConvertToTensor(const mlir::ElementsAttr attr, Tensor* output_tensor) { From 74226e113a3a75eeb5febc75c248955115f4ff6f Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 11 May 2020 11:49:52 -0700 Subject: [PATCH 0321/1533] Added Pad3DAttributes attributes and function for shape calculation with Pad3DAttributes. 
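As a quick illustration of the shape rule this change adds, a minimal standalone sketch follows; the plain structs stand in for the real BHWDC and Pad3DAttributes types from the GPU delegate, and the example values are made up.

// Standalone sketch: every output dimension is input + prepended + appended padding.
#include <iostream>

struct Shape5D { int b, h, w, d, c; };          // stand-in for BHWDC
struct Pad3D { Shape5D prepended, appended; };  // stand-in for Pad3DAttributes

Shape5D CalculateOutputShape(const Shape5D& input, const Pad3D& attr) {
  return {attr.prepended.b + attr.appended.b + input.b,
          attr.prepended.h + attr.appended.h + input.h,
          attr.prepended.w + attr.appended.w + input.w,
          attr.prepended.d + attr.appended.d + input.d,
          attr.prepended.c + attr.appended.c + input.c};
}

int main() {
  Shape5D input{1, 4, 4, 4, 8};                  // batch, height, width, depth, channels
  Pad3D attr{{0, 1, 1, 1, 0}, {0, 1, 1, 1, 0}};  // pad 1 voxel on each spatial side
  Shape5D out = CalculateOutputShape(input, attr);
  std::cout << out.b << "x" << out.h << "x" << out.w << "x" << out.d << "x"
            << out.c << "\n";                    // prints "1x6x6x6x8"
  return 0;
}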
PiperOrigin-RevId: 310959402 Change-Id: I88c67fd20bb78cef6fe12ddc059ae17d343e8160 --- tensorflow/lite/delegates/gpu/common/operations.cc | 8 ++++++++ tensorflow/lite/delegates/gpu/common/operations.h | 11 +++++++++++ 2 files changed, 19 insertions(+) diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc index 3924f91f952..bdcf6f605cc 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.cc +++ b/tensorflow/lite/delegates/gpu/common/operations.cc @@ -506,6 +506,14 @@ BHWC CalculateOutputShape(const BHWC& input, const PadAttributes& attr) { attr.appended.c + attr.prepended.c + input.c); } +BHWDC CalculateOutputShape(const BHWDC& input, const Pad3DAttributes& attr) { + return BHWDC(attr.appended.b + attr.prepended.b + input.b, + attr.appended.h + attr.prepended.h + input.h, + attr.appended.w + attr.prepended.w + input.w, + attr.appended.d + attr.prepended.d + input.d, + attr.appended.c + attr.prepended.c + input.c); +} + BHWC CalculateOutputShape(const BHWC& input, const FullyConnectedAttributes& attr) { return BHWC(input.b, 1, 1, attr.weights.shape.o); diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h index f8bfc77f610..d0268eee585 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.h +++ b/tensorflow/lite/delegates/gpu/common/operations.h @@ -431,6 +431,17 @@ struct PadAttributes { // @return shape of a tensor after Pad operation is applied to the given input. BHWC CalculateOutputShape(const BHWC& input, const PadAttributes& attr); +struct Pad3DAttributes { + PaddingContentType type = PaddingContentType::ZEROS; + + BHWDC prepended; + BHWDC appended; +}; + +// @return shape of a tensor after Pad3D operation is applied to the given +// input. +BHWDC CalculateOutputShape(const BHWDC& input, const Pad3DAttributes& attr); + struct ConstTensorAttributes { Tensor tensor; }; From 4727d0180fb693fb7cfd70b372b606752f8efa45 Mon Sep 17 00:00:00 2001 From: Revan Sopher Date: Mon, 11 May 2020 11:50:00 -0700 Subject: [PATCH 0322/1533] Split tensorflow.python.tpu module doctests into different target. These TPU tests do not yet run in OSS. PiperOrigin-RevId: 310959419 Change-Id: I2a1662e52f25da9c4c58c018c83729dc6da9008d --- tensorflow/python/tpu/tpu_embedding.py | 8 ++++---- tensorflow/tools/docs/BUILD | 24 ++++++++++++++++++++++++ tensorflow/tools/docs/tf_doctest.py | 6 ++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/tpu/tpu_embedding.py b/tensorflow/python/tpu/tpu_embedding.py index fa07a929acc..d1848f34502 100644 --- a/tensorflow/python/tpu/tpu_embedding.py +++ b/tensorflow/python/tpu/tpu_embedding.py @@ -828,7 +828,7 @@ class TPUEmbedding(object): ... end_learning_rate=0.0) >>> wordpiece_table_config = TableConfig( ... vocabulary_size=119547, - ... dimension=768, + ... dimension=256, ... learning_rate_fn=learning_rate_fn) >>> wordpiece_feature_config = FeatureConfig( ... table_id='bert/embeddings/word_embeddings', @@ -846,11 +846,11 @@ class TPUEmbedding(object): ... batch_size=128, ... mode=TRAINING, ... optimization_parameters=optimization_parameters, - ... device_config=DeviceConfig( - ... num_cores=64, num_hosts=4, job_name='tpu_worker')) + ... master='') >>> with tf.Graph().as_default(): ... init_tpu_op = tf.compat.v1.tpu.initialize_system( - ... embedding_config=tpu_embedding.config_proto, job='tpu_worker') + ... embedding_config=tpu_embedding.config_proto) + ... 
tf.compat.v1.Session().run(init_tpu_op) """ # TODO(shizhiw): Consider adding a field to FeatureConfig that indicates that diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD index c092f21addb..49083affb45 100644 --- a/tensorflow/tools/docs/BUILD +++ b/tensorflow/tools/docs/BUILD @@ -2,6 +2,7 @@ # Doc generator load("//tensorflow:tensorflow.bzl", "py_test") +load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") package( default_visibility = ["//tensorflow:__subpackages__"], @@ -22,6 +23,7 @@ py_library( py_test( name = "tf_doctest", srcs = ["tf_doctest.py"], + args = ["--module_prefix_skip=tpu."], python_version = "PY3", tags = [ "no_oss_py2", @@ -40,6 +42,28 @@ py_test( ], ) +tpu_py_test( + name = "tf_doctest_tpu", + srcs = ["tf_doctest.py"], + args = ["--module=tpu."], + disable_experimental = True, + disable_v3 = True, + main = "tf_doctest.py", + python_version = "PY3", + tags = [ + "no_oss", + "noasan", + "nomsan", + "notsan", + ], + deps = [ + ":tf_doctest_lib", + "//tensorflow:tensorflow_py", + "//tensorflow/python/keras/preprocessing", + "//third_party/py/numpy", + ], +) + py_test( name = "tf_doctest_test", srcs = ["tf_doctest_test.py"], diff --git a/tensorflow/tools/docs/tf_doctest.py b/tensorflow/tools/docs/tf_doctest.py index 19624659e37..f694b83782e 100644 --- a/tensorflow/tools/docs/tf_doctest.py +++ b/tensorflow/tools/docs/tf_doctest.py @@ -43,6 +43,8 @@ tf.keras.preprocessing = preprocessing FLAGS = flags.FLAGS flags.DEFINE_string('module', None, 'A specific module to run doctest on.') +flags.DEFINE_list('module_prefix_skip', [], + 'A list of modules to ignore when resolving modules.') flags.DEFINE_boolean('list', None, 'List all the modules in the core package imported.') flags.DEFINE_string('file', None, 'A specific file to run doctest on.') @@ -50,6 +52,7 @@ flags.DEFINE_string('file', None, 'A specific file to run doctest on.') flags.mark_flags_as_mutual_exclusive(['module', 'file']) flags.mark_flags_as_mutual_exclusive(['list', 'file']) +# Both --module and --module_prefix_skip are relative to PACKAGE. PACKAGE = 'tensorflow.python.' @@ -140,6 +143,9 @@ def load_tests(unused_loader, tests, unused_ignore): tf_modules = get_module_and_inject_docstring(FLAGS.file) for module in tf_modules: + if any(module.__name__.startswith(PACKAGE + prefix) + for prefix in FLAGS.module_prefix_skip): + continue testcase = TfTestCase() tests.addTests( doctest.DocTestSuite( From fb9c5cf3358c3462addfa83d5509207ba2d9a67a Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Mon, 11 May 2020 11:51:11 -0700 Subject: [PATCH 0323/1533] Build _TPUCompileMlir and TPUCompileSucceededAssert ops outside of parallel_execute region. Parallel_execute op may be added by outside compilation. The compile and compile succeed ops should be lifted from the parallel_execute op. 
PiperOrigin-RevId: 310959666 Change-Id: Ic9be8ee33699b130e710cce2a391902ad9c979be --- .../mlir/tensorflow/tests/tpu_rewrite.mlir | 35 +++++++++++++++++++ .../tensorflow/transforms/tpu_rewrite_pass.cc | 13 ++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index af0119dab8f..b8a48bbb379 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -1222,6 +1222,41 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- +// Tests simple case of `tf_device.cluster_func` on TPU with replication and parallel_execute. + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { + // CHECK-LABEL: func @replicated_parallel_tpu_cluster_func + func @replicated_parallel_tpu_cluster_func(%arg0: tensor) -> tensor { + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + // CHECK: "tf._TPUCompileMlir" + // CHECK: "tf.TPUCompileSucceededAssert" + // CHECK: "tf_device.parallel_execute" + // CHECK: "tf.TPUExecute" + %3 = "tf_device.parallel_execute"() ( { + "tf.D"() : () -> () + tf_device.return + }, { + %4 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor + + tf_device.return %4 : tensor + }) : () -> (tensor) + tf_device.return %3 : tensor + } + %2 = "tf.C"(%1#1) : (tensor) -> tensor + return %2 : tensor + } + + func @tpu0_func(%arg0: tensor) -> tensor { + %0 = "tf.B"(%arg0) : (tensor) -> tensor + return %0 : tensor + } +} + +// ----- + // Tests devices are set properly for non replicated model parallelism. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"]} { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 98ff0de7645..ae1bdff97e0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -92,7 +92,7 @@ constexpr char kBadArrayAttrLengthMsg[] = // // Would become following ops (unimportant attributes, types are omitted): // %1 = "tf.Shape"(%0) -// %2:2 = "tf.MLIRCompileToTPU"(%1) {module = ""} +// %2:2 = "tf._TPUCompileMlir"(%1) {module = ""} // "tf.TPUCompileSucceededAssert"(%2#0) // %3 = "tf.TPUExecute"(%0, %2#1) // %4 = "tf.SomeOp"(%3) @@ -687,6 +687,16 @@ LogicalResult Rewrite( // Create compile op. 
auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); builder->setInsertionPoint(cluster_func); + + // Create the TPUCompileMlir and TPUCompileSucceededAssert outside of + // parallel_execute region if it exists. + if (llvm::isa(cluster_func.getParentOp())) { + // Currently, outside compilation and model parallelism are not supported + // together. + assert(num_cores_per_replica == 1); + builder->setInsertionPoint(cluster_func.getParentOp()); + } + Operation* compile_op = BuildCompileOp( cluster_func, num_replicas, num_cores_per_replica, tpu_device_assignment.compilation_device, @@ -712,6 +722,7 @@ LogicalResult Rewrite( num_cores_per_replica, cluster_func, &output_shardings); if (failed(result)) return failure(); + builder->setInsertionPoint(cluster_func); if (num_cores_per_replica > 1) { // For model parallelism, tf_device.parallel_execute is used to express // concurrent device execution across multiple logical devices. From dadd064215d1fbf80eac7fee588aa5ad31ccf1c2 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Mon, 11 May 2020 11:55:40 -0700 Subject: [PATCH 0324/1533] Simplify tf::ConstOp's inferReturnTypes Interface has now changed to use DictionaryAttr so use its query methods instead. Need to verify attribute still as this can be called before the op is created so we can't rely on the op's verification from catching these yet (this should be refactored upstream). PiperOrigin-RevId: 310960561 Change-Id: I97eb17084e288613078896209aa3fec09f261dae --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index edba135e0f0..85baff5e0d7 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -986,18 +986,15 @@ LogicalResult ConstOp::inferReturnTypes( MLIRContext *context, Optional location, ValueRange operands, DictionaryAttr attributes, RegionRange regions, SmallVectorImpl &inferredReturnTypes) { - for (NamedAttribute named_attr : attributes) { - if (named_attr.first.strref() != "value") continue; - auto value = named_attr.second; - if (auto elem_attr = value.dyn_cast()) { - inferredReturnTypes.assign({elem_attr.getType()}); - return success(); - } - return emitOptionalError(location, - "attribute 'value' failed to satisfy constraint: " - "constant vector/tensor"); + auto value = attributes.get("value"); + if (!value) return emitOptionalError(location, "missing attribute 'value'"); + if (auto elem_attr = value.dyn_cast()) { + inferredReturnTypes.assign({elem_attr.getType()}); + return success(); } - return emitOptionalError(location, "missing attribute 'value'"); + return emitOptionalError(location, + "attribute 'value' failed to satisfy constraint: " + "constant vector/tensor"); } //===----------------------------------------------------------------------===// From d000961fcd283638ff2fd9fadb0a3c9fcce5db07 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 11 May 2020 19:12:51 +0000 Subject: [PATCH 0325/1533] Bazel buildifier lint fix Signed-off-by: Yong Tang --- tensorflow/core/kernels/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 4a1b9318f29..daa6093a460 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -1337,9 +1337,9 @@ tf_kernel_library( "tile_functor_cpu_int32.cc", "tile_functor_cpu_int64.cc", 
"tile_functor_cpu_int8.cc", + "tile_functor_cpu_tstring.cc", "tile_functor_cpu_uint32.cc", "tile_functor_cpu_uint64.cc", - "tile_functor_cpu_tstring.cc", "tile_functor_cpu_uint8.cc", "tile_functor_sycl.cc", ], From 62100290ef219da6c67fbf7de13c473bf6fde4cc Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Mon, 11 May 2020 12:15:12 -0700 Subject: [PATCH 0326/1533] Merge TPU execution device with host device (NFC). For every execution device, its associated host device is also returned, when calling GetTPUCompilationAndExecutionDevices. PiperOrigin-RevId: 310964957 Change-Id: I7d6af5053d32413d5b6086693ab54c0ed2215efc --- .../tensorflow/transforms/tpu_rewrite_pass.cc | 31 ++-- .../utils/tpu_rewrite_device_util.cc | 63 +++---- .../utils/tpu_rewrite_device_util.h | 46 +++--- .../utils/tpu_rewrite_device_util_test.cc | 155 +++++++++--------- 4 files changed, 145 insertions(+), 150 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index ae1bdff97e0..f5e9da915c8 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -448,19 +448,20 @@ Operation* BuildCompileOp( // core, and all replica devices per core are grouped together. void AssignDevicesToReplicate( tf_device::ReplicateOp replicate, - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, OpBuilder* builder) { if (!replicate) return; - const int num_replicas = execution_devices.size(); - const int num_cores_per_replica = execution_devices.front().size(); + const int num_replicas = tpu_devices.size(); + const int num_cores_per_replica = tpu_devices.front().size(); llvm::SmallVector device_attrs; for (int core = 0; core < num_cores_per_replica; ++core) { llvm::SmallVector devices_by_core; devices_by_core.reserve(num_replicas); for (int replica = 0; replica < num_replicas; ++replica) - devices_by_core.push_back(execution_devices[replica][core]); + devices_by_core.push_back(tpu_devices[replica][core].device); device_attrs.push_back( builder->getNamedAttr(tensorflow::GetDeviceAliasForLogicalCore(core), @@ -492,11 +493,12 @@ LogicalResult BuildExecuteOp( // Creates a tf_device.parallel_execute op that wraps TPUExecute op to // represent execution of TPU program in multiple logical cores. LogicalResult BuildParallelExecuteOp( - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, llvm::ArrayRef output_sharding_config, Operation* compile_op, tf_device::ClusterFuncOp cluster_func, OpBuilder* builder, tf_device::ParallelExecuteOp* parallel_execute_op) { - const int num_cores_per_replica = execution_devices.front().size(); + const int num_cores_per_replica = tpu_devices.front().size(); // parallel_execute op returns concatenated list of return values of // all its regions. // @@ -528,7 +530,7 @@ LogicalResult BuildParallelExecuteOp( num_cores_per_replica, cluster_func, builder, &input_list); if (failed(result)) return failure(); - const bool replicated = execution_devices.size() != 1; + const bool replicated = tpu_devices.size() != 1; // For each logical core, create a region with TPUExecute op. assert(input_list.size() == num_cores_per_replica); for (int core = 0; core < num_cores_per_replica; ++core) { @@ -553,7 +555,7 @@ LogicalResult BuildParallelExecuteOp( // op. std::string device = replicated ? 
tensorflow::GetDeviceAliasForLogicalCore(core) - : execution_devices.front()[core]; + : tpu_devices.front()[core].device; auto region_launch_op = WrapOpInLaunch(builder, region.getParent()->getLoc(), execute, device); @@ -566,13 +568,14 @@ LogicalResult BuildParallelExecuteOp( } tf_device::LaunchOp AssignDevicesToReplicatedExecute( - llvm::ArrayRef> execution_devices, + llvm::ArrayRef> + tpu_devices, Operation* execute_op, OpBuilder* builder) { - const bool replicated = execution_devices.size() != 1; + const bool replicated = tpu_devices.size() != 1; // If computation is replicated, use aliased device. Otherwise there is only // one execution device and the device is assigned to the execute op. std::string device = replicated ? tensorflow::GetDeviceAliasForLogicalCore(0) - : execution_devices.front().front(); + : tpu_devices.front().front().device; return WrapOpInLaunch(builder, execute_op->getLoc(), execute_op, device); } @@ -714,7 +717,7 @@ LogicalResult Rewrite( BuildTPUCompileSucceededAssertOp( compile_op, tpu_device_assignment.compilation_device, builder); - AssignDevicesToReplicate(replicate, tpu_device_assignment.execution_devices, + AssignDevicesToReplicate(replicate, tpu_device_assignment.tpu_devices, builder); llvm::SmallVector output_shardings; @@ -728,7 +731,7 @@ LogicalResult Rewrite( // concurrent device execution across multiple logical devices. tf_device::ParallelExecuteOp execute_op; - result = BuildParallelExecuteOp(tpu_device_assignment.execution_devices, + result = BuildParallelExecuteOp(tpu_device_assignment.tpu_devices, output_shardings, compile_op, cluster_func, builder, &execute_op); if (failed(result)) return failure(); @@ -751,7 +754,7 @@ LogicalResult Rewrite( if (failed(result)) return failure(); tf_device::LaunchOp launch_op = AssignDevicesToReplicatedExecute( - tpu_device_assignment.execution_devices, execute_op, builder); + tpu_device_assignment.tpu_devices, execute_op, builder); cluster_func.replaceAllUsesWith(launch_op); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index ddbcc91e834..06c10c26835 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -164,12 +164,19 @@ std::string GetTPUCompilationDevice(Device system_device) { return DeviceNameUtils::ParsedNameToString(system_device); } +// Finds the host CPU device for a given TPU device. +std::string GetCPUHostDeviceForTPUDevice(Device tpu_device) { + tpu_device.type = DEVICE_CPU; + tpu_device.id = 0; + return DeviceNameUtils::ParsedNameToString(tpu_device); +} + // Determines execution devices when topology and device assignment are not // defined. This is a special case where a single core computation is replicated // to every core in the mesh. TPU devices are simply added to // `execution_devices` of one replica. `num_replicas` must be 1 or the total // number of TPU devices available, and `num_cores_per_replica` must be 1. 
-StatusOr GetFullMeshTPUExecutionDeviceAssignment( +StatusOr GetFullMeshTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, llvm::ArrayRef> tpu_devices) { const int num_tasks = tpu_devices.size(); @@ -185,17 +192,18 @@ StatusOr GetFullMeshTPUExecutionDeviceAssignment( "'num_cores_per_replica' must be equal to 1, got ", num_cores_per_replica); - ExecutionDevices execution_devices; - execution_devices.reserve(num_replicas); + TPUDevicesAndHosts devices_and_hosts; + devices_and_hosts.reserve(num_replicas); for (int i = 0; i < num_replicas; ++i) { const int task = i / num_tpus_per_task; const int device = i % num_tpus_per_task; - execution_devices.push_back( - {tensorflow::DeviceNameUtils::ParsedNameToString( - tpu_devices[task][device])}); + const auto& tpu_device = tpu_devices[task][device]; + devices_and_hosts.push_back({TPUDeviceAndHost( + /*device=*/tensorflow::DeviceNameUtils::ParsedNameToString(tpu_device), + /*host=*/GetCPUHostDeviceForTPUDevice(tpu_device))}); } - return execution_devices; + return devices_and_hosts; } // Helper struct for keeping track of task and device for an associated TPU @@ -326,7 +334,7 @@ StatusOr> ParseTopologyAttr( // - number of device coordinates (in tuple 3) match number 'num_replicas' * // 'num_cores_per_replica' // - a TPU device associated with each device coordinate -StatusOr> +StatusOr> GetGeneralTPUExecutionDeviceAssignment( int num_replicas, int num_cores_per_replica, llvm::ArrayRef> tpu_devices, @@ -361,9 +369,9 @@ GetGeneralTPUExecutionDeviceAssignment( std::vector used_device_ids( location_to_id(bound_x - 1, bound_y - 1, bound_z - 1, bound_core - 1), false); - ExecutionDevices execution_devices( - num_replicas, - llvm::SmallVector(num_cores_per_replica, "")); + TPUDevicesAndHosts devices_and_hosts( + num_replicas, llvm::SmallVector( + num_cores_per_replica, TPUDeviceAndHost())); xla::DeviceAssignment device_assignment(num_replicas, num_cores_per_replica); int pos = 0; for (int replica = 0; replica < num_replicas; ++replica) { @@ -393,16 +401,18 @@ GetGeneralTPUExecutionDeviceAssignment( used_device_ids[device_id] = true; device_assignment(replica, logical_core) = device_id; - execution_devices[replica][logical_core] = - DeviceNameUtils::ParsedNameToString(tpu_devices[task][device]); + auto& device_and_host = devices_and_hosts[replica][logical_core]; + const auto& tpu_device = tpu_devices[task][device]; + device_and_host.device = DeviceNameUtils::ParsedNameToString(tpu_device); + device_and_host.host = GetCPUHostDeviceForTPUDevice(tpu_device); } } xla::DeviceAssignmentProto device_assignment_proto; TF_RETURN_IF_ERROR(device_assignment.Serialize(&device_assignment_proto)); - return std::pair( - std::move(execution_devices), std::move(device_assignment_proto)); + return std::pair( + std::move(devices_and_hosts), std::move(device_assignment_proto)); } } // anonymous namespace @@ -447,27 +457,4 @@ std::string GetDeviceAliasForLogicalCore(int core_index) { return llvm::formatv("{0}_{1}", kTPUReplicatedCore, core_index).str(); } -StatusOr GetCPUHostForTPUDevice(llvm::StringRef tpu_device) { - Device device; - if (!DeviceNameUtils::ParseFullName(tpu_device.str(), &device)) - return errors::InvalidArgument("'", tpu_device.str(), - "' is not a valid device"); - - device.type = DEVICE_CPU; - device.id = 0; - return DeviceNameUtils::ParsedNameToString(device); -} - -StatusOr> GetCPUHostsForTPUDevices( - llvm::ArrayRef tpu_devices) { - llvm::SmallVector cpu_devices; - cpu_devices.reserve(tpu_devices.size()); - for (const auto& 
tpu_device : tpu_devices) { - TF_ASSIGN_OR_RETURN(cpu_devices.emplace_back(), - GetCPUHostForTPUDevice(tpu_device)); - } - - return cpu_devices; -} - } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index 47ce7f14ea8..5fdb6b8768b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -30,29 +30,40 @@ limitations under the License. namespace tensorflow { using stream_executor::port::StatusOr; -// TPU devices to be used for execution (e.g. devices for TPUExecute ops). They -// are ordered by `num_replicas` followed by `num_cores_per_replica`. -using ExecutionDevices = - llvm::SmallVector, 8>; +// A TPU device for execution alongside its associated host CPU device. +struct TPUDeviceAndHost { + TPUDeviceAndHost() {} + TPUDeviceAndHost(llvm::StringRef device, llvm::StringRef host) + : device(device), host(host) {} -// TPU compilation device, execution devices, and optionally execution device -// IDs. Execution device IDs are populated if `topology` and `device_assignment` -// are provided. + std::string device; + std::string host; +}; + +// TPU devices to be used for execution (e.g. devices for TPUExecute ops) and +// their associated host CPU devices (for outside compilation). They are ordered +// by `num_replicas` followed by `num_cores_per_replica`. +using TPUDevicesAndHosts = + llvm::SmallVector, 8>; + +// TPU compilation device, execution and associated host devices, and optionally +// execution device IDs. Execution device IDs are populated if `topology` and +// `device_assignment` are provided. struct TPUDeviceAssignment { TPUDeviceAssignment(llvm::StringRef compilation_device, - ExecutionDevices&& execution_devices) + TPUDevicesAndHosts&& tpu_devices) : compilation_device(compilation_device), - execution_devices(std::move(execution_devices)) {} + tpu_devices(std::move(tpu_devices)) {} TPUDeviceAssignment(llvm::StringRef compilation_device, - ExecutionDevices&& execution_devices, + TPUDevicesAndHosts&& tpu_devices, xla::DeviceAssignmentProto&& xla_device_assignment) : compilation_device(compilation_device), - execution_devices(std::move(execution_devices)), + tpu_devices(std::move(tpu_devices)), xla_device_assignment(std::move(xla_device_assignment)) {} std::string compilation_device; - ExecutionDevices execution_devices; + TPUDevicesAndHosts tpu_devices; llvm::Optional xla_device_assignment; }; @@ -216,17 +227,6 @@ StatusOr GetTPUCompilationAndExecutionDevices( // logical core. std::string GetDeviceAliasForLogicalCore(int core_index); -// Finds associated CPU host device for given TPU device. This assumes a -// matching CPU host device exists based on TPU device name. An error will be -// returned if the TPU device name is invalid. -StatusOr GetCPUHostForTPUDevice(llvm::StringRef tpu_device); - -// Finds associated CPU host devices for given TPU devices. This assumes a -// matching CPU host device exist based on each TPU device name. An error will -// be returned if a TPU device name is invalid. 
-StatusOr> GetCPUHostsForTPUDevices( - llvm::ArrayRef tpu_devices); - } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_REWRITE_DEVICE_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 57e123a5f9a..7ac5635a6e4 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -323,30 +323,46 @@ TEST(TPURewriteDeviceUtilTest, ValidFullMeshDeviceAssignment) { TF_ASSERT_OK(status_or.status()); - auto& tpu_device_assignment = status_or.ValueOrDie(); + const auto& tpu_device_assignment = status_or.ValueOrDie(); EXPECT_EQ(tpu_device_assignment.compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 8); - for (const auto& replica_execution_device : execution_devices) - ASSERT_EQ(replica_execution_device.size(), 1); + const auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 8); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 1); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:0/device:TPU:1"); - EXPECT_EQ(execution_devices[2][0], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][0].device, "/job:worker/replica:0/task:0/device:TPU:2"); - EXPECT_EQ(execution_devices[3][0], + EXPECT_EQ(tpu_devices[2][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][0].device, "/job:worker/replica:0/task:0/device:TPU:3"); - EXPECT_EQ(execution_devices[4][0], + EXPECT_EQ(tpu_devices[3][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[4][0].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[5][0], + EXPECT_EQ(tpu_devices[4][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[5][0].device, "/job:worker/replica:0/task:1/device:TPU:1"); - EXPECT_EQ(execution_devices[6][0], + EXPECT_EQ(tpu_devices[5][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[6][0].device, "/job:worker/replica:0/task:1/device:TPU:2"); - EXPECT_EQ(execution_devices[7][0], + EXPECT_EQ(tpu_devices[6][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[7][0].device, "/job:worker/replica:0/task:1/device:TPU:3"); + EXPECT_EQ(tpu_devices[7][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); EXPECT_FALSE(tpu_device_assignment.xla_device_assignment.hasValue()); } @@ -410,30 +426,46 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh2x2x2) { TF_ASSERT_OK(status_or.status()); - auto& tpu_device_assignment = status_or.ValueOrDie(); + const auto& tpu_device_assignment = status_or.ValueOrDie(); EXPECT_EQ(tpu_device_assignment.compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 4); - for (const auto& replica_execution_device : execution_devices) - 
ASSERT_EQ(replica_execution_device.size(), 2); + const auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 4); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 2); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[0][1], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][1].device, "/job:worker/replica:0/task:1/device:TPU:3"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:0/device:TPU:1"); - EXPECT_EQ(execution_devices[1][1], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][1].device, "/job:worker/replica:0/task:1/device:TPU:2"); - EXPECT_EQ(execution_devices[2][0], + EXPECT_EQ(tpu_devices[1][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][0].device, "/job:worker/replica:0/task:0/device:TPU:3"); - EXPECT_EQ(execution_devices[2][1], + EXPECT_EQ(tpu_devices[2][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[2][1].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[3][0], + EXPECT_EQ(tpu_devices[2][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][0].device, "/job:worker/replica:0/task:0/device:TPU:2"); - EXPECT_EQ(execution_devices[3][1], + EXPECT_EQ(tpu_devices[3][0].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[3][1].device, "/job:worker/replica:0/task:1/device:TPU:1"); + EXPECT_EQ(tpu_devices[3][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); auto& xla_device_assignment = tpu_device_assignment.xla_device_assignment; ASSERT_TRUE(xla_device_assignment.hasValue()); @@ -511,23 +543,35 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(tpu_device_assignment.compilation_device, "/job:worker/replica:0/task:0/device:CPU:0"); - auto& execution_devices = tpu_device_assignment.execution_devices; - ASSERT_EQ(execution_devices.size(), 2); - for (const auto& replica_execution_device : execution_devices) - ASSERT_EQ(replica_execution_device.size(), 3); + auto& tpu_devices = tpu_device_assignment.tpu_devices; + ASSERT_EQ(tpu_devices.size(), 2); + for (const auto& replica_tpu_devices : tpu_devices) + ASSERT_EQ(replica_tpu_devices.size(), 3); - EXPECT_EQ(execution_devices[0][0], + EXPECT_EQ(tpu_devices[0][0].device, "/job:worker/replica:0/task:1/device:TPU:1"); - EXPECT_EQ(execution_devices[0][1], + EXPECT_EQ(tpu_devices[0][0].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][1].device, "/job:worker/replica:0/task:1/device:TPU:0"); - EXPECT_EQ(execution_devices[0][2], + EXPECT_EQ(tpu_devices[0][1].host, + "/job:worker/replica:0/task:1/device:CPU:0"); + EXPECT_EQ(tpu_devices[0][2].device, "/job:worker/replica:0/task:2/device:TPU:0"); - EXPECT_EQ(execution_devices[1][0], + EXPECT_EQ(tpu_devices[0][2].host, + "/job:worker/replica:0/task:2/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][0].device, "/job:worker/replica:0/task:2/device:TPU:1"); - EXPECT_EQ(execution_devices[1][1], + EXPECT_EQ(tpu_devices[1][0].host, + "/job:worker/replica:0/task:2/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][1].device, 
"/job:worker/replica:0/task:0/device:TPU:0"); - EXPECT_EQ(execution_devices[1][2], + EXPECT_EQ(tpu_devices[1][1].host, + "/job:worker/replica:0/task:0/device:CPU:0"); + EXPECT_EQ(tpu_devices[1][2].device, "/job:worker/replica:0/task:0/device:TPU:1"); + EXPECT_EQ(tpu_devices[1][2].host, + "/job:worker/replica:0/task:0/device:CPU:0"); auto& xla_device_assignment = tpu_device_assignment.xla_device_assignment; ASSERT_TRUE(xla_device_assignment.hasValue()); @@ -552,44 +596,5 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(computation_device_2.replica_device_ids(1), 3); } -struct ParameterizedCPUHostForTPUDeviceTest - : ::testing::TestWithParam> {}; - -TEST_P(ParameterizedCPUHostForTPUDeviceTest, CPUHostForTPUDevice) { - auto status_or_device = GetCPUHostForTPUDevice(std::get<0>(GetParam())); - TF_ASSERT_OK(status_or_device.status()); - EXPECT_EQ(status_or_device.ValueOrDie(), std::get<1>(GetParam())); -} - -INSTANTIATE_TEST_SUITE_P( - CPUHostForTPUDevice, ParameterizedCPUHostForTPUDeviceTest, - ::testing::Values( - std::make_tuple("/job:worker/replica:0/task:0/device:TPU:0", - "/job:worker/replica:0/task:0/device:CPU:0"), - std::make_tuple("/job:worker/replica:0/task:1/device:TPU:1", - "/job:worker/replica:0/task:1/device:CPU:0"))); - -TEST(TPURewriteDeviceUtilTest, CPUHostForTPUDeviceInvalidDevice) { - auto status_or_device = GetCPUHostForTPUDevice("bad_device"); - ASSERT_FALSE(status_or_device.ok()); -} - -TEST(TPURewriteDeviceUtilTest, CPUHostsForTPUDevices) { - auto status_or_devices = - GetCPUHostsForTPUDevices({"/job:worker/replica:0/task:0/device:TPU:0", - "/job:worker/replica:0/task:1/device:TPU:1"}); - TF_ASSERT_OK(status_or_devices.status()); - const auto& devices = status_or_devices.ValueOrDie(); - ASSERT_EQ(devices.size(), 2); - EXPECT_EQ(devices[0], "/job:worker/replica:0/task:0/device:CPU:0"); - EXPECT_EQ(devices[1], "/job:worker/replica:0/task:1/device:CPU:0"); -} - -TEST(TPURewriteDeviceUtilTest, CPUHostsForTPUDevicesInvalidDevice) { - auto status_or_devices = GetCPUHostsForTPUDevices( - {"/job:worker/replica:0/task:0/device:TPU:0", "bad_device"}); - ASSERT_FALSE(status_or_devices.ok()); -} - } // anonymous namespace } // namespace tensorflow From a3c55c2437b021afb9b5f9be61bba0ac4bba902a Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Mon, 11 May 2020 12:15:57 -0700 Subject: [PATCH 0327/1533] Reduce space required for TFLM by dropping two non-needed pointers from MicroAllocator. PiperOrigin-RevId: 310965121 Change-Id: I1d7abe758ef14bfe722c24bfe8878a37a4c738f8 --- tensorflow/lite/micro/micro_allocator.cc | 20 +++++++++----------- tensorflow/lite/micro/micro_allocator.h | 2 -- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index c2b32166467..54ce3383a08 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -388,10 +388,8 @@ TfLiteStatus MicroAllocator::Init() { return kTfLiteError; } subgraph_ = (*subgraphs)[0]; - tensors_ = subgraph_->tensors(); - operators_ = subgraph_->operators(); - context_->tensors_size = tensors_->size(); + context_->tensors_size = subgraph_->tensors()->size(); context_->tensors = reinterpret_cast(memory_allocator_->AllocateFromTail( sizeof(TfLiteTensor) * context_->tensors_size, @@ -405,9 +403,9 @@ TfLiteStatus MicroAllocator::Init() { } // Initialize runtime tensors in context_ using the flatbuffer. 
- for (size_t i = 0; i < tensors_->size(); ++i) { + for (size_t i = 0; i < subgraph_->tensors()->size(); ++i) { TfLiteStatus status = internal::InitializeRuntimeTensor( - memory_allocator_, *tensors_->Get(i), model_->buffers(), + memory_allocator_, *subgraph_->tensors()->Get(i), model_->buffers(), error_reporter_, &context_->tensors[i]); if (status != kTfLiteOk) { TF_LITE_REPORT_ERROR(error_reporter_, "Failed to initialize tensor %d", @@ -472,7 +470,7 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( auto* output = reinterpret_cast( memory_allocator_->AllocateFromTail( - sizeof(NodeAndRegistration) * operators_->size(), + sizeof(NodeAndRegistration) * subgraph_->operators()->size(), alignof(NodeAndRegistration))); if (output == nullptr) { TF_LITE_REPORT_ERROR( @@ -483,8 +481,8 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( TfLiteStatus status = kTfLiteOk; auto* opcodes = model_->operator_codes(); MicroBuiltinDataAllocator builtin_data_allocator(memory_allocator_); - for (size_t i = 0; i < operators_->size(); ++i) { - const auto* op = operators_->Get(i); + for (size_t i = 0; i < subgraph_->operators()->size(); ++i) { + const auto* op = subgraph_->operators()->Get(i); size_t index = op->opcode_index(); if (index >= opcodes->size()) { TF_LITE_REPORT_ERROR(error_reporter_, @@ -567,7 +565,7 @@ TfLiteStatus MicroAllocator::FinishTensorAllocation() { AllocationInfoBuilder builder(error_reporter_, &tmp_allocator); TF_LITE_ENSURE_STATUS( - builder.Init(tensors_->size(), scratch_buffer_count_)); + builder.Init(subgraph_->tensors()->size(), scratch_buffer_count_)); TF_LITE_ENSURE_STATUS(builder.AddTensors(subgraph_, context_->tensors)); TF_LITE_ENSURE_STATUS(builder.AddScratchBuffers(scratch_buffer_handles_)); const AllocationInfo* allocation_info = builder.Finish(); @@ -606,8 +604,8 @@ TfLiteStatus MicroAllocator::FinishTensorAllocation() { // Data in variables need to be kept for the next invocation so allocating // them from the tail (persistent area). - if (AllocateVariables(tensors_, context_->tensors, memory_allocator_) != - kTfLiteOk) { + if (AllocateVariables(subgraph_->tensors(), context_->tensors, + memory_allocator_) != kTfLiteOk) { TF_LITE_REPORT_ERROR( error_reporter_, "Failed to allocate variables. Please increase arena size."); diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index b16f814071c..6a6e1e03e53 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -135,8 +135,6 @@ class MicroAllocator { size_t scratch_buffer_count_ = 0; const SubGraph* subgraph_; - const flatbuffers::Vector>* operators_; - const flatbuffers::Vector>* tensors_; }; } // namespace tflite From 092ee88aa239ed3b1ca568982a9709e074bfd046 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 11 May 2020 12:26:01 -0700 Subject: [PATCH 0328/1533] Re-update the TFRT hash. 
PiperOrigin-RevId: 310967113 Change-Id: Icc0968613cd6f75b0116b86c88dcffc94a99e72a --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 9a41d947922..abfd35949ec 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -162,8 +162,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): print("path_prefix was specified to tf_workspace but is no longer used " + "and will be removed in the future.") - TFRT_COMMIT = "d8e16ab39a26772027c1846386a9a0fef613e470" - TFRT_SHA256 = "d51a6fecae1319d225cc84614c96370ea3eb4e8824b79b7da6bf0e5caeb8fbdd" + TFRT_COMMIT = "341ba0448c117af4e29ae3911141265ee8e57860" + TFRT_SHA256 = "27716458f8ca7d91fc2d0f681127dbdd478eea78d6da5153c51b4696ebd14d55" TFRT_URLS = [ "http://mirror.tensorflow.org/github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), "https://github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), From 833be6f54c74138daa38bd5b7d34179f9cb9e6bb Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Mon, 11 May 2020 12:44:35 -0700 Subject: [PATCH 0329/1533] Reduce inference functions exposed This retains every call sequence as is (e.g., only caller to InferShapeForFunctionType was inference pass and that was always preceded by InferShapeUntilFixPoint, so now the case where we do not refine the arg shapes, simply does that). PiperOrigin-RevId: 310971563 Change-Id: Ibd0357c20f2cf2c856a200099070b4457efe9f57 --- .../tensorflow/transforms/shape_inference.cc | 34 +++++++++++++------ .../tensorflow/transforms/shape_inference.h | 19 +---------- .../transforms/shape_inference_pass.cc | 6 ++-- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index efe82c4268b..41902c46b40 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -536,6 +536,11 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { return ic->MakeShape(dims); } +// Performs shape inference on the provided op and return true if the type of +// at least one result has been changed. +// A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. +// `graph_version` indicates the current GraphDef compatibility versions +// (the versions field in graph.proto). bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, int64_t graph_version) { assert(tf_dialect == op->getDialect()); @@ -755,6 +760,12 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, return changed; } +// Infers shape on the provided region, including nested ones, iterate until fix +// point with a limit of max_iteration. Returns success if fix point is reached +// before max_iteration. +LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, + int64_t max_iteration = 10); + // Updates input types and refine shapes inside body of functions that are // attached to ControlFlow ops (If/While). These functions include Then/Else // branches of IfOp and Cond/Body functions of WhileOp. 
These functions share @@ -945,6 +956,19 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, LogicalResult InferShapeForFunction(FuncOp func, ArrayRef> arg_shapes, int64_t graph_version) { + if (arg_shapes.empty()) { + if (failed(InferShapeUntilFixPoint(&func.getBody(), graph_version))) + return failure(); + // TODO(b/156276510): Verify that it is always fine to refine a function's + // return type, as long as we do not change the argument shapes. + if (auto return_types = InferShapeForFunctionReturnType(func)) { + func.setType(mlir::FunctionType::get(func.getType().getInputs(), + return_types.getValue(), + func.getContext())); + } + + return success(); + } mlir::FunctionType func_type = func.getType(); bool needs_refinement = false; llvm::SmallVector new_arg_types; @@ -998,15 +1022,5 @@ LogicalResult InferShapeForFunction(FuncOp func, return success(); } -LogicalResult InferShapeForFunctionType(FuncOp func) { - if (auto return_types = InferShapeForFunctionReturnType(func)) { - func.setType(mlir::FunctionType::get(func.getType().getInputs(), - return_types.getValue(), - func.getContext())); - } - - return success(); -} - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h index 0524ec678ed..e36d8d56d6d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h @@ -27,30 +27,13 @@ namespace mlir { namespace TF { -// Performs shape inference on the provided op and return true if the type of -// at least one result has been changed. -// A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. -// `graph_version` indicates the current GraphDef compatibility versions -// (the versions field in graph.proto). -bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, - int64_t graph_version); - -// Infers shape on the provided region, including nested ones, iterate until fix -// point with a limit of max_iteration. Returns success if fix point is reached -// before max_iteration. -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration = 10); - // Given a list of refined shapes matching the function arguments of func, runs // shape inference over the function to propagate this updated information. +// If arg_shapes are empty, then argument shapes will be left unchanged. LogicalResult InferShapeForFunction(FuncOp func, ArrayRef> arg_shapes, int64_t graph_version); -// Refines the return type of the given function by folding tf.Cast that -// precedes the return instruction. -LogicalResult InferShapeForFunctionType(FuncOp func); - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc index 48e4e77ce0f..acdfc0eb039 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc @@ -58,10 +58,8 @@ struct ShapeInference } int64_t producer = producer_or.ValueOrDie(); for (auto func : module.getOps()) { - InferShapeUntilFixPoint(&func.getBody(), producer); - // TODO(yuanzx): Verify that it is always fine to refine a function's - // return type, as long as we do not change the argument shapes. 
- InferShapeForFunctionType(func); + if (failed(InferShapeForFunction(func, /*arg_shapes=*/{}, producer))) + return signalPassFailure(); } } }; From 203d8f502c3a18fd0334c4120ff384be90d100eb Mon Sep 17 00:00:00 2001 From: Jing Dong Date: Mon, 11 May 2020 12:49:56 -0700 Subject: [PATCH 0330/1533] Remove disable_tfrt for scalars are not handled reason, as scalars are handled properly with TFRT. PiperOrigin-RevId: 310972818 Change-Id: I918d193bc639c9b12dbdc982b8c9065edea01056 --- tensorflow/python/eager/benchmarks_test.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index f2f13279927..227fca5ea6f 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -180,22 +180,18 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): func() # Warmup. self._run(func, 3000) - @test_util.disable_tfrt("Scalars are not handled correctly") def benchmark_create_float_constant(self): self._benchmark_create_constant(42.0, dtype=None) - @test_util.disable_tfrt("Scalars are not handled correctly") def benchmark_create_float_constant_uncached(self): self._benchmark_create_constant(42.0, dtype=None, cached=False) - @test_util.disable_tfrt("Scalars are not handled correctly") def benchmark_create_int32_constant(self): if context.num_gpus(): return # int32 constants are always allocated on CPU. self._benchmark_create_constant(42, dtype=dtypes.int32) - @test_util.disable_tfrt("Scalars are not handled correctly") def benchmark_create_int32_constant_uncached(self): if context.num_gpus(): return # int32 constants are always allocated on CPU. @@ -211,21 +207,17 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): func() # Warmup. self._run(func, 30000) - @test_util.disable_tfrt("Scalars are not handled correctly") def benchmark_add_float_scalars(self): self._benchmark_add(42.0, 24.0) - @test_util.disable_tfrt("Scalars are not handled correctly") def benchmark_add_int32_scalars(self): self._benchmark_add(42, 24) - @test_util.disable_tfrt("Scalars are not handled correctly") def benchmark_add_float_scalar_tensor(self): tensor_a = constant_op.constant(42.0) tensor_b = constant_op.constant(24.0) self._benchmark_add(tensor_a, tensor_b) - @test_util.disable_tfrt("Scalars are not handled correctly") def benchmark_add_int32_scalar_tensor(self): tensor_a = constant_op.constant(42) tensor_b = constant_op.constant(24) From 7a691ccd98590e62e885553611eff4fde3d26e93 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 11 May 2020 13:01:44 -0700 Subject: [PATCH 0331/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/728cf6d86b4f PiperOrigin-RevId: 310975358 Change-Id: I2d0978a23c371702c5d83266c8214223bf267630 --- tensorflow/compiler/mlir/lite/BUILD | 2 +- tensorflow/compiler/mlir/tensorflow/BUILD | 2 +- tensorflow/compiler/mlir/xla/BUILD | 4 +- .../xla/transforms/lhlo_legalize_to_gpu.cc | 7 +- .../lhlo_legalize_to_parallel_loops.cc | 97 +++++++++---------- .../compiler/xla/service/mlir_gpu/BUILD | 4 +- .../xla/service/mlir_gpu/kernel_lowering.cc | 14 +-- third_party/mlir/BUILD | 74 +++++++------- third_party/mlir/test.BUILD | 2 +- 9 files changed, 102 insertions(+), 104 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index d907d28b2c7..f99b2806faf 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -695,9 +695,9 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:LoopOpsTransforms", "@llvm-project//mlir:MlirTranslateMain", "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Translation", diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 9099f2be2e1..5c5b74a30ae 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -556,7 +556,7 @@ cc_library( deps = [ ":tensorflow", "@llvm-project//mlir:IR", - "@llvm-project//mlir:LoopOpsTransforms", + "@llvm-project//mlir:SCFTransforms", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index e0e93c3b195..d9108e8f3bc 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -240,8 +240,8 @@ cc_library( "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", - "@llvm-project//mlir:LoopOps", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], @@ -278,8 +278,8 @@ cc_library( "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", - "@llvm-project//mlir:LoopOps", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Transforms", ], diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc index e6f3ac02d4f..f0eb3cc1a0f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_gpu.cc @@ -21,7 +21,7 @@ limitations under the License. 
#include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/GPU/GPUDialect.h" // from @llvm-project #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project @@ -112,7 +112,7 @@ class LhloReduceToGPULaunchConverter : public OpConversionPattern { auto step = rewriter.create( loc, rewriter.getIndexType(), rewriter.getIntegerAttr(rewriter.getIndexType(), 1)); - auto loop = rewriter.create(loc, zero, upper, step); + auto loop = rewriter.create(loc, zero, upper, step); rewriter.setInsertionPointToStart(loop.getBody()); // Compute memrefs for the value to reduce. This makes it easier to just @@ -173,8 +173,7 @@ struct LhloLegalizeToGpu : public PassWrapper { OwningRewritePatternList patterns; ConversionTarget target(getContext()); target.addLegalDialect(); + gpu::GPUDialect, scf::SCFDialect, XlaLhloDialect>(); target.addIllegalOp(); auto func = getFunction(); patterns.insert(func.getContext()); diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc index 54b3acd3787..c5f5b39e04c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc @@ -18,7 +18,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -64,12 +64,12 @@ Value ApplySingleResultLhloCode(Location loc, ValueRange operands, // into a reduction operator of loop.reduce by doing buffer allocation for // scalar arguments and the result of `loop.reduce` to make it compatible with // LHLO ops. -void ConvertToReductionOperator(Location loc, loop::ReduceOp reduce_op, +void ConvertToReductionOperator(Location loc, scf::ReduceOp reduce_op, Block* lhlo_block, OpBuilder* b) { Block& loop_reduce_op_body = reduce_op.reductionOperator().front(); OpBuilder::InsertionGuard guard(*b); b->setInsertionPointToStart(&loop_reduce_op_body); - b->create( + b->create( loc, ApplySingleResultLhloCode(loc, loop_reduce_op_body.getArguments(), lhlo_block, b)); } @@ -136,9 +136,9 @@ MappedIvs MapWindowIvsToInput(OpTy op, ValueRange ivs, ValueRange window_ivs, return mapped_ivs; } -// Returns loop::Parallel over a shaped value with static or dynamic shape. -loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, - OpBuilder* b) { +// Returns scf::Parallel over a shaped value with static or dynamic shape. 
+scf::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, + OpBuilder* b) { Value zero = b->create(loc, 0); Value one = b->create(loc, 1); @@ -151,10 +151,10 @@ loop::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, lower.push_back(zero); step.push_back(one); } - return b->create(loc, lower, upper, step); + return b->create(loc, lower, upper, step); } -// Converts `xla_lhlo.ReduceOp` into two loop::ParallelOp and a loop::ReduceOp. +// Converts `xla_lhlo.ReduceOp` into two scf::ParallelOp and a scf::ReduceOp. // The outper `ParallelOp` refers to the parallel loops if there are // any. The inner `ParalleOp` refers to the reduction loops and `ReduceOp` // contains the reduction operator. @@ -197,7 +197,7 @@ class ReduceOpConverter : public OpConversionPattern { // TODO(b/137624192) Implement variadic reduce. if (xla_reduce_op.out().size() != 1) return failure(); - loop::ReduceOp reduce_op = + scf::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops(xla_reduce_op, &rewriter); ConvertToReductionOperator(xla_reduce_op.getLoc(), reduce_op, &xla_reduce_op.body().front(), &rewriter); @@ -225,7 +225,7 @@ class ReduceOpConverter : public OpConversionPattern { // } : f32 // loop.yield // } - loop::ReduceOp CreateReduceOpInNestedParallelLoops( + scf::ReduceOp CreateReduceOpInNestedParallelLoops( xla_lhlo::ReduceOp xla_reduce_op, ConversionPatternRewriter* rewriter) const { auto loc = xla_reduce_op.getLoc(); @@ -254,13 +254,13 @@ class ReduceOpConverter : public OpConversionPattern { SmallVector init_value = { rewriter->create(loc, *xla_reduce_op.init_values().begin())}; // Outer ParallelOp is not needed if it is a reduction across all dims. - loop::ParallelOp outer; + scf::ParallelOp outer; if (!parallel_lower.empty()) { - outer = rewriter->create(loc, parallel_lower, - parallel_upper, parallel_step); + outer = rewriter->create(loc, parallel_lower, + parallel_upper, parallel_step); rewriter->setInsertionPointToStart(outer.getBody()); } - loop::ParallelOp inner = rewriter->create( + scf::ParallelOp inner = rewriter->create( loc, reduce_lower, reduce_upper, reduce_step, init_value); Value reduction_result = *inner.getResults().begin(); @@ -294,7 +294,7 @@ class ReduceOpConverter : public OpConversionPattern { rewriter->setInsertionPointToStart(inner.getBody()); Value elem = rewriter->create( loc, *xla_reduce_op.operands().begin(), indices); - return rewriter->create(loc, elem); + return rewriter->create(loc, elem); } }; @@ -314,8 +314,8 @@ class ReduceOpConverter : public OpConversionPattern { // accumulator = reduction_operator(output[O], value) // output[O] = accumulator // -// Converts `xla_lhlo.ReduceWindowOp` into two loop::ParallelOp and a -// loop::ReduceOp. +// Converts `xla_lhlo.ReduceWindowOp` into two scf::ParallelOp and a +// scf::ReduceOp. // The outper `ParallelOp` refers to the parallel loops that traverese output // buffer. The inner `ParalleOp` refers to the reduction loops that traverse // reduction windows and `ReduceOp` contains the reduction operator. 
@@ -366,12 +366,12 @@ class ReduceWindowOpConverter LogicalResult matchAndRewrite( xla_lhlo::ReduceWindowOp xla_reduce_window_op, ArrayRef /*args*/, ConversionPatternRewriter& rewriter) const final { - loop::ParallelOp output_loop, window_loop; + scf::ParallelOp output_loop, window_loop; std::tie(output_loop, window_loop) = CreateParallelLoopsToTraverseOutputAndWindow(xla_reduce_window_op, &rewriter); - loop::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops( + scf::ReduceOp reduce_op = CreateReduceOpInNestedParallelLoops( xla_reduce_window_op, output_loop, window_loop, &rewriter); ConvertToReductionOperator(xla_reduce_window_op.getLoc(), reduce_op, @@ -381,7 +381,7 @@ class ReduceWindowOpConverter } private: - std::pair + std::pair CreateParallelLoopsToTraverseOutputAndWindow( xla_lhlo::ReduceWindowOp xla_reduce_window_op, ConversionPatternRewriter* rewriter) const { @@ -405,7 +405,7 @@ class ReduceWindowOpConverter window_upper.push_back( rewriter->create(loc, window_dim.getSExtValue())); } - auto window_loop = rewriter->create( + auto window_loop = rewriter->create( loc, window_lower, window_upper, window_step, init_value); Value reduction_result = *window_loop.getResults().begin(); @@ -414,9 +414,9 @@ class ReduceWindowOpConverter return std::make_pair(output_loop, window_loop); } - loop::ReduceOp CreateReduceOpInNestedParallelLoops( + scf::ReduceOp CreateReduceOpInNestedParallelLoops( xla_lhlo::ReduceWindowOp xla_reduce_window_op, - loop::ParallelOp output_loop, loop::ParallelOp window_loop, + scf::ParallelOp output_loop, scf::ParallelOp window_loop, ConversionPatternRewriter* rewriter) const { rewriter->setInsertionPointToStart(window_loop.getBody()); auto loc = xla_reduce_window_op.getLoc(); @@ -436,20 +436,20 @@ class ReduceWindowOpConverter xla_reduce_window_op, output_loop.getInductionVars(), window_loop.getInductionVars(), rewriter); - auto elem_or_init = rewriter->create( + auto elem_or_init = rewriter->create( loc, xla_operand_type.getElementType(), mapped_ivs.in_bounds, /*withElseRegion=*/true); OpBuilder then_builder = elem_or_init.getThenBodyBuilder(); Value elem = then_builder.create( loc, xla_reduce_window_op.operand(), mapped_ivs.ivs); - then_builder.create(loc, elem); + then_builder.create(loc, elem); OpBuilder else_builder = elem_or_init.getElseBodyBuilder(); - else_builder.create(loc, *window_loop.initVals().begin()); + else_builder.create(loc, *window_loop.initVals().begin()); - return rewriter->create(loc, - *elem_or_init.results().begin()); + return rewriter->create(loc, + *elem_or_init.results().begin()); } }; @@ -490,7 +490,7 @@ class SelectAndScatterOpConverter ConversionPatternRewriter& rewriter) const final { auto loc = s_and_s_op.getLoc(); InitializeOutput(s_and_s_op, &rewriter); - loop::ParallelOp loop_over_src = + scf::ParallelOp loop_over_src = MakeLoopOverShape(loc, s_and_s_op.source(), &rewriter); rewriter.setInsertionPointToStart(loop_over_src.getBody()); @@ -520,7 +520,7 @@ class SelectAndScatterOpConverter auto loc = s_and_s_op.getLoc(); Value init_value = b->create(loc, s_and_s_op.init_value()); - loop::ParallelOp loop_over_output = + scf::ParallelOp loop_over_output = MakeLoopOverShape(loc, s_and_s_op.out(), b); OpBuilder::InsertionGuard guard(*b); b->setInsertionPointToStart(loop_over_output.getBody()); @@ -531,10 +531,10 @@ class SelectAndScatterOpConverter struct WindowLoops { SmallVector selected_ivs; SmallVector window_ivs; - loop::ForOp inner_loop; + scf::ForOp inner_loop; }; WindowLoops InsertWindowLoops(xla_lhlo::SelectAndScatterOp 
s_and_s_op, - loop::ParallelOp loop_over_src, + scf::ParallelOp loop_over_src, OpBuilder* b) const { auto loc = s_and_s_op.getLoc(); Value zero = b->create(loc, 0); @@ -558,12 +558,12 @@ class SelectAndScatterOpConverter s_and_s_op.window_dimensions()->getIntValues()) { Value upper = b->create(loc, window_dim.getSExtValue()); result.inner_loop = - b->create(loc, zero, upper, one, iter_args); + b->create(loc, zero, upper, one, iter_args); if (b->getInsertionBlock() == loop_over_src.getBody()) { ip = b->saveInsertionPoint(); result.selected_ivs = result.inner_loop.getResults().take_front(rank); } else { - b->create(loc, result.inner_loop.getResults()); + b->create(loc, result.inner_loop.getResults()); } b->setInsertionPointToStart(result.inner_loop.getBody()); iter_args = ValueRange{result.inner_loop.getRegionIterArgs()}; @@ -599,7 +599,7 @@ class SelectAndScatterOpConverter }; SmallVector SelectIvs(xla_lhlo::SelectAndScatterOp s_and_s_op, - loop::ParallelOp loop_over_src, + scf::ParallelOp loop_over_src, OpBuilder* b) const { auto loc = s_and_s_op.getLoc(); @@ -614,7 +614,7 @@ class SelectAndScatterOpConverter IterArgs ivs_val_flag(window_loops.inner_loop.getRegionIterArgs()); - auto if_in_bounds = inner_loop_b.create( + auto if_in_bounds = inner_loop_b.create( loc, window_loops.inner_loop.getResultTypes(), mapped_ivs.in_bounds, /*withElseRegion=*/true); @@ -623,16 +623,16 @@ class SelectAndScatterOpConverter OpBuilder in_bounds_then_b = if_in_bounds.getThenBodyBuilder(); auto select_or_init_results = SelectOrInitialize( s_and_s_op, mapped_ivs.ivs, &ivs_val_flag, &in_bounds_then_b); - in_bounds_then_b.create(loc, select_or_init_results); + in_bounds_then_b.create(loc, select_or_init_results); } // Case when we are in the pad. { OpBuilder in_bounds_else_b = if_in_bounds.getElseBodyBuilder(); - in_bounds_else_b.create(loc, ivs_val_flag.to_vector()); + in_bounds_else_b.create(loc, ivs_val_flag.to_vector()); } - inner_loop_b.create(loc, if_in_bounds.getResults()); + inner_loop_b.create(loc, if_in_bounds.getResults()); return window_loops.selected_ivs; } @@ -647,8 +647,8 @@ class SelectAndScatterOpConverter Value operand_elem = b->create(loc, s_and_s_op.operand(), operand_ivs); auto if_init = - b->create(loc, iter_arg_types, ivs_val_flag->is_init(), - /*withElseRegion=*/true); + b->create(loc, iter_arg_types, ivs_val_flag->is_init(), + /*withElseRegion=*/true); // Init == true, i.e. iter args are already initialized with a selected // element in boundaries of the operand. Select function has to be computed // here. @@ -660,32 +660,31 @@ class SelectAndScatterOpConverter ApplySingleResultLhloCode(loc, {operand_elem, ivs_val_flag->value()}, &lhlo_select, &if_init_then_b); - auto if_pred = - if_init_then_b.create(loc, iter_arg_types, pred, - /*withElseRegion=*/true); + auto if_pred = if_init_then_b.create(loc, iter_arg_types, pred, + /*withElseRegion=*/true); // Pred == true, therefore pack newly selected ivs, val and init flag back // to iter_args and return. { OpBuilder if_pred_then_b = if_pred.getThenBodyBuilder(); - if_pred_then_b.create( + if_pred_then_b.create( loc, IterArgs{operand_ivs, operand_elem, true_i1}.to_vector()); } // Pred == false, therefore return old iter_args. { OpBuilder if_pred_else_b = if_pred.getElseBodyBuilder(); - if_pred_else_b.create(loc, ivs_val_flag->to_vector()); + if_pred_else_b.create(loc, ivs_val_flag->to_vector()); } - if_init_then_b.create(loc, if_pred.getResults()); + if_init_then_b.create(loc, if_pred.getResults()); } // Init == false, i.e. 
only pad was visited before and this is the first // element in the boundaries of the operand. { OpBuilder if_init_else_b = if_init.getElseBodyBuilder(); - if_init_else_b.create( + if_init_else_b.create( loc, IterArgs{operand_ivs, operand_elem, true_i1}.to_vector()); } return if_init.getResults(); @@ -708,7 +707,7 @@ struct LhloLegalizeToParallelLoops ConversionTarget target(getContext()); target.addLegalDialect(); + scf::SCFDialect, XlaLhloDialect>(); target.addIllegalOp(); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index cd679f7412e..a57e4300d6e 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -185,11 +185,11 @@ cc_library( "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgToLLVM", "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:LoopOps", - "@llvm-project//mlir:LoopOpsTransforms", "@llvm-project//mlir:LoopsToGPUPass", "@llvm-project//mlir:NVVMDialect", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 33d3690d4ab..5328d9e93a1 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -31,9 +31,9 @@ limitations under the License. #include "mlir/Dialect/LLVMIR/NVVMDialect.h" // from @llvm-project #include "mlir/Dialect/Linalg/IR/LinalgOps.h" // from @llvm-project #include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/LoopOps.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/Passes.h" // from @llvm-project -#include "mlir/Dialect/LoopOps/Transforms.h" // from @llvm-project +#include "mlir/Dialect/SCF/Passes.h" // from @llvm-project +#include "mlir/Dialect/SCF/SCF.h" // from @llvm-project +#include "mlir/Dialect/SCF/Transforms.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BlockAndValueMapping.h" // from @llvm-project @@ -132,7 +132,7 @@ struct StoreForwardingPass // No store operation found. Continue search outside of the parallel // loop if block is in a parallel loop. 
if (auto parallelOp = - llvm::dyn_cast(block->getParentOp())) { + llvm::dyn_cast(block->getParentOp())) { return findStore(parallelOp.getOperation(), matches); } return {}; @@ -388,8 +388,8 @@ struct MapParallelLoops struct FuseInnerParallelLoops : public mlir::PassWrapper { void runOnFunction() override { - getFunction().walk([](mlir::loop::ParallelOp op) { - mlir::loop::naivelyFuseParallelOps(op.region()); + getFunction().walk([](mlir::scf::ParallelOp op) { + mlir::scf::naivelyFuseParallelOps(op.region()); }); } }; @@ -401,7 +401,7 @@ struct ParallelLoopCollapsingToFirstDim void runOnOperation() override { mlir::Operation* module = getOperation(); - module->walk([&](mlir::loop::ParallelOp op) { + module->walk([&](mlir::scf::ParallelOp op) { unsigned num_loops = op.getNumLoops(); std::vector combinedLoops; combinedLoops.reserve(num_loops); diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 925fad7414f..75b32c73260 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -297,9 +297,9 @@ cc_library( ) filegroup( - name = "LoopOpsTdFiles", + name = "SCFTdFiles", srcs = [ - "include/mlir/Dialect/LoopOps/LoopOps.td", + "include/mlir/Dialect/SCF/SCFOps.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", "include/mlir/Interfaces/LoopLikeInterface.td", "include/mlir/Interfaces/SideEffects.td", @@ -308,26 +308,26 @@ filegroup( ) gentbl( - name = "LoopOpsIncGen", + name = "SCFIncGen", strip_include_prefix = "include", tbl_outs = [ ( "-gen-op-decls", - "include/mlir/Dialect/LoopOps/LoopOps.h.inc", + "include/mlir/Dialect/SCF/SCFOps.h.inc", ), ( "-gen-op-defs", - "include/mlir/Dialect/LoopOps/LoopOps.cpp.inc", + "include/mlir/Dialect/SCF/SCFOps.cpp.inc", ), ( "-gen-dialect-decls", - "include/mlir/Dialect/LoopOps/LoopOpsDialect.h.inc", + "include/mlir/Dialect/SCF/SCFOpsDialect.h.inc", ), ], tblgen = ":mlir-tblgen", - td_file = "include/mlir/Dialect/LoopOps/LoopOps.td", + td_file = "include/mlir/Dialect/SCF/SCFOps.td", td_srcs = [ - ":LoopOpsTdFiles", + ":SCFTdFiles", ], ) @@ -337,30 +337,30 @@ gentbl( tbl_outs = [ ( "-gen-pass-decls", - "include/mlir/Dialect/LoopOps/Passes.h.inc", + "include/mlir/Dialect/SCF/Passes.h.inc", ), ], tblgen = ":mlir-tblgen", - td_file = "include/mlir/Dialect/LoopOps/Passes.td", + td_file = "include/mlir/Dialect/SCF/Passes.td", td_srcs = [ ":PassBaseTdFiles", ], ) cc_library( - name = "LoopOpsTransforms", + name = "SCFTransforms", srcs = glob([ - "lib/Dialect/LoopOps/Transforms/*.cpp", - "lib/Dialect/LoopOps/Transforms/*.h", + "lib/Dialect/SCF/Transforms/*.cpp", + "lib/Dialect/SCF/Transforms/*.h", ]), - hdrs = ["include/mlir/Dialect/LoopOps/Passes.h"], + hdrs = ["include/mlir/Dialect/SCF/Passes.h"], includes = ["include"], deps = [ ":Affine", ":IR", - ":LoopOps", ":LoopPassIncGen", ":Pass", + ":SCFDialect", ":StandardOps", ":Transforms", "@llvm-project//llvm:support", @@ -521,8 +521,8 @@ cc_library( ":AffinePassIncGen", ":Analysis", ":IR", - ":LoopOps", ":Pass", + ":SCFDialect", ":StandardOps", ":Support", ":Transforms", @@ -559,8 +559,8 @@ cc_library( ":Affine", ":ConversionPassIncGen", ":IR", - ":LoopOps", ":Pass", + ":SCFDialect", ":StandardOps", ":Support", ":Transforms", @@ -588,17 +588,17 @@ cc_library( ) cc_library( - name = "LoopOps", + name = "SCFDialect", srcs = glob( [ - "lib/Dialect/LoopOps/*.cpp", - "lib/Dialect/LoopOps/*.h", - "lib/Dialect/LoopOps/EDSC/*.cpp", + "lib/Dialect/SCF/*.cpp", + "lib/Dialect/SCF/*.h", + "lib/Dialect/SCF/EDSC/*.cpp", ], ), hdrs = glob([ - "include/mlir/Dialect/LoopOps/*.h", - 
"include/mlir/Dialect/LoopOps/EDSC/*.h", + "include/mlir/Dialect/SCF/*.h", + "include/mlir/Dialect/SCF/EDSC/*.h", ]), includes = ["include"], deps = [ @@ -606,7 +606,7 @@ cc_library( ":EDSC", ":IR", ":LoopLikeInterface", - ":LoopOpsIncGen", + ":SCFIncGen", ":SideEffects", ":StandardOps", ":Support", @@ -1113,9 +1113,9 @@ cc_library( ":GPUDialect", ":GPUPassIncGen", ":IR", - ":LoopOps", ":ParallelLoopMapperAttrGen", ":Pass", + ":SCFDialect", ":StandardOps", ":Support", ":Transforms", @@ -1324,8 +1324,8 @@ cc_library( ":GPUDialect", ":GPUToSPIRVIncGen", ":IR", - ":LoopOps", ":Pass", + ":SCFDialect", ":SPIRVDialect", ":SPIRVLowering", ":StandardToSPIRVConversions", @@ -1883,7 +1883,7 @@ cc_library( ":ControlFlowInterfaces", ":IR", ":LoopLikeInterface", - ":LoopOps", + ":SCFDialect", ":SideEffects", ":StandardOps", ":Support", @@ -2000,8 +2000,8 @@ cc_library( ":ControlFlowInterfaces", ":IR", ":LoopLikeInterface", - ":LoopOps", ":Pass", + ":SCFDialect", ":SideEffects", ":StandardOps", ":Support", @@ -2037,8 +2037,8 @@ cc_library( ":GPUDialect", ":GPUTransforms", ":IR", - ":LoopOps", ":Pass", + ":SCFDialect", ":StandardOps", ":Support", ":TransformUtils", @@ -2061,9 +2061,9 @@ cc_library( ":Affine", ":ConversionPassIncGen", ":GPUDialect", - ":LoopOps", ":LoopsToGPU", ":Pass", + ":SCFDialect", ":StandardOps", ":Support", ":Transforms", @@ -2085,8 +2085,8 @@ cc_library( ":ConversionPassIncGen", ":IR", ":LLVMDialect", - ":LoopOps", ":Pass", + ":SCFDialect", ":StandardOps", ":Support", ":TransformUtils", @@ -2292,7 +2292,7 @@ cc_library( ":Affine", ":CallOpInterfaces", ":IR", - ":LoopOps", + ":SCFDialect", ":StandardOps", ":Support", "@llvm-project//llvm:support", @@ -2479,10 +2479,10 @@ cc_library( ":LLVMTransforms", ":LinalgToLLVM", ":LinalgToSPIRV", - ":LoopOpsTransforms", ":NVVMDialect", ":Parser", ":Pass", + ":SCFTransforms", ":StandardOpsTransforms", ":StandardToSPIRVConversions", ":StandardToStandard", @@ -2566,8 +2566,6 @@ cc_library( ":LinalgToLLVM", ":LinalgToSPIRV", ":LinalgTransforms", - ":LoopOps", - ":LoopOpsTransforms", ":LoopPassIncGen", ":LoopsToGPUPass", ":NVVMDialect", @@ -2575,6 +2573,8 @@ cc_library( ":QuantOps", ":QuantPassIncGen", ":ROCDLDialect", + ":SCFDialect", + ":SCFTransforms", ":SDBM", ":SPIRVDialect", ":SPIRVLowering", @@ -3245,8 +3245,8 @@ cc_library( ":LinalgOps", ":LinalgPassIncGen", ":LinalgStructuredOpsIncGen", - ":LoopOps", ":Pass", + ":SCFDialect", ":StandardOps", ":Support", ":TransformUtils", @@ -3367,8 +3367,8 @@ cc_library( ":IR", ":LLVMDialect", ":LLVMTransforms", - ":LoopOps", ":Pass", + ":SCFDialect", ":StandardOps", ":Support", ":Transforms", diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index c3dd157af83..a0312a54b68 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -163,8 +163,8 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:LoopOps", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", From d57f24db3af4dc99423f07735ee3ce719fa7abff Mon Sep 17 00:00:00 2001 From: Revan Sopher Date: Mon, 11 May 2020 13:07:10 -0700 Subject: [PATCH 0332/1533] Add tpu_strategy into TPU doctests. Switch the doctest --module flag from single string to list. Convert tpu_strategy sample code to doctest format -- this revealed major errors in the TPUStrategyV1 sample! 
PiperOrigin-RevId: 310976505 Change-Id: I402cf3ad49d62ebd2f36346b0c6beaa3b3187c08 --- tensorflow/python/distribute/tpu_strategy.py | 75 ++++++++++---------- tensorflow/tools/docs/BUILD | 4 +- tensorflow/tools/docs/tf_doctest.py | 13 ++-- 3 files changed, 46 insertions(+), 46 deletions(-) diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 6e51b84a1d1..b574c523ccd 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -96,35 +96,34 @@ def validate_run_function(fn): @tf_export("distribute.experimental.TPUStrategy", v1=[]) class TPUStrategy(distribute_lib.Strategy): - """TPU distribution strategy implementation.""" + """TPU distribution strategy implementation. + + To construct a TPUStrategy object, you need to run the + initialization code as below: + + >>> resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + >>> tf.config.experimental_connect_to_cluster(resolver) + >>> tf.tpu.experimental.initialize_tpu_system(resolver) + >>> strategy = tf.distribute.experimental.TPUStrategy(resolver) + + While using distribution strategies, the variables created within strategy's + scope will be replicated across all the replicas and can be kept in sync + using all-reduce algorithms. + + To run TF2 programs on TPUs, you can either use `.compile` and + `.fit` APIs in `tf.keras` with TPUStrategy, or write your own customized + training loop by calling `strategy.run` directly. Note that + TPUStrategy doesn't support pure eager execution, so please make sure the + function passed into `strategy.run` is a `tf.function` or + `strategy.run` is called inside a `tf.function` if eager + behavior is enabled. + """ def __init__(self, tpu_cluster_resolver=None, device_assignment=None): """Synchronous training in TPU donuts or Pods. - To construct a TPUStrategy object, you need to run the - initialization code as below: - - ```python - resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu) - tf.config.experimental_connect_to_cluster(resolver) - tf.tpu.experimental.initialize_tpu_system(resolver) - strategy = tf.distribute.experimental.TPUStrategy(resolver) - ``` - - While using distribution strategies, the variables created within strategy's - scope will be replicated across all the replicas and can be kept in sync - using all-reduce algorithms. - - To run TF2 programs on TPUs, you can either use `.compile` and - `.fit` APIs in `tf.keras` with TPUStrategy, or write your own customized - training loop by calling `strategy.run` directly. Note that - TPUStrategy doesn't support pure eager execution, so please make sure the - function passed into `strategy.run` is a `tf.function` or - `strategy.run` is called inside a `tf.function` if eager - behavior is enabled. - Args: tpu_cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver, which provides information about the TPU cluster. @@ -209,26 +208,26 @@ class TPUStrategyV1(distribute_lib.StrategyV1): Users can pass strategy specific options to `options` argument. 
An example to enable bucketizing dynamic shapes in `TPUStrategy.run` is: - ```python - resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') - tf.config.experimental_connect_to_cluster(resolver) - tf.tpu.experimental.initialize_tpu_system(resolver) - strategy = tf.distribute.experimental.TPUStrategy(tpu='') + >>> resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + >>> tf.config.experimental_connect_to_cluster(resolver) + >>> tf.tpu.experimental.initialize_tpu_system(resolver) + >>> strategy = tf.distribute.experimental.TPUStrategy(resolver) - options = tf.distribute.RunOptions() - options.experimental_bucketizing_dynamic_shape = True + >>> options = tf.distribute.RunOptions( + ... experimental_bucketizing_dynamic_shape=True) - iterator = iter(inputs) + >>> dataset = tf.data.Dataset.range( + ... strategy.num_replicas_in_sync, output_type=dtypes.float32).batch( + ... strategy.num_replicas_in_sync, drop_remainder=True) + >>> input_iterator = iter(strategy.experimental_distribute_dataset(dataset)) - @tf.function() - def step_fn(inputs): - output = tf.reduce_sum(inputs) - return output + >>> @tf.function() + ... def step_fn(inputs): + ... output = tf.reduce_sum(inputs) + ... return output - strategy.run(step_fn, args=(next(iterator),), - options=options) - ``` + >>> strategy.run(step_fn, args=(next(input_iterator),), options=options) Args: fn: The function to run. The output must be a `tf.nest` of `Tensor`s. diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD index 49083affb45..c0442a5986d 100644 --- a/tensorflow/tools/docs/BUILD +++ b/tensorflow/tools/docs/BUILD @@ -23,7 +23,7 @@ py_library( py_test( name = "tf_doctest", srcs = ["tf_doctest.py"], - args = ["--module_prefix_skip=tpu."], + args = ["--module_prefix_skip=tpu.,distribute.tpu_strategy"], python_version = "PY3", tags = [ "no_oss_py2", @@ -45,7 +45,7 @@ py_test( tpu_py_test( name = "tf_doctest_tpu", srcs = ["tf_doctest.py"], - args = ["--module=tpu."], + args = ["--module=tpu.,distribute.tpu_strategy"], disable_experimental = True, disable_v3 = True, main = "tf_doctest.py", diff --git a/tensorflow/tools/docs/tf_doctest.py b/tensorflow/tools/docs/tf_doctest.py index f694b83782e..fc81d33cfde 100644 --- a/tensorflow/tools/docs/tf_doctest.py +++ b/tensorflow/tools/docs/tf_doctest.py @@ -42,7 +42,7 @@ tf.keras.preprocessing = preprocessing FLAGS = flags.FLAGS -flags.DEFINE_string('module', None, 'A specific module to run doctest on.') +flags.DEFINE_list('module', [], 'A list of specific module to run doctest on.') flags.DEFINE_list('module_prefix_skip', [], 'A list of modules to ignore when resolving modules.') flags.DEFINE_boolean('list', None, @@ -71,23 +71,24 @@ def find_modules(): return tf_modules -def filter_on_submodules(all_modules, submodule): - """Filters all the modules based on the module flag. +def filter_on_submodules(all_modules, submodules): + """Filters all the modules based on the modules flag. The module flag has to be relative to the core package imported. - For example, if `submodule=keras.layers` then, this function will return + For example, if `module=keras.layers` then, this function will return all the modules in the submodule. Args: all_modules: All the modules in the core package. - submodule: Submodule to filter from all the modules. + submodules: Submodules to filter from all the modules. Returns: All the modules in the submodule. 
""" filtered_modules = [ - mod for mod in all_modules if PACKAGE + submodule in mod.__name__ + mod for mod in all_modules + if any(PACKAGE + submodule in mod.__name__ for submodule in submodules) ] return filtered_modules From a1c0d548f663f4e96c36a6fb0981d6e913be728a Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Mon, 11 May 2020 13:44:49 -0700 Subject: [PATCH 0333/1533] Lowering InplaceUpdate to DynamicUpdateSlice PiperOrigin-RevId: 310984087 Change-Id: I1409e7bdae43417ff65f47e306e767a05e4b7166 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 25 +++++++ .../compiler/mlir/xla/tests/legalize-tf.mlir | 35 +++++++++ .../mlir/xla/transforms/legalize_tf.cc | 75 +++++++++++++++++-- 3 files changed, 129 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 07ee70dafa3..2092a75a519 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -3601,6 +3601,31 @@ tf.imag(input) ==> [4.75, 5.75] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } +def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { + let summary = [{ + Create a copy of `x` with the updated specified rows 'i' with values 'v'. + + }]; + + let description = [{ + Creates a copy of tensor 'x' and updates the columns specified in tensor 'i' + with the values 'v'. Originally this function was mutative however for + compilation we make this operation create / operate on a copy. + }]; + + let arguments = (ins + TF_Tensor:$x, + I32Tensor:$i, + TF_Tensor:$v + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_InvOp : TF_Op<"Inv", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the reciprocal of x element-wise."; diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 61bf5de18cb..7e6bb9c273d 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -4074,6 +4074,41 @@ func @xla_sharding(%arg0: tensor<4x16xf32>) -> tensor<4x16xf32> { return %0 : tensor<4x16xf32> } +// CHECK-LABEL: inplace_update_one +func @inplace_update_one(%arg0: tensor<8x4xf32>, %arg1: tensor<1x4xf32>, %arg2: tensor<1xi32>) -> tensor<8x4xf32> { + // CHECK-DAG: [[CST:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[SLICE1:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE2:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[1, 4]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} + // CHECK-DAG: [[RESHAPE1:%.+]] = "xla_hlo.reshape"([[SLICE1]]) + // CHECK-DAG: [[UPDATE:%.+]] = "xla_hlo.dynamic-update-slice"(%arg0, [[SLICE2]], [[RESHAPE1]], [[CST]]) + %0 = "tf.InplaceUpdate"(%arg0, %arg2, %arg1) : (tensor<8x4xf32>, tensor<1xi32>, tensor<1x4xf32>) -> tensor<8x4xf32> + + // CHECK: return [[UPDATE]] + return %0 : tensor<8x4xf32> +} + +// CHECK-LABEL: inplace_update_three +func @inplace_update_three(%arg0: tensor<8x8x4xf32>, %arg1: tensor<3x8x4xf32>, %arg2: tensor<3xi32>) -> tensor<8x8x4xf32> { + // CHECK-DAG: [[CST:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[SLICE1:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = 
dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE2:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<2> : tensor<1xi64>, start_indices = dense<1> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE3:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<3> : tensor<1xi64>, start_indices = dense<2> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SLICE4:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[1, 8, 4]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[SLICE5:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[2, 8, 4]> : tensor<3xi64>, start_indices = dense<[1, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[SLICE6:%.+]] = "xla_hlo.slice"(%arg1) {limit_indices = dense<[3, 8, 4]> : tensor<3xi64>, start_indices = dense<[2, 0, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} + // CHECK-DAG: [[RESHAPE1:%.+]] = "xla_hlo.reshape"([[SLICE1]]) + // CHECK-DAG: [[RESHAPE2:%.+]] = "xla_hlo.reshape"([[SLICE2]]) + // CHECK-DAG: [[RESHAPE3:%.+]] = "xla_hlo.reshape"([[SLICE3]]) + // CHECK-DAG: [[UPDATE1:%.+]] = "xla_hlo.dynamic-update-slice"(%arg0, [[SLICE4]], [[RESHAPE1]], [[CST]], [[CST]]) + // CHECK-DAG: [[UPDATE2:%.+]] = "xla_hlo.dynamic-update-slice"([[UPDATE1]], [[SLICE5]], [[RESHAPE2]], [[CST]], [[CST]]) + // CHECK-DAG: [[UPDATE3:%.+]] = "xla_hlo.dynamic-update-slice"([[UPDATE2]], [[SLICE6]], [[RESHAPE3]], [[CST]], [[CST]]) + %0 = "tf.InplaceUpdate"(%arg0, %arg2, %arg1) : (tensor<8x8x4xf32>, tensor<3xi32>, tensor<3x8x4xf32>) -> tensor<8x8x4xf32> + + // CHECK: return [[UPDATE3]] : tensor<8x8x4xf32> + return %0 : tensor<8x8x4xf32> +} + + // CHECK-LABEL: xla_dynamic_update_slice func @xla_dynamic_update_slice(%arg0: tensor<4x16xf32>, %arg1: tensor<2x4xf32>, %arg2: tensor<2xi32>) -> tensor<4x16xf32> { // CHECK: [[SLICE0:%.+]] = "xla_hlo.slice"(%arg2) {limit_indices = dense<1> : tensor<1xi64>, start_indices = dense<0> : tensor<1xi64>, strides = dense<1> : tensor<1xi64>} : (tensor<2xi32>) -> tensor<1xi32> diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index de808bc8ad2..9d68d3a164b 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -4182,6 +4182,68 @@ class ConvertXlaShardingOp : public OpRewritePattern { } }; +// Converts a TF InplaceUpdate op to DynamicUpdateSlice HLO. +class ConvertInplaceUpdateOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TF::InplaceUpdateOp op, + PatternRewriter &rewriter) const override { + auto input = op.x(); + auto indices = op.i(); + auto updates = op.v(); + + // Slice each row of `i` and `v` to perform a separate dynamic-update-slice + // on the contents of `x`. 
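+    // For example, following the `inplace_update_three` test above: `i` is
+    // unpacked into three scalar indices, `v` is split into three
+    // tensor<1x8x4xf32> slices, and three chained dynamic-update-slice ops
+    // write those slices into `x` at the unpacked indices.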
+ auto input_type = input.getType().cast(); + auto updates_type = updates.getType().cast(); + auto indices_type = indices.getType().cast(); + if (!indices_type.hasStaticShape()) return failure(); + + if (indices_type.getRank() != 1) return failure(); + + SmallVector unpacked_indices_type( + indices_type.getDimSize(0), + RankedTensorType::get({}, indices_type.getElementType())); + auto zero_attr = IntegerAttr::get(rewriter.getIntegerType(64), 0); + auto unpacked_indices = rewriter.create( + op.getLoc(), unpacked_indices_type, indices, zero_attr); + + SmallVector split_updates_shape; + split_updates_shape.append(updates_type.getShape().begin(), + updates_type.getShape().end()); + split_updates_shape.front() = 1; + SmallVector split_updates_type; + split_updates_type.resize( + updates_type.getShape().front(), + RankedTensorType::get(split_updates_shape, + updates_type.getElementType())); + + auto cst = + rewriter.create(op.getLoc(), zero_attr).getResult(); + auto split_updates = rewriter.create( + op.getLoc(), split_updates_type, cst, updates); + + SmallVector input_indices; + input_indices.resize(input_type.getRank(), cst); + + SmallVector starts(updates_type.getRank(), 0); + SmallVector strides(updates_type.getRank(), 1); + SmallVector limits(updates_type.getShape().begin(), + updates_type.getShape().end()); + + for (auto pair : + llvm::zip(unpacked_indices.output(), split_updates.output())) { + input_indices.front() = std::get<0>(pair); + input = rewriter.create( + op.getLoc(), op.getType(), input, std::get<1>(pair), input_indices); + } + + rewriter.replaceOp(op, input); + return success(); + } +}; + // Converts a TF XlaDynamicUpdateSlice op to DynamicUpdateSlice HLO. class ConvertXlaDynamicUpdateSliceOp : public OpRewritePattern { @@ -4863,12 +4925,13 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { ConvertConv3DBackpropInputOp, ConvertCumsumOp, ConvertDiagPartOp, ConvertEinsumOp, ConvertFusedBatchNormGradOp, ConvertFusedBatchNormGradV2Op, ConvertFusedBatchNormGradV3Op, - ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, ConvertLinSpaceOp, - ConvertMaxOp, ConvertMinOp, ConvertAvgPoolOp, ConvertMaxPool2DOp, - ConvertMaxPool3DOp, ConvertMaxPool2DGradOp, ConvertMaxPool3DGradOp, - ConvertMeanOp, ConvertOneHotOp, ConvertOutfeedEnqueueTupleOp, - ConvertProdOp, ConvertQrOp, ConvertRangeOp, ConvertSelectV2Op, - ConvertSigmoidOp, ConvertSizeOp, ConvertSoftmaxOp, + ConvertFusedBatchNormV3Op, ConvertInfeedDequeueTupleOp, + ConvertInplaceUpdateOp, ConvertLinSpaceOp, ConvertMaxOp, ConvertMinOp, + ConvertAvgPoolOp, ConvertMaxPool2DOp, ConvertMaxPool3DOp, + ConvertMaxPool2DGradOp, ConvertMaxPool3DGradOp, ConvertMeanOp, + ConvertOneHotOp, ConvertOutfeedEnqueueTupleOp, ConvertProdOp, ConvertQrOp, + ConvertRangeOp, ConvertSelectV2Op, ConvertSigmoidOp, ConvertSizeOp, + ConvertSoftmaxOp, ConvertSoftmaxOp, ConvertSplitOp, ConvertSplitVOp, ConvertStridedSliceOp, ConvertStridedSliceGradOp, ConvertSumOp, ConvertTensorScatterUpdateOp, ConvertTileOp, ConvertTopKV2Op, From 0a980f296919766407af45b95c9e8aa290f72569 Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Tue, 5 May 2020 10:54:54 +0000 Subject: [PATCH 0334/1533] ROCm 3.5 (hip-clang) build fixes --- .../service/gpu/llvm_gpu_backend/gpu_backend_lib.cc | 2 +- .../stream_executor/rocm/rocm_gpu_executor.cc | 4 ++++ .../clang/bin/crosstool_wrapper_driver_rocm.tpl | 4 +++- third_party/gpus/cuda_configure.bzl | 13 +++++++++---- third_party/gpus/rocm_configure.bzl | 2 ++ 5 files changed, 19 insertions(+), 6 deletions(-) diff --git 
a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 060a0375271..497dcda4361 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -689,7 +689,7 @@ std::unique_ptr AMDGPUGetTargetMachine( llvm::Triple target_triple, int amdgpu_version, const HloModuleConfig& hlo_module_config) { return GetTargetMachine(target_triple, absl::StrCat("gfx", amdgpu_version), - hlo_module_config, "-code-object-v3"); + hlo_module_config, "+code-object-v3"); } void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) { diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc index e22a243a70b..216602a7597 100644 --- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc +++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc @@ -132,6 +132,10 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) { VLOG(3) << "Unloading HSACO module " << module; GpuDriver::UnloadModule(context_, module); gpu_binary_to_module_.erase(module_it); + const char* mem_it = nullptr; + for (auto x : in_memory_modules_) + if (x.second == module) mem_it = x.first; + if (mem_it != nullptr) in_memory_modules_.erase(mem_it); } return true; } diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl index f5ac7b39dfd..89275128a9c 100755 --- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl +++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl @@ -179,7 +179,7 @@ def InvokeHipcc(argv, log=False): # Also we need to retain warning about uninitialised shared variable as # warning only, even when -Werror option is specified. 
if HIPCC_IS_HIPCLANG: - hipccopts += ' --include=hip/hip_runtime.h -Wno-error=cuda-shared-init ' + hipccopts += ' --include=hip/hip_runtime.h ' hipccopts += ' ' + hipcc_compiler_options # Use -fno-gpu-rdc by default for early GPU kernel finalization # This flag would trigger GPU kernels be generated at compile time, instead @@ -258,6 +258,8 @@ def main(): gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH) gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH) gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY) + if HIPCC_IS_HIPCLANG: + gpu_linker_flags.append("-lrt") if VERBOSE: print(' '.join([CPU_COMPILER] + gpu_linker_flags)) return subprocess.call([CPU_COMPILER] + gpu_linker_flags) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 545aeebe97a..ce924fe4cd2 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -808,23 +808,28 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs): cmd = \"""%s \""", )""" % (name, "\n".join(outs), " && \\\n".join(cmds)) -def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): +def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir, exceptions=None): """Returns a rule to recursively copy a directory.""" src_dir = _norm_path(src_dir) out_dir = _norm_path(out_dir) outs = read_dir(repository_ctx, src_dir) + post_cmd='' + if exceptions!=None: + outs = [x for x in outs if not any([x.startswith(y) for y in exceptions])] outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] - # '@D' already contains the relative path for a single file, see # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" + if exceptions!=None: + for x in exceptions: + post_cmd+=" ; rm -fR " + x.replace(src_dir, out_dir) return """genrule( name = "%s", outs = [ %s ], - cmd = \"""cp -rLf "%s/." "%s/" \""", -)""" % (name, "\n".join(outs), src_dir, out_dir) + cmd = \"""cp -rLf "%s/." "%s/" %s\""", +)""" % (name, "\n".join(outs), src_dir, out_dir, post_cmd) def _flag_enabled(repository_ctx, flag_name): return get_host_environ(repository_ctx, flag_name) == "1" diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 3c345e6724b..3f518fb05f1 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -615,6 +615,8 @@ def _create_local_rocm_repository(repository_ctx): name = "rocm-include", src_dir = rocm_toolkit_path + "/include", out_dir = "rocm/include", + exceptions = [rocm_toolkit_path + "/include/gtest", + rocm_toolkit_path + "/include/gmock"], ), make_copy_dir_rule( repository_ctx, From 37c858077b3a1f64b7b91a7ec16df52fd53fc129 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Mon, 11 May 2020 13:48:18 -0700 Subject: [PATCH 0335/1533] Bump open source llvm revision to 728cf6d86b4f20144ac10517afb0cb978beac124 PiperOrigin-RevId: 310984827 Change-Id: I2732d50314636eb6ff8b8c85b6268f9a41d4bfcd --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index abfd35949ec..36dc0c2b101 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "f40fc7b8d60e6897fd9514a46a26f4b84f57577a" - LLVM_SHA256 = "8f5ea1c26fc922b1b0752f6bd63b31d4137e04630828cb3a4f94a9bbdbcf575d" + LLVM_COMMIT = "728cf6d86b4f20144ac10517afb0cb978beac124" + LLVM_SHA256 = "41a24cf437be40c8a790b1095e6bfc3a9d531a44275abecddf2eda1835658bcc" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 0e90ebc8429aedba3d654527c20987074c1c43f3 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Mon, 11 May 2020 14:10:22 -0700 Subject: [PATCH 0336/1533] Fix build failure in windows PiperOrigin-RevId: 310989703 Change-Id: Ib0b9511975d20a1905eab22f3bac4ec2985fccfc --- tensorflow/core/profiler/convert/op_stats_to_overview_page.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc index 69f076f0296..ec3a6dc1525 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -265,7 +265,9 @@ std::string TfFunctionRecommendationHtml(const TfFunctionDb& tf_function_db) { // Sorts candidates in descending order of expensive_call_percent. absl::c_sort(candidates, cmp); std::string expensive_functions = ""; - auto num_functions_shown = std::min(3ul, candidates.size()); + auto num_functions_shown = std::min( + static_cast(3), candidates.size()); + for (auto i = 0; i < num_functions_shown; i++) { if (i > 0) absl::StrAppend(&expensive_functions, ", "); absl::StrAppend(&expensive_functions, "\"", candidates[i].function_name, From b88fb5a878924657e09f6d3af7446e4eeb732071 Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Mon, 11 May 2020 14:11:48 -0700 Subject: [PATCH 0337/1533] ReverseV2 lowering to xla_hlo.reverse PiperOrigin-RevId: 310990027 Change-Id: I2ce035aa9a2b2f8d4b02f916858d8b26b9d3d7b1 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 17 ++++++++- .../compiler/mlir/xla/tests/legalize-tf.mlir | 38 +++++++++++++++++++ .../mlir/xla/transforms/legalize_tf.cc | 15 ++++++++ .../xla/transforms/legalize_tf_patterns.td | 10 +++++ 4 files changed, 78 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 874f4a1587e..68eafb8b33e 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -1170,9 +1170,22 @@ OpFoldResult CopyOp::fold(ArrayRef operands) { return getOperand(); } //===----------------------------------------------------------------------===// OpFoldResult ReverseOp::fold(ArrayRef operands) { + auto input = operand(); + // No dimensions to reverse. 
- if (dimensions().getNumElements() == 0) return operand(); - return nullptr; + if (dimensions().getNumElements() == 0) return input; + + llvm::SmallVector new_dims; + new_dims.reserve(dimensions().getNumElements()); + + auto shaped_type = input.getType().cast(); + for (auto dim : dimensions().getValues()) { + if (shaped_type.getDimSize(dim.getLimitedValue()) != 1) { + return nullptr; + } + } + + return input; } //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 7e6bb9c273d..a5353beb772 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1596,6 +1596,44 @@ func @unhandled_partitioned_call_2(%arg0: tensor, %arg1: tensor<*xi32>) -> return %0, %1 : tensor, tensor } + +//===----------------------------------------------------------------------===// +// ReverseV2 op legalization. +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @reverse_func_32 +func @reverse_func_32(%arg0: tensor<5xi32>) -> tensor<5xi32> { + %axis = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5xi32>, tensor<1xi32>) -> tensor<5xi32> + + // CHECK: return [[VAL]] : tensor<5xi32> + return %reversed : tensor<5xi32> +} + +// CHECK-LABEL: @reverse_func_64 +func @reverse_func_64(%arg0: tensor<5xi32>) -> tensor<5xi32> { + %axis = "tf.Const"() {value = dense<0> : tensor<1xi64>} : () -> (tensor<1xi64>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<0> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5xi32>, tensor<1xi64>) -> tensor<5xi32> + + // CHECK: return [[VAL]] : tensor<5xi32> + return %reversed : tensor<5xi32> +} + +// CHECK-LABEL: @reverse_func_neg +func @reverse_func_neg(%arg0: tensor<5x5xi32>) -> tensor<5x5xi32> { + %axis = "tf.Const"() {value = dense<[-1]> : tensor<1xi32>} : () -> (tensor<1xi32>) + + // CHECK: [[VAL:%.+]] = "xla_hlo.reverse"(%arg0) {dimensions = dense<1> : tensor<1xi64>} + %reversed = "tf.ReverseV2"(%arg0, %axis) : (tensor<5x5xi32>, tensor<1xi32>) -> tensor<5x5xi32> + + // CHECK: return [[VAL]] : tensor<5x5xi32> + return %reversed : tensor<5x5xi32> +} + //===----------------------------------------------------------------------===// // StatefulPartitionedCall op legalization. 
//===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 9d68d3a164b..a0a5e47ad65 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -2590,6 +2590,21 @@ class ConvertRangeOp : public OpRewritePattern { } }; +ElementsAttr ConvertAxisAttr(Value val, ElementsAttr attr, Builder *builder) { + auto int_attr = attr.cast(); + auto type = val.getType().cast(); + + SmallVector axis; + axis.reserve(int_attr.getNumElements()); + + int64_t rank = type.getRank(); + for (auto val : int_attr.getValues()) { + axis.push_back((val.getSExtValue() + rank) % rank); + } + + return builder->getI64TensorAttr(axis); +} + /// Converts the LinSpace tensorflow op to a xla_hlo.iota op with a scaling /// and offset applied to generate the linspace values. The output tensor needs /// to have a static shape. The implementation is defined in C++ because there diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 036bbf65bc6..2a27c1f2966 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -520,6 +520,16 @@ foreach callOp = [TF_PartitionedCallOp, TF_StatefulPartitionedCallOp] in { [(ArgTypesMatchCallee $op, $args, $f)]>; } +//===----------------------------------------------------------------------===// +// Reverse op patterns. +//===----------------------------------------------------------------------===// + +// Handles axis conversion for TF reverse. +def ConvertAxisAttr : NativeCodeCall<"ConvertAxisAttr($0, $1, &$_builder)">; + +def : Pat<(TF_ReverseV2Op AnyRankedTensor:$values, (TF_ConstOp $axis)), + (HLO_ReverseOp $values, (ConvertAxisAttr $values, $axis))>; + //===----------------------------------------------------------------------===// // Ternary op patterns. //===----------------------------------------------------------------------===// From 250bb66c85786d1a2b4410e8886ba207f013a0d4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 14:19:00 -0700 Subject: [PATCH 0338/1533] Adjust the tool data for OSS TF memory profiler. PiperOrigin-RevId: 310991749 Change-Id: I7f9a6e1493b091c53459038d9310e9733b12b87d --- tensorflow/core/profiler/convert/BUILD | 1 - .../convert/xplane_to_profile_response.cc | 30 ++++++++++++------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index e033dbb1ba6..d5c347678a3 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -262,7 +262,6 @@ cc_library( ":xplane_to_memory_profile", ":xplane_to_op_stats", ":xplane_to_trace_events", - "//tensorflow/core:human_readable_json", "//tensorflow/core:lib", "//tensorflow/core/profiler:profiler_service_proto_cc", "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc", diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc index 74dd3435a4b..1643bbf8098 100644 --- a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc +++ b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc @@ -16,7 +16,6 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/human_readable_json.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" #include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h" @@ -65,20 +64,26 @@ void AddToolData(absl::string_view tool_name, const Proto& tool_output, } template -Status AddJsonToolData(absl::string_view tool_name, const Proto& tool_output, - ProfileResponse* response) { - std::string json_output; - TF_RETURN_IF_ERROR(ProtoToHumanReadableJson(tool_output, &json_output, - /*ignore_accuracy_loss=*/true)); - auto* tool_data = response->add_tool_data(); - tool_data->set_name(string(tool_name)); - tool_data->mutable_data()->append(json_output.data(), json_output.size()); +Status ConvertProtoToJson(const Proto& proto_output, std::string* json_output) { + protobuf::util::JsonPrintOptions json_options; + json_options.always_print_primitive_fields = true; + auto status = protobuf::util::MessageToJsonString(proto_output, json_output, + json_options); + if (!status.ok()) { + // Convert error_msg google::protobuf::StringPiece (or absl::string_view) to + // tensorflow::StringPiece. + auto error_msg = status.message(); + return errors::Internal( + strings::StrCat("Could not convert proto to JSON string: ", + StringPiece(error_msg.data(), error_msg.length()))); + } return Status::OK(); } // Returns the tool name with extension. string ToolName(absl::string_view tool) { if (tool == kTraceViewer) return "trace.json.gz"; + if (tool == kMemoryProfile) return "memory_profile.json.gz"; return absl::StrCat(tool, ".pb"); } @@ -130,8 +135,11 @@ Status ConvertXSpaceToProfileResponse(const XSpace& xspace, if (tools.contains(kMemoryProfile)) { if (const XPlane* host_plane = FindPlaneWithName(xspace, kHostThreads)) { MemoryProfile memory_profile = ConvertXPlaneToMemoryProfile(*host_plane); - TF_RETURN_IF_ERROR( - AddJsonToolData(ToolName(kMemoryProfile), memory_profile, response)); + std::string json_output; + TF_RETURN_IF_ERROR(ConvertProtoToJson(memory_profile, &json_output)); + TF_RETURN_IF_ERROR(SaveGzippedToolDataToTensorboardProfile( + req.repository_root(), req.session_id(), req.host_name(), + ToolName(kMemoryProfile), json_output)); } } return Status::OK(); From 3b46518b3acafd8233f81bf4a46bbfc7e00ea3f7 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Mon, 11 May 2020 14:35:06 -0700 Subject: [PATCH 0339/1533] [tf.data service] Fix max_outstanding_requests. This fixes a bug where max_outstanding_requests wasn't being propagated to the DataServiceDataset::Iterator. There are also a bunch of minor improvements in logging. 
PiperOrigin-RevId: 310995646 Change-Id: Iff442327ddc38f7f307b0ba989b024933e88f1a0 --- tensorflow/core/data/service/master_impl.cc | 8 ++++- tensorflow/core/data/service/master_impl.h | 2 +- tensorflow/core/data/service/worker_impl.cc | 2 ++ .../core/kernels/data/experimental/BUILD | 1 + .../experimental/data_service_dataset_op.cc | 22 ++++++++----- .../data/experimental/ops/data_service_ops.py | 33 ++++++++++++++----- .../kernel_tests/data_service_ops_test.py | 15 +++++++++ 7 files changed, 65 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/data/service/master_impl.cc b/tensorflow/core/data/service/master_impl.cc index 6e2c95c475e..336ab068c40 100644 --- a/tensorflow/core/data/service/master_impl.cc +++ b/tensorflow/core/data/service/master_impl.cc @@ -169,7 +169,11 @@ Status DataServiceMasterImpl::GetOrCreateJob( if (job != nullptr) { TF_RETURN_IF_ERROR(ValidateMatchingJob(**job, requested_processing_mode, request->dataset_id())); - response->set_job_id((*job)->job_id()); + int64 job_id = (*job)->job_id(); + response->set_job_id(job_id); + VLOG(3) << "Found existing job for name=" << request->job_name() + << ", index=" << request->job_name_index() + << ". job_id: " << job_id; return Status::OK(); } int64 job_id; @@ -177,6 +181,8 @@ Status DataServiceMasterImpl::GetOrCreateJob( request->job_name(), &job_id)); named_jobs_[key] = jobs_[job_id]; response->set_job_id(job_id); + VLOG(3) << "Created job " << job_id << " for dataset " + << request->dataset_id() << " and name " << request->job_name(); return Status::OK(); } diff --git a/tensorflow/core/data/service/master_impl.h b/tensorflow/core/data/service/master_impl.h index de25ea0d6a8..e8b70e84d0f 100644 --- a/tensorflow/core/data/service/master_impl.h +++ b/tensorflow/core/data/service/master_impl.h @@ -75,7 +75,7 @@ class DataServiceMasterImpl { } std::string DebugString() { - return absl::StrCat("id: ", worker_id_, "address: ", address_); + return absl::StrCat("id: ", worker_id_, " address: ", address_); } private: diff --git a/tensorflow/core/data/service/worker_impl.cc b/tensorflow/core/data/service/worker_impl.cc index 7395244a569..8d00825227b 100644 --- a/tensorflow/core/data/service/worker_impl.cc +++ b/tensorflow/core/data/service/worker_impl.cc @@ -84,6 +84,7 @@ Status DataServiceWorkerImpl::ProcessTask(const ProcessTaskRequest* request, Status DataServiceWorkerImpl::ProcessTaskInternal(const TaskDef& task_def) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + VLOG(3) << "Received request to process task " << task_def.task_id(); standalone::Dataset::Params params; std::unique_ptr dataset; TF_RETURN_IF_ERROR(standalone::Dataset::FromGraph( @@ -100,6 +101,7 @@ Status DataServiceWorkerImpl::ProcessTaskInternal(const TaskDef& task_def) task.id = task_def.task_id(); task.dataset = std::move(dataset); task.iterator = std::move(iterator); + VLOG(3) << "Began processing for task " << task_def.task_id(); return Status::OK(); } diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index d61c574cb35..4ddfd99951c 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -138,6 +138,7 @@ tf_kernel_library( "//tensorflow/core/kernels/data:dataset_utils", "//tensorflow/core/kernels/data:name_utils", "//tensorflow/core/kernels/data:serialization_utils", + "//tensorflow/core/profiler/lib:traceme", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", diff 
--git a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc index 8c336686deb..697f4d99a1e 100644 --- a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/snappy.h" +#include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/protobuf/error_codes.pb.h" namespace tensorflow { @@ -178,7 +179,10 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { class Iterator : public DatasetIterator { public: explicit Iterator(const Params& params, int64 iterator_index) - : DatasetIterator(params), iterator_index_(iterator_index) {} + : DatasetIterator(params), + iterator_index_(iterator_index), + max_outstanding_requests_(params.dataset->max_outstanding_requests_) { + } ~Iterator() override { mutex_lock l(mu_); @@ -390,21 +394,23 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { // TODO(aaudibert): add backoff and max retries. int64 deadline_micros = Env::Default()->NowMicros() + kRetryTimeoutMicros; - Status s = FetchElement(task_thread, deadline_micros); + Status s = GetElement(task_thread, deadline_micros); if (!s.ok()) { - LOG(WARNING) << "Failed to fetch element from worker at " + LOG(WARNING) << "Failed to get element from worker at " << task_thread->address << ": " << s; } } } - // Fetches an element from a task and adds the element to `results_`. + // Gets an element from a task and adds the element to `results_`. // // If the task reaches end_of_sequence or is cancelled (e.g. due to a - // worker dying), FetchElement returns Status::OK() without adding to + // worker dying), GetElement returns Status::OK() without adding to // `results_`. 
- Status FetchElement(TaskThread* task_thread, int64 deadline_micros) { - VLOG(3) << "Fetching an element for task id " << task_thread->task_id; + Status GetElement(TaskThread* task_thread, int64 deadline_micros) { + VLOG(3) << "Getting an element for task id " << task_thread->task_id; + tensorflow::profiler::TraceMe activity( + "GetElement", tensorflow::profiler::TraceMeLevel::kInfo); CompressedElement compressed; bool end_of_sequence; for (int num_retries = 0;; ++num_retries) { @@ -453,7 +459,7 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { } results_.push(std::move(element)); cv_.notify_all(); - VLOG(3) << "Fetched an element for task id " << task_thread->task_id; + VLOG(3) << "Got an element for task id " << task_thread->task_id; return Status::OK(); } diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py index c1c23668db0..67dfadb4841 100644 --- a/tensorflow/python/data/experimental/ops/data_service_ops.py +++ b/tensorflow/python/data/experimental/ops/data_service_ops.py @@ -84,15 +84,29 @@ class _DataServiceDatasetV2(dataset_ops.DatasetSource): if task_refresh_interval_hint_ms is None: task_refresh_interval_hint_ms = dataset_ops.AUTOTUNE + self._dataset_id = ops.convert_to_tensor( + dataset_id, dtype=dtypes.int64, name="dataset_id") + self._processing_mode = ops.convert_to_tensor( + processing_mode, dtype=dtypes.string, name="processing_mode") + self._address = ops.convert_to_tensor( + address, dtype=dtypes.string, name="address") + self._protocol = ops.convert_to_tensor( + protocol, dtype=dtypes.string, name="protocol") + self._job_name = ops.convert_to_tensor( + job_name, dtype=dtypes.string, name="job_name") + self._max_outstanding_requests = ops.convert_to_tensor( + max_outstanding_requests, + dtype=dtypes.int64, + name="max_outstanding_requests") self._element_spec = input_dataset.element_spec variant_tensor = gen_experimental_dataset_ops.data_service_dataset( - dataset_id=dataset_id, - processing_mode=processing_mode, - address=address, - protocol=protocol, - job_name=job_name, - max_outstanding_requests=max_outstanding_requests, + dataset_id=self._dataset_id, + processing_mode=self._processing_mode, + address=self._address, + protocol=self._protocol, + job_name=self._job_name, + max_outstanding_requests=self._max_outstanding_requests, task_refresh_interval_hint_ms=task_refresh_interval_hint_ms, iteration_counter=gen_experimental_dataset_ops.dummy_iteration_counter( ), @@ -297,5 +311,8 @@ def distribute(processing_mode, Returns: Dataset: A `Dataset` of the elements produced by the data service. 
""" - return _distribute(processing_mode, service, job_name, - max_outstanding_requests) + return _distribute( + processing_mode=processing_mode, + service=service, + job_name=job_name, + max_outstanding_requests=max_outstanding_requests) diff --git a/tensorflow/python/data/kernel_tests/data_service_ops_test.py b/tensorflow/python/data/kernel_tests/data_service_ops_test.py index eac1c674b2d..217c586caef 100644 --- a/tensorflow/python/data/kernel_tests/data_service_ops_test.py +++ b/tensorflow/python/data/kernel_tests/data_service_ops_test.py @@ -215,6 +215,21 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): val = next(iterator).numpy() self.assertEqual(i, val) + @combinations.generate(test_base.eager_only_combinations()) + def testMaxOutstandingRequests(self): + num_elements = 10 + num_workers = 3 + service = self.create_cluster(num_workers) + ds = dataset_ops.Dataset.range(num_elements) + ds = ds.apply( + data_service_ops._distribute( + "parallel_epochs", + service, + max_outstanding_requests=1, + task_refresh_interval_hint_ms=20)) + self.assertCountEqual(num_workers * list(range(num_elements)), + self.getDatasetOutput(ds)) + @combinations.generate(test_base.eager_only_combinations()) def testInsideFunction(self): num_workers = 3 From d4a9a8a6498184f8bac13ff0b2564b4bbc91faa5 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 11 May 2020 14:42:18 -0700 Subject: [PATCH 0340/1533] IWYU in profiler/convert PiperOrigin-RevId: 310997293 Change-Id: I3ef1f017680f440b573e641a629787447c69038a --- tensorflow/core/profiler/convert/BUILD | 67 +++++++++++++++---- .../convert/op_metrics_db_combiner.cc | 1 + .../profiler/convert/op_metrics_to_record.cc | 2 + .../op_stats_to_input_pipeline_analysis.cc | 52 +++++++------- .../op_stats_to_input_pipeline_analysis.h | 11 ++- .../convert/op_stats_to_overview_page.cc | 48 +++++++------ .../convert/op_stats_to_overview_page.h | 10 ++- .../profiler/convert/op_stats_to_tf_stats.cc | 3 +- .../convert/op_stats_to_tf_stats_test.cc | 12 ++-- .../convert/step_events_to_steps_db.cc | 8 +++ .../convert/step_events_to_steps_db.h | 1 + .../profiler/convert/trace_events_to_json.cc | 5 ++ .../convert/xplane_to_kernel_stats_db.cc | 8 ++- .../convert/xplane_to_kernel_stats_db.h | 3 +- .../convert/xplane_to_memory_profile.cc | 11 ++- .../convert/xplane_to_memory_profile_test.cc | 4 ++ .../convert/xplane_to_op_metrics_db.cc | 10 +++ .../convert/xplane_to_op_metrics_db.h | 3 +- .../convert/xplane_to_op_metrics_db_test.cc | 3 + .../profiler/convert/xplane_to_op_stats.cc | 11 +++ .../convert/xplane_to_op_stats_test.cc | 6 +- .../convert/xplane_to_profile_response.cc | 4 ++ .../convert/xplane_to_profile_response.h | 2 - .../xplane_to_profile_response_test.cc | 3 +- .../profiler/convert/xplane_to_step_events.cc | 8 +++ .../profiler/convert/xplane_to_step_events.h | 2 +- .../convert/xplane_to_step_events_test.cc | 6 ++ .../convert/xplane_to_tf_functions.cc | 8 ++- .../profiler/convert/xplane_to_tf_functions.h | 3 +- .../convert/xplane_to_tf_functions_test.cc | 5 ++ .../convert/xplane_to_trace_events.cc | 13 ++++ .../profiler/convert/xplane_to_trace_events.h | 3 +- .../convert/xplane_to_trace_events_test.cc | 3 +- 33 files changed, 244 insertions(+), 95 deletions(-) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index d5c347678a3..369d26a92d9 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -17,15 +17,18 @@ cc_library( 
"//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:cost_utils", - "//tensorflow/core/profiler/utils:event_span", + "//tensorflow/core/profiler/utils:op_metrics_db_utils", "//tensorflow/core/profiler/utils:op_utils", "//tensorflow/core/profiler/utils:tf_op_utils", "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:timespan", "//tensorflow/core/profiler/utils:trace_utils", + "//tensorflow/core/profiler/utils:xplane_schema", + "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -35,9 +38,11 @@ tf_cc_test( srcs = ["xplane_to_op_metrics_db_test.cc"], deps = [ ":xplane_to_op_metrics_db", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:op_metrics_db_utils", "//tensorflow/core/profiler/utils:time_utils", "//tensorflow/core/profiler/utils:xplane_builder", @@ -86,12 +91,12 @@ cc_library( ":op_stats_to_input_pipeline_analysis", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core/platform:logging", "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc", "//tensorflow/core/profiler/protobuf:input_pipeline_proto_cc", "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:overview_page_proto_cc", + "//tensorflow/core/profiler/protobuf:steps_db_proto_cc", "//tensorflow/core/profiler/protobuf:tf_function_proto_cc", "//tensorflow/core/profiler/utils:errors", "//tensorflow/core/profiler/utils:html_utils", @@ -125,7 +130,6 @@ cc_library( "//tensorflow/core/profiler/utils:tf_op_utils", "//tensorflow/core/profiler/utils:time_utils", "//tensorflow/core/util:stats_calculator_portable", - "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -138,13 +142,12 @@ cc_library( hdrs = ["op_stats_to_tf_stats.h"], deps = [ ":op_metrics_to_record", + "//tensorflow/core:lib", "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:tf_stats_proto_cc", "//tensorflow/core/profiler/utils:op_metrics_db_utils", - "//tensorflow/core/profiler/utils:tf_op_utils", "//tensorflow/core/profiler/utils:time_utils", - "@com_google_absl//absl/container:flat_hash_set", ], ) @@ -155,13 +158,18 @@ tf_cc_test( deps = [ ":op_stats_to_tf_stats", ":xplane_to_op_stats", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", + "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", + "//tensorflow/core/profiler/protobuf:tf_stats_proto_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:op_metrics_db_utils", "//tensorflow/core/profiler/utils:time_utils", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", + "@com_google_absl//absl/strings", ], ) @@ -174,6 +182,9 @@ cc_library( "//tensorflow/core:lib_internal", 
"//tensorflow/core/profiler/protobuf:steps_db_proto_cc", "//tensorflow/core/profiler/utils:event_span", + "//tensorflow/core/profiler/utils:timespan", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -208,6 +219,7 @@ cc_library( srcs = ["xplane_to_op_stats.cc"], hdrs = ["xplane_to_op_stats.h"], deps = [ + ":op_metrics_db_combiner", ":step_events_to_steps_db", ":xplane_to_kernel_stats_db", ":xplane_to_op_metrics_db", @@ -216,15 +228,20 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc", "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc", + "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", + "//tensorflow/core/profiler/protobuf:steps_db_proto_cc", "//tensorflow/core/profiler/protobuf:tf_function_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:event_span", "//tensorflow/core/profiler/utils:hardware_type_utils", "//tensorflow/core/profiler/utils:kernel_stats_utils", + "//tensorflow/core/profiler/utils:tf_op_utils", "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_visitor", + "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -242,11 +259,15 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", + "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:steps_db_proto_cc", + "//tensorflow/core/profiler/protobuf:tf_function_proto_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:group_events", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", + "@com_google_absl//absl/strings", ], ) @@ -271,6 +292,7 @@ cc_library( "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:overview_page_proto_cc", "//tensorflow/core/profiler/protobuf:tf_stats_proto_cc", + "//tensorflow/core/profiler/protobuf:trace_events_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/rpc/client:save_profile", "//tensorflow/core/profiler/utils:xplane_schema", @@ -286,12 +308,14 @@ tf_cc_test( srcs = ["xplane_to_profile_response_test.cc"], deps = [ ":xplane_to_profile_response", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/profiler:profiler_service_proto_cc", "//tensorflow/core/profiler/protobuf:input_pipeline_proto_cc", "//tensorflow/core/profiler/protobuf:overview_page_proto_cc", "//tensorflow/core/profiler/protobuf:tf_stats_proto_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:group_events", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", @@ -305,14 +329,16 @@ cc_library( hdrs = ["xplane_to_step_events.h"], deps = [ "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/profiler/protobuf:steps_db_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:event_span", "//tensorflow/core/profiler/utils:tf_xplane_visitor", + "//tensorflow/core/profiler/utils:timespan", 
"//tensorflow/core/profiler/utils:trace_utils", "//tensorflow/core/profiler/utils:xplane_schema", + "//tensorflow/core/profiler/utils:xplane_visitor", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -322,12 +348,16 @@ tf_cc_test( srcs = ["xplane_to_step_events_test.cc"], deps = [ ":xplane_to_step_events", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "//tensorflow/core/profiler/utils:event_span", "//tensorflow/core/profiler/utils:group_events", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", + "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -341,7 +371,9 @@ cc_library( "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:xplane_schema", + "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -357,6 +389,8 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/profiler/protobuf:trace_events_proto_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", @@ -372,14 +406,14 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", - "//tensorflow/core/profiler/utils:event_span", "//tensorflow/core/profiler/utils:kernel_stats_utils", "//tensorflow/core/profiler/utils:tf_op_utils", "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:trace_utils", "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/core/profiler/utils:xplane_visitor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -396,11 +430,10 @@ cc_library( "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:timespan", "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) @@ -417,10 +450,13 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "//tensorflow/core/profiler/protobuf:tf_function_proto_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_visitor", + "@com_google_absl//absl/strings", ], ) @@ -432,15 +468,18 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/framework:protos_all_cc", "//tensorflow/core/platform:protobuf", "//tensorflow/core/profiler/protobuf:memory_profile_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", 
"//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_visitor", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", + "@com_google_absl//absl/types:optional", ], ) @@ -450,10 +489,14 @@ tf_cc_test( srcs = ["xplane_to_memory_profile_test.cc"], deps = [ ":xplane_to_memory_profile", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/profiler/protobuf:memory_profile_proto_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc b/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc index 3f601bb9134..8229d1020b9 100644 --- a/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc +++ b/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/op_metrics_to_record.cc b/tensorflow/core/profiler/convert/op_metrics_to_record.cc index b51c679776b..8e28199b827 100644 --- a/tensorflow/core/profiler/convert/op_metrics_to_record.cc +++ b/tensorflow/core/profiler/convert/op_metrics_to_record.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/core/profiler/convert/op_metrics_to_record.h" +#include #include +#include #include "absl/algorithm/container.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" diff --git a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc index 14b6cebc424..83673458d21 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc @@ -15,11 +15,12 @@ limitations under the License. #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" +#include + #include -#include +#include #include "google/protobuf/any.pb.h" -#include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" @@ -27,7 +28,6 @@ limitations under the License. 
#include "absl/strings/string_view.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/op_metrics_to_record.h" #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h" @@ -104,7 +104,7 @@ StepSummary GetStepSummaryForSampleStats(const Stat& sample_stats) { avg = sdv = min = max = 0.0; } else { avg = sample_stats.avg(); - sdv = std::sqrt(sample_stats.sample_variance()); + sdv = sqrt(sample_stats.sample_variance()); min = sample_stats.min(); max = sample_stats.max(); } @@ -244,7 +244,7 @@ enum class InputOpCategory { kPreprocessing // data preprocessing. }; -string InputOpCategoryString(InputOpCategory category) { +std::string InputOpCategoryString(InputOpCategory category) { switch (category) { case InputOpCategory::kEnqueue: return "Enqueue"; @@ -359,9 +359,9 @@ double RatioOfHostToDeviceTimeToStepTime( } void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent, - string* kernel_launch_classification, - string* kernel_launch_statement) { - string percent_str = absl::StrFormat("%.1lf", kernel_launch_percent); + std::string* kernel_launch_classification, + std::string* kernel_launch_statement) { + std::string percent_str = absl::StrFormat("%.1lf", kernel_launch_percent); if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) { *kernel_launch_classification = "high"; *kernel_launch_statement = absl::StrCat( @@ -386,14 +386,14 @@ void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent, } void AllOtherAnalysis(bool all_other_reported, double all_other_percent, - string* all_other_classification, - string* all_other_statement) { + std::string* all_other_classification, + std::string* all_other_statement) { if (all_other_reported) { *all_other_classification = "no"; *all_other_statement = ""; return; } - string percent_str = absl::StrFormat("%.1lf", all_other_percent); + std::string percent_str = absl::StrFormat("%.1lf", all_other_percent); if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) { *all_other_classification = "high"; *all_other_statement = @@ -585,9 +585,10 @@ InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis( } bool InputAnalysis(double input_percent, double all_other_percent, - string* input_classification, string* input_statement) { + std::string* input_classification, + std::string* input_statement) { absl::string_view non_input_time = "other time"; - string infeed_percent_str = absl::StrFormat("%.1lf", input_percent); + std::string infeed_percent_str = absl::StrFormat("%.1lf", input_percent); if (input_percent >= kHighlyInfeedBoundThresholdInPercent) { *input_classification = "host"; *input_statement = absl::StrCat( @@ -607,7 +608,8 @@ bool InputAnalysis(double input_percent, double all_other_percent, // Input analysis says it is not input-bound, but "All-Other" time // is significant. It could still be input-bound (or Python overhead). 
*input_classification = "both"; - string all_other_percent_str = absl::StrFormat("%.1lf", all_other_percent); + std::string all_other_percent_str = + absl::StrFormat("%.1lf", all_other_percent); *input_statement = absl::StrCat( "Your program is POTENTIALLY input-bound because ", all_other_percent_str, @@ -627,8 +629,8 @@ bool InputAnalysis(double input_percent, double all_other_percent, } } -void OutputAnalysis(double output_percent, string* output_classification, - string* output_statement) { +void OutputAnalysis(double output_percent, std::string* output_classification, + std::string* output_statement) { string tc_outfeed_percent_str = absl::StrFormat("%.1lf", output_percent); if (output_percent >= kHighlyOutfeedBoundThresholdInPercent) { *output_classification = "host"; @@ -700,19 +702,19 @@ BottleneckAnalysis ComputeBottleneckAnalysis( double kernel_launch_percent = 100.0 * total_host_prepare_ms / total_step_time_ms; double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms; - string input_classification; - string input_statement; + std::string input_classification; + std::string input_statement; bool all_other_reported = InputAnalysis(input_percent, all_other_percent, &input_classification, &input_statement); - string kernel_launch_classification; - string kernel_launch_statement; + std::string kernel_launch_classification; + std::string kernel_launch_statement; KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent, &kernel_launch_classification, &kernel_launch_statement); - string all_other_classification; - string all_other_statement; + std::string all_other_classification; + std::string all_other_statement; AllOtherAnalysis(all_other_reported, all_other_percent, &all_other_classification, &all_other_statement); @@ -726,9 +728,9 @@ BottleneckAnalysis ComputeBottleneckAnalysis( return analysis; } -string GetSummaryNextStep(absl::string_view input_classification, - const InputTimeBreakdown& breakdown) { - string summary_next_step; +std::string GetSummaryNextStep(absl::string_view input_classification, + const InputTimeBreakdown& breakdown) { + std::string summary_next_step; if (input_classification == "host" || input_classification == "both") { if (!TfDataInUse(breakdown)) { summary_next_step = absl::StrCat( diff --git a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h index 738daeaac12..93b4df0b2c2 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h +++ b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h @@ -16,12 +16,15 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_INPUT_PIPELINE_ANALYSIS_H_ #define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_INPUT_PIPELINE_ANALYSIS_H_ +#include + #include "google/protobuf/any.pb.h" #include "absl/strings/string_view.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/steps_db.pb.h" @@ -50,16 +53,18 @@ InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis( // Returns true if explanation for "All Others" time is also included in // input_statement. 
bool InputAnalysis(double input_percent, double all_other_percent, - string* input_classification, string* input_statement); + std::string* input_classification, + std::string* input_statement); -void OutputAnalysis(double output_percent, string* output_classification, - string* output_statement); +void OutputAnalysis(double output_percent, std::string* output_classification, + std::string* output_statement); string GetSummaryNextStep(absl::string_view input_classification, const InputTimeBreakdown& breakdown); void AddErrorMessages(const OpStats& op_stats, InputPipelineAnalysisResult* result); + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc index ec3a6dc1525..bec92e0d998 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -15,13 +15,11 @@ limitations under the License. #include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h" -#include -#include +#include #include "google/protobuf/any.pb.h" +#include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/op_metrics_to_record.h" #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" @@ -30,6 +28,7 @@ limitations under the License. #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/overview_page.pb.h" +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" #include "tensorflow/core/profiler/protobuf/tf_function.pb.h" #include "tensorflow/core/profiler/utils/errors.h" #include "tensorflow/core/profiler/utils/html_utils.h" @@ -51,20 +50,18 @@ struct TfFunctionInfo { double expensive_call_percent; }; -OverviewPageTip MakeOverviewPageTip(const string& text) { +OverviewPageTip MakeOverviewPageTip(std::string text) { OverviewPageTip tip; - tip.set_link(text); + tip.set_link(std::move(text)); return tip; } // Makes a recommendation for looking up a document. // doc_url is expected to be already be escaped suitably for use in an HTML // attribute. -OverviewPageTip MakeOverviewPageTipDocLink(const string& doc_url, - const string& text) { - OverviewPageTip tip; - tip.set_link(AnchorElement(doc_url, text)); - return tip; +OverviewPageTip MakeOverviewPageTipDocLink(absl::string_view doc_url, + absl::string_view text) { + return MakeOverviewPageTip(AnchorElement(doc_url, text)); } void ComputeHostTips(OverviewPageRecommendation* re) { @@ -78,12 +75,13 @@ void ComputeHostTips(OverviewPageRecommendation* re) { void ComputeDeviceTips(HardwareType hardware_type, OverviewPageRecommendation* re) { - const string& device_name = HardwareType_Name(hardware_type); - string timeline_name = - (hardware_type == tensorflow::profiler::TPU) ? "TPU core" : device_name; - string op_stats_toolname = (hardware_type == tensorflow::profiler::TPU) - ? 
"op_profile" - : "tensorflow_stats"; + absl::string_view device_name = HardwareType_Name(hardware_type); + absl::string_view timeline_name = device_name; + absl::string_view op_stats_toolname = "tensorflow_stats"; + if (hardware_type == tensorflow::profiler::TPU) { + timeline_name = "TPU core"; + op_stats_toolname = "op_profile"; + } *re->add_device_tips() = MakeOverviewPageTip( absl::StrCat(op_stats_toolname, " (identify the time-consuming operations " @@ -124,16 +122,16 @@ std::string GeneratePrecisionStatement(const PrecisionStats& precision_stats) { } // namespace -void SetCommonRecommendation(const string& input_classification, - const string& input_statement, - const string& output_statement, +void SetCommonRecommendation(absl::string_view input_classification, + absl::string_view input_statement, + absl::string_view output_statement, HardwareType hardware_type, - const string& tf_function_statement_html, + absl::string_view tf_function_statement_html, OverviewPageRecommendation* re) { - re->set_bottleneck(input_classification); - re->set_statement(input_statement); - re->set_output_statement(output_statement); - re->set_tf_function_statement_html(tf_function_statement_html); + re->set_bottleneck(std::string(input_classification)); + re->set_statement(std::string(input_statement)); + re->set_output_statement(std::string(output_statement)); + re->set_tf_function_statement_html(std::string(tf_function_statement_html)); ComputeHostTips(re); ComputeDeviceTips(hardware_type, re); ComputeDocumentationTips(re); diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.h b/tensorflow/core/profiler/convert/op_stats_to_overview_page.h index ba04f1b41b7..b4b3991a18d 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.h +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.h @@ -17,9 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_ #include "absl/strings/string_view.h" -#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" @@ -34,11 +32,11 @@ namespace profiler { // the tf-functions profiled. const double kTfFunctionReportThresholdInPercent = 20; -void SetCommonRecommendation(const string& input_classification, - const string& input_statement, - const string& output_statement, +void SetCommonRecommendation(absl::string_view input_classification, + absl::string_view input_statement, + absl::string_view output_statement, HardwareType hardware_type, - const string& tf_function_statement_html, + absl::string_view tf_function_statement_html, OverviewPageRecommendation* re); OverviewPageRecommendation ComputeGenericRecommendation( diff --git a/tensorflow/core/profiler/convert/op_stats_to_tf_stats.cc b/tensorflow/core/profiler/convert/op_stats_to_tf_stats.cc index da409f89a60..e23813a5b5d 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_tf_stats.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_tf_stats.cc @@ -15,13 +15,12 @@ limitations under the License. 
#include "tensorflow/core/profiler/convert/op_stats_to_tf_stats.h" -#include "absl/container/flat_hash_set.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/op_metrics_to_record.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/tf_stats.pb.h" #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" -#include "tensorflow/core/profiler/utils/tf_op_utils.h" #include "tensorflow/core/profiler/utils/time_utils.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc b/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc index 3e098da7eb8..9ca83b51a70 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_tf_stats_test.cc @@ -15,10 +15,14 @@ limitations under the License. #include "tensorflow/core/profiler/convert/op_stats_to_tf_stats.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/xplane_to_op_stats.h" -#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" -#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/tf_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/time_utils.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" @@ -75,8 +79,8 @@ TEST(OpStatsToTfStats, GpuTfStats) { kKernel3DurationNs, /*on_device=*/true, kKernel3, &device_plane, &stream2); - const OpStats& op_stats = ConvertXSpaceToOpStats(space); - const TfStatsDatabase& tf_stats = ConvertOpStatsToTfStats(op_stats); + const OpStats op_stats = ConvertXSpaceToOpStats(space); + const TfStatsDatabase tf_stats = ConvertOpStatsToTfStats(op_stats); // TfOp1, TfOp2, Idle EXPECT_EQ(3, tf_stats.with_idle().tf_stats_record_size()); diff --git a/tensorflow/core/profiler/convert/step_events_to_steps_db.cc b/tensorflow/core/profiler/convert/step_events_to_steps_db.cc index ed0d83ade2f..e4713cd73fb 100644 --- a/tensorflow/core/profiler/convert/step_events_to_steps_db.cc +++ b/tensorflow/core/profiler/convert/step_events_to_steps_db.cc @@ -15,10 +15,18 @@ limitations under the License. #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h" #include +#include +#include #include "google/protobuf/any.pb.h" +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" +#include "tensorflow/core/profiler/utils/event_span.h" +#include "tensorflow/core/profiler/utils/timespan.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/step_events_to_steps_db.h b/tensorflow/core/profiler/convert/step_events_to_steps_db.h index b3ea74e905f..9db65163f7a 100644 --- a/tensorflow/core/profiler/convert/step_events_to_steps_db.h +++ b/tensorflow/core/profiler/convert/step_events_to_steps_db.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_STEP_EVENTS_TO_STEPS_DB_H_ #define TENSORFLOW_CORE_PROFILER_CONVERT_STEP_EVENTS_TO_STEPS_DB_H_ +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/steps_db.pb.h" #include "tensorflow/core/profiler/utils/event_span.h" diff --git a/tensorflow/core/profiler/convert/trace_events_to_json.cc b/tensorflow/core/profiler/convert/trace_events_to_json.cc index 9c8176c10ad..07e32ced9d0 100644 --- a/tensorflow/core/profiler/convert/trace_events_to_json.cc +++ b/tensorflow/core/profiler/convert/trace_events_to_json.cc @@ -15,9 +15,14 @@ limitations under the License. #include "tensorflow/core/profiler/convert/trace_events_to_json.h" +#include +#include +#include + #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "include/json/json.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/trace_events.pb.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc index 785902e2a50..023d6a73d77 100644 --- a/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc +++ b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.cc @@ -15,16 +15,20 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h" +#include + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" -#include "tensorflow/core/profiler/utils/event_span.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/kernel_stats_utils.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/trace_utils.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" -#include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h index 04bd0e8ae5f..9c7fca22887 100644 --- a/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h +++ b/tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h @@ -17,9 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_KERNEL_STATS_DB_H_ #include -#include -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" @@ -31,6 +29,7 @@ KernelStatsDb ConvertDeviceTraceXPlaneToKernelStatsDb( const XPlane& device_trace, const std::function& on_kernel_fn); + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc b/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc index 1695bd34d73..5b2a7489241 100644 --- a/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc +++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc @@ -15,21 +15,28 @@ limitations under the License. 
#include "tensorflow/core/profiler/convert/xplane_to_memory_profile.h" -#include +#include #include #include +#include #include +#include +#include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/memory_profile.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" -#include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc b/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc index 1173e4d5c72..e0d87ac7567 100644 --- a/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc @@ -15,7 +15,11 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_memory_profile.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/memory_profile.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" diff --git a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.cc b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.cc index 09df59e44d9..4a369b8b96a 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.cc @@ -15,21 +15,31 @@ limitations under the License. 
#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h" +#include +#include #include #include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" #include "tensorflow/core/profiler/convert/op_stack.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/cost_utils.h" +#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" #include "tensorflow/core/profiler/utils/op_utils.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" +#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/timespan.h" #include "tensorflow/core/profiler/utils/trace_utils.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h index 1a785d0335f..f2d7fc702fc 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h +++ b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h @@ -21,10 +21,9 @@ limitations under the License. #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" -#include "tensorflow/core/profiler/utils/event_span.h" #include "tensorflow/core/profiler/utils/op_utils.h" #include "tensorflow/core/profiler/utils/tf_op_utils.h" -#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc index 3e577d00e1c..8bd0443b8f6 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc @@ -15,9 +15,12 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" #include "tensorflow/core/profiler/utils/time_utils.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc index 7fdd6ffd8cb..f008219cbd2 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc @@ -15,7 +15,11 @@ limitations under the License. 
#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h" +#include + +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h" #include "tensorflow/core/profiler/convert/xplane_to_kernel_stats_db.h" #include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h" @@ -23,12 +27,19 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h" #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/steps_db.pb.h" #include "tensorflow/core/profiler/protobuf/tf_function.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/event_span.h" #include "tensorflow/core/profiler/utils/hardware_type_utils.h" #include "tensorflow/core/profiler/utils/kernel_stats_utils.h" +#include "tensorflow/core/profiler/utils/tf_op_utils.h" +#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc index c7b140b6a67..7b4652f6c0b 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc @@ -15,10 +15,14 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_op_stats.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/platform/test.h" -#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/steps_db.pb.h" +#include "tensorflow/core/profiler/protobuf/tf_function.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/group_events.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc index 1643bbf8098..e6fe74942fc 100644 --- a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc +++ b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" #include "absl/container/flat_hash_set.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" #include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h" @@ -32,6 +35,7 @@ limitations under the License. 
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/overview_page.pb.h" #include "tensorflow/core/profiler/protobuf/tf_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/rpc/client/save_profile.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response.h b/tensorflow/core/profiler/convert/xplane_to_profile_response.h index 84b9fdd914b..03ca13f1788 100644 --- a/tensorflow/core/profiler/convert/xplane_to_profile_response.h +++ b/tensorflow/core/profiler/convert/xplane_to_profile_response.h @@ -15,8 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_PROFILE_RESPONSE_H_ #define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_PROFILE_RESPONSE_H_ -#include "absl/container/flat_hash_set.h" -#include "absl/strings/string_view.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/profiler_service.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response_test.cc b/tensorflow/core/profiler/convert/xplane_to_profile_response_test.cc index d4965a9975c..ad9ca1028f6 100644 --- a/tensorflow/core/profiler/convert/xplane_to_profile_response_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_profile_response_test.cc @@ -14,13 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/profiler/profiler_service.pb.h" #include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h" #include "tensorflow/core/profiler/protobuf/overview_page.pb.h" #include "tensorflow/core/profiler/protobuf/tf_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" -#include "tensorflow/core/profiler/utils/xplane_schema.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_step_events.cc b/tensorflow/core/profiler/convert/xplane_to_step_events.cc index c7dcd6266d2..7bb7cd6943c 100644 --- a/tensorflow/core/profiler/convert/xplane_to_step_events.cc +++ b/tensorflow/core/profiler/convert/xplane_to_step_events.cc @@ -15,10 +15,18 @@ limitations under the License. 
#include "tensorflow/core/profiler/convert/xplane_to_step_events.h" +#include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/event_span.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" +#include "tensorflow/core/profiler/utils/timespan.h" #include "tensorflow/core/profiler/utils/trace_utils.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_step_events.h b/tensorflow/core/profiler/convert/xplane_to_step_events.h index a7ac3b9e89e..62fc89813a1 100644 --- a/tensorflow/core/profiler/convert/xplane_to_step_events.h +++ b/tensorflow/core/profiler/convert/xplane_to_step_events.h @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/event_span.h" -#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc b/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc index 3e1610c2e0f..36e6a2c3091 100644 --- a/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc @@ -15,7 +15,13 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_step_events.h" +#include + +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/event_span.h" #include "tensorflow/core/profiler/utils/group_events.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" diff --git a/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc b/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc index c49a0750c10..b25cdc4d219 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc +++ b/tensorflow/core/profiler/convert/xplane_to_tf_functions.cc @@ -15,11 +15,16 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h" +#include #include +#include +#include +#include #include "absl/algorithm/container.h" -#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "absl/types/optional.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" @@ -29,7 +34,6 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/timespan.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" -#include "tensorflow/core/profiler/utils/xplane_utils.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/convert/xplane_to_tf_functions.h b/tensorflow/core/profiler/convert/xplane_to_tf_functions.h index 470b22d34b8..df55ac79bb8 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tf_functions.h +++ b/tensorflow/core/profiler/convert/xplane_to_tf_functions.h @@ -16,8 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TF_FUNCTIONS_H_ #define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TF_FUNCTIONS_H_ +#include + #include "tensorflow/core/profiler/protobuf/tf_function.pb.h" -#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc b/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc index 3ac23fc82f9..25e56d17418 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc @@ -15,12 +15,17 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h" +#include + +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/profiler/protobuf/tf_function.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_trace_events.cc b/tensorflow/core/profiler/convert/xplane_to_trace_events.cc index 901f3be764a..c404f7bb7e4 100644 --- a/tensorflow/core/profiler/convert/xplane_to_trace_events.cc +++ b/tensorflow/core/profiler/convert/xplane_to_trace_events.cc @@ -15,8 +15,21 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_trace_events.h" +#include + +#include +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/trace_events.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_trace_events.h b/tensorflow/core/profiler/convert/xplane_to_trace_events.h index 5c6fbead805..b7bddb7b366 100644 --- a/tensorflow/core/profiler/convert/xplane_to_trace_events.h +++ b/tensorflow/core/profiler/convert/xplane_to_trace_events.h @@ -16,7 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TRACE_EVENTS_H_
#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_TRACE_EVENTS_H_
-#include "absl/strings/str_split.h"
+#include
+
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/trace_events.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
diff --git a/tensorflow/core/profiler/convert/xplane_to_trace_events_test.cc b/tensorflow/core/profiler/convert/xplane_to_trace_events_test.cc
index afff5e60d97..b9a9fe09981 100644
--- a/tensorflow/core/profiler/convert/xplane_to_trace_events_test.cc
+++ b/tensorflow/core/profiler/convert/xplane_to_trace_events_test.cc
@@ -16,8 +16,9 @@ limitations under the License.
#include "tensorflow/core/profiler/convert/xplane_to_trace_events.h"
#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/profiler/protobuf/trace_events.pb.h"
+#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/utils/xplane_builder.h"
-#include "tensorflow/core/profiler/utils/xplane_schema.h"
namespace tensorflow {
namespace profiler {
From 708c42b0fd4269dcee75d2df6be6098ae53afb6f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 11 May 2020 14:59:48 -0700
Subject: [PATCH 0341/1533] Internal micro demo.

PiperOrigin-RevId: 311000914
Change-Id: Ic056bfe5e155d442f0fc410699af9e57034eb287
---
 .../lite/micro/apollo3evb/micro_time.cc | 72 +++++++++++++++++++
 1 file changed, 72 insertions(+)
 create mode 100644 tensorflow/lite/micro/apollo3evb/micro_time.cc

diff --git a/tensorflow/lite/micro/apollo3evb/micro_time.cc b/tensorflow/lite/micro/apollo3evb/micro_time.cc
new file mode 100644
index 00000000000..12c9ae5c633
--- /dev/null
+++ b/tensorflow/lite/micro/apollo3evb/micro_time.cc
@@ -0,0 +1,72 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Reference implementation of timer functions. Platforms are not required to
+// implement these timer methods, but they are required to enable profiling.
+
+// On platforms that have a POSIX stack or C library, it can be written using
+// methods from or clock() from .
+
+// To add an equivalent function for your own platform, create your own
+// implementation file, and place it in a subfolder named after the OS
+// you're targeting. For example, see the Cortex M bare metal version in
+// tensorflow/lite/micro/bluepill/micro_timer.cc or the mbed one on
+// tensorflow/lite/micro/mbed/micro_timer.cc.
+
+#include "tensorflow/lite/micro/micro_time.h"
+
+// These are headers from Ambiq's Apollo3 SDK.
+#include "am_bsp.h"  // NOLINT
+#include "am_mcu_apollo.h"  // NOLINT
+#include "am_util.h"  // NOLINT
+
+namespace tflite {
+namespace {
+
+// Select CTIMER 1 as benchmarking timer on Sparkfun Edge. This timer must not
+// be used elsewhere.
+constexpr int kTimerNum = 1;
+
+// Clock set to operate at 12MHz.
+constexpr int kClocksPerSecond = 12e6;
+
+}  // namespace
+
+int32_t ticks_per_second() { return kClocksPerSecond; }
+
+// Calling this method enables a timer that runs for eternity. The user is
+// responsible for avoiding trampling on this timer's config, otherwise timing
+// measurements may no longer be valid.
+int32_t GetCurrentTimeTicks() {
+  // TODO(b/150808076): Split out initialization, initialize in interpreter.
+  static bool is_initialized = false;
+  if (!is_initialized) {
+    am_hal_ctimer_config_t timer_config;
+    // Operate as a 32-bit timer.
+    timer_config.ui32Link = 1;
+    // Set timer A to continuous mode at 12MHz.
+    timer_config.ui32TimerAConfig =
+        AM_HAL_CTIMER_FN_CONTINUOUS | AM_HAL_CTIMER_HFRC_12MHZ;
+
+    am_hal_ctimer_stop(kTimerNum, AM_HAL_CTIMER_BOTH);
+    am_hal_ctimer_clear(kTimerNum, AM_HAL_CTIMER_BOTH);
+    am_hal_ctimer_config(kTimerNum, &timer_config);
+    am_hal_ctimer_start(kTimerNum, AM_HAL_CTIMER_TIMERA);
+    is_initialized = true;
+  }
+  return CTIMERn(kTimerNum)->TMR0;
+}
+
+}  // namespace tflite
From b247403af80fec171592c1698701b3f9dc04a7d1 Mon Sep 17 00:00:00 2001
From: Rick Chao
Date: Mon, 11 May 2020 15:03:17 -0700
Subject: [PATCH 0342/1533] Mark multi-process utilities with subprocess module as deprecated in favor of using MultiProcessRunner.

PiperOrigin-RevId: 311001636
Change-Id: I33aabec4376a9702fec77016c957bd8f9edbec5e
---
 .../python/distribute/multi_worker_test_base.py | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/tensorflow/python/distribute/multi_worker_test_base.py b/tensorflow/python/distribute/multi_worker_test_base.py
index a80ca831e4b..408cad2ca0a 100644
--- a/tensorflow/python/distribute/multi_worker_test_base.py
+++ b/tensorflow/python/distribute/multi_worker_test_base.py
@@ -50,6 +50,7 @@ from tensorflow.python.platform import test
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import coordinator
from tensorflow.python.training import server_lib
+from tensorflow.python.util import deprecation
from tensorflow.python.util import nest
from tensorflow.python.util.compat import collections_abc
@@ -559,6 +560,10 @@ class MultiWorkerMultiProcessTest(test.TestCase):
    return subprocess.Popen(
        cmd_args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
+  @deprecation.deprecated(
+      None, '`run_multiple_tasks_in_processes` is deprecated; any new test '
+      'requiring multiple processes should use `multi_process_runner` for '
+      'better support of log printing, streaming, and more functionality.')
  def run_multiple_tasks_in_processes(self, cmd_args, cluster_spec):
    """Run `cmd_args` in a process for each task in `cluster_spec`."""
    processes = {}
@@ -570,6 +575,10 @@ class MultiWorkerMultiProcessTest(test.TestCase):
    processes[task_type].append(p)
    return processes
+  @deprecation.deprecated(
+      None, '`join_independent_workers` is deprecated; any new test '
+      'requiring multiple processes should use `multi_process_runner` for '
+      'better support of log printing, streaming, and more functionality.')
  def join_independent_workers(self, worker_processes):
    return_codes = []
    for p in nest.flatten(worker_processes):
@@ -585,6 +594,10 @@ class MultiWorkerMultiProcessTest(test.TestCase):
    for return_code in return_codes:
      self.assertEqual(return_code, 0)
+  @deprecation.deprecated(
+      None, '`stream_stderr` is deprecated; any new test '
+      'requiring multiple processes should use `multi_process_runner` for '
+      'better support of log printing, streaming, and more functionality.')
  def stream_stderr(self, processes,
print_only_first=False): """Consume stderr of all processes and print to stdout. From 4bab47f1d12cfdaee10d0e8ad087973a5a1c2560 Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Mon, 11 May 2020 15:04:32 -0700 Subject: [PATCH 0343/1533] Update Materialize Broadcasts for same-rank broadcasts PiperOrigin-RevId: 311001875 Change-Id: Ib5743ffa5d3605c9a58def1952ad8bd0eed24682 --- .../xla/tests/materialize-broadcasts.mlir | 11 +++ .../xla/transforms/materialize_broadcasts.cc | 72 +++++++++---------- 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir index 4050340ce49..2340650dda8 100644 --- a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir @@ -20,6 +20,17 @@ func @addBroadcastLhs(%arg0: tensor<4xf32>, %arg1: tensor<1x4xf32>) -> tensor<1x // ----- +// CHECK-LABEL: @addBroadcastEqual +func @addBroadcastEqual(%arg0: tensor<4x1xf32>, %arg1: tensor<1x4xf32>) -> tensor<4x4xf32> { + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<4x1xf32>) -> tensor<4x4xf32> + // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.broadcast_in_dim"(%arg1) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x4xf32>) -> tensor<4x4xf32> + // CHECK-NEXT: %[[RESULT:.*]] = xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor<4x4xf32> + %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<4x1xf32>, tensor<1x4xf32>) -> tensor<4x4xf32> + return %0 : tensor<4x4xf32> +} + +// ----- + // CHECK-LABEL: @addBroadcastMultidimension func @addBroadcastMultidimension(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1x4xf32>) -> tensor<1x1x4xf32> { // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<1x1xf32>) -> tensor<1x1x4xf32> diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc index a4ffa57957e..bf666400900 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc @@ -50,12 +50,6 @@ static DenseIntElementsAttr GetI64ElementsAttrForSeq(int start, int end, template bool CreateStaticBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, Value *out_lhs, Value *out_rhs) { - if (!op.broadcast_dimensions().hasValue()) { - // Note: the op may still have an implicit broadcast on it, such as - // for (tensor<1xf32>, tensor<4xf32>). - return false; - } - // Insert BroadcastInDimOps for the left-hand-side and right-hand-side args, // replacing the original LHS and RHS args in the source op with the results // of the broadcasts. @@ -79,25 +73,7 @@ bool CreateStaticBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, auto lhs_rank = lhs_ranked_type.getRank(); auto rhs_rank = rhs_ranked_type.getRank(); - - // Set broadcast_dimensions to [0, ..., rank] for the higher rank arg. - // Use the original op.broadcast_dimensions for the lower rank arg. 
- auto higher_rank_broadcast_dims = - GetI64ElementsAttrForSeq(0, std::max(lhs_rank, rhs_rank), rewriter); - DenseIntElementsAttr lhs_broadcast_dims; - DenseIntElementsAttr rhs_broadcast_dims; - if (lhs_rank > rhs_rank) { - lhs_broadcast_dims = higher_rank_broadcast_dims; - rhs_broadcast_dims = op.broadcast_dimensions().getValue(); - } else if (lhs_rank < rhs_rank) { - lhs_broadcast_dims = op.broadcast_dimensions().getValue(); - rhs_broadcast_dims = higher_rank_broadcast_dims; - } else { - // This shouldn't happen for legal ops. If the broadcast_dimensions - // attribute is set, the ranks should be different. - // TODO(scotttodd): Add a custom verification for ops and assert here. - return false; - } + ArrayRef op_shape = op_ranked_type.getShape(); // BroadcastInDimOp must have the same element type for operands and results, // so preserve the original output shape and the original input element type. @@ -105,16 +81,32 @@ bool CreateStaticBroadcastsForBinaryOp(SrcOp op, PatternRewriter *rewriter, // broadcast_in_dim (tensor<1x4xf32>) -> tensor<1x4xf32> // broadcast_in_dim (tensor<4xf32>) -> tensor<1x4xf32> // SrcOp (tensor<1x4xf32>, tensor<1x4xf32>) -> tensor<1x4xi1> - ArrayRef op_shape = op_ranked_type.getShape(); - auto lhs_type = - RankedTensorType::get(op_shape, lhs_ranked_type.getElementType()); - auto rhs_type = - RankedTensorType::get(op_shape, rhs_ranked_type.getElementType()); + if (lhs_ranked_type.getShape() != op_ranked_type.getShape()) { + auto type = + RankedTensorType::get(op_shape, lhs_ranked_type.getElementType()); + DenseIntElementsAttr attr = GetI64ElementsAttrForSeq(0, lhs_rank, rewriter); + if (lhs_rank < rhs_rank) { + attr = op.broadcast_dimensions().getValue(); + } - *out_lhs = rewriter->createOrFold(op.getLoc(), lhs_type, - lhs, lhs_broadcast_dims); - *out_rhs = rewriter->createOrFold(op.getLoc(), rhs_type, - rhs, rhs_broadcast_dims); + lhs = + rewriter->createOrFold(op.getLoc(), type, lhs, attr); + } + + if (rhs_ranked_type.getShape() != op_ranked_type.getShape()) { + auto type = + RankedTensorType::get(op_shape, rhs_ranked_type.getElementType()); + DenseIntElementsAttr attr = GetI64ElementsAttrForSeq(0, rhs_rank, rewriter); + if (rhs_rank < lhs_rank) { + attr = op.broadcast_dimensions().getValue(); + } + + rhs = + rewriter->createOrFold(op.getLoc(), type, rhs, attr); + } + + *out_lhs = lhs; + *out_rhs = rhs; return true; } @@ -359,9 +351,15 @@ struct CompareWithBroadcastConvert : public OpRewritePattern { void SetupMaterializeBroadcastsLegality(MLIRContext *context, ConversionTarget *conversionTarget) { -#define ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(OpType) \ - conversionTarget->addDynamicallyLegalOp( \ - [](OpType op) { return !op.broadcast_dimensions().hasValue(); }); +#define ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(OpType) \ + conversionTarget->addDynamicallyLegalOp([](OpType op) { \ + if (op.broadcast_dimensions().hasValue()) return false; \ + auto l = op.lhs().getType().cast(); \ + auto r = op.rhs().getType().cast(); \ + if (!l.hasRank() || !r.hasRank()) return false; \ + return l.getShape() == r.getShape(); \ + }); + // Binary elementwise ops. ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(AddOp); ADD_DYNAMICALLY_LEGAL_OP_WITH_BROADCAST(Atan2Op); From 46e6af455ac0cedecd313bf122c877482cd95486 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 15:06:43 -0700 Subject: [PATCH 0344/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 311002313 Change-Id: I85f404dcda5811df67d5c05ad7182ef4be365acb --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 28bbc65d66ad25065961716c6b8d626fc25027e1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 15:11:46 -0700 Subject: [PATCH 0345/1533] allow early termination of Profile Rpc for programmatic mode. PiperOrigin-RevId: 311003274 Change-Id: I7e0e81c03aa96db6c272244316a53fab16fe3ebd --- .../core/profiler/profiler_service.proto | 11 ++++++++ .../profiler/rpc/profiler_service_impl.cc | 28 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/tensorflow/core/profiler/profiler_service.proto b/tensorflow/core/profiler/profiler_service.proto index 37ca4084e42..a096a10efe2 100644 --- a/tensorflow/core/profiler/profiler_service.proto +++ b/tensorflow/core/profiler/profiler_service.proto @@ -10,6 +10,10 @@ import "tensorflow/core/profiler/profiler_service_monitor_result.proto"; service ProfilerService { // Starts a profiling session, blocks until it completes, and returns data. rpc Profile(ProfileRequest) returns (ProfileResponse) {} + // Signal to terminate the Profile rpc for a on-going profiling session, + // The Profile rpc will return successfully and prematurely without timeout. + // This is used by programmatic mode to end the session in workers. + rpc Terminate(TerminateRequest) returns (TerminateResponse) {} // Collects profiling data and returns user-friendly metrics. rpc Monitor(MonitorRequest) returns (MonitorResponse) {} } @@ -81,6 +85,13 @@ message ProfileResponse { // next-field: 8 } +message TerminateRequest { + // Which session id to terminate. + string session_id = 1; +} + +message TerminateResponse {} + message MonitorRequest { // Duration for which to profile between each update. uint64 duration_ms = 1; diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc index 555f4c3366a..8cf052f165b 100644 --- a/tensorflow/core/profiler/rpc/profiler_service_impl.cc +++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/profiler/rpc/profiler_service_impl.h" #include "grpcpp/support/status.h" +#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" @@ -24,9 +25,12 @@ limitations under the License. 
#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/lib/profiler_session.h" +#include "tensorflow/core/profiler/profiler_service.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" namespace tensorflow { @@ -66,6 +70,11 @@ class ProfilerServiceImpl : public grpc::ProfilerService::Service { if (ctx->IsCancelled()) { return ::grpc::Status::CANCELLED; } + if (TF_PREDICT_FALSE(IsStopped(req->session_id()))) { + mutex_lock lock(mutex_); + stop_signals_per_session_.erase(req->session_id()); + break; + } } status = CollectDataToResponse(*req, profiler.get(), response); @@ -76,6 +85,25 @@ class ProfilerServiceImpl : public grpc::ProfilerService::Service { return ::grpc::Status::OK; } + + ::grpc::Status Terminate(::grpc::ServerContext* ctx, + const TerminateRequest* req, + TerminateResponse* response) override { + mutex_lock lock(mutex_); + stop_signals_per_session_[req->session_id()] = true; + return ::grpc::Status::OK; + } + + private: + bool IsStopped(const std::string& session_id) { + mutex_lock lock(mutex_); + auto it = stop_signals_per_session_.find(session_id); + return it != stop_signals_per_session_.end() && it->second; + } + + mutex mutex_; + absl::flat_hash_map stop_signals_per_session_ + GUARDED_BY(mutex_); }; } // namespace From 7c09d15f9fcc14343343c247ebf5b8e0afe3e4aa Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Mon, 11 May 2020 15:12:02 -0700 Subject: [PATCH 0346/1533] Fix an ASAN failure due to uninitialized RemoteMgr. PiperOrigin-RevId: 311003331 Change-Id: I67a8e53c04f6ddf3f18334a3bc18e342dab23bf0 --- .../common_runtime/eager/execute_node_test.cc | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/execute_node_test.cc b/tensorflow/core/common_runtime/eager/execute_node_test.cc index 970307de851..99f030322df 100644 --- a/tensorflow/core/common_runtime/eager/execute_node_test.cc +++ b/tensorflow/core/common_runtime/eager/execute_node_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/core/common_runtime/eager/execute_node.h" +#include + #include "tensorflow/core/common_runtime/composite_device.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/eager/context.h" @@ -49,9 +51,12 @@ TEST(ExecuteNodeTest, ExecuteNodeArgs) { StaticDeviceMgr device_mgr( DeviceFactory::NewDevice("CPU", {}, "/job:localhost/replica:0/task:0")); Device* device0 = device_mgr.ListDevices().at(0); - StaticDeviceMgr remote_device_mgr( + auto remote_device_mgr = absl::make_unique(); + std::vector> remote_devices; + remote_devices.emplace_back( DeviceFactory::NewDevice("CPU", {}, "/job:localhost/replica:0/task:1")); - Device* device1 = remote_device_mgr.ListDevices().at(0); + TF_ASSERT_OK(remote_device_mgr->AddDevices(std::move(remote_devices))); + Device* device1 = remote_device_mgr->ListDevices().at(0); Status s; std::unique_ptr composite_device = @@ -65,6 +70,17 @@ TEST(ExecuteNodeTest, ExecuteNodeArgs) { tensorflow::ContextMirroringPolicy::MIRRORING_NONE, false, false, &device_mgr, false, nullptr, nullptr, nullptr); + // Set a RemoteMgr to the EagerContext. 
+ auto remote_mgr = absl::make_unique( + /*is_master=*/true, ctx); + TF_ASSERT_OK(ctx->InitializeRemoteMaster( + /*server=*/nullptr, /*worker_env=*/nullptr, + /*worker_session=*/nullptr, /*remote_eager_workers=*/nullptr, + std::move(remote_device_mgr), /*remote_contexts=*/{}, + EagerContext::NewContextId(), + /*r=*/nullptr, &device_mgr, /*keep_alive_secs*/ 600, + /*cluster_flr=*/nullptr, std::move(remote_mgr))); + DataType dtype = DT_FLOAT; Tensor t0(dtype, TensorShape({})); // Create two local TensorHandles From ce43a59c72ea12859b695117fe7cc07b35699484 Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Mon, 11 May 2020 15:12:42 -0700 Subject: [PATCH 0347/1533] Internal change PiperOrigin-RevId: 311003458 Change-Id: I1a5923edadf3a0101a43dd6dd605c37402b017e4 --- .../mlir/tfrt/saved_model/saved_model.cc | 43 +++++++++++++++++-- .../mlir/tfrt/saved_model/saved_model.h | 13 +++--- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc index 8a187cf43a8..92571148cff 100644 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc @@ -29,11 +29,25 @@ limitations under the License. #include "tfrt/tensor/dense_host_tensor_view.h" namespace tensorflow { +namespace { -void MapFunctionGlobalTensorCapturesFromTFSavedModelMLIR( +llvm::StringRef ProcessIndexPath(mlir::ArrayAttr index_path) { + if (index_path.size() == 1 && index_path[0].isa()) { + // TODO(chky): Support cases where index_path is not a single string. + return index_path[0].cast().getValue(); + } + return ""; +} + +} // namespace + +void MapFunctionSignaturesFromTFSavedModelMLIR( mlir::ModuleOp module, llvm::function_ref> + input_names_and_devices, + llvm::ArrayRef output_names, llvm::ArrayRef global_tensors)> map_fn) { // Create global_tensors for each functions. @@ -44,17 +58,38 @@ void MapFunctionGlobalTensorCapturesFromTFSavedModelMLIR( auto func_names = mlir::tf_saved_model::GetExportedNames(func); if (func_names.empty()) return; - // Here we walk through each arguments and find out the variables used by - // this function. + // Here we walk through each arguments and find out the input/output names, + // and input devices, variables used by this function. 
+ llvm::SmallVector, 4> + input_names_and_devices; llvm::SmallVector global_tensors; for (unsigned i = 0, e = func.getNumArguments(); i != e; ++i) { + if (auto input_index_path = func.getArgAttrOfType( + i, "tf_saved_model.index_path")) { + std::pair name_and_device; + name_and_device.first = ProcessIndexPath(input_index_path); + if (auto input_device = + func.getArgAttrOfType(i, "tf.device")) { + name_and_device.second = input_device.getValue(); + } + input_names_and_devices.push_back(name_and_device); + } if (auto variable = mlir::tf_saved_model::LookupBoundInput(func, i, symbol_table)) { global_tensors.push_back(variable); } } - for (auto func_name : func_names) map_fn(func_name, global_tensors); + llvm::SmallVector output_names; + for (unsigned i = 0, e = func.getNumResults(); i != e; ++i) { + if (auto output_index_path = func.getResultAttrOfType( + i, "tf_saved_model.index_path")) { + output_names.push_back(ProcessIndexPath(output_index_path)); + } + } + + for (auto func_name : func_names) + map_fn(func_name, input_names_and_devices, output_names, global_tensors); }); } diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h index de24ea20958..06a6c5a22f9 100644 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h @@ -57,12 +57,15 @@ struct TFRTSavedModelCompileOptions { std::string force_data_format; }; -// Map captured global tensors for each function. -void MapFunctionGlobalTensorCapturesFromTFSavedModelMLIR( +// Map signatures (eg. input/output names, variables) for each function. +void MapFunctionSignaturesFromTFSavedModelMLIR( mlir::ModuleOp module, - llvm::function_ref< - void(llvm::StringRef func_name, - llvm::ArrayRef captures)> + llvm::function_ref> + input_names_and_devices, + llvm::ArrayRef output_names, + llvm::ArrayRef global_tensors)> map_fn); // Compile MLIR in TF saved model dialect into BEF. From cd6eebfecab9b93863e7e20acec1ba0481f6b95f Mon Sep 17 00:00:00 2001 From: Yanhua Sun Date: Mon, 11 May 2020 15:13:30 -0700 Subject: [PATCH 0348/1533] Fix benchmark naming in reporting PiperOrigin-RevId: 311003642 Change-Id: I8424106584190b48e5dd7cd6b4031962a1ceeb27 --- tensorflow/python/eager/benchmarks_test_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/eager/benchmarks_test_base.py b/tensorflow/python/eager/benchmarks_test_base.py index 552d844c32d..3d81d08ccbf 100644 --- a/tensorflow/python/eager/benchmarks_test_base.py +++ b/tensorflow/python/eager/benchmarks_test_base.py @@ -32,4 +32,6 @@ class MicroBenchmarksBase(test.Benchmark): "examples_per_sec": float("{0:.3f}".format(num_iters / total_time)), "us_per_example": float("{0:.3f}".format(total_time * 1e6 / num_iters)) } - self.report_benchmark(iters=num_iters, wall_time=mean_us, extras=extras) + benchmark_name = self._get_benchmark_name() + self.report_benchmark( + iters=num_iters, wall_time=mean_us, extras=extras, name=benchmark_name) From af9492fdaf8abcbc73fbaa83eb964a94f8b8b02d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 15:20:52 -0700 Subject: [PATCH 0349/1533] Change default Grappler timeout from 5 to 20 minutes. Despite Grappler having become faster, graphs sizes have grown faster still, and the original default timeout is exceeded too frequently in some applications. 
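For callers that want a different deadline than the new 20-minute default, the limit remains configurable through meta_optimizer_timeout_ms on RewriterConfig. A minimal sketch of setting it explicitly (the surrounding session/config plumbing is illustrative, not part of this change):

#include "tensorflow/core/protobuf/config.pb.h"

// Illustrative: pick an explicit Grappler deadline instead of relying on the
// default that applies when meta_optimizer_timeout_ms == 0.
tensorflow::ConfigProto config;
tensorflow::RewriterConfig* rewriter =
    config.mutable_graph_options()->mutable_rewrite_options();
rewriter->set_meta_optimizer_timeout_ms(10 * 60 * 1000);  // 10 minutes.
// A negative value disables the deadline entirely; DeadlineMicroSeconds()
// then returns 0.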
PiperOrigin-RevId: 311005112 Change-Id: I97a158cd4ae534871f501b7beecab296688bac56 --- tensorflow/core/grappler/optimizers/meta_optimizer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index f47265f6334..cd0d44e8e12 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -114,12 +114,12 @@ FunctionDefLibrary GetFunctionDefLibraryStub( } uint64 DeadlineMicroSeconds(const RewriterConfig& cfg) { - const uint64 kFiveMinutesInUsec = 5 * 60 * 1000 * 1000; + const uint64 kTwentyMinutesInUsec = 20 * 60 * 1000 * 1000; if (cfg.meta_optimizer_timeout_ms() < 0) { return 0; } else { return cfg.meta_optimizer_timeout_ms() == 0 - ? Env::Default()->NowMicros() + kFiveMinutesInUsec + ? Env::Default()->NowMicros() + kTwentyMinutesInUsec : Env::Default()->NowMicros() + cfg.meta_optimizer_timeout_ms() * 1000; } From 83d4a86ee172e1138b591454d4e5698e5af4e3d8 Mon Sep 17 00:00:00 2001 From: Hanhan Wang Date: Mon, 11 May 2020 15:23:54 -0700 Subject: [PATCH 0350/1533] Add support for lowering convert op to Linalg from floating-point to integer. PiperOrigin-RevId: 311005769 Change-Id: Icdf2e29dc0b9de9cb44390a6268352a5906ed902 --- .../mlir/xla/tests/hlo-legalize-to-linalg.mlir | 12 ++++++++++++ .../mlir/xla/tests/lhlo-legalize-to-linalg.mlir | 13 +++++++++++++ .../mlir/xla/transforms/map_xla_to_scalar_op.h | 8 +++----- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir index aa949a01388..a856ee5e83c 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir @@ -530,3 +530,15 @@ func @convert_f64_to_f32(%input: tensor<2x2xf64>) -> tensor<2x2xf32> { // CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f64): // CHECK-NEXT: %[[RESULT:.*]] = fptrunc %[[OPERAND_IN]] : f64 to f32 // CHECK-NEXT: linalg.yield %[[RESULT]] : f32 + +// ----- + +// CHECK-LABEL: func @convert_f32_to_i32 +func @convert_f32_to_i32(%input: tensor<2x2xf32>) -> tensor<2x2xi32> { + %result = "xla_hlo.convert"(%input) : (tensor<2x2xf32>) -> tensor<2x2xi32> + return %result : tensor<2x2xi32> +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32): +// CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i32 diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index 3605e8e7e4c..bb8010b520c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -411,6 +411,19 @@ func @convert_f32_to_f32(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { // ----- +// CHECK-LABEL: func @convert_f32_to_i32 +func @convert_f32_to_i32(%input: memref<2x2xf32>, %result: memref<2x2xi32>) { + "xla_lhlo.convert"(%input, %result) + : (memref<2x2xf32>, memref<2x2xi32>) -> () + return +} +// CHECK: linalg.generic +// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]: i32): +// CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 +// CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + // CHECK-LABEL: func @cos func @cos(%input: memref<2x2xf32>, %result: memref<2x2xf32>) { 
"xla_lhlo.cosine"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> () diff --git a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h index 982ec4f4593..c317dc36b3c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_xla_to_scalar_op.h @@ -281,11 +281,9 @@ inline Value MapLhloOpToStdScalarOp( // No conversion is needed for the same width integers return args.front(); } - // TODO(dfki-ehna): Add other primitive type conversions - // if (mlir::FpToSiOp::areCastCompatible(sourceType, targetType)) { - // return b.create(loc, result_types, - // args,mlir::None); - // } + if (mlir::FPToSIOp::areCastCompatible(sourceType, targetType)) { + return b->create(loc, result_types, args, mlir::None); + } return nullptr; } From dd2ea875d92eeb83e81b1cb92e29e61d488e98b2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 15:47:55 -0700 Subject: [PATCH 0351/1533] Internal change PiperOrigin-RevId: 311010133 Change-Id: Ia351a2e3da167f03dd6e061fd9ae508f2b5875e3 --- .../python/keras/layers/convolutional.py | 88 ++------- .../python/keras/layers/convolutional_test.py | 169 ++++-------------- tensorflow/python/ops/nn_ops.py | 18 +- .../v1/tensorflow.keras.layers.-conv1-d.pbtxt | 2 +- .../v1/tensorflow.keras.layers.-conv2-d.pbtxt | 2 +- .../v1/tensorflow.keras.layers.-conv3-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution1-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution2-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution3-d.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-conv1-d.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-conv2-d.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-conv3-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution1-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution2-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution3-d.pbtxt | 2 +- 15 files changed, 67 insertions(+), 232 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 8e6078139c9..f7148ccd4e9 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -78,11 +78,6 @@ class Conv(Layer): the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. - groups: A positive integer specifying the number of groups in which the - input is split along the channel axis. Each group is convolved - separately with `filters / groups` filters. The output is the - concatenation of all the `groups` results along the channel axis. - Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied. use_bias: Boolean, whether the layer uses a bias. @@ -105,15 +100,13 @@ class Conv(Layer): name: A string, the name of the layer. 
""" - def __init__(self, - rank, + def __init__(self, rank, filters, kernel_size, strides=1, padding='valid', data_format=None, dilation_rate=1, - groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -135,11 +128,6 @@ class Conv(Layer): if filters is not None and not isinstance(filters, int): filters = int(filters) self.filters = filters - self.groups = groups or 1 - if filters is not None and filters % self.groups != 0: - raise ValueError( - 'The number of filters must be evenly divisible by the number of ' - 'groups. Received: groups={}, filters={}'.format(groups, filters)) self.kernel_size = conv_utils.normalize_tuple( kernel_size, rank, 'kernel_size') if not all(self.kernel_size): @@ -167,14 +155,7 @@ class Conv(Layer): def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) input_channel = self._get_input_channel(input_shape) - if input_channel % self.groups != 0: - raise ValueError( - 'The number of input channels must be evenly divisible by the number ' - 'of groups. Received groups={}, but the input has {} channels ' - '(full input shape is {}).'.format(self.groups, input_channel, - input_shape)) - kernel_shape = self.kernel_size + (input_channel // self.groups, - self.filters) + kernel_shape = self.kernel_size + (input_channel, self.filters) self.kernel = self.add_weight( name='kernel', @@ -269,38 +250,22 @@ class Conv(Layer): def get_config(self): config = { - 'filters': - self.filters, - 'kernel_size': - self.kernel_size, - 'strides': - self.strides, - 'padding': - self.padding, - 'data_format': - self.data_format, - 'dilation_rate': - self.dilation_rate, - 'groups': - self.groups, - 'activation': - activations.serialize(self.activation), - 'use_bias': - self.use_bias, - 'kernel_initializer': - initializers.serialize(self.kernel_initializer), - 'bias_initializer': - initializers.serialize(self.bias_initializer), - 'kernel_regularizer': - regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': - regularizers.serialize(self.bias_regularizer), + 'filters': self.filters, + 'kernel_size': self.kernel_size, + 'strides': self.strides, + 'padding': self.padding, + 'data_format': self.data_format, + 'dilation_rate': self.dilation_rate, + 'activation': activations.serialize(self.activation), + 'use_bias': self.use_bias, + 'kernel_initializer': initializers.serialize(self.kernel_initializer), + 'bias_initializer': initializers.serialize(self.bias_initializer), + 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), + 'bias_regularizer': regularizers.serialize(self.bias_regularizer), 'activity_regularizer': regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': - constraints.serialize(self.kernel_constraint), - 'bias_constraint': - constraints.serialize(self.bias_constraint) + 'kernel_constraint': constraints.serialize(self.kernel_constraint), + 'bias_constraint': constraints.serialize(self.bias_constraint) } base_config = super(Conv, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -406,11 +371,6 @@ class Conv1D(Conv): the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. - groups: A positive integer specifying the number of groups in which the - input is split along the channel axis. Each group is convolved - separately with `filters / groups` filters. The output is the - concatenation of all the `groups` results along the channel axis. 
- Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). @@ -453,7 +413,6 @@ class Conv1D(Conv): padding='valid', data_format='channels_last', dilation_rate=1, - groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -472,7 +431,6 @@ class Conv1D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, - groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), @@ -559,11 +517,6 @@ class Conv2D(Conv): all spatial dimensions. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. - groups: A positive integer specifying the number of groups in which the - input is split along the channel axis. Each group is convolved - separately with `filters / groups` filters. The output is the - concatenation of all the `groups` results along the channel axis. - Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). @@ -613,7 +566,6 @@ class Conv2D(Conv): padding='valid', data_format=None, dilation_rate=(1, 1), - groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -632,7 +584,6 @@ class Conv2D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, - groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), @@ -704,11 +655,6 @@ class Conv3D(Conv): all spatial dimensions. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. - groups: A positive integer specifying the number of groups in which the - input is split along the channel axis. Each group is convolved - separately with `filters / groups` filters. The output is the - concatenation of all the `groups` results along the channel axis. - Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). 
@@ -764,7 +710,6 @@ class Conv3D(Conv): padding='valid', data_format=None, dilation_rate=(1, 1, 1), - groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -783,7 +728,6 @@ class Conv3D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, - groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index 7ddeb9af4de..528bc14adf4 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -26,9 +26,6 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import nn -from tensorflow.python.ops import random_ops from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils from tensorflow.python.platform import test @@ -50,41 +47,20 @@ class Conv1DTest(keras_parameterized.TestCase): expected_output_shape=expected_output_shape) @parameterized.named_parameters( - ('padding_valid', { - 'padding': 'valid' - }, (None, 5, 2)), - ('padding_same', { - 'padding': 'same' - }, (None, 7, 2)), - ('padding_same_dilation_2', { - 'padding': 'same', - 'dilation_rate': 2 - }, (None, 7, 2)), - ('padding_same_dilation_3', { - 'padding': 'same', - 'dilation_rate': 3 - }, (None, 7, 2)), - ('padding_causal', { - 'padding': 'causal' - }, (None, 7, 2)), - ('strides', { - 'strides': 2 - }, (None, 3, 2)), - ('dilation_rate', { - 'dilation_rate': 2 - }, (None, 3, 2)), - # Only runs on GPU with CUDA, groups are not supported on CPU. 
- # https://github.com/tensorflow/tensorflow/issues/29005 - ('group', { - 'groups': 3, - 'filters': 6 - }, (None, 5, 6), True), + ('padding_valid', {'padding': 'valid'}, (None, 5, 2)), + ('padding_same', {'padding': 'same'}, (None, 7, 2)), + ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}, + (None, 7, 2)), + ('padding_same_dilation_3', {'padding': 'same', 'dilation_rate': 3}, + (None, 7, 2)), + ('padding_causal', {'padding': 'causal'}, (None, 7, 2)), + ('strides', {'strides': 2}, (None, 3, 2)), + ('dilation_rate', {'dilation_rate': 2}, (None, 3, 2)), ) - def test_conv1d(self, kwargs, expected_output_shape, requires_gpu=False): - kwargs['filters'] = kwargs.get('filters', 2) + def test_conv1d(self, kwargs, expected_output_shape): + kwargs['filters'] = 2 kwargs['kernel_size'] = 3 - if not requires_gpu or test.is_gpu_available(cuda_only=True): - self._run_test(kwargs, expected_output_shape) + self._run_test(kwargs, expected_output_shape) def test_conv1d_regularizers(self): kwargs = { @@ -172,38 +148,20 @@ class Conv2DTest(keras_parameterized.TestCase): expected_output_shape=expected_output_shape) @parameterized.named_parameters( - ('padding_valid', { - 'padding': 'valid' - }, (None, 5, 4, 2)), - ('padding_same', { - 'padding': 'same' - }, (None, 7, 6, 2)), - ('padding_same_dilation_2', { - 'padding': 'same', - 'dilation_rate': 2 - }, (None, 7, 6, 2)), - ('strides', { - 'strides': (2, 2) - }, (None, 3, 2, 2)), - ('dilation_rate', { - 'dilation_rate': (2, 2) - }, (None, 3, 2, 2)), + ('padding_valid', {'padding': 'valid'}, (None, 5, 4, 2)), + ('padding_same', {'padding': 'same'}, (None, 7, 6, 2)), + ('padding_same_dilation_2', {'padding': 'same', 'dilation_rate': 2}, + (None, 7, 6, 2)), + ('strides', {'strides': (2, 2)}, (None, 3, 2, 2)), + ('dilation_rate', {'dilation_rate': (2, 2)}, (None, 3, 2, 2)), # Only runs on GPU with CUDA, channels_first is not supported on CPU. # TODO(b/62340061): Support channels_first on CPU. - ('data_format', { - 'data_format': 'channels_first' - }, None, True), - # Only runs on GPU with CUDA, groups are not supported on CPU. - # https://github.com/tensorflow/tensorflow/issues/29005 - ('group', { - 'groups': 3, - 'filters': 6 - }, (None, 5, 4, 6), True), + ('data_format', {'data_format': 'channels_first'}), ) - def test_conv2d(self, kwargs, expected_output_shape=None, requires_gpu=False): - kwargs['filters'] = kwargs.get('filters', 2) + def test_conv2d(self, kwargs, expected_output_shape=None): + kwargs['filters'] = 2 kwargs['kernel_size'] = (3, 3) - if not requires_gpu or test.is_gpu_available(cuda_only=True): + if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True): self._run_test(kwargs, expected_output_shape) def test_conv2d_regularizers(self): @@ -265,34 +223,18 @@ class Conv3DTest(keras_parameterized.TestCase): expected_output_shape=expected_output_shape) @parameterized.named_parameters( - ('padding_valid', { - 'padding': 'valid' - }, (None, 3, 5, 4, 2)), - ('padding_same', { - 'padding': 'same' - }, (None, 5, 7, 6, 2)), - ('strides', { - 'strides': (2, 2, 2) - }, (None, 2, 3, 2, 2)), - ('dilation_rate', { - 'dilation_rate': (2, 2, 2) - }, (None, 1, 3, 2, 2)), + ('padding_valid', {'padding': 'valid'}, (None, 3, 5, 4, 2)), + ('padding_same', {'padding': 'same'}, (None, 5, 7, 6, 2)), + ('strides', {'strides': (2, 2, 2)}, (None, 2, 3, 2, 2)), + ('dilation_rate', {'dilation_rate': (2, 2, 2)}, (None, 1, 3, 2, 2)), # Only runs on GPU with CUDA, channels_first is not supported on CPU. 
# TODO(b/62340061): Support channels_first on CPU. - ('data_format', { - 'data_format': 'channels_first' - }, None, True), - # Only runs on GPU with CUDA, groups are not supported on CPU. - # https://github.com/tensorflow/tensorflow/issues/29005 - ('group', { - 'groups': 3, - 'filters': 6 - }, (None, 3, 5, 4, 6), True), + ('data_format', {'data_format': 'channels_first'}), ) - def test_conv3d(self, kwargs, expected_output_shape=None, requires_gpu=False): - kwargs['filters'] = kwargs.get('filters', 2) + def test_conv3d(self, kwargs, expected_output_shape=None): + kwargs['filters'] = 2 kwargs['kernel_size'] = (3, 3, 3) - if not requires_gpu or test.is_gpu_available(cuda_only=True): + if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True): self._run_test(kwargs, expected_output_shape) def test_conv3d_regularizers(self): @@ -356,57 +298,6 @@ class Conv3DTest(keras_parameterized.TestCase): input_data=input_data) -class GroupedConvTest(keras_parameterized.TestCase): - - @parameterized.named_parameters( - ('Conv1D', keras.layers.Conv1D), - ('Conv2D', keras.layers.Conv2D), - ('Conv3D', keras.layers.Conv3D), - ) - def test_group_conv_incorrect_use(self, layer): - with self.assertRaisesRegexp(ValueError, 'The number of filters'): - layer(16, 3, groups=3) - with self.assertRaisesRegexp(ValueError, 'The number of input channels'): - layer(16, 3, groups=4).build((32, 12, 12, 3)) - - @parameterized.named_parameters( - ('Conv1D', keras.layers.Conv1D, (32, 12, 32)), - ('Conv2D', keras.layers.Conv2D, (32, 12, 12, 32)), - ('Conv3D', keras.layers.Conv3D, (32, 12, 12, 12, 32)), - ) - def test_group_conv(self, layer_cls, input_shape): - if test.is_gpu_available(cuda_only=True): - with test_util.use_gpu(): - inputs = random_ops.random_uniform(shape=input_shape) - - layer = layer_cls(16, 3, groups=4) - layer.build(input_shape) - - input_slices = array_ops.split(inputs, 4, axis=-1) - weight_slices = array_ops.split(layer.kernel, 4, axis=-1) - expected_outputs = array_ops.concat([ - nn.convolution_v2(inputs, weights) - for inputs, weights in zip(input_slices, weight_slices) - ], - axis=-1) - - self.assertAllClose(layer(inputs), expected_outputs, rtol=1e-5) - - def test_group_conv_depthwise(self): - if test.is_gpu_available(cuda_only=True): - with test_util.use_gpu(): - inputs = random_ops.random_uniform(shape=(3, 27, 27, 32)) - - layer = keras.layers.Conv2D(32, 3, groups=32) - layer.build((3, 27, 27, 32)) - - weights_dw = array_ops.reshape(layer.kernel, [3, 3, 32, 1]) - expected_outputs = nn.depthwise_conv2d( - inputs, weights_dw, strides=[1, 1, 1, 1], padding='VALID') - - self.assertAllClose(layer(inputs), expected_outputs, rtol=1e-5) - - @keras_parameterized.run_all_keras_modes class Conv1DTransposeTest(keras_parameterized.TestCase): diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 8cc428e6e53..de5be20aa84 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1063,12 +1063,12 @@ class Convolution(object): input_channels_dim = tensor_shape.dimension_at_index(input_shape, 1) spatial_dims = range(2, num_spatial_dims + 2) - filter_dim = tensor_shape.dimension_at_index(filter_shape, num_spatial_dims) - if not (input_channels_dim % filter_dim).is_compatible_with(0): - raise ValueError("The number of input channels is not divisible by the " - "corresponding number of output filters. 
Received: " - "input channels={}, output filters={}".format( - input_channels_dim, filter_dim)) + if not input_channels_dim.is_compatible_with( + filter_shape[num_spatial_dims]): + raise ValueError( + "number of input channels does not match corresponding dimension of " + "filter, {} != {}".format(input_channels_dim, + filter_shape[num_spatial_dims])) strides, dilation_rate = _get_strides_and_dilation_rate( num_spatial_dims, strides, dilation_rate) @@ -1857,9 +1857,9 @@ def conv2d_v2(input, # pylint: disable=redefined-builtin Must have `strides[0] = strides[3] = 1`. For the most common case of the same horizontal and vertical strides, `strides = [1, stride, stride, 1]`. - + Usage Example: - + >>> x_in = np.array([[ ... [[2], [1], [2], [0], [1]], ... [[1], [3], [2], [2], [3]], @@ -3279,7 +3279,7 @@ def softmax(logits, axis=None, name=None, dim=None): Tensor. RuntimeError: If a registered conversion function returns an invalid value. - + """ axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim) if axis is None: diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt index 3319122f50b..cb2f7f03e56 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt index 30736595ce5..9d847c759a1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', 
\'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt index 84527b26a39..68cbf32998e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt index ccb783c33bf..76d66200fbc 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt 
b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt index 179acdc7966..8d874ede685 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt index b7db8afd065..f97c7617dbd 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt index 3319122f50b..cb2f7f03e56 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', 
\'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt index 30736595ce5..9d847c759a1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt index 84527b26a39..68cbf32998e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', 
\'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt index ccb783c33bf..76d66200fbc 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt index 179acdc7966..8d874ede685 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt index b7db8afd065..f97c7617dbd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" From 482c01ff403af0e02420e74cdbb0a6f3c6388f28 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Mon, 11 May 2020 16:03:34 -0700 Subject: [PATCH 0352/1533] [XLA] Expose SPMD/MPMD as an option PiperOrigin-RevId: 311013124 Change-Id: Id357570af6996b41791fdf52d1704bee5ee56231 --- .../compiler/xla/client/executable_build_options.cc | 6 ++++++ .../compiler/xla/client/executable_build_options.h | 6 ++++++ tensorflow/compiler/xla/service/compile_only_service.cc | 1 + tensorflow/compiler/xla/service/compiler.h | 1 + tensorflow/compiler/xla/service/hlo_module.cc | 2 ++ tensorflow/compiler/xla/service/hlo_module_config.h | 9 +++++++++ tensorflow/compiler/xla/service/local_service.cc | 2 ++ tensorflow/compiler/xla/service/service.cc | 2 ++ tensorflow/compiler/xla/xla.proto | 4 ++++ 9 files changed, 33 insertions(+) diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index cd52e2f5e45..404f9eb7519 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -70,6 +70,12 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_num_partitions( return *this; } +ExecutableBuildOptions& ExecutableBuildOptions::set_use_spmd_partitioning( + bool use_spmd_partitioning) { + use_spmd_partitioning_ = use_spmd_partitioning; + return *this; +} + ExecutableBuildOptions& ExecutableBuildOptions::set_device_assignment( const DeviceAssignment& device_assignment) { device_assignment_ = device_assignment; diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index 360ad0260df..9a7fdd974b1 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -77,6 +77,11 @@ class ExecutableBuildOptions { int num_partitions() const { return num_partitions_; } ExecutableBuildOptions& set_num_partitions(int num_partitions); + // Indicates whether to use SPMD (true) or MPMD (false) partitioning when + // num_partitions > 1 and XLA is requested to partition the input program. 
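// Caller-side sketch (illustrative only, not part of this change): a client
// that already requests partitioning via num_partitions can opt into the SPMD
// partitioner as follows; the surrounding compile plumbing is assumed.
xla::ExecutableBuildOptions build_options;
build_options.set_num_replicas(1);
build_options.set_num_partitions(4);            // Partition across 4 devices.
build_options.set_use_spmd_partitioning(true);  // SPMD instead of MPMD.
// build_options is then passed to the usual LocalClient::Compile() call.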
+ bool use_spmd_partitioning() const { return use_spmd_partitioning_; } + ExecutableBuildOptions& set_use_spmd_partitioning(bool use_spmd_partitioning); + // If set, this specifies a static device assignment for the computation. // Otherwise, the computation will be compiled generically and can be run with // any device assignment compatible with the computation's replica and @@ -104,6 +109,7 @@ class ExecutableBuildOptions { se::DeviceMemoryAllocator* device_allocator_ = nullptr; int num_replicas_ = 1; int num_partitions_ = 1; + bool use_spmd_partitioning_ = false; absl::optional device_assignment_; bool alias_passthrough_params_ = false; }; diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index 8c76e912011..ce9c8a4ea62 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -91,6 +91,7 @@ CompileOnlyService::CompileAheadOfTime( TF_RETURN_IF_ERROR(options.static_device_assignment().Serialize( execution_options.mutable_device_assignment())); } + execution_options.set_use_spmd_partitioning(options.use_spmd_partitioning()); for (const AotXlaComputationInstance& instance : computations) { TF_RET_CHECK(instance.computation.has_host_program_shape()); *execution_options.mutable_shape_with_output_layout() = diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index cf646159a38..57b24e372e6 100644 --- a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -76,6 +76,7 @@ class AotCompilationOptions { virtual int64 replica_count() const { return 0; } virtual int64 num_cores() const { return 0; } + virtual bool use_spmd_partitioning() const { return false; } // Optional allocator that may be used for allocating temp space on the device // during compilation. diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index de65ed99303..9722d5c2b76 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -420,6 +420,8 @@ StatusOr HloModule::CreateModuleConfigFromShape( if (execution_options->num_partitions() > 0) { module_config.set_num_partitions(execution_options->num_partitions()); } + module_config.set_use_spmd_partitioning( + execution_options->use_spmd_partitioning()); if (execution_options->has_device_assignment()) { TF_ASSIGN_OR_RETURN(std::unique_ptr device_assignment, DeviceAssignment::Deserialize( diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index b31a9ae6ca5..61ea8392d94 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -128,6 +128,11 @@ class HloModuleConfig { } int64 num_partitions() const { return num_partitions_; } + void set_use_spmd_partitioning(bool use_spmd_partitioning) { + use_spmd_partitioning_ = use_spmd_partitioning; + } + bool use_spmd_partitioning() const { return use_spmd_partitioning_; } + // Return a string which unambiguously represents all the fields of this data // structure. Used for generating a cache key for storing the compiled // executable. @@ -216,6 +221,10 @@ class HloModuleConfig { // The number of partitions (model parallelism) to compile this binary for. 
int64 num_partitions_ = 1; + // Whether to use SPMD (true) or MPMD (false) when num_partitions_ > 0 and XLA + // needs to partition the module. + bool use_spmd_partitioning_ = false; + // The target maximum parallelism at which to partition HLOs for parallel // execution on the CPU backend. int64 intra_op_parallelism_threads_ = -1; diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index ef8ddfc1a76..c80646e0c70 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -112,6 +112,8 @@ ExecutionOptions CreateExecutionOptions( } execution_options.set_num_replicas(build_options.num_replicas()); execution_options.set_num_partitions(build_options.num_partitions()); + execution_options.set_use_spmd_partitioning( + build_options.use_spmd_partitioning()); if (build_options.has_device_assignment()) { TF_CHECK_OK(build_options.device_assignment().Serialize( execution_options.mutable_device_assignment())); diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index ab71c30dcae..2ed5e709d81 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -313,6 +313,8 @@ StatusOr> Service::CreateModuleConfig( if (execution_options->num_partitions() > 0) { config->set_num_partitions(execution_options->num_partitions()); } + config->set_use_spmd_partitioning( + execution_options->use_spmd_partitioning()); config->set_seed(execution_options->seed()); config->set_launch_id(execution_options->launch_id()); config->set_debug_options(execution_options->debug_options()); diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index a015af674af..f4b08f454b9 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -333,6 +333,10 @@ message ExecutionOptions { // Used to identify a set of programs that should be launch together. int32 launch_id = 10; + + // Indicates whether to use SPMD (true) or MPMD (false) partitioning when + // num_partitions > 1 and XLA is requested to partition the input program. + bool use_spmd_partitioning = 11; } message GetDeviceHandlesRequest { From f0ce18dea2ee13e2c405f2cb5983e79180b5cfb1 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Mon, 11 May 2020 16:14:14 -0700 Subject: [PATCH 0353/1533] Update the TensorFlow version properly. PiperOrigin-RevId: 311015192 Change-Id: Ice0f7b264b7903878402c989d0c4cb5fd03e205c --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index db436f3e511..48fc0ce2a78 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -21,7 +21,7 @@ limitations under the License. // Also update tensorflow/tensorflow.bzl and // tensorflow/tools/pip_package/setup.py #define TF_MAJOR_VERSION 2 -#define TF_MINOR_VERSION 1 +#define TF_MINOR_VERSION 2 #define TF_PATCH_VERSION 0 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1", From 27509ffb1e8c5300694cad64ed95c999245adac7 Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Mon, 11 May 2020 16:42:37 -0700 Subject: [PATCH 0354/1533] Change a couple bad variable types. 
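Presumably the getters involved (graph->inputs() and graph->outputs()) return their vectors by value, so binding the result to a const reference only lifetime-extends a temporary; holding it in a plain local makes the copy explicit. A standalone sketch of the pattern (illustrative, not TFLite code):

#include <vector>

std::vector<int> MakeIds() { return {1, 2, 3}; }  // Returns by value.

void Example() {
  // Legal (the temporary's lifetime is extended), but reads as if it aliased
  // state owned by someone else:
  const auto& ids_ref = MakeIds();
  // Clearer: this local plainly owns its own copy of the returned vector.
  const auto ids = MakeIds();
  (void)ids_ref;
  (void)ids;
}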
PiperOrigin-RevId: 311020378 Change-Id: Ib66822ad724af3048d0fadbcea699d89351fa89e --- tensorflow/lite/delegates/gpu/delegate.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/delegate.cc b/tensorflow/lite/delegates/gpu/delegate.cc index 58da8862937..4b6727e66e7 100644 --- a/tensorflow/lite/delegates/gpu/delegate.cc +++ b/tensorflow/lite/delegates/gpu/delegate.cc @@ -263,12 +263,12 @@ class DelegateKernel { input_refs->clear(); output_refs->clear(); - const auto& inputs = graph->inputs(); + const auto inputs = graph->inputs(); input_refs->reserve(inputs.size()); for (const auto& input : inputs) { input_refs->push_back(input->tensor.ref); } - const auto& outputs = graph->outputs(); + const auto outputs = graph->outputs(); output_refs->reserve(outputs.size()); for (const auto& output : outputs) { output_refs->push_back(output->tensor.ref); From abaffb8ad1ca8ddd35d1434e3a8dfe4c8d4b30ef Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Mon, 11 May 2020 16:54:05 -0700 Subject: [PATCH 0355/1533] Convert complex and unsigned integer tensors to and from dense elements attr PiperOrigin-RevId: 311022341 Change-Id: Ib1f5bd9f3a0a857e60c50cf999d4371c987c091d --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../mlir/tensorflow/utils/convert_tensor.cc | 32 +++++- .../tensorflow/utils/convert_tensor_test.cc | 101 +++++++++++------- 3 files changed, 96 insertions(+), 38 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 5c5b74a30ae..0edf0f33a23 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -823,6 +823,7 @@ cc_library( ":mangling_util", ":tensorflow_attributes", ":tensorflow_types", + "//tensorflow/compiler/xla:util", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index b492945fe8b..b28f26b6c3c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" @@ -132,13 +133,21 @@ StatusOr ConvertTensor(const Tensor& input_tensor, case DTYPE: \ return ConvertFlatTensor(input_tensor, type); - // TODO(fengliuai): customize the conversions for more types. + // TODO(fengliuai): customize the conversions for quantized and string types. switch (input_dtype) { CONVERT_FLAT(DT_BOOL, bool) CONVERT_FLAT(DT_FLOAT, float) CONVERT_FLAT(DT_DOUBLE, double) + CONVERT_FLAT(DT_INT8, int8) + CONVERT_FLAT(DT_INT16, int16) CONVERT_FLAT(DT_INT32, int32) CONVERT_FLAT(DT_INT64, int64) + CONVERT_FLAT(DT_UINT8, uint8) + CONVERT_FLAT(DT_UINT16, uint16) + CONVERT_FLAT(DT_UINT32, uint32) + CONVERT_FLAT(DT_UINT64, uint64) + CONVERT_FLAT(DT_COMPLEX64, std::complex) + CONVERT_FLAT(DT_COMPLEX128, std::complex) // BFLOAT16 is a special case that it needs to be cast to double type to // match its storage type. 
@@ -215,6 +224,15 @@ void ConvertStringElementsAttr( output->Add({val.data(), val.size()}); } +template +void ConvertComplexElementsAttr(const mlir::DenseElementsAttr attr, + protobuf::RepeatedField* output) { + for (const auto& val : attr.getValues>()) { + output->Add(val.real()); + output->Add(val.imag()); + } +} + // Converts an MLIR opaque elements attribute to a TensorFlow tensor proto. Status ConvertOpaqueElementsAttr(const ElementsAttr attr, TensorProto* output_tensor) { @@ -310,6 +328,12 @@ Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { ConvertIntElementsAttr(dense_attr.cast(), output->mutable_int_val()); break; + case DT_UINT32: + ConvertElementsAttr(dense_attr, output->mutable_uint32_val()); + break; + case DT_UINT64: + ConvertElementsAttr(dense_attr, output->mutable_uint64_val()); + break; case DT_INT64: ConvertElementsAttr(dense_attr, output->mutable_int64_val()); break; @@ -324,6 +348,12 @@ Status ConvertToTensorProto(const ElementsAttr attr, TensorProto* output) { ConvertStringElementsAttr(dense_attr.cast(), output->mutable_string_val()); break; + case DT_COMPLEX64: + ConvertComplexElementsAttr(dense_attr, output->mutable_scomplex_val()); + break; + case DT_COMPLEX128: + ConvertComplexElementsAttr(dense_attr, output->mutable_dcomplex_val()); + break; default: return errors::Unimplemented(absl::StrCat("Unimplemented data type ", DataTypeString(output_dtype))); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index d711c19baae..bf96e3d1df4 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include +#include #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -99,48 +100,74 @@ TEST(ConvertTypeToTensorTypeTest, ConvertStringTensor) { EXPECT_EQ(string_values[3], mlir::StringRef("four")); } -TEST(ConvertTypeToTensorTypeTest, Convert16BitFloats) { +class ConvertTensorTest : public ::testing::Test { + protected: + template + void VerifyConversion(std::initializer_list values, DataType dtype, + mlir::Type expected_ty) { + mlir::Builder b(expected_ty.getContext()); + Tensor tensor(dtype, TensorShape({static_cast(values.size())})); + tensor.flat().setValues(values); + + auto value_or = ConvertTensor(tensor, &b); + TF_ASSERT_OK(value_or.status()); + auto attr = value_or.ValueOrDie(); + + EXPECT_EQ(attr.getType().getElementType(), expected_ty); + + Tensor out; + TF_ASSERT_OK(ConvertToTensor(attr, &out)); + + test::ExpectTensorEqual(tensor, out); + } +}; + +TEST_F(ConvertTensorTest, Simple) { RegisterDialects(); + mlir::MLIRContext context; - mlir::Builder b(&context); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {Eigen::half(1.0)}, DT_HALF, mlir::FloatType::getF16(&context))); + ASSERT_NO_FATAL_FAILURE( + VerifyConversion({bfloat16(1.0), bfloat16(-1.0)}, DT_BFLOAT16, + mlir::FloatType::getBF16(&context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1.0, -1.0}, DT_FLOAT, mlir::FloatType::getF32(&context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1.0, -1.0}, DT_DOUBLE, mlir::FloatType::getF64(&context))); - { - // Create the sample tensor to convert. 
- Tensor tensor(DT_HALF, TensorShape({1})); - auto Tt = tensor.flat(); - Tt.setValues({Eigen::half(1.0)}); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT8, mlir::IntegerType::get(8, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT16, mlir::IntegerType::get(16, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT32, mlir::IntegerType::get(32, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, -1}, DT_INT64, mlir::IntegerType::get(64, &context))); - auto value_or = ConvertTensor(tensor, &b); - TF_EXPECT_OK(value_or.status()); - auto attr = value_or.ValueOrDie(); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT8, + mlir::IntegerType::get( + 8, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT16, + mlir::IntegerType::get( + 16, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT32, + mlir::IntegerType::get( + 32, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion( + {1, 2}, DT_UINT64, + mlir::IntegerType::get( + 64, mlir::IntegerType::SignednessSemantics::Unsigned, &context))); - EXPECT_TRUE(attr.isa()); - EXPECT_TRUE(attr.getType().getElementType().isF16()); - - Tensor out; - TF_ASSERT_OK(ConvertToTensor(attr, &out)); - - test::ExpectTensorEqual(tensor, out); - } - - { - // Create the sample tensor to convert. - Tensor tensor(DT_BFLOAT16, TensorShape({2})); - auto Tt = tensor.flat(); - Tt.setValues({bfloat16(1.0), bfloat16(-1.0)}); - - auto value_or = ConvertTensor(tensor, &b); - TF_EXPECT_OK(value_or.status()); - auto attr = value_or.ValueOrDie(); - - EXPECT_TRUE(attr.isa()); - EXPECT_TRUE(attr.getType().getElementType().isBF16()); - - Tensor out; - TF_ASSERT_OK(ConvertToTensor(attr, &out)); - - test::ExpectTensorEqual(tensor, out); - } + ASSERT_NO_FATAL_FAILURE(VerifyConversion>( + {{0.0, 1.0}, {1.0, 0.0}}, DT_COMPLEX64, + mlir::ComplexType::get(mlir::FloatType::getF32(&context)))); + ASSERT_NO_FATAL_FAILURE(VerifyConversion>( + {{0.0, 1.0}, {1.0, 0.0}}, DT_COMPLEX128, + mlir::ComplexType::get(mlir::FloatType::getF64(&context)))); } } // namespace From ede0f7d719e8f238c6b7b7b5efeddae441fa95bc Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Mon, 11 May 2020 17:04:50 -0700 Subject: [PATCH 0356/1533] Auto-generate ConjugateTranpose TensorFlow op PiperOrigin-RevId: 311024330 Change-Id: Ie2e6811816beb64ceef43129987ad8cbc71e01ec --- .../mlir/tensorflow/ir/tf_generated_ops.td | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 2092a75a519..9a29fa4f8a9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1446,6 +1446,30 @@ tf.conj(input) ==> [-2.25 - 4.75j, 3.25 - 5.75j] let hasCanonicalizer = 1; } +def TF_ConjugateTransposeOp : TF_Op<"ConjugateTranspose", [NoSideEffect]> { + let summary = [{ +Shuffle dimensions of x according to a permutation and conjugate the result. + }]; + + let description = [{ +The output `y` has the same rank as `x`. 
The shapes of `x` and `y` satisfy: + `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]` + `y[i,j,k,...,s,t,u] == conj(x[perm[i], perm[j], perm[k],...,perm[s], perm[t], perm[u]])` + }]; + + let arguments = (ins + TF_Tensor:$x, + TF_I32OrI64Tensor:$perm + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tperm = TF_DerivedOperandTypeAttr<1>; +} + def TF_Conv2DOp : TF_Op<"Conv2D", [NoSideEffect, TF_LayoutSensitiveInterface]> { let summary = [{ Computes a 2-D convolution given 4-D `input` and `filter` tensors. From 8a512d33b71746cbc17f5f3af5900caa821f68b8 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Mon, 11 May 2020 17:09:19 -0700 Subject: [PATCH 0357/1533] Enable compiler tests disabled because of incomplete complex types support Removed ResultsAreFloatLike from HLO_TanhOp as it does support complex types PiperOrigin-RevId: 311025151 Change-Id: I48ecf138d164dae4b4decf32c0ab24c1b0866b77 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 +- .../mlir/xla/transforms/legalize_tf_with_tf2xla.cc | 3 +++ tensorflow/compiler/tests/binary_ops_test.py | 10 ---------- tensorflow/compiler/tests/ternary_ops_test.py | 1 - tensorflow/compiler/tests/unary_ops_test.py | 3 +-- 5 files changed, 5 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 917a50f74ea..0db9563a4c1 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -229,7 +229,7 @@ def HLO_SqrtOp: HLO_UnaryElementwiseOp<"sqrt", BASE_HLO_SqrtOp; def HLO_TanhOp: HLO_UnaryElementwiseOp<"tanh", - [ResultsAreFloatLike, NoSideEffect, SameOperandsAndResultType], + [NoSideEffect, SameOperandsAndResultType], HLO_FpOrComplexTensor>, BASE_HLO_TanhOp; //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index be6ba167419..8b663af3f9e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -102,6 +102,7 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -135,9 +136,11 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index bd0131920f9..00ed6d83e2e 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -1096,8 +1096,6 @@ class BinaryOpsTest(xla_test.XLATestCase): x, expected=np.matmul(x, x.transpose([0, 1, 3, 2]))) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testExpandDims(self): for dtype in self.numeric_types: self._testBinary( @@ -1195,8 +1193,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.full([1, 1, 3, 5], 3., dtype=np.float32), expected=np.full([4, 5, 1, 2, 5], 18., dtype=np.float32)) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testPad(self): for dtype, pad_type in itertools.product( self.numeric_types, [np.int32, 
np.int64]): @@ -1337,8 +1333,6 @@ class BinaryOpsTest(xla_test.XLATestCase): ], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testReshape(self): for dtype in self.numeric_types: self._testBinary( @@ -1471,8 +1465,6 @@ class BinaryOpsTest(xla_test.XLATestCase): [1, 2]], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testTranspose(self): for dtype in self.numeric_types: self._testBinary( @@ -1491,8 +1483,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([1, 0], dtype=np.int32), expected=np.array([[1, 3], [2, 4]], dtype=dtype)) - @test_util.disable_mlir_bridge( - "TODO(b/155097273): Handle complex dtype constants") def testConjugateTranspose(self): for dtype in self.complex_types: self._testBinary( diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index ff3558729e5..4525e5ffed7 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -182,7 +182,6 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): np.array([8, 9], dtype=dtype), expected=np.array([[7, 9], [8, 7], [8, 9]], dtype=dtype)) - @test_util.disable_mlir_bridge('TODO(b/155097273)') def testSlice(self): for dtype in self.numeric_types: self._testTernary( diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index dc11c24b6d2..3e36f67615b 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -601,8 +601,7 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([-1, -0.5, 0, 0.3], dtype=dtype), expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype)) - @test_util.disable_mlir_bridge( - "Complex types not supported in CreateDenseElementsAttrFromLiteral") + @test_util.disable_mlir_bridge("TODO(b/156135423): Fix ConvertSigmoidOp") def testComplexOps(self): for dtype in self.complex_types: From 066a31872fddda3d82241f84b18643b9cc8e8bf6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 17:11:48 -0700 Subject: [PATCH 0358/1533] Fix Data type class/method naming collision. 
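The collision can be reproduced in isolation. A minimal sketch with simplified stand-in
enums, not the real TFLite GPU declarations:

    enum DataType { kFloat16, kFloat32 };
    enum TensorStorageType { kBuffer, kTexture2D };

    // Before: the accessor shares its name with the enum, so inside the class the
    // plain name DataType refers to the method and the type has to be spelled with
    // the elaborated `enum DataType` form.
    class TensorBefore {
     public:
      enum DataType DataType() const { return data_type_; }
      TensorStorageType StorageType() const { return storage_type_; }

     private:
      enum DataType data_type_ = kFloat32;
      TensorStorageType storage_type_ = kBuffer;
    };

    // After: renamed accessors leave the type names unambiguous everywhere.
    class TensorAfter {
     public:
      DataType GetDataType() const { return data_type_; }
      TensorStorageType GetStorageType() const { return storage_type_; }

     private:
      DataType data_type_ = kFloat32;
      TensorStorageType storage_type_ = kBuffer;
    };
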
PiperOrigin-RevId: 311025587 Change-Id: Ib04670cd2ea0cde44d6a15721609e766c92392ae --- tensorflow/lite/delegates/gpu/cl/api.cc | 10 +++++----- tensorflow/lite/delegates/gpu/cl/tensor.cc | 2 +- tensorflow/lite/delegates/gpu/cl/tensor.h | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/api.cc b/tensorflow/lite/delegates/gpu/cl/api.cc index 7ffb5604d83..475eed4dccc 100644 --- a/tensorflow/lite/delegates/gpu/cl/api.cc +++ b/tensorflow/lite/delegates/gpu/cl/api.cc @@ -352,10 +352,10 @@ class GlBufferHolder : public TensorTie { }; TensorObject TensorToObj(const Tensor& tensor) { - if (tensor.StorageType() == TensorStorageType::BUFFER) { + if (tensor.GetStorageType() == TensorStorageType::BUFFER) { return OpenClBuffer{tensor.GetMemoryPtr()}; } - if (tensor.StorageType() == TensorStorageType::IMAGE_BUFFER) { + if (tensor.GetStorageType() == TensorStorageType::IMAGE_BUFFER) { return OpenClBuffer{tensor.GetMemoryPtrForWriting()}; } return OpenClTexture{tensor.GetMemoryPtr()}; @@ -516,9 +516,9 @@ TensorObjectDef TensorToDef(const Tensor& tensor) { def.dimensions.h = tensor.Height(); def.dimensions.w = tensor.Width(); def.dimensions.c = tensor.Channels(); - def.object_def.data_layout = ToDataLayout(tensor.StorageType()); - def.object_def.data_type = tensor.DataType(); - def.object_def.object_type = ToObjectType(tensor.StorageType()); + def.object_def.data_layout = ToDataLayout(tensor.GetStorageType()); + def.object_def.data_type = tensor.GetDataType(); + def.object_def.object_type = ToObjectType(tensor.GetStorageType()); def.object_def.user_provided = false; return def; } diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.cc b/tensorflow/lite/delegates/gpu/cl/tensor.cc index f01975e2347..4a52508af0e 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor.cc @@ -29,7 +29,7 @@ namespace cl { namespace { absl::Status CreateImageBufferFromBuffer(const CLContext& context, - cl_mem memory, enum DataType data_type, + cl_mem memory, DataType data_type, int width, cl_mem* result) { cl_image_format format; cl_image_desc desc; diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.h b/tensorflow/lite/delegates/gpu/cl/tensor.h index d59ef838888..cb7d4263a5c 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.h +++ b/tensorflow/lite/delegates/gpu/cl/tensor.h @@ -75,8 +75,8 @@ class Tensor { int4 GetWHSB() const { return int4(shape_.w, shape_.h, Slices(), shape_.b); } int4 GetWHDS() const { return int4(shape_.w, shape_.h, shape_.d, Slices()); } - enum DataType DataType() const { return descriptor_.data_type; } - TensorStorageType StorageType() const { return descriptor_.storage_type; } + DataType GetDataType() const { return descriptor_.data_type; } + TensorStorageType GetStorageType() const { return descriptor_.storage_type; } // for profiling and memory statistics uint64_t GetMemorySizeInBytes() const; From 949ce47d99051c721ad4b062fd126d6d50dc185a Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Mon, 11 May 2020 17:20:43 -0700 Subject: [PATCH 0359/1533] Release the GIL in file_io operations and reacquire it for error handling or Python specific operations. 
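A minimal self-contained sketch of the guard pattern applied throughout this change; the
module name and the `DoBlockingIo` helper are invented for illustration, and only the
`gil_scoped_release`/`gil_scoped_acquire` usage mirrors the bindings below:

    #include <pybind11/pybind11.h>
    #include <string>

    namespace py = pybind11;

    // Stand-in for a blocking filesystem call; it must not touch the Python API.
    std::string DoBlockingIo(const std::string& path) {
      return "contents of " + path;
    }

    PYBIND11_MODULE(_example_io, m) {
      m.def("read", [](const std::string& path) {
        py::gil_scoped_release release;   // let other Python threads run during I/O
        std::string data = DoBlockingIo(path);
        py::gil_scoped_acquire acquire;   // take the GIL back before creating Python objects
        return py::bytes(data);
      });
    }

The error path follows the same rule: the new MaybeRaiseRegisteredFromStatusWithGIL
reacquires the GIL itself before setting the Python exception, so call sites can keep the
GIL released until Python state is actually touched.
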
PiperOrigin-RevId: 311027099 Change-Id: I5822df0b598f92efd14ea7ca4858b1a6181862d8 --- tensorflow/python/lib/core/pybind11_status.h | 14 ++++ tensorflow/python/lib/io/file_io_wrapper.cc | 78 +++++++++++++++----- 2 files changed, 72 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/lib/core/pybind11_status.h b/tensorflow/python/lib/core/pybind11_status.h index feb974798de..3f9991c6577 100644 --- a/tensorflow/python/lib/core/pybind11_status.h +++ b/tensorflow/python/lib/core/pybind11_status.h @@ -69,6 +69,20 @@ inline void MaybeRaiseRegisteredFromStatus(const tensorflow::Status& status) { } } +inline void MaybeRaiseRegisteredFromStatusWithGIL( + const tensorflow::Status& status) { + if (!status.ok()) { + // Acquire GIL for throwing exception. + pybind11::gil_scoped_acquire acquire; + + PyErr_SetObject(PyExceptionRegistry::Lookup(status.code()), + pybind11::make_tuple(pybind11::none(), pybind11::none(), + status.error_message()) + .ptr()); + throw pybind11::error_already_set(); + } +} + inline void MaybeRaiseFromTFStatus(TF_Status* status) { TF_Code code = TF_GetCode(status); if (code != TF_OK) { diff --git a/tensorflow/python/lib/io/file_io_wrapper.cc b/tensorflow/python/lib/io/file_io_wrapper.cc index de806a9c969..0a2410b69e1 100644 --- a/tensorflow/python/lib/io/file_io_wrapper.cc +++ b/tensorflow/python/lib/io/file_io_wrapper.cc @@ -42,50 +42,65 @@ PYBIND11_MODULE(_pywrap_file_io, m) { py::gil_scoped_release release; status = tensorflow::Env::Default()->FileExists(filename); } - tensorflow::MaybeRaiseRegisteredFromStatus(status); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }); m.def("DeleteFile", [](const std::string& filename) { - tensorflow::MaybeRaiseRegisteredFromStatus( - tensorflow::Env::Default()->DeleteFile(filename)); + py::gil_scoped_release release; + tensorflow::Status status = + tensorflow::Env::Default()->DeleteFile(filename); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }); m.def("ReadFileToString", [](const std::string& filename) { std::string data; + py::gil_scoped_release release; const auto status = ReadFileToString(tensorflow::Env::Default(), filename, &data); + pybind11::gil_scoped_acquire acquire; tensorflow::MaybeRaiseRegisteredFromStatus(status); return py::bytes(data); }); m.def("WriteStringToFile", [](const std::string& filename, tensorflow::StringPiece data) { - return WriteStringToFile(tensorflow::Env::Default(), filename, data); + py::gil_scoped_release release; + const auto status = + WriteStringToFile(tensorflow::Env::Default(), filename, data); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }); m.def("GetChildren", [](const std::string& dirname) { std::vector results; + py::gil_scoped_release release; const auto status = tensorflow::Env::Default()->GetChildren(dirname, &results); + pybind11::gil_scoped_acquire acquire; tensorflow::MaybeRaiseRegisteredFromStatus(status); return results; }); m.def("GetMatchingFiles", [](const std::string& pattern) { std::vector results; + py::gil_scoped_release release; const auto status = tensorflow::Env::Default()->GetMatchingPaths(pattern, &results); + pybind11::gil_scoped_acquire acquire; tensorflow::MaybeRaiseRegisteredFromStatus(status); return results; }); m.def("CreateDir", [](const std::string& dirname) { + py::gil_scoped_release release; const auto status = tensorflow::Env::Default()->CreateDir(dirname); if (tensorflow::errors::IsAlreadyExists(status)) { return; } - tensorflow::MaybeRaiseRegisteredFromStatus(status); + 
tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }); m.def("RecursivelyCreateDir", [](const std::string& dirname) { - tensorflow::MaybeRaiseRegisteredFromStatus( - tensorflow::Env::Default()->RecursivelyCreateDir(dirname)); + py::gil_scoped_release release; + const auto status = + tensorflow::Env::Default()->RecursivelyCreateDir(dirname); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }); m.def("CopyFile", [](const std::string& src, const std::string& target, bool overwrite) { + py::gil_scoped_release release; auto* env = tensorflow::Env::Default(); tensorflow::Status status; if (!overwrite && env->FileExists(target).ok()) { @@ -93,10 +108,11 @@ PYBIND11_MODULE(_pywrap_file_io, m) { } else { status = env->CopyFile(src, target); } - tensorflow::MaybeRaiseRegisteredFromStatus(status); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }); m.def("RenameFile", [](const std::string& src, const std::string& target, bool overwrite) { + py::gil_scoped_release release; auto* env = tensorflow::Env::Default(); tensorflow::Status status; if (!overwrite && env->FileExists(target).ok()) { @@ -104,9 +120,10 @@ PYBIND11_MODULE(_pywrap_file_io, m) { } else { status = env->RenameFile(src, target); } - tensorflow::MaybeRaiseRegisteredFromStatus(status); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }); m.def("DeleteRecursively", [](const std::string& dirname) { + py::gil_scoped_release release; tensorflow::int64 undeleted_files; tensorflow::int64 undeleted_dirs; auto status = tensorflow::Env::Default()->DeleteRecursively( @@ -115,23 +132,25 @@ PYBIND11_MODULE(_pywrap_file_io, m) { status = tensorflow::errors::PermissionDenied("could not fully delete dir"); } - tensorflow::MaybeRaiseRegisteredFromStatus(status); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }); m.def("IsDirectory", [](const std::string& dirname) { + py::gil_scoped_release release; const auto status = tensorflow::Env::Default()->IsDirectory(dirname); // FAILED_PRECONDITION response means path exists but isn't a dir. if (tensorflow::errors::IsFailedPrecondition(status)) { return false; } - tensorflow::MaybeRaiseRegisteredFromStatus(status); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); return true; }); m.def("HasAtomicMove", [](const std::string& path) { + py::gil_scoped_release release; bool has_atomic_move; const auto status = tensorflow::Env::Default()->HasAtomicMove(path, &has_atomic_move); - tensorflow::MaybeRaiseRegisteredFromStatus(status); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); return has_atomic_move; }); @@ -141,9 +160,11 @@ PYBIND11_MODULE(_pywrap_file_io, m) { .def_readonly("is_directory", &tensorflow::FileStatistics::is_directory); m.def("Stat", [](const std::string& filename) { + py::gil_scoped_release release; std::unique_ptr self( new tensorflow::FileStatistics); const auto status = tensorflow::Env::Default()->Stat(filename, self.get()); + py::gil_scoped_acquire acquire; tensorflow::MaybeRaiseRegisteredFromStatus(status); return self.release(); }); @@ -151,66 +172,83 @@ PYBIND11_MODULE(_pywrap_file_io, m) { using tensorflow::WritableFile; py::class_(m, "WritableFile") .def(py::init([](const std::string& filename, const std::string& mode) { + py::gil_scoped_release release; auto* env = tensorflow::Env::Default(); std::unique_ptr self; const auto status = mode.find("a") == std::string::npos ? 
env->NewWritableFile(filename, &self) : env->NewAppendableFile(filename, &self); + py::gil_scoped_acquire acquire; tensorflow::MaybeRaiseRegisteredFromStatus(status); return self.release(); })) .def("append", [](WritableFile* self, tensorflow::StringPiece data) { - tensorflow::MaybeRaiseRegisteredFromStatus(self->Append(data)); + const auto status = self->Append(data); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); }) // TODO(slebedev): Make WritableFile::Tell const and change self // to be a reference. .def("tell", [](WritableFile* self) { tensorflow::int64 pos = -1; + py::gil_scoped_release release; const auto status = self->Tell(&pos); - tensorflow::MaybeRaiseRegisteredFromStatus(status); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); return pos; }) .def("flush", [](WritableFile* self) { - tensorflow::MaybeRaiseRegisteredFromStatus(self->Flush()); + py::gil_scoped_release release; + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(self->Flush()); }) .def("close", [](WritableFile* self) { - tensorflow::MaybeRaiseRegisteredFromStatus(self->Close()); + py::gil_scoped_release release; + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(self->Close()); }); using tensorflow::io::BufferedInputStream; py::class_(m, "BufferedInputStream") .def(py::init([](const std::string& filename, size_t buffer_size) { + py::gil_scoped_release release; std::unique_ptr file; const auto status = tensorflow::Env::Default()->NewRandomAccessFile(filename, &file); - tensorflow::MaybeRaiseRegisteredFromStatus(status); + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(status); std::unique_ptr input_stream( new tensorflow::io::RandomAccessInputStream(file.release(), /*owns_file=*/true)); + py::gil_scoped_acquire acquire; return new BufferedInputStream(input_stream.release(), buffer_size, /*owns_input_stream=*/true); })) .def("read", [](BufferedInputStream* self, tensorflow::int64 bytes_to_read) { + py::gil_scoped_release release; tensorflow::tstring result; const auto status = self->ReadNBytes(bytes_to_read, &result); if (!status.ok() && !tensorflow::errors::IsOutOfRange(status)) { result.clear(); tensorflow::MaybeRaiseRegisteredFromStatus(status); } + py::gil_scoped_acquire acquire; return py::bytes(result); }) .def("readline", [](BufferedInputStream* self) { - return py::bytes(self->ReadLineAsString()); + py::gil_scoped_release release; + auto output = self->ReadLineAsString(); + py::gil_scoped_acquire acquire; + return py::bytes(output); }) .def("seek", [](BufferedInputStream* self, tensorflow::int64 pos) { - tensorflow::MaybeRaiseRegisteredFromStatus(self->Seek(pos)); + py::gil_scoped_release release; + tensorflow::MaybeRaiseRegisteredFromStatusWithGIL(self->Seek(pos)); }) - .def("tell", [](BufferedInputStream* self) { return self->Tell(); }); + .def("tell", [](BufferedInputStream* self) { + py::gil_scoped_release release; + return self->Tell(); + }); } } // namespace From 22a24beeeebbba045e107174c9e110bd07ba6f24 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Mon, 11 May 2020 17:28:54 -0700 Subject: [PATCH 0360/1533] Add a TF-internal visibility declaration for the parallel device PiperOrigin-RevId: 311028466 Change-Id: Ic19ed07c49b796c94e0fd3370aa0cf5c83fe3fd6 --- tensorflow/c/eager/parallel_device/BUILD | 1 + tensorflow/python/distribute/parallel_device/BUILD | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index f4dbcc6cead..92947be79f8 100644 --- a/tensorflow/c/eager/parallel_device/BUILD 
+++ b/tensorflow/c/eager/parallel_device/BUILD @@ -27,6 +27,7 @@ cc_library( name = "parallel_device", srcs = [":sources"], hdrs = [":headers"], + visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/c:c_api", "//tensorflow/c/eager:c_api", diff --git a/tensorflow/python/distribute/parallel_device/BUILD b/tensorflow/python/distribute/parallel_device/BUILD index e7526a56f66..43c6029f3d2 100644 --- a/tensorflow/python/distribute/parallel_device/BUILD +++ b/tensorflow/python/distribute/parallel_device/BUILD @@ -1,4 +1,5 @@ package( + default_visibility = ["//tensorflow:internal"], licenses = ["notice"], # Apache 2.0 ) From b53ed4d560aaeb7a92185f4fbf2562e5e274456a Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Mon, 11 May 2020 17:30:14 -0700 Subject: [PATCH 0361/1533] Fix the ImageDataGenerator methods to return Keras sequences instead of just generators. This makes it so that Keras `fit` avoids an infinite loop when users pass the results of ImageDataGenerator.flow* directly to fit/evaluate/predict. PiperOrigin-RevId: 311028701 Change-Id: Ia5c3b01b3c8fa6b842bddb881ced64e4b89fe2ba --- tensorflow/python/keras/preprocessing/BUILD | 1 + .../python/keras/preprocessing/image.py | 415 ++++++++++++++++++ .../python/keras/preprocessing/image_test.py | 12 +- 3 files changed, 427 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/preprocessing/BUILD b/tensorflow/python/keras/preprocessing/BUILD index 403bc6e4808..24260fb71db 100644 --- a/tensorflow/python/keras/preprocessing/BUILD +++ b/tensorflow/python/keras/preprocessing/BUILD @@ -85,6 +85,7 @@ tf_py_test( deps = [ ":image", "//tensorflow/python:client_testlib", + "//tensorflow/python/keras", "//third_party/py/numpy", ], ) diff --git a/tensorflow/python/keras/preprocessing/image.py b/tensorflow/python/keras/preprocessing/image.py index aa4801504cc..953962c7771 100644 --- a/tensorflow/python/keras/preprocessing/image.py +++ b/tensorflow/python/keras/preprocessing/image.py @@ -14,6 +14,7 @@ # ============================================================================== # pylint: disable=invalid-name # pylint: disable=g-import-not-at-top +# pylint: disable=g-classes-have-attributes """Set of tools for real-time data augmentation on image data. """ from __future__ import absolute_import @@ -35,6 +36,7 @@ from tensorflow.python.keras.utils import data_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import image_ops from tensorflow.python.ops import math_ops +from tensorflow.python.platform import tf_logging from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import keras_export @@ -459,6 +461,123 @@ class NumpyArrayIterator(image.NumpyArrayIterator, Iterator): **kwargs) +class DataFrameIterator(image.DataFrameIterator, Iterator): + """Iterator capable of reading images from a directory on disk as a dataframe. + + Arguments: + dataframe: Pandas dataframe containing the filepaths relative to + `directory` (or absolute paths if `directory` is None) of the images in + a string column. It should include other column/s + depending on the `class_mode`: - if `class_mode` is `"categorical"` + (default value) it must include the `y_col` column with the class/es + of each image. Values in column can be string/list/tuple if a single + class or list/tuple if multiple classes. - if `class_mode` is + `"binary"` or `"sparse"` it must include the given `y_col` column + with class values as strings. 
- if `class_mode` is `"raw"` or + `"multi_output"` it should contain the columns specified in `y_col`. + - if `class_mode` is `"input"` or `None` no extra column is needed. + directory: string, path to the directory to read images from. If `None`, + data in `x_col` column should be absolute paths. + image_data_generator: Instance of `ImageDataGenerator` to use for random + transformations and normalization. If None, no transformations and + normalizations are made. + x_col: string, column in `dataframe` that contains the filenames (or + absolute paths if `directory` is `None`). + y_col: string or list, column/s in `dataframe` that has the target data. + weight_col: string, column in `dataframe` that contains the sample + weights. Default: `None`. + target_size: tuple of integers, dimensions to resize input images to. + color_mode: One of `"rgb"`, `"rgba"`, `"grayscale"`. Color mode to read + images. + classes: Optional list of strings, classes to use (e.g. `["dogs", + "cats"]`). If None, all classes in `y_col` will be used. + class_mode: one of "binary", "categorical", "input", "multi_output", + "raw", "sparse" or None. Default: "categorical". + Mode for yielding the targets: + - `"binary"`: 1D numpy array of binary labels, + - `"categorical"`: 2D numpy array of one-hot encoded labels. Supports + multi-label output. + - `"input"`: images identical to input images (mainly used to work + with autoencoders), + - `"multi_output"`: list with the values of the different columns, + - `"raw"`: numpy array of values in `y_col` column(s), + - `"sparse"`: 1D numpy array of integer labels, - `None`, no targets + are returned (the generator will only yield batches of image data, + which is useful to use in `model.predict_generator()`). + batch_size: Integer, size of a batch. + shuffle: Boolean, whether to shuffle the data between epochs. + seed: Random seed for data shuffling. + data_format: String, one of `channels_first`, `channels_last`. + save_to_dir: Optional directory where to save the pictures being yielded, + in a viewable format. This is useful for visualizing the random + transformations being applied, for debugging purposes. + save_prefix: String prefix to use for saving sample images (if + `save_to_dir` is set). + save_format: Format to use for saving sample images (if `save_to_dir` is + set). + subset: Subset of data (`"training"` or `"validation"`) if + validation_split is set in ImageDataGenerator. + interpolation: Interpolation method used to resample the image if the + target size is different from that of the loaded image. Supported + methods are "nearest", "bilinear", and "bicubic". If PIL version 1.1.3 + or newer is installed, "lanczos" is also supported. If PIL version 3.4.0 + or newer is installed, "box" and "hamming" are also supported. By + default, "nearest" is used. + dtype: Dtype to use for the generated arrays. + validate_filenames: Boolean, whether to validate image filenames in + `x_col`. If `True`, invalid images will be ignored. Disabling this + option + can lead to speed-up in the instantiation of this class. Default: `True`. 
+ """ + + def __init__( + self, + dataframe, + directory=None, + image_data_generator=None, + x_col='filename', + y_col='class', + weight_col=None, + target_size=(256, 256), + color_mode='rgb', + classes=None, + class_mode='categorical', + batch_size=32, + shuffle=True, + seed=None, + data_format='channels_last', + save_to_dir=None, + save_prefix='', + save_format='png', + subset=None, + interpolation='nearest', + dtype='float32', + validate_filenames=True): + super(DataFrameIterator, self).__init__( + dataframe=dataframe, + directory=directory, + image_data_generator=image_data_generator, + x_col=x_col, + y_col=y_col, + weight_col=weight_col, + target_size=target_size, + color_mode=color_mode, + classes=classes, + class_mode=class_mode, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + data_format=data_format, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + subset=subset, + interpolation=interpolation, + dtype=dtype, + validate_filenames=validate_filenames + ) + + @keras_export('keras.preprocessing.image.ImageDataGenerator') class ImageDataGenerator(image.ImageDataGenerator): """Generate batches of tensor image data with real-time data augmentation. @@ -686,6 +805,302 @@ class ImageDataGenerator(image.ImageDataGenerator): validation_split=validation_split, **kwargs) + def flow(self, + x, + y=None, + batch_size=32, + shuffle=True, + sample_weight=None, + seed=None, + save_to_dir=None, + save_prefix='', + save_format='png', + subset=None): + """Takes data & label arrays, generates batches of augmented data. + + Arguments: + x: Input data. Numpy array of rank 4 or a tuple. If tuple, the first + element should contain the images and the second element another numpy + array or a list of numpy arrays that gets passed to the output without + any modifications. Can be used to feed the model miscellaneous data + along with the images. In case of grayscale data, the channels axis of + the image array should have value 1, in case of RGB data, it should + have value 3, and in case of RGBA data, it should have value 4. + y: Labels. + batch_size: Int (default: 32). + shuffle: Boolean (default: True). + sample_weight: Sample weights. + seed: Int (default: None). + save_to_dir: None or str (default: None). This allows you to optionally + specify a directory to which to save the augmented pictures being + generated (useful for visualizing what you are doing). + save_prefix: Str (default: `''`). Prefix to use for filenames of saved + pictures (only relevant if `save_to_dir` is set). + save_format: one of "png", "jpeg" + (only relevant if `save_to_dir` is set). Default: "png". + subset: Subset of data (`"training"` or `"validation"`) if + `validation_split` is set in `ImageDataGenerator`. + + Returns: + An `Iterator` yielding tuples of `(x, y)` + where `x` is a numpy array of image data + (in the case of a single image input) or a list + of numpy arrays (in the case with + additional inputs) and `y` is a numpy array + of corresponding labels. If 'sample_weight' is not None, + the yielded tuples are of the form `(x, y, sample_weight)`. + If `y` is None, only the numpy array `x` is returned. 
+ """ + return NumpyArrayIterator( + x, + y, + self, + batch_size=batch_size, + shuffle=shuffle, + sample_weight=sample_weight, + seed=seed, + data_format=self.data_format, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + subset=subset) + + def flow_from_directory(self, + directory, + target_size=(256, 256), + color_mode='rgb', + classes=None, + class_mode='categorical', + batch_size=32, + shuffle=True, + seed=None, + save_to_dir=None, + save_prefix='', + save_format='png', + follow_links=False, + subset=None, + interpolation='nearest'): + """Takes the path to a directory & generates batches of augmented data. + + Arguments: + directory: string, path to the target directory. It should contain one + subdirectory per class. Any PNG, JPG, BMP, PPM or TIF images inside + each of the subdirectories directory tree will be included in the + generator. See [this script]( + https://gist.github.com/fchollet/0830affa1f7f19fd47b06d4cf89ed44d) + for more details. + target_size: Tuple of integers `(height, width)`, defaults to `(256, + 256)`. The dimensions to which all images found will be resized. + color_mode: One of "grayscale", "rgb", "rgba". Default: "rgb". Whether + the images will be converted to have 1, 3, or 4 channels. + classes: Optional list of class subdirectories + (e.g. `['dogs', 'cats']`). Default: None. If not provided, the list + of classes will be automatically inferred from the subdirectory + names/structure under `directory`, where each subdirectory will be + treated as a different class (and the order of the classes, which + will map to the label indices, will be alphanumeric). The + dictionary containing the mapping from class names to class + indices can be obtained via the attribute `class_indices`. + class_mode: One of "categorical", "binary", "sparse", + "input", or None. Default: "categorical". + Determines the type of label arrays that are returned: - + "categorical" will be 2D one-hot encoded labels, - "binary" will + be 1D binary labels, "sparse" will be 1D integer labels, - "input" + will be images identical to input images (mainly used to work with + autoencoders). - If None, no labels are returned (the generator + will only yield batches of image data, which is useful to use with + `model.predict_generator()`). Please note that in case of + class_mode None, the data still needs to reside in a subdirectory + of `directory` for it to work correctly. + batch_size: Size of the batches of data (default: 32). + shuffle: Whether to shuffle the data (default: True) If set to False, + sorts the data in alphanumeric order. + seed: Optional random seed for shuffling and transformations. + save_to_dir: None or str (default: None). This allows you to optionally + specify a directory to which to save the augmented pictures being + generated (useful for visualizing what you are doing). + save_prefix: Str. Prefix to use for filenames of saved pictures (only + relevant if `save_to_dir` is set). + save_format: One of "png", "jpeg" + (only relevant if `save_to_dir` is set). Default: "png". + follow_links: Whether to follow symlinks inside + class subdirectories (default: False). + subset: Subset of data (`"training"` or `"validation"`) if + `validation_split` is set in `ImageDataGenerator`. + interpolation: Interpolation method used to resample the image if the + target size is different from that of the loaded image. Supported + methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. 
If PIL version + 1.1.3 or newer is installed, `"lanczos"` is also supported. If PIL + version 3.4.0 or newer is installed, `"box"` and `"hamming"` are also + supported. By default, `"nearest"` is used. + + Returns: + A `DirectoryIterator` yielding tuples of `(x, y)` + where `x` is a numpy array containing a batch + of images with shape `(batch_size, *target_size, channels)` + and `y` is a numpy array of corresponding labels. + """ + return DirectoryIterator( + directory, + self, + target_size=target_size, + color_mode=color_mode, + classes=classes, + class_mode=class_mode, + data_format=self.data_format, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + follow_links=follow_links, + subset=subset, + interpolation=interpolation) + + def flow_from_dataframe(self, + dataframe, + directory=None, + x_col='filename', + y_col='class', + weight_col=None, + target_size=(256, 256), + color_mode='rgb', + classes=None, + class_mode='categorical', + batch_size=32, + shuffle=True, + seed=None, + save_to_dir=None, + save_prefix='', + save_format='png', + subset=None, + interpolation='nearest', + validate_filenames=True, + **kwargs): + """Takes the dataframe and the path to a directory + generates batches. + + The generated batches contain augmented/normalized data. + + **A simple tutorial can be found **[here]( + http://bit.ly/keras_flow_from_dataframe). + + Arguments: + dataframe: Pandas dataframe containing the filepaths relative to + `directory` (or absolute paths if `directory` is None) of the images + in a string column. It should include other column/s + depending on the `class_mode`: - if `class_mode` is `"categorical"` + (default value) it must include the `y_col` column with the + class/es of each image. Values in column can be string/list/tuple + if a single class or list/tuple if multiple classes. - if + `class_mode` is `"binary"` or `"sparse"` it must include the given + `y_col` column with class values as strings. - if `class_mode` is + `"raw"` or `"multi_output"` it should contain the columns + specified in `y_col`. - if `class_mode` is `"input"` or `None` no + extra column is needed. + directory: string, path to the directory to read images from. If `None`, + data in `x_col` column should be absolute paths. + x_col: string, column in `dataframe` that contains the filenames (or + absolute paths if `directory` is `None`). + y_col: string or list, column/s in `dataframe` that has the target data. + weight_col: string, column in `dataframe` that contains the sample + weights. Default: `None`. + target_size: tuple of integers `(height, width)`, default: `(256, 256)`. + The dimensions to which all images found will be resized. + color_mode: one of "grayscale", "rgb", "rgba". Default: "rgb". Whether + the images will be converted to have 1 or 3 color channels. + classes: optional list of classes (e.g. `['dogs', 'cats']`). Default is + None. If not provided, the list of classes will be automatically + inferred from the `y_col`, which will map to the label indices, will + be alphanumeric). The dictionary containing the mapping from class + names to class indices can be obtained via the attribute + `class_indices`. + class_mode: one of "binary", "categorical", "input", "multi_output", + "raw", sparse" or None. Default: "categorical". + Mode for yielding the targets: + - `"binary"`: 1D numpy array of binary labels, + - `"categorical"`: 2D numpy array of one-hot encoded labels. + Supports multi-label output. 
+ - `"input"`: images identical to input images (mainly used to work + with autoencoders), + - `"multi_output"`: list with the values of the different columns, + - `"raw"`: numpy array of values in `y_col` column(s), + - `"sparse"`: 1D numpy array of integer labels, - `None`, no targets + are returned (the generator will only yield batches of image data, + which is useful to use in `model.predict_generator()`). + batch_size: size of the batches of data (default: 32). + shuffle: whether to shuffle the data (default: True) + seed: optional random seed for shuffling and transformations. + save_to_dir: None or str (default: None). This allows you to optionally + specify a directory to which to save the augmented pictures being + generated (useful for visualizing what you are doing). + save_prefix: str. Prefix to use for filenames of saved pictures (only + relevant if `save_to_dir` is set). + save_format: one of "png", "jpeg" + (only relevant if `save_to_dir` is set). Default: "png". + subset: Subset of data (`"training"` or `"validation"`) if + `validation_split` is set in `ImageDataGenerator`. + interpolation: Interpolation method used to resample the image if the + target size is different from that of the loaded image. Supported + methods are `"nearest"`, `"bilinear"`, and `"bicubic"`. If PIL version + 1.1.3 or newer is installed, `"lanczos"` is also supported. If PIL + version 3.4.0 or newer is installed, `"box"` and `"hamming"` are also + supported. By default, `"nearest"` is used. + validate_filenames: Boolean, whether to validate image filenames in + `x_col`. If `True`, invalid images will be ignored. Disabling this + option can lead to speed-up in the execution of this function. + Defaults to `True`. + **kwargs: legacy arguments for raising deprecation warnings. + + Returns: + A `DataFrameIterator` yielding tuples of `(x, y)` + where `x` is a numpy array containing a batch + of images with shape `(batch_size, *target_size, channels)` + and `y` is a numpy array of corresponding labels. 
+ """ + if 'has_ext' in kwargs: + tf_logging.warn( + 'has_ext is deprecated, filenames in the dataframe have ' + 'to match the exact filenames in disk.', DeprecationWarning) + if 'sort' in kwargs: + tf_logging.warn( + 'sort is deprecated, batches will be created in the' + 'same order than the filenames provided if shuffle' + 'is set to False.', DeprecationWarning) + if class_mode == 'other': + tf_logging.warn( + '`class_mode` "other" is deprecated, please use ' + '`class_mode` "raw".', DeprecationWarning) + class_mode = 'raw' + if 'drop_duplicates' in kwargs: + tf_logging.warn( + 'drop_duplicates is deprecated, you can drop duplicates ' + 'by using the pandas.DataFrame.drop_duplicates method.', + DeprecationWarning) + + return DataFrameIterator( + dataframe, + directory, + self, + x_col=x_col, + y_col=y_col, + weight_col=weight_col, + target_size=target_size, + color_mode=color_mode, + classes=classes, + class_mode=class_mode, + data_format=self.data_format, + batch_size=batch_size, + shuffle=shuffle, + seed=seed, + save_to_dir=save_to_dir, + save_prefix=save_prefix, + save_format=save_format, + subset=subset, + interpolation=interpolation, + validate_filenames=validate_filenames) + + keras_export('keras.preprocessing.image.random_rotation')(random_rotation) keras_export('keras.preprocessing.image.random_shift')(random_shift) keras_export('keras.preprocessing.image.random_shear')(random_shear) diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py index d7da420318f..a577381874e 100644 --- a/tensorflow/python/keras/preprocessing/image_test.py +++ b/tensorflow/python/keras/preprocessing/image_test.py @@ -25,6 +25,9 @@ import tempfile import numpy as np from tensorflow.python.framework import test_util +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import layers +from tensorflow.python.keras.engine import sequential from tensorflow.python.keras.preprocessing import image as preprocessing_image from tensorflow.python.platform import test @@ -52,7 +55,7 @@ def _generate_test_images(): return [rgb_images, gray_images] -class TestImage(test.TestCase): +class TestImage(keras_parameterized.TestCase): @test_util.run_v2_only def test_smart_resize(self): @@ -319,14 +322,21 @@ class TestImage(test.TestCase): self.assertEqual( len(set(train_iterator.filenames) & set(filenames)), num_training) + model = sequential.Sequential([layers.Flatten(), layers.Dense(2)]) + model.compile(optimizer='sgd', loss='mse') + model.fit(train_iterator, epochs=1) + shutil.rmtree(tmp_folder) + @keras_parameterized.run_all_keras_modes def test_directory_iterator_with_validation_split_25_percent(self): self.directory_iterator_with_validation_split_test_helper(0.25) + @keras_parameterized.run_all_keras_modes def test_directory_iterator_with_validation_split_40_percent(self): self.directory_iterator_with_validation_split_test_helper(0.40) + @keras_parameterized.run_all_keras_modes def test_directory_iterator_with_validation_split_50_percent(self): self.directory_iterator_with_validation_split_test_helper(0.50) From c87bf9be0f65898d42bc7e024c9822fc292b790e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 17:32:26 -0700 Subject: [PATCH 0362/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 311029049 Change-Id: Ie8ca7a5e3ae39d65aa12a4ac10878520c22584d2 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 1d8f37a926b8615aab45a283b69d78232e77e6bc Mon Sep 17 00:00:00 2001 From: Xunkai Zhang Date: Mon, 11 May 2020 17:36:37 -0700 Subject: [PATCH 0363/1533] [tfls.release] Enable jar merging for bazel build of metadata lib. PiperOrigin-RevId: 311029704 Change-Id: I30a36a2805a0e2f4c5065873f1ec03d6e6b81a00 --- tensorflow/lite/experimental/support/metadata/java/BUILD | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/experimental/support/metadata/java/BUILD b/tensorflow/lite/experimental/support/metadata/java/BUILD index f1cd6173b9e..82b6e9866a9 100644 --- a/tensorflow/lite/experimental/support/metadata/java/BUILD +++ b/tensorflow/lite/experimental/support/metadata/java/BUILD @@ -25,6 +25,10 @@ java_library( name = "tensorflow-lite-support-metadata-lib", srcs = glob(["src/java/org/tensorflow/lite/support/metadata/**/*.java"]), javacopts = JAVACOPTS, + resource_jars = [ + "//tensorflow/lite/experimental/support/metadata:libmetadata_schema_java.jar", + "//tensorflow/lite/experimental/support/metadata:libschema_fbs_java.jar", + ], deps = [ "//tensorflow/lite/experimental/support/metadata:metadata_schema_java", "//tensorflow/lite/experimental/support/metadata:schema_fbs_java", From 5aefa24d787feb9b20257f47ed76eae43c9159ad Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 11 May 2020 17:41:07 -0700 Subject: [PATCH 0364/1533] IWYU in profiler/rpc PiperOrigin-RevId: 311030427 Change-Id: If8a07153240f46a3333135587c36fdd59f661c1f --- tensorflow/core/profiler/rpc/BUILD | 2 -- tensorflow/core/profiler/rpc/client/BUILD | 2 ++ .../core/profiler/rpc/client/capture_profile.cc | 13 ++++++++++--- .../core/profiler/rpc/client/capture_profile.h | 2 +- tensorflow/core/profiler/rpc/client/save_profile.cc | 1 - tensorflow/core/profiler/rpc/profiler_server.cc | 8 +++++--- .../core/profiler/rpc/profiler_service_impl.cc | 11 ++++++----- .../core/profiler/rpc/profiler_service_impl.h | 6 ++---- 8 files changed, 26 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/profiler/rpc/BUILD b/tensorflow/core/profiler/rpc/BUILD index b5b631fe8bb..1e572dfd9bd 100644 --- a/tensorflow/core/profiler/rpc/BUILD +++ b/tensorflow/core/profiler/rpc/BUILD @@ -19,9 +19,7 @@ cc_library( "//tensorflow/core/profiler/convert:xplane_to_profile_response", "//tensorflow/core/profiler/lib:profiler_session_headers", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", tf_grpc_cc_dependency(), ], ) diff --git a/tensorflow/core/profiler/rpc/client/BUILD b/tensorflow/core/profiler/rpc/client/BUILD index 
bde5708065e..609f98aa6c1 100644 --- a/tensorflow/core/profiler/rpc/client/BUILD +++ b/tensorflow/core/profiler/rpc/client/BUILD @@ -12,7 +12,9 @@ cc_library( deps = [ ":save_profile", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "//tensorflow/core/profiler:profiler_analysis_proto_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler:profiler_service_proto_cc", "@com_google_absl//absl/strings", tf_grpc_cc_dependency(), diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.cc b/tensorflow/core/profiler/rpc/client/capture_profile.cc index 5335d18da3e..a8642aff54a 100644 --- a/tensorflow/core/profiler/rpc/client/capture_profile.cc +++ b/tensorflow/core/profiler/rpc/client/capture_profile.cc @@ -14,18 +14,25 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/rpc/client/capture_profile.h" +#include +#include +#include #include #include "grpcpp/grpcpp.h" -#include "absl/strings/escaping.h" -#include "absl/strings/match.h" #include "absl/strings/numbers.h" +#include "absl/strings/str_join.h" #include "absl/strings/str_split.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/profiler_analysis.grpc.pb.h" +#include "tensorflow/core/profiler/profiler_analysis.pb.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" +#include "tensorflow/core/profiler/profiler_service.grpc.pb.h" +#include "tensorflow/core/profiler/profiler_service.pb.h" #include "tensorflow/core/profiler/rpc/client/save_profile.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.h b/tensorflow/core/profiler/rpc/client/capture_profile.h index 1bde73f66f1..c809d2099ae 100644 --- a/tensorflow/core/profiler/rpc/client/capture_profile.h +++ b/tensorflow/core/profiler/rpc/client/capture_profile.h @@ -19,7 +19,7 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/profiler/profiler_service.grpc.pb.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/rpc/client/save_profile.cc b/tensorflow/core/profiler/rpc/client/save_profile.cc index e328bf1dae4..9cf2e291692 100644 --- a/tensorflow/core/profiler/rpc/client/save_profile.cc +++ b/tensorflow/core/profiler/rpc/client/save_profile.cc @@ -17,7 +17,6 @@ limitations under the License. #include #include -#include #include #include #include diff --git a/tensorflow/core/profiler/rpc/profiler_server.cc b/tensorflow/core/profiler/rpc/profiler_server.cc index 4d2f3c38c65..f05a829fb93 100644 --- a/tensorflow/core/profiler/rpc/profiler_server.cc +++ b/tensorflow/core/profiler/rpc/profiler_server.cc @@ -16,17 +16,19 @@ limitations under the License. 
#include "tensorflow/core/profiler/rpc/profiler_server.h" #include -#include +#include #include "grpcpp/grpcpp.h" #include "absl/strings/str_cat.h" -#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/profiler_service.grpc.pb.h" #include "tensorflow/core/profiler/rpc/profiler_service_impl.h" namespace tensorflow { void ProfilerServer::StartProfilerServer(int32 port) { - string server_address = absl::StrCat("0.0.0.0:", port); + std::string server_address = absl::StrCat("0.0.0.0:", port); service_ = CreateProfilerService(); ::grpc::ServerBuilder builder; builder.AddListeningPort(server_address, ::grpc::InsecureServerCredentials()); diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc index 8cf052f165b..0a234d7e4da 100644 --- a/tensorflow/core/profiler/rpc/profiler_service_impl.cc +++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc @@ -15,21 +15,22 @@ limitations under the License. #include "tensorflow/core/profiler/rpc/profiler_service_impl.h" +#include + #include "grpcpp/support/status.h" #include "absl/container/flat_hash_map.h" -#include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_join.h" -#include "absl/strings/string_view.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/lib/profiler_session.h" +#include "tensorflow/core/profiler/profiler_service.grpc.pb.h" #include "tensorflow/core/profiler/profiler_service.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" @@ -65,7 +66,7 @@ class ProfilerServiceImpl : public grpc::ProfilerService::Service { } Env* env = Env::Default(); - for (size_t i = 0; i < req->duration_ms(); ++i) { + for (uint64 i = 0; i < req->duration_ms(); ++i) { env->SleepForMicroseconds(EnvTime::kMillisToMicros); if (ctx->IsCancelled()) { return ::grpc::Status::CANCELLED; diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.h b/tensorflow/core/profiler/rpc/profiler_service_impl.h index 4a7636cf101..00a850acbf2 100644 --- a/tensorflow/core/profiler/rpc/profiler_service_impl.h +++ b/tensorflow/core/profiler/rpc/profiler_service_impl.h @@ -15,10 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_ #define TENSORFLOW_CORE_PROFILER_RPC_PROFILER_SERVICE_IMPL_H_ -#include "grpcpp/grpcpp.h" -#include "grpcpp/server_context.h" -#include "grpcpp/support/status.h" -#include "tensorflow/core/profiler/lib/profiler_session.h" +#include + #include "tensorflow/core/profiler/profiler_service.grpc.pb.h" namespace tensorflow { From 775d6b649731e207a32b69250b7071149f5869cd Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Mon, 11 May 2020 17:47:12 -0700 Subject: [PATCH 0365/1533] Deprecate `set_learning_phase()` backend utility. Deprecating the getter (`learning_phase()`) can't be done at this time because it is used in all core layers. 
Changing it in the core layers requires changing the default value of the `training` arg in their call methods, which is not a backwards compatible change. PiperOrigin-RevId: 311031244 Change-Id: Ic956a502a83e8bd3f98834ec39c1b55920d91779 --- tensorflow/python/keras/backend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 503f6cf0e92..2700fae9e29 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -393,6 +393,9 @@ def _default_learning_phase(): False, shape=(), name='keras_learning_phase') +@deprecated('2020-10-11', + 'Simply pass a True/False value to the `training` argument ' + 'of the `__call__` method of your layer or model.') @keras_export('keras.backend.set_learning_phase') def set_learning_phase(value): """Sets the learning phase to a fixed value. From 9f7d843e3f99e1d6a879d2accd0207fde64b2f8f Mon Sep 17 00:00:00 2001 From: Rachel Lim Date: Mon, 11 May 2020 17:55:13 -0700 Subject: [PATCH 0366/1533] [tf.data] Add comment to ShouldWait for the nondeterministic case to explain the condition it's checking for PiperOrigin-RevId: 311032271 Change-Id: Ib83e68934898d22b810332a71e5efe142e62e139 --- tensorflow/core/kernels/data/parallel_map_dataset_op.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc index 5dae096d5b5..7b8f697d2d3 100644 --- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc +++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc @@ -621,6 +621,11 @@ class ParallelMapIterator : public DatasetBaseIterator { return false; } if (!deterministic_) { + // Iterate through in-flight results and returns the first one that is + // found to be available and not end-of-input. If the first result (in + // order) is end-of-input, we know that all earlier iterations have + // already been completed, so it is safe to return that result for the + // caller to process end of iteration. 
for (auto it = invocation_results_.begin(); it != invocation_results_.end(); ++it) { if ((*it)->notification.HasBeenNotified() && From 75d9ed443afa611b85d0563153b41013d7c10868 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 11 May 2020 18:10:26 -0700 Subject: [PATCH 0367/1533] Enable tests blocked by b/117943489 PiperOrigin-RevId: 311034341 Change-Id: Iab60ac2be4c1352aa39d3e266df09ba45a2feada --- .../autograph/operators/data_structures_test.py | 3 ++- .../python/autograph/utils/tensor_list_test.py | 1 - .../python/kernel_tests/tensor_array_ops_test.py | 15 +++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/autograph/operators/data_structures_test.py b/tensorflow/python/autograph/operators/data_structures_test.py index c5a3a3d1cac..5d835fd3771 100644 --- a/tensorflow/python/autograph/operators/data_structures_test.py +++ b/tensorflow/python/autograph/operators/data_structures_test.py @@ -106,11 +106,12 @@ class ListTest(test.TestCase): with self.cached_session() as sess: self.assertAllEqual(self.evaluate(t), [[1, 2, 3]]) - @test_util.run_v1_only("b/117943489") + @test_util.run_deprecated_v1 def test_append_tensorarray(self): l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True) l1 = data_structures.list_append(l, 1) l2 = data_structures.list_append(l1, 2) + with self.cached_session() as sess: self.assertAllEqual(self.evaluate(l1.stack()), [1]) self.assertAllEqual(self.evaluate(l2.stack()), [1, 2]) diff --git a/tensorflow/python/autograph/utils/tensor_list_test.py b/tensorflow/python/autograph/utils/tensor_list_test.py index bbbc3bf6918..017d97bb040 100644 --- a/tensorflow/python/autograph/utils/tensor_list_test.py +++ b/tensorflow/python/autograph/utils/tensor_list_test.py @@ -34,7 +34,6 @@ class TensorListTest(test.TestCase): def _shape(self, shape_tuple): return constant(shape_tuple, dtypes.int32) - @test_util.run_v1_only("b/117943489") def test_dynamic_list_append(self): l = [] l = tl.dynamic_list_append(l, 1) diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py index 33879232fd3..5d587954858 100644 --- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py +++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py @@ -1021,7 +1021,7 @@ class TensorArrayTest(test.TestCase): # self._testWhileLoopWritePackGradients( # dynamic_size=False, dtype=tf.int64) - @test_util.run_v1_only("b/117943489") + @test_util.run_deprecated_v1 def testSkipEagerWhileLoopDynamicWritePackGradients(self): self._testWhileLoopWritePackGradients( dynamic_size=True, dtype=dtypes.float32) @@ -1251,7 +1251,6 @@ class TensorArrayTest(test.TestCase): with self.assertRaises(ValueError): w1.write(4, c2) - @test_util.run_v1_only("b/117943489") def testUnpackShape(self): self._testUnpackShape() @@ -1340,11 +1339,11 @@ class TensorArrayTest(test.TestCase): grad = gradients_impl.gradients(ys=[r], xs=[x]) self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0]) - @test_util.run_v1_only("b/117943489") + @test_util.run_deprecated_v1 def testSkipEagerTensorArrayUnpackDynamic(self): self._testTensorArrayUnpackDynamic() - @test_util.run_v1_only("b/117943489") + @test_util.run_deprecated_v1 def testSkipEagerTensorArraySplitDynamic(self): with self.session(use_gpu=True) as sess: ta = tensor_array_ops.TensorArray( @@ -1422,7 +1421,7 @@ class TensorArrayTest(test.TestCase): v2_msg if control_flow_util.ENABLE_CONTROL_FLOW_V2 else v1_msg): ta.stack().eval() - 
@test_util.run_v1_only("b/120545219") + @test_util.run_deprecated_v1 def testSkipEagerTensorArrayEvalEmpty(self): self._testTensorArrayEvalEmpty() @@ -1445,11 +1444,11 @@ class TensorArrayTest(test.TestCase): # first dimension of zero self.assertAllEqual([0, 5], self.evaluate(concatenated).shape) - @test_util.run_v1_only("b/117943489") + @test_util.run_deprecated_v1 def testSkipEagerTensorArrayEvalEmptyWithDefault(self): self._testTensorArrayEvalEmptyWithDefault() - @test_util.run_v1_only("b/117943489") + @test_util.run_deprecated_v1 def testSkipEagerTensorArrayScatterReadAndGradients(self): with self.session(use_gpu=True) as session: ta = tensor_array_ops.TensorArray( @@ -1476,7 +1475,7 @@ class TensorArrayTest(test.TestCase): self.assertAllEqual([10.0, -10.0], read_vals[1]) self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0]) - @test_util.run_v1_only("b/117943489") + @test_util.run_deprecated_v1 def testSkipEagerTensorArrayScatterPartialReadAndGradients(self): with self.session(use_gpu=True) as session: ta = tensor_array_ops.TensorArray( From 26ddaca376d63b69c96a9b63a7895c7d0ec02f25 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 11 May 2020 18:12:42 -0700 Subject: [PATCH 0368/1533] Add utilities for creating delegates in TF Lite The change has - SimpleDelegateKernelInterface: Represents a Kernel which handles a subgraph to be delegated. It has Init/Prepare/Invoke which are going to be called during inference. Delegate owner should implement this interface to build/prepare/invoke the delegated subgraph. - SimpleDelegate: This class wraps TFLiteDelegate and users need to implement the interface and then Call GetFinalizedDelegate() to get TfLiteDelegate* that can be passed to ModifyGraphWithDelegate. PiperOrigin-RevId: 311034686 Change-Id: If3e8a3e769660cc5a78d142f0758f6f0663fe778 --- tensorflow/lite/delegates/utils/BUILD | 36 ++++ .../lite/delegates/utils/simple_delegate.cc | 140 +++++++++++++ .../lite/delegates/utils/simple_delegate.h | 109 ++++++++++ .../delegates/utils/simple_delegate_test.cc | 194 ++++++++++++++++++ 4 files changed, 479 insertions(+) create mode 100644 tensorflow/lite/delegates/utils/BUILD create mode 100644 tensorflow/lite/delegates/utils/simple_delegate.cc create mode 100644 tensorflow/lite/delegates/utils/simple_delegate.h create mode 100644 tensorflow/lite/delegates/utils/simple_delegate_test.cc diff --git a/tensorflow/lite/delegates/utils/BUILD b/tensorflow/lite/delegates/utils/BUILD new file mode 100644 index 00000000000..069da167455 --- /dev/null +++ b/tensorflow/lite/delegates/utils/BUILD @@ -0,0 +1,36 @@ +package( + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "simple_delegate", + srcs = [ + "simple_delegate.cc", + ], + hdrs = [ + "simple_delegate.h", + ], + deps = [ + "//tensorflow/lite:kernel_api", + "//tensorflow/lite:minimal_logging", + "//tensorflow/lite/c:common", + "//tensorflow/lite/delegates:utils", + "//tensorflow/lite/kernels/internal:compatibility", + ], +) + +cc_test( + name = "simple_delegate_test", + srcs = ["simple_delegate_test.cc"], + deps = [ + ":simple_delegate", + "//tensorflow/lite:framework", + "//tensorflow/lite:kernel_api", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels:builtin_ops", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/lite/delegates/utils/simple_delegate.cc b/tensorflow/lite/delegates/utils/simple_delegate.cc new file mode 100644 index 00000000000..51736e56d26 --- /dev/null +++ 
b/tensorflow/lite/delegates/utils/simple_delegate.cc @@ -0,0 +1,140 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/delegates/utils/simple_delegate.h" + +#include +#include + +#include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/context_util.h" +#include "tensorflow/lite/delegates/utils.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/minimal_logging.h" + +namespace tflite { +namespace { +TfLiteRegistration GetDelegateKernelRegistration( + SimpleDelegateInterface* delegate) { + TfLiteRegistration kernel_registration; + kernel_registration.profiling_string = nullptr; + kernel_registration.builtin_code = kTfLiteBuiltinDelegate; + kernel_registration.custom_name = delegate->name(); + kernel_registration.free = [](TfLiteContext* context, void* buffer) -> void { + delete reinterpret_cast(buffer); + }; + kernel_registration.init = [](TfLiteContext* context, const char* buffer, + size_t length) -> void* { + const TfLiteDelegateParams* params = + reinterpret_cast(buffer); + if (params == nullptr) { + TF_LITE_KERNEL_LOG(context, "NULL TfLiteDelegateParams passed."); + return nullptr; + } + auto* delegate = + reinterpret_cast(params->delegate->data_); + std::unique_ptr delegate_kernel( + delegate->CreateDelegateKernelInterface()); + if (delegate_kernel->Init(context, params) != kTfLiteOk) { + return nullptr; + } + return delegate_kernel.release(); + }; + kernel_registration.prepare = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + if (node->user_data == nullptr) { + TF_LITE_KERNEL_LOG(context, "Delegate kernel was not initialized"); + return kTfLiteError; + } + SimpleDelegateKernelInterface* delegate_kernel = + reinterpret_cast(node->user_data); + return delegate_kernel->Prepare(context, node); + }; + kernel_registration.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + SimpleDelegateKernelInterface* delegate_kernel = + reinterpret_cast(node->user_data); + TFLITE_DCHECK(delegate_kernel != nullptr); + return delegate_kernel->Invoke(context, node); + }; + + return kernel_registration; +} + +TfLiteStatus DelegatePrepare(TfLiteContext* context, + TfLiteDelegate* base_delegate) { + auto* delegate = + reinterpret_cast(base_delegate->data_); + delegates::IsNodeSupportedFn node_supported_fn = + [=](TfLiteContext* context, TfLiteNode* node, + TfLiteRegistration* registration, + std::string* unsupported_details) -> bool { + return delegate->IsNodeSupportedByDelegate(registration, node, context); + }; + // TODO(b/149484598): Update to have method that gets all supported nodes. 
+ delegates::GraphPartitionHelper helper(context, node_supported_fn); + TF_LITE_ENSURE_STATUS(helper.Partition(nullptr)); + + const auto delegate_partitions = helper.GetFirstNLargestPartitions(); + + // To avoid creating a new TfLiteIntArray and free it later, we reserve one + // element to represent TfLiteIntArray.size which is the 1st element of + // TfLiteIntArray C struct. + std::vector supported_nodes(1); + for (const auto partition : delegate_partitions) { + auto* nodes = partition->nodes_to_replace; + supported_nodes.insert(supported_nodes.end(), nodes->data, + nodes->data + nodes->size); + } + // Set first element to the number of nodes to replace. + supported_nodes[0] = supported_nodes.size() - 1; + + TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, + "%s delegate: %d nodes delegated out of %d nodes with " + "%d partitions.\n", + delegate->name(), supported_nodes[0], + helper.num_total_nodes(), delegate_partitions.size()); + TfLiteRegistration delegate_kernel_registration = + GetDelegateKernelRegistration(delegate); + + return context->ReplaceNodeSubsetsWithDelegateKernels( + context, delegate_kernel_registration, + reinterpret_cast(supported_nodes.data()), base_delegate); +} +} // namespace + +TfLiteDelegate* TfLiteDelegateFactory::CreateSimpleDelegate( + std::unique_ptr simple_delegate) { + if (simple_delegate == nullptr) { + return nullptr; + } + auto delegate = new TfLiteDelegate(); + delegate->Prepare = &DelegatePrepare; + delegate->flags = kTfLiteDelegateFlagsNone; + delegate->CopyFromBufferHandle = nullptr; + delegate->CopyToBufferHandle = nullptr; + delegate->FreeBufferHandle = nullptr; + delegate->data_ = simple_delegate.release(); + return delegate; +} + +void TfLiteDelegateFactory::DeleteSimpleDelegate(TfLiteDelegate* delegate) { + if (!delegate) return; + SimpleDelegateInterface* simple_delegate = + reinterpret_cast(delegate->data_); + delete simple_delegate; + delete delegate; +} + +} // namespace tflite diff --git a/tensorflow/lite/delegates/utils/simple_delegate.h b/tensorflow/lite/delegates/utils/simple_delegate.h new file mode 100644 index 00000000000..bf35fbc47aa --- /dev/null +++ b/tensorflow/lite/delegates/utils/simple_delegate.h @@ -0,0 +1,109 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This file has utilities that facilitates creating new delegates. +// - SimpleDelegateKernelInterface: Represents a Kernel which handles a subgraph +// to be delegated. It has Init/Prepare/Invoke which are going to be called +// during inference, similar to TFLite Kernels. Delegate owner should implement +// this interface to build/prepare/invoke the delegated subgraph. +// - SimpleDelegateInterface: +// This class wraps TFLiteDelegate and users need to implement the interface and +// then Call GetFinalizedDelegate() to get TfLiteDelegate* that can be passed to +// ModifyGraphWithDelegate. 
+#ifndef TENSORFLOW_LITE_DELEGATES_UTILS_SIMPLE_DELEGATE_H_ +#define TENSORFLOW_LITE_DELEGATES_UTILS_SIMPLE_DELEGATE_H_ + +#include + +#include "tensorflow/lite/c/common.h" + +namespace tflite { + +// Users should inherit from this class and implement the interface below. +// Each instance represents a single part of the graph (subgraph). +class SimpleDelegateKernelInterface { + public: + virtual ~SimpleDelegateKernelInterface() {} + + // Initializes a delegated subgraph. + // The nodes in the subgraph are inside TfLiteDelegateParams->nodes_to_replace + virtual TfLiteStatus Init(TfLiteContext* context, + const TfLiteDelegateParams* params) = 0; + + // Will be called by the framework. Should handle any needed preparation + // for the subgraph e.g. allocating buffers, compiling model. + // Returns status, and signalling any errors. + virtual TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) = 0; + + // Actual subgraph inference should happen on this call. + // Returns status, and signalling any errors. + virtual TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) = 0; +}; + +// Pure Interface that clients should implement. +// The Interface represents a delegate capabilities and provide factory +// for SimpleDelegateKernelInterface +// +// Clients should implement the following methods: +// - IsNodeSupportedByDelegate +// - name +// - CreateDelegateKernelInterface +class SimpleDelegateInterface { + public: + SimpleDelegateInterface() {} + + virtual ~SimpleDelegateInterface() {} + + // Returns true if 'node' is supported by the delegate. False otherwise. + virtual bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, + const TfLiteNode* node, + TfLiteContext* context) const = 0; + + // Returns a name that identifies the delegate. + // This name is used for debugging/logging/profiling. + virtual const char* name() const = 0; + + // Returns instance of an object that implements the interface + // SimpleDelegateKernelInterface. + // An instance of SimpleDelegateKernelInterface represents one subgraph to + // be delegated. + // Caller takes ownership of the returned object. + virtual std::unique_ptr + CreateDelegateKernelInterface() = 0; +}; + +// Factory class that provides two static methods +// CreateSimpleDelegate +// DeleteSimpleDelegate +// Which should be used to construct TfLiteDelegate from +// Simple Delegate and delete TfLiteDelegate and SimpleDelegate give +// tfLiteDelegate* created from 'CreateSimpleDelegate' method. +// Users should use these methods to Create and Destroy the delegate. +class TfLiteDelegateFactory { + public: + // Creates TfLiteDelegate from the provided SimpleDelegateInterface. + // The returned TfLiteDelegate should be deleted using DeleteSimpleDelegate. + static TfLiteDelegate* CreateSimpleDelegate( + std::unique_ptr simple_delegate); + + // Deletes 'delegate' the passed pointer must be the one returned + // from GetFinalizedDelegate. + // This function will destruct the SimpleDelegate object too. + static void DeleteSimpleDelegate(TfLiteDelegate* delegate); +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_UTILS_SIMPLE_DELEGATE_H_ diff --git a/tensorflow/lite/delegates/utils/simple_delegate_test.cc b/tensorflow/lite/delegates/utils/simple_delegate_test.cc new file mode 100644 index 00000000000..fa6d528a537 --- /dev/null +++ b/tensorflow/lite/delegates/utils/simple_delegate_test.cc @@ -0,0 +1,194 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/delegates/utils/simple_delegate.h" + +#include + +#include +#include +#include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/builtin_op_kernels.h" + +namespace tflite { +namespace { +// Delegate options. +struct TestSimpleDelegateOptions { + // Allowed ops to delegate. + int allowed_builtin_code; + // Report error during init. + bool error_during_init = false; + // Report error during prepare. + bool error_during_prepare = false; + // Report error during invoke. + bool error_during_invoke = false; +}; + +// Dummy delegate kernel. +class TestSimpleDelegateKernel : public SimpleDelegateKernelInterface { + public: + explicit TestSimpleDelegateKernel(TestSimpleDelegateOptions options) + : options_(options) {} + + TfLiteStatus Init(TfLiteContext* context, + const TfLiteDelegateParams* params) override { + return !options_.error_during_init ? kTfLiteOk : kTfLiteError; + } + + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) override { + return !options_.error_during_prepare ? kTfLiteOk : kTfLiteError; + } + + TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) override { + return !options_.error_during_invoke ? kTfLiteOk : kTfLiteError; + } + + private: + TestSimpleDelegateOptions options_; +}; + +// Simple delegate which implements the interface of SimpleDelegateInterface. +// This holds the Delegate capabilities. 
+class TestSimpleDelegate : public SimpleDelegateInterface { + public: + explicit TestSimpleDelegate(TestSimpleDelegateOptions options) + : options_(options) {} + bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, + const TfLiteNode* node, + TfLiteContext* context) const override { + return options_.allowed_builtin_code == registration->builtin_code; + } + + const char* name() const override { return "TestSimpleDelegate"; } + + std::unique_ptr CreateDelegateKernelInterface() + override { + return std::make_unique(options_); + } + + private: + TestSimpleDelegateOptions options_; +}; + +class TestDelegate : public ::testing::Test { + protected: + void SetUp() override { + interpreter_.reset(new Interpreter); + interpreter_->AddTensors(5); + interpreter_->SetInputs({0, 1}); + interpreter_->SetOutputs({3, 4}); + TfLiteQuantizationParams quant; + interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, + quant); + TfLiteRegistration* reg = ops::builtin::Register_ADD(); + void* builtin_data_1 = malloc(sizeof(int)); + void* builtin_data_2 = malloc(sizeof(int)); + void* builtin_data_3 = malloc(sizeof(int)); + interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, builtin_data_1, + reg); + interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, builtin_data_2, + reg); + interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, builtin_data_3, + reg); + } + + void TearDown() override { + interpreter_.reset(); + TfLiteDelegateFactory::DeleteSimpleDelegate(delegate_); + } + + protected: + std::unique_ptr interpreter_; + TfLiteDelegate* delegate_ = nullptr; +}; + +TEST_F(TestDelegate, BasicDelegate) { + TestSimpleDelegateOptions options; + options.allowed_builtin_code = kTfLiteBuiltinAdd; + delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + std::make_unique(options)); + interpreter_->ModifyGraphWithDelegate(delegate_); + + ASSERT_EQ(interpreter_->execution_plan().size(), 1); + int node = interpreter_->execution_plan()[0]; + const auto* node_and_reg = interpreter_->node_and_registration(node); + EXPECT_EQ("TestSimpleDelegate", node_and_reg->second.custom_name); + + const TfLiteDelegateParams* params = static_cast( + node_and_reg->first.builtin_data); + ASSERT_EQ(params->nodes_to_replace->size, 3); + EXPECT_EQ(params->nodes_to_replace->data[0], 0); + EXPECT_EQ(params->nodes_to_replace->data[1], 1); + EXPECT_EQ(params->nodes_to_replace->data[2], 2); + + ASSERT_EQ(params->input_tensors->size, 2); + EXPECT_EQ(params->input_tensors->data[0], 0); + EXPECT_EQ(params->input_tensors->data[1], 1); + + ASSERT_EQ(params->output_tensors->size, 2); + EXPECT_EQ(params->output_tensors->data[0], 3); + EXPECT_EQ(params->output_tensors->data[1], 4); +} + +TEST_F(TestDelegate, NoNodesToDelegate) { + TestSimpleDelegateOptions options; + options.allowed_builtin_code = kTfLiteBuiltinSub; + delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + std::make_unique(options)); + interpreter_->ModifyGraphWithDelegate(delegate_); + + ASSERT_EQ(interpreter_->execution_plan().size(), 3); +} + +TEST_F(TestDelegate, DelegateFailedPrepare) { + TestSimpleDelegateOptions options; + options.allowed_builtin_code = kTfLiteBuiltinAdd; + 
options.error_during_prepare = true; + delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + std::make_unique(options)); + ASSERT_EQ(kTfLiteDelegateError, + interpreter_->ModifyGraphWithDelegate(delegate_)); +} + +TEST_F(TestDelegate, DelegateFailedInvoke) { + TestSimpleDelegateOptions options; + options.allowed_builtin_code = kTfLiteBuiltinAdd; + options.error_during_invoke = true; + delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + std::make_unique(options)); + ASSERT_EQ(kTfLiteOk, interpreter_->ModifyGraphWithDelegate(delegate_)); + ASSERT_EQ(kTfLiteError, interpreter_->Invoke()); +} + +TEST_F(TestDelegate, DelegateFailedInit) { + TestSimpleDelegateOptions options; + options.allowed_builtin_code = kTfLiteBuiltinAdd; + options.error_during_init = true; + delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + std::make_unique(options)); + ASSERT_EQ(kTfLiteDelegateError, + interpreter_->ModifyGraphWithDelegate(delegate_)); +} +} // namespace +} // namespace tflite From cf4fb9ed5efebc4f2575f6faf9592c1b0f27d073 Mon Sep 17 00:00:00 2001 From: RJ Skerry-Ryan Date: Mon, 11 May 2020 19:19:37 -0700 Subject: [PATCH 0369/1533] tf.signal: Remove set_shape from documentation for linear_to_mel_weight_matrix. tf.tensordot has shape inference now. PiperOrigin-RevId: 311042725 Change-Id: I4ded418173bdcc5238d05f73a12ca8e387bbac04 --- tensorflow/python/ops/signal/mel_ops.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/python/ops/signal/mel_ops.py b/tensorflow/python/ops/signal/mel_ops.py index aa0769166a4..b95876bc977 100644 --- a/tensorflow/python/ops/signal/mel_ops.py +++ b/tensorflow/python/ops/signal/mel_ops.py @@ -128,8 +128,6 @@ def linear_to_mel_weight_matrix(num_mel_bins=20, # S has shape [..., num_spectrogram_bins]. # M has shape [..., num_mel_bins]. M = tf.tensordot(S, A, 1) - # tf.tensordot does not support shape inference for this case yet. - M.set_shape(S.shape[:-1].concatenate(A.shape[-1:])) Args: num_mel_bins: Python int. How many bands in the resulting mel spectrum. From 0ba495f24a2a093606af3de6f394ed0b000ac5c1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 19:32:24 -0700 Subject: [PATCH 0370/1533] fix a typo PiperOrigin-RevId: 311043957 Change-Id: I7a82832032f2dd664bf4852446ae5ba7b82ff203 --- tensorflow/python/autograph/g3doc/reference/control_flow.md | 2 +- tensorflow/python/autograph/g3doc/reference/generated_code.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/autograph/g3doc/reference/control_flow.md b/tensorflow/python/autograph/g3doc/reference/control_flow.md index 79cc0f31450..cf580af7330 100644 --- a/tensorflow/python/autograph/g3doc/reference/control_flow.md +++ b/tensorflow/python/autograph/g3doc/reference/control_flow.md @@ -164,7 +164,7 @@ after if #### Python values modified in TensorFlow control flow become Tensors If a symbol is modified in a TensorFlow control flow statement, then it becomes -a `tf.Tensor`, even if it started off as a Python promitive value. +a `tf.Tensor`, even if it started off as a Python primitive value. For example, the conditional below will run as a `tf.cond` (its condition is a `tf.Tensor`), which in turn will cause `i` to become a `tf.Tensor`. 
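For illustration, a minimal sketch of the behavior the control_flow.md passage above describes — a Python primitive modified under a Tensor-dependent conditional is handed back as a `tf.Tensor` (this snippet is not part of the patch; the function and variable names are arbitrary):

```python
import tensorflow as tf

@tf.function
def f(x):
  i = 0          # starts out as a Python int
  if x > 0:      # x is a tf.Tensor, so AutoGraph lowers this `if` to tf.cond
    i = 1        # i is modified inside TensorFlow control flow...
  return i       # ...so it comes back as a tf.Tensor, not a Python int

print(f(tf.constant(3)))  # tf.Tensor(1, shape=(), dtype=int32)
```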
diff --git a/tensorflow/python/autograph/g3doc/reference/generated_code.md b/tensorflow/python/autograph/g3doc/reference/generated_code.md index b62911b7203..389fa53a065 100644 --- a/tensorflow/python/autograph/g3doc/reference/generated_code.md +++ b/tensorflow/python/autograph/g3doc/reference/generated_code.md @@ -66,7 +66,7 @@ print(inspect.getsourcefile(converted_f)) ``` `tf.autograph.to_code` is a shortcut to obtain the generated code, and it's -equivalent with calling `inspect.getsource(tf.autograph.to_code(f))`. +equivalent with calling `inspect.getsource(tf.autograph.to_graph(f))`. #### Recording diagnostic information: `tf.autograph.set_verbosity` From 64618d8d7caf6837cf7393f72748e79321282c6d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 11 May 2020 19:55:02 -0700 Subject: [PATCH 0371/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311046060 Change-Id: I57024f482d9212fe7fd0815f5c0b5a8e89fc493f --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From cd24c2bdc771aa3576ef1fc1699374eabdc5447b Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Tue, 12 May 2020 11:06:01 +0800 Subject: [PATCH 0372/1533] [tflite] add fp16 support for evaluation tools --- .../lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc | 2 ++ .../lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h | 3 +++ .../lite/tools/delegates/default_execution_provider.cc | 4 ++++ .../lite/tools/evaluation/evaluation_delegate_provider.cc | 4 ++++ .../lite/tools/evaluation/proto/evaluation_stages.proto | 3 +++ .../lite/tools/evaluation/stages/tflite_inference_stage.cc | 1 + .../tools/evaluation/tasks/coco_object_detection/run_eval.cc | 5 +++++ .../tasks/imagenet_image_classification/run_eval.cc | 5 +++++ .../lite/tools/evaluation/tasks/inference_diff/run_eval.cc | 3 +++ 9 files changed, 30 insertions(+) diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc index f318dc68d09..61c2acb8b2e 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc @@ -141,6 +141,8 @@ class CompositeObserver : public ImagenetModelEvaluator::Observer { tflite::Flag::CreateFlag(kNumRanksFlag, ¶ms.num_ranks, "Generates the top-1 to top-k accuracy values" "where k = num_ranks. Default: 10"), + tflite::Flag::CreateFlag("allow_fp16", ¶ms.allow_fp16, + "allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h index 65d4a2c49f8..323069383c3 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h @@ -78,6 +78,9 @@ class ImagenetModelEvaluator { // Number of interpreter threads. int num_interpreter_threads = 1; + + // allow fp16 + bool allow_fp16 = false; }; // An evaluation observer. 
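The new option also surfaces as an `--allow_fp16` command-line flag on the ImageNet evaluator. A rough sketch of driving the tool from a script follows; the binary path and the other flag names are assumptions for illustration — only `--allow_fp16` comes from this change:

```python
import subprocess

# Only --allow_fp16 is introduced by this patch; the binary path and the
# remaining flags are placeholders for illustration.
cmd = [
    "bazel-bin/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_accuracy_eval",
    "--model_file=/tmp/mobilenet_v1.tflite",
    "--num_interpreter_threads=4",
    "--allow_fp16=true",
]
subprocess.run(cmd, check=True)
```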
diff --git a/tensorflow/lite/tools/delegates/default_execution_provider.cc b/tensorflow/lite/tools/delegates/default_execution_provider.cc index f75fd791072..67c38308206 100644 --- a/tensorflow/lite/tools/delegates/default_execution_provider.cc +++ b/tensorflow/lite/tools/delegates/default_execution_provider.cc @@ -30,6 +30,7 @@ class DefaultExecutionProvider : public DelegateProvider { ToolParam::Create(0)); default_params_.AddParam("min_nodes_per_partition", ToolParam::Create(0)); + default_params_.AddParam("allow_fp16", ToolParam::Create(false)); } std::vector CreateFlags(ToolParams* params) const final; @@ -44,6 +45,7 @@ std::vector DefaultExecutionProvider::CreateFlags( std::vector flags = { CreateFlag("num_threads", params, "number of threads used for inference on CPU."), + CreateFlag("allow_fp16", params, "allow_fp16"), CreateFlag("max_delegated_partitions", params, "Max number of partitions to be delegated."), CreateFlag( @@ -61,6 +63,8 @@ void DefaultExecutionProvider::LogParams(const ToolParams& params) const { << params.Get("max_delegated_partitions") << "]"; TFLITE_LOG(INFO) << "Min nodes per partition : [" << params.Get("min_nodes_per_partition") << "]"; + TFLITE_LOG(INFO) << "allow_fp16: [" + << params.Get("allow_fp16") << "]"; } TfLiteDelegatePtr DefaultExecutionProvider::CreateTfLiteDelegate( diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc index 42f2666ba9b..a7625441406 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc @@ -132,6 +132,10 @@ tools::ToolParams DelegateProviders::GetAllParams( tool_params.Set("num_threads", params.num_threads()); } + if (params.has_allow_fp16()) { + tool_params.Set("allow_fp16", params.allow_fp16()); + } + const auto type = params.delegate(); switch (type) { case TfliteInferenceParams::NNAPI: diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto index 09765d71726..c7d033eb111 100644 --- a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto +++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto @@ -121,6 +121,9 @@ message TfliteInferenceParams { // This helps benchmark cases where extensive pre-processing might not be // required for every input. optional int32 invocations_per_run = 4 [default = 1]; + + // allow_fp16 + optional bool allow_fp16 = 5 [default = false]; } // Metrics specific to TFLite inference. 
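The same option is carried in `TfliteInferenceParams`, so callers that build the config programmatically can set it alongside the existing fields. A hedged sketch, assuming generated Python bindings for evaluation_stages.proto are importable (the module path below is a guess; the field names match the proto above and the run_eval.cc callers patched below):

```python
# Module path is an assumption; field names come from TfliteInferenceParams.
from tensorflow.lite.tools.evaluation.proto import evaluation_stages_pb2 as eval_pb2

params = eval_pb2.TfliteInferenceParams()
params.model_file_path = "/tmp/mobilenet_v1.tflite"
params.num_threads = 4
params.allow_fp16 = True   # new field (number 5), defaults to false
print(params)
```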
diff --git a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc index 365a00c3cd1..8189140e953 100644 --- a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc +++ b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc @@ -95,6 +95,7 @@ TfLiteStatus TfliteInferenceStage::Init( return kTfLiteError; } interpreter_->SetNumThreads(params.num_threads()); + interpreter_->SetAllowFp16PrecisionForFp32(params.allow_fp16()); if (!delegate_providers) { std::string error_message; diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc index 765e8fc6465..1ff4e55c270 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc @@ -65,6 +65,7 @@ class CocoObjectDetection : public TaskExecutor { bool debug_mode_; std::string delegate_; int num_interpreter_threads_; + bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -104,6 +105,9 @@ CocoObjectDetection::CocoObjectDetection(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'xnnpack', 'hexagon'}"), + tflite::Flag::CreateFlag( + "allow_fp16", &allow_fp16_, + "allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); DelegateProviders delegate_providers; @@ -132,6 +136,7 @@ absl::optional CocoObjectDetection::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); + inference_params->set_allow_fp16(allow_fp16_); // Get ground truth data. absl::flat_hash_map ground_truth_map; diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc index 13eeb313ad4..1e1cf86732a 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc @@ -67,6 +67,7 @@ class ImagenetClassification : public TaskExecutor { std::string delegate_; int num_images_; int num_interpreter_threads_; + bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -106,6 +107,9 @@ ImagenetClassification::ImagenetClassification(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for inference, if available. 
" "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), + tflite::Flag::CreateFlag( + "allow_fp16", &allow_fp16_, + "allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -155,6 +159,7 @@ absl::optional ImagenetClassification::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); + inference_params->set_allow_fp16(allow_fp16_); classification_params->mutable_topk_accuracy_eval_params()->set_k(10); ImageClassificationStage eval(eval_config); diff --git a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc index 814ebe3b3bf..de41fb96a03 100644 --- a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc @@ -50,6 +50,7 @@ class InferenceDiff : public TaskExecutor { std::string delegate_; int num_runs_; int num_interpreter_threads_; + bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -71,6 +72,7 @@ InferenceDiff::InferenceDiff(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for test inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), + tflite::Flag::CreateFlag("allow_fp16", &allow_fp16_, "allow fp16") }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -88,6 +90,7 @@ absl::optional InferenceDiff::Run() { // generating random data. inference_params->set_invocations_per_run(3); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); + inference_params->set_allow_fp16(allow_fp16_); if (!delegate_.empty() && inference_params->delegate() == TfliteInferenceParams::NONE) { TFLITE_LOG(WARN) << "Unsupported TFLite delegate: " << delegate_; From 365ea0f33a065d5d103bb4d296f79dd6c1187bc1 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Mon, 11 May 2020 20:15:16 -0700 Subject: [PATCH 0373/1533] Handle compilation of ops with dynamic shapes in fallback path * Pass through operands of TensorCast op in export to HLO * Run CreateTFShapeInferencePass after the fallback lowering pass to update function signature types. Enable XlaPad op test for MLIR bridge. PiperOrigin-RevId: 311048226 Change-Id: Ifb1715af45314963aab97288f007efae40c5b4af --- .../mlir/tensorflow/utils/compile_mlir_util.cc | 6 ++++++ tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc | 14 ++++++++++++++ .../compiler/mlir/xla/tests/translate/export.mlir | 13 +++++++++++++ tensorflow/compiler/tests/ternary_ops_test.py | 3 ++- tensorflow/compiler/tests/xla_ops_test.py | 8 ++++---- 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index 2374687c920..e8ca691f961 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -293,6 +293,12 @@ Status ConvertMLIRToXlaComputation( tf2xla.addPass(mlir::xla_hlo::createLegalizeTfWithTf2XlaPass(device_type)); tf2xla.addNestedPass(mlir::createCanonicalizerPass()); + // Run shape inference pass to propagate shapes through tensor_cast operations + // from static to dynamic shapes. 
This could be generated if the shape + // inference was originally missing in a TF op but the corresponding HLO op + // had static shape after lowering. + tf2xla.addPass(mlir::TF::CreateTFShapeInferencePass()); + // Run LegalizeTFPass again because the previous legalization passes can // expose more graph pruning and canonicalization opportunities that are // necessary for the second LegalizeTFPass(allow_partial_conversion=false) diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 21668b7e059..295cdcaaf23 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -985,6 +985,20 @@ LogicalResult ConvertToHloModule::Lower( if (auto call_op = dyn_cast(inst)) { return LowerFunctionCall(&call_op, builder, &value_map); } + if (auto op = dyn_cast(inst)) { + Value operand = op.getOperand(); + auto ty = operand.getType().dyn_cast(); + // If this was a cast from a static shaped tensors, then it is a noop for + // export to HLO and we can use the operand. + if (!ty || !ty.hasStaticShape()) { + inst->emitOpError() + << "requires static shaped operand for HLO translation"; + return failure(); + } + + value_map[op.getResult()] = value_map[operand]; + return success(); + } // TODO(jpienaar): This doesn't support layouts yet. if (matchPattern(inst, m_Constant(&const_attr))) { diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index ed06863cbf4..15fa91588a5 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -1044,3 +1044,16 @@ func @main(%arg0: tensor<4xui8>) -> (tensor<4xui8>) { // CHECK: ENTRY // CHECK: %[[ARG0:.*]] = u8[4] parameter(0) // ROOT %[[RESULT:.*]] = u8[4] not(u8[4] %[[ARG0]]) + +// ----- + +// CHECK: HloModule +func @main(%arg0: tensor<4xi32>) -> (tensor<*xi32>) { + %0 = "xla_hlo.not"(%arg0) : (tensor<4xi32>) -> tensor<4xi32> + %1 = tensor_cast %0 : tensor<4xi32> to tensor<*xi32> + return %1 : tensor<*xi32> +} + +// CHECK: ENTRY +// CHECK: %[[ARG0:.*]] = s32[4] parameter(0) +// ROOT %[[RESULT:.*]] = s32[4] not(s32[4] %[[ARG0]]) diff --git a/tensorflow/compiler/tests/ternary_ops_test.py b/tensorflow/compiler/tests/ternary_ops_test.py index 4525e5ffed7..a1bb64eb88d 100644 --- a/tensorflow/compiler/tests/ternary_ops_test.py +++ b/tensorflow/compiler/tests/ternary_ops_test.py @@ -48,7 +48,8 @@ class TernaryOpsTest(xla_test.XLATestCase, parameterized.TestCase): {'start': 1, 'end': 2, 'num': 1}, {'start': 1, 'end': 4, 'num': 3}, {'start': 0, 'end': 41, 'num': 42}) - @test_util.disable_mlir_bridge('Requires dynamic shape handling') + @test_util.disable_mlir_bridge( + 'TODO(b/156174708): Dynamic result types not supported') def testLinspace(self, start, end, num): expected = np.linspace(start, end, num, dtype=np.float32) result = self._testTernary( diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index b01c5aea4fa..6dc2d1cfa18 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -80,7 +80,8 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(v,), expected=np.tile(v, (7, 42, 1, 1))) - @test_util.disable_mlir_bridge('Dynamic result types not supported') + @test_util.disable_mlir_bridge( + 'TODO(b/156174708): Dynamic result types not supported') def testShiftRightLogical(self): 
self._assertOpOutputMatchesExpected( xla.shift_right_logical, @@ -92,7 +93,8 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(np.array([0xFFFFFFFF, 16], dtype=np.uint32), np.uint32(4)), expected=np.array([0x0FFFFFFF, 1], dtype=np.uint32)) - @test_util.disable_mlir_bridge('Dynamic result types not supported') + @test_util.disable_mlir_bridge( + 'TODO(b/156174708): Dynamic result types not supported') def testShiftRightArithmetic(self): self._assertOpOutputMatchesExpected( xla.shift_right_arithmetic, @@ -194,8 +196,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(np.array([1, 2, 3], dtype=dtype),), expected=np.array([-1, -2, -3], dtype=dtype)) - @test_util.disable_mlir_bridge( - 'Requires XlaPad op shape inference to have static result types') def testPad(self): for dtype in self.numeric_types: From 875f71cf8b1804951d03f50e73f6e4f2cc11af2a Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Mon, 11 May 2020 20:19:17 -0700 Subject: [PATCH 0374/1533] Lower TF XlaConv op using the fallback lowering pass PiperOrigin-RevId: 311048596 Change-Id: Iec444a0680cce33ec898f3202d77c05d4bd9d4ee --- .../compiler/mlir/xla/ir/mlir_hlo_builder.cc | 39 ++++++++++++++ .../compiler/mlir/xla/ir/mlir_hlo_builder.h | 10 ++++ .../compiler/mlir/xla/mlir_hlo_to_hlo.cc | 1 + .../xla/transforms/legalize_tf_with_tf2xla.cc | 1 + tensorflow/compiler/tests/xla_ops_test.py | 7 +-- tensorflow/compiler/xla/client/xla_builder.cc | 52 ++++++++++++------- tensorflow/compiler/xla/client/xla_builder.h | 10 ++++ 7 files changed, 96 insertions(+), 24 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index 99d1da74fc5..cc334d8654f 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -56,6 +56,20 @@ static mlir::DenseIntElementsAttr GetI64ElementsAttr( return mlir::DenseIntElementsAttr::get(ty, mlir_values); } +static mlir::DenseIntElementsAttr ConvertPadding( + absl::Span> padding, + mlir::Builder* builder) { + llvm::SmallVector elements; + elements.reserve(padding.size() * 2); + for (const auto& vals : padding) { + elements.push_back(vals.first); + elements.push_back(vals.second); + } + auto ty = mlir::RankedTensorType::get( + {static_cast(padding.size()), 2}, builder->getIntegerType(64)); + return mlir::DenseIntElementsAttr::get(ty, elements); +} + MlirHloBuilder::~MlirHloBuilder() = default; StatusOr MlirHloBuilder::MakeXlaOp(mlir::Value val) { @@ -79,6 +93,31 @@ XlaOp MlirHloBuilder::ConstantLiteral(const LiteralSlice& literal) { }); } +StatusOr MlirHloBuilder::ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) { + TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( + shape, builder_)); + mlir::ArrayAttr config_attr; + if (precision_config) + config_attr = ConvertPrecisionConfig(precision_config, &builder_); + auto op = builder_.create( + loc_, ty, GetValue(lhs), GetValue(rhs), + GetI64ElementsAttr(window_strides, &builder_), + ConvertPadding(padding, &builder_), + GetI64ElementsAttr(lhs_dilation, &builder_), + GetI64ElementsAttr(rhs_dilation, &builder_), + ConvertConvDimensionNumbers(dimension_numbers, &builder_), + 
builder_.getI64IntegerAttr(feature_group_count), + builder_.getI64IntegerAttr(batch_group_count), config_attr); + return MakeXlaOp(op); +} + StatusOr MlirHloBuilder::TransposeInternal( const Shape& shape, XlaOp operand, absl::Span permutation) { TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType( diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index dbcb6856971..5a84d60cdc2 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -110,6 +110,16 @@ class MlirHloBuilder : public XlaBuilder { private: XlaOp ConstantLiteral(const LiteralSlice& literal) override; + StatusOr ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) override; + StatusOr TransposeInternal( const Shape& shape, XlaOp operand, absl::Span permutation) override; diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 295cdcaaf23..228a26b5abd 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -985,6 +985,7 @@ LogicalResult ConvertToHloModule::Lower( if (auto call_op = dyn_cast(inst)) { return LowerFunctionCall(&call_op, builder, &value_map); } + if (auto op = dyn_cast(inst)) { Value operand = op.getOperand(); auto ty = operand.getType().dyn_cast(); diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index 8b663af3f9e..787d967f3f6 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -165,6 +165,7 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 6dc2d1cfa18..b9a8dc738d1 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -80,8 +80,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(v,), expected=np.tile(v, (7, 42, 1, 1))) - @test_util.disable_mlir_bridge( - 'TODO(b/156174708): Dynamic result types not supported') + @test_util.disable_mlir_bridge('Dynamic result types not supported') def testShiftRightLogical(self): self._assertOpOutputMatchesExpected( xla.shift_right_logical, @@ -93,8 +92,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): args=(np.array([0xFFFFFFFF, 16], dtype=np.uint32), np.uint32(4)), expected=np.array([0x0FFFFFFF, 1], dtype=np.uint32)) - @test_util.disable_mlir_bridge( - 'TODO(b/156174708): Dynamic result types not supported') + @test_util.disable_mlir_bridge('Dynamic result types not supported') def testShiftRightArithmetic(self): self._assertOpOutputMatchesExpected( xla.shift_right_arithmetic, @@ -111,7 +109,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): xla_data_pb2.PrecisionConfig.HIGHEST) @parameterized.parameters(*PRECISION_VALUES) - @test_util.disable_mlir_bridge('Not supported yet') def 
testConv(self, precision): for dtype in set(self.float_types).intersection( set([dtypes.bfloat16.as_numpy_dtype, np.float32])): diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 0b146a43e64..bd70ce80082 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -1301,7 +1301,6 @@ XlaOp XlaBuilder::ConvGeneralDilated( int64 feature_group_count, int64 batch_group_count, const PrecisionConfig* precision_config) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* lhs_shape, GetShapePtr(lhs)); TF_ASSIGN_OR_RETURN(const Shape* rhs_shape, GetShapePtr(rhs)); TF_RETURN_IF_ERROR( @@ -1314,30 +1313,45 @@ XlaOp XlaBuilder::ConvGeneralDilated( window_dimensions[i] = rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i)); } - TF_ASSIGN_OR_RETURN(*instr.mutable_window(), + + TF_ASSIGN_OR_RETURN(Window window, ShapeInference::InferWindowFromDimensions( window_dimensions, window_strides, padding, lhs_dilation, rhs_dilation)); - - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferConvolveShape( - *lhs_shape, *rhs_shape, feature_group_count, - batch_group_count, instr.window(), dimension_numbers)); - *instr.mutable_shape() = shape.ToProto(); - - *instr.mutable_convolution_dimension_numbers() = dimension_numbers; - instr.set_feature_group_count(feature_group_count); - instr.set_batch_group_count(batch_group_count); - - if (precision_config != nullptr) { - *instr.mutable_precision_config() = *precision_config; - } - - return AddInstruction(std::move(instr), HloOpcode::kConvolution, - {lhs, rhs}); + TF_ASSIGN_OR_RETURN(Shape shape, + ShapeInference::InferConvolveShape( + *lhs_shape, *rhs_shape, feature_group_count, + batch_group_count, window, dimension_numbers)); + return ConvGeneralDilatedInternal(shape, lhs, rhs, window, window_strides, + padding, lhs_dilation, rhs_dilation, + dimension_numbers, feature_group_count, + batch_group_count, precision_config); }); } +StatusOr XlaBuilder::ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span window_strides, + absl::Span> padding, + absl::Span lhs_dilation, absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + *instr.mutable_window() = window; + *instr.mutable_convolution_dimension_numbers() = dimension_numbers; + instr.set_feature_group_count(feature_group_count); + instr.set_batch_group_count(batch_group_count); + + if (precision_config != nullptr) { + *instr.mutable_precision_config() = *precision_config; + } + + return AddInstruction(std::move(instr), HloOpcode::kConvolution, {lhs, rhs}); +} + XlaOp XlaBuilder::Fft(XlaOp operand, const FftType fft_type, const absl::Span fft_length) { return ReportErrorOrReturn([&]() -> StatusOr { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index bfb97d7721f..33fe62e9322 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -491,6 +491,16 @@ class XlaBuilder { int64 batch_group_count = 1, const PrecisionConfig* precision_config = nullptr); + virtual StatusOr ConvGeneralDilatedInternal( + const Shape& shape, XlaOp lhs, XlaOp rhs, const Window& window, + absl::Span 
window_strides, + absl::Span> padding, + absl::Span lhs_dilation, + absl::Span rhs_dilation, + const ConvolutionDimensionNumbers& dimension_numbers, + int64 feature_group_count, int64 batch_group_count, + const PrecisionConfig* precision_config); + XlaOp Fft(XlaOp operand, FftType fft_type, absl::Span fft_length); From 6598d11e8b8ea3f33c65091d5ffdfacbbc98cfad Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Mon, 11 May 2020 20:24:30 -0700 Subject: [PATCH 0375/1533] Whitelist XlaBroadcastHelperOp and enable tests XlaDynamicUpdateSliceOp PiperOrigin-RevId: 311049106 Change-Id: I6f47f784f744ba3e60f3f377fa90412b1114d3b5 --- .../compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc | 1 + tensorflow/compiler/tests/BUILD | 1 + tensorflow/compiler/tests/xla_ops_test.py | 1 - 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index 787d967f3f6..86a2defd3a8 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -165,6 +165,7 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 515e8aaf06c..ea4ba8dab6b 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -562,6 +562,7 @@ tf_xla_py_test( name = "dynamic_slice_ops_test", size = "small", srcs = ["dynamic_slice_ops_test.py"], + enable_mlir_bridge = True, python_version = "PY3", tags = [ "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index b9a8dc738d1..1f83701ea7c 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -51,7 +51,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): equality_fn = self.assertAllClose equality_fn(result, expected, rtol=1e-3) - @test_util.disable_mlir_bridge('Not supported yet') def testAdd(self): for dtype in self.numeric_types: self._assertOpOutputMatchesExpected( From bec0b3823311c8a2f07831b92edf0cd8e04ad9c5 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Mon, 11 May 2020 21:51:28 -0700 Subject: [PATCH 0376/1533] Fix a comment about device/op_device. PiperOrigin-RevId: 311058019 Change-Id: Icaf5be7bb456f3b38b4057da6de51214ac9716a0 --- tensorflow/core/protobuf/remote_tensor_handle.proto | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/protobuf/remote_tensor_handle.proto b/tensorflow/core/protobuf/remote_tensor_handle.proto index 10995226a9b..36e3f810b73 100644 --- a/tensorflow/core/protobuf/remote_tensor_handle.proto +++ b/tensorflow/core/protobuf/remote_tensor_handle.proto @@ -21,11 +21,11 @@ message RemoteTensorHandle { int64 op_id = 1; // The index into the outputs of the operation that produced this tensor. int32 output_num = 2; - // Device of the operation that produced this tensor. Cannot be empty. + // Device where the tensor is located. Cannot be empty. // For multi-device functions, it's the default device passed to placer. string device = 3; - // Device where the tensor is located. Can be empty if the operation producing - // this tensor is a multi-device function. 
+ // Device of the operation producing this tensor. Can be empty if the + // operation producing this tensor is a multi-device function. string op_device = 4; // Tensor type. DataType dtype = 5; From 4926e23ba4772e3007017dda9ee1585d30ca012b Mon Sep 17 00:00:00 2001 From: Xunkai Zhang Date: Mon, 11 May 2020 22:10:49 -0700 Subject: [PATCH 0377/1533] [tfls.image] Provide TensorImage#getWidth and TensorImage#getHeight as they are not trivial. PiperOrigin-RevId: 311060642 Change-Id: Ie6d8043ffe82cb6276cb919d9d799c0740ef29c0 --- .../lite/support/image/TensorImage.java | 61 ++++++++++++++++++- .../support/tensorbuffer/TensorBuffer.java | 2 +- 2 files changed, 60 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/TensorImage.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/TensorImage.java index b19ef2e3b62..bced23e6f67 100644 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/TensorImage.java +++ b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/image/TensorImage.java @@ -231,6 +231,26 @@ public class TensorImage { return container.getDataType(); } + /** + * Gets the image width. + * + * @throws IllegalStateException if the TensorImage never loads data. + * @throws IllegalArgumentException if the container data is corrupted. + */ + public int getWidth() { + return container.getWidth(); + } + + /** + * Gets the image height. + * + * @throws IllegalStateException if the TensorImage never loads data. + * @throws IllegalArgumentException if the container data is corrupted. + */ + public int getHeight() { + return container.getHeight(); + } + // Requires tensor shape [h, w, 3] or [1, h, w, 3]. static void checkImageTensorShape(int[] shape) { SupportPreconditions.checkArgument( @@ -273,6 +293,41 @@ public class TensorImage { isBufferUpdated = true; } + int getWidth() { + SupportPreconditions.checkState( + isBitmapUpdated || isBufferUpdated, + "Both buffer and bitmap data are obsolete. Forgot to call TensorImage#load?"); + if (isBitmapUpdated) { + return bitmapImage.getWidth(); + } + return getBufferDimensionSize(-2); + } + + int getHeight() { + SupportPreconditions.checkState( + isBitmapUpdated || isBufferUpdated, + "Both buffer and bitmap data are obsolete. Forgot to call TensorImage#load?"); + if (isBitmapUpdated) { + return bitmapImage.getHeight(); + } + return getBufferDimensionSize(-3); + } + + // Internal helper method to get the size of one dimension in the shape of the `bufferImage`. + // Requires `isBufferUpdated` is true. + // Throws `IllegalArgumentException` if data is corrupted. + private int getBufferDimensionSize(int dim) { + int[] shape = bufferImage.getShape(); + // The defensive check is needed because bufferImage might be invalidly changed by user + // (a.k.a internal data is corrupted) + TensorImage.checkImageTensorShape(shape); + dim = dim % shape.length; + if (dim < 0) { + dim += shape.length; + } + return shape[dim]; + } + public DataType getDataType() { return dataType; } @@ -284,7 +339,8 @@ public class TensorImage { return bitmapImage; } if (!isBufferUpdated) { - throw new IllegalStateException("Both buffer and bitmap data are obsolete."); + throw new IllegalStateException( + "Both buffer and bitmap data are obsolete. 
Forgot to call TensorImage#load?"); } if (bufferImage.getDataType() != DataType.UINT8) { throw new IllegalStateException( @@ -310,7 +366,8 @@ public class TensorImage { return bufferImage; } SupportPreconditions.checkArgument( - isBitmapUpdated, "Both buffer and bitmap data are obsolete."); + isBitmapUpdated, + "Both buffer and bitmap data are obsolete. Forgot to call TensorImage#load?"); int requiredFlatSize = bitmapImage.getWidth() * bitmapImage.getHeight() * 3; if (bufferImage == null || (!bufferImage.isDynamic() && bufferImage.getFlatSize() != requiredFlatSize)) { diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBuffer.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBuffer.java index 16622a25333..fa05be363a6 100644 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBuffer.java +++ b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/tensorbuffer/TensorBuffer.java @@ -379,13 +379,13 @@ public abstract class TensorBuffer { // Check if the new shape is the same as current shape. int newFlatSize = computeFlatSize(shape); + this.shape = shape.clone(); if (flatSize == newFlatSize) { return; } // Update to the new shape. flatSize = newFlatSize; - this.shape = shape.clone(); buffer = ByteBuffer.allocateDirect(flatSize * getTypeSize()); buffer.order(ByteOrder.nativeOrder()); } From b76ef65778eb1ebd67d5f43d88b1e353c1c41a0c Mon Sep 17 00:00:00 2001 From: Peng Sun Date: Mon, 11 May 2020 08:53:29 +0100 Subject: [PATCH 0378/1533] add bias to transpose_conv TESTs. --- .../lite/testing/op_tests/transpose_conv.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/testing/op_tests/transpose_conv.py b/tensorflow/lite/testing/op_tests/transpose_conv.py index 654856f0d88..ce30860e289 100644 --- a/tensorflow/lite/testing/op_tests/transpose_conv.py +++ b/tensorflow/lite/testing/op_tests/transpose_conv.py @@ -38,6 +38,7 @@ def make_transpose_conv_tests(options): { "input_shape": [[1, 3, 4, 1], [1, 10, 10, 3], [3, 20, 20, 1]], "filter_size": [[1, 1], [1, 2], [3, 3]], + "has_bias": [False], "strides": [[1, 1, 1, 1], [1, 3, 3, 1]], "padding": ["SAME", "VALID"], "data_format": ["NHWC"], @@ -50,6 +51,7 @@ def make_transpose_conv_tests(options): { "input_shape": [[1, 3, 3, 1]], "filter_size": [[3, 3, 2, 1]], + "has_bias": [False], "strides": [[1, 1, 1, 1]], "padding": ["SAME"], "data_format": ["NHWC"], @@ -60,6 +62,7 @@ def make_transpose_conv_tests(options): { "input_shape": [[1, 3, 3, 1]], "filter_size": [[3, 3, 2, 1]], + "has_bias": [False], "strides": [[1, 2, 2, 1]], "padding": ["SAME"], "data_format": ["NHWC"], @@ -70,13 +73,26 @@ def make_transpose_conv_tests(options): { "input_shape": [[1, 4, 3, 1]], "filter_size": [[3, 3, 2, 1]], + "has_bias": [False], "strides": [[1, 2, 2, 1]], "padding": ["SAME"], "data_format": ["NHWC"], "channel_multiplier": [1], "output_shape": [[1, 8, 6, 2]], "fully_quantize": [True] - } + }, + { + "input_shape": [[1, 3, 3, 1]], + "filter_size": [[3, 3, 2, 1]], + "has_bias": [True], + "strides": [[1, 1, 1, 1]], + "padding": ["SAME"], + "data_format": ["NHWC"], + "channel_multiplier": [1], + "output_shape": [[1, 3, 3, 2]], + "fully_quantize": [True] + }, + ] def get_tensor_shapes(parameters): @@ -124,6 +140,13 @@ def make_transpose_conv_tests(options): strides=parameters["strides"], padding=parameters["padding"], 
data_format=parameters["data_format"]) + if parameters["has_bias"]: + bias_input = create_tensor_data(np.float32, + (parameters["output_shape"][-1],), + min_value=-1, + max_value=1) + out = tf.nn.bias_add(out, bias_input, + data_format=parameters["data_format"]) return input_tensors, [out] From 65773fd394162eaada4b200d6bc3d4c2058e17e1 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 12 May 2020 01:34:43 -0700 Subject: [PATCH 0379/1533] Retrieve CUDA targets to build in nvcc wrapper from clang command line option. Hard-coding it (through a repo rule) in one place is bad enough. The proper solution would be to make CUDA targets a bazel 'feature' and map it to compiler flags in crosstools. The more pressing requirement though is to allow compiling a mix of SASS and PTX binaries, instead of SASS+PTX for every CUDA target. PiperOrigin-RevId: 311081931 Change-Id: If6aea7bfa08e21984471ce3593e0df3ac2c21798 --- .../crosstool_wrapper_driver_is_not_gcc.tpl | 36 ++++++++----------- .../windows/msvc_wrapper_for_nvcc.py.tpl | 32 +++++++---------- third_party/gpus/cuda_configure.bzl | 8 +---- 3 files changed, 29 insertions(+), 47 deletions(-) diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl index f3b2ae6846d..303339e77f7 100755 --- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl +++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl @@ -53,13 +53,6 @@ NVCC_PATH = '%{nvcc_path}' PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) NVCC_VERSION = '%{cuda_version}' - -# TODO(amitpatankar): Benchmark enabling all capabilities by default. -# Environment variable for supported TF CUDA Compute Capabilities -# eg. export TF_CUDA_COMPUTE_CAPABILITIES=3.5,3.7,5.2,6.0,6.1,7.0 -CUDA_COMPUTE_ENV_VAR = 'TF_CUDA_COMPUTE_CAPABILITIES' -DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,6.0' - def Log(s): print('gpus/crosstool: {0}'.format(s)) @@ -78,7 +71,8 @@ def GetOptionValue(argv, option): """ parser = ArgumentParser() - parser.add_argument('-' + option, nargs='*', action='append') + parser.add_argument(option, nargs='*', action='append') + option = option.lstrip('-').replace('-', '_') args, _ = parser.parse_known_args(argv) if not args or not vars(args)[option]: return [] @@ -180,17 +174,17 @@ def InvokeNvcc(argv, log=False): host_compiler_options = GetHostCompilerOptions(argv) nvcc_compiler_options = GetNvccOptions(argv) - opt_option = GetOptionValue(argv, 'O') - m_options = GetOptionValue(argv, 'm') + opt_option = GetOptionValue(argv, '-O') + m_options = GetOptionValue(argv, '-m') m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']]) - include_options = GetOptionValue(argv, 'I') - out_file = GetOptionValue(argv, 'o') - depfiles = GetOptionValue(argv, 'MF') - defines = GetOptionValue(argv, 'D') + include_options = GetOptionValue(argv, '-I') + out_file = GetOptionValue(argv, '-o') + depfiles = GetOptionValue(argv, '-MF') + defines = GetOptionValue(argv, '-D') defines = ''.join([' -D' + define for define in defines]) - undefines = GetOptionValue(argv, 'U') + undefines = GetOptionValue(argv, '-U') undefines = ''.join([' -U' + define for define in undefines]) - std_options = GetOptionValue(argv, 'std') + std_options = GetOptionValue(argv, '-std') # Supported -std flags as of CUDA 9.0. Only keep last to mimic gcc/clang. 
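(For reference, the has_bias cases added to the transpose_conv test in PATCH 0378 above reduce to the small graph below; a minimal standalone sketch in plain TF 2.x eager ops, outside the zip-test harness, with shapes taken from the new parameter set.)

import numpy as np
import tensorflow as tf

# Input is NHWC [1, 3, 3, 1]; the conv2d_transpose filter layout is
# [height, width, out_channels, in_channels], i.e. [3, 3, 2, 1].
x = tf.constant(np.random.uniform(-1, 1, size=(1, 3, 3, 1)).astype(np.float32))
f = tf.constant(np.random.uniform(-1, 1, size=(3, 3, 2, 1)).astype(np.float32))
out = tf.nn.conv2d_transpose(x, f, output_shape=[1, 3, 3, 2],
                             strides=[1, 1, 1, 1], padding="SAME")
# The new has_bias=True path simply appends a bias over the output channels.
bias = tf.constant(np.random.uniform(-1, 1, size=(2,)).astype(np.float32))
out = tf.nn.bias_add(out, bias, data_format="NHWC")
print(out.shape)  # (1, 3, 3, 2)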
nvcc_allowed_std_options = ["c++03", "c++11", "c++14"] std_options = ''.join([' -std=' + define @@ -198,7 +192,7 @@ def InvokeNvcc(argv, log=False): # The list of source files get passed after the -c option. I don't know of # any other reliable way to just get the list of source files to be compiled. - src_files = GetOptionValue(argv, 'c') + src_files = GetOptionValue(argv, '-c') # Pass -w through from host to nvcc, but don't do anything fancier with # warnings-related flags, since they're not necessarily the same across @@ -224,13 +218,12 @@ def InvokeNvcc(argv, log=False): srcs = ' '.join(src_files) out = ' -o ' + out_file[0] - supported_cuda_compute_capabilities = [ %{cuda_compute_capabilities} ] nvccopts = '-D_FORCE_INLINES ' - for capability in supported_cuda_compute_capabilities: - capability = capability.replace('.', '') + for capability in GetOptionValue(argv, "--cuda-gpu-arch"): + capability = capability[len('sm_'):] nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % ( capability, capability, capability) - nvccopts += ' ' + nvcc_compiler_options + nvccopts += nvcc_compiler_options nvccopts += undefines nvccopts += defines nvccopts += std_options @@ -272,6 +265,7 @@ def main(): if args.x and args.x[0] == 'cuda': if args.cuda_log: Log('-x cuda') leftover = [pipes.quote(s) for s in leftover] + args.cuda_log = True if args.cuda_log: Log('using nvcc') return InvokeNvcc(leftover, log=args.cuda_log) diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl index 46e8aef3606..c10fb826494 100644 --- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -37,13 +37,6 @@ GCC_HOST_COMPILER_PATH = ('%{gcc_host_compiler_path}') NVCC_PATH = '%{nvcc_path}' NVCC_VERSION = '%{cuda_version}' NVCC_TEMP_DIR = "%{nvcc_tmp_dir}" -DEFAULT_CUDA_COMPUTE_CAPABILITIES = '3.5,6.0' - -# Taken from environment variable for supported TF CUDA Compute Capabilities -# eg. export TF_CUDA_COMPUTE_CAPABILITIES=3.5,3.7,5.2,6.0,6.1,7.0 -supported_cuda_compute_capabilities = os.environ.get( - 'TF_CUDA_COMPUTE_CAPABILITIES', - DEFAULT_CUDA_COMPUTE_CAPABILITIES).split(',') def Log(s): print('gpus/crosstool: {0}'.format(s)) @@ -53,7 +46,7 @@ def GetOptionValue(argv, option): """Extract the list of values for option from options. Args: - option: The option whose value to extract, without the leading '/'. + option: The option whose value to extract. Returns: 1. A list of values, either directly following the option, @@ -62,10 +55,11 @@ def GetOptionValue(argv, option): 2. The leftover options. 
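(A self-contained sketch of the option-extraction pattern the wrapper changes above rely on; plain argparse with hypothetical argv values, showing how --cuda-gpu-arch occurrences taken from the clang-style command line replace the old TF_CUDA_COMPUTE_CAPABILITIES lookup.)

import argparse

def get_option_value(argv, option):
  # Mirrors GetOptionValue() in the wrapper: collect every value given for
  # `option`, whether written as --opt=value or --opt value.
  parser = argparse.ArgumentParser()
  parser.add_argument(option, nargs='*', action='append')
  dest = option.lstrip('-').replace('-', '_')
  args, _ = parser.parse_known_args(argv)
  values = vars(args).get(dest)
  return sum(values, []) if values else []

argv = ['-x', 'cuda', '--cuda-gpu-arch=sm_60', '--cuda-gpu-arch=sm_70',
        '-c', 'kernel.cu.cc', '-o', 'kernel.o']
archs = get_option_value(argv, '--cuda-gpu-arch')
print(archs)                            # ['sm_60', 'sm_70']
print([a[len('sm_'):] for a in archs])  # ['60', '70'] -> one -gencode per arch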
""" - parser = ArgumentParser(prefix_chars='/') - parser.add_argument('/' + option, nargs='*', action='append') + parser = ArgumentParser(prefix_chars='-/') + parser.add_argument(option, nargs='*', action='append') + option = option.lstrip('-/').replace('-', '_') args, leftover = parser.parse_known_args(argv) - if args and vars(args)[option]: + if args and vars(args).get(option): return (sum(vars(args)[option], []), leftover) return ([], leftover) @@ -122,18 +116,18 @@ def InvokeNvcc(argv, log=False): nvcc_compiler_options, argv = GetNvccOptions(argv) - opt_option, argv = GetOptionValue(argv, 'O') + opt_option, argv = GetOptionValue(argv, '/O') opt = ['-g'] if (len(opt_option) > 0 and opt_option[0] != 'd'): opt = ['-O2'] - include_options, argv = GetOptionValue(argv, 'I') + include_options, argv = GetOptionValue(argv, '/I') includes = ["-I " + include for include in include_options] - defines, argv = GetOptionValue(argv, 'D') + defines, argv = GetOptionValue(argv, '/D') defines = ['-D' + define for define in defines] - undefines, argv = GetOptionValue(argv, 'U') + undefines, argv = GetOptionValue(argv, '/U') undefines = ['-U' + define for define in undefines] # The rest of the unrecognized options should be passed to host compiler @@ -142,10 +136,10 @@ def InvokeNvcc(argv, log=False): m_options = ["-m64"] nvccopts = ['-D_FORCE_INLINES'] - for capability in supported_cuda_compute_capabilities: - capability = capability.replace('.', '') - nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % ( - capability, capability, capability)] + for capability in GetOptionValue(argv, "--cuda-gpu-arch"): + capability = capability[len('sm_'):] + nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % ( + capability, capability, capability) nvccopts += nvcc_compiler_options nvccopts += undefines nvccopts += defines diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 545aeebe97a..c587f117deb 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -840,10 +840,7 @@ def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): "--cuda-gpu-arch=sm_" + cap.replace(".", "") for cap in compute_capabilities ] - - # Capabilities are handled in the "crosstool_wrapper_driver_is_not_gcc" for nvcc - # TODO(csigg): Make this consistent with cuda clang and pass unconditionally. 
- return "if_cuda_clang(%s)" % str(capability_flags) + return str(capability_flags) def _tpl_path(repository_ctx, filename): return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename)) @@ -1092,9 +1089,6 @@ def _create_local_cuda_repository(repository_ctx): "%{cuda_version}": cuda_config.cuda_version, "%{nvcc_path}": nvcc_path, "%{gcc_host_compiler_path}": str(cc), - "%{cuda_compute_capabilities}": ", ".join( - ["\"%s\"" % c for c in cuda_config.compute_capabilities], - ), "%{nvcc_tmp_dir}": _get_nvcc_tmp_dir_for_windows(repository_ctx), } repository_ctx.template( From f4c54a716543a57fbbd0e163312136dc47414b13 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 12 May 2020 01:45:25 -0700 Subject: [PATCH 0380/1533] Add TPU Configuration C API PiperOrigin-RevId: 311082982 Change-Id: I18031e1c84d28b37cbf1cdd68372e351d2da476a --- tensorflow/core/tpu/BUILD | 8 ++++ tensorflow/core/tpu/tpu_config_c_api.h | 54 ++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100644 tensorflow/core/tpu/tpu_config_c_api.h diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index 43b2d93b917..4ea5fc39929 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -51,3 +51,11 @@ cc_library( "//tensorflow/core:lib", ], ) + +cc_library( + name = "tpu_config_c_api", + hdrs = ["tpu_config_c_api.h"], + deps = [ + "//tensorflow/c:tf_status", + ], +) diff --git a/tensorflow/core/tpu/tpu_config_c_api.h b/tensorflow/core/tpu/tpu_config_c_api.h new file mode 100644 index 00000000000..334a6a19325 --- /dev/null +++ b/tensorflow/core/tpu/tpu_config_c_api.h @@ -0,0 +1,54 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_CONFIG_C_API_H_ +#define TENSORFLOW_CORE_TPU_TPU_CONFIG_C_API_H_ + +#include + +#include "tensorflow/c/tf_status.h" + +typedef struct TpuSerializedProto TpuSerializedProto; + +extern "C" { + +bool TPUHostInitialized(); + +// TODO(frankchn): Modify API to take in raw values instead of Tensors. +void ConfigureDistributedTpuOp_DoWork(size_t input_size, + TpuSerializedProto** inputs, + TpuSerializedProto* output, + TF_Status* status); + +void WaitForDistributedTpuOp_DoWork(size_t input_size, + TpuSerializedProto** inputs, + TpuSerializedProto* output, + TF_Status* status); + +void ShutdownDistributedTpuOp_DoWork(TF_Status* status); + +void InitializeHostForDistributedTpuOp_DoWork( + size_t input_size, TpuSerializedProto** inputs, + bool enable_whole_mesh_compilations, TpuSerializedProto* output, + TF_Status* status); + +void SetGlobalTPUArrayOp_DoWork(size_t input_size, TpuSerializedProto** inputs, + TF_Status* status); + +void DisconnectDistributedTpuChipsOp_DoWork(TpuSerializedProto* output, + TF_Status* status); +} + +#endif // TENSORFLOW_CORE_TPU_TPU_CONFIG_C_API_H_ From c2118a3c96a42e2ce52a965d356aaa6ee44c7d9f Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 12 May 2020 02:02:58 -0700 Subject: [PATCH 0381/1533] Update GraphDef version to 399. PiperOrigin-RevId: 311084694 Change-Id: Ia06021ab41ddb9dafb1f3fbd173f9a8e5cc0efa1 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 48fc0ce2a78..23e6138d553 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 398 // Updated: 2020/5/11 +#define TF_GRAPH_DEF_VERSION 399 // Updated: 2020/5/12 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From e89413399b0e80b2412ef7262f03c4796fbe8779 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 02:03:03 -0700 Subject: [PATCH 0382/1533] compat: Update forward compatibility horizon to 2020-05-12 PiperOrigin-RevId: 311084706 Change-Id: Icbb98cd9e3465fe0634c5b52c2b0e4825cbdc654 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index a7cf5b25363..627979a5cb1 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 11) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 12) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 13ce8851cb96a848c49f2050b3b98ee1762a5ad0 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 12 May 2020 02:30:36 -0700 Subject: [PATCH 0383/1533] Use uint8_t, uint32_t and uint32_t. Also delete unused build_defs.bzl. 
PiperOrigin-RevId: 311087719 Change-Id: Iaa27b214e5d2e5227c4a5d454cb244ee70311086 --- .../mlir/tools/kernel_gen/build_defs.bzl | 96 ------------------- .../mlir/tools/kernel_gen/cubin_creator.cc | 30 +++--- .../mlir/tools/kernel_gen/cubin_creator.h | 11 ++- .../mlir/tools/kernel_gen/tf_to_cubin.cc | 2 +- 4 files changed, 22 insertions(+), 117 deletions(-) delete mode 100644 tensorflow/compiler/mlir/tools/kernel_gen/build_defs.bzl diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/build_defs.bzl b/tensorflow/compiler/mlir/tools/kernel_gen/build_defs.bzl deleted file mode 100644 index cec9968e65b..00000000000 --- a/tensorflow/compiler/mlir/tools/kernel_gen/build_defs.bzl +++ /dev/null @@ -1,96 +0,0 @@ -load("//third_party/gpus/cuda:build_defs.bzl", "cuda_gpu_select_list") - -def _lookup_file(filegroup, path): - """Extracts file at (relative) path in filegroup.""" - for file in filegroup.files.to_list(): - if file.path.endswith(path): - return file - return None - -def _gen_kernel_image_hdr_impl(ctx): - if not ctx.attr.gpu_archs: - fail("No GPU architecture specified, use --config=cuda or similar.") - - name = ctx.attr.name - tile_sizes = ctx.attr.tile_size.replace("x", ",") - same_shape = [] - if ctx.attr.same_shape: - same_shape.append("--same_shape=%s" % ctx.attr.same_shape) - - cubins = [] - images = [] - for arch in ctx.attr.gpu_archs: - filename = "%s.%s.cubin" % (name, arch) - cubin = ctx.actions.declare_file(filename) - ctx.actions.run( - outputs = [cubin], - executable = ctx.executable._tool, - arguments = same_shape + [ - "--tile_sizes=%s" % tile_sizes, - "--arch=%s" % arch.split("_")[1], - "--output=%s" % cubin.path, - ctx.attr.op, - ], - mnemonic = "compile", - ) - cubins.append(cubin) - images.append("--image=profile=%s,file=%s" % (arch, cubin.path)) - - # Generate fatbin file from all cubins. 
- fatbin = ctx.actions.declare_file("%s.fatbin" % name) - ctx.actions.run( - outputs = [fatbin], - inputs = cubins, - executable = _lookup_file(ctx.attr._cuda_root, "bin/fatbinary"), - arguments = [ - "--64", - "--cmdline=--compile-only", - "--link", - "--compress-all", - "--create=%s" % fatbin.path, - ] + images, - mnemonic = "fatbinary", - ) - - bin2c = _lookup_file(ctx.attr._cuda_root, "bin/bin2c") - ctx.actions.run_shell( - outputs = [ctx.outputs.out], - inputs = [fatbin], - tools = [bin2c], - command = "%s --static --const --type=int --name=%s %s 1> %s" % - (bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path), - mnemonic = "bin2c", - ) - -_gen_kernel_image_hdr = rule( - implementation = _gen_kernel_image_hdr_impl, - output_to_genfiles = True, - attrs = { - "op": attr.string(mandatory = True), - "tile_size": attr.string(mandatory = True), - "same_shape": attr.string(), - "out": attr.output(mandatory = True), - "symbol": attr.string(mandatory = True), - "gpu_archs": attr.string_list(mandatory = True), - "_cuda_root": attr.label( - default = Label("//third_party/gpus/cuda:cuda_root"), - ), - "_tool": attr.label( - executable = True, - default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_cubin"), - cfg = "host", - ), - }, -) - -def gen_kernel_image_hdr(name, op, tile_size, same_shape = None): - """Generates a C header with fatbin data from a Tensorflow op.""" - _gen_kernel_image_hdr( - name = name, - op = op, - tile_size = tile_size, - same_shape = same_shape, - out = "include/tfrt/gpu/ops/tf/%s.h" % name, - symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""), - gpu_archs = cuda_gpu_select_list("sm_{}"), - ) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc index 45d10214a42..b1c4b1beae1 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.cc @@ -136,7 +136,7 @@ struct PropagateStaticKnowledge : public mlir::PassWrapper> { explicit PropagateStaticKnowledge(mlir::FunctionType type, - llvm::ArrayRef same_shape_) + llvm::ArrayRef same_shape_) : func_type(type), same_shape(same_shape_) {} void runOnOperation() override { @@ -152,8 +152,8 @@ struct PropagateStaticKnowledge func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 1)); mlir::Value zero = b.create( func.getLoc(), index_type, b.getIntegerAttr(b.getIndexType(), 0)); - unsigned arg_pos = 0; - std::vector positions; + uint32_t arg_pos = 0; + std::vector positions; for (mlir::Type arg_type : func_type.getInputs()) { positions.push_back(arg_pos); func.getArgument(arg_pos + 2).replaceAllUsesWith(zero); @@ -165,13 +165,13 @@ struct PropagateStaticKnowledge // can use that here. Simply replace usages of the shape parameters within // the function body to a single shape parameter. 
if (!same_shape.empty()) { - int first = same_shape.front(); - int first_offset = positions.at(first); + auto first = same_shape.front(); + auto first_offset = positions.at(first); mlir::ShapedType first_type = func_type.getInput(first).cast(); - unsigned rank = first_type.getRank(); - for (int same : same_shape.drop_front(1)) { - unsigned same_offset = positions.at(same); + uint32_t rank = first_type.getRank(); + for (auto same : same_shape.drop_front(1)) { + uint32_t same_offset = positions.at(same); auto same_type = func_type.getInput(same).cast(); if (same_type.getRank() != rank) { func.emitOpError() << "same shape constraints on arguments with " @@ -180,7 +180,7 @@ struct PropagateStaticKnowledge signalPassFailure(); } - for (int i = 0; i < 2 * rank; ++i) { + for (uint32_t i = 0; i < 2 * rank; ++i) { // Replace uses for second arg data with first arg. auto same_arg = func.getArgument(same_offset + 3 + i); auto first_arg = func.getArgument(first_offset + 3 + i); @@ -191,11 +191,11 @@ struct PropagateStaticKnowledge } mlir::FunctionType func_type; - llvm::ArrayRef same_shape; + llvm::ArrayRef same_shape; }; Status PropagateStaticShapeKnowledgeToKernel( - mlir::ModuleOp module, llvm::ArrayRef same_shape) { + mlir::ModuleOp module, llvm::ArrayRef same_shape) { // Grab the original signature from the single function. auto func = *module.getBody()->op_begin(); @@ -218,10 +218,10 @@ Status PropagateStaticShapeKnowledgeToKernel( } } // namespace -StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( - llvm::StringRef tf_code, std::pair compute_capability, - llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, - llvm::ArrayRef unroll_factors) { +StatusOr> tensorflow::kernel_gen::GenerateCubinForTfCode( + llvm::StringRef tf_code, std::pair compute_capability, + llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, + llvm::ArrayRef unroll_factors) { mlir::MLIRContext context; context.allowUnregisteredDialects(); // TODO(b/152572127) mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h index c8746330c49..47626ba9d0d 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/cubin_creator.h @@ -30,11 +30,12 @@ limitations under the License. 
namespace tensorflow { namespace kernel_gen { -xla::StatusOr> GenerateCubinForTfCode( - llvm::StringRef tf_code, std::pair compute_capability = {7, 5}, - llvm::ArrayRef tile_sizes = {16, 64}, - llvm::ArrayRef same_shape = {}, - llvm::ArrayRef unroll_factors = {}); +xla::StatusOr> GenerateCubinForTfCode( + llvm::StringRef tf_code, + std::pair compute_capability = {7, 5}, + llvm::ArrayRef tile_sizes = {16, 64}, + llvm::ArrayRef same_shape = {}, + llvm::ArrayRef unroll_factors = {}); } // namespace kernel_gen } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc index c9b447f5cad..8edc567e777 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_cubin.cc @@ -102,7 +102,7 @@ int main(int argc, char** argv) { return 1; } - std::vector cubin_data = cubin.ConsumeValueOrDie(); + std::vector cubin_data = cubin.ConsumeValueOrDie(); auto status = tensorflow::WriteStringToFile( tensorflow::Env::Default(), output_file, From 880d6d0115c7a410ad96494ce0652f1ccaead997 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 03:46:10 -0700 Subject: [PATCH 0384/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311095555 Change-Id: I9de5f4ab4570586d39e4cb3b66f4b5a515156a59 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From e8c854a5cceb3be6fd827d54d4743c365abf0043 Mon Sep 17 00:00:00 2001 From: Andrew Stevens Date: Tue, 12 May 2020 12:59:37 +0200 Subject: [PATCH 0385/1533] Fix: resolve formatting/naming nits from review --- .../compiler/mlir/lite/transforms/optimize.cc | 9 ++--- .../mlir/lite/transforms/prepare_quantize.cc | 7 ++-- .../mlir/lite/transforms/prepare_tf.cc | 37 ++++++++----------- 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index 36b786a521a..f57cc3ed17f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -206,8 +206,7 @@ DenseElementsAttr GetShape(Value output_val) { llvm::makeArrayRef(shape)); } -static Type getShapeStrippedType(TypeAttr type_attr) -{ +static Type GetShapeStrippedType(TypeAttr type_attr) { auto type = type_attr.getValue(); auto shaped_type = type.dyn_cast(); if (shaped_type) { @@ -220,13 +219,13 @@ static Type getShapeStrippedType(TypeAttr type_attr) bool NotFromQuantOpDifferentQuant(Value val, TypeAttr qtype_attr) { auto val_defn_op = val.getDefiningOp(); TFL::QuantizeOp q_op = llvm::dyn_cast_or_null(val_defn_op); - if( !q_op) + if (!q_op) return true; // Ignore shape details - weŕe really only trying to // check if quantization is the same. - auto stripped_src_qtype = getShapeStrippedType(q_op.qtypeAttr()); - auto stripped_qtype = getShapeStrippedType(qtype_attr); + auto stripped_src_qtype = GetShapeStrippedType(q_op.qtypeAttr()); + auto stripped_qtype = GetShapeStrippedType(qtype_attr); return stripped_src_qtype == stripped_qtype; } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 54040d63503..50acb3d3fe5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -255,7 +255,7 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { } auto dq_arg = dq_op.getOperand(); - if (!dq_arg.hasOneUse() ) { + if (!dq_arg.hasOneUse()) { // The initial quanization is used sompleace else ... so it might be // reasonable for it to requantized for another purpose. // TODO: ideally would want to still check whether requanization narrows @@ -265,13 +265,12 @@ void PrepareQuantizePass::SanityCheckAndAdjustment(FuncOp func) { // Invariant: // isa(dq_arg.getDefiningOp()) --> - // getdq_arg.getType() != q_op.getResult().getType() + // getdq_arg.getType() != q_op.getResult().getType() // // as otherwise qdq pair would have been optimized away. 
- auto qd_arg_def_q_op = dyn_cast_or_null(dq_arg.getDefiningOp()); - if(!qd_arg_def_q_op) { + if (!qd_arg_def_q_op) { return; } diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index 80e48fd1c83..aa1baea73fb 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -88,10 +88,9 @@ struct FetchConstantMinMaxInputs { using AttrType = DenseFPElementsAttr; bool operator () (TFFakeQuantOp tf_op, AttrType &min_value, AttrType &max_value) const { Value min = tf_op.min(), max = tf_op.max(); - ; - // TODO This is likely redundant (Identity elimination rule are in - // prepare_patterns.td. If not, its certainly, incomplete as neither - // IdentityN ops Nor chains of Identiy* (not sooo rare) are handled + + // TODO: incomplete neither IdentityN ops + // nor chains of Identity* (not rare) are handled if (auto id1 = dyn_cast_or_null(min.getDefiningOp())) min = id1.input(); if (auto id2 = dyn_cast_or_null(max.getDefiningOp())) @@ -229,25 +228,19 @@ struct InsertTFLQuantOpsAfterTFFakeQuantOp // // Three instances of the rule to cover the three different types of // TF::FakeQuant operators -// -using PreparePerTensorFakeQuant = - InsertTFLQuantOpsAfterTFFakeQuantOp - >; +// +using PreparePerTensorFakeQuant = InsertTFLQuantOpsAfterTFFakeQuantOp< + TF::FakeQuantWithMinMaxVarsOp, false, + FetchConstantMinMaxInputs>; -using PreparePerChannelFakeQuant = - InsertTFLQuantOpsAfterTFFakeQuantOp - >; +using PreparePerChannelFakeQuant = InsertTFLQuantOpsAfterTFFakeQuantOp< + TF::FakeQuantWithMinMaxVarsPerChannelOp, true, + FetchConstantMinMaxInputs>; using PreparePerTensorFakeQuantWithMinMaxArgs = - InsertTFLQuantOpsAfterTFFakeQuantOp - >; - + InsertTFLQuantOpsAfterTFFakeQuantOp< + TF::FakeQuantWithMinMaxArgsOp, false, + FetchMinMaxAttrs>; // Templated class for declaring a converter from some TensorFlow convolution // op into its counterpart in TensorFlow Lite. @@ -692,8 +685,8 @@ void PrepareTFPass::runOnFunction() { // parameters from the TF Quant ops, thus this pattern should run with the // first `applyPatternsGreedily` method, which would otherwise removes the // TF FakeQuant ops by the constant folding. - patterns.insert(ctx); - + patterns.insert(ctx); // This pattern will try to identify and optimize for dilated convolution. // e.g. Patterns like "SpaceToBatchND -> Conv2D -> BatchToSpaceND" will be From 53b0f6f7a6d25ce858d03b67ab391865d87ac4eb Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Tue, 12 May 2020 03:58:17 -0700 Subject: [PATCH 0386/1533] Factor out a virtual function for creating the TFLite interpreter so that subclass could create a customized one. 
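For reference, a minimal sketch of how a subclass could use the new hook; the class name and the delegate call are hypothetical (not part of this change), and it assumes the protected members `model_`, `interpreter_` and `params_` from the header below remain visible to subclasses:

    // Hypothetical subclass that builds a customized interpreter.
    class DelegateAwareBenchmark : public BenchmarkTfLiteModel {
     protected:
      TfLiteStatus InitInterpreter() override {
        auto resolver = GetOpResolver();
        const int32_t num_threads = params_.Get<int32_t>("num_threads");
        // Construct the interpreter the same way the base class does ...
        tflite::InterpreterBuilder(*model_, *resolver)(&interpreter_, num_threads);
        if (!interpreter_) return kTfLiteError;
        // ... then customize it, e.g. hand it a delegate owned by the subclass
        // (ModifyGraphWithDelegate is an existing tflite::Interpreter method).
        // return interpreter_->ModifyGraphWithDelegate(delegate_.get());
        return kTfLiteOk;
      }
    };

Keeping interpreter construction behind a virtual method lets such customizations stay out of Init(), which still handles model loading and profiler installation.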
PiperOrigin-RevId: 311096576 Change-Id: Id5fb83c5471e07013263eddd1040df9ec8acbd69 --- tensorflow/lite/tools/benchmark/BUILD | 1 + .../lite/tools/benchmark/benchmark_tflite_model.cc | 14 +++++++++----- .../lite/tools/benchmark/benchmark_tflite_model.h | 3 +++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index 357072226af..f6cb71749f8 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -142,6 +142,7 @@ cc_library( ":profiling_listener", "//tensorflow/lite:framework", "//tensorflow/lite:string_util", + "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/profiling:platform_profiler", "//tensorflow/lite/profiling:profile_summary_formatter", diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 489780e4f69..969713cce73 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/base/attributes.h" #include "absl/strings/numbers.h" #include "ruy/profiler/profiler.h" // from @ruy +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/register.h" #include "tensorflow/lite/model.h" #include "tensorflow/lite/op_resolver.h" @@ -596,17 +597,20 @@ TfLiteStatus BenchmarkTfLiteModel::ResetInputsAndOutputs() { return kTfLiteOk; } -TfLiteStatus BenchmarkTfLiteModel::Init() { - TF_LITE_ENSURE_STATUS(LoadModel()); - +TfLiteStatus BenchmarkTfLiteModel::InitInterpreter() { auto resolver = GetOpResolver(); - const int32_t num_threads = params_.Get("num_threads"); tflite::InterpreterBuilder(*model_, *resolver)(&interpreter_, num_threads); if (!interpreter_) { - TFLITE_LOG(ERROR) << "Failed to construct interpreter"; + TFLITE_LOG(ERROR) << "Failed to initialize the interpreter"; return kTfLiteError; } + return kTfLiteOk; +} + +TfLiteStatus BenchmarkTfLiteModel::Init() { + TF_LITE_ENSURE_STATUS(LoadModel()); + TF_LITE_ENSURE_STATUS(InitInterpreter()); // Install profilers if necessary right after interpreter is created so that // any memory allocations inside the TFLite runtime could be recorded if the diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h index b56390b3775..cc87743b531 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h @@ -74,6 +74,9 @@ class BenchmarkTfLiteModel : public BenchmarkModel { // Allow subclasses to create a customized Op resolver during init. virtual std::unique_ptr GetOpResolver() const; + // Allow subclass to initialize a customized tflite interpereter. + virtual TfLiteStatus InitInterpreter(); + // Create a BenchmarkListener that's specifically for TFLite profiling if // necessary. virtual std::unique_ptr MayCreateProfilingListener() const; From 2b7ac134dac7b2009a82d674d702d62fb5b92dd1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 05:46:06 -0700 Subject: [PATCH 0387/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 311107613 Change-Id: Icf2222c5129d216a786954eb5e36e9aa1a5f90db --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 84c241fe5cc96d21329bf06366bcd757ef0e8d4e Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Tue, 12 May 2020 05:52:39 -0700 Subject: [PATCH 0388/1533] Add RaggedTensor dispatch for tf.nn.dropout. For now, `noise_shape` is not supported when `x` is ragged. Support for `noise_shape` with ragged tensors will be added later. PiperOrigin-RevId: 311108234 Change-Id: I6f9ce69b5dd08d9d73f9093767363cfd4b5370ea --- tensorflow/python/ops/nn_ops.py | 3 +++ .../python/ops/ragged/ragged_dispatch.py | 23 +++++++++++++++++++ .../python/ops/ragged/ragged_dispatch_test.py | 8 ++++++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index de5be20aa84..248c57c1ba5 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -45,6 +45,7 @@ from tensorflow.python.ops.gen_nn_ops import * # pylint: enable=wildcard-import from tensorflow.python.platform import device_context from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.compat import collections_abc from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.deprecation import deprecated_argument_lookup @@ -4513,6 +4514,7 @@ def _get_noise_shape(x, noise_shape): @tf_export(v1=["nn.dropout"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "Please use `rate` instead of `keep_prob`. " "Rate should be set to `rate = 1 - keep_prob`.", "keep_prob") @@ -4567,6 +4569,7 @@ def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None, @tf_export("nn.dropout", v1=[]) +@dispatch.add_dispatch_support def dropout_v2(x, rate, noise_shape=None, seed=None, name=None): """Computes dropout: randomly sets elements to zero to prevent overfitting. 
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py index dd5bd782462..f13bed07ba0 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch.py @@ -30,6 +30,7 @@ from tensorflow.python.ops import clip_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_bitwise_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import string_ops from tensorflow.python.ops import variables @@ -453,6 +454,26 @@ def _ragged_dynamic_partition(data, partitions, num_partitions, name=None): num_partitions, name) return [result[i] for i in range(num_partitions)] + +def _ragged_nn_dropout_v1(x, keep_prob=None, noise_shape=None, seed=None, + name=None, rate=None): + if noise_shape is not None: + raise ValueError('noise_shape is not supported yet for RaggedTensor x') + with ops.name_scope(name, 'RaggedNNDropout', [x, rate]): + x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x') + return x.with_flat_values(nn_ops.dropout(x.flat_values, keep_prob=keep_prob, + seed=seed, rate=rate)) + + +def _ragged_nn_dropout_v2(x, rate, noise_shape=None, seed=None, name=None): + if noise_shape is not None: + raise ValueError('noise_shape is not supported yet for RaggedTensor x') + with ops.name_scope(name, 'RaggedNNDropout', [x, rate]): + x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x') + return x.with_flat_values(nn_ops.dropout_v2(x.flat_values, rate=rate, + seed=seed)) + + # (original_op, ragged_op, ragged_args) _RAGGED_DISPATCH_OPS = [ (array_ops.batch_gather, ragged_batch_gather_ops.batch_gather, @@ -497,6 +518,8 @@ _RAGGED_DISPATCH_OPS = [ (math_ops.reduce_mean, ragged_math_ops.reduce_mean, ['input_tensor']), (math_ops.reduce_any, ragged_math_ops.reduce_any, ['input_tensor']), (math_ops.reduce_all, ragged_math_ops.reduce_all, ['input_tensor']), + (nn_ops.dropout, _ragged_nn_dropout_v1, ['x']), + (nn_ops.dropout_v2, _ragged_nn_dropout_v2, ['x']), ] diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py index 0ce9a6f9771..60d9f6c8713 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import clip_ops from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_bitwise_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import string_ops from tensorflow.python.ops.ragged import ragged_dispatch @@ -232,6 +233,10 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, {'op': array_ops.check_numerics, 'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]), 'message': 'check-numerics'}, + {'op': nn_ops.dropout, + 'x': ragged_factory_ops.constant_value([[-2.0, 3.0], [-3.0]]), + 'rate': 0.5, + 'seed': 1}, ] ) # pyformat: disable def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args): @@ -820,7 +825,8 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, 'strings.substr', 'strings.to_hash_bucket_fast', 'strings.to_hash_bucket_strong', 'strings.to_hash_bucket', 'strings.to_number', 'strings.unicode_script', 'tile', 'truncatediv', - 'truncatemod', 'zeros_like', 'dynamic_partition', 'reverse' + 
'truncatemod', 'zeros_like', 'dynamic_partition', 'reverse', + 'nn.dropout', ] # Ops that should be listed as supported in v1 only. From a672463c591eb68f1dff9c49aa3cd62bdf3bd3d5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 07:48:31 -0700 Subject: [PATCH 0389/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311123770 Change-Id: Ia34913e58b7c1b296a37f90240295369be4b8872 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 773813a42606a362c9c2d7bc0e129ab87d73cd8d Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Tue, 12 May 2020 07:56:12 -0700 Subject: [PATCH 0390/1533] Bug fix for composite tensors in SavedModel PiperOrigin-RevId: 311124917 Change-Id: I1e0b2fb6484cb9049445943594a1dec5760e2620 --- tensorflow/python/saved_model/utils_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py index 42e971d050d..0f635b6bf85 100644 --- a/tensorflow/python/saved_model/utils_impl.py +++ b/tensorflow/python/saved_model/utils_impl.py @@ -178,7 +178,7 @@ def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None): spec = struct_coder.decode_proto(spec_proto) components = [_get_tensor(component.name) for component in tensor_info.composite_tensor.components] - return spec._from_components(components) # pylint: disable=protected-access + return nest.pack_sequence_as(spec, components, expand_composites=True) else: raise ValueError("Invalid TensorInfo.encoding: %s" % encoding) From df1ea3b0d4c834fe1dba719c27e3291f9af25c96 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 12 May 2020 15:08:56 +0000 Subject: [PATCH 0391/1533] Update docstring of tf.split to clarify num_or_size_splits Signed-off-by: Yong Tang --- tensorflow/python/ops/array_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index fbb977f8d9a..8d1284da3d0 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1919,9 +1919,9 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): See also `tf.unstack`. 
- If `num_or_size_splits` is an integer, then `value` is split along the - dimension `axis` into `num_split` smaller tensors. This requires that - `value.shape[axis]` is divisible by `num_split`. + If `num_or_size_splits` is an integer, we call it num_split and + `value` is split along the dimension `axis` into `num_split` smaller + tensors. This requires that `value.shape[axis]` is divisible by `num_split`. If `num_or_size_splits` is a 1-D Tensor (or list), we call it `size_splits` and `value` is split into `len(size_splits)` elements. The shape of the `i`-th From 6bb9ca398dfbd110534341140ea2970a32cfaf60 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 08:10:51 -0700 Subject: [PATCH 0392/1533] Resolve trivial aliases for portable TensorFlow targets. PiperOrigin-RevId: 311127183 Change-Id: I9011f48a3a753d0fdae5cff869a1b28ff1ccda3a --- tensorflow/cc/BUILD | 10 +++++----- tensorflow/core/BUILD | 6 +++--- tensorflow/core/kernels/BUILD | 6 +++--- tensorflow/tools/android/inference_interface/BUILD | 4 ++-- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index e8cb40f153b..e1fad8e697a 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -178,7 +178,7 @@ cc_library_with_android_deps( name = "ops", srcs = ["framework/ops.cc"], hdrs = ["framework/ops.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], deps = [ "//tensorflow/core:core_cpu", "//tensorflow/core:framework", @@ -197,7 +197,7 @@ cc_library_with_android_deps( "framework/scope_internal.h", ], hdrs = ["framework/scope.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], common_deps = [ ":ops", ], @@ -237,7 +237,7 @@ cc_library_with_android_deps( name = "client_session", srcs = ["client/client_session.cc"], hdrs = ["client/client_session.h"], - android_deps = ["//tensorflow/core:android_tensorflow_lib"], + android_deps = ["//tensorflow/core:portable_tensorflow_lib"], common_deps = [ ":ops", ":scope", @@ -275,7 +275,7 @@ cc_library_with_android_deps( srcs = ["ops/const_op.cc"], hdrs = ["ops/const_op.h"], android_deps = [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], common_deps = [ ":ops", @@ -304,7 +304,7 @@ cc_library_with_android_deps( srcs = ["ops/while_loop.cc"], hdrs = ["ops/while_loop.h"], android_deps = [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], common_deps = [ ":cc_ops", diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 7f34bfaa186..a655a9509d3 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -1271,7 +1271,7 @@ filegroup( "//tensorflow/core/platform:mobile_srcs_no_runtime", "//tensorflow/core/public:mobile_srcs_no_runtime", "//tensorflow/core/util:mobile_srcs_no_runtime", - "//tensorflow/core/util/ctc:android_srcs", + "//tensorflow/core/util/ctc:mobile_srcs", ] + glob( [ "client/**/*.cc", @@ -1301,12 +1301,12 @@ filegroup( "//tensorflow/core/common_runtime/eager:srcs", "//tensorflow/core/framework:mobile_srcs_only_runtime", "//tensorflow/core/graph:mobile_srcs_only_runtime", - "//tensorflow/core/kernels:android_srcs", + "//tensorflow/core/kernels:mobile_srcs", "//tensorflow/core/lib/io:mobile_srcs_only_runtime", "//tensorflow/core/profiler:mobile_srcs", "//tensorflow/core/public:mobile_srcs_only_runtime", 
"//tensorflow/core/util/sparse:mobile_srcs_only_runtime", - "//tensorflow/core/util/tensor_bundle:android_srcs", + "//tensorflow/core/util/tensor_bundle:mobile_srcs", "//tensorflow/core/util:mobile_srcs_only_runtime", # Sources for which we already have granular targets. diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index e47c681bb61..7cfb6fcae67 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -7096,7 +7096,7 @@ cc_library( build_test( name = "android_tensorflow_kernels_build_test", - targets = [":android_tensorflow_kernels"], + targets = [":portable_tensorflow_kernels"], ) cc_library( @@ -7109,7 +7109,7 @@ cc_library( "//tensorflow/core:android_gif_internal", "//tensorflow/core:android_jpeg_internal", "//tensorflow/core:android_png_internal", - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], alwayslink = 1, ) @@ -7126,7 +7126,7 @@ cc_library( linkopts = ["-ldl"], visibility = ["//visibility:public"], deps = [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], alwayslink = 1, ) diff --git a/tensorflow/tools/android/inference_interface/BUILD b/tensorflow/tools/android/inference_interface/BUILD index cbd161f05b3..fb3ab00f9bc 100644 --- a/tensorflow/tools/android/inference_interface/BUILD +++ b/tensorflow/tools/android/inference_interface/BUILD @@ -34,7 +34,7 @@ cc_library( copts = tf_copts(), visibility = ["//visibility:public"], deps = [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", "//tensorflow/java/src/main/native", ], alwayslink = 1, @@ -83,7 +83,7 @@ cc_binary( ], deps = [ ":android_tensorflow_inference_jni", - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", LINKER_SCRIPT, ], ) From 275874a436c06be8d13521c291bde77d9e697c1a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 12 May 2020 15:15:45 +0000 Subject: [PATCH 0393/1533] Combine into one block in example of docstring so that it could be rendered correctly Signed-off-by: Yong Tang --- tensorflow/python/ops/array_ops.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 8d1284da3d0..8aa5d66f402 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1931,13 +1931,15 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): For example: >>> x = tf.Variable(tf.random.uniform([5, 30], -1, 1)) - - Split `x` into 3 tensors along dimension 1 + >>> + >>> # Split `x` into 3 tensors along dimension 1: + ... >>> s0, s1, s2 = tf.split(x, num_or_size_splits=3, axis=1) >>> tf.shape(s0).numpy() array([ 5, 10], dtype=int32) - - Split `x` into 3 tensors with sizes [4, 15, 11] along dimension 1 + >>> + >>> # Split `x` into 3 tensors with sizes [4, 15, 11] along dimension 1 + ... >>> split0, split1, split2 = tf.split(x, [4, 15, 11], 1) >>> tf.shape(split0).numpy() array([5, 4], dtype=int32) From 1658986a76169aeb17c7382c8c82fba649adad59 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 12 May 2020 08:54:02 -0700 Subject: [PATCH 0394/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/123bee602a26 PiperOrigin-RevId: 311133851 Change-Id: I5e85bed33bf2295752f2f862d5a4295d5e2a2817 --- .../hlo_to_lhlo_with_xla/passthrough.mlir | 4 ++- .../transforms/xla_hlo_to_lhlo_with_xla.cc | 25 ++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir index cda1dc481a7..6a2b68adac3 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/passthrough.mlir @@ -8,7 +8,9 @@ // CHECK-SAME: ) { func @main(%value: tensor<2x2xf32>) -> tensor<2x2xf32> { // The only expected instruction is a copy from the input into the output. - // CHECK: %[[OUTPUT:.*]] = std.view %[[ARG1]][][] : memref<16xi8> to memref<2x2xf32> + // CHECK: %[[C0:.*]] = constant 0 : index + // CHECK: %[[C02:.*]] = constant 0 : index + // CHECK: %[[OUTPUT:.*]] = std.view %[[ARG1]][%[[C02]]][] : memref<16xi8> to memref<2x2xf32> // CHECK: xla_lhlo.copy // CHECK-SAME: %[[ARG0]], %[[OUTPUT]] return %value : tensor<2x2xf32> diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc index 436a3e701e1..a12bd9e7c1a 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_hlo_to_lhlo_with_xla.cc @@ -251,17 +251,15 @@ Value LhloDialectEmitter::GetOrCreateView( // Create the view for this slice size, possible with an affine map to model // the offset. The result is cached in the slices_ map. - SmallVector offset_map; - if (slice.offset()) { - offset_map.push_back(AffineMap::get( - /*dimCount=*/1, /*symbolCount=*/0, - {getAffineDimExpr(0, builder_.getContext()) + slice.offset()}, - builder_.getContext())); - } - auto slice_type = MemRefType::get({slice.size()}, i8_type_, offset_map); + // The std.view result type does not carry the static offset: this is not + // useful information. Rather, the view op must have the static offset. 
+ auto slice_type = MemRefType::get({slice.size()}, i8_type_, {}); - auto slice_view = builder_.create( - alloc_buffer.getLoc(), slice_type, alloc_buffer, /*operands=*/llvm::None); + Value byte_shift = + builder_.create(alloc_buffer.getLoc(), slice.offset()); + auto slice_view = + builder_.create(alloc_buffer.getLoc(), slice_type, alloc_buffer, + byte_shift, /*sizes=*/ArrayRef{}); slices_.insert({slice_key, slice_view}); return slice_view; } @@ -277,9 +275,12 @@ StatusOr LhloDialectEmitter::GetOrCreateView( Value slice_view = GetOrCreateView(out_slice); TF_ASSIGN_OR_RETURN(Type out_type, ::xla::ConvertShapeToType( target_shape, builder_)); + Value byte_shift = + builder_.create(builder_.getUnknownLoc(), 0); if (slice_view.getType() != out_type) - slice_view = builder_.create(builder_.getUnknownLoc(), out_type, - slice_view, llvm::None); + slice_view = + builder_.create(builder_.getUnknownLoc(), out_type, slice_view, + byte_shift, /*sizes=*/ArrayRef{}); return slice_view; } From 169ceceaf2ea81650959bb7f3352b7575e98df66 Mon Sep 17 00:00:00 2001 From: Yanhua Sun Date: Tue, 12 May 2020 09:04:18 -0700 Subject: [PATCH 0395/1533] Minor fix an error in error message PiperOrigin-RevId: 311135827 Change-Id: I765bd43c7a3c8bde9d20b25b7f9ebcc77c71c896 --- tensorflow/core/common_runtime/eager/eager_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc index 13630a01ea9..7850978410f 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.cc +++ b/tensorflow/core/common_runtime/eager/eager_executor.cc @@ -98,7 +98,7 @@ const char* EagerExecutor::StateStringLocked() { Status EagerExecutor::SyncExecute(EagerNode* node) { if (Async()) { - return errors::Internal("Executor does not support sync execution"); + return errors::Internal("Executor does not support async execution"); } if (node->AsAsync() != nullptr) { return errors::Internal("Executor does not support executing async nodes"); From adb282e47c7c73813270b082a23e674cf7087885 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 12 May 2020 09:24:51 -0700 Subject: [PATCH 0396/1533] Bump open source llvm revision to 123bee602a260150ff55c74287f583a67ee78f36 PiperOrigin-RevId: 311139944 Change-Id: I31557f69d4c4cea061157fcff411f384dddeef05 --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 36dc0c2b101..fe548fdec05 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "728cf6d86b4f20144ac10517afb0cb978beac124" - LLVM_SHA256 = "41a24cf437be40c8a790b1095e6bfc3a9d531a44275abecddf2eda1835658bcc" + LLVM_COMMIT = "123bee602a260150ff55c74287f583a67ee78f36" + LLVM_SHA256 = "313ec75e47ea3f128724a61b8b6b45b7d305ba2ae57a5084b4bf1f881b4ec8f2" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From cb92c9b87392a373f66d2b662ff6e50d4b57551c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 14 Apr 2020 20:50:12 +0000 Subject: [PATCH 0397/1533] Fix issue in tf.image.extract_glimpse This PR is to re-apply PR 12829. 
While 12829 was merged before, it was reverted at some point; the guess is that some internal tests caused the revert. This PR will try to submit the change again and fix any internal tests that fail. The fix addresses the issue raised in 2134, where `tf.image.extract_glimpse` does not work as expected when `centered=False` and `normalized=False`. This fixes 2134. Signed-off-by: Yong Tang --- tensorflow/core/kernels/eigen_attention.h | 27 ++++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h index c5158e65d8a..7cf5c53dfca 100644 --- a/tensorflow/core/kernels/eigen_attention.h +++ b/tensorflow/core/kernels/eigen_attention.h @@ -101,21 +101,26 @@ struct GlimpseExtractionOp { for (Index i = 0; i < batch_size; ++i) { float x = offsets_[i].first, y = offsets_[i].second; - // Un-normalize coordinates back to pixel space if normalized. if (normalized_) { + // Un-normalize coordinates back to pixel space if normalized. x *= input_width; y *= input_height; + if (centered_) { + // Un-center if coordinates are centered on the image center. + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + // Remove half of the glimpse window. + x -= width_ / 2.0f; + y -= height_ / 2.0f; + } + } else { + if (centered_) { + x += input_width / 2.0f; + y += input_height / 2.0f; + } } - // Un-center if coordinates are centered on the image center. - if (centered_) { - x /= 2.0f; - y /= 2.0f; - x += input_width / 2.0f; - y += input_height / 2.0f; - } - // Remove half of the glimpse window. - x -= width_ / 2.0f; - y -= height_ / 2.0f; const Index offset_x = (Index)x; const Index offset_y = (Index)y; From 3e2bcc33e527a27edf7011bfd11aa395a68cb9e4 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 14 Apr 2020 20:53:51 +0000 Subject: [PATCH 0398/1533] Add test cases for tf.image.extract_glimpse Add test cases for tf.image.extract_glimpse with centered=False and normalized=False Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/attention_ops_test.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py index 87e709fc69e..8799980668a 100644 --- a/tensorflow/python/kernel_tests/attention_ops_test.py +++ b/tensorflow/python/kernel_tests/attention_ops_test.py @@ -236,6 +236,18 @@ class ExtractGlimpseTest(test.TestCase): [0, 0, 0, 0, 0, 0, 0]]), self.evaluate(result2)[0, :, :, 0]) + def testGlimpseNonNormalizedNonCentered(self): + img = constant_op.constant(np.arange(25).reshape((1, 5, 5, 1)), + dtype=dtypes.float32) + with self.test_session(): + result1 = image_ops.extract_glimpse(img, [3, 3], [[0, 0]], + centered=False, normalized=False) + result2 = image_ops.extract_glimpse(img, [3, 3], [[1, 0]], + centered=False, normalized=False) + self.assertAllEqual(np.asarray([[0, 1, 2], [5, 6, 7], [10, 11, 12]]), + result1.eval()[0, :, :, 0]) + self.assertAllEqual(np.asarray([[5, 6, 7], [10, 11, 12], [15, 16, 17]]), + result2.eval()[0, :, :, 0]) if __name__ == '__main__': test.main() From 35efb74fb72efde43122dd41da3dfc93dbf5be18 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 14 Apr 2020 20:58:33 +0000 Subject: [PATCH 0399/1533] Fix test failure caused by API changes in tests Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/attention_ops_test.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git
a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py index 8799980668a..21db05fac2f 100644 --- a/tensorflow/python/kernel_tests/attention_ops_test.py +++ b/tensorflow/python/kernel_tests/attention_ops_test.py @@ -240,14 +240,14 @@ class ExtractGlimpseTest(test.TestCase): img = constant_op.constant(np.arange(25).reshape((1, 5, 5, 1)), dtype=dtypes.float32) with self.test_session(): - result1 = image_ops.extract_glimpse(img, [3, 3], [[0, 0]], - centered=False, normalized=False) - result2 = image_ops.extract_glimpse(img, [3, 3], [[1, 0]], - centered=False, normalized=False) + result1 = image_ops.extract_glimpse_v2(img, [3, 3], [[0, 0]], + centered=False, normalized=False) + result2 = image_ops.extract_glimpse_v2(img, [3, 3], [[1, 0]], + centered=False, normalized=False) self.assertAllEqual(np.asarray([[0, 1, 2], [5, 6, 7], [10, 11, 12]]), - result1.eval()[0, :, :, 0]) + self.evaluate(result1)[0, :, :, 0]) self.assertAllEqual(np.asarray([[5, 6, 7], [10, 11, 12], [15, 16, 17]]), - result2.eval()[0, :, :, 0]) + self.evaluate(result2)[0, :, :, 0]) if __name__ == '__main__': test.main() From 677f75990460f3b68a66651001e25c5bde4aa374 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 14 Apr 2020 21:47:37 +0000 Subject: [PATCH 0400/1533] Fix test failure due to changes of the fix for centered=False and normalized=False Signed-off-by: Yong Tang --- .../python/kernel_tests/attention_ops_test.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py index 21db05fac2f..feec82aa051 100644 --- a/tensorflow/python/kernel_tests/attention_ops_test.py +++ b/tensorflow/python/kernel_tests/attention_ops_test.py @@ -211,28 +211,33 @@ class ExtractGlimpseTest(test.TestCase): # [ 0. 0. 0.] # [ 0. 0. 0.] result1 = image_ops.extract_glimpse_v2( - img, [3, 3], [[-2, 2]], + img, [3, 3], [[-2, -2]], centered=False, normalized=False, noise='zero') self.assertAllEqual( - np.asarray([[0, 0, 0], [0, 0, 0], [0, 0, 0]]), + np.asarray([[0, 0, 0], + [0, 0, 0], + [0, 0, 0]]), self.evaluate(result1)[0, :, :, 0]) # Result 2: + # [ 12. 13. 14. 0. 0. 0. 0.] + # [ 17. 18. 19. 0. 0. 0. 0.] + # [ 22. 23. 24. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 0. 0. 0.] + # [ 0. 0. 0. 0. 0. 0. 0.] # [ 0. 0. 0. 0. 0. 0. 0.] - # [ 0. 0. 1. 2. 3. 4. 0.] - # [ 0. 5. 6. 7. 8. 9. 0.] - # [ 0. 10. 11. 12. 13. 14. 0.] - # [ 0. 15. 16. 17. 18. 19. 0.] - # [ 0. 20. 21. 22. 23. 24. 0.] # [ 0. 0. 0. 0. 0. 0. 0.] 
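      # A rough reading of the updated expectations above: with the corrected
      # kernel, a pixel-space offset determines where the glimpse window
      # starts rather than where the input sits inside the window. The 7x7
      # glimpse at offset [0, 0] (centered=True by default, normalized=False)
      # therefore begins at pixel (2, 2) of the 5x5 input and is zero-padded
      # past the image border, instead of containing the whole input framed
      # by a one-pixel zero border as the old expected values showed.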
result2 = image_ops.extract_glimpse_v2( img, [7, 7], [[0, 0]], normalized=False, noise='zero') self.assertAllEqual( - np.asarray([[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 4, 0], - [0, 5, 6, 7, 8, 9, 0], [0, 10, 11, 12, 13, 14, 0], - [0, 15, 16, 17, 18, 19, 0], [0, 20, 21, 22, 23, 24, 0], + np.asarray([[12, 13, 14, 0, 0, 0, 0], + [17, 18, 19, 0, 0, 0, 0], + [22, 23, 24, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0]]), self.evaluate(result2)[0, :, :, 0]) From 3fc74213ba34f5748be1c3ac3f9199b225d10b64 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 15 Apr 2020 15:25:42 +0000 Subject: [PATCH 0401/1533] Fix incorrect doc test Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index e6a5cdbf4e8..c84c9e701c4 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4148,10 +4148,10 @@ def extract_glimpse_v2( >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], ... centered=False, normalized=False) + array([[[[4.], + [5.]], + [[7.], + [8.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape From 353d22eb433b1494b6bafbfde126bd999499a79e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 15 Apr 2020 15:26:42 +0000 Subject: [PATCH 0402/1533] Fix incorrect doc example with centered=False Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index c84c9e701c4..bd0722f32f9 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4063,10 +4063,10 @@ def extract_glimpse( >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], ... centered=False, normalized=False) + array([[[[4.], + [5.]], + [[7.], + [8.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape From 756b7ed2d65843d52c8e02ca6350fd51fb638a55 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 27 Apr 2020 17:09:52 +0000 Subject: [PATCH 0403/1533] Use ExtractGlimpseV2 and ExtractGlimpse to make sure C++ kernel is backward compatible Signed-off-by: Yong Tang --- tensorflow/core/kernels/attention_ops.cc | 8 +++- tensorflow/core/kernels/eigen_attention.h | 50 ++++++++++++++++------- tensorflow/core/ops/image_ops.cc | 35 ++++++++++++++++ tensorflow/python/ops/image_ops_impl.py | 10 ++--- 4 files changed, 82 insertions(+), 21 deletions(-) diff --git a/tensorflow/core/kernels/attention_ops.cc b/tensorflow/core/kernels/attention_ops.cc index f555c0fd679..6e5e07a9fb1 100644 --- a/tensorflow/core/kernels/attention_ops.cc +++ b/tensorflow/core/kernels/attention_ops.cc @@ -32,6 +32,8 @@ namespace tensorflow { class ExtractGlimpseOp : public OpKernel { public: explicit ExtractGlimpseOp(OpKernelConstruction* context) : OpKernel(context) { + const string& op = context->def().op(); + version_ = (op == "ExtractGlimpse") ? 1 : 2; OP_REQUIRES_OK(context, context->GetAttr("normalized", &normalized_)); OP_REQUIRES_OK(context, context->GetAttr("centered", ¢ered_)); bool uniform_noise = false; @@ -117,21 +119,23 @@ class ExtractGlimpseOp : public OpKernel { // calling TensorFlow operates with (y,x) as indices. 
offset_vec.push_back(Eigen::IndexPair(offset_x, offset_y)); } - output->tensor().swap_layout().device( context->eigen_cpu_device()) = Eigen::ExtractGlimpses(input.tensor().swap_layout(), output_width, output_height, offset_vec, - normalized_, centered_, noise_); + normalized_, centered_, noise_, version_); } private: bool normalized_; bool centered_; Eigen::ExtractGlimpsesNoiseMode noise_; + int32 version_; }; REGISTER_KERNEL_BUILDER(Name("ExtractGlimpse").Device(DEVICE_CPU), ExtractGlimpseOp); +REGISTER_KERNEL_BUILDER(Name("ExtractGlimpseV2").Device(DEVICE_CPU), + ExtractGlimpseOp); } // end namespace tensorflow diff --git a/tensorflow/core/kernels/eigen_attention.h b/tensorflow/core/kernels/eigen_attention.h index 7cf5c53dfca..ca61e223c21 100644 --- a/tensorflow/core/kernels/eigen_attention.h +++ b/tensorflow/core/kernels/eigen_attention.h @@ -56,13 +56,15 @@ struct GlimpseExtractionOp { GlimpseExtractionOp(const Index width, const Index height, const std::vector >& offsets, const bool normalized, const bool centered, - const ExtractGlimpsesNoiseMode noise) + const ExtractGlimpsesNoiseMode noise, + const int version) : width_(width), height_(height), offsets_(offsets), normalized_(normalized), centered_(centered), - noise_(noise) {} + noise_(noise), + version_(version) {} template DSizes dimensions(const Input& input) const { @@ -101,24 +103,42 @@ struct GlimpseExtractionOp { for (Index i = 0; i < batch_size; ++i) { float x = offsets_[i].first, y = offsets_[i].second; - if (normalized_) { + if (version_ == 1) { // Un-normalize coordinates back to pixel space if normalized. - x *= input_width; - y *= input_height; + if (normalized_) { + x *= input_width; + y *= input_height; + } + // Un-center if coordinates are centered on the image center. if (centered_) { - // Un-center if coordinates are centered on the image center. x /= 2.0f; y /= 2.0f; x += input_width / 2.0f; y += input_height / 2.0f; - // Remove half of the glimpse window. - x -= width_ / 2.0f; - y -= height_ / 2.0f; } + // Remove half of the glimpse window. + x -= width_ / 2.0f; + y -= height_ / 2.0f; } else { - if (centered_) { - x += input_width / 2.0f; - y += input_height / 2.0f; + if (normalized_) { + // Un-normalize coordinates back to pixel space if normalized. + x *= input_width; + y *= input_height; + if (centered_) { + // Un-center if coordinates are centered on the image center. + x /= 2.0f; + y /= 2.0f; + x += input_width / 2.0f; + y += input_height / 2.0f; + // Remove half of the glimpse window. 
+ x -= width_ / 2.0f; + y -= height_ / 2.0f; + } + } else { + if (centered_) { + x += input_width / 2.0f; + y += input_height / 2.0f; + } } } @@ -248,6 +268,7 @@ struct GlimpseExtractionOp { const bool normalized_; const bool centered_; const ExtractGlimpsesNoiseMode noise_; + const int version_; }; } // namespace @@ -260,7 +281,8 @@ ExtractGlimpses( const typename internal::traits::Index height, const std::vector >& offsets, const bool normalized = true, const bool centered = true, - const ExtractGlimpsesNoiseMode noise = ExtractGlimpsesNoiseMode::UNIFORM) { + const ExtractGlimpsesNoiseMode noise = ExtractGlimpsesNoiseMode::UNIFORM, + const int version = 2) { EIGEN_STATIC_ASSERT(internal::traits::Layout == ColMajor, YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT(internal::traits::NumDimensions == 4, @@ -268,7 +290,7 @@ ExtractGlimpses( typedef typename internal::traits::Index Index; const GlimpseExtractionOp op(width, height, offsets, normalized, - centered, noise); + centered, noise, version); return input.customOp(op); } diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc index 418f1e20e37..e11f14b8538 100644 --- a/tensorflow/core/ops/image_ops.cc +++ b/tensorflow/core/ops/image_ops.cc @@ -756,6 +756,41 @@ REGISTER_OP("ExtractGlimpse") c->Dim(input, 3)); }); +REGISTER_OP("ExtractGlimpseV2") + .Input("input: float") + .Input("size: int32") + .Input("offsets: float") + .Output("glimpse: float") + .Attr("centered: bool = true") + .Attr("normalized: bool = true") + .Attr("uniform_noise: bool = true") + .Attr("noise: string = 'uniform'") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input)); + ShapeHandle offsets; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &offsets)); + + DimensionHandle batch_dim; + TF_RETURN_IF_ERROR( + c->Merge(c->Dim(input, 0), c->Dim(offsets, 0), &batch_dim)); + DimensionHandle unused; + TF_RETURN_IF_ERROR(c->WithValue(c->Dim(offsets, 1), 2, &unused)); + + bool uniform_noise = false; + TF_RETURN_IF_ERROR(c->GetAttr("uniform_noise", &uniform_noise)); + string noise; + TF_RETURN_IF_ERROR(c->GetAttr("noise", &noise)); + if (uniform_noise && (!noise.empty() && noise != "uniform")) { + return errors::InvalidArgument( + "The uniform_noise and noise should not be specified at the same " + "time"); + } + + return SetOutputToSizedImage(c, batch_dim, 1 /* size_input_idx */, + c->Dim(input, 3)); + }); + // -------------------------------------------------------------------------- REGISTER_OP("CropAndResize") diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index bd0722f32f9..49f44872ebf 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4063,10 +4063,10 @@ def extract_glimpse( >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], ... centered=False, normalized=False) + array([[[[0.], + [1.]], + [[3.], + [4.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape @@ -4176,7 +4176,7 @@ def extract_glimpse_v2( Returns: A `Tensor` of type `float32`. 
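    A further worked example under the corrected semantics (a sketch that
    follows from the pixel-offset behaviour of the docstring examples updated
    earlier in this series, not an executed doctest): with the same 3x3
    input, `size=(2, 2)`, `offsets=[[0, 0]]`, `centered=False` and
    `normalized=False`, the glimpse covers the top-left 2x2 window of the
    input, i.e. the values 0., 1., 3. and 4.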
""" - return gen_image_ops.extract_glimpse( + return gen_image_ops.extract_glimpse_v2( input=input, size=size, offsets=offsets, From 9b84edeb4f866f137073f04f1e10296d19ef9e76 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 27 Apr 2020 17:11:04 +0000 Subject: [PATCH 0404/1533] Expand test case to cover both old kernel (ExtractGlimpse) and new kernel (ExtractGlimpseV2) Signed-off-by: Yong Tang --- .../python/kernel_tests/attention_ops_test.py | 48 +++++++++++++++++++ tensorflow/python/ops/image_ops_impl.py | 10 ++-- 2 files changed, 53 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py index feec82aa051..80e2a816834 100644 --- a/tensorflow/python/kernel_tests/attention_ops_test.py +++ b/tensorflow/python/kernel_tests/attention_ops_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_image_ops from tensorflow.python.ops import image_ops from tensorflow.python.platform import test @@ -196,6 +197,53 @@ class ExtractGlimpseTest(test.TestCase): expected_rows=[None, None, None, 1, 2, 3, 4], expected_cols=[56, 57, 58, 59, 60]) + def testGlimpseNoiseZeroV1Compatible(self): + # Note: The old versions of extract_glimpse was incorrect in implementation. + # This test is for compatibility so that graph save in old versions behave + # the same. Notice the API uses gen_image_ops.extract_glimpse() on purpose. + # + # Image: + # [ 0. 1. 2. 3. 4.] + # [ 5. 6. 7. 8. 9.] + # [ 10. 11. 12. 13. 14.] + # [ 15. 16. 17. 18. 19.] + # [ 20. 21. 22. 23. 24.] + img = constant_op.constant( + np.arange(25).reshape((1, 5, 5, 1)), dtype=dtypes.float32) + with self.test_session(): + # Result 1: + # [ 0. 0. 0.] + # [ 0. 0. 0.] + # [ 0. 0. 0.] + result1 = gen_image_ops.extract_glimpse( + img, [3, 3], [[-2, 2]], + centered=False, + normalized=False, + noise='zero', + uniform_noise=False) + self.assertAllEqual( + np.asarray([[0, 0, 0], [0, 0, 0], [0, 0, 0]]), + self.evaluate(result1)[0, :, :, 0]) + + # Result 2: + # [ 0. 0. 0. 0. 0. 0. 0.] + # [ 0. 0. 1. 2. 3. 4. 0.] + # [ 0. 5. 6. 7. 8. 9. 0.] + # [ 0. 10. 11. 12. 13. 14. 0.] + # [ 0. 15. 16. 17. 18. 19. 0.] + # [ 0. 20. 21. 22. 23. 24. 0.] + # [ 0. 0. 0. 0. 0. 0. 0.] + result2 = gen_image_ops.extract_glimpse( + img, [7, 7], [[0, 0]], normalized=False, noise='zero', + uniform_noise=False) + self.assertAllEqual( + np.asarray([[0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 3, 4, 0], + [0, 5, 6, 7, 8, 9, 0], [0, 10, 11, 12, 13, 14, 0], + [0, 15, 16, 17, 18, 19, 0], [0, 20, 21, 22, 23, 24, 0], + [0, 0, 0, 0, 0, 0, 0]]), + self.evaluate(result2)[0, :, :, 0]) + + def testGlimpseNoiseZero(self): # Image: # [ 0. 1. 2. 3. 4.] diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 49f44872ebf..e86dee798a8 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4063,10 +4063,10 @@ def extract_glimpse( >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], ... centered=False, normalized=False) + array([[[[4.], + [5.]], + [[7.], + [8.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape @@ -4091,7 +4091,7 @@ def extract_glimpse( Returns: A `Tensor` of type `float32`. 
""" - return gen_image_ops.extract_glimpse( + return gen_image_ops.extract_glimpse_v2( input=input, size=size, offsets=offsets, From 8e3bc844b1a081def879f563d49fee82e3a819ae Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Tue, 12 May 2020 09:31:06 -0700 Subject: [PATCH 0405/1533] Add support for a device ID op in parallel_device The op doesn't really make sense to register kernels for, so I'm not registering it anywhere by default yet; it's currently just registered in the parallel device tests. PiperOrigin-RevId: 311141160 Change-Id: Iff1839112dac6fe3406e4b31f0e6f7239809a5bb --- tensorflow/c/eager/parallel_device/BUILD | 17 ++++++ .../eager/parallel_device/parallel_device.cc | 51 ++++++++++++++++ .../parallel_device/parallel_device_ops.cc | 26 ++++++++ .../parallel_device/parallel_device_test.cc | 59 ++++++++++++++----- .../python/distribute/parallel_device/BUILD | 23 ++++++++ .../parallel_device/parallel_device.py | 20 +++++++ .../parallel_device/parallel_device_test.py | 6 ++ 7 files changed, 186 insertions(+), 16 deletions(-) create mode 100644 tensorflow/c/eager/parallel_device/parallel_device_ops.cc diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index 92947be79f8..3b2640e14d1 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -44,6 +44,7 @@ tf_cc_test( srcs = ["parallel_device_test.cc"], deps = [ ":parallel_device", + ":parallel_device_ops", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", "//tensorflow/c/eager:c_api", @@ -53,3 +54,19 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +# Note: ParallelDevice-specific ops are experimental and not currently linked in +# to TensorFlow by default, just used in a few tests. +filegroup( + name = "parallel_device_ops_srcs", + srcs = ["parallel_device_ops.cc"], + visibility = ["//tensorflow/python/distribute/parallel_device:__pkg__"], +) + +cc_library( + name = "parallel_device_ops", + srcs = [":parallel_device_ops_srcs"], + visibility = ["//tensorflow:internal"], + deps = ["//tensorflow/core:framework"], + alwayslink = 1, +) diff --git a/tensorflow/c/eager/parallel_device/parallel_device.cc b/tensorflow/c/eager/parallel_device/parallel_device.cc index e6846809fcf..27c2699c4c2 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device.cc @@ -92,6 +92,10 @@ class ParallelDevice { TFE_TensorHandle* tensor, TF_Status* status) const; + // A parallel tensor with scalar integers numbering component devices. + std::unique_ptr DeviceIDs(TFE_Context* context, + TF_Status* status) const; + // Takes a description of a single operation being executed on the // ParallelDevice, and in turn runs one operation per component device with // its corresponding inputs from the input ParallelTensors (or @@ -208,6 +212,46 @@ std::unique_ptr ParallelDevice::CopyToParallelDevice( status); } +std::unique_ptr ParallelDevice::DeviceIDs( + TFE_Context* context, TF_Status* status) const { + // TODO(allenl): We could cache DeviceIDs (keyed by context). 
+ std::vector components; + components.reserve(underlying_devices_.size()); + for (int device_index = 0; device_index < underlying_devices_.size(); + ++device_index) { + int64_t* device_id = new int64_t; + *device_id = device_index; + std::unique_ptr tensor( + TF_NewTensor( + TF_INT64, /*dims=*/nullptr, /*num_dims=*/0, device_id, + sizeof(int64_t), + [](void* data, size_t, void* arg) { + delete reinterpret_cast(data); + }, + nullptr), + TF_DeleteTensor); + // TODO(allenl): Here and when executing regular operations, we could hold + // on to one TFE_Op per device and just call TFE_ResetOp to avoid parsing + // device names repeatedly. + OpPtr const_op(TFE_NewOp(context, "Const", status)); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetDevice(const_op.get(), underlying_devices_[device_index].c_str(), + status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrTensor(const_op.get(), "value", tensor.get(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(const_op.get(), "dtype", TF_INT64); + TFE_TensorHandle* device_handle; + int num_outputs = 1; + TFE_Execute(const_op.get(), &device_handle, &num_outputs, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + components.emplace_back(device_handle); + if (TF_GetCode(status) != TF_OK) return nullptr; + } + return ParallelTensor::FromTensorHandles(*this, std::move(components), + status); +} + absl::optional> ParallelDevice::Execute( TFE_Context* context, std::vector inputs, const char* operation_name, const TFE_OpAttrs* attributes, @@ -282,6 +326,13 @@ absl::optional> ParallelDevice::Execute( } result.emplace(std::move(outputs)); return result; + } else if (operation_name == std::string("DeviceID")) { + std::vector result_content; + result_content.reserve(1); + result_content.push_back(DeviceIDs(context, status)); + if (TF_GetCode(status) != TF_OK) return result; + result.emplace(std::move(result_content)); + return result; } absl::optional>> maybe_parallel_results( diff --git a/tensorflow/c/eager/parallel_device/parallel_device_ops.cc b/tensorflow/c/eager/parallel_device/parallel_device_ops.cc new file mode 100644 index 00000000000..1decffca047 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_ops.cc @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" + +// TODO(allenl): Figure out if we need this op, and if so whether we should move +// it to core TF. Right now the eager C API does some checking of op +// registrations before calling into custom devices, but we may be able to avoid +// that. 
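// A rough sketch of how this op surfaces in Python (based on the
// parallel_device.py and parallel_device_test.py changes later in this
// patch): the parallel device executes DeviceID as one scalar int64 constant
// per component device, so unpacking `device.device_ids` on a two-component
// device yields tensors 0 and 1, each placed on its underlying device.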
+REGISTER_OP("DeviceID") + .Output("device_id: int64") + .SetIsStateful() + .SetShapeFn(tensorflow::shape_inference::ScalarShape); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_test.cc index 9b0613b0391..fdc140407df 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_test.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_test.cc @@ -278,14 +278,15 @@ TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first, } // Assert that `handle` is equal to `expected_value`. -void AssertScalarFloatEq(TFE_TensorHandle* handle, float expected_value) { +template +void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); std::unique_ptr value_zero( TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - ASSERT_EQ(expected_value, - *static_cast(TF_TensorData(value_zero.get()))); + EXPECT_EQ(expected_value, + *static_cast(TF_TensorData(value_zero.get()))); } template @@ -343,8 +344,8 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, ExtractPerDeviceValues(context, read.get(), &components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(components[0].get(), 20.); - AssertScalarFloatEq(components[1].get(), 20.); + ExpectScalarEq(components[0].get(), 20.); + ExpectScalarEq(components[1].get(), 20.); std::string first_device = TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); @@ -373,8 +374,8 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, ExtractPerDeviceValues(context, read.get(), &components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(components[0].get(), 23.); - AssertScalarFloatEq(components[1].get(), 18.); + ExpectScalarEq(components[0].get(), 23.); + ExpectScalarEq(components[1].get(), 18.); std::string first_device = TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); @@ -383,6 +384,32 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); ASSERT_EQ(underlying_devices[1], second_device); } + // Compute the device ID twice and verify the result + for (int i = 0; i < 2; ++i) { + std::unique_ptr op( + TFE_NewOp(context, "DeviceID", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetDevice(op.get(), device_name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + TFE_TensorHandle* result_handle; + int num_retvals = 1; + TFE_Execute(op.get(), &result_handle, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + std::array components; + ExtractPerDeviceValues(context, result_handle, &components, status.get()); + TFE_DeleteTensorHandle(result_handle); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + ExpectScalarEq(components[0].get(), 0); + ExpectScalarEq(components[1].get(), 1); + std::string first_device = + TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); + ASSERT_EQ(underlying_devices[0], first_device); + std::string second_device = + TFE_TensorHandleBackingDeviceName(components[1].get(), 
status.get()); + ASSERT_EQ(underlying_devices[1], second_device); + } } TEST(PARALLEL_DEVICE, TestBasicCPU) { @@ -498,8 +525,8 @@ TEST(PARALLEL_DEVICE, TestExplicitCopies) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // The value of the original tensor is replicated on each device. - AssertScalarFloatEq(components[0].get(), 3.); - AssertScalarFloatEq(components[1].get(), 3.); + ExpectScalarEq(components[0].get(), 3.); + ExpectScalarEq(components[1].get(), 3.); // Verify that the mirrors are placed on the component devices. std::string first_device = @@ -630,7 +657,7 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { &second_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(second_components[1].get(), 9.); + ExpectScalarEq(second_components[1].get(), 9.); // Verify that the mirrors are placed on the component devices. std::string first_device = TFE_TensorHandleBackingDeviceName( @@ -644,8 +671,8 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { std::array first_components; ExtractPerDeviceValues(context.get(), second_components[0].get(), &first_components, status.get()); - AssertScalarFloatEq(first_components[0].get(), 3.); - AssertScalarFloatEq(first_components[1].get(), 6.); + ExpectScalarEq(first_components[0].get(), 3.); + ExpectScalarEq(first_components[1].get(), 6.); first_device = TFE_TensorHandleBackingDeviceName(first_components[0].get(), status.get()); @@ -806,8 +833,8 @@ TEST(PARALLEL_DEVICE, TestCollective) { ExtractPerDeviceValues(context.get(), reduced.get(), &result_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(result_components[0].get(), 3.); - AssertScalarFloatEq(result_components[1].get(), 3.); + ExpectScalarEq(result_components[0].get(), 3.); + ExpectScalarEq(result_components[1].get(), 3.); } void RegisterCollectiveMulFunction(TFE_Context* context, @@ -909,8 +936,8 @@ TEST(PARALLEL_DEVICE, TestFunction) { ExtractPerDeviceValues(context.get(), reduced.get(), &result_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(result_components[0].get(), 7. * 9.); - AssertScalarFloatEq(result_components[1].get(), 7. * 9.); + ExpectScalarEq(result_components[0].get(), 7. * 9.); + ExpectScalarEq(result_components[1].get(), 7. 
* 9.); std::string first_device = TFE_TensorHandleBackingDeviceName( result_components[0].get(), status.get()); diff --git a/tensorflow/python/distribute/parallel_device/BUILD b/tensorflow/python/distribute/parallel_device/BUILD index 43c6029f3d2..930816d4407 100644 --- a/tensorflow/python/distribute/parallel_device/BUILD +++ b/tensorflow/python/distribute/parallel_device/BUILD @@ -1,3 +1,6 @@ +load("//tensorflow:tensorflow.bzl", "tf_custom_op_library", "tf_gen_op_wrapper_py") +load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") + package( default_visibility = ["//tensorflow:internal"], licenses = ["notice"], # Apache 2.0 @@ -14,6 +17,7 @@ py_library( srcs = ["parallel_device.py"], srcs_version = "PY2AND3", deps = [ + ":parallel_device_ops", ":saving", "//tensorflow/python:_pywrap_parallel_device", ], @@ -26,6 +30,25 @@ py_library( deps = ["//tensorflow/python:framework_ops"], ) +tf_gen_op_wrapper_py( + name = "parallel_device_ops_py", + out = "gen_parallel_device_ops.py", + deps = ["//tensorflow/c/eager/parallel_device:parallel_device_ops"], +) + +tf_custom_op_library( + name = "_parallel_device_ops.so", + srcs = ["//tensorflow/c/eager/parallel_device:parallel_device_ops_srcs"], +) + +tf_custom_op_py_library( + name = "parallel_device_ops", + dso = [":_parallel_device_ops.so"], + kernels = ["//tensorflow/c/eager/parallel_device:parallel_device_ops"], + visibility = ["//tensorflow:internal"], + deps = [":parallel_device_ops_py"], +) + py_test( name = "parallel_device_test", srcs = ["parallel_device_test.py"], diff --git a/tensorflow/python/distribute/parallel_device/parallel_device.py b/tensorflow/python/distribute/parallel_device/parallel_device.py index 982b061cdb7..2dbdc653a64 100644 --- a/tensorflow/python/distribute/parallel_device/parallel_device.py +++ b/tensorflow/python/distribute/parallel_device/parallel_device.py @@ -22,11 +22,17 @@ import contextlib import threading from tensorflow.python import _pywrap_parallel_device +from tensorflow.python.distribute.parallel_device import gen_parallel_device_ops from tensorflow.python.distribute.parallel_device import saving from tensorflow.python.eager import context +from tensorflow.python.framework import load_library from tensorflow.python.framework import ops +from tensorflow.python.platform import resource_loader from tensorflow.python.tpu.ops import tpu_ops +load_library.load_op_library( + resource_loader.get_path_to_datafile("_parallel_device_ops.so")) + _next_device_number = 0 _next_device_number_lock = threading.Lock() @@ -58,6 +64,8 @@ class ParallelDevice(object): device, device_info = _pywrap_parallel_device.GetParallelDeviceCapsules( self.name, self.components) context.register_custom_device(device, self.name, device_info) + with ops.device(self.name): + self._device_ids = gen_parallel_device_ops.device_id() def pack(self, tensors): """Create a tensor on the parallel device from a sequence of tensors. @@ -84,6 +92,18 @@ class ParallelDevice(object): return tpu_ops.tpu_replicated_output( parallel_tensor, num_replicas=len(self.components)) + @property + def device_ids(self): + """A parallel tensor with scalar integers numbering component devices. + + Each device ID is placed on its corresponding device, in the same order as + the `components` constructor argument. + + Returns: + A parallel tensor containing 0 on the first device, 1 on the second, etc. + """ + return self._device_ids + # TODO(allenl): Fixing saving in Python is a bit odd. 
One alternative would be # to provide a hook for the custom device to create save specs/etc., then call # that hook from the default variable implementation if the variable is on a diff --git a/tensorflow/python/distribute/parallel_device/parallel_device_test.py b/tensorflow/python/distribute/parallel_device/parallel_device_test.py index d3f3417eca9..e35eb601cc5 100644 --- a/tensorflow/python/distribute/parallel_device/parallel_device_test.py +++ b/tensorflow/python/distribute/parallel_device/parallel_device_test.py @@ -119,6 +119,12 @@ class ParallelDeviceTests(_VirtualDeviceTestCase): self.assertIn(self.device.components[0], outputs[0].backing_device) self.assertIn(self.device.components[1], outputs[1].backing_device) + def test_device_id(self): + device_ids = self.device.unpack(self.device.device_ids) + self.assertAllClose([0, 1], device_ids) + self.assertIn(self.device.components[0], device_ids[0].backing_device) + self.assertIn(self.device.components[1], device_ids[1].backing_device) + def test_collective_reduce(self): with ops.device(self.device.name): x = self.device.pack( From 3116ec3708443de4360c631f62a23b26eccd6763 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 09:35:57 -0700 Subject: [PATCH 0406/1533] Resolve trivial aliases for portable TensorFlow targets. PiperOrigin-RevId: 311142154 Change-Id: I702bddcc4b6dfb69d9a8747770fc88826603b1aa --- tensorflow/lite/delegates/flex/BUILD | 14 +++++++------- tensorflow/lite/testing/BUILD | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 9fe80605e39..d69d2207e63 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -26,7 +26,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/c:c_api_internal", @@ -66,7 +66,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/core:tensorflow", @@ -103,7 +103,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:lib", @@ -137,7 +137,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core/common_runtime/eager:context", @@ -183,7 +183,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core/common_runtime/eager:context", @@ -211,7 +211,7 @@ tf_cc_test( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/core:tensorflow", @@ -245,7 +245,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + 
"//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/c:c_api_internal", diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD index 9d50f1ad604..df85f659bf3 100644 --- a/tensorflow/lite/testing/BUILD +++ b/tensorflow/lite/testing/BUILD @@ -329,7 +329,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) @@ -368,7 +368,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) @@ -408,7 +408,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) @@ -443,7 +443,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) From f4628678066c72309d3fd121af1aaf54d9905ca3 Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Tue, 12 May 2020 09:46:28 -0700 Subject: [PATCH 0407/1533] [XLA:Python] Make sure xla_client is always imported before TPU client extension. PiperOrigin-RevId: 311144400 Change-Id: Ia499185c36b5596b7aa25c44e51fd07696f85cfe --- tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py index ef0caff0ae6..6d4482af43f 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py @@ -20,6 +20,9 @@ from __future__ import print_function from absl import logging +# Import xla_client to load shared C++ extensions (just CompileOptions at the +# time of writing). +from tensorflow.compiler.xla.python import xla_client # pylint: disable=unused-import from tensorflow.compiler.xla.python.tpu_driver.client import tpu_client_extension as _tpu_client From d2bc2b66a3a0e373d3a6ecf56d45955ae9375591 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 09:46:51 -0700 Subject: [PATCH 0408/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311144477 Change-Id: Iaa231e5c7e87d6e930b37003675adb307dad79b4 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 8c80414bacb3aaf5327b60d8538274e3d8cc7a7c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 27 Apr 2020 17:43:44 +0000 Subject: [PATCH 0409/1533] Add api_def_ExtractGlimpseV2.pbtxt Signed-off-by: Yong Tang --- .../base_api/api_def_ExtractGlimpseV2.pbtxt | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt new file mode 100644 index 00000000000..160b864a007 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt @@ -0,0 +1,85 @@ +op { + graph_op_name: "ExtractGlimpseV2" + in_arg { + name: "input" + description: < Date: Mon, 27 Apr 2020 23:08:17 +0000 Subject: [PATCH 0410/1533] Update API golden Signed-off-by: Yong Tang --- .../core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt | 1 + tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt | 4 ++++ tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt index 160b864a007..aeb87346ab2 100644 --- a/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_ExtractGlimpseV2.pbtxt @@ -1,5 +1,6 @@ op { graph_op_name: "ExtractGlimpseV2" + visibility: HIDDEN in_arg { name: "input" description: < Date: Tue, 28 Apr 2020 19:22:49 +0000 Subject: [PATCH 0411/1533] Reroute tf.compat.v1.extract_glimpse to use gen_image_ops.extract_glimpse (old API) This fix reroute tf.compat.v1.extract_glimpse to use gen_image_ops.extract_glimpse, so that the behavior of TF 1.x remains the same. Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index e86dee798a8..a86d3af2492 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4091,7 +4091,7 @@ def extract_glimpse( Returns: A `Tensor` of type `float32`. """ - return gen_image_ops.extract_glimpse_v2( + return gen_image_ops.extract_glimpse( input=input, size=size, offsets=offsets, From 960bbc2d1bb95efd65177fdbdd70a63781eecfab Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 29 Apr 2020 20:13:46 +0000 Subject: [PATCH 0412/1533] Update RELEADE.md to capture the breaking change of `tf.image.extract_glimpse` Signed-off-by: Yong Tang --- RELEASE.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 6c8921cf492..673d854d1b9 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,8 @@ +# Release 2.3.0 + +## Breaking Changes +* `tf.image.extract_glimpse` has been updated to correctly process the case where `centered=False` and `normalized=False`. This is a breaking change as the output is different from (incorrect) previous versions. Note this breaking change only impacts `tf.image.extract_glimpse` and `tf.compat.v2.image.extract_glimpse` API endpoints. The behavior of `tf.compat.v1.image.extract_glimpse` does not change. 
The behavior of the existing C++ kernel `ExtractGlimpse` does not change either, so saved models will not be impacted. + # Release 2.2.0 TensorFlow 2.2 discontinues support for Python 2, [previously announced](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ) as following [Python 2's EOL on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). From c00af599966359e4e0090cfd5191441354052068 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 6 May 2020 22:19:40 +0000 Subject: [PATCH 0413/1533] Update doc example of v1 to keep old behavior with usage of tf.compat.v1.image.extract_glimpse Signed-off-by: Yong Tang --- tensorflow/python/ops/image_ops_impl.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index a86d3af2492..633725da511 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -4060,13 +4060,13 @@ def extract_glimpse( ... [[6.0], ... [7.0], ... [8.0]]]] - >>> tf.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], - ... centered=False, normalized=False) + >>> tf.compat.v1.image.extract_glimpse(x, size=(2, 2), offsets=[[1, 1]], + ... centered=False, normalized=False) + array([[[[0.], + [1.]], + [[3.], + [4.]]]], dtype=float32)> Args: input: A `Tensor` of type `float32`. A 4-D float tensor of shape From 2ffde8a33949bdf3209d58729f7c56045a621deb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 10:07:20 -0700 Subject: [PATCH 0414/1533] This breaks multi-python: The local gen_build_info rule calls into find_cuda_config, which only works in the remote image. This is additionally brittle: relying on TF_CUDA_VERSION being an action_env is poisoning our caches, and running find_cuda_config multiple times is bug-prone. I think the better way to do this is to put the information from the repo_rule into a file template as part of the repo rule configuration (cuda_configure.bzl). Then we can just include that file, instead of trying to do that as part of the action.
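For callers of the generated build-info module, the practical effect of this revert is sketched below (based on the diffs that follow; the dictionary form is what the reverted change had introduced, and the public `tf.sysconfig.get_build_info()` wrapper that exposed it is removed again as well):

    from tensorflow.python.platform import build_info

    # Attribute-style access, restored by this change:
    print(build_info.is_cuda_build, build_info.is_rocm_build)

    # Dictionary-style access, introduced by the reverted change and removed here:
    #   build_info.build_info['is_cuda_build']
    #   build_info.build_info['is_rocm_build']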
PiperOrigin-RevId: 311148754 Change-Id: I80daa8652a85b2a1897c15117e6422bfd21cee6a --- tensorflow/python/BUILD | 19 ---- .../python/keras/layers/recurrent_v2.py | 4 +- tensorflow/python/platform/build_info_test.py | 6 +- tensorflow/python/platform/self_check.py | 5 +- tensorflow/python/platform/sysconfig.py | 28 ----- tensorflow/python/platform/sysconfig_test.py | 38 ------- tensorflow/tensorflow.bzl | 25 ++--- .../api/golden/v1/tensorflow.sysconfig.pbtxt | 4 - .../api/golden/v2/tensorflow.sysconfig.pbtxt | 4 - tensorflow/tools/build_info/BUILD | 1 - tensorflow/tools/build_info/gen_build_info.py | 101 +++++++++--------- tensorflow/tools/pip_package/setup.py | 54 +++------- third_party/gpus/BUILD | 6 -- 13 files changed, 85 insertions(+), 210 deletions(-) delete mode 100644 tensorflow/python/platform/sysconfig_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index bf17c828d66..0b046ea8d61 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -264,7 +264,6 @@ py_library( deps = [ ":_pywrap_util_port", ":lib", - ":platform_build_info", ":pywrap_tfe", ":util", "//tensorflow/core:protos_all_py", @@ -329,24 +328,6 @@ tf_py_test( ], ) -tf_py_test( - name = "sysconfig_test", - size = "small", - srcs = ["platform/sysconfig_test.py"], - data = [ - "platform/sysconfig.py", - ], - python_version = "PY3", - tags = [ - "no_pip", - "no_windows", - ], - deps = [ - ":platform", - ":platform_test", - ], -) - tf_py_test( name = "flags_test", size = "small", diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py index 9605c296885..a9d5ef8587c 100644 --- a/tensorflow/python/keras/layers/recurrent_v2.py +++ b/tensorflow/python/keras/layers/recurrent_v2.py @@ -601,7 +601,7 @@ def gpu_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major, # (6 * units) bias = array_ops.split(K.flatten(bias), 6) - if build_info.build_info['is_cuda_build']: + if build_info.is_cuda_build: # Note that the gate order for CuDNN is different from the canonical format. # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need # to be done for kernel, recurrent_kernel, input_bias, recurrent_bias. @@ -1361,7 +1361,7 @@ def gpu_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, # so that mathematically it is same as the canonical LSTM implementation. 
full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0) - if build_info.build_info['is_rocm_build']: + if build_info.is_rocm_build: # ROCm MIOpen's weight sequence for LSTM is different from both canonical # and Cudnn format # MIOpen: [i, f, o, c] Cudnn/Canonical: [i, f, c, o] diff --git a/tensorflow/python/platform/build_info_test.py b/tensorflow/python/platform/build_info_test.py index be253885715..f0df0b756cc 100644 --- a/tensorflow/python/platform/build_info_test.py +++ b/tensorflow/python/platform/build_info_test.py @@ -25,10 +25,8 @@ from tensorflow.python.platform import test class BuildInfoTest(test.TestCase): def testBuildInfo(self): - self.assertEqual(build_info.build_info['is_rocm_build'], - test.is_built_with_rocm()) - self.assertEqual(build_info.build_info['is_cuda_build'], - test.is_built_with_cuda()) + self.assertEqual(build_info.is_rocm_build, test.is_built_with_rocm()) + self.assertEqual(build_info.is_cuda_build, test.is_built_with_cuda()) if __name__ == '__main__': diff --git a/tensorflow/python/platform/self_check.py b/tensorflow/python/platform/self_check.py index c10c4108c7d..f6cf7705e13 100644 --- a/tensorflow/python/platform/self_check.py +++ b/tensorflow/python/platform/self_check.py @@ -20,7 +20,6 @@ from __future__ import print_function import os -MSVCP_DLL_NAMES = "msvcp_dll_names" try: from tensorflow.python.platform import build_info @@ -43,9 +42,9 @@ def preload_check(): # we load the Python extension, so that we can raise an actionable error # message if they are not found. import ctypes # pylint: disable=g-import-not-at-top - if MSVCP_DLL_NAMES in build_info.build_info: + if hasattr(build_info, "msvcp_dll_names"): missing = [] - for dll_name in build_info.build_info[MSVCP_DLL_NAMES].split(","): + for dll_name in build_info.msvcp_dll_names.split(","): try: ctypes.WinDLL(dll_name) except OSError: diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py index a155ef04a4c..721ad99c60a 100644 --- a/tensorflow/python/platform/sysconfig.py +++ b/tensorflow/python/platform/sysconfig.py @@ -24,7 +24,6 @@ import platform as _platform from tensorflow.python.framework.versions import CXX11_ABI_FLAG as _CXX11_ABI_FLAG from tensorflow.python.framework.versions import MONOLITHIC_BUILD as _MONOLITHIC_BUILD from tensorflow.python.framework.versions import VERSION as _VERSION -from tensorflow.python.platform import build_info from tensorflow.python.util.tf_export import tf_export @@ -85,30 +84,3 @@ def get_link_flags(): else: flags.append('-l:libtensorflow_framework.so.%s' % ver) return flags - - -@tf_export('sysconfig.get_build_info') -def get_build_info(): - """Get a dictionary describing TensorFlow's build environment. - - Values are generated when TensorFlow is compiled, and are static for each - TensorFlow package. The return value is a dictionary with string keys such as: - - - cuda_version - - cudnn_version - - tensorrt_version - - nccl_version - - is_cuda_build - - is_rocm_build - - msvcp_dll_names - - nvcuda_dll_name - - cudart_dll_name - - cudnn_dll_name - - Note that the actual keys and values returned by this function is subject to - change across different versions of TensorFlow or across platforms. - - Returns: - A Dictionary describing TensorFlow's build environment. 
- """ - return build_info.build_info diff --git a/tensorflow/python/platform/sysconfig_test.py b/tensorflow/python/platform/sysconfig_test.py deleted file mode 100644 index 3e5956bf4f7..00000000000 --- a/tensorflow/python/platform/sysconfig_test.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.platform import googletest -from tensorflow.python.platform import sysconfig -from tensorflow.python.platform import test - - -class SysconfigTest(googletest.TestCase): - - def test_get_build_info_works(self): - build_info = sysconfig.get_build_info() - self.assertIsInstance(build_info, dict) - - def test_rocm_cuda_info_matches(self): - build_info = sysconfig.get_build_info() - self.assertEqual(build_info["is_rocm_build"], test.is_built_with_rocm()) - self.assertEqual(build_info["is_cuda_build"], test.is_built_with_cuda()) - - -if __name__ == "__main__": - googletest.main() diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 7d35ee7d8f8..f56330b428a 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2593,10 +2593,6 @@ def tf_version_info_genrule(name, out): arguments = "--generate \"$@\" --git_tag_override=${GIT_TAG_OVERRIDE:-}", ) -def dict_to_kv(d): - """Convert a dictionary to a space-joined list of key=value pairs.""" - return " " + " ".join(["%s=%s" % (k, v) for k, v in d.items()]) - def tf_py_build_info_genrule(name, out): _local_genrule( name = name, @@ -2604,17 +2600,16 @@ def tf_py_build_info_genrule(name, out): exec_tool = "//tensorflow/tools/build_info:gen_build_info", arguments = "--raw_generate \"$@\" " + - " --key_value" + - " is_rocm_build=" + if_rocm("True", "False") + - " is_cuda_build=" + if_cuda("True", "False") + - # TODO(angerson) Can we reliably load CUDA compute capabilities here? 
- if_windows(dict_to_kv({ - "msvcp_dll_names": "msvcp140.dll,msvcp140_1.dll", - }), "") + if_windows_cuda(dict_to_kv({ - "nvcuda_dll_name": "nvcuda.dll", - "cudart_dll_name": "cudart64_$$(echo $${TF_CUDA_VERSION:-} | sed \"s/\\.//\").dll", - "cudnn_dll_name": "cudnn64_$${TF_CUDNN_VERSION:-}.dll", - }), ""), + " --is_config_cuda " + if_cuda("True", "False") + + " --is_config_rocm " + if_rocm("True", "False") + + " --key_value " + + if_cuda(" cuda_version_number=${TF_CUDA_VERSION:-} cudnn_version_number=${TF_CUDNN_VERSION:-} ", "") + + if_windows(" msvcp_dll_names=msvcp140.dll,msvcp140_1.dll ", "") + + if_windows_cuda(" ".join([ + "nvcuda_dll_name=nvcuda.dll", + "cudart_dll_name=cudart64_$(echo $${TF_CUDA_VERSION:-} | sed \"s/\\.//\").dll", + "cudnn_dll_name=cudnn64_${TF_CUDNN_VERSION:-}.dll", + ]), ""), ) def cc_library_with_android_deps( diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt index 7b05d382f6c..811ca18cdb4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt @@ -8,10 +8,6 @@ tf_module { name: "MONOLITHIC_BUILD" mtype: "" } - member_method { - name: "get_build_info" - argspec: "args=[], varargs=None, keywords=None, defaults=None" - } member_method { name: "get_compile_flags" argspec: "args=[], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt index 7b05d382f6c..811ca18cdb4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt @@ -8,10 +8,6 @@ tf_module { name: "MONOLITHIC_BUILD" mtype: "" } - member_method { - name: "get_build_info" - argspec: "args=[], varargs=None, keywords=None, defaults=None" - } member_method { name: "get_compile_flags" argspec: "args=[], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/build_info/BUILD b/tensorflow/tools/build_info/BUILD index f1292408448..556dd0c86f0 100644 --- a/tensorflow/tools/build_info/BUILD +++ b/tensorflow/tools/build_info/BUILD @@ -14,7 +14,6 @@ py_binary( srcs_version = "PY2AND3", tags = ["no-remote-exec"], deps = [ - "//third_party/gpus:find_cuda_config", "@six_archive//:six", ], ) diff --git a/tensorflow/tools/build_info/gen_build_info.py b/tensorflow/tools/build_info/gen_build_info.py index c8330d9310f..df9068fb3d1 100755 --- a/tensorflow/tools/build_info/gen_build_info.py +++ b/tensorflow/tools/build_info/gen_build_info.py @@ -1,4 +1,4 @@ -# Lint as: python3 +# Lint as: python2, python3 # Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,62 +19,50 @@ from __future__ import division from __future__ import print_function import argparse -import os -import platform -import sys import six -# CUDA library gathering is only valid in OSS -try: - from third_party.gpus import find_cuda_config # pylint: disable=g-import-not-at-top -except ImportError: - find_cuda_config = None - -def write_build_info(filename, key_value_list): +def write_build_info(filename, is_config_cuda, is_config_rocm, key_value_list): """Writes a Python that describes the build. Args: filename: filename to write to. + is_config_cuda: Whether this build is using CUDA. + is_config_rocm: Whether this build is using ROCm. 
key_value_list: A list of "key=value" strings that will be added to the - module's "build_info" dictionary as additional entries. + module as additional fields. + + Raises: + ValueError: If `key_value_list` includes the key "is_cuda_build", which + would clash with one of the default fields. """ + module_docstring = "\"\"\"Generates a Python module containing information " + module_docstring += "about the build.\"\"\"" - build_info = {} - for arg in key_value_list: - key, value = six.ensure_str(arg).split("=") - if value.lower() == "true": - build_info[key] = True - elif value.lower() == "false": - build_info[key] = False - else: - build_info[key] = value + build_config_rocm_bool = "False" + build_config_cuda_bool = "False" - # Generate cuda_build_info, a dict describing the CUDA component versions - # used to build TensorFlow. - if find_cuda_config and build_info.get("is_cuda_build", False): - libs = ["_", "cuda", "cudnn"] - if platform.system() == "Linux": - if os.environ.get("TF_NEED_TENSORRT", "0") == "1": - libs.append("tensorrt") - if "TF_NCCL_VERSION" in os.environ: - libs.append("nccl") - # find_cuda_config accepts libraries to inspect as argv from the command - # line. We can work around this restriction by setting argv manually - # before calling find_cuda_config. - backup_argv = sys.argv - sys.argv = libs - cuda = find_cuda_config.find_cuda_config() + if is_config_rocm == "True": + build_config_rocm_bool = "True" + elif is_config_cuda == "True": + build_config_cuda_bool = "True" - build_info["cuda_version"] = cuda["cuda_version"] - build_info["cudnn_version"] = cuda["cudnn_version"] - build_info["tensorrt_version"] = cuda.get("tensorrt_version", None) - build_info["nccl_version"] = cuda.get("nccl_version", None) - sys.argv = backup_argv + key_value_pair_stmts = [] + if key_value_list: + for arg in key_value_list: + key, value = six.ensure_str(arg).split("=") + if key == "is_cuda_build": + raise ValueError("The key \"is_cuda_build\" cannot be passed as one of " + "the --key_value arguments.") + if key == "is_rocm_build": + raise ValueError("The key \"is_rocm_build\" cannot be passed as one of " + "the --key_value arguments.") + key_value_pair_stmts.append("%s = %r" % (key, value)) + key_value_pair_content = "\n".join(key_value_pair_stmts) contents = """ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -88,21 +76,33 @@ def write_build_info(filename, key_value_list): # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -\"\"\"Auto-generated module providing information about the build.\"\"\" +%s from __future__ import absolute_import from __future__ import division from __future__ import print_function -from collections import namedtuple +is_rocm_build = %s +is_cuda_build = %s -build_info = {build_info} -""".format(build_info=build_info) +%s +""" % (module_docstring, build_config_rocm_bool, build_config_cuda_bool, + key_value_pair_content) open(filename, "w").write(contents) parser = argparse.ArgumentParser( description="""Build info injection into the PIP package.""") +parser.add_argument( + "--is_config_cuda", + type=str, + help="'True' for CUDA GPU builds, 'False' otherwise.") + +parser.add_argument( + "--is_config_rocm", + type=str, + help="'True' for ROCm GPU builds, 'False' otherwise.") + parser.add_argument("--raw_generate", type=str, help="Generate build_info.py") parser.add_argument( @@ -110,7 +110,10 @@ parser.add_argument( args = parser.parse_args() -if args.raw_generate: - write_build_info(args.raw_generate, args.key_value) +if (args.raw_generate is not None) and (args.is_config_cuda is not None) and ( + args.is_config_rocm is not None): + write_build_info(args.raw_generate, args.is_config_cuda, args.is_config_rocm, + args.key_value) else: - raise RuntimeError("--raw_generate must be used.") + raise RuntimeError( + "--raw_generate, --is_config_cuda and --is_config_rocm must be used") diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index fafe494bed4..f61e00c01d5 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -1,4 +1,3 @@ -# lint as: python3 # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -44,8 +43,6 @@ from setuptools import setup from setuptools.command.install import install as InstallCommandBase from setuptools.dist import Distribution -from tensorflow.python.platform import build_info - DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. @@ -85,22 +82,6 @@ REQUIRED_PACKAGES = [ 'scipy == 1.2.2;python_version<"3"', ] -# Generate a footer describing the CUDA technology this release was built -# against. -GPU_DESCRIPTION = '' -if build_info.build_info['is_cuda_build']: - gpu_header = ('\nTensorFlow {} for NVIDIA GPUs was built with these ' - 'platform and library versions:\n\n - ').format(_VERSION) - bi = build_info.build_info - trt_ver = bi['tensorrt_version'] - nccl_ver = bi['nccl_version'] - GPU_DESCRIPTION = gpu_header + '\n - '.join([ - 'NVIDIA CUDA ' + bi['cuda_version'], - 'NVIDIA cuDNN ' + bi['cudnn_version'], - 'NVIDIA NCCL ' + 'not enabled' if not nccl_ver else nccl_ver, - 'NVIDIA TensorRT ' + 'not enabled' if not trt_ver else trt_ver, - ]) - if sys.byteorder == 'little': # grpcio does not build correctly on big-endian machines due to lack of # BoringSSL support. @@ -136,8 +117,7 @@ CONSOLE_SCRIPTS = [ # even though the command is not removed, just moved to a different wheel. 
'tensorboard = tensorboard.main:run_main', 'tf_upgrade_v2 = tensorflow.tools.compatibility.tf_upgrade_v2_main:main', - 'estimator_ckpt_converter = ' - 'tensorflow_estimator.python.estimator.tools.checkpoint_converter:main', + 'estimator_ckpt_converter = tensorflow_estimator.python.estimator.tools.checkpoint_converter:main', ] # pylint: enable=line-too-long @@ -181,10 +161,11 @@ class InstallHeaders(Command): """ description = 'install C/C++ header files' - user_options = [ - ('install-dir=', 'd', 'directory to install header files to'), - ('force', 'f', 'force installation (overwrite existing files)'), - ] + user_options = [('install-dir=', 'd', + 'directory to install header files to'), + ('force', 'f', + 'force installation (overwrite existing files)'), + ] boolean_options = ['force'] @@ -194,7 +175,8 @@ class InstallHeaders(Command): self.outfiles = [] def finalize_options(self): - self.set_undefined_options('install', ('install_headers', 'install_dir'), + self.set_undefined_options('install', + ('install_headers', 'install_dir'), ('force', 'force')) def mkdir_and_copy_file(self, header): @@ -254,7 +236,9 @@ so_lib_paths = [ matches = [] for path in so_lib_paths: - matches.extend(['../' + x for x in find_files('*', path) if '.py' not in x]) + matches.extend( + ['../' + x for x in find_files('*', path) if '.py' not in x] + ) if os.name == 'nt': EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.pyd' @@ -274,16 +258,17 @@ headers = ( list(find_files('*.h', 'tensorflow/stream_executor')) + list(find_files('*.h', 'google/com_google_protobuf/src')) + list(find_files('*.inc', 'google/com_google_protobuf/src')) + - list(find_files('*', 'third_party/eigen3')) + - list(find_files('*.h', 'tensorflow/include/external/com_google_absl')) + - list(find_files('*.inc', 'tensorflow/include/external/com_google_absl')) + - list(find_files('*', 'tensorflow/include/external/eigen_archive'))) + list(find_files('*', 'third_party/eigen3')) + list( + find_files('*.h', 'tensorflow/include/external/com_google_absl')) + + list( + find_files('*.inc', 'tensorflow/include/external/com_google_absl')) + + list(find_files('*', 'tensorflow/include/external/eigen_archive'))) setup( name=project_name, version=_VERSION.replace('-', ''), description=DOCLINES[0], - long_description='\n'.join(DOCLINES[2:]) + GPU_DESCRIPTION, + long_description='\n'.join(DOCLINES[2:]), url='https://www.tensorflow.org/', download_url='https://github.com/tensorflow/tensorflow/tags', author='Google Inc.', @@ -304,11 +289,6 @@ setup( ] + matches, }, zip_safe=False, - # Accessible with importlib.metadata.metadata('tf-pkg-name').items() - platforms=[ - '{}:{}'.format(key, value) - for key, value in build_info.build_info.items() - ], distclass=BinaryDistribution, cmdclass={ 'install_headers': InstallHeaders, diff --git a/third_party/gpus/BUILD b/third_party/gpus/BUILD index d570c4894ce..e69de29bb2d 100644 --- a/third_party/gpus/BUILD +++ b/third_party/gpus/BUILD @@ -1,6 +0,0 @@ -# Expose find_cuda_config.py as a library so other tools can reference it. 
-py_library( - name = "find_cuda_config", - srcs = ["find_cuda_config.py"], - visibility = ["//visibility:public"], -) From b661070db9d29a2679310fe063b21582eeed9769 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Tue, 12 May 2020 10:11:10 -0700 Subject: [PATCH 0415/1533] IWYU in profiler/internal PiperOrigin-RevId: 311149561 Change-Id: I71100194af937fc66d44c32265b8fe8febf070df --- tensorflow/core/profiler/internal/BUILD | 16 ++++++---------- .../core/profiler/internal/annotation_stack.cc | 4 ++++ .../core/profiler/internal/annotation_stack.h | 1 + tensorflow/core/profiler/internal/cpu/BUILD | 16 ++++++++++------ .../core/profiler/internal/cpu/host_tracer.cc | 9 +++++++-- .../profiler/internal/cpu/host_tracer_test.cc | 18 ++++++++++++------ .../profiler/internal/cpu/host_tracer_utils.cc | 3 +++ .../internal/cpu/metadata_collector.cc | 6 ++++++ .../profiler/internal/cpu/python_tracer.cc | 9 ++++----- .../core/profiler/internal/parse_annotation.cc | 3 +++ .../core/profiler/internal/parse_annotation.h | 1 - .../profiler/internal/parse_annotation_test.cc | 3 +++ .../core/profiler/internal/profiler_factory.cc | 6 ++++++ .../core/profiler/internal/profiler_factory.h | 1 + .../profiler/internal/profiler_interface.h | 1 - .../internal/scoped_annotation_test.cc | 13 ++++++++----- .../core/profiler/internal/traceme_recorder.cc | 10 ++++++++++ .../core/profiler/internal/traceme_recorder.h | 2 -- .../profiler/internal/traceme_recorder_test.cc | 15 ++++++++++++--- 19 files changed, 96 insertions(+), 41 deletions(-) diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD index 9fab42cd54a..85fa4e7fc44 100644 --- a/tensorflow/core/profiler/internal/BUILD +++ b/tensorflow/core/profiler/internal/BUILD @@ -423,8 +423,10 @@ tf_cc_test( deps = [ ":traceme_recorder", "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", "@com_google_absl//absl/strings", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -434,7 +436,6 @@ cc_library( deps = [ "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", ], ) @@ -444,6 +445,7 @@ cc_library( hdrs = ["profiler_factory.h"], deps = [ ":profiler_interface", + "//tensorflow/core/profiler:profiler_options_proto_cc", ] + if_static([ ":profiler_factory_impl", ]), @@ -461,8 +463,7 @@ cc_library( deps = [ ":profiler_interface", "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", ], alwayslink = True, ) @@ -513,15 +514,10 @@ tf_cc_test( srcs = ["scoped_annotation_test.cc"], deps = [ ":annotation_stack", - "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", "//tensorflow/core/profiler/lib:scoped_annotation", "@com_google_absl//absl/strings", ], @@ -544,6 +540,6 @@ tf_cc_test( ":parse_annotation", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/internal/annotation_stack.cc b/tensorflow/core/profiler/internal/annotation_stack.cc index 4cfd1027a68..4c15ca47c3d 100644 --- 
a/tensorflow/core/profiler/internal/annotation_stack.cc +++ b/tensorflow/core/profiler/internal/annotation_stack.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/core/profiler/internal/annotation_stack.h" +#include + +#include "tensorflow/core/platform/types.h" + namespace tensorflow { namespace profiler { namespace internal { diff --git a/tensorflow/core/profiler/internal/annotation_stack.h b/tensorflow/core/profiler/internal/annotation_stack.h index 38cd962cb32..e626c4c73cc 100644 --- a/tensorflow/core/profiler/internal/annotation_stack.h +++ b/tensorflow/core/profiler/internal/annotation_stack.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD index e156667c5a7..c24c8c7d456 100644 --- a/tensorflow/core/profiler/internal/cpu/BUILD +++ b/tensorflow/core/profiler/internal/cpu/BUILD @@ -18,6 +18,7 @@ cc_library( "//tensorflow/core/profiler/utils:tf_op_utils", "//tensorflow/core/profiler/utils:xplane_builder", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", ], ) @@ -26,10 +27,10 @@ cc_library( srcs = ["host_tracer.cc"], deps = [ ":host_tracer_utils", - "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/internal:traceme_recorder", @@ -50,14 +51,17 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/lib:profiler_session", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_visitor", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -67,17 +71,14 @@ cc_library( copts = ["-fexceptions"], features = ["-use_header_modules"], deps = [ - "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", - "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/python/profiler/internal:python_hooks", - "@com_google_absl//absl/strings", ], alwayslink = True, ) @@ -86,9 +87,12 @@ cc_library( name = "metadata_collector", srcs = ["metadata_collector.cc"], deps = [ + "//tensorflow/compiler/xla/service:hlo_proto_cc", "//tensorflow/compiler/xla/service/gpu:gpu_debug_info_manager", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", 
"//tensorflow/core/profiler/protobuf:xplane_proto_cc", diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc index 30b87c84fa2..be1a7a2777b 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc @@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #include #include #include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/internal/traceme_recorder.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" @@ -119,8 +124,8 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) { std::vector parts = absl::StrSplit(event.name, kUserMetadataMarker); if (parts.size() >= 2) { - ns->set_node_name(string(parts[0])); - ns->set_timeline_label(string(parts[1])); + ns->set_node_name(std::string(parts[0])); + ns->set_timeline_label(std::string(parts[1])); } else { ns->set_node_name(std::move(event.name)); } diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc index e32ba92de66..499b7b6b564 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc @@ -12,17 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include #include #include -#include +#include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/lib/profiler_session.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" @@ -38,13 +44,13 @@ namespace { using ::testing::UnorderedElementsAre; -NodeExecStats MakeNodeStats(const string& name, uint32 thread_id, - const string& label = "") { +NodeExecStats MakeNodeStats(absl::string_view name, uint32 thread_id, + absl::string_view label = "") { NodeExecStats ns; - ns.set_node_name(name); + ns.set_node_name(std::string(name)); ns.set_thread_id(thread_id); if (!label.empty()) { - ns.set_timeline_label(label); + ns.set_timeline_label(std::string(label)); } return ns; } @@ -109,7 +115,7 @@ TEST(HostTracerTest, CollectsTraceMeEventsAsRunMetadata) { TEST(HostTracerTest, CollectsTraceMeEventsAsXSpace) { uint32 thread_id; - string thread_name = "MyThreadName"; + std::string thread_name = "MyThreadName"; XSpace space; // We start a thread with a known and controled name. As of the time of diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc index a4709ae2113..2e5d8ac1770 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc @@ -14,10 +14,13 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h" +#include #include #include #include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/parse_annotation.h" #include "tensorflow/core/profiler/internal/traceme_recorder.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" diff --git a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc index c6aa7840920..58da20ae3c5 100644 --- a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc +++ b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc @@ -13,17 +13,23 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include #include #include #include "tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/internal/cpu/python_tracer.cc b/tensorflow/core/profiler/internal/cpu/python_tracer.cc index 103db6e0c71..d684cb8f768 100644 --- a/tensorflow/core/profiler/internal/cpu/python_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/python_tracer.cc @@ -12,18 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include +#include -#include "absl/strings/str_split.h" -#include "tensorflow/core/framework/step_stats.pb.h" -#include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/python/profiler/internal/python_hooks.h" diff --git a/tensorflow/core/profiler/internal/parse_annotation.cc b/tensorflow/core/profiler/internal/parse_annotation.cc index 2a3fa3f8454..32c26befa3d 100644 --- a/tensorflow/core/profiler/internal/parse_annotation.cc +++ b/tensorflow/core/profiler/internal/parse_annotation.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/core/profiler/internal/parse_annotation.h" #include +#include +#include +#include #include "absl/strings/ascii.h" #include "absl/strings/str_split.h" diff --git a/tensorflow/core/profiler/internal/parse_annotation.h b/tensorflow/core/profiler/internal/parse_annotation.h index 6c2e536962b..bb0f12217d3 100644 --- a/tensorflow/core/profiler/internal/parse_annotation.h +++ b/tensorflow/core/profiler/internal/parse_annotation.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PARSE_ANNOTATION_H_ #define TENSORFLOW_CORE_PROFILER_INTERNAL_PARSE_ANNOTATION_H_ -#include #include #include "absl/strings/string_view.h" diff --git a/tensorflow/core/profiler/internal/parse_annotation_test.cc b/tensorflow/core/profiler/internal/parse_annotation_test.cc index 4d4a2d5ea95..e5d876ac5af 100644 --- a/tensorflow/core/profiler/internal/parse_annotation_test.cc +++ b/tensorflow/core/profiler/internal/parse_annotation_test.cc @@ -14,6 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/profiler/internal/parse_annotation.h" +#include + +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/internal/profiler_factory.cc b/tensorflow/core/profiler/internal/profiler_factory.cc index e2bae59b892..5152e79bdc8 100644 --- a/tensorflow/core/profiler/internal/profiler_factory.cc +++ b/tensorflow/core/profiler/internal/profiler_factory.cc @@ -14,8 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/internal/profiler_factory.h" +#include +#include +#include + #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/internal/profiler_factory.h b/tensorflow/core/profiler/internal/profiler_factory.h index 6bcdcf28c3c..c223d7275d9 100644 --- a/tensorflow/core/profiler/internal/profiler_factory.h +++ b/tensorflow/core/profiler/internal/profiler_factory.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h index 79dfc7af2b2..9fe85e38652 100644 --- a/tensorflow/core/profiler/internal/profiler_interface.h +++ b/tensorflow/core/profiler/internal/profiler_interface.h @@ -16,7 +16,6 @@ limitations under the License. #define TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_ #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/protobuf/config.pb.h" diff --git a/tensorflow/core/profiler/internal/scoped_annotation_test.cc b/tensorflow/core/profiler/internal/scoped_annotation_test.cc index 70a627fd640..50c1244b9ee 100644 --- a/tensorflow/core/profiler/internal/scoped_annotation_test.cc +++ b/tensorflow/core/profiler/internal/scoped_annotation_test.cc @@ -15,10 +15,11 @@ limitations under the License. 
#include "tensorflow/core/profiler/lib/scoped_annotation.h" +#include + #include "absl/strings/str_cat.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/annotation_stack.h" namespace tensorflow { @@ -48,11 +49,13 @@ TEST(ScopedAnnotation, Simple) { EXPECT_EQ(AnnotationStack::Get(), ""); // not enabled } -string GenerateRandomString(int length) { return string(length, 'a'); } +std::string GenerateRandomString(int length) { + return std::string(length, 'a'); +} void BM_ScopedAnnotationDisabled(int iters, int annotation_size) { testing::StopTiming(); - string annotation = GenerateRandomString(annotation_size); + std::string annotation = GenerateRandomString(annotation_size); testing::StartTiming(); for (int i = 0; i < iters; i++) { ScopedAnnotation trace(annotation); @@ -64,7 +67,7 @@ BENCHMARK(BM_ScopedAnnotationDisabled)->Arg(8)->Arg(32)->Arg(128); void BM_ScopedAnnotationEnabled(int iters, int annotation_size) { testing::StopTiming(); - string annotation = GenerateRandomString(annotation_size); + std::string annotation = GenerateRandomString(annotation_size); AnnotationStack::Enable(true); testing::StartTiming(); for (int i = 0; i < iters; i++) { @@ -78,7 +81,7 @@ BENCHMARK(BM_ScopedAnnotationEnabled)->Arg(8)->Arg(32)->Arg(128); void BM_ScopedAnnotationEnabled_Nested(int iters, int annotation_size) { testing::StopTiming(); - string annotation = GenerateRandomString(annotation_size); + std::string annotation = GenerateRandomString(annotation_size); AnnotationStack::Enable(true); testing::StartTiming(); for (int i = 0; i < iters; i++) { diff --git a/tensorflow/core/profiler/internal/traceme_recorder.cc b/tensorflow/core/profiler/internal/traceme_recorder.cc index 365e3992bc3..268585bde8c 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder.cc +++ b/tensorflow/core/profiler/internal/traceme_recorder.cc @@ -16,8 +16,18 @@ limitations under the License. #include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/internal/traceme_recorder.h b/tensorflow/core/profiler/internal/traceme_recorder.h index 8b5b32cf4bc..1da7d4cebb1 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder.h +++ b/tensorflow/core/profiler/internal/traceme_recorder.h @@ -15,8 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_ #define TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_ -#include - #include #include diff --git a/tensorflow/core/profiler/internal/traceme_recorder_test.cc b/tensorflow/core/profiler/internal/traceme_recorder_test.cc index 90478881361..8d7abc94e8f 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder_test.cc +++ b/tensorflow/core/profiler/internal/traceme_recorder_test.cc @@ -15,19 +15,28 @@ limitations under the License. 
#include "tensorflow/core/profiler/internal/traceme_recorder.h" #include +#include +#include +#include +#include +#include #include -#include #include "absl/strings/str_cat.h" -#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { namespace profiler { namespace { +using ::testing::ElementsAre; + MATCHER_P(Named, name, "") { return arg.name == name; } constexpr static uint64 kNanosInSec = 1000000000; @@ -45,7 +54,7 @@ TEST(RecorderTest, SingleThreaded) { ASSERT_EQ(results.size(), 1); EXPECT_THAT(results[0].events, - ::testing::ElementsAre(Named("during1"), Named("during2"))); + ElementsAre(Named("during1"), Named("during2"))); } void SpinNanos(int nanos) { From 2407170febcdc37fbe90d9f5d8968f2b94ec17dc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 10:48:33 -0700 Subject: [PATCH 0416/1533] Add json translation for tfjs mlir converter. TFJS ops are registered as TF custom ops, and utilize export_graphdef.cc to build out the GraphDef object that could contain both TF and TFJS dialects. PiperOrigin-RevId: 311158257 Change-Id: I7313a5a01f12ef742a97fd5e9ff2bbffe8498b0c --- tensorflow/compiler/mlir/runlit.cfg.py | 6 +- tensorflow/compiler/mlir/runlit.site.cfg.py | 1 + .../mlir/tensorflow/utils/export_utils.cc | 23 ++- .../mlir/tensorflow/utils/export_utils.h | 7 + tensorflow/compiler/mlir/tfjs/BUILD | 101 +++++++++- tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h | 1 + tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD | 23 +++ .../compiler/mlir/tfjs/tests/e2e/add.pbtxt | 78 ++++++++ .../compiler/mlir/tfjs/tests/e2e/prelu.pbtxt | 175 ++++++++++++++++++ .../compiler/mlir/tfjs/tf_tfjs_passes.cc | 8 +- .../mlir/tfjs/translate/json_translate.cc | 105 +++++++++++ .../mlir/tfjs/translate/json_translate.h | 31 ++++ .../mlir/tfjs/translate/tf_tfjs_translate.cc | 173 +++++++++++++++++ .../mlir/tfjs/translate/tf_to_tfjs_json.cc | 152 +++++++++++++++ .../mlir/tfjs/translate/tf_to_tfjs_json.h | 63 +++++++ 15 files changed, 938 insertions(+), 9 deletions(-) create mode 100644 tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD create mode 100644 tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt create mode 100644 tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt create mode 100644 tensorflow/compiler/mlir/tfjs/translate/json_translate.cc create mode 100644 tensorflow/compiler/mlir/tfjs/translate/json_translate.h create mode 100644 tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc create mode 100644 tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc create mode 100644 tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index 6d3131a781c..f1271d0da24 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -70,9 +70,9 @@ tool_dirs = config.mlir_tf_tools_dirs + [ ] tool_names = [ 'mlir-opt', 'mlir-translate', 'tf-opt', 'tf_tfl_translate', - 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', - 'mlir-tflite-runner', 'tfcompile', 'json_to_flatbuffer', 'xla-gpu-opt', - 'xla-opt' + 'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate', + 'tf-mlir-translate', 
'mlir-tflite-runner', 'tfcompile', + 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-opt' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index 661e6200df3..3e7596c75d7 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -44,6 +44,7 @@ mlir_tf_tools_dirs = [ 'tensorflow/compiler/mlir', 'tensorflow/compiler/mlir/lite', 'tensorflow/compiler/mlir/tensorflow', + 'tensorflow/compiler/mlir/tfjs', 'tensorflow/compiler/mlir/xla', 'tensorflow/compiler/aot', 'tensorflow/compiler/xla/service/mlir_gpu', diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index cc795259893..4877cbc4a44 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -59,6 +59,18 @@ limitations under the License. namespace tensorflow { namespace { +// static TensorFlow op prefix set. +std::set* GlobalOpPrefixes() { + static std::set* global_op_prefixes = [] { + std::set* result = new std::set; + result->insert("tf."); + result->insert("_tf."); + result->insert("tf_executor."); + return result; + }(); + return global_op_prefixes; +} + // Converts a location to the debug information for the node def. Status ConvertLocation(mlir::Location inst_loc, NodeDef::ExperimentalDebugInfo* debug_info) { @@ -268,8 +280,10 @@ StatusOr GetTensorFlowOpName(llvm::StringRef op_name) { // - ".sink" or ".Sink": only the NextIteration operation has this suffix. We // don't need to consider ".source"/".Source" because the nodes with this // suffix are skipped by the caller and will not be added to the graph. - if (!op_name.consume_front("_tf.") && !op_name.consume_front("tf.") && - !op_name.consume_front("tf_executor.")) { + auto prefixes = GlobalOpPrefixes(); + if (std::none_of(prefixes->begin(), prefixes->end(), [&](std::string prefix) { + return op_name.consume_front(prefix); + })) { return errors::FailedPrecondition("op node '", op_name.str(), "' was not a TF op!"); } @@ -506,4 +520,9 @@ bool IsLegacyCallInstruction(mlir::Operation* inst) { inst->getName().getStringRef().compare("_tf.LegacyCall") == 0; } +Status AddTensorFlowOpPrefix(std::string prefix) { + GlobalOpPrefixes()->insert(prefix); + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 32ed528bd0d..58fe39fa4e8 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -34,10 +34,17 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/stream_executor/lib/statusor.h" +namespace mlir { +class ShapedType; +} // namespace mlir + namespace tensorflow { using stream_executor::port::StatusOr; +// Add custom op prefix for TensorFlow dialects. +Status AddTensorFlowOpPrefix(std::string); + // Maps an MLIR op name in the TensorFlow dialect or the TensorFlow control // dialect back into a TensorFlow valid op name. 
StatusOr GetTensorFlowOpName(llvm::StringRef); diff --git a/tensorflow/compiler/mlir/tfjs/BUILD b/tensorflow/compiler/mlir/tfjs/BUILD index 9b731d2c912..806a77e9c38 100644 --- a/tensorflow/compiler/mlir/tfjs/BUILD +++ b/tensorflow/compiler/mlir/tfjs/BUILD @@ -1,4 +1,5 @@ load("//third_party/mlir:tblgen.bzl", "gentbl") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") package( default_visibility = ["//visibility:public"], @@ -131,10 +132,106 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", - "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Transforms", ], ) + +cc_library( + name = "json_translate_lib", + srcs = [ + "translate/json_translate.cc", + ], + hdrs = [ + "translate/json_translate.h", + ], + deps = [ + ":tensorflow_js", + ":tensorflow_js_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:export_utils", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Translation", + ], + alwayslink = 1, +) + +cc_library( + name = "tf_to_tfjs_json", + srcs = ["translate/tf_to_tfjs_json.cc"], + hdrs = [ + "translate/tf_to_tfjs_json.h", + ], + deps = [ + ":json_translate_lib", + ":tfjs_optimize", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + +tf_cc_binary( + name = "json_translate", + deps = [ + ":json_translate_lib", + "@llvm-project//mlir:MlirTranslateMain", + ], +) + +filegroup( + name = "tf_tfjs_translate_main", + srcs = [ + "translate/tf_tfjs_translate.cc", + ], +) + +tf_cc_binary( + name = "tf_tfjs_translate", + srcs = [":tf_tfjs_translate_main"], + deps = [ + ":json_translate_lib", + ":tensorflow_js_passes", + ":tf_to_tfjs_json", + ":tfjs_optimize", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + 
"@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h index 318895de79c..545183a052b 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Interfaces/SideEffects.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project + namespace mlir { namespace tfjs { diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD new file mode 100644 index 00000000000..5c8d37da2f0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD @@ -0,0 +1,23 @@ +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +licenses(["notice"]) + +glob_lit_tests( + data = [ + ":test_utilities", + ], + driver = "@llvm-project//mlir:run_lit.sh", + test_file_exts = [ + "pbtxt", + ], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir/tfjs:tf_tfjs_translate", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt b/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt new file mode 100644 index 00000000000..f6a324fdc13 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt @@ -0,0 +1,78 @@ +# RUN: tf_tfjs_translate %s -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Mul -o - | FileCheck %s --dump-input-on-failure +# Add two tensor<4xi32> inputs and return the result + +node { + name: "Add" + op: "Add" + input: "input0" + input: "input1" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } +} +node { + name: "input1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } +} +node { + name: "Mul" + op: "Mul" + input: "Add" + input: "Add" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +versions { + producer: 27 +} + +# CHECK: "name": "input0" +# CHECK-NEXT: "op": "Placeholder" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "input1", +# CHECK-NEXT: "op": "Placeholder" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Add" +# CHECK-NEXT: "op": "AddV2" +# CHECK-NEXT: "input": +# CHECK-NEXT: "input0" +# CHECK-NEXT: "input1" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Mul1" +# CHECK-NEXT: "op": "Mul" +# CHECK-NEXT: "input": +# CHECK-NEXT: "Add" +# CHECK-NEXT: "Add" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Mul" +# CHECK-NEXT: "op": "_Retval" +# CHECK-NEXT: "input": +# CHECK-NEXT: "Mul1" +# CHECK: "type": "DT_INT32" +# CHECK: "library" +# CHECK: "versions" +# CHECK: "producer": 27 + diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt b/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt new file mode 100644 index 00000000000..810db71f5e0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt @@ -0,0 +1,175 @@ +# RUN: tf_tfjs_translate %s -tf-input-arrays=input0 -tf-input-data-types=DT_FLOAT -tf-input-shapes=10 -tf-output-arrays=Add -tf-custom-opdefs="name: 'Prelu' input_arg: { name: 'x' type: DT_FLOAT } input_arg: { name: 'alpha' type: DT_FLOAT } output_arg: { name: 'c' type: DT_FLOAT }" -o - | FileCheck %s --dump-input-on-failure +# Add two tensor<4xi32> inputs 
and return the result + +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 10 + } + } + } + } + experimental_debug_info { + } +} +node { + name: "alpha" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.5 + } + } + } + experimental_debug_info { + } +} +node { + name: "Relu" + op: "Relu" + input: "input0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Neg" + op: "Neg" + input: "input0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Relu1" + op: "Relu" + input: "Neg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Mul" + op: "Mul" + input: "alpha" + input: "Relu1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Add" + op: "Add" + input: "Relu" + input: "Mul" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "main" + op: "_Retval" + input: "Add" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +library { +} +versions { + producer: 344 +} + +# CHECK: "node": +# CHECK: "name": "input0", +# CHECK-NEXT: "op": "Placeholder", +# CHECK-NEXT: "attr": +# CHECK: "type": "DT_FLOAT" +# CHECK: "name": "Add.Relu.Neg.Relu1.Mul", +# CHECK-NEXT: "op": "Const", +# CHECK-NEXT: "attr": +# CHECK: "value": +# CHECK: "tensor": +# CHECK: "dtype": "DT_FLOAT", +# CHECK: "tensorShape": {}, +# CHECK: "floatVal": +# CHECK: -0.5 +# CHECK: "name": "Add.Relu.Neg.Relu1.Mul1", +# CHECK-NEXT: "op": "Prelu", +# CHECK-NEXT: "input": +# CHECK: "input0", +# CHECK: "Add.Relu.Neg.Relu1.Mul" +# CHECK: "attr": +# CHECK: "_output_shapes": +# CHECK: "list": +# CHECK: "shape": +# CHECK: "dim": +# CHECK: "size": "10" +# CHECK: "experimentalDebugInfo": {} +# CHECK: "name": "Add", +# CHECK-NEXT: "op": "_Retval", +# CHECK-NEXT: "input": +# CHECK: "Add.Relu.Neg.Relu1.Mul1" +# CHECK: "attr": +# CHECK: "T": +# CHECK: "type": "DT_FLOAT" +# CHECK: "library": {}, +# CHECK: "versions": +# CHECK: "producer": 344 + diff --git a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc index 631bb1ae2af..a445937570e 100644 --- a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc +++ b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tfjs/transforms/passes.h" @@ -47,6 +46,11 @@ void AddTFToTFJSConversionPasses(mlir::OpPassManager* pm) { // Canonicalize, CSE etc. 
pm->addNestedPass(mlir::createCanonicalizerPass()); pm->addNestedPass(mlir::createCSEPass()); + + // raise to executor dialect in order to use GraphDef converter + pm->addNestedPass( + mlir::CreateFunctionalToExecutorDialectConversionPass()); + pm->addNestedPass(mlir::CreateBreakUpIslandsPass()); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc new file mode 100644 index 00000000000..7f4b8ffae09 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc @@ -0,0 +1,105 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/translate/json_translate.h" + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" + +using mlir::ModuleOp; +using mlir::TranslateFromMLIRRegistration; +using std::string; +using tensorflow::Status; +using xla::StatusOr; + +// Translates the given MLIR module in the TFJS dialect to TFJS JSON +// format. Returns false on success. +// +bool tfjs::MlirToJSONTranslateFunction(ModuleOp module, + std::string* serialized_json) { + string json_output; + // Allow TF to treat TFJS ops as TF ops. + if (!tensorflow::AddTensorFlowOpPrefix("tfjs.").ok()) { + LOG(ERROR) << "Failed to add tfjs op prefix."; + return false; + } + tensorflow::GraphExportConfig confs; + confs.export_shapes = true; + confs.export_library = true; + tensorflow::FunctionLibraryDefinition flib_def( + tensorflow::OpRegistry::Global(), tensorflow::FunctionDefLibrary()); + absl::flat_hash_set control_ret_nodes; + auto graph = absl::make_unique(flib_def); + auto status = tensorflow::ConvertMlirToGraph(module, confs, &graph, &flib_def, + &control_ret_nodes); + if (!status.ok()) { + LOG(ERROR) << "Graph export failed: " << status; + return false; + } + auto graphdef = absl::make_unique(); + graph->ToGraphDef(graphdef.get()); + + // Replace the _Arg nodes of the main function with Placeholder op. 
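+ // (_Arg nodes are how the exporter represents the MLIR function's
+ // arguments; the downstream TensorFlow.js tooling expects graph inputs to
+ // appear as Placeholder nodes, so only the op type needs to be rewritten.)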
+ auto nodes = graphdef->mutable_node(); + for (const auto& node : llvm::enumerate(*nodes)) { + if (node.value().op() == "_Arg") { + nodes->Mutable(node.index())->set_op("Placeholder"); + } + } + + tensorflow::protobuf::util::JsonPrintOptions json_options; + json_options.add_whitespace = true; + auto jsonStatus = tensorflow::protobuf::util::MessageToJsonString( + *graphdef, &json_output, json_options); + if (!jsonStatus.ok()) { + LOG(ERROR) << "Proto2Json failed: " << status; + return false; + } + *serialized_json = std::move(json_output); + return true; +} + +static mlir::LogicalResult MlirToJSONFileTranslateFunction( + ModuleOp module, llvm::raw_ostream& output) { + std::string serialized_json; + if (!tfjs::MlirToJSONTranslateFunction(module, &serialized_json)) + return mlir::failure(); + + output << serialized_json; + return mlir::success(); +} + +static TranslateFromMLIRRegistration MLIRToJSONFileTranslate( + "mlir-to-tfjs-json", MlirToJSONFileTranslateFunction); diff --git a/tensorflow/compiler/mlir/tfjs/translate/json_translate.h b/tensorflow/compiler/mlir/tfjs/translate/json_translate.h new file mode 100644 index 00000000000..0a931f770ad --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/json_translate.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ + +#include + +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/core/lib/core/status.h" + +namespace tfjs { + +// Translates the given MLIR `module` into a JSON string. Returns true if +// translation fails, otherwise returns false. +bool MlirToJSONTranslateFunction(mlir::ModuleOp module, + std::string* serialized_json); +} // namespace tfjs + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc new file mode 100644 index 00000000000..e735a3c7b8c --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc @@ -0,0 +1,173 @@ + +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "absl/strings/str_split.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" +#include "tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h" +#include "tensorflow/compiler/mlir/tfjs/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +using llvm::cl::opt; +using mlir::MLIRContext; +using stream_executor::port::StatusOr; + +// NOLINTNEXTLINE +opt input_file_name(llvm::cl::Positional, + llvm::cl::desc(""), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +opt import_saved_model_object_graph( + "savedmodel-objectgraph-to-mlir", + llvm::cl::desc("Import a saved model to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt import_saved_model_signature_defs( + "savedmodel-signaturedefs-to-mlir", + llvm::cl::desc("Import a saved model V1 to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt saved_model_tags( + "tf-savedmodel-tags", + llvm::cl::desc("Tags used to indicate which MetaGraphDef to import, " + "separated by ','"), + llvm::cl::init("serve")); + +// NOLINTNEXTLINE +opt saved_model_exported_names( + "tf-savedmodel-exported-names", + llvm::cl::desc("Names to export from SavedModel, separated by ','. Empty " + "(the default) means export all."), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt output_file_name("o", llvm::cl::desc(""), + llvm::cl::value_desc("filename"), + llvm::cl::init("-")); +// NOLINTNEXTLINE +opt input_mlir( + "input-mlir", + llvm::cl::desc("Take input TensorFlow model in textual MLIR instead of " + "GraphDef format"), + llvm::cl::init(false), llvm::cl::Hidden); +// NOLINTNEXTLINE +opt output_mlir( + "output-mlir", + llvm::cl::desc("Output MLIR rather than JSON for the generated TFJS model"), + llvm::cl::init(false)); + +// The following approach allows injecting opdefs in addition +// to those that are already part of the global TF registry to be linked in +// prior to importing the graph. The primary goal is for support of custom ops. +// This is not intended to be a general solution for custom ops for the future +// but mainly for supporting older models like mobilenet_ssd. More appropriate +// mechanisms, such as op hints or using functions to represent composable ops +// like https://github.com/tensorflow/community/pull/113 should be encouraged +// going forward. +// NOLINTNEXTLINE +llvm::cl::list custom_opdefs( + "tf-custom-opdefs", llvm::cl::desc("List of custom opdefs when importing " + "graphdef")); + +// Debugging flag to print function mapping in the JSON. 
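+// When enabled, the tool also echoes the generated output (JSON, or MLIR
+// text when -output-mlir is set) to stdout after the output file is written.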
+// NOLINTNEXTLINE +static opt print_function_result_mapping( + "print-function-result-mapping", + llvm::cl::desc( + "Print the mapping of function result to json output buffer"), + llvm::cl::init(false)); + +enum TranslationStatus { kTrSuccess, kTrFailure }; + +static int PrintFunctionResultMapping(const std::string& result) { + std::cout << result << std::endl; + return kTrSuccess; +} + +int main(int argc, char** argv) { + tensorflow::InitMlir y(&argc, &argv); + + llvm::cl::ParseCommandLineOptions(argc, argv, + "TF GraphDef to TFJS JSON converter\n"); + + MLIRContext context; + llvm::SourceMgr source_mgr; + mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &context); + + StatusOr module; + + if (import_saved_model_object_graph || import_saved_model_signature_defs) { + if (input_mlir) + module = tensorflow::errors::InvalidArgument( + "Importing saved model should not have input_mlir set"); + module = tensorflow::ImportSavedModel( + import_saved_model_object_graph, import_saved_model_signature_defs, + custom_opdefs, input_file_name, saved_model_tags, + saved_model_exported_names, &context); + } else { + module = tensorflow::LoadFromGraphdefOrMlirSource( + input_file_name, input_mlir, custom_opdefs, debug_info_file, + input_arrays, input_dtypes, input_shapes, output_arrays, + /*prune_unused_nodes=*/true, &source_mgr, &context); + } + + // If errors occur, the library call in the above already logged the error + // message. So we can just return here. + if (!module.ok()) return kTrFailure; + + mlir::PassManager pm(&context); + + tensorflow::AddTFToTFJSConversionPasses(&pm); + + std::string result; + auto status = tensorflow::ConvertTFOpsToTfjsJSON(module.ValueOrDie().get(), + output_mlir, &result, &pm); + if (!status.ok()) return kTrFailure; + + std::string error_msg; + auto output = mlir::openOutputFile(output_file_name, &error_msg); + if (output == nullptr) { + llvm::errs() << error_msg << '\n'; + return kTrFailure; + } + output->os() << result; + output->keep(); + + // Print out debugging info related to function mapping. + if (print_function_result_mapping) return PrintFunctionResultMapping(result); + return kTrSuccess; +} diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc new file mode 100644 index 00000000000..7dc9ea049ba --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc @@ -0,0 +1,152 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tfjs/translate/json_translate.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +using mlir::MLIRContext; +using mlir::ModuleOp; +using mlir::OwningModuleRef; +using stream_executor::port::StatusOr; + +namespace { +tensorflow::Status RegisterCustomOps( + const std::vector& extra_tf_opdefs) { + for (const auto& tf_opdefs_string : extra_tf_opdefs) { + tensorflow::OpDef opdef; + if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, + &opdef)) { + LOG(ERROR) << "OpDef parsing failed for: " << tf_opdefs_string; + return errors::InvalidArgument("fail to parse extra OpDef"); + } + // Register extra opdefs. + tensorflow::OpRegistry::Global()->Register( + [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { + *op_reg_data = tensorflow::OpRegistrationData(opdef); + return Status::OK(); + }); + } + return Status::OK(); +} +} // namespace + +StatusOr LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + const std::vector& extra_tf_opdefs, + absl::string_view debug_info_file, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, bool prune_unused_nodes, + llvm::SourceMgr* source_mgr, MLIRContext* context) { + // Set up the input file. 
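+  // An input filename of "-" reads from stdin; openInputFile reports any
+  // failure through `error_message` below.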
+ std::string error_message; + auto file = mlir::openInputFile(input_filename, &error_message); + if (!file) { + llvm::errs() << error_message << "\n"; + return errors::InvalidArgument("fail to open input file"); + } + + if (input_mlir) { + source_mgr->AddNewSourceBuffer(std::move(file), llvm::SMLoc()); + return OwningModuleRef(mlir::parseSourceFile(*source_mgr, context)); + } + + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + + return tensorflow::GraphdefToMlirTranslateFunction( + file->getBuffer(), debug_info_file, input_arrays, input_dtypes, + input_shapes, output_arrays, /*control_output_arrays=*/"", + prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, + /*graph_as_function=*/false, /*upgrade_legacy=*/true, + /*enable_shape_inference=*/true, context); +} + +Status ConvertTFOpsToTfjsJSON(mlir::ModuleOp module, bool export_to_mlir, + std::string* result, + mlir::PassManager* pass_manager) { + mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), + /*propagate=*/true); + if (failed(pass_manager->run(module))) { + return statusHandler.ConsumeStatus(); + } + + if (export_to_mlir) { + llvm::raw_string_ostream os(*result); + module.print(os); + return Status::OK(); + } + + return tfjs::MlirToJSONTranslateFunction(module, result) + ? Status::OK() + : statusHandler.ConsumeStatus(); +} + +StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::vector& extra_tf_opdefs, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context) { + std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); + std::vector exported_names_in_vector = + absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); + absl::Span exported_names(exported_names_in_vector); + if (import_saved_model) { + auto module = tensorflow::SavedModelObjectGraphToMlirImport( + input_filename, tags, absl::Span(exported_names), context); + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + return module; + } else if (import_saved_model_v1) { + auto module = tensorflow::SavedModelSignatureDefsToMlirImport( + input_filename, tags, exported_names, context); + + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + return module; + } else { + return tensorflow::errors::InvalidArgument( + "Should be either saved model v1 or v2"); + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h new file mode 100644 index 00000000000..d68f0e7d46e --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/Support/SourceMgr.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +// Load a TF model from a GraphDef definition or a TF control flow dialect MLIR +// source into a MLIR module. If `input_mlir` is true, load from a MLIR source +// file; otherwise, load from a GraphDef. +// Setting prune_unused_nodes to true, would prune unreachable nodes if +// output_arrays is specified. +stream_executor::port::StatusOr +LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + const std::vector& extra_tf_opdefs, + absl::string_view debug_info_file, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, bool prune_unused_nodes, + llvm::SourceMgr* source_mgr, mlir::MLIRContext* context); + +// Load Saved model (either v1 or v2) into MLIR. +stream_executor::port::StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::vector& extra_tf_opdefs, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context); + +// Taking a MLIR module in TF executor dialect and a set of parameters, +// applies a set of passes to convert the module to TFJS dialect and +// serializes the result to JSON string. +// If `export_to_mlir` is true, the result is exported in MLIR text format, +// otherwise exported in JSON. +Status ConvertTFOpsToTfjsJSON(mlir::ModuleOp module, bool export_to_mlir, + std::string* result, + mlir::PassManager* pass_manager); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ From 88acf9fcc52b17def5f3600dcf02744cf655fec1 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 12 May 2020 11:15:46 -0700 Subject: [PATCH 0417/1533] Install the `wrapt` pip package. 
PiperOrigin-RevId: 311164427 Change-Id: Ia1b287cf2285861dbc86be2349d4c322061dbbf8 --- tensorflow/tools/ci_build/release/common.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/tools/ci_build/release/common.sh b/tensorflow/tools/ci_build/release/common.sh index a6ef52b8bea..bb40042e3af 100644 --- a/tensorflow/tools/ci_build/release/common.sh +++ b/tensorflow/tools/ci_build/release/common.sh @@ -146,6 +146,7 @@ function install_pip_deps { ${PIP_CMD} install --user --upgrade attrs ${PIP_CMD} install --user --upgrade tf-estimator-nightly ${PIP_CMD} install --user --upgrade "future>=0.17.1" + ${PIP_CMD} install --user --upgrade wrapt # LINT.ThenChange(:ubuntu_16_pip_installations) } @@ -178,6 +179,7 @@ function install_ubuntu_16_pip_deps { "${PIP_CMD}" install PyYAML==3.13 --user "${PIP_CMD}" install --user --upgrade tf-estimator-nightly "${PIP_CMD}" install --user --upgrade tb-nightly + "${PIP_CMD}" install --user --upgrade wrapt # LINT.ThenChange(:ubuntu_pip_installations) } @@ -219,6 +221,7 @@ function install_macos_pip_deps { ${SUDO_CMD} ${PIP_CMD} install --upgrade tb-nightly ${PIP_CMD} install --user --upgrade attrs ${PIP_CMD} install --user --upgrade tf-estimator-nightly + ${PIP_CMD} install --user --upgrade wrapt ${PIP_CMD} install --user --upgrade "future>=0.17.1" } From a2afd0e3588725f3839522e75e324febc9aaeaf5 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Tue, 12 May 2020 11:24:53 -0700 Subject: [PATCH 0418/1533] Refactor MLIR TF shape inference to have a context This enables reusing the partial results computed/caching the query results (ValuePortResultMap). This also reduce some arguments being passed around (else in the follow up I'd need to pass a context everywhere). Should be NFC change. PiperOrigin-RevId: 311166241 Change-Id: Icb6ea66c6c16a06d4bc9077225f1d7a783548dca --- .../tensorflow/transforms/shape_inference.cc | 292 ++++++++++-------- 1 file changed, 162 insertions(+), 130 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 41902c46b40..5a2cae38062 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -66,8 +66,7 @@ using tensorflow::shape_inference::ShapeHandle; namespace mlir { namespace TF { namespace { -Optional> InferShapeForFunctionReturnType( - FuncOp func) { +Optional> InferShapeForFunctionReturnType(FuncOp func) { // Find any return ops. 
SmallVector return_ops; for (Block& block : func) { @@ -137,9 +136,9 @@ void AddCastBackForUnsupportedNonTFUses(Operation* op, Value result, cast_op = b.create(op->getLoc(), old_type, result, /*truncate=*/b.getBoolAttr(false)); } - return mlir::Value(cast_op); + return Value(cast_op); }; - for (OpOperand& use : llvm::make_early_inc_range(result.getUses())) { + for (OpOperand& use : make_early_inc_range(result.getUses())) { if (use.getOwner()->getDialect() != tf_dialect && !IsSupportedNonTFOp(use.getOwner())) use.set(get_cast_op()); @@ -162,7 +161,7 @@ Optional GetShapeFromMlirType(Type t) { bool InferShapeForPassThroughOps(OperandRange pass_through_operands, Operation* op, Dialect* tf_dialect) { bool changed = false; - for (auto entry : llvm::zip(pass_through_operands, op->getResults())) { + for (auto entry : zip(pass_through_operands, op->getResults())) { Type operand_type = std::get<0>(entry).getType(); Value result = std::get<1>(entry); if (result.getType() == operand_type) continue; @@ -204,7 +203,7 @@ bool InferShapeForNonTFDialectOperation(Operation* op, Dialect* tf_dialect) { tf_dialect); } // TODO(b/155227679): Use OpInterface instead of hard-coding for TensorCastOp. - if (auto tensor_cast = dyn_cast(op)) { + if (auto tensor_cast = dyn_cast(op)) { return InferShapeForPassThroughOps( tensor_cast.getOperation()->getOperands(), op, tf_dialect); } @@ -254,7 +253,7 @@ GetSubtypes(Type type) { // match the i-th operand type). Returns true if anything is changed. bool PassThroughOperandTypes(OperandRange operands, ResultRange results) { bool changed = false; - for (auto entry : llvm::zip(operands, results)) { + for (auto entry : zip(operands, results)) { Type operand_type = std::get<0>(entry).getType(); Type result_type = std::get<1>(entry).getType(); if (operand_type == result_type) continue; @@ -291,14 +290,13 @@ bool InferShapeForCall(Operation* op) { CallInterfaceCallable callable = call_op.getCallableForCallee(); SymbolRefAttr sym = callable.dyn_cast(); if (!sym) return false; - FuncOp func = - dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); + FuncOp func = dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); if (!func) return false; bool changed = false; // Map each of the results of the call to the returned type of the // function. - for (auto result : llvm::zip(op->getResults(), func.getType().getResults())) { + for (auto result : zip(op->getResults(), func.getType().getResults())) { if (std::get<0>(result).getType() == std::get<1>(result)) continue; // Skip already statically shaped results. if (!CanBeRefined(std::get<0>(result).getType())) continue; @@ -335,7 +333,7 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, // Map each of the results of the call to the returned type of the // function. bool changed = false; - for (auto result : llvm::zip(op->getResults(), inferred)) { + for (auto result : zip(op->getResults(), inferred)) { if (std::get<0>(result).getType() == std::get<1>(result)) continue; // Inserts a cast back to the original type if any user is not in the @@ -356,7 +354,7 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, // so for tf.Const -> tensor<10x20xf32>, [0,2,18] would point to a unique output // scalar value). 
struct ValuePort { - llvm::PointerUnion producer; + PointerUnion producer; SmallVector port; bool operator==(const ValuePort& other) const { @@ -374,39 +372,38 @@ struct ValuePort { port = {0}; } } - ValuePort(llvm::PointerUnion producer, + ValuePort(PointerUnion producer, SmallVector port) : producer(producer), port(port) {} - llvm::raw_ostream& print(llvm::raw_ostream& os) const { + raw_ostream& print(raw_ostream& os) const { if (auto* op = producer.dyn_cast()) os << "op " << op->getName(); if (auto ba = producer.dyn_cast()) os << "block_arg " << ba.getArgNumber(); - os << llvm::formatv(" [{0}]", llvm::make_range(port.begin(), port.end())); + os << formatv(" [{0}]", llvm::make_range(port.begin(), port.end())); return os; } }; struct ValuePortHasher { std::size_t operator()(const ValuePort& other) const { - return llvm::hash_combine( - llvm::hash_value(other.producer.getOpaqueValue()), - llvm::hash_value(ArrayRef(other.port))); + return hash_combine(llvm::hash_value(other.producer.getOpaqueValue()), + hash_value(ArrayRef(other.port))); } }; using ValuePortResultMap = std::unordered_map; -using ComputedQueryFn = llvm::function_ref; -using ValueQueryFn = llvm::function_ref; -using ValuePortInputs = llvm::SmallVectorImpl; +using ComputedQueryFn = function_ref; +using ValueQueryFn = function_ref; +using ValuePortInputs = SmallVectorImpl; -// TODO(jpienaar): InputsRequiredForOutput and ComputeOutputComponent are +// TODO(jpienaar): ComputeInputsRequiredForOutput and ComputeOutputComponent are // intended to be switched to op interfaces once more refined. -LogicalResult InputsRequiredForOutput(ValuePort value_port, - ComputedQueryFn has_been_computed, - ValuePortInputs* inputs) { +LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, + ComputedQueryFn has_been_computed, + ValuePortInputs* inputs) { auto op = value_port.producer.dyn_cast(); auto& port = value_port.port; if (!op) return failure(); @@ -460,26 +457,94 @@ Attribute ComputeOutputComponent(const ValuePort& value_port, return nullptr; } -ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { +// Context used during ShapeInference. This class contains common information +// that is required by the individual shape inference helper functions (e.g., +// TF Graph version, constant values computed, etc.) +class ShapeInference { + public: + ShapeInference(int64_t graph_version, MLIRContext* context); + + LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, + ValuePortInputs* inputs) { + return ::mlir::TF::ComputeInputsRequiredForOutput( + value_port, + [this](const ValuePort& port) { + return results_.find(port) != results_.end(); + }, + inputs); + } + + Attribute ComputeOutputComponent(const ValuePort& value_port) { + return ::mlir::TF::ComputeOutputComponent( + value_port, [this](const ValuePort& port) { return results_[port]; }); + } + + // Returns ShapeHandle if the op result could be computed as shape. + ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic); + + void RecordValue(const ValuePort& value_port, Attribute value) { + results_[value_port] = value; + } + + // Performs shape inference on the provided op and return true if the type of + // at least one result has been changed. + // A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. + // `graph_version` indicates the current GraphDef compatibility versions + // (the versions field in graph.proto). 
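+  // (The version is captured once at construction time as graph_version_
+  // and reused for every InferenceContext built during inference.)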
+ bool InferShapeForSingleOperation(Operation* op); + + // Infers shape on the provided region, including nested ones, iterate until + // fix point with a limit of max_iteration. Returns success if fix point is + // reached before max_iteration. + LogicalResult InferShapeUntilFixPoint(Region* region, + int64_t max_iteration = 10); + + // Updates input types and refine shapes inside body of functions that are + // attached to ControlFlow ops (If/While). These functions include Then/Else + // branches of IfOp and Cond/Body functions of WhileOp. These functions share + // following common properties: + // 1) They are never reused, ie. having a single use in module. + // 2) Their input types match those of their parent ops (excluding inputs + // like predicate). + // Returns a boolean indicating whether any change has been applied. + LogicalResult RefineShapeForControlFlowFunc(FuncOp func, + ArrayRef input_types, + int64_t max_iteration); + + // Propagate the shapes to the functions named. + LogicalResult PropagateShapeToFunctions( + ModuleOp module, Operation::operand_type_range input_types, + ArrayRef func_names, int64_t max_iteration); + + // Shape propagation for call/control flow ops. + LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, + int64_t max_iteration); + + private: + // Mapping between ValuePort (which corresponds to an OpResult or smaller, + // e.g., first element of OpResult produded) to an Attribute if the ValuePort + // corresponds to a constant value. + ValuePortResultMap results_; + int64_t graph_version_; + MLIRContext* context_; + Dialect* tf_dialect_; +}; + +ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context) + : graph_version_(graph_version) { + context_ = context; + tf_dialect_ = context->getRegisteredDialect(); +} + +ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, + InferenceContext* ic) { LLVM_DEBUG(result.print(llvm::dbgs() << "\nEvaluate partially ")); auto rt = result.getType().dyn_cast(); if (!rt || !rt.hasStaticShape() || rt.getRank() != 1) return {}; int dim_size = rt.getDimSize(0); // Worklist to direct partial evaluation. - llvm::SmallVector worklist; - // The ValuePort evaluated results. - // TODO(jpienaar): This could be cached across invocations (e.g., part of some - // inference context). - ValuePortResultMap evaluated; - // Returns whether a ValuePort has been previously computed. - auto has_been_computed = [&evaluated](const ValuePort& port) { - return evaluated.find(port) != evaluated.end(); - }; - // Returns previously computed ValuePort value. - auto values = [&evaluated](const ValuePort& port) -> Attribute { - return evaluated[port]; - }; + SmallVector worklist; // Simple evaluator that attempts to partially evaluate the input value even // if unable to evaluate the complete output. Below follows a simple stack @@ -498,7 +563,7 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { LLVM_DEBUG(front.print(llvm::errs() << "\nWorklist front ")); SmallVector inputs; - auto res = InputsRequiredForOutput(front, has_been_computed, &inputs); + auto res = ComputeInputsRequiredForOutput(front, &inputs); if (failed(res)) { // Abort if unable to find which required inputs need to be computed. 
worklist.clear(); @@ -513,16 +578,16 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { continue; } - auto ret = ComputeOutputComponent(front, values); + auto ret = ComputeOutputComponent(front); if (!ret) continue; - evaluated[front] = ret; + RecordValue(front, ret); LLVM_DEBUG(ret.print(llvm::dbgs() << "\ncomputed result = ")); // If worklist is empty, then this is the root query op. if (worklist.empty()) { LLVM_DEBUG(llvm::dbgs() << "[root node]\n"); - if (auto dea = ret.dyn_cast()) { + if (auto dea = ret.dyn_cast()) { if (dea.getNumElements() != 1) { LLVM_DEBUG(llvm::errs() << "Unexpected number of elements\n"); return {}; @@ -536,14 +601,8 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { return ic->MakeShape(dims); } -// Performs shape inference on the provided op and return true if the type of -// at least one result has been changed. -// A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. -// `graph_version` indicates the current GraphDef compatibility versions -// (the versions field in graph.proto). -bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, - int64_t graph_version) { - assert(tf_dialect == op->getDialect()); +bool ShapeInference::InferShapeForSingleOperation(Operation* op) { + assert(tf_dialect_ == op->getDialect()); // The shape function of these ops sometimes does not propagate subtypes // (handle shapes) for resource and variant types. We use a simple passthrough // to make sure they are preserved in the output. @@ -555,7 +614,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // If no result for this op needs shape inference, we have a fast-path return. // But if the type is a resource/variant, we do not skip it because we might // not have the handle shapes. - if (llvm::none_of(op->getResultTypes(), CanBeRefined)) { + if (none_of(op->getResultTypes(), CanBeRefined)) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for statically shaped op '" << op->getName() << "'.\n"); return false; @@ -570,8 +629,8 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // This is necessary to avoid reprocessing the tf.Cast that are inserted at // the end of this function. if (isa(op) && - llvm::all_of(op->getResult(0).getUsers(), [&](Operation* user) { - return user->getDialect() != tf_dialect; + all_of(op->getResult(0).getUsers(), [&](Operation* user) { + return user->getDialect() != tf_dialect_; })) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for tf.Cast with no TF " "dialect operation users '" @@ -651,7 +710,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // Perform the shape inference using an InferenceContext with the input // shapes. This object is abstracting the information that the ShapeInference // function operates on. - InferenceContext c(graph_version, *node_def, op_reg_data->op_def, + InferenceContext c(graph_version_, *node_def, op_reg_data->op_def, input_shapes, input_tensors, /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); auto status = c.Run(op_reg_data->shape_inference_fn); @@ -664,7 +723,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // Determine if, during shape computation, the shape functions attempted to // query an input operand as shape where the input was not known/constant. 
bool requires_inputs = - llvm::any_of(llvm::seq(0, c.num_inputs()), [&](int input) { + any_of(llvm::seq(0, c.num_inputs()), [&](int input) { return c.requested_input_tensor_as_partial_shape(input) && !input_tensors[input]; }); @@ -728,7 +787,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, new_element_type.isa()) { auto handle_shapes_types = c.output_handle_shapes_and_types(output); if (handle_shapes_types) { - llvm::SmallVector subtypes; + SmallVector subtypes; OpBuilder b(op); for (const auto& shape_n_type : *handle_shapes_types) { Type element_type; @@ -748,7 +807,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, if (result.getType() == new_type) continue; // Inserts a cast back to the original type if any user is not in the TF // dialect. - AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect, + AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect_, result.getType()); // Finally we inferred the shape and replace the type for this result. result.setType(new_type); @@ -760,29 +819,13 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, return changed; } -// Infers shape on the provided region, including nested ones, iterate until fix -// point with a limit of max_iteration. Returns success if fix point is reached -// before max_iteration. -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration = 10); - -// Updates input types and refine shapes inside body of functions that are -// attached to ControlFlow ops (If/While). These functions include Then/Else -// branches of IfOp and Cond/Body functions of WhileOp. These functions share -// following common properties: -// 1) They are never reused, ie. having a single use in module. -// 2) Their input types match those of their parent ops (excluding inputs like -// predicate). -// Returns a boolean indicating whether any change has been applied. 
-LogicalResult RefineShapeForControlFlowFunc(FuncOp func, - llvm::ArrayRef input_types, - int64_t graph_version, - int64_t max_iteration) { +LogicalResult ShapeInference::RefineShapeForControlFlowFunc( + FuncOp func, ArrayRef input_types, int64_t max_iteration) { ModuleOp module = func.getParentOfType(); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); if (num_uses != 1) { - func.emitWarning(llvm::formatv( + func.emitWarning(formatv( "expected control flow function {0} to have exactly 1 use, found {1}.", func.getName(), num_uses)); return failure(); @@ -796,8 +839,7 @@ LogicalResult RefineShapeForControlFlowFunc(FuncOp func, arg_and_idx.value().setType(input_types[arg_and_idx.index()]); } - auto res = - InferShapeUntilFixPoint(&func.getBody(), graph_version, max_iteration); + auto res = InferShapeUntilFixPoint(&func.getBody(), max_iteration); if (failed(res)) return res; auto new_return_types = InferShapeForFunctionReturnType(func); @@ -809,20 +851,18 @@ LogicalResult RefineShapeForControlFlowFunc(FuncOp func, return success(); } -LogicalResult PropagateShapeToFunctions( +LogicalResult ShapeInference::PropagateShapeToFunctions( ModuleOp module, Operation::operand_type_range input_types, - llvm::ArrayRef func_names, int64_t graph_version, - int64_t max_iteration) { - bool success = true; + ArrayRef func_names, int64_t max_iteration) { + bool all_succeeded = true; auto types = llvm::to_vector<4>(input_types); for (auto func_name : func_names) { FuncOp func = module.lookupSymbol(func_name); - if (failed(RefineShapeForControlFlowFunc(func, types, graph_version, - max_iteration))) { - success = false; - } + all_succeeded = + succeeded(RefineShapeForControlFlowFunc(func, types, max_iteration)) && + all_succeeded; } - return mlir::success(success); + return success(all_succeeded); } // If the callee has only one use, propagates any constant operand of call_op to @@ -842,7 +882,7 @@ void PropagateConstantToCallee(CallOpInterface call_op, // the constant inside the function. 
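  // Cloning the constant (rather than moving it) leaves the original
  // operand of the call available to any other users it may have.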
for (auto arg : func.getArguments()) { auto operand = op->getOperand(arg.getArgNumber()).getDefiningOp(); - if (llvm::isa_and_nonnull(operand)) { + if (isa_and_nonnull(operand)) { arg.replaceAllUsesWith(builder.clone(*operand)->getResult(0)); } } @@ -861,33 +901,31 @@ void PropagateConstantFromCallee(CallOpInterface call_op, for (auto retval : llvm::enumerate(func.front().getTerminator()->getOperands())) { auto retval_op = retval.value().getDefiningOp(); - if (llvm::isa_and_nonnull(retval_op)) { + if (isa_and_nonnull(retval_op)) { op->getResult(retval.index()) .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); } } } -LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, - int64_t graph_version, - int64_t max_iteration) { +LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( + Operation* op, int64_t max_iteration) { ModuleOp module = op->getParentOfType(); if (auto if_op = dyn_cast(op)) { return PropagateShapeToFunctions( - module, llvm::drop_begin(if_op.getOperandTypes(), 1), - {if_op.then_branch(), if_op.else_branch()}, graph_version, - max_iteration); + module, drop_begin(if_op.getOperandTypes(), 1), + {if_op.then_branch(), if_op.else_branch()}, max_iteration); } else if (auto while_op = dyn_cast(op)) { return PropagateShapeToFunctions(module, while_op.getOperandTypes(), {while_op.cond(), while_op.body()}, - graph_version, max_iteration); + max_iteration); } else if (auto call_op = dyn_cast(op)) { CallInterfaceCallable callable = call_op.getCallableForCallee(); if (SymbolRefAttr sym = callable.dyn_cast()) { PropagateConstantToCallee(call_op, sym, module); if (failed(PropagateShapeToFunctions( module, call_op.getArgOperands().getTypes(), - {sym.getRootReference()}, graph_version, max_iteration))) { + {sym.getRootReference()}, max_iteration))) { return failure(); } PropagateConstantFromCallee(call_op, sym, module); @@ -900,13 +938,10 @@ LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, return success(); } -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration) { - MLIRContext* ctx = region->getContext(); - Dialect* tf_dialect = ctx->getRegisteredDialect(); - - // An operation folder that is used to attempt folding before inference. - OperationFolder folder(ctx); +LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, + int64_t max_iteration) { + // An operation folder that is used to attempt folding before inference._ + OperationFolder folder(context_); bool changed = true; // TODO(aminim): we could have a more efficient traversal by guiding the @@ -919,14 +954,14 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, << "Shape inference, iteration " << iteration << "\n"); region->walk([&](Operation* op) { if (auto infer_ti = dyn_cast(op)) { - changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect); + changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect_); // TODO(jpienaar): Debug why we can't just return here. We end up with // additional constant due to the propagation of constant into attached // function if we return already. } - if (op->getDialect() != tf_dialect) { - changed |= InferShapeForNonTFDialectOperation(op, tf_dialect); + if (op->getDialect() != tf_dialect_) { + changed |= InferShapeForNonTFDialectOperation(op, tf_dialect_); return; } @@ -935,13 +970,12 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, // Best-effort shape inference in attached functions. 
Do not return // failure even if it doesn't get to fixed point. - if (failed(PropagateShapeIntoAttachedFunctions(op, graph_version, - max_iteration))) { + if (failed(PropagateShapeIntoAttachedFunctions(op, max_iteration))) { op->emitWarning() << "unable to refine shape of attached function " "arguments and bodies"; } - changed |= InferShapeForSingleOperation(op, tf_dialect, graph_version); + changed |= InferShapeForSingleOperation(op); }); } @@ -956,44 +990,43 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, LogicalResult InferShapeForFunction(FuncOp func, ArrayRef> arg_shapes, int64_t graph_version) { + ShapeInference context(graph_version, func.getContext()); if (arg_shapes.empty()) { - if (failed(InferShapeUntilFixPoint(&func.getBody(), graph_version))) + if (failed(context.InferShapeUntilFixPoint(&func.getBody()))) return failure(); // TODO(b/156276510): Verify that it is always fine to refine a function's // return type, as long as we do not change the argument shapes. if (auto return_types = InferShapeForFunctionReturnType(func)) { - func.setType(mlir::FunctionType::get(func.getType().getInputs(), - return_types.getValue(), - func.getContext())); + func.setType(FunctionType::get(func.getType().getInputs(), + return_types.getValue(), + func.getContext())); } return success(); } - mlir::FunctionType func_type = func.getType(); + FunctionType func_type = func.getType(); bool needs_refinement = false; - llvm::SmallVector new_arg_types; + SmallVector new_arg_types; new_arg_types.reserve(func_type.getNumInputs()); // Update argument types in-place using the provided arg_shapes. for (size_t i = 0; i < func_type.getNumInputs(); ++i) { ArrayRef shape = arg_shapes[i]; - mlir::Type element_type; - if (auto input_ty = - func_type.getInput(i).dyn_cast()) { + Type element_type; + if (auto input_ty = func_type.getInput(i).dyn_cast()) { if (!input_ty || input_ty.getShape().size() != shape.size()) { return failure(); } element_type = input_ty.getElementType(); } else { - auto unranked_input_ty = - func_type.getInput(i).dyn_cast(); + auto unranked_input_ty = func_type.getInput(i).dyn_cast(); if (!unranked_input_ty) { return failure(); } element_type = unranked_input_ty.getElementType(); } - auto new_arg_type = mlir::RankedTensorType::get(shape, element_type); + auto new_arg_type = RankedTensorType::get(shape, element_type); if (new_arg_type != func_type.getInput(i)) { // If the new type is more detailed, trigger shape inference. func.getArgument(i).setType(new_arg_type); @@ -1006,18 +1039,17 @@ LogicalResult InferShapeForFunction(FuncOp func, return success(); } - mlir::LogicalResult result = - mlir::TF::InferShapeUntilFixPoint(&func.getBody(), graph_version); + LogicalResult result = context.InferShapeUntilFixPoint(&func.getBody()); if (failed(result)) { return failure(); } auto return_types = InferShapeForFunctionReturnType(func); - func.setType(mlir::FunctionType::get(new_arg_types, - return_types.hasValue() - ? return_types.getValue() - : func.getType().getResults(), - func.getContext())); + func.setType(FunctionType::get(new_arg_types, + return_types.hasValue() + ? 
return_types.getValue() + : func.getType().getResults(), + func.getContext())); return success(); } From 6e2654d882563116c2965215818b59c3abc8cc23 Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Tue, 12 May 2020 21:35:27 +0300 Subject: [PATCH 0419/1533] Removed named section pragmas from shared example code --- .../examples/person_detection_experimental/main_functions.cc | 2 -- .../person_detection_experimental/person_detection_test.cc | 2 -- 2 files changed, 4 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc index 552b52c9c51..719f16b2d36 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc @@ -42,9 +42,7 @@ TfLiteTensor* input = nullptr; // An area of memory to use for input, output, and intermediate arrays. constexpr int kTensorArenaSize = 125 * 1024; -#pragma Bss(".tensor_arena") static uint8_t tensor_arena[kTensorArenaSize]; -#pragma Bss() } // namespace // The name of this function is important for Arduino compatibility. diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index 9c7212648cc..b0979735d4f 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -28,9 +28,7 @@ limitations under the License. // Create an area of memory to use for input, output, and intermediate arrays. constexpr int tensor_arena_size = 125 * 1024; -#pragma Bss(".tensor_arena") uint8_t tensor_arena[tensor_arena_size]; -#pragma Bss() TF_LITE_MICRO_TESTS_BEGIN From 563a8a5ce0f9583ccfcbda97a9b0c9fd8d3620d0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 11:32:47 -0700 Subject: [PATCH 0420/1533] Add bz2-devel so python will be compiled with bz2 support. PiperOrigin-RevId: 311167882 Change-Id: Ideeb21ae9bd8507d0e2cad4c95d4e81fb0d344fa --- ...rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython index 2e520f62cde..9c85091563e 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython @@ -59,11 +59,12 @@ RUN /install/install_deb_packages.sh # - dependencies to build Python from source # - patchelf, as it is required by auditwheel RUN apt-get update && apt-get install -y \ - libncurses5-dev \ + libbz2-dev \ + libffi-dev \ libgdbm-dev \ + libncurses5-dev \ libnss3-dev \ libreadline-dev \ - libffi-dev \ patchelf \ && \ rm -rf /var/lib/apt/lists/* From b730a73909790d08fbfbf8977e77ab5b57d2d2e6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 11:44:13 -0700 Subject: [PATCH 0421/1533] Update manylinux docker image to latest hash. 
PiperOrigin-RevId: 311170200
Change-Id: Icebc7ba48fc5de8b9a638d39ecc87e6a47140e08
---
 third_party/toolchains/preconfig/generate/containers.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
index b1d0389a16d..9be398f5f2d 100644
--- a/third_party/toolchains/preconfig/generate/containers.bzl
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -9,7 +9,7 @@ container_digests = {
 "cuda10.1-cudnn7-centos6": "sha256:454b899657e87893ee5e68dc0f87df59b6a0a7418ae09cafcc3dd65ac71feca9",
 "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432",
 "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:cc7f760195d7bbe283b45ae740409751d0b74d8ffbdc2f7a3cb62c71a71fbe25",
- "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:c460570b88eab3da92f06fdf30098d89be4de0f3b010ee3d39086f4d000dd3b8",
+ "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:13aa5e700bb609521cd4365d4152d7d8f4118cae7ce174ce7d54cc529e21766a",
 "rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe",
 "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12",
 }

From 1712a14d011035b61cdce1c578646f557ef422da Mon Sep 17 00:00:00 2001
From: Shanqing Cai
Date: Tue, 12 May 2020 11:53:10 -0700
Subject: [PATCH 0422/1533] [tfdbg2] Ensure initialization on DebugEventsWriter.WriteGraphExecutionTrace()

Background:
- When a TF Graph that contains tfdbg2's `DebugIdentityV2` ops is transferred to a remote server (tensorflow_std_server) and executed remotely, the `DebugEventsWriter.Init()` method is not called beforehand. This is different from the case where the Graph is executed on the localhost, where the `Init()` method is called from the Python binding when the `tf.debugging.experimental.enable_dump_debug_info()` API is called.
- This can cause the remotely-executing Graph to fail to write debug logs.
- This CL corrects that by calling `Init()` from the `WriteGraphExecutionTrace()` method (i.e., the method used by the `DebugIdentityV2` op).
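The fix relies on `Init()` being cheap and safe to call on every trace write. A minimal sketch of that idempotence guard is below; the mutex and flag member names are assumed rather than taken from this patch, and the metadata-file setup that `Init()` also performs is elided:

  Status DebugEventsWriter::Init() {
    mutex_lock lock(initialization_mutex_);  // assumed member name
    // After the first successful call, later calls (one per DebugIdentityV2
    // execution) return immediately.
    if (is_initialized_) return Status::OK();  // assumed member name
    TF_RETURN_IF_ERROR(InitNonMetadataFile(GRAPH_EXECUTION_TRACES));
    is_initialized_ = true;
    return Status::OK();
  }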
PiperOrigin-RevId: 311171858 Change-Id: I0726ff363a991b1a9edb8b3d824b09374100d338 --- tensorflow/core/kernels/debug_ops.h | 6 +- tensorflow/core/util/debug_events_writer.cc | 50 ++++++++------- tensorflow/core/util/debug_events_writer.h | 26 ++++---- .../core/util/debug_events_writer_test.cc | 63 +++++++++++++------ 4 files changed, 88 insertions(+), 57 deletions(-) diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 00356778026..42364e416ea 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -435,9 +435,9 @@ class DebugIdentityV2Op : public OpKernel { for (const string& dump_root : dump_roots_) { tfdbg::DebugEventsWriter* debug_events_writer = tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root); - debug_events_writer->WriteGraphExecutionTrace( - tfdbg_context_id_, device_name_, op_name_, output_slot_, - tensor_debug_mode_, tensor); + OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace( + tfdbg_context_id_, device_name_, op_name_, + output_slot_, tensor_debug_mode_, tensor)); } context->set_output(0, tensor); } diff --git a/tensorflow/core/util/debug_events_writer.cc b/tensorflow/core/util/debug_events_writer.cc index 595f92d07c0..d9c3393ce3c 100644 --- a/tensorflow/core/util/debug_events_writer.cc +++ b/tensorflow/core/util/debug_events_writer.cc @@ -179,7 +179,7 @@ Status DebugEventsWriter::Init() { metadata->set_tensorflow_version(TF_VERSION_STRING); metadata->set_file_version( strings::Printf("%s%d", kVersionPrefix, kCurrentFormatVersion)); - SerializeAndWriteDebugEvent(&debug_event, METADATA); + TF_RETURN_IF_ERROR(SerializeAndWriteDebugEvent(&debug_event, METADATA)); TF_RETURN_WITH_CONTEXT_IF_ERROR( metadata_writer_->Flush(), "Failed to flush debug event metadata writer"); @@ -189,38 +189,38 @@ Status DebugEventsWriter::Init() { return Status::OK(); } -void DebugEventsWriter::WriteSourceFile(SourceFile* source_file) { +Status DebugEventsWriter::WriteSourceFile(SourceFile* source_file) { DebugEvent debug_event; debug_event.set_allocated_source_file(source_file); - SerializeAndWriteDebugEvent(&debug_event, SOURCE_FILES); + return SerializeAndWriteDebugEvent(&debug_event, SOURCE_FILES); } -void DebugEventsWriter::WriteStackFrameWithId( +Status DebugEventsWriter::WriteStackFrameWithId( StackFrameWithId* stack_frame_with_id) { DebugEvent debug_event; debug_event.set_allocated_stack_frame_with_id(stack_frame_with_id); - SerializeAndWriteDebugEvent(&debug_event, STACK_FRAMES); + return SerializeAndWriteDebugEvent(&debug_event, STACK_FRAMES); } -void DebugEventsWriter::WriteGraphOpCreation( +Status DebugEventsWriter::WriteGraphOpCreation( GraphOpCreation* graph_op_creation) { DebugEvent debug_event; debug_event.set_allocated_graph_op_creation(graph_op_creation); - SerializeAndWriteDebugEvent(&debug_event, GRAPHS); + return SerializeAndWriteDebugEvent(&debug_event, GRAPHS); } -void DebugEventsWriter::WriteDebuggedGraph(DebuggedGraph* debugged_graph) { +Status DebugEventsWriter::WriteDebuggedGraph(DebuggedGraph* debugged_graph) { DebugEvent debug_event; debug_event.set_allocated_debugged_graph(debugged_graph); - SerializeAndWriteDebugEvent(&debug_event, GRAPHS); + return SerializeAndWriteDebugEvent(&debug_event, GRAPHS); } -void DebugEventsWriter::WriteExecution(Execution* execution) { +Status DebugEventsWriter::WriteExecution(Execution* execution) { if (circular_buffer_size_ <= 0) { // No cyclic-buffer behavior. 
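    // The event is serialized and appended to the *.execution file right
    // away; nothing is held in memory.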
DebugEvent debug_event; debug_event.set_allocated_execution(execution); - SerializeAndWriteDebugEvent(&debug_event, EXECUTION); + return SerializeAndWriteDebugEvent(&debug_event, EXECUTION); } else { // Circular buffer behavior. DebugEvent debug_event; @@ -234,16 +234,18 @@ void DebugEventsWriter::WriteExecution(Execution* execution) { if (execution_buffer_.size() > circular_buffer_size_) { execution_buffer_.pop_front(); } + return Status::OK(); } } -void DebugEventsWriter::WriteGraphExecutionTrace( +Status DebugEventsWriter::WriteGraphExecutionTrace( GraphExecutionTrace* graph_execution_trace) { + TF_RETURN_IF_ERROR(Init()); if (circular_buffer_size_ <= 0) { // No cyclic-buffer behavior. DebugEvent debug_event; debug_event.set_allocated_graph_execution_trace(graph_execution_trace); - SerializeAndWriteDebugEvent(&debug_event, GRAPH_EXECUTION_TRACES); + return SerializeAndWriteDebugEvent(&debug_event, GRAPH_EXECUTION_TRACES); } else { // Circular buffer behavior. DebugEvent debug_event; @@ -257,15 +259,14 @@ void DebugEventsWriter::WriteGraphExecutionTrace( if (graph_execution_trace_buffer_.size() > circular_buffer_size_) { graph_execution_trace_buffer_.pop_front(); } + return Status::OK(); } } -void DebugEventsWriter::WriteGraphExecutionTrace(const string& tfdbg_context_id, - const string& device_name, - const string& op_name, - int32 output_slot, - int32 tensor_debug_mode, - const Tensor& tensor_value) { +Status DebugEventsWriter::WriteGraphExecutionTrace( + const string& tfdbg_context_id, const string& device_name, + const string& op_name, int32 output_slot, int32 tensor_debug_mode, + const Tensor& tensor_value) { std::unique_ptr trace(new GraphExecutionTrace()); trace->set_tfdbg_context_id(tfdbg_context_id); if (!op_name.empty()) { @@ -279,7 +280,7 @@ void DebugEventsWriter::WriteGraphExecutionTrace(const string& tfdbg_context_id, } trace->set_device_name(device_name); tensor_value.AsProtoTensorContent(trace->mutable_tensor_proto()); - WriteGraphExecutionTrace(trace.release()); + return WriteGraphExecutionTrace(trace.release()); } void DebugEventsWriter::WriteSerializedNonExecutionDebugEvent( @@ -487,8 +488,8 @@ Status DebugEventsWriter::InitNonMetadataFile(DebugEventFileType type) { return Status::OK(); } -void DebugEventsWriter::SerializeAndWriteDebugEvent(DebugEvent* debug_event, - DebugEventFileType type) { +Status DebugEventsWriter::SerializeAndWriteDebugEvent(DebugEvent* debug_event, + DebugEventFileType type) { std::unique_ptr* writer = nullptr; SelectWriter(type, &writer); if (writer != nullptr) { @@ -497,6 +498,11 @@ void DebugEventsWriter::SerializeAndWriteDebugEvent(DebugEvent* debug_event, string str; debug_event->AppendToString(&str); (*writer)->WriteSerializedDebugEvent(str); + return Status::OK(); + } else { + return errors::Internal( + "Unable to find debug events file writer for DebugEventsFileType ", + type); } } diff --git a/tensorflow/core/util/debug_events_writer.h b/tensorflow/core/util/debug_events_writer.h index 6d219d7c9ef..39835adf1a6 100644 --- a/tensorflow/core/util/debug_events_writer.h +++ b/tensorflow/core/util/debug_events_writer.h @@ -119,27 +119,27 @@ class DebugEventsWriter { // The four DebugEvent fields below are written _without_ the circular buffer. // Source file contents are written to the *.source_files file. // Takes ownership of source_file. - void WriteSourceFile(SourceFile* source_file); + Status WriteSourceFile(SourceFile* source_file); // Stack frames are written to the *.code_locations file. // Takes ownership of stack_frame_with_id. 
- void WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id); + Status WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id); // Graph op creation events are written to the *.graphs file. // Takes ownership of graph_op_creation. - void WriteGraphOpCreation(GraphOpCreation* graph_op_creation); + Status WriteGraphOpCreation(GraphOpCreation* graph_op_creation); // Debugged graphs are written to the *.graphs file. // Takes ownership of debugged_graph. - void WriteDebuggedGraph(DebuggedGraph* debugged_graph); + Status WriteDebuggedGraph(DebuggedGraph* debugged_graph); // The two DebugEvent fields below are written to the circular buffer // and saved to disk only at the FlushExecutionFiles() call. // Execution events (eager execution of an op or a tf.function) are written to // the *.execution file. // Takes ownership of execution. - void WriteExecution(Execution* execution); + Status WriteExecution(Execution* execution); // Graph execution traces (graph-internal tensor values or their summaries) // are written to the *.graph_execution_traces file. // Takes ownership of graph_execution_trace. - void WriteGraphExecutionTrace(GraphExecutionTrace* graph_execution_trace); + Status WriteGraphExecutionTrace(GraphExecutionTrace* graph_execution_trace); // Write a graph execution trace without using a protocol buffer. // Instead, pass the raw values related to the graph execution trace. @@ -155,11 +155,11 @@ class DebugEventsWriter { // tensor_value: The value of the tensor that describes the tensor(s) // that this trace is concerned with. The semantics of this tensor value // depends on the value of `tensor_debug_mode`. - void WriteGraphExecutionTrace(const string& tfdbg_context_id, - const string& device_name, - const string& op_name, int32 output_slot, - int32 tensor_debug_mode, - const Tensor& tensor_value); + Status WriteGraphExecutionTrace(const string& tfdbg_context_id, + const string& device_name, + const string& op_name, int32 output_slot, + int32 tensor_debug_mode, + const Tensor& tensor_value); // Writes a serialized DebugEvent to one of the debug-events files // concerned with the non-execution events: the SOURCE_FILES, STACK_FRAMES @@ -217,8 +217,8 @@ class DebugEventsWriter { // Initialize the TFRecord writer for non-metadata file type. 
Status InitNonMetadataFile(DebugEventFileType type); - void SerializeAndWriteDebugEvent(DebugEvent* debug_event, - DebugEventFileType type); + Status SerializeAndWriteDebugEvent(DebugEvent* debug_event, + DebugEventFileType type); void SelectWriter(DebugEventFileType type, std::unique_ptr** writer); diff --git a/tensorflow/core/util/debug_events_writer_test.cc b/tensorflow/core/util/debug_events_writer_test.cc index 66cde55864b..bd0c731bc90 100644 --- a/tensorflow/core/util/debug_events_writer_test.cc +++ b/tensorflow/core/util/debug_events_writer_test.cc @@ -263,7 +263,7 @@ TEST_F(DebugEventsWriterTest, WriteSourceFile) { source_file_1->add_lines(""); source_file_1->add_lines("print(tf.constant([42.0]))"); source_file_1->add_lines(""); - writer->WriteSourceFile(source_file_1); + TF_ASSERT_OK(writer->WriteSourceFile(source_file_1)); SourceFile* source_file_2 = new SourceFile(); source_file_2->set_file_path("/home/tf_programs/train.py"); @@ -271,7 +271,7 @@ TEST_F(DebugEventsWriterTest, WriteSourceFile) { source_file_2->add_lines("import tensorflow.keras as keras"); source_file_2->add_lines(""); source_file_2->add_lines("model = keras.Sequential()"); - writer->WriteSourceFile(source_file_2); + TF_ASSERT_OK(writer->WriteSourceFile(source_file_2)); TF_ASSERT_OK(writer->FlushNonExecutionFiles()); TF_ASSERT_OK(writer->Close()); @@ -336,8 +336,8 @@ TEST_F(DebugEventsWriterTest, WriteStackFramesFile) { file_line_col->set_func("my_func"); file_line_col->set_code(" x = x ** 2.0"); - writer->WriteStackFrameWithId(stack_frame_1); - writer->WriteStackFrameWithId(stack_frame_2); + TF_ASSERT_OK(writer->WriteStackFrameWithId(stack_frame_1)); + TF_ASSERT_OK(writer->WriteStackFrameWithId(stack_frame_2)); TF_ASSERT_OK(writer->FlushNonExecutionFiles()); TF_ASSERT_OK(writer->Close()); @@ -382,12 +382,12 @@ TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) { GraphOpCreation* graph_op_creation = new GraphOpCreation(); graph_op_creation->set_op_type("MatMul"); graph_op_creation->set_op_name("Dense_1/MatMul"); - writer->WriteGraphOpCreation(graph_op_creation); + TF_ASSERT_OK(writer->WriteGraphOpCreation(graph_op_creation)); DebuggedGraph* debugged_graph = new DebuggedGraph(); debugged_graph->set_graph_id("deadbeaf"); debugged_graph->set_graph_name("my_func_graph"); - writer->WriteDebuggedGraph(debugged_graph); + TF_ASSERT_OK(writer->WriteDebuggedGraph(debugged_graph)); TF_ASSERT_OK(writer->FlushNonExecutionFiles()); TF_ASSERT_OK(writer->Close()); @@ -428,7 +428,7 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) { SourceFile* source_file = new SourceFile(); source_file->set_file_path(file_path); source_file->set_host_name("localhost.localdomain"); - writer->WriteSourceFile(source_file); + TF_ASSERT_OK(writer->WriteSourceFile(source_file)); }; for (size_t i = 0; i < kConcurrentWrites; ++i) { thread_pool->Schedule(fn); @@ -469,7 +469,7 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) { SourceFile* source_file = new SourceFile(); source_file->set_file_path(file_path); source_file->set_host_name("localhost.localdomain"); - writer->WriteSourceFile(source_file); + TF_ASSERT_OK(writer->WriteSourceFile(source_file)); TF_ASSERT_OK(writer->FlushNonExecutionFiles()); }; for (size_t i = 0; i < kConcurrentWrites; ++i) { @@ -512,16 +512,16 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) { source_file->set_file_path( strings::Printf("/home/tf_programs/program_%.2d.py", index)); source_file->set_host_name("localhost.localdomain"); - 
writer->WriteSourceFile(source_file); + TF_ASSERT_OK(writer->WriteSourceFile(source_file)); } else if (index % 3 == 1) { StackFrameWithId* stack_frame = new StackFrameWithId(); stack_frame->set_id(strings::Printf("e%.2d", index)); - writer->WriteStackFrameWithId(stack_frame); + TF_ASSERT_OK(writer->WriteStackFrameWithId(stack_frame)); } else { GraphOpCreation* op_creation = new GraphOpCreation(); op_creation->set_op_type("Log"); op_creation->set_op_name(strings::Printf("Log_%.2d", index)); - writer->WriteGraphOpCreation(op_creation); + TF_ASSERT_OK(writer->WriteGraphOpCreation(op_creation)); } }; for (size_t i = 0; i < kConcurrentWrites; ++i) { @@ -586,7 +586,7 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) { Execution* execution = new Execution(); execution->set_op_type("Log"); execution->add_input_tensor_ids(i); - writer->WriteExecution(execution); + TF_ASSERT_OK(writer->WriteExecution(execution)); } std::vector actuals; @@ -611,7 +611,7 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { Execution* execution = new Execution(); execution->set_op_type("Log"); execution->add_input_tensor_ids(i); - writer->WriteExecution(execution); + TF_ASSERT_OK(writer->WriteExecution(execution)); } TF_ASSERT_OK(writer->FlushExecutionFiles()); @@ -637,7 +637,7 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { Execution* execution = new Execution(); execution->set_op_type("Abs"); execution->add_input_tensor_ids(counter.fetch_add(1)); - writer->WriteExecution(execution); + TF_ASSERT_OK(writer->WriteExecution(execution)); }; for (size_t i = 0; i < kCyclicBufferSize * 2; ++i) { thread_pool->Schedule(fn); @@ -682,7 +682,7 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { for (size_t i = 0; i < kCyclicBufferSize * 2; ++i) { GraphExecutionTrace* trace = new GraphExecutionTrace(); trace->set_tfdbg_context_id(strings::Printf("graph_%.2ld", i)); - writer->WriteGraphExecutionTrace(trace); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); } std::vector actuals; @@ -695,6 +695,31 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { TF_ASSERT_OK(writer->Close()); } +TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) { + const size_t kCyclicBufferSize = -1; + DebugEventsWriter* writer = + DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + // NOTE(cais): `writer->Init()` is not called here before + // WriteGraphExecutionTrace() is called. This test checks that this is okay + // and the `GraphExecutionTrace` gets written correctly even without `Init()` + // being called first. This scenario can happen when a TF Graph with tfdbg + // debug ops are executed on a remote TF server. + + GraphExecutionTrace* trace = new GraphExecutionTrace(); + trace->set_tfdbg_context_id(strings::Printf("graph_0")); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); + TF_ASSERT_OK(writer->FlushExecutionFiles()); + + std::vector actuals; + ReadDebugEventProtos(writer, DebugEventFileType::GRAPH_EXECUTION_TRACES, + &actuals); + EXPECT_EQ(actuals.size(), 1); + EXPECT_EQ(actuals[0].graph_execution_trace().tfdbg_context_id(), "graph_0"); + + // Close the writer so the files can be safely deleted. 
+ TF_ASSERT_OK(writer->Close()); +} + TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { const size_t kCyclicBufferSize = 10; DebugEventsWriter* writer = @@ -706,7 +731,7 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { for (size_t i = 0; i < kCyclicBufferSize * 2; ++i) { GraphExecutionTrace* trace = new GraphExecutionTrace(); trace->set_tfdbg_context_id(strings::Printf("graph_%.2ld", i)); - writer->WriteGraphExecutionTrace(trace); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); } TF_ASSERT_OK(writer->FlushExecutionFiles()); @@ -731,7 +756,7 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { GraphExecutionTrace* trace = new GraphExecutionTrace(); trace->set_tfdbg_context_id( strings::Printf("new_graph_%.2ld", counter.fetch_add(1))); - writer->WriteGraphExecutionTrace(trace); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); }; for (size_t i = 0; i < kCyclicBufferSize * 2; ++i) { thread_pool->Schedule(fn); @@ -818,7 +843,7 @@ TEST_F(DebugEventsWriterTest, DisableCyclicBufferBehavior) { Execution* execution = new Execution(); execution->set_op_type("Log"); execution->add_input_tensor_ids(i); - writer->WriteExecution(execution); + TF_ASSERT_OK(writer->WriteExecution(execution)); } TF_ASSERT_OK(writer->FlushExecutionFiles()); @@ -834,7 +859,7 @@ TEST_F(DebugEventsWriterTest, DisableCyclicBufferBehavior) { for (size_t i = 0; i < kNumEvents; ++i) { GraphExecutionTrace* trace = new GraphExecutionTrace(); trace->set_tfdbg_context_id(strings::Printf("graph_%.2ld", i)); - writer->WriteGraphExecutionTrace(trace); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); } TF_ASSERT_OK(writer->FlushExecutionFiles()); From 1de39b575611a252531d0238eefb8a394fa96286 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 11:57:52 -0700 Subject: [PATCH 0423/1533] Implement outside compilation head extraction. PiperOrigin-RevId: 311172756 Change-Id: Id3dbcbd1582a01ec94424dbb8b08bb475466568c --- ...extract_head_tail_outside_compilation.mlir | 83 ++++++-- .../mlir/tensorflow/transforms/passes.h | 2 +- ...u_extract_head_tail_outside_compilation.cc | 194 ++++++++++++++++-- 3 files changed, 247 insertions(+), 32 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 77ca08c089a..eb67bdcc914 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -1,13 +1,17 @@ // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-head-tail-outside-compilation | FileCheck %s --dump-input-on-failure -// Tests extraction of a single outside compiled cluster with no input or output dependecies. +// Tests extraction of a outside compiled ops at head of TPU computation. 
-// CHECK-LABEL: func @nodep_single_head_outside_compilation -func @nodep_single_head_outside_compilation() -> () { - // CHECK: "tf.A" - // CHECK-NEXT: "tf_device.launch" - "tf_device.launch"() ( { - "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () +func @single_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: tf_device.launch + // CHECK: "tf.A" + // CHECK-NEXT: tf_device.return + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () "tf.B"() : () -> () "tf.C"() : () -> () tf_device.return @@ -15,15 +19,62 @@ func @nodep_single_head_outside_compilation() -> () { return } -// CHECK-LABEL: func @nodep_multiple_head_outside_compilation -func @nodep_multiple_head_outside_compilation() -> () { - // CHECK: "tf.A" - // CHECK-NEXT: "tf.B" - // CHECK-NEXT: "tf_device.launch" - "tf_device.launch"() ( { - "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.C"() : () -> () +// CHECK-LABEL: func @multiple_head_outside_compilation +func @multiple_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.D"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.D"(%1) : (tensor) -> () + tf_device.return + }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + return +} + +// CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle +func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { + // CHECK-NOT: tf_device.launch + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + "tf.C"(%1) : (tensor) -> () + tf_device.return + }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + return +} + +// CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted +func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: tf_device.return %[[D_OUT]] + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.B" + // CHECK: "tf.C" + // CHECK: "tf.E" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"() {} : () -> (tensor) + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %4 = "tf.E"(%3) {} : (tensor) -> (tensor) tf_device.return }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () return diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h 
b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index c1d99c2dee3..0b1ff2beebb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -258,7 +258,7 @@ std::unique_ptr> CreateTPUVariableReformattingPass(); // Creates a pass that extracts outside compilation (CPU ops inside TPU cluster) // at head/tail of TPU cluster to run before/after TPU computation. -std::unique_ptr> +std::unique_ptr> CreateTPUExtractHeadTailOutsideCompilationPass(); // Creates a pass that extract outside compilation (CPU ops inside TPU cluster) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 141feeb6b24..b9e214470cd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -14,11 +14,23 @@ limitations under the License. ==============================================================================*/ #include +#include +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" namespace mlir { namespace TFTPU { @@ -30,30 +42,182 @@ namespace { constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; -struct TPUExtractHeadTailOutsideCompilation - : public PassWrapper { - void runOnFunction() override; -}; +bool HasOutsideCompilationAttribute(Operation* op) { + return op->getAttrOfType(kXlaOutsideCompilationAttr) != nullptr; +} -void TPUExtractHeadTailOutsideCompilation::runOnFunction() { - getFunction().walk([&](tf_device::LaunchOp launch) { - Block& launch_block = launch.GetBody(); - for (auto& op : llvm::make_early_inc_range(launch_block.getOperations())) { - // TODO(b/155115766): Handle outputs that should be inputs to TPU - // LaunchOp. - if (auto attr = - op.getAttrOfType(kXlaOutsideCompilationAttr)) { - op.moveBefore(launch); - } else { +// Returns whether all operands of `op` are from values inside the +// `input_value_set`. 
+bool OpContainsOperandsFromSet(Operation* op, + const llvm::SetVector& input_value_set) { + for (auto operand : op->getOperands()) + if (input_value_set.count(operand) == 0) return false; + + return true; +} + +void RecordOutsideCompiledOpsAndUsages( + Operation* op, llvm::SmallSetVector* outside_compiled_ops, + llvm::SetVector* outside_compiled_op_usages) { + if (HasOutsideCompilationAttribute(op) && + OpContainsOperandsFromSet(op, *outside_compiled_op_usages)) { + outside_compiled_ops->insert(op); + outside_compiled_op_usages->insert(op->getResults().begin(), + op->getResults().end()); + } +} + +// Traverses the MLIR graph and returns a set of ops that +// are connected to inputs of TPU computation and outside compiled. +void ExtractOutsideCompiledOpsConnectedToHead( + Value input_value, llvm::SetVector* values_used_in_host_cluster, + llvm::SmallSetVector* outside_compiled_ops) { + llvm::SmallSetVector parent_outside_compiled_ops_at_head; + for (auto& usage : input_value.getUses()) { + auto head_operation = usage.getOwner(); + RecordOutsideCompiledOpsAndUsages(head_operation, + &parent_outside_compiled_ops_at_head, + values_used_in_host_cluster); + } + + // Traverse the graph and find all outside compiled ops connected from + // the `input_value`. + while (!parent_outside_compiled_ops_at_head.empty()) { + llvm::SmallSetVector connected_outside_compiled_ops; + for (auto head_outside_compiled_op : parent_outside_compiled_ops_at_head) { + auto op_results = head_outside_compiled_op->getOpResults(); + for (auto op_result : op_results) { + for (auto& use : op_result.getUses()) { + auto connected_op = use.getOwner(); + RecordOutsideCompiledOpsAndUsages(connected_op, + &connected_outside_compiled_ops, + values_used_in_host_cluster); + } + } + } + + outside_compiled_ops->insert(parent_outside_compiled_ops_at_head.begin(), + parent_outside_compiled_ops_at_head.end()); + std::swap(parent_outside_compiled_ops_at_head, + connected_outside_compiled_ops); + } +} + +// TODO(hongjunchoi): Also handle ops without inputs that are outside +// compiled. +// +// Returns set of ops that are outside compiled and are directly connected +// to inputs to the TPU computation. +llvm::SmallSetVector IdentifyOutsideCompiledOpsAtHead( + tf_device::ClusterOp tpu_cluster) { + llvm::SmallSetVector outside_compiled_at_head_ops; + llvm::SetVector values_used_in_cluster; + auto& cluster_region = tpu_cluster.body(); + getUsedValuesDefinedAbove(cluster_region, cluster_region, + values_used_in_cluster); + + auto input_value_list = llvm::to_vector<8>(values_used_in_cluster); + for (auto input_value : input_value_list) + ExtractOutsideCompiledOpsConnectedToHead( + input_value, &values_used_in_cluster, &outside_compiled_at_head_ops); + return outside_compiled_at_head_ops; +} + +// Returns output values of extracted outside compiled cluster at head that +// are used by the TPU computation. +llvm::SmallVector GetHeadExtractedClusterOutputs( + const llvm::SmallSetVector& head_outside_compiled_ops) { + llvm::SmallVector outputs; + outputs.reserve(head_outside_compiled_ops.size()); + + for (auto op : head_outside_compiled_ops) { + for (Operation* user : op->getUsers()) { + if (!head_outside_compiled_ops.count(user)) { + outputs.append(op->result_begin(), op->result_end()); break; } } + } + + return outputs; +} + +// Creates new tf_device.launch op with outside compiled ops extracted +// from the head of TPU computation. 
+llvm::Optional IsolateHeadExtractedOpsToLaunchOp( + OpBuilder* builder, tf_device::ClusterOp cluster, + const llvm::SmallSetVector& head_outside_compiled_ops) { + if (head_outside_compiled_ops.empty()) + return llvm::Optional(); + + // Create tf_device.launch op to separate all extracted outside compiled ops + // before the tf_device.cluster. + auto output_values = + GetHeadExtractedClusterOutputs(head_outside_compiled_ops); + + llvm::SmallVector output_return_types; + output_return_types.reserve(output_values.size()); + for (auto output : output_values) + output_return_types.emplace_back(output.getType()); + + builder->setInsertionPoint(cluster); + auto host_launch_op = builder->create( + cluster.getLoc(), builder->getStringAttr(""), output_return_types); + + // Replace all usages of outside compiled ops that are used in TPU + // computation with the results of the above created launch op. + for (auto output_and_index : llvm::enumerate(output_values)) { + auto output_index = output_and_index.index(); + auto output = output_and_index.value(); + for (auto& use : output.getUses()) { + if (!head_outside_compiled_ops.count(use.getOwner())) + use.set(host_launch_op.getResult(output_index)); + } + } + + // Create terminator op for the newly created launch op. + host_launch_op.body().push_back(new Block()); + builder->setInsertionPointToEnd(&host_launch_op.GetBody()); + auto terminator = builder->create( + host_launch_op.getLoc(), output_values); + + // Move all outside compile ops from cluster op to launch op. + for (auto outside_compiled_op : head_outside_compiled_ops) + outside_compiled_op->moveBefore(terminator); + + return host_launch_op; +} + +struct TPUExtractHeadTailOutsideCompilation + : public PassWrapper> { + void runOnOperation() override; +}; + +void TPUExtractHeadTailOutsideCompilation::runOnOperation() { + // Get runtime devices information from the closest parent module. + auto module = getOperation(); + mlir::TF::RuntimeDevices devices; + if (failed(tensorflow::GetDevicesFromOp(module, &devices))) + return signalPassFailure(); + + OpBuilder builder(&getContext()); + module.walk([&](tf_device::ClusterOp cluster) { + auto head_outside_compiled_ops = IdentifyOutsideCompiledOpsAtHead(cluster); + IsolateHeadExtractedOpsToLaunchOp(&builder, cluster, + head_outside_compiled_ops); + + // TODO(b/156030523): Update device attribute of newly created host launch + // op as well as enclosing Replicate op (if TPU computation is replicated) + // with host device names. + + // TODO(b/155115766): Implement tail outside compiled op extraction. }); } } // anonymous namespace -std::unique_ptr> +std::unique_ptr> CreateTPUExtractHeadTailOutsideCompilationPass() { return std::make_unique(); } From 9f58e6902cea5d26e68635d1c766c2dc6125577a Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Tue, 12 May 2020 12:26:39 -0700 Subject: [PATCH 0424/1533] [XLA/GPU] Make Thunk::Initialize() happen at compile-time, not run-time. This simplifies GpuExecutable for MLIR transition. 
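As a rough, hypothetical sketch of the intent (invented types, not the real XLA classes): thunks are initialized once against the compiled target binary immediately after compilation, so that run-time execution performs no initialization work.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <string>
    #include <vector>

    // Illustrative stand-ins for the compiled GPU binary and a kernel thunk.
    struct TargetBinarySketch {
      std::string text;             // e.g. PTX
      std::vector<uint8_t> binary;  // e.g. cubin
    };

    class KernelThunkSketch {
     public:
      // Loads state from the compiled binary; after this change the call
      // happens once at compile time instead of inside every execution.
      void Initialize(const TargetBinarySketch& target) {
        loaded_bytes_ = target.binary.size();
      }
      void ExecuteOnStream() const {
        std::cout << "launching kernel built from " << loaded_bytes_
                  << " bytes of binary\n";
      }

     private:
      std::size_t loaded_bytes_ = 0;
    };

    int main() {
      // "Compile time": produce the binary, then initialize every thunk once.
      TargetBinarySketch target{"ptx-text", {0xDE, 0xAD, 0xBE, 0xEF}};
      std::vector<KernelThunkSketch> thunks(2);
      for (auto& thunk : thunks) thunk.Initialize(target);

      // "Run time": only execution remains; no per-run Initialize() calls.
      for (const auto& thunk : thunks) thunk.ExecuteOnStream();
      return 0;
    }

This also matches the signature change in the patch: `Thunk::Initialize()` now receives a `GpuTargetBinary` rather than the whole `GpuExecutable`, since the compiled text and binary are all the thunks need and both are available as soon as `CompileTargetBinary()` returns.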
PiperOrigin-RevId: 311178815 Change-Id: Ib9c8b8a2f8719c0cd8b342ab07af6e8cb65d82bf --- tensorflow/compiler/xla/service/gpu/BUILD | 12 ++++++----- .../xla/service/gpu/amdgpu_compiler.cc | 10 ++++----- .../xla/service/gpu/amdgpu_compiler.h | 2 +- .../xla/service/gpu/conditional_thunk.cc | 4 ++-- .../xla/service/gpu/conditional_thunk.h | 2 +- .../compiler/xla/service/gpu/for_thunk.cc | 4 ++-- .../compiler/xla/service/gpu/for_thunk.h | 2 +- .../compiler/xla/service/gpu/gpu_compiler.cc | 21 +++++++++++++------ .../compiler/xla/service/gpu/gpu_compiler.h | 7 +++---- .../xla/service/gpu/gpu_executable.cc | 10 ++++----- .../compiler/xla/service/gpu/gpu_executable.h | 18 ++++++---------- .../compiler/xla/service/gpu/gpu_types.h | 17 +++++++++++++++ .../compiler/xla/service/gpu/kernel_thunk.cc | 9 ++++---- .../compiler/xla/service/gpu/kernel_thunk.h | 4 +--- .../xla/service/gpu/nvptx_compiler.cc | 11 ++++------ .../compiler/xla/service/gpu/nvptx_compiler.h | 2 +- .../xla/service/gpu/sequential_thunk.cc | 4 ++-- .../xla/service/gpu/sequential_thunk.h | 2 +- tensorflow/compiler/xla/service/gpu/thunk.h | 5 ++--- .../compiler/xla/service/gpu/while_thunk.cc | 6 +++--- .../compiler/xla/service/gpu/while_thunk.h | 2 +- .../service/mlir_gpu/mlir_compiler_impl.cc | 7 ++++--- .../compiler/xla/tests/llvm_compiler_test.cc | 7 +++---- 23 files changed, 90 insertions(+), 78 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 61bc41283e1..8f8263a85f9 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -17,15 +17,15 @@ load( "tf_cuda_library", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm", + "if_rocm_is_configured", +) load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) -load( - "@local_config_rocm//rocm:build_defs.bzl", - "if_rocm_is_configured", -) load("//tensorflow:tensorflow.bzl", "if_nccl") package( @@ -86,6 +86,7 @@ cc_library( name = "gpu_types", hdrs = ["gpu_types.h"], deps = [ + "//tensorflow/compiler/xla:types", "@com_google_absl//absl/types:variant", ], ) @@ -405,6 +406,7 @@ cc_library( deps = [ ":buffer_allocations", ":gpu_executable_run_options", + ":gpu_types", ":hlo_execution_profiler", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla/service:hlo", diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc index 974db02b1b3..485aff0c4d8 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc @@ -104,11 +104,9 @@ GpuVersion AMDGPUCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { return isa_version; } -StatusOr>> -AMDGPUCompiler::CompileTargetBinary(const HloModule* module, - llvm::Module* llvm_module, - GpuVersion gpu_version, - se::StreamExecutor* stream_exec) { +StatusOr AMDGPUCompiler::CompileTargetBinary( + const HloModule* module, llvm::Module* llvm_module, GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { if (rocdl_dir_.empty()) { // Compute rocdl_dir_ just once and cache it in this member. 
rocdl_dir_ = GetROCDLDir(module->config()); @@ -129,7 +127,7 @@ AMDGPUCompiler::CompileTargetBinary(const HloModule* module, user_post_optimization_hook_(*llvm_module); } - return std::pair>("", std::move(hsaco)); + return GpuTargetBinary{"", std::move(hsaco)}; } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h index acc5e021e3d..9033585763b 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h @@ -39,7 +39,7 @@ class AMDGPUCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - StatusOr>> CompileTargetBinary( + StatusOr CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index e31f45942b1..5e7d89c7aee 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -50,7 +50,7 @@ void ConditionalThunk::ComputeAnnotations() { } } -Status ConditionalThunk::Initialize(const GpuExecutable& executable, +Status ConditionalThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { if (branch_index_is_bool_) { TF_RET_CHECK(branch_thunks_.size() == 2); @@ -58,7 +58,7 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable, TF_RET_CHECK(!branch_thunks_.empty()); } for (auto& branch_thunk : branch_thunks_) { - TF_RETURN_IF_ERROR(branch_thunk->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(branch_thunk->Initialize(target_binary, executor)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index 404e2131eff..ba69e1a38ec 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -52,7 +52,7 @@ class ConditionalThunk : public Thunk { ConditionalThunk& operator=(const ConditionalThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index 0a97f668b38..aacc9deb739 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -39,9 +39,9 @@ void ForThunk::ComputeAnnotations() { body_thunk_sequence_->ComputeAnnotations(); } -Status ForThunk::Initialize(const GpuExecutable& executable, +Status ForThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(target_binary, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 57402f70627..57657b6825f 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -38,7 +38,7 @@ class ForThunk : public Thunk { ForThunk& operator=(const ForThunk&) = delete; 
void ComputeAnnotations() override; - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 5f6dfd7d3a5..533ff52a90d 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -565,8 +565,7 @@ StatusOr> GpuCompiler::RunBackend( GpuVersion gpu_version = GetGpuVersion(stream_exec); - using BackendCompileResult = std::pair>; - TF_ASSIGN_OR_RETURN(BackendCompileResult backend_result, + TF_ASSIGN_OR_RETURN(GpuTargetBinary backend_result, CompileTargetBinary(module.get(), &llvm_module, gpu_version, stream_exec)); @@ -578,6 +577,11 @@ StatusOr> GpuCompiler::RunBackend( thunk_schedule->ToString()); } + std::vector thunks; + for (Thunk* thunk : thunk_schedule->TotalOrder()) { + thunks.push_back(thunk); + } + std::unique_ptr profile_index_map; std::unique_ptr profile_printer; @@ -597,14 +601,19 @@ StatusOr> GpuCompiler::RunBackend( } auto* gpu_executable = new GpuExecutable( - backend_result.first, backend_result.second, gpu_version, - std::move(thunk_schedule), std::move(module), - std::move(buffer_assignment), std::move(profile_printer), - std::move(profile_index_map)); + std::move(backend_result), gpu_version, std::move(thunk_schedule), + std::move(module), std::move(buffer_assignment), + std::move(profile_printer), std::move(profile_index_map)); if (embed_ir_in_executable) { DCHECK_NE("", ir_module_string_before_opt); gpu_executable->set_ir_module_string(ir_module_string_before_opt); } + + for (Thunk* thunk : thunks) { + TF_RETURN_IF_ERROR( + thunk->Initialize(gpu_executable->target_binary(), stream_exec)); + } + return std::unique_ptr(gpu_executable); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index b52af5392d1..deb5d785777 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -74,10 +74,9 @@ class GpuCompiler : public LLVMCompiler { virtual GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) = 0; - virtual StatusOr>> - CompileTargetBinary(const HloModule* hlo_module, llvm::Module* llvm_module, - GpuVersion gpu_version, - se::StreamExecutor* stream_exec) = 0; + virtual StatusOr CompileTargetBinary( + const HloModule* hlo_module, llvm::Module* llvm_module, + GpuVersion gpu_version, se::StreamExecutor* stream_exec) = 0; Status PrepareHloModuleForIrEmitting(HloModule* hlo_module); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 2df6b50d361..ebd3630635b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -52,16 +52,15 @@ using ::tensorflow::profiler::ScopedAnnotation; // Implementation note: HLO profiling is always enabled for GPU executables, // since we can use timers around thunks. 
GpuExecutable::GpuExecutable( - const string& text, const std::vector& binary, - GpuVersion gpu_version, std::unique_ptr thunk_schedule, + GpuTargetBinary target_binary, GpuVersion gpu_version, + std::unique_ptr thunk_schedule, std::shared_ptr hlo_module, std::shared_ptr assignment, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)), - text_(text), - binary_(binary), + target_binary_(std::move(target_binary)), gpu_version_(gpu_version), thunk_schedule_(std::move(thunk_schedule)), assignment_(std::move(assignment)) { @@ -176,7 +175,6 @@ Status GpuExecutable::ExecuteThunks( // module, we won't get any data, but that's probably an OK trade-off. ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); - TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction()); se::Stream* stream = @@ -469,7 +467,7 @@ const InstructionValueSet& GpuExecutable::GetRootValueSet() const { int64 GpuExecutable::SizeOfGeneratedCodeInBytes() { // Non-empty PTX but empty cubin: compilation must have failed, return // "unknown". - if (binary().empty() && !text_.empty()) { + if (binary().empty() && !text().empty()) { return -1; } return binary().size(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 045a36c099b..29441c60b04 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -52,8 +52,7 @@ class GpuExecutable : public Executable { // We need to share ownership of hlo_module and assignment with profiler to // safely keep a reference to these objects during tracing period, thus they // are passed as shared pointers. - GpuExecutable(const string& text, const std::vector& binary, - GpuVersion gpu_version, + GpuExecutable(GpuTargetBinary target_binary, GpuVersion gpu_version, std::unique_ptr thunk_schedule, std::shared_ptr hlo_module, std::shared_ptr assignment, @@ -73,12 +72,14 @@ class GpuExecutable : public Executable { // Returns the compiled code for the computation. The compiled code is PTX in // Cuda and unused empty string in ROCm. - const string& text() const { return text_; } + const string& text() const { return target_binary_.text; } // Returns the binary stored in this GpuExecutable. The binary is cubin in // Cuda, and HSA code object in ROCm. It may be empty, in which case // compilation is left up to the GPU driver. - const std::vector& binary() const { return binary_; } + const std::vector& binary() const { return target_binary_.binary; } + + const GpuTargetBinary& target_binary() const { return target_binary_; } // ExecuteAsyncOnStream will fail if the compute capability of the stream // doesn't match the compute capability passed to this object's constructor. @@ -131,14 +132,7 @@ class GpuExecutable : public Executable { // This string should be modified only before ExecuteOnStream. string ir_module_string_; - // The compiled code for the computation. - const string text_; - - // The GPU machine code for the computation, targeting GPUs at - // compute_capability_. - // - // May be empty, in which case we leave compilation up to the GPU driver. - const std::vector binary_; + const GpuTargetBinary target_binary_; // The GPU version for compute compatibility check. 
GpuVersion gpu_version_; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_types.h b/tensorflow/compiler/xla/service/gpu/gpu_types.h index 1c51040fb82..5c8b8093d65 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_types.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_types.h @@ -16,7 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TYPES_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TYPES_H_ +#include +#include + #include "absl/types/variant.h" +#include "tensorflow/compiler/xla/types.h" namespace xla { namespace gpu { @@ -25,6 +29,19 @@ namespace gpu { // it comprises a pair of integers denoting major and minor version. // On ROCm platform, it comprises one integer for AMD GCN ISA version. using GpuVersion = absl::variant, int>; + +// A struct to carry around compiled results by the GPU assembler. +struct GpuTargetBinary { + GpuTargetBinary(const GpuTargetBinary& other) = delete; + GpuTargetBinary(GpuTargetBinary&& other) = default; + + // The text format of the compiled result, e.g. PTX. + std::string text; + + // The actual compiled binary. + std::vector binary; +}; + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index d976b5d8d4d..0b5010ea66b 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -18,7 +18,6 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -42,7 +41,7 @@ KernelThunk::KernelThunk(absl::Span args, kernel_name_(kernel_name), unroll_factor_(unroll_factor) {} -Status KernelThunk::Initialize(const GpuExecutable& executable, +Status KernelThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { tensorflow::mutex_lock lock(mutex_); @@ -55,8 +54,10 @@ Status KernelThunk::Initialize(const GpuExecutable& executable, if (kernel_cache_.end() == it) { TF_ASSIGN_OR_RETURN( std::unique_ptr kernel, - CreateKernel(kernel_name_, args_.size(), executable.text(), - executable.binary(), executor)); + CreateKernel(kernel_name_, args_.size(), target_binary.text, + target_binary.binary, executor)); + CHECK(!target_binary.binary.empty()); + CHECK(kernel); kernel_cache_.emplace(executor, std::move(kernel)); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 88351881f3a..97a1d08a57e 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -35,8 +35,6 @@ limitations under the License. namespace xla { namespace gpu { -class GpuExecutable; - // This class stores everything that StreamExecutor needs for launching a // kernel. It implements the ExecuteOnStream interface for GpuExecutable to // invoke the corresponding kernel. 
@@ -58,7 +56,7 @@ class KernelThunk : public Thunk { int unroll_factor() const { return unroll_factor_; } void SetLaunchDimensions(const LaunchDimensions& launch_dims); - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 0196267d904..cf6fe9292e5 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -295,11 +295,9 @@ GpuVersion NVPTXCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { return std::make_pair(cc_major, cc_minor); } -StatusOr>> -NVPTXCompiler::CompileTargetBinary(const HloModule* module, - llvm::Module* llvm_module, - GpuVersion gpu_version, - se::StreamExecutor* stream_exec) { +StatusOr NVPTXCompiler::CompileTargetBinary( + const HloModule* module, llvm::Module* llvm_module, GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { std::pair compute_capability = absl::get>(gpu_version); @@ -340,8 +338,7 @@ NVPTXCompiler::CompileTargetBinary(const HloModule* module, stream_exec, ptx, compute_capability.first, compute_capability.second, module->config()); - return std::pair>(std::move(ptx), - std::move(cubin)); + return GpuTargetBinary{std::move(ptx), std::move(cubin)}; } std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index e69be947522..ec550b5b2ff 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -48,7 +48,7 @@ class NVPTXCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - StatusOr>> CompileTargetBinary( + StatusOr CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index 025ca60ef0c..bd260336c28 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -34,10 +34,10 @@ void SequentialThunk::ComputeAnnotations() { } } -Status SequentialThunk::Initialize(const GpuExecutable& executable, +Status SequentialThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { for (auto& thunk : thunks_) { - TF_RETURN_IF_ERROR(thunk->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(thunk->Initialize(target_binary, executor)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 3abb82c0b66..b5475664733 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -40,7 +40,7 @@ class SequentialThunk : public Thunk { const std::vector>& thunks() const { return thunks_; } void ComputeAnnotations() override; - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h 
b/tensorflow/compiler/xla/service/gpu/thunk.h index e9be41b74de..7aff9ca47b7 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/lib/core/status.h" @@ -30,8 +31,6 @@ limitations under the License. namespace xla { namespace gpu { -class GpuExecutable; - // Thunk acts as the bridge between IrEmitter and GpuExecutable. It stores the // metadata IrEmitter generates for GpuExecutable to invoke an HloInstruction. // @@ -97,7 +96,7 @@ class Thunk { // This may be called multiple times. Its main purpose is to give us a chance // to do initialization outside of ExecuteOnStream() so that the // time spent initializing doesn't count towards our execution profile. - virtual Status Initialize(const GpuExecutable& /*executable*/, + virtual Status Initialize(const GpuTargetBinary& /*target_binary*/, se::StreamExecutor* /*executor*/) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 4134cd39832..2650508093e 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -45,11 +45,11 @@ void WhileThunk::ComputeAnnotations() { body_thunk_sequence_->ComputeAnnotations(); } -Status WhileThunk::Initialize(const GpuExecutable& executable, +Status WhileThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR( - condition_thunk_sequence_->Initialize(executable, executor)); - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); + condition_thunk_sequence_->Initialize(target_binary, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(target_binary, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 31db01b72ba..77ee0104a1f 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -47,7 +47,7 @@ class WhileThunk : public Thunk { WhileThunk& operator=(const WhileThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc index 35ac3b2bf63..667cdef8f6c 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -549,10 +549,11 @@ StatusOr> MlirCompilerImpl::RunBackend( } // TODO(b/137624192): Add profiling support. 
+ return {absl::make_unique( - ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), - emission_context.releaseHloModule(), std::move(buffer_assignment), - nullptr, nullptr)}; + xla::gpu::GpuTargetBinary{ptx, cubin}, GetGpuVersion(stream_exec), + std::move(thunk_schedule), emission_context.releaseHloModule(), + std::move(buffer_assignment), nullptr, nullptr)}; } StatusOr>> MlirCompilerImpl::Compile( diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index 1947f517bd9..16ed02296b7 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -55,16 +55,15 @@ class GpuDummyCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) { return 0; } - StatusOr>> CompileTargetBinary( + StatusOr CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, - GpuVersion gpu_version, se::StreamExecutor* stream_exec) { + GpuVersion gpu_version, se::StreamExecutor* stream_exec) override { if (user_post_optimization_hook_) { user_post_optimization_hook_(*llvm_module); } std::vector compiled_results; - return std::pair>( - "", std::move(compiled_results)); + return GpuTargetBinary{"", std::move(compiled_results)}; } }; } // namespace gpu From 88dfd8ce6dc063659e4fb9b8a6a040b8a673c466 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Tue, 12 May 2020 12:28:52 -0700 Subject: [PATCH 0425/1533] Update hexagon_delegate guide to reference delegate readme PiperOrigin-RevId: 311179262 Change-Id: Ic3dd3851facd12c8e1e8adde3f9f60a31355e430 --- .../g3doc/performance/hexagon_delegate.md | 38 +------------------ 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/hexagon_delegate.md b/tensorflow/lite/g3doc/performance/hexagon_delegate.md index 51af59891dc..60fe9465bf4 100644 --- a/tensorflow/lite/g3doc/performance/hexagon_delegate.md +++ b/tensorflow/lite/g3doc/performance/hexagon_delegate.md @@ -259,43 +259,7 @@ ro.board.platform`). * This is tentatively planned for a future release, though there is no concrete timeline. * Which ops are supported by the delegate? - * Initial list of supported ops: - * Add - * ArgMax - * ArgMin - * AveragePool2D (without any activation) - * Concat - * Conv2D with following constraints: - * stride width/height <= 3 - * DepthToSpace - * DepthwiseConv2D with following constraints: - * Filter width == 3 - * depth_multiplier == 1 - * dilation only supported when stride == 1 - * Otherwise, stride height/width <= 3 - * FullyConnected (without any activation) - * Hardswish - * L2Normalization (without any activation) - * Logistic (aka Sigmoid) - * MaxPool2D (without any activation) - * Mul (without any activation) - * Neg - * Pad: Only supports 0 padding - * Relu - * Relu6 - * Reshape - * Resize Bilinear with following constraints: - * Requested size <= 65 - * Resize Nearest Neighbor - * SoftMax - * SpaceToDepth - * Split - * Sub - * Tanh - * Transpose - * TransposeConv2D with following constraints: - * stride height/width <= 3 - * dilation height/width == 1 + * See the current list of [supported ops and constraints](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/delegates/hexagon/README.md) * How can I tell that the model is using the DSP when I enable the delegate? 
* Two log messages will be printed when you enable the delegate - one to indicate if the delegate was created and another to indicate how many From 5100abc4af068b6915a26b9f6531b9fec4da4c06 Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Tue, 12 May 2020 12:29:59 -0700 Subject: [PATCH 0426/1533] Initial checkin of C++ header-only TensorHandle as part of RFC https://github.com/tensorflow/community/pull/207. PiperOrigin-RevId: 311179503 Change-Id: Ib3cfb2547150d09ee655db6ca6bc72ef3ef7adde --- tensorflow/c/eager/c_api.cc | 2 +- tensorflow/c/eager/c_api.h | 2 +- tensorflow/cc/experimental/base/public/BUILD | 14 ++ .../cc/experimental/base/public/runtime.h | 3 + .../base/public/runtime_builder.h | 2 + .../cc/experimental/base/public/status.h | 3 + .../cc/experimental/base/public/tensor.h | 2 + .../experimental/base/public/tensorhandle.h | 98 ++++++++++ tensorflow/cc/experimental/base/tests/BUILD | 29 +++ .../cc/experimental/base/tests/tensor_test.cc | 101 +++------- .../base/tests/tensor_types_test_util.h | 76 ++++++++ .../base/tests/tensorhandle_test.cc | 184 ++++++++++++++++++ .../experimental/public/concrete_function.h | 2 + .../public/concrete_function_list.h | 2 + .../experimental/public/function_metadata.h | 2 + .../experimental/public/saved_model_api.h | 2 + .../tests/saved_model_api_test.cc | 27 +-- 17 files changed, 462 insertions(+), 89 deletions(-) create mode 100644 tensorflow/cc/experimental/base/public/tensorhandle.h create mode 100644 tensorflow/cc/experimental/base/tests/tensor_types_test_util.h create mode 100644 tensorflow/cc/experimental/base/tests/tensorhandle_test.cc diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 73c2f7824b2..5c01ccb82bb 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -924,7 +924,7 @@ extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy( context->GetDevicePlacementPolicy()); } -TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) { +TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status) { tensorflow::Tensor tensor; status->status = tensorflow::TF_TensorToTensor(t, &tensor); if (!status->status.ok()) return nullptr; diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 070b3a9bb60..5afe3047dd7 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -137,7 +137,7 @@ TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, // placed in memory of different devices or remote address spaces. typedef struct TFE_TensorHandle TFE_TensorHandle; -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status); // Indicates that the caller will not be using `h` any more. 
TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); diff --git a/tensorflow/cc/experimental/base/public/BUILD b/tensorflow/cc/experimental/base/public/BUILD index 93acf1bd319..045d4e6cd97 100644 --- a/tensorflow/cc/experimental/base/public/BUILD +++ b/tensorflow/cc/experimental/base/public/BUILD @@ -62,3 +62,17 @@ cc_library( "//tensorflow/c:tf_tensor", ], ) + +cc_library( + name = "tensorhandle", + hdrs = [ + "tensorhandle.h", + ], + deps = [ + ":runtime", + ":status", + ":tensor", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + ], +) diff --git a/tensorflow/cc/experimental/base/public/runtime.h b/tensorflow/cc/experimental/base/public/runtime.h index 47fd8869647..711a38c233a 100644 --- a/tensorflow/cc/experimental/base/public/runtime.h +++ b/tensorflow/cc/experimental/base/public/runtime.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api_experimental.h" namespace tensorflow { +namespace experimental { namespace cc { // Runtime represents an opaque instance of a Tensorflow runtime, with its own @@ -40,6 +41,7 @@ class Runtime { private: friend class RuntimeBuilder; friend class SavedModelAPI; + friend class TensorHandle; // Wraps a TFE_Context. Takes ownership of ctx. explicit Runtime(TFE_Context* ctx) : ctx_(ctx) {} @@ -63,6 +65,7 @@ class Runtime { }; } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_ diff --git a/tensorflow/cc/experimental/base/public/runtime_builder.h b/tensorflow/cc/experimental/base/public/runtime_builder.h index ed3c93ae135..737e06cb2c6 100644 --- a/tensorflow/cc/experimental/base/public/runtime_builder.h +++ b/tensorflow/cc/experimental/base/public/runtime_builder.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/cc/experimental/base/public/status.h" namespace tensorflow { +namespace experimental { namespace cc { // RuntimeBuilder is a builder used to construct a tensorflow::cc::Runtime. @@ -79,6 +80,7 @@ inline std::unique_ptr RuntimeBuilder::Build(Status* status) { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_ diff --git a/tensorflow/cc/experimental/base/public/status.h b/tensorflow/cc/experimental/base/public/status.h index f91f2caccd8..98c8cf6ced2 100644 --- a/tensorflow/cc/experimental/base/public/status.h +++ b/tensorflow/cc/experimental/base/public/status.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/c/tf_status.h" namespace tensorflow { +namespace experimental { namespace cc { // Status is a wrapper around an error code and an optional error message. @@ -57,6 +58,7 @@ class Status { friend class RuntimeBuilder; friend class Runtime; friend class SavedModelAPI; + friend class TensorHandle; // Wraps a TF_Status*, and takes ownership of it. explicit Status(TF_Status* status) : status_(status) {} @@ -88,6 +90,7 @@ inline void Status::SetStatus(TF_Code code, const std::string& msg) { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_ diff --git a/tensorflow/cc/experimental/base/public/tensor.h b/tensorflow/cc/experimental/base/public/tensor.h index 26b0e5dc55e..fc447262ce1 100644 --- a/tensorflow/cc/experimental/base/public/tensor.h +++ b/tensorflow/cc/experimental/base/public/tensor.h @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/cc/experimental/base/public/status.h" namespace tensorflow { +namespace experimental { namespace cc { // Tensor represents an n-dimensional array of values. @@ -168,6 +169,7 @@ inline Tensor Tensor::FromBuffer(TF_DataType dtype, } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ diff --git a/tensorflow/cc/experimental/base/public/tensorhandle.h b/tensorflow/cc/experimental/base/public/tensorhandle.h new file mode 100644 index 00000000000..99453ee7ea8 --- /dev/null +++ b/tensorflow/cc/experimental/base/public/tensorhandle.h @@ -0,0 +1,98 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ + +#include +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// An opaque representation of a tensor computed/managed by the Tensorflow +// runtime (tensorflow:cc::Runtime). Unlike a tensor, a Tensorhandle may refer +// to tensors placed in memory of different devices or remote address spaces. +// Note that tensorflow::cc::Runtime MUST outlive all TensorHandles created +// from it. +class TensorHandle { + public: + // Unwraps a Tensor from the given TensorHandle. If an error occurred, + // status->ok() will be false, and the returned Tensor must not be used. + Tensor Resolve(Status* status); + + // Constructs a TensorHandle from a Tensor. If an error occurred, + // status->ok() will be false, and the returned TensorHandle must not be used. + static TensorHandle FromTensor(const Tensor& tensor, const Runtime& runtime, + Status* status); + + // TensorHandle is movable, and not copyable + TensorHandle(TensorHandle&&) = default; + TensorHandle& operator=(TensorHandle&&) = default; + + private: + // Wraps a TFE_TensorHandle. Takes ownership of handle. + explicit TensorHandle(TFE_TensorHandle* handle) : handle_(handle) {} + + // TensorHandle is not copyable + TensorHandle(const TensorHandle&) = delete; + TensorHandle& operator=(const TensorHandle&) = delete; + + // Returns the underlying TFE_TensorHandle that this object wraps. + // This object retains ownership of the pointer. + TFE_TensorHandle* GetTFETensorHandle() const { return handle_.get(); } + + // Deletes the currently wrapped TFE_TensorHandle, and swaps it with handle, + // and takes ownership of handle. 
+ void Reset(TFE_TensorHandle* handle) { handle_.reset(handle); } + + struct TFETensorHandleDeleter { + void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); } + }; + std::unique_ptr handle_; +}; + +inline Tensor TensorHandle::Resolve(Status* status) { + TF_Tensor* tensor = + TFE_TensorHandleResolve(handle_.get(), status->GetTFStatus()); + if (!status->ok()) { + return Tensor(nullptr); + } + return Tensor(tensor); +} + +inline TensorHandle TensorHandle::FromTensor(const Tensor& tensor, + const Runtime& runtime, + Status* status) { + TFE_TensorHandle* tensor_handle = TFE_NewTensorHandleFromTensor( + runtime.GetTFEContext(), tensor.GetTFTensor(), status->GetTFStatus()); + if (!status->ok()) { + return TensorHandle(nullptr); + } + return TensorHandle(tensor_handle); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ diff --git a/tensorflow/cc/experimental/base/tests/BUILD b/tensorflow/cc/experimental/base/tests/BUILD index a2b634a70f4..f449d618f72 100644 --- a/tensorflow/cc/experimental/base/tests/BUILD +++ b/tensorflow/cc/experimental/base/tests/BUILD @@ -5,12 +5,22 @@ package( licenses = ["notice"], # Apache 2.0 ) +cc_library( + name = "tensor_types_test_util", + testonly = True, + hdrs = ["tensor_types_test_util.h"], + deps = [ + "//tensorflow/c:tf_datatype", + ], +) + tf_cc_test( name = "tensor_test", srcs = [ "tensor_test.cc", ], deps = [ + ":tensor_types_test_util", "//tensorflow/c:tf_datatype", "//tensorflow/cc/experimental/base/public:status", "//tensorflow/cc/experimental/base/public:tensor", @@ -19,3 +29,22 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +tf_cc_test( + name = "tensorhandle_test", + srcs = [ + "tensorhandle_test.cc", + ], + deps = [ + ":tensor_types_test_util", + "//tensorflow/c:tf_datatype", + "//tensorflow/cc/experimental/base/public:runtime", + "//tensorflow/cc/experimental/base/public:runtime_builder", + "//tensorflow/cc/experimental/base/public:status", + "//tensorflow/cc/experimental/base/public:tensor", + "//tensorflow/cc/experimental/base/public:tensorhandle", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/cc/experimental/base/tests/tensor_test.cc b/tensorflow/cc/experimental/base/tests/tensor_test.cc index 86a50bac5cd..33f9ab637e8 100644 --- a/tensorflow/cc/experimental/base/tests/tensor_test.cc +++ b/tensorflow/cc/experimental/base/tests/tensor_test.cc @@ -16,69 +16,22 @@ limitations under the License. #include "tensorflow/cc/experimental/base/public/tensor.h" #include - -#include +#include #include "tensorflow/c/tf_datatype.h" +#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/test.h" -namespace tensorflow { namespace { -// Each of the following struct types have two members: a kDType that -// corresponds to a TF_Datatype enum value, and a typedef "type" -// of its corresponding C++ type. 
These types allow us to write Dtype-agnostic -// tests via GoogleTest's TypedTests: -// https://github.com/google/googletest/blob/e589a337170554c48bc658cc857cf15080c9eacc/googletest/docs/advanced.md#typed-tests -struct FloatType { - using type = float; - static constexpr TF_DataType kDType = TF_FLOAT; -}; +using tensorflow::experimental::cc::Status; +using tensorflow::experimental::cc::Tensor; -struct DoubleType { - using type = double; - static constexpr TF_DataType kDType = TF_DOUBLE; -}; - -struct Int32Type { - using type = int32_t; - static constexpr TF_DataType kDType = TF_INT32; -}; - -struct UINT8Type { - using type = uint8_t; - static constexpr TF_DataType kDType = TF_UINT8; -}; - -struct INT8Type { - using type = int8_t; - static constexpr TF_DataType kDType = TF_INT8; -}; - -struct INT64Type { - using type = int64_t; - static constexpr TF_DataType kDType = TF_INT64; -}; - -struct UINT16Type { - using type = uint16_t; - static constexpr TF_DataType kDType = TF_UINT16; -}; - -struct UINT32Type { - using type = uint32_t; - static constexpr TF_DataType kDType = TF_UINT32; -}; - -struct UINT64Type { - using type = uint64_t; - static constexpr TF_DataType kDType = TF_UINT64; -}; - -using SimpleTypes = - ::testing::Types; +using SimpleTypes = ::testing::Types< + tensorflow::FloatType, tensorflow::DoubleType, tensorflow::Int32Type, + tensorflow::UINT8Type, tensorflow::INT8Type, tensorflow::INT64Type, + tensorflow::UINT16Type, tensorflow::UINT32Type, tensorflow::UINT64Type>; template class ConstructScalarTensorTest : public ::testing::Test {}; @@ -88,14 +41,13 @@ TYPED_TEST_SUITE(ConstructScalarTensorTest, SimpleTypes); // and verifies the expected dimensions, dtype, value, number of bytes, and // number of elements. TYPED_TEST(ConstructScalarTensorTest, ValidTensorAttributesAfterConstruction) { - cc::Status status; + Status status; TF_DataType dtype = TypeParam::kDType; typename TypeParam::type value = 42; - cc::Tensor tensor = - cc::Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, - /*data=*/&value, - /*len=*/sizeof(value), - /*deleter=*/[](void*, size_t) {}, &status); + Tensor tensor = Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, + /*data=*/&value, + /*len=*/sizeof(value), + /*deleter=*/[](void*, size_t) {}, &status); ASSERT_TRUE(status.ok()) << status.message(); EXPECT_EQ(tensor.dims(), 0); @@ -113,7 +65,7 @@ TYPED_TEST_SUITE(Construct1DTensorTest, SimpleTypes); // and verifies the expected dimensions, dtype, value, number of bytes, and // number of elements. TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { - cc::Status status; + Status status; TF_DataType dtype = TypeParam::kDType; // This is our 1D tensor of varying dtype. 
std::vector value = {42, 100, 0, 1, 4, 29}; @@ -121,7 +73,7 @@ TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { std::vector shape; shape.push_back(value.size()); - cc::Tensor tensor = cc::Tensor::FromBuffer( + Tensor tensor = Tensor::FromBuffer( /*dtype=*/dtype, /*shape=*/shape, /*data=*/value.data(), /*len=*/value.size() * sizeof(typename TypeParam::type), @@ -130,7 +82,7 @@ TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { EXPECT_EQ(tensor.dims(), 1); EXPECT_EQ(tensor.dtype(), dtype); - gtl::ArraySlice tensor_view( + tensorflow::gtl::ArraySlice tensor_view( reinterpret_cast(tensor.data()), value.size()); EXPECT_EQ(tensor_view[0], 42); EXPECT_EQ(tensor_view[1], 100); @@ -152,14 +104,14 @@ TYPED_TEST_SUITE(Construct2DTensorTest, SimpleTypes); // and verifies the expected dimensions, dtype, value, number of bytes, and // number of elements. TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { - cc::Status status; + Status status; TF_DataType dtype = TypeParam::kDType; // This is our 1D tensor of varying dtype. std::vector value = {42, 100, 0, 1, 4, 29}; // Shape is Rank 2 vector with shape 2 x 3. std::vector shape({2, 3}); - cc::Tensor tensor = cc::Tensor::FromBuffer( + Tensor tensor = Tensor::FromBuffer( /*dtype=*/dtype, /*shape=*/shape, /*data=*/value.data(), /*len=*/value.size() * sizeof(typename TypeParam::type), @@ -169,7 +121,7 @@ TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { EXPECT_EQ(tensor.dims(), 2); EXPECT_EQ(tensor.dtype(), dtype); - gtl::ArraySlice tensor_view( + tensorflow::gtl::ArraySlice tensor_view( reinterpret_cast(tensor.data()), value.size()); EXPECT_EQ(tensor_view[0], 42); EXPECT_EQ(tensor_view[1], 100); @@ -185,22 +137,22 @@ TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { TEST(CPPTensorAPI, ConstructTensorFromBuffer) { bool done = false; - cc::Status status; + Status status; std::vector data_vector({12, 14, 20, 18, 39, 42, 100}); { // data_vector is a rank 1 tensor. std::vector shape; shape.push_back(data_vector.size()); - cc::Tensor::DeleterCallback callback = [&done](void* data, size_t len) { + Tensor::DeleterCallback callback = [&done](void* data, size_t len) { done = true; }; - cc::Tensor tensor = - cc::Tensor::FromBuffer(/*dtype=*/TF_INT32, /*shape=*/shape, - /*data=*/data_vector.data(), - /*len=*/data_vector.size() * sizeof(int32_t), - /*deleter=*/callback, &status); + Tensor tensor = + Tensor::FromBuffer(/*dtype=*/TF_INT32, /*shape=*/shape, + /*data=*/data_vector.data(), + /*len=*/data_vector.size() * sizeof(int32_t), + /*deleter=*/callback, &status); ASSERT_TRUE(status.ok()) << status.message(); } // At this point, tensor has been destroyed, and the deleter callback should @@ -209,4 +161,3 @@ TEST(CPPTensorAPI, ConstructTensorFromBuffer) { } } // namespace -} // namespace tensorflow diff --git a/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h b/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h new file mode 100644 index 00000000000..af9cad7529b --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ + +#include + +#include "tensorflow/c/tf_datatype.h" + +namespace tensorflow { + +// Each of the following struct types have two members: a kDType that +// corresponds to a TF_Datatype enum value, and a typedef "type" +// of its corresponding C++ type. These types allow us to write Dtype-agnostic +// tests via GoogleTest's TypedTests: +// https://github.com/google/googletest/blob/e589a337170554c48bc658cc857cf15080c9eacc/googletest/docs/advanced.md#typed-tests +struct FloatType { + using type = float; + static constexpr TF_DataType kDType = TF_FLOAT; +}; + +struct DoubleType { + using type = double; + static constexpr TF_DataType kDType = TF_DOUBLE; +}; + +struct Int32Type { + using type = int32_t; + static constexpr TF_DataType kDType = TF_INT32; +}; + +struct UINT8Type { + using type = uint8_t; + static constexpr TF_DataType kDType = TF_UINT8; +}; + +struct INT8Type { + using type = int8_t; + static constexpr TF_DataType kDType = TF_INT8; +}; + +struct INT64Type { + using type = int64_t; + static constexpr TF_DataType kDType = TF_INT64; +}; + +struct UINT16Type { + using type = uint16_t; + static constexpr TF_DataType kDType = TF_UINT16; +}; + +struct UINT32Type { + using type = uint32_t; + static constexpr TF_DataType kDType = TF_UINT32; +}; + +struct UINT64Type { + using type = uint64_t; + static constexpr TF_DataType kDType = TF_UINT64; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ diff --git a/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc new file mode 100644 index 00000000000..cfeaba4e392 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc @@ -0,0 +1,184 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/experimental/base/public/tensorhandle.h" + +#include +#include + +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/runtime_builder.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" +#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +using tensorflow::experimental::cc::Runtime; +using tensorflow::experimental::cc::RuntimeBuilder; +using tensorflow::experimental::cc::Status; +using tensorflow::experimental::cc::Tensor; +using tensorflow::experimental::cc::TensorHandle; + +using SimpleTypes = ::testing::Types< + tensorflow::FloatType, tensorflow::DoubleType, tensorflow::Int32Type, + tensorflow::UINT8Type, tensorflow::INT8Type, tensorflow::INT64Type, + tensorflow::UINT16Type, tensorflow::UINT32Type, tensorflow::UINT64Type>; + +template +class ConstructScalarTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(ConstructScalarTensorHandleTest, SimpleTypes); + +// This test constructs a scalar tensor for each of the types in "SimpleTypes", +// then wraps it in a TensorHandle. We then unwrap it back into a Tensor, and +// verify the expected dims, dtype, value, num bytes, and num elements. +TYPED_TEST(ConstructScalarTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + typename TypeParam::type value = 42; + Tensor original_tensor = + Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, + /*data=*/&value, + /*len=*/sizeof(value), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 0); + EXPECT_EQ(tensor.dtype(), dtype); + EXPECT_EQ(*reinterpret_cast(tensor.data()), 42); + EXPECT_EQ(tensor.num_bytes(), sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), 1); +} + +template +class Construct1DTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct1DTensorHandleTest, SimpleTypes); + +// This test constructs a 1D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct1DTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. + std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 1 vector. 
+ std::vector shape; + shape.push_back(value.size()); + + Tensor original_tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 1); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +template +class Construct2DTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct2DTensorHandleTest, SimpleTypes); + +// This test constructs a 2D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct2DTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. + std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 2 vector with shape 2 x 3. + std::vector shape({2, 3}); + + Tensor original_tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 2); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/cc/saved_model/experimental/public/concrete_function.h b/tensorflow/cc/saved_model/experimental/public/concrete_function.h index f57ba052f1a..1adaf70b01a 100644 --- a/tensorflow/cc/saved_model/experimental/public/concrete_function.h +++ b/tensorflow/cc/saved_model/experimental/public/concrete_function.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/experimental/public/function_metadata.h" namespace tensorflow { +namespace experimental { namespace cc { // ConcreteFunction is an executable "function" loaded from a SavedModelAPI. 
@@ -54,6 +55,7 @@ inline const FunctionMetadata* ConcreteFunction::GetFunctionMetadata() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h b/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h index bab95278eac..88cb779ef15 100644 --- a/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h +++ b/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/experimental/public/concrete_function.h" namespace tensorflow { +namespace experimental { namespace cc { // ConcreteFunctionList helps convert an opaque pointer to an array of @@ -56,6 +57,7 @@ inline std::vector ConcreteFunctionList::ToVector() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/function_metadata.h b/tensorflow/cc/saved_model/experimental/public/function_metadata.h index c3dcc45af0e..11e1a860d84 100644 --- a/tensorflow/cc/saved_model/experimental/public/function_metadata.h +++ b/tensorflow/cc/saved_model/experimental/public/function_metadata.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" namespace tensorflow { +namespace experimental { namespace cc { // FunctionMetadata stores additional function information, including @@ -40,6 +41,7 @@ class FunctionMetadata final { }; } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h index 814479de213..04018bf2aab 100644 --- a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h +++ b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/experimental/public/concrete_function_list.h" namespace tensorflow { +namespace experimental { namespace cc { // SavedModelAPI offers a way to load Tensorflow Saved Models @@ -155,6 +156,7 @@ inline std::vector SavedModelAPI::ListFunctions() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ diff --git a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc index 155c58604bf..7f7f6b09a6d 100644 --- a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc +++ b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc @@ -26,10 +26,14 @@ limitations under the License. 
#include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/platform/test.h" -namespace tensorflow { namespace { +using tensorflow::experimental::cc::Runtime; +using tensorflow::experimental::cc::RuntimeBuilder; +using tensorflow::experimental::cc::SavedModelAPI; +using tensorflow::experimental::cc::Status; + constexpr char kTestData[] = "cc/saved_model/testdata"; std::string SavedModelPath(tensorflow::StringPiece saved_model_dir) { @@ -43,21 +47,21 @@ std::string SavedModelPath(tensorflow::StringPiece saved_model_dir) { class CPPSavedModelAPITest : public ::testing::TestWithParam {}; TEST_P(CPPSavedModelAPITest, LoadsSavedModelWithTags) { - cc::Status status; - cc::RuntimeBuilder builder; + Status status; + RuntimeBuilder builder; bool use_tfrt = GetParam(); if (use_tfrt) { GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. } builder.SetUseTFRT(use_tfrt); - std::unique_ptr runtime = builder.Build(&status); + std::unique_ptr runtime = builder.Build(&status); ASSERT_TRUE(status.ok()) << status.message(); std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); std::unordered_set tags = {"serve"}; - std::unique_ptr model = - cc::SavedModelAPI::Load(model_dir, *runtime, &status, &tags); + std::unique_ptr model = + SavedModelAPI::Load(model_dir, *runtime, &status, &tags); // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. // That unblocks writing other tests that require a TF_SavedModel*, @@ -67,20 +71,20 @@ TEST_P(CPPSavedModelAPITest, LoadsSavedModelWithTags) { } TEST_P(CPPSavedModelAPITest, LoadsSavedModel) { - cc::Status status; - cc::RuntimeBuilder builder; + Status status; + RuntimeBuilder builder; bool use_tfrt = GetParam(); if (use_tfrt) { GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. } builder.SetUseTFRT(use_tfrt); - std::unique_ptr runtime = builder.Build(&status); + std::unique_ptr runtime = builder.Build(&status); ASSERT_TRUE(status.ok()) << status.message(); std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); - std::unique_ptr model = - cc::SavedModelAPI::Load(model_dir, *runtime, &status); + std::unique_ptr model = + SavedModelAPI::Load(model_dir, *runtime, &status); // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. 
// That unblocks writing other tests that require a TF_SavedModel*, @@ -94,4 +98,3 @@ INSTANTIATE_TEST_SUITE_P(RuntimeAgnosticCPPSavedModelTests, } // namespace -} // namespace tensorflow From 417b97cd7468830f881a7867192355bd42f8c99d Mon Sep 17 00:00:00 2001 From: Ajay P Date: Tue, 12 May 2020 19:50:13 +0000 Subject: [PATCH 0427/1533] Modified recompute_grad to handle fwd mode diff --- tensorflow/python/eager/forwardprop_test.py | 24 ++++++--- .../python/keras/integration_test/BUILD | 3 +- .../gradient_checkpoint_test.py | 3 +- tensorflow/python/ops/custom_gradient.py | 49 ++++++++++++------- 4 files changed, 53 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index aad179ffb6b..611e9ce2b2a 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -177,7 +177,8 @@ def _test_gradients(testcase, order, delta=1e-3, rtol=1e-2, - atol=1e-6): + atol=1e-6, + recompute=False): """Tests forward/backward jacobians of `f`'s [0, `order`)-order gradients.""" if order < 1: raise ValueError( @@ -190,14 +191,20 @@ def _test_gradients(testcase, order=order - 1, delta=delta, rtol=rtol, - atol=atol) + atol=atol, + recompute=recompute) sym_jac_back, num_jac = gradient_checker_v2.compute_gradient( f, primals, delta=delta) testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) - sym_jac_fwd = _jacfwd(f, primals) - testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) - # And the symbolic computations should be much closer. - testcase.assertAllClose(sym_jac_back, sym_jac_fwd) + if not recompute: + sym_jac_fwd = _jacfwd(f, primals) + testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) + # And the symbolic computations should be much closer. + testcase.assertAllClose(sym_jac_back, sym_jac_fwd) + else: + with testcase.assertRaisesRegexp(ValueError, + "recompute_grad tried to transpose"): + sym_jac_fwd = _jacfwd(f, primals) class ForwardpropTest(test.TestCase, parameterized.TestCase): @@ -357,7 +364,10 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): def f(x): return math_ops.reduce_prod(math_ops.tanh(x)**2) - _test_gradients(self, f, [constant_op.constant([1.])], order=3) + _test_gradients(self, + f, [constant_op.constant([1.])], + order=3, + recompute=True) def testExceptionInCustomGradientNotSwallowed(self): diff --git a/tensorflow/python/keras/integration_test/BUILD b/tensorflow/python/keras/integration_test/BUILD index f92f9d14685..b7d9957a12e 100644 --- a/tensorflow/python/keras/integration_test/BUILD +++ b/tensorflow/python/keras/integration_test/BUILD @@ -2,6 +2,7 @@ # Contains Keras integration tests that verify with other TF high level APIs. 
load("//tensorflow:tensorflow.bzl", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "cuda_py_test") package( default_visibility = [ @@ -71,7 +72,7 @@ tf_py_test( ], ) -tf_py_test( +cuda_py_test( name = "gradient_checkpoint_test", srcs = ["gradient_checkpoint_test.py"], python_version = "PY3", diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py index df23c3abff5..92c53b3ab70 100644 --- a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -127,7 +127,8 @@ def _train_with_recompute(n_steps): model2_re = tf.recompute_grad(model2) model3_re = tf.recompute_grad(model3) optimizer = optimizers.SGD() - tr_vars = model1.trainable_variables + model2.trainable_variables + model3.trainable_variables + tr_vars = (model1.trainable_variables + model2.trainable_variables + + model3.trainable_variables) losses = [] for _ in range(n_steps): with tf.GradientTape() as tape: diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index a5013062936..e32c0820e93 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -482,27 +482,42 @@ def recompute_grad(f): def inner(*args, **kwargs): """Inner function closure for calculating gradients.""" current_var_scope = variable_scope.get_variable_scope() - with tape_lib.stop_recording(): result = f(*args, **kwargs) - + @custom_gradient def grad(*dresult, **grad_kwargs): - """Gradient function calculation for inner function.""" - variables = grad_kwargs.get("variables") - with backprop.GradientTape() as t: - id_args = [gen_array_ops.identity(x) for x in args] - t.watch(id_args) + """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" + + def grad_eval(): + """Gradient function calculation for reverse mode autodiff.""" + variables = grad_kwargs.get("variables") + with backprop.GradientTape() as t: + id_args = [gen_array_ops.identity(x) for x in args] + t.watch(id_args) + if variables is not None: + t.watch(variables) + with ops.control_dependencies(dresult): + with variable_scope.variable_scope(current_var_scope): + result = f(*id_args, **kwargs) + kw_vars = [] if variables is not None: - t.watch(variables) - with ops.control_dependencies(dresult): - with variable_scope.variable_scope(current_var_scope): - result = f(*id_args, **kwargs) - kw_vars = [] - if variables is not None: - kw_vars = list(variables) - grads = t.gradient( - result, list(id_args) + kw_vars, output_gradients=dresult) - return grads[:len(id_args)], grads[len(id_args):] + kw_vars = list(variables) + grads = t.gradient(result, + list(id_args) + kw_vars, + output_gradients=dresult) + if len(grads) == 1 and None in grads: + return 0 + return grads[:len(id_args)], grads[len(id_args):] + + def transpose(*t_args, **t_kwargs): + """Gradient function calculation for forward mode autodiff.""" + # Just throw an error since gradients / activations are not stored on tape for recompute. + raise ValueError( + "recompute_grad tried to transpose {}." + "Consider not using recompute_grad in forward mode autodiff".format( + f.__name__)) + + return grad_eval(), transpose return result, grad From 9eac27f8bb3404567d6db6698c3163e12f09d960 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 12 May 2020 12:55:58 -0700 Subject: [PATCH 0428/1533] Add Int8/BF16 1D un-tiled layout support for TPU. Host can transfer data to/from device directly in host layout and pack/unpack will be done on device side. PiperOrigin-RevId: 311184816 Change-Id: Ib08ef8ec0c3189455b3459af223a2960ca46a0ac --- .../compiler/xla/service/hlo_verifier.cc | 6 ++++-- .../compiler/xla/service/layout_assignment.cc | 10 ++------- .../xla/service/layout_assignment_test.cc | 21 ------------------- 3 files changed, 6 insertions(+), 31 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 360c8e50d55..d15a36532eb 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -662,9 +662,11 @@ Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) { shape_size_function_(bitcast->operand(0)->shape())) { return InternalError( "Bitcast cannot have different shape sizes of output (%d) and operand " - "(%d)", + "(%d) (%s) (%s)", shape_size_function_(bitcast->shape()), - shape_size_function_(bitcast->operand(0)->shape())); + shape_size_function_(bitcast->operand(0)->shape()), + bitcast->shape().ToString(true), + bitcast->operand(0)->shape().ToString(true)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 84654bf3213..13699f3adf9 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -951,7 +951,8 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { if (!Shape::Equal() .IgnoreDynamicDimension() .MinorToMajorOnlyInLayout()(instruction_subshape, - buffer->shape())) { + buffer->shape()) && + instruction->opcode() != HloOpcode::kBitcast) { return InternalError( "Layout of instruction %s at index {%s} does not match " "source LogicalBuffer %s: %s vs %s", @@ -1798,13 +1799,6 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // potential bugs in the layout assignment pass that may accidentally use the // existing layout. for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kBitcast) { - // bitcasts are inherently layout sensitive and so a bitcast instruction - // present in the IR before layout assignment is a bug. - return InternalError( - "Unexpected bitcast operation seen during layout assignment: %s.", - instruction->ToString()); - } // Some instructions carry mandatory layouts in their shape. 
if (instruction->opcode() != HloOpcode::kInfeed && !IsLayoutConstrainedCustomCall(instruction) && diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 304a80c7a52..6e575247e6b 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -814,27 +814,6 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { EXPECT_THAT(false_result->opcode(), HloOpcode::kCopy); } -TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { - auto builder = HloComputation::Builder(TestName()); - auto constant0 = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( - {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); - builder.AddInstruction( - HloInstruction::CreateBitcast(constant0->shape(), constant0)); - auto m = CreateNewVerifiedModule(); - m->AddEntryComputation(builder.Build()); - - ComputationLayout computation_layout( - m->entry_computation()->ComputeProgramShape()); - LayoutAssignment layout_assignment(&computation_layout); - Status error_status = layout_assignment.Run(m.get()).status(); - EXPECT_FALSE(error_status.ok()); - EXPECT_THAT( - error_status.error_message(), - ::testing::HasSubstr( - "Unexpected bitcast operation seen during layout assignment")); -} - TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { // Pin non matching layouts to parameter and root. const char* module_str = R"( From e036f1bd8f5a3f64276f9c79892998f770598337 Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Tue, 12 May 2020 12:56:18 -0700 Subject: [PATCH 0429/1533] Code cleanup: Use the combined left-or-right shift instruction. The Xtensa compiler probably already did this optimization, as there is absolutely no difference in the generated binary. PiperOrigin-RevId: 311184878 Change-Id: I28891223b89987bd23304701a210c2c6d49ab7f2 --- .../lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h index 2ed3e45ece1..918192c4d8f 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h @@ -65,7 +65,11 @@ inline ae_q56s MultiplyByQuantizedMultiplier(ae_p24x2s x_24x2, ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2); // Shift right if shift amount is positive, left if shift amount is negative. - result_56 = AE_SLAASQ56S(result_56, shift_amount); + if (shift_amount >= 0) { + result_56 = AE_Q56S_SRA(result_56, shift_amount); + } else { + result_56 = AE_Q56S_SLA(result_56, -shift_amount); + } // Round off the bottom 16 bits. // Q48.0 / 2^16 -> Q32.0 aligned to 48 bits. From dd4585014b0d8a4e7a8ed4f63a98b2e0d290a41d Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Tue, 12 May 2020 13:18:53 -0700 Subject: [PATCH 0430/1533] Threadpool api support for misc ops. 
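The kernels touched below (LRN, per-channel requantize, slice) all move to the same pattern: the primitive stops constructing and caching a CPU stream in its constructor, and the op instead builds a stream from its OpKernelContext at Compute() time and passes it into Execute(), so the primitive work runs on the op's threadpool. A minimal sketch of that pattern, assuming an MklPrimitive-style cached class; the MySlicePrimitive name and the trimmed-down body are illustrative and not part of this patch:

    // Primitive side: the constructor only sets up descriptors; no stream
    // is created or stored here any more.
    template <typename T>
    class MySlicePrimitive : public MklPrimitive {
     public:
      explicit MySlicePrimitive(const MklSliceParams& params)
          : MklPrimitive(engine(ENGINE_CPU, 0)) {
        Setup(params);
      }

      // The stream is supplied by the caller on every execution.
      void Execute(const MklSliceParams& params,
                   std::shared_ptr<stream> slice_stream) {
        context_.src_mem->set_data_handle(params.from->get_data_handle());
        context_.dst_mem->set_data_handle(params.to->get_data_handle());
        execute_primitives(context_.slice_primitives, slice_stream,
                           context_.slice_primitives_args);
        // Reset handles so the cached primitive keeps no pointers into
        // this invocation's tensors.
        context_.src_mem->set_data_handle(DummyData);
        context_.dst_mem->set_data_handle(DummyData);
      }
      // ... Setup(), context_, etc. as in the existing class ...
    };

    // Kernel side, inside Compute(OpKernelContext* context): bind a stream
    // to this op's threadpool via the context, then hand it to the
    // cached primitive.
    std::shared_ptr<stream> slice_stream;
    slice_stream.reset(CreateStream(context, prim->GetEngine()));
    prim->Execute(slice_params, slice_stream);
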
--- tensorflow/core/kernels/mkl_lrn_op.cc | 2 +- .../kernels/mkl_requantize_per_channel_op.cc | 7 ++++--- tensorflow/core/kernels/mkl_slice_op.cc | 17 ++++++++--------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc index 532dbaa79b4..a11e7ebcbf5 100644 --- a/tensorflow/core/kernels/mkl_lrn_op.cc +++ b/tensorflow/core/kernels/mkl_lrn_op.cc @@ -88,7 +88,6 @@ class MklLRNOp : public OpKernel { workspace_enabled_ = false; OP_REQUIRES_OK(context, context->GetAttr("workspace_enabled", &workspace_enabled_)); - fwd_stream_.reset(new CPU_STREAM(cpu_engine_)); } void Compute(OpKernelContext* context) override { @@ -169,6 +168,7 @@ class MklLRNOp : public OpKernel { lrn_prim_desc.PRIMITIVE_DESC_SRC, cpu_engine_)); std::vector net; + fwd_stream_.reset(CreateStream(context, cpu_engine_)); #ifdef ENABLE_MKLDNN_V1 net.push_back(lrn_forward(lrn_prim_desc)); std::vector> net_args; diff --git a/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc index f7b72f77cb9..0a0464f648b 100644 --- a/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc +++ b/tensorflow/core/kernels/mkl_requantize_per_channel_op.cc @@ -130,9 +130,10 @@ class MklRequantizePerChannelOp : public OpKernel { GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(input_mem_prim), GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(output_mem_prim), cpu_engine_, reorder_attr); - mkldnn::stream reorder_stream = CPU_STREAM(cpu_engine_); + std::shared_ptr reorder_stream; + reorder_stream.reset(CreateStream(ctx, cpu_engine_)); #ifndef ENABLE_MKLDNN_V1 - reorder_stream.submit( + reorder_stream->submit( {mkldnn::reorder(reorder_pd, *input_mem_prim, *output_mem_prim)}); #else std::unordered_map reorder_args = { @@ -140,7 +141,7 @@ class MklRequantizePerChannelOp : public OpKernel { {MKLDNN_ARG_TO, *output_mem_prim}}; std::unique_ptr reorder_prim( new mkldnn::reorder(reorder_pd)); - reorder_prim->execute(reorder_stream, reorder_args); + reorder_prim->execute(*reorder_stream, reorder_args); #endif // !ENABLE_MKLDNN_V1 Tensor* output_min = nullptr; diff --git a/tensorflow/core/kernels/mkl_slice_op.cc b/tensorflow/core/kernels/mkl_slice_op.cc index 699c3d44eb7..02471f4a6f6 100644 --- a/tensorflow/core/kernels/mkl_slice_op.cc +++ b/tensorflow/core/kernels/mkl_slice_op.cc @@ -181,22 +181,21 @@ template class MklSlicePrimitive : public MklPrimitive { public: explicit MklSlicePrimitive(const MklSliceParams& sliceParams) - : cpu_engine_(ENGINE_CPU, 0) { - context_.slice_stream.reset(new CPU_STREAM(cpu_engine_)); + : MklPrimitive(engine(ENGINE_CPU, 0)) { Setup(sliceParams); } ~MklSlicePrimitive() {} - void Execute(const MklSliceParams& sliceParams) { + void Execute(const MklSliceParams& sliceParams, std::shared_ptr slice_stream) { context_.src_mem->set_data_handle(sliceParams.from->get_data_handle()); context_.dst_mem->set_data_handle(sliceParams.to->get_data_handle()); #ifdef ENABLE_MKLDNN_V1 - execute_primitives(context_.slice_primitives, context_.slice_stream, + execute_primitives(context_.slice_primitives, slice_stream, context_.slice_primitives_args); #else - context_.slice_stream->submit(context_.slice_primitives); + slice_stream->submit(context_.slice_primitives); #endif // We should set it back to DummyData so as to make the primitive @@ -228,8 +227,6 @@ class MklSlicePrimitive : public MklPrimitive { : src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {} } context_; - engine cpu_engine_; - void Setup(const 
MklSliceParams& sliceParams) { // Actually, DummyData will not be used in computation, // because the real data will be filled before execution. @@ -465,7 +462,7 @@ class MklSliceOp : public OpKernel { auto op_md = MklDnnData::CreateBlockedMemDesc(input_dims, input_strides); #ifdef ENABLE_MKLDNN_V1 - src.CheckReorderToOpMem(op_md, cpu_engine); + src.CheckReorderToOpMem(op_md, cpu_engine, context); #else auto op_pd = memory::primitive_desc(op_md, cpu_engine); src.CheckReorderToOpMem(op_pd); @@ -492,7 +489,9 @@ class MklSliceOp : public OpKernel { MklSlicePrimitive* reorder_prim = MklSlicePrimitiveFactory::Get(sliceParams); // Execute slice reorder. - reorder_prim->Execute(sliceParams); + std::shared_ptr slice_stream; + slice_stream.reset(CreateStream(context, reorder_prim->GetEngine())); + reorder_prim->Execute(sliceParams, slice_stream); } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + ", message: " + string(e.message) + ", in file " + From 0fdf345ac2917d48f132d7d1ee96f1ffea464fdb Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Tue, 12 May 2020 13:25:30 -0700 Subject: [PATCH 0431/1533] Threadpool support for quantize, dequantize and transpose op. --- tensorflow/core/kernels/mkl_dequantize_op.cc | 8 ++--- tensorflow/core/kernels/mkl_quantize_op.cc | 31 +++++++++----------- tensorflow/core/kernels/mkl_transpose_op.cc | 4 ++- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/tensorflow/core/kernels/mkl_dequantize_op.cc b/tensorflow/core/kernels/mkl_dequantize_op.cc index 8737581c726..06570c1db1c 100644 --- a/tensorflow/core/kernels/mkl_dequantize_op.cc +++ b/tensorflow/core/kernels/mkl_dequantize_op.cc @@ -155,7 +155,8 @@ class MklDequantizeOp : public OpKernel { // Also it does not define round_nearest (enum). attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest); #endif // !ENABLE_MKLDNN_V1 - stream reorder_stream = CPU_STREAM(cpu_engine); + std::shared_ptr reorder_stream; + reorder_stream.reset(CreateStream(ctx, cpu_engine)); std::vector net; // Create reorder primitive and then execute. 
@@ -169,11 +170,10 @@ class MklDequantizeOp : public OpKernel { reorder_net_args.push_back({{MKLDNN_ARG_FROM, *src.GetUsrMem()}, { MKLDNN_ARG_TO, *dst.GetUsrMem() }}); - execute_primitives(net, std::make_shared(reorder_stream), - reorder_net_args); + execute_primitives(net, reorder_stream, reorder_net_args); #else net.push_back(reorder(reorder_pd, *src.GetUsrMem(), *dst.GetUsrMem())); - reorder_stream.submit(net); + reorder_stream->submit(net); #endif // ENABLE_MKLDNN_V1 } catch (mkldnn::error& e) { string error_msg = "Status: " + std::to_string(e.status) + diff --git a/tensorflow/core/kernels/mkl_quantize_op.cc b/tensorflow/core/kernels/mkl_quantize_op.cc index d049b5f58d2..0e583e96530 100644 --- a/tensorflow/core/kernels/mkl_quantize_op.cc +++ b/tensorflow/core/kernels/mkl_quantize_op.cc @@ -77,7 +77,7 @@ class MklReorderWithScalePrimitive : public MklPrimitive { public: explicit MklReorderWithScalePrimitive( const MklReorderWithScaleFwdParams& fwdParams) - : cpu_engine_(ENGINE_CPU, 0) { + : MklPrimitive(engine(ENGINE_CPU, 0)) { // Create reorder primitive Setup(fwdParams); } @@ -86,14 +86,14 @@ class MklReorderWithScalePrimitive : public MklPrimitive { std::shared_ptr GetPrimitive() { return context_.reorder_prim; } - void Execute(void* src_data, void* dst_data) { + void Execute(void* src_data, void* dst_data, + std::shared_ptr reorder_stream) { context_.src_mem->set_data_handle(src_data); context_.dst_mem->set_data_handle(dst_data); #ifndef ENABLE_MKLDNN_V1 - context_.reorder_stream->submit(context_.net); + reorder_stream->submit(context_.net); #else - context_.reorder_prim->execute(*context_.reorder_stream, - context_.prim_args); + context_.reorder_prim->execute(*reorder_stream, context_.prim_args); #endif // !ENABLE_MKLDNN_V1 // After execution, set data handle back. 
context_.src_mem->set_data_handle(DummyData); @@ -124,12 +124,9 @@ class MklReorderWithScalePrimitive : public MklPrimitive { : src_mem(nullptr), dst_mem(nullptr), reorder_pd(nullptr), - reorder_prim(nullptr), - reorder_stream(nullptr) {} + reorder_prim(nullptr) {} } context_; - engine cpu_engine_; - // Reorder primitive setup void Setup(const MklReorderWithScaleFwdParams& fwdParams) { // Create memory descriptors for reorder data with specified format @@ -163,7 +160,6 @@ class MklReorderWithScalePrimitive : public MklPrimitive { context_.prim_args.insert({MKLDNN_ARG_FROM, *context_.src_mem}); context_.prim_args.insert({MKLDNN_ARG_TO, *context_.dst_mem}); #endif // !ENABLE_MKLDNN_V1 - context_.reorder_stream.reset(new CPU_STREAM(cpu_engine_)); } }; @@ -231,9 +227,8 @@ class MklQuantizeV2Op : public OpKernel { explicit MklQuantizeV2Op(OpKernelConstruction* ctx) : OpKernel(ctx) { string mode_string; OP_REQUIRES_OK(ctx, ctx->GetAttr("mode", &mode_string)); - OP_REQUIRES(ctx, - (mode_string == "MIN_COMBINED" || mode_string == "MIN_FIRST" || - mode_string == "SCALED"), + OP_REQUIRES(ctx, (mode_string == "MIN_COMBINED" || + mode_string == "MIN_FIRST" || mode_string == "SCALED"), errors::InvalidArgument("Mode string must be 'MIN_COMBINED'," " 'MIN_FIRST', or 'SCALED', is '" + mode_string + "'")); @@ -247,9 +242,8 @@ class MklQuantizeV2Op : public OpKernel { string round_mode_string; OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string)); - OP_REQUIRES(ctx, - (round_mode_string == "HALF_AWAY_FROM_ZERO" || - round_mode_string == "HALF_TO_EVEN"), + OP_REQUIRES(ctx, (round_mode_string == "HALF_AWAY_FROM_ZERO" || + round_mode_string == "HALF_TO_EVEN"), errors::InvalidArgument("Round mode string must be " "'HALF_AWAY_FROM_ZERO' or " "'HALF_TO_EVEN', is '" + @@ -491,7 +485,10 @@ class MklQuantizeV2Op : public OpKernel { MklReorderWithScalePrimitive* reorder_prim = MklReorderWithScalePrimitiveFactory::Get(src.GetUsrMem(), dst.GetUsrMem(), fwdParams); - reorder_prim->Execute(src.GetUsrMemDataHandle(), dst.GetUsrMemDataHandle()); + std::shared_ptr cpu_stream; + cpu_stream.reset(CreateStream(ctx, reorder_prim->GetEngine())); + reorder_prim->Execute(src.GetUsrMemDataHandle(), dst.GetUsrMemDataHandle(), + cpu_stream); output_min_tensor->flat()(0) = min_range; output_max_tensor->flat()(0) = max_range; diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc index a82aed0243c..77a68afa752 100644 --- a/tensorflow/core/kernels/mkl_transpose_op.cc +++ b/tensorflow/core/kernels/mkl_transpose_op.cc @@ -144,12 +144,14 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor, std::vector net; #ifdef ENABLE_MKLDNN_V1 + std::shared_ptr transpose_stream; auto* prim = FindOrCreateReorder(in.GetUsrMem(), out.GetUsrMem()); + transpose_stream.reset(CreateStream(context, prim->GetEngine())); net.push_back(*(prim->GetPrimitive())); std::vector net_args; net_args.push_back({{MKLDNN_ARG_FROM, *in.GetUsrMem()}, {MKLDNN_ARG_TO, *out.GetUsrMem()}}); - execute_primitives(net, prim->GetStream(), net_args); + execute_primitives(net, transpose_stream, net_args); #else std::shared_ptr transpose_stream; transpose_stream.reset(new CPU_STREAM(cpu_engine)); From 4db5a38b6e02c45422988b9d04aa4f1f1201eb69 Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Tue, 12 May 2020 13:32:38 -0700 Subject: [PATCH 0432/1533] Threadpool api implementation for concat and fused batchnorm op. 
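The concat and fused batch norm primitives get the same treatment as the other MKL kernels in this series: the stream moves out of the cached primitive and becomes an explicit argument to Execute(), created per call from the OpKernelContext. A rough before/after of the concat call site, distilled from the diff below (shortened, not a compilable excerpt):

    // Before: the primitive owned a stream created at construction time.
    //   context_.fwd_stream.reset(new CPU_STREAM(cpu_engine_));
    //   concat_fwd->Execute(srcs_mem, dst.GetOpMem(), concat_fwd_dims);

    // After: the kernel creates a threadpool-aware stream from its
    // OpKernelContext and passes it in for this execution only.
    std::shared_ptr<stream> fwd_cpu_stream;
    fwd_cpu_stream.reset(CreateStream(context, concat_fwd->GetEngine()));
    concat_fwd->Execute(srcs_mem, dst.GetOpMem(), concat_fwd_dims,
                        fwd_cpu_stream);
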
--- tensorflow/core/kernels/mkl_concat_op.cc | 49 +++++------ .../core/kernels/mkl_fused_batch_norm_op.cc | 85 ++++++++----------- 2 files changed, 59 insertions(+), 75 deletions(-) diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc index d918327ef5f..969e43b4f79 100644 --- a/tensorflow/core/kernels/mkl_concat_op.cc +++ b/tensorflow/core/kernels/mkl_concat_op.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -32,6 +31,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" using mkldnn::concat; using mkldnn::stream; @@ -184,13 +184,12 @@ class EigenConcatBaseOp : public OpKernel { const auto in = values[i]; const bool in_is_scalar = TensorShapeUtils::IsScalar(input_shapes[i]); OP_REQUIRES( - c, - (input_shapes[i].dims() == input_dims) || - (input_is_scalar && in_is_scalar), + c, (input_shapes[i].dims() == input_dims) || + (input_is_scalar && in_is_scalar), errors::InvalidArgument( "ConcatOp : Ranks of all input tensors should match: shape[0] = ", - input_shape.DebugString(), " vs. shape[", i, - "] = ", input_shapes[i].DebugString())); + input_shape.DebugString(), " vs. shape[", i, "] = ", + input_shapes[i].DebugString())); if (in.NumElements() > 0) { int64 inputs_flat_dim1 = in.NumElements() / inputs_flat_dim0; inputs_flat.emplace_back(new typename TTypes::ConstMatrix( @@ -265,8 +264,7 @@ class MklConcatFwdPrimitive : public MklPrimitive { public: explicit MklConcatFwdPrimitive(const MklConcatFwdParams& concat_fwd_dims, const std::vector& srcs_md) - : cpu_engine_(ENGINE_CPU, 0) { - context_.fwd_stream.reset(new CPU_STREAM(cpu_engine_)); + : MklPrimitive(engine(ENGINE_CPU, 0)) { // Create concat primitive Setup(concat_fwd_dims, srcs_md); } @@ -278,7 +276,8 @@ class MklConcatFwdPrimitive : public MklPrimitive { // dst_data: output data buffer of dst void Execute(const std::vector& in_data, const mkldnn::memory& dst_data, - const MklConcatFwdParams& concat_fwd_dims) { + const MklConcatFwdParams& concat_fwd_dims, + std::shared_ptr fwd_stream) { DCHECK_EQ(in_data.size(), context_.data_mem.size()); for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) { context_.data_mem_shdptr[i]->set_data_handle( @@ -292,10 +291,10 @@ class MklConcatFwdPrimitive : public MklPrimitive { } #ifdef ENABLE_MKLDNN_V1 - execute_primitives(context_.fwd_primitives, context_.fwd_stream, + execute_primitives(context_.fwd_primitives, fwd_stream, context_.fwd_primitives_args); #else - context_.fwd_stream->submit(context_.fwd_primitives); + fwd_stream->submit(context_.fwd_primitives); #endif // ENABLE_MKLDNN_V1 // After exec, set data handle back @@ -335,7 +334,6 @@ class MklConcatFwdPrimitive : public MklPrimitive { std::shared_ptr fwd_pd; std::shared_ptr concat_fwd; - std::shared_ptr fwd_stream; std::vector fwd_primitives; #ifdef ENABLE_MKLDNN_V1 @@ -343,10 +341,7 @@ class MklConcatFwdPrimitive : public MklPrimitive { #endif // ENABLE_MKLDNN_V1 ConcatFwdContext() - : dst_mem(nullptr), - fwd_pd(nullptr), - concat_fwd(nullptr), - fwd_stream(nullptr) {} + : dst_mem(nullptr), fwd_pd(nullptr), concat_fwd(nullptr) {} }; // Creates the src and dst memory 
descriptor for mkl concat @@ -417,7 +412,6 @@ class MklConcatFwdPrimitive : public MklPrimitive { } struct ConcatFwdContext context_; - engine cpu_engine_; }; // Class to create/cache the mkl concat primitives based on the @@ -758,7 +752,7 @@ class MklConcatOp : public OpKernel { for (int k = 0; k < input_tensors.size(); k++) { if (input_tensors[k].NumElements() > 0) { srcs[k].CheckReorderToOpMem( - MEMORY_PD_WITHOUT_DATA(srcs_pd[k], cpu_engine)); + MEMORY_PD_WITHOUT_DATA(srcs_pd[k], cpu_engine), context); inputs.push_back(srcs[k].GetOpMem()); } } @@ -796,7 +790,8 @@ class MklConcatOp : public OpKernel { if (dnn_shape_dst.IsMklTensor()) dst_md = dnn_shape_dst.GetMklLayout(); dst.SetUsrMem(dst_md, dst_tensor); - stream concat_stream = CPU_STREAM(cpu_engine); + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, cpu_engine)); #ifdef ENABLE_MKLDNN_V1 auto concat_op = concat(concat_pd); std::unordered_map net_args = { @@ -805,12 +800,12 @@ class MklConcatOp : public OpKernel { for (int i = 0; i < inputs.size(); ++i) { net_args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, inputs[i]}); } - concat_op.execute(concat_stream, net_args); + concat_op.execute(*fwd_cpu_stream, net_args); #else auto concat_op = concat(concat_pd, inputs, dst.GetOpMem()); std::vector net; net.push_back(concat_op); - concat_stream.submit(net).wait(); + fwd_cpu_stream->submit(net).wait(); #endif // ENABLE_MKLDNN_V1 } else { MklConcatFwdPrimitive* concat_fwd = nullptr; @@ -835,9 +830,11 @@ class MklConcatOp : public OpKernel { dst_md = dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout() : dst_md; dst.SetUsrMem(dst_md, dst_tensor); - + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, concat_fwd->GetEngine())); // Execute concat - concat_fwd->Execute(srcs_mem, dst.GetOpMem(), concat_fwd_dims); + concat_fwd->Execute(srcs_mem, dst.GetOpMem(), concat_fwd_dims, + fwd_cpu_stream); } // For quantized concat, min and max outputs are also computed. @@ -868,9 +865,9 @@ class MklConcatOp : public OpKernel { DCHECK(dst_tensor != nullptr) << "Output tensor pointer is NULL"; } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index 115b3597964..04b6e342404 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #ifdef INTEL_MKL #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" @@ -22,6 +21,7 @@ limitations under the License. 
#include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #define GET_FLAG(bn_flag) static_cast(BN_FLAGS::bn_flag) #define IS_SET(cflag) (context_.flags & GET_FLAG(cflag)) @@ -71,13 +71,7 @@ template class MklFusedBatchNormFwdPrimitive : public MklPrimitive { public: explicit MklFusedBatchNormFwdPrimitive(const MklBatchNormFwdParams& fwdParams) - : cpu_engine_(ENGINE_CPU, 0) { -#ifdef ENABLE_MKLDNN_V1 - context_.fwd_stream.reset(new CPU_STREAM(cpu_engine_)); -#else - context_.fwd_stream.reset( - new mkldnn::stream(mkldnn::stream::kind::eager_nostore)); -#endif + : MklPrimitive(engine(ENGINE_CPU, 0)) { if (context_.bn_fwd == nullptr) Setup(fwdParams); } @@ -90,7 +84,8 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { // mean_data: output data buffer of means // variance_data: output data buffer of variances void Execute(const T* src_data, const U* weights_data, T* dst_data, - U* mean_data, U* variance_data) { + U* mean_data, U* variance_data, + std::shared_ptr fwd_stream) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); @@ -106,10 +101,10 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { } #ifdef ENABLE_MKLDNN_V1 // Execute batch-normalization forward primitives. - execute_primitives(context_.fwd_primitives, context_.fwd_stream, - context_.net_args); + execute_primitives(context_.fwd_primitives, fwd_stream, context_.net_args); #else - context_.fwd_stream->submit(context_.fwd_primitives); + fwd_stream.reset(new stream(stream::kind::eager_nostore)); + fwd_stream->submit(context_.fwd_primitives); #endif // ENABLE_MKLDNN_V1 context_.src_mem->set_data_handle(DummyData); @@ -164,7 +159,6 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { // BatchNorm forward primitive. std::shared_ptr bn_fwd; - std::shared_ptr fwd_stream; std::vector fwd_primitives; #ifdef ENABLE_MKLDNN_V1 @@ -179,8 +173,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { dst_mem(nullptr), mean_mem(nullptr), variance_mem(nullptr), - bn_fwd(nullptr), - fwd_stream(nullptr) {} + bn_fwd(nullptr) {} }; void Setup(const MklBatchNormFwdParams& fwdParams) { @@ -323,7 +316,6 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { } struct BatchNormFwdContext context_; - engine cpu_engine_; }; template @@ -419,13 +411,7 @@ template class MklFusedBatchNormBwdPrimitive : public MklPrimitive { public: explicit MklFusedBatchNormBwdPrimitive(const MklBatchNormBwdParams& bwdParams) - : cpu_engine_(ENGINE_CPU, 0) { -#ifdef ENABLE_MKLDNN_V1 - context_.bwd_stream.reset(new CPU_STREAM(cpu_engine_)); -#else - context_.bwd_stream.reset( - new mkldnn::stream(mkldnn::stream::kind::eager_nostore)); -#endif + : MklPrimitive(engine(ENGINE_CPU, 0)) { if (context_.bn_bwd == nullptr) Setup(bwdParams); } @@ -445,7 +431,8 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { // on CPU as of now. 
void Execute(const T* src_data, const U* mean_data, const U* variance_data, const T* diff_dst_data, const U* weights_data, T* diff_src_data, - U* diff_weights_data, U* res_space_data) { + U* diff_weights_data, U* res_space_data, + std::shared_ptr bwd_stream) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.mean_mem->set_data_handle( @@ -467,10 +454,10 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { #ifdef ENABLE_MKLDNN_V1 // Execute backward batch-normalization primitives. DCHECK_EQ(context_.bwd_primitives.size(), context_.net_args.size()); - execute_primitives(context_.bwd_primitives, context_.bwd_stream, - context_.net_args); + execute_primitives(context_.bwd_primitives, bwd_stream, context_.net_args); #else - context_.bwd_stream->submit(context_.bwd_primitives); + bwd_stream.reset(new stream(stream::kind::eager_nostore)); + bwd_stream->submit(context_.bwd_primitives); #endif // ENABLE_MKLDNN_V1 // After execution, set data handle back to DummyData. @@ -523,7 +510,6 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { // Backward batch-normalization primitive. std::shared_ptr bn_bwd; std::vector bwd_primitives; - std::shared_ptr bwd_stream; #ifdef ENABLE_MKLDNN_V1 std::vector> net_args; @@ -536,8 +522,7 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { diff_dst_mem(nullptr), weights_mem(nullptr), diff_weights_mem(nullptr), - diff_src_mem(nullptr), - bwd_stream(nullptr) {} + diff_src_mem(nullptr) {} }; void Setup(const MklBatchNormBwdParams& bwdParams) { @@ -546,7 +531,7 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { ? GET_FLAG(use_scale_shift) : (GET_FLAG(use_scale_shift) | GET_FLAG(use_global_stats)); - // Memory descriptors. +// Memory descriptors. #ifndef ENABLE_MKLDNN_V1 auto src_md = memory::desc({bwdParams.src_dims}, MklDnnType(), bwdParams.src_format); @@ -619,7 +604,6 @@ class MklFusedBatchNormBwdPrimitive : public MklPrimitive { } struct BatchNormBwdContext context_; - engine cpu_engine_; }; template @@ -838,8 +822,10 @@ class MklFusedBatchNormOp : public OpKernel { std::shared_ptr bn_fwd_pd = bn_fwd->GetBatchNormFwdPd(); if (IS_SRC_REORDER_NEEDED(src_md, bn_fwd_pd, bn_fwd)) { src.SetUsrMem(src_md, &src_tensor); - src.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - GET_SRC_DESC_FROM_OP_PD(bn_fwd_pd), cpu_engine_)); + src.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(GET_SRC_DESC_FROM_OP_PD(bn_fwd_pd), + cpu_engine_), + context); src_data = static_cast(src.GetOpMem().get_data_handle()); } else { src_data = static_cast(const_cast(src_tensor.flat().data())); @@ -865,9 +851,10 @@ class MklFusedBatchNormOp : public OpKernel { T* dst_data = dst_tensor->flat().data(); // Execute + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, bn_fwd->GetEngine())); bn_fwd->Execute(src_data, weights_op_data, dst_data, mean_op_data, - variance_op_data); - + variance_op_data, fwd_cpu_stream); float adjust_factor = 1.0; if (is_training_) { size_t orig_size = src_dims[0] * src_dims[2] * src_dims[3]; @@ -881,9 +868,6 @@ class MklFusedBatchNormOp : public OpKernel { auto batch_variance_data = batch_variance_tensor->flat().data(); auto est_mean_data = est_mean_tensor.flat().data(); auto est_variance_data = est_variance_tensor.flat().data(); - - // TODO(intel-tf): Merge the `is_training && exponential_avg_factor == 1` - // case with the `else` (`!is_training`) case if possible. 
if (is_training_) { if (exponential_avg_factor_ == U(1.0)) { for (int k = 0; k < depth_; k++) { @@ -907,9 +891,9 @@ class MklFusedBatchNormOp : public OpKernel { std::memcpy(batch_variance_data, variance_data, depth_ * sizeof(U)); } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -1208,8 +1192,10 @@ class MklFusedBatchNormGradOp : public OpKernel { std::shared_ptr bn_bwd_pd = bn_bwd->GetBatchNormBwdPd(); if (IS_DIFF_DST_REORDER_NEEDED(diff_dst_md, bn_bwd_pd, bn_bwd)) { diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); - diff_dst.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - GET_DIFF_DST_DESC_FROM_OP_PD(bn_bwd_pd), cpu_engine_)); + diff_dst.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(GET_DIFF_DST_DESC_FROM_OP_PD(bn_bwd_pd), + cpu_engine_), + context); diff_dst_data = static_cast(diff_dst.GetOpMem().get_data_handle()); } else { diff_dst_data = @@ -1246,10 +1232,11 @@ class MklFusedBatchNormGradOp : public OpKernel { : nullptr); // Execute + std::shared_ptr bwd_cpu_stream; + bwd_cpu_stream.reset(CreateStream(context, bn_bwd->GetEngine())); bn_bwd->Execute(src_data, mean_data, variance_data, diff_dst_data, weights_data, diff_src_data, diff_weights_data, - res_space_data); - + res_space_data, bwd_cpu_stream); // Allocate output TF tensors diff_scale and diff_shift. Tensor* diff_scale_tensor = nullptr; Tensor* diff_shift_tensor = nullptr; @@ -1266,9 +1253,9 @@ class MklFusedBatchNormGradOp : public OpKernel { reinterpret_cast(diff_weights_data + depth_), depth_ * sizeof(U)); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); From 117c75d3117ac8babe84393dce32dbc2dd2dbe36 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 13:43:09 -0700 Subject: [PATCH 0433/1533] Add layout config to HloModuleConfig. PiperOrigin-RevId: 311193903 Change-Id: I9b6680c5a9919804e449ed617bd6bc310800183e --- .../compiler/xla/service/hlo_module_config.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index 61ea8392d94..833d0fe59d0 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -204,6 +204,14 @@ class HloModuleConfig { std::vector>* mutable_dot_config() { return &dot_config_; } + absl::Span>> layout_config() const { + return layout_config_; + } + + std::vector>>* mutable_layout_config() { + return &layout_config_; + } + private: // If you add new members, be sure to update compilation_cache_key. 
@@ -241,6 +249,9 @@ class HloModuleConfig { FusionConfigCollection fusion_config_collection_ = FusionConfigCollection::kOff; + // TODO(b/155665133): Consolidate fusion, dot, and layout config into a proto + // similar to backend config. + // Custom fusion configuration, where fusion_config_[c][v] control if node v // in computation c must be fused to all its consumers (true) or not (false). std::vector> fusion_config_; @@ -249,6 +260,10 @@ class HloModuleConfig { // how to convert dot operation v (sorted topologically and by computation) to // convolution. std::vector> dot_config_; + + // Layout configuration, where layout_config_[v][i] controls the layout + // decision i of operation v. + std::vector>> layout_config_; }; } // namespace xla From fd895bf2b98250929a442e5cf689f6bb272ac52c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 13:47:34 -0700 Subject: [PATCH 0434/1533] [BUILD] Create a separate BUILD file for "tensorflow/core/protobuf"`. This change leaves all existing targets in "tensorflow/core/BUILD" in place, with some becoming aliases. In future, we will remove aliases and point to the new locations. PiperOrigin-RevId: 311194740 Change-Id: Id413277651b260641c1c2e06cb54d16629e6e662 --- tensorflow/compiler/xla/service/gpu/BUILD | 8 +- tensorflow/core/BUILD | 194 +++++++----------- tensorflow/core/lib/core/BUILD | 9 +- tensorflow/core/platform/BUILD | 2 +- .../core/platform/default/build_config.bzl | 4 +- tensorflow/core/protobuf/BUILD | 182 ++++++++++++++++ tensorflow/go/saved_model.go | 2 +- tensorflow/go/signature.go | 2 +- tensorflow/go/signature_test.go | 2 +- tensorflow/python/BUILD | 24 +-- 10 files changed, 282 insertions(+), 147 deletions(-) create mode 100644 tensorflow/core/protobuf/BUILD diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 8f8263a85f9..bff8734de5f 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -686,7 +686,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:autotuning_proto_cc", + "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/util/proto:proto_utils", @@ -722,7 +722,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:autotuning_proto_cc", + "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", @@ -1676,7 +1676,7 @@ tf_proto_library_cc( protodeps = [ "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/core:autotuning_proto", + "//tensorflow/core/protobuf:autotuning_proto", ], ) @@ -1687,8 +1687,8 @@ cc_library( deps = [ ":gpu_autotuning_proto_cc", "//tensorflow/compiler/xla:debug_options_flags", - "//tensorflow/core:autotuning_proto_cc", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/protobuf:autotuning_proto_cc", "@com_google_absl//absl/container:flat_hash_map", ], ) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index a655a9509d3..6b4874a8393 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -105,7 +105,6 @@ load("//tensorflow:tensorflow.bzl", "tf_monitoring_deps") # For platform specific build 
config load( "//tensorflow/core/platform:build_config.bzl", - "tf_additional_all_protos", "tf_additional_lib_deps", "tf_additional_test_deps", "tf_jspb_proto_library", @@ -114,11 +113,9 @@ load( "tf_portable_deps_no_runtime", "tf_portable_proto_lib", "tf_proto_library", - "tf_proto_library_cc", "tf_protos_all_impl", "tf_protos_grappler_impl", "tf_protos_profiler_impl", - "tf_pyclif_proto_library", ) load( "//tensorflow/core/platform:rules_cc.bzl", @@ -181,18 +178,18 @@ package_group(name = "friends") # filegroup; e.g. ones with individual proto_library targets. # LINT.IfChange COMMON_PROTO_SRCS = [ - "protobuf/bfc_memory_map.proto", - "protobuf/config.proto", - "protobuf/cluster.proto", - "protobuf/debug.proto", - "protobuf/device_filters.proto", - "protobuf/device_properties.proto", - "protobuf/graph_debug_info.proto", - "protobuf/queue_runner.proto", - "protobuf/rewriter_config.proto", - "protobuf/tensor_bundle.proto", - "protobuf/saver.proto", - "protobuf/verifier_config.proto", + "//tensorflow/core/protobuf:bfc_memory_map.proto", + "//tensorflow/core/protobuf:config.proto", + "//tensorflow/core/protobuf:cluster.proto", + "//tensorflow/core/protobuf:debug.proto", + "//tensorflow/core/protobuf:device_filters.proto", + "//tensorflow/core/protobuf:device_properties.proto", + "//tensorflow/core/protobuf:graph_debug_info.proto", + "//tensorflow/core/protobuf:queue_runner.proto", + "//tensorflow/core/protobuf:rewriter_config.proto", + "//tensorflow/core/protobuf:tensor_bundle.proto", + "//tensorflow/core/protobuf:saver.proto", + "//tensorflow/core/protobuf:verifier_config.proto", ] EXAMPLE_PROTO_SRCS = [ @@ -239,7 +236,7 @@ PROFILER_PROTO_SRCS = [ ] ERROR_CODES_PROTO_SRCS = [ - "protobuf/error_codes.proto", + "//tensorflow/core/protobuf:error_codes.proto", "//tensorflow/core/lib/core:error_codes.proto", ] # LINT.ThenChange(//tensorflow/core/portable_proto_config.asciipb) @@ -252,11 +249,13 @@ tf_proto_library( cc_api_version = 2, make_default_target_header_only = True, protodeps = [ - ":core_protos", - ":error_codes_proto_impl", "//tensorflow/core/example:protos_all", "//tensorflow/core/framework:protos_all", "//tensorflow/core/lib/core:error_codes_proto", + "//tensorflow/core/profiler/protobuf:xplane_proto", + "//tensorflow/core/profiler:profiler_options_proto", + "//tensorflow/core/protobuf:error_codes_proto_impl", + "//tensorflow/core/protobuf:for_core_protos", "//tensorflow/core/util:protos_all", "//tensorflow/core/util:test_log_proto_impl", ], @@ -1603,20 +1602,13 @@ alias( [ alias( name = "protobuf_%s_pyclif%s" % (proto_name, target_suffix), - actual = ":protobuf/%s_pyclif%s" % (proto_name, target_suffix), + actual = "//tensorflow/core/protobuf:%s_pyclif%s" % (proto_name, target_suffix), visibility = ["//visibility:public"], ) for target_suffix in [ "", "_pb2", ] - ] + [ - tf_pyclif_proto_library( - name = "protobuf/%s_pyclif" % proto_name, - proto_lib = ":protos_all", - proto_srcfile = "protobuf/%s.proto" % proto_name, - visibility = ["//visibility:public"], - ), ] for proto_name in [ "config", @@ -1630,77 +1622,74 @@ alias( # ----------------------------------------------------------------------------- # Internal targets -tf_proto_library( +alias( name = "autotuning_proto", - srcs = ["protobuf/autotuning.proto"], - cc_api_version = 2, - make_default_target_header_only = True, + actual = "//tensorflow/core/protobuf:autotuning_proto", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library( +alias( + name = "autotuning_proto_cc", + actual = 
"//tensorflow/core/protobuf:autotuning_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( name = "conv_autotuning_proto", - srcs = ["protobuf/conv_autotuning.proto"], - cc_api_version = 2, - make_default_target_header_only = True, - protodeps = [ - "//tensorflow/stream_executor:dnn_proto", - ], + actual = "//tensorflow/core/protobuf:conv_autotuning_proto", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "worker_proto", - srcs = ["protobuf/worker.proto"], - cc_api_version = 2, - protodeps = tf_additional_all_protos(), - visibility = ["//visibility:public"], -) - -tf_proto_library_cc( - name = "worker_service_proto", - srcs = ["protobuf/worker_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_stubby_versions = ["2"], - protodeps = [":worker_proto"], +alias( + name = "conv_autotuning_proto_cc", + actual = "//tensorflow/core/protobuf:conv_autotuning_proto_cc", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "master_proto", - srcs = ["protobuf/master.proto"], - cc_api_version = 2, - protodeps = tf_additional_all_protos(), - visibility = ["//tensorflow:internal"], -) - -tf_proto_library_cc( - name = "master_service_proto", - srcs = ["protobuf/master_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_stubby_versions = ["2"], - protodeps = [":master_proto"], +alias( + name = "worker_proto_cc", + actual = "//tensorflow/core/protobuf:worker_proto_cc", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "eager_service_proto", - srcs = ["protobuf/eager_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_grpc_version = 1, - cc_stubby_versions = ["2"], - protodeps = tf_additional_all_protos(), +alias( + name = "worker_service_proto_cc", + actual = "//tensorflow/core/protobuf:worker_service_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( + name = "master_proto_cc", + actual = "//tensorflow/core/protobuf:master_proto_cc", + visibility = [ + "//learning/brain/frameworks/uptc:__subpackages__", + "//tensorflow:internal", + ], +) + +alias( + name = "master_service_proto_cc", + actual = "//tensorflow/core/protobuf:master_service_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( + name = "eager_service_proto_cc", + actual = "//tensorflow/core/protobuf:eager_service_proto_cc", visibility = [ "//tensorflow:internal", ], @@ -2112,49 +2101,14 @@ cc_library( ], ) -tf_proto_library( +alias( name = "error_codes_proto_impl", - srcs = ["protobuf/error_codes.proto"], - cc_api_version = 2, - make_default_target_header_only = True, + actual = "//tensorflow/core/protobuf:error_codes_proto_impl", ) -tf_proto_library( - name = "core_protos", - srcs = COMMON_PROTO_SRCS + [ - # Protos which are not needed on mobile builds, but should be included - # in protos_all. - # - # Note that some protos are in neither core_proto_srcs nor this - # filegroup; e.g. ones with individual proto_library targets. - "protobuf/control_flow.proto", - # TODO(ebrevdo): Re-enable once CriticalSection is in core. 
- # "protobuf/critical_section.proto", - "protobuf/data/experimental/snapshot.proto", - "protobuf/debug_event.proto", - "protobuf/meta_graph.proto", - "protobuf/named_tensor.proto", - "protobuf/remote_tensor_handle.proto", - "protobuf/saved_model.proto", - "protobuf/saved_object_graph.proto", - "protobuf/struct.proto", - "protobuf/tensorflow_server.proto", - "protobuf/trackable_object_graph.proto", - "protobuf/transport_options.proto", - ], - cc_api_version = 2, - make_default_target_header_only = True, - protodeps = [ - ":error_codes_proto_impl", - "//tensorflow/core/example:protos_all", - "//tensorflow/core/framework:protos_all", - "//tensorflow/core/lib/core:error_codes_proto", - "//tensorflow/core/profiler/protobuf:xplane_proto", - "//tensorflow/core/profiler:profiler_options_proto", - "//tensorflow/core/util:protos_all", - "//tensorflow/core/util:test_log_proto_impl", - ], - visibility = ["//visibility:private"], +alias( + name = "error_codes_proto_impl_cc", + actual = "//tensorflow/core/protobuf:error_codes_proto_impl_cc", ) alias( @@ -2446,13 +2400,9 @@ alias( visibility = ["//visibility:public"], ) -tf_proto_library_cc( - name = "replay_log_proto", - srcs = ["protobuf/replay_log.proto"], - cc_api_version = 2, - protodeps = [ - ":master_proto", - ] + tf_additional_all_protos(), +alias( + name = "replay_log_proto_cc", + actual = "//tensorflow/core/protobuf:replay_log_proto_cc", visibility = [ "//tensorflow:internal", ], diff --git a/tensorflow/core/lib/core/BUILD b/tensorflow/core/lib/core/BUILD index 80ad4943f16..491e4c5e7aa 100644 --- a/tensorflow/core/lib/core/BUILD +++ b/tensorflow/core/lib/core/BUILD @@ -138,10 +138,13 @@ tf_proto_library( cc_api_version = 2, make_default_target_header_only = True, protodeps = [ - "//tensorflow/core:error_codes_proto_impl", + "//tensorflow/core/protobuf:error_codes_proto_impl", ], - visibility = ["//tensorflow/core:__subpackages__"], - exports = ["//tensorflow/core:error_codes_proto_impl"], + visibility = [ + "//tensorflow/core:__subpackages__", + "//tensorflow/core/protobuf:__subpackages__", + ], + exports = ["//tensorflow/core/protobuf:error_codes_proto_impl"], ) # Export source files needed for mobile builds, which do not use granular targets. 
diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 819f8fcdadb..c7ff378d2ac 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -621,7 +621,7 @@ cc_library( ":stringpiece", ":stringprintf", ":types", - "//tensorflow/core:error_codes_proto_impl_cc", + "//tensorflow/core/protobuf:error_codes_proto_impl_cc", "@com_google_absl//absl/base", ], ) diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index fd6e78addce..2dc4fdc0fd9 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -577,8 +577,8 @@ def tf_additional_all_protos(): def tf_protos_all_impl(): return [ - clean_dep("//tensorflow/core:autotuning_proto_cc_impl"), - clean_dep("//tensorflow/core:conv_autotuning_proto_cc_impl"), + clean_dep("//tensorflow/core/protobuf:autotuning_proto_cc_impl"), + clean_dep("//tensorflow/core/protobuf:conv_autotuning_proto_cc_impl"), clean_dep("//tensorflow/core:protos_all_cc_impl"), ] diff --git a/tensorflow/core/protobuf/BUILD b/tensorflow/core/protobuf/BUILD new file mode 100644 index 00000000000..a374c808a14 --- /dev/null +++ b/tensorflow/core/protobuf/BUILD @@ -0,0 +1,182 @@ +# For platform specific build config +load( + "//tensorflow/core/platform:build_config.bzl", + "tf_additional_all_protos", + "tf_proto_library", + "tf_proto_library_cc", + "tf_pyclif_proto_library", +) + +package( + default_visibility = [ + "//tensorflow:internal", + "//tensorflow/core:__subpackages__", + "//tensorflow_models:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) + +COMMON_PROTO_SRCS = [ + "bfc_memory_map.proto", + "config.proto", + "cluster.proto", + "debug.proto", + "device_filters.proto", + "device_properties.proto", + "graph_debug_info.proto", + "queue_runner.proto", + "rewriter_config.proto", + "tensor_bundle.proto", + "saver.proto", + "verifier_config.proto", +] + +[ + [ + tf_pyclif_proto_library( + name = "%s_pyclif" % proto_name, + proto_lib = ":for_core_protos", + proto_srcfile = "%s.proto" % proto_name, + visibility = ["//visibility:public"], + ), + ] + for proto_name in [ + "config", + "device_properties", + "graph_debug_info", + "meta_graph", + "saved_model", + ] +] + +tf_proto_library( + name = "autotuning_proto", + srcs = ["autotuning.proto"], + cc_api_version = 2, + make_default_target_header_only = True, +) + +tf_proto_library( + name = "conv_autotuning_proto", + srcs = ["conv_autotuning.proto"], + cc_api_version = 2, + make_default_target_header_only = True, + protodeps = [ + "//tensorflow/stream_executor:dnn_proto", + ], +) + +tf_proto_library_cc( + name = "worker_proto", + srcs = ["worker.proto"], + cc_api_version = 2, + protodeps = tf_additional_all_protos(), + visibility = ["//visibility:public"], +) + +tf_proto_library_cc( + name = "worker_service_proto", + srcs = ["worker_service.proto"], + has_services = 1, + cc_api_version = 2, + cc_stubby_versions = ["2"], + protodeps = [":worker_proto"], +) + +tf_proto_library_cc( + name = "master_proto", + srcs = ["master.proto"], + cc_api_version = 2, + protodeps = tf_additional_all_protos(), + visibility = ["//tensorflow:internal"], +) + +tf_proto_library_cc( + name = "master_service_proto", + srcs = ["master_service.proto"], + has_services = 1, + cc_api_version = 2, + cc_stubby_versions = ["2"], + protodeps = [":master_proto"], +) + +tf_proto_library_cc( + name = "eager_service_proto", + srcs = ["eager_service.proto"], + has_services = 1, + 
cc_api_version = 2, + cc_grpc_version = 1, + cc_stubby_versions = ["2"], + protodeps = tf_additional_all_protos(), +) + +tf_proto_library_cc( + name = "replay_log_proto", + srcs = ["replay_log.proto"], + cc_api_version = 2, + protodeps = [ + ":master_proto", + ] + tf_additional_all_protos(), +) + +tf_proto_library( + name = "error_codes_proto_impl", + srcs = ["error_codes.proto"], + cc_api_version = 2, + make_default_target_header_only = True, +) + +exports_files( + srcs = ["error_codes.proto"] + COMMON_PROTO_SRCS + [ + # Protos which are not needed on mobile builds, but should be included + # in protos_all. + # + # Note that some protos are in neither core_proto_srcs nor this + # filegroup; e.g. ones with individual proto_library targets. + "control_flow.proto", + # TODO(ebrevdo): Re-enable once CriticalSection is in core. + # "critical_section.proto", + "data/experimental/snapshot.proto", + "debug_event.proto", + "meta_graph.proto", + "named_tensor.proto", + "remote_tensor_handle.proto", + "saved_model.proto", + "saved_object_graph.proto", + "struct.proto", + "tensorflow_server.proto", + "trackable_object_graph.proto", + "transport_options.proto", + ], +) + +tf_proto_library( + name = "for_core_protos", + srcs = COMMON_PROTO_SRCS + [ + # Protos which are not needed on mobile builds, but should be included + # in protos_all. + # + # Note that some protos are in neither core_proto_srcs nor this + # filegroup; e.g. ones with individual proto_library targets. + "control_flow.proto", + # TODO(ebrevdo): Re-enable once CriticalSection is in core. + # "critical_section.proto", + "data/experimental/snapshot.proto", + "debug_event.proto", + "meta_graph.proto", + "named_tensor.proto", + "remote_tensor_handle.proto", + "saved_model.proto", + "saved_object_graph.proto", + "struct.proto", + "tensorflow_server.proto", + "trackable_object_graph.proto", + "transport_options.proto", + ], + cc_api_version = 2, + make_default_target_header_only = True, + protodeps = [ + ":error_codes_proto_impl", + "//tensorflow/core/framework:protos_all", + ], +) diff --git a/tensorflow/go/saved_model.go b/tensorflow/go/saved_model.go index 7aa1e83cbc4..64ae82e3b01 100644 --- a/tensorflow/go/saved_model.go +++ b/tensorflow/go/saved_model.go @@ -22,7 +22,7 @@ import ( "unsafe" "github.com/golang/protobuf/proto" - corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/core_protos_go_proto" + corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto" ) // #include diff --git a/tensorflow/go/signature.go b/tensorflow/go/signature.go index 8aac0e2ec93..c2db0c75247 100644 --- a/tensorflow/go/signature.go +++ b/tensorflow/go/signature.go @@ -16,7 +16,7 @@ limitations under the License. 
package tensorflow -import corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/core_protos_go_proto" +import corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto" // #include "tensorflow/c/c_api.h" import "C" diff --git a/tensorflow/go/signature_test.go b/tensorflow/go/signature_test.go index e6927f3cebd..f9fa8427819 100644 --- a/tensorflow/go/signature_test.go +++ b/tensorflow/go/signature_test.go @@ -20,9 +20,9 @@ import ( "fmt" "testing" - corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/core_protos_go_proto" tspb "github.com/tensorflow/tensorflow/tensorflow/go/core/framework/tensor_shape_go_proto" typb "github.com/tensorflow/tensorflow/tensorflow/go/core/framework/types_go_proto" + corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto" ) func TestSignatureFromProto(t *testing.T) { diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 0b046ea8d61..11da45fbcbb 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -655,15 +655,15 @@ tf_python_pybind_extension( "@com_google_absl//absl/types:optional", ] + if_static( extra_deps = [ - "//tensorflow/core:eager_service_proto_cc", - "//tensorflow/core:master_proto_cc", - "//tensorflow/core:worker_proto_cc", + "//tensorflow/core/protobuf:eager_service_proto_cc", + "//tensorflow/core/protobuf:master_proto_cc", + "//tensorflow/core/protobuf:worker_proto_cc", "//tensorflow/core:version_lib", ], otherwise = [ - "//tensorflow/core:eager_service_proto_cc_headers_only", - "//tensorflow/core:master_proto_cc_headers_only", - "//tensorflow/core:worker_proto_cc_headers_only", + "//tensorflow/core/protobuf:eager_service_proto_cc_headers_only", + "//tensorflow/core/protobuf:master_proto_cc_headers_only", + "//tensorflow/core/protobuf:worker_proto_cc_headers_only", ], ), ) @@ -8049,14 +8049,14 @@ tf_python_pybind_extension( "//tensorflow/core/platform", ] + if_static( extra_deps = [ - "//tensorflow/core:eager_service_proto_cc", - "//tensorflow/core:master_proto_cc", - "//tensorflow/core:worker_proto_cc", + "//tensorflow/core/protobuf:eager_service_proto_cc", + "//tensorflow/core/protobuf:master_proto_cc", + "//tensorflow/core/protobuf:worker_proto_cc", ], otherwise = [ - "//tensorflow/core:eager_service_proto_cc_headers_only", - "//tensorflow/core:master_proto_cc_headers_only", - "//tensorflow/core:worker_proto_cc_headers_only", + "//tensorflow/core/protobuf:eager_service_proto_cc_headers_only", + "//tensorflow/core/protobuf:master_proto_cc_headers_only", + "//tensorflow/core/protobuf:worker_proto_cc_headers_only", ], ), ) From 1186e3f2098793952aa82bf356dfe51b967fb26c Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Tue, 12 May 2020 13:49:58 -0700 Subject: [PATCH 0435/1533] The callback slowness warning has started firing in many situations where only built-in callbacks are called (possibly only due to logging). For the time being, its threshold must be increased. 
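Concretely, a callback hook now has to take more than 1.5x the measured batch time
(previously 0.5x) before the warning fires, and the warning reports both timings.
For example, with a 10 ms batch time a hook must now exceed 15 ms rather than 5 ms
to trigger the message. The updated check, condensed from the hunk below:

    threshold_time = 1.5 * batch_time  # previously 0.5 * batch_time
    warning_msg = ('Callbacks method `{hook}` is slow compared to '
                   'the batch time (batch time: {batch_time:.4f}s vs '
                   '`{hook}` time: {cbk_time:.4f}s). Check your callbacks.')
    if self._timing[end_hook_name] > threshold_time:
      logging.warning(warning_msg.format(
          hook=end_hook_name,
          batch_time=batch_time,
          cbk_time=self._timing[end_hook_name]))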
PiperOrigin-RevId: 311195250 Change-Id: Idff476f4650970b372bc11a25b043825b17742d5 --- tensorflow/python/keras/callbacks.py | 16 +++++++++++----- tensorflow/python/keras/callbacks_test.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 6748a572805..db326ea32f0 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -307,14 +307,20 @@ class CallbackList(object): end_hook_name = hook_name begin_hook_name = 'on_{mode}_batch_begin'.format(mode=mode) - threshold_time = 0.5 * batch_time + threshold_time = 1.5 * batch_time warning_msg = ('Callbacks method `{hook}` is slow compared to ' - 'the batch time. Check your callbacks.') + 'the batch time (batch time: {batch_time:.4f}s vs ' + '`{hook}` time: {cbk_time:.4f}s). Check your callbacks.') if self._timing[begin_hook_name] > threshold_time: - logging.warning(warning_msg.format(hook=begin_hook_name)) + logging.warning(warning_msg.format( + hook=begin_hook_name, + batch_time=batch_time, + cbk_time=self._timing[begin_hook_name])) if self._timing[end_hook_name] > threshold_time: - logging.warning(warning_msg.format(hook=end_hook_name)) - + logging.warning(warning_msg.format( + hook=end_hook_name, + batch_time=batch_time, + cbk_time=self._timing[end_hook_name])) self._check_timing = False self._batch_start_time = None diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 9d15f87ed79..2f1256ee3ee 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -302,8 +302,8 @@ class KerasCallbacksTest(keras_parameterized.TestCase): epochs=10, callbacks=[SleepCallback()]) warning_msg = ('Callbacks method `on_train_batch_end` is slow compared ' - 'to the batch time. Check your callbacks.') - self.assertIn(warning_msg, warning_messages) + 'to the batch time') + self.assertIn(warning_msg, '\n'.join(warning_messages)) @keras_parameterized.run_with_all_model_types(exclude_models='functional') @keras_parameterized.run_all_keras_modes From 1638fe218d6003345460e33b7a38a8a322887d79 Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Tue, 12 May 2020 14:04:36 -0700 Subject: [PATCH 0436/1533] Fix for adhering to latest clang style guide. PiperOrigin-RevId: 311197936 Change-Id: I014b041ff03f656587651da9a4977688d501d330 --- tensorflow/core/framework/shape_inference_testutil.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h index 40a6d53d223..361f7ed13c1 100644 --- a/tensorflow/core/framework/shape_inference_testutil.h +++ b/tensorflow/core/framework/shape_inference_testutil.h @@ -16,6 +16,7 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_ #include + #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/lib/core/status.h" @@ -90,7 +91,7 @@ class ShapeInferenceTestutil { ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \ op, i, "e") \ .error_message(); \ - const std::string& substring = error_substring; \ + const std::string substring = error_substring; \ EXPECT_NE("", error_message); \ EXPECT_TRUE(absl::StrContains(error_message, substring)) \ << "Expected to see '" << substring << "' in '" << error_message \ From f581c55e4d01e4ebdeaebf6c095aff547745d893 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Tue, 12 May 2020 14:16:20 -0700 Subject: [PATCH 0437/1533] Introduce persistent, read-only TFLite tensor type Several operators (rank, shape) are critical for preserving the ability to resize graphs correctly at runtime. However, introduction of such ops in the graph currently makes it impossible to fully propagate shapes when tensors are allocated. This also prevents delegation of the graph for most delegates, as it introduces dynamic shapes. Introduce a new, persistent tensor type that can be treated as "constant" at the time of TfLiteRegistration::Prepare. This tensor type is allocated immediately when requested, similar to a dynamic tensor, but promises that its contents will be populated after the "producing" node is prepared, and that it won't change across subsequent evals. Update Rank/Shape operators to use this tensor allocation type. A follow-up CL will introduce a new pseudo-constant tensor check that can be used by various kernels to avoid making them dynamic. PiperOrigin-RevId: 311199934 Change-Id: I050704be7d1ff264fc1a852efade53d4021cb034 --- tensorflow/lite/c/common.c | 6 ++- tensorflow/lite/c/common.h | 14 +++++-- tensorflow/lite/core/subgraph.cc | 9 +++-- tensorflow/lite/kernels/kernel_util.h | 12 ++++++ tensorflow/lite/kernels/rank.cc | 18 ++++++--- tensorflow/lite/kernels/rank_test.cc | 13 +++++-- tensorflow/lite/kernels/shape.cc | 17 +++++--- tensorflow/lite/kernels/shape_test.cc | 13 +++++-- .../lite/micro/micro_optional_debug_tools.cc | 2 + tensorflow/lite/optional_debug_tools.cc | 2 + tensorflow/lite/python/lite_test.py | 39 +++++++++++++++++-- .../benchmark/experimental/c/c_api_types.h | 14 +++++-- 12 files changed, 129 insertions(+), 30 deletions(-) diff --git a/tensorflow/lite/c/common.c b/tensorflow/lite/c/common.c index f70a60002dd..e6b47896528 100644 --- a/tensorflow/lite/c/common.c +++ b/tensorflow/lite/c/common.c @@ -79,7 +79,8 @@ TfLiteFloatArray* TfLiteFloatArrayCreate(int size) { void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); } void TfLiteTensorDataFree(TfLiteTensor* t) { - if (t->allocation_type == kTfLiteDynamic) { + if (t->allocation_type == kTfLiteDynamic || + t->allocation_type == kTfLitePersistentRo) { free(t->data.raw); } t->data.raw = NULL; @@ -172,7 +173,8 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, } void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) { - if (tensor->allocation_type != kTfLiteDynamic) { + if (tensor->allocation_type != kTfLiteDynamic && + tensor->allocation_type != kTfLitePersistentRo) { return; } // TODO(b/145340303): Tensor data should be aligned. 
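A kernel whose output can be fully computed at Prepare time opts into the new
allocation type and writes the value immediately, so downstream ops can treat it as
a constant input during their own Prepare. A sketch of the pattern, mirroring the
Rank kernel change in this patch (input/output count checks and the
unsupported-type error path elided):

    TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
      const TfLiteTensor* input = GetInput(context, node, kInputTensor);
      TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
      output->type = kTfLiteInt32;
      // Allocate now and mark read-only: the value is filled in below and
      // will not change across Eval() calls.
      SetTensorToPersistentRo(output);
      TF_LITE_ENSURE_STATUS(
          context->ResizeTensor(context, output, TfLiteIntArrayCreate(0)));
      *GetTensorData<int32_t>(output) = NumDimensions(input);
      return kTfLiteOk;  // Eval() becomes a no-op.
    }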
diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index 9657c7e564c..ab150e87d93 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -321,15 +321,23 @@ typedef union TfLitePtrUnion { void* data; } TfLitePtrUnion; -// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped -// data (or data externally allocated). kTfLiteArenaRw is arena allocated -// data. kTfLiteDynamic is for tensors that are allocated during evaluation. +// Memory allocation strategies. +// * kTfLiteMmapRo: Read-only memory-mapped data, or data externally allocated. +// * kTfLiteArenaRw: Arena allocated with no guarantees about persistence, +// and available during eval. +// * kTfLiteArenaRwPersistent: Arena allocated but persistent across eval, and +// only available during eval. +// * kTfLiteDynamic: Allocated during eval, or for string tensors. +// * kTfLitePersistentRo: Allocated and populated during prepare. This is +// useful for tensors that can be computed during prepare and treated +// as constant inputs for downstream ops (also in prepare). typedef enum TfLiteAllocationType { kTfLiteMemNone = 0, kTfLiteMmapRo, kTfLiteArenaRw, kTfLiteArenaRwPersistent, kTfLiteDynamic, + kTfLitePersistentRo, } TfLiteAllocationType; // The delegates should use zero or positive integers to represent handles. diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 4cebd059a80..7f4e0e286ea 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -1183,7 +1183,8 @@ TfLiteStatus Subgraph::ResizeTensorImpl(TfLiteTensor* tensor, // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too. if (tensor->allocation_type == kTfLiteArenaRw || tensor->allocation_type == kTfLiteDynamic || - tensor->allocation_type == kTfLiteArenaRwPersistent) { + tensor->allocation_type == kTfLiteArenaRwPersistent || + tensor->allocation_type == kTfLitePersistentRo) { tensor_resized_since_op_invoke_ |= TfLiteIntArrayEqual(tensor->dims, new_size) == 0; if (tensor->type != kTfLiteString) { @@ -1195,14 +1196,16 @@ TfLiteStatus Subgraph::ResizeTensorImpl(TfLiteTensor* tensor, return kTfLiteError; } - // Realloc space for kTfLiteDynamic tensors. + // Realloc space for heap-allocated tensors. TfLiteTensorRealloc(bytesRequired, tensor); tensor->bytes = bytesRequired; } if (tensor->dims) TfLiteIntArrayFree(tensor->dims); tensor->dims = new_size; - if (tensor->allocation_type != kTfLiteDynamic) { + // Reset arena-allocated tensors; they will be allocated later. + if (tensor->allocation_type == kTfLiteArenaRw || + tensor->allocation_type == kTfLiteArenaRwPersistent) { tensor->data.raw = nullptr; } } else { diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index ad068ddd3fd..5793b08616d 100644 --- a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -87,6 +87,10 @@ inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, } // Determines whether tensor is constant. +// TODO(b/138199592): Introduce new query which checks for constant OR +// persistent-read-only, which would be useful for most tensor kernels that +// are potentially dynamic based on the input tensor value availability at the +// time of prepare. inline bool IsConstantTensor(const TfLiteTensor* tensor) { return tensor->allocation_type == kTfLiteMmapRo; } @@ -105,6 +109,14 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) { } } +// Sets tensor to persistent and read-only. 
+inline void SetTensorToPersistentRo(TfLiteTensor* tensor) { + if (tensor->allocation_type != kTfLitePersistentRo) { + tensor->allocation_type = kTfLitePersistentRo; + tensor->data.raw = nullptr; + } +} + // Determines whether it is a hybrid op - one that has float inputs and // quantized weights. inline bool IsHybridOp(const TfLiteTensor* input, const TfLiteTensor* weight) { diff --git a/tensorflow/lite/kernels/rank.cc b/tensorflow/lite/kernels/rank.cc index 8e27ebcc325..53fd92f1682 100644 --- a/tensorflow/lite/kernels/rank.cc +++ b/tensorflow/lite/kernels/rank.cc @@ -30,19 +30,23 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); output->type = kTfLiteInt32; + // By design, the input shape is always known at the time of Prepare, even + // if the preceding op that generates |input| is dynamic. Thus, we can + // always compute the rank immediately, without waiting for Eval. + SetTensorToPersistentRo(output); + // Rank produces a 0-D int32 Tensor representing the rank of input. TfLiteIntArray* output_size = TfLiteIntArrayCreate(0); - return context->ResizeTensor(context, output, output_size); -} + TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_size)); -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_EQ(context, NumDimensions(output), 0); + // Immediately propagate the known rank to the output tensor. This allows + // downstream ops that rely on the value to use it during prepare. if (output->type == kTfLiteInt32) { int32_t* output_data = GetTensorData(output); *output_data = NumDimensions(input); @@ -53,6 +57,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + return kTfLiteOk; +} + } // namespace rank TfLiteRegistration* Register_RANK() { diff --git a/tensorflow/lite/kernels/rank_test.cc b/tensorflow/lite/kernels/rank_test.cc index f3dc97126ba..5373a0a66fe 100644 --- a/tensorflow/lite/kernels/rank_test.cc +++ b/tensorflow/lite/kernels/rank_test.cc @@ -43,6 +43,9 @@ class RankOpModel : public SingleOpModel { std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } + TfLiteAllocationType GetOutputAllocationType() const { + return interpreter_->tensor(interpreter_->outputs()[0])->allocation_type; + } private: int input_; @@ -51,6 +54,13 @@ class RankOpModel : public SingleOpModel { TEST(RankOpTest, InputTypeFloat) { RankOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32); + ASSERT_EQ(model.GetOutputAllocationType(), kTfLitePersistentRo); + + // Unlike most ops, Rank populates outputs in Prepare(). + EXPECT_THAT(model.GetOutput(), ElementsAreArray({5})); + EXPECT_TRUE(model.GetOutputShape().empty()); + + // Invoke is superfluous and shouldn't change the output. 
model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({5})); @@ -59,7 +69,6 @@ TEST(RankOpTest, InputTypeFloat) { TEST(RankOpTest, InputTypeInt) { RankOpModel model({1, 3, 1, 3, 5}, TensorType_INT32); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({5})); EXPECT_TRUE(model.GetOutputShape().empty()); @@ -67,7 +76,6 @@ TEST(RankOpTest, InputTypeInt) { TEST(RankOpTest, ScalarTensor) { RankOpModel model({}, TensorType_FLOAT32); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({0})); EXPECT_TRUE(model.GetOutputShape().empty()); @@ -75,7 +83,6 @@ TEST(RankOpTest, ScalarTensor) { TEST(RankOpTest, EmptyTensor) { RankOpModel model({1, 0}, TensorType_FLOAT32); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({2})); EXPECT_TRUE(model.GetOutputShape().empty()); diff --git a/tensorflow/lite/kernels/shape.cc b/tensorflow/lite/kernels/shape.cc index 88794fefac4..d979f083f70 100644 --- a/tensorflow/lite/kernels/shape.cc +++ b/tensorflow/lite/kernels/shape.cc @@ -54,19 +54,22 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } + // By design, the input shape is always known at the time of Prepare, even + // if the preceding op that generates |input| is dynamic. Thus, we can + // always compute the shape immediately, without waiting for Eval. + SetTensorToPersistentRo(output); + // Shape always produces a 1-dimensional output tensor, where each output // element is the length of the corresponding input tensor's dimension. TfLiteIntArray* output_size = TfLiteIntArrayCreate(1); output_size->data[0] = NumDimensions(input); - return context->ResizeTensor(context, output, output_size); -} + TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_size)); -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TFLITE_DCHECK_EQ(NumDimensions(output), 1); TFLITE_DCHECK_EQ(SizeOfDimension(output, 0), NumDimensions(input)); + // Immediately propagate the known shape to the output tensor. This allows + // downstream ops that rely on the value to use it during prepare. switch (output->type) { case kTfLiteInt32: ExtractShape(input, GetTensorData(output)); @@ -81,6 +84,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + return kTfLiteOk; +} + } // namespace shape TfLiteRegistration* Register_SHAPE() { diff --git a/tensorflow/lite/kernels/shape_test.cc b/tensorflow/lite/kernels/shape_test.cc index 6a7dad4d3e0..3eeb83f5000 100644 --- a/tensorflow/lite/kernels/shape_test.cc +++ b/tensorflow/lite/kernels/shape_test.cc @@ -45,6 +45,9 @@ class ShapeOpModel : public SingleOpModel { int32_t GetOutputSize() { return GetTensorSize(output_); } std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } + TfLiteAllocationType GetOutputAllocationType() const { + return interpreter_->tensor(interpreter_->outputs()[0])->allocation_type; + } private: int input_; @@ -54,6 +57,13 @@ class ShapeOpModel : public SingleOpModel { TEST(ShapeOpTest, OutTypeInt) { ShapeOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32, TensorType_INT32); + ASSERT_EQ(model.GetOutputAllocationType(), kTfLitePersistentRo); + + // Unlike most ops, Rank populates outputs in Prepare(). 
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5})); + + // Invoke is superfluous and shouldn't change the output. model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5})); @@ -63,7 +73,6 @@ TEST(ShapeOpTest, OutTypeInt) { TEST(ShapeOpTest, OutTypeInt64) { ShapeOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32, TensorType_INT64); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5})); @@ -71,7 +80,6 @@ TEST(ShapeOpTest, OutTypeInt64) { TEST(ShapeOpTest, ScalarTensor) { ShapeOpModel model({}, TensorType_FLOAT32, TensorType_INT32); - model.Invoke(); EXPECT_EQ(model.GetOutputSize(), 0); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({0})); @@ -79,7 +87,6 @@ TEST(ShapeOpTest, ScalarTensor) { TEST(ShapeOpTest, EmptyTensor) { ShapeOpModel model({1, 0}, TensorType_FLOAT32, TensorType_INT32); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2})); diff --git a/tensorflow/lite/micro/micro_optional_debug_tools.cc b/tensorflow/lite/micro/micro_optional_debug_tools.cc index 70f16c78d79..42c42aea9f8 100644 --- a/tensorflow/lite/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/micro/micro_optional_debug_tools.cc @@ -95,6 +95,8 @@ const char* AllocTypeName(TfLiteAllocationType type) { return "kTfLiteArenaRw"; case kTfLiteArenaRwPersistent: return "kTfLiteArenaRwPersistent"; + case kTfLitePersistentRo: + return "kTfLitePersistentRo"; } return "(invalid)"; } diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc index c5ccdb98390..2e25b0a17f7 100644 --- a/tensorflow/lite/optional_debug_tools.cc +++ b/tensorflow/lite/optional_debug_tools.cc @@ -77,6 +77,8 @@ const char* AllocTypeName(TfLiteAllocationType type) { return "kTfLiteArenaRw"; case kTfLiteArenaRwPersistent: return "kTfLiteArenaRwPersistent"; + case kTfLitePersistentRo: + return "kTfLitePersistentRo"; } return "(invalid)"; } diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 9ddd09edca6..1bcb2ce0ee4 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -269,9 +269,7 @@ class FromSessionTest(TestModels, parameterized.TestCase): [out_tensor]) converter.inference_input_type = lite_constants.QUANTIZED_UINT8 converter.inference_type = lite_constants.FLOAT - converter.quantized_input_stats = { - 'Placeholder': (0., 1.) - } # mean, std_dev + converter.quantized_input_stats = {'Placeholder': (0., 1.)} # mean, std_dev tflite_model = converter.convert() self.assertTrue(tflite_model) @@ -1327,6 +1325,41 @@ class FromSessionTest(TestModels, parameterized.TestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) + def testResizeWithShape(self): + with ops.Graph().as_default(): + # Construct a graph with a dynamically shapped input and an internal node + # that relies on the output of that input's shape. + in_tensor = array_ops.placeholder( + shape=[None, None], dtype=dtypes.float32) + in_tensor2 = [[1, 2], [3, 4]] + out_tensor = array_ops.reshape(in_tensor2, array_ops.shape(in_tensor)) + sess = session.Session() + + converter = lite.TFLiteConverter.from_session(sess, [in_tensor], + [out_tensor]) + converter.experimental_new_converter = True + tflite_model = converter.convert() + + # Check values from converted model. 
+ interpreter = Interpreter(model_content=tflite_model) + input_details = interpreter.get_input_details() + self.assertLen(input_details, 1) + self.assertTrue(([1, 1] == input_details[0]['shape']).all()) + self.assertTrue(([-1, -1] == input_details[0]['shape_signature']).all()) + + # Resize tensor and invoke. + interpreter.resize_tensor_input(0, [4]) + interpreter.allocate_tensors() + interpreter.invoke() + + # The output should be reshaped properly according to the resized input. + output_details = interpreter.get_output_details() + self.assertLen(output_details, 1) + self.assertEqual(np.int32, output_details[0]['dtype']) + self.assertTrue(([4] == output_details[0]['shape']).all()) + output_data = interpreter.get_tensor(output_details[0]['index']) + self.assertTrue(([1, 2, 3, 4] == output_data).all()) + def testResizingIntermediateDynamicTensor(self): # This is a regression test for the case where shape of dynamic output # tensors changes between invocations. diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h index 9657c7e564c..ab150e87d93 100644 --- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h +++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h @@ -321,15 +321,23 @@ typedef union TfLitePtrUnion { void* data; } TfLitePtrUnion; -// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped -// data (or data externally allocated). kTfLiteArenaRw is arena allocated -// data. kTfLiteDynamic is for tensors that are allocated during evaluation. +// Memory allocation strategies. +// * kTfLiteMmapRo: Read-only memory-mapped data, or data externally allocated. +// * kTfLiteArenaRw: Arena allocated with no guarantees about persistence, +// and available during eval. +// * kTfLiteArenaRwPersistent: Arena allocated but persistent across eval, and +// only available during eval. +// * kTfLiteDynamic: Allocated during eval, or for string tensors. +// * kTfLitePersistentRo: Allocated and populated during prepare. This is +// useful for tensors that can be computed during prepare and treated +// as constant inputs for downstream ops (also in prepare). typedef enum TfLiteAllocationType { kTfLiteMemNone = 0, kTfLiteMmapRo, kTfLiteArenaRw, kTfLiteArenaRwPersistent, kTfLiteDynamic, + kTfLitePersistentRo, } TfLiteAllocationType; // The delegates should use zero or positive integers to represent handles. From ec2837b2a112ae3ada2c10173c12ed9b2f129b02 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 14:23:34 -0700 Subject: [PATCH 0438/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311201348 Change-Id: I35ed38a57bbcf68e980b69a50190a033d2b34d4e --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. 
Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 7b73cfa737035ea1934fbfd364a4aae34cf7ceb0 Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Tue, 12 May 2020 14:28:24 -0700 Subject: [PATCH 0439/1533] Update TensorFlow Lite external documentation PiperOrigin-RevId: 311202267 Change-Id: Ibbca1409b33469e8a3d407e330dafc7ae079089d --- .../lite/g3doc/convert/1x_compatibility.md | 131 ++++++------ .../performance/post_training_quantization.md | 190 ++++++++++-------- 2 files changed, 169 insertions(+), 152 deletions(-) diff --git a/tensorflow/lite/g3doc/convert/1x_compatibility.md b/tensorflow/lite/g3doc/convert/1x_compatibility.md index adb2af4d8ad..9f9f277a8d9 100644 --- a/tensorflow/lite/g3doc/convert/1x_compatibility.md +++ b/tensorflow/lite/g3doc/convert/1x_compatibility.md @@ -1,30 +1,32 @@ -# TensorFlow 1.x compatibility +# TensorFlow 1.x Compatibility -The `tf.lite.TFLiteConverter` was updated between TensorFlow 1.X and 2.0. This -document explains the differences between the 1.X and 2.0 versions of the -converter, and provides information about how to use the 1.X version if -required. +The `tf.lite.TFLiteConverter` Python API was updated between TensorFlow 1.x and +2.x. This document explains the differences between the two versions, and +provides information about how to use the 1.x version if required. -## Summary of changes in Python API between 1.X and 2.0 - -The following section summarizes the changes in the Python API from 1.X to 2.0. If any of the changes raise concerns, please file a -[GitHub issue](https://github.com/tensorflow/tensorflow/issues). +[GitHub Issue](https://github.com/tensorflow/tensorflow/issues). -### Formats supported by `TFLiteConverter` +Note: We highly recommend that you +[migrate your TensorFlow 1.x code to TensorFlow 2.x code](https://www.tensorflow.org/guide/migrate) +. -The 2.0 version of the converter supports SavedModel and Keras model files -generated in both 1.X and 2.0. However, the conversion process no longer -supports "frozen graph" `GraphDef` files generated in 1.X. +## Model formats -#### Converting frozen graphs +#### SavedModel and Keras -Users who want to convert frozen graph `GraphDef` files (`.pb` files) to -TensorFlow Lite should use `tf.compat.v1.lite.TFLiteConverter`. +The `tf.lite.TFLiteConverter` API supports SavedModel and Keras HDF5 files +generated in both TensorFlow 1.x and 2.x. -The following snippet shows a frozen graph file being converted: +#### Frozen Graph + +Note: TensorFlow 2.x no longer supports the generation of frozen graph models. 
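A frozen graph must therefore come from existing TensorFlow 1.x code. As a rough illustration (not part of the converter API itself), one way to produce such a file in 1.x is `tf.compat.v1.graph_util.convert_variables_to_constants`; the sketch below assumes an existing 1.x `Session` named `sess` and a placeholder output node named `'output_name'`:

```python
import tensorflow.compat.v1 as tf

# Sketch only: `sess` is assumed to be an existing TF 1.x Session whose graph
# ends in a node named 'output_name' (a placeholder, not a real model).
frozen_graph_def = tf.graph_util.convert_variables_to_constants(
    sess, sess.graph_def, ['output_name'])
with open('frozen_graph.pb', 'wb') as f:
  f.write(frozen_graph_def.SerializeToString())
```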
+ +The `tf.compat.v1.lite.TFLiteConverter` API supports frozen graph models +generated in TensorFlow 1.x, as shown below: ```python +import tensorflow as tf # Path to the frozen graph file graph_def_file = 'frozen_graph.pb' # A list of the names of the model's input tensors @@ -32,70 +34,68 @@ input_arrays = ['input_name'] # A list of the names of the model's output tensors output_arrays = ['output_name'] # Load and convert the frozen graph -converter = lite.TFLiteConverter.from_frozen_graph( +converter = tf.lite.TFLiteConverter.from_frozen_graph( graph_def_file, input_arrays, output_arrays) tflite_model = converter.convert() # Write the converted model to disk open("converted_model.tflite", "wb").write(tflite_model) ``` -### Quantization-aware training +## Converter attributes -The following attributes and methods associated with -[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize) -have been removed from `TFLiteConverter` in TensorFlow 2.0: +#### Renamed attributes -* `inference_type` -* `inference_input_type` -* `quantized_input_stats` -* `default_ranges_stats` -* `reorder_across_fake_quant` -* `change_concat_input_ranges` -* `post_training_quantize` - Deprecated in the 1.X API -* `get_input_arrays()` +The following 1.x attribute has been renamed in 2.x. -The rewriter function that supports quantization-aware training does not support -models generated by TensorFlow 2.0. Additionally, TensorFlow Lite’s quantization -API is being reworked and streamlined in a direction that supports -quantization-aware training through the Keras API. These attributes will be -removed in the 2.0 API until the new quantization API is launched. Users who -want to convert models generated by the rewriter function can use -`tf.compat.v1.lite.TFLiteConverter`. +* `target_ops` has been renamed to `target_spec.supported_ops` - In 2.x, in + line with future additions to the optimization framework, it has become an + attribute of `TargetSpec` and has been renamed to `supported_ops`. -### Changes to `TFLiteConverter` attributes +#### Unsupported attributes -The `target_ops` attribute has become an attribute of `TargetSpec` and renamed -to `supported_ops` in line with future additions to the optimization framework. +The following 1.x attributes have been removed in 2.x. -Additionally, the following attributes have been removed: - -* `drop_control_dependency` (default: `True`) -* _Graph visualization_ - The recommended approach for visualizing a - TensorFlow Lite graph in TensorFlow 2.0 will be to use - [visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py). - Unlike GraphViz, it enables users to visualize the graph after post training - quantization has occurred. The following attributes related to graph - visualization will be removed: +* _Quantization_ - In 2.x, + [quantize aware training](https://www.tensorflow.org/model_optimization/guide/quantization/training) + is supported through the Keras API and + [post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) + uses fewer streamlined converter flags. 
Thus, the following attributes and + methods related to quantization have been removed: + * `inference_type` + * `quantized_input_stats` + * `post_training_quantize` + * `default_ranges_stats` + * `reorder_across_fake_quant` + * `change_concat_input_ranges` + * `get_input_arrays()` +* _Visualization_ - In 2.x, the recommended approach for visualizing a + TensorFlow Lite graph is to use + [visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py) + . Unlike GraphViz, it enables users to visualize the graph after post + training quantization has occurred. Thus, the following attributes related + to graph visualization have been removed: * `output_format` * `dump_graphviz_dir` * `dump_graphviz_video` +* _Frozen graph_ - In 2.x, the frozen graph model format has been removed. + Thus, the following attribute related to frozen graphs has been removed: + * `drop_control_dependency` -### General API changes +## Unsupported APIs -The following section explains several significant API changes between -TensorFlow 1.X and 2.0. +The following section explains several significant features in 1.x that have +been removed in 2.x. -#### Conversion methods +#### Conversion APIs -The following methods that were previously deprecated in 1.X will no longer be -exported in 2.0: +The following methods were deprecated in 1.x and have been removed in 2.x: * `lite.toco_convert` * `lite.TocoConverter` -#### `lite.constants` +#### `lite.constants` API -The `lite.constants` API was removed in 2.0 in order to decrease duplication +The `lite.constants` API was removed in 2.x in order to decrease duplication between TensorFlow and TensorFlow Lite. The following list maps the `lite.constant` type to the TensorFlow type: @@ -106,12 +106,15 @@ between TensorFlow and TensorFlow Lite. The following list maps the * `lite.constants.STRING`: `tf.string` * `lite.constants.QUANTIZED_UINT8`: `tf.uint8` -Additionally, `lite.constants.TFLITE` and `lite.constants.GRAPHVIZ_DOT` were -removed due to the deprecation of the `output_format` flag in `TFLiteConverter`. +Additionally, the deprecation of the `output_format` flag in `TFLiteConverter` +led to the removal of the following constants: -#### `lite.OpHint` +* `lite.constants.TFLITE` +* `lite.constants.GRAPHVIZ_DOT` -The `OpHint` API is currently not available in 2.0 due to an incompatibility -with the 2.0 APIs. This API enables conversion of LSTM based models. Support for -LSTMs in 2.0 is being investigated. All related `lite.experimental` APIs have -been removed due to this issue. +#### `lite.OpHint` API + +The `OpHint` API is currently unsupported due to an incompatibility with the 2.x +APIs. This API enables conversion of LSTM based models. Support for LSTMs in 2.x +is being investigated. All related `lite.experimental` APIs have been removed +due to this issue. diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md index 194d102d43d..a526be75b61 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quantization.md +++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md @@ -4,51 +4,44 @@ Post-training quantization is a conversion technique that can reduce model size while also improving CPU and hardware accelerator latency, with little degradation in model accuracy. You can perform these techniques using an already-trained float TensorFlow model when you convert it to TensorFlow Lite -format. 
+format using the [TensorFlow Lite Converter](../convert/). Note: The procedures on this page require TensorFlow 1.15 or higher. - -### Optimization options +### Optimization Methods There are several post-training quantization options to choose from. Here is a summary table of the choices and the benefits they provide: -| Technique | Benefits | Hardware | -| ------------------------- | ------------------------- | ------------------- | -| Dynamic range | 4x smaller, 2-3x speedup, | CPU | -: quantization : accuracy : : -| Full integer quantization | 4x smaller, 3x+ speedup | CPU, Edge TPU, etc. | -| Float16 quantization | 2x smaller, potential GPU | CPU/GPU | -: : acceleration : : +| Technique | Benefits | Hardware | +| -------------------- | ------------------------- | ---------------- | +| Dynamic range | 4x smaller, 2-3x speedup | CPU | +: quantization : : : +| Full integer | 4x smaller, 3x+ speedup | CPU, Edge TPU, | +: quantization : : Microcontrollers : +| Float16 quantization | 2x smaller, potential GPU | CPU, GPU | +: : acceleration : : This decision tree can help determine which post-training quantization method is best for your use case: ![post-training optimization options](images/optimization.jpg) -Alternatively, you might achieve higher accuracy if you perform -[quantization-aware training]( -https://github.com/tensorflow/tensorflow/tree/r1.14/tensorflow/contrib/quantize). -However, doing so requires some model modifications to add fake quantization -nodes, whereas the post-training quantization techniques on this page use an -existing pre-trained model. - ### Dynamic range quantization The simplest form of post-training quantization statically quantizes only the -weights from floating point to 8-bits of precision. This technique is enabled as -an option in the [TensorFlow Lite converter](../convert/): +weights from floating point to integer, which has 8-bits of precision: -``` +
 import tensorflow as tf
 converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
-converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
 tflite_quant_model = converter.convert()
-```
+
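As a quick, informal check of the size benefit listed in the table above, the converted flatbuffer can be written to disk and compared with a plain float conversion of the same model. The sketch below assumes `tflite_quant_model` from the snippet above and a hypothetical `tflite_float_model` produced by the same converter with no `optimizations` set:

```python
import os

# Sketch only: `tflite_float_model` is assumed to be the same model converted
# without `converter.optimizations`; both objects are flatbuffer byte strings.
with open('model_float.tflite', 'wb') as f:
  f.write(tflite_float_model)
with open('model_quant.tflite', 'wb') as f:
  f.write(tflite_quant_model)

print('float model size:        ', os.path.getsize('model_float.tflite'), 'bytes')
print('dynamic-range model size:', os.path.getsize('model_quant.tflite'), 'bytes')
```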
-At inference, weights are converted from 8-bits of precision to floating point and -computed using floating-point kernels. This conversion is done once and cached to reduce latency. +At inference, weights are converted from 8-bits of precision to floating point +and computed using floating-point kernels. This conversion is done once and +cached to reduce latency. To further improve latency, "dynamic-range" operators dynamically quantize activations based on their range to 8-bits and perform computations with 8-bit @@ -58,89 +51,105 @@ point, so that the speedup with dynamic-range ops is less than a full fixed-point computation. Dynamic-range ops are available for the most compute-intensive operators in a network: -* [tf.contrib.layers.fully_connected](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/fully_connected) -* [tf.nn.conv2d](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d) -* [tf.nn.embedding_lookup](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) -* [BasicRNN](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/BasicRNNCell) -* [tf.nn.bidirectional_dynamic_rnn for BasicRNNCell type](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) -* [tf.nn.dynamic_rnn for LSTM and BasicRNN Cell types](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn) +* `tf.keras.layers.Dense` +* `tf.keras.layers.Conv2D` +* `tf.keras.layers.LSTM` +* `tf.nn.embedding_lookup` +* `tf.compat.v1.nn.rnn_cell.BasicRNNCell` +* `tf.compat.v1.nn.bidirectional_dynamic_rnn` +* `tf.compat.v1.nn.dynamic_rnn` - -### Full integer quantization of weights and activations +### Full integer quantization You can get further latency improvements, reductions in peak memory usage, and -access to integer only hardware accelerators by making sure all model math is -quantized. +access to integer only hardware devices or accelerators by making sure all model +math is integer quantized. To do this, you need to measure the dynamic range of activations and inputs by -supplying a representative data set. You can simply create an input data -generator and provide it to our converter. For example: +supplying sample input data to the converter. Refer to the +`representative_dataset_gen()` function used in the following code. -``` +#### Integer with float fallback (using default float input/output) + +In order to fully integer quantize a model, but use float operators when they +don't have an integer implementation (to ensure conversion occurs smoothly), use +the following steps: + +
 import tensorflow as tf
-
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
 def representative_dataset_gen():
   for _ in range(num_calibration_steps):
     # Get sample input data as a numpy array in a method of your choosing.
     yield [input]
-
-converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
-converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.representative_dataset = representative_dataset_gen
+converter.representative_dataset = representative_dataset_gen
 tflite_quant_model = converter.convert()
-```
+
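The `representative_dataset_gen()` above is only a skeleton; in practice it should yield a few hundred real, preprocessed samples so that activation ranges are calibrated well. A minimal sketch, assuming a hypothetical float32 NumPy array `calibration_images` of preprocessed inputs shaped like the model's input:

```python
import numpy as np

# Hypothetical calibration data; replace with real preprocessed samples from
# the training or validation set (shape [N, height, width, channels] here).
calibration_images = np.random.rand(100, 224, 224, 3).astype(np.float32)
num_calibration_steps = 100

def representative_dataset_gen():
  for i in range(num_calibration_steps):
    # Yield a single-example batch matching the model's input signature.
    yield [calibration_images[i:i + 1]]
```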
-The resulting model should be fully quantized, but any -ops that do not have quantized implementations are left in -floating point. This allows conversion to occur smoothly, but the model won't be -compatible with accelerators that require full integer quantization. +Note: This won't be compatible with integer only devices (such as 8-bit +microcontrollers) and accelerators (such as the Coral Edge TPU). For convenience +during inference, the input and output still remain float in order to have the +same interface as the original float only model. -Additionally, the model still uses float input and output for convenience. +#### Integer only -To ensure compatibility with some accelerators (such as the Coral Edge TPU), you -can enforce full integer quantization for all ops and use integer input and -output by adding the following lines before you convert: +*This is a common use case for +[TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers) +and [Coral Edge TPUs](https://coral.ai/).* -``` -converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] -converter.inference_input_type = tf.uint8 -converter.inference_output_type = tf.uint8 -``` +Additionally, to ensure compatibility with integer only devices (such as 8-bit +microcontrollers) and accelerators (such as the Coral Edge TPU), you can enforce +full integer quantization for all ops including the input and output, by using +the following steps: -The first line makes the converter throw an error if it encounters an operation -it cannot currently quantize. - -Note: `target_spec.supported_ops` was previously `target_ops` in the Python API. - - -### Float16 quantization of weights - -You can reduce the size of a floating point model by quantizing the weights to -float16, the IEEE standard for 16-bit floating point numbers. The advantages of -this quantization are as follows: - -- reduce model size by up to half (since all weights are now half the original - size) -- minimal loss in accuracy -- some delegates (e.g. the GPU delegate) can operate directly on float16 data, - which results in faster execution than float32 computations. - -This quantization may not be a good choice if you need maximum performance (a -quantization to fixed point math would be better in that case). To enable -float16 quantization of weights, specify "DEFAULT" optimization as above and -then specify that float16 is in supported types for the target_spec: - -``` +
 import tensorflow as tf
 converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_types = [tf.lite.constants.FLOAT16]
+def representative_dataset_gen():
+  for _ in range(num_calibration_steps):
+    # Get sample input data as a numpy array in a method of your choosing.
+    yield [input]
+converter.representative_dataset = representative_dataset_gen
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.int8  # or tf.uint8
+converter.inference_output_type = tf.int8  # or tf.uint8
 tflite_quant_model = converter.convert()
-```
+
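A model converted this way expects `int8` (or `uint8`) inputs, so float data must be quantized before invocation. A minimal sketch using the Python `tf.lite.Interpreter`, assuming `tflite_quant_model` from the snippet above and a hypothetical float32 `sample_input` matching the model's input shape:

```python
import numpy as np
import tensorflow as tf

# Sketch only: `sample_input` is a float32 array shaped like the model input.
interpreter = tf.lite.Interpreter(model_content=tflite_quant_model)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()[0]
output_details = interpreter.get_output_details()[0]

# The input tensor carries its quantization parameters (scale, zero_point),
# which are used to map float values to int8 before feeding the model.
scale, zero_point = input_details['quantization']
quantized = np.round(sample_input / scale + zero_point)
quantized = np.clip(quantized, -128, 127).astype(np.int8)

interpreter.set_tensor(input_details['index'], quantized)
interpreter.invoke()
int8_output = interpreter.get_tensor(output_details['index'])
```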
-By default, a float16 quantized model will "dequantize" the weights values to -float32 when run on the CPU. The GPU delegate will not perform this -dequantization, since it can operate on float16 data. +Note: The converter will throw an error if it encounters an operation it cannot +currently quantize. + +### Float16 quantization + +You can reduce the size of a floating point model by quantizing the weights to +float16, the IEEE standard for 16-bit floating point numbers. To enable float16 +quantization of weights, use the following steps: + +
+import tensorflow as tf
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_types = [tf.lite.constants.FLOAT16]
+tflite_quant_model = converter.convert()
+
+ +The advantages of this quantization are as follows: + +* Reduce model size by up to half (since all weights are now half the original + size). +* Minimal loss in accuracy. +* Supports some delegates (e.g. the GPU delegate) can operate directly on + float16 data, which results in faster execution than float32 computations. + +The disadvantages of this quantization are as follows: + +* Not a good choice for maximum performance (a quantization to fixed point + math would be better in that case). +* By default, a float16 quantized model will "dequantize" the weights values + to float32 when run on the CPU. (Note that the GPU delegate will not perform + this dequantization, since it can operate on float16 data.) ### Model accuracy @@ -152,13 +161,18 @@ accuracy of the quantized model to verify that any degradation in accuracy is within acceptable limits. There is a tool to evaluate [TensorFlow Lite model accuracy](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/accuracy/ilsvrc/README.md){:.external}. -If the accuracy drop is too high, consider using -[quantization aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize){:.external}. +Alternatively, if the accuracy drop is too high, consider using +[quantization aware training](https://www.tensorflow.org/model_optimization/guide/quantization/training) +. However, doing so requires modifications during model training to add fake +quantization nodes, whereas the post-training quantization techniques on this +page use an existing pre-trained model. ### Representation for quantized tensors 8-bit quantization approximates floating point values using the following -formula. `real_value = (int8_value - zero_point) * scale`. +formula. + +$$real\_value = (int8\_value - zero\_point) \times scale$$ The representation has two main parts: From 38e503d845d0c45c42b4b19f76548b140a608a7f Mon Sep 17 00:00:00 2001 From: Ajay P Date: Tue, 12 May 2020 21:47:01 +0000 Subject: [PATCH 0440/1533] Addressed PR comments --- tensorflow/python/eager/forwardprop_test.py | 27 +++++++-------------- tensorflow/python/ops/custom_gradient.py | 4 +-- 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 611e9ce2b2a..c32de30a2b3 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -177,8 +177,7 @@ def _test_gradients(testcase, order, delta=1e-3, rtol=1e-2, - atol=1e-6, - recompute=False): + atol=1e-6): """Tests forward/backward jacobians of `f`'s [0, `order`)-order gradients.""" if order < 1: raise ValueError( @@ -191,21 +190,14 @@ def _test_gradients(testcase, order=order - 1, delta=delta, rtol=rtol, - atol=atol, - recompute=recompute) + atol=atol) sym_jac_back, num_jac = gradient_checker_v2.compute_gradient( f, primals, delta=delta) testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) - if not recompute: - sym_jac_fwd = _jacfwd(f, primals) - testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) - # And the symbolic computations should be much closer. - testcase.assertAllClose(sym_jac_back, sym_jac_fwd) - else: - with testcase.assertRaisesRegexp(ValueError, - "recompute_grad tried to transpose"): - sym_jac_fwd = _jacfwd(f, primals) - + sym_jac_fwd = _jacfwd(f, primals) + testcase.assertAllClose(num_jac, sym_jac_fwd, rtol=rtol, atol=atol) + # And the symbolic computations should be much closer. 
+ testcase.assertAllClose(sym_jac_back, sym_jac_fwd) class ForwardpropTest(test.TestCase, parameterized.TestCase): @@ -364,10 +356,9 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): def f(x): return math_ops.reduce_prod(math_ops.tanh(x)**2) - _test_gradients(self, - f, [constant_op.constant([1.])], - order=3, - recompute=True) + with self.assertRaisesRegexp(NotImplementedError, + "recompute_grad tried to transpose"): + _test_gradients(self, f, [constant_op.constant([1.])], order=3) def testExceptionInCustomGradientNotSwallowed(self): diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index e32c0820e93..d0f06718911 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -512,8 +512,8 @@ def recompute_grad(f): def transpose(*t_args, **t_kwargs): """Gradient function calculation for forward mode autodiff.""" # Just throw an error since gradients / activations are not stored on tape for recompute. - raise ValueError( - "recompute_grad tried to transpose {}." + raise NotImplementedError( + "recompute_grad tried to transpose grad of {}. " "Consider not using recompute_grad in forward mode autodiff".format( f.__name__)) From 1e07fa6448c01346054812ba0f0f71717f8156ff Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Tue, 12 May 2020 14:49:02 -0700 Subject: [PATCH 0441/1533] Fix Resize Nearest Neighbor versioning. align_corners support was added in version 3. PiperOrigin-RevId: 311206537 Change-Id: Ief12bdbbbedf5cf390315c5ee50a57e2000001ee --- tensorflow/lite/toco/tflite/operator.cc | 2 + .../lite/tools/versioning/op_version.cc | 12 +++- tensorflow/lite/tools/versioning/op_version.h | 1 + .../lite/tools/versioning/op_version_test.cc | 60 +++++++++++++++++++ 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc index 57b791a1a94..917fd24c952 100644 --- a/tensorflow/lite/toco/tflite/operator.cc +++ b/tensorflow/lite/toco/tflite/operator.cc @@ -1118,6 +1118,7 @@ class ResizeBilinear GetVersioningOpSig(builtin_op(), op_signature); op_sig.options.resize.half_pixel_centers = resize_bilinear_op.half_pixel_centers; + op_sig.options.resize.align_corners = resize_bilinear_op.align_corners; return ::tflite::GetBuiltinOperatorVersion(op_sig); } }; @@ -1147,6 +1148,7 @@ class ResizeNearestNeighbor ::tflite::OpSignature op_sig = GetVersioningOpSig(builtin_op(), op_signature); op_sig.options.resize.half_pixel_centers = resize_nn_op.half_pixel_centers; + op_sig.options.resize.align_corners = resize_nn_op.align_corners; return ::tflite::GetBuiltinOperatorVersion(op_sig); } }; diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index 56aa9d5d0a9..9022afca629 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -363,13 +363,20 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 1; case BuiltinOperator_RESIZE_BILINEAR: - case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: if (op_sig.options.resize.half_pixel_centers) { return 3; } else if (op_sig.input_types.at(0) == TensorType_INT8) { return 2; } return 1; + case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: + if (op_sig.options.resize.half_pixel_centers || + op_sig.options.resize.align_corners) { + return 3; + } else if (op_sig.input_types.at(0) == TensorType_INT8) { + return 2; + } + return 1; case BuiltinOperator_MAXIMUM: case 
BuiltinOperator_MINIMUM: @@ -612,6 +619,8 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, if (resize_bilinear_option) { op_sig.options.resize.half_pixel_centers = resize_bilinear_option->half_pixel_centers(); + op_sig.options.resize.align_corners = + resize_bilinear_option->align_corners(); } } break; case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: { @@ -620,6 +629,7 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, if (resize_nn_option) { op_sig.options.resize.half_pixel_centers = resize_nn_option->half_pixel_centers(); + op_sig.options.resize.align_corners = resize_nn_option->align_corners(); } } break; // TODO(b/150176627): Add tests for GetOpSignature. diff --git a/tensorflow/lite/tools/versioning/op_version.h b/tensorflow/lite/tools/versioning/op_version.h index fba6c943462..4b0fe8836e2 100644 --- a/tensorflow/lite/tools/versioning/op_version.h +++ b/tensorflow/lite/tools/versioning/op_version.h @@ -48,6 +48,7 @@ typedef struct { } lstm; struct { bool half_pixel_centers; + bool align_corners; } resize; struct { int32_t num_dims; diff --git a/tensorflow/lite/tools/versioning/op_version_test.cc b/tensorflow/lite/tools/versioning/op_version_test.cc index 7d9039ff848..f0d8259d764 100644 --- a/tensorflow/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/lite/tools/versioning/op_version_test.cc @@ -594,4 +594,64 @@ TEST(OpVersionTEst, VersioningFillTest) { TensorType_INT32}}; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); } +TEST(OpVersionTest, VersioningResizeBilinearTest) { + // Default. + OpSignature fake_op_sig = { + .op = BuiltinOperator_RESIZE_BILINEAR, + .input_types = + std::vector{TensorType_FLOAT32, TensorType_INT32}, + .output_types = std::vector{TensorType_FLOAT32}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); + + // align_corners=true is still version 1. + fake_op_sig.options.resize.align_corners = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); + + // half_pixel_centers=true must be version 3. + fake_op_sig.options.resize.align_corners = false; + fake_op_sig.options.resize.half_pixel_centers = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + + // int8 input is version 2. + fake_op_sig = { + .op = BuiltinOperator_RESIZE_BILINEAR, + .input_types = std::vector{TensorType_INT8, TensorType_INT32}, + .output_types = std::vector{TensorType_INT8}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); + + fake_op_sig.options.resize.half_pixel_centers = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); +} +TEST(OpVersionTest, VersioningResizeNearestNeighborTest) { + // Default. + OpSignature fake_op_sig = { + .op = BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, + .input_types = + std::vector{TensorType_FLOAT32, TensorType_INT32}, + .output_types = std::vector{TensorType_FLOAT32}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); + + // align_corners=true is version 3. + fake_op_sig.options.resize.align_corners = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + + // half_pixel_centers=true must be version 3. + fake_op_sig.options.resize.align_corners = false; + fake_op_sig.options.resize.half_pixel_centers = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + + // int8 input is version 2. 
+ fake_op_sig = { + .op = BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, + .input_types = std::vector{TensorType_INT8, TensorType_INT32}, + .output_types = std::vector{TensorType_INT8}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); + + fake_op_sig.options.resize.align_corners = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); +} } // namespace tflite From c5caa29b5e6d10079020673a0dbd0035214df94d Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 12 May 2020 14:50:15 -0700 Subject: [PATCH 0442/1533] Make `core.Tensor` the base type for Tensor and replace the `register_dense_tensor_like` with direct subclassing. PiperOrigin-RevId: 311206817 Change-Id: Id8ae234516d5409d6b70612a99f9f0b3ed53dc7e --- tensorflow/BUILD | 7 +++ tensorflow/python/BUILD | 1 + tensorflow/python/distribute/values.py | 10 ++-- tensorflow/python/distribute/values_test.py | 11 ++-- tensorflow/python/framework/ops.py | 56 ++----------------- tensorflow/python/framework/ops_test.py | 50 ----------------- tensorflow/python/framework/tensor_util.py | 3 +- tensorflow/python/keras/engine/training_v1.py | 3 +- .../experimental/autocast_variable.py | 4 +- tensorflow/python/ops/array_ops.py | 15 ++--- .../python/ops/resource_variable_ops.py | 7 +-- tensorflow/python/ops/variable_scope.py | 5 +- tensorflow/python/ops/variables.py | 5 +- tensorflow/python/profiler/BUILD | 1 + tensorflow/python/types/BUILD | 5 +- .../api/golden/v1/tensorflow.-tensor.pbtxt | 1 + .../api/golden/v2/tensorflow.-tensor.pbtxt | 1 + 17 files changed, 52 insertions(+), 133 deletions(-) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index bf3af3c31b4..ab4316d5ed0 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -530,6 +530,13 @@ package_group(name = "ndarray_tensor_allow_list") # TODO(b/154762408) Remove this package group once it's no longer needed. package_group(name = "composite_tensor_whitelist") +# Packages that use private types symbols, until they are exported. +# TODO(b/154650521) Remove. 
+package_group( + name = "types_whitelist", + packages = ["//learning/deepmind/tensorflow/replicator/..."], +) + filegroup( name = "intel_binary_blob", data = if_mkl_ml( diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 11da45fbcbb..a49e4b74def 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -230,6 +230,7 @@ py_library( "//tensorflow/python/tools:module_util", "//tensorflow/python/tools/api/generator:create_python_api", "//tensorflow/python/tpu:tpu_noestimator", + "//tensorflow/python/types", "//third_party/py/numpy", ], ) diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 4fe3d287ccc..444915aa123 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -38,6 +38,7 @@ from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training.saving import saveable_object from tensorflow.python.training.saving import saveable_object_util from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.types import core from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -422,7 +423,8 @@ class DistributedVarOp(object): return hash((self.name, self.graph, self.traceback, self.type)) -class DistributedVariable(DistributedDelegate, variables_lib.Variable): +class DistributedVariable(DistributedDelegate, variables_lib.Variable, + core.Tensor): """Holds a map from replica to variables.""" # TODO(josh11b): Support changing the set of variables if e.g. if new @@ -741,9 +743,6 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): pass -ops.register_dense_tensor_like_type(DistributedVariable) - - def _validate_colocate_extended(v, extended): variable_strategy = v._distribute_strategy # pylint: disable=protected-access if variable_strategy.extended is not extended: @@ -1380,7 +1379,7 @@ def value_container(val): return val -class AggregatingVariable(variables_lib.Variable): +class AggregatingVariable(variables_lib.Variable, core.Tensor): """A wrapper around a variable that aggregates updates across replicas.""" def __init__(self, strategy, v, aggregation): @@ -1649,4 +1648,3 @@ def _tensor_conversion_aggregate(var, dtype=None, name=None, as_ref=False): ops.register_tensor_conversion_function(AggregatingVariable, _tensor_conversion_aggregate) -ops.register_dense_tensor_like_type(AggregatingVariable) diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index daa7e5563d3..67ed86b4047 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -56,6 +56,7 @@ from tensorflow.python.saved_model.model_utils import mode_keys from tensorflow.python.tpu import tpu_strategy_util from tensorflow.python.training import saver as saver_lib from tensorflow.python.training.tracking import util as trackable_utils +from tensorflow.python.types import core from tensorflow.python.util import nest @@ -623,10 +624,10 @@ class DistributedVariableTest(test.TestCase, parameterized.TestCase): v = variables_lib.Variable( 0., synchronization=synchronization, aggregation=aggregation) # In cross replica context. - self.assertTrue(ops.is_dense_tensor_like(v)) + self.assertIsInstance(v, core.Tensor) # In replica context. 
distribution.run( - lambda v: self.assertTrue(ops.is_dense_tensor_like(v)), args=(v,)) + lambda v: self.assertIsInstance(v, core.Tensor), args=(v,)) def testAssignReturnValueIsTensorLike(self, distribution, synchronization, aggregation): @@ -645,9 +646,9 @@ class DistributedVariableTest(test.TestCase, parameterized.TestCase): # values is not allowed when aggregation is SUM. See # `cross_device_ops.reduce_non_distributed_value`. delta = array_ops.identity(1.) - self.assertTrue(ops.is_dense_tensor_like(v.assign(delta))) - self.assertTrue(ops.is_dense_tensor_like(v.assign_sub(delta))) - self.assertTrue(ops.is_dense_tensor_like(v.assign_add(delta))) + self.assertIsInstance(v.assign(delta), core.Tensor) + self.assertIsInstance(v.assign_sub(delta), core.Tensor) + self.assertIsInstance(v.assign_add(delta), core.Tensor) # In cross replica context we return a PerReplica which is not Tensor like # yet. diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 9b8f7cf4fde..43652d51eae 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -62,6 +62,7 @@ from tensorflow.python.framework import versions from tensorflow.python.ops import control_flow_util from tensorflow.python.platform import app from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.types import core as core_tf_types from tensorflow.python.types import internal from tensorflow.python.util import compat from tensorflow.python.util import decorator_utils @@ -213,53 +214,11 @@ def _as_graph_element(obj): return None -_TENSOR_LIKE_TYPES = tuple() - - +# Deprecated - do not use. +# This API to avoid breaking estimator and tensorflow-mesh which depend on this +# internal API. The stub should be safe to use after TF 2.3 is released. def is_dense_tensor_like(t): - """EXPERIMENTAL: Returns true if `t` implements the tensor interface. - - See `register_dense_tensor_like_type()` for the current definition of a - "tensor-like type". - - Args: - t: An object. - - Returns: - True iff `t` is an instance of one of the registered "tensor-like" types. - """ - return isinstance(t, _TENSOR_LIKE_TYPES) - - -def register_dense_tensor_like_type(tensor_type): - """EXPERIMENTAL: Registers `tensor_type` as implementing the tensor interface. - - A "tensor-like type" can represent a single dense tensor, and implements - the `name`, `dtype` and `shape` properties. - - Args: - tensor_type: A type implementing the tensor interface. - - Raises: - TypeError: If `tensor_type` does not implement the tensor interface. - """ - if not (hasattr(tensor_type, "name") and - isinstance(tensor_type.name, property)): - raise TypeError("Type %s does not define a `name` property" % - tensor_type.__name__) - if not (hasattr(tensor_type, "dtype") and - isinstance(tensor_type.dtype, property)): - raise TypeError("Type %s does not define a `dtype` property" % - tensor_type.__name__) - if not (hasattr(tensor_type, "shape") and - isinstance(tensor_type.shape, property)): - raise TypeError("Type %s does not define a `shape` property" % - tensor_type.__name__) - # We expect this list to be small, so choose quadratic complexity - # for registration, so that we have a tuple that can be used for - # more efficient `isinstance` checks later. 
- global _TENSOR_LIKE_TYPES - _TENSOR_LIKE_TYPES = tuple(list(_TENSOR_LIKE_TYPES) + [tensor_type]) + return isinstance(t, core_tf_types.Tensor) def uid(): @@ -304,7 +263,7 @@ def disable_tensor_equality(): # TODO(mdan): This object should subclass Symbol, not just Tensor. @tf_export("Tensor") -class Tensor(internal.NativeObject): +class Tensor(internal.NativeObject, core_tf_types.Tensor): """A tensor is a multidimensional array of elements represented by a `tf.Tensor` object. All elements are of a single known data type. @@ -1305,9 +1264,6 @@ class _EagerTensorBase(Tensor): EagerTensor = pywrap_tfe.TFE_Py_InitEagerTensor(_EagerTensorBase) -register_dense_tensor_like_type(Tensor) - - @tf_export(v1=["convert_to_tensor"]) def convert_to_tensor_v1(value, dtype=None, diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 20f58a00cfe..322df8ffac8 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -3268,56 +3268,6 @@ class DeprecatedTest(test_util.TensorFlowTestCase): test_ops.old() -class DenseTensorLikeTypeTest(test_util.TensorFlowTestCase): - - @test_util.disable_tfrt("Graph is not supported yet.") - def testSuccess(self): - op = ops.Operation( - ops._NodeDef("FloatOutput", "myop"), ops.Graph(), [], [dtypes.float32]) - t = op.outputs[0] - self.assertTrue(ops.is_dense_tensor_like(t)) - - v = variables.Variable([17]) - self.assertTrue(ops.is_dense_tensor_like(v)) - - class BadClassNoName(object): - pass - - class BadClassBadName(object): - - def name(self): - pass - - class BadClassNoDtype(object): - - @property - def name(self): - pass - - class BadClassBadDtype(object): - - @property - def name(self): - pass - - def dtype(self): - pass - - def testBadClass(self): - with self.assertRaisesRegexp(TypeError, "`name`"): - ops.register_dense_tensor_like_type( - DenseTensorLikeTypeTest.BadClassNoName) - with self.assertRaisesRegexp(TypeError, "`name`"): - ops.register_dense_tensor_like_type( - DenseTensorLikeTypeTest.BadClassBadName) - with self.assertRaisesRegexp(TypeError, "`dtype`"): - ops.register_dense_tensor_like_type( - DenseTensorLikeTypeTest.BadClassNoDtype) - with self.assertRaisesRegexp(TypeError, "`dtype`"): - ops.register_dense_tensor_like_type( - DenseTensorLikeTypeTest.BadClassBadDtype) - - class NameScopeTest(test_util.TensorFlowTestCase): def testStripAndPrependScope(self): diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index 50388595c3d..968b635250a 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -26,6 +26,7 @@ from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.types import core from tensorflow.python.types import internal from tensorflow.python.util import compat from tensorflow.python.util import nest @@ -1009,7 +1010,7 @@ def is_tensor(x): # pylint: disable=invalid-name `True` if `x` is a tensor or "tensor-like", `False` if not. 
""" return (isinstance(x, internal.NativeObject) or - ops.is_dense_tensor_like(x) or + isinstance(x, core.Tensor) or getattr(x, "is_tensor_like", False)) diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index 0a40ce3899b..16188af833a 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -62,6 +62,7 @@ from tensorflow.python.ops.losses import util as tf_losses_utils from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.tracking import base as trackable from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils +from tensorflow.python.types import core from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect @@ -3143,7 +3144,7 @@ def _convert_scipy_sparse_tensor(value, expected_input): The possibly-converted 'value'. """ if issparse is not None and issparse(value): - if ops.is_dense_tensor_like(expected_input): + if isinstance(expected_input, core.Tensor): if ops.executing_eagerly_outside_functions(): # In TF2 we do not silently densify sparse matrices. raise ValueError('A SciPy sparse matrix was passed to a model ' diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index c43ca21ea06..29e5a68c854 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -23,9 +23,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables +from tensorflow.python.types import core -class AutoCastVariable(variables.Variable): +class AutoCastVariable(variables.Variable, core.Tensor): """Variable that will cast itself to a different dtype in applicable contexts. This class wraps a floating-point `tf.Variable`. It emulates the variable @@ -417,7 +418,6 @@ class AutoCastVariable(variables.Variable): ops.register_tensor_conversion_function(AutoCastVariable, AutoCastVariable._dense_var_to_tensor) # pylint:disable=protected-access -ops.register_dense_tensor_like_type(AutoCastVariable) def create_autocast_variable(variable): diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 33aac84d77f..1cb6fdbd726 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -39,6 +39,7 @@ from tensorflow.python.ops import gen_math_ops # pylint: disable=wildcard-import from tensorflow.python.ops.gen_array_ops import * from tensorflow.python.ops.gen_array_ops import reverse_v2 as reverse # pylint: disable=unused-import +from tensorflow.python.types import core from tensorflow.python.util import deprecation from tensorflow.python.util import dispatch from tensorflow.python.util import nest @@ -1381,13 +1382,13 @@ def _autopacking_helper(list_or_tuple, dtype, name): if context.executing_eagerly(): # NOTE: Fast path when all the items are tensors, this doesn't do any type # checking. 
- if all(ops.is_dense_tensor_like(elem) for elem in list_or_tuple): + if all(isinstance(elem, core.Tensor) for elem in list_or_tuple): return gen_array_ops.pack(list_or_tuple, name=name) must_pack = False converted_elems = [] with ops.name_scope(name) as scope: for i, elem in enumerate(list_or_tuple): - if ops.is_dense_tensor_like(elem): + if isinstance(elem, core.Tensor): if dtype is not None and elem.dtype.base_dtype != dtype: raise TypeError("Cannot convert a list containing a tensor of dtype " "%s to %s (Tensor is: %r)" % @@ -1396,7 +1397,7 @@ def _autopacking_helper(list_or_tuple, dtype, name): must_pack = True elif isinstance(elem, (list, tuple)): converted_elem = _autopacking_helper(elem, dtype, str(i)) - if ops.is_dense_tensor_like(converted_elem): + if isinstance(converted_elem, core.Tensor): must_pack = True converted_elems.append(converted_elem) else: @@ -1404,7 +1405,7 @@ def _autopacking_helper(list_or_tuple, dtype, name): if must_pack: elems_as_tensors = [] for i, elem in enumerate(converted_elems): - if ops.is_dense_tensor_like(elem): + if isinstance(elem, core.Tensor): elems_as_tensors.append(elem) else: # NOTE(mrry): This is inefficient, but it enables us to @@ -1429,7 +1430,7 @@ def _get_dtype_from_nested_lists(list_or_tuple): such object exists. """ for elem in list_or_tuple: - if ops.is_dense_tensor_like(elem): + if isinstance(elem, core.Tensor): return elem.dtype.base_dtype elif isinstance(elem, (list, tuple)): maybe_dtype = _get_dtype_from_nested_lists(elem) @@ -1441,7 +1442,7 @@ def _get_dtype_from_nested_lists(list_or_tuple): def _cast_nested_seqs_to_dtype(dtype): def _maybe_cast(elem): - if ops.is_dense_tensor_like(elem): + if isinstance(elem, core.Tensor): if dtype != elem.dtype.base_dtype: elem = gen_math_ops.cast(elem, dtype) return elem @@ -1455,7 +1456,7 @@ _NON_AUTOPACKABLE_TYPES.add(np.ndarray) def _should_not_autopack(v): # The condition we really want is - # ops.is_dense_tensor_like(...) + # any(isinstance(elem, core.Tensor)) # but it is >5x slower due to abc.ABCMeta.__instancecheck__. # pylint: disable=unidiomatic-typecheck # TODO(slebedev): add nest.all? diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index f99f886f210..d8a7765a208 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -49,6 +49,7 @@ from tensorflow.python.ops import variables from tensorflow.python.ops.gen_resource_variable_ops import * # pylint: enable=wildcard-import from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.types import core from tensorflow.python.util import compat from tensorflow.python.util.deprecation import deprecated @@ -330,7 +331,7 @@ def variable_accessed(variable): tape.variable_accessed(variable) -class BaseResourceVariable(variables.VariableV1): +class BaseResourceVariable(variables.VariableV1, core.Tensor): """A python variable from an existing handle.""" # TODO(wangpeng): Deprecate `constraint` when callers no long pass it in. @@ -1830,7 +1831,6 @@ def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False): # allowing instances of the class to be used as tensors. 
ops.register_tensor_conversion_function(BaseResourceVariable, _dense_var_to_tensor) -ops.register_dense_tensor_like_type(BaseResourceVariable) class _UnreadVariable(BaseResourceVariable): @@ -1955,9 +1955,6 @@ class _UnreadVariable(BaseResourceVariable): return self._parent_op -ops.register_dense_tensor_like_type(_UnreadVariable) - - @ops.RegisterGradient("ReadVariableOp") def _ReadGrad(_, grad): """Gradient for read op.""" diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index d65cd235ca8..81c3f9a2f70 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -42,6 +42,7 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.types import core from tensorflow.python.util import deprecation from tensorflow.python.util import function_utils from tensorflow.python.util import tf_contextlib @@ -1000,7 +1001,7 @@ class _VariableStore(object): return initializer, initializing_from_value -class _LazyEvalTensor(object): +class _LazyEvalTensor(core.Tensor): """A Tensor-like object that only evaluates its thunk when used.""" def __init__(self, thunk): @@ -1069,8 +1070,6 @@ session.register_session_run_conversion_functions( lambda fetch: ([fetch._master_tensor], lambda fetched_vals: fetched_vals[0]) # pylint: disable=protected-access ) -ops.register_dense_tensor_like_type(_LazyEvalTensor) - # To stop regularization, use this regularizer @tf_export(v1=["no_regularizer"]) diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 1080778e3d3..d3df0659b5a 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -47,6 +47,7 @@ from tensorflow.python.util import tf_should_use from tensorflow.python.util.deprecation import deprecated from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.tf_export import tf_export +from tensorflow.python.types import core def default_variable_creator(_, **kwds): @@ -264,6 +265,7 @@ class VariableMetaclass(type): @tf_export("Variable", v1=[]) +# TODO(mdan): This should subclass core.Tensor, and not all its subclasses? class Variable(six.with_metaclass(VariableMetaclass, trackable.Trackable)): """See the [variable guide](https://tensorflow.org/guide/variable). @@ -1551,7 +1553,7 @@ class VariableV1(Variable): # TODO(apassos): do not repeat all comments here -class RefVariable(VariableV1): +class RefVariable(VariableV1, core.Tensor): """Ref-based implementation of variables.""" def __init__( @@ -3032,7 +3034,6 @@ class PartitionedVariable(object): # allowing instances of the class to be used as tensors. 
ops.register_tensor_conversion_function(RefVariable, RefVariable._TensorConversionFunction) # pylint: disable=protected-access -ops.register_dense_tensor_like_type(RefVariable) @tf_export(v1=["global_variables"]) diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD index e5ca60843e3..b6565f594c9 100644 --- a/tensorflow/python/profiler/BUILD +++ b/tensorflow/python/profiler/BUILD @@ -226,6 +226,7 @@ py_library( deps = [ "//tensorflow/python:util", "//tensorflow/python/profiler/internal:_pywrap_traceme", + "//tensorflow/python/types", "@six_archive//:six", ], ) diff --git a/tensorflow/python/types/BUILD b/tensorflow/python/types/BUILD index f35ca7fb803..e93bf5c10b3 100644 --- a/tensorflow/python/types/BUILD +++ b/tensorflow/python/types/BUILD @@ -27,6 +27,9 @@ py_strict_library( "internal.py", ], srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], + visibility = [ + "//tensorflow:__subpackages__", + "//tensorflow:types_whitelist", + ], deps = [], ) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt index 4a30fae1da9..9315973e51d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt @@ -2,6 +2,7 @@ path: "tensorflow.Tensor" tf_class { is_instance: "" is_instance: "" + is_instance: "" is_instance: "" member { name: "OVERLOADABLE_OPERATORS" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt index 4a30fae1da9..9315973e51d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt @@ -2,6 +2,7 @@ path: "tensorflow.Tensor" tf_class { is_instance: "" is_instance: "" + is_instance: "" is_instance: "" member { name: "OVERLOADABLE_OPERATORS" From 65321b89c7898fb5184a64d1f42066fa14c7175f Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Tue, 12 May 2020 14:53:22 -0700 Subject: [PATCH 0443/1533] Disable collective ops xla test on gpu PiperOrigin-RevId: 311207584 Change-Id: Ibb51f5ee646edbc39d65af8a47495b21751604be --- tensorflow/compiler/xla/tests/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index c8a242c156a..1ad1f8363cf 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -1909,7 +1909,7 @@ xla_test( # This test is tagged "manual" because it requires multiple GPUs, and # Forge only supports single-GPU tests. Guitar skips "manual" tests # unless they're also tagged "guitar". - "guitar", + # "guitar", # Re-enable after b/156405690 is fixed. "manual", "multi_gpu", "no_oss", From f66f384729b2a2f70fd01902f49b0b7a95be9f26 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Tue, 12 May 2020 15:01:00 -0700 Subject: [PATCH 0444/1533] Make CHLO->HLO patterns extend OpRewritePattern vs OpConversionPattern. * In the absence of type conversion, this is more generally compatible (ie. with the greedy rewriter). * Consistent with the rest of the legalize_tf patterns. 
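For illustration, patterns that extend OpRewritePattern can be exercised through
the generic greedy driver with no ConversionTarget or type converter. The sketch
below is not part of this change; it assumes the pattern-driver APIs available
around this revision (OwningRewritePatternList, applyPatternsAndFoldGreedily),
and the populate helper named in the comment is a hypothetical stand-in.

    #include "mlir/IR/Function.h"
    #include "mlir/IR/PatternMatch.h"
    #include "mlir/Transforms/GreedyPatternRewriteDriver.h"

    // Sketch only: run the CHLO->HLO rewrite patterns with the greedy driver
    // instead of the dialect-conversion framework.
    void ApplyChloToHloPatterns(mlir::FuncOp func) {
      mlir::OwningRewritePatternList patterns;
      // Hypothetical stand-in for whatever populates the CHLO->HLO patterns
      // defined in chlo_legalize_to_hlo.cc, e.g.:
      //   PopulateLegalizeChloToHloPatterns(func.getContext(), &patterns);
      // No conversion target is needed; the driver applies the patterns
      // repeatedly until a fixed point is reached.
      (void)mlir::applyPatternsAndFoldGreedily(func.getOperation(), patterns);
    }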
PiperOrigin-RevId: 311209137 Change-Id: I3a409dbc307c141753c73ae7731276c61a2728d0 --- .../xla/transforms/chlo_legalize_to_hlo.cc | 41 +++++++++---------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc index a20511a95fc..0c9585a817f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc @@ -33,24 +33,23 @@ namespace { // Converts binary ops that statically are determined to not broadcast directly // to the corresponding xla_hlo non-broadcasting op. template -struct ConvertTrivialNonBroadcastBinaryOp - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite( - ChloOpTy op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { +struct ConvertTrivialNonBroadcastBinaryOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { // Only rewrite for statically determinable non-broadcasting cases. - auto lhs = operands[0].getType().dyn_cast(); - auto rhs = operands[1].getType().dyn_cast(); - if (!lhs || !rhs) return failure(); + auto lhs_type = op.lhs().getType().template dyn_cast(); + auto rhs_type = op.rhs().getType().template dyn_cast(); + if (!lhs_type || !rhs_type) return failure(); // Requires rank broadcast. - if (lhs.getRank() != rhs.getRank()) return failure(); + if (lhs_type.getRank() != rhs_type.getRank()) return failure(); // Any dynamic dimension may require broadcasting and requires more // analysis. - if (!lhs.hasStaticShape() || !rhs.hasStaticShape()) return failure(); + if (!lhs_type.hasStaticShape() || !rhs_type.hasStaticShape()) + return failure(); - for (auto extents : llvm::zip(lhs.getShape(), rhs.getShape())) { + for (auto extents : llvm::zip(lhs_type.getShape(), rhs_type.getShape())) { auto lhs_extent = std::get<0>(extents); auto rhs_extent = std::get<1>(extents); if (lhs_extent != rhs_extent) { @@ -58,9 +57,8 @@ struct ConvertTrivialNonBroadcastBinaryOp } } - rewriter.replaceOp( - op, {Adaptor::CreateOp(op, op.getResult().getType(), operands[0], - operands[1], rewriter)}); + rewriter.replaceOp(op, {Adaptor::CreateOp(op, op.getResult().getType(), + op.lhs(), op.rhs(), rewriter)}); return success(); } }; @@ -83,14 +81,13 @@ struct ConvertTrivialNonBroadcastBinaryOp // Whether that is of any practical benefit remains to be seen. template struct ConvertRankedDynamicBroadcastBinaryOp - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite( - ChloOpTy op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { // Only support ranked operands. - Value lhs = operands[0]; - Value rhs = operands[1]; + Value lhs = op.lhs(); + Value rhs = op.rhs(); auto lhs_type = lhs.getType().dyn_cast(); auto rhs_type = rhs.getType().dyn_cast(); auto result_type = From f38355dab31bb466e9fdc900089dcd4abba536d6 Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Tue, 12 May 2020 15:06:44 -0700 Subject: [PATCH 0445/1533] Add TF_AllTypesMatch trait in TensorFlow dialect. 
TF_AllTypesMatch trait takes a list of operands/results/attributes and verifies that they have cast compatible types i.e., a single runtime type for all the values. It handles Resource subtypes as well and assumes all variant subtypes are cast compatible. AllTypesMatch trait on TF dialect ops can be replaced with this trait which handles unranked/dynamic shapes and TF subtypes. PiperOrigin-RevId: 311210249 Change-Id: I0d0dec247ff256c0b23aa9b5750912eaefc064f5 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 6 +- .../compiler/mlir/tensorflow/ir/tf_op_base.td | 10 + .../compiler/mlir/tensorflow/ir/tf_ops.cc | 55 +----- .../compiler/mlir/tensorflow/ir/tf_types.cc | 174 ++++++++++++++---- .../compiler/mlir/tensorflow/ir/tf_types.h | 6 + .../mlir/tensorflow/tests/tf-ops.mlir | 27 ++- 6 files changed, 181 insertions(+), 97 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 9a29fa4f8a9..bddf064f5c6 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1765,7 +1765,7 @@ of corresponding 3-element vectors is cross-multiplied independently. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [AllTypesMatch<["input", "output"]>, NoSideEffect]> { +def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { let summary = "An Op to sum inputs across replicated TPU instances."; let description = [{ @@ -1789,7 +1789,7 @@ and `B, D, F, H` as group 1. Thus we get the outputs: TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CumsumOp : TF_Op<"Cumsum", [AllTypesMatch<["x", "out"]>, NoSideEffect]> { +def TF_CumsumOp : TF_Op<"Cumsum", [NoSideEffect, TF_AllTypesMatch<["x", "out"]>]> { let summary = "Compute the cumulative sum of the tensor `x` along `axis`."; let description = [{ @@ -4350,7 +4350,7 @@ cublas. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [AllTypesMatch<["input", "band"]>, NoSideEffect]> { +def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [NoSideEffect, TF_AllTypesMatch<["input", "band"]>]> { let summary = [{ Copy a tensor setting everything outside a central band in each innermost matrix to zero. 
}]; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index cb17341cefd..cd20cc79c17 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -70,6 +70,16 @@ class TF_OpIsBroadcastableToRes : And<[ "$_op.getOperand(" # opId # ").getType(), " "$_op.getResult(" # resId # ").getType())">]>; + +class TF_AllTypesMatchPred values> : + CPred<"TF::AreCastCompatible(llvm::makeArrayRef({"# StrJoin.result #"}))">; + +class TF_AllTypesMatch names> : + PredOpTrait< + "all of {" # StrJoin.result # "} have dynamically equal types ", + TF_AllTypesMatchPred< + !foreach(n, names, !subst("$_self", "$" # n, "$_self.getType()"))>>; + //===----------------------------------------------------------------------===// // TensorFlow op definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 85baff5e0d7..82ddc80875a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -110,47 +110,6 @@ static inline bool HasRankAtMost(Value value, int64_t rank) { return !type || type.getRank() <= rank; } -// Returns true if the given pair of TensorFlow types can be cast to one -// another. In other words, a single run-time value is legal for both the types. -// For example, tensor<*xf32> and tensor<3xf32> are cast compatible. -static bool AreCastCompatible(Type a, Type b) { - if (TensorCastOp::areCastCompatible(a, b)) return true; - - // Resource types may optionally contain subtypes information that does not - // match. Check subtypes compatibility when possible, otherwise treat them as - // compatible. - auto a_or_element_type = getElementTypeOrSelf(a); - auto b_or_element_type = getElementTypeOrSelf(b); - - auto a_kind = a_or_element_type.getKind(); - auto b_kind = b_or_element_type.getKind(); - - if (a_kind == TensorFlowTypes::RESOURCE && - b_kind == TensorFlowTypes::RESOURCE) { - auto a_resource_type = a_or_element_type.dyn_cast(); - auto b_resource_type = b_or_element_type.dyn_cast(); - bool a_has_subtype = !a_resource_type.getSubtypes().empty(); - bool b_has_subtype = !b_resource_type.getSubtypes().empty(); - - if (!a_has_subtype || !b_has_subtype) return true; - - assert(a_resource_type.getSubtypes().size() <= 1 && - "Resource type must have at most one subtype"); - assert(b_resource_type.getSubtypes().size() <= 1 && - "Resource type must have at most one subtype"); - - return TensorCastOp::areCastCompatible( - a_resource_type.getSubtypes().front(), - b_resource_type.getSubtypes().front()); - } - - // Variant types may optionally contain subtypes information that need not - // match. It is also not possible to compare subtypes for compatibility as - // their interpretation depends on the ops operating on them. So, accept all - // pairs of variant types. 
- return a_kind == TensorFlowTypes::VARIANT && - b_kind == TensorFlowTypes::VARIANT; -} static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; @@ -1413,7 +1372,7 @@ static LogicalResult Verify(DynamicStitchOp op) { auto expected_out_ty = RankedTensorType::get(expected_shape, out_ty.getElementType()); - if (!AreCastCompatible(out_ty, expected_out_ty)) { + if (!AreCastCompatible({out_ty, expected_out_ty})) { return op.emitOpError() << "has invalid output type; should be " "compatible with inferred type " << expected_out_ty; @@ -1814,14 +1773,14 @@ static LogicalResult Verify(IfOp op) { for (unsigned i = 0; i < expectedNumInputs; ++i) { auto operandType = op.getOperand(i + 1).getType().cast(); auto thenInputType = thenFuncType.getInput(i).cast(); - if (!AreCastCompatible(operandType, thenInputType)) + if (!AreCastCompatible({operandType, thenInputType})) return op.emitError( llvm::formatv("then branch input type {0} is incompatible with " "operand type {1} at index {2}", thenInputType, operandType, i)); auto elseInputType = elseFuncType.getInput(i).cast(); - if (!AreCastCompatible(operandType, elseInputType)) + if (!AreCastCompatible({operandType, elseInputType})) return op.emitError( llvm::formatv("else branch input type {0} is incompatible with " "operand type {1} at index {2}", @@ -1829,7 +1788,7 @@ static LogicalResult Verify(IfOp op) { // If branches have incompatible input types that means that no tensor can // serve as input to both the functions. Hence, the op is invalid. - if (!AreCastCompatible(thenInputType, elseInputType)) + if (!AreCastCompatible({thenInputType, elseInputType})) return op.emitError(llvm::formatv( "branches inputs have incompatible types {0} and {1} at index {2}", thenInputType, elseInputType, i)); @@ -1845,14 +1804,14 @@ static LogicalResult Verify(IfOp op) { for (unsigned i = 0; i < expectedNumResults; ++i) { auto resultType = op.getResult(i).getType().cast(); auto thenResultType = thenFuncType.getResult(i).cast(); - if (!AreCastCompatible(thenResultType, resultType)) + if (!AreCastCompatible({thenResultType, resultType})) return op.emitError( llvm::formatv("then branch result type {0} is incompatible with op " "result type {1} at index {2}", thenResultType, resultType, i)); auto elseResultType = elseFuncType.getResult(i).cast(); - if (!AreCastCompatible(elseResultType, resultType)) + if (!AreCastCompatible({elseResultType, resultType})) return op.emitError( llvm::formatv("else branch result type {0} is incompatible with op " "result type {1} at index {2}", @@ -3789,7 +3748,7 @@ static LogicalResult Verify(WhileOp op) { auto aType = a.second[idx]; auto bType = b.second[idx]; - if (!AreCastCompatible(aType, bType)) + if (!AreCastCompatible({aType, bType})) return op.emitError(llvm::formatv( "{0} type {1} is incompatible with {2} type {3} at index {4}", a.first, aType, b.first, bType, idx)); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index 6c3cd7fac92..d312e5e409b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -28,6 +28,134 @@ llvm::Optional> GetShape(mlir::Value value) { if (shaped_type.hasRank()) return shaped_type.getShape(); return llvm::None; } + +// Merges cast compatible shapes and returns a more refined shape. The two +// shapes are cast compatible if they have the same rank and at each dimension, +// either both have same size or one of them is dynamic. 
Returns false if the +// given shapes are not cast compatible. The refined shape is same or more +// precise than the two input shapes. +bool GetCastCompatibleShape(llvm::ArrayRef a_shape, + llvm::ArrayRef b_shape, + llvm::SmallVectorImpl* refined_shape) { + if (a_shape.size() != b_shape.size()) return false; + int64_t rank = a_shape.size(); + refined_shape->reserve(rank); + for (auto dims : llvm::zip(a_shape, b_shape)) { + int64_t dim1 = std::get<0>(dims); + int64_t dim2 = std::get<1>(dims); + + if (mlir::ShapedType::isDynamic(dim1)) { + refined_shape->push_back(dim2); + continue; + } + if (mlir::ShapedType::isDynamic(dim2)) { + refined_shape->push_back(dim1); + continue; + } + if (dim1 == dim2) { + refined_shape->push_back(dim1); + continue; + } + return false; + } + return true; +} + +// Given two types `a` and `b`, returns a refined type which is cast compatible +// with both `a` and `b` and is equal to or more precise than both of them. It +// returns empty Type if the input types are not cast compatible. +// +// The two types are considered cast compatible if they have dynamically equal +// shapes and element type. For element types that do not have subtypes, they +// must be equal. However for TensorFlow types such as Resource and Variant, +// that also have subtypes, we recursively check for subtype compatibilty for +// Resource types and assume all variant types are cast compatible. If either +// one of `a` or `b` have empty subtypes, they are considered cast compatible. +// +// The returned type is same or more precise than the input types. For example, +// if `a` and `b` are cast compatible types tensor<2x?x?xf32> and +// tensor respectively, the returned type is tensor<2x4x?xf32>. +// +// Provides option to ignore ref types on 'a'. This is useful for TF ops that +// might allow operands to either be same as result type or be a ref type +// corresponding to it. +mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, + bool may_ignore_ref_type_a) { + // Fast path if everything is equal. + if (a == b) return b; + + auto a_tt = a.dyn_cast(); + auto b_tt = b.dyn_cast(); + + // If only one of a or b is a tensor type, they are incompatible. + if (static_cast(a_tt) ^ static_cast(b_tt)) return nullptr; + + // For non-tensor types, we do not need to worry about shape and can return + // early. + if (!a_tt && !b_tt) { + // Remove ref types. + if (may_ignore_ref_type_a) { + if (auto ref_type = a.dyn_cast()) { + a = ref_type.RemoveRef(); + if (a == b) return a; + } + } + if (a.getKind() != b.getKind()) return nullptr; + + // If either is not a type that contain subtypes then the types are not cast + // compatible. + auto a_wst = a.dyn_cast(); + auto b_wst = b.dyn_cast(); + if (!a_wst || !b_wst) return nullptr; + + // For Variant types we are more permissive right now and accept all pairs + // of Variant types. If we are more constrainted and check compatibility of + // subtypes, we might reject valid graphs. + // TODO(prakalps): Variant doesn't have a subtype, we assign it + // one, so we should only assign it one when we know the subtype. Then we + // can be more constrained and check subtypes for cast compatibility as + // well. + if (a.isa()) return a; + + // For Resource types, we recursively check the subtypes for cast + // compatibility, if possible. Otherwise treat them as compatible. 
+ auto a_wst_st = a_wst.GetSubtypes(); + auto b_wst_st = b_wst.GetSubtypes(); + if (a_wst_st.empty() || b_wst_st.empty()) return a; + if (a_wst_st.size() != b_wst_st.size()) return nullptr; + llvm::SmallVector refined_subtypes; + for (auto subtypes : llvm::zip(a_wst_st, b_wst_st)) { + mlir::Type refined_st = + GetCastCompatibleType(std::get<0>(subtypes), std::get<1>(subtypes), + /*may_ignore_ref_type_a=*/false); + if (!refined_st) return nullptr; + refined_subtypes.push_back(refined_st.cast()); + } + + return mlir::TF::ResourceType::get(refined_subtypes, a.getContext()); + } + + // For tensor types, check compatibility of both element type and shape. + mlir::Type refined_element_ty = GetCastCompatibleType( + a_tt.getElementType(), b_tt.getElementType(), may_ignore_ref_type_a); + if (!refined_element_ty) return nullptr; + + if (!a_tt.hasRank() && !b_tt.hasRank()) { + return mlir::UnrankedTensorType::get(refined_element_ty); + } + if (!a_tt.hasRank()) { + return mlir::RankedTensorType::get(b_tt.getShape(), refined_element_ty); + } + if (!b_tt.hasRank()) { + return mlir::RankedTensorType::get(a_tt.getShape(), refined_element_ty); + } + + llvm::SmallVector refined_shape; + if (!GetCastCompatibleShape(a_tt.getShape(), b_tt.getShape(), &refined_shape)) + return nullptr; + + return mlir::RankedTensorType::get(refined_shape, refined_element_ty); +} } // namespace namespace mlir { @@ -224,44 +352,16 @@ bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs) { bool HasCompatibleElementTypes(Type lhs, Type rhs, bool may_ignore_ref_type_lhs) { - // Fast path if everything is equal. - if (lhs == rhs) return true; + return GetCastCompatibleType(lhs, rhs, may_ignore_ref_type_lhs) != nullptr; +} - // In TF all values are tensors. - auto lhs_tt = lhs.cast(); - auto rhs_tt = rhs.cast(); - - // Verify matching element types. These should be identical dynamically, - // so this allows for types not yet fully refined. - auto lhs_et = lhs_tt.getElementType(); - auto rhs_et = rhs_tt.getElementType(); - if (lhs_et == rhs_et) return true; - - // Remove ref types. - if (may_ignore_ref_type_lhs) { - if (auto ref_type = lhs_et.dyn_cast()) { - lhs_et = ref_type.RemoveRef(); - if (lhs_et == rhs_et) return true; - } - } - - if (lhs_et.getKind() != rhs_et.getKind()) return false; - - // If either is not type that contain subtypes then the element types don't - // match. - auto lhs_wst = lhs_et.dyn_cast(); - auto rhs_wst = rhs_et.dyn_cast(); - if (!lhs_wst || !rhs_wst) return false; - - // Consider the subtype recursively. 
- auto lhs_wst_st = lhs_wst.GetSubtypes(); - auto rhs_wst_st = rhs_wst.GetSubtypes(); - if (lhs_wst_st.empty() || rhs_wst_st.empty()) return true; - if (lhs_wst_st.size() != rhs_wst_st.size()) return false; - for (auto subtypes : llvm::zip(lhs_wst_st, rhs_wst_st)) { - if (!HasCompatibleElementTypes(std::get<0>(subtypes), - std::get<1>(subtypes))) - return false; +bool AreCastCompatible(ArrayRef types) { + Type common = types.front(); + for (auto type : types.drop_front()) { + Type refined_type = + GetCastCompatibleType(common, type, /*may_ignore_ref_type_a=*/false); + if (!refined_type) return false; + common = refined_type; } return true; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index d1e6a74a0c5..4c99aae4706 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -313,6 +313,12 @@ bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs); bool HasCompatibleElementTypes(Type lhs, Type rhs, bool may_ignore_ref_type_lhs = false); +// Returns true if all TensorFlow types can be cast to one +// another. In other words, a single run-time value is legal for both the types. +// For example, tensor<*xf32>, tensor and tensor<3xf32> are cast +// compatible. +bool AreCastCompatible(ArrayRef types); + } // end namespace TF } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 118ce2e8645..ffa287e0e53 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -881,20 +881,29 @@ func @testValidMatrixBandPartOpUnranked(%arg0: tensor<*xbf16>, %arg1: tensor, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { - // expected-error @+1 {{op failed to verify that all of {input, band} have same type}} - %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> - return %0 : tensor<64x64xbf16> +// Test valid tf.MatrixBandPart +// CHECK-LABEL: func @testValidMatrixBandPartOpUnrankedBand +func @testValidMatrixBandPartOpUnrankedBand(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<*xbf16> { + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<*xbf16> + return %0 : tensor<*xbf16> +} + +// ----- + +// Test valid tf.MatrixBandPart +// CHECK-LABEL: func @testValidMatrixBandPartOpCompatibleDynamicShapes +func @testValidMatrixBandPartOpCompatibleDynamicShapes(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor } // ----- // Test invalid tf.MatrixBandPart -func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<*xbf16> { - // expected-error @+1 {{op failed to verify that all of {input, band} have same type}} - %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<*xbf16> - return %0 : tensor<*xbf16> +func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { + // expected-error @+1 {{op failed to verify that all of {input, band} have dynamically equal types}} + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> + return %0 : tensor<64x64xbf16> } // ----- From 
e4c22494e716b34f148f8154ad23f77b7d68ac9c Mon Sep 17 00:00:00 2001 From: Ajay P Date: Tue, 12 May 2020 22:33:07 +0000 Subject: [PATCH 0446/1533] Addressed PR comments --- tensorflow/python/eager/forwardprop_test.py | 2 +- tensorflow/python/ops/custom_gradient.py | 40 ++++++++++----------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index c32de30a2b3..0c9ffaa0816 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -349,7 +349,7 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): _test_gradients(self, f, [constant_op.constant([1., 2.])], order=3) - @test_util.assert_no_new_pyobjects_executing_eagerly + # TODO(allenl): investigate why assert_no_new_pyobjects_executing_eagerly fails around this test? def testCustomGradientRecomputeGrad(self): @custom_gradient.recompute_grad diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index d0f06718911..6489aff117f 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -484,30 +484,26 @@ def recompute_grad(f): current_var_scope = variable_scope.get_variable_scope() with tape_lib.stop_recording(): result = f(*args, **kwargs) + @custom_gradient def grad(*dresult, **grad_kwargs): """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" - - def grad_eval(): - """Gradient function calculation for reverse mode autodiff.""" - variables = grad_kwargs.get("variables") - with backprop.GradientTape() as t: - id_args = [gen_array_ops.identity(x) for x in args] - t.watch(id_args) - if variables is not None: - t.watch(variables) - with ops.control_dependencies(dresult): - with variable_scope.variable_scope(current_var_scope): - result = f(*id_args, **kwargs) - kw_vars = [] + # Gradient calculation for reverse mode autodiff. + variables = grad_kwargs.get("variables") + with backprop.GradientTape() as t: + id_args = [gen_array_ops.identity(x) for x in args] + t.watch(id_args) if variables is not None: - kw_vars = list(variables) - grads = t.gradient(result, - list(id_args) + kw_vars, - output_gradients=dresult) - if len(grads) == 1 and None in grads: - return 0 - return grads[:len(id_args)], grads[len(id_args):] + t.watch(variables) + with ops.control_dependencies(dresult): + with variable_scope.variable_scope(current_var_scope): + result = f(*id_args, **kwargs) + kw_vars = [] + if variables is not None: + kw_vars = list(variables) + grads = t.gradient(result, + list(id_args) + kw_vars, + output_gradients=dresult) def transpose(*t_args, **t_kwargs): """Gradient function calculation for forward mode autodiff.""" @@ -517,7 +513,9 @@ def recompute_grad(f): "Consider not using recompute_grad in forward mode autodiff".format( f.__name__)) - return grad_eval(), transpose + if len(grads) == 1 and None in grads: + return 0, transpose + return (grads[:len(id_args)], grads[len(id_args):]), transpose return result, grad From 9a43ab39f2db65d5526773c9c6b45f2087e4c1c7 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Tue, 12 May 2020 15:41:09 -0700 Subject: [PATCH 0447/1533] Modify op version in optimize only if convertor version < quantized version. 
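For example, if the converter already stamped an op with version 4 while the minimum quantized version from OperatorProperty is 2, the version now stays at 4; it is only raised (for instance from 1 to 2) when the stamped version falls below that quantized minimum.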
PiperOrigin-RevId: 311216743 Change-Id: Iaac04750d0d302e9bba11b223c2885d6a36d74b3 --- tensorflow/lite/tools/optimize/model_utils.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/tools/optimize/model_utils.cc b/tensorflow/lite/tools/optimize/model_utils.cc index 26dcff222bd..ae868cf21b8 100644 --- a/tensorflow/lite/tools/optimize/model_utils.cc +++ b/tensorflow/lite/tools/optimize/model_utils.cc @@ -134,8 +134,10 @@ void SetOperatorCodeVersion(ModelT* model) { OperatorCodeT* op_code = model->operator_codes[op->opcode_index].get(); operator_property::OperatorProperty property = operator_property::GetOperatorProperty(model, subgraph_idx, op_idx); - if (property.quantizable) { - // Only update the versions of quantizable operations. + if (property.quantizable && op_code->version < property.version) { + // Only update the versions of quantizable operations if the original + // version is lesser than minimum quantized one mentioned by + // OperatorProperty. op_code->version = property.version; } } From ce11d03f84bb182c0eb4bdda0d838c58c83a9e24 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 12 May 2020 15:51:38 -0700 Subject: [PATCH 0448/1533] [XLA/GPU] Make Thunk::Initialize() happen at compile-time, not run-time. This simplifies GpuExecutable for MLIR transition. PiperOrigin-RevId: 311218613 Change-Id: I42aaca015689b19c8a6343f1cac50451e6d0cf84 --- tensorflow/compiler/xla/service/gpu/BUILD | 12 +++++------ .../xla/service/gpu/amdgpu_compiler.cc | 10 +++++---- .../xla/service/gpu/amdgpu_compiler.h | 2 +- .../xla/service/gpu/conditional_thunk.cc | 4 ++-- .../xla/service/gpu/conditional_thunk.h | 2 +- .../compiler/xla/service/gpu/for_thunk.cc | 4 ++-- .../compiler/xla/service/gpu/for_thunk.h | 2 +- .../compiler/xla/service/gpu/gpu_compiler.cc | 21 ++++++------------- .../compiler/xla/service/gpu/gpu_compiler.h | 7 ++++--- .../xla/service/gpu/gpu_executable.cc | 10 +++++---- .../compiler/xla/service/gpu/gpu_executable.h | 18 ++++++++++------ .../compiler/xla/service/gpu/gpu_types.h | 17 --------------- .../compiler/xla/service/gpu/kernel_thunk.cc | 9 ++++---- .../compiler/xla/service/gpu/kernel_thunk.h | 4 +++- .../xla/service/gpu/nvptx_compiler.cc | 11 ++++++---- .../compiler/xla/service/gpu/nvptx_compiler.h | 2 +- .../xla/service/gpu/sequential_thunk.cc | 4 ++-- .../xla/service/gpu/sequential_thunk.h | 2 +- tensorflow/compiler/xla/service/gpu/thunk.h | 5 +++-- .../compiler/xla/service/gpu/while_thunk.cc | 6 +++--- .../compiler/xla/service/gpu/while_thunk.h | 2 +- .../service/mlir_gpu/mlir_compiler_impl.cc | 7 +++---- .../compiler/xla/tests/llvm_compiler_test.cc | 7 ++++--- 23 files changed, 78 insertions(+), 90 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index bff8734de5f..0f6b2cb72e6 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -17,15 +17,15 @@ load( "tf_cuda_library", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load( - "@local_config_rocm//rocm:build_defs.bzl", - "if_rocm", - "if_rocm_is_configured", -) +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) load("//tensorflow:tensorflow.bzl", "if_nccl") package( @@ -86,7 +86,6 @@ cc_library( name = "gpu_types", hdrs = ["gpu_types.h"], deps = [ - 
"//tensorflow/compiler/xla:types", "@com_google_absl//absl/types:variant", ], ) @@ -406,7 +405,6 @@ cc_library( deps = [ ":buffer_allocations", ":gpu_executable_run_options", - ":gpu_types", ":hlo_execution_profiler", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla/service:hlo", diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc index 485aff0c4d8..974db02b1b3 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc @@ -104,9 +104,11 @@ GpuVersion AMDGPUCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { return isa_version; } -StatusOr AMDGPUCompiler::CompileTargetBinary( - const HloModule* module, llvm::Module* llvm_module, GpuVersion gpu_version, - se::StreamExecutor* stream_exec) { +StatusOr>> +AMDGPUCompiler::CompileTargetBinary(const HloModule* module, + llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { if (rocdl_dir_.empty()) { // Compute rocdl_dir_ just once and cache it in this member. rocdl_dir_ = GetROCDLDir(module->config()); @@ -127,7 +129,7 @@ StatusOr AMDGPUCompiler::CompileTargetBinary( user_post_optimization_hook_(*llvm_module); } - return GpuTargetBinary{"", std::move(hsaco)}; + return std::pair>("", std::move(hsaco)); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h index 9033585763b..acc5e021e3d 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h @@ -39,7 +39,7 @@ class AMDGPUCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - StatusOr CompileTargetBinary( + StatusOr>> CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 5e7d89c7aee..e31f45942b1 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -50,7 +50,7 @@ void ConditionalThunk::ComputeAnnotations() { } } -Status ConditionalThunk::Initialize(const GpuTargetBinary& target_binary, +Status ConditionalThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { if (branch_index_is_bool_) { TF_RET_CHECK(branch_thunks_.size() == 2); @@ -58,7 +58,7 @@ Status ConditionalThunk::Initialize(const GpuTargetBinary& target_binary, TF_RET_CHECK(!branch_thunks_.empty()); } for (auto& branch_thunk : branch_thunks_) { - TF_RETURN_IF_ERROR(branch_thunk->Initialize(target_binary, executor)); + TF_RETURN_IF_ERROR(branch_thunk->Initialize(executable, executor)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index ba69e1a38ec..404e2131eff 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -52,7 +52,7 @@ class ConditionalThunk : public Thunk { ConditionalThunk& operator=(const ConditionalThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; 
Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index aacc9deb739..0a97f668b38 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -39,9 +39,9 @@ void ForThunk::ComputeAnnotations() { body_thunk_sequence_->ComputeAnnotations(); } -Status ForThunk::Initialize(const GpuTargetBinary& target_binary, +Status ForThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(target_binary, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 57657b6825f..57402f70627 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -38,7 +38,7 @@ class ForThunk : public Thunk { ForThunk& operator=(const ForThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 533ff52a90d..5f6dfd7d3a5 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -565,7 +565,8 @@ StatusOr> GpuCompiler::RunBackend( GpuVersion gpu_version = GetGpuVersion(stream_exec); - TF_ASSIGN_OR_RETURN(GpuTargetBinary backend_result, + using BackendCompileResult = std::pair>; + TF_ASSIGN_OR_RETURN(BackendCompileResult backend_result, CompileTargetBinary(module.get(), &llvm_module, gpu_version, stream_exec)); @@ -577,11 +578,6 @@ StatusOr> GpuCompiler::RunBackend( thunk_schedule->ToString()); } - std::vector thunks; - for (Thunk* thunk : thunk_schedule->TotalOrder()) { - thunks.push_back(thunk); - } - std::unique_ptr profile_index_map; std::unique_ptr profile_printer; @@ -601,19 +597,14 @@ StatusOr> GpuCompiler::RunBackend( } auto* gpu_executable = new GpuExecutable( - std::move(backend_result), gpu_version, std::move(thunk_schedule), - std::move(module), std::move(buffer_assignment), - std::move(profile_printer), std::move(profile_index_map)); + backend_result.first, backend_result.second, gpu_version, + std::move(thunk_schedule), std::move(module), + std::move(buffer_assignment), std::move(profile_printer), + std::move(profile_index_map)); if (embed_ir_in_executable) { DCHECK_NE("", ir_module_string_before_opt); gpu_executable->set_ir_module_string(ir_module_string_before_opt); } - - for (Thunk* thunk : thunks) { - TF_RETURN_IF_ERROR( - thunk->Initialize(gpu_executable->target_binary(), stream_exec)); - } - return std::unique_ptr(gpu_executable); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index deb5d785777..b52af5392d1 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -74,9 +74,10 @@ class GpuCompiler : public LLVMCompiler { virtual GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) = 0; - virtual StatusOr CompileTargetBinary( - const HloModule* hlo_module, llvm::Module* llvm_module, - 
GpuVersion gpu_version, se::StreamExecutor* stream_exec) = 0; + virtual StatusOr>> + CompileTargetBinary(const HloModule* hlo_module, llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) = 0; Status PrepareHloModuleForIrEmitting(HloModule* hlo_module); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index ebd3630635b..2df6b50d361 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -52,15 +52,16 @@ using ::tensorflow::profiler::ScopedAnnotation; // Implementation note: HLO profiling is always enabled for GPU executables, // since we can use timers around thunks. GpuExecutable::GpuExecutable( - GpuTargetBinary target_binary, GpuVersion gpu_version, - std::unique_ptr thunk_schedule, + const string& text, const std::vector& binary, + GpuVersion gpu_version, std::unique_ptr thunk_schedule, std::shared_ptr hlo_module, std::shared_ptr assignment, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)), - target_binary_(std::move(target_binary)), + text_(text), + binary_(binary), gpu_version_(gpu_version), thunk_schedule_(std::move(thunk_schedule)), assignment_(std::move(assignment)) { @@ -175,6 +176,7 @@ Status GpuExecutable::ExecuteThunks( // module, we won't get any data, but that's probably an OK trade-off. ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); + TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction()); se::Stream* stream = @@ -467,7 +469,7 @@ const InstructionValueSet& GpuExecutable::GetRootValueSet() const { int64 GpuExecutable::SizeOfGeneratedCodeInBytes() { // Non-empty PTX but empty cubin: compilation must have failed, return // "unknown". - if (binary().empty() && !text().empty()) { + if (binary().empty() && !text_.empty()) { return -1; } return binary().size(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 29441c60b04..045a36c099b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -52,7 +52,8 @@ class GpuExecutable : public Executable { // We need to share ownership of hlo_module and assignment with profiler to // safely keep a reference to these objects during tracing period, thus they // are passed as shared pointers. - GpuExecutable(GpuTargetBinary target_binary, GpuVersion gpu_version, + GpuExecutable(const string& text, const std::vector& binary, + GpuVersion gpu_version, std::unique_ptr thunk_schedule, std::shared_ptr hlo_module, std::shared_ptr assignment, @@ -72,14 +73,12 @@ class GpuExecutable : public Executable { // Returns the compiled code for the computation. The compiled code is PTX in // Cuda and unused empty string in ROCm. - const string& text() const { return target_binary_.text; } + const string& text() const { return text_; } // Returns the binary stored in this GpuExecutable. The binary is cubin in // Cuda, and HSA code object in ROCm. It may be empty, in which case // compilation is left up to the GPU driver. 
- const std::vector& binary() const { return target_binary_.binary; } - - const GpuTargetBinary& target_binary() const { return target_binary_; } + const std::vector& binary() const { return binary_; } // ExecuteAsyncOnStream will fail if the compute capability of the stream // doesn't match the compute capability passed to this object's constructor. @@ -132,7 +131,14 @@ class GpuExecutable : public Executable { // This string should be modified only before ExecuteOnStream. string ir_module_string_; - const GpuTargetBinary target_binary_; + // The compiled code for the computation. + const string text_; + + // The GPU machine code for the computation, targeting GPUs at + // compute_capability_. + // + // May be empty, in which case we leave compilation up to the GPU driver. + const std::vector binary_; // The GPU version for compute compatibility check. GpuVersion gpu_version_; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_types.h b/tensorflow/compiler/xla/service/gpu/gpu_types.h index 5c8b8093d65..1c51040fb82 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_types.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_types.h @@ -16,11 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TYPES_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TYPES_H_ -#include -#include - #include "absl/types/variant.h" -#include "tensorflow/compiler/xla/types.h" namespace xla { namespace gpu { @@ -29,19 +25,6 @@ namespace gpu { // it comprises a pair of integers denoting major and minor version. // On ROCm platform, it comprises one integer for AMD GCN ISA version. using GpuVersion = absl::variant, int>; - -// A struct to carry around compiled results by the GPU assembler. -struct GpuTargetBinary { - GpuTargetBinary(const GpuTargetBinary& other) = delete; - GpuTargetBinary(GpuTargetBinary&& other) = default; - - // The text format of the compiled result, e.g. PTX. - std::string text; - - // The actual compiled binary. - std::vector binary; -}; - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index 0b5010ea66b..d976b5d8d4d 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -41,7 +42,7 @@ KernelThunk::KernelThunk(absl::Span args, kernel_name_(kernel_name), unroll_factor_(unroll_factor) {} -Status KernelThunk::Initialize(const GpuTargetBinary& target_binary, +Status KernelThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { tensorflow::mutex_lock lock(mutex_); @@ -54,10 +55,8 @@ Status KernelThunk::Initialize(const GpuTargetBinary& target_binary, if (kernel_cache_.end() == it) { TF_ASSIGN_OR_RETURN( std::unique_ptr kernel, - CreateKernel(kernel_name_, args_.size(), target_binary.text, - target_binary.binary, executor)); - CHECK(!target_binary.binary.empty()); - CHECK(kernel); + CreateKernel(kernel_name_, args_.size(), executable.text(), + executable.binary(), executor)); kernel_cache_.emplace(executor, std::move(kernel)); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 97a1d08a57e..88351881f3a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -35,6 +35,8 @@ limitations under the License. namespace xla { namespace gpu { +class GpuExecutable; + // This class stores everything that StreamExecutor needs for launching a // kernel. It implements the ExecuteOnStream interface for GpuExecutable to // invoke the corresponding kernel. @@ -56,7 +58,7 @@ class KernelThunk : public Thunk { int unroll_factor() const { return unroll_factor_; } void SetLaunchDimensions(const LaunchDimensions& launch_dims); - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index cf6fe9292e5..0196267d904 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -295,9 +295,11 @@ GpuVersion NVPTXCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { return std::make_pair(cc_major, cc_minor); } -StatusOr NVPTXCompiler::CompileTargetBinary( - const HloModule* module, llvm::Module* llvm_module, GpuVersion gpu_version, - se::StreamExecutor* stream_exec) { +StatusOr>> +NVPTXCompiler::CompileTargetBinary(const HloModule* module, + llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { std::pair compute_capability = absl::get>(gpu_version); @@ -338,7 +340,8 @@ StatusOr NVPTXCompiler::CompileTargetBinary( stream_exec, ptx, compute_capability.first, compute_capability.second, module->config()); - return GpuTargetBinary{std::move(ptx), std::move(cubin)}; + return std::pair>(std::move(ptx), + std::move(cubin)); } std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index ec550b5b2ff..e69be947522 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -48,7 +48,7 @@ class NVPTXCompiler : 
public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - StatusOr CompileTargetBinary( + StatusOr>> CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index bd260336c28..025ca60ef0c 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -34,10 +34,10 @@ void SequentialThunk::ComputeAnnotations() { } } -Status SequentialThunk::Initialize(const GpuTargetBinary& target_binary, +Status SequentialThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { for (auto& thunk : thunks_) { - TF_RETURN_IF_ERROR(thunk->Initialize(target_binary, executor)); + TF_RETURN_IF_ERROR(thunk->Initialize(executable, executor)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index b5475664733..3abb82c0b66 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -40,7 +40,7 @@ class SequentialThunk : public Thunk { const std::vector>& thunks() const { return thunks_; } void ComputeAnnotations() override; - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 7aff9ca47b7..e9be41b74de 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/lib/core/status.h" @@ -31,6 +30,8 @@ limitations under the License. namespace xla { namespace gpu { +class GpuExecutable; + // Thunk acts as the bridge between IrEmitter and GpuExecutable. It stores the // metadata IrEmitter generates for GpuExecutable to invoke an HloInstruction. // @@ -96,7 +97,7 @@ class Thunk { // This may be called multiple times. Its main purpose is to give us a chance // to do initialization outside of ExecuteOnStream() so that the // time spent initializing doesn't count towards our execution profile. 
- virtual Status Initialize(const GpuTargetBinary& /*target_binary*/, + virtual Status Initialize(const GpuExecutable& /*executable*/, se::StreamExecutor* /*executor*/) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 2650508093e..4134cd39832 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -45,11 +45,11 @@ void WhileThunk::ComputeAnnotations() { body_thunk_sequence_->ComputeAnnotations(); } -Status WhileThunk::Initialize(const GpuTargetBinary& target_binary, +Status WhileThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR( - condition_thunk_sequence_->Initialize(target_binary, executor)); - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(target_binary, executor)); + condition_thunk_sequence_->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 77ee0104a1f..31db01b72ba 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -47,7 +47,7 @@ class WhileThunk : public Thunk { WhileThunk& operator=(const WhileThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc index 667cdef8f6c..35ac3b2bf63 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -549,11 +549,10 @@ StatusOr> MlirCompilerImpl::RunBackend( } // TODO(b/137624192): Add profiling support. - return {absl::make_unique( - xla::gpu::GpuTargetBinary{ptx, cubin}, GetGpuVersion(stream_exec), - std::move(thunk_schedule), emission_context.releaseHloModule(), - std::move(buffer_assignment), nullptr, nullptr)}; + ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), + emission_context.releaseHloModule(), std::move(buffer_assignment), + nullptr, nullptr)}; } StatusOr>> MlirCompilerImpl::Compile( diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index 16ed02296b7..1947f517bd9 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -55,15 +55,16 @@ class GpuDummyCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) { return 0; } - StatusOr CompileTargetBinary( + StatusOr>> CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, - GpuVersion gpu_version, se::StreamExecutor* stream_exec) override { + GpuVersion gpu_version, se::StreamExecutor* stream_exec) { if (user_post_optimization_hook_) { user_post_optimization_hook_(*llvm_module); } std::vector compiled_results; - return GpuTargetBinary{"", std::move(compiled_results)}; + return std::pair>( + "", std::move(compiled_results)); } }; } // namespace gpu From 8788846283c461a7475af4ce1921d5a2d8b075c3 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 12 May 2020 15:55:20 -0700 Subject: [PATCH 0449/1533] Fix `alignment_hint < sizeof(void*)` in BuiltinDataAllocator::Allocate Handle the case when `alignof(T)` return value `< sizeof(void*)` and causes the fail of `aligned_alloc()`. Fix by using `sizeof(void*)` as `alignment_hint` in this case. PiperOrigin-RevId: 311219237 Change-Id: Ib5d9c194ac00f17f4f3a47bf98cba0afbdce5840 --- tensorflow/lite/BUILD | 1 + tensorflow/lite/interpreter_builder.cc | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index a2ab4854165..14babee2da7 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -253,6 +253,7 @@ cc_library( "//tensorflow/lite/core/api", "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/experimental/resource", + "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/nnapi:nnapi_implementation", "//tensorflow/lite/schema:schema_fbs", ] + select({ diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc index e32e0768995..fb87702fd13 100644 --- a/tensorflow/lite/interpreter_builder.cc +++ b/tensorflow/lite/interpreter_builder.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/util.h" #include "tensorflow/lite/version.h" @@ -209,7 +210,15 @@ class MallocDataAllocator : public BuiltinDataAllocator { public: void* Allocate(size_t size, size_t alignment_hint) override { #ifdef TFLITE_USE_STD_ALIGNED_ALLOC - return aligned_alloc(alignment_hint, size); + // Ensure that alignment is a power of two and a multiple of sizeof(void *) + // and that size is an integral multiple of alignment. + size_t used_alignment = std::max(alignment_hint, sizeof(void*)); + size_t used_size = + ((size + used_alignment - 1) / used_alignment) * used_alignment; + TFLITE_DCHECK( + (used_alignment != 0) && + ((used_alignment & (used_alignment - 1)) == 0)); // is power-of-two + return aligned_alloc(used_alignment, used_size); #else return malloc(size); #endif From 3200e57b9cacf2883cfd28b18a9edf71bafaefca Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 12 May 2020 15:59:52 -0700 Subject: [PATCH 0450/1533] Update the OSS image to pickup latest tf estimator PIP package. PiperOrigin-RevId: 311220029 Change-Id: I1bbcdf92f410a074d6918cf61cfb380daa74339e --- ....rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 | 7 +++++++ .../Dockerfile.rbe.ubuntu16.04-manylinux2010 | 15 +++++++-------- .../toolchains/preconfig/generate/containers.bzl | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 index df4b847b6f7..91d501109d0 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 @@ -75,6 +75,13 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py RUN python3.8 get-pip.py RUN python3.8 -m pip install --upgrade pip setuptools wheel +# Overwrite include paths that are generated for the multipython image. 
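+# ln -sf (force) is used so these links can replace ones that already exist in the dt7/dt8 trees.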
+RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" + +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" + # Make apt work with python 3.6. RUN cp /usr/lib/python3/dist-packages/apt_pkg.cpython-35m-x86_64-linux-gnu.so \ /usr/lib/python3/dist-packages/apt_pkg.so diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 b/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 index 516129ccd43..a14b9ac2a3e 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 @@ -73,13 +73,12 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py RUN python3.8 get-pip.py RUN python3.8 -m pip install --upgrade pip setuptools wheel -# TODO(klimek): Figure out a better way to get the right include paths -# forwarded when we install new packages. -RUN ln -s "/usr/include/x86_64-linux-gnu/python2.7" "/dt7/usr/include/x86_64-linux-gnu/python2.7" -RUN ln -s "/usr/include/x86_64-linux-gnu/python2.7" "/dt8/usr/include/x86_64-linux-gnu/python2.7" +# Overwrite include paths that are generated for the multipython image. +RUN ln -sf "/usr/include/x86_64-linux-gnu/python2.7" "/dt7/usr/include/x86_64-linux-gnu/python2.7" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python2.7" "/dt8/usr/include/x86_64-linux-gnu/python2.7" -RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" -RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" -RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" -RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" \ No newline at end of file diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index 9be398f5f2d..8e6f48df99e 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -2,13 +2,13 @@ container_digests = { "ubuntu16.04": "sha256:b90dcf2f35f3354909f4491bdf019c110b4b4d95ef0395ebf178bc5d523a4208", "centos6": "sha256:d09c12fb26fbbe8398b4973260c75172eb67d509dae9d6f4ad54279b7d6b0494", - "ubuntu16.04-manylinux2010": "sha256:b5227c4069980005336dd5cf04e3122974984da3396a514a06d7db3a7ae7b2f9", + "ubuntu16.04-manylinux2010": "sha256:d5b056506e14eb216b6e27988814617a09dea77ec1ab46972072038f9df3e728", "cuda10.0-cudnn7-ubuntu14.04": "sha256:d433e1221f802dac393bc8652fabcc63aa46896cd920bb888ae0e2002fe6b756", "cuda10.0-cudnn7-centos7": "sha256:a453b7147a60928a8345689eae48916a746b3578b5e831bfa151f0529d469c88", "cuda10.0-cudnn7-centos6": "sha256:a1909ba09c703340ee0074ce63dd94fe8fea48035a25264677907a609e2375e0", "cuda10.1-cudnn7-centos6": 
"sha256:454b899657e87893ee5e68dc0f87df59b6a0a7418ae09cafcc3dd65ac71feca9", "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432", - "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:cc7f760195d7bbe283b45ae740409751d0b74d8ffbdc2f7a3cb62c71a71fbe25", + "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:1e4e888f14a3d5b127151f7970487613a46ca957babe0432786627c78c0b1a36", "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:13aa5e700bb609521cd4365d4152d7d8f4118cae7ce174ce7d54cc529e21766a", "rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe", "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12", From b79631972128ab60c1f646dca68867459f5cb102 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 07:06:27 +0800 Subject: [PATCH 0451/1533] list command line flags in readme And update some `usage()` descriptions --- .../lite/examples/label_image/README.md | 27 ++++++++++++++++++- .../lite/examples/label_image/label_image.cc | 6 ++--- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/examples/label_image/README.md b/tensorflow/lite/examples/label_image/README.md index 9d37c153361..9ca8fd05e09 100644 --- a/tensorflow/lite/examples/label_image/README.md +++ b/tensorflow/lite/examples/label_image/README.md @@ -138,7 +138,7 @@ average time:10.348 ms To run a model with the Hexagon Delegate, assuming we have followed the [Hexagon Delegate Guide](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/hexagon_delegate.md) -and installed Hexagon libraries in `/data/local/tmp`. Run it +and installed Hexagon libraries in `/data/local/tmp`. Run it wth (`-j 1`) ``` adb shell \ "/data/local/tmp/label_image \ @@ -186,4 +186,29 @@ average time: 17.33 ms 0.00414093: 514 cornet ``` +With `-h` or any other unsupported flags, `label_image` will list +supported options +``` +sargo:/data/local/tmp $ ./label_image -h +./label_image: invalid option -- h +label_image +--accelerated, -a: [0|1], use Android NNAPI or not +--old_accelerated, -d: [0|1], use old Android NNAPI delegate or not +--allow_fp16, -f: [0|1], allow running fp32 models with fp16 or not +--count, -c: loop interpreter->Invoke() for certain times +--gl_backend, -g: [0|1]: use GL GPU Delegate on Android +--hexagon_delegate, -j: [0|1]: use Hexagon Delegate on Android +--input_mean, -b: input mean +--input_std, -s: input standard deviation +--image, -i: image_name.bmp +--labels, -l: labels for the model +--tflite_model, -m: model_name.tflite +--profiling, -p: [0|1], profiling or not +--num_results, -r: number of results to show +--threads, -t: number of threads +--verbose, -v: [0|1] print more information +--warmup_runs, -w: number of warmup runs +--xnnpack_delegate, -x [0:1]: xnnpack delegate +``` + See the `label_image.cc` source code for other command line options. 
diff --git a/tensorflow/lite/examples/label_image/label_image.cc b/tensorflow/lite/examples/label_image/label_image.cc index ec744d70381..364ac325967 100644 --- a/tensorflow/lite/examples/label_image/label_image.cc +++ b/tensorflow/lite/examples/label_image/label_image.cc @@ -362,8 +362,8 @@ void display_usage() { << "--old_accelerated, -d: [0|1], use old Android NNAPI delegate or not\n" << "--allow_fp16, -f: [0|1], allow running fp32 models with fp16 or not\n" << "--count, -c: loop interpreter->Invoke() for certain times\n" - << "--gl_backend, -g: use GL GPU Delegate on Android\n" - << "--hexagon_delegate: use Hexagon Delegate on Android\n" + << "--gl_backend, -g: [0|1]: use GL GPU Delegate on Android\n" + << "--hexagon_delegate, -j: [0|1]: use Hexagon Delegate on Android\n" << "--input_mean, -b: input mean\n" << "--input_std, -s: input standard deviation\n" << "--image, -i: image_name.bmp\n" @@ -374,7 +374,7 @@ void display_usage() { << "--threads, -t: number of threads\n" << "--verbose, -v: [0|1] print more information\n" << "--warmup_runs, -w: number of warmup runs\n" - << "--xnnpack_delegate, -x: xnnpack delegate\n" + << "--xnnpack_delegate, -x [0:1]: xnnpack delegate\n" << "\n"; } From 1afe51a60cbda6fc42b157f6393063052208da70 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 16:02:45 -0700 Subject: [PATCH 0452/1533] [tf.data] Update the node destructor to solve the stack overflow problem. PiperOrigin-RevId: 311220597 Change-Id: I7efaa889a27e52c0d05bec9778a7f40976a5e90e --- tensorflow/core/framework/model.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index 1c3b64f4a0d..97ac9dd35ae 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -142,7 +142,31 @@ class Node { metrics_(name_), output_(args.output.get()) {} - virtual ~Node() { FlushMetrics(); } + virtual ~Node() { + // Clear the sub-nodes instead of relying on implicit shared pointer + // destructor to avoid potential stack overflow when the tree is deep. + std::deque> queue; + { + mutex_lock l(mu_); + while (inputs_.size() > 0) { + queue.push_back(inputs_.front()); + inputs_.pop_front(); + } + } + while (!queue.empty()) { + auto node = queue.back(); + queue.pop_back(); + { + mutex_lock l(node->mu_); + while (node->inputs_.size() > 0) { + queue.push_back(node->inputs_.front()); + node->inputs_.pop_front(); + } + } + } + + FlushMetrics(); + } // Adds an input. void add_input(std::shared_ptr node) TF_LOCKS_EXCLUDED(mu_) { From 2ca6769ae8b49f4e17cdbe7c9976da84ec3e3ca7 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 12 May 2020 16:09:03 -0700 Subject: [PATCH 0453/1533] Update docstring to remove confusing term num_split/size_split, based on review Also removes unnecessary `...` Signed-off-by: Yong Tang --- tensorflow/python/ops/array_ops.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 8aa5d66f402..523020df772 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1919,27 +1919,25 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): See also `tf.unstack`. - If `num_or_size_splits` is an integer, we call it num_split and - `value` is split along the dimension `axis` into `num_split` smaller - tensors. This requires that `value.shape[axis]` is divisible by `num_split`. 
+ If `num_or_size_splits` is an integer, then `value` is split along the + dimension `axis` into `num_or_size_splits` smaller tensors. This requires that + `value.shape[axis]` is divisible by `num_or_size_splits`. - If `num_or_size_splits` is a 1-D Tensor (or list), we call it `size_splits` - and `value` is split into `len(size_splits)` elements. The shape of the `i`-th + If `num_or_size_splits` is a 1-D Tensor (or list), then `value` is split into + `len(num_or_size_splits)` elements. The shape of the `i`-th element has the same size as the `value` except along dimension `axis` where - the size is `size_splits[i]`. + the size is `num_or_size_splits[i]`. For example: >>> x = tf.Variable(tf.random.uniform([5, 30], -1, 1)) >>> - >>> # Split `x` into 3 tensors along dimension 1: - ... + >>> # Split `x` into 3 tensors along dimension 1 >>> s0, s1, s2 = tf.split(x, num_or_size_splits=3, axis=1) >>> tf.shape(s0).numpy() array([ 5, 10], dtype=int32) >>> >>> # Split `x` into 3 tensors with sizes [4, 15, 11] along dimension 1 - ... >>> split0, split1, split2 = tf.split(x, [4, 15, 11], 1) >>> tf.shape(split0).numpy() array([5, 4], dtype=int32) From 3924ce6cc0afa3c7b46c70b16ed3284082fc0ece Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Tue, 12 May 2020 16:14:32 -0700 Subject: [PATCH 0454/1533] Enable Reduce Mean op in Hexagon delegate for uint8/int8. Added requantize op after Mean to make sure the output is with correct scale. PiperOrigin-RevId: 311222782 Change-Id: Idf0a627fd1da3bc13d68b276dbcf8cc07011c435 --- .../experimental/delegates/hexagon/README.md | 1 + .../hexagon/builders/reduce_builder.cc | 68 +++++++++++-------- .../hexagon/builders/tests/reduce_test.cc | 52 +++++++++----- .../experimental/delegates/hexagon/utils.cc | 7 +- 4 files changed, 82 insertions(+), 46 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/README.md b/tensorflow/lite/experimental/delegates/hexagon/README.md index 07f1a92bdec..a97342c9fdc 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/README.md +++ b/tensorflow/lite/experimental/delegates/hexagon/README.md @@ -80,6 +80,7 @@ are verified in `IsNodeSupportedByHexagon`: * L2Normalization (without any activation) * Logistic (aka Sigmoid) * MaxPool2D (without any activation) (b/129276536) +* Mean * MirrorPad * Mul (without any activation) (b/129276536) * Neg diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/reduce_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/reduce_builder.cc index 8401f76cf4d..066c82560a8 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/reduce_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/reduce_builder.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/hexagon_nn.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/util.h" namespace tflite { namespace delegates { @@ -35,9 +36,7 @@ TfLiteStatus ReduceOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, tensor_id = inputs->data[0]; const auto& input_tensor = context->tensors[tensor_id]; AddInput(graph_builder_->GetHexagonTensorId(tensor_id)); - ComputeMinAndMaxQuantValues(input_tensor, &input_min_, &input_max_, - std::numeric_limits::min(), - std::numeric_limits::max()); + ComputeMinAndMaxQuantValues(input_tensor, &input_min_, &input_max_); auto* input_min_const = graph_builder_->AddConstNodeWithData( quant_bound_shape, reinterpret_cast(&input_min_), sizeof(input_min_)); @@ -63,37 +62,48 @@ TfLiteStatus ReduceOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, return kTfLiteError; } + auto& output_tensor = context->tensors[outputs->data[0]]; int output_batch_size, output_height_size, output_width_size, output_depth_size; GetDims(&output_batch_size, &output_height_size, &output_width_size, - &output_depth_size, context->tensors[outputs->data[0]].dims); + &output_depth_size, output_tensor.dims); - // Hexagon's sum-reduction outputs int32, so we shrink it down to UInt8. - if (op_node_.op_type == OP_QuantizedSum_8to32) { - const auto& reduce_out = AddOutput(sizeof(int32_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - const auto& reduce_out_min = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - const auto& reduce_out_max = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + float output_min = -1, output_max = -1; + ComputeMinAndMaxQuantValues(output_tensor, &output_min, &output_max); + auto* output_min_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&output_min), + sizeof(output_min)); + auto* output_max_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&output_max), + sizeof(output_max)); + // Min/max values for output tensor. + AddInput(TensorID(output_min_const->GetID(), 0)); + AddInput(TensorID(output_max_const->GetID(), 0)); - auto* quantize_output_op = graph_builder_->AddNode(GetTFLiteNodeID()); - quantize_output_op->SetOpType(OP_QuantizeDownAndShrinkRange_32to8); - quantize_output_op->AddInput(reduce_out); - quantize_output_op->AddInput(reduce_out_min); - quantize_output_op->AddInput(reduce_out_max); - node_output_ = - quantize_output_op->AddOutput(sizeof(uint8_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - quantize_output_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - quantize_output_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - } else { - node_output_ = AddOutput(sizeof(uint8_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - } + // Add outputs + size_t output_element_size = 0; + TF_LITE_ENSURE_STATUS( + GetSizeOfType(context, output_tensor.type, &output_element_size)); + auto mean_output = AddOutput(output_element_size, 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + auto mean_out_min = AddOutput(output_element_size, 4, {1, 1, 1, 1}); + auto mean_out_max = AddOutput(output_element_size, 4, {1, 1, 1, 1}); + // Mean op doesn't honor the passed min/max for output, so we need + // to add requantize. 
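+  // Requantize_8to8 rescales from the range the Mean op actually produced
+  // (mean_out_min/mean_out_max) to the fixed output_min/output_max expected
+  // for the output tensor.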
+ auto* requantize_op = graph_builder_->AddNode(GetTFLiteNodeID()); + requantize_op->SetOpType(OP_Requantize_8to8); + requantize_op->AddInput(mean_output); + requantize_op->AddInput(mean_out_min); + requantize_op->AddInput(mean_out_max); + requantize_op->AddInput(TensorID(output_min_const->GetID(), 0)); + requantize_op->AddInput(TensorID(output_max_const->GetID(), 0)); + node_output_ = + requantize_op->AddOutput(sizeof(uint8_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + requantize_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + requantize_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); return kTfLiteOk; } diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/reduce_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/reduce_test.cc index 7e4f95ffa96..a3cd8c8255b 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/reduce_test.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/reduce_test.cc @@ -18,8 +18,8 @@ limitations under the License. namespace tflite { using testing::ElementsAreArray; -// TODO(b/148390890): All tests are disabled, enable after fix is availabel -// and op is enabled. +// TODO(b/148390890): Reduce Sum tests are disabled, enable after fix is +// available and op is enabled. class ReduceOpModel : public SingleOpModelWithHexagon { public: ReduceOpModel(BuiltinOperator type, const TensorData& input, @@ -49,32 +49,52 @@ class ReduceOpModel : public SingleOpModelWithHexagon { int output_; }; -TEST(ReduceOpModel, DISABLED_MeanNotKeepDims) { +template +void TestMeanImpl() { float kQuantizedTolerance = 2.0 / 255; std::vector data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; - ReduceOpModel m(BuiltinOperator_MEAN, - {TensorType_UINT8, {1, 1, 3, 2}, -1.0, 1.0}, - {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {2}, false); - m.QuantizeAndPopulate(m.Input(), data); + ReduceOpModel m(BuiltinOperator_MEAN, {Tensor_Type, {1, 1, 3, 2}, -1.0, 1.0}, + {Tensor_Type, {2}, -1.0, 1.0}, {1}, {2}, false); + m.QuantizeAndPopulate(m.Input(), data); + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); m.ApplyDelegateAndInvoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 2})); EXPECT_THAT( - m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear({0.4, 0.4}, kQuantizedTolerance))); + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, kQuantizedTolerance))); } -TEST(ReduceOpModel, DISABLED_MeanKeepDims) { +TEST(ReduceOpModel, MeanNotKeepDims_Uint8) { + TestMeanImpl(); +} + +TEST(ReduceOpModel, MeanNotKeepDims_Int8) { + TestMeanImpl(); +} + +template +void TestMeanKeppDimsImpl() { float kQuantizedTolerance = 2.0 / 255; std::vector data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; - ReduceOpModel m(BuiltinOperator_MEAN, - {TensorType_UINT8, {1, 1, 3, 2}, -1.0, 1.0}, - {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {3}, true); - m.QuantizeAndPopulate(m.Input(), data); + ReduceOpModel m(BuiltinOperator_MEAN, {Tensor_Type, {1, 1, 3, 2}, -1.0, 1.0}, + {Tensor_Type, {3}, -1.0, 1.0}, {1}, {3}, true); + m.QuantizeAndPopulate(m.Input(), data); + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); m.ApplyDelegateAndInvoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 3, 1})); EXPECT_THAT( - m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance))); + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, kQuantizedTolerance))); +} + +TEST(ReduceOpModel, 
MeanKeepDims_Int8) { + TestMeanKeppDimsImpl(); +} + +TEST(ReduceOpModel, MeanKeepDims_Uint8) { + TestMeanKeppDimsImpl(); } TEST(ReduceOpModel, DISABLED_SumNotKeepDims) { diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index d9d14804b49..1df0a6df66c 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -80,6 +80,7 @@ bool CheckOpVersion(const TfLiteRegistration* registration) { case kTfLiteBuiltinL2Normalization: case kTfLiteBuiltinLogistic: case kTfLiteBuiltinMaxPool2d: + case kTfLiteBuiltinMean: case kTfLiteBuiltinMirrorPad: case kTfLiteBuiltinMul: case kTfLiteBuiltinPad: @@ -154,11 +155,15 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return IsActivationReluOrNone(sub_params->activation); } case kTfLiteBuiltinSum: - case kTfLiteBuiltinMean: { // TODO(b/139277813): Enable these when they pass unit tests. These seem // to recompute the output min/max instead of taking them as inputs, which // causes an unexpected shift in dequantized values. return false; + case kTfLiteBuiltinMean: { + return InputsWithCorrectTypes( + node, context, + {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteInt32}}) && + IsConstantTensor(GetInput(context, node, 1)); } case kTfLiteBuiltinMirrorPad: { if (!InputsWithCorrectTypes( From bf0c10e3c4a23b49108c08e8fe32bbb71070e69f Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Tue, 12 May 2020 16:36:53 -0700 Subject: [PATCH 0455/1533] Refactor Softmax and use new memory API. PiperOrigin-RevId: 311226484 Change-Id: Id044b77f385d6606d272c263d46aae76466e9987 --- .../micro/kernels/xtensa_hifimini/softmax.cc | 65 ++++++++----------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc index c77e9d1173c..c95fd0e40a4 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc @@ -34,9 +34,6 @@ namespace { // registrations for selective types (e.g. compile without float support), this // can be removed. Otherwise, any HiFi specific optimizations should land here. -// This size will work for both the hotword (1) and ambient music (0): -static SoftmaxParams kStaticOpData; - TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, const TfLiteTensor* input, TfLiteTensor* output, @@ -47,11 +44,13 @@ TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } else { if (output->type == kTfLiteInt16) { - TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, + std::numeric_limits::min()); // NOTE: Current int16 softmax output does not require symmetric scaling // - so no need to verify scale here. 
} else { - TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, + std::numeric_limits::min()); TF_LITE_ENSURE(context, output->params.scale == 1.f / 256); } } @@ -71,29 +70,18 @@ TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, return kTfLiteOk; } -TfLiteStatus SoftmaxQuantized(TfLiteContext* context, const TfLiteTensor* input, - TfLiteTensor* output, - const SoftmaxParams& op_params) { - switch (output->type) { - case kTfLiteInt16: - tflite::reference_ops::Softmax( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; - case kTfLiteInt8: - tflite::reference_ops::Softmax( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; - default: - TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(output->type), output->type); - return kTfLiteError; - } -} - } // namespace +void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + if (context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams), + &data) == kTfLiteError) { + return nullptr; + } + return data; +} + TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { auto* params = static_cast(node->builtin_data); @@ -103,10 +91,8 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, 0); TF_LITE_ENSURE(context, NumDimensions(input) >= 1); - // TODO(b/132070898): Use statically slotted SoftmaxParams structures until a - // scratch memory API is ready. - SoftmaxParams* op_params = &kStaticOpData; - node->user_data = op_params; + TFLITE_DCHECK(node->user_data != nullptr); + SoftmaxParams* op_params = static_cast(node->user_data); TF_LITE_ENSURE_STATUS( CalculateSoftmaxOpData(context, input, output, params, op_params)); @@ -120,19 +106,22 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - switch (input->type) { - case kTfLiteInt8: - return SoftmaxQuantized(context, input, output, *op_params); - default: - TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(input->type), input->type); - return kTfLiteError; + if (input->type == kTfLiteInt8 && output->type == kTfLiteInt16) { + // TODO(b/155656675): Const ref params can be slow on xtensa. 
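+    // Int8 input -> int16 output reference softmax, using the params
+    // computed once in SoftmaxPrepare.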
+ tflite::reference_ops::Softmax( + *op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; + } else { + TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; } } } // namespace activations TfLiteRegistration* Register_SOFTMAX() { - static TfLiteRegistration r = {/*init=*/nullptr, + static TfLiteRegistration r = {/*init=*/activations::SoftmaxInit, /*free=*/nullptr, /*prepare=*/activations::SoftmaxPrepare, /*invoke=*/activations::SoftmaxEval, From 194efd1d28235fa15d26ce395f3ef72919183f0b Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Tue, 12 May 2020 16:40:59 -0700 Subject: [PATCH 0456/1533] Disable collective ops xla test on gpu PiperOrigin-RevId: 311227213 Change-Id: Ib9b84515e22e86561ae63c4d94ed49d3e4573c7a --- tensorflow/compiler/xla/tests/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 1ad1f8363cf..c8a242c156a 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -1909,7 +1909,7 @@ xla_test( # This test is tagged "manual" because it requires multiple GPUs, and # Forge only supports single-GPU tests. Guitar skips "manual" tests # unless they're also tagged "guitar". - # "guitar", # Re-enable after b/156405690 is fixed. + "guitar", "manual", "multi_gpu", "no_oss", From 2d452266176737db8c4dedb7a9e6521c2beb1d49 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 12 May 2020 16:52:16 -0700 Subject: [PATCH 0457/1533] Move tf.keras.layers.featureDenseFeature back to Keras package. PiperOrigin-RevId: 311229082 Change-Id: I3317086f3b6c53da0f6d0cc4f5558afcd74b264b --- tensorflow/python/feature_column/BUILD | 20 - .../feature_column/feature_column_lib.py | 8 +- .../feature_column/feature_column_v2_test.py | 291 ------------ .../feature_column/keras_integration_test.py | 2 +- .../sequence_feature_column_test.py | 49 --- .../feature_column/serialization_test.py | 66 --- tensorflow/python/keras/feature_column/BUILD | 78 ++++ .../python/keras/feature_column/__init__.py | 0 .../feature_column/dense_features.py | 5 - .../feature_column/dense_features_test.py | 416 +++++++++++++++++- .../feature_column/dense_features_v2.py | 7 +- .../feature_column/dense_features_v2_test.py | 2 +- ...equence_feature_column_integration_test.py | 2 +- .../python/keras/layers/serialization.py | 18 +- .../saving/saved_model/saved_model_test.py | 2 +- ...sorflow.keras.layers.-dense-features.pbtxt | 2 +- ...sorflow.keras.layers.-dense-features.pbtxt | 4 +- 17 files changed, 509 insertions(+), 463 deletions(-) create mode 100644 tensorflow/python/keras/feature_column/__init__.py rename tensorflow/python/{ => keras}/feature_column/dense_features.py (97%) rename tensorflow/python/{ => keras}/feature_column/dense_features_test.py (62%) rename tensorflow/python/{ => keras}/feature_column/dense_features_v2.py (94%) rename tensorflow/python/{ => keras}/feature_column/dense_features_v2_test.py (99%) diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD index d67cdf9cc06..786c26c009a 100644 --- a/tensorflow/python/feature_column/BUILD +++ b/tensorflow/python/feature_column/BUILD @@ -55,8 +55,6 @@ py_library( py_library( name = "feature_column_v2", srcs = [ - "dense_features.py", - "dense_features_v2.py", "feature_column_v2.py", "sequence_feature_column.py", "serialization.py", @@ -126,15 +124,6 @@ tf_py_test( 
], ) -tf_py_test( - name = "dense_features_test", - srcs = ["dense_features_test.py"], - tags = ["no_pip"], - deps = [ - ":feature_column_test_main_lib", - ], -) - py_library( name = "feature_column_test_main_lib", srcs = ["feature_column_test.py"], @@ -177,15 +166,6 @@ tf_py_test( deps = [":feature_column_v2_test_main_lib"], ) -tf_py_test( - name = "dense_features_v2_test", - srcs = ["dense_features_v2_test.py"], - tags = ["no_pip"], - deps = [ - ":feature_column_v2_test_main_lib", - ], -) - py_library( name = "feature_column_v2_test_main_lib", srcs = ["feature_column_v2_test.py"], diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py index afe14f55bfc..bda20ff3f2c 100644 --- a/tensorflow/python/feature_column/feature_column_lib.py +++ b/tensorflow/python/feature_column/feature_column_lib.py @@ -19,13 +19,13 @@ from __future__ import division from __future__ import print_function # pylint: disable=unused-import,line-too-long,wildcard-import,g-bad-import-order -# We import dense_features_v2 first so that the V1 DenseFeatures is the default -# if users directly import feature_column_lib. -from tensorflow.python.feature_column.dense_features_v2 import * -from tensorflow.python.feature_column.dense_features import * from tensorflow.python.feature_column.feature_column import * from tensorflow.python.feature_column.feature_column_v2 import * from tensorflow.python.feature_column.sequence_feature_column import * from tensorflow.python.feature_column.serialization import * +# We import dense_features_v2 first so that the V1 DenseFeatures is the default +# if users directly import feature_column_lib. +from tensorflow.python.keras.feature_column.dense_features_v2 import * +from tensorflow.python.keras.feature_column.dense_features import * from tensorflow.python.keras.feature_column.sequence_feature_column import * # pylint: enable=unused-import,line-too-long diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index fe769850fb0..a13f38a5203 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -31,7 +31,6 @@ from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.feature_column import dense_features as df from tensorflow.python.feature_column import feature_column as fc_old from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization @@ -5582,23 +5581,6 @@ class IndicatorColumnTest(test.TestCase): self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]])) self.assertAllClose([[2. 
+ 3.]], self.evaluate(predictions)) - @test_util.run_deprecated_v1 - def test_dense_features(self): - animal = fc.indicator_column( - fc.categorical_column_with_identity('animal', num_buckets=4)) - with ops.Graph().as_default(): - features = { - 'animal': - sparse_tensor.SparseTensor( - indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) - } - net = df.DenseFeatures([animal])(features) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) - @test_util.run_deprecated_v1 def test_input_layer(self): animal = fc.indicator_column( @@ -6271,156 +6253,6 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), self.evaluate(predictions)) - @parameterized.named_parameters( - { - 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True - }, { - 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False - }) - @test_util.run_deprecated_v1 - def test_dense_features(self, use_safe_embedding_lookup): - # Inputs. - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) - - # Provide sparse input and get dense result. - l = df.DenseFeatures((embedding_column,)) - dense_features = l({'aaa': sparse_input}) - - # Assert expected embedding variable and lookups. 
- global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, variables_lib.Variable) - trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - if use_safe_embedding_lookup: - self.assertIn('SparseFillEmptyRows', - [x.type for x in ops.get_default_graph().get_operations()]) - else: - self.assertNotIn( - 'SparseFillEmptyRows', - [x.type for x in ops.get_default_graph().get_operations()]) - - @test_util.run_deprecated_v1 - def test_dense_features_not_trainable(self): - # Inputs. - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - trainable=False) - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures((embedding_column,))({ - 'aaa': sparse_input - }) - - # Assert expected embedding variable and lookups. - global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - self.assertItemsEqual([], - ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - @test_util.run_deprecated_v1 def test_input_layer(self): # Inputs. @@ -7326,129 +7158,6 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): # = [3*1 + 5*2, 3*0 +5*0] = [13, 0] self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions)) - def _test_dense_features(self, trainable=True): - # Inputs. 
- vocabulary_size = 3 - sparse_input_a = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 4)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_b = sparse_tensor.SparseTensorValue( - # example 0, ids [0] - # example 1, ids [] - indices=((0, 0),), - values=(0,), - dense_shape=(2, 5)) - sparse_input_c = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 1), (1, 1), (1, 3)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_d = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [] - indices=((0, 1),), - values=(2,), - dense_shape=(2, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0: - # A ids [2], embedding = [7, 11] - # B ids [0], embedding = [1, 2] - # C ids [2], embedding = [7, 11] - # D ids [2], embedding = [7, 11] - (7., 11., 1., 2., 7., 11., 7., 11.), - # example 1: - # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # B ids [], embedding = [0, 0] - # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # D ids [], embedding = [0, 0] - (2., 3.5, 0., 0., 2., 3.5, 0., 0.), - ) - - # Build columns. - categorical_column_a = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc.categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - categorical_column_c = fc.categorical_column_with_identity( - key='ccc', num_buckets=vocabulary_size) - categorical_column_d = fc.categorical_column_with_identity( - key='ddd', num_buckets=vocabulary_size) - - embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2( - [categorical_column_c, categorical_column_d], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - - features = { - 'aaa': sparse_input_a, - 'bbb': sparse_input_b, - 'ccc': sparse_input_c, - 'ddd': sparse_input_d - } - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures( - feature_columns=(embedding_column_b, embedding_column_a, - embedding_column_c, embedding_column_d))( - features) - - # Assert expected embedding variable and lookups. 
- global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, variables_lib.Variable) - trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - if trainable: - self.assertItemsEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in trainable_vars])) - else: - self.assertItemsEqual([], tuple([v.name for v in trainable_vars])) - shared_embedding_vars = global_vars - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, - self.evaluate(shared_embedding_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - @test_util.run_deprecated_v1 - def test_dense_features(self): - self._test_dense_features() - - @test_util.run_deprecated_v1 - def test_dense_features_no_trainable(self): - self._test_dense_features(trainable=False) - @test_util.run_deprecated_v1 def test_serialization(self): diff --git a/tensorflow/python/feature_column/keras_integration_test.py b/tensorflow/python/feature_column/keras_integration_test.py index e0677e84e50..456c0204350 100644 --- a/tensorflow/python/feature_column/keras_integration_test.py +++ b/tensorflow/python/feature_column/keras_integration_test.py @@ -23,12 +23,12 @@ import numpy as np from tensorflow.python import keras from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.feature_column import dense_features_v2 from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.feature_column import feature_column_v2 from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.feature_column import dense_features_v2 from tensorflow.python.keras.optimizer_v2 import gradient_descent from tensorflow.python.keras.premade import linear from tensorflow.python.keras.premade import wide_deep diff --git a/tensorflow/python/feature_column/sequence_feature_column_test.py b/tensorflow/python/feature_column/sequence_feature_column_test.py index 3d5d24ec03a..d0cf5ee7670 100644 --- a/tensorflow/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/python/feature_column/sequence_feature_column_test.py @@ -24,7 +24,6 @@ from absl.testing import parameterized import numpy as np from tensorflow.python.client import session -from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.feature_column import serialization @@ -111,54 +110,6 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase): sfc.concatenate_context_input(context_input, seq_input) -@test_util.run_all_in_graph_and_eager_modes -class DenseFeaturesTest(test.TestCase): - """Tests DenseFeatures with sequence feature columns.""" - - def test_embedding_column(self): - """Tests that error is raised for sequence embedding column.""" - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - 
categorical_column_a = sfc.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=2) - - input_layer = dense_features.DenseFeatures([embedding_column_a]) - with self.assertRaisesRegexp( - ValueError, - r'In embedding_column: aaa_embedding\. categorical_column must not be ' - r'of type SequenceCategoricalColumn\.'): - _ = input_layer({'aaa': sparse_input}) - - def test_indicator_column(self): - """Tests that error is raised for sequence indicator column.""" - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = sfc.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc.indicator_column(categorical_column_a) - - input_layer = dense_features.DenseFeatures([indicator_column_a]) - with self.assertRaisesRegexp( - ValueError, - r'In indicator_column: aaa_indicator\. categorical_column must not be ' - r'of type SequenceCategoricalColumn\.'): - _ = input_layer({'aaa': sparse_input}) - - def _assert_sparse_tensor_value(test_case, expected, actual): _assert_sparse_tensor_indices_shape(test_case, expected, actual) diff --git a/tensorflow/python/feature_column/serialization_test.py b/tensorflow/python/feature_column/serialization_test.py index 78b72746ac9..881ca0cca5e 100644 --- a/tensorflow/python/feature_column/serialization_test.py +++ b/tensorflow/python/feature_column/serialization_test.py @@ -20,7 +20,6 @@ from __future__ import print_function from absl.testing import parameterized -from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization from tensorflow.python.framework import test_util @@ -114,71 +113,6 @@ class FeatureColumnSerializationTest(test.TestCase): self.assertIs(new_price.normalizer_fn, _custom_fn) -@test_util.run_all_in_graph_and_eager_modes -class DenseFeaturesSerializationTest(test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - ('default', None, None), - ('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_get_config(self, trainable, name): - cols = [fc.numeric_column('a'), - fc.embedding_column(fc.categorical_column_with_identity( - key='b', num_buckets=3), dimension=2)] - orig_layer = dense_features.DenseFeatures( - cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - self.assertEqual(config['name'], orig_layer.name) - self.assertEqual(config['trainable'], trainable) - self.assertLen(config['feature_columns'], 2) - self.assertEqual( - config['feature_columns'][0]['class_name'], 'NumericColumn') - self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) - self.assertEqual( - config['feature_columns'][1]['class_name'], 'EmbeddingColumn') - - @parameterized.named_parameters( - ('default', None, None), - ('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_from_config(self, trainable, name): - cols = [fc.numeric_column('a'), - fc.embedding_column(fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']), dimension=2), - fc.indicator_column(fc.categorical_column_with_hash_bucket( - key='c', hash_bucket_size=3))] - orig_layer = dense_features.DenseFeatures( - cols, 
trainable=trainable, name=name) - config = orig_layer.get_config() - - new_layer = dense_features.DenseFeatures.from_config(config) - - self.assertEqual(new_layer.name, orig_layer.name) - self.assertEqual(new_layer.trainable, trainable) - self.assertLen(new_layer._feature_columns, 3) - self.assertEqual(new_layer._feature_columns[0].name, 'a') - self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) - self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') - self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn) - - def test_crossed_column(self): - a = fc.categorical_column_with_vocabulary_list( - 'a', vocabulary_list=['1', '2', '3']) - b = fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']) - ab = fc.crossed_column([a, b], hash_bucket_size=2) - cols = [fc.indicator_column(ab)] - - orig_layer = dense_features.DenseFeatures(cols) - config = orig_layer.get_config() - - new_layer = dense_features.DenseFeatures.from_config(config) - - self.assertLen(new_layer._feature_columns, 1) - self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') - - @test_util.run_all_in_graph_and_eager_modes class LinearModelLayerSerializationTest(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/keras/feature_column/BUILD b/tensorflow/python/keras/feature_column/BUILD index 650efcceb52..94097c28d73 100644 --- a/tensorflow/python/keras/feature_column/BUILD +++ b/tensorflow/python/keras/feature_column/BUILD @@ -12,11 +12,88 @@ exports_files(["LICENSE"]) py_library( name = "feature_column", + srcs = ["__init__.py"], deps = [ + ":dense_features", + ":dense_features_v2", ":sequence_feature_column", ], ) +py_library( + name = "dense_features", + srcs = [ + "dense_features.py", + ], + deps = [ + "//tensorflow/python:framework_ops", + "//tensorflow/python:tf_export", + "//tensorflow/python:util", + "//tensorflow/python/feature_column:feature_column_v2", + "//tensorflow/python/keras:backend", + ], +) + +py_library( + name = "dense_features_v2", + srcs = [ + "dense_features_v2.py", + ], + deps = [ + ":dense_features", + "//tensorflow/python:framework_ops", + "//tensorflow/python:tf_export", + "//tensorflow/python/feature_column:feature_column_v2", + ], +) + +tf_py_test( + name = "dense_features_test", + srcs = ["dense_features_test.py"], + tags = ["no_pip"], + deps = [ + ":dense_features", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:lookup_ops", + "//tensorflow/python:partitioned_variables", + "//tensorflow/python:session", + "//tensorflow/python:sparse_tensor", + "//tensorflow/python:variables", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:context", + "//tensorflow/python/feature_column:feature_column_v2", + ], +) + +tf_py_test( + name = "dense_features_v2_test", + srcs = ["dense_features_v2_test.py"], + tags = ["no_pip"], + deps = [ + ":dense_features_v2", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:lookup_ops", + "//tensorflow/python:session", + "//tensorflow/python:sparse_tensor", + 
"//tensorflow/python:variables", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:context", + "//tensorflow/python/feature_column:feature_column_v2", + ], +) + py_library( name = "sequence_feature_column", srcs = ["sequence_feature_column.py"], @@ -59,6 +136,7 @@ py_test( srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ + ":dense_features", ":sequence_feature_column", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_test_lib", diff --git a/tensorflow/python/keras/feature_column/__init__.py b/tensorflow/python/keras/feature_column/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/python/feature_column/dense_features.py b/tensorflow/python/keras/feature_column/dense_features.py similarity index 97% rename from tensorflow/python/feature_column/dense_features.py rename to tensorflow/python/keras/feature_column/dense_features.py index 6feef185815..820f1a6b1b7 100644 --- a/tensorflow/python/feature_column/dense_features.py +++ b/tensorflow/python/keras/feature_column/dense_features.py @@ -23,7 +23,6 @@ import json from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import ops from tensorflow.python.keras import backend -from tensorflow.python.keras.layers import serialization as layer_serialization from tensorflow.python.util import serialization from tensorflow.python.util.tf_export import keras_export @@ -173,7 +172,3 @@ class DenseFeatures(fc._BaseFeaturesLayer): # pylint: disable=protected-access cols_to_output_tensors[column] = processed_tensors output_tensors.append(processed_tensors) return self._verify_and_concat_tensors(output_tensors) - - -layer_serialization.inject_feature_column_v1_objects( - 'DenseFeatures', DenseFeatures) diff --git a/tensorflow/python/feature_column/dense_features_test.py b/tensorflow/python/keras/feature_column/dense_features_test.py similarity index 62% rename from tensorflow/python/feature_column/dense_features_test.py rename to tensorflow/python/keras/feature_column/dense_features_test.py index 7cd523dcc14..ec07964bcbe 100644 --- a/tensorflow/python/feature_column/dense_features_test.py +++ b/tensorflow/python/keras/feature_column/dense_features_test.py @@ -18,19 +18,21 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.feature_column import dense_features as df from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.keras.feature_column import dense_features as df from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import partitioned_variables @@ -676,5 +678,417 @@ class DenseFeaturesTest(test.TestCase): sess.run(net, feed_dict={features['price']: np.array(1)}) +class IndicatorColumnTest(test.TestCase): + + @test_util.run_deprecated_v1 + def test_dense_features(self): + animal = 
fc.indicator_column( + fc.categorical_column_with_identity('animal', num_buckets=4)) + with ops.Graph().as_default(): + features = { + 'animal': + sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) + } + net = df.DenseFeatures([animal])(features) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) + + +class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + { + 'testcase_name': 'use_safe_embedding_lookup', + 'use_safe_embedding_lookup': True + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup', + 'use_safe_embedding_lookup': False + }) + @test_util.run_deprecated_v1 + def test_dense_features(self, use_safe_embedding_lookup): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) + + # Provide sparse input and get dense result. + l = df.DenseFeatures((embedding_column,)) + dense_features = l({'aaa': sparse_input}) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + if use_safe_embedding_lookup: + self.assertIn('SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + else: + self.assertNotIn( + 'SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + + @test_util.run_deprecated_v1 + def test_dense_features_not_trainable(self): + # Inputs. 
+ vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + trainable=False) + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures((embedding_column,))({ + 'aaa': sparse_input + }) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + +class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): + + def _test_dense_features(self, trainable=True): + # Inputs. + vocabulary_size = 3 + sparse_input_a = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 4)), + values=(2, 0, 1), + dense_shape=(2, 5)) + sparse_input_b = sparse_tensor.SparseTensorValue( + # example 0, ids [0] + # example 1, ids [] + indices=((0, 0),), + values=(0,), + dense_shape=(2, 5)) + sparse_input_c = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 1), (1, 1), (1, 3)), + values=(2, 0, 1), + dense_shape=(2, 5)) + sparse_input_d = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [] + indices=((0, 1),), + values=(2,), + dense_shape=(2, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. 
+ expected_lookups = ( + # example 0: + # A ids [2], embedding = [7, 11] + # B ids [0], embedding = [1, 2] + # C ids [2], embedding = [7, 11] + # D ids [2], embedding = [7, 11] + (7., 11., 1., 2., 7., 11., 7., 11.), + # example 1: + # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # B ids [], embedding = [0, 0] + # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # D ids [], embedding = [0, 0] + (2., 3.5, 0., 0., 2., 3.5, 0., 0.), + ) + + # Build columns. + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=vocabulary_size) + categorical_column_c = fc.categorical_column_with_identity( + key='ccc', num_buckets=vocabulary_size) + categorical_column_d = fc.categorical_column_with_identity( + key='ddd', num_buckets=vocabulary_size) + + embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable) + embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2( + [categorical_column_c, categorical_column_d], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable) + + features = { + 'aaa': sparse_input_a, + 'bbb': sparse_input_b, + 'ccc': sparse_input_c, + 'ddd': sparse_input_d + } + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures( + feature_columns=(embedding_column_b, embedding_column_a, + embedding_column_c, embedding_column_d))( + features) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertCountEqual( + ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + if trainable: + self.assertCountEqual( + ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], + tuple([v.name for v in trainable_vars])) + else: + self.assertCountEqual([], tuple([v.name for v in trainable_vars])) + shared_embedding_vars = global_vars + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, + self.evaluate(shared_embedding_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + @test_util.run_deprecated_v1 + def test_dense_features(self): + self._test_dense_features() + + @test_util.run_deprecated_v1 + def test_dense_features_no_trainable(self): + self._test_dense_features(trainable=False) + + +@test_util.run_all_in_graph_and_eager_modes +class DenseFeaturesSerializationTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + ('default', None, None), + ('trainable', True, 'trainable'), + ('not_trainable', False, 'frozen')) + def test_get_config(self, trainable, name): + cols = [fc.numeric_column('a'), + fc.embedding_column(fc.categorical_column_with_identity( + key='b', num_buckets=3), dimension=2)] + orig_layer = df.DenseFeatures( + cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + self.assertEqual(config['name'], orig_layer.name) + self.assertEqual(config['trainable'], trainable) + self.assertLen(config['feature_columns'], 2) + self.assertEqual( + 
config['feature_columns'][0]['class_name'], 'NumericColumn') + self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) + self.assertEqual( + config['feature_columns'][1]['class_name'], 'EmbeddingColumn') + + @parameterized.named_parameters( + ('default', None, None), + ('trainable', True, 'trainable'), + ('not_trainable', False, 'frozen')) + def test_from_config(self, trainable, name): + cols = [fc.numeric_column('a'), + fc.embedding_column(fc.categorical_column_with_vocabulary_list( + 'b', vocabulary_list=['1', '2', '3']), dimension=2), + fc.indicator_column(fc.categorical_column_with_hash_bucket( + key='c', hash_bucket_size=3))] + orig_layer = df.DenseFeatures( + cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + new_layer = df.DenseFeatures.from_config(config) + + self.assertEqual(new_layer.name, orig_layer.name) + self.assertEqual(new_layer.trainable, trainable) + self.assertLen(new_layer._feature_columns, 3) + self.assertEqual(new_layer._feature_columns[0].name, 'a') + self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) + self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') + self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn) + + def test_crossed_column(self): + a = fc.categorical_column_with_vocabulary_list( + 'a', vocabulary_list=['1', '2', '3']) + b = fc.categorical_column_with_vocabulary_list( + 'b', vocabulary_list=['1', '2', '3']) + ab = fc.crossed_column([a, b], hash_bucket_size=2) + cols = [fc.indicator_column(ab)] + + orig_layer = df.DenseFeatures(cols) + config = orig_layer.get_config() + + new_layer = df.DenseFeatures.from_config(config) + + self.assertLen(new_layer._feature_columns, 1) + self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') + + +@test_util.run_all_in_graph_and_eager_modes +class SequenceFeatureColumnsTest(test.TestCase): + """Tests DenseFeatures with sequence feature columns.""" + + def test_embedding_column(self): + """Tests that error is raised for sequence embedding column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column_a = fc.embedding_column( + categorical_column_a, dimension=2) + + input_layer = df.DenseFeatures([embedding_column_a]) + with self.assertRaisesRegexp( + ValueError, + r'In embedding_column: aaa_embedding\. categorical_column must not be ' + r'of type SequenceCategoricalColumn\.'): + _ = input_layer({'aaa': sparse_input}) + + def test_indicator_column(self): + """Tests that error is raised for sequence indicator column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + indicator_column_a = fc.indicator_column(categorical_column_a) + + input_layer = df.DenseFeatures([indicator_column_a]) + with self.assertRaisesRegexp( + ValueError, + r'In indicator_column: aaa_indicator\. 
categorical_column must not be ' + r'of type SequenceCategoricalColumn\.'): + _ = input_layer({'aaa': sparse_input}) + + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/feature_column/dense_features_v2.py b/tensorflow/python/keras/feature_column/dense_features_v2.py similarity index 94% rename from tensorflow/python/feature_column/dense_features_v2.py rename to tensorflow/python/keras/feature_column/dense_features_v2.py index 405c5d63249..e4dc22f1bbe 100644 --- a/tensorflow/python/feature_column/dense_features_v2.py +++ b/tensorflow/python/keras/feature_column/dense_features_v2.py @@ -18,10 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import ops -from tensorflow.python.keras.layers import serialization as layer_serialization +from tensorflow.python.keras.feature_column import dense_features from tensorflow.python.util.tf_export import keras_export @@ -94,7 +93,3 @@ class DenseFeatures(dense_features.DenseFeatures): # We would like to call Layer.build and not _DenseFeaturesHelper.build. # pylint: disable=protected-access super(fc._BaseFeaturesLayer, self).build(None) # pylint: disable=bad-super-call - - -layer_serialization.inject_feature_column_v2_objects( - 'DenseFeatures', DenseFeatures) diff --git a/tensorflow/python/feature_column/dense_features_v2_test.py b/tensorflow/python/keras/feature_column/dense_features_v2_test.py similarity index 99% rename from tensorflow/python/feature_column/dense_features_v2_test.py rename to tensorflow/python/keras/feature_column/dense_features_v2_test.py index 71cb163a7d9..95fc8b7ac1e 100644 --- a/tensorflow/python/feature_column/dense_features_v2_test.py +++ b/tensorflow/python/keras/feature_column/dense_features_v2_test.py @@ -23,7 +23,6 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.feature_column import dense_features_v2 as df from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -31,6 +30,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.keras.feature_column import dense_features_v2 as df from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables as variables_lib diff --git a/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py b/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py index 8784182e23b..b1100bf7b07 100644 --- a/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py +++ b/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py @@ -24,11 +24,11 @@ from google.protobuf import text_format from tensorflow.core.example import example_pb2 from tensorflow.core.example import feature_pb2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from 
tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.keras.feature_column import dense_features from tensorflow.python.keras.feature_column import sequence_feature_column as ksfc from tensorflow.python.keras.layers import recurrent from tensorflow.python.ops import init_ops_v2 diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 67aaf1d6eb8..fc7feda07a5 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -64,23 +64,11 @@ ALL_V2_MODULES = ( recurrent_v2, preprocessing_normalization ) -FEATURE_COLUMN_V1_OBJECTS = {} -FEATURE_COLUMN_V2_OBJECTS = {} # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. LOCAL = threading.local() -def inject_feature_column_v1_objects(name, cls): - global FEATURE_COLUMN_V1_OBJECTS - FEATURE_COLUMN_V1_OBJECTS[name] = cls - - -def inject_feature_column_v2_objects(name, cls): - global FEATURE_COLUMN_V2_OBJECTS - FEATURE_COLUMN_V2_OBJECTS[name] = cls - - def populate_deserializable_objects(): """Populates dict ALL_OBJECTS with every built-in layer. """ @@ -134,9 +122,11 @@ def populate_deserializable_objects(): LOCAL.ALL_OBJECTS['WideDeepModel'] = WideDeepModel if tf2.enabled(): - LOCAL.ALL_OBJECTS.update(FEATURE_COLUMN_V2_OBJECTS) + from tensorflow.python.keras.feature_column.dense_features_v2 import DenseFeatures # pylint: disable=g-import-not-at-top + LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures else: - LOCAL.ALL_OBJECTS.update(FEATURE_COLUMN_V1_OBJECTS) + from tensorflow.python.keras.feature_column.dense_features import DenseFeatures # pylint: disable=g-import-not-at-top + LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures # Merge layers, function versions. 
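Note: the serialization.py hunk above drops the injection hooks and resolves `DenseFeatures` directly from the Keras feature-column package, picking the V1 or V2 class based on `tf2.enabled()`. A minimal round trip that exercises this lookup through the public serialize/deserialize helpers (illustrative sketch only, not part of the patch; assumes a simple numeric feature column):

```
import tensorflow as tf

feature_columns = [tf.feature_column.numeric_column('price')]
layer = tf.keras.layers.DenseFeatures(feature_columns)

# serialize() emits {'class_name': 'DenseFeatures', 'config': ...};
# deserialize() must find 'DenseFeatures' in the objects table populated above.
config = tf.keras.layers.serialize(layer)
restored = tf.keras.layers.deserialize(config)
assert isinstance(restored, tf.keras.layers.DenseFeatures)
```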
LOCAL.ALL_OBJECTS['add'] = merge.add diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 9cbe8607a54..5e9ccc2d37a 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -39,7 +39,6 @@ from tensorflow.python.distribute import mirrored_strategy from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.feature_column import feature_column_v2 as fc -from tensorflow.python.feature_column.dense_features import DenseFeatures from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -48,6 +47,7 @@ from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import regularizers from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.feature_column.dense_features import DenseFeatures from tensorflow.python.keras.saving.saved_model import load as keras_load from tensorflow.python.keras.saving.saved_model import save_impl as keras_save from tensorflow.python.keras.utils import generic_utils diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt index ecda1603325..ba9156d7f95 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.layers.DenseFeatures" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt index f7137f0d09b..130a9954202 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.layers.DenseFeatures" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" From ce5488f85ff359f707bb0243ca3a7f2f48cba01f Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Tue, 12 May 2020 16:54:30 -0700 Subject: [PATCH 0458/1533] Disable memory_optimizer_test on windows PiperOrigin-RevId: 311229436 Change-Id: Iafaebef3e4574e87b9442c541d5d3de1432426aa --- tensorflow/core/grappler/optimizers/BUILD | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 0b8846faf05..b880055b47d 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -531,7 +531,10 @@ cc_library( tf_cuda_cc_test( name = "memory_optimizer_test", srcs = ["memory_optimizer_test.cc"], - tags = ["no_cuda_on_cpu_tap"], # Do not re-enable again without actually testing. + tags = [ + "no_cuda_on_cpu_tap", # Do not re-enable again without actually testing. 
+ "no_windows", # b/56402646 + ], deps = [ ":gpu_swapping_kernels", ":gpu_swapping_ops", From bb15c97379f197a6a46ec1446d8fb0b292b860ba Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 12 May 2020 17:16:41 -0700 Subject: [PATCH 0459/1533] Restructure the Keras class hierarchy for Network, Model and Sequential. The intention of this change is to reduce the code complexity within Keras class, especially for Network, which currently contains logic for both subclass Model and functional Model. After this change, the subclass model and functional model become individual class and become self contained. 1. Model is now the base class for subclass model. It doesn't contains network structure management, and the topology will be created within __init__ and __call__, which is for user to implement. It also contains compile/fit/eval/predict, which is the basic functionality for model training. 2. Functional is created based on existing Network class. It extends the Model, which allows it leverage compile/fit/eval/predict. In addition, it also take input/output as init parameter and manage the network topology. 3. Sequential model is now a subclass of Functional, since it will use Functional's method to manage it topology (layer stacking). Model(input, output) will create a Functional under the hood, and behave the same way as before. PiperOrigin-RevId: 311232972 Change-Id: I6dd32e089cd294d35d5a1f3684e1a1ae1a0ab320 --- tensorflow/python/keras/engine/BUILD | 6 +- tensorflow/python/keras/engine/base_layer.py | 14 +- .../python/keras/engine/base_layer_v1.py | 6 +- .../engine/{network.py => functional.py} | 905 +----------------- .../{network_test.py => functional_test.py} | 67 +- tensorflow/python/keras/engine/sequential.py | 31 +- tensorflow/python/keras/engine/training.py | 881 +++++++++++++++-- tensorflow/python/keras/engine/training_v1.py | 19 +- .../python/keras/layers/serialization.py | 2 +- .../python/keras/layers/wrappers_test.py | 3 +- tensorflow/python/keras/models.py | 19 +- .../python/keras/saving/hdf5_format_test.py | 13 +- .../python/keras/saving/saved_model/load.py | 25 +- .../saving/saved_model/model_serialization.py | 8 +- .../saved_model/network_serialization.py | 15 +- .../python/keras/utils/version_utils_test.py | 4 +- tensorflow/python/keras/utils/vis_utils.py | 25 +- .../golden/v1/tensorflow.keras.-model.pbtxt | 3 +- .../v1/tensorflow.keras.-sequential.pbtxt | 2 +- ...low.keras.experimental.-linear-model.pbtxt | 3 +- ....keras.experimental.-wide-deep-model.pbtxt | 3 +- .../v1/tensorflow.keras.models.-model.pbtxt | 3 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- .../golden/v2/tensorflow.keras.-model.pbtxt | 3 +- .../v2/tensorflow.keras.-sequential.pbtxt | 2 +- ...low.keras.experimental.-linear-model.pbtxt | 3 +- ....keras.experimental.-wide-deep-model.pbtxt | 3 +- .../v2/tensorflow.keras.models.-model.pbtxt | 3 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- 29 files changed, 1023 insertions(+), 1052 deletions(-) rename tensorflow/python/keras/engine/{network.py => functional.py} (58%) rename tensorflow/python/keras/engine/{network_test.py => functional_test.py} (97%) diff --git a/tensorflow/python/keras/engine/BUILD b/tensorflow/python/keras/engine/BUILD index 203e481170f..1ff15d7e2e1 100644 --- a/tensorflow/python/keras/engine/BUILD +++ b/tensorflow/python/keras/engine/BUILD @@ -21,8 +21,8 @@ py_library( srcs = [ "__init__.py", "compile_utils.py", + "functional.py", "input_layer.py", - "network.py", "node.py", "partial_batch_padding_handler.py", "saving.py", @@ 
-460,9 +460,9 @@ tf_py_test( ) tf_py_test( - name = "network_test", + name = "functional_test", size = "medium", - srcs = ["network_test.py"], + srcs = ["functional_test.py"], python_version = "PY3", shard_count = 8, tags = [ diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 210f56ae87a..f6fa17df5c2 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -1006,13 +1006,23 @@ class Layer(module.Module, version_utils.LayerVersionSelector): """Whether the layer is dynamic (eager-only); set in the constructor.""" # NOTE(taylorrobie): Currently self._dynamic is read-only. If that changes # then this cache logic must be updated. - return self._dynamic + return self._dynamic or any(layer.dynamic + for layer in self._unique_sublayers()) + + def _unique_sublayers(self): + # Model.layers will use this as implementation, but we can't expose this + # one as the public property since it might conflict with subclass layers + # which also have user defined layers property. + self._maybe_create_attribute('_layers', []) + return list( + trackable_layer_utils.filter_empty_layer_containers(self._layers)) @property @doc_controls.do_not_doc_inheritable @trackable_layer_utils.cache_recursive_attribute('stateful') def stateful(self): - return self._stateful + return self._stateful or any( + getattr(layer, 'stateful', False) for layer in self._unique_sublayers()) @stateful.setter @trackable_layer_utils.invalidate_recursive_cache('stateful') diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py index 626892752c8..24d12ae4d59 100644 --- a/tensorflow/python/keras/engine/base_layer_v1.py +++ b/tensorflow/python/keras/engine/base_layer_v1.py @@ -833,13 +833,15 @@ class Layer(base_layer.Layer): def dynamic(self): # NOTE(taylorrobie): Currently self._dynamic is read-only. If that changes # then this cache logic must be updated. 
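Note: the base_layer.py hunks above make `dynamic` and `stateful` recurse into tracked sublayers via `_unique_sublayers()`, behavior that previously only the `Network` class provided. A sketch of the effect on a plain container layer (illustrative; `Wrapper` is a hypothetical user class, not part of the patch):

```
import tensorflow as tf

class Wrapper(tf.keras.layers.Layer):
  """A layer that merely wraps a stateful sublayer."""

  def __init__(self):
    super(Wrapper, self).__init__()
    self.rnn = tf.keras.layers.LSTM(4, stateful=True)

  def call(self, inputs):
    return self.rnn(inputs)

layer = Wrapper()
# With the recursive property, the wrapper reports the sublayer's
# statefulness without overriding `stateful` itself.
print(layer.stateful)  # True
```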
- return self._dynamic + return self._dynamic or any(layer.dynamic + for layer in self._unique_sublayers()) @property @doc_controls.do_not_generate_docs @trackable_layer_utils.cache_recursive_attribute('stateful') def stateful(self): - return self._stateful + return self._stateful or any( + getattr(layer, 'stateful', False) for layer in self._unique_sublayers()) @stateful.setter @trackable_layer_utils.invalidate_recursive_cache('stateful') diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/functional.py similarity index 58% rename from tensorflow/python/keras/engine/network.py rename to tensorflow/python/keras/engine/functional.py index 87d1953ace5..80eb6cb27d5 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/functional.py @@ -22,84 +22,46 @@ from __future__ import print_function import collections import copy import itertools -import json -import os -import six from six.moves import zip # pylint: disable=redefined-builtin -from tensorflow.python.eager import context from tensorflow.python.framework import composite_tensor -from tensorflow.python.framework import errors -from tensorflow.python.framework import errors_impl -from tensorflow.python.framework import func_graph from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.keras import backend from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import base_layer_utils -from tensorflow.python.keras.engine import compile_utils from tensorflow.python.keras.engine import input_layer as input_layer_module +from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.engine import training_utils -from tensorflow.python.keras.saving import hdf5_format -from tensorflow.python.keras.saving import save from tensorflow.python.keras.saving.saved_model import network_serialization from tensorflow.python.keras.utils import generic_utils -from tensorflow.python.keras.utils import layer_utils from tensorflow.python.keras.utils import tf_utils -from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite -from tensorflow.python.keras.utils.io_utils import path_to_string from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training import checkpoint_management -from tensorflow.python.training import py_checkpoint_reader from tensorflow.python.training.tracking import base as trackable -from tensorflow.python.training.tracking import data_structures -from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils -from tensorflow.python.training.tracking import tracking -from tensorflow.python.training.tracking import util as trackable_utils -from tensorflow.python.util import deprecation from tensorflow.python.util import nest -from tensorflow.python.util import serialization from tensorflow.python.util import tf_inspect -from tensorflow.tools.docs import doc_controls -# pylint: disable=g-import-not-at-top -try: - import h5py -except ImportError: - h5py = None +# pylint: disable=g-classes-have-attributes +class Functional(training_lib.Model): + """A `Functional` model is a `Model` defined as a directed graph of layers. 
-try: - import yaml -except ImportError: - yaml = None -# pylint: enable=g-import-not-at-top - - -class Network(base_layer.Layer): - """A `Network` is a composition of layers. - - `Network` is the topological form of a "model". A `Model` - is simply a `Network` with added training routines. - - Two types of `Networks` exist: Graph Networks and Subclass Networks. Graph - networks are used in the Keras Functional and Sequential APIs. Subclassed - networks are used when a user subclasses the `Model` class. In general, - more Keras features are supported with Graph Networks than with Subclassed - Networks, specifically: + Three types of `Model` exist: subclassed `Model`, `Functional` model, + and `Sequential` (a special case of `Functional`). + In general, more Keras features are supported with `Functional` + than with subclassed `Model`s, specifically: - Model cloning (`keras.models.clone`) - Serialization (`model.get_config()/from_config`, `model.to_json()/to_yaml()` - Whole-model saving (`model.save()`) - A Graph Network can be instantiated by passing two arguments to `__init__`. - The first argument is the `keras.Input` Tensors that represent the inputs - to the Network. The second argument specifies the output Tensors that - represent the outputs of this Network. Both arguments can be a nested - structure of Tensors. + A `Functional` model can be instantiated by passing two arguments to + `__init__`. The first argument is the `keras.Input` Tensors that represent + the inputs to the model. The second argument specifies the output + tensors that represent the outputs of this model. Both arguments can be a + nested structure of tensors. Example: @@ -107,10 +69,10 @@ class Network(base_layer.Layer): inputs = {'x1': keras.Input(shape=(10,)), 'x2': keras.Input(shape=(1,))} t = keras.layers.Dense(1, activation='relu')(inputs['x1']) outputs = keras.layers.Add()([t, inputs['x2']) - network = Network(inputs, outputs) + model = keras.Model(inputs, outputs) ``` - A Graph Network constructed using the Functional API can also include raw + A `Functional` model constructed using the Functional API can also include raw TensorFlow functions, with the exception of functions that create Variables or assign ops. @@ -120,38 +82,14 @@ class Network(base_layer.Layer): inputs = keras.Input(shape=(10,)) x = keras.layers.Dense(1)(inputs) outputs = tf.nn.relu(x) - network = Network(inputs, outputs) + model = keras.Model(inputs, outputs) ``` - Subclassed Networks can be instantiated via `name` and (optional) `dynamic` - keyword arguments. Subclassed Networks keep track of their Layers, and their - `call` method can be overridden. Subclassed Networks are typically created - indirectly, by subclassing the `Model` class. - - Example: - - ``` - class MyModel(keras.Model): - def __init__(self): - super(MyModel, self).__init__(name='my_model', dynamic=False) - - self.layer1 = keras.layers.Dense(10, activation='relu') - - def call(self, inputs): - return self.layer1(inputs) - ``` - - Allowed args in `super().__init__`: - name: String name of the model. - dynamic: (Subclassed models only) Set this to `True` if your model should - only be run eagerly, and should not be used to generate a static - computation graph. This attribute is automatically set for Functional API - models. + Arguments: + inputs: List of input tensors (must be created via `tf.keras.Input()`). + outputs: List of outputs tensors. + name: String, optional. Name of the model. trainable: Boolean, whether the model's variables should be trainable. 
- dtype: (Subclassed models only) Default dtype of the model's weights ( - default of `None` means use the type of the first input). This attribute - has no effect on Functional API models, which do not have weights of their - own. """ # See tf.Module for the usage of this property. @@ -160,79 +98,31 @@ class Network(base_layer.Layer): _TF_MODULE_IGNORED_PROPERTIES = frozenset(itertools.chain( ('_layer_call_argspecs', '_compiled_trainable_state', '_output_mask_cache', '_output_tensor_cache', '_output_shape_cache'), - base_layer.Layer._TF_MODULE_IGNORED_PROPERTIES + training_lib.Model._TF_MODULE_IGNORED_PROPERTIES )) - def __init__(self, *args, **kwargs): # pylint: disable=super-init-not-called - # Signature detection - if (len(args) == 2 or - len(args) == 1 and 'outputs' in kwargs or - 'inputs' in kwargs and 'outputs' in kwargs): - # Graph network - self._init_graph_network(*args, **kwargs) - else: - # Subclassed network - self._init_subclassed_network(**kwargs) - - tf_utils.assert_no_legacy_layers(self.layers) - - # Several Network methods have "no_automatic_dependency_tracking" - # annotations. Since Network does automatic dependency tracking on attribute - # assignment, including for common data structures such as lists, by default - # we'd have quite a few empty dependencies which users don't care about (or - # would need some way to ignore dependencies automatically, which is confusing - # when applied to user code). Some attributes, such as _layers, would cause - # structural issues (_layers being the place where Layers assigned to tracked - # attributes are stored). - # - # Aside from these aesthetic and structural issues, useless dependencies on - # empty lists shouldn't cause issues; adding or removing them will not break - # checkpoints, but may cause "all Python objects matched" assertions to fail - # (in which case less strict assertions may be substituted if necessary). @trackable.no_automatic_dependency_tracking - def _base_init(self, **kwargs): - # The following are implemented as property functions: - # self.trainable_weights - # self.non_trainable_weights - # self.input_spec - # self.losses - # self.updates - - generic_utils.validate_kwargs(kwargs, {'trainable', 'dtype', 'dynamic', - 'name', 'autocast'}) - - super(Network, self).__init__(**kwargs) - - self.input_names = None - self.output_names = None - self._saved_model_inputs_spec = None - - # This is True for Sequential networks and Functional networks. - self._compute_output_and_mask_jointly = False - - # Don't reset compilation if already done. This may occur if calling - # `__init__` (or `_init_graph_network`) on an already-compiled model - # such as a Sequential model. Sequential models may need to rebuild - # themselves after compilation. - self._maybe_create_attribute('_is_compiled', False) - self._maybe_create_attribute('optimizer', None) - - self._trackable_saver = ( - trackable_utils.saver_with_op_caching(self)) + def __init__(self, inputs=None, outputs=None, name=None, trainable=True): + # generic_utils.validate_kwargs( + # kwargs, {'name', 'trainable'}, + # 'Functional models may only specify `name` and `trainable` keyword ' + # 'arguments during initialization. 
Got an unexpected argument:') + super(Functional, self).__init__(name=name, trainable=trainable) + self._init_graph_network(inputs, outputs) @trackable.no_automatic_dependency_tracking - def _init_graph_network(self, inputs, outputs, **kwargs): - generic_utils.validate_kwargs( - kwargs, {'name', 'trainable'}, - 'Functional models may only specify `name` and `trainable` keyword ' - 'arguments during initialization. Got an unexpected argument:') + def _init_graph_network(self, inputs, outputs): + # This method is needed for Sequential to reinitialize graph network when + # layer is added or removed. + self._is_graph_network = True + # Normalize and set self.inputs, self.outputs. if isinstance(inputs, list) and len(nest.flatten(inputs)) == 1: inputs = inputs[0] if isinstance(outputs, list) and len(nest.flatten(outputs)) == 1: outputs = outputs[0] - self._nested_outputs = outputs self._nested_inputs = inputs + self._nested_outputs = outputs self.inputs = nest.flatten(inputs) self.outputs = nest.flatten(outputs) @@ -247,7 +137,6 @@ class Network(base_layer.Layer): if any(not hasattr(tensor, '_keras_history') for tensor in self.outputs): base_layer_utils.create_keras_history(self._nested_outputs) - self._base_init(**kwargs) self._validate_graph_inputs_and_outputs() # A Network does not create weights of its own, thus it is already @@ -255,7 +144,6 @@ class Network(base_layer.Layer): self.built = True self._build_input_shape = nest.map_structure(lambda x: x.shape, inputs) self._compute_output_and_mask_jointly = True - self._is_graph_network = True # `_expects_training_arg` is True since the `training` argument is always # present in the signature of the `call` method of a graph network. self._expects_training_arg = True @@ -325,6 +213,7 @@ class Network(base_layer.Layer): self._compute_tensor_usage_count() self._set_save_spec(self._nested_inputs) + tf_utils.assert_no_legacy_layers(self.layers) @property def input(self): @@ -340,9 +229,7 @@ class Network(base_layer.Layer): RuntimeError: If called in Eager mode. AttributeError: If no inbound nodes are found. """ - if self._is_graph_network: - return self._nested_inputs - return super(Network, self).input + return self._nested_inputs @property def input_shape(self): @@ -360,9 +247,7 @@ class Network(base_layer.Layer): AttributeError: if the layer has no defined input_shape. RuntimeError: if called in Eager mode. """ - if self._is_graph_network: - return nest.map_structure(backend.int_shape, self.input) - return super(Network, self).input_shape + return nest.map_structure(backend.int_shape, self.input) @property def output(self): @@ -379,9 +264,7 @@ class Network(base_layer.Layer): layers. RuntimeError: if called in Eager mode. """ - if self._is_graph_network: - return self._nested_outputs - return super(Network, self).output + return self._nested_outputs @property def output_shape(self): @@ -398,9 +281,7 @@ class Network(base_layer.Layer): AttributeError: if the layer has no defined output shape. RuntimeError: if called in Eager mode. """ - if self._is_graph_network: - return nest.map_structure(backend.int_shape, self.output) - return super(Network, self).output_shape + return nest.map_structure(backend.int_shape, self.output) def _set_output_names(self): """Assigns unique names to the Network's outputs. 
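Note: the comment in the `_init_graph_network` hunk above says the method must be callable again whenever `Sequential` adds or removes a layer. A minimal illustration of that usage pattern (sketch only, using the public API):

```
import tensorflow as tf

model = tf.keras.Sequential()
# Each add() below re-runs the graph-network initialization once an input
# shape is known, which is why the method cannot assume it runs only once.
model.add(tf.keras.layers.Dense(8, input_shape=(4,)))
model.add(tf.keras.layers.Dense(1))
print(model.built)        # True
print(len(model.layers))  # 2
```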
@@ -421,29 +302,9 @@ class Network(base_layer.Layer): uniquified.append(proposal) self.output_names = uniquified - @trackable.no_automatic_dependency_tracking - def _init_subclassed_network(self, **kwargs): - self._base_init(**kwargs) - self._is_graph_network = False - self.inputs = None - self.outputs = None - - @property - @trackable_layer_utils.cache_recursive_attribute('dynamic') - def dynamic(self): - if self._is_graph_network: - return any(layer.dynamic for layer in self.layers) - return self._dynamic or any(layer.dynamic for layer in self.layers) - @property def _layer_checkpoint_dependencies(self): """Dictionary of layer dependencies to be included in the checkpoint.""" - # Use getattr because this function can be called from __setattr__, at which - # point the _is_graph_network attribute has not been created. - if (not getattr(self, '_is_graph_network', False) and - base_layer_utils.is_subclassed(self)): - return {} # Only add layer dependencies for graph networks - weight_layer_index = 0 dependencies = collections.OrderedDict() @@ -470,14 +331,14 @@ class Network(base_layer.Layer): dependencies = [ trackable.TrackableReference(name=name, ref=layer) for name, layer in self._layer_checkpoint_dependencies.items()] - dependencies.extend(super(Network, self)._checkpoint_dependencies) + dependencies.extend(super(Functional, self)._checkpoint_dependencies) return dependencies def _lookup_dependency(self, name): layer_dependencies = self._layer_checkpoint_dependencies if name in layer_dependencies: return layer_dependencies[name] - return super(Network, self)._lookup_dependency(name) + return super(Functional, self)._lookup_dependency(name) def _handle_deferred_layer_dependencies(self, layers): """Handles layer checkpoint dependencies that are added after init.""" @@ -488,263 +349,17 @@ class Network(base_layer.Layer): self._handle_deferred_dependencies(name=layer_to_name[layer], trackable=layer) - def __setattr__(self, name, value): - if not getattr(self, '_self_setattr_tracking', True): - super(Network, self).__setattr__(name, value) - return - - if all( - isinstance(v, (base_layer.Layer, - data_structures.TrackableDataStructure)) or - trackable_layer_utils.has_weights(v) for v in nest.flatten(value)): - try: - self._is_graph_network - except AttributeError: - # six.raise_from supresses the original AttributeError from being raised - six.raise_from( - RuntimeError('It looks like you are subclassing `Model` and you ' - 'forgot to call `super(YourClass, self).__init__()`.' - ' Always start with this line.'), None) - - super(Network, self).__setattr__(name, value) - - # Keep track of metric instance created in subclassed model/layer. - # We do this so that we can maintain the correct order of metrics by adding - # the instance to the `metrics` list as soon as it is created. 
- from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top - if isinstance(value, metrics_module.Metric): - self._metrics.append(value) - @property - @trackable_layer_utils.cache_recursive_attribute('stateful') - def stateful(self): - return any(getattr(layer, 'stateful', False) for layer in self.layers) - - def reset_states(self): - for layer in self.layers: - if hasattr(layer, 'reset_states') and getattr(layer, 'stateful', False): - layer.reset_states() - - @property - @deprecation.deprecated( - date=None, - instructions='This property should not be used in TensorFlow 2.0, ' - 'as updates are applied automatically.') - @doc_controls.do_not_generate_docs - def state_updates(self): - """Deprecated, do NOT use! - - Returns the `updates` from all layers that are stateful. - - This is useful for separating training updates and - state updates, e.g. when we need to update a layer's internal state - during prediction. - - Returns: - A list of update ops. - """ - state_updates = [] - for layer in self.layers: - if getattr(layer, 'stateful', False): - if hasattr(layer, 'updates'): - state_updates += layer.updates - return state_updates - - @property - def weights(self): - """Returns the list of all layer variables/weights. - - Returns: - A list of variables. - """ - return self._dedup_weights(self._undeduplicated_weights) - - @property - def _undeduplicated_weights(self): - """Returns the undeduplicated list of all layer variables/weights.""" - self._assert_weights_created() - weights = [] - for layer in self._layers: - weights += layer.weights - weights += (self._trainable_weights + self._non_trainable_weights) - return weights - - @property - @tracking.cached_per_instance def _should_compute_mask(self): - return self._is_graph_network and super(Network, self)._should_compute_mask + return True def compute_mask(self, inputs, mask): - if not self._is_graph_network: - return None - # TODO(omalleyt): b/123540974 This function is not really safe to call # by itself because it will duplicate any updates and losses in graph # mode by `call`ing the Layers again. output_tensors = self._run_internal_graph(inputs, mask=mask) return nest.map_structure(lambda t: t._keras_mask, output_tensors) - @property - def layers(self): - return list( - trackable_layer_utils.filter_empty_layer_containers(self._layers)) - - def get_layer(self, name=None, index=None): - """Retrieves a layer based on either its name (unique) or index. - - If `name` and `index` are both provided, `index` will take precedence. - Indices are based on order of horizontal graph traversal (bottom-up). - - Arguments: - name: String, name of layer. - index: Integer, index of layer. - - Returns: - A layer instance. - - Raises: - ValueError: In case of invalid layer name or index. - """ - # TODO(fchollet): We could build a dictionary based on layer names - # since they are constant, but we have not done that yet. 
- if index is not None and name is not None: - raise ValueError('Provide only a layer name or a layer index.') - - if index is not None: - if len(self.layers) <= index: - raise ValueError('Was asked to retrieve layer at index ' + str(index) + - ' but model only has ' + str(len(self.layers)) + - ' layers.') - else: - return self.layers[index] - - if name is not None: - for layer in self.layers: - if layer.name == name: - return layer - raise ValueError('No such layer: ' + name + '.') - raise ValueError('Provide either a layer name or layer index.') - - @property - def trainable_weights(self): - self._assert_weights_created() - return self._dedup_weights( - trackable_layer_utils.gather_trainable_weights( - trainable=self.trainable, - sub_layers=self._layers, - extra_variables=self._trainable_weights)) - - @property - def non_trainable_weights(self): - self._assert_weights_created() - return self._dedup_weights( - trackable_layer_utils.gather_non_trainable_weights( - trainable=self.trainable, - sub_layers=self._layers, - extra_variables=self._non_trainable_weights + - self._trainable_weights)) - - @generic_utils.default - def build(self, input_shape): - """Builds the model based on input shapes received. - - This is to be used for subclassed models, which do not know at instantiation - time what their inputs look like. - - This method only exists for users who want to call `model.build()` in a - standalone way (as a substitute for calling the model on real data to - build it). It will never be called by the framework (and thus it will - never throw unexpected errors in an unrelated workflow). - - Args: - input_shape: Single tuple, TensorShape, or list of shapes, where shapes - are tuples, integers, or TensorShapes. - - Raises: - ValueError: - 1. In case of invalid user-provided data (not of type tuple, - list, or TensorShape). - 2. If the model requires call arguments that are agnostic - to the input shapes (positional or kwarg in call signature). - 3. If not all layers were properly built. - 4. If float type inputs are not supported within the layers. - - In each of these cases, the user should build their model by calling it - on real tensor data. - """ - if self._is_graph_network: - super(Network, self).build(input_shape) - return - - # If subclass network - if input_shape is None: - raise ValueError('Input shape must be defined when calling build on a ' - 'model subclass network.') - valid_types = (tuple, list, tensor_shape.TensorShape) - if not isinstance(input_shape, valid_types): - raise ValueError('Specified input shape is not one of the valid types. ' - 'Please specify a batch input shape of type tuple or ' - 'list of input shapes. User provided ' - 'input type: {}'.format(type(input_shape))) - - if input_shape and not self.inputs: - # We create placeholders for the `None`s in the shape and build the model - # in a Graph. Since tf.Variable is compatible with both eager execution - # and graph building, the variables created after building the model in - # a Graph are still valid when executing eagerly. 
- if context.executing_eagerly(): - graph = func_graph.FuncGraph('build_graph') - else: - graph = backend.get_graph() - with graph.as_default(): - if isinstance(input_shape, list): - x = [base_layer_utils.generate_placeholders_from_shape(shape) - for shape in input_shape] - elif isinstance(input_shape, dict): - x = { - k: base_layer_utils.generate_placeholders_from_shape(shape) - for k, shape in input_shape.items() - } - else: - x = base_layer_utils.generate_placeholders_from_shape(input_shape) - - kwargs = {} - call_signature = self._call_full_argspec - call_args = call_signature.args - # Exclude `self`, `inputs`, and any argument with a default value. - if len(call_args) > 2: - if call_signature.defaults: - call_args = call_args[2:-len(call_signature.defaults)] - else: - call_args = call_args[2:] - for arg in call_args: - if arg == 'training': - # Case where `training` is a positional arg with no default. - kwargs['training'] = False - else: - # Has invalid call signature with unknown positional arguments. - raise ValueError( - 'Currently, you cannot build your model if it has ' - 'positional or keyword arguments that are not ' - 'inputs to the model, but are required for its ' - '`call` method. Instead, in order to instantiate ' - 'and build your model, `call` your model on real ' - 'tensor data with all expected call arguments.') - elif len(call_args) < 2: - # Signature without `inputs`. - raise ValueError('You can only call `build` on a model if its `call` ' - 'method accepts an `inputs` argument.') - try: - self.call(x, **kwargs) - except (errors.InvalidArgumentError, TypeError): - raise ValueError('You cannot build your model by calling `build` ' - 'if your layers do not support float type inputs. ' - 'Instead, in order to instantiate and build your ' - 'model, `call` your model on real tensor data (of ' - 'the correct dtype).') - - super(Network, self).build(input_shape) - def call(self, inputs, training=None, mask=None): """Calls the model on new inputs. @@ -763,17 +378,10 @@ class Network(base_layer.Layer): A tensor if there is a single output, or a list of tensors if there are more than one outputs. """ - if not self._is_graph_network: - raise NotImplementedError('When subclassing the `Model` class, you should' - ' implement a `call` method.') - return self._run_internal_graph( inputs, training=training, mask=mask) def compute_output_shape(self, input_shape): - if not self._is_graph_network: - return super(Network, self).compute_output_shape(input_shape) - # Convert any shapes in tuple format to TensorShapes. input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) @@ -975,8 +583,6 @@ class Network(base_layer.Layer): return tensor def get_config(self): - if not self._is_graph_network: - raise NotImplementedError return copy.deepcopy(get_network_config(self)) @classmethod @@ -1002,373 +608,6 @@ class Network(base_layer.Layer): connect_ancillary_layers(model, created_layers) return model - def save(self, - filepath, - overwrite=True, - include_optimizer=True, - save_format=None, - signatures=None, - options=None): - """Saves the model to Tensorflow SavedModel or a single HDF5 file. - - The savefile includes: - - - The model architecture, allowing to re-instantiate the model. - - The model weights. - - The state of the optimizer, allowing to resume training - exactly where you left off. - - This allows you to save the entirety of the state of a model - in a single file. - - Saved models can be reinstantiated via `keras.models.load_model`. 
- The model returned by `load_model` is a compiled model ready to be used - (unless the saved model was never compiled in the first place). - - Models built with the Sequential and Functional API can be saved to both the - HDF5 and SavedModel formats. Subclassed models can only be saved with the - SavedModel format. - - Note that the model weights may have different scoped names after being - loaded. Scoped names include the model/layer names, such as - `"dense_1/kernel:0"`. It is recommended that you use the layer properties to - access specific variables, e.g. `model.get_layer("dense_1").kernel`. - - Arguments: - filepath: String, PathLike, path to SavedModel or H5 file to save the - model. - overwrite: Whether to silently overwrite any existing file at the - target location, or provide the user with a manual prompt. - include_optimizer: If True, save optimizer's state together. - save_format: Either `'tf'` or `'h5'`, indicating whether to save the - model to Tensorflow SavedModel or HDF5. Defaults to 'tf' in TF 2.X, - and 'h5' in TF 1.X. - signatures: Signatures to save with the SavedModel. Applicable to the - 'tf' format only. Please see the `signatures` argument in - `tf.saved_model.save` for details. - options: Optional `tf.saved_model.SaveOptions` object that specifies - options for saving to SavedModel. - - Example: - - ```python - from keras.models import load_model - - model.save('my_model.h5') # creates a HDF5 file 'my_model.h5' - del model # deletes the existing model - - # returns a compiled model - # identical to the previous one - model = load_model('my_model.h5') - ``` - """ - save.save_model(self, filepath, overwrite, include_optimizer, save_format, - signatures, options) - - def save_weights(self, filepath, overwrite=True, save_format=None): - """Saves all layer weights. - - Either saves in HDF5 or in TensorFlow format based on the `save_format` - argument. - - When saving in HDF5 format, the weight file has: - - `layer_names` (attribute), a list of strings - (ordered names of model layers). - - For every layer, a `group` named `layer.name` - - For every such layer group, a group attribute `weight_names`, - a list of strings - (ordered names of weights tensor of the layer). - - For every weight in the layer, a dataset - storing the weight value, named after the weight tensor. - - When saving in TensorFlow format, all objects referenced by the network are - saved in the same format as `tf.train.Checkpoint`, including any `Layer` - instances or `Optimizer` instances assigned to object attributes. For - networks constructed from inputs and outputs using `tf.keras.Model(inputs, - outputs)`, `Layer` instances used by the network are tracked/saved - automatically. For user-defined classes which inherit from `tf.keras.Model`, - `Layer` instances must be assigned to object attributes, typically in the - constructor. See the documentation of `tf.train.Checkpoint` and - `tf.keras.Model` for details. - - While the formats are the same, do not mix `save_weights` and - `tf.train.Checkpoint`. Checkpoints saved by `Model.save_weights` should be - loaded using `Model.load_weights`. Checkpoints saved using - `tf.train.Checkpoint.save` should be restored using the corresponding - `tf.train.Checkpoint.restore`. Prefer `tf.train.Checkpoint` over - `save_weights` for training checkpoints. - - The TensorFlow format matches objects and variables by starting at a root - object, `self` for `save_weights`, and greedily matching attribute - names. 
For `Model.save` this is the `Model`, and for `Checkpoint.save` this - is the `Checkpoint` even if the `Checkpoint` has a model attached. This - means saving a `tf.keras.Model` using `save_weights` and loading into a - `tf.train.Checkpoint` with a `Model` attached (or vice versa) will not match - the `Model`'s variables. See the [guide to training - checkpoints](https://www.tensorflow.org/guide/checkpoint) for details - on the TensorFlow format. - - Arguments: - filepath: String or PathLike, path to the file to save the weights to. - When saving in TensorFlow format, this is the prefix used for - checkpoint files (multiple files are generated). Note that the '.h5' - suffix causes weights to be saved in HDF5 format. - overwrite: Whether to silently overwrite any existing file at the - target location, or provide the user with a manual prompt. - save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or - '.keras' will default to HDF5 if `save_format` is `None`. Otherwise - `None` defaults to 'tf'. - - Raises: - ImportError: If h5py is not available when attempting to save in HDF5 - format. - ValueError: For invalid/unknown format arguments. - """ - self._assert_weights_created() - filepath = path_to_string(filepath) - filepath_is_h5 = _is_hdf5_filepath(filepath) - if save_format is None: - if filepath_is_h5: - save_format = 'h5' - else: - save_format = 'tf' - else: - user_format = save_format.lower().strip() - if user_format in ('tensorflow', 'tf'): - save_format = 'tf' - elif user_format in ('hdf5', 'h5', 'keras'): - save_format = 'h5' - else: - raise ValueError( - 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( - save_format,)) - if save_format == 'tf' and filepath_is_h5: - raise ValueError( - ('save_weights got save_format="tf"/"tensorflow", but the ' - 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' - 'when saving in TensorFlow format.') - % filepath) - - if save_format == 'h5' and h5py is None: - raise ImportError( - '`save_weights` requires h5py when saving in hdf5.') - if save_format == 'tf': - check_filepath = filepath + '.index' - else: - check_filepath = filepath - # If file exists and should not be overwritten: - if not overwrite and os.path.isfile(check_filepath): - proceed = ask_to_proceed_with_overwrite(check_filepath) - if not proceed: - return - if save_format == 'h5': - with h5py.File(filepath, 'w') as f: - hdf5_format.save_weights_to_hdf5_group(f, self.layers) - else: - if context.executing_eagerly(): - session = None - else: - session = backend.get_session() - optimizer = getattr(self, 'optimizer', None) - if (optimizer - and not isinstance(optimizer, trackable.Trackable)): - logging.warning( - ('This model was compiled with a Keras optimizer (%s) but is being ' - 'saved in TensorFlow format with `save_weights`. The model\'s ' - 'weights will be saved, but unlike with TensorFlow optimizers in ' - 'the TensorFlow format the optimizer\'s state will not be ' - 'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.') - % (optimizer,)) - self._trackable_saver.save(filepath, session=session) - # Record this checkpoint so it's visible from tf.train.latest_checkpoint. - checkpoint_management.update_checkpoint_state_internal( - save_dir=os.path.dirname(filepath), - model_checkpoint_path=filepath, - save_relative_paths=True, - all_model_checkpoint_paths=[filepath]) - - def load_weights(self, filepath, by_name=False, skip_mismatch=False): - """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. 
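[Editor's illustration] The `save_weights` docstring above spells out the HDF5 layout (a top-level `layer_names` attribute, one group per layer, and a `weight_names` attribute inside each group). A minimal sketch of reading that layout back, assuming a file `weights.h5` previously written by `model.save_weights('weights.h5')` (the filename is only a placeholder):

```python
import h5py

# Walk the layout written by Model.save_weights in HDF5 mode.
with h5py.File('weights.h5', 'r') as f:
    for layer_name in f.attrs['layer_names']:
        # Attribute values may be bytes depending on the h5py version.
        name = layer_name.decode('utf8') if hasattr(layer_name, 'decode') else layer_name
        group = f[name]
        print(name, list(group.attrs['weight_names']))  # ordered weight tensor names
```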
- - If `by_name` is False weights are loaded based on the network's - topology. This means the architecture should be the same as when the weights - were saved. Note that layers that don't have weights are not taken into - account in the topological ordering, so adding or removing layers is fine as - long as they don't have weights. - - If `by_name` is True, weights are loaded into layers only if they share the - same name. This is useful for fine-tuning or transfer-learning models where - some of the layers have changed. - - Only topological loading (`by_name=False`) is supported when loading weights - from the TensorFlow format. Note that topological loading differs slightly - between TensorFlow and HDF5 formats for user-defined classes inheriting from - `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the - TensorFlow format loads based on the object-local names of attributes to - which layers are assigned in the `Model`'s constructor. - - Arguments: - filepath: String or PathLike, path to the weights file to load. For - weight files in TensorFlow format, this is the file prefix (the - same as was passed to `save_weights`). - by_name: Boolean, whether to load weights by name or by topological - order. Only topological loading is supported for weight files in - TensorFlow format. - skip_mismatch: Boolean, whether to skip loading of layers where there is - a mismatch in the number of weights, or a mismatch in the shape of - the weight (only valid when `by_name=True`). - - Returns: - When loading a weight file in TensorFlow format, returns the same status - object as `tf.train.Checkpoint.restore`. When graph building, restore - ops are run automatically as soon as the network is built (on first call - for user-defined classes inheriting from `Model`, immediately if it is - already built). - - When loading weights in HDF5 format, returns `None`. - - Raises: - ImportError: If h5py is not available and the weight file is in HDF5 - format. - ValueError: If `skip_mismatch` is set to `True` when `by_name` is - `False`. - """ - - if skip_mismatch and not by_name: - raise ValueError( - 'When calling model.load_weights, skip_mismatch can only be set to ' - 'True when by_name is True.') - - filepath = path_to_string(filepath) - if _is_hdf5_filepath(filepath): - save_format = 'h5' - else: - try: - py_checkpoint_reader.NewCheckpointReader(filepath) - save_format = 'tf' - except errors_impl.DataLossError: - # The checkpoint is not readable in TensorFlow format. Try HDF5. - save_format = 'h5' - if save_format == 'tf': - status = self._trackable_saver.restore(filepath) - if by_name: - raise NotImplementedError( - 'Weights may only be loaded based on topology into Models when ' - 'loading TensorFlow-formatted weights (got by_name=True to ' - 'load_weights).') - if not context.executing_eagerly(): - session = backend.get_session() - # Restore existing variables (if any) immediately, and set up a - # streaming restore for any variables created in the future. - trackable_utils.streaming_restore(status=status, session=session) - status.assert_nontrivial_match() - return status - if h5py is None: - raise ImportError( - '`load_weights` requires h5py when loading weights from HDF5.') - if self._is_graph_network and not self.built: - raise NotImplementedError( - 'Unable to load weights saved in HDF5 format into a subclassed ' - 'Model which has not created its variables yet. 
Call the Model ' - 'first, then load the weights.') - self._assert_weights_created() - with h5py.File(filepath, 'r') as f: - if 'layer_names' not in f.attrs and 'model_weights' in f: - f = f['model_weights'] - if by_name: - hdf5_format.load_weights_from_hdf5_group_by_name( - f, self.layers, skip_mismatch=skip_mismatch) - else: - hdf5_format.load_weights_from_hdf5_group(f, self.layers) - - def _updated_config(self): - """Util shared between different serialization methods. - - Returns: - Model config with Keras version information added. - """ - from tensorflow.python.keras import __version__ as keras_version # pylint: disable=g-import-not-at-top - - config = self.get_config() - model_config = { - 'class_name': self.__class__.__name__, - 'config': config, - 'keras_version': keras_version, - 'backend': backend.backend() - } - return model_config - - def to_json(self, **kwargs): - """Returns a JSON string containing the network configuration. - - To load a network from a JSON save file, use - `keras.models.model_from_json(json_string, custom_objects={})`. - - Arguments: - **kwargs: Additional keyword arguments - to be passed to `json.dumps()`. - - Returns: - A JSON string. - """ - model_config = self._updated_config() - return json.dumps( - model_config, default=serialization.get_json_type, **kwargs) - - def to_yaml(self, **kwargs): - """Returns a yaml string containing the network configuration. - - To load a network from a yaml save file, use - `keras.models.model_from_yaml(yaml_string, custom_objects={})`. - - `custom_objects` should be a dictionary mapping - the names of custom losses / layers / etc to the corresponding - functions / classes. - - Arguments: - **kwargs: Additional keyword arguments - to be passed to `yaml.dump()`. - - Returns: - A YAML string. - - Raises: - ImportError: if yaml module is not found. - """ - if yaml is None: - raise ImportError( - 'Requires yaml module installed (`pip install pyyaml`).') - return yaml.dump(self._updated_config(), **kwargs) - - def summary(self, line_length=None, positions=None, print_fn=None): - """Prints a string summary of the network. - - Arguments: - line_length: Total length of printed lines - (e.g. set this to adapt the display to different - terminal window sizes). - positions: Relative or absolute positions of log elements - in each line. If not provided, - defaults to `[.33, .55, .67, 1.]`. - print_fn: Print function to use. Defaults to `print`. - It will be called on each line of the summary. - You can set it to a custom function - in order to capture the string summary. - - Raises: - ValueError: if `summary()` is called before the model is built. - """ - if not self.built: - raise ValueError('This model has not yet been built. ' - 'Build the model first by calling `build()` or calling ' - '`fit()` with some data, or specify ' - 'an `input_shape` argument in the first layer(s) for ' - 'automatic build.') - layer_utils.print_summary(self, - line_length=line_length, - positions=positions, - print_fn=print_fn) - def _validate_graph_inputs_and_outputs(self): """Validates the inputs and outputs of a Graph Network.""" # Check for redundancy in inputs. @@ -1542,30 +781,9 @@ class Network(base_layer.Layer): self._tensor_usage_count = tensor_usage_count def _assert_weights_created(self): - """Asserts that all the weights for the network have been created. - - For a non-dynamic network, the weights must already be created after the - layer has been called. 
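[Editor's illustration] The `summary` docstring above mentions that `print_fn` can be used to capture the summary instead of printing it; a small sketch with a toy model (model and shapes are illustrative only):

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(3, input_shape=(5,))])

# Collect the summary lines into a string via a custom print_fn.
lines = []
model.summary(print_fn=lines.append)
summary_text = '\n'.join(lines)
print(summary_text)
```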
For a dynamic network, the exact list of weights can - never be known for certain since it may change at any time during execution. - - We run this check right before accessing weights or getting the Numpy value - for the current weights. Otherwise, if the layer has never been called, - the user would just get an empty list, which is misleading. - - Raises: - ValueError: if the weights of the network has not yet been created. - """ - if self.dynamic: - return - if (not self._is_graph_network and - 'build' in self.__class__.__dict__ and - not self.built): - # For any model that has customized build() method but hasn't - # been invoked yet, this will cover both sequential and subclass model. - raise ValueError('Weights for model %s have not yet been created. ' - 'Weights are created when the Model is first called on ' - 'inputs or `build()` is called with an `input_shape`.' % - self.name) + # Override the implementation in Model. + # The Functional model should always have weight created already. + return def _graph_network_add_loss(self, symbolic_loss): new_nodes, new_layers = _map_subgraph_network(self.inputs, [symbolic_loss]) @@ -1587,42 +805,11 @@ class Network(base_layer.Layer): new_layers.append(add_metric_layer) self._insert_layers(new_layers, new_nodes) - @trackable.no_automatic_dependency_tracking - def _set_save_spec(self, inputs): - if self._saved_model_inputs_spec is not None: - return # Already set. - - input_names = self.input_names - if not input_names: - input_names = compile_utils.create_pseudo_input_names(inputs) - - flat_inputs = nest.flatten(inputs) - specs = [] - for name, tensor in zip(input_names, flat_inputs): - specs.append( - tf_utils.get_tensor_spec(tensor, dynamic_batch=False, name=name)) - specs = nest.pack_sequence_as(inputs, specs) - - self._saved_model_inputs_spec = specs - - def _get_save_spec(self, dynamic_batch=True): - if self._saved_model_inputs_spec is None: - return None - - return nest.map_structure( - lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), - self._saved_model_inputs_spec) - @property def _trackable_saved_model_saver(self): return network_serialization.NetworkSavedModelSaver(self) -def _is_hdf5_filepath(filepath): - return (filepath.endswith('.h5') or filepath.endswith('.keras') or - filepath.endswith('.hdf5')) - - def _make_node_key(layer_name, node_index): return layer_name + '_ib-' + str(node_index) @@ -1830,7 +1017,7 @@ def _map_subgraph_network(inputs, outputs): def _should_skip_first_node(layer): """Returns True if the first layer node should not be saved or loaded.""" # Networks start with a pre-existing node linking their input to output. 
- return issubclass(layer.__class__, Network) and layer._is_graph_network + return isinstance(layer, Functional) def _deserialize_keras_tensors(kwargs, layer_map): diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/functional_test.py similarity index 97% rename from tensorflow/python/keras/engine/network_test.py rename to tensorflow/python/keras/engine/functional_test.py index b4e8adf2c49..90fc9f2697f 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/functional_test.py @@ -33,8 +33,8 @@ from tensorflow.python.keras import layers from tensorflow.python.keras import models from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.engine import functional from tensorflow.python.keras.engine import input_layer as input_layer_lib -from tensorflow.python.keras.engine import network as network_lib from tensorflow.python.keras.engine import sequential from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.utils import layer_utils @@ -89,7 +89,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): self.assertEqual(len(layer.updates), 3) - network = network_lib.Network(x2, y2) + network = functional.Functional(x2, y2) self.assertEqual(len(network.updates), 3) x3 = input_layer_lib.Input(shape=(1,)) @@ -120,7 +120,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): dense_a = layers.Dense(4, name='dense_a') dense_b = layers.Dense(2, name='dense_b') y = dense_b(dense_a(x)) - network = network_lib.Network(x, y, name='dense_network') + network = functional.Functional(x, y, name='dense_network') # test various get_layer by index self.assertEqual(network.get_layer(index=1), dense_a) @@ -251,7 +251,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): x = input_layer_lib.Input(shape=(32,)) dense = layers.Dense(2) y = dense(x) - network = network_lib.Network(x, y, name='dense_network') + network = functional.Functional(x, y, name='dense_network') # test basic attributes self.assertEqual(network.name, 'dense_network') @@ -740,7 +740,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): else: x = input_layer_lib.Input(shape=(32,)) y = MaskedLayer()(x) # pylint: disable=not-callable - network = network_lib.Network(x, y) + network = functional.Functional(x, y) # test callability on Input x_2 = input_layer_lib.Input(shape=(32,)) @@ -1102,7 +1102,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): def test_subclassed_error_if_init_not_called(self): - class MyNetwork(network_lib.Network): + class MyNetwork(training_lib.Model): def __init__(self): self._foo = [layers.Dense(10), layers.Dense(10)] @@ -1124,10 +1124,12 @@ class NetworkConstructionTest(keras_parameterized.TestCase): inputs = input_layer_lib.Input(shape=(32,)) outputs = layers.Dense(4)(inputs) - with self.assertRaisesRegexp(TypeError, 'unexpected argument'): + with self.assertRaisesRegexp(TypeError, + 'got an unexpected keyword argument'): model = training_lib.Model( inputs, outputs, name='m', trainable=False, dtype='int64') - with self.assertRaisesRegexp(TypeError, 'unexpected argument'): + with self.assertRaisesRegexp(TypeError, + 'got an unexpected keyword argument'): model = training_lib.Model( inputs, outputs, name='m', trainable=False, dynamic=False) @@ -1136,8 +1138,10 @@ class NetworkConstructionTest(keras_parameterized.TestCase): self.assertFalse(model.trainable) 
self.assertFalse(model.dynamic) + class SubclassModel(training_lib.Model): + pass # Subclassed model - model = training_lib.Model( + model = SubclassModel( name='subclassed', trainable=True, dtype='int64', dynamic=True) self.assertEqual('subclassed', model.name) self.assertTrue(model.dynamic) @@ -1150,9 +1154,9 @@ class NetworkConstructionTest(keras_parameterized.TestCase): input_tensor2 = input_layer_lib.Input(shape=[10], name='b') output_tensor1 = layers.Dense(units=10)(input_tensor1) - net = network_lib.Network( + net = functional.Functional( inputs=[input_tensor1, input_tensor2], outputs=[output_tensor1]) - net2 = network_lib.Network.from_config(net.get_config()) + net2 = functional.Functional.from_config(net.get_config()) self.assertLen(net2.inputs, 2) self.assertEqual('a', net2.layers[0].name) self.assertEqual('b', net2.layers[1].name) @@ -1180,8 +1184,8 @@ class DeferredModeTest(keras_parameterized.TestCase): self.assertEqual(x.shape.as_list(), [None, 2]) outputs = layers.Dense(4)(x) - network = network_lib.Network(inputs, outputs) - self.assertIsInstance(network, network_lib.Network) + network = functional.Functional(inputs, outputs) + self.assertIsInstance(network, functional.Functional) if context.executing_eagerly(): # It should be possible to call such a network on EagerTensors. @@ -1204,7 +1208,7 @@ class DeferredModeTest(keras_parameterized.TestCase): c = AddLayer()([a, input_b]) # pylint: disable=not-callable c = layers.Dense(2)(c) - network = network_lib.Network([input_a, input_b], [a, c]) + network = functional.Functional([input_a, input_b], [a, c]) if context.executing_eagerly(): a_val = constant_op.constant( np.random.random((10, 32)).astype('float32')) @@ -1484,9 +1488,9 @@ class NestedNetworkTest(keras_parameterized.TestCase): 'x2': input_layer_lib.Input(shape=(1,)) } outputs = layers.Add()([inputs['x1'], inputs['x2']]) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) - network = network_lib.Network.from_config(network.get_config()) + network = functional.Functional.from_config(network.get_config()) result_tensor = network({ 'x': array_ops.ones((1, 1), 'float32'), @@ -1509,9 +1513,9 @@ class NestedNetworkTest(keras_parameterized.TestCase): 'x*x': layers.Multiply()([inputs, inputs]) } - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) - network = network_lib.Network.from_config(network.get_config()) + network = functional.Functional.from_config(network.get_config()) result_tensor = network(array_ops.ones((1, 1), 'float32')) result = self.evaluate(result_tensor) @@ -1531,7 +1535,8 @@ class NestedNetworkTest(keras_parameterized.TestCase): 'x1+x2': layers.Add()([inner_inputs['x1'], inner_inputs['x2']]), 'x1*x2': layers.Multiply()([inner_inputs['x1'], inner_inputs['x2']]) } - inner_network = network_lib.Network(inner_inputs, inner_outputs) + inner_network = functional.Functional( + inner_inputs, inner_outputs) inputs = [ input_layer_lib.Input(shape=(1,)), @@ -1539,9 +1544,9 @@ class NestedNetworkTest(keras_parameterized.TestCase): ] middle = inner_network({'x1': inputs[0], 'x2': inputs[1]}) outputs = layers.Add()([middle['x1+x2'], middle['x1*x2']]) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) - network = network_lib.Network.from_config(network.get_config()) + network = functional.Functional.from_config(network.get_config()) # Computes: `(x1+x2) + (x1*x2)` result_tensor = network( @@ -1735,13 +1740,13 @@ class 
DTypeTest(keras_parameterized.TestCase): def test_graph_network_dtype(self): inputs = input_layer_lib.Input((10,)) outputs = layers.Dense(10)(inputs) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) self.assertEqual(network.dtype, 'float32') @testing_utils.enable_v2_dtype_behavior def test_subclassed_network_dtype(self): - class IdentityNetwork(network_lib.Network): + class IdentityNetwork(training_lib.Model): def call(self, inputs): return inputs @@ -1785,11 +1790,11 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): def layer_and_network_test(self): # Top level layer - network = network_lib.Network() + network = functional.Functional() layer_0 = AttrTrackingLayer() - sub_network = network_lib.Network() + sub_network = functional.Functional() layer_1 = AttrTrackingLayer(dynamic=True) layer_2 = AttrTrackingLayer() sub_network.sub_layers = [layer_1, layer_2] @@ -1887,7 +1892,7 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): x = input_layer_lib.Input(shape=(None, 32)) dense = layers.Dense(2) y = dense(x) - network = network_lib.Network(x, y, name='dense_network') + network = functional.Functional(x, y, name='dense_network') for i in range(999, 1024): self.assertEqual(network.compute_output_shape((1, i, 32)), (1, i, 2)) @@ -1895,7 +1900,7 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): def test_2d_inputs_squeezed_to_1d(self): input_1d = input_layer_lib.Input(shape=()) outputs = input_1d * 2. - net = network_lib.Network(input_1d, outputs) + net = functional.Functional(input_1d, outputs) x = np.ones((10, 1)) y = net(x) @@ -1904,7 +1909,7 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): def test_1d_inputs_expanded_to_2d(self): input_1d = input_layer_lib.Input(shape=(1,)) outputs = input_1d * 2. - net = network_lib.Network(input_1d, outputs) + net = functional.Functional(input_1d, outputs) x = np.ones((10,)) y = net(x) @@ -1927,14 +1932,14 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): inputs = input_layer_lib.Input(10) outputs = my_layer(inputs, training=True) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) # Hard-coded value passed during construction is respected. self.assertAllEqual(network(x, training=False), x) inputs = input_layer_lib.Input(10) outputs = my_layer(inputs, training=False) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) network(x, training=True) # Hard-coded value passed during construction is respected. @@ -1942,7 +1947,7 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): inputs = input_layer_lib.Input(10) outputs = my_layer(inputs, training=None) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) # `None` value passed during construction is overridden. 
self.assertAllEqual(network(x, training=True), x) diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py index 2d5abac7fd6..d07ed477ba9 100644 --- a/tensorflow/python/keras/engine/sequential.py +++ b/tensorflow/python/keras/engine/sequential.py @@ -26,8 +26,8 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.keras import layers as layer_module from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.engine import functional from tensorflow.python.keras.engine import input_layer -from tensorflow.python.keras.engine import training from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.saving.saved_model import model_serialization from tensorflow.python.keras.utils import generic_utils @@ -35,7 +35,6 @@ from tensorflow.python.keras.utils import layer_utils from tensorflow.python.keras.utils import tf_utils from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.tracking import base as trackable -from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect from tensorflow.python.util.deprecation import deprecated @@ -48,7 +47,7 @@ SINGLE_LAYER_OUTPUT_ERROR_MSG = ('All layers in a Sequential model should have ' @keras_export('keras.Sequential', 'keras.models.Sequential') -class Sequential(training.Model): +class Sequential(functional.Functional): """`Sequential` groups a linear stack of layers into a `tf.keras.Model`. `Sequential` provides training and inference features on this model. @@ -113,7 +112,9 @@ class Sequential(training.Model): layers: Optional list of layers to add to the model. name: Optional name for the model. """ - super(Sequential, self).__init__(name=name, autocast=False) + # Skip the init in FunctionalModel since model doesn't have input/output yet + super(functional.Functional, self).__init__( # pylint: disable=bad-super-call + name=name, autocast=False) self.supports_masking = True self._compute_output_and_mask_jointly = True self._auto_track_sub_layers = False @@ -152,11 +153,6 @@ class Sequential(training.Model): return layers[1:] return layers[:] - @property - @trackable_layer_utils.cache_recursive_attribute('dynamic') - def dynamic(self): - return any(layer.dynamic for layer in self.layers) - @trackable.no_automatic_dependency_tracking def add(self, layer): """Adds a layer instance on top of the layer stack. @@ -233,7 +229,7 @@ class Sequential(training.Model): self.built = True if set_inputs or self._graph_initialized: - self._init_graph_network(self.inputs, self.outputs, name=self.name) + self._init_graph_network(self.inputs, self.outputs) self._graph_initialized = True else: self._layers.append(layer) @@ -267,7 +263,7 @@ class Sequential(training.Model): elif self._graph_initialized: self.layers[-1]._outbound_nodes = [] self.outputs = [self.layers[-1].output] - self._init_graph_network(self.inputs, self.outputs, name=self.name) + self._init_graph_network(self.inputs, self.outputs) self.built = True @trackable.no_automatic_dependency_tracking @@ -341,7 +337,7 @@ class Sequential(training.Model): # case, we fall back to the legacy deferred behavior. # TODO(fchollet): consider raising here, as we should not be # supporting such layers. 
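[Editor's illustration] With `Sequential` now deriving from `functional.Functional` rather than directly from `training.Model`, the public contract should be unchanged; a quick sanity sketch (layer names are illustrative):

```python
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, input_shape=(8,), name='hidden'),
    tf.keras.layers.Dense(1, name='out'),
])

# Sequential is still a Model, and its config can rebuild an equivalent
# (freshly initialized) model.
print(isinstance(model, tf.keras.Model))                    # True
clone = tf.keras.Sequential.from_config(model.get_config())
print([layer.name for layer in clone.layers])               # ['hidden', 'out']
```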
- self._init_graph_network(inputs, outputs, name=self.name) + self._init_graph_network(inputs, outputs) self._graph_initialized = True except: # pylint:disable=bare-except self._use_legacy_deferred_behavior = True @@ -350,7 +346,7 @@ class Sequential(training.Model): @generic_utils.default def build(self, input_shape=None): if self._graph_initialized: - self._init_graph_network(self.inputs, self.outputs, name=self.name) + self._init_graph_network(self.inputs, self.outputs) else: if input_shape is None: raise ValueError('You must provide an `input_shape` argument.') @@ -380,7 +376,7 @@ class Sequential(training.Model): if self._graph_initialized: if not self.built: - self._init_graph_network(self.inputs, self.outputs, name=self.name) + self._init_graph_network(self.inputs, self.outputs) return super(Sequential, self).call(inputs, training=training, mask=mask) outputs = inputs # handle the corner case where self.layers is empty @@ -519,6 +515,13 @@ class Sequential(training.Model): return False return True + def _assert_weights_created(self): + if self._graph_initialized: + return + # When the graph has not been initialized, use the Model's implementation to + # to check if the weights has been created. + super(functional.Functional, self)._assert_weights_created() # pylint: disable=bad-super-call + def _get_shape_tuple(t): if hasattr(t, 'shape'): diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index bb68ffca2ed..52bf42a099d 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -20,6 +20,9 @@ from __future__ import print_function import copy import itertools +import json +import os +import six from tensorflow.python.autograph.lang import directives from tensorflow.python.distribute import distribute_coordinator as dc @@ -31,19 +34,31 @@ from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import monitoring +from tensorflow.python.framework import errors +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import func_graph from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras import backend from tensorflow.python.keras import callbacks as callbacks_module from tensorflow.python.keras import optimizers from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils +from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine import compile_utils from tensorflow.python.keras.engine import data_adapter -from tensorflow.python.keras.engine import network from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso +from tensorflow.python.keras.saving import hdf5_format +from tensorflow.python.keras.saving import save from tensorflow.python.keras.saving.saved_model import model_serialization +from tensorflow.python.keras.utils import generic_utils +from tensorflow.python.keras.utils import layer_utils from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras.utils import version_utils +from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite +from 
tensorflow.python.keras.utils.io_utils import path_to_string from tensorflow.python.keras.utils.mode_keys import ModeKeys from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -52,12 +67,33 @@ from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_concat_ops from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.profiler import trace +from tensorflow.python.training import checkpoint_management +from tensorflow.python.training import py_checkpoint_reader from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.training.tracking import data_structures +from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils +from tensorflow.python.training.tracking import util as trackable_utils from tensorflow.python.util import deprecation from tensorflow.python.util import nest +from tensorflow.python.util import serialization from tensorflow.python.util import tf_decorator from tensorflow.python.util.tf_export import keras_export +from tensorflow.tools.docs import doc_controls + + +# pylint: disable=g-import-not-at-top +try: + import h5py +except ImportError: + h5py = None + +try: + import yaml +except ImportError: + yaml = None +# pylint: enable=g-import-not-at-top _keras_api_gauge = monitoring.BoolGauge('/tensorflow/api/keras', @@ -97,8 +133,25 @@ def disable_multi_worker(method): target=method, decorator_func=_method_wrapper) +def inject_functional_model_class(cls): + from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top + from tensorflow.python.keras.engine import training_v1 # pylint: disable=g-import-not-at-top + if cls == Model or cls == training_v1.Model: + return functional.Functional + + cls.__bases__ = tuple(inject_functional_model_class(base) + for base in cls.__bases__) + return cls + + +def is_functional_model_init_params(args, kwargs): + return (len(args) == 2 or + len(args) == 1 and 'outputs' in kwargs or + 'inputs' in kwargs and 'outputs' in kwargs) + + @keras_export('keras.Model', 'keras.models.Model') -class Model(network.Network, version_utils.ModelVersionSelector): +class Model(base_layer.Layer, version_utils.ModelVersionSelector): """`Model` groups layers into an object with training and inference features. Arguments: @@ -174,11 +227,61 @@ class Model(network.Network, version_utils.ModelVersionSelector): _TF_MODULE_IGNORED_PROPERTIES = frozenset( itertools.chain(('_train_counter', '_test_counter', '_predict_counter', '_steps_per_execution'), - network.Network._TF_MODULE_IGNORED_PROPERTIES)) # pylint: disable=protected-access + base_layer.Layer._TF_MODULE_IGNORED_PROPERTIES)) # pylint: disable=protected-access + def __new__(cls, *args, **kwargs): + # Signature detection + if is_functional_model_init_params(args, kwargs) and cls == Model: + # Functional model + from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top + return functional.Functional(*args, **kwargs) + else: + return super(Model, cls).__new__(cls, *args, **kwargs) + + @trackable.no_automatic_dependency_tracking def __init__(self, *args, **kwargs): - super(Model, self).__init__(*args, **kwargs) - _keras_api_gauge.get_cell('model').set(True) + # Special case for Subclassed Functional Model, which we couldn't detect + # when __new__ is called. 
We only realize it is a functional model when it + # calls super.__init__ with input and output tensor. + from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top + if (is_functional_model_init_params(args, kwargs) and + not isinstance(self, functional.Functional)): + inject_functional_model_class(self.__class__) + functional.Functional.__init__(self, *args, **kwargs) + return + + # The following are implemented as property functions: + # self.trainable_weights + # self.non_trainable_weights + generic_utils.validate_kwargs(kwargs, {'trainable', 'dtype', 'dynamic', + 'name', 'autocast'}) + super(Model, self).__init__(**kwargs) + # By default, Model is a subclass model, which is not in graph network. + self._is_graph_network = False + + self.inputs = None + self.outputs = None + self.input_names = None + self.output_names = None + # stop_training is used by callback to stop training when error happens + self.stop_training = False + self.history = None + # These objects are used in the default `Model.compile`. They are not + # guaranteed to be set after `Model.compile` is called, as users can + # override compile with custom logic. + self.compiled_loss = None + self.compiled_metrics = None + + # This is True for Sequential networks and Functional networks. + self._compute_output_and_mask_jointly = False + + # Don't reset compilation if already done. This may occur if calling + # `__init__` (or `_init_graph_network`) on an already-compiled model + # such as a Sequential model. Sequential models may need to rebuild + # themselves after compilation. + self._maybe_create_attribute('_is_compiled', False) + self._maybe_create_attribute('optimizer', None) + # Model must be created under scope of DistStrat it will be trained with. if ds_context.has_strategy(): self._distribution_strategy = ds_context.get_strategy() @@ -186,23 +289,20 @@ class Model(network.Network, version_utils.ModelVersionSelector): self._distribution_strategy = None # Defaults to value of `tf.config.experimental_functions_run_eagerly`. self._run_eagerly = None - self.stop_training = False # Initialize cache attrs. self._reset_compile_cache() # Fault-tolerance handler. Set in `ModelCheckpoint`. self._training_state = None - self.history = None - - # These objects are used in the default `Model.compile`. They are not - # guaranteed to be set after `Model.compile` is called, as users can - # override compile with custom logic. - self.compiled_loss = None - self.compiled_metrics = None + self._saved_model_inputs_spec = None + self._trackable_saver = ( + trackable_utils.saver_with_op_caching(self)) self._steps_per_execution = None self._init_batch_counters() + self._base_model_initialized = True + _keras_api_gauge.get_cell('model').set(True) @trackable.no_automatic_dependency_tracking def _init_batch_counters(self): @@ -214,67 +314,153 @@ class Model(network.Network, version_utils.ModelVersionSelector): self._predict_counter = variables.Variable( 0, dtype='int64', aggregation=agg) - def get_weights(self): - """Retrieves the weights of the model. + def __setattr__(self, name, value): + if not getattr(self, '_self_setattr_tracking', True): + super(Model, self).__setattr__(name, value) + return - Returns: - A flat list of Numpy arrays. 
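[Editor's illustration] The `__new__`/`__init__` changes above mean that calling `tf.keras.Model` directly with inputs and outputs is routed to the Functional class, while plain subclasses keep the base `Model` behaviour. A minimal sketch of the intended dispatch (the printed class name is what this patch aims for, not a guarantee across releases):

```python
import tensorflow as tf

inputs = tf.keras.Input(shape=(4,))
outputs = tf.keras.layers.Dense(2)(inputs)

# is_functional_model_init_params() sees (inputs, outputs), so __new__
# hands construction over to the Functional class.
functional_model = tf.keras.Model(inputs, outputs)
print(type(functional_model).__name__)


class Subclassed(tf.keras.Model):

  def __init__(self):
    super(Subclassed, self).__init__()
    self.dense = tf.keras.layers.Dense(2)

  def call(self, x):
    return self.dense(x)


# No inputs/outputs arguments, so this stays a regular subclassed Model.
print(type(Subclassed()).__name__)
```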
- """ - with self.distribute_strategy.scope(): - return super(Model, self).get_weights() + if all( + isinstance(v, (base_layer.Layer, + data_structures.TrackableDataStructure)) or + trackable_layer_utils.has_weights(v) for v in nest.flatten(value)): + try: + self._base_model_initialized + except AttributeError: + # six.raise_from supresses the original AttributeError from being raised + six.raise_from( + RuntimeError('It looks like you are subclassing `Model` and you ' + 'forgot to call `super(YourClass, self).__init__()`.' + ' Always start with this line.'), None) - def load_weights(self, filepath, by_name=False, skip_mismatch=False): - """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. + super(Model, self).__setattr__(name, value) - If `by_name` is False weights are loaded based on the network's - topology. This means the architecture should be the same as when the weights - were saved. Note that layers that don't have weights are not taken into - account in the topological ordering, so adding or removing layers is fine as - long as they don't have weights. + # Keep track of metric instance created in subclassed model/layer. + # We do this so that we can maintain the correct order of metrics by adding + # the instance to the `metrics` list as soon as it is created. + from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top + if isinstance(value, metrics_module.Metric): + self._metrics.append(value) - If `by_name` is True, weights are loaded into layers only if they share the - same name. This is useful for fine-tuning or transfer-learning models where - some of the layers have changed. + @generic_utils.default + def build(self, input_shape): + """Builds the model based on input shapes received. - Only topological loading (`by_name=False`) is supported when loading weights - from the TensorFlow format. Note that topological loading differs slightly - between TensorFlow and HDF5 formats for user-defined classes inheriting from - `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the - TensorFlow format loads based on the object-local names of attributes to - which layers are assigned in the `Model`'s constructor. + This is to be used for subclassed models, which do not know at instantiation + time what their inputs look like. - Arguments: - filepath: String, path to the weights file to load. For weight files in - TensorFlow format, this is the file prefix (the same as was passed - to `save_weights`). - by_name: Boolean, whether to load weights by name or by topological - order. Only topological loading is supported for weight files in - TensorFlow format. - skip_mismatch: Boolean, whether to skip loading of layers where there is - a mismatch in the number of weights, or a mismatch in the shape of - the weight (only valid when `by_name=True`). + This method only exists for users who want to call `model.build()` in a + standalone way (as a substitute for calling the model on real data to + build it). It will never be called by the framework (and thus it will + never throw unexpected errors in an unrelated workflow). - Returns: - When loading a weight file in TensorFlow format, returns the same status - object as `tf.train.Checkpoint.restore`. When graph building, restore - ops are run automatically as soon as the network is built (on first call - for user-defined classes inheriting from `Model`, immediately if it is - already built). - - When loading weights in HDF5 format, returns `None`. 
+ Args: + input_shape: Single tuple, TensorShape, or list of shapes, where shapes + are tuples, integers, or TensorShapes. Raises: - ImportError: If h5py is not available and the weight file is in HDF5 - format. - ValueError: If `skip_mismatch` is set to `True` when `by_name` is - `False`. + ValueError: + 1. In case of invalid user-provided data (not of type tuple, + list, or TensorShape). + 2. If the model requires call arguments that are agnostic + to the input shapes (positional or kwarg in call signature). + 3. If not all layers were properly built. + 4. If float type inputs are not supported within the layers. + + In each of these cases, the user should build their model by calling it + on real tensor data. """ - if dist_utils.is_tpu_strategy(self._distribution_strategy): - if (self._distribution_strategy.extended.steps_per_run > 1 and - (not network._is_hdf5_filepath(filepath))): # pylint: disable=protected-access - raise ValueError('Load weights is not yet supported with TPUStrategy ' - 'with steps_per_run greater than 1.') - return super(Model, self).load_weights(filepath, by_name, skip_mismatch) + if self._is_graph_network: + super(Model, self).build(input_shape) + return + + if input_shape is None: + raise ValueError('Input shape must be defined when calling build on a ' + 'model subclass network.') + valid_types = (tuple, list, tensor_shape.TensorShape) + if not isinstance(input_shape, valid_types): + raise ValueError('Specified input shape is not one of the valid types. ' + 'Please specify a batch input shape of type tuple or ' + 'list of input shapes. User provided ' + 'input type: {}'.format(type(input_shape))) + + if input_shape and not self.inputs: + # We create placeholders for the `None`s in the shape and build the model + # in a Graph. Since tf.Variable is compatible with both eager execution + # and graph building, the variables created after building the model in + # a Graph are still valid when executing eagerly. + if context.executing_eagerly(): + graph = func_graph.FuncGraph('build_graph') + else: + graph = backend.get_graph() + with graph.as_default(): + if isinstance(input_shape, list): + x = [base_layer_utils.generate_placeholders_from_shape(shape) + for shape in input_shape] + elif isinstance(input_shape, dict): + x = { + k: base_layer_utils.generate_placeholders_from_shape(shape) + for k, shape in input_shape.items() + } + else: + x = base_layer_utils.generate_placeholders_from_shape(input_shape) + + kwargs = {} + call_signature = self._call_full_argspec + call_args = call_signature.args + # Exclude `self`, `inputs`, and any argument with a default value. + if len(call_args) > 2: + if call_signature.defaults: + call_args = call_args[2:-len(call_signature.defaults)] + else: + call_args = call_args[2:] + for arg in call_args: + if arg == 'training': + # Case where `training` is a positional arg with no default. + kwargs['training'] = False + else: + # Has invalid call signature with unknown positional arguments. + raise ValueError( + 'Currently, you cannot build your model if it has ' + 'positional or keyword arguments that are not ' + 'inputs to the model, but are required for its ' + '`call` method. Instead, in order to instantiate ' + 'and build your model, `call` your model on real ' + 'tensor data with all expected call arguments.') + elif len(call_args) < 2: + # Signature without `inputs`. 
+ raise ValueError('You can only call `build` on a model if its `call` ' + 'method accepts an `inputs` argument.') + try: + self.call(x, **kwargs) + except (errors.InvalidArgumentError, TypeError): + raise ValueError('You cannot build your model by calling `build` ' + 'if your layers do not support float type inputs. ' + 'Instead, in order to instantiate and build your ' + 'model, `call` your model on real tensor data (of ' + 'the correct dtype).') + + super(Model, self).build(input_shape) + + def call(self, inputs, training=None, mask=None): + """Calls the model on new inputs. + + In this case `call` just reapplies + all ops in the graph to the new inputs + (e.g. build a new computational graph from the provided inputs). + + Arguments: + inputs: A tensor or list of tensors. + training: Boolean or boolean scalar tensor, indicating whether to run + the `Network` in training mode or inference mode. + mask: A mask or list of masks. A mask can be + either a tensor or None (no mask). + + Returns: + A tensor if there is a single output, or + a list of tensors if there are more than one outputs. + """ + raise NotImplementedError('When subclassing the `Model` class, you should ' + 'implement a `call` method.') def compile(self, optimizer='rmsprop', @@ -399,6 +585,10 @@ class Model(network.Network, version_utils.ModelVersionSelector): dtype='int64', aggregation=variables.VariableAggregationV2.ONLY_FIRST_REPLICA) + @property + def _should_compute_mask(self): + return False + @property def metrics(self): """Returns the model's metrics added using `compile`, `add_metric` APIs. @@ -1661,6 +1851,564 @@ class Model(network.Network, version_utils.ModelVersionSelector): verbose=verbose, callbacks=callbacks) + ###################################################################### + # Functions below are not training related. They are for model weights + # tracking, save/load, serialization, etc. + ###################################################################### + + @property + def trainable_weights(self): + self._assert_weights_created() + return self._dedup_weights( + trackable_layer_utils.gather_trainable_weights( + trainable=self.trainable, + sub_layers=self._layers, + extra_variables=self._trainable_weights)) + + @property + def non_trainable_weights(self): + self._assert_weights_created() + return self._dedup_weights( + trackable_layer_utils.gather_non_trainable_weights( + trainable=self.trainable, + sub_layers=self._layers, + extra_variables=self._non_trainable_weights + + self._trainable_weights)) + + def get_weights(self): + """Retrieves the weights of the model. + + Returns: + A flat list of Numpy arrays. + """ + with self.distribute_strategy.scope(): + return super(Model, self).get_weights() + + def save(self, + filepath, + overwrite=True, + include_optimizer=True, + save_format=None, + signatures=None, + options=None): + """Saves the model to Tensorflow SavedModel or a single HDF5 file. + + The savefile includes: + + - The model architecture, allowing to re-instantiate the model. + - The model weights. + - The state of the optimizer, allowing to resume training + exactly where you left off. + + This allows you to save the entirety of the state of a model + in a single file. + + Saved models can be reinstantiated via `keras.models.load_model`. + The model returned by `load_model` is a compiled model ready to be used + (unless the saved model was never compiled in the first place). + + Models built with the Sequential and Functional API can be saved to both the + HDF5 and SavedModel formats. 
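[Editor's illustration] A usage sketch of the standalone `build` path for subclassed models described above (class, layer sizes, and shapes are illustrative):

```python
import tensorflow as tf


class TwoLayer(tf.keras.Model):

  def __init__(self):
    super(TwoLayer, self).__init__()
    self.hidden = tf.keras.layers.Dense(8)
    self.out = tf.keras.layers.Dense(1)

  def call(self, inputs):
    return self.out(self.hidden(inputs))


model = TwoLayer()
model.build((None, 4))     # build from a shape instead of calling on real data
print(len(model.weights))  # 4 variables: two kernels and two biases
```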
Subclassed models can only be saved with the + SavedModel format. + + Note that the model weights may have different scoped names after being + loaded. Scoped names include the model/layer names, such as + `"dense_1/kernel:0"`. It is recommended that you use the layer properties to + access specific variables, e.g. `model.get_layer("dense_1").kernel`. + + Arguments: + filepath: String, PathLike, path to SavedModel or H5 file to save the + model. + overwrite: Whether to silently overwrite any existing file at the + target location, or provide the user with a manual prompt. + include_optimizer: If True, save optimizer's state together. + save_format: Either `'tf'` or `'h5'`, indicating whether to save the + model to Tensorflow SavedModel or HDF5. Defaults to 'tf' in TF 2.X, + and 'h5' in TF 1.X. + signatures: Signatures to save with the SavedModel. Applicable to the + 'tf' format only. Please see the `signatures` argument in + `tf.saved_model.save` for details. + options: Optional `tf.saved_model.SaveOptions` object that specifies + options for saving to SavedModel. + + Example: + + ```python + from keras.models import load_model + + model.save('my_model.h5') # creates a HDF5 file 'my_model.h5' + del model # deletes the existing model + + # returns a compiled model + # identical to the previous one + model = load_model('my_model.h5') + ``` + """ + save.save_model(self, filepath, overwrite, include_optimizer, save_format, + signatures, options) + + def save_weights(self, filepath, overwrite=True, save_format=None): + """Saves all layer weights. + + Either saves in HDF5 or in TensorFlow format based on the `save_format` + argument. + + When saving in HDF5 format, the weight file has: + - `layer_names` (attribute), a list of strings + (ordered names of model layers). + - For every layer, a `group` named `layer.name` + - For every such layer group, a group attribute `weight_names`, + a list of strings + (ordered names of weights tensor of the layer). + - For every weight in the layer, a dataset + storing the weight value, named after the weight tensor. + + When saving in TensorFlow format, all objects referenced by the network are + saved in the same format as `tf.train.Checkpoint`, including any `Layer` + instances or `Optimizer` instances assigned to object attributes. For + networks constructed from inputs and outputs using `tf.keras.Model(inputs, + outputs)`, `Layer` instances used by the network are tracked/saved + automatically. For user-defined classes which inherit from `tf.keras.Model`, + `Layer` instances must be assigned to object attributes, typically in the + constructor. See the documentation of `tf.train.Checkpoint` and + `tf.keras.Model` for details. + + While the formats are the same, do not mix `save_weights` and + `tf.train.Checkpoint`. Checkpoints saved by `Model.save_weights` should be + loaded using `Model.load_weights`. Checkpoints saved using + `tf.train.Checkpoint.save` should be restored using the corresponding + `tf.train.Checkpoint.restore`. Prefer `tf.train.Checkpoint` over + `save_weights` for training checkpoints. + + The TensorFlow format matches objects and variables by starting at a root + object, `self` for `save_weights`, and greedily matching attribute + names. For `Model.save` this is the `Model`, and for `Checkpoint.save` this + is the `Checkpoint` even if the `Checkpoint` has a model attached. 
This + means saving a `tf.keras.Model` using `save_weights` and loading into a + `tf.train.Checkpoint` with a `Model` attached (or vice versa) will not match + the `Model`'s variables. See the [guide to training + checkpoints](https://www.tensorflow.org/guide/checkpoint) for details + on the TensorFlow format. + + Arguments: + filepath: String or PathLike, path to the file to save the weights to. + When saving in TensorFlow format, this is the prefix used for + checkpoint files (multiple files are generated). Note that the '.h5' + suffix causes weights to be saved in HDF5 format. + overwrite: Whether to silently overwrite any existing file at the + target location, or provide the user with a manual prompt. + save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or + '.keras' will default to HDF5 if `save_format` is `None`. Otherwise + `None` defaults to 'tf'. + + Raises: + ImportError: If h5py is not available when attempting to save in HDF5 + format. + ValueError: For invalid/unknown format arguments. + """ + self._assert_weights_created() + filepath = path_to_string(filepath) + filepath_is_h5 = _is_hdf5_filepath(filepath) + if save_format is None: + if filepath_is_h5: + save_format = 'h5' + else: + save_format = 'tf' + else: + user_format = save_format.lower().strip() + if user_format in ('tensorflow', 'tf'): + save_format = 'tf' + elif user_format in ('hdf5', 'h5', 'keras'): + save_format = 'h5' + else: + raise ValueError( + 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( + save_format,)) + if save_format == 'tf' and filepath_is_h5: + raise ValueError( + ('save_weights got save_format="tf"/"tensorflow", but the ' + 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' + 'when saving in TensorFlow format.') + % filepath) + + if save_format == 'h5' and h5py is None: + raise ImportError( + '`save_weights` requires h5py when saving in hdf5.') + if save_format == 'tf': + check_filepath = filepath + '.index' + else: + check_filepath = filepath + # If file exists and should not be overwritten: + if not overwrite and os.path.isfile(check_filepath): + proceed = ask_to_proceed_with_overwrite(check_filepath) + if not proceed: + return + if save_format == 'h5': + with h5py.File(filepath, 'w') as f: + hdf5_format.save_weights_to_hdf5_group(f, self.layers) + else: + if context.executing_eagerly(): + session = None + else: + session = backend.get_session() + optimizer = getattr(self, 'optimizer', None) + if (optimizer + and not isinstance(optimizer, trackable.Trackable)): + logging.warning( + ('This model was compiled with a Keras optimizer (%s) but is being ' + 'saved in TensorFlow format with `save_weights`. The model\'s ' + 'weights will be saved, but unlike with TensorFlow optimizers in ' + 'the TensorFlow format the optimizer\'s state will not be ' + 'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.') + % (optimizer,)) + self._trackable_saver.save(filepath, session=session) + # Record this checkpoint so it's visible from tf.train.latest_checkpoint. + checkpoint_management.update_checkpoint_state_internal( + save_dir=os.path.dirname(filepath), + model_checkpoint_path=filepath, + save_relative_paths=True, + all_model_checkpoint_paths=[filepath]) + + def load_weights(self, filepath, by_name=False, skip_mismatch=False): + """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. + + If `by_name` is False weights are loaded based on the network's + topology. 
This means the architecture should be the same as when the weights + were saved. Note that layers that don't have weights are not taken into + account in the topological ordering, so adding or removing layers is fine as + long as they don't have weights. + + If `by_name` is True, weights are loaded into layers only if they share the + same name. This is useful for fine-tuning or transfer-learning models where + some of the layers have changed. + + Only topological loading (`by_name=False`) is supported when loading weights + from the TensorFlow format. Note that topological loading differs slightly + between TensorFlow and HDF5 formats for user-defined classes inheriting from + `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the + TensorFlow format loads based on the object-local names of attributes to + which layers are assigned in the `Model`'s constructor. + + Arguments: + filepath: String, path to the weights file to load. For weight files in + TensorFlow format, this is the file prefix (the same as was passed + to `save_weights`). + by_name: Boolean, whether to load weights by name or by topological + order. Only topological loading is supported for weight files in + TensorFlow format. + skip_mismatch: Boolean, whether to skip loading of layers where there is + a mismatch in the number of weights, or a mismatch in the shape of + the weight (only valid when `by_name=True`). + + Returns: + When loading a weight file in TensorFlow format, returns the same status + object as `tf.train.Checkpoint.restore`. When graph building, restore + ops are run automatically as soon as the network is built (on first call + for user-defined classes inheriting from `Model`, immediately if it is + already built). + + When loading weights in HDF5 format, returns `None`. + + Raises: + ImportError: If h5py is not available and the weight file is in HDF5 + format. + ValueError: If `skip_mismatch` is set to `True` when `by_name` is + `False`. + """ + if dist_utils.is_tpu_strategy(self._distribution_strategy): + if (self._distribution_strategy.extended.steps_per_run > 1 and + (not _is_hdf5_filepath(filepath))): + raise ValueError('Load weights is not yet supported with TPUStrategy ' + 'with steps_per_run greater than 1.') + if skip_mismatch and not by_name: + raise ValueError( + 'When calling model.load_weights, skip_mismatch can only be set to ' + 'True when by_name is True.') + + filepath = path_to_string(filepath) + if _is_hdf5_filepath(filepath): + save_format = 'h5' + else: + try: + py_checkpoint_reader.NewCheckpointReader(filepath) + save_format = 'tf' + except errors_impl.DataLossError: + # The checkpoint is not readable in TensorFlow format. Try HDF5. + save_format = 'h5' + if save_format == 'tf': + status = self._trackable_saver.restore(filepath) + if by_name: + raise NotImplementedError( + 'Weights may only be loaded based on topology into Models when ' + 'loading TensorFlow-formatted weights (got by_name=True to ' + 'load_weights).') + if not context.executing_eagerly(): + session = backend.get_session() + # Restore existing variables (if any) immediately, and set up a + # streaming restore for any variables created in the future. 
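[Editor's illustration] A short round trip through the two `save_weights`/`load_weights` formats discussed above (paths are placeholders):

```python
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(3, input_shape=(5,))])

# TensorFlow format: the path is a checkpoint *prefix*; several files are
# written, and load_weights returns a restore status object.
model.save_weights('/tmp/ckpt/weights')
status = model.load_weights('/tmp/ckpt/weights')

# HDF5 format: selected by the '.h5' suffix; load_weights returns None.
model.save_weights('/tmp/weights.h5')
model.load_weights('/tmp/weights.h5')
```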
+ trackable_utils.streaming_restore(status=status, session=session) + status.assert_nontrivial_match() + return status + if h5py is None: + raise ImportError( + '`load_weights` requires h5py when loading weights from HDF5.') + if not self._is_graph_network and not self.built: + raise ValueError( + 'Unable to load weights saved in HDF5 format into a subclassed ' + 'Model which has not created its variables yet. Call the Model ' + 'first, then load the weights.') + self._assert_weights_created() + with h5py.File(filepath, 'r') as f: + if 'layer_names' not in f.attrs and 'model_weights' in f: + f = f['model_weights'] + if by_name: + hdf5_format.load_weights_from_hdf5_group_by_name( + f, self.layers, skip_mismatch=skip_mismatch) + else: + hdf5_format.load_weights_from_hdf5_group(f, self.layers) + + def _updated_config(self): + """Util shared between different serialization methods. + + Returns: + Model config with Keras version information added. + """ + from tensorflow.python.keras import __version__ as keras_version # pylint: disable=g-import-not-at-top + + config = self.get_config() + model_config = { + 'class_name': self.__class__.__name__, + 'config': config, + 'keras_version': keras_version, + 'backend': backend.backend() + } + return model_config + + def get_config(self): + raise NotImplementedError + + @classmethod + def from_config(cls, config, custom_objects=None): + # Since only FunctionalModel produces config, the model can only + # be constructed for FunctionalModel + from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top + return functional.Functional.from_config( + config, custom_objects=custom_objects) + + def to_json(self, **kwargs): + """Returns a JSON string containing the network configuration. + + To load a network from a JSON save file, use + `keras.models.model_from_json(json_string, custom_objects={})`. + + Arguments: + **kwargs: Additional keyword arguments + to be passed to `json.dumps()`. + + Returns: + A JSON string. + """ + model_config = self._updated_config() + return json.dumps( + model_config, default=serialization.get_json_type, **kwargs) + + def to_yaml(self, **kwargs): + """Returns a yaml string containing the network configuration. + + To load a network from a yaml save file, use + `keras.models.model_from_yaml(yaml_string, custom_objects={})`. + + `custom_objects` should be a dictionary mapping + the names of custom losses / layers / etc to the corresponding + functions / classes. + + Arguments: + **kwargs: Additional keyword arguments + to be passed to `yaml.dump()`. + + Returns: + A YAML string. + + Raises: + ImportError: if yaml module is not found. + """ + if yaml is None: + raise ImportError( + 'Requires yaml module installed (`pip install pyyaml`).') + return yaml.dump(self._updated_config(), **kwargs) + + def reset_states(self): + for layer in self.layers: + if hasattr(layer, 'reset_states') and getattr(layer, 'stateful', False): + layer.reset_states() + + @property + @deprecation.deprecated( + date=None, + instructions='This property should not be used in TensorFlow 2.0, ' + 'as updates are applied automatically.') + @doc_controls.do_not_generate_docs + def state_updates(self): + """Deprecated, do NOT use! + + Returns the `updates` from all layers that are stateful. + + This is useful for separating training updates and + state updates, e.g. when we need to update a layer's internal state + during prediction. + + Returns: + A list of update ops. 
+ """ + state_updates = [] + for layer in self.layers: + if getattr(layer, 'stateful', False): + if hasattr(layer, 'updates'): + state_updates += layer.updates + return state_updates + + @property + def weights(self): + """Returns the list of all layer variables/weights. + + Returns: + A list of variables. + """ + return self._dedup_weights(self._undeduplicated_weights) + + @property + def _undeduplicated_weights(self): + """Returns the undeduplicated list of all layer variables/weights.""" + self._assert_weights_created() + weights = [] + for layer in self._layers: + weights += layer.weights + weights += (self._trainable_weights + self._non_trainable_weights) + return weights + + def summary(self, line_length=None, positions=None, print_fn=None): + """Prints a string summary of the network. + + Arguments: + line_length: Total length of printed lines + (e.g. set this to adapt the display to different + terminal window sizes). + positions: Relative or absolute positions of log elements + in each line. If not provided, + defaults to `[.33, .55, .67, 1.]`. + print_fn: Print function to use. Defaults to `print`. + It will be called on each line of the summary. + You can set it to a custom function + in order to capture the string summary. + + Raises: + ValueError: if `summary()` is called before the model is built. + """ + if not self.built: + raise ValueError('This model has not yet been built. ' + 'Build the model first by calling `build()` or calling ' + '`fit()` with some data, or specify ' + 'an `input_shape` argument in the first layer(s) for ' + 'automatic build.') + layer_utils.print_summary(self, + line_length=line_length, + positions=positions, + print_fn=print_fn) + + @property + def layers(self): + return self._unique_sublayers() + + def get_layer(self, name=None, index=None): + """Retrieves a layer based on either its name (unique) or index. + + If `name` and `index` are both provided, `index` will take precedence. + Indices are based on order of horizontal graph traversal (bottom-up). + + Arguments: + name: String, name of layer. + index: Integer, index of layer. + + Returns: + A layer instance. + + Raises: + ValueError: In case of invalid layer name or index. + """ + # TODO(fchollet): We could build a dictionary based on layer names + # since they are constant, but we have not done that yet. + if index is not None and name is not None: + raise ValueError('Provide only a layer name or a layer index.') + + if index is not None: + if len(self.layers) <= index: + raise ValueError('Was asked to retrieve layer at index ' + str(index) + + ' but model only has ' + str(len(self.layers)) + + ' layers.') + else: + return self.layers[index] + + if name is not None: + for layer in self.layers: + if layer.name == name: + return layer + raise ValueError('No such layer: ' + name + '.') + raise ValueError('Provide either a layer name or layer index.') + + @trackable.no_automatic_dependency_tracking + def _set_save_spec(self, inputs): + if self._saved_model_inputs_spec is not None: + return # Already set. 
+ + input_names = self.input_names + if not input_names: + input_names = compile_utils.create_pseudo_input_names(inputs) + + flat_inputs = nest.flatten(inputs) + specs = [] + for name, tensor in zip(input_names, flat_inputs): + specs.append( + tf_utils.get_tensor_spec(tensor, dynamic_batch=False, name=name)) + specs = nest.pack_sequence_as(inputs, specs) + + self._saved_model_inputs_spec = specs + + def _get_save_spec(self, dynamic_batch=True): + if self._saved_model_inputs_spec is None: + return None + + return nest.map_structure( + lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), + self._saved_model_inputs_spec) + + def _assert_weights_created(self): + """Asserts that all the weights for the model have been created. + + For a non-dynamic model, the weights must already be created after the + layer has been called. For a dynamic model, the exact list of weights can + never be known for certain since it may change at any time during execution. + + We run this check right before accessing weights or getting the Numpy value + for the current weights. Otherwise, if the layer has never been called, + the user would just get an empty list, which is misleading. + + Raises: + ValueError: if the weights of the network has not yet been created. + """ + if self.dynamic: + return + + if ('build' in self.__class__.__dict__ and + self.__class__ != Model and + not self.built): + # For any model that has customized build() method but hasn't + # been invoked yet, this will cover both sequential and subclass model. + # Also make sure to exclude Model class itself which has build() defined. + raise ValueError('Weights for model %s have not yet been created. ' + 'Weights are created when the Model is first called on ' + 'inputs or `build()` is called with an `input_shape`.' % + self.name) + def _check_call_args(self, method_name): """Check that `call` has only one positional arg.""" # Always allow first arg, regardless of arg name. @@ -1990,3 +2738,8 @@ def _disallow_inside_tf_function(method_name): 'directly on `Tensor`s inside a `tf.function` like: `model(x)`.' ).format(method_name=method_name) raise RuntimeError(error_msg) + + +def _is_hdf5_filepath(filepath): + return (filepath.endswith('.h5') or filepath.endswith('.keras') or + filepath.endswith('.hdf5')) diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index 16188af833a..c137c6e517a 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -43,7 +43,7 @@ from tensorflow.python.keras import losses from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import optimizers from tensorflow.python.keras.distribute import distributed_training_utils -from tensorflow.python.keras.engine import network +from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.engine import training_arrays from tensorflow.python.keras.engine import training_distributed @@ -181,8 +181,8 @@ class Model(training_lib.Model): self._compile_time_distribution_strategy) if strategy: with strategy.scope(): - return network.Network.get_weights(self) - return network.Network.get_weights(self) + return base_layer.Layer.get_weights(self) + return base_layer.Layer.get_weights(self) def load_weights(self, filepath, by_name=False, skip_mismatch=False): """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. 
@@ -232,7 +232,7 @@ class Model(training_lib.Model): """ if distributed_training_utils.is_tpu_strategy(self._distribution_strategy): if (self._distribution_strategy.extended.steps_per_run > 1 and - (not network._is_hdf5_filepath(filepath))): # pylint: disable=protected-access + (not training_lib._is_hdf5_filepath(filepath))): # pylint: disable=protected-access raise ValueError('Load weights is not yet supported with TPUStrategy ' 'with steps_per_run greater than 1.') return super(Model, self).load_weights(filepath, by_name, skip_mismatch) @@ -491,6 +491,11 @@ class Model(training_lib.Model): """Returns the model's metrics added using `compile`, `add_metric` APIs.""" metrics = [] if self._is_compiled: + if not hasattr(self, '_v1_compile_was_called'): + # See b/155687393 for more details, the model is created as a v2 + # instance but converted to v1. Fallback to use base Model to retrieve + # the metrics. + return super(Model, self).metrics metrics += self._compile_metric_functions metrics.extend(self._metrics) metrics.extend(_get_metrics_from_layers(self._layers)) @@ -504,6 +509,12 @@ class Model(training_lib.Model): # losses for backward compatibility. metrics_names = ['loss'] if self._is_compiled: + if not hasattr(self, '_v1_compile_was_called'): + # See b/155687393 for more details, the model is created as a v2 + # instance but converted to v1. Fallback to use base Model to retrieve + # the metrics name + return super(Model, self).metrics_names + # Add output loss metric names to the metric names list. if len(self._training_endpoints) > 1: metrics_names.extend([ diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index fc7feda07a5..30be3d485df 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -114,7 +114,7 @@ def populate_deserializable_objects(): LOCAL.ALL_OBJECTS['Input'] = input_layer.Input LOCAL.ALL_OBJECTS['InputSpec'] = input_spec.InputSpec - LOCAL.ALL_OBJECTS['Network'] = models.Network + LOCAL.ALL_OBJECTS['Functional'] = models.Functional LOCAL.ALL_OBJECTS['Model'] = models.Model LOCAL.ALL_OBJECTS['SequenceFeatures'] = SequenceFeatures LOCAL.ALL_OBJECTS['Sequential'] = models.Sequential diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py index a3173f4d11f..bb22db25591 100644 --- a/tensorflow/python/keras/layers/wrappers_test.py +++ b/tensorflow/python/keras/layers/wrappers_test.py @@ -377,7 +377,8 @@ class TimeDistributedTest(keras_parameterized.TestCase): input_layer.compute_output_shape([None, 2, 4]).as_list(), [None, 2, 8]) - @keras_parameterized.run_all_keras_modes + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) + # TODO(scottzhu): check why v1 session failed. 
def test_TimeDistributed_with_mask_first_implementation(self): np.random.seed(100) rnn_layer = keras.layers.LSTM(4, return_sequences=True, stateful=True) diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py index eaffb90e64b..9f5099e100e 100644 --- a/tensorflow/python/keras/models.py +++ b/tensorflow/python/keras/models.py @@ -23,7 +23,7 @@ from tensorflow.python.framework import ops from tensorflow.python.keras import backend as K from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import optimizers -from tensorflow.python.keras.engine import network +from tensorflow.python.keras.engine import functional from tensorflow.python.keras.engine import sequential from tensorflow.python.keras.engine import training from tensorflow.python.keras.engine import training_v1 @@ -31,7 +31,6 @@ from tensorflow.python.keras.engine.base_layer import AddMetric from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_layer import Input from tensorflow.python.keras.engine.input_layer import InputLayer -from tensorflow.python.keras.engine.network import Network from tensorflow.python.keras.saving import model_config from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils import generic_utils @@ -45,6 +44,7 @@ from tensorflow.python.util.tf_export import keras_export # API entries importable from `keras.models`: Model = training.Model # pylint: disable=invalid-name Sequential = sequential.Sequential # pylint: disable=invalid-name +Functional = functional.Functional # pylint: disable=invalid-name save_model = save.save_model load_model = save.load_model model_from_config = model_config.model_from_config @@ -193,12 +193,12 @@ def _clone_functional_model(model, input_tensors=None, layer_fn=_clone_layer): if not callable(layer_fn): raise ValueError('Expected `layer_fn` argument to be a callable.') - model_config, created_layers = _clone_layers_and_model_config( + model_configs, created_layers = _clone_layers_and_model_config( model, new_input_layers, layer_fn) # Reconstruct model from the config, using the cloned layers. input_tensors, output_tensors, created_layers = ( - network.reconstruct_from_config(model_config, - created_layers=created_layers)) + functional.reconstruct_from_config(model_configs, + created_layers=created_layers)) metrics_names = model.metrics_names model = Model(input_tensors, output_tensors, name=model.name) # Layers not directly tied to outputs of the Model, such as loss layers @@ -209,8 +209,8 @@ def _clone_functional_model(model, input_tensors=None, layer_fn=_clone_layer): if ancillary_layers: new_nodes = nest.flatten([ layer.inbound_nodes[1:] - if network._should_skip_first_node(layer) else layer.inbound_nodes - for layer in created_layers.values() + if functional._should_skip_first_node(layer) + else layer.inbound_nodes for layer in created_layers.values() ]) _insert_ancillary_layers(model, ancillary_layers, metrics_names, new_nodes) return model @@ -244,7 +244,8 @@ def _clone_layers_and_model_config(model, input_layers, layer_fn): created_layers[layer.name] = layer_fn(layer) return {} - config = network.get_network_config(model, serialize_layer_fn=_copy_layer) + config = functional.get_network_config( + model, serialize_layer_fn=_copy_layer) return config, created_layers @@ -495,7 +496,7 @@ def _in_place_subclassed_model_reset(model): # This will not work for nested subclassed models used as layers. 
# This would be theoretically possible to support, but would add complexity. # Only do it if users complain. - if isinstance(layer, Network) and not layer._is_graph_network: + if isinstance(layer, training.Model) and not layer._is_graph_network: raise ValueError('We do not support the use of nested subclassed models ' 'in `model_to_estimator` at this time. Found nested ' 'model: %s' % layer) diff --git a/tensorflow/python/keras/saving/hdf5_format_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py index cae58329005..757385a25ea 100644 --- a/tensorflow/python/keras/saving/hdf5_format_test.py +++ b/tensorflow/python/keras/saving/hdf5_format_test.py @@ -1210,7 +1210,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): def test_incompatible_checkpoint(self): save_path = trackable.Checkpoint().save( os.path.join(self.get_temp_dir(), 'ckpt')) - m = keras.Model() + m = DummySubclassModel() with self.assertRaisesRegexp(AssertionError, 'Nothing to load'): m.load_weights(save_path) m.dense = keras.layers.Dense(2) @@ -1222,7 +1222,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_directory_passed(self): with self.cached_session(): - m = keras.Model() + m = DummySubclassModel() v = m.add_weight(name='v', shape=[]) self.evaluate(v.assign(42.)) prefix = os.path.join(self.get_temp_dir(), @@ -1235,7 +1235,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_relative_path(self): with self.cached_session(): - m = keras.Model() + m = DummySubclassModel() v = m.add_weight(name='v', shape=[]) os.chdir(self.get_temp_dir()) @@ -1266,7 +1266,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_nonexistent_prefix_directory(self): with self.cached_session(): - m = keras.Model() + m = DummySubclassModel() v = m.add_weight(name='v', shape=[]) self.evaluate(v.assign(42.)) prefix = os.path.join(self.get_temp_dir(), @@ -1276,5 +1276,10 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): m.load_weights(prefix) self.assertEqual(42., self.evaluate(v)) + +class DummySubclassModel(training.Model): + pass + + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py index 5ffeb0671a1..13af49e3a0d 100644 --- a/tensorflow/python/keras/saving/saved_model/load.py +++ b/tensorflow/python/keras/saving/saved_model/load.py @@ -62,9 +62,9 @@ layers_module = LazyLoader( input_layer = LazyLoader( "input_layer", globals(), "tensorflow.python.keras.engine.input_layer") -network_lib = LazyLoader( - "network_lib", globals(), - "tensorflow.python.keras.engine.network") +functional_lib = LazyLoader( + "functional_lib", globals(), + "tensorflow.python.keras.engine.functional") training_lib = LazyLoader( "training_lib", globals(), "tensorflow.python.keras.engine.training") @@ -142,7 +142,7 @@ def _is_graph_network(layer): # pylint: disable=protected-access if isinstance(layer, RevivedNetwork): return False - elif isinstance(layer, network_lib.Network): + elif isinstance(layer, functional_lib.Functional): return (layer._is_graph_network or isinstance(layer, models_lib.Sequential)) return False @@ -371,7 +371,8 @@ class 
KerasObjectLoader(tf_load.Loader): # functional or Sequential model. model_is_functional_or_sequential = ( metadata.get('is_graph_network', False) or - metadata['class_name'] == 'Sequential') + metadata['class_name'] == 'Sequential' or + metadata['class_name'] == 'Functional') if not (generic_utils.validate_config(config) and model_is_functional_or_sequential): return None # Revive as custom model. @@ -383,7 +384,8 @@ class KerasObjectLoader(tf_load.Loader): if class_name == 'Sequential': model = models_lib.Sequential(name=config['name']) else: - model = models_lib.Model(name=config['name']) + model = models_lib.Functional( + inputs=[], outputs=[], name=config['name']) # Record this model and its layers. This will later be used to reconstruct # the model. @@ -561,10 +563,11 @@ class KerasObjectLoader(tf_load.Loader): if not model.built and not isinstance(input_specs, dict): model.build(input_shapes) else: - (inputs, outputs, created_layers) = network_lib.reconstruct_from_config( - config, created_layers={layer.name: layer for layer in layers}) + (inputs, outputs, + created_layers) = functional_lib.reconstruct_from_config( + config, created_layers={layer.name: layer for layer in layers}) model.__init__(inputs, outputs, name=config['name']) - network_lib.connect_ancillary_layers(model, created_layers) + functional_lib.connect_ancillary_layers(model, created_layers) # Set model dtype and trainable status. _set_network_attributes_from_metadata(model) @@ -764,7 +767,7 @@ def revive_custom_object(identifier, metadata): revived_classes = { '_tf_keras_layer': (RevivedLayer, base_layer.Layer), '_tf_keras_input_layer': (RevivedInputLayer, input_layer.InputLayer), - '_tf_keras_network': (RevivedNetwork, network_lib.Network), + '_tf_keras_network': (RevivedNetwork, functional_lib.Functional), '_tf_keras_model': (RevivedNetwork, model_class), '_tf_keras_sequential': (RevivedNetwork, models_lib.Sequential), } @@ -852,7 +855,7 @@ def _revive_setter(layer, name, value): layer._track_trackable(value, name=name) layer._serialized_attributes[name] = value # pylint: enable=protected-access - elif (isinstance(layer, network_lib.Network) and + elif (isinstance(layer, functional_lib.Functional) and re.match(r'^layer(_with_weights)?-[\d+]', name) is not None): # Edges named "layer-n" or "layer_with_weights-n", which are tracked in # network._track_layers, should not be added as an attribute. 
diff --git a/tensorflow/python/keras/saving/saved_model/model_serialization.py b/tensorflow/python/keras/saving/saved_model/model_serialization.py index 412fb0b54e5..c711e82a045 100644 --- a/tensorflow/python/keras/saving/saved_model/model_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/model_serialization.py @@ -20,11 +20,11 @@ from __future__ import print_function from tensorflow.python.keras.saving import saving_utils from tensorflow.python.keras.saving.saved_model import constants -from tensorflow.python.keras.saving.saved_model import network_serialization +from tensorflow.python.keras.saving.saved_model import layer_serialization from tensorflow.python.keras.saving.saved_model import save_impl -class ModelSavedModelSaver(network_serialization.NetworkSavedModelSaver): +class ModelSavedModelSaver(layer_serialization.LayerSavedModelSaver): """Model SavedModel serialization.""" @property @@ -33,6 +33,10 @@ class ModelSavedModelSaver(network_serialization.NetworkSavedModelSaver): def _python_properties_internal(self): metadata = super(ModelSavedModelSaver, self)._python_properties_internal() + # Network stateful property is dependent on the child layers. + metadata.pop('stateful') + metadata['is_graph_network'] = self.obj._is_graph_network # pylint: disable=protected-access + metadata.update( saving_utils.model_metadata( self.obj, include_optimizer=True, require_config=False)) diff --git a/tensorflow/python/keras/saving/saved_model/network_serialization.py b/tensorflow/python/keras/saving/saved_model/network_serialization.py index 1c94377e3db..c98cba47155 100644 --- a/tensorflow/python/keras/saving/saved_model/network_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/network_serialization.py @@ -18,22 +18,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.keras.saving.saved_model import layer_serialization +from tensorflow.python.keras.saving.saved_model import model_serialization -# Network serialization is pretty much the same as layer serialization. -class NetworkSavedModelSaver(layer_serialization.LayerSavedModelSaver): +# FunctionalModel serialization is pretty much the same as Model serialization. +class NetworkSavedModelSaver(model_serialization.ModelSavedModelSaver): """Network serialization.""" @property def object_identifier(self): return '_tf_keras_network' - - def _python_properties_internal(self): - metadata = super(NetworkSavedModelSaver, self)._python_properties_internal() - - # Network stateful property is dependent on the child layers. 
- metadata.pop('stateful') - - metadata['is_graph_network'] = self.obj._is_graph_network # pylint: disable=protected-access - return metadata diff --git a/tensorflow/python/keras/utils/version_utils_test.py b/tensorflow/python/keras/utils/version_utils_test.py index 76e888ca553..0a3cd53f3c0 100644 --- a/tensorflow/python/keras/utils/version_utils_test.py +++ b/tensorflow/python/keras/utils/version_utils_test.py @@ -53,12 +53,12 @@ class SplitUtilsTest(keras_parameterized.TestCase): inputs = keras.Input(10) outputs = keras.layers.Dense(1)(inputs) model = keras.Model(inputs, outputs) - self._check_model_class(model.__class__) + self._check_model_class(model.__class__.__bases__[0]) self._check_layer_class(model) def test_sequential_model(self): model = keras.Sequential([keras.layers.Dense(1)]) - model_class = model.__class__.__bases__[0] + model_class = model.__class__.__bases__[0].__bases__[0] self._check_model_class(model_class) self._check_layer_class(model) diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py index 87c436a5bd7..158f6c83748 100644 --- a/tensorflow/python/keras/utils/vis_utils.py +++ b/tensorflow/python/keras/utils/vis_utils.py @@ -55,10 +55,10 @@ def check_pydot(): def is_wrapped_model(layer): - from tensorflow.python.keras.engine import network + from tensorflow.python.keras.engine import functional from tensorflow.python.keras.layers import wrappers return (isinstance(layer, wrappers.Wrapper) and - isinstance(layer.layer, network.Network)) + isinstance(layer.layer, functional.Functional)) def add_edge(dot, src, dst): @@ -98,7 +98,7 @@ def model_to_dot(model, """ from tensorflow.python.keras.layers import wrappers from tensorflow.python.keras.engine import sequential - from tensorflow.python.keras.engine import network + from tensorflow.python.keras.engine import functional if not check_pydot(): message = ( @@ -147,7 +147,8 @@ def model_to_dot(model, class_name = layer.__class__.__name__ if isinstance(layer, wrappers.Wrapper): - if expand_nested and isinstance(layer.layer, network.Network): + if expand_nested and isinstance(layer.layer, + functional.Functional): submodel_wrapper = model_to_dot(layer.layer, show_shapes, show_layer_names, rankdir, expand_nested, @@ -162,7 +163,7 @@ def model_to_dot(model, child_class_name = layer.layer.__class__.__name__ class_name = '{}({})'.format(class_name, child_class_name) - if expand_nested and isinstance(layer, network.Network): + if expand_nested and isinstance(layer, functional.Functional): submodel_not_wrapper = model_to_dot(layer, show_shapes, show_layer_names, rankdir, expand_nested, @@ -200,7 +201,8 @@ def model_to_dot(model, inputlabels, outputlabels) - if not expand_nested or not isinstance(layer, network.Network): + if not expand_nested or not isinstance( + layer, functional.Functional): node = pydot.Node(layer_id, label=label) dot.add_node(node) @@ -218,16 +220,17 @@ def model_to_dot(model, add_edge(dot, inbound_layer_id, layer_id) else: # if inbound_layer is not Model or wrapped Model - if (not isinstance(inbound_layer, network.Network) and + if (not isinstance(inbound_layer, + functional.Functional) and not is_wrapped_model(inbound_layer)): # if current layer is not Model or wrapped Model - if (not isinstance(layer, network.Network) and + if (not isinstance(layer, functional.Functional) and not is_wrapped_model(layer)): assert dot.get_node(inbound_layer_id) assert dot.get_node(layer_id) add_edge(dot, inbound_layer_id, layer_id) # if current layer is Model - elif 
isinstance(layer, network.Network): + elif isinstance(layer, functional.Functional): add_edge(dot, inbound_layer_id, sub_n_first_node[layer.name].get_name()) # if current layer is wrapped Model @@ -236,9 +239,9 @@ def model_to_dot(model, name = sub_w_first_node[layer.layer.name].get_name() add_edge(dot, layer_id, name) # if inbound_layer is Model - elif isinstance(inbound_layer, network.Network): + elif isinstance(inbound_layer, functional.Functional): name = sub_n_last_node[inbound_layer.name].get_name() - if isinstance(layer, network.Network): + if isinstance(layer, functional.Functional): output_name = sub_n_first_node[layer.name].get_name() add_edge(dot, name, output_name) else: diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index 272396239d7..d696021fcb4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -1,7 +1,6 @@ path: "tensorflow.keras.Model" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -175,7 +174,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index 8979491971f..b8486a27b9e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.Sequential" tf_class { is_instance: "" + is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt index 448ea60cc0f..7bf71844fa6 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt @@ -2,7 +2,6 @@ path: "tensorflow.keras.experimental.LinearModel" tf_class { is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -176,7 +175,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 8e1d9927434..87a7319639b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -2,7 +2,6 @@ path: "tensorflow.keras.experimental.WideDeepModel" tf_class { is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -176,7 +175,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], 
varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 13c3416fc0c..00c9fc22def 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -1,7 +1,6 @@ path: "tensorflow.keras.models.Model" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -175,7 +174,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 9218cbea99e..d3cca7311ee 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.models.Sequential" tf_class { is_instance: "" + is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index 272396239d7..d696021fcb4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -1,7 +1,6 @@ path: "tensorflow.keras.Model" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -175,7 +174,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index 8979491971f..b8486a27b9e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.Sequential" tf_class { is_instance: "" + is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt index 448ea60cc0f..7bf71844fa6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt @@ -2,7 +2,6 @@ path: "tensorflow.keras.experimental.LinearModel" tf_class { is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -176,7 +175,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: 
"args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 8e1d9927434..87a7319639b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -2,7 +2,6 @@ path: "tensorflow.keras.experimental.WideDeepModel" tf_class { is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -176,7 +175,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 13c3416fc0c..00c9fc22def 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -1,7 +1,6 @@ path: "tensorflow.keras.models.Model" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -175,7 +174,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 9218cbea99e..d3cca7311ee 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.models.Sequential" tf_class { is_instance: "" + is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" From 9da04eec59c1e99d7b4c0bfd29f9efa09598cf68 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Tue, 12 May 2020 17:23:01 -0700 Subject: [PATCH 0460/1533] IWYU in profiler/lib PiperOrigin-RevId: 311233906 Change-Id: I9207e56c017112eba2f59ee57d67c9825a015818 --- tensorflow/core/profiler/lib/BUILD | 3 +++ tensorflow/core/profiler/lib/annotated_traceme.h | 4 ++++ tensorflow/core/profiler/lib/profiler_session.cc | 11 ++++++++++- tensorflow/core/profiler/lib/profiler_session.h | 2 ++ tensorflow/core/profiler/lib/scoped_annotation.h | 1 + tensorflow/core/profiler/lib/traceme.h | 4 ++++ 6 files changed, 24 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index b3028c717bf..6316fd118fc 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -52,12 +52,14 @@ cc_library( "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler:profiler_options_proto_cc", "@com_google_absl//absl/memory", + "//tensorflow/core:protos_all_cc", ] + if_not_android([ ":profiler_utils", "//tensorflow/core/profiler/internal:profiler_factory", 
"//tensorflow/core/profiler/utils:derived_timeline", "//tensorflow/core/profiler/utils:group_events", "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_schema", ]), alwayslink = True, ) @@ -109,6 +111,7 @@ cc_library( ":traceme", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) diff --git a/tensorflow/core/profiler/lib/annotated_traceme.h b/tensorflow/core/profiler/lib/annotated_traceme.h index f40c1e9ad92..c3257e2adbe 100644 --- a/tensorflow/core/profiler/lib/annotated_traceme.h +++ b/tensorflow/core/profiler/lib/annotated_traceme.h @@ -15,7 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_ #define TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_ +#include + #include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/scoped_annotation.h" diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc index 497ee76b2af..9783cd14f95 100644 --- a/tensorflow/core/profiler/lib/profiler_session.cc +++ b/tensorflow/core/profiler/lib/profiler_session.cc @@ -15,12 +15,20 @@ limitations under the License. #include "tensorflow/core/profiler/lib/profiler_session.h" +#include + #include "absl/memory/memory.h" #include "tensorflow/core/platform/env_time.h" -#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/util/env_var.h" #if !defined(IS_MOBILE_PLATFORM) @@ -28,6 +36,7 @@ limitations under the License. #include "tensorflow/core/profiler/lib/profiler_utils.h" #include "tensorflow/core/profiler/utils/derived_timeline.h" #include "tensorflow/core/profiler/utils/group_events.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" #endif diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h index 83d0683f740..6f92b047eb7 100644 --- a/tensorflow/core/profiler/lib/profiler_session.h +++ b/tensorflow/core/profiler/lib/profiler_session.h @@ -21,9 +21,11 @@ limitations under the License. 
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/lib/scoped_annotation.h b/tensorflow/core/profiler/lib/scoped_annotation.h index 61b0cf42dd6..2cad5fd4708 100644 --- a/tensorflow/core/profiler/lib/scoped_annotation.h +++ b/tensorflow/core/profiler/lib/scoped_annotation.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include "absl/strings/string_view.h" #include "tensorflow/core/platform/macros.h" diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index 8b42f187850..af93ac11b1e 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -15,7 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ #define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ +#include +#include + #include "absl/strings/match.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "absl/strings/strip.h" #include "tensorflow/core/platform/env_time.h" From 6d583589fe3f1fd95290df760abe165526c18585 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 18:01:00 -0700 Subject: [PATCH 0461/1533] Flush denormals to zero in eager mode. PiperOrigin-RevId: 311239051 Change-Id: Iefbc09c82e07af29580319fee024965a2c554378 --- .../common_runtime/eager/kernel_and_device.cc | 4 +++ .../python/kernel_tests/denormal_test.py | 33 +++++++++---------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 3c586e8188a..bf7c083f24b 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -35,8 +35,10 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/setround.h" #include "tensorflow/core/profiler/lib/annotated_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/public/version.h" @@ -281,6 +283,8 @@ Status KernelAndDeviceOp::Run( OpKernelContext context(¶ms); { + port::ScopedFlushDenormal flush; + port::ScopedSetRound round(FE_TONEAREST); // 'AnnotatedTraceMe' will trace both scheduling time on host and execution // time on device of the OpKernel. 
profiler::AnnotatedTraceMe activity( diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py index d824e95f213..6e073f0d526 100644 --- a/tensorflow/python/kernel_tests/denormal_test.py +++ b/tensorflow/python/kernel_tests/denormal_test.py @@ -23,7 +23,6 @@ import platform from tensorflow.python.framework import constant_op from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -35,32 +34,30 @@ class DenormalTest(test.TestCase): tiny = np.finfo(dtype).tiny self.assertEqual(tiny, tiny / 16 * 16) - def _flushDenormalsTest(self, use_gpu, dtypes): - if platform.machine() == "ppc64le" or platform.machine( - ) == "s390x" or platform.machine() == "aarch64": + def _flushDenormalsTest(self, dtypes): + if (platform.machine() == "ppc64le" or platform.machine() == "s390x" or + platform.machine() == "aarch64"): # Disabled denormal_test on power/s390x/aarch64 platform # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902 return - with self.cached_session(use_gpu=use_gpu): - array_ops.identity(7).eval() - for dtype in dtypes: - tiny = np.finfo(dtype).tiny - # Small shape to test main thread, large shape to test thread pool - for shape in (), (1 << 20,): - flush = 0.1 * constant_op.constant(tiny, shape=shape) - self.assertAllEqual(flush.eval(), np.zeros(shape)) - # Make sure the flags don't leak out - self.testPythonHasDenormals() + for dtype in dtypes: + tiny = np.finfo(dtype).tiny + # Small shape to test main thread, large shape to test thread pool + for shape in (), (1 << 20,): + flush = 0.1 * constant_op.constant(tiny, shape=shape) + self.assertAllEqual(self.evaluate(flush), np.zeros(shape)) + # Make sure the flags don't leak out + self.testPythonHasDenormals() - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=False) def testFlushDenormalsCPU(self): # On CPUs, the processor flags flush for both single and double precision. - self._flushDenormalsTest(use_gpu=False, dtypes=(np.float32, np.float64)) + self._flushDenormalsTest(dtypes=(np.float32, np.float64)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testFlushDenormalsGPU(self): # On GPUs, only single precision can flush to zero. - self._flushDenormalsTest(use_gpu=True, dtypes=(np.float32,)) + self._flushDenormalsTest(dtypes=(np.float32,)) if __name__ == "__main__": From 4e9b6b454e1d057513ac477b2cd65f5925f91cc8 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 13 May 2020 01:29:03 +0000 Subject: [PATCH 0462/1533] Fix the issue of tf.divide's return value is not a tensor This PR fixes the issue of 39475 where tf.divide's return value is not a tensor in case x, y in divide(x, y) are both primitive python types. The reason was that tf.divide relies on implict `x / y`. However, if both x and y are not tensor, the return value will fall through python and will not be a tensor. This PR fixes the issue. This PR fixes 39475. 
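An illustrative sketch of the behavior this change targets (not part of the
patch itself; it mirrors the `testWithPythonValue` test added below):

    import tensorflow as tf

    # Both operands are plain Python ints, so the implicit `x / y` used to
    # fall through to Python division and return a float. With this fix,
    # tf.divide routes through the truediv helpers and returns a Tensor.
    x = tf.divide(5, 2)
    assert isinstance(x, tf.Tensor)  # holds 2.5
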
Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 7 ++++++- tensorflow/python/ops/math_ops_test.py | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index f062047cec2..b981af72e83 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -72,6 +72,7 @@ from __future__ import print_function import numpy as np import six +import sys from six.moves import builtins from six.moves import xrange # pylint: disable=redefined-builtin @@ -438,9 +439,13 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: + if not (isinstance(x, ops.Tensor) or isinstance(y, ops.Tensor)): + if sys.version_info.major < 3: + return _truediv_python2(x, y) + else: + return _truediv_python3(x, y) return x / y - @tf_export("math.multiply", "multiply") @dispatch.add_dispatch_support def multiply(x, y, name=None): diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 2405eec9e49..dab0ea88ba8 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -495,6 +495,12 @@ class DivAndModTest(test_util.TensorFlowTestCase): # Consistent with desire to get numerator self.assertAllEqual(tf_result, expanded_nums) + def testWithPythonValue(self): + # Test case for GitHub issue 39475: + # https://github.com/tensorflow/tensorflow/issues/39475 + x = math_ops.divide(5, 2) + self.assertTrue(isinstance(x, ops.Tensor)) + @test_util.run_all_in_graph_and_eager_modes class DivNoNanTest(test_util.TensorFlowTestCase): From e617aabd09291f752dc0c56f337dcd5031bd754f Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Tue, 12 May 2020 18:25:28 -0700 Subject: [PATCH 0463/1533] Add optional error reporting and status returns to MicroOpResolver PiperOrigin-RevId: 311242225 Change-Id: Ibb92991c3ab161c1aac5d828f8f4f3e17cdecd8b --- .../examples/micro_speech/main_functions.cc | 24 ++-- .../lite/micro/micro_mutable_op_resolver.h | 55 ++++++--- .../micro/micro_mutable_op_resolver_test.cc | 105 +++++++++++++++++- 3 files changed, 157 insertions(+), 27 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc index 23c63a32986..d3989c07333 100644 --- a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc +++ b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc @@ -74,14 +74,22 @@ void setup() { // // tflite::ops::micro::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroOpResolver<3> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + static tflite::MicroOpResolver<3> micro_op_resolver(error_reporter); + if (micro_op_resolver.AddBuiltin( + tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + tflite::ops::micro::Register_DEPTHWISE_CONV_2D()) != kTfLiteOk) { + return; + } + if (micro_op_resolver.AddBuiltin( + tflite::BuiltinOperator_FULLY_CONNECTED, + tflite::ops::micro::Register_FULLY_CONNECTED()) != kTfLiteOk) { + return; + } + if 
(micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, + tflite::ops::micro::Register_SOFTMAX()) != + kTfLiteOk) { + return; + } // Build an interpreter to run the model with. static tflite::MicroInterpreter static_interpreter( diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index 2f6d4d27823..ead9be490a3 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -34,6 +34,9 @@ inline int MicroOpResolverAnyVersion() { return 0; } template class MicroOpResolver : public OpResolver { public: + explicit MicroOpResolver(ErrorReporter* error_reporter = nullptr) + : error_reporter_(error_reporter) {} + const TfLiteRegistration* FindOp(tflite::BuiltinOperator op, int version) const override { for (unsigned int i = 0; i < registrations_len_; ++i) { @@ -62,11 +65,16 @@ class MicroOpResolver : public OpResolver { return nullptr; } - void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration, - int version = 1) { + TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, + TfLiteRegistration* registration, int version = 1) { if (registrations_len_ >= tOpCount) { - // TODO(b/147748244) - Add error reporting hooks so we can report this! - return; + if (error_reporter_) { + TF_LITE_REPORT_ERROR(error_reporter_, + "Couldn't register builtin op #%d, resolver size " + "is too small (%d)", + op, tOpCount); + } + return kTfLiteError; } TfLiteRegistration* new_registration = ®istrations_[registrations_len_]; registrations_len_ += 1; @@ -74,20 +82,32 @@ class MicroOpResolver : public OpResolver { *new_registration = *registration; new_registration->builtin_code = op; new_registration->version = version; + + return kTfLiteOk; } - void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration, - int min_version, int max_version) { + TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, + TfLiteRegistration* registration, int min_version, + int max_version) { for (int version = min_version; version <= max_version; ++version) { - AddBuiltin(op, registration, version); + TfLiteStatus add_status = AddBuiltin(op, registration, version); + if (add_status != kTfLiteOk) { + return add_status; + } } + return kTfLiteOk; } - void AddCustom(const char* name, TfLiteRegistration* registration, - int version = 1) { + TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration, + int version = 1) { if (registrations_len_ >= tOpCount) { - // TODO(b/147748244) - Add error reporting hooks so we can report this! 
- return; + if (error_reporter_) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "Couldn't register custom op '%s', resolver size is too small (%d)", + name, tOpCount); + } + return kTfLiteError; } TfLiteRegistration* new_registration = ®istrations_[registrations_len_]; registrations_len_ += 1; @@ -96,13 +116,19 @@ class MicroOpResolver : public OpResolver { new_registration->builtin_code = BuiltinOperator_CUSTOM; new_registration->custom_name = name; new_registration->version = version; + + return kTfLiteOk; } - void AddCustom(const char* name, TfLiteRegistration* registration, - int min_version, int max_version) { + TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration, + int min_version, int max_version) { for (int version = min_version; version <= max_version; ++version) { - AddCustom(name, registration, version); + TfLiteStatus add_status = AddCustom(name, registration, version); + if (add_status != kTfLiteOk) { + return add_status; + } } + return kTfLiteOk; } unsigned int GetRegistrationLength() { return registrations_len_; } @@ -110,6 +136,7 @@ class MicroOpResolver : public OpResolver { private: TfLiteRegistration registrations_[tOpCount]; unsigned int registrations_len_ = 0; + ErrorReporter* error_reporter_; TF_LITE_REMOVE_VIRTUAL_DELETE }; diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc index cf39994acec..61ab0e3bec9 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc +++ b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/micro/micro_mutable_op_resolver.h" - #include "tensorflow/lite/micro/testing/micro_test.h" namespace tflite { @@ -35,6 +34,22 @@ TfLiteStatus MockPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus MockInvoke(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } + +class MockErrorReporter : public ErrorReporter { + public: + MockErrorReporter() : has_been_called_(false) {} + int Report(const char* format, va_list args) override { + has_been_called_ = true; + return 0; + }; + + bool HasBeenCalled() { return has_been_called_; } + + private: + bool has_been_called_; + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + } // namespace } // namespace tflite @@ -52,8 +67,10 @@ TF_LITE_MICRO_TEST(TestOperations) { // We need space for 7 operators because of 2 ops, one with 3 versions, one // with 4 versions. MicroOpResolver<7> micro_op_resolver; - micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r, 1, 3); - micro_op_resolver.AddCustom("mock_custom", &r, 1, 4); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( + BuiltinOperator_CONV_2D, &r, 1, 3)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + micro_op_resolver.AddCustom("mock_custom", &r, 1, 4)); OpResolver* resolver = µ_op_resolver; const TfLiteRegistration* registration = @@ -96,8 +113,10 @@ TF_LITE_MICRO_TEST(TestOpRegistrationOverflow) { MicroOpResolver<4> micro_op_resolver; // Register 7 ops, but only 4 is expected because the class is created with // that limit.. 
- micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r, 0, 2); - micro_op_resolver.AddCustom("mock_custom", &r, 0, 3); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( + BuiltinOperator_CONV_2D, &r, 0, 2)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, + micro_op_resolver.AddCustom("mock_custom", &r, 0, 3)); OpResolver* resolver = µ_op_resolver; TF_LITE_MICRO_EXPECT_EQ(4, micro_op_resolver.GetRegistrationLength()); @@ -174,4 +193,80 @@ TF_LITE_MICRO_TEST(TestZeroModelVersion) { TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); } +TF_LITE_MICRO_TEST(TestBuiltinRegistrationErrorReporting) { + using tflite::BuiltinOperator_CONV_2D; + using tflite::BuiltinOperator_RELU; + using tflite::MicroOpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + tflite::MockErrorReporter mock_reporter; + MicroOpResolver<1> micro_op_resolver(&mock_reporter); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r)); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, micro_op_resolver.AddBuiltin(BuiltinOperator_RELU, &r)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); +} + +TF_LITE_MICRO_TEST(TestCustomRegistrationErrorReporting) { + using tflite::BuiltinOperator_CONV_2D; + using tflite::BuiltinOperator_RELU; + using tflite::MicroOpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + tflite::MockErrorReporter mock_reporter; + MicroOpResolver<1> micro_op_resolver(&mock_reporter); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + micro_op_resolver.AddCustom("mock_custom_0", &r)); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, + micro_op_resolver.AddCustom("mock_custom_1", &r)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); +} + +TF_LITE_MICRO_TEST(TestBuiltinVersionRegistrationErrorReporting) { + using tflite::BuiltinOperator_CONV_2D; + using tflite::BuiltinOperator_RELU; + using tflite::MicroOpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + tflite::MockErrorReporter mock_reporter; + MicroOpResolver<2> micro_op_resolver(&mock_reporter); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( + BuiltinOperator_CONV_2D, &r, 1, 2)); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddBuiltin( + BuiltinOperator_RELU, &r, 1, 2)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); +} + +TF_LITE_MICRO_TEST(TestCustomVersionRegistrationErrorReporting) { + using tflite::BuiltinOperator_CONV_2D; + using tflite::BuiltinOperator_RELU; + using tflite::MicroOpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + tflite::MockErrorReporter mock_reporter; + MicroOpResolver<2> micro_op_resolver(&mock_reporter); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, micro_op_resolver.AddCustom("mock_custom_0", &r, 1, 2)); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + 
TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, micro_op_resolver.AddCustom("mock_custom_1", &r, 1, 2)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); +} + TF_LITE_MICRO_TESTS_END From 047d788ea42397f22aa3f6d80c9c9dce53f564b5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 18:34:46 -0700 Subject: [PATCH 0464/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311243401 Change-Id: I89adb2b883527f2e548665b7710bfb03d71b32cd --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From f690a054c599d51d7c8e9ae83c7d0ebd70f80cca Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 19:45:42 -0700 Subject: [PATCH 0465/1533] Flush denormals to zero in eager mode. PiperOrigin-RevId: 311251058 Change-Id: I6ddca2fabc904e8e7400735aaddef361ba0b8778 --- .../common_runtime/eager/kernel_and_device.cc | 4 --- .../python/kernel_tests/denormal_test.py | 33 ++++++++++--------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index bf7c083f24b..3c586e8188a 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -35,10 +35,8 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/fingerprint.h" -#include "tensorflow/core/platform/setround.h" #include "tensorflow/core/profiler/lib/annotated_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/public/version.h" @@ -283,8 +281,6 @@ Status KernelAndDeviceOp::Run( OpKernelContext context(¶ms); { - port::ScopedFlushDenormal flush; - port::ScopedSetRound round(FE_TONEAREST); // 'AnnotatedTraceMe' will trace both scheduling time on host and execution // time on device of the OpKernel. 
profiler::AnnotatedTraceMe activity( diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py index 6e073f0d526..d824e95f213 100644 --- a/tensorflow/python/kernel_tests/denormal_test.py +++ b/tensorflow/python/kernel_tests/denormal_test.py @@ -23,6 +23,7 @@ import platform from tensorflow.python.framework import constant_op from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -34,30 +35,32 @@ class DenormalTest(test.TestCase): tiny = np.finfo(dtype).tiny self.assertEqual(tiny, tiny / 16 * 16) - def _flushDenormalsTest(self, dtypes): - if (platform.machine() == "ppc64le" or platform.machine() == "s390x" or - platform.machine() == "aarch64"): + def _flushDenormalsTest(self, use_gpu, dtypes): + if platform.machine() == "ppc64le" or platform.machine( + ) == "s390x" or platform.machine() == "aarch64": # Disabled denormal_test on power/s390x/aarch64 platform # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902 return - for dtype in dtypes: - tiny = np.finfo(dtype).tiny - # Small shape to test main thread, large shape to test thread pool - for shape in (), (1 << 20,): - flush = 0.1 * constant_op.constant(tiny, shape=shape) - self.assertAllEqual(self.evaluate(flush), np.zeros(shape)) - # Make sure the flags don't leak out - self.testPythonHasDenormals() + with self.cached_session(use_gpu=use_gpu): + array_ops.identity(7).eval() + for dtype in dtypes: + tiny = np.finfo(dtype).tiny + # Small shape to test main thread, large shape to test thread pool + for shape in (), (1 << 20,): + flush = 0.1 * constant_op.constant(tiny, shape=shape) + self.assertAllEqual(flush.eval(), np.zeros(shape)) + # Make sure the flags don't leak out + self.testPythonHasDenormals() - @test_util.run_in_graph_and_eager_modes(use_gpu=False) + @test_util.run_deprecated_v1 def testFlushDenormalsCPU(self): # On CPUs, the processor flags flush for both single and double precision. - self._flushDenormalsTest(dtypes=(np.float32, np.float64)) + self._flushDenormalsTest(use_gpu=False, dtypes=(np.float32, np.float64)) - @test_util.run_in_graph_and_eager_modes(use_gpu=True) + @test_util.run_deprecated_v1 def testFlushDenormalsGPU(self): # On GPUs, only single precision can flush to zero. 
- self._flushDenormalsTest(dtypes=(np.float32,)) + self._flushDenormalsTest(use_gpu=True, dtypes=(np.float32,)) if __name__ == "__main__": From e0157b592c2c3b0f75226bc4aaf0c5bc1df69974 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 10:45:58 +0800 Subject: [PATCH 0466/1533] add fp16 option to NNAPI delegate adding a field for relaxing fp32 to fp16 into the NNAPI delegate option and changing evaluation tools accordingly --- .../lite/delegates/nnapi/nnapi_delegate.cc | 16 ++++++++++++---- tensorflow/lite/delegates/nnapi/nnapi_delegate.h | 5 +++++ .../lite/delegates/nnapi/nnapi_delegate_kernel.h | 1 + .../accuracy/ilsvrc/imagenet_model_evaluator.cc | 4 ++-- .../accuracy/ilsvrc/imagenet_model_evaluator.h | 2 +- .../tools/delegates/nnapi_delegate_provider.cc | 15 ++++++++++++++- .../evaluation/evaluation_delegate_provider.cc | 4 ++-- .../evaluation/proto/evaluation_stages.proto | 4 ++-- .../evaluation/stages/tflite_inference_stage.cc | 1 - .../tasks/coco_object_detection/run_eval.cc | 6 +++--- .../imagenet_image_classification/run_eval.cc | 6 +++--- .../evaluation/tasks/inference_diff/run_eval.cc | 5 +++-- 12 files changed, 48 insertions(+), 21 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 002c29915c6..867d03f5227 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -3151,7 +3151,8 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, "creating NNAPI model", nnapi_errno); nn_model_.reset(model); - TF_LITE_ENSURE_STATUS(BuildGraph(context, params->input_tensors, + TF_LITE_ENSURE_STATUS(BuildGraph(context, params->delegate, + params->input_tensors, params->output_tensors, nnapi_errno)); } @@ -3202,6 +3203,7 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context, const auto delegate_options = StatefulNnApiDelegate::GetOptions(node->delegate); + ANeuralNetworksCompilation* compilation = nullptr; if (!nnapi_devices_.empty()) { // Compile for the selected accelerator. @@ -3875,8 +3877,9 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, } TfLiteStatus NNAPIDelegateKernel::BuildGraph( - TfLiteContext* context, const TfLiteIntArray* input_tensors, - const TfLiteIntArray* output_tensors, int* nnapi_errno) { + TfLiteContext* context, TfLiteDelegate* delegate, + const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, + int* nnapi_errno) { // Build the ops and tensors. TF_LITE_ENSURE_STATUS(AddOpsAndTensors(context, nnapi_errno)); // Map input and output tensor indices to ANN @@ -3885,6 +3888,7 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( std::vector outputs; outputs.reserve(output_tensors->size); + const auto delegate_options = StatefulNnApiDelegate::GetOptions(delegate); size_t total_input_byte_size = 0; // Make the TensorFlow Lite inputs and outputs to ann_indices. 
for (int i : TfLiteIntArrayView(input_tensors)) { @@ -3941,11 +3945,13 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( outputs.data()), "identifying model inputs and outputs", nnapi_errno); + auto allow_fp16 = + context->allow_fp32_relax_to_fp16 | delegate_options.allow_fp16; if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI11) { RETURN_TFLITE_ERROR_IF_NN_ERROR( context, nnapi_->ANeuralNetworksModel_relaxComputationFloat32toFloat16( - nn_model_.get(), context->allow_fp32_relax_to_fp16), + nn_model_.get(), allow_fp16), "set relaxed computation mode for fp32 if possible", nnapi_errno); } @@ -4021,6 +4027,7 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, options.max_number_delegated_partitions; TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO, "Created TensorFlow Lite delegate for NNAPI."); + delegate_data_.allow_fp16 = options.allow_fp16; Prepare = DoPrepare; CopyFromBufferHandle = DoCopyFromBufferHandle; CopyToBufferHandle = DoCopyToBufferHandle; @@ -4048,6 +4055,7 @@ const StatefulNnApiDelegate::Options StatefulNnApiDelegate::GetOptions( options.disallow_nnapi_cpu = delegate_data->disallow_nnapi_cpu; options.max_number_delegated_partitions = delegate_data->max_number_delegated_partitions; + options.allow_fp16 = delegate_data->allow_fp16; return options; } diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index fe777ea99aa..1bd9fb5c49f 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -89,6 +89,9 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // The selection is currently done sorting partitions in decreasing order // of number of nodes and selecting them until the limit is reached. int max_number_delegated_partitions = 3; + + // allow fp32 compuation to be run in fp16 + bool allow_fp16 = false; }; // Uses default options. @@ -184,6 +187,8 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // Maximum number of NNAPI partition to delegate. Zero or negative means // no limit. Copied from StatefulNnApiDelegate::Options int max_number_delegated_partitions; + // allow fp32 computation to be run in fp32 + bool allow_fp16 = false; ~Data(); diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index b35bf0224fd..60151196372 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -349,6 +349,7 @@ class NNAPIDelegateKernel { TfLiteStatus AddOpsAndTensors(TfLiteContext* context, int* nnapi_errno); TfLiteStatus BuildGraph(TfLiteContext* context, + TfLiteDelegate* delegate, const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, int* nnapi_errno); diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc index 61c2acb8b2e..64ce87ae8aa 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc @@ -141,8 +141,8 @@ class CompositeObserver : public ImagenetModelEvaluator::Observer { tflite::Flag::CreateFlag(kNumRanksFlag, ¶ms.num_ranks, "Generates the top-1 to top-k accuracy values" "where k = num_ranks. 
Default: 10"), - tflite::Flag::CreateFlag("allow_fp16", ¶ms.allow_fp16, - "allow fp16"), + tflite::Flag::CreateFlag("nnapi_allow_fp16", ¶ms.nnapi_allow_fp16, + "allow fp16 in nnapi"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h index 323069383c3..3ba22cbc2af 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h @@ -80,7 +80,7 @@ class ImagenetModelEvaluator { int num_interpreter_threads = 1; // allow fp16 - bool allow_fp16 = false; + bool nnapi_allow_fp16 = false; }; // An evaluation observer. diff --git a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc index f3ed8743e54..6492ba82849 100644 --- a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc @@ -33,6 +33,8 @@ class NnapiDelegateProvider : public DelegateProvider { ToolParam::Create("")); default_params_.AddParam("disable_nnapi_cpu", ToolParam::Create(false)); + default_params_.AddParam("nnapi_allow_fp16", + ToolParam::Create(false)); } std::vector CreateFlags(ToolParams* params) const final; @@ -56,7 +58,9 @@ std::vector NnapiDelegateProvider::CreateFlags(ToolParams* params) const { "nnapi_accelerator_name", params, "the name of the nnapi accelerator to use (requires Android Q+)"), CreateFlag("disable_nnapi_cpu", params, - "Disable the NNAPI CPU device")}; + "Disable the NNAPI CPU device"), + CreateFlag("nnapi_allow_fp16", params, + "Allow fp32 computation to be run in fp16")}; return flags; } @@ -83,6 +87,10 @@ void NnapiDelegateProvider::LogParams(const ToolParams& params) const { TFLITE_LOG(INFO) << "disable_nnapi_cpu: [" << params.Get("disable_nnapi_cpu") << "]"; } + if (params.Get("nnapi_allow_fp16")) { + TFLITE_LOG(INFO) << "nnapi_allow_fp16: [" + << params.Get("nnapi_allow_fp16") << "]"; + } } #endif } @@ -99,6 +107,11 @@ TfLiteDelegatePtr NnapiDelegateProvider::CreateTfLiteDelegate( } else if (params.Get("disable_nnapi_cpu")) { options.disallow_nnapi_cpu = true; } + + if (params.Get("nnapi_allow_fp16")) { + options.allow_fp16 = true; + } + std::string string_execution_preference = params.Get("nnapi_execution_preference"); // Only set execution preference if user explicitly passes one. 
Otherwise, diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc index a7625441406..ea07378a8fa 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc @@ -132,8 +132,8 @@ tools::ToolParams DelegateProviders::GetAllParams( tool_params.Set("num_threads", params.num_threads()); } - if (params.has_allow_fp16()) { - tool_params.Set("allow_fp16", params.allow_fp16()); + if (params.has_nnapi_allow_fp16()) { + tool_params.Set("nnapi_allow_fp16", params.nnapi_allow_fp16()); } const auto type = params.delegate(); diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto index c7d033eb111..cecdb22c637 100644 --- a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto +++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto @@ -122,8 +122,8 @@ message TfliteInferenceParams { // required for every input. optional int32 invocations_per_run = 4 [default = 1]; - // allow_fp16 - optional bool allow_fp16 = 5 [default = false]; + // nnapi_allow_fp16 + optional bool nnapi_allow_fp16 = 5 [default = false]; } // Metrics specific to TFLite inference. diff --git a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc index 8189140e953..365a00c3cd1 100644 --- a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc +++ b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc @@ -95,7 +95,6 @@ TfLiteStatus TfliteInferenceStage::Init( return kTfLiteError; } interpreter_->SetNumThreads(params.num_threads()); - interpreter_->SetAllowFp16PrecisionForFp32(params.allow_fp16()); if (!delegate_providers) { std::string error_message; diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc index 1ff4e55c270..de1ae6e2e94 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc @@ -106,8 +106,8 @@ CocoObjectDetection::CocoObjectDetection(int* argc, char* argv[]) "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'xnnpack', 'hexagon'}"), tflite::Flag::CreateFlag( - "allow_fp16", &allow_fp16_, - "allow fp16"), + "nnapi_allow_fp16", &allow_fp16_, + "nnapi allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); DelegateProviders delegate_providers; @@ -136,7 +136,7 @@ absl::optional CocoObjectDetection::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_allow_fp16(allow_fp16_); + inference_params->set_nnapi_allow_fp16(allow_fp16_); // Get ground truth data. 
absl::flat_hash_map ground_truth_map; diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc index 1e1cf86732a..8a7fd864c6e 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc @@ -108,8 +108,8 @@ ImagenetClassification::ImagenetClassification(int* argc, char* argv[]) "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), tflite::Flag::CreateFlag( - "allow_fp16", &allow_fp16_, - "allow fp16"), + "nnapi_allow_fp16", &allow_fp16_, + "nnapi allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -159,7 +159,7 @@ absl::optional ImagenetClassification::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_allow_fp16(allow_fp16_); + inference_params->set_nnapi_allow_fp16(allow_fp16_); classification_params->mutable_topk_accuracy_eval_params()->set_k(10); ImageClassificationStage eval(eval_config); diff --git a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc index de41fb96a03..c85d997974b 100644 --- a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc @@ -72,7 +72,8 @@ InferenceDiff::InferenceDiff(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for test inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), - tflite::Flag::CreateFlag("allow_fp16", &allow_fp16_, "allow fp16") + tflite::Flag::CreateFlag("nnapi_allow_fp16", &allow_fp16_, + "nnapi allow fp16") }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -90,7 +91,7 @@ absl::optional InferenceDiff::Run() { // generating random data. 
inference_params->set_invocations_per_run(3); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_allow_fp16(allow_fp16_); + inference_params->set_nnapi_allow_fp16(allow_fp16_); if (!delegate_.empty() && inference_params->delegate() == TfliteInferenceParams::NONE) { TFLITE_LOG(WARN) << "Unsupported TFLite delegate: " << delegate_; From efa921a7702b6afee571da9de52aea801c519968 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 11:03:16 +0800 Subject: [PATCH 0467/1533] add `nnapi_allow_fp16` option to benchmark_model --- .../lite/tools/benchmark/benchmark_performance_options.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc index cafef6fa133..c2d9374506e 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc @@ -233,6 +233,7 @@ void BenchmarkPerformanceOptions::ResetPerformanceOptions() { single_option_run_params_->Set("nnapi_accelerator_name", ""); single_option_run_params_->Set("disable_nnapi_cpu", false); single_option_run_params_->Set("max_delegated_partitions", 0); + single_option_run_params_->Set("nnapi_allow_fp16", false); #endif #if defined(TFLITE_ENABLE_HEXAGON) single_option_run_params_->Set("use_hexagon", false); @@ -302,6 +303,9 @@ void BenchmarkPerformanceOptions::CreatePerformanceOptions() { BenchmarkParam::Create(false)); params.AddParam("max_delegated_partitions", BenchmarkParam::Create(0)); + params.AddParam("max_delegated_partitions", + params.AddParam("nnapi_allow_fp16", + BenchmarkParam::Create(false)); all_run_params_.emplace_back(std::move(params)); } } From 1c74b32aa27dc0d40a9ce1f883ea632d399a7b9a Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Tue, 12 May 2020 21:23:08 -0700 Subject: [PATCH 0468/1533] Validate remote resource devices before safe access of resources. Cluster updates (due to recreated distribution strategies, remote worker failures, etc.) can lead to crashing failures with segfaults when accessing resources created before the update. Some common patterns are: * Accessing datasets created on old remote workers; * Accessing variables created on failed workers; * Garbage collecting datasets/iterators created on old remote workers; This CL validate the remote devices to make sure the access is safe before executing the ops by looking up the device in a set of device pointers and checking its incarnation ID. Remote workers on restarted devices will have different incarnation IDs, and accessing resources on those devices will fail gracefully. 
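In rough terms, the guard added to eager execute (see the execute.cc, device_mgr and tensor_handle changes below) works like the following sketch. The names follow the new code introduced in this change, but this is a simplified illustration rather than the verbatim implementation:

    // Simplified sketch: reject a resource input whose remote device no
    // longer exists in the current cluster (e.g. a restarted worker).
    for (TensorHandle* handle : op->Inputs()) {
      const int64 incarnation = handle->resource_remote_device_incarnation();
      // incarnation == 0 means the resource lives on a local device.
      if (handle->dtype == DT_RESOURCE && incarnation != 0 &&
          !ctx.remote_device_mgr()->ContainsDevice(incarnation)) {
        return errors::InvalidArgument(
            "Resource input tensor contains an invalid device. This might "
            "happen when the client has connected to a different cluster, "
            "or some remote workers have been restarted.");
      }
    }
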
PiperOrigin-RevId: 311261000 Change-Id: Ifc07862229b06301e0275fe80975565d9df28152 --- tensorflow/c/eager/c_api_cluster_test.cc | 120 ++++++++++++++++++ tensorflow/c/eager/c_api_test.cc | 2 + tensorflow/c/eager/c_api_test_util.cc | 1 + tensorflow/core/common_runtime/device_mgr.cc | 5 + tensorflow/core/common_runtime/device_mgr.h | 10 ++ .../core/common_runtime/dynamic_device_mgr.cc | 7 + .../core/common_runtime/eager/execute.cc | 17 +++ .../common_runtime/eager/tensor_handle.cc | 20 +++ .../core/common_runtime/eager/tensor_handle.h | 6 + .../eager/tensor_handle_test.cc | 101 ++++++++++++++- 10 files changed, 286 insertions(+), 3 deletions(-) diff --git a/tensorflow/c/eager/c_api_cluster_test.cc b/tensorflow/c/eager/c_api_cluster_test.cc index 8f585d6f02c..252a0408758 100644 --- a/tensorflow/c/eager/c_api_cluster_test.cc +++ b/tensorflow/c/eager/c_api_cluster_test.cc @@ -50,6 +50,13 @@ tensorflow::ServerDef GetServerDef(int num_tasks) { return GetServerDef("localhost", num_tasks); } +void ReplaceTaskInServerDef(tensorflow::ServerDef* server_def, int task_index) { + tensorflow::JobDef* job_def = server_def->mutable_cluster()->mutable_job(0); + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->at(task_index) = + tensorflow::strings::StrCat("localhost:", port); +} + void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle, const std::vector& expected_values) { std::unique_ptr status( @@ -101,6 +108,22 @@ void CheckRemoteMatMulExecutesOK(TFE_Context* ctx, TF_DeleteStatus(status); } +// Read the value of variable `var` and save it into `out_value`. +void ReadVariable(TFE_Context* ctx, TFE_TensorHandle* var, + TFE_TensorHandle** out_value) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpAddInput(op, var, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + int num_retvals = 1; + TFE_Execute(op, out_value, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + TF_DeleteStatus(status); +} + void TestRemoteExecuteChangeServerDef(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); @@ -243,6 +266,102 @@ TEST(CAPI, RemoteExecuteUpdateServerDefAsync) { TestRemoteExecuteUpdateServerDef(true); } +void TestRemoteExecuteUpdateServerDefResourceAccess(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char dev0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + const char dev1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + + TFE_TensorHandle* var_handle0 = TestVariable(ctx, 1.0, dev0_name); + EXPECT_NE(var_handle0, nullptr); + TFE_TensorHandle* var_handle1 = TestVariable(ctx, 2.0, dev1_name); + EXPECT_NE(var_handle1, nullptr); + + TFE_TensorHandle* value_handle = nullptr; + ReadVariable(ctx, var_handle1, &value_handle); + CheckTFE_TensorHandleHasFloats(value_handle, {2}); + TFE_DeleteTensorHandle(value_handle); + + // Start a new worker to replace task:1 + ReplaceTaskInServerDef(&server_def, 1); + server_def.set_task_index(1); + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server.release(); + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + // Update server def to replace the remote device with the device info on the + // new worker (different incarnation ID). + server_def.set_task_index(0); + string serialized_update = server_def.SerializeAsString(); + TFE_ContextUpdateServerDef(ctx, 0, serialized_update.data(), + serialized_update.size(), status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // The device of var_handle0 is local device which is the same before and + // after cluster update. Remove resource with valid device should succeed. + TFE_Op* op = TFE_NewOp(ctx, "DestroyResourceOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, var_handle0, status); + TFE_OpSetDevice(op, dev0_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + int num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + + // The device of var_handle1 is remote device, which was replaced during + // cluster update. Removing resource with invalid device should fail + // gracefully (i.e., with error status) instead of crashing with segfaults. + op = TFE_NewOp(ctx, "DestroyResourceOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, var_handle1, status); + TFE_OpSetDevice(op, dev1_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + + TFE_DeleteTensorHandle(var_handle0); + TFE_DeleteTensorHandle(var_handle1); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. 
+ worker_server.release(); +} + +TEST(CAPI, TestRemoteExecuteUpdateServerDefResourceAccess) { + TestRemoteExecuteUpdateServerDefResourceAccess(false); +} + +TEST(CAPI, TestRemoteExecuteUpdateServerDefResourceAccessAsync) { + TestRemoteExecuteUpdateServerDefResourceAccess(true); +} + void TestRemoteExecuteUpdateServerDefWithFailures(bool async) { // Fail fast on GetStatus requests so we can get errors instead of timeout // when updating cluster with non-exsitent worker @@ -282,6 +401,7 @@ void TestRemoteExecuteUpdateServerDefWithFailures(bool async) { int port = tensorflow::testing::PickUnusedPortOrDie(); job_def->mutable_tasks()->insert( {2, tensorflow::strings::StrCat("localhost:", port)}); + server_def.set_task_index(0); string serialized_update = server_def.SerializeAsString(); TFE_ContextUpdateServerDef(ctx, 0, serialized_update.data(), serialized_update.size(), status); diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 548bf1337bb..724176505ba 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -1203,6 +1203,8 @@ void BM_ReadVariable(int iters) { CHECK_EQ(0, TFE_TensorHandleNumDims(h, status)); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); h = nullptr; + TFE_OpAddInput(op, var_handle, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); } tensorflow::testing::StopTiming(); TFE_DeleteOp(op); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index bbdc4c8f410..29b624b8537 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -150,6 +150,7 @@ TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, TFE_TensorHandle* var_handle = nullptr; int num_retvals = 1; TFE_Execute(op, &var_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; TFE_DeleteOp(op); if (TF_GetCode(status) != TF_OK) return nullptr; CHECK_EQ(1, num_retvals); diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc index c7583c374f2..0b693085da3 100644 --- a/tensorflow/core/common_runtime/device_mgr.cc +++ b/tensorflow/core/common_runtime/device_mgr.cc @@ -45,6 +45,7 @@ StaticDeviceMgr::StaticDeviceMgr(std::vector> devices) } const auto& t = d->device_type(); device_type_counts_[t]++; + device_incarnation_set_.insert(d->attributes().incarnation()); if (cpu_device_ == nullptr && t == "CPU" && d->parsed_name().id == 0) { cpu_device_ = d.get(); } @@ -123,6 +124,10 @@ Status StaticDeviceMgr::LookupDevice(StringPiece name, Device** device) const { return Status::OK(); } +bool StaticDeviceMgr::ContainsDevice(int64 device_incarnation) const { + return device_incarnation_set_.contains(device_incarnation); +} + void StaticDeviceMgr::ClearContainers( gtl::ArraySlice containers) const { Status s; diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h index 56248b39078..83a0d0cc29c 100644 --- a/tensorflow/core/common_runtime/device_mgr.h +++ b/tensorflow/core/common_runtime/device_mgr.h @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_set.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/lib/core/arena.h" #include "tensorflow/core/lib/core/status.h" @@ -56,6 +57,11 @@ class DeviceMgr { // Accepts either a full device name, or just the replica-local suffix. 
virtual Status LookupDevice(StringPiece name, Device** device) const = 0; + // Check if the current device manager contains device with the given + // incarnation ID. Looking up by incarnation IDs because they are randomly + // generated and not intentionally reused (unlike device pointers). + virtual bool ContainsDevice(int64 device_incarnation) const = 0; + // Clears given containers of all devices if 'container' is // non-empty. Otherwise, clears default containers of all devices. virtual void ClearContainers(gtl::ArraySlice containers) const = 0; @@ -86,6 +92,7 @@ class StaticDeviceMgr : public DeviceMgr { string DebugString() const override; string DeviceMappingString() const override; Status LookupDevice(StringPiece name, Device** device) const override; + bool ContainsDevice(int64 device_incarnation) const override; void ClearContainers(gtl::ArraySlice containers) const override; int NumDeviceType(const string& type) const override; Device* HostCPU() const override; @@ -95,6 +102,7 @@ class StaticDeviceMgr : public DeviceMgr { StringPiece CopyToBackingStore(StringPiece s); + absl::flat_hash_set device_incarnation_set_; std::unordered_map device_map_; core::Arena name_backing_store_; // Storage for keys in device_map_ std::unordered_map device_type_counts_; @@ -117,6 +125,7 @@ class DynamicDeviceMgr : public DeviceMgr { string DebugString() const override; string DeviceMappingString() const override; Status LookupDevice(StringPiece name, Device** device) const override; + bool ContainsDevice(int64 device_incarnation) const override; void ClearContainers(gtl::ArraySlice containers) const override; int NumDeviceType(const string& type) const override; Device* HostCPU() const override; @@ -140,6 +149,7 @@ class DynamicDeviceMgr : public DeviceMgr { std::unordered_map> dynamic_devices_ TF_GUARDED_BY(devices_mu_); + absl::flat_hash_set device_incarnation_set_ TF_GUARDED_BY(devices_mu_); std::unordered_map device_map_ TF_GUARDED_BY(devices_mu_); std::unordered_map device_type_counts_ diff --git a/tensorflow/core/common_runtime/dynamic_device_mgr.cc b/tensorflow/core/common_runtime/dynamic_device_mgr.cc index f35fa7e416a..f47de47c5b9 100644 --- a/tensorflow/core/common_runtime/dynamic_device_mgr.cc +++ b/tensorflow/core/common_runtime/dynamic_device_mgr.cc @@ -92,6 +92,11 @@ Status DynamicDeviceMgr::LookupDevice(StringPiece name, Device** device) const { return Status::OK(); } +bool DynamicDeviceMgr::ContainsDevice(int64 device_incarnation) const { + tf_shared_lock l(devices_mu_); + return device_incarnation_set_.contains(device_incarnation); +} + void DynamicDeviceMgr::ClearContainers( gtl::ArraySlice containers) const { Status s; @@ -138,6 +143,7 @@ Status DynamicDeviceMgr::AddDevices( device_map_[name] = d.get(); } device_type_counts_[d->device_type()]++; + device_incarnation_set_.insert(d->attributes().incarnation()); dynamic_devices_.emplace(d.get(), std::move(d)); } return Status::OK(); @@ -171,6 +177,7 @@ Status DynamicDeviceMgr::RemoveDevices(std::vector devices) { device_map_.erase(name); } device_type_counts_[d->device_type()]--; + device_incarnation_set_.erase(d->attributes().incarnation()); dynamic_devices_.erase(it); } return Status::OK(); diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 35dd9990054..3036e6d7989 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -874,6 +874,19 @@ bool IsPinnableOp(const string& op_type) { 
!absl::StartsWith(op_type, "XRT"); } +// Validate if the remote device with the given incarnation is valid in the +// remote device manager of the current eager context. +Status ValidateTensorHandleRemoteDevice(EagerContext* ctx, + int64 device_incarnation) { + if (ctx->remote_device_mgr()->ContainsDevice(device_incarnation)) { + return Status::OK(); + } + return errors::InvalidArgument( + "Resource input tensor contains an invalid device. This might happen " + "when the client has connected to a different cluster, or some remote " + "workers have been restarted."); +} + // The Op device may be updated if: // - A resource touching input is specified: all resource-touching ops run in // the device the resource is, regardless of anything else that has been @@ -935,6 +948,10 @@ Status MaybeUpdateOpDevice(EagerOperation* op) { for (int i = 0; i < op->Inputs().size(); ++i) { TensorHandle* tensor_handle = op->Inputs()[i]; if (tensor_handle->dtype == DT_RESOURCE) { + if (tensor_handle->resource_remote_device_incarnation() != 0) { + TF_RETURN_IF_ERROR(ValidateTensorHandleRemoteDevice( + &ctx, tensor_handle->resource_remote_device_incarnation())); + } Device* resource_device = tensor_handle->resource_device(); DVLOG(2) << "for op " << op->Name() << " input " << i << " " << DataTypeString(tensor_handle->dtype) diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index eef46b691ce..dfe3e4a1426 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -49,6 +49,13 @@ limitations under the License. namespace tensorflow { +namespace { +int64 GetRemoteDeviceIncarnation(Device* device) { + if (device == nullptr || device->IsLocal()) return 0; + return device->attributes().incarnation(); +} +} // namespace + TensorHandle::PackedTensorHandleData::PackedTensorHandleData( std::vector&& handles, const TensorShape& shape) : handles_(std::move(handles)), shape_(shape) { @@ -244,6 +251,8 @@ TensorHandle::TensorHandle(tensorflow::Tensor&& t, Device* d, Device* op_device, device_((!ctx || d == ctx->HostCPU()) ? nullptr : d), op_device_(op_device), resource_device_(resource_device), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type, std::move(t)) { DVLOG(3) << "Creating Local TensorHandle: " << this @@ -258,6 +267,8 @@ TensorHandle::TensorHandle(tensorflow::Tensor&& t, Device* d, Device* op_device, op_device_(op_device), resource_device_( GetResourceDevice(t.flat()(0), ctx)), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), resource_handle_info_( {t.flat()(0).dtypes_and_shapes(), @@ -274,6 +285,7 @@ TensorHandle::TensorHandle(tensorflow::Tensor&& t, CustomDevice* d, device_(d), op_device_(nullptr), resource_device_(nullptr), + resource_remote_device_incarnation_(0), ctx_(ctx), data_(absl::in_place_type, std::move(t)) { // TODO(allenl): Figure out a better op_device story for custom devices, @@ -297,6 +309,8 @@ TensorHandle::TensorHandle(Device* d, Device* op_device, device_((d == ctx->HostCPU()) ? 
nullptr : d), op_device_(op_device), resource_device_(resource_device), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type) { DVLOG(3) << "Creating empty Local TensorHandle: " << this @@ -354,6 +368,8 @@ TensorHandle::TensorHandle(std::vector&& handles, Device* device, device_(device), op_device_(device), resource_device_(dtype == DT_RESOURCE ? device : nullptr), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type, std::move(handles), shape) { @@ -376,6 +392,8 @@ TensorHandle::TensorHandle(int64 op_id, int32 output_num, device_(d), op_device_(d), resource_device_(dtype == DT_RESOURCE ? d : nullptr), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type, op_id, output_num, remote_task, ctx) { @@ -398,6 +416,8 @@ TensorHandle::TensorHandle(int64 op_id, int32 output_num, device_(d), op_device_(d), resource_device_(dtype == DT_RESOURCE ? d : nullptr), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type, op_id, output_num, ctx->GetContextViewId()) { diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 45e7a3815a8..25d7fea3200 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -133,6 +133,9 @@ class TensorHandle : public AbstractTensorHandleInterface, VariantDevice device() const { return device_; } Device* op_device() const { return op_device_; } Device* resource_device() const { return resource_device_; } + int64 resource_remote_device_incarnation() const { + return resource_remote_device_incarnation_; + } VariantDevice DeviceOrHostCPU(const EagerContext& ctx) const; @@ -265,6 +268,9 @@ class TensorHandle : public AbstractTensorHandleInterface, // If the tensor dtype is DT_RESOURCE, resource_device_ holds the device // backing the resource. Else resource_device_ is nullptr. tensorflow::Device* const resource_device_; + // Incarnation ID of the resource device if it locates on a remote device, or + // 0 if it locates on a local device. + const int64 resource_remote_device_incarnation_; mutable mutex mu_; diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc index 2bcde7dce5b..779158375de 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/random.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -66,17 +67,28 @@ TEST(TensorHandle_ShapeTest, AsyncShape) { ctx->Unref(); } -static Device* CreateDevice(const char* type, const char* name) { +static Device* CreateDevice(const char* type, const char* name, + bool is_local = true) { class FakeDevice : public Device { public: - explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {} + explicit FakeDevice(const DeviceAttributes& attr, bool is_local) + : Device(nullptr, attr), is_local_(is_local) {} Status Sync() override { return Status::OK(); } Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; } + bool IsLocal() const override { return is_local_; } + + private: + const bool is_local_; }; DeviceAttributes attr; attr.set_name(name); attr.set_device_type(type); - return new FakeDevice(attr); + int64 incarnation = random::New64(); + while (incarnation == 0) { + incarnation = random::New64(); + } + attr.set_incarnation(incarnation); + return new FakeDevice(attr, is_local); } } // namespace @@ -204,4 +216,87 @@ TEST_F(PackedTensorHandleTest, PackedHandle) { packed_handle->Unref(); } +TEST(TensorHandle_ResourceDeviceTest, OnLocalDevice) { + std::unique_ptr d0( + CreateDevice("CPU", "/job:localhost/replica:0/task:0/device:CPU:0")); + StaticDeviceMgr local_device_mgr(std::move(d0)); + auto ctx = new EagerContext( + SessionOptions(), + tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, + tensorflow::ContextMirroringPolicy::MIRRORING_NONE, false, false, + &local_device_mgr, false, nullptr, nullptr, nullptr); + + tensorflow::DataType dtype = DT_RESOURCE; + TensorShape shape = {2}; + Tensor t(dtype, shape); + + Device* d = local_device_mgr.ListDevices()[0]; + TensorHandle* th = + TensorHandle::CreateLocalHandle(std::move(t), d, d, d, ctx); + // Remote device incarnation for local resource should be 0 (invalid) + EXPECT_EQ(0, th->resource_remote_device_incarnation()); + // Local device manager must contain the resource device. 
+ EXPECT_TRUE(local_device_mgr.ContainsDevice( + th->resource_device()->attributes().incarnation())); + + std::unique_ptr d1( + CreateDevice("CPU", "/job:localhost/replica:0/task:0/device:CPU:0")); + StaticDeviceMgr new_device_mgr(std::move(d1)); + EXPECT_FALSE(new_device_mgr.ContainsDevice( + th->resource_device()->attributes().incarnation())); + + th->Unref(); + ctx->Unref(); +} + +TEST(TensorHandle_ResourceDeviceTest, OnRemoteDevice) { + std::unique_ptr d_local( + CreateDevice("CPU", "/job:localhost/replica:0/task:0/device:CPU:0")); + StaticDeviceMgr local_device_mgr(std::move(d_local)); + auto ctx = new EagerContext( + SessionOptions(), + tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, + tensorflow::ContextMirroringPolicy::MIRRORING_NONE, false, false, + &local_device_mgr, false, nullptr, nullptr, nullptr); + + std::unique_ptr d0( + CreateDevice("CPU", "/job:worker/task:0/device:CPU:0", false)); + Device* d0_ptr = d0.get(); + std::unique_ptr d1( + CreateDevice("CPU", "/job:worker/task:1/device:CPU:0", false)); + Device* d1_ptr = d1.get(); + + DynamicDeviceMgr remote_device_mgr; + std::vector> vector_d0; + vector_d0.emplace_back(std::move(d0)); + TF_ASSERT_OK(remote_device_mgr.AddDevices(std::move(vector_d0))); + + TensorHandle* th0 = TensorHandle::CreateUnshapedRemoteHandle( + 0, 0, "", DT_RESOURCE, d0_ptr, ctx); + EXPECT_TRUE(remote_device_mgr.ContainsDevice( + th0->resource_remote_device_incarnation())); + + std::vector> vector_d1; + vector_d1.emplace_back(std::move(d1)); + TF_ASSERT_OK(remote_device_mgr.AddDevices(std::move(vector_d1))); + EXPECT_TRUE(remote_device_mgr.ContainsDevice( + th0->resource_remote_device_incarnation())); + + TensorHandle* th1 = TensorHandle::CreateUnshapedRemoteHandle( + 0, 0, "", DT_RESOURCE, d1_ptr, ctx); + EXPECT_TRUE(remote_device_mgr.ContainsDevice( + th1->resource_remote_device_incarnation())); + + std::vector remove_d1{d1_ptr}; + TF_ASSERT_OK(remote_device_mgr.RemoveDevices(std::move(remove_d1))); + EXPECT_FALSE(remote_device_mgr.ContainsDevice( + th1->resource_remote_device_incarnation())); + EXPECT_TRUE(remote_device_mgr.ContainsDevice( + th0->resource_remote_device_incarnation())); + + th0->Unref(); + th1->Unref(); + ctx->Unref(); +} + } // namespace tensorflow From 296993a42ca74d7a49efcaa92d3b0dd427551980 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Tue, 12 May 2020 21:30:26 -0700 Subject: [PATCH 0469/1533] Remove deprecated variants of DynamicSlice and DynamicUpdateSlice builders Upgraded existing users by converting 1d start_slices to a list of scalars. I am expecting this to be performance neutral as these tensors are expected to be small. I decided against having the XlaBuilder do this internally as I guess we want to discourage usage of vector indices. PiperOrigin-RevId: 311261628 Change-Id: I4b779a58cfca1699bdf5104c236bc6453fd419bc --- .../tf2xla/kernels/dynamic_slice_ops.cc | 28 +++++++--- .../compiler/tf2xla/kernels/slice_op.cc | 22 +++++--- tensorflow/compiler/xla/client/xla_builder.cc | 51 ------------------- tensorflow/compiler/xla/client/xla_builder.h | 16 ------ tensorflow/compiler/xla/tests/while_test.cc | 2 +- 5 files changed, 36 insertions(+), 83 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc index bb2c0d9ddb8..5dbc083368c 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc @@ -28,6 +28,15 @@ limitations under the License. 
namespace tensorflow { namespace { +absl::InlinedVector SliceVector(xla::XlaOp input, int64 rank) { + absl::InlinedVector scalar_indices; + scalar_indices.reserve(rank); + for (int i = 0; i < rank; i++) + scalar_indices.push_back( + xla::Reshape(xla::Slice(input, {i}, {i + 1}, {1}), {})); + return scalar_indices; +} + class DynamicUpdateSliceOp : public XlaOpKernel { public: explicit DynamicUpdateSliceOp(OpKernelConstruction* context) @@ -41,21 +50,23 @@ class DynamicUpdateSliceOp : public XlaOpKernel { const TensorShape update_shape = ctx->InputShape("update"); const TensorShape index_shape = ctx->InputShape("indices"); + int64 rank = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsVector(index_shape) && - index_shape.num_elements() == input_shape.dims(), + index_shape.num_elements() == rank, errors::InvalidArgument("index must be a vector with length equal to " "the number of input dimensions")); OP_REQUIRES( - ctx, input_shape.dims() == update_shape.dims(), + ctx, rank == update_shape.dims(), errors::InvalidArgument("input and update must have the same rank," " input shape is ", input_shape.DebugString(), "; update shape is ", update_shape.DebugString())); + xla::XlaOp indices = ctx->Input("indices"); xla::XlaOp result = xla::DynamicUpdateSlice( - ctx->Input("input"), ctx->Input("update"), ctx->Input("indices")); + ctx->Input("input"), ctx->Input("update"), SliceVector(indices, rank)); ctx->SetOutput(0, result); } }; @@ -76,17 +87,18 @@ class DynamicSliceOp : public XlaOpKernel { const TensorShape start_indices_shape = ctx->InputShape("start_indices"); const TensorShape size_indices_shape = ctx->InputShape("size_indices"); + int64 rank = input_shape.dims(); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(start_indices_shape) && - start_indices_shape.num_elements() == input_shape.dims(), + start_indices_shape.num_elements() == rank, errors::InvalidArgument( "start_indices must be a vector with length equal to " "input rank, but input rank is ", - input_shape.dims(), " and start_indices has shape ", + rank, " and start_indices has shape ", start_indices_shape.DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(size_indices_shape) && - size_indices_shape.num_elements() == input_shape.dims(), + size_indices_shape.num_elements() == rank, errors::InvalidArgument( "size_indices must be a vector with length equal to " "input rank, but input rank is ", @@ -96,8 +108,10 @@ class DynamicSliceOp : public XlaOpKernel { std::vector size_indices; OP_REQUIRES_OK( ctx, ctx->ConstantInputAsIntVector("size_indices", &size_indices)); + + xla::XlaOp start_indices = ctx->Input("start_indices"); xla::XlaOp result = xla::DynamicSlice( - ctx->Input("input"), ctx->Input("start_indices"), size_indices); + ctx->Input("input"), SliceVector(start_indices, rank), size_indices); ctx->SetOutput(0, result); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 17d0b87edda..7f274c6b00f 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -42,19 +42,17 @@ class SliceOp : public XlaOpKernel { const TensorShape begin_tensor_shape = ctx->InputShape(1); const TensorShape size_tensor_shape = ctx->InputShape(2); + const int input_dims = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsVector(begin_tensor_shape) && TensorShapeUtils::IsVector(size_tensor_shape) && - begin_tensor_shape.num_elements() == input_shape.dims() && - size_tensor_shape.num_elements() == 
input_shape.dims(), + begin_tensor_shape.num_elements() == input_dims && + size_tensor_shape.num_elements() == input_dims, errors::InvalidArgument( "Expected begin and size arguments to be 1-D tensors of size ", - input_shape.dims(), ", but got shapes ", - begin_tensor_shape.DebugString(), " and ", - size_tensor_shape.DebugString(), " instead.")); - - const int input_dims = input_shape.dims(); + input_dims, ", but got shapes ", begin_tensor_shape.DebugString(), + " and ", size_tensor_shape.DebugString(), " instead.")); std::vector begin; std::vector size; @@ -129,7 +127,15 @@ class SliceOp : public XlaOpKernel { input_shape.dim_size(i), "], but ", "got ", size[i])); } - ctx->SetOutput(0, xla::DynamicSlice(ctx->Input(0), ctx->Input(1), size)); + + absl::InlinedVector scalar_indices; + scalar_indices.reserve(input_dims); + xla::XlaOp begin = ctx->Input("begin"); + for (int i = 0; i < input_dims; i++) + scalar_indices.push_back( + xla::Reshape(xla::Slice(begin, {i}, {i + 1}, {1}), {})); + + ctx->SetOutput(0, xla::DynamicSlice(ctx->Input(0), scalar_indices, size)); } } }; diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index bd70ce80082..6539817d524 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -860,28 +860,6 @@ XlaOp XlaBuilder::SliceInDim(XlaOp operand, int64 start_index, }); } -XlaOp XlaBuilder::DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, - GetShapePtr(start_indices)); - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferDynamicSliceShape( - *operand_shape, {*start_indices_shape}, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, - {operand, start_indices}); - }); -} - XlaOp XlaBuilder::DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes) { @@ -910,26 +888,6 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand, }); } -XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, - XlaOp start_indices) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update)); - TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, - GetShapePtr(start_indices)); - TF_ASSIGN_OR_RETURN( - Shape shape, - ShapeInference::InferDynamicUpdateSliceShape( - *operand_shape, *update_shape, {*start_indices_shape})); - *instr.mutable_shape() = shape.ToProto(); - - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - {operand, update, start_indices}); - }); -} - XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -3152,20 +3110,11 @@ XlaOp SliceInDim(const XlaOp operand, int64 start_index, int64 limit_index, stride, dimno); } -XlaOp DynamicSlice(const XlaOp operand, const XlaOp start_indices, - absl::Span slice_sizes) { - return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); -} XlaOp DynamicSlice(const XlaOp operand, absl::Span start_indices, absl::Span slice_sizes) { return 
operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); } -XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update, - const XlaOp start_indices) { - return operand.builder()->DynamicUpdateSlice(operand, update, start_indices); -} - XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update, absl::Span start_indices) { return operand.builder()->DynamicUpdateSlice(operand, update, start_indices); diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 33fe62e9322..24b0cba3a1b 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -421,14 +421,9 @@ class XlaBuilder { virtual XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - ABSL_DEPRECATED("Use span-of-indices form instead") - XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); - ABSL_DEPRECATED("Use span-of-indices form instead") - XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices); XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); @@ -858,14 +853,10 @@ class XlaBuilder { friend XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - friend XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); friend XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); - friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, - XlaOp start_indices); friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); @@ -1438,10 +1429,6 @@ XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); -ABSL_DEPRECATED("Use span-of-indices form instead") -XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); - // Enqueues a dynamic update slice operation onto the computation, which // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. // The shape of 'update' determines the shape of the slice of 'operand' @@ -1462,9 +1449,6 @@ XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); -ABSL_DEPRECATED("Use span-of-indices form instead") -XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices); - // Enqueues a concatenate instruction onto the computation. 'operands' must // have >= 1 entry. XlaOp ConcatInDim(XlaBuilder* builder, absl::Span operands, diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 5a482305513..d575bbb1f3e 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -863,7 +863,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) { // Starts = iteration * 2; auto starts = Mul(iteration, ConstantR0(&builder, 2)); // UpdateSlice. 
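// Illustrative sketch (not from the original change): the surviving overloads take
// start indices as a span of scalar XlaOps, one per dimension, e.g.
//   DynamicSlice(operand, {i0, i1}, {1, 4})
//   DynamicUpdateSlice(operand, update, {i0, i1})
// A 1-D index vector can be unpacked into scalars with
// Reshape(Slice(indices, {d}, {d + 1}, {1}), {}), as the SliceVector helper added
// elsewhere in this patch does.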
- auto out1 = DynamicUpdateSlice(input, update, starts); + auto out1 = DynamicUpdateSlice(input, update, {starts}); Tuple(&builder, {out0, out1}); body = builder.Build().ConsumeValueOrDie(); From 2db6e2e05da4ebea6d2faffb94a955abcb5248f9 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Tue, 12 May 2020 21:34:16 -0700 Subject: [PATCH 0470/1533] Support Core ML 3 for Core ML delegate when running on iOS 13 Added option to choose Core ML 2 even when running on iOS 13. Currently Reshape is not supported in Core ML 3, because it only accepts 5D tensor and it's not likely for model to have 5D tensor when coming from TFLite. PiperOrigin-RevId: 311261992 Change-Id: I9161cfb734b11ccd053c8a22a142f8cf72132e5a --- .../delegates/coreml/builders/op_builder.cc | 10 +++ .../delegates/coreml/builders/op_builder.h | 4 + .../delegates/coreml/builders/op_validator.h | 3 +- .../coreml/builders/reshape_op_builder.cc | 6 +- .../delegates/coreml/coreml_delegate.h | 7 ++ .../delegates/coreml/coreml_delegate.mm | 85 +++++++++++-------- .../delegates/coreml/coreml_delegate_kernel.h | 3 + .../coreml/coreml_delegate_kernel.mm | 27 +++--- .../delegates/coreml/coreml_executor.h | 1 + .../delegates/coreml/coreml_executor.mm | 46 ++++++++-- .../swift/Sources/CoreMLDelegate.swift | 4 + .../lite/g3doc/performance/coreml_delegate.md | 12 ++- 12 files changed, 153 insertions(+), 55 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc index 09c386b55f0..2581b58f1e4 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc @@ -87,6 +87,16 @@ OpBuilder* GraphBuilder::AddBuilder( CoreML::Specification::Model* GraphBuilder::BuildModel() { CoreML::Specification::Model* model = new CoreML::Specification::Model(); + if (coreml_version_ == 2) { // Core ML 2, iOS >= 12.0 + model->set_specificationversion(3); + } else if (coreml_version_ == 3) { // Core ML 3, iOS >= 13.0 + model->set_specificationversion(4); + model->mutable_neuralnetwork()->set_arrayinputshapemapping( + CoreML::Specification::EXACT_ARRAY_MAPPING); + } else { + fprintf(stderr, "Unsupported Core ML version: %d\n", coreml_version_); + return nullptr; + } auto* neural_network = model->mutable_neuralnetwork(); for (auto& builder : builders_) { CoreML::Specification::NeuralNetworkLayer* layer = builder->Build(); diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h index 5367ae20d2f..c59c30a5a28 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h @@ -52,6 +52,8 @@ class TensorID { // API is experimental and subject to change. class GraphBuilder { public: + explicit GraphBuilder(int coreml_version) : coreml_version_(coreml_version) {} + // Returns pointer to the created builder. Ownership still belongs // to the GraphBuilder. OpBuilder* AddBuilder(int builtin_code, const TfLiteNode* node); @@ -79,6 +81,8 @@ class GraphBuilder { // This information is used to mark constant tensors that are used as input. 
bool IsTensorUsed(int tflite_tensor_index); + const int coreml_version_; + private: std::vector> builders_; // Index in the vector is the tflite_tensor_index, the value diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_validator.h b/tensorflow/lite/experimental/delegates/coreml/builders/op_validator.h index b0fe24ee288..501a304706c 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_validator.h +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_validator.h @@ -32,7 +32,8 @@ bool IsFullyConnectedOpSupported(const TfLiteRegistration* registration, const TfLiteNode* node, TfLiteContext* context); bool IsReshapeOpSupported(const TfLiteRegistration* registration, - const TfLiteNode* node, TfLiteContext* context); + const TfLiteNode* node, TfLiteContext* context, + int coreml_version); bool IsResizeBilinearOpSupported(const TfLiteRegistration* registration, const TfLiteNode* node, TfLiteContext* context); diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/reshape_op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/reshape_op_builder.cc index 33040e2e070..b7b78653d36 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/reshape_op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/reshape_op_builder.cc @@ -114,7 +114,11 @@ TfLiteStatus ReshapeOpBuilder::RegisterOutputs(const TfLiteIntArray* outputs, } bool IsReshapeOpSupported(const TfLiteRegistration* registration, - const TfLiteNode* node, TfLiteContext* context) { + const TfLiteNode* node, TfLiteContext* context, + int coreml_version) { + if (coreml_version >= 3) { + return false; + } if (node->inputs->size == 1) { const auto* params = reinterpret_cast(node->builtin_data); diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.h b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.h index 0d75afc8e34..8ad81040499 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.h +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.h @@ -31,6 +31,13 @@ typedef enum { typedef struct { // Only create delegate when Neural Engine is available on the device. TfLiteCoreMlDelegateEnabledDevices enabled_devices; + // Specifies target Core ML version for model conversion. + // Core ML 3 comes with many more ops, but some ops (e.g. reshape) are not + // delegated due to an input rank constraint. + // If not set to one of the valid versions, the delegate will use the highest + // version available on the platform. + // Valid versions: (2, 3) + int coreml_version; // This sets the maximum number of Core ML delegates created. // Each graph corresponds to one delegated node subset in the // TFLite model. Set this to 0 to delegate all possible partitions.
diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm index 5d0564ebc48..58728659894 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm @@ -36,7 +36,7 @@ constexpr int kMinNodesPerCoreMlDelegate = 2; using delegates::coreml::CoreMlDelegateKernel; bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, const TfLiteNode* node, - TfLiteContext* context) { + TfLiteContext* context, const TfLiteCoreMlDelegateOptions* options) { if (@available(iOS 11.0, *)) { } else { return false; @@ -120,7 +120,8 @@ bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, const TfL return true; } case kTfLiteBuiltinReshape: { - return delegates::coreml::IsReshapeOpSupported(registration, node, context); + return delegates::coreml::IsReshapeOpSupported(registration, node, context, + options->coreml_version); } case kTfLiteBuiltinResizeBilinear: { return delegates::coreml::IsResizeBilinearOpSupported(registration, node, context); @@ -142,6 +143,39 @@ bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, const TfL return false; } +class CoreMlDelegate : public TfLiteDelegate { + public: + explicit CoreMlDelegate(const TfLiteCoreMlDelegateOptions* params) + : params_(params != nullptr ? *params : TfLiteCoreMlDelegateOptions()) { + { + if (@available(iOS 13.0, *)) { + if (params_.coreml_version != 2 && params_.coreml_version != 3) { + NSLog(@"coreml_version must be 2 or 3. Setting to 3."); + params_.coreml_version = 3; + } + } else if (@available(iOS 12.0, *)) { + if (params_.coreml_version != 2) { + NSLog(@"coreml_version must be 2 - using Core ML version 2."); + params_.coreml_version = 2; + } + } + if (params_.max_delegated_partitions <= 0) { + params_.max_delegated_partitions = std::numeric_limits::max(); + } + if (params_.min_nodes_per_partition <= 0) { + params_.min_nodes_per_partition = kMinNodesPerCoreMlDelegate; + } + } + } + + TfLiteCoreMlDelegateOptions* params() { return ¶ms_; } + + bool VerifyDelegate() { return true; } + + private: + TfLiteCoreMlDelegateOptions params_; +}; + TfLiteRegistration GetCoreMlKernelRegistration() { // This is the registration for the Delegate Node that gets added to // the TFLite graph instead of the subGraph it replaces it. 
@@ -158,8 +192,10 @@ TfLiteRegistration GetCoreMlKernelRegistration() { }; kernel_registration.init = [](TfLiteContext* context, const char* buffer, size_t length) -> void* { - const TfLiteDelegateParams* params = reinterpret_cast(buffer); - CoreMlDelegateKernel* coreml_kernel = new CoreMlDelegateKernel(); + const auto* params = reinterpret_cast(buffer); + const auto* coreml_options = + (reinterpret_cast(params->delegate))->params(); + CoreMlDelegateKernel* coreml_kernel = new CoreMlDelegateKernel(coreml_options->coreml_version); if (coreml_kernel->Init(context, params) != kTfLiteOk) { delete coreml_kernel; return nullptr; @@ -187,14 +223,12 @@ TfLiteRegistration GetCoreMlKernelRegistration() { } TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { - const auto* params = - reinterpret_cast(delegate->data_); + const auto* params = reinterpret_cast(delegate->data_); - delegates::IsNodeSupportedFn node_supported_fn = - [=](TfLiteContext* context, TfLiteNode* node, - TfLiteRegistration* registration, - std::string* unsupported_details) -> bool { - return IsNodeSupportedByDelegate(registration, node, context); + delegates::IsNodeSupportedFn node_supported_fn = [=](TfLiteContext* context, TfLiteNode* node, + TfLiteRegistration* registration, + std::string* unsupported_details) -> bool { + return IsNodeSupportedByDelegate(registration, node, context, params); }; delegates::GraphPartitionHelper helper(context, node_supported_fn); @@ -214,7 +248,8 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { // Set first element to the number of nodes to replace. supported_nodes[0] = supported_nodes.size() - 1; - TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, "CoreML delegate: %d nodes delegated out of %d nodes, " + TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, + "CoreML delegate: %d nodes delegated out of %d nodes, " "with %d partitions.\n", supported_nodes[0], helper.num_total_nodes(), delegate_partitions.size()); @@ -223,28 +258,6 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { reinterpret_cast(supported_nodes.data()), delegate); } -class CoreMlDelegate : public TfLiteDelegate { - public: - explicit CoreMlDelegate(const TfLiteCoreMlDelegateOptions* params) - : params_(params != nullptr ? *params : TfLiteCoreMlDelegateOptions()) { - { - if (params_.max_delegated_partitions <= 0) { - params_.max_delegated_partitions = std::numeric_limits::max(); - } - if (params_.min_nodes_per_partition <= 0) { - params_.min_nodes_per_partition = kMinNodesPerCoreMlDelegate; - } - } - } - - TfLiteCoreMlDelegateOptions* params() { return ¶ms_; } - - bool VerifyDelegate() { return true; } - - private: - TfLiteCoreMlDelegateOptions params_; -}; - TfLiteDelegate* CreateCoreMlDelegate(const TfLiteCoreMlDelegateOptions* options) { TfLiteDelegate* delegate = new CoreMlDelegate(options); if (!static_cast(delegate)->VerifyDelegate()) { @@ -288,7 +301,7 @@ bool IsNeuralEngineAvailable() { } // namespace TfLiteDelegate* TfLiteCoreMlDelegateCreate(const TfLiteCoreMlDelegateOptions* options) { - if (@available(iOS 11.0, *)) { + if (@available(iOS 12.0, *)) { if (options->enabled_devices == TfLiteCoreMlDelegateDevicesWithNeuralEngine && !IsNeuralEngineAvailable()) { NSLog(@"This device does not have Neural Engine, so Core ML delegate will not be enabled. 
" @@ -299,7 +312,7 @@ TfLiteDelegate* TfLiteCoreMlDelegateCreate(const TfLiteCoreMlDelegateOptions* op return tflite::CreateCoreMlDelegate(options); } else { NSLog(@"Core ML delegate is not supported in this iOS version. " - "Minimum required iOS version is 11.0."); + "Minimum required iOS version is 12.0."); return nullptr; } } diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.h b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.h index 04053ea81c1..8c983fb11aa 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.h +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.h @@ -29,6 +29,8 @@ namespace coreml { // implements Init/Prepare/Invoke as TFLite kernel nodes. class CoreMlDelegateKernel { public: + explicit CoreMlDelegateKernel(int coreml_version) + : coreml_version_(coreml_version) {} // Initialize the delegated graph and add required nodes. TfLiteStatus Init(TfLiteContext* context, const TfLiteDelegateParams* params); @@ -56,6 +58,7 @@ class CoreMlDelegateKernel { std::unique_ptr builder_; std::unique_ptr model_; ::CoreMlExecutor* executor_; + int coreml_version_; std::vector input_tensor_ids_; std::vector inputs_; diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.mm b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.mm index a36837bcc44..6a668bc971b 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.mm +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.mm @@ -60,7 +60,7 @@ void TransposeToHWC(const float* chw, float* hwc, const TfLiteIntArray* hwc_dims TfLiteStatus CoreMlDelegateKernel::Init(TfLiteContext* context, const TfLiteDelegateParams* delegate_params) { - if (@available(iOS 11.0, *)) { + if (@available(iOS 12.0, *)) { executor_ = [[::CoreMlExecutor alloc] init]; TF_LITE_ENSURE_STATUS(BuildModel(context, delegate_params)); // Serialize the model protocol buffer and compile it. @@ -76,7 +76,7 @@ TfLiteStatus CoreMlDelegateKernel::Init(TfLiteContext* context, } return kTfLiteOk; } else { - TF_LITE_KERNEL_LOG(context, "Minimum required iOS version is 11.0."); + TF_LITE_KERNEL_LOG(context, "Minimum required iOS version is 12.0."); return kTfLiteError; } } @@ -104,6 +104,9 @@ void CoreMlDelegateKernel::AddOutputTensors(const TfLiteIntArray* output_tensors int batch_size, height_size, width_size, depth_size; GetDims(&batch_size, &height_size, &width_size, &depth_size, tensor.dims); multi_array->set_datatype(CoreML::Specification::ArrayFeatureType::FLOAT32); + if (coreml_version_ >= 3) { + multi_array->mutable_shape()->Add(batch_size); + } multi_array->mutable_shape()->Add(depth_size); multi_array->mutable_shape()->Add(height_size); multi_array->mutable_shape()->Add(width_size); @@ -114,7 +117,7 @@ TfLiteStatus CoreMlDelegateKernel::BuildModel(TfLiteContext* context, const TfLiteDelegateParams* delegate_params) { TfLiteNode* node; TfLiteRegistration* reg; - builder_.reset(new delegates::coreml::GraphBuilder()); + builder_.reset(new delegates::coreml::GraphBuilder(coreml_version_)); // Add Inputs AddInputTensors(delegate_params->input_tensors, context); // Build all ops. @@ -144,8 +147,6 @@ TfLiteStatus CoreMlDelegateKernel::BuildModel(TfLiteContext* context, return kTfLiteError; } AddOutputTensors(delegate_params->output_tensors, context); - // TODO(karimnosseir): Set correct version ? 
- model_->set_specificationversion(1); auto* model_description = model_->mutable_description(); for (int i = 0; i < delegate_params->input_tensors->size; ++i) { const int tensor_id = delegate_params->input_tensors->data[i]; @@ -158,6 +159,9 @@ TfLiteStatus CoreMlDelegateKernel::BuildModel(TfLiteContext* context, int batch_size, height_size, width_size, depth_size; GetDims(&batch_size, &height_size, &width_size, &depth_size, tensor.dims); multi_array->set_datatype(CoreML::Specification::ArrayFeatureType::FLOAT32); + if (coreml_version_ >= 3) { + multi_array->mutable_shape()->Add(batch_size); + } multi_array->mutable_shape()->Add(depth_size); multi_array->mutable_shape()->Add(height_size); multi_array->mutable_shape()->Add(width_size); @@ -181,9 +185,12 @@ TfLiteStatus CoreMlDelegateKernel::Prepare(TfLiteContext* context, TfLiteNode* n int batch_size, height_size, width_size, depth_size; GetDims(&batch_size, &height_size, &width_size, &depth_size, tensor->dims); - inputs_.push_back({std::vector(input_size), - builder_->GetTensorName(tensor_index), - {depth_size, height_size, width_size}}); + std::vector input_shape = {depth_size, height_size, width_size}; + if (coreml_version_ >= 3) { + input_shape.insert(input_shape.begin(), batch_size); + } + inputs_.push_back( + {std::vector(input_size), builder_->GetTensorName(tensor_index), input_shape}); } outputs_.reserve(node->outputs->size); @@ -222,9 +229,7 @@ TfLiteStatus CoreMlDelegateKernel::Invoke(TfLiteContext* context, TfLiteNode* no } } -CoreMlDelegateKernel::~CoreMlDelegateKernel() { - [executor_ cleanup]; -} +CoreMlDelegateKernel::~CoreMlDelegateKernel() { [executor_ cleanup]; } } // namespace coreml } // namespace delegates diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_executor.h b/tensorflow/lite/experimental/delegates/coreml/coreml_executor.h index edec3020cbc..5ce0a0ade6c 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_executor.h +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_executor.h @@ -45,4 +45,5 @@ struct TensorData { @property MLModel* model API_AVAILABLE(ios(11)); @property NSString* mlModelFilePath; @property NSString* compiledModelFilePath; +@property(nonatomic, readonly) int coreMlVersion; @end diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_executor.mm b/tensorflow/lite/experimental/delegates/coreml/coreml_executor.mm index 2091c0d7ca0..1f808e08d49 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_executor.mm +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_executor.mm @@ -39,17 +39,22 @@ NSURL* createTemporaryFile() { NSSet* _featureNames; } -- (instancetype)initWithInputs:(const std::vector*)inputs; +- (instancetype)initWithInputs:(const std::vector*)inputs + coreMlVersion:(int)coreMlVersion; - (MLFeatureValue*)featureValueForName:(NSString*)featureName API_AVAILABLE(ios(11)); - (NSSet*)featureNames; +@property(nonatomic, readonly) int coreMlVersion; + @end @implementation MultiArrayFeatureProvider -- (instancetype)initWithInputs:(const std::vector*)inputs { +- (instancetype)initWithInputs:(const std::vector*)inputs + coreMlVersion:(int)coreMlVersion { self = [super init]; _inputs = inputs; + _coreMlVersion = coreMlVersion; for (auto& input : *_inputs) { if (input.name.empty()) { return nil; @@ -74,8 +79,31 @@ NSURL* createTemporaryFile() { for (auto& input : *_inputs) { if ([featureName cStringUsingEncoding:NSUTF8StringEncoding] == input.name) { // TODO(b/141492326): Update shape handling for higher ranks - NSArray* shape = @[ 
@(input.shape[0]), @(input.shape[1]), @(input.shape[2]) ]; - NSArray* strides = @[ @(input.shape[1] * input.shape[2]), @(input.shape[2]), @1 ]; + NSArray* shape = @[ + @(input.shape[0]), + @(input.shape[1]), + @(input.shape[2]), + ]; + NSArray* strides = @[ + @(input.shape[1] * input.shape[2]), + @(input.shape[2]), + @1, + ]; + + if ([self coreMlVersion] >= 3) { + shape = @[ + @(input.shape[0]), + @(input.shape[1]), + @(input.shape[2]), + @(input.shape[3]), + ]; + strides = @[ + @(input.shape[1] * input.shape[2] * input.shape[3]), + @(input.shape[2] * input.shape[3]), + @(input.shape[3]), + @1, + ]; + }; NSError* error = nil; MLMultiArray* mlArray = [[MLMultiArray alloc] initWithDataPointer:(float*)input.data.data() shape:shape @@ -106,7 +134,7 @@ NSURL* createTemporaryFile() { } NSError* error = nil; MultiArrayFeatureProvider* inputFeature = - [[MultiArrayFeatureProvider alloc] initWithInputs:&inputs]; + [[MultiArrayFeatureProvider alloc] initWithInputs:&inputs coreMlVersion:[self coreMlVersion]]; if (inputFeature == nil) { NSLog(@"inputFeature is not initialized."); return NO; @@ -153,6 +181,14 @@ NSURL* createTemporaryFile() { - (NSURL*)saveModel:(CoreML::Specification::Model*)model { NSURL* modelUrl = createTemporaryFile(); NSString* modelPath = [modelUrl path]; + if (model->specificationversion() == 3) { + _coreMlVersion = 2; + } else if (model->specificationversion() == 4) { + _coreMlVersion = 3; + } else { + NSLog(@"Only Core ML models with specification version 3 or 4 are supported"); + return nil; + } // Flush data to file. // TODO(karimnosseir): Can we mmap this instead of actual writing it to phone ? std::ofstream file_stream([modelPath UTF8String], std::ios::out | std::ios::binary); diff --git a/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift b/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift index 9862de31e2c..5a1526d45ea 100644 --- a/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift +++ b/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift @@ -35,6 +35,7 @@ public final class CoreMLDelegate: Delegate { self.options = options var delegateOptions = TfLiteCoreMlDelegateOptions() delegateOptions.enabled_devices = options.enabledDevices.cEnabledDevices + delegateOptions.coreml_version = Int32(options.coreMLVersion) delegateOptions.max_delegated_partitions = Int32(options.maxDelegatedPartitions) delegateOptions.min_nodes_per_partition = Int32(options.minNodesPerPartition) guard let delegate = TfLiteCoreMlDelegateCreate(&delegateOptions) else { return nil } @@ -72,6 +73,9 @@ extension CoreMLDelegate { /// value is `.neuralEngine` indicating that the delegate is enabled for Neural Engine devices /// only. public var enabledDevices: EnabledDevices = .neuralEngine + /// Target Core ML version for the model conversion. When it's not set, Core ML version will + /// be set to highest available version for the platform. + public var coreMLVersion = 0 /// The maximum number of Core ML delegate partitions created. Each graph corresponds to one /// delegated node subset in the TFLite model. The default value is `0` indicating that all /// possible partitions are delegated. diff --git a/tensorflow/lite/g3doc/performance/coreml_delegate.md b/tensorflow/lite/g3doc/performance/coreml_delegate.md index da3b943fd89..c267347cf3f 100644 --- a/tensorflow/lite/g3doc/performance/coreml_delegate.md +++ b/tensorflow/lite/g3doc/performance/coreml_delegate.md @@ -6,7 +6,7 @@ which results in faster model inference on iOS devices. 
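A minimal sketch of how the new option is intended to be used from the C API, based on the struct fields and functions added in this patch (interpreter setup, error handling, and delegate cleanup are omitted and assumed to be handled elsewhere):

    TfLiteCoreMlDelegateOptions options = {};
    options.enabled_devices = TfLiteCoreMlDelegateAllDevices;
    // 0 (the default) picks the newest Core ML version available on the device;
    // setting 2 forces the Core ML 2 model specification even on iOS 13.
    options.coreml_version = 2;
    TfLiteDelegate* delegate = TfLiteCoreMlDelegateCreate(&options);
    // Attach `delegate` to the interpreter (e.g. via ModifyGraphWithDelegate)
    // before invoking the model.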
Note: This delegate is in experimental (beta) phase. -Note: Core ML delegate is using Core ML version 2.1. +Note: Core ML delegate supports Core ML version 2 and later. **Supported iOS versions and devices:** @@ -158,6 +158,14 @@ for more detail. Alternatively, you can implement your own set of blacklist devices using other libraries such as [DeviceKit](https://github.com/devicekit/DeviceKit). +### Using older Core ML version + +Although iOS 13 supports Core ML 3, the model might work better when it is +converted with the Core ML 2 model specification. The target conversion version is +set to the latest version by default, but you can change this by setting +`coreMLVersion` (in Swift, `coreml_version` in C API) in the delegate option to +an older version. + ## Supported ops Following ops are supported by the Core ML delegate. @@ -187,6 +195,8 @@ Following ops are supported by the Core ML delegate. * ReluN1To1 * Relu6 * Reshape + * Only supported when target Core ML version is 2, not supported when + targeting Core ML 3. * ResizeBilinear * SoftMax * Tanh From 816582d7eaf62bc12252791ef7701d329edee6ff Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 21:46:06 -0700 Subject: [PATCH 0471/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311263158 Change-Id: I8cec18a5d0a7d93af71ec0a913936cf9c24c8131 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range.
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From e71443f838b2a16aea5b8ff38cc8e211449206d1 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Tue, 12 May 2020 22:19:52 -0700 Subject: [PATCH 0472/1533] Updated image_test to work with newer versions of keras_preprocessing PiperOrigin-RevId: 311266601 Change-Id: I0cbcea629b4fff04c50628432515a8766dc10ec8 --- tensorflow/python/keras/preprocessing/image_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py index a577381874e..d2f4b18f7dd 100644 --- a/tensorflow/python/keras/preprocessing/image_test.py +++ b/tensorflow/python/keras/preprocessing/image_test.py @@ -146,8 +146,7 @@ class TestImage(keras_parameterized.TestCase): generator = preprocessing_image.ImageDataGenerator( data_format='unknown') - generator = preprocessing_image.ImageDataGenerator( - zoom_range=(2, 2)) + generator = preprocessing_image.ImageDataGenerator(zoom_range=(2., 2.)) def test_image_data_generator_fit(self): generator = preprocessing_image.ImageDataGenerator( From 088fc3a9b5701eec8073489417143a32ef25cdd5 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Tue, 12 May 2020 22:46:26 -0700 Subject: [PATCH 0473/1533] Support setting Core ML delegate's target Core ML version in benchmark PiperOrigin-RevId: 311269200 Change-Id: I343794d6af948c554d05a89c9e432c0975ddfa6c --- tensorflow/lite/tools/benchmark/README.md | 1 + tensorflow/lite/tools/delegates/README.md | 3 +++ tensorflow/lite/tools/delegates/coreml_delegate_provider.cc | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md index a4f632c40a9..c44129cbbd3 100644 --- a/tensorflow/lite/tools/benchmark/README.md +++ b/tensorflow/lite/tools/benchmark/README.md @@ -87,6 +87,7 @@ the reported data on hexagon is in cycles, not in ms like on cpu. #### CoreML delegate * `use_coreml`: `bool` (default=false) +* `coreml_version`: `int` (default=0) #### External delegate * `external_delegate_path`: `string` (default="") diff --git a/tensorflow/lite/tools/delegates/README.md b/tensorflow/lite/tools/delegates/README.md index f0e15e9e71a..709fcffb24d 100644 --- a/tensorflow/lite/tools/delegates/README.md +++ b/tensorflow/lite/tools/delegates/README.md @@ -93,6 +93,9 @@ TFLite delegate. * `use_coreml`: `bool` (default=false) \ Whether to use the [Core ML delegate](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/delegates/coreml). This option is only available in iOS. +* `coreml_version`: `int` (default=0) \ + Target Core ML version for model conversion. The default value is 0 and it + means using the newest version that's available on the device. 
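For instance (a sketch with placeholder paths; the benchmark binary must be built for and run on an iOS device), the new flag is passed alongside `use_coreml`:

    benchmark_model --graph=/path/to/model.tflite --use_coreml=true --coreml_version=2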
### External delegate provider * `external_delegate_path`: `string` (default="") \ diff --git a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc index 0d1a8ade368..c29555716a4 100644 --- a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc @@ -32,6 +32,7 @@ class CoreMlDelegateProvider : public DelegateProvider { CoreMlDelegateProvider() { #if defined(REAL_IPHONE_DEVICE) default_params_.AddParam("use_coreml", ToolParam::Create(true)); + default_params_.AddParam("coreml_version", ToolParam::Create(0)); #endif } std::vector CreateFlags(ToolParams* params) const final; @@ -49,6 +50,10 @@ std::vector CoreMlDelegateProvider::CreateFlags( #if defined(REAL_IPHONE_DEVICE) std::vector flags = { CreateFlag("use_coreml", params, "use Core ML"), + CreateFlag("coreml_version", params, + "Target Core ML version for model conversion. " + "The default value is 0 and it means using the newest " + "version that's available on the device."), }; return flags; #else @@ -71,6 +76,7 @@ TfLiteDelegatePtr CoreMlDelegateProvider::CreateTfLiteDelegate( if (params.Get("use_coreml")) { TfLiteCoreMlDelegateOptions coreml_opts = { .enabled_devices = TfLiteCoreMlDelegateAllDevices}; + coreml_opts.coreml_version = params.Get("coreml_version"); coreml_opts.max_delegated_partitions = params.Get("max_delegated_partitions"); coreml_opts.min_nodes_per_partition = From d5b3ec27d1d6bb157588ff3033a3d9bd2e46711f Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Tue, 12 May 2020 23:14:53 -0700 Subject: [PATCH 0474/1533] Allow dynamically configuring device placement Enable setting soft device placement as well as logging dynamically. This required ensuring the device placement policy was part of the cache key. Further, we fix the logging to ensure in eager mode if a kernel is retrieved from the kernel cache, then the execution is still logged. We also log closer to the actual op execution to avoid logging before all checks have been done. 
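A rough sketch of the new C API surface (assuming a TFE_Context* ctx and TF_Status* status obtained elsewhere; these calls are presumably what the Python tf.config setters touched in this change reach):

    // Both policies now take effect for subsequent op executions, and the soft
    // placement setting is folded into the kernel cache key.
    TFE_ContextSetSoftDevicePlacement(ctx, 1, status);
    TFE_ContextSetLogDevicePlacement(ctx, 1, status);
    // ... execute some ops ...
    TFE_ContextSetSoftDevicePlacement(ctx, 0, status);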
PiperOrigin-RevId: 311271808 Change-Id: I9765228894f84a3447cc03332a2559f6d933165b --- tensorflow/c/eager/c_api_experimental.cc | 14 ++++++ tensorflow/c/eager/c_api_experimental.h | 12 +++++ .../core/common_runtime/eager/context.h | 7 +-- .../core/common_runtime/eager/execute.cc | 45 ++++++++++++------- tensorflow/python/client/session_test.py | 11 ++++- tensorflow/python/eager/context.py | 16 +++---- tensorflow/python/eager/core_test.py | 1 - tensorflow/python/framework/config_test.py | 16 +++---- tensorflow/python/tfe_wrapper.cc | 12 +++++ 9 files changed, 95 insertions(+), 39 deletions(-) diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index dd9e5e111d9..0d71b11531b 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -657,3 +657,17 @@ TFE_TensorHandle* TFE_CreatePackedTensorHandle(TFE_Context* ctx, std::move(tensor_handles), context, &handle); return tensorflow::wrap(handle); } + +void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, unsigned char enable, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + context->SetAllowSoftPlacement(enable); +} + +void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, unsigned char enable, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + context->SetLogDevicePlacement(enable); +} diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 584f7222111..1b8efe61ee0 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -549,6 +549,18 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_CreatePackedTensorHandle( TFE_Context* ctx, TFE_TensorHandle** handles, int* num_handles, TF_Status* status); +// Configure soft device placement policy for the eager executor. Note this +// policy is applied to any subsequent op executions. +TF_CAPI_EXPORT void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + +// Configure device placement policy logging for the eager executor. Note this +// policy is applied to any subsequent op executions. +TF_CAPI_EXPORT void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 683425919d1..d034aaf2f9c 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -300,7 +300,9 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { void AddKernelToCache(Fprint128 cache_key, KernelAndDevice* kernel); bool LogDevicePlacement() const { return log_device_placement_; } + void SetLogDevicePlacement(bool enable) { log_device_placement_ = enable; } bool AllowSoftPlacement() const { return allow_soft_placement_; } + void SetAllowSoftPlacement(bool enable) { allow_soft_placement_ = enable; } bool LogMemory() const { return log_memory_; } Rendezvous* GetRendezvous() const { return rendezvous_; } @@ -625,9 +627,8 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { mutex metadata_mu_; RunMetadata run_metadata_ TF_GUARDED_BY(metadata_mu_); GraphCollector graph_collector_; - // TODO(fishx): Allow update following two bool after context creation. 
- const bool log_device_placement_; - const bool allow_soft_placement_; + std::atomic log_device_placement_; + std::atomic allow_soft_placement_; // Information related to step containers. std::atomic num_active_steps_; diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 3036e6d7989..f6b4370bbdc 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -365,6 +365,9 @@ Status GetOrCreateKernelAndDevice( Device* device = absl::get(op->Device()); Fprint128 cache_key = op->MutableAttrs()->CacheKey(op->DeviceName()); + /// Include soft placement policy in cache key since the placement strategy + // can change and thus affect which kernel is picked. + cache_key = FingerprintCat128(cache_key, ctx.AllowSoftPlacement()); std::vector input_dev_ptrs; absl::flat_hash_map*> composite_devices; @@ -488,13 +491,6 @@ Status GetOrCreateKernelAndDevice( << KernelsRegisteredForOp(op->Name()); op->SetDevice(device); } - if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) { - string msg = strings::StrCat("Executing op ", ndef.op(), " in device ", - DeviceNameOrUnspecified(device)); - if (!logging::LogToListeners(msg)) { - LOG(INFO) << msg; - } - } FunctionLibraryRuntime* flr = device == nullptr ? nullptr : ctx.func_lib(device); @@ -607,6 +603,14 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals, int num_outputs = kernel->num_outputs(); TF_RETURN_IF_ERROR(ValidateInputTypeAndPlacement(&ctx, op, kernel)); + if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) { + string msg = strings::StrCat("Executing op ", op->Name(), " in device ", + kernel->device()->name()); + if (!logging::LogToListeners(msg)) { + LOG(INFO) << msg; + } + } + GraphCollector* graph_collector = nullptr; if (ctx.ShouldStoreGraphs()) { graph_collector = ctx.GetGraphCollector(); @@ -841,6 +845,16 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, ctx.GetContextViewId(), eager_client.get(), op->MutableAttrs()->BuildNodeDef(), op->EagerContext().FuncLibDef(), op->Inputs(), {retvals, num_outputs})); + + if (op->EagerContext().LogDevicePlacement() || VLOG_IS_ON(1)) { + string msg = strings::StrCat( + "Executing op ", op->Name(), " on task ", + DeviceNameUtils::ParsedNameToString(op->GetDeviceParsedName())); + if (!logging::LogToListeners(msg)) { + LOG(INFO) << msg; + } + } + Status s = executor.AddOrExecute(std::move(node)); // Since the operation failed, we need to Unref any outputs that were // allocated. 
@@ -1119,15 +1133,6 @@ Status EagerExecute(EagerOperation* op, TensorHandle** retvals, return EagerLocalExecute(op, retvals, num_retvals); } - if (op->EagerContext().LogDevicePlacement() || VLOG_IS_ON(1)) { - string msg = strings::StrCat( - "Executing op ", op->Name(), " on task ", - DeviceNameUtils::ParsedNameToString(op->GetDeviceParsedName())); - if (!logging::LogToListeners(msg)) { - LOG(INFO) << msg; - } - } - #if defined(IS_MOBILE_PLATFORM) return errors::Unimplemented( "Eager's remote execution is not available on mobile devices."); @@ -1428,6 +1433,14 @@ void EagerLocalExecuteAsync(EagerOperation* op, TensorHandle** retvals, return; } + if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) { + string msg = strings::StrCat("Executing op ", op->Name(), " in device ", + kernel->device()->name()); + if (!logging::LogToListeners(msg)) { + LOG(INFO) << msg; + } + } + GraphCollector* graph_collector = nullptr; if (ctx.ShouldStoreGraphs()) { graph_collector = ctx.GetGraphCollector(); diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index dd8e64ac182..1c244c1b297 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -1917,6 +1917,9 @@ class SessionTest(test_util.TensorFlowTestCase): a = constant_op.constant(1) b = constant_op.constant(2) c = a + b + # Ensure if the same kernel with the same arguments is executed then its + # execution is logged. + d = a + b else: # Passing the config to the server, but not the session should still # result in logging device placement. @@ -1925,12 +1928,16 @@ class SessionTest(test_util.TensorFlowTestCase): a = constant_op.constant(1) b = constant_op.constant(2) c = a + b + d = a + b with session.Session(server.target) as sess: with CaptureStderr() as log: - sess.run(c) + c, d = sess.run([c, d]) + self.assertEqual(c, 3) + self.assertEqual(d, 3) # Ensure that we did log device placement. 
- self.assertTrue('/replica:0/task:0/device:CPU:0' in str(log), str(log)) + add_executions = [l for l in str(log).splitlines() if 'AddV2' in l] + self.assertEqual(len(add_executions), 2) @test_util.run_v1_only('b/120545219') def testLocalMasterSessionTimeout(self): diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 182b8478420..86b3d5cf95f 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -1509,9 +1509,11 @@ class Context(object): return self.config.allow_soft_placement @soft_device_placement.setter - def soft_device_placement(self, enabled): - self._soft_device_placement = enabled + def soft_device_placement(self, enable): + if self._context_handle is not None: + pywrap_tfe.TFE_ContextSetSoftDevicePlacement(self._handle, enable) + self._soft_device_placement = enable self._thread_local_data.function_call_options = None @property @@ -1519,15 +1521,11 @@ class Context(object): return self.config.log_device_placement @log_device_placement.setter - def log_device_placement(self, enabled): - if self._log_device_placement == enabled: - return - + def log_device_placement(self, enable): if self._context_handle is not None: - raise RuntimeError( - "Device placement logging must be set at program startup") + pywrap_tfe.TFE_ContextSetLogDevicePlacement(self._handle, enable) - self._log_device_placement = enabled + self._log_device_placement = enable self._thread_local_data.function_call_options = None @property diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py index 47b3966827f..c1401fc56ee 100644 --- a/tensorflow/python/eager/core_test.py +++ b/tensorflow/python/eager/core_test.py @@ -1112,5 +1112,4 @@ class EagerTensorCacheTest(test_util.TensorFlowTestCase): if __name__ == '__main__': - context.set_log_device_placement(True) test.main() diff --git a/tensorflow/python/framework/config_test.py b/tensorflow/python/framework/config_test.py index b07bb874385..3051f1d0623 100644 --- a/tensorflow/python/framework/config_test.py +++ b/tensorflow/python/framework/config_test.py @@ -159,7 +159,6 @@ class ConfigTest(test.TestCase, parameterized.TestCase): else: self.assertFalse(config.get_soft_device_placement()) - @def_function.function def mod(): with ops.device('/device:GPU:0'): a = constant_op.constant(1.0) @@ -172,8 +171,10 @@ class ConfigTest(test.TestCase, parameterized.TestCase): config.get_soft_device_placement(), context.context().soft_device_placement) - # Since soft placement is enabled, the mod operation should work with CPU + # Since soft placement is enabled, the mod operation should fallback to CPU + # with pure eager execution as well as functions mod() + def_function.function(mod)() config.set_soft_device_placement(False) self.assertEqual(config.get_soft_device_placement(), False) @@ -182,8 +183,11 @@ class ConfigTest(test.TestCase, parameterized.TestCase): context.context().soft_device_placement) # Since soft placement is disabled, the mod operation should fail on GPU + # with pure eager execution as well as functions with self.assertRaises(errors.InvalidArgumentError): mod() + with self.assertRaises(errors.InvalidArgumentError): + def_function.function(mod)() @reset_eager def testLogDevicePlacement(self): @@ -203,12 +207,8 @@ class ConfigTest(test.TestCase, parameterized.TestCase): context.ensure_initialized() - with self.assertRaises(RuntimeError): - context.set_log_device_placement(True) - - # If the setting the device placement is a no-op, do not throw a runtime - # 
exception. - context.set_log_device_placement(False) + # Changing the device placement should not throw an exception + context.set_log_device_placement(True) @reset_eager def testEnableMlirBridge(self): diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc index ec54efa61cf..836cafbd494 100644 --- a/tensorflow/python/tfe_wrapper.cc +++ b/tensorflow/python/tfe_wrapper.cc @@ -488,6 +488,18 @@ PYBIND11_MODULE(_pywrap_tfe, m) { // NOTE: different from TFE_ContextSyncExecutors that raises potential // errors, deliberately ignore executor statuses in cleanup. }); + m.def("TFE_ContextSetSoftDevicePlacement", [](py::handle& ctx, bool enable) { + tensorflow::Safe_TF_StatusPtr status = + tensorflow::make_safe(TF_NewStatus()); + TFE_ContextSetSoftDevicePlacement(tensorflow::InputTFE_Context(ctx), enable, + status.get()); + }); + m.def("TFE_ContextSetLogDevicePlacement", [](py::handle& ctx, bool enable) { + tensorflow::Safe_TF_StatusPtr status = + tensorflow::make_safe(TF_NewStatus()); + TFE_ContextSetLogDevicePlacement(tensorflow::InputTFE_Context(ctx), enable, + status.get()); + }); // TFE_Executor logic m.def( From 843f3da02df95e95593af625091646e3ed49b8d6 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Tue, 12 May 2020 23:26:08 -0700 Subject: [PATCH 0475/1533] Disable flaky test PiperOrigin-RevId: 311272834 Change-Id: Id8af3ac197f65dde4ae50c9b5ad63d2d328652f6 --- tensorflow/python/ops/parallel_for/control_flow_ops_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 11380b2dac2..01776808525 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -1400,6 +1400,8 @@ class StatelessIfTest(PForTestCase): class IfTest(PForTestCase): def test_read_var(self): + self.skipTest("b/156438918") # Flaky + x = [1, 2, 3, 4, 5.] y = 2.5 z = resource_variable_ops.ResourceVariable(5.)
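The `config_test.py` changes above exercise both execution paths for soft device placement. A rough standalone sketch of that behavior, assuming it runs on a machine with no visible GPU so the explicit GPU placement cannot be satisfied:

```python
import tensorflow as tf

def mod():
    with tf.device("/device:GPU:0"):  # requests a device assumed to be absent
        a = tf.constant(1.0)
        b = tf.constant(1.0)
        return tf.math.mod(a, b)

tf.config.set_soft_device_placement(True)
mod()               # pure eager execution falls back to the CPU
tf.function(mod)()  # the traced function falls back as well

tf.config.set_soft_device_placement(False)
try:
    mod()           # the unsatisfiable placement now raises an error
except tf.errors.InvalidArgumentError as err:
    print("placement failed:", err)
```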
From a88c46347c20f6e4875f4c1c75ffc5b5bf38edb8 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 15:35:18 +0800 Subject: [PATCH 0476/1533] change and cleanup per review --- tensorflow/lite/delegates/nnapi/nnapi_delegate.cc | 9 ++++----- tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h | 4 ++-- .../tools/accuracy/ilsvrc/imagenet_model_evaluator.cc | 2 -- .../tools/accuracy/ilsvrc/imagenet_model_evaluator.h | 3 --- .../tools/benchmark/benchmark_performance_options.cc | 3 --- .../lite/tools/delegates/default_execution_provider.cc | 4 ---- .../lite/tools/delegates/nnapi_delegate_provider.cc | 2 +- .../tools/evaluation/evaluation_delegate_provider.cc | 4 ---- .../evaluation/tasks/coco_object_detection/run_eval.cc | 5 ----- .../tasks/imagenet_image_classification/run_eval.cc | 5 ----- .../tools/evaluation/tasks/inference_diff/run_eval.cc | 4 ---- 11 files changed, 7 insertions(+), 38 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 867d03f5227..ff6ad0dc0d9 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -3151,7 +3151,7 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, "creating NNAPI model", nnapi_errno); nn_model_.reset(model); - TF_LITE_ENSURE_STATUS(BuildGraph(context, params->delegate, + TF_LITE_ENSURE_STATUS(BuildGraph(context, delegate_options, params->input_tensors, params->output_tensors, nnapi_errno)); } @@ -3203,7 +3203,6 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context, const auto delegate_options = StatefulNnApiDelegate::GetOptions(node->delegate); - ANeuralNetworksCompilation* compilation = nullptr; if (!nnapi_devices_.empty()) { // Compile for the selected accelerator. @@ -3877,7 +3876,8 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, } TfLiteStatus NNAPIDelegateKernel::BuildGraph( - TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteContext* context, + const StatefulNnApiDelegate::Options& delegate_options, const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, int* nnapi_errno) { // Build the ops and tensors. @@ -3888,7 +3888,6 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( std::vector outputs; outputs.reserve(output_tensors->size); - const auto delegate_options = StatefulNnApiDelegate::GetOptions(delegate); size_t total_input_byte_size = 0; // Make the TensorFlow Lite inputs and outputs to ann_indices. 
for (int i : TfLiteIntArrayView(input_tensors)) { @@ -4025,9 +4024,9 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, delegate_data_.disallow_nnapi_cpu = options.disallow_nnapi_cpu; delegate_data_.max_number_delegated_partitions = options.max_number_delegated_partitions; + delegate_data_.allow_fp16 = options.allow_fp16; TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO, "Created TensorFlow Lite delegate for NNAPI."); - delegate_data_.allow_fp16 = options.allow_fp16; Prepare = DoPrepare; CopyFromBufferHandle = DoCopyFromBufferHandle; CopyToBufferHandle = DoCopyToBufferHandle; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 60151196372..5d0ea63ab4c 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -349,8 +349,8 @@ class NNAPIDelegateKernel { TfLiteStatus AddOpsAndTensors(TfLiteContext* context, int* nnapi_errno); TfLiteStatus BuildGraph(TfLiteContext* context, - TfLiteDelegate* delegate, - const TfLiteIntArray* input_tensors, + const StatefulNnApiDelegate::Options& options, + const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, int* nnapi_errno); }; diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc index 64ce87ae8aa..f318dc68d09 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc @@ -141,8 +141,6 @@ class CompositeObserver : public ImagenetModelEvaluator::Observer { tflite::Flag::CreateFlag(kNumRanksFlag, ¶ms.num_ranks, "Generates the top-1 to top-k accuracy values" "where k = num_ranks. Default: 10"), - tflite::Flag::CreateFlag("nnapi_allow_fp16", ¶ms.nnapi_allow_fp16, - "allow fp16 in nnapi"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h index 3ba22cbc2af..65d4a2c49f8 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h @@ -78,9 +78,6 @@ class ImagenetModelEvaluator { // Number of interpreter threads. int num_interpreter_threads = 1; - - // allow fp16 - bool nnapi_allow_fp16 = false; }; // An evaluation observer. 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc index c2d9374506e..cfce23c4595 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc @@ -303,9 +303,6 @@ void BenchmarkPerformanceOptions::CreatePerformanceOptions() { BenchmarkParam::Create(false)); params.AddParam("max_delegated_partitions", BenchmarkParam::Create(0)); - params.AddParam("max_delegated_partitions", - params.AddParam("nnapi_allow_fp16", - BenchmarkParam::Create(false)); all_run_params_.emplace_back(std::move(params)); } } diff --git a/tensorflow/lite/tools/delegates/default_execution_provider.cc b/tensorflow/lite/tools/delegates/default_execution_provider.cc index 67c38308206..f75fd791072 100644 --- a/tensorflow/lite/tools/delegates/default_execution_provider.cc +++ b/tensorflow/lite/tools/delegates/default_execution_provider.cc @@ -30,7 +30,6 @@ class DefaultExecutionProvider : public DelegateProvider { ToolParam::Create(0)); default_params_.AddParam("min_nodes_per_partition", ToolParam::Create(0)); - default_params_.AddParam("allow_fp16", ToolParam::Create(false)); } std::vector CreateFlags(ToolParams* params) const final; @@ -45,7 +44,6 @@ std::vector DefaultExecutionProvider::CreateFlags( std::vector flags = { CreateFlag("num_threads", params, "number of threads used for inference on CPU."), - CreateFlag("allow_fp16", params, "allow_fp16"), CreateFlag("max_delegated_partitions", params, "Max number of partitions to be delegated."), CreateFlag( @@ -63,8 +61,6 @@ void DefaultExecutionProvider::LogParams(const ToolParams& params) const { << params.Get("max_delegated_partitions") << "]"; TFLITE_LOG(INFO) << "Min nodes per partition : [" << params.Get("min_nodes_per_partition") << "]"; - TFLITE_LOG(INFO) << "allow_fp16: [" - << params.Get("allow_fp16") << "]"; } TfLiteDelegatePtr DefaultExecutionProvider::CreateTfLiteDelegate( diff --git a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc index 6492ba82849..2fbfb791e8c 100644 --- a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc @@ -88,7 +88,7 @@ void NnapiDelegateProvider::LogParams(const ToolParams& params) const { << params.Get("disable_nnapi_cpu") << "]"; } if (params.Get("nnapi_allow_fp16")) { - TFLITE_LOG(INFO) << "nnapi_allow_fp16: [" + TFLITE_LOG(INFO) << "Allow fp16 in NNAPI: [" << params.Get("nnapi_allow_fp16") << "]"; } } diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc index ea07378a8fa..42f2666ba9b 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc @@ -132,10 +132,6 @@ tools::ToolParams DelegateProviders::GetAllParams( tool_params.Set("num_threads", params.num_threads()); } - if (params.has_nnapi_allow_fp16()) { - tool_params.Set("nnapi_allow_fp16", params.nnapi_allow_fp16()); - } - const auto type = params.delegate(); switch (type) { case TfliteInferenceParams::NNAPI: diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc index de1ae6e2e94..765e8fc6465 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc 
+++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc @@ -65,7 +65,6 @@ class CocoObjectDetection : public TaskExecutor { bool debug_mode_; std::string delegate_; int num_interpreter_threads_; - bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -105,9 +104,6 @@ CocoObjectDetection::CocoObjectDetection(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'xnnpack', 'hexagon'}"), - tflite::Flag::CreateFlag( - "nnapi_allow_fp16", &allow_fp16_, - "nnapi allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); DelegateProviders delegate_providers; @@ -136,7 +132,6 @@ absl::optional CocoObjectDetection::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_nnapi_allow_fp16(allow_fp16_); // Get ground truth data. absl::flat_hash_map ground_truth_map; diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc index 8a7fd864c6e..13eeb313ad4 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc @@ -67,7 +67,6 @@ class ImagenetClassification : public TaskExecutor { std::string delegate_; int num_images_; int num_interpreter_threads_; - bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -107,9 +106,6 @@ ImagenetClassification::ImagenetClassification(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), - tflite::Flag::CreateFlag( - "nnapi_allow_fp16", &allow_fp16_, - "nnapi allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -159,7 +155,6 @@ absl::optional ImagenetClassification::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_nnapi_allow_fp16(allow_fp16_); classification_params->mutable_topk_accuracy_eval_params()->set_k(10); ImageClassificationStage eval(eval_config); diff --git a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc index c85d997974b..814ebe3b3bf 100644 --- a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc @@ -50,7 +50,6 @@ class InferenceDiff : public TaskExecutor { std::string delegate_; int num_runs_; int num_interpreter_threads_; - bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -72,8 +71,6 @@ InferenceDiff::InferenceDiff(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for test inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), - tflite::Flag::CreateFlag("nnapi_allow_fp16", &allow_fp16_, - "nnapi allow fp16") }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -91,7 +88,6 @@ absl::optional InferenceDiff::Run() { // generating random data. 
inference_params->set_invocations_per_run(3); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_nnapi_allow_fp16(allow_fp16_); if (!delegate_.empty() && inference_params->delegate() == TfliteInferenceParams::NONE) { TFLITE_LOG(WARN) << "Unsupported TFLite delegate: " << delegate_; From 9083aa48e7634edcbc41d63804e5df662e6a8c4b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 01:55:49 -0700 Subject: [PATCH 0477/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311289097 Change-Id: Ic47747fe7d0fd7269c0203be9b1009e400b4b297 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 5eb1be50238dca9a5b92757391b4750b3529aae3 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 13 May 2020 02:01:33 -0700 Subject: [PATCH 0478/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/58bc507b6fe6 PiperOrigin-RevId: 311289597 Change-Id: I1471895afdb961a19df531bc566898e486162d96 --- tensorflow/compiler/mlir/lite/BUILD | 2 +- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 2 +- tensorflow/compiler/mlir/tensorflow/BUILD | 4 ++-- .../compiler/mlir/tensorflow/ir/tf_op_base.td | 2 +- tensorflow/compiler/mlir/tfjs/BUILD | 2 +- tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td | 2 +- tensorflow/compiler/mlir/tfrt/BUILD | 2 +- .../runtime_fallback/runtime_fallback_ops.td | 2 +- tensorflow/compiler/mlir/xla/BUILD | 4 ++-- tensorflow/compiler/mlir/xla/ir/chlo_ops.td | 2 +- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 +- tensorflow/compiler/mlir/xla/ir/lhlo_ops.td | 2 +- third_party/mlir/BUILD | 24 +++++++++---------- third_party/mlir/test.BUILD | 2 +- 14 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index f99b2806faf..9b5b0c209e5 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -31,7 +31,7 @@ filegroup( "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 13b8ae83e34..fdf1501dbef 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -20,7 +20,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/LoopLikeInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" include "tensorflow/compiler/mlir/lite/quantization/quantization.td" diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 0edf0f33a23..54b560ed6ce 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -36,7 +36,7 @@ filegroup( "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -1075,7 +1075,7 @@ genrule( srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", "ir/tf_generated_ops.td", "ir/tf_op_base.td", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index cd20cc79c17..dbd8ab0fae2 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -23,7 +23,7 @@ limitations under the License. 
#define TF_OP_BASE include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td" //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tfjs/BUILD b/tensorflow/compiler/mlir/tfjs/BUILD index 806a77e9c38..ac629ac4573 100644 --- a/tensorflow/compiler/mlir/tfjs/BUILD +++ b/tensorflow/compiler/mlir/tfjs/BUILD @@ -40,7 +40,7 @@ gentbl( "ir/tfjs_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td index 172347bc0f5..134aa010d8c 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td @@ -23,7 +23,7 @@ limitations under the License. #define TFJS_DIALECT include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // TensorFlow.js dialect definitions diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 78787245bd6..edcfc574452 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -40,7 +40,7 @@ filegroup( srcs = [ "runtime_fallback/runtime_fallback_ops.td", "@llvm-project//mlir:OpBaseTdFiles", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@tf_runtime//:OpBaseTdFiles", ], ) diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td index aeed800a1c3..c33c6f8d73d 100644 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td +++ b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td @@ -20,7 +20,7 @@ limitations under the License. 
#define TFRT_DELEGATE_DIALECT include "tfrt/tfrt_op_base.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // Type definitions diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index d9108e8f3bc..590595a668f 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -38,7 +38,7 @@ filegroup( "ir/lhlo_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -822,7 +822,7 @@ genrule( name = "operator_writer_inc", srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", ":ir/hlo_ops.td", ":ir/hlo_ops_base.td", diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.td b/tensorflow/compiler/mlir/xla/ir/chlo_ops.td index a244985c9b5..f9672c1a95a 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.td @@ -31,7 +31,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def HLOClient_Dialect : Dialect { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 0db9563a4c1..f78ac7624d2 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -23,7 +23,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" include "tensorflow/compiler/mlir/xla/ir/hlo_utils.td" diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index d7e838a6f2b..db75bbd1f67 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -19,7 +19,7 @@ limitations under the License. 
#define LHLO_OPS include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def LHLO_Dialect : Dialect { diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 75b32c73260..ce5468fe679 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -178,7 +178,7 @@ filegroup( "include/mlir/Dialect/Affine/IR/AffineOps.td", "include/mlir/Dialect/Affine/IR/AffineOpsBase.td", "include/mlir/Interfaces/LoopLikeInterface.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -217,7 +217,7 @@ filegroup( "include/mlir/Dialect/AVX512/AVX512.td", "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/IR/OpBase.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -302,7 +302,7 @@ filegroup( "include/mlir/Dialect/SCF/SCFOps.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", "include/mlir/Interfaces/LoopLikeInterface.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -374,7 +374,7 @@ filegroup( "include/mlir/IR/OpAsmInterface.td", "include/mlir/Interfaces/CallInterfaces.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", "include/mlir/Interfaces/ViewLikeInterface.td", ":OpBaseTdFiles", ], @@ -997,7 +997,7 @@ filegroup( "include/mlir/Dialect/GPU/GPUOps.td", "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/IR/SymbolInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -1130,7 +1130,7 @@ filegroup( "include/mlir/Dialect/LLVMIR/LLVMOps.td", "include/mlir/IR/SymbolInterfaces.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -1417,7 +1417,7 @@ filegroup( srcs = [ "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/Dialect/LLVMIR/NVVMOps.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -1489,7 +1489,7 @@ filegroup( srcs = [ "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/Dialect/LLVMIR/ROCDLOps.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -1541,7 +1541,7 @@ filegroup( "include/mlir/IR/SymbolInterfaces.td", "include/mlir/Interfaces/CallInterfaces.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ] + glob(["include/mlir/Dialect/SPIRV/*.td"]), ) @@ -2244,7 +2244,7 @@ gentbl( ), ], tblgen = ":mlir-tblgen", - td_file = "include/mlir/Interfaces/SideEffects.td", + td_file = "include/mlir/Interfaces/SideEffectInterfaces.td", td_srcs = [ ":OpBaseTdFiles", ], @@ -2910,7 +2910,7 @@ filegroup( srcs = [ "include/mlir/Dialect/Quant/QuantOps.td", "include/mlir/Dialect/Quant/QuantOpsBase.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -3390,7 +3390,7 @@ exports_files( "include/mlir/Interfaces/CallInterfaces.td", "include/mlir/Interfaces/ControlFlowInterfaces.h", 
"include/mlir/Interfaces/ControlFlowInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", "include/mlir/Interfaces/ViewLikeInterface.td", "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/Dialect/StandardOps/IR/Ops.td", diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index a0312a54b68..c19d312d082 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -77,7 +77,7 @@ gentbl( "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/ControlFlowInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], test = True, ) From 9c8ca4905e335120e8fb19ea316674416ed1a27e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 02:02:49 -0700 Subject: [PATCH 0479/1533] Update GraphDef version to 400. PiperOrigin-RevId: 311289755 Change-Id: Ibae7d2dcd3f4b697e7f2735183c62d4669ead6ba --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 23e6138d553..68df6a1b632 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 399 // Updated: 2020/5/12 +#define TF_GRAPH_DEF_VERSION 400 // Updated: 2020/5/13 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 101d46ab716931f27c76b86c2f4d1e5780b43e64 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 02:02:55 -0700 Subject: [PATCH 0480/1533] compat: Update forward compatibility horizon to 2020-05-13 PiperOrigin-RevId: 311289765 Change-Id: I6167b9a3d737248f831fbd4405339a9e59220944 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 627979a5cb1..26d291877cb 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 12) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 13) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 8e073e237ed258dac220d3cc1a177a08e43f2c0d Mon Sep 17 00:00:00 2001 From: "Felix E. Klee" Date: Wed, 13 May 2020 17:47:52 +0800 Subject: [PATCH 0481/1533] Fix typo preventing compilation `idf.py build` returned: ../main/esp/app_camera_esp.h:46:27: error: 'FRAMESIZE_96x96' undeclared (first use in this function); did you mean 'FRAMESIZE_96X96'? 
--- .../lite/micro/examples/person_detection/esp/app_camera_esp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/examples/person_detection/esp/app_camera_esp.h b/tensorflow/lite/micro/examples/person_detection/esp/app_camera_esp.h index 403fb4defb1..e8cbe2177a9 100644 --- a/tensorflow/lite/micro/examples/person_detection/esp/app_camera_esp.h +++ b/tensorflow/lite/micro/examples/person_detection/esp/app_camera_esp.h @@ -30,7 +30,7 @@ limitations under the License. #define CAMERA_PIXEL_FORMAT PIXFORMAT_GRAYSCALE /* - * FRAMESIZE_96x96, // 96x96 + * FRAMESIZE_96X96, // 96x96 * FRAMESIZE_QQVGA, // 160x120 * FRAMESIZE_QQVGA2, // 128x160 * FRAMESIZE_QCIF, // 176x144 @@ -43,7 +43,7 @@ limitations under the License. * FRAMESIZE_SXGA, // 1280x1024 * FRAMESIZE_UXGA, // 1600x1200 */ -#define CAMERA_FRAME_SIZE FRAMESIZE_96x96 +#define CAMERA_FRAME_SIZE FRAMESIZE_96X96 #if CONFIG_CAMERA_MODEL_WROVER_KIT #define PWDN_GPIO_NUM -1 From ab67ad7c4490c268abd7d46f457fbe1c425fe070 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 13 May 2020 02:44:55 -0700 Subject: [PATCH 0482/1533] Bump open source llvm revision to 1c44430e738ba83eefe6d56a245ee30649d8988d PiperOrigin-RevId: 311293944 Change-Id: I97e99d957847f7e7664549795c1a3fd30fedd987 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index fe548fdec05..83e74f3d105 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -162,8 +162,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): print("path_prefix was specified to tf_workspace but is no longer used " + "and will be removed in the future.") - TFRT_COMMIT = "341ba0448c117af4e29ae3911141265ee8e57860" - TFRT_SHA256 = "27716458f8ca7d91fc2d0f681127dbdd478eea78d6da5153c51b4696ebd14d55" + TFRT_COMMIT = "26fb26d716545388edb9785f8f4b3e60a4ad5092" + TFRT_SHA256 = "f7419a3eaab8b7137a4de5b428045a731d93da91ef1bce9ba91fab81ed23a676" TFRT_URLS = [ "http://mirror.tensorflow.org/github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), "https://github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "123bee602a260150ff55c74287f583a67ee78f36" - LLVM_SHA256 = "313ec75e47ea3f128724a61b8b6b45b7d305ba2ae57a5084b4bf1f881b4ec8f2" + LLVM_COMMIT = "1c44430e738ba83eefe6d56a245ee30649d8988d" + LLVM_SHA256 = "81ad47eaf74dfaea1befbe7b41facfd9bcee5ca3d5635325584dbabf4bf1fa5e" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From eed4bb5cc10125abc6d175050062372dce34bfd2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 03:47:08 -0700 Subject: [PATCH 0483/1533] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 311299753 Change-Id: Ia881aec05fa7e6a9a5f0a559c79bf3ab5fa954a3 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From c117a875a220dd9e097027f308566e6a9398bc18 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 20:26:04 +0800 Subject: [PATCH 0484/1533] fix bad indent and remove leftover --- tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h | 2 +- tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 5d0ea63ab4c..668fdf5b5f6 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -350,7 +350,7 @@ class NNAPIDelegateKernel { TfLiteStatus BuildGraph(TfLiteContext* context, const StatefulNnApiDelegate::Options& options, - const TfLiteIntArray* input_tensors, + const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, int* nnapi_errno); }; diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto index cecdb22c637..09765d71726 100644 --- a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto +++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto @@ -121,9 +121,6 @@ message TfliteInferenceParams { // This helps benchmark cases where extensive pre-processing might not be // required for every input. optional int32 invocations_per_run = 4 [default = 1]; - - // nnapi_allow_fp16 - optional bool nnapi_allow_fp16 = 5 [default = false]; } // Metrics specific to TFLite inference. 
From dc4c6d305ba3d2de4a795ec77b483b0fa695b9ee Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Wed, 13 May 2020 06:03:54 -0700 Subject: [PATCH 0485/1533] Change the default value of 'use_coreml' parameter to false PiperOrigin-RevId: 311313238 Change-Id: Id85bd1ad4b86cafbddeba924714256587a7da732 --- tensorflow/lite/tools/delegates/coreml_delegate_provider.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc index c29555716a4..c6509618aee 100644 --- a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc @@ -31,7 +31,7 @@ class CoreMlDelegateProvider : public DelegateProvider { public: CoreMlDelegateProvider() { #if defined(REAL_IPHONE_DEVICE) - default_params_.AddParam("use_coreml", ToolParam::Create(true)); + default_params_.AddParam("use_coreml", ToolParam::Create(false)); default_params_.AddParam("coreml_version", ToolParam::Create(0)); #endif } From 5530521a577d7b939391d4c1bf4672b26e7abac4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 06:50:32 -0700 Subject: [PATCH 0486/1533] Qualify uses of std::string PiperOrigin-RevId: 311319203 Change-Id: Ia312681455cb0518879cf323518914f49ea88b33 --- .../core/grappler/inputs/trivial_test_graph_input_yielder.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h index 74e5080a30f..bf776bcd2bc 100644 --- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h +++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h @@ -30,7 +30,7 @@ class TrivialTestGraphInputYielder : public InputYielder { public: TrivialTestGraphInputYielder(int num_stages, int width, int tensor_size, bool insert_queue, - const std::vector& device_names); + const std::vector& device_names); bool NextItem(GrapplerItem* item) override; private: @@ -38,7 +38,7 @@ class TrivialTestGraphInputYielder : public InputYielder { const int width_; const int tensor_size_; const bool insert_queue_; - std::vector device_names_; + std::vector device_names_; }; } // end namespace grappler From e4702e19bb1ef0d5fc4e63833fcc88e533371f96 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 13 May 2020 06:58:18 -0700 Subject: [PATCH 0487/1533] Add SSE4 path for Tanh and Logistic. PiperOrigin-RevId: 311320167 Change-Id: Ie62fd09adf8e41827796d2102c5f1d505429a139 --- .../internal/optimized/optimized_ops.h | 101 ++++++++++++++++++ tensorflow/workspace.bzl | 8 +- 2 files changed, 105 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 5f183de7269..a6d37f4f1ed 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -4332,6 +4332,41 @@ inline void Logistic(const LogisticParams& params, } } #endif +#ifdef GEMMLOWP_SSE4 + { + // F0 uses 0 integer bits, range [-1, 1]. + // This is the return type of math functions such as tanh, logistic, + // whose range is in [-1, 1]. + using F0 = gemmlowp::FixedPoint; + // F3 uses 3 integer bits, range [-8, 8], the input range expected here. 
+ using F3 = gemmlowp::FixedPoint; + + for (; c <= flat_size - 16; c += 16) { + F3 input0 = F3::FromRaw(gemmlowp::to_int16x8_m128i( + _mm_loadu_si128(reinterpret_cast(input_data_ptr)))); + F3 input1 = F3::FromRaw(gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr + 8)))); + F0 output0 = gemmlowp::logistic(input0); + F0 output1 = gemmlowp::logistic(input1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output0.raw().v); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8), + output1.raw().v); + input_data_ptr += 16; + output_data_ptr += 16; + } + for (; c <= flat_size - 8; c += 8) { + F3 input = F3::FromRaw(gemmlowp::to_int16x8_m128i( + _mm_loadu_si128(reinterpret_cast(input_data_ptr)))); + F0 output = gemmlowp::logistic(input); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output.raw().v); + input_data_ptr += 8; + output_data_ptr += 8; + } + } +#endif + { // F0 uses 0 integer bits, range [-1, 1]. // This is the return type of math functions such as tanh, logistic, @@ -4438,6 +4473,72 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, } } #endif +#ifdef GEMMLOWP_SSE4 + { + // F0 uses 0 integer bits, range [-1, 1]. + // This is the return type of math functions such as tanh, logistic, + // whose range is in [-1, 1]. + using F0 = gemmlowp::FixedPoint; + // F3 uses 3 integer bits, range [-8, 8], the input range expected here. + using F3 = gemmlowp::FixedPoint; + + if (input_left_shift == 0) { + for (; c <= flat_size - 16; c += 16) { + F3 input0 = F3::FromRaw(gemmlowp::to_int16x8_m128i( + _mm_loadu_si128(reinterpret_cast(input_data_ptr)))); + F3 input1 = F3::FromRaw(gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr + 8)))); + F0 output0 = gemmlowp::tanh(input0); + F0 output1 = gemmlowp::tanh(input1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output0.raw().v); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8), + output1.raw().v); + + input_data_ptr += 16; + output_data_ptr += 16; + } + for (; c <= flat_size - 8; c += 8) { + F3 input = F3::FromRaw(gemmlowp::to_int16x8_m128i( + _mm_loadu_si128(reinterpret_cast(input_data_ptr)))); + F0 output = gemmlowp::tanh(input); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output.raw().v); + input_data_ptr += 8; + output_data_ptr += 8; + } + } else { + for (; c <= flat_size - 16; c += 16) { + F3 input0 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>( + gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr))))); + F3 input1 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>( + gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr + 8))))); + F0 output0 = gemmlowp::tanh(input0); + F0 output1 = gemmlowp::tanh(input1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output0.raw().v); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8), + output1.raw().v); + + input_data_ptr += 16; + output_data_ptr += 16; + } + for (; c <= flat_size - 8; c += 8) { + F3 input = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>( + gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr))))); + F0 output = gemmlowp::tanh(input); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output.raw().v); + input_data_ptr += 8; + output_data_ptr += 8; + } + } + } +#endif + { // F0 uses 0 integer bits, range [-1, 1]. 
// This is the return type of math functions such as tanh, logistic, diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 83e74f3d105..31389d7c459 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -354,11 +354,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "gemmlowp", - sha256 = "6678b484d929f2d0d3229d8ac4e3b815a950c86bb9f17851471d143f6d4f7834", # SHARED_GEMMLOWP_SHA - strip_prefix = "gemmlowp-12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3", + sha256 = "43146e6f56cb5218a8caaab6b5d1601a083f1f31c06ff474a4378a7d35be9cfb", # SHARED_GEMMLOWP_SHA + strip_prefix = "gemmlowp-fda83bdc38b118cc6b56753bd540caa49e570745", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip", - "https://github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip", + "https://github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip", ], ) From 0e7612ea0d2ee8ddd65e7bf3e96800911384976e Mon Sep 17 00:00:00 2001 From: Michael Moffitt Date: Wed, 13 May 2020 09:21:30 -0500 Subject: [PATCH 0488/1533] Removes duplicate space from retracing warning message --- tensorflow/python/eager/def_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index 48c9b06fa38..c61f39111b1 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -109,7 +109,7 @@ class _FrequentTracingDetector(object): "retracing. Tracing is expensive and the excessive number of " "tracings could be due to (1) creating @tf.function repeatedly in " "a loop, (2) passing tensors with different shapes, (3) passing " - "Python objects instead of tensors. For (1), please define your " + "Python objects instead of tensors. For (1), please define your " "@tf.function outside of the loop. For (2), @tf.function has " "experimental_relax_shapes=True option that relaxes argument " "shapes that can avoid unnecessary retracing. For (3), please " From f8429e72fc992f6b9b353e8db2ae846a1c69d7b8 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 13 May 2020 07:18:43 -0700 Subject: [PATCH 0489/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/897d8ee5cd69 PiperOrigin-RevId: 311323011 Change-Id: I0d60709d46dffa171e299a7e8bdfc9a1ae43fc06 --- .../tests/chlo_infer_shape_type_methods.mlir | 4 ++-- .../chlo_legalize_to_hlo_broadcasts.mlir | 12 +++++----- .../xla/tests/legalize-tf-BatchMatMulV2.mlir | 8 +++---- third_party/mlir/BUILD | 24 ------------------- third_party/mlir/test.BUILD | 2 +- 5 files changed, 13 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir index ce0243e416c..d67a7d09f7c 100644 --- a/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir +++ b/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir @@ -6,8 +6,8 @@ // CHECK-SAME: %[[ARG0:.+]]: tensor, // CHECK-SAME: %[[ARG1:.+]]: tensor func @broadcast_add(%arg0: tensor, %arg1: tensor) -> tensor<1xindex> { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[BCAST_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[EXTENTS:.+]] = "shape.to_extent_tensor"(%[[BCAST_S]]) // CHECK: return %[[EXTENTS]] diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir index 2bc1e0c6852..7194f7034b5 100644 --- a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -14,8 +14,8 @@ func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor< // CHECK-SAME: %[[ARG0:.+]]: tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} @@ -31,8 +31,8 @@ func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> tensor> { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor @@ -48,8 +48,8 @@ func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> t // CHECK-SAME: %[[ARG0:.+]]: tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcastCompare(%arg0: 
tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir index 08df9fd3808..3605e2a0d5c 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir @@ -7,8 +7,8 @@ func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_basic // CHECK-SAME: ([[LHS:%.*]]: tensor<1x4x2xf32>, [[RHS:%.*]]: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> -// CHECK: [[LHSSHAPE:%.*]] = "shape.shape_of"([[LHS]]) : (tensor<1x4x2xf32>) -> !shape.shape -// CHECK: [[RHSSHAPE:%.*]] = "shape.shape_of"([[RHS]]) : (tensor<3x2x4xf32>) -> !shape.shape +// CHECK: [[LHSSHAPE:%.*]] = shape.shape_of [[LHS]] : tensor<1x4x2xf32> +// CHECK: [[RHSSHAPE:%.*]] = shape.shape_of [[RHS]] : tensor<3x2x4xf32> // CHECK: [[CM2:%.*]] = constant -2 : i32 // CHECK: [[LHSHEAD:%.*]], [[LHSTAIL:%.*]] = "shape.split_at"([[LHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) // CHECK: [[RHSHEAD:%.*]], [[RHSTAIL:%.*]] = "shape.split_at"([[RHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) @@ -86,8 +86,8 @@ func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2 // CHECK: [[RHSIM:%.*]] = "xla_hlo.imag"([[RHS]]) // CHECK: [[RHSIMNEG:%.*]] = "xla_hlo.negate"([[RHSIM]]) // CHECK: [[RHSCONJ:%.*]] = "xla_hlo.complex"([[RHSRE]], [[RHSIMNEG]]) -// CHECK: "shape.shape_of"([[LHSCONJ]]) -// CHECK: "shape.shape_of"([[RHSCONJ]]) +// CHECK: shape.shape_of [[LHSCONJ]] +// CHECK: shape.shape_of [[RHSCONJ]] %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xcomplex>, tensor<2x4xcomplex>) -> tensor<5x4xcomplex> return %0 : tensor<5x4xcomplex> } diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index ce5468fe679..8b61ce98dab 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -1801,28 +1801,6 @@ cc_library( ], ) -cc_library( - name = "StandardToStandard", - srcs = glob([ - "lib/Conversion/StandardToStandard/*.cpp", - "lib/Conversion/StandardToStandard/*.h", - ]), - hdrs = glob([ - "include/mlir/Conversion/StandardToStandard/*.h", - ]), - includes = [ - "include", - "lib/Conversion/StandardToStandard", - ], - deps = [ - ":ConversionPassIncGen", - ":IR", - ":Pass", - ":StandardOps", - ":Transforms", - ], -) - cc_library( name = "SPIRVSerialization", srcs = glob( @@ -2485,7 +2463,6 @@ cc_library( ":SCFTransforms", ":StandardOpsTransforms", ":StandardToSPIRVConversions", - ":StandardToStandard", ":Support", ":Transforms", ":VectorToLLVM", @@ -2584,7 +2561,6 @@ cc_library( ":StandardOpsTransforms", ":StandardOpsTransformsPassIncGen", ":StandardToSPIRVConversions", - ":StandardToStandard", ":Transforms", ":TransformsPassIncGen", ":VectorOps", diff --git a/third_party/mlir/test.BUILD 
b/third_party/mlir/test.BUILD index c19d312d082..eb5d8a650eb 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -106,7 +106,7 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:StandardToStandard", + "@llvm-project//mlir:StandardOpsTransforms", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", ], From cac1acba3f47ace5027dde0f45df15ff508f2d7a Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 13 May 2020 07:52:00 -0700 Subject: [PATCH 0490/1533] Bump open source llvm revision to 897d8ee5cd693e17f95a7e84194bca4c089a520b PiperOrigin-RevId: 311327327 Change-Id: Ib247eab7624ca88b999ccd871f0b1fb0f824ef1f --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 31389d7c459..7cc156a2985 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "1c44430e738ba83eefe6d56a245ee30649d8988d" - LLVM_SHA256 = "81ad47eaf74dfaea1befbe7b41facfd9bcee5ca3d5635325584dbabf4bf1fa5e" + LLVM_COMMIT = "897d8ee5cd693e17f95a7e84194bca4c089a520b" + LLVM_SHA256 = "994677daedf23bc93ce04f1a527c07c09b7fbbd0986d867b60bd6710057a40de" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From d11a1769c509b303e814ddbfcf3d60a07e993440 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 13 May 2020 08:16:59 -0700 Subject: [PATCH 0491/1533] Separate out creating arguments from PromoteResourcesToArguments in PromoteResourcesToArgsPass (NFC). This simplifies some logic where there is special handling depending on if the resource was originally an argument already present or a VarHandleOp. PiperOrigin-RevId: 311331121 Change-Id: I603f007c28558e3604c62fb991ac82ca560e143e --- .../tests/promote_resources_to_args.mlir | 76 ++-- .../transforms/promote_resources_to_args.cc | 357 ++++++++++-------- 2 files changed, 251 insertions(+), 182 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index e7f4873594b..eb6d40d20d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -1,11 +1,11 @@ // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-promote-resources-to-args | FileCheck %s -dump-input-on-failure // One resource, one read. The initial value of the resource is read. 
-// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD]]) // CHECK: return %[[PACK]] %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor @@ -19,8 +19,8 @@ func @main() -> tensor<2xf32> { // ----- // One resource, one write. The initial value of the resource is not read. -// CHECK-LABEL: func @main() -> (tensor {tf.resource_name = "x"}) -func @main() { +// CHECK-LABEL: func @main(%arg0: tensor) -> (tensor {tf.resource_name = "x"}) +func @main(%arg0: tensor) { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.AssignVariableOp" // CHECK: return %[[CONST]] @@ -33,12 +33,12 @@ func @main() { // ----- // One resource, two reads using different resource handles. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) - // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg0) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) // CHECK: return %[[PACK]] @@ -56,12 +56,12 @@ func @main() -> tensor<2xf32> { // ----- // Two resources, two reads using different resources. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}, %arg1: tensor {tf.resource_name = "y"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}, %arg2: tensor {tf.resource_name = "y"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) - // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg2) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) // CHECK: return %[[PACK]] @@ -79,12 +79,12 @@ func @main() -> tensor<2xf32> { // ----- // One resource with read and write. The initial value of the resource is read. 
-// CHECK-LABEL: func @main(%arg0: tensor {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<2xf32>, tensor) -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<2xf32>, tensor) +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.AssignVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %{{[0-9]*}}) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %{{[0-9]*}}) // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %[[ADD1]]) - // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%arg0, %[[ADD2]]) + // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%arg1, %[[ADD2]]) // CHECK: return %[[PACK]], %[[ADD1]] %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor @@ -102,8 +102,8 @@ func @main() -> tensor<2xf32> { // ----- // One resource with read and write. The initial value of the resource is not read. -// CHECK-LABEL: func @main() -> (tensor<2xf32>, tensor {tf.resource_name = "x"}) -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor) -> (tensor<2xf32>, tensor {tf.resource_name = "x"}) +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.AssignVariableOp" // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%[[CONST]], %[[CONST]]) @@ -138,8 +138,8 @@ func @cond_true(%arg0: tensor>>, %arg1: tensor) -> return %2 : tensor } -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outputs = "result"}} { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outputs = "result"}} { %0 = "tf.Const"() {value = dense<1.050000e+03> : tensor} : () -> tensor %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> %2 = "tf.ReadVariableOp"(%1) : (tensor>>) -> tensor @@ -157,10 +157,11 @@ func @main() -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outp // Tests resource passed in as an argument is not modified and not returned. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor -func @main(%arg0: tensor>>) { - %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor - // CHECK-NEXT: "tf.AddV2"(%[[ARG_0]], %[[ARG_0]]) +// CHECK-SAME: %arg0: tensor +// CHECK-SAME: %[[ARG_1:[a-z0-9]+]]: tensor +func @main(%arg0: tensor, %arg1: tensor>>) { + %0 = "tf.ReadVariableOp"(%arg1) : (tensor>>) -> tensor + // CHECK-NEXT: "tf.AddV2"(%[[ARG_1]], %[[ARG_1]]) %1 = "tf.AddV2"(%0, %0) : (tensor, tensor) -> tensor // CHECK-NEXT: return return @@ -171,9 +172,10 @@ func @main(%arg0: tensor>>) { // Tests resource passed in as an argument is modified but not returned. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> tensor -func @main(%arg0: tensor>>) { +func @main(%arg0: tensor>>, %arg1: tensor) { // CHECK-NEXT: %[[CONST:[a-z0-9]+]] = "tf.Const" %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () @@ -186,9 +188,10 @@ func @main(%arg0: tensor>>) { // Tests last resource assign is returned as a result. 
// CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> tensor -func @main(%arg0: tensor>>) { +func @main(%arg0: tensor>>, %arg1: tensor) { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} @@ -204,9 +207,10 @@ func @main(%arg0: tensor>>) { // returns the same value prior. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) -func @main(%arg0: tensor>>) -> tensor { +func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} @@ -221,9 +225,10 @@ func @main(%arg0: tensor>>) -> tensor { // Tests read interleaved between writes. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) -func @main(%arg0: tensor>>) -> tensor { +func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[CONST_0:[a-z0-9]+]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () @@ -312,9 +317,10 @@ func @main() { // Tests resource argument has users that are not ReadVariableOp or // AssignVariableOp. -// expected-error@+1 {{expects users of resource argument 0 to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp'}} +// expected-error@+1 {{expects users of resource argument 0 to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [tf.UnknownOp, tf.VarIsInitializedOp]}} func @main(%arg0: tensor>>) -> tensor { %0 = "tf.VarIsInitializedOp"(%arg0) : (tensor>>) -> tensor + %1 = "tf.UnknownOp"(%arg0) : (tensor>>) -> tensor return %0 : tensor } @@ -323,7 +329,7 @@ func @main(%arg0: tensor>>) -> tensor { // Tests VarHandleOp has users that are not removed. func @main() -> tensor { - // expected-error@+1 {{expects no uses but used by operations: tf.UnknownOp, tf.VarIsInitializedOp}} + // expected-error@+1 {{expects users to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [tf.UnknownOp, tf.VarIsInitializedOp]}} %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> %1 = "tf.VarIsInitializedOp"(%0) : (tensor>>) -> tensor %2 = "tf.UnknownOp"(%0) : (tensor>>) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index fa4fe461317..9001c00bebe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -47,11 +47,14 @@ limitations under the License. // . Dead functions have already been removed, as resource arguments in dead // functions can cause the pass to fail. 
+#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -73,31 +76,117 @@ constexpr char kResourceFunctionMsg[] = "expects function level resource argument"; constexpr char kInvalidResourceMsg[] = "expects resource to be a VarHandleOp or function argument"; +constexpr char kResourceNameArgAttr[] = "tf.resource_name"; -// Records the input argument index and the current live value for a resource -// variable. -// -// . If the input argument already exists or has been added, input_index is the -// index of the function, and live_value_or_type tracks the live value of the -// resource. -// -// . If the input argument has not been added in the pass, input_index is -// kInputUnassigned, live_value_or_type represents the type of the resource. -// (a) If this resource is read, add a new argument whose type is obtained -// from live_value_or_type, and input_index and live_value_or_type will be -// updated to reference the new argument. -// (b) If this resource is written, live_value_or_type will track the new -// value of the resource. input_index will remain to be kInputUnassigned. +// Collects names of users of a resource that are not `tf.ReadVariableOp` and +// not `tf.AssignVariableOp`. +llvm::SmallSet GetCompositeResourceUserNames( + Value resource) { + // SmallSet will use a vector when there is only one element and use std::set + // when there are more than one elements. This ensures that the operations in + // the error message are ordered. + llvm::SmallSet composite_users; + for (Operation* user : resource.getUsers()) + if (!llvm::isa(user) && + !llvm::isa(user)) + composite_users.insert(user->getName().getStringRef()); + + return composite_users; +} + +// Checks if `tf.VarHandleOp` has a valid resource subtype and its users are of +// `tf.ReadVariableOp` and `tf.AssignVariableOp` only. +mlir::LogicalResult ValidateVarHandle(TF::VarHandleOp var_handle_op) { + auto resource_type = + getElementTypeOrSelf(var_handle_op.getType()).cast(); + if (resource_type.getSubtypes().size() != 1) + return var_handle_op.emitOpError() + << "expects resource type to have one subtype, got " + << resource_type; + + auto composite_ops = GetCompositeResourceUserNames(var_handle_op); + if (!composite_ops.empty()) + return var_handle_op.emitOpError() + << "expects users to be 'tf.ReadVariableOp' or " + "'tf.AssignVariableOp', got [" + << llvm::join(composite_ops.begin(), composite_ops.end(), ", ") + << "]"; + + return success(); +} + +// Checks if resource argument has a valid resource subtype and its users are of +// `tf.ReadVariableOp` and `tf.AssignVariableOp` only. 
+mlir::LogicalResult ValidateResourceArgument(FuncOp function, + BlockArgument resource_arg, + TF::ResourceType resource_type) { + if (resource_type.getSubtypes().size() != 1) + return function.emitError() + << "expects resource type of argument " + << resource_arg.getArgNumber() << " to have one subtype, got " + << resource_type; + + auto composite_ops = GetCompositeResourceUserNames(resource_arg); + if (!composite_ops.empty()) + return function.emitError() + << "expects users of resource argument " + << resource_arg.getArgNumber() + << " to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [" + << llvm::join(composite_ops.begin(), composite_ops.end(), ", ") + << "]"; + + return success(); +} + +// Adds resource arguments for every unique (name) variable handle. Associated +// `tf.VarHandleOp` are removed from the function. Variable shared names are +// returned in `var_handle_shared_names` based on the ordering of added resource +// arguments. +mlir::LogicalResult PromoteVarHandlesToArguments( + FuncOp function, bool add_validation, + llvm::SmallVectorImpl* var_handle_shared_names) { + Block& block = function.front(); + auto func_type = function.getType(); + + auto func_arg_types = llvm::to_vector<4>(func_type.getInputs()); + llvm::SmallDenseMap var_arg_index_by_name; + for (auto var_handle_op : + llvm::make_early_inc_range(block.getOps())) { + if (add_validation && failed(ValidateVarHandle(var_handle_op))) + return failure(); + + llvm::StringRef name = var_handle_op.shared_nameAttr().getValue(); + auto it = var_arg_index_by_name.insert({name, func_arg_types.size()}); + if (it.second) { + var_handle_shared_names->emplace_back(name); + auto resource_type = var_handle_op.resource().getType(); + func_arg_types.push_back(resource_type); + var_handle_op.resource().replaceAllUsesWith( + block.addArgument(resource_type)); + } else { + var_handle_op.resource().replaceAllUsesWith( + block.getArgument(it.first->getSecond())); + } + var_handle_op.erase(); + } + + if (!var_handle_shared_names->empty()) + function.setType(FunctionType::get(func_arg_types, func_type.getResults(), + function.getContext())); + + return success(); +} + +// Records the current live value for a resource variable and whether a read or +// write on the variable occurred. 
struct ResourceInfo { - static constexpr int64_t kInputUnassigned = -1; - int64_t input_index; - llvm::PointerUnion live_value_or_type; + Value live_value = nullptr; + bool read = false; + bool write = false; }; -using ArgOrName = llvm::PointerUnion; -using ResourceMap = llvm::SmallDenseMap; - -LogicalResult PromoteResourcesToArguments(FuncOp function) { +LogicalResult PromoteResourcesToArguments( + FuncOp function, llvm::ArrayRef var_handle_shared_names) { Block& block = function.front(); auto return_op = llvm::dyn_cast_or_null(block.getTerminator()); @@ -105,82 +194,61 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { return function.emitError( "expects 'main' function to have a MLIR ReturnOp"); - ResourceMap resource_map; + llvm::SmallVector resources(function.getNumArguments()); auto argument_types = llvm::to_vector<4>(function.getType().getInputs()); + bool has_resources = false; + auto add_resource_argument = [&](BlockArgument arg, + TF::ResourceType resource_type) { + Type arg_type = resource_type.getSubtypes().front(); + arg.setType(arg_type); + resources[arg.getArgNumber()].live_value = arg; + argument_types[arg.getArgNumber()] = arg_type; + has_resources = true; + }; - // Loop through the resource arguments in the function and store a mapping - // from that argument to its index and itself as the current live value. - for (BlockArgument& func_arg : function.getArguments()) { + // Loop through the non `tf.VarHandleOp` resource arguments in the function, + // validate its uses and subtype, and store a mapping from that argument to + // itself as the current live value. + auto func_args = function.getArguments().take_front( + function.getNumArguments() - var_handle_shared_names.size()); + for (BlockArgument& func_arg : func_args) { auto resource_type = getElementTypeOrSelf(func_arg.getType()).dyn_cast(); if (!resource_type) continue; - if (resource_type.getSubtypes().size() != 1) - return function.emitError() - << "expects resource type of argument " << func_arg.getArgNumber() - << " to have one subtype, got " << resource_type; + if (failed(ValidateResourceArgument(function, func_arg, resource_type))) + return failure(); - for (auto* user : func_arg.getUsers()) - if (!llvm::isa(user) && - !llvm::isa(user)) - return function.emitError() - << "expects users of resource argument " - << func_arg.getArgNumber() - << " to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp'"; - - Type arg_type = resource_type.getSubtypes().front(); - func_arg.setType(arg_type); - resource_map[func_arg] = {func_arg.getArgNumber(), func_arg}; - argument_types[func_arg.getArgNumber()] = arg_type; + add_resource_argument(func_arg, resource_type); } - // Loop through the VarHandleOp in the function. When the first VarHandleOp - // for a resource variable is encountered, add an entry to the resource_map to - // record the information. Do not add a new function argument yet. - for (auto var_handle_op : block.getOps()) { - if (resource_map.count(var_handle_op.shared_nameAttr())) continue; - + // Loop through `tf.VarHandleOp` resource arguments in the function and store + // a mapping from that argument to itself as the current live value. No + // validations are necessary here as these arguments were validated prior to + // being added. 
+ auto var_handle_args = + function.getArguments().take_back(var_handle_shared_names.size()); + for (BlockArgument& var_handle_arg : var_handle_args) { auto resource_type = - getElementTypeOrSelf(var_handle_op.getType()).cast(); - if (resource_type.getSubtypes().size() != 1) - return var_handle_op.emitOpError() - << "expects resource type to have one subtype, got " - << resource_type; - - resource_map[var_handle_op.shared_nameAttr()] = { - ResourceInfo::kInputUnassigned, resource_type.getSubtypes().front()}; + getElementTypeOrSelf(var_handle_arg.getType()).cast(); + add_resource_argument(var_handle_arg, resource_type); } - if (resource_map.empty()) return success(); + if (!has_resources) return success(); // We initially assign the argument for a resource as the live value for the // resource. We then walk through the operations in the function in their // lexical order, to update the live value for the resource when we see a // store to the resource and replace reads of the resource with uses of its - // live value. For the reads, if the resource does not have a live value yet, - // we add a new argument and use it as the live value. + // live value. for (Operation& op : llvm::make_early_inc_range(block)) { if (auto read_op = llvm::dyn_cast(&op)) { if (auto func_arg = read_op.resource().dyn_cast()) { if (func_arg.getOwner() != &block) return read_op.emitOpError(kResourceFunctionMsg); - // resource_map[func_arg] is always a Value when func_arg is a - // BlockArgument. - read_op.value().replaceAllUsesWith( - resource_map[func_arg].live_value_or_type.get()); - } else if (auto var_handle_op = llvm::dyn_cast( - read_op.resource().getDefiningOp())) { - ResourceInfo& info = resource_map[var_handle_op.shared_nameAttr()]; - if (auto live_value = info.live_value_or_type.dyn_cast()) { - read_op.value().replaceAllUsesWith(live_value); - } else { - auto arg_type = info.live_value_or_type.get(); - BlockArgument arg = block.addArgument(arg_type); - info.input_index = argument_types.size(); - info.live_value_or_type = arg; - argument_types.push_back(arg_type); - read_op.value().replaceAllUsesWith(arg); - } + ResourceInfo& resource_info = resources[func_arg.getArgNumber()]; + resource_info.read = true; + read_op.value().replaceAllUsesWith(resource_info.live_value); } else { return read_op.emitOpError(kInvalidResourceMsg); } @@ -191,11 +259,9 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { if (func_arg.getOwner() != &block) return write_op.emitOpError(kResourceFunctionMsg); - resource_map[func_arg].live_value_or_type = write_op.value(); - } else if (auto var_handle_op = llvm::dyn_cast( - write_op.resource().getDefiningOp())) { - resource_map[var_handle_op.shared_nameAttr()].live_value_or_type = - write_op.value(); + ResourceInfo& resource_info = resources[func_arg.getArgNumber()]; + resource_info.write = true; + resource_info.live_value = write_op.value(); } else { return read_op.emitOpError(kInvalidResourceMsg); } @@ -206,67 +272,68 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { const int64_t num_results_before = function.getNumResults(); auto return_operands = llvm::to_vector<4>(return_op.getOperands()); - return_operands.reserve(num_results_before + resource_map.size()); auto result_types = llvm::to_vector<4>(return_op.getOperandTypes()); - result_types.reserve(num_results_before + resource_map.size()); - llvm::SmallVector, 4> output_only_resources; - output_only_resources.reserve(resource_map.size()); + llvm::SmallVector, 4> + output_only_resources; llvm::SmallVector, 
4> input_output_alias; - input_output_alias.reserve(resource_map.size()); - // Collect new return values and either (a) output-only resource attributes - // (if the resource is not promoted to an argument) or (b) mapping from - // resource input index to output alias (if the resource has been promoted to - // an argument). If the last live value is itself (argument), then that live - // value will not be returned as the resource is unmodified. - for (auto& resource : resource_map) { - int64_t input_index = resource.getSecond().input_index; - auto live_value = resource.getSecond().live_value_or_type.dyn_cast(); - if (input_index == ResourceInfo::kInputUnassigned) { - if (!live_value) continue; - - output_only_resources.push_back( - {return_operands.size(), resource.getFirst().dyn_cast()}); - } else { - // live_value is not nullptr because any input-assigned resource has a - // Value as live_value. - auto live_arg = live_value.dyn_cast(); - if (live_arg && live_arg.getOwner() == &block && - live_arg.getArgNumber() == input_index) - continue; - - input_output_alias.push_back({input_index, return_operands.size()}); - } - return_operands.push_back(live_value); - result_types.push_back(live_value.getType()); - } - - // Erase all VarHandleOp. - for (Operation& op : llvm::make_early_inc_range(function.front())) { - auto var_handle_op = llvm::dyn_cast(op); - if (!var_handle_op) continue; - if (!var_handle_op.use_empty()) { - // SmallSet will use a vector when there is only one element and use - // std::set when there are more than one elements. This ensures that - // the operations in the error message are ordered. - llvm::SmallSet unique_operations; - llvm::for_each( - var_handle_op.getOperation()->getUsers(), [&](Operation* user) { - unique_operations.insert(user->getName().getStringRef().str()); - }); - - return var_handle_op.emitOpError( - "expects no uses but used by operations: ") - << llvm::join(unique_operations.begin(), unique_operations.end(), - ", "); - } - - op.erase(); - } - - // Rewrite return if more results need to be returned by the function. + // Collect new return values for variable writes and either (a) output-only + // resource attributes (if the resource is not promoted to an argument) or (b) + // mapping from resource input index to output alias (if the resource has been + // promoted to an argument). Resource arguments that were originally + // `tf.VarHandleOp` but not read are collected and then removed. OpBuilder builder(return_op); - if (!output_only_resources.empty() || !input_output_alias.empty()) { + const int var_handles_start_idx = + function.getNumArguments() - var_handle_shared_names.size(); + int new_argument_index = 0; + llvm::SmallVector argument_indices_to_remove; + for (auto resource_and_index : llvm::enumerate(resources)) { + const auto& resource = resource_and_index.value(); + if (!resource.live_value) { + // Ignore non resource arguments. 
+ ++new_argument_index; + continue; + } + + const auto index = resource_and_index.index(); + const bool is_var_handle = index >= var_handles_start_idx; + if (resource.write) { + if (!is_var_handle || resource.read) { + input_output_alias.push_back( + {new_argument_index, return_operands.size()}); + } else if (is_var_handle) { + output_only_resources.push_back( + {return_operands.size(), + var_handle_shared_names[index - var_handles_start_idx]}); + } + return_operands.push_back(resource.live_value); + result_types.push_back(resource.live_value.getType()); + } + + if (is_var_handle && !resource.read) { + assert(block.getArgument(index).getUses().empty()); + argument_indices_to_remove.push_back(index); + } else { + if (is_var_handle) { + // Add resource_name attribute to VarHandleOp read. + function.setArgAttr( + new_argument_index, kResourceNameArgAttr, + builder.getStringAttr( + var_handle_shared_names[index - var_handles_start_idx])); + } + ++new_argument_index; + } + } + + // Remove unread var handle arguments. + for (int argument_index_to_remove : + llvm::reverse(argument_indices_to_remove)) { + block.eraseArgument(argument_index_to_remove); + argument_types.erase(argument_types.begin() + argument_index_to_remove); + } + + // Rewrite return if there are variable writes. + if (return_operands.size() > num_results_before) { builder.create(return_op.getLoc(), return_operands); return_op.erase(); } @@ -274,17 +341,10 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { // Update function argument and result types with new resource subtypes. function.setType(builder.getFunctionType(argument_types, result_types)); - // Add resource_name attribute to the input argument for the resources. - for (auto& resource : resource_map) { - if (auto attr = resource.getFirst().dyn_cast()) { - int64_t input_index = resource.getSecond().input_index; - if (input_index != ResourceInfo::kInputUnassigned) - function.setArgAttr(input_index, "tf.resource_name", attr); - } - } // Add resource_name attribute to the output for the resources. for (auto& resource : output_only_resources) - function.setResultAttr(resource.first, "tf.resource_name", resource.second); + function.setResultAttr(resource.first, kResourceNameArgAttr, + builder.getStringAttr(resource.second)); // Add aliasing_output attribute to the input argument for the resources that // are updated by the function. @@ -315,8 +375,11 @@ void PromoteResourcesToArgsPass::runOnOperation() { return signalPassFailure(); } + llvm::SmallVector var_handle_shared_names; if (failed(ResourceLiftingForFunctionalControlFlow(main_func)) || - failed(PromoteResourcesToArguments(main_func))) + failed(PromoteVarHandlesToArguments(main_func, /*add_validation=*/true, + &var_handle_shared_names)) || + failed(PromoteResourcesToArguments(main_func, var_handle_shared_names))) return signalPassFailure(); } From c87e5c70c282a73565138099da864b258cc3b2ff Mon Sep 17 00:00:00 2001 From: Yash Katariya Date: Wed, 13 May 2020 08:50:44 -0700 Subject: [PATCH 0492/1533] Fix batch_to_space's formatting. 
PiperOrigin-RevId: 311336470 Change-Id: I74d238ae04c0e9938b765ca4bc8fac44e7872866 --- tensorflow/python/ops/array_ops.py | 140 ++++++++++++++++------------- 1 file changed, 80 insertions(+), 60 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 1cb6fdbd726..0ee37e186fb 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3820,68 +3820,88 @@ def batch_to_space_v2(input, block_shape, crops, name=None): # pylint: disable= block_shape[0] - crops[0,0] - crops[0,1], ..., input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1], input_shape[M+1], ..., input_shape[N-1]] - Some Examples: - (1) For the following input of shape `[4, 1, 1, 1]`, - `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: - ```python - [[[[1]]], - [[[2]]], - [[[3]]], - [[[4]]]] - ``` - The output tensor has shape `[1, 2, 2, 1]` and value: - ``` x = [[[[1], [2]], - [[3], [4]]]] ``` - (2) For the following input of shape `[4, 1, 1, 3]`, - `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: - ```python - [[[1, 2, 3]], - [[4, 5, 6]], - [[7, 8, 9]], - [[10, 11, 12]]] - ``` - The output tensor has shape `[1, 2, 2, 3]` and value: - ```python - x = [[[[1, 2, 3], [4, 5, 6 ]], - [[7, 8, 9], [10, 11, 12]]]] - ``` - (3) For the following - input of shape `[4, 2, 2, 1]`, - `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: - ```python - x = [[[[1], [3]], [[ 9], [11]]], - [[[2], [4]], [[10], [12]]], - [[[5], [7]], [[13], [15]]], - [[[6], [8]], [[14], [16]]]] - ``` - The output tensor has shape `[1, 4, 4, 1]` and value: - ```python - x = [[[1], [2], [ 3], [ 4]], - [[5], [6], [ 7], [ 8]], - [[9], [10], [11], [12]], - [[13], [14], [15], [16]]] - ``` - (4) For the following input of shape - `[8, 1, 3, 1]`, - `block_shape = [2, 2]`, and `crops = [[0, 0], [2, 0]]`: - ```python - x = [[[[0], [ 1], [ 3]]], - [[[0], [ 9], [11]]], - [[[0], [ 2], [ 4]]], - [[[0], [10], [12]]], - [[[0], [ 5], [ 7]]], - [[[0], [13], [15]]], - [[[0], [ 6], [ 8]]], - [[[0], [14], [16]]]] - ``` - The output tensor has shape `[2, 2, 4, 1]` and value: - ```python - x = [[[[ 1], [ 2], [ 3], [ 4]], - [[ 5], [ 6], [ 7], [ 8]]], - [[[ 9], [10], [11], [12]], - [[13], [14], [15], [16]]]] ``` name: A name for the operation (optional). 
+ Examples: + + (1) For the following input of shape `[4, 1, 1, 1]`, + `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: + + ```python + [[[[1]]], + [[[2]]], + [[[3]]], + [[[4]]]] + ``` + + The output tensor has shape `[1, 2, 2, 1]` and value: + + ``` + x = [[[[1], [2]], + [[3], [4]]]] + ``` + + (2) For the following input of shape `[4, 1, 1, 3]`, + `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: + + ```python + [[[1, 2, 3]], + [[4, 5, 6]], + [[7, 8, 9]], + [[10, 11, 12]]] + ``` + + The output tensor has shape `[1, 2, 2, 3]` and value: + + ```python + x = [[[[1, 2, 3], [4, 5, 6 ]], + [[7, 8, 9], [10, 11, 12]]]] + ``` + + (3) For the following + input of shape `[4, 2, 2, 1]`, + `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: + + ```python + x = [[[[1], [3]], [[ 9], [11]]], + [[[2], [4]], [[10], [12]]], + [[[5], [7]], [[13], [15]]], + [[[6], [8]], [[14], [16]]]] + ``` + + The output tensor has shape `[1, 4, 4, 1]` and value: + + ```python + x = [[[1], [2], [ 3], [ 4]], + [[5], [6], [ 7], [ 8]], + [[9], [10], [11], [12]], + [[13], [14], [15], [16]]] + ``` + + (4) For the following input of shape + `[8, 1, 3, 1]`, + `block_shape = [2, 2]`, and `crops = [[0, 0], [2, 0]]`: + + ```python + x = [[[[0], [ 1], [ 3]]], + [[[0], [ 9], [11]]], + [[[0], [ 2], [ 4]]], + [[[0], [10], [12]]], + [[[0], [ 5], [ 7]]], + [[[0], [13], [15]]], + [[[0], [ 6], [ 8]]], + [[[0], [14], [16]]]] + ``` + + The output tensor has shape `[2, 2, 4, 1]` and value: + + ```python + x = [[[[ 1], [ 2], [ 3], [ 4]], + [[ 5], [ 6], [ 7], [ 8]]], + [[[ 9], [10], [11], [12]], + [[13], [14], [15], [16]]]] + ``` + Returns: A `Tensor`. Has the same type as `input`. """ From 13f445fb39c84526a838b59a3bf48031031543f2 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 13 May 2020 08:58:13 -0700 Subject: [PATCH 0493/1533] For python op generation: add dispatch to all generated ops (don't skip ops with VISIBILITY=HIDDEN) PiperOrigin-RevId: 311337843 Change-Id: Ibe9a4c31e3776e1b4dce23bd4f686025dbe5a31d --- tensorflow/python/framework/python_op_gen.cc | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 857cc7b6638..02b659528b0 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -371,9 +371,7 @@ void GenEagerPythonOp::HandleGraphMode( const string& function_setup, const std::vector& output_sizes) { strings::StrAppend(&result_, " # Add nodes to the TensorFlow graph.\n"); strings::StrAppend(&result_, function_setup); - if (api_def_.visibility() == ApiDef::VISIBLE) { - strings::StrAppend(&result_, " try:\n "); - } + strings::StrAppend(&result_, " try:\n "); strings::StrAppend( &result_, " _, _, _op, _outputs = _op_def_library._apply_op_helper(\n"); AddBodyNoReturn(strings::StrCat(" \"", op_def_.name(), "\", ")); @@ -690,9 +688,7 @@ void GenEagerPythonOp::AddEagerFunctionTeardown( bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( const string& parameters, const std::vector& output_sizes, const string& eager_not_allowed_error) { - if (api_def_.visibility() == ApiDef::VISIBLE) { - strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); - } + strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); AddExport(); AddDefLine(function_name_, parameters); @@ -955,8 +951,6 @@ void GenEagerPythonOp::AddEagerExecute(const string& indentation, } void GenEagerPythonOp::AddDispatch(const string& prefix) { - if (api_def_.visibility() != 
ApiDef::VISIBLE) return; - strings::StrAppend(&result_, prefix, "except (TypeError, ValueError):\n"); strings::StrAppend(&result_, prefix, " result = _dispatch.dispatch(\n"); AddBodyNoReturn(strings::StrCat(prefix, " ", function_name_, ", ")); From e8ac8116c76824c370cbbb9fc3a77b637d160106 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 13 May 2020 09:02:09 -0700 Subject: [PATCH 0494/1533] Add PromoteVarHandlesToArgsPass to lift tf.VarHandleOps from function bodies to function arguments. This reuses PromoteResourcesToArgsPass in terms of converting tf.VarHandleOps to function resource arguments. PiperOrigin-RevId: 311338598 Change-Id: Ic83f1234fa51b7536ce1d3f88ee89404e9ab6689 --- .../tests/promote_resources_to_args.mlir | 4 +- .../tests/promote_var_handles_to_args.mlir | 46 ++++++++++++++++ .../mlir/tensorflow/transforms/passes.h | 4 ++ .../transforms/promote_resources_to_args.cc | 55 ++++++++++++++++--- 4 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index eb6d40d20d9..60663f4bd4a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -276,7 +276,7 @@ func @main(%arg0: tensor>>, %arg1: tensor {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @no_args() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor + return +} + +// CHECK-LABEL: func @some_args +// CHECK-SAME: (%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @some_args(%arg0: tensor) { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor + return +} + +// CHECK-LABEL: func @unique_vars +// CHECK-SAME: (%arg0: tensor>> {tf.resource_name = "x"}, %arg1: tensor>> {tf.resource_name = "y"}) +// CHECK-NOT: "tf.VarHandleOp" +func @unique_vars() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "y"} : () -> tensor>> + return +} + +// CHECK-LABEL: func @duplicate_vars +// CHECK-SAME: (%arg0: tensor>> {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @duplicate_vars() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 0b1ff2beebb..81d0259d2d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -91,6 +91,10 @@ std::unique_ptr> CreateResourceDeviceInferencePass(); // of their aliasing output arguments. std::unique_ptr> CreatePromoteResourcesToArgsPass(); +// Creates a pass that promotes tf.VarHandleOp to resource arguments for all +// functions. +std::unique_ptr> CreatePromoteVarHandlesToArgsPass(); + // Marks function visibility using tf.entry_function specification. That is, // functions with tf.entry_function attributes are marked with public // visibility while the other functions are marked with private visibility. 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index 9001c00bebe..cece23b4750 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -78,6 +78,16 @@ constexpr char kInvalidResourceMsg[] = "expects resource to be a VarHandleOp or function argument"; constexpr char kResourceNameArgAttr[] = "tf.resource_name"; +// Checks if a function has only one block. +mlir::LogicalResult CheckSingleBlockFunction(FuncOp function) { + if (!hasSingleElement(function.getBlocks())) + return function.emitError() + << "expects function '" << function.getName() + << "' to have 1 block, got " << function.getBlocks().size(); + + return success(); +} + // Collects names of users of a resource that are not `tf.ReadVariableOp` and // not `tf.AssignVariableOp`. llvm::SmallSet GetCompositeResourceUserNames( @@ -191,8 +201,8 @@ LogicalResult PromoteResourcesToArguments( auto return_op = llvm::dyn_cast_or_null(block.getTerminator()); if (!return_op) - return function.emitError( - "expects 'main' function to have a MLIR ReturnOp"); + return function.emitError() << "expects function '" << function.getName() + << "' to have a MLIR ReturnOp"; llvm::SmallVector resources(function.getNumArguments()); auto argument_types = llvm::to_vector<4>(function.getType().getInputs()); @@ -369,11 +379,7 @@ void PromoteResourcesToArgsPass::runOnOperation() { // This routine should only be called when control flow operations are still // represented with TF IfOp and WhileOp operations. In this case, there should // be only one basic blocks in the MLIR representation. - if (!hasSingleElement(main_func.getBlocks())) { - main_func.emitError() << "expects 'main' function to have 1 block, got " - << main_func.getBlocks().size(); - return signalPassFailure(); - } + if (failed(CheckSingleBlockFunction(main_func))) return signalPassFailure(); llvm::SmallVector var_handle_shared_names; if (failed(ResourceLiftingForFunctionalControlFlow(main_func)) || @@ -383,15 +389,50 @@ void PromoteResourcesToArgsPass::runOnOperation() { return signalPassFailure(); } +class PromoteVarHandlesToArgsPass + : public PassWrapper> { + public: + void runOnOperation() override; +}; + +void PromoteVarHandlesToArgsPass::runOnOperation() { + ModuleOp module = getOperation(); + MLIRContext* context = module.getContext(); + for (auto function : module.getOps()) { + if (failed(CheckSingleBlockFunction(function))) return signalPassFailure(); + + llvm::SmallVector var_handle_shared_names; + PromoteVarHandlesToArguments(function, /*add_validation=*/false, + &var_handle_shared_names); + + // Add resource names for each `tf.VarHandleOp` that were promoted to + // resource arguments. 
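+ // (Descriptive note: the promoted resource arguments are appended after the + // function's original arguments, so the first promoted argument begins at the + // offset computed below.)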
+ const int var_handle_args_offset = + function.getNumArguments() - var_handle_shared_names.size(); + for (auto var_name_and_index : llvm::enumerate(var_handle_shared_names)) + function.setArgAttr(var_name_and_index.index() + var_handle_args_offset, + kResourceNameArgAttr, + StringAttr::get(var_name_and_index.value(), context)); + } +} + } // namespace std::unique_ptr> CreatePromoteResourcesToArgsPass() { return std::make_unique(); } +std::unique_ptr> CreatePromoteVarHandlesToArgsPass() { + return std::make_unique(); +} + static PassRegistration pass( "tf-promote-resources-to-args", "Promote resources reads/writes to function inputs/outputs."); +static PassRegistration var_handle_pass( + "tf-promote-var-handles-to-args", + "Promote tf.VarHandleOps to function arguments."); + } // namespace TF } // namespace mlir From 260cba17979ac92d2a365159b9f00dc1922aff2c Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 13 May 2020 09:07:33 -0700 Subject: [PATCH 0495/1533] Add support for global operation dispatchers. (This is intended for use by TF-internal classes only.) PiperOrigin-RevId: 311339670 Change-Id: Id4491f9152cec34aaa78a3d90797e0a6bbc1dea3 --- tensorflow/python/util/dispatch.py | 21 +++++++++ tensorflow/python/util/dispatch_test.py | 58 ++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py index e94e3345348..3868da14b44 100644 --- a/tensorflow/python/util/dispatch.py +++ b/tensorflow/python/util/dispatch.py @@ -39,6 +39,10 @@ from tensorflow.python.util import tf_inspect DISPATCH_ATTR = "_tf_dispatchers" +# OpDispatchers which should be used for all operations. +_GLOBAL_DISPATCHERS = [] + + class OpDispatcher(object): """Abstract base class for TensorFlow operator dispatchers. @@ -82,6 +86,19 @@ class OpDispatcher(object): getattr(op, DISPATCH_ATTR).append(self) +class GlobalOpDispatcher(object): + """Abstract base class for TensorFlow global operator dispatchers.""" + + NOT_SUPPORTED = OpDispatcher.NOT_SUPPORTED + + def handle(self, op, args, kwargs): + """Handle the specified operation with the specified arguments.""" + + def register(self): + """Register this dispatcher as a handler for all ops.""" + _GLOBAL_DISPATCHERS.append(self) + + def dispatch(op, *args, **kwargs): """Returns the result from the first successful dispatcher for a given op. @@ -101,6 +118,10 @@ def dispatch(op, *args, **kwargs): result = dispatcher.handle(args, kwargs) if result is not OpDispatcher.NOT_SUPPORTED: return result + for dispatcher in _GLOBAL_DISPATCHERS: + result = dispatcher.handle(op, args, kwargs) + if result is not OpDispatcher.NOT_SUPPORTED: + return result return OpDispatcher.NOT_SUPPORTED diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py index 89999fcf843..bd35c391924 100644 --- a/tensorflow/python/util/dispatch_test.py +++ b/tensorflow/python/util/dispatch_test.py @@ -45,6 +45,47 @@ def test_op(x, y, z): return x + (2 * y) + (3 * z) +class TensorTracer(object): + """An object used to trace TensorFlow graphs. + + This is an example class that is used to test global op dispatchers. The + global op dispatcher for TensorTracers is defined below. 
+ """ + + def __init__(self, name, args=None, kwargs=None): + self.name = name + self.args = args + self.kwargs = kwargs + + def __repr__(self): + if self.args is None and self.kwargs is None: + return self.name + else: + args = [str(x) for x in self.args] + args += sorted( + ["{}={}".format(name, x) for (name, x) in self.kwargs.items()]) + return "{}({})".format(self.name, ", ".join(args)) + + +class TensorTracerOpDispatcher(dispatch.GlobalOpDispatcher): + """Global op dispatcher for TensorTracer.""" + + def handle(self, op, args, kwargs): + # Dispatcher only applies if at least one arg is a TensorTracer. + if not (any(self.is_tensor_tracer_arg(x) for x in args) or + any(self.is_tensor_tracer_arg(x) for x in kwargs.values())): + return self.NOT_SUPPORTED + + return TensorTracer(op.__name__, args, kwargs) + + def is_tensor_tracer_arg(self, value): + if isinstance(value, TensorTracer): + return True + if isinstance(value, (list, tuple)): + if any(isinstance(x, TensorTracer) for x in value): + return True + + @test_util.run_all_in_graph_and_eager_modes class DispatchTest(test_util.TensorFlowTestCase): @@ -131,8 +172,21 @@ class DispatchTest(test_util.TensorFlowTestCase): r".*some_op \(from __main__\) is deprecated and will be " "removed in a future version.*") + def testGlobalDispatcher(self): + original_global_dispatchers = dispatch._GLOBAL_DISPATCHERS + try: + TensorTracerOpDispatcher().register() + + x = TensorTracer("x") + y = TensorTracer("y") + trace = math_ops.reduce_sum(math_ops.add(math_ops.abs(x), y), axis=3) + self.assertEqual( + str(trace), "reduce_sum(add(name=None, x=abs(x), y=y), axis=3)") + + finally: + # Clean up. + dispatch._GLOBAL_DISPATCHERS = original_global_dispatchers + if __name__ == "__main__": googletest.main() - - From 8d25e4bf616b7ae4ed101c580a23421616bf674c Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 13 May 2020 09:08:28 -0700 Subject: [PATCH 0496/1533] Disable failed test for now. PiperOrigin-RevId: 311339876 Change-Id: Ie6cbff49091892e39127e511ddfe5874ebe0576d --- tensorflow/python/keras/layers/cudnn_recurrent_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py index 9cf132d68df..d25851f6569 100644 --- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py +++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py @@ -267,6 +267,7 @@ class CuDNNV1OnlyTest(keras_parameterized.TestCase): self.assertEqual(len(layer.trainable_weights), 3) self.assertEqual(len(layer.non_trainable_weights), 0) + # TODO(b/156439419): Reenable after the bug is fixed. 
@parameterized.named_parameters( *test_util.generate_combinations_with_testcase_name( rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False], bidirectional=[True, False], implementation=[1, 2], model_nest_level=[1, 2], model_type=['seq', 'func'])) @test_util.run_v1_only('b/120911602, b/112083752') @test_util.run_gpu_only - def test_load_weights_between_noncudnn_rnn(self, rnn_type, to_cudnn, - bidirectional, implementation, - model_nest_level, model_type): + def DISABLED_test_load_weights_between_noncudnn_rnn( + self, rnn_type, to_cudnn, bidirectional, implementation, + model_nest_level, model_type): input_size = 10 timesteps = 6 input_shape = (timesteps, input_size) From 410c66fa83537ea07e5158df915744931f461bfa Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 13 May 2020 16:22:40 +0000 Subject: [PATCH 0497/1533] Explain the reason to call _truediv_python3/_div_python2 explicitly (not through the registered '/') Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 21 +++++++++++++++++++-- tensorflow/python/ops/math_ops_test.py | 3 +-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index b981af72e83..2c141483eb1 100--- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -439,13 +439,30 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: - if not (isinstance(x, ops.Tensor) or isinstance(y, ops.Tensor)): + # tf.math.divide computes Python-style division x / y. Because Python 2 + # and Python 3 have different semantics for `/` (__div__ vs. + # __truediv__), it would be natural to simply write `x / y`, since the + # '/' operator is already registered for tensors (see + # _OverrideBinaryOperatorHelper for details). + # However, when neither x nor y is a tensor, the '/' registered through + # _OverrideBinaryOperatorHelper does not take effect; Python's built-in + # '/' is used instead, and `tf.math.divide` would return a non-Tensor. + # For that reason we explicitly call _truediv_python3/_div_python2 when + # neither x nor y is a tensor; both helpers convert their inputs to + # tensors as needed. Without the explicit call we would get: + # >>> tf.divide(5, 2) + # 2.5 <= should be a tf.Tensor instead.
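+ # The accompanying change to math_ops_test.py exercises exactly this: + # x = math_ops.divide(5, 2) + # self.assertTrue(isinstance(x, ops.Tensor))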
+ if not (isinstance(x, ops.Tensor) or isinstance(y, ops.Tensor)): if sys.version_info.major < 3: - return _truediv_python2(x, y) + return _div_python2(x, y) else: return _truediv_python3(x, y) return x / y + @tf_export("math.multiply", "multiply") @dispatch.add_dispatch_support def multiply(x, y, name=None): diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index dab0ea88ba8..1debed531b6 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -498,10 +498,9 @@ class DivAndModTest(test_util.TensorFlowTestCase): def testWithPythonValue(self): # Test case for GitHub issue 39475: # https://github.com/tensorflow/tensorflow/issues/39475 - x = math_ops.divide(5, 2) + x = math_ops.divide(5, 2) self.assertTrue(isinstance(x, ops.Tensor)) - @test_util.run_all_in_graph_and_eager_modes class DivNoNanTest(test_util.TensorFlowTestCase): From 902ffede1f57b4a11c8e570e414a960392413878 Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Wed, 13 May 2020 09:48:37 -0700 Subject: [PATCH 0498/1533] Internal change PiperOrigin-RevId: 311347501 Change-Id: I2f98f650bc5119d0aa977bed9d4c5b1305523f86 --- tensorflow/compiler/mlir/BUILD | 5 - tensorflow/compiler/mlir/tfrt/BUILD | 183 ------- .../mlir/tfrt/analysis/analysis.proto | 25 - .../tfrt/analysis/compatibility_analysis.cc | 193 ------- .../tfrt/analysis/compatibility_analysis.h | 30 -- .../dialect_static_registration.cc | 31 -- .../tfrt/runtime_fallback/lower_tf_to_tfd.cc | 390 -------------- .../runtime_fallback_combine.cc | 80 --- .../runtime_fallback/runtime_fallback_ops.cc | 45 -- .../runtime_fallback/runtime_fallback_ops.h | 41 -- .../runtime_fallback/runtime_fallback_ops.td | 158 ------ .../mlir/tfrt/saved_model/saved_model.cc | 131 ----- .../mlir/tfrt/saved_model/saved_model.h | 78 --- tensorflow/compiler/mlir/tfrt/tests/BUILD | 19 - .../compiler/mlir/tfrt/tests/analysis/BUILD | 19 - .../analysis/compatibility_analysis.mlir | 65 --- .../compiler/mlir/tfrt/tests/basics.mlir | 19 - .../mlir/tfrt/tests/err_partial_convert.mlir | 9 - tensorflow/compiler/mlir/tfrt/tests/opt.mlir | 26 - .../mlir/tfrt/tests/tf_to_corert/BUILD | 19 - .../tfrt/tests/tf_to_corert/attributes.mlir | 21 - .../mlir/tfrt/tests/tf_to_corert/basic.mlir | 34 -- .../tests/tf_to_corert/derived_attrs.mlir | 21 - .../tests/tf_to_corert/device_conversion.mlir | 12 - .../mlir/tfrt/tests/tf_to_corert/fold.mlir | 12 - .../tests/tf_to_corert/string_tensor.mlir | 10 - .../tf_executor_to_corert_pipeline.mlir | 24 - .../mlir/tfrt/tests/tf_to_tfd_lowering.mlir | 111 ---- .../compiler/mlir/tfrt/tf_legalize_to_hex.cc | 163 ------ .../compiler/mlir/tfrt/transforms/optimize.cc | 122 ----- .../compiler/mlir/tfrt/transforms/passes.h | 64 --- .../mlir/tfrt/transforms/tf_to_corert.cc | 484 ------------------ 32 files changed, 2644 deletions(-) delete mode 100644 tensorflow/compiler/mlir/tfrt/BUILD delete mode 100644 tensorflow/compiler/mlir/tfrt/analysis/analysis.proto delete mode 100644 tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc delete mode 100644 
tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td delete mode 100644 tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/BUILD delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/basics.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/opt.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/transforms/optimize.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/transforms/passes.h delete mode 100644 tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 4a4d566f163..c0066ecda03 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -77,10 +77,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo", "//tensorflow/compiler/mlir/tfjs:tensorflow_js_passes", - "//tensorflow/compiler/mlir/tfrt:lower_tf_to_tfd_alwayslink", - "//tensorflow/compiler/mlir/tfrt:runtime_fallback_opdefs_alwayslink", - "//tensorflow/compiler/mlir/tfrt:tf_legalize_to_tfrt", - "//tensorflow/compiler/mlir/tfrt:tf_to_corert", ], ) @@ -152,7 +148,6 @@ tf_cc_binary( "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/compiler/mlir/tensorflow:translate_registration", "//tensorflow/compiler/mlir/tensorflow:translate_tf_dialect_op", - "//tensorflow/compiler/mlir/tfrt:compatibility_analysis", "//tensorflow/compiler/mlir/xla:xla_mlir_translate", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD deleted file mode 100644 index edcfc574452..00000000000 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ /dev/null @@ -1,183 +0,0 @@ -load("//third_party/mlir:tblgen.bzl", "gentbl") -load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library_cc") - -# TF to TFRT kernels conversion. 
-package( - default_visibility = [":friends"], - licenses = ["notice"], # Apache 2.0 -) - -package_group( - name = "friends", - packages = [ - "//learning/brain/experimental/tfrt/...", - "//tensorflow/compiler/...", - "//tensorflow/core/runtime_fallback/...", - "//tensorflow/core/tfrt/experimental/saved_model/...", - "//third_party/tf_runtime_google/...", - ], -) - -cc_library( - name = "tf_legalize_to_tfrt", - srcs = [ - "tf_legalize_to_hex.cc", - ], - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "@com_google_absl//absl/memory", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - ], - alwayslink = 1, -) - -filegroup( - name = "runtime_fallback_ops_td_files", - srcs = [ - "runtime_fallback/runtime_fallback_ops.td", - "@llvm-project//mlir:OpBaseTdFiles", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", - "@tf_runtime//:OpBaseTdFiles", - ], -) - -gentbl( - name = "runtime_fallback_ops_inc_gen", - tbl_outs = [ - ( - "-gen-op-decls", - "runtime_fallback_ops.h.inc", - ), - ( - "-gen-op-defs", - "runtime_fallback_ops.cc.inc", - ), - ], - tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "runtime_fallback/runtime_fallback_ops.td", - td_includes = [ - "external/tf_runtime/include", - ], - td_srcs = [ - ":runtime_fallback_ops_td_files", - ], -) - -cc_library( - name = "runtime_fallback_opdefs_alwayslink", - srcs = [ - "runtime_fallback/dialect_static_registration.cc", - "runtime_fallback/runtime_fallback_combine.cc", - "runtime_fallback/runtime_fallback_ops.cc", - ], - hdrs = [ - "runtime_fallback/runtime_fallback_ops.h", - ], - deps = [ - ":runtime_fallback_ops_inc_gen", - "@llvm-project//mlir:Dialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:SideEffects", - "@llvm-project//mlir:Support", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - "@tf_runtime//:tensor_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "lower_tf_to_tfd_alwayslink", - srcs = ["runtime_fallback/lower_tf_to_tfd.cc"], - deps = [ - "runtime_fallback_opdefs_alwayslink", - "//tensorflow/compiler/mlir/tensorflow", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Transforms", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "tf_to_corert", - srcs = [ - "transforms/optimize.cc", - "transforms/tf_to_corert.cc", - ], - hdrs = [ - "transforms/passes.h", - ], - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_tensor", - "//tensorflow/core:framework", - "//tensorflow/core/platform:tstring", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Transforms", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - "@tf_runtime//:core_runtime_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "saved_model", - srcs = [ - "saved_model/saved_model.cc", - ], - hdrs = [ - "saved_model/saved_model.h", - ], - deps = [ - ":tf_to_corert", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", - "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", - "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "//tensorflow/core/platform:status", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@llvm-project//mlir:IR", - 
"@llvm-project//mlir:Pass", - "@tf_runtime//:core_runtime", - "@tf_runtime//:hostcontext", - "@tf_runtime//:mlirtobef", - "@tf_runtime//:support", - "@tf_runtime//:tensor", - ], -) - -cc_library( - name = "compatibility_analysis", - srcs = [ - "analysis/compatibility_analysis.cc", - ], - hdrs = [ - "analysis/compatibility_analysis.h", - ], - deps = [ - ":analysis/analysis_proto_cc", - ":tf_to_corert", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/core:lib_proto_parsing", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Translation", - ], - alwayslink = 1, -) - -tf_proto_library_cc( - name = "analysis/analysis_proto", - srcs = ["analysis/analysis.proto"], - cc_api_version = 2, -) diff --git a/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto b/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto deleted file mode 100644 index 0716a243bb3..00000000000 --- a/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto +++ /dev/null @@ -1,25 +0,0 @@ -syntax = "proto3"; - -package mlir.tfrt; - -message CompatibilityAnalysisReportProto { - bool unknown_dialect = 1; - bool ref_variable = 2; - bool incompatible_variable = 3; - bool incompatible_attribute = 4; - bool control_flow_v1 = 5; - - // TODO(chky): add more checks, eg. tensor datatypes. -} - -message CompatibilityAnalysisProto { - CompatibilityAnalysisReportProto summary = 1; - - message OpInfo { - int32 count = 1; - - CompatibilityAnalysisReportProto report = 2; - } - - map ops = 2; -} diff --git a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc b/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc deleted file mode 100644 index 7e9c5544c25..00000000000 --- a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h" - -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Translation.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/core/platform/protobuf.h" - -namespace tensorflow { -namespace { - -class CompatibilityAnalysis { - public: - void AnalyzeOperation(mlir::Operation* op); - - const mlir::tfrt::CompatibilityAnalysisProto& GetResult() const { - return analysis_; - } - - private: - // Return true if some attributes in the op are not supported. 
- bool AnalyzeOpAttributes(mlir::Operation* op); - // Return true if this op has unsupported operation (eg. mutate) on resource - // variables. - bool AnalyzeVariable(mlir::Operation* op); - - void UpdateReport( - const mlir::tfrt::CompatibilityAnalysisReportProto& new_report, - mlir::tfrt::CompatibilityAnalysisReportProto* old_report); - - mlir::tfrt::CompatibilityAnalysisProto analysis_; -}; - -void CompatibilityAnalysis::AnalyzeOperation(mlir::Operation* op) { - // Skip the standard ops that are allowed in tf dialect. - if (llvm::isa(op) || llvm::isa(op) || - llvm::isa(op) || llvm::isa(op)) - return; - - auto op_name = op->getName(); - - std::string name = op_name.getStringRef().str(); - - mlir::tfrt::CompatibilityAnalysisReportProto op_report; - - if (op_name.getDialect() == - mlir::TF::TensorFlowDialect::getDialectNamespace()) { - // Analyze op attributes. - if (AnalyzeOpAttributes(op)) op_report.set_incompatible_attribute(true); - - // Analyze variable operations. - if (AnalyzeVariable(op)) op_report.set_incompatible_variable(true); - - // Reference variable is not supported. - if (op_name.getStringRef() == "tf.VariableV2") - op_report.set_ref_variable(true); - } else if (op_name.getDialect() == "tf_executor") { - if (llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op)) { - op_report.set_control_flow_v1(true); - } else { - // Skip the rest of the tf_executor ops as they can be handled. - // - // TODO(chky): consider adding whitelist here. - return; - } - } else { - // Mark unknown dialect in the report. - op_report.set_unknown_dialect(true); - } - - auto& op_info = (*analysis_.mutable_ops())[name]; - op_info.set_count(op_info.count() + 1); - - UpdateReport(op_report, op_info.mutable_report()); - UpdateReport(op_report, analysis_.mutable_summary()); -} - -bool CompatibilityAnalysis::AnalyzeOpAttributes(mlir::Operation* op) { - // tf.Const gets special handling so it is always compatible. - if (llvm::isa(op)) return false; - - // TODO(chky): Derived attributes should be also analyzed here. - for (auto attr : op->getAttrs()) { - if (attr.first.strref() == "_output_shapes") continue; - if (attr.first.strref() == "_class") continue; - - // Symbol attributes (eg. function names) is currently not supported. - // - // TODO(chky): CoreRT should ideally support function call operatoins. - // Remove this condition once that is implemented. - if (attr.second.isa()) return true; - - // Currently only tensors of simple dtypes (i1, i32, i64, f32, f64) are - // supported. - if (auto elements_attr = attr.second.dyn_cast()) { - if (!elements_attr.isa()) return true; - auto element_type = elements_attr.getType().getElementType(); - if (element_type.isa()) return true; - } - - // Currently only arrays of simple element types (i1, i32, i64, f32, f64) - // are supported. - if (auto array_attr = attr.second.dyn_cast()) { - if (array_attr.size() > 0) { - if (array_attr[0].isa()) return true; - - if (array_attr[0].isa()) return true; - - if (array_attr[0].isa()) return true; - } - } - } - return false; -} - -bool CompatibilityAnalysis::AnalyzeVariable(mlir::Operation* op) { - // Currently only supported variable op is ReadVariableOp. 
- if (llvm::isa(op)) return false; - - for (auto value : op->getOperands()) { - auto type = value.getType(); - if (auto tensor_type = type.dyn_cast()) { - auto element_type = tensor_type.getElementType(); - if (element_type.isa()) return true; - } - } - - return false; -} - -void CompatibilityAnalysis::UpdateReport( - const mlir::tfrt::CompatibilityAnalysisReportProto& new_report, - mlir::tfrt::CompatibilityAnalysisReportProto* old_report) { - if (new_report.unknown_dialect()) old_report->set_unknown_dialect(true); - - if (new_report.ref_variable()) old_report->set_ref_variable(true); - - if (new_report.incompatible_variable()) - old_report->set_incompatible_variable(true); - - if (new_report.incompatible_attribute()) - old_report->set_incompatible_attribute(true); - - if (new_report.control_flow_v1()) old_report->set_control_flow_v1(true); -} - -} // namespace - -mlir::tfrt::CompatibilityAnalysisProto AnalyzeTFCompatibility( - mlir::ModuleOp op) { - CompatibilityAnalysis analysis; - op.walk([&analysis](mlir::Operation* op) { analysis.AnalyzeOperation(op); }); - return analysis.GetResult(); -} - -static mlir::TranslateFromMLIRRegistration registration( - "analyze-tf-for-tfrt", [](mlir::ModuleOp op, llvm::raw_ostream& output) { - auto analysis_proto = AnalyzeTFCompatibility(op); - std::string text_proto; - if (tensorflow::protobuf::TextFormat::PrintToString(analysis_proto, - &text_proto)) { - output << text_proto; - return mlir::success(); - } - - return mlir::failure(); - }); - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h b/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h deleted file mode 100644 index 7250a9493bc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COMPATIBILITY_ANALYSIS_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COMPATIBILITY_ANALYSIS_H_ - -#include "mlir/IR/Module.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tfrt/analysis/analysis.pb.h" - -namespace tensorflow { - -// Analyze a MLIR module in tf dialect. -mlir::tfrt::CompatibilityAnalysisProto AnalyzeTFCompatibility( - mlir::ModuleOp op); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COMPATIBILITY_ANALYSIS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc deleted file mode 100644 index 7632b0546fa..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -//===- dialect_static_registration.cc -------------------------------------===// -// -// This file registers the RuntimeFallbackDialect. -// -//===----------------------------------------------------------------------===// - -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" - -namespace mlir { -namespace tfd { - -// Static initialization for dialect registration. -static DialectRegistration tfd_registration; - -} // namespace tfd -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc deleted file mode 100644 index 5f831c9ef6a..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc +++ /dev/null @@ -1,390 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include -#include - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Transforms/DialectConversion.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" -#include "tfrt/basic_kernels/opdefs/basic_kernels.h" - -namespace mlir { -namespace { - -constexpr const char kTmpLoweringCastOpName[] = "tmp_lowering_cast_op"; - -static Type GetChainType(MLIRContext* context) { - auto hexDialect = Identifier::get("hex", context); - return OpaqueType::get(hexDialect, "chain", context); -} - -static Type GetTfdTensorType(MLIRContext* context) { - auto tfdDialect = Identifier::get("tfd", context); - return OpaqueType::get(tfdDialect, "tf_tensor", context); -} - -struct TfToTfdLoweringPass - : public PassWrapper> { - void runOnOperation() final; -}; - -class FuncOpSignatureConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - FuncOp funcOp, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) const override { - auto ctx = funcOp.getContext(); - auto chain_type = GetChainType(ctx); - auto tfd_tensor_type = GetTfdTensorType(ctx); - FunctionType type = funcOp.getType(); - - // Convert function return results. The lowered function is expected to - // return a chain as the first return result. For each original TF tensor, - // the lowered function returns a TFD tensor instead. - llvm::SmallVector converted_results; - if (type.getNumResults() > 0) { - // Add a chain as the first return result. - converted_results.push_back(chain_type); - - // Convert the original TF tensor return results. - for (unsigned i = 0, e = type.getNumResults(); i != e; ++i) { - if (auto tensor_type = type.getResult(i).dyn_cast()) { - // Each TF tensor is converted to a TFD tensor. - converted_results.push_back(tfd_tensor_type); - } else { - // Only handle TF tensor conversion for now. - return failure(); - } - } - } - - // Create the new function signature. The lowered function is expected to - // take a Chain as the first argument. Then for each TF tensor argument, - // expect a TFD tensor argument instead. - TypeConverter::SignatureConversion new_func_sig(type.getNumInputs() + 1); - if (type.getNumInputs() > 0) { - // Add the first chain argument. - new_func_sig.addInputs(chain_type); - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - // For each original TF tensor type, convert it to one TFD tensor type. - if (auto tensor_type = type.getInput(i).dyn_cast()) { - new_func_sig.addInputs(i, {tfd_tensor_type}); - } else { - // Only handle TF tensor argument for now. - return failure(); - } - } - } - // Each function has a single region. In general, each region can have - // multiple blocks. Assume that all TF-dialect functions only have a - // single entry block. - Block* entry = &funcOp.front(); - - // Tell the rewriter to convert the region signature. After this, the - // function region takes the new function signature, which means index - // shifts by one. - Block* convertedEntry = - rewriter.applySignatureConversion(&funcOp.getBody(), new_func_sig); - - { - // Generate the "fake" mapping ops. 
The insertion guard restores rewriter - // insertion pointer when it gets out of scope. - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(convertedEntry); - // Replace block arguments. For example, - // func @example(i64, i1) -> i64 { - // ^bb0(%a: i64, %cond: i1): // replacing this. - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - // For each original block argument, create a fake op that takes the - // input the input chain argument to the function, and the tfd tensor - // argument, and returns the original TF tensor input. Note that the - // function signature has been replaced, so entry->getArgument(0) is the - // input chain. And we need to add 1 to index to get the original - // argument. - Type orig_input = type.getInput(i); - OperationState tmp_lowering_cast_op( - funcOp.getLoc(), kTmpLoweringCastOpName, - {convertedEntry->getArgument(0), - convertedEntry->getArgument(i + 1)}, - orig_input, {}); - Value repl_value = - rewriter.createOperation(tmp_lowering_cast_op)->getResult(0); - // Replace original uses of TF tensor block argument with the result of - // the fake op. This sets up the lowering passes for individual ops - // which at this point still expect TF tensors rather than TFD tensor - // inputs. - rewriter.replaceUsesOfBlockArgument(entry->getArgument(i), repl_value); - } - } - - // Create a new function op with an updated signature. - auto new_func_op = rewriter.cloneWithoutRegions(funcOp); - rewriter.inlineRegionBefore(funcOp.getBody(), new_func_op.getBody(), - new_func_op.end()); - new_func_op.setType(FunctionType::get(new_func_sig.getConvertedTypes(), - converted_results, ctx)); - // Remove the old function op. - rewriter.eraseOp(funcOp); - return success(); - } -}; - -// Lower each TF op to a tfd.delegate_kernel op. For example, -// -// %1 = "tf.ReadVariableOp"(%arg) { -// dtype = "tfdtype$DT_FLOAT" -// } : (tensor<*x!tf.resource>) -> tensor<10xf32> -// -// would be lowered to -// -// %1:2 = "tfd.delegate_kernel"(%chain_in, %arg) { -// _name = "tf.ReadVariableOp", -// attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" -// } : (!hex.chain, !tfd.tf_tensor) -> (!hex.chain, !tfd.tf_tensor) -// -// Each tfd.delegate_kernel op expects a chain as the first input. This chain -// may come from the first function argument or the previous converted op -// output. The rest of inputs would be converted to a tfd tensor input. -// Each tfd.delegate_kernel op returns a chain as the first output. Each -// original output TensorType is converted a tfd tensor type. -// The TF op name becomes an _name attribute. Each TF attribute is lowered to -// two TFD attributes, one for the name, one for the type and value. -// -// Because delegate_kernel ops are threaded through chains, we lowered to a -// serial execution plan. -// TODO(zhangqiaorjc): Do analysis to allow concurrent execution. -template -class TFOpConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - TF_OP op, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) // NOLINT(google-runtime-references - const override { - auto ctx = op.getContext(); - // Handle new op operands. - // Delegate kernel expects the first argument to be a chain, followed by - // original arguments to the target TF op converted to TFD tensors. 
- llvm::SmallVector delegate_kernel_op_operands; - int num_new_operands = op.getOperation()->getNumOperands() + 1; - delegate_kernel_op_operands.reserve(num_new_operands); - - // Get the input chain from the previous delegate_kernel op or first block - // argument. - Value chain_input = nullptr; - auto* block = op.getOperation()->getBlock(); - assert(block->isEntryBlock() && "only supports a single block"); - // Find a previous delegate_kernel op for its output chain. - auto* prev_op = op.getOperation()->getPrevNode(); - while (prev_op != nullptr && !isa(prev_op)) { - prev_op = prev_op->getPrevNode(); - } - if (prev_op != nullptr) { - // There is another delegate kernel op before this op. - auto prev_op_result_0 = prev_op->getResult(0); - assert(prev_op_result_0.getType() == GetChainType(ctx)); - chain_input = prev_op_result_0; - } else { - // This op is the first delegate kernel op in a block. - auto arg_0 = block->getArgument(0); - assert(arg_0.getType() == GetChainType(ctx)); - chain_input = arg_0; - } - delegate_kernel_op_operands.push_back(chain_input); - - // Convert each TensorType operand to the corresponding TFD tensor operand. - for (auto operand : operands) { - auto* tmp_lowering_cast_op = operand.getDefiningOp(); - assert(tmp_lowering_cast_op->getName().getStringRef() == - kTmpLoweringCastOpName); - delegate_kernel_op_operands.push_back( - tmp_lowering_cast_op->getOperand(1)); - } - - // Handle new op results. - llvm::SmallVector delegate_kernel_op_results; - // The first output is a chain. - delegate_kernel_op_results.push_back(GetChainType(ctx)); - // For each original output, there is a corresponding TFD tensor output. - for (int i = 0, e = op.getOperation()->getNumResults(); i != e; ++i) { - delegate_kernel_op_results.push_back(GetTfdTensorType(ctx)); - } - - // Convert TF attribute to TFD attribute. - llvm::SmallVector delegate_kernel_op_attributes; - NamedAttribute op_name_attr(Identifier::get("_name", ctx), - StringAttr::get(op.getOperationName(), ctx)); - delegate_kernel_op_attributes.push_back(op_name_attr); - - int attr_idx = 0; - for (const NamedAttribute& tf_attr : op.getAttrs()) { - // Small std::string benefits from small string optimization in libc++. - NamedAttribute attr_name( - Identifier::get("attr" + std::to_string(attr_idx) + "_name", ctx), - StringAttr::get(tf_attr.first, ctx)); - NamedAttribute attr_value( - Identifier::get("attr" + std::to_string(attr_idx) + "_value", ctx), - tf_attr.second); - delegate_kernel_op_attributes.push_back(attr_name); - delegate_kernel_op_attributes.push_back(attr_value); - attr_idx++; - } - - // Replace the TF op with TFD delegate kernel op. - auto new_op = rewriter.create( - op.getLoc(), delegate_kernel_op_results, delegate_kernel_op_operands, - delegate_kernel_op_attributes); - - // Create lowering cast ops for non-chain results. - llvm::SmallVector lowering_cast_ops_values; - // Skip the first result. It's a chain which has no current users. 
- for (int i = 1, e = new_op.getOperation()->getNumResults(); i != e; ++i) { - Type orig_input = op.getType(); - OperationState tmp_lowering_cast_op(new_op.getLoc(), - kTmpLoweringCastOpName, - {new_op.getOperation()->getResult(0), - new_op.getOperation()->getResult(i)}, - {orig_input}, {}); - Value repl_value = - rewriter.createOperation(tmp_lowering_cast_op)->getResult(0); - lowering_cast_ops_values.push_back(repl_value); - } - - rewriter.replaceOp(op, lowering_cast_ops_values); - return success(); - } -}; - -class ReturnOpConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - // Replace std.return with hex.return. The first result is always a chain and - // each original TF tensor result is converted to a TFD tensor. - LogicalResult matchAndRewrite( - ReturnOp return_op, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) const override { - auto ctx = return_op.getContext(); - Value chain_output = nullptr; - llvm::SmallVector new_return_op_operands; - new_return_op_operands.reserve(return_op.getNumOperands() + 1); - // Convert each TF tensor operand to the corresponding TFD tensor operand. - for (auto operand : operands) { - auto* tmp_lowering_cast_op = operand.getDefiningOp(); - if (tmp_lowering_cast_op->getName().getStringRef() != - kTmpLoweringCastOpName) { - assert(false && "unexpected producer of operand"); - } - if (chain_output == nullptr) { - // Get the input chain from the previous op or first block argument. - auto* block = return_op.getOperation()->getBlock(); - if (!block->isEntryBlock()) { - assert(false && "only supports a single block"); - } - // Find a previous delegate_kernel op for its output chain. - auto* prev_op = return_op.getOperation()->getPrevNode(); - while (prev_op != nullptr && !isa(prev_op)) { - prev_op = prev_op->getPrevNode(); - } - if (prev_op != nullptr) { - // There is another delegate kernel op before this op. - auto prev_op_result_0 = prev_op->getResult(0); - if (prev_op_result_0.getType() != GetChainType(ctx)) { - assert(false && - "delegate kernel must produce chain as the first result"); - } - chain_output = prev_op_result_0; - } else { - // This op is the first delegate kernel op in a block. - auto arg_0 = block->getArgument(0); - if (arg_0.getType() != GetChainType(ctx)) { - assert(false && "first block argument must be a chain"); - } - chain_output = arg_0; - } - new_return_op_operands.push_back(chain_output); - } - new_return_op_operands.push_back(tmp_lowering_cast_op->getOperand(1)); - } - // Replace the old std.return op with the new hex.return op. - rewriter.create(return_op.getLoc(), - new_return_op_operands); - rewriter.eraseOp(return_op); - - return success(); - } -}; - -void TfToTfdLoweringPass::runOnOperation() { - ConversionTarget target(getContext()); - - // Make tmp_lowering_cast_op legal for conversion. But delete them after the - // passes. - OperationName tmp_lowering_cast_op_name(kTmpLoweringCastOpName, - &getContext()); - target.setOpAction(tmp_lowering_cast_op_name, - ConversionTarget::LegalizationAction::Legal); - - // target.addLegalDialect(); - target.addLegalDialect(); - - target.addDynamicallyLegalOp([](FuncOp function) { - // Returns true if this function is legal, i.e. all inputs and outputs are - // TFRT types. 
- FunctionType type = function.getType(); - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - if (type.getInput(i).isa()) return false; - } - for (unsigned i = 0, e = type.getNumResults(); i != e; ++i) { - if (type.getResult(i).isa()) return false; - } - return true; - }); - - target.addLegalOp(); - - OwningRewritePatternList patterns; - patterns.insert, - TFOpConversion, TFOpConversion, - TFOpConversion, TFOpConversion, - ReturnOpConversion>(&getContext()); - - if (failed(applyPartialConversion(getOperation(), target, patterns))) - signalPassFailure(); - - // Delete the tmp_lowering_cast_op's since they are illegal. - getOperation().walk([&tmp_lowering_cast_op_name](Operation* op) { - if (op->getName() == tmp_lowering_cast_op_name) op->erase(); - }); -} - -} // namespace -} // namespace mlir - -static mlir::PassRegistration pass( - "tf-to-tfd-lowering", "Lowers the TF dialect to Runtime Fallback dialect."); diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc deleted file mode 100644 index 4fd57af55cc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -//===----------------------------------------------------------------------===// -// -// This file implements a set of simple combiners for optimizing operations in -// the Runtime Fallback dialect. -// -//===----------------------------------------------------------------------===// - -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" - -// This optimizes the following scenario: -// %tft0, %c2 = "tfd.move_dht_to_tft"(%dht0, %c1) -// : (!dht.host_tensor, !hex.chain) -> (!tfd.tf_tensor, !hex.chain) -// %dht1, %c3 = "tfd.convert_tft_to_dht"(%tft0, %c2) -// : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) -// some_op %dht1, %c3 -// -// becomes -// some_op %dht0, %c1 - -struct SimplifyDoubleConversion - : public mlir::OpRewritePattern { - // We register this pattern to match every tfd.move_dht_to_tft op. - // The "benefit" is used by the framework to order the patterns and process - // them in order of profitability. - explicit SimplifyDoubleConversion(mlir::MLIRContext* context) - : mlir::OpRewritePattern(context, - /*benefit=*/1) {} - - // This method attempts to match a pattern and rewrite it. The rewriter - // argument is the orchestrator of the sequence of rewrites. The pattern is - // expected to interact with it to perform any changes to the IR from here. - mlir::LogicalResult matchAndRewrite( - mlir::tfd::ConvertTftToDhtOp op, - mlir::PatternRewriter& rewriter) const override { - // Look through the inputs of the ConvertTftToDhtOp. 
- mlir::Value convert_op_input_0 = op.getOperand(0); - mlir::Value convert_op_input_1 = op.getOperand(1); - mlir::tfd::MoveDhtToTftOp move_input_op_0 = - llvm::dyn_cast_or_null( - convert_op_input_0.getDefiningOp()); - mlir::tfd::MoveDhtToTftOp move_input_op_1 = - llvm::dyn_cast_or_null( - convert_op_input_1.getDefiningOp()); - - // The inputs should be MoveDhtToTftOp. - if (!move_input_op_0 || !move_input_op_1) return mlir::failure(); - // Both inputs are the same MoveDhtToTftOp. - if (move_input_op_0 != move_input_op_1) return mlir::failure(); - - // Use the rewriter to replace the ConvertTftToDhtOp's users with the - // operands of MoveDhtToTftOp. - rewriter.replaceOp( - op, {move_input_op_0.getOperand(0), move_input_op_0.getOperand(1)}); - return mlir::success(); - } -}; - -// Register rewrite pattern as "canonicalization" patterns on the MoveDhtToTftOp -// so that they can be picked up by the Canonicalization framework. -void mlir::tfd::ConvertTftToDhtOp::getCanonicalizationPatterns( - OwningRewritePatternList& results, MLIRContext* context) { - results.insert(context); -} diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc deleted file mode 100644 index 9c69154673b..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" - -namespace mlir { -namespace tfd { - -//===----------------------------------------------------------------------===// -// TfrtDelegate Dialect -//===----------------------------------------------------------------------===// - -RuntimeFallbackDialect::RuntimeFallbackDialect(MLIRContext *context) - : Dialect(/*name=*/"tfd", context) { - allowUnknownTypes(); - - allowUnknownOperations(); - - addOperations< -#define GET_OP_LIST -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.cc.inc" - >(); -} - -//===----------------------------------------------------------------------===// -// TableGen'd op method definitions -//===----------------------------------------------------------------------===// - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.cc.inc" - -} // namespace tfd -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h deleted file mode 100644 index 009d565e40d..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file defines the operations used in the Runtime Fallback dialect. - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ - -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project - -namespace mlir { -namespace tfd { - -// Dialect for TFRT delegate operations. -class RuntimeFallbackDialect : public Dialect { - public: - explicit RuntimeFallbackDialect(MLIRContext* context); - static StringRef getDialectNamespace() { return "tfd"; } -}; - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.h.inc" - -} // namespace tfd -} // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td deleted file mode 100644 index c33c6f8d73d..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This is the definition file for the Runtime Fallback Dialect. - -#ifdef TFRT_DELEGATE_DIALECT -#else -#define TFRT_DELEGATE_DIALECT - -include "tfrt/tfrt_op_base.td" -include "mlir/Interfaces/SideEffectInterfaces.td" - -//===----------------------------------------------------------------------===// -// Type definitions -//===----------------------------------------------------------------------===// -def TfTensorType : OpaqueType<"tfd", "tf_tensor", "!tfd.tf_tensor type">; - -//===----------------------------------------------------------------------===// -// Runtime Fallback Dialect definitions -//===----------------------------------------------------------------------===// - -def RuntimeFallback_Dialect : Dialect { - let name = "tfd"; - - let description = [{ - The Runtime Fallback dialect. - - This dialect contains operations to run existing TF kernels on TFRT by - invoking TF Eager API. 
- }]; - - let cppNamespace = "tfd"; -} - -//===----------------------------------------------------------------------===// -// Runtime Fallback Dialect Ops definitions -//===----------------------------------------------------------------------===// - -// Base class for the operation in this dialect. -class RuntimeFallbackDialect_Op traits = []> : - Op { } - -def InitEagerContextOp : RuntimeFallbackDialect_Op<"init_eager_context"> { - let summary = "eager context initialization operation"; - let description = [{ - The "tfd.init_eager_context" operation takes an input chain, creates and - initializes the TF EagerContext and returns an output chain. - - Example: - %c1 = "tfd.init_eager_context"(%c0): (!hex.chain) -> !hex.chain - }]; - - let arguments = (ins ChainType); - let results = (outs ChainType); -} - -def DelegateKernelOp : RuntimeFallbackDialect_Op<"delegate_kernel"> { - let summary = "delegate kernel operation"; - let description = [{ - The "tfd.delegate_kernel" operation takes an input chain, and arbitrary - number of input arguments, and runs a specified TF op via TFE C API. It - returns an output chain and variable number of outputs from the TF op. - - The input arguments and attributes are passed to the TF op. The ouputs are - outputs of the TF op. - - Note that `_name` is a required attribute specifying the TF op to run. - TFRT attributes are sorted alphabetically, passed in as positional - attributes to the TFRT kernel, rather than as named attributes. - - Example: - To run "tf.MatMul" op, which has two boolean attributes, - 1. Set _name = "MatMul" - 2. For each TF attribute, split it into two attributes, one for name of - the TF attribute, and the other for the type and value of the - attribute value. Attribute value is a string with the format of - "type$val", where type can be "bool", "string", "tfdtype", "tfshape", - "tftensor". - The value serialization format can be found in attr_util.h. - - %out_c, %out_tensor = "tfd.delegate_kernel"( - %in_c, %in1_tensor, %in2_tensor) { - _name = "MatMul", - attr1_name = "transpose_a", attr1_value = "bool$false", - attr2_name = "transpose_b", attr2_value = "bool$false" - } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) -> ( - !hex.chain, !tfd.tf_tensor) - }]; - - let arguments = (ins ChainType, Variadic); - let results = (outs ChainType, Variadic); -} - -def PrintTftOp : RuntimeFallbackDialect_Op<"print_tft"> { - let summary = "print TF tensor operation"; - let description = [{ - The "tfd.print_tft" operation prints the input TF tensor. It takes an input - TF tensor to be printed and an input chain, and returns an output chain. - - Example: - %c1 = "tfd.print_tft"(%t, %c) : (!tfd.tf_tensor, !hex.chain) -> !hex.chain - - }]; - - let arguments = (ins TfTensorType, ChainType); - let results = (outs ChainType); -} - -def ConvertTftToDhtOp : RuntimeFallbackDialect_Op<"convert_tft_to_dht", [NoSideEffect]> { - let summary = "convert TF tensor to TFRT DHT tensor operation"; - let description = [{ - The "tfd.convert_tft_to_dht" operation converts a TF tensor to a TFRT - DenseHostTensor. - - It takes as input a TF Tensor and an input chain, and returns a converted - TFRT DHT tensor and an output chain. - - Example: - %dht, %c0 = "tfd.convert_tft_to_dht"(%tft, %c) - : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) - }]; - - let arguments = (ins TfTensorType, ChainType); - // Enable registering canonicalization patterns with this operation. 
- let hasCanonicalizer = 1; - let results = (outs TensorType, ChainType); -} - -def MoveDhtToTftOp : RuntimeFallbackDialect_Op<"move_dht_to_tft", [NoSideEffect]> { - let summary = "convert TFRT DHT tensor to DHT tensor operation"; - let description = [{ - The "tfd.move_dht_to_tft" operation moves a TFRT tensor into a TF Tensor. - - It takes as input a TFRT Tensor and an input chain, and returns a TF tensor - with the same underlying buffer and an output chain. - - Example: - %dht, %c0 = "tfd.convert_tft_to_dht"(%tft, %c) - : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) - }]; - - let arguments = (ins TensorType, ChainType); - let results = (outs TfTensorType, ChainType); -} - -#endif // TFRT_DELEGATE_DIALECT diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc deleted file mode 100644 index 92571148cff..00000000000 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h" - -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tfrt/bef_converter/mlir_to_bef.h" -#include "tfrt/core_runtime/core_runtime.h" -#include "tfrt/core_runtime/op_handler.h" -#include "tfrt/host_context/host_context.h" -#include "tfrt/tensor/dense_host_tensor_view.h" - -namespace tensorflow { -namespace { - -llvm::StringRef ProcessIndexPath(mlir::ArrayAttr index_path) { - if (index_path.size() == 1 && index_path[0].isa()) { - // TODO(chky): Support cases where index_path is not a single string. - return index_path[0].cast().getValue(); - } - return ""; -} - -} // namespace - -void MapFunctionSignaturesFromTFSavedModelMLIR( - mlir::ModuleOp module, - llvm::function_ref> - input_names_and_devices, - llvm::ArrayRef output_names, - llvm::ArrayRef global_tensors)> - map_fn) { - // Create global_tensors for each functions. - mlir::SymbolTable symbol_table(module); - module.walk([&symbol_table, map_fn](mlir::FuncOp func) { - // Use the exported name as the function name, and skip non-exported - // functions. - auto func_names = mlir::tf_saved_model::GetExportedNames(func); - if (func_names.empty()) return; - - // Here we walk through each arguments and find out the input/output names, - // and input devices, variables used by this function. 
- llvm::SmallVector, 4> - input_names_and_devices; - llvm::SmallVector global_tensors; - for (unsigned i = 0, e = func.getNumArguments(); i != e; ++i) { - if (auto input_index_path = func.getArgAttrOfType( - i, "tf_saved_model.index_path")) { - std::pair name_and_device; - name_and_device.first = ProcessIndexPath(input_index_path); - if (auto input_device = - func.getArgAttrOfType(i, "tf.device")) { - name_and_device.second = input_device.getValue(); - } - input_names_and_devices.push_back(name_and_device); - } - if (auto variable = - mlir::tf_saved_model::LookupBoundInput(func, i, symbol_table)) { - global_tensors.push_back(variable); - } - } - - llvm::SmallVector output_names; - for (unsigned i = 0, e = func.getNumResults(); i != e; ++i) { - if (auto output_index_path = func.getResultAttrOfType( - i, "tf_saved_model.index_path")) { - output_names.push_back(ProcessIndexPath(output_index_path)); - } - } - - for (auto func_name : func_names) - map_fn(func_name, input_names_and_devices, output_names, global_tensors); - }); -} - -Status CompileTFSavedModelMLIRToBEF(const TFRTSavedModelCompileOptions& options, - mlir::ModuleOp module, - tfrt::AlignedBuffer<8>* bef_buffer) { - VLOG(1) << "TF Dialect: " << tensorflow::MlirModuleToString(module); - - // Lower MLIR TF Dialect to MLIR TFRT CoreRT dialect. - mlir::PassManager pm(module.getContext()); - - tensorflow::CoreRTPipelineOptions pass_options; - if (!options.default_device.empty()) { - pass_options.default_device = options.default_device; - } - if (!options.force_data_format.empty()) { - pass_options.force_data_format = options.force_data_format; - } - pass_options.enable_optimizer = options.enable_optimizer; - tensorflow::CreateTFExecutorToCoreRTPipeline(pm, pass_options); - - if (mlir::failed(pm.run(module))) - return tensorflow::errors::Internal( - "failed to lower TF Dialect to CoreRT dialect."); - - VLOG(1) << "TFRT Dialect: " << tensorflow::MlirModuleToString(module); - - auto bef = - tfrt::ConvertMLIRToBEF(module, /* disable_optional_sections = */ true); - if (bef.empty()) - return tensorflow::errors::Internal("failed to convert MLIR to BEF."); - - assert(bef_buffer); - bef_buffer->assign(bef.begin(), bef.end()); - - return Status::OK(); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h deleted file mode 100644 index 06a6c5a22f9..00000000000 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ - -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/strings/string_view.h" -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" -#include "tensorflow/core/platform/status.h" -#include "tfrt/core_runtime/tensor_handle.h" -#include "tfrt/support/aligned_buffer.h" - -namespace tfrt { -class CoreRuntime; -} - -namespace mlir { -class ModuleOp; -} - -namespace tensorflow { - -struct TFRTSavedModelCompileOptions { - // TODO(tf-runtime-team): Ideally, compiler should make the decision where - // to place the variable. - std::string variable_device = "cpu"; - std::string default_device = "cpu"; - - // Enable compiler optimization in TFRT dialect. - bool enable_optimizer = true; - - // Force data format for all layout sensitive operations, eg. setting it to - // "NHWC" will changes all data format in the graph to "NHWC" by inserting - // or removing related tf.Transpose op. Currently the supported formats are - // "NHWC" and "NCHW". - // - // TODO(tf-runtime-team): Ideally compiler should figure out whether the - // data format should be changed, instead of controlled by users. - std::string force_data_format; -}; - -// Map signatures (eg. input/output names, variables) for each function. -void MapFunctionSignaturesFromTFSavedModelMLIR( - mlir::ModuleOp module, - llvm::function_ref> - input_names_and_devices, - llvm::ArrayRef output_names, - llvm::ArrayRef global_tensors)> - map_fn); - -// Compile MLIR in TF saved model dialect into BEF. -Status CompileTFSavedModelMLIRToBEF(const TFRTSavedModelCompileOptions& options, - mlir::ModuleOp module, - tfrt::AlignedBuffer<8>* bef_buffer); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ diff --git a/tensorflow/compiler/mlir/tfrt/tests/BUILD b/tensorflow/compiler/mlir/tfrt/tests/BUILD deleted file mode 100644 index 4faa8d2efe8..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], -) - -# Bundle together all of the test utilities that are used by tests. -filegroup( - name = "test_utilities", - testonly = True, - data = [ - "//tensorflow/compiler/mlir:tf-opt", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD b/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD deleted file mode 100644 index fc7c142ea73..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], -) - -# Bundle together all of the test utilities that are used by tests. 
-filegroup( - name = "test_utilities", - testonly = True, - data = [ - "//tensorflow/compiler/mlir:tf-mlir-translate", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir b/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir deleted file mode 100644 index 5943997a1bc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir +++ /dev/null @@ -1,65 +0,0 @@ -// RUN: tf-mlir-translate -analyze-tf-for-tfrt %s | FileCheck %s - -func @main(%serialized: tensor<32x!tf.string>, - %names : tensor<32x!tf.string>, - %dense_keys : tensor<2x!tf.string>, - %dense_default_0 : tensor, - %dense_default_1 : tensor) { - // CHECK: summary { - // CHECK-NEXT: ref_variable: true - // CHECK-NEXT: incompatible_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.AssignVariableOp" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: incompatible_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.Const" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 2 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.ParseExampleV2" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.VarHandleOp" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.VariableV2" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: ref_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - %0 = "tf.VariableV2"() {shape = #tf.shape<2>, container = "", shared_name = ""} : () -> tensor - %1 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor - %2 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - "tf.AssignVariableOp"(%2, %1) : (tensor<*x!tf.resource>, tensor) -> () - %empty_str_vector = "tf.Const"() - {dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B207D207D"> : tensor<0x!tf.string>} - : () -> tensor<0x!tf.string> - %result:2 = "tf.ParseExampleV2"(%serialized, %names, %empty_str_vector, %dense_keys, %empty_str_vector, %dense_default_0, %dense_default_1) - {dense_shapes = [#tf.shape<>, #tf.shape<>], num_sparse = 0 : i64, result_segment_sizes = dense<[0, 0, 0, 2, 0, 0]> : vector<6xi32>} - : (tensor<32x!tf.string>, tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor, tensor) -> (tensor<32xf32>, tensor<32xf32>) - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/basics.mlir b/tensorflow/compiler/mlir/tfrt/tests/basics.mlir deleted file mode 100644 index 650bd04b882..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/basics.mlir +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: tf-opt -tf-legalize-to-hex %s -o -| FileCheck %s - - -// CHECK-LABEL: func @constants() { -func @constants() { - // CHECK: "hex.constant_int"() {value = 1 : i32} - %0 = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "x", value = dense<1> : tensor} : () -> tensor - // CHECK: "hex.constant_int"() {value = 42 : i32} - %1 = 
"tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "y", value = dense<42> : tensor<1x1xi32>} : () -> tensor<1x1xi32> - // CHECK: hex.return - return -} - -// CHECK-LABEL: func @add -func @add(%arg0: tensor<1xi32>) { - // CHECK: hex.add_int - %2 = "tf.Add"(%arg0, %arg0) {T = "tfdtype$DT_INT32", device = "", name = "z"} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir b/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir deleted file mode 100644 index 410ff299883..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: tf-opt %s -tf-legalize-to-hex -verify-diagnostics - -func @partial_convert() { - %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // expected-error @+1 {{failed to legalize operation 'tf.Const'}} - %1 = "tf.Const"() {value = dense<42> : tensor<2xi32>} : () -> tensor<2xi32> - %2 = "tf.Add"(%0, %1) : (tensor, tensor<2xi32>) -> tensor<2xi32> - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/opt.mlir b/tensorflow/compiler/mlir/tfrt/tests/opt.mlir deleted file mode 100644 index 6f27fa6d7e4..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/opt.mlir +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: tf-opt %s -pass-pipeline='func(canonicalize)' | FileCheck %s - -// CHECK-LABEL: func @simplify_double_conversion_test( -func @simplify_double_conversion_test() { - // CHECK: %[[CREATE:.*]] = dht.create - // CHECK: %[[FILL:.*]] = dht.fill - // CHECK: dht.print_tensor %[[CREATE]], %[[FILL]] - %c0 = hex.new.chain - - // Create 2x2 dht with value 1 - %dht0 = dht.create_uninitialized_tensor.i32.2 [2 : i32, 2 : i32] - %c1 = dht.fill_tensor_with_constant.i32 %dht0, %c0 1 : i32 - - // Convert dht to tf tensor - %tft0, %c2 = "tfd.move_dht_to_tft"(%dht0, %c1) - : (!t.tensor, !hex.chain) -> (!tfd.tf_tensor, !hex.chain) - - // Convert tf tensor back to dht - %dht1, %c3 = "tfd.convert_tft_to_dht"(%tft0, %c2) - : (!tfd.tf_tensor, !hex.chain) -> (!t.tensor, !hex.chain) - - // Print the result dht - %c4 = dht.print_tensor %dht1, %c3 - - hex.return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD deleted file mode 100644 index 4faa8d2efe8..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], -) - -# Bundle together all of the test utilities that are used by tests. 
-filegroup( - name = "test_utilities", - testonly = True, - data = [ - "//tensorflow/compiler/mlir:tf-opt", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir deleted file mode 100644 index 6c129c4be22..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -module attributes {tf_saved_model.semantics} { - -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "y", type = tensor<1x3xf32>, value = dense<[[1.67482901, -0.529208779, -0.803792417]]> : tensor<1x3xf32>} : () -> () - -// CHECK-LABEL: func @basic -func @func_basic( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor>> {tf_saved_model.bound_input = @y}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) - attributes {tf_saved_model.exported_names = ["basic"]} { - %1 = "tf.ReadVariableOp"(%arg1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> tensor<1x3xf32> - - // CHECK: {{%.*}} = corert.executeop({{%.*}}) "tf.MatMul" - // CHECK-SAME: {T = f32, transpose_a = false, transpose_b = false} - %2 = "tf.MatMul"(%arg0, %1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - return %2 : tensor<3x3xf32> -} - -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir deleted file mode 100644 index 40b0332b61c..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-NOT: tf_saved_model.semantics -module attributes {tf_saved_model.semantics} { - -// CHECK-NOT: "tf_saved_model.global_tensor" -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "y", type = tensor<1x3xf32>, value = dense<[[1.67482901, -0.529208779, -0.803792417]]> : tensor<1x3xf32>} : () -> () -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "z", type = tensor<3xf32>, value = dense<[1.67482901, -0.529208779, -0.803792417]> : tensor<3xf32>} : () -> () - -// CHECK-LABEL: func @basic -// CHECK-SAME: ([[arg0:%.*]]: !corert.tensorhandle, [[arg1:%.*]]: !corert.tensorhandle, -// CHECK-SAME: [[arg2:%.*]]: !corert.tensorhandle) -> !corert.tensorhandle { -func @func_basic( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor>> {tf_saved_model.bound_input = @y}, - %arg2: tensor>> {tf_saved_model.bound_input = @z}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) - attributes {tf_saved_model.exported_names = ["basic"]} { - // CHECK-NEXT: [[cpu_device:%.*]] = corert.get_device "cpu" - // CHECK-NEXT: [[r0:%.*]] = corert.executeop([[cpu_device]]) "tf.MatMul"([[arg0]], [[arg1]]) - // CHECK-NEXT: [[r1:%.*]] = corert.executeop([[cpu_device]]) "tf.BiasAdd"([[r0]], [[arg2]]) - // CHECK-NEXT: [[r2:%.*]] = corert.executeop([[cpu_device]]) "tf.Tanh"([[r1]]) - // CHECK-NEXT: hex.return [[r2]] : !corert.tensorhandle - - %0 = "tf.ReadVariableOp"(%arg2) {_output_shapes = ["tfshape$dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> tensor<3xf32> - %1 = "tf.ReadVariableOp"(%arg1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> 
tensor<1x3xf32> - %2 = "tf.MatMul"(%arg0, %1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - %3 = "tf.BiasAdd"(%2, %0) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], data_format = "NHWC", device = "cpu"} : (tensor<3x3xf32>, tensor<3xf32>) -> tensor<3x3xf32> - %4 = "tf.Tanh"(%3) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu"} : (tensor<3x3xf32>) -> tensor<3x3xf32> - %5 = "tf.Identity"(%4) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu"} : (tensor<3x3xf32>) -> tensor<3x3xf32> - return %5 : tensor<3x3xf32> -} - -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir deleted file mode 100644 index 774ea0526bd..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @derived_attrs -func @derived_attrs( - %serialized: tensor, - %names: tensor<0x!tf.string>, - %sparse_keys: tensor<0x!tf.string>, - %dense_keys: tensor<1x!tf.string>, - %ragged_keys: tensor<0x!tf.string>, - %dense_default: tensor<0xi64>) -> tensor { - - %dense_value = - "tf.ParseExampleV2"(%serialized, %names, %sparse_keys, %dense_keys, %ragged_keys, %dense_default) - // CHECK: Tdense = [i64] - // CHECK-SAME: dense_shapes = [#corert.shape<>] - { device = "cpu", num_sparse = 0 : i64, dense_shapes = [#tf.shape<>], result_segment_sizes = dense<[0, 0, 0, 1, 0, 0]> : vector<6xi32>} - : (tensor, tensor<0x!tf.string>, tensor<0x!tf.string>, tensor<1x!tf.string>, tensor<0x!tf.string>, tensor<0xi64>) - -> tensor - - return %dense_value : tensor -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir deleted file mode 100644 index 7077523b1e2..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @device_test -func @device_test( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor<1x3xf32> {tf_saved_model.index_path = [0]}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) { - // CHECK: {{%.*}} = corert.get_device "gpu" - - %2 = "tf.MatMul"(%arg0, %arg1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "gpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - return %2 : tensor<3x3xf32> -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir deleted file mode 100644 index 950cef928a9..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: tf-opt -corert-optimize %s | FileCheck %s - -// CHECK-LABEL: func @fold_test -func @fold_test(%arg: !corert.tensorhandle) -> !corert.tensorhandle { - %cpu = corert.get_device "cpu" - // CHECK-NOT: tf.Const - %0 = corert.executeop(%cpu) "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 - // CHECK: "_tf.Transpose"({{%.*}}) - // CHECK-SAME: perm = dense<[0, 3, 1, 2]> : tensor<4xi32> - %1 = corert.executeop(%cpu) "tf.Transpose"(%arg, %0) {T = 
f32, Tperm = i32} : 1 - hex.return %1 : !corert.tensorhandle -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir deleted file mode 100644 index b1306be825c..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir +++ /dev/null @@ -1,10 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @string_tensor -func @string_tensor() -> (tensor<0x!tf.string>, tensor<7x!tf.string>) { - // CHECK: {shape = [0], value = []} - %0 = "tf.Const"() {value = dense<[]> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> - // CHECK: {shape = [7], value = ["has_login_page_feature", "num_terms_inside_postform", "num_terms_outside_postform", "num_terms_outside_postform_without_bp", "query_params_contains_url", "title_with_login_phase", "url_contains_login_terms"]} - %1 = "tf.Const"() {value = dense<["has_login_page_feature", "num_terms_inside_postform", "num_terms_outside_postform", "num_terms_outside_postform_without_bp", "query_params_contains_url", "title_with_login_phase", "url_contains_login_terms"]> : tensor<7x!tf.string>} : () -> tensor<7x!tf.string> - return %0, %1 : tensor<0x!tf.string>, tensor<7x!tf.string> -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir deleted file mode 100644 index 5c44f558280..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: tf-opt -tf-executor-to-corert-pipeline %s | FileCheck %s - -// CHECK-LABEL: func @basic -// CHECK-SAME: ([[arg0:%.*]]: !corert.tensorhandle, [[arg1:%.*]]: !corert.tensorhandle) -// CHECK-NEXT: [[cpu:%.*]] = corert.get_device "cpu" -// CHECK-NEXT: [[res:%.*]] = corert.executeop([[cpu]]) "tf.MatMul"([[arg0]], [[arg1]]) -// CHECK-NEXT: hex.return [[res]] : !corert.tensorhandle -module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 293 : i32}} { - func @basic(%arg0: tensor<3x1xf32>, - %arg1: tensor>> - ) -> tensor<3x3xf32> { - %0 = tf_executor.graph { - %outputs, %control = tf_executor.island wraps "tf.Const"() {value = dense<0.899999976> : tensor} : () -> tensor - %outputs_0, %control_0 = tf_executor.island { - %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor>>) -> tensor<*x!tf.resource> - %2 = "tf.ReadVariableOp"(%1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "", dtype = f32} : (tensor<*x!tf.resource>) -> tensor<1x3xf32> - %3 = "tf.MatMul"(%arg0, %2) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - tf_executor.yield %3 : tensor<3x3xf32> - } - tf_executor.fetch %outputs_0, %control_0 : tensor<3x3xf32>, !tf_executor.control - } - return %0 : tensor<3x3xf32> - } -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir deleted file mode 100644 index 5968a590f91..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir +++ /dev/null @@ -1,111 +0,0 @@ -// RUN: tf-opt %s -tf-to-tfd-lowering | FileCheck %s - -// CHECK: func @inference_call( -// CHECK-SAME: %arg0: !hex.chain, -// CHECK-SAME: %arg1: !tfd.tf_tensor, -// CHECK-SAME: %arg2: !tfd.tf_tensor, -// CHECK-SAME: %arg3: 
!tfd.tf_tensor, -// CHECK-SAME: %arg4: !tfd.tf_tensor, -// CHECK-SAME: %arg5: !tfd.tf_tensor -// CHECK-SAME: ) -> (!hex.chain, !tfd.tf_tensor) -func @inference_call( - %arg0: tensor, - %arg1: tensor<*x!tf.resource>, - %arg2: tensor<*x!tf.resource>, - %arg3: tensor<*x!tf.resource>, - %arg4: tensor<*x!tf.resource> - )-> tensor { - // CHECK: %0:2 = "tfd.delegate_kernel"(%arg0, %arg5) - // CHECK-SAME: _name = "tf.ReadVariableOp" - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: (!hex.chain, !tfd.tf_tensor) -> (!hex.chain, !tfd.tf_tensor) - %0 = "tf.ReadVariableOp"(%arg4) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<10xf32> - - // CHECK: %1:2 = "tfd.delegate_kernel"(%0#0, %arg3) { - // CHECK-SAME: _name = "tf.ReadVariableOp" - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %1 = "tf.ReadVariableOp"(%arg2) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<512xf32> - - // CHECK: %2:2 = "tfd.delegate_kernel"(%1#0, %arg4) { - // CHECK-SAME: _name = "tf.ReadVariableOp", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %2 = "tf.ReadVariableOp"(%arg3) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<512x10xf32> - - // CHECK: %3:2 = "tfd.delegate_kernel"(%2#0, %arg2) { - // CHECK-SAME: _name = "tf.ReadVariableOp", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %3 = "tf.ReadVariableOp"(%arg1) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<784x512xf32> - - // CHECK: %4:2 = "tfd.delegate_kernel"(%3#0, %arg1, %3#1) { - // CHECK-SAME: _name = "tf.MatMul", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT", - // CHECK-SAME: attr1_name = "transpose_a", attr1_value = false, - // CHECK-SAME: attr2_name = "transpose_b", attr2_value = false - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %4 = "tf.MatMul"(%arg0, %3) { - dtype = "tfdtype$DT_FLOAT", transpose_a = false, transpose_b = false - } : (tensor, tensor<784x512xf32>) -> tensor - - // CHECK: %5:2 = "tfd.delegate_kernel"(%4#0, %4#1, %1#1) { - // CHECK-SAME: _name = "tf.AddV2" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %5 = "tf.AddV2"(%4, %1) - : (tensor, tensor<512xf32>)-> tensor - - // CHECK: %6:2 = "tfd.delegate_kernel"(%5#0, %5#1) { - // CHECK-SAME: _name = "tf.Relu", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %6 = "tf.Relu"(%5) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor) -> tensor - - // CHECK: %7:2 = "tfd.delegate_kernel"(%6#0, %6#1, %2#1) { - // CHECK-SAME: _name = "tf.MatMul", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT", - // CHECK-SAME: attr1_name = "transpose_a", attr1_value = false, - // CHECK-SAME: attr2_name = "transpose_b", attr2_value = false - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %7 = "tf.MatMul"(%6, %2) { - dtype = "tfdtype$DT_FLOAT", transpose_a = 
false, transpose_b = false - } : (tensor, tensor<512x10xf32>) -> tensor - - // CHECK: %8:2 = "tfd.delegate_kernel"(%7#0, %7#1, %0#1) { - // CHECK-SAME: _name = "tf.AddV2", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %8 = "tf.AddV2"(%7, %0) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor, tensor<10xf32>) -> tensor - - // CHECK: %9:2 = "tfd.delegate_kernel"(%8#0, %8#1) { - // CHECK-SAME: _name = "tf.Identity", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %9 = "tf.Identity"(%8) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor) -> tensor - - // CHECK: hex.return %9#0, %9#1 : !hex.chain, !tfd.tf_tensor - return %9 : tensor -} diff --git a/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc b/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc deleted file mode 100644 index 9d13955490b..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements lowering of Tf dialect to TFRT Hex kernels. -// -// Current lowering is a placeholder performing trivial conversion -// for integer constants and additions. - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/DialectConversion.h" -#include "absl/memory/memory.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" - -namespace mlir { -namespace { - -// Pattern rewrite rules for "tf.Const", "tf.Add" and "return" ops. -bool isInt32LikeType(Type t) { - if (t.isSignlessInteger(32)) return true; - if (auto ttype = t.dyn_cast()) { - if (ttype.hasStaticShape() && ttype.getNumElements() == 1 && - ttype.getElementType().isSignlessInteger(32)) - return true; - } - return false; -} - -// Replaces 32-bit integer TF::ConstOp with "hex.constant_int" op. -struct ConstOpConversion : public ConversionPattern { - explicit ConstOpConversion(MLIRContext *context) - : ConversionPattern(TF::ConstOp::getOperationName(), 1, context) {} - - LogicalResult matchAndRewrite( - Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - auto constOp = cast(op); - if (!isInt32LikeType(constOp.getType())) return failure(); - - auto valueAttr = constOp.value(); - auto newAttr = Attribute(); - - // Convert constant op if it has an integer or dense elements attribute. - // Other kinds of element attributes are not converted for now. 
- if (valueAttr.isa()) { - newAttr = valueAttr; - } else if (auto v = valueAttr.dyn_cast()) { - if (v.isSplat()) newAttr = v.getSplatValue(); - } - if (!newAttr) return failure(); - - mlir::OperationState state(constOp.getLoc(), "hex.constant_int"); - state.types.push_back(rewriter.getIntegerType(32)); - state.addAttribute("value", newAttr); - auto newOp = rewriter.createOperation(state); - rewriter.replaceOp(op, newOp->getResult(0)); - return success(); - } -}; - -// Replaces 32-bit integer TF::Add op with "hex.add_int" op. -struct AddOpConversion : public ConversionPattern { - explicit AddOpConversion(MLIRContext *context) - : ConversionPattern(TF::AddOp::getOperationName(), 1, context) {} - - LogicalResult matchAndRewrite( - Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - auto addOp = cast(op); - - if (!isInt32LikeType(operands[0].getType()) || - !isInt32LikeType(operands[1].getType())) - return failure(); - - auto int32Ty = rewriter.getIntegerType(32); - mlir::OperationState state(addOp.getLoc(), "hex.add_int", operands, - {int32Ty}, {}); - auto newOp = rewriter.createOperation(state); - rewriter.replaceOp(op, newOp->getResult(0)); - return success(); - } -}; - -// Replaces return op that has no arguments with "hex.return" op. -struct ReturnOpConversion : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - ReturnOp srcOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - if (srcOp.getNumOperands() != 0) return failure(); - - mlir::OperationState state(srcOp.getLoc(), "hex.return"); - rewriter.createOperation(state); - - rewriter.eraseOp(srcOp); - return success(); - } -}; - -// Legalize TF operations to host program dialect. -struct TfLegalizeToHex - : public PassWrapper> { - void runOnOperation() override { - auto *ctx = &getContext(); - TypeConverter converter; - converter.addConversion([](Type type) -> Type { - // Convert single element tensor type of int32s to int32 type - if (isInt32LikeType(type)) { - return IntegerType::get(32, type.getContext()); - } - return Type(); - }); - - OwningRewritePatternList patterns; - - // For now, replace only int32 TF::OpConst, TF::OpAdd and OpReturn with - // "hex.constant_int", "hex.add_int" and "hex.return", respectively. - patterns.insert( - ctx); - - ConversionTarget target(*ctx); - const auto legal = ConversionTarget::LegalizationAction::Legal; - target.setOpAction(OperationName(StringRef("hex.constant_int"), ctx), - legal); - target.setOpAction(OperationName(StringRef("hex.add_int"), ctx), legal); - target.setOpAction(OperationName(StringRef("hex.return"), ctx), legal); - target.addLegalOp(); - - auto result = - applyFullConversion(getOperation(), target, patterns, &converter); - if (failed(result)) signalPassFailure(); - } -}; - -} // namespace - -std::unique_ptr> createLegalizeToHexPass() { - return std::make_unique(); -} - -static PassRegistration pass( - "tf-legalize-to-hex", - "Convert TF dialect to the TF runtime host program dialect."); -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc b/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc deleted file mode 100644 index 9e06ba1f4bc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements the optimzation passe on TFRT CoreRuntime dialect. -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tfrt/core_runtime/opdefs/core_runtime.h" - -namespace tensorflow { -namespace { - -// Implement a constant fold pattern for corert dialect. The following pattern -// will be matched: -// -// %0 = corert.executeop(%cpu) "tf.Const"() -// {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 -// %1 = corert.executeop(%cpu) "tf.Transpose"(%arg, %0) -// {T = f32, Tperm = i32} : 1 -// -// And it will converted to: -// -// %1 = corert.executeop(%cpu) "_tf.Transpose"(%arg) -// {T = f32, Tperm = i32, perm = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 -// -class CoreRTExecuteOpRewritePattern - : public mlir::OpRewritePattern { - public: - CoreRTExecuteOpRewritePattern( - mlir::MLIRContext *context, - ArrayRef>> ops_to_attrs) - : OpRewritePattern(context), - ops_to_attrs_(ops_to_attrs.begin(), ops_to_attrs.end()) {} - - mlir::LogicalResult matchAndRewrite( - tfrt::corert::ExecuteOp op, - mlir::PatternRewriter &rewriter) const override { - auto attr_names = ops_to_attrs_.lookup(op.op_name()); - if (attr_names.empty()) return failure(); - - SmallVector new_operands; - SmallVector, 4> new_attributes; - op.getOpAttrs(&new_attributes); - assert(op.operands().size() == attr_names.size()); - for (const auto &iter : llvm::zip(op.operands(), attr_names)) { - mlir::Value arg = std::get<0>(iter); - StringRef name = std::get<1>(iter); - - Attribute const_attr; - if (!name.empty() && matchPattern(arg, m_Constant(&const_attr))) { - // Convert the folded argument to an attribute. - new_attributes.push_back({name, const_attr}); - } else { - // Keep the argument that is not folded. - new_operands.push_back(arg); - } - } - - if (new_operands.size() == op.operands().size()) return failure(); - - SmallString<32> new_op_name{"_"}; - new_op_name += op.op_name(); - - rewriter.replaceOpWithNewOp( - op, op.getResultTypes(), op.device(), new_operands, new_attributes, - new_op_name); - - return success(); - } - - private: - // Map from op_name to attr_names. The attr_names indicates the name of the - // attribute to which each constant-folded argument is converted. An empty - // string means this argument should not be folded. 
- llvm::DenseMap> ops_to_attrs_; -}; - -struct CoreRTOptimizePass - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::OwningRewritePatternList patterns; - auto func = getFunction(); - - static constexpr StringRef kMeanAttrs[] = {"", "reduction_indices"}; - static constexpr StringRef kPadAttrs[] = {"", "paddings"}; - static constexpr StringRef kTransposeAttrs[] = {"", "perm"}; - - static constexpr std::pair> kOpsToAttrs[] = { - {"tf.Mean", kMeanAttrs}, - {"tf.Pad", kPadAttrs}, - {"tf.Transpose", kTransposeAttrs}, - }; - - patterns.insert(&getContext(), kOpsToAttrs); - - mlir::applyPatternsAndFoldGreedily(func, patterns); - } -}; - -} // namespace - -std::unique_ptr> CreateCoreRTOptimizePass() { - return std::make_unique(); -} - -static mlir::PassRegistration pass("corert-optimize", - "Optimizes corert."); - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.h b/tensorflow/compiler/mlir/tfrt/transforms/passes.h deleted file mode 100644 index be0bf0fbd1f..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/passes.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ - -#include - -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project - -namespace tensorflow { - -// Create a pass that converts MLIR TF dialect to MLIR TFRT CoreRT dialect. -std::unique_ptr CreateTFToCoreRTConversionPass(); - -// Run TFToCoreRTConversionPass as a free function. Useful for reusing the pass -// logic in a custom pass with additional conversions. -mlir::LogicalResult TFToCoreRTConversionPassRun( - mlir::MLIRContext* context, mlir::ModuleOp* module, - mlir::ConversionTarget* target, mlir::OwningRewritePatternList* patterns); - -// Create the corert optimization pass. -std::unique_ptr> CreateCoreRTOptimizePass(); - -struct CoreRTPipelineOptions - : public mlir::PassPipelineOptions { - Option default_device{ - *this, "default-device", llvm::cl::desc("default device assignment"), - llvm::cl::init("cpu")}; - Option enable_optimizer{ - *this, "enable-optimizer", - llvm::cl::desc("run optimization passes on corert dialect"), - llvm::cl::init(false)}; - Option force_data_format{ - *this, "force-data-format", - llvm::cl::desc("force data format for all layout sensitive operations")}; -}; - -// Creates a pipeline of passes that lowers MLIR TF Executor dialect to TF -// dialect for CoreRT purposes. -void CreateTFExecutorToTFPipeline( - mlir::OpPassManager& pm, const CoreRTPipelineOptions& options); // NOLINT - -// Creates a pipeline of passes that converts MLIR TF Executor dialect to CoreRT -// dialect. 
-void CreateTFExecutorToCoreRTPipeline( - mlir::OpPassManager& pm, const CoreRTPipelineOptions& options); // NOLINT - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc b/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc deleted file mode 100644 index 0784dc4ffea..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc +++ /dev/null @@ -1,484 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements lowering of TF dialect to TFRT CoreRuntime ExecuteOp. -// This lowering pass is heavily experimental and incomplete. External code -// should not depend on the code here. And please do not take example on it as -// "the path forward" for this. - -#include - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/Passes.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Pass/PassOptions.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/tstring.h" -#include "tfrt/basic_kernels/opdefs/basic_kernels.h" -#include "tfrt/core_runtime/opdefs/attributes.h" -#include "tfrt/core_runtime/opdefs/core_runtime.h" - -namespace tensorflow { -namespace { - -// TODO(chky): define these dialect types instead of using opaque types. -mlir::Type CreateDeviceType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("corert"), "device", - builder->getContext()); -} - -mlir::Type CreateTensorHandleType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("corert"), "tensorhandle", - builder->getContext()); -} - -mlir::Type CreateStringType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("hex"), "string", - builder->getContext()); -} - -// A helper class for converting CoreRT types and attributes. 
-class CoreRTConverter : public mlir::TypeConverter { - public: - explicit CoreRTConverter(mlir::MLIRContext *context) - : builder_(context), - device_type_(CreateDeviceType(&builder_)), - tensor_handle_type_(CreateTensorHandleType(&builder_)) { - addConversion([](Type type) { return type; }); - addConversion([=](TensorType type) { return tensor_handle_type_; }); - } - - // Create a single attribute that contains the named attribute lists. It is an - // array of pairs. The key must be a string attribute, and the value can be - // any attribute that is supported by CoreRuntime. - mlir::ArrayAttr CreateOpAttrs(ArrayRef attrs) { - llvm::SmallVector attr_array; - for (auto key_and_value : attrs) { - if (!IsUnusedAttribute(key_and_value.first)) { - auto converted = ConvertAttribute(key_and_value.second); - if (!converted) return {}; - - mlir::StringAttr key = builder_.getStringAttr(key_and_value.first); - attr_array.push_back(builder_.getArrayAttr({key, converted})); - } - } - return builder_.getArrayAttr(attr_array); - } - - // Convert the device attribute in `op` to a device value produced by the - // corresponding GetDeviceOp in the current block. If there does not exist - // one, insert a GetDeviceOp to the beginning of the block and return the - // device value. - Value ConvertDevice(mlir::Operation *op, - ConversionPatternRewriter *rewriter) const { - auto device_attr = op->getAttr("device"); - if (!device_attr) { - op->emitOpError("device attribute not found."); - return {}; - } - - auto device_name = device_attr.cast().getValue(); - if (device_name.empty()) { - op->emitOpError("device has not been assigned."); - return {}; - } - - op->removeAttr(rewriter->getIdentifier("device")); - - auto *block = op->getBlock(); - - if (auto get_device_op = GetDeviceOrNull(device_name, block)) - return get_device_op.device(); - - ConversionPatternRewriter::InsertionGuard insertion_guard(*rewriter); - rewriter->setInsertionPointToStart(block); - return rewriter - ->create(block->getParent()->getLoc(), - device_type(), device_name) - .device(); - } - - mlir::Type device_type() const { return device_type_; } - mlir::Type tensor_handle_type() const { return tensor_handle_type_; } - - private: - // TODO(chky): attributes "_output_shapes" should be removed by any tool that - // generates TF MLIR dialect, as they are not used by CoreRuntime. Remove this - // filtering logic once unused attributes are cleaned up in the upper layer. - bool IsUnusedAttribute(llvm::StringRef name) const { - return name == "_output_shapes"; - } - - // Returns the converted attribute in TFRT dialect. If the conversion fails, - // returns a null attribute instead. - mlir::Attribute ConvertAttribute(mlir::Attribute attr) { - // The supported attributes here should be kept consistent with - // //third_party/tf_runtime/include/tfrt/core_runtime/op_attr_type.h - // - // Currently, not all tensorflow data types are supported. Unranked shape - // attributes are not supported yet. - - // Return directly if the attribute is already supported. - if (attr.isa() || attr.isa() || - attr.isa() || attr.isa() || - attr.isa() || - attr.isa()) - return attr; - - // Convert the attribute to the corresponding format in TFRT dialect if - // needed. - if (auto shape_attr = attr.dyn_cast()) { - if (!shape_attr.hasRank()) return {}; - return tfrt::corert::ShapeAttr::get(builder_.getContext(), - shape_attr.getShape()); - } - - // For arrays, we recursively convert the elements. 
- if (auto array_attr = attr.dyn_cast()) { - llvm::SmallVector attrs; - attrs.reserve(array_attr.size()); - for (auto attr : array_attr) { - auto converted = ConvertAttribute(attr); - if (!converted) return {}; - attrs.push_back(converted); - } - return builder_.getArrayAttr(attrs); - } - - return {}; - } - - // Find a GetDeviceOp that matches the device_name at the beginning of the - // block. Return nullptr if it does not find one. - tfrt::corert::GetDeviceOp GetDeviceOrNull(StringRef device_name, - Block *block) const { - for (auto &op : *block) { - auto get_device_op = llvm::dyn_cast(&op); - if (!get_device_op) break; - if (get_device_op.device_name() == device_name) return get_device_op; - } - return nullptr; - } - - mlir::Builder builder_; - mlir::Type device_type_; - mlir::Type tensor_handle_type_; -}; - -// Lower a tf.Const op that creates a string tensor to a native -// corert.create_string_tensor op. -class CoreRTConstStringTensorOpConversion - : public mlir::OpConversionPattern { - public: - CoreRTConstStringTensorOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter) - : mlir::OpConversionPattern(context), - corert_converter_(*corert_converter) {} - - LogicalResult matchAndRewrite( - mlir::TF::ConstOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - if (!op.dtype().isa()) return failure(); - - DenseStringElementsAttr attr = op.value().cast(); - - llvm::SmallVector values; - values.reserve(attr.getNumElements()); - for (const auto &element : attr.getRawStringData()) - values.push_back(rewriter.getStringAttr( - llvm::StringRef(element.data(), element.size()))); - - // Create the shape attribute from the tensor shape. - ArrayRef shape = op.value().getType().getShape(); - llvm::SmallVector dims; - dims.reserve(shape.size()); - auto i64_type = rewriter.getIntegerType(64); - for (auto dim : shape) - dims.push_back(rewriter.getIntegerAttr(i64_type, dim)); - - auto new_op = rewriter.create( - op.getLoc(), corert_converter_.tensor_handle_type(), - rewriter.getArrayAttr(dims), rewriter.getArrayAttr(values)); - - rewriter.replaceOp(op, new_op.result()); - - return success(); - } - - private: - CoreRTConverter &corert_converter_; -}; - -// Convert TF dialect operations with no side effects to CoreRT ExecuteOp. For -// example, -// -// %0 = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : -// (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> -// -// is converted to -// -// %result = corert.executeop(%device) -// "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : -// (!corert.tensorhandle, !corert.tensorhandle) -> !corert.tensorhandle -// -// Note that it will fail to match if some attributes are not supported. -template -class CoreRTExecuteOpConversion : public mlir::OpConversionPattern { - public: - CoreRTExecuteOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter) - : mlir::OpConversionPattern(context), - corert_converter_(*corert_converter) {} - - LogicalResult matchAndRewrite( - TF_Op op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - mlir::StringAttr op_name = rewriter.getStringAttr(op.getOperationName()); - - llvm::SmallVector result_types; - for (auto type : op.getOperation()->getResultTypes()) - result_types.push_back(corert_converter_.convertType(type)); - - // Get the device, or create one if there does not exist one. 
- auto device = corert_converter_.ConvertDevice(op, &rewriter); - if (!device) return failure(); - - auto derived_attrs = op.materializeDerivedAttributes(); - for (auto named_attr : derived_attrs) { - op.setAttr(named_attr.first, named_attr.second); - } - - ArrayAttr op_attrs = corert_converter_.CreateOpAttrs(op.getAttrs()); - if (!op_attrs) return failure(); - - auto new_op = rewriter.create( - op.getLoc(), result_types, device, operands, op_attrs, op_name); - - rewriter.replaceOp(op, new_op.results()); - return success(); - } - - private: - CoreRTConverter &corert_converter_; -}; - -// Deletes the op and forwards the arguments. -template -class PassThroughConversion : public mlir::OpConversionPattern { - public: - explicit PassThroughConversion(MLIRContext *context) - : mlir::OpConversionPattern(context) {} - - LogicalResult matchAndRewrite( - TF_Op op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - // Just forward the arguments to results. - rewriter.replaceOp(op, operands); - return success(); - } -}; - -// Convert standard ReturnOp to hex.return. -// -// TODO(chky): conversion to hex kernels should come from a common tf_to_hex -// library. -class ReturnOpConversion : public mlir::OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - mlir::ReturnOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(op, operands); - return success(); - } -}; - -// Convert TF dialect to CoreRT dialect. -class TFToCoreRTConversionPass - : public mlir::PassWrapper> { - void runOnOperation() override { - auto module = getOperation(); - mlir::ConversionTarget target(getContext()); - mlir::OwningRewritePatternList patterns; - if (failed(TFToCoreRTConversionPassRun(&getContext(), &module, &target, - &patterns))) - signalPassFailure(); - } -}; - -} // namespace - -LogicalResult TFToCoreRTConversionPassRun( - mlir::MLIRContext *context, mlir::ModuleOp *module, - mlir::ConversionTarget *target, mlir::OwningRewritePatternList *patterns) { - module->removeAttr("tf_saved_model.semantics"); - - mlir::Builder builder(context); - auto bound_id = builder.getIdentifier("tf_saved_model.bound_input"); - auto path_id = builder.getIdentifier("tf_saved_model.index_path"); - - module->walk([bound_id, path_id, module](mlir::Operation *op) mutable { - if (auto func_op = dyn_cast(op)) { - // Remove tf_saved_model specific function arg attributes. - for (unsigned i = 0, e = func_op.getNumArguments(); i != e; ++i) { - func_op.removeArgAttr(i, bound_id); - func_op.removeArgAttr(i, path_id); - } - for (unsigned i = 0, e = func_op.getNumResults(); i != e; ++i) { - func_op.removeResultAttr(i, bound_id); - func_op.removeResultAttr(i, path_id); - } - if (auto exported_names = func_op.getAttrOfType( - "tf_saved_model.exported_names")) { - // Create a function for each exported name. - // - // TODO(b/148477882): TFRT dialect should have similar concepts of - // exported names so that a function can be referenced by multiple - // exported names. - func_op.removeAttr("tf_saved_model.exported_names"); - for (auto exported_name : exported_names) { - auto exported_func_op = func_op.clone(); - exported_func_op.setName( - exported_name.cast().getValue()); - module->insert(module->begin(), exported_func_op); - } - func_op.erase(); - } - } else if (isa(op)) { - // Remove all global_tensor_ops. 
- op->erase(); - } - }); - - CoreRTConverter corert_converter(context); - - target->addLegalDialect(); - target->addLegalDialect(); - target->addIllegalDialect(); - target->addDynamicallyLegalOp([&corert_converter](FuncOp op) { - return corert_converter.isSignatureLegal(op.getType()); - }); - - patterns->insert, - PassThroughConversion, ReturnOpConversion>( - context); - - // Here we use one specialized pattern for tf.Const with string tensors as - // it will incorrect to use ExecuteOp pattern to convert string tensor - // attribute. - patterns->insert(context, - &corert_converter); - - // TODO(b/148823030): Pattern registration for TF operations is not - // sustainable currently. We need to figure out a plan - patterns->insert, - // TODO(chky): Move the ReadVariableOp + Identity pattern - // to optimizer. - // CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion>(context, - &corert_converter); - - mlir::populateFuncOpTypeConversionPattern(*patterns, context, - corert_converter); - return mlir::applyPartialConversion(*module, *target, *patterns); -} - -std::unique_ptr CreateTFToCoreRTConversionPass() { - return std::make_unique(); -} - -void CreateTFExecutorToTFPipeline(mlir::OpPassManager &pm, - const CoreRTPipelineOptions &options) { - // First, we prune unused operations in MLIR in TF Executor dialect. - pm.addPass(mlir::tf_executor::CreateTFExecutorGraphPruningPass()); - - // Then we pass the MLIR module through the TF standard pipeline, which for - // instances does shape inference, canonicalization, inlining, etc. - mlir::TF::StandardPipelineOptions tf_options; - tf_options.enable_inliner = true; - mlir::TF::CreateTFStandardPipeline(pm, tf_options); - - // After all standard passes run layout optimization to assign optimal data - // format for all layout sensitive operations. - mlir::TF::LayoutOptimizationPipelineOptions layout_optimization_options; - layout_optimization_options.force_data_format = - options.force_data_format.getValue(); - mlir::TF::CreateLayoutOptimizationPipeline(pm, layout_optimization_options); - - // Run canonicalization pipeline to remove unused constants and bypassed - // transpose operations left in the IR after layout optimization. - pm.addNestedPass(mlir::createCanonicalizerPass()); - - if (options.default_device == "gpu") - pm.addNestedPass(mlir::TF::CreateGpuOpFusionPass()); - - // Then we assign default devices. - pm.addNestedPass( - mlir::TF::CreateSimpleTFDeviceAssignmentPass(options.default_device)); -} - -void CreateTFExecutorToCoreRTPipeline(mlir::OpPassManager &pm, - const CoreRTPipelineOptions &options) { - CreateTFExecutorToTFPipeline(pm, options); - - // Convert it to MLIR in CoreRT dialect. - pm.addPass(CreateTFToCoreRTConversionPass()); - - // Run optimizer on the MLIR module in CoreRT dialect. 
- if (options.enable_optimizer) - pm.addNestedPass(CreateCoreRTOptimizePass()); -} - -static mlir::PassRegistration pass( - "tf-to-corert", - "Convert Tensorflow dialect to TFRT's CoreRuntime dialect."); - -static mlir::PassPipelineRegistration pipeline( - "tf-executor-to-corert-pipeline", - "Convert Tensorflow Executor dialect to TFRT's CoreRuntime dialect, and " - "also apply necessary optimization passes.", - CreateTFExecutorToCoreRTPipeline); - -} // namespace tensorflow From 21b04b6fe0c5bc6a8dd0cc2f414760f47b142ae9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 10:02:03 -0700 Subject: [PATCH 0499/1533] Add support for global operation dispatchers. (This is intended for use by TF-internal classes only.) PiperOrigin-RevId: 311350209 Change-Id: Ib095f019fc6825409b490d7dec7e86116955b746 --- tensorflow/python/util/dispatch.py | 21 --------- tensorflow/python/util/dispatch_test.py | 58 +------------------------ 2 files changed, 2 insertions(+), 77 deletions(-) diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py index 3868da14b44..e94e3345348 100644 --- a/tensorflow/python/util/dispatch.py +++ b/tensorflow/python/util/dispatch.py @@ -39,10 +39,6 @@ from tensorflow.python.util import tf_inspect DISPATCH_ATTR = "_tf_dispatchers" -# OpDispatchers which should be used for all operations. -_GLOBAL_DISPATCHERS = [] - - class OpDispatcher(object): """Abstract base class for TensorFlow operator dispatchers. @@ -86,19 +82,6 @@ class OpDispatcher(object): getattr(op, DISPATCH_ATTR).append(self) -class GlobalOpDispatcher(object): - """Abstract base class for TensorFlow global operator dispatchers.""" - - NOT_SUPPORTED = OpDispatcher.NOT_SUPPORTED - - def handle(self, op, args, kwargs): - """Handle the specified operation with the specified arguments.""" - - def register(self): - """Register this dispatcher as a handler for all ops.""" - _GLOBAL_DISPATCHERS.append(self) - - def dispatch(op, *args, **kwargs): """Returns the result from the first successful dispatcher for a given op. @@ -118,10 +101,6 @@ def dispatch(op, *args, **kwargs): result = dispatcher.handle(args, kwargs) if result is not OpDispatcher.NOT_SUPPORTED: return result - for dispatcher in _GLOBAL_DISPATCHERS: - result = dispatcher.handle(op, args, kwargs) - if result is not OpDispatcher.NOT_SUPPORTED: - return result return OpDispatcher.NOT_SUPPORTED diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py index bd35c391924..89999fcf843 100644 --- a/tensorflow/python/util/dispatch_test.py +++ b/tensorflow/python/util/dispatch_test.py @@ -45,47 +45,6 @@ def test_op(x, y, z): return x + (2 * y) + (3 * z) -class TensorTracer(object): - """An object used to trace TensorFlow graphs. - - This is an example class that is used to test global op dispatchers. The - global op dispatcher for TensorTracers is defined below. - """ - - def __init__(self, name, args=None, kwargs=None): - self.name = name - self.args = args - self.kwargs = kwargs - - def __repr__(self): - if self.args is None and self.kwargs is None: - return self.name - else: - args = [str(x) for x in self.args] - args += sorted( - ["{}={}".format(name, x) for (name, x) in self.kwargs.items()]) - return "{}({})".format(self.name, ", ".join(args)) - - -class TensorTracerOpDispatcher(dispatch.GlobalOpDispatcher): - """Global op dispatcher for TensorTracer.""" - - def handle(self, op, args, kwargs): - # Dispatcher only applies if at least one arg is a TensorTracer. 
- if not (any(self.is_tensor_tracer_arg(x) for x in args) or - any(self.is_tensor_tracer_arg(x) for x in kwargs.values())): - return self.NOT_SUPPORTED - - return TensorTracer(op.__name__, args, kwargs) - - def is_tensor_tracer_arg(self, value): - if isinstance(value, TensorTracer): - return True - if isinstance(value, (list, tuple)): - if any(isinstance(x, TensorTracer) for x in value): - return True - - @test_util.run_all_in_graph_and_eager_modes class DispatchTest(test_util.TensorFlowTestCase): @@ -172,21 +131,8 @@ class DispatchTest(test_util.TensorFlowTestCase): r".*some_op \(from __main__\) is deprecated and will be " "removed in a future version.*") - def testGlobalDispatcher(self): - original_global_dispatchers = dispatch._GLOBAL_DISPATCHERS - try: - TensorTracerOpDispatcher().register() - - x = TensorTracer("x") - y = TensorTracer("y") - trace = math_ops.reduce_sum(math_ops.add(math_ops.abs(x), y), axis=3) - self.assertEqual( - str(trace), "reduce_sum(add(name=None, x=abs(x), y=y), axis=3)") - - finally: - # Clean up. - dispatch._GLOBAL_DISPATCHERS = original_global_dispatchers - if __name__ == "__main__": googletest.main() + + From 840e8b64a1a8ccbd88bf00621019912ec17c16a9 Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Wed, 13 May 2020 10:20:30 -0700 Subject: [PATCH 0500/1533] Set up TFRT OSS dependency in Tensorflow. PiperOrigin-RevId: 311354250 Change-Id: I79f65da3dbde9ea21d412860fb63b417818268ee --- tensorflow/workspace.bzl | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 7cc156a2985..6a958e1b00f 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -162,19 +162,6 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): print("path_prefix was specified to tf_workspace but is no longer used " + "and will be removed in the future.") - TFRT_COMMIT = "26fb26d716545388edb9785f8f4b3e60a4ad5092" - TFRT_SHA256 = "f7419a3eaab8b7137a4de5b428045a731d93da91ef1bce9ba91fab81ed23a676" - TFRT_URLS = [ - "http://mirror.tensorflow.org/github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), - "https://github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), - ] - tf_http_archive( - name = "tf_runtime", - sha256 = TFRT_SHA256, - strip_prefix = "runtime-" + TFRT_COMMIT, - urls = TFRT_URLS, - ) - tf_http_archive( name = "XNNPACK", sha256 = "15a300dec0d483af67310ed2edf76a6eff643e1438d0612ad00a372add472c22", From 88c4ee01021e67e7eaf49a32de381e751d484495 Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Wed, 13 May 2020 10:48:49 -0700 Subject: [PATCH 0501/1533] Adding TensorHandleList as the returned type for TF_ConcreteFunctionGetCaptures. 
PiperOrigin-RevId: 311360593 Change-Id: Ic292aef980339e5bd5e360eea391bbee4751caf9 --- .../c/experimental/saved_model/internal/BUILD | 37 +++++++++++++-- .../saved_model/internal/concrete_function.cc | 9 ++-- .../saved_model/internal/tensorhandle_list.cc | 39 +++++++++++++++ .../internal/tensorhandle_list_type.h | 37 +++++++++++++++ .../c/experimental/saved_model/public/BUILD | 7 +++ .../saved_model/public/c_saved_model_api.h | 1 + .../saved_model/public/concrete_function.h | 6 +-- .../public/concrete_function_list.h | 16 +++++-- .../saved_model/public/tensorhandle_list.h | 47 +++++++++++++++++++ 9 files changed, 184 insertions(+), 15 deletions(-) create mode 100644 tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc create mode 100644 tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h create mode 100644 tensorflow/c/experimental/saved_model/public/tensorhandle_list.h diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index 7a694f4f803..5c51e26f925 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -31,9 +31,6 @@ cc_library( "//tensorflow/c/experimental/saved_model/public:concrete_function.h", ], copts = tf_copts(), - # TODO(bmzhao): Remove this as we refactor C API to granular targets, - # so that we can depend on c/eager/c_api_unified_experimental.h. - features = ["-layering_check"], visibility = [ "//tensorflow/c/experimental/saved_model/public:__pkg__", ], @@ -41,6 +38,8 @@ cc_library( ":concrete_function_type", ":function_metadata", ":function_metadata_type", + ":tensorhandle_list", + ":tensorhandle_list_type", "//tensorflow/c:c_api_macros", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_internal", @@ -160,6 +159,38 @@ cc_library( ], ) +cc_library( + name = "tensorhandle_list", + srcs = [ + "tensorhandle_list.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:tensorhandle_list.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":tensorhandle_list_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:tfe_tensorhandle_internal", + ], +) + +cc_library( + name = "tensorhandle_list_type", + hdrs = [ + "tensorhandle_list_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/eager:tensor_handle_interface", + ], +) + tf_cc_test( name = "saved_model_api_test", size = "small", diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc index 4884f9e2e97..dd54416ddf9 100644 --- a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc @@ -15,12 +15,12 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" -#include "tensorflow/c/eager/c_api_unified_experimental.h" #include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" #include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" #include "tensorflow/c/experimental/saved_model/internal/function_metadata_type.h" +#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" extern "C" { @@ -29,10 +29,9 @@ TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata(TF_ConcreteFunction* func) { &tensorflow::unwrap(func)->GetFunctionMetadata())); } -TF_OutputList* TF_ConcreteFunctionGetCaptures(TF_ConcreteFunction* func) { - // TODO(bmzhao): Refactor TF_OutputList struct definition into a separate - // internal header, and implement this function. - return nullptr; +const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( + TF_ConcreteFunction* func) { + return tensorflow::wrap(&tensorflow::unwrap(func)->GetCaptures()); } TFE_Op* TF_ConcreteFunctionGetCallOp(TF_ConcreteFunction* func) { diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc new file mode 100644 index 00000000000..6ef937591aa --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" + +#include + +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" + +extern "C" { + +size_t TF_TensorHandleListSize(const TF_TensorHandleList* list) { + return tensorflow::unwrap(list)->size(); +} + +TFE_TensorHandle* TF_TensorHandleListGet(const TF_TensorHandleList* list, + int i) { + return tensorflow::wrap((*tensorflow::unwrap(list))[i]); +} + +void TF_DeleteTensorHandleList(const TF_TensorHandleList* list) { + delete tensorflow::unwrap(list); +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h new file mode 100644 index 00000000000..8cbec2806a8 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ + +#include + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" + +// Internal structures used by the SavedModel C API. These are likely to +// change and should not be depended on. + +typedef struct TF_TensorHandleList TF_TensorHandleList; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS( + std::vector, + TF_TensorHandleList) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD index af65e05e7f6..0cfa0a2c005 100644 --- a/tensorflow/c/experimental/saved_model/public/BUILD +++ b/tensorflow/c/experimental/saved_model/public/BUILD @@ -24,6 +24,7 @@ exports_files( "concrete_function_list.h", "function_metadata.h", "saved_model_api.h", + "tensorhandle_list.h", ], visibility = ["//tensorflow/c/experimental/saved_model/internal:__pkg__"], ) @@ -39,6 +40,7 @@ cc_library( ":concrete_function_list", ":function_metadata", ":saved_model_api", + ":tensorhandle_list", ], ) @@ -61,3 +63,8 @@ alias( name = "saved_model_api", actual = "//tensorflow/c/experimental/saved_model/internal:saved_model_api", ) + +alias( + name = "tensorhandle_list", + actual = "//tensorflow/c/experimental/saved_model/internal:tensorhandle_list", +) diff --git a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h index 30f533f140a..aae95a5477c 100644 --- a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" #include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" // IWYU pragma: end_exports #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function.h b/tensorflow/c/experimental/saved_model/public/concrete_function.h index 351d8daed8e..2a87214270c 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -17,9 +17,9 @@ limitations under the License. 
#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ #include "tensorflow/c/c_api_macros.h" -#include "tensorflow/c/eager/c_api_internal.h" -#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" #ifdef __cplusplus extern "C" { @@ -36,7 +36,7 @@ TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( TF_ConcreteFunction* func); // Returns a list of TensorHandles implicitly captured by this function. -TF_CAPI_EXPORT extern TF_OutputList* TF_ConcreteFunctionGetCaptures( +TF_CAPI_EXPORT extern const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( TF_ConcreteFunction* func); // Returns a TFE_Op suitable for executing this function. diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function_list.h b/tensorflow/c/experimental/saved_model/public/concrete_function_list.h index 7add847259c..e35546751f1 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function_list.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function_list.h @@ -21,19 +21,27 @@ limitations under the License. #include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + // An opaque type that is acts like a list of TF_ConcreteFunction pointers. typedef struct TF_ConcreteFunctionList TF_ConcreteFunctionList; // Returns the size of `list`. -TF_CAPI_EXPORT size_t -TF_ConcreteFunctionListSize(TF_ConcreteFunctionList* list); +TF_CAPI_EXPORT extern size_t TF_ConcreteFunctionListSize( + TF_ConcreteFunctionList* list); // Returns the `i`th TF_ConcreteFunction in the list. -TF_CAPI_EXPORT TF_ConcreteFunction* TF_ConcreteFunctionListGet( +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_ConcreteFunctionListGet( TF_ConcreteFunctionList* list, int i); // Deletes `list`. -TF_CAPI_EXPORT void TF_DeleteConcreteFunctionList( +TF_CAPI_EXPORT extern void TF_DeleteConcreteFunctionList( TF_ConcreteFunctionList* list); +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h new file mode 100644 index 00000000000..393708aa2bf --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that is acts like a list of TF_ConcreteFunction pointers. +typedef struct TF_TensorHandleList TF_TensorHandleList; + +// Returns the size of `list`. +TF_CAPI_EXPORT extern size_t TF_TensorHandleListSize( + const TF_TensorHandleList* list); + +// Returns the `i`th TFE_TensorHandle in the list. +TF_CAPI_EXPORT extern TFE_TensorHandle* TF_TensorHandleListGet( + const TF_TensorHandleList* list, int i); + +// Deletes `list`. +TF_CAPI_EXPORT extern void TF_DeleteTensorHandleList( + const TF_TensorHandleList* list); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ From b97bf5ae0be96a3e00aa12a096263c9de08f474c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 11:06:08 -0700 Subject: [PATCH 0502/1533] Flush denormals to zero in eager mode. PiperOrigin-RevId: 311364546 Change-Id: I42efa6b19b8193c49bc581879b04ce3d05a13607 --- .../common_runtime/eager/kernel_and_device.cc | 4 +++ .../python/kernel_tests/denormal_test.py | 33 +++++++++---------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 3c586e8188a..bf7c083f24b 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -35,8 +35,10 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/setround.h" #include "tensorflow/core/profiler/lib/annotated_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/public/version.h" @@ -281,6 +283,8 @@ Status KernelAndDeviceOp::Run( OpKernelContext context(¶ms); { + port::ScopedFlushDenormal flush; + port::ScopedSetRound round(FE_TONEAREST); // 'AnnotatedTraceMe' will trace both scheduling time on host and execution // time on device of the OpKernel. 
profiler::AnnotatedTraceMe activity( diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py index d824e95f213..6e073f0d526 100644 --- a/tensorflow/python/kernel_tests/denormal_test.py +++ b/tensorflow/python/kernel_tests/denormal_test.py @@ -23,7 +23,6 @@ import platform from tensorflow.python.framework import constant_op from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -35,32 +34,30 @@ class DenormalTest(test.TestCase): tiny = np.finfo(dtype).tiny self.assertEqual(tiny, tiny / 16 * 16) - def _flushDenormalsTest(self, use_gpu, dtypes): - if platform.machine() == "ppc64le" or platform.machine( - ) == "s390x" or platform.machine() == "aarch64": + def _flushDenormalsTest(self, dtypes): + if (platform.machine() == "ppc64le" or platform.machine() == "s390x" or + platform.machine() == "aarch64"): # Disabled denormal_test on power/s390x/aarch64 platform # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902 return - with self.cached_session(use_gpu=use_gpu): - array_ops.identity(7).eval() - for dtype in dtypes: - tiny = np.finfo(dtype).tiny - # Small shape to test main thread, large shape to test thread pool - for shape in (), (1 << 20,): - flush = 0.1 * constant_op.constant(tiny, shape=shape) - self.assertAllEqual(flush.eval(), np.zeros(shape)) - # Make sure the flags don't leak out - self.testPythonHasDenormals() + for dtype in dtypes: + tiny = np.finfo(dtype).tiny + # Small shape to test main thread, large shape to test thread pool + for shape in (), (1 << 20,): + flush = 0.1 * constant_op.constant(tiny, shape=shape) + self.assertAllEqual(self.evaluate(flush), np.zeros(shape)) + # Make sure the flags don't leak out + self.testPythonHasDenormals() - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=False) def testFlushDenormalsCPU(self): # On CPUs, the processor flags flush for both single and double precision. - self._flushDenormalsTest(use_gpu=False, dtypes=(np.float32, np.float64)) + self._flushDenormalsTest(dtypes=(np.float32, np.float64)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testFlushDenormalsGPU(self): # On GPUs, only single precision can flush to zero. - self._flushDenormalsTest(use_gpu=True, dtypes=(np.float32,)) + self._flushDenormalsTest(dtypes=(np.float32,)) if __name__ == "__main__": From 59239ab4990468323df7c8237713bbae7a77b548 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Wed, 13 May 2020 11:18:28 -0700 Subject: [PATCH 0503/1533] Use round-robin approach to reading from tf.data service workers. 
PiperOrigin-RevId: 311367134 Change-Id: I5408de5d85c13514c55681ecf09dcecec5c2168a --- .../experimental/data_service_dataset_op.cc | 264 +++++++++++------- 1 file changed, 167 insertions(+), 97 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc index 697f4d99a1e..56077a671fb 100644 --- a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc @@ -189,7 +189,9 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { VLOG(1) << "Destroying data service dataset iterator for job id " << job_id_; cancelled_ = true; - cv_.notify_all(); + worker_thread_cv_.notify_all(); + manager_thread_cv_.notify_all(); + get_next_cv_.notify_all(); // Thread destructors will block until the threads finish, no need to wait // here. } @@ -222,12 +224,16 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { }); } - while (results_.empty() && !job_finished_ && !cancelled_) { - cv_.wait(l); + while (results_.empty() && !job_finished_ && !cancelled_ && + status_.ok()) { + get_next_cv_.wait(l); } if (cancelled_) { return errors::Cancelled("Data service iterator was cancelled"); } + if (!status_.ok()) { + return status_; + } if (results_.empty()) { *end_of_sequence = true; return Status::OK(); @@ -236,7 +242,7 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { *end_of_sequence = false; out_tensors->swap(results_.front()); results_.pop(); - cv_.notify_all(); + worker_thread_cv_.notify_one(); return Status::OK(); } @@ -259,16 +265,21 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { } private: - typedef struct TaskThread { - int64 task_id; - // Cached address of the worker for task `task_id`. - std::string address; - std::unique_ptr worker; - std::unique_ptr thread; - bool end_of_sequence = false; - // Indicates that the thread has finished running. - bool finished = false; - } TaskThread; + struct Task { + Task(int64 task_id, const std::string& address, + std::unique_ptr worker) + : task_id(task_id), address(address), worker(std::move(worker)) {} + + const int64 task_id; + // Address of the tf.data service worker for task `task_id`. + const std::string address; + // Client for fetching task elements from the tf.data service worker. + const std::unique_ptr worker; + // Indicates whether a worker thread is currently processing the task. + bool in_use TF_GUARDED_BY(&Iterator::mu_) = false; + // Indicates whether the worker has returned end_of_sequence for the task. + bool end_of_sequence TF_GUARDED_BY(&Iterator::mu_) = false; + }; // Periodically refresh the task list. // Maintain one thread fetching elements for each task. 
@@ -286,22 +297,23 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { int64 remaining_time = next_check - Env::Default()->NowMicros(); VLOG(3) << "Task thread manager waiting for " << remaining_time << "us"; - cv_.wait_for(l, std::chrono::microseconds(remaining_time)); + manager_thread_cv_.wait_for( + l, std::chrono::microseconds(remaining_time)); } if (cancelled_) { VLOG(3) << "Task thread manager finished"; return; } } - UpdateTaskThreads(&master, ctx.get()); + UpdateTasks(&master); + UpdateWorkerThreads(ctx.get()); next_check = Env::Default()->NowMicros() + dataset()->task_refresh_interval_ms_ * 1000; } } - void UpdateTaskThreads(DataServiceMasterClient* master, - IteratorContext* ctx) LOCKS_EXCLUDED(mu_) { - VLOG(3) << "Updating task threads"; + void UpdateTasks(DataServiceMasterClient* master) LOCKS_EXCLUDED(mu_) { + VLOG(3) << "Updating tasks"; std::vector tasks; bool job_finished; Status s = master->GetTasks(job_id_, &tasks, &job_finished); @@ -310,94 +322,119 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { << s; return; } - absl::flat_hash_set task_ids; + absl::flat_hash_map task_id_to_task; + for (auto& task : tasks) { + task_id_to_task[task.id()] = task; + } mutex_lock l(mu_); job_finished_ = job_finished; - for (auto& task : tasks) { - task_ids.insert(task.id()); - if (task_threads_.contains(task.id())) { - continue; - } - task_threads_[task.id()] = absl::make_unique(); - TaskThread* task_thread = task_threads_[task.id()].get(); - task_thread->task_id = task.id(); - task_thread->address = task.worker_address(); - num_unfinished_tasks_++; - outstanding_requests_++; - auto done = [this, task_thread]() { - mutex_lock l(mu_); - num_unfinished_tasks_--; - outstanding_requests_--; - cv_.notify_all(); - task_thread->finished = true; - VLOG(3) << "Task thread " << task_thread->task_id << " finished"; - }; - task_thread->thread = - ctx->StartThread("tf-data-service-task_thread", - [this, task_thread, done = std::move(done)]() { - RunTaskThread(task_thread, std::move(done)); - }); + if (job_finished) { + get_next_cv_.notify_all(); + return; } - // Mark deleted tasks and clean up finished task threads. - for (auto it = task_threads_.begin(); it != task_threads_.end();) { - TaskThread* task_thread = it->second.get(); - if (task_thread->finished) { - task_threads_.erase(it++); + for (int i = 0; i < tasks_.size(); ++i) { + std::shared_ptr task = tasks_[i]; + if (task_id_to_task.contains(task->task_id)) { + // Remove already-known tasks from `task_id_to_task`, so that at the + // end of the loop, only new tasks remain. + task_id_to_task.erase(task->task_id); + } else { + // Task has been removed. + if (task->end_of_sequence) { + finished_tasks_--; + } + tasks_[i] = tasks_[tasks_.size() - 1]; + tasks_.pop_back(); + } + } + for (auto& new_task_entry : task_id_to_task) { + TaskInfo& task_info = new_task_entry.second; + std::unique_ptr worker; + Status s = CreateDataServiceWorkerClient(task_info.worker_address(), + dataset()->protocol_, &worker); + if (!s.ok()) { + status_ = s; + get_next_cv_.notify_all(); continue; } - if (!task_ids.contains(task_thread->task_id)) { - VLOG(3) << "Marking removed task thread " << task_thread->task_id - << " as finished"; - task_thread->end_of_sequence = true; - } - ++it; + tasks_.push_back(std::make_shared( + task_info.id(), task_info.worker_address(), std::move(worker))); } if (dataset()->max_outstanding_requests_ == model::kAutotune) { // Adjust max_outstanding_requests to account for newly added tasks. 
- max_outstanding_requests_ = task_threads_.size(); + max_outstanding_requests_ = tasks_.size(); } } - void RunTaskThread(TaskThread* task_thread, std::function done) { + void UpdateWorkerThreads(IteratorContext* ctx) LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + while (num_running_worker_threads_ < max_outstanding_requests_) { + num_running_worker_threads_++; + outstanding_requests_++; + auto done = [this]() { + mutex_lock l(mu_); + num_running_worker_threads_--; + outstanding_requests_--; + VLOG(3) << "Exiting worker thread"; + }; + worker_threads_.push_back(ctx->StartThread( + "tf-data-service-task_thread", [this, done = std::move(done)]() { + RunWorkerThread(std::move(done)); + })); + } + } + + void RunWorkerThread(std::function done) { auto cleanup = gtl::MakeCleanup([done = std::move(done)]() { done(); }); - VLOG(3) << "Starting task thread for task " << task_thread->task_id - << " with worker address " << task_thread->address; + VLOG(3) << "Starting worker thread"; + std::shared_ptr task_to_process; while (true) { - if (!task_thread->worker) { - Status s = CreateDataServiceWorkerClient( - task_thread->address, dataset()->protocol_, &task_thread->worker); - if (!s.ok()) { - LOG(WARNING) << "Failed to create a worker client for " - << task_thread->address << ": " << s; - } - } { mutex_lock l(mu_); - if (task_thread->end_of_sequence) { - VLOG(3) << "Task thread" << task_thread->task_id - << " reached end_of_sequence"; - return; + if (task_to_process) { + task_to_process->in_use = false; + task_to_process = nullptr; + worker_thread_cv_.notify_one(); } outstanding_requests_--; - while (!cancelled_ && results_.size() + outstanding_requests_ >= - max_outstanding_requests_) { - VLOG(3) << "Task thread for task " << task_thread->task_id - << " waiting. results_.size()=" << results_.size() - << " outstanding_requests_=" << outstanding_requests_; - cv_.wait(l); + while (!cancelled_ && !(SpaceInBuffer() && TaskAvailable())) { + if (VLOG_IS_ON(3)) { + VLOG(3) << "Sleeping with results_.size=" << results_.size() + << ", outstanding_requests_=" << outstanding_requests_ + << ", max_oustanding_requests=" + << max_outstanding_requests_ + << " finished_tasks=" << finished_tasks_ + << " tasks_.size()=" << tasks_.size(); + } + worker_thread_cv_.wait(l); } - outstanding_requests_++; if (cancelled_) { return; } + outstanding_requests_++; + // Search for a task to update. + int num_tasks = tasks_.size(); + for (int i = 0; i < num_tasks; ++i) { + int index = (next_task_index_ + i) % num_tasks; + std::shared_ptr& task = tasks_[index]; + if (!task->in_use && !task->end_of_sequence) { + task->in_use = true; + task_to_process = task; + next_task_index_ = (index + 1) % num_tasks; + break; + } + } + DCHECK(task_to_process != nullptr); + VLOG(3) << "Processing task " << task_to_process->task_id; } - // TODO(aaudibert): add backoff and max retries. int64 deadline_micros = Env::Default()->NowMicros() + kRetryTimeoutMicros; - Status s = GetElement(task_thread, deadline_micros); + Status s = GetElement(task_to_process.get(), deadline_micros); if (!s.ok()) { - LOG(WARNING) << "Failed to get element from worker at " - << task_thread->address << ": " << s; + mutex_lock l(mu_); + status_ = s; + get_next_cv_.notify_all(); + return; } } } @@ -407,18 +444,27 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { // If the task reaches end_of_sequence or is cancelled (e.g. due to a // worker dying), GetElement returns Status::OK() without adding to // `results_`. 
- Status GetElement(TaskThread* task_thread, int64 deadline_micros) { - VLOG(3) << "Getting an element for task id " << task_thread->task_id; + Status GetElement(Task* task, int64 deadline_micros) + TF_LOCKS_EXCLUDED(mu_) { + VLOG(3) << "Getting an element for task id " << task->task_id; tensorflow::profiler::TraceMe activity( "GetElement", tensorflow::profiler::TraceMeLevel::kInfo); CompressedElement compressed; bool end_of_sequence; for (int num_retries = 0;; ++num_retries) { - Status s = task_thread->worker->GetElement( - task_thread->task_id, &compressed, &end_of_sequence); + Status s = task->worker->GetElement(task->task_id, &compressed, + &end_of_sequence); if (s.ok()) { break; } + if (errors::IsNotFound(s)) { + // This indicates that the worker was restarted. The restarted worker + // will get a new task, and the old task is lost. + mutex_lock l(mu_); + finished_tasks_++; + task->end_of_sequence = true; + return Status::OK(); + } // Retry all errors that could indicate preemption. if (!errors::IsUnavailable(s) && !errors::IsCancelled(s) && !errors::IsAborted(s)) { @@ -428,7 +474,7 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { mutex_lock l(mu_); // If `UpdateTaskThreads` finds that the task has been cancelled, it // will set end_of_sequence to `true`. - if (task_thread->end_of_sequence || cancelled_) { + if (task->end_of_sequence || cancelled_) { return Status::OK(); } } @@ -454,21 +500,31 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { } mutex_lock l(mu_); if (end_of_sequence) { - task_thread->end_of_sequence = true; + task->end_of_sequence = true; + finished_tasks_++; return Status::OK(); } results_.push(std::move(element)); - cv_.notify_all(); - VLOG(3) << "Got an element for task id " << task_thread->task_id; + get_next_cv_.notify_all(); + VLOG(3) << "Got an element for task id " << task->task_id; return Status::OK(); } + bool SpaceInBuffer() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return results_.size() + outstanding_requests_ < + max_outstanding_requests_; + } + + bool TaskAvailable() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return finished_tasks_ + outstanding_requests_ < tasks_.size(); + } + const int64 iterator_index_; mutex mu_; - // TODO(aaudibert): split this into a couple cvs for different conditions - // so that we can use notify_one and avoid unnecessary wakeups. - condition_variable cv_ TF_GUARDED_BY(mu_); + condition_variable get_next_cv_ TF_GUARDED_BY(mu_); + condition_variable worker_thread_cv_ TF_GUARDED_BY(mu_); + condition_variable manager_thread_cv_ TF_GUARDED_BY(mu_); bool cancelled_ TF_GUARDED_BY(mu_) = false; int64 outstanding_requests_ TF_GUARDED_BY(mu_) = 0; @@ -476,17 +532,31 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { // at the same time. This count includes both in-progress requests for // elements as well as completed requests which haven't yet been produced. int64 max_outstanding_requests_ TF_GUARDED_BY(mu_); + + // The number of threads in `worker_threads_` which are still running. + int64 num_running_worker_threads_ TF_GUARDED_BY(mu_) = 0; + + // The index of the next task in `tasks_` to read from. + int64 next_task_index_ TF_GUARDED_BY(mu_) = 0; + + // The number tasks in the `tasks_` list that have reached end_of_sequence. + int64 finished_tasks_ TF_GUARDED_BY(mu_) = 0; + + // List of tasks to read from. + std::vector> tasks_ TF_GUARDED_BY(mu_); + + // A status to be returned from the next call to `GetNext`. This is set by + // asynchronous threads when they encounter errors. 
+ Status status_ TF_GUARDED_BY(mu_) = Status::OK(); std::queue> results_ TF_GUARDED_BY(mu_); // Set once in Initialize(). int64 job_id_; - int64 num_unfinished_tasks_ TF_GUARDED_BY(mu_) = 0; bool job_finished_ = false; - // Must come second to last so that task threads are joined before + // Must be ordered second to last so that worker threads are joined before // destroying other fields. - absl::flat_hash_map> task_threads_ - TF_GUARDED_BY(mu_); + std::vector> worker_threads_ TF_GUARDED_BY(mu_); // Must be ordered last so that the thread is joined before destroying other // fields. std::unique_ptr task_thread_manager_ GUARDED_BY(mu_); From d45abae4e938be8f6bac8b9a1e1344241a30e2a1 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 13 May 2020 11:20:11 -0700 Subject: [PATCH 0504/1533] [XLA] Move SPMD partitioner to third_party This change moves the work on SPMD partitioning that the XLA team has been working on in the past 12 months. PiperOrigin-RevId: 311367525 Change-Id: If174527128c222c53736dc8db2ef1ea4177fb476 --- tensorflow/compiler/xla/service/BUILD | 31 + .../compiler/xla/service/hlo_sharding_util.cc | 574 ++ .../compiler/xla/service/hlo_sharding_util.h | 143 + .../xla/service/hlo_sharding_util_test.cc | 206 + tensorflow/compiler/xla/service/spmd/BUILD | 69 + .../xla/service/spmd/spmd_partitioner.cc | 4655 +++++++++++++++++ .../xla/service/spmd/spmd_partitioner.h | 435 ++ .../xla/service/spmd/spmd_partitioner_test.cc | 3191 +++++++++++ .../xla/service/spmd/spmd_partitioner_util.cc | 662 +++ .../xla/service/spmd/spmd_partitioner_util.h | 229 + 10 files changed, 10195 insertions(+) create mode 100644 tensorflow/compiler/xla/service/hlo_sharding_util.cc create mode 100644 tensorflow/compiler/xla/service/hlo_sharding_util.h create mode 100644 tensorflow/compiler/xla/service/hlo_sharding_util_test.cc create mode 100644 tensorflow/compiler/xla/service/spmd/BUILD create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner.h create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 3349528ebc2..126b62a8eb2 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -460,6 +460,37 @@ cc_library( ], ) +cc_library( + name = "hlo_sharding_util", + srcs = [ + "hlo_sharding_util.cc", + ], + hdrs = [ + "hlo_sharding_util.h", + ], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:array", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "hlo_sharding_util_test", + srcs = [ + "hlo_sharding_util_test.cc", + ], + deps = [ + ":hlo_sharding_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + tf_cc_test( name = "dynamic_parameter_binding_test", srcs = ["dynamic_parameter_binding_test.cc"], diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc new file mode 100644 index 00000000000..129091ca06f --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ 
-0,0 +1,574 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" + +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/array.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace hlo_sharding_util { + +absl::optional SelectDominantDevice( + const std::map& device_map, int64* top_count) { + int64 device = 0; + int64 count = 0; + for (auto& it : device_map) { + if (it.second > count) { + count = it.second; + device = it.first; + } + } + if (top_count != nullptr) { + *top_count = count; + } + return count > 0 ? absl::optional(device) : absl::optional(); +} + +Status AssignComputationDevice(HloComputation* computation, int64 device) { + VLOG(4) << "Assigning device " << device << " to " << computation->name() + << " computation"; + for (HloInstruction* instruction : computation->instructions()) { + if (!instruction->has_sharding()) { + VLOG(4) << "Assigning device " << device << " to " << instruction->name(); + instruction->set_device_sharding(device); + } + } + return Status::OK(); +} + +absl::optional GetMostOccurringDevice( + absl::Span instructions) { + std::map device_map; + for (HloInstruction* instruction : instructions) { + if (instruction->has_sharding()) { + for (auto& it : instruction->sharding().UsedDevices(nullptr)) { + // The UsedDevices() API returns a map. + device_map[it.first] += it.second; + } + } + } + return SelectDominantDevice(device_map, nullptr); +} + +StatusOr> GetDominantDevice( + absl::Span computations, double dominant_factor) { + int64 instruction_count = 0; + std::map device_map; + for (HloComputation* computation : computations) { + for (HloInstruction* instruction : computation->instructions()) { + int64 count = 1; + if (instruction->has_sharding()) { + for (auto& it : instruction->sharding().UsedDevices(&count)) { + // The UsedDevices() API returns a map. 
+ device_map[it.first] += it.second; + } + } + instruction_count += count; + } + } + int64 count; + absl::optional device = SelectDominantDevice(device_map, &count); + absl::optional dominant_device; + if (device) { + double factor = + static_cast(count) / static_cast(instruction_count); + if (factor >= dominant_factor) { + dominant_device = device; + } + } + return dominant_device; +} + +HloSharding TransposeSharding(const HloSharding& sharding, + const std::vector& dimensions) { + if (sharding.IsTileMaximal()) { + return sharding; + } + const int64 rank = dimensions.size(); + std::vector tile_assignment_dim(rank); + for (int64 i = 0; i < rank; ++i) { + tile_assignment_dim[i] = sharding.tile_assignment().dim(dimensions[i]); + } + Array tile_assignment = sharding.tile_assignment(); + tile_assignment.Reshape(tile_assignment_dim); + tile_assignment.Each([&](absl::Span indices, int64* value) { + std::vector src_indices(indices.size(), -1); + for (int64 i = 0; i < indices.size(); ++i) { + src_indices[dimensions[i]] = indices[i]; + } + *value = sharding.tile_assignment()(src_indices); + }); + return HloSharding::Tile(tile_assignment); +} + +absl::optional ReshapeSharding(const Shape& source_shape, + const Shape& target_shape, + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return sharding; + } + + // In case of a tiled sharding the reshaped sharding will be a valid if the + // reshape is composed from the following operations: + // * Adding or removing dimensions with size 1. + // * Merging consecutive dimensions where only the most major is sharded. + // * Splitting a dimension to consecutive dimensions. + // * Any reshaping of unsharded dimensions. + // Note that merge and split can happen consecutively on the same dimension, + // e.g., f32[1024,256,1024] to f32[128,2048,1024] can be considered that 1024 + // gets split into 128 and 8, but 8 then gets merged with 256. We use stacks + // to make supporting such cases easy. + const Shape tile_shape = sharding.TileShape(source_shape); + std::vector target_tile_assignment_dimensions; + std::vector source_dims_stack(source_shape.rank()); + std::vector target_dims_stack(target_shape.rank()); + std::vector sharding_tile_dims_stack(source_shape.rank()); + for (int64 i = 0; i < source_shape.rank(); ++i) { + source_dims_stack[i] = source_shape.dimensions(source_shape.rank() - 1 - i); + sharding_tile_dims_stack[i] = + sharding.tile_assignment().dim(source_shape.rank() - 1 - i); + } + for (int64 i = 0; i < target_shape.rank(); ++i) { + target_dims_stack[i] = target_shape.dimensions(target_shape.rank() - 1 - i); + } + while (!source_dims_stack.empty() || !target_dims_stack.empty()) { + if (target_dims_stack.empty()) { + if (Product(sharding_tile_dims_stack) != 1) { + return absl::nullopt; + } + break; + } + int64 s_size = 1; + int64 t_size = 1; + int64 s_partitions = 1; + if (!source_dims_stack.empty()) { + s_size = source_dims_stack.back(); + source_dims_stack.pop_back(); + s_partitions = sharding_tile_dims_stack.back(); + sharding_tile_dims_stack.pop_back(); + } + t_size = target_dims_stack.back(); + target_dims_stack.pop_back(); + if (s_partitions * Product(sharding_tile_dims_stack) == 1) { + // No more partitions left. + target_tile_assignment_dimensions.push_back(1); + continue; + } + if (s_size == t_size) { + // Same dimension. + target_tile_assignment_dimensions.push_back(s_partitions); + } else if (t_size == 1) { + // Trivial dimension added. 
+ target_tile_assignment_dimensions.push_back(1); + source_dims_stack.push_back(s_size); + sharding_tile_dims_stack.push_back(s_partitions); + } else if (s_size == 1) { + // Trivial dimension removed. + if (s_partitions != 1) { + return absl::nullopt; + } + target_dims_stack.push_back(t_size); + } else if (s_size > t_size) { + // Dimension split. + if (s_size % t_size != 0 || t_size % s_partitions != 0) { + return absl::nullopt; + } + target_tile_assignment_dimensions.push_back(s_partitions); + // We have part of the s_size unprocessed, so put it back to stack. + source_dims_stack.push_back(s_size / t_size); + sharding_tile_dims_stack.push_back(1); + } else { + // Dimension merge. Also merge the source dimension with the next, and + // process it next time. + if (s_size % s_partitions != 0) { + return absl::nullopt; + } + CHECK(!source_dims_stack.empty()); + if (sharding_tile_dims_stack.back() != 1 && s_size != s_partitions) { + // If the next dimension to combine is sharded, we require that the + // current dimension's shard size to be 1. Otherwise, the new shard + // would be non-contiguous. + return absl::nullopt; + } + source_dims_stack.back() *= s_size; + sharding_tile_dims_stack.back() *= s_partitions; + target_dims_stack.push_back(t_size); + } + } + Array new_tile_assignment = sharding.tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, + absl::Span dims) { + CHECK(!sharding.IsTuple() && !sharding.IsTileMaximal()); + CHECK_NE(absl::c_find(dims, dim), dims.end()) << "dim is not in dims"; + // We optimize the tile assignment on the single dimension dim in a way to + // minimize communication among devices caused by the reshard: + // +---+---+ +---+---+ +-+-+-+-+ + // | | | | 0 | | | | | | + // | 0 | 1 | +-------+ | | | | | + // | | | reshape on | 1 | reshape on | | | | | + // +---+---+ dim 0 => +-------+ dim 1 => |0|2|1|3| + // | | | | 2 | | | | | | + // | 2 | 3 | +-------+ | | | | | + // | | | | 3 | | | | | | + // +---+---+ +---+---+ +-+-+-+-+ + + std::vector tile_dims(sharding.tile_assignment().num_dimensions(), 1); + // Handle ignore dimensions. 
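+  // Dimensions not listed in `dims` keep their original tile size; only the
+  // dimensions in `dims` are collapsed into `dim` below.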
+ std::vector ignore_sizes; + int64 ignore_size = 1; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (absl::c_find(dims, i) == dims.end()) { + int64 size = sharding.tile_assignment().dim(i); + ignore_sizes.push_back(size); + tile_dims[i] = size; + ignore_size *= size; + } + } + + using Buckets = std::vector>; + Array buckets(ignore_sizes, + Buckets(sharding.tile_assignment().dim(dim))); + sharding.tile_assignment().Each( + [&](absl::Span index, int64 device) { + std::vector ignore_index; + for (int64 i = 0; i < index.size(); ++i) { + if (absl::c_find(dims, i) == dims.end()) { + ignore_index.push_back(index[i]); + } + } + buckets(ignore_index)[index[dim]].push_back(device); + }); + std::vector devices; + buckets.Each([&](absl::Span index, const Buckets& buckets) { + for (auto& bucket : buckets) { + devices.insert(devices.end(), bucket.begin(), bucket.end()); + } + }); + tile_dims[dim] = devices.size() / ignore_size; + Array tile_assignment(tile_dims); + tile_assignment.SetValues(devices); + return HloSharding::Tile(tile_assignment); +} + +bool ContainsTileSharding(const HloModule& module) { + for (const HloComputation* computation : module.computations()) { + for (const HloInstruction* instruction : computation->instructions()) { + if (instruction->has_sharding() && + !instruction->sharding().IsTileMaximal()) { + return true; + } + } + } + return false; +} + +HloSharding GatherOutputSharding(const HloSharding& index_sharding, + const HloInstruction* hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + const GatherDimensionNumbers& dnums = hlo->gather_dimension_numbers(); + std::vector output_tile_assignment_dims; + for (int64 i = 0, index_dim = 0; i < hlo->shape().rank(); ++i) { + if (absl::c_binary_search(dnums.offset_dims(), i)) { + output_tile_assignment_dims.push_back(1); + } else { + output_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dim(index_dim)); + index_dim++; + } + } + Array new_tile_assignment = index_sharding.tile_assignment(); + new_tile_assignment.Reshape(output_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding GatherIndexSharding(const HloSharding& output_sharding, + const HloInstruction* hlo) { + if (output_sharding.IsTileMaximal()) { + return output_sharding; + } + + const GatherDimensionNumbers& dnums = hlo->gather_dimension_numbers(); + std::vector index_tile_assignment_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + index_tile_assignment_dims.push_back( + output_sharding.tile_assignment().dim(i)); + } + } + Array new_tile_assignment = output_sharding.tile_assignment(); + new_tile_assignment.Reshape(index_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo) { + if (hlo.sharding().IsTileMaximal()) { + return hlo.sharding(); + } + + const GatherDimensionNumbers& dnums = hlo.gather_dimension_numbers(); + std::vector tile_assignment_dims(hlo.shape().rank()); + int64 num_elements = 1; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + tile_assignment_dims[i] = hlo.sharding().tile_assignment().dim(i); + num_elements *= hlo.sharding().tile_assignment().dim(i); + } else { + tile_assignment_dims[i] = 1; + } + } + if (num_elements == hlo.sharding().tile_assignment().num_elements()) { + // Output sharding is only on non offset dimensions. 
We use output sharding + // to shard this gather op directly. + return hlo.sharding(); + } + + if (num_elements == 1) { + // Output sharding is only on offset dimensions. We do not shard this gather + // op. Return a tile maximal sharding with the first device in output + // sharding tile assignment. + return HloSharding::AssignDevice(*hlo.sharding().tile_assignment().begin()); + } + + // Output sharding is on both offset and non offset dimensions. We shard the + // gather op only on non offset dimensions. + // For example: + // - the gather op has sharding [2,2]{0,1,2,3}, + // - first dimension is non offset dimension, + // - second dimension is offset dimension, + // Then the result sharding will be [2,1]{0,2}. + std::vector slice_starts(hlo.shape().rank(), 0LL), + slice_limits(hlo.shape().rank()); + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + slice_limits[i] = hlo.sharding().tile_assignment().dim(i); + } else { + slice_limits[i] = 1; + } + } + Array tile_assignment = + hlo.sharding().tile_assignment().Slice(slice_starts, slice_limits); + return HloSharding::Tile(tile_assignment); +} + +HloSharding ScatterIndexSharding(const HloSharding& data_sharding, + const HloInstruction* hlo) { + if (data_sharding.IsTileMaximal()) { + return data_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo->scatter_dimension_numbers(); + std::vector index_tile_assignment_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.update_window_dims(), i)) { + index_tile_assignment_dims.push_back( + data_sharding.tile_assignment().dim(i)); + } + } + if (index_tile_assignment_dims.size() < hlo->operand(1)->shape().rank()) { + index_tile_assignment_dims.push_back(1); + } + Array new_tile_assignment = data_sharding.tile_assignment(); + new_tile_assignment.Reshape(index_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ScatterDataSharding(const HloSharding& index_sharding, + const HloInstruction* hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo->scatter_dimension_numbers(); + std::vector data_tile_assignment_dims; + for (int64 i = 0, index_dim = 0; i < hlo->shape().rank(); ++i) { + if (absl::c_binary_search(dnums.update_window_dims(), i)) { + data_tile_assignment_dims.push_back(1); + } else { + data_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dim(index_dim)); + index_dim++; + } + } + Array new_tile_assignment = index_sharding.tile_assignment(); + new_tile_assignment.Reshape(data_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, + const HloInstruction& hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + // Only shard on first "number of scatter_window_dims" dimensions. + const ScatterDimensionNumbers& dnums = hlo.scatter_dimension_numbers(); + int64 num_elements = 1; + int64 index_dim = 0; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_binary_search(dnums.inserted_window_dims(), i)) { + num_elements *= index_sharding.tile_assignment().dim(index_dim); + index_dim++; + } + } + if (num_elements == index_sharding.tile_assignment().num_elements()) { + // Index sharding is only on scatter_window_dims. We use this index sharding + // directly. + return index_sharding; + } + + // Index sharding is only on update_window_dims. 
We do not shard this scatter + // op. Return a tile maximal sharding with the first device in index sharding + // tile assignment. + if (num_elements == 1) { + return HloSharding::AssignDevice(*index_sharding.tile_assignment().begin()); + } + + const int64 index_rank = hlo.operand(1)->shape().rank(); + std::vector slice_starts(index_rank, 0LL), slice_limits(index_rank); + for (int64 i = 0; i < index_rank; ++i) { + if (i < index_dim) { + slice_limits[i] = index_sharding.tile_assignment().dim(i); + } else { + slice_limits[i] = 1; + } + } + Array tile_assignment = + index_sharding.tile_assignment().Slice(slice_starts, slice_limits); + return HloSharding::Tile(tile_assignment); +} + +HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, + const HloInstruction& hlo) { + if (data_sharding.IsTileMaximal()) { + return data_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo.scatter_dimension_numbers(); + const int64 data_rank = hlo.operand(2)->shape().rank(); + std::vector tile_assignment_dims(data_rank, 1LL); + int64 num_elements = 1; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_binary_search(dnums.inserted_window_dims(), i)) { + CHECK_LT(i, data_rank); + tile_assignment_dims[i] = data_sharding.tile_assignment().dim(i); + num_elements *= data_sharding.tile_assignment().dim(i); + } + } + if (num_elements == data_sharding.tile_assignment().num_elements()) { + // Data sharding is only on scatter_window_dims. We use this data sharding + // directly. + return data_sharding; + } + + if (num_elements == 1) { + // Data sharding is only on update_window_dims. We do not shard this + // scatter op. Return a tile maximal sharding with the first device in + // data sharding tile assignment. + return HloSharding::AssignDevice(*data_sharding.tile_assignment().begin()); + } + + // Data sharding is on both update_window_dims and scatter_window_dims. We + // shard the scatter op only on scatter_window_dims. For example: + // - the scatter data has sharding [2,2]{0,1,2,3}, + // - first dimension is scatter_window_dims, + // - second dimension is update_window_dims, + // Then the result sharding will be [2,1]{0,2}. + std::vector slice_starts(data_rank, 0LL); + Array tile_assignment = + data_sharding.tile_assignment().Slice(slice_starts, tile_assignment_dims); + return HloSharding::Tile(tile_assignment); +} + +StatusOr, HloOpcode>> +IdentityValueAndHloOpcodeForScatterReduceComputation( + const HloScatterInstruction& scatter) { + auto computation = scatter.to_apply(); + // We only handle computations with 2 parameters and only 1 calculation. 
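+  // (Two parameters plus a single root instruction give an instruction count
+  // of exactly 3.)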
+  if (computation->instruction_count() != 3) {
+    return Status(
+        tensorflow::error::Code::INVALID_ARGUMENT,
+        "Expected scatter reduce computation with 2 parameters and only 1 "
+        "calculation");
+  }
+
+  auto root_instruction = computation->root_instruction();
+  if (root_instruction->opcode() == HloOpcode::kAdd ||
+      root_instruction->opcode() == HloOpcode::kOr) {
+    return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::Zero(
+                              scatter.shape().element_type())),
+                          root_instruction->opcode());
+  } else if (root_instruction->opcode() == HloOpcode::kMultiply ||
+             root_instruction->opcode() == HloOpcode::kAnd) {
+    return std::make_pair(HloInstruction::CreateConstant(
+                              LiteralUtil::One(scatter.shape().element_type())),
+                          root_instruction->opcode());
+  } else if (root_instruction->opcode() == HloOpcode::kMaximum) {
+    return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::MinValue(
+                              scatter.shape().element_type())),
+                          root_instruction->opcode());
+  } else if (root_instruction->opcode() == HloOpcode::kMinimum) {
+    return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::MaxValue(
+                              scatter.shape().element_type())),
+                          root_instruction->opcode());
+  }
+
+  return Status(tensorflow::error::Code::INVALID_ARGUMENT,
+                "Expected scatter reduce computation which is "
+                "add/or/multiply/and/min/max");
+}
+
+std::vector<int64> DevicesForSharding(
+    const HloSharding& sharding, const std::vector<int64>& available_devices) {
+  std::vector<int64> devices;
+  if (sharding.IsReplicated()) {
+    for (int64 d : available_devices) {
+      if (!HloSharding::IsReservedDevice(d)) {
+        devices.push_back(d);
+      }
+    }
+    return devices;
+  }
+
+  for (int64 i : available_devices) {
+    if (sharding.UsesDevice(i)) {
+      devices.push_back(i);
+    }
+  }
+  DCHECK(std::all_of(sharding.tile_assignment().begin(),
+                     sharding.tile_assignment().end(), [&](int64 device) {
+                       return std::find(available_devices.begin(),
+                                        available_devices.end(),
+                                        device) != available_devices.end();
+                     }));
+  return devices;
+}
+
+}  // namespace hlo_sharding_util
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.h b/tensorflow/compiler/xla/service/hlo_sharding_util.h
new file mode 100644
index 00000000000..00d9434a34d
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_sharding_util.h
@@ -0,0 +1,143 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_
+
+#include <map>
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding.h"
+
+namespace xla {
+namespace hlo_sharding_util {
+
+// Given a map<device, occurrence_count>, selects the device with the highest
+// occurrence count (if any). If top_count is not nullptr, it will receive the
+// count of the dominant device returned.
+absl::optional<int64> SelectDominantDevice(
+    const std::map<int64, int64>& device_map, int64* top_count);
+
+// Assigns all the instructions of a computation to a given device.
+// This API does not recurse into called computations, and does not assign
+// instructions which already have sharding.
+Status AssignComputationDevice(HloComputation* computation, int64 device);
+
+// Given an instruction container, returns the device which is most commonly
+// occurring among the instructions.
+absl::optional<int64> GetMostOccurringDevice(
+    absl::Span<HloInstruction* const> instructions);
+
+// Given a set of computations, tries to extract the dominant device. A device
+// is dominant if its combined occurrence among all the instructions of the
+// input computations is greater than or equal to dominant_factor (a real
+// number from 0 to 1).
+// This API does not recurse into called computations.
+// If no device exists that satisfies the condition, the returned optional will
+// hold no value.
+StatusOr<absl::optional<int64>> GetDominantDevice(
+    absl::Span<HloComputation* const> computations, double dominant_factor);
+
+// Returns the HloSharding with the tile dimensions and tile assignment
+// transposed based on the specified dimension numbers. In case of a tile
+// maximal sharding returns the original sharding.
+HloSharding TransposeSharding(const HloSharding& sharding,
+                              const std::vector<int64>& dimensions);
+
+// Returns the HloSharding with the tile shape reshaped based on the source and
+// target shapes and the tile assignment adjusted to correspond to the new tile
+// shape, or absl::nullopt if the resulting reshape would create an invalid
+// sharding (non-contiguous or non-uniformly sized tiles). In case of a tile
+// maximal sharding returns the original sharding.
+absl::optional<HloSharding> ReshapeSharding(const Shape& source_shape,
+                                            const Shape& target_shape,
+                                            const HloSharding& sharding);
+
+// Returns a sharding tiled on unique dimension dim by reshaping the tile
+// assignment of the sharding argument. Only dimensions in the dims span
+// argument are considered for reshaping, the others are ignored.
+// Assumptions: sharding is tile sharded, and dim must be included in dims.
+HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim,
+                                   absl::Span<const int64> dims);
+
+// Returns true if the provided module includes one or more instructions with
+// a tile sharding.
+bool ContainsTileSharding(const HloModule& module);
+
+// Returns the preferred output sharding for a gather op based on the sharding
+// of the indices.
+HloSharding GatherOutputSharding(const HloSharding& index_sharding,
+                                 const HloInstruction* hlo);
+
+// Returns the preferred index sharding for a gather op based on the sharding
+// of the output.
+HloSharding GatherIndexSharding(const HloSharding& output_sharding, + const HloInstruction* hlo); + +// Returns a new HloSharding for a gather op so that only non offset dimensions +// are sharded. Assume "result" is returned by this function. It is ensured that +// "GetIndexSharding(result, hlo)" will have the same number of elements as +// "result". +HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo); + +// Returns the preferred index sharding for a scatter op based on the sharding +// of the data. +HloSharding ScatterIndexSharding(const HloSharding& data_sharding, + const HloInstruction* hlo); + +// Returns the preferred data sharding for a scatter op based on the sharding +// of the index. +HloSharding ScatterDataSharding(const HloSharding& index_sharding, + const HloInstruction* hlo); + +// Returns a new index sharding for a scatter op so that we only shard on first +// "number of scatter_window_dims" dimensions. Assume "result" is returned by +// this function. It is ensured that "ScatterDataSharding(result, hlo)" will +// have the same number of elements as "result". +HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, + const HloInstruction& hlo); + +// Returns a new data sharding for a scatter op so that we only shard on +// scatter_window_dims. Assume "result" is returned by this function. It is +// ensured that "ScatterIndexSharding(result, hlo)" will have the same number of +// elements as "result". +HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, + const HloInstruction& hlo); + +// Returns an identity value and an HloOpcode for reduce computation of scatter +// instruction. +// - If computation is add/or, return 0/false with corresponding op code; +// - If computation is multiply/and, return 1/true with corresponding op code. +// - If computation is min/max, return max value/min value with corresponding op +// code. +// - Otherwise, return error status. +StatusOr, HloOpcode>> +IdentityValueAndHloOpcodeForScatterReduceComputation( + const HloScatterInstruction& scatter); + +// Given a sharding and a list of devices in the topology, return a +// list of the devices that `sharding` applies to. +std::vector DevicesForSharding( + const HloSharding& sharding, const std::vector& available_devices); + +} // namespace hlo_sharding_util +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc new file mode 100644 index 00000000000..02496c75965 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc @@ -0,0 +1,206 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" + +#include "tensorflow/compiler/xla/test.h" + +namespace xla { +namespace hlo_sharding_util { +namespace { + +TEST(HloShardingUtilTest, TransposeShardingReplicated) { + EXPECT_EQ(TransposeSharding(HloSharding::Replicate(), {0, 1, 2}), + HloSharding::Replicate()); +} + +TEST(HloShardingUtilTest, TransposeShardingTiled) { + HloSharding input = HloSharding::Tile(Array4D({{{{0, 1}}, {{2, 3}}}})); + HloSharding output = + HloSharding::Tile(Array4D({{{{0}, {2}}}, {{{1}, {3}}}})); + EXPECT_EQ(TransposeSharding(input, {3, 0, 1, 2}), output); +} + +TEST(HloShardingUtilTest, ReshapeShardingMaximal) { + Shape input_shape = ShapeUtil::MakeShape(F32, {2, 3, 5}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 2}); + HloSharding sharding = HloSharding::AssignDevice(7); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledInvalid) { + Shape input_shape = ShapeUtil::MakeShape(F32, {2, 3, 5}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 2}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_FALSE(result.has_value()); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledMerge) { + Shape input_shape = ShapeUtil::MakeShape(F32, {4, 5, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {20, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + HloSharding output_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledSplit) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 4, 7}); + HloSharding input_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledSplitThenMerge) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 4, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 16, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledArbitraryMinorDimensions) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7, 5, 3}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 15, 2, 14}); + Array sharding_array({2, 1, 1, 1}); + sharding_array(0, 0, 0, 0) = 0; + sharding_array(1, 0, 0, 0) = 1; + HloSharding sharding = HloSharding::Tile(sharding_array); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, 
ReshapeShardingTiledTrivialDimensions) {
+  Shape input_shape = ShapeUtil::MakeShape(F32, {3, 1, 5, 7});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 1, 7});
+  HloSharding input_sharding =
+      HloSharding::Tile(Array4D<int64>({{{{0}, {1}}}}));
+  HloSharding output_sharding =
+      HloSharding::Tile(Array4D<int64>({{{{0}}, {{1}}}}));
+  absl::optional<HloSharding> result =
+      ReshapeSharding(input_shape, output_shape, input_sharding);
+  EXPECT_TRUE(result.has_value());
+  EXPECT_EQ(result.value(), output_sharding);
+}
+
+TEST(HloShardingUtilTest, ReshapeShardingTrivialDimensionInsertedToEnd) {
+  Shape input_shape = ShapeUtil::MakeShape(F32, {8, 16});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {8, 16, 1});
+  HloSharding input_sharding = HloSharding::Tile(Array2D<int64>({{0}, {1}}));
+  HloSharding output_sharding =
+      HloSharding::Tile(Array3D<int64>({{{0}}, {{1}}}));
+  absl::optional<HloSharding> result =
+      ReshapeSharding(input_shape, output_shape, input_sharding);
+  EXPECT_TRUE(result.has_value());
+  EXPECT_EQ(result.value(), output_sharding);
+}
+
+TEST(HloShardingUtilTest, NoopReshapeShardingEmptyTile) {
+  Shape shape = ShapeUtil::MakeShape(F32, {7, 1, 1});
+  HloSharding sharding = HloSharding::Tile(Array3D<int64>({{{0}, {1}}}));
+  absl::optional<HloSharding> result = ReshapeSharding(shape, shape, sharding);
+  EXPECT_TRUE(result.has_value());
+  EXPECT_EQ(result.value(), sharding);
+}
+
+TEST(HloShardingUtilTest, ReshapeShardingScalar) {
+  Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1});
+  Shape output_shape = ShapeUtil::MakeShape(F32, {});
+  HloSharding sharding = HloSharding::Tile(Array3D<int64>({{{0}, {1}}}));
+  absl::optional<HloSharding> result =
+      ReshapeSharding(input_shape, output_shape, sharding);
+  EXPECT_FALSE(result.has_value());
+}
+
+TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim0) {
+  HloSharding sharding = HloSharding::Tile(Array2D<int64>({{0, 1}, {2, 3}}));
+  HloSharding result =
+      ReshapeToTileDimension(sharding, /*dim=*/0, /*dims=*/{0, 1});
+  EXPECT_EQ(result.tile_assignment(), Array2D<int64>({{0}, {1}, {2}, {3}}));
+}
+
+TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim1) {
+  HloSharding sharding = HloSharding::Tile(Array2D<int64>({{0, 1}, {2, 3}}));
+  HloSharding result =
+      ReshapeToTileDimension(sharding, /*dim=*/1, /*dims=*/{0, 1});
+  EXPECT_EQ(result.tile_assignment(), Array2D<int64>({{0, 2, 1, 3}}));
+}
+
+TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim0) {
+  HloSharding sharding =
+      HloSharding::Tile(Array3D<int64>({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}));
+  HloSharding result =
+      ReshapeToTileDimension(sharding, /*dim=*/0, /*dims=*/{0, 1, 2});
+  EXPECT_EQ(
+      result.tile_assignment(),
+      Array3D<int64>({{{0}}, {{1}}, {{2}}, {{3}}, {{4}}, {{5}}, {{6}}, {{7}}}));
+}
+
+TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim1) {
+  HloSharding sharding =
+      HloSharding::Tile(Array3D<int64>({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}));
+  HloSharding result =
+      ReshapeToTileDimension(sharding, /*dim=*/1, /*dims=*/{0, 1, 2});
+  EXPECT_EQ(result.tile_assignment(),
+            Array3D<int64>({{{0}, {1}, {4}, {5}, {2}, {3}, {6}, {7}}}));
+}
+
+TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim2) {
+  HloSharding sharding =
+      HloSharding::Tile(Array3D<int64>({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}}));
+  HloSharding result =
+      ReshapeToTileDimension(sharding, /*dim=*/2, /*dims=*/{0, 1, 2});
+  EXPECT_EQ(result.tile_assignment(),
+            Array3D<int64>({{{0, 2, 4, 6, 1, 3, 5, 7}}}));
+}
+
+TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim2_Batch1) {
+  // Tile sharding in batch dimension, i.e.
+  // sharding={devices=[2,2,2]0,1,2,3,4,5,6,7}.
+ HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + // Reshape on dimensions {1, 2} only, therefore ignoring batch dimension 0. + HloSharding result = ReshapeToTileDimension(sharding, /*dim=*/2, + /*dims=*/{1, 2}); + // Expected result is {devices=[2,1,4]0,2,1,3,4,6,5,7}, i.e. the two + // non-batch dimensions {{0, 1}, {2, 3}} and {{4, 5}, {6, 7}} are individually + // reshaped to tile dimension 2, i.e. {{0, 2, 1, 3}}, {{4, 6, 5, 7}}. + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0, 2, 1, 3}}, {{4, 6, 5, 7}}})); +} + +} // namespace +} // namespace hlo_sharding_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD new file mode 100644 index 00000000000..5be6a04f934 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -0,0 +1,69 @@ +# Description: SPMD partitioning pass. + +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = [ + "//tensorflow/compiler/xla:friends", + ], +) + +cc_library( + name = "spmd_partitioner", + srcs = [ + "spmd_partitioner.cc", + "spmd_partitioner_util.cc", + ], + hdrs = [ + "spmd_partitioner.h", + "spmd_partitioner_util.h", + ], + deps = [ + "//tensorflow/compiler/xla:comparison_util", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:protobuf_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client/lib:comparators", + "//tensorflow/compiler/xla/service:flatten_call_graph", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_cse", + "//tensorflow/compiler/xla/service:hlo_dce", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_query", + "//tensorflow/compiler/xla/service:hlo_sharding_util", + "//tensorflow/compiler/xla/service:shape_inference", + "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/core/platform:numbers", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "spmd_partitioner_test", + srcs = ["spmd_partitioner_test.cc"], + deps = [ + ":spmd_partitioner", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc new file mode 100644 index 00000000000..fd865342ca3 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -0,0 +1,4655 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +#include + +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/comparison_util.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/protobuf_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/numbers.h" + +namespace xla { +namespace spmd { + +string SpmdLogger::MakeReport() { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory during transformation *****\n"); + + std::sort(entries_.begin(), entries_.end(), + [](auto const& entry0, auto const& entry1) { + return entry0.first > entry1.first; + }); + for (int64 i = 0; + i < std::min(report_instruction_count_, entries_.size()); ++i) { + absl::StrAppend( + &report, "\n ", + tensorflow::strings::HumanReadableNumBytes(entries_[i].first), " : ", + entries_[i].second, "\n"); + } + + return report; +} + +void SpmdLogger::RegisterLogEntry(HloInstruction* hlo, + const std::vector& group) { + string report = hlo->ToString(); + int64 max_value = -1; + for (HloInstruction* inst : group) { + if (inst->shape().IsTuple()) { + continue; + } + max_value = + std::max(max_value, ShapeUtil::ByteSizeOf(inst->shape(), 4)); + absl::StrAppend(&report, " * ", inst->ToString(), "\n"); + } + entries_.push_back(std::make_pair(max_value, report)); +} + +/* static */ string SpmdLogger::ReportBeforePartition( + const HloModule& module, int64 report_instruction_count) { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory usage before partition *****\n"); + absl::StrAppend(&report, "\n ** Replicated instructions\n"); + absl::StrAppend(&report, ReportMemoryUsage( + module, + 
[](const HloInstruction* hlo) { + return !hlo->has_sharding() || + hlo->sharding().IsReplicated(); + }, + report_instruction_count)); + absl::StrAppend(&report, "\n ** All instructions\n"); + absl::StrAppend(&report, + ReportMemoryUsage( + module, [](const HloInstruction* hlo) { return true; }, + report_instruction_count)); + return report; +} + +/* static */ string SpmdLogger::ReportAfterPartition( + const HloModule& module, int64 report_instruction_count) { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory usage after partition *****\n"); + absl::StrAppend(&report, + ReportMemoryUsage( + module, [](const HloInstruction* hlo) { return true; }, + report_instruction_count)); + return report; +} + +template +/* static */ string SpmdLogger::ReportMemoryUsage( + const HloModule& module, const F& filter, int64 report_instruction_count) { + string report; + std::vector instructions; + instructions.reserve(module.instruction_count()); + + for (auto computation : module.computations()) { + if (computation->IsFusionComputation()) { + continue; + } + for (auto hlo : computation->instructions()) { + if (hlo->shape().IsTuple() || + ShapeUtil::IsEffectiveScalar(hlo->shape())) { + continue; + } + if (filter(hlo)) { + instructions.push_back(hlo); + } + } + } + + const auto add_report = [&](std::vector* insts) { + std::sort(insts->begin(), insts->end(), + [](const HloInstruction* inst0, const HloInstruction* inst1) { + return ShapeUtil::ByteSizeOf(inst0->shape()) > + ShapeUtil::ByteSizeOf(inst1->shape()); + }); + for (int64 i = 0; + i < std::min(report_instruction_count, insts->size()); ++i) { + absl::StrAppend(&report, " ", + tensorflow::strings::HumanReadableNumBytes( + ShapeUtil::ByteSizeOf((*insts)[i]->shape())), + " : ", (*insts)[i]->ToString(), "\n"); + } + }; + + add_report(&instructions); + return report; +} + +namespace { + +// Returns the replica group configuration where each replica belongs to its own +// group. +std::vector CreateReplicaGroups(int64 num_replicas) { + std::vector groups(num_replicas); + for (int64 i = 0; i < num_replicas; ++i) { + groups[i].add_replica_ids(i); + } + return groups; +} + +bool CanReshardWithAllToAll(const HloSharding& source, + const HloSharding& target) { + return UniqueTiledDim(source) && UniqueTiledDim(target) && + UniqueTiledDim(source) != UniqueTiledDim(target); +} + +bool CanReshardWithCollectivePermute(const HloSharding& source, + const HloSharding& target) { + return UniqueTiledDim(source) && UniqueTiledDim(target) && + UniqueTiledDim(source) == UniqueTiledDim(target) && source != target; +} + +// Clears all sharding attributes from instructions in the module. This must be +// called only after all SPMD transformation is complete. +Status ClearShardingAttributes(HloModule* module) { + for (HloComputation* computation : module->computations()) { + for (HloInstruction* hlo : computation->instructions()) { + // Keep sharding annotation on Infeed and entry parameters since they're + // used by HloReplicationAnalysis later (for ArCrsCombiner). 
+ if (hlo->opcode() == HloOpcode::kInfeed) { + continue; + } + if (hlo->opcode() == HloOpcode::kParameter && + computation == module->entry_computation()) { + continue; + } + hlo->clear_sharding(); + } + } + return Status::OK(); +} + +} // namespace + +HloInstruction* SpmdBuilder::AddInstruction( + std::unique_ptr instruction) { + HloInstruction* hlo = + HloComputation::Builder::AddInstruction(std::move(instruction)); + if (visiting_hlo_) { + instructions_[visiting_hlo_].push_back(hlo); + } + return hlo; +} + +PartitionedHlo PartitionedHlo::Reshard(const HloSharding& target) { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + for (auto& entry : cache) { + if (entry.first == target) { + return entry.second; + } + } + cache.emplace_back(target, ReshardNoCache(target)); + state_.reshard_cache->per_hlo_cache[cache.back().second.hlo()] + .reshard_cache.emplace_back(sharding(), *this); + return cache.back().second; +} + +PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { + VLOG(2) << "Resharding " << hlo_->ToString() << " from " + << hlo_->sharding().ToString() << " to " << target.ToString(); + const Shape& shape = hlo_->shape(); + CHECK(shape.IsTuple() || !target.IsTuple()); + + // Tuple shape instructions may have non-tuple sharding, which means that the + // same sharding applies to all the leaves. + if (shape.IsTuple() && !target.IsTuple()) { + return Reshard(target.GetTupleSharding(shape).ValueOrDie()); + } + + // For a tuple shape, recursively apply Reshard to all the leaves and return + // a tuple instruction. + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + auto subshape = ShapeUtil::GetTupleElementShape(shape, i); + auto element = state_.b->AddInstruction( + HloInstruction::CreateGetTupleElement(subshape, hlo(), i)); + element->set_sharding(sharding().GetSubSharding(shape, {i})); + elements.push_back( + PartitionedHlo( + element, ShapeUtil::GetTupleElementShape(base_shape_, i), state_) + .Reshard(target.GetSubSharding(shape, {i})) + .hlo()); + } + auto tuple = + state_.b->AddInstruction(HloInstruction::CreateTuple(elements)); + tuple->set_sharding(target); + return PartitionedHlo(tuple, base_shape_, state_); + } + + if (sharding() == target) { + return *this; + } + + if (shape.element_type() == TOKEN) { + return *this; + } + + if (CanReshardWithCollectivePermute(sharding(), target)) { + return ReshardWithCollectivePermute(target); + } + + if (CanReshardWithAllToAll(sharding(), target)) { + return ReshardWithAllToAll(target); + } + + // If not replicated yet, first replicate and then reshard to use one of the + // two implementations below. + if (!sharding().IsReplicated()) { + return Replicate().Reshard(target); + } + + // 'Replicated' to 'SingleDevice'. + if (target.IsTileMaximal()) { + auto copy = state_.b->AddInstruction( + HloInstruction::CreateUnary(hlo_->shape(), HloOpcode::kCopy, hlo_)); + copy->set_sharding(target); + return PartitionedHlo(copy, base_shape_, state_); + } + + // 'Replicated' to 'Tiled'. 
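+  // Pad the full shape so it divides evenly into tiles, then dynamic-slice
+  // out this partition's shard using offsets derived from the partition id.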
+ auto padded_hlo = + PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + auto shard_shape = MakePartitionedShape(shape, target); + auto slice = state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, + MakePartitionOffsets(shape, target, state_.partition_id, state_.b), + shard_shape.dimensions())); + slice->set_sharding(target); + return PartitionedHlo(slice, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::PadWithValue(HloInstruction* pad_value) const { + const HloSharding& sharding = hlo_->sharding(); + const Shape& shape = hlo_->shape(); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + if (sharding.IsReplicated() || EvenlyPartitions(base_shape_, sharding)) { + return *this; + } + CHECK(!sharding.IsTileMaximal()); + auto index_shape = ShapeUtil::ChangeElementType(shape, S32); + auto mask_shape = ShapeUtil::ChangeElementType(index_shape, PRED); + auto get_mask_for_dim = [&](int64 dim, HloInstruction* start_index) { + // Comparison: iota + start_index < valid_size + auto iota = + state_.b->AddInstruction(HloInstruction::CreateIota(index_shape, dim)); + auto broadcast_start_index = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, start_index, {})); + auto index_in_full_shape = + state_.b->AddInstruction(HloInstruction::CreateBinary( + index_shape, HloOpcode::kAdd, iota, broadcast_start_index)); + auto valid_size = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(base_shape_.dimensions(dim)))); + auto broadcast_valid_size = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, valid_size, {})); + return state_.b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_full_shape, broadcast_valid_size, + ComparisonDirection::kLt)); + }; + + HloInstruction* mask = nullptr; + auto offsets = MakePartitionOffsets(base_shape_, sharding, + state_.partition_id, state_.b); + for (int64 i = 0; i < shape.rank(); ++i) { + if (base_shape_.dimensions(i) % sharding.tile_assignment().dim(i) == 0) { + continue; + } + if (mask == nullptr) { + mask = get_mask_for_dim(i, offsets[i]); + } else { + mask = state_.b->AddInstruction( + HloInstruction::CreateBinary(mask->shape(), HloOpcode::kAnd, mask, + get_mask_for_dim(i, offsets[i]))); + } + } + + if (mask == nullptr) { + return *this; + } + + auto broadcast_pad_value = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(shape, pad_value, {})); + auto result = state_.b->AddInstruction(HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, mask, hlo_, broadcast_pad_value)); + result->set_sharding(sharding); + return PartitionedHlo(result, base_shape_, state_); +} + +absl::optional +PartitionedHlo::ReshardAsWindowedInput(const Window& window, + const HloSharding& target, + HloInstruction* pad_value, + bool mask_invalid_region) { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].window_reshard_cache; + for (auto& entry : cache) { + if (std::get<0>(entry) == target && + protobuf_util::ProtobufEquals(std::get<1>(entry), window)) { + return std::get<2>(entry); + } + } + auto update_cache = [&](WindowedInputShardReturnValue result) { + cache.emplace_back(target, window, std::move(result)); + return std::get<2>(cache.back()); + }; + VLOG(2) << "ReshardAsWindowedInput()\n" + << "\twindow:" << window_util::ToString(window) + << "\ttarget sharding:" << target.ToString(); + + CHECK(!target.IsTileMaximal()); + auto partition_ordinals = + MakeTiledPartitionOrdinals(target, state_.partition_id, 
state_.b); + auto shard_shape = base_shape_; + + std::vector start_on_padded_calculations( + base_shape_.rank()); + std::vector limit_on_padded_calculations( + base_shape_.rank()); + std::vector dynamic_slice_offset_on_output( + base_shape_.rank(), nullptr); + + Window shard_window = window; + auto padded_shape = base_shape_; + std::vector offsets_on_padded_shape(base_shape_.rank()); + std::vector per_shard_window_counts(base_shape_.rank()); + std::vector explicit_left_padding(base_shape_.rank()); + for (int64 i = 0; i < base_shape_.rank(); ++i) { + // Do not pad non-partitioned dimensions. + int64 shard_count = target.tile_assignment().dim(i); + if (shard_count == 1) { + offsets_on_padded_shape[i] = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + continue; + } + const auto& wd = window.dimensions(i); + if (wd.window_dilation() != 1) { + // TODO(yuanzx): Support window dilation. + VLOG(2) << "Failed to reshard window operand due to window dilation"; + return absl::nullopt; + } + int64 full_size = + base_shape_.dimensions(i) + + (wd.base_dilation() - 1) * (base_shape_.dimensions(i) - 1) + + wd.padding_high() + wd.padding_low(); + if (full_size < wd.size()) { + VLOG(2) << "Failed to reshard window operand because the window size is " + "larger than padded base size"; + return absl::nullopt; + } + int64 window_count = (full_size - wd.size()) / wd.stride() + 1; + per_shard_window_counts[i] = CeilOfRatio(window_count, shard_count); + if (wd.stride() != 1 && + (wd.stride() * per_shard_window_counts[i]) % wd.base_dilation() != 0) { + // TODO(yuanzx): Support this case. + VLOG(2) << "Failed to reshard window operand due to non-trivial dilation"; + return absl::nullopt; + } + + // We use explicit padding for full dilations, then use padding_low and + // padding_high on the sharded op for the remaining. padding_low and + // padding_high are now given initial values, which will be later updated if + // dilation is not 1. + auto swd = shard_window.mutable_dimensions(i); + explicit_left_padding[i] = wd.padding_low() / wd.base_dilation(); + swd->set_padding_low(wd.padding_low() % wd.base_dilation()); + swd->set_padding_high(0); + + // Calculation for the first element needed on the 'padded-but-not-dilated' + // shape. The start on the dilated shape could be a hole, so we add + // wd.base_dilation() - 1 to the constant term to skip the leading holes. + start_on_padded_calculations[i] = MultiplyAddDivideOffsetCalculation( + wd.stride() * per_shard_window_counts[i], + wd.base_dilation() - 1 - swd->padding_low(), wd.base_dilation()); + int64 dilated_shard_size = + wd.stride() * (per_shard_window_counts[i] - 1) + wd.size(); + limit_on_padded_calculations[i] = MultiplyAddDivideOffsetCalculation( + wd.stride() * per_shard_window_counts[i], + dilated_shard_size + wd.base_dilation() - 1 - swd->padding_low(), + wd.base_dilation()); + + offsets_on_padded_shape[i] = start_on_padded_calculations[i].Calculate( + partition_ordinals[i], state_.b); + + auto shard_size_function = + limit_on_padded_calculations[i] - start_on_padded_calculations[i]; + int64 max_shard_size = shard_size_function.MaxInRange(0, shard_count); + shard_shape.set_dimensions(i, max_shard_size); + padded_shape.set_dimensions( + i, limit_on_padded_calculations[i].Calculate(shard_count - 1)); + + // For base dilation, calculate the needed padding_low and padding_high, as + // well as the offset for the output if a dynamic slice is needed after the + // sharded op. 
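+    // With base dilation the padded input contains interior holes, so each
+    // shard's effective padding depends on where its first valid (non-hole)
+    // element falls.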
+ if (wd.base_dilation() != 1) { + // Returns the offset of a shard's first valid element in the dilated + // shard. + auto get_first_valid_element_offset_on_dilated_shard = + [&](int64 shard_ordinal) { + return start_on_padded_calculations[i].Calculate(shard_ordinal) * + wd.base_dilation() + + swd->padding_low() - + wd.stride() * per_shard_window_counts[i] * shard_ordinal; + }; + CHECK_EQ(get_first_valid_element_offset_on_dilated_shard(0), + swd->padding_low()); + + // Determine swd->padding_high. + for (int64 shard_ordinal = 0; shard_ordinal < shard_count; + ++shard_ordinal) { + int64 wanted_limit_on_dilated_shard = + wd.stride() * (per_shard_window_counts[i] - 1) + wd.size(); + int64 actual_limit_on_dilated_shard_without_pad_high = + get_first_valid_element_offset_on_dilated_shard(shard_ordinal) + + (max_shard_size - 1) * wd.base_dilation() + 1; + swd->set_padding_high(std::max( + swd->padding_high(), + wanted_limit_on_dilated_shard - + actual_limit_on_dilated_shard_without_pad_high)); + } + + // Determine swd->padding_low and output dynamic slice index. + if (wd.stride() == 1) { + int64 max_pad_low = get_first_valid_element_offset_on_dilated_shard(0); + bool all_same = true; + for (int64 shard_ordinal = 1; shard_ordinal < shard_count; + ++shard_ordinal) { + int64 start = + get_first_valid_element_offset_on_dilated_shard(shard_ordinal); + if (start != swd->padding_low()) { + all_same = false; + } + max_pad_low = std::max(max_pad_low, start); + } + if (!all_same) { + auto start_on_padded_input = + start_on_padded_calculations[i].Calculate(partition_ordinals[i], + state_.b); + // We will calculate + // max_pad_low - (first_window - required_first_window) + // which equals + // required_first_window - (first_window - max_pad_low) + auto first_window_minus_max_pad_low = + MultiplyAddDivideOffsetCalculation( + wd.base_dilation(), swd->padding_low() - max_pad_low, 1) + .Calculate(start_on_padded_input, state_.b); + auto required_first_window = + MultiplyAddDivideOffsetCalculation(per_shard_window_counts[i], 0, + 1) + .Calculate(partition_ordinals[i], state_.b); + dynamic_slice_offset_on_output[i] = + state_.b->AddInstruction(HloInstruction::CreateBinary( + required_first_window->shape(), HloOpcode::kSubtract, + required_first_window, first_window_minus_max_pad_low)); + } + swd->set_padding_low(max_pad_low); + } else { + CHECK_EQ( + (wd.stride() * per_shard_window_counts[i]) % wd.base_dilation(), 0) + << "General base dilation not yet implemented."; + // padding_low on all shards should equal the initially assigned + // swd->padding_low(), i.e., the padding_low() on the original window. + } + } + } + + // Returns the output dynamic slice offset when needed, and absl::nullopt + // otherwise. + auto get_dynamic_slice_offset_on_output_if_needed = + [&]() -> absl::optional> { + if (absl::c_all_of( + dynamic_slice_offset_on_output, + [](HloInstruction* offset) { return offset == nullptr; })) { + return absl::nullopt; + } + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + for (int64 i = 0; i < dynamic_slice_offset_on_output.size(); ++i) { + if (dynamic_slice_offset_on_output[i] == nullptr) { + dynamic_slice_offset_on_output[i] = zero; + } + } + return dynamic_slice_offset_on_output; + }; + + // If the currrent HLO is replicated, pad then slice. 
+ if (sharding().IsReplicated()) { + PaddingConfig padding_config; + for (int64 i = 0; i < base_shape_.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_interior_padding(0); + // Do not pad non-partitioned dimensions. + if (target.tile_assignment().dim(i) == 1) { + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_edge_padding_high(0); + continue; + } + padding_config_dim->set_edge_padding_low(explicit_left_padding[i]); + padding_config_dim->set_edge_padding_high(padded_shape.dimensions(i) - + explicit_left_padding[i] - + base_shape_.dimensions(i)); + } + auto padded_hlo = ShapeUtil::Compatible(padded_shape, base_shape_) + ? hlo_ + : state_.b->AddInstruction(HloInstruction::CreatePad( + padded_shape, hlo_, pad_value, padding_config)); + auto sharded_input = + state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, offsets_on_padded_shape, + shard_shape.dimensions())); + return update_cache(WindowedInputShardReturnValue{ + sharded_input, shard_window, + get_dynamic_slice_offset_on_output_if_needed()}); + } + + if (target != sharding()) { + return Replicate().ReshardAsWindowedInput(window, target, pad_value); + } + + // Halo exchange. + HloInstruction* visiting_hlo = hlo_; + auto original_shard_shape = MakePartitionedShape(base_shape_, target); + + std::vector left_halo_size_functions(base_shape_.rank()); + std::vector right_halo_size_functions(base_shape_.rank()); + // TODO(yuanzx): We are concatenating on each sharded dimension one at time, + // and in the second dimension (and beyond) we create halos by slicing the + // concat in the previous dimension, which is not optimal. We should generate + // halos only concating slices, instead of slicing concats. + for (int dim = 0; dim < base_shape_.rank(); ++dim) { + int64 shard_count = target.tile_assignment().dim(dim); + if (shard_count == 1) { + continue; + } + int64 input_shard_size = + CeilOfRatio(base_shape_.dimensions(dim), shard_count); + + // Left halo. The size of the halo is derived by subtracting the first read + // element offset of the i'th partition from the limit of the (i-1)'th + // partition. + MultiplyAddDivideOffsetCalculation shard_limit_of_previous_on_padded( + input_shard_size, explicit_left_padding[dim], 1); + left_halo_size_functions[dim] = + shard_limit_of_previous_on_padded - start_on_padded_calculations[dim]; + + // Right halo. 
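+    // The right halo size is this shard's read limit on the padded shape
+    // minus the start offset of the next shard.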
+    MultiplyAddDivideOffsetCalculation shard_start_of_next_on_padded(
+        input_shard_size, input_shard_size + explicit_left_padding[dim], 1);
+    right_halo_size_functions[dim] =
+        limit_on_padded_calculations[dim] - shard_start_of_next_on_padded;
+
+    auto resharded = ExchangeHaloAndGetValidData(
+        visiting_hlo, base_shape_, left_halo_size_functions[dim],
+        right_halo_size_functions[dim], explicit_left_padding[dim],
+        padded_shape.dimensions(dim), shard_shape.dimensions(dim), dim, target,
+        offsets_on_padded_shape[dim], pad_value, partition_ordinals[dim],
+        state_.collective_ops_creator, state_.next_channel_id, state_.b,
+        mask_invalid_region);
+    if (!resharded) {
+      VLOG(1) << "ReshardAsWindowedInput failed without replicate first: halo "
+                 "is beyond the neighbor.";
+      return Replicate().ReshardAsWindowedInput(window, target, pad_value);
+    }
+    visiting_hlo = *resharded;
+  }
+  return update_cache(WindowedInputShardReturnValue{
+      visiting_hlo, shard_window,
+      get_dynamic_slice_offset_on_output_if_needed()});
+}
+
+PartitionedHlo PartitionedHlo::Replicate() {
+  const HloSharding& sharding = hlo_->sharding();
+  const Shape& shape = hlo_->shape();
+  CHECK(!shape.IsTuple() && shape.element_type() != TOKEN);
+
+  if (sharding.IsReplicated()) {
+    return *this;
+  }
+  auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache;
+  for (auto& entry : cache) {
+    if (entry.first.IsReplicated()) {
+      return entry.second;
+    }
+  }
+  auto update_cache = [&](PartitionedHlo resharded) {
+    state_.reshard_cache->per_hlo_cache[resharded.hlo()]
+        .reshard_cache.emplace_back(sharding, *this);
+    cache.emplace_back(HloSharding::Replicate(), std::move(resharded));
+    return cache.back().second;
+  };
+  // 'Single Device' to 'Replicated'.
+  if (sharding.IsTileMaximal()) {
+    return update_cache(Broadcast());
+  }
+
+  // 'Tiled' to 'Replicated'.
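+  // Dynamic-update-slice this shard into a zero tensor of the padded full
+  // shape, all-reduce across partitions, then slice away any padding.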
+ Shape padded_base_shape = shape; + for (int64 i = 0; i < padded_base_shape.rank(); ++i) { + padded_base_shape.set_dimensions( + i, shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(padded_base_shape, zero, {})); + auto dus = state_.b->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + padded_base_shape, zero_bcast, hlo_, + MakePartitionOffsets(padded_base_shape, sharding, state_.partition_id, + state_.b))); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); + + auto all_reduce = + state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, dus, reduction, NewChannel()); + HloInstruction* result = all_reduce; + if (!ShapeUtil::Compatible(base_shape_, padded_base_shape)) { + std::vector start_indices(shape.rank(), 0); + std::vector strides(shape.rank(), 1); + result = state_.b->AddInstruction(HloInstruction::CreateSlice( + base_shape_, result, start_indices, base_shape_.dimensions(), strides)); + } + result->set_sharding(HloSharding::Replicate()); + return update_cache(PartitionedHlo(result, base_shape_, state_)); +} + +PartitionedHlo PartitionedHlo::Broadcast() const { + const Shape& shape = hlo_->shape(); + const HloSharding& sharding = hlo_->sharding(); + CHECK(sharding.HasUniqueDevice()); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + + auto src_core_id = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(sharding.GetUniqueDevice()))); + Shape bcast_shape = ShapeUtil::ChangeElementType(shape, PRED); + auto is_src_core = state_.b->AddInstruction(HloInstruction::CreateBroadcast( + bcast_shape, + state_.b->AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), state_.partition_id, src_core_id, + ComparisonDirection::kEq)), + {})); + + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(shape, zero, {})); + auto operand = state_.b->AddInstruction(HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, is_src_core, hlo(), zero_bcast)); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); + + auto result = state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, operand, reduction, NewChannel()); + result->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(result, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::ReshardWithAllToAll( + const HloSharding& target) const { + int64 partition_count = sharding().tile_assignment().num_elements(); + absl::optional input_partition_dim = UniqueTiledDim(sharding()); + absl::optional output_partition_dim = UniqueTiledDim(target); + CHECK(input_partition_dim.has_value()); + CHECK(output_partition_dim.has_value()); + + // If the device order is different in the target, fix the order with + // ReshardWithCollectivePermute. 
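+  // This way the all-to-all below only needs to exchange data along the
+  // tiled dimension; the device order is already the one the target expects.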
+ auto input_tile_fixed_device_order = target.tile_assignment(); + input_tile_fixed_device_order.Reshape( + sharding().tile_assignment().dimensions()); + auto input_sharding_fixed_device_order = + HloSharding::Tile(input_tile_fixed_device_order); + if (input_sharding_fixed_device_order != sharding()) { + auto fixed_order = + ReshardWithCollectivePermute(input_sharding_fixed_device_order); + return fixed_order.ReshardWithAllToAll(target); + } + + auto padded_hlo = + PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + + // The order of ids in the group must follow the target sharding. + std::vector groups(1); + for (int64 device : target.tile_assignment()) { + groups[0].add_replica_ids(device); + } + + HloInstruction* result = nullptr; + + // Split along the split dimension (output_partition_dim) of the all-to-all + // output. + std::vector dimensions; + for (int64 i = 0; i < base_shape_.rank(); ++i) { + if (i == *output_partition_dim) { + dimensions.push_back(partition_count); + dimensions.push_back(padded_hlo->shape().dimensions(i) / partition_count); + } else { + dimensions.push_back(padded_hlo->shape().dimensions(i)); + } + } + auto reshape = state_.b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(base_shape_.element_type(), dimensions), + padded_hlo)); + // After the reshape, it is guaranteed to have at least 3 dimensions. + auto all_to_all = + state_.collective_ops_creator.create_cross_partition_all_to_all( + state_.b, {reshape}, groups, (*state_.next_channel_id)++, + output_partition_dim); + + // Reorder the split dimension of the reshape to be located in front of the + // input partition dimension, so the two dimensions can be combined. + int64 new_input_partition_dim = (*output_partition_dim < *input_partition_dim) + ? *input_partition_dim + 1 + : *input_partition_dim; + std::vector permutation; + for (int64 i = 0; i < all_to_all->shape().rank(); ++i) { + if (i == *output_partition_dim) { + continue; + } + if (i == new_input_partition_dim) { + permutation.push_back(*output_partition_dim); + } + permutation.push_back(i); + } + auto transpose = state_.b->AddInstruction(HloInstruction::CreateTranspose( + ShapeInference::InferTransposeShape(all_to_all->shape(), permutation) + .ValueOrDie(), + all_to_all, permutation)); + + // Combine the split dimension and the input partition dimension. 
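+ // Rough example of the whole sequence: resharding f32[8,16] from tiled on
+ // dim 0 to tiled on dim 1 with 2 partitions. Each f32[4,16] shard is
+ // reshaped to f32[4,2,8], the all-to-all exchanges halves along the size-2
+ // dimension, the transpose yields f32[2,4,8], and the reshape below merges
+ // the two leading dimensions into the final f32[8,8] shard.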
+ auto new_shape = ShapeInference::InferAllToAllShape( + padded_hlo->shape(), *output_partition_dim, + *input_partition_dim, partition_count) + .ValueOrDie(); + result = state_.b->AddInstruction( + HloInstruction::CreateReshape(new_shape, transpose)); + + const Shape result_shape = MakePartitionedShape(base_shape_, target); + if (result_shape != result->shape()) { + result = state_.b->AddInstruction(HloInstruction::CreateSlice( + result_shape, result, std::vector(result_shape.rank(), 0), + result_shape.dimensions(), std::vector(result_shape.rank(), 1))); + } + result->set_sharding(target); + return PartitionedHlo(result, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::ReshardWithCollectivePermute( + const HloSharding& target) const { + CHECK(CanReshardWithCollectivePermute(sharding(), target)); + std::vector> src_dst_pairs; + sharding().tile_assignment().Each( + [&](absl::Span indices, int64 src_device) { + int64 dst_device = target.tile_assignment()(indices); + if (dst_device != src_device) { + src_dst_pairs.emplace_back(src_device, dst_device); + } + }); + auto cp = + state_.collective_ops_creator.create_cross_partition_collective_permute( + state_.b, hlo(), src_dst_pairs, (*state_.next_channel_id)++); + cp->set_sharding(target); + return PartitionedHlo(cp, base_shape_, state_); +} + +SpmdPartitioningVisitor::SpmdPartitioningVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, SpmdPartitionerOptions options, + SpmdPartitioner* partitioner) + : changed_(false), + module_(computation->parent()), + num_partitions_(num_partitions), + num_replicas_(num_replicas), + collective_ops_creator_(collective_ops_creator), + next_channel_id_(next_channel_id), + b_(SpmdBuilder(computation->name() + "_spmd", /*hlo=*/nullptr)), + partition_id_(collective_ops_creator_.create_partition_id(&b_)), + logger_(logger), + options_(std::move(options)), + partitioner_(partitioner) {} + +Status SpmdPartitioningVisitor::DefaultAction(HloInstruction* hlo) { + if (hlo->HasSideEffect()) { + return Unimplemented("Side-effect ops cannot be replicated: %s", + hlo->ToString()); + } + + if (hlo->IsElementwise() && hlo->operand_count() > 0) { + return HandleElementwise(hlo); + } + + if (!hlo->sharding().IsTileMaximal()) { + VLOG(1) << "Not partitioned in SPMD mode (DefaultAction):" + << hlo->ToString(); + for (int64 i = 0; i < hlo->operand_count(); ++i) { + VLOG(1) << " operand " << i + << " sharding:" << hlo->operand(i)->sharding().ToString(); + } + } + + // If the instruction cannot be partitioned, replicate the instruction unless + // the instruction has side-effect. 
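+ // Concretely: every operand is resharded to Replicate(), the original op is
+ // cloned so it runs identically on the full data on every partition, and
+ // the replicated result is then resharded to whatever sharding this
+ // instruction requested.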
+ std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(HloSharding::Replicate()).hlo()); + } + auto clone = + b_.AddInstruction(hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + clone->set_sharding(HloSharding::Replicate()); + clone->set_metadata(hlo->metadata()); + SetPartitionedHlo(hlo, + PartitionedHlo(clone, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding())); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::Preprocess(HloInstruction* hlo) { + visiting_hlo_ = hlo; + b_.set_visiting_hlo(hlo); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::Postprocess(HloInstruction* hlo) { + logger_->RegisterLogEntry(GetPartitionedHlo(hlo).hlo(), + b_.derived_instructions(hlo)); + visiting_hlo_ = nullptr; + b_.set_visiting_hlo(nullptr); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleElementwise(HloInstruction* hlo) { + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(hlo->sharding()).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConcatenate(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + const Shape shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + const int64 dimension = hlo->concatenate_dimension(); + if (sharding.tile_assignment().dim(dimension) == 1) { + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(sharding).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(shard_shape, new_operands)); + }); + return Status::OK(); + } + + // If the concatenate dimension is along one of the partitioned dimensions, + // allocate the full output shape, each partition updates its owned region, + // all-reduce across partitions, and then slice its output region. + + // We currently don't support subgroup all-reduce along partitions, so more + // than 1 partitioned dimensions is not supported. + if (sharding.tile_assignment().dim(dimension) != num_partitions_) { + return DefaultAction(hlo); + } + + // temp_output_shape is the output shape where the concatenate dimension + // is changed to the full (and padded to shard count) dimension size. + auto temp_output_shape = MakePartitionedShape(hlo->shape(), sharding); + temp_output_shape.set_dimensions( + dimension, temp_output_shape.dimensions(dimension) * + sharding.tile_assignment().dim(dimension)); + auto temp_output = CreateZero(temp_output_shape, &b_); + + // Offset of each operand along the concatenate dimension. 
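+ // Worked example: concatenating two f32[4] operands into a f32[8] output
+ // tiled across 2 partitions. temp_output is f32[8]; partition p writes its
+ // 2-element shard of operand 0 at offset 2*p and its shard of operand 1 at
+ // offset 4 + 2*p, the all-reduce merges the pieces from both partitions,
+ // and each partition finally slices out its own 4-element output shard.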
+ int64 offset = 0; + for (HloInstruction* operand : hlo->operands()) { + auto spmd_operand = GetPartitionedHlo(operand).Reshard(sharding).hlo(); + std::vector start_indices( + hlo->shape().rank(), b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(S32)))); + start_indices[dimension] = + MultiplyAddDivideOffsetCalculation( + spmd_operand->shape().dimensions(dimension), offset, 1) + .Calculate(MakeTiledPartitionOrdinals(sharding, partition_id_, + &b_)[dimension], + &b_); + temp_output = b_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + temp_output_shape, temp_output, spmd_operand, start_indices)); + offset += operand->shape().dimensions(dimension); + } + auto all_reduce = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, temp_output, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + SetPartitionedHlo(hlo, [&] { + auto start_indices = + MakeTiledPartitionOrdinals(hlo->sharding(), partition_id_, &b_); + start_indices[dimension] = MultiplyAddDivideOffsetCalculation( + shard_shape.dimensions(dimension), 0, 1) + .Calculate(start_indices[dimension], &b_); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, all_reduce, start_indices, shard_shape.dimensions())); + }); + + return Status::OK(); +} + +// If partitioning in the operand only happens in dimensions in passthrough +// dimensions (offset dimensions in the gather output (or scatter update) that +// have the same size as the operand), returns the corresponding output (or +// update) sharding by passing through the input sharding. +absl::optional PassthroughOperandToGatherOutputOrScatterUpdate( + const PartitionedHlo& operand, const Shape& update_or_gather_shape, + absl::Span collapsed_or_inserted_dims, + absl::Span index_map, + absl::Span offset_or_window_dims, + absl::Span slice_size) { + if (operand.sharding().IsTileMaximal()) { + return operand.sharding(); + } + std::vector passthrough_tile(update_or_gather_shape.rank(), 1); + int64 collapsed = 0; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + int64 dim_partitions = operand.sharding().tile_assignment().dim(i); + if (absl::c_linear_search(collapsed_or_inserted_dims, i) || + absl::c_linear_search(index_map, i)) { + if (dim_partitions > 1) { + return absl::nullopt; + } + collapsed++; + continue; + } + if (slice_size[i] != operand.base_shape().dimensions(i) && + dim_partitions > 1) { + return absl::nullopt; + } + int64 offset_dim = offset_or_window_dims[i - collapsed]; + if (i - collapsed > 0 && + offset_dim < offset_or_window_dims[i - collapsed - 1]) { + // Output offsets are transposed, we do not support this case. + return absl::nullopt; + } + passthrough_tile[offset_dim] = dim_partitions; + } + Array tile_assignment = operand.sharding().tile_assignment(); + tile_assignment.Reshape(passthrough_tile); + return HloSharding::Tile(tile_assignment); +} + +// Returns whether partitioning in the operand only happens in dimensions with +// gather/scatter slice size 1. 
+bool GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + const PartitionedHlo& operand, absl::Span index_map, + absl::Span slice_size, int64 num_partitions) { + if (operand.sharding().IsTileMaximal()) { + return false; + } + int64 trivial_slice_dims_partitions = 1; + for (int64 dim : index_map) { + if (slice_size[dim] == 1) { + trivial_slice_dims_partitions *= + operand.sharding().tile_assignment().dim(dim); + } + } + return trivial_slice_dims_partitions == num_partitions; +} + +// Returns the min and max for the indices (replicated) in a scatter/gather +// which has the operand partitioned on trivial slice dimensions (slice size 1). +std::pair +IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + const PartitionedHlo& operand, const PartitionedHlo& replicated_indices, + HloInstruction* partition_id, absl::Span index_map, + int64 index_vector_dim, SpmdBuilder* b) { + auto operand_offsets = MakePartitionOffsets( + operand.base_shape(), operand.sharding(), partition_id, b); + // Find the per-dimension index bounds. + std::vector min_indices; + std::vector max_indices; + for (int64 i = 0; i < index_map.size(); ++i) { + int64 dim = index_map[i]; + int64 partitions = operand.sharding().tile_assignment().dim(dim); + if (partitions == 1) { + min_indices.push_back(CreateR0WithType( + replicated_indices.base_shape().element_type(), 0, b)); + max_indices.push_back(CreateR0WithType( + replicated_indices.base_shape().element_type(), + operand.base_shape().dimensions(dim), b)); + continue; + } + auto offset = operand_offsets[dim]; + if (offset->shape().element_type() != + replicated_indices.base_shape().element_type()) { + offset = b->AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(replicated_indices.base_shape().element_type(), + {}), + offset)); + } + min_indices.push_back(offset); + auto partition_size_minus_1 = + CreateR0WithType(replicated_indices.base_shape().element_type(), + operand.hlo()->shape().dimensions(dim) - 1, b); + max_indices.push_back(b->AddInstruction(HloInstruction::CreateBinary( + offset->shape(), HloOpcode::kAdd, offset, partition_size_minus_1))); + } + // Broadcast the index bounds to the same shape as the indices. + HloInstruction* broadcast_min; + HloInstruction* broadcast_max; + if (index_vector_dim < replicated_indices.base_shape().rank()) { + // The index vector is an R1, we need to reshape individual bounds to + // [1], and concat them if there are more than one. 
+ for (int64 i = 0; i < min_indices.size(); ++i) { + min_indices[i] = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(min_indices[i]->shape().element_type(), {1}), + min_indices[i])); + max_indices[i] = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(max_indices[i]->shape().element_type(), {1}), + max_indices[i])); + } + int64 slice_dims = max_indices.size(); + if (slice_dims > 1) { + min_indices[0] = b->AddInstruction(HloInstruction::CreateConcatenate( + ShapeUtil::MakeShape(min_indices[0]->shape().element_type(), + {slice_dims}), + min_indices, 0)); + max_indices[0] = b->AddInstruction(HloInstruction::CreateConcatenate( + min_indices[0]->shape(), max_indices, 0)); + } + broadcast_min = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), min_indices[0], {index_vector_dim})); + broadcast_max = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), max_indices[0], {index_vector_dim})); + } else { + CHECK_EQ(max_indices.size(), 1); + broadcast_min = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), min_indices[0], {})); + broadcast_max = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), max_indices[0], {})); + } + return {broadcast_min, broadcast_max}; +} + +Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { + auto scatter = Cast(hlo); + auto dnums = scatter->scatter_dimension_numbers(); + auto operand = GetPartitionedHlo(scatter->operand(0)); + auto indices = GetPartitionedHlo(scatter->operand(1)); + auto updates = GetPartitionedHlo(scatter->operand(2)); + std::vector slice_size(operand.base_shape().rank(), 1); + int64 num_update_window_dims = 0; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + if (absl::c_linear_search(dnums.inserted_window_dims(), i)) { + continue; + } + slice_size[i] = updates.base_shape().dimensions( + dnums.update_window_dims(num_update_window_dims++)); + } + std::vector inserted_window_dims(dnums.inserted_window_dims().begin(), + dnums.inserted_window_dims().end()); + std::vector scatter_dims_to_operand_dims( + dnums.scatter_dims_to_operand_dims().begin(), + dnums.scatter_dims_to_operand_dims().end()); + std::vector update_window_dims(dnums.update_window_dims().begin(), + dnums.update_window_dims().end()); + if (!operand.sharding().IsTileMaximal()) { + auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( + operand, updates.base_shape(), inserted_window_dims, + scatter_dims_to_operand_dims, update_window_dims, slice_size); + // Handle pass through cases if we can use compatible sharding for update. 
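+ // For instance, a scatter whose operand is tiled along an update-window
+ // dimension that the updates cover in full: the updates can be resharded
+ // with the matching tile assignment, the indices are replicated, and each
+ // partition then applies the scatter locally to its own operand shard.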
+ if (maybe_passthrough.has_value()) { + indices = indices.Reshard(HloSharding::Replicate()); + updates = updates.Reshard(*maybe_passthrough); + auto pscatter = b_.AddInstruction(HloInstruction::CreateScatter( + operand.hlo()->shape(), operand.hlo(), indices.hlo(), updates.hlo(), + scatter->to_apply(), dnums, scatter->indices_are_sorted(), + scatter->unique_indices())); + pscatter->set_sharding(*maybe_passthrough); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pscatter, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + operand, scatter_dims_to_operand_dims, slice_size, + num_partitions_) && + ShapeUtil::ByteSizeOf(updates.base_shape()) < + ShapeUtil::ByteSizeOf(scatter->shape())) { + // Operand is sharded on trivial slice dims (update slice size 1). We can + // adjust the indices on each partition by subtracting the offsets. Then + // we execute a scatter on full updated indices, and out-of-bound accesses + // will have no effect on the result as guaranteed by the scatter + // semantics. + indices = indices.Reshard(HloSharding::Replicate()); + updates = updates.Reshard(HloSharding::Replicate()); + HloInstruction* indices_min; + HloInstruction* indices_max_unused; + std::tie(indices_min, indices_max_unused) = + IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + operand, indices, partition_id_, scatter_dims_to_operand_dims, + dnums.index_vector_dim(), &b_); + auto adjusted_indices = b_.AddInstruction(HloInstruction::CreateBinary( + indices.hlo()->shape(), HloOpcode::kSubtract, indices.hlo(), + indices_min)); + auto pscatter = b_.AddInstruction(HloInstruction::CreateScatter( + operand.hlo()->shape(), operand.hlo(), adjusted_indices, + updates.hlo(), scatter->to_apply(), dnums, + scatter->indices_are_sorted(), scatter->unique_indices())); + pscatter->set_sharding(operand.sharding()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pscatter, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleSlice(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto operand = GetPartitionedHlo(hlo->operand(0)).Reshard(sharding); + + // Create a window config to represent the slice. 
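+ // For example, slicing elements [2, 6) of a f32[8] operand becomes a window
+ // dimension with padding_low = -2 and padding_high = -2, so the windowed
+ // reshard below can fetch exactly the data each output shard needs
+ // (typically via halo exchange when the slice crosses shard boundaries).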
+ Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(hlo->slice_strides(i)); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_padding_low(-hlo->slice_starts(i)); + dim->set_padding_high(hlo->slice_limits(i) - + hlo->operand(0)->shape().dimensions(i)); + dim->set_base_dilation(1); + } + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, sharding, + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + const Shape& operand_shape = reshard_operand->sharded_input->shape(); + + std::vector start_indices = hlo->slice_starts(); + std::vector limit_indices = hlo->slice_limits(); + std::vector strides = hlo->slice_strides(); + bool need_slice = false; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + auto dim = reshard_operand->shard_window.dimensions(i); + start_indices[i] = -dim.padding_low(); + limit_indices[i] = operand_shape.dimensions(i) + dim.padding_high(); + if (start_indices[i] != 0 || strides[i] != 1 || + limit_indices[i] != operand_shape.dimensions(i)) { + need_slice = true; + } + } + + SetPartitionedHlo(hlo, [&] { + if (need_slice) { + auto shard_shape = MakePartitionedShape(hlo->shape(), sharding); + return b_.AddInstruction(HloInstruction::CreateSlice( + shard_shape, reshard_operand->sharded_input, start_indices, + limit_indices, strides)); + } + return reshard_operand->sharded_input; + }); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSort(HloInstruction* hlo) { + HloSharding sharding = hlo->sharding(); + if (hlo->shape().IsTuple()) { + // Check that all elements are sharded in the same way. + if (hlo->shape().tuple_shapes_size() == 0) { + return DefaultAction(hlo); + } + sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + for (int64 i = 1; i < hlo->operand_count(); ++i) { + if (sharding != hlo->sharding().GetSubSharding(hlo->shape(), {i})) { + return DefaultAction(hlo); + } + } + } + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 dim : hlo->dimensions()) { + if (sharding.tile_assignment().dim(dim) > 1) { + return DefaultAction(hlo); + } + } + // Reshard operands to the same as the output. + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back(GetPartitionedHlo(operand).Reshard(sharding).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleCustomCall(HloInstruction* hlo) { + if (hlo->custom_call_target() == "SPMDFullToShardShape") { + // This op switches from auto partitioning to manual partitioning. 
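+ // The output of this custom call is the local shard viewed as a full
+ // (manually partitioned) value, so the current shard, padded to an even
+ // shape if necessary, is simply passed through with a copy; the
+ // SPMDShardToFullShape branch below performs the reverse hand-off.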
+ auto input_partitioned = GetPartitionedHlo(hlo->operand(0)); + if (!EvenlyPartitions(hlo->shape(), input_partitioned.sharding())) { + input_partitioned = input_partitioned.PadWithValue( + CreateR0WithType(hlo->shape().element_type(), 0, &b_)); + } + auto input = input_partitioned.hlo(); + CHECK(hlo->sharding().IsReplicated()); + CHECK(ShapeUtil::Compatible(input->shape(), hlo->shape())); + auto copy = b_.AddInstruction( + HloInstruction::CreateUnary(input->shape(), HloOpcode::kCopy, input)); + SetPartitionedHlo(hlo, [&] { return copy; }); + return Status::OK(); + } + if (hlo->custom_call_target() == "SPMDShardToFullShape") { + // This op switches from manual partitioning to auto partitioning. + auto input = GetPartitionedHlo(hlo->operand(0)).hlo(); + CHECK(input->sharding().IsReplicated()); + auto copy = b_.AddInstruction( + HloInstruction::CreateUnary(input->shape(), HloOpcode::kCopy, input)); + CHECK(ShapeUtil::Compatible( + copy->shape(), MakePartitionedShape(hlo->shape(), hlo->sharding()))); + SetPartitionedHlo(hlo, [&] { return copy; }); + return Status::OK(); + } + if (hlo->custom_call_target() != "TopK") { + return DefaultAction(hlo); + } + + if (!hlo->operand(0)->has_sharding()) { + return DefaultAction(hlo); + } + + const HloSharding& sharding = hlo->operand(0)->sharding(); + if (sharding.IsTileMaximal() || sharding.IsReplicated()) { + return DefaultAction(hlo); + } + + const int64 sort_dim = 1; + const int64 shard_count = sharding.tile_assignment().dim(sort_dim); + + if (shard_count <= 1) { + return DefaultAction(hlo); + } + + const int64 input_size = hlo->operand(0)->shape().dimensions(sort_dim); + const int64 batch_size = hlo->shape().tuple_shapes(0).dimensions(0); + const int64 k = hlo->shape().tuple_shapes(0).dimensions(sort_dim); + const int64 per_partition_size = CeilOfRatio(input_size, shard_count); + + if (k >= per_partition_size) { + return DefaultAction(hlo); + } + + auto input = hlo->operand(0); + const auto element_type = input->shape().element_type(); + + // Pad input with minimal value. + auto min_value = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::MinValue(element_type))); + // TODO(wangtao): add test to see if -NaN < -Inf in BF16. + if (element_type == F32) { + auto float_pad_value = std::numeric_limits::quiet_NaN(); + min_value = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(-float_pad_value))); + } + auto partitioned_input = GetPartitionedHlo(input).PadWithValue(min_value); + + // Each partition needs to do TopK separately, thus the base shape + // becomes [batch_size, k * shard_count]. + const Shape replicated_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(hlo->operand(0)->shape().element_type(), + {batch_size, k * shard_count}), + ShapeUtil::MakeShape(S32, {batch_size, k * shard_count})}); + auto custom_call_sharding = + sharding.GetTupleSharding(replicated_shape).ValueOrDie(); + auto shard_shape = + MakePartitionedShape(replicated_shape, custom_call_sharding); + auto topk = b_.AddInstruction( + hlo->CloneWithNewOperands(shard_shape, {partitioned_input.hlo()})); + topk->set_sharding(custom_call_sharding); + // Partition customcall. + PartitionedHlo partitioned_topk(topk, replicated_shape, + MakePartitioningState()); + topk = partitioned_topk.hlo(); + + // Get value from TopK. + HloInstruction* value_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + topk->shape().tuple_shapes(0), topk, 0)); + value_gte->set_sharding(sharding); + // Partition GetTupleElement of value. 
+ PartitionedHlo value_partitioned_gte(
+ value_gte, partitioned_topk.base_shape().tuple_shapes(0),
+ MakePartitioningState());
+ // Reshard value to be replicated.
+ auto replicated_value_gte =
+ value_partitioned_gte.Reshard(HloSharding::Replicate()).hlo();
+
+ // Get index from TopK.
+ HloInstruction* index_gte =
+ b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+ topk->shape().tuple_shapes(1), topk, 1));
+ auto partition_id_s32 = b_.AddInstruction(HloInstruction::CreateConvert(
+ ShapeUtil::MakeShape(S32, partition_id_->shape().dimensions()),
+ partition_id_));
+ // Add a per-partition offset to the index, since the index returned from the
+ // CustomCall always starts from 0.
+ auto index_offset = b_.AddInstruction(HloInstruction::CreateBroadcast(
+ index_gte->shape(),
+ b_.AddInstruction(HloInstruction::CreateBinary(
+ partition_id_s32->shape(), HloOpcode::kMultiply, partition_id_s32,
+ b_.AddInstruction(HloInstruction::CreateConstant(
+ LiteralUtil::CreateR0<int32>(per_partition_size))))),
+ {}));
+ index_gte = b_.AddInstruction(HloInstruction::CreateBinary(
+ index_offset->shape(), HloOpcode::kAdd, index_gte, index_offset));
+ index_gte->set_sharding(sharding);
+ // Partition GetTupleElement of index.
+ PartitionedHlo index_partitioned_gte(
+ index_gte, partitioned_topk.base_shape().tuple_shapes(1),
+ MakePartitioningState());
+ // Reshard index to be replicated.
+ auto replicated_index_gte =
+ index_partitioned_gte.Reshard(HloSharding::Replicate()).hlo();
+
+ // Create a replicated sort to do the final TopK; the inputs are the value
+ // and index pairs from all the partitions. Sort is used instead of the
+ // CustomCall TopK because the CustomCall only takes values as input, so an
+ // extra Gather would be needed to recover the correct indices.
+
+ // Create comparator for the sort.
+ XlaBuilder b("Sort.Compare");
+ XlaComputation comparator = CreateScalarComparisonComputation(
+ "compare-value-and-index", {input->shape().element_type(), S32}, {Gt, Lt},
+ &b);
+ TF_ASSIGN_OR_RETURN(ProgramShape program_shape, comparator.GetProgramShape());
+ HloModuleConfig config(program_shape);
+ TF_ASSIGN_OR_RETURN(auto new_module,
+ HloModule::CreateFromProto(comparator.proto(), config));
+ HloCloneContext context(module_);
+ auto compare_computation =
+ module_->DeepCloneComputation(new_module->entry_computation(), &context);
+ auto sort = b_.AddInstruction(HloInstruction::CreateSort(
+ replicated_shape, sort_dim, {replicated_value_gte, replicated_index_gte},
+ compare_computation, true));
+ sort->set_sharding(
+ HloSharding::Replicate().GetTupleSharding(sort->shape()).ValueOrDie());
+ PartitionedHlo replicated_sort(sort, replicated_shape,
+ MakePartitioningState());
+
+ // Slice value and index from top-k for output.
+ HloInstruction* sort_value_gte =
+ b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+ replicated_sort.hlo()->shape().tuple_shapes(0), replicated_sort.hlo(),
+ 0));
+ HloInstruction* sort_index_gte =
+ b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+ replicated_sort.hlo()->shape().tuple_shapes(1), replicated_sort.hlo(),
+ 1));
+ const Shape& hlo_shape = sort_value_gte->shape();
+ auto hlo_dims = hlo_shape.dimensions();
+ std::vector<int64> start_indices(hlo_shape.dimensions_size(), 0);
+ std::vector<int64> limit_indices(hlo_dims.begin(), hlo_dims.end());
+ std::vector<int64> strides(hlo_shape.dimensions_size(), sort_dim);
+ limit_indices[sort_dim] = k;
+ auto output_shape = hlo_shape;
+ output_shape.set_dimensions(sort_dim, k);
+ // Slice value from final sort.
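+ // E.g. with 4 partitions and k = 10, the replicated sort above produced 40
+ // candidates per row; the slices below keep only the global top 10 values
+ // and their (offset-adjusted) original indices.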
+ HloInstruction* slice_sort_value = + b_.AddInstruction(HloInstruction::CreateSlice( + output_shape, sort_value_gte, start_indices, limit_indices, strides)); + // Slice index from final sort. + auto index_output_shape = sort_index_gte->shape(); + index_output_shape.set_dimensions(sort_dim, k); + HloInstruction* slice_index_value = b_.AddInstruction( + HloInstruction::CreateSlice(index_output_shape, sort_index_gte, + start_indices, limit_indices, strides)); + auto create_tuple = b_.AddInstruction( + HloInstruction::CreateTuple({slice_sort_value, slice_index_value})); + create_tuple->set_sharding(HloSharding::Replicate()); + + SetPartitionedHlo(hlo, PartitionedHlo(create_tuple, create_tuple->shape(), + MakePartitioningState()) + .Reshard(hlo->sharding())); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleTranspose(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + std::vector inverse_dimensions(hlo->shape().rank()); + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + inverse_dimensions[hlo->dimensions(i)] = i; + } + auto desired_operand_sharding = + hlo_sharding_util::TransposeSharding(sharding, inverse_dimensions); + + auto operand = GetPartitionedHlo(hlo->operand(0)) + .Reshard(desired_operand_sharding) + .hlo(); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), {operand})); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto operand = GetPartitionedHlo(hlo->operand(0)); + // The output shape is the source and the operand shape is the target to get + // the aligned sharding for the operand. + auto desired_operand_sharding = hlo_sharding_util::ReshapeSharding( + hlo->shape(), hlo->operand(0)->shape(), hlo->sharding()); + if (desired_operand_sharding.has_value()) { + auto operand_hlo = operand.Reshard(*desired_operand_sharding).hlo(); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), {operand_hlo})); + }); + return Status::OK(); + } + + // Try use halo exchange for certain split-dim/merge-dims cases. + // ReshapeSharding failed in these cases probably due to uneven partitioning, + // where halo exchange could help. Specifically we check the following + // conditions to detect supported cases: + // 1) Both input and output are partitioned on one dimension. + // 2) The combined size of dimensions before the partitioned dimension are the + // same on input and output. This means we don't need to consider the major + // dimensions. + // 3) Let A = the input size on the partitioned dimension, and + // B = the output size on the partitioned dimension; then + // either A % B == 0 (split dim) or B % A == 0 (merge dims). + auto maybe_input_sharded_dim = UniqueTiledDim(operand.sharding()); + auto maybe_output_sharded_dim = UniqueTiledDim(sharding); + if (!maybe_input_sharded_dim || !maybe_output_sharded_dim) { + return DefaultAction(hlo); + } + int64 input_sharded_dim = *maybe_input_sharded_dim; + int64 output_sharded_dim = *maybe_output_sharded_dim; + // Check that the major dims before the sharded dim have the same total size + // for input and output. 
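+ // For example, reshaping f32[12] to f32[3,4] with both sides tiled along
+ // dimension 0 satisfies the conditions above (a single partitioned
+ // dimension, no major dims in front of it, and 12 % 3 == 0), so it is
+ // handled as a split-dim case; the reverse reshape f32[3,4] -> f32[12] is
+ // the merge-dims case.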
+ int64 input_major_dims_size = 1; + for (int64 i = 0; i < input_sharded_dim; ++i) { + input_major_dims_size *= operand.base_shape().dimensions(i); + } + int64 output_major_dims_size = 1; + for (int64 i = 0; i < output_sharded_dim; ++i) { + output_major_dims_size *= hlo->shape().dimensions(i); + } + if (input_major_dims_size != output_major_dims_size) { + return DefaultAction(hlo); + } + // Fix potential device ordering mismatch in tile assignment. + Array new_input_tile_assignment = sharding.tile_assignment(); + new_input_tile_assignment.Reshape( + operand.sharding().tile_assignment().dimensions()); + operand = operand.Reshard(HloSharding::Tile(new_input_tile_assignment)); + + int64 input_dim_size = operand.base_shape().dimensions(input_sharded_dim); + int64 output_dim_size = hlo->shape().dimensions(output_sharded_dim); + auto input_shard_shape = + MakePartitionedShape(operand.base_shape(), operand.sharding()); + auto output_shard_shape = MakePartitionedShape(hlo->shape(), sharding); + if (input_dim_size % output_dim_size == 0) { + // Split dim. + int64 split_factor = input_dim_size / output_dim_size; + int64 output_shard_size = output_shard_shape.dimensions(output_sharded_dim); + // Use halo exchange to fix misaligned data. + Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_base_dilation(1); + dim->set_padding_low(0); + if (i == input_sharded_dim) { + dim->set_padding_high(output_shard_size * split_factor * + num_partitions_ - + input_dim_size); + } else { + dim->set_padding_high(0); + } + } + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, operand.sharding(), + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + CHECK_EQ( + reshard_operand->sharded_input->shape().dimensions(input_sharded_dim), + output_shard_size * split_factor); + SetPartitionedHlo(hlo, [&] { + // Do a local reshape. + return b_.AddInstruction(HloInstruction::CreateReshape( + output_shard_shape, reshard_operand->sharded_input)); + }); + return Status::OK(); + } else if (output_dim_size % input_dim_size == 0) { + // Merge dims. + int64 merge_factor = output_dim_size / input_dim_size; + // First reshape locally. (The sharded dimension could include padded data.) + auto tmp_shard_shape = output_shard_shape; + tmp_shard_shape.set_dimensions( + output_sharded_dim, + input_shard_shape.dimensions(input_sharded_dim) * merge_factor); + auto tmp_reshape = b_.AddInstruction( + HloInstruction::CreateReshape(tmp_shard_shape, operand.hlo())); + tmp_reshape->set_metadata(hlo->metadata()); + tmp_reshape->set_sharding(hlo->sharding()); + auto tmp_full_shape = tmp_shard_shape; + tmp_full_shape.set_dimensions( + output_sharded_dim, + tmp_shard_shape.dimensions(output_sharded_dim) * num_partitions_); + auto tmp_output = + PartitionedHlo(tmp_reshape, tmp_full_shape, MakePartitioningState()); + + // Use halo exchange to fix misaligned data. 
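+ // Illustration of the misalignment: merging f32[3,2] into f32[6] across 2
+ // partitions tiled on dim 0, each input shard is f32[2,2] (one padded row
+ // on the last partition), so the local reshape gives partition 0 elements
+ // [e0..e3] and partition 1 [e4, e5, pad, pad], while the output shards
+ // should be [e0..e2] and [e3..e5]. The windowed reshard below shifts e3 to
+ // partition 1 and drops the padding.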
+ Window window; + for (int64 i = 0; i < tmp_shard_shape.rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_base_dilation(1); + dim->set_padding_low(0); + if (i == output_sharded_dim) { + dim->set_padding_high(output_dim_size - + tmp_shard_shape.dimensions(output_sharded_dim) * + num_partitions_); + } else { + dim->set_padding_high(0); + } + } + + auto reshard_output = tmp_output.ReshardAsWindowedInput( + window, sharding, + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_output.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_output->dynamic_slice_index_on_output.has_value()); + CHECK_EQ( + reshard_output->sharded_input->shape().dimensions(input_sharded_dim), + output_shard_shape.dimensions(output_sharded_dim)); + SetPartitionedHlo(hlo, [&] { return reshard_output->sharded_input; }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleIota(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + SetPartitionedHlo(hlo, [&] { + int64 dimension = Cast(hlo)->iota_dimension(); + auto iota = b_.AddInstruction(HloInstruction::CreateIota( + MakePartitionedShape(hlo->shape(), sharding), dimension)); + + if (sharding.tile_assignment().dim(dimension) > 1) { + auto partition_ordinals = + MakeTiledPartitionOrdinals(sharding, partition_id_, &b_); + auto multiplier = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(iota->shape().dimensions(dimension)))); + auto offset = b_.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(S32, {}), HloOpcode::kMultiply, + partition_ordinals[dimension], multiplier)); + if (iota->shape().element_type() != S32) { + offset = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(iota->shape().element_type(), {}), offset)); + } + auto broadcast = b_.AddInstruction( + HloInstruction::CreateBroadcast(iota->shape(), offset, {})); + return b_.AddInstruction(HloInstruction::CreateBinary( + iota->shape(), HloOpcode::kAdd, iota, broadcast)); + } + + return iota; + }); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSingleDevice(const HloInstruction* hlo) { + TF_RET_CHECK(hlo->sharding().HasUniqueDevice()); + int64 device = hlo->sharding().GetUniqueDevice(); + const HloSharding sharding = HloSharding::AssignDevice(device); + + std::vector operands; + std::vector operand_shapes; + for (const HloInstruction* operand : hlo->operands()) { + operands.push_back(GetPartitionedHlo(operand).Reshard(sharding).hlo()); + operand_shapes.push_back(operand->shape()); + } + auto operand = b_.AddInstruction(HloInstruction::CreateTuple(operands)); + auto operand_shape = ShapeUtil::MakeTupleShape(operand_shapes); + + auto on_device = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(device))); + auto pred = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), partition_id_, on_device, + ComparisonDirection::kEq)); + + SpmdBuilder true_b("true_computation", visiting_hlo_); + HloComputation* true_computation; + { + auto param = true_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, operand_shape, "true_branch_param")); + std::vector new_operands; + for (int64 i = 0; i < operands.size(); 
++i) { + new_operands.push_back(true_b.AddInstruction( + HloInstruction::CreateGetTupleElement(operand_shapes[i], param, i))); + } + auto root = true_b.AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + true_computation = module_->AddEmbeddedComputation(true_b.Build(root)); + } + + SpmdBuilder false_b("false_computation", visiting_hlo_); + HloComputation* false_computation; + { + false_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, operand_shape, "false_branch_param")); + auto root = CreateZero(hlo->shape(), &false_b); + false_computation = module_->AddEmbeddedComputation(false_b.Build(root)); + } + + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateConditional( + hlo->shape(), pred, operand, true_computation, operand, + false_computation)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleAllReduce(HloInstruction* hlo) { + if (hlo->IsCrossReplicaAllReduce() && hlo->operand_count() == 1) { + return HandleElementwise(hlo); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleBroadcast(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto& operand = GetPartitionedHlo(hlo->operand(0)); + + // Tiled output. + std::vector wanted_input_tile_size(operand.base_shape().rank()); + std::vector sharded_new_dims; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + wanted_input_tile_size[i] = + hlo->sharding().tile_assignment().dim(hlo->dimensions(i)); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_linear_search(hlo->dimensions(), i) && + hlo->sharding().tile_assignment().dim(i) > 1) { + sharded_new_dims.push_back(i); + } + } + if (sharded_new_dims.empty()) { + // The new dimensions are replicated, so that we can do the adjustment on + // the input. + Array wanted_input_tile_assignment(wanted_input_tile_size); + wanted_input_tile_assignment.Each( + [&](absl::Span indices, int64* val) { + std::vector indices_in_broadcast(hlo->shape().rank(), 0); + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + indices_in_broadcast[hlo->dimensions(i)] = indices[i]; + } + *val = hlo->sharding().tile_assignment()(indices_in_broadcast); + }); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + {operand.Reshard(HloSharding::Tile(wanted_input_tile_assignment)) + .hlo()})); + }); + } else { + auto input = operand.Reshard(HloSharding::Replicate()).hlo(); + // We pad and shard the input first, then broadcast to the final shard + // shape. + auto output_offsets = + MakePartitionOffsets(hlo->shape(), hlo->sharding(), partition_id_, &b_); + std::vector input_offsets(operand.base_shape().rank()); + auto output_shard_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto input_shard_shape = input->shape(); + auto padded_input_shape = input->shape(); + for (int64 i = 0; i < input_offsets.size(); ++i) { + input_offsets[i] = output_offsets[hlo->dimensions(i)]; + input_shard_shape.set_dimensions( + i, output_shard_shape.dimensions(hlo->dimensions(i))); + padded_input_shape.set_dimensions( + i, hlo->sharding().tile_assignment().dim(hlo->dimensions(i)) * + input_shard_shape.dimensions(i)); + } + auto padded_input = PadToShape(input, padded_input_shape, &b_); + auto input_shard = + ShapeUtil::Compatible(input_shard_shape, padded_input->shape()) + ? 
padded_input + : b_.AddInstruction(HloInstruction::CreateDynamicSlice( + input_shard_shape, padded_input, input_offsets, + input_shard_shape.dimensions())); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(output_shard_shape, {input_shard})); + }); + } + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConstant(HloInstruction* hlo) { + const Literal& literal = hlo->literal(); + if (literal.shape().IsTuple() || + (!hlo->sharding().IsTileMaximal() && + (!EvenlyPartitions(hlo->shape(), hlo->sharding()) || + !literal.IsAllFirst()))) { + return DefaultAction(hlo); + } + + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + std::vector start_indices(hlo->shape().rank(), 0); + auto constant = b_.AddInstruction(HloInstruction::CreateConstant( + literal.Slice(start_indices, shard_shape.dimensions()))); + *constant->mutable_shape() = shard_shape; + return constant; + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleDynamicSlice(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) != 1 && + (hlo->dynamic_slice_sizes()[i] != hlo->shape().dimensions(i) || + !hlo->operand(i + 1)->IsConstant() || + !hlo->operand(i + 1)->literal().IsZero({}))) { + // We currently do not partition the sliced dimensions. + return DefaultAction(hlo); + } + } + std::vector new_indices(hlo->shape().rank()); + auto new_input = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + for (int64 i = 0; i < new_indices.size(); ++i) { + // Replicate the indices. + new_indices[i] = GetPartitionedHlo(hlo->operand(i + 1)) + .Reshard(HloSharding::Replicate()) + .hlo(); + } + SetPartitionedHlo(hlo, [&]() { + auto partitioned_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + partitioned_shape, new_input, new_indices, + partitioned_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleDynamicUpdateSlice(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) != 1 && + (hlo->operand(1)->shape().dimensions(i) != hlo->shape().dimensions(i) || + !hlo->operand(i + 2)->IsConstant() || + !hlo->operand(i + 2)->literal().IsZero({}))) { + // We currently do not partition the sliced dimensions. + return DefaultAction(hlo); + } + } + std::vector new_indices(hlo->shape().rank()); + auto new_input = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + auto new_update = + GetPartitionedHlo(hlo->operand(1)).Reshard(hlo->sharding()).hlo(); + for (int64 i = 0; i < new_indices.size(); ++i) { + // Replicate the indices. 
+ new_indices[i] = GetPartitionedHlo(hlo->operand(i + 2)) + .Reshard(HloSharding::Replicate()) + .hlo(); + } + SetPartitionedHlo(hlo, [&]() { + auto partitioned_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + partitioned_shape, new_input, new_update, new_indices)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { + auto gather = Cast(hlo); + const auto& dnums = gather->gather_dimension_numbers(); + auto operand = GetPartitionedHlo(gather->operand(0)); + auto indices = GetPartitionedHlo(gather->operand(1)); + std::vector collapsed_slice_dims(dnums.collapsed_slice_dims().begin(), + dnums.collapsed_slice_dims().end()); + std::vector start_index_map(dnums.start_index_map().begin(), + dnums.start_index_map().end()); + std::vector offset_dims(dnums.offset_dims().begin(), + dnums.offset_dims().end()); + if (!operand.sharding().IsTileMaximal()) { + auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( + operand, gather->shape(), collapsed_slice_dims, start_index_map, + offset_dims, gather->gather_slice_sizes()); + if (maybe_passthrough.has_value()) { + indices = indices.Reshard(HloSharding::Replicate()); + auto pshape = MakePartitionedShape(gather->shape(), *maybe_passthrough); + std::vector pslice_sizes(gather->gather_slice_sizes().begin(), + gather->gather_slice_sizes().end()); + for (int64 i = 0; i < pslice_sizes.size(); ++i) { + if (operand.sharding().tile_assignment().dim(i) > 1) { + pslice_sizes[i] = operand.hlo()->shape().dimensions(i); + } + } + auto pgather = b_.AddInstruction(HloInstruction::CreateGather( + pshape, operand.hlo(), indices.hlo(), dnums, pslice_sizes, + gather->indices_are_sorted())); + pgather->set_sharding(*maybe_passthrough); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pgather, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + operand, start_index_map, gather->gather_slice_sizes(), + num_partitions_) && + ShapeUtil::ByteSizeOf(gather->shape()) < + ShapeUtil::ByteSizeOf(gather->operand(0)->shape())) { + indices = indices.Reshard(HloSharding::Replicate()); + // Now the operand is partitioned in trivial slice dimensions, and the + // indices are replicated. We execute a gather on partitioned operand, + // with full number of indices, where out-of-bounds indices are clamped, + // and masked out with 0 in the result; then we use all-reduce to combine + // results. Although gather will not get faster, we avoided the need to + // replicate the operand. + HloInstruction* indices_min; + HloInstruction* indices_max; + std::tie(indices_min, indices_max) = + IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + operand, indices, partition_id_, start_index_map, + dnums.index_vector_dim(), &b_); + // Clamp the indices. + auto adjusted_indices = b_.AddInstruction(HloInstruction::CreateTernary( + indices.base_shape(), HloOpcode::kClamp, indices_min, indices.hlo(), + indices_max)); + // Adjust the indices by subtracting the offset. + adjusted_indices = b_.AddInstruction(HloInstruction::CreateBinary( + indices.base_shape(), HloOpcode::kSubtract, adjusted_indices, + indices_min)); + // Gather on adjusted indices. 
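+ // For instance, gathering rows from a f32[1000, d] table tiled into 4
+ // partitions of 250 rows: each partition clamps the replicated row ids to
+ // its own [offset, offset + 249] range, gathers locally on its shard, masks
+ // out rows whose original id was outside that range, and the final
+ // all-reduce combines the per-partition results.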
+ auto pgather = b_.AddInstruction(HloInstruction::CreateGather( + gather->shape(), operand.hlo(), adjusted_indices, dnums, + gather->gather_slice_sizes(), gather->indices_are_sorted())); + // Mask out invalid results. + auto filter = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(indices.base_shape(), PRED), + indices.hlo(), indices_min, ComparisonDirection::kLt)); + filter = b_.AddInstruction(HloInstruction::CreateBinary( + filter->shape(), HloOpcode::kOr, filter, + b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(indices.base_shape(), PRED), + indices.hlo(), indices_max, ComparisonDirection::kGt)))); + if (dnums.index_vector_dim() < indices.base_shape().rank()) { + std::vector reduced_filter_dims; + for (int64 i = 0; i < filter->shape().rank(); ++i) { + if (i != dnums.index_vector_dim()) { + reduced_filter_dims.push_back(filter->shape().dimensions(i)); + } + } + filter = b_.AddInstruction(HloInstruction::CreateReduce( + ShapeUtil::MakeShape(PRED, reduced_filter_dims), filter, + CreateR0WithType(PRED, false, &b_), {dnums.index_vector_dim()}, + MakeBinaryAdd(PRED, module_))); + } + std::vector batch_dims; + for (int64 i = 0; i < pgather->shape().rank(); ++i) { + if (!absl::c_linear_search(dnums.offset_dims(), i)) { + batch_dims.push_back(i); + } + } + auto broadcast_filter = b_.AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::ChangeElementType(pgather->shape(), PRED), filter, + batch_dims)); + auto filtered = b_.AddInstruction(HloInstruction::CreateTernary( + pgather->shape(), HloOpcode::kSelect, broadcast_filter, + CreateZero(pgather->shape(), &b_), pgather)); + // Combine from different partitions. + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, filtered, + MakeBinaryAdd(filtered->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleGetTupleElement(HloInstruction* hlo) { + const auto& tuple = GetPartitionedHlo(hlo->operand(0)); + auto gte = b_.AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(tuple.hlo()->shape(), hlo->tuple_index()), + tuple.hlo(), hlo->tuple_index())); + SetPartitionedHlo(hlo, [&]() { + const auto source_sharding = tuple.sharding().GetSubSharding( + tuple.base_shape(), {hlo->tuple_index()}); + gte->set_sharding(source_sharding); + PartitionedHlo source_partitioned_gte(gte, hlo->shape(), + MakePartitioningState()); + return source_partitioned_gte.Reshard(hlo->sharding()).hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleInfeed(HloInstruction* hlo) { + const Shape& shape = ShapeUtil::GetTupleElementShape(hlo->shape(), 0); + auto token = GetPartitionedHlo(hlo->operand(0)).hlo(); + if (ShapeUtil::GetLeafCount(shape) == 0) { + // TODO(b/155819021): HloSharding has issues with tuple-shaped sharding: it + // requires one element for an empty tuple, but leaf-count number of + // elements for non-empty tuple. So if it has a nested empty tuple, we + // cannot invoke GetSubSharding() since it expects a sharding for the empty + // tuple. This is a workaround for that case. 
+ SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction( + HloInstruction::CreateInfeed(shape, token, hlo->infeed_config())); + }); + return Status::OK(); + } + auto sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + auto shard_shape = MakePartitionedShape(shape, sharding); + if (EvenlyPartitions(shape, sharding)) { + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateInfeed( + shard_shape, token, hlo->infeed_config())); + }); + return Status::OK(); + } + + if (hlo->sharding().HasUniqueDevice()) { + return HandleSingleDevice(hlo); + } + + // Create a branch for each unique partitioned shape. + std::vector per_branch_partitioned_shapes; + std::vector conditional_branch_indices(num_partitions_); + for (int64 i = 0; i < num_partitions_; ++i) { + auto partitioned_shape = + MakeNonPaddedShapeForGivenPartition(shape, sharding, i); + int64 matching_existing_index = 0; + for (; matching_existing_index < per_branch_partitioned_shapes.size(); + ++matching_existing_index) { + if (ShapeUtil::Compatible( + partitioned_shape, + per_branch_partitioned_shapes[matching_existing_index])) { + break; + } + } + if (matching_existing_index < per_branch_partitioned_shapes.size()) { + conditional_branch_indices[i] = matching_existing_index; + } else { + conditional_branch_indices[i] = per_branch_partitioned_shapes.size(); + per_branch_partitioned_shapes.push_back(std::move(partitioned_shape)); + } + } + + HloInstruction* branch_index; + if (per_branch_partitioned_shapes.size() == num_partitions_) { + // Use partition ID as the branch index if each partition has its own + // branch. + branch_index = partition_id_; + // PartitionId's output is U32 but conditional requires S32. + if (branch_index->shape().element_type() != S32) { + branch_index = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::ChangeElementType(branch_index->shape(), S32), + branch_index)); + } + } else { + // Otherwise, use a constant table to look up the branch index. + auto branch_index_table = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(conditional_branch_indices))); + branch_index = b_.AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1}), branch_index_table, {partition_id_}, + {1})); + branch_index = b_.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(S32, {}), branch_index)); + } + + std::vector branches(per_branch_partitioned_shapes.size()); + for (int64 i = 0; i < branches.size(); ++i) { + SpmdBuilder branch_b(absl::StrCat("infeed_branch_", i), visiting_hlo_); + auto param = branch_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, token->shape(), "infeed_token_param")); + auto infeed = branch_b.AddInstruction(HloInstruction::CreateInfeed( + per_branch_partitioned_shapes[i], param, hlo->infeed_config())); + branches[i] = module_->AddEmbeddedComputation(branch_b.Build(infeed)); + if (!ShapeUtil::Compatible(per_branch_partitioned_shapes[i], shard_shape)) { + TF_ASSIGN_OR_RETURN( + auto padded, + branches[i]->DeepCopyInstructionWithCustomCopier( + infeed, [&](HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* comp) { + // Index {1} corresponds to the token. 
+ if (leaf_index.empty() || leaf_index[0] != 0) { + return leaf; + } + ShapeIndexView subindex(leaf_index, 1); + if (ShapeUtil::Compatible( + ShapeUtil::GetSubshape(per_branch_partitioned_shapes[i], + subindex), + ShapeUtil::GetSubshape(shard_shape, subindex))) { + return leaf; + } + return PadToShape(leaf, + ShapeUtil::GetSubshape(shard_shape, subindex), + nullptr, comp); + })); + branches[i]->set_root_instruction(padded, + /*accept_different_shape=*/true); + } + } + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateConditional( + ShapeUtil::MakeTupleShape({shard_shape, token->shape()}), branch_index, + branches, std::vector(branches.size(), token))); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandlePad(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + const auto& pd = hlo->padding_config().dimensions(i); + // Right now we only support non-padded dimensions to be partitioned. + if (hlo->sharding().tile_assignment().dim(i) > 1 && + (pd.edge_padding_high() != 0 || pd.edge_padding_low() != 0 || + pd.interior_padding() != 0)) { + return DefaultAction(hlo); + } + } + auto resharded_lhs = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + auto replicated_rhs = GetPartitionedHlo(hlo->operand(1)) + .Reshard(HloSharding::Replicate()) + .hlo(); + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(hlo->CloneWithNewOperands( + shard_shape, {resharded_lhs, replicated_rhs})); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleParameter(HloInstruction* hlo) { + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto new_param = b_.AddInstruction(HloInstruction::CreateParameter( + hlo->parameter_number(), shard_shape, "param")); + if (hlo->parameter_replicated_at_leaf_buffers()) { + new_param->set_parameter_replicated_at_leaf_buffers( + *hlo->parameter_replicated_at_leaf_buffers()); + } + return new_param; + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { + int64 input_count = 1; + auto per_input_sharding = hlo->sharding(); + if (hlo->shape().IsTuple()) { + input_count = hlo->shape().tuple_shapes_size(); + CHECK_GT(input_count, 0); + per_input_sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + } + + std::vector inputs; + std::vector inits; + for (int64 operand_id = 0; operand_id < input_count; ++operand_id) { + inits.push_back(GetPartitionedHlo(hlo->operand(operand_id + input_count)) + .Reshard(HloSharding::Replicate()) + .hlo()); + inputs.push_back(GetPartitionedHlo(hlo->operand(operand_id))); + if (operand_id > 0) { + // Make sure all operands are sharded in the same way. + inputs.back() = inputs.back().Reshard(inputs[0].sharding()); + } + if (!inputs[0].sharding().IsTileMaximal()) { + inputs.back() = inputs.back().PadWithValue(inits[operand_id]); + } + } + bool reduce_sharded_dimension = false; + if (!inputs[0].sharding().IsTileMaximal()) { + reduce_sharded_dimension = absl::c_any_of(hlo->dimensions(), [&](int64 i) { + return inputs[0].sharding().tile_assignment().dim(i) > 1; + }); + + // reduce_sharded_dimension is not supported for tuple-shaped reduces. 
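+ // When it is supported (a single input), each partition first reduces its
+ // local shard (the inputs were padded with the init value above so that the
+ // padded elements do not change the result), and a cross-partition
+ // all-reduce using the original reduction computation then combines the
+ // partial results.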
+ if (reduce_sharded_dimension && input_count > 1) { + return DefaultAction(hlo); + } + + // Currently we only support reducing all or none of the sharded + // dimensions. + if (reduce_sharded_dimension) { + for (int64 i = 0; i < inputs[0].base_shape().rank(); ++i) { + if (inputs[0].sharding().tile_assignment().dim(i) > 1 && + absl::c_count(hlo->dimensions(), i) == 0) { + return DefaultAction(hlo); + } + } + } + } + + std::vector new_operand_shapes(input_count * 2); + for (int64 i = 0; i < input_count; ++i) { + new_operand_shapes[i] = inputs[i].hlo()->mutable_shape(); + new_operand_shapes[i + input_count] = inits[i]->mutable_shape(); + } + // Create the shard shape of the reduce result. + TF_ASSIGN_OR_RETURN( + auto reduce_shape, + ShapeInference::InferReduceShape(new_operand_shapes, hlo->dimensions(), + hlo->to_apply()->ComputeProgramShape())); + *reduce_shape.mutable_layout() = hlo->shape().layout(); + + std::vector input_hlos(input_count); + for (int64 i = 0; i < input_count; ++i) { + input_hlos[i] = inputs[i].hlo(); + } + auto local_reduce = b_.AddInstruction(HloInstruction::CreateReduce( + reduce_shape, input_hlos, inits, hlo->dimensions(), hlo->to_apply())); + local_reduce->set_metadata(hlo->metadata()); + + SetPartitionedHlo(hlo, [&]() { + HloInstruction* reduce; + if (reduce_sharded_dimension) { + CHECK(local_reduce->shape().IsArray()); + reduce = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, local_reduce, hlo->to_apply(), NewChannel()); + reduce->set_sharding(HloSharding::Replicate()); + } else { + reduce = local_reduce; + if (inputs[0].sharding().IsTileMaximal()) { + reduce->set_sharding(inputs[0].sharding()); + } else { + // Remove tile assignment dimensions that are reduced. + std::vector tile_dimensions; + for (int64 i = 0; i < input_hlos[0]->shape().rank(); ++i) { + if (absl::c_count(hlo->dimensions(), i) == 0) { + tile_dimensions.push_back( + inputs[0].sharding().tile_assignment().dim(i)); + } + } + Array new_tile = inputs[0].sharding().tile_assignment(); + new_tile.Reshape(tile_dimensions); + auto sharding = HloSharding::Tile(new_tile); + if (input_count > 1) { + std::vector tuple(input_count, sharding); + sharding = HloSharding::Tuple(hlo->shape(), tuple); + } + reduce->set_sharding(sharding); + } + } + + return PartitionedHlo(reduce, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReverse(HloInstruction* hlo) { + auto reverse = Cast(hlo); + if (reverse->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + if (absl::c_all_of(reverse->dimensions(), [&](int64 d) { + return reverse->sharding().tile_assignment().dim(d) == 1; + })) { + auto operand = + GetPartitionedHlo(reverse->operand(0)).Reshard(reverse->sharding()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(operand.hlo()->shape(), {operand.hlo()})); + }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleWhile(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + + // Shardings for the body parameter, body root, and cond parameter must be + // the same, and the condition root must be replicated so that all partitions + // follow the same control flow. 
+ hlo->while_condition()->parameter_instruction(0)->set_sharding(sharding); + hlo->while_body()->parameter_instruction(0)->set_sharding(sharding); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(hlo->while_condition(), + HloSharding::Replicate(), + next_channel_id_, logger_) + .status()); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(hlo->while_body(), sharding, + next_channel_id_, logger_) + .status()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateWhile( + MakePartitionedShape(hlo->shape(), sharding), hlo->while_condition(), + hlo->while_body(), + GetPartitionedHlo(hlo->operand(0)).Reshard(sharding).hlo())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConditional(HloInstruction* hlo) { + std::vector branch_args; + for (int64 i = 0; i < hlo->branch_count(); ++i) { + HloComputation* computation = hlo->branch_computation(i); + + // Shardings of the branch computation parameter and its argument must be + // the same. + computation->parameter_instruction(0)->set_sharding( + hlo->operand(i + 1)->sharding()); + branch_args.push_back(GetPartitionedHlo(hlo->operand(i + 1)).hlo()); + } + + // The root of the branch computations must follow the sharding of the + // conditional instruction. + for (int64 i = 0; i < hlo->branch_count(); ++i) { + HloComputation* computation = hlo->branch_computation(i); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(computation, hlo->sharding(), + next_channel_id_, logger_) + .status()); + } + + // We replicate the predicate of the conditional (the first operand) so that + // all partitions follow the same control flow. + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateConditional( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + GetPartitionedHlo(hlo->operand(0)) + .Reshard(HloSharding::Replicate()) + .hlo(), + hlo->called_computations(), branch_args)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleOutfeed(HloInstruction* hlo) { + TF_RET_CHECK(hlo->sharding().HasUniqueDevice()); + return HandleSingleDevice(hlo); +} + +Status SpmdPartitioningVisitor::HandleRng(HloInstruction* hlo) { + if (hlo->sharding().HasUniqueDevice()) { + return HandleSingleDevice(hlo); + } + + if (hlo->sharding().IsReplicated()) { + SetPartitionedHlo(hlo, [&] { + // Run on a single device (0) and distribute the data to all other cores. + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::AssignDevice(0)) + .hlo()); + } + auto clone = b_.AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + clone->set_sharding(HloSharding::AssignDevice(0)); + return PartitionedHlo(clone, hlo->shape(), MakePartitioningState()) + .Reshard(HloSharding::Replicate()) + .hlo(); + }); + return Status::OK(); + } + + TF_RET_CHECK(!hlo->sharding().IsTileMaximal()); + SetPartitionedHlo(hlo, [&] { + // Replicate the operands and run partitioned Rng on all devices. 
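For a replicated Rng, the handler above runs the op on a single device and redistributes the result, because letting every partition run its own generator would give each replica different "replicated" values. A small standalone illustration of that pitfall, in plain C++ with hypothetical seeds (std::mt19937 only stands in for the real RNG, which is unrelated):

#include <cassert>
#include <cstdint>
#include <random>

int main() {
  // Two partitions that seed and run their own generator disagree, so the
  // result would not actually be replicated across partitions.
  std::mt19937 partition0(/*seed=*/0), partition1(/*seed=*/1);
  assert(partition0() != partition1());
  // Generating on partition 0 and broadcasting that buffer gives every
  // partition the same values.
  assert(std::mt19937(/*seed=*/0)() == std::mt19937(/*seed=*/0)());
  return 0;
}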
+ std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::Replicate()) + .hlo()); + } + return b_.AddInstruction(HloInstruction::CreateRng( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + hlo->random_distribution(), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReduceWindow(HloInstruction* hlo) { + auto& operand = GetPartitionedHlo(hlo->operand(0)); + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + + // Replicate init + auto replicated_init = GetPartitionedHlo(hlo->mutable_operand(1)) + .Reshard(HloSharding::Replicate()); + auto resharded_operand_and_window = operand.ReshardAsWindowedInput( + hlo->window(), hlo->sharding(), replicated_init.hlo()); + if (!resharded_operand_and_window.has_value()) { + return DefaultAction(hlo); + } + + TF_ASSIGN_OR_RETURN(Shape sharded_rw_shape, + ShapeInference::InferReduceWindowShape( + resharded_operand_and_window->sharded_input->shape(), + replicated_init.hlo()->shape(), + resharded_operand_and_window->shard_window, + hlo->to_apply()->ComputeProgramShape())); + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + *sharded_rw_shape.mutable_layout() = shard_shape.layout(); + SetPartitionedHlo(hlo, [&]() { + auto sharded_rw = b_.AddInstruction(HloInstruction::CreateReduceWindow( + sharded_rw_shape, resharded_operand_and_window->sharded_input, + replicated_init.hlo(), resharded_operand_and_window->shard_window, + hlo->to_apply())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_rw->shape())); + return sharded_rw; + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_rw, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSelectAndScatter(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + auto operand = GetPartitionedHlo(hlo->operand(0)); + auto source = GetPartitionedHlo(hlo->mutable_operand(1)); + if (hlo->sharding() != operand.sharding()) { + operand = operand.Reshard(hlo->sharding()); + } + if (hlo->sharding() != source.sharding()) { + source = source.Reshard(hlo->sharding()); + } + + // For F32 and BF16 types, we can use NaN padding to workaround the issue with + // low/high padding, since comparison will return false with NaN input. 
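The select-and-scatter handler that starts above pads shards with NaN or +/-infinity so padded elements can never win the selection, relying on the IEEE rule that ordered comparisons involving NaN are false. A quick standalone check of that property (plain C++, independent of the partitioner):

#include <cassert>
#include <limits>

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  const float neg_inf = -std::numeric_limits<float>::infinity();
  // Ordered comparisons with NaN are false, so a >=-based select never
  // chooses a NaN-padded element over real data.
  assert(!(3.0f >= nan));
  assert(!(nan >= 3.0f));
  // Likewise, -infinity always loses a >= select against real data.
  assert(3.0f >= neg_inf);
  return 0;
}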
+ if (hlo->shape().element_type() != F32 && + hlo->shape().element_type() != BF16) { + return DefaultAction(hlo); + } + + auto select = hlo->called_computations()[0]; + auto select_root = select->root_instruction(); + if (select_root->opcode() != HloOpcode::kCompare || + select_root->operand(0)->opcode() != HloOpcode::kParameter || + select_root->operand(1)->opcode() != HloOpcode::kParameter || + select_root->operand(0)->parameter_number() + + select_root->operand(1)->parameter_number() != + 1) { + return DefaultAction(hlo); + } + + float float_pad_value; + if (select_root->comparison_direction() == ComparisonDirection::kGe || + select_root->comparison_direction() == ComparisonDirection::kGt) { + if (select_root->operand(0)->parameter_number() == 0) { + float_pad_value = -std::numeric_limits::infinity(); + } else { + float_pad_value = std::numeric_limits::infinity(); + } + } else if (select_root->comparison_direction() == ComparisonDirection::kLe || + select_root->comparison_direction() == ComparisonDirection::kLt) { + if (select_root->operand(0)->parameter_number() == 0) { + float_pad_value = std::numeric_limits::infinity(); + } else { + float_pad_value = -std::numeric_limits::infinity(); + } + } else { + return DefaultAction(hlo); + } + + auto pad_value = b_.AddInstruction(HloInstruction::CreateConstant( + hlo->shape().element_type() == BF16 + ? LiteralUtil::CreateR0( + static_cast(float_pad_value)) + : LiteralUtil::CreateR0(float_pad_value))); + + // Replicate init + auto replicated_init = GetPartitionedHlo(hlo->mutable_operand(2)) + .Reshard(HloSharding::Replicate()); + + auto partition_ordinals = + MakeTiledPartitionOrdinals(hlo->sharding(), partition_id_, &b_); + + // The first window for each dimension that overlaps with the shard area. + std::vector first_window( + hlo->shape().rank()); + // The first window for each dimension that goes beyond with the shard area. + std::vector limit_window( + hlo->shape().rank()); + std::vector data_left_halo_sizes(hlo->shape().rank()); + std::vector data_right_halo_sizes(hlo->shape().rank()); + std::vector source_left_halo_sizes(hlo->shape().rank()); + std::vector source_right_halo_sizes(hlo->shape().rank()); + auto unpadded_data_shard_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto unpadded_source_shard_shape = + MakePartitionedShape(hlo->operand(1)->shape(), hlo->sharding()); + auto source_shard_hlo = source.hlo(); + auto data_shard_hlo = operand.hlo(); + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + int64 shard_count = hlo->sharding().tile_assignment().dim(i); + if (shard_count == 1) { + continue; + } + // If stride > window_size, there will be gaps between windows. These gaps + // will also exist in the output, so we keep them during halo exchange. + // + // TODO(yuanzx): This could introduce overhead if partitions start at + // different offsets in a gap. 
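The per-dimension loop below expresses shard offsets and halo sizes as MultiplyAddDivideOffsetCalculation / OffsetCalculation objects. Judging by the call sites, such an object evaluates an expression of the form (multiplier * shard_ordinal + offset) / divisor; the following reduced stand-in (a hypothetical helper in plain C++, assuming exactly those semantics) may help when tracing the formulas:

#include <cassert>
#include <cstdint>

// Stand-in for the offset calculations used below: Calculate(x) returns
// (multiplier * x + offset) / divisor with integer division. The real helper
// additionally supports composition (operator-) and MaxInRange over a range
// of shard ordinals.
struct AffineDivCalculation {
  int64_t multiplier, offset, divisor;
  int64_t Calculate(int64_t shard_ordinal) const {
    return (multiplier * shard_ordinal + offset) / divisor;
  }
};

int main() {
  // A "first window of shard i" style expression: (4 * i + 1) / 2.
  AffineDivCalculation calc{/*multiplier=*/4, /*offset=*/1, /*divisor=*/2};
  assert(calc.Calculate(0) == 0 && calc.Calculate(1) == 2);
  return 0;
}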
+ auto wd = hlo->window().dimensions(i); + if (wd.stride() > wd.size()) { + wd.set_size(wd.stride()); + } + // shard_size * i < stride * k - pad_low + window_size => + // k > (shard_size * i + pad_low - window_size) / stride => + // first_k == (shard_size * i + pad_low - window_size + stride) / stride + first_window[i] = MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + wd.padding_low() - wd.size() + wd.stride(), wd.stride()); + // shard_size * (i + 1) <= stride * k - pad_low => + // k >= (shard_size * i + shard_size + pad_low) / stride => + // limit_k == (shard_size * i + shard_size + pad_low + stride - 1) / + // stride + limit_window[i] = MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + unpadded_data_shard_shape.dimensions(i) + wd.padding_low() + + wd.stride() - 1, + wd.stride()); + source_left_halo_sizes[i] = + MultiplyAddDivideOffsetCalculation( + unpadded_source_shard_shape.dimensions(i), 0, 1) - + first_window[i]; + source_right_halo_sizes[i] = + limit_window[i] - MultiplyAddDivideOffsetCalculation( + unpadded_source_shard_shape.dimensions(i), + unpadded_source_shard_shape.dimensions(i), 1); + data_left_halo_sizes[i] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), wd.padding_low(), 1)) - + OffsetCalculation( + HloOpcode::kMultiply, first_window[i], + MultiplyAddDivideOffsetCalculation(0, wd.stride(), 1)); + data_right_halo_sizes[i] = + OffsetCalculation( + HloOpcode::kMultiply, limit_window[i], + MultiplyAddDivideOffsetCalculation(0, wd.stride(), 1)) - + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + unpadded_data_shard_shape.dimensions(i) + wd.stride() + + wd.padding_low() - wd.size(), + 1)); + + int64 max_windows = + (limit_window[i] - first_window[i]).MaxInRange(0, shard_count); + auto first_window_hlo = + first_window[i].Calculate(partition_ordinals[i], &b_); + // Padding on the source is filled with the init value so they do not change + // the data on overlapping windows. 
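To make the first_window/limit_window inequalities above concrete, here is a worked example with hypothetical numbers: per-shard size 4, low padding 1, window size 3, stride 2. Window 2 straddles the boundary between shard 0 and shard 1, which is exactly the data the halo exchange below has to provide:

#include <cassert>

int main() {
  const int shard_size = 4, pad_low = 1, window = 3, stride = 2;
  // First window overlapping shard i, per the derivation above.
  auto first_k = [&](int i) {
    return (shard_size * i + pad_low - window + stride) / stride;
  };
  // First window beyond shard i, per the derivation above.
  auto limit_k = [&](int i) {
    return (shard_size * i + shard_size + pad_low + stride - 1) / stride;
  };
  // Shard 0 owns windows [0, 3); shard 1 owns windows [2, 5): window 2 is
  // shared between the two shards.
  assert(first_k(0) == 0 && limit_k(0) == 3);
  assert(first_k(1) == 2 && limit_k(1) == 5);
  return 0;
}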
+ auto resharded_source = ExchangeHaloAndGetValidData( + source_shard_hlo, source.base_shape(), source_left_halo_sizes[i], + source_right_halo_sizes[i], 0, + limit_window[i].Calculate(shard_count - 1), max_windows, i, + hlo->sharding(), first_window_hlo, replicated_init.hlo(), + partition_ordinals[i], collective_ops_creator_, next_channel_id_, &b_); + if (!resharded_source) { + return DefaultAction(hlo); + } + source_shard_hlo = *resharded_source; + + auto offset_start_in_data = + MultiplyAddDivideOffsetCalculation(wd.stride(), 0, 1) + .Calculate(first_window_hlo, &b_); + int64 padded_data_size = + (limit_window[i].Calculate(shard_count - 1) - 1) * wd.stride() + + wd.size(); + int64 data_shard_size = (max_windows - 1) * wd.stride() + wd.size(); + auto resharded_data = ExchangeHaloAndGetValidData( + data_shard_hlo, operand.base_shape(), data_left_halo_sizes[i], + data_right_halo_sizes[i], wd.padding_low(), padded_data_size, + data_shard_size, i, hlo->sharding(), offset_start_in_data, pad_value, + partition_ordinals[i], collective_ops_creator_, next_channel_id_, &b_); + if (!resharded_data) { + return DefaultAction(hlo); + } + data_shard_hlo = *resharded_data; + } + + Window window_on_shard = hlo->window(); + for (int64 i = 0; i < window_on_shard.dimensions_size(); ++i) { + int64 shard_count = hlo->sharding().tile_assignment().dim(i); + if (shard_count == 1) { + continue; + } + auto reshard_wd = window_on_shard.mutable_dimensions(i); + // The shards are already explicitly padded. + reshard_wd->set_padding_low(0); + reshard_wd->set_padding_high(0); + } + + auto sharded_select_and_scatter = + b_.AddInstruction(HloInstruction::CreateSelectAndScatter( + data_shard_hlo->shape(), data_shard_hlo, select, window_on_shard, + source_shard_hlo, replicated_init.hlo(), + hlo->called_computations()[1])); + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + if (ShapeUtil::Compatible(sharded_select_and_scatter->shape(), + shard_shape)) { + return sharded_select_and_scatter; + } + auto zero = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector slice_offsets(shard_shape.rank(), zero); + for (int64 i = 0; i < window_on_shard.dimensions_size(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) == 1) { + continue; + } + int64 pad_low = hlo->window().dimensions(i).padding_low(); + auto left_halo_size = + data_left_halo_sizes[i].Calculate(partition_ordinals[i], &b_); + if (data_left_halo_sizes[i].Calculate(0) == pad_low) { + slice_offsets[i] = left_halo_size; + } else { + auto is_shard0 = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), zero, partition_ordinals[i], + ComparisonDirection::kEq)); + auto pad_low_hlo = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(pad_low))); + slice_offsets[i] = b_.AddInstruction(HloInstruction::CreateTernary( + zero->shape(), HloOpcode::kSelect, is_shard0, pad_low_hlo, + left_halo_size)); + } + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_select_and_scatter, slice_offsets, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleTuple(HloInstruction* hlo) { + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back( + GetPartitionedHlo(hlo->operand(i)) + .Reshard(hlo->sharding().GetSubSharding(hlo->shape(), {i})) + .hlo()); + } + SetPartitionedHlo(hlo, [&]() { + return 
b_.AddInstruction(HloInstruction::CreateTuple(new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( + HloInstruction* hlo) { + TF_RET_CHECK(hlo->opcode() == HloOpcode::kConvolution); + + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && + !rhs.sharding().IsTileMaximal()); + + const auto& dnums = hlo->convolution_dimension_numbers(); + + // Check if the operand shardings are aligned. Also we currently don't + // support partitioning non-spatial dimensions. + std::vector rhs_to_lhs_indices(hlo->shape().rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(hlo->shape().rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + return lhs_sharding.tile_assignment().dim(dnums.input_batch_dimension()) != + 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return DefaultAction(hlo); + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return DefaultAction(hlo); + } + lhs = lhs.PadWithValue(zero); + rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + } + + // Reshard LHS by exchanging halo such that each shard computes the partial + // sum of the full shape result, and add AllReduce. + // + // The size of halo on each dimension can be calculated from the projection + // onto the LHS that each RHS shard i needs to read. RHS and LHS below refers + // to the shard size of RHS and LHS, WC is the number of windows, and D is the + // window dilation. 
+ // + // * offset(i): RHS * D * i - low_padding + // * limit(i): {(RHS - 1) * D + 1} * (i + 1) + (WC - 1) * stride - low_padding + // + // Since shard i has LHS of range [i * LHS, (i + 1) * LHS) + // * left-halo: i * LHS - offset(i) + // = (LHS - RHS) * i + low_padding + // * right-halo: limit(i) - (i + 1) * LHS + // = [{(RHS - 1) * D + 1} - LHS] * (i + 1) + (WC - 1) * stride - low_padding + + Window window = hlo->window(); + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1 || wd.window_reversal()) { + return DefaultAction(hlo); + } + + int64 lhs_shard_size = + CeilOfRatio(lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = + CeilOfRatio(rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions(hlo->shape().rank()); + std::vector right_halo_size_functions(hlo->shape().rank()); + Window new_window = window; + + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); + HloInstruction* lhs_with_halo = lhs.hlo(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. + auto wd = window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + int64 rhs_shard_size_dilated = + (rhs_shard_size - 1) * wd.window_dilation() + 1; + + left_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), padding_low, + 1)); + right_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size_dilated - lhs_shard_size, + rhs_shard_size_dilated - lhs_shard_size + + wd.stride() * (window_count - 1) - padding_low, + 1)); + + // Exchange halo and concatenate. + int64 dim = dnums.input_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = padding_low; + int64 shard_size_with_halo = + wd.stride() * (window_count - 1) + rhs_shard_size_dilated; + + new_window.mutable_dimensions(i)->set_padding_low(0); + new_window.mutable_dimensions(i)->set_padding_high(0); + new_window.mutable_dimensions(i)->set_size(rhs_shard_size); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). + // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. 
+ // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation()); + int64 padded_full_shape_size = 0; + auto concat = ExchangeHaloAndGetValidData( + lhs_with_halo, lhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, lhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), zero, + partition_ordinals[dim], collective_ops_creator_, next_channel_id_, &b_, + /*mask_invalid_region=*/false); + if (!concat) { + return DefaultAction(hlo); + } + lhs_with_halo = *concat; + } + + SetPartitionedHlo(hlo, [&]() { + auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( + hlo->shape(), lhs_with_halo, rhs.hlo(), hlo->feature_group_count(), + hlo->batch_group_count(), new_window, + hlo->convolution_dimension_numbers(), hlo->precision_config())); + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + const HloSharding& sharding = hlo->sharding(); + const auto& dnums = hlo->convolution_dimension_numbers(); + std::vector rhs_to_lhs_indices(hlo->shape().rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(hlo->shape().rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + // Handling cases where both operands' shardings are aligned. We check that + // the LHS batch dimension is not partitioned because it is mapped to the + // output feature dimension in aligned_rhs_sharding, which are not the same + // dimension. + if (!lhs.sharding().IsTileMaximal() && !rhs.sharding().IsTileMaximal()) { + if (options_.conv_halo_exchange_always_on_lhs) { + return HandleConvolutionTiledLhsAndRhs(hlo); + } else { + // Reshard RHS so that each shard computes the partial sum of the full + // shape result, and add AllReduce. See HandleConvolutionTiledLhsAndRhs() + // that reshards LHS. + // + // The size of halo on each dimension can be calculated from the + // projection onto the RHS that shard i needs to read. RHS and LHS below + // refers to the shard size of RHS and LHS, WC is the number of windows, + // and D is the window dilation. 
+ // + // * offset(i): LHS * i + low_padding - (WC - 1) * stride + // * limit(i): LHS * (i + 1) + low_padding + // + // Since shard i has RHS of range [i * RHS * D, (i + 1) * RHS * D) + // * left-halo: i * RHS - offset(i) + // = i * (RHS * D - LHS) + (WC - 1) * stride - low_padding + // * right-halo: limit(i) - (i + 1) * RHS + // = (i + 1) * (LHS - RHS * D) + low_pading + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + // We currently don't support partitioning input batch or output feature + // dimensions. + return lhs_sharding.tile_assignment().dim( + dnums.input_batch_dimension()) != 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return DefaultAction(hlo); + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return DefaultAction(hlo); + } + lhs = lhs.PadWithValue(zero); + rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + } + + Window window = hlo->window(); + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = rhs.sharding().tile_assignment().dim(rhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1 || wd.window_reversal()) { + return DefaultAction(hlo); + } + + int64 lhs_shard_size = CeilOfRatio( + lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = CeilOfRatio( + rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions( + hlo->shape().rank()); + std::vector right_halo_size_functions( + hlo->shape().rank()); + Window new_window = window; + + // Data structures needed for Pad and DynamicSlice on LHS if needed. + bool need_dynamic_slice_lhs = false; + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); + std::vector zero_padding(hlo->shape().rank()); + PaddingConfig pad_config = + window_util::MakeSymmetricPadding(zero_padding); + auto zero_s32 = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector dynamic_slice_start_indices( + hlo->shape().rank(), zero_s32); + Shape dynamic_slice_shape = lhs.hlo()->shape(); + Shape pad_shape = lhs.hlo()->shape(); + + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. 
It calculates the halo sizes with dilation, so we apply
+ // CeilOfRatio({left,right}_halo_size, window_dilation).
+ auto wd = window.dimensions(i);
+ int64 padding_low = wd.padding_low();
+ int64 padding_high = wd.padding_high();
+ int64 base = lhs.base_shape().dimensions(lhs_dimension);
+ int64 window_count =
+ 1 + (padding_low + padding_high + base -
+ (1 + (wd.size() - 1) * wd.window_dilation())) /
+ wd.stride();
+ left_halo_size_functions[rhs_dimension] =
+ OffsetCalculation(MultiplyAddDivideOffsetCalculation(
+ rhs_shard_size * wd.window_dilation() - lhs_shard_size,
+ (window_count - 1) * wd.stride() - padding_low +
+ wd.window_dilation() - 1,
+ wd.window_dilation()));
+ right_halo_size_functions[rhs_dimension] =
+ OffsetCalculation(MultiplyAddDivideOffsetCalculation(
+ lhs_shard_size - rhs_shard_size * wd.window_dilation(),
+ lhs_shard_size - rhs_shard_size * wd.window_dilation() +
+ padding_low + wd.window_dilation() - 1,
+ wd.window_dilation()));
+
+ // New RHS window size includes the maximum of both left and right
+ // halos.
+ int64 halo_size = left_halo_size_functions[rhs_dimension].MaxInRange(
+ 1, shard_counts[i]) +
+ right_halo_size_functions[rhs_dimension].MaxInRange(
+ 0, shard_counts[i] - 1);
+ int64 new_window_size =
+ rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size;
+
+ // The amount of new low padding could be dynamic (e.g., window_dilation
+ // != 1), which requires pad (to the maximum) and dynamic slice on LHS.
+ //
+ // If we consider the first window, the offset of the dilated RHS that
+ // aligns with the first valid LHS element for shard i is 'padding_low +
+ // LHS * i'. When the left halo is added to RHS, the offset of the first
+ // RHS element is (RHS * i - left_halo) * window_dilation. The
+ // difference between the two values is the amount of padding_low we
+ // need on LHS.
+ auto new_padding_low_function =
+ OffsetCalculation(
+ HloOpcode::kMultiply, left_halo_size_functions[rhs_dimension],
+ OffsetCalculation(MultiplyAddDivideOffsetCalculation(
+ 0, wd.window_dilation(), 1))) -
+ OffsetCalculation(MultiplyAddDivideOffsetCalculation(
+ rhs_shard_size * wd.window_dilation() - lhs_shard_size,
+ -padding_low, 1));
+
+ int64 new_padding_low_max =
+ new_padding_low_function.MaxInRange(0, shard_counts[i]);
+ int64 new_padding_low = new_padding_low_max;
+ int64 new_padding_high = window_count * wd.stride() +
+ (new_window_size - 1) * wd.window_dilation() -
+ new_padding_low - lhs_shard_size;
+
+ // We do pad/dynamic-slice only when the padding is dynamic.
+ if (!new_padding_low_function.IsConstant()) {
+ need_dynamic_slice_lhs = true;
+ new_padding_low = 0;
+ pad_config.mutable_dimensions(lhs_dimension)
+ ->set_edge_padding_low(new_padding_low_max);
+ pad_config.mutable_dimensions(lhs_dimension)
+ ->set_edge_padding_high(new_padding_low_max);
+ pad_shape.set_dimensions(lhs_dimension,
+ lhs_shard_size + 2 * new_padding_low_max);
+ dynamic_slice_start_indices[lhs_dimension] =
+ (OffsetCalculation(MultiplyAddDivideOffsetCalculation(
+ 0, new_padding_low_max, 1)) -
+ new_padding_low_function)
+ .Calculate(partition_ordinals[lhs_dimension], &b_);
+ dynamic_slice_shape.set_dimensions(
+ lhs_dimension, lhs_shard_size + new_padding_low_max);
+ }
+
+ // Since the convolution RHS operand size increased with halos, adjust
+ // the window config accordingly.
+ new_window.mutable_dimensions(i)->set_padding_low(new_padding_low); + new_window.mutable_dimensions(i)->set_padding_high(new_padding_high); + new_window.mutable_dimensions(i)->set_size( + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size); + } + + HloInstruction* conv_lhs = lhs.hlo(); + if (need_dynamic_slice_lhs) { + auto pad = b_.AddInstruction( + HloInstruction::CreatePad(pad_shape, lhs.hlo(), zero, pad_config)); + conv_lhs = b_.AddInstruction(HloInstruction::CreateDynamicSlice( + dynamic_slice_shape, pad, dynamic_slice_start_indices, + dynamic_slice_shape.dimensions())); + } + + // Exchange halo and concatenate. + HloInstruction* rhs_with_halo = rhs.hlo(); + for (int i = 0; i < dnums.kernel_spatial_dimensions_size(); ++i) { + int64 dim = dnums.kernel_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = + left_halo_size_functions[dim].Calculate(0); + int64 shard_size_with_halo = new_window.dimensions(i).size(); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). + // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. + // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_sizes[i], explicit_left_padding_on_full_shape, 1)) - + left_halo_size_functions[dim]; + int64 padded_full_shape_size = + offset_on_padded_shape.Calculate(shard_counts[i] - 1) + + new_window.dimensions(i).size(); + auto concat = ExchangeHaloAndGetValidData( + rhs_with_halo, rhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, rhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), + zero, partition_ordinals[dim], collective_ops_creator_, + next_channel_id_, &b_, /*mask_invalid_region=*/false); + if (!concat) { + return DefaultAction(hlo); + } + rhs_with_halo = *concat; + } + + SetPartitionedHlo(hlo, [&]() { + auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( + hlo->shape(), conv_lhs, rhs_with_halo, hlo->feature_group_count(), + hlo->batch_group_count(), new_window, dnums, + hlo->precision_config())); + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + + if (!sharding.IsTileMaximal()) { + // We don't currently support sharding on output feature dimension. + if (sharding.tile_assignment().dim(dnums.output_feature_dimension()) > 1) { + return DefaultAction(hlo); + } + + // Check if the operand and the output sharding are aligned. 
+ std::vector input_to_output_indices(hlo->shape().rank()); + input_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + input_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + input_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + auto target_operand_sharding = + hlo_sharding_util::TransposeSharding(sharding, input_to_output_indices); + lhs = lhs.Reshard(target_operand_sharding); + + // Replicate the RHS. + rhs = rhs.Reshard(HloSharding::Replicate()); + + // Convolution window config does not include batch and feature dimensions, + // whereas ReshardAsWindowedInput() expects the same number of window + // dimensions as the rank of the operand. So add two more trivial + // dimensions. + std::vector ones(hlo->shape().rank(), 1); + auto operand_window = window_util::MakeWindow(ones); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *operand_window.mutable_dimensions(dnums.input_spatial_dimensions(i)) = + hlo->window().dimensions(i); + } + + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + auto resharded_operand_and_window = lhs.ReshardAsWindowedInput( + operand_window, target_operand_sharding, zero); + if (!resharded_operand_and_window.has_value()) { + return DefaultAction(hlo); + } + Window new_window; + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *new_window.add_dimensions() = + resharded_operand_and_window->shard_window.dimensions( + dnums.input_spatial_dimensions(i)); + } + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + resharded_operand_and_window->sharded_input->shape(), + rhs.hlo()->shape(), hlo->feature_group_count(), + hlo->batch_group_count(), new_window, dnums)); + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + *sharded_conv_shape.mutable_layout() = shard_shape.layout(); + SetPartitionedHlo(hlo, [&]() { + auto sharded_conv = b_.AddInstruction(HloInstruction::CreateConvolve( + sharded_conv_shape, resharded_operand_and_window->sharded_input, + rhs.hlo(), hlo->feature_group_count(), hlo->batch_group_count(), + new_window, dnums, hlo->precision_config())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_conv->shape())); + return sharded_conv; + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_conv, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { + DotGeneralDimsMapping mapping; + const auto& dnums = hlo->dot_dimension_numbers(); + int64 next_output_dim = 0; + for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); ++i) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dnums.lhs_batch_dimensions(i); + mapping.batch_dims.back().rhs = dnums.rhs_batch_dimensions(i); + mapping.batch_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < dnums.lhs_contracting_dimensions_size(); ++i) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dnums.lhs_contracting_dimensions(i); + mapping.contracting_dims.back().rhs = dnums.rhs_contracting_dimensions(i); + 
mapping.contracting_dims.back().output = -1;
+ }
+ for (int64 i = 0; i < hlo->operand(0)->shape().rank(); ++i) {
+ if (absl::c_linear_search(dnums.lhs_batch_dimensions(), i) ||
+ absl::c_linear_search(dnums.lhs_contracting_dimensions(), i)) {
+ continue;
+ }
+ mapping.lhs_non_contracting_dims.emplace_back();
+ mapping.lhs_non_contracting_dims.back().lhs = i;
+ mapping.lhs_non_contracting_dims.back().rhs = -1;
+ mapping.lhs_non_contracting_dims.back().output = next_output_dim++;
+ }
+ for (int64 i = 0; i < hlo->operand(1)->shape().rank(); ++i) {
+ if (absl::c_linear_search(dnums.rhs_batch_dimensions(), i) ||
+ absl::c_linear_search(dnums.rhs_contracting_dimensions(), i)) {
+ continue;
+ }
+ mapping.rhs_non_contracting_dims.emplace_back();
+ mapping.rhs_non_contracting_dims.back().lhs = -1;
+ mapping.rhs_non_contracting_dims.back().rhs = i;
+ mapping.rhs_non_contracting_dims.back().output = next_output_dim++;
+ }
+ auto create_sharded_dot = [&](HloInstruction* l, HloInstruction* r,
+ SpmdBuilder* b) -> StatusOr<HloInstruction*> {
+ TF_ASSIGN_OR_RETURN(
+ auto sharded_dot_shape,
+ ShapeInference::InferDotOpShape(l->shape(), r->shape(),
+ hlo->dot_dimension_numbers()));
+ return b->AddInstruction(HloInstruction::CreateDot(
+ sharded_dot_shape, l, r, hlo->dot_dimension_numbers(),
+ hlo->precision_config()));
+ };
+ return HandleDotHelper(hlo, mapping, create_sharded_dot);
+}
+
+Status SpmdPartitioningVisitor::HandleDotHelper(
+ HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping,
+ const std::function<StatusOr<HloInstruction*>(
+ HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot) {
+ const HloSharding& lhs_sharding = hlo->operand(0)->sharding();
+ const HloSharding& rhs_sharding = hlo->operand(1)->sharding();
+
+ // Similar to hlo_sharding_util::TransposeSharding(), but allows
+ // removing/adding non-partitioned dimensions.
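The transpose_sharding helper defined below generalizes hlo_sharding_util::TransposeSharding() by letting dimensions disappear (a src_to_tgt entry of -1) or appear, provided those dimensions are not partitioned. A reduced sketch of that rule on plain vectors (hypothetical tilings; the real code operates on HloSharding tile assignments):

#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

// tiles[i] is the partition count along source dimension i; src_to_tgt[i] is
// the target dimension it maps to, or -1 if it has no counterpart (e.g. a
// contracting dimension with no output dimension). Dropping a dimension is
// only allowed when it is unpartitioned, mirroring the nullopt case below.
std::optional<std::vector<int64_t>> TransposeTiles(
    const std::vector<int64_t>& tiles,
    const std::vector<int64_t>& src_to_tgt, int64_t target_rank) {
  std::vector<int64_t> result(target_rank, 1);
  for (std::size_t i = 0; i < tiles.size(); ++i) {
    if (src_to_tgt[i] < 0) {
      if (tiles[i] > 1) return std::nullopt;  // Cannot drop a partitioned dim.
      continue;
    }
    result[src_to_tgt[i]] = tiles[i];
  }
  return result;
}

int main() {
  // LHS dims {batch, m, k} tiled {2, 1, 4}, mapped onto RHS dims {batch, k, n}
  // as batch->0, m->none, k->1: the RHS-shaped tiling becomes {2, 4, 1}.
  auto rhs_tiles = TransposeTiles({2, 1, 4}, {0, -1, 1}, /*target_rank=*/3);
  return rhs_tiles && (*rhs_tiles)[1] == 4 ? 0 : 1;
}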
+ auto transpose_sharding = + [&](const HloSharding& source, absl::Span src_to_tgt, + absl::Span tgt_to_src) -> absl::optional { + if (source.IsTileMaximal()) { + return source; + } + std::vector tgt_dims_skipping_new(tgt_to_src.size(), -1); + int64 skipped_tgt_dims = 0; + for (int64 i = 0; i < tgt_to_src.size(); ++i) { + if (tgt_to_src[i] < 0) { + skipped_tgt_dims++; + } else { + tgt_dims_skipping_new[i] = i - skipped_tgt_dims; + } + } + int64 skipped_src_dims = absl::c_count(src_to_tgt, -1); + std::vector perm(src_to_tgt.size()); + for (int64 i = 0; i < src_to_tgt.size(); ++i) { + if (src_to_tgt[i] < 0) { + if (source.tile_assignment().dim(i) > 1) { + return absl::nullopt; + } + perm[src_to_tgt.size() - skipped_src_dims] = i; + skipped_src_dims--; + } else { + perm[tgt_dims_skipping_new[src_to_tgt[i]]] = i; + } + } + auto tgt_sharding = hlo_sharding_util::TransposeSharding(source, perm); + if (skipped_tgt_dims == 0) { + return tgt_sharding; + } + auto reshape_tiles = tgt_sharding.tile_assignment(); + std::vector tgt_tiles(tgt_to_src.size(), 1); + for (int64 i = 0; i < tgt_tiles.size(); ++i) { + if (tgt_to_src[i] >= 0) { + tgt_tiles[i] = reshape_tiles.dim(tgt_dims_skipping_new[i]); + } + } + reshape_tiles.Reshape(tgt_tiles); + return HloSharding::Tile(reshape_tiles); + }; + + std::vector lhs_to_rhs_indices(hlo->operand(0)->shape().rank(), -1); + std::vector lhs_to_output_indices(hlo->operand(0)->shape().rank(), -1); + std::vector rhs_to_lhs_indices(hlo->operand(1)->shape().rank(), -1); + std::vector rhs_to_output_indices(hlo->operand(1)->shape().rank(), -1); + std::vector output_to_lhs_indices(hlo->shape().rank(), -1); + std::vector output_to_rhs_indices(hlo->shape().rank(), -1); + auto populate_indices_mapping = + [&](const DotGeneralDimsMapping::DimsMapping& mapping) { + if (mapping.lhs >= 0) { + lhs_to_rhs_indices[mapping.lhs] = mapping.rhs; + lhs_to_output_indices[mapping.lhs] = mapping.output; + } + if (mapping.rhs >= 0) { + rhs_to_lhs_indices[mapping.rhs] = mapping.lhs; + rhs_to_output_indices[mapping.rhs] = mapping.output; + } + if (mapping.output >= 0) { + output_to_lhs_indices[mapping.output] = mapping.lhs; + output_to_rhs_indices[mapping.output] = mapping.rhs; + } + }; + for (const auto& mapping : dims_mapping.batch_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.lhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.rhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + auto lhs_sharding_transposed_to_match_rhs = + transpose_sharding(lhs_sharding, lhs_to_rhs_indices, rhs_to_lhs_indices); + auto rhs_sharding_transposed_to_match_lhs = + transpose_sharding(rhs_sharding, rhs_to_lhs_indices, lhs_to_rhs_indices); + auto lhs_sharding_transposed_to_match_output = transpose_sharding( + lhs_sharding, lhs_to_output_indices, output_to_lhs_indices); + auto rhs_sharding_transposed_to_match_output = transpose_sharding( + rhs_sharding, rhs_to_output_indices, output_to_rhs_indices); + auto output_sharding_transposed_to_match_lhs = transpose_sharding( + hlo->sharding(), output_to_lhs_indices, lhs_to_output_indices); + auto output_sharding_transposed_to_match_rhs = transpose_sharding( + hlo->sharding(), output_to_rhs_indices, rhs_to_output_indices); + + // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. 
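get_partitions_for_dims below simply multiplies the tile-assignment sizes of the dimensions in a group, reading the LHS, RHS, or output entry of each mapping depending on lhs_rhs_or_output. With a hypothetical tiling, the count for one group looks like this:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Hypothetical operand tiled as {2, 4, 1}; its contracting dims are {1}.
  const std::vector<int64_t> tile_dims = {2, 4, 1};
  const std::vector<int64_t> contracting_dims = {1};
  int64_t partitions = 1;
  for (int64_t d : contracting_dims) partitions *= tile_dims[d];
  assert(partitions == 4);  // E.g. lhs_contracting_partitions below.
  return 0;
}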
+ auto get_partitions_for_dims = + [&](const HloSharding& sharding, + absl::Span dims, + int lhs_rhs_or_output) { + int64 partitions = 1; + if (sharding.IsTileMaximal()) { + return partitions; + } + for (const auto& dim : dims) { + if (lhs_rhs_or_output == 0) { + partitions *= sharding.tile_assignment().dim(dim.lhs); + } else if (lhs_rhs_or_output == 1) { + partitions *= sharding.tile_assignment().dim(dim.rhs); + } else { + CHECK_EQ(lhs_rhs_or_output, 2); + partitions *= sharding.tile_assignment().dim(dim.output); + } + } + return partitions; + }; + const int64 lhs_batch_partitions = + get_partitions_for_dims(lhs_sharding, dims_mapping.batch_dims, 0); + const int64 rhs_batch_partitions = + get_partitions_for_dims(rhs_sharding, dims_mapping.batch_dims, 1); + const int64 output_batch_partitions = + get_partitions_for_dims(hlo->sharding(), dims_mapping.batch_dims, 2); + const int64 lhs_contracting_partitions = + get_partitions_for_dims(lhs_sharding, dims_mapping.contracting_dims, 0); + const int64 rhs_contracting_partitions = + get_partitions_for_dims(rhs_sharding, dims_mapping.contracting_dims, 1); + const int64 lhs_non_contracting_partitions = get_partitions_for_dims( + lhs_sharding, dims_mapping.lhs_non_contracting_dims, 0); + const int64 rhs_non_contracting_partitions = get_partitions_for_dims( + rhs_sharding, dims_mapping.rhs_non_contracting_dims, 1); + const int64 output_lhs_non_contracting_partitions = get_partitions_for_dims( + hlo->sharding(), dims_mapping.lhs_non_contracting_dims, 2); + const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( + hlo->sharding(), dims_mapping.rhs_non_contracting_dims, 2); + + auto& lhs = GetPartitionedHlo(hlo->operand(0)); + auto& rhs = GetPartitionedHlo(hlo->operand(1)); + // LHS and RHS are partitioned the same way and only partitioned in batch + // dimensions. + if (lhs_batch_partitions == rhs_batch_partitions && + rhs_batch_partitions == num_partitions_ && + lhs_sharding_transposed_to_match_rhs == rhs_sharding) { + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + dot->set_sharding(*lhs_sharding_transposed_to_match_output); + return PartitionedHlo(dot, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + + // Try emit batch-partitioned einsum with one operand resharded. Returns + // whether the attempt succeeds. If may_reshard_with_allreduce is false, + // reshard must be done using all-to-all; otherwise this attempt fails. + auto try_emit_output_batch_partitioned_einsum_with_reshard = + [&](bool may_reshard_with_allreduce) -> StatusOr { + // LHS and output are batch partitioned in the same way. + if (lhs_batch_partitions == num_partitions_ && + output_batch_partitions == num_partitions_ && + lhs_sharding_transposed_to_match_output == hlo->sharding()) { + if (!may_reshard_with_allreduce && + !CanReshardWithAllToAll(rhs.sharding(), + *lhs_sharding_transposed_to_match_rhs)) { + return false; + } + auto resharded_rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.hlo(), resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return true; + } + // RHS and output are batch partitioned in the same way. 
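The two batch-parallel cases above and below reshard one operand so that every partition can compute its batch slice with a purely local dot; because batch dimensions are never contracted, no cross-partition reduction is needed afterwards. A minimal sketch with hypothetical scalar "matrices" per batch element:

#include <cassert>
#include <vector>

int main() {
  const int kBatch = 4, kPartitions = 2, kPerShard = kBatch / kPartitions;
  std::vector<float> lhs = {1, 2, 3, 4};  // One entry per batch element.
  std::vector<float> rhs = {5, 6, 7, 8};
  std::vector<float> out(kBatch);
  for (int p = 0; p < kPartitions; ++p) {
    for (int b = p * kPerShard; b < (p + 1) * kPerShard; ++b) {
      out[b] = lhs[b] * rhs[b];  // Local dot on partition p's batch slice.
    }
  }
  assert(out[0] == 5 && out[3] == 32);
  return 0;
}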
+ if (rhs_batch_partitions == num_partitions_ && + output_batch_partitions == num_partitions_ && + rhs_sharding_transposed_to_match_output == hlo->sharding()) { + if (!may_reshard_with_allreduce && + !CanReshardWithAllToAll(lhs.sharding(), + *rhs_sharding_transposed_to_match_lhs)) { + return false; + } + auto resharded_lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(resharded_lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return true; + } + return false; + }; + + { + // Try batch-parallel by resharding one operand, and not using all-reduce. + TF_ASSIGN_OR_RETURN( + bool emitted, + try_emit_output_batch_partitioned_einsum_with_reshard(false)); + if (emitted) { + return Status::OK(); + } + } + + // Try to emit windowed DotGeneral when one operand is partitioned in the same + // way as the output along non-contracting dimensions, but the other operand + // is tiled in other dimensions. + auto emit_windowed_dot_general = [&](int64 matching_operand, + int64 windowing_operand, + bool windowed_at_contracting_dims, + bool windowed_at_batch_dims) { + CHECK_EQ(matching_operand + windowing_operand, 1); + CHECK(!windowed_at_batch_dims || !windowed_at_contracting_dims); + auto unpadded_result_buffer_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto padded_result_buffer_shape = unpadded_result_buffer_shape; + // For windowing at batch/non-contracting dims, we produce the result one + // partition at a time, so we need to pad the shape in case of uneven + // partitioning in order to make dynamic-update-slice in-bound. + if (!windowed_at_contracting_dims) { + padded_result_buffer_shape = GetPaddedShapeForUnevenPartitioning( + padded_result_buffer_shape, + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output); + } + // Mask the padding area of the windowed operand with zero if there is + // uneven partitioning. + if (windowed_at_contracting_dims) { + auto& to_mask = windowing_operand == 0 ? lhs : rhs; + to_mask = + to_mask.PadWithValue(b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type())))); + } + auto result_buffer = CreateZero(padded_result_buffer_shape, &b_); + auto iteration = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + + // Create a while loop that computes one window per iteration. During each + // iteration, each partition sends its input window to its neighbor using + // collective-permute for the next iteration. 
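The while loop built below advances one window per iteration and rotates the windowed operand with a collective-permute, so after num_partitions_ iterations every partition has seen every window exactly once. A host-side simulation of that schedule (plain C++, hypothetical partition count):

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  const int kPartitions = 4;
  // window[p] is the chunk currently held by partition p; initially its own.
  std::vector<int> window(kPartitions);
  std::iota(window.begin(), window.end(), 0);
  std::vector<int> seen(kPartitions, 0);  // Bitmask of chunks used so far.
  for (int iter = 0; iter < kPartitions; ++iter) {
    for (int p = 0; p < kPartitions; ++p) seen[p] |= 1 << window[p];
    // Collective-permute: source s sends its window to (s - 1 + P) % P, as in
    // the sd_pairs built further below, so partition p receives the window
    // previously held by (p + 1) % P.
    std::vector<int> next(kPartitions);
    for (int s = 0; s < kPartitions; ++s) {
      next[(s - 1 + kPartitions) % kPartitions] = window[s];
    }
    window = next;
  }
  // After kPartitions iterations each partition has used every chunk once.
  for (int p = 0; p < kPartitions; ++p) {
    assert(seen[p] == (1 << kPartitions) - 1);
  }
  return 0;
}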
+ SpmdBuilder body_b("windowed_dot_general_body", visiting_hlo_); + auto param = body_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto l = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs.hlo()->shape(), param, 0)); + auto r = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs.hlo()->shape(), param, 1)); + auto o = body_b.AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), param, 2)); + auto i = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(iteration->shape(), param, 3)); + + auto partition_id = collective_ops_creator_.create_partition_id(&body_b); + auto data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, partition_id)); + auto partition_count = body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))); + data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kRemainder, data_partition_id, partition_count)); + auto dot_lhs = l; + auto dot_rhs = r; + if (windowed_at_contracting_dims || windowed_at_batch_dims) { + // Slice the matching operand according to the partitioned contracting + // dimensions on the windowed operand. We do this by treating the matching + // operand as replicated, and resharding it to match the windowed operand. + auto slice_operand = matching_operand == 0 ? l : r; + slice_operand->set_sharding(HloSharding::Replicate()); + auto state = MakePartitioningState(); + state.b = &body_b; + state.partition_id = data_partition_id; + auto slice = PartitionedHlo(slice_operand, slice_operand->shape(), state) + .Reshard(windowing_operand == 0 + ? *lhs_sharding_transposed_to_match_rhs + : *rhs_sharding_transposed_to_match_lhs) + .hlo(); + slice_operand->clear_sharding(); + if (matching_operand == 0) { + dot_lhs = slice; + } else { + dot_rhs = slice; + } + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(dot_lhs, dot_rhs, &body_b)); + if (windowed_at_contracting_dims) { + // Accumulate the partial output to the result buffer. + o = body_b.AddInstruction( + HloInstruction::CreateBinary(o->shape(), HloOpcode::kAdd, o, dot)); + } else { + // The windowing operand is partitioned along batch/non-contracting + // dimensions, so we need a dynamic-update-slice to save the partial + // output in the result buffer. + auto offsets = MakePartitionOffsets( + o->shape(), + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output, + data_partition_id, &body_b); + o = body_b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + o->shape(), o, dot, offsets)); + } + + // ++i + i = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, + body_b.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))))); + auto has_more = body_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), i, + body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))), + ComparisonDirection::kLt)); + // Collective-permute for the next window. We don't need it for the last + // iteration, so we use a conditional around the collective-permute. 
+ HloInstruction* conditional;
+ {
+ SpmdBuilder cp_b("window_collective_permute", visiting_hlo_);
+ {
+ auto p = cp_b.AddInstruction(HloInstruction::CreateParameter(
+ 0, windowing_operand == 0 ? l->shape() : r->shape(), "window"));
+ std::vector<std::pair<int64, int64>> sd_pairs(num_partitions_);
+ for (int64 source = 0; source < num_partitions_; ++source) {
+ // 0 -> n-1, 1 -> 0, 2 -> 1, ...
+ sd_pairs[source] = {source,
+ (source - 1 + num_partitions_) % num_partitions_};
+ }
+ collective_ops_creator_.create_cross_partition_collective_permute(
+ &cp_b, p, sd_pairs, (*next_channel_id_)++);
+ }
+ SpmdBuilder ncp_b("last_iteration_noop", visiting_hlo_);
+ {
+ ncp_b.AddInstruction(HloInstruction::CreateParameter(
+ 0, windowing_operand == 0 ? l->shape() : r->shape(), "window"));
+ }
+ conditional = body_b.AddInstruction(HloInstruction::CreateConditional(
+ windowing_operand == 0 ? l->shape() : r->shape(), has_more,
+ windowing_operand == 0 ? l : r,
+ module_->AddEmbeddedComputation(cp_b.Build()),
+ windowing_operand == 0 ? l : r,
+ module_->AddEmbeddedComputation(ncp_b.Build())));
+ }
+ if (windowing_operand == 0) {
+ l = conditional;
+ } else {
+ r = conditional;
+ }
+ body_b.AddInstruction(HloInstruction::CreateTuple({l, r, o, i}));
+
+ SpmdBuilder cond_b("windowed_dot_general_cond", visiting_hlo_);
+ auto cond_param = cond_b.AddInstruction(HloInstruction::CreateParameter(
+ /*parameter_number=*/0,
+ ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(),
+ result_buffer->shape(), iteration->shape()}),
+ "param"));
+ auto cond_i = cond_b.AddInstruction(HloInstruction::CreateGetTupleElement(
+ iteration->shape(), cond_param, 3));
+ cond_b.AddInstruction(HloInstruction::CreateCompare(
+ ShapeUtil::MakeShape(PRED, {}), cond_i,
+ cond_b.AddInstruction(HloInstruction::CreateConstant(
+ LiteralUtil::CreateR0<uint32>(num_partitions_))),
+ ComparisonDirection::kLt));
+ auto while_loop = b_.AddInstruction(HloInstruction::CreateWhile(
+ cond_param->shape(), module_->AddEmbeddedComputation(cond_b.Build()),
+ module_->AddEmbeddedComputation(body_b.Build()),
+ b_.AddInstruction(HloInstruction::CreateTuple(
+ {lhs.hlo(), rhs.hlo(), result_buffer, iteration}))));
+ windowed_dot_general_loops_.push_back({while_loop, windowing_operand,
+ windowed_at_contracting_dims,
+ windowed_at_batch_dims});
+ SetPartitionedHlo(hlo, [&] {
+ auto result = b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+ result_buffer->shape(), while_loop, 2));
+ if (!ShapeUtil::Compatible(padded_result_buffer_shape,
+ unpadded_result_buffer_shape)) {
+ result = b_.AddInstruction(HloInstruction::CreateSlice(
+ unpadded_result_buffer_shape, result,
+ std::vector<int64>(padded_result_buffer_shape.rank(), 0),
+ unpadded_result_buffer_shape.dimensions(),
+ std::vector<int64>(padded_result_buffer_shape.rank(), 1)));
+ }
+ return result;
+ });
+ return Status::OK();
+ };
+ if (output_lhs_non_contracting_partitions == num_partitions_ &&
+ output_sharding_transposed_to_match_lhs == lhs_sharding &&
+ ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()) >=
+ options_.threshold_for_windowed_einsum_mib * 1024 * 1024) {
+ if (rhs_contracting_partitions == num_partitions_) {
+ return emit_windowed_dot_general(0, 1, true, false);
+ }
+ if (rhs_non_contracting_partitions == num_partitions_) {
+ return emit_windowed_dot_general(0, 1, false, false);
+ }
+ if (rhs_batch_partitions == num_partitions_) {
+ return emit_windowed_dot_general(0, 1, false, true);
+ }
+ }
+ if (output_rhs_non_contracting_partitions == num_partitions_ &&
+ output_sharding_transposed_to_match_rhs ==
rhs_sharding && + ShapeUtil::ByteSizeOf(hlo->operand(0)->shape()) >= + options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (lhs_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, true, false); + } + if (lhs_non_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, false, false); + } + if (lhs_batch_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, false, true); + } + } + + { + // Try batch-parallel by resharding one operand, and allowing all-reduce. + TF_ASSIGN_OR_RETURN( + bool emitted, + try_emit_output_batch_partitioned_einsum_with_reshard(true)); + if (emitted) { + return Status::OK(); + } + } + + // LHS and RHS have the same partitioned contracting dimensions. + if (lhs_contracting_partitions == rhs_contracting_partitions && + lhs_contracting_partitions == num_partitions_) { + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + // Pad both sides with zero, since NaN at one side cannot be masked by zero + // on the other side. + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + + // LHS and output have the same partitioned non-contracting dimensions. + if (lhs_non_contracting_partitions == num_partitions_ && + output_lhs_non_contracting_partitions == num_partitions_ && + lhs_sharding == hlo->sharding()) { + auto rhs_replicated = rhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs_replicated, &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // RHS and output have the same partitioned non-contracting dimensions. + if (rhs_non_contracting_partitions == num_partitions_ && + output_rhs_non_contracting_partitions == num_partitions_ && + rhs_sharding_transposed_to_match_output == hlo->sharding()) { + auto lhs_replicated = lhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs_replicated, rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // Output is batch partitioned. + if (output_batch_partitions == num_partitions_) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), + resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + // Output is partitioned along LHS non-contracting dimensions. 
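For the case above where both operands are partitioned along the contracting dimension, each partition computes a partial dot over its own shard and a cross-partition all-reduce sums the partials; both operands are padded with zeros so the padded lanes add nothing (a zero on only one side could not mask a NaN on the other, which is why both sides are padded). A single-machine simulation of the idea, not the XLA collective itself:

  #include <cstddef>
  #include <iostream>
  #include <numeric>
  #include <vector>

  // Dot product computed as per-partition partial sums over shards of the
  // contracting dimension; summing the partials plays the role of the
  // cross-partition all-reduce. Padded positions are assumed to hold zeros on
  // both operands, so they contribute nothing.
  double ShardedDot(const std::vector<double>& lhs, const std::vector<double>& rhs,
                    std::size_t num_partitions) {
    const std::size_t shard = lhs.size() / num_partitions;  // even split assumed
    std::vector<double> partials(num_partitions, 0.0);
    for (std::size_t p = 0; p < num_partitions; ++p) {
      for (std::size_t k = p * shard; k < (p + 1) * shard; ++k) {
        partials[p] += lhs[k] * rhs[k];
      }
    }
    return std::accumulate(partials.begin(), partials.end(), 0.0);
  }

  int main() {
    std::vector<double> a{1, 2, 3, 4}, b{5, 6, 7, 8};
    std::cout << ShardedDot(a, b, 2) << "\n";  // 70, same as the unsharded dot
  }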
+ if (output_lhs_non_contracting_partitions == num_partitions_) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); + TF_ASSIGN_OR_RETURN( + auto dot, + create_sharded_dot(resharded_lhs.hlo(), replicated_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + // Output is partitioned along RHS non-contracting dimensions. + if (output_rhs_non_contracting_partitions == num_partitions_) { + auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), + resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // Returns true if it is beneficial to reshard the operand at `operand_idx` + // across the contracting dimension. + const auto should_partition_contracting_dim = [&](int64 operand_idx) { + if (!hlo->sharding().IsReplicated()) { + return false; + } + + if (operand_idx == 0) { + // If LHS and output are replicated, we compare the cost of all-gather + // on RHS vs all-reduce on the output. + return (rhs_contracting_partitions == num_partitions_) && + lhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(hlo->operand(1)->shape()) > + ShapeUtil::ElementsIn(hlo->shape()); + } else { + return (lhs_contracting_partitions == num_partitions_) && + rhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(hlo->operand(0)->shape()) > + ShapeUtil::ElementsIn(hlo->shape()); + } + }; + + // When the output is replicated and one of the operands is partitioned along + // contracting dimension, align the other operand to be partitioned along + // the contracting dimensions. + if (hlo->sharding().IsReplicated() && (should_partition_contracting_dim(0) || + should_partition_contracting_dim(1))) { + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (should_partition_contracting_dim(0)) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()).hlo(); + }); + return Status::OK(); + } + + return DefaultAction(hlo); +} + +namespace { + +// Finds a cluster of nodes that produce the inputs for `hlo` which only depend +// on small operands, which means the cluster should start with broadcasts, +// constants and iotas. All other internal nodes must be non-side-effecting +// elemntwise ops. Returns the set of nodes, and the small operands. E.g., for +// the following graph, +// +// a -> broadcast -> multiply +// iota ---> add--/ +// constant/ +// +// FindInputNodesIfOnlyDependOnSmallOperands(multiply) will return +// <{broadcast, iota, constant, add, multiply}, [a]>. 
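The should_partition_contracting_dim lambda above is an element-count heuristic: with a replicated output, keeping the contracting dimension partitioned (and all-reducing the output) only pays off when the operand that would otherwise have to be all-gathered is larger than the output. A standalone restatement of that comparison (names invented, not the XLA API):

  #include <cstdint>

  // Returns true when it is cheaper to all-reduce the replicated output than to
  // all-gather the other operand, i.e. when that operand holds more elements.
  bool WorthPartitioningContractingDim(int64_t other_operand_elements,
                                       int64_t output_elements) {
    return other_operand_elements > output_elements;
  }

  // Example: for a [8192,1024] x [1024,8192] dot with a replicated
  // [8192,8192] output, the RHS has about 8.4M elements while the output has
  // about 67M, so the heuristic returns false and the partitioner would rather
  // all-gather the operand than all-reduce the much larger output.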
+std::pair, std::vector> +FindInputNodesIfOnlyDependOnSmallOperands(HloInstruction* hlo) { + std::unordered_set nodes_found; + std::vector new_operands; + std::unordered_set new_operands_set; + std::vector worklist; + worklist.push_back(hlo); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (nodes_found.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast || + inst->opcode() == HloOpcode::kConstant || + inst->opcode() == HloOpcode::kIota) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + auto res = new_operands_set.emplace(o); + if (res.second) { + new_operands.push_back(o); + } + } + } else if (inst->IsElementwise() && !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + worklist.push_back(o); + } + } else { + nodes_found.clear(); + new_operands.clear(); + break; + } + } + return {std::move(nodes_found), std::move(new_operands)}; +} + +// Moves a cluster of memory-reducing nodes into the windowed dot-general loop +// on contracting dimensions. Such a loop has a dynamic slice on the +// non-windowed operand. If we move the input nodes into the loop, the +// dynamic-slice could be merged with them by later optimization passes, which +// reduces memory. +// +// small_operands small_operands +// | | +// input_nodes loop { | +// | => input_nodes +// loop { | | +// dynamic-slice dynamic-slice +// ... ... +// } } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes. +Status SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( + HloInstruction* loop, int64 non_windowed_operand_index) { + auto input_tuple = loop->mutable_operand(0); + auto old_operand = input_tuple->mutable_operand(non_windowed_operand_index); + auto input_nodes = FindInputNodesIfOnlyDependOnSmallOperands(old_operand); + auto to_sink = std::move(input_nodes.first); + auto new_operands = std::move(input_nodes.second); + if (to_sink.empty()) { + return Status::OK(); + } + auto computation = loop->parent(); + // Replace the old operand with a tuple of the found small operands. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR(input_tuple->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_input_subtuple)); + + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto old_body_param_users = body_param->users(); + // Update all tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body->root_instruction()}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), + {non_windowed_operand_index}) = + new_input_subtuple->shape(); + } + // Now update the loop body. + auto new_operand_tuple_inside = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, non_windowed_operand_index)); + TF_RETURN_IF_ERROR(body->root_instruction()->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_operand_tuple_inside)); + + // Create nodes inside the loop body. 
+ std::vector worklist; + std::unordered_map outside_to_inside; + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_sink.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_operand_tuple_inside, i)); + add_users_if_available(new_operands[i]); + } + // HLOs to sink without operands. + std::vector nullaries_to_sink; + for (auto inst : to_sink) { + if (inst->operand_count() == 0) { + nullaries_to_sink.push_back(inst); + } + } + // Sort nullaries_to_sink to make it deterministic. + absl::c_sort(nullaries_to_sink, + [](const HloInstruction* a, const HloInstruction* b) { + return a->unique_id() < b->unique_id(); + }); + for (auto inst : nullaries_to_sink) { + worklist.push_back(inst); + } + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + std::vector inst_new_operands(inst->operand_count()); + for (int64 i = 0; i < inst->operand_count(); ++i) { + inst_new_operands[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction( + inst->CloneWithNewOperands(inst->shape(), inst_new_operands)); + add_users_if_available(inst); + } + TF_RET_CHECK(outside_to_inside.count(old_operand) > 0); + for (auto ou : old_body_param_users) { + if (ou->opcode() == HloOpcode::kGetTupleElement && + ou->tuple_index() == non_windowed_operand_index) { + TF_RETURN_IF_ERROR( + ou->ReplaceAllUsesWith(outside_to_inside[old_operand])); + TF_RETURN_IF_ERROR(body->RemoveInstruction(ou)); + } + } + return Status::OK(); +} + +// Moves a cluster of memory-reducing nodes (with reduce nodes at the end) into +// the windowed dot-general loop on non-contracting dimensions. Such a loop has +// a dynamic-update-slice at the output. If we move the user nodes into the loop +// and before the dynamic-update-slice, the user nodes can operate on smaller +// shapes, which reduces memory. +// +// small_operands small_operands +// | | => | | +// | | loop { loop { | | +// | | conv | broadcast conv +// | | | | | / +// | | dynamic-update-slice | dynamic-slice / +// | | | | | / +// | | } | | multiply----- +// |broadcast / | / +// | | / reduce +// |multiply-- | +// \ | dynamic-update-slice +// reduce } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes (broadcast). +Status MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( + HloInstruction* loop) { + CHECK_EQ(loop->user_count(), 1); + // There should be a single direct user of the while loop, which is the + // gte for element 2, i.e., the dot output. + auto user_gte = loop->users().front(); + CHECK_EQ(user_gte->opcode(), HloOpcode::kGetTupleElement); + CHECK_EQ(user_gte->tuple_index(), 2); + auto computation = loop->parent(); + + // Find the reduce outputs and the input nodes they depend on, if input nodes + // only have small operands. 
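The sinking code above clones the found cluster into the loop body using an outside-to-inside map and a worklist that only visits a node once every operand already has a clone. A generic, self-contained sketch of that pattern (Node is a stand-in for HloInstruction; in the real pass only users inside the precomputed to_sink/to_move set are followed):

  #include <functional>
  #include <unordered_map>
  #include <vector>

  struct Node {
    std::vector<Node*> operands;
    std::vector<Node*> users;
  };

  // Clones every node reachable from `roots` through user edges, visiting a
  // node only after all of its operands have been cloned, so `clone` can look
  // its operands up in `outside_to_inside`.
  void CloneCluster(const std::vector<Node*>& roots,
                    std::unordered_map<Node*, Node*>& outside_to_inside,
                    const std::function<Node*(Node*)>& clone) {
    std::vector<Node*> worklist(roots.begin(), roots.end());
    while (!worklist.empty()) {
      Node* n = worklist.back();
      worklist.pop_back();
      if (outside_to_inside.count(n) > 0) continue;
      outside_to_inside[n] = clone(n);
      for (Node* user : n->users) {
        bool operands_ready = true;
        for (Node* o : user->operands) {
          operands_ready = operands_ready && outside_to_inside.count(o) > 0;
        }
        if (operands_ready) worklist.push_back(user);
      }
    }
  }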
+  std::unordered_set<HloInstruction*> to_move;
+  std::vector<HloInstruction*> new_operands;
+  std::unordered_set<HloInstruction*> new_operands_set;
+  std::vector<HloInstruction*> reduce_outputs;
+  std::vector<HloInstruction*> worklist;
+  Shape padded_shape = user_gte->shape();
+  Shape unpadded_shape = user_gte->shape();
+  auto original_output = user_gte;
+
+  if (user_gte->user_count() == 1 &&
+      user_gte->users().back()->opcode() == HloOpcode::kSlice) {
+    original_output = user_gte->users().back();
+    unpadded_shape = original_output->shape();
+  }
+  for (auto u : original_output->users()) {
+    worklist.push_back(u);
+  }
+  to_move.insert(original_output);
+  while (!worklist.empty()) {
+    auto inst = worklist.back();
+    worklist.pop_back();
+    if (to_move.count(inst) > 0) {
+      continue;
+    }
+    // We only support reduces with a simple reduction function, since we may
+    // need to accumulate across iterations manually.
+    if (inst->opcode() == HloOpcode::kReduce &&
+        inst->to_apply()->instruction_count() == 3 &&
+        inst->to_apply()->num_parameters() == 2 &&
+        inst->to_apply()->root_instruction()->IsElementwise()) {
+      to_move.insert(inst);
+      auto other_operand = inst->mutable_operand(1);
+      auto res = new_operands_set.emplace(other_operand);
+      if (res.second) {
+        new_operands.push_back(other_operand);
+      }
+      reduce_outputs.push_back(inst);
+    } else if (inst != computation->root_instruction() &&
+               inst->user_count() > 0 && inst->IsElementwise() &&
+               !inst->HasSideEffectNoRecurse() &&
+               inst->opcode() != HloOpcode::kAllReduce &&
+               absl::c_all_of(inst->operands(),
+                              [inst](const HloInstruction* o) {
+                                return ShapeUtil::CompatibleIgnoringElementType(
+                                    o->shape(), inst->shape());
+                              })) {
+      // For an elementwise op, we need to make sure that it depends only on
+      // nodes already in to_move and on nodes with small operands.
+      bool can_include = true;
+      for (auto operand : inst->operands()) {
+        if (to_move.count(operand) > 0) {
+          continue;
+        }
+        auto find_result = FindInputNodesIfOnlyDependOnSmallOperands(operand);
+        if (find_result.first.empty()) {
+          can_include = false;
+          break;
+        }
+        for (auto n : find_result.first) {
+          to_move.insert(n);
+        }
+        for (auto new_operand : find_result.second) {
+          auto res = new_operands_set.insert(new_operand);
+          if (res.second) {
+            new_operands.push_back(new_operand);
+          }
+        }
+      }
+      if (!can_include) {
+        to_move.clear();
+        break;
+      }
+      to_move.insert(inst);
+      for (auto u : inst->users()) {
+        worklist.push_back(u);
+      }
+    } else {
+      to_move.clear();
+      break;
+    }
+  }
+  // If nothing was found, to_move either contains only original_output or was
+  // cleared by the code above.
+  if (to_move.size() <= 1) {
+    return Status::OK();
+  }
+
+  // We will replace the original loop output with reduce-shape outputs. Create
+  // the initial buffers before the loop.
+  for (auto out : reduce_outputs) {
+    auto padded_out_shape = out->shape();
+    int64 operand_dim = 0;
+    int64 output_dim = 0;
+    while (output_dim < padded_out_shape.rank()) {
+      if (absl::c_linear_search(out->dimensions(), operand_dim)) {
+        // Dimension collapsed.
+        ++operand_dim;
+        continue;
+      }
+      // Kept dimensions have the same size as in the padded shape.
+ padded_out_shape.set_dimensions(output_dim, + padded_shape.dimensions(operand_dim)); + ++operand_dim; + ++output_dim; + } + auto broadcast = + computation->AddInstruction(HloInstruction::CreateBroadcast( + padded_out_shape, + computation->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(out->shape().element_type()))), + {})); + new_operands.push_back(broadcast); + } + + auto input_tuple = loop->mutable_operand(0); + // Create the new input subtuple that contains the small operands and the + // reduce-shape result buffers. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR( + input_tuple->ReplaceOperandWithDifferentShape(2, new_input_subtuple)); + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto body_root = body->root_instruction(); + CHECK_EQ(body_root->opcode(), HloOpcode::kTuple); + // Update tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body_root}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), {2}) = + new_input_subtuple->shape(); + } + auto new_loop_input = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, 2)); + + // Now create the moved nodes inside the loop body. + std::unordered_map outside_to_inside; + worklist.clear(); + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_move.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_loop_input, i)); + add_users_if_available(new_operands[i]); + } + // The elementwise nodes will be created with sliced shape. The original loop + // output corresponds to the dynamic-update-slice's update slice. + auto dus = body_root->mutable_operand(2); + CHECK_EQ(dus->opcode(), HloOpcode::kDynamicUpdateSlice); + outside_to_inside[original_output] = dus->mutable_operand(1); + add_users_if_available(original_output); + std::vector slice_offsets(padded_shape.rank()); + for (int64 i = 0; i < slice_offsets.size(); ++i) { + slice_offsets[i] = dus->mutable_operand(i + 2); + } + auto get_slice = [&](HloInstruction* padded) { + return body->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + padded->shape().element_type()), + padded, slice_offsets, dus->operand(1)->shape().dimensions())); + }; + // Helper functions to create nodes with small operands. 
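The loop right above derives the accumulation-buffer shape for each reduce by dropping the collapsed dimensions and giving the kept dimensions the sizes of the padded loop output. The same computation on plain dimension vectors (illustrative only):

  #include <algorithm>
  #include <cstdint>
  #include <vector>

  // Keeps the non-reduced dimensions of the padded operand; the result is the
  // shape of the zero-initialized buffer that is accumulated into across loop
  // iterations.
  std::vector<int64_t> PaddedReduceBufferDims(
      const std::vector<int64_t>& padded_operand_dims,
      const std::vector<int64_t>& reduced_dims) {
    std::vector<int64_t> kept;
    for (int64_t d = 0; d < static_cast<int64_t>(padded_operand_dims.size()); ++d) {
      if (std::find(reduced_dims.begin(), reduced_dims.end(), d) !=
          reduced_dims.end()) {
        continue;  // dimension collapsed by the reduce
      }
      kept.push_back(padded_operand_dims[d]);
    }
    return kept;
  }

  // E.g., a padded operand of [8, 128, 64] reduced over dimension 1 yields a
  // buffer of shape [8, 64].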
+  auto add_broadcast = [&](const HloInstruction* broadcast) {
+    auto padded_operand_shape = broadcast->operand(0)->shape();
+    for (int64 i = 0; i < broadcast->dimensions().size(); ++i) {
+      padded_operand_shape.set_dimensions(
+          i, padded_shape.dimensions(broadcast->dimensions(i)));
+    }
+    auto padded_operand = PadToShape(outside_to_inside[broadcast->operand(0)],
+                                     padded_operand_shape, nullptr, body);
+    outside_to_inside[broadcast] =
+        get_slice(body->AddInstruction(broadcast->CloneWithNewOperands(
+            ShapeUtil::ChangeElementType(padded_shape,
+                                         padded_operand_shape.element_type()),
+            {padded_operand})));
+  };
+  auto add_iota = [&](const HloInstruction* iota) {
+    outside_to_inside[iota] =
+        get_slice(body->AddInstruction(iota->CloneWithNewOperands(
+            ShapeUtil::ChangeElementType(padded_shape,
+                                         iota->shape().element_type()),
+            {})));
+  };
+  auto add_constant = [&](const HloInstruction* constant) {
+    outside_to_inside[constant] = body->AddInstruction(constant->Clone());
+    outside_to_inside[constant] = get_slice(
+        PadToShape(outside_to_inside[constant],
+                   ShapeUtil::ChangeElementType(
+                       padded_shape, constant->shape().element_type()),
+                   nullptr, body));
+  };
+  while (!worklist.empty()) {
+    auto inst = worklist.back();
+    worklist.pop_back();
+    if (outside_to_inside.count(inst) > 0) {
+      continue;
+    }
+    if (inst->opcode() == HloOpcode::kBroadcast) {
+      add_broadcast(inst);
+    } else if (inst->opcode() == HloOpcode::kIota) {
+      add_iota(inst);
+    } else if (inst->opcode() == HloOpcode::kConstant) {
+      add_constant(inst);
+    } else if (inst->opcode() == HloOpcode::kReduce) {
+      // This is an output, for which we have special handling later.
+    } else {
+      std::vector<HloInstruction*> operands_inside(inst->operand_count());
+      for (int64 i = 0; i < operands_inside.size(); ++i) {
+        operands_inside[i] = outside_to_inside[inst->operand(i)];
+      }
+      outside_to_inside[inst] = body->AddInstruction(inst->CloneWithNewOperands(
+          ShapeUtil::ChangeElementType(dus->operand(1)->shape(),
+                                       inst->shape().element_type()),
+          operands_inside));
+    }
+    add_users_if_available(inst);
+  }
+  std::vector<HloInstruction*> new_outputs_inside(new_operands.size());
+  for (int64 i = 0; i < new_outputs_inside.size(); ++i) {
+    new_outputs_inside[i] = outside_to_inside[new_operands[i]];
+  }
+  // Now create the reduce outputs inside of the loop.
+  for (int64 i = 0; i < reduce_outputs.size(); ++i) {
+    auto reduce_outside = reduce_outputs[i];
+    CHECK_EQ(reduce_outside->opcode(), HloOpcode::kReduce);
+    int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i;
+    auto last_iter_result = outside_to_inside[new_operands[index_in_operand]];
+    auto operand0 = outside_to_inside[reduce_outside->operand(0)];
+    auto operand1 = outside_to_inside[reduce_outside->operand(1)];
+    TF_ASSIGN_OR_RETURN(auto reduce_shape,
+                        ShapeInference::InferReduceShape(
+                            {&operand0->shape(), &operand1->shape()},
+                            reduce_outside->dimensions(),
+                            reduce_outside->to_apply()->ComputeProgramShape()));
+    *reduce_shape.mutable_layout() = reduce_outside->shape().layout();
+    std::vector<HloInstruction*> reduce_dus_offsets;
+    // If any collapsed dimension is windowed, we need to accumulate with last
+    // iteration's result. If such a dimension has padding, we also need to
+    // mask off invalid data.
+ bool needs_accumulate = false; + std::vector dims_to_mask; + for (int64 i = 0; i < slice_offsets.size(); ++i) { + if (absl::c_linear_search(reduce_outside->dimensions(), i)) { + if (reduce_outside->operand(0)->shape().dimensions(i) != + operand0->shape().dimensions(i)) { + needs_accumulate = true; + if (unpadded_shape.dimensions(i) != padded_shape.dimensions(i)) { + dims_to_mask.push_back(i); + } + } + continue; + } + reduce_dus_offsets.push_back(slice_offsets[i]); + } + // Mask off invalid data in collapsed dimensions. + for (int64 dim : dims_to_mask) { + auto iota = body->AddInstruction(HloInstruction::CreateIota( + ShapeUtil::ChangeElementType(operand0->shape(), S32), dim)); + auto add = body->AddInstruction(HloInstruction::CreateBinary( + iota->shape(), HloOpcode::kAdd, iota, + body->AddInstruction(HloInstruction::CreateBroadcast( + iota->shape(), slice_offsets[dim], {})))); + auto limit = body->AddInstruction(HloInstruction::CreateBroadcast( + iota->shape(), + body->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + reduce_outside->operand(0)->shape().dimensions(dim)))), + {})); + auto compare = body->AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(iota->shape(), PRED), add, limit, + ComparisonDirection::kLt)); + operand0 = body->AddInstruction(HloInstruction::CreateTernary( + operand0->shape(), HloOpcode::kSelect, compare, operand0, + body->AddInstruction(HloInstruction::CreateBroadcast( + operand0->shape(), operand1, {})))); + } + auto output_inside = + body->AddInstruction(reduce_outside->CloneWithNewOperands( + reduce_shape, {operand0, operand1})); + // Accumulate with previous results if needed. + if (needs_accumulate) { + auto input_slice = + body->AddInstruction(HloInstruction::CreateDynamicSlice( + output_inside->shape(), last_iter_result, reduce_dus_offsets, + output_inside->shape().dimensions())); + output_inside = body->AddInstruction(HloInstruction::CreateBinary( + output_inside->shape(), + reduce_outside->to_apply()->root_instruction()->opcode(), + output_inside, input_slice)); + } + // Dynamic-update-slice if needed. + if (!ShapeUtil::Compatible(output_inside->shape(), + last_iter_result->shape())) { + output_inside = + body->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + last_iter_result->shape(), last_iter_result, output_inside, + reduce_dus_offsets)); + } + new_outputs_inside[index_in_operand] = output_inside; + } + // Body output. + auto new_output_inside = + body->AddInstruction(HloInstruction::CreateTuple(new_outputs_inside)); + TF_RETURN_IF_ERROR( + body_root->ReplaceOperandWithDifferentShape(2, new_output_inside)); + TF_RETURN_IF_ERROR(body->RemoveInstructionAndUnusedOperands(dus)); + // Replace uses of the reduces outside the loop. 
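The iota/compare/select sequence above masks out the part of a window that lies in the padded region, by checking whether the global index (local iota plus the window offset) is still below the unpadded extent. A scalar sketch of the same masking, with invented names:

  #include <cstdint>
  #include <vector>

  // Positions whose global index (window_offset + local index) falls outside
  // the valid extent are replaced by the reduce's identity so they cannot
  // affect the accumulated result (0 for add, 1 for multiply, -inf for max).
  std::vector<float> MaskPaddedRegion(const std::vector<float>& window,
                                      int64_t window_offset, int64_t valid_size,
                                      float identity) {
    std::vector<float> masked(window.size());
    for (int64_t i = 0; i < static_cast<int64_t>(window.size()); ++i) {
      masked[i] = (window_offset + i < valid_size) ? window[i] : identity;
    }
    return masked;
  }

  // E.g., a window of 4 elements at offset 6 with valid_size 7 keeps only its
  // first element and sets the remaining three to the identity value.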
+ auto new_output_gte = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_output_inside->shape(), loop, 2)); + for (int64 i = 0; i < reduce_outputs.size(); ++i) { + int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; + auto new_output = + computation->AddInstruction(HloInstruction::CreateGetTupleElement( + new_outputs_inside[index_in_operand]->shape(), new_output_gte, + index_in_operand)); + if (!ShapeUtil::Compatible(new_output->shape(), + reduce_outputs[i]->shape())) { + new_output = computation->AddInstruction(HloInstruction::CreateSlice( + reduce_outputs[i]->shape(), new_output, + std::vector(new_output->shape().rank(), 0), + reduce_outputs[i]->shape().dimensions(), + std::vector(new_output->shape().rank(), 1))); + } + TF_RETURN_IF_ERROR(reduce_outputs[i]->ReplaceAllUsesWith(new_output)); + TF_RETURN_IF_ERROR( + computation->RemoveInstructionAndUnusedOperands(reduce_outputs[i])); + } + return Status::OK(); +} + +} // namespace + +Status SpmdPartitioningVisitor::DoCodeMotionForWindowedDotGeneralLoops( + HloComputation* computation) { + for (auto& loop : windowed_dot_general_loops_) { + if (loop.windowed_in_contracting_dims || loop.windowed_in_batch_dims) { + // We have a dynamic-slice for the non-windowed operand in + // batch/contracting-dim windowed dot-general. So moving the + // broadcast/iota/elementwise ops into the loop could help reduce memory + // via fusion. + TF_RETURN_IF_ERROR( + SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( + loop.while_loop, 1 - loop.windowed_operand)); + } + if (!loop.windowed_in_contracting_dims) { + // We have a dynamic-update-slice for the output in + // batch/non-contracting-dim windowed dot-general. So moving reduce ops + // into the loop could help reduce memory. + TF_RETURN_IF_ERROR( + MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( + loop.while_loop)); + } + } + return Status::OK(); +} + +StatusOr SpmdPartitioningVisitor::DoPartition( + HloComputation* computation, const HloSharding& root_sharding) { + VLOG(2) << "Partitioning computation " << computation->name() << " for " + << num_replicas_ << " replicas and " << num_partitions_ + << " partitions"; + TF_RETURN_IF_ERROR(computation->Accept(this)); + + HloModule* module = computation->parent(); + auto new_root = + GetPartitionedHlo(computation->root_instruction()).Reshard(root_sharding); + auto new_computation = + module->AddEmbeddedComputation(b_.Build(new_root.hlo())); + TF_RETURN_IF_ERROR(DoCodeMotionForWindowedDotGeneralLoops(new_computation)); + + // Replace the original computation with the new SPMD computation. 
+ std::unordered_map replacement; + replacement[computation] = new_computation; + module->ReplaceComputations(replacement); + return changed_; +} + +Status SpmdPartitioningVisitor::HandlePartitionId(HloInstruction* hlo) { + return Unimplemented( + "PartitionId instruction is not supported for SPMD partitioning since " + "the meaning is ambiguous -- whether the instruction is replicated or " + "the data is replicated, and if the latter which data is replicated."); +} + +SpmdPartitioner::SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options) + : SpmdPartitioner( + num_partitions, num_replicas, std::move(options), + SPMDCollectiveOpsCreator{ + [](SpmdBuilder* b) { + return b->AddInstruction(HloInstruction::CreatePartitionId()); + }, + [num_replicas](SpmdBuilder* b, HloInstruction* operand, + HloComputation* reduction, int64 channel_id) { + return b->AddInstruction(HloInstruction::CreateAllReduce( + operand->shape(), {operand}, reduction, + CreateReplicaGroups(num_replicas), + /*constrain_layout=*/false, channel_id, + /*use_global_device_ids=*/false)); + }, + [](SpmdBuilder* b, HloInstruction* operand, + std::vector>& src_dst_pairs, + int64 channel_id) { + return b->AddInstruction( + HloInstruction::CreateCollectivePermute( + operand->shape(), operand, src_dst_pairs, channel_id)); + }, + [](SpmdBuilder* b, absl::Span operands, + const std::vector& replica_groups, + int64 channel_id, absl::optional split_dimension) { + std::vector shapes(operands.size(), + operands[0]->shape()); + const Shape output_shape = + (shapes.size() == 1) ? shapes[0] + : ShapeUtil::MakeTupleShape(shapes); + return b->AddInstruction(HloInstruction::CreateAllToAll( + output_shape, operands, replica_groups, + /*constrain_layout=*/false, channel_id, split_dimension)); + }, + }) {} + +StatusOr SpmdPartitioner::PartitionComputation( + HloComputation* computation, const HloSharding& root_sharding, + int64* next_channel_id, SpmdLogger* logger) { + auto visitor = + CreateVisitor(computation, num_partitions_, num_replicas_, + collective_ops_creator_, next_channel_id, logger, options_); + return visitor->DoPartition(computation, root_sharding); +} + +std::unique_ptr SpmdPartitioner::CreateVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options) { + return absl::make_unique( + computation, num_partitions, num_replicas, collective_ops_creator, + next_channel_id, logger, std::move(options), this); +} + +StatusOr SpmdPartitioner::Run(HloModule* module) { + TF_RETURN_IF_ERROR(PreprocessSharding(module)); + + XLA_VLOG_LINES(1, SpmdLogger::ReportBeforePartition( + *module, options_.report_instruction_count)); + + // Add the parameters' and output's shardings to the module. 
+ std::vector entry_params_shardings; + for (int64 i = 0; i < module->entry_computation()->num_parameters(); ++i) { + auto param = module->entry_computation()->parameter_instruction(i); + CHECK(param->has_sharding()) << "Missing sharding in entry parameter " << i; + entry_params_shardings.push_back(param->sharding()); + } + module->set_spmd_parameters_shardings(entry_params_shardings); + auto entry_root = module->entry_computation()->root_instruction(); + CHECK(entry_root->has_sharding()) << "Missing sharding in entry root."; + module->set_spmd_output_sharding(entry_root->sharding()); + + FlattenCallGraph flatten; + TF_ASSIGN_OR_RETURN(auto changed, flatten.Run(module)); + + SpmdLogger logger(options_.report_instruction_count); + auto program_shape = module->entry_computation()->ComputeProgramShape(); + int64 next_channel_id = hlo_query::NextChannelId(*module); + TF_ASSIGN_OR_RETURN( + bool partition_changed, + PartitionComputation( + module->entry_computation(), + module->entry_computation()->root_instruction()->sharding(), + &next_channel_id, &logger)); + changed |= partition_changed; + + // For the entry computation, make sure that the root instruction and the + // parameters preserve their signatures. + auto new_program_shape = module->entry_computation()->ComputeProgramShape(); + if (!options_.allow_module_signature_change) { + TF_RET_CHECK(Shape::Equal().MinorToMajorOnlyInLayout()( + program_shape.result(), new_program_shape.result())) + << "Result shape changed for the entry computation"; + TF_RET_CHECK(program_shape.parameters_size() == + new_program_shape.parameters_size()) + << "Parameter count changed for the entry computation"; + for (int64 i = 0; i < program_shape.parameters_size(); ++i) { + TF_RET_CHECK(Shape::Equal().MinorToMajorOnlyInLayout()( + program_shape.parameters(i), new_program_shape.parameters(i))) + << "Parameter shape changed for the entry computation"; + } + } else { + const auto& old_entry_layout = module->entry_computation_layout(); + // Shapes can change but the layout should still remain the same. 
+ for (int64 i = 0; i < new_program_shape.parameters_size(); ++i) { + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + old_entry_layout.parameter_shape(i), + new_program_shape.mutable_parameters(i))); + } + TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( + old_entry_layout.result_shape(), new_program_shape.mutable_result())); + + HloModuleConfig config = module->config(); + *config.mutable_entry_computation_layout() = + ComputationLayout(new_program_shape, /*ignore_layouts=*/false); + module->set_config(config); + } + + XLA_VLOG_LINES(1, SpmdLogger::ReportAfterPartition( + *module, options_.report_instruction_count)); + XLA_VLOG_LINES(1, logger.MakeReport()); + + if (changed) { + HloPassPipeline pass("spmd-cleanup"); + pass.AddPass(); + pass.AddPass(); + pass.AddPass(/*is_layout_sensitive=*/true); + pass.AddPass(); + TF_RETURN_IF_ERROR(pass.Run(module).status()); + } + + TF_RETURN_IF_ERROR(ClearShardingAttributes(module)); + return changed; +} + +Status SpmdPartitioner::PreprocessSharding(HloModule* module) { + for (HloComputation* computation : module->computations()) { + for (HloInstruction* hlo : computation->instructions()) { + if (hlo->HasSideEffectNoRecurse() && hlo->opcode() != HloOpcode::kRng) { + TF_RET_CHECK(hlo->has_sharding()) + << "Side-effect HLO must have sharding: " << hlo->ToString(); + TF_RET_CHECK(!HasReplicatedSharding(hlo->sharding()) || + hlo->opcode() == HloOpcode::kInfeed) + << "Non-infeed side-effect HLO cannot have a replicated sharding:" + << hlo->ToString(); + } + + // For unassigned HLOs, annotate with replicated sharding. + // + // Among side-effecting ops, only Rng is allowed to omit the annotation. + // In that case, we currently force it to run on core 0, since we don't + // support partitioning or replicating the Rng op (the values depend on + // the seed provided to each device). + // + // TODO(hyouklee): Should we also convert single-device shardings (without + // side-effects) into replicated? + if (!hlo->has_sharding()) { + if (hlo->opcode() == HloOpcode::kRng) { + hlo->set_sharding(HloSharding::AssignDevice(0)); + } else { + hlo->set_sharding( + HloSharding::Single(hlo->shape(), HloSharding::Replicate())); + } + } else if (!hlo->sharding().IsTileMaximal()) { + std::vector available(num_partitions_); + std::iota(available.begin(), available.end(), 0); + TF_RET_CHECK(num_partitions_ == hlo_sharding_util::DevicesForSharding( + hlo->sharding(), available) + .size()) + << "num_partitions:" << num_partitions_ << "\n" + << "SPMD partitioner only supports tile sharding that includes all " + "partitions. If you didn't add this sharding annotation in the " + "model, please file a bug to XLA team.\n" + << hlo->ToString(); + } + } + } + + // Entry computation's parameter and root sharding must be either all + // replicated or all on a single device. 
+ if (!options_.allow_module_signature_change) { + const HloComputation* entry = module->entry_computation(); + TF_RET_CHECK(entry->root_instruction()->has_sharding()); + const HloSharding& root_sharding = entry->root_instruction()->sharding(); + TF_RET_CHECK(root_sharding.IsReplicated() || + root_sharding.UniqueDevice().has_value()) + << "Unsupported entry root sharding: " << root_sharding.ToString(); + + for (const HloInstruction* param : entry->parameter_instructions()) { + TF_RET_CHECK(param->has_sharding()); + TF_RET_CHECK(param->sharding().IsReplicated() || + param->sharding().UniqueDevice().has_value()) + << "Unsupported entry parameter sharding:" + << param->sharding().ToString(); + } + } + + return Status::OK(); +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h new file mode 100644 index 00000000000..09d2c4af908 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -0,0 +1,435 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" + +namespace xla { +namespace spmd { + +struct SpmdPartitionerOptions { + // Always exchange halo on LHS for all convolutions. If false, backprop filter + // convolution exchanges halo on RHS. + bool conv_halo_exchange_always_on_lhs = true; + + // The number of instructions to be reported for the highest memory profile + // instructions. + int64 report_instruction_count = 5; + + // The minimum size in MiB of an einsum operand to be considered using + // windowed implementation in an HLO loop. + int64 threshold_for_windowed_einsum_mib = 256; + + // Whether the entry computations' signature could change after partitioning. + bool allow_module_signature_change = false; +}; + +// Class to wrap the computation builder to capture information during SPMD +// transformation. 
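The threshold_for_windowed_einsum_mib option declared above is compared against the byte size of the operand that would be streamed through the windowed loop (see the ByteSizeOf checks in HandleDotHelper earlier in this patch). A small sketch of how such a MiB threshold translates into a byte comparison (illustrative; not the partitioner's API):

  #include <cstdint>

  // An operand is only worth streaming through the windowed dot-general loop
  // if its size reaches the configured number of MiB.
  bool UseWindowedEinsum(int64_t operand_bytes, int64_t threshold_mib) {
    return operand_bytes >= threshold_mib * 1024 * 1024;
  }

  // With the default of 256 MiB, an f32[4096,16384] operand
  // (4096 * 16384 * 4 bytes = 256 MiB) already qualifies.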
+class SpmdBuilder : public HloComputation::Builder { + public: + SpmdBuilder(const std::string& name, HloInstruction* hlo) + : HloComputation::Builder(name) { + visiting_hlo_ = hlo; + } + HloInstruction* AddInstruction(std::unique_ptr instruction); + + const std::vector& derived_instructions( + HloInstruction* hlo) { + return instructions_.at(hlo); + } + + void set_visiting_hlo(HloInstruction* hlo) { visiting_hlo_ = hlo; } + + HloInstruction* visiting_hlo() const { return visiting_hlo_; } + + private: + // Currently visiting instruction. + HloInstruction* visiting_hlo_; + + // Map from the currently visiting (old) instruction to new instructions + // created during SPMD partitioning. + HloInstructionMap> instructions_; +}; + +// A set of functions that create the cross-partition collective ops. +struct SPMDCollectiveOpsCreator { + // Function used to create a partition ID HLO. + std::function create_partition_id; + + // Function used to create a cross-partition all-reduce HLO. + std::function + create_cross_partition_all_reduce; + + // Function used to create a cross-partition collective-permute HLO. + std::function>& src_dst_pairs, + int64 next_channel_id)> + create_cross_partition_collective_permute; + + // Function used to create a cross-partition all-to-all HLO. + std::function operands, + const std::vector& replica_groups, int64 channel_id, + absl::optional split_dimension)> + create_cross_partition_all_to_all; +}; + +// Logger to report memory usage during SPMD partitioning. +class SpmdLogger { + public: + explicit SpmdLogger(int64 report_instruction_count) + : report_instruction_count_(report_instruction_count) {} + static std::string ReportBeforePartition(const HloModule& module, + int64 report_instruction_count); + static std::string ReportAfterPartition(const HloModule& module, + int64 report_instruction_count); + + // Registers the logging for the groups of instructions created to transform + // the given hlo. + void RegisterLogEntry(HloInstruction* hlo, + const std::vector& group); + + std::string MakeReport(); + + private: + template + static std::string ReportMemoryUsage(const HloModule& module, const F& filter, + int64 report_instruction_count); + + // A vector of logging messages (one for each original HLO instruction), where + // the first integer of the pair represents the size of the HBM used. + std::vector> entries_; + + int64 report_instruction_count_; +}; + +class SpmdPartitioningVisitor; + +class SpmdPartitioner : public HloModulePass { + public: + SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options); + SpmdPartitioner(int64 num_partitions, int64 num_replicas, + SpmdPartitionerOptions options, + SPMDCollectiveOpsCreator collective_ops_creator) + : num_partitions_(num_partitions), + num_replicas_(num_replicas), + options_(std::move(options)), + collective_ops_creator_(std::move(collective_ops_creator)) {} + absl::string_view name() const override { return "spmd-partitioning"; } + StatusOr Run(HloModule* module) override; + + // Transforms the given computation with SPMD instructions, replacing it with + // a new computation. 
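SPMDCollectiveOpsCreator above bundles the collective-building callbacks as std::function members, so a backend or a test can swap in its own lambdas without subclassing the partitioner. A simplified, self-contained sketch of the same idiom (the types here are stand-ins, not the XLA declarations):

  #include <cstdint>
  #include <functional>
  #include <iostream>
  #include <string>

  // A bundle of hooks for creating collectives; strings stand in for
  // HloInstruction* and SpmdBuilder*.
  struct CollectiveHooks {
    std::function<std::string()> create_partition_id;
    std::function<std::string(const std::string& operand, int64_t channel_id)>
        create_cross_partition_all_reduce;
  };

  int main() {
    CollectiveHooks hooks{
        [] { return std::string("partition-id"); },
        [](const std::string& operand, int64_t channel_id) {
          return "all-reduce(" + operand +
                 ", channel=" + std::to_string(channel_id) + ")";
        }};
    std::cout << hooks.create_cross_partition_all_reduce(
                     hooks.create_partition_id(), 7)
              << "\n";
  }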
+ StatusOr PartitionComputation(HloComputation* computation, + const HloSharding& root_sharding, + int64* next_channel_id, + SpmdLogger* logger); + + protected: + virtual std::unique_ptr CreateVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options); + + private: + // Verify that the sharding of instructions in the module are valid, and also + // fill in missing sharding information. + Status PreprocessSharding(HloModule* module); + + const int64 num_partitions_; + const int64 num_replicas_; + + SpmdPartitionerOptions options_; + SPMDCollectiveOpsCreator collective_ops_creator_; +}; + +// Class describes partition state of the data represented by an HLO created +// during SPMD partitioning pass. +// +// Data on some devices may include padding region, if the base (full) shape +// could not be evenly partitioned. +class PartitionedHlo { + public: + // Return value for ReshardAsWindowedInput which describes the resharded HLO, + // the window for the user on the shard, and if necessary, the dynamic slice + // offsets to be applied to the output of the op being sharded. + struct WindowedInputShardReturnValue { + HloInstruction* sharded_input; + Window shard_window; + absl::optional> dynamic_slice_index_on_output; + }; + // A cache for resharding each partitioned HLO. + struct ReshardCache { + struct PerHloCache { + std::vector> reshard_cache; + std::vector< + std::tuple> + window_reshard_cache; + }; + std::unordered_map per_hlo_cache; + }; + struct PartitioningState { + SpmdBuilder* b; + HloModule* module; + int64 num_replicas; + HloInstruction* partition_id; + SPMDCollectiveOpsCreator collective_ops_creator; + int64* next_channel_id; + ReshardCache* reshard_cache; + }; + PartitionedHlo(HloInstruction* hlo, Shape base_shape, PartitioningState state) + : hlo_(hlo), base_shape_(base_shape), state_(std::move(state)) { + CHECK(hlo->has_sharding()) + << "PartitionedHlo is missing sharding:" << hlo->ToString(); + // If the tuple shape instruction does not have a tuple sharding, reassign + // to use the tuple sharding. Reshard() implementation assumes this. + if (hlo_->shape().IsTuple() && !hlo_->sharding().IsTuple()) { + hlo_->set_sharding( + hlo_->sharding().GetTupleSharding(hlo_->shape()).ValueOrDie()); + } + } + + // Reshards the current SPMD instruction to a new sharding. Could only modify + // the reshard cache. + PartitionedHlo Reshard(const HloSharding& target); + + // Pads the garbage area of the output with the provided value. + PartitionedHlo PadWithValue(HloInstruction* pad_value) const; + + // Returns the SPMD instruction. + HloInstruction* hlo() const { return hlo_; } + + // Returns the sharding of the SPMD instruction. + const HloSharding& sharding() const { return hlo_->sharding(); } + + // Original full shape of the data. + const Shape& base_shape() const { return base_shape_; } + + int64 NewChannel() const { return (*state_.next_channel_id)++; } + + // Reshards the HLO to a usable partitioned input for a windowed user. Could + // only modify the reshard cache. + absl::optional ReshardAsWindowedInput( + const Window& window, const HloSharding& target, + HloInstruction* pad_value, bool mask_invalid_region = true); + + private: + // Same as Reshard except that it does not explicitly modify the reshard + // cache, although it would indirectly modify by calling Replicate(). 
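PartitionedHlo::ReshardCache above memoizes resharding results per HLO, keyed by the target sharding, so repeated reshards to the same target can reuse previously built instructions. A simplified sketch of such a per-HLO cache (strings stand in for HloSharding and for the resharded instruction; not the actual class):

  #include <string>
  #include <utility>
  #include <vector>

  struct PerHloReshardCache {
    // (target sharding, resharded result) pairs; the list stays short, so a
    // linear scan is enough.
    std::vector<std::pair<std::string, std::string>> reshard_cache;

    const std::string* Find(const std::string& target_sharding) const {
      for (const auto& entry : reshard_cache) {
        if (entry.first == target_sharding) return &entry.second;
      }
      return nullptr;
    }

    void Insert(std::string target_sharding, std::string result) {
      reshard_cache.emplace_back(std::move(target_sharding), std::move(result));
    }
  };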
+ PartitionedHlo ReshardNoCache(const HloSharding& target); + + // Helper function to replicate the data on all devices. Could only modify + // the reshard cache. + PartitionedHlo Replicate(); + + // Helper function to broadcast data from a single device to all devices. + PartitionedHlo Broadcast() const; + + // Helper function to reshard the tensor using AllToAll (instead of the + // default of Replicate followed by Slice). + PartitionedHlo ReshardWithAllToAll(const HloSharding& target) const; + + // Helper function to reshard the tensor using CollectivePermute. + PartitionedHlo ReshardWithCollectivePermute(const HloSharding& target) const; + + // SPMD instruction. + HloInstruction* hlo_; + + // The original shape of the data before SPMD transformation is applied. + Shape base_shape_; + + PartitioningState state_; +}; + +struct DotGeneralDimsMapping { + // The dimension numbers for the operands and output corresponding to a + // logical dimension (e.g., batch, contracting, non-contracting). If an + // operand or the output doesn't have the logical dimension, it is set to + // -1. + struct DimsMapping { + int64 lhs; + int64 rhs; + int64 output; + }; + std::vector batch_dims; + std::vector contracting_dims; + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; +}; + +class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { + public: + SpmdPartitioningVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options, SpmdPartitioner* partitioner); + + Status DefaultAction(HloInstruction* hlo) override; + Status HandleAllReduce(HloInstruction* hlo) override; + Status HandleBroadcast(HloInstruction* hlo) override; + Status HandleConstant(HloInstruction* hlo) override; + Status HandleCustomCall(HloInstruction* hlo) override; + Status HandleDot(HloInstruction* hlo) override; + Status HandleDynamicSlice(HloInstruction* hlo) override; + Status HandleDynamicUpdateSlice(HloInstruction* hlo) override; + Status HandleGather(HloInstruction* hlo) override; + Status HandleGetTupleElement(HloInstruction* hlo) override; + Status HandleInfeed(HloInstruction* hlo) override; + Status HandleOutfeed(HloInstruction* hlo) override; + Status HandlePad(HloInstruction* hlo) override; + Status HandleParameter(HloInstruction* hlo) override; + Status HandleReduce(HloInstruction* hlo) override; + Status HandleReverse(HloInstruction* hlo) override; + Status HandleWhile(HloInstruction* hlo) override; + Status HandleConditional(HloInstruction* hlo) override; + Status HandleReduceWindow(HloInstruction* hlo) override; + Status HandleSelectAndScatter(HloInstruction* hlo) override; + Status HandleTuple(HloInstruction* hlo) override; + Status HandleRng(HloInstruction* hlo) override; + Status HandleConvolution(HloInstruction* hlo) override; + Status HandleConcatenate(HloInstruction* hlo) override; + Status HandleScatter(HloInstruction* hlo) override; + Status HandleSlice(HloInstruction* hlo) override; + Status HandleSort(HloInstruction* hlo) override; + Status HandleTranspose(HloInstruction* hlo) override; + Status HandleReshape(HloInstruction* hlo) override; + Status HandleIota(HloInstruction* hlo) override; + Status HandlePartitionId(HloInstruction* hlo) override; + + // Handles convolution where both LHS and RHS operands are tiled. 
+ Status HandleConvolutionTiledLhsAndRhs(HloInstruction* hlo); + + // Implementation of dot partitioning given DotGeneralDimsMapping. + Status HandleDotHelper( + HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot); + + // Common handle for elementwise HLOs. + Status HandleElementwise(HloInstruction* hlo); + + // Common handle for HLOs that runs on a single device. + Status HandleSingleDevice(const HloInstruction* hlo); + + // Returns the PartitionedHlo that corresponds to the original hlo. + PartitionedHlo& GetPartitionedHlo(const HloInstruction* hlo) { + CHECK_EQ(partitioned_instructions_.count(hlo), 1); + return partitioned_instructions_.find(hlo)->second; + } + + // Sets the PartitionedHlo for the original hlo. + void SetPartitionedHlo(const HloInstruction* hlo, + const PartitionedHlo& partitioned_hlo) { + CHECK_EQ(partitioned_instructions_.count(hlo), 0); + partitioned_instructions_.emplace(hlo, partitioned_hlo); + changed_ = true; + } + + // Convenient wrapper that creates PartitionedHlo from the result of the func + // and maps it to the given original hlo. + void SetPartitionedHlo(const HloInstruction* hlo, + const std::function& func) { + HloInstruction* new_hlo = func(); + new_hlo->set_sharding(hlo->sharding()); + new_hlo->set_metadata(hlo->metadata()); + SetPartitionedHlo( + hlo, PartitionedHlo(new_hlo, hlo->shape(), MakePartitioningState())); + changed_ = true; + } + + int64 NewChannel() { return (*next_channel_id_)++; } + + PartitionedHlo::PartitioningState MakePartitioningState() { + return PartitionedHlo::PartitioningState{ + .b = &b_, + .module = module_, + .num_replicas = num_replicas_, + .partition_id = partition_id_, + .collective_ops_creator = collective_ops_creator_, + .next_channel_id = next_channel_id_, + .reshard_cache = &reshard_cache_}; + } + + SpmdBuilder* builder() { return &b_; } + + StatusOr DoPartition(HloComputation* computation, + const HloSharding& root_sharding); + + private: + Status Preprocess(HloInstruction* hlo) override; + Status Postprocess(HloInstruction* hlo) override; + + // Performs code motion for windowed dot-general loops in + // windowed_dot_general_loops_. Invoked after the visitor finishes traversing + // the graph. + Status DoCodeMotionForWindowedDotGeneralLoops(HloComputation* computation); + + bool changed_; + HloModule* module_; + int64 num_partitions_; + int64 num_replicas_; + + SPMDCollectiveOpsCreator collective_ops_creator_; + + // Tracks the next channel id to use for cross-partition all-reduce. + int64* next_channel_id_; + SpmdBuilder b_; + + HloInstruction* partition_id_; + + PartitionedHlo::ReshardCache reshard_cache_; + + // Mapping from the instruction in the original computation to the new SPMD + // partitioned instruction. + ConstHloInstructionMap partitioned_instructions_; + + // Information about a loop created for windowed dot-general. Used when + // DoCodeMotionForWindowedDotGeneralLoops() executes after the visitor + // finishes traversing the graph. 
+ struct WindowedDotGeneralLoop { + HloInstruction* while_loop; + int64 windowed_operand; + bool windowed_in_contracting_dims; + bool windowed_in_batch_dims; + }; + std::vector windowed_dot_general_loops_; + + HloInstruction* visiting_hlo_; + SpmdLogger* logger_; + const SpmdPartitionerOptions options_; + SpmdPartitioner* partitioner_; +}; + +} // namespace spmd +} // namespace xla +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc new file mode 100644 index 00000000000..7a7f2dcc807 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -0,0 +1,3191 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace spmd { +namespace { + +using ::testing::AllOf; +namespace op = xla::testing::opcode_matchers; + +class SpmdPartitioningTest : public HloTestBase { + public: + StatusOr> PartitionComputation( + const char* hlo_module, int64 num_devices, + bool conv_halo_exchange_always_on_lhs = true) { + // Some tests (BackpropFilter convs) set this flag false to test two + // different paths of the implementation. 
+ SpmdPartitionerOptions options; + options.conv_halo_exchange_always_on_lhs = conv_halo_exchange_always_on_lhs; + options.allow_module_signature_change = true; + + TF_ASSIGN_OR_RETURN(auto module, ParseAndReturnVerifiedModule( + hlo_module, GetModuleConfigForTest())); + HloPassPipeline pass("spmd-partitioning"); + pass.AddPass(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + pass.AddPass(num_devices, /*num_replicas=*/1, options); + pass.AddPass(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + TF_RETURN_IF_ERROR(pass.Run(module.get()).status()); + return StatusOr>(std::move(module)); + } +}; + +TEST_F(SpmdPartitioningTest, InvalidSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[8,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[8,2]{1,0} get-tuple-element(infeed), index=0, + sharding={maximal device=0} +})"; + auto module_status = PartitionComputation(hlo_string, /*num_devices=*/4); + EXPECT_FALSE(module_status.status().ok()); + EXPECT_THAT(module_status.status().ToString(), + ::testing::HasSubstr( + "only supports tile sharding that includes all partitions")); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Copy(op::AllReduce( + op::Select(op::Broadcast(op::Compare()), + op::Constant(), op::Broadcast()))), + op::Shape("s32[2,3]"))); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={maximal device=1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + VLOG(1) << module->ToString(); + EXPECT_THAT(root, op::Copy(AllOf(op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare()), + op::Constant(), op::Broadcast()))), + op::Shape("s32[2,3]")))); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), + sharding={devices=[2,1]1,0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Copy(op::DynamicSlice( + op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::Constant(), op::Broadcast())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant())), + op::Shape("s32[1,3]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToReplicated) { + const char* const hlo_string = R"( +HloModule module + 
+ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::Copy(op::AllReduce(AllOf( + op::DynamicUpdateSlice( + op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant()), + op::Shape("s32[2,3]"))))); +} + +TEST_F(SpmdPartitioningTest, TiledToSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::Copy(op::Copy(op::AllReduce(AllOf( + op::DynamicUpdateSlice( + op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant()), + op::Shape("s32[2,3]")))))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledEven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param= s32[8,2]{1,0} parameter(0), sharding={devices=[2,1]0,1} + ROOT %copy = s32[8,2]{1,0} copy(%param), sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Copy(op::Reshape(op::Transpose(op::AllToAll(AllOf( + op::Reshape(op::Parameter()), op::Shape("s32[4,2,1]")))))), + op::Shape("s32[8,1]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledUneven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param= f32[7,31,128]{2,1,0} parameter(0), sharding={devices=[1,2,1]0,1} + ROOT %copy = f32[7,31,128]{2,1,0} copy(%param), sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Copy(op::Slice(op::Reshape(AllOf(op::Transpose(op::AllToAll( + op::Reshape(AllOf(op::Pad(), op::Shape("f32[8,16,128]"))))))))))); +} + +TEST_F(SpmdPartitioningTest, GetTupleElementSwapDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param.0 = (f32[2,3]{1,0}, u32[]) parameter(0), + sharding={{maximal device=1}, {maximal device=1}} + %gte.0 = f32[2,3]{1,0} get-tuple-element(%param.0), index=0, + sharding={maximal device=0} + %gte.1 = u32[] get-tuple-element(%param.0), index=1, + sharding={maximal device=0} + ROOT %tuple = (f32[2,3]{1,0}, u32[]) tuple(%gte.0, %gte.1), + sharding={{maximal device=0},{maximal device=0}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Tuple()); + + EXPECT_THAT(root->operand(0), + op::Copy(op::AllReduce(op::Select( + 
op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::GetTupleElement(op::Parameter()), op::Broadcast())))); + EXPECT_THAT(root->operand(1), + op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::GetTupleElement(op::Parameter()), op::Broadcast())))); +} + +TEST_F(SpmdPartitioningTest, GetTupleElementTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param.0 = (f32[2,3]{1,0}, u32[2,3]{1,0}) parameter(0), + sharding={{replicated}, {replicated}} + gte.0 = f32[2,3]{1,0} get-tuple-element(param.0), index=0, + sharding={devices=[2,1]0,1} + gte.1 = u32[2,3]{1,0} get-tuple-element(param.0), index=1, + sharding={devices=[2,1]0,1} + ROOT %tuple = (f32[2,3]{1,0}, u32[2,3]{1,0}) tuple(gte.0, gte.1), + sharding={{devices=[2,1]0,1},{devices=[2,1]0,1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Tuple()); + + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + + EXPECT_THAT(root->operand(0), + op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, + op::Constant())); + EXPECT_THAT(root->operand(1), + op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, + op::Constant())); +} + +TEST_F(SpmdPartitioningTest, TiledInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[8,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[8,2]{1,0} get-tuple-element(infeed), index=0, + sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(), + op::GetTupleElement( + AllOf(op::Infeed(), op::Shape("(f32[4,2]{1,0}, token[])"))), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, UnevenTiledInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[9,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[9,2]{1,0} get-tuple-element(infeed), index=0, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[5,2]"), op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), + op::AfterAll(), op::AfterAll())))); + EXPECT_THAT( + root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("(f32[5,2], token[])"), op::Infeed(op::Parameter()))); + auto second_infeed = + AllOf(op::Shape("(f32[4,2], token[])"), op::Infeed(op::Parameter())); + EXPECT_THAT(root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("(f32[5,2], token[])"), + op::Tuple(op::Pad(op::GetTupleElement(second_infeed), + op::Constant()), + op::GetTupleElement(second_infeed)))); +} + 
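+// Like UnevenTiledInfeed above, but the infeed payload is a tuple. The branch
+// that receives the smaller shard is expected to pad only the tiled leaf
+// (f32[4,2] -> f32[5,2]) and to leave the replicated f32[2] leaf untouched.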
+TEST_F(SpmdPartitioningTest, UnevenTiledTupleInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = ((f32[9,2]{1,0}, f32[2]{0}), token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {replicated}, {maximal device=0}} + ROOT infeed.data = (f32[9,2]{1,0}, f32[2]{0}) get-tuple-element(infeed), + index=0, sharding={{devices=[2,1]0,1}, {replicated}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("(f32[5,2], f32[2])"), + op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), op::AfterAll(), + op::AfterAll())))); + EXPECT_THAT(root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("((f32[5,2], f32[2]), token[])"), + op::Infeed(op::Parameter()))); + auto second_infeed = AllOf(op::Shape("((f32[4,2], f32[2]), token[])"), + op::Infeed(op::Parameter())); + EXPECT_THAT( + root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("((f32[5,2], f32[2]), token[])"), + op::Tuple(op::Tuple(op::Pad(op::GetTupleElement( + op::GetTupleElement(second_infeed)), + op::Constant()), + op::GetTupleElement( + op::GetTupleElement(second_infeed))), + op::GetTupleElement(second_infeed)))); +} + +TEST_F(SpmdPartitioningTest, TiledToReplicatedReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce = f32[] reduce(constant, constant.1), dimensions={0,1}, + to_apply=sum, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::AllReduce(op::Reduce( + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Reshape())), + op::Broadcast(op::Constant())), + AllOf(op::Shape("f32[2,3]{1,0}"), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())), + op::Broadcast(op::Constant())), + op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, TiledElementwise) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[3,3]{1,0} constant({{2,2,2},{2,2,2},{2,2,2}}), + sharding={replicated} + multiply = f32[3,3]{1,0} multiply(constant, constant.1), + sharding={devices=[2,1]0,1} + ROOT add = f32[3,3]{1,0} add(multiply, constant.1), + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Shape("f32[2,3]{1,0}"), + op::Add(op::Multiply( + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant()), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())))); +} + 
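+// A cross-replica all-reduce is elementwise with respect to the tiled
+// dimension, so partitioning should keep the all-reduce in place and simply
+// apply it to the per-shard parameter (f32[2,3] with 2 partitions).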
+TEST_F(SpmdPartitioningTest, TiledAllReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + parameter = f32[3,3]{1,0} parameter(0), sharding={devices=[2,1]0,1} + ROOT all-reduce = f32[3,3]{1,0} all-reduce(parameter), to_apply=sum, + replica_groups={}, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[2,3]{1,0}"), op::AllReduce(op::Parameter(0)))); +} + +TEST_F(SpmdPartitioningTest, BroadcastOnlyNewDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[3,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,4,3]{2,1,0}"), + op::Broadcast(op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, BroadcastOnlyOldDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2,3]{2,1,0}"), + op::Broadcast(op::DynamicSlice( + op::Constant(), op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, BroadcastBothOldAndNewDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,2,3]{2,1,0}"), + op::Broadcast(AllOf(op::Shape("f32[2,3]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), + op::Constant()))))); +} + +TEST_F(SpmdPartitioningTest, BroadcastPropagateTiledSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,4,1},{1,3,1},{1,2,1}}), + sharding={devices=[2,1]0,1} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2,3]{2,1,0}"), + op::Broadcast(op::DynamicSlice( + op::Constant(), op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, OutfeedSingleDevice) { + 
const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token.0 = token[] after-all() + data = f32[1024]{0} parameter(0), sharding={maximal device=0} + outfeed = token[] outfeed(data, token.0), sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("token[]"), + op::Conditional( + op::Compare(op::PartitionId(), op::Constant()), + op::Tuple(op::Parameter(0), op::AfterAll()), + op::Tuple(op::Parameter(0), op::AfterAll())))); + + HloInstruction* root_b0 = root->branch_computation(0)->root_instruction(); + EXPECT_THAT(root_b0, + AllOf(op::Shape("token[]"), + op::Outfeed(op::GetTupleElement(op::Parameter(), 0), + op::GetTupleElement(op::Parameter(), 1)))); + + HloInstruction* root_b1 = root->branch_computation(1)->root_instruction(); + EXPECT_THAT(root_b1, AllOf(op::Shape("token[]"), op::AfterAll())); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowReplicatedInput) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[6,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1},{1,2},{2,2}}), + sharding={replicated} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[3,2]{1,0} reduce-window(constant, constant.1), + window={size=3x1 stride=2x1 pad=1_0x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow( + op::DynamicSlice(AllOf(op::Shape("f32[9,2]{1,0}"), + op::Pad(op::Constant(), op::Constant())), + op::Multiply(op::Reshape(), op::Constant()), + op::Constant()), + op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledNegativeLeftHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[6,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1},{1,2},{2,2}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT %reduce-window = f32[3,2]{1,0} reduce-window(%constant, %constant.1), + window={size=3x1 stride=2x1 pad=0_1x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto right_halo = AllOf(op::Shape("f32[2,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = op::DynamicSlice( + AllOf( + op::Shape("f32[6,2]{1,0}"), + op::Pad(op::Concatenate(sharded_input, right_halo), op::Constant())), + op::Reshape(), op::Constant()); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = + op::Select(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow(masked, 
op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideUnequalHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[9,2]{1,0} constant( + {{1,1},{1,4},{2,1},{3,1},{1,2},{2,2},{4,1},{1,2},{2,1}}), + sharding={devices=[3,1]0,1,2} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[5,2]{1,0} reduce-window(constant, constant.1), + window={size=3x1 stride=2x1 pad=1_1x0_0}, to_apply=sum, + sharding={devices=[3,1]0,1,2} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/3)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto right_halo = AllOf(op::Shape("f32[2,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = op::DynamicSlice( + AllOf( + op::Shape("f32[7,2]{1,0}"), + op::Pad(op::Concatenate(sharded_input, right_halo), op::Constant())), + op::Reshape(), op::Constant()); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = op::Select( + op::And(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + op::Compare(index_in_padded, op::Broadcast(op::Constant()))), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledTwoSideHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[4,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[2,2]{1,0} reduce-window(constant, constant.1), + window={size=5x1 stride=3x1 pad=2_2x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto left_halo = AllOf(op::Shape("f32[1,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto right_halo = AllOf(op::Shape("f32[1,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = AllOf( + op::Shape("f32[5,2]{1,0}"), + op::DynamicSlice( + AllOf(op::Shape("f32[6,2]{1,0}"), + op::Pad(op::Concatenate(left_halo, sharded_input, right_halo), + op::Constant())), + op::Reshape(), op::Constant())); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = op::Select( + op::And(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + op::Compare(index_in_padded, op::Broadcast(op::Constant()))), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiled2D) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + token0 = 
token[] after-all(), sharding={maximal device=0} + infeed = (f32[4,4,2,2]{3,2,1,0}, token[]) infeed(token0), + sharding={{devices=[2,2,1,1]0,1,2,3}, {maximal device=0}} + infeed.data = f32[4,4,2,2]{3,2,1,0} get-tuple-element(infeed), index=0, + sharding={devices=[2,2,1,1]0,1,2,3} + constant = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[2,2,2,2]{3,2,1,0} reduce-window(infeed.data, constant), + window={size=5x5x1x1 stride=3x3x1x1 pad=2_2x2_2x0_0x0_0}, to_apply=sum, + sharding={devices=[2,2,1,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = AllOf(op::Shape("f32[2,2,2,2]{3,2,1,0}"), + op::GetTupleElement(op::Infeed())); + auto dim0_left_halo = AllOf(op::Shape("f32[1,2,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto dim0_right_halo = AllOf(op::Shape("f32[1,2,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto dim0_pre_masking = op::DynamicSlice( + AllOf(op::Shape("f32[6,2,2,2]{3,2,1,0}"), + op::Pad( + op::Concatenate(dim0_left_halo, sharded_input, dim0_right_halo), + op::Constant())), + op::Reshape(), op::Constant(), op::Constant(), op::Constant()); + auto dim0_index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto dim0_masked = op::Select( + op::And(op::Compare(dim0_index_in_padded, op::Broadcast(op::Constant())), + op::Compare(dim0_index_in_padded, op::Broadcast(op::Constant()))), + dim0_pre_masking, op::Broadcast(op::Constant())); + auto dim0_resharded = AllOf(op::Shape("f32[5,2,2,2]{3,2,1,0}"), dim0_masked); + auto dim1_left_halo = AllOf(op::Shape("f32[5,1,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(dim0_resharded))); + auto dim1_right_halo = + AllOf(op::Shape("f32[5,1,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(dim0_resharded))); + auto dim1_pre_masking = op::DynamicSlice( + AllOf(op::Shape("f32[5,6,2,2]{3,2,1,0}"), + op::Pad(op::Concatenate(dim1_left_halo, dim0_resharded, + dim1_right_halo), + op::Constant())), + op::Constant(), op::Reshape(), op::Constant(), op::Constant()); + auto dim1_index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto dim1_masked = op::Select( + op::And(op::Compare(dim1_index_in_padded, op::Broadcast(op::Constant())), + op::Compare(dim1_index_in_padded, op::Broadcast(op::Constant()))), + dim1_pre_masking, op::Broadcast(op::Constant())); + auto dim1_resharded = AllOf(op::Shape("f32[5,5,2,2]{3,2,1,0}"), dim1_masked); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,1,2,2]{3,2,1,0}"), + op::ReduceWindow(dim1_resharded, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(f32[128,224,224,3] %lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(f32[7,7,3,64] %rhs), + sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution( + f32[128,224,224,3] %lhs.copy, + f32[7,7,3,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = 
module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT(root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicatedNeedReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(f32[128,224,224,3] %lhs), + sharding={devices=[2,1,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(f32[7,7,3,64] %rhs), + sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution( + f32[128,224,224,3] %lhs.copy, + f32[7,7,3,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,224,224,3]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(lhs)), op::Shape("f32[64,2,112,224,3]")); + auto reshard_lhs = AllOf(op::Reshape(op::Transpose(all_to_all)), + op::Shape("f32[128,112,224,3]")); + + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(reshard_lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(reshard_lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT( + root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, reshard_lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicatedReordered) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[224,224,3,128] parameter(0) + %lhs.copy = f32[224,224,3,128] copy(%lhs), sharding={devices=[2,1,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(%rhs), sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=01fb_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[112,224,3,128]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[3,224,3,128]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[2,224,3,128]")); + EXPECT_THAT(root, 
+ AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +// (stride * per_shard_window_count) % dilation == 0 +TEST_F(SpmdPartitioningTest, + ConvolutionBaseDilationSameStartPatternLhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,7,7,512] parameter(0) + %lhs.copy = f32[128,7,7,512] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[3,3,512,512] parameter(1) + %rhs.copy = f32[3,3,512,512] copy(%rhs), + sharding={replicated} + ROOT %conv = f32[128,4,4,512] convolution(%lhs.copy, %rhs.copy), + window={size=3x3 stride=4x4 pad=1_1x1_1 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + // There is no halo exchange, and because the last element in the shard is not + // needed (stride == 4), the LHS will be just a slice. + auto sliced_lhs = + AllOf(op::Slice(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant()))), + op::Shape("f32[128,3,7,512]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[3,3,512,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(sliced_lhs, rhs), + op::Shape("f32[128,2,4,512]"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 1); +} + +// (stride * per_shard_window_count) % dilation != 0 but stride == 1 +TEST_F(SpmdPartitioningTest, + ConvolutionBaseDilationStride1LhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,7,7,512] parameter(0) + %lhs.copy = f32[128,7,7,512] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[3,3,512,512] parameter(1) + %rhs.copy = f32[3,3,512,512] copy(%rhs), + sharding={replicated} + ROOT %conv = f32[128,14,14,512] convolution(%lhs.copy, %rhs.copy), + window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[128,4,7,512]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[3,3,512,512]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,1,7,512]")); + auto start_window = op::Multiply(op::Reshape(), op::Constant()); + auto start_input_element = op::Divide(start_window, op::Constant()); + auto dynamic_offset_for_padded_concat = op::Subtract( + op::Constant(), op::Subtract(op::Multiply(op::Reshape(), op::Constant()), + start_input_element)); + auto pre_masking = + AllOf(op::Shape("f32[128,5,7,512]"), + op::DynamicSlice( + AllOf(op::Shape("f32[128,6,7,512]"), + op::Pad(op::Concatenate(left_halo, lhs), op::Constant())), + op::Constant(), dynamic_offset_for_padded_concat, + op::Constant(), op::Constant())); + auto masked = op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(start_input_element)), + 
op::Broadcast(op::Constant())), + pre_masking, op::Broadcast(op::Constant())); + auto dynamic_offset_on_output = op::Subtract( + start_window, op::Multiply(start_input_element, op::Constant())); + EXPECT_THAT(root, + AllOf(op::DynamicSlice(AllOf(op::Convolution(masked, rhs), + op::Shape("f32[128,8,14,512]")), + op::Constant(), dynamic_offset_on_output, + op::Constant(), op::Constant()), + op::Shape("f32[128,7,14,512]"))); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterNoOverlap) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[4,1]0,1,2,3} + constant = f32[4,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=3x2 pad=0_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto source = + AllOf(op::Shape("f32[1,2]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto masked_data = AllOf( + op::Shape("f32[3,4]{1,0}"), + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply( + op::Reshape(), op::Constant()))), + op::Broadcast(op::Constant())), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Reshape(), op::Constant())), + op::Broadcast(op::Constant()))); + + EXPECT_THAT(root, + AllOf(op::SelectAndScatter(masked_data, source, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterNoOverlapReshard) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[1,4]0,1,2,3} + constant = f32[4,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=3x2 pad=0_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto source = + AllOf(op::Shape("f32[1,2]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto operand = AllOf(op::Copy(op::DynamicSlice( + op::Parameter(0), op::Constant(), op::Reshape())), + op::Shape("f32[11,1]")); + auto 
reshard_operand = op::Reshape(op::Transpose( + op::AllToAll(op::Reshape(op::Pad(operand, op::Constant()))))); + auto masked_data = AllOf( + op::Shape("f32[3,4]{1,0}"), + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply( + op::Reshape(), op::Constant()))), + op::Broadcast(op::Constant())), + reshard_operand, op::Broadcast(op::Constant()))); + + EXPECT_THAT(root, + AllOf(op::SelectAndScatter(masked_data, source, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterWithOverlap) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[4,1]0,1,2,3} + constant = f32[6,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8},{6,6},{1,9}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=2x2 pad=1_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + + auto source_shard = + AllOf(op::Shape("f32[2,2]{1,0}"), + op::DynamicSlice(op::Pad(), op::Reshape(), op::Constant())); + // Max halo size is the same as the shard size, so slice is not needed. 
+ auto source_left_halo = op::CollectivePermute(source_shard); + auto required_source_shard_start = + op::Divide(op::Multiply(op::Reshape(), op::Constant()), op::Constant()); + auto source_with_halo = op::DynamicSlice( + AllOf(op::Shape("f32[5,2]{1,0}"), + op::Pad(op::Concatenate(source_left_halo, source_shard), + op::Constant())), + op::Subtract(op::Constant(), + op::Subtract(op::Multiply(op::Reshape(), op::Constant()), + required_source_shard_start)), + op::Constant()); + auto masked_source_with_halo = AllOf( + AllOf(op::Shape("f32[3,2]{1,0}")), + op::Select( + op::Compare( + op::Add(op::Iota(), op::Broadcast(required_source_shard_start)), + op::Broadcast(op::Constant())), + source_with_halo, op::Broadcast(op::Constant()))); + + auto data_shard = + AllOf(op::Shape("f32[3,4]{1,0}"), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Reshape(), op::Constant()))); + auto data_left_halo = AllOf(op::Shape("f32[2,4]{1,0}"), + op::CollectivePermute(op::Slice(data_shard))); + auto data_right_halo = AllOf(op::Shape("f32[2,4]{1,0}"), + op::CollectivePermute(op::Slice(data_shard))); + auto required_data_start_on_padded = + op::Multiply(required_source_shard_start, op::Constant()); + auto left_halo_size = op::Subtract( + op::Add(op::Multiply(op::Reshape(), op::Constant()), op::Constant()), + required_data_start_on_padded); + auto data_with_halo = + AllOf(op::Shape("f32[7,4]{1,0}"), + op::DynamicSlice( + AllOf(op::Shape("f32[8,4]{1,0}"), + op::Pad(op::Concatenate(data_left_halo, data_shard, + data_right_halo), + op::Constant())), + op::Subtract(op::Constant(), left_halo_size), op::Constant())); + auto index_on_padded = + op::Add(op::Iota(), op::Broadcast(required_data_start_on_padded)); + auto masked_data_with_halo = op::Select( + op::And(op::Compare(index_on_padded, op::Broadcast(op::Constant())), + op::Compare(index_on_padded, op::Broadcast(op::Constant()))), + data_with_halo, op::Broadcast(op::Constant())); + + EXPECT_THAT( + root, AllOf(op::DynamicSlice(op::SelectAndScatter(masked_data_with_halo, + masked_source_with_halo, + op::Constant()), + left_halo_size, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,64] parameter(0) + %lhs.copy = f32[128,56,56,64] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,56,56,256] parameter(1) + %rhs.copy = f32[128,56,56,256] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,64,256] convolution(%lhs.copy, %rhs.copy), + window={size=56x56}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[1,1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, DotLhsTiledRhsTiledWithReshard) { + const char* const 
hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,64] parameter(0) + %lhs.copy = f32[128,56,56,64] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,56,56,256] parameter(1) + %rhs.copy = f32[128,56,56,256] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[1,1,64,256] convolution(%lhs.copy, %rhs.copy), + window={size=56x56}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,56,56,256]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(lhs)), op::Shape("f32[2,64,28,56,64]")); + auto reshard = AllOf(op::Reshape(op::Transpose(all_to_all))); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(reshard, rhs)), + op::Shape("f32[1,1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,512] parameter(0) + %lhs.copy = f32[128,56,56,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,64] parameter(1) + %rhs.copy = f32[128,28,28,64] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[1,1,512,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, + dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,28,28,64]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(rhs)), op::Shape("f32[64,2,14,28,64]")); + auto reshard = op::Reshape(op::Transpose(all_to_all)); + + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution(op::Slice(lhs), reshard)), + op::Shape("f32[1,1,512,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,28,28,128] parameter(0) + %lhs.copy = f32[32,28,28,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,28,28,64] parameter(1) + %rhs.copy = f32[32,28,28,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,128,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=1_1x1_1}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,128]")); + auto rhs = AllOf( + 
op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[32,1,28,64]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[32,1,28,64]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + lhs, AllOf(op::Concatenate(left_halo, rhs, right_halo), + op::Shape("f32[32,16,28,64]")))), + op::Shape("f32[3,3,128,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowDilate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,112,112,64] parameter(1) + %rhs.copy = f32[128,112,112,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[7,7,3,64] convolution(%lhs.copy, %rhs.copy), + window={size=112x112 pad=3_2x3_2 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,56,112,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,2,112,64]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,2,112,64]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + lhs, AllOf(op::Concatenate(left_halo, rhs, right_halo), + op::Shape("f32[128,60,112,64]")))), + op::Shape("f32[7,7,3,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateNegativeRhsPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,14,28,512]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[1,1,256,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowDilateUneven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,14,14,512] parameter(0) + %lhs.copy = f32[128,14,14,512] 
copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,7,7,512] parameter(1) + %rhs.copy = f32[128,7,7,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,512,512] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 pad=1_0x1_0 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,7,14,512]")); + auto rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[128,4,7,512]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,1,7,512]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + AllOf(op::DynamicSlice(op::Pad(lhs, op::Constant()), + op::Constant(), op::Subtract(), + op::Constant(), op::Constant()), + op::Shape("f32[128,10,14,512]")), + AllOf(op::Concatenate(left_halo, rhs), + op::Shape("f32[128,5,7,512]")))), + op::Shape("f32[3,3,512,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,28,28,128] parameter(0) + %lhs.copy = f32[32,28,28,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,28,28,64] parameter(1) + %rhs.copy = f32[32,28,28,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,128,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=1_1x1_1}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,128]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[32,1,28,128]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[32,1,28,128]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::Concatenate(left_halo, lhs, right_halo), + op::Shape("f32[32,16,28,128]")), + rhs)), + op::Shape("f32[3,3,128,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilate_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,112,112,64] parameter(1) + %rhs.copy = f32[128,112,112,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[7,7,3,64] convolution(%lhs.copy, %rhs.copy), + window={size=112x112 pad=3_2x3_2 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) 
<< module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,56,112,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::Concatenate(left_halo, lhs, right_halo), + op::Shape("f32[128,117,224,3]")), + rhs)), + op::Shape("f32[7,7,3,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateNegativeRhsPadding_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,14,28,512]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(op::Slice(lhs), rhs)), + op::Shape("f32[1,1,256,512]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateUneven_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,14,14,512] parameter(0) + %lhs.copy = f32[128,14,14,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,7,7,512] parameter(1) + %rhs.copy = f32[128,7,7,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,512,512] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 pad=1_0x1_0 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,7,14,512]")); + auto rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[128,4,7,512]")); + + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,1,14,512]")); + EXPECT_THAT( + root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::DynamicSlice( + AllOf(op::Pad(op::Concatenate(lhs, right_halo), + op::Constant()), + op::Shape("f32[128,10,14,512]")), + op::Constant(), op::Reshape(), 
op::Constant(), + op::Constant()), + op::Shape("f32[128,9,14,512]")), + rhs)), + op::Shape("f32[3,3,512,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConcatenateAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0) + %param0.copy = f32[14,257] copy(%param0), sharding={devices=[2,1]0,1} + %param1 = f32[14,116] parameter(1) + %param1.copy = f32[14,116] copy(%param1), sharding={devices=[2,1]0,1} + ROOT %concatenate = f32[14,373] concatenate(%param0.copy, %param1.copy), + dimensions={1}, sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[7,257]")); + auto param1 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[7,116]")); + EXPECT_THAT(root, + AllOf(op::Concatenate(param0, param1), op::Shape("f32[7,373]"))); +} + +TEST_F(SpmdPartitioningTest, ConcatenateAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0) + %param0.copy = f32[14,257] copy(%param0), sharding={devices=[1,2]0,1} + %param1 = f32[14,116] parameter(1) + %param1.copy = f32[14,116] copy(%param1), sharding={devices=[1,2]0,1} + ROOT %concatenate = f32[14,373] concatenate(%param0.copy, %param1.copy), + dimensions={1}, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Reshape())), + op::Shape("f32[14,129]")); + auto param1 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[14,58]")); + EXPECT_THAT(root, AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::DynamicUpdateSlice( + op::DynamicUpdateSlice( + op::Broadcast(), param0, + op::Constant(), op::Multiply()), + param1, op::Constant(), op::Add())), + op::Shape("f32[14,374]")), + op::Constant(), op::Multiply()), + op::Shape("f32[14,187]"))); +} + +TEST_F(SpmdPartitioningTest, PadAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + %const = f32[] constant(0) + ROOT %pad = f32[128,17,257] pad(%param0.copy, %const), padding=0_0x1_2x0_0, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Pad(param0, op::Constant()), + op::Shape("f32[128,17,129]"))); +} + +TEST_F(SpmdPartitioningTest, SliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), 
sharding={devices=[1,1,2]0,1} + ROOT %slice = f32[128,11,257] slice(%param0.copy), + slice={[0:128:1], [2:13:1], [0:257:1]}, sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Slice(param0), op::Shape("f32[128,11,129]"))); +} + +TEST_F(SpmdPartitioningTest, SliceAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + ROOT %slice = f32[63,14,251] slice(%param0.copy), + slice={[2:128:2], [0:14:1], [5:256:1]}, sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT( + root, + AllOf(op::Slice(AllOf( + op::DynamicSlice( + AllOf(op::Concatenate( + param0, + AllOf(op::CollectivePermute(op::Slice(param0)), + op::Shape("f32[128,14,2]"))), + op::Shape("f32[128,14,131]")), + op::Constant(), op::Constant(), op::Add()), + op::Shape("f32[128,14,126]"))), + op::Shape("f32[63,14,126]"))); +} + +TEST_F(SpmdPartitioningTest, SortAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ge { + p.0.lhs.1247 = f32[]{:T(256)} parameter(0), sharding={replicated} + bitcast-convert = s32[]{:T(256)} bitcast-convert(p.0.lhs.1247), sharding={replicated} + constant = s32[]{:T(256)} constant(0), sharding={replicated} + compare = pred[]{:T(256)E(32)} compare(bitcast-convert, constant), direction=LT, sharding={replicated} + constant.1 = u32[]{:T(256)} constant(2147483647), sharding={replicated} + bitcast-convert.1 = u32[]{:T(256)} bitcast-convert(p.0.lhs.1247), sharding={replicated} + subtract = u32[]{:T(256)} subtract(constant.1, bitcast-convert.1), sharding={replicated} + bitcast-convert.2 = s32[]{:T(256)} bitcast-convert(subtract), sharding={replicated} + select = s32[]{:T(256)} select(compare, bitcast-convert.2, bitcast-convert), sharding={replicated} + p.0.rhs.1248 = f32[]{:T(256)} parameter(1), sharding={replicated} + bitcast-convert.3 = s32[]{:T(256)} bitcast-convert(p.0.rhs.1248), sharding={replicated} + compare.1 = pred[]{:T(256)E(32)} compare(bitcast-convert.3, constant), direction=LT, sharding={replicated} + bitcast-convert.4 = u32[]{:T(256)} bitcast-convert(p.0.rhs.1248), sharding={replicated} + subtract.1 = u32[]{:T(256)} subtract(constant.1, bitcast-convert.4), sharding={replicated} + bitcast-convert.5 = s32[]{:T(256)} bitcast-convert(subtract.1), sharding={replicated} + select.1 = s32[]{:T(256)} select(compare.1, bitcast-convert.5, bitcast-convert.3), sharding={replicated} + compare.2 = pred[]{:T(256)E(32)} compare(select, select.1), direction=GT, sharding={replicated} + compare.258 = pred[]{:T(256)E(32)} compare(select.1, select), direction=GT, sharding={replicated} + compare.259 = pred[]{:T(256)E(32)} compare(compare.2, compare.258), direction=EQ, sharding={replicated} + 
p.1.lhs.1249 = s32[]{:T(256)} parameter(2), sharding={replicated} + p.1.rhs.1250 = s32[]{:T(256)} parameter(3), sharding={replicated} + compare.260 = pred[]{:T(256)E(32)} compare(p.1.lhs.1249, p.1.rhs.1250), direction=LT, sharding={replicated} + ROOT select.86 = pred[]{:T(256)E(32)} select(compare.259, compare.260, compare.2), sharding={replicated} +} + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,2,1]0,1} + %param1 = s32[128,14,257] parameter(1) + %param1.copy = s32[128,14,257] copy(%param1), sharding={devices=[1,2,1]0,1} + ROOT %sort.6 = (f32[128,14,257]{2,1,0:T(8,128)}, s32[128,14,257]{2,1,0:T(8,128)}) + sort(%param0.copy, %param1.copy), dimensions={2}, is_stable=true, + to_apply=%ge, sharding={{devices=[1,2,1]0,1},{devices=[1,2,1]0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[128,7,257]")); + auto param1 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("s32[128,7,257]")); + EXPECT_THAT(root, AllOf(op::Sort(param0, param1), + op::Shape("(f32[128,7,257], s32[128,7,257])"))); +} + +TEST_F(SpmdPartitioningTest, PartitionCustomCall) { + const char* const hlo_string = R"( +HloModule cluster_2013453984438090939__.47 + +ENTRY %cluster_2013453984438090939__.47 + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %custom-call = (bf16[2,2000]{1,0}, s32[2,2000]{1,0}) + custom-call(bf16[2,209664]{1,0} %copy.arg_tuple.1), custom_call_target="TopK" + %get-tuple-element = bf16[2,2000]{1,0} + get-tuple-element((bf16[2,2000]{1,0}, s32[2,2000]{1,0}) %custom-call), + index=0, sharding={replicated} + %get-tuple-element.1 = s32[2,2000]{1,0} get-tuple-element((bf16[2,2000]{1,0}, + s32[2,2000]{1,0}) %custom-call), index=1, sharding={replicated} + ROOT %tuple.46 = (bf16[2,2000]{1,0}, s32[2,2000]{1,0}) + tuple(bf16[2,2000]{1,0} %get-tuple-element, s32[2,2000]{1,0} + %get-tuple-element.1), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto custom_call = FindInstruction(module.get(), "custom-call.1"); + EXPECT_EQ(custom_call->operand(0)->shape().dimensions(1), 104832); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 4000); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 4000); +} + +TEST_F(SpmdPartitioningTest, ShardableTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), sharding={devices=[1,2,1,1]0,1} + ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy), + dimensions={0,3,1,2}, sharding={devices=[1,1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), 
op::Reshape(),
+                                op::Constant(), op::Constant())),
+      op::Shape("f32[16,19,38,4]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[16,4,19,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, MultiDimensionShardedTranspose) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[16,38,38,4] parameter(0)
+  %param0.copy = f32[16,38,38,4] copy(%param0),
+    sharding={devices=[4,2,1,1]0,1,2,3,4,5,6,7}
+  ROOT %transpose = f32[38,4,16,38] transpose(%param0.copy),
+    dimensions={1,3,0,2}, sharding={devices=[2,1,4,1]0,2,4,6,1,3,5,7}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 = AllOf(
+      op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Reshape(),
+                                op::Constant(), op::Constant())),
+      op::Shape("f32[4,19,38,4]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[19,4,4,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, NonShardableTranspose) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[16,38,38,4] parameter(0)
+  %param0.copy = f32[16,38,38,4] copy(%param0), sharding={devices=[1,2,1,1]0,1}
+  ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy),
+    dimensions={0,3,1,2}, sharding={devices=[1,2,1,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto reshard = AllOf(op::Reshape(op::Transpose(op::Reshape(op::AllToAll()))),
+                       op::Shape("f32[16,38,38,2]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(), op::Shape("f32[16,2,38,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, ShardableReshape) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[38,38,324] parameter(0)
+  %param0.copy = f32[38,38,324] copy(%param0), sharding={devices=[2,1,1]0,1}
+  ROOT %reshape = f32[38,38,4,81] reshape(%param0.copy),
+    sharding={devices=[2,1,1,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 =
+      AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(),
+                                      op::Constant(), op::Constant())),
+            op::Shape("f32[19,38,324]"));
+  EXPECT_THAT(root, AllOf(op::Reshape(param0), op::Shape("f32[19,38,4,81]")));
+}
+
+TEST_F(SpmdPartitioningTest, NonShardableReshape) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[38,38,324] parameter(0)
+  %param0.copy = f32[38,38,324] copy(%param0), sharding={devices=[1,1,2]0,1}
+  ROOT %transpose = f32[38,38,4,81] reshape(%param0.copy),
+    sharding={devices=[1,1,1,2]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root,
+      AllOf(op::DynamicSlice(
+                AllOf(op::Pad(
+                          AllOf(op::Reshape(AllOf(op::AllReduce(),
+                                                  op::Shape("f32[38,38,324]"))),
+                                op::Shape("f32[38,38,4,81]")),
+                          op::Constant()),
+                      op::Shape("f32[38,38,4,82]")),
+                op::Constant(), op::Constant(), op::Constant(), op::Reshape()),
+            op::Shape("f32[38,38,4,41]")));
+}
+
+// Produces an invalid module after transformation.
+TEST_F(SpmdPartitioningTest, InceptionV3_4_way_ReduceWindowDilated) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[128,5,5,768] parameter(0) + %param0.copy = f32[128,5,5,768] copy(%param0), + sharding={devices=[1,4,1,1]0,1,2,3} + %constant.1 = f32[] constant(0), sharding={replicated} + ROOT %rw = f32[128,17,17,768] reduce-window(%param0.copy, %constant.1), + window={size=1x5x5x1 pad=0_0x4_4x4_4x0_0 lhs_dilate=1x3x3x1}, + to_apply=sum, sharding={devices=[1,4,1,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto input_shard = op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(0), op::Constant()), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())); + auto id_mul4_add1 = + op::Add(op::Multiply(op::Reshape(), op::Constant()), op::Constant()); + auto id_mul5 = op::Multiply(op::Reshape(), op::Constant()); + auto id_mul5_add1_div3 = + op::Divide(op::Add(id_mul5, op::Constant()), op::Constant()); + auto before_masking = AllOf( + op::Shape("f32[128,3,5,768]"), + op::DynamicSlice( + AllOf( + op::Shape("f32[128,4,5,768]"), + op::Concatenate(op::CollectivePermute(input_shard), input_shard)), + op::Constant(), + op::Subtract(op::Constant(), + op::Subtract(id_mul4_add1, id_mul5_add1_div3)), + op::Constant(), op::Constant())); + auto masked = op::Select( + op::And(op::Compare(op::Add(op::Iota(), op::Broadcast(id_mul5_add1_div3)), + op::Broadcast(op::Constant())), + op::Compare(op::Add(op::Iota(), op::Broadcast(id_mul5_add1_div3)), + op::Broadcast(op::Constant()))), + before_masking, op::Broadcast(op::Constant())); + auto rw = AllOf(op::Shape("f32[128,7,17,768]"), + op::ReduceWindow(masked, op::Constant())); + auto final_slice_index = op::Subtract( + id_mul5, + op::Add(op::Multiply(id_mul5_add1_div3, op::Constant()), op::Constant())); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Shape("f32[128,5,17,768]"), + op::DynamicSlice(rw, op::Constant(), final_slice_index, + op::Constant(), op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,32,32,128] parameter(0) + %param0.copy = f32[4,32,32,128] copy(%param0), + sharding={devices=[1,1,1,2]0,1} + %constant.1 = f32[] constant(0), sharding={replicated} + %reduce = f32[128] reduce(%param0.copy, %constant.1), dimensions={0,1,2}, + to_apply=%sum, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[4,32,32,64]")); + + EXPECT_THAT(root, + AllOf(op::Reduce(param0, op::Constant()), op::Shape("f32[64]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledTupleReduce) { + const char* const hlo_string = R"( +HloModule module + +%minmax_func { + %lhs_value = f32[] parameter(0) + %rhs_value = f32[] parameter(2) + %compare.2 = pred[] compare(%lhs_value, %rhs_value), direction=GT + %select.4 = f32[] select(%compare.2, %lhs_value, %rhs_value) + %lhs_index = s32[] 
parameter(1) + %rhs_index = s32[] parameter(3) + %select.5 = s32[] select(%compare.2, %lhs_index, %rhs_index) + ROOT %tuple.2 = (f32[], s32[]) tuple(%select.4, %select.5) +} + +ENTRY %main { + %param0 = f32[28,10] parameter(0), sharding={devices=[2,1]0,1} + %param1 = s32[28,10] parameter(1), sharding={devices=[2,1]0,1} + %init0 = f32[] parameter(2) + %init1 = s32[] parameter(3) + ROOT %reduce = (f32[28], s32[28]) reduce(%param0, %param1, %init0, %init1), + dimensions={1}, to_apply=%minmax_func, + sharding={{devices=[2]0,1}, {devices=[2]0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Reduce(op::Parameter(0), op::Parameter(1), + op::Parameter(2), op::Parameter(3)), + op::Shape("(f32[14], s32[14])"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledReduceOutputReshard) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,32,32,128] parameter(0) + %param0.copy = f32[4,32,32,128] copy(%param0), + sharding={devices=[1,2,1,1]0,1} + %constant.1 = f32[] constant(0), sharding={replicated} + %reduce = f32[128] reduce(%param0.copy, %constant.1), dimensions={0,1,2}, + to_apply=%sum, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[4,16,32,128]")); + + EXPECT_THAT(root, + AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::Reduce(param0, op::Constant())), + op::Shape("f32[128]")), + op::Reshape()), + op::Shape("f32[64]"))); +} + +TEST_F(SpmdPartitioningTest, IotaAlongNonTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = s32[16,80,91] iota(), iota_dimension=1, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Iota(), op::Shape("s32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, IotaAlongTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = s32[16,80,91] iota(), iota_dimension=2, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Add(op::Iota(), op::Broadcast()), + op::Shape("s32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, U32IotaAlongTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = u32[16,80,91] iota(), iota_dimension=2, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Add(op::Iota(), op::Broadcast()), + op::Shape("u32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, 
Conditional) { + const char* const hlo_string = R"( +HloModule module + +Negate { + x = f32[4,5] parameter(0), sharding={replicated} + ROOT negate = f32[4,5] negate(x), sharding={replicated} +} + +Identity { + y = f32[4,5] parameter(0), sharding={devices=[2,1]0,1} + ROOT copy = f32[4,5] copy(y), sharding={devices=[2,1]0,1} +} + +ENTRY entry { + %param.0 = pred[] parameter(0) + %param.0.copy = pred[] copy(%param.0), sharding={maximal device=0} + %param.1 = f32[4,5] parameter(1) + %param.1.copy = f32[4,5] copy(%param.1), sharding={replicated} + %param.2 = f32[4,5] parameter(2) + %param.2.copy = f32[4,5] copy(%param.2), sharding={devices=[2,1]0,1} + ROOT cond = f32[4,5] conditional(%param.0.copy, %param.1.copy, %param.2.copy), + true_computation=Negate, false_computation=Identity, + sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto param0 = AllOf(op::Copy(op::Copy(op::Parameter()), op::Shape("pred[]"))); + auto param1 = AllOf(op::Copy(op::Parameter()), op::Shape("f32[4,5]")); + auto param2 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[2,5]")); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Conditional(op::AllReduce(), param1, param2), + op::Shape("f32[2,5]"))); + + auto then_branch_root = root->branch_computation(0)->root_instruction(); + EXPECT_THAT(then_branch_root, + AllOf(op::DynamicSlice(op::Negate(op::Parameter()), op::Reshape(), + op::Constant()), + op::Shape("f32[2,5]"))); + + auto else_branch_root = root->branch_computation(1)->root_instruction(); + EXPECT_THAT(else_branch_root, + AllOf(op::Copy(op::Parameter()), op::Shape("f32[2,5]"))); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatter_RetinaNet) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param.0 = f32[32,128,384,64] parameter(0) + %param.0.copy = f32[32,128,384,64] copy(%param.0), + sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} + %param.1 = f32[32,64,192,64] parameter(1) + %param.1.copy = f32[32,64,192,64] copy(%param.1), + sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[32,128,384,64] select-and-scatter(param.0.copy, + %param.1.copy, constant.1), window={size=1x1x1x1 stride=1x2x2x1}, + select=ge, scatter=sum, sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto source = AllOf( + op::Shape("f32[32,8,192,64]"), + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), op::Reshape(), + op::Constant(), op::Constant()))); + auto data = AllOf( + op::Shape("f32[32,16,384,64]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant()))); + + EXPECT_THAT(root, op::SelectAndScatter(data, source, op::Constant())); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, TiledDot) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + 
%lhs = f32[128,64] parameter(0) + %lhs.copy = f32[128,64] copy(%lhs), sharding={devices=[1,2]0,1} + %rhs = f32[64,256] parameter(1) + %rhs.copy = f32[64,256] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %conv = f32[128,256] convolution(%lhs.copy, %rhs.copy), + dim_labels=bf_io->bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[128,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[32,256]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[128,256]"))); +} + +TEST_F(SpmdPartitioningTest, TiledDotOutputTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,64] parameter(0) + %lhs.copy = f32[128,64] copy(%lhs), sharding={devices=[1,2]0,1} + %rhs = f32[64,256] parameter(1) + %rhs.copy = f32[64,256] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %conv = f32[128,256] convolution(%lhs.copy, %rhs.copy), + dim_labels=bf_io->bf, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[128,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[32,256]")); + EXPECT_THAT(root, AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[128,256]")), + op::Constant(), op::Reshape()), + op::Shape("f32[128,128]"))); +} + +TEST_F(SpmdPartitioningTest, BatchPartitionedConvolution) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,256,256] parameter(0) + %lhs.copy = f32[128,256,256] copy(%lhs), sharding={devices=[1,2,1]0,1} + %rhs = f32[256,8,1] parameter(1) + %rhs.copy = f32[256,8,1] copy(%rhs), sharding={replicated} + ROOT %conv = f32[128,256,8] convolution(%lhs.copy, %rhs.copy), + window={size=1}, dim_labels=0bf_io0->0bf, sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[128,128,256]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[256,8,1]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[128,128,8]"))); +} + +TEST_F(SpmdPartitioningTest, DotOutputFeaturePartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,64] parameter(0) + %lhs.copy = f32[24,64] copy(%lhs), sharding={replicated} + %rhs = f32[39296,64] parameter(1) + %rhs.copy = f32[39296,64] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %dot = f32[24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + 
PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[24,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant())), + op::Shape("f32[19648,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[24,19648]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={devices=[2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,24,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumLHSandOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,24,64]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, op::DynamicSlice(rhs, op::Reshape(), + op::Constant(), + op::Constant())), + op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSandOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[1,2,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={devices=[2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[32,12,64]")); + auto 
rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,39296,64]")); + auto lhs_reshard = op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs)))); + EXPECT_THAT(root, + AllOf(op::Dot(lhs_reshard, rhs), op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs_slice = + AllOf(op::DynamicSlice(op::Copy(op::Parameter(0)), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[16,24,64]")); + auto rhs_slice = + AllOf(op::DynamicSlice(op::Copy(op::Parameter(1)), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[16,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs_slice, rhs_slice), + op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,1,2,2]0,1,2,3} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,1,2,2]0,1,2,3} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Constant(), op::Reshape(), op::Reshape())), + op::Shape("f32[32,24,32,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), + op::Constant(), op::Reshape(), op::Reshape())), + op::Shape("f32[32,39296,32,64]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Dot(lhs, rhs)), + op::Shape("f32[32,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumLHSNonContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,128,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + 
op::Shape("f32[32,12,64,64]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[32,12,64,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSNonContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,2,1,2]0,1,2,3} + ROOT %dot = f32[32,24,39296,128] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[1,1,2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + op::Shape("f32[32,19648,64,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[32,24,19648,64]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputLHSNonContractingDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64,128]")); + auto rhs = + AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64,128]")); + EXPECT_THAT( + root, + AllOf(op::Dot(AllOf(op::DynamicSlice(lhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[32,12,64,128]")), + rhs), + op::Shape("f32[32,12,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputRHSNonContractingDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64,128]")); + auto rhs = + AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64,128]")); + EXPECT_THAT(root, + AllOf(op::Dot(lhs, AllOf(op::DynamicSlice( + rhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[32,19648,64,128]"))), + 
op::Shape("f32[32,24,19648]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,12,64,128]")); + auto rhs = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(1), op::Constant()), + op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,19648,64,128]")); + EXPECT_THAT( + root, + AllOf(op::Slice(AllOf(op::GetTupleElement(op::While(op::Tuple( + lhs, rhs, op::Broadcast(), op::Constant()))), + op::Shape("f32[32,12,39296]"))), + op::Shape("f32[32,12,39295]"))); + auto while_loop = root->operand(0)->operand(0); + // Check loop condition. + EXPECT_THAT( + while_loop->while_condition()->root_instruction(), + op::Compare(op::GetTupleElement(op::Parameter(0)), op::Constant())); + + // Check loop body. + auto next_i = op::Add(op::GetTupleElement(op::Parameter(0)), op::Constant()); + auto window = op::Conditional(op::Compare(next_i, op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + auto partial_output = op::Dot(op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + EXPECT_THAT( + while_loop->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(0)), window, + op::DynamicUpdateSlice(op::GetTupleElement(op::Parameter(0)), + partial_output, op::Constant(), + op::Constant(), op::Reshape()), + next_i)); + + // Check the conditional that contains the collective permute. 
+ auto cp_conditional = + while_loop->while_body()->root_instruction()->operand(1); + EXPECT_THAT(cp_conditional->true_computation()->root_instruction(), + op::CollectivePermute(op::Parameter(0))); + EXPECT_THAT(cp_conditional->false_computation()->root_instruction(), + op::Parameter(0)); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,63,128] parameter(0) + %lhs.copy = f32[32,24,63,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39296,63,128] parameter(1) + %rhs.copy = f32[32,39296,63,128] copy(%rhs), sharding={devices=[1,1,2,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,12,63,128]")); + auto rhs = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(1), op::Constant()), + op::Constant(), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[32,39296,32,128]")); + auto masked_rhs = + op::Select(op::Compare(), rhs, op::Broadcast(op::Constant())); + EXPECT_THAT(root, + AllOf(op::GetTupleElement(op::While(op::Tuple( + lhs, masked_rhs, op::Broadcast(), op::Constant()))), + op::Shape("f32[32,12,39296]"))); + auto while_loop = root->operand(0); + // Check loop condition. + EXPECT_THAT( + while_loop->while_condition()->root_instruction(), + op::Compare(op::GetTupleElement(op::Parameter(0)), op::Constant())); + + // Check loop body. + auto next_i = op::Add(op::GetTupleElement(op::Parameter(0)), op::Constant()); + auto window = op::Conditional(op::Compare(next_i, op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + auto partial_output = op::Dot( + op::DynamicSlice( + op::Pad(op::GetTupleElement(op::Parameter(0)), op::Constant()), + op::Constant(), op::Constant(), op::Reshape(), op::Constant()), + op::GetTupleElement(op::Parameter(0))); + EXPECT_THAT( + while_loop->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(0)), window, + op::Add(op::GetTupleElement(op::Parameter(0)), partial_output), + next_i)); + + // Check the conditional that contains the collective permute. 
+ auto cp_conditional = + while_loop->while_body()->root_instruction()->operand(1); + EXPECT_THAT(cp_conditional->true_computation()->root_instruction(), + op::CollectivePermute(op::Parameter(0))); + EXPECT_THAT(cp_conditional->false_computation()->root_instruction(), + op::Parameter(0)); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContractingReduce1) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} + %constant = f32[] constant(0) + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,39295] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1]0,1} + %multiply = f32[32,24,39295] multiply(%dot, %broadcast), + sharding={devices=[1,2,1]0,1} + ROOT %reduce = f32[32,24] reduce(%multiply, %constant), dimensions={2}, + to_apply=sum, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContractingReduce2) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} + %constant = f32[] constant(0) + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,39295] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1]0,1} + %multiply = f32[32,24,39295] multiply(%dot, %broadcast), + sharding={devices=[1,2,1]0,1} + ROOT %reduce = f32[32,39295] reduce(%multiply, %constant), dimensions={1}, + to_apply=sum, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. 
+} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedContractingFromBroadcast) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %rhs = f32[32,39296,63,128] parameter(0) + %rhs.copy = f32[32,39296,63,128] copy(%rhs), sharding={devices=[1,1,2,1]0,1} + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,63,128] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1,1]0,1} + %add = f32[32,24,63,128] add(%broadcast, %broadcast), + sharding={devices=[1,2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%add, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. +} + +TEST_F(SpmdPartitioningTest, ReplicatedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0) + %lhs.copy = s32[] copy(%lhs), sharding={replicated} + %rhs = s32[] parameter(1) + %rhs.copy = s32[] copy(%rhs), sharding={replicated} + ROOT %rng = s32[4]{0} rng(%lhs.copy, %rhs.copy), + distribution=rng_uniform, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("s32[]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("s32[]")); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::Rng(), op::Broadcast(op::Constant()))), + op::Shape("s32[4]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0) + %lhs.copy = s32[] copy(%lhs), sharding={replicated} + %rhs = s32[] parameter(1) + %rhs.copy = s32[] copy(%rhs), sharding={maximal device=1} + ROOT %rng = s32[4]{0} rng(%lhs.copy, %rhs.copy), + distribution=rng_uniform, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("s32[]")); + auto rhs = AllOf(op::Copy(op::Copy(op::Parameter(1))), op::Shape("s32[]")); + EXPECT_THAT(root, AllOf(op::Rng(lhs, op::AllReduce(op::Select( + op::Broadcast(op::Compare()), rhs, + op::Broadcast(op::Constant())))), + op::Shape("s32[2]"))); +} + +TEST_F(SpmdPartitioningTest, DynamicSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[128,64] parameter(0) + %input.copy = s32[128,64] copy(%input), sharding={devices=[2,1]0,1} + %index = s32[] parameter(1) + %constant = s32[] constant(0) + ROOT %dynamic-slice = s32[128,2] dynamic-slice(%input.copy, %constant, %index), + dynamic_slice_sizes={128,2}, sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto input = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant())), + op::Shape("s32[64,64]")); + EXPECT_THAT(root, + AllOf(op::DynamicSlice(input, 
op::Constant(), op::Parameter(1)), + op::Shape("s32[64,2]"))); +} + +TEST_F(SpmdPartitioningTest, DynamicUpdateSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[128,64] parameter(0) + %input.copy = s32[128,64] copy(%input), sharding={devices=[2,1]0,1} + %index = s32[] parameter(1) + %constant = s32[] constant(0) + %update = s32[128,2] parameter(2) + %update.copy = s32[128,2] copy(%update), sharding={devices=[2,1]0,1} + ROOT %dynamic-update-slice = s32[128,64] + dynamic-update-slice(%input.copy, %update.copy, %constant, %index), + sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto input = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant())), + op::Shape("s32[64,64]")); + auto update = AllOf(op::Copy(op::DynamicSlice(op::Parameter(2), op::Reshape(), + op::Constant())), + op::Shape("s32[64,2]")); + EXPECT_THAT(root, AllOf(op::DynamicUpdateSlice(input, update, op::Constant(), + op::Parameter(1)), + op::Shape("s32[64,64]"))); +} + +TEST_F(SpmdPartitioningTest, PassthroughGather) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[3,5]"))); +} + +TEST_F(SpmdPartitioningTest, GatherPartitionedOnTrivialSliceDims) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[17,9] parameter(0), sharding={devices=[2,1]0,1} + %indices = s32[2,3] parameter(1), sharding={replicated} + ROOT %gather = f32[2,3,9] gather(%input, %indices), offset_dims={2}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=2, + slice_sizes={1,9}, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto min = AllOf(op::Broadcast(offset), op::Shape("s32[2,3]")); + auto max = AllOf(op::Broadcast(op::Add(offset, op::Constant())), + op::Shape("s32[2,3]")); + auto clamp = op::Clamp(min, op::Parameter(1), max); + auto gather = op::Gather(op::Parameter(0), op::Subtract(clamp, min)); + auto mask = + op::Or(op::Lt(op::Parameter(1), min), op::Gt(op::Parameter(1), max)); + auto masked = + op::Select(op::Broadcast(mask), op::Broadcast(op::Constant()), gather); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::AllReduce(masked), op::Shape("f32[2,3,9]"))); +} + +TEST_F(SpmdPartitioningTest, PassthroughScatter) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] 
parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={devices=[1,2]0,1} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Scatter(op::Parameter(0), op::Parameter(1), + op::Parameter(2)), + op::Shape("f32[2,5]"))); +} + +TEST_F(SpmdPartitioningTest, ScatterPartitionedOnTrivialSliceDims) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[17,9] parameter(0), sharding={devices=[2,1]0,1} + %indices = s32[2,3] parameter(1), sharding={replicated} + %updates = f32[2,3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[17,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto indices = op::Subtract( + op::Parameter(1), AllOf(op::Broadcast(offset), op::Shape("s32[2,3]"))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Scatter(op::Parameter(0), indices, op::Parameter(2)), + op::Shape("f32[9,9]"))); +} + +TEST_F(SpmdPartitioningTest, TiledReverse) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT reverse = f32[3,3]{1,0} reverse(constant), dimensions={1}, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,3]{1,0}"), + op::Reverse(op::DynamicSlice( + op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, MixWithManualPartitioning) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[8,2] parameter(0), sharding={devices=[2,1]0,1} + to_shard = f32[4,2] custom-call(param), custom_call_target="SPMDFullToShardShape", sharding={replicated} + add = f32[4,2] add(to_shard, to_shard), sharding={replicated} + to_full = f32[8,2] custom-call(add), custom_call_target="SPMDShardToFullShape", sharding={devices=[2,1]0,1} + ROOT mul = f32[8,2] multiply(to_full, param), sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + auto to_shard = op::Copy(op::Parameter(0)); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2]"), + op::Multiply(op::Copy(op::Add(to_shard, to_shard)), + op::Parameter(0)))); +} + +} // 
namespace
+}  // namespace spmd
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc
new file mode 100644
index 00000000000..207f854cd9f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc
@@ -0,0 +1,662 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h"
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding.h"
+#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace spmd {
+
+bool HasReplicatedSharding(const HloSharding& sharding) {
+  if (sharding.IsTuple()) {
+    return absl::c_any_of(sharding.tuple_elements(), HasReplicatedSharding);
+  }
+  return sharding.IsReplicated();
+}
+
+HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b) {
+  if (shape.IsTuple()) {
+    std::vector<HloInstruction*> elements;
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+      elements.push_back(
+          CreateZero(ShapeUtil::GetTupleElementShape(shape, i), b));
+    }
+    return b->AddInstruction(HloInstruction::CreateTuple(elements));
+  }
+
+  if (shape.IsToken()) {
+    return b->AddInstruction(HloInstruction::CreateToken());
+  }
+  auto zero = b->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type())));
+  return b->AddInstruction(HloInstruction::CreateBroadcast(shape, zero, {}));
+}
+
+HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module) {
+  HloComputation::Builder sum_b("add");
+  auto x = sum_b.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(type, {}), "x"));
+  auto y = sum_b.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(type, {}), "y"));
+  if (type == PRED) {
+    sum_b.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(type, {}), HloOpcode::kOr, x, y));
+  } else {
+    sum_b.AddInstruction(HloInstruction::CreateBinary(
+        ShapeUtil::MakeShape(type, {}), HloOpcode::kAdd, x, y));
+  }
+  HloComputation* reduction = module->AddEmbeddedComputation(sum_b.Build());
+  return reduction;
+}
+
+bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding) {
+  if (sharding.IsTuple()) {
+    for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) {
+      if (!EvenlyPartitions(ShapeUtil::GetTupleElementShape(shape, i),
+                            sharding.GetSubSharding(shape, {i}))) {
+        return false;
+      }
+    }
+  }
+
+  if (sharding.IsTileMaximal()) {
+    return
sharding.IsReplicated(); + } + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + if (shape.dimensions(i) % sharding.tile_assignment().dim(i) != 0) { + return false; + } + } + return true; +} + +Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding) { + if (sharding.IsTuple()) { + std::vector subshapes; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + subshapes.push_back( + MakePartitionedShape(ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}))); + } + return ShapeUtil::MakeTupleShape(subshapes); + } + return sharding.TileShape(shape); +} + +Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, + const HloSharding& sharding, + int64 partition_id) { + if (sharding.IsTuple()) { + std::vector subshapes; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + subshapes.push_back(MakeNonPaddedShapeForGivenPartition( + ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}), partition_id)); + } + return ShapeUtil::MakeTupleShape(subshapes); + } + + auto partition_shape = shape; + std::vector tile_offset = + sharding.TileOffsetForDevice(shape, partition_id); + std::vector tile_limit = + sharding.TileLimitForDevice(shape, partition_id); + for (int64 i = 0; i < tile_offset.size(); ++i) { + if (sharding.UsesDevice(partition_id)) { + partition_shape.set_dimensions(i, tile_limit[i] - tile_offset[i]); + } else { + partition_shape.set_dimensions(i, 0); + } + } + return partition_shape; +} + +std::vector MakePartitionOffsets(const Shape& shape, + const HloSharding& sharding, + HloInstruction* partition_id, + SpmdBuilder* b) { + CHECK(!shape.IsTuple()); + + Array2D offset_array( + {sharding.tile_assignment().num_elements(), shape.rank()}); + offset_array.Each([&](int64 i, int64 j, int32* value) { + *value = sharding.TileOffsetForDevice(shape, i)[j]; + }); + auto offset_table = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2FromArray2D(offset_array))); + std::vector offsets; + for (int64 i = 0; i < shape.rank(); ++i) { + if (sharding.tile_assignment().dim(i) == 1) { + offsets.push_back(b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32)))); + } else { + auto index = b->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1, 1}), offset_table, + {partition_id, b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(i)))}, + {1, 1})); + offsets.push_back(b->AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), index))); + } + } + return offsets; +} + +std::vector MakeTiledPartitionOrdinals( + const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b) { + CHECK(!sharding.IsTileMaximal()); + auto table_shape = + ShapeUtil::MakeShape(S32, sharding.tile_assignment().dimensions()); + return MakePartitionOffsets(table_shape, sharding, partition_id, b); +} + +HloInstruction* PadToShape(HloInstruction* hlo, const Shape& padded_shape, + SpmdBuilder* b, HloComputation* computation) { + CHECK(b == nullptr || computation == nullptr); + if (ShapeUtil::Compatible(hlo->shape(), padded_shape)) { + return hlo; + } + PaddingConfig padding_config; + for (int64 i = 0; i < padded_shape.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_interior_padding(0); + padding_config_dim->set_edge_padding_high(padded_shape.dimensions(i) - + hlo->shape().dimensions(i)); + } + auto add_hlo 
= [&](std::unique_ptr to_add) { + if (b == nullptr) { + return computation->AddInstruction(std::move(to_add)); + } + return b->AddInstruction(std::move(to_add)); + }; + auto zero = add_hlo(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + return add_hlo( + HloInstruction::CreatePad(padded_shape, hlo, zero, padding_config)); +} + +Shape GetPaddedShapeForUnevenPartitioning(const Shape& base_shape, + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return base_shape; + } + if (EvenlyPartitions(base_shape, sharding)) { + return base_shape; + } + auto shard_shape = MakePartitionedShape(base_shape, sharding); + Shape padded_base_shape = base_shape; + for (int64 i = 0; i < padded_base_shape.rank(); ++i) { + padded_base_shape.set_dimensions( + i, shard_shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + return padded_base_shape; +} + +HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( + HloInstruction* hlo, const HloSharding& sharding, SpmdBuilder* b) { + auto padded_base_shape = + GetPaddedShapeForUnevenPartitioning(hlo->shape(), sharding); + if (ShapeUtil::Compatible(padded_base_shape, hlo->shape())) { + return hlo; + } + return PadToShape(hlo, padded_base_shape, b); +} + +absl::optional UniqueTiledDim(const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return absl::nullopt; + } + int64 dim = -1; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (sharding.tile_assignment().dim(i) > 1) { + if (dim != -1) { + return absl::nullopt; + } + dim = i; + } + } + CHECK_NE(dim, -1); + return dim; +} + +MultiplyAddDivideOffsetCalculation::MultiplyAddDivideOffsetCalculation( + int64 multiplier, int64 offset, int64 divisor) + : multiplier_(multiplier), offset_(offset), divisor_(divisor) { + CHECK_GT(divisor_, 0); + Simplify(); +} + +OffsetCalculation MultiplyAddDivideOffsetCalculation::operator-( + const MultiplyAddDivideOffsetCalculation& other) const { + if (divisor_ == 1 && other.divisor_ == 1) { + return OffsetCalculation(MultiplyAddDivideOffsetCalculation( + multiplier_ - other.multiplier_, offset_ - other.offset_, 1)); + } + return OffsetCalculation(HloOpcode::kSubtract, *this, other); +} + +void MultiplyAddDivideOffsetCalculation::Simplify() { + // We could simplify the calculation when multiplier is a multiple of + // divisor_. However, when offset_ is not a multiple of divisor_, we must + // make sure that offset_ and multiplier_ are both non-negative or both + // non-positive. E.g., (3 * i - 1) / 3 is not equivalent to i or i - 1. 
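+ // Illustrative example (not in the original comment): with multiplier_ = 3,
+ // offset_ = -1 and divisor_ = 3, the expression (3 * i - 1) / 3 evaluates to
+ // 0, 0, 1, 2 for i = 0..3 (C++ integer division truncates toward zero),
+ // whereas i gives 0, 1, 2, 3 and i - 1 gives -1, 0, 1, 2, so the divisor
+ // cannot be folded into the multiplier and offset in that case.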
+ if (divisor_ != 1 && multiplier_ % divisor_ == 0 && + (offset_ % divisor_ == 0 || offset_ * multiplier_ > 0)) { + multiplier_ /= divisor_; + offset_ /= divisor_; + divisor_ = 1; + } +} + +int64 MultiplyAddDivideOffsetCalculation::Calculate(int64 shard_ordinal) const { + return (shard_ordinal * multiplier_ + offset_) / divisor_; +} + +HloInstruction* MultiplyAddDivideOffsetCalculation::Calculate( + HloInstruction* shard_ordinal, SpmdBuilder* b) const { + auto scalar_shape = ShapeUtil::MakeShape(S32, {}); + if (multiplier_ == 0) { + return b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(offset_ / divisor_))); + } + HloInstruction* result = shard_ordinal; + if (multiplier_ != 1) { + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kMultiply, shard_ordinal, + b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(multiplier_))))); + } + if (offset_ != 0) { + auto offset = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(offset_))); + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kAdd, result, offset)); + } + if (divisor_ != 1) { + auto divisor = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(divisor_))); + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kDivide, result, divisor)); + } + return result; +} + +int64 MultiplyAddDivideOffsetCalculation::MaxInRange( + int64 start_ordinal, int64 limit_ordinal) const { + int64 max = Calculate(start_ordinal); + for (int64 i = start_ordinal + 1; i < limit_ordinal; ++i) { + max = std::max(max, Calculate(i)); + } + return max; +} + +OffsetCalculation& OffsetCalculation::operator=( + const OffsetCalculation& other) { + opcode_ = other.opcode_; + copy_from_ = other.copy_from_; + if (opcode_ != HloOpcode::kCopy) { + lhs_ = absl::make_unique(*other.lhs_); + rhs_ = absl::make_unique(*other.rhs_); + } + return *this; +} + +bool OffsetCalculation::IsConstant() const { + if (opcode_ == HloOpcode::kCopy) { + return copy_from_.IsConstant(); + } + if (opcode_ == HloOpcode::kSubtract && *lhs_ == *rhs_) { + return true; + } + return lhs_->IsConstant() && rhs_->IsConstant(); +} + +OffsetCalculation OffsetCalculation::operator-( + const OffsetCalculation& other) const { + if (opcode_ == HloOpcode::kCopy && other.opcode_ == HloOpcode::kCopy) { + return copy_from_ - other.copy_from_; + } + return OffsetCalculation(HloOpcode::kSubtract, *this, other); +} + +bool OffsetCalculation::operator==(const OffsetCalculation& other) const { + if (opcode_ != other.opcode_) { + return false; + } + if (opcode_ == HloOpcode::kCopy) { + return copy_from_ == other.copy_from_; + } + return *lhs_ == *other.lhs_ && *rhs_ == *other.rhs_; +} + +int64 OffsetCalculation::Calculate(int64 shard_ordinal) const { + switch (opcode_) { + case HloOpcode::kCopy: + return copy_from_.Calculate(shard_ordinal); + case HloOpcode::kSubtract: + return lhs_->Calculate(shard_ordinal) - rhs_->Calculate(shard_ordinal); + case HloOpcode::kMultiply: + return lhs_->Calculate(shard_ordinal) * rhs_->Calculate(shard_ordinal); + default: + LOG(FATAL) << "Should not happen"; + } +} + +HloInstruction* OffsetCalculation::Calculate(HloInstruction* shard_ordinal, + SpmdBuilder* b) const { + if (opcode_ == HloOpcode::kCopy) { + return copy_from_.Calculate(shard_ordinal, b); + } + auto lhs = lhs_->Calculate(shard_ordinal, b); + auto rhs = rhs_->Calculate(shard_ordinal, b); + return b->AddInstruction( + 
HloInstruction::CreateBinary(lhs->shape(), opcode_, lhs, rhs)); +} + +int64 OffsetCalculation::MaxInRange(int64 start_ordinal, + int64 limit_ordinal) const { + if (IsConstant()) { + return Calculate(start_ordinal); + } + if (opcode_ == HloOpcode::kCopy) { + return std::max(Calculate(start_ordinal), Calculate(limit_ordinal - 1)); + } + int64 max = Calculate(start_ordinal); + for (int64 i = start_ordinal + 1; i < limit_ordinal; ++i) { + max = std::max(max, Calculate(i)); + } + return max; +} + +absl::optional ExchangeHalo( + HloInstruction* hlo, const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, int64 dim, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b) { + int64 input_shard_size = hlo->shape().dimensions(dim); + int64 shard_count = target.tile_assignment().dim(dim); + + std::vector concat_pieces; + + int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); + if (max_left_halo_size > input_shard_size) { + VLOG(1) << "ExchangeHalo failed: halo is beyond the left neighbor."; + return absl::nullopt; + } + if (max_left_halo_size > 0) { + std::vector> source_target_pairs; + target.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + if (indices[dim] > 0) { + std::vector source_indices(indices.begin(), indices.end()); + source_indices[dim] -= 1; + source_target_pairs.emplace_back( + target.tile_assignment()(source_indices), device); + } + }); + auto halo_shape = hlo->shape(); + auto source_halo_slice = hlo; + if (max_left_halo_size != hlo->shape().dimensions(dim)) { + halo_shape.set_dimensions(dim, max_left_halo_size); + std::vector halo_start_indices(halo_shape.rank(), 0); + halo_start_indices[dim] = + hlo->shape().dimensions(dim) - max_left_halo_size; + std::vector halo_slice_strides(halo_shape.rank(), 1); + + source_halo_slice = b->AddInstruction( + hlo->CreateSlice(halo_shape, hlo, halo_start_indices, + hlo->shape().dimensions(), halo_slice_strides)); + } + auto left_halo = + collective_ops_creator.create_cross_partition_collective_permute( + b, source_halo_slice, source_target_pairs, (*next_channel_id)++); + concat_pieces.push_back(left_halo); + } + + concat_pieces.push_back(hlo); + + // Right halo. + int64 max_right_halo_size = + right_halo_size_function.MaxInRange(0, shard_count - 1); + if (max_right_halo_size > input_shard_size) { + VLOG(1) << "ExchangeHalo failed: halo is beyond the right neighbor."; + return absl::nullopt; + } + if (max_right_halo_size > 0) { + std::vector> source_target_pairs; + target.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + if (indices[dim] > 0) { + std::vector target_indices(indices.begin(), indices.end()); + target_indices[dim] -= 1; + source_target_pairs.emplace_back( + device, target.tile_assignment()(target_indices)); + } + }); + auto halo_shape = hlo->shape(); + halo_shape.set_dimensions(dim, max_right_halo_size); + std::vector halo_start_indices(halo_shape.rank(), 0); + std::vector halo_slice_strides(halo_shape.rank(), 1); + + auto source_halo_slice = b->AddInstruction( + hlo->CreateSlice(halo_shape, hlo, halo_start_indices, + halo_shape.dimensions(), halo_slice_strides)); + auto right_halo = + collective_ops_creator.create_cross_partition_collective_permute( + b, source_halo_slice, source_target_pairs, (*next_channel_id)++); + concat_pieces.push_back(right_halo); + } + + auto concat = hlo; + // Concat with halos/padding. 
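+ // The pieces are ordered along `dim` as: the left halo received from the
+ // previous shard (if any), the local shard data, then the right halo
+ // received from the next shard (if any).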
+ if (concat_pieces.size() > 1) { + auto concat_shape = hlo->shape(); + int64 concat_dim_size = 0; + for (auto piece : concat_pieces) { + concat_dim_size += piece->shape().dimensions(dim); + } + concat_shape.set_dimensions(dim, concat_dim_size); + concat = b->AddInstruction( + HloInstruction::CreateConcatenate(concat_shape, concat_pieces, dim)); + } + + return concat; +} + +absl::optional ExchangeHalo( + HloInstruction* hlo, + std::vector left_halo_size_functions, + std::vector right_halo_size_functions, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b) { + CHECK(left_halo_size_functions.size() == hlo->shape().rank()); + CHECK(right_halo_size_functions.size() == hlo->shape().rank()); + + HloInstruction* visiting_hlo = hlo; + for (int dim = 0; dim < hlo->shape().rank(); ++dim) { + auto concat = ExchangeHalo(visiting_hlo, left_halo_size_functions[dim], + right_halo_size_functions[dim], dim, target, + collective_ops_creator, next_channel_id, b); + if (!concat) { + return absl::nullopt; + } + visiting_hlo = *concat; + } + return visiting_hlo; +} + +absl::optional ExchangeHaloAndGetValidData( + HloInstruction* hlo, const Shape& base_shape, + const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, + int64 explicit_left_padding_on_full_shape, int64 padded_full_shape_size, + int64 shard_size_with_halo, int64 dim, const HloSharding& target, + HloInstruction* offset_on_padded_shape, HloInstruction* pad_value, + HloInstruction* partition_ordinal, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region) { + auto halo_exchange_result = + ExchangeHalo(hlo, left_halo_size_function, right_halo_size_function, dim, + target, collective_ops_creator, next_channel_id, b); + if (!halo_exchange_result) { + return absl::nullopt; + } + auto concat = *halo_exchange_result; + int64 shard_count = target.tile_assignment().dim(dim); + int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); + + // Now we determine if we need extra padding after the concat. + // + // The max of halo size or the first shard's explicit left padding. + int64 max_left_halo_or_padding_size = + std::max(std::max(int64{0}, max_left_halo_size), + explicit_left_padding_on_full_shape); + // The calculation that returns the dynamic slice index for a shard on the + // padded concat, which is the difference between + // max_left_halo_or_padding_size and its left halo size. + auto start_offset_on_padded_concat_calculation = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, max_left_halo_or_padding_size, 1)) - + left_halo_size_function; + + // See if we need to pad the concat before dynamic slice. 
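+ // Extra left padding is needed when the explicit left padding on the full
+ // shape exceeds the left halo already present in the concat; extra right
+ // padding ensures that every shard's window of `shard_size_with_halo`
+ // elements, starting at its computed offset, stays in bounds.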
+ int64 extra_left_padding = + std::max(int64{0}, max_left_halo_or_padding_size - + std::max(int64{0}, max_left_halo_size)); + int64 extra_right_padding = + start_offset_on_padded_concat_calculation.MaxInRange(0, shard_count) + + shard_size_with_halo - concat->shape().dimensions(dim) - + extra_left_padding; + extra_right_padding = std::max(int64{0}, extra_right_padding); + if (extra_left_padding > 0 || extra_right_padding > 0) { + PaddingConfig padding_config; + auto padded_concat_shape = concat->shape(); + for (int64 i = 0; i < base_shape.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_interior_padding(0); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_edge_padding_high(0); + if (i != dim) { + continue; + } + padding_config_dim->set_edge_padding_low(extra_left_padding); + padding_config_dim->set_edge_padding_high(extra_right_padding); + padded_concat_shape.set_dimensions(dim, concat->shape().dimensions(dim) + + extra_left_padding + + extra_right_padding); + } + concat = b->AddInstruction(HloInstruction::CreatePad( + padded_concat_shape, concat, pad_value, padding_config)); + } + + auto valid_slice = concat; + if (shard_size_with_halo != concat->shape().dimensions(dim)) { + // Concat is bigger than the shard shape, so we need a dynamic slice. + CHECK_LT(shard_size_with_halo, concat->shape().dimensions(dim)); + auto slice_shape = concat->shape(); + slice_shape.set_dimensions(dim, shard_size_with_halo); + + if (left_halo_size_function.IsConstant() && + left_halo_size_function.Calculate(0) == + explicit_left_padding_on_full_shape) { + std::vector start_indices(slice_shape.rank(), 0); + std::vector strides(slice_shape.rank(), 1); + valid_slice = b->AddInstruction( + HloInstruction::CreateSlice(slice_shape, concat, start_indices, + slice_shape.dimensions(), strides)); + } else { + auto zero = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector slice_offsets(base_shape.rank(), zero); + slice_offsets[dim] = start_offset_on_padded_concat_calculation.Calculate( + partition_ordinal, b); + valid_slice = b->AddInstruction(HloInstruction::CreateDynamicSlice( + slice_shape, concat, slice_offsets, slice_shape.dimensions())); + } + } + + if (!mask_invalid_region) { + return valid_slice; + } + + int64 total_right_padding = padded_full_shape_size - + base_shape.dimensions(dim) - + explicit_left_padding_on_full_shape; + // Mask off garbage data due to uneven partition or low/high padding. 
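+ // This works by adding the shard's start offset on the padded full shape to
+ // an iota along `dim`, comparing the result against the valid index range
+ // [explicit_left_padding, explicit_left_padding + base dimension size), and
+ // selecting `pad_value` for out-of-range elements.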
+ if (explicit_left_padding_on_full_shape > 0 || total_right_padding > 0) { + auto index_shape = ShapeUtil::ChangeElementType(valid_slice->shape(), S32); + auto iota = b->AddInstruction(HloInstruction::CreateIota(index_shape, dim)); + auto broadcast_start_index_in_padded_shape = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, offset_on_padded_shape, {})); + auto index_in_padded_shape = b->AddInstruction( + HloInstruction::CreateBinary(index_shape, HloOpcode::kAdd, iota, + broadcast_start_index_in_padded_shape)); + auto mask_shape = ShapeUtil::ChangeElementType(index_shape, PRED); + std::vector predicates; + if (explicit_left_padding_on_full_shape > 0) { + auto valid_index_start = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, + b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + explicit_left_padding_on_full_shape))), + {})); + predicates.push_back(b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_padded_shape, valid_index_start, + ComparisonDirection::kGe))); + } + if (total_right_padding > 0) { + auto valid_index_limit = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, + b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + base_shape.dimensions(dim) + + explicit_left_padding_on_full_shape))), + {})); + predicates.push_back(b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_padded_shape, valid_index_limit, + ComparisonDirection::kLt))); + } + CHECK(!predicates.empty()); + auto is_valid = + predicates.size() == 2 + ? b->AddInstruction(HloInstruction::CreateBinary( + mask_shape, HloOpcode::kAnd, predicates[0], predicates[1])) + : predicates[0]; + auto masking_value = b->AddInstruction( + HloInstruction::CreateBroadcast(valid_slice->shape(), pad_value, {})); + valid_slice = b->AddInstruction( + HloInstruction::CreateTernary(valid_slice->shape(), HloOpcode::kSelect, + is_valid, valid_slice, masking_value)); + } + return valid_slice; +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h new file mode 100644 index 00000000000..f96b23d7073 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -0,0 +1,229 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +namespace xla { +namespace spmd { + +// Returns true if the given sharding contains any replicated sharding. +bool HasReplicatedSharding(const HloSharding& sharding); + +// Creates zero value instructions of the given shape. +HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b); + +template +HloInstruction* CreateR0WithType(PrimitiveType type, NativeT value, + SpmdBuilder* b) { + auto literal = LiteralUtil::CreateR0(value) + .ConvertToShape(ShapeUtil::MakeShape(type, {})) + .ValueOrDie(); + return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal))); +} + +// Create a binary add computation of the given type and add to the module. +HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module); + +// Returns true if the shape can be evenly partitioned for the given sharding. +// All tile sharded dimensions should be evenly divisible and there should be no +// single-device sharding. Replicate sharding is considered even partition. +bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding); + +// Returns the shard shape of the given shape when it is partitioned for the +// target sharding. +Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding); + +// Returns the shard shape for a partition without padding due to uneven +// sharding. +Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, + const HloSharding& sharding, + int64 partition_id); + +// Generates the HLO instructions that represent the dimension offsets on any +// device. The size of the returned vector is the rank of the given shape. +std::vector MakePartitionOffsets(const Shape& shape, + const HloSharding& sharding, + HloInstruction* partition_id, + SpmdBuilder* b); + +// Returns the offsets of the partition in the tile assignment. +std::vector MakeTiledPartitionOrdinals( + const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b); + +// Pads hlo to the desired shape using high padding. Either a builder or a +// computation needs to be supplied, but not both. +HloInstruction* PadToShape(HloInstruction* hlo, const Shape& padded_shape, + SpmdBuilder* b, + HloComputation* computation = nullptr); + +// Returns the padded shape when combining all partitions. +Shape GetPaddedShapeForUnevenPartitioning(const Shape& base_shape, + const HloSharding& sharding); + +// Pads the HLO (with base shape) for uneven tiled partition to make it evenly +// partitionable. +HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( + HloInstruction* hlo, const HloSharding& sharding, SpmdBuilder* b); + +// Returns the index of the unique tile dimension. Returns absl::nullopt if the +// given sharding is not tiled or tiled along multiple dimensions. +absl::optional UniqueTiledDim(const HloSharding& sharding); + +// Utilities for symbolic offset calculation and halo exchange. 
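+// For instance, the number of elements a shard needs from its left neighbor
+// (its left halo) often varies linearly with the shard ordinal; the classes
+// below represent such quantities symbolically so they can be evaluated
+// either as constants or as HLO instructions.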
+class OffsetCalculation; + +// Represents a calculation over integers: +// (shard_ordinal * multiplier + offset) / divisor +class MultiplyAddDivideOffsetCalculation { + public: + MultiplyAddDivideOffsetCalculation() + : multiplier_(0), offset_(0), divisor_(1) {} + MultiplyAddDivideOffsetCalculation(int64 multiplier, int64 offset, + int64 divisor); + + OffsetCalculation operator-( + const MultiplyAddDivideOffsetCalculation& other) const; + + bool operator==(const MultiplyAddDivideOffsetCalculation& other) const { + return multiplier_ == other.multiplier_ && offset_ == other.offset_ && + divisor_ == other.divisor_; + } + + bool IsConstant() const { return multiplier_ == 0; } + void Simplify(); + int64 Calculate(int64 shard_ordinal) const; + HloInstruction* Calculate(HloInstruction* shard_ordinal, + SpmdBuilder* b) const; + + // Returns the maximum result for shard ordinals in the range + // [start_ordinal, limit_ordinal). + int64 MaxInRange(int64 start_ordinal, int64 limit_ordinal) const; + + private: + int64 multiplier_; + int64 offset_; + int64 divisor_; +}; + +// Represents a calculation over integers based on results of other calculations +// defined by an opcode. If the opcode is kCopy, it simply wraps an +// MultiplyAddDivideOffsetCalculation. +class OffsetCalculation { + public: + OffsetCalculation() : opcode_(HloOpcode::kCopy), copy_from_() {} + explicit OffsetCalculation( + const MultiplyAddDivideOffsetCalculation& copy_from) + : opcode_(HloOpcode::kCopy), copy_from_(copy_from) {} + OffsetCalculation(const OffsetCalculation& copy_from) { *this = copy_from; } + OffsetCalculation(HloOpcode opcode, + const MultiplyAddDivideOffsetCalculation& lhs, + const MultiplyAddDivideOffsetCalculation& rhs) + : opcode_(opcode), + lhs_(absl::make_unique(lhs)), + rhs_(absl::make_unique(rhs)) {} + OffsetCalculation(HloOpcode opcode, const OffsetCalculation& lhs, + const OffsetCalculation& rhs) + : opcode_(opcode), + lhs_(absl::make_unique(lhs)), + rhs_(absl::make_unique(rhs)) {} + + OffsetCalculation& operator=(const OffsetCalculation& other); + + // Returns whether the calculation returns the same value for all shards. This + // is conservative and could return false even if it is actually constant. + bool IsConstant() const; + + OffsetCalculation operator-(const OffsetCalculation& other) const; + bool operator==(const OffsetCalculation& other) const; + int64 Calculate(int64 shard_ordinal) const; + HloInstruction* Calculate(HloInstruction* shard_ordinal, + SpmdBuilder* b) const; + + // Returns the maximum result for shard ordinals in the range + // [start_ordinal, limit_ordinal). + int64 MaxInRange(int64 start_ordinal, int64 limit_ordinal) const; + + private: + HloOpcode opcode_; + std::unique_ptr lhs_; + std::unique_ptr rhs_; + MultiplyAddDivideOffsetCalculation copy_from_; +}; + +// Performs halo exchange on the given dimension based on the provided +// left/right halo size functions. Returns nullopt if the halo is beyond the +// direct neighbor of the shard. +absl::optional ExchangeHalo( + HloInstruction* hlo, const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, int64 dim, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b); + +// Exchange halo on all dimensions of the HLO. Returns nullopt if any one of the +// dimensions fails to exchange halo (halo is beyond the neighbor shard). 
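+// Each dimension is handled in turn by the single-dimension overload above,
+// so the result accumulates halos from all dimensions.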
+absl::optional ExchangeHalo( + HloInstruction* hlo, + std::vector left_halo_size_functions, + std::vector right_halo_size_functions, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b); + +// Exchanges halos and performs pad/dynamic-slice on the concatenated data such +// that the result starts with the first needed element on each shard. It also +// masks off invalid data due to padding. +// Arguments: +// hlo: the HLO op before halo exchange +// explicit_left_padding_on_full_shape: the amount of left padding to be added +// explicitly by this function on the base shape before partitioning. Without +// base dilation, this is usually set to the window's padding_low so that the +// sharded op do not need to add padding_low on the window; however, with base +// dilation, this could only be set to a custom size. +// padded_full_shape_size: the size of the padded full shape on the given +// dimension, which includes explicit_left_padding_on_full_shape and required +// right padding to make the shape evenly shardable. +// shard_size_with_halo: the shard size on the dimension after halo exchange. +// If different shards have different sizes, use the maximum size. +// offset_on_padded_shape: the offset HLO (S32) that represents the start of +// each shard on the padded full shape. +// pad_value: the padding value used on the full shape. +absl::optional ExchangeHaloAndGetValidData( + HloInstruction* hlo, const Shape& base_shape, + const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, + int64 explicit_left_padding_on_full_shape, int64 padded_full_shape_size, + int64 shard_size_with_halo, int64 dim, const HloSharding& target, + HloInstruction* offset_on_padded_shape, HloInstruction* pad_value, + HloInstruction* partition_ordinal, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region = true); + +} // namespace spmd +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_ From f5e922903c1f4ffcce836f3c484f2e222e0922b2 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 13 May 2020 11:23:16 -0700 Subject: [PATCH 0505/1533] Roll back change: "For python op generation: add dispatch to all generated ops (don't skip ops with VISIBILITY=HIDDEN)" PiperOrigin-RevId: 311368253 Change-Id: I137dff8c2153ac9666c4a028ca891b01dcb96cc6 --- tensorflow/python/framework/python_op_gen.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 02b659528b0..857cc7b6638 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -371,7 +371,9 @@ void GenEagerPythonOp::HandleGraphMode( const string& function_setup, const std::vector& output_sizes) { strings::StrAppend(&result_, " # Add nodes to the TensorFlow graph.\n"); strings::StrAppend(&result_, function_setup); - strings::StrAppend(&result_, " try:\n "); + if (api_def_.visibility() == ApiDef::VISIBLE) { + strings::StrAppend(&result_, " try:\n "); + } strings::StrAppend( &result_, " _, _, _op, _outputs = _op_def_library._apply_op_helper(\n"); AddBodyNoReturn(strings::StrCat(" \"", op_def_.name(), "\", ")); @@ -688,7 +690,9 @@ void GenEagerPythonOp::AddEagerFunctionTeardown( bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( const string& parameters, const std::vector& 
output_sizes, const string& eager_not_allowed_error) { - strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); + if (api_def_.visibility() == ApiDef::VISIBLE) { + strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); + } AddExport(); AddDefLine(function_name_, parameters); @@ -951,6 +955,8 @@ void GenEagerPythonOp::AddEagerExecute(const string& indentation, } void GenEagerPythonOp::AddDispatch(const string& prefix) { + if (api_def_.visibility() != ApiDef::VISIBLE) return; + strings::StrAppend(&result_, prefix, "except (TypeError, ValueError):\n"); strings::StrAppend(&result_, prefix, " result = _dispatch.dispatch(\n"); AddBodyNoReturn(strings::StrCat(prefix, " ", function_name_, ", ")); From 0ecd3d8db02b004795402d58130d7816e8e00965 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 13 May 2020 11:41:34 -0700 Subject: [PATCH 0506/1533] Update tf.InplaceUpdate summary and description so both TensorFlow op registry and TensorFlow MLIR ODS match. PiperOrigin-RevId: 311372143 Change-Id: I0f6a8debc07cfe67662b3256d0f4ffdab1070eb5 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 25 ------------------- .../compiler/mlir/tensorflow/ir/tf_ops.td | 24 ++++++++++++++++++ .../base_api/api_def_InplaceUpdate.pbtxt | 8 +++--- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index bddf064f5c6..2d02d0b7508 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -3625,31 +3625,6 @@ tf.imag(input) ==> [4.75, 5.75] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } -def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { - let summary = [{ - Create a copy of `x` with the updated specified rows 'i' with values 'v'. - - }]; - - let description = [{ - Creates a copy of tensor 'x' and updates the columns specified in tensor 'i' - with the values 'v'. Originally this function was mutative however for - compilation we make this operation create / operate on a copy. - }]; - - let arguments = (ins - TF_Tensor:$x, - I32Tensor:$i, - TF_Tensor:$v - ); - - let results = (outs - TF_Tensor:$y - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; -} - def TF_InvOp : TF_Op<"Inv", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the reciprocal of x element-wise."; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 744d1ac5b71..94b0c5f5e19 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -905,5 +905,29 @@ def TF_TensorSliceDatasetOp : TF_Op<"TensorSliceDataset", []> { TF_DerivedOperandTypeListAttr Toutput_types = TF_DerivedOperandTypeListAttr<0>; } +// TODO(b/156507832): Move tf.InplaceUpdate to tf_generated_ops.td once +// autogenerated op def matches. +def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { + let summary = "Updates specified rows 'i' with values 'v'."; + + let description = [{ +Computes `x[i, :] = v; return x`. + +Originally this function is mutative however for compilation we make this +operation create / operate on a copy of `x`. 
+ }]; + + let arguments = (ins + TF_Tensor:$x, + I32Tensor:$i, + TF_Tensor:$v + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} #endif // TF_OPS diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt index 2fcd3659dc7..c0c160d1be4 100644 --- a/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt @@ -20,9 +20,11 @@ op { "A `Tensor` of type T. An alias of `x`. The content " "of `y` is undefined if there are duplicates in `i`." } - summary: < Date: Wed, 13 May 2020 11:46:15 -0700 Subject: [PATCH 0507/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311373038 Change-Id: I495328e259c9c69c73cfa43cc284da61999ada47 --- tensorflow/go/op/wrappers.go | 51 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..7a07a0e78d8 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -23940,9 +23940,12 @@ func Copy(scope *Scope, input tf.Output, optional ...CopyAttr) (output tf.Output return op.Output(0) } -// Updates specified rows with values in `v`. +// Updates specified rows 'i' with values 'v'. // -// Computes `x[i, :] = v; return x`. +// Computes `x[i, :] = v; return x`. +// +// Originally this function is mutative however for compilation we make this +// operation create / operate on a copy of `x`. // // Arguments: // x: A tensor of type `T`. @@ -25651,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 90f3a1eb381e644ac5d0f3fd126af25f856820a9 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 13 May 2020 11:49:10 -0700 Subject: [PATCH 0508/1533] Rolling change forward again: "Add support for global operation dispatchers. 
(This is intended for use by TF-internal classes only.)" PiperOrigin-RevId: 311373578 Change-Id: Ib40cee66bbb1395c8997db3c1eb3f5914425a280 --- tensorflow/python/util/dispatch.py | 21 +++++++++ tensorflow/python/util/dispatch_test.py | 58 ++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py index e94e3345348..3868da14b44 100644 --- a/tensorflow/python/util/dispatch.py +++ b/tensorflow/python/util/dispatch.py @@ -39,6 +39,10 @@ from tensorflow.python.util import tf_inspect DISPATCH_ATTR = "_tf_dispatchers" +# OpDispatchers which should be used for all operations. +_GLOBAL_DISPATCHERS = [] + + class OpDispatcher(object): """Abstract base class for TensorFlow operator dispatchers. @@ -82,6 +86,19 @@ class OpDispatcher(object): getattr(op, DISPATCH_ATTR).append(self) +class GlobalOpDispatcher(object): + """Abstract base class for TensorFlow global operator dispatchers.""" + + NOT_SUPPORTED = OpDispatcher.NOT_SUPPORTED + + def handle(self, op, args, kwargs): + """Handle the specified operation with the specified arguments.""" + + def register(self): + """Register this dispatcher as a handler for all ops.""" + _GLOBAL_DISPATCHERS.append(self) + + def dispatch(op, *args, **kwargs): """Returns the result from the first successful dispatcher for a given op. @@ -101,6 +118,10 @@ def dispatch(op, *args, **kwargs): result = dispatcher.handle(args, kwargs) if result is not OpDispatcher.NOT_SUPPORTED: return result + for dispatcher in _GLOBAL_DISPATCHERS: + result = dispatcher.handle(op, args, kwargs) + if result is not OpDispatcher.NOT_SUPPORTED: + return result return OpDispatcher.NOT_SUPPORTED diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py index 89999fcf843..bd35c391924 100644 --- a/tensorflow/python/util/dispatch_test.py +++ b/tensorflow/python/util/dispatch_test.py @@ -45,6 +45,47 @@ def test_op(x, y, z): return x + (2 * y) + (3 * z) +class TensorTracer(object): + """An object used to trace TensorFlow graphs. + + This is an example class that is used to test global op dispatchers. The + global op dispatcher for TensorTracers is defined below. + """ + + def __init__(self, name, args=None, kwargs=None): + self.name = name + self.args = args + self.kwargs = kwargs + + def __repr__(self): + if self.args is None and self.kwargs is None: + return self.name + else: + args = [str(x) for x in self.args] + args += sorted( + ["{}={}".format(name, x) for (name, x) in self.kwargs.items()]) + return "{}({})".format(self.name, ", ".join(args)) + + +class TensorTracerOpDispatcher(dispatch.GlobalOpDispatcher): + """Global op dispatcher for TensorTracer.""" + + def handle(self, op, args, kwargs): + # Dispatcher only applies if at least one arg is a TensorTracer. 
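+ # For example (illustrative): math_ops.add(TensorTracer("x"), 1) is handled
+ # here and returns a TensorTracer, while a call whose arguments contain no
+ # TensorTracer falls through by returning NOT_SUPPORTED below.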
+ if not (any(self.is_tensor_tracer_arg(x) for x in args) or + any(self.is_tensor_tracer_arg(x) for x in kwargs.values())): + return self.NOT_SUPPORTED + + return TensorTracer(op.__name__, args, kwargs) + + def is_tensor_tracer_arg(self, value): + if isinstance(value, TensorTracer): + return True + if isinstance(value, (list, tuple)): + if any(isinstance(x, TensorTracer) for x in value): + return True + + @test_util.run_all_in_graph_and_eager_modes class DispatchTest(test_util.TensorFlowTestCase): @@ -131,8 +172,21 @@ class DispatchTest(test_util.TensorFlowTestCase): r".*some_op \(from __main__\) is deprecated and will be " "removed in a future version.*") + def testGlobalDispatcher(self): + original_global_dispatchers = dispatch._GLOBAL_DISPATCHERS + try: + TensorTracerOpDispatcher().register() + + x = TensorTracer("x") + y = TensorTracer("y") + trace = math_ops.reduce_sum(math_ops.add(math_ops.abs(x), y), axis=3) + self.assertEqual( + str(trace), "reduce_sum(add(name=None, x=abs(x), y=y), axis=3)") + + finally: + # Clean up. + dispatch._GLOBAL_DISPATCHERS = original_global_dispatchers + if __name__ == "__main__": googletest.main() - - From 91914123c4239c13b15789e0147a6874e0abaf6c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 13 May 2020 18:49:08 +0000 Subject: [PATCH 0509/1533] Simplify comment and use tensor_util.is_tensor and convert_to_tensor if both x and y are not tensor This is needed for `x / y` where both x and y are not tensors. As long as x is a tensor, `x / y` will work and will rely on _truediv_python3 to correctly figure out the right type for y. Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 2c141483eb1..2d8285f709c 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -72,7 +72,6 @@ from __future__ import print_function import numpy as np import six -import sys from six.moves import builtins from six.moves import xrange # pylint: disable=redefined-builtin @@ -83,6 +82,7 @@ from tensorflow.python.framework import graph_util from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_data_flow_ops @@ -439,27 +439,9 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: - # tf.math.divide will compute python style division x / y. As python 2 - # and python 3 have very much different semantics on `/` (__div__ vs. - # __truediv__), it would be natural to just use `x / y` as the operator - # '/' has already been registered for tensors, see - # _OverrideBinaryOperatorHelper for more details. - # However, in case both x and y are not tensors, the registered '/' - # _OverrideBinaryOperatorHelper will not take effect. In this case, - # python's default '/' operator will take effect which result in the return - # value of `tf.math.divide` as a non-Tensor. - # For that reason we excplicitly calls _truediv_python3/_div_python2 - # in case both x and y are not tensors. - # Since _truediv_python3/_div_python2 operates on tensors and will convert - # to tensor if needed. 
This avoid the situation of the following if not - # explicitly calling _truediv_python3/_div_python2: - # >>> tf.divide(5, 2) - # 2.5 <= should be instead. - if not (isinstance(x, ops.Tensor) or isinstance(y, ops.Tensor)): - if sys.version_info.major < 3: - return _div_python2(x, y) - else: - return _truediv_python3(x, y) + # We do conversion here to make sure at least either x or y is a tensor. + if not (tensor_util.is_tensor(x) or tensor_util.is_tensor(y)): + x = ops.convert_to_tensor(x) return x / y From 39026d9e33040e9ff2a9d226543cfcac40e97010 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 13 May 2020 19:59:40 +0000 Subject: [PATCH 0510/1533] Remove the need to check if y is a tensor (always convert x if not) Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 2d8285f709c..03c10b37c95 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -439,8 +439,8 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: - # We do conversion here to make sure at least either x or y is a tensor. - if not (tensor_util.is_tensor(x) or tensor_util.is_tensor(y)): + # We do conversion here to make sure at least x is a tensor. + if not tensor_util.is_tensor(x): x = ops.convert_to_tensor(x) return x / y From c14af64d68f7a5ad852bbe2ff33d553a6e37b1a1 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 13 May 2020 13:07:54 -0700 Subject: [PATCH 0511/1533] Fix Windows build after cl/311081931. PiperOrigin-RevId: 311389429 Change-Id: I12d355802a21f3538df563410a3ccf3d10dbedee --- .../crosstool/windows/msvc_wrapper_for_nvcc.py.tpl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl index c10fb826494..de6512e3088 100644 --- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -59,7 +59,7 @@ def GetOptionValue(argv, option): parser.add_argument(option, nargs='*', action='append') option = option.lstrip('-/').replace('-', '_') args, leftover = parser.parse_known_args(argv) - if args and vars(args).get(option): + if args and vars(args)[option]: return (sum(vars(args)[option], []), leftover) return ([], leftover) @@ -136,10 +136,12 @@ def InvokeNvcc(argv, log=False): m_options = ["-m64"] nvccopts = ['-D_FORCE_INLINES'] - for capability in GetOptionValue(argv, "--cuda-gpu-arch"): + compute_capabilities, argv = GetOptionValue(argv, "--cuda-gpu-arch") + for capability in compute_capabilities: + print(capability) capability = capability[len('sm_'):] - nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % ( - capability, capability, capability) + nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % ( + capability, capability, capability)] nvccopts += nvcc_compiler_options nvccopts += undefines nvccopts += defines From 20f064bf51db4cd2b0934e9656f8b497691e7901 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Wed, 13 May 2020 13:10:21 -0700 Subject: [PATCH 0512/1533] Improve image testing for TFLite Java tests PiperOrigin-RevId: 311389871 Change-Id: I3114c2af72e6029035f4ba16f002ec284b9c5917 --- tensorflow/lite/java/BUILD | 1 + .../lite/InterpreterMobileNetTest.java 
| 66 +++++++++++++----- .../java/org/tensorflow/lite/TestUtils.java | 62 ++++++++++++++++ .../java/src/testdata/grace_hopper_224.jpg | Bin 0 -> 24459 bytes 4 files changed, 111 insertions(+), 18 deletions(-) create mode 100644 tensorflow/lite/java/src/testdata/grace_hopper_224.jpg diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 2fcb4b631be..46cd1be25cb 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -14,6 +14,7 @@ package( exports_files([ "src/testdata/add.bin", "src/testdata/add_unknown_dimensions.bin", + "src/testdata/grace_hopper_224.jpg", ]) JAVA_SRCS = glob([ diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java index aaac2f9690a..446cf5f7b02 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java @@ -18,7 +18,11 @@ package org.tensorflow.lite; import static com.google.common.truth.Truth.assertThat; import java.nio.ByteBuffer; -import java.nio.ByteOrder; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Map; +import java.util.PriorityQueue; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -61,14 +65,9 @@ public final class InterpreterMobileNetTest { } private static void runMobileNetFloatTest(Interpreter.Options options) { - // Create a gray image. - ByteBuffer img = ByteBuffer.allocateDirect(1 * 224 * 224 * 3 * 4); - img.order(ByteOrder.nativeOrder()); - img.rewind(); - while (img.hasRemaining()) { - img.putFloat(0.5f); - } - + ByteBuffer img = + TestUtils.getTestImageAsFloatByteBuffer( + "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg"); float[][] labels = new float[1][1001]; try (Interpreter interpreter = new Interpreter(MOBILENET_FLOAT_MODEL_BUFFER, options)) { interpreter.run(img, labels); @@ -78,22 +77,53 @@ public final class InterpreterMobileNetTest { assertThat(labels[0]) .usingExactEquality() .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY}); + // 653 == "military uniform" + assertThat(getTopKLabels(labels, 3)).contains(653); } private static void runMobileNetQuantizedTest(Interpreter.Options options) { - // Create a gray image. 
- ByteBuffer img = ByteBuffer.allocateDirect(1 * 224 * 224 * 3); - img.order(ByteOrder.nativeOrder()); - img.rewind(); - while (img.hasRemaining()) { - img.put((byte) 128); - } - + ByteBuffer img = + TestUtils.getTestImageAsByteBuffer( + "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg"); + byte[][] labels = new byte[1][1001]; try (Interpreter interpreter = new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options)) { - byte[][] labels = new byte[1][1001]; interpreter.run(img, labels); assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3}); assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001}); } + // 653 == "military uniform" + assertThat(getTopKLabels(labels, 3)).contains(653); + } + + private static ArrayList getTopKLabels(byte[][] byteLabels, int k) { + float[][] labels = new float[1][1001]; + for (int i = 0; i < byteLabels[0].length; ++i) { + labels[0][i] = (byteLabels[0][i] & 0xff) / 255.0f; + } + return getTopKLabels(labels, k); + } + + private static ArrayList getTopKLabels(float[][] labels, int k) { + PriorityQueue> pq = + new PriorityQueue<>( + k, + new Comparator>() { + @Override + public int compare(Map.Entry o1, Map.Entry o2) { + // Intentionally reversed to put high confidence at the head of the queue. + return o1.getValue().compareTo(o2.getValue()) * -1; + } + }); + + for (int i = 0; i < labels[0].length; ++i) { + pq.add(new AbstractMap.SimpleEntry<>(i, labels[0][i])); + } + + final ArrayList topKLabels = new ArrayList<>(); + int topKLabelsSize = Math.min(pq.size(), k); + for (int i = 0; i < topKLabelsSize; ++i) { + topKLabels.add(pq.poll().getKey()); + } + return topKLabels; } } diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestUtils.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestUtils.java index 1471b4b506b..ae88cddcf57 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestUtils.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestUtils.java @@ -15,17 +15,24 @@ limitations under the License. package org.tensorflow.lite; +import java.awt.image.BufferedImage; import java.io.File; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.StandardOpenOption; import java.util.EnumSet; +import javax.imageio.ImageIO; /** Utility for interacting with test-specific data. 
*/ public abstract class TestUtils { + private static final float DEFAULT_IMAGE_MEAN = 127.5f; + private static final float DEFAULT_IMAGE_STD = 127.5f; + public static MappedByteBuffer getTestFileAsBuffer(String path) { try (FileChannel fileChannel = (FileChannel) @@ -40,5 +47,60 @@ public abstract class TestUtils { return true; } + public static ByteBuffer getTestImageAsByteBuffer(String path) { + File imageFile = new File(path); + try { + BufferedImage image = ImageIO.read(imageFile); + return toByteBuffer(image); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public static ByteBuffer getTestImageAsFloatByteBuffer(String path) { + File imageFile = new File(path); + try { + BufferedImage image = ImageIO.read(imageFile); + return toFloatByteBuffer(image); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private static ByteBuffer toByteBuffer(BufferedImage image) { + ByteBuffer imgData = + ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 3) + .order(ByteOrder.nativeOrder()); + for (int y = 0; y < image.getHeight(); y++) { + for (int x = 0; x < image.getWidth(); x++) { + int val = image.getRGB(x, y); + imgData.put((byte) ((val >> 16) & 0xFF)); + imgData.put((byte) ((val >> 8) & 0xFF)); + imgData.put((byte) (val & 0xFF)); + } + } + return imgData; + } + + private static ByteBuffer toFloatByteBuffer(BufferedImage image) { + return toFloatByteBuffer(image, DEFAULT_IMAGE_MEAN, DEFAULT_IMAGE_STD); + } + + private static ByteBuffer toFloatByteBuffer( + BufferedImage image, float imageMean, float imageStd) { + ByteBuffer imgData = + ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 3 * 4) + .order(ByteOrder.nativeOrder()); + for (int y = 0; y < image.getHeight(); y++) { + for (int x = 0; x < image.getWidth(); x++) { + int pixelValue = image.getRGB(x, y); + imgData.putFloat((((pixelValue >> 16) & 0xFF) - imageMean) / imageStd); + imgData.putFloat((((pixelValue >> 8) & 0xFF) - imageMean) / imageStd); + imgData.putFloat(((pixelValue & 0xFF) - imageMean) / imageStd); + } + } + return imgData; + } + private TestUtils() {} } diff --git a/tensorflow/lite/java/src/testdata/grace_hopper_224.jpg b/tensorflow/lite/java/src/testdata/grace_hopper_224.jpg new file mode 100644 index 0000000000000000000000000000000000000000..15a2f2bd2a5e3d1b23a9af50251b3711d3b0a69a GIT binary patch literal 24459 zcmcG!byyrt&@Z~U28TefB{&2K?jGFT-EDCQ4uOQA!Civ8`{EuXKv;CK;10nZ&gOkT zIp2Nmea^jq+?ws)o~r7qYMGwt-^|Ox%WnXVytJG&01gfgUtvy&1~FZ_0awr z_)BpA#Dj>ACF$(qc{a=Frr&U+4C7m6e z-GH)Yj@BN|AS z4$FUY{_%ACAD%Sb%s}qeR&M{hCl^?+|KAeA9?Z)B53+y802Nur)6UZBZ&P8T`L`7S zc(H#I;NL!`c%`MT2z2(ca&!9+#lIu?uj~!7hNGRuzqNf}%zs<|R}6qRh9x9e0TUnt za08eDEC5yjAgn)~0WPq^3dZ}#+h5)PLH{{6Fy`O-f9?FA=%1G0;>t-%0<}OMW;Qlf zmj5{&U=#WuDZy(+Gap!o%Y6XKSb?nE%sgNNE@9^Wcf$Q`58OXmn1sq!-as8YkfpP? zJ5bZs)5#oUX6Fb1;MV=sfhAK|f{hd$5C8~-6>*aR!~nRL|LqEbhl90I7B*fd04#A! 
zlBkiH>{XVvT&e!sNScHM`RsX%Z_P(0Q@ypOebPoWg>mUAkI6ww5atnxQEx7~7 zO-BV?$w}nI4!r!ZR`kH*F7I)^Rg;Raaj#UELN-C2e_dC|aoC3@#ykWpjv_%HOc?;Iz0r~qHp!mz+x;<*DF=dt5(hq>n8Eb{2d7BD$8>=t z3HV_^DK3NM0CyoNN5&wGETp_%C{?l+nAAAK6m4cua z5}-&u{&CD7h8D?lCnY|l>NN?9EdKzQ+1bK&-Cr9|z;28;E9)z6RnOcmHI7u()h$mx z^Q+#s7Eh^4QRI?)oB?a)MItvI0PXO`7I$!{={>RIs_`U?6@WnagYVl4Jk@_M0G^Th z=LvuUI0gsx;{x5Z?MZ-o@r4C79WEYHf)BjRj61U4e!AIhO0w2Cx9v>)rwuA++klnz;xUKRnO4AdpL5?1dqbOJu}_LeMC{E+RaHM-RPA+AiFx-6 zjw(+i^WUU)_zYpRtz;P7dIRixV796fkOG8l>(q=Mn&bo!f!pX#6mYVLo13LOys-r% zo=yy)evvgvU3ELovXdpY)(}bdlgJoGov#T8yzqFzjW~d*CL^a4k2n|Zq>4_d!N6YJ z^&U#hXy=NYqop}*c6PL(`sY^5jd|&=VX!wXN{|a=^iUmpb;D0H>+2bVGFD~^e4wgV zDmJOK{i(80Qrb{GK?D7ec*HK1&s81|6aBDYS~>L|NRB&TYcNVCO!|P+nfY#Rb;>L_ z>8CCDtwkqIahfMN^zsJW5%RZUZa@~(!CLIq0oPF|ybE#8gPIIH@Ia!V#Q zp|p*_vUyhF>gn#EKKR|K&iBnezSZg1+0ICQs;9|n^gyDzh`-%un>*WWb`~BxsqZ=Xu&&Lw%hl`n9i|a? zUA<{-cQra%G^{+=3i8D6^UAsih~{Su!>*NKVSmFdKu?#q?TY#T0JGB(ii$uRw{yqH zU?ZB0Yb)8KB0G|Q zhS5rgZj4-2QYwno_f0(5M%|!6^NE@MP$Qg1?e69b`hEJ&4XgqZt;Csw$4)RnbSHwc z0Qba@!pe4t;}!t{#c^7!W~tScugxQ^b6_uFVYPR(r zVF?kKq*k}F$x(a zK~ae>Q0fadV8VSx_mUv{30F_1BT}MfQzPCm0|;(N3FcQu{-sn2ae%@4SUpKT)H0VjZ*t0tl99|3 zc*YWwBy{-+!dRq5m@&{sHnsU@Gg>Y5*L$v7yHQ)~Q@Wb6riIp-e0!nABqc+h#N%(J zw+l6=Q7ktb{S_@$B|S%(3B}ocU1tf(M6o6Ce0WnNkY(7B9;?6B{Z*DfzP57f&ZV)jD z^XIm<^u5nw<~pTo>(ucl2Qy6M2pdTNd!D%3W=mzJ*_kf1Hfq6DZl{0^Aw2IgOm**q z+)~t_QnR#3By_}^?3k1!0yY3627w4CbjQQvgy<@cM+SL4@IN)ggDV_}5%b5h5u$Li zQ@ut`^+BFEm>2__0zoST?dkEtV*8+S=#j*M*8p3tfFZzVuNYF`mf#~i#f1P#;$j91 zQ{g-m1rKm13DoPUOm_r>9dJsXQj~j;PjA%Y*@qOHq0~56w1q$>IqQNzQcNlV`1Zy< zQVA+BdJGLJT36>GMFk`o_V&X#>Y7$KNrgy})Ajn}0WwcMrF_3E7~pbz!kIJFnZQXP NZ6lFB*mCuk|JnJR1G)eJ literal 0 HcmV?d00001 From 3dcace488c5dbe4ff8a88496c5e8ff72144d780d Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 13 May 2020 13:47:15 -0700 Subject: [PATCH 0513/1533] review comments. --- .bazelrc | 2 +- tensorflow/tensorflow.bzl | 8 +- tensorflow/workspace.bzl | 11 -- third_party/mkl/build_defs.bzl | 2 +- third_party/mkl_dnn/build_defs.bzl | 4 +- third_party/mkl_dnn/mkldnn_threadpool.BUILD | 133 -------------------- third_party/mkl_dnn/mkldnn_v1.BUILD | 26 ++-- 7 files changed, 27 insertions(+), 159 deletions(-) delete mode 100644 third_party/mkl_dnn/mkldnn_threadpool.BUILD diff --git a/.bazelrc b/.bazelrc index bb5f1c03727..2c83830072d 100644 --- a/.bazelrc +++ b/.bazelrc @@ -143,11 +143,11 @@ build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt +# config to build OneDNN backend with a user specified threadpool. build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl_threadpool --define=build_with_mkldnn_threadpool=true build:mkl_threadpool -c opt - # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. 
build:using_cuda --define=using_cuda=true diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index b6066200553..0b544ae54f1 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -328,9 +328,11 @@ def tf_copts( if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_mkl_v1_open_source_only(["-DENABLE_MKLDNN_V1"]) + - if_mkldnn_threadpool(["-DENABLE_MKLDNN_THREADPOOL"]) + - if_mkldnn_threadpool(["-DENABLE_MKLDNN_V1"]) + - if_mkldnn_threadpool(["-DINTEL_MKL_DNN_ONLY"]) + + if_mkldnn_threadpool([ + "-DENABLE_MKLDNN_THREADPOOL", + "-DENABLE_MKLDNN_V1", + "-DINTEL_MKL_DNN_ONLY" + ]) + if_enable_mkl(["-DENABLE_MKL"]) + if_ngraph(["-DINTEL_NGRAPH=1"]) + if_android_arm(["-mfpu=neon"]) + diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 24f9b962d79..83e74f3d105 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -232,17 +232,6 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) - tf_http_archive( - name = "mkl_dnn_tp", - build_file = clean_dep("//third_party/mkl_dnn:mkldnn_threadpool.BUILD"), - sha256 = "54737bcb4dc1961d32ee75da3ecc529fa48198f8b2ca863a079e19a9c4adb70f", - strip_prefix = "oneDNN-1.4", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/oneapi-src/oneDNN/archive/v1.4.tar.gz", - "https://github.com/oneapi-src/oneDNN/archive/v1.4.tar.gz", - ], - ) - tf_http_archive( name = "com_google_absl", build_file = clean_dep("//third_party:com_google_absl.BUILD"), diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl index f69d27dd094..bd0686523bc 100644 --- a/third_party/mkl/build_defs.bzl +++ b/third_party/mkl/build_defs.bzl @@ -107,7 +107,7 @@ def mkl_deps(): return select({ "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_only": ["@mkl_dnn"], "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_v1_only": ["@mkl_dnn_v1//:mkl_dnn"], - "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": ["@mkl_dnn_tp//:mkl_dnn"], + "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": ["@mkl_dnn_v1//:mkl_dnn"], "@org_tensorflow//third_party/mkl:build_with_mkl_ml_only": ["@org_tensorflow//third_party/mkl:intel_binary_blob"], "@org_tensorflow//third_party/mkl:build_with_mkl": [ "@org_tensorflow//third_party/mkl:intel_binary_blob", diff --git a/third_party/mkl_dnn/build_defs.bzl b/third_party/mkl_dnn/build_defs.bzl index 5778d136e9b..bd3b4b94f29 100644 --- a/third_party/mkl_dnn/build_defs.bzl +++ b/third_party/mkl_dnn/build_defs.bzl @@ -34,10 +34,10 @@ def if_mkldnn_threadpool(if_true, if_false = []): """Returns `if_true` if MKL-DNN v1.x is used. Shorthand for select()'ing on whether we're building with - MKL-DNN v1.x open source library only, without depending on MKL binary form. + MKL-DNN v1.x open source library only with user specified threadpool, without depending on MKL binary form. Returns a select statement which evaluates to if_true if we're building - with MKL-DNN v1.x open source library only. Otherwise, the + with MKL-DNN v1.x open source library only with user specified threadpool. Otherwise, the select statement evaluates to if_false. 
""" diff --git a/third_party/mkl_dnn/mkldnn_threadpool.BUILD b/third_party/mkl_dnn/mkldnn_threadpool.BUILD deleted file mode 100644 index 7209b8a62d0..00000000000 --- a/third_party/mkl_dnn/mkldnn_threadpool.BUILD +++ /dev/null @@ -1,133 +0,0 @@ -exports_files(["LICENSE"]) - -load( - "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", - "if_mkl_open_source_only", - "if_mkldnn_threadpool", -) -load( - "@org_tensorflow//third_party:common.bzl", - "template_rule", -) - -config_setting( - name = "clang_linux_x86_64", - values = { - "cpu": "k8", - "define": "using_clang=true", - }, -) - -template_rule( - name = "dnnl_config_h", - src = "include/dnnl_config.h.in", - out = "include/dnnl_config.h", - substitutions = { - "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL", - "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL", - "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", - }, -) -# Create the file mkldnn_version.h with MKL-DNN version numbers. -# Currently, the version numbers are hard coded here. If MKL-DNN is upgraded then -# the version numbers have to be updated manually. The version numbers can be -# obtained from the PROJECT_VERSION settings in CMakeLists.txt. The variable is -# set to "version_major.version_minor.version_patch". The git hash version can -# be set to NA. -# TODO(agramesh1) Automatically get the version numbers from CMakeLists.txt. - -template_rule( - name = "dnnl_version_h", - src = "include/dnnl_version.h.in", - out = "include/dnnl_version.h", - substitutions = { - "@DNNL_VERSION_MAJOR@": "1", - "@DNNL_VERSION_MINOR@": "4", - "@DNNL_VERSION_PATCH@": "0", - "@DNNL_VERSION_HASH@": "N/A", - }, -) - -cc_library( - name = "mkl_dnn", - srcs = glob([ - "src/common/*.cpp", - "src/common/*.hpp", - "src/cpu/*.cpp", - "src/cpu/*.hpp", - "src/cpu/**/*.cpp", - "src/cpu/**/*.hpp", - "src/cpu/xbyak/*.h", - ]) + if_mkldnn_threadpool([ - ":dnnl_config_h", - ]) + [":dnnl_version_h"], - hdrs = glob(["include/*"]), - copts = [ - "-fexceptions", - "-DUSE_MKL", - "-DUSE_CBLAS", - ] + if_mkl_open_source_only([ - "-UUSE_MKL", - "-UUSE_CBLAS", - ]) + if_mkldnn_threadpool([ - "-UUSE_MKL", - "-UUSE_CBLAS", - ]) + select({ - "@org_tensorflow//tensorflow:linux_x86_64": ["-fopenmp-simd"], - # TODO(ibiryukov): enable openmp with clang by including libomp as a - # dependency. - ":clang_linux_x86_64": [], - "//conditions:default": [], - }), - includes = [ - "include", - "src", - "src/common", - "src/cpu", - "src/cpu/gemm", - "src/cpu/xbyak", - ], - visibility = ["//visibility:public"], - deps = select({ - "@org_tensorflow//tensorflow:linux_x86_64": [ - "@mkl_linux//:mkl_headers", - "@mkl_linux//:mkl_libs_linux", - ], - "@org_tensorflow//tensorflow:macos": [ - "@mkl_darwin//:mkl_headers", - "@mkl_darwin//:mkl_libs_darwin", - ], - "@org_tensorflow//tensorflow:windows": [ - "@mkl_windows//:mkl_headers", - "@mkl_windows//:mkl_libs_windows", - ], - "//conditions:default": [], - }), -) - -cc_library( - name = "mkldnn_single_threaded", - srcs = glob([ - "src/common/*.cpp", - "src/common/*.hpp", - "src/cpu/*.cpp", - "src/cpu/*.hpp", - "src/cpu/**/*.cpp", - "src/cpu/**/*.hpp", - "src/cpu/xbyak/*.h", - ]) + [":dnnl_config_h"], - hdrs = glob(["include/*"]), - copts = [ - "-fexceptions", - "-DMKLDNN_THR=MKLDNN_THR_SEQ", # Disables threading. 
- ], - includes = [ - "include", - "src", - "src/common", - "src/cpu", - "src/cpu/gemm", - "src/cpu/xbyak", - ], - visibility = ["//visibility:public"], -) diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD index 243ec00a60f..c7aa0207ee2 100644 --- a/third_party/mkl_dnn/mkldnn_v1.BUILD +++ b/third_party/mkl_dnn/mkldnn_v1.BUILD @@ -4,6 +4,7 @@ load( "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", "if_mkl_open_source_only", "if_mkl_v1_open_source_only", + "if_mkldnn_threadpool", ) load( "@org_tensorflow//third_party:common.bzl", @@ -17,16 +18,26 @@ config_setting( "define": "using_clang=true", }, ) +_DNNL_RUNTIME_OMP = { + "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP", + "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP", + "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", +} + +_DNNL_RUNTIME_THREADPOOL = { + "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL", + "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL", + "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", +} template_rule( name = "dnnl_config_h", src = "include/dnnl_config.h.in", out = "include/dnnl_config.h", - substitutions = { - "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP", - "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP", - "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", - }, + substitutions = if_mkldnn_threadpool( + _DNNL_RUNTIME_THREADPOOL, + if_false = _DNNL_RUNTIME_OMP, + ), ) # Create the file mkldnn_version.h with MKL-DNN version numbers. @@ -59,9 +70,8 @@ cc_library( "src/cpu/**/*.cpp", "src/cpu/**/*.hpp", "src/cpu/xbyak/*.h", - ]) + if_mkl_v1_open_source_only([ - ":dnnl_config_h", - ]) + [":dnnl_version_h"], + ]) + [":dnnl_config_h"] + + [":dnnl_version_h"], hdrs = glob(["include/*"]), copts = [ "-fexceptions", From 7d704e32464c25e910a8ce51d643336b7d8f8bd6 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 13 May 2020 13:47:01 -0700 Subject: [PATCH 0514/1533] Move tf.keras.layers.featureDenseFeature back to Keras package. 
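
For context, the layer being relocated here is the public `tf.keras.layers.DenseFeatures`
API. Below is a minimal usage sketch, not taken from this change: the feature names,
vocabulary, and values are illustrative assumptions only, and it assumes a TF 2.x eager
environment where `tf.feature_column` and `tf.keras.layers.DenseFeatures` are available.

    import tensorflow as tf

    # Illustrative feature columns: one numeric, one categorical mapped to a one-hot.
    price = tf.feature_column.numeric_column('price')
    color = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'color', ['red', 'green', 'blue']))

    # DenseFeatures turns a dict of raw feature tensors into a single dense tensor.
    layer = tf.keras.layers.DenseFeatures([price, color])
    features = {
        'price': tf.constant([[1.0], [2.5]]),
        'color': tf.constant([['red'], ['blue']]),
    }
    dense = layer(features)  # float32 tensor of shape (2, 4): 3 one-hot dims + 1 numeric

The sketch only exercises the layer's call path; the diff below changes where the
implementation and its serialization hooks live, not the behavior shown above.
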
PiperOrigin-RevId: 311396758 Change-Id: I253d89a5f23dce3ed06db665640c0c2ec3902cf9 --- tensorflow/python/feature_column/BUILD | 20 + .../feature_column/dense_features.py | 5 + .../feature_column/dense_features_test.py | 416 +----------------- .../feature_column/dense_features_v2.py | 7 +- .../feature_column/dense_features_v2_test.py | 2 +- .../feature_column/feature_column_lib.py | 8 +- .../feature_column/feature_column_v2_test.py | 291 ++++++++++++ .../feature_column/keras_integration_test.py | 2 +- .../sequence_feature_column_test.py | 49 +++ .../feature_column/serialization_test.py | 66 +++ tensorflow/python/keras/feature_column/BUILD | 78 ---- .../python/keras/feature_column/__init__.py | 0 ...equence_feature_column_integration_test.py | 2 +- .../python/keras/layers/serialization.py | 18 +- .../saving/saved_model/saved_model_test.py | 2 +- ...sorflow.keras.layers.-dense-features.pbtxt | 2 +- ...sorflow.keras.layers.-dense-features.pbtxt | 4 +- 17 files changed, 463 insertions(+), 509 deletions(-) rename tensorflow/python/{keras => }/feature_column/dense_features.py (97%) rename tensorflow/python/{keras => }/feature_column/dense_features_test.py (62%) rename tensorflow/python/{keras => }/feature_column/dense_features_v2.py (94%) rename tensorflow/python/{keras => }/feature_column/dense_features_v2_test.py (99%) delete mode 100644 tensorflow/python/keras/feature_column/__init__.py diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD index 786c26c009a..d67cdf9cc06 100644 --- a/tensorflow/python/feature_column/BUILD +++ b/tensorflow/python/feature_column/BUILD @@ -55,6 +55,8 @@ py_library( py_library( name = "feature_column_v2", srcs = [ + "dense_features.py", + "dense_features_v2.py", "feature_column_v2.py", "sequence_feature_column.py", "serialization.py", @@ -124,6 +126,15 @@ tf_py_test( ], ) +tf_py_test( + name = "dense_features_test", + srcs = ["dense_features_test.py"], + tags = ["no_pip"], + deps = [ + ":feature_column_test_main_lib", + ], +) + py_library( name = "feature_column_test_main_lib", srcs = ["feature_column_test.py"], @@ -166,6 +177,15 @@ tf_py_test( deps = [":feature_column_v2_test_main_lib"], ) +tf_py_test( + name = "dense_features_v2_test", + srcs = ["dense_features_v2_test.py"], + tags = ["no_pip"], + deps = [ + ":feature_column_v2_test_main_lib", + ], +) + py_library( name = "feature_column_v2_test_main_lib", srcs = ["feature_column_v2_test.py"], diff --git a/tensorflow/python/keras/feature_column/dense_features.py b/tensorflow/python/feature_column/dense_features.py similarity index 97% rename from tensorflow/python/keras/feature_column/dense_features.py rename to tensorflow/python/feature_column/dense_features.py index 820f1a6b1b7..6feef185815 100644 --- a/tensorflow/python/keras/feature_column/dense_features.py +++ b/tensorflow/python/feature_column/dense_features.py @@ -23,6 +23,7 @@ import json from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import ops from tensorflow.python.keras import backend +from tensorflow.python.keras.layers import serialization as layer_serialization from tensorflow.python.util import serialization from tensorflow.python.util.tf_export import keras_export @@ -172,3 +173,7 @@ class DenseFeatures(fc._BaseFeaturesLayer): # pylint: disable=protected-access cols_to_output_tensors[column] = processed_tensors output_tensors.append(processed_tensors) return self._verify_and_concat_tensors(output_tensors) + + 
+layer_serialization.inject_feature_column_v1_objects( + 'DenseFeatures', DenseFeatures) diff --git a/tensorflow/python/keras/feature_column/dense_features_test.py b/tensorflow/python/feature_column/dense_features_test.py similarity index 62% rename from tensorflow/python/keras/feature_column/dense_features_test.py rename to tensorflow/python/feature_column/dense_features_test.py index ec07964bcbe..7cd523dcc14 100644 --- a/tensorflow/python/keras/feature_column/dense_features_test.py +++ b/tensorflow/python/feature_column/dense_features_test.py @@ -18,21 +18,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.feature_column import dense_features as df from tensorflow.python.feature_column import feature_column_v2 as fc -from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util -from tensorflow.python.keras.feature_column import dense_features as df from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import partitioned_variables @@ -678,417 +676,5 @@ class DenseFeaturesTest(test.TestCase): sess.run(net, feed_dict={features['price']: np.array(1)}) -class IndicatorColumnTest(test.TestCase): - - @test_util.run_deprecated_v1 - def test_dense_features(self): - animal = fc.indicator_column( - fc.categorical_column_with_identity('animal', num_buckets=4)) - with ops.Graph().as_default(): - features = { - 'animal': - sparse_tensor.SparseTensor( - indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) - } - net = df.DenseFeatures([animal])(features) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) - - -class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - { - 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True - }, { - 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False - }) - @test_util.run_deprecated_v1 - def test_dense_features(self, use_safe_embedding_lookup): - # Inputs. - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. 
- expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) - - # Provide sparse input and get dense result. - l = df.DenseFeatures((embedding_column,)) - dense_features = l({'aaa': sparse_input}) - - # Assert expected embedding variable and lookups. - global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, variables_lib.Variable) - trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - if use_safe_embedding_lookup: - self.assertIn('SparseFillEmptyRows', - [x.type for x in ops.get_default_graph().get_operations()]) - else: - self.assertNotIn( - 'SparseFillEmptyRows', - [x.type for x in ops.get_default_graph().get_operations()]) - - @test_util.run_deprecated_v1 - def test_dense_features_not_trainable(self): - # Inputs. - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - trainable=False) - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures((embedding_column,))({ - 'aaa': sparse_input - }) - - # Assert expected embedding variable and lookups. 
- global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - self.assertCountEqual([], - ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - -class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): - - def _test_dense_features(self, trainable=True): - # Inputs. - vocabulary_size = 3 - sparse_input_a = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 4)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_b = sparse_tensor.SparseTensorValue( - # example 0, ids [0] - # example 1, ids [] - indices=((0, 0),), - values=(0,), - dense_shape=(2, 5)) - sparse_input_c = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 1), (1, 1), (1, 3)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_d = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [] - indices=((0, 1),), - values=(2,), - dense_shape=(2, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0: - # A ids [2], embedding = [7, 11] - # B ids [0], embedding = [1, 2] - # C ids [2], embedding = [7, 11] - # D ids [2], embedding = [7, 11] - (7., 11., 1., 2., 7., 11., 7., 11.), - # example 1: - # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # B ids [], embedding = [0, 0] - # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # D ids [], embedding = [0, 0] - (2., 3.5, 0., 0., 2., 3.5, 0., 0.), - ) - - # Build columns. - categorical_column_a = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc.categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - categorical_column_c = fc.categorical_column_with_identity( - key='ccc', num_buckets=vocabulary_size) - categorical_column_d = fc.categorical_column_with_identity( - key='ddd', num_buckets=vocabulary_size) - - embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2( - [categorical_column_c, categorical_column_d], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - - features = { - 'aaa': sparse_input_a, - 'bbb': sparse_input_b, - 'ccc': sparse_input_c, - 'ddd': sparse_input_d - } - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures( - feature_columns=(embedding_column_b, embedding_column_a, - embedding_column_c, embedding_column_d))( - features) - - # Assert expected embedding variable and lookups. 
- global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, variables_lib.Variable) - trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - if trainable: - self.assertCountEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in trainable_vars])) - else: - self.assertCountEqual([], tuple([v.name for v in trainable_vars])) - shared_embedding_vars = global_vars - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, - self.evaluate(shared_embedding_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - @test_util.run_deprecated_v1 - def test_dense_features(self): - self._test_dense_features() - - @test_util.run_deprecated_v1 - def test_dense_features_no_trainable(self): - self._test_dense_features(trainable=False) - - -@test_util.run_all_in_graph_and_eager_modes -class DenseFeaturesSerializationTest(test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - ('default', None, None), - ('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_get_config(self, trainable, name): - cols = [fc.numeric_column('a'), - fc.embedding_column(fc.categorical_column_with_identity( - key='b', num_buckets=3), dimension=2)] - orig_layer = df.DenseFeatures( - cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - self.assertEqual(config['name'], orig_layer.name) - self.assertEqual(config['trainable'], trainable) - self.assertLen(config['feature_columns'], 2) - self.assertEqual( - config['feature_columns'][0]['class_name'], 'NumericColumn') - self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) - self.assertEqual( - config['feature_columns'][1]['class_name'], 'EmbeddingColumn') - - @parameterized.named_parameters( - ('default', None, None), - ('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_from_config(self, trainable, name): - cols = [fc.numeric_column('a'), - fc.embedding_column(fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']), dimension=2), - fc.indicator_column(fc.categorical_column_with_hash_bucket( - key='c', hash_bucket_size=3))] - orig_layer = df.DenseFeatures( - cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - new_layer = df.DenseFeatures.from_config(config) - - self.assertEqual(new_layer.name, orig_layer.name) - self.assertEqual(new_layer.trainable, trainable) - self.assertLen(new_layer._feature_columns, 3) - self.assertEqual(new_layer._feature_columns[0].name, 'a') - self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) - self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') - self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn) - - def test_crossed_column(self): - a = fc.categorical_column_with_vocabulary_list( - 'a', vocabulary_list=['1', '2', '3']) - b = fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']) - ab = fc.crossed_column([a, b], hash_bucket_size=2) - cols = [fc.indicator_column(ab)] - - orig_layer = df.DenseFeatures(cols) - config = orig_layer.get_config() - - new_layer = df.DenseFeatures.from_config(config) - - 
self.assertLen(new_layer._feature_columns, 1) - self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') - - -@test_util.run_all_in_graph_and_eager_modes -class SequenceFeatureColumnsTest(test.TestCase): - """Tests DenseFeatures with sequence feature columns.""" - - def test_embedding_column(self): - """Tests that error is raised for sequence embedding column.""" - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = sfc.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=2) - - input_layer = df.DenseFeatures([embedding_column_a]) - with self.assertRaisesRegexp( - ValueError, - r'In embedding_column: aaa_embedding\. categorical_column must not be ' - r'of type SequenceCategoricalColumn\.'): - _ = input_layer({'aaa': sparse_input}) - - def test_indicator_column(self): - """Tests that error is raised for sequence indicator column.""" - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = sfc.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc.indicator_column(categorical_column_a) - - input_layer = df.DenseFeatures([indicator_column_a]) - with self.assertRaisesRegexp( - ValueError, - r'In indicator_column: aaa_indicator\. categorical_column must not be ' - r'of type SequenceCategoricalColumn\.'): - _ = input_layer({'aaa': sparse_input}) - - if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/feature_column/dense_features_v2.py b/tensorflow/python/feature_column/dense_features_v2.py similarity index 94% rename from tensorflow/python/keras/feature_column/dense_features_v2.py rename to tensorflow/python/feature_column/dense_features_v2.py index e4dc22f1bbe..405c5d63249 100644 --- a/tensorflow/python/keras/feature_column/dense_features_v2.py +++ b/tensorflow/python/feature_column/dense_features_v2.py @@ -18,9 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import ops -from tensorflow.python.keras.feature_column import dense_features +from tensorflow.python.keras.layers import serialization as layer_serialization from tensorflow.python.util.tf_export import keras_export @@ -93,3 +94,7 @@ class DenseFeatures(dense_features.DenseFeatures): # We would like to call Layer.build and not _DenseFeaturesHelper.build. 
# pylint: disable=protected-access super(fc._BaseFeaturesLayer, self).build(None) # pylint: disable=bad-super-call + + +layer_serialization.inject_feature_column_v2_objects( + 'DenseFeatures', DenseFeatures) diff --git a/tensorflow/python/keras/feature_column/dense_features_v2_test.py b/tensorflow/python/feature_column/dense_features_v2_test.py similarity index 99% rename from tensorflow/python/keras/feature_column/dense_features_v2_test.py rename to tensorflow/python/feature_column/dense_features_v2_test.py index 95fc8b7ac1e..71cb163a7d9 100644 --- a/tensorflow/python/keras/feature_column/dense_features_v2_test.py +++ b/tensorflow/python/feature_column/dense_features_v2_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.feature_column import dense_features_v2 as df from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -30,7 +31,6 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util -from tensorflow.python.keras.feature_column import dense_features_v2 as df from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables as variables_lib diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py index bda20ff3f2c..afe14f55bfc 100644 --- a/tensorflow/python/feature_column/feature_column_lib.py +++ b/tensorflow/python/feature_column/feature_column_lib.py @@ -19,13 +19,13 @@ from __future__ import division from __future__ import print_function # pylint: disable=unused-import,line-too-long,wildcard-import,g-bad-import-order +# We import dense_features_v2 first so that the V1 DenseFeatures is the default +# if users directly import feature_column_lib. +from tensorflow.python.feature_column.dense_features_v2 import * +from tensorflow.python.feature_column.dense_features import * from tensorflow.python.feature_column.feature_column import * from tensorflow.python.feature_column.feature_column_v2 import * from tensorflow.python.feature_column.sequence_feature_column import * from tensorflow.python.feature_column.serialization import * -# We import dense_features_v2 first so that the V1 DenseFeatures is the default -# if users directly import feature_column_lib. 
-from tensorflow.python.keras.feature_column.dense_features_v2 import * -from tensorflow.python.keras.feature_column.dense_features import * from tensorflow.python.keras.feature_column.sequence_feature_column import * # pylint: enable=unused-import,line-too-long diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index a13f38a5203..fe769850fb0 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -31,6 +31,7 @@ from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.feature_column import dense_features as df from tensorflow.python.feature_column import feature_column as fc_old from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization @@ -5581,6 +5582,23 @@ class IndicatorColumnTest(test.TestCase): self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]])) self.assertAllClose([[2. + 3.]], self.evaluate(predictions)) + @test_util.run_deprecated_v1 + def test_dense_features(self): + animal = fc.indicator_column( + fc.categorical_column_with_identity('animal', num_buckets=4)) + with ops.Graph().as_default(): + features = { + 'animal': + sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) + } + net = df.DenseFeatures([animal])(features) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) + @test_util.run_deprecated_v1 def test_input_layer(self): animal = fc.indicator_column( @@ -6253,6 +6271,156 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), self.evaluate(predictions)) + @parameterized.named_parameters( + { + 'testcase_name': 'use_safe_embedding_lookup', + 'use_safe_embedding_lookup': True + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup', + 'use_safe_embedding_lookup': False + }) + @test_util.run_deprecated_v1 + def test_dense_features(self, use_safe_embedding_lookup): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. 
+ categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) + + # Provide sparse input and get dense result. + l = df.DenseFeatures((embedding_column,)) + dense_features = l({'aaa': sparse_input}) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + if use_safe_embedding_lookup: + self.assertIn('SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + else: + self.assertNotIn( + 'SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + + @test_util.run_deprecated_v1 + def test_dense_features_not_trainable(self): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + trainable=False) + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures((embedding_column,))({ + 'aaa': sparse_input + }) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + self.assertItemsEqual([], + ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + @test_util.run_deprecated_v1 def test_input_layer(self): # Inputs. 
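The expected lookups in these tests follow from the default 'mean' combiner: each example's output is the element-wise mean of the embedding rows for its ids, and an example with no ids maps to zeros. A quick NumPy check of the numbers used in the comments above:

import numpy as np

embedding_table = np.array([[1., 2.],    # id 0
                            [3., 5.],    # id 1
                            [7., 11.]])  # id 2
ids_per_example = [[2], [0, 1], [], [1]]

lookups = [embedding_table[ids].mean(axis=0) if ids else np.zeros(2)
           for ids in ids_per_example]
print(np.stack(lookups))
# -> [[7., 11.], [2., 3.5], [0., 0.], [3., 5.]]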
@@ -7158,6 +7326,129 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): # = [3*1 + 5*2, 3*0 +5*0] = [13, 0] self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions)) + def _test_dense_features(self, trainable=True): + # Inputs. + vocabulary_size = 3 + sparse_input_a = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 4)), + values=(2, 0, 1), + dense_shape=(2, 5)) + sparse_input_b = sparse_tensor.SparseTensorValue( + # example 0, ids [0] + # example 1, ids [] + indices=((0, 0),), + values=(0,), + dense_shape=(2, 5)) + sparse_input_c = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 1), (1, 1), (1, 3)), + values=(2, 0, 1), + dense_shape=(2, 5)) + sparse_input_d = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [] + indices=((0, 1),), + values=(2,), + dense_shape=(2, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0: + # A ids [2], embedding = [7, 11] + # B ids [0], embedding = [1, 2] + # C ids [2], embedding = [7, 11] + # D ids [2], embedding = [7, 11] + (7., 11., 1., 2., 7., 11., 7., 11.), + # example 1: + # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # B ids [], embedding = [0, 0] + # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # D ids [], embedding = [0, 0] + (2., 3.5, 0., 0., 2., 3.5, 0., 0.), + ) + + # Build columns. + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=vocabulary_size) + categorical_column_c = fc.categorical_column_with_identity( + key='ccc', num_buckets=vocabulary_size) + categorical_column_d = fc.categorical_column_with_identity( + key='ddd', num_buckets=vocabulary_size) + + embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable) + embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2( + [categorical_column_c, categorical_column_d], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable) + + features = { + 'aaa': sparse_input_a, + 'bbb': sparse_input_b, + 'ccc': sparse_input_c, + 'ddd': sparse_input_d + } + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures( + feature_columns=(embedding_column_b, embedding_column_a, + embedding_column_c, embedding_column_d))( + features) + + # Assert expected embedding variable and lookups. 
+ global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual( + ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + if trainable: + self.assertItemsEqual( + ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], + tuple([v.name for v in trainable_vars])) + else: + self.assertItemsEqual([], tuple([v.name for v in trainable_vars])) + shared_embedding_vars = global_vars + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, + self.evaluate(shared_embedding_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + @test_util.run_deprecated_v1 + def test_dense_features(self): + self._test_dense_features() + + @test_util.run_deprecated_v1 + def test_dense_features_no_trainable(self): + self._test_dense_features(trainable=False) + @test_util.run_deprecated_v1 def test_serialization(self): diff --git a/tensorflow/python/feature_column/keras_integration_test.py b/tensorflow/python/feature_column/keras_integration_test.py index 456c0204350..e0677e84e50 100644 --- a/tensorflow/python/feature_column/keras_integration_test.py +++ b/tensorflow/python/feature_column/keras_integration_test.py @@ -23,12 +23,12 @@ import numpy as np from tensorflow.python import keras from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.feature_column import dense_features_v2 from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.feature_column import feature_column_v2 from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import testing_utils -from tensorflow.python.keras.feature_column import dense_features_v2 from tensorflow.python.keras.optimizer_v2 import gradient_descent from tensorflow.python.keras.premade import linear from tensorflow.python.keras.premade import wide_deep diff --git a/tensorflow/python/feature_column/sequence_feature_column_test.py b/tensorflow/python/feature_column/sequence_feature_column_test.py index d0cf5ee7670..3d5d24ec03a 100644 --- a/tensorflow/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/python/feature_column/sequence_feature_column_test.py @@ -24,6 +24,7 @@ from absl.testing import parameterized import numpy as np from tensorflow.python.client import session +from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.feature_column import serialization @@ -110,6 +111,54 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase): sfc.concatenate_context_input(context_input, seq_input) +@test_util.run_all_in_graph_and_eager_modes +class DenseFeaturesTest(test.TestCase): + """Tests DenseFeatures with sequence feature columns.""" + + def test_embedding_column(self): + """Tests that error is raised for sequence embedding column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + 
categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column_a = fc.embedding_column( + categorical_column_a, dimension=2) + + input_layer = dense_features.DenseFeatures([embedding_column_a]) + with self.assertRaisesRegexp( + ValueError, + r'In embedding_column: aaa_embedding\. categorical_column must not be ' + r'of type SequenceCategoricalColumn\.'): + _ = input_layer({'aaa': sparse_input}) + + def test_indicator_column(self): + """Tests that error is raised for sequence indicator column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + indicator_column_a = fc.indicator_column(categorical_column_a) + + input_layer = dense_features.DenseFeatures([indicator_column_a]) + with self.assertRaisesRegexp( + ValueError, + r'In indicator_column: aaa_indicator\. categorical_column must not be ' + r'of type SequenceCategoricalColumn\.'): + _ = input_layer({'aaa': sparse_input}) + + def _assert_sparse_tensor_value(test_case, expected, actual): _assert_sparse_tensor_indices_shape(test_case, expected, actual) diff --git a/tensorflow/python/feature_column/serialization_test.py b/tensorflow/python/feature_column/serialization_test.py index 881ca0cca5e..78b72746ac9 100644 --- a/tensorflow/python/feature_column/serialization_test.py +++ b/tensorflow/python/feature_column/serialization_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from absl.testing import parameterized +from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization from tensorflow.python.framework import test_util @@ -113,6 +114,71 @@ class FeatureColumnSerializationTest(test.TestCase): self.assertIs(new_price.normalizer_fn, _custom_fn) +@test_util.run_all_in_graph_and_eager_modes +class DenseFeaturesSerializationTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + ('default', None, None), + ('trainable', True, 'trainable'), + ('not_trainable', False, 'frozen')) + def test_get_config(self, trainable, name): + cols = [fc.numeric_column('a'), + fc.embedding_column(fc.categorical_column_with_identity( + key='b', num_buckets=3), dimension=2)] + orig_layer = dense_features.DenseFeatures( + cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + self.assertEqual(config['name'], orig_layer.name) + self.assertEqual(config['trainable'], trainable) + self.assertLen(config['feature_columns'], 2) + self.assertEqual( + config['feature_columns'][0]['class_name'], 'NumericColumn') + self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) + self.assertEqual( + config['feature_columns'][1]['class_name'], 'EmbeddingColumn') + + @parameterized.named_parameters( + ('default', None, None), + ('trainable', True, 'trainable'), + ('not_trainable', False, 'frozen')) + def test_from_config(self, trainable, name): + cols = [fc.numeric_column('a'), + fc.embedding_column(fc.categorical_column_with_vocabulary_list( + 'b', vocabulary_list=['1', '2', '3']), dimension=2), + fc.indicator_column(fc.categorical_column_with_hash_bucket( + key='c', hash_bucket_size=3))] + orig_layer = dense_features.DenseFeatures( + cols, 
trainable=trainable, name=name) + config = orig_layer.get_config() + + new_layer = dense_features.DenseFeatures.from_config(config) + + self.assertEqual(new_layer.name, orig_layer.name) + self.assertEqual(new_layer.trainable, trainable) + self.assertLen(new_layer._feature_columns, 3) + self.assertEqual(new_layer._feature_columns[0].name, 'a') + self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) + self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') + self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn) + + def test_crossed_column(self): + a = fc.categorical_column_with_vocabulary_list( + 'a', vocabulary_list=['1', '2', '3']) + b = fc.categorical_column_with_vocabulary_list( + 'b', vocabulary_list=['1', '2', '3']) + ab = fc.crossed_column([a, b], hash_bucket_size=2) + cols = [fc.indicator_column(ab)] + + orig_layer = dense_features.DenseFeatures(cols) + config = orig_layer.get_config() + + new_layer = dense_features.DenseFeatures.from_config(config) + + self.assertLen(new_layer._feature_columns, 1) + self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') + + @test_util.run_all_in_graph_and_eager_modes class LinearModelLayerSerializationTest(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/keras/feature_column/BUILD b/tensorflow/python/keras/feature_column/BUILD index 94097c28d73..650efcceb52 100644 --- a/tensorflow/python/keras/feature_column/BUILD +++ b/tensorflow/python/keras/feature_column/BUILD @@ -12,88 +12,11 @@ exports_files(["LICENSE"]) py_library( name = "feature_column", - srcs = ["__init__.py"], deps = [ - ":dense_features", - ":dense_features_v2", ":sequence_feature_column", ], ) -py_library( - name = "dense_features", - srcs = [ - "dense_features.py", - ], - deps = [ - "//tensorflow/python:framework_ops", - "//tensorflow/python:tf_export", - "//tensorflow/python:util", - "//tensorflow/python/feature_column:feature_column_v2", - "//tensorflow/python/keras:backend", - ], -) - -py_library( - name = "dense_features_v2", - srcs = [ - "dense_features_v2.py", - ], - deps = [ - ":dense_features", - "//tensorflow/python:framework_ops", - "//tensorflow/python:tf_export", - "//tensorflow/python/feature_column:feature_column_v2", - ], -) - -tf_py_test( - name = "dense_features_test", - srcs = ["dense_features_test.py"], - tags = ["no_pip"], - deps = [ - ":dense_features", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:errors", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:lookup_ops", - "//tensorflow/python:partitioned_variables", - "//tensorflow/python:session", - "//tensorflow/python:sparse_tensor", - "//tensorflow/python:variables", - "//tensorflow/python/eager:backprop", - "//tensorflow/python/eager:context", - "//tensorflow/python/feature_column:feature_column_v2", - ], -) - -tf_py_test( - name = "dense_features_v2_test", - srcs = ["dense_features_v2_test.py"], - tags = ["no_pip"], - deps = [ - ":dense_features_v2", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:errors", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:lookup_ops", - "//tensorflow/python:session", - "//tensorflow/python:sparse_tensor", - 
"//tensorflow/python:variables", - "//tensorflow/python/eager:backprop", - "//tensorflow/python/eager:context", - "//tensorflow/python/feature_column:feature_column_v2", - ], -) - py_library( name = "sequence_feature_column", srcs = ["sequence_feature_column.py"], @@ -136,7 +59,6 @@ py_test( srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ - ":dense_features", ":sequence_feature_column", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_test_lib", diff --git a/tensorflow/python/keras/feature_column/__init__.py b/tensorflow/python/keras/feature_column/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py b/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py index b1100bf7b07..8784182e23b 100644 --- a/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py +++ b/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py @@ -24,11 +24,11 @@ from google.protobuf import text_format from tensorflow.core.example import example_pb2 from tensorflow.core.example import feature_pb2 from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util -from tensorflow.python.keras.feature_column import dense_features from tensorflow.python.keras.feature_column import sequence_feature_column as ksfc from tensorflow.python.keras.layers import recurrent from tensorflow.python.ops import init_ops_v2 diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 30be3d485df..0a90441d8a0 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -64,11 +64,23 @@ ALL_V2_MODULES = ( recurrent_v2, preprocessing_normalization ) +FEATURE_COLUMN_V1_OBJECTS = {} +FEATURE_COLUMN_V2_OBJECTS = {} # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. LOCAL = threading.local() +def inject_feature_column_v1_objects(name, cls): + global FEATURE_COLUMN_V1_OBJECTS + FEATURE_COLUMN_V1_OBJECTS[name] = cls + + +def inject_feature_column_v2_objects(name, cls): + global FEATURE_COLUMN_V2_OBJECTS + FEATURE_COLUMN_V2_OBJECTS[name] = cls + + def populate_deserializable_objects(): """Populates dict ALL_OBJECTS with every built-in layer. """ @@ -122,11 +134,9 @@ def populate_deserializable_objects(): LOCAL.ALL_OBJECTS['WideDeepModel'] = WideDeepModel if tf2.enabled(): - from tensorflow.python.keras.feature_column.dense_features_v2 import DenseFeatures # pylint: disable=g-import-not-at-top - LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures + LOCAL.ALL_OBJECTS.update(FEATURE_COLUMN_V2_OBJECTS) else: - from tensorflow.python.keras.feature_column.dense_features import DenseFeatures # pylint: disable=g-import-not-at-top - LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures + LOCAL.ALL_OBJECTS.update(FEATURE_COLUMN_V1_OBJECTS) # Merge layers, function versions. 
LOCAL.ALL_OBJECTS['add'] = merge.add diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 5e9ccc2d37a..9cbe8607a54 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -39,6 +39,7 @@ from tensorflow.python.distribute import mirrored_strategy from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column.dense_features import DenseFeatures from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -47,7 +48,6 @@ from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import regularizers from tensorflow.python.keras import testing_utils -from tensorflow.python.keras.feature_column.dense_features import DenseFeatures from tensorflow.python.keras.saving.saved_model import load as keras_load from tensorflow.python.keras.saving.saved_model import save_impl as keras_save from tensorflow.python.keras.utils import generic_utils diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt index ba9156d7f95..ecda1603325 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.layers.DenseFeatures" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt index 130a9954202..f7137f0d09b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.layers.DenseFeatures" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" From 8d1e8b350c37dc37b4439c7f646d3ec178598931 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 13 May 2020 13:57:07 -0700 Subject: [PATCH 0515/1533] Export tf.keras.layers.experimental.preprocessing.CategoryCrossing layer. 
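With the export in place, the layer becomes reachable under the public experimental preprocessing namespace. A small usage sketch mirroring the doctest added to the layer's docstring below (the '_X_' separator is the layer's default):

import tensorflow as tf

inp_1 = tf.constant([['a'], ['b'], ['c']])
inp_2 = tf.constant([['d'], ['e'], ['f']])

# Cross the two categorical inputs into one string feature per row.
layer = tf.keras.layers.experimental.preprocessing.CategoryCrossing()
print(layer([inp_1, inp_2]))
# -> [[b'a_X_d'], [b'b_X_e'], [b'c_X_f']]  (string tensor of shape (3, 1))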
PiperOrigin-RevId: 311398537 Change-Id: I394c7dd5ae7fe168f3238dbd8a7ab064ff6ad2c1 --- tensorflow/python/keras/layers/__init__.py | 1 + .../python/keras/layers/preprocessing/BUILD | 19 ++ .../layers/preprocessing/benchmarks/BUILD | 10 + .../categorical_crossing_benchmark.py | 116 +++++++++ .../preprocessing/categorical_crossing.py | 140 ++++------- .../categorical_crossing_distribution_test.py | 64 +++++ .../categorical_crossing_test.py | 82 +------ ...tal.preprocessing.-category-crossing.pbtxt | 222 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + ...tal.preprocessing.-category-crossing.pbtxt | 222 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + 11 files changed, 707 insertions(+), 177 deletions(-) create mode 100644 tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_crossing_benchmark.py create mode 100644 tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py index 192c6a4afc8..ede199a9169 100644 --- a/tensorflow/python/keras/layers/__init__.py +++ b/tensorflow/python/keras/layers/__init__.py @@ -57,6 +57,7 @@ else: from tensorflow.python.keras.layers.preprocessing.text_vectorization_v1 import TextVectorization from tensorflow.python.keras.layers.preprocessing.text_vectorization import TextVectorization as TextVectorizationV2 TextVectorizationV1 = TextVectorization +from tensorflow.python.keras.layers.preprocessing.categorical_crossing import CategoryCrossing # Advanced activations. 
from tensorflow.python.keras.layers.advanced_activations import LeakyReLU diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 501c99fe890..bef294429bd 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -310,6 +310,25 @@ distribute_py_test( ], ) +distribute_py_test( + name = "categorical_crossing_distribution_test", + srcs = ["categorical_crossing_distribution_test.py"], + main = "categorical_crossing_distribution_test.py", + python_version = "PY3", + tags = [ + "multi_and_single_gpu", + ], + tpu_tags = [ + "no_oss", # b/155502591 + ], + deps = [ + ":categorical_crossing", + "//tensorflow/python/distribute:combinations", + "//tensorflow/python/distribute:strategy_combinations", + "//tensorflow/python/keras", + ], +) + tf_py_test( name = "discretization_test", size = "small", diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD index 276fb4767af..0c7e6ba856d 100644 --- a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD @@ -17,6 +17,16 @@ tf_py_test( ], ) +tf_py_test( + name = "categorical_crossing_benchmark", + srcs = ["categorical_crossing_benchmark.py"], + python_version = "PY3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python/keras/layers/preprocessing:categorical_crossing", + ], +) + tf_py_test( name = "index_lookup_adapt_benchmark", srcs = ["index_lookup_adapt_benchmark.py"], diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_crossing_benchmark.py b/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_crossing_benchmark.py new file mode 100644 index 00000000000..80a7903f0b9 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_crossing_benchmark.py @@ -0,0 +1,116 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Benchmark for Keras categorical_encoding preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools +import time + +from absl import flags +import numpy as np + +from tensorflow.python import keras +from tensorflow.python.compat import v2_compat +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.layers.preprocessing import categorical_crossing +from tensorflow.python.ops import sparse_ops +from tensorflow.python.platform import benchmark +from tensorflow.python.platform import test + +FLAGS = flags.FLAGS + +v2_compat.enable_v2_behavior() + + +# word_gen creates random sequences of ASCII letters (both lowercase and upper). +# The number of unique strings is ~2,700. +def int_gen(): + for _ in itertools.count(1): + yield (np.random.randint(0, 5, (1,)), np.random.randint(0, 7, (1,))) + + +class BenchmarkLayer(benchmark.Benchmark): + """Benchmark the layer forward pass.""" + + def run_dataset_implementation(self, batch_size): + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = dataset_ops.Dataset.from_generator( + int_gen, (dtypes.int64, dtypes.int64), + (tensor_shape.TensorShape([1]), tensor_shape.TensorShape([1]))) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + num_batches = 5 + ds = ds.take(num_batches) + ds = ds.prefetch(num_batches) + starts.append(time.time()) + # Benchmarked code begins here. + for i in ds: + _ = sparse_ops.sparse_cross([i[0], i[1]]) + # Benchmarked code ends here. + ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches + return avg_time + + def bm_layer_implementation(self, batch_size): + input_1 = keras.Input(shape=(1,), dtype=dtypes.int64, name="word") + input_2 = keras.Input(shape=(1,), dtype=dtypes.int64, name="int") + layer = categorical_crossing.CategoryCrossing() + _ = layer([input_1, input_2]) + + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = dataset_ops.Dataset.from_generator( + int_gen, (dtypes.int64, dtypes.int64), + (tensor_shape.TensorShape([1]), tensor_shape.TensorShape([1]))) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + num_batches = 5 + ds = ds.take(num_batches) + ds = ds.prefetch(num_batches) + starts.append(time.time()) + # Benchmarked code begins here. + for i in ds: + _ = layer([i[0], i[1]]) + # Benchmarked code ends here. 
+ ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches + name = "categorical_crossing|batch_%s" % batch_size + baseline = self.run_dataset_implementation(batch_size) + extras = { + "dataset implementation baseline": baseline, + "delta seconds": (baseline - avg_time), + "delta percent": ((baseline - avg_time) / baseline) * 100 + } + self.report_benchmark( + iters=num_repeats, wall_time=avg_time, extras=extras, name=name) + + def benchmark_vocab_size_by_batch(self): + for batch in [32, 64, 256]: + self.bm_layer_implementation(batch_size=batch) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py index e3eb27b2b4e..88b552e23b7 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py @@ -20,49 +20,35 @@ from __future__ import print_function import itertools +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_spec from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops.ragged import ragged_array_ops from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.util.tf_export import keras_export +@keras_export('keras.layers.experimental.preprocessing.CategoryCrossing') class CategoryCrossing(Layer): """Category crossing layer. - This layer transforms multiple categorical inputs to categorical outputs - by Cartesian product, and hash the output if necessary. Without hashing - (`num_bins=None`) the output dtype is string, with hashing the output dtype - is int64. - - For each input, the hash function uses a specific fingerprint method, i.e., - [FarmHash64](https://github.com/google/farmhash) to compute the hashed output, - that provides a consistent hashed output across different platforms. - For multiple inputs, the final output is calculated by first computing the - fingerprint of `hash_key`, and concatenate it with the fingerprints of - each input. The user can also obfuscate the output with customized `hash_key`. - - If [SipHash64[(https://github.com/google/highwayhash) is desired instead, the - user can set `num_bins=None` to get string outputs, and use Hashing layer to - get hashed output with SipHash64. + This layer concatenates multiple categorical inputs into a single categorical + output (similar to Cartesian product). The output dtype is string. Usage: - - Use with string output. >>> inp_1 = tf.constant([['a'], ['b'], ['c']]) >>> inp_2 = tf.constant([['d'], ['e'], ['f']]) - >>> layer = categorical_crossing.CategoryCrossing() - >>> output = layer([inp_1, inp_2]) - - Use with hashed output. - >>> layer = categorical_crossing.CategoryCrossing(num_bins=2) - >>> output = layer([inp_1, inp_2]) - - Use with customized hashed output. - >>> layer = categorical_crossing.CategoryCrossing(num_bins=2, hash_key=133) - >>> output = layer([inp_1, inp_2]) + >>> layer = tf.keras.layers.experimental.preprocessing.CategoryCrossing() + >>> layer([inp_1, inp_2]) + Arguments: depth: depth of input crossing. 
By default None, all inputs are crossed into @@ -74,10 +60,6 @@ class CategoryCrossing(Layer): equal to N1 or N2. Passing `None` means a single crossed output with all inputs. For example, with inputs `a`, `b` and `c`, `depth=2` means the output will be [a;b;c;cross(a, b);cross(bc);cross(ca)]. - num_bins: Number of hash bins. By default None, no hashing is performed. - hash_key: Integer hash_key that will be used by the concatenate - fingerprints. If not given, will use a default key from - `tf.sparse.cross_hashed`. This is only valid when `num_bins` is not None. name: Name to give to the layer. **kwargs: Keyword arguments to construct a layer. @@ -87,114 +69,69 @@ class CategoryCrossing(Layer): Output shape: a single string or int tensor or sparse tensor of shape `[batch_size, d1, ..., dm]` - Below 'hash' stands for tf.fingerprint, and cat stands for 'FingerprintCat'. + Returns: + If any input is `RaggedTensor`, the output is `RaggedTensor`. + Else, if any input is `SparseTensor`, the output is `SparseTensor`. + Otherwise, the output is `Tensor`. Example: (`depth`=None) If the layer receives three inputs: `a=[[1], [4]]`, `b=[[2], [5]]`, `c=[[3], [6]]` - the output will be a string tensor if not hashed: + the output will be a string tensor: `[[b'1_X_2_X_3'], [b'4_X_5_X_6']]` - the output will be an int64 tensor if hashed: - `[[cat(hash(3), cat(hash(2), cat(hash(1), hash(hash_key))))], - [[cat(hash(6), cat(hash(5), cat(hash(4), hash(hash_key))))]` Example: (`depth` is an integer) With the same input above, and if `depth`=2, - the output will be a list of 6 string tensors if not hashed: + the output will be a list of 6 string tensors: `[[b'1'], [b'4']]` `[[b'2'], [b'5']]` `[[b'3'], [b'6']]` `[[b'1_X_2'], [b'4_X_5']]`, `[[b'2_X_3'], [b'5_X_6']]`, `[[b'3_X_1'], [b'6_X_4']]` - the output will be a list of 6 int64 tensors if hashed: - `[[hash(b'1')], [hash(b'4')]]` - `[[hash(b'2')], [hash(b'5')]]` - `[[hash(b'3')], [hash(b'6')]]` - `[[cat(hash(2), cat(hash(1), hash(hash_key)))], - [cat(hash(5), cat(hash(4), hash(hash_key)))]`, - `[[cat(hash(3), cat(hash(1), hash(hash_key)))], - [cat(hash(6), cat(hash(4), hash(hash_key)))]`, - `[[cat(hash(3), cat(hash(2), hash(hash_key)))], - [cat(hash(6), cat(hash(5), hash(hash_key)))]`, Example: (`depth` is a tuple/list of integers) With the same input above, and if `depth`=(2, 3) - the output will be a list of 4 string tensors if not hashed: + the output will be a list of 4 string tensors: `[[b'1_X_2'], [b'4_X_5']]`, `[[b'2_X_3'], [b'5_X_6']]`, `[[b'3_X_1'], [b'6_X_4']]`, `[[b'1_X_2_X_3'], [b'4_X_5_X_6']]` - the output will be a list of 4 int64 tensors if hashed: - `[ - [cat(hash(2), cat(hash(1), hash(hash_key)))], - [cat(hash(5), cat(hash(4), hash(hash_key)))] - ]`, - `[ - [cat(hash(3), cat(hash(1), hash(hash_key)))], - [cat(hash(6), cat(hash(4), hash(hash_key)))] - ]`, - `[ - [cat(hash(3), cat(hash(2), hash(hash_key)))], - [cat(hash(6), cat(hash(5), hash(hash_key)))] - ]`, - `[ - [cat(hash(3), cat(hash(2), cat(hash(1), hash(hash_key))))], - [cat(hash(6), cat(hash(5), cat(hash(4), hash(hash_key))))] - ]` """ def __init__(self, depth=None, - num_bins=None, - hash_key=None, name=None, **kwargs): # TODO(tanzheny): Consider making seperator configurable. 
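The `depth` argument enumerated in the docstring above selects which subsets of the inputs get crossed: an integer d means all subsets of size 1 through d, while a tuple lists the sizes explicitly. The subset selection can be sketched with `itertools.combinations` (which the module already imports); this only illustrates which groups are formed, not the layer's actual sparse/ragged cross kernels:

import itertools

inputs = ['a', 'b', 'c']

def crossed_groups(inputs, depth):
  # depth=2 -> sizes (1, 2); depth=(2, 3) -> exactly those sizes.
  sizes = depth if isinstance(depth, (tuple, list)) else range(1, depth + 1)
  return [combo for d in sizes for combo in itertools.combinations(inputs, d)]

print(crossed_groups(inputs, 2))
# [('a',), ('b',), ('c',), ('a', 'b'), ('a', 'c'), ('b', 'c')]
print(crossed_groups(inputs, (2, 3)))
# [('a', 'b'), ('a', 'c'), ('b', 'c'), ('a', 'b', 'c')]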
- if num_bins is None and hash_key is not None: - raise ValueError('`hash_key` is only valid when `num_bins` is not None') super(CategoryCrossing, self).__init__(name=name, **kwargs) self.depth = depth - self.num_bins = num_bins - self.hash_key = hash_key if isinstance(depth, (tuple, list)): self._depth_tuple = depth elif depth is not None: self._depth_tuple = tuple([i for i in range(1, depth + 1)]) + strategy = ds_context.get_strategy() + if strategy.__class__.__name__.startswith('TPUStrategy'): + raise ValueError('TPU strategy is not support for this layer yet.') def partial_crossing(self, partial_inputs, ragged_out, sparse_out): """Gets the crossed output from a partial list/tuple of inputs.""" - if self.num_bins is not None: - partial_output = sparse_ops.sparse_cross_hashed( - partial_inputs, num_buckets=self.num_bins, hash_key=self.hash_key) - else: - partial_output = sparse_ops.sparse_cross(partial_inputs) - # If ragged_out=True, convert output from sparse to ragged. if ragged_out: - return ragged_tensor.RaggedTensor.from_sparse(partial_output) + return ragged_array_ops.cross(partial_inputs) elif sparse_out: - return partial_output + return sparse_ops.sparse_cross(partial_inputs) else: - return sparse_ops.sparse_tensor_to_dense(partial_output) + return sparse_ops.sparse_tensor_to_dense( + sparse_ops.sparse_cross(partial_inputs)) def call(self, inputs): depth_tuple = self._depth_tuple if self.depth else (len(inputs),) ragged_out = sparse_out = False - if all([ragged_tensor.is_ragged(inp) for inp in inputs]): - # (b/144500510) ragged.map_flat_values(sparse_cross_hashed, inputs) will - # cause kernel failure. Investigate and find a more efficient - # implementation - inputs = [inp.to_sparse() for inp in inputs] + if any([ragged_tensor.is_ragged(inp) for inp in inputs]): ragged_out = True - else: - if any([ragged_tensor.is_ragged(inp) for inp in inputs]): - raise ValueError( - 'Inputs must be either all `RaggedTensor`, or none of them should ' - 'be `RaggedTensor`, got {}'.format(inputs)) - - if any([isinstance(inp, sparse_tensor.SparseTensor) for inp in inputs]): - sparse_out = True + elif any([isinstance(inp, sparse_tensor.SparseTensor) for inp in inputs]): + sparse_out = True outputs = [] for depth in depth_tuple: @@ -229,15 +166,22 @@ class CategoryCrossing(Layer): def compute_output_signature(self, input_spec): input_shapes = [x.shape for x in input_spec] output_shape = self.compute_output_shape(input_shapes) - output_dtype = dtypes.int64 if self.num_bins else dtypes.string - return sparse_tensor.SparseTensorSpec( - shape=output_shape, dtype=output_dtype) + if any([ + isinstance(inp_spec, ragged_tensor.RaggedTensorSpec) + for inp_spec in input_spec + ]): + return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.string) + elif any([ + isinstance(inp_spec, sparse_tensor.SparseTensorSpec) + for inp_spec in input_spec + ]): + return sparse_tensor.SparseTensorSpec( + shape=output_shape, dtype=dtypes.string) + return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.string) def get_config(self): config = { 'depth': self.depth, - 'num_bins': self.num_bins, - 'hash_key': self.hash_key } base_config = super(CategoryCrossing, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py new file mode 100644 index 00000000000..e1ba91e3558 --- /dev/null +++ 
b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py @@ -0,0 +1,64 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for keras.layers.preprocessing.normalization.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python import keras +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.framework import config +from tensorflow.python.framework import dtypes +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras.layers.preprocessing import categorical_crossing +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.platform import test + + +@combinations.generate( + combinations.combine( + # Investigate why crossing is not supported with TPU. + distribution=strategy_combinations.strategies_minus_tpu, + mode=['eager', 'graph'])) +class CategoryCrossingDistributionTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_distribution(self, distribution): + input_array_1 = np.array([['a', 'b'], ['c', 'd']]) + input_array_2 = np.array([['e', 'f'], ['g', 'h']]) + + # pyformat: disable + expected_output = [[b'a_X_e', b'a_X_f', b'b_X_e', b'b_X_f'], + [b'c_X_g', b'c_X_h', b'd_X_g', b'd_X_h']] + config.set_soft_device_placement(True) + + with distribution.scope(): + input_data_1 = keras.Input(shape=(2,), dtype=dtypes.string) + input_data_2 = keras.Input(shape=(2,), dtype=dtypes.string) + input_data = [input_data_1, input_data_2] + layer = categorical_crossing.CategoryCrossing() + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict([input_array_1, input_array_2]) + self.assertAllEqual(expected_output, output_dataset) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_test.py index 49d8f0d7003..5bbcf5ce022 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_test.py @@ -40,7 +40,7 @@ from tensorflow.python.platform import test @keras_parameterized.run_all_keras_modes(always_skip_v1=True) class CategoryCrossingTest(keras_parameterized.TestCase): - def test_crossing_basic(self): + def test_crossing_sparse_inputs(self): layer = categorical_crossing.CategoryCrossing() inputs_0 = sparse_tensor.SparseTensor( indices=[[0, 0], [1, 0], [1, 1]], @@ -52,36 +52,6 @@ class CategoryCrossingTest(keras_parameterized.TestCase): self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), 
output.indices) self.assertAllEqual([b'a_X_d', b'b_X_e', b'c_X_e'], output.values) - def test_crossing_sparse_inputs(self): - layer = categorical_crossing.CategoryCrossing(num_bins=1) - inputs_0 = sparse_tensor.SparseTensor( - indices=[[0, 0], [1, 0], [1, 1]], - values=['a', 'b', 'c'], - dense_shape=[2, 2]) - inputs_1 = sparse_tensor.SparseTensor( - indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3]) - output = layer([inputs_0, inputs_1]) - self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices) - self.assertAllClose([0, 0, 0], output.values) - - def test_crossing_sparse_inputs_with_hash_key(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2, hash_key=133) - inputs_0 = sparse_tensor.SparseTensor( - indices=[[0, 0], [1, 0], [1, 1]], - values=['a', 'b', 'c'], - dense_shape=[2, 2]) - inputs_1 = sparse_tensor.SparseTensor( - indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3]) - output = layer([inputs_0, inputs_1]) - self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices) - self.assertAllClose([1, 0, 1], output.values) - - layer_2 = categorical_crossing.CategoryCrossing(num_bins=2, hash_key=137) - output = layer_2([inputs_0, inputs_1]) - self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices) - # Note the output is different with above. - self.assertAllClose([0, 1, 0], output.values) - def test_crossing_sparse_inputs_depth_int(self): layer = categorical_crossing.CategoryCrossing(depth=1) inputs_0 = sparse_tensor.SparseTensor( @@ -127,35 +97,15 @@ class CategoryCrossingTest(keras_parameterized.TestCase): [expected_outputs_0, expected_outputs_1, expected_outputs_2], axis=0) self.assertAllEqual(expected_out, output) - def test_crossing_hashed_two_bins(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2) - inputs_0 = sparse_tensor.SparseTensor( - indices=[[0, 0], [1, 0], [1, 1]], - values=['a', 'b', 'c'], - dense_shape=[2, 2]) - inputs_1 = sparse_tensor.SparseTensor( - indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3]) - output = layer([inputs_0, inputs_1]) - self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices) - self.assertEqual(output.values.numpy().max(), 1) - self.assertEqual(output.values.numpy().min(), 0) - - def test_crossing_hashed_ragged_inputs(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2) + def test_crossing_ragged_inputs(self): inputs_0 = ragged_factory_ops.constant( [['omar', 'skywalker'], ['marlo']], dtype=dtypes.string) inputs_1 = ragged_factory_ops.constant( [['a'], ['b']], dtype=dtypes.string) - out_data = layer([inputs_0, inputs_1]) - expected_output = [[0, 0], [0]] - self.assertAllClose(expected_output, out_data) inp_0_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string) inp_1_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string) - out_t = layer([inp_0_t, inp_1_t]) - model = training.Model(inputs=[inp_0_t, inp_1_t], outputs=out_t) - self.assertAllClose(expected_output, model.predict([inputs_0, inputs_1])) non_hashed_layer = categorical_crossing.CategoryCrossing() out_t = non_hashed_layer([inp_0_t, inp_1_t]) @@ -198,16 +148,6 @@ class CategoryCrossingTest(keras_parameterized.TestCase): self.assertIsInstance(output, ragged_tensor.RaggedTensor) self.assertAllEqual(expected_output, output) - def test_invalid_mixed_sparse_and_ragged_input(self): - with self.assertRaises(ValueError): - layer = categorical_crossing.CategoryCrossing(num_bins=2) - inputs_0 = ragged_factory_ops.constant( - 
[['omar'], ['marlo']], - dtype=dtypes.string) - inputs_1 = sparse_tensor.SparseTensor( - indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3]) - layer([inputs_0, inputs_1]) - def test_crossing_with_dense_inputs(self): layer = categorical_crossing.CategoryCrossing() inputs_0 = np.asarray([[1, 2]]) @@ -251,13 +191,6 @@ class CategoryCrossingTest(keras_parameterized.TestCase): self.assertAllEqual(expected_output, model.predict([inputs_0, inputs_1, inputs_2])) - def test_crossing_hashed_with_dense_inputs(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2) - inputs_0 = np.asarray([[1, 2]]) - inputs_1 = np.asarray([[1, 3]]) - output = layer([inputs_0, inputs_1]) - self.assertAllClose([[1, 1, 0, 0]], output) - def test_crossing_compute_output_signature(self): input_shapes = [ tensor_shape.TensorShape([2, 2]), @@ -272,18 +205,9 @@ class CategoryCrossingTest(keras_parameterized.TestCase): self.assertEqual(output_spec.shape.dims[0], input_shapes[0].dims[0]) self.assertEqual(output_spec.dtype, dtypes.string) - layer = categorical_crossing.CategoryCrossing(num_bins=2) - output_spec = layer.compute_output_signature(input_specs) - self.assertEqual(output_spec.shape.dims[0], input_shapes[0].dims[0]) - self.assertEqual(output_spec.dtype, dtypes.int64) - - def test_crossing_with_invalid_hash_key(self): - with self.assertRaises(ValueError): - _ = categorical_crossing.CategoryCrossing(hash_key=133) - @tf_test_util.run_v2_only def test_config_with_custom_name(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2, name='hashing') + layer = categorical_crossing.CategoryCrossing(depth=2, name='hashing') config = layer.get_config() layer_1 = categorical_crossing.CategoryCrossing.from_config(config) self.assertEqual(layer_1.name, layer.name) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt new file mode 100644 index 00000000000..0407188ab6b --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt @@ -0,0 +1,222 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryCrossing" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" 
+ } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'depth\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + 
member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "partial_crossing" + argspec: "args=[\'self\', \'partial_inputs\', \'ragged_out\', \'sparse_out\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index f369c32a65e..20e5ca1af9c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -1,5 +1,9 @@ path: "tensorflow.keras.layers.experimental.preprocessing" tf_module { + member { + name: "CategoryCrossing" + mtype: "" + } member { name: "CenterCrop" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt new file mode 100644 index 00000000000..0407188ab6b --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt @@ -0,0 +1,222 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryCrossing" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'depth\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: 
"args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "partial_crossing" + argspec: "args=[\'self\', \'partial_inputs\', \'ragged_out\', \'sparse_out\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index f369c32a65e..20e5ca1af9c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -1,5 +1,9 @@ path: "tensorflow.keras.layers.experimental.preprocessing" tf_module { + member { + name: "CategoryCrossing" + mtype: "" + } member { name: "CenterCrop" mtype: "" From 5fee245d9ff76b9fae2b7404f47aae83c84b8564 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Wed, 13 May 2020 13:57:37 -0700 Subject: [PATCH 0516/1533] [XLA] Basic (R1) support for CPU bounded dynamic shapes. - Add dynamic tensor metadata read/write in XRT. - Implement two custom calls: PadToStatic and SliceToDynamic -- R1 only. - Some helper functions in shape util to do sanity check. - Tests -- R1 Only. PiperOrigin-RevId: 311398639 Change-Id: I7129fd13f4e0a2b7a14efb52eb814f753a15e05e --- tensorflow/compiler/xla/BUILD | 1 + tensorflow/compiler/xla/service/cpu/BUILD | 1 + .../compiler/xla/service/cpu/cpu_compiler.cc | 15 +- .../xla/service/cpu/cpu_executable.cc | 7 +- .../compiler/xla/service/cpu/ir_emitter.cc | 88 ++++++ .../compiler/xla/service/cpu/ir_emitter.h | 2 + .../compiler/xla/service/shaped_buffer.h | 13 + tensorflow/compiler/xla/shape_util.cc | 36 ++- tensorflow/compiler/xla/shape_util.h | 29 +- tensorflow/compiler/xrt/kernels/BUILD | 1 + .../compiler/xrt/kernels/xrt_execute_op.cc | 257 ++++++++++++++- tensorflow/compiler/xrt/tests/raw_api_test.cc | 299 ++++++++++++++++++ 12 files changed, 726 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 0193bea9d6d..45f49cee328 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -331,6 +331,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 121bdedf2dd..2f432cd9356 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -146,6 +146,7 @@ cc_library( "//tensorflow/compiler/xla/service:conditional_simplifier", "//tensorflow/compiler/xla/service:convolution_group_converter", "//tensorflow/compiler/xla/service:dot_decomposer", + "//tensorflow/compiler/xla/service:dynamic_padder", "//tensorflow/compiler/xla/service:dynamic_index_splitter", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index b04237138e8..fe769bbdd2a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -72,6 +72,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" +#include "tensorflow/compiler/xla/service/dynamic_padder.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -239,7 +240,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( HloPassPipeline pipeline("HLO passes through layout assignment"); pipeline.AddInvariantChecker(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); @@ -273,6 +273,13 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass( cost_model, /*convert_batch_groups_only=*/false); + pipeline.AddPass(); + pipeline.AddPass( + /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(target_machine_features); { auto& pass = @@ -281,12 +288,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*allow_mixed_precision=*/false); pass.AddPass(); - pass.AddPass(); - pass.AddPass( - /*rewrite_training_op=*/true, - /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true); - pipeline.AddPass(); AlgebraicSimplifierOptions options; options.set_enable_dot_strength_reduction(false); pass.AddPass(options); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 8c1ae0179c0..f031daecb1f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -363,7 +363,12 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( if (shape.IsOpaque()) { return sizeof(void*); } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + if (shape.is_static() || shape.IsTuple()) { + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + } + // Each dynamic dimension size is represented as a S32. + int64 metadata_size = sizeof(int32) * shape.dimensions_size(); + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)) + metadata_size; } const InstructionValueSet& CpuExecutable::GetRootValueSet() const { diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 2b715bfa17a..f516a1538d3 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -2357,7 +2357,95 @@ Status IrEmitter::HandleCall(HloInstruction* call) { return Status::OK(); } +Status IrEmitter::HandleSliceToDynamic(HloInstruction* hlo) { + // TODO(jackcao): Generalize this to generic llvm emitter. 
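The buffer-size rule added to cpu_executable.cc above can be checked with plain arithmetic: a bounded dynamic array is allocated at its full bound, with one S32 of metadata appended per dimension. The snippet below is a standalone sketch of that arithmetic only; it uses no XLA types, and the numbers are for a hypothetical rank-1 F32 tensor bounded at 4 elements.

// Sizing sketch: bounded payload plus one int32 of metadata per dimension.
#include <cstdint>
#include <iostream>

int main() {
  const int64_t bound = 4;                              // f32[<=4]
  const int64_t payload_bytes = bound * sizeof(float);  // 16
  const int64_t metadata_bytes = 1 * sizeof(int32_t);   // one S32, rank 1
  std::cout << "allocation = " << (payload_bytes + metadata_bytes)
            << " bytes\n";                              // prints 20
  return 0;
}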
+ TF_RET_CHECK(hlo->shape().rank() == 1); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + for (int64 i = 1; i < hlo->operand_count(); ++i) { + const int64 dim_index = i - 1; + llvm::Value* source_buffer = GetEmittedValueFor(hlo->operand(i)); + llvm::LoadInst* dim_size = b_.CreateLoad(source_buffer, "dim_size"); + llvm::Value* dest_buffer = GetEmittedValueFor(hlo); + llvm::Value* raw_buffer = + b_.CreateBitCast(dest_buffer, b_.getInt8Ty()->getPointerTo()); + + int32 raw_data_size = + ShapeUtil::ByteSizeOf(ShapeUtil::MakeStaticShape(hlo->shape())); + llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32( + b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32)); + b_.CreateStore(dim_size, + b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo())); + } + + return EmitTargetElementLoop(hlo, + [=](const llvm_ir::IrArray::Index& dest_index) { + // TODO(jackcao): Properly linearize dest_index + // and delinearize to source index. + return GetIrArrayFor(hlo->operand(0)) + .EmitReadArrayElement(dest_index, &b_); + }); +} + +Status IrEmitter::HandlePadToStatic(HloInstruction* hlo) { + // TODO(jackcao): Generalize this to generic llvm emitter. + TF_RET_CHECK(hlo->operand(0)->shape().rank() == 1); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice data_slice, + assignment_.GetUniqueSlice(hlo, {0})); + const Shape& data_shape = ShapeUtil::GetSubshape(hlo->shape(), {0}); + llvm::Value* data_address = EmitBufferPointer(data_slice, data_shape); + llvm_ir::IrArray data_array(data_address, data_shape); + TF_RETURN_IF_ERROR(llvm_ir::LoopEmitter( + [=](const llvm_ir::IrArray::Index& dest_index) { + // TODO(jackcao): Properly linearize dest_index and + // delinearize to source index. + return GetIrArrayFor(hlo->operand(0)) + .EmitReadArrayElement(dest_index, &b_); + }, + llvm_ir::IrArray(data_address, data_shape), &b_) + .EmitLoop(IrName(hlo))); + std::vector tuple_operand_ptrs; + tuple_operand_ptrs.push_back(data_array.GetBasePointer()); + + // PadToStatic has a dynamic tensor as input and variadic size of outputs: + // (static_tensor, dynamic_dim_0, dynamic_dim_1, ... ) + // Dynamic dimension sizes starts from output index 1. + for (int64 i = 1; i < hlo->shape().tuple_shapes_size(); ++i) { + // Read from the metadata section of the dynamic input (operand 0). + const Shape& dim_shape = ShapeUtil::GetSubshape(hlo->shape(), {i}); + TF_RET_CHECK(Shape::Equal()(dim_shape, ShapeUtil::MakeScalarShape(S32))); + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice dim_size_slice, + assignment_.GetUniqueSlice(hlo, {i})); + llvm::Value* dest_dim_size_address = + EmitBufferPointer(dim_size_slice, data_shape); + const int64 dim_index = i - 1; + llvm::Value* source_buffer = GetEmittedValueFor(hlo->operand(0)); + llvm::Value* raw_buffer = + b_.CreateBitCast(source_buffer, b_.getInt8Ty()->getPointerTo()); + int32 raw_data_size = ShapeUtil::ByteSizeOf( + ShapeUtil::MakeStaticShape(hlo->operand(0)->shape())); + llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32( + b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32)); + llvm::Value* dim_size = b_.CreateLoad( + b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo())); + b_.CreateStore(dim_size, b_.CreateBitCast(dest_dim_size_address, + b_.getInt32Ty()->getPointerTo())); + tuple_operand_ptrs.push_back(dest_dim_size_address); + } + + // Emit static tensor and dynamic sizes as one tuple. 
+ llvm_ir::EmitTuple(GetIrArrayFor(hlo), tuple_operand_ptrs, &b_); + return Status::OK(); +} + Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { + if (custom_call->custom_call_target() == "PadToStatic") { + return HandlePadToStatic(custom_call); + } + if (custom_call->custom_call_target() == "SliceToDynamic") { + return HandleSliceToDynamic(custom_call); + } absl::Span operands(custom_call->operands()); llvm::Type* i8_ptr_type = b_.getInt8PtrTy(); llvm::AllocaInst* operands_alloca = diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 24524c67b11..9b0d11e9f3f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -183,6 +183,8 @@ class IrEmitter : public DfsHloVisitorWithDefault, } private: + Status HandleSliceToDynamic(HloInstruction* hlo); + Status HandlePadToStatic(HloInstruction* hlo); Status HandleAllReduceSingleReplica(HloInstruction* crs); Status HandleAllReduceMultipleReplica(HloInstruction* crs); diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index a1872330648..b7a67b4e66e 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -93,6 +94,18 @@ class ShapedBuffer { buffers_.replace_shape_ptr(&on_device_shape_); } + // Reset the shape of this shaped buffer and underlying buffer structure. + // + // Precondition: EqualStructure(this->on_device_shape_, on_device_shape). + void set_shapes(const Shape& on_host_shape, const Shape& on_device_shape) { + CHECK(ShapeUtil::EqualStructure(on_device_shape, on_device_shape_)) + << "Structures are not the same. new: " << on_device_shape + << ", old: " << on_device_shape_; + on_host_shape_ = on_host_shape; + on_device_shape_ = on_device_shape; + buffers_.replace_shape_ptr(&on_device_shape_); + } + // Returns the underlying ShapeTree containing all the device addresses in the // ShapedBuffer. const ShapeTree& buffers() const { return buffers_; } diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 22ee5a16a30..52cbb8f95ac 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -22,6 +22,7 @@ limitations under the License. 
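Taken together, the two custom calls dispatched in HandleCustomCall above form a round trip: SliceToDynamic attaches the runtime dimension sizes to a buffer that is still padded to its bound, and PadToStatic splits a dynamic buffer back into padded data plus explicit sizes. The host-side sketch below mirrors that contract for a rank-1 value; the struct and function names are illustrative stand-ins, not XLA APIs.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// A "dynamic" R1 value: payload padded to its bound, plus the real length.
struct DynamicR1 {
  std::vector<float> padded;  // size == compile-time bound
  int32_t real_size;          // metadata: how many elements are valid
};

// SliceToDynamic analogue: record the runtime size next to the padded data.
DynamicR1 SliceToDynamic(std::vector<float> padded, int32_t real_size) {
  return {std::move(padded), real_size};
}

// PadToStatic analogue: hand back the padded payload and the size separately.
std::pair<std::vector<float>, int32_t> PadToStatic(const DynamicR1& value) {
  return {value.padded, value.real_size};
}

int main() {
  DynamicR1 value = SliceToDynamic({2.0f, 1.0f, 0.0f, 0.0f}, /*real_size=*/2);
  std::pair<std::vector<float>, int32_t> result = PadToStatic(value);
  std::cout << "valid elements: " << result.second << "\n";  // prints 2
  return 0;
}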
#include #include +#include "absl/algorithm/container.h" #include "absl/container/inlined_vector.h" #include "absl/strings/ascii.h" #include "absl/strings/numbers.h" @@ -150,6 +151,19 @@ StatusOr MakeShapeWithLayoutInternal( return equal; } +/* static */ bool ShapeUtil::EqualStructure(const Shape& lhs, + const Shape& rhs) { + bool equal = true; + ForEachSubshape(lhs, [&](const Shape& /*subshape*/, const ShapeIndex& index) { + equal &= IndexIsValid(rhs, index); + }); + ForEachSubshape(rhs, [&](const Shape& /*subshape*/, const ShapeIndex& index) { + equal &= IndexIsValid(lhs, index); + }); + + return equal; +} + /* static */ int64 ShapeUtil::TrueRank(const Shape& shape) { int64 accum = 0; for (int64 dimension : shape.dimensions()) { @@ -261,6 +275,12 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return ValidateShape(*shape); } +/* static */ Shape ShapeUtil::MakeStaticShape(const Shape& original) { + Shape result = original; + result.clear_dynamic_dimensions(); + return result; +} + /* static */ Shape ShapeUtil::MakeTupleShape(absl::Span shapes) { Shape result; result.set_element_type(TUPLE); @@ -626,8 +646,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( if (shape.element_type() == TUPLE) { return ByteSizeOfTupleIndexTable(shape, pointer_size); } else if (shape.IsArray()) { - int64 byte_size = ByteSizeOfElements(shape); - return byte_size; + return ByteSizeOfElements(shape); } else if (shape.element_type() == TOKEN) { return 0; } else if (shape.element_type() == OPAQUE_TYPE) { @@ -1441,6 +1460,19 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( return shape; } +/* static */ bool ShapeUtil::DynamicShapeIsCompatible( + const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape) { + if (dynamic_shape.rank() != bounded_shape.rank()) { + return false; + } + for (int64 i = 0; i < dynamic_shape.rank(); ++i) { + if (dynamic_shape.dimensions(i) > bounded_shape.dimensions(i)) { + return false; + } + } + return true; +} + /* static */ Shape ShapeUtil::FilterDimensions( const std::function& p, Shape shape) { CHECK(shape.IsArray()); diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 7e05e17865d..dde56587482 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -298,6 +298,16 @@ class ShapeUtil { // As Equal, but allow one of lhs and rhs to be F16 while the other is F32. static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs); + // Two shapes have same structure if all subshape indices of lhs are presented + // on rhs and vice versa. + // A nested tuple shape of (F32, (S32[2], F32[2, 2])) is structurally equal to + // (S32, (F32[3], S32[2])) as their structures are both (,(,)) + // + // In contrast, (F32, (F32, F32)) is structurally different from + // ((F32, F32), F32) as the former has structure (,(,)) while the latter has + // ((,),) + static bool EqualStructure(const Shape& lhs, const Shape& rhs); + // Returns the number of dimensions for which the dimension is not (trivially) // 1. e.g., f32[2x1x1] has a true rank of 1D, the other dimensions are just // fluff. Note that zero dimensions are included in the true rank, e.g., @@ -339,6 +349,9 @@ class ShapeUtil { // element type changed to type. static Shape ChangeElementType(const Shape& original, PrimitiveType type); + // Retursn a shape with same dimensions but with all dimensions set to static. 
+ static Shape MakeStaticShape(const Shape& original); + // Creates a tuple shape from a slice of element shapes within the tuple. static Shape MakeTupleShape(absl::Span shapes); @@ -643,12 +656,16 @@ class ShapeUtil { static Shape FilterDimensions(const std::function& p, Shape shape); - // Iterates through all the shape indexes, in minor to major order, starting - // from the base indexes, incrementing by the incr steps, up to count - // (index[i] < base[i] + count[i]), and calls the visitor_function with the - // current index. - // The visitor_function visitor function should return true if it wants to - // continue, or false otherwise. + // Returns true if `dynamic_shape` has dimensions that are less-equal to the + // "bounded_shape". + static bool DynamicShapeIsCompatible(const xla::Shape& dynamic_shape, + const xla::Shape& bounded_shape); + + // Iterates through all the shape indexes, in minor to major order, + // starting from the base indexes, incrementing by the incr steps, up to + // count (index[i] < base[i] + count[i]), and calls the visitor_function + // with the current index. The visitor_function visitor function should + // return true if it wants to continue, or false otherwise. // // visitor_function must be a callable of type // StatusOr(absl::Span) or compatible. diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD index d71e6e2cc73..494ba29e981 100644 --- a/tensorflow/compiler/xrt/kernels/BUILD +++ b/tensorflow/compiler/xrt/kernels/BUILD @@ -49,6 +49,7 @@ cc_library( deps = [ ":xrt_state_ops", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index d39b37387f2..2fc599e42df 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" @@ -38,7 +39,11 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/monitoring/timed.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/platform.h" #include "tensorflow/stream_executor/stream_executor.h" #include "tensorflow/stream_executor/stream_executor_internal.h" @@ -146,6 +151,231 @@ xla::StatusOr GetChainedOpInputs( return std::move(input_buffers); } +// Given a shape, returns a byte array representing the shape metadata of the +// shape. The shape metadata contains dimensions sizes stored as contiguous S32. +std::vector PrepareMetadata(const xla::Shape& shape) { + DCHECK(shape.is_static()); + DCHECK(shape.IsArray()); + // Each dimension size is stored as a S32. 
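DynamicShapeIsCompatible, added to shape_util above, reduces to a rank check plus a per-dimension bound check. The sketch below expresses the same rule on bare dimension vectors rather than xla::Shape objects; the function name is illustrative, not part of the real API.

#include <cstdint>
#include <iostream>
#include <vector>

// A runtime shape fits a bounded shape iff the ranks match and every
// dimension is no larger than its bound.
bool DynamicDimsAreCompatible(const std::vector<int64_t>& dynamic_dims,
                              const std::vector<int64_t>& bounded_dims) {
  if (dynamic_dims.size() != bounded_dims.size()) return false;
  for (size_t i = 0; i < dynamic_dims.size(); ++i) {
    if (dynamic_dims[i] > bounded_dims[i]) return false;
  }
  return true;
}

int main() {
  std::cout << DynamicDimsAreCompatible({2}, {4}) << "\n";     // 1: fits f32[<=4]
  std::cout << DynamicDimsAreCompatible({5}, {4}) << "\n";     // 0: exceeds bound
  std::cout << DynamicDimsAreCompatible({2, 2}, {4}) << "\n";  // 0: rank mismatch
  return 0;
}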
+ std::vector result(shape.dimensions_size()); + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + result[i] = shape.dimensions(i); + } + return result; +} + +// Given a buffer with dynamic shape, update buffer metadata at the correct +// offset starting from that buffer. +// +// +-----------+ +// |Payload | +// +-----------+ +// | Padding | +// +-----------+ +// |dim_size_0 | (each dim_size is a S32): +// +-----------+ +// |dim_size_1 | +// +-----------+ +// .......... +// +-----------+ +// +// Size of payload = ByteSizeOf(runtime_shape) +// Size of payload + padding = ByteSizeOf(compile_time_shape_static) +// Size of payload + padding + metadata = ByteSizeOf(compile_time_shape) +Status UpdateMetadata(se::Stream* stream, se::DeviceMemory* buffer, + const xla::Shape& compile_time_shape, + const xla::Shape& runtime_shape) { + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + xla::Shape compile_time_shape_static = + xla::ShapeUtil::MakeStaticShape(compile_time_shape); + uint64 offset = shape_size_fn(compile_time_shape_static); + uint64 metadata_size = shape_size_fn(compile_time_shape) - offset; + auto metadata_buffer = + stream->parent()->GetSubBuffer(buffer, offset, metadata_size); + + auto metadata_literal = std::make_shared( + xla::LiteralUtil::CreateR1(PrepareMetadata(runtime_shape))); + TF_RETURN_IF_ERROR(transfer_manager->TransferArrayToDeviceAsync( + stream, *metadata_literal, metadata_buffer)); + // Retain the literal until the end of the transfer. + stream->ThenDoHostCallback([metadata_literal]() { return Status::OK(); }); + return Status::OK(); +} + +// Given a static input buffer, convert it to dynamic form by expanding it to +// the bounded size and attaching a metadata filled with dimension sizes. +// +// From: +// +--------+ +// |Payload | +// +--------+ +// +// To: +// +// +--------+ +// |Payload | +// +--------+ +// | Padding| +// +--------+ +// |Metadata| +// +--------+ +// +// As we can't expand the size of an existing memory allocation, a reallocation +// is required. A list of new allocations are returned after this function. The +// caller is reponsible for maintaining those allocations. 
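The layout in the diagram above can be reproduced on host memory with a couple of memcpys: copy the payload into a buffer sized for the bound, then write each dimension size as an int32 after the padded payload. The sketch below does exactly that for a rank-1 F32 value; the real UpdateMetadata path (and the input-expansion function that follows) does the equivalent with device memory, a transfer manager, and stream copies, and the helper name here is made up for illustration.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Expand a static payload to the dynamic layout: payload, padding to the
// bound, then one int32 per dimension holding the runtime size.
std::vector<uint8_t> ExpandToDynamicLayout(const std::vector<float>& payload,
                                           int64_t bound) {
  const size_t static_bytes = bound * sizeof(float);  // payload + padding
  const size_t metadata_bytes = sizeof(int32_t);      // rank-1: one dim size
  std::vector<uint8_t> buffer(static_bytes + metadata_bytes, 0);
  std::memcpy(buffer.data(), payload.data(), payload.size() * sizeof(float));
  const int32_t dim_size = static_cast<int32_t>(payload.size());
  std::memcpy(buffer.data() + static_bytes, &dim_size, sizeof(dim_size));
  return buffer;
}

int main() {
  std::vector<uint8_t> buffer = ExpandToDynamicLayout({1.0f, 2.0f}, /*bound=*/4);
  int32_t recovered = 0;
  std::memcpy(&recovered, buffer.data() + 4 * sizeof(float), sizeof(recovered));
  std::cout << buffer.size() << " bytes, dim_size = " << recovered << "\n";  // 20, 2
  return 0;
}

The read side further down (ReadMetadataLiteral and UpdateDynamicOutputs) walks the same tail in the other direction to recover the logical output shape.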
+xla::StatusOr> UpdateDynamicInputs( + se::Stream* stream, se::DeviceMemoryAllocator* allocator, + std::vector runtime_inputs, + const std::vector& compile_time_shapes) { + std::vector new_allocations; + TF_RET_CHECK(runtime_inputs.size() == compile_time_shapes.size()); + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + for (int64 i = 0; i < compile_time_shapes.size(); i++) { + const xla::Shape& compile_time_shape = compile_time_shapes[i].shape(); + if (compile_time_shape.is_static()) { + continue; + } + auto* runtime_input = runtime_inputs[i]; + + bool element_modified = false; + TF_RETURN_IF_ERROR(xla::ShapeUtil::ForEachSubshapeWithStatus( + compile_time_shape, + [&](const xla::Shape& compile_time_shape, + const xla::ShapeIndex& index) -> Status { + if (compile_time_shape.IsTuple() || compile_time_shape.is_static()) { + return Status::OK(); + } + const xla::Shape& runtime_shape = xla::ShapeUtil::GetSubshape( + runtime_input->on_device_shape(), index); + TF_RET_CHECK(!runtime_shape.IsTuple()); + TF_RET_CHECK(xla::ShapeUtil::DynamicShapeIsCompatible( + runtime_shape, compile_time_shape)); + se::DeviceMemoryBase* static_input = + runtime_input->buffers().mutable_element(index); + TF_ASSIGN_OR_RETURN( + auto dynamic_input, + allocator->Allocate(stream->parent()->device_ordinal(), + shape_size_fn(compile_time_shape))); + new_allocations.emplace_back(std::move(dynamic_input)); + se::DeviceMemory* dynamic_input_base = + new_allocations.back().ptr(); + // Send the original data to the new location. + stream->ThenMemcpyD2D(dynamic_input_base, *static_input, + static_input->size()); + TF_RETURN_IF_ERROR(UpdateMetadata(stream, dynamic_input_base, + compile_time_shape, runtime_shape)); + // Modify the memory location in the input shape tree to point to the + // new input. + runtime_input->set_buffer(*dynamic_input_base, index); + element_modified = true; + return Status::OK(); + })); + if (element_modified) { + runtime_input->set_shapes(compile_time_shape, compile_time_shape); + // The input location has been modified, need to fix tuple table to + // point to the correct address. + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR( + transfer_manager->WriteTupleIndexTablesAsync(stream, *runtime_input)); + } + } + return std::move(new_allocations); +} + +xla::StatusOr ReadMetadataLiteral( + se::Stream* stream, se::DeviceMemoryBase* buffer, + const xla::Shape& buffer_shape, xla::TransferManager* transfer_manager) { + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + xla::Shape buffer_shape_static = + xla::ShapeUtil::MakeStaticShape(buffer_shape); + const int64 offset = shape_size_fn(buffer_shape_static); + int64 metadata_size = shape_size_fn(buffer_shape) - offset; + TF_RET_CHECK(metadata_size != 0); + auto buffer_8 = se::DeviceMemory(*buffer); + auto metadata_buffer = + stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); + return transfer_manager->TransferArrayFromDevice( + stream, + xla::ShapeUtil::MakeShape(xla::S32, {buffer_shape.dimensions_size()}), + metadata_buffer); +} + +// For each subshape in the result buffer that's dynamic, read the dynamic +// dimension sizes from the metadata, and update output shapes. The result shape +// is a static and concrete shape. 
+xla::Status UpdateDynamicOutputs(se::Stream* stream, + xla::ShapedBuffer* shaped_buffer, + xla::Shape* output_host_shape, + xla::Shape* output_device_shape) { + DCHECK(output_device_shape->is_dynamic()); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus( + [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + const xla::Shape& buffer_shape = + xla::ShapeUtil::GetSubshape(*output_device_shape, index); + if (buffer_shape.IsTuple()) { + return Status::OK(); + } + xla::Shape& host_shape = + *xla::ShapeUtil::GetMutableSubshape(output_host_shape, index); + xla::Shape& device_shape = + *xla::ShapeUtil::GetMutableSubshape(output_device_shape, index); + if (device_shape.is_static()) { + return Status::OK(); + } + TF_ASSIGN_OR_RETURN(auto metadata, + ReadMetadataLiteral(stream, buffer, buffer_shape, + transfer_manager)); + // Update shape size from metadata. + for (int64 i = 0; i < metadata.element_count(); ++i) { + host_shape.mutable_dimensions()[i] = metadata.Get({i}); + device_shape.mutable_dimensions()[i] = metadata.Get({i}); + } + return Status::OK(); + })); + output_host_shape->clear_dynamic_dimensions(); + output_device_shape->clear_dynamic_dimensions(); + return Status::OK(); +} + +// Create output tuple from run_result. +xla::StatusOr> CreateOutputTuple( + se::Stream* stream, xla::ScopedShapedBuffer run_result, + xla::Backend* backend, int device_ordinal) { + XRTTupleAllocation* output_tuple; + xla::ShapedBuffer shaped_buffer = run_result.release(); + if (shaped_buffer.on_device_shape().is_dynamic()) { + // Update dynamic shapes from output buffer, and create a XRT tensor with + // dimension sizes read from metadata. + xla::Shape output_host_shape = shaped_buffer.on_host_shape(); + xla::Shape output_device_shape = shaped_buffer.on_device_shape(); + TF_RETURN_IF_ERROR(UpdateDynamicOutputs( + stream, &shaped_buffer, &output_host_shape, &output_device_shape)); + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( + shaped_buffer, output_host_shape, output_device_shape, backend, + device_ordinal, &output_tuple)); + } else { + // Fast-path: Don't copy shapes of output buffer. + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( + shaped_buffer, backend, device_ordinal, &output_tuple)); + } + return RefPtr(output_tuple); +} + xla::StatusOr> RunExecutable( OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, @@ -191,18 +421,31 @@ xla::StatusOr> RunExecutable( Env* env = Env::Default(); auto start_time = env->NowMicros(); + const std::vector& shape_layouts = + executable->executable() + ->module_config() + .entry_computation_layout() + .parameter_layouts(); + TF_ASSIGN_OR_RETURN(auto new_allocations, + UpdateDynamicInputs(stream, run_options.allocator(), + input_buffers.input_pointers, + shape_layouts)); + auto new_allocations_ptr = + std::make_shared>( + std::move(new_allocations)); TF_ASSIGN_OR_RETURN( xla::ScopedShapedBuffer run_result, executable->Run(input_buffers.input_pointers, run_options)); + // Retain the new allocation for input memory until the end of execution. 
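As the comment above notes, the reallocated input buffers are kept alive only by a shared_ptr captured in a do-nothing host callback enqueued on the stream, which is enough to hold them until the asynchronous execution has passed that point. The sketch below shows the same lifetime trick with a plain callback queue standing in for the stream; nothing here is a real stream or XRT API.

#include <functional>
#include <iostream>
#include <memory>
#include <vector>

int main() {
  std::vector<std::function<void()>> queued_callbacks;  // stand-in for the stream
  {
    auto allocations = std::make_shared<std::vector<int>>(1024, 0);
    // Enqueue a no-op whose only purpose is to hold a reference.
    queued_callbacks.push_back([allocations]() { /* keep-alive only */ });
    std::cout << "references in scope: " << allocations.use_count() << "\n";  // 2
  }
  // The local handle is gone, but the queued capture still owns the buffer.
  std::cout << "callbacks holding the buffer: " << queued_callbacks.size() << "\n";
  queued_callbacks.clear();  // analogous to the stream draining its callbacks
  return 0;
}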
+ stream->ThenDoHostCallback([new_allocations_ptr]() { return Status::OK(); }); + auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time: " << elapsed << "us"; - auto shaped_buffer = run_result.release(); - XRTTupleAllocation* output_tuple; - TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - shaped_buffer, device_ref->backend(), device_ref->device_ordinal(), - &output_tuple)); - RefPtr output_tuple_ptr(output_tuple); + TF_ASSIGN_OR_RETURN( + RefPtr output_tuple_ptr, + CreateOutputTuple(stream, std::move(run_result), device_ref->backend(), + device_ref->device_ordinal())); // The ScopedShapedBuffer returned by the executable Run() API, in case of // input/output buffer aliasing, might have holes in it, which need to be @@ -215,7 +458,7 @@ xla::StatusOr> RunExecutable( const xla::HloInputOutputAliasConfig::Alias& alias) -> Status { TF_RET_CHECK(alias.parameter_number < input_buffers.input_tuples.size()); return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias - ? output_tuple->AliasBufferFrom( + ? output_tuple_ptr->AliasBufferFrom( *input_buffers.input_tuples[alias.parameter_number], alias.parameter_index, output_index) : Status::OK(); diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index 243289c8821..fbf9dfd0a17 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -49,6 +49,67 @@ limitations under the License. namespace tensorflow { namespace { +xla::XlaComputation ReturnDynamicR1() { + xla::XlaBuilder builder("ReturnDynamicR1"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto pad_sum = xla::SetDimensionSize(sum, p2, 0); + return builder.Build(pad_sum).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR1() { + xla::XlaBuilder builder("AcceptDynamicR1"); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + auto p0 = xla::Parameter(&builder, 0, dyn_shape, "P0"); + auto p1 = xla::Parameter(&builder, 1, dyn_shape, "P1"); + auto sum = xla::Add(p0, p1); + return builder.Build(sum).ValueOrDie(); +} + +xla::XlaComputation ReturnDynamicR1Tuple() { + xla::XlaBuilder builder("ReturnDynamicR1Tuple"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto sub = xla::Sub(p0, p1); + auto one = xla::One(&builder, xla::S32); + auto pad_sum = xla::SetDimensionSize(sum, p2, 0); + auto pad_sub = xla::SetDimensionSize(sub, p2 + one, 0); + auto tuple = xla::Tuple(&builder, {pad_sum, sum, pad_sub}); + return builder.Build(tuple, /*remove_dynamic_dimensions=*/true).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR1Tuple() { + xla::XlaBuilder builder("AcceptDynamicR1"); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + xla::Shape tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); + xla::Shape nest_tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); + auto p = 
xla::Parameter(&builder, 0, tuple_shape, "P0"); + auto p0 = xla::GetTupleElement(p, 0); + auto p1 = xla::GetTupleElement(p, 1); + auto sum = xla::Add(p0, p1); + return builder.Build(sum).ValueOrDie(); +} + +template +xla::LiteralProto CreateR0(T v) { + auto array = xla::LiteralUtil::CreateR0(v); + return array.ToProto(); +} + class XrtClientSession : public ClientSession { public: explicit XrtClientSession(const Scope& scope) : ClientSession(scope) { @@ -61,6 +122,11 @@ class XrtClientSession : public ClientSession { string* xla_test_device_ptr; // initial value set in main() string* xla_platform_ptr; // initial value set in main() +bool SupportDynamicShapes() { + // TODO(jackcao): Support dynamic shapes on XLA GPU. + return *xla_test_device_ptr != "XLA_GPU"; +} + string DeviceFromFlag() { string xla_test_device = *xla_test_device_ptr; return absl::StrCat("/device:", xla_test_device, ":0"); @@ -1035,6 +1101,239 @@ TEST(RawApiTest, CompileAndExecute) { EXPECT_EQ(program_shape.parameters_size(), 2); } +TEST(RawApiTest, DynamicR1Test) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, 2.5f, 1.17f}); + xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(ReturnDynamicR1(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, DynamicR1TupleTest) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f, 1.0f}); + 
xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = + xla::ShapeUtil::MakeTupleShape( + {dyn_shape, xla::ShapeUtil::MakeShape(xla::F32, {4}), dyn_shape}) + .ToProto(); + StoreComputationSnapshot(ReturnDynamicR1Tuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected0 = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); + auto expected1 = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f, 0.0f}); + auto expected2 = xla::LiteralUtil::CreateR1({0.0f, 3.0f, 1.0f}); + auto expected = + xla::LiteralUtil::MakeTuple({&expected0, &expected1, &expected2}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR1TupleTest) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); + + xrt::XLATupleNode tuple_desc; + auto subdesc_10 = tuple_desc.add_tuples(); + auto subdesc_11 = tuple_desc.add_tuples(); + subdesc_10->set_input_index(0); + subdesc_10->set_release_input_handle(true); + subdesc_11->set_input_index(1); + subdesc_11->set_release_input_handle(true); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_input_shape.set_dynamic_dimension(0, true); + xla::Shape dyn_tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_input_shape, dyn_input_shape}); + *shapes->add_parameters() = dyn_tuple_shape.ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR1Tuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + 
e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + + auto tuple_0 = ops::Const(root.WithDevice("/device:CPU:0"), + tuple_desc.SerializeAsString()); + auto t0_handle = ops::XRTMakeTuple( + root, tuple_0, + {static_cast(p0_handle), static_cast(p1_handle)}); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {static_cast(t0_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR1Test) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_input_shape.set_dynamic_dimension(0, true); + *shapes->add_parameters() = dyn_input_shape.ToProto(); + *shapes->add_parameters() = dyn_input_shape.ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR1(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto allocate_op_0 = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto allocate_op_1 = ops::XRTAllocate(root, p1_value); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {Output(allocate_op_0), Output(allocate_op_1)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { xrt::XLAAllocation p0; *p0.mutable_value() 
= FloatVector({1.0f, 2.0f}); From 2bc8118d17e8c4ca5a43e007af1521dfddabd02c Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 13 May 2020 14:18:41 -0700 Subject: [PATCH 0517/1533] Threadpool changes for pooling ops. --- tensorflow/core/kernels/mkl_avgpooling_op.cc | 26 ++++++++++------- tensorflow/core/kernels/mkl_maxpooling_op.cc | 29 ++++++++++++------- .../core/kernels/mkl_pooling_ops_common.cc | 27 +++++++++-------- .../core/kernels/mkl_pooling_ops_common.h | 26 +++++++---------- 4 files changed, 56 insertions(+), 52 deletions(-) diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc index 47b999e0e11..66e86639aaa 100644 --- a/tensorflow/core/kernels/mkl_avgpooling_op.cc +++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc @@ -136,9 +136,10 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { const T* src_data = input_tensor.flat().data(); T* dst_data = output_tensor->flat().data(); - + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, pooling_fwd->GetEngine())); // Execute pooling op. - pooling_fwd->Execute(src_data, dst_data); + pooling_fwd->Execute(src_data, dst_data, nullptr, fwd_cpu_stream); // Pass min, max from input to output. if (int8_forward_inference) { @@ -160,9 +161,9 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase { output_max->flat()(0) = max_input; } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -240,8 +241,8 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { : memory::desc(diff_dst_dims, MklDnnType(), this->data_format_mkldnn_); - // Pass prop_kind::forward_training to create a forward primitive - // that is used in the backward pass. +// Pass prop_kind::forward_training to create a forward primitive +// that is used in the backward pass. #ifdef ENABLE_MKLDNN_V1 // TODO(DNNL): Find out what should we use src_md.data.format. MklPoolingParams bwdParams( @@ -260,6 +261,8 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { MklPoolingBwdPrimitive* pooling_bwd = MklPoolingBwdPrimitiveFactory::Get(bwdParams); + std::shared_ptr bwd_cpu_stream; + bwd_cpu_stream.reset(CreateStream(context, pooling_bwd->GetEngine())); Tensor* output_tensor = nullptr; this->AllocateOutputTensor(context, *(pooling_bwd->GetPoolingBwdPd()), orig_input_dims_mkl_order, @@ -286,11 +289,12 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase { T* diff_src_data = output_tensor->flat().data(); // Execute pooling op. 
- pooling_bwd->Execute(diff_dst_data, diff_src_data); + pooling_bwd->Execute(diff_dst_data, diff_src_data, nullptr, + bwd_cpu_stream); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", error_msg)); } diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc index dbccb35b88b..3a560775f2b 100644 --- a/tensorflow/core/kernels/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc @@ -167,10 +167,12 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { const T* src_data = input_tensor.flat().data(); T* dst_data = output_tensor->flat().data(); + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, pooling_fwd->GetEngine())); if (int8_forward_inference) { // Execute pooling op - pooling_fwd->Execute(src_data, dst_data); + pooling_fwd->Execute(src_data, dst_data, nullptr, fwd_cpu_stream); // Pass min, max from input to output. const Tensor& min_input_t = MklGetInput(context, 1); @@ -197,12 +199,12 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { T* ws_data = static_cast(dnn_data_wksp.GetOpMem().get_data_handle()); // Execute pooling op. - pooling_fwd->Execute(src_data, dst_data, ws_data); + pooling_fwd->Execute(src_data, dst_data, ws_data, fwd_cpu_stream); } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", error_msg)); } @@ -322,6 +324,8 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { MklPoolingBwdPrimitive* pooling_bwd = MklPoolingBwdPrimitiveFactory::Get(bwdParams); + std::shared_ptr bwd_cpu_stream; + bwd_cpu_stream.reset(CreateStream(context, pooling_bwd->GetEngine())); // Allocate output tensor and memory primitive. Tensor* output_tensor = nullptr; this->AllocateOutputTensor(context, *(pooling_bwd->GetPoolingBwdPd()), @@ -335,8 +339,10 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { if (IS_DIFF_DST_REORDER_NEEDED(diff_dst_md, pooling_bwd_pd, pooling_bwd)) { grad_dnn_data.SetUsrMem(diff_dst_md, &grad_tensor); - grad_dnn_data.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - GET_DIFF_DST_DESC_FROM_OP_PD(pooling_bwd_pd), cpu_engine_)); + grad_dnn_data.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(GET_DIFF_DST_DESC_FROM_OP_PD(pooling_bwd_pd), + cpu_engine_), + context); diff_dst_data = static_cast(grad_dnn_data.GetOpMem().get_data_handle()); } else { @@ -361,11 +367,12 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { T* diff_src_data = output_tensor->flat().data(); // Execute pooling op. - pooling_bwd->Execute(diff_dst_data, diff_src_data, ws_data); + pooling_bwd->Execute(diff_dst_data, diff_src_data, ws_data, + bwd_cpu_stream); } catch (mkldnn::error& e) { - string error_msg = "Status:" + std::to_string(e.status) + - ", message: " + string(e.message) + ". 
in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status:" + std::to_string(e.status) + ", message: " + + string(e.message) + ". in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK(context, errors::Aborted("Compute received an exception:", error_msg)); } diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc index 5bd9c17f95e..2dfc6db0075 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/kernel_shape_util.h" - namespace tensorflow { using mkldnn::prop_kind; @@ -38,11 +37,11 @@ void MklPoolingFwdPrimitive::Setup(const MklPoolingParams& fwdParams) { context_.alg_kind = fwdParams.alg_kind; context_.prop_kind = fwdParams.prop_kind; - // Create memory descriptor - // FIXME: Pooling doesn't expose to get the src_primitive_desc, - // so src format is currently hard-coded. - // A utility function is used to do this, - // which may be broken with future CPU architectures +// Create memory descriptor +// FIXME: Pooling doesn't expose to get the src_primitive_desc, +// so src format is currently hard-coded. +// A utility function is used to do this, +// which may be broken with future CPU architectures #ifndef ENABLE_MKLDNN_V1 bool is_2d = (fwdParams.src_dims.size() == 4); if (std::is_same::value || std::is_same::value) @@ -126,7 +125,8 @@ void MklPoolingFwdPrimitive::Setup(const MklPoolingParams& fwdParams) { template void MklPoolingFwdPrimitive::Execute(const T* src_data, T* dst_data, - void* ws_data) { + void* ws_data, + std::shared_ptr fwd_stream) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); @@ -138,10 +138,9 @@ void MklPoolingFwdPrimitive::Execute(const T* src_data, T* dst_data, } #ifdef ENABLE_MKLDNN_V1 - execute_primitives(context_.fwd_primitives, context_.fwd_stream, - context_.net_args); + execute_primitives(context_.fwd_primitives, fwd_stream, context_.net_args); #else - context_.fwd_stream->submit(context_.fwd_primitives); + fwd_stream->submit(context_.fwd_primitives); #endif // ENABLE_MKLDNN_V1 // Set back data handle. @@ -268,7 +267,8 @@ void MklPoolingBwdPrimitive::Setup(const MklPoolingParams& bwdParams) { template void MklPoolingBwdPrimitive::Execute(const T* diff_dst_data, - T* diff_src_data, const void* ws_data) { + T* diff_src_data, const void* ws_data, + std::shared_ptr bwd_stream) { context_.diff_dst_mem->set_data_handle( static_cast(const_cast(diff_dst_data))); context_.diff_src_mem->set_data_handle(static_cast(diff_src_data)); @@ -278,10 +278,9 @@ void MklPoolingBwdPrimitive::Execute(const T* diff_dst_data, } #ifdef ENABLE_MKLDNN_V1 - execute_primitives(context_.bwd_primitives, context_.bwd_stream, - context_.net_args); + execute_primitives(context_.bwd_primitives, bwd_stream, context_.net_args); #else - context_.bwd_stream->submit(context_.bwd_primitives); + bwd_stream->submit(context_.bwd_primitives); #endif // ENABLE_MKLDNN_V1 // Set back data handle. 
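The common thread in this pooling change is a move away from streams owned by the cached primitive: the stream is now created once per Compute call and passed into Execute, so a primitive built once can be reused while each invocation runs on a stream tied to the calling op's context (for example its threadpool). The sketch below shows the shape of that pattern with stand-in types; none of these classes are oneDNN or TensorFlow types.

#include <iostream>
#include <string>

// Stand-in for a per-call execution stream.
struct Stream {
  std::string name;  // e.g. bound to the calling op's threadpool
};

// Stand-in for a primitive that is built once and cached across calls.
class CachedPoolingPrimitive {
 public:
  void Execute(const float* src, float* dst, const Stream& stream) const {
    // Real code would submit the cached primitive's ops on the given stream.
    dst[0] = src[0];
    std::cout << "executed on stream: " << stream.name << "\n";
  }
};

int main() {
  CachedPoolingPrimitive primitive;         // constructed once, then reused
  float src = 1.0f;
  float dst = 0.0f;
  Stream per_call_stream{"op-threadpool"};  // created fresh inside Compute()
  primitive.Execute(&src, &dst, per_call_stream);
  return 0;
}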
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h index 54f4dc8503e..9df0256512e 100644 --- a/tensorflow/core/kernels/mkl_pooling_ops_common.h +++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h @@ -86,8 +86,7 @@ template class MklPoolingFwdPrimitive : public MklPrimitive { public: explicit MklPoolingFwdPrimitive(const MklPoolingParams& fwdParams) - : cpu_engine_(ENGINE_CPU, 0) { - context_.fwd_stream.reset(new CPU_STREAM(cpu_engine_)); + : MklPrimitive(engine(ENGINE_CPU, 0)) { if (context_.fwd == nullptr) Setup(fwdParams); } @@ -97,7 +96,8 @@ class MklPoolingFwdPrimitive : public MklPrimitive { // src_data: input data buffer of src // ws_data: output data buffer of workspace // dst_data: output data buffer of dst - void Execute(const T* src_data, T* dst_data, void* ws_data = nullptr); + void Execute(const T* src_data, T* dst_data, void* ws_data, + std::shared_ptr fwd_stream); std::shared_ptr GetPoolingFwdPd() const { return context_.fwd_pd; @@ -159,12 +159,10 @@ class MklPoolingFwdPrimitive : public MklPrimitive { fwd_pd(nullptr), src_md(nullptr), dst_md(nullptr), - fwd(nullptr), - fwd_stream(nullptr) {} + fwd(nullptr) {} }; struct PoolingFwdContext context_; - engine cpu_engine_; }; template @@ -229,8 +227,7 @@ template class MklPoolingBwdPrimitive : public MklPrimitive { public: explicit MklPoolingBwdPrimitive(const MklPoolingParams& bwdParams) - : cpu_engine_(ENGINE_CPU, 0) { - context_.bwd_stream.reset(new CPU_STREAM(cpu_engine_)); + : MklPrimitive(engine(ENGINE_CPU, 0)) { if (context_.bwd == nullptr) Setup(bwdParams); } @@ -240,8 +237,8 @@ class MklPoolingBwdPrimitive : public MklPrimitive { // diff_dst_data: input data buffer of diff_dst // diff_src_data: output data buffer of diff_src // ws_data: input data buffer of workspace - void Execute(const T* diff_dst_data, T* diff_src_data, - const void* ws_data = nullptr); + void Execute(const T* diff_dst_data, T* diff_src_data, const void* ws_data, + std::shared_ptr bwd_stream); public: std::shared_ptr GetPoolingFwdPd() const { @@ -315,12 +312,10 @@ class MklPoolingBwdPrimitive : public MklPrimitive { bwd_desc(nullptr), fwd_pd(nullptr), bwd_pd(nullptr), - bwd(nullptr), - bwd_stream(nullptr) {} + bwd(nullptr) {} }; struct PoolingBwdContext context_; - engine cpu_engine_; }; template @@ -694,9 +689,8 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase { errors::InvalidArgument("Input must be 4 or 5-dimensional")); } else { OP_REQUIRES( - context, - input_mkl_shape.GetDimension() == 4 || - input_mkl_shape.GetDimension() == 5, + context, input_mkl_shape.GetDimension() == 4 || + input_mkl_shape.GetDimension() == 5, errors::InvalidArgument("Input shape must be 4 or 5-dimensional")); } } From 062cf92d066771ab3cf2910f125b0209c305eb2b Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Wed, 13 May 2020 14:17:05 -0700 Subject: [PATCH 0518/1533] [tf.lite] Adds a setQuantizedModelsAllowed() Java API for running quant models with GPU delegate PiperOrigin-RevId: 311402449 Change-Id: I49809a004ad11c4bc9d9e5272472f3b85ea7948f --- .../org/tensorflow/lite/gpu/GpuDelegate.java | 22 ++++- .../java/src/main/native/gpu_delegate_jni.cc | 6 +- tensorflow/lite/java/BUILD | 1 + .../java/org/tensorflow/lite/Interpreter.java | 5 ++ .../lite/NativeInterpreterWrapper.java | 7 ++ .../native/nativeinterpreterwrapper_jni.cc | 9 ++ .../lite/InterpreterTestHelper.java | 29 +++++++ .../tensorflow/lite/gpu/GpuDelegateTest.java | 85 ++++++++++++++++++- 8 files changed, 160 insertions(+), 4 deletions(-) 
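Intended usage of the new option, as exercised by the GpuDelegateTest changes
below (sketch only; quantizedModelBuffer, input and output are placeholder
variables, and the option defaults to false):

    try (GpuDelegate delegate =
            new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true));
         Interpreter interpreter =
            new Interpreter(quantizedModelBuffer,
                            new Interpreter.Options().addDelegate(delegate))) {
      interpreter.run(input, output);
    }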
create mode 100644 tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTestHelper.java diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java index 8d802ae044a..895f12f0233 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java +++ b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java @@ -62,6 +62,18 @@ public class GpuDelegate implements Delegate, Closeable { return this; } + /** + * Enables running quantized models with the delegate. Defaults to false. + * + *

WARNING: This is an experimental API and subject to change. + * + * @param quantizedModelsAllowed When {@code true}, the GPU may run quantized models. + */ + public Options setQuantizedModelsAllowed(boolean quantizedModelsAllowed) { + this.quantizedModelsAllowed = quantizedModelsAllowed; + return this; + } + /** * Sets the inference preference for precision/compilation/runtime tradeoffs. * @@ -74,11 +86,16 @@ public class GpuDelegate implements Delegate, Closeable { } boolean precisionLossAllowed = true; + boolean quantizedModelsAllowed = false; int inferencePreference = INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER; } public GpuDelegate(Options options) { - delegateHandle = createDelegate(options.precisionLossAllowed, options.inferencePreference); + delegateHandle = + createDelegate( + options.precisionLossAllowed, + options.quantizedModelsAllowed, + options.inferencePreference); } public GpuDelegate() { @@ -107,7 +124,8 @@ public class GpuDelegate implements Delegate, Closeable { System.loadLibrary(TFLITE_GPU_LIB); } - private static native long createDelegate(boolean precisionLossAllowed, int preference); + private static native long createDelegate( + boolean precisionLossAllowed, boolean quantizedModelsAllowed, int preference); private static native void deleteDelegate(long delegateHandle); } diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc index 089e2c2f816..900cc0e0d75 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc +++ b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc @@ -23,7 +23,7 @@ extern "C" { JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_createDelegate( JNIEnv* env, jclass clazz, jboolean precision_loss_allowed, - jint inference_preference) { + jboolean quantized_models_allowed, jint inference_preference) { TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default(); if (precision_loss_allowed == JNI_TRUE) { options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY; @@ -31,6 +31,10 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_createDelegate( TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE; options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION; } + options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE; + if (quantized_models_allowed) { + options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT; + } options.inference_preference = static_cast(inference_preference); return reinterpret_cast(TfLiteGpuDelegateV2Create(&options)); } diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 46cd1be25cb..5eb5e8ab023 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -353,6 +353,7 @@ filegroup( filegroup( name = "portable_gpu_tests", srcs = [ + "src/test/java/org/tensorflow/lite/InterpreterTestHelper.java", "src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java", ], visibility = ["//visibility:public"], diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index d191b550d8f..5625ef98bb6 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -491,6 +491,11 @@ public final class Interpreter implements AutoCloseable { wrapper.resetVariableTensors(); } + int 
getExecutionPlanLength() { + checkNotClosed(); + return wrapper.getExecutionPlanLength(); + } + /** Release resources associated with the {@code Interpreter}. */ @Override public void close() { diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java index a22d7241587..8eb3c66f3b5 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java @@ -324,6 +324,11 @@ final class NativeInterpreterWrapper implements AutoCloseable { return outputTensor; } + /** Gets the number of ops in the execution plan. */ + int getExecutionPlanLength() { + return getExecutionPlanLength(interpreterHandle); + } + private void applyDelegates(Interpreter.Options options) { // First apply the flex delegate if necessary. This ensures the graph is fully resolved before // applying other delegates. @@ -419,6 +424,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { private static native int getOutputCount(long interpreterHandle); + private static native int getExecutionPlanLength(long interpreterHandle); + private static native String[] getInputNames(long interpreterHandle); private static native String[] getOutputNames(long interpreterHandle); diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc index 971aa5efd7a..690b58ac1f4 100644 --- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc +++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc @@ -241,6 +241,15 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputTensorIndex( return interpreter->outputs()[output_index]; } +JNIEXPORT jint JNICALL +Java_org_tensorflow_lite_NativeInterpreterWrapper_getExecutionPlanLength( + JNIEnv* env, jclass clazz, jlong handle) { + tflite_api_dispatcher::Interpreter* interpreter = + convertLongToInterpreter(env, handle); + if (interpreter == nullptr) return 0; + return static_cast(interpreter->execution_plan().size()); +} + JNIEXPORT jint JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputCount(JNIEnv* env, jclass clazz, diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTestHelper.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTestHelper.java new file mode 100644 index 00000000000..34eb47e4dbe --- /dev/null +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTestHelper.java @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow.lite; + +/** Utility for interacting with Interpreter in delegate tests. 
*/ +public abstract class InterpreterTestHelper { + + /** + * Returns the number of nodes in the execution plan that are invoked per inference. + * + *

WARNING: This is an experimental API and subject to change. + */ + public static int executionPlanLength(Interpreter interpreter) { + return interpreter.getExecutionPlanLength(); + } +} diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java index 1fe4a531624..d92a7119aab 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java @@ -18,12 +18,17 @@ package org.tensorflow.lite.gpu; import static com.google.common.truth.Truth.assertThat; import java.nio.ByteBuffer; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.Map; +import java.util.PriorityQueue; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; import org.tensorflow.lite.Interpreter; +import org.tensorflow.lite.InterpreterTestHelper; import org.tensorflow.lite.TestUtils; /** Unit tests for {@link org.tensorflow.lite.gpu.GpuDelegate}. */ @@ -32,6 +37,9 @@ public final class GpuDelegateTest { private static final String MODEL_PATH = "tensorflow/lite/testdata/multi_add.bin"; private static final ByteBuffer MODEL_BUFFER = TestUtils.getTestFileAsBuffer(MODEL_PATH); + private static final ByteBuffer MOBILENET_QUANTIZED_MODEL_BUFFER = + TestUtils.getTestFileAsBuffer( + "third_party/tensorflow/lite/java/demo/app/src/main/assets/mobilenet_v1_1.0_224_quant.tflite"); @Test public void testBasic() throws Exception { @@ -41,7 +49,7 @@ public final class GpuDelegateTest { } @Test - public void testInterpreterWithGpu() throws Exception { + public void testInterpreterWithGpu_FloatModel() throws Exception { Interpreter.Options options = new Interpreter.Options(); try (GpuDelegate delegate = new GpuDelegate(); Interpreter interpreter = new Interpreter(MODEL_BUFFER, options.addDelegate(delegate))) { @@ -60,4 +68,79 @@ public final class GpuDelegateTest { assertThat(parsedOutput1).usingTolerance(0.1f).containsExactly(expected1).inOrder(); } } + + @Test + public void testInterpreterWithGpu_QuantModelRunWithDelegate() throws Exception { + ByteBuffer img = + TestUtils.getTestImageAsByteBuffer( + "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg"); + + Interpreter.Options options = new Interpreter.Options(); + try (GpuDelegate delegate = + new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true)); + Interpreter interpreter = + new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) { + byte[][] output = new byte[1][1001]; + interpreter.run(img, output); + // Should be only 1 node (Delegate) in the execution plan. 
+ assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(1); + assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3}); + assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001}); + // 653 == "military uniform" + assertThat(getTopKLabels(output, 3)).contains(653); + } + } + + @Test + public void testInterpreterWithGpu_QuantModelRunOnCPU() throws Exception { + ByteBuffer img = + TestUtils.getTestImageAsByteBuffer( + "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg"); + + Interpreter.Options options = new Interpreter.Options(); + try (GpuDelegate delegate = new GpuDelegate(); + Interpreter interpreter = + new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) { + byte[][] output = new byte[1][1001]; + interpreter.run(img, output); + // Original execution plan remains since default behavior doesn't allow quantized models. + assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(31); + assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3}); + assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001}); + // 653 == "military uniform" + assertThat(getTopKLabels(output, 3)).contains(653); + } + } + + private static ArrayList getTopKLabels(byte[][] byteLabels, int k) { + float[][] labels = new float[1][1001]; + for (int i = 0; i < byteLabels[0].length; ++i) { + labels[0][i] = (byteLabels[0][i] & 0xff) / 255.0f; + } + return getTopKLabels(labels, k); + } + + private static ArrayList getTopKLabels(float[][] labels, int k) { + PriorityQueue> pq = + new PriorityQueue<>( + k, + new Comparator>() { + @Override + public int compare(Map.Entry o1, Map.Entry o2) { + // Intentionally reversed to put high confidence at the head of the queue. + return o1.getValue().compareTo(o2.getValue()) * -1; + } + }); + + for (int i = 0; i < labels[0].length; ++i) { + pq.add(new AbstractMap.SimpleEntry<>(i, labels[0][i])); + } + + final ArrayList topKLabels = new ArrayList<>(); + int topKLabelsSize = Math.min(pq.size(), k); + for (int i = 0; i < topKLabelsSize; ++i) { + topKLabels.add(pq.poll().getKey()); + } + return topKLabels; + } } From cec91bb8d67993892c7451f6fde446846ec786cf Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 13 May 2020 14:23:49 -0700 Subject: [PATCH 0519/1533] threadpool support for mkl_conv_bwd ops. --- .../core/kernels/mkl_conv_grad_filter_ops.cc | 49 ++++++++++--------- .../core/kernels/mkl_conv_grad_input_ops.cc | 44 +++++++++-------- 2 files changed, 51 insertions(+), 42 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index 269513f2e7d..70a4bed9a08 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -97,9 +97,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive { public: explicit MklConvBwdFilterPrimitive( const MklConvBwdFilterParams& convBwdFilterDims) - : cpu_engine_(ENGINE_CPU, 0) { - context_.bwd_filter_stream.reset(new CPU_STREAM(cpu_engine_)); - + : MklPrimitive(engine(ENGINE_CPU, 0)) { // Create convolution backward filter primitive. 
if (context_.conv_bwd_filter == nullptr) { Setup(convBwdFilterDims); @@ -114,7 +112,8 @@ class MklConvBwdFilterPrimitive : public MklPrimitive { // diff_bias_data: output data buffer for diff_bias // diff_dst_data: input data buffer for diff_dst void Execute(const T* src_data, const T* diff_filter_data, - const T* diff_bias_data, const T* diff_dst_data) { + const T* diff_bias_data, const T* diff_dst_data, + std::shared_ptr bwd_filter_stream) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.diff_filter_mem->set_data_handle( @@ -127,11 +126,10 @@ class MklConvBwdFilterPrimitive : public MklPrimitive { static_cast(const_cast(diff_dst_data))); #ifdef ENABLE_MKLDNN_V1 - execute_primitives(context_.bwd_filter_primitives, - context_.bwd_filter_stream, + execute_primitives(context_.bwd_filter_primitives, bwd_filter_stream, context_.bwd_filter_primitives_args); #else - context_.bwd_filter_stream->submit(context_.bwd_filter_primitives); + bwd_filter_stream->submit(context_.bwd_filter_primitives); #endif context_.src_mem->set_data_handle(DummyData); @@ -147,8 +145,10 @@ class MklConvBwdFilterPrimitive : public MklPrimitive { // diff_filter_data: output data buffer of diff_filter // diff_dst_data: input data buffer of diff_dst void Execute(const T* src_data, const T* diff_filter_data, - const T* diff_dst_data) { - Execute(src_data, diff_filter_data, nullptr, diff_dst_data); + const T* diff_dst_data, + std::shared_ptr bwd_filter_stream) { + Execute(src_data, diff_filter_data, nullptr, diff_dst_data, + bwd_filter_stream); } #ifndef ENABLE_MKLDNN_V1 @@ -223,8 +223,7 @@ class MklConvBwdFilterPrimitive : public MklPrimitive { src_md(nullptr), diff_filter_md(nullptr), diff_bias_md(nullptr), - diff_dst_md(nullptr), - bwd_filter_stream(nullptr) { + diff_dst_md(nullptr) { } }; @@ -345,7 +344,6 @@ class MklConvBwdFilterPrimitive : public MklPrimitive { } struct ConvBwdFilterContext context_; - engine cpu_engine_; }; template @@ -600,8 +598,10 @@ class MklConvCustomBackpropFilterOp auto bwd_filter_pd = conv_bwd_filter->GetPrimitiveDesc(); if (IS_SRC_REORDER_NEEDED(fwd_src_md, bwd_filter_pd, conv_bwd_filter)) { src.SetUsrMem(fwd_src_md, &src_tensor); - src.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - bwd_filter_pd->PRIMITIVE_DESC_SRC, cpu_engine_)); + src.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(bwd_filter_pd->PRIMITIVE_DESC_SRC, + cpu_engine_), + context); src_data = static_cast(src.GetOpMem().get_data_handle()); } else { src_data = static_cast(const_cast(src_tensor.flat().data())); @@ -612,8 +612,10 @@ class MklConvCustomBackpropFilterOp if (IS_DIFF_DST_REORDER_NEEDED(diff_dst_md, bwd_filter_pd, conv_bwd_filter)) { diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); - diff_dst.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - bwd_filter_pd->PRIMITIVE_DESC_DIFF_DST, cpu_engine_)); + diff_dst.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(bwd_filter_pd->PRIMITIVE_DESC_DIFF_DST, + cpu_engine_), + context); diff_dst_data = static_cast(diff_dst.GetOpMem().get_data_handle()); } else { diff_dst_data = @@ -646,26 +648,29 @@ class MklConvCustomBackpropFilterOp } // Execute convolution backward filter. 
+ std::shared_ptr bwd_cpu_stream; + bwd_cpu_stream.reset(CreateStream(context, conv_bwd_filter->GetEngine())); if (bias_enabled) { T* diff_bias_data = static_cast(const_cast(diff_bias_tensor->flat().data())); conv_bwd_filter->Execute(src_data, diff_filter_data, diff_bias_data, - diff_dst_data); + diff_dst_data, bwd_cpu_stream); } else { - conv_bwd_filter->Execute(src_data, diff_filter_data, diff_dst_data); + conv_bwd_filter->Execute(src_data, diff_filter_data, diff_dst_data, + bwd_cpu_stream); } // Reorder diff_filter back to Tensorflow layout if necessary. if (diff_filter_reorder_required) { - diff_filter.InsertReorderToUserMem(); + diff_filter.InsertReorderToUserMem(context); } // Delete primitive since it is not cached. if (do_not_cache) delete conv_bwd_filter; } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index bcd0446b748..3b81b283ae5 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -99,9 +99,7 @@ class MklConvBwdInputPrimitive : public MklPrimitive { public: explicit MklConvBwdInputPrimitive( const MklConvBwdInputParams& convBwdInputDims) - : cpu_engine_(ENGINE_CPU, 0) { - context_.bwd_input_stream.reset(new CPU_STREAM(cpu_engine_)); - + : MklPrimitive(engine(ENGINE_CPU, 0)) { // Create conv bwd input primitive if (context_.conv_bwd_input == nullptr) { Setup(convBwdInputDims); @@ -116,7 +114,8 @@ class MklConvBwdInputPrimitive : public MklPrimitive { // diff_dst_data: input data buffer for dst // Bias does not matter here void Execute(const T* diff_src_data, const T* filter_data, - const T* diff_dst_data) { + const T* diff_dst_data, + std::shared_ptr bwd_input_stream) { context_.diff_src_mem->set_data_handle( static_cast(const_cast(diff_src_data))); context_.filter_mem->set_data_handle( @@ -125,10 +124,10 @@ class MklConvBwdInputPrimitive : public MklPrimitive { static_cast(const_cast(diff_dst_data))); #ifdef ENABLE_MKLDNN_V1 - execute_primitives(context_.bwd_input_primitives, context_.bwd_input_stream, + execute_primitives(context_.bwd_input_primitives, bwd_input_stream, context_.bwd_input_primitives_args); #else - context_.bwd_input_stream->submit(context_.bwd_input_primitives); + bwd_input_stream->submit(context_.bwd_input_primitives); #endif // ENABLE_MKLDNN_V1 // Set data handle back to DummyData. @@ -180,7 +179,6 @@ class MklConvBwdInputPrimitive : public MklPrimitive { std::shared_ptr diff_dst_md; // MKL-DNN pipeline for executing primitives. 
- std::shared_ptr bwd_input_stream; std::vector bwd_input_primitives; #ifdef ENABLE_MKLDNN_V1 @@ -203,8 +201,7 @@ class MklConvBwdInputPrimitive : public MklPrimitive { fwd_pd(nullptr), diff_src_md(nullptr), filter_md(nullptr), - diff_dst_md(nullptr), - bwd_input_stream(nullptr) { + diff_dst_md(nullptr) { } }; @@ -290,7 +287,6 @@ class MklConvBwdInputPrimitive : public MklPrimitive { } struct ConvBwdInputContext context_; - engine cpu_engine_; }; template @@ -522,8 +518,10 @@ class MklConvCustomBackpropInputOp if (IS_FILTER_REORDER_NEEDED(fwd_filter_md, bwd_input_pd, conv_bwd_input)) { filter.SetUsrMem(fwd_filter_md, &filter_tensor); - filter.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - bwd_input_pd.get()->PRIMITIVE_DESC_WEIGHTS, cpu_engine_)); + filter.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(bwd_input_pd.get()->PRIMITIVE_DESC_WEIGHTS, + cpu_engine_), + context); filter_data = static_cast(filter.GetOpMem().get_data_handle()); } else { filter_data = @@ -535,23 +533,29 @@ class MklConvCustomBackpropInputOp if (IS_DIFF_DST_REORDER_NEEDED(diff_dst_md, bwd_input_pd, conv_bwd_input)) { diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); - diff_dst.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - bwd_input_pd.get()->PRIMITIVE_DESC_DIFF_DST, cpu_engine_)); + diff_dst.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(bwd_input_pd.get()->PRIMITIVE_DESC_DIFF_DST, + cpu_engine_), + context); diff_dst_data = static_cast(diff_dst.GetOpMem().get_data_handle()); } else { diff_dst_data = static_cast(const_cast(diff_dst_tensor.flat().data())); } + std::shared_ptr bwd_cpu_stream; + bwd_cpu_stream.reset(CreateStream(context, conv_bwd_input->GetEngine())); // Execute conv bwd input primitive. if (!eager_mode) { - conv_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data); + conv_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data, + bwd_cpu_stream); } else { // In eager mode we first write the output to temporary // buffer in MKL format. Then we convert the data to TF format. T* tmp_data = static_cast(const_cast(tmp_tensor.flat().data())); - conv_bwd_input->Execute(tmp_data, filter_data, diff_dst_data); + conv_bwd_input->Execute(tmp_data, filter_data, diff_dst_data, + bwd_cpu_stream); auto output_tf_md = diff_src_mkl_shape.GetTfLayout(); #ifndef ENABLE_MKLDNN_V1 auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine_); @@ -563,7 +567,7 @@ class MklConvCustomBackpropInputOp memory* dst_data_mem = new MEMORY_CONSTRUCTOR(OUTPUT_TF_MD, cpu_engine_, diff_src_data); CreateAndExecuteReorder(reorder_pd, *tmp_data_mem, *dst_data_mem, - cpu_engine_); + cpu_engine_, context); } // Delete primitive since it is not cached. @@ -571,9 +575,9 @@ class MklConvCustomBackpropInputOp delete conv_bwd_input; } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); From 4f6a3a4db05fb591a22c2107f30cba5c3e251412 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Wed, 13 May 2020 14:23:56 -0700 Subject: [PATCH 0520/1533] Make regularizers API more consistent. 
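Illustrative usage of the consolidated API (a sketch based on the docstrings
added in this change; the 1e-4 factor is arbitrary):

    from tensorflow.keras import layers, regularizers

    dense = layers.Dense(
        3,
        kernel_regularizer=regularizers.L2(1e-4),   # new class; `l2` stays as an alias
        bias_regularizer='l1',                      # string id resolves to L1(l1=0.01)
        activity_regularizer=regularizers.L1L2(l1=0.01, l2=0.01))

The string 'l1_l2' remains special-cased in deserialize() so it keeps the
historical l1=0.01, l2=0.01 defaults, and L1/L2 still accept the legacy `l=`
keyword for backwards compatibility.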
PiperOrigin-RevId: 311403808 Change-Id: I2a372937bdc316f742015be6080ad945bf970377 --- .../python/keras/layers/serialization_test.py | 8 +- tensorflow/python/keras/regularizers.py | 115 +++++++++++------- .../keras/tests/add_loss_correctness_test.py | 2 +- .../python/keras/utils/generic_utils_test.py | 6 +- .../tensorflow.keras.regularizers.-l1.pbtxt | 18 +++ .../tensorflow.keras.regularizers.-l2.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.l1.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.l2.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.pbtxt | 24 ++-- .../tensorflow.keras.regularizers.-l1.pbtxt | 18 +++ .../tensorflow.keras.regularizers.-l2.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.l1.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.l2.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.pbtxt | 24 ++-- 14 files changed, 258 insertions(+), 65 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py index b18a0fbd8cc..920881c6a3e 100644 --- a/tensorflow/python/keras/layers/serialization_test.py +++ b/tensorflow/python/keras/layers/serialization_test.py @@ -53,7 +53,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): new_layer = keras.layers.deserialize(config) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) if tf2.enabled(): self.assertEqual(new_layer.kernel_initializer.__class__, keras.initializers.OnesV2) @@ -88,7 +88,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): config, custom_objects={'SerializableInt': SerializableInt}) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) if tf2.enabled(): self.assertEqual(new_layer.kernel_initializer.__class__, keras.initializers.OnesV2) @@ -116,7 +116,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): self.assertEqual(new_layer.beta_initializer.__class__, keras.initializers.Zeros) self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) @parameterized.parameters( [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization]) @@ -135,7 +135,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): self.assertEqual(new_layer.beta_initializer.__class__, keras.initializers.Zeros) self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) @parameterized.parameters([rnn_v1.LSTM, rnn_v2.LSTM]) def test_serialize_deserialize_lstm(self, layer): diff --git a/tensorflow/python/keras/regularizers.py 
b/tensorflow/python/keras/regularizers.py index 973d916f7e0..b8bae4cc155 100644 --- a/tensorflow/python/keras/regularizers.py +++ b/tensorflow/python/keras/regularizers.py @@ -14,13 +14,14 @@ # ============================================================================== """Built-in regularizers. """ +# pylint: disable=invalid-name from __future__ import absolute_import from __future__ import division from __future__ import print_function import six -from tensorflow.python.keras import backend as K +from tensorflow.python.keras import backend from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object from tensorflow.python.keras.utils.generic_utils import serialize_keras_object from tensorflow.python.ops import math_ops @@ -60,8 +61,8 @@ class Regularizer(object): >>> layer = tf.keras.layers.Dense( ... 5, input_dim=5, ... kernel_initializer='ones', - ... kernel_regularizer=tf.keras.regularizers.l1(0.01), - ... activity_regularizer=tf.keras.regularizers.l2(0.01)) + ... kernel_regularizer=tf.keras.regularizers.L1(0.01), + ... activity_regularizer=tf.keras.regularizers.L2(0.01)) >>> tensor = tf.ones(shape=(5, 5)) * 2.0 >>> out = layer(tensor) @@ -73,9 +74,9 @@ class Regularizer(object): ## Available penalties ```python - tf.keras.regularizers.l1(0.3) # L1 Regularization Penalty - tf.keras.regularizers.l2(0.1) # L2 Regularization Penalty - tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01) # L1 + L2 penalties + tf.keras.regularizers.L1(0.3) # L1 Regularization Penalty + tf.keras.regularizers.L2(0.1) # L2 Regularization Penalty + tf.keras.regularizers.L1L2(l1=0.01, l2=0.01) # L1 + L2 penalties ``` ## Directly calling a regularizer @@ -84,7 +85,7 @@ class Regularizer(object): as if it is a one-argument function. E.g. - >>> regularizer = tf.keras.regularizers.l2(2.) + >>> regularizer = tf.keras.regularizers.L2(2.) >>> tensor = tf.ones(shape=(5, 5)) >>> regularizer(tensor) @@ -194,7 +195,7 @@ class Regularizer(object): @keras_export('keras.regularizers.L1L2') class L1L2(Regularizer): - r"""A regularizer that applies both L1 and L2 regularization penalties. + """A regularizer that applies both L1 and L2 regularization penalties. The L1 regularization penalty is computed as: `loss = l1 * reduce_sum(abs(x))` @@ -202,19 +203,23 @@ class L1L2(Regularizer): The L2 regularization penalty is computed as `loss = l2 * reduce_sum(square(x))` + L1L2 may be passed to a layer as a string identifier: + + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1_l2') + + In this case, the default values used are `l1=0.01` and `l2=0.01`. + Attributes: l1: Float; L1 regularization factor. l2: Float; L2 regularization factor. """ def __init__(self, l1=0., l2=0.): # pylint: disable=redefined-outer-name - self.l1 = K.cast_to_floatx(l1) - self.l2 = K.cast_to_floatx(l2) + self.l1 = backend.cast_to_floatx(l1) + self.l2 = backend.cast_to_floatx(l2) def __call__(self, x): - if not self.l1 and not self.l2: - return K.constant(0.) - regularization = 0. + regularization = backend.constant(0., dtype=x.dtype) if self.l1: regularization += self.l1 * math_ops.reduce_sum(math_ops.abs(x)) if self.l2: @@ -225,39 +230,64 @@ class L1L2(Regularizer): return {'l1': float(self.l1), 'l2': float(self.l2)} -# Aliases. - - -@keras_export('keras.regularizers.l1') -def l1(l=0.01): - r"""Create a regularizer that applies an L1 regularization penalty. +@keras_export('keras.regularizers.L1', 'keras.regularizers.l1') +class L1(Regularizer): + """A regularizer that applies a L1 regularization penalty. 
The L1 regularization penalty is computed as: - `loss = l * reduce_sum(abs(x))` + `loss = l1 * reduce_sum(abs(x))` - Arguments: - l: Float; L1 regularization factor. + L1 may be passed to a layer as a string identifier: - Returns: - An L1 Regularizer with the given regularization factor. + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1') + + In this case, the default value used is `l1=0.01`. + + Attributes: + l1: Float; L1 regularization factor. """ - return L1L2(l1=l) + + def __init__(self, l1=0.01, **kwargs): # pylint: disable=redefined-outer-name + l1 = kwargs.pop('l', l1) # Backwards compatibility + if kwargs: + raise TypeError('Argument(s) not recognized: %s' % (kwargs,)) + self.l1 = backend.cast_to_floatx(l1) + + def __call__(self, x): + return self.l1 * math_ops.reduce_sum(math_ops.abs(x)) + + def get_config(self): + return {'l1': float(self.l1)} -@keras_export('keras.regularizers.l2') -def l2(l=0.01): - r"""Create a regularizer that applies an L2 regularization penalty. +@keras_export('keras.regularizers.L2', 'keras.regularizers.l2') +class L2(Regularizer): + """A regularizer that applies a L2 regularization penalty. The L2 regularization penalty is computed as: - `loss = l * reduce_sum(square(x))` + `loss = l2 * reduce_sum(square(x))` - Arguments: - l: Float; L2 regularization factor. + L2 may be passed to a layer as a string identifier: - Returns: - An L2 Regularizer with the given regularization factor. + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l2') + + In this case, the default value used is `l2=0.01`. + + Attributes: + l2: Float; L2 regularization factor. """ - return L1L2(l2=l) + + def __init__(self, l2=0.01, **kwargs): # pylint: disable=redefined-outer-name + l2 = kwargs.pop('l', l2) # Backwards compatibility + if kwargs: + raise TypeError('Argument(s) not recognized: %s' % (kwargs,)) + self.l2 = backend.cast_to_floatx(l2) + + def __call__(self, x): + return self.l2 * math_ops.reduce_sum(math_ops.square(x)) + + def get_config(self): + return {'l2': float(self.l2)} @keras_export('keras.regularizers.l1_l2') @@ -280,6 +310,11 @@ def l1_l2(l1=0.01, l2=0.01): # pylint: disable=redefined-outer-name return L1L2(l1=l1, l2=l2) +# Deserialization aliases. +l1 = L1 +l2 = L2 + + @keras_export('keras.regularizers.serialize') def serialize(regularizer): return serialize_keras_object(regularizer) @@ -287,6 +322,10 @@ def serialize(regularizer): @keras_export('keras.regularizers.deserialize') def deserialize(config, custom_objects=None): + if config == 'l1_l2': + # Special case necessary since the defaults used for "l1_l2" (string) + # differ from those of the L1L2 class. + return L1L2(l1=0.01, l2=0.01) return deserialize_keras_object( config, module_objects=globals(), @@ -296,18 +335,12 @@ def deserialize(config, custom_objects=None): @keras_export('keras.regularizers.get') def get(identifier): + """Retrieve a regularizer instance from a config or identifier.""" if identifier is None: return None if isinstance(identifier, dict): return deserialize(identifier) elif isinstance(identifier, six.string_types): - identifier = str(identifier) - # We have to special-case functions that return classes. - # TODO(omalleyt): Turn these into classes or class aliases. - special_cases = ['l1', 'l2', 'l1_l2'] - if identifier in special_cases: - # Treat like a class. 
- return deserialize({'class_name': identifier, 'config': {}}) return deserialize(str(identifier)) elif callable(identifier): return identifier diff --git a/tensorflow/python/keras/tests/add_loss_correctness_test.py b/tensorflow/python/keras/tests/add_loss_correctness_test.py index 323a2626c15..a19eec75ffb 100644 --- a/tensorflow/python/keras/tests/add_loss_correctness_test.py +++ b/tensorflow/python/keras/tests/add_loss_correctness_test.py @@ -288,7 +288,7 @@ class TestAddLossCorrectness(keras_parameterized.TestCase): model_layers, input_shape=(10,)) x = np.ones((10, 10), 'float32') - y = np.ones((10, 1), 'float32') + y = np.zeros((10, 1), 'float32') optimizer = RMSPropOptimizer(learning_rate=0.001) model.compile( diff --git a/tensorflow/python/keras/utils/generic_utils_test.py b/tensorflow/python/keras/utils/generic_utils_test.py index 334758871fa..ddaa60c3c24 100644 --- a/tensorflow/python/keras/utils/generic_utils_test.py +++ b/tensorflow/python/keras/utils/generic_utils_test.py @@ -201,7 +201,7 @@ class SerializeKerasObjectTest(test.TestCase): config, custom_objects={'SerializableInt': SerializableInt}) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) self.assertEqual(new_layer.units.__class__, SerializableInt) self.assertEqual(new_layer.units, 3) @@ -253,7 +253,7 @@ class SerializeKerasObjectTest(test.TestCase): self.assertEqual(new_layer.name, 'SerializableNestedInt') self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) self.assertEqual(new_layer.units.__class__, SerializableNestedInt) self.assertEqual(new_layer.units, 3) self.assertEqual(new_layer.units.int_obj.__class__, SerializableInt) @@ -293,7 +293,7 @@ class SerializeKerasObjectTest(test.TestCase): 'SerializableNestedInt': SerializableNestedInt }) self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L1L2) + self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L2) self.assertIsInstance(new_layer.units, SerializableNestedInt) self.assertEqual(new_layer.units, 3) self.assertIs(new_layer.units.fn, serializable_fn) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt new file mode 100644 index 00000000000..5cb133ca85d --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt new file mode 100644 index 00000000000..c5b706d1d2f --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L2" +tf_class { + is_instance: "" + is_instance: "" 
+ is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt new file mode 100644 index 00000000000..eb769a0dc44 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt new file mode 100644 index 00000000000..fda5c76ecd2 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt index bb10d41d704..96a4b193b1b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt @@ -1,13 +1,29 @@ path: "tensorflow.keras.regularizers" tf_module { + member { + name: "L1" + mtype: "" + } member { name: "L1L2" mtype: "" } + member { + name: "L2" + mtype: "" + } member { name: "Regularizer" mtype: "" } + member { + name: "l1" + mtype: "" + } + member { + name: "l2" + mtype: "" + } member_method { name: "deserialize" argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -16,18 +32,10 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "l1" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "l1_l2" argspec: "args=[\'l1\', \'l2\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.01\'], " } - member_method { - name: "l2" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "serialize" argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt new file mode 100644 index 
00000000000..5cb133ca85d --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt new file mode 100644 index 00000000000..c5b706d1d2f --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt new file mode 100644 index 00000000000..eb769a0dc44 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt new file mode 100644 index 00000000000..fda5c76ecd2 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt index bb10d41d704..96a4b193b1b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt @@ -1,13 +1,29 @@ path: "tensorflow.keras.regularizers" tf_module { + member { + name: "L1" + mtype: "" + } member { name: "L1L2" mtype: "" } + member { + name: "L2" + mtype: "" + } member { 
name: "Regularizer" mtype: "" } + member { + name: "l1" + mtype: "" + } + member { + name: "l2" + mtype: "" + } member_method { name: "deserialize" argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -16,18 +32,10 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "l1" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "l1_l2" argspec: "args=[\'l1\', \'l2\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.01\'], " } - member_method { - name: "l2" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "serialize" argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" From e0153c3c621b4b2ab073144ab10987ca0f7d5c1d Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 13 May 2020 14:28:29 -0700 Subject: [PATCH 0521/1533] threadpool support for relu, eltwise and softmax. --- tensorflow/core/kernels/mkl_aggregate_ops.cc | 12 +-- tensorflow/core/kernels/mkl_relu_op.cc | 93 +++++++++----------- tensorflow/core/kernels/mkl_softmax_op.cc | 30 +++---- 3 files changed, 60 insertions(+), 75 deletions(-) diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc index 21be643eaa5..866e9160d87 100644 --- a/tensorflow/core/kernels/mkl_aggregate_ops.cc +++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc @@ -244,7 +244,7 @@ class MklAddNOp : public OpKernel { // Create Sum op, and submit net for execution. std::vector net; - auto sum_stream = CPU_STREAM(cpu_engine); + stream* fwd_cpu_stream = CreateStream(ctx, cpu_engine); #ifdef ENABLE_MKLDNN_V1 mkldnn::sum sum_op(sum_pd); std::unordered_map net_args = { @@ -253,15 +253,15 @@ class MklAddNOp : public OpKernel { for (int i = 0; i < num_inputs; ++i) { net_args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, inputs[i]}); } - sum_op.execute(sum_stream, net_args); + sum_op.execute(*fwd_cpu_stream, net_args); #else net.push_back(sum(sum_pd, inputs, dst.GetOpMem())); - sum_stream.submit(net).wait(); + fwd_cpu_stream->submit(net).wait(); #endif } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( ctx, errors::Aborted("Operation received an exception:", error_msg)); } diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc index ffbc1e28355..80662d191ed 100644 --- a/tensorflow/core/kernels/mkl_relu_op.cc +++ b/tensorflow/core/kernels/mkl_relu_op.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -27,6 +26,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" using mkldnn::algorithm; using mkldnn::eltwise_forward; @@ -61,13 +61,11 @@ template class MklEltwiseFwdPrimitive : public MklPrimitive { public: explicit MklEltwiseFwdPrimitive(const MklEltwiseFwdParams& fwdParams) - : cpu_engine_(ENGINE_CPU, 0) { + : MklPrimitive(engine(ENGINE_CPU, 0)) { #ifndef ENABLE_MKLDNN_V1 context_.src_fmt = static_cast(fwdParams.src_md.data.format); #endif - context_.fwd_stream.reset(new CPU_STREAM(cpu_engine_)); - // create eltwise primitive if (context_.eltwise_fwd == nullptr) { Setup(fwdParams); @@ -79,7 +77,8 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { // Eltwise forward execute // src_data: input data buffer of src // dst_data: output data buffer of dst - void Execute(const T* src_data, T* dst_data) { + void Execute(const T* src_data, T* dst_data, + std::shared_ptr fwd_stream) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); @@ -87,12 +86,10 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { #ifdef ENABLE_MKLDNN_V1 DCHECK_EQ(context_.fwd_primitives.size(), context_.fwd_primitives_args.size()); - for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { - context_.fwd_primitives.at(i).execute(*context_.fwd_stream, - context_.fwd_primitives_args.at(i)); - } + execute_primitives(context_.fwd_primitives, fwd_stream, + context_.fwd_primitives_args); #else - context_.fwd_stream->submit(context_.fwd_primitives); + fwd_stream->submit(context_.fwd_primitives); #endif // After execution, set data handle back. @@ -134,7 +131,6 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { // Eltwise primitive std::shared_ptr eltwise_fwd; - std::shared_ptr fwd_stream; std::vector fwd_primitives; #ifdef ENABLE_MKLDNN_V1 @@ -153,8 +149,7 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { src_md(nullptr), dst_md(nullptr), src_mpd(nullptr), - eltwise_fwd(nullptr), - fwd_stream(nullptr) { + eltwise_fwd(nullptr) { } }; @@ -169,14 +164,12 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { #else new MEMORY_PD_CONSTRUCTOR_2_PARAMS(*context_.src_md, cpu_engine_)); #endif - // Create an eltwise forward descriptor and primitive descriptor context_.fwd_desc.reset(new eltwise_forward::desc( prop_kind::forward, fwdParams.alg_kind, *context_.src_md, fwdParams.alpha, fwdParams.beta)); context_.fwd_pd.reset(new EltwiseFwdPd(*context_.fwd_desc, cpu_engine_)); auto fwd_pd = context_.fwd_pd.get(); - #ifdef ENABLE_MKLDNN_V1 // Create memory primitive based on dummy data context_.src_mem.reset(new MEMORY_CONSTRUCTOR(fwd_pd->PRIMITIVE_DESC_SRC, @@ -195,12 +188,10 @@ class MklEltwiseFwdPrimitive : public MklPrimitive { context_.eltwise_fwd.reset(new eltwise_forward( *context_.fwd_pd, *context_.src_mem, *context_.dst_mem)); #endif - context_.fwd_primitives.push_back(*context_.eltwise_fwd); } struct EltwiseFwdContext context_; - engine cpu_engine_; }; template @@ -281,14 +272,13 @@ template class MklEltwiseBwdPrimitive : public MklPrimitive { public: explicit MklEltwiseBwdPrimitive(const MklEltwiseBwdParams& bwdParams) - : cpu_engine_(ENGINE_CPU, 0) { + : MklPrimitive(engine(ENGINE_CPU, 0)) { #ifndef ENABLE_MKLDNN_V1 context_.src_fmt = static_cast(bwdParams.common_md.data.format); context_.diff_dst_fmt = static_cast(bwdParams.common_md.data.format); #endif - context_.bwd_stream.reset(new 
stream(CPU_STREAM(cpu_engine_))); // create eltwise primitive if (context_.eltwise_bwd == nullptr) { Setup(bwdParams); @@ -301,7 +291,8 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { // src_data: input data buffer of src // diff_dst_data: input data buffer of diff_dst // diff_src_data: output data buffer of diff_src - void Execute(const T* src_data, const T* diff_dst_data, T* diff_src_data) { + void Execute(const T* src_data, const T* diff_dst_data, T* diff_src_data, + std::shared_ptr bwd_stream) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.diff_dst_mem->set_data_handle( @@ -311,12 +302,10 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { #ifdef ENABLE_MKLDNN_V1 DCHECK_EQ(context_.bwd_primitives.size(), context_.bwd_primitives_args.size()); - for (size_t i = 0; i < context_.bwd_primitives.size(); ++i) { - context_.bwd_primitives.at(i).execute(*context_.bwd_stream, - context_.bwd_primitives_args.at(i)); - } + execute_primitives(context_.bwd_primitives, bwd_stream, + context_.bwd_primitives_args); #else - context_.bwd_stream->submit(context_.bwd_primitives); + bwd_stream->submit(context_.bwd_primitives); #endif // ENABLE_MKLDNN_V1 // after execution, set data handle back @@ -367,7 +356,6 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { // Eltwise primitive. std::shared_ptr eltwise_bwd; - std::shared_ptr bwd_stream; std::vector bwd_primitives; #ifdef ENABLE_MKLDNN_V1 @@ -391,8 +379,7 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { fwd_desc(nullptr), fwd_pd(nullptr), bwd_pd(nullptr), - eltwise_bwd(nullptr), - bwd_stream(nullptr) { + eltwise_bwd(nullptr) { } }; @@ -448,7 +435,6 @@ class MklEltwiseBwdPrimitive : public MklPrimitive { } struct EltwiseBwdContext context_; - engine cpu_engine_; }; template @@ -525,12 +511,10 @@ class MklReluOpBase : public OpKernel { const Tensor& src_tensor = MklGetInput(context, src_index); MklDnnShape dnn_shape_src; GetMklShape(context, src_index, &dnn_shape_src); - if (src_tensor.dims() == 0) { Compute_Scalar(context); return; } - MklDnnShape dnn_shape_dst; TensorShape tf_shape_dst; Tensor* dst_tensor = nullptr; @@ -542,7 +526,6 @@ class MklReluOpBase : public OpKernel { dnn_shape_dst); return; } - // Set DNN primitive - src MklDnnData src(&cpu_engine); memory::dims src_dims; @@ -556,26 +539,25 @@ class MklReluOpBase : public OpKernel { // Create blocked memory descriptor src_md = MklDnnData::CreateBlockedMemDesc(src_dims, src_strides); } - // Try to get an eltwise forward primitive from caching pool MklEltwiseFwdParams fwdParams(src_dims, src_md, alg_kind, alpha_, beta_); - MklEltwiseFwdPrimitive* eltwise_fwd = MklEltwiseFwdPrimitiveFactory::Get(fwdParams); - auto eltwise_fwd_pd = eltwise_fwd->GetEltwiseFwdPd(); - + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, eltwise_fwd->GetEngine())); // Check if src needs to be reordered const T* src_data = src_tensor.flat().data(); if (IS_SRC_REORDER_NEEDED(src_md, eltwise_fwd_pd, eltwise_fwd)) { src.SetUsrMem(src_md, &src_tensor); - src.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - eltwise_fwd_pd->PRIMITIVE_DESC_SRC, cpu_engine)); + src.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA(eltwise_fwd_pd->PRIMITIVE_DESC_SRC, + cpu_engine), + context); src_data = const_cast( reinterpret_cast(src.GetOpMem().get_data_handle())); } - // Allocate dst tensor, always set it as MKL-DNN layout if (dnn_shape_src.IsMklTensor()) { dnn_shape_dst.SetMklTensor(true); @@ -590,7 +572,6 @@ class MklReluOpBase : public OpKernel { 
dnn_shape_dst.SetMklTensor(false); tf_shape_dst = src_tensor.shape(); } - OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( {static_cast(src_index)}, static_cast(dst_index), @@ -600,11 +581,11 @@ class MklReluOpBase : public OpKernel { T* dst_data = dst_tensor->flat().data(); // execute eltwise - eltwise_fwd->Execute(src_data, dst_data); + eltwise_fwd->Execute(src_data, dst_data, fwd_cpu_stream); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -727,13 +708,16 @@ class MklReluGradOpBase : public OpKernel { MklEltwiseBwdPrimitiveFactory::Get(bwdParams); auto eltwise_bwd_pd = eltwise_bwd->GetEltwiseBwdPd(); - + std::shared_ptr bwd_cpu_stream; + bwd_cpu_stream.reset(CreateStream(context, eltwise_bwd->GetEngine())); // check whether need reorder for src / diff_dst const T* src_data = src_tensor.flat().data(); if (IS_SRC_REORDER_NEEDED(src_md, eltwise_bwd_pd, eltwise_bwd)) { src.SetUsrMem(src_md, &src_tensor); - src.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - eltwise_bwd_pd.get()->PRIMITIVE_DESC_DIFF_SRC, cpu_engine)); + src.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA( + eltwise_bwd_pd.get()->PRIMITIVE_DESC_DIFF_SRC, cpu_engine), + context); src_data = const_cast( reinterpret_cast(src.GetOpMem().get_data_handle())); } @@ -742,8 +726,10 @@ class MklReluGradOpBase : public OpKernel { if (IS_DIFF_DST_REORDER_NEEDED(diff_dst_md, eltwise_bwd_pd, eltwise_bwd)) { diff_dst.SetUsrMem(diff_dst_md, &diff_dst_tensor); - diff_dst.CheckReorderToOpMem(MEMORY_PD_WITHOUT_DATA( - eltwise_bwd_pd.get()->PRIMITIVE_DESC_DIFF_SRC, cpu_engine)); + diff_dst.CheckReorderToOpMem( + MEMORY_PD_WITHOUT_DATA( + eltwise_bwd_pd.get()->PRIMITIVE_DESC_DIFF_SRC, cpu_engine), + context); diff_dst_data = const_cast( reinterpret_cast(diff_dst.GetOpMem().get_data_handle())); } @@ -779,11 +765,12 @@ class MklReluGradOpBase : public OpKernel { T* diff_src_data = diff_src_tensor->flat().data(); // execute eltwise bwd - eltwise_bwd->Execute(src_data, diff_dst_data, diff_src_data); + eltwise_bwd->Execute(src_data, diff_dst_data, diff_src_data, + bwd_cpu_stream); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc index b07cd58cfd2..93bc5897e63 100644 --- a/tensorflow/core/kernels/mkl_softmax_op.cc +++ b/tensorflow/core/kernels/mkl_softmax_op.cc @@ -18,7 +18,6 @@ limitations under the License. #ifdef INTEL_MKL #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -27,6 +26,7 @@ limitations under the License. 
#include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" using mkldnn::prop_kind; using mkldnn::softmax_forward; @@ -48,8 +48,7 @@ template class MklSoftmaxPrimitive : public MklPrimitive { public: explicit MklSoftmaxPrimitive(const MklSoftmaxParams& fwdParams) - : cpu_engine_(ENGINE_CPU, 0) { - context_.fwd_stream.reset(new CPU_STREAM(cpu_engine_)); + : MklPrimitive(engine(ENGINE_CPU, 0)) { Setup(fwdParams); } @@ -58,16 +57,18 @@ class MklSoftmaxPrimitive : public MklPrimitive { // Softmax forward execute // src_data: input data buffer of src // dst_data: output data buffer of dst - void Execute(const T* src_data, T* dst_data) { + void Execute(const T* src_data, T* dst_data, + std::shared_ptr fwd_cpu_stream) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); #ifdef ENABLE_MKLDNN_V1 - execute_primitives(context_.fwd_primitives, context_.fwd_stream, + DCHECK_EQ(context_.fwd_primitives.size(), context_.fwd_net_args.size()); + execute_primitives(context_.fwd_primitives, fwd_cpu_stream, context_.fwd_net_args); #else - context_.fwd_stream->submit(context_.fwd_primitives); + fwd_cpu_stream->submit(context_.fwd_primitives); #endif // After execution, set data handle back. @@ -95,7 +96,6 @@ class MklSoftmaxPrimitive : public MklPrimitive { std::shared_ptr fwd_pd; std::shared_ptr softmax_fwd; - std::shared_ptr fwd_stream; std::vector fwd_primitives; std::vector fwd_net_args; @@ -105,8 +105,7 @@ class MklSoftmaxPrimitive : public MklPrimitive { fwd_desc(nullptr), src_md(nullptr), fwd_pd(nullptr), - softmax_fwd(nullptr), - fwd_stream(nullptr) {} + softmax_fwd(nullptr) {} }; // Softmax forward primitive setup @@ -143,7 +142,6 @@ class MklSoftmaxPrimitive : public MklPrimitive { } struct SoftmaxFwdContext context_; - engine cpu_engine_; }; template @@ -303,13 +301,13 @@ class MklSoftmaxOp : public OpKernel { const T* src_data = src_tensor.flat().data(); T* dst_data = reinterpret_cast(output_tensor->flat().data()); - - // Execute softmax primitive. - softmax_fwd->Execute(src_data, dst_data); + std::shared_ptr fwd_cpu_stream; + fwd_cpu_stream.reset(CreateStream(context, softmax_fwd->GetEngine())); + softmax_fwd->Execute(src_data, dst_data, fwd_cpu_stream); } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); From 8588e0aab8c1ef6a4214bcc2f7d0bb61578a88b3 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Wed, 13 May 2020 14:27:47 -0700 Subject: [PATCH 0522/1533] Support running a remote function with packed input handles. - Support copying a packed TensorHandle from a client to a remote worker. 
PiperOrigin-RevId: 311404609 Change-Id: Iadf2c7793dc3631f7be05de611d059733bbfdd63 --- tensorflow/c/eager/c_api_remote_test.cc | 14 ++- .../common_runtime/eager/tensor_handle.cc | 23 +++- .../core/common_runtime/eager/tensor_handle.h | 5 + .../eager/eager_service_impl.cc | 48 ++++++++ .../eager/eager_service_impl.h | 2 + .../eager/eager_service_impl_test.cc | 102 +++++++++++++++++ .../eager/remote_copy_node.cc | 104 +++++++++++++++++- .../eager/remote_copy_node.h | 3 + tensorflow/core/protobuf/eager_service.proto | 22 ++++ 9 files changed, 316 insertions(+), 7 deletions(-) diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 12c63675c87..9dc18c7a6f1 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -434,7 +434,7 @@ string AddVariablesFunction() { return def.SerializeAsString(); } -TEST(CAPI, TestFunctionWithPackedInput) { +void TestFunctionWithPackedInput(const bool remote) { tensorflow::ServerDef server_def = GetServerDef(3); // This server def has the task index set to 0. @@ -502,6 +502,10 @@ TEST(CAPI, TestFunctionWithPackedInput) { ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); TFE_OpAddInput(func, packed_handle, status); ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + if (remote) { + TFE_OpSetDevice(func, task1_name, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } TFE_TensorHandle* retvals[1] = {nullptr}; int num_retvals = 1; @@ -537,6 +541,14 @@ TEST(CAPI, TestFunctionWithPackedInput) { worker_server2.release(); } +TEST(CAPI, TestLocalFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/false); +} + +TEST(CAPI, TestRemoteFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/true); +} + void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index dfe3e4a1426..49fa69e2185 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -318,17 +318,14 @@ TensorHandle::TensorHandle(Device* d, Device* op_device, } Status TensorHandle::CreatePackedHandle(std::vector&& handles, + const tensorflow::DataType dtype, + const tensorflow::TensorShape& shape, EagerContext* ctx, TensorHandle** packed_handle) { if (handles.empty()) { return errors::InvalidArgument("Handles should not be empty."); } - // Get the dtype and shape from the fisrt handle since all handles have the - // same dtype and shape. - tensorflow::DataType dtype = handles.at(0)->dtype; - tensorflow::TensorShape shape; - TF_RETURN_IF_ERROR(handles.at(0)->Shape(&shape)); ResourceHandleInfo resource_handle_info; if (dtype == DT_RESOURCE) { TF_RETURN_IF_ERROR( @@ -360,6 +357,22 @@ Status TensorHandle::CreatePackedHandle(std::vector&& handles, return Status::OK(); } +Status TensorHandle::CreatePackedHandle(std::vector&& handles, + EagerContext* ctx, + TensorHandle** packed_handle) { + if (handles.empty()) { + return errors::InvalidArgument("Handles should not be empty."); + } + + // Get the dtype and shape from the fisrt handle since all handles have the + // same dtype and shape. 
+ tensorflow::DataType dtype = handles.at(0)->dtype; + tensorflow::TensorShape shape; + TF_RETURN_IF_ERROR(handles.at(0)->Shape(&shape)); + return CreatePackedHandle(std::move(handles), dtype, shape, ctx, + packed_handle); +} + TensorHandle::TensorHandle(std::vector&& handles, Device* device, const tensorflow::DataType dtype, const tensorflow::TensorShape& shape, diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 25d7fea3200..6f9ee565c73 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -91,6 +91,11 @@ class TensorHandle : public AbstractTensorHandleInterface, // Create a handle which packs the given handles of the same dtype and shape. // If handles are on different devices, assign the packed handle to a // CompositeDevice. + static Status CreatePackedHandle(std::vector&& handles, + const tensorflow::DataType dtype, + const tensorflow::TensorShape& shape, + EagerContext* ctx, + TensorHandle** packed_handle); static Status CreatePackedHandle(std::vector&& handles, EagerContext* ctx, TensorHandle** packed_handle); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index 95131150d3d..6dc03cbc527 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -524,6 +524,8 @@ Status EagerServiceImpl::Enqueue(const EnqueueRequest* request, s = context->Context()->Executor().AddOrExecute(std::move(node)); } else if (item.has_send_tensor()) { s = SendTensor(item.send_tensor(), context->Context()); + } else if (item.has_send_packed_handle()) { + s = SendPackedHandle(item.send_packed_handle(), context->Context()); } else if (item.has_register_function()) { s = RegisterFunction(item.register_function(), context->Context()); } else if (item.has_cleanup_function()) { @@ -643,6 +645,52 @@ Status EagerServiceImpl::SendTensor(const SendTensorOp& send_tensor, return Status::OK(); } +Status EagerServiceImpl::SendPackedHandle( + const SendPackedHandleOp& send_packed_handle, EagerContext* eager_context) { + if (send_packed_handle.handles().empty()) { + return errors::InvalidArgument("Handles should not be empty."); + } + + std::vector handles; + handles.resize(send_packed_handle.handles_size()); + for (int i = 0; i < send_packed_handle.handles_size(); ++i) { + const auto& item = send_packed_handle.handles(i); + if (item.has_local_handle()) { + Tensor tensor; + if (!ParseTensorProtoToTensor(item.local_handle().tensor(), &tensor)) { + return errors::InvalidArgument( + "Invalid TensorProto: ", + item.local_handle().tensor().DebugString()); + } + Device* op_device = nullptr; + TF_RETURN_IF_ERROR(eager_context->FindDeviceFromName( + item.local_handle().device().c_str(), &op_device)); + handles[i] = TensorHandle::CreateLocalHandle( + std::move(tensor), /*d=*/nullptr, op_device, eager_context); + } else { + TF_RETURN_IF_ERROR( + eager_context->RemoteMgr()->DeserializeRemoteTensorHandle( + item.remote_handle(), &handles[i])); + } + } + + tensorflow::TensorHandle* packed_handle = nullptr; + std::vector handles_to_pack = handles; + // Create a unshaped packed TensorHandle. 
+ TF_RETURN_IF_ERROR(TensorHandle::CreatePackedHandle( + std::move(handles_to_pack), handles.at(0)->dtype, TensorShape(), + eager_context, &packed_handle)); + + for (auto* h : handles) { + // Unref handle since it has a ref in the packed handle now. + h->Unref(); + } + + eager_context->RemoteMgr()->AddOperationOutputs({packed_handle}, + send_packed_handle.op_id()); + return Status::OK(); +} + tensorflow::Status EagerServiceImpl::GetServerContext( uint64 context_id, ServerContext** server_context) { tf_shared_lock l(contexts_mu_); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h index 06d4c36b61c..1e4d36ccf9f 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h @@ -212,6 +212,8 @@ class EagerServiceImpl { QueueResponse* queue_response); Status SendTensor(const SendTensorOp& send_tensor, EagerContext* eager_context); + Status SendPackedHandle(const SendPackedHandleOp& send_packed_handle, + EagerContext* eager_context); Status RegisterFunction(const RegisterFunctionOp& register_function, EagerContext* eager_context); Status CleanupFunction(const CleanupFunctionOp& cleanup_function); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 9930bb86e6b..23bf324b80f 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -881,6 +881,108 @@ TEST_F(EagerServiceImplTest, SendTensorTest) { &close_context_response)); } +// Test serializes and sends a pack TensorHandle. +TEST_F(EagerServiceImplTest, SendPackedHandleTest) { + TestEagerServiceImpl eager_service_impl(&worker_env_); + + const string device0 = "/job:localhost/replica:0/task:0/device:CPU:0"; + const string device1 = "/job:localhost/replica:0/task:1/device:CPU:0"; + const string device2 = "/job:localhost/replica:0/task:2/device:CPU:0"; + + uint64 context_id = random::New64(); + CreateContextRequest request; + auto* server_def = request.mutable_server_def(); + server_def->set_job_name("localhost"); + server_def->set_task_index(0); + request.add_cluster_device_attributes()->set_name(device0); + request.add_cluster_device_attributes()->set_name(device1); + request.add_cluster_device_attributes()->set_name(device2); + request.set_context_id(context_id); + CreateContextResponse response; + + TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response)); + + EnqueueRequest remote_enqueue_request; + remote_enqueue_request.set_context_id(context_id); + EnqueueResponse remote_enqueue_response; + + // Copy a tensor to device0 + auto* send_tensor = remote_enqueue_request.add_queue()->mutable_send_tensor(); + send_tensor->set_op_id(1); + SetTensorProto(send_tensor->add_tensors()); + + // Copy a packed handle to device0 + auto* send_packed_handle = + remote_enqueue_request.add_queue()->mutable_send_packed_handle(); + send_packed_handle->set_op_id(3); + RemoteTensorHandle* remote_handle = + send_packed_handle->add_handles()->mutable_remote_handle(); + remote_handle->set_op_id(send_tensor->op_id()); + remote_handle->set_output_num(0); + remote_handle->set_op_device(device0); + remote_handle->set_device(device0); + + SendPackedHandleOp::LocalTensorHandle* lcoal_handle = + send_packed_handle->add_handles()->mutable_local_handle(); + 
SetTensorProto(lcoal_handle->mutable_tensor()); + lcoal_handle->set_device(device1); + + remote_handle = send_packed_handle->add_handles()->mutable_remote_handle(); + remote_handle->set_op_id(2); + remote_handle->set_output_num(5); + remote_handle->set_op_device(device2); + remote_handle->set_device(device2); + + TF_ASSERT_OK(eager_service_impl.Enqueue(&remote_enqueue_request, + &remote_enqueue_response)); + + tensorflow::TensorHandle* packed_handle; + TF_ASSERT_OK(eager_service_impl.GetTensorHandle( + context_id, RemoteTensorHandleInternal(3, 0), &packed_handle)); + + EXPECT_EQ(packed_handle->Type(), TensorHandle::PACKED); + EXPECT_EQ(packed_handle->NumPackedHandles(), 3); + + TensorHandle* handle0 = nullptr; + TF_ASSERT_OK(packed_handle->ExtractPackedHandle(0, &handle0)); + EXPECT_EQ(handle0->Type(), TensorHandle::LOCAL); + EXPECT_EQ(handle0->op_device()->name(), device0); + const Tensor* t0 = nullptr; + TF_ASSERT_OK(handle0->Tensor(&t0)); + auto actual = t0->flat(); + EXPECT_EQ(4, actual.size()); + EXPECT_EQ(1.0, actual(0)); + EXPECT_EQ(2.0, actual(1)); + EXPECT_EQ(3.0, actual(2)); + EXPECT_EQ(4.0, actual(3)); + + TensorHandle* handle1 = nullptr; + TF_ASSERT_OK(packed_handle->ExtractPackedHandle(1, &handle1)); + EXPECT_EQ(handle1->Type(), TensorHandle::LOCAL); + EXPECT_EQ(handle1->op_device()->name(), device1); + const Tensor* t1 = nullptr; + TF_ASSERT_OK(handle0->Tensor(&t1)); + EXPECT_EQ(t1, t0); + + TensorHandle* handle2 = nullptr; + TF_ASSERT_OK(packed_handle->ExtractPackedHandle(2, &handle2)); + EXPECT_EQ(handle2->Type(), TensorHandle::REMOTE); + EXPECT_EQ(handle2->op_device()->name(), device2); + int64 op_id; + int32 output_num; + TF_ASSERT_OK(handle2->RemoteAddressUntilReady( + absl::get(handle2->device()), &op_id, &output_num)); + EXPECT_EQ(op_id, 2); + EXPECT_EQ(output_num, 5); + + CloseContextRequest close_context_request; + close_context_request.set_context_id(context_id); + close_context_request.set_context_view_id(0); + CloseContextResponse close_context_response; + TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request, + &close_context_response)); +} + // Test requests sent to the eager service on master. TEST_F(EagerServiceImplTest, RequestsToMasterTest) { tensorflow::Rendezvous* rendezvous = diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index b281bcef2b3..5d0793b258c 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -25,6 +25,8 @@ limitations under the License. #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/protobuf.h" namespace tensorflow { namespace eager { @@ -290,6 +292,102 @@ void RemoteCopyNode::StartRecv(StatusCallback done) { } } +Status SerializePackedHandle(const uint64 op_id, TensorHandle* packed_handle, + const Device* target_device, EagerContext* ctx, + SendPackedHandleOp* op) { + op->set_op_id(op_id); + for (int i = 0; i < packed_handle->NumPackedHandles(); ++i) { + TensorHandle* h = nullptr; + TF_RETURN_IF_ERROR(packed_handle->ExtractPackedHandle(i, &h)); + if (h->Type() == TensorHandle::LOCAL) { + // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence + // copy it to the CPU before copying it out. 
+ Tensor tensor; + TF_RETURN_IF_ERROR(h->CopyToDevice(*ctx, ctx->HostCPU(), &tensor)); + auto* local_handle = op->add_handles()->mutable_local_handle(); + local_handle->set_device(h->op_device() ? h->op_device()->name() + : ctx->HostCPU()->name()); + tensor.AsProtoTensorContent(local_handle->mutable_tensor()); + } else if (h->Type() == TensorHandle::REMOTE) { + // Only serialize the resource dtype and shape of the first handle, since + // all handles are of the same resource dtype and shape. + Device* src_device = absl::get(h->device()); + const bool serialize_resource_dtype_and_shape = + (i == 0) && (h->dtype == DT_RESOURCE) && + (ctx->OnSameTask(src_device, target_device)); + TF_RETURN_IF_ERROR(ctx->RemoteMgr()->SerializeRemoteTensorHandle( + h, op->add_handles()->mutable_remote_handle(), src_device, + absl::get(h->DeviceOrHostCPU(*ctx))->name(), + serialize_resource_dtype_and_shape)); + } else { + return errors::InvalidArgument("Nested packed handles are not supported"); + } + } + return Status::OK(); +} + +void RemoteCopyNode::StartSendPackedHandle(StatusCallback done) { + Status s; + const uint64 context_view_id = ctx_->GetContextViewId(); + if (!send_device_->IsLocal()) { + s = errors::InvalidArgument( + "Copy a packed handle from a remote device is not supported"); + captured_state_->dst()->PoisonRemote(s, recv_device_, context_view_id); + done(s); + return; + } + + EnqueueRequest request; + uint64 context_id = ctx_->GetContextId(); + request.set_context_id(context_id); + s = SerializePackedHandle(recv_op_id_, src_, recv_device_, ctx_, + request.add_queue()->mutable_send_packed_handle()); + if (!s.ok()) { + captured_state_->dst()->PoisonRemote(s, recv_device_, context_view_id); + done(s); + return; + } + + TensorShape shape; + s = src_->Shape(&shape); + if (!s.ok()) { + captured_state_->dst()->PoisonRemote(s, recv_device_, context_view_id); + done(s); + return; + } + captured_state_->SetSrcShape(shape); + + core::RefCountPtr eager_client; + s = ctx_->GetClient(recv_device_, &eager_client); + if (!s.ok()) { + captured_state_->dst()->PoisonRemote(s, recv_device_, context_view_id); + done(s); + return; + } + + EnqueueResponse* response = new EnqueueResponse; + Device* recv_device = recv_device_; + const std::shared_ptr& captured_state = captured_state_; + eager_client->StreamingEnqueueAsync( + &request, response, + [captured_state, response, recv_device, context_view_id, + done](const Status& s) { + if (s.ok()) { + Status status = captured_state->dst()->SetRemoteShape( + captured_state->GetSrcShape(), recv_device, context_view_id); + if (!status.ok()) { + LOG(ERROR) << "Ignoring an error encountered when setting remote " + "shape of tensor received by SendPackedHadnle rpc: " + << status.ToString(); + } + } else { + captured_state->dst()->PoisonRemote(s, recv_device, context_view_id); + } + done(s); + delete response; + }); +} + void RemoteCopyNode::StartRemoteSendTensor(StatusCallback done) { Status s; EnqueueRequest request; @@ -351,7 +449,11 @@ Status RemoteCopyNode::Prepare() { void RemoteCopyNode::RunAsync(StatusCallback done) { started_ = true; - if (ctx_->UseSendTensorRPC() && send_device_->IsLocal() && + if (src_->Type() == TensorHandle::PACKED) { + return StartSendPackedHandle(std::move(done)); + } + + if ((ctx_->UseSendTensorRPC()) && send_device_->IsLocal() && !recv_device_->IsLocal()) { return StartRemoteSendTensor(std::move(done)); } diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.h b/tensorflow/core/distributed_runtime/eager/remote_copy_node.h index 
a527cd47127..7816a24ed33 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.h +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.h @@ -121,6 +121,9 @@ class RemoteCopyNode : public AsyncEagerNode { // SendTensor RPC *on the receiver*. void StartRemoteSendTensor(StatusCallback done); + // Send a local packed TensorHandle to a remote device. + void StartSendPackedHandle(StatusCallback done); + // State that is captured by Send and/or Recv callbacks (depending on which // one(s) is remote) and outlives this node in the case of remote->remote // copy. diff --git a/tensorflow/core/protobuf/eager_service.proto b/tensorflow/core/protobuf/eager_service.proto index e9e21777d3f..3fe2bd486ba 100644 --- a/tensorflow/core/protobuf/eager_service.proto +++ b/tensorflow/core/protobuf/eager_service.proto @@ -69,6 +69,7 @@ message QueueItem { // enqueued in streaming call. Request with this item type waits for pending // nodes to finish on the remote executor and report status. SyncRemoteExecutorForStream sync_remote_executor_for_stream = 6; + SendPackedHandleOp send_packed_handle = 7; } } @@ -238,6 +239,27 @@ message SendTensorOp { string device_name = 3; } +// Send a packed TensorHandle to a remote worker. +message SendPackedHandleOp { + // Op id of the remote packed TensorHandle. + int64 op_id = 1; + + message LocalTensorHandle { + TensorProto tensor = 1; + // Device where the tensor is produced. + string device = 2; + } + + message Handle { + oneof item { + LocalTensorHandle local_handle = 1; + RemoteTensorHandle remote_handle = 2; + } + } + + repeated Handle handles = 2; +} + //////////////////////////////////////////////////////////////////////////////// // // Eager Service defines a TensorFlow service that executes operations eagerly From 2560d6fd31b20e81a5a98a73f325fb1dcf0c68a7 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Wed, 13 May 2020 14:39:12 -0700 Subject: [PATCH 0523/1533] Fix issue where metric instances created in subclassed layer are not tracked on creation but only on call. PiperOrigin-RevId: 311407078 Change-Id: I2cd7ecb675699a56a4b90e5a29ba80ce6ca59cac --- tensorflow/python/keras/engine/base_layer.py | 6 +++ .../python/keras/engine/base_layer_v1.py | 6 +++ tensorflow/python/keras/engine/training.py | 7 ---- .../python/keras/engine/training_test.py | 37 +++++++++++++++++++ 4 files changed, 49 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index f6fa17df5c2..94b696d842b 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -2585,6 +2585,12 @@ class Layer(module.Module, version_utils.LayerVersionSelector): except AttributeError: pass + # Keep track of metric instance created in subclassed layer. + from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top + for val in nest.flatten(value): + if isinstance(val, metrics_module.Metric): + self._metrics.append(val) + # TODO(scottzhu): Need to track Module object as well for weight tracking. # Be careful about metric if it becomes a Module in future. 
# Append value to self._layers if relevant diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py index 24d12ae4d59..4a277ec3a3e 100644 --- a/tensorflow/python/keras/engine/base_layer_v1.py +++ b/tensorflow/python/keras/engine/base_layer_v1.py @@ -2223,6 +2223,12 @@ class Layer(base_layer.Layer): except AttributeError: pass + # Keep track of metric instance created in subclassed layer. + from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top + for val in nest.flatten(value): + if isinstance(val, metrics_module.Metric): + self._metrics.append(val) + # TODO(scottzhu): Need to track Module object as well for weight tracking. # Be careful about metric if it becomes a Module in future. # Append value to self._layers if relevant diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 52bf42a099d..d8c95b2a972 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -334,13 +334,6 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): super(Model, self).__setattr__(name, value) - # Keep track of metric instance created in subclassed model/layer. - # We do this so that we can maintain the correct order of metrics by adding - # the instance to the `metrics` list as soon as it is created. - from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top - if isinstance(value, metrics_module.Metric): - self._metrics.append(value) - @generic_utils.default def build(self, input_shape): """Builds the model based on input shapes received. diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index e4c1ff6b1f8..c1c498b207b 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -2979,6 +2979,8 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): return self.dense1(x) model = TestModel() + self.assertListEqual([m.name for m in model.metrics], + ['metric_1', 'metric_2']) model.compile( loss='mse', optimizer=RMSPropOptimizer(0.01), @@ -2998,6 +3000,41 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): model.train_on_batch(x, y) model.test_on_batch(x, y) + @keras_parameterized.run_all_keras_modes + def test_multiple_add_metric_calls_layer(self): + + class TestLayer(layers_module.Layer): + + def __init__(self): + super(TestLayer, self).__init__(name='test_layer') + self.dense1 = layers_module.Dense(2, kernel_initializer='ones') + self.m1 = metrics_module.Mean(name='m_1') + self.m2 = [ + metrics_module.Mean(name='m_2'), + metrics_module.Mean(name='m_3') + ] + self.m3 = { + 'mean4': metrics_module.Mean(name='m_4'), + 'mean5': metrics_module.Mean(name='m_5') + } + + def call(self, x): + self.add_metric(self.m2[0](x)) + self.add_metric(self.m2[1](x)) + self.add_metric(self.m1(x)) + self.add_metric(self.m3['mean4'](x)) + self.add_metric(self.m3['mean5'](x)) + self.add_metric(math_ops.reduce_sum(x), name='m_6', aggregation='mean') + return self.dense1(x) + + layer = TestLayer() + self.assertListEqual([m.name for m in layer.metrics], + ['m_1', 'm_2', 'm_3', 'm_4', 'm_5']) + + layer(np.ones((10, 10))) + self.assertListEqual([m.name for m in layer.metrics], + ['m_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6']) + @keras_parameterized.run_all_keras_modes def test_duplicate_metric_name_in_add_metric(self): From 
8a25406f8a6260dee347512f1cb2d44634cc4977 Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 13 May 2020 14:58:02 -0700 Subject: [PATCH 0524/1533] review changes. --- third_party/mkl_dnn/mkldnn_v1.BUILD | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD index c7aa0207ee2..93499fd62f2 100644 --- a/third_party/mkl_dnn/mkldnn_v1.BUILD +++ b/third_party/mkl_dnn/mkldnn_v1.BUILD @@ -70,8 +70,8 @@ cc_library( "src/cpu/**/*.cpp", "src/cpu/**/*.hpp", "src/cpu/xbyak/*.h", - ]) + [":dnnl_config_h"] - + [":dnnl_version_h"], + ]) + [":dnnl_config_h", + ":dnnl_version_h"], hdrs = glob(["include/*"]), copts = [ "-fexceptions", From 2046f7c450b8215f33b4ebfca094b637e36e6a7f Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Wed, 13 May 2020 14:56:38 -0700 Subject: [PATCH 0525/1533] Move TPUClusterResolver into tpu subdirectory. PiperOrigin-RevId: 311410592 Change-Id: I7c4ca01621ae27cd4c36ff996cf90237328d75e4 --- tensorflow/opensource_only.files | 2 - .../python/distribute/cluster_resolver/BUILD | 30 +- .../distribute/cluster_resolver/tpu/BUILD | 44 +++ .../tpu/tpu_cluster_resolver.py | 349 ++++++++++++++++++ .../{ => tpu}/tpu_cluster_resolver_test.py | 4 +- .../cluster_resolver/tpu_cluster_resolver.py | 334 +---------------- ...ter_resolver.-t-p-u-cluster-resolver.pbtxt | 2 +- ...ter_resolver.-t-p-u-cluster-resolver.pbtxt | 2 +- 8 files changed, 403 insertions(+), 364 deletions(-) create mode 100644 tensorflow/python/distribute/cluster_resolver/tpu/BUILD create mode 100644 tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py rename tensorflow/python/distribute/cluster_resolver/{ => tpu}/tpu_cluster_resolver_test.py (99%) diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 9ca7bb4fe28..41750ea02b4 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -6,8 +6,6 @@ tensorflow/compat_template_v1.__init__.py tensorflow/compiler/mlir/glob_lit_test.bzl tensorflow/lite/micro/build_def.bzl tensorflow/python/autograph/core/config.py -tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py -tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py tensorflow/python/eager/benchmarks_test_base.py tensorflow/python/tpu/profiler/pip_package/BUILD tensorflow/python/tpu/profiler/pip_package/README diff --git a/tensorflow/python/distribute/cluster_resolver/BUILD b/tensorflow/python/distribute/cluster_resolver/BUILD index 8577f1978b9..c7427af2081 100644 --- a/tensorflow/python/distribute/cluster_resolver/BUILD +++ b/tensorflow/python/distribute/cluster_resolver/BUILD @@ -1,10 +1,6 @@ # Description: Operations defined for Cluster Resolvers load("//tensorflow:tensorflow.bzl", "tf_py_test") -load( - "//tensorflow/core/platform:build_config.bzl", - "tf_additional_rpc_deps", -) package( default_visibility = [ @@ -64,12 +60,7 @@ py_library( name = "tpu_cluster_resolver_py", srcs = ["tpu_cluster_resolver.py"], srcs_version = "PY2AND3", - deps = [ - ":base_cluster_resolver_py", - "//tensorflow/python:training_server_lib", - "//tensorflow/python/tpu:tpu_lib", - "//tensorflow/python/tpu/client", - ] + tf_additional_rpc_deps(), + deps = ["//tensorflow/python/distribute/cluster_resolver/tpu:tpu_cluster_resolver_py"], ) py_library( @@ -137,25 +128,6 @@ tf_py_test( ], ) -tf_py_test( - name = "tpu_cluster_resolver_py_test", - size = "small", - srcs = ["tpu_cluster_resolver_test.py"], - grpc_enabled = True, - 
main = "tpu_cluster_resolver_test.py", - python_version = "PY3", - deps = [ - ":tpu_cluster_resolver_py", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - "//tensorflow/python:training_server_lib", - "//tensorflow/python/tpu/client", - "@absl_py//absl/testing:flagsaver", - ], -) - tf_py_test( name = "slurm_cluster_resolver_py_test", size = "small", diff --git a/tensorflow/python/distribute/cluster_resolver/tpu/BUILD b/tensorflow/python/distribute/cluster_resolver/tpu/BUILD new file mode 100644 index 00000000000..4825bf3b6d8 --- /dev/null +++ b/tensorflow/python/distribute/cluster_resolver/tpu/BUILD @@ -0,0 +1,44 @@ +# Description: OSS only cluster resolvers + +load("//tensorflow:tensorflow.bzl", "tf_py_test") +load( + "//tensorflow/core/platform:build_config.bzl", + "tf_additional_rpc_deps", +) + +package( + default_visibility = [ + "//tensorflow:internal", + ], + licenses = ["notice"], # Apache 2.0 +) + +py_library( + name = "tpu_cluster_resolver_py", + srcs = ["tpu_cluster_resolver.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:training_server_lib", + "//tensorflow/python/distribute/cluster_resolver:base_cluster_resolver_py", + "//tensorflow/python/tpu:tpu_lib", + "//tensorflow/python/tpu/client", + ] + tf_additional_rpc_deps(), +) + +tf_py_test( + name = "tpu_cluster_resolver_py_test", + size = "small", + srcs = ["tpu_cluster_resolver_test.py"], + grpc_enabled = True, + main = "tpu_cluster_resolver_test.py", + python_version = "PY3", + deps = [ + ":tpu_cluster_resolver_py", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + "//tensorflow/python:training_server_lib", + "//tensorflow/python/tpu/client", + ], +) diff --git a/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py new file mode 100644 index 00000000000..943b736fde4 --- /dev/null +++ b/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py @@ -0,0 +1,349 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Implementation of Cluster Resolvers for Cloud TPUs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re + +from tensorflow.python.distribute.cluster_resolver import cluster_resolver +from tensorflow.python.framework import errors +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib +from tensorflow.python.training import server_lib +from tensorflow.python.util import compat + +try: + from cloud_tpu_client import client # pylint: disable=g-import-not-at-top +except ImportError: + logging.debug( + 'Falling back to TensorFlow client; we recommended you install the Cloud ' + 'TPU client directly with pip install cloud-tpu-client.') + from tensorflow.python.tpu.client import client # pylint: disable=g-import-not-at-top + + +def is_running_in_gce(): + return True + + +_TPU_DEVICE_REGEX = re.compile( + r'.*task:(?P\d+)/.*device:TPU:(?P\d+)$') +_TPU_CONN_RETRIES = 120 +DeviceDetails = collections.namedtuple( + 'DeviceDetails', ['device_map', 'total_cores']) + + +class TPUClusterResolver(cluster_resolver.ClusterResolver): + """Cluster Resolver for Google Cloud TPUs. + + This is an implementation of cluster resolvers for the Google Cloud TPU + service. As Cloud TPUs are in alpha, you will need to specify a API definition + file for this to consume, in addition to a list of Cloud TPUs in your Google + Cloud Platform project. + + TPUClusterResolver supports the following distinct environments: + Google Compute Engine + Google Kubernetes Engine + Google internal + """ + + @staticmethod + def _get_device_dict_and_cores(devices): + """Returns a dict of hosts to cores and total cores given devices names. + + Returns a namedtuple with two attributes: + device_map: A map of host_ids to a list of core_ids. + total_cores: The total number of cores within the TPU system. + + Args: + devices: A list of devices returned by session.list_devices() + """ + device_map = collections.defaultdict(list) + num_cores = 0 + for device in devices: + match = _TPU_DEVICE_REGEX.match(device.name) + if match: + host_id = match.group('host_id') + core_id = match.group('core_id') + device_map[host_id].append(core_id) + num_cores += 1 + return DeviceDetails(device_map, num_cores) + + @staticmethod + def _verify_and_return_same_core_count(device_dict): + """Verifies that every device in device_dict has the same # of cores.""" + num_cores_per_host_set = ( + {len(core_ids) for core_ids in device_dict.values()}) + if len(num_cores_per_host_set) != 1: + raise RuntimeError('TPU cores on each device is not the same. This ' + 'should never happen. Devices: {}'.format(device_dict)) + return num_cores_per_host_set.pop() + + def __init__(self, + tpu=None, + zone=None, + project=None, + job_name='worker', + coordinator_name=None, + coordinator_address=None, + credentials='default', + service=None, + discovery_url=None): + """Creates a new TPUClusterResolver object. + + The ClusterResolver will then use the parameters to query the Cloud TPU APIs + for the IP addresses and ports of each Cloud TPU listed. + + Args: + tpu: A string corresponding to the TPU to use. 
If the string is an empty + string, the string 'local', or a string that begins with 'grpc://', then + it is assumed to not correspond with a Cloud TPU and will instead be + passed as the session master and no ClusterSpec propagation will be + done. In the future, this may also support a list of strings when + multiple Cloud TPUs are used. + zone: Zone where the TPUs are located. If omitted or empty, we will assume + that the zone of the TPU is the same as the zone of the GCE VM, which we + will try to discover from the GCE metadata service. + project: Name of the GCP project containing Cloud TPUs. If omitted or + empty, we will try to discover the project name of the GCE VM from the + GCE metadata service. + job_name: Name of the TensorFlow job the TPUs belong to. + coordinator_name: The name to use for the coordinator. Set to None if the + coordinator should not be included in the computed ClusterSpec. + coordinator_address: The address of the coordinator (typically an ip:port + pair). If set to None, a TF server will be started. If coordinator_name + is None, a TF server will not be started even if coordinator_address is + None. + credentials: GCE Credentials. If None, then we use default credentials + from the oauth2client + service: The GCE API object returned by the googleapiclient.discovery + function. If you specify a custom service object, then the credentials + parameter will be ignored. + discovery_url: A URL template that points to the location of the discovery + service. It should have two parameters {api} and {apiVersion} that when + filled in produce an absolute URL to the discovery document for that + service. The environment variable 'TPU_API_DISCOVERY_URL' will override + this. + + Raises: + ImportError: If the googleapiclient is not installed. + ValueError: If no TPUs are specified. + RuntimeError: If an empty TPU name is specified and this is running in a + Google Cloud environment. + """ + + self._cloud_tpu_client = client.Client( + tpu=tpu, + zone=zone, + project=project, + credentials=credentials, + service=service, + discovery_url=discovery_url) + + self._tpu = self._cloud_tpu_client.name() + # By default the task_type is 'worker` and the task_id is 0 (which is the + # first worker in the task). + self.task_type = job_name + self.task_id = 0 + self._coordinator_name = coordinator_name + if (coordinator_name and not coordinator_address): + self._start_local_server() + else: + self._coordinator_address = coordinator_address + + def __enter__(self): + self._cloud_tpu_client.enter() + + def __exit__(self, type, value, traceback): # pylint: disable=redefined-builtin + self._cloud_tpu_client.exit(type, value, traceback) + + def master(self, task_type=None, task_id=None, rpc_layer=None): + """Get the Master string to be used for the session. + + In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of + first instance in the ClusterSpec returned by the cluster_spec function. + + If a non-TPU name is used when constructing a TPUClusterResolver, that will + be returned instead (e.g. If the tpus argument's value when constructing + this TPUClusterResolver was 'grpc://10.240.1.2:8470', + 'grpc://10.240.1.2:8470' will be returned). + + Args: + task_type: (Optional, string) The type of the TensorFlow task of the + master. + task_id: (Optional, integer) The index of the TensorFlow task of the + master. + rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to + communicate with TPUs. 
+ + Returns: + string, the connection string to use when creating a session. + + Raises: + ValueError: If none of the TPUs specified exists. + """ + + cluster_spec = self.cluster_spec() + if task_type is not None and task_id is not None: + # task_type and task_id is from the function parameter + master = cluster_spec.task_address(task_type, task_id) + elif self.task_type is not None and self.task_id is not None: + # task_type and task_id is from the object + master = cluster_spec.task_address(self.task_type, self.task_id) + else: + # by default we take the first item in the cluster with the right name + job_tasks = cluster_spec.job_tasks(self.task_type) + if not job_tasks: + raise ValueError('No TPUs with the specified names exist.') + master = job_tasks[0] + return cluster_resolver.format_master_url(master, 'grpc') + + def get_master(self): + return self.master() + + def get_job_name(self): + return self.task_type + + def get_tpu_system_metadata(self): + """Returns the metadata of the TPU system. + + Users can call this method to get some facts of the TPU system, like + total number of cores, number of TPU workers and the devices. E.g. + ```python + + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + tpu_system_medata = resolver.get_tpu_system_metadata() + num_hosts = tpu_system_medata.num_hosts + ``` + + Returns: + A `tf.tpu.experimental.TPUSystemMetadata` object. + """ + cluster_spec = self.cluster_spec() + cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None + tpu_system_metadata = ( + tpu_system_metadata_lib._query_tpu_system_metadata( # pylint: disable=protected-access + self.master(), + cluster_def=cluster_def, + query_topology=False)) + + return tpu_system_metadata + + def cluster_spec(self): + """Returns a ClusterSpec object based on the latest TPU information. + + We retrieve the information from the GCE APIs every time this method is + called. + + Returns: + A ClusterSpec containing host information returned from Cloud TPUs, + or None. + + Raises: + RuntimeError: If the provided TPU is not healthy. + """ + ############################################################################ + # There are 5 potential cases this code must handle: + # 1. [Normal case.] We should resolve the TPU name to a set of tasks, and + # a. Create a ClusterSpec that includes the coordinator job + # b. Create a ClusterSpec without the coordinator job. + # 2. [GKE / No API Access.] We should not resolve the TPU name to a set of + # tasks and + # a. Create a ClusterSpec with the coordinator + # b. Create a ClusterSpec without the coordinator + ############################################################################ + + network_endpoints = self._cloud_tpu_client.network_endpoints() + worker_list = [ + '%s:%s' % (endpoint['ipAddress'], endpoint['port']) + for endpoint in network_endpoints + ] + cluster_spec = {self.task_type: worker_list} + if self._coordinator_address: + # {1, 2}.a + cluster_spec[self._coordinator_name] = [self._coordinator_address] + + return server_lib.ClusterSpec(cluster_spec) + + def num_accelerators(self, + task_type=None, + task_id=None, + config_proto=None): + """Returns the number of TPU cores per worker. + + Connects to the master and list all the devices present in the master, + and counts them up. Also verifies that the device counts per host in the + cluster is the same before returning the number of TPU cores per host. + + Args: + task_type: Unused. + task_id: Unused. 
+ config_proto: Used to create a connection to a TPU master in order to + retrieve the system metadata. + + Raises: + RuntimeError: If we cannot talk to a TPU worker after retrying or if the + number of TPU devices per host is different. + """ + retry_count = 1 + # TODO(b/120564445): Replace with standard library for retries. + while True: + try: + device_details = TPUClusterResolver._get_device_dict_and_cores( + cluster_resolver.get_accelerator_devices( + self.master(), config_proto=config_proto)) + break + except errors.DeadlineExceededError: + error_message = ('Failed to connect to master. The TPU might not be ' + 'ready (e.g. still scheduling) or the master ' + 'address is incorrect: got (%s)' % self.master()) + if retry_count <= _TPU_CONN_RETRIES: + logging.warning(error_message) + logging.warning('Retrying (%d/%d)...', retry_count, _TPU_CONN_RETRIES) + retry_count += 1 + else: + raise RuntimeError(error_message) + + if device_details.total_cores: + return {'TPU': TPUClusterResolver._verify_and_return_same_core_count( + device_details.device_map)} + return {'TPU': 0} + + @property + def environment(self): + """Returns the current environment which TensorFlow is running in.""" + return self._environment + + def _start_local_server(self): + address = compat.as_text(self._cloud_tpu_client.get_local_ip()) + self._server = server_lib.Server({'local': ['0.0.0.0:0']}, + protocol='grpc', + config=None, + start=True) + # self._server.target is of the form: grpc://ipaddress:port + target = compat.as_bytes(self._server.target) + splits = target.split(compat.as_bytes(':')) + assert len(splits) == 3, self._server.target + assert splits[0] == compat.as_bytes('grpc'), self._server.target + self._coordinator_port = compat.as_text(splits[2]) + self._coordinator_address = '%s:%s' % ( + address, compat.as_text(self._coordinator_port)) + + def __deepcopy__(self, memo): + # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy. 
+ return self diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver_test.py similarity index 99% rename from tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py rename to tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver_test.py index 1fad0a3fc95..1dc9a73fd74 100644 --- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py +++ b/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver_test.py @@ -25,7 +25,7 @@ from six.moves.urllib.error import URLError from tensorflow.python import framework from tensorflow.python.client import session -from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver as resolver +from tensorflow.python.distribute.cluster_resolver.tpu import tpu_cluster_resolver as resolver from tensorflow.python.eager.context import LogicalDevice from tensorflow.python.framework import errors from tensorflow.python.framework import test_util @@ -41,7 +41,7 @@ except ImportError: logging.debug( 'Falling back to TensorFlow client; we recommended you install the Cloud ' 'TPU client directly with pip install cloud-tpu-client.') - from tensorflow.python.tpu.client import client + from tensorflow.python.tpu.client import client # pylint: disable=g-import-not-at-top class MockRequestClass(object): diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py index 79ec0bc13d1..5731c2c930a 100644 --- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py @@ -12,339 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of Cluster Resolvers for Cloud TPUs.""" +"""Shim so that direct imports of tpu_cluster_resolver get correct symbols. 
+""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections -import re - -from tensorflow.python.distribute.cluster_resolver import cluster_resolver -from tensorflow.python.framework import errors -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib -from tensorflow.python.training import server_lib -from tensorflow.python.util import compat +from tensorflow.python.distribute.cluster_resolver.tpu.tpu_cluster_resolver import is_running_in_gce # pylint: disable=unused-import +from tensorflow.python.distribute.cluster_resolver.tpu.tpu_cluster_resolver import TPUClusterResolver from tensorflow.python.util.tf_export import tf_export -try: - from cloud_tpu_client import client # pylint: disable=g-import-not-at-top -except ImportError: - logging.debug( - 'Falling back to TensorFlow client; we recommended you install the Cloud ' - 'TPU client directly with pip install cloud-tpu-client.') - from tensorflow.python.tpu.client import client - -def is_running_in_gce(): - return True - - -_TPU_DEVICE_REGEX = re.compile( - r'.*task:(?P\d+)/.*device:TPU:(?P\d+)$') -_TPU_CONN_RETRIES = 120 -DeviceDetails = collections.namedtuple( - 'DeviceDetails', ['device_map', 'total_cores']) - - -@tf_export('distribute.cluster_resolver.TPUClusterResolver') -class TPUClusterResolver(cluster_resolver.ClusterResolver): - """Cluster Resolver for Google Cloud TPUs. - - This is an implementation of cluster resolvers for the Google Cloud TPU - service. As Cloud TPUs are in alpha, you will need to specify a API definition - file for this to consume, in addition to a list of Cloud TPUs in your Google - Cloud Platform project. - - TPUClusterResolver supports the following distinct environments: - Google Compute Engine - Google Kubernetes Engine - Google internal - """ - - @staticmethod - def _get_device_dict_and_cores(devices): - """Returns a dict of hosts to cores and total cores given devices names. - - Returns a namedtuple with two attributes: - device_map: A map of host_ids to a list of core_ids. - total_cores: The total number of cores within the TPU system. - - Args: - devices: A list of devices returned by session.list_devices() - """ - device_map = collections.defaultdict(list) - num_cores = 0 - for device in devices: - match = _TPU_DEVICE_REGEX.match(device.name) - if match: - host_id = match.group('host_id') - core_id = match.group('core_id') - device_map[host_id].append(core_id) - num_cores += 1 - return DeviceDetails(device_map, num_cores) - - @staticmethod - def _verify_and_return_same_core_count(device_dict): - """Verifies that every device in device_dict has the same # of cores.""" - num_cores_per_host_set = ( - {len(core_ids) for core_ids in device_dict.values()}) - if len(num_cores_per_host_set) != 1: - raise RuntimeError('TPU cores on each device is not the same. This ' - 'should never happen. Devices: {}'.format(device_dict)) - return num_cores_per_host_set.pop() - - def __init__(self, - tpu=None, - zone=None, - project=None, - job_name='worker', - coordinator_name=None, - coordinator_address=None, - credentials='default', - service=None, - discovery_url=None): - """Creates a new TPUClusterResolver object. - - The ClusterResolver will then use the parameters to query the Cloud TPU APIs - for the IP addresses and ports of each Cloud TPU listed. - - Args: - tpu: A string corresponding to the TPU to use. 
If the string is an empty - string, the string 'local', or a string that begins with 'grpc://', then - it is assumed to not correspond with a Cloud TPU and will instead be - passed as the session master and no ClusterSpec propagation will be - done. In the future, this may also support a list of strings when - multiple Cloud TPUs are used. - zone: Zone where the TPUs are located. If omitted or empty, we will assume - that the zone of the TPU is the same as the zone of the GCE VM, which we - will try to discover from the GCE metadata service. - project: Name of the GCP project containing Cloud TPUs. If omitted or - empty, we will try to discover the project name of the GCE VM from the - GCE metadata service. - job_name: Name of the TensorFlow job the TPUs belong to. - coordinator_name: The name to use for the coordinator. Set to None if the - coordinator should not be included in the computed ClusterSpec. - coordinator_address: The address of the coordinator (typically an ip:port - pair). If set to None, a TF server will be started. If coordinator_name - is None, a TF server will not be started even if coordinator_address is - None. - credentials: GCE Credentials. If None, then we use default credentials - from the oauth2client - service: The GCE API object returned by the googleapiclient.discovery - function. If you specify a custom service object, then the credentials - parameter will be ignored. - discovery_url: A URL template that points to the location of the discovery - service. It should have two parameters {api} and {apiVersion} that when - filled in produce an absolute URL to the discovery document for that - service. The environment variable 'TPU_API_DISCOVERY_URL' will override - this. - - Raises: - ImportError: If the googleapiclient is not installed. - ValueError: If no TPUs are specified. - RuntimeError: If an empty TPU name is specified and this is running in a - Google Cloud environment. - """ - - self._cloud_tpu_client = client.Client( - tpu=tpu, - zone=zone, - project=project, - credentials=credentials, - service=service, - discovery_url=discovery_url) - - self._tpu = self._cloud_tpu_client.name() - # By default the task_type is 'worker` and the task_id is 0 (which is the - # first worker in the task). - self.task_type = job_name - self.task_id = 0 - self._coordinator_name = coordinator_name - if (coordinator_name and not coordinator_address): - self._start_local_server() - else: - self._coordinator_address = coordinator_address - - def __enter__(self): - self._cloud_tpu_client.enter() - - def __exit__(self, type, value, traceback): # pylint: disable=redefined-builtin - self._cloud_tpu_client.exit(type, value, traceback) - - def master(self, task_type=None, task_id=None, rpc_layer=None): - """Get the Master string to be used for the session. - - In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of - first instance in the ClusterSpec returned by the cluster_spec function. - - If a non-TPU name is used when constructing a TPUClusterResolver, that will - be returned instead (e.g. If the tpus argument's value when constructing - this TPUClusterResolver was 'grpc://10.240.1.2:8470', - 'grpc://10.240.1.2:8470' will be returned). - - Args: - task_type: (Optional, string) The type of the TensorFlow task of the - master. - task_id: (Optional, integer) The index of the TensorFlow task of the - master. - rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to - communicate with TPUs. 
- - Returns: - string, the connection string to use when creating a session. - - Raises: - ValueError: If none of the TPUs specified exists. - """ - - cluster_spec = self.cluster_spec() - if task_type is not None and task_id is not None: - # task_type and task_id is from the function parameter - master = cluster_spec.task_address(task_type, task_id) - elif self.task_type is not None and self.task_id is not None: - # task_type and task_id is from the object - master = cluster_spec.task_address(self.task_type, self.task_id) - else: - # by default we take the first item in the cluster with the right name - job_tasks = cluster_spec.job_tasks(self.task_type) - if not job_tasks: - raise ValueError('No TPUs with the specified names exist.') - master = job_tasks[0] - return cluster_resolver.format_master_url(master, 'grpc') - - def get_master(self): - return self.master() - - def get_job_name(self): - return self.task_type - - def get_tpu_system_metadata(self): - """Returns the metadata of the TPU system. - - Users can call this method to get some facts of the TPU system, like - total number of cores, number of TPU workers and the devices. E.g. - ```python - - resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') - tpu_system_medata = resolver.get_tpu_system_metadata() - num_hosts = tpu_system_medata.num_hosts - ``` - - Returns: - A `tf.tpu.experimental.TPUSystemMetadata` object. - """ - cluster_spec = self.cluster_spec() - cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None - tpu_system_metadata = ( - tpu_system_metadata_lib._query_tpu_system_metadata( # pylint: disable=protected-access - self.master(), - cluster_def=cluster_def, - query_topology=False)) - - return tpu_system_metadata - - def cluster_spec(self): - """Returns a ClusterSpec object based on the latest TPU information. - - We retrieve the information from the GCE APIs every time this method is - called. - - Returns: - A ClusterSpec containing host information returned from Cloud TPUs, - or None. - - Raises: - RuntimeError: If the provided TPU is not healthy. - """ - ############################################################################ - # There are 5 potential cases this code must handle: - # 1. [Normal case.] We should resolve the TPU name to a set of tasks, and - # a. Create a ClusterSpec that includes the coordinator job - # b. Create a ClusterSpec without the coordinator job. - # 2. [GKE / No API Access.] We should not resolve the TPU name to a set of - # tasks and - # a. Create a ClusterSpec with the coordinator - # b. Create a ClusterSpec without the coordinator - ############################################################################ - - network_endpoints = self._cloud_tpu_client.network_endpoints() - worker_list = [ - '%s:%s' % (endpoint['ipAddress'], endpoint['port']) - for endpoint in network_endpoints - ] - cluster_spec = {self.task_type: worker_list} - if self._coordinator_address: - # {1, 2}.a - cluster_spec[self._coordinator_name] = [self._coordinator_address] - - return server_lib.ClusterSpec(cluster_spec) - - def num_accelerators(self, - task_type=None, - task_id=None, - config_proto=None): - """Returns the number of TPU cores per worker. - - Connects to the master and list all the devices present in the master, - and counts them up. Also verifies that the device counts per host in the - cluster is the same before returning the number of TPU cores per host. - - Args: - task_type: Unused. - task_id: Unused. 
- config_proto: Used to create a connection to a TPU master in order to - retrieve the system metadata. - - Raises: - RuntimeError: If we cannot talk to a TPU worker after retrying or if the - number of TPU devices per host is different. - """ - retry_count = 1 - # TODO(b/120564445): Replace with standard library for retries. - while True: - try: - device_details = TPUClusterResolver._get_device_dict_and_cores( - cluster_resolver.get_accelerator_devices( - self.master(), config_proto=config_proto)) - break - except errors.DeadlineExceededError: - error_message = ('Failed to connect to master. The TPU might not be ' - 'ready (e.g. still scheduling) or the master ' - 'address is incorrect: got (%s)' % self.master()) - if retry_count <= _TPU_CONN_RETRIES: - logging.warning(error_message) - logging.warning('Retrying (%d/%d)...', retry_count, _TPU_CONN_RETRIES) - retry_count += 1 - else: - raise RuntimeError(error_message) - - if device_details.total_cores: - return {'TPU': TPUClusterResolver._verify_and_return_same_core_count( - device_details.device_map)} - return {'TPU': 0} - - @property - def environment(self): - """Returns the current environment which TensorFlow is running in.""" - return self._environment - - def _start_local_server(self): - address = compat.as_text(self._cloud_tpu_client.get_local_ip()) - self._server = server_lib.Server({'local': ['0.0.0.0:0']}, - protocol='grpc', - config=None, - start=True) - # self._server.target is of the form: grpc://ipaddress:port - target = compat.as_bytes(self._server.target) - splits = target.split(compat.as_bytes(':')) - assert len(splits) == 3, self._server.target - assert splits[0] == compat.as_bytes('grpc'), self._server.target - self._coordinator_port = compat.as_text(splits[2]) - self._coordinator_address = '%s:%s' % ( - address, compat.as_text(self._coordinator_port)) - - def __deepcopy__(self, memo): - # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy. - return self +tf_export('distribute.cluster_resolver.TPUClusterResolver')(TPUClusterResolver) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt index c0dc0054165..658212aca5e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.distribute.cluster_resolver.TPUClusterResolver" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt index c0dc0054165..658212aca5e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.distribute.cluster_resolver.TPUClusterResolver" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { From 18c0da102443f3500c43618d469bd7e7f761696c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 15:00:16 -0700 Subject: [PATCH 0526/1533] Correctly handle empty matrices in tf.linalg.svd. 
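
A minimal sketch of the intended behavior (illustrative only, not part of the patch itself; it assumes eager execution with `full_matrices=True` and `compute_uv=True`):

```python
import tensorflow as tf

# A 3x0 matrix has no singular values. With full_matrices=True the left
# singular vectors should still form a full canonical basis (the 3x3
# identity), while V stays empty.
a = tf.zeros([3, 0])
s, u, v = tf.linalg.svd(a, full_matrices=True, compute_uv=True)
print(s.shape)  # (0,)
print(u.shape)  # (3, 3), expected to be the identity matrix
print(v.shape)  # (0, 0)
```

The CPU and GPU kernel changes below implement this by filling U (or V) with the identity when exactly one of the two trailing dimensions is zero.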
PiperOrigin-RevId: 311411299 Change-Id: Ie5440ad4593291409f801fb174fbac3120db0eb7 --- tensorflow/core/kernels/BUILD | 4 +++- tensorflow/core/kernels/svd_op_gpu.cu.cc | 19 +++++++++++++-- tensorflow/core/kernels/svd_op_impl.h | 23 +++++++++++++++---- tensorflow/python/kernel_tests/svd_op_test.py | 7 +++--- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 7cfb6fcae67..6cb8704f494 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3792,7 +3792,9 @@ tf_kernel_library( tf_kernel_library( name = "svd_op", prefix = "svd_op", - deps = LINALG_DEPS, + deps = LINALG_DEPS + if_cuda([ + ":eye_functor", + ]), ) tf_kernel_library( diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc index 2821abf8a6c..482fd057e4e 100644 --- a/tensorflow/core/kernels/svd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/cuda_solvers.h" +#include "tensorflow/core/kernels/eye_functor.h" #include "tensorflow/core/kernels/linalg_ops_common.h" #include "tensorflow/core/kernels/transpose_functor.h" #include "tensorflow/core/lib/core/errors.h" @@ -390,8 +391,22 @@ class SvdOpGpu : public AsyncOpKernel { done); if (n == 0 || m == 0) { - // If X is an empty matrix (0 rows, 0 col), X * X' == X. - // Therefore, we return X. + if (n == m || !compute_uv_ || !full_matrices_) { + // S, U, and V are all empty. Nothing to do. + done(); + return; + } + auto device = context->eigen_device(); + functor::EyeFunctor eye; + if (m > 0) { + // Return a full canonical basis for the column space. + auto outputU_reshaped = outputU->flat_inner_dims(); + eye(device, outputU_reshaped); + } else if (n > 0) { + // Return a full canonical basis for the row space. + auto outputV_reshaped = outputV->flat_inner_dims(); + eye(device, outputV_reshaped); + } done(); return; } diff --git a/tensorflow/core/kernels/svd_op_impl.h b/tensorflow/core/kernels/svd_op_impl.h index 2a67700c126..675826a057c 100644 --- a/tensorflow/core/kernels/svd_op_impl.h +++ b/tensorflow/core/kernels/svd_op_impl.h @@ -83,16 +83,29 @@ class SvdOp : public LinearAlgebraOp { void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, MatrixMaps* outputs) final { + int64 n = inputs[0].cols(); + int64 m = inputs[0].rows(); + const bool empty = (m == 0 || n == 0); int options = 0; // Don't compute singular vectors; if (compute_uv_) { options = full_matrices_ ? Eigen::ComputeFullU | Eigen::ComputeFullV : Eigen::ComputeThinU | Eigen::ComputeThinV; } - Eigen::BDCSVD svd(inputs[0], options); - outputs->at(0) = svd.singularValues().template cast(); - if (compute_uv_) { - outputs->at(1) = svd.matrixU(); - outputs->at(2) = svd.matrixV(); + if (!empty) { + Eigen::BDCSVD svd(inputs[0], options); + outputs->at(0) = svd.singularValues().template cast(); + if (compute_uv_) { + outputs->at(1) = svd.matrixU(); + outputs->at(2) = svd.matrixV(); + } + } else if (compute_uv_ && full_matrices_) { + // For an empty matrix where only one dimension is zero, we still set + // U or V to the unit matrix for the dimension that is non-zero. 
+ if (m > 0) { + outputs->at(1) = Matrix::Identity(m, m); + } else { + outputs->at(2) = Matrix::Identity(n, n); + } } } diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py index 120e604e7ae..a53d2470aa5 100644 --- a/tensorflow/python/kernel_tests/svd_op_test.py +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -93,7 +93,8 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_, full_matrices_): def CompareSingularValues(self, x, y, tol): - self.assertAllClose(x, y, atol=(x[0] + y[0]) * tol) + atol = (x[0] + y[0]) * tol if len(x) else tol + self.assertAllClose(x, y, atol=atol) def CompareSingularVectors(self, x, y, rank, tol): # We only compare the first 'rank' singular vectors since the @@ -374,8 +375,8 @@ if __name__ == "__main__": for compute_uv in False, True: for full_matrices in False, True: for dtype in dtypes_to_test: - for rows in 1, 2, 5, 10, 32, 100: - for cols in 1, 2, 5, 10, 32, 100: + for rows in 0, 1, 2, 5, 10, 32, 100: + for cols in 0, 1, 2, 5, 10, 32, 100: for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): shape = batch_dims + (rows, cols) # TF2 does not support placeholders under eager so we skip it From 4eeb6d742e1ff416f9fc2baeba2fab698e6f28cf Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Wed, 13 May 2020 15:00:41 -0700 Subject: [PATCH 0527/1533] More robustly check for undefined symbols before attempting to use them. This check is required because undefined symbols are initialized with a special placeholder before entering control flow. This placeholder can lead to confusing error messages if left unchecked. The change introduces two more general operators: "variable load" and "return". PiperOrigin-RevId: 311411422 Change-Id: Ic8abda74c1f68c1d4de491949d309d60099b91b4 --- tensorflow/python/autograph/converters/BUILD | 14 ++ .../autograph/converters/asserts_test.py | 4 +- .../python/autograph/converters/functions.py | 9 - .../autograph/converters/functions_test.py | 4 +- .../autograph/converters/return_statements.py | 99 ++++---- .../converters/return_statements_test.py | 3 +- .../python/autograph/converters/variables.py | 76 ++++++ .../autograph/converters/variables_test.py | 116 +++++++++ tensorflow/python/autograph/core/BUILD | 1 + .../autograph/core/function_wrappers.py | 11 +- .../autograph/core/function_wrappers_test.py | 2 +- .../python/autograph/impl/conversion.py | 2 + tensorflow/python/autograph/operators/BUILD | 18 +- .../python/autograph/operators/__init__.py | 8 +- .../autograph/operators/control_flow.py | 26 +- .../operators/control_flow_deprecated_py2.py | 10 +- .../autograph/operators/control_flow_test.py | 10 +- .../python/autograph/operators/symbols.py | 115 --------- .../autograph/operators/symbols_test.py | 230 ------------------ .../{special_values.py => variables.py} | 40 +-- ...ecial_values_test.py => variables_test.py} | 32 ++- .../reaching_definitions_py3_test.py | 12 + .../reaching_definitions_test.py | 3 + 23 files changed, 351 insertions(+), 494 deletions(-) create mode 100644 tensorflow/python/autograph/converters/variables.py create mode 100644 tensorflow/python/autograph/converters/variables_test.py delete mode 100644 tensorflow/python/autograph/operators/symbols.py delete mode 100644 tensorflow/python/autograph/operators/symbols_test.py rename tensorflow/python/autograph/operators/{special_values.py => variables.py} (72%) rename tensorflow/python/autograph/operators/{special_values_test.py => variables_test.py} (58%) diff --git 
a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD index 9c1d5a38707..ec780a7c0a1 100644 --- a/tensorflow/python/autograph/converters/BUILD +++ b/tensorflow/python/autograph/converters/BUILD @@ -33,6 +33,7 @@ py_library( "logical_expressions.py", "return_statements.py", "slices.py", + "variables.py", ], srcs_version = "PY2AND3", visibility = ["//tensorflow:__subpackages__"], @@ -213,3 +214,16 @@ py_test( "//tensorflow/python/autograph/pyct", ], ) + +py_test( + name = "variables_test", + srcs = ["variables_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":converters", + "//tensorflow/python:client_testlib", + "//tensorflow/python/autograph/core:test_lib", + "//tensorflow/python/autograph/pyct", + ], +) diff --git a/tensorflow/python/autograph/converters/asserts_test.py b/tensorflow/python/autograph/converters/asserts_test.py index fd31cd15a0e..dc435cbc90e 100644 --- a/tensorflow/python/autograph/converters/asserts_test.py +++ b/tensorflow/python/autograph/converters/asserts_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.python.autograph.converters import asserts from tensorflow.python.autograph.converters import functions +from tensorflow.python.autograph.converters import return_statements from tensorflow.python.autograph.core import converter_testing from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors_impl @@ -36,7 +37,8 @@ class AssertsTest(converter_testing.TestCase): return a with ops.Graph().as_default(): - with self.converted(test_fn, (functions, asserts), {}) as result: + with self.converted( + test_fn, (functions, asserts, return_statements), {}) as result: op = result.test_fn(constant_op.constant(False)) with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, 'testmsg'): diff --git a/tensorflow/python/autograph/converters/functions.py b/tensorflow/python/autograph/converters/functions.py index fc33dafb63d..26ead131f9b 100644 --- a/tensorflow/python/autograph/converters/functions.py +++ b/tensorflow/python/autograph/converters/functions.py @@ -38,15 +38,6 @@ class _Function(object): class FunctionTransformer(converter.Base): """Wraps function bodies around autograph-specific boilerplate.""" - def visit_Return(self, node): - if node.value is None: - return node - node = self.generic_visit(node) - return templates.replace( - 'return function_context_name.mark_return_value(value)', - function_context_name=self.state[_Function].context_name, - value=node.value) - def _function_scope_options(self, fn_scope): """Returns the options with which to create function scopes.""" # Top-level function receive the options that were directly requested. 
diff --git a/tensorflow/python/autograph/converters/functions_test.py b/tensorflow/python/autograph/converters/functions_test.py index aad455e67d7..2a51ef71ebf 100644 --- a/tensorflow/python/autograph/converters/functions_test.py +++ b/tensorflow/python/autograph/converters/functions_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.autograph.converters import functions +from tensorflow.python.autograph.converters import return_statements from tensorflow.python.autograph.core import ag_ctx from tensorflow.python.autograph.core import converter from tensorflow.python.autograph.core import converter_testing @@ -74,7 +75,7 @@ class FunctionTransformer(converter_testing.TestCase): l += 1 return l, inner_fn(l) - with self.converted(test_fn, functions, {}, + with self.converted(test_fn, (functions, return_statements), {}, (ops.name_scope,)) as result: first, second = result.test_fn(constant_op.constant(1)) self.assertIn('test_fn/', first.op.name) @@ -119,6 +120,7 @@ class FunctionTransformer(converter_testing.TestCase): ns = {'TestClass': TestClass} node, ctx = self.prepare(TestClass, ns) node = functions.transform(node, ctx) + node = return_statements.transform(node, ctx) with self.compiled(node, {}, (ops.name_scope,)) as result: first, second = result.TestClass().test_fn(constant_op.constant(1)) diff --git a/tensorflow/python/autograph/converters/return_statements.py b/tensorflow/python/autograph/converters/return_statements.py index 39bac60fb91..e4062e42db7 100644 --- a/tensorflow/python/autograph/converters/return_statements.py +++ b/tensorflow/python/autograph/converters/return_statements.py @@ -220,9 +220,9 @@ class ReturnStatementsTransformer(converter.Base): retval = val """ - def __init__(self, ctx, default_to_null_return): + def __init__(self, ctx, allow_missing_return): super(ReturnStatementsTransformer, self).__init__(ctx) - self.default_to_null_return = default_to_null_return + self.allow_missing_return = allow_missing_return def visit_Return(self, node): for block in reversed(self.state[_Block].stack): @@ -339,75 +339,68 @@ class ReturnStatementsTransformer(converter.Base): return node def visit_FunctionDef(self, node): - self.state[_Function].enter() - self.state[_Block].enter() - self.state[_Block].is_function = True + with self.state[_Function] as fn: + with self.state[_Block] as block: + block.is_function = True - scope = anno.getanno(node, NodeAnno.BODY_SCOPE) - do_return_var_name = self.ctx.namer.new_symbol( - 'do_return', scope.referenced) - retval_var_name = self.ctx.namer.new_symbol('retval_', scope.referenced) - self.state[_Function].do_return_var_name = do_return_var_name - self.state[_Function].retval_var_name = retval_var_name + scope = anno.getanno(node, NodeAnno.BODY_SCOPE) + do_return_var_name = self.ctx.namer.new_symbol('do_return', + scope.referenced) + retval_var_name = self.ctx.namer.new_symbol('retval_', scope.referenced) + fn.do_return_var_name = do_return_var_name + fn.retval_var_name = retval_var_name - converted_body = self._visit_statement_block(node, node.body) + node.body = self._visit_statement_block(node, node.body) - # Avoid placing statements before any eventual docstring. - # TODO(mdan): Should a docstring even be included in the output? 
- docstring = None - if converted_body: - if (isinstance(converted_body[0], gast.Expr) and - isinstance(converted_body[0].value, gast.Constant)): - docstring = converted_body[0] - converted_body = converted_body[1:] + if block.return_used: - if self.state[_Block].return_used: + if self.allow_missing_return: + # The function whould have a single `with` node that wraps the + # entire body. If the function had a docstring, the body has two + # nodes, with the `with` as the second node. + wrapper_node = node.body[-1] + assert isinstance(wrapper_node, gast.With), ( + 'This transformer requires the functions converter.') - if self.default_to_null_return: - # TODO(mdan): Remove the (do_return_var_name,) below. - # Currently, that line ensures the variable is both defined and alive - # throughout the function. - template = """ - do_return_var_name = False - retval_var_name = ag__.UndefinedReturnValue() - body - (do_return_var_name,) - return ag__.retval(retval_var_name) - """ - else: - template = """ - body - return retval_var_name - """ - node.body = templates.replace( - template, - body=converted_body, - do_return_var_name=do_return_var_name, - retval_var_name=retval_var_name) + template = """ + do_return_var_name = False + retval_var_name = ag__.UndefinedReturnValue() + body + return function_context.ret(retval_var_name, do_return_var_name) + """ - if docstring: - node.body.insert(0, docstring) + wrapper_node.body = templates.replace( + template, + body=wrapper_node.body, + do_return_var_name=do_return_var_name, + function_context=anno.getanno(node, 'function_context_name'), + retval_var_name=retval_var_name) + else: + template = """ + body + return retval_var_name + """ + node.body = templates.replace( + template, + body=node.body, + do_return_var_name=do_return_var_name, + retval_var_name=retval_var_name) - self.state[_Block].exit() - self.state[_Function].exit() return node def transform(node, ctx, default_to_null_return=True): - """Ensure a function has only a single return.""" - # Note: Technically, these two could be merged into a single walk, but - # keeping them separate helps with readability. - + """Ensure a function has only a single return, at the end.""" node = qual_names.resolve(node) node = activity.resolve(node, ctx, None) + # Note: Technically, these two could be merged into a single walk, but + # keeping them separate helps with readability. 
node = ConditionalReturnRewriter(ctx).visit(node) node = qual_names.resolve(node) node = activity.resolve(node, ctx, None) - transformer = ReturnStatementsTransformer( - ctx, default_to_null_return=default_to_null_return) + ctx, allow_missing_return=default_to_null_return) node = transformer.visit(node) - return node diff --git a/tensorflow/python/autograph/converters/return_statements_test.py b/tensorflow/python/autograph/converters/return_statements_test.py index df687927638..3f1e6a0bd97 100644 --- a/tensorflow/python/autograph/converters/return_statements_test.py +++ b/tensorflow/python/autograph/converters/return_statements_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.autograph.converters import functions from tensorflow.python.autograph.converters import return_statements from tensorflow.python.autograph.core import converter_testing from tensorflow.python.framework import ops @@ -28,7 +29,7 @@ class SingleReturnTest(converter_testing.TestCase): def assertTransformedEquivalent(self, test_fn, *inputs): ns = {'ops': ops} - with self.converted(test_fn, return_statements, ns) as result: + with self.converted(test_fn, (functions, return_statements), ns) as result: self.assertEqual(test_fn(*inputs), result.test_fn(*inputs)) def test_straightline(self): diff --git a/tensorflow/python/autograph/converters/variables.py b/tensorflow/python/autograph/converters/variables.py new file mode 100644 index 00000000000..3028a65a69b --- /dev/null +++ b/tensorflow/python/autograph/converters/variables.py @@ -0,0 +1,76 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Overloads all variable read operations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast + +from tensorflow.python.autograph.core import converter +from tensorflow.python.autograph.pyct import anno +from tensorflow.python.autograph.pyct import templates + + +class VariableAccessTransformer(converter.Base): + """Rewrites basic symbol reads. + + This transformer rewrites variable reads with a "read" operator which allows + tracking activity. + + Example: + + For a basic statement: + + a = b + c + + This is translated to: + + a = ld(b) + ld(c) + + Augmented assignment operations also introduce a `ld` operator: + + a += b + + The assignment target also receives an operator to properly represent the + read: + + a = ld(a) + a += ld(b) + """ + + def visit_Name(self, node): + # Only the loads which existed in the original code are overloaded. 
+ if not anno.hasanno(node, anno.Static.ORIG_DEFINITIONS): + return node + if isinstance(node.ctx, gast.Load): + node = templates.replace_as_expression('ag__.ld(var_)', var_=node) + return node + + def visit_AugAssign(self, node): + if isinstance(node.target, gast.Name): + template = """ + var_ = ag__.ld(var_) + original + """ + node = templates.replace(template, var_=node.target, original=node) + else: + node = self.generic_visit(node) + return node + + +def transform(node, ctx): + return VariableAccessTransformer(ctx).visit(node) diff --git a/tensorflow/python/autograph/converters/variables_test.py b/tensorflow/python/autograph/converters/variables_test.py new file mode 100644 index 00000000000..556dafbaa8a --- /dev/null +++ b/tensorflow/python/autograph/converters/variables_test.py @@ -0,0 +1,116 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for variables module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib + +from tensorflow.python.autograph.converters import variables +from tensorflow.python.autograph.core import converter_testing +from tensorflow.python.platform import test + + +class VariablesTest(converter_testing.TestCase): + + @contextlib.contextmanager + def apply_add_one_conversion(self, fn): + """Generates code which adds 1 to all variable reads.""" + with self.converted(fn, variables, {}) as result: + result.ag__.__dict__['ld'] = lambda x: x + 1 + yield result + + def test_read(self): + + def test_fn(l): + return l + + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(1), 2) + + def test_aug_assign(self): + + def test_fn(l): + l *= 10 + return l + + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(1), (1 + 1) * 10 + 1) # two reads + + def test_attribute(self): + + class TestClass(object): + + def __init__(self): + self.v = 1 + + def __add__(self, other): + self.v += other + return self + + def test_fn(l): + return l.v + + tc = TestClass() + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(tc), 2) + + def test_subscript(self): + + class TestClass(object): + + def __init__(self): + self.v = 1 + + def __add__(self, other): + self.v += other + return self + + def __getitem__(self, _): + return self.v + + def test_fn(l): + return l[0] + + tc = TestClass() + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(tc), 2) + + def test_call(self): + + class TestClass(object): + + def __init__(self): + self.v = 1 + + def __add__(self, other): + self.v += other + return self + + def __call__(self): + return self.v + + def test_fn(l): + return l() + + tc = TestClass() + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(tc), 2) + + +if __name__ == 
'__main__': + test.main() diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD index 655dc118a37..4a5c50dac55 100644 --- a/tensorflow/python/autograph/core/BUILD +++ b/tensorflow/python/autograph/core/BUILD @@ -30,6 +30,7 @@ py_library( visibility = ["//tensorflow:__subpackages__"], deps = [ "//tensorflow/python:framework_ops", + "//tensorflow/python/autograph/operators", "//tensorflow/python/autograph/pyct", "//tensorflow/python/autograph/pyct/static_analysis", "//tensorflow/python/autograph/utils", diff --git a/tensorflow/python/autograph/core/function_wrappers.py b/tensorflow/python/autograph/core/function_wrappers.py index cc0e7b98de5..d425f8b679d 100644 --- a/tensorflow/python/autograph/core/function_wrappers.py +++ b/tensorflow/python/autograph/core/function_wrappers.py @@ -20,12 +20,16 @@ from __future__ import print_function from tensorflow.python.autograph.core import ag_ctx from tensorflow.python.autograph.core import converter +from tensorflow.python.autograph.operators import variables from tensorflow.python.framework import auto_control_deps from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.util import nest +# TODO(mdan): Move this into operators - it represents a function definition. + + class FunctionScope(object): """Context manager that wraps the body of a converted function. @@ -84,8 +88,13 @@ class FunctionScope(object): if self.use_auto_deps: self.autodeps_scope.__exit__(exc_type, exc_val, exc_tb) - def mark_return_value(self, value): + def ret(self, value, did_return): """Marks a value as returned from the function guarded by the scope.""" + del did_return + + if isinstance(value, variables.UndefinedReturnValue): + return None + if self.use_auto_deps: self._return_value_marked = True if value is None: diff --git a/tensorflow/python/autograph/core/function_wrappers_test.py b/tensorflow/python/autograph/core/function_wrappers_test.py index 917a5358633..344ba495570 100644 --- a/tensorflow/python/autograph/core/function_wrappers_test.py +++ b/tensorflow/python/autograph/core/function_wrappers_test.py @@ -46,7 +46,7 @@ class FunctionWrappersTest(test.TestCase): converter.ConversionOptions( optional_features=converter.Feature.AUTO_CONTROL_DEPS)) as scope: v.assign(2) - op = scope.mark_return_value(constant_op.constant(1)) + op = scope.ret(constant_op.constant(1), True) self.evaluate(op) self.assertEqual(self.evaluate(v.read_value()), 2) diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py index 7a7efe3d43a..eeea0aef896 100644 --- a/tensorflow/python/autograph/impl/conversion.py +++ b/tensorflow/python/autograph/impl/conversion.py @@ -36,6 +36,7 @@ from tensorflow.python.autograph.converters import lists from tensorflow.python.autograph.converters import logical_expressions from tensorflow.python.autograph.converters import return_statements from tensorflow.python.autograph.converters import slices +from tensorflow.python.autograph.converters import variables from tensorflow.python.autograph.core import config from tensorflow.python.autograph.core import converter from tensorflow.python.autograph.core import function_wrappers @@ -92,6 +93,7 @@ class AutoGraphTranspiler(transpiler.FunctionTranspiler): node = control_flow.transform(node, ctx) node = conditional_expressions.transform(node, ctx) node = logical_expressions.transform(node, ctx) + node = variables.transform(node, ctx) return node diff --git 
a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD index 6db9e4f8e3b..3851c7b44ba 100644 --- a/tensorflow/python/autograph/operators/BUILD +++ b/tensorflow/python/autograph/operators/BUILD @@ -29,8 +29,7 @@ py_library( "logical.py", "py_builtins.py", "slices.py", - "special_values.py", - "symbols.py", + "variables.py", ], srcs_version = "PY2AND3", visibility = ["//tensorflow:__subpackages__"], @@ -148,19 +147,8 @@ py_test( ) py_test( - name = "special_values_test", - srcs = ["special_values_test.py"], - python_version = "PY3", - srcs_version = "PY2AND3", - deps = [ - ":operators", - "//tensorflow/python:client_testlib", - ], -) - -py_test( - name = "symbols_test", - srcs = ["symbols_test.py"], + name = "variables_test", + srcs = ["variables_test.py"], python_version = "PY3", srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py index 495b6070aae..f7f9078107c 100644 --- a/tensorflow/python/autograph/operators/__init__.py +++ b/tensorflow/python/autograph/operators/__init__.py @@ -60,8 +60,6 @@ from tensorflow.python.autograph.operators.py_builtins import range_ from tensorflow.python.autograph.operators.slices import get_item from tensorflow.python.autograph.operators.slices import GetItemOpts from tensorflow.python.autograph.operators.slices import set_item -from tensorflow.python.autograph.operators.special_values import is_undefined -from tensorflow.python.autograph.operators.special_values import is_undefined_return -from tensorflow.python.autograph.operators.special_values import retval -from tensorflow.python.autograph.operators.special_values import Undefined -from tensorflow.python.autograph.operators.special_values import UndefinedReturnValue +from tensorflow.python.autograph.operators.variables import ld +from tensorflow.python.autograph.operators.variables import Undefined +from tensorflow.python.autograph.operators.variables import UndefinedReturnValue diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py index 48b7971ec16..592281b0ce2 100644 --- a/tensorflow/python/autograph/operators/control_flow.py +++ b/tensorflow/python/autograph/operators/control_flow.py @@ -65,7 +65,7 @@ import traceback import numpy as np from tensorflow.python.autograph.operators import py_builtins -from tensorflow.python.autograph.operators import special_values +from tensorflow.python.autograph.operators import variables from tensorflow.python.autograph.utils import ag_logging from tensorflow.python.autograph.utils import compat_util from tensorflow.python.autograph.utils import misc @@ -103,13 +103,13 @@ def _verify_loop_init_vars(values, symbol_names): for name, value in zip(symbol_names, values): if value is None: raise ValueError('"{}" may not be None before the loop.'.format(name)) - if special_values.is_undefined_return(value): + if isinstance(value, variables.UndefinedReturnValue): # Assumption: the loop will only capture the variable which tracks the # return value if the loop contained a return statement. # TODO(mdan): This should be checked at the place where return occurs. 
raise ValueError( 'return statements are not supported within a TensorFlow loop.') - if special_values.is_undefined(value): + if isinstance(value, variables.Undefined): raise ValueError('"{}" must be defined before the loop.'.format(name)) @@ -495,8 +495,7 @@ def _tf_range_for_stmt( iterate = compat_util.BasicRef(start) def _value_or(name, var, default): - if (name == opts['iterate_names'] - and isinstance(var, special_values.Undefined)): + if (name == opts['iterate_names'] and isinstance(var, variables.Undefined)): return default return var @@ -1019,7 +1018,15 @@ def _wrap_disallow_undefs_from_cond(func, branch_name): results_tuple = results else: results_tuple = results, - undefined = tuple(filter(special_values.is_undefined, results_tuple)) + + for result in results_tuple: + if isinstance(result, variables.UndefinedReturnValue): + raise ValueError( + 'A value must also be returned from the {} branch. If a value is ' + 'returned from one branch of a conditional a value must be ' + 'returned from all branches.'.format(branch_name)) + + undefined = [v for v in results_tuple if isinstance(v, variables.Undefined)] if undefined: raise ValueError( 'The following symbols must also be initialized in the {} branch: {}.' @@ -1027,13 +1034,6 @@ def _wrap_disallow_undefs_from_cond(func, branch_name): ' statement.'.format(branch_name, tuple(s.symbol_name for s in undefined))) - for result in results_tuple: - if special_values.is_undefined_return(result): - raise ValueError( - 'A value must also be returned from the {} branch. If a value is ' - 'returned from one branch of a conditional a value must be ' - 'returned from all branches.'.format(branch_name)) - return results return wrapper diff --git a/tensorflow/python/autograph/operators/control_flow_deprecated_py2.py b/tensorflow/python/autograph/operators/control_flow_deprecated_py2.py index e01a2f206c8..5a900fb19ed 100644 --- a/tensorflow/python/autograph/operators/control_flow_deprecated_py2.py +++ b/tensorflow/python/autograph/operators/control_flow_deprecated_py2.py @@ -66,7 +66,7 @@ import functools import numpy as np from tensorflow.python.autograph.operators import py_builtins -from tensorflow.python.autograph.operators import special_values +from tensorflow.python.autograph.operators import variables from tensorflow.python.autograph.utils import ag_logging from tensorflow.python.autograph.utils import misc from tensorflow.python.autograph.utils import tensors @@ -103,13 +103,13 @@ INEFFICIENT_UNROLL_MIN_OPS = 1 def _disallow_undefs_into_loop(*values): """Ensures that all values in the state are defined when entering a loop.""" - undefined = tuple(filter(special_values.is_undefined, values)) + undefined = [v for v in values if isinstance(v, variables.Undefined)] if undefined: raise ValueError( '{} must be defined before the loop.'.format( ','.join(s.symbol_name for s in undefined))) for value in values: - if special_values.is_undefined_return(value): + if isinstance(value, variables.UndefinedReturnValue): # Assumption: the loop will only capture the variable which tracks the # return value if the loop contained a return statement. # TODO(mdan): This should be checked at the place where return occurs. 
@@ -1129,7 +1129,7 @@ def _wrap_disallow_undefs_from_cond(func, branch_name): results_tuple = results else: results_tuple = results, - undefined = tuple(filter(special_values.is_undefined, results_tuple)) + undefined = [v for v in results_tuple if isinstance(v, variables.Undefined)] if undefined: raise ValueError( 'The following symbols must also be initialized in the {} branch: {}.' @@ -1138,7 +1138,7 @@ def _wrap_disallow_undefs_from_cond(func, branch_name): tuple(s.symbol_name for s in undefined))) for result in results_tuple: - if special_values.is_undefined_return(result): + if isinstance(result, variables.UndefinedReturnValue): raise ValueError( 'A value must also be returned from the {} branch. If a value is ' 'returned from one branch of a conditional a value must be ' diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py index 5f0a9d09bf3..1c4407904b2 100644 --- a/tensorflow/python/autograph/operators/control_flow_test.py +++ b/tensorflow/python/autograph/operators/control_flow_test.py @@ -29,7 +29,7 @@ import numpy as np import six from tensorflow.python.autograph.operators import control_flow -from tensorflow.python.autograph.operators import special_values +from tensorflow.python.autograph.operators import variables as variable_operators from tensorflow.python.autograph.utils import ag_logging from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import def_function @@ -546,7 +546,7 @@ class ForLoopTest(test.TestCase): with self.assertRaisesRegex(ValueError, '"s" may not be None'): self._basic_loop(None, lambda i, s: s) with self.assertRaisesRegex(ValueError, '"s" must be defined'): - self._basic_loop(special_values.Undefined(''), lambda i, s: s) + self._basic_loop(variable_operators.Undefined(''), lambda i, s: s) def test_tensor_none_output(self): with self.assertRaisesRegex(ValueError, '"s" is None at the end'): @@ -785,7 +785,7 @@ class WhileLoopTest(test.TestCase): with self.assertRaisesRegex(ValueError, '"s" may not be None'): self._basic_loop(None, lambda i, s: s) with self.assertRaisesRegex(ValueError, '"s" must be defined'): - self._basic_loop(special_values.Undefined(''), lambda i, s: s) + self._basic_loop(variable_operators.Undefined(''), lambda i, s: s) def test_tensor_none_output(self): with self.assertRaisesRegex(ValueError, '"s" is None at the end'): @@ -887,10 +887,10 @@ class IfStmtTest(test.TestCase): def test_tensor_undefined_output(self): with self.assertRaisesRegex( ValueError, "must also be initialized in the if.*'s'"): - self._basic_cond(lambda: special_values.Undefined('s'), lambda: 1) + self._basic_cond(lambda: variable_operators.Undefined('s'), lambda: 1) with self.assertRaisesRegex( ValueError, "must also be initialized in the else.*'s'"): - self._basic_cond(lambda: 1, lambda: special_values.Undefined('s')) + self._basic_cond(lambda: 1, lambda: variable_operators.Undefined('s')) def test_tensor_dtype_change(self): with self.assertRaisesRegex(TypeError, '"s" has dtype int32.*but.*float32'): diff --git a/tensorflow/python/autograph/operators/symbols.py b/tensorflow/python/autograph/operators/symbols.py deleted file mode 100644 index 0dd7e0a5956..00000000000 --- a/tensorflow/python/autograph/operators/symbols.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Abstract representation of composite symbols that can be used in staging code. - -This provides a way to checkpoint the values of symbols that may be undefined -entering staged control flow. This checkpointing is necessary to prevent some -unintended side-effects. For example checkpointing prevents side-effects in one -branch of a conditional from leaking into another. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.autograph.operators import special_values - - -is_undefined = special_values.is_undefined -Undefined = special_values.Undefined - - -class Symbol(object): - """Representation of a simple or composite Python symbol. - - Subclasses should implement `maybe_compute_value(self)` that returns the value - corresponding to the symbol or Undefined if no such value exists. - """ - - def __init__(self, name): - self.name = name - - -class ValueSymbol(Symbol): - """Representation of a simple Python symbol with a concrete value. - - This includes variables and literals. Since we are reifying undefined symbols - `Undefined` is also a valid value. - """ - - def __init__(self, name, value): - super(ValueSymbol, self).__init__(name) - self.value = value - - def maybe_compute_value(self): - return self.value - - -class AttributeAccessSymbol(Symbol): - """Representation of Python attribute access e.g. `a.b`.""" - - def __init__(self, parent_symbol, attr_name): - super(AttributeAccessSymbol, self).__init__( - parent_symbol.name + '.' + attr_name) - self.attr_name = attr_name - self.parent_symbol = parent_symbol - - def maybe_compute_value(self): - """Compute the value corresponding to the attribute access or `Undefined`. - - This will be `Undefined` if no such value exists either because there is no - such attribute or if the base is itself undefined. - - Returns: - value corresponding to the attribute access or `Undefined` - """ - parent_value = self.parent_symbol.maybe_compute_value() - if (is_undefined(parent_value) or - getattr(parent_value, self.attr_name, None) is None): - return Undefined(self.name) - - return parent_value.__getattribute__(self.attr_name) - - -class SubscriptSymbol(Symbol): - """Representation of Python subscript access e.g. `a[b]`.""" - - def __init__(self, parent_symbol, index_symbol): - super(SubscriptSymbol, self).__init__( - parent_symbol.name + '[' + index_symbol.name + ']') - self.index_symbol = index_symbol - self.parent_symbol = parent_symbol - - def maybe_compute_value(self): - """Compute the value corresponding to the subscript access or `Undefined`. - - This will be `Undefined` if no such value exists either because there is no - element corresponding to the given subscript or if the base itself is - not defined. 
- - Returns: - value corresponding to the subscript access or `Undefined` - """ - parent_value = self.parent_symbol.maybe_compute_value() - index_value = self.index_symbol.maybe_compute_value() - if is_undefined(parent_value) or is_undefined(index_value): - return Undefined(self.name) - - try: - return parent_value[index_value] - except (IndexError, KeyError, TypeError): - # Reify the lack of an object for the given index/key - # This allows us to define them later without regret - return Undefined(self.name) diff --git a/tensorflow/python/autograph/operators/symbols_test.py b/tensorflow/python/autograph/operators/symbols_test.py deleted file mode 100644 index 3acb16273bd..00000000000 --- a/tensorflow/python/autograph/operators/symbols_test.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for special symbol handling.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.autograph.operators import special_values -from tensorflow.python.autograph.operators import symbols -from tensorflow.python.platform import test - -Undefined = special_values.Undefined -AttributeAccessSymbol = symbols.AttributeAccessSymbol -SubscriptSymbol = symbols.SubscriptSymbol -ValueSymbol = symbols.ValueSymbol - - -class SymbolsTest(test.TestCase): - - def test_value_symbol_returns_value(self): - a = 42 - a_symbol = ValueSymbol('a', a) - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(a_symbol.name, 'a') - - def test_attribute_access_missing_attribute(self): - class Foo(object): - pass - a = Foo() - - a_symbol = ValueSymbol('a', a) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a.b') - - def test_attribute_access_undefined_target(self): - a = Undefined('a') - a_symbol = ValueSymbol('a', a) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a.b') - - def test_attribute_access_basic(self): - class Foo(object): - - def __init__(self): - self.b = 'this is an attribute' - - a = Foo() - a_symbol = ValueSymbol('a', a) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(a_b_symbol.maybe_compute_value(), a.b) - - def test_item_access_undefined_index(self): - class Foo(object): - - def __getitem__(self, key): - return 'this is an item' - - a = Foo() - b = Undefined('b') - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, 
b_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a[b]') - - def test_item_access_no_getitem(self): - class Foo(object): - pass - - a = Foo() - b = 42 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a[b]') - - def test_item_access_undefined_root(self): - a = Undefined('a') - b = 42 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a[b]') - - def test_item_access_basic(self): - class Foo(object): - - def __getitem__(self, key): - return 'this is an item' - - a = Foo() - b = 42 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertEqual(a_b_symbol.maybe_compute_value(), a[b]) - - def test_item_access_after_attribute_access(self): - class Foo(object): - - def __getitem__(self, key): - return 'this is an item' - - class Bar(object): - - def __init__(self): - self.b = Foo() - - a = Bar() - c = 42 - a_symbol = ValueSymbol('a', a) - c_symbol = ValueSymbol('c', c) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - a_b_c_symbol = SubscriptSymbol(a_b_symbol, c_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(c_symbol.maybe_compute_value(), c) - self.assertEqual(a_b_symbol.maybe_compute_value(), a.b) - self.assertEqual(a_b_c_symbol.maybe_compute_value(), a.b[c]) - - def test_attribute_access_after_item_access(self): - class Bar(object): - - def __init__(self): - self.c = object() - - item = Bar() - - class Foo(object): - - def __getitem__(self, key): - return item - - a = Foo() - b = 42 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - a_b_c_symbol = AttributeAccessSymbol(a_b_symbol, 'c') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertEqual(a_b_symbol.maybe_compute_value(), a[b]) - self.assertEqual(a_b_c_symbol.maybe_compute_value(), a[b].c) - - def test_item_access_after_item_access(self): - class Bar(object): - - def __getitem__(self, key): - return 'this is an item' - - item = Bar() - - class Foo(object): - - def __getitem__(self, key): - return item - - a = Foo() - b = 42 - c = 43 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - c_symbol = ValueSymbol('b', c) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - a_b_c_symbol = SubscriptSymbol(a_b_symbol, c_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertEqual(a_b_symbol.maybe_compute_value(), a[b]) - self.assertEqual(a_b_c_symbol.maybe_compute_value(), a[b][c]) - - def 
test_attribute_access_after_attribute_access(self): - class Bar(object): - - def __init__(self): - self.c = object() - - class Foo(object): - - def __init__(self): - self.b = Bar() - - a = Foo() - a_symbol = ValueSymbol('a', a) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - a_b_c_symbol = AttributeAccessSymbol(a_b_symbol, 'c') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(a_b_symbol.maybe_compute_value(), a.b) - self.assertEqual(a_b_c_symbol.maybe_compute_value(), a.b.c) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/python/autograph/operators/special_values.py b/tensorflow/python/autograph/operators/variables.py similarity index 72% rename from tensorflow/python/autograph/operators/special_values.py rename to tensorflow/python/autograph/operators/variables.py index c172cce23f1..150f64e1758 100644 --- a/tensorflow/python/autograph/operators/special_values.py +++ b/tensorflow/python/autograph/operators/variables.py @@ -19,6 +19,13 @@ from __future__ import division from __future__ import print_function +def ld(v): + """Load variable operator.""" + if isinstance(v, Undefined): + return v.read() + return v + + class Undefined(object): """Represents an undefined symbol in Python. @@ -51,6 +58,10 @@ class Undefined(object): def __init__(self, symbol_name): self.symbol_name = symbol_name + def read(self): + raise UnboundLocalError("'{}' is used before assignment".format( + self.symbol_name)) + def __repr__(self): return self.symbol_name @@ -66,34 +77,7 @@ class Undefined(object): return self -def is_undefined(value): - """Checks whether Autograph has determined that a given value is undefined. - - This only works in places where Autograph reifies undefined symbols. Note that - if this function is passed a truly undefined symbol the call-site will raise - NameError. - - Args: - value: value to test for undefinedness - Returns: - Boolean, whether the input value is undefined. - """ - return isinstance(value, Undefined) - - # TODO(mdan): Refactor as a RetVal object, aggregating the value and do_return. 
class UndefinedReturnValue(object): - """Represents a default return value from a function (None in Python).""" + """Represents a return value that is undefined.""" pass - - -def retval(value): - """Returns the actual value that a return statement should produce.""" - if isinstance(value, UndefinedReturnValue): - return None - return value - - -def is_undefined_return(value): - """Checks whether `value` is the default return value.""" - return isinstance(value, UndefinedReturnValue) diff --git a/tensorflow/python/autograph/operators/special_values_test.py b/tensorflow/python/autograph/operators/variables_test.py similarity index 58% rename from tensorflow/python/autograph/operators/special_values_test.py rename to tensorflow/python/autograph/operators/variables_test.py index 1742cc4277d..168e6172232 100644 --- a/tensorflow/python/autograph/operators/special_values_test.py +++ b/tensorflow/python/autograph/operators/variables_test.py @@ -18,28 +18,38 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.autograph.operators import special_values +from tensorflow.python.autograph.operators import variables from tensorflow.python.platform import test class SpecialValuesTest(test.TestCase): def test_undefined(self): - undefined_symbol = special_values.Undefined('name') - self.assertEqual(undefined_symbol.symbol_name, 'name') + undefined_symbol = variables.Undefined('name') + undefined_symbol2 = variables.Undefined('name') - undefined_symbol2 = special_values.Undefined('name') + self.assertEqual(undefined_symbol.symbol_name, 'name') + self.assertEqual(undefined_symbol2.symbol_name, 'name') self.assertNotEqual(undefined_symbol, undefined_symbol2) - self.assertTrue(special_values.is_undefined(undefined_symbol)) - self.assertTrue(special_values.is_undefined(undefined_symbol2)) - def test_undefined_operations(self): - undefined_symbol = special_values.Undefined('name') + undefined_symbol = variables.Undefined('name') + + self.assertIsInstance(undefined_symbol.foo, variables.Undefined) + self.assertIsInstance(undefined_symbol[0], variables.Undefined) + self.assertNotIsInstance(undefined_symbol.__class__, variables.Undefined) + + def test_read(self): + self.assertEqual(variables.ld(1), 1) + o = object() + self.assertEqual(variables.ld(o), o) + + self.assertIsNone(variables.ld(None)) + + def test_read_undefined(self): + with self.assertRaisesRegex(UnboundLocalError, 'used before assignment'): + variables.ld(variables.Undefined('a')) - self.assertTrue(special_values.is_undefined(undefined_symbol.foo)) - self.assertTrue(special_values.is_undefined(undefined_symbol[0])) - self.assertFalse(special_values.is_undefined(undefined_symbol.__class__)) if __name__ == '__main__': test.main() diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_py3_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_py3_test.py index 7333ec0c872..ba27280f729 100644 --- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_py3_test.py +++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_py3_test.py @@ -78,6 +78,18 @@ class ReachingDefinitionsAnalyzerTest( self.assertSameDef(local_body[1].test, local_body[2].value.elts[0]) + # Note: the function name is is visible inside the function body. But it's + # a closure variable, not a local. + # + # Example: + # + # >>> def f(): + # ... 
print(f) + # >>> g = f + # >>> f = 'something else' + # >>> g() + # something else + # self.assertHasDefinedIn(local_body[1], ('a', 'b')) diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py index c4e7cbd4d17..64b00fcbeba 100644 --- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py +++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py @@ -255,6 +255,9 @@ class ReachingDefinitionsAnalyzerTest(ReachingDefinitionsAnalyzerTestBase): inner_fn_body = fn_body[1].body[1].body def_of_a_in_foo = inner_fn_body[0].value + # Even though `a` is visible in the inner functio above, the late binding + # makes it impossible to assume that the same value will be visible at + # call time. self.assertHasDefs(def_of_a_in_foo, 0) def test_nested_functions_isolation(self): From 7e39134874fb8315ea941c661f32394eaf667c3b Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 13 May 2020 15:05:25 -0700 Subject: [PATCH 0528/1533] [Grappler] Do not add control edges from placeholder inputs in function inlining PiperOrigin-RevId: 311412339 Change-Id: Ie40c0c44f1d6b42b53c259f8ad92d171577cd9c7 --- tensorflow/core/grappler/optimizers/function_optimizer.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index eaccff3b127..ed3af955c13 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -1122,7 +1122,15 @@ void AddStrictInputSemantics(Node* caller, Graph* g) { VLOG(3) << "Add control edges from all data inputs to enforce strict " "semantics with regard to function inputs"; + + // Do not add control edges from placeholders, because it will prevent + // pruning, and they can't produce any side effects anyway. + const auto is_placeholder = [](const Node* node) -> bool { + return node->type_string() == "Placeholder"; + }; + for (const Node* node : data_inputs) { + if (is_placeholder(node)) continue; g->AddControlEdge(g->FindNodeId(node->id()), caller, /*allow_duplicates=*/true); } From e1b0e64119a082bda7ac0125c59b970d7eac54f1 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 13 May 2020 15:33:52 -0700 Subject: [PATCH 0529/1533] Export RandomZoom after its odd behavior was fixed. 
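A minimal usage sketch of the layer being exported here, assuming the `tf.keras.layers.experimental.preprocessing` path added in the diff below; with `width_factor` left at its new default of `None`, both dimensions are zoomed together so the aspect ratio is preserved:

import numpy as np
import tensorflow as tf

images = np.random.random((4, 64, 64, 3)).astype("float32")
# height_factor=0.3, width_factor=None (default): randomly zoom in or out by
# up to 30%, applying the same factor to both axes to keep the aspect ratio.
layer = tf.keras.layers.experimental.preprocessing.RandomZoom(0.3)
augmented = layer(images, training=True)
print(augmented.shape)  # (4, 64, 64, 3)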
PiperOrigin-RevId: 311417546 Change-Id: Idb5bcff8b97a1bba1ab054a19ad0a701cf04cc00 --- .../preprocessing/image_preprocessing.py | 45 ++-- .../preprocessing/image_preprocessing_test.py | 22 +- ...erimental.preprocessing.-random-zoom.pbtxt | 218 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + ...erimental.preprocessing.-random-zoom.pbtxt | 218 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + 6 files changed, 495 insertions(+), 16 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py index 05a6e84e6cc..832915dac68 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py @@ -827,6 +827,7 @@ class RandomRotation(Layer): return dict(list(base_config.items()) + list(config.items())) +@keras_export('keras.layers.experimental.preprocessing.RandomZoom') class RandomZoom(Layer): """Randomly zoom each image during training. @@ -847,7 +848,8 @@ class RandomZoom(Layer): For instance, `width_factor=(0.2, 0.3)` result in an output zooming out between 20% to 30%. `width_factor=(-0.3, -0.2)` result in an output zooming in between 20% - to 30%. + to 30%. Defaults to `None`, i.e., zooming vertical and horizontal + directions by preserving the aspect ratio. fill_mode: Points outside the boundaries of the input are filled according to the given mode (one of `{'constant', 'reflect', 'wrap'}`). - *reflect*: `(d c b a | a b c d | d c b a)` @@ -860,6 +862,14 @@ class RandomZoom(Layer): seed: Integer. Used to create a random seed. name: A string, the name of the layer. + Example: + + >>> input_img = np.random.random((32, 224, 224, 3)) + >>> layer = tf.keras.layers.experimental.preprocessing.RandomZoom(.5, .2) + >>> out_img = layer(input_img) + >>> out_img.shape + TensorShape([32, 224, 224, 3]) + Input shape: 4D tensor with shape: `(samples, height, width, channels)`, data_format='channels_last'. @@ -873,9 +883,10 @@ class RandomZoom(Layer): negative. """ + # TODO(b/156526279): Add `fill_value` argument. def __init__(self, height_factor, - width_factor, + width_factor=None, fill_mode='reflect', interpolation='bilinear', seed=None, @@ -894,16 +905,17 @@ class RandomZoom(Layer): 'got {}'.format(height_factor)) self.width_factor = width_factor - if isinstance(width_factor, (tuple, list)): - self.width_lower = width_factor[0] - self.width_upper = width_factor[1] - else: - self.width_lower = -width_factor - self.width_upper = width_factor + if width_factor is not None: + if isinstance(width_factor, (tuple, list)): + self.width_lower = width_factor[0] + self.width_upper = width_factor[1] + else: + self.width_lower = -width_factor # pylint: disable=invalid-unary-operand-type + self.width_upper = width_factor - if self.width_lower < -1. or self.width_upper < -1.: - raise ValueError('`width_factor` must have values larger than -1, ' - 'got {}'.format(width_factor)) + if self.width_lower < -1. 
or self.width_upper < -1.: + raise ValueError('`width_factor` must have values larger than -1, ' + 'got {}'.format(width_factor)) check_fill_mode_and_interpolation(fill_mode, interpolation) @@ -928,10 +940,13 @@ class RandomZoom(Layer): shape=[batch_size, 1], minval=1. + self.height_lower, maxval=1. + self.height_upper) - width_zoom = self._rng.uniform( - shape=[batch_size, 1], - minval=1. + self.width_lower, - maxval=1. + self.width_upper) + if self.width_factor is not None: + width_zoom = self._rng.uniform( + shape=[batch_size, 1], + minval=1. + self.width_lower, + maxval=1. + self.width_upper) + else: + width_zoom = height_zoom zooms = math_ops.cast( array_ops.concat([width_zoom, height_zoom], axis=1), dtype=dtypes.float32) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 28c9955c9dd..38d2d25916a 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -1021,7 +1021,27 @@ class RandomZoomTest(keras_parameterized.TestCase): for dtype in (np.int64, np.float32): with tf_test_util.use_gpu(): input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype) - layer = image_preprocessing.RandomZoom((.5, .5), (.5, .5), + layer = image_preprocessing.RandomZoom((.5, .5), (.8, .8), + fill_mode='constant', + interpolation='nearest') + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([ + [0, 0, 0, 0, 0], + [0, 5, 7, 9, 0], + [0, 10, 12, 14, 0], + [0, 20, 22, 24, 0], + [0, 0, 0, 0, 0] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_zoom_out_numeric_preserve_aspect_ratio(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype) + layer = image_preprocessing.RandomZoom((.5, .5), fill_mode='constant', interpolation='nearest') output_image = layer(np.expand_dims(input_image, axis=0)) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt new file mode 100644 index 00000000000..85850223bcb --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt @@ -0,0 +1,218 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.RandomZoom" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: 
"" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'height_factor\', \'width_factor\', \'fill_mode\', \'interpolation\', \'seed\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'reflect\', \'bilinear\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + 
name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index 20e5ca1af9c..0964922ea26 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -44,6 +44,10 @@ tf_module { name: "RandomWidth" mtype: "" } + member { + name: "RandomZoom" + mtype: "" + } member { name: "Rescaling" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt new file mode 100644 index 00000000000..85850223bcb --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt @@ -0,0 +1,218 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.RandomZoom" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'height_factor\', \'width_factor\', \'fill_mode\', \'interpolation\', \'seed\', \'name\'], varargs=None, keywords=kwargs, 
defaults=[\'None\', \'reflect\', \'bilinear\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], 
varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index 20e5ca1af9c..0964922ea26 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -44,6 +44,10 @@ tf_module { name: "RandomWidth" mtype: "" } + member { + name: "RandomZoom" + mtype: "" + } member { name: "Rescaling" mtype: "" From 0ac3572e8de360a0f91a186228fe9de16c92a8cf Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Wed, 13 May 2020 16:07:02 -0700 Subject: [PATCH 0530/1533] Make SerializeRemoteTensorHandle block only when the remote op is a function, in order to still benefit from async execution. PiperOrigin-RevId: 311423473 Change-Id: I87a3973ddf1954facb69c14499ce2fa07a9d6e99 --- tensorflow/c/eager/c_api_remote_test.cc | 22 +++++++++++++++++++ .../core/common_runtime/eager/execute.cc | 10 +++++++-- .../core/common_runtime/eager/execute_node.cc | 16 ++++++++++---- .../common_runtime/eager/tensor_handle.cc | 9 ++++---- .../core/common_runtime/eager/tensor_handle.h | 9 ++++---- .../eager/eager_service_impl_test.cc | 5 +++-- .../eager/remote_copy_node.cc | 6 +++-- .../distributed_runtime/eager/remote_mgr.cc | 15 ++++++++----- .../distributed_runtime/eager/remote_mgr.h | 9 +++++--- .../eager/remote_mgr_test.cc | 6 +++-- .../eager/remote_tensor_handle_data.cc | 9 +++++--- .../eager/remote_tensor_handle_data.h | 7 +++--- 12 files changed, 88 insertions(+), 35 deletions(-) diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 9dc18c7a6f1..544dffb664c 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -434,6 +434,22 @@ string AddVariablesFunction() { return def.SerializeAsString(); } +void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "VarIsInitializedOp", status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(op, var_handle, status); + TFE_TensorHandle* is_initialized[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(op, &is_initialized[0], &num_retvals, status); + CHECK_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(is_initialized[0], status); + bool initialized = false; + memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); + EXPECT_EQ(initialized, true); + delete status; +} + void TestFunctionWithPackedInput(const bool remote) { tensorflow::ServerDef server_def = GetServerDef(3); @@ -474,6 +490,12 @@ void TestFunctionWithPackedInput(const bool remote) { TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task1_name); TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task2_name); + // Add a sync point in order to make sure that variables have been initialized + // before the function execution starts. + // TODO(b/155789951): Remove once b/155789951 is fixed. + VarIsInitialized(ctx, h1); + VarIsInitialized(ctx, h2); + // Pack 3 variable handles into one TFE_TensorHandle. 
int num_replicas = 3; std::vector handles = {h0, h1, h2}; diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index f6b4370bbdc..f23b0fa7877 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -782,9 +782,15 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, } } auto* input_handle = remote_op->add_op_inputs()->mutable_remote_handle(); + // For a multi-device function, a remote RunComponentFunction request is + // not sent through StreamingEnqueueAsync. It could arrive at a remote + // worker before a remote execution request which produces an input of the + // component function. So we wait until the remote input is ready before + // serializing it. + const bool wait_until_ready = op->is_function(); TF_RETURN_IF_ERROR(ctx.RemoteMgr()->SerializeRemoteTensorHandle( - input, input_handle, input_device, *input_device_name, - serialize_resource_dtype_and_shape)); + input, wait_until_ready, input_handle, input_device, + *input_device_name, serialize_resource_dtype_and_shape)); if (!input_handle->resource_dtypes_and_shapes().empty()) { TF_RETURN_IF_ERROR( input->AddResourceShapeMirror(op_device, input_handle->op_id(), diff --git a/tensorflow/core/common_runtime/eager/execute_node.cc b/tensorflow/core/common_runtime/eager/execute_node.cc index 3197d3e0ac7..27503cfd99d 100644 --- a/tensorflow/core/common_runtime/eager/execute_node.cc +++ b/tensorflow/core/common_runtime/eager/execute_node.cc @@ -97,9 +97,11 @@ Status ExecuteNodeArgs::Init( #if !defined(IS_MOBILE_PLATFORM) if (has_remote_inputs_) { + const bool is_function = kernel->IsFunction(); serialize_remote_handle_ = - [ctx, &op_inputs](const FunctionArgIndex& index, - eager::RemoteTensorHandle* handle) -> Status { + [ctx, &op_inputs, is_function]( + const FunctionArgIndex& index, + eager::RemoteTensorHandle* handle) -> Status { TensorHandle* h = op_inputs[index.index]; if (op_inputs[index.index]->Type() == TensorHandle::PACKED) { TF_RETURN_IF_ERROR( @@ -112,8 +114,14 @@ Status ExecuteNodeArgs::Init( "together."); } Device* device = absl::get(variant_device); - return ctx->RemoteMgr()->SerializeRemoteTensorHandle(h, handle, device, - device->name()); + // For a multi-device function, a remote RunComponentFunction request is + // not sent through StreamingEnqueueAsync. It could arrive at a remote + // worker before a remote execution request which produces an input of the + // component function. So we wait until the remote input is ready before + // serializing it. 
+ const bool wait_util_ready = is_function; + return ctx->RemoteMgr()->SerializeRemoteTensorHandle( + h, wait_util_ready, handle, device, device->name()); }; } #endif // !IS_MOBILE_PLATFORM diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index 49fa69e2185..dbfc5639017 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -705,8 +705,8 @@ Status TensorHandle::AddEmptyLocalMirror(const Device* d) { } #if !defined(IS_MOBILE_PLATFORM) -Status TensorHandle::RemoteAddressUntilReady(const Device* d, int64* op_id, - int32* output_num) const { +Status TensorHandle::RemoteAddress(const Device* d, const bool wait_until_ready, + int64* op_id, int32* output_num) const { DVLOG(3) << "RemoteAddress on TensorHandle: " << this << " device: " << d << " " << d->name(); @@ -714,7 +714,8 @@ Status TensorHandle::RemoteAddressUntilReady(const Device* d, int64* op_id, tf_shared_lock l(mu_); auto mirror = remote_mirrors_.find(d->name()); if (mirror != remote_mirrors_.end()) { - return mirror->second.OpIdAndOutputNumUntilReady(op_id, output_num); + return mirror->second.OpIdAndOutputNum(wait_until_ready, op_id, + output_num); } return errors::FailedPrecondition( @@ -726,7 +727,7 @@ Status TensorHandle::RemoteAddressUntilReady(const Device* d, int64* op_id, } auto& data = absl::get(data_); - return data.OpIdAndOutputNumUntilReady(op_id, output_num); + return data.OpIdAndOutputNum(wait_until_ready, op_id, output_num); } bool TensorHandle::HasRemoteMirror(const Device* d, diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 6f9ee565c73..5e7638ae03c 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -168,10 +168,11 @@ class TensorHandle : public AbstractTensorHandleInterface, Status AddResourceShapeMirror(const Device* d, int64 op_id, int output_num, EagerContext* ctx); - // Return the op_id and output num if the handle refers to a remote tensor; - // and blocks until the remote tensor is ready on the given remote worker. - Status RemoteAddressUntilReady(const Device* d, int64* op_id, - int32* output_num) const; + // Return the op_id and output num if the handle refers to a remote tensor. + // If wait_until_ready is true, block until the remote tensor is ready on the + // given remote worker. + Status RemoteAddress(const Device* d, const bool wait_until_ready, + int64* op_id, int32* output_num) const; // Called on an async remote tensor once it's shape has been determined. 
This // transitions the tensor handle from a non-ready to a ready state by diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 23bf324b80f..46a6181cfa9 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -970,8 +970,9 @@ TEST_F(EagerServiceImplTest, SendPackedHandleTest) { EXPECT_EQ(handle2->op_device()->name(), device2); int64 op_id; int32 output_num; - TF_ASSERT_OK(handle2->RemoteAddressUntilReady( - absl::get(handle2->device()), &op_id, &output_num)); + TF_ASSERT_OK(handle2->RemoteAddress(absl::get(handle2->device()), + /*wait_until_ready=*/true, &op_id, + &output_num)); EXPECT_EQ(op_id, 2); EXPECT_EQ(output_num, 5); diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index 5d0793b258c..090417863f3 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -147,7 +147,8 @@ void RemoteCopyNode::StartSend() { request.set_context_id(ctx_->GetContextId()); auto* remote_op = request.add_queue()->mutable_operation(); status = ctx_->RemoteMgr()->SerializeRemoteTensorHandle( - src_, remote_op->add_op_inputs()->mutable_remote_handle(), + src_, /*wait_until_ready=*/false, + remote_op->add_op_inputs()->mutable_remote_handle(), absl::get(src_->device()), absl::get(src_->DeviceOrHostCPU(*ctx_))->name()); if (!status.ok()) { @@ -316,7 +317,8 @@ Status SerializePackedHandle(const uint64 op_id, TensorHandle* packed_handle, (i == 0) && (h->dtype == DT_RESOURCE) && (ctx->OnSameTask(src_device, target_device)); TF_RETURN_IF_ERROR(ctx->RemoteMgr()->SerializeRemoteTensorHandle( - h, op->add_handles()->mutable_remote_handle(), src_device, + h, /*wait_until_ready=*/false, + op->add_handles()->mutable_remote_handle(), src_device, absl::get(h->DeviceOrHostCPU(*ctx))->name(), serialize_resource_dtype_and_shape)); } else { diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc index 7c5115d33ef..94a4f199337 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc @@ -74,6 +74,7 @@ Status RemoteMgr::GetMirroredResourceShape( } Status RemoteMgr::GetRemoteTensorHandle(const tensorflow::TensorHandle* handle, + const bool wait_until_ready, int64* op_id, int32* output_num) { // TODO(allenl): Consider supporting remote handles on custom devices. 
VariantDevice device = handle->device(); @@ -82,8 +83,8 @@ Status RemoteMgr::GetRemoteTensorHandle(const tensorflow::TensorHandle* handle, "Custom devices and remote execution are currently not supported " "together."); } - TF_RETURN_IF_ERROR(handle->RemoteAddressUntilReady(absl::get(device), - op_id, output_num)); + TF_RETURN_IF_ERROR(handle->RemoteAddress( + absl::get(device), wait_until_ready, op_id, output_num)); tensorflow::TensorHandle* h; TF_RETURN_IF_ERROR( GetTensorHandleImpl(RemoteTensorHandleInternal(*op_id, *output_num), &h)); @@ -120,13 +121,15 @@ Status RemoteMgr::DeleteTensorHandle( } Status RemoteMgr::SerializeRemoteTensorHandle( - TensorHandle* in, RemoteTensorHandle* out, Device* device, - const string& device_name, const bool serialize_resource_dtype_and_shape) { + TensorHandle* in, const bool wait_until_ready, RemoteTensorHandle* out, + Device* device, const string& device_name, + const bool serialize_resource_dtype_and_shape) { int64 op_id; int32 output_num; - if (!in->RemoteAddressUntilReady(device, &op_id, &output_num).ok()) { + if (!in->RemoteAddress(device, wait_until_ready, &op_id, &output_num).ok()) { tf_shared_lock l(remote_tensor_handle_mu_); - TF_RETURN_IF_ERROR(GetRemoteTensorHandle(in, &op_id, &output_num)); + TF_RETURN_IF_ERROR( + GetRemoteTensorHandle(in, wait_until_ready, &op_id, &output_num)); } out->Clear(); out->set_op_id(op_id); diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.h b/tensorflow/core/distributed_runtime/eager/remote_mgr.h index 54c987d4daa..2446352c931 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr.h +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.h @@ -61,9 +61,11 @@ class RemoteMgr { } // Serialize a remote TensorHandle to a RemoteTensorHandle. + // If wait_until_ready is true, block until the remote handle is ready on a + // remote worker. Status SerializeRemoteTensorHandle( - TensorHandle* in, RemoteTensorHandle* out, Device* device, - const string& device_name, + TensorHandle* in, const bool wait_until_ready, RemoteTensorHandle* out, + Device* device, const string& device_name, const bool serialize_resource_dtype_and_shape = false); // Deserialize a RemoteTensorHandle to a TensorHandle(local/remote). @@ -83,7 +85,8 @@ class RemoteMgr { // Returns the op_id and output_num if the given local TensorHandle exists in // remote_tensor_handle_map_. 
Status GetRemoteTensorHandle(const tensorflow::TensorHandle* handle, - int64* op_id, int32* output_num) + const bool wait_until_ready, int64* op_id, + int32* output_num) TF_SHARED_LOCKS_REQUIRED(remote_tensor_handle_mu_); Status GetTensorHandleImpl(const RemoteTensorHandleInternal& remote_handle, diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc index e4cf6277c5a..1e33a9d0f62 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc @@ -81,7 +81,8 @@ TEST_F(RemoteMgrTest, SerializeLocalTensorHandleWithRemoteMirror) { handle->SetRemoteShape(shape, remote_device_, ctx_->GetContextViewId())); RemoteTensorHandle remote_handle; TF_ASSERT_OK(remote_mgr.SerializeRemoteTensorHandle( - handle, &remote_handle, remote_device_, remote_device_->name())); + handle, /*wait_until_ready=*/true, &remote_handle, remote_device_, + remote_device_->name())); EXPECT_EQ(op_id, remote_handle.op_id()); EXPECT_EQ(output_num, remote_handle.output_num()); EXPECT_EQ(remote_device_->name(), remote_handle.device()); @@ -97,7 +98,8 @@ TEST_F(RemoteMgrTest, SerializeRemoteTensorHandle) { op_id, output_num, DT_FLOAT, remote_device_, ctx_); RemoteTensorHandle remote_handle; TF_ASSERT_OK(remote_mgr.SerializeRemoteTensorHandle( - handle, &remote_handle, remote_device_, remote_device_->name())); + handle, /*wait_until_ready=*/true, &remote_handle, remote_device_, + remote_device_->name())); EXPECT_EQ(op_id, remote_handle.op_id()); EXPECT_EQ(output_num, remote_handle.output_num()); EXPECT_EQ(remote_device_->name(), remote_handle.device()); diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc index 6cdf6b196a2..6f4d5ada759 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc @@ -194,9 +194,12 @@ string RemoteTensorHandleData::DebugString() const { " output_num: ", output_num_); } -Status RemoteTensorHandleData::OpIdAndOutputNumUntilReady( - int64* op_id, int32* output_num) const { - TF_RETURN_IF_ERROR(WaitReady("OpIdAndOutputNumUntilReady")); +Status RemoteTensorHandleData::OpIdAndOutputNum(const bool wait_util_ready, + int64* op_id, + int32* output_num) const { + if (wait_util_ready) { + TF_RETURN_IF_ERROR(WaitReady("OpIdAndOutputNumUntilReady")); + } *op_id = op_id_; *output_num = output_num_; return Status::OK(); diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h index 37ad5e721b6..5f096677225 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h @@ -50,9 +50,10 @@ class RemoteTensorHandleData { string DebugString() const; - // Block until the remote tensor is ready on a remote worker and return the op - // id and output num. - Status OpIdAndOutputNumUntilReady(int64* op_id, int32* output_num) const; + // Return the op id and output num. If wait_util_ready is true, block until + // the remote tensor is ready on a remote worker. 
+ Status OpIdAndOutputNum(const bool wait_util_ready, int64* op_id, + int32* output_num) const; uint64 context_view_id() const { return context_view_id_; } From 0c9e56e931ba86dc67ee76e1af9d900e42825a85 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 13 May 2020 16:08:29 -0700 Subject: [PATCH 0531/1533] Update attr name for Dense version Bincount. PiperOrigin-RevId: 311423709 Change-Id: Ief7c901477be8e06b1d3f98613c7390c12e9680b --- .../base_api/api_def_DenseBincount.pbtxt | 2 +- .../base_api/api_def_RaggedBincount.pbtxt | 2 +- .../base_api/api_def_SparseBincount.pbtxt | 2 +- tensorflow/core/kernels/bincount_op.cc | 28 +++++++++---------- tensorflow/core/ops/math_ops.cc | 6 ++-- .../python/kernel_tests/bincount_op_test.py | 16 +++++------ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 ++-- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 ++-- 8 files changed, 34 insertions(+), 34 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DenseBincount.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseBincount.pbtxt index 3f9ec2761a1..11043899ba4 100644 --- a/tensorflow/core/api_def/base_api/api_def_DenseBincount.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DenseBincount.pbtxt @@ -28,7 +28,7 @@ The counts or summed weights for each value in the range [0, size). END } attr { - name: "binary_count" + name: "binary_output" description: < { } }; -template -struct BincountReduceFunctor { +template +struct BincountReduceFunctor { static Status Compute(OpKernelContext* context, const typename TTypes::ConstTensor& in, const typename TTypes::ConstTensor& weights, @@ -148,7 +148,7 @@ struct BincountReduceFunctor { for (int64 j = 0; j < num_cols; ++j) { Tidx value = in(i, j); if (value < num_bins) { - if (binary_count) { + if (binary_output) { out(i, value) = T(1); } else { if (weights.size()) { @@ -221,7 +221,7 @@ template class DenseBincountOp : public OpKernel { public: explicit DenseBincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* ctx) override { @@ -240,7 +240,7 @@ class DenseBincountOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({size}), &out_t)); auto out = out_t->flat(); fill(ctx->eigen_device(), out); - if (binary_count_) { + if (binary_output_) { OP_REQUIRES_OK( ctx, functor::BincountFunctor::Compute( ctx, data.flat(), weights.flat(), out, size)); @@ -259,7 +259,7 @@ class DenseBincountOp : public OpKernel { ctx, ctx->allocate_output(0, TensorShape({num_rows, size}), &out_t)); auto out = out_t->matrix(); fill(ctx->eigen_device(), out_t->flat()); - if (binary_count_) { + if (binary_output_) { OP_REQUIRES_OK( ctx, functor::BincountReduceFunctor::Compute( ctx, data.matrix(), weight_matrix, out, size)); @@ -273,7 +273,7 @@ class DenseBincountOp : public OpKernel { } private: - bool binary_count_; + bool binary_output_; }; #define REGISTER_KERNELS(Tidx, T) \ @@ -314,7 +314,7 @@ template class SparseBincountOp : public OpKernel { public: explicit SparseBincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* ctx) override { @@ -338,7 +338,7 @@ class SparseBincountOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({size}), &out_t)); auto out = out_t->flat(); 
fill(ctx->eigen_device(), out); - if (binary_count_) { + if (binary_output_) { OP_REQUIRES_OK(ctx, functor::BincountFunctor::Compute( ctx, values, weights, out, size)); @@ -359,7 +359,7 @@ class SparseBincountOp : public OpKernel { const int64 batch = indices_mat(i, 0); const Tidx bin = values(i); if (bin < size) { - if (binary_count_) { + if (binary_output_) { out(batch, bin) = T(1); } else { if (weights_size) { @@ -374,7 +374,7 @@ class SparseBincountOp : public OpKernel { } private: - bool binary_count_; + bool binary_output_; }; #define REGISTER_KERNELS(Tidx, T) \ @@ -395,7 +395,7 @@ template class RaggedBincountOp : public OpKernel { public: explicit RaggedBincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* ctx) override { @@ -429,7 +429,7 @@ class RaggedBincountOp : public OpKernel { OP_REQUIRES(ctx, bin >= 0, errors::InvalidArgument("Input must be non-negative")); if (bin < size) { - if (binary_count_) { + if (binary_output_) { out(batch_idx - 1, bin) = T(1); } else { T value = (weights_size > 0) ? weights(idx) : T(1); @@ -440,7 +440,7 @@ class RaggedBincountOp : public OpKernel { } private: - bool binary_count_; + bool binary_output_; }; #define REGISTER_KERNELS(Tidx, T) \ diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 7ac003379d4..cbf03d7b045 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1657,7 +1657,7 @@ REGISTER_OP("DenseBincount") .Input("weights: T") .Attr("Tidx: {int32, int64}") .Attr("T: {int32, int64, float32, float64}") - .Attr("binary_count: bool = false") + .Attr("binary_output: bool = false") .Output("output: T") .SetShapeFn([](InferenceContext* c) { ShapeHandle unused; @@ -1704,7 +1704,7 @@ REGISTER_OP("SparseBincount") .Input("weights: T") .Attr("Tidx: {int32, int64}") .Attr("T: {int32, int64, float32, float64}") - .Attr("binary_count: bool = false") + .Attr("binary_output: bool = false") .Output("output: T") .SetShapeFn([](InferenceContext* c) { const Tensor* size_tensor = c->input_tensor(3); @@ -1754,7 +1754,7 @@ REGISTER_OP("RaggedBincount") .Input("weights: T") .Attr("Tidx: {int32, int64}") .Attr("T: {int32, int64, float32, float64}") - .Attr("binary_count: bool = false") + .Attr("binary_output: bool = false") .Output("output: T") .SetShapeFn([](InferenceContext* c) { c->set_output(0, c->UnknownShape()); diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py index 4178e1203e2..222716dfdfa 100644 --- a/tensorflow/python/kernel_tests/bincount_op_test.py +++ b/tensorflow/python/kernel_tests/bincount_op_test.py @@ -183,7 +183,7 @@ class BincountOpTest(test_util.TensorFlowTestCase, parameterized.TestCase): np_out, self.evaluate( gen_math_ops.dense_bincount( - input=inp, weights=[], size=size, binary_count=True))) + input=inp, weights=[], size=size, binary_output=True))) @parameterized.parameters([{ "dtype": np.int32, @@ -201,7 +201,7 @@ class BincountOpTest(test_util.TensorFlowTestCase, parameterized.TestCase): np_out, self.evaluate( gen_math_ops.dense_bincount( - input=inp, weights=np_weight, size=size, binary_count=True))) + input=inp, weights=np_weight, size=size, binary_output=True))) def _test_bincount_col_count(self, num_rows, num_cols, size, dtype): np.random.seed(42) @@ -230,7 +230,7 @@ class BincountOpTest(test_util.TensorFlowTestCase, 
parameterized.TestCase): np_out, self.evaluate( gen_math_ops.dense_bincount( - input=inp, weights=[], size=size, binary_count=True))) + input=inp, weights=[], size=size, binary_output=True))) def _test_bincount_col_count_with_weights(self, num_rows, num_cols, size, dtype): @@ -401,7 +401,7 @@ class SparseBincountOpTest(test_util.TensorFlowTestCase, dense_shape=[num_rows], size=size, weights=[], - binary_count=True))) + binary_output=True))) @parameterized.parameters([{ "dtype": np.int32, @@ -427,7 +427,7 @@ class SparseBincountOpTest(test_util.TensorFlowTestCase, dense_shape=[num_rows], size=size, weights=inp_weight, - binary_count=True))) + binary_output=True))) @parameterized.parameters([{ "dtype": np.int32, @@ -490,7 +490,7 @@ class SparseBincountOpTest(test_util.TensorFlowTestCase, dense_shape=inp_sparse.dense_shape, size=size, weights=[], - binary_count=True))) + binary_output=True))) class RaggedBincountOpTest(test_util.TensorFlowTestCase, @@ -530,7 +530,7 @@ class RaggedBincountOpTest(test_util.TensorFlowTestCase, values=x.values, weights=[], size=6, - binary_count=True))) + binary_output=True))) @parameterized.parameters([{ "dtype": np.int32, @@ -629,7 +629,7 @@ class RaggedBincountOpTest(test_util.TensorFlowTestCase, values=x.values, weights=[], size=size, - binary_count=True))) + binary_output=True))) if __name__ == "__main__": diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index e622768979c..05b8842be66 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1074,7 +1074,7 @@ tf_module { } member_method { name: "DenseBincount" - argspec: "args=[\'input\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'input\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "DenseCountSparseOutput" @@ -3070,7 +3070,7 @@ tf_module { } member_method { name: "RaggedBincount" - argspec: "args=[\'splits\', \'values\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "RaggedCountSparseOutput" @@ -4082,7 +4082,7 @@ tf_module { } member_method { name: "SparseBincount" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "SparseConcat" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index e622768979c..05b8842be66 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1074,7 +1074,7 @@ tf_module { } member_method { name: "DenseBincount" - argspec: "args=[\'input\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'input\', \'size\', \'weights\', \'binary_output\', \'name\'], 
varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "DenseCountSparseOutput" @@ -3070,7 +3070,7 @@ tf_module { } member_method { name: "RaggedBincount" - argspec: "args=[\'splits\', \'values\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "RaggedCountSparseOutput" @@ -4082,7 +4082,7 @@ tf_module { } member_method { name: "SparseBincount" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "SparseConcat" From a03da3516d600c769e59f9aeddb312013ffe9e54 Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Wed, 13 May 2020 16:15:33 -0700 Subject: [PATCH 0532/1533] Register sparse FullyConnected kernel by default. PiperOrigin-RevId: 311424830 Change-Id: Id72f75124b59fa11f9cb84447d7a886a579bae39 --- tensorflow/lite/kernels/fully_connected.cc | 121 +++++++----------- .../lite/kernels/fully_connected_test.cc | 9 +- tensorflow/lite/kernels/register.cc | 2 +- 3 files changed, 52 insertions(+), 80 deletions(-) diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc index 1cd1b14e7a8..cbc3efd5da5 100644 --- a/tensorflow/lite/kernels/fully_connected.cc +++ b/tensorflow/lite/kernels/fully_connected.cc @@ -61,8 +61,6 @@ enum KernelType { kReference, kGenericOptimized, kLegacyPie, // Legacy path used by the PIE team and related clients. - kSparseReference, - kSparseOptimized, }; struct OpData { @@ -631,57 +629,20 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, FullyConnectedParams op_params; op_params.float_activation_min = output_activation_min; op_params.float_activation_max = output_activation_max; - reference_ops::FullyConnected( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output)); - } else if (kernel_type == kSparseReference) { - FullyConnectedParams op_params; - op_params.float_activation_min = output_activation_min; - op_params.float_activation_max = output_activation_max; - TF_LITE_ENSURE(context, filter->sparsity != nullptr); - - const auto& sparsity = *filter->sparsity; - reference_ops::FullyConnectedSparseWeight( - sparsity, op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output)); - } else if (kernel_type == kSparseOptimized) { - FullyConnectedParams op_params; - op_params.float_activation_min = output_activation_min; - op_params.float_activation_max = output_activation_max; - TF_LITE_ENSURE(context, filter->sparsity != nullptr); - - const auto& sparsity = *filter->sparsity; - if (!SupportedSparsityFormat(sparsity)) { - TF_LITE_KERNEL_LOG(context, - "Unsupported sparse fully-connected weight format."); - return kTfLiteError; - } - - if (sparsity.dim_metadata_size == kDimMetadataSizeRandomSparse) { - // Random sparse. 
- optimized_ops::FullyConnectedSparseWeight( - sparsity, op_params, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); - } else if (sparsity.dim_metadata_size == kDimMetadataSizeBlockSparse && - sparsity.dim_metadata[2].dense_size == 4) { - // Block sparse with block size of 1x4. - optimized_ops::FullyConnectedSparseWeight1x4( + if (filter->sparsity != nullptr) { + const auto& sparsity = *filter->sparsity; + reference_ops::FullyConnectedSparseWeight( sparsity, op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); } else { - TF_LITE_KERNEL_LOG(context, - "Unsupported sparse fully-connected weight format."); - return kTfLiteError; + reference_ops::FullyConnected( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(filter), GetTensorData(filter), + GetTensorShape(bias), GetTensorData(bias), + GetTensorShape(output), GetTensorData(output)); } } else if (kernel_type == kLegacyPie) { return EvalPie(context, node, params, data, input, filter, bias, output); @@ -689,14 +650,47 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, FullyConnectedParams op_params; op_params.float_activation_min = output_activation_min; op_params.float_activation_max = output_activation_max; - op_params.lhs_cacheable = IsConstantTensor(filter); - op_params.rhs_cacheable = IsConstantTensor(input); - optimized_ops::FullyConnected( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output), - CpuBackendContext::GetFromContext(context)); + if (filter->sparsity != nullptr) { + const auto& sparsity = *filter->sparsity; + if (!SupportedSparsityFormat(sparsity)) { + TF_LITE_KERNEL_LOG(context, + "Unsupported sparse fully-connected weight format."); + return kTfLiteError; + } + + if (sparsity.dim_metadata_size == kDimMetadataSizeRandomSparse) { + // Random sparse. + optimized_ops::FullyConnectedSparseWeight( + sparsity, op_params, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); + } else if (sparsity.dim_metadata_size == kDimMetadataSizeBlockSparse && + sparsity.dim_metadata[2].dense_size == 4) { + // Block sparse with block size of 1x4. 
+ optimized_ops::FullyConnectedSparseWeight1x4( + sparsity, op_params, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); + } else { + TF_LITE_KERNEL_LOG(context, + "Unsupported sparse fully-connected weight format."); + return kTfLiteError; + } + + } else { + op_params.lhs_cacheable = IsConstantTensor(filter); + op_params.rhs_cacheable = IsConstantTensor(input); + optimized_ops::FullyConnected( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(filter), GetTensorData(filter), + GetTensorShape(bias), GetTensorData(bias), + GetTensorShape(output), GetTensorData(output), + CpuBackendContext::GetFromContext(context)); + } } return kTfLiteOk; @@ -757,23 +751,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace fully_connected -// TODO(b/147449640): Clean up sparse registrations after conversion is done. -TfLiteRegistration* Register_FULLY_CONNECTED_SPARSE_REF() { - static TfLiteRegistration r = { - fully_connected::Init, fully_connected::Free, - fully_connected::Prepare, - fully_connected::Eval}; - return &r; -} - -TfLiteRegistration* Register_FULLY_CONNECTED_SPARSE_OPT() { - static TfLiteRegistration r = { - fully_connected::Init, fully_connected::Free, - fully_connected::Prepare, - fully_connected::Eval}; - return &r; -} - TfLiteRegistration* Register_FULLY_CONNECTED_REF() { static TfLiteRegistration r = { fully_connected::Init, fully_connected::Free, diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc index 34d68cf0b0d..7227b8a5e92 100644 --- a/tensorflow/lite/kernels/fully_connected_test.cc +++ b/tensorflow/lite/kernels/fully_connected_test.cc @@ -361,11 +361,6 @@ const auto kKernelMapNoPie = new std::map({ {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()}, }); -const auto kKernelMapSparse = new std::map({ - {"SparseReference", ops::builtin::Register_FULLY_CONNECTED_SPARSE_REF()}, - {"SparseOptimized", ops::builtin::Register_FULLY_CONNECTED_SPARSE_OPT()}, -}); - class QuantizedFullyConnectedOpTest : public SingleOpTest { protected: const std::map& GetKernelMap() override { @@ -1187,7 +1182,7 @@ class SparseFullyConnectedOpModel : public SingleOpModel { class SparseFullyConnectedOpTest : public SingleOpTest { protected: const std::map& GetKernelMap() override { - return *kKernelMapSparse; + return *kKernelMapNoPie; } }; @@ -1277,7 +1272,7 @@ TEST_P(SparseFullyConnectedOpTest, Simple1x4Test) { INSTANTIATE_TEST_SUITE_P( SparseFullyConnectedOpTest, SparseFullyConnectedOpTest, - ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMapSparse))); + ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMapNoPie))); } // namespace } // namespace tflite diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index f3a321e325b..8ca58e6a309 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -77,7 +77,7 @@ BuiltinOpResolver::BuiltinOpResolver() { Register_EMBEDDING_LOOKUP_SPARSE()); AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(), /* min_version = */ 1, - /* max_version = */ 7); + /* max_version = */ 8); AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION()); AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP()); AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(), From 
1798a5e959d6781764c2beec673e61cc58c26455 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 13 May 2020 16:18:13 -0700 Subject: [PATCH 0533/1533] [XLA:SPMD] Fix reshape with halo exchange PiperOrigin-RevId: 311425288 Change-Id: Ia1e29df7b16d9eb60953aba3336022505e823d3a --- .../xla/service/spmd/spmd_partitioner.cc | 2 +- .../xla/service/spmd/spmd_partitioner_test.cc | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index fd865342ca3..b857c8bdbe6 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -1661,7 +1661,7 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { } TF_RET_CHECK(!reshard_output->dynamic_slice_index_on_output.has_value()); CHECK_EQ( - reshard_output->sharded_input->shape().dimensions(input_sharded_dim), + reshard_output->sharded_input->shape().dimensions(output_sharded_dim), output_shard_shape.dimensions(output_sharded_dim)); SetPartitionedHlo(hlo, [&] { return reshard_output->sharded_input; }); return Status::OK(); diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 7a7f2dcc807..ca1afc816b0 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -27,6 +27,7 @@ namespace xla { namespace spmd { namespace { +using ::testing::_; using ::testing::AllOf; namespace op = xla::testing::opcode_matchers; @@ -1994,6 +1995,29 @@ ENTRY entry { op::Shape("f32[38,38,4,41]"))); } +TEST_F(SpmdPartitioningTest, ReshapeMergeDimsWithHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[2,3,7,10] parameter(0), sharding={devices=[1,1,2,1]0,1} + ROOT %reshape = s32[3,2,1,14,5] reshape(%input), + sharding={devices=[1,1,1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto reshape = + AllOf(op::Reshape(op::Parameter(0)), op::Shape("s32[3,2,1,8,5]")); + auto halo = op::CollectivePermute(op::Slice(reshape)); + auto exchanged = + op::DynamicSlice(op::Concatenate(halo, reshape), _, _, _, _, _); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(exchanged, op::Shape("s32[3,2,1,7,5]"))); +} + // Produces an invalid module after transformation. TEST_F(SpmdPartitioningTest, InceptionV3_4_way_ReduceWindowDilated) { const char* const hlo_string = R"( From 07568a96e8a6ab5c492ff5d7ebba5efca2f37a1e Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Wed, 13 May 2020 16:18:49 -0700 Subject: [PATCH 0534/1533] Simplify `trainable` in batchnorm layers to just use the python variable rather than the private _trainable_var created by `backend.freezable_variable` Compile/Fit continues to reflect the updated value of trainable if you recompile after changing trainable. This change does come with a behavior change in subtle situations though. `backend.freezable_variable` occupied a strange no-mans land in between tf.variable and python state. It allowed you to update the value in a tf.function even after tracing occurred (like a tf.variable). But, it did not appear in lists of variables of the model/saved_models. 
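To make the difference concrete, here is a minimal sketch (TF 2.x eager execution
assumed; the layer, model and function names are arbitrary and only illustrate the
behaviour change described in the rest of this note):

    import tensorflow as tf

    bn = tf.keras.layers.BatchNormalization()
    inputs = tf.keras.Input((3,))
    model = tf.keras.Model(inputs, bn(inputs))

    @tf.function
    def step(x):
      # With training=True the layer normally updates its moving statistics,
      # subject to the layer's `trainable` flag.
      return model(x, training=True)

    x = tf.random.normal((8, 3))
    step(x)                 # traced while bn.trainable is True
    bn.trainable = False
    step(x)                 # previously: the existing trace reflected the new
                            # value; now: `trainable` is ordinary Python state,
                            # so the value captured at tracing time is used
                            # until `step` is retraced.
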
(It would probably act in unpredictable ways when a single batchnorm layer was used in several different tf.functions, because the layers only maintained a single freezable_variable even though each funcgraph would've needed its own. It's also unclear how it actually behaved in loaded saved_models.) So, before this code change, disabling/enabling `trainable` after a tf.function containing batchnorm had already been traced caused existing traces to reflect the new value of `trainable`. Now, because `trainable` is standard python state it acts the same way as other python state in tf.functions. The value will be frozen at tracing time. If you want to update `trainable` after tracing you must trace a new tf.function. PiperOrigin-RevId: 311425391 Change-Id: I51166212efa28b56c4193f9358907a9dc54b7d2d --- tensorflow/python/keras/backend.py | 47 ----------------- .../python/keras/layers/normalization.py | 19 ++----- .../python/keras/layers/normalization_test.py | 50 ++++--------------- .../saving/saved_model/saved_model_test.py | 31 ++++++++++++ 4 files changed, 44 insertions(+), 103 deletions(-) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 2700fae9e29..11e53e032ae 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -1162,53 +1162,6 @@ def is_placeholder(x): return False -def freezable_variable(value, shape=None, name=None): - """A tensor-like object whose value can be updated only up until execution. - - After creating the freezable variable, you can update its value by calling - `var.update_value(new_value)` (similar to a regular variable). - Unlike an actual variable, the value used during execution is the current - value at the time the execution function (`backend.function()`) was created. - - This is an internal API, expected to be temporary. It is used to implement a - mutable `trainable` property for `BatchNormalization` layers, with a frozen - value after model compilation. - - We don't use a plain variable in this case because we need the value used - in a specific model to be frozen after `compile` has been called - (e.g. GAN use case). - - Arguments: - value: The initial value for the tensor-like object. - shape: The shape for the tensor-like object (cannot be changed). - name: The name for the tensor-like object. - - Returns: - A tensor-like object with a static value that can be updated via - `x.update_value(new_value)`, up until creating an execution function - (afterwards the value is fixed). - """ - graph = get_graph() - with graph.as_default(): - x = array_ops.placeholder_with_default( - value, shape=shape, name=name) - x._initial_value = value - x._current_value = value - - def update_value(new_value): - x._current_value = new_value - - def get_value(): - return x._current_value - - x.update_value = update_value - x.get_value = get_value - - global _FREEZABLE_VARS - _FREEZABLE_VARS[graph].add(x) - return x - - @keras_export('keras.backend.shape') def shape(x): """Returns the symbolic shape of a tensor or variable. 
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py index 9a35cd86525..a6d3c3c3e1c 100644 --- a/tensorflow/python/keras/layers/normalization.py +++ b/tensorflow/python/keras/layers/normalization.py @@ -28,7 +28,6 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras import constraints from tensorflow.python.keras import initializers from tensorflow.python.keras import regularizers -from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_spec import InputSpec from tensorflow.python.keras.utils import tf_utils @@ -234,7 +233,6 @@ class BatchNormalizationBase(Layer): self.fused = fused self._bessels_correction_test_only = True - self._trainable_var = None self.trainable = trainable if renorm: @@ -294,14 +292,6 @@ class BatchNormalizationBase(Layer): @trainable.setter def trainable(self, value): self._trainable = value - if self._trainable_var is not None: - self._trainable_var.update_value(value) - - def _get_trainable_var(self): - if self._trainable_var is None: - self._trainable_var = K.freezable_variable( - self._trainable, name=self.name + '_trainable') - return self._trainable_var @property def _param_dtype(self): @@ -722,12 +712,9 @@ class BatchNormalizationBase(Layer): if self._USE_V2_BEHAVIOR: if isinstance(training, int): training = bool(training) - if base_layer_utils.is_in_keras_graph(): - training = math_ops.logical_and(training, self._get_trainable_var()) - elif not self.trainable: - # When the layer is not trainable, it overrides the value passed from - # model. - training = self.trainable + # When the layer is not trainable, it overrides the value passed from + # model. + training = math_ops.logical_and(training, self.trainable) return training def call(self, inputs, training=None): diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py index ad5d00eb4d9..4d1e3213ba7 100644 --- a/tensorflow/python/keras/layers/normalization_test.py +++ b/tensorflow/python/keras/layers/normalization_test.py @@ -22,7 +22,6 @@ from absl.testing import parameterized import numpy as np from tensorflow.python import keras -from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import wrap_function @@ -35,7 +34,6 @@ from tensorflow.python.keras import testing_utils from tensorflow.python.keras.layers import normalization from tensorflow.python.keras.layers import normalization_v2 from tensorflow.python.keras.mixed_precision.experimental import policy -from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2 from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker_v2 from tensorflow.python.ops import math_ops @@ -170,6 +168,13 @@ class BatchNormalizationTest(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_batchnorm_non_trainable_with_fit(self): + # We use the same data shape for all the data we use in this test. + # This will prevent any used tf.functions from retracing. + # This helps us verify that changing trainable and recompiling really + # does update the training loop, rather than a different data shape + # triggering a retrace. 
+ data_shape = (100, 3) + inputs = keras.Input((3,)) bn = normalization_v2.BatchNormalization() outputs = bn(inputs) @@ -178,10 +183,10 @@ class BatchNormalizationTest(keras_parameterized.TestCase): 'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly()) - model.fit(np.random.random((100, 3)), np.random.random((100, 3))) + model.fit(np.random.random(data_shape), np.random.random(data_shape)) - test_data = np.random.random((10, 3)) - test_targets = np.random.random((10, 3)) + test_data = np.random.random(data_shape) + test_targets = np.random.random(data_shape) test_loss = model.evaluate(test_data, test_targets) bn.trainable = False @@ -192,41 +197,6 @@ class BatchNormalizationTest(keras_parameterized.TestCase): train_loss = model.train_on_batch(test_data, test_targets) self.assertAlmostEqual(test_loss, train_loss) - @combinations.generate(combinations.combine(mode=['graph', 'eager'])) - def test_batchnorm_non_trainable_with_tf_function(self): - inputs = keras.Input((3,)) - bn = normalization_v2.BatchNormalization() - outputs = bn(inputs) - model = keras.Model(inputs, outputs) - loss_fn = keras.losses.MeanSquaredError() - optimizer = rmsprop_v2.RMSprop() - - @def_function.function() - def train_step(x, y): - with backprop.GradientTape() as tape: - y_pred = model(x, training=True) - loss = loss_fn(y, y_pred) - grads = tape.gradient(loss, model.trainable_weights) - optimizer.apply_gradients(zip(grads, model.trainable_weights)) - return loss - - @def_function.function() - def test_step(x, y): - y_pred = model(x, training=False) - loss = loss_fn(y, y_pred) - return loss - - train_step(np.random.random((100, 3)), np.random.random((100, 3))) - - test_data = np.random.random((10, 3)) - test_targets = np.random.random((10, 3)) - test_loss = test_step(test_data, test_targets) - - bn.trainable = False - train_loss = train_step(test_data, test_targets) - if context.executing_eagerly(): - self.assertAlmostEqual(test_loss.numpy(), train_loss.numpy()) - def test_eager_batchnorm_in_custom_model_call_with_tf_function(self): class MyModel(keras.Model): diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 9cbe8607a54..30a93e2bba3 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -391,6 +391,37 @@ class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): self.evaluate(loaded.get_updates_for(input_arr2)) self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) + def testDisablingBatchNormTrainableBeforeSaving(self): + # We disable trainable on the batchnorm layers before saving + model = keras.models.Sequential( + keras.layers.BatchNormalization(input_shape=(1,))) + model.trainable = False + self.evaluate(variables.variables_initializer(model.variables)) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format='tf') + loaded = keras_load.load(saved_model_dir) + self.evaluate(variables.variables_initializer(loaded.variables)) + input_arr = array_ops.constant([[11], [12], [13]], dtype=dtypes.float32) + input_arr2 = array_ops.constant([[14], [15], [16]], dtype=dtypes.float32) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0]) + + # Trainable should still be disabled after loading + self.evaluate(loaded(input_arr, training=True)) + if not context.executing_eagerly(): + self.evaluate(loaded.get_updates_for(input_arr)) + 
self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.0]) + + # Re-enabling trainable on the loaded model should cause the batchnorm + # layer to start training again. + # Note: this only works in v2. + if context.executing_eagerly(): + loaded.trainable = True + self.evaluate(loaded(input_arr, training=True)) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) + + self.evaluate(loaded(input_arr2, training=False)) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) + def testSaveWithSignatures(self): model = keras.models.Sequential() model.add(keras.layers.Dense(5, input_shape=(3,), From 4fd957d3cf0dab49d7a8c77b724560768dbfdcb2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 16:38:42 -0700 Subject: [PATCH 0535/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311428968 Change-Id: Ib63776765d20322f80be9dc261f394486746eddc --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 7a07a0e78d8..bab430e1472 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From e35304c8b9dda8c46811112f106264c6e29a1e78 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 13 May 2020 16:45:39 -0700 Subject: [PATCH 0536/1533] Add simple canonicalizer for tf.fill Avoids need for using fallback converter, especially given splat nature vs the cost of converting back and forth from tensor. 
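For intuition, the fold relies on the fact that a Fill whose operands are both
constant is just a splat constant. Roughly, at the Python level (illustrative
only; this snippet is not part of the change, which is the MLIR folder below):

    import tensorflow as tf

    # tf.fill with constant arguments is equivalent to a splat constant, so the
    # canonicalizer can replace the op with a constant of the requested shape.
    a = tf.fill([3, 2, 1], 23.0)
    b = tf.constant(23.0, shape=[3, 2, 1])
    assert bool(tf.reduce_all(tf.equal(a, b)))
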
PiperOrigin-RevId: 311430247 Change-Id: Ia5f235176f87d355b084c95073350cb890d711c4 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 2 ++ .../compiler/mlir/tensorflow/ir/tf_ops.cc | 25 ++++++++++++++++++- .../mlir/tensorflow/tests/canonicalize.mlir | 11 ++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 2d02d0b7508..64ea0732e8c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -2907,6 +2907,8 @@ fill([2, 3], 9) ==> [[9, 9, 9] return Verify(*this); }]; + let hasFolder = 1; + let builders = [OpBuilder< "OpBuilder &builder, OperationState &result, Value dims, Value value" >]; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 82ddc80875a..2007824369c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -1606,7 +1606,7 @@ static ShapedType InferFillOpType(Value dims, Value value) { llvm::SmallVector shape; shape.reserve(dims_attr.getNumElements()); - for (const APInt &dim : dims_attr.getValues()) { + for (const APInt dim : dims_attr.getValues()) { shape.push_back(dim.getSExtValue()); } return RankedTensorType::get(shape, etype); @@ -1617,6 +1617,29 @@ void FillOp::build(OpBuilder &builder, OperationState &result, Value dims, FillOp::build(builder, result, InferFillOpType(dims, value), dims, value); } +OpFoldResult FillOp::fold(ArrayRef operands) { + assert(operands.size() == 2 && "fill op has two operand"); + + auto value = operands[1].dyn_cast_or_null(); + if (!value) return {}; + + auto type = getType().cast(); + if (type.hasStaticShape()) + return DenseElementsAttr::get(type, value.getValue({})); + + auto dims = operands[0].dyn_cast_or_null(); + if (!dims) return {}; + + llvm::SmallVector shape; + shape.reserve(dims.getNumElements()); + for (const APInt dim : dims.getValues()) { + shape.push_back(dim.getSExtValue()); + } + type = RankedTensorType::get(shape, type.getElementType()); + + return DenseElementsAttr::get(type, value.getValue({})); +} + //===----------------------------------------------------------------------===// // FusedBatchNormGradOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 18f8d5f4486..e05894dc266 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -471,3 +471,14 @@ func @testRankOfRankedTensor(%arg0 : tensor<4x3x2xf32>) -> tensor { // CHECK: return [[VAL0]] return %0 : tensor } + +// CHECK-LABEL: @foldFill +func @foldFill() -> (tensor<3x2x1xf32>, tensor<*xf32>) { + %0 = "tf.Const"() {value = dense<[3, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %1 = "tf.Const"() {value = dense<23.0> : tensor} : () -> tensor + // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + %2 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<3x2x1xf32> + // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + %3 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<*xf32> + return %2, %3 : tensor<3x2x1xf32>, tensor<*xf32> +} From 3d557534a3d5792f03c3607b14b0b0bfb51bdc1f Mon Sep 17 00:00:00 2001 From: Ajay P Date: Thu, 14 
May 2020 00:01:41 +0000 Subject: [PATCH 0537/1533] Reorganized tests for recompute grad --- tensorflow/python/eager/forwardprop_test.py | 5 ++- .../gradient_checkpoint_test.py | 10 +++-- tensorflow/python/ops/custom_gradient.py | 6 +-- tensorflow/python/ops/gradients_test.py | 41 ++++++++++++++++++- 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 0c9ffaa0816..d1a30b352d3 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -350,7 +350,7 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): _test_gradients(self, f, [constant_op.constant([1., 2.])], order=3) # TODO(allenl): investigate why assert_no_new_pyobjects_executing_eagerly fails around this test? - def testCustomGradientRecomputeGrad(self): + def testExceptionCustomGradientRecomputeGradForward(self): @custom_gradient.recompute_grad def f(x): @@ -358,7 +358,8 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegexp(NotImplementedError, "recompute_grad tried to transpose"): - _test_gradients(self, f, [constant_op.constant([1.])], order=3) + primals = [constant_op.constant([1.])] + sym_jac_fwd = _jacfwd(f, primals) def testExceptionInCustomGradientNotSwallowed(self): diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py index 92c53b3ab70..18e88179e9b 100644 --- a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -19,7 +19,6 @@ from __future__ import print_function import tensorflow as tf from tensorflow.keras import layers, optimizers - def _get_big_cnn_model(img_dim, n_channels, num_partitions, blocks_per_partition): """Creates a test model whose activations are significantly larger than model size.""" @@ -67,7 +66,6 @@ def _compute_loss(logits, labels): tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)) - def _limit_gpu_memory(): """Helper function to limit GPU memory for testing """ gpus = tf.config.experimental.list_physical_devices('GPU') @@ -80,6 +78,8 @@ def _limit_gpu_memory(): ]) except RuntimeError as e: print(e) + return True + return False def _get_dummy_data(img_dim, n_channels, batch_size): @@ -90,7 +90,6 @@ def _get_dummy_data(img_dim, n_channels, batch_size): def _train_no_recompute(n_steps): """Trains a single large model without gradient checkpointing.""" - _limit_gpu_memory() img_dim, n_channels, batch_size = 256, 1, 4 x, y = _get_dummy_data(img_dim, n_channels, batch_size) model = _get_big_cnn_model(img_dim, @@ -113,7 +112,6 @@ def _train_no_recompute(n_steps): def _train_with_recompute(n_steps): """Trains a single large model with gradient checkpointing using tf.recompute_grad.""" - _limit_gpu_memory() img_dim, n_channels, batch_size = 256, 1, 4 x, y = _get_dummy_data(img_dim, n_channels, batch_size) # This model is the same model as _get_big_cnn_model but split into 3 parts. 
@@ -146,12 +144,16 @@ def _train_with_recompute(n_steps): class GradientCheckpointTest(tf.test.TestCase): def test_raises_oom_exception(self): + if not _limit_gpu_memory(): + self.skipTest("No virtual GPUs found") with self.assertRaises(Exception) as context: _train_no_recompute(1) self.assertTrue( context.exception.__class__.__name__ == 'ResourceExhaustedError') def test_does_not_raise_oom_exception(self): + if not _limit_gpu_memory(): + self.skipTest("No virtual GPUs found") n_step = 2 losses = _train_with_recompute(n_step) self.assertTrue(len(losses) == n_step) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index 6489aff117f..d57be41c3de 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -33,6 +33,7 @@ from tensorflow.python.util import nest from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import tf_export +from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients VAR_OP_TYPES = [ @@ -503,7 +504,8 @@ def recompute_grad(f): kw_vars = list(variables) grads = t.gradient(result, list(id_args) + kw_vars, - output_gradients=dresult) + output_gradients=dresult, + unconnected_gradients=UnconnectedGradients.ZERO) def transpose(*t_args, **t_kwargs): """Gradient function calculation for forward mode autodiff.""" @@ -513,8 +515,6 @@ def recompute_grad(f): "Consider not using recompute_grad in forward mode autodiff".format( f.__name__)) - if len(grads) == 1 and None in grads: - return 0, transpose return (grads[:len(id_args)], grads[len(id_args):]), transpose return result, grad diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 817d8a1adbe..9b536136cb5 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -59,7 +59,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.ops.nn_ops import bias_add from tensorflow.python.platform import googletest - +from tensorflow.python.ops import gradient_checker_v2 class GradientsTest(test_util.TensorFlowTestCase, parameterized.TestCase): @@ -1340,6 +1340,45 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): return grads_re, grads + def _grad(self, f, argnums=0): + """Return a function which computes the gradient of `f`.""" + + def _f(*params): + with backprop.GradientTape() as tape: + tape.watch(params) + outputs = f(*params) + return tape.gradient( + outputs, + params[argnums], + unconnected_gradients=unconnected_gradients.UnconnectedGradients.ZERO) + + return _f + + def _test_gradients(self, f, inputs, order, delta=1e-3, rtol=1e-2, atol=1e-6): + """Tests backward jacobians of `f`'s [0, `order`)-order gradients.""" + if order < 1: + raise ValueError( + "`order` should be a positive integer, got '{}'.".format(order)) + if order > 1: + self._test_gradients(f=self._grad(f), + inputs=inputs, + order=order - 1, + delta=delta, + rtol=rtol, + atol=atol) + sym_jac_back, num_jac = gradient_checker_v2.compute_gradient(f, + inputs, + delta=delta) + testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) + + @test_util.run_in_graph_and_eager_modes + def testCustomGradientRecomputeGradHigherOrder(self): + + @custom_gradient.recompute_grad + def f(x): + return math_ops.reduce_prod(math_ops.tanh(x)**2) + self._test_gradients(f, [constant_op.constant([1.])], order=3) + 
@test_util.run_in_graph_and_eager_modes def testFnRecompute(self): """Checks that recompute_grad works grads of function args.""" From d0a48afee650b12dde805fadca868d6b113c3c5d Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 13 May 2020 17:03:13 -0700 Subject: [PATCH 0538/1533] Add a few more utility functions for TPUs PiperOrigin-RevId: 311433350 Change-Id: I62a3bd2635f4eb07a21f3b1cdb1bbea5017e6851 --- tensorflow/core/tpu/BUILD | 25 +++++++++++++ tensorflow/core/tpu/tpu_compilation_device.cc | 24 ++++++++++++ tensorflow/core/tpu/tpu_defs.h | 9 +++++ tensorflow/core/tpu/tpu_node_device_util.cc | 37 +++++++++++++++++++ tensorflow/core/tpu/tpu_node_device_util.h | 30 +++++++++++++++ 5 files changed, 125 insertions(+) create mode 100644 tensorflow/core/tpu/tpu_compilation_device.cc create mode 100644 tensorflow/core/tpu/tpu_node_device_util.cc create mode 100644 tensorflow/core/tpu/tpu_node_device_util.h diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index 4ea5fc39929..46a8759a257 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -37,10 +37,35 @@ cc_library( ], ) +cc_library( + name = "tpu_compilation_device", + srcs = ["tpu_compilation_device.cc"], + visibility = ["//visibility:public"], + deps = [ + ":tpu_defs", + ":tpu_node_device_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + ], + alwayslink = 1, +) + +cc_library( + name = "tpu_node_device_util", + srcs = ["tpu_node_device_util.cc"], + hdrs = ["tpu_node_device_util.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/tf2xla:tf2xla_util", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + cc_library( name = "tpu_defs", srcs = ["tpu_defs.cc"], hdrs = ["tpu_defs.h"], + deps = ["//tensorflow/core:protos_all_cc"], ) cc_library( diff --git a/tensorflow/core/tpu/tpu_compilation_device.cc b/tensorflow/core/tpu/tpu_compilation_device.cc new file mode 100644 index 00000000000..2b2314820bc --- /dev/null +++ b/tensorflow/core/tpu/tpu_compilation_device.cc @@ -0,0 +1,24 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/tpu/tpu_defs.h" +#include "tensorflow/core/tpu/tpu_node_device_util.h" + +namespace tensorflow { + +REGISTER_XLA_BACKEND(DEVICE_TPU_XLA_JIT, kTpuAllTypes, TpuOpFilter); + +} // namespace tensorflow diff --git a/tensorflow/core/tpu/tpu_defs.h b/tensorflow/core/tpu/tpu_defs.h index b2a6e3ce303..497afb5c392 100644 --- a/tensorflow/core/tpu/tpu_defs.h +++ b/tensorflow/core/tpu/tpu_defs.h @@ -18,6 +18,10 @@ limitations under the License. #ifndef TENSORFLOW_CORE_TPU_TPU_DEFS_H_ #define TENSORFLOW_CORE_TPU_TPU_DEFS_H_ +#include + +#include "tensorflow/core/framework/types.pb.h" + namespace tensorflow { // Name of the TPU device, which corresponds to a single core. 
@@ -43,6 +47,11 @@ extern const char* const TPUREPLICATE_MIRRORED_VAR_INDICES_ATTR; // variable. extern const char* const TPU_FAST_MEM_ATTR; // "_TPU_FAST_MEM" +// Supported types for TPUs. +static constexpr std::array kTpuAllTypes = { + {DT_INT32, DT_UINT32, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL, + DT_COMPLEX64, DT_INT64, DT_UINT64, DT_QINT8, DT_QUINT8}}; + } // namespace tensorflow #endif // TENSORFLOW_CORE_TPU_TPU_DEFS_H_ diff --git a/tensorflow/core/tpu/tpu_node_device_util.cc b/tensorflow/core/tpu/tpu_node_device_util.cc new file mode 100644 index 00000000000..2dfd7d984d6 --- /dev/null +++ b/tensorflow/core/tpu/tpu_node_device_util.cc @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/tpu/tpu_node_device_util.h" + +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { + +bool TpuOpFilter(KernelDef* kdef) { + StringPiece op(kdef->op()); + VLOG(2) << "TpuOpFilter " << op; + // Enable const string operands to Assert op (b/69167214). + if (op == "Const") { + AddDtypeToKernelDefConstraint("dtype", DT_STRING, kdef); + } + if (op == "Assert") { + AddDtypeToKernelDefConstraint("T", DT_STRING, kdef); + } + return true; +} + +} // namespace tensorflow diff --git a/tensorflow/core/tpu/tpu_node_device_util.h b/tensorflow/core/tpu/tpu_node_device_util.h new file mode 100644 index 00000000000..c6d5be9f5a6 --- /dev/null +++ b/tensorflow/core/tpu/tpu_node_device_util.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_H_ +#define TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_H_ + +#include "tensorflow/core/framework/kernel_def.pb.h" + +namespace tensorflow { + +// This is a BackendOpFilter. (see tensorflow/compiler/tf2xla/xla_op_registry.h) +// It returns true if the op should be registered on the device, it may +// optionally modify the KernelDef. 
+bool TpuOpFilter(KernelDef* kdef); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_H_ From a4f82e759af213872631fd9d8e6b037e69ddaa47 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Wed, 13 May 2020 17:20:44 -0700 Subject: [PATCH 0539/1533] Create per worker datasets in `strategy.experimental_distribute_datasets_from_function` instead of in `__iter__`. This will avoid tracing `dataset_fn` each time creating a new DistributedIterator. PiperOrigin-RevId: 311436128 Change-Id: Ib839326c6d9e0b0fad051f4baa1ceac9eef08045 --- tensorflow/python/distribute/input_lib.py | 49 ++++++++--------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 68e55d5a6af..26bc9a087fb 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -823,14 +823,15 @@ class DistributedDatasetsFromFunction(_IterableInput): "input_contexts (%d)" % (input_workers.num_workers, len(input_contexts))) - self._dataset_fn = dataset_fn self._input_workers = input_workers self._input_contexts = input_contexts self._strategy = strategy - self._element_spec = None - - super(DistributedDatasetsFromFunction, self).__init__( - input_workers=input_workers) + self._datasets, element_spec = ( + _create_datasets_per_worker_with_input_context(self._input_contexts, + self._input_workers, + dataset_fn)) + self._element_spec = _create_distributed_tensor_spec( + self._strategy, element_spec) def __iter__(self): if (ops.executing_eagerly_outside_functions() or @@ -842,9 +843,9 @@ class DistributedDatasetsFromFunction(_IterableInput): enable_legacy_iterators = getattr(self._strategy, "_enable_legacy_iterators", False) - iterators, element_spec = _create_iterators_per_worker_with_input_context( - self._input_contexts, self._input_workers, self._dataset_fn, - enable_legacy_iterators) + iterators = _create_iterators_per_worker(self._datasets, + self._input_workers, + enable_legacy_iterators) if enable_legacy_iterators: iterator = DistributedIteratorV1(self._input_workers, iterators, @@ -852,8 +853,6 @@ class DistributedDatasetsFromFunction(_IterableInput): else: iterator = DistributedIterator(self._input_workers, iterators, self._strategy) - self._element_spec = _create_distributed_tensor_spec(self._strategy, - element_spec) iterator._element_spec = self._element_spec # pylint: disable=protected-access return iterator @@ -896,13 +895,10 @@ class DistributedDatasetsFromFunctionV1(DistributedDatasetsFromFunction): return self._get_iterator() def _get_iterator(self): - iterators, element_spec = _create_iterators_per_worker_with_input_context( - self._input_contexts, self._input_workers, self._dataset_fn, - True) + iterators = _create_iterators_per_worker(self._datasets, + self._input_workers, True) iterator = DistributedIteratorV1(self._input_workers, iterators, self._strategy) - self._element_spec = _create_distributed_tensor_spec(self._strategy, - element_spec) iterator._element_spec = self._element_spec # pylint: disable=protected-access return iterator @@ -1375,27 +1371,16 @@ def _create_iterators_per_worker(worker_datasets, input_workers, return iterators -def _create_iterators_per_worker_with_input_context(input_contexts, - input_workers, - dataset_fn, - enable_legacy_iterators): - """Create a multidevice iterator per workers given a dataset function.""" - iterators = [] - element_specs = [] +def _create_datasets_per_worker_with_input_context(input_contexts, + 
input_workers, dataset_fn): + """Create device datasets per worker given a dataset function.""" + datasets = [] for i, ctx in enumerate(input_contexts): worker = input_workers.worker_devices[i] with ops.device(worker): dataset = dataset_fn(ctx) - element_specs.append(dataset.element_spec) - devices = input_workers.compute_devices_for_worker(i) - if tf2.enabled() and not enable_legacy_iterators: - iterator = _SingleWorkerOwnedDatasetIterator(dataset, worker, - devices) - else: - iterator = _SingleWorkerDatasetIterator(dataset, worker, - devices) - iterators.append(iterator) - return iterators, dataset.element_spec + datasets.append(dataset) + return datasets, dataset.element_spec # TODO(sourabhbajaj): Remove this in lieu of distributed datasets From f84726697e208ef30ed830e00acebbbe9bc06553 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 17:24:15 -0700 Subject: [PATCH 0540/1533] [tf.data] Update some maths formulas in the ComputeWaitTime function. PiperOrigin-RevId: 311436667 Change-Id: Ie3537625e9daac73caba5f790b90b65507f999f7 --- tensorflow/core/framework/model.cc | 229 ++++++++++++++---------- tensorflow/core/framework/model.h | 20 +++ tensorflow/core/framework/model_test.cc | 72 ++++++++ 3 files changed, 231 insertions(+), 90 deletions(-) diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc index 7aeec28e995..b4a54029a4f 100644 --- a/tensorflow/core/framework/model.cc +++ b/tensorflow/core/framework/model.cc @@ -32,96 +32,6 @@ constexpr char kInputTimeDerivativeKey[] = "last_input_time"; // Wrapper for the square function to reduce verbosity. inline double Square(double x) { return x * x; } -// Given the average time between output events (`output_time`), the average -// time between input events (`input_time`) and the buffer size, the method -// computes the expected time an input event will have to wait. -// -// The wait time is approximated as the product of the probability the buffer -// will be empty and the time it takes to produce an element into the buffer. -// -// The formula used for computing the probability is derived by modeling the -// problem as an M/M/1/K queue -// (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue). -// -// Collects derivatives of `ComputeWaitTime` w.r.t `output_time`, `input_time' -// and `buffer_size` if the corresponding pointers are not `nullptr`. -double ComputeWaitTime(double output_time, double input_time, - double buffer_size, double* output_time_derivative, - double* input_time_derivative, - double* buffer_size_derivative) { - // Case 0: either the producer or the consumer are infinitely fast. Wait time - // is the time to produce an output. - if (output_time == 0 || input_time == 0) { - if (output_time_derivative) { - *output_time_derivative = 1.0L; - } - if (input_time_derivative) { - *input_time_derivative = 0.0L; - } - if (buffer_size_derivative) { - *buffer_size_derivative = 0.0L; - } - return output_time; - } - // Case 1: the consumer is slower than the producer. Wait time is 0 since the - // buffer will be full in the long run. - if (input_time > output_time) { - if (output_time_derivative) { - *output_time_derivative = 0.0L; - } - if (input_time_derivative) { - *input_time_derivative = 0.0L; - } - if (buffer_size_derivative) { - *buffer_size_derivative = 0.0L; - } - return 0; - } - // Case 2: the consumer and the producer are equally fast. Expected wait time - // decreases linearly with the size of the buffer. 
- if (input_time == output_time) { - const double p_buffer_empty = 1.0L / (buffer_size + 1.0L); - if (output_time_derivative) { - *output_time_derivative = p_buffer_empty; - } - if (input_time_derivative) { - *input_time_derivative = 0.0L; - } - if (buffer_size_derivative) { - const double p_buffer_empty_der = -1.0L / Square(buffer_size + 1.0L); - *buffer_size_derivative = p_buffer_empty_der * output_time; - } - return p_buffer_empty * output_time; - } - // Case 3: the producer is slower than the consumer and neither is infinitely - // fast. - const double alpha = 1.0L / input_time; - const double beta = 1.0L / output_time; - const double ratio_pow = std::pow((beta / alpha), (buffer_size + 1.0L)); - const double p_buffer_empty = (1.0L - beta / alpha) / (1.0L - ratio_pow); - if (output_time_derivative) { - *output_time_derivative = - (1.0L - ratio_pow - - (output_time - input_time) * (buffer_size + 1.0L) * ratio_pow / - output_time) / - Square(1.0L - ratio_pow); - } - if (input_time_derivative) { - *input_time_derivative = - (ratio_pow - 1.0L + - (buffer_size + 1.0L) * ratio_pow * (alpha / beta - 1.0L)) / - Square(1.0L - ratio_pow); - } - if (buffer_size_derivative) { - const double p_buffer_empty_der = (1.0L - beta / alpha) * ratio_pow * - std::log(beta / alpha) / - Square(1.0L - ratio_pow); - *buffer_size_derivative = p_buffer_empty_der * output_time; - } - - return p_buffer_empty * output_time; -} - // The first input of InterleaveMany corresponds to the input dataset whose // elements are used to create the (derived) input datasets whose elements are // interleaved as output. @@ -700,6 +610,145 @@ std::shared_ptr MakeUnknownNode(Node::Args args) { return std::make_shared(std::move(args)); } +double Node::ComputeWaitTime(const double& output_time, + const double& input_time, + const double& buffer_size, + double* output_time_derivative, + double* input_time_derivative, + double* buffer_size_derivative) { + // If we set x=`input_time`, y=`output_time`, n=`buffer_size`, + // p=`p_buffer_empty`, T=`wait_time`, then we have: + // if y = 0, then p = 0; + // elif x = 0, then p = 1; + // elif x = y, then p = 1 / (n+1); + // else p = [1 - x/y] / [1 - power(x/y, n+1)]. + // + // We also have T = p * y, and derivatives of T w.r.t. x, y, n are computed: + // dT/dx = dp/dx * y, + // dT/dy = p + dp/dy * y, + // dT/dn = dp/dn * y. + // Then the remaining work is to compute dp/dx, dp/dy, dp/dn by considering + // different cases and substitute the values into above formulas. + + // Case 1: if producer is infinitely fast. The buffer will always be full. + // Wait time will always be 0. + if (output_time == 0) { + if (output_time_derivative) { + // Note a common error is `*output_time_derivative = 0` since p=0 on the + // line y=0 doesn't imply dp/dy = 0 there. Actually to compute dp/dy at + // (x,0), we need to consider lim_{dy->0+} [p(x,dy)-p(x,0)] / dy, where + // p(x,0)=0 and p(x,dy) = [1 - x/dy] / [1 - power(x/dy, n+1)]. + if (buffer_size == 0 || input_time == 0) { + *output_time_derivative = 1.0L; + } else { + *output_time_derivative = 0.0L; + } + } + if (input_time_derivative) { + *input_time_derivative = 0.0L; + } + if (buffer_size_derivative) { + *buffer_size_derivative = 0.0L; + } + return 0.0L; + } + + // Case 2: if consumer is infinitely fast. Wait time is always the time to + // produce an output. 
+ if (input_time == 0) { + if (output_time_derivative) { + *output_time_derivative = 1.0L; + } + if (input_time_derivative) { + // Note a common error is `*input_time_derivative = 0` since p=1 on the + // line x=0 doesn't imply dp/dx = 0 there. Actually to compute dp/dx at + // (0,y), we need to consider lim_{dx->0+} [p(dx,y)-p(0,y)] / dx, where + // p(0,y)=1, p(dx,y) = [1 - dx/y] / [1 - power(dx/y, n+1)] if y!=0. + if (buffer_size == 0) { + *input_time_derivative = 0.0L; + } else { + *input_time_derivative = -1.0L; + } + } + if (buffer_size_derivative) { + *buffer_size_derivative = 0.0L; + } + return output_time; + } + + // Case 3: the consumer and the producer are equally fast. Expected wait time + // decreases linearly with the size of the buffer. + if (input_time == output_time) { + const double p_buffer_empty = 1.0L / (buffer_size + 1.0L); + const double p_buffer_empty_der = + -buffer_size / (2.0L * buffer_size + 2.0L); + if (output_time_derivative) { + // Note a common error is `*output_time_derivative = p_buffer_empty` since + // p=1/(n+1) on the line x=y doesn't imply dp/dy = 0 there. Actually to + // compute dp/dy at (y,y), we need to consider + // lim_{dy->0} [p(y,y+dy)-p(y,y)] / dy, where p(y,y)=1/(n+1), + // p(y,y+dy) = [1 - y/(y+dy)] / [1 - power(y/(y+dy), n+1)]. + *output_time_derivative = p_buffer_empty - p_buffer_empty_der; + } + if (input_time_derivative) { + // Note a common error is `*input_time_derivative = 0` since + // p=1/(n+1) on the line x=y doesn't imply dp/dx = 0 there. Actually to + // compute dp/dx at (x,x), we need to consider + // lim_{dx->0} [p(x+dx,x)-p(x,x)] / dx, where p(x,x)=1/(n+1), + // p(x+dx,x) = [1 - (x+dx)/x] / [1 - power((x+dx)/x, n+1)]. + *input_time_derivative = p_buffer_empty_der; + } + if (buffer_size_derivative) { + *buffer_size_derivative = -output_time / Square(buffer_size + 1.0L); + } + return p_buffer_empty * output_time; + } + + // Case 4: the consumer is slower than the producer and neither is infinitely + // fast. Case 4 and Case 5 actually follow same formula. Separate them for + // numerical computation reasons. + if (input_time > output_time) { + const double ratio = output_time / input_time; + const double ratio_pow = std::pow(ratio, buffer_size); + const double p_buffer_empty = + ratio_pow * (1.0L - ratio) / (1.0L - ratio * ratio_pow); + const double p_buffer_empty_der = + (buffer_size - (buffer_size + 1.0L) * ratio + ratio_pow * ratio) * + ratio_pow / ratio / Square(1.0L - ratio_pow * ratio); + if (output_time_derivative) { + *output_time_derivative = p_buffer_empty + p_buffer_empty_der * ratio; + } + if (input_time_derivative) { + *input_time_derivative = -p_buffer_empty_der * Square(ratio); + } + if (buffer_size_derivative) { + *buffer_size_derivative = p_buffer_empty / (1.0L - ratio_pow * ratio) * + std::log(ratio) * output_time; + } + return p_buffer_empty * output_time; + } + + // Case 5: the producer is slower than the consumer and neither is infinitely + // fast. 
+ const double ratio = input_time / output_time; + const double ratio_pow = std::pow(ratio, buffer_size); + const double p_buffer_empty = (1.0L - ratio) / (1.0L - ratio_pow * ratio); + const double p_buffer_empty_der = + ((buffer_size + 1.0L - buffer_size * ratio) * ratio_pow - 1.0L) / + Square(1.0L - ratio_pow * ratio); + if (output_time_derivative) { + *output_time_derivative = p_buffer_empty - p_buffer_empty_der * ratio; + } + if (input_time_derivative) { + *input_time_derivative = p_buffer_empty_der; + } + if (buffer_size_derivative) { + *buffer_size_derivative = p_buffer_empty / (1.0L - ratio_pow * ratio) * + ratio_pow * ratio * std::log(ratio) * output_time; + } + return p_buffer_empty * output_time; +} + void Node::CollectTunableParameters( absl::flat_hash_map>* parameters) const { CollectTunableParametersHelper(parameters); diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index 97ac9dd35ae..a4af549fad2 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -285,6 +285,26 @@ class Node { autotune_.store(autotune); } + // Given the average time between output events (`output_time`), the average + // time between input events (`input_time`) and the buffer size, the method + // computes the expected time an input event will have to wait. + // + // The wait time is approximated as the product of the probability the buffer + // will be empty and the time it takes to produce an element into the buffer. + // + // The formula used for computing the probability is derived by modeling the + // problem as an M/M/1/K queue + // (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue). + // + // Collects derivatives of `ComputeWaitTime` w.r.t `output_time`, `input_time' + // and `buffer_size` if the corresponding pointers are not `nullptr`. + static double ComputeWaitTime(const double& output_time, + const double& input_time, + const double& buffer_size, + double* output_time_derivative, + double* input_time_derivative, + double* buffer_size_derivative); + // Collects tunable parameters in the subtree rooted in this node. 
void CollectTunableParameters( absl::flat_hash_map>* parameters) const diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc index 92c309bd476..898594b7c81 100644 --- a/tensorflow/core/framework/model_test.cc +++ b/tensorflow/core/framework/model_test.cc @@ -757,6 +757,78 @@ TEST(SnapshotTest, Model) { } } } + +class ComputeWaitTimeTest + : public ::testing::TestWithParam> {}; + +TEST_P(ComputeWaitTimeTest, Model) { + const double output_time = std::get<0>(GetParam()); + const double input_time = std::get<1>(GetParam()); + const double buffer_size = std::get<2>(GetParam()); + + double output_time_derivative = 0.0L; + double input_time_derivative = 0.0L; + double buffer_size_derivative = 0.0L; + + double wait_time = model::Node::ComputeWaitTime( + output_time, input_time, buffer_size, &output_time_derivative, + &input_time_derivative, &buffer_size_derivative); + + double new_wait_time = + model::Node::ComputeWaitTime(output_time + kParameterStep, input_time, + buffer_size, nullptr, nullptr, nullptr); + EXPECT_NEAR(output_time_derivative, + (new_wait_time - wait_time) / kParameterStep, + kComparisonPrecision); + + if (output_time >= kParameterStep) { + new_wait_time = + model::Node::ComputeWaitTime(output_time - kParameterStep, input_time, + buffer_size, nullptr, nullptr, nullptr); + EXPECT_NEAR(output_time_derivative, + (wait_time - new_wait_time) / kParameterStep, + kComparisonPrecision); + } + + new_wait_time = + model::Node::ComputeWaitTime(output_time, input_time + kParameterStep, + buffer_size, nullptr, nullptr, nullptr); + EXPECT_NEAR(input_time_derivative, + (new_wait_time - wait_time) / kParameterStep, + kComparisonPrecision); + + if (input_time >= kParameterStep) { + new_wait_time = + model::Node::ComputeWaitTime(output_time, input_time - kParameterStep, + buffer_size, nullptr, nullptr, nullptr); + EXPECT_NEAR(input_time_derivative, + (wait_time - new_wait_time) / kParameterStep, + kComparisonPrecision); + } + + new_wait_time = model::Node::ComputeWaitTime(output_time, input_time, + buffer_size + kParameterStep, + nullptr, nullptr, nullptr); + EXPECT_NEAR(buffer_size_derivative, + (new_wait_time - wait_time) / kParameterStep, + kComparisonPrecision); + + if (buffer_size >= kParameterStep) { + new_wait_time = model::Node::ComputeWaitTime(output_time, input_time, + buffer_size - kParameterStep, + nullptr, nullptr, nullptr); + EXPECT_NEAR(buffer_size_derivative, + (wait_time - new_wait_time) / kParameterStep, + kComparisonPrecision); + } +} + +INSTANTIATE_TEST_SUITE_P( + Test, ComputeWaitTimeTest, + ::testing::Combine(::testing::Values(0, 20, 40, 80, 100), + ::testing::Values(0, 20, 40, 80, 100), + ::testing::Values(0, 1, 2, 4, 10, 20, 40))); + } // namespace } // namespace model } // namespace data From 805c399ead74b45ee5587d786e5fbd20a6592768 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Wed, 13 May 2020 17:28:34 -0700 Subject: [PATCH 0541/1533] Legalize TF broadcasting ops via dedicated xla_chlo.broadcast_* ops. * Also enable patterns to expand these ops to explicit broadcast forms. * Cleans up some test cases that it was not clear they were adding value. * Also adds a registration for the shape dialect to the tf2xla bridge (this was causing an assert in the TF AOT compiler tests). 
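In terms of pass wiring, the staging works by also registering the CHLO->HLO expansion patterns and marking the intermediate dialect illegal inside the same TF->HLO conversion. A minimal sketch of that setup follows; it mirrors the legalize_tf.cc change in this patch, and the xla_chlo dialect class name passed to addIllegalDialect is an assumption here rather than something taken from the diff:

// Sketch only: TF binary/compare ops first lower to xla_chlo.broadcast_* via
// the DRR patterns in legalize_tf_patterns.td; the CHLO->HLO patterns
// registered here then expand them into shape.broadcast +
// xla_hlo.dynamic_broadcast_in_dim followed by the plain element-wise
// xla_hlo op (see the legalize-tf.mlir checks below).
void SetUpLegalization(MLIRContext *context,
                       OwningRewritePatternList &patterns,
                       ConversionTarget &target) {
  xla_chlo::PopulateLegalizeChloToHloPatterns(context, &patterns);
  // Keeping CHLO illegal forces that expansion to run before the conversion
  // is considered successful (dialect class name assumed).
  target.addIllegalDialect<xla_chlo::XlaHloClientDialect>();
}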
PiperOrigin-RevId: 311437273 Change-Id: I2c2a1367d1b5d208b9d92f4d0feb665c3a09c786 --- tensorflow/compiler/mlir/xla/BUILD | 1 + .../compiler/mlir/xla/tests/legalize-tf.mlir | 246 +++++------------- .../mlir/xla/transforms/legalize_tf.cc | 7 + .../xla/transforms/legalize_tf_patterns.td | 38 +-- tensorflow/compiler/tf2xla/BUILD | 1 + tensorflow/compiler/tf2xla/mlir_tf2xla.cc | 2 + 6 files changed, 102 insertions(+), 193 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 590595a668f..12334e463fa 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -132,6 +132,7 @@ cc_library( "transforms/legalize_tf_control_flow.cc", ], deps = [ + ":chlo_legalize_to_hlo", ":convert_op_folder", ":hlo", "//tensorflow/compiler/mlir/tensorflow", diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index a5353beb772..450910b2e4d 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -426,6 +426,8 @@ func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tenso //===----------------------------------------------------------------------===// // Binary op legalizations. +// Most of these expand from the same pattern. Full semantics are +// verified for tf.Add and pattern application only for the rest. //===----------------------------------------------------------------------===// // CHECK-LABEL: func @add @@ -439,19 +441,49 @@ func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { } // CHECK-LABEL: func @broadcast_add +// TODO(laurenzo): Change this to a (5 + 2x1) shaped add to make the check +// patterns unambiguous and more interesting (once broadcastable trait is +// fixed upstream). 
func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } // CHECK-LABEL: func @broadcast_multi_dim_add +// TODO(laurenzo): Change this to a (4x1x1 + 1x4x4x4) shaped add once upstream +// broadcastable bug is fixed (helps make the CHECK matching unambiguous) func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [4, 1, 1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0: tensor<4x4x4x4xi32> } +// CHECK-LABEL: func @add_dynamic +func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %4, %5 : tensor + %0 = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0: tensor +} + // CHECK-LABEL: func @div func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> @@ -460,13 +492,6 @@ func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { return %0: tensor<2xi32> } -// CHECK-LABEL: func @broadcast_div -func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Div"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - // CHECK-LABEL: func @shift_left func 
@shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK: xla_hlo.shift_left %arg0, %arg1 : tensor<4xi32> @@ -474,13 +499,6 @@ func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { return %0 : tensor<4xi32> } -// CHECK-LABEL: func @div_dynamic -func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Div"(%arg0, %arg1) : (tensor, tensor) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @div_unranked func @div_unranked(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor { // CHECK: tf.Div @@ -510,13 +528,6 @@ func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { return %0: tensor<2xi32> } -// CHECK-LABEL: func @broadcast_mul -func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - // CHECK-LABEL: func @real_div func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> @@ -524,13 +535,6 @@ func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { return %0: tensor<2xi32> } -// CHECK-LABEL: func @broadcast_real_div -func @broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.RealDiv"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - // CHECK-LABEL: func @sub func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: %0 = xla_hlo.subtract %arg0, %arg0 : tensor<2xi32> @@ -539,13 +543,6 @@ func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { return %0: tensor<2xi32> } -// CHECK-LABEL: func @broadcast_sub -func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Sub"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - // CHECK-LABEL: func @shift_right func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK: xla_hlo.shift_right_arithmetic %arg0, %arg1 : tensor<4xi32> @@ -553,13 +550,6 @@ func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { return %0 : tensor<4xi32> } -// CHECK-LABEL: func @broadcast_shift_right -func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { - // CHECK: "xla_hlo.shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> - return %0 : tensor<2x4xi32> -} - // CHECK-LABEL: func @shift_right_unsigned func @shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<4xui8>) -> tensor<4xui8> { // CHECK: tf.RightShift @@ -581,20 +571,6 @@ func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @and_broadcast -func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.and" - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @and_dynamic -func 
@and_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - // CHECK-NEXT: "xla_hlo.and" - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @and_unranked func @and_unranked(%arg0: tensor<*xi1>, %arg1: tensor<*xi1>) -> tensor<*xi1> { // CHECK: tf.LogicalAnd @@ -609,20 +585,6 @@ func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @or_broadcast -func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @or_dynamic -func @or_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @bitwise_or func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK-NEXT: xla_hlo.or @@ -630,20 +592,6 @@ func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { return %0: tensor<4xi32> } -// CHECK-LABEL: func @bitwise_or_broadcast -func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> - return %0: tensor<1x4xi8> -} - -// CHECK-LABEL: func @bitwise_or_dynamic -func @bitwise_or_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @bitwise_and func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK-NEXT: xla_hlo.and @@ -651,20 +599,6 @@ func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { return %0: tensor<4xi32> } -// CHECK-LABEL: func @bitwise_and_broadcast -func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> - return %0: tensor<1x4xi8> -} - -// CHECK-LABEL: func @bitwise_and_dynamic -func @bitwise_and_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @pow func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK-NEXT: xla_hlo.power @@ -672,13 +606,6 @@ func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %0: tensor<2xf32> } -// CHECK-LABEL: func @pow_dynamic -func @pow_dynamic(%arg0: tensor) -> tensor { - // CHECK-NEXT: xla_hlo.power - %0 = "tf.Pow"(%arg0, %arg0) : (tensor, tensor) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @diag_part // CHECK-SAME: %[[ARG:.*]]: tensor<4x3x4x3xf32> func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { @@ -862,6 +789,8 @@ func @broadcast_to(%arg0: tensor<16xf32>) -> tensor<16x16x16x16xf32> { //===----------------------------------------------------------------------===// // Equality op legalizations. +// tf.Equal and tf.NotEqual expand from the same pattern. 
Full semantics are +// verified for tf.Equal and pattern application only for tf.NotEqual //===----------------------------------------------------------------------===// // CHECK-LABEL: func @equal @@ -873,14 +802,26 @@ func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-LABEL: func @equal_dynamic func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0: tensor } // CHECK-LABEL: func @equal_broadcast func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0: tensor<1x2xi1> } @@ -927,70 +868,42 @@ func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @notequal_dynamic -func @notequal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @notequal_broadcast -func @notequal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error -func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @notequal_incompatible_shape_broadcastable -func 
@notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @notequal_incompatible_shape_dynamic -func @notequal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @notequal_incompatible_shape_both_dynamic -func @notequal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - //===----------------------------------------------------------------------===// // Compare op legalizations. +// These expand from the same pattern. Full semantics are checked for +// tf.Greater. Others just check that the pattern applied. //===----------------------------------------------------------------------===// // CHECK-LABEL: func @greater func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} + // CHECK: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} %0 = "tf.Greater"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } // CHECK-LABEL: func @broadcast_greater func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0: tensor<1x2xi1> } // CHECK-LABEL: func @greater_dynamic -func @greater_dynamic(%arg0: tensor) -> tensor { - // CHECK: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg0) : (tensor, tensor) -> tensor +func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = 
"xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } @@ -1008,13 +921,6 @@ func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @broadcast_greater_equal -func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} - %0 = "tf.GreaterEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - // CHECK-LABEL: func @less func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} @@ -1022,13 +928,6 @@ func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @broadcast_less -func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} - %0 = "tf.Less"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - // CHECK-LABEL: func @less_equal func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} @@ -1036,13 +935,6 @@ func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @broadcast_less_equal -func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} - %0 = "tf.LessEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - //===----------------------------------------------------------------------===// // Complex op legalizations. diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index a0a5e47ad65..10bac232b0f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -44,9 +44,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/xla/convert_op_folder.h" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/client/sharding_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -4955,7 +4957,12 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { ConvertRandomShuffleOp, ConvertVariableShapeOp, ConvertXlaShardingOp, ConvertXlaDynamicUpdateSliceOp>(op->getContext()); + // Populate with CHLO->HLO lowerings to account for TF ops legalized to + // CHLO first. 
+ xla_chlo::PopulateLegalizeChloToHloPatterns(context, &patterns); + ConversionTarget target(*context); + target.addIllegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 2a27c1f2966..959902692dc 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -18,6 +18,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/xla/ir/chlo_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" def SignedIntTensor : TensorOf<[I1, I8, I16, I32, I64]>; @@ -80,6 +81,9 @@ def BiasAddFeatureDimension : NativeCodeCall< // $input needs to be a ranked tensor to identify index of the feature // dimension depending on the data_format 'NHWC' or 'NCHW'. +// TODO(laurenzo): This should be converted to do explicit broadcasting since +// it can generate broadcast dimensions that are not compatible with the simple +// xla_chlo.add broadcast_dims. def : Pat<(TF_BiasAddOp AnyRankedTensor:$input, $bias, $data_format), (HLO_AddOp $input, $bias, (BiasAddFeatureDimension $data_format, $input))>; @@ -96,16 +100,16 @@ class DirectBinaryPat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), (ToOp $l, $r, (BinBroadcastDimensions $l, $r))>; -foreach fromToBinPair = [[TF_AddOp, HLO_AddOp], - [TF_AddV2Op, HLO_AddOp], - [TF_DivOp, HLO_DivOp], - [TF_LeftShiftOp, HLO_ShiftLeftOp], - [TF_MaximumOp, HLO_MaxOp], - [TF_MinimumOp, HLO_MinOp], - [TF_MulOp, HLO_MulOp], - [TF_PowOp, HLO_PowOp], - [TF_RealDivOp, HLO_DivOp], - [TF_SubOp, HLO_SubOp]] in +foreach fromToBinPair = [[TF_AddOp, HLOClient_BroadcastAddOp], + [TF_AddV2Op, HLOClient_BroadcastAddOp], + [TF_DivOp, HLOClient_BroadcastDivOp], + [TF_LeftShiftOp, HLOClient_BroadcastShiftLeftOp], + [TF_MaximumOp, HLOClient_BroadcastMaxOp], + [TF_MinimumOp, HLOClient_BroadcastMinOp], + [TF_MulOp, HLOClient_BroadcastMulOp], + [TF_PowOp, HLOClient_BroadcastPowOp], + [TF_RealDivOp, HLOClient_BroadcastDivOp], + [TF_SubOp, HLOClient_BroadcastSubOp]] in def : DirectBinaryPat; def LowerRightShiftSigned : @@ -196,10 +200,10 @@ class DirectLogicalBinaryPat (ToOp $l, $r, (BinBroadcastDimensions $l, $r)), [(SignedIntTensor $l)]>; -foreach fromToBinPair = [[TF_LogicalAndOp, HLO_AndOp], - [TF_LogicalOrOp, HLO_OrOp], - [TF_BitwiseOrOp, HLO_OrOp], - [TF_BitwiseAndOp, HLO_AndOp]] in +foreach fromToBinPair = [[TF_LogicalAndOp, HLOClient_BroadcastAndOp], + [TF_LogicalOrOp, HLOClient_BroadcastOrOp], + [TF_BitwiseOrOp, HLOClient_BroadcastOrOp], + [TF_BitwiseAndOp, HLOClient_BroadcastAndOp]] in def : DirectLogicalBinaryPat; //===----------------------------------------------------------------------===// @@ -208,7 +212,8 @@ foreach fromToBinPair = [[TF_LogicalAndOp, HLO_AndOp], class DirectComparePat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (HLO_CompareOp $l, $r, (BinBroadcastDimensions $l, $r), direction)>; + (HLOClient_BroadcastCompareOp + $l, $r, (BinBroadcastDimensions $l, $r), direction)>; def : DirectComparePat; def : DirectComparePat; @@ -218,7 +223,8 @@ def : DirectComparePat; class EqualityPat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r, TrueBoolAttr:$incompatible_shape_error), - (HLO_CompareOp $l, $r, (BinBroadcastDimensions $l, $r), direction), + 
(HLOClient_BroadcastCompareOp + $l, $r, (BinBroadcastDimensions $l, $r), direction), [(AreBroadcastCompatible $l, $r)]>; def : EqualityPat; diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index c2ad1255a35..897528b6de9 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -182,6 +182,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/strings", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", ], ) diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index daf261fa5d8..43793be56a7 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -95,6 +96,7 @@ static void RegisterDialects() { mlir::registerDialect(); mlir::registerDialect(); mlir::registerDialect(); + mlir::registerDialect(); return true; }(); (void)init_once; From e989b132d4ed9625dee8a3896844f81bc54d1101 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 13 May 2020 17:42:23 -0700 Subject: [PATCH 0542/1533] Simplify error logging in XNNPACK delegate Use TF_LITE_MAYBE_KERNEL_LOG to remove redundant if blocks PiperOrigin-RevId: 311439195 Change-Id: I3f75e6178061b63d01a7b935e6d23739651f37d0 --- .../delegates/xnnpack/xnnpack_delegate.cc | 327 +++++++----------- 1 file changed, 128 insertions(+), 199 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index 388509c9873..6d9b4dac8f8 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -245,10 +245,9 @@ class Subgraph { *flags = 0; return kTfLiteOk; default: - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid padding mode (%d) in node #%d", - static_cast(padding), node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid padding mode (%d) in node #%d", + static_cast(padding), node_index); return kTfLiteError; } } @@ -274,32 +273,24 @@ class Subgraph { *output_max = 6.0f; return kTfLiteOk; case kTfLiteActTanh: - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unsupported fused activation (Tanh) in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported fused activation (Tanh) in node #%d", + node_index); return kTfLiteError; case kTfLiteActSignBit: - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unsupported fused activation (Sign) in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported fused activation (Sign) in node #%d", + node_index); return kTfLiteError; case kTfLiteActSigmoid: - if (context != nullptr) { - TF_LITE_KERNEL_LOG( - context, "unsupported fused activation (Sigmoid) in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported fused activation (Sigmoid) in node #%d", + node_index); return kTfLiteError; default: - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid fused activation (%d) in node #%d", - static_cast(activation), node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid fused activation (%d) in node #%d", + 
static_cast(activation), node_index); return kTfLiteError; } } @@ -308,34 +299,26 @@ class Subgraph { const TfLiteConvParams* params, int node_index) { if (params->stride_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride width %d in node #%d", - params->stride_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride width %d in node #%d", + params->stride_width, node_index); return kTfLiteError; } if (params->stride_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride height %d in node #%d", - params->stride_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride height %d in node #%d", + params->stride_height, node_index); return kTfLiteError; } if (params->dilation_width_factor <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dilation width factor %d in node #%d", - params->dilation_width_factor, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dilation width factor %d in node #%d", + params->dilation_width_factor, node_index); return kTfLiteError; } if (params->dilation_height_factor <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dilation height factor %d in node #%d", - params->dilation_height_factor, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dilation height factor %d in node #%d", + params->dilation_height_factor, node_index); return kTfLiteError; } @@ -346,52 +329,41 @@ class Subgraph { TfLiteContext* context, const TfLiteDepthwiseConvParams* params, int output_channels, int node_index) { if (params->stride_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride width %d in node #%d", - params->stride_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride width %d in node #%d", + params->stride_width, node_index); return kTfLiteError; } if (params->stride_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride height %d in node #%d", - params->stride_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride height %d in node #%d", + params->stride_height, node_index); return kTfLiteError; } if (params->depth_multiplier <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid depth multiplier %d in node #%d", - params->depth_multiplier, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid depth multiplier %d in node #%d", + params->depth_multiplier, node_index); return kTfLiteError; } if (output_channels % params->depth_multiplier != 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "depth multiplier %d is incompatible with " - "number of output channels %d in node #%d", - params->depth_multiplier, output_channels, - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "depth multiplier %d is incompatible with " + "number of output channels %d in node #%d", + params->depth_multiplier, output_channels, + node_index); return kTfLiteError; } if (params->dilation_width_factor <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dilation width factor %d in node #%d", - params->dilation_width_factor, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dilation width factor %d in node #%d", + params->dilation_width_factor, node_index); return kTfLiteError; } if (params->dilation_height_factor <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dilation height factor %d in node 
#%d", - params->dilation_height_factor, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dilation height factor %d in node #%d", + params->dilation_height_factor, node_index); return kTfLiteError; } @@ -402,17 +374,13 @@ class Subgraph { TfLiteContext* context, const TfLiteTransposeConvParams* params, int node_index) { if (params->stride_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride width %d in node #%d", - params->stride_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride width %d in node #%d", + params->stride_width, node_index); return kTfLiteError; } if (params->stride_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride height %d in node #%d", - params->stride_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride height %d in node #%d", + params->stride_height, node_index); return kTfLiteError; } @@ -502,11 +470,9 @@ class Subgraph { TfLiteContext* context, const TfLiteFullyConnectedParams* params, int node_index) { if (params->weights_format != kTfLiteFullyConnectedWeightsFormatDefault) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unsupported non-default weights format in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported non-default weights format in node #%d", + node_index); return kTfLiteError; } @@ -517,39 +483,29 @@ class Subgraph { const TfLitePoolParams* params, int node_index) { if (params->stride_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride width %d in node #%d", - params->stride_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride width %d in node #%d", + params->stride_width, node_index); return kTfLiteError; } if (params->stride_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride height %d in node #%d", - params->stride_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride height %d in node #%d", + params->stride_height, node_index); return kTfLiteError; } if (params->filter_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid filter width %d in node #%d", - params->filter_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid filter width %d in node #%d", + params->filter_width, node_index); return kTfLiteError; } if (params->filter_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid filter height %d in node #%d", - params->filter_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid filter height %d in node #%d", + params->filter_height, node_index); return kTfLiteError; } if (params->filter_width == 1 && params->filter_height == 1) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "meaningless 1x1 pooling in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "meaningless 1x1 pooling in node #%d", + node_index); return kTfLiteError; } @@ -562,19 +518,15 @@ class Subgraph { int expected_num_outputs, int node_index) { if (node->inputs->size != expected_num_inputs) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unexpected number of inputs (%d != %d) in node #%d", - node->inputs->size, expected_num_inputs, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unexpected number of inputs (%d != %d) in node #%d", + node->inputs->size, expected_num_inputs, node_index); return kTfLiteError; } if (node->outputs->size != 
expected_num_outputs) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG( - context, "unexpected number of output (%d != %d) in node #%d", - node->outputs->size, expected_num_outputs, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unexpected number of output (%d != %d) in node #%d", + node->outputs->size, expected_num_outputs, node_index); return kTfLiteError; } return kTfLiteOk; @@ -584,11 +536,9 @@ class Subgraph { const TfLiteTensor& tensor, int tensor_index, int node_index) { if (tensor.type != kTfLiteFloat32) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG( - context, "unsupported type %s in tensor #%d in node #%d", - TfLiteTypeGetName(tensor.type), tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported type %s in tensor #%d in node #%d", + TfLiteTypeGetName(tensor.type), tensor_index, node_index); return kTfLiteError; } return kTfLiteOk; @@ -599,21 +549,17 @@ class Subgraph { int expected_num_dims, int tensor_index) { if (tensor.dims->size != expected_num_dims) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG( - context, - "unexpected number of shape dimensions (%d != %d) in tensor #%d", - tensor.dims->size, expected_num_dims, tensor_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, + "unexpected number of shape dimensions (%d != %d) in tensor #%d", + tensor.dims->size, expected_num_dims, tensor_index); return kTfLiteError; } for (int i = 0; i < tensor.dims->size; i++) { if (tensor.dims->data[i] <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dimension #%d (%d) in tensor #%d", i, - tensor.dims->data[i], tensor_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dimension #%d (%d) in tensor #%d", i, + tensor.dims->data[i], tensor_index); return kTfLiteError; } } @@ -624,25 +570,22 @@ class Subgraph { const TfLiteTensor& tensor, int tensor_index, int node_index) { if (tensor.dims->size < 1) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unexpected number of shape dimensions (%d) in " - "tensor #%d in node #%d: " - "expected at least a 1D tensor", - tensor.dims->size, tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "unexpected number of shape dimensions (%d) in " + "tensor #%d in node #%d: " + "expected at least a 1D tensor", + tensor.dims->size, tensor_index, node_index); return kTfLiteError; } // Validate that all non-channel dimensions (if any) are exactly 1. 
for (int i = 0; i < tensor.dims->size - 1; i++) { if (tensor.dims->data[i] != 1) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unexpected value %d of shape dimension #%d in " - "tensor #%d in node #%d: " - "expected 1 for non-channel dimensions", - tensor.dims[i], i, tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, + "unexpected value %d of shape dimension #%d in " + "tensor #%d in node #%d: " + "expected 1 for non-channel dimensions", + tensor.dims[i], i, tensor_index, node_index); return kTfLiteError; } } @@ -654,12 +597,11 @@ class Subgraph { int node_index) { // TODO(b/149120844): remove checks once dynamic tensors are supported if (tensor.allocation_type == kTfLiteDynamic) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid allocation type in tensor #%d in node #%d: " - "expected non-dynamic tensor", - tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, + "invalid allocation type in tensor #%d in node #%d: " + "expected non-dynamic tensor", + tensor_index, node_index); return kTfLiteError; } return kTfLiteOk; @@ -671,12 +613,11 @@ class Subgraph { int node_index) { if (tensor.allocation_type != kTfLiteMmapRo || tensor.data.raw_const == nullptr) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid allocation type in tensor #%d in node #%d: " - "expected static read-only tensor", - tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, + "invalid allocation type in tensor #%d in node #%d: " + "expected static read-only tensor", + tensor_index, node_index); return kTfLiteError; } return kTfLiteOk; @@ -1134,23 +1075,19 @@ class Subgraph { const int32_t input_channels = filter_tensor.dims->data[1]; if (input_tensor.dims->size == 0) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "unexpected number of shape dimensions %d in tensor #%d", - input_tensor.dims->size, node->inputs->data[0]); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "unexpected number of shape dimensions %d in tensor #%d", + input_tensor.dims->size, node->inputs->data[0]); return kTfLiteError; } int32_t num_input_elements = 1; for (int i = 0; i < input_tensor.dims->size; i++) { if (input_tensor.dims->data[i] <= 0) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG(logging_context, - "invalid dimension #%d (%d) in tensor #%d", i, - input_tensor.dims->data[i], node->inputs->data[0]); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, "invalid dimension #%d (%d) in tensor #%d", i, + input_tensor.dims->data[i], node->inputs->data[0]); return kTfLiteError; } num_input_elements *= input_tensor.dims->data[i]; @@ -1163,55 +1100,47 @@ class Subgraph { for (int i = 0; i < input_tensor.dims->size - 1; i++) { if (input_tensor.dims->data[i] != output_tensor.dims->data[i]) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "mismatch in shape dimension %d (%d != %d) in input and output " - "tensors of FULLY_CONNECTED operator #%d", - i, input_tensor.dims->data[i], output_tensor.dims->data[i], - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "mismatch in shape dimension %d (%d != %d) in input and output " + "tensors of FULLY_CONNECTED operator #%d", + i, input_tensor.dims->data[i], output_tensor.dims->data[i], + node_index); return kTfLiteError; } } } else { if (num_input_elements % input_channels != 0) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "number of elements in input tensor #%d in 
FULLY_CONNECTED " - "operator is not divisible by input channels (%d)", - node->inputs->data[0], input_channels); - return kTfLiteError; - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "number of elements in input tensor #%d in FULLY_CONNECTED " + "operator is not divisible by input channels (%d)", + node->inputs->data[0], input_channels); + return kTfLiteError; } TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor, 2, node->outputs->data[0])); if (output_tensor.dims->data[0] != num_input_elements / input_channels) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "batch size %d in output tensor #%d in FULLY_CONNECTED operator " - "does not match batch size %d in reshaped input tensor #%d", - output_tensor.dims->data[0], node->outputs->data[0], - num_input_elements / input_channels, node->inputs->data[0]); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "batch size %d in output tensor #%d in FULLY_CONNECTED operator " + "does not match batch size %d in reshaped input tensor #%d", + output_tensor.dims->data[0], node->outputs->data[0], + num_input_elements / input_channels, node->inputs->data[0]); return kTfLiteError; } } if (output_tensor.dims->data[output_tensor.dims->size - 1] != output_channels) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "number of channels %d in output tensor #%d does not match output " - "channels %d in filter tensor #%d", - output_tensor.dims->data[output_tensor.dims->size - 1], - node->outputs->data[0], output_channels, node->inputs->data[1]); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "number of channels %d in output tensor #%d does not match output " + "channels %d in filter tensor #%d", + output_tensor.dims->data[output_tensor.dims->size - 1], + node->outputs->data[0], output_channels, node->inputs->data[1]); return kTfLiteError; } From a76e002aa82c40d272c66ef56d43a77c5cc106b8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 17:42:35 -0700 Subject: [PATCH 0543/1533] Internal change PiperOrigin-RevId: 311439232 Change-Id: Ieed3345eef5dc74a2b0cc4805ed5269bf775a405 --- tensorflow/python/ops/math_ops.py | 4 ---- tensorflow/python/ops/math_ops_test.py | 6 ------ 2 files changed, 10 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 749aa89593a..4c4982c6fd5 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -82,7 +82,6 @@ from tensorflow.python.framework import graph_util from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_data_flow_ops @@ -439,9 +438,6 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: - # We do conversion here to make sure at least x is a tensor. 
- if not tensor_util.is_tensor(x): - x = ops.convert_to_tensor(x) return x / y diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 6171ea037d9..2405eec9e49 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -495,12 +495,6 @@ class DivAndModTest(test_util.TensorFlowTestCase): # Consistent with desire to get numerator self.assertAllEqual(tf_result, expanded_nums) - def testWithPythonValue(self): - # Test case for GitHub issue 39475: - # https://github.com/tensorflow/tensorflow/issues/39475 - x = math_ops.divide(5, 2) - self.assertTrue(isinstance(x, ops.Tensor)) - @test_util.run_all_in_graph_and_eager_modes class DivNoNanTest(test_util.TensorFlowTestCase): From 43691f9b891045f41b59ec1afbe06637c19d2377 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Wed, 13 May 2020 17:52:25 -0700 Subject: [PATCH 0544/1533] [XLA] Add use_names column to buffer info debug string to help with debugging. PiperOrigin-RevId: 311440576 Change-Id: I060aed0171625c79bfa7d8ae821f098670d6c84f --- .../xla/service/memory_space_assignment.cc | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 8752e870bb7..742de71e74c 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -585,23 +585,35 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( // definition_time: int. Logical time this value was defined in the schedule. // use_times: string. This is a semicolon-separated list of integers for all // the use times. + // use_names: string. This is a semicolon-separated list of string + // representation of uses. if (debug_str->empty()) { // Append the column names. absl::StrAppend(debug_str, - "buffer_id,buffer_name,alt_mem_benefit,size,definition_" - "time,use_times\n"); + "buffer_id,buffer_name,alt_mem_benefit,size," + "definition_time,use_times,use_names\n"); } const HloBuffer& buffer = alias_analysis_.GetBufferContainingValue(*interval.buffer); const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); int64 definition_time = instruction_schedule.at(interval.buffer->defining_position().instruction); - std::set use_times; + std::vector> uses; for (const HloValue* value : buffer.values()) { for (const HloUse& use : value->uses()) { - use_times.insert(instruction_schedule.at(use.instruction)); + uses.push_back( + {instruction_schedule.at(use.instruction), use.ToString()}); } } + absl::c_sort(uses); + std::vector use_times; + std::vector use_names; + use_times.reserve(uses.size()); + use_names.reserve(uses.size()); + for (auto use : uses) { + use_times.push_back(use.first); + use_names.push_back(use.second); + } absl::StrAppend(debug_str, buffer.id(), ","); absl::StrAppend(debug_str, "\"", interval.buffer->ToShortString(), "\","); @@ -612,7 +624,8 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( debug_str, alternate_memory_benefit ? 
*alternate_memory_benefit : 0, ","); absl::StrAppend(debug_str, interval.size, ","); absl::StrAppend(debug_str, definition_time, ","); - absl::StrAppend(debug_str, "\"", absl::StrJoin(use_times, ";"), "\""); + absl::StrAppend(debug_str, "\"", absl::StrJoin(use_times, ";"), "\","); + absl::StrAppend(debug_str, "\"", absl::StrJoin(use_names, ";"), "\""); absl::StrAppend(debug_str, "\n"); } From 649c80888967bc3f0d9e60f51ff69e5c173537ec Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 13 May 2020 18:35:48 -0700 Subject: [PATCH 0545/1533] Reuse existing util functions in ReplicateToIslandPass and add back a test for remapping results (NFC). PiperOrigin-RevId: 311446065 Change-Id: Iba9516da76f6df9459b5ad323d9ac9fd7563ded7 --- .../tensorflow/tests/replicate_to_island.mlir | 22 ++++++++++++++++ .../transforms/replicate_to_island.cc | 26 +++++++------------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir index c8b4ad2cb9f..8da252fc832 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir @@ -119,3 +119,25 @@ func @replicate_control() { // CHECK: %[[REPLICA_1:.*]] = tf_executor.island // CHECK: %[[SINK:.*]] = tf_executor.island(%[[REPLICA_0]], %[[REPLICA_1]]) // CHECK: tf_executor.fetch %[[SINK]] + + +// Tests replicate results are remapped correctly. +// CHECK-LABEL: func @replicate_result +func @replicate_result(%arg0: tensor, %arg1: tensor) { + %0:4 = tf_executor.graph { + %1:5 = tf_executor.island { + %2:4 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {n = 2 : i32} { + %3 = "tf.opA"(%arg2) : (tensor) -> tensor + %4 = "tf.opB"(%arg2) : (tensor) -> tensor + tf_device.return %3, %4 : tensor, tensor + } + tf_executor.yield %2#0, %2#1, %2#2, %2#3 : tensor, tensor, tensor, tensor + } + tf_executor.fetch %1#0, %1#1, %1#2, %1#3 : tensor, tensor, tensor, tensor + } + return +} + +// CHECK: %[[REPLICA_0:.*]]:2, %{{.*}} = tf_executor.island +// CHECK: %[[REPLICA_1:.*]]:2, %{{.*}} = tf_executor.island +// CHECK: tf_executor.fetch %[[REPLICA_0]]#0, %[[REPLICA_1]]#0, %[[REPLICA_0]]#1, %[[REPLICA_1]]#1 diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index fe9283d6932..2fd230005d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -156,9 +156,9 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // }) {device = "/DEVICE:3"} : () -> tensor // tf_executor.yield %a1, %b1 : tensor, tensor // } -LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, - tf_executor::IslandOp island_op, - tf_device::ReplicateOp replicate_op) { +void CreateIslandsFromReplicate(const Dialect* tf_dialect, + tf_executor::IslandOp island_op, + tf_device::ReplicateOp replicate_op) { OpBuilder builder(island_op); const int num_replicas = replicate_op.n().getLimitedValue(); @@ -199,21 +199,17 @@ LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, } island_op.erase(); - return success(); } // Finds islands with a single `tf_device.replicate` and create individual // islands per replica of the replicate. 
-LogicalResult LowerSingleIslandReplicateToIslands( - const Dialect* tf_dialect, tf_executor::IslandOp island_op) { - if (!hasSingleElement(island_op.GetBody().without_terminator())) - return success(); +void LowerSingleIslandReplicateToIslands(const Dialect* tf_dialect, + tf_executor::IslandOp island_op) { + if (!island_op.WrapsSingleOp()) return; if (auto replicate_op = llvm::dyn_cast(&island_op.GetBody().front())) - return CreateIslandsFromReplicate(tf_dialect, island_op, replicate_op); - - return success(); + CreateIslandsFromReplicate(tf_dialect, island_op, replicate_op); } void ReplicateToIslandPass::runOnFunction() { @@ -223,13 +219,9 @@ void ReplicateToIslandPass::runOnFunction() { getFunction().emitError() << "'tf' dialect is not registered"; } - auto result = getFunction().walk([&](tf_executor::IslandOp island_op) { - if (failed(LowerSingleIslandReplicateToIslands(tf_dialect, island_op))) - return WalkResult::interrupt(); - return WalkResult::advance(); + getFunction().walk([&](tf_executor::IslandOp island_op) { + LowerSingleIslandReplicateToIslands(tf_dialect, island_op); }); - - if (result.wasInterrupted()) return signalPassFailure(); } } // anonymous namespace From 22eb1624cb3b2eb4e2369bcba35bd9156aa080d4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 18:39:40 -0700 Subject: [PATCH 0546/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311446541 Change-Id: I40afd0237cbf7947fe620390ff2788dbbb3203e9 --- tensorflow/go/op/wrappers.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index bab430e1472..598e3a48bfe 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -17329,13 +17329,13 @@ func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) { // SparseBincountAttr is an optional argument to SparseBincount. type SparseBincountAttr func(optionalAttr) -// SparseBincountBinaryCount sets the optional binary_count attribute to value. +// SparseBincountBinaryOutput sets the optional binary_output attribute to value. // // value: bool; Whether the kernel should count the appearance or number of occurrences. // If not specified, defaults to false -func SparseBincountBinaryCount(value bool) SparseBincountAttr { +func SparseBincountBinaryOutput(value bool) SparseBincountAttr { return func(m optionalAttr) { - m["binary_count"] = value + m["binary_output"] = value } } @@ -17434,13 +17434,13 @@ func Selu(scope *Scope, features tf.Output) (activations tf.Output) { // DenseBincountAttr is an optional argument to DenseBincount. type DenseBincountAttr func(optionalAttr) -// DenseBincountBinaryCount sets the optional binary_count attribute to value. +// DenseBincountBinaryOutput sets the optional binary_output attribute to value. // // value: bool; Whether the kernel should count the appearance or number of occurrences. // If not specified, defaults to false -func DenseBincountBinaryCount(value bool) DenseBincountAttr { +func DenseBincountBinaryOutput(value bool) DenseBincountAttr { return func(m optionalAttr) { - m["binary_count"] = value + m["binary_output"] = value } } @@ -38654,13 +38654,13 @@ func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, // RaggedBincountAttr is an optional argument to RaggedBincount. type RaggedBincountAttr func(optionalAttr) -// RaggedBincountBinaryCount sets the optional binary_count attribute to value. 
+// RaggedBincountBinaryOutput sets the optional binary_output attribute to value. // // value: bool; Whether the kernel should count the appearance or number of occurrences. // If not specified, defaults to false -func RaggedBincountBinaryCount(value bool) RaggedBincountAttr { +func RaggedBincountBinaryOutput(value bool) RaggedBincountAttr { return func(m optionalAttr) { - m["binary_count"] = value + m["binary_output"] = value } } From 43adb4ee3b96e250e05533bc2470813c2f70272c Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 13 May 2020 18:46:10 -0700 Subject: [PATCH 0547/1533] Remove nanopb dependency as it should not be needed anymore (grpc version on master no longer needs it) PiperOrigin-RevId: 311447262 Change-Id: Icb8b98542188be57e5402cdbe4d9fb90c2c84f8d --- tensorflow/workspace.bzl | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 6a958e1b00f..9b745656125 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -643,17 +643,6 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) - tf_http_archive( - name = "com_github_nanopb_nanopb", - sha256 = "18234d9f01b57248472a9bfa65c3379352b5d66c15b0ef1c2b4feece4b5670fe", - build_file = "@com_github_grpc_grpc//third_party:nanopb.BUILD", - strip_prefix = "nanopb-0.4.1", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/nanopb/nanopb/archive/0.4.1.tar.gz", - "https://github.com/nanopb/nanopb/archive/0.4.1.tar.gz", - ], - ) - tf_http_archive( name = "linenoise", build_file = clean_dep("//third_party:linenoise.BUILD"), From d2c578c71901275323ba3c00c57ec2e91531a698 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 13 May 2020 18:50:29 -0700 Subject: [PATCH 0548/1533] [XLA:SPMD] Avoid designated initializer. It broke external build. 
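Designated initializers ("Foo f{.bar = 1};") are only standard C++ as of C++20
and rely on compiler extensions before that, so stricter toolchains used for
the external build can reject them. A minimal sketch of the portable pattern
adopted below, using a hypothetical Options struct rather than the
PartitioningState struct the change actually touches:

    struct Options {
      int num_replicas;
      int next_channel_id;
    };

    // C++20-style designated-initializer form that can break older toolchains:
    //   Options opts{.num_replicas = 2, .next_channel_id = 0};

    // Portable form: default-construct, then assign each member explicitly.
    Options MakeOptions() {
      Options opts;
      opts.num_replicas = 2;
      opts.next_channel_id = 0;
      return opts;
    }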
PiperOrigin-RevId: 311447720 Change-Id: I460624dc2242deead277eb70fbd1c6a0701250f6 --- .../xla/service/spmd/spmd_partitioner.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h index 09d2c4af908..f22f564be73 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -370,14 +370,15 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { int64 NewChannel() { return (*next_channel_id_)++; } PartitionedHlo::PartitioningState MakePartitioningState() { - return PartitionedHlo::PartitioningState{ - .b = &b_, - .module = module_, - .num_replicas = num_replicas_, - .partition_id = partition_id_, - .collective_ops_creator = collective_ops_creator_, - .next_channel_id = next_channel_id_, - .reshard_cache = &reshard_cache_}; + PartitionedHlo::PartitioningState state; + state.b = &b_; + state.module = module_; + state.num_replicas = num_replicas_; + state.partition_id = partition_id_; + state.collective_ops_creator = collective_ops_creator_; + state.next_channel_id = next_channel_id_; + state.reshard_cache = &reshard_cache_; + return state; } SpmdBuilder* builder() { return &b_; } From b3b2c766b49384ec947056a9744844e42fe7cbb7 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Thu, 14 May 2020 09:54:18 +0800 Subject: [PATCH 0549/1533] [tflite] Java binding for fp16 in NNAPI delegate add Java binding to use allow_fp16 in NNAPI delegate --- .../tensorflow/lite/nnapi/NnApiDelegate.java | 18 ++++++++++++++++-- .../java/src/main/native/nnapi_delegate_jni.cc | 7 ++++++- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java b/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java index 105c02dadba..257902e45a1 100644 --- a/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java +++ b/tensorflow/lite/delegates/nnapi/java/src/main/java/org/tensorflow/lite/nnapi/NnApiDelegate.java @@ -118,12 +118,24 @@ public class NnApiDelegate implements Delegate, AutoCloseable { return this; } + /** + * Enable or disable to allow fp32 computation to be run in fp16 in NNAPI. + * See https://source.android.com/devices/neural-networks#android-9 + * + *
Only effective on Android 9 (API level 28) and above. + */ + public Options setAllowFp16(boolean enable) { + this.allowFp16 = enable; + return this; + } + private int executionPreference = EXECUTION_PREFERENCE_UNDEFINED; private String acceleratorName = null; private String cacheDir = null; private String modelToken = null; private Integer maxDelegatedPartitions = null; private Boolean useNnapiCpu = null; + private Boolean allowFp16 = null; } public NnApiDelegate(Options options) { @@ -139,7 +151,8 @@ public class NnApiDelegate implements Delegate, AutoCloseable { /*overrideDisallowCpu=*/ options.useNnapiCpu != null, /*disallowCpuValue=*/ options.useNnapiCpu != null ? !options.useNnapiCpu.booleanValue() - : false); + : false, + options.allowFp16 != null ? options.allowFp16 : false); } public NnApiDelegate() { @@ -204,7 +217,8 @@ public class NnApiDelegate implements Delegate, AutoCloseable { String modelToken, int maxDelegatedPartitions, boolean overrideDisallowCpu, - boolean disallowCpuValue); + boolean disallowCpuValue, + boolean allowFp16); private static native void deleteDelegate(long delegateHandle); diff --git a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc index 6b5171ddfef..df2c4030e6c 100644 --- a/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc +++ b/tensorflow/lite/delegates/nnapi/java/src/main/native/nnapi_delegate_jni.cc @@ -27,7 +27,8 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_nnapi_NnApiDelegate_createDelegate( JNIEnv* env, jclass clazz, jint preference, jstring accelerator_name, jstring cache_dir, jstring model_token, jint max_delegated_partitions, - jboolean override_disallow_cpu, jboolean disallow_cpu_value) { + jboolean override_disallow_cpu, jboolean disallow_cpu_value, + jboolean allow_fp16) { StatefulNnApiDelegate::Options options = StatefulNnApiDelegate::Options(); options.execution_preference = (StatefulNnApiDelegate::Options::ExecutionPreference)preference; @@ -49,6 +50,10 @@ Java_org_tensorflow_lite_nnapi_NnApiDelegate_createDelegate( options.disallow_nnapi_cpu = disallow_cpu_value; } + if (allow_fp16) { + options.allow_fp16 = allow_fp16; + } + auto delegate = new StatefulNnApiDelegate(options); if (options.accelerator_name) { From cde93f014c5b42800d3a43ffea53001ccc635e29 Mon Sep 17 00:00:00 2001 From: ShengYang1 Date: Thu, 14 May 2020 10:53:58 +0800 Subject: [PATCH 0550/1533] update --- .../core/common_runtime/mkl_layout_pass.cc | 2 +- .../common_runtime/mkl_layout_pass_test.cc | 44 +-- .../grappler/optimizers/mkl_remapper_test.cc | 277 ++++++++---------- .../core/grappler/optimizers/remapper.cc | 11 +- tensorflow/core/kernels/BUILD | 4 - 5 files changed, 145 insertions(+), 193 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 3695c4ca7f9..3374113465f 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -1680,7 +1680,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { } static bool FusedBatchNormExRewrite(const Node* n) { - CHECK_NOTNULL(n); + DCHECK(n); int num_side_inputs; TF_CHECK_OK(GetNodeAttr(n->def(), "num_side_inputs", &num_side_inputs)); diff --git a/tensorflow/core/common_runtime/mkl_layout_pass_test.cc b/tensorflow/core/common_runtime/mkl_layout_pass_test.cc index 3f02c4b1512..71ab786f8a5 100644 --- 
a/tensorflow/core/common_runtime/mkl_layout_pass_test.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass_test.cc @@ -3216,18 +3216,17 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormV3_Negative) { "B->F:1;C->F:2;D->F:3;E->F:4;F->G:1"); } +// clang-format off #ifdef ENABLE_MKLDNN_V1 #define REGISTER_TEST(NAME, T, INPUT) \ TEST_F(MklLayoutPassTest, NAME##_##T) { \ - InitGraph("node { name: 'A' op: '" #INPUT \ - "'}" \ + InitGraph("node { name: 'A' op: '" #INPUT "'}" \ "node { name: 'B' op: 'Input'}" \ "node { name: 'C' op: 'Input'}" \ "node { name: 'D' op: 'Input'}" \ "node { name: 'E' op: 'Input'}" \ "node { name: 'F' op: '_FusedBatchNormEx'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " attr { key: 'U' value { type: DT_FLOAT } }" \ " attr { key: 'data_format' value { s: 'NCHW' } }" \ " attr { key: 'epsilon' value { f: 0.0001 } }" \ @@ -3236,12 +3235,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormV3_Negative) { " attr { key: 'activation_mode' value { s: 'Relu' } }" \ " input: ['A', 'B', 'C', 'D', 'E'] }" \ "node { name: 'G' op: 'Zeta'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " input: ['A', 'F'] }"); \ EXPECT_EQ(DoMklLayoutOptimizationPass(), \ - "A(" #INPUT \ - ");B(Input);C(Input);D(Input);" \ + "A(" #INPUT ");B(Input);C(Input);D(Input);" \ "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);" \ "DMT/_4(Const);E(Input);" \ "F(_MklFusedBatchNormEx);G(Zeta)|A->F;A->G;" \ @@ -3257,17 +3254,14 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Positive); // Rewrite test for _FusedBatchNormEx Op with side input #define REGISTER_TEST(NAME, T, INPUT) \ TEST_F(MklLayoutPassTest, NAME##_##T) { \ - InitGraph("node { name: 'A' op: '" #INPUT \ - "'}" \ + InitGraph("node { name: 'A' op: '" #INPUT "'}" \ "node { name: 'B' op: 'Input'}" \ "node { name: 'C' op: 'Input'}" \ "node { name: 'D' op: 'Input'}" \ "node { name: 'E' op: 'Input'}" \ - "node { name: 'F' op: '" #INPUT \ - "'}" \ + "node { name: 'F' op: '" #INPUT "'}" \ "node { name: 'G' op: '_FusedBatchNormEx'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " attr { key: 'U' value { type: DT_FLOAT } }" \ " attr { key: 'data_format' value { s: 'NCHW' } }" \ " attr { key: 'epsilon' value { f: 0.0001 } }" \ @@ -3276,14 +3270,11 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Positive); " attr { key: 'activation_mode' value { s: 'Relu' } }" \ " input: ['A', 'B', 'C', 'D', 'E', 'F'] }" \ "node { name: 'H' op: 'Zeta'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " input: ['A', 'G'] }"); \ EXPECT_EQ(DoMklLayoutOptimizationPass(), \ - "A(" #INPUT \ - ");B(Input);C(Input);D(Input);E(Input);" \ - "F(" #INPUT \ - ");G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ + "A(" #INPUT ");B(Input);C(Input);D(Input);E(Input);" \ + "F(" #INPUT ");G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ "B->G:1;C->G:2;D->G:3;E->G:4;F->G:5;G->H:1"); \ } REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative1); @@ -3292,15 +3283,13 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative1); // Rewrite test for _FusedBatchNormEx Op with Identity activation #define REGISTER_TEST(NAME, T, INPUT) \ TEST_F(MklLayoutPassTest, NAME##_##T) { \ - InitGraph("node { name: 'A' op: '" #INPUT \ - "'}" \ + InitGraph("node { name: 'A' op: '" #INPUT "'}" \ "node { name: 'B' op: 'Input'}" \ "node { name: 'C' op: 'Input'}" \ 
"node { name: 'D' op: 'Input'}" \ "node { name: 'E' op: 'Input'}" \ "node { name: 'G' op: '_FusedBatchNormEx'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " attr { key: 'U' value { type: DT_FLOAT } }" \ " attr { key: 'data_format' value { s: 'NCHW' } }" \ " attr { key: 'epsilon' value { f: 0.0001 } }" \ @@ -3309,18 +3298,17 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative1); " attr { key: 'activation_mode' value { s: 'Identity' } }" \ " input: ['A', 'B', 'C', 'D', 'E'] }" \ "node { name: 'H' op: 'Zeta'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " input: ['A', 'G'] }"); \ EXPECT_EQ(DoMklLayoutOptimizationPass(), \ - "A(" #INPUT \ - ");B(Input);C(Input);D(Input);E(Input);" \ + "A(" #INPUT ");B(Input);C(Input);D(Input);E(Input);" \ "G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ "B->G:1;C->G:2;D->G:3;E->G:4;G->H:1"); \ } REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative2); #undef REGISTER_TEST #endif // ENABLE_MKLDNN_V1 +// clang-format on TEST_F(MklLayoutPassTest, NodeRewrite_QuantizedDepthwiseConv2D_Positive) { InitGraph( diff --git a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc index 85d802a2e38..cf1953fcdb2 100644 --- a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc @@ -300,169 +300,136 @@ TEST_F(MklRemapperTest, FuseBatchNormWithRelu) { using ::tensorflow::ops::Placeholder; for (bool is_training : {true, false}) { - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + for (bool has_side_input : {true, false}) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - const int num_channels = 24; + const int num_channels = 24; - TensorShape channel_shape({num_channels}); - TensorShape empty_shape({0}); + TensorShape channel_shape({num_channels}); + TensorShape empty_shape({0}); - auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, - ops::Placeholder::Shape({2, 8, 8, num_channels})); - auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); - auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); - auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); - auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); - auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); + auto input = + Placeholder(s.WithOpName("input"), DT_FLOAT, + ops::Placeholder::Shape({2, 8, 8, num_channels})); + auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); + auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); + auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); + auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); + auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); - float epsilon = 0.1f; - auto fbn = ops::FusedBatchNormV3( - s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var, - ops::FusedBatchNormV3::IsTraining(is_training) - .Epsilon(epsilon) - .DataFormat("NHWC")); - auto relu = ops::Relu(s.WithOpName("relu"), fbn.y); - auto fetch = ops::Identity(s.WithOpName("fetch"), relu); + float epsilon = 0.1f; + auto fbn = + ops::FusedBatchNormV3(s.WithOpName("fused_batch_norm"), input_cast, + scale, offset, mean, var, + ops::FusedBatchNormV3::IsTraining(is_training) + .Epsilon(epsilon) + .DataFormat("NHWC")); - auto input_t = GenerateRandomTensor({2, 8, 8, num_channels}); - auto scale_t = GenerateRandomTensor(channel_shape); - 
auto offset_t = GenerateRandomTensor(channel_shape); - auto mean_t = GenerateRandomTensor(is_training ? empty_shape - : channel_shape); - auto var_t = GenerateRandomTensor(is_training ? empty_shape - : channel_shape); + if (has_side_input) { + auto side_input = + Placeholder(s.WithOpName("side_input"), DT_FLOAT, + ops::Placeholder::Shape({2, 8, 8, num_channels})); + auto side_input_cast = + ops::Cast(s.WithOpName("side_input_cast"), side_input, DT_FLOAT); + auto add = ops::Add(s.WithOpName("add"), fbn.y, side_input_cast); + auto relu = ops::Relu(s.WithOpName("relu"), add); + } else { + auto relu = ops::Relu(s.WithOpName("relu"), fbn.y); + } - GrapplerItem item; - item.fetch = {"fetch"}; - item.feed = {{"input", input_t}, - {"scale", scale_t}, - {"offset", offset_t}, - {"mean", mean_t}, - {"var", var_t}}; - TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + auto input_t = GenerateRandomTensor({2, 8, 8, num_channels}); + auto scale_t = GenerateRandomTensor(channel_shape); + auto offset_t = GenerateRandomTensor(channel_shape); + auto mean_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto var_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto side_input_t = + GenerateRandomTensor({2, 8, 8, num_channels}); - // Place all nodes on CPU. - for (int i = 0; i < item.graph.node_size(); ++i) { - item.graph.mutable_node(i)->set_device("/device:CPU:0"); + GrapplerItem item; + item.fetch = {"relu"}; + if (has_side_input) + item.feed = {{"input", input_t}, {"scale", scale_t}, + {"offset", offset_t}, {"mean", mean_t}, + {"var", var_t}, {"side_input", side_input_t}}; + else + item.feed = {{"input", input_t}, + {"scale", scale_t}, + {"offset", offset_t}, + {"mean", mean_t}, + {"var", var_t}}; + TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on CPU. 
+ for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:CPU:0"); + } + + Remapper optimizer(RewriterConfig::AGGRESSIVE); + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + if (has_side_input) { + for (const NodeDef& node : output.node()) { + if (node.name() == "add") { + EXPECT_EQ(node.op(), "Add"); + ASSERT_EQ(node.input_size(), 2); + EXPECT_EQ(node.input(0), "fused_batch_norm"); + EXPECT_EQ(node.input(1), "side_input_cast"); + found++; + } + if (node.name() == "relu") { + EXPECT_EQ(node.op(), "Relu"); + ASSERT_EQ(node.input_size(), 1); + EXPECT_EQ(node.input(0), "add"); + found++; + } + if (node.name() == "fused_batch_norm") { + EXPECT_EQ(node.op(), "FusedBatchNormV3"); + ASSERT_EQ(node.input_size(), 5); + EXPECT_EQ(node.input(0), "input_cast"); + EXPECT_EQ(node.input(1), "scale"); + EXPECT_EQ(node.input(2), "offset"); + EXPECT_EQ(node.input(3), "mean"); + EXPECT_EQ(node.input(4), "var"); + found++; + } + } + EXPECT_EQ(found, 3); + } else { + for (const NodeDef& node : output.node()) { + if (node.name() == "relu") { + EXPECT_EQ(node.op(), "Identity"); + ASSERT_EQ(node.input_size(), 1); + EXPECT_EQ(node.input(0), "fused_batch_norm"); + found++; + } + if (node.name() == "fused_batch_norm") { + EXPECT_EQ(node.op(), "_FusedBatchNormEx"); + ASSERT_EQ(node.input_size(), 5); + EXPECT_EQ(node.input(0), "input_cast"); + EXPECT_EQ(node.input(1), "scale"); + EXPECT_EQ(node.input(2), "offset"); + EXPECT_EQ(node.input(3), "mean"); + EXPECT_EQ(node.input(4), "var"); + + auto attr = node.attr(); + EXPECT_EQ(attr["num_side_inputs"].i(), 0); + EXPECT_EQ(attr["activation_mode"].s(), "Relu"); + found++; + } + } + EXPECT_EQ(found, 2); + } + + auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed); + ASSERT_EQ(tensors_expected.size(), 1); + auto tensors = EvaluateNodes(output, item.fetch, item.feed); + ASSERT_EQ(tensors.size(), 1); + test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } - - Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape - GraphDef output; - TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); - - int found = 0; - for (const NodeDef& node : output.node()) { - if (node.name() == "relu") { - EXPECT_EQ(node.op(), "Identity"); - ASSERT_EQ(node.input_size(), 1); - EXPECT_EQ(node.input(0), "fused_batch_norm"); - found++; - } - if (node.name() == "fused_batch_norm") { - EXPECT_EQ(node.op(), "_FusedBatchNormEx"); - ASSERT_EQ(node.input_size(), 5); - EXPECT_EQ(node.input(0), "input_cast"); - EXPECT_EQ(node.input(1), "scale"); - EXPECT_EQ(node.input(2), "offset"); - EXPECT_EQ(node.input(3), "mean"); - EXPECT_EQ(node.input(4), "var"); - - auto attr = node.attr(); - EXPECT_EQ(attr["num_side_inputs"].i(), 0); - EXPECT_EQ(attr["activation_mode"].s(), "Relu"); - found++; - } - } - EXPECT_EQ(found, 2); - } -} - -TEST_F(MklRemapperTest, FuseBatchNormWithAddAndRelu) { - using ::tensorflow::ops::Placeholder; - - for (bool is_training : {true, false}) { - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - - const int num_channels = 24; - - TensorShape input_shape({2, 8, 8, num_channels}); - TensorShape channel_shape({num_channels}); - TensorShape empty_shape({0}); - - auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, - ops::Placeholder::Shape(input_shape)); - auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); - auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); - auto offset = 
Placeholder(s.WithOpName("offset"), DT_FLOAT); - auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); - auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); - auto side_input = Placeholder(s.WithOpName("side_input"), DT_FLOAT, - ops::Placeholder::Shape(input_shape)); - auto side_input_cast = - ops::Cast(s.WithOpName("side_input_cast"), side_input, DT_FLOAT); - - float epsilon = 0.1f; - auto fbn = ops::FusedBatchNormV3( - s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var, - ops::FusedBatchNormV3::IsTraining(is_training) - .Epsilon(epsilon) - .DataFormat("NHWC")); - auto add = ops::Add(s.WithOpName("add"), fbn.y, side_input_cast); - auto relu = ops::Relu(s.WithOpName("relu"), add); - auto fetch = ops::Identity(s.WithOpName("fetch"), relu); - - auto input_t = GenerateRandomTensor(input_shape); - auto scale_t = GenerateRandomTensor(channel_shape); - auto offset_t = GenerateRandomTensor(channel_shape); - auto mean_t = GenerateRandomTensor(is_training ? empty_shape - : channel_shape); - auto var_t = GenerateRandomTensor(is_training ? empty_shape - : channel_shape); - auto side_input_t = GenerateRandomTensor({2, 8, 8, num_channels}); - - GrapplerItem item; - item.fetch = {"fetch"}; - item.feed = {{"input", input_t}, {"scale", scale_t}, - {"offset", offset_t}, {"mean", mean_t}, - {"var", var_t}, {"side_input", side_input_t}}; - TF_ASSERT_OK(s.ToGraphDef(&item.graph)); - - // Place all nodes on CPU. - for (int i = 0; i < item.graph.node_size(); ++i) { - item.graph.mutable_node(i)->set_device("/device:CPU:0"); - } - - Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape - GraphDef output; - TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); - - int found = 0; - for (const NodeDef& node : output.node()) { - if (node.name() == "add") { - EXPECT_EQ(node.op(), "Add"); - ASSERT_EQ(node.input_size(), 2); - EXPECT_EQ(node.input(0), "fused_batch_norm"); - EXPECT_EQ(node.input(1), "side_input_cast"); - found++; - } - if (node.name() == "relu") { - EXPECT_EQ(node.op(), "Relu"); - ASSERT_EQ(node.input_size(), 1); - EXPECT_EQ(node.input(0), "add"); - found++; - } - if (node.name() == "fused_batch_norm") { - EXPECT_EQ(node.op(), "FusedBatchNormV3"); - ASSERT_EQ(node.input_size(), 5); - EXPECT_EQ(node.input(0), "input_cast"); - EXPECT_EQ(node.input(1), "scale"); - EXPECT_EQ(node.input(2), "offset"); - EXPECT_EQ(node.input(3), "mean"); - EXPECT_EQ(node.input(4), "var"); - found++; - } - } - EXPECT_EQ(found, 3); } } #endif // ENABLE_MKLDNN_V1 diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index eeaaefc52c0..9a7d1953105 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -796,8 +796,9 @@ bool FindFusedBatchNormEx(const RemapperContext& ctx, int node_index, [&](const utils::MutableNodeView& fused_batch_norm) -> bool { const auto* fused_batch_norm_node_def = fused_batch_norm.node(); if (!IsFusedBatchNorm(*fused_batch_norm_node_def)) return false; -// We fuse FusedBatchNorm on GPU or MKL CPU. + #ifndef ENABLE_MKLDNN_V1 + // We fuse FusedBatchNorm on GPU or MKL CPU. 
if (!NodeIsOnGpu(fused_batch_norm_node_def)) return false; #endif @@ -868,8 +869,8 @@ bool FindFusedBatchNormEx(const RemapperContext& ctx, int node_index, // Input to a Relu can be an Add node with FusedBatchNorm as one of the inputs if (IsAdd(*relu_fanin_0_node_def)) { -// Currently no CPU implementation for "FusedBatchNorm + SideInput + -// "" + // Currently no CPU implementation for "FusedBatchNorm + SideInput + + // "" #ifdef ENABLE_MKLDNN_V1 return false; #endif @@ -959,10 +960,10 @@ void CopyFusedBatchNormAttributes(const NodeDef& fused_batch_norm, // FusedBatchNormV2 and V3 have an extra type parameter. if (fused_batch_norm.op() != "FusedBatchNorm") { - (*attr)["U"] = src_attr.at("U"); + SetAttrValue(src_attr.at("U"), &(*attr)["U"]); } else { #ifndef ENABLE_MKLDNN_V1 - (*attr)["U"] = src_attr.at("T"); + SetAttrValue(src_attr.at("T"), &(*attr)["U"]); #else SetAttrValue(DT_FLOAT, &(*attr)["U"]); #endif diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index b009cbfb565..d5d59329c9a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -8225,10 +8225,6 @@ tf_mkl_kernel_library( tf_mkl_kernel_library( name = "mkl_fused_batch_norm_op", srcs = ["mkl_fused_batch_norm_op.cc"], - hdrs = [ - "fused_batch_norm_op.h", - "no_op.h", - ], deps = NN_DEPS + [ ":fused_batch_norm_op", ":no_op", From db9b247cd1f3ff046359f7b64ca60c2d697fe2e1 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 13 May 2020 19:53:08 -0700 Subject: [PATCH 0551/1533] Fix the functional model loading with nested sequential model. The nested sequential model is created with _is_graph_network = False, the current instance check is not strong enough. PiperOrigin-RevId: 311454248 Change-Id: I3b36cc037474587c134eab567d42694129c5cf52 --- tensorflow/python/keras/engine/functional.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index 80eb6cb27d5..c79e2849c4f 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -1017,7 +1017,9 @@ def _map_subgraph_network(inputs, outputs): def _should_skip_first_node(layer): """Returns True if the first layer node should not be saved or loaded.""" # Networks start with a pre-existing node linking their input to output. - return isinstance(layer, Functional) + # For a sequential model, it is first created with _is_graph_network = False, + # we have to keep the _is_graph_network check here. + return isinstance(layer, Functional) and layer._is_graph_network def _deserialize_keras_tensors(kwargs, layer_map): From 1549473a2e9e50ed9d3c751f25eaf7ee6db180d4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 19:55:41 -0700 Subject: [PATCH 0552/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311454462 Change-Id: If1168947a389a398dc9f2d50279c35212d21a973 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 598e3a48bfe..c6d67c9ad44 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. 
Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 91c1b12641744df0eb7c1194e6aff5f21992df6b Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 14 May 2020 03:12:45 +0000 Subject: [PATCH 0553/1533] Add complex64/complex128 support for tf.math.l2_normalize This PR tries to address the issue raised in 39522 where there was no complex support for tf.math.l2_normalize. This PR adds complex support for tf.math.l2_normalize. This PR fixes 39522. Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_impl.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 03c1289246e..c99e8c29a64 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -641,6 +641,15 @@ def l2_normalize_v2(x, axis=None, epsilon=1e-12, name=None): """ with ops.name_scope(name, "l2_normalize", [x]) as name: x = ops.convert_to_tensor(x, name="x") + if x.dtype.is_complex: + square_real = math_ops.square(math_ops.real(x)) + square_imag = math_ops.square(math_ops.imag(x)) + square_sum = math_ops.real( + math_ops.reduce_sum(square_real + square_imag, axis, keepdims=True)) + x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon)) + norm_real = math_ops.multiply(math_ops.real(x), x_inv_norm) + norm_imag = math_ops.multiply(math_ops.imag(x), x_inv_norm) + return math_ops.complex(norm_real, norm_imag, name=name) square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True) x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon)) return math_ops.multiply(x, x_inv_norm, name=name) From 5e6479904941624cf7ce58ab3d236375c8012ef4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 20:19:40 -0700 Subject: [PATCH 0554/1533] Switch weights from per-value to per-input-item. PiperOrigin-RevId: 311457055 Change-Id: I533b66dad37855bb264c73703c71d15da2ee2511 --- .../api_def_DenseCountSparseOutput.pbtxt | 23 +- .../api_def_RaggedCountSparseOutput.pbtxt | 27 +- .../api_def_SparseCountSparseOutput.pbtxt | 29 ++- tensorflow/core/kernels/count_ops.cc | 246 +++++++----------- tensorflow/core/ops/count_ops.cc | 39 +-- tensorflow/python/ops/bincount.py | 151 +++++++++-- tensorflow/python/ops/bincount_test.py | 188 +++++++++---- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v1/tensorflow.sparse.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v2/tensorflow.sparse.pbtxt | 2 +- 11 files changed, 441 insertions(+), 278 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt index 416da1ccaab..8296bfe6d7b 100644 --- a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt @@ -4,61 +4,62 @@ op { in_arg { name: "values" description: <>; +template +using BatchedMap = std::vector>; namespace { // TODO(momernick): Extend this function to work with outputs of rank > 2. 
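// Note on the new weight semantics (editorial example, the values are made up):
// weights are now aligned element-for-element with the input values
// (per-input-item) instead of being looked up by value after counting
// (per-value). Counting values = [1, 1, 2] with weights = [0.5, 0.25, 3.0]
// therefore yields output[1] = 0.75 and output[2] = 3.0, and the output dtype
// follows the weights dtype via the W type parameter threaded through the
// helpers below.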
-Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, +template +Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, bool is_1d, OpKernelContext* context) { int total_values = 0; int num_batches = per_batch_counts.size(); @@ -44,12 +47,12 @@ Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, context->allocate_output(1, TensorShape({total_values}), &values)); auto output_indices = indices->matrix(); - auto output_values = values->flat(); + auto output_values = values->flat(); int64 value_loc = 0; for (int b = 0; b < num_batches; ++b) { const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); + std::vector> pairs(per_batch_count.begin(), + per_batch_count.end()); std::sort(pairs.begin(), pairs.end()); for (const auto& x : pairs) { if (is_1d) { @@ -77,85 +80,19 @@ Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, return Status::OK(); } -Status OutputWeightedSparse(const BatchedIntMap& per_batch_counts, - int num_values, const Tensor& weights, bool is_1d, - OpKernelContext* context) { - if (!TensorShapeUtils::IsVector(weights.shape())) { - return errors::InvalidArgument( - "Weights must be a 1-dimensional tensor. Got: ", - weights.shape().DebugString()); - } - - if (num_values > weights.dim_size(0)) { - return errors::InvalidArgument("The maximum array value was ", num_values, - ", but the weight array has size ", - weights.shape().DebugString()); - } - auto weight_values = weights.flat(); - - int total_values = 0; - int num_batches = per_batch_counts.size(); - for (const auto& per_batch_count : per_batch_counts) { - total_values += per_batch_count.size(); - } - - Tensor* indices; - int inner_dim = is_1d ? 1 : 2; - TF_RETURN_IF_ERROR(context->allocate_output( - 0, TensorShape({total_values, inner_dim}), &indices)); - - Tensor* values; - TF_RETURN_IF_ERROR( - context->allocate_output(1, TensorShape({total_values}), &values)); - - auto output_indices = indices->matrix(); - auto output_values = values->flat(); - int64 value_loc = 0; - for (int b = 0; b < num_batches; ++b) { - const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); - std::sort(pairs.begin(), pairs.end()); - for (const auto& x : pairs) { - if (is_1d) { - output_indices(value_loc, 0) = x.first; - } else { - output_indices(value_loc, 0) = b; - output_indices(value_loc, 1) = x.first; - } - output_values(value_loc) = x.second * weight_values(x.first); - ++value_loc; - } - } - - Tensor* dense_shape; - if (is_1d) { - TF_RETURN_IF_ERROR( - context->allocate_output(2, TensorShape({1}), &dense_shape)); - dense_shape->flat().data()[0] = num_values; - } else { - TF_RETURN_IF_ERROR( - context->allocate_output(2, TensorShape({2}), &dense_shape)); - dense_shape->flat().data()[0] = num_batches; - dense_shape->flat().data()[1] = num_values; - } - return Status::OK(); -} - -template -T GetOutputSize(T max_seen, T max_length, T min_length) { +int GetOutputSize(int max_seen, int max_length, int min_length) { return max_length > 0 ? 
max_length : std::max((max_seen + 1), min_length); } } // namespace -template +template class DenseCount : public OpKernel { public: explicit DenseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -170,6 +107,15 @@ class DenseCount : public OpKernel { "Input must be a 1 or 2-dimensional tensor. Got: ", data.shape().DebugString())); + if (use_weights) { + OP_REQUIRES( + context, weights.shape() == data.shape(), + errors::InvalidArgument( + "Weights and data must have the same shape. Weight shape: ", + weights.shape().DebugString(), + "; data shape: ", data.shape().DebugString())); + } + bool is_1d = TensorShapeUtils::IsVector(data.shape()); int negative_valued_axis = -1; int num_batch_dimensions = (data.shape().dims() + negative_valued_axis); @@ -179,19 +125,23 @@ class DenseCount : public OpKernel { num_batch_elements *= data.shape().dim_size(i); } int num_value_elements = data.shape().num_elements() / num_batch_elements; - auto per_batch_counts = BatchedIntMap(num_batch_elements); + auto per_batch_counts = BatchedMap(num_batch_elements); + T max_value = 0; const auto data_values = data.flat(); + const auto weight_values = weights.flat(); int i = 0; for (int b = 0; b < num_batch_elements; ++b) { for (int v = 0; v < num_value_elements; ++v) { const auto& value = data_values(i); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[b])[value] = 1; + if (binary_output_) { + per_batch_counts[b][value] = 1; + } else if (use_weights) { + per_batch_counts[b][value] += weight_values(i); } else { - (per_batch_counts[b])[value]++; + per_batch_counts[b][value]++; } if (value > max_value) { max_value = value; @@ -201,30 +151,24 @@ class DenseCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, is_1d, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; }; -template +template class SparseCount : public OpKernel { public: explicit SparseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -235,23 +179,27 @@ class SparseCount : public OpKernel { bool use_weights = weights.NumElements() > 0; bool is_1d = shape.NumElements() == 1; - const auto indices_values = indices.matrix(); - const auto values_values = values.flat(); - int num_batches = is_1d ? 
1 : shape.flat()(0); int num_values = values.NumElements(); - auto per_batch_counts = BatchedIntMap(num_batches); + const auto indices_values = indices.matrix(); + const auto values_values = values.flat(); + const auto weight_values = weights.flat(); + + auto per_batch_counts = BatchedMap(num_batches); + T max_value = 0; for (int idx = 0; idx < num_values; ++idx) { int batch = is_1d ? 0 : indices_values(idx, 0); const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[batch])[value] = 1; + if (binary_output_) { + per_batch_counts[batch][value] = 1; + } else if (use_weights) { + per_batch_counts[batch][value] += weight_values(idx); } else { - (per_batch_counts[batch])[value]++; + per_batch_counts[batch][value]++; } if (value > max_value) { max_value = value; @@ -259,30 +207,25 @@ class SparseCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, is_1d, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; + bool validate_; }; -template +template class RaggedCount : public OpKernel { public: explicit RaggedCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -290,13 +233,15 @@ class RaggedCount : public OpKernel { const Tensor& values = context->input(1); const Tensor& weights = context->input(2); bool use_weights = weights.NumElements() > 0; + bool is_1d = false; const auto splits_values = splits.flat(); const auto values_values = values.flat(); + const auto weight_values = weights.flat(); int num_batches = splits.NumElements() - 1; int num_values = values.NumElements(); - auto per_batch_counts = BatchedIntMap(num_batches); + auto per_batch_counts = BatchedMap(num_batches); T max_value = 0; int batch_idx = 0; @@ -306,10 +251,12 @@ class RaggedCount : public OpKernel { } const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[batch_idx - 1])[value] = 1; + if (binary_output_) { + per_batch_counts[batch_idx - 1][value] = 1; + } else if (use_weights) { + per_batch_counts[batch_idx - 1][value] += weight_values(idx); } else { - (per_batch_counts[batch_idx - 1])[value]++; + per_batch_counts[batch_idx - 1][value]++; } if (value > max_value) { max_value = value; @@ -317,42 +264,47 @@ class RaggedCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, false, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - false, context)); - } + int 
num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; + bool validate_; }; -#define REGISTER(TYPE) \ - \ - REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - DenseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - SparseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - RaggedCount) +#define REGISTER_W(W_TYPE) \ + REGISTER(int32, W_TYPE) \ + REGISTER(int64, W_TYPE) -REGISTER(int32); -REGISTER(int64); +#define REGISTER(I_TYPE, W_TYPE) \ + \ + REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + DenseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + SparseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + RaggedCount) + +TF_CALL_INTEGRAL_TYPES(REGISTER_W); +TF_CALL_float(REGISTER_W); +TF_CALL_double(REGISTER_W); + +#undef REGISTER_W #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/ops/count_ops.cc b/tensorflow/core/ops/count_ops.cc index c9fbe1f8d8e..8de0a2ef954 100644 --- a/tensorflow/core/ops/count_ops.cc +++ b/tensorflow/core/ops/count_ops.cc @@ -19,12 +19,21 @@ limitations under the License. 
namespace tensorflow { -using shape_inference::DimensionHandle; using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; Status DenseCountSparseOutputShapeFn(InferenceContext *c) { - int32 rank = c->Rank(c->input(0)); - DimensionHandle nvals = c->UnknownDim(); + auto values = c->input(0); + auto weights = c->input(1); + ShapeHandle output; + auto num_weights = c->NumElements(weights); + if (c->ValueKnown(num_weights) && c->Value(num_weights) == 0) { + output = values; + } else { + TF_RETURN_IF_ERROR(c->Merge(weights, values, &output)); + } + auto rank = c->Rank(output); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -32,8 +41,8 @@ Status DenseCountSparseOutputShapeFn(InferenceContext *c) { } Status SparseCountSparseOutputShapeFn(InferenceContext *c) { - DimensionHandle rank = c->Dim(c->input(0), 1); - DimensionHandle nvals = c->UnknownDim(); + auto rank = c->Dim(c->input(0), 1); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -45,7 +54,7 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { if (rank != c->kUnknownRank) { ++rank; // Add the ragged dimension } - DimensionHandle nvals = c->UnknownDim(); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -54,12 +63,12 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { REGISTER_OP("DenseCountSparseOutput") .Input("values: T") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(DenseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -69,12 +78,12 @@ REGISTER_OP("SparseCountSparseOutput") .Input("indices: int64") .Input("values: T") .Input("dense_shape: int64") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(SparseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -83,12 +92,12 @@ REGISTER_OP("SparseCountSparseOutput") REGISTER_OP("RaggedCountSparseOutput") .Input("splits: int64") .Input("values: T") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(RaggedCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") diff --git a/tensorflow/python/ops/bincount.py b/tensorflow/python/ops/bincount.py index e1b3bebaaaa..68950eaf596 100644 --- a/tensorflow/python/ops/bincount.py +++ b/tensorflow/python/ops/bincount.py @@ -18,10 +18,10 @@ 
from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import gen_count_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util.tf_export import tf_export @@ -33,7 +33,7 @@ def sparse_bincount(values, axis=0, minlength=None, maxlength=None, - binary_count=False, + binary_output=False, name=None): """Count the number of times an integer value appears in a tensor. @@ -58,8 +58,9 @@ def sparse_bincount(values, maxlength: If given, skips `values` that are greater than or equal to `maxlength`, and ensures that the output has a `dense_shape` of at most `maxlength` in the inner dimension. - binary_count: Whether to do a binary count. When True, this op will return 1 - for any value that exists instead of counting the number of occurrences. + binary_output: If True, this op will output 1 instead of the number of times + a token appears (equivalent to one_hot + reduce_any instead of one_hot + + reduce_add). Defaults to False. name: A name for this op. Returns: @@ -78,7 +79,7 @@ def sparse_bincount(values, SparseTensor) and returns a SparseTensor where the value of (i,j) is the number of times value j appears in batch i. - >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) >>> output = tf.sparse.bincount(data, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( @@ -102,7 +103,7 @@ def sparse_bincount(values, dense shape is [2, 500] instead of [2,10002] or [2, 102]. >>> minlength = maxlength = 500 - >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) >>> output = tf.sparse.bincount( ... data, axis=-1, minlength=minlength, maxlength=maxlength) >>> print(output) @@ -123,8 +124,8 @@ def sparse_bincount(values, some values (like 20 in batch 1 and 11 in batch 2) appear more than once, the 'values' tensor is all 1s. - >>> dense = [[10, 20, 30, 20], [11, 101, 11, 10001]] - >>> output = tf.sparse.bincount(dense, binary_count=True, axis=-1) + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> output = tf.sparse.bincount(data, binary_output=True, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( [[ 0 10] @@ -136,20 +137,42 @@ def sparse_bincount(values, values=tf.Tensor([1 1 1 1 1 1], shape=(6,), dtype=int64), dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) + **Weighted bin-counting** + + This example takes two inputs - a values tensor and a weights tensor. These + tensors must be identically shaped, and have the same row splits or indices + in the case of RaggedTensors or SparseTensors. When performing a weighted + count, the op will output a SparseTensor where the value of (i, j) is the + sum of the values in the weight tensor's batch i in the locations where + the values tensor has the value j. In this case, the output dtype is the + same as the dtype of the weights tensor. 
+ + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> weights = [[2, 0.25, 15, 0.5], [2, 17, 3, 0.9]] + >>> output = tf.sparse.bincount(data, weights=weights, axis=-1) + >>> print(output) + SparseTensor(indices=tf.Tensor( + [[ 0 10] + [ 0 20] + [ 0 30] + [ 1 11] + [ 1 101] + [ 1 10001]], shape=(6, 2), dtype=int64), + values=tf.Tensor([2. 0.75 15. 5. 17. 0.9], shape=(6,), dtype=float32), + dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) + """ with ops.name_scope(name, "count", [values, weights]): if not isinstance(values, sparse_tensor.SparseTensor): values = ragged_tensor.convert_to_tensor_or_ragged_tensor( values, name="values") + if weights is not None: + if not isinstance(weights, sparse_tensor.SparseTensor): + weights = ragged_tensor.convert_to_tensor_or_ragged_tensor( + weights, name="weights") - if weights is not None and binary_count: - raise ValueError("binary_count and weights are mutually exclusive.") - - if weights is None: - weights = [] - output_type = dtypes.int64 - else: - output_type = dtypes.float32 + if weights is not None and binary_output: + raise ValueError("binary_output and weights are mutually exclusive.") if axis is None: axis = 0 @@ -162,38 +185,114 @@ def sparse_bincount(values, maxlength_value = maxlength if maxlength is not None else -1 if axis == 0: - if isinstance(values, - (sparse_tensor.SparseTensor, ragged_tensor.RaggedTensor)): + if isinstance(values, sparse_tensor.SparseTensor): + if weights is not None: + weights = validate_sparse_weights(values, weights) + values = values.values + elif isinstance(values, ragged_tensor.RaggedTensor): + if weights is not None: + weights = validate_ragged_weights(values, weights) values = values.values else: + if weights is not None: + weights = array_ops.reshape(weights, [-1]) values = array_ops.reshape(values, [-1]) if isinstance(values, sparse_tensor.SparseTensor): + weights = validate_sparse_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.sparse_count_sparse_output( values.indices, values.values, values.dense_shape, - weights=weights, + weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) elif isinstance(values, ragged_tensor.RaggedTensor): + weights = validate_ragged_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.ragged_count_sparse_output( values.row_splits, values.values, - weights=weights, + weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) else: + weights = validate_dense_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.dense_count_sparse_output( values, weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) return sparse_tensor.SparseTensor(c_ind, c_val, c_shape) + + +def validate_dense_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.dtype) + + if not isinstance(weights, ops.Tensor): + raise ValueError( + "`weights` must be a tf.Tensor if `values` is a tf.Tensor.") + + return weights + + +def validate_sparse_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.values.dtype) + + if not isinstance(weights, 
sparse_tensor.SparseTensor): + raise ValueError( + "`weights` must be a SparseTensor if `values` is a SparseTensor.") + + checks = [] + if weights.dense_shape is not values.dense_shape: + checks.append( + check_ops.assert_equal( + weights.dense_shape, + values.dense_shape, + message="'weights' and 'values' must have the same dense shape.")) + if weights.indices is not values.indices: + checks.append( + check_ops.assert_equal( + weights.indices, + values.indices, + message="'weights' and 'values' must have the same indices.") + ) + if checks: + with ops.control_dependencies(checks): + weights = array_ops.identity(weights.values) + else: + weights = weights.values + + return weights + + +def validate_ragged_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.values.dtype) + + if not isinstance(weights, ragged_tensor.RaggedTensor): + raise ValueError( + "`weights` must be a RaggedTensor if `values` is a RaggedTensor.") + + checks = [] + if weights.row_splits is not values.row_splits: + checks.append( + check_ops.assert_equal( + weights.row_splits, + values.row_splits, + message="'weights' and 'values' must have the same row splits.")) + if checks: + with ops.control_dependencies(checks): + weights = array_ops.identity(weights.values) + else: + weights = weights.values + + return weights diff --git a/tensorflow/python/ops/bincount_test.py b/tensorflow/python/ops/bincount_test.py index 776b65b72d0..839af8dcc35 100644 --- a/tensorflow/python/ops/bincount_test.py +++ b/tensorflow/python/ops/bincount_test.py @@ -21,6 +21,8 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np +from tensorflow.python.eager import context +from tensorflow.python.framework import errors from tensorflow.python.ops import bincount from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_factory_ops @@ -65,7 +67,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 6], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_maxlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -73,7 +75,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 7], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -82,7 +84,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 9], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -91,40 +93,40 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 8], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_no_maxlength_weights", "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], - "expected_values": [1, 2, 3, 8, 5], + "expected_values": [2, 1, 0.5, 9, 3], "expected_shape": [2, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": 
[[0.5, 1, 2], [3, 4, 5]] }, { "testcase_name": "_maxlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "maxlength": 7, "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], - "expected_values": [1, 2, 3, 0.5, 8], + "expected_values": [2, 1, 0.5, 3, 9], "expected_shape": [2, 7], - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": [[0.5, 1, 2, 11], [7, 3, 4, 5]] }, { "testcase_name": "_minlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 9, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [1, 2, 3, 7, 0.5, 8, 7], + "expected_values": [2, 1, 0.5, 3, 5, 13, 4], "expected_shape": [2, 9], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] }, { "testcase_name": "_minlength_larger_values_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 3, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [1, 2, 3, 7, 0.5, 8, 7], + "expected_values": [2, 1, 0.5, 3, 5, 13, 4], "expected_shape": [2, 8], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] }, { "testcase_name": "_1d", "x": np.array([3, 2, 1, 1], dtype=np.int32), @@ -146,7 +148,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, minlength=None, maxlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): y = bincount.sparse_bincount( @@ -154,7 +156,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): weights=weights, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -216,7 +218,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], "expected_values": [1, 1, 1, 1], "expected_shape": [3, 6], - "binary_count": + "binary_output": True, }, { @@ -230,7 +232,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 7], "maxlength": 7, - "binary_count": + "binary_output": True, }, { @@ -244,7 +246,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 9], "minlength": 9, - "binary_count": + "binary_output": True, }, { @@ -258,7 +260,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 8], "minlength": 3, - "binary_count": + "binary_output": True, }, { @@ -268,9 +270,10 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [1, 3, 8, 5], + "expected_values": [2, 6, 7, 10], "expected_shape": [3, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": + np.array([[6, 0, 2, 0], [0, 0, 0, 0], [10, 0, 3.5, 3.5]]), }, { "testcase_name": @@ -279,11 +282,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 7, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [1, 3, 8, 5], + "expected_values": [2, 6, 7, 10], "expected_shape": [3, 7], "maxlength": 7, - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": + np.array([[6, 0, 2, 0], [0, 0, 14, 0], [10, 0, 3.5, 3.5]]), }, { "testcase_name": @@ -292,11 +296,12 @@ class TestSparseCount(test.TestCase, 
parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [1, 3, 7, 8, 5], + "expected_values": [2, 6, 14, 6.5, 10], "expected_shape": [3, 9], "minlength": 9, - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": + np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), }, { "testcase_name": @@ -305,11 +310,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [1, 3, 7, 8, 5], + "expected_values": [2, 6, 14, 6.5, 10], "expected_shape": [3, 8], "minlength": 3, - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": + np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), }, { "testcase_name": "_1d", @@ -338,16 +344,17 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): x_sparse = sparse_ops.from_dense(x) + w_sparse = sparse_ops.from_dense(weights) if weights is not None else None y = bincount.sparse_bincount( x_sparse, - weights=weights, + weights=w_sparse, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -393,7 +400,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 6], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_maxlength_binary", @@ -402,7 +409,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 7], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_binary", @@ -412,13 +419,13 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [5, 9], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "minlength": 3, - "binary_count": True, + "binary_output": True, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], @@ -428,18 +435,18 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "testcase_name": "_no_maxlength_weights", "x": [[], [], [3, 0, 1], [], [5, 0, 4, 4]], "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 0.25, 8, 10], "expected_shape": [5, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": [[], [], [6, 0.5, 2], [], [10, 0.25, 5, 3]], }, { "testcase_name": "_maxlength_weights", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "maxlength": 7, "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 0.25, 8, 10], "expected_shape": [5, 7], - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_minlength_weights", @@ -447,9 +454,9 @@ class 
TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 9, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], "expected_shape": [5, 9], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_minlength_larger_values_weights", @@ -457,9 +464,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 3, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], "expected_shape": [5, 8], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_1d", @@ -484,21 +491,114 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): x_ragged = ragged_factory_ops.constant(x) + w = ragged_factory_ops.constant(weights) if weights is not None else None y = bincount.sparse_bincount( x_ragged, - weights=weights, + weights=w, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) self.assertAllEqual(expected_shape, y.dense_shape) +class TestSparseCountFailureModes(test.TestCase): + + def test_dense_input_sparse_weights_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_dense_input_ragged_weights_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_dense_input_wrong_shape_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = np.array([[3, 2], [5, 4], [4, 3]]) + # Note: Eager mode and graph mode throw different errors here. Graph mode + # will fail with a ValueError from the shape checking logic, while Eager + # will fail with an InvalidArgumentError from the kernel itself. 
+ if context.executing_eagerly(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same shape"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + else: + with self.assertRaisesRegexp(ValueError, "both shapes must be equal"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_dense_weights_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_ragged_weights_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_wrong_indices_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 1, 0, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same indices"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_too_many_indices_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 1, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Incompatible shapes"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_wrong_shape_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4], [0, 0, 0, 0]], + dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same dense shape"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_dense_weights_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_sparse_weights_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_different_shape_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same row splits"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt 
b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 05b8842be66..44fb74ac63a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt index 4c4f6c62291..f8f8edb26a8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt @@ -14,7 +14,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 05b8842be66..44fb74ac63a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', 
\'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt index a9ad81920dd..67235bb2cf2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt @@ -10,7 +10,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" From 59a473982d771a50d9c97298a69c06e6a90395b1 Mon Sep 17 00:00:00 2001 From: Teng Lu Date: Thu, 14 May 2020 11:40:43 +0800 Subject: [PATCH 0555/1533] Support BF16 Softmax and add UT. 
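
For context, a rough standalone sketch of what the new unit tests in this patch exercise: running softmax in bfloat16 and checking it against the float32 result. This uses only the public TF Python API (tf.nn.softmax, tf.cast); the tolerance below is illustrative, chosen loosely because bfloat16 carries fewer mantissa bits, and is not taken verbatim from the tests added here.

    import numpy as np
    import tensorflow as tf

    x = tf.constant(np.random.randn(5, 10), dtype=tf.float32)
    y_f32 = tf.nn.softmax(x)
    # Cast to bfloat16 and run the same op; on an MKL-enabled CPU build this
    # exercises the bfloat16 Softmax support this patch adds.
    y_bf16 = tf.nn.softmax(tf.cast(x, tf.bfloat16))
    # bfloat16 has reduced precision, so compare with a loose tolerance.
    np.testing.assert_allclose(
        tf.cast(y_bf16, tf.float32).numpy(), y_f32.numpy(), atol=1e-2)
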
--- tensorflow/core/kernels/mkl_tmp_bf16_ops.cc | 4 +++- tensorflow/core/ops/nn_grad.cc | 2 +- tensorflow/python/ops/math_ops_test.py | 2 +- tensorflow/python/ops/nn_grad_test.py | 18 ++++++++++++++++++ tensorflow/python/ops/nn_test.py | 15 +++++++++++++++ 5 files changed, 38 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc b/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc index 7f45979a57e..e8d53a1fadf 100644 --- a/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc +++ b/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc @@ -56,7 +56,9 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER( \ Name("_FusedMatMul").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); \ REGISTER_KERNEL_BUILDER( \ - Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); + Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Softmax").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); TF_CALL_bfloat16(REGISTER_CPU); #undef REGISTER_CPU diff --git a/tensorflow/core/ops/nn_grad.cc b/tensorflow/core/ops/nn_grad.cc index c39f3adfa97..ae75e6b95b2 100644 --- a/tensorflow/core/ops/nn_grad.cc +++ b/tensorflow/core/ops/nn_grad.cc @@ -31,7 +31,7 @@ Status SoftmaxGrad(const AttrSlice& attrs, FunctionDef* g) { // Ret val defs {"grad_x: T"}, // Attr defs -#if defined(INTEL_MKL) && defined(ENABLE_INTEL_MKL_BFLOAT16) +#if defined(INTEL_MKL) {{"T: {float, double, bfloat16}"}}, #else {{"T: {float, double}"}}, diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index ab554388cdc..1362a23e104 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -45,7 +45,7 @@ class ReduceTest(test_util.TensorFlowTestCase): self.assertEqual(y_tf, 21) def testReduceExtendType(self): - in_f32 = np.random.rand(1024, 1024).astype(np.float) + in_f32 = np.random.randn(1000, 1000).astype(np.float32) in_bf16 = math_ops.cast(in_f32, dtypes.bfloat16) out_f32 = self.evaluate(math_ops.reduce_sum(in_f32)) diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py index 9da56cb7200..1334b733854 100644 --- a/tensorflow/python/ops/nn_grad_test.py +++ b/tensorflow/python/ops/nn_grad_test.py @@ -33,6 +33,24 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test +class SoftmaxOpTest(test.TestCase): + + @test_util.run_deprecated_v1 + def testSoftmaxGradGradExtendType(self): + if test_util.IsMklEnabled(): + inputs = constant_op.constant( + [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.bfloat16) + r = nn_ops.softmax(inputs) + r_g = gradients_impl.gradients(r, inputs)[0] + with self.cached_session(): + error = gradient_checker.compute_gradient_error( + inputs, + inputs.get_shape(), + r_g, + r_g.get_shape()) + self.assertLess(error, 1e-4) + + class Relu6OpTest(test.TestCase): @test_util.run_deprecated_v1 diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 860bdc60387..ec60e13411d 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -130,6 +130,21 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase): self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps) self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps) + @test_util.run_deprecated_v1 + def testSoftmaxExtendType(self): + if test_util.IsMklEnabled(): + x_shape = [5, 10] + x_np = np.random.randn(*x_shape).astype(np.float32) + + x_f32_tf = constant_op.constant(x_np) + x_bf16_tf = math_ops.cast(x_f32_tf, dtypes.bfloat16) 
+ y_f32_tf = self.evaluate(nn_ops.softmax(x_f32_tf)) + y_bf16_tf = self.evaluate(nn_ops.softmax(x_bf16_tf)) + expected = math_ops.cast(y_f32_tf, dtypes.bfloat16) + # BF16 type has less precision + eps = 1e-2 + self.assertAllClose(y_bf16_tf, expected, eps) + @parameterized.parameters(((5, 10),), ((2, 3, 4),)) @test_util.run_deprecated_v1 def testGradient(self, x_shape): From 486a076444e6aea7fabca8c2b984d1b6c2e50daa Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 13 May 2020 20:41:31 -0700 Subject: [PATCH 0556/1533] support tpu strategy for crossing. PiperOrigin-RevId: 311459380 Change-Id: I51e71d267147c6db2cba449788be63066a4f37bb --- .../preprocessing/categorical_crossing.py | 4 --- .../categorical_crossing_distribution_test.py | 27 ++++++++++++++++--- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py index 88b552e23b7..68848458bb2 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py @@ -20,7 +20,6 @@ from __future__ import print_function import itertools -from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape @@ -110,9 +109,6 @@ class CategoryCrossing(Layer): self._depth_tuple = depth elif depth is not None: self._depth_tuple = tuple([i for i in range(1, depth + 1)]) - strategy = ds_context.get_strategy() - if strategy.__class__.__name__.startswith('TPUStrategy'): - raise ValueError('TPU strategy is not support for this layer yet.') def partial_crossing(self, partial_inputs, ragged_out, sparse_out): """Gets the crossed output from a partial list/tuple of inputs.""" diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py index e1ba91e3558..57dea6edf4a 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py @@ -21,8 +21,10 @@ from __future__ import print_function import numpy as np from tensorflow.python import keras +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.distribute import tpu_strategy from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.keras import keras_parameterized @@ -31,10 +33,22 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util from tensorflow.python.platform import test +def batch_wrapper(dataset, batch_size, distribution, repeat=None): + if repeat: + dataset = dataset.repeat(repeat) + # TPUs currently require fully defined input shapes, drop_remainder ensures + # the input will have fully defined shapes. + if isinstance(distribution, + (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): + return dataset.batch(batch_size, drop_remainder=True) + else: + return dataset.batch(batch_size) + + @combinations.generate( combinations.combine( # Investigate why crossing is not supported with TPU. 
- distribution=strategy_combinations.strategies_minus_tpu, + distribution=strategy_combinations.all_strategies, mode=['eager', 'graph'])) class CategoryCrossingDistributionTest( keras_parameterized.TestCase, @@ -43,6 +57,9 @@ class CategoryCrossingDistributionTest( def test_distribution(self, distribution): input_array_1 = np.array([['a', 'b'], ['c', 'd']]) input_array_2 = np.array([['e', 'f'], ['g', 'h']]) + inp_dataset = dataset_ops.DatasetV2.from_tensor_slices( + {'input_1': input_array_1, 'input_2': input_array_2}) + inp_dataset = batch_wrapper(inp_dataset, 2, distribution) # pyformat: disable expected_output = [[b'a_X_e', b'a_X_f', b'b_X_e', b'b_X_f'], @@ -50,13 +67,15 @@ class CategoryCrossingDistributionTest( config.set_soft_device_placement(True) with distribution.scope(): - input_data_1 = keras.Input(shape=(2,), dtype=dtypes.string) - input_data_2 = keras.Input(shape=(2,), dtype=dtypes.string) + input_data_1 = keras.Input(shape=(2,), dtype=dtypes.string, + name='input_1') + input_data_2 = keras.Input(shape=(2,), dtype=dtypes.string, + name='input_2') input_data = [input_data_1, input_data_2] layer = categorical_crossing.CategoryCrossing() int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict([input_array_1, input_array_2]) + output_dataset = model.predict(inp_dataset) self.assertAllEqual(expected_output, output_dataset) From fc56619a9b43fd6df93f4ee234a303fe77f05fb1 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Wed, 13 May 2020 20:44:56 -0700 Subject: [PATCH 0557/1533] Add the newly-added 'nnapi_allow_fp16' option to README. PiperOrigin-RevId: 311459737 Change-Id: I1f132096008a142f403c811289f387852225d3e5 --- tensorflow/lite/tools/benchmark/README.md | 1 + tensorflow/lite/tools/delegates/README.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md index c44129cbbd3..ae7e4ae150d 100644 --- a/tensorflow/lite/tools/benchmark/README.md +++ b/tensorflow/lite/tools/benchmark/README.md @@ -73,6 +73,7 @@ Note when `use_legacy_nnapi` is selected, this parameter won't work. `/data/local/tmp/` and this benchmark tool will not correctly use NNAPI. * `nnapi_accelerator_name`: `str` (default="") * `disable_nnapi_cpu`: `bool` (default=false) +* `nnapi_allow_fp16`: `bool` (default=false) #### Hexagon delegate * `use_hexagon`: `bool` (default=false) diff --git a/tensorflow/lite/tools/delegates/README.md b/tensorflow/lite/tools/delegates/README.md index 709fcffb24d..bc1bffd49b6 100644 --- a/tensorflow/lite/tools/delegates/README.md +++ b/tensorflow/lite/tools/delegates/README.md @@ -73,6 +73,8 @@ TFLite delegate. [NNAPI CPU reference implementation](https://developer.android.com/ndk/guides/neuralnetworks#device-assignment) from the possible devices to be used by NNAPI to execute the model. This option is ignored if `nnapi_accelerator_name` is specified. +* `nnapi_allow_fp16`: `bool` (default=false) \ + Whether to allow FP32 computation to be run in FP16. ### Hexagon delegate provider * `use_hexagon`: `bool` (default=false) \ From 3f197f3f0e562be8b5ca04acb66487c8864ae5e6 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 13 May 2020 20:48:37 -0700 Subject: [PATCH 0558/1533] Optimize broadcast int8 max. 
PiperOrigin-RevId: 311460102 Change-Id: Id1b3f64deca0d9aca7608985393be5814763817f --- .../internal/optimized/optimized_ops.h | 156 ++++++++++++++++++ tensorflow/lite/kernels/maximum_minimum.cc | 64 +++++-- 2 files changed, 205 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index a6d37f4f1ed..c72400f33a5 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -7921,6 +7921,162 @@ void Transpose(const TransposeParams& unshrinked_params, shrinked_output_shape, output_data); } +// Assume input1 & input2 have the same scale & zero point. +inline void MaximumElementwise(int size, const ArithmeticParams& params, + const int8* input1_data, const int8* input2_data, + int8* output_data) { + ruy::profiler::ScopeLabel label("MaximumElementwiseInt8/8bit"); + + int i = 0; +#ifdef USE_NEON + for (; i <= size - 8; i += 8) { + const int8x8_t input1_val_original = vld1_s8(input1_data + i); + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int8x8_t max_data = vmax_s8(input1_val_original, input2_val_original); + vst1_s8(output_data + i, max_data); + } +#endif // NEON + for (; i < size; ++i) { + const int8 input1_val = input1_data[i]; + const int8 input2_val = input2_data[i]; + output_data[i] = std::max(input1_val, input2_val); + } +} + +inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params, + int8 input1_data, const int8* input2_data, + int8* output_data) { + ruy::profiler::ScopeLabel label("MaximumScalarBroadcastInt8/8bit"); + int i = 0; + +#ifdef USE_NEON + const int8x8_t input1_val_original = vdup_n_s8(input1_data); + for (; i <= size - 8; i += 8) { + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int8x8_t max_data = vmax_s8(input1_val_original, input2_val_original); + vst1_s8(output_data + i, max_data); + } +#endif // NEON + for (; i < size; ++i) { + const int8 input2_val = input2_data[i]; + output_data[i] = std::max(input1_data, input2_val); + } +} + +inline void BroadcastMaximumFivefold( + const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const int8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const int8* unswitched_input2_data, const RuntimeShape& output_shape, + int8* output_data) { + ruy::profiler::ScopeLabel label("BroadcastMaximumFivefoldInt8/8bit"); + + ArithmeticParams switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const ArithmeticParams& params = + use_unswitched ? unswitched_params : switched_params; + const int8* input1_data = + use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const int8* input2_data = + use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. 
The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. + int8* output_data_ptr = output_data; + const int8* input1_data_ptr = input1_data; + const int8* input2_data_reset = input2_data; + // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared + // between input shapes. y3 for input 1 is always broadcast, and so the + // dimension there is 1, whereas optionally y1 might be broadcast for input 2. + // Put another way, + // input1.shape.FlatSize = y0 * y1 * y2 * y4, + // input2.shape.FlatSize = y0 * y2 * y3 * y4. + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) { + // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner + // dimension. + for (int i0 = 0; i0 < y0; ++i0) { + const int8* input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + for (int i3 = 0; i3 < y3; ++i3) { + MaximumElementwise(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } else { + // Special case of y4 == 1, in which the innermost loop is a single element + // and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except simplified + // for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. + for (int i0 = 0; i0 < y0; ++i0) { + const int8* input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + MaximumScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + +// TODO(b/156140316): Try to unify the broadcast dispatch logic for binary ops. +template +inline void BroadcastMaximumDispatch(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int8* input1_data, + const RuntimeShape& input2_shape, + const int8* input2_data, + const RuntimeShape& output_shape, + int8* output_data, Op op) { + if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { + return reference_ops::MaximumMinimumBroadcastSlow( + input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data, op); + } + + BroadcastMaximumFivefold(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); +} + } // namespace optimized_ops } // namespace tflite diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc index 3c6c524c13d..abe9647f69e 100644 --- a/tensorflow/lite/kernels/maximum_minimum.cc +++ b/tensorflow/lite/kernels/maximum_minimum.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -31,6 +32,7 @@ namespace maximum_minimum { // This file has a reference implementation of TFMaximum/TFMinimum. enum KernelType { kReference, + kGenericOptimized, }; constexpr int kInputTensor1 = 0; @@ -85,7 +87,7 @@ struct MinimumOp { } }; -template +template void TFLiteOperation(TfLiteContext* context, TfLiteNode* node, const OpContext& op_context) { reference_ops::MaximumMinimumBroadcastSlow( @@ -98,29 +100,57 @@ void TFLiteOperation(TfLiteContext* context, TfLiteNode* node, op_type::template op); } +// Maximum generic opt int8. +template <> +void TFLiteOperation( + TfLiteContext* context, TfLiteNode* node, const OpContext& op_context) { + tflite::ArithmeticParams op_params; + const bool need_broadcast = optimized_ops::ProcessBroadcastShapes( + GetTensorShape(op_context.input1), GetTensorShape(op_context.input2), + &op_params); + if (need_broadcast) { + optimized_ops::BroadcastMaximumDispatch( + op_params, GetTensorShape(op_context.input1), + GetTensorData(op_context.input1), + GetTensorShape(op_context.input2), + GetTensorData(op_context.input2), + GetTensorShape(op_context.output), + GetTensorData(op_context.output), MaximumOp::template op); + return; + } + reference_ops::MaximumMinimumBroadcastSlow( + GetTensorShape(op_context.input1), GetTensorData(op_context.input1), + GetTensorShape(op_context.input2), GetTensorData(op_context.input2), + GetTensorShape(op_context.output), GetTensorData(op_context.output), + MaximumOp::template op); +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpContext op_context(context, node); - if (kernel_type == kReference) { switch (op_context.output->type) { case kTfLiteFloat32: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, op_context); break; case kTfLiteUInt8: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, + op_context); break; case kTfLiteInt8: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, op_context); break; case kTfLiteInt32: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, + op_context); break; case kTfLiteInt64: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, + op_context); break; case kTfLiteInt16: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, + op_context); break; default: context->ReportError(context, @@ -128,12 +158,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { op_context.output->type); return kTfLiteError; } - } else { - context->ReportError(context, - "Type %d is currently not supported by Maximum.", - op_context.output->type); - return kTfLiteError; - } return kTfLiteOk; } @@ -147,6 +171,14 @@ TfLiteRegistration* Register_MAXIMUM_REF() { return &r; } +TfLiteRegistration* Register_MAXIMUM_GENERIC_OPT() { + static TfLiteRegistration r = { + nullptr, nullptr, maximum_minimum::Prepare, + maximum_minimum::Eval}; + return &r; +} + TfLiteRegistration* Register_MINIMUM_REF() { static TfLiteRegistration r = { nullptr, nullptr, maximum_minimum::Prepare, @@ -154,7 +186,9 @@ TfLiteRegistration* Register_MINIMUM_REF() { maximum_minimum::MinimumOp>}; return &r; } -TfLiteRegistration* 
Register_MAXIMUM() { return Register_MAXIMUM_REF(); } +TfLiteRegistration* Register_MAXIMUM() { + return Register_MAXIMUM_GENERIC_OPT(); +} TfLiteRegistration* Register_MINIMUM() { return Register_MINIMUM_REF(); } } // namespace builtin From 3f75577690a2a1b420727ceef7f46e40697a6ce4 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 13 May 2020 21:10:03 -0700 Subject: [PATCH 0559/1533] Drop the dependency on ruy:detect_arm, which is becoming private. PiperOrigin-RevId: 311462480 Change-Id: I74c62386997b34022301e673275856b77992a1b2 --- tensorflow/lite/kernels/internal/BUILD | 5 +- .../kernels/internal/optimized/cpu_check.cc | 50 +++++++++++++++++++ .../kernels/internal/optimized/cpu_check.h | 8 +-- .../internal/optimized/neon_tensor_utils.cc | 3 +- 4 files changed, 57 insertions(+), 9 deletions(-) create mode 100644 tensorflow/lite/kernels/internal/optimized/cpu_check.cc diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 5958a9c1098..93292fbb640 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -654,7 +654,6 @@ cc_library( "//tensorflow/lite/kernels:cpu_backend_context", "//tensorflow/lite/kernels:cpu_backend_gemm", "@ruy//ruy", - "@ruy//ruy:detect_arm", ], ) @@ -1039,6 +1038,7 @@ cc_test( cc_library( name = "cpu_check", + srcs = ["optimized/cpu_check.cc"], hdrs = [ "optimized/cpu_check.h", "optimized/neon_check.h", @@ -1058,9 +1058,6 @@ cc_library( ":windows": tflite_deps_intel, "//conditions:default": [], }, - deps = [ - "@ruy//ruy:detect_arm", # safe to use regardless of arch. - ], ) cc_test( diff --git a/tensorflow/lite/kernels/internal/optimized/cpu_check.cc b/tensorflow/lite/kernels/internal/optimized/cpu_check.cc new file mode 100644 index 00000000000..8fd17a7e33a --- /dev/null +++ b/tensorflow/lite/kernels/internal/optimized/cpu_check.cc @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" + +#if defined __linux__ && defined __aarch64__ +#include +#endif + +namespace tflite { + +namespace { + +// The implementation of dotprod detection is copied from ruy's internal +// function DetectDotprod(). +// At the moment it's only implemented on Linux ARM64. Consider syncing again +// with ruy in the future to share improvements. +#if defined __linux__ && defined __aarch64__ +bool DetectDotprodByLinuxAuxvMethod() { + // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers, + // however we need to support building against older headers for the time + // being. 
+ const int kLocalHwcapAsimddp = 1 << 20; + return getauxval(AT_HWCAP) & kLocalHwcapAsimddp; +} +#endif + +} // namespace + +bool DetectArmNeonDotprod() { +#if defined __linux__ && defined __aarch64__ + return DetectDotprodByLinuxAuxvMethod(); +#endif + + return false; +} + +} // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/lite/kernels/internal/optimized/cpu_check.h index 2c02e756f14..b39371a3e2f 100644 --- a/tensorflow/lite/kernels/internal/optimized/cpu_check.h +++ b/tensorflow/lite/kernels/internal/optimized/cpu_check.h @@ -15,8 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_ -#include "ruy/detect_arm.h" // from @ruy - // This include is superfluous. However, it's been here for a while, and a // number of files have been relying on it to include neon_check.h for them. // This should be removed, but with a global run of presubmits to catch @@ -25,12 +23,16 @@ limitations under the License. namespace tflite { +// On A64, returns true if the dotprod extension is present. +// On other architectures, returns false unconditionally. +bool DetectArmNeonDotprod(); + struct CpuFlags { bool neon_dotprod = false; }; inline void GetCpuFlags(CpuFlags* cpu_flags) { - cpu_flags->neon_dotprod = ruy::DetectDotprod(); + cpu_flags->neon_dotprod = DetectArmNeonDotprod(); } } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 4d8c20074d5..4c90cd86a56 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -23,7 +23,6 @@ limitations under the License. #include #include -#include "ruy/detect_arm.h" // from @ruy #include "ruy/ruy.h" // from @ruy #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm.h" @@ -80,7 +79,7 @@ inline void* aligned_alloc(size_t alignment, size_t size, } bool HasSdotInstruction() { - static const bool has_dotprod = ruy::DetectDotprod(); + static const bool has_dotprod = DetectArmNeonDotprod(); return has_dotprod; } From 5bd2ae7b8a491055842d7f8c0dd8dccc947fa4d5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 21:15:51 -0700 Subject: [PATCH 0560/1533] Legalize tflite CustomOp PiperOrigin-RevId: 311463041 Change-Id: I1a8eda844814ce08b247c94ad8ec1fb5debea033 --- .../compiler/mlir/lite/transforms/lower_static_tensor_list.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index a69b0a3c624..49be29065fe 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -859,6 +859,7 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( target.addLegalOp(); target.addLegalOp(); target.addLegalOp(); + target.addLegalOp(); // Register fused LSTM/RNN ops as legal. target.addLegalOp(); target.addLegalOp(); From 63262ea46da769530412d2591cf692f9d018e6ab Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 21:47:35 -0700 Subject: [PATCH 0561/1533] Go: Update generated wrapper functions for TensorFlow ops. 
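
These regenerated wrappers track the count-op changes from earlier in this series: the `binary_count` attribute is now `binary_output`, the explicit `output_type` argument is gone, and the dtype of the `weights` input now determines the output dtype. For reference, the equivalent raw-op call from Python after this series looks roughly as follows; the tensor values are illustrative only, while the argument names match the updated golden argspec shown above.

    import tensorflow as tf

    values = tf.constant([[3, 2, 1], [5, 4, 4]], dtype=tf.int64)
    # Weights must have the same shape as values; their dtype (float32 here)
    # determines the dtype of the output values tensor.
    weights = tf.constant([[0.5, 1.0, 2.0], [3.0, 4.0, 5.0]], dtype=tf.float32)
    indices, vals, dense_shape = tf.raw_ops.DenseCountSparseOutput(
        values=values, weights=weights, binary_output=False,
        minlength=-1, maxlength=-1)
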
PiperOrigin-RevId: 311466009 Change-Id: Id2a01503a9a383b197047968be3c385bbd5238ea --- tensorflow/go/op/wrappers.go | 124 +++++++++++++++++------------------ 1 file changed, 59 insertions(+), 65 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c6d67c9ad44..e6725269279 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -4715,7 +4715,7 @@ type DenseCountSparseOutputAttr func(optionalAttr) // DenseCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: int32; minimum value to count. Can be set to -1 for no minimum. +// value: Minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4727,7 +4727,7 @@ func DenseCountSparseOutputMinlength(value int64) DenseCountSparseOutputAttr { // DenseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4742,20 +4742,20 @@ func DenseCountSparseOutputMaxlength(value int64) DenseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// values: int32 or int64; Tensor containing data to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// values: Tensor containing data to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. May +// also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. -func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: Indices tensor for the resulting sparse tensor object. +// output_values: Values tensor for the resulting sparse tensor object. +// output_dense_shape: Shape tensor for the resulting sparse tensor object. +func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_output bool, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} + attrs := map[string]interface{}{"binary_output": binary_output} for _, a := range optional { a(attrs) } @@ -8607,7 +8607,7 @@ type RaggedCountSparseOutputAttr func(optionalAttr) // RaggedCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: int32; minimum value to count. Can be set to -1 for no minimum. +// value: Minimum value to count. Can be set to -1 for no minimum. 
// If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8619,7 +8619,7 @@ func RaggedCountSparseOutputMinlength(value int64) RaggedCountSparseOutputAttr { // RaggedCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8634,33 +8634,27 @@ func RaggedCountSparseOutputMaxlength(value int64) RaggedCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// splits: int64; Tensor containing the row splits of the ragged tensor to count. -// values: int32 or int64; Tensor containing values of the sparse tensor to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// splits: Tensor containing the row splits of the ragged tensor to count. +// values: Tensor containing values of the sparse tensor to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. +// May also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// END -// } -// out_arg { -// name: "output_dense_shape" -// description: <= -1 @@ -13718,7 +13712,7 @@ func SparseCountSparseOutputMinlength(value int64) SparseCountSparseOutputAttr { // SparseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -13733,22 +13727,22 @@ func SparseCountSparseOutputMaxlength(value int64) SparseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// indices: int64; Tensor containing the indices of the sparse tensor to count. -// values: int32 or int64; Tensor containing values of the sparse tensor to count. -// dense_shape: int64; Tensor containing the dense shape of the sparse tensor to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// indices: Tensor containing the indices of the sparse tensor to count. +// values: Tensor containing values of the sparse tensor to count. +// dense_shape: Tensor containing the dense shape of the sparse tensor to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. +// May also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. 
-func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: Indices tensor for the resulting sparse tensor object. +// output_values: Values tensor for the resulting sparse tensor object. +// output_dense_shape: Shape tensor for the resulting sparse tensor object. +func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_output bool, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} + attrs := map[string]interface{}{"binary_output": binary_output} for _, a := range optional { a(attrs) } @@ -18975,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 1b215ce9f3236d2de1c679530332ffe773ac4168 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Wed, 13 May 2020 22:00:02 -0700 Subject: [PATCH 0562/1533] Update speech training notebook to use quantized inputs and outputs PiperOrigin-RevId: 311467379 Change-Id: Id8df2f2a5a72f4dd2f5b8c3178ab9980a22dfff9 --- .../examples/micro_speech/train/train_micro_speech_model.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb b/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb index 2a64ecd7078..bfe75bdd9f7 100644 --- a/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb +++ b/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb @@ -1 +1 @@ -{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"train_micro_speech_model.ipynb","provenance":[{"file_id":"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb","timestamp":1587690382292}],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"pO4-CY_TCZZS","colab_type":"text"},"source":["# Train a Simple Audio Recognition Model"]},{"cell_type":"markdown","metadata":{"id":"BaFfr7DHRmGF","colab_type":"text"},"source":["This notebook demonstrates how to train a 20 kB [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) model to recognize keywords in speech.\n","\n","The model created in this notebook is used in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) example for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview).\n","\n","\n"," \n"," \n","
\n"," Run in Google Colab\n"," \n"," View source on GitHub\n","
\n"]},{"cell_type":"markdown","metadata":{"id":"XaVtYN4nlCft","colab_type":"text"},"source":["**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and set **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.\n","\n","## Configure Defaults\n","\n","**MODIFY** the following constants for your specific use case."]},{"cell_type":"code","metadata":{"id":"ludfxbNIaegy","colab_type":"code","outputId":"1667d949-267c-4588-fe25-c0674d1dd074","executionInfo":{"status":"ok","timestamp":1588895159583,"user_tz":420,"elapsed":3711,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}},"colab":{"base_uri":"https://localhost:8080/","height":85}},"source":["# A comma-delimited list of the words you want to train for.\n","# The options are: yes,no,up,down,left,right,on,off,stop,go\n","# All the other words will be used to train an \"unknown\" label and silent\n","# audio data with no spoken words will be used to train a \"silence\" label.\n","WANTED_WORDS = \"yes,no\"\n","\n","# The number of steps and learning rates can be specified as comma-separated\n","# lists to define the rate at each stage. For example,\n","# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001\n","# will run 12,000 training loops in total, with a rate of 0.001 for the first\n","# 8,000, and 0.0001 for the final 3,000.\n","TRAINING_STEPS = \"12000,3000\"\n","LEARNING_RATE = \"0.001,0.0001\"\n","\n","# Calculate the total number of steps, which is used to identify the checkpoint\n","# file name.\n","TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(\",\"))))\n","\n","# Print the configuration to confirm it\n","!echo \"Training these words:\" $WANTED_WORDS\n","!echo \"Training steps in each stage:\" $TRAINING_STEPS\n","!echo \"Learning rate in each stage:\" $LEARNING_RATE\n","!echo \"Total number of training steps:\" $TOTAL_STEPS"],"execution_count":1,"outputs":[{"output_type":"stream","text":["Training these words: yes,no\n","Training steps in each stage: 12000,3000\n","Learning rate in each stage: 0.001,0.0001\n","Total number of training steps: 15000\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"gCgeOpvY9pAi","colab_type":"text"},"source":["**DO NOT MODIFY** the following constants as they include filepaths used in this notebook and data that is shared during training and inference."]},{"cell_type":"code","metadata":{"id":"Nd1iM1o2ymvA","colab_type":"code","colab":{}},"source":["# Calculate the percentage of 'silence' and 'unknown' training samples required\n","# to ensure that we have equal number of samples for each label.\n","number_of_labels = WANTED_WORDS.count(',') + 1\n","number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label\n","equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))\n","SILENT_PERCENTAGE = equal_percentage_of_training_samples\n","UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples\n","\n","# Constants which are shared during training and inference\n","PREPROCESS = 'micro'\n","WINDOW_STRIDE =20\n","MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,\n"," # low_latency_conv, low_latency_svdf, tiny_embedding_conv\n","\n","# Constants used during training only\n","VERBOSITY = 'WARN'\n","EVAL_STEP_INTERVAL = 
'1000'\n","SAVE_STEP_INTERVAL = '1000'\n","\n","# Constants for training directories and filepaths\n","DATASET_DIR = 'dataset/'\n","LOGS_DIR = 'logs/'\n","TRAIN_DIR = 'train/' # for training checkpoints and other files.\n","\n","# Constants for inference directories and filepaths\n","import os\n","MODELS_DIR = 'models'\n","if not os.path.exists(MODELS_DIR):\n"," os.mkdir(MODELS_DIR)\n","MODEL_TF = os.path.join(MODELS_DIR, 'model.pb')\n","MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')\n","FLOAT_MODEL_TFLITE = os.path.join(MODELS_DIR, 'float_model.tflite')\n","MODEL_TFLITE_MICRO = os.path.join(MODELS_DIR, 'model.cc')\n","SAVED_MODEL = os.path.join(MODELS_DIR, 'saved_model')\n","\n","QUANT_INPUT_MIN = 0.0\n","QUANT_INPUT_MAX = 9.8077\n","QUANT_INPUT_RANGE = QUANT_INPUT_MAX - QUANT_INPUT_MIN"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6rLYpvtg9P4o","colab_type":"text"},"source":["## Setup Environment\n","\n","Install Dependencies"]},{"cell_type":"code","metadata":{"id":"ed_XpUrU5DvY","colab_type":"code","colab":{}},"source":["%tensorflow_version 1.x\n","import tensorflow as tf"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"T9Ty5mR58E4i","colab_type":"text"},"source":["**DELETE** any old data from previous runs\n"]},{"cell_type":"code","metadata":{"id":"APGx0fEh7hFF","colab_type":"code","colab":{}},"source":["!rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"GfEUlfFBizio","colab_type":"text"},"source":["Clone the TensorFlow Github Repository, which contains the relevant code required to run this tutorial."]},{"cell_type":"code","metadata":{"id":"yZArmzT85SLq","colab_type":"code","colab":{}},"source":["!git clone -q --depth 1 https://github.com/tensorflow/tensorflow"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"nS9swHLSi7Bi","colab_type":"text"},"source":["Load TensorBoard to visualize the accuracy and loss as training proceeds.\n"]},{"cell_type":"code","metadata":{"id":"q4qF1VxP3UE4","colab_type":"code","colab":{}},"source":["%load_ext tensorboard\n","%tensorboard --logdir {LOGS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"x1J96Ron-O4R","colab_type":"text"},"source":["## Training\n","\n","The following script downloads the dataset and begin training."]},{"cell_type":"code","metadata":{"id":"VJsEZx6lynbY","colab_type":"code","colab":{}},"source":["!python tensorflow/tensorflow/examples/speech_commands/train.py \\\n","--data_dir={DATASET_DIR} \\\n","--wanted_words={WANTED_WORDS} \\\n","--silence_percentage={SILENT_PERCENTAGE} \\\n","--unknown_percentage={UNKNOWN_PERCENTAGE} \\\n","--preprocess={PREPROCESS} \\\n","--window_stride={WINDOW_STRIDE} \\\n","--model_architecture={MODEL_ARCHITECTURE} \\\n","--how_many_training_steps={TRAINING_STEPS} \\\n","--learning_rate={LEARNING_RATE} \\\n","--train_dir={TRAIN_DIR} \\\n","--summaries_dir={LOGS_DIR} \\\n","--verbosity={VERBOSITY} \\\n","--eval_step_interval={EVAL_STEP_INTERVAL} \\\n","--save_step_interval={SAVE_STEP_INTERVAL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XQUJLrdS-ftl","colab_type":"text"},"source":["## Generate a TensorFlow Model for Inference\n","\n","Combine relevant training results (graph, weights, etc) into a single file for inference. 
This process is known as freezing a model and the resulting model is known as a frozen model/graph, as it cannot be further re-trained after this process."]},{"cell_type":"code","metadata":{"id":"xyc3_eLh9sAg","colab_type":"code","colab":{}},"source":["!rm -rf {SAVED_MODEL}\n","!python tensorflow/tensorflow/examples/speech_commands/freeze.py \\\n","--wanted_words=$WANTED_WORDS \\\n","--window_stride_ms=$WINDOW_STRIDE \\\n","--preprocess=$PREPROCESS \\\n","--model_architecture=$MODEL_ARCHITECTURE \\\n","--start_checkpoint=$TRAIN_DIR$MODEL_ARCHITECTURE'.ckpt-'$TOTAL_STEPS \\\n","--save_format=saved_model \\\n","--output_file={SAVED_MODEL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_DBGDxVI-nKG","colab_type":"text"},"source":["## Generate a TensorFlow Lite Model\n","\n","Convert the frozen graph into a TensorFlow Lite model, which is fully quantized for use with embedded devices.\n","\n","The following cell will also print the model size, which will be under 20 kilobytes."]},{"cell_type":"code","metadata":{"id":"RIitkqvGWmre","colab_type":"code","colab":{}},"source":["import sys\n","# We add this path so we can import the speech processing modules.\n","sys.path.append(\"/content/tensorflow/tensorflow/examples/speech_commands/\")\n","import input_data\n","import models\n","import numpy as np"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"kzqECqMxgBh4","colab_type":"code","colab":{}},"source":["SAMPLE_RATE = 16000\n","CLIP_DURATION_MS = 1000\n","WINDOW_SIZE_MS = 30.0\n","FEATURE_BIN_COUNT = 40\n","BACKGROUND_FREQUENCY = 0.8\n","BACKGROUND_VOLUME_RANGE = 0.1\n","TIME_SHIFT_MS = 100.0\n","\n","DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'\n","VALIDATION_PERCENTAGE = 10\n","TESTING_PERCENTAGE = 10"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"rNQdAplJV1fz","colab_type":"code","colab":{}},"source":["model_settings = models.prepare_model_settings(\n"," len(input_data.prepare_words_list(WANTED_WORDS.split(','))),\n"," SAMPLE_RATE, CLIP_DURATION_MS, WINDOW_SIZE_MS,\n"," WINDOW_STRIDE, FEATURE_BIN_COUNT, PREPROCESS)\n","audio_processor = input_data.AudioProcessor(\n"," DATA_URL, DATASET_DIR,\n"," SILENT_PERCENTAGE, UNKNOWN_PERCENTAGE,\n"," WANTED_WORDS.split(','), VALIDATION_PERCENTAGE,\n"," TESTING_PERCENTAGE, model_settings, LOGS_DIR)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBj_AyCh1cC0","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," float_converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," float_tflite_model = float_converter.convert()\n"," float_tflite_model_size = open(FLOAT_MODEL_TFLITE, \"wb\").write(float_tflite_model)\n"," print(\"Float model is %d bytes\" % float_tflite_model_size)\n","\n"," converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," converter.optimizations = [tf.lite.Optimize.DEFAULT]\n"," converter.quantized_input_stats = {\"Reshape_1\": (QUANT_INPUT_MIN, QUANT_INPUT_MAX)}\n"," def representative_dataset_gen():\n"," for i in range(100):\n"," data, _ = audio_processor.get_data(1, i*1, model_settings,\n"," BACKGROUND_FREQUENCY, \n"," BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS,\n"," 'testing',\n"," sess)\n"," flattened_data = np.array(data.flatten(), dtype=np.float32).reshape(1, 1960)\n"," yield [flattened_data]\n"," converter.representative_dataset = representative_dataset_gen\n"," tflite_model = converter.convert()\n"," tflite_model_size = 
open(MODEL_TFLITE, \"wb\").write(tflite_model)\n"," print(\"Quantized model is %d bytes\" % tflite_model_size)\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"EeLiDZTbLkzv","colab_type":"text"},"source":["# Testing the TensorFlow Lite model's accuracy\n","\n","Verify that the model we've exported is still accurate, using the TF Lite Python API and our test set."]},{"cell_type":"code","metadata":{"id":"wQsEteKRLryJ","colab_type":"code","outputId":"d4a7c3eb-3d74-40e6-9eb5-7d2ffc5e3b6d","executionInfo":{"status":"ok","timestamp":1588901109389,"user_tz":420,"elapsed":9673,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}},"colab":{"base_uri":"https://localhost:8080/","height":51}},"source":["with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","float_interpreter = tf.lite.Interpreter(FLOAT_MODEL_TFLITE)\n","float_interpreter.allocate_tensors()\n","\n","float_input_index = float_interpreter.get_input_details()[0][\"index\"]\n","\n","float_output_index = float_interpreter.get_output_details()[0][\"index\"]\n","float_model_output = float_interpreter.tensor(float_output_index)\n","\n","float_correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," float_interpreter.set_tensor(float_input_index, flattened_input)\n"," float_interpreter.invoke()\n"," top_prediction = float_model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," float_correct_predictions += 1\n","\n","print('Float accuracy is %f%% (N=%d)' % ((float_correct_predictions * 100) / len(test_data), len(test_data)))\n","\n","interpreter = tf.lite.Interpreter(MODEL_TFLITE)\n","interpreter.allocate_tensors()\n","\n","input_index = interpreter.get_input_details()[0][\"index\"]\n","\n","output_index = interpreter.get_output_details()[0][\"index\"]\n","model_output = interpreter.tensor(output_index)\n","\n","with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," interpreter.set_tensor(input_index, flattened_input)\n"," interpreter.invoke()\n"," top_prediction = model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," correct_predictions += 1\n","\n","print('Quantized accuracy is %f%% (N=%d)' % ((correct_predictions * 100) / len(test_data), len(test_data)))\n"],"execution_count":15,"outputs":[{"output_type":"stream","text":["Float accuracy is 91.343042% (N=1236)\n","Quantized accuracy is 90.857605% (N=1236)\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"dt6Zqbxu-wIi","colab_type":"text"},"source":["## Generate a TensorFlow Lite for MicroControllers Model\n","Convert the TensorFlow Lite model into a C source file that can be loaded by TensorFlow Lite for 
Microcontrollers."]},{"cell_type":"code","metadata":{"id":"XohZOTjR8ZyE","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":442},"outputId":"415d733c-86c4-4f19-9aa0-edc4112e6efb","executionInfo":{"status":"ok","timestamp":1588901187730,"user_tz":420,"elapsed":11964,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}}},"source":["# Install xxd if it is not available\n","!apt-get update && apt-get -qq install xxd\n","# Convert to a C source file\n","!xxd -i {MODEL_TFLITE} > {MODEL_TFLITE_MICRO}\n","# Update variable names\n","REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')\n","!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}"],"execution_count":16,"outputs":[{"output_type":"stream","text":["Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n","Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n","Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]\n","Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease\n","Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n","Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n","Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Release\n","Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n","Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n","Get:10 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease [15.4 kB]\n","Get:11 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [908 kB]\n","Get:12 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [844 kB]\n","Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]\n","Get:16 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic/main Sources [1,814 kB]\n","Get:17 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [1,376 kB]\n","Get:18 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [1,205 kB]\n","Get:19 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic/main amd64 Packages [875 kB]\n","Fetched 7,294 kB in 3s (2,429 kB/s)\n","Reading package lists... Done\n","Selecting previously unselected package xxd.\n","(Reading database ... 144429 files and directories currently installed.)\n","Preparing to unpack .../xxd_2%3a8.0.1453-1ubuntu1.3_amd64.deb ...\n","Unpacking xxd (2:8.0.1453-1ubuntu1.3) ...\n","Setting up xxd (2:8.0.1453-1ubuntu1.3) ...\n","Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"2pQnN0i_-0L2","colab_type":"text"},"source":["## Deploy to a Microcontroller\n","\n","Follow the instructions in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) README.md for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview) to deploy this model on a specific microcontroller.\n","\n","**Reference Model:** If you have not modified this notebook, you can follow the instructions as is, to deploy the model. 
Refer to the [`micro_speech/train/models`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/models) directory to access the models generated in this notebook. \n","\n","**New Model:** If you have generated a new model to identify different words: (i) Update `kCategoryCount` and `kCategoryLabels` in [`micro_speech/micro_features/micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h) and (ii) Update the values assigned to the variables defined in [`micro_speech/micro_features/model.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc) with values displayed after running the following cell."]},{"cell_type":"code","metadata":{"id":"eoYyh0VU8pca","colab_type":"code","outputId":"dbaba37d-8a8d-4e11-d780-478971d9ee95","colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"status":"ok","timestamp":1588901241295,"user_tz":420,"elapsed":1288,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}}},"source":["# Print the C source file\n","!cat {MODEL_TFLITE_MICRO}"],"execution_count":17,"outputs":[{"output_type":"stream","text":["unsigned char g_model[] = {\n"," 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x12, 0x00,\n"," 0x1c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,\n"," 0x00, 0x00, 0x18, 0x00, 0x12, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,\n"," 0x64, 0x49, 0x00, 0x00, 0x34, 0x42, 0x00, 0x00, 0x1c, 0x42, 0x00, 0x00,\n"," 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,\n"," 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74,\n"," 0x69, 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0xd4, 0x41, 0x00, 0x00, 0xb4, 0x41, 0x00, 0x00,\n"," 0x24, 0x03, 0x00, 0x00, 0xf4, 0x02, 0x00, 0x00, 0xec, 0x02, 0x00, 0x00,\n"," 0xe4, 0x02, 0x00, 0x00, 0xc4, 0x02, 0x00, 0x00, 0xbc, 0x02, 0x00, 0x00,\n"," 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0xee, 0xbc, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,\n"," 0x05, 0x00, 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00,\n"," 0xd0, 0xb9, 0xff, 0xff, 0xd4, 0xb9, 0xff, 0xff, 0x0a, 0xbd, 0xff, 0xff,\n"," 0x04, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, 0xd4, 0x3e, 0x2e, 0xa6,\n"," 0xd9, 0x4c, 0x23, 0x25, 0xd3, 0x2f, 0x09, 0xcb, 0xf6, 0x04, 0xc4, 0x1d,\n"," 0xe5, 0x46, 0xf2, 0xcf, 0xd5, 0x53, 0x0c, 0x2b, 0x28, 0x06, 0xf8, 0xe9,\n"," 0xe1, 0xdb, 0xdd, 0xf0, 0xbe, 0x0c, 0xfc, 0xa5, 0xb9, 0x1b, 0xca, 0x13,\n"," 0x0d, 0xed, 0x0b, 0xd3, 0xff, 0xc8, 0x0d, 0xee, 0x04, 0xfe, 0xe1, 0x08,\n"," 0xd9, 0xec, 0x26, 0x06, 0x0c, 0xcb, 0x1b, 0xc3, 0xf8, 0x81, 0xd5, 0xbc,\n"," 0xc8, 0x48, 0xe6, 0x46, 0x0e, 0x34, 0x09, 0x0c, 0xea, 0x23, 0xe0, 0x14,\n"," 0x17, 0xf5, 0xe0, 0x07, 0xe2, 0x3a, 0xaa, 0xea, 0x05, 0x5f, 0x26, 0x31,\n"," 0x4e, 0xf6, 0xce, 0xe6, 0x0b, 0xed, 0xa7, 0xea, 0xbe, 0x08, 0xa4, 0x1b,\n"," 0xd0, 0x50, 0x11, 0x2a, 0x16, 0xd3, 0xca, 0x11, 0xeb, 0xd8, 0xcb, 0xeb,\n"," 0xfc, 0xee, 0xa5, 0x12, 0xda, 0x19, 0xfd, 0x1e, 0x1e, 0xc1, 0xc8, 0xe7,\n"," 0xfc, 0x99, 0xae, 0xca, 0xe9, 0x57, 
0x19, 0xe8, 0x1e, 0xff, 0xc4, 0xef,\n"," 0xdc, 0x0d, 0x25, 0xef, 0x1c, 0xef, 0x2e, 0xed, 0xf3, 0x39, 0xd6, 0x76,\n"," 0xe5, 0x4b, 0xb2, 0x2d, 0x4a, 0xf0, 0xf5, 0xcb, 0xc7, 0xf4, 0xbe, 0xea,\n"," 0xcb, 0xed, 0xce, 0x0a, 0xa4, 0x69, 0x1a, 0x34, 0x0a, 0xdc, 0xca, 0x37,\n"," 0xd4, 0xdf, 0x34, 0xe6, 0xf1, 0xd2, 0xb9, 0x1d, 0xb1, 0x42, 0xa3, 0x3a,\n"," 0x0f, 0xc0, 0xc3, 0x0a, 0xcf, 0xc4, 0xe7, 0xd2, 0xfa, 0x62, 0x14, 0x18,\n"," 0x49, 0xe1, 0x07, 0xe2, 0xec, 0x29, 0x4c, 0xd0, 0x53, 0xda, 0xdb, 0xe8,\n"," 0xf9, 0x2f, 0x0e, 0xf6, 0x17, 0x2a, 0x23, 0x29, 0x7d, 0xec, 0x04, 0x2b,\n"," 0x27, 0xf8, 0xb2, 0xdc, 0xbf, 0xec, 0xec, 0xb0, 0xe4, 0x62, 0x01, 0x42,\n"," 0x28, 0xe2, 0x13, 0xe7, 0x13, 0xf3, 0xd3, 0xe1, 0xf7, 0xc3, 0xee, 0xf9,\n"," 0xc4, 0x62, 0xfc, 0x58, 0x12, 0xc5, 0x02, 0x19, 0xe3, 0xe1, 0xf0, 0xe8,\n"," 0xc4, 0x5e, 0xf9, 0xf3, 0x31, 0xce, 0xf0, 0xc0, 0xf8, 0x2e, 0x34, 0x37,\n"," 0x7f, 0xc7, 0xa1, 0xdf, 0xf3, 0x31, 0xf8, 0xed, 0x27, 0x11, 0xc9, 0x19,\n"," 0x72, 0xf3, 0x18, 0x1b, 0x2b, 0xe6, 0xef, 0xd8, 0xd1, 0xd4, 0x14, 0xf8,\n"," 0xd5, 0x51, 0x40, 0x42, 0x2d, 0xe5, 0x0b, 0x94, 0x03, 0xf4, 0xde, 0xdf,\n"," 0xf1, 0xc0, 0x08, 0xf9, 0xc4, 0x71, 0xf5, 0x75, 0x20, 0xc8, 0xf9, 0xcb,\n"," 0xe0, 0x0c, 0x81, 0xf5, 0xc2, 0x6f, 0x25, 0xe3, 0x15, 0xca, 0x40, 0xac,\n"," 0xe6, 0x37, 0x60, 0xb4, 0x30, 0xb8, 0x19, 0xdb, 0xf1, 0x22, 0x56, 0xfe,\n"," 0x02, 0xf7, 0xfb, 0x0e, 0x68, 0xe6, 0x5e, 0x81, 0x15, 0xe4, 0xc5, 0xd9,\n"," 0xc3, 0xbd, 0x42, 0xe5, 0xbe, 0x2f, 0xde, 0x3d, 0x04, 0xe3, 0x4a, 0x97,\n"," 0xdb, 0xf6, 0xb1, 0xdf, 0xe5, 0xb2, 0x4b, 0xf2, 0xbc, 0x5e, 0x22, 0x7f,\n"," 0xfd, 0xd7, 0x37, 0xda, 0xd2, 0x1a, 0x22, 0xf8, 0xbf, 0x69, 0x1b, 0x22,\n"," 0x07, 0xcc, 0x11, 0xa3, 0xf8, 0x2c, 0x35, 0xdf, 0x60, 0xc8, 0xc9, 0xd9,\n"," 0xeb, 0x0c, 0x4e, 0x2e, 0x28, 0xe4, 0x44, 0x02, 0x7f, 0xda, 0x62, 0x25,\n"," 0x14, 0xe6, 0xbd, 0xe1, 0xcf, 0x9c, 0x50, 0x17, 0xff, 0x1e, 0xc3, 0x3c,\n"," 0x25, 0xde, 0x4c, 0x14, 0xf7, 0xfc, 0x02, 0xe1, 0xdd, 0xd3, 0x3d, 0xf8,\n"," 0xef, 0x49, 0x0c, 0x7b, 0x0a, 0xff, 0x24, 0x34, 0xfe, 0x2b, 0x14, 0x0b,\n"," 0xb6, 0x4f, 0xc5, 0x23, 0xe6, 0xe2, 0x12, 0x9f, 0xeb, 0x21, 0xc9, 0x45,\n"," 0x35, 0xcc, 0xbf, 0xea, 0x01, 0xf4, 0xe0, 0x15, 0x0e, 0xe8, 0x9d, 0xff,\n"," 0x54, 0xc7, 0xec, 0x27, 0x32, 0xed, 0xe3, 0xef, 0xd6, 0xa7, 0xf5, 0xea,\n"," 0xfa, 0x09, 0xc3, 0x32, 0x1d, 0xfd, 0x05, 0x19, 0x03, 0xf6, 0x05, 0xe9,\n"," 0xed, 0xe6, 0x05, 0x64, 0xf0, 0x35, 0xdc, 0x61, 0x12, 0x1d, 0x20, 0x3c,\n"," 0x0f, 0x33, 0xf8, 0x12, 0xa1, 0x1c, 0x81, 0x1d, 0xdc, 0xe1, 0x0a, 0x99,\n"," 0xd1, 0xf7, 0x9f, 0xc9, 0x1b, 0xd8, 0x32, 0xf2, 0xee, 0xb3, 0xaf, 0x0f,\n"," 0x01, 0xdd, 0x49, 0xf8, 0x7c, 0xa6, 0xbd, 0xac, 0x36, 0xeb, 0x0f, 0x01,\n"," 0xdb, 0xca, 0xb8, 0xb8, 0xf8, 0xf6, 0xf9, 0x27, 0x32, 0xf8, 0xde, 0xef,\n"," 0x19, 0xff, 0xf9, 0xf7, 0xf3, 0xde, 0xc7, 0x93, 0xfb, 0x1e, 0x1d, 0x50,\n"," 0xf3, 0x31, 0xc5, 0x00, 0x18, 0x27, 0xb8, 0x1a, 0x9e, 0xdf, 0xd0, 0x2c,\n"," 0xce, 0xe0, 0xa3, 0xa9, 0x9d, 0xb8, 0xaf, 0x67, 0x13, 0xd3, 0x19, 0xf7,\n"," 0xed, 0x81, 0xb1, 0x3d, 0xe9, 0xd5, 0x00, 0xf4, 0x45, 0x93, 0xcd, 0x62,\n"," 0x1e, 0xd6, 0x3a, 0x08, 0xd9, 0xb9, 0xd2, 0x1e, 0xeb, 0xe9, 0xbb, 0x1e,\n"," 0x1f, 0xf9, 0xe0, 0x20, 0xf6, 0xf2, 0x30, 0xf9, 0xfe, 0xfb, 0xe9, 0x66,\n"," 0xeb, 0xf5, 0x13, 0x40, 0xcf, 0x2d, 0xce, 0x0f, 0xe9, 0x06, 0x9a, 0x0c,\n"," 0x64, 0xbc, 0xff, 0xff, 0x9a, 0xbf, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x31, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0xbc, 0xff, 0xff,\n"," 0x88, 0xbc, 0xff, 0xff, 0xbe, 0xbf, 0xff, 0xff, 
0x04, 0x00, 0x00, 0x00,\n"," 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, 0xfe, 0xff, 0xff,\n"," 0x78, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x43, 0xfd, 0xff, 0xff, 0xa9, 0xff, 0xff, 0xff, 0x97, 0xfc, 0xff, 0xff,\n"," 0xea, 0xbf, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x3e, 0x00, 0x00,\n"," 0xf5, 0xf9, 0xff, 0x08, 0xea, 0x05, 0x0f, 0x0c, 0xf3, 0x0e, 0xf6, 0x0f,\n"," 0xfa, 0x01, 0x11, 0xf1, 0xf6, 0xea, 0xfc, 0x0f, 0xfc, 0xf1, 0xdd, 0x0e,\n"," 0x1c, 0xef, 0xe6, 0xff, 0x05, 0xe8, 0x03, 0x11, 0xf6, 0xf1, 0x11, 0x0c,\n"," 0xd7, 0x08, 0xf5, 0x30, 0xd9, 0x10, 0x14, 0x11, 0x10, 0x17, 0xee, 0x23,\n"," 0x0c, 0xeb, 0x00, 0x06, 0xf6, 0xf7, 0x18, 0x0e, 0x18, 0x13, 0xfe, 0xfa,\n"," 0xf3, 0xdd, 0xfa, 0xfb, 0x01, 0xfd, 0xe5, 0xe4, 0x00, 0x0d, 0xfe, 0x09,\n"," 0xe9, 0x0a, 0x10, 0x1d, 0xf8, 0xf4, 0x0a, 0x1a, 0x10, 0x12, 0x18, 0xf1,\n"," 0xfc, 0x1d, 0x00, 0x25, 0xd8, 0x08, 0xf8, 0xff, 0x06, 0x19, 0xf5, 0x0f,\n"," 0x1c, 0x17, 0x0c, 0x16, 0xf3, 0x29, 0x20, 0x32, 0xfe, 0x19, 0xfb, 0x02,\n"," 0x04, 0x15, 0xf3, 0x2b, 0x06, 0x14, 0x0e, 0xde, 0x04, 0x0e, 0xfc, 0x2d,\n"," 0x1b, 0xdb, 0xec, 0xee, 0x00, 0xf6, 0x01, 0x33, 0x02, 0xe7, 0x06, 0xdd,\n"," 0xf9, 0x03, 0x13, 0x03, 0xf8, 0xec, 0x14, 0xe4, 0x0f, 0xfa, 0xd4, 0x22,\n"," 0x00, 0x11, 0x09, 0x02, 0x0e, 0xf4, 0x05, 0xfb, 0x04, 0x15, 0x04, 0x03,\n"," 0xff, 0x0f, 0x09, 0xf2, 0xeb, 0xfc, 0x06, 0x00, 0xe5, 0x0a, 0xf2, 0xfc,\n"," 0xfd, 0x12, 0xee, 0xe9, 0xf2, 0xfd, 0xf9, 0xf3, 0xce, 0x0f, 0xe9, 0xee,\n"," 0xff, 0x14, 0x15, 0x0b, 0xcb, 0x03, 0xf2, 0x1b, 0xdb, 0x09, 0x1d, 0x07,\n"," 0xd8, 0xde, 0xe6, 0x13, 0xd8, 0xf0, 0xe6, 0x00, 0xe7, 0xec, 0xd3, 0x00,\n"," 0xc5, 0x25, 0xdb, 0x0a, 0xde, 0x1f, 0xd9, 0x11, 0xc1, 0x06, 0x01, 0x2e,\n"," 0x09, 0x19, 0x09, 0x0f, 0xbe, 0x00, 0xf7, 0x08, 0x10, 0x12, 0xff, 0x10,\n"," 0xf4, 0x05, 0xdf, 0x16, 0xe7, 0xe6, 0xef, 0xf4, 0xdd, 0x18, 0x18, 0x16,\n"," 0xeb, 0x1a, 0xd7, 0xdb, 0xee, 0x15, 0xf1, 0x1e, 0xfc, 0x02, 0xfe, 0x0a,\n"," 0xed, 0x17, 0x1c, 0x39, 0x01, 0xde, 0x06, 0xf3, 0xdb, 0x27, 0xfc, 0x1e,\n"," 0xe4, 0x01, 0x03, 0x1d, 0xc5, 0x0d, 0xea, 0x0b, 0xfe, 0x05, 0xfc, 0x10,\n"," 0xc2, 0x06, 0x0a, 0x51, 0xf4, 0xd8, 0xe8, 0x03, 0xcd, 0x1a, 0xe7, 0x13,\n"," 0xfb, 0xfd, 0xe2, 0x2a, 0xf7, 0x0d, 0xea, 0x29, 0xfc, 0xea, 0x1c, 0x08,\n"," 0x0a, 0x13, 0xfc, 0xf8, 0x15, 0xf3, 0x06, 0xe9, 0x1d, 0x0c, 0x1c, 0x14,\n"," 0xdc, 0x17, 0x16, 0xff, 0x00, 0x06, 0x0c, 0xfe, 0x0c, 0x0a, 0xe6, 0x18,\n"," 0xef, 0xd6, 0x1d, 0xee, 0xd2, 0x1c, 0xfe, 0x0d, 0xec, 0xfc, 0xe8, 0x02,\n"," 0xf8, 0x13, 0xf9, 0x17, 0x08, 0xf8, 0xf9, 0x06, 0x04, 0x07, 0xcf, 0x07,\n"," 0xfb, 0xde, 0xf2, 0x0c, 0xe4, 0xf2, 0x1d, 0xdd, 0xd7, 0xfd, 0xec, 0xfd,\n"," 0xd8, 0xd9, 0x0a, 0xf5, 0xf4, 0x02, 0x1f, 0x0e, 0xf8, 0x1a, 0xe0, 0x06,\n"," 0x0a, 0x23, 0xf6, 0x1f, 0xea, 0x07, 0xde, 0x00, 0xf5, 0x10, 0xe7, 0x06,\n"," 0xf3, 0xe1, 0x0a, 0x2a, 0xf0, 0x00, 0x18, 0x09, 0xe8, 0xd6, 0xec, 0x00,\n"," 0xef, 0x1c, 0xf2, 0x07, 0xf1, 0xf5, 0x16, 0x13, 0xdf, 0x0f, 0xdd, 0x1b,\n"," 0x10, 0xdb, 0xfb, 0x07, 0xda, 0x17, 0xdf, 0x28, 0xf5, 0xe9, 0x07, 0x0b,\n"," 0x02, 0xf4, 0xf0, 0x0e, 0xda, 0x1e, 0x1d, 0xff, 0xde, 0x0e, 0x1e, 0x24,\n"," 0xf5, 0xfc, 0x08, 0x1f, 0xff, 0x12, 0x09, 0x18, 0x20, 0xd8, 0x08, 0xf0,\n"," 0xef, 0x07, 0x02, 0x19, 0xe8, 0xf3, 0x02, 0x03, 0xdf, 0x22, 0x0e, 0x04,\n"," 0x0d, 0xf9, 0xea, 0x1c, 0xf1, 0x17, 0x08, 0x02, 0x0b, 0x02, 0x00, 0x22,\n"," 0xf0, 0x0e, 0xdf, 0x07, 0xea, 0x01, 0xf3, 0xef, 0xfb, 0xff, 0x07, 0xfd,\n"," 0xf7, 0xf2, 0x14, 0x1e, 0x17, 0xe7, 0x12, 0xf8, 0xee, 0xfc, 0x09, 0xe0,\n"," 0x08, 0xd5, 0x07, 0xff, 0x11, 0xf7, 0xee, 0x14, 0xfd, 0xe0, 
0xda, 0x03,\n"," 0xd5, 0xcd, 0x04, 0xe5, 0xea, 0xde, 0xf7, 0x02, 0x0b, 0xfb, 0x03, 0x10,\n"," 0xf7, 0xcf, 0x0c, 0xfb, 0xee, 0x06, 0x0a, 0x12, 0x0e, 0xd7, 0xfb, 0x06,\n"," 0xf6, 0xe0, 0xfb, 0xf1, 0xec, 0xf6, 0x13, 0xf6, 0x0a, 0xea, 0x24, 0x0a,\n"," 0xfd, 0xe6, 0xf8, 0x19, 0x06, 0xe2, 0x05, 0x20, 0x08, 0xe3, 0xd8, 0x05,\n"," 0x00, 0xcd, 0xeb, 0x0f, 0xfd, 0xec, 0xf6, 0xfc, 0xe1, 0xf8, 0xf4, 0xfe,\n"," 0xdf, 0x10, 0xf8, 0x0d, 0xf3, 0xf9, 0x06, 0x06, 0xd5, 0xfb, 0x16, 0x18,\n"," 0x00, 0xfe, 0xf9, 0x17, 0x12, 0xe2, 0xfb, 0xf8, 0xe5, 0x06, 0x29, 0xdf,\n"," 0xfb, 0xfd, 0x08, 0x11, 0xf8, 0x10, 0x13, 0x03, 0xe1, 0xf9, 0xf8, 0xfd,\n"," 0x06, 0xf2, 0x11, 0xff, 0xf8, 0xfe, 0x12, 0xf5, 0xf2, 0xe1, 0x26, 0x0b,\n"," 0xe9, 0xfe, 0x04, 0xf1, 0xeb, 0xfd, 0x0c, 0x26, 0xfd, 0xfb, 0x12, 0xf8,\n"," 0xfd, 0x01, 0x03, 0x05, 0x09, 0x27, 0x28, 0xff, 0x0f, 0x0a, 0xe9, 0xff,\n"," 0x00, 0xec, 0xf7, 0xf4, 0x04, 0x03, 0x08, 0x10, 0xfe, 0xf3, 0x1f, 0xf5,\n"," 0xf0, 0xff, 0x0a, 0x20, 0x0c, 0xd4, 0xef, 0xdb, 0xf5, 0xf4, 0x1a, 0x02,\n"," 0xfe, 0xda, 0x04, 0xe4, 0x0b, 0xd9, 0x1a, 0xee, 0xfd, 0xc6, 0xf8, 0x0d,\n"," 0xec, 0xfe, 0x19, 0xe1, 0x1f, 0xc5, 0x1d, 0x02, 0xf6, 0xd6, 0x04, 0xe6,\n"," 0x06, 0xe4, 0x0c, 0xf0, 0x31, 0xe8, 0xe2, 0xec, 0x1d, 0xe8, 0x0f, 0x02,\n"," 0x2d, 0xe8, 0xf1, 0xf7, 0x0f, 0xf9, 0x13, 0xfd, 0x1f, 0xd8, 0x24, 0x17,\n"," 0xfb, 0xf8, 0x01, 0xe3, 0x14, 0xaf, 0x14, 0x01, 0x1c, 0xe5, 0x10, 0xf2,\n"," 0x16, 0xd3, 0xed, 0xe3, 0x15, 0x02, 0x27, 0xeb, 0x1e, 0x12, 0x19, 0xff,\n"," 0x16, 0xeb, 0x13, 0x11, 0xfa, 0x14, 0xf4, 0x02, 0x11, 0x08, 0xfc, 0xf9,\n"," 0x07, 0xdc, 0x1c, 0xeb, 0x16, 0xf0, 0x1c, 0x06, 0x08, 0xfa, 0xf9, 0x11,\n"," 0xee, 0x07, 0xf3, 0x06, 0xfd, 0xfd, 0x19, 0xf9, 0xf1, 0xe2, 0x1f, 0xf2,\n"," 0x0f, 0xe9, 0x0c, 0xfb, 0x1d, 0x03, 0x02, 0xe2, 0x1c, 0x11, 0xfb, 0xf7,\n"," 0x04, 0x04, 0x18, 0xe7, 0x27, 0xe2, 0xfc, 0xf5, 0x06, 0x00, 0x08, 0xfd,\n"," 0x15, 0xdb, 0x16, 0xfe, 0x04, 0x08, 0xf8, 0xff, 0xfb, 0xeb, 0xeb, 0xfe,\n"," 0xed, 0xf4, 0xf0, 0xe4, 0xfe, 0x22, 0x09, 0x02, 0x21, 0xc8, 0x0b, 0xe4,\n"," 0xf4, 0xf2, 0x04, 0x02, 0xef, 0xce, 0x13, 0x07, 0xfa, 0xe0, 0xff, 0xf1,\n"," 0xfe, 0xd5, 0xfc, 0xdc, 0x0f, 0xf2, 0x05, 0x10, 0x00, 0xd4, 0x24, 0xea,\n"," 0x1e, 0xe3, 0x2a, 0x18, 0xf3, 0xd2, 0x01, 0xe0, 0x0e, 0xdb, 0x2a, 0xeb,\n"," 0x02, 0xdd, 0xec, 0xd7, 0x12, 0xec, 0x31, 0xfc, 0x25, 0xd9, 0x04, 0x08,\n"," 0x15, 0xd0, 0xe8, 0x14, 0x18, 0xf9, 0xfa, 0xf6, 0x24, 0xea, 0x0a, 0x06,\n"," 0x02, 0xfb, 0x05, 0xea, 0x02, 0xf0, 0x04, 0xf1, 0x1f, 0x13, 0x04, 0x17,\n"," 0x14, 0xf0, 0x0d, 0x10, 0x03, 0x05, 0x26, 0xec, 0xfe, 0xe8, 0x19, 0xe9,\n"," 0x0a, 0xee, 0xe4, 0x04, 0x2a, 0xec, 0x1b, 0x06, 0x05, 0xff, 0xd7, 0xf5,\n"," 0x1c, 0x0c, 0x20, 0xfe, 0xe3, 0xe1, 0x11, 0xdc, 0x2b, 0x03, 0x04, 0x1d,\n"," 0x1a, 0xd4, 0x1d, 0xea, 0x06, 0x04, 0x04, 0x1a, 0x1e, 0xef, 0x00, 0xe0,\n"," 0x1e, 0xf8, 0x0c, 0xfe, 0x12, 0xd8, 0x0b, 0xe5, 0xf2, 0x03, 0x21, 0x06,\n"," 0x01, 0x22, 0xef, 0xf3, 0xfb, 0xfb, 0x25, 0x17, 0x08, 0xeb, 0xf3, 0xec,\n"," 0xf4, 0x06, 0x21, 0xec, 0xe3, 0xe3, 0xe4, 0xe5, 0xf9, 0xe8, 0x0d, 0xec,\n"," 0x1c, 0xc3, 0x0b, 0xdf, 0x12, 0x05, 0xe6, 0xdd, 0xde, 0xc5, 0xe6, 0xea,\n"," 0x1a, 0xf1, 0x0f, 0xe3, 0x11, 0xcf, 0xea, 0xe5, 0xfe, 0xf6, 0x02, 0x0b,\n"," 0x0e, 0xd5, 0x03, 0xd6, 0x11, 0x02, 0x2d, 0xfc, 0xed, 0xec, 0xee, 0xfa,\n"," 0xf8, 0xf2, 0x01, 0x0e, 0x19, 0xf1, 0x14, 0x03, 0x1a, 0xf3, 0x0c, 0xf9,\n"," 0xf5, 0xf4, 0xf2, 0xdf, 0xf0, 0xd6, 0x32, 0xf6, 0x18, 0x06, 0xf3, 0x01,\n"," 0x02, 0xe8, 0x09, 0x14, 0xff, 0x0f, 0x23, 0x26, 0x05, 0xf3, 0x08, 0xf3,\n"," 0x16, 0xfb, 0xed, 0x0d, 0x13, 0xe8, 0x25, 0xf1, 0xe9, 0xf2, 0xf5, 
0x0c,\n"," 0x19, 0xf0, 0x1f, 0xfa, 0x00, 0xe4, 0xfe, 0x22, 0xf2, 0xd5, 0x14, 0xe9,\n"," 0x06, 0xe9, 0xfe, 0x13, 0x07, 0x08, 0x00, 0xfd, 0x16, 0xdb, 0xe0, 0x12,\n"," 0x07, 0x14, 0x09, 0x1c, 0x17, 0x10, 0x20, 0xd3, 0xfd, 0xe9, 0x25, 0xfb,\n"," 0x19, 0xd8, 0x0b, 0xf9, 0xf3, 0xde, 0xfe, 0x21, 0x12, 0xec, 0xf4, 0xe4,\n"," 0xf7, 0xff, 0x21, 0xef, 0x26, 0x0f, 0xf9, 0xee, 0xe6, 0x03, 0x2f, 0xf7,\n"," 0x0e, 0x10, 0xfa, 0x08, 0x0b, 0xfa, 0xe9, 0xff, 0xf9, 0xdd, 0x01, 0xe3,\n"," 0xfb, 0x01, 0xfc, 0xf4, 0x1a, 0xb9, 0xf6, 0xd5, 0x1b, 0x01, 0xfd, 0xe2,\n"," 0x03, 0xd2, 0x11, 0xf5, 0x10, 0xd9, 0x07, 0x07, 0xe1, 0xc1, 0xff, 0xd4,\n"," 0x10, 0xef, 0x23, 0x10, 0x01, 0xba, 0x09, 0xd1, 0xfd, 0xe3, 0x0d, 0xe3,\n"," 0x00, 0xcf, 0x03, 0xcd, 0xfd, 0xf9, 0xfe, 0xe9, 0x07, 0xe4, 0x04, 0xfc,\n"," 0xf1, 0x00, 0x21, 0x01, 0xf6, 0x01, 0xda, 0x14, 0xe8, 0xd9, 0x14, 0x05,\n"," 0x08, 0x01, 0x26, 0xf8, 0xfb, 0xc1, 0x2c, 0x1a, 0x06, 0xed, 0xef, 0xf5,\n"," 0xf1, 0x00, 0x0e, 0x19, 0x1f, 0x08, 0xff, 0x0c, 0x04, 0xf6, 0x25, 0x17,\n"," 0x1a, 0x0b, 0xeb, 0xe6, 0x0f, 0x10, 0x13, 0x14, 0x12, 0xfa, 0x22, 0xee,\n"," 0xe6, 0x0b, 0x2d, 0xf9, 0x1e, 0xf0, 0x04, 0x09, 0x00, 0x0f, 0x2f, 0x05,\n"," 0xe8, 0xf9, 0x03, 0xd7, 0x02, 0xea, 0x1f, 0xfd, 0x22, 0xed, 0xf1, 0xed,\n"," 0xfe, 0xdc, 0x0d, 0x0e, 0x0c, 0xf0, 0x19, 0xf1, 0x09, 0xe0, 0x2c, 0xfb,\n"," 0x02, 0xdc, 0xf3, 0xd9, 0x32, 0xf7, 0x09, 0xe3, 0x09, 0x17, 0x03, 0xf3,\n"," 0x08, 0x01, 0x1b, 0xfa, 0x06, 0xfa, 0x1f, 0x15, 0x16, 0xe7, 0x16, 0xfe,\n"," 0xfe, 0xf4, 0xe0, 0xe2, 0x12, 0x21, 0xfa, 0x15, 0x00, 0xcb, 0x07, 0xb6,\n"," 0x1b, 0xf2, 0x34, 0xfa, 0xfd, 0xba, 0x19, 0xd4, 0x2c, 0xde, 0xf2, 0x1c,\n"," 0x0c, 0xc5, 0xef, 0xe4, 0x0a, 0xfb, 0x03, 0x03, 0xf2, 0xcd, 0x01, 0xe0,\n"," 0xf2, 0xf6, 0xf5, 0x0a, 0xf6, 0xc5, 0x0d, 0xe2, 0x09, 0xdc, 0x00, 0x05,\n"," 0x10, 0xe1, 0x14, 0xf7, 0x02, 0x08, 0x14, 0x12, 0xf5, 0xf8, 0x1c, 0xe9,\n"," 0xf5, 0xf1, 0x26, 0xd8, 0x16, 0x06, 0x00, 0xf8, 0xf4, 0xe0, 0x32, 0x03,\n"," 0x07, 0x15, 0xea, 0x10, 0xf2, 0xfa, 0x17, 0x1f, 0x07, 0x07, 0x17, 0x06,\n"," 0x06, 0xe7, 0x05, 0xfe, 0xe5, 0x1b, 0x16, 0xff, 0xf8, 0xfe, 0x2c, 0xf8,\n"," 0x00, 0x03, 0xf3, 0xf3, 0xf3, 0xf0, 0xfb, 0xdf, 0x02, 0xe5, 0x16, 0xed,\n"," 0xf9, 0x01, 0x23, 0x03, 0x16, 0xe6, 0xfe, 0xeb, 0x00, 0xf0, 0x27, 0x1b,\n"," 0xeb, 0xee, 0x03, 0xe9, 0x02, 0xd8, 0x2f, 0xe4, 0x0d, 0xde, 0x14, 0xe3,\n"," 0xfd, 0xf6, 0x13, 0x06, 0x10, 0xf4, 0xeb, 0xe5, 0x19, 0xf0, 0x17, 0xea,\n"," 0x15, 0x0d, 0xe4, 0x0b, 0x31, 0xf3, 0x13, 0x1b, 0xf9, 0xe0, 0x0b, 0xfc,\n"," 0x09, 0x03, 0x26, 0xe6, 0xeb, 0xd1, 0xd9, 0xc8, 0x00, 0xf7, 0x26, 0x0a,\n"," 0x08, 0xd4, 0xe3, 0xd6, 0x1b, 0x06, 0x1a, 0xed, 0xf4, 0xee, 0xfd, 0xe7,\n"," 0x14, 0xe1, 0x06, 0x11, 0xf9, 0xaa, 0xf6, 0xd7, 0x0c, 0xdf, 0x25, 0x17,\n"," 0x11, 0xd8, 0xfa, 0x08, 0x0e, 0xed, 0x29, 0x0c, 0xec, 0xeb, 0x0b, 0x02,\n"," 0xf3, 0xfb, 0x19, 0x1c, 0x13, 0x11, 0x10, 0xeb, 0x0d, 0xef, 0x11, 0xff,\n"," 0x14, 0xe4, 0xd9, 0x02, 0xed, 0xe6, 0x23, 0xdf, 0xfb, 0xf4, 0xef, 0xee,\n"," 0xf9, 0xf2, 0x24, 0x04, 0x03, 0x02, 0x0b, 0x0e, 0xed, 0x08, 0x19, 0xf9,\n"," 0xf2, 0x02, 0xf4, 0x02, 0xf0, 0x1b, 0x03, 0x08, 0xf7, 0xe7, 0xf9, 0xf3,\n"," 0xf7, 0x15, 0x11, 0x18, 0x18, 0x0e, 0x13, 0x13, 0x0d, 0x0e, 0x0e, 0x06,\n"," 0xfb, 0xe8, 0x13, 0x09, 0x07, 0xf2, 0x24, 0x0c, 0x22, 0xf8, 0x08, 0xef,\n"," 0xee, 0xec, 0x25, 0x09, 0x17, 0xde, 0xfb, 0xdd, 0x0d, 0xd0, 0x3c, 0x29,\n"," 0x13, 0xf5, 0xeb, 0xeb, 0xfc, 0xd2, 0x33, 0xf9, 0x05, 0xe0, 0x15, 0x04,\n"," 0x08, 0xfd, 0x14, 0x14, 0xfe, 0x0a, 0xee, 0xe7, 0x14, 0xfb, 0x15, 0xef,\n"," 0x07, 0xdf, 0x12, 0x14, 0x00, 0xf0, 0xff, 0x03, 0xf9, 0xe5, 0xf7, 0xcf,\n"," 
0x07, 0xeb, 0x0b, 0xd8, 0xf4, 0xce, 0xe1, 0xaf, 0x20, 0x0b, 0xfa, 0x09,\n"," 0xf6, 0xbf, 0x18, 0xe9, 0x06, 0xcc, 0x03, 0xf4, 0x0e, 0xb8, 0x08, 0xd0,\n"," 0x07, 0xe9, 0x10, 0x17, 0x0a, 0xcf, 0x21, 0xf7, 0x03, 0xf9, 0x26, 0xe0,\n"," 0x04, 0xe8, 0x0c, 0xff, 0x0b, 0xfe, 0x16, 0x16, 0xfe, 0xda, 0x17, 0x04,\n"," 0xfd, 0x0b, 0x15, 0x0d, 0xf8, 0x08, 0xf9, 0xf3, 0x00, 0xe8, 0x07, 0x0a,\n"," 0xf4, 0xf9, 0x0e, 0xdc, 0xfb, 0xe3, 0xfe, 0x09, 0xff, 0x07, 0xfa, 0xfd,\n"," 0xe6, 0x05, 0xf9, 0x0e, 0xf2, 0xef, 0xfe, 0xf6, 0x04, 0xee, 0x2d, 0x0e,\n"," 0x04, 0xe7, 0xec, 0xfb, 0xf1, 0x08, 0x17, 0x04, 0xf9, 0xf9, 0x15, 0xff,\n"," 0x00, 0xfc, 0x23, 0xf6, 0x00, 0x1a, 0xf4, 0x1c, 0x02, 0x04, 0x1e, 0x11,\n"," 0x00, 0xee, 0xf3, 0xe6, 0xed, 0xfa, 0x24, 0xe0, 0xfb, 0xe7, 0x10, 0xd7,\n"," 0xdc, 0xf5, 0x4c, 0xf3, 0x19, 0x01, 0xf9, 0xef, 0x00, 0xee, 0x13, 0xeb,\n"," 0xf9, 0xd7, 0x0b, 0xf1, 0xef, 0x05, 0x45, 0xf7, 0x01, 0x0b, 0xf3, 0xfa,\n"," 0x0d, 0x10, 0x18, 0x1c, 0xf5, 0xf5, 0x0a, 0xef, 0x0c, 0x19, 0x06, 0xf8,\n"," 0x06, 0xf1, 0x29, 0xd0, 0x0c, 0x07, 0x17, 0xf7, 0x18, 0xb0, 0x26, 0xcf,\n"," 0x16, 0x01, 0x03, 0xf4, 0xf0, 0xc8, 0x04, 0xe8, 0x1a, 0xf4, 0x0f, 0xeb,\n"," 0x0e, 0xb6, 0x00, 0xd3, 0x04, 0xf8, 0x26, 0xf8, 0x1a, 0xa8, 0xf9, 0xcb,\n"," 0x04, 0xeb, 0x22, 0x0a, 0x0d, 0xcd, 0xeb, 0xea, 0x03, 0xe2, 0x09, 0xed,\n"," 0x0b, 0xe3, 0x09, 0xf1, 0xf1, 0xec, 0x21, 0xee, 0x0e, 0xf4, 0x1c, 0x04,\n"," 0xee, 0xfb, 0x0d, 0x1a, 0xfc, 0xf4, 0xfe, 0xef, 0x06, 0xe0, 0x13, 0x0e,\n"," 0xfd, 0x05, 0x0b, 0x1d, 0xfd, 0xf6, 0x09, 0x1b, 0x04, 0x27, 0xf5, 0x0e,\n"," 0xf0, 0xed, 0x1e, 0xf7, 0xea, 0xfa, 0x1a, 0xf9, 0xe5, 0x07, 0x15, 0x0e,\n"," 0x00, 0xea, 0xfa, 0xe9, 0xf7, 0xec, 0x31, 0xec, 0x04, 0x09, 0x10, 0xec,\n"," 0xfd, 0xe4, 0x27, 0x00, 0x0c, 0xdc, 0xdc, 0xde, 0xed, 0xe9, 0x1f, 0xe4,\n"," 0xfa, 0x02, 0xd9, 0xfe, 0x06, 0xf1, 0x15, 0xee, 0xf1, 0xf3, 0x14, 0xe2,\n"," 0x00, 0xdb, 0x28, 0x17, 0x09, 0xdc, 0xfe, 0xea, 0xfc, 0x14, 0x20, 0x13,\n"," 0xf9, 0xed, 0xf1, 0xe8, 0xfd, 0x04, 0x3a, 0xfd, 0x00, 0x15, 0xf1, 0xee,\n"," 0x10, 0xe3, 0x0b, 0x20, 0x10, 0xeb, 0x10, 0xc3, 0x14, 0xf8, 0x03, 0x0b,\n"," 0x11, 0xc3, 0x27, 0xc5, 0x2d, 0xdb, 0x15, 0x0e, 0xf5, 0xce, 0xfa, 0xd8,\n"," 0x1c, 0xf0, 0x20, 0x04, 0xec, 0xc4, 0xf9, 0xda, 0x1c, 0xd9, 0x01, 0x05,\n"," 0x1f, 0xbb, 0xf8, 0xff, 0xef, 0x06, 0x10, 0xe3, 0x02, 0xe6, 0xdb, 0xee,\n"," 0x02, 0xfe, 0xfc, 0x15, 0xfe, 0xf0, 0xdb, 0xfb, 0xf5, 0xfc, 0x16, 0x02,\n"," 0xed, 0x01, 0x12, 0xe2, 0x06, 0xeb, 0x10, 0x16, 0x03, 0xed, 0x1a, 0x07,\n"," 0xf0, 0xe4, 0x29, 0xf5, 0xfa, 0xe1, 0x07, 0xe8, 0xf8, 0xfd, 0xf5, 0x03,\n"," 0xfc, 0x18, 0x03, 0xe2, 0x00, 0xf7, 0x13, 0xf9, 0xe4, 0x10, 0x25, 0xfc,\n"," 0x0e, 0x1f, 0x1c, 0x12, 0x1e, 0xfd, 0x01, 0xf9, 0xef, 0x1d, 0x17, 0x1b,\n"," 0x04, 0xfd, 0x25, 0x12, 0xf5, 0x20, 0x0a, 0x02, 0x03, 0xff, 0xe6, 0xe5,\n"," 0xf4, 0x05, 0x42, 0x1a, 0x0b, 0xdc, 0xfd, 0xed, 0xf3, 0xd0, 0x43, 0xf3,\n"," 0x10, 0x09, 0x0a, 0xed, 0xff, 0xe2, 0x1b, 0x1d, 0x08, 0xe4, 0xfe, 0xf7,\n"," 0xff, 0xf9, 0x2e, 0xfa, 0xf8, 0xe7, 0xe7, 0xeb, 0xfd, 0xfe, 0x30, 0x06,\n"," 0x00, 0x1d, 0x12, 0xf4, 0x0d, 0xf4, 0x1c, 0xed, 0x01, 0xd2, 0x17, 0xb3,\n"," 0x0c, 0x0c, 0xf4, 0x1e, 0x26, 0xd8, 0xf7, 0xbd, 0x24, 0xe7, 0x11, 0x12,\n"," 0xf9, 0xb9, 0xf6, 0xde, 0x3c, 0xf7, 0xfe, 0x0c, 0x16, 0xc5, 0x14, 0xcd,\n"," 0x24, 0x06, 0xfa, 0x21, 0x03, 0xcb, 0xf7, 0xf0, 0xfc, 0xff, 0xfe, 0xf8,\n"," 0x0a, 0xed, 0xdf, 0xe4, 0x0f, 0x19, 0x10, 0x0f, 0xf9, 0xf9, 0x11, 0xf3,\n"," 0xf1, 0xf1, 0x33, 0xdc, 0x02, 0xd6, 0xde, 0xe0, 0xf9, 0xec, 0xfe, 0x09,\n"," 0xfc, 0xd4, 0xeb, 0x0b, 0xec, 0xe3, 0x10, 0x0e, 0x0d, 0x13, 0x00, 0xe6,\n"," 0xf2, 0xf2, 
0x12, 0xec, 0x05, 0xf7, 0xff, 0x03, 0x02, 0x0f, 0x0c, 0x00,\n"," 0xf3, 0xfc, 0x02, 0xd9, 0xf0, 0x02, 0xef, 0xfa, 0x06, 0xda, 0x0a, 0xe4,\n"," 0xf6, 0x10, 0x14, 0x03, 0x12, 0xe6, 0x25, 0x09, 0x06, 0xf1, 0x26, 0x04,\n"," 0xfa, 0xe1, 0xdd, 0xfa, 0xef, 0x06, 0x11, 0xfd, 0xf9, 0xf8, 0xfd, 0xe8,\n"," 0xf8, 0x0b, 0x24, 0x22, 0xf9, 0xd1, 0x1a, 0xfe, 0xf0, 0xed, 0x3c, 0xfd,\n"," 0xf6, 0xfc, 0xe4, 0xf6, 0xf1, 0x05, 0x25, 0xf9, 0xee, 0x1b, 0x0d, 0xe2,\n"," 0xf8, 0xff, 0x2b, 0x16, 0xf6, 0xf4, 0x27, 0xe0, 0x02, 0x05, 0x0a, 0x11,\n"," 0x1d, 0xd1, 0xfb, 0xcb, 0x17, 0xf3, 0x23, 0xf9, 0x17, 0xb7, 0xec, 0x9f,\n"," 0x1d, 0xf2, 0x0f, 0x27, 0x10, 0xc5, 0xfa, 0xdf, 0x21, 0xe7, 0x0e, 0x01,\n"," 0x06, 0xb1, 0x02, 0xe2, 0x0e, 0xf8, 0x07, 0x04, 0x1a, 0xc1, 0x04, 0xed,\n"," 0xfe, 0xf6, 0x0c, 0x1c, 0x1d, 0xe1, 0xe5, 0xed, 0x03, 0xd7, 0xfb, 0x28,\n"," 0x00, 0xdf, 0xe9, 0xcd, 0xef, 0x04, 0x20, 0xe9, 0x10, 0xde, 0x00, 0xee,\n"," 0xf3, 0xd0, 0x02, 0x09, 0x0b, 0x0e, 0xee, 0xf8, 0xea, 0xf3, 0x31, 0x0d,\n"," 0xf7, 0x1e, 0x0f, 0xe9, 0xe9, 0xff, 0x16, 0xda, 0x12, 0xf3, 0xec, 0x1c,\n"," 0xfd, 0x04, 0x0a, 0x09, 0x01, 0xed, 0xf9, 0x0d, 0xf9, 0x12, 0xfc, 0x08,\n"," 0xfa, 0xd6, 0x12, 0x0b, 0x02, 0xff, 0xfe, 0x06, 0x0f, 0xe3, 0xf0, 0xdb,\n"," 0xf0, 0xf5, 0x0f, 0x17, 0x0d, 0xe4, 0x2a, 0xf4, 0x13, 0xe9, 0x3a, 0x0c,\n"," 0x04, 0x11, 0xee, 0xf0, 0xf1, 0xf5, 0x31, 0x04, 0xf2, 0x04, 0x14, 0x02,\n"," 0xfd, 0xe7, 0x2a, 0xf6, 0xff, 0x17, 0xed, 0xea, 0xe1, 0xf9, 0x27, 0x20,\n"," 0x0b, 0xe6, 0x1f, 0xfe, 0x00, 0xf9, 0x10, 0x05, 0x04, 0x0e, 0xf0, 0xf7,\n"," 0x18, 0x17, 0x13, 0xf0, 0x21, 0xcd, 0xf9, 0xcd, 0x13, 0xfb, 0x05, 0xe6,\n"," 0x1b, 0xba, 0xf5, 0xb2, 0x2b, 0xd4, 0x19, 0x18, 0xf4, 0xc8, 0xee, 0xce,\n"," 0x31, 0xf4, 0xec, 0x2d, 0xfa, 0xc0, 0xeb, 0xe9, 0x0e, 0xe0, 0x2f, 0xfe,\n"," 0x17, 0xd1, 0x09, 0xfc, 0xf6, 0xdc, 0xf1, 0x00, 0x11, 0xd2, 0xf4, 0xe4,\n"," 0xfc, 0x0f, 0x02, 0x27, 0x0e, 0xdd, 0x19, 0x08, 0x03, 0xf8, 0x1f, 0xeb,\n"," 0xfa, 0x0d, 0xf1, 0x11, 0x0c, 0xe4, 0x31, 0x07, 0x02, 0xe7, 0xec, 0xf0,\n"," 0xe7, 0x02, 0x1b, 0xf0, 0xf8, 0x22, 0xfa, 0xe2, 0xfd, 0xf2, 0x13, 0x17,\n"," 0x0d, 0xf3, 0xfc, 0x01, 0xe4, 0xe2, 0x01, 0x09, 0xf4, 0xf1, 0x0c, 0x0d,\n"," 0x00, 0xf9, 0xfa, 0x07, 0x0c, 0xf4, 0xf5, 0xe9, 0xfa, 0x2f, 0x3d, 0x11,\n"," 0xef, 0x0b, 0x12, 0x04, 0xed, 0xfb, 0x17, 0x0e, 0x0d, 0xfb, 0xfb, 0xe1,\n"," 0x0e, 0xf0, 0x22, 0x13, 0x07, 0xed, 0xee, 0xda, 0xf2, 0xe8, 0x48, 0x07,\n"," 0xfc, 0xd2, 0xe3, 0xf0, 0xfa, 0xf9, 0x10, 0x0c, 0xe7, 0xeb, 0x01, 0xd3,\n"," 0xfb, 0xff, 0x3b, 0xf9, 0xf8, 0xef, 0xe9, 0xea, 0xe3, 0x01, 0x03, 0x04,\n"," 0xfb, 0xf9, 0x1a, 0x1e, 0x18, 0xf4, 0x05, 0x22, 0x21, 0xc9, 0x0c, 0xbf,\n"," 0x27, 0xfb, 0x06, 0x1d, 0x17, 0xce, 0x0e, 0xb7, 0x3c, 0xfa, 0xea, 0x0f,\n"," 0x12, 0xa6, 0xff, 0xd6, 0x25, 0xd4, 0x1e, 0xe4, 0x12, 0xaf, 0xdd, 0xd6,\n"," 0x2c, 0xfc, 0x08, 0xf5, 0x0e, 0xbb, 0x0a, 0xe2, 0x06, 0xfc, 0x27, 0x2e,\n"," 0x0f, 0xc7, 0xf8, 0x00, 0x00, 0x04, 0x1c, 0x0b, 0x0e, 0x04, 0x17, 0x11,\n"," 0x06, 0x0c, 0x17, 0x13, 0xfb, 0xf3, 0xe0, 0xe7, 0x06, 0xdf, 0x0b, 0x11,\n"," 0x01, 0xfb, 0xef, 0x05, 0xf3, 0xc7, 0x01, 0xfc, 0xfc, 0x0b, 0x04, 0x00,\n"," 0x04, 0x13, 0x25, 0x2a, 0x05, 0xfb, 0x24, 0xf1, 0xe6, 0xfd, 0x19, 0x09,\n"," 0x01, 0xe0, 0xf2, 0xf5, 0x03, 0xfd, 0xfe, 0x06, 0x08, 0xe9, 0xde, 0x1a,\n"," 0xfd, 0x17, 0x1b, 0x11, 0x0c, 0xf7, 0x0c, 0xf6, 0xfb, 0xf6, 0x29, 0x1b,\n"," 0x1e, 0x00, 0xea, 0xe2, 0xfe, 0xeb, 0x1d, 0x22, 0xff, 0x15, 0xec, 0xcd,\n"," 0xef, 0xc4, 0x18, 0x15, 0xed, 0xed, 0x08, 0xeb, 0xf8, 0xe4, 0x35, 0x08,\n"," 0x0b, 0xe4, 0x13, 0xf4, 0xf6, 0xff, 0x12, 0xfc, 0xfc, 0x05, 0x0b, 0xf6,\n"," 0xeb, 0x07, 0x0d, 0x0f, 
0xf8, 0x21, 0xf0, 0xe1, 0x1e, 0xf2, 0xf1, 0xfe,\n"," 0x2b, 0xe6, 0x2a, 0xd2, 0x15, 0xf1, 0x02, 0xfc, 0x22, 0xce, 0xe2, 0xbc,\n"," 0x35, 0xf9, 0x1e, 0x1c, 0x17, 0xaf, 0xf7, 0xfa, 0x2a, 0xea, 0x13, 0xfe,\n"," 0x08, 0xbe, 0x1b, 0xcf, 0x19, 0x16, 0x00, 0x1b, 0x1c, 0xbe, 0xe9, 0xee,\n"," 0x05, 0xe6, 0xec, 0x03, 0x26, 0xd2, 0xec, 0x0c, 0xf7, 0xeb, 0xf8, 0xf8,\n"," 0x1f, 0xde, 0xf3, 0xdd, 0x0f, 0x01, 0x26, 0xf9, 0x00, 0xf0, 0xe9, 0xe0,\n"," 0x0f, 0xc3, 0x0b, 0xe9, 0x01, 0xee, 0x03, 0xd8, 0xf4, 0xee, 0x29, 0x14,\n"," 0xf2, 0xfe, 0xf1, 0x09, 0xfc, 0x09, 0x0e, 0xfe, 0x06, 0x04, 0xfb, 0x07,\n"," 0xf0, 0xfe, 0x24, 0xfa, 0xf7, 0xf9, 0x0b, 0xfa, 0xf1, 0xf3, 0x1c, 0xf9,\n"," 0x05, 0xdb, 0x09, 0xf9, 0x10, 0xf5, 0x17, 0x2d, 0x09, 0xf9, 0xf3, 0x06,\n"," 0xfd, 0xe4, 0x07, 0xf6, 0xff, 0xfb, 0xfe, 0xf7, 0xfb, 0xf2, 0x22, 0xfe,\n"," 0xfb, 0xfb, 0x12, 0xe4, 0xf0, 0xec, 0x2a, 0x1c, 0xf8, 0xfa, 0x01, 0xd9,\n"," 0xef, 0x00, 0x1d, 0x06, 0xf8, 0xff, 0x05, 0x0b, 0xf4, 0x00, 0x38, 0x16,\n"," 0xf3, 0xf5, 0x1e, 0x07, 0xde, 0x0b, 0x32, 0x25, 0xfe, 0x03, 0x0d, 0x0a,\n"," 0x1f, 0x05, 0x28, 0x01, 0x19, 0xd3, 0xff, 0xc2, 0x0a, 0x01, 0xf6, 0x1e,\n"," 0x24, 0xda, 0xf9, 0xb2, 0x4f, 0xef, 0xf9, 0x13, 0xf5, 0xd2, 0xd7, 0xe6,\n"," 0x37, 0xf4, 0x02, 0x09, 0x05, 0xa3, 0xf7, 0xd9, 0x14, 0xf2, 0x0b, 0x05,\n"," 0x36, 0xbd, 0x0c, 0x17, 0xfc, 0xfa, 0x22, 0x27, 0x1f, 0xc2, 0xf6, 0xf3,\n"," 0xff, 0xe6, 0x25, 0x17, 0x08, 0xd0, 0x04, 0x1a, 0xfb, 0xff, 0x08, 0x24,\n"," 0xf1, 0xf3, 0x15, 0xf4, 0xf6, 0xf2, 0x12, 0xe5, 0x01, 0xd8, 0xec, 0x17,\n"," 0x00, 0xd9, 0x08, 0x11, 0x04, 0x11, 0x02, 0xe9, 0xea, 0xe9, 0x20, 0xf4,\n"," 0x12, 0xe7, 0xe3, 0x00, 0xfe, 0x10, 0x1d, 0xeb, 0xfe, 0xe6, 0xd6, 0x05,\n"," 0xfa, 0xf3, 0x14, 0x19, 0x03, 0xdc, 0x0e, 0xe3, 0xf7, 0xfd, 0x31, 0xf3,\n"," 0x05, 0x11, 0xf5, 0xe3, 0x01, 0x05, 0x2c, 0x03, 0x15, 0xdf, 0x21, 0x0e,\n"," 0xe7, 0xfb, 0x09, 0x0c, 0xfb, 0xf9, 0x1b, 0xdc, 0xe3, 0xf3, 0x14, 0xdb,\n"," 0x02, 0xe8, 0x0a, 0xfd, 0xf7, 0xf9, 0x05, 0xdb, 0xfb, 0xe7, 0xf2, 0xfe,\n"," 0xf5, 0xe5, 0x10, 0xdd, 0x00, 0xf0, 0xe0, 0xf5, 0xf0, 0x04, 0x19, 0x24,\n"," 0xff, 0xe4, 0xf0, 0xf0, 0x23, 0x19, 0x17, 0xf6, 0x11, 0xdd, 0xdf, 0xde,\n"," 0x2a, 0xee, 0x0a, 0xfb, 0x2b, 0xc5, 0x05, 0xb4, 0x51, 0xf3, 0x09, 0x10,\n"," 0x0a, 0xb3, 0xfd, 0xe6, 0x48, 0xdf, 0x14, 0x0b, 0x1b, 0xcc, 0xd9, 0xfa,\n"," 0x15, 0xe5, 0xff, 0x24, 0x30, 0xbf, 0x05, 0x02, 0x09, 0x14, 0x25, 0x18,\n"," 0x2d, 0xc2, 0xfe, 0xf5, 0x0a, 0x17, 0xfd, 0x03, 0x15, 0xd3, 0x21, 0x11,\n"," 0x10, 0xe5, 0x02, 0xe3, 0xf7, 0x06, 0x15, 0xfa, 0xf5, 0xd3, 0x17, 0x02,\n"," 0xf9, 0x05, 0x16, 0xe0, 0x16, 0xd4, 0x0c, 0xe9, 0xf4, 0xfd, 0x28, 0x15,\n"," 0x04, 0xe2, 0x03, 0xfd, 0xf6, 0xf5, 0xfb, 0xf8, 0xf4, 0xf1, 0x10, 0xe6,\n"," 0x02, 0xfe, 0x03, 0xca, 0xe8, 0x05, 0x14, 0x02, 0xf9, 0xdc, 0xef, 0xf7,\n"," 0x09, 0x0f, 0x1e, 0x11, 0xfb, 0xfb, 0x13, 0x23, 0xf8, 0x06, 0x14, 0x12,\n"," 0x1b, 0x13, 0x2a, 0xf4, 0x04, 0xe5, 0x24, 0x1c, 0x03, 0xf8, 0x01, 0xd3,\n"," 0xe4, 0xd0, 0x3d, 0xe7, 0x0c, 0xde, 0xf1, 0xe3, 0xf1, 0xe8, 0x12, 0xf1,\n"," 0x10, 0xdb, 0xe5, 0xd3, 0xe5, 0xf7, 0x0f, 0xeb, 0xf9, 0xee, 0x18, 0xe5,\n"," 0xe9, 0x13, 0x18, 0x26, 0x14, 0x00, 0xfc, 0xf7, 0x2b, 0x0f, 0x05, 0xf5,\n"," 0x39, 0xd3, 0xf1, 0xd8, 0x29, 0xf4, 0x0f, 0x15, 0x14, 0xbc, 0x00, 0xc9,\n"," 0x3f, 0xe1, 0x05, 0x11, 0x23, 0xb4, 0xe3, 0xf6, 0x51, 0xde, 0x26, 0xf6,\n"," 0x27, 0xb3, 0xf7, 0xdd, 0x2d, 0xf1, 0x10, 0x09, 0x3d, 0xcd, 0xea, 0xf1,\n"," 0x0c, 0x0e, 0xfe, 0x21, 0x24, 0xd6, 0xf9, 0x08, 0xff, 0xee, 0x12, 0x08,\n"," 0xfd, 0xe8, 0x19, 0xeb, 0x0b, 0xeb, 0x0f, 0x23, 0x0e, 0xd1, 0xfe, 0xf1,\n"," 0xf3, 0xd7, 0xf7, 0x1f, 0xff, 0xe5, 
0xfe, 0x12, 0x05, 0xee, 0x13, 0x20,\n"," 0x22, 0xdd, 0x03, 0x19, 0x08, 0xee, 0xfd, 0x01, 0x12, 0x1a, 0xfc, 0x0c,\n"," 0xf5, 0xf4, 0xfd, 0xef, 0x05, 0xe8, 0x17, 0x08, 0xf2, 0xea, 0x08, 0x13,\n"," 0x03, 0xff, 0xf0, 0xe9, 0xfe, 0xff, 0x22, 0xfb, 0xff, 0xee, 0x0c, 0xfb,\n"," 0xff, 0x06, 0x27, 0x01, 0x08, 0xe3, 0x0c, 0xf1, 0x06, 0xe4, 0x19, 0x0d,\n"," 0x0e, 0xe1, 0xdc, 0xe8, 0xdb, 0xed, 0x2a, 0x0a, 0x06, 0xfd, 0x0e, 0xfb,\n"," 0xfb, 0x06, 0x25, 0x27, 0xfc, 0xf2, 0xf5, 0xf6, 0xef, 0xf7, 0x35, 0xf2,\n"," 0xe9, 0xea, 0x05, 0xf1, 0xdf, 0x06, 0x16, 0xf2, 0xfe, 0xde, 0xf0, 0x05,\n"," 0x2c, 0x25, 0x0a, 0x15, 0x0e, 0xc2, 0x03, 0xad, 0x3a, 0xee, 0x09, 0x27,\n"," 0x31, 0xb8, 0x20, 0xb5, 0x53, 0xd7, 0x09, 0xea, 0x0b, 0xc9, 0x04, 0xf9,\n"," 0x61, 0xda, 0xde, 0x19, 0x2d, 0xc3, 0xe7, 0xd4, 0x1b, 0xe7, 0xf9, 0x0f,\n"," 0x43, 0xc2, 0xff, 0xe6, 0x0c, 0xef, 0x13, 0xf3, 0x1b, 0xe0, 0x0b, 0x08,\n"," 0x05, 0x03, 0x09, 0x03, 0x23, 0xf4, 0xe8, 0xf5, 0x15, 0xfe, 0xee, 0xe8,\n"," 0x06, 0xe1, 0xe8, 0xf0, 0x20, 0xb3, 0xf4, 0x02, 0x06, 0xe4, 0xfa, 0x14,\n"," 0x02, 0xef, 0x13, 0x16, 0x08, 0x0f, 0x0e, 0x22, 0x0b, 0xed, 0xf3, 0x1b,\n"," 0x1d, 0x01, 0x22, 0xec, 0x01, 0xe0, 0xf5, 0x18, 0x0c, 0xd5, 0xff, 0x0e,\n"," 0x09, 0x06, 0x0b, 0xf1, 0x12, 0xe2, 0xe4, 0xd5, 0x07, 0xfb, 0xfc, 0xfe,\n"," 0xf7, 0xf7, 0x04, 0x02, 0xfe, 0xee, 0x05, 0x06, 0x04, 0xd9, 0x00, 0x06,\n"," 0xfb, 0x01, 0x28, 0x06, 0x09, 0xfe, 0x1c, 0xd7, 0xf9, 0xdc, 0x1a, 0xf3,\n"," 0xf6, 0xc9, 0xfd, 0xfe, 0x06, 0xdc, 0x09, 0xf6, 0xfe, 0xe7, 0x18, 0xf9,\n"," 0xf7, 0xe4, 0x24, 0xf5, 0xe9, 0x0a, 0x08, 0xf0, 0xf1, 0x08, 0x2c, 0xfd,\n"," 0xf9, 0xe4, 0xf9, 0x03, 0x38, 0x05, 0x0d, 0xf6, 0x1e, 0xda, 0xfc, 0xb9,\n"," 0x58, 0x01, 0xff, 0xf5, 0x33, 0xb4, 0xf7, 0xb7, 0x72, 0x12, 0x14, 0xf7,\n"," 0xff, 0xd5, 0x06, 0xda, 0x61, 0xd0, 0x06, 0x05, 0x1e, 0xca, 0x0a, 0xfa,\n"," 0x30, 0xcf, 0xfa, 0xf2, 0x31, 0xd2, 0x0d, 0xcd, 0x2f, 0xd8, 0x13, 0x13,\n"," 0x2c, 0xcc, 0x08, 0xd6, 0x23, 0xd9, 0x12, 0x11, 0x18, 0xfa, 0x0c, 0xe3,\n"," 0x18, 0xef, 0xef, 0x00, 0x26, 0xf0, 0xf3, 0xe7, 0x1e, 0xc9, 0x0e, 0x26,\n"," 0x04, 0xeb, 0xf0, 0x0a, 0x26, 0xc9, 0xf6, 0xfb, 0x0c, 0xf1, 0x11, 0x00,\n"," 0x18, 0xec, 0x10, 0x07, 0x0e, 0x06, 0xde, 0xed, 0x0b, 0xd8, 0x13, 0xfe,\n"," 0x05, 0xfc, 0x00, 0xd0, 0x13, 0x07, 0x1f, 0xf2, 0x11, 0x13, 0x0a, 0x1d,\n"," 0x10, 0xf8, 0xfd, 0x06, 0x02, 0x06, 0xf5, 0xdf, 0x10, 0xfa, 0x11, 0xe0,\n"," 0xf7, 0xf5, 0xf9, 0xe8, 0x0d, 0xda, 0x02, 0xf3, 0xf2, 0xef, 0x0c, 0xe9,\n"," 0xfc, 0xc3, 0x18, 0x12, 0xea, 0xfb, 0x08, 0x0f, 0xf7, 0xdf, 0x23, 0x08,\n"," 0x03, 0xeb, 0xe9, 0x1e, 0xf2, 0xe2, 0x13, 0xea, 0x01, 0xf2, 0xec, 0xe8,\n"," 0xed, 0x0d, 0x15, 0xfc, 0x0f, 0xfd, 0x03, 0xfd, 0x61, 0xee, 0x12, 0xe4,\n"," 0x01, 0xd0, 0x0d, 0xc4, 0x4a, 0x10, 0x07, 0x1d, 0x2e, 0xab, 0xe3, 0xa9,\n"," 0x7f, 0xf8, 0x1f, 0xe3, 0x00, 0xe5, 0xe6, 0xcd, 0x6c, 0xc4, 0x2a, 0xfb,\n"," 0x18, 0xd8, 0xf7, 0xb7, 0x49, 0xf7, 0x19, 0xe2, 0x2e, 0xe3, 0xf5, 0xfd,\n"," 0x33, 0xfa, 0x0b, 0xfd, 0x0a, 0xdc, 0xf0, 0x0c, 0x34, 0xd0, 0x02, 0xf4,\n"," 0x22, 0xe4, 0xf8, 0xe3, 0x2f, 0xe4, 0x11, 0xe5, 0x0e, 0x0c, 0x1e, 0xe6,\n"," 0x21, 0xe8, 0x10, 0xfa, 0x07, 0xfa, 0xef, 0x03, 0x01, 0xde, 0x02, 0x08,\n"," 0x0d, 0xdc, 0x17, 0x00, 0x01, 0xe1, 0x1c, 0x0e, 0xfc, 0x02, 0x04, 0xe8,\n"," 0x07, 0xee, 0x06, 0xff, 0x09, 0xcd, 0x1a, 0xd1, 0x18, 0x2c, 0xff, 0xf4,\n"," 0xf4, 0xee, 0x19, 0xec, 0x1b, 0xf4, 0x09, 0x0e, 0x02, 0xee, 0x15, 0xe3,\n"," 0x0f, 0xe4, 0x02, 0x08, 0xfb, 0x15, 0x09, 0xf1, 0x01, 0xcd, 0x22, 0x19,\n"," 0xee, 0x04, 0x1f, 0xd7, 0x0c, 0xd5, 0x10, 0xea, 0x0c, 0x06, 0x14, 0xd1,\n"," 0xef, 0xef, 0x22, 0x22, 0xf1, 0xf1, 0xfc, 0x0d, 
0xf7, 0x00, 0x0e, 0x07,\n"," 0xf4, 0x0d, 0x12, 0x01, 0xde, 0x1d, 0x04, 0xe5, 0x03, 0x15, 0xe8, 0xda,\n"," 0x62, 0x0f, 0x1a, 0xeb, 0x13, 0xd1, 0x09, 0xe7, 0x79, 0x25, 0xfb, 0xff,\n"," 0x43, 0xa8, 0xef, 0xa4, 0x61, 0xfe, 0x15, 0x16, 0x28, 0xbc, 0x07, 0xd6,\n"," 0x59, 0xd3, 0x00, 0xf0, 0x18, 0xcb, 0x05, 0xca, 0x2f, 0x08, 0xf4, 0x2d,\n"," 0x1f, 0xe5, 0x07, 0xfb, 0x1c, 0x0e, 0x26, 0xf3, 0x3c, 0xd1, 0xe7, 0xf7,\n"," 0x0f, 0xf2, 0xfc, 0x24, 0x3a, 0xf4, 0xfa, 0xfc, 0x09, 0xe1, 0x0e, 0x00,\n"," 0x06, 0xe2, 0x04, 0xe8, 0x15, 0xdd, 0xf6, 0x06, 0x21, 0xe5, 0xfb, 0xe7,\n"," 0xfe, 0xed, 0xfb, 0x14, 0x1c, 0xdd, 0xf8, 0xf6, 0x26, 0x02, 0x02, 0xf1,\n"," 0xf7, 0xd3, 0x13, 0xeb, 0x18, 0x03, 0x12, 0xf4, 0xe5, 0xf0, 0xef, 0xe9,\n"," 0x2c, 0x0d, 0xe3, 0x19, 0x12, 0xc8, 0xdd, 0xee, 0x08, 0x0b, 0xee, 0x19,\n"," 0xf9, 0xf3, 0xf4, 0xf9, 0x0a, 0xfd, 0xf2, 0x0e, 0x15, 0xf8, 0xd6, 0x03,\n"," 0x1f, 0xe9, 0xfd, 0x04, 0x15, 0x1f, 0x21, 0xe1, 0x0c, 0xf8, 0xec, 0xf4,\n"," 0xee, 0x0c, 0xef, 0xfd, 0x0a, 0xf4, 0x06, 0x14, 0x10, 0xe1, 0xdd, 0x0b,\n"," 0x0b, 0x05, 0x0e, 0x0f, 0x01, 0xf7, 0xfd, 0xe0, 0xe2, 0x26, 0x28, 0x26,\n"," 0x10, 0x00, 0xe8, 0xfd, 0xfa, 0xec, 0xf7, 0x14, 0x08, 0xff, 0xf7, 0x0c,\n"," 0x06, 0x09, 0xf3, 0x0b, 0xf3, 0xfe, 0xec, 0xfd, 0x1a, 0xf8, 0xf1, 0xdb,\n"," 0xfe, 0x0f, 0xff, 0x0b, 0x17, 0x1f, 0xfb, 0xe7, 0x0c, 0x13, 0x10, 0xf6,\n"," 0x04, 0x11, 0xf3, 0xfd, 0xec, 0xd0, 0xf3, 0xfa, 0x01, 0xfe, 0x03, 0x07,\n"," 0x0d, 0xde, 0xf8, 0x05, 0xee, 0xf0, 0xff, 0x08, 0xff, 0xf0, 0x1d, 0x05,\n"," 0x14, 0xea, 0xfe, 0x04, 0xf1, 0x0e, 0x19, 0xfb, 0x1a, 0xff, 0xef, 0xf2,\n"," 0x02, 0xf5, 0xe7, 0x0e, 0xe4, 0x1d, 0xfa, 0x14, 0xf0, 0xde, 0xf0, 0xe4,\n"," 0xf6, 0x04, 0x07, 0xe6, 0xf1, 0x1b, 0xff, 0xfb, 0x16, 0x02, 0x01, 0x10,\n"," 0x08, 0x14, 0x08, 0x03, 0xf7, 0x01, 0x02, 0xf6, 0xf9, 0xe7, 0xe9, 0xf5,\n"," 0x05, 0x14, 0xfc, 0xe1, 0xfb, 0x20, 0x03, 0x18, 0xfa, 0xe9, 0xf0, 0x1d,\n"," 0xf9, 0xf0, 0xfb, 0xed, 0x0a, 0xd9, 0xf4, 0xeb, 0xed, 0x05, 0xf7, 0x0b,\n"," 0x0f, 0xf0, 0x0a, 0x07, 0xee, 0xdd, 0x17, 0x08, 0xfb, 0x1c, 0xf4, 0x23,\n"," 0xfd, 0x0f, 0x07, 0xdf, 0x03, 0x1f, 0xed, 0xf1, 0xfd, 0xfb, 0xdc, 0x0a,\n"," 0x18, 0xf9, 0x00, 0xea, 0xf7, 0xe8, 0xf6, 0x07, 0xee, 0xf8, 0xec, 0xf7,\n"," 0x04, 0x0e, 0x0f, 0x00, 0x18, 0xfc, 0x09, 0x1a, 0xfb, 0x00, 0xe5, 0xff,\n"," 0x0f, 0x08, 0xeb, 0xfc, 0x0f, 0xe6, 0x14, 0x03, 0xf6, 0xfc, 0x0f, 0xfc,\n"," 0x0b, 0xf2, 0x1c, 0x06, 0xf9, 0x09, 0xf9, 0xdf, 0x14, 0xfb, 0xd6, 0xeb,\n"," 0xfb, 0xeb, 0x0d, 0x0b, 0x15, 0xe6, 0xf6, 0x04, 0x17, 0xfc, 0x10, 0xf4,\n"," 0x05, 0xf7, 0xf7, 0xf2, 0xf9, 0xf0, 0xfc, 0x10, 0x08, 0x0d, 0xe1, 0x0c,\n"," 0x06, 0x12, 0xf1, 0xfd, 0x10, 0x2a, 0xfb, 0xec, 0x0c, 0x05, 0x0b, 0x18,\n"," 0x2b, 0x0c, 0x08, 0xeb, 0x22, 0xfb, 0xfe, 0x07, 0x08, 0x17, 0x0d, 0xed,\n"," 0xe8, 0xf2, 0x0d, 0xdf, 0x14, 0xf5, 0xed, 0xe3, 0x00, 0x06, 0xfb, 0x15,\n"," 0x01, 0x03, 0xf9, 0xfe, 0x08, 0x14, 0x01, 0xf3, 0xe4, 0xfb, 0xfe, 0xde,\n"," 0x0f, 0xe8, 0xff, 0xf1, 0x03, 0xe5, 0x18, 0xff, 0xfd, 0x02, 0x10, 0xec,\n"," 0xfb, 0xf5, 0x12, 0x06, 0x0c, 0xde, 0x0f, 0x0e, 0x03, 0xf1, 0xf9, 0x02,\n"," 0xfa, 0x01, 0x07, 0xf3, 0x02, 0x0f, 0x03, 0x13, 0xf4, 0xee, 0x0a, 0x04,\n"," 0x0f, 0x1c, 0x1a, 0x03, 0x08, 0x06, 0xf6, 0x16, 0xff, 0xec, 0x14, 0xfe,\n"," 0x09, 0xf5, 0x06, 0x1d, 0xf3, 0xf0, 0x22, 0xf7, 0x28, 0xe3, 0x09, 0x28,\n"," 0xf2, 0x1a, 0x1c, 0x0e, 0x1a, 0xd5, 0xf6, 0xdd, 0x03, 0xce, 0xff, 0x03,\n"," 0xf5, 0xf2, 0x14, 0x02, 0x11, 0xd2, 0x08, 0xfa, 0xf2, 0xf7, 0xf6, 0xef,\n"," 0xf8, 0xea, 0xf3, 0xf7, 0xe7, 0x0e, 0x03, 0xf5, 0x07, 0x04, 0x21, 0xf5,\n"," 0xec, 0xf6, 0xf1, 0x0f, 0x09, 0x0a, 0x06, 0x03, 0x14, 0xee, 
0x03, 0x26,\n"," 0x01, 0x0a, 0x09, 0xf8, 0x0a, 0x17, 0xf6, 0x19, 0x1c, 0xfc, 0x0f, 0xf1,\n"," 0xf8, 0x06, 0xf7, 0xd9, 0x0b, 0x0e, 0x04, 0xda, 0x03, 0xe8, 0x15, 0x0a,\n"," 0x35, 0xfe, 0x03, 0xe5, 0x07, 0xfc, 0x11, 0xfa, 0xfc, 0xf4, 0xe9, 0x06,\n"," 0xfd, 0xe4, 0x15, 0x07, 0x10, 0xef, 0xf6, 0xfc, 0x13, 0x14, 0x08, 0x09,\n"," 0x12, 0xe6, 0xfb, 0xe1, 0x17, 0x04, 0xf8, 0xfc, 0xfc, 0xf1, 0xf3, 0xee,\n"," 0x27, 0x0d, 0xf7, 0xfd, 0x0a, 0xf7, 0x14, 0x00, 0x0d, 0xff, 0xf3, 0x0a,\n"," 0xf9, 0x01, 0x04, 0xfd, 0xf2, 0xf4, 0x13, 0x16, 0xfb, 0x09, 0xe4, 0xef,\n"," 0xf8, 0xf1, 0x10, 0xff, 0x14, 0xfa, 0xda, 0xf6, 0xff, 0xff, 0xfb, 0x10,\n"," 0x0b, 0x08, 0x0d, 0xf8, 0x04, 0x10, 0xf8, 0xf2, 0x10, 0x00, 0x16, 0x0b,\n"," 0x00, 0x00, 0x14, 0x0b, 0xee, 0xf7, 0x0e, 0x0b, 0xf8, 0xed, 0xf6, 0x0f,\n"," 0xff, 0xc1, 0xfc, 0x04, 0xf6, 0x0a, 0xfa, 0x01, 0xe3, 0xdc, 0x05, 0x07,\n"," 0x00, 0x27, 0x01, 0x06, 0xe1, 0xeb, 0x25, 0x05, 0xf1, 0x22, 0x17, 0x1a,\n"," 0x0a, 0xff, 0x15, 0x18, 0xf3, 0x0f, 0x01, 0x19, 0xfd, 0x0e, 0xec, 0x08,\n"," 0xfa, 0xfd, 0x0f, 0xeb, 0x09, 0x0e, 0xe2, 0x23, 0x07, 0xfa, 0xef, 0xfe,\n"," 0xe9, 0xfc, 0x27, 0x0d, 0x08, 0xf9, 0x0d, 0xf8, 0x1f, 0x15, 0x15, 0xd7,\n"," 0x1d, 0x1a, 0x0e, 0x12, 0x10, 0x23, 0x0d, 0xef, 0xf4, 0x04, 0xff, 0xec,\n"," 0x05, 0xfc, 0x05, 0x07, 0xf0, 0x0c, 0xfb, 0xf9, 0x07, 0xf4, 0x01, 0x0b,\n"," 0xf5, 0x02, 0x14, 0xfa, 0xe3, 0xee, 0xe5, 0x08, 0xea, 0x11, 0x08, 0x0f,\n"," 0xfc, 0xfc, 0xf4, 0xfb, 0xf6, 0x37, 0x0f, 0xea, 0xfe, 0xfe, 0xf6, 0xf5,\n"," 0x11, 0x27, 0xed, 0xe9, 0xfb, 0x09, 0xfb, 0x05, 0xeb, 0xf8, 0x00, 0xf0,\n"," 0xf1, 0x0c, 0x2b, 0x07, 0xe3, 0x0d, 0x27, 0xdc, 0x06, 0x22, 0xf3, 0x02,\n"," 0xf9, 0x0a, 0x07, 0x24, 0xfe, 0x0a, 0x17, 0x1a, 0x07, 0xf7, 0xee, 0xf3,\n"," 0x14, 0x0c, 0x04, 0x08, 0xf2, 0xec, 0xf7, 0x1d, 0xf1, 0xef, 0xf8, 0xef,\n"," 0x19, 0xe8, 0x1d, 0x1a, 0xe1, 0xd8, 0x0c, 0xee, 0xe7, 0x17, 0x16, 0xe4,\n"," 0xf4, 0xe8, 0x26, 0x08, 0x05, 0x24, 0x06, 0x0b, 0xf7, 0xe8, 0x27, 0x17,\n"," 0xe5, 0xe7, 0xeb, 0xe8, 0x0d, 0xe2, 0xf7, 0x11, 0xfd, 0xdb, 0xf9, 0x17,\n"," 0xfc, 0x15, 0x0f, 0x17, 0xe6, 0xeb, 0xf4, 0xf9, 0x03, 0x19, 0xe0, 0x1e,\n"," 0x09, 0xed, 0xfe, 0xf7, 0x2a, 0x26, 0x12, 0x1a, 0xed, 0xe9, 0x0b, 0xf5,\n"," 0x15, 0x20, 0x1c, 0x07, 0x07, 0xf7, 0x0a, 0x0d, 0x0f, 0x1e, 0x1a, 0xe6,\n"," 0x0f, 0x24, 0x03, 0x1b, 0x20, 0xfc, 0x13, 0x04, 0x0c, 0x03, 0xfe, 0xea,\n"," 0x00, 0x07, 0xec, 0x0f, 0xde, 0x16, 0x19, 0x07, 0xe7, 0xe5, 0x15, 0xfd,\n"," 0xd4, 0x1a, 0xfb, 0x01, 0x07, 0xdb, 0x04, 0xfe, 0xda, 0x20, 0xf9, 0x0f,\n"," 0xce, 0xf6, 0x19, 0x14, 0xe6, 0x2f, 0xed, 0x0b, 0x02, 0xfb, 0xd8, 0xf8,\n"," 0xec, 0x1f, 0x03, 0xfe, 0x14, 0x1e, 0xfd, 0x00, 0xff, 0x13, 0xf4, 0xfb,\n"," 0x01, 0x08, 0xd7, 0x03, 0x03, 0xe0, 0x03, 0xef, 0xfe, 0x0a, 0xe3, 0x05,\n"," 0x03, 0x0b, 0x1e, 0xf0, 0xf1, 0x16, 0x18, 0x01, 0xfb, 0xe5, 0xf5, 0xdc,\n"," 0x03, 0xed, 0x02, 0xff, 0x0b, 0x1a, 0xf7, 0x24, 0xf9, 0xda, 0x1a, 0xe7,\n"," 0x05, 0x1d, 0xf8, 0xf1, 0xf6, 0xf2, 0xd6, 0xf0, 0xfb, 0x16, 0xf1, 0x10,\n"," 0x17, 0xf5, 0x08, 0x09, 0xf7, 0xfa, 0xed, 0x02, 0x09, 0xfc, 0xf1, 0xf2,\n"," 0xfd, 0xea, 0xfc, 0x01, 0x07, 0x06, 0x09, 0x06, 0x08, 0xfb, 0xea, 0x0c,\n"," 0x03, 0x1e, 0x0b, 0x2b, 0xe3, 0xf1, 0x0b, 0xe4, 0x1b, 0x27, 0xea, 0x1c,\n"," 0x0b, 0xfb, 0x01, 0x04, 0x1c, 0x26, 0xf2, 0xf2, 0xf6, 0xf2, 0xfb, 0xfb,\n"," 0x05, 0x2c, 0xef, 0xe9, 0xfb, 0x05, 0x10, 0x0b, 0x08, 0x05, 0x1c, 0xf1,\n"," 0xd2, 0x07, 0x0b, 0xe0, 0xf9, 0x03, 0xe7, 0xf3, 0xfa, 0x12, 0xee, 0xf3,\n"," 0xe0, 0xf8, 0x0e, 0xf0, 0xf1, 0x30, 0x17, 0x01, 0x00, 0xe0, 0x1a, 0xfe,\n"," 0xde, 0x2c, 0x03, 0x05, 0x00, 0xe5, 0xf7, 0x02, 0xfb, 0x34, 0xdd, 
0x08,\n"," 0x09, 0x06, 0x1f, 0x0a, 0x00, 0x14, 0xec, 0xdd, 0xf7, 0xf0, 0xdb, 0xe9,\n"," 0xf8, 0x14, 0xff, 0xee, 0xf5, 0xf9, 0x12, 0x01, 0x0c, 0xf7, 0xfd, 0x23,\n"," 0xff, 0x0d, 0x19, 0x12, 0xfa, 0xf6, 0xf9, 0xfe, 0xe6, 0x00, 0x21, 0x0b,\n"," 0xf8, 0xfd, 0x15, 0xfb, 0xee, 0xf2, 0xfe, 0x0a, 0x12, 0x1d, 0x09, 0xee,\n"," 0xf4, 0xc4, 0xff, 0xe7, 0xfd, 0x2a, 0x22, 0x00, 0xe9, 0xff, 0xea, 0xf1,\n"," 0xfb, 0x15, 0xe0, 0x19, 0xde, 0xe6, 0xf1, 0x00, 0xee, 0xfd, 0xf5, 0x0a,\n"," 0x00, 0xfd, 0x0a, 0x0d, 0xf4, 0xf9, 0xf2, 0xe6, 0x02, 0x15, 0x1c, 0x00,\n"," 0xee, 0xfb, 0xfe, 0xed, 0xf0, 0x3e, 0xff, 0x2f, 0xf6, 0xf7, 0xf7, 0xda,\n"," 0x11, 0x22, 0x15, 0x26, 0xfc, 0xfe, 0xfb, 0xfc, 0xf6, 0x2f, 0x02, 0x14,\n"," 0x18, 0xe9, 0x14, 0x19, 0x14, 0x22, 0x02, 0xfd, 0xff, 0x1a, 0x13, 0xf9,\n"," 0xfd, 0x08, 0x06, 0xeb, 0xeb, 0x1e, 0xf0, 0xf6, 0xf4, 0x01, 0xf9, 0x0f,\n"," 0xe5, 0x03, 0xf4, 0xea, 0x02, 0xe0, 0x04, 0x09, 0xe2, 0x2d, 0xf7, 0x16,\n"," 0x04, 0xde, 0xd8, 0xf2, 0xe2, 0x46, 0xe3, 0x08, 0xe8, 0x0d, 0xf6, 0xfc,\n"," 0xfb, 0x2b, 0xf6, 0x0d, 0xe4, 0x01, 0xfa, 0x03, 0xeb, 0x28, 0x03, 0x24,\n"," 0x1d, 0xf3, 0xff, 0xe9, 0xe7, 0x19, 0x1a, 0xe3, 0x04, 0xf7, 0xed, 0xfd,\n"," 0x02, 0x04, 0x14, 0x09, 0x09, 0x1c, 0x0b, 0x08, 0x09, 0xe8, 0x0b, 0xef,\n"," 0x04, 0x02, 0xfe, 0x19, 0xfc, 0xf4, 0x08, 0xf8, 0xef, 0xd4, 0x04, 0x13,\n"," 0xf6, 0x1c, 0x16, 0x0b, 0xe1, 0xc3, 0xe0, 0xc7, 0x0f, 0x40, 0x12, 0xff,\n"," 0xdf, 0x02, 0xf5, 0xf2, 0xfd, 0x0a, 0xfa, 0x12, 0xef, 0xe6, 0xfb, 0x0c,\n"," 0xfa, 0x0d, 0xfa, 0x18, 0xed, 0xfe, 0x21, 0xf9, 0xed, 0xf3, 0x00, 0x1f,\n"," 0xfc, 0x08, 0x1d, 0x20, 0xdd, 0x14, 0xf8, 0x0e, 0x15, 0x40, 0xeb, 0x30,\n"," 0xdb, 0x09, 0xfc, 0xf1, 0xee, 0x1d, 0x0d, 0x3a, 0x02, 0x0c, 0x0d, 0xf3,\n"," 0x2b, 0x2c, 0x0e, 0x0a, 0x04, 0xf6, 0xfe, 0xe6, 0x17, 0x21, 0xee, 0x0a,\n"," 0x11, 0x05, 0xf4, 0x19, 0x05, 0x2b, 0xe7, 0xfa, 0xfa, 0x25, 0x08, 0xd8,\n"," 0xdd, 0xf6, 0xf6, 0x22, 0xf0, 0xfa, 0x06, 0xdf, 0xe5, 0xe1, 0x09, 0xf2,\n"," 0xfc, 0x2d, 0x07, 0xfa, 0xf2, 0xe8, 0xf7, 0xee, 0xf7, 0x46, 0x03, 0xfb,\n"," 0xe9, 0xf7, 0x07, 0x01, 0x1b, 0x23, 0xf3, 0x09, 0xff, 0x07, 0xfa, 0xeb,\n"," 0xfb, 0x38, 0x05, 0xf1, 0xed, 0xf9, 0x13, 0xfd, 0xf9, 0x16, 0x04, 0x12,\n"," 0x00, 0x06, 0xf1, 0xf2, 0x0c, 0xfe, 0xf4, 0xd7, 0x08, 0x15, 0xe2, 0x11,\n"," 0x14, 0x0c, 0x02, 0xeb, 0x06, 0x21, 0x00, 0x0c, 0x14, 0x0a, 0x24, 0xfe,\n"," 0xda, 0xdb, 0x0f, 0x0a, 0xf5, 0x3a, 0x11, 0xe3, 0xed, 0xcc, 0xfb, 0xbb,\n"," 0x12, 0x27, 0x0a, 0x02, 0xe8, 0x00, 0xfe, 0xf2, 0xfe, 0x1c, 0x05, 0xfb,\n"," 0xf9, 0x0c, 0xf8, 0x1c, 0xe9, 0xfa, 0xe5, 0x10, 0xdc, 0xea, 0xdb, 0xfd,\n"," 0xe4, 0x0a, 0xe9, 0xf5, 0xe9, 0x01, 0x2a, 0x19, 0xf9, 0x10, 0xfc, 0xff,\n"," 0x06, 0x27, 0x0a, 0x4c, 0xe9, 0x03, 0xf4, 0x10, 0x25, 0x48, 0xef, 0x3f,\n"," 0xfe, 0x00, 0xf9, 0x0a, 0x21, 0x2d, 0x08, 0x18, 0x0a, 0xed, 0x06, 0xe4,\n"," 0x2d, 0x13, 0x09, 0x0c, 0x0c, 0x0f, 0x11, 0x06, 0x18, 0x18, 0xf0, 0xff,\n"," 0xf2, 0x1e, 0xf8, 0x13, 0xe6, 0xf3, 0xea, 0x1e, 0xf5, 0x18, 0xfb, 0x1c,\n"," 0xe2, 0xdb, 0x13, 0xf8, 0x03, 0x35, 0xfc, 0xf8, 0xed, 0xf1, 0x05, 0xf6,\n"," 0x0b, 0x3c, 0xfe, 0x06, 0xe1, 0x0f, 0x03, 0x07, 0x11, 0x29, 0x16, 0x0e,\n"," 0xec, 0x01, 0xf3, 0xf3, 0x11, 0x29, 0x07, 0x04, 0x15, 0x11, 0x10, 0xf0,\n"," 0x04, 0x11, 0xf2, 0x22, 0x08, 0x0b, 0xff, 0xe8, 0x08, 0xf5, 0x00, 0xe1,\n"," 0x01, 0x09, 0x04, 0xfd, 0x03, 0xea, 0x06, 0xf6, 0x01, 0x08, 0xed, 0x0d,\n"," 0xfe, 0x0f, 0x07, 0x00, 0xe3, 0xd8, 0x02, 0x1e, 0xf3, 0x3d, 0x35, 0x0f,\n"," 0xcb, 0xe2, 0x13, 0xd6, 0x0c, 0x4e, 0x16, 0xe3, 0xe0, 0xf2, 0xf4, 0xf4,\n"," 0xf5, 0x28, 0xf8, 0xf8, 0xe8, 0x05, 0xe8, 0x12, 0xf9, 0x04, 0xee, 0x0e,\n"," 
0xdf, 0xee, 0xed, 0x0d, 0xe1, 0xff, 0xfa, 0x0c, 0xfd, 0x05, 0xe7, 0x13,\n"," 0x07, 0x1c, 0xfd, 0x05, 0x0c, 0x42, 0xf3, 0x42, 0xf8, 0x17, 0xe6, 0x0d,\n"," 0x03, 0x3a, 0x27, 0x33, 0x03, 0x02, 0x1c, 0x10, 0x15, 0x31, 0xdf, 0x18,\n"," 0x00, 0xfb, 0x02, 0xe1, 0x12, 0x29, 0x23, 0x05, 0x03, 0x0c, 0x07, 0x11,\n"," 0x0e, 0x06, 0xfa, 0xf8, 0x05, 0x1d, 0x24, 0x10, 0x00, 0xfa, 0xea, 0x10,\n"," 0x06, 0xf4, 0xfd, 0x16, 0xec, 0xf0, 0x0c, 0x0e, 0x0d, 0x1b, 0xf0, 0xfd,\n"," 0xf9, 0xe4, 0x16, 0xeb, 0x01, 0x2b, 0x0b, 0xeb, 0xef, 0x10, 0xdf, 0x04,\n"," 0x14, 0x2f, 0x03, 0xe5, 0xf6, 0x11, 0x26, 0xf9, 0xf8, 0x36, 0x14, 0xf0,\n"," 0x03, 0x05, 0xf3, 0xed, 0x13, 0x0a, 0x07, 0x03, 0xfb, 0xff, 0xf6, 0xe7,\n"," 0x08, 0xf4, 0xf5, 0x08, 0x05, 0xfe, 0xe8, 0x1e, 0x05, 0x08, 0xe2, 0xfd,\n"," 0x25, 0x14, 0x1c, 0x15, 0x0b, 0x1b, 0x0e, 0x01, 0xf1, 0xe1, 0x19, 0x15,\n"," 0xec, 0x4b, 0x26, 0xfe, 0xd9, 0xe4, 0x09, 0xd5, 0xf9, 0x40, 0xfe, 0xff,\n"," 0xf0, 0x07, 0x10, 0xfa, 0x0a, 0x29, 0x08, 0x01, 0xdd, 0xf1, 0xe8, 0x12,\n"," 0xf3, 0x13, 0xf8, 0x02, 0xe2, 0xe2, 0x28, 0x11, 0xee, 0xf6, 0xd4, 0x0c,\n"," 0xee, 0x19, 0xff, 0x10, 0xf2, 0x25, 0xff, 0xfd, 0x0d, 0x19, 0x06, 0x53,\n"," 0x06, 0x06, 0xf9, 0x1a, 0x0a, 0x45, 0xe5, 0x43, 0x0a, 0x05, 0xfd, 0x0e,\n"," 0x00, 0x17, 0x02, 0x14, 0xf4, 0x12, 0x08, 0xd7, 0x14, 0x0d, 0xf3, 0x1a,\n"," 0x0b, 0xfe, 0x21, 0x1b, 0x2f, 0x1e, 0xf1, 0xf4, 0xfb, 0x21, 0x08, 0x00,\n"," 0xf3, 0xfe, 0x02, 0x0e, 0x00, 0xfb, 0x13, 0xfc, 0xee, 0xf0, 0x04, 0x04,\n"," 0x04, 0x10, 0x02, 0xfd, 0xdc, 0xfc, 0x0c, 0xe8, 0xfa, 0x36, 0xf4, 0x0a,\n"," 0xf1, 0xfa, 0xf8, 0xea, 0x00, 0x23, 0x0c, 0x18, 0x00, 0xfb, 0x12, 0xf1,\n"," 0xff, 0x29, 0x05, 0x05, 0x05, 0x0a, 0xf6, 0xec, 0x0c, 0x12, 0x05, 0x16,\n"," 0xff, 0x11, 0x16, 0xed, 0x01, 0x0c, 0x16, 0x1a, 0xf6, 0x06, 0x09, 0x00,\n"," 0x0b, 0x21, 0xf3, 0x0b, 0xdd, 0x1a, 0x12, 0x14, 0x06, 0x2f, 0x11, 0xdc,\n"," 0xff, 0xdf, 0x1d, 0x17, 0xe4, 0x3f, 0x0c, 0xf5, 0xe6, 0xdb, 0xfa, 0xdf,\n"," 0x0c, 0x33, 0x27, 0xe8, 0xe8, 0xf1, 0x0f, 0x06, 0x05, 0x18, 0x11, 0xfb,\n"," 0xe2, 0xf9, 0xd9, 0xef, 0xf7, 0x0c, 0xf3, 0xfe, 0xce, 0xec, 0xfd, 0x04,\n"," 0xfe, 0xf5, 0xf6, 0x19, 0xe0, 0x0f, 0x0a, 0x0a, 0xed, 0x0c, 0xf9, 0x26,\n"," 0x08, 0x30, 0x00, 0x2a, 0xe7, 0xfd, 0xf4, 0xfc, 0x0c, 0x1c, 0xe9, 0x5e,\n"," 0x1c, 0x0b, 0x07, 0xf0, 0x10, 0x23, 0xea, 0x17, 0xfc, 0x01, 0x0d, 0xfb,\n"," 0x2f, 0x2f, 0xe7, 0xfb, 0x04, 0x1b, 0x0f, 0xd9, 0x14, 0x21, 0xf0, 0x0e,\n"," 0xe9, 0x1c, 0x0d, 0xdd, 0xf2, 0x0c, 0xe7, 0x09, 0x01, 0x12, 0x0b, 0xe5,\n"," 0xe4, 0xe7, 0x05, 0xdb, 0x10, 0x25, 0xf1, 0xfa, 0xfa, 0xeb, 0x18, 0x0f,\n"," 0xf7, 0x3d, 0x22, 0xf3, 0xed, 0xfa, 0x01, 0x0d, 0x1b, 0x28, 0x00, 0xe8,\n"," 0xfc, 0x0d, 0xf3, 0x00, 0x00, 0x16, 0x19, 0x05, 0x0b, 0x07, 0xfb, 0xfe,\n"," 0x18, 0x15, 0x24, 0xeb, 0xf2, 0x16, 0x0a, 0xe6, 0x13, 0x03, 0xf5, 0xff,\n"," 0x04, 0x00, 0xfe, 0x0e, 0x03, 0xf3, 0x0e, 0x0d, 0x04, 0x27, 0xeb, 0x0d,\n"," 0x09, 0x23, 0x15, 0xf3, 0xdb, 0xf0, 0xf6, 0x14, 0x06, 0x1f, 0x19, 0xfa,\n"," 0xe0, 0xe9, 0xfc, 0xd8, 0x09, 0x54, 0x21, 0xfb, 0xd8, 0x0d, 0xd8, 0x07,\n"," 0x12, 0x1f, 0x04, 0x14, 0xdd, 0x03, 0x14, 0xf1, 0xdf, 0xfa, 0x01, 0x0e,\n"," 0xe4, 0xfa, 0x0c, 0x20, 0xe6, 0x06, 0xf4, 0xfc, 0xf2, 0xf9, 0x01, 0x20,\n"," 0xed, 0x18, 0xf3, 0x07, 0xfe, 0x3c, 0xfa, 0x3f, 0xfc, 0x14, 0x0b, 0xfe,\n"," 0x0d, 0x29, 0x15, 0x30, 0x04, 0x01, 0xe6, 0x0f, 0x19, 0x23, 0xff, 0x22,\n"," 0x0d, 0xfc, 0xfe, 0x13, 0x1a, 0x26, 0xd8, 0x1a, 0x13, 0x03, 0xfd, 0xfc,\n"," 0x0c, 0x12, 0xed, 0x08, 0x18, 0x11, 0x1b, 0xfe, 0x19, 0xfa, 0xf4, 0x07,\n"," 0xf6, 0x11, 0x09, 0xf5, 0x02, 0xdb, 0x21, 0xe0, 0x06, 0x13, 0xec, 0x06,\n"," 0xf3, 0xe5, 
0xfb, 0xfb, 0xf9, 0x2d, 0xf7, 0xe4, 0xfe, 0xfb, 0xdc, 0xf9,\n"," 0x03, 0x1d, 0x0c, 0xec, 0xf1, 0x14, 0x03, 0x00, 0x04, 0x16, 0x1d, 0x00,\n"," 0x18, 0x01, 0x20, 0xf8, 0x0f, 0x26, 0x11, 0xdb, 0x06, 0x0a, 0xe9, 0xf1,\n"," 0x0b, 0xfe, 0x07, 0xf7, 0xff, 0xfe, 0x08, 0xff, 0x05, 0xf4, 0x13, 0x05,\n"," 0xe3, 0x02, 0x24, 0xfe, 0x0e, 0x0d, 0x01, 0xea, 0xec, 0xd9, 0x0e, 0x0e,\n"," 0xf2, 0x2f, 0x23, 0x11, 0xf3, 0xcd, 0x0c, 0xea, 0x1f, 0x49, 0x16, 0x04,\n"," 0xec, 0x0a, 0x18, 0xef, 0x20, 0x0e, 0x20, 0xda, 0xc3, 0xfd, 0x09, 0xeb,\n"," 0xe5, 0x20, 0xfb, 0x06, 0xe7, 0x04, 0xfc, 0x10, 0xfa, 0xf6, 0xfb, 0xee,\n"," 0xe6, 0x0b, 0xee, 0x13, 0xeb, 0x11, 0xea, 0xed, 0x20, 0x34, 0x0f, 0x2e,\n"," 0xf3, 0x1c, 0x00, 0xd4, 0x15, 0x3e, 0x12, 0x31, 0xf4, 0x06, 0xf9, 0xdd,\n"," 0x11, 0x1c, 0x23, 0x11, 0xf8, 0xfb, 0x11, 0xfb, 0x19, 0x10, 0xd8, 0x24,\n"," 0x10, 0x18, 0x0d, 0x27, 0x04, 0x0f, 0xdf, 0xf5, 0x08, 0x07, 0x12, 0xdb,\n"," 0x08, 0x01, 0x07, 0xfe, 0xf3, 0x00, 0x09, 0xf9, 0x01, 0xd3, 0x00, 0xf9,\n"," 0x05, 0x1d, 0xf9, 0xf2, 0xf4, 0xf9, 0x1a, 0xfd, 0xf2, 0x38, 0x01, 0x12,\n"," 0xef, 0xf6, 0x06, 0xfb, 0x0a, 0x1c, 0xf6, 0x10, 0x06, 0x05, 0xf2, 0x03,\n"," 0xf9, 0x07, 0x07, 0xf8, 0x0f, 0xff, 0xf3, 0xff, 0x17, 0x18, 0x08, 0x0d,\n"," 0xf2, 0xff, 0xf1, 0x03, 0x2e, 0xfb, 0xff, 0xd5, 0xf0, 0x05, 0x01, 0x0d,\n"," 0xf5, 0xf0, 0xeb, 0x05, 0x0c, 0x0d, 0xff, 0x13, 0x0c, 0x13, 0x24, 0xf1,\n"," 0xf9, 0xf1, 0x07, 0x06, 0xe9, 0x45, 0x2c, 0x0e, 0xdc, 0xe5, 0x1c, 0xea,\n"," 0x0e, 0x4e, 0x32, 0x05, 0xed, 0xfb, 0xfa, 0xf6, 0x0d, 0x15, 0xfb, 0xfb,\n"," 0xe2, 0xf7, 0xea, 0xfb, 0xf1, 0x14, 0xef, 0x07, 0xf8, 0x08, 0x1d, 0x24,\n"," 0xed, 0x06, 0xe3, 0xed, 0xf1, 0x09, 0x1f, 0x0e, 0xef, 0x1b, 0xec, 0xfb,\n"," 0x10, 0x2c, 0x08, 0x3e, 0xef, 0x0d, 0x07, 0xf4, 0x0c, 0x35, 0x18, 0x30,\n"," 0xf5, 0xf7, 0xf4, 0xf8, 0x12, 0x29, 0x00, 0x0e, 0xfe, 0x00, 0x03, 0xe6,\n"," 0x13, 0x29, 0x1f, 0x22, 0x08, 0xfc, 0x0c, 0x06, 0x1f, 0x16, 0x0d, 0x0c,\n"," 0xff, 0xf6, 0xfd, 0x1a, 0xfc, 0x00, 0xef, 0xff, 0x09, 0x0f, 0x0c, 0x02,\n"," 0xfd, 0xe2, 0x0a, 0xf5, 0xfb, 0x1b, 0xf2, 0xdc, 0xff, 0xf7, 0x14, 0xf9,\n"," 0x17, 0x2a, 0x19, 0x1c, 0xfc, 0x0d, 0xf0, 0x02, 0x09, 0x22, 0x13, 0x05,\n"," 0x0c, 0x02, 0xff, 0x0c, 0x04, 0x08, 0x0d, 0xd3, 0x0b, 0x04, 0x12, 0xe4,\n"," 0x0a, 0x16, 0x00, 0xf1, 0x10, 0x13, 0x07, 0xe6, 0x2a, 0xf1, 0xf3, 0xdf,\n"," 0x08, 0x11, 0x0b, 0x07, 0x08, 0x0b, 0xe9, 0xef, 0xed, 0x0d, 0x06, 0x1e,\n"," 0x06, 0x1d, 0x04, 0xf9, 0xfe, 0xde, 0xf6, 0x1b, 0xea, 0x4d, 0x12, 0xfd,\n"," 0xe1, 0xec, 0x1e, 0xeb, 0xfc, 0x2f, 0x0b, 0x01, 0xdc, 0x03, 0xf7, 0xef,\n"," 0x08, 0x07, 0x16, 0x04, 0xd2, 0x07, 0x08, 0xf0, 0xe7, 0x13, 0xfd, 0x04,\n"," 0xdf, 0xf3, 0xfb, 0x25, 0xef, 0x06, 0x00, 0x07, 0xf1, 0x0d, 0x05, 0x00,\n"," 0x01, 0x1a, 0xf9, 0xf1, 0x09, 0x42, 0x19, 0x2b, 0x0b, 0x12, 0xfc, 0x16,\n"," 0x15, 0x2b, 0x19, 0x27, 0xfa, 0xfb, 0x04, 0xec, 0x15, 0x0e, 0x26, 0x26,\n"," 0x11, 0xef, 0xf9, 0xeb, 0x29, 0x23, 0xf9, 0x05, 0xf6, 0x01, 0x17, 0x14,\n"," 0x08, 0x14, 0x0a, 0x03, 0x05, 0x05, 0x10, 0x02, 0x0f, 0x0a, 0x0e, 0x0a,\n"," 0x00, 0xff, 0x02, 0x03, 0xf0, 0xec, 0xe3, 0xf2, 0xf4, 0x16, 0x08, 0xf0,\n"," 0x07, 0xda, 0x20, 0x05, 0x17, 0x34, 0x0b, 0xda, 0x02, 0xeb, 0x05, 0x14,\n"," 0xfb, 0x19, 0x10, 0xe8, 0x08, 0xfa, 0xed, 0x07, 0xfe, 0x25, 0xf3, 0x1d,\n"," 0xfd, 0xf6, 0xfa, 0xe5, 0x10, 0x12, 0x28, 0x09, 0x01, 0xfc, 0x0e, 0xde,\n"," 0x2f, 0x05, 0x18, 0xfa, 0xf4, 0xf4, 0xed, 0x1d, 0x05, 0xfc, 0x01, 0xfd,\n"," 0x10, 0x0a, 0x1c, 0x09, 0x23, 0x21, 0x0c, 0x18, 0xfe, 0xdc, 0xf7, 0x1d,\n"," 0xea, 0x3c, 0x0d, 0x0c, 0x07, 0xe7, 0xe4, 0xe0, 0x03, 0x2c, 0xf7, 0xea,\n"," 0xd2, 0x01, 0xfd, 0xe7, 
0x24, 0x19, 0x04, 0xf1, 0xce, 0x02, 0xda, 0xe9,\n"," 0xf7, 0x1d, 0xf2, 0x00, 0xd7, 0x15, 0x13, 0x15, 0xf1, 0x0b, 0xf6, 0xe8,\n"," 0xf3, 0x11, 0xe5, 0x12, 0xea, 0x1f, 0xee, 0x18, 0x0c, 0x39, 0x02, 0x1c,\n"," 0x03, 0x13, 0xf6, 0x1f, 0x0b, 0x39, 0xfb, 0x1d, 0x04, 0x03, 0xfb, 0xe9,\n"," 0x12, 0x29, 0xfd, 0xfc, 0x18, 0x13, 0xff, 0x13, 0x18, 0x30, 0x0b, 0x1f,\n"," 0xf9, 0x04, 0x02, 0xf8, 0x17, 0xfb, 0x26, 0xfe, 0x13, 0x02, 0x12, 0x1e,\n"," 0x03, 0xfa, 0xf2, 0x06, 0x04, 0xe9, 0xed, 0x22, 0xfa, 0xfa, 0xf9, 0x00,\n"," 0xfa, 0x18, 0x1b, 0xf9, 0x17, 0xf9, 0xed, 0x1c, 0xff, 0x22, 0x08, 0xfc,\n"," 0xfd, 0x0a, 0xfe, 0x0e, 0x00, 0x17, 0xf9, 0x0b, 0xfa, 0x18, 0x0f, 0xff,\n"," 0x01, 0x14, 0x27, 0xda, 0x10, 0x0a, 0xf0, 0xde, 0x10, 0x16, 0xfd, 0xef,\n"," 0xfb, 0x04, 0x04, 0xf9, 0x2d, 0xfb, 0x00, 0xfe, 0xf5, 0xff, 0xfe, 0xdf,\n"," 0x0a, 0x17, 0xfa, 0x04, 0xf6, 0x17, 0xf7, 0x11, 0xf4, 0x2a, 0xfa, 0x26,\n"," 0x09, 0xfa, 0xdf, 0x16, 0xea, 0x29, 0x03, 0xf4, 0xdc, 0xdc, 0xf2, 0xd7,\n"," 0x0a, 0x3e, 0x01, 0xfe, 0xf6, 0x04, 0x0a, 0x00, 0x0a, 0x38, 0xfb, 0xf7,\n"," 0xdc, 0x02, 0x0b, 0xff, 0xeb, 0x0e, 0xee, 0x00, 0xe8, 0x0c, 0x0c, 0x23,\n"," 0xf1, 0x00, 0xf0, 0x11, 0xec, 0x25, 0xf0, 0x0a, 0xfa, 0x23, 0xf7, 0xf2,\n"," 0x10, 0x1b, 0x09, 0x08, 0x04, 0x10, 0xed, 0x03, 0x19, 0x33, 0x16, 0x1c,\n"," 0xfb, 0x19, 0x08, 0x07, 0x07, 0x26, 0xfc, 0x11, 0x19, 0x05, 0xfb, 0xf6,\n"," 0x38, 0x0e, 0xed, 0x2a, 0x0a, 0x14, 0x13, 0xe5, 0x15, 0x01, 0x07, 0x08,\n"," 0xfc, 0x05, 0x1d, 0xf2, 0x08, 0x01, 0xde, 0xf3, 0x0d, 0xe8, 0x1b, 0xff,\n"," 0xf3, 0xf5, 0xfa, 0xea, 0x07, 0x16, 0x25, 0x01, 0x07, 0xfc, 0x09, 0x12,\n"," 0xf5, 0x12, 0xfc, 0x04, 0xf6, 0x0c, 0xef, 0xfd, 0x05, 0x2c, 0x04, 0x13,\n"," 0xf8, 0xfc, 0xec, 0x15, 0x04, 0xfb, 0x05, 0x14, 0x03, 0xea, 0x02, 0xfe,\n"," 0x0c, 0x14, 0x0d, 0x15, 0xfb, 0x07, 0xfd, 0xeb, 0x24, 0x06, 0xff, 0xeb,\n"," 0xf4, 0x07, 0xfb, 0x22, 0x07, 0xe8, 0xee, 0x0e, 0xfb, 0xfe, 0xde, 0x19,\n"," 0xf1, 0x38, 0xf6, 0x00, 0x0c, 0xf0, 0x00, 0x11, 0xfe, 0x36, 0xef, 0xd6,\n"," 0xe6, 0xed, 0x0f, 0xe7, 0xfe, 0x38, 0xf2, 0x21, 0xe0, 0x02, 0xe4, 0x06,\n"," 0x03, 0x1f, 0x08, 0xd9, 0xfd, 0xf6, 0x13, 0xfc, 0xe1, 0x0b, 0xfd, 0xfc,\n"," 0xf2, 0xff, 0x29, 0x1a, 0xef, 0x04, 0xfb, 0xf0, 0xe2, 0x1d, 0x0a, 0x16,\n"," 0xe3, 0x26, 0xdd, 0x0c, 0xf6, 0x23, 0xf0, 0x13, 0xfc, 0x0b, 0x10, 0x0e,\n"," 0x0f, 0x35, 0xfb, 0x16, 0x14, 0x20, 0x04, 0x01, 0x0a, 0x0e, 0xf1, 0x0f,\n"," 0x08, 0xf8, 0xf9, 0xf8, 0x1c, 0x32, 0x1a, 0x14, 0x05, 0x0c, 0x10, 0xda,\n"," 0x24, 0x25, 0x13, 0x1f, 0x14, 0xfb, 0x06, 0xdf, 0x01, 0x0c, 0xfa, 0x03,\n"," 0x00, 0xe5, 0x15, 0xf4, 0xf3, 0xdf, 0x0d, 0xee, 0xfe, 0x0e, 0x0e, 0x27,\n"," 0x11, 0xe9, 0x0c, 0x11, 0xed, 0x2b, 0x03, 0x16, 0x09, 0xec, 0x06, 0xe7,\n"," 0xf2, 0x33, 0x04, 0x09, 0xf8, 0x0f, 0x0e, 0xf2, 0xfd, 0xfa, 0x04, 0xf4,\n"," 0x10, 0x0a, 0x0b, 0xfc, 0x02, 0xfc, 0xfc, 0xf8, 0x1e, 0x04, 0xe8, 0xdb,\n"," 0x10, 0xf8, 0x0a, 0x0f, 0xfb, 0xf8, 0x1f, 0xee, 0x0e, 0xde, 0xed, 0xd8,\n"," 0x09, 0x0a, 0x1c, 0x2c, 0x06, 0x35, 0x01, 0x0a, 0x0f, 0xf5, 0xf7, 0x0f,\n"," 0xf7, 0x30, 0x15, 0x08, 0xcf, 0xf9, 0x20, 0xd9, 0x05, 0x35, 0xe3, 0x04,\n"," 0xe1, 0x12, 0xf0, 0x04, 0xfa, 0x1d, 0xfd, 0xff, 0x04, 0xfb, 0x28, 0xfd,\n"," 0xed, 0x2f, 0xf5, 0x04, 0xe6, 0x04, 0xe6, 0x16, 0xe5, 0xe8, 0xf2, 0xeb,\n"," 0xf1, 0x02, 0x07, 0x08, 0xe9, 0x22, 0xed, 0xf6, 0x01, 0x3c, 0xf4, 0x17,\n"," 0xe5, 0x0e, 0x07, 0x06, 0x00, 0x34, 0xfc, 0x1a, 0x1a, 0x17, 0x0c, 0xfe,\n"," 0xf7, 0x1d, 0xe9, 0x30, 0x11, 0xf8, 0x19, 0x05, 0x1d, 0x1b, 0xda, 0xfe,\n"," 0x07, 0xf1, 0x08, 0xf2, 0x2b, 0xff, 0xef, 0x01, 0xf9, 0x04, 0x05, 0xfb,\n"," 0xf8, 0x06, 0x0d, 0x04, 0x14, 0xfb, 
0x14, 0x06, 0xf2, 0xe3, 0xfe, 0x07,\n"," 0xf4, 0x0c, 0xfd, 0x1d, 0x18, 0xeb, 0x05, 0xee, 0x12, 0x24, 0x00, 0x0b,\n"," 0xff, 0xf9, 0x01, 0xec, 0xfa, 0x1e, 0x1b, 0xfe, 0x01, 0x07, 0x26, 0x06,\n"," 0x02, 0x0c, 0xf7, 0x03, 0x1c, 0xf2, 0x14, 0xdc, 0x09, 0x1f, 0xf4, 0x14,\n"," 0x0e, 0x0c, 0xf8, 0xec, 0x1c, 0x0f, 0xf8, 0xf8, 0x0a, 0xf7, 0x1b, 0xfb,\n"," 0xfe, 0x1b, 0xfa, 0xee, 0x05, 0x06, 0xef, 0x20, 0xe5, 0x4e, 0xef, 0xea,\n"," 0xf5, 0xe7, 0x06, 0x17, 0xd8, 0x1e, 0x12, 0xfa, 0xed, 0xf5, 0x01, 0xf2,\n"," 0xfe, 0x2a, 0x07, 0xfd, 0xdd, 0x01, 0xfa, 0x02, 0x12, 0x2f, 0xf5, 0x0e,\n"," 0xf2, 0xff, 0x03, 0xfc, 0xe7, 0x23, 0xd8, 0x08, 0xef, 0x00, 0xef, 0x0c,\n"," 0xe4, 0xe7, 0xf6, 0xfc, 0xcb, 0x18, 0x0d, 0x0d, 0xe9, 0x12, 0x0c, 0x00,\n"," 0xf8, 0x23, 0xea, 0x28, 0xeb, 0x26, 0xfa, 0xe5, 0x1a, 0x32, 0x1a, 0x1b,\n"," 0x15, 0x16, 0xf1, 0x07, 0xf5, 0x2b, 0x01, 0x11, 0x12, 0x0f, 0x09, 0x00,\n"," 0x23, 0x23, 0xf4, 0xfb, 0x08, 0x0b, 0x10, 0x1f, 0x21, 0x0e, 0x0a, 0x08,\n"," 0x0a, 0xff, 0x01, 0x17, 0xf2, 0xe3, 0xdc, 0x12, 0x0f, 0x05, 0x16, 0xec,\n"," 0xf3, 0xef, 0xeb, 0xeb, 0xfb, 0x12, 0x11, 0xf8, 0x17, 0xe7, 0x0c, 0xf6,\n"," 0x08, 0x0e, 0x15, 0xe4, 0x0a, 0x00, 0xd8, 0xf8, 0xf6, 0x00, 0x08, 0x22,\n"," 0xfd, 0xfa, 0x0c, 0xfe, 0x08, 0x14, 0xfc, 0x04, 0x06, 0xfa, 0x15, 0xf5,\n"," 0x0f, 0xf6, 0xf0, 0x03, 0x03, 0xf2, 0x0e, 0xf1, 0x27, 0xf0, 0xf0, 0xf4,\n"," 0xf9, 0x07, 0xf0, 0x07, 0x02, 0x1b, 0xfc, 0x00, 0xf2, 0xfc, 0x13, 0x06,\n"," 0xef, 0x2a, 0xf2, 0xeb, 0xf9, 0xe5, 0xfa, 0x0c, 0xe6, 0x20, 0xf4, 0x03,\n"," 0xe2, 0x12, 0xfb, 0x03, 0x0b, 0x39, 0xed, 0x0f, 0xf4, 0x10, 0x04, 0xf8,\n"," 0x04, 0x27, 0xee, 0x05, 0xfe, 0x00, 0x07, 0xf3, 0xd6, 0x0e, 0xe8, 0x10,\n"," 0xd9, 0xf6, 0x04, 0x10, 0xd3, 0xf3, 0xef, 0x01, 0xf8, 0x18, 0x00, 0x1b,\n"," 0xec, 0x28, 0xe1, 0x18, 0xee, 0x38, 0x13, 0x1b, 0x0d, 0x1b, 0xf8, 0x05,\n"," 0xec, 0x20, 0xe6, 0x10, 0x0a, 0x14, 0x0a, 0xe4, 0x0a, 0x1b, 0x05, 0xef,\n"," 0x1b, 0xed, 0xf7, 0x07, 0x13, 0x0d, 0x1a, 0x06, 0x05, 0x09, 0x05, 0xff,\n"," 0x32, 0x04, 0xee, 0x10, 0x1d, 0xf5, 0x0e, 0xe8, 0x0e, 0x07, 0x1e, 0xf4,\n"," 0xff, 0x00, 0xfa, 0x0d, 0xf6, 0xdd, 0xf8, 0xfd, 0xed, 0x1a, 0x1c, 0xe0,\n"," 0x0d, 0xf7, 0xfe, 0xe6, 0xf9, 0x1a, 0x25, 0x0a, 0x11, 0x04, 0xf7, 0xe8,\n"," 0x0a, 0x09, 0x09, 0x0b, 0x03, 0x03, 0x1e, 0xe8, 0x07, 0x1a, 0xdf, 0x15,\n"," 0x03, 0xe5, 0xdb, 0xe9, 0x1c, 0xf4, 0x0e, 0xf5, 0xfb, 0x08, 0x03, 0xfe,\n"," 0x15, 0xfa, 0xfc, 0x22, 0xef, 0x01, 0xf8, 0x0a, 0xf2, 0x04, 0x0a, 0x15,\n"," 0xdf, 0x09, 0x0b, 0x18, 0xef, 0x28, 0x05, 0x05, 0xe1, 0xf2, 0x16, 0x1a,\n"," 0xd9, 0x1a, 0xfc, 0x01, 0xe2, 0x11, 0x25, 0xdc, 0xe9, 0x13, 0xf6, 0xf9,\n"," 0xd5, 0xfd, 0xfd, 0xf9, 0xd2, 0x14, 0xf3, 0x0c, 0xd5, 0x08, 0x15, 0xff,\n"," 0xe1, 0xfa, 0xe0, 0xf9, 0xfe, 0x10, 0x00, 0x15, 0xd5, 0xf0, 0x05, 0x1d,\n"," 0xf1, 0x28, 0x04, 0xf7, 0xea, 0x20, 0xf8, 0x0e, 0x08, 0x29, 0x10, 0x0a,\n"," 0xf1, 0x21, 0x0d, 0x16, 0xfc, 0x24, 0xf0, 0x1e, 0x14, 0x09, 0xec, 0x07,\n"," 0x1a, 0x1f, 0x09, 0xf0, 0x19, 0xe5, 0x19, 0xf4, 0x04, 0x0d, 0x01, 0x00,\n"," 0xf4, 0xf8, 0x07, 0xfd, 0x18, 0x10, 0x1d, 0x01, 0x0a, 0xfc, 0x18, 0x1b,\n"," 0xf5, 0x15, 0xfa, 0xf9, 0x09, 0xe7, 0x0d, 0x0d, 0xff, 0xfe, 0xf0, 0xf2,\n"," 0xf9, 0x10, 0x0f, 0x09, 0xfa, 0xdb, 0xf2, 0xe4, 0xfa, 0x14, 0x08, 0x0b,\n"," 0x04, 0xfb, 0x12, 0xe8, 0xf7, 0x02, 0x05, 0x0d, 0xfe, 0x0d, 0xe4, 0xf9,\n"," 0xef, 0x20, 0x0d, 0xe6, 0xff, 0x00, 0xea, 0xf0, 0x09, 0x07, 0x02, 0x0a,\n"," 0x14, 0xec, 0x1e, 0xe4, 0x20, 0xf8, 0xf5, 0x05, 0x02, 0x09, 0x0a, 0x10,\n"," 0xf8, 0xff, 0x01, 0xf3, 0x16, 0x10, 0x00, 0x02, 0xda, 0x24, 0xd2, 0x00,\n"," 0x0c, 0xff, 0xdd, 0x13, 0xdc, 0x2b, 0xd6, 0xf5, 
0xfd, 0x1d, 0xf7, 0x0d,\n"," 0xd9, 0x18, 0xee, 0x0e, 0xef, 0x0f, 0xf2, 0xf8, 0xd5, 0x11, 0xdd, 0x0f,\n"," 0xf2, 0xfb, 0xfc, 0x07, 0xda, 0x01, 0xe1, 0xf2, 0xf4, 0xf8, 0xf6, 0xfa,\n"," 0xd4, 0xfa, 0xee, 0xdd, 0xce, 0x06, 0xf7, 0x0c, 0xf5, 0x1a, 0xe8, 0x0a,\n"," 0xf5, 0x1e, 0xf4, 0x12, 0xf1, 0x23, 0x02, 0x09, 0xfa, 0x00, 0xea, 0x21,\n"," 0xed, 0x04, 0xf7, 0x05, 0xf9, 0x1a, 0xf7, 0x09, 0x0c, 0xf8, 0x08, 0x20,\n"," 0x12, 0x21, 0xdd, 0x08, 0x04, 0xfd, 0x17, 0x08, 0x06, 0x17, 0xec, 0x12,\n"," 0x05, 0xfb, 0x07, 0x14, 0x01, 0x1c, 0x13, 0xf4, 0x1e, 0xea, 0x09, 0xf0,\n"," 0x1a, 0x04, 0xfb, 0xfe, 0xf2, 0x1e, 0x1a, 0xfb, 0x20, 0xef, 0x10, 0xff,\n"," 0x04, 0x19, 0x09, 0x07, 0x15, 0xee, 0xda, 0xc8, 0x01, 0x22, 0xf5, 0xe3,\n"," 0x05, 0xfb, 0x18, 0xf2, 0xf1, 0x07, 0xfb, 0x05, 0xf8, 0xef, 0xf9, 0xf4,\n"," 0xfa, 0xfa, 0xf0, 0xfa, 0x00, 0xe5, 0xf1, 0x0a, 0xfc, 0xf5, 0xe9, 0x25,\n"," 0xff, 0xed, 0xf8, 0xfd, 0xec, 0xf0, 0xfc, 0x00, 0xf3, 0x0b, 0x0a, 0x1c,\n"," 0xdb, 0x03, 0x0b, 0x0b, 0xfa, 0xfc, 0xda, 0xfc, 0xef, 0x2d, 0xff, 0xf5,\n"," 0x0b, 0x04, 0xfa, 0xe2, 0xd8, 0x03, 0x0f, 0x1c, 0xf8, 0x0c, 0x1c, 0x1e,\n"," 0xe3, 0x12, 0x18, 0xeb, 0xe7, 0x18, 0xfb, 0xdd, 0xcc, 0xf2, 0xf2, 0xe3,\n"," 0xfe, 0x11, 0xed, 0x15, 0xf6, 0xff, 0xdf, 0xf9, 0xfb, 0x04, 0x03, 0x23,\n"," 0xc6, 0x1b, 0x2e, 0xfa, 0x0a, 0x2d, 0x02, 0x08, 0xf2, 0x14, 0xeb, 0xe5,\n"," 0xfb, 0xf7, 0x03, 0x15, 0x12, 0x06, 0x1d, 0x05, 0x07, 0x10, 0x0d, 0x0c,\n"," 0x13, 0x0c, 0xf5, 0xf6, 0x14, 0x05, 0xee, 0xfa, 0xf6, 0x06, 0x0c, 0xe3,\n"," 0x06, 0xf9, 0xea, 0xf6, 0x23, 0xea, 0x0e, 0xfb, 0xf6, 0x10, 0x17, 0x11,\n"," 0x08, 0xfb, 0x25, 0xef, 0x07, 0x1e, 0xf4, 0xeb, 0xed, 0x27, 0xef, 0xf7,\n"," 0xfa, 0xe5, 0xee, 0xe9, 0x06, 0x18, 0xed, 0xea, 0x09, 0xec, 0x0e, 0x01,\n"," 0xfc, 0xe9, 0xff, 0xec, 0xee, 0xe0, 0x1f, 0xe7, 0xfb, 0x12, 0x20, 0x07,\n"," 0x16, 0xea, 0x08, 0xd3, 0x1d, 0xe8, 0xea, 0xff, 0xf1, 0xf2, 0x0f, 0xd1,\n"," 0x24, 0xfb, 0xe3, 0x1b, 0xff, 0xe4, 0xef, 0xf3, 0xfd, 0xf8, 0xfd, 0xe0,\n"," 0xf1, 0x01, 0x11, 0x0d, 0x0f, 0xf5, 0x05, 0xe3, 0xf8, 0xe3, 0xe5, 0xfd,\n"," 0x01, 0x03, 0xfc, 0x04, 0x0f, 0x08, 0xfe, 0xf8, 0xeb, 0x1b, 0x0e, 0xdc,\n"," 0x19, 0xf9, 0x11, 0xe6, 0x0e, 0xfd, 0x0d, 0x14, 0xfa, 0x06, 0x20, 0x01,\n"," 0x00, 0x0e, 0x0f, 0xe5, 0x24, 0xe0, 0x0e, 0xf3, 0xfa, 0xfe, 0xf6, 0xfb,\n"," 0x05, 0x18, 0xfe, 0xf7, 0xe9, 0x1b, 0x07, 0xe0, 0xf9, 0xf2, 0x0f, 0xc7,\n"," 0xf8, 0x08, 0x01, 0x16, 0xfa, 0xfe, 0x0a, 0xed, 0xfd, 0xfd, 0xf9, 0x14,\n"," 0xf7, 0xe0, 0x0c, 0xf9, 0xf9, 0x0d, 0x0e, 0x06, 0x03, 0x1f, 0x17, 0x00,\n"," 0xfa, 0x00, 0xfb, 0x15, 0x12, 0xe4, 0xde, 0xf7, 0x03, 0xec, 0x14, 0x22,\n"," 0x02, 0xfe, 0x0a, 0xfe, 0x06, 0xf8, 0xfe, 0xed, 0x0a, 0xed, 0x0b, 0x08,\n"," 0xe4, 0xec, 0xf8, 0xe5, 0x06, 0x07, 0xe8, 0xf4, 0xfa, 0xe0, 0xfa, 0x0f,\n"," 0x05, 0x13, 0x0a, 0xf3, 0xf2, 0x09, 0x19, 0x0c, 0xfa, 0xf1, 0xfc, 0xec,\n"," 0xe2, 0x01, 0xfa, 0xfb, 0x0b, 0xfc, 0x09, 0x06, 0x0d, 0x11, 0x04, 0xf2,\n"," 0x0a, 0xf7, 0x28, 0x0f, 0xf4, 0x17, 0xda, 0xff, 0xfb, 0xe9, 0x26, 0x02,\n"," 0xf5, 0x17, 0x00, 0xe2, 0xfe, 0x10, 0xe8, 0x05, 0x21, 0xed, 0x0b, 0x1e,\n"," 0x01, 0x00, 0xf5, 0xff, 0x29, 0xe8, 0x11, 0xf4, 0x20, 0xf9, 0x16, 0xec,\n"," 0x0c, 0xd9, 0x23, 0xf0, 0x0f, 0x27, 0xdd, 0xe9, 0x05, 0xd9, 0x0d, 0x20,\n"," 0x0e, 0x34, 0xfe, 0xda, 0x27, 0xf9, 0x1a, 0x26, 0x17, 0x0b, 0xd8, 0xd7,\n"," 0xfe, 0x0a, 0x0c, 0xe3, 0xf8, 0x0c, 0xdd, 0xf9, 0x12, 0xef, 0xff, 0x1b,\n"," 0x01, 0xf7, 0xe9, 0xf6, 0x05, 0xf1, 0x1d, 0xdf, 0xf3, 0x05, 0xf2, 0xf2,\n"," 0x00, 0x05, 0x00, 0xf8, 0x0f, 0xdf, 0x00, 0x03, 0x00, 0x13, 0x0b, 0xf0,\n"," 0x0e, 0xec, 0x13, 0x11, 0x17, 0xf1, 0x07, 0xe7, 0x1d, 0xe1, 
0xe7, 0xe9,\n"," 0x07, 0x00, 0x1e, 0x21, 0x0b, 0xe8, 0xeb, 0xf5, 0x14, 0xf0, 0x10, 0xe9,\n"," 0xf5, 0xf1, 0xf2, 0xcd, 0xf3, 0x0e, 0x10, 0xff, 0x11, 0xf0, 0x03, 0xf8,\n"," 0x1c, 0x0f, 0x13, 0xeb, 0xf1, 0xe3, 0x0e, 0xfc, 0x02, 0x0b, 0xf3, 0xf2,\n"," 0xfb, 0x01, 0x01, 0xfc, 0xee, 0xfb, 0xeb, 0xf9, 0xf4, 0x0d, 0x03, 0xfe,\n"," 0xda, 0x13, 0x15, 0xfa, 0x14, 0xf1, 0xd9, 0x09, 0xf5, 0x07, 0x04, 0x03,\n"," 0xfe, 0x15, 0xfe, 0x0b, 0x0e, 0xf9, 0x20, 0xeb, 0x11, 0xef, 0xf8, 0xf7,\n"," 0x21, 0xeb, 0x0e, 0x0f, 0x0d, 0x1e, 0x13, 0x02, 0xec, 0xef, 0xf5, 0xf4,\n"," 0xf9, 0x1a, 0xf3, 0xf8, 0x01, 0xfb, 0xfd, 0xe1, 0xfb, 0x22, 0x09, 0xee,\n"," 0xf0, 0xf0, 0x0c, 0xf9, 0x12, 0x3d, 0xfe, 0xef, 0xec, 0x02, 0x0e, 0x15,\n"," 0x0c, 0x18, 0x25, 0xf2, 0x0d, 0xf4, 0x13, 0xf6, 0x0e, 0x02, 0xf1, 0xf4,\n"," 0x0d, 0xff, 0x0f, 0xf0, 0x20, 0x00, 0xf4, 0xf2, 0x12, 0xfc, 0x2a, 0xda,\n"," 0xe8, 0xe6, 0xfa, 0xfc, 0xf7, 0x14, 0x07, 0xf2, 0x11, 0xe9, 0x00, 0x00,\n"," 0x02, 0x02, 0x06, 0x10, 0x05, 0xf6, 0x17, 0xf7, 0xf6, 0x0a, 0x06, 0xea,\n"," 0xfc, 0xf6, 0xe9, 0x01, 0x05, 0xfd, 0x01, 0x12, 0x12, 0xdd, 0xf4, 0xfb,\n"," 0x02, 0x10, 0xf6, 0x03, 0x06, 0xf5, 0x24, 0xed, 0x16, 0xf0, 0xfe, 0x1e,\n"," 0xff, 0xf8, 0x20, 0xeb, 0xf7, 0xf4, 0x01, 0x09, 0x1f, 0x03, 0x0e, 0x07,\n"," 0xe8, 0xe1, 0x04, 0xf1, 0x0e, 0xec, 0x1c, 0x07, 0x0a, 0xf3, 0xf3, 0xed,\n"," 0xfe, 0xf3, 0x0d, 0xf9, 0xeb, 0xf9, 0xff, 0xf5, 0x1f, 0xf9, 0x14, 0xff,\n"," 0xee, 0x09, 0xda, 0xfc, 0x09, 0xe2, 0x24, 0xfa, 0x00, 0x15, 0x0b, 0xf7,\n"," 0x00, 0xf8, 0x1b, 0x08, 0x09, 0xea, 0x13, 0x11, 0xf5, 0x0c, 0x11, 0x10,\n"," 0xfa, 0xf6, 0x11, 0xf5, 0xed, 0x05, 0x17, 0xff, 0xfd, 0x0f, 0x19, 0xe7,\n"," 0xf3, 0x39, 0xf7, 0x04, 0x06, 0xf6, 0x01, 0x07, 0xf5, 0x38, 0x03, 0xfe,\n"," 0xf8, 0xfe, 0xea, 0xfe, 0xfb, 0x43, 0x0f, 0x04, 0x21, 0xe5, 0x14, 0x1a,\n"," 0xdb, 0x20, 0x24, 0xfa, 0x23, 0xdb, 0x24, 0xf8, 0xfa, 0x0f, 0x02, 0x00,\n"," 0xf0, 0xd6, 0x07, 0x0e, 0xe6, 0xeb, 0x01, 0x14, 0x08, 0x12, 0x0e, 0x16,\n"," 0xf4, 0xe5, 0x02, 0xed, 0x0c, 0x07, 0xe0, 0x13, 0xf7, 0xf6, 0xea, 0xf2,\n"," 0xf2, 0x08, 0xe6, 0xe6, 0xfc, 0xe9, 0x0d, 0x0b, 0xf3, 0xff, 0xf7, 0xe0,\n"," 0x1c, 0xed, 0x0b, 0xfe, 0x0a, 0x0f, 0xfb, 0x10, 0xf5, 0xf2, 0x1e, 0x1d,\n"," 0xdf, 0x0a, 0x25, 0x0e, 0x09, 0x11, 0xf4, 0xf4, 0xfc, 0xf1, 0x0a, 0x0b,\n"," 0x0f, 0x1b, 0x14, 0x09, 0x0f, 0xf7, 0xf9, 0x04, 0x06, 0x19, 0x0b, 0x0b,\n"," 0xea, 0xdc, 0xf9, 0x03, 0x05, 0xf6, 0x10, 0x20, 0xfc, 0xea, 0xe9, 0x0b,\n"," 0x0c, 0x1c, 0xe1, 0x11, 0x18, 0x0d, 0x09, 0x11, 0x0a, 0xf6, 0xf5, 0xf7,\n"," 0x07, 0x14, 0x1c, 0x09, 0xf3, 0x04, 0x0d, 0x02, 0xed, 0x25, 0x13, 0xf2,\n"," 0xe9, 0x07, 0xfe, 0xf3, 0xed, 0xf3, 0xdf, 0x00, 0xe9, 0x27, 0xf3, 0xf6,\n"," 0xe6, 0xd8, 0x11, 0xfc, 0x13, 0x50, 0xfd, 0x0d, 0xd8, 0xd8, 0x1e, 0x0f,\n"," 0xe4, 0x49, 0x01, 0x1c, 0xfb, 0xee, 0x20, 0x19, 0x23, 0x2a, 0xfe, 0x10,\n"," 0x02, 0xd0, 0x1d, 0x15, 0x1e, 0x3f, 0x07, 0x09, 0x02, 0xc5, 0x1c, 0x22,\n"," 0xe9, 0x0a, 0x07, 0xfe, 0x00, 0x00, 0x09, 0x04, 0x0f, 0x04, 0x09, 0x09,\n"," 0xe6, 0x14, 0x10, 0x0d, 0xe9, 0xea, 0xf6, 0x18, 0x07, 0x36, 0xe7, 0xf9,\n"," 0xe7, 0xde, 0xe4, 0xf6, 0x1d, 0x12, 0xfd, 0xee, 0xf1, 0xed, 0x05, 0x0d,\n"," 0x08, 0x10, 0xf9, 0xee, 0x09, 0xf9, 0xf9, 0x09, 0xf1, 0xff, 0xea, 0xf4,\n"," 0xeb, 0xfd, 0xf3, 0x1e, 0xff, 0xe6, 0xdb, 0x09, 0xf7, 0xf9, 0x1c, 0x15,\n"," 0x0c, 0xfb, 0xf2, 0xfc, 0x02, 0x19, 0xe5, 0x2a, 0xe6, 0xfc, 0xda, 0xec,\n"," 0xea, 0x05, 0xfd, 0x0a, 0xe2, 0xec, 0x01, 0xf4, 0x08, 0x27, 0xf8, 0x25,\n"," 0x0c, 0xf6, 0xf1, 0xec, 0xe5, 0xfb, 0x05, 0x0d, 0xed, 0xe3, 0xf0, 0xdc,\n"," 0x05, 0xe4, 0x09, 0xf6, 0xfe, 0xdf, 0xff, 0xf9, 0xe9, 0x05, 0x26, 
0xfe,\n"," 0x03, 0x08, 0xfe, 0x07, 0xed, 0xfb, 0xef, 0xfa, 0xe0, 0xfd, 0xf0, 0x14,\n"," 0x0b, 0x3c, 0x06, 0x04, 0xfe, 0xe0, 0x04, 0xfa, 0xe2, 0x4a, 0x07, 0x0d,\n"," 0xee, 0xda, 0x03, 0xf3, 0xee, 0x40, 0x07, 0x0e, 0xfd, 0xbd, 0x11, 0xfe,\n"," 0x20, 0x41, 0xdd, 0x09, 0xf6, 0xd6, 0x20, 0xfe, 0xec, 0x20, 0xf1, 0xff,\n"," 0x06, 0xc9, 0x15, 0x02, 0xfb, 0x04, 0x1f, 0xe8, 0xe5, 0xe4, 0xfc, 0xe4,\n"," 0x06, 0xfe, 0x23, 0xf3, 0xfe, 0x1b, 0xe2, 0xf5, 0xfd, 0xce, 0x17, 0xea,\n"," 0xdc, 0x16, 0xf0, 0x0f, 0xd7, 0xd9, 0x11, 0xf9, 0xf9, 0x19, 0x15, 0x10,\n"," 0x04, 0xf3, 0xef, 0xff, 0xff, 0xff, 0xf0, 0xdf, 0xfb, 0xe8, 0x1d, 0xff,\n"," 0x1a, 0xf9, 0xed, 0xfb, 0xfe, 0x03, 0xd6, 0x13, 0x09, 0x04, 0x0a, 0x10,\n"," 0x1e, 0xfb, 0xfc, 0x17, 0x04, 0xf4, 0x00, 0x0b, 0xf9, 0x05, 0x14, 0x2b,\n"," 0xe7, 0xe2, 0x0c, 0x06, 0x10, 0x29, 0x16, 0x2b, 0xdb, 0xe1, 0xf5, 0x00,\n"," 0xf0, 0xf8, 0x13, 0x2b, 0x01, 0xea, 0xf5, 0xdf, 0xe9, 0xe5, 0x00, 0x16,\n"," 0xf4, 0xf8, 0xff, 0xfb, 0xfe, 0xd8, 0x29, 0x11, 0xf2, 0xfe, 0xdd, 0xf0,\n"," 0x1d, 0xfb, 0x1a, 0x09, 0xf9, 0x0e, 0x1d, 0xff, 0xda, 0x08, 0x0f, 0x06,\n"," 0xf3, 0xec, 0xe0, 0x1b, 0xef, 0x2a, 0xf9, 0x0e, 0xef, 0xb9, 0xf2, 0x08,\n"," 0xfe, 0x58, 0xfd, 0x14, 0xfd, 0xea, 0xea, 0x17, 0xf4, 0x30, 0x1e, 0x08,\n"," 0x0c, 0xe2, 0x23, 0xe7, 0x09, 0x38, 0x05, 0x18, 0x12, 0xd1, 0x1a, 0x24,\n"," 0x07, 0x2f, 0x16, 0xe1, 0x01, 0xc5, 0x1e, 0x00, 0x08, 0x13, 0xe1, 0xdd,\n"," 0xff, 0xe5, 0xea, 0xe0, 0xe4, 0xee, 0xe8, 0xeb, 0xe1, 0x1a, 0xfe, 0xf5,\n"," 0xef, 0xbe, 0xe9, 0xff, 0xff, 0x0f, 0xec, 0xfa, 0xcc, 0xd1, 0x09, 0xfc,\n"," 0x06, 0x23, 0x0e, 0xf7, 0xe8, 0xec, 0xdb, 0x07, 0x18, 0x09, 0xdc, 0xf6,\n"," 0xec, 0xe2, 0x02, 0x15, 0x13, 0x0a, 0xea, 0xee, 0x17, 0xf8, 0xf0, 0x31,\n"," 0x04, 0xf2, 0x0a, 0x07, 0xfe, 0x05, 0x0d, 0x2f, 0xfe, 0xe2, 0xf5, 0x1f,\n"," 0x13, 0x2b, 0xe4, 0x21, 0xe0, 0xdc, 0x04, 0x0e, 0x01, 0x14, 0x25, 0x47,\n"," 0xe6, 0xf6, 0xe5, 0x05, 0x07, 0x0c, 0x12, 0x23, 0xeb, 0xe1, 0xd5, 0x18,\n"," 0xe4, 0xe5, 0xde, 0x12, 0xf0, 0x01, 0x04, 0xf2, 0x07, 0xe1, 0xed, 0xeb,\n"," 0x03, 0xf2, 0x0d, 0x0c, 0xf3, 0xf1, 0xfb, 0x14, 0xfc, 0x26, 0xe6, 0xe9,\n"," 0xee, 0xf4, 0xfe, 0x0a, 0xdf, 0xe2, 0xfa, 0x0e, 0x0b, 0x27, 0xe9, 0xf9,\n"," 0xec, 0xc4, 0x16, 0xe3, 0x00, 0x50, 0x0b, 0x0f, 0x0d, 0xd2, 0x0d, 0x01,\n"," 0xf7, 0x35, 0xfe, 0x1f, 0x1f, 0xe1, 0x0c, 0x05, 0x19, 0x2f, 0xfd, 0xf4,\n"," 0x0d, 0xc3, 0x1d, 0xff, 0x11, 0x29, 0x04, 0xfe, 0x01, 0xc8, 0x2c, 0x02,\n"," 0xf5, 0x26, 0xf6, 0xdd, 0xfe, 0xe3, 0x0b, 0x09, 0xe6, 0xe0, 0xfc, 0xdf,\n"," 0xfa, 0x17, 0xf0, 0xfa, 0xf0, 0xd7, 0x02, 0xe9, 0x00, 0x1d, 0xd8, 0x04,\n"," 0xcb, 0xd3, 0x1a, 0xe2, 0x12, 0x0d, 0xdc, 0xf6, 0x06, 0x08, 0x0d, 0xfb,\n"," 0x27, 0x06, 0xf5, 0xdc, 0xfc, 0xdc, 0xfb, 0x2a, 0x0f, 0x16, 0xf3, 0xd9,\n"," 0xfa, 0xf7, 0xf0, 0x2d, 0x0a, 0x0a, 0xf8, 0x01, 0xd7, 0x04, 0xfe, 0x3e,\n"," 0x07, 0xd4, 0xe6, 0xf3, 0x1d, 0x19, 0x07, 0x3c, 0x01, 0xca, 0xea, 0x06,\n"," 0x0a, 0x06, 0xe9, 0x20, 0xe7, 0xdf, 0xd6, 0x03, 0x06, 0x16, 0xf9, 0x42,\n"," 0xf0, 0xf0, 0xd8, 0xfc, 0xfe, 0xee, 0x18, 0x0f, 0xf3, 0xf9, 0xf7, 0x03,\n"," 0x08, 0xf0, 0xf8, 0xf6, 0x09, 0xd9, 0xfc, 0x06, 0xed, 0x1c, 0x0f, 0x07,\n"," 0x0b, 0x00, 0xfc, 0xfb, 0xc4, 0xfe, 0xf9, 0xfb, 0xe1, 0x02, 0x0d, 0x02,\n"," 0x02, 0x2e, 0xff, 0xed, 0xef, 0x9c, 0xfb, 0x09, 0x16, 0x4d, 0x27, 0x1d,\n"," 0x0e, 0xce, 0xf8, 0x1b, 0x16, 0x4e, 0x12, 0x24, 0x14, 0xeb, 0x04, 0x0b,\n"," 0xfd, 0x20, 0x28, 0xfb, 0x18, 0xb5, 0xff, 0xf2, 0x15, 0x15, 0xff, 0xf6,\n"," 0x1d, 0xb7, 0x34, 0xe3, 0x02, 0x13, 0xfb, 0x09, 0x09, 0xdc, 0xfa, 0xf1,\n"," 0x13, 0x06, 0x22, 0xe1, 0xf2, 0x15, 0xea, 0xdb, 0xee, 0xbf, 0x07, 0xe7,\n"," 
0xfe, 0x2a, 0xe4, 0xe9, 0xe8, 0xcf, 0xfa, 0xf3, 0x23, 0x16, 0x0f, 0xf9,\n"," 0xed, 0xe9, 0x09, 0x01, 0x0c, 0x00, 0xef, 0xea, 0xe6, 0xd8, 0x13, 0x11,\n"," 0x0f, 0x2b, 0xdf, 0xd6, 0xfb, 0xf8, 0xf8, 0x30, 0x0a, 0x00, 0xf2, 0xd8,\n"," 0xf7, 0x16, 0xfd, 0x45, 0x07, 0xf9, 0xd5, 0x03, 0x0d, 0x17, 0x12, 0x47,\n"," 0xf8, 0xe0, 0xed, 0xfb, 0xf9, 0x06, 0xfb, 0x36, 0xfe, 0xe9, 0x01, 0xea,\n"," 0xef, 0x13, 0x0d, 0x40, 0xe1, 0xf5, 0xe6, 0xfe, 0xff, 0xf9, 0xe9, 0x06,\n"," 0xdd, 0xf2, 0xe1, 0x02, 0x01, 0xf0, 0xeb, 0x03, 0xf8, 0x09, 0x0a, 0x09,\n"," 0xf8, 0x01, 0x01, 0x00, 0xff, 0x09, 0xf4, 0x0e, 0xd1, 0x11, 0xe4, 0x0f,\n"," 0xe0, 0xe6, 0x05, 0x24, 0x00, 0x3e, 0xf7, 0xf4, 0xe7, 0xc9, 0xf8, 0x1c,\n"," 0x1a, 0x44, 0x05, 0x12, 0xfe, 0xc4, 0x26, 0x09, 0x16, 0x53, 0x11, 0x22,\n"," 0x16, 0xe5, 0x16, 0xf5, 0x24, 0x34, 0x08, 0x0b, 0x0a, 0xbf, 0xfb, 0xdf,\n"," 0x0f, 0x2a, 0x0b, 0xf6, 0x30, 0xa4, 0x3a, 0xf9, 0x08, 0x2f, 0x20, 0xe6,\n"," 0x01, 0xea, 0x10, 0x26, 0xf0, 0xf7, 0x05, 0xd1, 0xf2, 0x07, 0x03, 0x13,\n"," 0xeb, 0xcc, 0x07, 0xf0, 0xfe, 0x16, 0xf3, 0xfb, 0xd0, 0xd8, 0x10, 0xe4,\n"," 0x0a, 0x09, 0xdc, 0x0b, 0xe5, 0xdf, 0xed, 0x00, 0x13, 0x09, 0xf7, 0xf7,\n"," 0xf8, 0xf4, 0xf6, 0x1c, 0x28, 0xfc, 0xe5, 0xf2, 0xe6, 0x01, 0x0b, 0x41,\n"," 0x0e, 0xe8, 0xf5, 0xe0, 0x14, 0x02, 0x12, 0x38, 0x02, 0xd9, 0xd2, 0x0d,\n"," 0x04, 0x10, 0x06, 0x4f, 0x07, 0xdd, 0x03, 0xf0, 0xf9, 0x0e, 0xff, 0x3e,\n"," 0xef, 0xf4, 0xed, 0xed, 0x04, 0x11, 0xed, 0x4e, 0xdd, 0xeb, 0xe1, 0xfd,\n"," 0xff, 0xfe, 0xef, 0x0f, 0xba, 0xeb, 0xe7, 0xe3, 0x0b, 0xee, 0x0f, 0xd9,\n"," 0xf7, 0x0b, 0xff, 0xed, 0xff, 0x09, 0x03, 0xfd, 0xff, 0xf2, 0x09, 0xf0,\n"," 0xdd, 0x1d, 0xed, 0x07, 0xde, 0xe8, 0x05, 0xfd, 0x11, 0x3f, 0x04, 0xf5,\n"," 0xe4, 0x9e, 0x01, 0x26, 0x30, 0x5c, 0x08, 0x12, 0x11, 0xcc, 0x07, 0xf3,\n"," 0x24, 0x38, 0xf9, 0x1b, 0x1f, 0xd6, 0x16, 0x18, 0x1d, 0x28, 0x11, 0x05,\n"," 0x21, 0xa4, 0x09, 0xfa, 0x1d, 0x2f, 0xe5, 0xdd, 0x15, 0xa6, 0x24, 0xe2,\n"," 0xf6, 0x0c, 0xd6, 0xfb, 0x1c, 0xd9, 0x1c, 0x06, 0xf8, 0xf9, 0x1e, 0xdd,\n"," 0xf9, 0x12, 0xec, 0x10, 0xdb, 0xb6, 0x0b, 0xe4, 0xf1, 0x32, 0xd0, 0x18,\n"," 0xd2, 0xce, 0xf5, 0xf5, 0x26, 0x08, 0xf8, 0x0b, 0xfc, 0xfa, 0x11, 0xf1,\n"," 0x0b, 0x13, 0xfc, 0xfe, 0xf0, 0xf3, 0xfa, 0x16, 0x0e, 0x02, 0xd6, 0x17,\n"," 0xf3, 0x10, 0xf9, 0x47, 0x07, 0xfd, 0xe6, 0xf9, 0x00, 0x13, 0x04, 0x2b,\n"," 0xf6, 0xe5, 0xf8, 0xeb, 0x0e, 0xfd, 0x1a, 0x2e, 0xee, 0xf2, 0xf4, 0xf0,\n"," 0xfc, 0xf9, 0xeb, 0x47, 0xf7, 0xe9, 0xd5, 0xec, 0xf7, 0x0c, 0xee, 0x3c,\n"," 0xe4, 0xfb, 0xd4, 0xf3, 0x02, 0x03, 0x03, 0x22, 0xd1, 0x01, 0xeb, 0x0c,\n"," 0xf8, 0xde, 0x0e, 0x15, 0xf8, 0x10, 0xf8, 0x04, 0xfc, 0x0f, 0x05, 0x06,\n"," 0xff, 0xe1, 0x0e, 0xe1, 0xf3, 0x14, 0xf7, 0x09, 0xda, 0xfd, 0xfe, 0xf8,\n"," 0x16, 0x2c, 0x0a, 0xfc, 0xed, 0xb0, 0x15, 0x07, 0x1a, 0x48, 0x03, 0x1f,\n"," 0x05, 0xcc, 0x14, 0xee, 0x2c, 0x47, 0x0b, 0x0d, 0x29, 0xda, 0x0b, 0x1b,\n"," 0x25, 0x31, 0x1e, 0xf8, 0x1d, 0xc5, 0x26, 0xe9, 0x14, 0x1b, 0x0c, 0xe8,\n"," 0x0d, 0x91, 0x1d, 0x07, 0x06, 0x21, 0x22, 0xd9, 0xfc, 0xd1, 0x17, 0x05,\n"," 0x03, 0xd7, 0xe2, 0xfa, 0xfa, 0x09, 0xf3, 0xf6, 0xe1, 0xc6, 0xf6, 0xe1,\n"," 0xff, 0x18, 0xdd, 0x19, 0xe8, 0xee, 0x1f, 0xeb, 0xfe, 0x1b, 0xe5, 0x11,\n"," 0xdc, 0xf7, 0x06, 0x0c, 0xf8, 0x0a, 0xee, 0xf9, 0xdb, 0x06, 0x04, 0x21,\n"," 0x0a, 0x1f, 0x05, 0x15, 0xe6, 0xfc, 0x1f, 0x30, 0x0e, 0xe6, 0xe0, 0x03,\n"," 0xff, 0x11, 0xe2, 0x43, 0xfd, 0xe1, 0xf6, 0x13, 0xf7, 0x04, 0x21, 0x59,\n"," 0x0c, 0xf8, 0xcd, 0x11, 0xf9, 0x0c, 0xf3, 0x5c, 0x00, 0xd7, 0xf3, 0x0f,\n"," 0xfa, 0x16, 0x0f, 0x45, 0xe9, 0xed, 0xec, 0xff, 0xf5, 0x0c, 0x1e, 0x1a,\n"," 0xd1, 0xfe, 
0xe6, 0xfc, 0x0b, 0xfa, 0xe2, 0xef, 0xf5, 0x0b, 0x02, 0x10,\n"," 0xea, 0x10, 0xf3, 0xfe, 0xf3, 0xe8, 0x17, 0x08, 0xdd, 0x28, 0x15, 0x04,\n"," 0xf2, 0xdf, 0xfd, 0xff, 0x1d, 0x39, 0xf6, 0xf3, 0xdd, 0xc1, 0x0c, 0x09,\n"," 0x18, 0x60, 0x0c, 0x22, 0x0a, 0xe3, 0x34, 0xf7, 0x26, 0x5b, 0x0a, 0x18,\n"," 0x09, 0xd7, 0x29, 0x18, 0x2b, 0x44, 0xe1, 0x16, 0x14, 0xd0, 0x21, 0xfb,\n"," 0x15, 0x0b, 0x06, 0xe4, 0x07, 0xa5, 0x2a, 0x02, 0xf8, 0x10, 0x15, 0xe1,\n"," 0x0c, 0xe3, 0x19, 0xfc, 0x0f, 0xef, 0xf3, 0xf9, 0xed, 0x01, 0xd7, 0x05,\n"," 0xe7, 0xbe, 0xdf, 0xe8, 0xff, 0x05, 0xf0, 0x12, 0xcf, 0xcf, 0xf3, 0xf3,\n"," 0x13, 0x1e, 0xef, 0x11, 0xe1, 0x01, 0xf0, 0x0e, 0x26, 0xed, 0xef, 0xfb,\n"," 0xee, 0xf7, 0xfe, 0x17, 0x0a, 0x18, 0xd6, 0x09, 0xf3, 0x05, 0xec, 0x28,\n"," 0x0b, 0xef, 0xd2, 0xf8, 0xec, 0x05, 0x00, 0x36, 0x03, 0xf8, 0xe8, 0x1f,\n"," 0x1d, 0x00, 0x1a, 0x3c, 0xfe, 0xee, 0xea, 0xf2, 0xf6, 0x0d, 0x00, 0x3c,\n"," 0xf4, 0xe1, 0xf2, 0xf7, 0x0d, 0x10, 0x21, 0x47, 0xf7, 0x0a, 0xea, 0xfe,\n"," 0x0c, 0x07, 0xfd, 0x26, 0xbe, 0xfb, 0xd6, 0x04, 0x11, 0xf9, 0x12, 0xfd,\n"," 0xf3, 0x17, 0xff, 0xe6, 0x0d, 0xfc, 0xf7, 0xee, 0xf9, 0xf1, 0x07, 0xf6,\n"," 0xfc, 0x0d, 0x03, 0x01, 0xdd, 0xe7, 0x01, 0x14, 0x23, 0x31, 0x09, 0xff,\n"," 0x02, 0xa4, 0x14, 0xe5, 0x22, 0x5c, 0xf9, 0x01, 0x18, 0xcb, 0x13, 0x05,\n"," 0x13, 0x50, 0x0d, 0x11, 0x12, 0xe8, 0x2f, 0xe6, 0x33, 0x37, 0x1a, 0xf4,\n"," 0x2b, 0xc0, 0x20, 0xdd, 0x12, 0x1c, 0xe8, 0xca, 0x0b, 0xa8, 0x1d, 0xe3,\n"," 0x16, 0xf8, 0xf5, 0xcf, 0x0f, 0xeb, 0xfb, 0xfc, 0xfe, 0x00, 0xf0, 0xe1,\n"," 0xe0, 0x11, 0xe1, 0x03, 0xef, 0xc2, 0xfa, 0xfc, 0xf3, 0x11, 0xf1, 0x26,\n"," 0xdb, 0xbe, 0xef, 0xee, 0x13, 0x0c, 0xf1, 0xe3, 0xdc, 0xfb, 0x00, 0xfa,\n"," 0x07, 0xe7, 0xe4, 0xe5, 0xdd, 0xfd, 0x0f, 0x17, 0x15, 0x13, 0xf8, 0xe8,\n"," 0xfb, 0x04, 0x0e, 0x33, 0x09, 0xfe, 0xcc, 0xfa, 0x00, 0x0a, 0x25, 0x33,\n"," 0x09, 0xfb, 0xd3, 0xf0, 0x00, 0x06, 0xfd, 0x39, 0x08, 0xf8, 0xd0, 0x0c,\n"," 0x0c, 0x13, 0xf5, 0x47, 0xf4, 0xe9, 0xe9, 0xf7, 0xfa, 0x1e, 0xfe, 0x50,\n"," 0xf7, 0xfe, 0xdc, 0x19, 0xf4, 0xf7, 0xfc, 0x29, 0xd2, 0xf9, 0xd7, 0xe0,\n"," 0xfb, 0xed, 0xed, 0xf7, 0xe1, 0x04, 0xe3, 0x16, 0xf5, 0xfc, 0x08, 0x03,\n"," 0xe8, 0xda, 0x00, 0x10, 0x02, 0x1a, 0xe5, 0xf9, 0xca, 0x08, 0x23, 0x00,\n"," 0x22, 0x3b, 0x11, 0x0f, 0xf0, 0xc2, 0x1d, 0xdf, 0x24, 0x49, 0x21, 0x1a,\n"," 0x08, 0xdc, 0x0c, 0x0a, 0x2d, 0x51, 0x0b, 0x27, 0x1f, 0xdd, 0x2c, 0xf5,\n"," 0x13, 0x31, 0xe1, 0xfe, 0x25, 0xc8, 0x0c, 0x28, 0x1a, 0x13, 0xfe, 0xca,\n"," 0x0c, 0xa4, 0x08, 0x1e, 0x1b, 0x11, 0x06, 0xf0, 0x08, 0xd5, 0x03, 0xec,\n"," 0x08, 0x09, 0x01, 0x09, 0xeb, 0xf5, 0xf8, 0xf7, 0xef, 0xb5, 0xda, 0xf7,\n"," 0xf2, 0x0c, 0xe7, 0x0c, 0xce, 0xcb, 0x03, 0xf8, 0x13, 0x1a, 0xfe, 0xf6,\n"," 0xe4, 0xfe, 0xf0, 0x0b, 0x11, 0xe0, 0xeb, 0x13, 0xd8, 0xfa, 0x1c, 0x06,\n"," 0x19, 0x2d, 0xdd, 0xf5, 0xe4, 0xfd, 0xe5, 0x27, 0x07, 0xfd, 0xbe, 0xdc,\n"," 0x00, 0x13, 0x1e, 0x2a, 0x26, 0xf8, 0xf6, 0x0b, 0x13, 0x1e, 0xf4, 0x37,\n"," 0x0b, 0xf6, 0xcb, 0xe6, 0xfb, 0x0f, 0xf6, 0x48, 0xfb, 0xe3, 0xd6, 0x01,\n"," 0xf5, 0x13, 0xf1, 0x22, 0xe9, 0xfe, 0xe9, 0xf5, 0x0b, 0x11, 0xfa, 0x1c,\n"," 0xe2, 0xfb, 0xd3, 0x06, 0x0e, 0xf1, 0x0b, 0x0e, 0xf5, 0x13, 0x27, 0x10,\n"," 0xea, 0xfc, 0x18, 0x0f, 0xf3, 0xe6, 0x0e, 0x09, 0x00, 0x10, 0x02, 0xfe,\n"," 0xc1, 0xfc, 0xfa, 0x0a, 0x1a, 0x3e, 0x15, 0x01, 0xe3, 0xb5, 0x1d, 0x08,\n"," 0x21, 0x58, 0xeb, 0x14, 0xf2, 0xcb, 0x1a, 0x04, 0x2d, 0x59, 0xe0, 0x21,\n"," 0x0a, 0xf9, 0x2d, 0xe9, 0x18, 0x26, 0x03, 0xf8, 0x2d, 0xc2, 0x1a, 0x06,\n"," 0x16, 0x03, 0x19, 0xcf, 0x2a, 0xae, 0x32, 0xfc, 0x0a, 0x15, 0x1e, 0xde,\n"," 0x19, 0xd3, 0x0d, 0xed, 
0x0c, 0xf4, 0x02, 0xf8, 0xdc, 0x03, 0x11, 0xff,\n"," 0xf7, 0xa7, 0x0c, 0xf5, 0x0c, 0x0b, 0xee, 0xeb, 0xd6, 0xcd, 0x0d, 0xde,\n"," 0xf8, 0x11, 0xf4, 0xfb, 0xd3, 0xff, 0xf3, 0x00, 0x17, 0xe9, 0xf6, 0xeb,\n"," 0xe6, 0xef, 0x12, 0x04, 0x1d, 0x34, 0xd9, 0xf9, 0xd6, 0xff, 0xff, 0x2b,\n"," 0xfb, 0x09, 0xef, 0xf4, 0xeb, 0x10, 0xfd, 0x30, 0x26, 0x04, 0xe2, 0x04,\n"," 0x09, 0x06, 0xe5, 0x41, 0x02, 0xff, 0xed, 0x17, 0x05, 0xfd, 0x08, 0x39,\n"," 0xfd, 0xfd, 0xce, 0xdd, 0xed, 0x03, 0x05, 0x30, 0x01, 0x12, 0xec, 0xfb,\n"," 0xff, 0x10, 0x1a, 0x21, 0xe2, 0x08, 0xeb, 0xf9, 0x07, 0xf5, 0xf0, 0xd8,\n"," 0xea, 0xfb, 0xea, 0xf4, 0x10, 0xf8, 0x1c, 0xfd, 0xfd, 0xe9, 0x16, 0xfd,\n"," 0x0d, 0x0e, 0xf9, 0x06, 0xbb, 0xed, 0x08, 0x00, 0x3d, 0x3d, 0xd7, 0x12,\n"," 0xda, 0xb9, 0x0d, 0x05, 0x10, 0x4f, 0x02, 0x1c, 0x1b, 0xd1, 0x02, 0xf0,\n"," 0x12, 0x4b, 0x11, 0xfc, 0x20, 0x0a, 0x05, 0x01, 0x0e, 0x3c, 0x05, 0xf9,\n"," 0x23, 0xcb, 0x1b, 0x15, 0x2b, 0xfe, 0x20, 0xda, 0x23, 0xa0, 0x1e, 0x12,\n"," 0x1a, 0x09, 0x14, 0xd0, 0x11, 0xcc, 0x07, 0xfa, 0x13, 0x08, 0xed, 0x11,\n"," 0xdf, 0x05, 0xf9, 0x1e, 0xfb, 0xc5, 0x12, 0xf5, 0x0d, 0x2f, 0xe5, 0xfc,\n"," 0xd0, 0xd4, 0xe9, 0x05, 0xf8, 0x17, 0x04, 0x07, 0xd6, 0xe1, 0x0c, 0xef,\n"," 0x19, 0xec, 0x01, 0xff, 0xd5, 0xf3, 0x07, 0x18, 0x0f, 0x12, 0xc2, 0xf9,\n"," 0xd8, 0x07, 0xf0, 0x2f, 0x03, 0x10, 0xe2, 0x16, 0x05, 0x14, 0x0a, 0x30,\n"," 0x21, 0xf4, 0xd6, 0x08, 0x08, 0x1e, 0x12, 0x3c, 0x01, 0x05, 0xd9, 0x0d,\n"," 0xfb, 0x18, 0xfb, 0x2f, 0x02, 0xf2, 0xee, 0x08, 0xf3, 0x14, 0x09, 0x3d,\n"," 0x02, 0x13, 0xf4, 0xfc, 0xe7, 0x1d, 0x23, 0x43, 0xe7, 0x07, 0xd5, 0x1c,\n"," 0x0b, 0xe5, 0xf5, 0x08, 0xe6, 0xe4, 0xea, 0x08, 0xeb, 0x0e, 0x00, 0x06,\n"," 0xd8, 0x10, 0xfe, 0x06, 0x09, 0x28, 0x03, 0xf7, 0xb2, 0xe8, 0x02, 0x27,\n"," 0x22, 0x28, 0xe8, 0x04, 0xd9, 0xb5, 0x12, 0x08, 0x22, 0x4b, 0x0a, 0x29,\n"," 0x0d, 0xf0, 0x11, 0xdc, 0x24, 0x46, 0x02, 0x21, 0x18, 0xf8, 0x0d, 0x03,\n"," 0x36, 0x3c, 0x04, 0xf5, 0x1c, 0xc6, 0x1f, 0x05, 0x11, 0x0c, 0xe2, 0xcc,\n"," 0x2b, 0x92, 0x11, 0xfb, 0x22, 0x01, 0xf7, 0xd8, 0x16, 0xd5, 0xf4, 0x00,\n"," 0x20, 0xff, 0x02, 0xec, 0xe6, 0x0f, 0xef, 0x06, 0xf4, 0xb4, 0xf2, 0xd3,\n"," 0xf7, 0x17, 0x02, 0x00, 0xd3, 0xb2, 0xdb, 0x1b, 0x05, 0x24, 0xdf, 0xef,\n"," 0xce, 0xea, 0x21, 0x04, 0x01, 0xf1, 0xf1, 0xfa, 0xd7, 0xeb, 0xf6, 0x01,\n"," 0x19, 0x14, 0xeb, 0x09, 0xe5, 0x04, 0x0d, 0x16, 0x08, 0x14, 0xf1, 0x19,\n"," 0xf0, 0x18, 0x05, 0x30, 0x0e, 0xe2, 0xea, 0xeb, 0xf1, 0x1b, 0xe6, 0x28,\n"," 0x0c, 0x11, 0xd9, 0x01, 0xfd, 0x06, 0x0b, 0x38, 0x05, 0xdf, 0xd4, 0x1d,\n"," 0xe8, 0x08, 0x0e, 0x3d, 0xfb, 0x04, 0xee, 0x04, 0xf2, 0x11, 0x0e, 0x35,\n"," 0xe3, 0x13, 0xeb, 0x11, 0x05, 0x01, 0xf9, 0x07, 0xec, 0xff, 0x21, 0x26,\n"," 0xf4, 0xf0, 0xf4, 0x00, 0xc7, 0xf4, 0xf3, 0x22, 0xfe, 0x20, 0x0e, 0x0e,\n"," 0xbd, 0x01, 0xfa, 0x01, 0x25, 0x42, 0xff, 0x10, 0xdb, 0xca, 0x14, 0x1f,\n"," 0x17, 0x4f, 0xfa, 0x2e, 0x04, 0xe7, 0x19, 0xe0, 0x17, 0x40, 0xfd, 0x11,\n"," 0x02, 0xfc, 0x1e, 0xee, 0x21, 0x30, 0x1a, 0x0a, 0x27, 0xd3, 0x0f, 0x13,\n"," 0x1e, 0x05, 0x02, 0xee, 0x1f, 0x9a, 0x05, 0x1f, 0x12, 0xfd, 0x14, 0xea,\n"," 0x0c, 0xcf, 0x06, 0xea, 0x27, 0xf1, 0xfb, 0xf2, 0xe2, 0x1f, 0x04, 0xee,\n"," 0xe5, 0xbe, 0xee, 0xe7, 0xea, 0x19, 0xed, 0x01, 0xc8, 0xd8, 0x10, 0x17,\n"," 0x12, 0x16, 0xe9, 0x09, 0xd0, 0xfb, 0xf4, 0x20, 0x0c, 0x14, 0xfb, 0x03,\n"," 0xcf, 0xff, 0xf3, 0xfe, 0xfd, 0x1b, 0xe8, 0xf0, 0xdc, 0xf6, 0xd7, 0x13,\n"," 0x11, 0x07, 0xe6, 0xdf, 0xe5, 0x08, 0x05, 0x2e, 0x0c, 0xef, 0xc4, 0xec,\n"," 0xf1, 0x0a, 0xe9, 0x14, 0xf2, 0x1f, 0xf3, 0x0d, 0xfe, 0x08, 0x29, 0x34,\n"," 0x09, 0xfb, 0xd4, 0xf1, 0xe0, 0x30, 
0x06, 0x54, 0xfa, 0xfd, 0xe6, 0x16,\n"," 0xfe, 0x12, 0xe5, 0x1f, 0xea, 0x02, 0xfa, 0xe1, 0x06, 0xf7, 0xe8, 0xe9,\n"," 0xe3, 0x0d, 0x02, 0xfe, 0xe9, 0xfc, 0xfb, 0x1a, 0xeb, 0xf9, 0x06, 0x04,\n"," 0x0a, 0x11, 0x09, 0xf4, 0xbe, 0x04, 0x18, 0xf7, 0x35, 0x3a, 0xf1, 0xf6,\n"," 0xdc, 0xbf, 0x14, 0xf7, 0x16, 0x4b, 0xe8, 0x20, 0x03, 0xd6, 0x15, 0xfc,\n"," 0x1f, 0x38, 0xea, 0x0b, 0x12, 0x2e, 0x0c, 0xd5, 0x30, 0x2b, 0x00, 0x00,\n"," 0x2d, 0xc7, 0x15, 0xd6, 0x1c, 0xeb, 0xec, 0xcc, 0x2c, 0x99, 0x14, 0xf4,\n"," 0x12, 0x09, 0x1e, 0xf5, 0xf7, 0xc4, 0xf7, 0xf8, 0x0f, 0xe7, 0x0c, 0xf4,\n"," 0xf6, 0xfb, 0x00, 0x01, 0xe6, 0xce, 0xe6, 0x23, 0xe9, 0x0c, 0xf2, 0xf1,\n"," 0xc6, 0xf5, 0x1a, 0xfc, 0xf5, 0x0e, 0xfc, 0xfa, 0xc2, 0xef, 0x0a, 0x1f,\n"," 0xed, 0x1c, 0xcf, 0xfd, 0xd1, 0xfb, 0x0a, 0x07, 0x11, 0x2b, 0xe4, 0x01,\n"," 0xd9, 0x0c, 0xfd, 0x3f, 0x02, 0x09, 0xe4, 0xee, 0xea, 0x06, 0xf3, 0x2d,\n"," 0x1c, 0xe6, 0xd6, 0x1c, 0xfd, 0x0d, 0x17, 0x25, 0xf4, 0x18, 0xfe, 0xe7,\n"," 0xfd, 0xff, 0x04, 0x36, 0xfe, 0x06, 0xda, 0xee, 0xf1, 0x20, 0x02, 0x41,\n"," 0xee, 0x18, 0xdc, 0xf9, 0xf4, 0x27, 0x03, 0x2f, 0xee, 0x19, 0xe3, 0xf1,\n"," 0x10, 0xf2, 0xdf, 0xe2, 0xdf, 0x0b, 0x23, 0x09, 0x02, 0xfc, 0xe2, 0x11,\n"," 0xca, 0xf1, 0xf1, 0xf0, 0xf1, 0x10, 0xfe, 0x0c, 0xbd, 0xfc, 0x1b, 0x1a,\n"," 0x30, 0x3a, 0x04, 0x03, 0xce, 0xce, 0x21, 0x00, 0xfc, 0x4a, 0xda, 0x1b,\n"," 0x05, 0xdc, 0x07, 0xeb, 0x0b, 0x41, 0x21, 0x17, 0x0e, 0x0d, 0x1f, 0xfe,\n"," 0x2c, 0x29, 0xe6, 0x15, 0x26, 0xb9, 0x06, 0xf7, 0x22, 0x09, 0x03, 0xea,\n"," 0x30, 0x95, 0x28, 0xf6, 0x20, 0x02, 0xfc, 0xf3, 0xf6, 0xce, 0xee, 0x00,\n"," 0x2d, 0xee, 0xf2, 0xf1, 0xd6, 0x0b, 0xe3, 0x08, 0xfa, 0xe1, 0xe2, 0x0c,\n"," 0xef, 0x22, 0xf1, 0x06, 0xd3, 0xed, 0x08, 0x1b, 0xfc, 0x03, 0xec, 0x03,\n"," 0xb6, 0x03, 0xec, 0xfd, 0xfd, 0xf3, 0xd9, 0x0e, 0xd4, 0xd7, 0xd5, 0x15,\n"," 0x0c, 0x1a, 0xd9, 0xeb, 0xdd, 0x11, 0x14, 0x1c, 0x10, 0x07, 0xe9, 0xf0,\n"," 0xdf, 0x07, 0xdb, 0x15, 0x1e, 0xe8, 0xe6, 0xe1, 0x00, 0x13, 0x12, 0x1b,\n"," 0xef, 0x0a, 0xc8, 0xfd, 0x0d, 0x0f, 0x0a, 0x40, 0x07, 0xf6, 0xcb, 0x02,\n"," 0xde, 0x16, 0x13, 0x4f, 0xfb, 0x08, 0xd3, 0xee, 0xde, 0x07, 0xe5, 0x22,\n"," 0xe7, 0xfe, 0xec, 0xea, 0x06, 0xf0, 0xfe, 0xdf, 0xd6, 0xd5, 0xfb, 0x14,\n"," 0xf9, 0xff, 0x0d, 0xfd, 0xd2, 0xeb, 0x02, 0x03, 0xf5, 0x06, 0xf8, 0xfb,\n"," 0xb0, 0xee, 0x06, 0x1a, 0x22, 0x47, 0x0d, 0xf9, 0xd0, 0xec, 0x03, 0xd5,\n"," 0x0c, 0x3f, 0x07, 0x1b, 0xf1, 0xcc, 0x03, 0xec, 0x1d, 0x47, 0xeb, 0xf6,\n"," 0x04, 0x18, 0x19, 0x09, 0x27, 0x27, 0x01, 0xeb, 0x18, 0xda, 0x10, 0xf9,\n"," 0x1f, 0xf7, 0x11, 0xe2, 0x2e, 0xa8, 0x0a, 0x05, 0x2f, 0x06, 0xf9, 0x02,\n"," 0x18, 0xc5, 0x0f, 0x20, 0x24, 0xe0, 0xf4, 0xea, 0xd5, 0xf0, 0xeb, 0xf3,\n"," 0xef, 0xd1, 0x11, 0xfd, 0xeb, 0xf8, 0xfa, 0x02, 0xcd, 0xe6, 0x11, 0xf8,\n"," 0x04, 0x01, 0xcd, 0x15, 0xd4, 0xea, 0x08, 0xe3, 0x0e, 0x1e, 0xe5, 0x0d,\n"," 0xe7, 0xe8, 0x04, 0xd6, 0x02, 0x30, 0xe6, 0x06, 0xe5, 0xf6, 0x08, 0x0e,\n"," 0x09, 0x1a, 0xf1, 0x08, 0xed, 0x13, 0xfb, 0x2b, 0x0c, 0xf1, 0xe5, 0xfc,\n"," 0x03, 0x19, 0xfb, 0x24, 0x00, 0x1a, 0xcd, 0xf8, 0x03, 0x12, 0x0c, 0x21,\n"," 0x14, 0x00, 0xc4, 0xf2, 0xe3, 0x08, 0x09, 0x2f, 0xf0, 0x04, 0xe4, 0xdd,\n"," 0xd7, 0x2d, 0x24, 0x26, 0xf3, 0x05, 0xfd, 0x1e, 0xfb, 0xf4, 0x07, 0xe8,\n"," 0xce, 0xd8, 0xe9, 0xe1, 0x09, 0x0d, 0xdc, 0x0a, 0xd5, 0x00, 0xf8, 0xd7,\n"," 0x0d, 0x05, 0xfe, 0x02, 0xbd, 0x07, 0x14, 0xe9, 0x18, 0x23, 0xe2, 0xfa,\n"," 0xbf, 0xeb, 0x19, 0x1d, 0x18, 0x39, 0xf4, 0x13, 0xe2, 0xe6, 0x08, 0x10,\n"," 0x11, 0x2b, 0xf6, 0x29, 0x08, 0x2c, 0x15, 0xfb, 0x33, 0x1f, 0x25, 0x24,\n"," 0x24, 0xe4, 0x1e, 0xfd, 0x0f, 0x0e, 0xfc, 0xe9, 
0x1d, 0xb9, 0x0a, 0xe4,\n"," 0x36, 0xed, 0x10, 0xf4, 0x03, 0xd2, 0x04, 0xff, 0x14, 0xe5, 0x1f, 0xf7,\n"," 0xe4, 0x0c, 0xdf, 0x0d, 0xfc, 0xf1, 0x1a, 0xee, 0xe2, 0x0b, 0xe7, 0xe9,\n"," 0xbf, 0xee, 0x03, 0xe8, 0xeb, 0xf9, 0xf7, 0x22, 0xab, 0x0a, 0xef, 0x0a,\n"," 0x12, 0x05, 0xfc, 0xea, 0xfc, 0xee, 0x20, 0xf2, 0x01, 0x27, 0xc8, 0xf6,\n"," 0xf1, 0x04, 0x00, 0x0a, 0x03, 0x17, 0xf7, 0xe2, 0xe2, 0x16, 0xef, 0x0e,\n"," 0x15, 0xfb, 0xd5, 0xee, 0xe3, 0x2c, 0x15, 0x13, 0xef, 0x15, 0xe6, 0xe3,\n"," 0xf1, 0x2a, 0xed, 0x32, 0xef, 0x0d, 0xea, 0xf6, 0xe6, 0x27, 0xfc, 0x3e,\n"," 0xeb, 0x09, 0xe7, 0xef, 0xf4, 0x0f, 0xf2, 0x0a, 0xfa, 0x0e, 0xda, 0xf6,\n"," 0xfe, 0xfc, 0x12, 0xe3, 0xd7, 0x03, 0x10, 0xf9, 0xfd, 0x0a, 0x28, 0x0e,\n"," 0xd1, 0xf5, 0xdc, 0xfc, 0xf4, 0x0f, 0xf3, 0x1b, 0xb8, 0x16, 0x0c, 0xf4,\n"," 0xf8, 0x24, 0xf9, 0x18, 0xc6, 0xf1, 0x0d, 0xf3, 0x22, 0x2b, 0x06, 0x2a,\n"," 0xf2, 0xfe, 0xf3, 0xe7, 0x10, 0x10, 0x02, 0x15, 0x15, 0x4c, 0x01, 0xe5,\n"," 0x0d, 0xfc, 0x17, 0x19, 0x36, 0xeb, 0x01, 0xfe, 0x0a, 0xf3, 0xfa, 0x09,\n"," 0x31, 0xa5, 0x04, 0x2c, 0x32, 0x0d, 0xe8, 0x05, 0x04, 0xc2, 0x0c, 0x1b,\n"," 0x4e, 0xe3, 0xeb, 0xf0, 0xb5, 0x06, 0xee, 0xea, 0x16, 0xea, 0xde, 0xfd,\n"," 0xcf, 0x24, 0xf0, 0x04, 0xcf, 0xeb, 0xf5, 0x07, 0xe3, 0xff, 0xde, 0x11,\n"," 0xce, 0xee, 0x13, 0x0e, 0x0a, 0xfb, 0xfb, 0xf5, 0xda, 0xed, 0x04, 0xe9,\n"," 0x17, 0x34, 0xec, 0x11, 0xe2, 0xff, 0x04, 0x07, 0x15, 0x26, 0xf7, 0xff,\n"," 0x07, 0x20, 0x1d, 0x34, 0x13, 0x0d, 0xfd, 0xf9, 0x04, 0x39, 0xf5, 0x1e,\n"," 0xfd, 0x28, 0xf5, 0x04, 0x02, 0x3a, 0xea, 0x2a, 0xe7, 0xf8, 0x03, 0xda,\n"," 0xf7, 0x27, 0xfc, 0x2b, 0xd5, 0x15, 0xf4, 0x14, 0xdf, 0x1c, 0xfc, 0x2e,\n"," 0xdc, 0x17, 0xee, 0xe7, 0xe9, 0x06, 0x0a, 0xe0, 0xd9, 0xd9, 0xee, 0x13,\n"," 0xe5, 0x1e, 0x06, 0x02, 0xe9, 0xfa, 0xfb, 0xf9, 0xed, 0x10, 0xe8, 0x0a,\n"," 0xdf, 0x0f, 0x2e, 0xee, 0x0a, 0x21, 0xfc, 0xff, 0xe9, 0xe5, 0x1b, 0xe5,\n"," 0xfb, 0x29, 0x05, 0x23, 0xfa, 0x11, 0x07, 0x09, 0xeb, 0x11, 0x1c, 0x12,\n"," 0xf9, 0x33, 0x09, 0xfa, 0x1a, 0x20, 0xdd, 0x03, 0x24, 0xf6, 0xf8, 0x04,\n"," 0x14, 0x07, 0xef, 0xf6, 0x0c, 0xd1, 0x0f, 0x0f, 0x36, 0x05, 0x06, 0xeb,\n"," 0xf6, 0xcf, 0x03, 0xfd, 0x3a, 0xe2, 0x03, 0xf6, 0xd7, 0x11, 0x0b, 0x0a,\n"," 0xf7, 0xfe, 0x00, 0x10, 0xdf, 0xf5, 0xdb, 0x1b, 0xca, 0x13, 0x02, 0xf9,\n"," 0x03, 0xef, 0x10, 0xf8, 0xe3, 0x1e, 0xe8, 0x13, 0x16, 0x12, 0xea, 0xe9,\n"," 0xee, 0x15, 0xfd, 0xef, 0xfe, 0x11, 0xf1, 0x05, 0x00, 0x1d, 0x02, 0xf0,\n"," 0x00, 0x21, 0xe9, 0xfa, 0xdc, 0x1b, 0xef, 0x16, 0x24, 0x19, 0x0f, 0x10,\n"," 0xf2, 0x35, 0xed, 0x1b, 0x01, 0x2c, 0x1f, 0xea, 0x01, 0x3f, 0x15, 0x0e,\n"," 0xfe, 0x1d, 0xed, 0xf0, 0xf8, 0x22, 0x09, 0x01, 0x03, 0x1c, 0xfa, 0x08,\n"," 0xe5, 0x39, 0x02, 0x27, 0x15, 0x19, 0xe3, 0xff, 0xe0, 0x11, 0x0e, 0x0b,\n"," 0x01, 0x1a, 0xfb, 0xf3, 0x07, 0x01, 0xec, 0x0e, 0x06, 0xf9, 0xfb, 0x12,\n"," 0xf6, 0x12, 0x17, 0x00, 0xf5, 0x04, 0xfa, 0x15, 0x07, 0xff, 0xf3, 0xfa,\n"," 0x20, 0xf5, 0x0d, 0x0e, 0x0e, 0xf1, 0xd9, 0x03, 0x11, 0x1a, 0xfb, 0x0e,\n"," 0xed, 0xe9, 0xe5, 0xf1, 0x04, 0x14, 0x0f, 0xf3, 0x15, 0xec, 0xfd, 0x0b,\n"," 0x04, 0x0f, 0xf8, 0x1b, 0x08, 0xf4, 0xe1, 0x1c, 0x10, 0x0f, 0x06, 0xf8,\n"," 0xed, 0xee, 0x05, 0x0d, 0xff, 0x22, 0xec, 0xe8, 0xf8, 0x0c, 0xdb, 0x0e,\n"," 0x18, 0xe6, 0xf0, 0x03, 0xf2, 0xed, 0x06, 0xef, 0xf5, 0x19, 0x01, 0x12,\n"," 0xf4, 0xe4, 0x29, 0x29, 0x12, 0xdb, 0x03, 0x0e, 0x0e, 0x07, 0x1a, 0x0c,\n"," 0xed, 0x01, 0x09, 0x06, 0x00, 0xfe, 0x0b, 0xd8, 0x13, 0xf0, 0x00, 0x1c,\n"," 0xf8, 0x0c, 0xf7, 0x0c, 0x0b, 0x15, 0xf8, 0x15, 0xf0, 0x28, 0x10, 0x1e,\n"," 0xe6, 0xf0, 0xfa, 0x06, 0xec, 0xff, 0x0b, 0xfc, 0xfe, 0x03, 
0x10, 0x0a,\n"," 0xea, 0xed, 0xf7, 0xff, 0xeb, 0xf6, 0xea, 0xe7, 0xf7, 0x0c, 0xe9, 0x23,\n"," 0xfe, 0xe3, 0xec, 0xd6, 0x04, 0xfa, 0x05, 0x0a, 0xf7, 0xf0, 0xf4, 0xd9,\n"," 0xf3, 0xd6, 0xf4, 0xf7, 0xf1, 0xdf, 0xfc, 0xde, 0x06, 0x10, 0x08, 0x03,\n"," 0x16, 0x03, 0x18, 0xe7, 0x0d, 0xfc, 0xf9, 0x02, 0xee, 0x04, 0xf7, 0xec,\n"," 0x15, 0x05, 0xf0, 0x0b, 0xf6, 0x1a, 0x09, 0x03, 0x23, 0xff, 0xe4, 0xf3,\n"," 0xed, 0xfc, 0xf4, 0xf7, 0x18, 0x17, 0x26, 0xdb, 0xe3, 0x0b, 0x03, 0xda,\n"," 0x26, 0xfb, 0x08, 0xf6, 0xff, 0x0f, 0x0d, 0xf8, 0xff, 0xf7, 0xf9, 0xf7,\n"," 0xe4, 0xf4, 0xf0, 0x1a, 0x02, 0x09, 0xf6, 0xfd, 0xee, 0x1a, 0x07, 0xed,\n"," 0x14, 0x03, 0xe8, 0xf7, 0x07, 0xfd, 0x1b, 0x1e, 0x35, 0xfb, 0xe6, 0xf4,\n"," 0xf6, 0x17, 0xf0, 0xed, 0xfc, 0x0f, 0xfd, 0x11, 0xef, 0x03, 0x11, 0x07,\n"," 0x1a, 0xf7, 0xef, 0xef, 0x0b, 0x15, 0x14, 0xf8, 0x1c, 0x0d, 0x1d, 0xf7,\n"," 0x10, 0xec, 0x1f, 0x0a, 0x05, 0x11, 0x0b, 0xda, 0xe7, 0xee, 0xfd, 0xdc,\n"," 0x15, 0xf0, 0xfd, 0xeb, 0xe1, 0x16, 0xf9, 0x06, 0x02, 0xeb, 0x09, 0x03,\n"," 0x04, 0xe7, 0x19, 0x15, 0xff, 0xf0, 0x05, 0xf5, 0xf7, 0x0a, 0x11, 0xe7,\n"," 0xf8, 0x15, 0x10, 0xf8, 0xfe, 0x11, 0x05, 0x00, 0x0d, 0xee, 0xde, 0x00,\n"," 0xe5, 0x0f, 0xf0, 0x05, 0xf6, 0x1a, 0x0b, 0x08, 0x10, 0x13, 0x0c, 0xf1,\n"," 0x1e, 0xf2, 0x01, 0xfb, 0x1b, 0x0b, 0x05, 0x05, 0x1b, 0x2d, 0xde, 0x0b,\n"," 0xed, 0x11, 0xfc, 0xfe, 0x2a, 0x0d, 0xfc, 0xf8, 0xf9, 0xf4, 0x13, 0xe4,\n"," 0x1a, 0xf6, 0xf0, 0xf3, 0x1d, 0x01, 0x18, 0xdb, 0xf8, 0x0f, 0xf9, 0xf5,\n"," 0x11, 0x1b, 0x0f, 0xf9, 0x19, 0xf6, 0x05, 0xf3, 0xf5, 0x3a, 0x12, 0xdd,\n"," 0x08, 0xe4, 0x15, 0xfb, 0x01, 0x3f, 0xfd, 0x1e, 0x03, 0xf0, 0x06, 0xf9,\n"," 0xfe, 0x13, 0x03, 0x17, 0x1d, 0xea, 0xf6, 0xec, 0xe3, 0x05, 0xe0, 0x0f,\n"," 0xf4, 0xfd, 0x01, 0xea, 0x13, 0xf8, 0xe1, 0x07, 0x10, 0xed, 0xff, 0xf0,\n"," 0xfe, 0xd8, 0xfe, 0x06, 0xf0, 0xfe, 0x0b, 0x00, 0xf1, 0x24, 0xe8, 0xfc,\n"," 0x20, 0x0d, 0x13, 0xed, 0x00, 0x0e, 0x05, 0x10, 0x04, 0x0a, 0x18, 0xdc,\n"," 0xfa, 0x08, 0xed, 0x0c, 0x0f, 0x05, 0x0a, 0xe0, 0xe6, 0xe6, 0x12, 0x02,\n"," 0x05, 0xf2, 0x04, 0xee, 0x1f, 0xf6, 0xe2, 0xf2, 0xff, 0x00, 0x05, 0xf5,\n"," 0x25, 0xe4, 0xf4, 0xf7, 0x00, 0x05, 0x06, 0xeb, 0x1d, 0xf8, 0x03, 0xe7,\n"," 0x06, 0xef, 0x06, 0xeb, 0x1c, 0xfc, 0x00, 0x16, 0x06, 0xfe, 0x02, 0xfc,\n"," 0x01, 0xeb, 0xe9, 0x08, 0x00, 0x05, 0x0a, 0x14, 0x02, 0xf6, 0xdd, 0xff,\n"," 0x18, 0x1b, 0x07, 0x14, 0x00, 0x02, 0x03, 0x06, 0x0a, 0x07, 0xf1, 0x25,\n"," 0xf3, 0x02, 0x06, 0x07, 0x0c, 0x0c, 0x19, 0x07, 0x06, 0x0d, 0xf1, 0xfb,\n"," 0xec, 0x0c, 0x03, 0x09, 0xfa, 0x29, 0xf5, 0x08, 0x0a, 0xff, 0xf5, 0x00,\n"," 0xfe, 0x3e, 0x12, 0xee, 0x18, 0xe4, 0xef, 0x10, 0xe3, 0x3f, 0x08, 0x14,\n"," 0x06, 0xf7, 0x16, 0x1c, 0x21, 0x17, 0xfd, 0x10, 0xd9, 0xee, 0xf7, 0x0a,\n"," 0xf8, 0x09, 0x00, 0x11, 0x17, 0xec, 0xe8, 0xe3, 0xfe, 0xf4, 0xe8, 0x0b,\n"," 0xf3, 0x06, 0x17, 0x04, 0x01, 0xe7, 0xe6, 0x00, 0xe0, 0x0a, 0x02, 0x04,\n"," 0x04, 0xf7, 0xf6, 0xda, 0x1f, 0x16, 0xe5, 0xfc, 0xf0, 0x1d, 0xfd, 0xfb,\n"," 0x15, 0x0c, 0xf7, 0x09, 0xeb, 0x15, 0x0a, 0xe7, 0xf6, 0x0e, 0xfb, 0xeb,\n"," 0x00, 0xee, 0xe2, 0xff, 0x05, 0x13, 0x04, 0xe9, 0x09, 0x0b, 0xeb, 0xfb,\n"," 0x02, 0x17, 0x01, 0xf3, 0x03, 0x07, 0x09, 0xe1, 0xfb, 0x03, 0x02, 0x0b,\n"," 0xe2, 0x27, 0x07, 0xe7, 0x09, 0x0e, 0x19, 0xfa, 0xf4, 0x02, 0x09, 0xfc,\n"," 0x04, 0x04, 0x07, 0x21, 0x0a, 0x09, 0xfe, 0x03, 0xf0, 0x0f, 0xf0, 0x19,\n"," 0xef, 0xf7, 0x19, 0xdb, 0x0f, 0x35, 0xe1, 0xf5, 0x24, 0xf2, 0x04, 0xe6,\n"," 0x05, 0x21, 0xea, 0x30, 0x10, 0x1d, 0xe7, 0x08, 0x01, 0x20, 0xf4, 0x24,\n"," 0x0d, 0x12, 0xfa, 0x07, 0x0e, 0x0f, 0xf1, 0x14, 0xe7, 0x10, 0x15, 
0xef,\n"," 0x0c, 0xeb, 0xf6, 0x07, 0x05, 0x48, 0xce, 0xf6, 0xea, 0xe9, 0x04, 0x1d,\n"," 0xf3, 0x45, 0xea, 0xf6, 0xf9, 0xdc, 0xfb, 0x10, 0x0c, 0x25, 0xff, 0xf5,\n"," 0xfe, 0xf2, 0x1f, 0x01, 0x0b, 0x06, 0xd6, 0x1b, 0xe8, 0x03, 0x04, 0x0f,\n"," 0x0e, 0xe5, 0xf7, 0x07, 0x04, 0xfa, 0x04, 0x0d, 0x03, 0xf4, 0x00, 0xf8,\n"," 0xfc, 0xfd, 0x07, 0x07, 0x14, 0x06, 0x17, 0xec, 0x1d, 0x24, 0xef, 0x01,\n"," 0xff, 0xf5, 0xec, 0xfb, 0x19, 0x17, 0x16, 0x06, 0x06, 0xe5, 0xdb, 0x1b,\n"," 0x0e, 0x04, 0xe7, 0xe7, 0xfe, 0x07, 0xf5, 0xf1, 0xf3, 0x0c, 0x23, 0xfb,\n"," 0xf6, 0x09, 0xd6, 0xd9, 0xe5, 0x0d, 0xe1, 0xf4, 0x13, 0x08, 0xe9, 0x0e,\n"," 0x03, 0x19, 0x04, 0x0d, 0x04, 0x0e, 0xf5, 0x1e, 0xe6, 0xef, 0x0e, 0xfa,\n"," 0x07, 0xe1, 0x14, 0xf4, 0xfb, 0xfa, 0x0f, 0x0c, 0x02, 0xfc, 0xda, 0xf4,\n"," 0xf9, 0x1f, 0x0b, 0x0f, 0x09, 0x19, 0x06, 0xfb, 0x0e, 0x43, 0xfe, 0x0f,\n"," 0x13, 0x04, 0xea, 0xfe, 0x16, 0x3f, 0x14, 0x4a, 0xff, 0xf5, 0xda, 0xf7,\n"," 0x0f, 0x01, 0xed, 0x10, 0xfb, 0x27, 0xe9, 0x01, 0xfe, 0x12, 0x0c, 0x0b,\n"," 0x01, 0x25, 0x07, 0xee, 0xfe, 0x10, 0xf7, 0xf9, 0x04, 0x63, 0xd7, 0x13,\n"," 0xf4, 0xd8, 0xf3, 0x11, 0x11, 0x50, 0xe3, 0x15, 0xf2, 0xc6, 0x22, 0x13,\n"," 0x08, 0x2a, 0xe0, 0x17, 0xfe, 0xf1, 0xe7, 0xf8, 0x2e, 0x1b, 0xed, 0x14,\n"," 0x1c, 0xf9, 0xf9, 0xf0, 0xf2, 0xf1, 0xff, 0xdc, 0xff, 0xfc, 0x0a, 0x07,\n"," 0x00, 0xf3, 0x00, 0x1d, 0x0d, 0xfa, 0xe3, 0x07, 0xfb, 0xde, 0x02, 0x1e,\n"," 0xfe, 0x18, 0xf1, 0xfe, 0x10, 0x00, 0xec, 0xfa, 0x18, 0x23, 0x21, 0xfc,\n"," 0x02, 0xf0, 0x04, 0x07, 0xf8, 0x08, 0xf4, 0xee, 0x0d, 0xe9, 0xe7, 0xe4,\n"," 0x05, 0xf5, 0x07, 0xe9, 0xf7, 0x04, 0xe9, 0xde, 0x0b, 0x20, 0x21, 0x03,\n"," 0x07, 0xec, 0xe6, 0xeb, 0xf8, 0xed, 0xf0, 0xeb, 0x06, 0x09, 0x08, 0xf4,\n"," 0x13, 0xe8, 0xf3, 0xfd, 0xfa, 0xfa, 0xfb, 0xf5, 0xfd, 0x09, 0xf8, 0x03,\n"," 0xfd, 0x11, 0xfa, 0xf9, 0xfa, 0x14, 0xe1, 0x14, 0x03, 0x11, 0xe7, 0x29,\n"," 0x1c, 0x55, 0x07, 0x17, 0x1c, 0x07, 0xf1, 0x14, 0x14, 0x28, 0x28, 0x66,\n"," 0xfd, 0x0e, 0xd3, 0x24, 0x18, 0x0a, 0x0a, 0x1c, 0xf7, 0x2d, 0xfe, 0xfb,\n"," 0x0e, 0xf6, 0x09, 0xf6, 0x0b, 0x24, 0xeb, 0xf8, 0xf6, 0x0d, 0x03, 0x08,\n"," 0x03, 0x71, 0xe8, 0xf5, 0xdd, 0xe0, 0xe9, 0x08, 0xf0, 0x52, 0xf0, 0x08,\n"," 0xe0, 0xd4, 0x0c, 0xe5, 0x20, 0x37, 0xe0, 0x03, 0xf9, 0xe9, 0x00, 0xf0,\n"," 0x10, 0x12, 0x00, 0x15, 0x10, 0xfd, 0xee, 0x03, 0x22, 0xf0, 0x0b, 0xfc,\n"," 0x08, 0xf1, 0x04, 0x11, 0xfe, 0x0c, 0xec, 0x05, 0xf4, 0xfc, 0x0a, 0xf8,\n"," 0x0d, 0xee, 0xe1, 0xe1, 0x29, 0x0f, 0x2a, 0x06, 0xfe, 0xea, 0xf0, 0xf7,\n"," 0x27, 0x0b, 0xf2, 0x07, 0xf5, 0xdb, 0xf8, 0x19, 0xf5, 0x05, 0xda, 0xf3,\n"," 0x01, 0xec, 0xea, 0x15, 0xfb, 0x1d, 0x00, 0xde, 0xeb, 0xfe, 0xf0, 0x01,\n"," 0x04, 0x03, 0xfc, 0x04, 0xf7, 0x1a, 0xf8, 0xda, 0x0c, 0xfb, 0x03, 0xeb,\n"," 0xf8, 0x08, 0xdc, 0xff, 0xed, 0xf7, 0xf5, 0xfd, 0x07, 0x06, 0xfc, 0xf6,\n"," 0x02, 0xf8, 0xf3, 0x11, 0x0e, 0xe9, 0xf1, 0x18, 0xf2, 0x0c, 0x00, 0x22,\n"," 0x10, 0xea, 0x10, 0x16, 0x24, 0x42, 0x0d, 0x26, 0x06, 0x15, 0xde, 0xe9,\n"," 0x05, 0x1d, 0xec, 0x4c, 0xfd, 0x23, 0xf1, 0x1e, 0x1c, 0xf9, 0x02, 0x19,\n"," 0xff, 0x10, 0xe3, 0xf0, 0xff, 0xf5, 0xfe, 0x03, 0x1a, 0x29, 0xcf, 0xdb,\n"," 0xe2, 0x0f, 0xf4, 0xf1, 0x0d, 0x5a, 0xfd, 0x27, 0xde, 0xe8, 0xff, 0x17,\n"," 0xdd, 0x52, 0xd9, 0x15, 0xdf, 0xd5, 0x00, 0xc9, 0x0f, 0x39, 0x03, 0xee,\n"," 0xe5, 0xe2, 0xe8, 0xf5, 0x2e, 0x1b, 0x03, 0xf3, 0x19, 0x0c, 0xf8, 0xe9,\n"," 0x13, 0xf4, 0x00, 0xe2, 0x0f, 0x05, 0x02, 0x17, 0x06, 0xf1, 0xe9, 0x1b,\n"," 0x1c, 0x11, 0xd9, 0xef, 0x03, 0xe5, 0xeb, 0x14, 0x10, 0x05, 0xfe, 0x01,\n"," 0xfd, 0xef, 0x11, 0x1e, 0x12, 0x0a, 0x13, 0xea, 0xf2, 0xf0, 0xf4, 0x19,\n"," 
0x04, 0x05, 0x01, 0xe6, 0x0c, 0xfe, 0xf4, 0x25, 0x1a, 0x12, 0x1e, 0xdd,\n"," 0xfc, 0x06, 0xd7, 0x11, 0x12, 0xfe, 0xe4, 0xe0, 0x03, 0xef, 0xe3, 0x14,\n"," 0x06, 0xf9, 0x06, 0x00, 0x0e, 0x08, 0xe2, 0x01, 0xf5, 0xfb, 0xfe, 0xf6,\n"," 0x02, 0xfc, 0xf5, 0x12, 0x00, 0xf1, 0x07, 0x01, 0x14, 0x0f, 0x06, 0xf6,\n"," 0xee, 0x38, 0x21, 0x1a, 0x18, 0xe5, 0xff, 0x0d, 0xf7, 0x46, 0xea, 0x1c,\n"," 0x07, 0xf0, 0xdc, 0xf9, 0x19, 0x13, 0x15, 0x44, 0x08, 0x1a, 0xd2, 0x05,\n"," 0x18, 0xf4, 0x17, 0x1a, 0xf9, 0x23, 0xe9, 0xff, 0x16, 0xff, 0xe9, 0x0f,\n"," 0xf6, 0x2b, 0xe8, 0xec, 0xe7, 0xf8, 0x20, 0x10, 0x15, 0x5d, 0xdb, 0x00,\n"," 0xe4, 0xe3, 0xe1, 0x2b, 0x04, 0x4e, 0xec, 0x05, 0xe3, 0xb5, 0xf7, 0xda,\n"," 0x16, 0x2c, 0xe8, 0xfd, 0x01, 0xfd, 0x10, 0xe9, 0x11, 0x17, 0xec, 0x13,\n"," 0x1d, 0x15, 0xeb, 0xf5, 0x09, 0x00, 0xf8, 0x20, 0x0e, 0xf5, 0xef, 0x0a,\n"," 0x03, 0xec, 0x13, 0x2a, 0x02, 0xfb, 0x1d, 0xea, 0xf0, 0x01, 0xea, 0xf9,\n"," 0x16, 0x0b, 0x01, 0x07, 0xfd, 0xf4, 0xe1, 0xff, 0x19, 0x04, 0x14, 0xeb,\n"," 0xf5, 0xf8, 0xfc, 0xf5, 0x0d, 0x0e, 0xde, 0xe2, 0x15, 0xff, 0xfa, 0xe5,\n"," 0x03, 0x25, 0xf6, 0xec, 0xf9, 0x06, 0xfe, 0x29, 0xee, 0xfc, 0xee, 0xe5,\n"," 0x0d, 0xea, 0xe5, 0x01, 0x01, 0xf8, 0x0d, 0xeb, 0x09, 0x00, 0xca, 0xff,\n"," 0x0c, 0x03, 0xf1, 0xef, 0xf7, 0xf1, 0xed, 0x03, 0xf2, 0xe8, 0xe9, 0xe9,\n"," 0x07, 0xfc, 0xeb, 0x1f, 0xdb, 0x19, 0x01, 0x17, 0x03, 0x0e, 0xfb, 0x11,\n"," 0x08, 0x51, 0xdc, 0x2d, 0x09, 0xef, 0xeb, 0x18, 0x07, 0x21, 0xec, 0x4b,\n"," 0xf7, 0x43, 0xd9, 0x00, 0x00, 0xee, 0xf5, 0x19, 0xe4, 0x25, 0xe3, 0xfc,\n"," 0x09, 0x05, 0xf6, 0x11, 0x07, 0x30, 0xcb, 0x0f, 0xef, 0x04, 0x01, 0x0f,\n"," 0x06, 0x4b, 0xfa, 0xf2, 0xe7, 0xe9, 0xea, 0x10, 0x0d, 0x4f, 0xe5, 0xf2,\n"," 0xf9, 0xd3, 0x07, 0xe4, 0x22, 0x37, 0xeb, 0xed, 0xfb, 0xf5, 0xda, 0xd7,\n"," 0x16, 0x12, 0x0d, 0x11, 0x07, 0x1f, 0x11, 0xe0, 0xff, 0xf2, 0x07, 0x1d,\n"," 0xfa, 0x03, 0xfe, 0xf6, 0xf4, 0xe6, 0xde, 0xe9, 0x05, 0xed, 0xfd, 0xfa,\n"," 0xf3, 0x03, 0xe8, 0x01, 0x26, 0x20, 0xfd, 0xf3, 0x04, 0xd1, 0xff, 0x09,\n"," 0x28, 0x20, 0xfc, 0xfe, 0x02, 0xed, 0x03, 0x02, 0x0d, 0x04, 0xe5, 0xd4,\n"," 0x04, 0xf8, 0xea, 0xfb, 0xfc, 0x14, 0x1b, 0xd6, 0x0b, 0xfb, 0xf9, 0x15,\n"," 0xf5, 0xf6, 0x08, 0xd9, 0x03, 0x05, 0xed, 0x00, 0x12, 0xfe, 0xfb, 0xf6,\n"," 0x13, 0xf3, 0xd7, 0xe3, 0xed, 0xfd, 0x13, 0xfb, 0x00, 0xf2, 0xe6, 0x29,\n"," 0xfc, 0x09, 0x01, 0xdf, 0x03, 0x08, 0x04, 0xfe, 0x07, 0x25, 0xf4, 0x1d,\n"," 0x0a, 0xdb, 0xf6, 0x1a, 0x09, 0x41, 0x12, 0x2c, 0x0a, 0xf6, 0xe4, 0xf9,\n"," 0x0e, 0x13, 0x27, 0x45, 0xe1, 0x29, 0xd8, 0x05, 0x0c, 0xf0, 0x09, 0x19,\n"," 0xf5, 0x1f, 0xef, 0xef, 0x08, 0xeb, 0xf2, 0x0f, 0x0c, 0x24, 0xde, 0x06,\n"," 0xee, 0xfb, 0xf5, 0x14, 0x18, 0x30, 0xdd, 0x0f, 0xe8, 0xfb, 0x1a, 0x1b,\n"," 0x06, 0x50, 0xe0, 0xfa, 0xf5, 0xb5, 0xf2, 0xf5, 0x27, 0x35, 0xed, 0x12,\n"," 0xf3, 0x08, 0x0a, 0xd1, 0x1a, 0x15, 0xf7, 0x03, 0xf3, 0x0c, 0xec, 0xe7,\n"," 0xff, 0xfb, 0xef, 0x2a, 0xf7, 0xf5, 0x12, 0x13, 0x08, 0xe3, 0x00, 0x2a,\n"," 0x05, 0x0f, 0xfb, 0xf1, 0xed, 0x13, 0xf5, 0x02, 0x0c, 0x14, 0xf1, 0xf4,\n"," 0xfa, 0xdb, 0x00, 0x03, 0x26, 0x2e, 0x26, 0x08, 0xf5, 0xe9, 0xfd, 0xe9,\n"," 0x04, 0x20, 0x13, 0xcc, 0xfe, 0xf9, 0x02, 0x15, 0xf7, 0x05, 0xea, 0xc3,\n"," 0xee, 0xfa, 0xf8, 0x10, 0xf8, 0xf1, 0xfe, 0xdb, 0x07, 0x06, 0xdb, 0xfa,\n"," 0x08, 0x01, 0x23, 0xfb, 0x0d, 0xff, 0xdf, 0xf0, 0xfc, 0xfd, 0x03, 0xff,\n"," 0x02, 0x0b, 0xf7, 0x04, 0xea, 0xf0, 0x0a, 0x19, 0x04, 0xfa, 0xee, 0x00,\n"," 0xf5, 0x25, 0x09, 0x24, 0x09, 0xfc, 0xff, 0xff, 0x11, 0x39, 0x05, 0x2a,\n"," 0xf8, 0xf9, 0xcc, 0x28, 0x08, 0x05, 0x07, 0x4c, 0xe3, 0x27, 0xd4, 0x06,\n"," 0xf8, 0xe8, 
0xf9, 0x1d, 0xee, 0x10, 0xdb, 0x06, 0xfd, 0xf2, 0x05, 0xf9,\n"," 0x16, 0x26, 0xe3, 0xf3, 0xf8, 0x00, 0xdd, 0xf9, 0x16, 0x3b, 0xe9, 0xfa,\n"," 0xe8, 0xfd, 0xf0, 0x26, 0xf1, 0x30, 0xc5, 0xe0, 0xe6, 0xbd, 0xf1, 0xd7,\n"," 0x00, 0x24, 0xf6, 0x19, 0xea, 0xca, 0xf1, 0xf8, 0x1f, 0x16, 0xf7, 0xf2,\n"," 0xf7, 0x16, 0x00, 0xf6, 0x09, 0xe5, 0x06, 0xfb, 0x12, 0x1f, 0xfc, 0xe7,\n"," 0xf8, 0xfc, 0xed, 0x01, 0x03, 0x13, 0x07, 0xff, 0xd3, 0x17, 0xfb, 0x01,\n"," 0x12, 0x1d, 0x1c, 0xf6, 0xf1, 0xef, 0xf3, 0x02, 0x15, 0x22, 0x06, 0xed,\n"," 0xff, 0xea, 0xef, 0x11, 0x0d, 0x0d, 0xe7, 0xe4, 0xff, 0x09, 0x02, 0xf8,\n"," 0xf0, 0x00, 0x02, 0xe2, 0x0d, 0x0c, 0xf7, 0x1b, 0xfa, 0xff, 0xe3, 0xe8,\n"," 0x10, 0xe9, 0xea, 0x01, 0x0e, 0xfe, 0x1f, 0xf8, 0x0b, 0x04, 0xe7, 0xfe,\n"," 0xf9, 0x02, 0x01, 0xf5, 0x09, 0xf8, 0xfe, 0x0a, 0xfb, 0x06, 0x1b, 0xe2,\n"," 0x00, 0xef, 0xde, 0x15, 0xf8, 0x2d, 0xf1, 0x1a, 0x05, 0xff, 0xf0, 0x11,\n"," 0x00, 0x41, 0xe2, 0x26, 0x14, 0xd3, 0xde, 0xf3, 0x09, 0x0d, 0xfa, 0x28,\n"," 0xdc, 0x37, 0xc7, 0x06, 0xf3, 0xf9, 0x07, 0x27, 0xe9, 0x14, 0xd4, 0x24,\n"," 0xfa, 0x04, 0x13, 0x08, 0xf7, 0x11, 0xf0, 0x0d, 0x01, 0x03, 0x06, 0x16,\n"," 0x08, 0x47, 0xe2, 0x13, 0xe6, 0xf0, 0xdc, 0x21, 0xf4, 0x3e, 0xeb, 0x19,\n"," 0xe2, 0xcc, 0xf5, 0xf7, 0x15, 0x34, 0xde, 0x2c, 0xe0, 0xd6, 0xde, 0xd3,\n"," 0x11, 0x0f, 0x01, 0x0f, 0xf2, 0x14, 0x02, 0xee, 0x16, 0xdb, 0xe1, 0xfd,\n"," 0x01, 0x13, 0x1d, 0x09, 0x14, 0xf2, 0xd2, 0x05, 0xfe, 0x0a, 0xe9, 0x03,\n"," 0x0b, 0x13, 0xf2, 0x21, 0x35, 0x0d, 0x0a, 0xf6, 0xed, 0xf5, 0xf5, 0x0d,\n"," 0x2c, 0x2a, 0xf3, 0xec, 0xf3, 0xde, 0xef, 0x0c, 0x07, 0x06, 0x16, 0xd1,\n"," 0xf4, 0xfe, 0xe7, 0x1c, 0xf9, 0xfe, 0xf3, 0xc6, 0x04, 0x01, 0xef, 0x03,\n"," 0xeb, 0x04, 0x06, 0xd1, 0x05, 0xee, 0xf7, 0x19, 0x25, 0x09, 0x2a, 0xff,\n"," 0x20, 0x11, 0xf3, 0x02, 0x0c, 0xf7, 0x08, 0xf2, 0x00, 0xf5, 0xd9, 0x24,\n"," 0xfd, 0xfb, 0xe7, 0x06, 0x04, 0xd9, 0x0f, 0xe2, 0xf5, 0x16, 0x03, 0x07,\n"," 0xfd, 0xf3, 0xe3, 0xfa, 0xfc, 0x30, 0x27, 0x22, 0x04, 0xf3, 0xdf, 0x0b,\n"," 0x12, 0x09, 0xe5, 0x2d, 0xf9, 0x34, 0xbb, 0x13, 0xeb, 0xff, 0xe8, 0x0a,\n"," 0xf4, 0x03, 0xea, 0xed, 0xdf, 0xf8, 0x0a, 0xfe, 0x07, 0x31, 0xe7, 0xe8,\n"," 0xfc, 0x03, 0x03, 0x03, 0x1a, 0x2a, 0xe5, 0x0a, 0xe5, 0x0d, 0x1d, 0x2a,\n"," 0xed, 0x40, 0xd3, 0x05, 0xee, 0xc5, 0xda, 0xf8, 0x12, 0x3f, 0xe6, 0xfc,\n"," 0xde, 0xe0, 0xd6, 0xc6, 0x0b, 0x0d, 0x05, 0x01, 0xe7, 0x18, 0xd7, 0xec,\n"," 0x05, 0xed, 0xfb, 0x19, 0x0d, 0xf9, 0x03, 0x02, 0x0a, 0xe9, 0xe2, 0x1e,\n"," 0x0e, 0x11, 0x05, 0xe6, 0xed, 0x05, 0xe5, 0xe0, 0x1d, 0x18, 0xfb, 0xed,\n"," 0xf1, 0xcf, 0xf7, 0x17, 0x2f, 0x20, 0x0a, 0x11, 0x02, 0xed, 0xf0, 0x01,\n"," 0x0d, 0x14, 0x09, 0xc8, 0xf0, 0x00, 0xf9, 0xf9, 0xf1, 0x01, 0xe5, 0xce,\n"," 0x02, 0xf4, 0xdb, 0x13, 0xfe, 0x07, 0xf5, 0xee, 0x05, 0xe9, 0xef, 0x25,\n"," 0x1a, 0x1a, 0x0d, 0x02, 0x18, 0x05, 0xc8, 0xe2, 0xf8, 0xf1, 0x00, 0xf9,\n"," 0x1a, 0xf7, 0xf5, 0xf0, 0xef, 0x07, 0xff, 0xf0, 0xee, 0xeb, 0xf5, 0x28,\n"," 0xd0, 0x15, 0x1b, 0x1e, 0x08, 0xdc, 0xeb, 0xfa, 0xf3, 0x3b, 0xee, 0x18,\n"," 0x03, 0xfa, 0xdb, 0x11, 0x00, 0x10, 0x21, 0x43, 0xe6, 0x39, 0xea, 0xfd,\n"," 0xf0, 0xf3, 0xef, 0x1f, 0xd8, 0x19, 0xbc, 0xfb, 0xea, 0xf5, 0xda, 0x01,\n"," 0x0e, 0x09, 0xd3, 0xf7, 0x01, 0xfa, 0xec, 0x12, 0x0a, 0x40, 0xd9, 0xec,\n"," 0xea, 0x05, 0x13, 0x17, 0xed, 0x4b, 0xc5, 0xfc, 0xf0, 0xc8, 0xf7, 0x07,\n"," 0x02, 0x2a, 0xe4, 0xef, 0xd7, 0xed, 0x04, 0xcc, 0x00, 0xf7, 0xf8, 0x0c,\n"," 0xe7, 0x1d, 0xfc, 0xe3, 0x07, 0xd8, 0xf7, 0x06, 0x00, 0x15, 0x0c, 0xff,\n"," 0x0c, 0xe6, 0xf2, 0xda, 0x1a, 0x1a, 0x0f, 0x04, 0xec, 0xf2, 0xe4, 0x15,\n"," 0x13, 0x10, 0x04, 0xf0, 
0x01, 0xeb, 0x04, 0x02, 0x21, 0x29, 0x25, 0x03,\n"," 0xf9, 0xde, 0xe4, 0x07, 0x0b, 0x13, 0x13, 0xce, 0x1c, 0xfd, 0xed, 0xf3,\n"," 0x00, 0x14, 0x1b, 0xd1, 0x0b, 0xf3, 0xf0, 0x06, 0x01, 0x0a, 0x05, 0xe0,\n"," 0x16, 0xe2, 0xec, 0xeb, 0x07, 0x05, 0x03, 0xd8, 0x14, 0x02, 0xdd, 0xf8,\n"," 0x16, 0x03, 0x07, 0xda, 0x27, 0xf0, 0xf3, 0x10, 0xfc, 0x14, 0xf3, 0x0f,\n"," 0x01, 0x0f, 0xfe, 0xee, 0xe0, 0x14, 0x02, 0x22, 0xfd, 0xd8, 0xff, 0xfe,\n"," 0xe7, 0x2b, 0x21, 0x2d, 0x05, 0xfc, 0xcb, 0x07, 0xe4, 0x12, 0x17, 0x36,\n"," 0xe4, 0x23, 0xf6, 0x19, 0xcf, 0x05, 0xd7, 0x16, 0xd4, 0xfb, 0xc2, 0x20,\n"," 0xe3, 0xfe, 0xe9, 0xf8, 0xfc, 0xfd, 0xee, 0x15, 0xf0, 0xf4, 0xe2, 0x12,\n"," 0x04, 0x39, 0xdc, 0xff, 0xf9, 0xf4, 0xf9, 0x0b, 0xf4, 0x45, 0xed, 0x0e,\n"," 0xcd, 0xda, 0x16, 0xfc, 0x15, 0x37, 0xe4, 0x26, 0xe1, 0xda, 0x22, 0xd8,\n"," 0xfc, 0x03, 0x06, 0x06, 0xec, 0x01, 0x04, 0xec, 0x1f, 0xdf, 0xfa, 0xf6,\n"," 0x1c, 0x0a, 0x22, 0xda, 0xf7, 0xea, 0x07, 0xe2, 0x0d, 0x0e, 0x04, 0xfa,\n"," 0xf1, 0x01, 0xe7, 0x10, 0x2a, 0x18, 0x0d, 0xfa, 0xf0, 0xe9, 0x03, 0xf5,\n"," 0x18, 0x24, 0x1b, 0xf0, 0xf2, 0xe0, 0xf2, 0xea, 0x1a, 0x05, 0x13, 0xde,\n"," 0x0d, 0xfb, 0xe6, 0x15, 0x0a, 0xf9, 0x0d, 0xe0, 0x00, 0x00, 0xf6, 0x12,\n"," 0xf6, 0x09, 0x06, 0xe4, 0x0c, 0xfb, 0xe7, 0xeb, 0xff, 0xfe, 0xf2, 0xde,\n"," 0x21, 0x14, 0x03, 0x04, 0x0d, 0xff, 0x21, 0xe9, 0x24, 0xf9, 0x03, 0x00,\n"," 0xf0, 0xfb, 0xff, 0xf5, 0xf6, 0x20, 0xfd, 0x25, 0xe7, 0x06, 0xf8, 0x09,\n"," 0x00, 0xdf, 0xef, 0xf0, 0xdd, 0x30, 0xde, 0x33, 0x11, 0xe9, 0x01, 0x04,\n"," 0x00, 0x13, 0xf7, 0x32, 0xf9, 0x1f, 0xf6, 0x05, 0xda, 0xfb, 0x1c, 0x1d,\n"," 0xdf, 0x18, 0xda, 0x10, 0xda, 0x04, 0x1a, 0xe0, 0x15, 0x09, 0xd7, 0x0c,\n"," 0xe6, 0x19, 0xf9, 0x0a, 0xfe, 0x47, 0xdb, 0x09, 0xdf, 0x00, 0xe8, 0x22,\n"," 0xe6, 0x4e, 0xd7, 0x0d, 0xde, 0xeb, 0xf7, 0x0d, 0x0f, 0x2d, 0xe5, 0xfd,\n"," 0xdf, 0xe7, 0x14, 0xed, 0x0c, 0x09, 0xf7, 0x11, 0x02, 0x1c, 0x0f, 0xcc,\n"," 0x1e, 0xf7, 0xf2, 0xf1, 0x09, 0x0e, 0xe2, 0xfb, 0xfd, 0xe4, 0x07, 0x07,\n"," 0x15, 0xfd, 0x28, 0xf6, 0xf2, 0xec, 0xe7, 0xf5, 0x17, 0x1e, 0xe8, 0x0a,\n"," 0xec, 0xd7, 0xe9, 0x27, 0x1f, 0x36, 0xf0, 0xee, 0xf6, 0xe0, 0xf2, 0x0e,\n"," 0x26, 0x1d, 0x0d, 0xdd, 0x02, 0xe7, 0xe0, 0x09, 0xf9, 0x0d, 0xde, 0xe2,\n"," 0xfe, 0xef, 0xee, 0x06, 0xfa, 0x1a, 0x0e, 0xd9, 0x10, 0xf9, 0x10, 0x08,\n"," 0x0f, 0xfb, 0xf8, 0xcf, 0x23, 0x13, 0xf5, 0x04, 0x07, 0x0e, 0x22, 0xfd,\n"," 0x29, 0xf3, 0xf2, 0x18, 0xf1, 0x04, 0x0b, 0x11, 0x00, 0x09, 0xed, 0x07,\n"," 0xe0, 0x27, 0x07, 0x1e, 0xe4, 0xda, 0x0e, 0xdb, 0xd6, 0x3d, 0x0f, 0x25,\n"," 0x03, 0xdf, 0xdc, 0x20, 0xe5, 0x09, 0xea, 0x36, 0xee, 0x29, 0xda, 0x00,\n"," 0xde, 0xfc, 0x27, 0x21, 0xdb, 0x38, 0xed, 0x05, 0xd2, 0xfe, 0xf4, 0xf4,\n"," 0x07, 0xf5, 0xd3, 0xfc, 0xe1, 0x0e, 0xf7, 0x19, 0x00, 0x35, 0xe0, 0xf6,\n"," 0x01, 0xfe, 0x08, 0x27, 0xf1, 0x52, 0xec, 0x06, 0xdd, 0xd7, 0x1a, 0xfd,\n"," 0x0b, 0x2d, 0xdf, 0x02, 0xd8, 0xe9, 0xdd, 0xec, 0x0b, 0x14, 0x0a, 0x02,\n"," 0xec, 0x1d, 0x08, 0xde, 0x11, 0xec, 0x0e, 0x0b, 0x2b, 0xee, 0x0c, 0xf4,\n"," 0x0c, 0xdc, 0x03, 0x05, 0x17, 0xfb, 0xf5, 0xf9, 0xea, 0xeb, 0x06, 0x12,\n"," 0x20, 0x20, 0x0e, 0xf9, 0x00, 0xd7, 0x11, 0x03, 0x1d, 0x2b, 0x01, 0xe5,\n"," 0x01, 0xf6, 0xf9, 0x03, 0x10, 0x1d, 0x14, 0xcd, 0xfd, 0xe3, 0xe9, 0x1d,\n"," 0xfa, 0x11, 0x07, 0xd9, 0x09, 0xfd, 0xeb, 0x02, 0xfc, 0x0e, 0xe6, 0xe9,\n"," 0x0c, 0xeb, 0xeb, 0x08, 0x0e, 0xf6, 0xf7, 0xe2, 0x14, 0x09, 0x02, 0xfd,\n"," 0x09, 0x02, 0x0e, 0xe9, 0x31, 0x05, 0xfc, 0x11, 0xef, 0x05, 0x05, 0x09,\n"," 0xfc, 0xd7, 0x09, 0xee, 0xdc, 0x05, 0x0e, 0x18, 0xd2, 0xd3, 0xf1, 0x26,\n"," 0xcc, 0x3a, 0x0d, 0x2a, 0x22, 0xf7, 
0xce, 0x14, 0xed, 0x1a, 0xfe, 0x54,\n"," 0xf3, 0x2f, 0xfb, 0x14, 0xe6, 0xf3, 0xe7, 0x15, 0xd3, 0x1a, 0xe2, 0x0b,\n"," 0xe0, 0xf6, 0x0a, 0xf6, 0xe8, 0xea, 0xf4, 0x17, 0xd4, 0x11, 0xfb, 0x11,\n"," 0xfd, 0x37, 0xf6, 0x0b, 0xe3, 0x00, 0x05, 0x1a, 0xdc, 0x59, 0xd7, 0x17,\n"," 0xbf, 0xe7, 0xe3, 0x2a, 0xfa, 0x30, 0xfb, 0xf3, 0xde, 0xee, 0x01, 0xfd,\n"," 0x10, 0x1b, 0x06, 0xf5, 0xee, 0xf8, 0xd9, 0xe4, 0x0a, 0xe0, 0x01, 0x13,\n"," 0x22, 0x0d, 0xda, 0xfe, 0x06, 0xeb, 0xfe, 0xe9, 0x1c, 0x1e, 0xff, 0xfb,\n"," 0xfb, 0xf9, 0xe6, 0xec, 0x14, 0x2d, 0x01, 0x09, 0xd9, 0xd9, 0x0c, 0xe7,\n"," 0x2e, 0x12, 0xef, 0xe9, 0xfe, 0xee, 0x0b, 0x13, 0x0d, 0x0d, 0xf1, 0xf2,\n"," 0xf6, 0xf5, 0xf0, 0x1e, 0xf6, 0xf7, 0x01, 0xcd, 0xfd, 0x04, 0xed, 0xfd,\n"," 0xfd, 0x22, 0x16, 0xde, 0x09, 0xee, 0xee, 0xe3, 0x19, 0x14, 0xd7, 0xee,\n"," 0x1d, 0xf6, 0x02, 0xfd, 0x21, 0x02, 0xe0, 0xfa, 0x21, 0xf4, 0xfe, 0xf8,\n"," 0xf4, 0xf1, 0xff, 0x19, 0xf5, 0x02, 0xea, 0x23, 0x01, 0x07, 0x0e, 0x2f,\n"," 0xf2, 0xec, 0x04, 0xfb, 0xd9, 0x40, 0xee, 0x19, 0x30, 0xed, 0xf5, 0xe7,\n"," 0xe6, 0x20, 0xf6, 0x27, 0xf3, 0x2a, 0xec, 0x14, 0xff, 0x00, 0x0f, 0x05,\n"," 0xd0, 0x0d, 0xfe, 0x1e, 0xd7, 0xf8, 0xeb, 0xfe, 0xf3, 0x16, 0xe9, 0x07,\n"," 0xec, 0x10, 0x19, 0x08, 0x0e, 0x2a, 0x11, 0x08, 0xe1, 0xe9, 0x11, 0x1c,\n"," 0xf1, 0x53, 0xd4, 0xf8, 0xc7, 0xed, 0xf4, 0x03, 0xf1, 0x29, 0xfc, 0xf0,\n"," 0xc3, 0xf5, 0xf4, 0x0c, 0x21, 0x11, 0xf9, 0x0b, 0xe0, 0xfc, 0x08, 0xfc,\n"," 0x12, 0xe1, 0x18, 0x03, 0x17, 0x0f, 0xfc, 0xdb, 0x06, 0xeb, 0x05, 0x0f,\n"," 0x17, 0x0b, 0x0f, 0x16, 0xdd, 0xf1, 0xf1, 0xfd, 0x0f, 0x29, 0xec, 0x21,\n"," 0xe7, 0xe3, 0x0c, 0x09, 0x0d, 0x10, 0xe7, 0x12, 0xf6, 0xe9, 0x01, 0x0f,\n"," 0x17, 0xf0, 0xec, 0xd2, 0xff, 0x0c, 0xd5, 0x0b, 0xff, 0x09, 0xf5, 0xe1,\n"," 0xfe, 0xf5, 0x0f, 0x0e, 0x01, 0x18, 0xfa, 0xe7, 0x02, 0x0f, 0xf4, 0x1b,\n"," 0x24, 0x03, 0xfe, 0xf1, 0x16, 0x0d, 0xfa, 0xf3, 0x13, 0x17, 0x04, 0xec,\n"," 0x17, 0x00, 0xf0, 0x17, 0x01, 0xff, 0x0e, 0xfb, 0xfc, 0x23, 0x0c, 0x02,\n"," 0xe8, 0x00, 0xfe, 0x1c, 0xd8, 0xd9, 0x05, 0xda, 0xa8, 0x2a, 0xdf, 0x3f,\n"," 0x0d, 0xe6, 0xe8, 0x0d, 0xf1, 0x17, 0x15, 0x57, 0xe5, 0x24, 0xfa, 0x0b,\n"," 0xe9, 0xef, 0xfc, 0x03, 0xd0, 0x2e, 0xe7, 0x1e, 0xe8, 0x18, 0x15, 0xfe,\n"," 0xfd, 0x0b, 0xe9, 0x0d, 0xe8, 0x00, 0x00, 0x08, 0xfd, 0x27, 0xea, 0x01,\n"," 0xda, 0x01, 0xfa, 0x2a, 0xf8, 0x46, 0x03, 0x04, 0xd8, 0xed, 0xfb, 0x28,\n"," 0xed, 0x43, 0xeb, 0x17, 0xce, 0xed, 0xe0, 0xe2, 0x0b, 0x18, 0x03, 0x24,\n"," 0xdd, 0xfe, 0xee, 0xf3, 0x27, 0xef, 0x00, 0x1b, 0x1d, 0x19, 0xee, 0xf0,\n"," 0x0b, 0x03, 0xf6, 0x03, 0x12, 0x1f, 0x0a, 0x1b, 0xf0, 0xec, 0xf5, 0x02,\n"," 0x25, 0x15, 0x0d, 0x23, 0xe0, 0xe1, 0x14, 0xf5, 0x07, 0x24, 0xf8, 0x0e,\n"," 0xf3, 0x00, 0xf8, 0xf4, 0x07, 0x00, 0xfa, 0xe5, 0x06, 0xe9, 0xf7, 0x16,\n"," 0x13, 0xfb, 0x04, 0xe6, 0xf1, 0xde, 0x07, 0xfc, 0x08, 0x05, 0xf2, 0xfe,\n"," 0x0f, 0xfd, 0xea, 0xe8, 0x0e, 0x03, 0xf6, 0xe9, 0x15, 0xf5, 0xf4, 0xf1,\n"," 0x04, 0xf8, 0x06, 0xe6, 0x23, 0xe5, 0xff, 0xf1, 0xfa, 0x00, 0xf4, 0xfb,\n"," 0xf6, 0x08, 0x02, 0xfe, 0x07, 0x08, 0xfe, 0x10, 0xef, 0xd6, 0x02, 0xed,\n"," 0xbd, 0x21, 0xff, 0x29, 0x22, 0xe3, 0xe1, 0xfa, 0xfe, 0x33, 0x00, 0x31,\n"," 0xf0, 0x1b, 0x03, 0x06, 0xf1, 0xe5, 0xe8, 0xdb, 0xe6, 0x23, 0xeb, 0x09,\n"," 0xef, 0x0b, 0x1b, 0xfc, 0xff, 0x00, 0xee, 0x0a, 0xc2, 0x14, 0xfe, 0x0e,\n"," 0xf8, 0x33, 0xe6, 0x0f, 0xeb, 0x08, 0x27, 0x24, 0xf0, 0x54, 0xea, 0xeb,\n"," 0xc4, 0xe1, 0xf3, 0xe2, 0xfd, 0x3c, 0xfe, 0xfa, 0xca, 0xec, 0xef, 0x1a,\n"," 0x08, 0x13, 0x03, 0x17, 0xf2, 0x09, 0xfa, 0xe8, 0x26, 0xee, 0x09, 0x22,\n"," 0x07, 0x0f, 0x08, 0xf8, 0x00, 0x02, 0xfe, 0xda, 
0x2a, 0x0d, 0x0e, 0x23,\n"," 0xf8, 0xfa, 0xfb, 0xf5, 0x1e, 0x14, 0xf8, 0xf2, 0xdd, 0xf9, 0xf1, 0x1b,\n"," 0x0e, 0x0e, 0xf9, 0xf7, 0xe6, 0xfb, 0x0a, 0x03, 0xf6, 0x12, 0xf8, 0xff,\n"," 0xed, 0x14, 0x04, 0x12, 0x12, 0x0b, 0x05, 0xf1, 0x03, 0xe2, 0xf2, 0x18,\n"," 0xfb, 0x00, 0xf1, 0xc6, 0x0f, 0xfd, 0xeb, 0xef, 0x16, 0xf8, 0x00, 0xf5,\n"," 0x20, 0xf7, 0x04, 0xf0, 0x08, 0x06, 0x0a, 0xf9, 0x11, 0xf4, 0xf8, 0xec,\n"," 0xf2, 0x0d, 0x15, 0xe0, 0xe4, 0x07, 0xef, 0xfd, 0xf3, 0x13, 0xfa, 0x1e,\n"," 0xf2, 0xee, 0xec, 0xf3, 0xc9, 0x18, 0x13, 0x34, 0x16, 0xeb, 0xf3, 0xe0,\n"," 0xd2, 0x24, 0xf4, 0x25, 0xe4, 0xfe, 0x03, 0x04, 0xf0, 0xfc, 0xf6, 0xf1,\n"," 0xcc, 0xfe, 0xf9, 0xeb, 0xdb, 0x1c, 0xf0, 0x18, 0xf6, 0xff, 0x00, 0x0b,\n"," 0xe0, 0x14, 0xf6, 0x14, 0xff, 0x2b, 0x19, 0xd9, 0xed, 0xf8, 0x00, 0x07,\n"," 0xe2, 0x48, 0xf0, 0xf3, 0xd5, 0xeb, 0x04, 0xef, 0xee, 0x38, 0xf4, 0x20,\n"," 0xc9, 0xfe, 0xe9, 0x08, 0x0a, 0x0d, 0xf7, 0x22, 0xf8, 0x12, 0xfe, 0xe4,\n"," 0x25, 0xdb, 0x0d, 0x05, 0xfe, 0x02, 0xe6, 0x03, 0xfc, 0xf5, 0x08, 0xee,\n"," 0x16, 0x20, 0x07, 0xf3, 0xfb, 0xec, 0x00, 0xec, 0x16, 0x22, 0x00, 0x1e,\n"," 0xe0, 0xd6, 0x00, 0x1a, 0x09, 0x1a, 0xeb, 0x13, 0xfe, 0xe2, 0xf7, 0xff,\n"," 0xe6, 0xf3, 0x28, 0xf9, 0x01, 0xf8, 0xee, 0xe8, 0x07, 0x0a, 0x1d, 0xf0,\n"," 0xed, 0xde, 0x06, 0x15, 0xfd, 0xf5, 0x13, 0xcf, 0xfc, 0x00, 0x0b, 0xdc,\n"," 0x0e, 0xf9, 0xfa, 0xed, 0x12, 0xfa, 0xf7, 0x07, 0x22, 0xe6, 0x01, 0xee,\n"," 0x0e, 0xef, 0xf5, 0x13, 0x01, 0xe9, 0xed, 0x03, 0xe7, 0xda, 0xdc, 0x19,\n"," 0xf5, 0x1d, 0x07, 0x2c, 0xee, 0xd5, 0xf1, 0xf2, 0xc6, 0x21, 0x01, 0x3a,\n"," 0x0e, 0xe0, 0x06, 0x0f, 0xdc, 0x33, 0xed, 0x30, 0xec, 0x02, 0x23, 0x06,\n"," 0x01, 0x13, 0xfe, 0xed, 0xdb, 0xfc, 0x13, 0xf9, 0xfc, 0x0d, 0x09, 0x18,\n"," 0xeb, 0x0d, 0xeb, 0x02, 0xcc, 0x10, 0x1a, 0xed, 0x05, 0x2c, 0xf6, 0x07,\n"," 0xf0, 0xf0, 0xf0, 0x17, 0xe9, 0x33, 0xeb, 0x19, 0xb8, 0xf0, 0xeb, 0xed,\n"," 0xdf, 0x1b, 0xf6, 0xfa, 0xd2, 0xf4, 0xfe, 0x18, 0xf1, 0x09, 0xf0, 0xe5,\n"," 0xd8, 0x08, 0x1a, 0x19, 0x16, 0xe1, 0x12, 0x02, 0x14, 0x1c, 0x02, 0x1e,\n"," 0x11, 0xf1, 0x08, 0x01, 0x0b, 0x1d, 0xfe, 0x21, 0xed, 0xf1, 0x03, 0x0c,\n"," 0x21, 0xfe, 0xfa, 0x18, 0xec, 0xe4, 0x02, 0x09, 0x15, 0x03, 0x0b, 0x0c,\n"," 0xfb, 0xeb, 0xfa, 0xfc, 0xf7, 0xf8, 0xf0, 0x14, 0xf5, 0xfa, 0x0a, 0x09,\n"," 0x13, 0x06, 0xea, 0xf5, 0x06, 0xfb, 0xfa, 0x0f, 0x10, 0xf9, 0xfa, 0xe7,\n"," 0xf2, 0xe2, 0x01, 0x14, 0x06, 0x02, 0xf3, 0xfe, 0x0c, 0xfb, 0xf0, 0xfa,\n"," 0x07, 0xe9, 0xea, 0xf1, 0xf5, 0xf7, 0xe5, 0xf7, 0xf8, 0xf5, 0xf9, 0xf5,\n"," 0xe7, 0xfd, 0xe7, 0xfc, 0xe2, 0x28, 0x0e, 0x00, 0xf3, 0xd9, 0x10, 0x16,\n"," 0xf4, 0x39, 0xe8, 0x28, 0xed, 0xf3, 0xdc, 0x07, 0x0f, 0x3a, 0xec, 0x09,\n"," 0xeb, 0xf7, 0x16, 0x09, 0xf9, 0x1f, 0x02, 0x26, 0xd9, 0xfb, 0x0a, 0xf5,\n"," 0xf5, 0x2b, 0xe7, 0xfb, 0xfd, 0x05, 0x1b, 0x0a, 0x05, 0x16, 0xf3, 0xfd,\n"," 0xe8, 0x23, 0x01, 0xe7, 0xfb, 0x0f, 0x0d, 0x03, 0xe0, 0x3c, 0x1f, 0xe3,\n"," 0xd2, 0x13, 0xfd, 0xeb, 0xe4, 0x11, 0xfb, 0x08, 0xde, 0xeb, 0x02, 0xf2,\n"," 0x10, 0xf8, 0xf0, 0x16, 0xe3, 0x21, 0x05, 0x14, 0x17, 0xe6, 0xf7, 0xed,\n"," 0x09, 0x19, 0x14, 0x23, 0x0b, 0xfe, 0x13, 0x26, 0x00, 0x25, 0xef, 0xee,\n"," 0x05, 0x00, 0x07, 0xf3, 0xfb, 0x15, 0xf5, 0xfc, 0xe0, 0xf4, 0xf4, 0xe4,\n"," 0xe5, 0x10, 0xf6, 0x03, 0xdc, 0xe5, 0x09, 0xf5, 0xe7, 0xf1, 0xf1, 0xdb,\n"," 0x01, 0x09, 0xfd, 0xdc, 0xdc, 0xfc, 0x12, 0xee, 0x15, 0xdf, 0x15, 0xe1,\n"," 0xf9, 0x08, 0x05, 0xde, 0x04, 0xea, 0x25, 0x1d, 0x12, 0xd9, 0xf0, 0xdf,\n"," 0x31, 0xe1, 0xe5, 0xe5, 0xf6, 0xf1, 0x00, 0xe9, 0x07, 0xf2, 0x08, 0x0b,\n"," 0xee, 0xe1, 0xfa, 0x06, 0x76, 0xfe, 0xff, 0xff, 0x04, 0x00, 
0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0xc9, 0x01, 0x00, 0x00, 0x59, 0xfe, 0xff, 0xff,\n"," 0x8f, 0xfe, 0xff, 0xff, 0x50, 0x01, 0x00, 0x00, 0x60, 0xfb, 0xff, 0xff,\n"," 0x0f, 0x00, 0x00, 0x00, 0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e,\n"," 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x58, 0xfa, 0xff, 0xff, 0xbc, 0x01, 0x00, 0x00,\n"," 0xb0, 0x01, 0x00, 0x00, 0xa4, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x78, 0x01, 0x00, 0x00, 0x18, 0x01, 0x00, 0x00,\n"," 0xb4, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0xaa, 0xfe, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,\n"," 0xce, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x03, 0x00, 0x00, 0x00,\n"," 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x1a, 0xff, 0xff, 0xff, 0x00, 0x00, 0x80, 0x3f, 0x01, 0x00, 0x00, 0x00,\n"," 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00,\n"," 0x07, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08,\n"," 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x28, 0xfc, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x16, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x10, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x38, 0x00, 0x00, 0x00,\n"," 0x2c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x1a, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, 0x14, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x02, 0x00, 0x00, 0x00,\n"," 0x38, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,\n"," 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00,\n"," 0x10, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0a, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0xac, 0x04, 0x00, 0x00, 0x44, 0x04, 0x00, 0x00, 0xc4, 0x03, 0x00, 0x00,\n"," 0x4c, 0x03, 0x00, 0x00, 0xd0, 0x02, 0x00, 0x00, 0x90, 0x02, 0x00, 0x00,\n"," 0x20, 0x02, 0x00, 0x00, 0xb4, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00,\n"," 0x6c, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 
0x00,\n"," 0xd4, 0xff, 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,\n"," 0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,\n"," 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f, 0x31, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa8, 0x07, 0x00, 0x00,\n"," 0xf2, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x4c, 0x00, 0x00, 0x00,\n"," 0x07, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xe4, 0xfb, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,\n"," 0x13, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,\n"," 0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x5f, 0x69, 0x6e, 0x74, 0x38, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x08, 0x00, 0x07, 0x00, 0x0c, 0x00,\n"," 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,\n"," 0xb4, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00,\n"," 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00,\n"," 0x12, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0xd6, 0x72, 0xec, 0x39, 0x57, 0x66, 0x72, 0x3a,\n"," 0x1e, 0xe6, 0x14, 0x3a, 0x27, 0x15, 0x3a, 0x39, 0x33, 0xb7, 0x25, 0x3a,\n"," 0xf6, 0x03, 0x80, 0x3a, 0xd2, 0x73, 0x28, 0x39, 0x79, 0xbb, 0x5c, 0x3a,\n"," 0x12, 0x00, 0x00, 0x00, 0x66, 0x69, 0x72, 0x73, 0x74, 0x5f, 0x77, 0x65,\n"," 0x69, 0x67, 0x68, 0x74, 0x73, 0x2f, 0x72, 0x65, 0x61, 0x64, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x32, 0xfd, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x09, 0x54, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x24, 0xfd, 0xff, 0xff,\n"," 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x4a, 0xb2, 0xf3, 0x39, 0x1f, 0x00, 0x00, 0x00, 0x66, 0x69, 0x6e, 0x61,\n"," 0x6c, 0x5f, 0x66, 0x63, 0x5f, 0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73,\n"," 0x2f, 0x72, 0x65, 0x61, 0x64, 0x2f, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70,\n"," 0x6f, 0x73, 0x65, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xa0, 0x0f, 0x00, 0x00, 0x9a, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x58, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 
0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xbb, 0xb0, 0xba, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0xd8, 0x1c, 0x35, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x3b, 0xcf, 0x3e, 0xc1, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,\n"," 0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x06, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,\n"," 0x2c, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x0f, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f,\n"," 0x32, 0x2f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x42, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x5c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x14, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x50, 0x50, 0xd0, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,\n"," 0x61, 0x70, 0x65, 0x5f, 0x32, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0xba, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x60, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x8c, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x50, 0x50, 0xd0, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,\n"," 0x61, 0x70, 0x65, 0x5f, 0x31, 0x5f, 0x69, 0x6e, 0x74, 0x38, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa8, 0x07, 0x00, 0x00,\n"," 0x2e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x60, 0x00, 0x00, 0x00,\n"," 0x09, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x0c, 0x00, 0x14, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,\n"," 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,\n"," 0xbd, 0xad, 0x93, 0x3d, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x1a, 0x93, 0x41,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0xaa, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,\n"," 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x9c, 0xff, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0xc4, 0x94, 
0x0c, 0x38, 0x0b, 0x00, 0x00, 0x00, 0x4d, 0x61, 0x74, 0x4d,\n"," 0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00,\n"," 0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x02, 0xa4, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,\n"," 0x8c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x50, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x7c, 0x67, 0x40, 0x38, 0x32, 0x3f, 0xc5, 0x38, 0x5e, 0x53, 0x72, 0x38,\n"," 0x90, 0x6b, 0x97, 0x37, 0xd6, 0xd8, 0x86, 0x38, 0xc2, 0x56, 0xd0, 0x38,\n"," 0xf3, 0x12, 0x89, 0x37, 0x92, 0x9d, 0xb3, 0x38, 0x0b, 0x00, 0x00, 0x00,\n"," 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x70, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xca, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x06, 0x02, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x72, 0xe6, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x19,\n"," 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x06, 0x00, 0x05, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x00, 0x16, 0x0a, 0x00, 0x0e, 0x00, 0x07, 0x00,\n"," 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x0c, 0x00, 0x07, 0x00,\n"," 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,\n"," 0x03, 0x00, 0x00, 0x00\n","};\n","unsigned int g_model_len = 18952;\n"],"name":"stdout"}]}]} \ No newline at end of file +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"train_micro_speech_model.ipynb","provenance":[{"file_id":"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb","timestamp":1587690382292}],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"pO4-CY_TCZZS","colab_type":"text"},"source":["# Train a Simple Audio Recognition Model"]},{"cell_type":"markdown","metadata":{"id":"BaFfr7DHRmGF","colab_type":"text"},"source":["This notebook demonstrates how to train a 20 kB [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) model to recognize keywords in speech.\n","\n","The model created in this notebook is used in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) example for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview).\n","\n","\n"," \n"," \n","
\n"," Run in Google Colab\n"," \n"," View source on GitHub\n","
\n"]},{"cell_type":"markdown","metadata":{"id":"XaVtYN4nlCft","colab_type":"text"},"source":["**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and set **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.\n","\n","## Configure Defaults\n","\n","**MODIFY** the following constants for your specific use case."]},{"cell_type":"code","metadata":{"id":"ludfxbNIaegy","colab_type":"code","colab":{}},"source":["# A comma-delimited list of the words you want to train for.\n","# The options are: yes,no,up,down,left,right,on,off,stop,go\n","# All the other words will be used to train an \"unknown\" label and silent\n","# audio data with no spoken words will be used to train a \"silence\" label.\n","WANTED_WORDS = \"yes,no\"\n","\n","# The number of steps and learning rates can be specified as comma-separated\n","# lists to define the rate at each stage. For example,\n","# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001\n","# will run 12,000 training loops in total, with a rate of 0.001 for the first\n","# 8,000, and 0.0001 for the final 3,000.\n","TRAINING_STEPS = \"12000,3000\"\n","LEARNING_RATE = \"0.001,0.0001\"\n","\n","# Calculate the total number of steps, which is used to identify the checkpoint\n","# file name.\n","TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(\",\"))))\n","\n","# Print the configuration to confirm it\n","print(\"Training these words: %s\" % WANTED_WORDS)\n","print(\"Training steps in each stage: %s\" % TRAINING_STEPS)\n","print(\"Learning rate in each stage: %s\" % LEARNING_RATE)\n","print(\"Total number of training steps: %s\" % TOTAL_STEPS)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gCgeOpvY9pAi","colab_type":"text"},"source":["**DO NOT MODIFY** the following constants as they include filepaths used in this notebook and data that is shared during training and inference."]},{"cell_type":"code","metadata":{"id":"Nd1iM1o2ymvA","colab_type":"code","colab":{}},"source":["# Calculate the percentage of 'silence' and 'unknown' training samples required\n","# to ensure that we have equal number of samples for each label.\n","number_of_labels = WANTED_WORDS.count(',') + 1\n","number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label\n","equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))\n","SILENT_PERCENTAGE = equal_percentage_of_training_samples\n","UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples\n","\n","# Constants which are shared during training and inference\n","PREPROCESS = 'micro'\n","WINDOW_STRIDE = 20\n","MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,\n"," # low_latency_conv, low_latency_svdf, tiny_embedding_conv\n","\n","# Constants used during training only\n","VERBOSITY = 'WARN'\n","EVAL_STEP_INTERVAL = '1000'\n","SAVE_STEP_INTERVAL = '1000'\n","\n","# Constants for training directories and filepaths\n","DATASET_DIR = 'dataset/'\n","LOGS_DIR = 'logs/'\n","TRAIN_DIR = 'train/' # for training checkpoints and other files.\n","\n","# Constants for inference directories and filepaths\n","import os\n","MODELS_DIR = 'models'\n","if not os.path.exists(MODELS_DIR):\n"," os.mkdir(MODELS_DIR)\n","MODEL_TF = os.path.join(MODELS_DIR, 'model.pb')\n","MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')\n","FLOAT_MODEL_TFLITE = os.path.join(MODELS_DIR, 
'float_model.tflite')\n","MODEL_TFLITE_MICRO = os.path.join(MODELS_DIR, 'model.cc')\n","SAVED_MODEL = os.path.join(MODELS_DIR, 'saved_model')\n","\n","QUANT_INPUT_MIN = 0.0\n","QUANT_INPUT_MAX = 26.0\n","QUANT_INPUT_RANGE = QUANT_INPUT_MAX - QUANT_INPUT_MIN"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6rLYpvtg9P4o","colab_type":"text"},"source":["## Setup Environment\n","\n","Install Dependencies"]},{"cell_type":"code","metadata":{"id":"ed_XpUrU5DvY","colab_type":"code","colab":{}},"source":["%tensorflow_version 1.x\n","import tensorflow as tf"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"T9Ty5mR58E4i","colab_type":"text"},"source":["**DELETE** any old data from previous runs\n"]},{"cell_type":"code","metadata":{"id":"APGx0fEh7hFF","colab_type":"code","colab":{}},"source":["!rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"GfEUlfFBizio","colab_type":"text"},"source":["Clone the TensorFlow Github Repository, which contains the relevant code required to run this tutorial."]},{"cell_type":"code","metadata":{"id":"yZArmzT85SLq","colab_type":"code","colab":{}},"source":["!git clone -q --depth 1 https://github.com/tensorflow/tensorflow"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"nS9swHLSi7Bi","colab_type":"text"},"source":["Load TensorBoard to visualize the accuracy and loss as training proceeds.\n"]},{"cell_type":"code","metadata":{"id":"q4qF1VxP3UE4","colab_type":"code","colab":{}},"source":["%load_ext tensorboard\n","%tensorboard --logdir {LOGS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"x1J96Ron-O4R","colab_type":"text"},"source":["## Training\n","\n","The following script downloads the dataset and begin training."]},{"cell_type":"code","metadata":{"id":"VJsEZx6lynbY","colab_type":"code","colab":{}},"source":["!python tensorflow/tensorflow/examples/speech_commands/train.py \\\n","--data_dir={DATASET_DIR} \\\n","--wanted_words={WANTED_WORDS} \\\n","--silence_percentage={SILENT_PERCENTAGE} \\\n","--unknown_percentage={UNKNOWN_PERCENTAGE} \\\n","--preprocess={PREPROCESS} \\\n","--window_stride={WINDOW_STRIDE} \\\n","--model_architecture={MODEL_ARCHITECTURE} \\\n","--how_many_training_steps={TRAINING_STEPS} \\\n","--learning_rate={LEARNING_RATE} \\\n","--train_dir={TRAIN_DIR} \\\n","--summaries_dir={LOGS_DIR} \\\n","--verbosity={VERBOSITY} \\\n","--eval_step_interval={EVAL_STEP_INTERVAL} \\\n","--save_step_interval={SAVE_STEP_INTERVAL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"UczQKtqLi7OJ","colab_type":"text"},"source":["# Skipping the training\n","\n","If you don't want to spend an hour or two training the model from scratch, you can download pretrained checkpoints by uncommenting the lines below (removing the '#'s at the start of each line) and running them."]},{"cell_type":"code","metadata":{"id":"RZw3VNlnla-J","colab_type":"code","colab":{}},"source":["#!curl -O \"https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_micro_train_2020_05_10.tgz\"\n","#!tar xzf speech_micro_train_2020_05_10.tgz"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XQUJLrdS-ftl","colab_type":"text"},"source":["## Generate a TensorFlow Model for Inference\n","\n","Combine relevant training results (graph, weights, etc) into a single file for inference. 
This process is known as freezing a model and the resulting model is known as a frozen model/graph, as it cannot be further re-trained after this process."]},{"cell_type":"code","metadata":{"id":"xyc3_eLh9sAg","colab_type":"code","colab":{}},"source":["!rm -rf {SAVED_MODEL}\n","!python tensorflow/tensorflow/examples/speech_commands/freeze.py \\\n","--wanted_words=$WANTED_WORDS \\\n","--window_stride_ms=$WINDOW_STRIDE \\\n","--preprocess=$PREPROCESS \\\n","--model_architecture=$MODEL_ARCHITECTURE \\\n","--start_checkpoint=$TRAIN_DIR$MODEL_ARCHITECTURE'.ckpt-'{TOTAL_STEPS} \\\n","--save_format=saved_model \\\n","--output_file={SAVED_MODEL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_DBGDxVI-nKG","colab_type":"text"},"source":["## Generate a TensorFlow Lite Model\n","\n","Convert the frozen graph into a TensorFlow Lite model, which is fully quantized for use with embedded devices.\n","\n","The following cell will also print the model size, which will be under 20 kilobytes."]},{"cell_type":"code","metadata":{"id":"RIitkqvGWmre","colab_type":"code","colab":{}},"source":["import sys\n","# We add this path so we can import the speech processing modules.\n","sys.path.append(\"/content/tensorflow/tensorflow/examples/speech_commands/\")\n","import input_data\n","import models\n","import numpy as np"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"kzqECqMxgBh4","colab_type":"code","colab":{}},"source":["SAMPLE_RATE = 16000\n","CLIP_DURATION_MS = 1000\n","WINDOW_SIZE_MS = 30.0\n","FEATURE_BIN_COUNT = 40\n","BACKGROUND_FREQUENCY = 0.8\n","BACKGROUND_VOLUME_RANGE = 0.1\n","TIME_SHIFT_MS = 100.0\n","\n","DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'\n","VALIDATION_PERCENTAGE = 10\n","TESTING_PERCENTAGE = 10"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"rNQdAplJV1fz","colab_type":"code","colab":{}},"source":["model_settings = models.prepare_model_settings(\n"," len(input_data.prepare_words_list(WANTED_WORDS.split(','))),\n"," SAMPLE_RATE, CLIP_DURATION_MS, WINDOW_SIZE_MS,\n"," WINDOW_STRIDE, FEATURE_BIN_COUNT, PREPROCESS)\n","audio_processor = input_data.AudioProcessor(\n"," DATA_URL, DATASET_DIR,\n"," SILENT_PERCENTAGE, UNKNOWN_PERCENTAGE,\n"," WANTED_WORDS.split(','), VALIDATION_PERCENTAGE,\n"," TESTING_PERCENTAGE, model_settings, LOGS_DIR)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBj_AyCh1cC0","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," float_converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," float_tflite_model = float_converter.convert()\n"," float_tflite_model_size = open(FLOAT_MODEL_TFLITE, \"wb\").write(float_tflite_model)\n"," print(\"Float model is %d bytes\" % float_tflite_model_size)\n","\n"," converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," converter.optimizations = [tf.lite.Optimize.DEFAULT]\n"," converter.inference_input_type = tf.lite.constants.INT8\n"," converter.inference_output_type = tf.lite.constants.INT8\n"," def representative_dataset_gen():\n"," for i in range(100):\n"," data, _ = audio_processor.get_data(1, i*1, model_settings,\n"," BACKGROUND_FREQUENCY, \n"," BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS,\n"," 'testing',\n"," sess)\n"," flattened_data = np.array(data.flatten(), dtype=np.float32).reshape(1, 1960)\n"," yield [flattened_data]\n"," converter.representative_dataset = representative_dataset_gen\n"," tflite_model = 
converter.convert()\n"," tflite_model_size = open(MODEL_TFLITE, \"wb\").write(tflite_model)\n"," print(\"Quantized model is %d bytes\" % tflite_model_size)\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"EeLiDZTbLkzv","colab_type":"text"},"source":["# Testing the TensorFlow Lite model's accuracy\n","\n","Verify that the model we've exported is still accurate, using the TF Lite Python API and our test set."]},{"cell_type":"code","metadata":{"id":"wQsEteKRLryJ","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","float_interpreter = tf.lite.Interpreter(FLOAT_MODEL_TFLITE)\n","float_interpreter.allocate_tensors()\n","\n","float_input_index = float_interpreter.get_input_details()[0][\"index\"]\n","\n","float_output_index = float_interpreter.get_output_details()[0][\"index\"]\n","float_model_output = float_interpreter.tensor(float_output_index)\n","\n","float_correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," float_interpreter.set_tensor(float_input_index, flattened_input)\n"," float_interpreter.invoke()\n"," top_prediction = float_model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," float_correct_predictions += 1\n","\n","print('Float accuracy is %f%% (N=%d)' % ((float_correct_predictions * 100) / len(test_data), len(test_data)))\n","\n","interpreter = tf.lite.Interpreter(MODEL_TFLITE)\n","interpreter.allocate_tensors()\n","\n","input_index = interpreter.get_input_details()[0][\"index\"]\n","\n","output_index = interpreter.get_output_details()[0][\"index\"]\n","model_output = interpreter.tensor(output_index)\n","\n","with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," quantized_input = np.zeros((1960), np.int8)\n"," for index, input_value in enumerate(current_input.flatten()):\n"," # These scaling values are derived from those used in input_data.py in the\n"," # training pipeline.\n"," value = ((input_value - QUANT_INPUT_MIN) * 256) / QUANT_INPUT_RANGE\n"," value -= 128\n"," if value < -128:\n"," value = -128\n"," if value > 127:\n"," value = 127\n"," quantized_input[index] = value\n"," flattened_input = np.array(quantized_input.flatten(), dtype=np.int8).reshape(1, 1960)\n"," interpreter.set_tensor(input_index, flattened_input)\n"," interpreter.invoke()\n"," top_prediction = model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," correct_predictions += 1\n","\n","print('Quantized accuracy is %f%% (N=%d)' % ((correct_predictions * 100) / len(test_data), len(test_data)))\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dt6Zqbxu-wIi","colab_type":"text"},"source":["## Generate a TensorFlow Lite for MicroControllers Model\n","Convert the TensorFlow Lite model into a C source file that can be loaded by TensorFlow Lite for Microcontrollers."]},{"cell_type":"code","metadata":{"id":"XohZOTjR8ZyE","colab_type":"code","colab":{}},"source":["# Install xxd if it 
is not available\n","!apt-get update && apt-get -qq install xxd\n","# Convert to a C source file\n","!xxd -i {MODEL_TFLITE} > {MODEL_TFLITE_MICRO}\n","# Update variable names\n","REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')\n","!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"2pQnN0i_-0L2","colab_type":"text"},"source":["## Deploy to a Microcontroller\n","\n","Follow the instructions in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) README.md for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview) to deploy this model on a specific microcontroller.\n","\n","**Reference Model:** If you have not modified this notebook, you can follow the instructions as is, to deploy the model. Refer to the [`micro_speech/train/models`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/models) directory to access the models generated in this notebook. \n","\n","**New Model:** If you have generated a new model to identify different words: (i) Update `kCategoryCount` and `kCategoryLabels` in [`micro_speech/micro_features/micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h) and (ii) Update the values assigned to the variables defined in [`micro_speech/micro_features/model.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc) with values displayed after running the following cell."]},{"cell_type":"code","metadata":{"id":"eoYyh0VU8pca","colab_type":"code","colab":{}},"source":["# Print the C source file\n","!cat {MODEL_TFLITE_MICRO}"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"iYlIKpO2mkhv","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} \ No newline at end of file From d5a5959dd33d783a2af711b777a51292b0d8a02a Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 13 May 2020 22:30:26 -0700 Subject: [PATCH 0563/1533] optimize for int8 add. 
PiperOrigin-RevId: 311471171 Change-Id: I822d1205b1c5312ecf0e2602b6ac35082740574d --- .../internal/optimized/integer_ops/add.h | 141 +++++++++++------- 1 file changed, 91 insertions(+), 50 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index a9dae4feac5..8937fe2b26e 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -35,58 +35,99 @@ inline void AddElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.input2_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); TFLITE_DCHECK_LT(params.input2_offset, 256); + #ifdef USE_NEON - const int8x8_t output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); - const int8x8_t output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); - for (; i <= size - 8; i += 8) { - const int8x8_t input1_val_original = vld1_s8(input1_data + i); - const int8x8_t input2_val_original = vld1_s8(input2_data + i); - const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); - const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); - const int16x8_t input1_val = - vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); - const int16x8_t input2_val = - vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); - const int16x4_t input1_val_high = vget_high_s16(input1_val); - const int16x4_t input1_val_low = vget_low_s16(input1_val); - const int16x4_t input2_val_high = vget_high_s16(input2_val); - const int16x4_t input2_val_low = vget_low_s16(input2_val); - int32x4_t x11 = vmovl_s16(input1_val_low); - int32x4_t x12 = vmovl_s16(input1_val_high); - int32x4_t x21 = vmovl_s16(input2_val_low); - int32x4_t x22 = vmovl_s16(input2_val_high); - const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); - x11 = vshlq_s32(x11, left_shift_dup); - x12 = vshlq_s32(x12, left_shift_dup); - x21 = vshlq_s32(x21, left_shift_dup); - x22 = vshlq_s32(x22, left_shift_dup); - x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); - x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); - x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); - x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); - const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); - const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); - x11 = vshlq_s32(x11, input1_shift_dup); - x12 = vshlq_s32(x12, input1_shift_dup); - x21 = vshlq_s32(x21, input2_shift_dup); - x22 = vshlq_s32(x22, input2_shift_dup); - int32x4_t s1 = vaddq_s32(x11, x21); - int32x4_t s2 = vaddq_s32(x12, x22); - s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); - s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + const int8x16_t output_activation_min_vector = + vdupq_n_s8(params.quantized_activation_min); + const int8x16_t output_activation_max_vector = + vdupq_n_s8(params.quantized_activation_max); + + const int input1_left_shift = params.left_shift + params.input1_shift; + const int input2_left_shift = params.left_shift + params.input2_shift; + const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); + const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + + for (; i <= size - 16; i += 16) { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = + vmovl_s8(vget_high_s8(input1_val_original)); + const 
int16x8_t input1_val_s16_low = + vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = + vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = + vaddq_s16(input1_val_s16_high, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val_high = + vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); + const int16x8_t input1_val_low = + vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val_low = + vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + int32x4_t x111 = vmovl_s16(input1_val_low_low); + int32x4_t x112 = vmovl_s16(input1_val_low_high); + int32x4_t x121 = vmovl_s16(input1_val_high_low); + int32x4_t x122 = vmovl_s16(input1_val_high_high); + int32x4_t x211 = vmovl_s16(input2_val_low_low); + int32x4_t x212 = vmovl_s16(input2_val_low_high); + int32x4_t x221 = vmovl_s16(input2_val_high_low); + int32x4_t x222 = vmovl_s16(input2_val_high_high); + + x111 = vshlq_s32(x111, input1_left_dup); + x112 = vshlq_s32(x112, input1_left_dup); + x121 = vshlq_s32(x121, input1_left_dup); + x122 = vshlq_s32(x122, input1_left_dup); + x211 = vshlq_s32(x211, input2_left_dup); + x212 = vshlq_s32(x212, input2_left_dup); + x221 = vshlq_s32(x221, input2_left_dup); + x222 = vshlq_s32(x222, input2_left_dup); + x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); + x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); + x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier); + x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); + x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); + x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); + x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); + x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); + int32x4_t s11 = vaddq_s32(x111, x211); + int32x4_t s12 = vaddq_s32(x112, x212); + int32x4_t s21 = vaddq_s32(x121, x221); + int32x4_t s22 = vaddq_s32(x122, x222); + s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); + s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); + s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); + s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - s1 = RoundingDivideByPOT(s1, -params.output_shift); - s2 = RoundingDivideByPOT(s2, -params.output_shift); - const int16x4_t s1_narrowed = vmovn_s32(s1); - const int16x4_t s2_narrowed = vmovn_s32(s2); - const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), - vdupq_n_s16(params.output_offset)); - const int8x8_t clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(s))); - vst1_s8(output_data + i, clamped); + s11 = RoundingDivideByPOT(s11, -params.output_shift); + s12 = RoundingDivideByPOT(s12, -params.output_shift); + s21 = RoundingDivideByPOT(s21, -params.output_shift); + s22 = RoundingDivideByPOT(s22, 
-params.output_shift); + const int16x4_t s11_narrowed = vmovn_s32(s11); + const int16x4_t s12_narrowed = vmovn_s32(s12); + const int16x4_t s21_narrowed = vmovn_s32(s21); + const int16x4_t s22_narrowed = vmovn_s32(s22); + const int16x8_t s1 = vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), + vdupq_n_s16(params.output_offset)); + const int16x8_t s2 = vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), + vdupq_n_s16(params.output_offset)); + const int16x8_t s = vcombine_s16(vqmovn_s16(s1), vqmovn_s16(s2)); + + const int8x16_t clamped = + vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, s)); + vst1q_s8(output_data + i, clamped); } #endif // NEON From 4afee5f519ee47d6771f78c5580c0bc6d17d8876 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Wed, 13 May 2020 22:48:33 -0700 Subject: [PATCH 0564/1533] Replace SameOperandsAndResultType by TFL_TCresVTEtIsSameAsOp to cover quantization types Also fixes Mobilenet-v3-quant conversion failure. PiperOrigin-RevId: 311473695 Change-Id: I08f836a2b829772f7a8d6b39766ab67ccd2c9a10 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 34 +++++++++++-------- .../mlir/lite/transforms/dense_to_sparse.cc | 3 +- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index fdf1501dbef..8a949a45e2d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -1561,10 +1561,12 @@ def TFL_GreaterOp : TFL_Op<"greater", [ let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; } -def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultType, - TFL_GpuTargetOp]> { +def TFL_HardSwishOp: TFL_Op<"hard_swish", [ + NoSideEffect, + SameOperandsAndResultShape, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_GpuTargetOp]> { let summary = "Hardswish activation function."; let description = [{ Computes hard-swish activation function @@ -1574,7 +1576,7 @@ def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$input); - let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$out); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$output); let hasOptions = 0; } @@ -1606,7 +1608,8 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect, def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [ SameOperandsAndResultShape, NoSideEffect, - SameOperandsAndResultType]> { + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Leaky Relu operator"; let description = [{ @@ -1740,7 +1743,8 @@ def TFL_LogOp: TFL_Op<"log", [ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ NoSideEffect, SameOperandsAndResultShape, - SameOperandsAndResultType, + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, // zero_point = max_value // scale = -log_softmax_output_min / (max_value + 1) FixedResultScale>, @@ -1896,11 +1900,11 @@ Rounds the values of a tensor to the nearest integer, element-wise. 
}]; let arguments = (ins - TFL_TensorOf<[F32]>:$x + TFL_FpTensor:$x ); let results = (outs - TFL_TensorOf<[F32]>:$y + TFL_FpTensor:$y ); } @@ -2443,9 +2447,9 @@ def TFL_RsqrtOp: TFL_Op<"rsqrt", [NoSideEffect, Computes element-wise reverse square root of input }]; - let arguments = (ins AnyTensor:$x); + let arguments = (ins TFL_FpTensor:$x); - let results = (outs AnyTensor:$y); + let results = (outs TFL_FpTensor:$y); let hasFolder = 1; } @@ -3361,9 +3365,11 @@ def TFL_QuantizeOp: TFL_Op<"quantize", [ let results = (outs AnyTensor:$output); } -def TFL_DensifyOp: TFL_Op<"densify", [NoSideEffect, - SameOperandsAndResultType, - NoQuantizableResult]> { +def TFL_DensifyOp: TFL_Op<"densify", [ + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoQuantizableResult]> { let summary = "Densify operator"; let description = [{ diff --git a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc index 201a0bb2481..9b526f40277 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc +++ b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc @@ -321,7 +321,8 @@ void DenseToSparse::runOnFunction() { if (result.needs_densify) { const auto value = op->getOperand(operand); - auto densify = builder.create(op->getLoc(), value); + auto densify = + builder.create(op->getLoc(), value.getType(), value); value.replaceAllUsesWith(densify); densify.setOperand(value); } From 9173c6c3d3f9f3a8c58f48f803952cad83bf8730 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 22:52:09 -0700 Subject: [PATCH 0565/1533] Internal change PiperOrigin-RevId: 311474047 Change-Id: I2c8bcfc0c13d5bf82eaeeadc43202171eeecab8b --- .../kernels/data/experimental/snapshot_util.cc | 18 +++++++----------- tensorflow/core/platform/tensor_coding.cc | 9 +-------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/snapshot_util.cc b/tensorflow/core/kernels/data/experimental/snapshot_util.cc index 3ad1345d776..6c4d6424146 100644 --- a/tensorflow/core/kernels/data/experimental/snapshot_util.cc +++ b/tensorflow/core/kernels/data/experimental/snapshot_util.cc @@ -503,12 +503,10 @@ Status Reader::ReadTensors(std::vector* read_tensors) { size_t tensor_proto_size = tensor_proto_strs[complex_index].second; TensorProto tp; #if defined(PLATFORM_GOOGLE) - auto tensor_proto_ptr = tensor_proto_str.release(); - absl::Cord c; - c.AppendExternalMemory( - absl::string_view(tensor_proto_ptr, tensor_proto_size), - tensor_proto_ptr, - [](void* arg) { delete[] static_cast(arg); }); + absl::string_view tensor_proto_view(tensor_proto_str.get(), + tensor_proto_size); + absl::Cord c = absl::MakeCordFromExternal( + tensor_proto_view, [s = std::move(tensor_proto_str)] {}); if (!tp.ParseFromCord(c)) { return errors::Internal("Could not parse TensorProto"); } @@ -615,11 +613,9 @@ Status Reader::ReadRecord(absl::Cord* record) { } else { auto tmp_str = absl::make_unique(); TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(length, tmp_str.get())); - tstring* tmp_str_raw = tmp_str.release(); - record->AppendExternalMemory(*tmp_str_raw, tmp_str_raw, - [](absl::string_view unused_data, void* arg) { - delete static_cast(arg); - }); + absl::string_view tmp_str_view(*tmp_str); + record->Append( + absl::MakeCordFromExternal(tmp_str_view, [s = std::move(tmp_str)] {})); return Status::OK(); } } diff --git a/tensorflow/core/platform/tensor_coding.cc 
b/tensorflow/core/platform/tensor_coding.cc index 66d28d7b15f..cd938a5be1d 100644 --- a/tensorflow/core/platform/tensor_coding.cc +++ b/tensorflow/core/platform/tensor_coding.cc @@ -134,14 +134,7 @@ std::unique_ptr NewStringListDecoder(const string& in) { #if defined(TENSORFLOW_PROTOBUF_USES_CORD) void AssignRefCounted(StringPiece src, core::RefCounted* obj, absl::Cord* out) { obj->Ref(); - out->Clear(); - // Defines a lambda to unref "obj" when Cord deletes this piece of - // memory. +[] converts the lambda to a C style function pointer. - auto cleanup = +[](absl::string_view donotcare, void* obj) { - reinterpret_cast(obj)->Unref(); - }; - out->AppendExternalMemory(absl::string_view(src.data(), src.size()), obj, - cleanup); + *out = absl::MakeCordFromExternal(src, [obj] { obj->Unref(); }); } void EncodeStringList(const tstring* strings, int64 n, absl::Cord* out) { From 2f8ea36a4475a726e564223023d98e09959919a9 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 13 May 2020 22:57:23 -0700 Subject: [PATCH 0566/1533] Pin keras_preprocessing to 1.1.0 PiperOrigin-RevId: 311474544 Change-Id: I1911f5b834e61cd269c39ca30559d1304b3a787f --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index f61e00c01d5..806ad2d0cdb 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -60,7 +60,7 @@ REQUIRED_PACKAGES = [ 'gast == 0.3.3', 'google_pasta >= 0.1.8', 'h5py >= 2.10.0, < 2.11.0', - 'keras_preprocessing >= 1.1.0', + 'keras_preprocessing == 1.1.0', 'numpy >= 1.16.0, < 2.0', 'opt_einsum >= 2.3.2', 'protobuf >= 3.9.2', From 2a55f049241fae552009b8e520894da1404f281f Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 13 May 2020 23:02:24 -0700 Subject: [PATCH 0567/1533] Cleanup `setup.py` Remove python 2 and TF 1.x stanzas. Also make keras_preprocessing be between 1.1.1 and 1.2. 
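The keras_preprocessing bound above is an ordinary PEP 440 version specifier. As a purely illustrative check of which releases it admits (this uses the third-party `packaging` module only for demonstration; it is not a dependency touched by this change):

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet(">=1.1.1,<1.2")
    for candidate in ("1.1.0", "1.1.1", "1.1.5", "1.2.0"):
        print(candidate, candidate in spec)
    # 1.1.0 False, 1.1.1 True, 1.1.5 True, 1.2.0 False
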
PiperOrigin-RevId: 311475047 Change-Id: I4ba517cb8babd609e83d031c86afb6670d34c757 --- tensorflow/tools/pip_package/setup.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 806ad2d0cdb..4b8289a6202 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -55,12 +55,10 @@ _VERSION = '2.2.0' REQUIRED_PACKAGES = [ 'absl-py >= 0.7.0', 'astunparse == 1.6.3', - 'backports.weakref >= 1.0rc1;python_version<"3.4"', - 'enum34 >= 1.1.6;python_version<"3.4"', 'gast == 0.3.3', 'google_pasta >= 0.1.8', 'h5py >= 2.10.0, < 2.11.0', - 'keras_preprocessing == 1.1.0', + 'keras_preprocessing >= 1.1.1, < 1.2', 'numpy >= 1.16.0, < 2.0', 'opt_einsum >= 2.3.2', 'protobuf >= 3.9.2', @@ -68,18 +66,10 @@ REQUIRED_PACKAGES = [ 'tensorflow_estimator >= 2.2.0, < 2.3.0', 'termcolor >= 1.1.0', 'wrapt >= 1.11.1', - # python3 requires wheel 0.26 - 'wheel >= 0.26;python_version>="3"', - 'wheel;python_version<"3"', - # mock comes with unittest.mock for python3, need to install for python2 - 'mock >= 2.0.0;python_version<"3"', - # functools comes with python3, need to install the backport for python2 - 'functools32 >= 3.2.3;python_version<"3"', + 'wheel >= 0.26', 'six >= 1.12.0', # scipy < 1.4.1 causes segfaults due to pybind11 - # Latest scipy pip for py2 is scipy==1.2.2 - 'scipy == 1.4.1;python_version>="3"', - 'scipy == 1.2.2;python_version<"3"', + 'scipy == 1.4.1', ] if sys.byteorder == 'little': @@ -100,8 +90,6 @@ if 'tf_nightly' in project_name: for i, pkg in enumerate(REQUIRED_PACKAGES): if 'tensorboard' in pkg: REQUIRED_PACKAGES[i] = 'tb-nightly >= 2.3.0a0, < 2.4.0a0' - elif 'tensorflow_estimator' in pkg and '2.0' in project_name: - REQUIRED_PACKAGES[i] = 'tensorflow-estimator-2.0-preview' elif 'tensorflow_estimator' in pkg: REQUIRED_PACKAGES[i] = 'tf-estimator-nightly' @@ -121,11 +109,6 @@ CONSOLE_SCRIPTS = [ ] # pylint: enable=line-too-long -# Only keep freeze_graph console script in 1.X. -if _VERSION.startswith('1.') and '_2.0' not in project_name: - CONSOLE_SCRIPTS.append( - 'freeze_graph = tensorflow.python.tools.freeze_graph:run_main') - # remove the tensorboard console script if building tf_nightly if 'tf_nightly' in project_name: CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:run_main') From e40aeb534e10db518ec44ccf32b09db8446d6aa3 Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Wed, 13 May 2020 23:20:32 -0700 Subject: [PATCH 0568/1533] Change xtensa optimized softmax to use precomputed lookup table for quantized exponent calculation. Use new memory API for softmax. PiperOrigin-RevId: 311476576 Change-Id: I1026f6eca0e098c42f7b784ab599ed362dc533c9 --- .../micro/kernels/xtensa_hifimini/softmax.cc | 134 ++++++++++++++---- 1 file changed, 108 insertions(+), 26 deletions(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc index c95fd0e40a4..a7c5604ef64 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc @@ -29,16 +29,88 @@ namespace micro { namespace activations { namespace { -// TODO(b/141176180): This code is currently a strict subset of the portable -// implementation (softmax.cc one directory up). When TFLM implements -// registrations for selective types (e.g. compile without float support), this -// can be removed. Otherwise, any HiFi specific optimizations should land here. 
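For orientation, the lookup-table scheme implemented by the added kernel code below can be sketched in a few lines of plain Python. The scale and beta values here are made up, and this is only an approximation of the idea (int8 inputs, Q0.16 table entries, int16 outputs), not the kernel itself:

    import math

    input_scale, beta = 0.05, 1.0   # made-up quantization parameters
    FRAC_BITS = 16                  # table entries are Q0.16 fixed point
    exp_lut = [0] * 256
    for d in range(1, 256):         # d = max_in_row - input, so d >= 1 here
        exp_lut[d] = int(round(math.exp(-d * input_scale * beta) * (1 << FRAC_BITS)))

    def softmax_row(row):           # row: int8 logits for one batch element
        m = max(row)
        # diff == 0 needs the special value 2**16, which does not fit in the table.
        exps = [(1 << FRAC_BITS) if v == m else exp_lut[m - v] for v in row]
        total = sum(exps)
        # Map each probability onto the int16 output range (the kernel also
        # rounds and clamps; here we only clamp the upper bound).
        return [min(e * 65536 // total - 32768, 32767) for e in exps]

    print(softmax_row([5, 3, -1, 0]))
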
+struct OpData { + uint16_t* exp_lut; +}; + +// Number of unique int8 and int16 values. Used in exponent lookup table +// conputation. +constexpr int kInt8Range = + std::numeric_limits::max() - std::numeric_limits::min() + 1; +constexpr int kInt16Range = + std::numeric_limits::max() - std::numeric_limits::min() + 1; +// Each 16-bit precalculated exponent is expressed as a Q0.16 fixedpoint +// value. We special-case e^0 since 1.0 requires 1 integer bit to +// express. +constexpr int kExpFractionalBits = 16; +// e^0 expressed as Q1.15 exceeds the int16_t range, so it must be handled +// specially. +constexpr int kMaxExponentValue = (1 << kExpFractionalBits); + +// Quantized softmax with int8 input and int16 output. +// TODO(b/155656675): Investigate removing const ref params. +inline TfLiteStatus Softmax(const OpData& op_data, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& output_shape, + int16_t* output_data) { + // The last dimension is depth. Outer size is the the total input size + // divided by depth. + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) { + int8_t max_in_row = std::numeric_limits::min(); + for (int c = 0; c < depth; ++c) { + max_in_row = std::max(max_in_row, input_data[i * depth + c]); + } + + uint32_t sum_of_exps = 0; + for (int c = 0; c < depth; ++c) { + TFLITE_DCHECK(max_in_row >= input_data[i * depth + c]); + uint8_t input_diff = max_in_row - input_data[i * depth + c]; + + sum_of_exps += + input_diff == 0 ? kMaxExponentValue : op_data.exp_lut[input_diff]; + } + + // Ensure we cannnot overflow the full_range_output value. We need to + // guarantee that kInt16Range * max(input_data) / sum_of_exps < kInt16Range. + TFLITE_DCHECK(sum_of_exps >= kMaxExponentValue); + + for (int c = 0; c < depth; ++c) { + uint8_t input_diff = max_in_row - input_data[i * depth + c]; + // Special case for diff == 0 + uint32_t unscaled_output = + input_diff == 0 ? kMaxExponentValue : op_data.exp_lut[input_diff]; + int64_t scaled_output = static_cast(unscaled_output) * + static_cast(kInt16Range); + int32_t full_range_output = + scaled_output / sum_of_exps + std::numeric_limits::min(); + // Round up if remainder exceeds half of the divider value. + uint32_t remainder = scaled_output % sum_of_exps; + if (remainder * 2 >= sum_of_exps) { + full_range_output++; + } + output_data[i * depth + c] = static_cast(std::max( + std::min(full_range_output, + static_cast(std::numeric_limits::max())), + static_cast(std::numeric_limits::min()))); + } + } + return kTfLiteOk; +} + +} // namespace TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, const TfLiteTensor* input, TfLiteTensor* output, const TfLiteSoftmaxParams* params, - SoftmaxParams* op_data) { + OpData* op_data) { if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) { if (input->type == kTfLiteUInt8) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); @@ -55,28 +127,30 @@ TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, } } - static const int kScaledDiffIntegerBits = 5; + // Precompute e^(-x * input_scale * beta) for every possible int8 input. + // This computation is used for every iteration of Softmax. We must compute + // using pre-scaled inputs to avoid introducing additional error, while + // restricting our input range to the int8 range. 
This is valid since beta + // and input scale are constant for a given op in the graph. Skip index 0 + // since that is a special case which requires 1 integer bit instead of 0. + for (int i = 1; i <= kInt8Range; i++) { + float scaled_input = i * input->params.scale; + float exp_value = + std::exp((-scaled_input) * static_cast(params->beta)); - int input_left_shift; - tflite::PreprocessSoftmaxScaling( - static_cast(params->beta), - static_cast(input->params.scale), kScaledDiffIntegerBits, - &op_data->input_multiplier, &input_left_shift); - op_data->input_left_shift = input_left_shift; - op_data->diff_min = - -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits, - op_data->input_left_shift); + float exponent_scaled = + std::round(exp_value * static_cast(1 << kExpFractionalBits)); + op_data->exp_lut[i] = static_cast(exponent_scaled); + } } return kTfLiteOk; } -} // namespace - void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) { TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); void* data = nullptr; - if (context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams), - &data) == kTfLiteError) { + if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) == + kTfLiteError) { return nullptr; } return data; @@ -92,26 +166,34 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumDimensions(input) >= 1); TFLITE_DCHECK(node->user_data != nullptr); - SoftmaxParams* op_params = static_cast(node->user_data); + OpData* op_data = static_cast(node->user_data); + + // Allocate an array to precompute exponents over all int8 inputs, applying + // the scale and beta before calculating exp. It is mandatory to apply beta + // and scale here, since each softmax op may have different beta and scale + // values. Beta and scale will remain constant for a given softmax op. + void* allocated_ptr; + TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer( + context, kInt8Range * sizeof(int16_t), &allocated_ptr)); + op_data->exp_lut = static_cast(allocated_ptr); TF_LITE_ENSURE_STATUS( - CalculateSoftmaxOpData(context, input, output, params, op_params)); + CalculateSoftmaxOpData(context, input, output, params, op_data)); return kTfLiteOk; } TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) { - auto* op_params = static_cast(node->user_data); + auto* op_data = static_cast(node->user_data); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); if (input->type == kTfLiteInt8 && output->type == kTfLiteInt16) { // TODO(b/155656675): Const ref params can be slow on xtensa. - tflite::reference_ops::Softmax( - *op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; + return Softmax(*op_data, GetTensorShape(input), + GetTensorData(input), GetTensorShape(output), + GetTensorData(output)); } else { TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", TfLiteTypeGetName(input->type), input->type); From 112288586dd69d5eede04be059b7eddc5635bc98 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 23:31:44 -0700 Subject: [PATCH 0569/1533] Switch weights from per-value to per-input-item. 
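The distinction in the subject line is between two conventions for the `weights` argument: per-value weights, where `weights[v]` scales the final count of value `v`, and per-input-item weights, where `weights[i]` is accumulated into the bin of `values[i]`. A small NumPy illustration of the difference (the numbers are made up, and NumPy is used only because `np.bincount` already implements the per-input-item convention):

    import numpy as np

    values = np.array([10, 20, 30, 20])

    # Per-input-item: weights[i] is added to the bin for values[i],
    # so bin 20 accumulates 0.25 + 0.5 = 0.75.
    item_weights = np.array([2.0, 0.25, 15.0, 0.5])
    print(np.bincount(values, weights=item_weights)[[10, 20, 30]])
    # bins 10, 20, 30 -> 2.0, 0.75, 15.0

    # Per-value: weights[v] scales the count of value v,
    # so bin 20 becomes count(20) * weights[20] = 2 * 0.25.
    value_weights = np.ones(31)
    value_weights[20] = 0.25
    counts = np.bincount(values, minlength=31).astype(float)
    print((counts * value_weights)[[10, 20, 30]])
    # bins 10, 20, 30 -> 1.0, 0.5, 1.0
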
PiperOrigin-RevId: 311477582 Change-Id: I749c4edfcfd4dd3acd036a1d14b2c493b8d8bfc8 --- .../api_def_DenseCountSparseOutput.pbtxt | 23 +- .../api_def_RaggedCountSparseOutput.pbtxt | 27 +- .../api_def_SparseCountSparseOutput.pbtxt | 29 +-- tensorflow/core/kernels/count_ops.cc | 246 +++++++++++------- tensorflow/core/ops/count_ops.cc | 39 ++- tensorflow/python/ops/bincount.py | 151 ++--------- tensorflow/python/ops/bincount_test.py | 188 ++++--------- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v1/tensorflow.sparse.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v2/tensorflow.sparse.pbtxt | 2 +- 11 files changed, 278 insertions(+), 441 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt index 8296bfe6d7b..416da1ccaab 100644 --- a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt @@ -4,62 +4,61 @@ op { in_arg { name: "values" description: < -using BatchedMap = std::vector>; +using BatchedIntMap = std::vector>; namespace { // TODO(momernick): Extend this function to work with outputs of rank > 2. -template -Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, +Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, bool is_1d, OpKernelContext* context) { int total_values = 0; int num_batches = per_batch_counts.size(); @@ -47,12 +44,12 @@ Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, context->allocate_output(1, TensorShape({total_values}), &values)); auto output_indices = indices->matrix(); - auto output_values = values->flat(); + auto output_values = values->flat(); int64 value_loc = 0; for (int b = 0; b < num_batches; ++b) { const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); + std::vector> pairs(per_batch_count.begin(), + per_batch_count.end()); std::sort(pairs.begin(), pairs.end()); for (const auto& x : pairs) { if (is_1d) { @@ -80,19 +77,85 @@ Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, return Status::OK(); } -int GetOutputSize(int max_seen, int max_length, int min_length) { +Status OutputWeightedSparse(const BatchedIntMap& per_batch_counts, + int num_values, const Tensor& weights, bool is_1d, + OpKernelContext* context) { + if (!TensorShapeUtils::IsVector(weights.shape())) { + return errors::InvalidArgument( + "Weights must be a 1-dimensional tensor. Got: ", + weights.shape().DebugString()); + } + + if (num_values > weights.dim_size(0)) { + return errors::InvalidArgument("The maximum array value was ", num_values, + ", but the weight array has size ", + weights.shape().DebugString()); + } + auto weight_values = weights.flat(); + + int total_values = 0; + int num_batches = per_batch_counts.size(); + for (const auto& per_batch_count : per_batch_counts) { + total_values += per_batch_count.size(); + } + + Tensor* indices; + int inner_dim = is_1d ? 
1 : 2; + TF_RETURN_IF_ERROR(context->allocate_output( + 0, TensorShape({total_values, inner_dim}), &indices)); + + Tensor* values; + TF_RETURN_IF_ERROR( + context->allocate_output(1, TensorShape({total_values}), &values)); + + auto output_indices = indices->matrix(); + auto output_values = values->flat(); + int64 value_loc = 0; + for (int b = 0; b < num_batches; ++b) { + const auto& per_batch_count = per_batch_counts[b]; + std::vector> pairs(per_batch_count.begin(), + per_batch_count.end()); + std::sort(pairs.begin(), pairs.end()); + for (const auto& x : pairs) { + if (is_1d) { + output_indices(value_loc, 0) = x.first; + } else { + output_indices(value_loc, 0) = b; + output_indices(value_loc, 1) = x.first; + } + output_values(value_loc) = x.second * weight_values(x.first); + ++value_loc; + } + } + + Tensor* dense_shape; + if (is_1d) { + TF_RETURN_IF_ERROR( + context->allocate_output(2, TensorShape({1}), &dense_shape)); + dense_shape->flat().data()[0] = num_values; + } else { + TF_RETURN_IF_ERROR( + context->allocate_output(2, TensorShape({2}), &dense_shape)); + dense_shape->flat().data()[0] = num_batches; + dense_shape->flat().data()[1] = num_values; + } + return Status::OK(); +} + +template +T GetOutputSize(T max_seen, T max_length, T min_length) { return max_length > 0 ? max_length : std::max((max_seen + 1), min_length); } } // namespace -template +template class DenseCount : public OpKernel { public: explicit DenseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); } void Compute(OpKernelContext* context) override { @@ -107,15 +170,6 @@ class DenseCount : public OpKernel { "Input must be a 1 or 2-dimensional tensor. Got: ", data.shape().DebugString())); - if (use_weights) { - OP_REQUIRES( - context, weights.shape() == data.shape(), - errors::InvalidArgument( - "Weights and data must have the same shape. 
Weight shape: ", - weights.shape().DebugString(), - "; data shape: ", data.shape().DebugString())); - } - bool is_1d = TensorShapeUtils::IsVector(data.shape()); int negative_valued_axis = -1; int num_batch_dimensions = (data.shape().dims() + negative_valued_axis); @@ -125,23 +179,19 @@ class DenseCount : public OpKernel { num_batch_elements *= data.shape().dim_size(i); } int num_value_elements = data.shape().num_elements() / num_batch_elements; - auto per_batch_counts = BatchedMap(num_batch_elements); - + auto per_batch_counts = BatchedIntMap(num_batch_elements); T max_value = 0; const auto data_values = data.flat(); - const auto weight_values = weights.flat(); int i = 0; for (int b = 0; b < num_batch_elements; ++b) { for (int v = 0; v < num_value_elements; ++v) { const auto& value = data_values(i); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_output_) { - per_batch_counts[b][value] = 1; - } else if (use_weights) { - per_batch_counts[b][value] += weight_values(i); + if (binary_count_) { + (per_batch_counts[b])[value] = 1; } else { - per_batch_counts[b][value]++; + (per_batch_counts[b])[value]++; } if (value > max_value) { max_value = value; @@ -151,24 +201,30 @@ class DenseCount : public OpKernel { } } - int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); + T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + if (use_weights) { + OP_REQUIRES_OK(context, + OutputWeightedSparse(per_batch_counts, num_output_values, + weights, is_1d, context)); + } else { + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); + } } private: - int maxlength_; - int minlength_; - bool binary_output_; + T minlength_; + T maxlength_; + bool binary_count_; }; -template +template class SparseCount : public OpKernel { public: explicit SparseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); } void Compute(OpKernelContext* context) override { @@ -179,27 +235,23 @@ class SparseCount : public OpKernel { bool use_weights = weights.NumElements() > 0; bool is_1d = shape.NumElements() == 1; + const auto indices_values = indices.matrix(); + const auto values_values = values.flat(); + int num_batches = is_1d ? 1 : shape.flat()(0); int num_values = values.NumElements(); - const auto indices_values = indices.matrix(); - const auto values_values = values.flat(); - const auto weight_values = weights.flat(); - - auto per_batch_counts = BatchedMap(num_batches); - + auto per_batch_counts = BatchedIntMap(num_batches); T max_value = 0; for (int idx = 0; idx < num_values; ++idx) { int batch = is_1d ? 
0 : indices_values(idx, 0); const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_output_) { - per_batch_counts[batch][value] = 1; - } else if (use_weights) { - per_batch_counts[batch][value] += weight_values(idx); + if (binary_count_) { + (per_batch_counts[batch])[value] = 1; } else { - per_batch_counts[batch][value]++; + (per_batch_counts[batch])[value]++; } if (value > max_value) { max_value = value; @@ -207,25 +259,30 @@ class SparseCount : public OpKernel { } } - int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); + T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + if (use_weights) { + OP_REQUIRES_OK(context, + OutputWeightedSparse(per_batch_counts, num_output_values, + weights, is_1d, context)); + } else { + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); + } } private: - int maxlength_; - int minlength_; - bool binary_output_; - bool validate_; + T minlength_; + T maxlength_; + bool binary_count_; }; -template +template class RaggedCount : public OpKernel { public: explicit RaggedCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); } void Compute(OpKernelContext* context) override { @@ -233,15 +290,13 @@ class RaggedCount : public OpKernel { const Tensor& values = context->input(1); const Tensor& weights = context->input(2); bool use_weights = weights.NumElements() > 0; - bool is_1d = false; const auto splits_values = splits.flat(); const auto values_values = values.flat(); - const auto weight_values = weights.flat(); int num_batches = splits.NumElements() - 1; int num_values = values.NumElements(); - auto per_batch_counts = BatchedMap(num_batches); + auto per_batch_counts = BatchedIntMap(num_batches); T max_value = 0; int batch_idx = 0; @@ -251,12 +306,10 @@ class RaggedCount : public OpKernel { } const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_output_) { - per_batch_counts[batch_idx - 1][value] = 1; - } else if (use_weights) { - per_batch_counts[batch_idx - 1][value] += weight_values(idx); + if (binary_count_) { + (per_batch_counts[batch_idx - 1])[value] = 1; } else { - per_batch_counts[batch_idx - 1][value]++; + (per_batch_counts[batch_idx - 1])[value]++; } if (value > max_value) { max_value = value; @@ -264,47 +317,42 @@ class RaggedCount : public OpKernel { } } - int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); + T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + if (use_weights) { + OP_REQUIRES_OK(context, + OutputWeightedSparse(per_batch_counts, num_output_values, + weights, false, context)); + } else { + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + false, context)); + } } private: - int maxlength_; - int minlength_; - bool binary_output_; - bool validate_; + T minlength_; + T maxlength_; + bool binary_count_; }; -#define REGISTER_W(W_TYPE) \ - REGISTER(int32, W_TYPE) \ - REGISTER(int64, W_TYPE) 
+#define REGISTER(TYPE) \ + \ + REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ + .TypeConstraint("T") \ + .Device(DEVICE_CPU), \ + DenseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ + .TypeConstraint("T") \ + .Device(DEVICE_CPU), \ + SparseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ + .TypeConstraint("T") \ + .Device(DEVICE_CPU), \ + RaggedCount) -#define REGISTER(I_TYPE, W_TYPE) \ - \ - REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ - .TypeConstraint("T") \ - .TypeConstraint("output_type") \ - .Device(DEVICE_CPU), \ - DenseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ - .TypeConstraint("T") \ - .TypeConstraint("output_type") \ - .Device(DEVICE_CPU), \ - SparseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ - .TypeConstraint("T") \ - .TypeConstraint("output_type") \ - .Device(DEVICE_CPU), \ - RaggedCount) - -TF_CALL_INTEGRAL_TYPES(REGISTER_W); -TF_CALL_float(REGISTER_W); -TF_CALL_double(REGISTER_W); - -#undef REGISTER_W +REGISTER(int32); +REGISTER(int64); #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/ops/count_ops.cc b/tensorflow/core/ops/count_ops.cc index 8de0a2ef954..c9fbe1f8d8e 100644 --- a/tensorflow/core/ops/count_ops.cc +++ b/tensorflow/core/ops/count_ops.cc @@ -19,21 +19,12 @@ limitations under the License. namespace tensorflow { +using shape_inference::DimensionHandle; using shape_inference::InferenceContext; -using shape_inference::ShapeHandle; Status DenseCountSparseOutputShapeFn(InferenceContext *c) { - auto values = c->input(0); - auto weights = c->input(1); - ShapeHandle output; - auto num_weights = c->NumElements(weights); - if (c->ValueKnown(num_weights) && c->Value(num_weights) == 0) { - output = values; - } else { - TF_RETURN_IF_ERROR(c->Merge(weights, values, &output)); - } - auto rank = c->Rank(output); - auto nvals = c->UnknownDim(); + int32 rank = c->Rank(c->input(0)); + DimensionHandle nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -41,8 +32,8 @@ Status DenseCountSparseOutputShapeFn(InferenceContext *c) { } Status SparseCountSparseOutputShapeFn(InferenceContext *c) { - auto rank = c->Dim(c->input(0), 1); - auto nvals = c->UnknownDim(); + DimensionHandle rank = c->Dim(c->input(0), 1); + DimensionHandle nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -54,7 +45,7 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { if (rank != c->kUnknownRank) { ++rank; // Add the ragged dimension } - auto nvals = c->UnknownDim(); + DimensionHandle nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -63,12 +54,12 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { REGISTER_OP("DenseCountSparseOutput") .Input("values: T") - .Input("weights: output_type") + .Input("weights: float") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_output: bool") - .Attr("output_type: {int32, int64, float, double}") + .Attr("binary_count: bool") + .Attr("output_type: {int64, float}") .SetShapeFn(DenseCountSparseOutputShapeFn) .Output("output_indices: int64") 
.Output("output_values: output_type") @@ -78,12 +69,12 @@ REGISTER_OP("SparseCountSparseOutput") .Input("indices: int64") .Input("values: T") .Input("dense_shape: int64") - .Input("weights: output_type") + .Input("weights: float") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_output: bool") - .Attr("output_type: {int32, int64, float, double}") + .Attr("binary_count: bool") + .Attr("output_type: {int64, float}") .SetShapeFn(SparseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -92,12 +83,12 @@ REGISTER_OP("SparseCountSparseOutput") REGISTER_OP("RaggedCountSparseOutput") .Input("splits: int64") .Input("values: T") - .Input("weights: output_type") + .Input("weights: float") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_output: bool") - .Attr("output_type: {int32, int64, float, double}") + .Attr("binary_count: bool") + .Attr("output_type: {int64, float}") .SetShapeFn(RaggedCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") diff --git a/tensorflow/python/ops/bincount.py b/tensorflow/python/ops/bincount.py index 68950eaf596..e1b3bebaaaa 100644 --- a/tensorflow/python/ops/bincount.py +++ b/tensorflow/python/ops/bincount.py @@ -18,10 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops -from tensorflow.python.ops import check_ops from tensorflow.python.ops import gen_count_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util.tf_export import tf_export @@ -33,7 +33,7 @@ def sparse_bincount(values, axis=0, minlength=None, maxlength=None, - binary_output=False, + binary_count=False, name=None): """Count the number of times an integer value appears in a tensor. @@ -58,9 +58,8 @@ def sparse_bincount(values, maxlength: If given, skips `values` that are greater than or equal to `maxlength`, and ensures that the output has a `dense_shape` of at most `maxlength` in the inner dimension. - binary_output: If True, this op will output 1 instead of the number of times - a token appears (equivalent to one_hot + reduce_any instead of one_hot + - reduce_add). Defaults to False. + binary_count: Whether to do a binary count. When True, this op will return 1 + for any value that exists instead of counting the number of occurrences. name: A name for this op. Returns: @@ -79,7 +78,7 @@ def sparse_bincount(values, SparseTensor) and returns a SparseTensor where the value of (i,j) is the number of times value j appears in batch i. - >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] >>> output = tf.sparse.bincount(data, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( @@ -103,7 +102,7 @@ def sparse_bincount(values, dense shape is [2, 500] instead of [2,10002] or [2, 102]. >>> minlength = maxlength = 500 - >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] >>> output = tf.sparse.bincount( ... 
data, axis=-1, minlength=minlength, maxlength=maxlength) >>> print(output) @@ -124,8 +123,8 @@ def sparse_bincount(values, some values (like 20 in batch 1 and 11 in batch 2) appear more than once, the 'values' tensor is all 1s. - >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) - >>> output = tf.sparse.bincount(data, binary_output=True, axis=-1) + >>> dense = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> output = tf.sparse.bincount(dense, binary_count=True, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( [[ 0 10] @@ -137,42 +136,20 @@ def sparse_bincount(values, values=tf.Tensor([1 1 1 1 1 1], shape=(6,), dtype=int64), dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) - **Weighted bin-counting** - - This example takes two inputs - a values tensor and a weights tensor. These - tensors must be identically shaped, and have the same row splits or indices - in the case of RaggedTensors or SparseTensors. When performing a weighted - count, the op will output a SparseTensor where the value of (i, j) is the - sum of the values in the weight tensor's batch i in the locations where - the values tensor has the value j. In this case, the output dtype is the - same as the dtype of the weights tensor. - - >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) - >>> weights = [[2, 0.25, 15, 0.5], [2, 17, 3, 0.9]] - >>> output = tf.sparse.bincount(data, weights=weights, axis=-1) - >>> print(output) - SparseTensor(indices=tf.Tensor( - [[ 0 10] - [ 0 20] - [ 0 30] - [ 1 11] - [ 1 101] - [ 1 10001]], shape=(6, 2), dtype=int64), - values=tf.Tensor([2. 0.75 15. 5. 17. 0.9], shape=(6,), dtype=float32), - dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) - """ with ops.name_scope(name, "count", [values, weights]): if not isinstance(values, sparse_tensor.SparseTensor): values = ragged_tensor.convert_to_tensor_or_ragged_tensor( values, name="values") - if weights is not None: - if not isinstance(weights, sparse_tensor.SparseTensor): - weights = ragged_tensor.convert_to_tensor_or_ragged_tensor( - weights, name="weights") - if weights is not None and binary_output: - raise ValueError("binary_output and weights are mutually exclusive.") + if weights is not None and binary_count: + raise ValueError("binary_count and weights are mutually exclusive.") + + if weights is None: + weights = [] + output_type = dtypes.int64 + else: + output_type = dtypes.float32 if axis is None: axis = 0 @@ -185,114 +162,38 @@ def sparse_bincount(values, maxlength_value = maxlength if maxlength is not None else -1 if axis == 0: - if isinstance(values, sparse_tensor.SparseTensor): - if weights is not None: - weights = validate_sparse_weights(values, weights) - values = values.values - elif isinstance(values, ragged_tensor.RaggedTensor): - if weights is not None: - weights = validate_ragged_weights(values, weights) + if isinstance(values, + (sparse_tensor.SparseTensor, ragged_tensor.RaggedTensor)): values = values.values else: - if weights is not None: - weights = array_ops.reshape(weights, [-1]) values = array_ops.reshape(values, [-1]) if isinstance(values, sparse_tensor.SparseTensor): - weights = validate_sparse_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.sparse_count_sparse_output( values.indices, values.values, values.dense_shape, - weights, + weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_output=binary_output) + binary_count=binary_count, + output_type=output_type) elif isinstance(values, 
ragged_tensor.RaggedTensor): - weights = validate_ragged_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.ragged_count_sparse_output( values.row_splits, values.values, - weights, + weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_output=binary_output) + binary_count=binary_count, + output_type=output_type) else: - weights = validate_dense_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.dense_count_sparse_output( values, weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_output=binary_output) + binary_count=binary_count, + output_type=output_type) return sparse_tensor.SparseTensor(c_ind, c_val, c_shape) - - -def validate_dense_weights(values, weights): - """Validates the passed weight tensor or creates an empty one.""" - if weights is None: - return array_ops.constant([], dtype=values.dtype) - - if not isinstance(weights, ops.Tensor): - raise ValueError( - "`weights` must be a tf.Tensor if `values` is a tf.Tensor.") - - return weights - - -def validate_sparse_weights(values, weights): - """Validates the passed weight tensor or creates an empty one.""" - if weights is None: - return array_ops.constant([], dtype=values.values.dtype) - - if not isinstance(weights, sparse_tensor.SparseTensor): - raise ValueError( - "`weights` must be a SparseTensor if `values` is a SparseTensor.") - - checks = [] - if weights.dense_shape is not values.dense_shape: - checks.append( - check_ops.assert_equal( - weights.dense_shape, - values.dense_shape, - message="'weights' and 'values' must have the same dense shape.")) - if weights.indices is not values.indices: - checks.append( - check_ops.assert_equal( - weights.indices, - values.indices, - message="'weights' and 'values' must have the same indices.") - ) - if checks: - with ops.control_dependencies(checks): - weights = array_ops.identity(weights.values) - else: - weights = weights.values - - return weights - - -def validate_ragged_weights(values, weights): - """Validates the passed weight tensor or creates an empty one.""" - if weights is None: - return array_ops.constant([], dtype=values.values.dtype) - - if not isinstance(weights, ragged_tensor.RaggedTensor): - raise ValueError( - "`weights` must be a RaggedTensor if `values` is a RaggedTensor.") - - checks = [] - if weights.row_splits is not values.row_splits: - checks.append( - check_ops.assert_equal( - weights.row_splits, - values.row_splits, - message="'weights' and 'values' must have the same row splits.")) - if checks: - with ops.control_dependencies(checks): - weights = array_ops.identity(weights.values) - else: - weights = weights.values - - return weights diff --git a/tensorflow/python/ops/bincount_test.py b/tensorflow/python/ops/bincount_test.py index 839af8dcc35..776b65b72d0 100644 --- a/tensorflow/python/ops/bincount_test.py +++ b/tensorflow/python/ops/bincount_test.py @@ -21,8 +21,6 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np -from tensorflow.python.eager import context -from tensorflow.python.framework import errors from tensorflow.python.ops import bincount from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_factory_ops @@ -67,7 +65,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 6], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_maxlength_binary", 
"x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -75,7 +73,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 7], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_minlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -84,7 +82,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 9], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -93,40 +91,40 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 8], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_no_maxlength_weights", "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], - "expected_values": [2, 1, 0.5, 9, 3], + "expected_values": [1, 2, 3, 8, 5], "expected_shape": [2, 6], - "weights": [[0.5, 1, 2], [3, 4, 5]] + "weights": [0.5, 1, 2, 3, 4, 5] }, { "testcase_name": "_maxlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "maxlength": 7, "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], - "expected_values": [2, 1, 0.5, 3, 9], + "expected_values": [1, 2, 3, 0.5, 8], "expected_shape": [2, 7], - "weights": [[0.5, 1, 2, 11], [7, 3, 4, 5]] + "weights": [0.5, 1, 2, 3, 4, 5, 6] }, { "testcase_name": "_minlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 9, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [2, 1, 0.5, 3, 5, 13, 4], + "expected_values": [1, 2, 3, 7, 0.5, 8, 7], "expected_shape": [2, 9], - "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_minlength_larger_values_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 3, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [2, 1, 0.5, 3, 5, 13, 4], + "expected_values": [1, 2, 3, 7, 0.5, 8, 7], "expected_shape": [2, 8], - "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_1d", "x": np.array([3, 2, 1, 1], dtype=np.int32), @@ -148,7 +146,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, minlength=None, maxlength=None, - binary_output=False, + binary_count=False, weights=None, axis=-1): y = bincount.sparse_bincount( @@ -156,7 +154,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): weights=weights, minlength=minlength, maxlength=maxlength, - binary_output=binary_output, + binary_count=binary_count, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -218,7 +216,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], "expected_values": [1, 1, 1, 1], "expected_shape": [3, 6], - "binary_output": + "binary_count": True, }, { @@ -232,7 +230,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 7], "maxlength": 7, - "binary_output": + "binary_count": True, }, { @@ -246,7 +244,7 @@ class 
TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 9], "minlength": 9, - "binary_output": + "binary_count": True, }, { @@ -260,7 +258,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 8], "minlength": 3, - "binary_output": + "binary_count": True, }, { @@ -270,10 +268,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [2, 6, 7, 10], + "expected_values": [1, 3, 8, 5], "expected_shape": [3, 6], - "weights": - np.array([[6, 0, 2, 0], [0, 0, 0, 0], [10, 0, 3.5, 3.5]]), + "weights": [0.5, 1, 2, 3, 4, 5] }, { "testcase_name": @@ -282,12 +279,11 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 7, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [2, 6, 7, 10], + "expected_values": [1, 3, 8, 5], "expected_shape": [3, 7], "maxlength": 7, - "weights": - np.array([[6, 0, 2, 0], [0, 0, 14, 0], [10, 0, 3.5, 3.5]]), + "weights": [0.5, 1, 2, 3, 4, 5, 6] }, { "testcase_name": @@ -296,12 +292,11 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [2, 6, 14, 6.5, 10], + "expected_values": [1, 3, 7, 8, 5], "expected_shape": [3, 9], "minlength": 9, - "weights": - np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": @@ -310,12 +305,11 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [2, 6, 14, 6.5, 10], + "expected_values": [1, 3, 7, 8, 5], "expected_shape": [3, 8], "minlength": 3, - "weights": - np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_1d", @@ -344,17 +338,16 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_output=False, + binary_count=False, weights=None, axis=-1): x_sparse = sparse_ops.from_dense(x) - w_sparse = sparse_ops.from_dense(weights) if weights is not None else None y = bincount.sparse_bincount( x_sparse, - weights=w_sparse, + weights=weights, minlength=minlength, maxlength=maxlength, - binary_output=binary_output, + binary_count=binary_count, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -400,7 +393,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 6], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_maxlength_binary", @@ -409,7 +402,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 7], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_minlength_binary", @@ -419,13 +412,13 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [5, 9], - 
"binary_output": True, + "binary_count": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "minlength": 3, - "binary_output": True, + "binary_count": True, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], @@ -435,18 +428,18 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "testcase_name": "_no_maxlength_weights", "x": [[], [], [3, 0, 1], [], [5, 0, 4, 4]], "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 2, 6, 0.25, 8, 10], + "expected_values": [0.5, 1, 3, 0.5, 8, 5], "expected_shape": [5, 6], - "weights": [[], [], [6, 0.5, 2], [], [10, 0.25, 5, 3]], + "weights": [0.5, 1, 2, 3, 4, 5] }, { "testcase_name": "_maxlength_weights", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "maxlength": 7, "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 2, 6, 0.25, 8, 10], + "expected_values": [0.5, 1, 3, 0.5, 8, 5], "expected_shape": [5, 7], - "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], + "weights": [0.5, 1, 2, 3, 4, 5, 6] }, { "testcase_name": "_minlength_weights", @@ -454,9 +447,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 9, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], + "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], "expected_shape": [5, 9], - "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_minlength_larger_values_weights", @@ -464,9 +457,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 3, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], + "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], "expected_shape": [5, 8], - "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_1d", @@ -491,114 +484,21 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_output=False, + binary_count=False, weights=None, axis=-1): x_ragged = ragged_factory_ops.constant(x) - w = ragged_factory_ops.constant(weights) if weights is not None else None y = bincount.sparse_bincount( x_ragged, - weights=w, + weights=weights, minlength=minlength, maxlength=maxlength, - binary_output=binary_output, + binary_count=binary_count, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) self.assertAllEqual(expected_shape, y.dense_shape) -class TestSparseCountFailureModes(test.TestCase): - - def test_dense_input_sparse_weights_fails(self): - x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - weights = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_dense_input_ragged_weights_fails(self): - x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) - with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_dense_input_wrong_shape_fails(self): - x = 
np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - weights = np.array([[3, 2], [5, 4], [4, 3]]) - # Note: Eager mode and graph mode throw different errors here. Graph mode - # will fail with a ValueError from the shape checking logic, while Eager - # will fail with an InvalidArgumentError from the kernel itself. - if context.executing_eagerly(): - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "must have the same shape"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - else: - with self.assertRaisesRegexp(ValueError, "both shapes must be equal"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_dense_weights_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_ragged_weights_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) - with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_wrong_indices_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = sparse_ops.from_dense( - np.array([[3, 1, 0, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "must have the same indices"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_too_many_indices_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = sparse_ops.from_dense( - np.array([[3, 1, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "Incompatible shapes"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_wrong_shape_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4], [0, 0, 0, 0]], - dtype=np.int32)) - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "must have the same dense shape"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_ragged_input_dense_weights_fails(self): - x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) - weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_ragged_input_sparse_weights_fails(self): - x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) - weights = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_ragged_input_different_shape_fails(self): - x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) - weights = ragged_factory_ops.constant([[6, 0.5, 2], 
[], [10, 0.25, 5, 3]]) - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "must have the same row splits"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - if __name__ == "__main__": test.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 44fb74ac63a..05b8842be66 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt index f8f8edb26a8..4c4f6c62291 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt @@ -14,7 +14,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 44fb74ac63a..05b8842be66 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, 
defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt index 67235bb2cf2..a9ad81920dd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt @@ -10,7 +10,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" From 03f3e8153c405578fdd6aea6694569859eecaac9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 23:46:17 -0700 Subject: [PATCH 0570/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311478670 Change-Id: Ib8c15d5cba307629a0d8fc55e07efc401502899e --- tensorflow/go/op/wrappers.go | 124 ++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index e6725269279..c6d67c9ad44 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -4715,7 +4715,7 @@ type DenseCountSparseOutputAttr func(optionalAttr) // DenseCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: Minimum value to count. Can be set to -1 for no minimum. +// value: int32; minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4727,7 +4727,7 @@ func DenseCountSparseOutputMinlength(value int64) DenseCountSparseOutputAttr { // DenseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: Maximum value to count. Can be set to -1 for no maximum. +// value: int32; maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4742,20 +4742,20 @@ func DenseCountSparseOutputMaxlength(value int64) DenseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// values: Tensor containing data to count. 
-// weights: A Tensor of the same shape as indices containing per-index weight values. May -// also be the empty tensor if no weights are used. -// binary_output: Whether to output the number of occurrences of each value or 1. +// values: int32 or int64; Tensor containing data to count. +// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. +// binary_count: bool; whether to output the number of occurrences of each value or 1. +// output_type: dtype; dtype of the output values tensor. // // Returns: -// output_indices: Indices tensor for the resulting sparse tensor object. -// output_values: Values tensor for the resulting sparse tensor object. -// output_dense_shape: Shape tensor for the resulting sparse tensor object. -func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_output bool, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: int64; indices tensor for the resulting sparse tensor object. +// output_values: int64 or float32; values tensor for the resulting sparse tensor object. +// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. +func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_output": binary_output} + attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} for _, a := range optional { a(attrs) } @@ -8607,7 +8607,7 @@ type RaggedCountSparseOutputAttr func(optionalAttr) // RaggedCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: Minimum value to count. Can be set to -1 for no minimum. +// value: int32; minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8619,7 +8619,7 @@ func RaggedCountSparseOutputMinlength(value int64) RaggedCountSparseOutputAttr { // RaggedCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: Maximum value to count. Can be set to -1 for no maximum. +// value: int32; maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8634,27 +8634,33 @@ func RaggedCountSparseOutputMaxlength(value int64) RaggedCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// splits: Tensor containing the row splits of the ragged tensor to count. -// values: Tensor containing values of the sparse tensor to count. -// weights: A Tensor of the same shape as indices containing per-index weight values. -// May also be the empty tensor if no weights are used. -// binary_output: Whether to output the number of occurrences of each value or 1. +// splits: int64; Tensor containing the row splits of the ragged tensor to count. +// values: int32 or int64; Tensor containing values of the sparse tensor to count. +// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. +// binary_count: bool; whether to output the number of occurrences of each value or 1. +// output_type: dtype; dtype of the output values tensor. 
// // Returns: -// output_indices: Indices tensor for the resulting sparse tensor object. -// output_values: Values tensor for the resulting sparse tensor object. -// output_dense_shape: Shape tensor for the resulting sparse tensor object. +// output_indices: int64; indices tensor for the resulting sparse tensor object. +// output_values: int64 or float32; values tensor for the resulting sparse tensor object. +// END +// } +// out_arg { +// name: "output_dense_shape" +// description: <= -1 @@ -13712,7 +13718,7 @@ func SparseCountSparseOutputMinlength(value int64) SparseCountSparseOutputAttr { // SparseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: Maximum value to count. Can be set to -1 for no maximum. +// value: int32; maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -13727,22 +13733,22 @@ func SparseCountSparseOutputMaxlength(value int64) SparseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// indices: Tensor containing the indices of the sparse tensor to count. -// values: Tensor containing values of the sparse tensor to count. -// dense_shape: Tensor containing the dense shape of the sparse tensor to count. -// weights: A Tensor of the same shape as indices containing per-index weight values. -// May also be the empty tensor if no weights are used. -// binary_output: Whether to output the number of occurrences of each value or 1. +// indices: int64; Tensor containing the indices of the sparse tensor to count. +// values: int32 or int64; Tensor containing values of the sparse tensor to count. +// dense_shape: int64; Tensor containing the dense shape of the sparse tensor to count. +// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. +// binary_count: bool; whether to output the number of occurrences of each value or 1. +// output_type: dtype; dtype of the output values tensor. // // Returns: -// output_indices: Indices tensor for the resulting sparse tensor object. -// output_values: Values tensor for the resulting sparse tensor object. -// output_dense_shape: Shape tensor for the resulting sparse tensor object. -func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_output bool, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: int64; indices tensor for the resulting sparse tensor object. +// output_values: int64 or float32; values tensor for the resulting sparse tensor object. +// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. +func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_output": binary_output} + attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} for _, a := range optional { a(attrs) } @@ -18969,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21627,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22335,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22600,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22715,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25648,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25962,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45534,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47474,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47545,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48534,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1}
+// If not specified, defaults to {i:1 i:1 i:1 i:1}
 func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr {
 	return func(m optionalAttr) {
 		m["dilations"] = value

From d33cb73389c4198c01d8dac55cbbd6620abe7d4b Mon Sep 17 00:00:00 2001
From: Feng Liu
Date: Wed, 13 May 2020 23:48:03 -0700
Subject: [PATCH 0571/1533] Expose inference type in the mlir quantizer

This is to prepare for the 16-bit activation quantization release. The data
type specified by this flag is only applied to the activations.

PiperOrigin-RevId: 311478782
Change-Id: I5f63f0508011cc0b1b47a0debb35c17d3284eae9
---
 tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 6 ++--
 .../lite/quantization/lite/quantize_model.cc | 10 ++++--
 .../lite/quantization/lite/quantize_model.h | 4 ++-
 .../lite/quantization/lite/tfl_quantizer.cc | 3 +-
 tensorflow/lite/python/convert.py | 7 ++--
 tensorflow/lite/python/lite_v2_test.py | 36 +++++++++++++++++++
 tensorflow/lite/python/wrap_toco.py | 6 ++--
 tensorflow/lite/toco/python/BUILD | 1 +
 .../lite/toco/python/toco_python_api.cc | 21 +++++++++--
 tensorflow/lite/toco/python/toco_python_api.h | 2 +-
 .../python/lite/toco_python_api_wrapper.cc | 7 ++--
 11 files changed, 84 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
index 8a949a45e2d..a585b8e1520 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
@@ -414,9 +414,9 @@ class TFL_ConvOp :
   }];
   let arguments = (
-    ins TFL_TensorOf<[F32, QI8, QUI8]>:$input,
+    ins TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$input,
     TFL_TensorOf<[F32, QI8, QUI8]>:$filter,
-    TFL_TensorOfOrNone<[F32, I32]>:$bias,
+    TFL_TensorOfOrNone<[F32, I32, I64]>:$bias,
     I32Attr:$dilation_h_factor,
     I32Attr:$dilation_w_factor,
     TFL_AFAttr:$fused_activation_function,
@@ -425,7 +425,7 @@ class TFL_ConvOp :
     I32Attr:$stride_w
   );
-  let results = (outs TFL_TensorOf<[F32, QI8, QUI8]>:$output);
+  let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$output);
   let hasOptions = 0b1;
 }
diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc
index 0ac3fa419bc..a2e3c065113 100644
--- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc
@@ -30,6 +30,7 @@ limitations under the License.
#include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { namespace lite { @@ -38,6 +39,7 @@ namespace lite { TfLiteStatus QuantizeModel( const tflite::ModelT& input_model, const tflite::TensorType& input_type, const tflite::TensorType& output_type, + const tflite::TensorType& inference_type, const std::unordered_set& operator_names, bool disable_per_channel, bool fully_quantize, flatbuffers::FlatBufferBuilder* builder, @@ -73,7 +75,7 @@ TfLiteStatus QuantizeModel( // Apply quantization passes PassManager pm(module->getContext()); TFL::QuantizationSpecs quant_specs; - quant_specs.inference_type = tensorflow::DT_QINT8; + quant_specs.inference_type = tflite::TflTypeToTfType(inference_type); quant_specs.post_training_quantization = true; quant_specs.disable_per_channel = disable_per_channel; @@ -81,8 +83,10 @@ TfLiteStatus QuantizeModel( auto input_tf_type = tflite::TflTypeToTfType(input_type); if (input_tf_type == tensorflow::DT_FLOAT) { emit_adaptor = true; - } else if (input_tf_type == tensorflow::DT_UINT8) { - quant_specs.inference_type = tensorflow::DT_QUINT8; + } else if (input_tf_type == tensorflow::DT_UINT8 || + input_tf_type == tensorflow::DT_INT8 || + input_tf_type == tensorflow::DT_INT16) { + quant_specs.inference_type = input_tf_type; } pm.addPass(TFL::CreatePrepareQuantizePass(quant_specs)); diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h index 578aa6438de..d60df56b473 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h @@ -26,11 +26,13 @@ namespace mlir { namespace lite { // Quantize the `input_model` and write the result to a flatbuffer `builder`. -// The `input_type` and `output_type` can be float32/qint8/int8. +// The `input_type`, `output_type` and `inference_type` can be +// float32/qint8/int8/int16. // Return partially quantized model if `fully_quantize` is false. 
TfLiteStatus QuantizeModel( const tflite::ModelT& input_model, const tflite::TensorType& input_type, const tflite::TensorType& output_type, + const tflite::TensorType& inference_type, const std::unordered_set& operator_names, bool disable_per_channel, bool fully_quantize, flatbuffers::FlatBufferBuilder* builder, diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc index 77bd87a3c03..5bd1b71e631 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc @@ -46,7 +46,8 @@ TfLiteStatus QuantizeAnnotatedModel(llvm::StringRef buffer, tflite::StderrReporter error_reporter; return mlir::lite::QuantizeModel( - *model, tflite::TensorType_INT8, tflite::TensorType_INT8, {}, + *model, tflite::TensorType_INT8, tflite::TensorType_INT8, + tflite::TensorType_INT8, {}, /*disable_per_channel=*/false, /*fully_quantize=*/true, builder, &error_reporter); } diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index ae70afd6962..6b7a32f1bcc 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -108,7 +108,8 @@ class ConverterError(Exception): pass -def mlir_quantize(input_data_str, disable_per_channel=False): +def mlir_quantize(input_data_str, disable_per_channel=False, + inference_type=_types_pb2.INT8): """Quantize `input_data_str` with calibration results. Args: @@ -116,13 +117,15 @@ def mlir_quantize(input_data_str, disable_per_channel=False): calibration results). disable_per_channel: Bool indicating whether to do per-channel or per-tensor quantization + inference_type: Data type for the activations. The default value is int8. Returns: Quantized model in serialized form (e.g. a TFLITE model) with floating-point inputs and outputs. """ return wrap_toco.wrapped_experimental_mlir_quantize(input_data_str, - disable_per_channel) + disable_per_channel, + inference_type) def mlir_sparsify(input_data_str): diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py index 4768892f359..9af37df2975 100644 --- a/tensorflow/lite/python/lite_v2_test.py +++ b/tensorflow/lite/python/lite_v2_test.py @@ -29,7 +29,9 @@ import tensorflow as tf from tensorflow.lite.python import lite from tensorflow.lite.python import lite_v2_test_util +from tensorflow.lite.python.convert import mlir_quantize from tensorflow.lite.python.interpreter import Interpreter +from tensorflow.lite.toco import types_pb2 as _types_pb2 from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.keras.layers import recurrent @@ -204,6 +206,40 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): # Ensure that the quantized weights tflite model is smaller. self.assertLess(len(quantized_tflite), len(float_tflite)) + def testCalibrateAndQuantizeBuiltinInt16(self): + func, calibration_gen = self._getCalibrationQuantizeModel() + + # Convert float model. + float_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + float_tflite = float_converter.convert() + self.assertTrue(float_tflite) + + converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + # TODO(b/156309549): We should add INT16 to the builtin types. 
+ converter.target_spec.supported_ops = [ + lite.OpsSet.TFLITE_BUILTINS_INT8 + ] + converter.representative_dataset = calibration_gen + converter._experimental_calibrate_only = True + calibrated_tflite = converter.convert() + quantized_tflite = mlir_quantize(calibrated_tflite, + inference_type=_types_pb2.QUANTIZED_INT16) + + self.assertTrue(quantized_tflite) + + # The default input and output types should be float. + interpreter = Interpreter(model_content=quantized_tflite) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + self.assertLen(input_details, 1) + self.assertEqual(np.float32, input_details[0]['dtype']) + output_details = interpreter.get_output_details() + self.assertLen(output_details, 1) + self.assertEqual(np.float32, output_details[0]['dtype']) + + # Ensure that the quantized weights tflite model is smaller. + self.assertLess(len(quantized_tflite), len(float_tflite)) + def _getTrainingTimeQuantizedModel(self): class QLinear(tf.keras.layers.Layer): diff --git a/tensorflow/lite/python/wrap_toco.py b/tensorflow/lite/python/wrap_toco.py index 3c1f98ff42d..8f72cc8cbbd 100644 --- a/tensorflow/lite/python/wrap_toco.py +++ b/tensorflow/lite/python/wrap_toco.py @@ -43,10 +43,12 @@ def wrapped_get_potentially_supported_ops(): return _pywrap_toco_api.TocoGetPotentiallySupportedOps() -def wrapped_experimental_mlir_quantize(input_data_str, disable_per_channel): +def wrapped_experimental_mlir_quantize(input_data_str, disable_per_channel, + inference_type): """Wraps experimental mlir quantize model.""" return _pywrap_toco_api.ExperimentalMlirQuantizeModel(input_data_str, - disable_per_channel) + disable_per_channel, + inference_type) def wrapped_experimental_mlir_sparsify(input_data_str): diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD index bea582d83a5..7dfa714d1d6 100644 --- a/tensorflow/lite/toco/python/BUILD +++ b/tensorflow/lite/toco/python/BUILD @@ -54,6 +54,7 @@ cc_library( "//tensorflow/compiler/mlir/lite/python:saved_model_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/lite/quantization/lite:quantize_model", "//tensorflow/compiler/mlir/lite/sparsity:sparsify_model", + "//tensorflow/lite/toco:types_proto_cc", ] + select({ # This is required when running `tflite_convert` from `bazel`. # It requires to link with TensorFlow Ops to get the op definitions. diff --git a/tensorflow/lite/toco/python/toco_python_api.cc b/tensorflow/lite/toco/python/toco_python_api.cc index aafd14f9da8..441aabf0ffe 100644 --- a/tensorflow/lite/toco/python/toco_python_api.cc +++ b/tensorflow/lite/toco/python/toco_python_api.cc @@ -41,6 +41,7 @@ limitations under the License. 
#include "tensorflow/lite/toco/toco_tooling.h" #include "tensorflow/lite/toco/toco_types.h" #include "tensorflow/lite/toco/tooling_util.h" +#include "tensorflow/lite/toco/types.pb.h" namespace toco { @@ -229,7 +230,7 @@ PyObject* TocoGetPotentiallySupportedOps() { } PyObject* MlirQuantizeModel(PyObject* data, bool disable_per_channel, - bool fully_quantize) { + bool fully_quantize, int inference_type) { using tflite::interpreter_wrapper::PythonErrorReporter; char* buf = nullptr; Py_ssize_t length; @@ -249,11 +250,25 @@ PyObject* MlirQuantizeModel(PyObject* data, bool disable_per_channel, auto tflite_model = absl::make_unique(); model->GetModel()->UnPackTo(tflite_model.get(), nullptr); + tflite::TensorType inference_tensor_type; + switch (inference_type) { + case toco::IODataType::QUANTIZED_INT16: + inference_tensor_type = tflite::TensorType_INT16; + break; + case toco::IODataType::QUANTIZED_UINT8: + inference_tensor_type = tflite::TensorType_UINT8; + break; + case toco::IODataType::INT8: + inference_tensor_type = tflite::TensorType_INT8; + break; + default: + return nullptr; + } flatbuffers::FlatBufferBuilder builder; auto status = mlir::lite::QuantizeModel( *tflite_model, tflite::TensorType::TensorType_FLOAT32, - tflite::TensorType::TensorType_FLOAT32, {}, disable_per_channel, - fully_quantize, &builder, error_reporter.get()); + tflite::TensorType::TensorType_FLOAT32, inference_tensor_type, {}, + disable_per_channel, fully_quantize, &builder, error_reporter.get()); if (status != kTfLiteOk) { error_reporter->exception(); diff --git a/tensorflow/lite/toco/python/toco_python_api.h b/tensorflow/lite/toco/python/toco_python_api.h index 7afb097fd4a..058ae9fb942 100644 --- a/tensorflow/lite/toco/python/toco_python_api.h +++ b/tensorflow/lite/toco/python/toco_python_api.h @@ -44,7 +44,7 @@ PyObject* TocoGetPotentiallySupportedOps(); // is specified by the calibration data are not sufficient to quantize the // model. PyObject* MlirQuantizeModel(PyObject* data, bool disable_per_channel, - bool fully_quantize); + bool fully_quantize, int inference_type); // Sparsifies model to encode sparse tensors with proper format. Throws error if // sparsification fails. diff --git a/tensorflow/python/lite/toco_python_api_wrapper.cc b/tensorflow/python/lite/toco_python_api_wrapper.cc index e6e0e111ec4..b77200a3bee 100644 --- a/tensorflow/python/lite/toco_python_api_wrapper.cc +++ b/tensorflow/python/lite/toco_python_api_wrapper.cc @@ -57,12 +57,13 @@ PYBIND11_MODULE(_pywrap_toco_api, m) { m.def( "ExperimentalMlirQuantizeModel", [](py::object input_contents_txt_raw, bool disable_per_channel, - bool fully_quantize) { + bool fully_quantize, int inference_type) { return tensorflow::PyoOrThrow(toco::MlirQuantizeModel( - input_contents_txt_raw.ptr(), disable_per_channel, fully_quantize)); + input_contents_txt_raw.ptr(), disable_per_channel, fully_quantize, + inference_type)); }, py::arg("input_contents_txt_raw"), py::arg("disable_per_channel") = false, - py::arg("fully_quantize") = true, + py::arg("fully_quantize") = true, py::arg("inference_type") = 9, R"pbdoc( Returns a quantized model. 
)pbdoc"); From d36ad412f43672e84366c34428663553238e85c1 Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Thu, 14 May 2020 00:11:31 -0700 Subject: [PATCH 0572/1533] Add IOS_BENCHMARK enum value to BenchmarkType PiperOrigin-RevId: 311481101 Change-Id: I142b5b6231a817df6b688786fa508379ce06dd79 --- tensorflow/core/util/test_log.proto | 41 +++++++++++++++-------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/tensorflow/core/util/test_log.proto b/tensorflow/core/util/test_log.proto index ddb7a0275ac..6d3af02e657 100644 --- a/tensorflow/core/util/test_log.proto +++ b/tensorflow/core/util/test_log.proto @@ -1,6 +1,8 @@ // Protocol messages for describing the results of benchmarks and unit tests. syntax = "proto3"; +package tensorflow; + import "google/protobuf/any.proto"; import "google/protobuf/wrappers.proto"; @@ -9,14 +11,12 @@ option java_outer_classname = "TestLogProtos"; option java_multiple_files = true; option java_package = "org.tensorflow.util.testlog"; -package tensorflow; - message EntryValue { oneof kind { double double_value = 1; string string_value = 2; } -}; +} message MetricEntry { // Metric name @@ -62,7 +62,7 @@ message BenchmarkEntry { // Metric name, value and expected range. This can include accuracy metrics // typically used to determine whether the accuracy test has passed repeated MetricEntry metrics = 7; -}; +} message BenchmarkEntries { repeated BenchmarkEntry entry = 1; @@ -72,7 +72,7 @@ message BuildConfiguration { string mode = 1; // opt, dbg, etc repeated string cc_flags = 2; // CC compiler flags, if known repeated string opts = 3; // Bazel compilation options, if known -}; +} message CommitId { oneof kind { @@ -85,7 +85,7 @@ message CommitId { string snapshot = 3; // Changelist tested if the change list is not already submitted. int64 pending_changelist = 4; -}; +} message CPUInfo { int64 num_cores = 1; @@ -105,7 +105,7 @@ message CPUInfo { // Cache sizes (in bytes), e.g. "L2": 262144 (for 256KB) map cache_size = 6; -}; +} message MemoryInfo { int64 total = 1; // Total virtual memory in bytes @@ -113,26 +113,26 @@ message MemoryInfo { } message GPUInfo { - string model = 1; // e.g. "Tesla K40c" - string uuid = 2; // Final entry in output of "nvidia-smi -L" + string model = 1; // e.g. "Tesla K40c" + string uuid = 2; // Final entry in output of "nvidia-smi -L" string bus_id = 3; // e.g. "0000:04:00.0" -}; +} message PlatformInfo { - string bits = 1; // e.g. '64bit' - string linkage = 2; // e.g. 'ELF' - string machine = 3; // e.g. 'i386' - string release = 4; // e.g. '3.13.0-76-generic' - string system = 5; // e.g. 'Linux' - string version = 6; // e.g. '#120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016' -}; + string bits = 1; // e.g. '64bit' + string linkage = 2; // e.g. 'ELF' + string machine = 3; // e.g. 'i386' + string release = 4; // e.g. '3.13.0-76-generic' + string system = 5; // e.g. 'Linux' + string version = 6; // e.g. '#120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016' +} message AvailableDeviceInfo { // Matches DeviceAttributes string name = 1; // Device name. string type = 2; // Device type, e.g. 'CPU' or 'GPU'. int64 memory_limit = 3; // Memory capacity in bytes. string physical_description = 4; // The physical description of this device. -}; +} message MachineConfiguration { // Host name of machine that ran the benchmark. @@ -154,7 +154,7 @@ message MachineConfiguration { repeated AvailableDeviceInfo available_device_info = 5; MemoryInfo memory_info = 6; -}; +} // Run-specific items such as arguments to the test / benchmark. 
message RunConfiguration { @@ -206,6 +206,7 @@ message TestResults { PYTHON_BENCHMARK = 2; ANDROID_BENCHMARK = 3; EDGE_BENCHMARK = 4; + IOS_BENCHMARK = 5; } BenchmarkType benchmark_type = 10; @@ -219,4 +220,4 @@ message TestResults { // TensorFlow version this benchmark runs against. // This can be either set to full version or just the major version. string tf_version = 12; -}; +} From da78c46560fbccec8e61039e6b836fee5ebdc8c1 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Thu, 14 May 2020 00:31:22 -0700 Subject: [PATCH 0573/1533] Fix comment to reflect actual logic. PiperOrigin-RevId: 311483016 Change-Id: Ib66f41fd4c470bdcd516f4f03de7d78fb8ddde1c --- tensorflow/lite/graph_info.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/graph_info.cc b/tensorflow/lite/graph_info.cc index 875a03af817..a419a56a9e6 100644 --- a/tensorflow/lite/graph_info.cc +++ b/tensorflow/lite/graph_info.cc @@ -191,11 +191,11 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl { std::vector* node_subsets_; std::vector node_type_; // Maps from tensor index to the epoch in which it is assigned. Also special - // negative values of kEpochNotAssigned if not assigned, kEpochNotReady if it - // is an input or constant. + // negative values of kEpochNotReady if not assigned, kEpochAlwaysReady if it + // is an input to the whole model or a constant that has no dependencies. std::vector tensor_epochs_; // Maps from tensor index to the epoch in which it is assigned. Also special - // negative values of kEpochNotAssigned if not assigned. + // negative values of kEpochNotReady if not assigned. std::vector node_epochs_; }; From ca18db7f3f5057bb83c41f4710d7a6a75224300d Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Thu, 14 May 2020 01:38:34 -0700 Subject: [PATCH 0574/1533] Return a meaningful error for dynamic shape inputs with outside compilation head extraction in TPUs. PiperOrigin-RevId: 311490072 Change-Id: Idc7bf1764aba1fcbfcf830e36a5b575b387923d7 --- .../python/distribute/tpu_strategy_test.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index de4c975d5ef..6c93e29c028 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -28,6 +28,7 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import remote from tensorflow.python.eager import test +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -140,6 +141,9 @@ class TPUStrategyTest(test.TestCase): # for non-local TPU. if FLAGS.tpu: self.skipTest("Recovery fails for non-local TPU, see b/148150981") + + # Disable automatic outside compilation. + config.set_soft_device_placement(False) strategy = get_tpu_strategy() @def_function.function @@ -164,6 +168,28 @@ class TPUStrategyTest(test.TestCase): good_run() + def test_dynamic_shape_with_outside_compilation_failure(self): + # Enable automatic outside compilation. 
+ config.set_soft_device_placement(True) + strategy = get_tpu_strategy() + dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( + 2, drop_remainder=False) + dataset = strategy.experimental_distribute_dataset(dataset) + iterator = iter(dataset) + + @def_function.function + def train_fn(iterator): + + def step_fn(inputs): + _, inputs = inputs + return math_ops.reduce_sum(inputs) + + return strategy.experimental_local_results( + strategy.run(step_fn, args=(next(iterator),))) + + with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): + logging.info(train_fn(iterator)) + def test_computation_on_subset_cores(self): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) From b187ba0bcc04d471ee7cd60aaddbcdfc892e24c6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 01:46:54 -0700 Subject: [PATCH 0575/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/bfa200ebcf37 PiperOrigin-RevId: 311490759 Change-Id: Icd37195b07135947a26f185a8d2a1ddc1adf718c --- .../mlir/xla/tests/lhlo-fuse-linalg.mlir | 52 +++++++++---------- .../lhlo-legalize-select-and-scatter.mlir | 34 ++++++------ .../mlir/xla/tests/lhlo-legalize-to-gpu.mlir | 2 +- .../lhlo-legalize-to-parallel-loops.mlir | 50 +++++++++--------- .../lhlo_legalize_to_parallel_loops.cc | 48 ++++++++--------- .../compiler/mlir/xla/transforms/passes.h | 4 +- 6 files changed, 95 insertions(+), 95 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index 013748fea28..99b1766e73c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -24,9 +24,9 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: addf // CHECK: linalg.generic @@ -36,9 +36,9 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: addf // TILED: linalg.generic @@ -46,8 +46,8 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // PLOOP-LABEL: func @fusion // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: addf // PLOOP: linalg.generic @@ -94,9 +94,9 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: linalg.generic // CHECK: subf @@ -107,9 +107,9 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: 
%[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: linalg.generic // TILED: subf @@ -118,8 +118,8 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // PLOOP-LABEL: func @fusion_of_three // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: linalg.generic // PLOOP: subf @@ -147,11 +147,11 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // CHECK-LABEL: func @fusion_4d // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: addf // CHECK: linalg.generic @@ -161,9 +161,9 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: addf // TILED: linalg.generic @@ -171,8 +171,8 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // PLOOP-LABEL: func @fusion_4d // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: addf // PLOOP: linalg.generic diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir index 5b763cde2ed..c640b395f4d 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir @@ -50,19 +50,19 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Parallel loop to initialize the output buffer. // CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref -// CHECK: loop.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C112]], [[C112]]) step ([[C1]], [[C1]]) { // CHECK: store [[INIT]], [[RESULT_BUF]]{{\[}}[[I]], [[J]]] -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // Parallel loop over source buffer to compute scattered values. -// CHECK: loop.parallel ([[II:%.*]], [[JJ:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[II:%.*]], [[JJ:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C56]], [[C56]]) step ([[C1]], [[C1]]) { // Window loop w.r.t. first dim. 
// CHECK: [[SEL_RES_I:%.*]]:4 -// CHECK-SAME: = loop.for [[WIN_I:%.*]] = [[C0]] to [[C3]] step [[C1]] +// CHECK-SAME: = scf.for [[WIN_I:%.*]] = [[C0]] to [[C3]] step [[C1]] // CHECK-SAME: iter_args( // CHECK-SAME: [[SEL_I_0:%.*]] = [[C0]], [[SEL_J_0:%.*]] = [[C0]], // CHECK-SAME: [[SEL_VAL_0:%.*]] = [[C0_F32]], @@ -71,7 +71,7 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Window loop w.r.t. second dim. // CHECK: [[SEL_RES_J:%.*]]:4 -// CHECK-SAME: = loop.for [[WIN_J:%.*]] = [[C0]] to [[C3]] step [[C1]] +// CHECK-SAME: = scf.for [[WIN_J:%.*]] = [[C0]] to [[C3]] step [[C1]] // CHECK-SAME: iter_args( // CHECK-SAME: [[SEL_I:%.*]] = [[SEL_I_0]], [[SEL_J:%.*]] = [[SEL_J_0]], // CHECK-SAME: [[SEL_VAL:%.*]] = [[SEL_VAL_0]], @@ -102,14 +102,14 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // be applied, current selected ivs (SEL_I, SEL_J) and value (SEL_VAL) are // returned in that case. // CHECK: [[IF_INBOUNDS_RES:%.*]]:4 -// CHECK-SAME: = loop.if [[INBOUNDS_1]] -> (index, index, f32, i1) { +// CHECK-SAME: = scf.if [[INBOUNDS_1]] -> (index, index, f32, i1) { // INBOUNDS-THEN-BODY, i.e. if INBOUNDS == true // CHECK: [[ARG_ELEM:%.*]] = load [[ARG_BUF]]{{\[}}[[ARG_I]], [[ARG_J]]] // CHECK: [[IF_INIT_RES:%.*]]:4 - // CHECK-SAME: = loop.if [[SEL_INIT]] -> (index, index, f32, i1) { + // CHECK-SAME: = scf.if [[SEL_INIT]] -> (index, index, f32, i1) { // INIT-THEN-BODY, i.e. INBOUNDS == true and INIT = true @@ -133,40 +133,40 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Depending on PRED, return ARG ivs & elem or current select ivs and value. - // CHECK: [[IF_PRED_RES:%.*]]:4 = loop.if [[PRED]] - // CHECK: loop.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], [[CTRUE]] + // CHECK: [[IF_PRED_RES:%.*]]:4 = scf.if [[PRED]] + // CHECK: scf.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], [[CTRUE]] // CHECK: } else { - // CHECK: loop.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], [[SEL_INIT]] + // CHECK: scf.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], [[SEL_INIT]] // CHECK: } // INIT-THEN-BODY yield. - // CHECK: loop.yield [[IF_PRED_RES]]#0, [[IF_PRED_RES]]#1, + // CHECK: scf.yield [[IF_PRED_RES]]#0, [[IF_PRED_RES]]#1, // CHECK-SAME: [[IF_PRED_RES]]#2, [[IF_PRED_RES]]#3 // INIT-ELSE-BODY, i.e. if INBOUNDS == TRUE and INIT == FALSE, returns ARG // ivs and element without computing Select function. - // CHECK: loop.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], + // CHECK: scf.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], // CHECK-SAME: [[CTRUE]] : index, index, f32, i1 // CHECK: } // INBOUNDS-THEN-BODY yield. - // CHECK: loop.yield [[IF_INIT_RES]]#0, [[IF_INIT_RES]]#1, [[IF_INIT_RES]]#2, + // CHECK: scf.yield [[IF_INIT_RES]]#0, [[IF_INIT_RES]]#1, [[IF_INIT_RES]]#2, // CHECK-SAME: [[IF_INIT_RES]]#3 : index, index, f32, i1 // CHECK: } // INBOUNDS-ELSE-REGION, i.e. if INBOUNDS == FALSE // We are in the pad area, return current iter_args. - // CHECK: loop.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], + // CHECK: scf.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], // CHECK-SAME: [[SEL_INIT]] : index, index, f32, i1 // CHECK: } // Window loop w.r.t. second dim yield. -// CHECK: loop.yield [[IF_INBOUNDS_RES]]#0, [[IF_INBOUNDS_RES]]#1, +// CHECK: scf.yield [[IF_INBOUNDS_RES]]#0, [[IF_INBOUNDS_RES]]#1, // CHECK-SAME: [[IF_INBOUNDS_RES]]#2, [[IF_INBOUNDS_RES]]#3 // CHECK: } // Window loop w.r.t. first dim yield. 
-// CHECK: loop.yield [[SEL_RES_J]]#0, [[SEL_RES_J]]#1, [[SEL_RES_J]]#2, +// CHECK: scf.yield [[SEL_RES_J]]#0, [[SEL_RES_J]]#1, [[SEL_RES_J]]#2, // CHECK-SAME: [[SEL_RES_J]]#3 : index, index, f32, i1 // CHECK: } @@ -196,4 +196,4 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // CHECK: atomic_yield [[RES]] : f32 // Parallel loop over source buffer yield -// CHECK: loop.yield +// CHECK: scf.yield diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir index 4d878cee6f4..16ffbf241b0 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir @@ -22,7 +22,7 @@ func @reduce(%arg: memref<100x10xf32>, // CHECK-DAG: %[[LB:.*]] = constant 0 : index // CHECK-DAG: %[[UB:.*]] = constant 10 : index // CHECK-DAG: %[[STEP:.*]] = constant 1 : index -// CHECK: loop.for %[[IDX1:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: scf.for %[[IDX1:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { // CHECK: %[[LHS:.*]] = linalg.slice %[[ARG2]][%[[IDX]]] : memref<100xf32>, index, memref // CHECK: %[[RHS:.*]] = linalg.slice %[[ARG0]][%[[IDX]], %[[IDX1]]] : memref<100x10xf32>, index, index, memref // CHECK: "xla_lhlo.add"(%[[LHS]], %[[RHS]], %[[LHS]]) : (memref, memref, memref) -> () diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir index cb169e060ef..32c367f97d6 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir @@ -22,13 +22,13 @@ func @reduce(%arg: memref<100x10x5xf32>, // CHECK-DAG: [[C10:%.*]] = constant 10 : index // CHECK-DAG: [[C100:%.*]] = constant 100 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C100]], [[C5]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[J:%.*]]) = // CHECK-SAME: ([[C0]]) to ([[C10]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] // CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref<100x10x5xf32> -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -37,12 +37,12 @@ func @reduce(%arg: memref<100x10x5xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] -// CHECK: loop.yield +// CHECK: scf.yield // ----- @@ -66,10 +66,10 @@ func @reduce_no_outer_loop(%arg: memref<100xf32>, // CHECK-DAG: [[C1:%.*]] = constant 1 : index // CHECK-DAG: [[C100:%.*]] = constant 100 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[I:%.*]]) = ([[C0]]) +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[I:%.*]]) = ([[C0]]) // 
CHECK-SAME: to ([[C100]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]]{{\[}}[[I]]{{\]}} -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -78,9 +78,9 @@ func @reduce_no_outer_loop(%arg: memref<100xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] +// CHECK: scf.reduce.return [[ACC_RESULT]] // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[C0]]] // ----- @@ -107,13 +107,13 @@ func @dynamic_reduce(%arg: memref, // CHECK: [[DIM1:%.*]] = dim [[ARG_BUF]], 1 : memref // CHECK: [[DIM2:%.*]] = dim [[ARG_BUF]], 2 : memref // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[DIM0]], [[DIM2]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[J:%.*]]) = // CHECK-SAME: ([[C0]]) to ([[DIM1]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] // CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -122,12 +122,12 @@ func @dynamic_reduce(%arg: memref, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] -// CHECK: loop.yield +// CHECK: scf.yield // ----- @@ -158,9 +158,9 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK-DAG: [[C56:%.*]] = constant 56 : index // CHECK-DAG: [[C112:%.*]] = constant 112 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref -// CHECK: loop.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C56]], [[C56]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel // CHECK-SAME: ([[IW:%.*]], [[JW:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C3]], [[C3]]) step ([[C1]], [[C1]]) // CHECK-SAME: init ([[INIT]]) -> f32 { @@ -177,15 +177,15 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK: [[INDEX_J_FITS:%.*]] = cmpi "ult", [[INDEX_J]], [[C112]] // CHECK: [[IN_BOUNDS_1:%.*]] = and [[IN_BOUNDS_0]], [[INDEX_J_FITS]] -// CHECK: [[ELEM_TO_REDUCE:%.*]] = loop.if [[IN_BOUNDS_1]] -> (f32) { +// CHECK: [[ELEM_TO_REDUCE:%.*]] = scf.if [[IN_BOUNDS_1]] -> (f32) { // CHECK: [[OPERAND_ELEM:%.*]] = // CHECK-SAME: load [[OPERAND_BUF]]{{\[}}[[INDEX_I]], [[INDEX_J]]] -// CHECK: loop.yield [[OPERAND_ELEM]] : f32 +// CHECK: scf.yield [[OPERAND_ELEM]] : f32 // CHECK: } 
else { -// CHECK: loop.yield [[INIT]] : f32 +// CHECK: scf.yield [[INIT]] : f32 // CHECK: } -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -194,12 +194,12 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.maximum"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[J]]] -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: return // CHECK: } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc index c5f5b39e04c..734a75a4307 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc @@ -61,8 +61,8 @@ Value ApplySingleResultLhloCode(Location loc, ValueRange operands, // Converts a block with LHLO ops and with signature: // ^bb(%lhs: memref, %rhs: memref, %res: memref): -// into a reduction operator of loop.reduce by doing buffer allocation for -// scalar arguments and the result of `loop.reduce` to make it compatible with +// into a reduction operator of scf.reduce by doing buffer allocation for +// scalar arguments and the result of `scf.reduce` to make it compatible with // LHLO ops. void ConvertToReductionOperator(Location loc, scf::ReduceOp reduce_op, Block* lhlo_block, OpBuilder* b) { @@ -170,10 +170,10 @@ scf::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, // is roughly converted into: // // %init = load %init_buf[] : memref -// loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { -// %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { +// scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { +// %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> -// loop.reduce(%elem_to_reduce) { +// scf.reduce(%elem_to_reduce) { // ^bb0(%elem: f32, %acc: f32): // no predecessors // elem_buf = alloc() : memref // store %elem, elem_buf[] : memref @@ -181,11 +181,11 @@ scf::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, // store %acc, acc_buf[] : memref // // %acc_result = load acc_buf[] : memref -// loop.reduce.return %acc_result : f32 +// scf.reduce.return %acc_result : f32 // } : f32 -// loop.yield +// scf.yield // } : f32 -// loop.yield +// scf.yield // } class ReduceOpConverter : public OpConversionPattern { public: @@ -206,24 +206,24 @@ class ReduceOpConverter : public OpConversionPattern { } private: - // Creates nested `loop.parallel` ops with `loop.reduce`. The outer ParallelOp + // Creates nested `scf.parallel` ops with `scf.reduce`. The outer ParallelOp // refers to the parallel dimensions of `xla_reduce_op` if any and the inner - // ParallelOp refers to the reduction dimensions. The loop.reduce op is + // ParallelOp refers to the reduction dimensions. The scf.reduce op is // returned. 
// // If the reduction argument is a memref<100x10x5xf32> and the // reduction is performed along dimension 1 then this method will generate // // %init = load %init_buf[] : memref - // loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { - // %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { + // scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { + // %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> - // loop.reduce(%elem_to_reduce) { + // scf.reduce(%elem_to_reduce) { // // } : f32 - // loop.yield + // scf.yield // } : f32 - // loop.yield + // scf.yield // } scf::ReduceOp CreateReduceOpInNestedParallelLoops( xla_lhlo::ReduceOp xla_reduce_op, @@ -341,20 +341,20 @@ class ReduceOpConverter : public OpConversionPattern { // is roughly converted into: // // %neutral_elem = load %init_buf[] : memref -// loop.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) { -// %result = loop.parallel (%iw, %jw) = (%c0, %c0) +// scf.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) { +// %result = scf.parallel (%iw, %jw) = (%c0, %c0) // to (%c3, %c3) step (%c1, %c1) neutral_elem (%0) -> f32 { // %in_bounds = // %elem = load %operand[%computed_i, %computed_j] // %elem_or_neutral = select %in_bounds, %elem, %neutral_elem : f32 -// loop.reduce(%elem_to_reduce) : f32 { +// scf.reduce(%elem_to_reduce) : f32 { // ^bb0(%arg7: f32, %arg8: f32): // // } -// loop.yield +// scf.yield // } // store %result, %output_buffer[%i, %j] : memref<56x56xf32> -// loop.yield +// scf.yield // } // return // } @@ -457,16 +457,16 @@ class ReduceWindowOpConverter // https://www.tensorflow.org/xla/operation_semantics#selectandscatter // // Pseudocode: -// loop.parallel(coordinates O in the output): +// scf.parallel(coordinates O in the output): // output[O] = init -// loop.parallel(coordinates S in the source): +// scf.parallel(coordinates S in the source): // selected_ivs = 0 // selected_val = 0 // initialized_flag = false -// loop.for (first dim W_1 in the window) +// scf.for (first dim W_1 in the window) // iter_args (selected_ivs, selected_val, initialized_flag): // ... -// loop.for (last dim W_N in the window): +// scf.for (last dim W_N in the window): // iter_args (selected_ivs, selected_val, initialized_flag): // I = S * stride + W - pad_low // if I within bounds of operand: diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 2d0164981a3..39375e210d5 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -81,8 +81,8 @@ std::unique_ptr> createLegalizeToGpuPass(); // Fuses linalg ops obtained after LHLO lowering. To enable fusion, // operations are first tiled. // -// When 'use_parallel_loops' is set, the tiling will use loop.parallel -// operations. Otherwise, loop.for operations are used. +// When 'use_parallel_loops' is set, the tiling will use scf.parallel +// operations. Otherwise, scf.for operations are used. // // 'tile_sizes' provides the tile sizes to use for tiling. If the linalg // operation has more dimensions than tile sizes provided, 1 is used as From d7503555753420aba3a4f9010bb5f7ed13d6c9ca Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:02:43 -0700 Subject: [PATCH 0576/1533] Update GraphDef version to 401. 
PiperOrigin-RevId: 311492238 Change-Id: I93cb2eda8127d2ca0504ba2e06911a994c190347 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 68df6a1b632..a534c0cf827 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 400 // Updated: 2020/5/13 +#define TF_GRAPH_DEF_VERSION 401 // Updated: 2020/5/14 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 9d0cf955c1e1bd2e653b93bf939c6f1617d67881 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:02:45 -0700 Subject: [PATCH 0577/1533] compat: Update forward compatibility horizon to 2020-05-14 PiperOrigin-RevId: 311492245 Change-Id: I64918fc404fd05bb26edf1910f3bbab07a7856f5 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 26d291877cb..2a21590bb9a 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 13) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 14) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From e10d6dd07b0f08ff3e039bb7276b0417668d5928 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:10:24 -0700 Subject: [PATCH 0578/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311493179 Change-Id: I58caf5368efe0ff0fc5d0ef72320347d677fd888 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c6d67c9ad44..598e3a48bfe 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 23d478c4228095a2c7d47bae46f8b0d3024ca284 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 14 May 2020 02:24:39 -0700 Subject: [PATCH 0579/1533] Add Starlark rules to generate cubin headers. 
Also add a cuda_gpu_architectures macro for getting a list of CUDA GPU architectures. PiperOrigin-RevId: 311494598 Change-Id: Ie573c2d22a42ab9e0002bdcfbee5be534b87cd2c --- .../compiler/mlir/tools/kernel_gen/BUILD | 1 + .../core/kernels/cubin_headers/build_defs.bzl | 101 ++++++++++++++++++ third_party/gpus/cuda/BUILD.tpl | 8 ++ third_party/gpus/cuda/build_defs.bzl.tpl | 4 + third_party/gpus/cuda_configure.bzl | 15 +++ 5 files changed, 129 insertions(+) create mode 100644 tensorflow/core/kernels/cubin_headers/build_defs.bzl diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index d4269c336e9..27a8dbd2809 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -40,6 +40,7 @@ cc_library( tf_cc_binary( name = "tf_to_cubin", srcs = ["tf_to_cubin.cc"], + visibility = ["//tensorflow/core/kernels/cubin_headers:__pkg__"], deps = [ ":cubin_creator", "//tensorflow/core:framework_internal", diff --git a/tensorflow/core/kernels/cubin_headers/build_defs.bzl b/tensorflow/core/kernels/cubin_headers/build_defs.bzl new file mode 100644 index 00000000000..b09c515c883 --- /dev/null +++ b/tensorflow/core/kernels/cubin_headers/build_defs.bzl @@ -0,0 +1,101 @@ +"""Generates cubin headers for TF dialect ops.""" + +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures", "if_cuda") + +def _lookup_file(filegroup, path): + """Extracts file at (relative) path in filegroup.""" + for file in filegroup.files.to_list(): + if file.path.endswith(path): + return file + return None + +def _gen_kernel_image_hdr_impl(ctx): + if not ctx.attr.gpu_archs: + fail("No GPU architecture specified, use --config=cuda or similar") + + name = ctx.attr.name + tile_sizes = ctx.attr.tile_size.replace("x", ",") + same_shape = [] + if ctx.attr.same_shape: + same_shape.append("--same_shape=%s" % ctx.attr.same_shape) + + cubins = [] + images = [] + for arch in ctx.attr.gpu_archs: + filename = "%s.%s.cubin" % (name, arch) + cubin = ctx.actions.declare_file(filename) + ctx.actions.run( + outputs = [cubin], + executable = ctx.executable._tool, + arguments = same_shape + [ + "--tile_sizes=%s" % tile_sizes, + "--arch=%s" % arch.split("_")[1], + "--output=%s" % cubin.path, + ctx.attr.op, + ], + mnemonic = "compile", + ) + cubins.append(cubin) + images.append("--image=profile=%s,file=%s" % (arch, cubin.path)) + + # Generate fatbin file from all cubins. 
+ fatbin = ctx.actions.declare_file("%s.fatbin" % name) + ctx.actions.run( + outputs = [fatbin], + inputs = cubins, + executable = _lookup_file(ctx.attr._cuda_root, "bin/fatbinary"), + arguments = [ + "--64", + "--cmdline=--compile-only", + "--link", + "--compress-all", + "--create=%s" % fatbin.path, + ] + images, + mnemonic = "fatbinary", + ) + + bin2c = _lookup_file(ctx.attr._cuda_root, "bin/bin2c") + ctx.actions.run_shell( + outputs = [ctx.outputs.out], + inputs = [fatbin], + tools = [bin2c], + command = "%s --static --const --type=int --name=%s %s 1> %s" % + (bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path), + mnemonic = "bin2c", + ) + +_gen_kernel_image_hdr = rule( + implementation = _gen_kernel_image_hdr_impl, + output_to_genfiles = True, + attrs = { + "op": attr.string(mandatory = True), + "tile_size": attr.string(mandatory = True), + "same_shape": attr.string(), + "out": attr.output(mandatory = True), + "symbol": attr.string(mandatory = True), + "gpu_archs": attr.string_list(mandatory = True), + "_cuda_root": attr.label( + default = Label("@local_config_cuda//cuda:cuda_root"), + ), + "_tool": attr.label( + executable = True, + default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_cubin"), + cfg = "host", + ), + }, +) + +def gen_kernel_image_hdr(name, op, tile_size, tags = [], same_shape = None): + """Generates a C header with fatbin data from a Tensorflow op.""" + if_cuda( + if_true = [_gen_kernel_image_hdr( + name = name, + op = op, + tile_size = tile_size, + same_shape = same_shape, + out = "%s.h" % name, + symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""), + gpu_archs = cuda_gpu_architectures(), + tags = tags, + )], + ) diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index 9d17e1b8f35..92586dd7d11 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -166,6 +166,14 @@ cc_library( data = [":cuda-nvvm"], ) +filegroup( + name = "cuda_root", + srcs = [ + "cuda/bin/fatbinary", + "cuda/bin/bin2c", + ], +) + bzl_library( name = "build_defs_bzl", srcs = ["build_defs.bzl"], diff --git a/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/gpus/cuda/build_defs.bzl.tpl index 3280d6b041f..bba772e2377 100644 --- a/third_party/gpus/cuda/build_defs.bzl.tpl +++ b/third_party/gpus/cuda/build_defs.bzl.tpl @@ -51,6 +51,10 @@ def cuda_is_configured(): """Returns true if CUDA was enabled during the configure process.""" return %{cuda_is_configured} +def cuda_gpu_architectures(): + """Returns a list of supported GPU architectures.""" + return %{cuda_gpu_architectures} + def if_cuda_is_configured(x): """Tests if the CUDA was enabled during the configure process. diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index c587f117deb..aa8a2f0226d 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -714,6 +714,7 @@ def _create_dummy_repository(repository_ctx): { "%{cuda_is_configured}": "False", "%{cuda_extra_copts}": "[]", + "%{cuda_gpu_architectures}": "[]", }, ) _tpl( @@ -842,6 +843,16 @@ def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): ] return str(capability_flags) +def _compute_cuda_gpu_architectures(repository_ctx, compute_capabilities): + gpu_architectures = [ + "sm_" + capability.replace(".", "") + for capability in compute_capabilities + ] + + # Make the list unique. 
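+    # Starlark has no set type and dicts preserve insertion order, so
+    # dict(zip(l, l)).keys() drops duplicate architectures while keeping order.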
+ gpu_architectures = dict(zip(gpu_architectures, gpu_architectures)).keys() + return str(gpu_architectures) + def _tpl_path(repository_ctx, filename): return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename)) @@ -973,6 +984,10 @@ def _create_local_cuda_repository(repository_ctx): repository_ctx, cuda_config.compute_capabilities, ), + "%{cuda_gpu_architectures}": _compute_cuda_gpu_architectures( + repository_ctx, + cuda_config.compute_capabilities, + ), }, ) From 5767af0cd2c3327d05f84acddcdf4152e6543f58 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:26:18 -0700 Subject: [PATCH 0580/1533] Bump open source llvm revision to bfa200ebcf3706fde0dde335a3c1fa3fe1b3ba3f PiperOrigin-RevId: 311494763 Change-Id: I218a77222ac4ca3131d2614ea84d1268d5de655e --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 9b745656125..f4d60f07149 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "897d8ee5cd693e17f95a7e84194bca4c089a520b" - LLVM_SHA256 = "994677daedf23bc93ce04f1a527c07c09b7fbbd0986d867b60bd6710057a40de" + LLVM_COMMIT = "bfa200ebcf3706fde0dde335a3c1fa3fe1b3ba3f" + LLVM_SHA256 = "72deefcfe20434cb27a31ff9503c348dcf21065dbd27e9fa54c1fb3f5089b8e1" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 85bf5f7c202f1c656ebf169592aa4a0a9c022e8a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:32:13 -0700 Subject: [PATCH 0581/1533] Return a meaningful error for dynamic shape inputs with outside compilation head extraction in TPUs. PiperOrigin-RevId: 311495416 Change-Id: I42b12ac545224c32e770d963a5f3f333ba280531 --- .../python/distribute/tpu_strategy_test.py | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 6c93e29c028..de4c975d5ef 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -28,7 +28,6 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import remote from tensorflow.python.eager import test -from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -141,9 +140,6 @@ class TPUStrategyTest(test.TestCase): # for non-local TPU. if FLAGS.tpu: self.skipTest("Recovery fails for non-local TPU, see b/148150981") - - # Disable automatic outside compilation. - config.set_soft_device_placement(False) strategy = get_tpu_strategy() @def_function.function @@ -168,28 +164,6 @@ class TPUStrategyTest(test.TestCase): good_run() - def test_dynamic_shape_with_outside_compilation_failure(self): - # Enable automatic outside compilation. 
- config.set_soft_device_placement(True) - strategy = get_tpu_strategy() - dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( - 2, drop_remainder=False) - dataset = strategy.experimental_distribute_dataset(dataset) - iterator = iter(dataset) - - @def_function.function - def train_fn(iterator): - - def step_fn(inputs): - _, inputs = inputs - return math_ops.reduce_sum(inputs) - - return strategy.experimental_local_results( - strategy.run(step_fn, args=(next(iterator),))) - - with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): - logging.info(train_fn(iterator)) - def test_computation_on_subset_cores(self): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) From a04c8be3e7086d9e14ba37c3c0945a3ea98414ce Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 14 May 2020 03:13:12 -0700 Subject: [PATCH 0582/1533] Update XNNPACK dependency Bring in memory optimization for XNNPACK delegate in TF Lite PiperOrigin-RevId: 311500960 Change-Id: I49b093ab177ca2e4806ed42390e367b58b14dc85 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f4d60f07149..c3d097a8362 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "15a300dec0d483af67310ed2edf76a6eff643e1438d0612ad00a372add472c22", - strip_prefix = "XNNPACK-5cb16e7ace0fcdcab164af01620a606ba828a3be", + sha256 = "0440d9ad632945f10992664be84eb0c0c76581f8474df3c124aa30350981126c", + strip_prefix = "XNNPACK-d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/5cb16e7ace0fcdcab164af01620a606ba828a3be.zip", - "https://github.com/google/XNNPACK/archive/5cb16e7ace0fcdcab164af01620a606ba828a3be.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee.zip", + "https://github.com/google/XNNPACK/archive/d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee.zip", ], ) From 015197cf8bda5010b2b12170da738bdb66482551 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 14 May 2020 06:41:25 -0700 Subject: [PATCH 0583/1533] Generate cubin headers for bias_add and relu. Also, instead of checking if_cuda, check whether cuda_gpu_architectures() is non-empty. PiperOrigin-RevId: 311521784 Change-Id: I6a1a7e9cefc8e845e69d62fb3c19d9976b0f2196 --- tensorflow/core/kernels/cubin_headers/BUILD | 47 +++++++++++++++++++ .../core/kernels/cubin_headers/build_defs.bzl | 9 ++-- 2 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 tensorflow/core/kernels/cubin_headers/BUILD diff --git a/tensorflow/core/kernels/cubin_headers/BUILD b/tensorflow/core/kernels/cubin_headers/BUILD new file mode 100644 index 00000000000..bb7995dd221 --- /dev/null +++ b/tensorflow/core/kernels/cubin_headers/BUILD @@ -0,0 +1,47 @@ +# Generates headers containing cubin for CUDA kernels. 
+load("//tensorflow/core/kernels/cubin_headers:build_defs.bzl", "gen_kernel_image_hdr") + +bias_add_kernel = """ +func @bias_add(%arg0: tensor, + %arg1: tensor) -> tensor { + %0 = "tf.BiasAdd"(%arg0, %arg1) { T = "tfdtype$DT_TYPE" } + : (tensor, tensor) -> tensor + return %0 : tensor +} +""" + +[ + gen_kernel_image_hdr( + name = "bias_add_{type}_kernel".format(type = type), + op = bias_add_kernel.replace("f99", type).replace("DT_TYPE", dtype), + same_shape = "0,2", + tile_size = "16x16", + ) + for (type, dtype) in [ + ("f16", "DT_HALF"), + ("f32", "DT_FLOAT"), + ("f64", "DT_DOUBLE"), + ] +] + +relu_kernel = """ +func @relu(%arg0: tensor) -> tensor { + %0 = "tf.Relu"(%arg0) { T = "tfdtype$DT_TYPE" } + : (tensor) -> tensor + return %0 : tensor +} +""" + +[ + gen_kernel_image_hdr( + name = "relu_{type}_kernel".format(type = type), + op = relu_kernel.replace("f99", type).replace("DT_TYPE", dtype), + same_shape = "0,1", + tile_size = "256", + ) + for (type, dtype) in [ + ("f16", "DT_HALF"), + ("f32", "DT_FLOAT"), + ("f64", "DT_DOUBLE"), + ] +] diff --git a/tensorflow/core/kernels/cubin_headers/build_defs.bzl b/tensorflow/core/kernels/cubin_headers/build_defs.bzl index b09c515c883..14f47601f06 100644 --- a/tensorflow/core/kernels/cubin_headers/build_defs.bzl +++ b/tensorflow/core/kernels/cubin_headers/build_defs.bzl @@ -1,6 +1,6 @@ """Generates cubin headers for TF dialect ops.""" -load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures", "if_cuda") +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures") def _lookup_file(filegroup, path): """Extracts file at (relative) path in filegroup.""" @@ -87,8 +87,8 @@ _gen_kernel_image_hdr = rule( def gen_kernel_image_hdr(name, op, tile_size, tags = [], same_shape = None): """Generates a C header with fatbin data from a Tensorflow op.""" - if_cuda( - if_true = [_gen_kernel_image_hdr( + if cuda_gpu_architectures(): + _gen_kernel_image_hdr( name = name, op = op, tile_size = tile_size, @@ -97,5 +97,4 @@ def gen_kernel_image_hdr(name, op, tile_size, tags = [], same_shape = None): symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""), gpu_archs = cuda_gpu_architectures(), tags = tags, - )], - ) + ) From ec2cc2903f54d526dfdcfa314c9e181a8a5f76fa Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 14 May 2020 07:41:59 -0700 Subject: [PATCH 0584/1533] Introduce a higher-level function handling in the tracing oriented unified API This patch intends to make function tracing more of a first class concept in the API. It tries to move away from the "flat graph" model with "placeholder" operation introduced with the expectation to turn them into function parameters later. Instead the user starts by creating an empty function which is an ExecutionContext (and as such can trace operations). Function parameters can get added to this context using a dedicated API returning an AbstractTensor. The diff in UnifiedCAPI/TestBasicGraph is probably a good illustration of the change from a client point of view. Another important point of this patch is to make it so that no C public API is defined in the `c_api_unified_experimental_graph.cc` file, instead the implementation is dispatched based on a registered factory function to create the tracing context. This will allow to swap the tracing implementation through injection later. 
PiperOrigin-RevId: 311529850 Change-Id: I822047f4306835abc0e044dc87c14179596f64bd --- tensorflow/c/eager/BUILD | 2 + .../c/eager/c_api_unified_experimental.cc | 69 +++++++++++ .../c/eager/c_api_unified_experimental.h | 26 ++-- .../eager/c_api_unified_experimental_eager.cc | 11 ++ .../eager/c_api_unified_experimental_graph.cc | 111 ++++++++---------- .../c_api_unified_experimental_internal.h | 17 +++ .../eager/c_api_unified_experimental_test.cc | 73 +++++------- 7 files changed, 193 insertions(+), 116 deletions(-) diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index d3059df1bef..69808f6f49f 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -448,6 +448,8 @@ tf_cuda_library( "//conditions:default": [], }) + [ "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/container:flat_hash_map", "//tensorflow/c:tf_status_helper", "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index 68afffb28b4..d29c457798e 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" @@ -26,6 +28,51 @@ using tensorflow::string; using tensorflow::internal::OutputList; using tensorflow::internal::unwrap; +namespace tensorflow { +namespace internal { +typedef absl::flat_hash_map FactoriesMap; + +static FactoriesMap& GetFactories() { + static FactoriesMap* factories = new FactoriesMap; + return *factories; +} + +static const char* default_factory = ""; + +void RegisterTracingEngineFactory(const string& name, FactoryFunction factory) { + assert((!GetFactories().count(name)) || + (GetFactories()[name] == factory) && + "Duplicate tracing factory registration"); + GetFactories()[name] = factory; +} + +void SetDefaultTracingEngine(const char* name) { default_factory = name; } + +static ExecutionContext* CreateTracingExecutionContext(const char* fn_name, + TF_Status* s) { + auto entry = GetFactories().find(default_factory); + if (entry != GetFactories().end()) return entry->second(fn_name, s); + string msg = absl::StrCat( + "No tracing engine factory has been registered with the key '", + default_factory, "' (available: "); + // Ensure deterministic (sorted) order in the error message + std::set factories_sorted; + for (const auto& factory : GetFactories()) + factories_sorted.insert(factory.first); + const char* comma = ""; + for (const string& factory : factories_sorted) { + msg += comma + factory; + comma = ", "; + } + msg += ")"; + + TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); + return nullptr; +} + +} // end namespace internal +} // end namespace tensorflow + // ============================================================================= // Public C API entry points // @@ -36,6 +83,28 @@ using tensorflow::internal::unwrap; // // ============================================================================= +void TF_SetTracingImplementation(const char* name) { + tensorflow::internal::SetDefaultTracingEngine(name); +} + +// Creates a new TensorFlow function, it is an execution context attached to a +// given tracing 
context. +TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* s) { + return wrap(tensorflow::internal::CreateTracingExecutionContext(fn_name, s)); +} + +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList* outputs, TF_Status* s) { + auto* func = wrap(unwrap(ctx)->Finalize(unwrap(outputs), s)); + TF_DeleteExecutionContext(ctx); + return func; +} + +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Status* s) { + return wrap(unwrap(func)->AddParameter(dtype, s)); +} + void TF_DeleteExecutionContext(TF_ExecutionContext* c) { delete unwrap(c); } TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* c) { diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h index be8fc64c2e1..512717caa34 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.h +++ b/tensorflow/c/eager/c_api_unified_experimental.h @@ -49,15 +49,26 @@ typedef struct TF_AbstractOp TF_AbstractOp; // setting functional attributes of other composite ops e.g. control flow. typedef struct TF_AbstractFunction TF_AbstractFunction; -// Creates a context for tracing the execution of operations into a function. -TF_ExecutionContext* TF_NewGraphExecutionContext(TF_Status* s); +// This allows the client to swap the implementation of the tracing engine. +// Any future call to TF_CreateFunction will use the implementation defined +// here. +void TF_SetTracingImplementation(const char* name); + +// Creates a new TensorFlow function. A Function is an execution context, and as +// such it can trace operations through TF_ExecuteOperation. After completing +// tracing, a function can be obtained by TF_FinalizeFunction. +TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* status); // Creates a context for eager execution of operations. TF_ExecutionContext* TF_NewEagerExecutionContext(TFE_ContextOptions*, TF_Status* s); - void TF_DeleteExecutionContext(TF_ExecutionContext*); +// Add a new parameter to a TensorFlow Function. +// TODO(aminim): what about shape? +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Status* s); + // Create an operation suitable to use with the provided context. The operation // requires its type (e.g. "AddV2") to be set independently. TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* ctx); @@ -100,13 +111,12 @@ void TF_ExecuteOperation(TF_AbstractOp* op, int num_inputs, TF_ExecutionContext* ctx, TF_Status* s); // Creates a new TF_AbstractFunction from the current tracing states in the -// context. The returned TF_GraphToFunction must be deleted by the client. +// context. The provided `ctx` is consumed by this API call and deleted. +// The returned TF_AbstractFunction must be deleted by the client, // TODO(aminim): clarify the contract on the state of the context after this // call. 
-TF_AbstractFunction* TF_ExecutionContextToFunction( - const TF_ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const TF_AbstractTensor* inputs, int num_outputs, - const TF_AbstractTensor* outputs, TF_Status* status); +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList*, TF_Status*); void TF_DeleteAbstractFunction(TF_AbstractFunction*); diff --git a/tensorflow/c/eager/c_api_unified_experimental_eager.cc b/tensorflow/c/eager/c_api_unified_experimental_eager.cc index 820c61445fb..cf8cf845834 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_eager.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_eager.cc @@ -123,6 +123,17 @@ class EagerContext : public ExecutionContext { } } + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Can't add function parameter on an eager context."); + return nullptr; + } + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Can't use finalize function on an eager context."); + return nullptr; + } + void RegisterFunction(AbstractFunction* afunc, TF_Status* s) override { auto* func = afunc->GetTfFunction(s); if (!func) { diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 36f8353894b..e38332e3e8e 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/eager/c_api_unified_experimental.h" @@ -114,12 +115,14 @@ struct GraphFunction : public AbstractFunction { static constexpr AbstractFunctionKind kKind = kGraphFunc; }; -// GraphContext wraps a TF_Graph and manages the "execution" of operation, i.e. -// adding them to the graph. +// GraphContext wraps a TF_Graph modeling a single function and manages the +// "execution" of operation, i.e. adding them to the function. class GraphContext : public ExecutionContext { public: - GraphContext() - : ExecutionContext(kKind), graph_(new TF_Graph(), TF_DeleteGraph) {} + explicit GraphContext(const char* name) + : ExecutionContext(kKind), + graph_(new TF_Graph(), TF_DeleteGraph), + name_(name) {} AbstractOp* CreateOperation() override { // TODO(srbs): Should the lifetime of this op be tied to the context. 
@@ -164,24 +167,38 @@ class GraphContext : public ExecutionContext { } } - TF_Function* ToFunction(const char* fn_name, int num_inputs, - const GraphTensor* inputs, int num_outputs, - const GraphTensor* outputs, TF_Status* status) const { - std::vector graph_inputs; - graph_inputs.resize(num_inputs); + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override { + TF_OperationDescription* opdesc = + TF_NewOperation(graph_.get(), "Placeholder", + absl::StrCat("_input_", inputs_.size()).c_str()); + TF_SetAttrType(opdesc, "dtype", dtype); + auto* operation = TF_FinishOperation(opdesc, s); + if (!s->status.ok()) return nullptr; + + inputs_.push_back(TF_Output{operation, 0}); + return new GraphTensor(inputs_.back(), this); + } + + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override { + std::unique_ptr func(new GraphFunction); std::vector graph_outputs; - graph_outputs.resize(num_outputs); - for (int i = 0; i < num_inputs; i++) { - graph_inputs[i] = inputs[i].output; - } - for (int i = 0; i < num_outputs; i++) { - graph_outputs[i] = outputs[i].output; + graph_outputs.reserve(outputs->outputs.size()); + for (AbstractTensor* abstract_output : outputs->outputs) { + GraphTensor* output = dyncast(abstract_output); + if (!output) { + TF_SetStatus(s, TF_UNIMPLEMENTED, + "Returning a non-graph tensor from a function has not " + "been implemented yet."); + return nullptr; + } + graph_outputs.push_back(output->output); } - return TF_GraphToFunction(graph_.get(), fn_name, 0, -1, nullptr, - graph_inputs.size(), graph_inputs.data(), - graph_outputs.size(), graph_outputs.data(), - nullptr, nullptr, fn_name, status); + func->func = TF_GraphToFunction( + graph_.get(), name_, 0, -1, nullptr, inputs_.size(), inputs_.data(), + graph_outputs.size(), graph_outputs.data(), nullptr, nullptr, name_, s); + if (TF_GetCode(s) != TF_OK) return nullptr; + return func.release(); } void RegisterFunction(AbstractFunction* func, TF_Status* s) override { @@ -195,54 +212,20 @@ class GraphContext : public ExecutionContext { private: std::unique_ptr graph_; + std::vector inputs_; + const char* name_; }; -// Helper that converts the graph currently held in the context into a function. -static AbstractFunction* ExecutionContextToFunction( - const ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const AbstractTensor* inputs, int num_outputs, - const AbstractTensor* outputs, TF_Status* status) { - auto* graph_ctx = dyncast(fn_body); - if (graph_ctx == nullptr) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - "fn_body is not a TF_GraphContext."); - return nullptr; - } - auto* graph_inputs = dyncast(inputs); - if (!graph_inputs) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, "inputs aren't GraphTensors."); - return nullptr; - } - auto* graph_outputs = dyncast(outputs); - if (!graph_outputs) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, "outputs aren't GraphTensors."); - return nullptr; - } - GraphFunction* func = new GraphFunction; - func->func = graph_ctx->ToFunction(fn_name, num_inputs, graph_inputs, - num_outputs, graph_outputs, status); - return func; +static ExecutionContext* GraphTracingFactory(const char* name, TF_Status* s) { + return new GraphContext(name); } +// Register the tracing implemented in this file as the default tracing engine. 
+static bool register_tracing = [] { + RegisterTracingEngineFactory("graphdef", GraphTracingFactory); + SetDefaultTracingEngine("graphdef"); + return true; +}(); + } // namespace internal } // namespace tensorflow - -// ============================================================================= -// Public C API entry points -// These are only the entry points specific to the Graph API. -// ============================================================================= - -using tensorflow::internal::unwrap; - -TF_ExecutionContext* TF_NewGraphExecutionContext(TF_Status* s) { - return wrap(new tensorflow::internal::GraphContext()); -} - -TF_AbstractFunction* TF_ExecutionContextToFunction( - const TF_ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const TF_AbstractTensor* inputs, int num_outputs, - const TF_AbstractTensor* outputs, TF_Status* status) { - return wrap(ExecutionContextToFunction(unwrap(fn_body), fn_name, num_inputs, - unwrap(inputs), num_outputs, - unwrap(outputs), status)); -} diff --git a/tensorflow/c/eager/c_api_unified_experimental_internal.h b/tensorflow/c/eager/c_api_unified_experimental_internal.h index ab085a20ff0..49212a230ee 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_internal.h +++ b/tensorflow/c/eager/c_api_unified_experimental_internal.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace internal { @@ -148,6 +149,17 @@ struct ExecutionContext { // Creates an empty AbstractOperation suitable to use with this context. virtual AbstractOp* CreateOperation() = 0; + // Add a function parameter and return the corresponding tensor. + // This is only valid with an ExecutionContext obtained from a TracingContext, + // it'll always error out with an eager context. + virtual AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) = 0; + + // Finalize this context and make a function out of it. The context is in a + // invalid state after this call and must be destroyed. + // This is only valid with an ExecutionContext obtained from a TracingContext, + // it'll always error out with an eager context. + virtual AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) = 0; + // Registers a functions with this context, after this the function is // available to be called/referenced by its name in this context. virtual void RegisterFunction(AbstractFunction* func, TF_Status* s) = 0; @@ -156,6 +168,11 @@ struct ExecutionContext { const ExecutionContextKind k; }; +typedef ExecutionContext* (*FactoryFunction)(const char* fn_name, TF_Status*); +void SetDefaultTracingEngine(const char* name); +void RegisterTracingEngineFactory(const ::tensorflow::string& name, + FactoryFunction factory); + // Create utilities to wrap/unwrap: this convert from the C opaque types to the // C++ implementation, and back. 
#define MAKE_WRAP_UNWRAP(C_TYPEDEF, CPP_CLASS) \ diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index bd99189852e..9f56c8aa579 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -29,7 +29,12 @@ using tensorflow::string; namespace tensorflow { namespace { -TEST(UnifiedCAPI, TestBasicEager) { +class UnifiedCAPI : public ::testing::TestWithParam { + protected: + void SetUp() override { TF_SetTracingImplementation(GetParam()); } +}; + +TEST_P(UnifiedCAPI, TestBasicEager) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -81,33 +86,18 @@ TEST(UnifiedCAPI, TestBasicEager) { TF_DeleteExecutionContext(ctx); } -TEST(UnifiedCAPI, TestBasicGraph) { +TEST_P(UnifiedCAPI, TestBasicGraph) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + // Start a new function / execution context. + string fn_name = "double"; + TF_ExecutionContext* graph_ctx = + TF_CreateFunction(fn_name.c_str(), status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - // Add a placeholder to the graph. - auto* placeholder_op = TF_NewAbstractOp(graph_ctx); - TF_AbstractOpSetOpType(placeholder_op, "Placeholder", status.get()); + auto* placeholder_t = + TF_AddFunctionParameter(graph_ctx, TF_FLOAT, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractOpSetOpName(placeholder_op, "my_ph", status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractOpSetAttrType(placeholder_op, "dtype", TF_FLOAT, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - - // Build inputs and outputs. - TF_OutputList* placeholder_outputs = TF_NewOutputList(); - - // Execute. - TF_ExecuteOperation(placeholder_op, 0, nullptr, placeholder_outputs, - graph_ctx, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - ASSERT_EQ(1, TF_OutputListNumOutputs(placeholder_outputs)); - TF_AbstractTensor* placeholder_t = TF_OutputListGet(placeholder_outputs, 0); - - // Delete placeholder op. - TF_DeleteAbstractOp(placeholder_op); // Build an abstract operation. auto* add_op = TF_NewAbstractOp(graph_ctx); @@ -123,17 +113,13 @@ TEST(UnifiedCAPI, TestBasicGraph) { // Execute. TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractTensor* output_t = TF_OutputListGet(add_outputs, 0); // Clean up operation and inputs. TF_DeleteAbstractOp(add_op); - string fn_name = "double"; - TF_AbstractFunction* func = TF_ExecutionContextToFunction( - graph_ctx, fn_name.c_str(), 1, placeholder_t, 1, output_t, status.get()); + TF_AbstractFunction* func = + TF_FinalizeFunction(graph_ctx, add_outputs, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_DeleteAbstractTensor(placeholder_t); - TF_DeleteAbstractTensor(output_t); // Build eager context. 
TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -174,18 +160,16 @@ TEST(UnifiedCAPI, TestBasicGraph) { ASSERT_EQ(*f_value, 4.0); TF_DeleteOutputList(add_outputs); - TF_DeleteOutputList(placeholder_outputs); TF_DeleteAbstractOp(fn_op); TF_DeleteAbstractTensor(input_t); TF_DeleteAbstractTensor(final_result); TF_DeleteTensor(f_t); TF_DeleteAbstractFunction(func); - TF_DeleteExecutionContext(graph_ctx); TF_DeleteExecutionContext(eager_execution_ctx); } -TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { +TEST_P(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -193,18 +177,15 @@ TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); TFE_DeleteContextOptions(opts); - TF_AbstractFunction* func = TF_ExecutionContextToFunction( - ctx, nullptr, 0, nullptr, 0, nullptr, status.get()); + TF_AbstractFunction* func = TF_FinalizeFunction(ctx, nullptr, status.get()); ASSERT_EQ(nullptr, func); ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); - - TF_DeleteExecutionContext(ctx); } -TEST(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { +TEST_P(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. @@ -222,10 +203,10 @@ TEST(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { +TEST_P(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. @@ -243,7 +224,7 @@ TEST(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { +TEST_P(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { // Build an Eager context. std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -273,7 +254,8 @@ TEST(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build a Graph context. - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Execute eager op using graph context. 
@@ -289,10 +271,11 @@ TEST(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { +TEST_P(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. @@ -349,5 +332,7 @@ TEST(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { TF_DeleteExecutionContext(eager_execution_ctx); } +INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, ::testing::Values("graphdef")); + } // namespace } // namespace tensorflow From e2f8f5ad62b0deeff639e065f62a978c416c0c6b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 07:46:56 -0700 Subject: [PATCH 0585/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311530506 Change-Id: Ifcfd3d1247eba8a92c3a44f883cf4a098afdbce6 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 598e3a48bfe..c6d67c9ad44 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. 
// // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 5d92849778771a475fe339d2954db12c3d4ecc2b Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Thu, 14 May 2020 08:28:07 -0700 Subject: [PATCH 0586/1533] fix conv_ops_test and remapper_test --- .../core/grappler/optimizers/remapper_test.cc | 3 +++ tensorflow/core/kernels/conv_ops_test.cc | 23 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc index 35e09b28205..52f420c57cc 100644 --- a/tensorflow/core/grappler/optimizers/remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/remapper_test.cc @@ -607,6 +607,7 @@ TEST_F(RemapperTest, FuseMatMulWithBiasAndActivation) { } } +#ifndef INTEL_MKL TEST_F(RemapperTest, FuseConv2DWithBatchNorm) { using ops::Placeholder; @@ -685,6 +686,7 @@ TEST_F(RemapperTest, FuseConv2DWithBatchNorm) { test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } + TEST_F(RemapperTest, FuseConv2DWithBatchNormAndActivation) { using ops::Placeholder; @@ -850,6 +852,7 @@ TEST_F(RemapperTest, FuseConv2DWithSqueezeAndBias) { ASSERT_EQ(tensors.size(), 1); test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } +#endif } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc index 21dffa3cc5e..9e9ca27a570 100644 --- a/tensorflow/core/kernels/conv_ops_test.cc +++ b/tensorflow/core/kernels/conv_ops_test.cc @@ -1028,12 +1028,14 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolution) { this->VerifyConv2DWithBias(filter_size, filter_count); } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolution) { const int filter_size = 3; const int filter_count = 12; this->VerifyConv2DWithBias(filter_size, filter_count, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } +#endif TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndActivation) { const int filter_size = 1; @@ -1062,6 +1064,7 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolutionAndActivation) { } } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolutionAndActivation) { const int filter_size = 3; @@ -1072,6 +1075,7 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } } +#endif // -------------------------------------------------------------------------- // // Conv2D + FusedBatchNorm + {Activation} // @@ -1095,6 +1099,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolution) { this->VerifyConv2DWithBatchNorm(filter_size, filter_count); } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) { const int filter_size = 3; const int filter_count = 12; @@ -1102,6 +1107,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) { filter_size, filter_count, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } +#endif TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndActivation) { const int filter_size = 1; @@ -1131,6 +1137,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndActivation) { } } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolutionAndActivation) { const int filter_size = 3; @@ -1141,34 +1148,50 @@ 
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } } +#endif REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest, // OneByOneConvolution, // ImageSizeConvolution, // SpatialConvolution, // +#ifndef INTEL_MKL ExplicitPaddingConvolution, // +#endif OneByOneConvolutionAndActivation, // ImageSizeConvolutionAndActivation, // +#ifndef INTEL_MKL SpatialConvolutionAndActivation, // ExplicitPaddingConvolutionAndActivation); +#else + SpatialConvolutionAndActivation); +#endif REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest, // OneByOneConvolution, // ImageSizeConvolution, // SpatialConvolution, // +#ifndef INTEL_MKL ExplicitPaddingConvolution, // +#endif OneByOneConvolutionAndActivation, // ImageSizeConvolutionAndActivation, // +#ifndef INTEL_MKL SpatialConvolutionAndActivation, // ExplicitPaddingConvolutionAndActivation); +#else + SpatialConvolutionAndActivation); +#endif using FusedBiasAddDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest, FusedBiasAddDataTypes); + +#ifndef INTEL_MKL using FusedBatchNormDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBatchNormOpTest, FusedBatchNormDataTypes); +#endif #endif // TENSORFLOW_USE_ROCM } // namespace tensorflow From 10c7f276e41f6b1790d8e767f77b9f5583419ad5 Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 17:37:50 +0200 Subject: [PATCH 0587/1533] Test autograph indirect tf.map_fn decorator --- tensorflow/python/kernel_tests/map_fn_test.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 1e10d689886..a5c860b407d 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -186,6 +186,24 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) + @test_util.run_in_graph_and_eager_modes + def testMap_autograph_indirect(): + def test_function(x): + cond = tf.constant(-1) + if cond == 0: + result = x + else: + result = x + return result + + @tf.function + def map_call(x): + tf.map_fn(test_function, x) + + x = constant_op.constant([1]) + y = map_call(x) + self.assertAllEqual([1], self.evaluate(y)) + @test_util.run_in_graph_and_eager_modes def testMapShape(self): x = constant_op.constant([[1, 2, 3], [4, 5, 6]]) From ed01ecd92d4376d519247f1d3ce2d8ab5c1d99da Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 08:35:00 -0700 Subject: [PATCH 0588/1533] Clarify docstring: At EOF, GFile.readline() returns "". Along the way, fix a comment about the same topic. PiperOrigin-RevId: 311537677 Change-Id: I8dbd4fbf12f617efc5fdff0eb615337dc9c2fa8d --- tensorflow/python/keras/layers/preprocessing/table_utils.py | 2 +- tensorflow/python/lib/io/file_io.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py index 88e9d95e2ed..f5397da1f3e 100644 --- a/tensorflow/python/keras/layers/preprocessing/table_utils.py +++ b/tensorflow/python/keras/layers/preprocessing/table_utils.py @@ -144,7 +144,7 @@ def get_vocabulary_from_file(vocabulary_path, encoding="utf-8"): vocab = [] with gfile.GFile(vocabulary_path, "r") as reader: while True: - # Get the next line, and break if it is None. + # Get the next line (incl. \n), and break if nothing is left to read. 
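Side note on the vocabulary-reading loop being touched here: it relies on the contract this same patch makes explicit in the file_io.py docstring below, namely that GFile.readline() keeps the trailing "\n" while data remains and returns the empty string at EOF. A minimal standalone sketch of that pattern, assuming tf.io.gfile.GFile and a hypothetical vocabulary file path:

import tensorflow as tf

def read_vocab(path):
  tokens = []
  with tf.io.gfile.GFile(path, "r") as reader:
    while True:
      text = reader.readline()   # keeps the trailing "\n" while data remains
      if not text:               # "" signals EOF
        break
      tokens.append(text.rstrip("\n"))
  return tokens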
text = reader.readline() if not text: break diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py index a1db2fb056c..7c484c825d3 100644 --- a/tensorflow/python/lib/io/file_io.py +++ b/tensorflow/python/lib/io/file_io.py @@ -165,7 +165,7 @@ class FileIO(object): self._read_buf.seek(offset) def readline(self): - r"""Reads the next line from the file. Leaves the '\n' at the end.""" + r"""Reads the next line, keeping \n. At EOF, returns ''.""" self._preread_check() return self._prepare_value(self._read_buf.readline()) From 38e941dada7b7d790b4b060ec04ee78d5c9252ef Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 17:40:11 +0200 Subject: [PATCH 0589/1533] Fix missing return --- tensorflow/python/kernel_tests/map_fn_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index a5c860b407d..7bf793c1e20 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -198,7 +198,7 @@ class MapFnTest(test.TestCase): @tf.function def map_call(x): - tf.map_fn(test_function, x) + return tf.map_fn(test_function, x) x = constant_op.constant([1]) y = map_call(x) From 83b0c2a225869f61cd420abdb044588bcd2f6696 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 08:37:48 -0700 Subject: [PATCH 0590/1533] Automated g4 rollback of changelist 311477582. PiperOrigin-RevId: 311538137 Change-Id: Id9c4f986f0c5a6408ea60147917fb72977b83efe --- .../api_def_DenseCountSparseOutput.pbtxt | 23 +- .../api_def_RaggedCountSparseOutput.pbtxt | 27 +- .../api_def_SparseCountSparseOutput.pbtxt | 29 ++- tensorflow/core/kernels/count_ops.cc | 246 +++++++----------- tensorflow/core/ops/count_ops.cc | 39 +-- tensorflow/python/ops/bincount.py | 151 +++++++++-- tensorflow/python/ops/bincount_test.py | 188 +++++++++---- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v1/tensorflow.sparse.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v2/tensorflow.sparse.pbtxt | 2 +- 11 files changed, 441 insertions(+), 278 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt index 416da1ccaab..8296bfe6d7b 100644 --- a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt @@ -4,61 +4,62 @@ op { in_arg { name: "values" description: <>; +template +using BatchedMap = std::vector>; namespace { // TODO(momernick): Extend this function to work with outputs of rank > 2. 
-Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, +template +Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, bool is_1d, OpKernelContext* context) { int total_values = 0; int num_batches = per_batch_counts.size(); @@ -44,12 +47,12 @@ Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, context->allocate_output(1, TensorShape({total_values}), &values)); auto output_indices = indices->matrix(); - auto output_values = values->flat(); + auto output_values = values->flat(); int64 value_loc = 0; for (int b = 0; b < num_batches; ++b) { const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); + std::vector> pairs(per_batch_count.begin(), + per_batch_count.end()); std::sort(pairs.begin(), pairs.end()); for (const auto& x : pairs) { if (is_1d) { @@ -77,85 +80,19 @@ Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, return Status::OK(); } -Status OutputWeightedSparse(const BatchedIntMap& per_batch_counts, - int num_values, const Tensor& weights, bool is_1d, - OpKernelContext* context) { - if (!TensorShapeUtils::IsVector(weights.shape())) { - return errors::InvalidArgument( - "Weights must be a 1-dimensional tensor. Got: ", - weights.shape().DebugString()); - } - - if (num_values > weights.dim_size(0)) { - return errors::InvalidArgument("The maximum array value was ", num_values, - ", but the weight array has size ", - weights.shape().DebugString()); - } - auto weight_values = weights.flat(); - - int total_values = 0; - int num_batches = per_batch_counts.size(); - for (const auto& per_batch_count : per_batch_counts) { - total_values += per_batch_count.size(); - } - - Tensor* indices; - int inner_dim = is_1d ? 1 : 2; - TF_RETURN_IF_ERROR(context->allocate_output( - 0, TensorShape({total_values, inner_dim}), &indices)); - - Tensor* values; - TF_RETURN_IF_ERROR( - context->allocate_output(1, TensorShape({total_values}), &values)); - - auto output_indices = indices->matrix(); - auto output_values = values->flat(); - int64 value_loc = 0; - for (int b = 0; b < num_batches; ++b) { - const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); - std::sort(pairs.begin(), pairs.end()); - for (const auto& x : pairs) { - if (is_1d) { - output_indices(value_loc, 0) = x.first; - } else { - output_indices(value_loc, 0) = b; - output_indices(value_loc, 1) = x.first; - } - output_values(value_loc) = x.second * weight_values(x.first); - ++value_loc; - } - } - - Tensor* dense_shape; - if (is_1d) { - TF_RETURN_IF_ERROR( - context->allocate_output(2, TensorShape({1}), &dense_shape)); - dense_shape->flat().data()[0] = num_values; - } else { - TF_RETURN_IF_ERROR( - context->allocate_output(2, TensorShape({2}), &dense_shape)); - dense_shape->flat().data()[0] = num_batches; - dense_shape->flat().data()[1] = num_values; - } - return Status::OK(); -} - -template -T GetOutputSize(T max_seen, T max_length, T min_length) { +int GetOutputSize(int max_seen, int max_length, int min_length) { return max_length > 0 ? 
max_length : std::max((max_seen + 1), min_length); } } // namespace -template +template class DenseCount : public OpKernel { public: explicit DenseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -170,6 +107,15 @@ class DenseCount : public OpKernel { "Input must be a 1 or 2-dimensional tensor. Got: ", data.shape().DebugString())); + if (use_weights) { + OP_REQUIRES( + context, weights.shape() == data.shape(), + errors::InvalidArgument( + "Weights and data must have the same shape. Weight shape: ", + weights.shape().DebugString(), + "; data shape: ", data.shape().DebugString())); + } + bool is_1d = TensorShapeUtils::IsVector(data.shape()); int negative_valued_axis = -1; int num_batch_dimensions = (data.shape().dims() + negative_valued_axis); @@ -179,19 +125,23 @@ class DenseCount : public OpKernel { num_batch_elements *= data.shape().dim_size(i); } int num_value_elements = data.shape().num_elements() / num_batch_elements; - auto per_batch_counts = BatchedIntMap(num_batch_elements); + auto per_batch_counts = BatchedMap(num_batch_elements); + T max_value = 0; const auto data_values = data.flat(); + const auto weight_values = weights.flat(); int i = 0; for (int b = 0; b < num_batch_elements; ++b) { for (int v = 0; v < num_value_elements; ++v) { const auto& value = data_values(i); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[b])[value] = 1; + if (binary_output_) { + per_batch_counts[b][value] = 1; + } else if (use_weights) { + per_batch_counts[b][value] += weight_values(i); } else { - (per_batch_counts[b])[value]++; + per_batch_counts[b][value]++; } if (value > max_value) { max_value = value; @@ -201,30 +151,24 @@ class DenseCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, is_1d, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; }; -template +template class SparseCount : public OpKernel { public: explicit SparseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -235,23 +179,27 @@ class SparseCount : public OpKernel { bool use_weights = weights.NumElements() > 0; bool is_1d = shape.NumElements() == 1; - const auto indices_values = indices.matrix(); - const auto values_values = values.flat(); - int num_batches = is_1d ? 
1 : shape.flat()(0); int num_values = values.NumElements(); - auto per_batch_counts = BatchedIntMap(num_batches); + const auto indices_values = indices.matrix(); + const auto values_values = values.flat(); + const auto weight_values = weights.flat(); + + auto per_batch_counts = BatchedMap(num_batches); + T max_value = 0; for (int idx = 0; idx < num_values; ++idx) { int batch = is_1d ? 0 : indices_values(idx, 0); const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[batch])[value] = 1; + if (binary_output_) { + per_batch_counts[batch][value] = 1; + } else if (use_weights) { + per_batch_counts[batch][value] += weight_values(idx); } else { - (per_batch_counts[batch])[value]++; + per_batch_counts[batch][value]++; } if (value > max_value) { max_value = value; @@ -259,30 +207,25 @@ class SparseCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, is_1d, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; + bool validate_; }; -template +template class RaggedCount : public OpKernel { public: explicit RaggedCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -290,13 +233,15 @@ class RaggedCount : public OpKernel { const Tensor& values = context->input(1); const Tensor& weights = context->input(2); bool use_weights = weights.NumElements() > 0; + bool is_1d = false; const auto splits_values = splits.flat(); const auto values_values = values.flat(); + const auto weight_values = weights.flat(); int num_batches = splits.NumElements() - 1; int num_values = values.NumElements(); - auto per_batch_counts = BatchedIntMap(num_batches); + auto per_batch_counts = BatchedMap(num_batches); T max_value = 0; int batch_idx = 0; @@ -306,10 +251,12 @@ class RaggedCount : public OpKernel { } const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[batch_idx - 1])[value] = 1; + if (binary_output_) { + per_batch_counts[batch_idx - 1][value] = 1; + } else if (use_weights) { + per_batch_counts[batch_idx - 1][value] += weight_values(idx); } else { - (per_batch_counts[batch_idx - 1])[value]++; + per_batch_counts[batch_idx - 1][value]++; } if (value > max_value) { max_value = value; @@ -317,42 +264,47 @@ class RaggedCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, false, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - false, context)); - } + int 
num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; + bool validate_; }; -#define REGISTER(TYPE) \ - \ - REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - DenseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - SparseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - RaggedCount) +#define REGISTER_W(W_TYPE) \ + REGISTER(int32, W_TYPE) \ + REGISTER(int64, W_TYPE) -REGISTER(int32); -REGISTER(int64); +#define REGISTER(I_TYPE, W_TYPE) \ + \ + REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + DenseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + SparseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + RaggedCount) + +TF_CALL_INTEGRAL_TYPES(REGISTER_W); +TF_CALL_float(REGISTER_W); +TF_CALL_double(REGISTER_W); + +#undef REGISTER_W #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/ops/count_ops.cc b/tensorflow/core/ops/count_ops.cc index c9fbe1f8d8e..8de0a2ef954 100644 --- a/tensorflow/core/ops/count_ops.cc +++ b/tensorflow/core/ops/count_ops.cc @@ -19,12 +19,21 @@ limitations under the License. 
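The dense-shape sizing shared by the three kernels above comes from the GetOutputSize helper; restated as a small Python sketch for reference (illustrative only, mirroring the C++ ternary):

def output_size(max_seen, maxlength=-1, minlength=-1):
  # An explicit maxlength bounds the dense shape (values >= maxlength are
  # skipped by the kernels); otherwise the size must cover the largest value
  # seen and be at least minlength.
  return maxlength if maxlength > 0 else max(max_seen + 1, minlength)

assert output_size(7) == 8               # unbounded: largest value + 1
assert output_size(7, maxlength=7) == 7  # capped by maxlength
assert output_size(2, minlength=9) == 9  # padded up to minlength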
namespace tensorflow { -using shape_inference::DimensionHandle; using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; Status DenseCountSparseOutputShapeFn(InferenceContext *c) { - int32 rank = c->Rank(c->input(0)); - DimensionHandle nvals = c->UnknownDim(); + auto values = c->input(0); + auto weights = c->input(1); + ShapeHandle output; + auto num_weights = c->NumElements(weights); + if (c->ValueKnown(num_weights) && c->Value(num_weights) == 0) { + output = values; + } else { + TF_RETURN_IF_ERROR(c->Merge(weights, values, &output)); + } + auto rank = c->Rank(output); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -32,8 +41,8 @@ Status DenseCountSparseOutputShapeFn(InferenceContext *c) { } Status SparseCountSparseOutputShapeFn(InferenceContext *c) { - DimensionHandle rank = c->Dim(c->input(0), 1); - DimensionHandle nvals = c->UnknownDim(); + auto rank = c->Dim(c->input(0), 1); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -45,7 +54,7 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { if (rank != c->kUnknownRank) { ++rank; // Add the ragged dimension } - DimensionHandle nvals = c->UnknownDim(); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -54,12 +63,12 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { REGISTER_OP("DenseCountSparseOutput") .Input("values: T") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(DenseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -69,12 +78,12 @@ REGISTER_OP("SparseCountSparseOutput") .Input("indices: int64") .Input("values: T") .Input("dense_shape: int64") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(SparseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -83,12 +92,12 @@ REGISTER_OP("SparseCountSparseOutput") REGISTER_OP("RaggedCountSparseOutput") .Input("splits: int64") .Input("values: T") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(RaggedCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") diff --git a/tensorflow/python/ops/bincount.py b/tensorflow/python/ops/bincount.py index e1b3bebaaaa..68950eaf596 100644 --- a/tensorflow/python/ops/bincount.py +++ b/tensorflow/python/ops/bincount.py @@ -18,10 +18,10 @@ 
from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import gen_count_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util.tf_export import tf_export @@ -33,7 +33,7 @@ def sparse_bincount(values, axis=0, minlength=None, maxlength=None, - binary_count=False, + binary_output=False, name=None): """Count the number of times an integer value appears in a tensor. @@ -58,8 +58,9 @@ def sparse_bincount(values, maxlength: If given, skips `values` that are greater than or equal to `maxlength`, and ensures that the output has a `dense_shape` of at most `maxlength` in the inner dimension. - binary_count: Whether to do a binary count. When True, this op will return 1 - for any value that exists instead of counting the number of occurrences. + binary_output: If True, this op will output 1 instead of the number of times + a token appears (equivalent to one_hot + reduce_any instead of one_hot + + reduce_add). Defaults to False. name: A name for this op. Returns: @@ -78,7 +79,7 @@ def sparse_bincount(values, SparseTensor) and returns a SparseTensor where the value of (i,j) is the number of times value j appears in batch i. - >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) >>> output = tf.sparse.bincount(data, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( @@ -102,7 +103,7 @@ def sparse_bincount(values, dense shape is [2, 500] instead of [2,10002] or [2, 102]. >>> minlength = maxlength = 500 - >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) >>> output = tf.sparse.bincount( ... data, axis=-1, minlength=minlength, maxlength=maxlength) >>> print(output) @@ -123,8 +124,8 @@ def sparse_bincount(values, some values (like 20 in batch 1 and 11 in batch 2) appear more than once, the 'values' tensor is all 1s. - >>> dense = [[10, 20, 30, 20], [11, 101, 11, 10001]] - >>> output = tf.sparse.bincount(dense, binary_count=True, axis=-1) + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> output = tf.sparse.bincount(data, binary_output=True, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( [[ 0 10] @@ -136,20 +137,42 @@ def sparse_bincount(values, values=tf.Tensor([1 1 1 1 1 1], shape=(6,), dtype=int64), dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) + **Weighted bin-counting** + + This example takes two inputs - a values tensor and a weights tensor. These + tensors must be identically shaped, and have the same row splits or indices + in the case of RaggedTensors or SparseTensors. When performing a weighted + count, the op will output a SparseTensor where the value of (i, j) is the + sum of the values in the weight tensor's batch i in the locations where + the values tensor has the value j. In this case, the output dtype is the + same as the dtype of the weights tensor. 
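To illustrate the binary_output note above (one_hot + reduce_any rather than one_hot + reduce_add), a short sketch assuming the tf.sparse.bincount wrapper defined in this file:

import numpy as np
import tensorflow as tf

data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64)

counts = tf.sparse.bincount(data, axis=-1)                        # multiplicities
presence = tf.sparse.bincount(data, binary_output=True, axis=-1)  # 0/1 presence

# Both results share the same indices and dense_shape; counts.values records
# how often each value occurs per row (e.g. 2 for the value 20 in row 0),
# while presence.values is all ones.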
+ + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> weights = [[2, 0.25, 15, 0.5], [2, 17, 3, 0.9]] + >>> output = tf.sparse.bincount(data, weights=weights, axis=-1) + >>> print(output) + SparseTensor(indices=tf.Tensor( + [[ 0 10] + [ 0 20] + [ 0 30] + [ 1 11] + [ 1 101] + [ 1 10001]], shape=(6, 2), dtype=int64), + values=tf.Tensor([2. 0.75 15. 5. 17. 0.9], shape=(6,), dtype=float32), + dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) + """ with ops.name_scope(name, "count", [values, weights]): if not isinstance(values, sparse_tensor.SparseTensor): values = ragged_tensor.convert_to_tensor_or_ragged_tensor( values, name="values") + if weights is not None: + if not isinstance(weights, sparse_tensor.SparseTensor): + weights = ragged_tensor.convert_to_tensor_or_ragged_tensor( + weights, name="weights") - if weights is not None and binary_count: - raise ValueError("binary_count and weights are mutually exclusive.") - - if weights is None: - weights = [] - output_type = dtypes.int64 - else: - output_type = dtypes.float32 + if weights is not None and binary_output: + raise ValueError("binary_output and weights are mutually exclusive.") if axis is None: axis = 0 @@ -162,38 +185,114 @@ def sparse_bincount(values, maxlength_value = maxlength if maxlength is not None else -1 if axis == 0: - if isinstance(values, - (sparse_tensor.SparseTensor, ragged_tensor.RaggedTensor)): + if isinstance(values, sparse_tensor.SparseTensor): + if weights is not None: + weights = validate_sparse_weights(values, weights) + values = values.values + elif isinstance(values, ragged_tensor.RaggedTensor): + if weights is not None: + weights = validate_ragged_weights(values, weights) values = values.values else: + if weights is not None: + weights = array_ops.reshape(weights, [-1]) values = array_ops.reshape(values, [-1]) if isinstance(values, sparse_tensor.SparseTensor): + weights = validate_sparse_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.sparse_count_sparse_output( values.indices, values.values, values.dense_shape, - weights=weights, + weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) elif isinstance(values, ragged_tensor.RaggedTensor): + weights = validate_ragged_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.ragged_count_sparse_output( values.row_splits, values.values, - weights=weights, + weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) else: + weights = validate_dense_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.dense_count_sparse_output( values, weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) return sparse_tensor.SparseTensor(c_ind, c_val, c_shape) + + +def validate_dense_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.dtype) + + if not isinstance(weights, ops.Tensor): + raise ValueError( + "`weights` must be a tf.Tensor if `values` is a tf.Tensor.") + + return weights + + +def validate_sparse_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.values.dtype) + + if not isinstance(weights, 
sparse_tensor.SparseTensor): + raise ValueError( + "`weights` must be a SparseTensor if `values` is a SparseTensor.") + + checks = [] + if weights.dense_shape is not values.dense_shape: + checks.append( + check_ops.assert_equal( + weights.dense_shape, + values.dense_shape, + message="'weights' and 'values' must have the same dense shape.")) + if weights.indices is not values.indices: + checks.append( + check_ops.assert_equal( + weights.indices, + values.indices, + message="'weights' and 'values' must have the same indices.") + ) + if checks: + with ops.control_dependencies(checks): + weights = array_ops.identity(weights.values) + else: + weights = weights.values + + return weights + + +def validate_ragged_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.values.dtype) + + if not isinstance(weights, ragged_tensor.RaggedTensor): + raise ValueError( + "`weights` must be a RaggedTensor if `values` is a RaggedTensor.") + + checks = [] + if weights.row_splits is not values.row_splits: + checks.append( + check_ops.assert_equal( + weights.row_splits, + values.row_splits, + message="'weights' and 'values' must have the same row splits.")) + if checks: + with ops.control_dependencies(checks): + weights = array_ops.identity(weights.values) + else: + weights = weights.values + + return weights diff --git a/tensorflow/python/ops/bincount_test.py b/tensorflow/python/ops/bincount_test.py index 776b65b72d0..839af8dcc35 100644 --- a/tensorflow/python/ops/bincount_test.py +++ b/tensorflow/python/ops/bincount_test.py @@ -21,6 +21,8 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np +from tensorflow.python.eager import context +from tensorflow.python.framework import errors from tensorflow.python.ops import bincount from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_factory_ops @@ -65,7 +67,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 6], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_maxlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -73,7 +75,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 7], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -82,7 +84,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 9], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -91,40 +93,40 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 8], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_no_maxlength_weights", "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], - "expected_values": [1, 2, 3, 8, 5], + "expected_values": [2, 1, 0.5, 9, 3], "expected_shape": [2, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": 
[[0.5, 1, 2], [3, 4, 5]] }, { "testcase_name": "_maxlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "maxlength": 7, "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], - "expected_values": [1, 2, 3, 0.5, 8], + "expected_values": [2, 1, 0.5, 3, 9], "expected_shape": [2, 7], - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": [[0.5, 1, 2, 11], [7, 3, 4, 5]] }, { "testcase_name": "_minlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 9, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [1, 2, 3, 7, 0.5, 8, 7], + "expected_values": [2, 1, 0.5, 3, 5, 13, 4], "expected_shape": [2, 9], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] }, { "testcase_name": "_minlength_larger_values_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 3, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [1, 2, 3, 7, 0.5, 8, 7], + "expected_values": [2, 1, 0.5, 3, 5, 13, 4], "expected_shape": [2, 8], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] }, { "testcase_name": "_1d", "x": np.array([3, 2, 1, 1], dtype=np.int32), @@ -146,7 +148,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, minlength=None, maxlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): y = bincount.sparse_bincount( @@ -154,7 +156,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): weights=weights, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -216,7 +218,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], "expected_values": [1, 1, 1, 1], "expected_shape": [3, 6], - "binary_count": + "binary_output": True, }, { @@ -230,7 +232,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 7], "maxlength": 7, - "binary_count": + "binary_output": True, }, { @@ -244,7 +246,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 9], "minlength": 9, - "binary_count": + "binary_output": True, }, { @@ -258,7 +260,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 8], "minlength": 3, - "binary_count": + "binary_output": True, }, { @@ -268,9 +270,10 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [1, 3, 8, 5], + "expected_values": [2, 6, 7, 10], "expected_shape": [3, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": + np.array([[6, 0, 2, 0], [0, 0, 0, 0], [10, 0, 3.5, 3.5]]), }, { "testcase_name": @@ -279,11 +282,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 7, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [1, 3, 8, 5], + "expected_values": [2, 6, 7, 10], "expected_shape": [3, 7], "maxlength": 7, - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": + np.array([[6, 0, 2, 0], [0, 0, 14, 0], [10, 0, 3.5, 3.5]]), }, { "testcase_name": @@ -292,11 +296,12 @@ class TestSparseCount(test.TestCase, 
parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [1, 3, 7, 8, 5], + "expected_values": [2, 6, 14, 6.5, 10], "expected_shape": [3, 9], "minlength": 9, - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": + np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), }, { "testcase_name": @@ -305,11 +310,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [1, 3, 7, 8, 5], + "expected_values": [2, 6, 14, 6.5, 10], "expected_shape": [3, 8], "minlength": 3, - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": + np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), }, { "testcase_name": "_1d", @@ -338,16 +344,17 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): x_sparse = sparse_ops.from_dense(x) + w_sparse = sparse_ops.from_dense(weights) if weights is not None else None y = bincount.sparse_bincount( x_sparse, - weights=weights, + weights=w_sparse, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -393,7 +400,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 6], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_maxlength_binary", @@ -402,7 +409,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 7], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_binary", @@ -412,13 +419,13 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [5, 9], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "minlength": 3, - "binary_count": True, + "binary_output": True, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], @@ -428,18 +435,18 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "testcase_name": "_no_maxlength_weights", "x": [[], [], [3, 0, 1], [], [5, 0, 4, 4]], "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 0.25, 8, 10], "expected_shape": [5, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": [[], [], [6, 0.5, 2], [], [10, 0.25, 5, 3]], }, { "testcase_name": "_maxlength_weights", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "maxlength": 7, "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 0.25, 8, 10], "expected_shape": [5, 7], - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_minlength_weights", @@ -447,9 +454,9 @@ class 
TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 9, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], "expected_shape": [5, 9], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_minlength_larger_values_weights", @@ -457,9 +464,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 3, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], "expected_shape": [5, 8], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_1d", @@ -484,21 +491,114 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): x_ragged = ragged_factory_ops.constant(x) + w = ragged_factory_ops.constant(weights) if weights is not None else None y = bincount.sparse_bincount( x_ragged, - weights=weights, + weights=w, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) self.assertAllEqual(expected_shape, y.dense_shape) +class TestSparseCountFailureModes(test.TestCase): + + def test_dense_input_sparse_weights_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_dense_input_ragged_weights_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_dense_input_wrong_shape_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = np.array([[3, 2], [5, 4], [4, 3]]) + # Note: Eager mode and graph mode throw different errors here. Graph mode + # will fail with a ValueError from the shape checking logic, while Eager + # will fail with an InvalidArgumentError from the kernel itself. 
+ if context.executing_eagerly(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same shape"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + else: + with self.assertRaisesRegexp(ValueError, "both shapes must be equal"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_dense_weights_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_ragged_weights_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_wrong_indices_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 1, 0, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same indices"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_too_many_indices_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 1, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Incompatible shapes"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_wrong_shape_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4], [0, 0, 0, 0]], + dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same dense shape"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_dense_weights_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_sparse_weights_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_different_shape_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same row splits"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt 
b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 05b8842be66..44fb74ac63a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt index 4c4f6c62291..f8f8edb26a8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt @@ -14,7 +14,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 05b8842be66..44fb74ac63a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', 
\'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt index a9ad81920dd..67235bb2cf2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt @@ -10,7 +10,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" From 8e8c67c3375da3fe8b44e7c11eb1d3fbb2eaa41c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 08:44:49 -0700 Subject: [PATCH 0591/1533] Comment typo fix. PiperOrigin-RevId: 311539306 Change-Id: Ieb8cf58b706e822177269b00a1a0ba58f0a97067 --- tensorflow/python/ops/tensor_array_ops.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py index e8ea9ff4e4d..d386d14b64a 100644 --- a/tensorflow/python/ops/tensor_array_ops.py +++ b/tensorflow/python/ops/tensor_array_ops.py @@ -1122,7 +1122,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the control dependencies from the contexts will become control dependencies for writes, reads, etc. - Use this object all for subsequent operations. + Use this object for all subsequent operations. """ return self._implementation.identity() @@ -1152,7 +1152,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the write occurs. - Use this object all for subsequent operations. + Use this object for all subsequent operations. Raises: ValueError: if there are more writers than specified. @@ -1217,7 +1217,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the unstack occurs. - Use this object all for subsequent operations. + Use this object for all subsequent operations. Raises: ValueError: if the shape inference fails. @@ -1236,7 +1236,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the scatter occurs. - Use this object all for subsequent operations. + Use this object for all subsequent operations. Raises: ValueError: if the shape inference fails. 
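The docstring wording corrected in the surrounding hunks reflects TensorArray's functional style: each mutating method returns a new TensorArray whose flow must be threaded into later operations. A minimal illustrative sketch of that usage (not part of the patch; assumes an ordinary TF 2.x install):

import tensorflow as tf

ta = tf.TensorArray(tf.float32, size=2)
# Each write() returns a new TensorArray; keep using the returned object so the
# flow dependency carries the write forward (discarding it can lose the write
# when the code is traced into a graph).
ta = ta.write(0, 10.0)
ta = ta.write(1, 20.0)
print(ta.stack())  # tf.Tensor([10. 20.], shape=(2,), dtype=float32)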
@@ -1255,7 +1255,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the split occurs. - Use this object all for subsequent operations. + Use this object for all subsequent operations. Raises: ValueError: if the shape inference fails. From e033fd5b33e5f3cfb7b075715e6d38c3de2383fd Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Thu, 14 May 2020 08:44:55 -0700 Subject: [PATCH 0592/1533] [TF MLIR SI] Don't constant fold, only consider result of folding This results in fewer changes to the module during shape inference (e.g., only shapes are changed, no constant nodes are created). Effectively this computes the folded result and then just uses that information locally, which is conceptually more wasteful (a subsequent canonicalize pass may need to recompute these results) but is less surprising and avoids dropping attributes during this part. There are still additional changes needed to avoid doing needless computations here; this change mostly focuses on decreasing graph mutations. PiperOrigin-RevId: 311539328 Change-Id: Ib6daa331c1e18a6d23463aa945c87e59d253708b --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 10 +- .../tensorflow/tests/shape_inference.mlir | 31 ++-- .../tensorflow/transforms/shape_inference.cc | 145 +++++++++++++----- 3 files changed, 137 insertions(+), 49 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 2007824369c..b21fef32cca 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -3551,12 +3551,20 @@ OpFoldResult FoldIdentityTranspose(TransposeOp op) { if (!const_perm) return {}; auto const_value = const_perm.value(); - const auto &elements = const_value.getValues(); + const auto elements = const_value.getValues(); for (auto it : llvm::enumerate(elements)) { if (it.index() != it.value()) return {}; } + // TODO(jpienaar): Remove when we handle this more generally. + if (op.getType() != op.x().getType()) { + // If the types don't match then only fold if all the operands are in the TF + // dialect.
+ for (auto user : op.getOperation()->getUsers()) + if (user->getDialect() != op.getDialect()) return {}; + } + return op.x(); } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 160bba94cfc..cfe8db9025e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -3,8 +3,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { // CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<*xi32> { - // CHECK-NOT: tf.Cast - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2" + // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: return %[[RESULT]] : tensor<1xi32> %0 = "tf.Cast"(%arg0) : (tensor<1xi32>) -> tensor<*xi32> %1 = "tf.Cast"(%arg1) : (tensor<1xi32>) -> tensor<*xi32> @@ -60,8 +60,8 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @simple_folding func @simple_folding(%arg0: tensor<1x1x1x1xi32>, %arg1: tensor<1x1x1x1xf32>) -> tensor { -// CHECK: %[[CST:.*]] = "tf.Const"{{.*}} {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[CST]] +// CHECK: %[[SHAPE:.*]] = "tf.Shape" +// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] // CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> // CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> %0 = "tf.Shape"(%arg0) : (tensor<1x1x1x1xi32>) -> tensor<4xi32> @@ -300,13 +300,6 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor<*xi32> } - // CHECK-LABEL: func @fold_cast - func @fold_cast(%arg0: tensor<*xf32>) -> tensor<*xf32> { - // CHECK-NOT: Cast - %0 = "tf.Cast"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) - return %0 : tensor<*xf32> - } - // CHECK-LABEL: func @while_variant // CHECK-SAME: -> tensor>> func @while_variant(%arg0: tensor>>) -> tensor { @@ -362,8 +355,6 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @partitioned_call_func_const func @partitioned_call_func_const(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK: return %[[CONST]] return %arg0 : tensor<2xi32> } @@ -410,4 +401,18 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { %40 = "tf.Reshape"(%39, %19) {T = f32, Tshape = i32, device = ""} : (tensor<1x4x4x32xf32>, tensor<2xi32>) -> tensor return } + + // CHECK-LABEL: const_fold + func @const_fold() -> () { + // CHECK: tf.Const + // CHECK-SAME: () -> tensor<4xi32> + %0 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> + // CHECK: tf.Const + // CHECK-SAME: () -> tensor<4xi32> + %1 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> + // CHECK: tf.Add + // CHECK-SAME: (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 5a2cae38062..6a63e83be0f 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -430,6 +430,7 @@ LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, Attribute ComputeOutputComponent(const ValuePort& value_port, ValueQueryFn values) { LLVM_DEBUG(value_port.print(llvm::errs() << "\nComputing output for ")); + if (auto known = values(value_port)) return known; auto op = value_port.producer.dyn_cast(); if (!op) return nullptr; @@ -454,6 +455,7 @@ Attribute ComputeOutputComponent(const ValuePort& value_port, ValuePort op_port(op->getOperand(port[1])); return values(op_port); } + return nullptr; } @@ -475,8 +477,11 @@ class ShapeInference { } Attribute ComputeOutputComponent(const ValuePort& value_port) { - return ::mlir::TF::ComputeOutputComponent( + if (auto known_attr = results_[value_port]) return known_attr; + auto attr = ::mlir::TF::ComputeOutputComponent( value_port, [this](const ValuePort& port) { return results_[port]; }); + RecordValue(value_port, attr); + return attr; } // Returns ShapeHandle if the op result could be computed as shape. @@ -520,19 +525,35 @@ class ShapeInference { LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, int64_t max_iteration); + // Propagates any constant operand of call_op to the called function body's + // corresponding argument if the callee has only one use. + // + // TODO(b/154065712): Move this to a more general inter-procedural constant + // folding pass. + void PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module); + + // Propagates any constant return value of the callee function to the call + // op's corresponding result. + void PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module); + + // Tries to compute the result of folding the op. This doesn't actually + // perform constant folding, it is just computes the equivalent constants. + // Returns whether it was able to compute constant values. + LogicalResult TryToFold(Operation* op); + private: // Mapping between ValuePort (which corresponds to an OpResult or smaller, // e.g., first element of OpResult produded) to an Attribute if the ValuePort // corresponds to a constant value. ValuePortResultMap results_; int64_t graph_version_; - MLIRContext* context_; Dialect* tf_dialect_; }; ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context) : graph_version_(graph_version) { - context_ = context; tf_dialect_ = context->getRegisteredDialect(); } @@ -581,7 +602,6 @@ ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, auto ret = ComputeOutputComponent(front); if (!ret) continue; - RecordValue(front, ret); LLVM_DEBUG(ret.print(llvm::dbgs() << "\ncomputed result = ")); // If worklist is empty, then this is the root query op. @@ -686,10 +706,14 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { size_t index = it.index(); // If the operand is constant, then convert it to Tensor. 
- ElementsAttr attr; - if (matchPattern(operand, m_Constant(&attr))) { + ValuePort vp(operand); + Attribute attr = ComputeOutputComponent(vp); + if (!attr && matchPattern(operand, m_Constant(&attr))) + RecordValue(vp, attr); + if (attr) { tensorflow::Tensor* input_tensor = &tensors[index]; - auto status = tensorflow::ConvertToTensor(attr, input_tensor); + auto status = + tensorflow::ConvertToTensor(attr.cast(), input_tensor); if (status.ok()) { input_tensors[index] = input_tensor; } else { @@ -865,13 +889,9 @@ LogicalResult ShapeInference::PropagateShapeToFunctions( return success(all_succeeded); } -// If the callee has only one use, propagates any constant operand of call_op to -// the called function body's corresponding argument. -// -// TODO(b/154065712): Move this to a more general inter-procedural constant -// folding pass. -void PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module) { +void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, + ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); @@ -879,31 +899,29 @@ void PropagateConstantToCallee(CallOpInterface call_op, Operation* op = call_op.getOperation(); if (num_uses == 1) { // If this is the only caller, and an operand is a constant, propagate - // the constant inside the function. + // the constant value inside the function. for (auto arg : func.getArguments()) { - auto operand = op->getOperand(arg.getArgNumber()).getDefiningOp(); - if (isa_and_nonnull(operand)) { - arg.replaceAllUsesWith(builder.clone(*operand)->getResult(0)); - } + auto operand = op->getOperand(arg.getArgNumber()); + if (auto known_constant = ComputeOutputComponent(ValuePort(operand))) + RecordValue(ValuePort(arg), known_constant); } } } -// Propagates any constant return value of the callee function to the call op's -// corresponding result. -void PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module) { +void ShapeInference::PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, + ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); - // If the return value is a constant, replace the call result with a constant. + // If the return value is a constant, use the constant as the value of + // the call return. Operation* op = call_op.getOperation(); OpBuilder builder(op); builder.setInsertionPointAfter(op); for (auto retval : llvm::enumerate(func.front().getTerminator()->getOperands())) { - auto retval_op = retval.value().getDefiningOp(); - if (isa_and_nonnull(retval_op)) { - op->getResult(retval.index()) - .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); + ValuePort vp(retval.value()); + if (auto known_constant = ComputeOutputComponent(vp)) { + RecordValue(ValuePort(op->getResult(retval.index())), known_constant); } } } @@ -938,10 +956,68 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( return success(); } +LogicalResult ShapeInference::TryToFold(Operation* op) { + // If any output result is known, then the op probably has been computed + // before. 
+ if (op->getNumResults() > 0 && results_[ValuePort(op->getResult(0))]) + return success(); + + SmallVector constant_operands(op->getNumOperands()); + SmallVector fold_results; + + // Check to see if any operands to the operation is constant and whether + // the operation knows how to constant fold itself. + bool some_unknown = false; + for (int i = 0, e = op->getNumOperands(); i != e; ++i) { + if (!(constant_operands[i] = + ComputeOutputComponent(ValuePort(op->getOperand(i))))) + some_unknown = true; + } + + // Attempt to constant fold the operation. + auto* abstract_op = op->getAbstractOperation(); + if (abstract_op) { + if (failed(abstract_op->foldHook(op, constant_operands, fold_results))) + return failure(); + } else { + Dialect* dialect = op->getDialect(); + if (!dialect) return failure(); + // Only attempt TF dialect fallback if there are no unknown operands. + if (some_unknown && dialect == tf_dialect_) return failure(); + SmallVector constants; + if (failed(dialect->constantFoldHook(op, constant_operands, constants))) + return failure(); + fold_results.assign(constants.begin(), constants.end()); + } + + for (auto result : zip(op->getResults(), fold_results)) { + auto fold_result = std::get<1>(result); + Attribute attr = nullptr; + if ((attr = fold_result.dyn_cast())) { + RecordValue(ValuePort(std::get<0>(result)), attr); + } else { + auto value = fold_result.get(); + if ((attr = ComputeOutputComponent(ValuePort(value)))) + RecordValue(ValuePort(std::get<0>(result)), attr); + } + + if (ElementsAttr eattr = attr.dyn_cast_or_null()) { + if (std::get<0>(result).getType() == eattr.getType()) continue; + + // Inserts a cast back to the original type if any user is not in the + // TF dialect. + Type old_type = std::get<0>(result).getType(); + std::get<0>(result).setType(eattr.getType()); + AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), tf_dialect_, + old_type); + } + } + + return success(); +} + LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, int64_t max_iteration) { - // An operation folder that is used to attempt folding before inference._ - OperationFolder folder(context_); bool changed = true; // TODO(aminim): we could have a more efficient traversal by guiding the @@ -955,9 +1031,7 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, region->walk([&](Operation* op) { if (auto infer_ti = dyn_cast(op)) { changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect_); - // TODO(jpienaar): Debug why we can't just return here. We end up with - // additional constant due to the propagation of constant into attached - // function if we return already. + return; } if (op->getDialect() != tf_dialect_) { @@ -965,8 +1039,9 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, return; } - // Before attempting inference, just try to fold the operation. - if (succeeded(folder.tryToFold(op))) return; + // Before attempting inference, just try to compute the folded + // value/shape. + if (succeeded(TryToFold(op))) return; // Best-effort shape inference in attached functions. Do not return // failure even if it doesn't get to fixed point. From 8565ed2eed43057d4f880a0594100108df438d85 Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Thu, 14 May 2020 08:55:13 -0700 Subject: [PATCH 0593/1533] Removing TensorHandleList Delete APIs, since TensorHandleList pointer is owned by ConcreteFunction. 
PiperOrigin-RevId: 311541122 Change-Id: I0a538b3452c62ee021cf7a41257cbcf580c0d3f2 --- .../c/experimental/saved_model/internal/tensorhandle_list.cc | 3 --- .../c/experimental/saved_model/public/tensorhandle_list.h | 4 ---- 2 files changed, 7 deletions(-) diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc index 6ef937591aa..7d018658101 100644 --- a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc @@ -32,8 +32,5 @@ TFE_TensorHandle* TF_TensorHandleListGet(const TF_TensorHandleList* list, return tensorflow::wrap((*tensorflow::unwrap(list))[i]); } -void TF_DeleteTensorHandleList(const TF_TensorHandleList* list) { - delete tensorflow::unwrap(list); -} } // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h index 393708aa2bf..a1e88db3474 100644 --- a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h +++ b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h @@ -36,10 +36,6 @@ TF_CAPI_EXPORT extern size_t TF_TensorHandleListSize( TF_CAPI_EXPORT extern TFE_TensorHandle* TF_TensorHandleListGet( const TF_TensorHandleList* list, int i); -// Deletes `list`. -TF_CAPI_EXPORT extern void TF_DeleteTensorHandleList( - const TF_TensorHandleList* list); - #ifdef __cplusplus } // end extern "C" #endif // __cplusplus From ffef54602d33f3b23ce21a0d421efde05efe7cef Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 18:21:13 +0200 Subject: [PATCH 0594/1533] Fix missing self Add initial autograph wrapping in map_fn --- tensorflow/python/kernel_tests/map_fn_test.py | 2 +- tensorflow/python/ops/map_fn.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 7bf793c1e20..1859c6c5873 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -187,7 +187,7 @@ class MapFnTest(test.TestCase): self.assertAllEqual(nums, received[2]) @test_util.run_in_graph_and_eager_modes - def testMap_autograph_indirect(): + def testMap_autograph_indirect(self): def test_function(x): cond = tf.constant(-1) if cond == 0: diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 2c9c678336e..dfe32998282 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,6 +39,12 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export +autograph_ctx = lazy_loader.LazyLoader( + "autograph_ctx", globals(), + "tensorflow.python.autograph.core.ag_ctx") +autograph = lazy_loader.LazyLoader( + "autograph", globals(), + "tensorflow.python.autograph.impl.api") @tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") @@ -477,7 +483,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - result_value = fn(elems_value) + ag_ctx = autograph_ctx.control_status_ctx() + result_value = autograph.tf_convert(elems_value, ag_ctx) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable 
= _result_value_flat_to_batchable( From 5d3c548620a5e23ba765cd8d7a09feaa08e9b056 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 09:35:50 -0700 Subject: [PATCH 0595/1533] Resolve trivial aliases for portable TensorFlow targets. PiperOrigin-RevId: 311548335 Change-Id: I837aa5a62500682783607841f0c993c2b6c238ed --- tensorflow/c/BUILD | 20 ++++++++--------- tensorflow/c/eager/BUILD | 4 ++-- tensorflow/cc/saved_model/BUILD | 2 +- tensorflow/compiler/jit/BUILD | 2 +- tensorflow/core/common_runtime/eager/BUILD | 18 +++++++-------- tensorflow/core/kernels/BUILD | 26 +++++++++++----------- tensorflow/examples/label_image/BUILD | 2 +- tensorflow/java/src/main/native/BUILD | 2 +- tensorflow/lite/delegates/flex/BUILD | 14 ++++++------ tensorflow/lite/testing/BUILD | 12 +++++----- tensorflow/lite/testing/kernel_test/BUILD | 2 +- tensorflow/tools/benchmark/BUILD | 4 ++-- 12 files changed, 54 insertions(+), 54 deletions(-) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 66ade5c7bd4..7fb02028837 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -85,7 +85,7 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:chromiumos": [ ":tf_attrtype", @@ -182,7 +182,7 @@ tf_cuda_library( ":tf_status_internal", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tf_status", @@ -219,7 +219,7 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:lib", @@ -234,7 +234,7 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tf_status_internal", @@ -272,7 +272,7 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:framework", @@ -288,7 +288,7 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tensor_interface", @@ -313,7 +313,7 @@ tf_cuda_library( visibility = ["//tensorflow:internal"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tensor_interface", @@ -426,7 +426,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:framework", @@ -457,7 +457,7 @@ tf_cuda_library( ] + select({ "//tensorflow:android": [ ":c_api_internal", - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":c_api_internal", @@ -484,7 +484,7 @@ tf_cuda_library( ":tf_status_helper", ] + 
select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:framework", diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 69808f6f49f..fe4d5ac6ffe 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -35,7 +35,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":context_interface", @@ -412,7 +412,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":c_api", diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index a20cc9c9945..b13d8db48a9 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -84,7 +84,7 @@ cc_library( "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", ]) + if_android([ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ]), ) diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 28d922f9e3c..bc8fac0e88f 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -251,7 +251,7 @@ cc_library( visibility = [":friends"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/core:graph", diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index 2b2313d91ff..625468b39d5 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -47,7 +47,7 @@ tf_cuda_library( visibility = ["//tensorflow:internal"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:core_cpu_lib", @@ -83,7 +83,7 @@ tf_cuda_library( "//tensorflow/core/distributed_runtime:worker_env", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "@com_google_absl//absl/types:optional", @@ -147,7 +147,7 @@ tf_cuda_library( "//tensorflow/core/platform:platform_port", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:core_cpu_lib", @@ -181,7 +181,7 @@ tf_cuda_library( ":eager_executor", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "@com_google_absl//absl/types:variant", @@ -207,7 +207,7 @@ tf_cuda_library( ":tensor_handle_data", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "@com_google_absl//absl/strings", @@ -312,7 +312,7 @@ tf_cuda_library( "@farmhash_archive//:farmhash", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + 
"//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:windows": KERNEL_AND_DEVICE_DEPS, "//conditions:default": KERNEL_AND_DEVICE_DEPS + [ @@ -381,7 +381,7 @@ cc_library( "//tensorflow/core/profiler/lib:traceme", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core/distributed_runtime/eager:remote_mgr", @@ -498,7 +498,7 @@ cc_library( "//tensorflow/core/profiler/lib:traceme", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core/distributed_runtime/eager:remote_mgr", @@ -527,7 +527,7 @@ tf_cuda_library( "@farmhash_archive//:farmhash", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:core_cpu", diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 6cb8704f494..788924e8b37 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -4021,7 +4021,7 @@ cc_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//third_party/eigen3", @@ -4046,7 +4046,7 @@ cc_library( ":eigen_spatial_convolutions-inl", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:framework", @@ -4062,7 +4062,7 @@ cc_library( deps = select({ "//tensorflow:android": [ ":conv_3d_mobile", - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ ":conv_3d", @@ -7270,8 +7270,8 @@ tf_cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ ":quantized_ops", @@ -7331,8 +7331,8 @@ cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ ":ops_util", @@ -7416,8 +7416,8 @@ cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ ":ops_testutil", @@ -7603,8 +7603,8 @@ cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ 
":ops_util", @@ -7829,8 +7829,8 @@ cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:framework", diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD index 162a44ac109..a0e5005d45a 100644 --- a/tensorflow/examples/label_image/BUILD +++ b/tensorflow/examples/label_image/BUILD @@ -35,7 +35,7 @@ tf_cc_binary( # cc:cc_ops is used to include image ops (for label_image) # Jpg, gif, and png related code won't be included "//tensorflow/cc:cc_ops", - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", # cc:android_tensorflow_image_op is for including jpeg/gif/png # decoder to enable real-image evaluation on Android "//tensorflow/core/kernels:android_tensorflow_image_op", diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD index 0b363ff577e..e38e58d6fe6 100644 --- a/tensorflow/java/src/main/native/BUILD +++ b/tensorflow/java/src/main/native/BUILD @@ -30,7 +30,7 @@ tf_cuda_library( }), deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/c:c_api", diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index d69d2207e63..98314fdc1b8 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -23,7 +23,7 @@ cc_library( "//tensorflow/lite:string_util", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -63,7 +63,7 @@ cc_library( ":delegate_only_runtime", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -100,7 +100,7 @@ cc_library( "//tensorflow/lite:util", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -134,7 +134,7 @@ cc_library( "@com_google_absl//absl/memory", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -180,7 +180,7 @@ cc_library( # set of core TensorFlow kernels. We may want to revisit this dependency # to allow selective registration via build targets. 
"//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -208,7 +208,7 @@ tf_cc_test( "@com_google_googletest//:gtest", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -242,7 +242,7 @@ cc_library( "//tensorflow/lite:kernel_api", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD index df85f659bf3..379230b3a4b 100644 --- a/tensorflow/lite/testing/BUILD +++ b/tensorflow/lite/testing/BUILD @@ -68,8 +68,8 @@ exports_files([ "//tensorflow/core:test", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], }), ) for conversion_mode, test_name, tags, args in generated_test_models_all() + merged_test_models()] @@ -326,7 +326,7 @@ cc_library( "//tensorflow/core:tensorflow", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -365,7 +365,7 @@ cc_library( "//tensorflow/core:framework", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -405,7 +405,7 @@ cc_library( "//tensorflow/core:lib", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -440,7 +440,7 @@ cc_library( "//tensorflow/core:lib", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", diff --git a/tensorflow/lite/testing/kernel_test/BUILD b/tensorflow/lite/testing/kernel_test/BUILD index 5180f2f4e5a..76333c76259 100644 --- a/tensorflow/lite/testing/kernel_test/BUILD +++ b/tensorflow/lite/testing/kernel_test/BUILD @@ -25,7 +25,7 @@ cc_library( "//tensorflow/core:lib", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD index 93b408d522e..674133431f1 100644 --- a/tensorflow/tools/benchmark/BUILD +++ b/tensorflow/tools/benchmark/BUILD @@ -28,8 +28,8 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:core_cpu", From 866e01f318188f15c00d77c2efb219a2c50eb96b Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Thu, 14 May 2020 09:55:16 -0700 Subject: [PATCH 0596/1533] [XLA:Python] Cache the backend in xla_client_test. 
This is in preparation for removing backend caching logic from xla_client. PiperOrigin-RevId: 311551914 Change-Id: Ia791dc911bd7d9890dec111b8da69a9c619f061c --- tensorflow/compiler/xla/python/xla_client_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 62b3fae018a..fbdd9921a40 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -2029,8 +2029,11 @@ def TestFactory(xla_backend, cloud_tpu=False): return tests -def InstantiateTests(globals_dict, backend, test_prefix="", **kw): - for klass in TestFactory(backend, **kw): +def InstantiateTests(globals_dict, backend_fn, test_prefix="", **kw): + # Avoid creating a new backend per test (this causes GPU OOM, and is probably + # inefficient). + backend_fn = functools.lru_cache(maxsize=None)(backend_fn) + for klass in TestFactory(backend_fn, **kw): test = type(test_prefix + klass.__name__, (klass,), {}) # Clean up the qualified names of the tests to not include the test factory. test.__qualname__ = test.__name__ From 9dd3efb5aa3bacba8c66042ff975a3b9d4d30f95 Mon Sep 17 00:00:00 2001 From: Thomas Joerg Date: Thu, 14 May 2020 10:06:52 -0700 Subject: [PATCH 0597/1533] Do not silently ignore ptxas compilation failures. Change the xla_gpu_unsafe_fallback_to_driver_on_ptxas_error default to false. PiperOrigin-RevId: 311554370 Change-Id: I9a7f9ff114957998a84136e16333addf4a2cd354 --- tensorflow/compiler/xla/debug_options_flags.cc | 2 +- tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 216fb0a7422..60a563ee956 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -66,7 +66,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_enable_xprof_traceme(true); // TODO(b/155295372): disable ptxas fallback by default. opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(true); - opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(true); + opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(false); return opts; } diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 0196267d904..7ff8d40b440 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -416,11 +416,12 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( CHECK(hlo_module_config.debug_options() .xla_gpu_unsafe_fallback_to_driver_on_ptxas_error()) << "There was an error when trying to compile ptx into sass " - "code. If you want to try falling back to the GPU driver to " - "jit compile ptx, you can use the flag " - "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_error." - " Use at your own risk though, it has known drawbacks like " - "increased memory consumption."; + "code. Up until May 14 2020, XLA silently ignored such " + "errors and fell back to the GPU driver. This is likely to " + "trigger subtle runtime issues and is hence discouraged. 
" + "If you want to temporarily restore this behavior use the " + "flag --xla_gpu_unsafe_fallback_to_driver_on_ptxas_error " + "and file a bug in b/components/366096."; } // We're going to use the driver to JIT our PTX->SASS, so warn if From c3d351abd20a814e7a8eae4e3d951b18667cbac8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 10:24:35 -0700 Subject: [PATCH 0598/1533] Internal change PiperOrigin-RevId: 311558265 Change-Id: Ib91edbfdbd7d3442c72401a794283518393bc64d --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 10 +- .../tensorflow/tests/shape_inference.mlir | 31 ++-- .../tensorflow/transforms/shape_inference.cc | 145 +++++------------- 3 files changed, 49 insertions(+), 137 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index b21fef32cca..2007824369c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -3551,20 +3551,12 @@ OpFoldResult FoldIdentityTranspose(TransposeOp op) { if (!const_perm) return {}; auto const_value = const_perm.value(); - const auto elements = const_value.getValues(); + const auto &elements = const_value.getValues(); for (auto it : llvm::enumerate(elements)) { if (it.index() != it.value()) return {}; } - // TODO(jpienaar): Remove when we handle this more generally. - if (op.getType() != op.x().getType()) { - // If the types don't match then only fold if all the operands are in the TF - // dialect. - for (auto user : op.getOperation()->getUsers()) - if (user->getDialect() != op.getDialect()) return {}; - } - return op.x(); } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index cfe8db9025e..160bba94cfc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -3,8 +3,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { // CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<*xi32> { - // CHECK: %[[RESULT:.*]] = "tf.AddV2" - // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NOT: tf.Cast + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: return %[[RESULT]] : tensor<1xi32> %0 = "tf.Cast"(%arg0) : (tensor<1xi32>) -> tensor<*xi32> %1 = "tf.Cast"(%arg1) : (tensor<1xi32>) -> tensor<*xi32> @@ -60,8 +60,8 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @simple_folding func @simple_folding(%arg0: tensor<1x1x1x1xi32>, %arg1: tensor<1x1x1x1xf32>) -> tensor { -// CHECK: %[[SHAPE:.*]] = "tf.Shape" -// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] +// CHECK: %[[CST:.*]] = "tf.Const"{{.*}} {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[CST]] // CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> // CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> %0 = "tf.Shape"(%arg0) : (tensor<1x1x1x1xi32>) -> tensor<4xi32> @@ -300,6 +300,13 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor<*xi32> } + // CHECK-LABEL: func @fold_cast + func @fold_cast(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK-NOT: Cast + %0 = 
"tf.Cast"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) + return %0 : tensor<*xf32> + } + // CHECK-LABEL: func @while_variant // CHECK-SAME: -> tensor>> func @while_variant(%arg0: tensor>>) -> tensor { @@ -355,6 +362,8 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @partitioned_call_func_const func @partitioned_call_func_const(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: return %[[CONST]] return %arg0 : tensor<2xi32> } @@ -401,18 +410,4 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { %40 = "tf.Reshape"(%39, %19) {T = f32, Tshape = i32, device = ""} : (tensor<1x4x4x32xf32>, tensor<2xi32>) -> tensor return } - - // CHECK-LABEL: const_fold - func @const_fold() -> () { - // CHECK: tf.Const - // CHECK-SAME: () -> tensor<4xi32> - %0 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> - // CHECK: tf.Const - // CHECK-SAME: () -> tensor<4xi32> - %1 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> - // CHECK: tf.Add - // CHECK-SAME: (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> - return - } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 6a63e83be0f..5a2cae38062 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -430,7 +430,6 @@ LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, Attribute ComputeOutputComponent(const ValuePort& value_port, ValueQueryFn values) { LLVM_DEBUG(value_port.print(llvm::errs() << "\nComputing output for ")); - if (auto known = values(value_port)) return known; auto op = value_port.producer.dyn_cast(); if (!op) return nullptr; @@ -455,7 +454,6 @@ Attribute ComputeOutputComponent(const ValuePort& value_port, ValuePort op_port(op->getOperand(port[1])); return values(op_port); } - return nullptr; } @@ -477,11 +475,8 @@ class ShapeInference { } Attribute ComputeOutputComponent(const ValuePort& value_port) { - if (auto known_attr = results_[value_port]) return known_attr; - auto attr = ::mlir::TF::ComputeOutputComponent( + return ::mlir::TF::ComputeOutputComponent( value_port, [this](const ValuePort& port) { return results_[port]; }); - RecordValue(value_port, attr); - return attr; } // Returns ShapeHandle if the op result could be computed as shape. @@ -525,35 +520,19 @@ class ShapeInference { LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, int64_t max_iteration); - // Propagates any constant operand of call_op to the called function body's - // corresponding argument if the callee has only one use. - // - // TODO(b/154065712): Move this to a more general inter-procedural constant - // folding pass. - void PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module); - - // Propagates any constant return value of the callee function to the call - // op's corresponding result. - void PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module); - - // Tries to compute the result of folding the op. This doesn't actually - // perform constant folding, it is just computes the equivalent constants. 
- // Returns whether it was able to compute constant values. - LogicalResult TryToFold(Operation* op); - private: // Mapping between ValuePort (which corresponds to an OpResult or smaller, // e.g., first element of OpResult produded) to an Attribute if the ValuePort // corresponds to a constant value. ValuePortResultMap results_; int64_t graph_version_; + MLIRContext* context_; Dialect* tf_dialect_; }; ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context) : graph_version_(graph_version) { + context_ = context; tf_dialect_ = context->getRegisteredDialect(); } @@ -602,6 +581,7 @@ ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, auto ret = ComputeOutputComponent(front); if (!ret) continue; + RecordValue(front, ret); LLVM_DEBUG(ret.print(llvm::dbgs() << "\ncomputed result = ")); // If worklist is empty, then this is the root query op. @@ -706,14 +686,10 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { size_t index = it.index(); // If the operand is constant, then convert it to Tensor. - ValuePort vp(operand); - Attribute attr = ComputeOutputComponent(vp); - if (!attr && matchPattern(operand, m_Constant(&attr))) - RecordValue(vp, attr); - if (attr) { + ElementsAttr attr; + if (matchPattern(operand, m_Constant(&attr))) { tensorflow::Tensor* input_tensor = &tensors[index]; - auto status = - tensorflow::ConvertToTensor(attr.cast(), input_tensor); + auto status = tensorflow::ConvertToTensor(attr, input_tensor); if (status.ok()) { input_tensors[index] = input_tensor; } else { @@ -889,9 +865,13 @@ LogicalResult ShapeInference::PropagateShapeToFunctions( return success(all_succeeded); } -void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, - ModuleOp module) { +// If the callee has only one use, propagates any constant operand of call_op to +// the called function body's corresponding argument. +// +// TODO(b/154065712): Move this to a more general inter-procedural constant +// folding pass. +void PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); @@ -899,29 +879,31 @@ void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, Operation* op = call_op.getOperation(); if (num_uses == 1) { // If this is the only caller, and an operand is a constant, propagate - // the constant value inside the function. + // the constant inside the function. for (auto arg : func.getArguments()) { - auto operand = op->getOperand(arg.getArgNumber()); - if (auto known_constant = ComputeOutputComponent(ValuePort(operand))) - RecordValue(ValuePort(arg), known_constant); + auto operand = op->getOperand(arg.getArgNumber()).getDefiningOp(); + if (isa_and_nonnull(operand)) { + arg.replaceAllUsesWith(builder.clone(*operand)->getResult(0)); + } } } } -void ShapeInference::PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, - ModuleOp module) { +// Propagates any constant return value of the callee function to the call op's +// corresponding result. +void PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); - // If the return value is a constant, use the constant as the value of - // the call return. 
+ // If the return value is a constant, replace the call result with a constant. Operation* op = call_op.getOperation(); OpBuilder builder(op); builder.setInsertionPointAfter(op); for (auto retval : llvm::enumerate(func.front().getTerminator()->getOperands())) { - ValuePort vp(retval.value()); - if (auto known_constant = ComputeOutputComponent(vp)) { - RecordValue(ValuePort(op->getResult(retval.index())), known_constant); + auto retval_op = retval.value().getDefiningOp(); + if (isa_and_nonnull(retval_op)) { + op->getResult(retval.index()) + .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); } } } @@ -956,68 +938,10 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( return success(); } -LogicalResult ShapeInference::TryToFold(Operation* op) { - // If any output result is known, then the op probably has been computed - // before. - if (op->getNumResults() > 0 && results_[ValuePort(op->getResult(0))]) - return success(); - - SmallVector constant_operands(op->getNumOperands()); - SmallVector fold_results; - - // Check to see if any operands to the operation is constant and whether - // the operation knows how to constant fold itself. - bool some_unknown = false; - for (int i = 0, e = op->getNumOperands(); i != e; ++i) { - if (!(constant_operands[i] = - ComputeOutputComponent(ValuePort(op->getOperand(i))))) - some_unknown = true; - } - - // Attempt to constant fold the operation. - auto* abstract_op = op->getAbstractOperation(); - if (abstract_op) { - if (failed(abstract_op->foldHook(op, constant_operands, fold_results))) - return failure(); - } else { - Dialect* dialect = op->getDialect(); - if (!dialect) return failure(); - // Only attempt TF dialect fallback if there are no unknown operands. - if (some_unknown && dialect == tf_dialect_) return failure(); - SmallVector constants; - if (failed(dialect->constantFoldHook(op, constant_operands, constants))) - return failure(); - fold_results.assign(constants.begin(), constants.end()); - } - - for (auto result : zip(op->getResults(), fold_results)) { - auto fold_result = std::get<1>(result); - Attribute attr = nullptr; - if ((attr = fold_result.dyn_cast())) { - RecordValue(ValuePort(std::get<0>(result)), attr); - } else { - auto value = fold_result.get(); - if ((attr = ComputeOutputComponent(ValuePort(value)))) - RecordValue(ValuePort(std::get<0>(result)), attr); - } - - if (ElementsAttr eattr = attr.dyn_cast_or_null()) { - if (std::get<0>(result).getType() == eattr.getType()) continue; - - // Inserts a cast back to the original type if any user is not in the - // TF dialect. - Type old_type = std::get<0>(result).getType(); - std::get<0>(result).setType(eattr.getType()); - AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), tf_dialect_, - old_type); - } - } - - return success(); -} - LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, int64_t max_iteration) { + // An operation folder that is used to attempt folding before inference._ + OperationFolder folder(context_); bool changed = true; // TODO(aminim): we could have a more efficient traversal by guiding the @@ -1031,7 +955,9 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, region->walk([&](Operation* op) { if (auto infer_ti = dyn_cast(op)) { changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect_); - return; + // TODO(jpienaar): Debug why we can't just return here. We end up with + // additional constant due to the propagation of constant into attached + // function if we return already. 
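+      // Falling through here (instead of returning early) lets the op also
+      // reach the dialect check and the folding / shape inference handling
+      // below.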
} if (op->getDialect() != tf_dialect_) { @@ -1039,9 +965,8 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, return; } - // Before attempting inference, just try to compute the folded - // value/shape. - if (succeeded(TryToFold(op))) return; + // Before attempting inference, just try to fold the operation. + if (succeeded(folder.tryToFold(op))) return; // Best-effort shape inference in attached functions. Do not return // failure even if it doesn't get to fixed point. From 88f814346658016a4d820f40f67fbe69b65258cb Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 14 May 2020 03:14:24 +0000 Subject: [PATCH 0599/1533] Add test case for complex64/complex128 on tf.math.l2_normalize Signed-off-by: Yong Tang --- tensorflow/python/ops/nn_test.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 860bdc60387..3d756cb7e81 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -303,6 +303,19 @@ class L2NormalizeTest(test_lib.TestCase): print("L2Normalize gradient err = %g " % err) self.assertLess(err, 1e-4) + @test_util.run_in_graph_and_eager_modes + def testL2NormalizeComplex(self): + x_shape = [20, 7, 3] + for dtype in [np.complex64, np.complex128]: + np.random.seed(1) + x_np = (np.random.random_sample(x_shape).astype(dtype) + + np.random.random_sample(x_shape).astype(dtype) * 1j) + for dim in range(len(x_shape)): + y_np = self._l2Normalize(x_np, dim) + x_tf = constant_op.constant(x_np, name="x") + y_tf = nn_impl.l2_normalize_v2(x_tf, dim) + self.assertAllClose(y_np, self.evaluate(y_tf)) + class DropoutTest(test_lib.TestCase): From 8098b120097088423fe260d7633f4dfc9d882033 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 10:44:05 -0700 Subject: [PATCH 0600/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311562488 Change-Id: I7dd029345a87fd0c982a8bbedefc29df8a5fd563 --- tensorflow/go/op/wrappers.go | 80 +++++++++++++++++------------------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c6d67c9ad44..a6ee1a13b6e 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -4715,7 +4715,7 @@ type DenseCountSparseOutputAttr func(optionalAttr) // DenseCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: int32; minimum value to count. Can be set to -1 for no minimum. +// value: Minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4727,7 +4727,7 @@ func DenseCountSparseOutputMinlength(value int64) DenseCountSparseOutputAttr { // DenseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4742,20 +4742,20 @@ func DenseCountSparseOutputMaxlength(value int64) DenseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// values: int32 or int64; Tensor containing data to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. 
+// values: Tensor containing data to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. May +// also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. -func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: Indices tensor for the resulting sparse tensor object. +// output_values: Values tensor for the resulting sparse tensor object. +// output_dense_shape: Shape tensor for the resulting sparse tensor object. +func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_output bool, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} + attrs := map[string]interface{}{"binary_output": binary_output} for _, a := range optional { a(attrs) } @@ -8607,7 +8607,7 @@ type RaggedCountSparseOutputAttr func(optionalAttr) // RaggedCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: int32; minimum value to count. Can be set to -1 for no minimum. +// value: Minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8619,7 +8619,7 @@ func RaggedCountSparseOutputMinlength(value int64) RaggedCountSparseOutputAttr { // RaggedCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8634,33 +8634,27 @@ func RaggedCountSparseOutputMaxlength(value int64) RaggedCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// splits: int64; Tensor containing the row splits of the ragged tensor to count. -// values: int32 or int64; Tensor containing values of the sparse tensor to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// splits: Tensor containing the row splits of the ragged tensor to count. +// values: Tensor containing values of the sparse tensor to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. +// May also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. 
-// END -// } -// out_arg { -// name: "output_dense_shape" -// description: <= -1 @@ -13718,7 +13712,7 @@ func SparseCountSparseOutputMinlength(value int64) SparseCountSparseOutputAttr { // SparseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -13733,22 +13727,22 @@ func SparseCountSparseOutputMaxlength(value int64) SparseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// indices: int64; Tensor containing the indices of the sparse tensor to count. -// values: int32 or int64; Tensor containing values of the sparse tensor to count. -// dense_shape: int64; Tensor containing the dense shape of the sparse tensor to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// indices: Tensor containing the indices of the sparse tensor to count. +// values: Tensor containing values of the sparse tensor to count. +// dense_shape: Tensor containing the dense shape of the sparse tensor to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. +// May also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. -func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: Indices tensor for the resulting sparse tensor object. +// output_values: Values tensor for the resulting sparse tensor object. +// output_dense_shape: Shape tensor for the resulting sparse tensor object. +func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_output bool, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} + attrs := map[string]interface{}{"binary_output": binary_output} for _, a := range optional { a(attrs) } From 0d94bc6d71f89a380e0b57967f6c78d59f5785f1 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 14 May 2020 11:02:42 -0700 Subject: [PATCH 0601/1533] Fix style in `op_hint.py` to match formatting from Copybara. 
No functional changes PiperOrigin-RevId: 311566454 Change-Id: Ic4f002df42168bdb8841b80a93ebf22a8e7fa4bd --- tensorflow/lite/python/op_hint.py | 62 ++++++++++++++++++------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py index 159fcaa2bf3..9d62c1b8a97 100644 --- a/tensorflow/lite/python/op_hint.py +++ b/tensorflow/lite/python/op_hint.py @@ -435,6 +435,7 @@ class OpHint(object): Args: *args: List of inputs to be converted (should be Tf.Tensor). **kwargs: This allows 'names' which should be a list of names. + Returns: Wrapped inputs (identity standins that have additional metadata). These are also are also tf.Tensor's. @@ -453,6 +454,7 @@ class OpHint(object): Args: *args: List of outputs to be converted (should be tf.Tensor). **kwargs: See + Returns: Wrapped outputs (identity standins that have additional metadata). These are also tf.Tensor's. @@ -574,8 +576,8 @@ class _LiteAggregateOperand(_LiteOperand): elif self.aggregation == OpHint.AGGREGATE_STACK: pass else: - raise ValueError( - "Invalid aggregation type %r specified" % self.aggregation) + raise ValueError("Invalid aggregation type %r specified" % + self.aggregation) return self.flattened def flatten(self): @@ -646,8 +648,8 @@ class _LiteAggregateOperand(_LiteOperand): stack_node.attr["num"].i = len(flattened) output_type = flattened[0].attr["T"].type stack_node.attr["T"].type = output_type - stack_node.input.append(_tensorflow_output_name( - fused_op_name, output_index)) + stack_node.input.append( + _tensorflow_output_name(fused_op_name, output_index)) out_graphdef.node.extend([stack_node]) for idx, discrete in enumerate(flattened): @@ -675,11 +677,10 @@ class _LiteFuncCall(object): inputs: inputs to the op (hash from index # to argument) outputs: outputs to the op (hash from index # to argument) function_name: the tflite custom op name to use - uuid: a unique call id for this particular call (i.e. - multiple function calls would have the same function_name but different - uuids. - params: A param name to key value for op constant data. I.e. for - axis on a reduction, strides on a convolution, etc. + uuid: a unique call id for this particular call (i.e. multiple function + calls would have the same function_name but different uuids. + params: A param name to key value for op constant data. I.e. for axis on a + reduction, strides on a convolution, etc. level: Level of the OpHint. children_inputs_mappings: If the Ophint has children, children inputs mappings indicate how their inputs & outputs are mapped. @@ -700,6 +701,7 @@ class _LiteFuncCall(object): Returns: Tuple of (inputs, outputs). where input and output i a list of names. 
""" + def _flatten(input_or_output_dict): flattened_items = [] for item in input_or_output_dict.values(): @@ -709,6 +711,7 @@ class _LiteFuncCall(object): return _flatten(self.inputs), _flatten(self.outputs) def __str__(self): + def format_args(items): s = "" for idx, item in items.iteritems(): @@ -739,8 +742,8 @@ def _find_all_hints_in_nodes(nodes): for node in nodes: attr = node.attr # This is an op hint if it has a FUNCTION_UUID_ATTR, otherwise skip - if (OpHint.FUNCTION_UUID_ATTR not in attr - or not attr[OpHint.FUNCTION_UUID_ATTR].s): + if (OpHint.FUNCTION_UUID_ATTR not in attr or + not attr[OpHint.FUNCTION_UUID_ATTR].s): continue uuid = attr[OpHint.FUNCTION_UUID_ATTR].s @@ -751,9 +754,11 @@ def _find_all_hints_in_nodes(nodes): call_def.level = attr[OpHint.FUNCTION_LEVEL_ATTR].i # Get sorting and aggregation information - sort = (attr[OpHint.FUNCTION_SORT_INDEX_ATTR].i - if OpHint.FUNCTION_SORT_INDEX_ATTR in attr else None) - if sort == -1: sort = None + sort = ( + attr[OpHint.FUNCTION_SORT_INDEX_ATTR].i + if OpHint.FUNCTION_SORT_INDEX_ATTR in attr else None) + if sort == -1: + sort = None aggregation = None if OpHint.FUNCTION_AGGREGATE_ATTR in attr: aggregation = _compat.as_text(attr[OpHint.FUNCTION_AGGREGATE_ATTR].s) @@ -887,6 +892,7 @@ def _tensor_name_base(full_tensor_name): Args: full_tensor_name: A tensor name that is annotated with a device placement (this is what tensor flow introspection gives). + Returns: A name without any device assignment. """ @@ -919,10 +925,10 @@ def _check_subgraph_closed(n, reachable_by_input, input_nodes_set, while next_to_visit: current_node = next_to_visit.pop() visited.add(current_node) - if (current_node in reachable_by_input - and current_node not in input_nodes_set): - raise TypeError( - "Node %s uses input %s not in input_nodes." % (n, current_node)) + if (current_node in reachable_by_input and + current_node not in input_nodes_set): + raise TypeError("Node %s uses input %s not in input_nodes." % + (n, current_node)) if current_node not in input_nodes_set: next_to_visit += [ input_node for input_node in name_to_input_name[current_node] @@ -1066,6 +1072,7 @@ def _remove_one_redundant_stack_unstack(in_graph_def): Args: in_graph_def: Graph def to use as input. + Returns: Simplified tuple (graph_def, changed_something) where changed_something is true if anything was done. 
@@ -1101,15 +1108,15 @@ def _remove_one_redundant_stack_unstack(in_graph_def): node = name_to_node[current_node_name] is_op_hint_stack = node.name.startswith("OpHintStack") is_op_hint_unstack = node.name.startswith("OpHintUnstack") - if (node.op == "Identity" or is_op_hint_stack - or (do_generic_pack_unpack and node.op == "Pack")): + if (node.op == "Identity" or is_op_hint_stack or + (do_generic_pack_unpack and node.op == "Pack")): is_hint_created_stack |= is_op_hint_stack next_to_visit += [ input_node for input_node in name_to_input_name[current_node_name] if input_node not in visited ] - elif (is_op_hint_unstack - or (do_generic_pack_unpack and node.op == "Unpack")): + elif (is_op_hint_unstack or + (do_generic_pack_unpack and node.op == "Unpack")): unpack_nodes.add(node.name) is_hint_created_stack &= is_op_hint_unstack else: @@ -1124,7 +1131,8 @@ def _remove_one_redundant_stack_unstack(in_graph_def): # Unstacked form no_external_dependency = True for other_n in in_graph_def.node: - if other_n.name in visited: continue + if other_n.name in visited: + continue for input_tensor in name_to_input_name[other_n.name]: input_op = _tensor_name_base(input_tensor) if input_op in visited and input_op != pack_node: @@ -1141,9 +1149,9 @@ def _remove_one_redundant_stack_unstack(in_graph_def): if node_name not in visited: new_node = _copy.deepcopy(other_n) new_node.input[:] = [ - (end_input if stripped == pack_node else - non_stripped) for stripped, non_stripped in zip( - name_to_input_name[node_name], new_node.input[:]) + (end_input if stripped == pack_node else non_stripped) + for stripped, non_stripped in zip(name_to_input_name[node_name], + new_node.input[:]) ] out.node.extend([new_node]) return out, True @@ -1177,6 +1185,7 @@ def _convert_op_hints_to_stubs_helper( graph_def: A graph def that we should convert. write_callback: A function pointer that can be used to write intermediate steps of graph transformation (optional). + Returns: A new stubbed graph_def. """ @@ -1306,6 +1315,7 @@ def convert_op_hints_to_stubs(session=None, graph_def: A graph def that we should convert. write_callback: A function pointer that can be used to write intermediate steps of graph transformation (optional). + Returns: A new graphdef with all ops contained in OpHints being replaced by a single op call with the right parameters. From 5a4e21c1c6e97aa9e1f31a5fe4ac763bd5b57381 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Thu, 14 May 2020 11:21:19 -0700 Subject: [PATCH 0602/1533] Add test for PromoteVarHandlesToArgs pass for testing users of tf.VarHandleOps (NFC). 
PiperOrigin-RevId: 311570365 Change-Id: I65d0d98b43e4d4b15fa3e798dfd4b58fccb40ec9 --- .../tests/promote_var_handles_to_args.mlir | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir index 5e53a457ecb..8b8a070cfab 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir @@ -44,3 +44,16 @@ func @duplicate_vars() { %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> return } + +// CHECK-LABEL: func @duplicate_vars_with_users +// CHECK-SAME: (%arg0: tensor, %arg1: tensor>> {tf.resource_name = "x"}) +// CHECK: "tf.ReadVariableOp"(%arg1) +// CHECK: "tf.AssignAddVariableOp"(%arg1, %arg0) +// CHECK-NOT: "tf.VarHandleOp" +func @duplicate_vars_with_users(%arg0: tensor) { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor + %2 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + "tf.AssignAddVariableOp"(%2, %arg0) : (tensor>>, tensor) -> () + return +} From ba43780830f09da72081fe5061c436f1c6203a92 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 14 May 2020 12:13:06 -0700 Subject: [PATCH 0603/1533] Generate MLIR ops for TPU Host/Device communication for outside compilation. These ops are needed for communicating dependencies(data or control flow) between TPU device calculations and outside compiled computations run on host. PiperOrigin-RevId: 311580827 Change-Id: Ia82623ae2a3535b829691952063724cfaedf22bb --- .../mlir/tensorflow/ir/tf_generated_ops.td | 103 ++++++++++++++++++ tensorflow/core/ops/tpu_host_compute_ops.cc | 6 +- 2 files changed, 105 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 64ea0732e8c..aa1601c4032 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -10329,6 +10329,33 @@ https://www.tensorflow.org/xla/operation_semantics#gather TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaHostComputeOp : TF_Op<"XlaHostCompute", []> { + let summary = [{ +A pseudo-op to represent host-side computation in an XLA program. 
+ }]; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + + StrArrayAttr:$ancestors, + TF_ShapeAttrArray:$shapes, + SymbolRefAttr:$shape_inference_graph, + StrAttr:$key, + DefaultValuedAttr:$cost_estimate_ns, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + def TF_XlaKeyValueSortOp : TF_Op<"XlaKeyValueSort", [NoSideEffect]> { let summary = "Wraps the XLA Sort operator, documented at"; @@ -10377,6 +10404,24 @@ https://www.tensorflow.org/performance/xla/operation_semantics#pad TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaRecvFromHostOp : TF_Op<"XlaRecvFromHost", []> { + let summary = "An op to receive a tensor from the host."; + + let description = [{ + }]; + + let arguments = (ins + TF_ShapeAttr:$shape, + StrAttr:$key + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedResultTypeAttr Toutput = TF_DerivedResultTypeAttr<0>; +} + def TF_XlaReduceOp : TF_Op<"XlaReduce", [NoSideEffect]> { let summary = "Wraps the XLA Reduce operator, documented at"; @@ -10441,6 +10486,23 @@ i=0...N-1. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaSendToHostOp : TF_Op<"XlaSendToHost", []> { + let summary = "An op to send a tensor to the host."; + + let description = [{ + }]; + + let arguments = (ins + TF_Tensor:$input, + + StrAttr:$key + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tinput = TF_DerivedOperandTypeAttr<0>; +} + def TF_XlaSvdOp : TF_Op<"XlaSvd", [NoSideEffect]> { let summary = [{ Computes the eigen decomposition of a batch of self-adjoint matrices @@ -10582,3 +10644,44 @@ used to look up the program in the compilation cache. TF_DerivedResultSizeAttr num_computations = TF_DerivedResultSizeAttr<1>; TF_DerivedOperandSizeAttr NumDynamicShapes = TF_DerivedOperandSizeAttr<0>; } + +def TF__XlaRecvAtHostOp : TF_Op<"_XlaRecvAtHost", []> { + let summary = [{ +A placeholder op to receive values from a running XLA computation. + }]; + + let description = [{ + }]; + + let arguments = (ins + TF_StrTensor:$dynamic_key, + + StrAttr:$key, + I64Attr:$device_ordinal + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + +def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", []> { + let summary = "A placeholder op to send values to a running XLA computation."; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + TF_StrTensor:$dynamic_key, + + StrAttr:$key, + I64Attr:$device_ordinal + ); + + let results = (outs); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; +} diff --git a/tensorflow/core/ops/tpu_host_compute_ops.cc b/tensorflow/core/ops/tpu_host_compute_ops.cc index 48aeb81ac13..753cc0015d9 100644 --- a/tensorflow/core/ops/tpu_host_compute_ops.cc +++ b/tensorflow/core/ops/tpu_host_compute_ops.cc @@ -28,8 +28,7 @@ REGISTER_OP("_XlaSendFromHost") .SetIsStateful() .SetShapeFn(::tensorflow::shape_inference::NoOutputs) .Doc(R"doc( -A placeholder op for multiple values that will be sent from TensorFlow to a -running XLA computation. +A placeholder op to send values to a running XLA computation. inputs: A list of tensors that will be sent to the XLA computation. 
dynamic_key: The key sent at runtime by the compile node to identify which @@ -49,8 +48,7 @@ REGISTER_OP("_XlaRecvAtHost") .SetIsStateful() .SetShapeFn(::tensorflow::shape_inference::UnknownShape) .Doc(R"doc( -A placeholder op for multiple values that will be sent to TensorFlow from a -running XLA computation. +A placeholder op to receive values from a running XLA computation. dynamic_key: The key sent at runtime by the compile node to identify which execution the transfer corresponds to. From 215616fddc5731023739da5ab1ebb51cadfc452e Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 14 May 2020 12:35:12 -0700 Subject: [PATCH 0604/1533] Add support for setting up a TF_OutputList from the client and use it to build function with multiple results PiperOrigin-RevId: 311585364 Change-Id: I5245fd0f5e5c0e8e7e22350d970c508e0154d59b --- .../c/eager/c_api_unified_experimental.cc | 4 + .../c/eager/c_api_unified_experimental.h | 18 ++- .../eager/c_api_unified_experimental_graph.cc | 4 + .../eager/c_api_unified_experimental_test.cc | 147 +++++++++++++++++- 4 files changed, 164 insertions(+), 9 deletions(-) diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index d29c457798e..e5030a602b3 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -127,6 +127,10 @@ int TF_OutputListNumOutputs(TF_OutputList* o) { TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i) { return wrap(unwrap(o)->outputs[i]); } +void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status* s) { + unwrap(o)->outputs.push_back(unwrap(tensor)); +} void TF_AbstractOpSetOpType(TF_AbstractOp* op, const char* const op_type, TF_Status* s) { diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h index 512717caa34..86c59a7f625 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.h +++ b/tensorflow/c/eager/c_api_unified_experimental.h @@ -88,19 +88,21 @@ void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name, void TF_DeleteAbstractTensor(TF_AbstractTensor*); // TF_OutputList holds the list of TF_AbstractTensor that results from executing -// an operation. -// It just lets us not specify the number of outputs of an operation -// beforehand. This forces a memory allocation in the runtime, which is bad, but -// it allows for generic code. -// TODO(aminim): the description above isn't clear with respect to -// TF_OutputListNumOutputs and the current eager implementation which requires -// the number of outputs to be set by the client. +// an operation, or provided to create a function. +// When executing an operation in an eager context, the expected number of +// outputs must be set beforehand with `TF_OutputListSetNumOutputs`. typedef struct TF_OutputList TF_OutputList; TF_OutputList* TF_NewOutputList(); void TF_DeleteOutputList(TF_OutputList* o); -void TF_OutputListSetNumOutputs(TF_OutputList* o, int, TF_Status*); +// Prepare tracing to the expected number of output for an operation. +void TF_OutputListSetNumOutputs(TF_OutputList* o, int num_outputs, TF_Status*); +// Return the number of outputs in the list. int TF_OutputListNumOutputs(TF_OutputList* o); +// Return the `i`th output in the list. TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i); +// Append a tensor at the end of the output list, growing its size by one. 
+void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status*); // TF_ExecuteOperation will, if in eager mode, execute, if in graph mode, maybe // capture some inputs and then add a node in the graph. The output tensors are diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index e38332e3e8e..dd5a95b3526 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -139,6 +139,10 @@ class GraphContext : public ExecutionContext { return; } auto* tf_opdesc = graph_op->op_.release(); + if (tf_opdesc == nullptr) { + TF_SetStatus(s, TF_INVALID_ARGUMENT, "AbstractOp is incomplete."); + return; + } for (int i = 0; i < num_inputs; ++i) { auto* graph_tensor = dyncast(inputs[i]); if (!graph_tensor) { diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 9f56c8aa579..9776b4d13ed 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -169,7 +169,152 @@ TEST_P(UnifiedCAPI, TestBasicGraph) { TF_DeleteExecutionContext(eager_execution_ctx); } -TEST_P(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { +TEST_P(UnifiedCAPI, TestMultiOutputGraph) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Status* s = status.get(); + + // Start a new function / execution context. + string fn_name = "two_adds"; + TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + auto* arg0 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + auto* arg1 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Create a first "Add" computing `arg0 + arg1`. + TF_AbstractTensor* add_output1; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add1", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg0, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + // Extract the resulting tensor. + add_output1 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Same with a second "Add" computing `arg1 + arg1`. + TF_AbstractTensor* add_output2; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add2", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg1, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + // Extract the resulting tensor. 
+ add_output2 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Finalize the function by providing the returned values. + TF_AbstractFunction* func; + { + // We want to return the output of both add operations, create a new list + // and populate it. + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListPushBack(func_outputs, add_output1, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_OutputListPushBack(func_outputs, add_output2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + func = TF_FinalizeFunction(graph_ctx, func_outputs, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteOutputList(func_outputs); + } + + /** + * We traced so far this function: + * + * def two_adds(a, b): + * my_add1 = a + b + * my_add2 = b + b + * return my_add1, my_add2 + * + * Now we will execute this function with an eager context: + * + * output1, output2 = two_adds(2.0, 3.0) + * + * and check that we got 5.0 and 6.0 as results. + */ + + // Build eager context. + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* eager_execution_ctx = + TF_NewEagerExecutionContext(opts, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TFE_DeleteContextOptions(opts); + + TF_ExecutionContextRegisterFunction(eager_execution_ctx, func, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build the abstract op to run the function. + TF_AbstractOp* fn_op = TF_NewAbstractOp(eager_execution_ctx); + TF_AbstractOpSetOpType(fn_op, fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build two abstract input tensors as function arguments. + std::vector func_args; + { + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(eager_execution_ctx); + TFE_TensorHandle* input_eager = TestScalarTensorHandle(eager_ctx, 2.0f); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + input_eager = TestScalarTensorHandle(eager_ctx, 3.0f); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + } + + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(func_outputs, 2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_ExecuteOperation(fn_op, func_args.size(), func_args.data(), func_outputs, + eager_execution_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(fn_op); + for (TF_AbstractTensor* t : func_args) TF_DeleteAbstractTensor(t); + + ASSERT_EQ(2, TF_OutputListNumOutputs(func_outputs)); + float results[2]; + for (int idx = 0; idx < 2; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TFE_TensorHandle* handle = TF_AbstractTensorGetEagerTensor(result, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_Tensor* f_t = TFE_TensorHandleResolve(handle, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + results[idx] = *static_cast(TF_TensorData(f_t)); + TF_DeleteTensor(f_t); + } + ASSERT_EQ(results[0], 5.0); + ASSERT_EQ(results[1], 6.0); + + for (int idx = 0; idx < 2; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TF_DeleteAbstractTensor(result); + } + TF_DeleteOutputList(func_outputs); + TF_DeleteExecutionContext(eager_execution_ctx); + TF_DeleteAbstractFunction(func); +} + +TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); 
TFE_ContextOptions* opts = TFE_NewContextOptions(); From f4a49c6871a36444a0a28e9b127ab052efc1f9ca Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Thu, 14 May 2020 12:40:01 -0700 Subject: [PATCH 0605/1533] Set sparse FullyConnected op version properly. PiperOrigin-RevId: 311586496 Change-Id: Ieb57857388bbb25de02163b9a6594dd02666b867 --- tensorflow/lite/toco/tflite/operator.cc | 1 + tensorflow/lite/tools/versioning/op_version.cc | 10 ++++++++++ tensorflow/lite/tools/versioning/op_version.h | 3 +++ tensorflow/lite/tools/versioning/op_version_test.cc | 9 +++++++++ 4 files changed, 23 insertions(+) diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc index 917fd24c952..fee10a19787 100644 --- a/tensorflow/lite/toco/tflite/operator.cc +++ b/tensorflow/lite/toco/tflite/operator.cc @@ -487,6 +487,7 @@ class FullyConnected op_sig.options.fully_connected.keep_num_dims = fc_op.keep_num_dims; op_sig.options.fully_connected.weights_format = GetWeightFormat(fc_op.weights_format); + op_sig.options.fully_connected.sparse_weight = false; return ::tflite::GetBuiltinOperatorVersion(op_sig); } }; diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index 9022afca629..118e2d420f8 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -121,6 +121,11 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { // | Quantized Int8 | 4 | 4 | // +-----------------+--------------------+--------------------------+ + // FullyConnected with sparse weight is supported at version 8. + if (op_sig.options.fully_connected.sparse_weight) { + return 8; + } + // Int16 fully fixed point kernel is at version 7. if (op_sig.input_types.at(0) == TensorType_INT16 && op_sig.input_types.at(1) == TensorType_INT16 && @@ -578,6 +583,11 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, op_sig.options.fully_connected.weights_format = fully_connected_option->weights_format(); } + + const Tensor* weight_tensor = + subgraph->tensors()->Get(op->inputs()->Get(1)); + op_sig.options.fully_connected.sparse_weight = + (weight_tensor->sparsity() != nullptr); } break; case BuiltinOperator_MUL: { diff --git a/tensorflow/lite/tools/versioning/op_version.h b/tensorflow/lite/tools/versioning/op_version.h index 4b0fe8836e2..df74ffaf6dd 100644 --- a/tensorflow/lite/tools/versioning/op_version.h +++ b/tensorflow/lite/tools/versioning/op_version.h @@ -37,6 +37,9 @@ typedef struct { struct { bool keep_num_dims; FullyConnectedOptionsWeightsFormat weights_format; + // TODO(b/156530611): Make this global when more ops support sparse + // computation. 
+ bool sparse_weight; } fully_connected; struct { float input1_scale; diff --git a/tensorflow/lite/tools/versioning/op_version_test.cc b/tensorflow/lite/tools/versioning/op_version_test.cc index f0d8259d764..4017fc3bff0 100644 --- a/tensorflow/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/lite/tools/versioning/op_version_test.cc @@ -352,6 +352,15 @@ TEST(OpVersionTest, VersioningFullyConnectedTest) { fake_op_sig.options.fully_connected = { false, FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8}; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 6); + + fake_op_sig = { + .op = BuiltinOperator_FULLY_CONNECTED, + .input_types = std::vector{TensorType_INT8, TensorType_INT8}, + .output_types = std::vector{TensorType_INT8}, + }; + fake_op_sig.options.fully_connected = { + false, FullyConnectedOptionsWeightsFormat_DEFAULT, true}; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 8); } TEST(OpVersionTest, VersioningDequantizeTest) { From 45d18ddb7ee181d5f847c64558ad72d63e9db609 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 13:22:03 -0700 Subject: [PATCH 0606/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311594498 Change-Id: I8a91e5e8f8418d44ece61b1c52c76892ff949d0b --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a6ee1a13b6e..e6725269279 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12053,7 +12053,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12064,7 +12064,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18969,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21627,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22335,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22600,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22715,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25648,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25962,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45534,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47474,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47545,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48534,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 66769844a5c58e8a25352d8a16ff40b04f6c523e Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Thu, 14 May 2020 13:27:42 -0700 Subject: [PATCH 0607/1533] [XLA:CPU] Allow C64 and C128 types in Sort(). These seem to have been omitted mostly as an oversight; the logic in Sort() doesn't seem to be data-type specific. 
PiperOrigin-RevId: 311595522 Change-Id: I6264bbe6556a0823e8a88e2025c4886182aad6bf --- .../compiler/xla/service/cpu/ir_emitter.cc | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index f516a1538d3..5a4c6250293 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include + #include #include #include @@ -570,25 +571,9 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) { TF_RETURN_IF_ERROR(EmitTargetAddressForOp(sort)); Shape keys_shape = sort->keys()->shape(); PrimitiveType keys_type = keys_shape.element_type(); - switch (keys_type) { - case PRED: - case S8: - case U8: - case S16: - case U16: - case BF16: - case F16: - case S32: - case U32: - case F32: - case S64: - case U64: - case F64: - break; - default: - return Unimplemented( - "Element type %s not supported in the Sort op on CPU.", - PrimitiveType_Name(keys_type)); + if (!primitive_util::IsArrayType(keys_type)) { + return Unimplemented("Element type %s not supported in the Sort op on CPU.", + PrimitiveType_Name(keys_type)); } std::vector destination_addresses(sort->operand_count()); for (int64 i = 0; i < sort->operand_count(); ++i) { From 6db3caf99be91664813ae621e62c3287e2af44d3 Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Thu, 14 May 2020 13:28:33 -0700 Subject: [PATCH 0608/1533] Update gather_op_test and unique_op_test to use subTest for easier debugging. PiperOrigin-RevId: 311595699 Change-Id: I1a8cf8b5b314aada4aeeece2603e975bc8a4ff42 --- .../python/kernel_tests/gather_op_test.py | 213 +++++++++--------- .../python/kernel_tests/unique_op_test.py | 59 ++--- 2 files changed, 142 insertions(+), 130 deletions(-) diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index 953f18bb07a..b966110963c 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -62,14 +62,15 @@ class GatherTest(test.TestCase, parameterized.TestCase): data = np.array([0, 1, 2, 3, 7, 5]) for dtype in _TEST_TYPES: for indices in 4, [1, 2, 2, 4, 5]: - params_np = self._buildParams(data, dtype) - params = constant_op.constant(params_np) - indices_tf = constant_op.constant(indices) - gather_t = array_ops.gather(params, indices_tf) - gather_val = self.evaluate(gather_t) - np_val = params_np[indices] - self.assertAllEqual(np_val, gather_val) - self.assertEqual(np_val.shape, gather_t.get_shape()) + with self.subTest(dtype=dtype, indices=indices): + params_np = self._buildParams(data, dtype) + params = constant_op.constant(params_np) + indices_tf = constant_op.constant(indices) + gather_t = array_ops.gather(params, indices_tf) + gather_val = self.evaluate(gather_t) + np_val = params_np[indices] + self.assertAllEqual(np_val, gather_val) + self.assertEqual(np_val.shape, gather_t.get_shape()) def testScalar2D(self): with self.session(use_gpu=True): @@ -77,14 +78,15 @@ class GatherTest(test.TestCase, parameterized.TestCase): [9, 10, 11], [12, 13, 14]]) for dtype in _TEST_TYPES: for axis in range(data.ndim): - params_np = self._buildParams(data, dtype) - params = constant_op.constant(params_np) - indices = constant_op.constant(2) - gather_t = array_ops.gather(params, indices, axis=axis) - gather_val = self.evaluate(gather_t) - self.assertAllEqual(np.take(params_np, 2, axis=axis), gather_val) 
- expected_shape = data.shape[:axis] + data.shape[axis + 1:] - self.assertEqual(expected_shape, gather_t.get_shape()) + with self.subTest(dtype=dtype, axis=axis): + params_np = self._buildParams(data, dtype) + params = constant_op.constant(params_np) + indices = constant_op.constant(2) + gather_t = array_ops.gather(params, indices, axis=axis) + gather_val = self.evaluate(gather_t) + self.assertAllEqual(np.take(params_np, 2, axis=axis), gather_val) + expected_shape = data.shape[:axis] + data.shape[axis + 1:] + self.assertEqual(expected_shape, gather_t.get_shape()) def testSimpleTwoD32(self): with self.session(use_gpu=True): @@ -92,16 +94,17 @@ class GatherTest(test.TestCase, parameterized.TestCase): [9, 10, 11], [12, 13, 14]]) for dtype in _TEST_TYPES: for axis in range(data.ndim): - params_np = self._buildParams(data, dtype) - params = constant_op.constant(params_np) - # The indices must be in bounds for any axis. - indices = constant_op.constant([0, 1, 0, 2]) - gather_t = array_ops.gather(params, indices, axis=axis) - gather_val = self.evaluate(gather_t) - self.assertAllEqual(np.take(params_np, [0, 1, 0, 2], axis=axis), - gather_val) - expected_shape = data.shape[:axis] + (4,) + data.shape[axis + 1:] - self.assertEqual(expected_shape, gather_t.get_shape()) + with self.subTest(dtype=dtype, axis=axis): + params_np = self._buildParams(data, dtype) + params = constant_op.constant(params_np) + # The indices must be in bounds for any axis. + indices = constant_op.constant([0, 1, 0, 2]) + gather_t = array_ops.gather(params, indices, axis=axis) + gather_val = self.evaluate(gather_t) + self.assertAllEqual(np.take(params_np, [0, 1, 0, 2], axis=axis), + gather_val) + expected_shape = data.shape[:axis] + (4,) + data.shape[axis + 1:] + self.assertEqual(expected_shape, gather_t.get_shape()) @test_util.run_deprecated_v1 def testHigherRank(self): @@ -112,58 +115,60 @@ class GatherTest(test.TestCase, parameterized.TestCase): for axis in range(len(shape)): params = self._buildParams(np.random.randn(*shape), dtype) indices = np.random.randint(shape[axis], size=indices_shape) - with self.cached_session(use_gpu=True) as sess: - tf_params = constant_op.constant(params) - tf_indices = constant_op.constant(indices) - # Check that both positive and negative indices for axis work. - tf_axis = constant_op.constant(axis) - tf_negative_axis = constant_op.constant(-len(shape) + axis) - gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis) - gather_negative_axis = array_ops.gather( - tf_params, tf_indices, axis=tf_negative_axis) - gather_value, gather_negative_axis_value = sess.run( - [gather, gather_negative_axis]) - gather_np = np.take(params, indices, axis) - self.assertAllEqual(gather_np, gather_value) - self.assertAllEqual(gather_np, gather_negative_axis_value) - expected_shape = (params.shape[:axis] + indices.shape + - params.shape[axis + 1:]) - self.assertEqual(expected_shape, gather.shape) - self.assertEqual(expected_shape, gather_negative_axis.shape) + with self.subTest(indices_shape=indices_shape, dtype=dtype, axis=axis, + indices=indices): + with self.cached_session(use_gpu=True) as sess: + tf_params = constant_op.constant(params) + tf_indices = constant_op.constant(indices) + # Check that both positive and negative indices for axis work. 
+ tf_axis = constant_op.constant(axis) + tf_negative_axis = constant_op.constant(-len(shape) + axis) + gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis) + gather_negative_axis = array_ops.gather( + tf_params, tf_indices, axis=tf_negative_axis) + gather_value, gather_negative_axis_value = sess.run( + [gather, gather_negative_axis]) + gather_np = np.take(params, indices, axis) + self.assertAllEqual(gather_np, gather_value) + self.assertAllEqual(gather_np, gather_negative_axis_value) + expected_shape = (params.shape[:axis] + indices.shape + + params.shape[axis + 1:]) + self.assertEqual(expected_shape, gather.shape) + self.assertEqual(expected_shape, gather_negative_axis.shape) - # Test gradients - gather_grad = np.random.randn( - *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype) - if dtype.is_complex: - gather_grad -= 1j * gather_grad - params_grad, indices_grad, axis_grad = gradients_impl.gradients( - gather, [tf_params, tf_indices, tf_axis], gather_grad) - self.assertEqual(indices_grad, None) - self.assertEqual(axis_grad, None) - if dtype.is_integer: - self.assertEqual(params_grad, None) - continue - # For axis 0, we are able to create an efficient IndexedSlices for - # the gradient. - if axis == 0: - self.assertEqual(type(params_grad), ops.IndexedSlices) - params_grad = ops.convert_to_tensor(params_grad) - correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype) - outer_dims = axis - inner_dims = len(shape) - axis - 1 - gather_grad = gather_grad.reshape( - shape[:axis] + (indices.size,) + shape[axis + 1:]) - for source_index, dest_index in enumerate(indices.flat): - dest_slice = ((slice(None),) * outer_dims + (dest_index,) + - (slice(None),) * inner_dims) - source_slice = ((slice(None),) * outer_dims + (source_index,) + + # Test gradients + gather_grad = np.random.randn( + *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype) + if dtype.is_complex: + gather_grad -= 1j * gather_grad + params_grad, indices_grad, axis_grad = gradients_impl.gradients( + gather, [tf_params, tf_indices, tf_axis], gather_grad) + self.assertEqual(indices_grad, None) + self.assertEqual(axis_grad, None) + if dtype.is_integer: + self.assertEqual(params_grad, None) + continue + # For axis 0, we are able to create an efficient IndexedSlices for + # the gradient. 
+ if axis == 0: + self.assertEqual(type(params_grad), ops.IndexedSlices) + params_grad = ops.convert_to_tensor(params_grad) + correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype) + outer_dims = axis + inner_dims = len(shape) - axis - 1 + gather_grad = gather_grad.reshape( + shape[:axis] + (indices.size,) + shape[axis + 1:]) + for source_index, dest_index in enumerate(indices.flat): + dest_slice = ((slice(None),) * outer_dims + (dest_index,) + (slice(None),) * inner_dims) - correct_params_grad[dest_slice] += gather_grad[source_slice] - self.assertAllClose( - correct_params_grad, - self.evaluate(params_grad), - atol=2e-6, - rtol=2e-6) + source_slice = ((slice(None),) * outer_dims + (source_index,) + + (slice(None),) * inner_dims) + correct_params_grad[dest_slice] += gather_grad[source_slice] + self.assertAllClose( + correct_params_grad, + self.evaluate(params_grad), + atol=2e-6, + rtol=2e-6) @test_util.run_deprecated_v1 def testString(self): @@ -177,12 +182,14 @@ class GatherTest(test.TestCase, parameterized.TestCase): @test_util.run_deprecated_v1 def testUInt32AndUInt64(self): for unsigned_type in (dtypes.uint32, dtypes.uint64): - params = self._buildParams( - np.array([[1, 2, 3], [7, 8, 9]]), unsigned_type) - with self.cached_session(): - self.assertAllEqual([7, 8, 9], - array_ops.gather(params, 1, axis=0).eval()) - self.assertAllEqual([1, 7], array_ops.gather(params, 0, axis=1).eval()) + with self.subTest(unsigned_type=unsigned_type): + params = self._buildParams( + np.array([[1, 2, 3], [7, 8, 9]]), unsigned_type) + with self.cached_session(): + self.assertAllEqual([7, 8, 9], + array_ops.gather(params, 1, axis=0).eval()) + self.assertAllEqual([1, 7], + array_ops.gather(params, 0, axis=1).eval()) @test_util.run_deprecated_v1 def testUnknownIndices(self): @@ -237,14 +244,15 @@ class GatherTest(test.TestCase, parameterized.TestCase): indices = 0 for bad_axis in (1, 2, -2): # Shape inference can validate axis for known params rank. - with self.assertRaisesWithPredicateMatch( - ValueError, "Shape must be at least rank . but is rank 1"): - array_ops.gather(params, indices, axis=bad_axis) - # If params rank is unknown, an op error occurs. - with self.assertRaisesOpError( - r"Expected axis in the range \[-1, 1\), but got %s" % bad_axis): - array_ops.gather(params_ph, indices, axis=bad_axis).eval( - feed_dict={params_ph: params}) + with self.subTest(bad_axis=bad_axis): + with self.assertRaisesWithPredicateMatch( + ValueError, "Shape must be at least rank . but is rank 1"): + array_ops.gather(params, indices, axis=bad_axis) + # If params rank is unknown, an op error occurs. + with self.assertRaisesOpError( + r"Expected axis in the range \[-1, 1\), but got %s" % bad_axis): + array_ops.gather(params_ph, indices, axis=bad_axis).eval( + feed_dict={params_ph: params}) @test_util.run_deprecated_v1 def testEmptySlices(self): @@ -252,20 +260,21 @@ class GatherTest(test.TestCase, parameterized.TestCase): for dtype in _TEST_TYPES: for itype in np.int32, np.int64: # Leading axis gather. - params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype) - indices = np.array([3, 4], dtype=itype) - gather = array_ops.gather(params, indices, axis=0) - self.assertAllEqual(gather.eval(), np.zeros((2, 0, 0))) + with self.subTest(dtype=dtype, itype=itype): + params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype) + indices = np.array([3, 4], dtype=itype) + gather = array_ops.gather(params, indices, axis=0) + self.assertAllEqual(gather.eval(), np.zeros((2, 0, 0))) - # Middle axis gather. 
- params = np.zeros((0, 7, 0), dtype=dtype.as_numpy_dtype) - gather = array_ops.gather(params, indices, axis=1) - self.assertAllEqual(gather.eval(), np.zeros((0, 2, 0))) + # Middle axis gather. + params = np.zeros((0, 7, 0), dtype=dtype.as_numpy_dtype) + gather = array_ops.gather(params, indices, axis=1) + self.assertAllEqual(gather.eval(), np.zeros((0, 2, 0))) - # Trailing axis gather. - params = np.zeros((0, 0, 7), dtype=dtype.as_numpy_dtype) - gather = array_ops.gather(params, indices, axis=2) - self.assertAllEqual(gather.eval(), np.zeros((0, 0, 2))) + # Trailing axis gather. + params = np.zeros((0, 0, 7), dtype=dtype.as_numpy_dtype) + gather = array_ops.gather(params, indices, axis=2) + self.assertAllEqual(gather.eval(), np.zeros((0, 0, 2))) @parameterized.parameters([ # batch_dims=0 (equivalent to tf.gather) diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py index 7d9e875be2d..436fef8171f 100644 --- a/tensorflow/python/kernel_tests/unique_op_test.py +++ b/tensorflow/python/kernel_tests/unique_op_test.py @@ -61,17 +61,18 @@ class UniqueTest(test.TestCase): def testInt32Axis(self): for dtype in [np.int32, np.int64]: - x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) - y0, idx0 = gen_array_ops.unique_v2(x, axis=np.array([0], dtype)) - self.assertEqual(y0.shape.rank, 2) - tf_y0, tf_idx0 = self.evaluate([y0, idx0]) - y1, idx1 = gen_array_ops.unique_v2(x, axis=np.array([1], dtype)) - self.assertEqual(y1.shape.rank, 2) - tf_y1, tf_idx1 = self.evaluate([y1, idx1]) - self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) - self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) - self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) - self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) + with self.subTest(dtype=dtype): + x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) + y0, idx0 = gen_array_ops.unique_v2(x, axis=np.array([0], dtype)) + self.assertEqual(y0.shape.rank, 2) + tf_y0, tf_idx0 = self.evaluate([y0, idx0]) + y1, idx1 = gen_array_ops.unique_v2(x, axis=np.array([1], dtype)) + self.assertEqual(y1.shape.rank, 2) + tf_y1, tf_idx1 = self.evaluate([y1, idx1]) + self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) + self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) + self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) + self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) def testInt32V2(self): # This test is only temporary, once V2 is used @@ -144,26 +145,28 @@ class UniqueWithCountsTest(test.TestCase): for i in range(len(x)): self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) for value, count in zip(tf_y, tf_count): - v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)] - self.assertEqual(count, sum(v)) + with self.subTest(value=value, count=count): + v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)] + self.assertEqual(count, sum(v)) def testInt32Axis(self): for dtype in [np.int32, np.int64]: - x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) - y0, idx0, count0 = gen_array_ops.unique_with_counts_v2( - x, axis=np.array([0], dtype)) - self.assertEqual(y0.shape.rank, 2) - tf_y0, tf_idx0, tf_count0 = self.evaluate([y0, idx0, count0]) - y1, idx1, count1 = gen_array_ops.unique_with_counts_v2( - x, axis=np.array([1], dtype)) - self.assertEqual(y1.shape.rank, 2) - tf_y1, tf_idx1, tf_count1 = self.evaluate([y1, idx1, count1]) - self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) - self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) - self.assertAllEqual(tf_count0, 
np.array([2, 1])) - self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) - self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) - self.assertAllEqual(tf_count1, np.array([1, 2])) + with self.subTest(dtype=dtype): + x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) + y0, idx0, count0 = gen_array_ops.unique_with_counts_v2( + x, axis=np.array([0], dtype)) + self.assertEqual(y0.shape.rank, 2) + tf_y0, tf_idx0, tf_count0 = self.evaluate([y0, idx0, count0]) + y1, idx1, count1 = gen_array_ops.unique_with_counts_v2( + x, axis=np.array([1], dtype)) + self.assertEqual(y1.shape.rank, 2) + tf_y1, tf_idx1, tf_count1 = self.evaluate([y1, idx1, count1]) + self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) + self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) + self.assertAllEqual(tf_count0, np.array([2, 1])) + self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) + self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) + self.assertAllEqual(tf_count1, np.array([1, 2])) def testInt32V2(self): # This test is only temporary, once V2 is used From 28feb4df0d4ab386946bdee1a0e5c36cc58246cf Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Wed, 15 Apr 2020 13:15:45 -0700 Subject: [PATCH 0609/1533] Enable build with CUDA 11 --- tensorflow/stream_executor/cuda/BUILD | 4 + .../stream_executor/cuda/cublas_11_0.inc | 4038 +++++++++ .../stream_executor/cuda/cublas_stub.cc | 6 +- tensorflow/stream_executor/cuda/cuda_11_0.inc | 2036 +++++ .../cuda/cuda_runtime_11_0.inc | 1501 ++++ .../cuda/cusolver_dense_11_0.inc | 5953 ++++++++++++ .../stream_executor/cuda/cusparse_11_0.inc | 7942 +++++++++++++++++ .../stream_executor/cuda/cusparse_stub.cc | 3 +- .../platform/default/dso_loader.cc | 16 +- third_party/gpus/cuda/BUILD.tpl | 36 + third_party/gpus/cuda/BUILD.windows.tpl | 36 + third_party/gpus/cuda/cuda_config.h.tpl | 6 +- third_party/gpus/cuda_configure.bzl | 113 +- third_party/gpus/find_cuda_config.py | 151 +- .../gpus/find_cuda_config.py.gz.base64 | 2 +- 15 files changed, 21813 insertions(+), 30 deletions(-) create mode 100644 tensorflow/stream_executor/cuda/cublas_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cuda_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cusparse_11_0.inc diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index 1457a36beaf..67e1726f168 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -273,6 +273,7 @@ cc_library( textual_hdrs = glob(["cufft_*.inc"]), deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cufft_headers", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform:dso_loader", ]), @@ -371,6 +372,7 @@ cc_library( textual_hdrs = ["curand_10_0.inc"], deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:curand_headers", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform:dso_loader", ]), @@ -430,6 +432,7 @@ cc_library( # LINT.IfChange "@local_config_cuda//cuda:cublas_headers", # LINT.ThenChange(//tensorflow/copy.bara.sky:cublas_headers) + "@local_config_cuda//cuda:cusolver_headers", "@local_config_cuda//cuda:cuda_headers", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform:dso_loader", @@ -451,6 +454,7 @@ cc_library( textual_hdrs = glob(["cusparse_*.inc"]), 
deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", + "@local_config_cuda//cuda:cusparse_headers", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform:dso_loader", ]), diff --git a/tensorflow/stream_executor/cuda/cublas_11_0.inc b/tensorflow/stream_executor/cuda/cublas_11_0.inc new file mode 100644 index 00000000000..36ddbfd0648 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cublas_11_0.inc @@ -0,0 +1,4038 @@ +// Auto-generated, do not edit. + +extern "C" { +cublasStatus_t CUBLASWINAPI cublasCreate_v2 (cublasHandle_t *handle) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t *); + static auto func_ptr = LoadSymbol("cublasCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cublasStatus_t CUBLASWINAPI cublasDestroy_v2 (cublasHandle_t handle) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t); + static auto func_ptr = LoadSymbol("cublasDestroy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int *version) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int *); + static auto func_ptr = LoadSymbol("cublasGetVersion_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, version); +} + +cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int *value) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(libraryPropertyType, int *); + static auto func_ptr = LoadSymbol("cublasGetProperty"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(type, value); +} + +size_t CUBLASWINAPI cublasGetCudartVersion(void) { + using FuncPtr = size_t (CUBLASWINAPI *)(); + static auto func_ptr = LoadSymbol("cublasGetCudartVersion"); + if (!func_ptr) return 0; + return func_ptr(); +} + +cublasStatus_t CUBLASWINAPI cublasSetStream_v2 (cublasHandle_t handle, cudaStream_t streamId) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasSetStream_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cublasStatus_t CUBLASWINAPI cublasGetStream_v2 (cublasHandle_t handle, cudaStream_t *streamId) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *); + static auto func_ptr = LoadSymbol("cublasGetStream_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t *mode) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *); + static auto func_ptr = LoadSymbol("cublasGetPointerMode_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t mode) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t); + static auto func_ptr = LoadSymbol("cublasSetPointerMode_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t *mode) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *); + static auto func_ptr = LoadSymbol("cublasGetAtomicsMode"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t); + static auto func_ptr = LoadSymbol("cublasSetAtomicsMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *); + static auto func_ptr = LoadSymbol("cublasGetMathMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t); + static auto func_ptr = LoadSymbol("cublasSetMathMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, int logToStdOut, int logToStdErr, const char* logFileName) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const char *); + static auto func_ptr = LoadSymbol("cublasLoggerConfigure"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(logIsOn, logToStdOut, logToStdErr, logFileName); +} + +cublasStatus_t CUBLASWINAPI cublasSetLoggerCallback(cublasLogCallback userCallback) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLogCallback); + static auto func_ptr = LoadSymbol("cublasSetLoggerCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(userCallback); +} + +cublasStatus_t CUBLASWINAPI cublasGetLoggerCallback(cublasLogCallback* userCallback) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLogCallback *); + static auto func_ptr = LoadSymbol("cublasGetLoggerCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(userCallback); +} + +cublasStatus_t CUBLASWINAPI cublasSetVector (int n, int elemSize, const void *x, + int incx, void *devicePtr, int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int); + static auto func_ptr = LoadSymbol("cublasSetVector"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(n, elemSize, x, incx, devicePtr, incy); +} + +cublasStatus_t CUBLASWINAPI cublasGetVector (int n, int elemSize, const void *x, + int incx, void *y, int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int); + static auto func_ptr = LoadSymbol("cublasGetVector"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(n, elemSize, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSetMatrix (int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int); + static auto func_ptr = LoadSymbol("cublasSetMatrix"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rows, cols, elemSize, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasGetMatrix (int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int); + static auto func_ptr = LoadSymbol("cublasGetMatrix"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rows, 
cols, elemSize, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasSetVectorAsync (int n, int elemSize, + const void *hostPtr, int incx, + void *devicePtr, int incy, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasSetVectorAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream); +} + +cublasStatus_t CUBLASWINAPI cublasGetVectorAsync (int n, int elemSize, + const void *devicePtr, int incx, + void *hostPtr, int incy, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasGetVectorAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream); +} + +cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync (int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb, cudaStream_t stream) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasSetMatrixAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream); +} + +cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync (int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb, cudaStream_t stream) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasGetMatrixAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream); +} + +void CUBLASWINAPI cublasXerbla (const char *srName, int info) { + using FuncPtr = void (CUBLASWINAPI *)(const char *, int); + static auto func_ptr = LoadSymbol("cublasXerbla"); + if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla"); + return func_ptr(srName, info); +} + +cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, + int n, + const void *x, + cudaDataType xType, + int incx, + void *result, + cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasNrm2Ex"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, result, resultType, executionType); +} + +cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, + int n, + const float *x, + int incx, + float *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSnrm2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, + int n, + const double *x, + int incx, + double *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDnrm2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + float *result) { + using 
FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *); + static auto func_ptr = LoadSymbol("cublasScnrm2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + double *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *); + static auto func_ptr = LoadSymbol("cublasDznrm2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDotEx (cublasHandle_t handle, + int n, + const void *x, + cudaDataType xType, + int incx, + const void *y, + cudaDataType yType, + int incy, + void *result, + cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasDotEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType); +} + +cublasStatus_t CUBLASWINAPI cublasDotcEx (cublasHandle_t handle, + int n, + const void *x, + cudaDataType xType, + int incx, + const void *y, + cudaDataType yType, + int incy, + void *result, + cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasDotcEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType); +} + +cublasStatus_t CUBLASWINAPI cublasSdot_v2 (cublasHandle_t handle, + int n, + const float *x, + int incx, + const float *y, + int incy, + float *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSdot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasDdot_v2 (cublasHandle_t handle, + int n, + const double *x, + int incx, + const double *y, + int incy, + double *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDdot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasCdotu_v2 (cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCdotu_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasCdotc_v2 (cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const 
cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCdotc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasZdotu_v2 (cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZdotu_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasZdotc_v2 (cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZdotc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, + int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, + void *x, + cudaDataType xType, + int incx, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType, int, cudaDataType); + static auto func_ptr = LoadSymbol("cublasScalEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType); +} + +cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, + int n, + const float *alpha, /* host or device pointer */ + float *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, + int n, + const double *alpha, /* host or device pointer */ + double *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, + int n, + const cuComplex *alpha, /* host or device pointer */ + cuComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, + int n, + const float *alpha, /* host or device pointer */ + cuComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *alpha, /* host or device 
pointer */ + cuDoubleComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, + int n, + const double *alpha, /* host or device pointer */ + cuDoubleComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZdscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasAxpyEx (cublasHandle_t handle, + int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, + const void *x, + cudaDataType xType, + int incx, + void *y, + cudaDataType yType, + int incy, + cudaDataType executiontype) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, const void *, cudaDataType, int, void *, cudaDataType, int, cudaDataType); + static auto func_ptr = LoadSymbol("cublasAxpyEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy, executiontype); +} + +cublasStatus_t CUBLASWINAPI cublasSaxpy_v2 (cublasHandle_t handle, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + float *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSaxpy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDaxpy_v2 (cublasHandle_t handle, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + double *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDaxpy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCaxpy_v2 (cublasHandle_t handle, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCaxpy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZaxpy_v2 (cublasHandle_t handle, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZaxpy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCopyEx (cublasHandle_t handle, + int n, + const void *x, + cudaDataType xType, + int incx, + void *y, + cudaDataType yType, + int incy) 
{ + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCopyEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy); +} + +cublasStatus_t CUBLASWINAPI cublasScopy_v2 (cublasHandle_t handle, + int n, + const float *x, + int incx, + float *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasScopy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDcopy_v2 (cublasHandle_t handle, + int n, + const double *x, + int incx, + double *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDcopy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCcopy_v2 (cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCcopy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZcopy_v2 (cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZcopy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSswap_v2 (cublasHandle_t handle, + int n, + float *x, + int incx, + float *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSswap_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDswap_v2 (cublasHandle_t handle, + int n, + double *x, + int incx, + double *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDswap_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCswap_v2 (cublasHandle_t handle, + int n, + cuComplex *x, + int incx, + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCswap_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZswap_v2 (cublasHandle_t handle, + int n, + cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZswap_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + 
+cublasStatus_t CUBLASWINAPI cublasSwapEx (cublasHandle_t handle, + int n, + void *x, + cudaDataType xType, + int incx, + void *y, + cudaDataType yType, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasSwapEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy); +} + +cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, + int n, + const float *x, + int incx, + int *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *); + static auto func_ptr = LoadSymbol("cublasIsamax_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, + int n, + const double *x, + int incx, + int *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *); + static auto func_ptr = LoadSymbol("cublasIdamax_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + int *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cublasIcamax_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + int *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cublasIzamax_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIamaxEx(cublasHandle_t handle, + int n, + const void *x, cudaDataType xType, + int incx, + int *result /* host or device pointer */ + ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, int *); + static auto func_ptr = LoadSymbol("cublasIamaxEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, + int n, + const float *x, + int incx, + int *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *); + static auto func_ptr = LoadSymbol("cublasIsamin_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, + int n, + const double *x, + int incx, + int *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *); + static auto func_ptr = LoadSymbol("cublasIdamin_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + int *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cublasIcamin_v2"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + int *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cublasIzamin_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIaminEx(cublasHandle_t handle, + int n, + const void *x, cudaDataType xType, + int incx, + int *result /* host or device pointer */ + ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, int *); + static auto func_ptr = LoadSymbol("cublasIaminEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasAsumEx(cublasHandle_t handle, + int n, + const void *x, + cudaDataType xType, + int incx, + void *result, + cudaDataType resultType, /* host or device pointer */ + cudaDataType executiontype + ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasAsumEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, result, resultType, executiontype); +} + +cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, + int n, + const float *x, + int incx, + float *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSasum_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, + int n, + const double *x, + int incx, + double *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDasum_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, + int n, + const cuComplex *x, + int incx, + float *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *); + static auto func_ptr = LoadSymbol("cublasScasum_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex *x, + int incx, + double *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *); + static auto func_ptr = LoadSymbol("cublasDzasum_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasSrot_v2 (cublasHandle_t handle, + int n, + float *x, + int incx, + float *y, + int incy, + const float *c, /* host or device pointer */ + const float *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *, const float *); + static auto func_ptr = LoadSymbol("cublasSrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + 
+cublasStatus_t CUBLASWINAPI cublasDrot_v2 (cublasHandle_t handle, + int n, + double *x, + int incx, + double *y, + int incy, + const double *c, /* host or device pointer */ + const double *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *, const double *); + static auto func_ptr = LoadSymbol("cublasDrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasCrot_v2 (cublasHandle_t handle, + int n, + cuComplex *x, + int incx, + cuComplex *y, + int incy, + const float *c, /* host or device pointer */ + const cuComplex *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const cuComplex *); + static auto func_ptr = LoadSymbol("cublasCrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, + int n, + cuComplex *x, + int incx, + cuComplex *y, + int incy, + const float *c, /* host or device pointer */ + const float *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const float *); + static auto func_ptr = LoadSymbol("cublasCsrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasZrot_v2 (cublasHandle_t handle, + int n, + cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy, + const double *c, /* host or device pointer */ + const cuDoubleComplex *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, + int n, + cuDoubleComplex *x, + int incx, + cuDoubleComplex *y, + int incy, + const double *c, /* host or device pointer */ + const double *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const double *); + static auto func_ptr = LoadSymbol("cublasZdrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasRotEx (cublasHandle_t handle, + int n, + void *x, + cudaDataType xType, + int incx, + void *y, + cudaDataType yType, + int incy, + const void *c, /* host or device pointer */ + const void *s, + cudaDataType csType, + cudaDataType executiontype) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int, const void *, const void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasRotEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, c, s, csType, executiontype); +} + +cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, + float *a, /* host or device pointer */ + float *b, /* host or device pointer */ + float *c, /* host or device pointer */ + float *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, float *); + static 
auto func_ptr = LoadSymbol("cublasSrotg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, + double *a, /* host or device pointer */ + double *b, /* host or device pointer */ + double *c, /* host or device pointer */ + double *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, double *); + static auto func_ptr = LoadSymbol("cublasDrotg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle, + cuComplex *a, /* host or device pointer */ + cuComplex *b, /* host or device pointer */ + float *c, /* host or device pointer */ + cuComplex *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCrotg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, + cuDoubleComplex *a, /* host or device pointer */ + cuDoubleComplex *b, /* host or device pointer */ + double *c, /* host or device pointer */ + cuDoubleComplex *s) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZrotg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle, + void *a, /* host or device pointer */ + void *b, /* host or device pointer */ + cudaDataType abType, + void *c, /* host or device pointer */ + void *s, /* host or device pointer */ + cudaDataType csType, + cudaDataType executiontype) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, void *, void *, cudaDataType, void *, void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasRotgEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, abType, c, s, csType, executiontype); +} + +cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, + int n, + float *x, + int incx, + float *y, + int incy, + const float* param) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *); + static auto func_ptr = LoadSymbol("cublasSrotm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, param); +} + +cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, + int n, + double *x, + int incx, + double *y, + int incy, + const double* param) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *); + static auto func_ptr = LoadSymbol("cublasDrotm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, param); +} + +cublasStatus_t CUBLASWINAPI cublasRotmEx(cublasHandle_t handle, + int n, + void *x, + cudaDataType xType, + int incx, + void *y, + cudaDataType yType, + int incy, + const void* param, /* host or device pointer */ + cudaDataType paramType, + cudaDataType executiontype) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int, const void *, cudaDataType, cudaDataType); + static auto func_ptr = 
LoadSymbol("cublasRotmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, param, paramType, executiontype); +} + +cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, + float *d1, /* host or device pointer */ + float *d2, /* host or device pointer */ + float *x1, /* host or device pointer */ + const float *y1, /* host or device pointer */ + float *param) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, const float *, float *); + static auto func_ptr = LoadSymbol("cublasSrotmg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, d1, d2, x1, y1, param); +} + +cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, + double *d1, /* host or device pointer */ + double *d2, /* host or device pointer */ + double *x1, /* host or device pointer */ + const double *y1, /* host or device pointer */ + double *param) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, const double *, double *); + static auto func_ptr = LoadSymbol("cublasDrotmg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, d1, d2, x1, y1, param); +} + +cublasStatus_t CUBLASWINAPI cublasRotmgEx(cublasHandle_t handle, + void *d1, /* host or device pointer */ + cudaDataType d1Type, + void *d2, /* host or device pointer */ + cudaDataType d2Type, + void *x1, /* host or device pointer */ + cudaDataType x1Type, + const void *y1, /* host or device pointer */ + cudaDataType y1Type, + void *param, /* host or device pointer */ + cudaDataType paramType, + cudaDataType executiontype + ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, void *, cudaDataType, void *, cudaDataType, void *, cudaDataType, const void *, cudaDataType, void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasRotmgEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, d1, d1Type, d2, d2Type, x1, x1Type, y1, y1Type, param, paramType, executiontype); +} + +cublasStatus_t CUBLASWINAPI cublasSgemv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSgemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDgemv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDgemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCgemv_v2 (cublasHandle_t handle, + cublasOperation_t trans, 
+ int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZgemv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSgbmv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSgbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDgbmv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDgbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCgbmv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); +} + 
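+
+/* Every wrapper in this file follows the same dispatch pattern: the cuBLAS
+ * entry point's signature is declared as FuncPtr, the symbol is resolved
+ * lazily via LoadSymbol and cached in a function-local static, and the call
+ * forwards its arguments unchanged; if the symbol cannot be resolved,
+ * GetSymbolNotFoundError() is returned instead of invoking cuBLAS. */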
+cublasStatus_t CUBLASWINAPI cublasZgbmv_v2 (cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasStrmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *A, + int lda, + float *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtrmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *A, + int lda, + double *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtrmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtrmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex *A, + int lda, + cuComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtrmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const float *A, + int lda, + float *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const double *A, + int lda, + double *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuComplex *A, + int lda, + cuComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *AP, + float *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasStpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *AP, + double *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDtpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex *AP, + cuComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t 
diag, + int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStrsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *A, + int lda, + float *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtrsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *A, + int lda, + double *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtrsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtrsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex *A, + int lda, + cuComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtrsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStpsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float *AP, + float *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasStpsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtpsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double *AP, + double *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, 
int); + static auto func_ptr = LoadSymbol("cublasDtpsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtpsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex *AP, + cuComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtpsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtpsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtpsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStbsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const float *A, + int lda, + float *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStbsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtbsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const double *A, + int lda, + double *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtbsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtbsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuComplex *A, + int lda, + cuComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtbsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtbsv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *x, + int incx) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtbsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, 
uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasSsymv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsymv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDsymv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsymv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCsymv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsymv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZsymv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsymv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasChemv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZhemv_v2 (cublasHandle_t handle, + 
cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSsbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDsbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasChbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZhbmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSspmv_v2 (cublasHandle_t handle, + 
cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *AP, + const float *x, + int incx, + const float *beta, /* host or device pointer */ + float *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSspmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDspmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *AP, + const double *x, + int incx, + const double *beta, /* host or device pointer */ + double *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDspmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasChpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *AP, + const cuComplex *x, + int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZhpmv_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *AP, + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, + int incy) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSger_v2 (cublasHandle_t handle, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + const float *y, + int incy, + float *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const float *, const float *, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSger_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasDger_v2 (cublasHandle_t handle, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + const double *y, + int incy, + double *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const double *, const double *, int, const double *, int, double *, int); + static auto func_ptr = 
LoadSymbol("cublasDger_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasCgeru_v2 (cublasHandle_t handle, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgeru_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasCgerc_v2 (cublasHandle_t handle, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgerc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasZgeru_v2 (cublasHandle_t handle, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgeru_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasZgerc_v2 (cublasHandle_t handle, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgerc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasSsyr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + float *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasDsyr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + double *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, 
n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasCsyr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + cuComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasZsyr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasCher_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + cuComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasZher_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZher_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasSspr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + float *AP) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSspr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, AP); +} + +cublasStatus_t CUBLASWINAPI cublasDspr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + double *AP) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDspr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, AP); +} + +cublasStatus_t CUBLASWINAPI cublasChpr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + cuComplex *AP) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const 
cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasChpr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, AP); +} + +cublasStatus_t CUBLASWINAPI cublasZhpr_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZhpr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, AP); +} + +cublasStatus_t CUBLASWINAPI cublasSsyr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + const float *y, + int incy, + float *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasDsyr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + const double *y, + int incy, + double *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasCsyr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasZsyr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasCher2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const 
cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasZher2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *A, + int lda) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZher2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasSspr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *alpha, /* host or device pointer */ + const float *x, + int incx, + const float *y, + int incy, + float *AP) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSspr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); +} + +cublasStatus_t CUBLASWINAPI cublasDspr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *alpha, /* host or device pointer */ + const double *x, + int incx, + const double *y, + int incy, + double *AP) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDspr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); +} + +cublasStatus_t CUBLASWINAPI cublasChpr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, + int incx, + const cuComplex *y, + int incy, + cuComplex *AP) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasChpr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); +} + +cublasStatus_t CUBLASWINAPI cublasZhpr2_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, + int incx, + const cuDoubleComplex *y, + int incy, + cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZhpr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); +} + +cublasStatus_t CUBLASWINAPI cublasSgemm_v2 (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc) { 
+ using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSgemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDgemm_v2 (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDgemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm_v2 (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm3m (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgemm3m"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm3mEx (cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuComplex *alpha, + const void *A, + cudaDataType Atype, + int lda, + const void *B, + cudaDataType Btype, + int ldb, + const cuComplex *beta, + void *C, + cudaDataType Ctype, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCgemm3mEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI 
cublasZgemm_v2 (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZgemm3m (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgemm3m"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSgemmEx (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + const void *B, + cudaDataType Btype, + int ldb, + const float *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const void *, cudaDataType, int, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasSgemmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasGemmEx (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + const void *B, + cudaDataType Btype, + int ldb, + const void *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *, cudaDataType, int, const void *, cudaDataType, int, const void *, void *, cudaDataType, int, cublasComputeType_t, cublasGemmAlgo_t); + static auto func_ptr = LoadSymbol("cublasGemmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo); +} + +cublasStatus_t CUBLASWINAPI cublasCgemmEx (cublasHandle_t handle, + 
cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuComplex *alpha, + const void *A, + cudaDataType Atype, + int lda, + const void *B, + cudaDataType Btype, + int ldb, + const cuComplex *beta, + void *C, + cudaDataType Ctype, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCgemmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasUint8gemmBias (cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, + int m, int n, int k, + const unsigned char *A, int A_bias, int lda, + const unsigned char *B, int B_bias, int ldb, + unsigned char *C, int C_bias, int ldc, + int C_mult, int C_shift) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t, int, int, int, const unsigned char *, int, int, const unsigned char *, int, int, unsigned char *, int, int, int, int); + static auto func_ptr = LoadSymbol("cublasUint8gemmBias"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B, B_bias, ldb, C, C_bias, ldc, C_mult, C_shift); +} + +cublasStatus_t CUBLASWINAPI cublasSsyrk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *beta, /* host or device pointer */ + float *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyrk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDsyrk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *beta, /* host or device pointer */ + double *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyrk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyrk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyrk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, 
beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZsyrk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyrk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyrkEx ( cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + const cuComplex *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCsyrkEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, + const void *A, + cudaDataType Atype, + int lda, + const cuComplex *beta, + void *C, + cudaDataType Ctype, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCsyrk3mEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCherk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const float *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const cuComplex *, int, const float *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCherk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZherk_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZherk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, 
lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCherkEx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + const float *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCherkEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCherk3mEx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, + const void *A, cudaDataType Atype, + int lda, + const float *beta, + void *C, + cudaDataType Ctype, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCherk3mEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyr2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + 
return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyr2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCher2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZher2k_v2 (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZher2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSsyrkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyrkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDsyrkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, 
cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyrkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyrkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyrkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZsyrkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyrkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCherkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCherkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZherkx (cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZherkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSsymm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, 
+ const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + const float *beta, /* host or device pointer */ + float *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsymm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDsymm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + const double *beta, /* host or device pointer */ + double *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsymm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsymm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsymm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZsymm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsymm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasChemm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, 
n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZhemm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasStrsm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + float *B, + int ldb) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrsm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasDtrsm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + double *B, + int ldb) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtrsm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + cuComplex *B, + int ldb) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrsm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *B, + int ldb) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrsm_v2"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasStrmm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *B, + int ldb, + float *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrmm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDtrmm_v2 (cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *B, + int ldb, + double *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtrmm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrmm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrmm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *const Aarray[], + int lda, + const float *const Barray[], + int ldb, + const float *beta, /* host or device pointer */ + float 
*const Carray[], + int ldc, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *const [], int, const float *const [], int, const float *, float *const [], int, int); + static auto func_ptr = LoadSymbol("cublasSgemmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasDgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *const Aarray[], + int lda, + const double *const Barray[], + int ldb, + const double *beta, /* host or device pointer */ + double *const Carray[], + int ldc, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *const [], int, const double *const [], int, const double *, double *const [], int, int); + static auto func_ptr = LoadSymbol("cublasDgemmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *const Aarray[], + int lda, + const cuComplex *const Barray[], + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *const Carray[], + int ldc, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *const [], int, const cuComplex *const [], int, const cuComplex *, cuComplex *const [], int, int); + static auto func_ptr = LoadSymbol("cublasCgemmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *const Aarray[], + int lda, + const cuComplex *const Barray[], + int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *const Carray[], + int ldc, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *const [], int, const cuComplex *const [], int, const cuComplex *, cuComplex *const [], int, int); + static auto func_ptr = LoadSymbol("cublasCgemm3mBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasZgemmBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *const Aarray[], + int lda, + const cuDoubleComplex *const Barray[], + int ldb, + const cuDoubleComplex *beta, /* host or device 
pointer */ + cuDoubleComplex *const Carray[], + int ldc, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *const [], int, const cuDoubleComplex *const [], int, const cuDoubleComplex *, cuDoubleComplex *const [], int, int); + static auto func_ptr = LoadSymbol("cublasZgemmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, /* host or device pointer */ + const void *const Aarray[], + cudaDataType Atype, + int lda, + const void *const Barray[], + cudaDataType Btype, + int ldb, + const void *beta, /* host or device pointer */ + void *const Carray[], + cudaDataType Ctype, + int ldc, + int batchCount, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *const [], cudaDataType, int, const void *const [], cudaDataType, int, const void *, void *const [], cudaDataType, int, int, cublasComputeType_t, cublasGemmAlgo_t); + static auto func_ptr = LoadSymbol("cublasGemmBatchedEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda, Barray, Btype, ldb, beta, Carray, Ctype, ldc, batchCount, computeType, algo); +} + +cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void *alpha, /* host or device pointer */ + const void *A, + cudaDataType Atype, + int lda, + long long int strideA, /* purposely signed */ + const void *B, + cudaDataType Btype, + int ldb, + long long int strideB, + const void *beta, /* host or device pointer */ + void *C, + cudaDataType Ctype, + int ldc, + long long int strideC, + int batchCount, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *, cudaDataType, int, long long, const void *, cudaDataType, int, long long, const void *, void *, cudaDataType, int, long long, int, cublasComputeType_t, cublasGemmAlgo_t); + static auto func_ptr = LoadSymbol("cublasGemmStridedBatchedEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, strideA, B, Btype, ldb, strideB, beta, C, Ctype, ldc, strideC, batchCount, computeType, algo); +} + +cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + long long int strideA, /* purposely signed */ + const float *B, + int ldb, + long long int strideB, + const float *beta, /* host or device pointer */ + float *C, + int ldc, + long long int strideC, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, long long, const float *, int, long long, const float *, 
float *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasSgemmStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + long long int strideA, /* purposely signed */ + const double *B, + int ldb, + long long int strideB, + const double *beta, /* host or device pointer */ + double *C, + int ldc, + long long int strideC, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, long long, const double *, int, long long, const double *, double *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasDgemmStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex *B, + int ldb, + long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc, + long long int strideC, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasCgemmStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex *B, + int ldb, + long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, + int ldc, + long long int strideC, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasCgemm3mStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched (cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + long long int strideA, /* purposely signed */ + const 
cuDoubleComplex *B, + int ldb, + long long int strideB, + const cuDoubleComplex *beta, /* host or device poi */ + cuDoubleComplex *C, + int ldc, + long long int strideC, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, cuDoubleComplex *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasZgemmStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const float *alpha, /* host or device pointer */ + const float *A, + int lda, + const float *beta , /* host or device pointer */ + const float *B, + int ldb, + float *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSgeam"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const double *alpha, /* host or device pointer */ + const double *A, + int lda, + const double *beta, /* host or device pointer */ + const double *B, + int ldb, + double *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDgeam"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, + int lda, + const cuComplex *beta, /* host or device pointer */ + const cuComplex *B, + int ldb, + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgeam"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + const cuDoubleComplex *B, + int ldb, + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static 
auto func_ptr = LoadSymbol("cublasZgeam"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle, + int n, + float *const A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *const [], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasSgetrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle, + int n, + double *const A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *const [], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasDgetrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle, + int n, + cuComplex *const A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *const [], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasCgetrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle, + int n, + cuDoubleComplex *const A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *const [], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasZgetrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle, + int n, + const float *const A[], /*Device pointer*/ + int lda, + const int *P, /*Device pointer*/ + float *const C[], /*Device pointer*/ + int ldc, + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *const [], int, const int *, float *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasSgetriBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle, + int n, + const double *const A[], /*Device pointer*/ + int lda, + const int *P, /*Device pointer*/ + double *const C[], /*Device pointer*/ + int ldc, + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *const [], int, const int *, double *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasDgetriBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle, + int n, + const cuComplex *const A[], /*Device pointer*/ + int lda, + const int *P, /*Device pointer*/ + 
cuComplex *const C[], /*Device pointer*/ + int ldc, + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *const [], int, const int *, cuComplex *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasCgetriBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle, + int n, + const cuDoubleComplex *const A[], /*Device pointer*/ + int lda, + const int *P, /*Device pointer*/ + cuDoubleComplex *const C[], /*Device pointer*/ + int ldc, + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *const [], int, const int *, cuDoubleComplex *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasZgetriBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasSgetrsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const float *const Aarray[], + int lda, + const int *devIpiv, + float *const Barray[], + int ldb, + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *const [], int, const int *, float *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasSgetrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const double *const Aarray[], + int lda, + const int *devIpiv, + double *const Barray[], + int ldb, + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *const [], int, const int *, double *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasDgetrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCgetrsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuComplex *const Aarray[], + int lda, + const int *devIpiv, + cuComplex *const Barray[], + int ldb, + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *const [], int, const int *, cuComplex *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasCgetrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZgetrsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuDoubleComplex *const Aarray[], + int lda, + const int *devIpiv, + cuDoubleComplex *const Barray[], + int ldb, + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *const [], int, const int *, cuDoubleComplex *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasZgetrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasStrsmBatched( cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float *alpha, /*Host or Device Pointer*/ + const float *const A[], + int lda, + float *const B[], + int ldb, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *const [], int, float *const [], int, int); + static auto func_ptr = LoadSymbol("cublasStrsmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double *alpha, /*Host or Device Pointer*/ + const double *const A[], + int lda, + double *const B[], + int ldb, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *const [], int, double *const [], int, int); + static auto func_ptr = LoadSymbol("cublasDtrsmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCtrsmBatched( cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex *alpha, /*Host or Device Pointer*/ + const cuComplex *const A[], + int lda, + cuComplex *const B[], + int ldb, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *const [], int, cuComplex *const [], int, int); + static auto func_ptr = LoadSymbol("cublasCtrsmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex *alpha, /*Host or Device Pointer*/ + const cuDoubleComplex *const A[], + int lda, + cuDoubleComplex *const B[], + int ldb, + int batchCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *const [], int, cuDoubleComplex *const [], int, int); + static auto func_ptr = LoadSymbol("cublasZtrsmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle, + int n, + const float *const A[], /*Device pointer*/ + int lda, + float *const Ainv[], /*Device pointer*/ + int lda_inv, + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *const [], int, 
float *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasSmatinvBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle, + int n, + const double *const A[], /*Device pointer*/ + int lda, + double *const Ainv[], /*Device pointer*/ + int lda_inv, + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *const [], int, double *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasDmatinvBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle, + int n, + const cuComplex *const A[], /*Device pointer*/ + int lda, + cuComplex *const Ainv[], /*Device pointer*/ + int lda_inv, + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *const [], int, cuComplex *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasCmatinvBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle, + int n, + const cuDoubleComplex *const A[], /*Device pointer*/ + int lda, + cuDoubleComplex *const Ainv[], /*Device pointer*/ + int lda_inv, + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *const [], int, cuDoubleComplex *const [], int, int *, int); + static auto func_ptr = LoadSymbol("cublasZmatinvBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched( cublasHandle_t handle, + int m, + int n, + float *const Aarray[], /*Device pointer*/ + int lda, + float *const TauArray[], /*Device pointer*/ + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, float *const [], int, float *const [], int *, int); + static auto func_ptr = LoadSymbol("cublasSgeqrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched( cublasHandle_t handle, + int m, + int n, + double *const Aarray[], /*Device pointer*/ + int lda, + double *const TauArray[], /*Device pointer*/ + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, double *const [], int, double *const [], int *, int); + static auto func_ptr = LoadSymbol("cublasDgeqrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched( cublasHandle_t handle, + int m, + int n, + cuComplex *const Aarray[], /*Device pointer*/ + int lda, + cuComplex *const TauArray[], /*Device pointer*/ + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, cuComplex *const [], int, cuComplex *const [], int *, int); + static auto func_ptr = LoadSymbol("cublasCgeqrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched( cublasHandle_t handle, + int m, + int n, + cuDoubleComplex *const Aarray[], /*Device pointer*/ + int lda, + cuDoubleComplex *const TauArray[], /*Device pointer*/ + int *info, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, cuDoubleComplex *const [], int, cuDoubleComplex *const [], int *, int); + static auto func_ptr = LoadSymbol("cublasZgeqrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasSgelsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + float *const Aarray[], /*Device pointer*/ + int lda, + float *const Carray[], /*Device pointer*/ + int ldc, + int *info, + int *devInfoArray, /*Device pointer*/ + int batchSize ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, float *const [], int, float *const [], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasSgelsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDgelsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + double *const Aarray[], /*Device pointer*/ + int lda, + double *const Carray[], /*Device pointer*/ + int ldc, + int *info, + int *devInfoArray, /*Device pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, double *const [], int, double *const [], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasDgelsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCgelsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + cuComplex *const Aarray[], /*Device pointer*/ + int lda, + cuComplex *const Carray[], /*Device pointer*/ + int ldc, + int *info, + int *devInfoArray, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *const [], int, cuComplex *const [], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasCgelsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZgelsBatched( cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + cuDoubleComplex *const Aarray[], /*Device pointer*/ + int lda, + cuDoubleComplex *const Carray[], /*Device pointer*/ + int ldc, + int *info, + int *devInfoArray, + int batchSize) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, cuDoubleComplex *const [], int, cuDoubleComplex *const [], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasZgelsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const 
float *A, + int lda, + const float *x, + int incx, + float *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const float *, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSdgmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const double *A, + int lda, + const double *x, + int incx, + double *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const double *, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDdgmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const cuComplex *A, + int lda, + const cuComplex *x, + int incx, + cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCdgmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *x, + int incx, + cuDoubleComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZdgmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasStpttr ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *AP, + float *A, + int lda ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasStpttr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, AP, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasDtpttr ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *AP, + double *A, + int lda ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDtpttr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, AP, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasCtpttr ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *AP, + cuComplex *A, + int lda ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtpttr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, AP, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasZtpttr ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *AP, + cuDoubleComplex *A, + int lda ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, 
int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtpttr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, AP, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasStrttp ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + float *AP ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasStrttp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, AP); +} + +cublasStatus_t CUBLASWINAPI cublasDtrttp ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + double *AP ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDtrttp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, AP); +} + +cublasStatus_t CUBLASWINAPI cublasCtrttp ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + cuComplex *AP ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCtrttp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, AP); +} + +cublasStatus_t CUBLASWINAPI cublasZtrttp ( cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *AP ) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZtrttp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, AP); +} + +} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cublas_stub.cc b/tensorflow/stream_executor/cuda/cublas_stub.cc index dd13ad0960b..1cbfd51316c 100644 --- a/tensorflow/stream_executor/cuda/cublas_stub.cc +++ b/tensorflow/stream_executor/cuda/cublas_stub.cc @@ -12,7 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#if CUBLAS_VER_MAJOR >= 11 +#include "third_party/gpus/cuda/include/cublas_v2.h" +#else #include "third_party/gpus/cuda/include/cublas.h" +#endif #include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" @@ -65,7 +69,7 @@ typedef enum {} cublasMath_t; #include "tensorflow/stream_executor/cuda/cublas_10_1.inc" #elif CUDA_VERSION == 10020 #include "tensorflow/stream_executor/cuda/cublas_10_2.inc" -#elif CUDA_VERSION == 11000 +#elif CUBLAS_VER_MAJOR == 11 && CUBLAS_VER_MINOR == 0 #include "tensorflow/stream_executor/cuda/cublas_11_0.inc" #else #error "We have no wrapper for this version." diff --git a/tensorflow/stream_executor/cuda/cuda_11_0.inc b/tensorflow/stream_executor/cuda/cuda_11_0.inc new file mode 100644 index 00000000000..21d91b7cb73 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_11_0.inc @@ -0,0 +1,2036 @@ +// Auto-generated, do not edit. 
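// Every wrapper in these generated .inc files follows the same pattern: look up
// the real entry point in the CUDA driver / cuBLAS shared library on first use,
// cache it in a function-local static, and forward the arguments unchanged; if
// the symbol cannot be resolved, a "symbol not found" status is returned instead
// of crashing at load time. The sketch below is for reference only and is not
// part of the generated file: it uses POSIX dlopen/dlsym and made-up names
// (LoadSymbolSketch, kSomeErrorCode, cuInitSketch) to illustrate the idea,
// whereas the real stubs go through the stream_executor DSO-loading helpers
// (LoadSymbol, GetSymbolNotFoundError) seen throughout the surrounding code.

#include <dlfcn.h>

using StatusLike = int;                     // stand-in for CUresult / cublasStatus_t
constexpr StatusLike kSomeErrorCode = 999;  // stand-in for a "symbol not found" status

template <typename FuncPtrT>
FuncPtrT LoadSymbolSketch(const char* name) {
  // Open the driver library once and reuse the handle for all later lookups.
  static void* handle = dlopen("libcuda.so.1", RTLD_LAZY | RTLD_GLOBAL);
  return handle ? reinterpret_cast<FuncPtrT>(dlsym(handle, name)) : nullptr;
}

StatusLike cuInitSketch(unsigned int flags) {
  using FuncPtr = StatusLike (*)(unsigned int);
  static auto func_ptr = LoadSymbolSketch<FuncPtr>("cuInit");  // resolved only once
  if (!func_ptr) return kSomeErrorCode;
  return func_ptr(flags);
}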
+ +extern "C" { +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) { + using FuncPtr = CUresult (CUDAAPI *)(CUresult, const char **); + static auto func_ptr = LoadSymbol("cuGetErrorString"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(error, pStr); +} + +CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) { + using FuncPtr = CUresult (CUDAAPI *)(CUresult, const char **); + static auto func_ptr = LoadSymbol("cuGetErrorName"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(error, pStr); +} + +CUresult CUDAAPI cuInit(unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(unsigned int); + static auto func_ptr = LoadSymbol("cuInit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(Flags); +} + +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) { + using FuncPtr = CUresult (CUDAAPI *)(int *); + static auto func_ptr = LoadSymbol("cuDriverGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(driverVersion); +} + +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) { + using FuncPtr = CUresult (CUDAAPI *)(CUdevice *, int); + static auto func_ptr = LoadSymbol("cuDeviceGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device, ordinal); +} + +CUresult CUDAAPI cuDeviceGetCount(int *count) { + using FuncPtr = CUresult (CUDAAPI *)(int *); + static auto func_ptr = LoadSymbol("cuDeviceGetCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(count); +} + +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(char *, int, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetName"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(name, len, dev); +} + +CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(CUuuid *, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetUuid"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(uuid, dev); +} + +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(size_t *, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceTotalMem_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(bytes, dev); +} + +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(int *, CUdevice_attribute, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pi, attrib, dev); +} + +CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags) { + using FuncPtr = CUresult (CUDAAPI *)(void *, CUdevice, int); + static auto func_ptr = LoadSymbol("cuDeviceGetNvSciSyncAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(nvSciSyncAttrList, dev, flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(CUdevprop *, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetProperties"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(prop, dev); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(int *, int *, CUdevice); + static auto func_ptr = 
LoadSymbol("cuDeviceComputeCapability"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(major, minor, dev); +} + +CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext *, CUdevice); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxRetain"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx, dev); +} + +CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(CUdevice); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxRelease_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev); +} + +CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUdevice, unsigned int); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxSetFlags_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev, flags); +} + +CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active) { + using FuncPtr = CUresult (CUDAAPI *)(CUdevice, unsigned int *, int *); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxGetState"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev, flags, active); +} + +CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(CUdevice); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxReset_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev); +} + +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext *, unsigned int, CUdevice); + static auto func_ptr = LoadSymbol("cuCtxCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx, flags, dev); +} + +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxDestroy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx); +} + +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxPushCurrent_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx); +} + +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext *); + static auto func_ptr = LoadSymbol("cuCtxPopCurrent_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx); +} + +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxSetCurrent"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx); +} + +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext *); + static auto func_ptr = LoadSymbol("cuCtxGetCurrent"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx); +} + +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) { + using FuncPtr = CUresult (CUDAAPI *)(CUdevice *); + static auto func_ptr = LoadSymbol("cuCtxGetDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device); +} + +CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) { + using FuncPtr = CUresult (CUDAAPI *)(unsigned int *); + static auto func_ptr = LoadSymbol("cuCtxGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(flags); 
+} + +CUresult CUDAAPI cuCtxSynchronize(void) { + using FuncPtr = CUresult (CUDAAPI *)(); + static auto func_ptr = LoadSymbol("cuCtxSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) { + using FuncPtr = CUresult (CUDAAPI *)(CUlimit, size_t); + static auto func_ptr = LoadSymbol("cuCtxSetLimit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(limit, value); +} + +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) { + using FuncPtr = CUresult (CUDAAPI *)(size_t *, CUlimit); + static auto func_ptr = LoadSymbol("cuCtxGetLimit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pvalue, limit); +} + +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunc_cache *); + static auto func_ptr = LoadSymbol("cuCtxGetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pconfig); +} + +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunc_cache); + static auto func_ptr = LoadSymbol("cuCtxSetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(config); +} + +CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) { + using FuncPtr = CUresult (CUDAAPI *)(CUsharedconfig *); + static auto func_ptr = LoadSymbol("cuCtxGetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pConfig); +} + +CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) { + using FuncPtr = CUresult (CUDAAPI *)(CUsharedconfig); + static auto func_ptr = LoadSymbol("cuCtxSetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(config); +} + +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext, unsigned int *); + static auto func_ptr = LoadSymbol("cuCtxGetApiVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx, version); +} + +CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority) { + using FuncPtr = CUresult (CUDAAPI *)(int *, int *); + static auto func_ptr = LoadSymbol("cuCtxGetStreamPriorityRange"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(leastPriority, greatestPriority); +} + +CUresult CUDAAPI cuCtxResetPersistingL2Cache(void) { + using FuncPtr = CUresult (CUDAAPI *)(); + static auto func_ptr = LoadSymbol("cuCtxResetPersistingL2Cache"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext *, unsigned int); + static auto func_ptr = LoadSymbol("cuCtxAttach"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx, flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxDetach"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx); +} + +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) { + using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, const char *); + static auto func_ptr = LoadSymbol("cuModuleLoad"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(module, fname); +} + +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) 
{ + using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, const void *); + static auto func_ptr = LoadSymbol("cuModuleLoadData"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(module, image); +} + +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues) { + using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, const void *, unsigned int, CUjit_option *, void **); + static auto func_ptr = LoadSymbol("cuModuleLoadDataEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(module, image, numOptions, options, optionValues); +} + +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) { + using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, const void *); + static auto func_ptr = LoadSymbol("cuModuleLoadFatBinary"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(module, fatCubin); +} + +CUresult CUDAAPI cuModuleUnload(CUmodule hmod) { + using FuncPtr = CUresult (CUDAAPI *)(CUmodule); + static auto func_ptr = LoadSymbol("cuModuleUnload"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hmod); +} + +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunction *, CUmodule, const char *); + static auto func_ptr = LoadSymbol("cuModuleGetFunction"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, hmod, name); +} + +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *); + static auto func_ptr = LoadSymbol("cuModuleGetGlobal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr, bytes, hmod, name); +} + +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref *, CUmodule, const char *); + static auto func_ptr = LoadSymbol("cuModuleGetTexRef"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexRef, hmod, name); +} + +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name) { + using FuncPtr = CUresult (CUDAAPI *)(CUsurfref *, CUmodule, const char *); + static auto func_ptr = LoadSymbol("cuModuleGetSurfRef"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pSurfRef, hmod, name); +} + +CUresult CUDAAPI +cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut) { + using FuncPtr = CUresult (CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *); + static auto func_ptr = LoadSymbol("cuLinkCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numOptions, options, optionValues, stateOut); +} + +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues) { + using FuncPtr = CUresult (CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t, const char *, unsigned int, CUjit_option *, void **); + static auto func_ptr = LoadSymbol("cuLinkAddData_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(state, type, data, size, name, numOptions, options, optionValues); +} + +CUresult CUDAAPI +cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option 
*options, void **optionValues) { + using FuncPtr = CUresult (CUDAAPI *)(CUlinkState, CUjitInputType, const char *, unsigned int, CUjit_option *, void **); + static auto func_ptr = LoadSymbol("cuLinkAddFile_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(state, type, path, numOptions, options, optionValues); +} + +CUresult CUDAAPI +cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut) { + using FuncPtr = CUresult (CUDAAPI *)(CUlinkState, void **, size_t *); + static auto func_ptr = LoadSymbol("cuLinkComplete"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(state, cubinOut, sizeOut); +} + +CUresult CUDAAPI +cuLinkDestroy(CUlinkState state) { + using FuncPtr = CUresult (CUDAAPI *)(CUlinkState); + static auto func_ptr = LoadSymbol("cuLinkDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(state); +} + +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) { + using FuncPtr = CUresult (CUDAAPI *)(size_t *, size_t *); + static auto func_ptr = LoadSymbol("cuMemGetInfo_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(free, total); +} + +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t); + static auto func_ptr = LoadSymbol("cuMemAlloc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr, bytesize); +} + +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuMemAllocPitch_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes); +} + +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr); + static auto func_ptr = LoadSymbol("cuMemFree_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr); +} + +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuMemGetAddressRange_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pbase, psize, dptr); +} + +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) { + using FuncPtr = CUresult (CUDAAPI *)(void **, size_t); + static auto func_ptr = LoadSymbol("cuMemAllocHost_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pp, bytesize); +} + +CUresult CUDAAPI cuMemFreeHost(void *p) { + using FuncPtr = CUresult (CUDAAPI *)(void *); + static auto func_ptr = LoadSymbol("cuMemFreeHost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p); +} + +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(void **, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuMemHostAlloc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pp, bytesize, Flags); +} + +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, void *, unsigned int); + static auto func_ptr = LoadSymbol("cuMemHostGetDevicePointer_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pdptr, p, Flags); +} + +CUresult 
CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) { + using FuncPtr = CUresult (CUDAAPI *)(unsigned int *, void *); + static auto func_ptr = LoadSymbol("cuMemHostGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pFlags, p); +} + +CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuMemAllocManaged"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr, bytesize, flags); +} + +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) { + using FuncPtr = CUresult (CUDAAPI *)(CUdevice *, const char *); + static auto func_ptr = LoadSymbol("cuDeviceGetByPCIBusId"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev, pciBusId); +} + +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) { + using FuncPtr = CUresult (CUDAAPI *)(char *, int, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetPCIBusId"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pciBusId, len, dev); +} + +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) { + using FuncPtr = CUresult (CUDAAPI *)(CUipcEventHandle *, CUevent); + static auto func_ptr = LoadSymbol("cuIpcGetEventHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, event); +} + +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle) { + using FuncPtr = CUresult (CUDAAPI *)(CUevent *, CUipcEventHandle); + static auto func_ptr = LoadSymbol("cuIpcOpenEventHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phEvent, handle); +} + +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) { + using FuncPtr = CUresult (CUDAAPI *)(CUipcMemHandle *, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuIpcGetMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, dptr); +} + +CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int); + static auto func_ptr = LoadSymbol("cuIpcOpenMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pdptr, handle, Flags); +} + +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr); + static auto func_ptr = LoadSymbol("cuIpcCloseMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr); +} + +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(void *, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuMemHostRegister_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p, bytesize, Flags); +} + +CUresult CUDAAPI cuMemHostUnregister(void *p) { + using FuncPtr = CUresult (CUDAAPI *)(void *); + static auto func_ptr = LoadSymbol("cuMemHostUnregister"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p); +} + +CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemcpy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, ByteCount); +} + +CUresult CUDAAPI 
cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount); +} + +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, const void *, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyHtoD_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcHost, ByteCount); +} + +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(void *, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyDtoH_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstHost, srcDevice, ByteCount); +} + +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyDtoD_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcDevice, ByteCount); +} + +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyDtoA_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstArray, dstOffset, srcDevice, ByteCount); +} + +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyAtoD_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcArray, srcOffset, ByteCount); +} + +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray, size_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyHtoA_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstArray, dstOffset, srcHost, ByteCount); +} + +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(void *, CUarray, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyAtoH_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstHost, srcArray, srcOffset, ByteCount); +} + +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyAtoA_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount); +} + +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) { + using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY2D *); + static auto func_ptr = LoadSymbol("cuMemcpy2D_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy); +} + +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) { + using FuncPtr = CUresult 
(CUDAAPI *)(const CUDA_MEMCPY2D *); + static auto func_ptr = LoadSymbol("cuMemcpy2DUnaligned_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy); +} + +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) { + using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY3D *); + static auto func_ptr = LoadSymbol("cuMemcpy3D_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy); +} + +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) { + using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY3D_PEER *); + static auto func_ptr = LoadSymbol("cuMemcpy3DPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy); +} + +CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyPeerAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyHtoDAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcHost, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyDtoHAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstHost, srcDevice, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyDtoDAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcDevice, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray, size_t, const void *, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyHtoAAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyAtoHAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) { + using FuncPtr = CUresult 
(CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpy2DAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy, hStream); +} + +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpy3DAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy, hStream); +} + +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpy3DPeerAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy, hStream); +} + +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned char, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD8_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, uc, N); +} + +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned short, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD16_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, us, N); +} + +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned int, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD32_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, ui, N); +} + +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD2D8_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, uc, Width, Height); +} + +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD2D16_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, us, Width, Height); +} + +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD2D32_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, ui, Width, Height); +} + +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD8Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, uc, N, hStream); +} + +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD16Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(dstDevice, us, N, hStream); +} + +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD32Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, ui, N, hStream); +} + +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD2D8Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream); +} + +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD2D16Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream); +} + +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD2D32Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream); +} + +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *); + static auto func_ptr = LoadSymbol("cuArrayCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, pAllocateArray); +} + +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray) { + using FuncPtr = CUresult (CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray); + static auto func_ptr = LoadSymbol("cuArrayGetDescriptor_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pArrayDescriptor, hArray); +} + +CUresult CUDAAPI cuArrayDestroy(CUarray hArray) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray); + static auto func_ptr = LoadSymbol("cuArrayDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hArray); +} + +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *); + static auto func_ptr = LoadSymbol("cuArray3DCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, pAllocateArray); +} + +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) { + using FuncPtr = CUresult (CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray); + static auto func_ptr = LoadSymbol("cuArray3DGetDescriptor_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pArrayDescriptor, hArray); +} + +CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels) { + using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int); + static auto func_ptr = 
LoadSymbol("cuMipmappedArrayCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels); +} + +CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int); + static auto func_ptr = LoadSymbol("cuMipmappedArrayGetLevel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pLevelArray, hMipmappedArray, level); +} + +CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) { + using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray); + static auto func_ptr = LoadSymbol("cuMipmappedArrayDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hMipmappedArray); +} + +CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t, size_t, CUdeviceptr, unsigned long long); + static auto func_ptr = LoadSymbol("cuMemAddressReserve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size, alignment, addr, flags); +} + +CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemAddressFree"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size); +} + +CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUmemGenericAllocationHandle *, size_t, const CUmemAllocationProp *, unsigned long long); + static auto func_ptr = LoadSymbol("cuMemCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, size, prop, flags); +} + +CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) { + using FuncPtr = CUresult (CUDAAPI *)(CUmemGenericAllocationHandle); + static auto func_ptr = LoadSymbol("cuMemRelease"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, size_t, CUmemGenericAllocationHandle, unsigned long long); + static auto func_ptr = LoadSymbol("cuMemMap"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size, offset, handle, flags); +} + +CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemUnmap"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size); +} + +CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, const CUmemAccessDesc *, size_t); + static auto func_ptr = LoadSymbol("cuMemSetAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size, desc, count); +} + +CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr) { + using FuncPtr = CUresult (CUDAAPI *)(unsigned long long *, const CUmemLocation *, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuMemGetAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(flags, location, ptr); +} + +CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags) { + using FuncPtr = CUresult (CUDAAPI *)(void *, CUmemGenericAllocationHandle, CUmemAllocationHandleType, unsigned long long); + static auto func_ptr = LoadSymbol("cuMemExportToShareableHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(shareableHandle, handle, handleType, flags); +} + +CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType) { + using FuncPtr = CUresult (CUDAAPI *)(CUmemGenericAllocationHandle *, void *, CUmemAllocationHandleType); + static auto func_ptr = LoadSymbol("cuMemImportFromShareableHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, osHandle, shHandleType); +} + +CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option) { + using FuncPtr = CUresult (CUDAAPI *)(size_t *, const CUmemAllocationProp *, CUmemAllocationGranularity_flags); + static auto func_ptr = LoadSymbol("cuMemGetAllocationGranularity"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(granularity, prop, option); +} + +CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle) { + using FuncPtr = CUresult (CUDAAPI *)(CUmemAllocationProp *, CUmemGenericAllocationHandle); + static auto func_ptr = LoadSymbol("cuMemGetAllocationPropertiesFromHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(prop, handle); +} + +CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr) { + using FuncPtr = CUresult (CUDAAPI *)(CUmemGenericAllocationHandle *, void *); + static auto func_ptr = LoadSymbol("cuMemRetainAllocationHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, addr); +} + +CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr) { + using FuncPtr = CUresult (CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuPointerGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, attribute, ptr); +} + +CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream); + static auto func_ptr = LoadSymbol("cuMemPrefetchAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, count, dstDevice, hStream); +} + +CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice); + static auto func_ptr = LoadSymbol("cuMemAdvise"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, count, advice, device); +} + +CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count) { + using FuncPtr = CUresult (CUDAAPI *)(void *, size_t, CUmem_range_attribute, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemRangeGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, dataSize, attribute, devPtr, 
count); +} + +CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count) { + using FuncPtr = CUresult (CUDAAPI *)(void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemRangeGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count); +} + +CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr) { + using FuncPtr = CUresult (CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuPointerSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(value, attribute, ptr); +} + +CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr) { + using FuncPtr = CUresult (CUDAAPI *)(unsigned int, CUpointer_attribute *, void **, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuPointerGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numAttributes, attributes, data, ptr); +} + +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream *, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phStream, Flags); +} + +CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream *, unsigned int, int); + static auto func_ptr = LoadSymbol("cuStreamCreateWithPriority"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phStream, flags, priority); +} + +CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, int *); + static auto func_ptr = LoadSymbol("cuStreamGetPriority"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, priority); +} + +CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, unsigned int *); + static auto func_ptr = LoadSymbol("cuStreamGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, flags); +} + +CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUcontext *); + static auto func_ptr = LoadSymbol("cuStreamGetCtx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, pctx); +} + +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUevent, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWaitEvent"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, hEvent, Flags); +} + +CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamAddCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, callback, userData, flags); +} + +CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode) { + using FuncPtr = CUresult (CUDAAPI 
*)(CUstream, CUstreamCaptureMode); + static auto func_ptr = LoadSymbol("cuStreamBeginCapture_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, mode); +} + +CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode) { + using FuncPtr = CUresult (CUDAAPI *)(CUstreamCaptureMode *); + static auto func_ptr = LoadSymbol("cuThreadExchangeStreamCaptureMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mode); +} + +CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUgraph *); + static auto func_ptr = LoadSymbol("cuStreamEndCapture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, phGraph); +} + +CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUstreamCaptureStatus *); + static auto func_ptr = LoadSymbol("cuStreamIsCapturing"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, captureStatus); +} + +CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus, cuuint64_t *id) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUstreamCaptureStatus *, cuuint64_t *); + static auto func_ptr = LoadSymbol("cuStreamGetCaptureInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, captureStatus, id); +} + +CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamAttachMemAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, dptr, length, flags); +} + +CUresult CUDAAPI cuStreamQuery(CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream); + static auto func_ptr = LoadSymbol("cuStreamQuery"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream); +} + +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream); + static auto func_ptr = LoadSymbol("cuStreamSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream); +} + +CUresult CUDAAPI cuStreamDestroy(CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream); + static auto func_ptr = LoadSymbol("cuStreamDestroy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream); +} + +CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUstream); + static auto func_ptr = LoadSymbol("cuStreamCopyAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src); +} + +CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, + CUstreamAttrValue *value_out) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUstreamAttrID, CUstreamAttrValue *); + static auto func_ptr = LoadSymbol("cuStreamGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, attr, value_out); +} + +CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, + const CUstreamAttrValue *value) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUstreamAttrID, const CUstreamAttrValue *); + static auto func_ptr = LoadSymbol("cuStreamSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(hStream, attr, value); +} + +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUevent *, unsigned int); + static auto func_ptr = LoadSymbol("cuEventCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phEvent, Flags); +} + +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUevent, CUstream); + static auto func_ptr = LoadSymbol("cuEventRecord"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hEvent, hStream); +} + +CUresult CUDAAPI cuEventQuery(CUevent hEvent) { + using FuncPtr = CUresult (CUDAAPI *)(CUevent); + static auto func_ptr = LoadSymbol("cuEventQuery"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hEvent); +} + +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) { + using FuncPtr = CUresult (CUDAAPI *)(CUevent); + static auto func_ptr = LoadSymbol("cuEventSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hEvent); +} + +CUresult CUDAAPI cuEventDestroy(CUevent hEvent) { + using FuncPtr = CUresult (CUDAAPI *)(CUevent); + static auto func_ptr = LoadSymbol("cuEventDestroy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hEvent); +} + +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd) { + using FuncPtr = CUresult (CUDAAPI *)(float *, CUevent, CUevent); + static auto func_ptr = LoadSymbol("cuEventElapsedTime"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pMilliseconds, hStart, hEnd); +} + +CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) { + using FuncPtr = CUresult (CUDAAPI *)(CUexternalMemory *, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *); + static auto func_ptr = LoadSymbol("cuImportExternalMemory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extMem_out, memHandleDesc); +} + +CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, CUexternalMemory, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *); + static auto func_ptr = LoadSymbol("cuExternalMemoryGetMappedBuffer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, extMem, bufferDesc); +} + +CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) { + using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray *, CUexternalMemory, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *); + static auto func_ptr = LoadSymbol("cuExternalMemoryGetMappedMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmap, extMem, mipmapDesc); +} + +CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) { + using FuncPtr = CUresult (CUDAAPI *)(CUexternalMemory); + static auto func_ptr = LoadSymbol("cuDestroyExternalMemory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extMem); +} + +CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) { + using FuncPtr = CUresult (CUDAAPI *)(CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *); + static auto func_ptr = LoadSymbol("cuImportExternalSemaphore"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(extSem_out, semHandleDesc); +} + +CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream) { + using FuncPtr = CUresult (CUDAAPI *)(const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream); + static auto func_ptr = LoadSymbol("cuSignalExternalSemaphoresAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSemArray, paramsArray, numExtSems, stream); +} + +CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream) { + using FuncPtr = CUresult (CUDAAPI *)(const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *, unsigned int, CUstream); + static auto func_ptr = LoadSymbol("cuWaitExternalSemaphoresAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSemArray, paramsArray, numExtSems, stream); +} + +CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) { + using FuncPtr = CUresult (CUDAAPI *)(CUexternalSemaphore); + static auto func_ptr = LoadSymbol("cuDestroyExternalSemaphore"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSem); +} + +CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWaitValue32"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, addr, value, flags); +} + +CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWaitValue64"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, addr, value, flags); +} + +CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWriteValue32"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, addr, value, flags); +} + +CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWriteValue64"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, addr, value, flags); +} + +CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, unsigned int, CUstreamBatchMemOpParams *, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamBatchMemOp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, count, paramArray, flags); +} + +CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc) { + using FuncPtr = CUresult (CUDAAPI *)(int *, CUfunction_attribute, CUfunction); + static auto func_ptr = LoadSymbol("cuFuncGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); 
+ return func_ptr(pi, attrib, hfunc); +} + +CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunction, CUfunction_attribute, int); + static auto func_ptr = LoadSymbol("cuFuncSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, attrib, value); +} + +CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunction, CUfunc_cache); + static auto func_ptr = LoadSymbol("cuFuncSetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, config); +} + +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunction, CUsharedconfig); + static auto func_ptr = LoadSymbol("cuFuncSetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, config); +} + +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **); + static auto func_ptr = LoadSymbol("cuLaunchKernel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); +} + +CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **); + static auto func_ptr = LoadSymbol("cuLaunchCooperativeKernel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams); +} + +CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, unsigned int); + static auto func_ptr = LoadSymbol("cuLaunchCooperativeKernelMultiDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(launchParamsList, numDevices, flags); +} + +CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData) { + using FuncPtr = CUresult (CUDAAPI *)(CUstream, CUhostFn, void *); + static auto func_ptr = LoadSymbol("cuLaunchHostFunc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, fn, userData); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) { + using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, int, int); + static auto func_ptr = LoadSymbol("cuFuncSetBlockShape"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, x, y, z); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) { + using FuncPtr = CUresult 
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncSetSharedSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, bytes);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, numbytes);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSeti");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, offset, value);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, float);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetf");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, offset, value);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, void *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetv");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, offset, ptr, numbytes);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunch");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(f);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGrid");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(f, grid_width, grid_height);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, int, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuLaunchGridAsync");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(f, grid_width, grid_height, hStream);
+}
+
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUfunction, int, CUtexref);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuParamSetTexRef");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hfunc, texunit, hTexRef);
+}
+
+CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph *, unsigned int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphCreate");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraph, flags);
+}
+
+CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, const CUDA_KERNEL_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddKernelNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeGetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphKernelNodeSetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, const CUDA_MEMCPY3D *, CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemcpyNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, copyParams, ctx);
+}
+
+CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeGetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemcpyNodeSetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, const CUDA_MEMSET_NODE_PARAMS *, CUcontext);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddMemsetNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, memsetParams, ctx);
+}
+
+CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeGetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphMemsetNodeSetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, const CUDA_HOST_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddHostNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeGetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphHostNodeSetParams");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, nodeParams);
+}
+
+CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, CUgraph);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddChildGraphNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, childGraph);
+}
+
+CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraph *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphChildGraphNodeGetGraph");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, phGraph);
+}
+
+CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphAddEmptyNode");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphNode, hGraph, dependencies, numDependencies);
+}
+
+CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph *, CUgraph);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphClone");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phGraphClone, originalGraph);
+}
+
+CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeFindInClone");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(phNode, hOriginalNode, hClonedGraph);
+}
+
+CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraphNodeType *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphNodeGetType");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hNode, type);
+}
+
+CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetNodes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraph, nodes, numNodes);
+}
+
+CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUgraph, CUgraphNode *, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphGetRootNodes");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hGraph, rootNodes, numRootNodes);
+}
+
+CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph,
CUgraphNode *from, CUgraphNode *to, size_t *numEdges) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *); + static auto func_ptr = LoadSymbol("cuGraphGetEdges"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph, from, to, numEdges); +} + +CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *); + static auto func_ptr = LoadSymbol("cuGraphNodeGetDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, dependencies, numDependencies); +} + +CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *); + static auto func_ptr = LoadSymbol("cuGraphNodeGetDependentNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, dependentNodes, numDependentNodes); +} + +CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraph, const CUgraphNode *, const CUgraphNode *, size_t); + static auto func_ptr = LoadSymbol("cuGraphAddDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph, from, to, numDependencies); +} + +CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraph, const CUgraphNode *, const CUgraphNode *, size_t); + static auto func_ptr = LoadSymbol("cuGraphRemoveDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph, from, to, numDependencies); +} + +CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode); + static auto func_ptr = LoadSymbol("cuGraphDestroyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode); +} + +CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec *, CUgraph, CUgraphNode *, char *, size_t); + static auto func_ptr = LoadSymbol("cuGraphInstantiate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize); +} + +CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec, CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphExecKernelNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec, CUgraphNode, const CUDA_MEMCPY3D *, CUcontext); + static auto func_ptr = LoadSymbol("cuGraphExecMemcpyNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hNode, copyParams, ctx); +} + +CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) { + using 
FuncPtr = CUresult (CUDAAPI *)(CUgraphExec, CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *, CUcontext); + static auto func_ptr = LoadSymbol("cuGraphExecMemsetNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hNode, memsetParams, ctx); +} + +CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec, CUgraphNode, const CUDA_HOST_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphExecHostNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec, CUstream); + static auto func_ptr = LoadSymbol("cuGraphLaunch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hStream); +} + +CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec); + static auto func_ptr = LoadSymbol("cuGraphExecDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec); +} + +CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraph); + static auto func_ptr = LoadSymbol("cuGraphDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph); +} + +CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphExec, CUgraph, CUgraphNode *, CUgraphExecUpdateResult *); + static auto func_ptr = LoadSymbol("cuGraphExecUpdate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out); +} + +CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUgraphNode); + static auto func_ptr = LoadSymbol("cuGraphKernelNodeCopyAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src); +} + +CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + CUkernelNodeAttrValue *value_out) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID, CUkernelNodeAttrValue *); + static auto func_ptr = LoadSymbol("cuGraphKernelNodeGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, attr, value_out); +} + +CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + const CUkernelNodeAttrValue *value) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID, const CUkernelNodeAttrValue *); + static auto func_ptr = LoadSymbol("cuGraphKernelNodeSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, attr, value); +} + +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) { + using FuncPtr = CUresult (CUDAAPI *)(int *, CUfunction, int, size_t); + static auto func_ptr = LoadSymbol("cuOccupancyMaxActiveBlocksPerMultiprocessor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numBlocks, func, blockSize, dynamicSMemSize); +} + +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, 
size_t dynamicSMemSize, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags); +} + +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit) { + using FuncPtr = CUresult (CUDAAPI *)(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int); + static auto func_ptr = LoadSymbol("cuOccupancyMaxPotentialBlockSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit); +} + +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int); + static auto func_ptr = LoadSymbol("cuOccupancyMaxPotentialBlockSizeWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit, flags); +} + +CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize) { + using FuncPtr = CUresult (CUDAAPI *)(size_t *, CUfunction, int, int); + static auto func_ptr = LoadSymbol("cuOccupancyAvailableDynamicSMemPerBlock"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dynamicSmemSize, func, numBlocks, blockSize); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUarray, unsigned int); + static auto func_ptr = LoadSymbol("cuTexRefSetArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, hArray, Flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int); + static auto func_ptr = LoadSymbol("cuTexRefSetMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, hMipmappedArray, Flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes) { + using FuncPtr = CUresult (CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuTexRefSetAddress_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ByteOffset, hTexRef, dptr, bytes); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuTexRefSetAddress2D_v3"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, desc, dptr, Pitch); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents) { + using FuncPtr = CUresult (CUDAAPI 
*)(CUtexref, CUarray_format, int); + static auto func_ptr = LoadSymbol("cuTexRefSetFormat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, fmt, NumPackedComponents); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, int, CUaddress_mode); + static auto func_ptr = LoadSymbol("cuTexRefSetAddressMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, dim, am); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUfilter_mode); + static auto func_ptr = LoadSymbol("cuTexRefSetFilterMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, fm); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, CUfilter_mode); + static auto func_ptr = LoadSymbol("cuTexRefSetMipmapFilterMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, fm); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, float); + static auto func_ptr = LoadSymbol("cuTexRefSetMipmapLevelBias"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, bias); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, float, float); + static auto func_ptr = LoadSymbol("cuTexRefSetMipmapLevelClamp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, unsigned int); + static auto func_ptr = LoadSymbol("cuTexRefSetMaxAnisotropy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, maxAniso); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, float *); + static auto func_ptr = LoadSymbol("cuTexRefSetBorderColor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, pBorderColor); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref, unsigned int); + static auto func_ptr = LoadSymbol("cuTexRefSetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, Flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetAddress_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pdptr, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phArray, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef) { + 
using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phMipmappedArray, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim) { + using FuncPtr = CUresult (CUDAAPI *)(CUaddress_mode *, CUtexref, int); + static auto func_ptr = LoadSymbol("cuTexRefGetAddressMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pam, hTexRef, dim); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(CUfilter_mode *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetFilterMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pfm, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray_format *, int *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetFormat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pFormat, pNumChannels, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(CUfilter_mode *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMipmapFilterMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pfm, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(float *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMipmapLevelBias"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pbias, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(float *, float *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMipmapLevelClamp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(int *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMaxAnisotropy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pmaxAniso, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(float *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetBorderColor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pBorderColor, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(unsigned int *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pFlags, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref *); + static auto func_ptr = LoadSymbol("cuTexRefCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref 
hTexRef) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUsurfref, CUarray, unsigned int); + static auto func_ptr = LoadSymbol("cuSurfRefSetArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hSurfRef, hArray, Flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray *, CUsurfref); + static auto func_ptr = LoadSymbol("cuSurfRefGetArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phArray, hSurfRef); +} + +CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *, const CUDA_TEXTURE_DESC *, const CUDA_RESOURCE_VIEW_DESC *); + static auto func_ptr = LoadSymbol("cuTexObjectCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc); +} + +CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) { + using FuncPtr = CUresult (CUDAAPI *)(CUtexObject); + static auto func_ptr = LoadSymbol("cuTexObjectDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texObject); +} + +CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject) { + using FuncPtr = CUresult (CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject); + static auto func_ptr = LoadSymbol("cuTexObjectGetResourceDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResDesc, texObject); +} + +CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject) { + using FuncPtr = CUresult (CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject); + static auto func_ptr = LoadSymbol("cuTexObjectGetTextureDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexDesc, texObject); +} + +CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) { + using FuncPtr = CUresult (CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject); + static auto func_ptr = LoadSymbol("cuTexObjectGetResourceViewDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResViewDesc, texObject); +} + +CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc) { + using FuncPtr = CUresult (CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *); + static auto func_ptr = LoadSymbol("cuSurfObjectCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pSurfObject, pResDesc); +} + +CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) { + using FuncPtr = CUresult (CUDAAPI *)(CUsurfObject); + static auto func_ptr = LoadSymbol("cuSurfObjectDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(surfObject); +} + +CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject) { + using FuncPtr = CUresult (CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject); + static auto func_ptr = LoadSymbol("cuSurfObjectGetResourceDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(pResDesc, surfObject); +} + +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev) { + using FuncPtr = CUresult (CUDAAPI *)(int *, CUdevice, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceCanAccessPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(canAccessPeer, dev, peerDev); +} + +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext, unsigned int); + static auto func_ptr = LoadSymbol("cuCtxEnablePeerAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(peerContext, Flags); +} + +CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) { + using FuncPtr = CUresult (CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxDisablePeerAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(peerContext); +} + +CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice) { + using FuncPtr = CUresult (CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetP2PAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(value, attrib, srcDevice, dstDevice); +} + +CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphicsResource); + static auto func_ptr = LoadSymbol("cuGraphicsUnregisterResource"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(resource); +} + +CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel) { + using FuncPtr = CUresult (CUDAAPI *)(CUarray *, CUgraphicsResource, unsigned int, unsigned int); + static auto func_ptr = LoadSymbol("cuGraphicsSubResourceGetMappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pArray, resource, arrayIndex, mipLevel); +} + +CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) { + using FuncPtr = CUresult (CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource); + static auto func_ptr = LoadSymbol("cuGraphicsResourceGetMappedMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pMipmappedArray, resource); +} + +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) { + using FuncPtr = CUresult (CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource); + static auto func_ptr = LoadSymbol("cuGraphicsResourceGetMappedPointer_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pDevPtr, pSize, resource); +} + +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags) { + using FuncPtr = CUresult (CUDAAPI *)(CUgraphicsResource, unsigned int); + static auto func_ptr = LoadSymbol("cuGraphicsResourceSetMapFlags_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(resource, flags); +} + +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream) { + using FuncPtr = CUresult (CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream); + static auto func_ptr = LoadSymbol("cuGraphicsMapResources"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(count, resources, hStream); +} + +CUresult CUDAAPI 
cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream) {
+  using FuncPtr = CUresult (CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, hStream);
+}
+
+CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId) {
+  using FuncPtr = CUresult (CUDAAPI *)(const void **, const CUuuid *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ppExportTable, pExportTableId);
+}
+
+CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc) {
+  using FuncPtr = CUresult (CUDAAPI *)(CUmodule *, CUfunction);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetModule");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hmod, hfunc);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc b/tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc
new file mode 100644
index 00000000000..2415f2ef534
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc
@@ -0,0 +1,1501 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
+  using FuncPtr = cudaError_t (CUDARTAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void) {
+  using FuncPtr = cudaError_t (CUDARTAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value) {
+  using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaLimit, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(limit, value);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
+  using FuncPtr = cudaError_t (CUDARTAPI *)(size_t *, enum cudaLimit);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pValue, limit);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
+  using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaFuncCache *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCacheConfig);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
+  using FuncPtr = cudaError_t (CUDARTAPI *)(int *, int *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(leastPriority, greatestPriority);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
+  using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaFuncCache);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(cacheConfig);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) { + using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaSharedMemConfig *); + static auto func_ptr = LoadSymbol("cudaDeviceGetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pConfig); +} + +extern __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) { + using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaSharedMemConfig); + static auto func_ptr = LoadSymbol("cudaDeviceSetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(config); +} + +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *, const char *); + static auto func_ptr = LoadSymbol("cudaDeviceGetByPCIBusId"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device, pciBusId); +} + +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device) { + using FuncPtr = cudaError_t (CUDARTAPI *)(char *, int, int); + static auto func_ptr = LoadSymbol("cudaDeviceGetPCIBusId"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pciBusId, len, device); +} + +extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaIpcGetEventHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, event); +} + +extern __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t); + static auto func_ptr = LoadSymbol("cudaIpcOpenEventHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event, handle); +} + +extern __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaIpcMemHandle_t *, void *); + static auto func_ptr = LoadSymbol("cudaIpcGetMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, devPtr); +} + +extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaIpcOpenMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, handle, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *); + static auto func_ptr = LoadSymbol("cudaIpcCloseMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) { + using FuncPtr = cudaError_t (CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaThreadExit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void) { + using FuncPtr = cudaError_t (CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaThreadSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, 
size_t value) { + using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaLimit, size_t); + static auto func_ptr = LoadSymbol("cudaThreadSetLimit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(limit, value); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) { + using FuncPtr = cudaError_t (CUDARTAPI *)(size_t *, enum cudaLimit); + static auto func_ptr = LoadSymbol("cudaThreadGetLimit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pValue, limit); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) { + using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaFuncCache *); + static auto func_ptr = LoadSymbol("cudaThreadGetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCacheConfig); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) { + using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaFuncCache); + static auto func_ptr = LoadSymbol("cudaThreadSetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(cacheConfig); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void) { + using FuncPtr = cudaError_t (CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaGetLastError"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void) { + using FuncPtr = cudaError_t (CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaPeekAtLastError"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error) { + using FuncPtr = const char* (CUDARTAPI *)(cudaError_t); + static auto func_ptr = LoadSymbol("cudaGetErrorName"); + if (!func_ptr) return "cudaGetErrorName symbol not found."; + return func_ptr(error); +} + +extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error) { + using FuncPtr = const char* (CUDARTAPI *)(cudaError_t); + static auto func_ptr = LoadSymbol("cudaGetErrorString"); + if (!func_ptr) return "cudaGetErrorString symbol not found."; + return func_ptr(error); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *); + static auto func_ptr = LoadSymbol("cudaGetDeviceCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(count); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaDeviceProp *, int); + static auto func_ptr = LoadSymbol("cudaGetDeviceProperties"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(prop, device); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *, enum cudaDeviceAttr, int); + static auto func_ptr = LoadSymbol("cudaDeviceGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(value, attr, device); +} + +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, int device, int flags) { + using FuncPtr = cudaError_t (CUDARTAPI 
*)(void *, int, int); + static auto func_ptr = LoadSymbol("cudaDeviceGetNvSciSyncAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(nvSciSyncAttrList, device, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int); + static auto func_ptr = LoadSymbol("cudaDeviceGetP2PAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(value, attr, srcDevice, dstDevice); +} + +extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *, const struct cudaDeviceProp *); + static auto func_ptr = LoadSymbol("cudaChooseDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device, prop); +} + +extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int); + static auto func_ptr = LoadSymbol("cudaSetDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *); + static auto func_ptr = LoadSymbol("cudaGetDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device); +} + +extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *, int); + static auto func_ptr = LoadSymbol("cudaSetValidDevices"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device_arr, len); +} + +extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags ) { + using FuncPtr = cudaError_t (CUDARTAPI *)(unsigned int); + static auto func_ptr = LoadSymbol("cudaSetDeviceFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags ) { + using FuncPtr = cudaError_t (CUDARTAPI *)(unsigned int *); + static auto func_ptr = LoadSymbol("cudaGetDeviceFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t *); + static auto func_ptr = LoadSymbol("cudaStreamCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pStream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t *, unsigned int); + static auto func_ptr = LoadSymbol("cudaStreamCreateWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pStream, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t *, unsigned int, int); + static auto func_ptr = LoadSymbol("cudaStreamCreateWithPriority"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pStream, flags, priority); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority) { + using FuncPtr = cudaError_t (CUDARTAPI 
*)(cudaStream_t, int *); + static auto func_ptr = LoadSymbol("cudaStreamGetPriority"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, priority); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, unsigned int *); + static auto func_ptr = LoadSymbol("cudaStreamGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void) { + using FuncPtr = cudaError_t (CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaCtxResetPersistingL2Cache"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaStreamCopyAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetAttribute( + cudaStream_t hStream, enum cudaStreamAttrID attr, + union cudaStreamAttrValue *value_out) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, enum cudaStreamAttrID, union cudaStreamAttrValue *); + static auto func_ptr = LoadSymbol("cudaStreamGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, attr, value_out); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamSetAttribute( + cudaStream_t hStream, enum cudaStreamAttrID attr, + const union cudaStreamAttrValue *value) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, enum cudaStreamAttrID, const union cudaStreamAttrValue *); + static auto func_ptr = LoadSymbol("cudaStreamSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, attr, value); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t); + static auto func_ptr = LoadSymbol("cudaStreamDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaStreamWaitEvent"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, event, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, + cudaStreamCallback_t callback, void *userData, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t, void *, unsigned int); + static auto func_ptr = LoadSymbol("cudaStreamAddCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, callback, userData, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t); + static auto func_ptr = LoadSymbol("cudaStreamSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) { + using FuncPtr = cudaError_t 
(CUDARTAPI *)(cudaStream_t); + static auto func_ptr = LoadSymbol("cudaStreamQuery"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags __dv(cudaMemAttachSingle)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaStreamAttachMemAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, devPtr, length, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode); + static auto func_ptr = LoadSymbol("cudaStreamBeginCapture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, mode); +} + +extern __host__ cudaError_t CUDARTAPI cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) { + using FuncPtr = cudaError_t (CUDARTAPI *)(enum cudaStreamCaptureMode *); + static auto func_ptr = LoadSymbol("cudaThreadExchangeStreamCaptureMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mode); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, cudaGraph_t *); + static auto func_ptr = LoadSymbol("cudaStreamEndCapture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, pGraph); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *); + static auto func_ptr = LoadSymbol("cudaStreamIsCapturing"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, pCaptureStatus); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus, unsigned long long *pId) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *); + static auto func_ptr = LoadSymbol("cudaStreamGetCaptureInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, pCaptureStatus, pId); +} + +extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaEvent_t *); + static auto func_ptr = LoadSymbol("cudaEventCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaEvent_t *, unsigned int); + static auto func_ptr = LoadSymbol("cudaEventCreateWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaEvent_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaEventRecord"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) { + using FuncPtr = cudaError_t (CUDARTAPI 
*)(cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaEventQuery"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event); +} + +extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaEventSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaEventDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event); +} + +extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end) { + using FuncPtr = cudaError_t (CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaEventElapsedTime"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ms, start, end); +} + +extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(cudaExternalMemory_t *extMem_out, const struct cudaExternalMemoryHandleDesc *memHandleDesc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *); + static auto func_ptr = LoadSymbol("cudaImportExternalMemory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extMem_out, memHandleDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(void **devPtr, cudaExternalMemory_t extMem, const struct cudaExternalMemoryBufferDesc *bufferDesc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, cudaExternalMemory_t, const struct cudaExternalMemoryBufferDesc *); + static auto func_ptr = LoadSymbol("cudaExternalMemoryGetMappedBuffer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, extMem, bufferDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem, const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaMipmappedArray_t *, cudaExternalMemory_t, const struct cudaExternalMemoryMipmappedArrayDesc *); + static auto func_ptr = LoadSymbol("cudaExternalMemoryGetMappedMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmap, extMem, mipmapDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalMemory(cudaExternalMemory_t extMem) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaExternalMemory_t); + static auto func_ptr = LoadSymbol("cudaDestroyExternalMemory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extMem); +} + +extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(cudaExternalSemaphore_t *extSem_out, const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaExternalSemaphore_t *, const struct cudaExternalSemaphoreHandleDesc *); + static auto func_ptr = LoadSymbol("cudaImportExternalSemaphore"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSem_out, semHandleDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI 
*)(const cudaExternalSemaphore_t *, const struct cudaExternalSemaphoreSignalParams *, unsigned int, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaSignalExternalSemaphoresAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSemArray, paramsArray, numExtSems, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const cudaExternalSemaphore_t *, const struct cudaExternalSemaphoreWaitParams *, unsigned int, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaWaitExternalSemaphoresAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSemArray, paramsArray, numExtSems, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaExternalSemaphore_t); + static auto func_ptr = LoadSymbol("cudaDestroyExternalSemaphore"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSem); +} + +extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, dim3, dim3, void **, size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaLaunchKernel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, gridDim, blockDim, args, sharedMem, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, dim3, dim3, void **, size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaLaunchCooperativeKernel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, gridDim, blockDim, args, sharedMem, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaLaunchParams *, unsigned int, unsigned int); + static auto func_ptr = LoadSymbol("cudaLaunchCooperativeKernelMultiDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(launchParamsList, numDevices, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, enum cudaFuncCache); + static auto func_ptr = LoadSymbol("cudaFuncSetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, cacheConfig); +} + +extern __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, enum cudaSharedMemConfig); + static auto func_ptr = LoadSymbol("cudaFuncSetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, config); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaFuncAttributes *, const void *); + static auto func_ptr = 
LoadSymbol("cudaFuncGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(attr, func); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, enum cudaFuncAttribute, int); + static auto func_ptr = LoadSymbol("cudaFuncSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, attr, value); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d) { + using FuncPtr = cudaError_t (CUDARTAPI *)(double *); + static auto func_ptr = LoadSymbol("cudaSetDoubleForDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(d); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d) { + using FuncPtr = cudaError_t (CUDARTAPI *)(double *); + static auto func_ptr = LoadSymbol("cudaSetDoubleForHost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(d); +} + +extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *); + static auto func_ptr = LoadSymbol("cudaLaunchHostFunc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, fn, userData); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *, const void *, int, size_t); + static auto func_ptr = LoadSymbol("cudaOccupancyMaxActiveBlocksPerMultiprocessor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numBlocks, func, blockSize, dynamicSMemSize); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, const void *func, int numBlocks, int blockSize) { + using FuncPtr = cudaError_t (CUDARTAPI *)(size_t *, const void *, int, int); + static auto func_ptr = LoadSymbol("cudaOccupancyAvailableDynamicSMemPerBlock"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dynamicSmemSize, func, numBlocks, blockSize); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *, const void *, int, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaMallocManaged"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, size, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, size_t); + static auto func_ptr = LoadSymbol("cudaMalloc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, size); +} + +extern 
__host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, size_t); + static auto func_ptr = LoadSymbol("cudaMallocHost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size); +} + +extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, size_t *, size_t, size_t); + static auto func_ptr = LoadSymbol("cudaMallocPitch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, pitch, width, height); +} + +extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t *, const struct cudaChannelFormatDesc *, size_t, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaMallocArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(array, desc, width, height, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *); + static auto func_ptr = LoadSymbol("cudaFree"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr); +} + +extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *); + static auto func_ptr = LoadSymbol("cudaFreeHost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr); +} + +extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t); + static auto func_ptr = LoadSymbol("cudaFreeArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(array); +} + +extern __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaMipmappedArray_t); + static auto func_ptr = LoadSymbol("cudaFreeMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmappedArray); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaHostAlloc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHost, size, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaHostRegister"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *); + static auto func_ptr = LoadSymbol("cudaHostUnregister"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, void *, unsigned int); + static auto func_ptr = LoadSymbol("cudaHostGetDevicePointer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pDevice, pHost, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int 
*pFlags, void *pHost) { + using FuncPtr = cudaError_t (CUDARTAPI *)(unsigned int *, void *); + static auto func_ptr = LoadSymbol("cudaHostGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pFlags, pHost); +} + +extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent); + static auto func_ptr = LoadSymbol("cudaMalloc3D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pitchedDevPtr, extent); +} + +extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t *, const struct cudaChannelFormatDesc *, struct cudaExtent, unsigned int); + static auto func_ptr = LoadSymbol("cudaMalloc3DArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(array, desc, extent, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *, struct cudaExtent, unsigned int, unsigned int); + static auto func_ptr = LoadSymbol("cudaMallocMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmappedArray, desc, extent, numLevels, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t *, cudaMipmappedArray_const_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaGetMipmappedArrayLevel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(levelArray, mipmappedArray, level); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaMemcpy3D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *); + static auto func_ptr = LoadSymbol("cudaMemcpy3DPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy3DAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy3DPeerAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total) { + using FuncPtr = cudaError_t (CUDARTAPI 
*)(size_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaMemGetInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(free, total); +} + +extern __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaChannelFormatDesc *, struct cudaExtent *, unsigned int *, cudaArray_t); + static auto func_ptr = LoadSymbol("cudaArrayGetInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(desc, extent, flags, array); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, const void *, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, count, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, int, const void *, int, size_t); + static auto func_ptr = LoadSymbol("cudaMemcpyPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dstDevice, src, srcDevice, count); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, size_t, const void *, size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy2D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dpitch, src, spitch, width, height, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *, size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy2DToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t, size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy2DFromArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t, size_t, size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy2DArrayToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind); +} + +extern __host__ cudaError_t CUDARTAPI 
cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, const void *, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyToSymbol"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(symbol, src, count, offset, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, const void *, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyFromSymbol"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, symbol, count, offset, kind); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, const void *, size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, count, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, int, const void *, int, size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyPeerAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dstDevice, src, srcDevice, count, stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, size_t, const void *, size_t, size_t, size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy2DAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *, size_t, size_t, size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy2DToArrayAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t, size_t, size_t, size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy2DFromArrayAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void 
*symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, const void *, size_t, size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyToSymbolAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(symbol, src, count, offset, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, const void *, size_t, size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyFromSymbolAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, symbol, count, offset, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, int, size_t); + static auto func_ptr = LoadSymbol("cudaMemset"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, value, count); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, size_t, int, size_t, size_t); + static auto func_ptr = LoadSymbol("cudaMemset2D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, pitch, value, width, height); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent); + static auto func_ptr = LoadSymbol("cudaMemset3D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pitchedDevPtr, value, extent); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, int, size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemsetAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, value, count, stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, size_t, int, size_t, size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemset2DAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, pitch, value, width, height, stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemset3DAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pitchedDevPtr, value, extent, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, const void *); + static auto func_ptr = LoadSymbol("cudaGetSymbolAddress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, symbol); +} + 
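Every wrapper in this generated cuda_runtime_*.inc stub follows the same lazy-binding shape: declare the function-pointer type, resolve the symbol once through the LoadSymbol helper (templated on that pointer type), cache the result in a function-local static, and either forward the call or return a symbol-not-found error when the CUDA runtime cannot be resolved. Below is a minimal, self-contained sketch of that pattern on plain dlopen/dlsym; the library name, error codes, and the cudaFakeQuery wrapper are illustrative placeholders, not TensorFlow's actual helpers.

// Sketch only: the lazy-binding pattern used by the generated wrappers,
// rebuilt on POSIX dlopen/dlsym with placeholder names.
#include <dlfcn.h>

using StubError = int;
constexpr StubError kStubSymbolNotFound = 1;

// Resolve a symbol from the (already loaded) CUDA runtime library,
// templated on the function-pointer type expected by the caller.
template <typename T>
T LoadSymbol(const char* symbol_name) {
  void* handle = dlopen("libcudart.so", RTLD_LAZY | RTLD_NOLOAD);
  if (!handle) return nullptr;
  return reinterpret_cast<T>(dlsym(handle, symbol_name));
}

StubError GetSymbolNotFoundError() { return kStubSymbolNotFound; }

// Same shape as every wrapper above: look the symbol up once, cache it in a
// function-local static, then forward the call or fail gracefully.
extern "C" StubError cudaFakeQuery(int stream) {
  using FuncPtr = StubError (*)(int);
  static auto func_ptr = LoadSymbol<FuncPtr>("cudaFakeQuery");
  if (!func_ptr) return GetSymbolNotFoundError();
  return func_ptr(stream);
}

In the real stubs, LoadSymbol and GetSymbolNotFoundError are expected to be supplied by the *_stub.cc translation unit that includes this .inc file, which keeps the generated wrappers free of any loader details.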
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol) { + using FuncPtr = cudaError_t (CUDARTAPI *)(size_t *, const void *); + static auto func_ptr = LoadSymbol("cudaGetSymbolSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(size, symbol); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, size_t, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemPrefetchAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, count, dstDevice, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void *, size_t, enum cudaMemoryAdvise, int); + static auto func_ptr = LoadSymbol("cudaMemAdvise"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, count, advice, device); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, size_t, enum cudaMemRangeAttribute, const void *, size_t); + static auto func_ptr = LoadSymbol("cudaMemRangeGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, dataSize, attribute, devPtr, count); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *, size_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cudaMemRangeGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffset, hOffset, src, count, kind); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyFromArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, wOffset, hOffset, count, kind); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t, size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyArrayToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, count, kind); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyToArrayAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffset, hOffset, src, count, kind, stream); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void *, cudaArray_const_t, size_t, size_t, size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyFromArrayAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, wOffset, hOffset, count, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaPointerAttributes *, const void *); + static auto func_ptr = LoadSymbol("cudaPointerGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(attributes, ptr); +} + +extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *, int, int); + static auto func_ptr = LoadSymbol("cudaDeviceCanAccessPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(canAccessPeer, device, peerDevice); +} + +extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int, unsigned int); + static auto func_ptr = LoadSymbol("cudaDeviceEnablePeerAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(peerDevice, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int); + static auto func_ptr = LoadSymbol("cudaDeviceDisablePeerAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(peerDevice); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphicsResource_t); + static auto func_ptr = LoadSymbol("cudaGraphicsUnregisterResource"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(resource); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphicsResource_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaGraphicsResourceSetMapFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(resource, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaGraphicsMapResources"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(count, resources, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaGraphicsUnmapResources"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(count, resources, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource) { + using FuncPtr = cudaError_t (CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t); + static auto func_ptr = LoadSymbol("cudaGraphicsResourceGetMappedPointer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, size, resource); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int); + static auto func_ptr = LoadSymbol("cudaGraphicsSubResourceGetMappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(array, resource, arrayIndex, mipLevel); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t); + static auto func_ptr = LoadSymbol("cudaGraphicsResourceGetMappedMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmappedArray, resource); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) { + using FuncPtr = cudaError_t (CUDARTAPI *)(size_t *, const struct textureReference *, const void *, const struct cudaChannelFormatDesc *, size_t); + static auto func_ptr = LoadSymbol("cudaBindTexture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(offset, texref, devPtr, desc, size); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch) { + using FuncPtr = cudaError_t (CUDARTAPI *)(size_t *, const struct textureReference *, const void *, const struct cudaChannelFormatDesc *, size_t, size_t, size_t); + static auto func_ptr = LoadSymbol("cudaBindTexture2D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(offset, texref, devPtr, desc, width, height, pitch); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct textureReference *, cudaArray_const_t, const struct cudaChannelFormatDesc *); + static auto func_ptr = LoadSymbol("cudaBindTextureToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texref, array, desc); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct 
cudaChannelFormatDesc *desc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct textureReference *, cudaMipmappedArray_const_t, const struct cudaChannelFormatDesc *); + static auto func_ptr = LoadSymbol("cudaBindTextureToMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texref, mipmappedArray, desc); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct textureReference *); + static auto func_ptr = LoadSymbol("cudaUnbindTexture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texref); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref) { + using FuncPtr = cudaError_t (CUDARTAPI *)(size_t *, const struct textureReference *); + static auto func_ptr = LoadSymbol("cudaGetTextureAlignmentOffset"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(offset, texref); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct textureReference **, const void *); + static auto func_ptr = LoadSymbol("cudaGetTextureReference"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texref, symbol); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct surfaceReference *, cudaArray_const_t, const struct cudaChannelFormatDesc *); + static auto func_ptr = LoadSymbol("cudaBindSurfaceToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(surfref, array, desc); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const struct surfaceReference **, const void *); + static auto func_ptr = LoadSymbol("cudaGetSurfaceReference"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(surfref, symbol); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaChannelFormatDesc *, cudaArray_const_t); + static auto func_ptr = LoadSymbol("cudaGetChannelDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(desc, array); +} + +extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaTextureObject_t *, const struct cudaResourceDesc *, const struct cudaTextureDesc *, const struct cudaResourceViewDesc *); + static auto func_ptr = LoadSymbol("cudaCreateTextureObject"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaTextureObject_t); + static auto func_ptr = LoadSymbol("cudaDestroyTextureObject"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(texObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t); + static auto func_ptr = LoadSymbol("cudaGetTextureObjectResourceDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResDesc, texObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t); + static auto func_ptr = LoadSymbol("cudaGetTextureObjectTextureDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexDesc, texObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaResourceViewDesc *, cudaTextureObject_t); + static auto func_ptr = LoadSymbol("cudaGetTextureObjectResourceViewDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResViewDesc, texObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaSurfaceObject_t *, const struct cudaResourceDesc *); + static auto func_ptr = LoadSymbol("cudaCreateSurfaceObject"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pSurfObject, pResDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaSurfaceObject_t); + static auto func_ptr = LoadSymbol("cudaDestroySurfaceObject"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(surfObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) { + using FuncPtr = cudaError_t (CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t); + static auto func_ptr = LoadSymbol("cudaGetSurfaceObjectResourceDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResDesc, surfObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *); + static auto func_ptr = LoadSymbol("cudaDriverGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(driverVersion); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion) { + using FuncPtr = cudaError_t (CUDARTAPI *)(int *); + static auto func_ptr = LoadSymbol("cudaRuntimeGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(runtimeVersion); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph, unsigned int flags) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraph_t *, unsigned int); + static auto func_ptr = LoadSymbol("cudaGraphCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraph, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaKernelNodeParams *pNodeParams) { + using FuncPtr = 
cudaError_t (CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, const struct cudaKernelNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphAddKernelNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, const struct cudaKernelNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeCopyAttributes( + cudaGraphNode_t hSrc, + cudaGraphNode_t hDst) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeCopyAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hSrc, hDst); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute( + cudaGraphNode_t hNode, + enum cudaKernelNodeAttrID attr, + union cudaKernelNodeAttrValue *value_out) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, enum cudaKernelNodeAttrID, union cudaKernelNodeAttrValue *); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, attr, value_out); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute( + cudaGraphNode_t hNode, + enum cudaKernelNodeAttrID attr, + const union cudaKernelNodeAttrValue *value) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, enum cudaKernelNodeAttrID, const union cudaKernelNodeAttrValue *); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, attr, value); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms *pCopyParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, const struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaGraphAddMemcpyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, pCopyParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaGraphMemcpyNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI 
*)(cudaGraphNode_t, const struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaGraphMemcpyNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemsetParams *pMemsetParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, const struct cudaMemsetParams *); + static auto func_ptr = LoadSymbol("cudaGraphAddMemsetNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, pMemsetParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *); + static auto func_ptr = LoadSymbol("cudaGraphMemsetNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, const struct cudaMemsetParams *); + static auto func_ptr = LoadSymbol("cudaGraphMemsetNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaHostNodeParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, const struct cudaHostNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphAddHostNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphHostNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, const struct cudaHostNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphHostNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaGraph_t childGraph) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t, cudaGraph_t); + static auto func_ptr = LoadSymbol("cudaGraphAddChildGraphNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, childGraph); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) { + using FuncPtr 
= cudaError_t (CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *); + static auto func_ptr = LoadSymbol("cudaGraphChildGraphNodeGetGraph"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pGraph); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, const cudaGraphNode_t *, size_t); + static auto func_ptr = LoadSymbol("cudaGraphAddEmptyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraph_t *, cudaGraph_t); + static auto func_ptr = LoadSymbol("cudaGraphClone"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphClone, originalGraph); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t); + static auto func_ptr = LoadSymbol("cudaGraphNodeFindInClone"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pNode, originalNode, clonedGraph); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *); + static auto func_ptr = LoadSymbol("cudaGraphNodeGetType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pType); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t *nodes, size_t *numNodes) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphGetNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, nodes, numNodes); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphGetRootNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, pRootNodes, pNumRootNodes); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, size_t *numEdges) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphGetEdges"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, from, to, numEdges); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, size_t *pNumDependencies) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphNodeGetDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pDependencies, pNumDependencies); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, size_t *pNumDependentNodes) { + using FuncPtr = 
cudaError_t (CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphNodeGetDependentNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pDependentNodes, pNumDependentNodes); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *, const cudaGraphNode_t *, size_t); + static auto func_ptr = LoadSymbol("cudaGraphAddDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, from, to, numDependencies); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *, const cudaGraphNode_t *, size_t); + static auto func_ptr = LoadSymbol("cudaGraphRemoveDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, from, to, numDependencies); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphDestroyNode(cudaGraphNode_t node) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphNode_t); + static auto func_ptr = LoadSymbol("cudaGraphDestroyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode, char *pLogBuffer, size_t bufferSize) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t, cudaGraphNode_t *, char *, size_t); + static auto func_ptr = LoadSymbol("cudaGraphInstantiate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, const struct cudaKernelNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphExecKernelNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, const struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaGraphExecMemcpyNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, const struct cudaMemsetParams *); + static auto func_ptr = LoadSymbol("cudaGraphExecMemsetNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphExec_t, 
cudaGraphNode_t, const struct cudaHostNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphExecHostNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphNode_t *hErrorNode_out, enum cudaGraphExecUpdateResult *updateResult_out) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphExec_t, cudaGraph_t, cudaGraphNode_t *, enum cudaGraphExecUpdateResult *); + static auto func_ptr = LoadSymbol("cudaGraphExecUpdate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphExec_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaGraphLaunch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graphExec, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecDestroy(cudaGraphExec_t graphExec) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraphExec_t); + static auto func_ptr = LoadSymbol("cudaGraphExecDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graphExec); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) { + using FuncPtr = cudaError_t (CUDARTAPI *)(cudaGraph_t); + static auto func_ptr = LoadSymbol("cudaGraphDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId) { + using FuncPtr = cudaError_t (CUDARTAPI *)(const void **, const cudaUUID_t *); + static auto func_ptr = LoadSymbol("cudaGetExportTable"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ppExportTable, pExportTableId); +} + +} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc b/tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc new file mode 100644 index 00000000000..4177f0fd7cd --- /dev/null +++ b/tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc @@ -0,0 +1,5953 @@ +// Auto-generated, do not edit. 
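This new cusolver_dense_11_0.inc applies the same generated-wrapper pattern to the cuSOLVER dense API for CUDA 11.0. As a hedged sketch of how such version-specific .inc files are typically consumed, a stub translation unit can select one of them with a preprocessor check on CUDA_VERSION; the macro test, include paths, and file names below are illustrative assumptions, not the exact TensorFlow build logic.

// Sketch only: selecting a version-specific wrapper set at build time.
#include "third_party/gpus/cuda/include/cuda.h"  // defines CUDA_VERSION

#if CUDA_VERSION < 11000
// Pre-11.0 toolkits keep an older wrapper set (hypothetical file name).
#include "tensorflow/stream_executor/cuda/cusolver_dense_10_2.inc"
#else
// CUDA 11.0 and newer pick up the wrappers added in this patch.
#include "tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc"
#endif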
+ +extern "C" { +cusolverStatus_t CUSOLVERAPI cusolverGetProperty( + libraryPropertyType type, + int *value) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(libraryPropertyType, int *); + static auto func_ptr = LoadSymbol("cusolverGetProperty"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(type, value); +} + +cusolverStatus_t CUSOLVERAPI cusolverGetVersion( + int *version) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(int *); + static auto func_ptr = LoadSymbol("cusolverGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(version); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCreate(cusolverDnHandle_t *handle) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t *); + static auto func_ptr = LoadSymbol("cusolverDnCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDestroy(cusolverDnHandle_t handle) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t); + static auto func_ptr = LoadSymbol("cusolverDnDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSetStream (cusolverDnHandle_t handle, cudaStream_t streamId) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cusolverDnSetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGetStream(cusolverDnHandle_t handle, cudaStream_t *streamId) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t *); + static auto func_ptr = LoadSymbol("cusolverDnGetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsCreate( + cusolverDnIRSParams_t* params_ptr ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params_ptr); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsDestroy( + cusolverDnIRSParams_t params ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsSetRefinementSolver( + cusolverDnIRSParams_t params, + cusolverIRSRefinement_t refinement_solver ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolverIRSRefinement_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetRefinementSolver"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, refinement_solver); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsSetSolverMainPrecision( + cusolverDnIRSParams_t params, + cusolverPrecType_t solver_main_precision ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolverPrecType_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetSolverMainPrecision"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, solver_main_precision); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsSetSolverLowestPrecision( + cusolverDnIRSParams_t params, + cusolverPrecType_t solver_lowest_precision ) { + using 
FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolverPrecType_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetSolverLowestPrecision"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, solver_lowest_precision); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsSetSolverPrecisions( + cusolverDnIRSParams_t params, + cusolverPrecType_t solver_main_precision, + cusolverPrecType_t solver_lowest_precision ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolverPrecType_t, cusolverPrecType_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetSolverPrecisions"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, solver_main_precision, solver_lowest_precision); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsSetTol( + cusolverDnIRSParams_t params, + double val ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, double); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetTol"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, val); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsSetTolInner( + cusolverDnIRSParams_t params, + double val ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, double); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetTolInner"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, val); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsSetMaxIters( + cusolverDnIRSParams_t params, + cusolver_int_t maxiters ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetMaxIters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, maxiters); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsSetMaxItersInner( + cusolverDnIRSParams_t params, + cusolver_int_t maxiters_inner ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetMaxItersInner"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, maxiters_inner); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSParamsGetMaxIters( + cusolverDnIRSParams_t params, + cusolver_int_t *maxiters ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsGetMaxIters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, maxiters); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSParamsEnableFallback( + cusolverDnIRSParams_t params ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsEnableFallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSParamsDisableFallback( + cusolverDnIRSParams_t params ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSParams_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsDisableFallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSInfosDestroy( + cusolverDnIRSInfos_t infos ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSInfos_t); + static auto func_ptr = 
LoadSymbol("cusolverDnIRSInfosDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSInfosCreate( + cusolverDnIRSInfos_t* infos_ptr ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSInfos_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos_ptr); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSInfosGetNiters( + cusolverDnIRSInfos_t infos, + cusolver_int_t *niters ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosGetNiters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos, niters); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSInfosGetOuterNiters( + cusolverDnIRSInfos_t infos, + cusolver_int_t *outer_niters ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosGetOuterNiters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos, outer_niters); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSInfosRequestResidual( + cusolverDnIRSInfos_t infos ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSInfos_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosRequestResidual"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSInfosGetResidualHistory( + cusolverDnIRSInfos_t infos, + void **residual_history ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSInfos_t, void **); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosGetResidualHistory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos, residual_history); +} + +cusolverStatus_t CUSOLVERAPI + cusolverDnIRSInfosGetMaxIters( + cusolverDnIRSInfos_t infos, + cusolver_int_t *maxiters ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosGetMaxIters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos, maxiters); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZZgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = 
cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZCgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZKgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZEgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZEgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZYgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZYgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, 
cuComplex *, cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCCgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCEgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCEgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCKgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCYgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCYgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDDgesv"); + if (!func_ptr) 
return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDSgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDHgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDBgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDBgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDXgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDXgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + float *dA, 
cusolver_int_t ldda, + cusolver_int_t *dipiv, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSSgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSHgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSBgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSBgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSXgesv( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSXgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, 
cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZZgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZCgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZKgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZEgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZEgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZYgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZYgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + 
+cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCCgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCKgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCEgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCEgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCYgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCYgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t 
*); + static auto func_ptr = LoadSymbol("cusolverDnDDgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDSgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDHgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDBgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDBgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDXgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDXgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI 
*)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSSgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSHgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSBgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSBgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSXgesv_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t n, cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + cusolver_int_t *dipiv, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSXgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZZgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZZgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZCgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + 
cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZCgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZKgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZKgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZEgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZEgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZYgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZYgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCCgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cuComplex *dB, cusolver_int_t 
lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCCgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCKgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCKgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCEgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCEgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCYgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCYgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDDgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double 
*, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDDgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDSgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDSgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDHgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDHgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDBgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDBgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDXgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDXgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, 
iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSSgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSSgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSHgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSHgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSBgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSBgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSXgels( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSXgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZZgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + 
using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZZgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZCgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZCgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZKgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZKgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZEgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZEgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZYgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, + cuDoubleComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZYgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, 
lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCCgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCCgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCKgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCKgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCEgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCEgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCYgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, + cuComplex *dB, cusolver_int_t lddb, + cuComplex *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCYgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDDgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, 
size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDDgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDSgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDSgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDHgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDHgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDBgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDBgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDXgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, + double *dB, cusolver_int_t lddb, + double *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, double *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDXgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSSgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, 
cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSSgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSHgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSHgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSBgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSBgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSXgels_bufferSize( + cusolverDnHandle_t handle, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + float *dA, cusolver_int_t ldda, + float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, + void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSXgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv( + cusolverDnHandle_t handle, + cusolverDnIRSParams_t gesv_irs_params, + cusolverDnIRSInfos_t gesv_irs_infos, + cusolver_int_t n, cusolver_int_t nrhs, + void *dA, cusolver_int_t ldda, + void *dB, cusolver_int_t lddb, + void *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *niters, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnIRSParams_t, cusolverDnIRSInfos_t, cusolver_int_t, cusolver_int_t, void *, cusolver_int_t, void *, cusolver_int_t, void *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSXgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, gesv_irs_params, gesv_irs_infos, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, niters, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv_bufferSize( + cusolverDnHandle_t handle, + cusolverDnIRSParams_t params, + 
cusolver_int_t n, cusolver_int_t nrhs, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnIRSParams_t, cusolver_int_t, cusolver_int_t, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSXgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, n, nrhs, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgels( + cusolverDnHandle_t handle, + cusolverDnIRSParams_t gels_irs_params, + cusolverDnIRSInfos_t gels_irs_infos, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + void *dA, cusolver_int_t ldda, + void *dB, cusolver_int_t lddb, + void *dX, cusolver_int_t lddx, + void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *niters, + cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnIRSParams_t, cusolverDnIRSInfos_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, void *, cusolver_int_t, void *, cusolver_int_t, void *, cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSXgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, gels_irs_params, gels_irs_infos, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, lwork_bytes, niters, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgels_bufferSize( + cusolverDnHandle_t handle, + cusolverDnIRSParams_t params, + cusolver_int_t m, + cusolver_int_t n, + cusolver_int_t nrhs, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnIRSParams_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSXgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, nrhs, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + int *Lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *); 
+ static auto func_ptr = LoadSymbol("cusolverDnZpotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *Workspace, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *Workspace, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + cuComplex *Workspace, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *Workspace, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrs( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const float *A, + int lda, + float *B, + int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrs( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const double *A, + int lda, + double *B, + int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotrs( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const cuComplex *A, + int lda, + cuComplex *B, + int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, 
int, const cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotrs( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const cuDoubleComplex *A, + int lda, + cuDoubleComplex *B, + int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrfBatched( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *Aarray[], + int lda, + int *infoArray, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float * [], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnSpotrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrfBatched( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *Aarray[], + int lda, + int *infoArray, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double * [], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnDpotrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotrfBatched( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *Aarray[], + int lda, + int *infoArray, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex * [], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnCpotrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotrfBatched( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *Aarray[], + int lda, + int *infoArray, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex * [], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnZpotrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrsBatched( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, /* only support rhs = 1*/ + float *A[], + int lda, + float *B[], + int ldb, + int *d_info, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, float * [], int, float * [], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnSpotrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrsBatched( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, /* only support rhs = 1*/ + 
double *A[], + int lda, + double *B[], + int ldb, + int *d_info, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, double * [], int, double * [], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnDpotrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotrsBatched( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, /* only support rhs = 1*/ + cuComplex *A[], + int lda, + cuComplex *B[], + int ldb, + int *d_info, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, cuComplex * [], int, cuComplex * [], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnCpotrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotrsBatched( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, /* only support rhs = 1*/ + cuDoubleComplex *A[], + int lda, + cuDoubleComplex *B[], + int ldb, + int *d_info, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, cuDoubleComplex * [], int, cuDoubleComplex * [], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnZpotrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotri( + cusolverDnHandle_t 
handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + cuComplex *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnStrtri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, + int n, + float *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnStrtri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, + int n, + double *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDtrtri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, + int n, + cuComplex *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCtrtri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri_bufferSize( + cusolverDnHandle_t handle, + 
cublasFillMode_t uplo, + cublasDiagType_t diag, + int n, + cuDoubleComplex *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZtrtri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnStrtri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, + int n, + float *A, + int lda, + float *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnStrtri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, + int n, + double *A, + int lda, + double *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDtrtri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, + int n, + cuComplex *A, + int lda, + cuComplex *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCtrtri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZtrtri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSlauum_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSlauum_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDlauum_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDlauum_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + 
+cusolverStatus_t CUSOLVERAPI cusolverDnClauum_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnClauum_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZlauum_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZlauum_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSlauum( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSlauum"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDlauum( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDlauum"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnClauum( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + cuComplex *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnClauum"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZlauum( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZlauum"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + float *A, + int lda, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + double *A, + int lda, + int *Lwork ) { + using FuncPtr = cusolverStatus_t 
(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + cuComplex *A, + int lda, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + cuDoubleComplex *A, + int lda, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf( + cusolverDnHandle_t handle, + int m, + int n, + float *A, + int lda, + float *Workspace, + int *devIpiv, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, float *, int, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf( + cusolverDnHandle_t handle, + int m, + int n, + double *A, + int lda, + double *Workspace, + int *devIpiv, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *, int, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf( + cusolverDnHandle_t handle, + int m, + int n, + cuComplex *A, + int lda, + cuComplex *Workspace, + int *devIpiv, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *, int, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf( + cusolverDnHandle_t handle, + int m, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *Workspace, + int *devIpiv, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSlaswp( + cusolverDnHandle_t handle, + int n, + float *A, + int lda, + int k1, + int k2, + const int *devIpiv, + int incx) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, float *, int, int, int, const int *, int); + static auto func_ptr = LoadSymbol("cusolverDnSlaswp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx); +} + +cusolverStatus_t CUSOLVERAPI 
cusolverDnDlaswp( + cusolverDnHandle_t handle, + int n, + double *A, + int lda, + int k1, + int k2, + const int *devIpiv, + int incx) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, double *, int, int, int, const int *, int); + static auto func_ptr = LoadSymbol("cusolverDnDlaswp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnClaswp( + cusolverDnHandle_t handle, + int n, + cuComplex *A, + int lda, + int k1, + int k2, + const int *devIpiv, + int incx) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, cuComplex *, int, int, int, const int *, int); + static auto func_ptr = LoadSymbol("cusolverDnClaswp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZlaswp( + cusolverDnHandle_t handle, + int n, + cuDoubleComplex *A, + int lda, + int k1, + int k2, + const int *devIpiv, + int incx) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, cuDoubleComplex *, int, int, int, const int *, int); + static auto func_ptr = LoadSymbol("cusolverDnZlaswp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgetrs( + cusolverDnHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const float *A, + int lda, + const int *devIpiv, + float *B, + int ldb, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasOperation_t, int, int, const float *, int, const int *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgetrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgetrs( + cusolverDnHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const double *A, + int lda, + const int *devIpiv, + double *B, + int ldb, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasOperation_t, int, int, const double *, int, const int *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgetrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgetrs( + cusolverDnHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuComplex *A, + int lda, + const int *devIpiv, + cuComplex *B, + int ldb, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasOperation_t, int, int, const cuComplex *, int, const int *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgetrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgetrs( + cusolverDnHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuDoubleComplex *A, + int lda, + const int *devIpiv, + cuDoubleComplex *B, + int ldb, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgetrs"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + float *A, + int lda, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + double *A, + int lda, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + cuComplex *A, + int lda, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + cuDoubleComplex *A, + int lda, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf( + cusolverDnHandle_t handle, + int m, + int n, + float *A, + int lda, + float *TAU, + float *Workspace, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, float *, int, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf( + cusolverDnHandle_t handle, + int m, + int n, + double *A, + int lda, + double *TAU, + double *Workspace, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *, int, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf( + cusolverDnHandle_t handle, + int m, + int n, + cuComplex *A, + int lda, + cuComplex *TAU, + cuComplex *Workspace, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *, int, cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf( + cusolverDnHandle_t handle, + int m, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *TAU, + cuDoubleComplex *Workspace, + int Lwork, + int 
*devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int k, + const float *A, + int lda, + const float *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, const float *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int k, + const double *A, + int lda, + const double *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, const double *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungqr_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int k, + const cuComplex *A, + int lda, + const cuComplex *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, const cuComplex *, int, const cuComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungqr_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int k, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr( + cusolverDnHandle_t handle, + int m, + int n, + int k, + float *A, + int lda, + const float *tau, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, float *, int, const float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr( + cusolverDnHandle_t handle, + int m, + int n, + int k, + double *A, + int lda, + const double *tau, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, double *, int, const double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungqr( + cusolverDnHandle_t handle, + int m, + int n, + int k, + cuComplex *A, + int lda, + const cuComplex *tau, + cuComplex 
*work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, cuComplex *, int, const cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungqr( + cusolverDnHandle_t handle, + int m, + int n, + int k, + cuDoubleComplex *A, + int lda, + const cuDoubleComplex *tau, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSormqr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float *A, + int lda, + const float *tau, + const float *C, + int ldc, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, const float *, int, const float *, const float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSormqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDormqr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double *A, + int lda, + const double *tau, + const double *C, + int ldc, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, const double *, int, const double *, const double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDormqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const cuComplex *A, + int lda, + const cuComplex *tau, + const cuComplex *C, + int ldc, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, const cuComplex *, int, const cuComplex *, const cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCunmqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *tau, + const cuDoubleComplex *C, + int ldc, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, const cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZunmqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, 
side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSormqr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const float *A, + int lda, + const float *tau, + float *C, + int ldc, + float *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, const float *, int, const float *, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSormqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDormqr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const double *A, + int lda, + const double *tau, + double *C, + int ldc, + double *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, const double *, int, const double *, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDormqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const cuComplex *A, + int lda, + const cuComplex *tau, + cuComplex *C, + int ldc, + cuComplex *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, const cuComplex *, int, const cuComplex *, cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCunmqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasOperation_t trans, + int m, + int n, + int k, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *tau, + cuDoubleComplex *C, + int ldc, + cuDoubleComplex *work, + int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZunmqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf_bufferSize( + cusolverDnHandle_t handle, + int n, + float *A, + int lda, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf_bufferSize( + cusolverDnHandle_t handle, + int n, + double *A, + int lda, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, double *, int, int *); + static auto func_ptr = 
LoadSymbol("cusolverDnDsytrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf_bufferSize( + cusolverDnHandle_t handle, + int n, + cuComplex *A, + int lda, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf_bufferSize( + cusolverDnHandle_t handle, + int n, + cuDoubleComplex *A, + int lda, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + int *ipiv, + float *work, + int lwork, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + int *ipiv, + double *work, + int lwork, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + int *ipiv, + cuComplex *work, + int lwork, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + int *ipiv, + cuDoubleComplex *work, + int lwork, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const float *A, + int lda, + const int *ipiv, + float *B, + int ldb, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int, const int *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrs_bufferSize"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const double *A, + int lda, + const int *ipiv, + double *B, + int ldb, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int, const int *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrs_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytrs_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const cuComplex *A, + int lda, + const int *ipiv, + cuComplex *B, + int ldb, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int, const int *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytrs_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const cuDoubleComplex *A, + int lda, + const int *ipiv, + cuDoubleComplex *B, + int ldb, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytrs_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const float *A, + int lda, + const int *ipiv, + float *B, + int ldb, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int, const int *, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const double *A, + int lda, + const int *ipiv, + double *B, + int ldb, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int, const int *, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytrs( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const cuComplex *A, + int lda, + const int *ipiv, + cuComplex *B, + int ldb, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int, const int *, cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = 
LoadSymbol("cusolverDnCsytrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + int nrhs, + const cuDoubleComplex *A, + int lda, + const int *ipiv, + cuDoubleComplex *B, + int ldb, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + const int *ipiv, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + const int *ipiv, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + const int *ipiv, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, const int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytri_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + const int *ipiv, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, const int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + const int *ipiv, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const int *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + const int *ipiv, + double *work, + int lwork, + int *info) { + using FuncPtr = 
cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const int *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + const int *ipiv, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, const int *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytri( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + const int *ipiv, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, const int *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgebrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgebrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgebrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int *Lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgebrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd( + cusolverDnHandle_t handle, + int m, + int n, + float *A, + int lda, + float *D, + float *E, + float *TAUQ, + float *TAUP, + float *Work, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, float *, int, float *, float *, float *, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgebrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd( + cusolverDnHandle_t handle, + int m, 
+ int n, + double *A, + int lda, + double *D, + double *E, + double *TAUQ, + double *TAUP, + double *Work, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *, int, double *, double *, double *, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgebrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd( + cusolverDnHandle_t handle, + int m, + int n, + cuComplex *A, + int lda, + float *D, + float *E, + cuComplex *TAUQ, + cuComplex *TAUP, + cuComplex *Work, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuComplex *, int, float *, float *, cuComplex *, cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgebrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd( + cusolverDnHandle_t handle, + int m, + int n, + cuDoubleComplex *A, + int lda, + double *D, + double *E, + cuDoubleComplex *TAUQ, + cuDoubleComplex *TAUP, + cuDoubleComplex *Work, + int Lwork, + int *devInfo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, cuDoubleComplex *, int, double *, double *, cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgebrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + int m, + int n, + int k, + const float *A, + int lda, + const float *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, int, int, int, const float *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgbr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + int m, + int n, + int k, + const double *A, + int lda, + const double *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, int, int, int, const double *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgbr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungbr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + int m, + int n, + int k, + const cuComplex *A, + int lda, + const cuComplex *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, int, int, int, const cuComplex *, int, const cuComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungbr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungbr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + int m, + int n, + int k, + const cuDoubleComplex *A, + int lda, + const 
cuDoubleComplex *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, int, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungbr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + int m, + int n, + int k, + float *A, + int lda, + const float *tau, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, int, int, int, float *, int, const float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgbr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + int m, + int n, + int k, + double *A, + int lda, + const double *tau, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, int, int, int, double *, int, const double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgbr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungbr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + int m, + int n, + int k, + cuComplex *A, + int lda, + const cuComplex *tau, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuComplex *, int, const cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungbr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungbr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + int m, + int n, + int k, + cuDoubleComplex *A, + int lda, + const cuDoubleComplex *tau, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungbr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + const float *d, + const float *e, + const float *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, const float *, int, const float *, const float *, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + const double *d, + const double *e, + const double *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI 
*)(cusolverDnHandle_t, cublasFillMode_t, int, const double *, int, const double *, const double *, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChetrd_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + const float *d, + const float *e, + const cuComplex *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int, const float *, const float *, const cuComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnChetrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + const double *d, + const double *e, + const cuDoubleComplex *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, const double *, const double *, const cuDoubleComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhetrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *d, + float *e, + float *tau, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, float *, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *d, + double *e, + double *tau, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *, double *, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChetrd( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + float *d, + float *e, + cuComplex *tau, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, float *, float *, cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnChetrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + double *d, + double *e, + cuDoubleComplex *tau, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, 
cublasFillMode_t, int, cuDoubleComplex *, int, double *, double *, cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhetrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + const float *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, const float *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + const double *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, const double *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungtr_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + const cuComplex *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungtr_bufferSize( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *tau, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + const float *tau, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + const double *tau, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungtr( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuComplex *A, + 
int lda, + const cuComplex *tau, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, const cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungtr( + cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + const cuDoubleComplex *tau, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSormtr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + int m, + int n, + const float *A, + int lda, + const float *tau, + const float *C, + int ldc, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, int, const float *, const float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSormtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDormtr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + int m, + int n, + const double *A, + int lda, + const double *tau, + const double *C, + int ldc, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, int, const double *, const double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDormtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + int m, + int n, + const cuComplex *A, + int lda, + const cuComplex *tau, + const cuComplex *C, + int ldc, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, int, const cuComplex *, const cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCunmtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr_bufferSize( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *tau, + const cuDoubleComplex *C, + int ldc, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, int, const 
cuDoubleComplex *, const cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZunmtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSormtr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + int m, + int n, + float *A, + int lda, + float *tau, + float *C, + int ldc, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, int, int, float *, int, float *, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSormtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDormtr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + int m, + int n, + double *A, + int lda, + double *tau, + double *C, + int ldc, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, int, int, double *, int, double *, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDormtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + int m, + int n, + cuComplex *A, + int lda, + cuComplex *tau, + cuComplex *C, + int ldc, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, int, int, cuComplex *, int, cuComplex *, cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCunmtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr( + cusolverDnHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + int m, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *tau, + cuDoubleComplex *C, + int ldc, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *, cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZunmtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgesvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd_bufferSize( + cusolverDnHandle_t handle, + int m, + int 
n, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgesvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgesvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd_bufferSize( + cusolverDnHandle_t handle, + int m, + int n, + int *lwork ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgesvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd ( + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + float *A, + int lda, + float *S, + float *U, + int ldu, + float *VT, + int ldvt, + float *work, + int lwork, + float *rwork, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, signed char, signed char, int, int, float *, int, float *, float *, int, float *, int, float *, int, float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgesvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd ( + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + double *A, + int lda, + double *S, + double *U, + int ldu, + double *VT, + int ldvt, + double *work, + int lwork, + double *rwork, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, signed char, signed char, int, int, double *, int, double *, double *, int, double *, int, double *, int, double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgesvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd ( + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + cuComplex *A, + int lda, + float *S, + cuComplex *U, + int ldu, + cuComplex *VT, + int ldvt, + cuComplex *work, + int lwork, + float *rwork, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, signed char, signed char, int, int, cuComplex *, int, float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgesvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd ( + cusolverDnHandle_t handle, + signed char jobu, + signed char jobvt, + int m, + int n, + cuDoubleComplex *A, + int lda, + double *S, + cuDoubleComplex *U, + int ldu, + cuDoubleComplex *VT, + int ldvt, + cuDoubleComplex *work, + int lwork, + double *rwork, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, signed char, signed char, int, int, 
cuDoubleComplex *, int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int, cuDoubleComplex *, int, double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgesvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, lwork, rwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + const float *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const float *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsyevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const double *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsyevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevd_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + const float *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const cuComplex *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCheevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevd_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const cuDoubleComplex *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZheevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *W, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *, int, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsyevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *W, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *, int, double *, double *, int, int *); + static auto func_ptr = 
LoadSymbol("cusolverDnDsyevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevd( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + float *W, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *, int, float *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCheevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevd( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + double *W, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZheevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + float vl, + float vu, + int il, + int iu, + int *meig, + const float *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, const float *, int, float, float, int, int, int *, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsyevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + double vl, + double vu, + int il, + int iu, + int *meig, + const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, const double *, int, double, double, int, int, int *, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsyevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevdx_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + float vl, + float vu, + int il, + int iu, + int *meig, + const float *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, const cuComplex *, int, float, float, int, int, int *, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCheevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI 
cusolverDnZheevdx_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + double vl, + double vu, + int il, + int iu, + int *meig, + const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, const cuDoubleComplex *, int, double, double, int, int, int *, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZheevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float vl, + float vu, + int il, + int iu, + int *meig, + float *W, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, float *, int, float, float, int, int, int *, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsyevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double vl, + double vu, + int il, + int iu, + int *meig, + double *W, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, double *, int, double, double, int, int, int *, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsyevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevdx( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + float vl, + float vu, + int il, + int iu, + int *meig, + float *W, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, cuComplex *, int, float, float, int, int, int *, float *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCheevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + double vl, + double vu, + int il, + int iu, + int *meig, + double *W, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, cuDoubleComplex *, int, double, double, int, int, int *, double *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZheevdx"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + const float *B, + int ldb, + float vl, + float vu, + int il, + int iu, + int *meig, + const float *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, const float *, int, const float *, int, float, float, int, int, int *, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsygvdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + const double *B, + int ldb, + double vl, + double vu, + int il, + int iu, + int *meig, + const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, const double *, int, const double *, int, double, double, int, int, int *, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsygvdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + float vl, + float vu, + int il, + int iu, + int *meig, + const float *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int, float, float, int, int, int *, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnChegvdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + double vl, + double vu, + int il, + int iu, + int *meig, + const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, double, double, int, int, int *, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhegvdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI 
cusolverDnSsygvdx( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *B, + int ldb, + float vl, + float vu, + int il, + int iu, + int *meig, + float *W, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, float *, int, float *, int, float, float, int, int, int *, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsygvdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *B, + int ldb, + double vl, + double vu, + int il, + int iu, + int *meig, + double *W, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, double *, int, double *, int, double, double, int, int, int *, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsygvdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + cuComplex *B, + int ldb, + float vl, + float vu, + int il, + int iu, + int *meig, + float *W, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float, float, int, int, int *, float *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnChegvdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *B, + int ldb, + double vl, + double vu, + int il, + int iu, + int *meig, + double *W, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, double, double, int, int, int *, double *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhegvdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float *A, + 
int lda, + const float *B, + int ldb, + const float *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, const float *, int, const float *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsygvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + const double *B, + int ldb, + const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, const double *, int, const double *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsygvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvd_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const float *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnChegvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhegvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *B, + int ldb, + float *W, + float *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, float *, int, float *, int, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsygvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *B, + int ldb, + double *W, + double *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, 
cublasFillMode_t, int, double *, int, double *, int, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsygvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvd( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + cuComplex *B, + int ldb, + float *W, + cuComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnChegvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *B, + int ldb, + double *W, + cuDoubleComplex *work, + int lwork, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhegvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCreateSyevjInfo( + syevjInfo_t *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(syevjInfo_t *); + static auto func_ptr = LoadSymbol("cusolverDnCreateSyevjInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDestroySyevjInfo( + syevjInfo_t info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDestroySyevjInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetTolerance( + syevjInfo_t info, + double tolerance) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(syevjInfo_t, double); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjSetTolerance"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, tolerance); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetMaxSweeps( + syevjInfo_t info, + int max_sweeps) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjSetMaxSweeps"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, max_sweeps); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetSortEig( + syevjInfo_t info, + int sort_eig) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjSetSortEig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, sort_eig); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetResidual( + cusolverDnHandle_t handle, + syevjInfo_t info, + double *residual) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, syevjInfo_t, double *); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjGetResidual"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, residual); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetSweeps( + cusolverDnHandle_t handle, + syevjInfo_t info, + int *executed_sweeps) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, syevjInfo_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjGetSweeps"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, executed_sweeps); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + const float *W, + int *lwork, + syevjInfo_t params, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const float *, int, const float *, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnSsyevjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + const double *W, + int *lwork, + syevjInfo_t params, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const double *, int, const double *, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnDsyevjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + const float *W, + int *lwork, + syevjInfo_t params, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const cuComplex *, int, const float *, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnCheevjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + const double *W, + int *lwork, + syevjInfo_t params, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const cuDoubleComplex *, int, const double *, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnZheevjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *W, + float *work, + int lwork, + int *info, + syevjInfo_t params, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *, int, float *, float *, int, int *, syevjInfo_t, int); + static auto 
func_ptr = LoadSymbol("cusolverDnSsyevjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *W, + double *work, + int lwork, + int *info, + syevjInfo_t params, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *, int, double *, double *, int, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnDsyevjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + float *W, + cuComplex *work, + int lwork, + int *info, + syevjInfo_t params, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *, int, float *, cuComplex *, int, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnCheevjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + double *W, + cuDoubleComplex *work, + int lwork, + int *info, + syevjInfo_t params, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnZheevjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + const float *W, + int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const float *, int, const float *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSsyevj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + const double *W, + int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const double *, int, const double *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDsyevj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + 
const cuComplex *A, + int lda, + const float *W, + int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const cuComplex *, int, const float *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnCheevj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + const double *W, + int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, const cuDoubleComplex *, int, const double *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZheevj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *W, + float *work, + int lwork, + int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *, int, float *, float *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSsyevj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *W, + double *work, + int lwork, + int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *, int, double *, double *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDsyevj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevj( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + float *W, + cuComplex *work, + int lwork, + int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *, int, float *, cuComplex *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnCheevj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevj( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + double *W, + cuDoubleComplex *work, + int lwork, + int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZheevj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); +} + +cusolverStatus_t 
CUSOLVERAPI cusolverDnSsygvj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const float *A, + int lda, + const float *B, + int ldb, + const float *W, + int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, const float *, int, const float *, int, const float *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSsygvj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const double *A, + int lda, + const double *B, + int ldb, + const double *W, + int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, const double *, int, const double *, int, const double *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDsygvj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuComplex *A, + int lda, + const cuComplex *B, + int ldb, + const float *W, + int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int, const float *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnChegvj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex *A, + int lda, + const cuDoubleComplex *B, + int ldb, + const double *W, + int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZhegvj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + float *A, + int lda, + float *B, + int ldb, + float *W, + float *work, + int lwork, + int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, float *, int, float *, int, float *, float *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSsygvj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, info, params); +} 
+ +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + double *A, + int lda, + double *B, + int ldb, + double *W, + double *work, + int lwork, + int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, double *, int, double *, int, double *, double *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDsygvj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvj( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuComplex *A, + int lda, + cuComplex *B, + int ldb, + float *W, + cuComplex *work, + int lwork, + int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *, cuComplex *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnChegvj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj( + cusolverDnHandle_t handle, + cusolverEigType_t itype, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int n, + cuDoubleComplex *A, + int lda, + cuDoubleComplex *B, + int ldb, + double *W, + cuDoubleComplex *work, + int lwork, + int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZhegvj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCreateGesvdjInfo( + gesvdjInfo_t *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(gesvdjInfo_t *); + static auto func_ptr = LoadSymbol("cusolverDnCreateGesvdjInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDestroyGesvdjInfo( + gesvdjInfo_t info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDestroyGesvdjInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetTolerance( + gesvdjInfo_t info, + double tolerance) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(gesvdjInfo_t, double); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjSetTolerance"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, tolerance); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetMaxSweeps( + gesvdjInfo_t info, + int max_sweeps) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjSetMaxSweeps"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, max_sweeps); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetSortEig( + gesvdjInfo_t info, + int sort_svd) { 
+ using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjSetSortEig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, sort_svd); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetResidual( + cusolverDnHandle_t handle, + gesvdjInfo_t info, + double *residual) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, gesvdjInfo_t, double *); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjGetResidual"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, residual); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetSweeps( + cusolverDnHandle_t handle, + gesvdjInfo_t info, + int *executed_sweeps) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, gesvdjInfo_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjGetSweeps"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, executed_sweeps); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int m, + int n, + const float *A, + int lda, + const float *S, + const float *U, + int ldu, + const float *V, + int ldv, + int *lwork, + gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, const float *, int, const float *, const float *, int, const float *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int m, + int n, + const double *A, + int lda, + const double *S, + const double *U, + int ldu, + const double *V, + int ldv, + int *lwork, + gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, const double *, int, const double *, const double *, int, const double *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int m, + int n, + const cuComplex *A, + int lda, + const float *S, + const cuComplex *U, + int ldu, + const cuComplex *V, + int ldv, + int *lwork, + gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuComplex *, int, const float *, const cuComplex *, int, const cuComplex *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int m, + int n, + const cuDoubleComplex *A, + int lda, + const double *S, + const cuDoubleComplex *U, + int ldu, + const cuDoubleComplex *V, + int ldv, + int *lwork, + gesvdjInfo_t params, + int 
batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuDoubleComplex *, int, const double *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int m, + int n, + float *A, + int lda, + float *S, + float *U, + int ldu, + float *V, + int ldv, + float *work, + int lwork, + int *info, + gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, float *, int, float *, float *, int, float *, int, float *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int m, + int n, + double *A, + int lda, + double *S, + double *U, + int ldu, + double *V, + int ldv, + double *work, + int lwork, + int *info, + gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, double *, int, double *, double *, int, double *, int, double *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int m, + int n, + cuComplex *A, + int lda, + float *S, + cuComplex *U, + int ldu, + cuComplex *V, + int ldv, + cuComplex *work, + int lwork, + int *info, + gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, cuComplex *, int, float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int m, + int n, + cuDoubleComplex *A, + int lda, + double *S, + cuDoubleComplex *U, + int ldu, + cuDoubleComplex *V, + int ldv, + cuDoubleComplex *work, + int lwork, + int *info, + gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, cuDoubleComplex *, int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int, cuDoubleComplex *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int 
econ, + int m, + int n, + const float *A, + int lda, + const float *S, + const float *U, + int ldu, + const float *V, + int ldv, + int *lwork, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int, const float *, const float *, int, const float *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const double *A, + int lda, + const double *S, + const double *U, + int ldu, + const double *V, + int ldv, + int *lwork, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int, const double *, const double *, int, const double *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const cuComplex *A, + int lda, + const float *S, + const cuComplex *U, + int ldu, + const cuComplex *V, + int ldv, + int *lwork, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *, int, const float *, const cuComplex *, int, const cuComplex *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + const cuDoubleComplex *A, + int lda, + const double *S, + const cuDoubleComplex *U, + int ldu, + const cuDoubleComplex *V, + int ldv, + int *lwork, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuDoubleComplex *, int, const double *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + float *A, + int lda, + float *S, + float *U, + int ldu, + float *V, + int ldv, + float *work, + int lwork, + int *info, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, float *, int, float *, float *, int, float *, int, float *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj( + cusolverDnHandle_t handle, + 
cusolverEigMode_t jobz, + int econ, + int m, + int n, + double *A, + int lda, + double *S, + double *U, + int ldu, + double *V, + int ldv, + double *work, + int lwork, + int *info, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, double *, int, double *, double *, int, double *, int, double *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + cuComplex *A, + int lda, + float *S, + cuComplex *U, + int ldu, + cuComplex *V, + int ldv, + cuComplex *work, + int lwork, + int *info, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuComplex *, int, float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int econ, + int m, + int n, + cuDoubleComplex *A, + int lda, + double *S, + cuDoubleComplex *U, + int ldu, + cuDoubleComplex *V, + int ldv, + cuDoubleComplex *work, + int lwork, + int *info, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuDoubleComplex *, int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int, cuDoubleComplex *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int rank, + int m, + int n, + const float *d_A, + int lda, + long long int strideA, + const float *d_S, + long long int strideS, + const float *d_U, + int ldu, + long long int strideU, + const float *d_V, + int ldv, + long long int strideV, + int *lwork, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int, long long, const float *, long long, const float *, int, long long, const float *, int, long long, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdaStridedBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int rank, + int m, + int n, + const double *d_A, + int lda, + long long int strideA, + const double *d_S, + long long int strideS, + const double *d_U, + int ldu, + long long int strideU, + const double *d_V, + int ldv, + long long int strideV, + int *lwork, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int, 
long long, const double *, long long, const double *, int, long long, const double *, int, long long, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdaStridedBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int rank, + int m, + int n, + const cuComplex *d_A, + int lda, + long long int strideA, + const float *d_S, + long long int strideS, + const cuComplex *d_U, + int ldu, + long long int strideU, + const cuComplex *d_V, + int ldv, + long long int strideV, + int *lwork, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *, int, long long, const float *, long long, const cuComplex *, int, long long, const cuComplex *, int, long long, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdaStridedBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched_bufferSize( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int rank, + int m, + int n, + const cuDoubleComplex *d_A, + int lda, + long long int strideA, + const double *d_S, + long long int strideS, + const cuDoubleComplex *d_U, + int ldu, + long long int strideU, + const cuDoubleComplex *d_V, + int ldv, + long long int strideV, + int *lwork, + int batchSize + ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuDoubleComplex *, int, long long, const double *, long long, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int, long long, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdaStridedBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int rank, + int m, + int n, + const float *d_A, + int lda, + long long int strideA, + float *d_S, + long long int strideS, + float *d_U, + int ldu, + long long int strideU, + float *d_V, + int ldv, + long long int strideV, + float *d_work, + int lwork, + int *d_info, + double *h_R_nrmF, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int, long long, float *, long long, float *, int, long long, float *, int, long long, float *, int, int *, double *, int); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdaStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info, h_R_nrmF, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int rank, + int m, + int n, + const double *d_A, + int lda, + long long int strideA, + double *d_S, + long long int strideS, + double *d_U, + int ldu, + long long int 
strideU, + double *d_V, + int ldv, + long long int strideV, + double *d_work, + int lwork, + int *d_info, + double *h_R_nrmF, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int, long long, double *, long long, double *, int, long long, double *, int, long long, double *, int, int *, double *, int); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdaStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info, h_R_nrmF, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int rank, + int m, + int n, + const cuComplex *d_A, + int lda, + long long int strideA, + float *d_S, + long long int strideS, + cuComplex *d_U, + int ldu, + long long int strideU, + cuComplex *d_V, + int ldv, + long long int strideV, + cuComplex *d_work, + int lwork, + int *d_info, + double *h_R_nrmF, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *, int, long long, float *, long long, cuComplex *, int, long long, cuComplex *, int, long long, cuComplex *, int, int *, double *, int); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdaStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info, h_R_nrmF, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched( + cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + int rank, + int m, + int n, + const cuDoubleComplex *d_A, + int lda, + long long int strideA, + double *d_S, + long long int strideS, + cuDoubleComplex *d_U, + int ldu, + long long int strideU, + cuDoubleComplex *d_V, + int ldv, + long long int strideV, + cuDoubleComplex *d_work, + int lwork, + int *d_info, + double *h_R_nrmF, + int batchSize) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuDoubleComplex *, int, long long, double *, long long, cuDoubleComplex *, int, long long, cuDoubleComplex *, int, long long, cuDoubleComplex *, int, int *, double *, int); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdaStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info, h_R_nrmF, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCreateParams( + cusolverDnParams_t *params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnParams_t *); + static auto func_ptr = LoadSymbol("cusolverDnCreateParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDestroyParams( + cusolverDnParams_t params) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnParams_t); + static auto func_ptr = LoadSymbol("cusolverDnDestroyParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSetAdvOptions ( + cusolverDnParams_t params, + cusolverDnFunction_t function, + cusolverAlgMode_t algo ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnParams_t, 
cusolverDnFunction_t, cusolverAlgMode_t); + static auto func_ptr = LoadSymbol("cusolverDnSetAdvOptions"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, function, algo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnPotrf_bufferSize( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cublasFillMode_t uplo, + int64_t n, + cudaDataType dataTypeA, + const void *A, + int64_t lda, + cudaDataType computeType, + size_t *workspaceInBytes ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t, cudaDataType, const void *, int64_t, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnPotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, uplo, n, dataTypeA, A, lda, computeType, workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnPotrf( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cublasFillMode_t uplo, + int64_t n, + cudaDataType dataTypeA, + void *A, + int64_t lda, + cudaDataType computeType, + void *pBuffer, + size_t workspaceInBytes, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t, cudaDataType, void *, int64_t, cudaDataType, void *, size_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnPotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, uplo, n, dataTypeA, A, lda, computeType, pBuffer, workspaceInBytes, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnPotrs( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cublasFillMode_t uplo, + int64_t n, + int64_t nrhs, + cudaDataType dataTypeA, + const void *A, + int64_t lda, + cudaDataType dataTypeB, + void *B, + int64_t ldb, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t, int64_t, cudaDataType, const void *, int64_t, cudaDataType, void *, int64_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnPotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, uplo, n, nrhs, dataTypeA, A, lda, dataTypeB, B, ldb, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGeqrf_bufferSize( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + int64_t m, + int64_t n, + cudaDataType dataTypeA, + const void *A, + int64_t lda, + cudaDataType dataTypeTau, + const void *tau, + cudaDataType computeType, + size_t *workspaceInBytes ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType, const void *, int64_t, cudaDataType, const void *, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnGeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, dataTypeA, A, lda, dataTypeTau, tau, computeType, workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGeqrf( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + int64_t m, + int64_t n, + cudaDataType dataTypeA, + void *A, + int64_t lda, + cudaDataType dataTypeTau, + void *tau, + cudaDataType computeType, + void *pBuffer, + size_t workspaceInBytes, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType, void *, int64_t, cudaDataType, void *, cudaDataType, void *, size_t, int *); + static auto func_ptr = 
LoadSymbol("cusolverDnGeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, dataTypeA, A, lda, dataTypeTau, tau, computeType, pBuffer, workspaceInBytes, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGetrf_bufferSize( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + int64_t m, + int64_t n, + cudaDataType dataTypeA, + const void *A, + int64_t lda, + cudaDataType computeType, + size_t *workspaceInBytes ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType, const void *, int64_t, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnGetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, dataTypeA, A, lda, computeType, workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGetrf( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + int64_t m, + int64_t n, + cudaDataType dataTypeA, + void *A, + int64_t lda, + int64_t *ipiv, + cudaDataType computeType, + void *pBuffer, + size_t workspaceInBytes, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType, void *, int64_t, int64_t *, cudaDataType, void *, size_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnGetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, dataTypeA, A, lda, ipiv, computeType, pBuffer, workspaceInBytes, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGetrs( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cublasOperation_t trans, + int64_t n, + int64_t nrhs, + cudaDataType dataTypeA, + const void *A, + int64_t lda, + const int64_t *ipiv, + cudaDataType dataTypeB, + void *B, + int64_t ldb, + int *info ) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, cublasOperation_t, int64_t, int64_t, cudaDataType, const void *, int64_t, const int64_t *, cudaDataType, void *, int64_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnGetrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, trans, n, nrhs, dataTypeA, A, lda, ipiv, dataTypeB, B, ldb, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSyevd_bufferSize( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + cudaDataType dataTypeA, + const void *A, + int64_t lda, + cudaDataType dataTypeW, + const void *W, + cudaDataType computeType, + size_t *workspaceInBytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, cublasFillMode_t, int64_t, cudaDataType, const void *, int64_t, cudaDataType, const void *, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSyevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSyevd( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, + int64_t n, + cudaDataType dataTypeA, + void *A, + int64_t lda, + cudaDataType dataTypeW, + void *W, + cudaDataType computeType, + void *pBuffer, + size_t workspaceInBytes, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, 
cublasFillMode_t, int64_t, cudaDataType, void *, int64_t, cudaDataType, void *, cudaDataType, void *, size_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnSyevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, W, computeType, pBuffer, workspaceInBytes, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSyevdx_bufferSize( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int64_t n, + cudaDataType dataTypeA, + const void *A, + int64_t lda, + void *vl, + void *vu, + int64_t il, + int64_t iu, + int64_t *h_meig, + cudaDataType dataTypeW, + const void *W, + cudaDataType computeType, + size_t *workspaceInBytes) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int64_t, cudaDataType, const void *, int64_t, void *, void *, int64_t, int64_t, int64_t *, cudaDataType, const void *, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSyevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, jobz, range, uplo, n, dataTypeA, A, lda, vl, vu, il, iu, h_meig, dataTypeW, W, computeType, workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSyevdx( + cusolverDnHandle_t handle, + cusolverDnParams_t params, + cusolverEigMode_t jobz, + cusolverEigRange_t range, + cublasFillMode_t uplo, + int64_t n, + cudaDataType dataTypeA, + void *A, + int64_t lda, + void * vl, + void * vu, + int64_t il, + int64_t iu, + int64_t *meig64, + cudaDataType dataTypeW, + void *W, + cudaDataType computeType, + void *pBuffer, + size_t workspaceInBytes, + int *info) { + using FuncPtr = cusolverStatus_t (CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, cusolverEigRange_t, cublasFillMode_t, int64_t, cudaDataType, void *, int64_t, void *, void *, int64_t, int64_t, int64_t *, cudaDataType, void *, cudaDataType, void *, size_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnSyevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, jobz, range, uplo, n, dataTypeA, A, lda, vl, vu, il, iu, meig64, dataTypeW, W, computeType, pBuffer, workspaceInBytes, info); +} + +} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cusparse_11_0.inc b/tensorflow/stream_executor/cuda/cusparse_11_0.inc new file mode 100644 index 00000000000..1adb0f27433 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cusparse_11_0.inc @@ -0,0 +1,7942 @@ +// Auto-generated, do not edit. 
+ +extern "C" { +cusparseStatus_t CUSPARSEAPI +cusparseCreate(cusparseHandle_t* handle) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t *); + static auto func_ptr = LoadSymbol("cusparseCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroy(cusparseHandle_t handle) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t); + static auto func_ptr = LoadSymbol("cusparseDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cusparseStatus_t CUSPARSEAPI +cusparseGetVersion(cusparseHandle_t handle, + int* version) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int *); + static auto func_ptr = LoadSymbol("cusparseGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, version); +} + +cusparseStatus_t CUSPARSEAPI +cusparseGetProperty(libraryPropertyType type, + int* value) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(libraryPropertyType, int *); + static auto func_ptr = LoadSymbol("cusparseGetProperty"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(type, value); +} + +const char* CUSPARSEAPI +cusparseGetErrorName(cusparseStatus_t status) { + using FuncPtr = const char* (CUSPARSEAPI *)(cusparseStatus_t); + static auto func_ptr = LoadSymbol("cusparseGetErrorName"); + if (!func_ptr) return "cusparseGetErrorName symbol not found."; + return func_ptr(status); +} + +const char* CUSPARSEAPI +cusparseGetErrorString(cusparseStatus_t status) { + using FuncPtr = const char* (CUSPARSEAPI *)(cusparseStatus_t); + static auto func_ptr = LoadSymbol("cusparseGetErrorString"); + if (!func_ptr) return "cusparseGetErrorString symbol not found."; + return func_ptr(status); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetStream(cusparseHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cusparseSetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cusparseStatus_t CUSPARSEAPI +cusparseGetStream(cusparseHandle_t handle, + cudaStream_t* streamId) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t *); + static auto func_ptr = LoadSymbol("cusparseGetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cusparseStatus_t CUSPARSEAPI +cusparseGetPointerMode(cusparseHandle_t handle, + cusparsePointerMode_t* mode) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t *); + static auto func_ptr = LoadSymbol("cusparseGetPointerMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetPointerMode(cusparseHandle_t handle, + cusparsePointerMode_t mode) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t); + static auto func_ptr = LoadSymbol("cusparseSetPointerMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateMatDescr(cusparseMatDescr_t* descrA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseMatDescr_t *); + static auto func_ptr = LoadSymbol("cusparseCreateMatDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA); +} + +cusparseStatus_t CUSPARSEAPI 
+cusparseDestroyMatDescr(cusparseMatDescr_t descrA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseMatDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroyMatDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCopyMatDescr(cusparseMatDescr_t dest, + const cusparseMatDescr_t src) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseMatDescr_t, const cusparseMatDescr_t); + static auto func_ptr = LoadSymbol("cusparseCopyMatDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dest, src); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatType(cusparseMatDescr_t descrA, + cusparseMatrixType_t type) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseMatDescr_t, cusparseMatrixType_t); + static auto func_ptr = LoadSymbol("cusparseSetMatType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA, type); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatFillMode(cusparseMatDescr_t descrA, + cusparseFillMode_t fillMode) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseMatDescr_t, cusparseFillMode_t); + static auto func_ptr = LoadSymbol("cusparseSetMatFillMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA, fillMode); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatDiagType(cusparseMatDescr_t descrA, + cusparseDiagType_t diagType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseMatDescr_t, cusparseDiagType_t); + static auto func_ptr = LoadSymbol("cusparseSetMatDiagType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA, diagType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatIndexBase(cusparseMatDescr_t descrA, + cusparseIndexBase_t base) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseMatDescr_t, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSetMatIndexBase"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA, base); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsrsv2Info(csrsv2Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csrsv2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsrsv2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsrsv2Info(csrsv2Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csrsv2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsrsv2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsric02Info(csric02Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csric02Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsric02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsric02Info(csric02Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csric02Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsric02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBsric02Info(bsric02Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(bsric02Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateBsric02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI 
+cusparseDestroyBsric02Info(bsric02Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(bsric02Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyBsric02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsrilu02Info(csrilu02Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csrilu02Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsrilu02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsrilu02Info(csrilu02Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csrilu02Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsrilu02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBsrilu02Info(bsrilu02Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(bsrilu02Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateBsrilu02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyBsrilu02Info(bsrilu02Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(bsrilu02Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyBsrilu02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBsrsv2Info(bsrsv2Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(bsrsv2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateBsrsv2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyBsrsv2Info(bsrsv2Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(bsrsv2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyBsrsv2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateBsrsm2Info(bsrsm2Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(bsrsm2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateBsrsm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyBsrsm2Info(bsrsm2Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(bsrsm2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyBsrsm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsru2csrInfo(csru2csrInfo_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csru2csrInfo_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsru2csrInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsru2csrInfo(csru2csrInfo_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csru2csrInfo_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsru2csrInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateColorInfo(cusparseColorInfo_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseColorInfo_t *); + static auto func_ptr = LoadSymbol("cusparseCreateColorInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI 
+cusparseDestroyColorInfo(cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseDestroyColorInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetColorAlgs(cusparseColorInfo_t info, + cusparseColorAlg_t alg) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t); + static auto func_ptr = LoadSymbol("cusparseSetColorAlgs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, alg); +} + +cusparseStatus_t CUSPARSEAPI +cusparseGetColorAlgs(cusparseColorInfo_t info, + cusparseColorAlg_t* alg) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t *); + static auto func_ptr = LoadSymbol("cusparseGetColorAlgs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, alg); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreatePruneInfo(pruneInfo_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(pruneInfo_t *); + static auto func_ptr = LoadSymbol("cusparseCreatePruneInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyPruneInfo(pruneInfo_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(pruneInfo_t); + static auto func_ptr = LoadSymbol("cusparseDestroyPruneInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSaxpyi(cusparseHandle_t handle, + int nnz, + const float* alpha, + const float* xVal, + const int* xInd, + float* y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const float *, const float *, const int *, float *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSaxpyi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDaxpyi(cusparseHandle_t handle, + int nnz, + const double* alpha, + const double* xVal, + const int* xInd, + double* y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const double *, const double *, const int *, double *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDaxpyi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCaxpyi(cusparseHandle_t handle, + int nnz, + const cuComplex* alpha, + const cuComplex* xVal, + const int* xInd, + cuComplex* y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuComplex *, const cuComplex *, const int *, cuComplex *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseCaxpyi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZaxpyi(cusparseHandle_t handle, + int nnz, + const cuDoubleComplex* alpha, + const cuDoubleComplex* xVal, + const int* xInd, + cuDoubleComplex* y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, const int *, cuDoubleComplex *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseZaxpyi"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgthr(cusparseHandle_t handle, + int nnz, + const float* y, + float* xVal, + const int* xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const float *, float *, const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSgthr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgthr(cusparseHandle_t handle, + int nnz, + const double* y, + double* xVal, + const int* xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const double *, double *, const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDgthr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgthr(cusparseHandle_t handle, + int nnz, + const cuComplex* y, + cuComplex* xVal, + const int* xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuComplex *, cuComplex *, const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseCgthr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgthr(cusparseHandle_t handle, + int nnz, + const cuDoubleComplex* y, + cuDoubleComplex* xVal, + const int* xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseZgthr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgthrz(cusparseHandle_t handle, + int nnz, + float* y, + float* xVal, + const int* xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, float *, float *, const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSgthrz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgthrz(cusparseHandle_t handle, + int nnz, + double* y, + double* xVal, + const int* xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, double *, double *, const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDgthrz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgthrz(cusparseHandle_t handle, + int nnz, + cuComplex* y, + cuComplex* xVal, + const int* xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cuComplex *, cuComplex *, const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseCgthrz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgthrz(cusparseHandle_t handle, + int nnz, + cuDoubleComplex* y, + cuDoubleComplex* xVal, + const int* 
xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cuDoubleComplex *, cuDoubleComplex *, const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseZgthrz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSsctr(cusparseHandle_t handle, + int nnz, + const float* xVal, + const int* xInd, + float* y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const float *, const int *, float *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSsctr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDsctr(cusparseHandle_t handle, + int nnz, + const double* xVal, + const int* xInd, + double* y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const double *, const int *, double *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDsctr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCsctr(cusparseHandle_t handle, + int nnz, + const cuComplex* xVal, + const int* xInd, + cuComplex* y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuComplex *, const int *, cuComplex *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseCsctr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZsctr(cusparseHandle_t handle, + int nnz, + const cuDoubleComplex* xVal, + const int* xInd, + cuDoubleComplex* y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuDoubleComplex *, const int *, cuDoubleComplex *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseZsctr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSroti(cusparseHandle_t handle, + int nnz, + float* xVal, + const int* xInd, + float* y, + const float* c, + const float* s, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, float *, const int *, float *, const float *, const float *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSroti"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDroti(cusparseHandle_t handle, + int nnz, + double* xVal, + const int* xInd, + double* y, + const double* c, + const double* s, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, double *, const int *, double *, const double *, const double *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDroti"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const float* alpha, + const float* A, + int lda, + int nnz, + const float* xVal, + const int* xInd, + const 
float* beta, + float* y, + cusparseIndexBase_t idxBase, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const float *, const float *, int, int, const float *, const int *, const float *, float *, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseSgemvi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgemvi_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseSgemvi_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, nnz, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const double* alpha, + const double* A, + int lda, + int nnz, + const double* xVal, + const int* xInd, + const double* beta, + double* y, + cusparseIndexBase_t idxBase, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const double *, const double *, int, int, const double *, const int *, const double *, double *, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseDgemvi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgemvi_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseDgemvi_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, nnz, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const cuComplex* alpha, + const cuComplex* A, + int lda, + int nnz, + const cuComplex* xVal, + const int* xInd, + const cuComplex* beta, + cuComplex* y, + cusparseIndexBase_t idxBase, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *, const cuComplex *, int, int, const cuComplex *, const int *, const cuComplex *, cuComplex *, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseCgemvi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgemvi_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseCgemvi_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, nnz, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgemvi(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + const cuDoubleComplex* alpha, + 
const cuDoubleComplex* A, + int lda, + int nnz, + const cuDoubleComplex* xVal, + const int* xInd, + const cuDoubleComplex* beta, + cuDoubleComplex* y, + cusparseIndexBase_t idxBase, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, int, const cuDoubleComplex *, const int *, const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseZgemvi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgemvi_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int n, + int nnz, + int* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseZgemvi_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, nnz, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCsrmvEx_bufferSize(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const void* alpha, + cudaDataType alphatype, + const cusparseMatDescr_t descrA, + const void* csrValA, + cudaDataType csrValAtype, + const int* csrRowPtrA, + const int* csrColIndA, + const void* x, + cudaDataType xtype, + const void* beta, + cudaDataType betatype, + void* y, + cudaDataType ytype, + cudaDataType executiontype, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int, const void *, cudaDataType, const cusparseMatDescr_t, const void *, cudaDataType, const int *, const int *, const void *, cudaDataType, const void *, cudaDataType, void *, cudaDataType, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusparseCsrmvEx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA, csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta, betatype, y, ytype, executiontype, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCsrmvEx(cusparseHandle_t handle, + cusparseAlgMode_t alg, + cusparseOperation_t transA, + int m, + int n, + int nnz, + const void* alpha, + cudaDataType alphatype, + const cusparseMatDescr_t descrA, + const void* csrValA, + cudaDataType csrValAtype, + const int* csrRowPtrA, + const int* csrColIndA, + const void* x, + cudaDataType xtype, + const void* beta, + cudaDataType betatype, + void* y, + cudaDataType ytype, + cudaDataType executiontype, + void* buffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int, const void *, cudaDataType, const cusparseMatDescr_t, const void *, cudaDataType, const int *, const int *, const void *, cudaDataType, const void *, cudaDataType, void *, cudaDataType, cudaDataType, void *); + static auto func_ptr = LoadSymbol("cusparseCsrmvEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA, csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta, betatype, y, ytype, executiontype, buffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + 
cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const float* x, + const float* beta, + float* y) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, int, const float *, const float *, float *); + static auto func_ptr = LoadSymbol("cusparseSbsrmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const double* x, + const double* beta, + double* y) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, int, const double *, const double *, double *); + static auto func_ptr = LoadSymbol("cusparseDbsrmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cuComplex* x, + const cuComplex* beta, + cuComplex* y) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, const cuComplex *, const cuComplex *, cuComplex *); + static auto func_ptr = LoadSymbol("cusparseCbsrmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cuDoubleComplex* x, + const cuDoubleComplex* beta, + cuDoubleComplex* y) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cusparseZbsrmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, 
bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedMaskPtrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedEndPtrA, + const int* bsrSortedColIndA, + int blockDim, + const float* x, + const float* beta, + float* y) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, const int *, const int *, int, const float *, const float *, float *); + static auto func_ptr = LoadSymbol("cusparseSbsrxmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA, bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedMaskPtrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedEndPtrA, + const int* bsrSortedColIndA, + int blockDim, + const double* x, + const double* beta, + double* y) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, const int *, const int *, int, const double *, const double *, double *); + static auto func_ptr = LoadSymbol("cusparseDbsrxmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA, bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedMaskPtrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedEndPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cuComplex* x, + const cuComplex* beta, + cuComplex* y) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, const int *, const int *, int, const cuComplex *, const cuComplex *, cuComplex *); + static auto func_ptr = LoadSymbol("cusparseCbsrxmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA, bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrxmv(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int sizeOfMask, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* 
bsrSortedMaskPtrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedEndPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cuDoubleComplex* x, + const cuDoubleComplex* beta, + cuDoubleComplex* y) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, const int *, const int *, int, const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cusparseZbsrxmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA, bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle, + csrsv2Info_t info, + int* position) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXcsrsv2_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseScsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDcsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCcsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsv2_bufferSize(cusparseHandle_t handle, + cusparseOperation_t 
transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZcsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, csrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, csrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, csrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, csrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, 
csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, csrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, csrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, csrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsv2_analysis(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, csrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t 
info, + const float* f, + float* x, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, csrsv2Info_t, const float *, float *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + const double* f, + double* x, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, csrsv2Info_t, const double *, double *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + const cuComplex* f, + cuComplex* x, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, csrsv2Info_t, const cuComplex *, cuComplex *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsv2_solve(cusparseHandle_t handle, + cusparseOperation_t transA, + int m, + int nnz, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrsv2Info_t info, + const cuDoubleComplex* f, + cuDoubleComplex* x, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, csrsv2Info_t, const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle, + bsrsv2Info_t info, + int* position) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI 
*)(cusparseHandle_t, bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXbsrsv2_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseSbsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDbsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCbsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsv2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZbsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, pBufferSizeInBytes); +} + 
+cusparseStatus_t CUSPARSEAPI +cusparseSbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSbsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDbsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCbsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsv2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockSize, + bsrsv2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZbsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* 
bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsv2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const 
float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const float* f, + float* x, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, int, bsrsv2Info_t, const float *, float *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const double* f, + double* x, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, int, bsrsv2Info_t, const double *, double *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const cuComplex* f, + cuComplex* x, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, bsrsv2Info_t, const cuComplex *, cuComplex *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsv2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + int mb, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + bsrsv2Info_t info, + const cuDoubleComplex* f, + cuDoubleComplex* x, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t, const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t, void *); + static auto func_ptr = 
LoadSymbol("cusparseZbsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + const int blockSize, + const float* B, + const int ldb, + const float* beta, + float* C, + int ldc) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, const int, const float *, const int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cusparseSbsrmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, B, ldb, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + const int blockSize, + const double* B, + const int ldb, + const double* beta, + double* C, + int ldc) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, const int, const double *, const int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cusparseDbsrmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, B, ldb, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int kb, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + const int blockSize, + const cuComplex* B, + const int ldb, + const cuComplex* beta, + cuComplex* C, + int ldc) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, const int, const cuComplex *, const int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cusparseCbsrmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, B, ldb, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI + cusparseZbsrmm(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, 
+ int n, + int kb, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + const int blockSize, + const cuDoubleComplex* B, + const int ldb, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + int ldc) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, const int, const cuDoubleComplex *, const int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cusparseZbsrmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, B, ldb, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const float* alpha, + const float* A, + int lda, + const float* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const float* beta, + float* C, + int ldc) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, int, const float *, const float *, int, const float *, const int *, const int *, const float *, float *, int); + static auto func_ptr = LoadSymbol("cusparseSgemmi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const double* alpha, + const double* A, + int lda, + const double* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const double* beta, + double* C, + int ldc) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, int, const double *, const double *, int, const double *, const int *, const int *, const double *, double *, int); + static auto func_ptr = LoadSymbol("cusparseDgemmi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const cuComplex* alpha, + const cuComplex* A, + int lda, + const cuComplex* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const cuComplex* beta, + cuComplex* C, + int ldc) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, const int *, const int *, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cusparseCgemmi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgemmi(cusparseHandle_t handle, + int m, + int n, + int k, + int nnz, + const cuDoubleComplex* alpha, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* cscValB, + const int* cscColPtrB, + const int* cscRowIndB, + const cuDoubleComplex* beta, + cuDoubleComplex* C, + int ldc) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const 
cuDoubleComplex *, const int *, const int *, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cusparseZgemmi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, cscRowIndB, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsrsm2Info(csrsm2Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csrsm2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsrsm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsrsm2Info(csrsm2Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csrsm2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsrsm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrsm2_zeroPivot(cusparseHandle_t handle, + csrsm2Info_t info, + int* position) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, csrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXcsrsm2_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrsm2_bufferSizeExt(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsm2_bufferSizeExt(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, const double *, int, csrsm2Info_t, cusparseSolvePolicy_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsm2_bufferSizeExt(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, 
+ const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, const cuComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsm2_bufferSizeExt(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuDoubleComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrsm2_analysis(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsm2_analysis(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, const double *, int, csrsm2Info_t, 
cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsm2_analysis(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, const cuComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsm2_analysis(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuDoubleComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrsm2_solve(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, float *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrsm2_solve(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const 
double* alpha, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + double* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, double *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrsm2_solve(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + cuComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, cuComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrsm2_solve(cusparseHandle_t handle, + int algo, + cusparseOperation_t transA, + cusparseOperation_t transB, + int m, + int nrhs, + int nnz, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + cuDoubleComplex* B, + int ldb, + csrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle, + bsrsm2Info_t info, + int* position) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXbsrsm2_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + 
bsrsm2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseSbsrsm2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDbsrsm2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCbsrsm2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsm2_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZbsrsm2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + float* 
bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsrsm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSbsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsrsm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDbsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCbsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsm2_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transB, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZbsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int 
n, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, bsrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, bsrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, bsrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsm2_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, 
nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const float* alpha, + const cusparseMatDescr_t descrA, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const float* B, + int ldb, + float* X, + int ldx, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, int, bsrsm2Info_t, const float *, int, float *, int, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, B, ldb, X, ldx, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const double* alpha, + const cusparseMatDescr_t descrA, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const double* B, + int ldb, + double* X, + int ldx, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, int, bsrsm2Info_t, const double *, int, double *, int, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, B, ldb, X, ldx, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, + cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const cuComplex* B, + int ldb, + cuComplex* X, + int ldx, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, bsrsm2Info_t, const cuComplex *, int, cuComplex *, int, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, B, ldb, X, ldx, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrsm2_solve(cusparseHandle_t handle, + cusparseDirection_t dirA, + cusparseOperation_t transA, 
+ cusparseOperation_t transXY, + int mb, + int n, + int nnzb, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrsm2Info_t info, + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex* X, + int ldx, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, cusparseOperation_t, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, const cuDoubleComplex *, int, cuDoubleComplex *, int, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, B, ldb, X, ldx, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double* tol, + float* boost_val) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int, double *, float *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double* tol, + double* boost_val) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int, double *, double *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double* tol, + cuComplex* boost_val) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int, double *, cuComplex *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02_numericBoost(cusparseHandle_t handle, + csrilu02Info_t info, + int enable_boost, + double* tol, + cuDoubleComplex* boost_val) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int, double *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrilu02_zeroPivot(cusparseHandle_t handle, + csrilu02Info_t info, + int* position) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXcsrilu02_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + 
csrilu02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csrilu02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, csrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csrilu02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI 
*)(cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, csrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csrilu02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, csrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csrilu02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, csrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = 
cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrilu02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* 
csrSortedColIndA, + csrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double* tol, + float* boost_val) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int, double *, float *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double* tol, + double* boost_val) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int, double *, double *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double* tol, + cuComplex* boost_val) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int, double *, cuComplex *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02_numericBoost(cusparseHandle_t handle, + bsrilu02Info_t info, + int enable_boost, + double* tol, + cuDoubleComplex* boost_val) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int, double *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXbsrilu02_zeroPivot(cusparseHandle_t handle, + bsrilu02Info_t info, + int* position) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXbsrilu02_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, 
pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, 
int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsrilu02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + 
+cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* 
pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsrilu02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsrilu02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsric02_zeroPivot(cusparseHandle_t handle, + csric02Info_t info, + int* position) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXcsric02_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseScsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDcsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCcsric02_bufferSize"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsric02_bufferSize(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZcsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csric02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, csric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csric02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, csric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csric02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, csric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsric02_bufferSizeExt(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedVal, + const int* csrSortedRowPtr, + const int* csrSortedColInd, + csric02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, csric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, 
csrSortedRowPtr, csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsric02_analysis(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + float* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = 
LoadSymbol("cusparseScsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + double* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsric02(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrSortedValA_valM, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + csric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXbsric02_zeroPivot(cusparseHandle_t handle, + bsric02Info_t info, + int* position) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXbsric02_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsric02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseSbsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI 
+cusparseDbsric02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDbsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsric02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCbsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsric02_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZbsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsric02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsric02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSbsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsric02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsric02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, 
bsric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDbsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsric02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsric02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCbsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsric02_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockSize, + bsric02Info_t info, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZbsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsric02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pInputBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, bsric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pInputBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsric02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pInputBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, bsric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pInputBuffer); +} + +cusparseStatus_t CUSPARSEAPI 
+cusparseCbsric02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pInputBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, bsric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pInputBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsric02_analysis(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pInputBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pInputBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsric02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, float *, const int *, const int *, int, bsric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsric02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, double *, const int *, const int *, int, bsric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsric02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* + bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* 
pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, bsric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsric02(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nnzb, + const cusparseMatDescr_t descrA, + cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int blockDim, + bsric02Info_t info, + cusparseSolvePolicy_t policy, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const float* dl, + const float* d, + const float* du, + const float* B, + int ldb, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, const float *, const float *, const float *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const double* dl, + const double* d, + const double* du, + const double* B, + int ldb, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, const double *, const double *, const double *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* B, + int ldb, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, const cuComplex *, const cuComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* B, + int ldb, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2_bufferSizeExt"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2(cusparseHandle_t handle, + int m, + int n, + const float* dl, + const float* d, + const float* du, + float* B, + int ldb, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, const float *, const float *, float *, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2(cusparseHandle_t handle, + int m, + int n, + const double* dl, + const double* d, + const double* du, + double* B, + int ldb, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, const double *, const double *, double *, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2(cusparseHandle_t handle, + int m, + int n, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + cuComplex* B, + int ldb, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, const cuComplex *, cuComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + cuDoubleComplex* B, + int ldb, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const float* dl, + const float* d, + const float* du, + const float* B, + int ldb, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, const float *, const float *, const float *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2_nopivot_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const double* dl, + const double* d, + const double* du, + const double* B, + int ldb, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, const double *, const double *, const double *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2_nopivot_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + 
const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* B, + int ldb, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, const cuComplex *, const cuComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2_nopivot_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* B, + int ldb, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2_nopivot_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2_nopivot(cusparseHandle_t handle, + int m, + int n, + const float* dl, + const float* d, + const float* du, + float* B, + int ldb, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, const float *, const float *, float *, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2_nopivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2_nopivot(cusparseHandle_t handle, + int m, + int n, + const double* dl, + const double* d, + const double* du, + double* B, + int ldb, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, const double *, const double *, double *, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2_nopivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2_nopivot(cusparseHandle_t handle, + int m, + int n, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + cuComplex* B, + int ldb, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, const cuComplex *, cuComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2_nopivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2_nopivot(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + cuDoubleComplex* B, + int ldb, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2_nopivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle, + int m, + const float* dl, + const float* d, + const float* du, + const float* x, + 
int batchCount, + int batchStride, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const float *, const float *, const float *, const float *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2StridedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle, + int m, + const double* dl, + const double* d, + const double* du, + const double* x, + int batchCount, + int batchStride, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const double *, const double *, const double *, const double *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2StridedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle, + int m, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* x, + int batchCount, + int batchStride, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuComplex *, const cuComplex *, const cuComplex *, const cuComplex *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2StridedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle, + int m, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* x, + int batchCount, + int batchStride, + size_t* bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2StridedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsv2StridedBatch(cusparseHandle_t handle, + int m, + const float* dl, + const float* d, + const float* du, + float* x, + int batchCount, + int batchStride, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const float *, const float *, const float *, float *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2StridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2StridedBatch(cusparseHandle_t handle, + int m, + const double* dl, + const double* d, + const double* du, + double* x, + int batchCount, + int batchStride, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const double *, const double *, const double *, double *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2StridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, 
batchStride, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsv2StridedBatch(cusparseHandle_t handle, + int m, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + cuComplex* x, + int batchCount, + int batchStride, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuComplex *, const cuComplex *, const cuComplex *, cuComplex *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2StridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsv2StridedBatch(cusparseHandle_t handle, + int m, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + cuDoubleComplex* x, + int batchCount, + int batchStride, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2StridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const float* dl, + const float* d, + const float* du, + const float* x, + int batchCount, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, const float *, const float *, const float *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseSgtsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const double* dl, + const double* d, + const double* du, + const double* x, + int batchCount, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, const double *, const double *, const double *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDgtsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* x, + int batchCount, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, const cuComplex *, const cuComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCgtsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* x, + int batchCount, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuDoubleComplex 
*, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZgtsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgtsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + float* dl, + float* d, + float* du, + float* x, + int batchCount, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, float *, float *, float *, float *, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgtsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + double* dl, + double* d, + double* du, + double* x, + int batchCount, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, double *, double *, double *, double *, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgtsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgtsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + cuComplex* dl, + cuComplex* d, + cuComplex* du, + cuComplex* x, + int batchCount, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *, cuComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgtsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgtsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + cuDoubleComplex* dl, + cuDoubleComplex* d, + cuDoubleComplex* du, + cuDoubleComplex* x, + int batchCount, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgtsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const float* ds, + const float* dl, + const float* d, + const float* du, + const float* dw, + const float* x, + int batchCount, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, const float *, const float *, const float *, const float *, const float *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseSgpsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const double* ds, + const double* dl, + const double* d, + const double* du, + const double* dw, + const double* x, + int batchCount, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI 
*)(cusparseHandle_t, int, int, const double *, const double *, const double *, const double *, const double *, const double *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDgpsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const cuComplex* ds, + const cuComplex* dl, + const cuComplex* d, + const cuComplex* du, + const cuComplex* dw, + const cuComplex* x, + int batchCount, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, const cuComplex *, const cuComplex *, const cuComplex *, const cuComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCgpsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle, + int algo, + int m, + const cuDoubleComplex* ds, + const cuDoubleComplex* dl, + const cuDoubleComplex* d, + const cuDoubleComplex* du, + const cuDoubleComplex* dw, + const cuDoubleComplex* x, + int batchCount, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZgpsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgpsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + float* ds, + float* dl, + float* d, + float* du, + float* dw, + float* x, + int batchCount, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, float *, float *, float *, float *, float *, float *, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgpsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgpsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + double* ds, + double* dl, + double* d, + double* du, + double* dw, + double* x, + int batchCount, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, double *, double *, double *, double *, double *, double *, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgpsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgpsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + cuComplex* ds, + cuComplex* dl, + cuComplex* d, + cuComplex* du, + cuComplex* dw, + cuComplex* x, + int batchCount, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *, cuComplex *, cuComplex *, cuComplex *, int, void *); + static auto func_ptr = 
LoadSymbol("cusparseCgpsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgpsvInterleavedBatch(cusparseHandle_t handle, + int algo, + int m, + cuDoubleComplex* ds, + cuDoubleComplex* dl, + cuDoubleComplex* d, + cuDoubleComplex* du, + cuDoubleComplex* dw, + cuDoubleComplex* x, + int batchCount, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgpsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsrgemm2Info(csrgemm2Info_t* info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csrgemm2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsrgemm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyCsrgemm2Info(csrgemm2Info_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(csrgemm2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsrgemm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrgemm2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const float* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t, int, const int *, const int *, const cusparseMatDescr_t, int, const int *, const int *, const float *, const cusparseMatDescr_t, int, const int *, const int *, csrgemm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrgemm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD, csrSortedColIndD, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrgemm2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const double* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t, int, const int *, const int *, const cusparseMatDescr_t, int, const int *, const int *, const double *, const cusparseMatDescr_t, int, const int *, const int *, csrgemm2Info_t, size_t *); + static auto func_ptr = 
LoadSymbol("cusparseDcsrgemm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD, csrSortedColIndD, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrgemm2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int k, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cuComplex* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuComplex *, const cusparseMatDescr_t, int, const int *, const int *, const cusparseMatDescr_t, int, const int *, const int *, const cuComplex *, const cusparseMatDescr_t, int, const int *, const int *, csrgemm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrgemm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD, csrSortedColIndD, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrgemm2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int k, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cuDoubleComplex* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + csrgemm2Info_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, int, const int *, const int *, const cusparseMatDescr_t, int, const int *, const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int, const int *, const int *, csrgemm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrgemm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD, csrSortedColIndD, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrgemm2Nnz(cusparseHandle_t handle, + int m, + int n, + int k, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrD, + int nnzD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + const csrgemm2Info_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, int, const int *, const int *, const cusparseMatDescr_t, int, const int *, const int *, 
const cusparseMatDescr_t, int, const int *, const int *, const cusparseMatDescr_t, int *, int *, const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseXcsrgemm2Nnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB, descrD, nnzD, csrSortedRowPtrD, csrSortedColIndD, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const float* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const float* csrSortedValD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + const csrgemm2Info_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t, int, const float *, const int *, const int *, const cusparseMatDescr_t, int, const float *, const int *, const int *, const float *, const cusparseMatDescr_t, int, const float *, const int *, const int *, const cusparseMatDescr_t, float *, const int *, int *, const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrgemm2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta, descrD, nnzD, csrSortedValD, csrSortedRowPtrD, csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const double* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const double* csrSortedValD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + const csrgemm2Info_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t, int, const double *, const int *, const int *, const cusparseMatDescr_t, int, const double *, const int *, const int *, const double *, const cusparseMatDescr_t, int, const double *, const int *, const int *, const cusparseMatDescr_t, double *, const int *, int *, const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrgemm2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta, descrD, nnzD, csrSortedValD, csrSortedRowPtrD, 
csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cuComplex* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const cuComplex* csrSortedValD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + cuComplex* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + const csrgemm2Info_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuComplex *, const cusparseMatDescr_t, int, const cuComplex *, const int *, const int *, const cusparseMatDescr_t, int, const cuComplex *, const int *, const int *, const cuComplex *, const cusparseMatDescr_t, int, const cuComplex *, const int *, const int *, const cusparseMatDescr_t, cuComplex *, const int *, int *, const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrgemm2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta, descrD, nnzD, csrSortedValD, csrSortedRowPtrD, csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrgemm2(cusparseHandle_t handle, + int m, + int n, + int k, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cuDoubleComplex* beta, + const cusparseMatDescr_t descrD, + int nnzD, + const cuDoubleComplex* csrSortedValD, + const int* csrSortedRowPtrD, + const int* csrSortedColIndD, + const cusparseMatDescr_t descrC, + cuDoubleComplex* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + const csrgemm2Info_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, const int *, const cusparseMatDescr_t, cuDoubleComplex *, const int *, int *, const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrgemm2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta, descrD, nnzD, csrSortedValD, csrSortedRowPtrD, csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrgeam2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const float* alpha, + const 
cusparseMatDescr_t descrA, + int nnzA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int, const float *, const int *, const int *, const float *, const cusparseMatDescr_t, int, const float *, const int *, const int *, const cusparseMatDescr_t, const float *, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrgeam2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrgeam2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int, const double *, const int *, const int *, const double *, const cusparseMatDescr_t, int, const double *, const int *, const int *, const cusparseMatDescr_t, const double *, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrgeam2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrgeam2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuComplex* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + const cuComplex* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t, int, const cuComplex *, const int *, const int *, const cuComplex *, const cusparseMatDescr_t, int, const cuComplex *, const int *, const int *, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrgeam2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + 
return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrgeam2_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuDoubleComplex* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + const cuDoubleComplex* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, const int *, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrgeam2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrgeam2Nnz(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + int nnzA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrB, + int nnzB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* workspace) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *, const int *, const cusparseMatDescr_t, int, const int *, const int *, const cusparseMatDescr_t, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcsrgeam2Nnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, workspace); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrgeam2(cusparseHandle_t handle, + int m, + int n, + const float* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const float* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int, const float *, const int *, const int *, const float *, const cusparseMatDescr_t, int, const float *, const int *, const int *, const cusparseMatDescr_t, float *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseScsrgeam2"); + if (!func_ptr) 
return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrgeam2(cusparseHandle_t handle, + int m, + int n, + const double* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const double* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int, const double *, const int *, const int *, const double *, const cusparseMatDescr_t, int, const double *, const int *, const int *, const cusparseMatDescr_t, double *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrgeam2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrgeam2(cusparseHandle_t handle, + int m, + int n, + const cuComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuComplex* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + cuComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t, int, const cuComplex *, const int *, const int *, const cuComplex *, const cusparseMatDescr_t, int, const cuComplex *, const int *, const int *, const cusparseMatDescr_t, cuComplex *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrgeam2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrgeam2(cusparseHandle_t handle, + int m, + int n, + const cuDoubleComplex* alpha, + const cusparseMatDescr_t descrA, + int nnzA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cuDoubleComplex* beta, + const cusparseMatDescr_t descrB, + int nnzB, + const cuDoubleComplex* csrSortedValB, + const int* csrSortedRowPtrB, + const int* csrSortedColIndB, + const cusparseMatDescr_t descrC, + cuDoubleComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cuDoubleComplex *, const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, const int *, const cuDoubleComplex *, 
const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, const int *, const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrgeam2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* fractionToColor, + int* ncolors, + int* coloring, + int* reordering, + const cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, const float *, int *, int *, int *, const cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseScsrcolor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, fractionToColor, ncolors, coloring, reordering, info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* fractionToColor, + int* ncolors, + int* coloring, + int* reordering, + const cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, const double *, int *, int *, int *, const cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseDcsrcolor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, fractionToColor, ncolors, coloring, reordering, info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* fractionToColor, + int* ncolors, + int* coloring, + int* reordering, + const cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, const float *, int *, int *, int *, const cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseCcsrcolor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, fractionToColor, ncolors, coloring, reordering, info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsrcolor(cusparseHandle_t handle, + int m, + int nnz, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* fractionToColor, + int* ncolors, + int* coloring, + int* reordering, + const cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, const double *, int *, int *, int *, const cusparseColorInfo_t); + static auto func_ptr = 
LoadSymbol("cusparseZcsrcolor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, fractionToColor, ncolors, coloring, reordering, info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* A, + int lda, + int* nnzPerRowCol, + int* nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const float *, int, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSnnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* A, + int lda, + int* nnzPerRowCol, + int* nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const double *, int, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDnnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* A, + int lda, + int* nnzPerRowCol, + int* nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuComplex *, int, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCnnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZnnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* A, + int lda, + int* nnzPerRowCol, + int* nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, int, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZnnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + int* nnzPerRow, + int* nnzC, + float tol) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cusparseMatDescr_t, const float *, const int *, int *, int *, float); + static auto func_ptr = LoadSymbol("cusparseSnnz_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow, nnzC, tol); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + int* nnzPerRow, + int* nnzC, + double tol) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cusparseMatDescr_t, const double *, const 
int *, int *, int *, double); + static auto func_ptr = LoadSymbol("cusparseDnnz_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow, nnzC, tol); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + int* nnzPerRow, + int* nnzC, + cuComplex tol) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cusparseMatDescr_t, const cuComplex *, const int *, int *, int *, cuComplex); + static auto func_ptr = LoadSymbol("cusparseCnnz_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow, nnzC, tol); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZnnz_compress(cusparseHandle_t handle, + int m, + const cusparseMatDescr_t descr, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + int* nnzPerRow, + int* nnzC, + cuDoubleComplex tol) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, int *, int *, cuDoubleComplex); + static auto func_ptr = LoadSymbol("cusparseZnnz_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow, nnzC, tol); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2csr_compress(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedColIndA, + const int* csrSortedRowPtrA, + int nnzA, + const int* nnzPerRow, + float* csrSortedValC, + int* csrSortedColIndC, + int* csrSortedRowPtrC, + float tol) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, const int *, float *, int *, int *, float); + static auto func_ptr = LoadSymbol("cusparseScsr2csr_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA, csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC, csrSortedColIndC, csrSortedRowPtrC, tol); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2csr_compress(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedColIndA, + const int* csrSortedRowPtrA, + int nnzA, + const int* nnzPerRow, + double* csrSortedValC, + int* csrSortedColIndC, + int* csrSortedRowPtrC, + double tol) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, const int *, double *, int *, int *, double); + static auto func_ptr = LoadSymbol("cusparseDcsr2csr_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA, csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC, csrSortedColIndC, csrSortedRowPtrC, tol); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2csr_compress(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedColIndA, + const int* csrSortedRowPtrA, + int nnzA, + const int* nnzPerRow, + cuComplex* csrSortedValC, + int* csrSortedColIndC, + int* csrSortedRowPtrC, + cuComplex tol) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI 
*)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, const int *, cuComplex *, int *, int *, cuComplex); + static auto func_ptr = LoadSymbol("cusparseCcsr2csr_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA, csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC, csrSortedColIndC, csrSortedRowPtrC, tol); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2csr_compress(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedColIndA, + const int* csrSortedRowPtrA, + int nnzA, + const int* nnzPerRow, + cuDoubleComplex* csrSortedValC, + int* csrSortedColIndC, + int* csrSortedRowPtrC, + cuDoubleComplex tol) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, const int *, cuDoubleComplex *, int *, int *, cuDoubleComplex); + static auto func_ptr = LoadSymbol("cusparseZcsr2csr_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA, csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC, csrSortedColIndC, csrSortedRowPtrC, tol); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSdense2csr(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* A, + int lda, + const int* nnzPerRow, + float* csrSortedValA, + int* csrSortedRowPtrA, + int* csrSortedColIndA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int, const int *, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSdense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDdense2csr(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* A, + int lda, + const int* nnzPerRow, + double* csrSortedValA, + int* csrSortedRowPtrA, + int* csrSortedColIndA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int, const int *, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDdense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCdense2csr(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* A, + int lda, + const int* nnzPerRow, + cuComplex* csrSortedValA, + int* csrSortedRowPtrA, + int* csrSortedColIndA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, int, const int *, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCdense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZdense2csr(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* A, + int lda, + const int* nnzPerRow, + cuDoubleComplex* csrSortedValA, + int* csrSortedRowPtrA, + int* 
csrSortedColIndA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZdense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float* A, + int lda) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, float *, int); + static auto func_ptr = LoadSymbol("cusparseScsr2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + double* A, + int lda) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, double *, int); + static auto func_ptr = LoadSymbol("cusparseDcsr2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + cuComplex* A, + int lda) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cusparseCcsr2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + cuDoubleComplex* A, + int lda) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cusparseZcsr2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSdense2csc(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* A, + int lda, + const int* nnzPerCol, + float* cscSortedValA, + int* cscSortedRowIndA, + int* cscSortedColPtrA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int, const int *, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSdense2csc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, 
cscSortedValA, cscSortedRowIndA, cscSortedColPtrA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDdense2csc(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* A, + int lda, + const int* nnzPerCol, + double* cscSortedValA, + int* cscSortedRowIndA, + int* cscSortedColPtrA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int, const int *, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDdense2csc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA, cscSortedRowIndA, cscSortedColPtrA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCdense2csc(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* A, + int lda, + const int* nnzPerCol, + cuComplex* cscSortedValA, + int* cscSortedRowIndA, + int* cscSortedColPtrA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, int, const int *, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCdense2csc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA, cscSortedRowIndA, cscSortedColPtrA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZdense2csc(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* A, + int lda, + const int* nnzPerCol, + cuDoubleComplex* cscSortedValA, + int* cscSortedRowIndA, + int* cscSortedColPtrA) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZdense2csc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA, cscSortedRowIndA, cscSortedColPtrA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsc2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* cscSortedValA, + const int* cscSortedRowIndA, + const int* cscSortedColPtrA, + float* A, + int lda) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, float *, int); + static auto func_ptr = LoadSymbol("cusparseScsc2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA, cscSortedColPtrA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsc2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* cscSortedValA, + const int* cscSortedRowIndA, + const int* cscSortedColPtrA, + double* A, + int lda) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, double *, int); + static auto func_ptr = LoadSymbol("cusparseDcsc2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA, cscSortedColPtrA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsc2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* cscSortedValA, + const int* cscSortedRowIndA, + const int* cscSortedColPtrA, + cuComplex* A, + int lda) { + using 
FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cusparseCcsc2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA, cscSortedColPtrA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsc2dense(cusparseHandle_t handle, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* cscSortedValA, + const int* cscSortedRowIndA, + const int* cscSortedColPtrA, + cuDoubleComplex* A, + int lda) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cusparseZcsc2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA, cscSortedColPtrA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcoo2csr(cusparseHandle_t handle, + const int* cooRowInd, + int nnz, + int m, + int* csrSortedRowPtr, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseXcoo2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, cooRowInd, nnz, m, csrSortedRowPtr, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsr2coo(cusparseHandle_t handle, + const int* csrSortedRowPtr, + int nnz, + int m, + int* cooRowInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseXcsr2coo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, csrSortedRowPtr, nnz, m, cooRowInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsr2bsrNnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + int* bsrSortedRowPtrC, + int* nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const int *, const int *, int, const cusparseMatDescr_t, int *, int *); + static auto func_ptr = LoadSymbol("cusparseXcsr2bsrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA, csrSortedColIndA, blockDim, descrC, bsrSortedRowPtrC, nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + float* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, const cusparseMatDescr_t, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseScsr2bsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, 
csrSortedColIndA, blockDim, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + double* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, const cusparseMatDescr_t, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDcsr2bsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, blockDim, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, const cusparseMatDescr_t, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCcsr2bsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, blockDim, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2bsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, const cusparseMatDescr_t, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZcsr2bsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, blockDim, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, const cusparseMatDescr_t, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSbsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, 
bsrSortedRowPtrA, bsrSortedColIndA, blockDim, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, const cusparseMatDescr_t, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDbsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, const cusparseMatDescr_t, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCbsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZbsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int blockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, const cusparseMatDescr_t, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZbsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const float *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsc_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI 
+cusparseDgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const double *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsc_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuComplex *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsc_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsc_bufferSize(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsc_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const float *, const int *, const int *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsc_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const double *, const int *, const int *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsc_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex* 
bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuComplex *, const int *, const int *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsc_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *, const int *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsc_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const float* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + float* bscVal, + int* bscRowInd, + int* bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const float *, const int *, const int *, int, int, float *, int *, int *, cusparseAction_t, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd, bscColPtr, copyValues, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const double* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + double* bscVal, + int* bscRowInd, + int* bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const double *, const int *, const int *, int, int, double *, int *, int *, cusparseAction_t, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd, bscColPtr, copyValues, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + cuComplex* bscVal, + int* bscRowInd, + int* bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuComplex *, const int *, const int *, int, int, cuComplex *, int *, int *, cusparseAction_t, 
cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd, bscColPtr, copyValues, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsc(cusparseHandle_t handle, + int mb, + int nb, + int nnzb, + const cuDoubleComplex* bsrSortedVal, + const int* bsrSortedRowPtr, + const int* bsrSortedColInd, + int rowBlockDim, + int colBlockDim, + cuDoubleComplex* bscVal, + int* bscRowInd, + int* bscColPtr, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *, const int *, int, int, cuDoubleComplex *, int *, int *, cusparseAction_t, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd, bscColPtr, copyValues, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const int *, const int *, int, int, const cusparseMatDescr_t, int *, int *); + static auto func_ptr = LoadSymbol("cusparseXgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, int, const cusparseMatDescr_t, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, int, const 
cusparseMatDescr_t, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + cuComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, int, const cusparseMatDescr_t, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2csr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDim, + int colBlockDim, + const cusparseMatDescr_t descrC, + cuDoubleComplex* csrSortedValC, + int* csrSortedRowPtrC, + int* csrSortedColIndC) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseScsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, int, int *); + static 
auto func_ptr = LoadSymbol("cusparseDcsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseCcsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseZcsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const 
cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + int rowBlockDim, + int colBlockDim, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsr2gebsrNnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + int* bsrSortedRowPtrC, + int rowBlockDim, + int colBlockDim, + int* nnzTotalDevHostPtr, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const int *, const int *, const cusparseMatDescr_t, int *, int, int, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcsr2gebsrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA, csrSortedColIndA, descrC, bsrSortedRowPtrC, rowBlockDim, colBlockDim, nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + float* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, const cusparseMatDescr_t, float *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseScsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + double* 
bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, const cusparseMatDescr_t, double *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseDcsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + cuComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseCcsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int m, + int n, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const cusparseMatDescr_t descrC, + cuDoubleComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDim, + int colBlockDim, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseZcsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI 
+cusparseDgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsr_bufferSize(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + int* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, int, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, rowBlockDimC, colBlockDimC, 
pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, int, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, int, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsr_bufferSizeExt(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + int rowBlockDimC, + int colBlockDimC, + size_t* pBufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, int, int, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXgebsr2gebsrNnz(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + int* bsrSortedRowPtrC, + int rowBlockDimC, + int colBlockDimC, + int* nnzTotalDevHostPtr, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const int *, const int *, int, int, const cusparseMatDescr_t, int *, int, int, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXgebsr2gebsrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, 
bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC, bsrSortedRowPtrC, rowBlockDimC, colBlockDimC, nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgebsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const float* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + float* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDimC, + int colBlockDimC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, int, int, const cusparseMatDescr_t, float *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgebsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const double* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + double* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDimC, + int colBlockDimC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, int, int, const cusparseMatDescr_t, double *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgebsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + const cuComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + cuComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDimC, + int colBlockDimC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const cuComplex *, const int *, const int *, int, int, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgebsr2gebsr(cusparseHandle_t handle, + cusparseDirection_t dirA, + int mb, + int nb, + int nnzb, + const cusparseMatDescr_t descrA, + 
const cuDoubleComplex* bsrSortedValA, + const int* bsrSortedRowPtrA, + const int* bsrSortedColIndA, + int rowBlockDimA, + int colBlockDimA, + const cusparseMatDescr_t descrC, + cuDoubleComplex* bsrSortedValC, + int* bsrSortedRowPtrC, + int* bsrSortedColIndC, + int rowBlockDimC, + int colBlockDimC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseDirection_t, int, int, int, const cusparseMatDescr_t, const cuDoubleComplex *, const int *, const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC, bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateIdentityPermutation(cusparseHandle_t handle, + int n, + int* p) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int *); + static auto func_ptr = LoadSymbol("cusparseCreateIdentityPermutation"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, p); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcoosort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* cooRowsA, + const int* cooColsA, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseXcoosort_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcoosortByRow(cusparseHandle_t handle, + int m, + int n, + int nnz, + int* cooRowsA, + int* cooColsA, + int* P, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, int *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcoosortByRow"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcoosortByColumn(cusparseHandle_t handle, + int m, + int n, + int nnz, + int* cooRowsA, + int* cooColsA, + int* P, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, int *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcoosortByColumn"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrsort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* csrRowPtrA, + const int* csrColIndA, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseXcsrsort_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrRowPtrA, csrColIndA, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcsrsort(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + const int* csrRowPtrA, + int* csrColIndA, + int* P, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI 
*)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcsrsort"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrRowPtrA, csrColIndA, P, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcscsort_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + const int* cscColPtrA, + const int* cscRowIndA, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseXcscsort_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, cscColPtrA, cscRowIndA, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseXcscsort(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + const int* cscColPtrA, + int* cscRowIndA, + int* P, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcscsort"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, cscColPtrA, cscRowIndA, P, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + float* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, float *, const int *, int *, csru2csrInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsru2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + double* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, double *, const int *, int *, csru2csrInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsru2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + cuComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, cuComplex *, const int *, int *, csru2csrInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsru2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsru2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnz, + cuDoubleComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, cuDoubleComplex *, const int *, int *, csru2csrInfo_t, size_t *); + static 
auto func_ptr = LoadSymbol("cusparseZcsru2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + float* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsru2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + double* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsru2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsru2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsru2csr(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsru2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseScsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + float* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsr2csru"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + double* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t 
(CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsr2csru"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsr2csru"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZcsr2csru(cusparseHandle_t handle, + int m, + int n, + int nnz, + const cusparseMatDescr_t descrA, + cuDoubleComplex* csrVal, + const int* csrRowPtr, + int* csrColInd, + csru2csrInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsr2csru"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + const float* threshold, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, int, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpruneDense2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + const double* threshold, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, int, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseDpruneDense2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csrNnz(cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + const float* threshold, + const cusparseMatDescr_t descrC, + int* csrRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, int, const float *, const cusparseMatDescr_t, int *, int 
*, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneDense2csrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC, nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csrNnz(cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + const double* threshold, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, int, const double *, const cusparseMatDescr_t, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneDense2csrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csr(cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + const float* threshold, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, int, const float *, const cusparseMatDescr_t, float *, const int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneDense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csr(cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + const double* threshold, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, int, const double *, const cusparseMatDescr_t, double *, const int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneDense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* threshold, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, const float *, const cusparseMatDescr_t, const float *, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpruneCsr2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, threshold, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csr_bufferSizeExt(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, 
+ const double* threshold, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, const double *, const cusparseMatDescr_t, const double *, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseDpruneCsr2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, threshold, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csrNnz(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* threshold, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, const float *, const cusparseMatDescr_t, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneCsr2csrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, threshold, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI + cusparseDpruneCsr2csrNnz(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* threshold, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, const double *, const cusparseMatDescr_t, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneCsr2csrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, threshold, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csr(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const float* threshold, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, const float *, const cusparseMatDescr_t, float *, const int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneCsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, threshold, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csr(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t 
descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + const double* threshold, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, const double *, const cusparseMatDescr_t, double *, const int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneCsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, threshold, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, int, float, const cusparseMatDescr_t, const float *, const int *, const int *, pruneInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpruneDense2csrByPercentage_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, int, float, const cusparseMatDescr_t, const double *, const int *, const int *, pruneInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDpruneDense2csrByPercentage_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + int* csrRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, int, float, const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneDense2csrNnzByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC, nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + int* csrRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, int, float, 
const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneDense2csrNnzByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC, nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneDense2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + const float* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const float *, int, float, const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneDense2csrByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneDense2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + const double* A, + int lda, + float percentage, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, const double *, int, float, const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneDense2csrByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + const float* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, float, const cusparseMatDescr_t, const float *, const int *, const int *, pruneInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpruneCsr2csrByPercentage_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, percentage, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + const double* csrSortedValC, + const int* csrSortedRowPtrC, + const int* csrSortedColIndC, + pruneInfo_t info, + size_t* pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, float, const cusparseMatDescr_t, const double *, const int *, const int 
*, pruneInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDpruneCsr2csrByPercentage_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, percentage, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, float, const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneCsr2csrNnzByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, percentage, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + int* csrSortedRowPtrC, + int* nnzTotalDevHostPtr, + pruneInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, float, const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneCsr2csrNnzByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, percentage, descrC, csrSortedRowPtrC, nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpruneCsr2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const float* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + float* csrSortedValC, + const int* csrSortedRowPtrC, + int* csrSortedColIndC, + pruneInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, const int *, const int *, float, const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneCsr2csrByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, percentage, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDpruneCsr2csrByPercentage(cusparseHandle_t handle, + int m, + int n, + int nnzA, + const cusparseMatDescr_t descrA, + const double* csrSortedValA, + const int* csrSortedRowPtrA, + const int* csrSortedColIndA, + float percentage, + const cusparseMatDescr_t descrC, + double* csrSortedValC, + const int* csrSortedRowPtrC, + int* 
csrSortedColIndC, + pruneInfo_t info, + void* pBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, const int *, const int *, float, const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneCsr2csrByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, percentage, descrC, csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCsr2cscEx2(cusparseHandle_t handle, + int m, + int n, + int nnz, + const void* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cudaDataType valType, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + void* buffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const void *, const int *, const int *, void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t, cusparseCsr2CscAlg_t, void *); + static auto func_ptr = LoadSymbol("cusparseCsr2cscEx2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd, valType, copyValues, idxBase, alg, buffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCsr2cscEx2_bufferSize(cusparseHandle_t handle, + int m, + int n, + int nnz, + const void* csrVal, + const int* csrRowPtr, + const int* csrColInd, + void* cscVal, + int* cscColPtr, + int* cscRowInd, + cudaDataType valType, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, + size_t* bufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, int, int, int, const void *, const int *, const int *, void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t, cusparseCsr2CscAlg_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCsr2cscEx2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, cscColPtr, cscRowInd, valType, copyValues, idxBase, alg, bufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateSpVec(cusparseSpVecDescr_t* spVecDescr, + int64_t size, + int64_t nnz, + void* indices, + void* values, + cusparseIndexType_t idxType, + cusparseIndexBase_t idxBase, + cudaDataType valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpVecDescr_t *, int64_t, int64_t, void *, void *, cusparseIndexType_t, cusparseIndexBase_t, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateSpVec"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroySpVec(cusparseSpVecDescr_t spVecDescr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpVecDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroySpVec"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecGet(cusparseSpVecDescr_t spVecDescr, + int64_t* size, + int64_t* nnz, + void** indices, + void** values, + cusparseIndexType_t* idxType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI 
*)(cusparseSpVecDescr_t, int64_t *, int64_t *, void **, void **, cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseSpVecGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecGetIndexBase(cusparseSpVecDescr_t spVecDescr, + cusparseIndexBase_t* idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpVecDescr_t, cusparseIndexBase_t *); + static auto func_ptr = LoadSymbol("cusparseSpVecGetIndexBase"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecGetValues(cusparseSpVecDescr_t spVecDescr, + void** values) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpVecDescr_t, void **); + static auto func_ptr = LoadSymbol("cusparseSpVecGetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr, + void* values) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpVecDescr_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpVecSetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateDnVec(cusparseDnVecDescr_t* dnVecDescr, + int64_t size, + void* values, + cudaDataType valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnVecDescr_t *, int64_t, void *, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateDnVec"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr, size, values, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyDnVec(cusparseDnVecDescr_t dnVecDescr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnVecDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroyDnVec"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnVecGet(cusparseDnVecDescr_t dnVecDescr, + int64_t* size, + void** values, + cudaDataType* valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnVecDescr_t, int64_t *, void **, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseDnVecGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr, size, values, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnVecGetValues(cusparseDnVecDescr_t dnVecDescr, + void** values) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnVecDescr_t, void **); + static auto func_ptr = LoadSymbol("cusparseDnVecGetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr, + void* values) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnVecDescr_t, void *); + static auto func_ptr = LoadSymbol("cusparseDnVecSetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroySpMat(cusparseSpMatDescr_t spMatDescr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroySpMat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr); +} + 
+cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetFormat(cusparseSpMatDescr_t spMatDescr, + cusparseFormat_t* format) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, cusparseFormat_t *); + static auto func_ptr = LoadSymbol("cusparseSpMatGetFormat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, format); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetIndexBase(cusparseSpMatDescr_t spMatDescr, + cusparseIndexBase_t* idxBase) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, cusparseIndexBase_t *); + static auto func_ptr = LoadSymbol("cusparseSpMatGetIndexBase"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetValues(cusparseSpMatDescr_t spMatDescr, + void** values) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, void **); + static auto func_ptr = LoadSymbol("cusparseSpMatGetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr, + void* values) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpMatSetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetSize(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *); + static auto func_ptr = LoadSymbol("cusparseSpMatGetSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatSetStridedBatch(cusparseSpMatDescr_t spMatDescr, + int batchCount) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, int); + static auto func_ptr = LoadSymbol("cusparseSpMatSetStridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, batchCount); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetStridedBatch(cusparseSpMatDescr_t spMatDescr, + int* batchCount) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, int *); + static auto func_ptr = LoadSymbol("cusparseSpMatGetStridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, batchCount); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCsr(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + int64_t nnz, + void* csrRowOffsets, + void* csrColInd, + void* csrValues, + cusparseIndexType_t csrRowOffsetsType, + cusparseIndexType_t csrColIndType, + cusparseIndexBase_t idxBase, + cudaDataType valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *, cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateCsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd, csrValues, csrRowOffsetsType, csrColIndType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCsrGet(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz, + void** csrRowOffsets, + void** csrColInd, + void** csrValues, + 
cusparseIndexType_t* csrRowOffsetsType, + cusparseIndexType_t* csrColIndType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **, void **, cusparseIndexType_t *, cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseCsrGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd, csrValues, csrRowOffsetsType, csrColIndType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCsrSetPointers(cusparseSpMatDescr_t spMatDescr, + void* csrRowOffsets, + void* csrColInd, + void* csrValues) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, void *, void *, void *); + static auto func_ptr = LoadSymbol("cusparseCsrSetPointers"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, csrRowOffsets, csrColInd, csrValues); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCoo(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + int64_t nnz, + void* cooRowInd, + void* cooColInd, + void* cooValues, + cusparseIndexType_t cooIdxType, + cusparseIndexBase_t idxBase, + cudaDataType valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *, cusparseIndexType_t, cusparseIndexBase_t, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateCoo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues, cooIdxType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateCooAoS(cusparseSpMatDescr_t* spMatDescr, + int64_t rows, + int64_t cols, + int64_t nnz, + void* cooInd, + void* cooValues, + cusparseIndexType_t cooIdxType, + cusparseIndexBase_t idxBase, + cudaDataType valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, cusparseIndexType_t, cusparseIndexBase_t, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateCooAoS"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, cooIdxType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCooGet(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz, + void** cooRowInd, // COO row indices + void** cooColInd, // COO column indices + void** cooValues, // COO values + cusparseIndexType_t* idxType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, // COO row indices + void **, // COO column indices + void **, // COO values + cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseCooGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues, idxType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCooAoSGet(cusparseSpMatDescr_t spMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* nnz, + void** cooInd, // COO indices + void** cooValues, // COO values + cusparseIndexType_t* idxType, + cusparseIndexBase_t* idxBase, + cudaDataType* valueType) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI 
*)(cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, // COO indices + void **, // COO values + cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseCooAoSGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, idxType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateDnMat(cusparseDnMatDescr_t* dnMatDescr, + int64_t rows, + int64_t cols, + int64_t ld, + void* values, + cudaDataType valueType, + cusparseOrder_t order) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnMatDescr_t *, int64_t, int64_t, int64_t, void *, cudaDataType, cusparseOrder_t); + static auto func_ptr = LoadSymbol("cusparseCreateDnMat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, rows, cols, ld, values, valueType, order); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyDnMat(cusparseDnMatDescr_t dnMatDescr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnMatDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroyDnMat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatGet(cusparseDnMatDescr_t dnMatDescr, + int64_t* rows, + int64_t* cols, + int64_t* ld, + void** values, + cudaDataType* type, + cusparseOrder_t* order) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnMatDescr_t, int64_t *, int64_t *, int64_t *, void **, cudaDataType *, cusparseOrder_t *); + static auto func_ptr = LoadSymbol("cusparseDnMatGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, rows, cols, ld, values, type, order); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatGetValues(cusparseDnMatDescr_t dnMatDescr, + void** values) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnMatDescr_t, void **); + static auto func_ptr = LoadSymbol("cusparseDnMatGetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr, + void* values) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnMatDescr_t, void *); + static auto func_ptr = LoadSymbol("cusparseDnMatSetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatSetStridedBatch(cusparseDnMatDescr_t dnMatDescr, + int batchCount, + int64_t batchStride) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnMatDescr_t, int, int64_t); + static auto func_ptr = LoadSymbol("cusparseDnMatSetStridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, batchCount, batchStride); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatGetStridedBatch(cusparseDnMatDescr_t dnMatDescr, + int* batchCount, + int64_t* batchStride) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseDnMatDescr_t, int *, int64_t *); + static auto func_ptr = LoadSymbol("cusparseDnMatGetStridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, batchCount, batchStride); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVV_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opX, + cusparseSpVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + const void* result, + cudaDataType computeType, + size_t* bufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI 
*)(cusparseHandle_t, cusparseOperation_t, cusparseSpVecDescr_t, cusparseDnVecDescr_t, const void *, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpVV_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opX, vecX, vecY, result, computeType, bufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVV(cusparseHandle_t handle, + cusparseOperation_t opX, + cusparseSpVecDescr_t vecX, + cusparseDnVecDescr_t vecY, + void* result, + cudaDataType computeType, + void* externalBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, cusparseSpVecDescr_t, cusparseDnVecDescr_t, void *, cudaDataType, void *); + static auto func_ptr = LoadSymbol("cusparseSpVV"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opX, vecX, vecY, result, computeType, externalBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMV(cusparseHandle_t handle, + cusparseOperation_t opA, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + const void* beta, + cusparseDnVecDescr_t vecY, + cudaDataType computeType, + cusparseSpMVAlg_t alg, + void* externalBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseDnVecDescr_t, const void *, cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpMV"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg, externalBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMV_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnVecDescr_t vecX, + const void* beta, + cusparseDnVecDescr_t vecY, + cudaDataType computeType, + cusparseSpMVAlg_t alg, + size_t* bufferSize) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseDnVecDescr_t, const void *, cusparseDnVecDescr_t, cudaDataType, cusparseSpMVAlg_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpMV_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg, bufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMM(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpMMAlg_t alg, + void* externalBuffer) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseDnMatDescr_t, const void *, cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpMM"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, alg, externalBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMM_bufferSize(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseDnMatDescr_t matB, + const void* beta, + cusparseDnMatDescr_t matC, + cudaDataType computeType, + cusparseSpMMAlg_t alg, + size_t* bufferSize) { + using FuncPtr = cusparseStatus_t 
(CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseDnMatDescr_t, const void *, cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpMM_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, alg, bufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_createDescr(cusparseSpGEMMDescr_t* descr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpGEMMDescr_t *); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_createDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_destroyDescr(cusparseSpGEMMDescr_t descr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseSpGEMMDescr_t); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_destroyDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_workEstimation(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr, + size_t* bufferSize1, + void* externalBuffer1) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *, cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t, cusparseSpGEMMDescr_t, size_t *, void *); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_workEstimation"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, alg, spgemmDescr, bufferSize1, externalBuffer1); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_compute(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr, + size_t* bufferSize2, + void* externalBuffer2) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *, cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t, cusparseSpGEMMDescr_t, size_t *, void *); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_compute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, alg, spgemmDescr, bufferSize2, externalBuffer2); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_copy(cusparseHandle_t handle, + cusparseOperation_t opA, + cusparseOperation_t opB, + const void* alpha, + cusparseSpMatDescr_t matA, + cusparseSpMatDescr_t matB, + const void* beta, + cusparseSpMatDescr_t matC, + cudaDataType computeType, + cusparseSpGEMMAlg_t alg, + cusparseSpGEMMDescr_t spgemmDescr) { + using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *, cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t, cusparseSpGEMMDescr_t); 
+  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseSpGEMM_copy");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, alg, spgemmDescr);
+}
+
+cusparseStatus_t CUSPARSEAPI
+cusparseConstrainedGeMM(cusparseHandle_t handle,
+                        cusparseOperation_t opA,
+                        cusparseOperation_t opB,
+                        const void* alpha,
+                        cusparseDnMatDescr_t matA,
+                        cusparseDnMatDescr_t matB,
+                        const void* beta,
+                        cusparseSpMatDescr_t matC,
+                        cudaDataType computeType,
+                        void* externalBuffer) {
+  using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *, cusparseSpMatDescr_t, cudaDataType, void *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstrainedGeMM");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, externalBuffer);
+}
+
+cusparseStatus_t CUSPARSEAPI
+cusparseConstrainedGeMM_bufferSize(cusparseHandle_t handle,
+                                   cusparseOperation_t opA,
+                                   cusparseOperation_t opB,
+                                   const void* alpha,
+                                   cusparseDnMatDescr_t matA,
+                                   cusparseDnMatDescr_t matB,
+                                   const void* beta,
+                                   cusparseSpMatDescr_t matC,
+                                   cudaDataType computeType,
+                                   size_t* bufferSize) {
+  using FuncPtr = cusparseStatus_t (CUSPARSEAPI *)(cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *, cusparseSpMatDescr_t, cudaDataType, size_t *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cusparseConstrainedGeMM_bufferSize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, bufferSize);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cusparse_stub.cc b/tensorflow/stream_executor/cuda/cusparse_stub.cc
index 783b034d7b6..bb4a922c30f 100644
--- a/tensorflow/stream_executor/cuda/cusparse_stub.cc
+++ b/tensorflow/stream_executor/cuda/cusparse_stub.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "third_party/gpus/cuda/include/cuda.h"
 #include "third_party/gpus/cuda/include/cusparse.h"
+#include "third_party/gpus/cuda/include/cuda.h"
 
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/platform/dso_loader.h"
@@ -59,7 +60,7 @@ cusparseStatus_t GetSymbolNotFoundError() {
 #include "tensorflow/stream_executor/cuda/cusparse_10_1.inc"
 #elif CUDA_VERSION == 10020
 #include "tensorflow/stream_executor/cuda/cusparse_10_2.inc"
-#elif CUDA_VERSION == 11000
+#elif CUSPARSE_VER_MAJOR == 11 && CUSPARSE_VER_MINOR == 0
 #include "tensorflow/stream_executor/cuda/cusparse_11_0.inc"
 #else
 #error "We don't have a wrapper for this version."
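Every wrapper in the generated cusparse_11_0.inc above (and in the cudnn/cublas equivalents later in this series) follows the same lazy-dispatch pattern: the first call resolves the real entry point from the shared library that dso_loader opened, caches it in a function-local static, and all later calls go straight through the cached pointer; a missing symbol returns a library-specific "not found" status instead of failing at link time. The sketch below illustrates that pattern in isolation; it is not the TensorFlow implementation, and the helpers OpenCusparseLibrary and ResolveSymbol, as well as the "libcusparse.so.11" name, are hypothetical stand-ins for the real GetDsoHandle/LoadSymbol plumbing in stream_executor.

#include <dlfcn.h>

#include <cusparse.h>  // in TensorFlow this is "third_party/gpus/cuda/include/cusparse.h"

// Hypothetical stand-in for dso_loader: open the versioned cuSPARSE shared
// object once and reuse the handle for every lookup.
static void* OpenCusparseLibrary() {
  static void* handle = dlopen("libcusparse.so.11", RTLD_NOW | RTLD_LOCAL);
  return handle;
}

// Hypothetical stand-in for LoadSymbol<FuncPtr>(name): resolve a symbol from
// the library handle and cast it to the expected function-pointer type.
template <typename FuncPtr>
static FuncPtr ResolveSymbol(const char* name) {
  void* lib = OpenCusparseLibrary();
  return lib ? reinterpret_cast<FuncPtr>(dlsym(lib, name)) : nullptr;
}

// Shape of every generated wrapper: resolve once, cache in a static, forward.
cusparseStatus_t cusparseSpMatGetFormat(cusparseSpMatDescr_t spMatDescr,
                                        cusparseFormat_t* format) {
  using FuncPtr = cusparseStatus_t (*)(cusparseSpMatDescr_t, cusparseFormat_t*);
  static auto func_ptr = ResolveSymbol<FuncPtr>("cusparseSpMatGetFormat");
  // Stands in for GetSymbolNotFoundError() in the generated code.
  if (!func_ptr) return CUSPARSE_STATUS_NOT_INITIALIZED;
  return func_ptr(spMatDescr, format);
}

Because nothing is resolved until the first call, a binary built this way only needs the CUDA headers at compile time; the actual library is located and opened at run time, which is what the dso_loader changes in the next diff make version-aware per library.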
diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc
index 9ae8b41ccf4..fb7d88aaedb 100644
--- a/tensorflow/stream_executor/platform/default/dso_loader.cc
+++ b/tensorflow/stream_executor/platform/default/dso_loader.cc
@@ -31,8 +31,12 @@ namespace internal {
 
 namespace {
 string GetCudaVersion() { return TF_CUDA_VERSION; }
-string GetCudaLibVersion() { return TF_CUDA_LIB_VERSION; }
 string GetCudnnVersion() { return TF_CUDNN_VERSION; }
+string GetCublasVersion() { return TF_CUBLAS_VERSION; }
+string GetCusolverVersion() { return TF_CUSOLVER_VERSION; }
+string GetCurandVersion() { return TF_CURAND_VERSION; }
+string GetCufftVersion() { return TF_CUFFT_VERSION; }
+string GetCusparseVersion() { return TF_CUSPARSE_VERSION; }
 string GetTensorRTVersion() { return TF_TENSORRT_VERSION; }
 
 port::StatusOr<void*> GetDsoHandle(const string& name, const string& version) {
@@ -77,23 +81,23 @@ port::StatusOr<void*> GetCudaRuntimeDsoHandle() {
 }
 
 port::StatusOr<void*> GetCublasDsoHandle() {
-  return GetDsoHandle("cublas", GetCudaLibVersion());
+  return GetDsoHandle("cublas", GetCublasVersion());
 }
 
 port::StatusOr<void*> GetCufftDsoHandle() {
-  return GetDsoHandle("cufft", GetCudaLibVersion());
+  return GetDsoHandle("cufft", GetCufftVersion());
 }
 
 port::StatusOr<void*> GetCusolverDsoHandle() {
-  return GetDsoHandle("cusolver", GetCudaLibVersion());
+  return GetDsoHandle("cusolver", GetCusolverVersion());
 }
 
 port::StatusOr<void*> GetCusparseDsoHandle() {
-  return GetDsoHandle("cusparse", GetCudaLibVersion());
+  return GetDsoHandle("cusparse", GetCusparseVersion());
 }
 
 port::StatusOr<void*> GetCurandDsoHandle() {
-  return GetDsoHandle("curand", GetCudaLibVersion());
+  return GetDsoHandle("curand", GetCurandVersion());
 }
 
 port::StatusOr<void*> GetCuptiDsoHandle() {
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index 92586dd7d11..6b5318f60c2 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -84,6 +84,42 @@ cuda_header_library(
     includes = ["cublas/include"],
 )
 
+cuda_header_library(
+    name = "cusolver_headers",
+    hdrs = [":cusolver-include"],
+    include_prefix = "third_party/gpus/cuda/include",
+    strip_include_prefix = "cusolver/include",
+    deps = [":cuda_headers"],
+    includes = ["cusolver/include"],
+)
+
+cuda_header_library(
+    name = "cufft_headers",
+    hdrs = [":cufft-include"],
+    include_prefix = "third_party/gpus/cuda/include",
+    strip_include_prefix = "cufft/include",
+    deps = [":cuda_headers"],
+    includes = ["cufft/include"],
+)
+
+cuda_header_library(
+    name = "cusparse_headers",
+    hdrs = [":cusparse-include"],
+    include_prefix = "third_party/gpus/cuda/include",
+    strip_include_prefix = "cusparse/include",
+    deps = [":cuda_headers"],
+    includes = ["cusparse/include"],
+)
+
+cuda_header_library(
+    name = "curand_headers",
+    hdrs = [":curand-include"],
+    include_prefix = "third_party/gpus/cuda/include",
+    strip_include_prefix = "curand/include",
+    deps = [":cuda_headers"],
+    includes = ["curand/include"],
+)
+
 cc_library(
     name = "cublas",
     srcs = ["cuda/lib/%{cublas_lib}"],
diff --git a/third_party/gpus/cuda/BUILD.windows.tpl b/third_party/gpus/cuda/BUILD.windows.tpl
index f91c71f74a1..75d360fa17e 100644
--- a/third_party/gpus/cuda/BUILD.windows.tpl
+++ b/third_party/gpus/cuda/BUILD.windows.tpl
@@ -85,6 +85,42 @@ cuda_header_library(
     deps = [":cuda_headers"],
 )
 
+cuda_header_library(
+    name = "cusolver_headers",
+    hdrs = [":cusolver-include"],
+    include_prefix = "third_party/gpus/cuda/include",
+    includes = ["cusolver/include"],
+
strip_include_prefix = "cusolver/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cufft_headers", + hdrs = [":cufft-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cufft/include"], + strip_include_prefix = "cufft/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "cusparse_headers", + hdrs = [":cusparse-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["cusparse/include"], + strip_include_prefix = "cusparse/include", + deps = [":cuda_headers"], +) + +cuda_header_library( + name = "curand_headers", + hdrs = [":curand-include"], + include_prefix = "third_party/gpus/cuda/include", + includes = ["curand/include"], + strip_include_prefix = "curand/include", + deps = [":cuda_headers"], +) + cc_import( name = "cublas", interface_library = "cuda/lib/%{cublas_lib}", diff --git a/third_party/gpus/cuda/cuda_config.h.tpl b/third_party/gpus/cuda/cuda_config.h.tpl index dbd846307bb..b59889938b1 100644 --- a/third_party/gpus/cuda/cuda_config.h.tpl +++ b/third_party/gpus/cuda/cuda_config.h.tpl @@ -17,7 +17,11 @@ limitations under the License. #define CUDA_CUDA_CONFIG_H_ #define TF_CUDA_VERSION "%{cuda_version}" -#define TF_CUDA_LIB_VERSION "%{cuda_lib_version}" +#define TF_CUBLAS_VERSION "%{cublas_version}" +#define TF_CUSOLVER_VERSION "%{cusolver_version}" +#define TF_CURAND_VERSION "%{curand_version}" +#define TF_CUFFT_VERSION "%{cufft_version}" +#define TF_CUSPARSE_VERSION "%{cusparse_version}" #define TF_CUDNN_VERSION "%{cudnn_version}" #define TF_CUDA_TOOLKIT_PATH "%{cuda_toolkit_path}" diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index aa8a2f0226d..7e5b159eab6 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -527,28 +527,28 @@ def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): "cublas", cpu_value, cuda_config.config["cublas_library_dir"], - cuda_config.cuda_lib_version, + cuda_config.cublas_version, static = False, ), "cusolver": _check_cuda_lib_params( "cusolver", cpu_value, - cuda_config.config["cuda_library_dir"], - cuda_config.cuda_lib_version, + cuda_config.config["cusolver_library_dir"], + cuda_config.cusolver_version, static = False, ), "curand": _check_cuda_lib_params( "curand", cpu_value, - cuda_config.config["cuda_library_dir"], - cuda_config.cuda_lib_version, + cuda_config.config["curand_library_dir"], + cuda_config.curand_version, static = False, ), "cufft": _check_cuda_lib_params( "cufft", cpu_value, - cuda_config.config["cuda_library_dir"], - cuda_config.cuda_lib_version, + cuda_config.config["cufft_library_dir"], + cuda_config.cufft_version, static = False, ), "cudnn": _check_cuda_lib_params( @@ -568,8 +568,8 @@ def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): "cusparse": _check_cuda_lib_params( "cusparse", cpu_value, - cuda_config.config["cuda_library_dir"], - cuda_config.cuda_lib_version, + cuda_config.config["cusparse_library_dir"], + cuda_config.cusparse_version, static = False, ), } @@ -646,18 +646,37 @@ def _get_cuda_config(repository_ctx, find_cuda_config_script): cuda_version = ("64_%s%s" if is_windows else "%s.%s") % (cuda_major, cuda_minor) cudnn_version = ("64_%s" if is_windows else "%s") % config["cudnn_version"] - # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc. - # It changed from 'x.y' to just 'x' in CUDA 10.1. 
- if (int(cuda_major), int(cuda_minor)) >= (10, 1): + if int(cuda_major) >= 11: + cublas_version = ("64_%s" if is_windows else "%s") % config["cublas_version"].split(".")[0] + cusolver_version = ("64_%s" if is_windows else "%s") % config["cusolver_version"].split(".")[0] + curand_version = ("64_%s" if is_windows else "%s") % config["curand_version"].split(".")[0] + cufft_version = ("64_%s" if is_windows else "%s") % config["cufft_version"].split(".")[0] + cusparse_version = ("64_%s" if is_windows else "%s") % config["cusparse_version"].split(".")[0] + elif (int(cuda_major), int(cuda_minor)) >= (10, 1): + # cuda_lib_version is for libraries like cuBLAS, cuFFT, cuSOLVER, etc. + # It changed from 'x.y' to just 'x' in CUDA 10.1. cuda_lib_version = ("64_%s" if is_windows else "%s") % cuda_major + cublas_version = cuda_lib_version + cusolver_version = cuda_lib_version + curand_version = cuda_lib_version + cufft_version = cuda_lib_version + cusparse_version = cuda_lib_version else: - cuda_lib_version = cuda_version + cublas_version = cuda_version + cusolver_version = cuda_version + curand_version = cuda_version + cufft_version = cuda_version + cusparse_version = cuda_version return struct( cuda_toolkit_path = toolkit_path, cuda_version = cuda_version, + cublas_version = cublas_version, + cusolver_version = cusolver_version, + curand_version = curand_version, + cufft_version = cufft_version, + cusparse_version = cusparse_version, cudnn_version = cudnn_version, - cuda_lib_version = cuda_lib_version, compute_capabilities = compute_capabilities(repository_ctx), cpu_value = cpu_value, config = config, @@ -739,6 +758,10 @@ def _create_dummy_repository(repository_ctx): "%{copy_rules}": """ filegroup(name="cuda-include") filegroup(name="cublas-include") +filegroup(name="cusolver-include") +filegroup(name="cufft-include") +filegroup(name="cusparse-include") +filegroup(name="curand-include") filegroup(name="cudnn-include") """, }, @@ -770,7 +793,11 @@ filegroup(name="cudnn-include") "cuda:cuda_config.h", { "%{cuda_version}": "", - "%{cuda_lib_version}": "", + "%{cublas_version}": "", + "%{cusolver_version}": "", + "%{curand_version}": "", + "%{cufft_version}": "", + "%{cusparse_version}": "", "%{cudnn_version}": "", "%{cuda_toolkit_path}": "", }, @@ -935,6 +962,56 @@ def _create_local_cuda_repository(repository_ctx): ], )) + cusolver_include_path = cuda_config.config["cusolver_include_dir"] + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "cusolver-include", + srcs = [ + cusolver_include_path + "/cusolver_common.h", + cusolver_include_path + "/cusolverDn.h", + ], + outs = [ + "cusolver/include/cusolver_common.h", + "cusolver/include/cusolverDn.h", + ], + )) + + cufft_include_path = cuda_config.config["cufft_include_dir"] + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "cufft-include", + srcs = [ + cufft_include_path + "/cufft.h", + ], + outs = [ + "cufft/include/cufft.h", + ], + )) + + cusparse_include_path = cuda_config.config["cusparse_include_dir"] + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "cusparse-include", + srcs = [ + cusparse_include_path + "/cusparse.h", + ], + outs = [ + "cusparse/include/cusparse.h", + ], + )) + + curand_include_path = cuda_config.config["curand_include_dir"] + copy_rules.append(make_copy_files_rule( + repository_ctx, + name = "curand-include", + srcs = [ + curand_include_path + "/curand.h", + ], + outs = [ + "curand/include/curand.h", + ], + )) + check_cuda_libs_script = 
repository_ctx.path(Label("@org_tensorflow//third_party/gpus:check_cuda_libs.py")) cuda_libs = _find_libs(repository_ctx, check_cuda_libs_script, cuda_config) cuda_lib_srcs = [] @@ -1143,7 +1220,11 @@ def _create_local_cuda_repository(repository_ctx): tpl_paths["cuda:cuda_config.h"], { "%{cuda_version}": cuda_config.cuda_version, - "%{cuda_lib_version}": cuda_config.cuda_lib_version, + "%{cublas_version}": cuda_config.cublas_version, + "%{cusolver_version}": cuda_config.cusolver_version, + "%{curand_version}": cuda_config.curand_version, + "%{cufft_version}": cuda_config.cufft_version, + "%{cusparse_version}": cuda_config.cusparse_version, "%{cudnn_version}": cuda_config.cudnn_version, "%{cuda_toolkit_path}": cuda_config.cuda_toolkit_path, }, diff --git a/third_party/gpus/find_cuda_config.py b/third_party/gpus/find_cuda_config.py index e2ab42abf67..19ab4a91465 100644 --- a/third_party/gpus/find_cuda_config.py +++ b/third_party/gpus/find_cuda_config.py @@ -318,12 +318,9 @@ def _find_cublas_config(base_paths, required_version, cuda_version): # cuBLAS uses the major version only. cublas_version = header_version.split(".")[0] - if not _matches_version(cuda_version, cublas_version): - raise ConfigError("cuBLAS version %s does not match CUDA version %s" % - (cublas_version, cuda_version)) - else: # There is no version info available before CUDA 10.1, just find the file. + header_version = cuda_version header_path = _find_file(base_paths, _header_paths(), "cublas_api.h") # cuBLAS version is the same as CUDA version (x.y). cublas_version = required_version @@ -331,10 +328,98 @@ def _find_cublas_config(base_paths, required_version, cuda_version): library_path = _find_library(base_paths, "cublas", cublas_version) return { + "cublas_version": header_version, "cublas_include_dir": os.path.dirname(header_path), "cublas_library_dir": os.path.dirname(library_path), } +def _find_cusolver_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("CUSOLVER_VER_MAJOR", "CUSOLVER_VER_MINOR", + "CUSOLVER_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "cusolver_common.h", + required_version, + get_header_version) + cusolver_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cusolver_common.h") + cusolver_version = required_version + + library_path = _find_library(base_paths, "cusolver", cusolver_version) + + return { + "cusolver_version": header_version, + "cusolver_include_dir": os.path.dirname(header_path), + "cusolver_library_dir": os.path.dirname(library_path), + } + +def _find_curand_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("CURAND_VER_MAJOR", "CURAND_VER_MINOR", + "CURAND_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "curand.h", + required_version, + get_header_version) + curand_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "curand.h") + curand_version = required_version + + library_path = _find_library(base_paths, "curand", curand_version) + + return { + "curand_version": header_version, + "curand_include_dir": 
os.path.dirname(header_path), + "curand_library_dir": os.path.dirname(library_path), + } + +def _find_cufft_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("CUFFT_VER_MAJOR", "CUFFT_VER_MINOR", + "CUFFT_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "cufft.h", + required_version, + get_header_version) + cufft_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cufft.h") + cufft_version = required_version + + library_path = _find_library(base_paths, "cufft", cufft_version) + + return { + "cufft_version": header_version, + "cufft_include_dir": os.path.dirname(header_path), + "cufft_library_dir": os.path.dirname(library_path), + } + def _find_cudnn_config(base_paths, required_version): @@ -358,6 +443,36 @@ def _find_cudnn_config(base_paths, required_version): } +def _find_cusparse_config(base_paths, required_version, cuda_version): + + if _at_least_version(cuda_version, "11.0"): + + def get_header_version(path): + version = ( + _get_header_version(path, name) + for name in ("CUSPARSE_VER_MAJOR", "CUSPARSE_VER_MINOR", + "CUSPARSE_VER_PATCH")) + return ".".join(version) + + header_path, header_version = _find_header(base_paths, "cusparse.h", + required_version, + get_header_version) + cusparse_version = header_version.split(".")[0] + + else: + header_version = cuda_version + header_path = _find_file(base_paths, _header_paths(), "cusparse.h") + cusparse_version = required_version + + library_path = _find_library(base_paths, "cusparse", cusparse_version) + + return { + "cusparse_version": header_version, + "cusparse_include_dir": os.path.dirname(header_path), + "cusparse_library_dir": os.path.dirname(library_path), + } + + def _find_nccl_config(base_paths, required_version): def get_header_version(path): @@ -465,6 +580,34 @@ def find_cuda_config(): result.update( _find_cublas_config(cublas_paths, cublas_version, cuda_version)) + cusolver_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cusolver_paths = cuda_paths + cusolver_version = os.environ.get("TF_CUSOLVER_VERSION", "") + result.update( + _find_cusolver_config(cusolver_paths, cusolver_version, cuda_version)) + + curand_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + curand_paths = cuda_paths + curand_version = os.environ.get("TF_CURAND_VERSION", "") + result.update( + _find_curand_config(curand_paths, curand_version, cuda_version)) + + cufft_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cufft_paths = cuda_paths + cufft_version = os.environ.get("TF_CUFFT_VERSION", "") + result.update( + _find_cufft_config(cufft_paths, cufft_version, cuda_version)) + + cusparse_paths = base_paths + if tuple(int(v) for v in cuda_version.split(".")) < (11, 0): + cusparse_paths = cuda_paths + cusparse_version = os.environ.get("TF_CUSPARSE_VERSION", "") + result.update( + _find_cusparse_config(cusparse_paths, cusparse_version, cuda_version)) + if "cudnn" in libraries: cudnn_paths = _get_legacy_path("CUDNN_INSTALL_PATH", base_paths) cudnn_version = os.environ.get("TF_CUDNN_VERSION", "") diff --git a/third_party/gpus/find_cuda_config.py.gz.base64 b/third_party/gpus/find_cuda_config.py.gz.base64 index ee5ddc22044..ae3ee37c077 100644 --- 
a/third_party/gpus/find_cuda_config.py.gz.base64 +++ b/third_party/gpus/find_cuda_config.py.gz.base64 @@ -1 +1 @@ -eJzNXHtz47iR/5+fAseJy6RHpu291FZOF++V157JKvHZU7Zm9lK2o0AkLHFNkQofkpVUvvt1NwASBCm/JlsZVa1HAtFAd+PXLxDYd+w0W27yeDYv2XeHR//FxnPBxiItsvxjkq3ZSVXOs7wI2EmSsCvsVrArUYh8JaLAeee8Y+dxCN1FxKo0Ejkrgf5kyUP4Rz0ZsC8iL+IsZd8Fh8zDDq565Pr/DSNssoot+IalWcmqQsAQccHu40Qw8RiKZcnilIXZYpnEPA0FW8flnKZRgwAb7M9qiGxacujNof8Sft2b/RgviWH8zMtyOTw4WK/XASdmgyyfHSSyY3FwPjr9cHH9YR8YJpLPaSKKguXib1Wcg6jTDeNL4CfkU+Ay4WuW5YzPcgHPygz5XedxGaezASuy+3LNcwGjRHFR5vG0KlvK0tyBzGYHUBdPmXtyzUbXLvvx5Hp0PYAxfh6Nf7r8PGY/n1xdnVyMRx+u2eUVO728OBuNR5cX8OsjO7n4M/vT6OJswASoCqYRj8sc+QcmY1QjLR27FqLFwH0mGSqWIozv4xDkSmcVnwk2y1YiT0EcthT5Ii5wMQtgL4JRkngRl7yklo5QOM3xv/TjuK77KY9TgOHp57MTmH6a83yDzLC54Dh/BEsUllkeC+KRrST6AFIZMIiKJSk3RSkWgeMg4IswjwFnheA5YKEgVWwbHoFZtEcZwIqj1srCgcYFQiASJaoqJRXHuWaCBlpK/pE+zNL7eFblpECkK8ooq8qAuFrycl5IPNHoRIxUNQ5r0QBget0QgvM8q2ZzJtJVnGfpQqSls+J5jGgFUx7dg6mxFU/iyGIgVkoaSOGkVjS7xJzIc1r4XJRVTiBg0ATqCrNIKG0mAGO0Pak8XAagvY+BeRi/4ZIj27MKuQOmrqvlMssR+Q0Zmg0tgxenYVJF0BRWP56fXPsD+HJ2cTFgF6en5wNSjHRaV+P2kpb8AQeqeZpyQLqJkIYfMGqUevxxgnNOPp2Mf7p2DBUyrULkHPzRgu8XYslBdUA8S7IpTRIwY/Ykyx4kmiR4Cgc51aCSSCJ3Ned5tI8qjACExGhRTU027/NsgewB9yQCYSNwYClb/KKG0Y3WUoFu2OV1Y9ORuOdVUmI/cLXR0HEYGGtaPQ7BKx5URX6QZCFPDsIq4gNqkNqVWCQudpNIoobtL3cDoP8ZljZbF0NW88H61DYA/+Ow5nM6vL39lGeznC/YR1TF7e3Fl9HZ6IT94dNnCEuLZYUulI2zLHmIy9tbHP72ds9xPoJCpzx8AL8a0VIAd/E0TuJygw53IUwMJUVGUYUnYJEpdFxJFTod9WbSWfawDqtEoWTTNrQAVEewctSS7td67h1FaVuuPqvDIvRtsMVOoihGY+RJB6xAvf+VHxiCLIo+GjpfPlxdQ/DQ64JN48vL8z+NxrSaQCPNrqbBHy2qfhqwUGOei4v2RLJpdHE9Pjk/10RozzVz+MNijppMGukCJj+dXekhtCOgIcYQxy+vrsbGMHVTa2rni/al/QAIwdNNhU4pwKTBEB53MaTuPgabXXRx6JYF4IE7NQyUg4ZIvFiWG+xdpYafzhgPKcPh6Ub3Vf4LogDgH+fjKnBptwGma3AxdMr7ye/Vsx8mapAhA6aCv7efSR80AUANWRAE7YfqS/MUAq3jQLoATpnFmf6WFfobejz9fZnwEpnRv8Fvq2/gxpZ5FkL2UbdsCgfSguUmgbAyxJwHFXw825fP98F97fNyv8yWTplvhrCgZJzFHLxBwtQg63kczh2VHo6o7QOGobo7plJIUATFkq9TTYdRaCIeRViV2pPLoRqGRLqVH8cJEw551Cm5P5rQ+0A8gMp9nHsJj6EbeFk2iQtQKnhWj57IgFlrKpCZg+dDfoTpMPRzDcK1dKkvIFXO1yRe8DB7CekZz2GemnLBS0x+NIQ8HpYVTyY1hHXWoVtoAkDJ6VyED6hHQf6T/K9OdxZCqExHUcto76g4gEWFSGSjBrU9DUtVVg3Wh2ksmmVM2ZcapM1nPTZ+OmNZnbFHAeHQIHmNJ9WfI2Z/joJ22zivhEkQfGcTWL8/Qtx6hiL4zycpuh97DmJKr0Q+K4Y1ta23ISU1WmmdlAnCax7UxG0Vt0kxLQ4xX9L5jIxGkOCWPEkoCdUDXckMc8h+VsDC3nLsBl4Ssu2kOEtFIJEJfyFuW0sO7vMCukhhlX1I3UELVEVQgS1BRB4aWVedaOPA1njHNgCxhlt6vtMDv+NOU6s38dIZjedlgTWv1zFAbbm8nCSCF+XrTFfp8pjdgN/zVj7lqSvKRy0WoGQsPTdw/TtDqD7CrnQWqSEh+6HRhhZkJkodpbQomHgOWMoXQvsbBQx0Biq0wPRADsUWZdOnZqUWSBgghwlWY/A8zoJsKfTIbu5CdE4h94Zc89ityvv937l+kMMQSKD8KJNQowUM6KvnvpNzsp2Cvfduo/e+y3aI0QFN5RMZ4I+6a9tSKqC2YAZl2tI7Mhbfrb1xCMsuipinExAyqsLSu4/zooT8VkD2HdnKAOuhHB2z4WmcqoIcXCpRUQovCZVC1Hw3WREgWfBLFqfePYwuV/MeFSVJ8SfpVdLfmWuVRBNZCkyoPvD6uOqk2e36oTZTLFusmNmwiejRZDQXrAQFbs/Vza6P6ZV7UID4B3Uj0KlM6tjIRYIQY9ZEPvFuWiMDJPaX7p1P0bzEmkGuOpYZACgA8x47/oF5wZ7vypXDCILDi5Ls2MSanEGaQQtOKrUxkaVmU/DC3kEksCb0XF6Ecez6ElIq6/mcxvjwjLrUyY85YBMRuihEngMeRZ4GACwTYtdrI9M3sFlQee5hkevJAXzfBIMqLSdYOio80FfT56h1NtslT2YLsO7u4cqJRPUHB0KbgM+RtX6+BzIYRs7aSaoabNWBC1Shcv8A5PFasdSta1t3YD14Wxm72oGO4DFag7X0RU8Mt3njWvX5/k6BLsckAvBanVzVZLJt9UlT9w601W/ReoWVW+638+72RS4SWWrXm1i4F2JshcjdD8sbKRbdmllXbfx0GpRsduvePrmP/VlaNQ/FY5nz4uD086fx6ODJAWUfelb7OV0Y/Qskb238PCs69P7+t62frR/9suKDR5OsJfxeWzQwrQnVlhPaw/Nwv0HKOajF0L+RZ+WhSAdNXzRXVwaQG/c21SDb3Sl2EZ7krVF6+hLXfqQZwL+TXsac8LWDtqnvDLdlFmtaKadZlUTkWWhbEqvvHbkTB9+MnbfNEKw0ze7pn8ZYPUMZXU0ZgtXWQ3UnUr1Kx6aAPQnB9qGUh6NIju9RgB7L9QD/eK2QLyOeObFvpSr4DLXJ40K8HTItRSg7aBFrM5MJ1LZaU5udTE3kFjZpCMxMFwGiaHaRN01+0RMBDAYBbyYHFDzQ4nQUsupqm9bdKfaCaIMUgE5MeDV2qel9a3Bwtk/Wae3P
Ezn1zfDojiK0gEDfy9aTfEBkKDL3GW6emx41tWc4si1Qt93oE+hQE4nXGcxguxC2CAOGke4VyOLSiso5Lzslp0SbVk5Tbfz77LZTnxrSkufyVcf4rZs+Hf8wsCbd7i8U9ROLqnqYQAaISb1jHqXl0qFga1VMYJLJRkvlKpPpdzRvwIbKZ16MjpahPAF2K+VqM/6MA3larJaGKH2UOZ+FzM7Kw6RIt6VKV9BocIe7A9tLetd89aBrG5Xvt3L8WmO4bWNm7u5OFOxE5HD1pAcH7Ojw8HBQc7FDv2W77zgqxTY0W6u14XsrclwkDubuc9p/0Yq8YYyuLn2nUwNZ4rB3+CoAtw0E49NsJYJaC6ZLrsXui80kd166A6tAMfCQrsKwDw1GLDoFWiaraP26O0uk/Qt80XUbvb8N4M+AffH0d/rju7VX1HX11lJeV/D7ihco5Huq757tnDqd2154d+vo7fs5bdBCAymIUgtQBf4IxKNwO4kJhXLZwdVUagvM0HC9Wk+4j56KYhqnsgAYNOwMrPrRXkxfMr9atGGyfTrsewAoisQqDo06q4BSyS7bsFqBijuK+T4VtKWskS1yYNetW/aCo8NgGrrKmJdlbFrzdv463tQlYrBmvx7oNQYB/d0eJ1kbF27CyddpMJK9yVIvq6+Va7x+6+2v9O/LTer/Oz/RU8tYoxRXRyUqa5blI5dHYGrtBUQ+vjy79MIins38Ib2yQpOaZkBqbtbhpjzoLNASqTnq+kzHcs1smuX4In7eTlQsZYDuAkgd/cGLiG3VEPVBPYAvM/sudzeHd+w/jvseHN2pzR9KT8za0B2lEAWLGIROS/lawtTqECvEVcFoz2WL7+7OR+ulPME/NOpNg3OHbftr9VFbEyg59LNBYUcyv01sqG0bsYl3i7pZMc2hsYamqVvzdFasGRUt7DmJLHP2LfLnZLKN2BLKXBotlo0bpPinlSBNE168JEWy4uNQ7z123s9YW3bg0Y5c2f0lGZaZY5m7lM+8PjF6YiilWAR277nNeY7J/5788fLKpeSsaRtdYNu2fMXs+ulkfPpTEy51nqarUNNJMvb2JIzWgy/jt6RiX52H9Sdh6FbVIZmqUE55wX/JmpN/WZpsArVpTQJsS9nMCvvwzjHz4k651gZSe+BmI6fr6xSrdZZcsCgT8uiYzIzI/zWPt/s8xrz2tJYR0GI3+xPv8HVwLuQ5tXqCOL2HSmrFITPEUxlTAQBVr4bRNgbsl6pQe3V07gVf7NkYel3oNyBkLZ/xnpgOQaKZ8KKtEA9San/LYtoAQ/lfk1jgcG5nLfvjCHV52qdu8aZE+bQ77ThS2y1GafrrFI6NU3upS7MdGp4sM3wZ/ZRuTP8kV3X+4cuHc+2vnvJWX1Ewpunr3dRXOqmtdSIs2Mu9zqvKwzSV1WEzw7bUp+khQ3Dze9Du9SZcI+FXwToNw+TbRDWdc6xBLX9pTNOvVvT9NdCMuvlGwEzL9GtgGQcGjZrj9yLZ7IB5r/Fz0OrzBhgT3VehuKQzsHn5jSL5S3MutwG02fhUytnq2cL8O/ZXxcRfMXxzECgVOYd6lmXTX6CwxdPhbE3XLqBjyaC2oy1cuh2gso6qiNOZGg6ee4sqLuMlbv/HC0iRpiLJ1ir2NxqhQxGtdFBlbDxJvEbP2iDqnSGc5Is6WybfR8rrFwv+IDrZHtOXLuhgR/CsmddnTN5q7xerUXov8m8ozVZnX4xctk4r9anv79kiW4nIPLZH+WW+qC/ZKLmU4oN5J5d8i5bq0b4RZSEAtBd4oaN8lZ9cxSg1WK49S6+7tDuBS7ObBp2+b3CdNe1b3Sda8gTPqU1EuvLgP7VLqk43Hd/c2W+F7KtA5EzopFvfTYLmhbQem86J1UePWmeTmuYb3ftOr9nANQ45KO5aZ/PEjIdSwheJweXmYetwL3hGGqVfElzn7iWOQd8djd47F3RipsxglF15IAnPrvzu+8n3v21OtuyyNZ3VWOZiFWdVkWxkGUi7ld3beQ5TN6PYGZ00Bi+Q4d0f6KA0RVPtDuRln3Us7z3iRSJ0DNbq+61zxE+tV/d06F+8g5u/HLC7Pf89CXa7ft9IdfA/vwHD6VvdJ0+N3rRfMxhnw56Drd8c+QE/mMR/l4bchHcDCHUX+ZJzIC/cai0VUDjO5QVIdfVAIVq5DG1pueBJM4Wz7QiGolIky4SHwnNvb+nUmglw7KBk6Ly17CI5ikO6QpXTHeCnbnIquEgRmltjx+yG57NVALEe3L08E4sN9O5pUwT4/eZoeHfndN6/WacIXeuSFUpGgrUOUFnr57Yu9T0bUF50/LIzJ70ra51VMB7DaumVFI/AWyHX8U76dnXi9R//lOtKu6t0QrPWoHE8c5uMnTtjbuvslPKDdFC1WkbgWr3uC+tmgr5tp87ayOFu2jvwd+Y+jma2YUQbZFlBEujZJ97NkYxo6rPfM+/ocMCO6qzvHfuxs62ltpzWvFBXWuR2U338DDeejNc7+mqExWujhP4dqV5Emlf4akzaCm/y/J6dcJMLe7+qbzUUUNJ0C1Kgcq+hYgcwtxtnetBi73D0m2JzD3Gr3D07XAaD9kZHLR3Vjj3CUT23XbZOvOwRzap3eyQzL0s+I5i5x9HwZhW9tVQ6peqTrE63tkvXG/p7JOxJVnuktO9zPiOpXQe3+e3JXUlqtO6Hgb7SQqPGpVjUQQu08gCcRfJajks5Jt0+aLXSWyaj6CPX83CHSrKi8MpMmdWNNBnrFhxqOhqjLuaIObEZ4DmkShgHartxUXNdM0FX+T13pxjKPX3mNSP5/fUVuiC1d4+BT/4vAAL831sIryhzT6jaGx9CpCjx+IPjYLCnDGQyoauGkwlKMpm4OJIUyvl/z0/IjA== \ No newline at end of file 
+eJzdPGtz4zaS3/UrcJxzmZqRaTu7ldrTrXPl2DMX7/rsKVszuS3bq4VJSGaGInUkJVu7lf9+3Q2ABEBS8iOTuOKqTEQQ3ehu9BMPvmFH2XyVx9O7kn2zt/8fbHQn2EikRZZ/SLJ7drgo77K8CNhhkrAL7FawC1GIfCmioPem94adxiF0FxFbpJHIWQnwh3Mewv/UmwH7LPIizlL2TbDHfOzgqVde/z8BwypbsBlfsTQr2aIQgCIu2CROBBMPoZiXLE5ZmM3mSczTULD7uLyjYRQSIIP9TaHIbksOvTn0n8PTxOzHeEkE499dWc6Hu7v39/cBJ2KDLJ/uJrJjsXt6cvT+7PL9DhBMIJ/SRBQFy8X/LeIcWL1dMT4HekJ+C1Qm/J5lOePTXMC7MkN67/O4jNPpgBXZpLznuQAsUVyUeXy7KC1haeqAZ7MDiIunzDu8ZCeXHvv+8PLkcgA4fjwZ/XD+acR+PLy4ODwbnby/ZOcX7Oj87PhkdHJ+Bk8f2OHZ39hfT86OB0yAqGAY8TDPkX4gMkYx0tSxSyEsAiaZJKiYizCexCHwlU4XfCrYNFuKPAV22Fzks7jAySyAvAiwJPEsLnlJLQ2mcJiDX/Sv53nexzxOQQ2PPh0fwvC3Oc9XSAy7ExzHj2CKwjLLY0E0sqXUPlCpDAhEwRKXq6IUs6DXQ4UvwjwGPSsEz0EXChJFF3pUzMLGMoAZR6mVRQ8aZ6gCkShRVCmJOM41EYRoLulH+DBLJ/F0kZMAEa4oo2xRBkTVnJd3hdQnwk7ACFXpYcUaKJieN1TBuzxbTO+YSJdxnqUzkZa9Jc9j1FYw5ZMJmBpb8iSOHAJiJaSBZE5KRZNLxIk8p4nPRbnISQkYNIG4wiwSSpoJqDHanhQeTgPATmIgHvDXVHIke7pA6oCoy8V8nuWo+TUYmg1Ngx+nYbKIoClcfH96eNkfwI/js7MBOzs6Oh2QYKTTuhjZU1ryL4iooumWg6abGlLTA0aNXI8+jHHM8cfD0Q+XPUOETIsQKQd/NOM7hZhzEB0AT5PslgYJmDF6kmVfpDZJ5Sl6SKlWKqlJ5K7ueB7toAgjUEIitFjcmmRO8myG5AH1xALpRtCDqbToRQmjG624Atmw88vapiMx4YukxH7gaqNhr8fAWNPFwxC84u6iyHeTLOTJbriI+IAapHSlLhIV20kktYbtzLcDgP8Rpja7L4asooO1iW0A/qfH6r+j4fX1xzyb5nzGPqAorq/PPp8cnxyy//74CcLSbL5AF8pGWZZ8icvra0R/ff221/sAAr3l4RfwqxFNBVAX38ZJXK7Q4c6EqUNJkVFU4QlYZAodl1KEvYZ4M+ksW0iHWaJQsrINLQDRkVr11JTuVHJuxaKkLWefVWER+ta6xQ6jKEZj5ElDWQF654V/gIIsiv606nx+f3EJwUPPCzaNzs9P/3oyotkEGGl2FQw+WFDtMGChxjhnZ/ZAsunk7HJ0eHqqgdCeK+LwwSGOmkwY6QLGPxxfaBTaERCKEcTx84uLkYGmarKG7n3WvrRdAULwdLdCpxRg0mAID9sYUrcfgtU2ujh0ywL0gfcqNVAOGiLxbF6usPciNfx0xnhIGQ5PV7qv8l8QBUD/cTyuApd2G2C6BhXDXjkZ/1m9+26skAwZEBX8034nfdAYFGrIgiCwX6of9VsItL0epAvglFmc6V9ZoX+hx9O/5wkvkRj9DH5b/QI3Ns+zELKPqmVV9CAtmK8SCCtDzHlQwAfTHfl+B9zXDi93ymzeK/PVECaUjLO4A2+QMIXk/i4O73oqPTyhtvcYhqrumEohQBEUc36fajiMQmPxIMJFqT25RFUTJNJOenq9MOGQRx2R+6MB/fdEA4i8j2PP4TV0Ay/LxnEBQgXP6tMbGTArSQUyc/D7kB9hOgz9PAPwXrrUR4Aq52sCz3iYPQb0mOcwTgU54yUmP1qFfB6WC56MKxXWWYduoQFAS47uRPgF5SjIf5L/1enOTAiV6ShoGe17Kg5gUSES2aiV2h2GpSqrBuvDNBbNMqbsSyGx6axw418Dl9MZexQQDg2Qp3hS/bfP3L/9wG4b5QthAgTfuADO8weIWxsggj+shWj+uWMQUXom8mkxrKBduQ0pqdFCa6RMEF7zoAK2RWyDYlocYr6k8xkZjSDBLXmSUBKqEV3IDHPIflSKhb0l7lq9pMraSXGWikBqJvwLcduZcnCfZ9BFMqvsQ8oOWqAqggpsDizy0Mi6qkQbETv4DlwFxBpu7vd7Lep30GiyehMtDWw8Lwusef2GAWrL5eU4Ebwon2a6SpYH7Ar8nr/sU566pHzUIQFKxtL3Aq9/YzDVBtjkzgE1OGTf1dLQjExFqaOUZgUTzwFL+Uxof6MUA52BCi0wPIBDsUXZ9JFZqQVSDZDCBKsxeB9nQTYXGrOXexCdU8i9Idc88BblZOdPXj/IAQUCKD/KpKrRBAb00/feyDHZVsHe+dfRu77HtojQAQ3VJzDQP+qubUuJgNqCKZRpc3/fmHyv8sYhTLsoYp6OgcloEZb+JM6LEvJbAdl35AoDrIdydMyGb+NUFeTgUgmKUngJqASixrvKigDBgp+yOPUngF3O5gQFJUHxkeQq4W/MuUqisSwFxlQf+G1UNdJsu36ozBTLFidm1mSi9mgwGgtmggK37+lmr4/plbdbAPu7VSPAqUzqwMhFghBj1li+8a8szKASO3Pvpk/RvMSaQc46lhmgUKDMb9nBd8wP3vY9OXMYQRC9KMmOTV2TI0gzsNRJpTamZqnRlHph7yASWBP6Hi/COPb6UqVU1vMpjfHlMXWpkh8TYR0RmlqINAc8inytADBNqLu+rZl9QzcLKs99LHJ9iaDfN5VBlZZjLB2VPtBP0+eoeTbbJU1mC5DuvcWZE4nqDw6EFgE3gVmP7wAM0MhRG0lVrVtV4AJRqNw/AH58K5Z6VW3rDZwXzytjl1vQETyGhcySF70x3OaV59TnO1sFuhwTCJTX6eSpJpNsp0+aejcgrXaL1jOs3HK7nTeXL3KRyFK7WsTCtRBjKUSufjjeSJHoVcR6auGn0aB4c1vf7pD72Jmmi/qleChzXuweffo4Otldi1D2oXeVn9OF0S/AubXws5F16P3tH61H66GdV3zxYIJZzL+1WQPTGlNtOaY1PB/XGySfg4oN/Yw0Kw9FMqj7orl6MoBcedepVrLtrWIb1ZO8NXJPP+LKj9QI+jfSy5gDPhWpDX1juC2zWNNCOcoWSUSehZYlsfrekitx8MtYeVsNwUrTbEL/q43VN4TRlJTBWGU9VHci1JNkbDLYkhB0o1IejiI57qMAPJbrAf7jWyFfRjxz4L6TquA7lCaPC/F8lbEEoezAAtZmJhOorlpTm51MTeQSNkkIzEwXAaKoV5FXdX7REgEMAkHfTAooeKDF6Sjk1NUurLdVvA2iFUKAdmLCq3WXmt5ZyMHZrq3T7L81OfXVcP+GIrSAQN9K1lo6IDIUmbeBmk3Do6TeGo6sQ9VdN7pGO9RA4mkGM+hmwmVhwDDSPUGzuLSi8o6XjZJTapsWTl1t/H
Z226hPDW7Jc/VVx/i5iz4N/zBwBu32Fwp6zaSqHqYig4pJuWMepfnSoaCzKiZlksmGJXKVybQ7mmfohspnHq0dlqGsUXYn5bIJ3+BA1rNlSYjSR5nzOZrZmHkYFOE6qnSlGrXe4epAd0nvmVsPurZR+b6V41cSw2UbM3P3tqJgKyKHqwfd3WX7e3t7g4qKLXqW7f1eT6XYhmQrsdZ0d2qOh8DBnbdJ+o+akWfgaMqy32vUQA477A1uBeCygWD8NluKoJKC6ZIrtttiM/Gdl97AKVAMfUiXYdimDUYsOgJYJqtovd2dJdL+BW50XUfvrgP4Z8A++/o3/dP3Kq+o6+rOUl5X8DuKFijkW6rvluWcKp3rLrybdXT3eo6ttNBAAqLUAkSBD4F4EF4jMaFQLjt4GkotgRkSrmZrjftoqShu41QWAIOanIFTP7qT2ZfEL2e2mnQPh313QYsisYxDo84qoFRyyzasVqDijmK+QwVtKWtkBxzI9aqWt8H+XnAbesqY52VsWnM3fQ1v6hEwWHO/QvQUg4D+XouTrIwLF+HkdhpgchdZqmnta+Ea22+t/ZX8+3KR+n9PD/XQMtYowVVRicqaefnA5RGYSnoBgY/Oj8/9sIin0/6QtqzQpG4zADUX63BRHmQWaI7UGFV9pmO5JjbNctyIv7MTFUcYILsAUsf+4FHArmgIerdC0JeZfZO6q70b9m8HbS/2b9TiD6UnZm3onaQQBYsYmE5LuS1hSnWIFeKyYLTm0uG7m+PRfClP8C+t9abBeUPb/qw+amkCOYd+rlK4kaxvAxti6wI29d2BrmdMU2jMoWnqzjiNGauxooVt4sgx574Dvokn14gdpsyp0Wy5eoMQPzsJ0m3Ci8ekSE58HOq1x8b+jLNkBx5t35PdH5NhmTmWuUq5YfvE6ImhlGIR2L3v1ec5xv9z+JfzC4+Ss7rt5AzbuvIVs+vHw9HRD3W41HmarkJNJ8nY85Mwmg8+j5+Tir04D2tPwtCtqkMyi0I55Rn/KatP/mVpsgrUojUx0JWymRX23k3PKu/f4G5qLuQxrwp1nE6gEFlySKzwUMOtgPlVO6uoWgP206JQS110bAT3xcwpaF9AdyfpabHVmCNHPsZGLJ0yRD3k6jinfudDztrvkJY7gyihp0RuREeprIm3w1GbXcBj2PIaOP3WO7cOt0aQ6/1aw6P9bLunIkuAoq/qoPaDvd/UQV2en6J/cVyU2brRSRmdfyU3Vc3LbAZW/Zp8VUXbc3zQV3QarsQ6qX2pD5AIyQvYuDv8gN1pnSdQPZ/lCxTsy7xBztPod+0LLg7Pjl1PULdt9ANV11/JC+B8vDLjJxV5ZaavxNRB4kvtHdGRtZt4O2zd7LLO0qnfs+ycIF9m5ZNJ+bs28g8fRq6NV00bTVz3/JUsHObilRk4ascrs28ppHYCX2rdgI2M28DaYdtGj3Wmjd2eZdkI+HTDtndj0vTrbMfUhvlYs3SNEu9rGAZJj9IY9SNZ3On7z+9PtdmtM7oXbMOkz0ioX2hwnbsvMGGPN7YnbbqkqdxzqUfoWlCse8iFrfp5YPd6lloj4AvVupjzHJj7Pcesy4+HF5fvG0Wq0bq5SK07/1pFKs3LK4tfSlleWQirRNVJ5ovLUkIoy1ILd1dZanVaW5bKns8rSyXsixxAGobJ6wxrdH2wslf5pIMaPVmG+DXCGcrmlUQzmqavEcwQMUjUxN+q02YH3E4yHgdWn2doMsG9SItLulqaP6r0+g00+XN93bVWaLNxXfyxelo6/4b9QxHxD1y058BQKnJewuDZ7U8ixEspGbunrxlAx5LFpTwZRZfu1W7EoojTqUIH7/3ZIi7jOZ6qi2eigF5Jdq9W/GuJ0F0DKzaoA0I8SfxaztogqgMXOMhn+Vreh1dfNZjxL6Jx5o3pbxnQfYlgo5lXVzeea+9ny5N0IvJXFHPVlRJjO7zabtKXqb9ls2wpIvM2HO075bPq2xWKLyX44C74JaRUYXslwkIF0F7gkY7ySX5yGSPXYLnuKK3u0u0ELs1tGjT6PsN1VrDPdZ9oyWO8/jUW6dKH/9ThI3Vp6ODqxj1s6X5hg5wJXSBru6Bfn/PWuOn6VXWjx7ryUzdf6d43es4GnnF3QFFnXXkTUx5KDh/FBpdncqw7s+AZCUs7JzjPzW8jDNo+fdD6KQO6iFJmgGVb3vPBKyF/+nb87R/rCyPb7J6uQMxzsYyzRZGs5PYwHQJqfvSmx9QHR9gxXeAFL5DhJzWgg5IUDbU9kN/QuI/l54Tw+xzoGJzZ71vXc9fNV/PS5d/93au/77Kbt/13xNj1/buaq93/+ncwnLbZXXsZ88o+vWdcudqktv36Jg34wST+pzTkOrwbilB1kWeHB/I7VlpKRcKLO/ldIXWjX2m0chna0nLBk3qIXtfNBgWlQOYJD4XvXV/TZTBTwbGD4qFxGLipyVEc0pdJcvq01roPJCl1kSzUH2M5YFc8ny4DiPXg7uVVU2ygI52rIsDfV/vDm5te41ircznPc75dgpwRY9a9JGf+POtbORsDyqNuNTbGpCOo1hUA4zXMlp5J8QC0FXIeb6RvVxdJ//WznFc6tEQXHysJGrceu3hsfIrFs64kKT9I9z8X8whcq988B14P4CzGqOUGZ24kuiv7YNuN6kkHHDSxNSHaIMsFJIG+e5HcxGRE0z77M/P39wZsv8r63rDvG8dd1EGTe16oL0XIQybVrS48bmKcmtRfHHBorYVgcrJBI80v41Q66Qq8zvNbDpiZVLinVLpmQ20c/6JSBjnuVVJuDNEQTmOLvlU89SGMpwnIPuJiU9PcxO8SE+27fU0hWQM0ROTsabYKSO9OP0085p6/SYW749klGNy2+JpyMfE3xGLvBbVKRW3oPU0oxhapQYGzT9RpUXLN66talD1E06Lc1cV2i6pWjJ9oUdZ6vE1Nc/2xTUwqQqVpR4hK0zpGuZmz10xwW8KUu7fSngPU3xXr5L5lb80g0N1iqbijRasW5mghqZu3RqLewpqz0NbCmfnxsw2MmYurNW3OalvFla7l2jir6rxu7lprjhYOW6rkFi7d77Nt4NRdgLPpbSmaiWs0zy8D/YkawhqXYlZlyyCVL0BZJD+z41FxS18TsVpxCM9YbaKc58sNCslJ/5dmra6+MCWT7BmPUzlqtYpExInVAO8VLoRxQb6ZkGuqKyLo05y+t1UM5b0E5teY+u0LO5j7qK0SzLjlJz0D/Fyt8Isy94Va9MOXkKKWeJ2p18Mqg0qf8Zg+HTYeIyfjsYeYJFO9/wcrgfOr \ No newline at end of file From fb68e31b3e240b7b165ec71aac9f57338c34a0c3 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Wed, 15 Apr 2020 13:28:22 -0700 Subject: [PATCH 0610/1533] Basic CUDNN8 support --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 4 + 
tensorflow/stream_executor/cuda/cudnn_8_0.inc | 3269 +++++++++++++++++ tensorflow/stream_executor/cuda/cudnn_stub.cc | 12 +- third_party/gpus/cuda_configure.bzl | 23 +- third_party/gpus/find_cuda_config.py | 24 +- .../gpus/find_cuda_config.py.gz.base64 | 2 +- 6 files changed, 3316 insertions(+), 18 deletions(-) create mode 100644 tensorflow/stream_executor/cuda/cudnn_8_0.inc diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 6122877f91f..8cb562fe41a 100755 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -1278,7 +1278,11 @@ port::Status CheckAndFetchProjectionWeights( cudnnRNNMode_t mode; cudnnRNNAlgo_t algo; cudnnDataType_t data_type; +#if CUDNN_VERSION >= 8000 + RETURN_IF_CUDNN_ERROR(cudnnGetRNNDescriptor_v6( +#else RETURN_IF_CUDNN_ERROR(cudnnGetRNNDescriptor( +#endif /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc, /*hiddenSize=*/&hidden_size_v, /*numLayers=*/&num_layers_v, diff --git a/tensorflow/stream_executor/cuda/cudnn_8_0.inc b/tensorflow/stream_executor/cuda/cudnn_8_0.inc new file mode 100644 index 00000000000..11c1eafe48f --- /dev/null +++ b/tensorflow/stream_executor/cuda/cudnn_8_0.inc @@ -0,0 +1,3269 @@ +// Auto-generated, do not edit. + +extern "C" { +size_t CUDNNWINAPI +cudnnGetVersion(void) { + using FuncPtr = size_t (CUDNNWINAPI *)(); + static auto func_ptr = LoadSymbol("cudnnGetVersion"); + if (!func_ptr) return 0; + return func_ptr(); +} + +size_t CUDNNWINAPI +cudnnGetCudartVersion(void) { + using FuncPtr = size_t (CUDNNWINAPI *)(); + static auto func_ptr = LoadSymbol("cudnnGetCudartVersion"); + if (!func_ptr) return 0; + return func_ptr(); +} + +const char *CUDNNWINAPI +cudnnGetErrorString(cudnnStatus_t status) { + using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t); + static auto func_ptr = LoadSymbol("cudnnGetErrorString"); + if (!func_ptr) return "cudnnGetErrorString symbol not found."; + return func_ptr(status); +} + +cudnnStatus_t CUDNNWINAPI +cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); + static auto func_ptr = LoadSymbol("cudnnQueryRuntimeError"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rstatus, mode, tag); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetProperty(libraryPropertyType type, int *value) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *); + static auto func_ptr = LoadSymbol("cudnnGetProperty"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(type, value); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreate(cudnnHandle_t *handle) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *); + static auto func_ptr = LoadSymbol("cudnnCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroy(cudnnHandle_t handle) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t); + static auto func_ptr = LoadSymbol("cudnnDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudnnSetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(handle, streamId); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); + static auto func_ptr = LoadSymbol("cudnnGetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int); + static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc, format, dataType, n, c, h, w); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, + int cStride, + int hStride, + int wStride) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int); + static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptorEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, + int *cStride, + int *hStride, + int *wStride) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *); + static auto func_ptr = LoadSymbol("cudnnGetTensor4dDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const int strideA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []); + static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, + cudnnDataType_t dataType, + int nbDims, + const int dimA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, 
int, const int []); + static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptorEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc, format, dataType, nbDims, dimA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, + int *nbDims, + int dimA[], + int strideA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []); + static auto func_ptr = LoadSymbol("cudnnGetTensorNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetTensorSizeInBytes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc, size); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(tensorDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, + cudnnTensorDescriptor_t destDesc, + size_t *destSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorTransformDescriptor_t, const cudnnTensorDescriptor_t, cudnnTensorDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnInitTransformDest"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc, srcDesc, destDesc, destSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateTensorTransformDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, + const int32_t padBeforeA[], + const int32_t padAfterA[], + const uint32_t foldA[], + const cudnnFoldingDirection_t direction) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t, const uint32_t, const cudnnTensorFormat_t, const int32_t [], const int32_t [], const uint32_t [], const cudnnFoldingDirection_t); + static auto func_ptr = LoadSymbol("cudnnSetTensorTransformDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc, nbDims, destFormat, padBeforeA, padAfterA, foldA, direction); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, + uint32_t nbDimsRequested, + cudnnTensorFormat_t *destFormat, + int32_t padBeforeA[], + int32_t padAfterA[], + uint32_t foldA[], + cudnnFoldingDirection_t *direction) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t, uint32_t, cudnnTensorFormat_t *, int32_t [], int32_t [], uint32_t [], cudnnFoldingDirection_t *); + static auto 
func_ptr = LoadSymbol("cudnnGetTensorTransformDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc, nbDimsRequested, destFormat, padBeforeA, padAfterA, foldA, direction); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyTensorTransformDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnTransformTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnTransformTensor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI +cudnnTransformTensorEx(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnTensorDescriptor_t destDesc, + void *destData) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnTransformTensorEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, destData); +} + +cudnnStatus_t CUDNNWINAPI +cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnAddTensor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateOpTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(opTensorDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, + cudnnNanPropagation_t opTensorNanOpt) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t); + static auto func_ptr = LoadSymbol("cudnnSetOpTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, + cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, + cudnnNanPropagation_t *opTensorNanOpt) { + using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *); + static auto func_ptr = LoadSymbol("cudnnGetOpTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyOpTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(opTensorDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnOpTensor(cudnnHandle_t handle, + const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *alpha2, + const cudnnTensorDescriptor_t bDesc, + const void *B, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnOpTensor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateReduceTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(reduceTensorDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, + cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); + static auto func_ptr = LoadSymbol("cudnnSetReduceTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *); + static auto func_ptr = LoadSymbol("cudnnGetReduceTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc) { + 
using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyReduceTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(reduceTensorDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetReductionIndicesSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetReductionIndicesSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, + const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetReductionWorkspaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnReduceTensor(cudnnHandle_t handle, + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, + size_t indicesSizeInBytes, + void *workspace, + size_t workspaceSizeInBytes, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, + const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnReduceTensor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); + static auto func_ptr = LoadSymbol("cudnnSetTensor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, yDesc, y, valuePtr); +} + +cudnnStatus_t CUDNNWINAPI +cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); + static auto func_ptr = LoadSymbol("cudnnScaleTensor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, yDesc, y, alpha); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateFilterDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(filterDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t 
filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int); + static auto func_ptr = LoadSymbol("cudnnSetFilter4dDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(filterDesc, dataType, format, k, c, h, w); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *); + static auto func_ptr = LoadSymbol("cudnnGetFilter4dDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(filterDesc, dataType, format, k, c, h, w); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, + int nbDims, + const int filterDimA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []); + static auto func_ptr = LoadSymbol("cudnnSetFilterNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(filterDesc, dataType, format, nbDims, filterDimA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, + int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, + int *nbDims, + int filterDimA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []); + static auto func_ptr = LoadSymbol("cudnnGetFilterNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetFilterSizeInBytes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(filterDesc, size); +} + +cudnnStatus_t CUDNNWINAPI +cudnnTransformFilter(cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnFilterDescriptor_t srcDesc, + const void *srcData, + const void *beta, + const cudnnFilterDescriptor_t destDesc, + void *destData) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const void *, const cudnnFilterDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnTransformFilter"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, destData); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI 
*)(cudnnFilterDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyFilterDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(filterDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxForward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnSoftmaxForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreatePoolingDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(poolingDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, + int windowHeight, + int windowWidth, + int verticalPadding, + int horizontalPadding, + int verticalStride, + int horizontalStride) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int); + static auto func_ptr = LoadSymbol("cudnnSetPooling2dDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *windowHeight, + int *windowWidth, + int *verticalPadding, + int *horizontalPadding, + int *verticalStride, + int *horizontalStride) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); + static auto func_ptr = LoadSymbol("cudnnGetPooling2dDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, + const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, + int nbDims, + const int windowDimA[], + const int paddingA[], + const int strideA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []); + static auto func_ptr = LoadSymbol("cudnnSetPoolingNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, + int nbDimsRequested, + cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, + int 
windowDimA[], + int paddingA[], + int strideA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []); + static auto func_ptr = LoadSymbol("cudnnGetPoolingNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, + int outputTensorDimA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []); + static auto func_ptr = LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, + int *c, + int *h, + int *w) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *); + static auto func_ptr = LoadSymbol("cudnnGetPooling2dForwardOutputDim"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyPoolingDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(poolingDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnPoolingForward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnPoolingForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateActivationDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(activationDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, + double coef) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double); + static auto func_ptr = LoadSymbol("cudnnSetActivationDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(activationDesc, mode, reluNanOpt, coef); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, + double *coef) { + using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *); + static auto func_ptr = LoadSymbol("cudnnGetActivationDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(activationDesc, mode, reluNanOpt, coef); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyActivationDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(activationDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnActivationForward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnActivationForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateLRNDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(normDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double); + static auto func_ptr = LoadSymbol("cudnnSetLRNDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); + static auto func_ptr = LoadSymbol("cudnnGetLRNDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyLRNDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lrnDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI 
+cudnnDivisiveNormalizationForward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, + const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t); + static auto func_ptr = LoadSymbol("cudnnDeriveBNTensorDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(derivedBnDesc, xDesc, mode); +} + +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + const void *estimatedMean, + const void *estimatedVariance, + double epsilon) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double); + static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardInference"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, + cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, + const int nbDims, + const int dimA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []); + static auto func_ptr = LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stDesc, samplerType, dataType, nbDims, dimA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc) { + using FuncPtr = cudnnStatus_t 
(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, + void *grid) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); + static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, stDesc, theta, grid); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerForward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *grid, + const void *beta, + cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateDropoutDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dropoutDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyDropoutDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dropoutDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnDropoutGetStatesSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnDropoutGetReserveSpaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(xdesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); + static auto func_ptr = LoadSymbol("cudnnSetDropoutDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float dropout, + void *states, + size_t stateSizeInBytes, + unsigned long long seed) { + using FuncPtr = cudnnStatus_t 
(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); + static auto func_ptr = LoadSymbol("cudnnRestoreDropoutDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, + cudnnHandle_t handle, + float *dropout, + void **states, + unsigned long long *seed) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *); + static auto func_ptr = LoadSymbol("cudnnGetDropoutDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dropoutDesc, handle, dropout, states, seed); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutForward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, + const void *x, + const cudnnTensorDescriptor_t ydesc, + void *y, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnDropoutForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algoDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t); + static auto func_ptr = LoadSymbol("cudnnSetAlgorithmDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algoDesc, algorithm); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t *); + static auto func_ptr = LoadSymbol("cudnnGetAlgorithmDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algoDesc, algorithm); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithmDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnCopyAlgorithmDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(src, dest); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algoDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); + static auto 
func_ptr = LoadSymbol("cudnnCreateAlgorithmPerformance"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algoPerf, numberToCreate); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t algoDesc, + cudnnStatus_t status, + float time, + size_t memory) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t, cudnnStatus_t, float, size_t); + static auto func_ptr = LoadSymbol("cudnnSetAlgorithmPerformance"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algoPerf, algoDesc, status, time, memory); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t *algoDesc, + cudnnStatus_t *status, + float *time, + size_t *memory) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, cudnnStatus_t *, float *, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetAlgorithmPerformance"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algoPerf, algoDesc, status, time, memory); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); + static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmPerformance"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algoPerf, numberToDestroy); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetAlgorithmSpaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algoDesc, algoSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSaveAlgorithm(cudnnHandle_t handle, + cudnnAlgorithmDescriptor_t algoDesc, + void *algoSpace, + size_t algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnSaveAlgorithm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRestoreAlgorithm(cudnnHandle_t handle, + void *algoSpace, + size_t algoSpaceSizeInBytes, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, cudnnAlgorithmDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnRestoreAlgorithm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); + static auto func_ptr = LoadSymbol("cudnnSetCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mask, udata, fptr); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); + static auto func_ptr = LoadSymbol("cudnnGetCallback"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(mask, udata, fptr); +} + +cudnnStatus_t CUDNNWINAPI +cudnnOpsInferVersionCheck(void) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(); + static auto func_ptr = LoadSymbol("cudnnOpsInferVersionCheck"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateConvolutionDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyConvolutionDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t); + static auto func_ptr = LoadSymbol("cudnnSetConvolutionMathType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, mathType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionMathType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, mathType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); + static auto func_ptr = LoadSymbol("cudnnSetConvolutionGroupCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, groupCount); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionGroupCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, groupCount); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnReorderType_t); + static auto func_ptr = LoadSymbol("cudnnSetConvolutionReorderType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, reorderType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnReorderType_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionReorderType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, reorderType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, + int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride 
*/ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t); + static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, + int arrayLength, /* nbDims-2 size */ + const int padA[], + const int filterStrideA[], + const int dilationA[], + cudnnConvolutionMode_t mode, + cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t); + static auto func_ptr = LoadSymbol("cudnnSetConvolutionNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc, + int arrayLengthRequested, + int *arrayLength, + int padA[], + int strideA[], + int dilationA[], + cudnnConvolutionMode_t *mode, + cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, + int *n, + int *c, + int *h, + int *w) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *); + static auto func_ptr = LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const 
cudnnFilterDescriptor_t filterDesc, + int nbDims, + int tensorOuputDimA[]) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOuputDimA); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, count); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, + const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + 
const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + void *y, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + void *colBuffer) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnIm2Col"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer); +} + +cudnnStatus_t CUDNNWINAPI +cudnnReorderFilterAndBias(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, + const void *filterData, + void *reorderedFilterData, + int reorderBias, + const void *biasData, + void *reorderedBiasData) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, cudnnReorderType_t, const void *, void *, int, const void *, void *); + static auto func_ptr = LoadSymbol("cudnnReorderFilterAndBias"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, filterDesc, reorderType, filterData, reorderedFilterData, reorderBias, biasData, reorderedBiasData); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionForward(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const 
cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnConvolutionForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, + const void *alpha1, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *alpha2, + const cudnnTensorDescriptor_t zDesc, + const void *z, + const cudnnTensorDescriptor_t biasDesc, + const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnConvolutionBiasActivationForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, count); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, 
cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, + const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardData(cudnnHandle_t handle, + const void *alpha, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, 
void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardData"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle, + const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const cudnnTensorFormat_t transformFormat, + cudnnFilterDescriptor_t foldedFilterDesc, + cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, + cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, + cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, + cudnnTensorTransformDescriptor_t gradUnfoldTransDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorFormat_t, cudnnFilterDescriptor_t, cudnnTensorDescriptor_t, cudnnConvolutionDescriptor_t, cudnnTensorDescriptor_t, cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnGetFoldedConvBackwardDataDescriptors"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, transformFormat, foldedFilterDesc, paddedDiffDesc, foldedConvDesc, foldedGradDesc, filterFoldTransDesc, diffPadTransDesc, gradFoldTransDesc, gradUnfoldTransDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t *, cudnnFusedOps_t); + static auto func_ptr = LoadSymbol("cudnnCreateFusedOpsConstParamPack"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(constPack, ops); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t); + static auto func_ptr = LoadSymbol("cudnnDestroyFusedOpsConstParamPack"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(constPack); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + const void *param) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t, cudnnFusedOpsConstParamLabel_t, const void *); + static auto func_ptr = LoadSymbol("cudnnSetFusedOpsConstParamPackAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(constPack, paramLabel, param); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, + void *param, + int *isNULL) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFusedOpsConstParamPack_t, cudnnFusedOpsConstParamLabel_t, void *, int *); + static auto func_ptr = LoadSymbol("cudnnGetFusedOpsConstParamPackAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(constPack, paramLabel, param, isNULL); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t *, cudnnFusedOps_t); + static auto func_ptr = LoadSymbol("cudnnCreateFusedOpsVariantParamPack"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(varPack, ops); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t); + static auto func_ptr = LoadSymbol("cudnnDestroyFusedOpsVariantParamPack"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(varPack); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t, cudnnFusedOpsVariantParamLabel_t, void *); + static auto func_ptr = LoadSymbol("cudnnSetFusedOpsVariantParamPackAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(varPack, paramLabel, ptr); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, + void *ptr) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFusedOpsVariantParamPack_t, cudnnFusedOpsVariantParamLabel_t, void *); + static auto func_ptr = LoadSymbol("cudnnGetFusedOpsVariantParamPackAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(varPack, paramLabel, ptr); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsPlan_t *, cudnnFusedOps_t); + static auto func_ptr = LoadSymbol("cudnnCreateFusedOpsPlan"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(plan, ops); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsPlan_t); + static auto func_ptr = LoadSymbol("cudnnDestroyFusedOpsPlan"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(plan); +} + +cudnnStatus_t CUDNNWINAPI +cudnnMakeFusedOpsPlan(cudnnHandle_t handle, + cudnnFusedOpsPlan_t plan, + const cudnnFusedOpsConstParamPack_t constPack, + size_t *workspaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnFusedOpsPlan_t, const cudnnFusedOpsConstParamPack_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnMakeFusedOpsPlan"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, plan, constPack, workspaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFusedOpsPlan_t, cudnnFusedOpsVariantParamPack_t); + static auto func_ptr = LoadSymbol("cudnnFusedOpsExecute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, plan, varPack); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateRNNDescriptor"); + 
if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyRNNDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t algo, + cudnnRNNMode_t cellMode, + cudnnRNNBiasMode_t biasMode, + cudnnDirectionMode_t dirMode, + cudnnRNNInputMode_t inputMode, + cudnnDataType_t dataType, + cudnnDataType_t mathPrec, + cudnnMathType_t mathType, + int32_t inputSize, + int32_t hiddenSize, + int32_t projSize, + int32_t numLayers, + cudnnDropoutDescriptor_t dropoutDesc, + uint32_t auxFlags) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNAlgo_t, cudnnRNNMode_t, cudnnRNNBiasMode_t, cudnnDirectionMode_t, cudnnRNNInputMode_t, cudnnDataType_t, cudnnDataType_t, cudnnMathType_t, int32_t, int32_t, int32_t, int32_t, cudnnDropoutDescriptor_t, uint32_t); + static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v8"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, algo, cellMode, biasMode, dirMode, inputMode, dataType, mathPrec, mathType, inputSize, hiddenSize, projSize, numLayers, dropoutDesc, auxFlags); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNAlgo_t *algo, + cudnnRNNMode_t *cellMode, + cudnnRNNBiasMode_t *biasMode, + cudnnDirectionMode_t *dirMode, + cudnnRNNInputMode_t *inputMode, + cudnnDataType_t *dataType, + cudnnDataType_t *mathPrec, + cudnnMathType_t *mathType, + int32_t *inputSize, + int32_t *hiddenSize, + int32_t *projSize, + int32_t *numLayers, + cudnnDropoutDescriptor_t *dropoutDesc, + uint32_t *auxFlags) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNAlgo_t *, cudnnRNNMode_t *, cudnnRNNBiasMode_t *, cudnnDirectionMode_t *, cudnnRNNInputMode_t *, cudnnDataType_t *, cudnnDataType_t *, cudnnMathType_t *, int32_t *, int32_t *, int32_t *, int32_t *, cudnnDropoutDescriptor_t *, uint32_t *); + static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor_v8"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, algo, cellMode, biasMode, dirMode, inputMode, dataType, mathPrec, mathType, inputSize, hiddenSize, projSize, numLayers, dropoutDesc, auxFlags); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, + cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, + cudnnRNNMode_t cellMode, + cudnnRNNAlgo_t algo, + cudnnDataType_t mathPrec) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); + static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v6"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, cellMode, algo, mathPrec); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDescriptor_v6(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + int *hiddenSize, + int *numLayers, + cudnnDropoutDescriptor_t *dropoutDesc, + cudnnRNNInputMode_t *inputMode, + 
cudnnDirectionMode_t *direction, + cudnnRNNMode_t *cellMode, + cudnnRNNAlgo_t *algo, + cudnnDataType_t *mathPrec) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); + static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor_v6"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, cellMode, algo, mathPrec); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); + static auto func_ptr = LoadSymbol("cudnnSetRNNMatrixMathType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, mType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); + static auto func_ptr = LoadSymbol("cudnnGetRNNMatrixMathType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, mType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t biasMode) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t); + static auto func_ptr = LoadSymbol("cudnnSetRNNBiasMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, biasMode); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t *biasMode) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t *); + static auto func_ptr = LoadSymbol("cudnnGetRNNBiasMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, biasMode); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNSetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, + double rclip) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, cudnnNanPropagation_t, double, double); + static auto func_ptr = LoadSymbol("cudnnRNNSetClip"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNGetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, + double *rclip) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, cudnnNanPropagation_t *, double *, double *); + static auto func_ptr = LoadSymbol("cudnnRNNGetClip"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNProjectionLayers(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + const int recProjSize, + const int outProjSize) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); + static auto func_ptr = LoadSymbol("cudnnSetRNNProjectionLayers"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, recProjSize, outProjSize); +} + +cudnnStatus_t CUDNNWINAPI 
+cudnnGetRNNProjectionLayers(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + int *recProjSize, + int *outProjSize) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); + static auto func_ptr = LoadSymbol("cudnnGetRNNProjectionLayers"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, recProjSize, outProjSize); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, + const int minibatch, + const cudnnDataType_t dataType, + cudnnPersistentRNNPlan_t *plan) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *); + static auto func_ptr = LoadSymbol("cudnnCreatePersistentRNNPlan"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, minibatch, dataType, plan); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); + static auto func_ptr = LoadSymbol("cudnnDestroyPersistentRNNPlan"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(plan); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t); + static auto func_ptr = LoadSymbol("cudnnSetPersistentRNNPlan"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, plan); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNWorkspaceSize(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetRNNWorkspaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNParamsSize(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes, + cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t); + static auto func_ptr = LoadSymbol("cudnnGetRNNParamsSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, + void **linLayerMat) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); + static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerMatrixParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle, + const 
cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, + const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, + void **linLayerBias) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); + static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerBiasParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForwardInference(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t *yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnRNNForwardInference"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned paddingMode) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, unsigned int); + static auto func_ptr = LoadSymbol("cudnnSetRNNPaddingMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, paddingMode); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned *paddingMode) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, unsigned int *); + static auto func_ptr = LoadSymbol("cudnnGetRNNPaddingMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDesc, paddingMode); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateRNNDataDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDataDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyRNNDataDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDataDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, + int maxSeqLength, + int batchSize, + int vectorSize, + const int 
seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, int, const int [], void *); + static auto func_ptr = LoadSymbol("cudnnSetRNNDataDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, seqLengthArray, paddingFill); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, + cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, + int *maxSeqLength, + int *batchSize, + int *vectorSize, + int arrayLengthRequested, + int seqLengthArray[], + void *paddingFill) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, int *, int *, int *, int, int [], void *); + static auto func_ptr = LoadSymbol("cudnnGetRNNDataDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, arrayLengthRequested, seqLengthArray, paddingFill); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForwardInferenceEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnRNNDataDescriptor_t yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnRNNForwardInferenceEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnSetRNNAlgorithmDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, algoDesc); +} + 
+cudnnStatus_t CUDNNWINAPI +cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, count); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t *yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + const float findIntensity, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, + void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSeqDataDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateSeqDataDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(seqDataDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSeqDataDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroySeqDataDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(seqDataDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t dataType, + int nbDims, + const int dimA[], + const cudnnSeqDataAxis_t axes[], + size_t seqLengthArraySize, + const int seqLengthArray[], + void *paddingFill) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSeqDataDescriptor_t, cudnnDataType_t, int, const int [], const cudnnSeqDataAxis_t [], size_t, const int [], void *); + static auto func_ptr = LoadSymbol("cudnnSetSeqDataDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(seqDataDesc, dataType, nbDims, dimA, axes, seqLengthArraySize, seqLengthArray, paddingFill); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc, + cudnnDataType_t *dataType, + int *nbDims, + int nbDimsRequested, + int dimA[], + cudnnSeqDataAxis_t axes[], + size_t 
*seqLengthArraySize, + size_t seqLengthSizeRequested, + int seqLengthArray[], + void *paddingFill) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnSeqDataDescriptor_t, cudnnDataType_t *, int *, int, int [], cudnnSeqDataAxis_t [], size_t *, size_t, int [], void *); + static auto func_ptr = LoadSymbol("cudnnGetSeqDataDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(seqDataDesc, dataType, nbDims, nbDimsRequested, dimA, axes, seqLengthArraySize, seqLengthSizeRequested, seqLengthArray, paddingFill); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAttnDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateAttnDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(attnDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAttnDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyAttnDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(attnDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned attnMode, + int nHeads, + double smScaler, + cudnnDataType_t dataType, + cudnnDataType_t computePrec, + cudnnMathType_t mathType, + cudnnDropoutDescriptor_t attnDropoutDesc, + cudnnDropoutDescriptor_t postDropoutDesc, + int qSize, + int kSize, + int vSize, + int qProjSize, + int kProjSize, + int vProjSize, + int oProjSize, + int qoMaxSeqLength, + int kvMaxSeqLength, + int maxBatchSize, + int maxBeamSize) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAttnDescriptor_t, unsigned int, int, double, cudnnDataType_t, cudnnDataType_t, cudnnMathType_t, cudnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, int, int, int, int, int, int, int, int, int, int, int); + static auto func_ptr = LoadSymbol("cudnnSetAttnDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(attnDesc, attnMode, nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, + unsigned *attnMode, + int *nHeads, + double *smScaler, + cudnnDataType_t *dataType, + cudnnDataType_t *computePrec, + cudnnMathType_t *mathType, + cudnnDropoutDescriptor_t *attnDropoutDesc, + cudnnDropoutDescriptor_t *postDropoutDesc, + int *qSize, + int *kSize, + int *vSize, + int *qProjSize, + int *kProjSize, + int *vProjSize, + int *oProjSize, + int *qoMaxSeqLength, + int *kvMaxSeqLength, + int *maxBatchSize, + int *maxBeamSize) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAttnDescriptor_t, unsigned int *, int *, double *, cudnnDataType_t *, cudnnDataType_t *, cudnnMathType_t *, cudnnDropoutDescriptor_t *, cudnnDropoutDescriptor_t *, int *, int *, int *, int *, int *, int *, int *, int *, int *, int *, int *); + static auto func_ptr = LoadSymbol("cudnnGetAttnDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(attnDesc, attnMode, nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize); +} + +cudnnStatus_t CUDNNWINAPI 
+cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + size_t *weightSizeInBytes, + size_t *workSpaceSizeInBytes, + size_t *reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, size_t *, size_t *, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetMultiHeadAttnBuffers"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, attnDesc, weightSizeInBytes, workSpaceSizeInBytes, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnMultiHeadAttnWeightKind_t wKind, + size_t weightSizeInBytes, + const void *weights, + cudnnTensorDescriptor_t wDesc, + void **wAddr) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, cudnnMultiHeadAttnWeightKind_t, size_t, const void *, cudnnTensorDescriptor_t, void **); + static auto func_ptr = LoadSymbol("cudnnGetMultiHeadAttnWeights"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, attnDesc, wKind, weightSizeInBytes, weights, wDesc, wAddr); +} + +cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnForward(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + int currIdx, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsQO[], + const int devSeqLengthsKV[], + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const void *residuals, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const cudnnSeqDataDescriptor_t oDesc, + void *out, + size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, int, const int [], const int [], const int [], const int [], const cudnnSeqDataDescriptor_t, const void *, const void *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, void *, size_t, const void *, size_t, void *, size_t, void *); + static auto func_ptr = LoadSymbol("cudnnMultiHeadAttnForward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, attnDesc, currIdx, loWinIdx, hiWinIdx, devSeqLengthsQO, devSeqLengthsKV, qDesc, queries, residuals, kDesc, keys, vDesc, values, oDesc, out, weightSizeInBytes, weights, workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes, reserveSpace); +} + +cudnnStatus_t CUDNNWINAPI +cudnnAdvInferVersionCheck(void) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(); + static auto func_ptr = LoadSymbol("cudnnAdvInferVersionCheck"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSoftmaxBackward(cudnnHandle_t handle, + cudnnSoftmaxAlgorithm_t algo, + cudnnSoftmaxMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = 
LoadSymbol("cudnnSoftmaxBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx); +} + +cudnnStatus_t CUDNNWINAPI +cudnnPoolingBackward(cudnnHandle_t handle, + const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnPoolingBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); +} + +cudnnStatus_t CUDNNWINAPI +cudnnActivationBackward(cudnnHandle_t handle, + cudnnActivationDescriptor_t activationDesc, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnActivationBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); +} + +cudnnStatus_t CUDNNWINAPI +cudnnLRNCrossChannelBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnLRNMode_t lrnMode, + const void *alpha, + const cudnnTensorDescriptor_t yDesc, + const void *y, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, + cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, + void *temp, + void *temp2, + const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void 
*, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *); + static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc, bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationBackwardExWorkspaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc, dBnScaleBiasDesc, activationDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationTrainingExReserveSpaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, + void *y, /* NxCxHxW */ + + /* Shared desc for the next 6 tensors in 
the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW + (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ + const void *bnScale, + const void *bnBias, + + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, + + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *); + static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTraining"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); +} + +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t zDesc, + const void *zData, + const cudnnTensorDescriptor_t yDesc, + void *yData, + + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, + const void *bnBias, + + double exponentialAverageFactor, + void *resultRunningMean, + void *resultRunningVariance, + + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + double epsilon, + + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. 
NULL if unused */ + void *resultSaveMean, + void *resultSaveInvVariance, + + cudnnActivationDescriptor_t activationDesc, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTrainingEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData, yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance, activationDesc, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackward(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, + void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *); + static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance); +} + +cudnnStatus_t CUDNNWINAPI +cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle, + cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, + + const void *alphaDataDiff, + const void *betaDataDiff, + const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, + const void *xData, + const cudnnTensorDescriptor_t yDesc, + const void *yData, + const cudnnTensorDescriptor_t dyDesc, + const void *dyData, + const cudnnTensorDescriptor_t dzDesc, + void *dzData, + const cudnnTensorDescriptor_t dxDesc, + void *dxData, + + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScaleData, + const void *bnBiasData, /* needed if there is activation */ + void *dBnScaleData, + void *dBnBiasData, + double epsilon, /* 
Same epsilon as forward pass */ + + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, + const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, double, const void *, const void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackwardEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData, dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, + const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, + void *dtheta) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); + static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, stDesc, dgrid, dtheta); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, + cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, + void *dx, + const void *alphaDgrid, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *grid, + const void *betaDgrid, + void *dgrid) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *); + static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDropoutBackward(cudnnHandle_t handle, + const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, + const void *dy, + const cudnnTensorDescriptor_t dxdesc, + void *dx, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnDropoutBackward"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI 
+cudnnOpsTrainVersionCheck(void) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(); + static auto func_ptr = LoadSymbol("cudnnOpsTrainVersionCheck"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, count); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, + void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, + size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle, + const 
cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle, + const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardFilter(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, + size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t dwDesc, + void *dw) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardFilter"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); +} + +cudnnStatus_t CUDNNWINAPI +cudnnConvolutionBackwardBias(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t dyDesc, + const void *dy, + const void *beta, + const cudnnTensorDescriptor_t dbDesc, + void *db) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardBias"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); + static auto func_ptr = 
LoadSymbol("cudnnGetRNNTrainingReserveSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForwardTraining(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t *yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnRNNForwardTraining"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *yDesc, + const void *y, + const cudnnTensorDescriptor_t *dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnTensorDescriptor_t *dxDesc, + void *dx, + const cudnnTensorDescriptor_t dhxDesc, + void *dhx, + const cudnnTensorDescriptor_t dcxDesc, + void *dcx, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnRNNBackwardData"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeights(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const 
cudnnTensorDescriptor_t *yDesc, + const void *y, + const void *workspace, + size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); + static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeights"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNForwardTrainingEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnRNNDataDescriptor_t yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnRNNForwardTrainingEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardDataEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t yDesc, + const void *y, + const cudnnRNNDataDescriptor_t dyDesc, + const void *dy, + const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ + const void *dcAttn, /* reserved, should pass NULL */ + const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const 
cudnnRNNDataDescriptor_t dxDesc, + void *dx, + const cudnnTensorDescriptor_t dhxDesc, + void *dhx, + const cudnnTensorDescriptor_t dcxDesc, + void *dcx, + const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */ + void *dkeys, /* reserved, should pass NULL */ + void *workSpace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnRNNBackwardDataEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardWeightsEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnRNNDataDescriptor_t yDesc, + const void *y, + void *workSpace, + size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, void *, size_t, const cudnnFilterDescriptor_t, void *, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeightsEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, count); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t *yDesc, + void *y, + const cudnnTensorDescriptor_t hyDesc, + void *hy, + const cudnnTensorDescriptor_t cyDesc, + void *cy, + const float findIntensity, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, + void *workspace, + size_t 
workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, count); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *yDesc, + const void *y, + const cudnnTensorDescriptor_t *dyDesc, + const void *dy, + const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, + const cudnnFilterDescriptor_t wDesc, + const void *w, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t cxDesc, + const void *cx, + const cudnnTensorDescriptor_t *dxDesc, + void *dx, + const cudnnTensorDescriptor_t dhxDesc, + void *dhx, + const cudnnTensorDescriptor_t dcxDesc, + void *dcx, + const float findIntensity, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, + void *workspace, + size_t workSpaceSizeInBytes, + void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t 
rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, count); +} + +cudnnStatus_t CUDNNWINAPI +cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle, + const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, + const cudnnTensorDescriptor_t *xDesc, + const void *x, + const cudnnTensorDescriptor_t hxDesc, + const void *hx, + const cudnnTensorDescriptor_t *yDesc, + const void *y, + const float findIntensity, + const int requestedAlgoCount, + int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, + const void *workspace, + size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, + void *dw, + const void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); + static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + const int loWinIdx[], + const int hiWinIdx[], + const int devSeqLengthsDQDO[], + const int devSeqLengthsDKDV[], + const cudnnSeqDataDescriptor_t doDesc, + const void *dout, + const cudnnSeqDataDescriptor_t dqDesc, + void *dqueries, + const void *queries, + const cudnnSeqDataDescriptor_t dkDesc, + void *dkeys, + const void *keys, + const cudnnSeqDataDescriptor_t dvDesc, + void *dvalues, + const void *values, + size_t weightSizeInBytes, + const void *weights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, const int [], const int [], const int [], const int [], const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, void *, const void *, const cudnnSeqDataDescriptor_t, void *, const void *, const cudnnSeqDataDescriptor_t, void *, const void *, size_t, const void *, size_t, void *, size_t, void *); + static auto func_ptr = LoadSymbol("cudnnMultiHeadAttnBackwardData"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, attnDesc, loWinIdx, hiWinIdx, devSeqLengthsDQDO, devSeqLengthsDKDV, doDesc, dout, dqDesc, dqueries, queries, dkDesc, dkeys, keys, dvDesc, dvalues, values, weightSizeInBytes, weights, workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes, reserveSpace); +} + +cudnnStatus_t CUDNNWINAPI +cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle, + const cudnnAttnDescriptor_t attnDesc, + cudnnWgradMode_t addGrad, + const cudnnSeqDataDescriptor_t qDesc, + const void *queries, + const cudnnSeqDataDescriptor_t kDesc, + const void *keys, + const cudnnSeqDataDescriptor_t vDesc, + const void *values, + const 
cudnnSeqDataDescriptor_t doDesc, + const void *dout, + size_t weightSizeInBytes, + const void *weights, + void *dweights, + size_t workSpaceSizeInBytes, + void *workSpace, + size_t reserveSpaceSizeInBytes, + void *reserveSpace) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, cudnnWgradMode_t, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, const void *, size_t, const void *, void *, size_t, void *, size_t, void *); + static auto func_ptr = LoadSymbol("cudnnMultiHeadAttnBackwardWeights"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, attnDesc, addGrad, qDesc, queries, kDesc, keys, vDesc, values, doDesc, dout, weightSizeInBytes, weights, dweights, workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes, reserveSpace); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); + static auto func_ptr = LoadSymbol("cudnnCreateCTCLossDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctcLossDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); + static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctcLossDesc, compType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t, cudnnNanPropagation_t); + static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptorEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctcLossDesc, compType, normMode, gradMode); +} + +cudnnStatus_t CUDNNWINAPI +cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, + cudnnNanPropagation_t gradMode, + int maxLabelLength) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t, cudnnNanPropagation_t, int); + static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor_v8"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctcLossDesc, compType, normMode, gradMode, maxLabelLength); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); + static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctcLossDesc, compType); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *, cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *); + static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptorEx"); + if (!func_ptr) 
return GetSymbolNotFoundError(); + return func_ptr(ctcLossDesc, compType, normMode, gradMode); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc, + cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, + cudnnNanPropagation_t *gradMode, + int *maxLabelLength) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *, cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *, int *); + static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor_v8"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctcLossDesc, compType, normMode, gradMode, maxLabelLength); +} + +cudnnStatus_t CUDNNWINAPI +cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); + static auto func_ptr = LoadSymbol("cudnnDestroyCTCLossDescriptor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctcLossDesc); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the + mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int hostLabels[], /* labels, in CPU memory */ + const int hostLabelLengths[], /* the length of each label, in CPU memory */ + const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void *workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int [], const int [], const int [], void *, const cudnnTensorDescriptor_t, void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t); + static auto func_ptr = LoadSymbol("cudnnCTCLoss"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, probsDesc, probs, hostLabels, hostLabelLengths, hostInputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnCTCLoss_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the + mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int labels[], /* labels, in GPU memory */ + const int labelLengths[], /* the length of each label, in GPU memory */ + const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ + void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + size_t 
workSpaceSizeInBytes, /* size of the workspace */ + void *workspace) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, const cudnnTensorDescriptor_t, const void *, const int [], const int [], const int [], void *, const cudnnTensorDescriptor_t, void *, size_t, void *); + static auto func_ptr = LoadSymbol("cudnnCTCLoss_v8"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, ctcLossDesc, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, workSpaceSizeInBytes, workspace); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnGetCTCLossWorkspaceSize_v8( + cudnnHandle_t handle, + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet size) */ + const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize_v8"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, ctcLossDesc, probsDesc, gradientsDesc, sizeInBytes); +} + +cudnnStatus_t CUDNNWINAPI +cudnnAdvTrainVersionCheck(void) { + using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(); + static auto func_ptr = LoadSymbol("cudnnAdvTrainVersionCheck"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cudnn_stub.cc b/tensorflow/stream_executor/cuda/cudnn_stub.cc index f683cecdb52..e30f749897e 100644 --- a/tensorflow/stream_executor/cuda/cudnn_stub.cc +++ b/tensorflow/stream_executor/cuda/cudnn_stub.cc @@ -51,15 +51,17 @@ cudnnStatus_t GetSymbolNotFoundError() { return CUDNN_STATUS_INTERNAL_ERROR; } #error cuDNN version earlier than 6 is not supported. 
#elif CUDNN_MAJOR < 7 #include "tensorflow/stream_executor/cuda/cudnn_6_0.inc" -#elif CUDNN_MINOR < 1 +#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 1 #include "tensorflow/stream_executor/cuda/cudnn_7_0.inc" // 2 instead of 3: see https://github.com/tensorflow/tensorflow/issues/32350 -#elif CUDNN_MINOR < 2 +#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 2 #include "tensorflow/stream_executor/cuda/cudnn_7_1.inc" -#elif CUDNN_MINOR < 4 +#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 4 #include "tensorflow/stream_executor/cuda/cudnn_7_3.inc" -#elif CUDNN_MINOR < 6 +#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 6 #include "tensorflow/stream_executor/cuda/cudnn_7_4.inc" -#else +#elif CUDNN_MAJOR == 7 #include "tensorflow/stream_executor/cuda/cudnn_7_6.inc" +#else +#include "tensorflow/stream_executor/cuda/cudnn_8_0.inc" #endif diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index aa8a2f0226d..3bf1388a3f9 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -967,11 +967,30 @@ def _create_local_cuda_repository(repository_ctx): ], )) + if [int(x) for x in cuda_config.cudnn_version.split(".")] < [8, 0]: + cudnn_headers = ["cudnn.h"] + else: + cudnn_headers = ["cudnn_adv_infer.h", + "cudnn_adv_train.h", + "cudnn_cnn_infer.h", + "cudnn_cnn_train.h", + "cudnn_ops_infer.h", + "cudnn_ops_train.h", + "cudnn.h", + "cudnn_version.h", + ] + + cudnn_srcs = [] + cudnn_outs = [] + for header in cudnn_headers: + cudnn_srcs.append(cudnn_header_dir + '/' + header) + cudnn_outs.append("cudnn/include/" + header) + copy_rules.append(make_copy_files_rule( repository_ctx, name = "cudnn-include", - srcs = [cudnn_header_dir + "/cudnn.h"], - outs = ["cudnn/include/cudnn.h"], + srcs = cudnn_srcs, + outs = cudnn_outs, )) # Set up BUILD file for cuda/ diff --git a/third_party/gpus/find_cuda_config.py b/third_party/gpus/find_cuda_config.py index e2ab42abf67..d4214930c14 100644 --- a/third_party/gpus/find_cuda_config.py +++ b/third_party/gpus/find_cuda_config.py @@ -219,17 +219,20 @@ def _find_library(base_paths, library_name, required_version): return _find_file(base_paths, _library_paths(), filepattern) -def _find_versioned_file(base_paths, relative_paths, filepattern, +def _find_versioned_file(base_paths, relative_paths, filepatterns, required_version, get_version): """Returns first valid path to a file that matches the requested version.""" + if type(filepatterns) not in [list, tuple]: + filepatterns = [filepatterns] for path in _cartesian_product(base_paths, relative_paths): - for file in glob.glob(os.path.join(path, filepattern)): - actual_version = get_version(file) - if _matches_version(actual_version, required_version): - return file, actual_version + for filepattern in filepatterns: + for file in glob.glob(os.path.join(path, filepattern)): + actual_version = get_version(file) + if _matches_version(actual_version, required_version): + return file, actual_version raise _not_found_error( base_paths, relative_paths, - filepattern + " matching version '%s'" % required_version) + ", ".join(filepatterns) + " matching version '%s'" % required_version) def _find_header(base_paths, header_name, required_version, get_version): @@ -339,12 +342,13 @@ def _find_cublas_config(base_paths, required_version, cuda_version): def _find_cudnn_config(base_paths, required_version): def get_header_version(path): - version = ( + version = [ _get_header_version(path, name) - for name in ("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL")) - return ".".join(version) + for name in 
("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL")] + return ".".join(version) if version[0] else None - header_path, header_version = _find_header(base_paths, "cudnn.h", + header_path, header_version = _find_header(base_paths, + ("cudnn.h", "cudnn_version.h"), required_version, get_header_version) cudnn_version = header_version.split(".")[0] diff --git a/third_party/gpus/find_cuda_config.py.gz.base64 b/third_party/gpus/find_cuda_config.py.gz.base64 index ee5ddc22044..5378ddd2e2d 100644 --- a/third_party/gpus/find_cuda_config.py.gz.base64 +++ b/third_party/gpus/find_cuda_config.py.gz.base64 @@ -1 +1 @@ -eJzNXHtz47iR/5+fAseJy6RHpu291FZOF++V157JKvHZU7Zm9lK2o0AkLHFNkQofkpVUvvt1NwASBCm/JlsZVa1HAtFAd+PXLxDYd+w0W27yeDYv2XeHR//FxnPBxiItsvxjkq3ZSVXOs7wI2EmSsCvsVrArUYh8JaLAeee8Y+dxCN1FxKo0Ejkrgf5kyUP4Rz0ZsC8iL+IsZd8Fh8zDDq565Pr/DSNssoot+IalWcmqQsAQccHu40Qw8RiKZcnilIXZYpnEPA0FW8flnKZRgwAb7M9qiGxacujNof8Sft2b/RgviWH8zMtyOTw4WK/XASdmgyyfHSSyY3FwPjr9cHH9YR8YJpLPaSKKguXib1Wcg6jTDeNL4CfkU+Ay4WuW5YzPcgHPygz5XedxGaezASuy+3LNcwGjRHFR5vG0KlvK0tyBzGYHUBdPmXtyzUbXLvvx5Hp0PYAxfh6Nf7r8PGY/n1xdnVyMRx+u2eUVO728OBuNR5cX8OsjO7n4M/vT6OJswASoCqYRj8sc+QcmY1QjLR27FqLFwH0mGSqWIozv4xDkSmcVnwk2y1YiT0EcthT5Ii5wMQtgL4JRkngRl7yklo5QOM3xv/TjuK77KY9TgOHp57MTmH6a83yDzLC54Dh/BEsUllkeC+KRrST6AFIZMIiKJSk3RSkWgeMg4IswjwFnheA5YKEgVWwbHoFZtEcZwIqj1srCgcYFQiASJaoqJRXHuWaCBlpK/pE+zNL7eFblpECkK8ooq8qAuFrycl5IPNHoRIxUNQ5r0QBget0QgvM8q2ZzJtJVnGfpQqSls+J5jGgFUx7dg6mxFU/iyGIgVkoaSOGkVjS7xJzIc1r4XJRVTiBg0ATqCrNIKG0mAGO0Pak8XAagvY+BeRi/4ZIj27MKuQOmrqvlMssR+Q0Zmg0tgxenYVJF0BRWP56fXPsD+HJ2cTFgF6en5wNSjHRaV+P2kpb8AQeqeZpyQLqJkIYfMGqUevxxgnNOPp2Mf7p2DBUyrULkHPzRgu8XYslBdUA8S7IpTRIwY/Ykyx4kmiR4Cgc51aCSSCJ3Ned5tI8qjACExGhRTU027/NsgewB9yQCYSNwYClb/KKG0Y3WUoFu2OV1Y9ORuOdVUmI/cLXR0HEYGGtaPQ7BKx5URX6QZCFPDsIq4gNqkNqVWCQudpNIoobtL3cDoP8ZljZbF0NW88H61DYA/+Ow5nM6vL39lGeznC/YR1TF7e3Fl9HZ6IT94dNnCEuLZYUulI2zLHmIy9tbHP72ds9xPoJCpzx8AL8a0VIAd/E0TuJygw53IUwMJUVGUYUnYJEpdFxJFTod9WbSWfawDqtEoWTTNrQAVEewctSS7td67h1FaVuuPqvDIvRtsMVOoihGY+RJB6xAvf+VHxiCLIo+GjpfPlxdQ/DQ64JN48vL8z+NxrSaQCPNrqbBHy2qfhqwUGOei4v2RLJpdHE9Pjk/10RozzVz+MNijppMGukCJj+dXekhtCOgIcYQxy+vrsbGMHVTa2rni/al/QAIwdNNhU4pwKTBEB53MaTuPgabXXRx6JYF4IE7NQyUg4ZIvFiWG+xdpYafzhgPKcPh6Ub3Vf4LogDgH+fjKnBptwGma3AxdMr7ye/Vsx8mapAhA6aCv7efSR80AUANWRAE7YfqS/MUAq3jQLoATpnFmf6WFfobejz9fZnwEpnRv8Fvq2/gxpZ5FkL2UbdsCgfSguUmgbAyxJwHFXw825fP98F97fNyv8yWTplvhrCgZJzFHLxBwtQg63kczh2VHo6o7QOGobo7plJIUATFkq9TTYdRaCIeRViV2pPLoRqGRLqVH8cJEw551Cm5P5rQ+0A8gMp9nHsJj6EbeFk2iQtQKnhWj57IgFlrKpCZg+dDfoTpMPRzDcK1dKkvIFXO1yRe8DB7CekZz2GemnLBS0x+NIQ8HpYVTyY1hHXWoVtoAkDJ6VyED6hHQf6T/K9OdxZCqExHUcto76g4gEWFSGSjBrU9DUtVVg3Wh2ksmmVM2ZcapM1nPTZ+OmNZnbFHAeHQIHmNJ9WfI2Z/joJ22zivhEkQfGcTWL8/Qtx6hiL4zycpuh97DmJKr0Q+K4Y1ta23ISU1WmmdlAnCax7UxG0Vt0kxLQ4xX9L5jIxGkOCWPEkoCdUDXckMc8h+VsDC3nLsBl4Ssu2kOEtFIJEJfyFuW0sO7vMCukhhlX1I3UELVEVQgS1BRB4aWVedaOPA1njHNgCxhlt6vtMDv+NOU6s38dIZjedlgTWv1zFAbbm8nCSCF+XrTFfp8pjdgN/zVj7lqSvKRy0WoGQsPTdw/TtDqD7CrnQWqSEh+6HRhhZkJkodpbQomHgOWMoXQvsbBQx0Biq0wPRADsUWZdOnZqUWSBgghwlWY/A8zoJsKfTIbu5CdE4h94Zc89ityvv937l+kMMQSKD8KJNQowUM6KvnvpNzsp2Cvfduo/e+y3aI0QFN5RMZ4I+6a9tSKqC2YAZl2tI7Mhbfrb1xCMsuipinExAyqsLSu4/zooT8VkD2HdnKAOuhHB2z4WmcqoIcXCpRUQovCZVC1Hw3WREgWfBLFqfePYwuV/MeFSVJ8SfpVdLfmWuVRBNZCkyoPvD6uOqk2e36oTZTLFusmNmwiejRZDQXrAQFbs/Vza6P6ZV7UID4B3Uj0KlM6tjIRYIQY9ZEPvFuWiMDJPaX7p1P0bzEmkGuOpYZACgA8x47/oF5wZ7vypXDCILDi5Ls2MSanEGaQQtOKrUxkaVmU/DC3kEksCb0XF6Ecez6ElIq6/mcxvjwjLrUyY85YBMRuihEngMeRZ4GACwTYtdrI9M3sFlQee5hkevJAXzfBIMqLSdYOio80FfT56h1NtslT2YLsO7u4cqJRPUHB0KbgM+RtX6+BzIYRs7aSaoabNWBC1Shcv8A5PFasdSta1t3YD14Wxm72oGO4DFa
g7X0RU8Mt3njWvX5/k6BLsckAvBanVzVZLJt9UlT9w601W/ReoWVW+638+72RS4SWWrXm1i4F2JshcjdD8sbKRbdmllXbfx0GpRsduvePrmP/VlaNQ/FY5nz4uD086fx6ODJAWUfelb7OV0Y/Qskb238PCs69P7+t62frR/9suKDR5OsJfxeWzQwrQnVlhPaw/Nwv0HKOajF0L+RZ+WhSAdNXzRXVwaQG/c21SDb3Sl2EZ7krVF6+hLXfqQZwL+TXsac8LWDtqnvDLdlFmtaKadZlUTkWWhbEqvvHbkTB9+MnbfNEKw0ze7pn8ZYPUMZXU0ZgtXWQ3UnUr1Kx6aAPQnB9qGUh6NIju9RgB7L9QD/eK2QLyOeObFvpSr4DLXJ40K8HTItRSg7aBFrM5MJ1LZaU5udTE3kFjZpCMxMFwGiaHaRN01+0RMBDAYBbyYHFDzQ4nQUsupqm9bdKfaCaIMUgE5MeDV2qel9a3Bwtk/Wae3PEzn1zfDojiK0gEDfy9aTfEBkKDL3GW6emx41tWc4si1Qt93oE+hQE4nXGcxguxC2CAOGke4VyOLSiso5Lzslp0SbVk5Tbfz77LZTnxrSkufyVcf4rZs+Hf8wsCbd7i8U9ROLqnqYQAaISb1jHqXl0qFga1VMYJLJRkvlKpPpdzRvwIbKZ16MjpahPAF2K+VqM/6MA3larJaGKH2UOZ+FzM7Kw6RIt6VKV9BocIe7A9tLetd89aBrG5Xvt3L8WmO4bWNm7u5OFOxE5HD1pAcH7Ojw8HBQc7FDv2W77zgqxTY0W6u14XsrclwkDubuc9p/0Yq8YYyuLn2nUwNZ4rB3+CoAtw0E49NsJYJaC6ZLrsXui80kd166A6tAMfCQrsKwDw1GLDoFWiaraP26O0uk/Qt80XUbvb8N4M+AffH0d/rju7VX1HX11lJeV/D7ihco5Huq757tnDqd2154d+vo7fs5bdBCAymIUgtQBf4IxKNwO4kJhXLZwdVUagvM0HC9Wk+4j56KYhqnsgAYNOwMrPrRXkxfMr9atGGyfTrsewAoisQqDo06q4BSyS7bsFqBijuK+T4VtKWskS1yYNetW/aCo8NgGrrKmJdlbFrzdv463tQlYrBmvx7oNQYB/d0eJ1kbF27CyddpMJK9yVIvq6+Va7x+6+2v9O/LTer/Oz/RU8tYoxRXRyUqa5blI5dHYGrtBUQ+vjy79MIins38Ib2yQpOaZkBqbtbhpjzoLNASqTnq+kzHcs1smuX4In7eTlQsZYDuAkgd/cGLiG3VEPVBPYAvM/sudzeHd+w/jvseHN2pzR9KT8za0B2lEAWLGIROS/lawtTqECvEVcFoz2WL7+7OR+ulPME/NOpNg3OHbftr9VFbEyg59LNBYUcyv01sqG0bsYl3i7pZMc2hsYamqVvzdFasGRUt7DmJLHP2LfLnZLKN2BLKXBotlo0bpPinlSBNE168JEWy4uNQ7z123s9YW3bg0Y5c2f0lGZaZY5m7lM+8PjF6YiilWAR277nNeY7J/5788fLKpeSsaRtdYNu2fMXs+ulkfPpTEy51nqarUNNJMvb2JIzWgy/jt6RiX52H9Sdh6FbVIZmqUE55wX/JmpN/WZpsArVpTQJsS9nMCvvwzjHz4k651gZSe+BmI6fr6xSrdZZcsCgT8uiYzIzI/zWPt/s8xrz2tJYR0GI3+xPv8HVwLuQ5tXqCOL2HSmrFITPEUxlTAQBVr4bRNgbsl6pQe3V07gVf7NkYel3oNyBkLZ/xnpgOQaKZ8KKtEA9San/LYtoAQ/lfk1jgcG5nLfvjCHV52qdu8aZE+bQ77ThS2y1GafrrFI6NU3upS7MdGp4sM3wZ/ZRuTP8kV3X+4cuHc+2vnvJWX1Ewpunr3dRXOqmtdSIs2Mu9zqvKwzSV1WEzw7bUp+khQ3Dze9Du9SZcI+FXwToNw+TbRDWdc6xBLX9pTNOvVvT9NdCMuvlGwEzL9GtgGQcGjZrj9yLZ7IB5r/Fz0OrzBhgT3VehuKQzsHn5jSL5S3MutwG02fhUytnq2cL8O/ZXxcRfMXxzECgVOYd6lmXTX6CwxdPhbE3XLqBjyaC2oy1cuh2gso6qiNOZGg6ee4sqLuMlbv/HC0iRpiLJ1ir2NxqhQxGtdFBlbDxJvEbP2iDqnSGc5Is6WybfR8rrFwv+IDrZHtOXLuhgR/CsmddnTN5q7xerUXov8m8ozVZnX4xctk4r9anv79kiW4nIPLZH+WW+qC/ZKLmU4oN5J5d8i5bq0b4RZSEAtBd4oaN8lZ9cxSg1WK49S6+7tDuBS7ObBp2+b3CdNe1b3Sda8gTPqU1EuvLgP7VLqk43Hd/c2W+F7KtA5EzopFvfTYLmhbQem86J1UePWmeTmuYb3ftOr9nANQ45KO5aZ/PEjIdSwheJweXmYetwL3hGGqVfElzn7iWOQd8djd47F3RipsxglF15IAnPrvzu+8n3v21OtuyyNZ3VWOZiFWdVkWxkGUi7ld3beQ5TN6PYGZ00Bi+Q4d0f6KA0RVPtDuRln3Us7z3iRSJ0DNbq+61zxE+tV/d06F+8g5u/HLC7Pf89CXa7ft9IdfA/vwHD6VvdJ0+N3rRfMxhnw56Drd8c+QE/mMR/l4bchHcDCHUX+ZJzIC/cai0VUDjO5QVIdfVAIVq5DG1pueBJM4Wz7QiGolIky4SHwnNvb+nUmglw7KBk6Ly17CI5ikO6QpXTHeCnbnIquEgRmltjx+yG57NVALEe3L08E4sN9O5pUwT4/eZoeHfndN6/WacIXeuSFUpGgrUOUFnr57Yu9T0bUF50/LIzJ70ra51VMB7DaumVFI/AWyHX8U76dnXi9R//lOtKu6t0QrPWoHE8c5uMnTtjbuvslPKDdFC1WkbgWr3uC+tmgr5tp87ayOFu2jvwd+Y+jma2YUQbZFlBEujZJ97NkYxo6rPfM+/ocMCO6qzvHfuxs62ltpzWvFBXWuR2U338DDeejNc7+mqExWujhP4dqV5Emlf4akzaCm/y/J6dcJMLe7+qbzUUUNJ0C1Kgcq+hYgcwtxtnetBi73D0m2JzD3Gr3D07XAaD9kZHLR3Vjj3CUT23XbZOvOwRzap3eyQzL0s+I5i5x9HwZhW9tVQ6peqTrE63tkvXG/p7JOxJVnuktO9zPiOpXQe3+e3JXUlqtO6Hgb7SQqPGpVjUQQu08gCcRfJajks5Jt0+aLXSWyaj6CPX83CHSrKi8MpMmdWNNBnrFhxqOhqjLuaIObEZ4DmkShgHartxUXNdM0FX+T13pxjKPX3mNSP5/fUVuiC1d4+BT/4vAAL831sIryhzT6jaGx9CpCjx+IPjYLCnDGQyoauGkwlKMpm4OJIUyvl/z0/IjA== \ No newline at end of file 
+eJzNPGtT5DiS3/0rdO4jsOnCwNzGxF7dMhcMdN9wy0EHVDO3AbW1KlsUHly2z4+C2o3975eZkmxZdvHqmYipiGFsWZnKTOVTj/7AjrN8XcSL+4p9t3/w72xyL9hEpGVWfE6yR3ZUV/dZUQbsKEnYJXYr2aUoRbESUeB8cD6wsziE7iJidRqJglUAf5TzEP6nvozYtSjKOEvZd8E+87CDqz65/n8AhnVWsyVfszSrWF0KQBGX7C5OBBNPocgrFqcszJZ5EvM0FOwxru5pGIUEyGB/USiyecWhN4f+Obzdmf0Yr4hg/N1XVT7e23t8fAw4ERtkxWIvkR3LvbPT40/nV592gWAC+ZomoixZIf6vjgtgdb5mPAd6Qj4HKhP+yLKC8UUh4FuVIb2PRVzF6WLEyuyueuSFACxRXFZFPK+rjrA0dcCz2QHExVPmHl2x0yuX/Xh0dXo1Ahw/n05+uvg6YT8fXV4enU9OP12xi0t2fHF+cjo5vTiHt8/s6Pwv7M+n5ycjJkBUMIx4ygukH4iMUYw0dexKiA4Bd5kkqMxFGN/FIfCVLmq+EGyRrUSRAjssF8UyLnEySyAvAixJvIwrXlFLjykc5vBX/Tmu634p4hTU8PjryREMPy94sUZi2L3gOH4EUxRWWRELopGtpPaBSmVAIAqWuFyXlVgGjoMKX4ZFDHpWCl6ALpQkik3oUTHLLpYRzDhKrSodaFyiCkSiQlGlJOK40EQQolzSj/Bhlt7Fi7ogASJcWUVZXQVEVc6r+1LqE2EnYIRq9LBhDRRMzxuq4H2R1Yt7JtJVXGTpUqSVs+JFjNoKpnx6B6bGVjyJI4uAWAlpJJmTUtHkEnGiKGjiC1HVBSkBgyYQV5hFQkkzATVG25PCw2kA2LsYiAf8LZUcyV7USB0QdVXneVag5rdgaDY0DV6chkkdQVNY/3h2dOWP4OHk/HzEzo+Pz0YkGOm0LifdKa34AyJqaJpz0HRTQ1p6wKiR68nnGY45+3I0+enKMUTItAiRcvBHS75bipyD6AB4kWRzGiRgxuhJlj1IbZLKUzpIqVYqqUnkru55Ee2iCCNQQiK0rOcmmXdFtkTygHpigXQjcGAqO/SihNGNNlyBbNjFVWvTkbjjdVJhP3C10dhxGBhrWj+NwSvu1WWxl2QhT/bCOuIjapDSlbpIVGwnkdQatptvBwD/M0xt9liOWUMHGxLbCPyPw9rf8fj29kuRLQq+ZJ9RFLe359enJ6dH7L++fIWwtMxrdKFskmXJQ1zd3iL629sdx/kMAp3z8AH8akRTAdTF8ziJqzU63KUwdSgpM4oqPAGLTKHjSorQ6Yk3k85ygHSYJQol666hBSA6UitHTeluI+dBLEracvZZExahb6tb7CiKYjRGnvSUFaB3v/EHKMii6KdV5/rT5RUEDz0v2DS5uDj78+mEZhNgpNk1MPjSgRqGAQs1xjk/7w4km07PryZHZ2caCO25IQ5fLOKoyYSRLmD208mlRqEdAaGYQBy/uLycGGiaps7QzrX2pcMKEIKnmwudUoBJgyE8bWNI3X4K1tvo4tAtC9AH7jRqoBw0ROJlXq2xd50afjpjPKQMh6dr3Vf5L4gCoP84HleBS7sNMF2DirFT3c3+pL79MFNIxgyICv7e/SZ90AwUasyCIOh+VA/tVwi0jgPpAjhlFmf6KSv1E3o8/ZwnvEJi9Dv4bfUEbiwvshCyj6ZlXTqQFuTrBMLKGHMeFPDhYld+3wX3tcur3SrLnapYj2FCyTjLe/AGCVNIHu/j8N5R6eEptX3CMNR0x1QKAcqgzPljquEwCs3EkwjrSntyiaolSKQb6XGcMOGQRx2T+6MBvU9EA4jcx7Fz+AzdwMuyWVyCUMGzevRFBsxGUoHMHDwf8iNMh6GfawA+Spf6ClDlfE3gJQ+z14Ce8ALGaSCXvMLkR6uQx8Oq5smsUWGddegWGgC05PhehA8oR0H+k/yvTneWQqhMR0HLaO+oOIBFhUhko1ZqexiWqqwarA/TWDTLmLIvhaRLZ4Mbfz1cVmfsUUI4NEDe4kn174DZv4Og2zYpamECBN/ZANb7Z4hbL0AE//YsRP9nj0FE6ZkoFuW4gbblNqakRgutlzJBeC2CBrgr4i4opsUh5ks6n5HRCBLciicJJaEa0aXMMMfsZ6VY2FvibtVLqmw3Kc5SEUjNhL8Qt60pB/d5Dl0ks8o+pOygBaoiqMByYJGHRtbVJNqI2MJ3aCsg1nC55zsD6nfYa+r0Jlp62HhRlVjzej0D1JbLq1kieFm9zXSVLA/ZDfg9b+VTnrqifNQiAUrGynMD158aTA0B9rmzQA0O2Q+tNDQjC1HpKKVZwcRzxFK+FNrfKMVAZ6BCCwwP4FBsUTZ9bFZqgVQDpDDBagy+x1mQ5UJjdgsXonMKuTfkmoduXd3t/tH1gwJQIIDyo0yqGk1gQI+e+0GOybZK9tG7jT76LtsiQkc0lE9goH/UXduWEgG1BQso03LvwJh8t/HGIUy7KGOezoDJqA4r7y4uygryWwHZd2QLA6yHcnTMhudxqgpycKkERSm8BFQCUePdZGWAYMEvWZx6d4BdzuYdCkqC4ivJVcJPzblKopksBWZUH3hDVPXS7G790Jgpli1WzGzJRO3RYDQWzAQFbs/Vza6P6ZW7VwL7e00jwKlM6tDIRYIQY9ZMfvFuOphBJXZzd+pTNK+wZpCzjmUGKBQo8w47/IF5wY7vypnDCILoRUV2bOqaHEGaQUedVGpjapYaTakX9g4igTWh5/IyjGPXlyqlsp6vaYwfT6hLk/yYCNuI0NdCpDngUeRpBYBpQt31uprpG7pZUnnuYZHrSQS+byqDKi1nWDoqfaBH0+eoeTbbJU1mC5Du7uDMiUT1BwdCi4AvgXVePwIYoJGj9pKqVreawAWiULl/APx4nVjqNrWtO7I+vK+MXW1BR/AYHWQdedEXw23euFZ9vrtVossxgUB5rU6uajLJtvqkqTsFaQ1btJ5h5ZaH7by/fFGIRJbazSIWroUYSyFy9cPyRopEtyHWVQs/vQbFm926s0vuY3eR1u1H8VQVvNw7/vplcrr3LELZh741fk4XRr8C552FnxdZh97f/6Hz2nkZ5hU/PJlgHeZ3uqyBac2otpzRGp6H6w2Sz1HDhn5HmpWHIhm0fdFcXRlAbtzbVCvZ9la5jepJ3hq5p4e48SMtAn8qvYw54FuRdqGnhtsyizUtlOOsTiLyLLQsidX3llyJgydj5W09BitNszv6X2usniGMvqQMxhrroboTod4kY5PBgYRgMyrl4SiS4z4KwGO5HuAfrxPyZcQzB/atVAW/oTR5XIr3q0xHEMoOOsDazGQCtanW1GYnUxO5hE0SAjPTRYAo21XkdZtfDEQAg0DQN5MCCh5ocToKWXW1DetulTtBtEYI0E5MeLXuUtPHDnJwts/Wad3fMzn1zfhgShFaQKAfJOtZOiAylJn7AjUvDY+S2jEc2QZVt93oM9qhBhJvM5hytJkLm4cRw1D3BtXi0oyqe171ak6pblo6jbpV61yYfgISa/Q3YIk3mE
GNWFXniZj2Zg093435Pv0VHYHWjDjtDKkt/tv8xUBlbIiZZOE3XeP3Lji1c9p4p5E18GZvpeCfUSkdIcA2VE3UmUJQdqkAmNFpPnVQ2lifk1rLtKczVyqnGnZ571BSlVm9Wk07JvuM2VnJX5fwF1zZ82x1JESJrMw+LZXu6QEMinAb1guUorR6iOsUmxcXXHMTRFdZqvLoVBuNxHAByawh3K0o2IrI9etB9/bYwf7+/qihYoveZbvvOCrZNyTbiLWle6PmuAgc3LsvSf9VM/IOHH1Z+k6vGrPYYR9wUwIXMATj82wlgkYKZnBo2B7KEojvonJHVqlk6EO6CsMhbTCi4jHAMlnP6433LJG+QOCW22308TaAPyN27eln+uO7jTvVFf7GRQW9lrCraHGn/sA6wMDCUpNYbl4C6Ff0m1eWukoLDSQgSnJAFPgSiCfh9lIkSipkB1dDqcU4Q8LNbD3jPgZqm3mcylJk1JIzsipZezJ9Sfxq2VWTzcNh3z3Qokis4tCo+Eoo2uwCEusmqP2jmO9SaV3Jat0CB3LdpmUnONgP5qGrjDmvYtOaN9PX86YuAYM1+w2itxgE9HcHnGRjXLgcKDf2AJO93NNMq6+Fa2wEDvZX8vflcvn/nh3poWWsUYJrohIVWHn1xOVhnEZ6AYFPLk4uvLCMFwt/TJtnaFLzDEDNZUPcHgCZBZojNUZTKeq4rolNswKPBNx3UxdLGBjhIYn1R68CtkVD0HsNAl8mfX3qbvan7F8Ohz4cqNxPpipmleqephAFS8gSca+LNkhMqY6xVl2VjFZ/Nvju/ng0X8oT/ENrvWlw7rhrf50+apEEOYd+tlLYkczvAhti2wRs6rsF3c6YptCYQ9PUrXF6M9ZiRQt7iSPLnH0L/CWebCO2mDKnRrNl6w1C/NNKkOYJL1+TIlnxcaxXQXs7RdbiIXi0A1d2f02GZeZY5nrpCxs5Rk8MpRSLwO49tz1ZMvufo/++uHQpOWvbTs+xbVO+Ynb9cjQ5/qkNlzpP0/Ww6SQZe38SRvPB8/g9qdg352HDSRi6VXVcpy6VU17yX7L2DGKWJutALZ8TA5tSNrPW3586Zl7cK966itRF3C4p9X2dIrXJkksWZUIeYpOZEfm/9vNmn8eY1x3WMgKa7Hal5ANuTBdCnphrBojTO6ikVhwyQzwfMhegoGqTGm1jxH6pS7VqSCdwcIvR1qG3hX5DhazpM3as6TgmmgkvuwLxIKX2N0ymrWDI/1sSC0Tn9uZyOI5Ql+d96gZvSpDPu9OeI7XdYpSmv03h2G4Ovdal2Q4Nz7gZvoxepRvTr+Sqzj5dfzqTu+SbvRUan3rGvIISdJ3Vv8+Hvc3loLWCqNHZMfnYeApQ3jci+0b3t7ECbYl6hT97U+GZprLubEfYlFS1PWRwb99H3V7vshgE/CaDScMw+W3sxftGe6GznI25yDdtLfTWievPRfX3xnSUzduj+W+izDRNv4UuI2KQqIl/UJPNDphRG6+jTp93qDHBfZMWV3TOt6h+p5p83Z49bhXabHwume307Oj8B/Y3RcTfMDHgwFAqCg6VMsvmv0DJjCfg2SNdLYGOFYOqkRaH6QaEymfqMk4XCh1895Z1XMU57nDES0i+5iLJHlVW0UqEDn50Ek2VC/Ik8Vo5a4No1pxwkGt1fk7uucorJkv+IHp5JNMXS+jwSvCimTfnaN5r7+er0/ROFL+jBF6d7zGy5CZh1Sfbv2fLbCUi82giZa7FsrlIpPi61vG5l6W+R0oNtt+JsFABtBd4paN8k59cxcg1WK49yqC7tDuBS7ObRr2+73CdDex73Sda8gzP4s1EuvLgP7X+qk5wHd5M7f0m+7oTORM6zTd0W6LdBdW46Sxcc7yqc/6qbb7Rvad6zkaucZBDUdc5fygWPJQcvooNLpclOweYwTMSlmFOcJ77F1VGQ/dQBu+V0KmgKgMs2/LQFZ7P+eP3s+//0J7e2WaPdB4lL8QqzuoyWcsCk9ZB+zcQHaZuf7ETOk0NXiDD+03QQUmKhtoeyQtNj7G824mXpdAxWLPvd85KPzdf/ROwf/X2bv66x6Y7/kdi7PbxY8vV3n/+KxjO0Ow+ezL2pruBYZx/e0lt/fZYE/jBJP67NOQ2vBuK0HSR26cjealYS6mEkvReXvJU1yuURiuXoS2tEDxph3A2HTNRUAokT3goPPf2lk7mmQqOHRQPvf3QviZHcUjXxAq65/zcbVWlLpKF9mYcVLe8WKwCiPXg7uW5X2ygXa11GeDzzcF4OnV6O3vWSUnXukiGnBFjnUNi1vy5nYuLLwaUVx0x7Y1Ju3Cd4xPGZ5gtPZPiCWgr5TxOpW9Xp3r/8U85r7RuS6dQGwkaR1A38di7F+d2zocpP0iHces8Atfq9bfC2wGGFrR6cyPR3XTX9qfmCpEmtiVEGySdSfHsU/0mJiOa+uxPzDvYH7GDJuv7wH7sLZipxaxHXqprO3Ihqzlih0taxsaRvv5h0doKYXita1AjzWuKjU7aAm/z/IE1dpMKeyVsaDaUoqTpBk2Byr1RFTuAuf04M6At9grHsCm2dy038j2wdmYQaC90NNxR7TjAHNVzm3nrxcsB1qx6d4Az80LoC4yZaxwtbVbR23ClU6ohzpp0azN3g6F/gMOBZHWAS/vO6guc2nVwl96B3JW4Rut+GOlrO4Q1rsSyCVoglQegLJJXj1zKMemGRaeV9q+Moo9cz8MUhWRF4ZWZMqtbdzLWLTnUdISjKeaIOLEe4QmnWhiHhvtxUVPdEEH/XIHnbpVjuVvAvBaTP1xfoQtSuwIY+OQ/cxDgP+EhvLIqPKFqb/wIkaLCgxWOg8GeMpDZjK5TzmbIyWzmIibJlPP/j1wLdg== \ No newline at end of file From ec7ea83d9d416ac5322061535a2251658cbf5d22 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Thu, 14 May 2020 14:19:16 -0700 Subject: [PATCH 0611/1533] [TF:TRT] Enable concatenation_test and biasadd_matmul_test for TAP. 
PiperOrigin-RevId: 311604247 Change-Id: Ifca2be4bf2f40dc48f2beffb76bea94fe52101b4 --- tensorflow/python/compiler/tensorrt/BUILD | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD index 1e4c215994f..192ba71cebd 100644 --- a/tensorflow/python/compiler/tensorrt/BUILD +++ b/tensorflow/python/compiler/tensorrt/BUILD @@ -120,8 +120,10 @@ cuda_py_tests( srcs = [ "test/base_test.py", "test/batch_matmul_test.py", + "test/biasadd_matmul_test.py", "test/binary_tensor_weight_broadcast_test.py", "test/combined_nms_test.py", + "test/concatenation_test.py", "test/const_broadcast_test.py", "test/conv2d_test.py", "test/dynamic_input_shapes_test.py", @@ -155,27 +157,6 @@ cuda_py_tests( ], ) -cuda_py_tests( - name = "concatenation_test", - srcs = [ - "test/biasadd_matmul_test.py", - "test/concatenation_test.py", - ], - python_version = "PY3", - tags = [ - "no_rocm", - "no_windows", - "nomac", - "notap", # b/140261407 - ], - xla_enable_strict_auto_jit = False, - deps = [ - ":tf_trt_integration_test_base", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_test_lib", - ], -) - cuda_py_test( name = "quantization_mnist_test", srcs = ["test/quantization_mnist_test.py"], From 501309eef9b43a3e965c2ec6a315ea76fffa2c90 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Thu, 14 May 2020 14:22:42 -0700 Subject: [PATCH 0612/1533] Only add sub_index to _Arg nodes. PiperOrigin-RevId: 311604877 Change-Id: Ib7c941b38e6ea38378bd4d9d44dc1d262ee6dd4a --- .../core/common_runtime/replicate_per_replica_nodes.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc b/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc index cfbcde82ce2..fbae80aef55 100644 --- a/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc +++ b/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc @@ -42,7 +42,9 @@ class ReplicateHelper { Node* replicated_node = graph->AddNode(node_def, &status); TF_RETURN_IF_ERROR(status); replicated_node->set_assigned_device_name(device); - replicated_node->AddAttr("sub_index", i); + if (replicated_node->IsArg()) { + replicated_node->AddAttr("sub_index", i); + } replicated_nodes[i] = replicated_node; } replicated_nodes_map_.emplace(node, std::move(replicated_nodes)); @@ -214,7 +216,9 @@ Status ReplicatePerReplicaNodesInFunctionGraph( // Reuse the original nodes if there is only one allowed device. for (Node* n : cluster_nodes) { n->set_assigned_device_name(allowed_devices.at(0)); - n->AddAttr("sub_index", 0); + if (n->IsArg()) { + n->AddAttr("sub_index", 0); + } } continue; } From 6f57007fb8713c2e2cb3f2aa8971544b67cc2516 Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Thu, 14 May 2020 14:32:59 -0700 Subject: [PATCH 0613/1533] Use subTest on einsum_test to make errors easier to understand. 
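For readers skimming this patch, here is a minimal, self-contained sketch of why `self.subTest` helps (illustrative only, not part of the change; the test class, method, and values are made up): each loop iteration is reported as its own sub-result with its parameters attached, and later iterations still run after one fails, so a broken case points directly at the offending inputs instead of aborting the whole test at the first failure.

    import unittest


    class SubTestDemo(unittest.TestCase):
      """Shows how subTest attributes a failure to its loop parameters."""

      def test_squares(self):
        cases = [(2, 4), (3, 9), (4, 16)]
        for base, expected in cases:
          # Without subTest, the first failing pair would abort the loop and
          # the report would not say which (base, expected) was being checked.
          with self.subTest(base=base, expected=expected):
            self.assertEqual(base * base, expected)


    if __name__ == "__main__":
      unittest.main()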
PiperOrigin-RevId: 311606884 Change-Id: I7f8738ffc26479f98d468431706c7d4f7c6efcfc --- .../python/kernel_tests/einsum_op_test.py | 72 ++++++++++--------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/tensorflow/python/kernel_tests/einsum_op_test.py b/tensorflow/python/kernel_tests/einsum_op_test.py index a6b623b828c..47d5d457193 100644 --- a/tensorflow/python/kernel_tests/einsum_op_test.py +++ b/tensorflow/python/kernel_tests/einsum_op_test.py @@ -42,10 +42,11 @@ class EinsumOpTest(test.TestCase): r = np.random.RandomState(0) inputs = [] for shape in input_shapes: - arr = np.array(r.randn(*shape)).astype(dtype) - if dtype == np.complex64 or dtype == np.complex128: - arr += 1j * np.array(r.randn(*shape)).astype(dtype) - inputs.append(arr) + with self.subTest(s=s, shape=shape): + arr = np.array(r.randn(*shape)).astype(dtype) + if dtype == np.complex64 or dtype == np.complex128: + arr += 1j * np.array(r.randn(*shape)).astype(dtype) + inputs.append(arr) input_tensors = [constant_op.constant(x, shape=x.shape) for x in inputs] a = np.einsum(s, *inputs) b = self.evaluate(gen_linalg_ops.einsum(input_tensors, s)) @@ -160,10 +161,11 @@ class EinsumOpTest(test.TestCase): input_shapes = [(2, 2), (2, 2)] inputs = [] for shape in input_shapes: - arr = np.array(r.randn(*shape)).astype(dtype) - if dtype == np.complex64 or dtype == np.complex128: - arr += 1j * np.array(r.randn(*shape)).astype(dtype) - inputs.append(arr) + with self.subTest(dtype=dtype, shape=shape): + arr = np.array(r.randn(*shape)).astype(dtype) + if dtype == np.complex64 or dtype == np.complex128: + arr += 1j * np.array(r.randn(*shape)).astype(dtype) + inputs.append(arr) input_tensors = [constant_op.constant(x) for x in inputs] if dtype == bfloat16: # np.einsum doesn't support bfloat16. 
@@ -199,14 +201,15 @@ class EinsumOpTest(test.TestCase): ('...ij,...jk->ik', r.randn(2, 2, 3), r.randn(3, 4)), ] for args in cases: - with self.assertRaises((ValueError, errors.InvalidArgumentError)): - _ = self.evaluate(gen_linalg_ops.einsum(args[1:], args[0])) + with self.subTest(args=args): + with self.assertRaises((ValueError, errors.InvalidArgumentError)): + _ = self.evaluate(gen_linalg_ops.einsum(args[1:], args[0])) - placeholders = [ - array_ops.placeholder_with_default(x, shape=None) for x in args[1:] - ] - with self.assertRaises((ValueError, errors.InvalidArgumentError)): - _ = self.evaluate(gen_linalg_ops.einsum(placeholders, args[0])) + placeholders = [ + array_ops.placeholder_with_default(x, shape=None) for x in args[1:] + ] + with self.assertRaises((ValueError, errors.InvalidArgumentError)): + _ = self.evaluate(gen_linalg_ops.einsum(placeholders, args[0])) @test_util.run_in_graph_and_eager_modes def testPlaceholder(self): @@ -216,10 +219,12 @@ class EinsumOpTest(test.TestCase): inputs = [] input_placeholders = [] for actual_shape, placeholder_shape in input_and_placeholder_shapes: - input_np = np.array(r.randn(*actual_shape)) - inputs.append(input_np) - input_placeholders.append( - array_ops.placeholder_with_default(input_np, placeholder_shape)) + with self.subTest(equation=equation, actual_shape=actual_shape, + placeholder_shape=placeholder_shape): + input_np = np.array(r.randn(*actual_shape)) + inputs.append(input_np) + input_placeholders.append( + array_ops.placeholder_with_default(input_np, placeholder_shape)) a = np.einsum(equation, *inputs) b = self.evaluate(gen_linalg_ops.einsum(input_placeholders, equation)) @@ -288,19 +293,22 @@ class EinsumGradTest(test.TestCase): with self.cached_session(): r = np.random.RandomState(seed=0) for dtype in (np.float32, np.float64, np.complex64, np.complex128): - tol = 10 * np.sqrt(np.finfo(dtype).resolution) - if dtype in (np.complex64, np.complex128): - inputs = [ - np.array(r.randn(*shape), dtype) + - 1j * np.array(r.randn(*shape), dtype) for shape in input_shapes - ] - else: - inputs = [np.array(r.randn(*shape), dtype) for shape in input_shapes] - input_tensors = [constant_op.constant(x, shape=x.shape) for x in inputs] - analytical, numerical = gradient_checker_v2.compute_gradient( - lambda *xs: gen_linalg_ops.einsum(xs, s), input_tensors) - self.assertLess( - gradient_checker_v2.max_error(analytical, numerical), tol) + with self.subTest(s=s, dtype=dtype): + tol = 10 * np.sqrt(np.finfo(dtype).resolution) + if dtype in (np.complex64, np.complex128): + inputs = [ + np.array(r.randn(*shape), dtype) + + 1j * np.array(r.randn(*shape), dtype) for shape in input_shapes + ] + else: + inputs = [ + np.array(r.randn(*shape), dtype) for shape in input_shapes] + input_tensors = [ + constant_op.constant(x, shape=x.shape) for x in inputs] + analytical, numerical = gradient_checker_v2.compute_gradient( + lambda *xs: gen_linalg_ops.einsum(xs, s), input_tensors) + self.assertLess( + gradient_checker_v2.max_error(analytical, numerical), tol) @test_util.disable_xla('b/131919749') def testUnary(self): From eab1e71ebfd3e5ef83219584dd3785db46200d43 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Thu, 14 May 2020 14:44:10 -0700 Subject: [PATCH 0614/1533] Add Minimum and Maximum ops to Hexagon delegate for uint8/int8 PiperOrigin-RevId: 311609003 Change-Id: Iedb6b59b5895b28c906c029e95202294377d32ec --- .../experimental/delegates/hexagon/README.md | 2 + .../delegates/hexagon/builders/BUILD | 2 + .../hexagon/builders/min_max_builder.cc | 106 +++++++++++ 
.../hexagon/builders/min_max_builder.h | 45 +++++ .../delegates/hexagon/builders/op_builder.cc | 4 + .../delegates/hexagon/builders/op_factory.h | 1 + .../delegates/hexagon/builders/tests/BUILD | 1 + .../builders/tests/min_max_builder_test.cc | 171 ++++++++++++++++++ .../experimental/delegates/hexagon/utils.cc | 12 ++ 9 files changed, 344 insertions(+) create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.cc create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/tests/min_max_builder_test.cc diff --git a/tensorflow/lite/experimental/delegates/hexagon/README.md b/tensorflow/lite/experimental/delegates/hexagon/README.md index a97342c9fdc..6e627c17cd2 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/README.md +++ b/tensorflow/lite/experimental/delegates/hexagon/README.md @@ -79,8 +79,10 @@ are verified in `IsNodeSupportedByHexagon`: * Hardswish * L2Normalization (without any activation) * Logistic (aka Sigmoid) +* Maximum * MaxPool2D (without any activation) (b/129276536) * Mean +* Minimum * MirrorPad * Mul (without any activation) (b/129276536) * Neg diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD index ff764984de9..e24adc2537c 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD @@ -19,6 +19,7 @@ cc_library( "hardswish_builder.cc", "l2_normalization_builder.cc", "matmul_builder.cc", + "min_max_builder.cc", "mirror_pad_builder.cc", "neg_op_builder.cc", "op_builder.cc", @@ -46,6 +47,7 @@ cc_library( "hardswish_builder.h", "l2_normalization_builder.h", "matmul_builder.h", + "min_max_builder.h", "mirror_pad_builder.h", "neg_op_builder.h", "op_builder.h", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.cc new file mode 100644 index 00000000000..ab5895b9a14 --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.cc @@ -0,0 +1,106 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h" + +#include "tensorflow/lite/c/common.h" + +namespace tflite { +namespace delegates { +namespace hexagon { +TfLiteStatus MinMaxOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + TfLiteContext* context) { + static int scalar_shape[] = {1, 1, 1, 1}; + int a_tensor_id; + int b_tensor_id; + + // Input tensors a and b. 
+ a_tensor_id = inputs->data[0]; + b_tensor_id = inputs->data[1]; + const auto& a_tensor = context->tensors[a_tensor_id]; + const auto& b_tensor = context->tensors[b_tensor_id]; + if (a_tensor.allocation_type == kTfLiteMmapRo) + graph_builder_->AddConstNodeWithData(a_tensor_id, a_tensor); + if (b_tensor.allocation_type == kTfLiteMmapRo) + graph_builder_->AddConstNodeWithData(b_tensor_id, b_tensor); + AddInput(graph_builder_->GetHexagonTensorId(a_tensor_id)); + AddInput(graph_builder_->GetHexagonTensorId(b_tensor_id)); + + // Add Inputs A & B min/max + TF_LITE_ENSURE_STATUS( + ComputeMinAndMaxQuantValues(a_tensor, &a_input_min_, &a_input_max_)); + auto* a_input_min_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast<char*>(&a_input_min_), + sizeof(a_input_min_)); + auto* a_input_max_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast<char*>(&a_input_max_), + sizeof(a_input_max_)); + AddInput(TensorID(a_input_min_const->GetID(), 0)); + AddInput(TensorID(a_input_max_const->GetID(), 0)); + + TF_LITE_ENSURE_STATUS( + ComputeMinAndMaxQuantValues(b_tensor, &b_input_min_, &b_input_max_)); + auto* b_input_min_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast<char*>(&b_input_min_), + sizeof(b_input_min_)); + auto* b_input_max_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast<char*>(&b_input_max_), + sizeof(b_input_max_)); + AddInput(TensorID(b_input_min_const->GetID(), 0)); + AddInput(TensorID(b_input_max_const->GetID(), 0)); + + // Add output min/max + const int output_tensor_id = outputs->data[0]; + const auto& output_tensor = context->tensors[output_tensor_id]; + float output_min, output_max; + TF_LITE_ENSURE_STATUS( + ComputeMinAndMaxQuantValues(output_tensor, &output_min, &output_max)); + auto* output_min_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast<char*>(&output_min), sizeof(output_min)); + auto* output_max_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast<char*>(&output_max), sizeof(output_max)); + AddInput(TensorID(output_min_const->GetID(), 0)); + AddInput(TensorID(output_max_const->GetID(), 0)); + + // Add outputs. + int output_batch_size, output_height_size, output_width_size, + output_depth_size; + GetDims(&output_batch_size, &output_height_size, &output_width_size, + &output_depth_size, context->tensors[outputs->data[0]].dims); + node_output_ = AddOutput(sizeof(uint8_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + + return kTfLiteOk; +} + +TfLiteStatus MinMaxOpBuilder::RegisterOutputs(const TfLiteIntArray* outputs, + TfLiteContext* context) { + // Should be only 1 output. + graph_builder_->AddTensorWithID(outputs->data[0], node_output_.first, + node_output_.second); + + return kTfLiteOk; +} + +OpBuilder* CreateMinMaxBuilder(GraphBuilder* graph_builder, int op_type) { + return new MinMaxOpBuilder(graph_builder, op_type); +} + +} // namespace hexagon +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h b/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h new file mode 100644 index 00000000000..4d50d941e4f --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIN_MAX_BUILDER_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIN_MAX_BUILDER_H_ + +#include "tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h" + +namespace tflite { +namespace delegates { +namespace hexagon { + +class MinMaxOpBuilder : public OpBuilder { + public: + explicit MinMaxOpBuilder(GraphBuilder* graph_builder, int op_type) + : OpBuilder(graph_builder, op_type) {} + + TfLiteStatus PopulateSubGraph(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + TfLiteContext* context) override; + + TfLiteStatus RegisterOutputs(const TfLiteIntArray* outputs, + TfLiteContext* context) override; + + private: + TensorID node_output_; + float a_input_min_, a_input_max_, b_input_min_, b_input_max_; +}; + +} // namespace hexagon +} // namespace delegates +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIN_MAX_BUILDER_H_ diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc index c7432e64c79..230a292b6fe 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc @@ -93,6 +93,10 @@ OpBuilder* GraphBuilder::CreateOpBuilderFromTfLiteOp(int op_type) { return CreateQuantizeBuilder(this, OP_Requantize_8to8); case kTfLiteBuiltinHardSwish: return CreateHardSwishBuilder(this, OP_QuantizedHardSwish_8); + case kTfLiteBuiltinMinimum: + return CreateMinMaxBuilder(this, OP_QuantizedMinimum_8); + case kTfLiteBuiltinMaximum: + return CreateMinMaxBuilder(this, OP_QuantizedMaximum_8); default: context_->ReportError(context_, "Op not supported: %d", op_type); return nullptr; diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h b/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h index 0beb88cc68e..515d0edb929 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h @@ -54,6 +54,7 @@ OpBuilder* CreateBatchSeqBuilder(GraphBuilder* graph_builder, int op_type, OpBuilder* CreateQuantizeBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateHardSwishBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateCastBuilder(GraphBuilder* graph_builder, int op_type); +OpBuilder* CreateMinMaxBuilder(GraphBuilder* graph_builder, int op_type); } // namespace hexagon } // namespace delegates diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD index 47a78dca6ac..a5cdc0411ca 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD +++ 
b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD @@ -30,6 +30,7 @@ hexagon_op_tests( "conv_test.cc", "l2_norm_test.cc", "matmul_test.cc", + "min_max_builder_test.cc", "mirror_pad_test.cc", "mul_test.cc", "neg_test.cc", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/min_max_builder_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/min_max_builder_test.cc new file mode 100644 index 00000000000..315ea909c53 --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/min_max_builder_test.cc @@ -0,0 +1,171 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include "tensorflow/lite/experimental/delegates/hexagon/builders/tests/hexagon_delegate_op_model.h" + +namespace tflite { +using testing::ElementsAreArray; + +template +class MinMaxOpModel : public SingleOpModelWithHexagon { + public: + MinMaxOpModel(tflite::BuiltinOperator op, const TensorData& input1, + const TensorData& input2, const TensorData& output) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + SetBuiltinOp(op, BuiltinOptions_MaximumMinimumOptions, + CreateMaximumMinimumOptions(builder_).Union()); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + + MinMaxOpModel(tflite::BuiltinOperator op, const TensorData& input1, + std::initializer_list input1_values, + const TensorData& input2, + std::initializer_list input2_values, + const TensorData& output, bool input1_const) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + SetBuiltinOp(op, BuiltinOptions_MaximumMinimumOptions, + CreateMaximumMinimumOptions(builder_).Union()); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + + // A workaround to mark the tensors as constant. 
+ if (input1_const) { + auto* input1_tensor = interpreter_->tensor(input1_); + input1_tensor->allocation_type = kTfLiteMmapRo; + } else { + auto* input2_tensor = interpreter_->tensor(input2_); + input2_tensor->allocation_type = kTfLiteMmapRo; + } + } + + void SetInput1(std::vector data) { PopulateTensor(input1_, data); } + + void SetInput2(std::vector data) { PopulateTensor(input2_, data); } + + std::vector GetOutput() { + return ExtractVector(output_); + } + + template + std::vector GetDequantizedOutput() { + return Dequantize(ExtractVector(output_), GetScale(output_), + GetZeroPoint(output_)); + } + + std::vector GetOutputShape() { return GetTensorShape(output_); } + + protected: + int input1_; + int input2_; + int output_; +}; + +template +void TestModel(tflite::BuiltinOperator op, const TensorData& input1, + const TensorData& input2, const TensorData& output, + std::initializer_list input1_values, + std::initializer_list input2_values) { + std::unique_ptr> m; + m = std::make_unique>(op, input1, input2, output); + m->SetInput1(input1_values); + m->SetInput2(input2_values); + + m->Invoke(); + const auto reference_output = m->GetOutput(); + const auto reference_output_shape = m->GetOutputShape(); + m->ApplyDelegateAndInvoke(); + EXPECT_THAT(m->GetOutputShape(), ElementsAreArray(reference_output_shape)); + EXPECT_THAT(m->GetOutput(), ElementsAreArray(reference_output)); +} + +template +void TestModelConstInput(tflite::BuiltinOperator op, const TensorData& input1, + const TensorData& input2, const TensorData& output, + std::initializer_list input1_values, + std::initializer_list input2_values, + bool input1_const) { + std::unique_ptr> m; + m = std::make_unique>( + op, input1, input1_values, input2, input2_values, output, input1_const); + m->SetInput1(input1_values); + m->SetInput2(input2_values); + + m->Invoke(); + const auto reference_output = m->GetOutput(); + const auto reference_output_shape = m->GetOutputShape(); + m->ApplyDelegateAndInvoke(); + EXPECT_THAT(m->GetOutputShape(), ElementsAreArray(reference_output_shape)); + EXPECT_THAT(m->GetOutput(), ElementsAreArray(reference_output)); +} + +TEST(MinMaxOpTest, Maximum_Uint8Test) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 255, 1}; + TestModel(BuiltinOperator_MAXIMUM, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, data1, data2); +} + +TEST(MinMaxOpTest, Maximum_Uint8Test_Const) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 255, 1}; + TestModelConstInput( + BuiltinOperator_MAXIMUM, {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, data1, data2, false); +} + +TEST(MinMaxOpTest, Minimum_Uint8Test) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 255, 1}; + TestModel(BuiltinOperator_MINIMUM, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, data1, data2); +} + +TEST(MinMaxOpTest, Minimum_Uint8Test_Const) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 20, 1}; + TestModelConstInput( + BuiltinOperator_MINIMUM, {TensorType_UINT8, {1, 3, 1, 2}, -1, 25}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 25}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 25}, data1, data2, false); +} + 
+TEST(MinMaxOpTest, Maximum_Int8Test) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 123, 1}; + TestModel(BuiltinOperator_MAXIMUM, + {TensorType_INT8, {1, 3, 1, 2}, -1, 125}, + {TensorType_INT8, {1, 3, 1, 2}, -1, 125}, + {TensorType_INT8, {1, 3, 1, 2}, -1, 125}, data1, data2); +} + +TEST(MinMaxOpTest, Minimum_Int8Test) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 12, 1}; + TestModel(BuiltinOperator_MINIMUM, + {TensorType_INT8, {1, 3, 1, 2}, -1, 25}, + {TensorType_INT8, {1, 3, 1, 2}, -1, 25}, + {TensorType_INT8, {1, 3, 1, 2}, -1, 25}, data1, data2); +} + +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index 1df0a6df66c..8aff13549b8 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -79,8 +79,10 @@ bool CheckOpVersion(const TfLiteRegistration* registration) { case kTfLiteBuiltinConcatenation: case kTfLiteBuiltinL2Normalization: case kTfLiteBuiltinLogistic: + case kTfLiteBuiltinMaximum: case kTfLiteBuiltinMaxPool2d: case kTfLiteBuiltinMean: + case kTfLiteBuiltinMinimum: case kTfLiteBuiltinMirrorPad: case kTfLiteBuiltinMul: case kTfLiteBuiltinPad: @@ -366,6 +368,16 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return InputsWithCorrectTypes(node, context, {{kTfLiteUInt8, kTfLiteInt8}}); } + case kTfLiteBuiltinMinimum: { + return InputsWithCorrectTypes( + node, context, + {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteUInt8, kTfLiteInt8}}); + } + case kTfLiteBuiltinMaximum: { + return InputsWithCorrectTypes( + node, context, + {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteUInt8, kTfLiteInt8}}); + } default: return false; } From b4cf239a2f1fb9f5979eed8d543fb8a754470339 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Thu, 14 May 2020 14:59:21 -0700 Subject: [PATCH 0615/1533] Reinstall tf-estimator-nightly since regular installs might not work. 
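A note on the Maximum/Minimum support added above: the new builders only take effect once an application attaches the Hexagon delegate to its interpreter. A rough sketch of that flow follows, assuming the delegate's public C API from this period; treat the exact names as illustrative rather than authoritative.

// Illustrative sketch, not part of the patches above. Error handling omitted;
// assumes a quantized model whose graph may contain MAXIMUM/MINIMUM ops.
#include "tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate.h"
#include "tensorflow/lite/interpreter.h"

void RunOnHexagon(tflite::Interpreter* interpreter) {
  TfLiteHexagonInit();  // Load and prepare the Hexagon NN library.
  TfLiteHexagonDelegateOptions options = {0};
  TfLiteDelegate* delegate = TfLiteHexagonDelegateCreate(&options);
  if (delegate != nullptr &&
      interpreter->ModifyGraphWithDelegate(delegate) == kTfLiteOk) {
    interpreter->AllocateTensors();
    interpreter->Invoke();  // Supported nodes, now including Max/Min, run on the DSP.
  }
  // After the interpreter is destroyed:
  //   TfLiteHexagonDelegateDelete(delegate);
  //   TfLiteHexagonTearDown();
}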
PiperOrigin-RevId: 311611762 Change-Id: Ib72c156ab89422bbfce47be8216535c094b4cd6b --- tensorflow/tools/ci_build/release/common.sh | 6 ++++-- tensorflow/tools/ci_build/release/common_win.bat | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/ci_build/release/common.sh b/tensorflow/tools/ci_build/release/common.sh index bb40042e3af..0a9f6eae0b3 100644 --- a/tensorflow/tools/ci_build/release/common.sh +++ b/tensorflow/tools/ci_build/release/common.sh @@ -177,7 +177,8 @@ function install_ubuntu_16_pip_deps { "${PIP_CMD}" install scipy --user "${PIP_CMD}" install scikit-learn --user "${PIP_CMD}" install PyYAML==3.13 --user - "${PIP_CMD}" install --user --upgrade tf-estimator-nightly + # b/156523241 + "${PIP_CMD}" install --force-reinstall --user --upgrade tf-estimator-nightly "${PIP_CMD}" install --user --upgrade tb-nightly "${PIP_CMD}" install --user --upgrade wrapt # LINT.ThenChange(:ubuntu_pip_installations) @@ -220,7 +221,8 @@ function install_macos_pip_deps { ${SUDO_CMD} ${PIP_CMD} install --upgrade grpcio ${SUDO_CMD} ${PIP_CMD} install --upgrade tb-nightly ${PIP_CMD} install --user --upgrade attrs - ${PIP_CMD} install --user --upgrade tf-estimator-nightly + # b/156523241 + ${PIP_CMD} install --force-reinstall --user --upgrade tf-estimator-nightly ${PIP_CMD} install --user --upgrade wrapt ${PIP_CMD} install --user --upgrade "future>=0.17.1" } diff --git a/tensorflow/tools/ci_build/release/common_win.bat b/tensorflow/tools/ci_build/release/common_win.bat index 85f22c1e4cb..d34c92736c0 100644 --- a/tensorflow/tools/ci_build/release/common_win.bat +++ b/tensorflow/tools/ci_build/release/common_win.bat @@ -28,7 +28,7 @@ SET PATH=%PATH%;C:\%PYTHON_DIRECTORY% %PIP_EXE% install setuptools --upgrade %PIP_EXE% install future>=0.17.1 --no-deps -%PIP_EXE% install tf-estimator-nightly --no-deps +%PIP_EXE% install --force-reinstall tf-estimator-nightly --no-deps %PIP_EXE% install tb-nightly --no-deps %PIP_EXE% install numpy --upgrade --no-deps %PIP_EXE% install opt_einsum --upgrade From 38f02efb53fb3ceb1001bacf8662cf0488e341f2 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Fri, 15 May 2020 06:47:52 +0800 Subject: [PATCH 0616/1533] add @deprecated to Interpreter.Options.setAllowFp16PrecisionForFP32 --- .../java/src/main/java/org/tensorflow/lite/Interpreter.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index 5625ef98bb6..0faf3c008f4 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -102,7 +102,9 @@ public final class Interpreter implements AutoCloseable { * Sets whether to allow float16 precision for FP32 calculation when possible. Defaults to false * (disallow). * - *

WARNING: This is an experimental API and subject to change. + * @deprecated Prefer using + * {@link org.tensorflow.lite.nnapi.NnApiDelegate.Options#setAllowFp16(boolean enable)}. + * */ public Options setAllowFp16PrecisionForFp32(boolean allow) { this.allowFp16PrecisionForFp32 = allow; From 6abea04db74ef7eede4e3dbd91282c77df866d23 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 15:51:15 -0700 Subject: [PATCH 0617/1533] Added BUILD rules for the micro-frontend TF op so that we can use it as a C++ library. PiperOrigin-RevId: 311621054 Change-Id: I54e9932fe54f7cc94f5863f7924d85853d24e48e --- tensorflow/lite/experimental/microfrontend/BUILD | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tensorflow/lite/experimental/microfrontend/BUILD b/tensorflow/lite/experimental/microfrontend/BUILD index aaaf864bb60..bf0eb6ae726 100644 --- a/tensorflow/lite/experimental/microfrontend/BUILD +++ b/tensorflow/lite/experimental/microfrontend/BUILD @@ -27,6 +27,17 @@ cc_library( ], ) +cc_library( + name = "audio_microfrontend_op_lib", + srcs = ["ops/audio_microfrontend_op.cc"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/lite/experimental/microfrontend/lib:frontend", + ], + alwayslink = 1, +) + cc_test( name = "audio_microfrontend_test", size = "small", From c628246c31ea9ff1d96ffc59a12f748db418ea76 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 16:14:53 -0700 Subject: [PATCH 0618/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311625551 Change-Id: I3205d380573ed326d5b55cdc089577f34433f1f1 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index e6725269279..a6ee1a13b6e 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12053,7 +12053,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12064,7 +12064,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18969,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21627,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22335,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22600,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22715,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25648,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25962,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45534,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47474,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47545,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48534,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 7d40f2c3897a8c7bb8d236c352fcd267fbe9bc88 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Thu, 14 May 2020 16:21:53 -0700 Subject: [PATCH 0619/1533] Fix bazel TFLM compilation w/ TF_LITE_STATIC_MEMORY Ensure dynamic string utils aren't compiled when this build define is present. PiperOrigin-RevId: 311626904 Change-Id: Ica229bf337019f0f446fdb94aaf42c6b7e7c749e --- tensorflow/lite/kernels/BUILD | 2 +- tensorflow/lite/kernels/internal/BUILD | 3 +-- .../internal/reference/portable_tensor_utils.cc | 12 +++++++----- .../internal/reference/portable_tensor_utils.h | 1 - .../internal/reference/portable_tensor_utils_impl.h | 6 +++++- tensorflow/lite/kernels/internal/tensor.h | 3 +++ tensorflow/lite/kernels/internal/tensor_utils.h | 7 ++++++- .../lite/kernels/internal/tensor_utils_test.cc | 1 + tensorflow/lite/kernels/non_max_suppression.cc | 1 - tensorflow/lite/string_util.cc | 2 ++ tensorflow/lite/string_util.h | 4 ++++ 11 files changed, 30 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 6f6d111fd77..3a29fee5699 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -386,7 +386,7 @@ cc_library( "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:cppmath", "//tensorflow/lite/kernels/internal:quantization_util", - "@flatbuffers", + "@flatbuffers//:runtime_cc", ], ) diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 93292fbb640..d6a96efdbf7 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -629,7 +629,6 @@ cc_library( ":cppmath", "//tensorflow/lite:minimal_logging", "//tensorflow/lite/c:common", - "//tensorflow/lite/kernels:cpu_backend_context", "@gemmlowp", ], ) @@ -785,7 +784,6 @@ cc_library( deps = [ ":cpu_check", "//tensorflow/lite/c:common", - "//tensorflow/lite/kernels:cpu_backend_context", "//third_party/eigen3", ], ) @@ -819,6 +817,7 @@ cc_test( ":quantization_util", ":tensor_utils", "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels:cpu_backend_context", "//tensorflow/lite/kernels:test_util", "@com_google_googletest//:gtest_main", ], diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 22e37d5af71..0e66dfee191 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -21,7 +21,6 @@ limitations under the License. 
#include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/cppmath.h" @@ -53,7 +52,7 @@ void PortableSymmetricQuantizeFloats(const float* values, const int size, void PortableSymmetricQuantizeFloats(const float* values, const int size, int8_t* quantized_values, float min_value, float max_value, float* scaling_factor) { - const int kScale = 127; + const int32_t kScale = 127; const float range = std::max(std::abs(min_value), std::abs(max_value)); if (range == 0) { memset(quantized_values, 0, size * sizeof(int8_t)); @@ -66,7 +65,8 @@ void PortableSymmetricQuantizeFloats(const float* values, const int size, const int32_t quantized_value = static_cast(TfLiteRound(values[i] * scaling_factor_inv)); // Clamp: just in case some odd numeric offset. - quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); + quantized_values[i] = static_cast( + std::min(kScale, std::max(-kScale, quantized_value))); } } @@ -660,7 +660,8 @@ void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2, int32_t value = static_cast(a) * static_cast(b); value = MultiplyByQuantizedMultiplier(value, multiplier, shift); value -= output_zp; - value = std::min(std::max(-128, value), 127); + value = std::min(std::max(static_cast(-128), value), + static_cast(127)); output[index] = static_cast(value); } @@ -748,7 +749,8 @@ void PortableVectorBatchVectorCwiseProductAccumulate( int32_t prod = vector[v] * *batch_vector++; prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift); int32_t output = prod + *result; - output = std::max(std::min(32767, output), -32768); + output = std::max(std::min(static_cast(32767), output), + static_cast(-32768)); *result++ = output; } } diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index 9a365074513..f2e6c9b4f7d 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -18,7 +18,6 @@ limitations under the License. // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h" #if defined(_MSC_VER) diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index d8bd70f3722..6c15a6cd919 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -20,13 +20,17 @@ limitations under the License. // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" #if defined(_MSC_VER) #define __restrict__ __restrict #endif namespace tflite { + +// Not all backends support CpuBackendContext usage, so forward declare to avoid +// pulling in its implementation. +class CpuBackendContext; + namespace tensor_utils { // Limit a float input f between +abs_limit and -abs_limit. 
diff --git a/tensorflow/lite/kernels/internal/tensor.h b/tensorflow/lite/kernels/internal/tensor.h index 0005bf38d54..543117df0e5 100644 --- a/tensorflow/lite/kernels/internal/tensor.h +++ b/tensorflow/lite/kernels/internal/tensor.h @@ -119,6 +119,8 @@ class SequentialTensorWriter { T* output_ptr_; }; +// String ops are not yet supported on platforms w/ static memory. +#ifndef TF_LITE_STATIC_MEMORY template <> class SequentialTensorWriter { public: @@ -138,6 +140,7 @@ class SequentialTensorWriter { TfLiteTensor* output_; DynamicBuffer buffer_; }; +#endif // TF_LITE_STATIC_MEMORY } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h index 1929c2e2ff4..5e106eb7de4 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils.h +++ b/tensorflow/lite/kernels/internal/tensor_utils.h @@ -20,13 +20,18 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" #if defined(_MSC_VER) #define __restrict__ __restrict #endif namespace tflite { + +// Not all backends support CpuBackendContext usage, so forward declare to avoid +// pulling in its implementation. Use of CpuBackendContext in method +// implementations is purely optional. +class CpuBackendContext; + namespace tensor_utils { // Checks if all entries of vector are zero for float. diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index 9b047d3ba84..3ad59acdb68 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/test_util.h" diff --git a/tensorflow/lite/kernels/non_max_suppression.cc b/tensorflow/lite/kernels/non_max_suppression.cc index ee8e407066d..f57ee1bc5d2 100644 --- a/tensorflow/lite/kernels/non_max_suppression.cc +++ b/tensorflow/lite/kernels/non_max_suppression.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor.h" diff --git a/tensorflow/lite/string_util.cc b/tensorflow/lite/string_util.cc index f7fcf2ac630..44719858f2a 100644 --- a/tensorflow/lite/string_util.cc +++ b/tensorflow/lite/string_util.cc @@ -89,6 +89,7 @@ int DynamicBuffer::WriteToBuffer(char** buffer) { return bytes; } +#ifndef TF_LITE_STATIC_MEMORY void DynamicBuffer::WriteToTensorAsVector(TfLiteTensor* tensor) { auto dims = TfLiteIntArrayCreate(1); dims->data[0] = offset_.size() - 1; // Store number of strings. @@ -109,6 +110,7 @@ void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor, tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation, tensor->is_variable, tensor); } +#endif // TF_LITE_STATIC_MEMORY int GetStringCount(const void* raw_buffer) { // The first integers in the raw buffer is the number of strings. 
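The tensor.h and string_util.cc changes above follow a single pattern: anything that relies on dynamic allocation (string tensors, DynamicBuffer) is compiled out when TF_LITE_STATIC_MEMORY is defined, so static-memory (TFLM) builds never reference it. A minimal standalone sketch of the pattern, using a hypothetical helper rather than the real TensorFlow Lite API:

// Illustrative sketch, not part of the patch.
#ifndef TF_LITE_STATIC_MEMORY
#include <string>
#include <vector>

// Only available on platforms that permit heap allocation.
inline std::string ConcatPieces(const std::vector<std::string>& pieces) {
  std::string result;
  for (const auto& piece : pieces) result += piece;
  return result;
}
#endif  // TF_LITE_STATIC_MEMORY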
diff --git a/tensorflow/lite/string_util.h b/tensorflow/lite/string_util.h index 779b1e12ab8..879aa76b83b 100644 --- a/tensorflow/lite/string_util.h +++ b/tensorflow/lite/string_util.h @@ -74,6 +74,9 @@ class DynamicBuffer { // The function allocates space for the buffer but does NOT take ownership. int WriteToBuffer(char** buffer); + // String tensors are not generally supported on platforms w/ static memory. + // TODO(b/156130024): Remove this guard after removing header from TFLM deps. +#ifndef TF_LITE_STATIC_MEMORY // Fill content into a string tensor, with the given new_shape. The new shape // must match the number of strings in this object. Caller relinquishes // ownership of new_shape. If 'new_shape' is nullptr, keep the tensor's @@ -82,6 +85,7 @@ class DynamicBuffer { // Fill content into a string tensor. Set shape to {num_strings}. void WriteToTensorAsVector(TfLiteTensor* tensor); +#endif // TF_LITE_STATIC_MEMORY private: // Data buffer to store contents of strings, not including headers. From e6c2a5a212752e3b6a58a621a4ba512bbb9eb246 Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 14 May 2020 16:53:17 -0700 Subject: [PATCH 0620/1533] Change more libraries in third_party/tensorflow/c/BUILD to depend on portable_tensorflow_lib_lite_no_runtime to support effort to reduce dependencies on mobile. PiperOrigin-RevId: 311632630 Change-Id: I5061b458f894bccb9c0e23791d265f6ee95bdd38 --- tensorflow/c/BUILD | 48 ++++++++++++++++++++-------------- tensorflow/core/platform/BUILD | 2 +- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 7fb02028837..05d5f9a3ed2 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -219,7 +219,7 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:lib", @@ -232,12 +232,13 @@ cc_library( srcs = ["tf_status.cc"], hdrs = ["tf_status.h"], visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":tf_status_internal", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tf_status_internal", "//tensorflow/core:lib", ], }), @@ -259,10 +260,15 @@ cc_library( name = "tensor_interface", hdrs = ["tensor_interface.h"], visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + ], + "//conditions:default": [ + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + }), ) cc_library( @@ -286,16 +292,17 @@ cc_library( srcs = ["tf_tensor.cc"], hdrs = ["tf_tensor.h"], visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":tensor_interface", + ":tf_datatype", + ":tf_status", + ":tf_status_helper", + ":tf_tensor_internal", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tensor_interface", - ":tf_datatype", - ":tf_status", - ":tf_status_helper", - ":tf_tensor_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", 
@@ -311,14 +318,15 @@ tf_cuda_library( "tf_tensor_internal.h", ], visibility = ["//tensorflow:internal"], - deps = select({ + deps = [ + ":tensor_interface", + ":tf_datatype", + ":tf_status", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tensor_interface", - ":tf_datatype", - ":tf_status", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:casts", diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index c7ff378d2ac..f78b738247d 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -1472,6 +1472,7 @@ filegroup( "abi.h", "blocking_counter.h", "byte_order.h", + "casts.h", "coding.cc", "coding.h", "context.h", @@ -1557,7 +1558,6 @@ filegroup( srcs = [ "base64.cc", "base64.h", - "casts.h", "cpu_feature_guard.cc", "cpu_feature_guard.h", "fingerprint.h", From 90077f8c7c6517d4d761e35ade80597aab458873 Mon Sep 17 00:00:00 2001 From: Lucy Fox Date: Thu, 14 May 2020 16:59:59 -0700 Subject: [PATCH 0621/1533] Instrument the number of times the MLIR-based TF Bridge is enabled. PiperOrigin-RevId: 311633792 Change-Id: Iba286e1c82900833b5cf9f69a697a312e51f3156 --- tensorflow/compiler/tf2xla/BUILD | 6 +----- tensorflow/compiler/tf2xla/mlir_bridge_pass.cc | 12 ++++++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 897528b6de9..55341c0a01f 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -704,12 +704,8 @@ cc_library( deps = [ "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", - "//tensorflow/compiler/mlir/tensorflow:device_util", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/core:core_cpu", - "@com_google_absl//absl/container:flat_hash_set", + "//tensorflow/core:lib", "@llvm-project//llvm:support", ], alwayslink = 1, diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 499e27f0981..c398e5f129e 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -18,10 +18,18 @@ limitations under the License. #include #include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" +#include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { +auto* mlir_bridge_gauge_v1 = monitoring::Gauge::New( + "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v1", + "Tracks usage of the MLIR-based TF2XLA bridge among TF1 models"); +auto* mlir_bridge_gauge_v2 = monitoring::Gauge::New( + "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v2", + "Tracks usage of the MLIR-based TF2XLA bridge among TF2 models"); + // This runs the first phase of the "bridge", transforming the graph in a form // that can be executed with delegation of some computations to an accelerator. 
// This builds on the model of XLA where a subset of the graph is encapsulated @@ -32,10 +40,12 @@ Status MlirBridgePass::Run(const ConfigProto& config_proto, mlir::ModuleOp module) { if (!config_proto.experimental().enable_mlir_bridge()) { VLOG(0) << "Skipping MLIR TPU Bridge, session flag not enabled"; + mlir_bridge_gauge_v2->GetCell()->Set(false); return Status::OK(); } VLOG(0) << "Running MLIR TPU Bridge"; + mlir_bridge_gauge_v2->GetCell()->Set(true); TF_RETURN_IF_ERROR( mlir::TFTPU::TPUBridge(module, /*enable_logging=*/VLOG_IS_ON(1))); @@ -48,10 +58,12 @@ Status MlirBridgeV1CompatPass::Run(const GraphOptimizationPassOptions& options, if (!options.session_options->config.experimental().enable_mlir_bridge()) { VLOG(0) << "Skipping MLIR TPU Bridge V1 Compat, session flag not enabled"; + mlir_bridge_gauge_v1->GetCell()->Set(false); return Status::OK(); } VLOG(0) << "Running MLIR TPU Bridge V1 Compat"; + mlir_bridge_gauge_v1->GetCell()->Set(true); TF_RETURN_IF_ERROR( mlir::TFTPU::TPUBridgeV1Compat(module, /*enable_logging=*/VLOG_IS_ON(1))); From d5e0f468cd1a9ddb1de1eaeb62734dc177047c72 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Thu, 14 May 2020 17:03:36 -0700 Subject: [PATCH 0622/1533] Report remote target in error messages for gRPC eager service requests. PiperOrigin-RevId: 311634462 Change-Id: Ib0550c172e419ea17dac9ffa28c18b9e1a03b3cc --- .../rpc/eager/grpc_eager_client.cc | 13 +++++++----- .../rpc/grpc_rpc_factory.cc | 3 ++- .../core/distributed_runtime/rpc/grpc_state.h | 21 ++++++++++++------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index de4f36ea24d..752bfdf71a1 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -106,8 +106,8 @@ class GrpcEagerClientThread : public core::RefCounted { class GrpcEagerClient : public EagerClient { public: GrpcEagerClient(const tensorflow::SharedGrpcChannelPtr& channel, - GrpcEagerClientThread* thread) - : stub_(channel), thread_(thread) { + GrpcEagerClientThread* thread, const string& target) + : stub_(channel), thread_(thread), target_(target) { // Hold a reference to make sure the corresponding EagerClientThread // outlives the client. 
thread_->Ref(); @@ -127,7 +127,8 @@ class GrpcEagerClient : public EagerClient { new RPCState( \ &stub_, cq_, "/tensorflow.eager.EagerService/" #method, *request, \ response, std::move(done_wrapped), /*call_opts=*/nullptr, \ - /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true); \ + /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true, \ + &target_); \ } CLIENT_METHOD(CreateContext); @@ -146,7 +147,8 @@ class GrpcEagerClient : public EagerClient { new RPCState( &stub_, cq_, "/tensorflow.eager.EagerService/CloseContext", *request, response, std::move(done_wrapped), /*call_opts=*/nullptr, - /*threadpool=*/nullptr); + /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true, + &target_); VLOG(1) << "Sending RPC to close remote eager context " << request->DebugString(); @@ -194,6 +196,7 @@ class GrpcEagerClient : public EagerClient { private: ::grpc::GenericStub stub_; const GrpcEagerClientThread* thread_; + const string target_; ::grpc::CompletionQueue* cq_; @@ -236,7 +239,7 @@ class GrpcEagerClientCache : public EagerClientCache { int assigned_index = AssignClientToThread(target); GrpcEagerClientThread* thread = threads_[assigned_index].get(); core::RefCountPtr worker( - new GrpcEagerClient(shared, thread)); + new GrpcEagerClient(shared, thread, target)); it = clients_.emplace(target, std::move(worker)).first; } diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc index 272d6bb1b20..bcb98baaeb9 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc @@ -210,7 +210,8 @@ void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t, get_stub(index), &completion_queue_, *get_method_ptr(index), call->request(), call->response(), /*done=*/[call](const Status& s) { call->Done(s); }, call->call_opts(), - nullptr /*threadpool*/, fail_fast_, timeout_in_ms_, 0 /* max_retries */); + /*threadpool=*/nullptr, fail_fast_, timeout_in_ms_, /*max_retries=*/0, + /*target=*/nullptr); } } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h index c72ba6035a4..041b6e51ffb 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h @@ -45,7 +45,7 @@ class RPCState : public GrpcClientCQTag { const ::grpc::string& method, const protobuf::Message& request, Response* response, StatusCallback done, CallOptions* call_opts, thread::ThreadPool* threadpool, int32 max_retries = 0, - bool fail_fast = true) + bool fail_fast = true, const string* target = nullptr) : RPCState( stub, cq, method, request, response, std::move(done), call_opts, threadpool, @@ -63,7 +63,7 @@ class RPCState : public GrpcClientCQTag { #endif // PLATFORM_GOOGLE return x; }(), - /*timeout_in_ms=*/0, max_retries) { + /*timeout_in_ms=*/0, max_retries, target) { } template @@ -71,7 +71,7 @@ class RPCState : public GrpcClientCQTag { const ::grpc::string& method, const Request& request, Response* response, StatusCallback done, CallOptions* call_opts, thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms, - int32 max_retries) + int32 max_retries, const string* target) : call_opts_(call_opts), threadpool_(threadpool), done_(std::move(done)), @@ -80,7 +80,8 @@ class RPCState : public GrpcClientCQTag { cq_(cq), stub_(stub), method_(method), - fail_fast_(fail_fast) { + fail_fast_(fail_fast), 
+ target_(target) { response_ = response; ::grpc::Status s = GrpcMaybeUnparseProto(request, &request_buf_); if (!s.ok()) { @@ -152,10 +153,13 @@ class RPCState : public GrpcClientCQTag { StartCall(); } else { // Attach additional GRPC error information if any to the final status - s = Status(s.code(), - strings::StrCat(s.error_message(), - "\nAdditional GRPC error information:\n", - context_->debug_error_string())); + string error_msg = s.error_message(); + strings::StrAppend(&error_msg, "\nAdditional GRPC error information"); + if (target_) { + strings::StrAppend(&error_msg, " from remote target ", *target_); + } + strings::StrAppend(&error_msg, ":\n:", context_->debug_error_string()); + s = Status(s.code(), error_msg); // Always treat gRPC cancellation as a derived error. This ensures that // other error types are preferred during status aggregation. (gRPC // cancellation messages do not contain the original status message). @@ -196,6 +200,7 @@ class RPCState : public GrpcClientCQTag { ::grpc::GenericStub* stub_; ::grpc::string method_; bool fail_fast_; + const string* target_; }; // Represents state associated with one streaming RPC call. From 9a6a6476b563a65416b4bb438d021a2c7e52f139 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 00:40:15 +0000 Subject: [PATCH 0623/1533] Add test and remove decorator --- tensorflow/python/kernel_tests/map_fn_test.py | 8 +++----- tensorflow/python/ops/map_fn.py | 11 +---------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 1859c6c5873..0bc3307e484 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -189,20 +189,18 @@ class MapFnTest(test.TestCase): @test_util.run_in_graph_and_eager_modes def testMap_autograph_indirect(self): def test_function(x): - cond = tf.constant(-1) + cond = constant_op.constant(-1) if cond == 0: result = x else: result = x return result - - @tf.function def map_call(x): - return tf.map_fn(test_function, x) + return map_fn.map_fn(test_function, x) x = constant_op.constant([1]) y = map_call(x) - self.assertAllEqual([1], self.evaluate(y)) + self.assertAllEqual([1], self.evaluate(y)) @test_util.run_in_graph_and_eager_modes def testMapShape(self): diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index dfe32998282..4a21a6e148b 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,14 +39,6 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export -autograph_ctx = lazy_loader.LazyLoader( - "autograph_ctx", globals(), - "tensorflow.python.autograph.core.ag_ctx") -autograph = lazy_loader.LazyLoader( - "autograph", globals(), - "tensorflow.python.autograph.impl.api") - -@tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") def map_fn(fn, elems, @@ -483,8 +475,7 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - ag_ctx = autograph_ctx.control_status_ctx() - result_value = autograph.tf_convert(elems_value, ag_ctx) + result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable = _result_value_flat_to_batchable( From 
86342e236b40996ea5b6ccd17f1e753b00668d1c Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 02:45:52 +0200 Subject: [PATCH 0624/1533] restore a remove export --- tensorflow/python/ops/map_fn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 4a21a6e148b..2c9c678336e 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,6 +39,8 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export + +@tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") def map_fn(fn, elems, From a2ef8b5a0659516dad3ce3f501223286615dab56 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 17:47:38 -0700 Subject: [PATCH 0625/1533] Update svd_op_test to run (non-gradient) tests in eager as well as graph mode. PiperOrigin-RevId: 311640894 Change-Id: I39b4666c461c64ffe3f33992bb536961a266abd7 --- tensorflow/python/kernel_tests/BUILD | 2 +- tensorflow/python/kernel_tests/svd_op_test.py | 162 +++++++++--------- 2 files changed, 81 insertions(+), 83 deletions(-) diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index b226e0cb859..13f59b74baf 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -3468,7 +3468,7 @@ cuda_py_test( name = "svd_op_test", size = "medium", srcs = ["svd_op_test.py"], - shard_count = 20, + shard_count = 30, tags = [ "no_oss", # b/117185141. "nomsan", # TODO(b/117236102): Re-enable in msan build. diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py index a53d2470aa5..6c2199cc591 100644 --- a/tensorflow/python/kernel_tests/svd_op_test.py +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -20,8 +20,8 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.framework import test_util @@ -31,7 +31,7 @@ from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -58,35 +58,31 @@ class SvdOpTest(test.TestCase): "Shape must be at least rank 2 but is rank 1"): linalg_ops.svd(vector) - @test_util.run_v1_only("b/120545219") - def testConcurrentExecutesWithoutError(self): - with self.session(use_gpu=True) as sess: - all_ops = [] - for compute_uv_ in True, False: - for full_matrices_ in True, False: - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) - if compute_uv_: - s1, u1, v1 = linalg_ops.svd( - matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_) - s2, u2, v2 = linalg_ops.svd( - matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_) - all_ops += [s1, u1, v1, s2, u2, v2] - else: - s1 = linalg_ops.svd( - matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_) - s2 = linalg_ops.svd( - matrix2, 
compute_uv=compute_uv_, full_matrices=full_matrices_) - all_ops += [s1, s2] - val = self.evaluate(all_ops) - for i in range(2): - s = 6 * i - self.assertAllEqual(val[s], val[s + 3]) # s1 == s2 - self.assertAllEqual(val[s + 1], val[s + 4]) # u1 == u2 - self.assertAllEqual(val[s + 2], val[s + 5]) # v1 == v2 - for i in range(2): - s = 12 + 2 * i - self.assertAllEqual(val[s], val[s + 1]) # s1 == s2 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) + def testExecuteMultipleWithoutError(self): + all_ops = [] + shape = [6, 5] + seed = [42, 24] + for compute_uv_ in True, False: + for full_matrices_ in True, False: + matrix1 = stateless_random_ops.stateless_random_normal(shape, seed) + matrix2 = stateless_random_ops.stateless_random_normal(shape, seed) + self.assertAllEqual(matrix1, matrix2) + if compute_uv_: + s1, u1, v1 = linalg_ops.svd( + matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_) + s2, u2, v2 = linalg_ops.svd( + matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_) + all_ops += [s1, s2, u1, u2, v1, v2] + else: + s1 = linalg_ops.svd( + matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_) + s2 = linalg_ops.svd( + matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_) + all_ops += [s1, s2] + val = self.evaluate(all_ops) + for i in range(0, len(val), 2): + self.assertAllEqual(val[i], val[i + 1]) def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_, @@ -136,8 +132,10 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_, identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0) self.assertAllClose(identity, xx, atol=tol) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def Test(self): + if not use_static_shape_ and context.executing_eagerly(): + return is_complex = dtype_ in (np.complex64, np.complex128) is_single = dtype_ in (np.float32, np.complex64) tol = 3e-4 if is_single else 1e-12 @@ -152,48 +150,48 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_, low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) - with self.session(use_gpu=True) as sess: - if use_static_shape_: - x_tf = constant_op.constant(x_np) - else: - x_tf = array_ops.placeholder(dtype_) + if use_static_shape_: + x_tf = constant_op.constant(x_np) + else: + x_tf = array_ops.placeholder(dtype_) - if compute_uv_: - s_tf, u_tf, v_tf = linalg_ops.svd( - x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_) - if use_static_shape_: - s_tf_val, u_tf_val, v_tf_val = self.evaluate([s_tf, u_tf, v_tf]) - else: + if compute_uv_: + s_tf, u_tf, v_tf = linalg_ops.svd( + x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_) + if use_static_shape_: + s_tf_val, u_tf_val, v_tf_val = self.evaluate([s_tf, u_tf, v_tf]) + else: + with self.session(use_gpu=True) as sess: s_tf_val, u_tf_val, v_tf_val = sess.run( [s_tf, u_tf, v_tf], feed_dict={x_tf: x_np}) + else: + s_tf = linalg_ops.svd( + x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_) + if use_static_shape_: + s_tf_val = self.evaluate(s_tf) else: - s_tf = linalg_ops.svd( - x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_) - if use_static_shape_: - s_tf_val = self.evaluate(s_tf) - else: + with self.session(use_gpu=True) as sess: s_tf_val = sess.run(s_tf, feed_dict={x_tf: x_np}) - if compute_uv_: - u_np, s_np, v_np = np.linalg.svd( - x_np, compute_uv=compute_uv_, full_matrices=full_matrices_) - else: - s_np = np.linalg.svd( - x_np, compute_uv=compute_uv_, full_matrices=full_matrices_) - # We explicitly 
avoid the situation where numpy eliminates a first - # dimension that is equal to one. - s_np = np.reshape(s_np, s_tf_val.shape) + if compute_uv_: + u_np, s_np, v_np = np.linalg.svd( + x_np, compute_uv=compute_uv_, full_matrices=full_matrices_) + else: + s_np = np.linalg.svd( + x_np, compute_uv=compute_uv_, full_matrices=full_matrices_) + # We explicitly avoid the situation where numpy eliminates a first + # dimension that is equal to one. + s_np = np.reshape(s_np, s_tf_val.shape) - CompareSingularValues(self, s_np, s_tf_val, tol) - if compute_uv_: - CompareSingularVectors(self, u_np, u_tf_val, min(shape_[-2:]), tol) - CompareSingularVectors(self, - np.conj(np.swapaxes(v_np, -2, -1)), v_tf_val, - min(shape_[-2:]), tol) - CheckApproximation(self, x_np, u_tf_val, s_tf_val, v_tf_val, - full_matrices_, tol) - CheckUnitary(self, u_tf_val, tol) - CheckUnitary(self, v_tf_val, tol) + CompareSingularValues(self, s_np, s_tf_val, tol) + if compute_uv_: + CompareSingularVectors(self, u_np, u_tf_val, min(shape_[-2:]), tol) + CompareSingularVectors(self, np.conj(np.swapaxes(v_np, -2, -1)), v_tf_val, + min(shape_[-2:]), tol) + CheckApproximation(self, x_np, u_tf_val, s_tf_val, v_tf_val, + full_matrices_, tol) + CheckUnitary(self, u_tf_val, tol) + CheckUnitary(self, v_tf_val, tol) return Test @@ -378,15 +376,15 @@ if __name__ == "__main__": for rows in 0, 1, 2, 5, 10, 32, 100: for cols in 0, 1, 2, 5, 10, 32, 100: for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): - shape = batch_dims + (rows, cols) - # TF2 does not support placeholders under eager so we skip it - for use_static_shape in set([True, tf2.enabled()]): + full_shape = batch_dims + (rows, cols) + for use_static_shape in set([True, False]): name = "%s_%s_static_shape_%s__compute_uv_%s_full_%s" % ( - dtype.__name__, "_".join(map(str, shape)), use_static_shape, - compute_uv, full_matrices) - _AddTest(SvdOpTest, "Svd", name, - _GetSvdOpTest(dtype, shape, use_static_shape, - compute_uv, full_matrices)) + dtype.__name__, "_".join(map(str, full_shape)), + use_static_shape, compute_uv, full_matrices) + _AddTest( + SvdOpTest, "Svd", name, + _GetSvdOpTest(dtype, full_shape, use_static_shape, + compute_uv, full_matrices)) for compute_uv in False, True: for full_matrices in False, True: dtypes = ([np.float32, np.float64] + [np.complex64, np.complex128] * @@ -397,16 +395,16 @@ if __name__ == "__main__": mat_shapes += [(5, 11), (11, 5)] for mat_shape in mat_shapes: for batch_dims in [(), (3,)]: - shape = batch_dims + mat_shape - name = "%s_%s_compute_uv_%s_full_%s" % ( - dtype.__name__, "_".join(map(str, shape)), compute_uv, - full_matrices) - _AddTest(SvdGradOpTest, "SvdGrad", name, - _GetSvdGradOpTest(dtype, shape, compute_uv, full_matrices)) + full_shape = batch_dims + mat_shape + name = "%s_%s_compute_uv_%s_full_%s" % (dtype.__name__, "_".join( + map(str, full_shape)), compute_uv, full_matrices) + _AddTest( + SvdGradOpTest, "SvdGrad", name, + _GetSvdGradOpTest(dtype, full_shape, compute_uv, full_matrices)) # The results are too inaccurate for float32. 
if dtype in (np.float64, np.complex128): _AddTest( SvdGradGradOpTest, "SvdGradGrad", name, - _GetSvdGradGradOpTest(dtype, shape, compute_uv, + _GetSvdGradGradOpTest(dtype, full_shape, compute_uv, full_matrices)) test.main() From 4662933489550ed226c1682967e8632af9218363 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 14 May 2020 18:00:36 -0700 Subject: [PATCH 0626/1533] Build DynamicSlice and DynamicUpdateSlice ops with MlirHloBuilder Whitelist XlaDynamicSlice and XlaDynamicUpdateSlice for testing PiperOrigin-RevId: 311642899 Change-Id: Icbf009cf69d3b183d0c83c10925a5fbaa3c49f1f --- .../compiler/mlir/xla/ir/mlir_hlo_builder.cc | 22 +++++++++ .../compiler/mlir/xla/ir/mlir_hlo_builder.h | 8 ++++ .../xla/tests/legalize-tf-with-tf2xla.mlir | 24 ++++++++++ .../xla/transforms/legalize_tf_with_tf2xla.cc | 2 + tensorflow/compiler/tests/xla_ops_test.py | 5 +- tensorflow/compiler/xla/client/xla_builder.cc | 48 +++++++++++-------- tensorflow/compiler/xla/client/xla_builder.h | 6 +++ 7 files changed, 93 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index cc334d8654f..461c357e509 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -282,6 +282,28 @@ StatusOr MlirHloBuilder::SliceInternal( GetI64ElementsAttr(strides, &builder_))); } +StatusOr MlirHloBuilder::DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + return MakeXlaOp(builder_.create( + loc_, result_ty, GetValue(operand), GetValues(start_indices), + GetI64ElementsAttr(slice_sizes, &builder_))); +} + +StatusOr MlirHloBuilder::DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + return MakeXlaOp(builder_.create( + loc_, result_ty, GetValue(operand), GetValue(update), + GetValues(start_indices))); +} + StatusOr MlirHloBuilder::PadInternal( const Shape& shape, XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config) { diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index 5a84d60cdc2..fc5baaee44d 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -175,6 +175,14 @@ class MlirHloBuilder : public XlaBuilder { absl::Span limit_indices, absl::Span strides) override; + StatusOr DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) override; + + StatusOr DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) override; + StatusOr PadInternal(const Shape& shape, XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config) override; diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index 01398eb7314..e8d5cfe997d 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -163,6 +163,30 @@ func @truncated_normal() -> tensor<2x2xf32> { return %1 : tensor<2x2xf32> } +// CHECK-LABEL: dynamic_update_slice +// CHECK-SAME: (%[[ARG0:.*]]: 
tensor<3x4xi32>, %[[ARG1:.*]]: tensor<2x2xi32>, %[[ARG2:.*]]: tensor<2xi32> +func @dynamic_update_slice(%arg0: tensor<3x4xi32>, %arg1: tensor<2x2xi32>, %arg2: tensor<2xi32>) -> tensor<3x4xi32> { + + // CHECK: %[[SLICE0:.*]] = "xla_hlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM0:.*]] = "xla_hlo.reshape"(%[[SLICE0]]) : (tensor<1xi32>) -> tensor + + // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<2> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<1xi32>) -> tensor + + // CHECK: "xla_hlo.dynamic-update-slice"(%[[ARG0]], %[[ARG1]], %[[DIM0]], %[[DIM1]]) + + %0 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %arg2) : (tensor<3x4xi32>, tensor<2x2xi32>, tensor<2xi32>) -> tensor<3x4xi32> + return %0: tensor<3x4xi32> +} + // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index 86a2defd3a8..76657bd5e20 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -168,6 +168,8 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get() diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 1f83701ea7c..f3e915daa67 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -304,7 +304,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): self._assertOpOutputMatchesExpected( lambda x: xla.transpose(x, [1, 0]), args=(v,), expected=v.T) - @test_util.disable_mlir_bridge('Not supported yet') def testDynamicSlice(self): for dtype in self.numeric_types: self._assertOpOutputMatchesExpected( @@ -317,7 +316,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): [[673, 674], [683, 684], [693, 694]]]), dtype=dtype)) - @test_util.disable_mlir_bridge('Not supported yet') + @test_util.disable_mlir_bridge('Error handling') def testDynamicSliceWithIncorrectStartIndicesShape(self): with self.session() as session: with self.test_scope(): @@ -331,7 +330,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): (r'start_indices must be a vector with length equal to input rank, ' r'but input rank is 3 and start_indices has shape \[2\].*')) - @test_util.disable_mlir_bridge('Not supported yet') + @test_util.disable_mlir_bridge('Error handling') def testDynamicSliceWithIncorrectSizeIndicesShape(self): with self.session() as session: with self.test_scope(): diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 6539817d524..a4e5b936153 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -864,8 +864,6 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand, absl::Span 
start_indices, absl::Span slice_sizes) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); std::vector start_indices_shape_ptrs; TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes, @@ -876,23 +874,28 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand, TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDynamicSliceShape( *operand_shape, start_indices_shapes, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - std::vector operands = {operand}; - operands.insert(operands.end(), start_indices.begin(), start_indices.end()); - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands); + return DynamicSliceInternal(shape, operand, start_indices, slice_sizes); }); } +StatusOr XlaBuilder::DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + for (int64 size : slice_sizes) { + instr.add_dynamic_slice_sizes(size); + } + + std::vector operands = {operand}; + operands.insert(operands.end(), start_indices.begin(), start_indices.end()); + return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands); +} + XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update)); std::vector start_indices_shape_ptrs; @@ -904,15 +907,22 @@ XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferDynamicUpdateSliceShape( *operand_shape, *update_shape, start_indices_shapes)); - *instr.mutable_shape() = shape.ToProto(); - - std::vector operands = {operand, update}; - operands.insert(operands.end(), start_indices.begin(), start_indices.end()); - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - operands); + return DynamicUpdateSliceInternal(shape, operand, update, start_indices); }); } +StatusOr XlaBuilder::DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + std::vector operands = {operand, update}; + operands.insert(operands.end(), start_indices.begin(), start_indices.end()); + return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, + operands); +} + XlaOp XlaBuilder::ConcatInDim(absl::Span operands, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 24b0cba3a1b..b631514248c 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -423,9 +423,15 @@ class XlaBuilder { XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); + virtual StatusOr DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes); XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); + virtual StatusOr DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices); XlaOp ConcatInDim(absl::Span 
operands, int64 dimension); virtual StatusOr ConcatInDimInternal(const Shape& shape, From 377612c026bcfc1fd86e63b7c5f995101d7bebfd Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 14 May 2020 18:19:41 -0700 Subject: [PATCH 0627/1533] Fix hardswish test for ubsan. PiperOrigin-RevId: 311645688 Change-Id: Id9f3b31da09355c9997f3f2cc95dca5954c956ec --- tensorflow/lite/kernels/internal/reference/reference_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index f40b268b443..1a6c6d0d80e 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -2597,7 +2597,7 @@ inline void HardSwish(const HardSwishParams& params, // significant bits in the high bits of our 16-bit fixedpoint values, so // that fixed-point approximate computations below are as accurate as // possible. - const int16_t input_value_on_hires_input_scale = input_value << 7; + const int16_t input_value_on_hires_input_scale = input_value * (1 << 7); // Compute the input value on essentially the output scale, just not // right-shifted yet. This is the value that we'll use in the (x >= +3) // case, and that in the general case we'll multiply against the "relu-ish" From a5267f056ff5838ad7ac7dd8c8f1fc29e3064d68 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 18:40:52 -0700 Subject: [PATCH 0628/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311648388 Change-Id: Id9f6f7c4de82be3a405377e722e740fd0dfee80d --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a6ee1a13b6e..e6725269279 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12053,7 +12053,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12064,7 +12064,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18969,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21627,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22335,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22600,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22715,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25648,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25962,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45534,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47474,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47545,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48534,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 0de7edf8b11755878043e76200d575b08c025d63 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Thu, 14 May 2020 19:03:56 -0700 Subject: [PATCH 0629/1533] Generate separate pod for Core ML delegate PiperOrigin-RevId: 311651255 Change-Id: I7ba8755d447674fa8d20935b0c9815a5406b879f --- .../lite/experimental/delegates/coreml/BUILD | 6 ++++- tensorflow/lite/experimental/ios/BUILD.apple | 25 ++++++++++++++++--- .../ios/TensorFlowLiteC.podspec.template | 14 +++++++++-- .../lite/experimental/swift/BUILD.apple | 14 +++++++++++ .../swift/Sources/CoreMLDelegate.swift | 2 +- .../swift/TensorFlowLiteSwift-nightly.podspec | 16 ++++++++++-- .../TensorFlowLiteSwift.podspec.template | 16 ++++++++++-- 7 files changed, 81 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/coreml/BUILD b/tensorflow/lite/experimental/delegates/coreml/BUILD index 92aa96d5c50..c04aba65aa0 100644 --- a/tensorflow/lite/experimental/delegates/coreml/BUILD +++ b/tensorflow/lite/experimental/delegates/coreml/BUILD @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - package(default_visibility = [ "//visibility:public", ]) @@ -46,6 +45,11 @@ objc_library( name = "coreml_delegate", srcs = ["coreml_delegate.mm"], hdrs = ["coreml_delegate.h"], + module_name = "TensorFlowLiteCCoreML", + # By setting CoreML as weak_framework, the TensorFlow Lite can be built for older iOS versions. + weak_sdk_frameworks = [ + "CoreML", + ], deps = [ ":coreml_delegate_kernel", ":mlmodel_proto_cc", diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index 8e7b32eba91..5c954bc3de8 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -24,7 +24,6 @@ genrule( TFL_FRAMEWORK_HDRS = [ "//tensorflow/lite/delegates/gpu:metal_delegate.h", - ":coreml_delegate.h", "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", ] @@ -58,16 +57,35 @@ ios_static_framework( ], ) +# This target builds the Core ML delegate as a separate static framework, which +# does not include the TensorFlow Lite runtime. As this target does not contain +# TensorFlow Lite runtime, it is intended to be linked along with the +# TensorFlowLiteC framework above in a composable way. 
+# +# bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteCCoreMl_framework +ios_static_framework( + name = "TensorFlowLiteCCoreML_framework", + hdrs = [ + ":coreml_delegate.h", + ], + avoid_deps = [ + ":tensorflow_lite_c", + ], + bundle_name = "TensorFlowLiteCCoreML", + minimum_os_version = TFL_MINIMUM_OS_VERSION, + deps = [ + "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate", + ], +) + cc_library( name = "tensorflow_lite_c", hdrs = [ "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", "//tensorflow/lite/delegates/gpu:metal_delegate.h", - "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h", ], linkopts = [ - "-Wl,-weak_framework,CoreML", "-Wl,-weak_framework,Metal", ], tags = [ @@ -77,7 +95,6 @@ cc_library( deps = [ "//tensorflow/lite/c:c_api", "//tensorflow/lite/delegates/gpu:metal_delegate", - "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate", ], ) diff --git a/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template b/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template index d69c479282b..d8a5ef8f2e1 100644 --- a/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template +++ b/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template @@ -19,6 +19,16 @@ Pod::Spec.new do |s| s.module_name = 'TensorFlowLiteC' s.library = 'c++' - s.vendored_frameworks = 'Frameworks/TensorFlowLiteC.framework' - s.weak_frameworks = 'CoreML' + + s.default_subspec = 'Core' + + s.subspec 'Core' do |core| + core.vendored_frameworks = 'Frameworks/TensorFlowLiteC.framework' + end + + s.subspec 'CoreML' do |coreml| + coreml.weak_framework = 'CoreML' + coreml.dependency 'TensorFlowLiteC/Core' + coreml.vendored_frameworks = 'Frameworks/TensorFlowLiteCCoreML.framework' + end end diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple index 50130fc194a..e671721dd1c 100644 --- a/tensorflow/lite/experimental/swift/BUILD.apple +++ b/tensorflow/lite/experimental/swift/BUILD.apple @@ -10,6 +10,19 @@ package( licenses = ["notice"], # Apache 2.0 ) +# TODO(b/153554551): investigate if separate delegate libraries can be made with same module_name +# If you don't need delegates and want to reduce size of the app, you can exclude Metal/Core ML +# delegate related dependencies from the rule. +# For example, if you don't want to use Core ML delegate: +# 1. add `exclude = ["Sources/CoreMLDelegate.swift"]` to `glob`, so that `srcs` would look like this: +# ``` +# srcs = glob( +# ["Sources/*.swift"], +# exclude = ["Sources/CoreMLDelegate.swift"], +# ), +# 2. remove "-Wl,-weak_framework,CoreML" from `linkopts` +# 3. remove "...:coreml_delegate" from `deps` + swift_library( name = "TensorFlowLite", srcs = glob(["Sources/*.swift"]), @@ -21,6 +34,7 @@ swift_library( tags = TFL_DEFAULT_TAGS, visibility = ios_visibility_whitelist(), deps = [ + "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate", "//tensorflow/lite/experimental/ios:tensorflow_lite_c", ], ) diff --git a/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift b/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift index 5a1526d45ea..9fc76bc3026 100644 --- a/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift +++ b/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
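// The Core ML delegate is compiled into its own TensorFlowLiteCCoreML module in
// this patch (built by the TensorFlowLiteCCoreML_framework target and shipped
// through the new 'CoreML' podspec subspecs), so this file imports that module
// instead of the monolithic TensorFlowLiteC.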
-import TensorFlowLiteC +import TensorFlowLiteCCoreML /// A delegate that uses the `Core ML` framework for performing TensorFlow Lite graph operations. /// diff --git a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift-nightly.podspec b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift-nightly.podspec index 3b21483f663..8b0e797eeaa 100644 --- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift-nightly.podspec +++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift-nightly.podspec @@ -20,8 +20,20 @@ Pod::Spec.new do |s| tfl_dir = 'tensorflow/lite/' swift_dir = tfl_dir + 'experimental/swift/' - s.source_files = swift_dir + 'Sources/*.swift' - s.dependency 'TensorFlowLiteC', "~> #{s.version}" + + s.default_subspec = 'Core' + + s.subspec 'Core' do |core| + core.dependency 'TensorFlowLiteC', "#{s.version}" + core.source_files = swift_dir + 'Sources/*.swift' + core.exclude_files = swift_dir + 'Sources/CoreMLDelegate.swift' + end + + s.subspec 'CoreML' do |coreml| + coreml.source_files = swift_dir + 'Sources/CoreMLDelegate.swift' + coreml.dependency 'TensorFlowLiteC/CoreML', "#{s.version}" + coreml.dependency 'TensorFlowLiteSwift/Core', "#{s.version}" + end s.test_spec 'Tests' do |ts| ts.source_files = swift_dir + 'Tests/*.swift' diff --git a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template index 9e875b44ee2..a925112f539 100644 --- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template +++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template @@ -20,8 +20,20 @@ Pod::Spec.new do |s| tfl_dir = 'tensorflow/lite/' swift_dir = tfl_dir + 'experimental/swift/' - s.source_files = swift_dir + 'Sources/*.swift' - s.dependency 'TensorFlowLiteC', '~> 0.0.1-nightly' + + s.default_subspec = 'Core' + + s.subspec 'Core' do |core| + core.dependency 'TensorFlowLiteC', "#{s.version}" + core.source_files = swift_dir + 'Sources/*.swift' + core.exclude_files = swift_dir + 'Sources/CoreMLDelegate.swift' + end + + s.subspec 'CoreML' do |coreml| + coreml.source_files = swift_dir + 'Sources/CoreMLDelegate.swift' + coreml.dependency 'TensorFlowLiteC/CoreML', "#{s.version}" + coreml.dependency 'TensorFlowLiteSwift/Core', "#{s.version}" + end s.test_spec 'Tests' do |ts| ts.source_files = swift_dir + 'Tests/*.swift' From efa3fb28d94b7937edaafb5874c191ad0e2149ca Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 19:07:07 -0700 Subject: [PATCH 0630/1533] Split index_lookup into string_lookup and integer_lookup. 
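For reference, a minimal sketch of how the refactored layer is exercised
(mirroring the updated index_lookup tests further down in this patch; the module
path and keyword arguments are the ones those tests use, and the new
string_lookup/integer_lookup wrappers added here are expected to build on the
same arguments with string/integer defaults):

import numpy as np
from tensorflow.python.framework import dtypes
from tensorflow.python.keras.layers.preprocessing import index_lookup

layer = index_lookup.IndexLookup(
    max_tokens=None,     # no cap on vocabulary size
    num_oov_indices=1,   # a single OOV bucket, mapped to index 1
    mask_token="",       # the empty string reserves index 0 for masking
    oov_token="[OOV]",
    dtype=dtypes.string)
layer.set_vocabulary(["earth", "wind", "and", "fire"])
# "earth" -> 2, "wind" -> 3, "and" -> 4, "fire" -> 5, unknown tokens -> 1
print(layer(np.array([["earth", "wind", "michigan"]])))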
PiperOrigin-RevId: 311651579 Change-Id: Ie033727dbe1026a7c7a88e4b31653840a17ac3d1 --- .../python/keras/layers/preprocessing/BUILD | 64 +- .../layers/preprocessing/index_lookup.py | 253 ++++---- .../index_lookup_distribution_test.py | 7 +- .../layers/preprocessing/index_lookup_test.py | 597 +++++++++++++----- .../layers/preprocessing/integer_lookup.py | 112 ++++ .../preprocessing/integer_lookup_test.py | 501 +++++++++++++++ .../layers/preprocessing/integer_lookup_v1.py | 25 + .../layers/preprocessing/string_lookup.py | 106 ++++ .../preprocessing/string_lookup_test.py | 224 +++++++ .../layers/preprocessing/string_lookup_v1.py | 25 + .../keras/layers/preprocessing/table_utils.py | 1 - .../preprocessing/text_vectorization.py | 68 +- .../preprocessing/text_vectorization_test.py | 106 +--- .../preprocessing/text_vectorization_v1.py | 4 +- ...al.preprocessing.-text-vectorization.pbtxt | 2 +- ...al.preprocessing.-text-vectorization.pbtxt | 2 +- 16 files changed, 1658 insertions(+), 439 deletions(-) create mode 100644 tensorflow/python/keras/layers/preprocessing/integer_lookup.py create mode 100644 tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py create mode 100644 tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py create mode 100644 tensorflow/python/keras/layers/preprocessing/string_lookup.py create mode 100644 tensorflow/python/keras/layers/preprocessing/string_lookup_test.py create mode 100644 tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index bef294429bd..c1e1d5573e5 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -27,10 +27,12 @@ py_library( ":discretization", ":hashing", ":image_preprocessing", + ":integer_lookup", ":normalization", ":preprocessing_stage", ":preprocessing_test_utils", ":reduction", + ":string_lookup", ":text_vectorization", ], ) @@ -146,6 +148,20 @@ py_library( ], ) +py_library( + name = "integer_lookup", + srcs = [ + "integer_lookup.py", + "integer_lookup_v1.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":index_lookup", + ":table_utils", + "//tensorflow/python:dtypes", + ], +) + py_library( name = "table_utils", srcs = [ @@ -179,7 +195,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":categorical_encoding", - ":index_lookup", + ":string_lookup", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", "//tensorflow/python:dtypes", @@ -235,6 +251,20 @@ py_library( ], ) +py_library( + name = "string_lookup", + srcs = [ + "string_lookup.py", + "string_lookup_v1.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":index_lookup", + ":table_utils", + "//tensorflow/python:dtypes", + ], +) + py_library( name = "preprocessing_stage", srcs = [ @@ -442,6 +472,22 @@ tf_py_test( ], ) +tf_py_test( + name = "integer_lookup_test", + size = "medium", + srcs = ["integer_lookup_test.py"], + python_version = "PY3", + deps = [ + ":integer_lookup", + ":preprocessing_test_utils", + "//tensorflow/python:client_testlib", + "//tensorflow/python/keras", + "//tensorflow/python/keras/utils:generic_utils", + "//tensorflow/python/ops/ragged:ragged_string_ops", + "@absl_py//absl/testing:parameterized", + ], +) + distribute_py_test( name = "normalization_distribution_test", srcs = ["normalization_distribution_test.py"], @@ -517,6 +563,22 @@ tf_py_test( ], ) +tf_py_test( + name = "string_lookup_test", + size = "medium", + srcs = 
["string_lookup_test.py"], + python_version = "PY3", + deps = [ + ":preprocessing_test_utils", + ":string_lookup", + "//tensorflow/python:client_testlib", + "//tensorflow/python/keras", + "//tensorflow/python/keras/utils:generic_utils", + "//tensorflow/python/ops/ragged:ragged_string_ops", + "@absl_py//absl/testing:parameterized", + ], +) + tf_py_test( name = "preprocessing_stage_test", srcs = ["preprocessing_stage_test.py"], diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py index d6c8a07c8ba..ba9b0d740e1 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py @@ -41,14 +41,16 @@ _ACCUMULATOR_COUNTS_NAME = "counts" class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): - """Maps strings (or integers) from a vocabulary to integer indices. + """Maps values from a vocabulary to integer indices. - This layer translates a set of arbitrary strings or integers into an integer - output via a table-based lookup, with optional out-of-vocabulary handling. + This layer translates a set of arbitrary hashables into an integer output via + a table-based lookup, with optional out-of-vocabulary handling. This is the + basis layer for both IntegerLookup and IndexLookup; it holds the common + logic but is not intended to be exported as part of the Keras API. If desired, the user can call this layer's `adapt()` method on a data set, which will analyze the data set, determine the frequency of individual string - or integer values, and create a vocabulary from them. This vocabulary can have + values, and create a vocabulary from them. This vocabulary can have unlimited size or be capped, depending on the configuration options for this layer; if there are more unique values in the input than the maximum vocabulary size, the most frequent terms will be used to create the @@ -56,84 +58,47 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): Attributes: max_tokens: The maximum size of the vocabulary for this layer. If None, - there is no cap on the size of the vocabulary. Note that the vocabulary - does include OOV buckets, so the effective number of unique values in the - vocabulary is `(max_tokens - num_oov_tokens)` when this value is set. - num_oov_tokens: The number of out-of-vocabulary tokens to use; defaults to - 1. If this value is more than 1, OOV inputs are hashed to determine their - OOV value; if this value is 0, passing an OOV input will result in a '-1' - being returned for that value in the output tensor. (Note that, because - the value is -1 and not 0, this will allow you to effectively drop OOV - values from categorical encodings.) - vocabulary: An optional list of vocabulary terms, or a path to a text file - containing a vocabulary to load into this layer. The file should contain - one token per line. In either case, the vocabulary must be unique; if - the list or file contains the same token multiple times, an error will - be thrown. Note that when passing a vocabulary - either as a list or as - a file - the vocabulary will not be present in the layer's config dict; - it will instead be a part of the layer's weights. - reserve_zero: Whether to reserve the index 0, which indicates pad values in - the Keras masking system. If True, the output of this layer will be in the - range `[1...max_tokens+1)`; if False, the output will be in the range - `[0...max_tokens)`. Defaults to True. 
- mask_zero: If True, input values of 0 (for integers) and `""` (for strings) - will be treated as masked values and assigned an output value of 0. If - this option is set, `reserve_zero` must also be set. Defaults to False. - Call arguments: - inputs: The data to look up. Can be a tf.Tensor or RaggedTensor. - invert: Controls the lookup direction. If False, the layer will map strings - to integers; if true, the layer will map integers to strings. Defaults - to False. + there is no cap on the size of the vocabulary. Note that this vocabulary + includes the OOV and mask tokens, so the effective number of tokens is + (max_tokens - num_oov_indices - (1 if mask_token else 0)) + num_oov_indices: The number of out-of-vocabulary tokens to use. If this + value is more than 1, OOV inputs are hashed to determine their OOV value; + if this value is 0, passing an OOV input will result in a '-1' being + returned for that value in the output tensor. (Note that, because the + value is -1 and not 0, this will allow you to effectively drop OOV values + from categorical encodings.) + mask_token: A token that represents masked values, and which is mapped to + index 0. If set to None, no mask term will be added and the OOV tokens, if + any, will be indexed from (0...num_oov_indices) instead of + (1...num_oov_indices+1). + oov_token: The token representing an out-of-vocabulary value. This token is + only used when performing an inverse lookup. + vocabulary: An optional list of vocabulary terms. If the list contains the + same token multiple times, an error will be thrown. """ # TODO(momernick): Add an examples section to the docstring. def __init__(self, - max_tokens=None, - num_oov_tokens=1, + max_tokens, + num_oov_indices, + mask_token, + oov_token, vocabulary=None, - reserve_zero=True, - mask_zero=False, **kwargs): - invert = False - if invert: - allowed_dtypes = [dtypes.int32, dtypes.int64] - else: - allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64] - - if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: - raise ValueError("TextVectorization may only have a dtype in %s." % - allowed_dtypes) - - if "dtype" not in kwargs: - kwargs["dtype"] = dtypes.int64 if invert else dtypes.string # If max_tokens is set, the value must be greater than 1 - otherwise we # are creating a 0-element vocab, which doesn't make sense. if max_tokens is not None and max_tokens <= 1: raise ValueError("If set, max_tokens must be greater than 1.") - if num_oov_tokens < 0: - raise ValueError("num_oov_tokens must be greater than 0. You passed %s" % - num_oov_tokens) + if num_oov_indices < 0: + raise ValueError("num_oov_indices must be greater than 0. You passed %s" % + num_oov_indices) - self.invert = invert self.max_tokens = max_tokens - self.num_oov_tokens = num_oov_tokens - self.reserve_zero = reserve_zero - self.mask_zero = mask_zero - - # We need to reserve at least num_oov_tokens tokens, plus one additional - # value if we are reserving the zero value in our output. - if reserve_zero: - self._reserved_values = (num_oov_tokens + 1) - else: - self._reserved_values = num_oov_tokens - - # We need to account for the OOV buckets in our vocabulary size. 
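    # (Concretely, with the new arguments: when mask_token is set, index 0 is
    # reserved for the mask, the OOV buckets occupy indices 1..num_oov_indices,
    # and the learned vocabulary starts immediately after them; when mask_token
    # is None everything shifts down by one.)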
- if max_tokens is not None: - self._max_elements = max_tokens - num_oov_tokens - else: - self._max_elements = None + self.num_oov_indices = num_oov_indices + self.oov_token = oov_token + self.mask_token = mask_token # If there is only one OOV bucket, we can determine the OOV value (either 0 # or 1 depending on whether 0 is reserved) and set that as the default @@ -141,20 +106,17 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # do a further hashing step; to make this easier, we set the OOV value to # -1. (This lets us do a vectorized add and cast to boolean to determine # locations where we need to do extra hashing.) - if self.num_oov_tokens == 1: - self._oov_value = 1 if reserve_zero else 0 + if self.num_oov_indices == 1: + self._oov_value = 0 if mask_token is None else 1 else: self._oov_value = -1 super(IndexLookup, self).__init__( - combiner=_IndexLookupCombiner(self.max_tokens), **kwargs) + combiner=_IndexLookupCombiner(self.max_tokens, self.mask_token), + **kwargs) + + self._output_dtype = dtypes.int64 - # If the layer's input type is int32, we can only output int32 values - - # MutableHashTable doesn't allow us to map int32->int64. - if self.dtype == dtypes.int32: - self._output_dtype = dtypes.int32 - else: - self._output_dtype = dtypes.int64 self._table = lookup_ops.MutableHashTable( key_dtype=self.dtype, value_dtype=self._output_dtype, @@ -167,33 +129,27 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # counting code in the Model object doesn't throw an attribute error. tracked_table.shape = tensor_shape.TensorShape((0,)) - if self.num_oov_tokens <= 1: - oov_tokens = None + if self.num_oov_indices <= 1: + oov_indices = None else: - oov_start = 1 if reserve_zero else 0 - oov_tokens = list(range(oov_start, self._reserved_values)) + oov_start = 1 if mask_token is not None else 0 + oov_end = oov_start + num_oov_indices + oov_indices = list(range(oov_start, oov_end)) self._table_handler = table_utils.TableHandler( table=self._table, - oov_tokens=oov_tokens, + oov_tokens=oov_indices, use_v1_apis=self._use_v1_apis()) if vocabulary is not None: - if isinstance(vocabulary, str): - vocabulary = table_utils.get_vocabulary_from_file(vocabulary) - table_utils.validate_vocabulary_is_unique(vocabulary) - self.set_vocabulary(vocabulary) def compute_output_shape(self, input_shape): return input_shape - def compute_output_signature(self, input_spec, invert=False): + def compute_output_signature(self, input_spec): output_shape = self.compute_output_shape(input_spec.shape.as_list()) - if invert: - output_dtype = dtypes.string - else: - output_dtype = dtypes.int64 + output_dtype = dtypes.int64 return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype) def adapt(self, data, reset_state=True): @@ -220,10 +176,7 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): keys, values = self._table_handler.data() # This is required because the MutableHashTable doesn't preserve insertion # order, but we rely on the order of the array to assign indices. 
- if self.dtype == dtypes.string: - return [x.decode("utf-8") for _, x in sorted(zip(values, keys))] - else: - return [x for _, x in sorted(zip(values, keys))] + return [x for _, x in sorted(zip(values, keys))] def vocab_size(self): return self._table_handler.vocab_size() @@ -231,10 +184,9 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): def get_config(self): config = { "max_tokens": self.max_tokens, - "num_oov_tokens": self.num_oov_tokens, - "vocabulary": None, - "reserve_zero": self.reserve_zero, - "mask_zero": self.mask_zero, + "num_oov_indices": self.num_oov_indices, + "oov_token": self.oov_token, + "mask_token": self.mask_token, } base_config = super(IndexLookup, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -246,46 +198,101 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # abstraction for ease of saving!) we return 0. return 0 - def set_vocabulary(self, - vocab, - append=False): + def set_vocabulary(self, vocab): """Sets vocabulary (and optionally document frequency) data for this layer. This method sets the vocabulary for this layer directly, instead of analyzing a dataset through 'adapt'. It should be used whenever the vocab information is already known. If vocabulary data is already present in the - layer, this method will either replace it, if 'append' is set to False, or - append to it (if 'append' is set to True). + layer, this method will either replace it Arguments: vocab: An array of string tokens. - append: Whether to overwrite or append any existing vocabulary data. Raises: ValueError: If there are too many inputs, the inputs do not match, or input data is missing. """ - current_table_size = self._table_handler.vocab_size() - total_vocab_size = len(vocab) + (current_table_size if append else 0) - if self.max_tokens is not None and total_vocab_size > self._max_elements: + + table_utils.validate_vocabulary_is_unique(vocab) + + should_have_mask = self.mask_token is not None + if should_have_mask: + has_mask = vocab[0] == self.mask_token + oov_start = 1 + else: + has_mask = False + oov_start = 0 + + should_have_oov = self.num_oov_indices > 0 + if should_have_oov: + oov_end = oov_start + self.num_oov_indices + expected_oov = [self.oov_token] * self.num_oov_indices + has_oov = vocab[oov_start:oov_end] == expected_oov + # If we get a numpy array, then has_oov may end up being a numpy array + # instead of a bool. Fix this by collapsing the variable if it's not bool. + if not isinstance(has_oov, bool): + has_oov = any(has_oov) + else: + has_oov = False + + if all([should_have_mask, has_mask, should_have_oov]) and not has_oov: + raise ValueError("The passed vocabulary has the correct mask token `%s` " + "at index 0, but does not have the OOV token `%s` in " + "indices [%s:%s]. Instead, we found `%s`. Was this " + "vocabulary generated by a layer with incompatible " + "settings?" % + (self.mask_token, self.oov_token, oov_start, oov_end, + vocab[oov_start:oov_end])) + + if all([should_have_oov, has_oov, should_have_mask]) and not has_mask: + raise ValueError( + "The passed vocabulary has the correct OOV token `%s` at " + "indices [%s:%s], but does not have the mask token `%s` in " + "index 0. Instead, we found `%s`. Was this vocabulary " + "generated by a layer with incompatible settings?" 
% + (self.oov_token, oov_start, oov_end, self.mask_token, vocab[0])) + + insert_special_tokens = not has_oov and not has_mask + + special_tokens = [] if self.mask_token is None else [self.mask_token] + special_tokens.extend([self.oov_token] * self.num_oov_indices) + + num_special_tokens = len(special_tokens) + tokens = vocab if insert_special_tokens else vocab[num_special_tokens:] + if self.mask_token in tokens: + raise ValueError("Reserved mask token %s was found in the passed " + "vocabulary at index %s. Please either remove the " + "reserved token from the vocabulary or change the " + "mask token for this layer." % + (self.mask_token, tokens.index(self.mask_token))) + if self.oov_token in tokens: + raise ValueError("Reserved OOV token %s was found in the passed " + "vocabulary at index %s. Please either remove the " + "reserved token from the vocabulary or change the " + "OOV token for this layer." % + (self.oov_token, tokens.index(self.oov_token))) + + if insert_special_tokens: + total_vocab_size = len(vocab) + num_special_tokens + else: + total_vocab_size = len(vocab) + if self.max_tokens is not None and total_vocab_size > self.max_tokens: raise ValueError( "Attempted to set a vocabulary larger than the maximum vocab size. " - "Passed vocab size is %s, max vocab size is %s. Note that the OOV " - "token(s) are automatically added to the number of tokens." % + "Passed vocab size is %s, max vocab size is %s." % (total_vocab_size, self.max_tokens)) - start_index = self._reserved_values + (current_table_size if append else 0) + start_index = num_special_tokens values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64) - vocab = table_utils.convert_to_ndarray(vocab, self.dtype) - table_utils.assert_same_type(self.dtype, vocab, "vocab") - values = table_utils.convert_to_ndarray(values, self._output_dtype) - table_utils.assert_same_type(self._output_dtype, values, "values") - - if not append and current_table_size > 0: - self._table_handler.clear() + self._table_handler.clear() self._table_handler.insert(vocab, values) + if insert_special_tokens and num_special_tokens > 0: + special_token_values = np.arange(num_special_tokens, dtype=np.int64) + self._table_handler.insert(special_tokens, special_token_values) + def _set_state_variables(self, updates): if not self.built: raise RuntimeError("_set_state_variables() must be called after build().") @@ -316,18 +323,20 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner): dataset, all tokens are retained.s """ - def __init__(self, vocab_size=None): + def __init__(self, vocab_size=None, mask_value=None): self._vocab_size = vocab_size + self._mask_value = mask_value def compute(self, values, accumulator=None): """Compute a step in this computation, returning a new accumulator.""" - values = base_preprocessing_layer.convert_to_list(values) + values = base_preprocessing_layer.convert_to_list( + values, sparse_default_value=self._mask_value) if accumulator is None: accumulator = self._create_accumulator() # TODO(momernick): Benchmark improvements to this algorithm. - if isinstance(values, (str, bytes)): + if isinstance(values, (str, bytes, np.int64)): accumulator.count_dict[values] += 1 else: for document in values: @@ -362,6 +371,8 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner): "vocab": A list of the retained items in the vocabulary. 
""" vocab_counts = accumulator.count_dict + if self._mask_value in vocab_counts: + del vocab_counts[self._mask_value] sorted_counts = sorted( vocab_counts.items(), key=operator.itemgetter(1, 0), reverse=True) vocab_data = ( diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py index 3360dad6ffe..098e67f5f6b 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py @@ -65,7 +65,12 @@ class IndexLookupDistributionTest( with distribution.scope(): input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py index 3c5b5757ec2..a95834233b3 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py @@ -21,7 +21,6 @@ from __future__ import print_function import itertools import os import random -import six import string from absl.testing import parameterized @@ -31,7 +30,6 @@ from tensorflow.python import keras from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import one_device_strategy from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor @@ -44,7 +42,6 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.ops.ragged import ragged_factory_ops -from tensorflow.python.platform import gfile from tensorflow.python.platform import test @@ -71,6 +68,10 @@ def _get_end_to_end_test_cases(): ["and"], ["earth"], ["michigan"]]), "kwargs": { "max_tokens": None, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "dtype": dtypes.string, }, "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], "input_dtype": @@ -91,6 +92,9 @@ def _get_end_to_end_test_cases(): dtype=np.int64), "kwargs": { "max_tokens": None, + "num_oov_indices": 1, + "mask_token": 0, + "oov_token": -1, "dtype": dtypes.int64, }, "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], @@ -172,7 +176,12 @@ class CategoricalEncodingInputTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -193,7 +202,12 @@ class CategoricalEncodingInputTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + 
mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -209,7 +223,12 @@ class CategoricalEncodingInputTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -223,7 +242,12 @@ class CategoricalEncodingInputTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -248,7 +272,12 @@ class CategoricalEncodingMultiOOVTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True) - layer = get_layer_class()(max_tokens=None, num_oov_tokens=2) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -270,7 +299,11 @@ class CategoricalEncodingMultiOOVTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2) + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -286,7 +319,12 @@ class CategoricalEncodingMultiOOVTest( expected_output = [[3, 4, 6], [6, 5, 3, 2]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None, num_oov_tokens=2) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -301,7 +339,11 @@ class CategoricalEncodingMultiOOVTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) layer = get_layer_class()( - max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2) + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -321,13 +363,14 @@ class CategoricalEncodingAdaptTest( dense_shape=[3, 4]) vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) - # Note that the expected vocabulary has a null string (''). This is because - # we assume that sparse tensors are in fact dense tensors with elided - # values, not ragged tensors. Therefore, we assume that any missing data - # is important and give it a spot in our vocab. 
- expected_vocabulary = ["", "michigan", "fire"] + expected_vocabulary = ["", "[OOV]", "michigan", "fire"] self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) def test_ragged_adapt(self): @@ -335,9 +378,14 @@ class CategoricalEncodingAdaptTest( ["fire", "michigan"]]) vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) - expected_vocabulary = ["michigan", "fire"] + expected_vocabulary = ["", "[OOV]", "michigan", "fire"] self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) def test_sparse_int_input(self): @@ -352,7 +400,12 @@ class CategoricalEncodingAdaptTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -368,7 +421,12 @@ class CategoricalEncodingAdaptTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -382,7 +440,12 @@ class CategoricalEncodingAdaptTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -400,34 +463,15 @@ class CategoricalEncodingAdaptTest( batched_ds = ds.take(2) input_t = keras.Input(shape=(), dtype=dtypes.string) layer = get_layer_class()( - max_tokens=10, num_oov_tokens=0, reserve_zero=False) + max_tokens=10, + num_oov_indices=0, + mask_token=None, + oov_token=None, + dtype=dtypes.string) _ = layer(input_t) layer.adapt(batched_ds) -@keras_parameterized.run_all_keras_modes -class IndexLookupDistributionTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_cpu_distribution(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - strategy = one_device_strategy.OneDeviceStrategy("/cpu:0") - - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - @keras_parameterized.run_all_keras_modes class IndexLookupOutputTest(keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -439,7 +483,12 @@ class 
IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -448,7 +497,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, def test_output_shape(self): input_data = keras.Input(shape=(4,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) @@ -459,7 +513,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(reserve_zero=False) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=None, + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -473,7 +532,13 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_data) + layer = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) output_dataset = model.predict(input_array) @@ -485,15 +550,6 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest ): - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - def test_int_output_explicit_vocab(self): vocab_data = ["earth", "wind", "and", "fire"] input_array = np.array([["earth", "wind", "and", "fire"], @@ -501,107 +557,195 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_data) + layer = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - def test_get_vocab_returns_str(self): - vocab_data = ["earth", "wind", "and", "fire"] - layer = get_layer_class()(vocabulary=vocab_data) - layer_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, layer_vocab) - self.assertIsInstance(layer_vocab[0], six.text_type) + def test_vocab_with_max_cap(self): + vocab_data = ["", "[OOV]", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=5, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + layer.set_vocabulary(vocab_data) + 
returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) - def test_int_output_explicit_vocab_from_file(self): - vocab_list = ["earth", "wind", "and", "fire"] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_vocab_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=5) - layer.set_vocabulary(vocab_data[0]) - layer.set_vocabulary(vocab_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) + def test_int_vocab_with_max_cap(self): + vocab_data = [0, -1, 42, 1276, 1138] + layer = get_layer_class()( + max_tokens=5, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) def test_non_unique_vocab_fails(self): vocab_data = ["earth", "wind", "and", "fire", "fire"] with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): - _ = get_layer_class()(vocabulary=vocab_data) + _ = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) - def test_non_unique_vocab_from_file_fails(self): - vocab_list = ["earth", "wind", "and", "fire", "earth"] - vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + def test_vocab_with_oov_and_wrong_mask_fails(self): + vocab_data = ["custom_mask", "[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_oov_and_no_mask_fails(self): + vocab_data = ["[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_mask_but_no_oov_fails(self): + vocab_data = ["", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_repeated_element_fails(self): + vocab_data = ["earth", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) with self.assertRaisesRegex(ValueError, 
".*repeated term.*earth.*"): - _ = get_layer_class()(vocabulary=vocab_path) + layer.set_vocabulary(vocab_data) + def test_vocab_with_reserved_oov_element_fails(self): + vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) -@keras_parameterized.run_all_keras_modes -class InverseLookupOutputTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): + def test_vocab_with_reserved_mask_element_fails(self): + vocab_data = ["earth", "mask_token", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="mask_token", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"): + layer.set_vocabulary(vocab_data) - def DISABLE_test_inverse_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]] - # Note that the token 'michigan' has been replaced by ''. This is because - # 'michigan' is OOV for this layer. - expected_strings = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - string_data = layer(int_data, invert=True) - model = keras.Model(inputs=input_data, outputs=[int_data, string_data]) - int_outputs, string_outputs = model.predict(input_array) - self.assertAllEqual(expected_ints, int_outputs) - self.assertAllEqual(expected_strings, string_outputs) + def test_non_unique_int_vocab_fails(self): + vocab_data = [12, 13, 14, 15, 15] + with self.assertRaisesRegex(ValueError, ".*repeated term.*15.*"): + _ = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) - def DISABLE_test_inverse_output_serialization(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]] - # Note that the token 'michigan' has been replaced by ''. This is because - # 'michigan' is OOV for this layer. 
- expected_strings = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) + def test_int_vocab_with_oov_and_wrong_mask_fails(self): + vocab_data = [1234, -1, 11, 21, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"): + layer.set_vocabulary(vocab_data) - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - string_data = layer(int_data, invert=True) - model = keras.Model(inputs=input_data, outputs=[int_data, string_data]) + def test_int_vocab_with_oov_and_no_mask_fails(self): + vocab_data = [-1, 11, 12, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) - with CustomObjectScope({"IndexLookup": get_layer_class()}): - new_model = keras.Model.from_config(model.get_config()) - new_model.set_weights(model.get_weights()) - int_outputs, string_outputs = new_model.predict(input_array) - self.assertAllEqual(expected_ints, int_outputs) - self.assertAllEqual(expected_strings, string_outputs) + def test_int_vocab_with_mask_but_no_oov_fails(self): + vocab_data = [0, 11, 12, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_repeated_element_fails(self): + vocab_data = [11, 11, 34, 23, 124] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*repeated term.*11.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_reserved_oov_element_fails(self): + vocab_data = [14, 38, -1, 34, 3, 84] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_reserved_mask_element_fails(self): + vocab_data = [125, 0, 3, 4, 94] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"): + layer.set_vocabulary(vocab_data) @keras_parameterized.run_all_keras_modes(always_skip_eager=True) @@ -612,7 +756,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase, vocab_data = ["earth", "wind", "and", "fire"] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=10) + layer = get_layer_class()( + max_tokens=10, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -626,7 +775,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase, vocab_data = ["earth", "wind", "and", "fire"] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=10) + layer = get_layer_class()( + max_tokens=10, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + 
dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -643,25 +797,24 @@ class IndexLookupErrorTest(keras_parameterized.TestCase, def test_too_long_vocab_fails_in_single_setting(self): vocab_data = ["earth", "wind", "and", "fire"] - layer = get_layer_class()(max_tokens=4) + layer = get_layer_class()( + max_tokens=4, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) with self.assertRaisesRegex(ValueError, "vocabulary larger than the maximum vocab.*"): layer.set_vocabulary(vocab_data) - def test_too_long_vocab_fails_in_multiple_settings(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - layer = get_layer_class()(max_tokens=4) - - # The first time we call set_vocabulary, we're under the max_tokens - # so it should be fine. - layer.set_vocabulary(vocab_data[0]) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab.*"): - layer.set_vocabulary(vocab_data[1], append=True) - def test_zero_max_tokens_fails(self): with self.assertRaisesRegex(ValueError, ".*max_tokens.*"): - _ = get_layer_class()(max_tokens=0) + _ = get_layer_class()( + max_tokens=0, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) @keras_parameterized.run_all_keras_modes @@ -676,7 +829,12 @@ class IndexLookupSavingTest(keras_parameterized.TestCase, # Build and validate a golden model. input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -705,8 +863,9 @@ class IndexLookupSavingTest(keras_parameterized.TestCase, @keras_parameterized.run_all_keras_modes -class IndexLookupCombinerTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class IndexLookupStringCombinerTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): def compare_text_accumulators(self, a, b, msg=None): if a is None or b is None: @@ -834,5 +993,123 @@ class IndexLookupCombinerTest(keras_parameterized.TestCase, self.validate_accumulator_extract(combiner, data, expected_extract_output) +@keras_parameterized.run_all_keras_modes +class IndexLookupIntCombinerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def compare_text_accumulators(self, a, b, msg=None): + if a is None or b is None: + self.assertAllEqual(a, b, msg=msg) + + self.assertAllEqual(a.count_dict, b.count_dict, msg=msg) + + compare_accumulators = compare_text_accumulators + + def update_accumulator(self, accumulator, data): + accumulator.count_dict.update(dict(zip(data["vocab"], data["counts"]))) + + return accumulator + + def test_combiner_api_compatibility_int_mode(self): + data = np.array([[42, 1138, 725, 1729], [42, 1138, 725, 203]]) + combiner = index_lookup._IndexLookupCombiner() + expected_accumulator_output = { + "vocab": np.array([1138, 725, 42, 1729, 203]), + "counts": np.array([2, 2, 2, 1, 1]), + } + expected_extract_output = { + "vocab": np.array([1138, 725, 42, 1729, 203]), + } + expected_accumulator = combiner._create_accumulator() + expected_accumulator = self.update_accumulator(expected_accumulator, + expected_accumulator_output) + self.validate_accumulator_serialize_and_deserialize(combiner, 
data, + expected_accumulator) + self.validate_accumulator_uniqueness(combiner, data) + self.validate_accumulator_extract(combiner, data, expected_extract_output) + + # TODO(askerryryan): Add tests confirming equivalence to behavior of + # existing tf.keras.preprocessing.text.Tokenizer. + @parameterized.named_parameters( + { + "testcase_name": "top_k_smaller_than_full_vocab", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725]), + }, + }, + { + "testcase_name": "top_k_larger_than_full_vocab", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": 10, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725, 42]), + }, + }, + { + "testcase_name": "no_top_k", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": None, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725, 42]), + }, + }, + { + "testcase_name": "single_element_per_row", + "data": np.array([[42], [1138], [1729], [1138], [725]]), + "vocab_size": 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([2, 1, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725]), + }, + }, + # Which tokens are retained are based on global frequency, and thus are + # sensitive to frequency within a document. In contrast, because idf only + # considers the presence of a token in a document, it is insensitive + # to the frequency of the token within the document. + { + "testcase_name": + "retained_tokens_sensitive_to_within_document_frequency", + "data": + np.array([[42, 42], [1138, 1138], [1729, 1729], [1138, 1138], + [725, 203]]), + "vocab_size": + 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 42, 1729, 725, 203]), + "counts": np.array([4, 2, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 42]), + }, + }) + def test_combiner_computation(self, data, vocab_size, + expected_accumulator_output, + expected_extract_output): + combiner = index_lookup._IndexLookupCombiner(vocab_size=vocab_size) + expected_accumulator = combiner._create_accumulator() + expected_accumulator = self.update_accumulator(expected_accumulator, + expected_accumulator_output) + self.validate_accumulator_computation(combiner, data, expected_accumulator) + self.validate_accumulator_extract(combiner, data, expected_extract_output) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py new file mode 100644 index 00000000000..671c02573db --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py @@ -0,0 +1,112 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Keras integer lookup preprocessing layer."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.keras.layers.preprocessing import index_lookup
+from tensorflow.python.keras.layers.preprocessing import table_utils
+
+
+class IntegerLookup(index_lookup.IndexLookup):
+  """Maps integers from a vocabulary to integer indices.
+
+  This layer translates a set of arbitrary integers into an integer output via
+  a table-based lookup, with optional out-of-vocabulary handling.
+
+  If desired, the user can call this layer's `adapt()` method on a data set,
+  which will analyze the data set, determine the frequency of individual
+  integer values, and create a vocabulary from them. This vocabulary can have
+  unlimited size or be capped, depending on the configuration options for this
+  layer; if there are more unique values in the input than the maximum
+  vocabulary size, the most frequent values will be used to create the
+  vocabulary.
+
+  Attributes:
+    max_values: The maximum size of the vocabulary for this layer. If None,
+      there is no cap on the size of the vocabulary. Note that this vocabulary
+      includes the OOV and mask values, so the effective number of values is
+      (max_values - num_oov_indices - (1 if mask_value else 0)).
+    num_oov_indices: The number of out-of-vocabulary values to use; defaults to
+      1. If this value is more than 1, OOV inputs are hashed to determine their
+      OOV value; if this value is 0, passing an OOV input will result in a '-1'
+      being returned for that value in the output tensor. (Note that, because
+      the value is -1 and not 0, this will allow you to effectively drop OOV
+      values from categorical encodings.)
+    mask_value: A value that represents masked inputs, and which is mapped to
+      index 0. Defaults to 0. If set to None, no mask value will be added and
+      the OOV values, if any, will be indexed from (0...num_oov_indices)
+      instead of (1...num_oov_indices+1).
+    oov_value: The value representing an out-of-vocabulary value. Defaults to
+      -1.
+    vocabulary: An optional list of values, or a path to a text file containing
+      a vocabulary to load into this layer. The file should contain one value
+      per line. If the list or file contains the same value multiple times, an
+      error will be thrown.
+  """
+
+  def __init__(self,
+               max_values=None,
+               num_oov_indices=1,
+               mask_value=0,
+               oov_value=-1,
+               vocabulary=None,
+               **kwargs):
+    allowed_dtypes = [dtypes.int64]
+
+    if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes:
+      raise ValueError("IntegerLookup may only have a dtype in %s." %
+                       allowed_dtypes)
+
+    if "dtype" not in kwargs:
+      kwargs["dtype"] = dtypes.int64
+
+    # If max_values is set, the value must be greater than 1 - otherwise we
+    # are creating a 0-element vocab, which doesn't make sense.
+    if max_values is not None and max_values <= 1:
+      raise ValueError("If set, max_values must be greater than 1.")
+
+    if num_oov_indices < 0:
+      raise ValueError(
+          "num_oov_indices must be greater than or equal to 0. You passed %s" %
+          num_oov_indices)
+
+    if vocabulary is not None:
+      if isinstance(vocabulary, str):
+        vocabulary = table_utils.get_vocabulary_from_file(vocabulary)
+        vocabulary = [int(v) for v in vocabulary]
+
+    super(IntegerLookup, self).__init__(
+        max_tokens=max_values,
+        num_oov_indices=num_oov_indices,
+        mask_token=mask_value,
+        oov_token=oov_value,
+        vocabulary=vocabulary,
+        **kwargs)
+
+  def get_config(self):
+    base_config = super(IntegerLookup, self).get_config()
+    # Because the super config has a bunch of args we're also passing,
+    # we need to rename and remove them from the config dict.
+    base_config["max_values"] = base_config["max_tokens"]
+    del base_config["max_tokens"]
+
+    base_config["mask_value"] = base_config["mask_token"]
+    del base_config["mask_token"]
+
+    base_config["oov_value"] = base_config["oov_token"]
+    del base_config["oov_token"]
+    return base_config
diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py
new file mode 100644
index 00000000000..515a1ca6667
--- /dev/null
+++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py
@@ -0,0 +1,501 @@
+# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
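
# A minimal usage sketch for the IntegerLookup layer defined above, assuming
# the in-tree module path from this diff and eager execution. Per the class
# docstring and the tests that follow, index 0 is reserved for the mask value,
# index 1 for the OOV value, and vocabulary values are indexed from 2 onward.
import numpy as np

from tensorflow.python.keras.layers.preprocessing import integer_lookup

# Explicit vocabulary: indices are assigned in list order after the reserved
# mask and OOV slots.
layer = integer_lookup.IntegerLookup(vocabulary=[42, 1138, 725, 1729])
print(layer.get_vocabulary())        # expected: [0, -1, 42, 1138, 725, 1729]
print(layer(np.array([[42, 1138, 203]], dtype=np.int64)))  # 203 is OOV -> index 1

# adapt() builds the vocabulary from data instead, ordered by frequency.
adapt_layer = integer_lookup.IntegerLookup()
adapt_layer.adapt(np.array([[1138], [1138], [42]], dtype=np.int64))
print(adapt_layer.get_vocabulary())  # expected: [0, -1, 1138, 42]
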
+# ============================================================================== +"""Tests for Keras text vectorization preprocessing layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools +import os +import random + +from absl.testing import parameterized +import numpy as np + +from tensorflow.python import keras +from tensorflow.python import tf2 + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.layers.preprocessing import integer_lookup +from tensorflow.python.keras.layers.preprocessing import integer_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.keras.saving import save +from tensorflow.python.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.ops.ragged import ragged_factory_ops +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test + + +def get_layer_class(): + if context.executing_eagerly(): + return integer_lookup.IntegerLookup + else: + return integer_lookup_v1.IntegerLookup + + +def _get_end_to_end_test_cases(): + test_cases = ( + { + "testcase_name": + "test_ints_soft_vocab_cap", + # Create an array where 1138 is the most frequent term, followed by + # 1729, then 725, then 42. This ensures that the vocab accumulator + # is sorting by frequency. + "vocab_data": + np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729], + [1729], [725], [725]], + dtype=np.int64), + "input_data": + np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]], + dtype=np.int64), + "kwargs": { + "max_values": None, + "dtype": dtypes.int64, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + "input_dtype": + dtypes.int64 + },) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupLayerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, + use_dataset, expected_output, + input_dtype): + cls = get_layer_class() + expected_output_dtype = dtypes.int64 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # IntegerLookup), the concatenation fails. In real use cases, this may + # not be an issue because users are likely to pipe the preprocessing layer + # into other keras layers instead of predicting it directly. 
A workaround + # for these unit tests is to have the dataset only contain one batch, so + # no concatenation needs to happen with the result. For consistency with + # numpy input, we should make `predict` join differently shaped results + # together sensibly, with 0 padding. + input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch( + input_shape[0]) + vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0]) + + with CustomObjectScope({"IntegerLookup": cls}): + output_data = testing_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data) + self.assertAllClose(expected_output, output_data) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingInputTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]], + dtype=np.int64) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingMultiOOVTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 133], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [6, 2] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()( + max_values=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_value=0, + oov_value=-1) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], 
dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]], + dtype=np.int64) + expected_output = [[3, 4, 6], [6, 5, 3, 2]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None, num_oov_indices=2) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingAdaptTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_adapt(self): + vocab_data = sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1], [1, 2]], + values=[203, 1729, 203], + dense_shape=[3, 4]) + vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) + + layer = get_layer_class()() + layer.adapt(vocab_dataset) + expected_vocabulary = [0, -1, 203, 1729] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_ragged_adapt(self): + vocab_data = ragged_factory_ops.constant([[203], [1729, 203]]) + vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) + + layer = get_layer_class()() + layer.adapt(vocab_dataset) + expected_vocabulary = [0, -1, 203, 1729] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]], + dtype=np.int64) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_single_int_generator_dataset(self): + + def word_gen(): + for _ in itertools.count(1): + yield random.randint(0, 100) + + ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.int64, + tensor_shape.TensorShape([])) + batched_ds = ds.take(2) + input_t = keras.Input(shape=(), dtype=dtypes.int64) + layer = get_layer_class()( + max_values=10, num_oov_indices=0, mask_value=None, oov_value=None) + _ = layer(input_t) + layer.adapt(batched_ds) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupOutputTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_int_output(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 
1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()() + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_output_shape(self): + input_data = keras.Input(shape=(4,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, num_oov_indices=1) + int_data = layer(input_data) + self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) + + def test_int_output_no_reserved_zero(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, mask_value=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()( + vocabulary=vocab_data, + max_values=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupVocabularyTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(str(vocab) + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_get_vocab_returns_int(self): + vocab_data = [42, 1138, 725, 1729] + expected_vocab = [0, -1, 42, 1138, 725, 1729] + layer = get_layer_class()(vocabulary=vocab_data) + layer_vocab = layer.get_vocabulary() + self.assertAllEqual(expected_vocab, layer_vocab) + self.assertIsInstance(layer_vocab[0], np.int64) + + def test_int_output_explicit_vocab_from_file(self): + vocab_list = [42, 1138, 725, 1729] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = 
model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_non_unique_vocab_fails(self): + vocab_data = [42, 1138, 725, 1729, 1729] + with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"): + _ = get_layer_class()(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = [42, 1138, 725, 1729, 42] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with self.assertRaisesRegex(ValueError, ".*repeated term.*42.*"): + _ = get_layer_class()(vocabulary=vocab_path) + + +@keras_parameterized.run_all_keras_modes(always_skip_eager=True) +class IntegerLookupSaveableTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def test_ops_are_not_added_with_multiple_get_set_weights(self): + vocab_data = [42, 1138, 725, 1729] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + weights = model.get_weights() + model.set_weights(weights) + keras.backend.get_session().graph.finalize() + weights = model.get_weights() + model.set_weights(weights) + + def test_layer_saving_with_h5(self): + vocab_data = [42, 1138, 725, 1729] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + path = os.path.join(self.get_temp_dir(), "model") + with self.assertRaisesRegex(NotImplementedError, + "Save or restore weights that is not.*"): + save.save_model(model, path, save_format="h5") + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupErrorTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_too_long_vocab_fails_in_single_setting(self): + vocab_data = [42, 1138, 725, 1729] + + layer = get_layer_class()(max_values=4, num_oov_indices=1) + with self.assertRaisesRegex(ValueError, + "vocabulary larger than the maximum vocab.*"): + layer.set_vocabulary(vocab_data) + + def test_zero_max_values_fails(self): + with self.assertRaisesRegex(ValueError, ".*max_values.*"): + _ = get_layer_class()(max_values=0, num_oov_indices=1) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupSavingTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_vocabulary_persistence_across_saving(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, num_oov_indices=1) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is generated + # from scratch. + # TODO(b/149526183): Can't clear session when TF2 is disabled. 
+ if tf2.enabled(): + keras.backend.clear_session() + + loaded_model = keras.models.load_model( + output_path, custom_objects={"IntegerLookup": get_layer_class()}) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py new file mode 100644 index 00000000000..ec326f4d78b --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py @@ -0,0 +1,25 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import integer_lookup + + +class IntegerLookup(integer_lookup.IntegerLookup, index_lookup_v1.IndexLookup): + pass diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup.py b/tensorflow/python/keras/layers/preprocessing/string_lookup.py new file mode 100644 index 00000000000..4032486b5f0 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup.py @@ -0,0 +1,106 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.keras.layers.preprocessing import index_lookup +from tensorflow.python.keras.layers.preprocessing import table_utils + + +class StringLookup(index_lookup.IndexLookup): + """Maps strings from a vocabulary to integer indices. + + This layer translates a set of arbitrary strings into an integer output via a + table-based lookup, with optional out-of-vocabulary handling. 
+ + If desired, the user can call this layer's `adapt()` method on a data set, + which will analyze the data set, determine the frequency of individual string + values, and create a vocabulary from them. This vocabulary can have + unlimited size or be capped, depending on the configuration options for this + layer; if there are more unique values in the input than the maximum + vocabulary size, the most frequent terms will be used to create the + vocabulary. + + Attributes: + max_tokens: The maximum size of the vocabulary for this layer. If None, + there is no cap on the size of the vocabulary. Note that this vocabulary + includes the OOV and mask tokens, so the effective number of tokens is + (max_tokens - num_oov_indices - (1 if mask_token else 0)) + num_oov_indices: The number of out-of-vocabulary tokens to use; defaults to + 1. If this value is more than 1, OOV inputs are hashed to determine their + OOV value; if this value is 0, passing an OOV input will result in a '-1' + being returned for that value in the output tensor. (Note that, because + the value is -1 and not 0, this will allow you to effectively drop OOV + values from categorical encodings.) + mask_token: A token that represents masked values, and which is mapped to + index 0. Defaults to the empty string "". If set to None, no mask term + will be added and the OOV tokens, if any, will be indexed from + (0...num_oov_indices) instead of (1...num_oov_indices+1). + oov_token: The token representing an out-of-vocabulary value. Defaults to + "[OOV]". + vocabulary: An optional list of vocabulary terms, or a path to a text file + containing a vocabulary to load into this layer. The file should contain + one token per line. If the list or file contains the same token multiple + times, an error will be thrown. + encoding: The Python string encoding to use. Defaults to `'utf-8'`. + """ + + def __init__(self, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary=None, + encoding="utf-8", + **kwargs): + allowed_dtypes = [dtypes.string] + + if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: + raise ValueError("StringLookup may only have a dtype in %s." % + allowed_dtypes) + + if "dtype" not in kwargs: + kwargs["dtype"] = dtypes.string + + if vocabulary is not None: + if isinstance(vocabulary, str): + vocabulary = table_utils.get_vocabulary_from_file(vocabulary, encoding) + + self.encoding = encoding + + super(StringLookup, self).__init__( + max_tokens=max_tokens, + num_oov_indices=num_oov_indices, + mask_token=mask_token, + oov_token=oov_token, + vocabulary=vocabulary, + **kwargs) + + def get_config(self): + config = {"encoding": self.encoding} + base_config = super(StringLookup, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def get_vocabulary(self): + if self._table_handler.vocab_size() == 0: + return [] + + keys, values = self._table_handler.data() + # This is required because the MutableHashTable doesn't preserve insertion + # order, but we rely on the order of the array to assign indices. + return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))] diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py new file mode 100644 index 00000000000..b2a610ac328 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py @@ -0,0 +1,224 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
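
# A minimal usage sketch for the StringLookup layer defined above, assuming the
# in-tree module path from this diff and eager execution. Index 0 is reserved
# for the mask token "" and index 1 for the "[OOV]" token, so vocabulary terms
# are indexed from 2 onward.
import numpy as np

from tensorflow.python.keras.layers.preprocessing import string_lookup

layer = string_lookup.StringLookup(vocabulary=["earth", "wind", "and", "fire"])
print(layer.get_vocabulary())  # expected: ["", "[OOV]", "earth", "wind", "and", "fire"]
print(layer(np.array([["earth", "wind", "michigan"]])))  # "michigan" is OOV -> index 1
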
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Keras text vectorization preprocessing layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from absl.testing import parameterized +import numpy as np +import six + +from tensorflow.python import keras + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.keras.layers.preprocessing import string_lookup +from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 +from tensorflow.python.keras.saving import save +from tensorflow.python.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test + + +def get_layer_class(): + if context.executing_eagerly(): + return string_lookup.StringLookup + else: + return string_lookup_v1.StringLookup + + +def _get_end_to_end_test_cases(): + test_cases = ( + { + "testcase_name": + "test_strings_soft_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed by + # 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. + "vocab_data": + np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], + ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), + "input_data": + np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], + ["and"], ["earth"], ["michigan"]]), + "kwargs": { + "max_tokens": None, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + "input_dtype": + dtypes.string + }, + ) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases + + +@keras_parameterized.run_all_keras_modes +class StringLookupLayerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, + use_dataset, expected_output, + input_dtype): + cls = get_layer_class() + expected_output_dtype = dtypes.int64 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. 
When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # StringLookup), the concatenation fails. In real use cases, this may + # not be an issue because users are likely to pipe the preprocessing layer + # into other keras layers instead of predicting it directly. A workaround + # for these unit tests is to have the dataset only contain one batch, so + # no concatenation needs to happen with the result. For consistency with + # numpy input, we should make `predict` join differently shaped results + # together sensibly, with 0 padding. + input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch( + input_shape[0]) + vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0]) + + with CustomObjectScope({"StringLookup": cls}): + output_data = testing_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data) + self.assertAllClose(expected_output, output_data) + + +@keras_parameterized.run_all_keras_modes +class StringLookupVocabularyTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_get_vocab_returns_str(self): + vocab_data = ["earth", "wind", "and", "fire"] + expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()(vocabulary=vocab_data) + layer_vocab = layer.get_vocabulary() + self.assertAllEqual(expected_vocab, layer_vocab) + self.assertIsInstance(layer_vocab[0], six.text_type) + + def test_int_output_explicit_vocab_from_file(self): + vocab_list = ["earth", "wind", "and", "fire"] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_non_unique_vocab_fails(self): + vocab_data = ["earth", "wind", "and", "fire", "fire"] + with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): + _ = get_layer_class()(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = ["earth", "wind", "and", "fire", "earth"] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with 
self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"): + _ = get_layer_class()(vocabulary=vocab_path) + + +@keras_parameterized.run_all_keras_modes(always_skip_eager=True) +class StringLookupSaveableTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_ops_are_not_added_with_multiple_get_set_weights(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(max_tokens=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + weights = model.get_weights() + model.set_weights(weights) + keras.backend.get_session().graph.finalize() + weights = model.get_weights() + model.set_weights(weights) + + def test_layer_saving_with_h5(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(max_tokens=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + path = os.path.join(self.get_temp_dir(), "model") + with self.assertRaisesRegex(NotImplementedError, + "Save or restore weights that is not.*"): + save.save_model(model, path, save_format="h5") + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py new file mode 100644 index 00000000000..0d4c70de655 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py @@ -0,0 +1,25 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import string_lookup + + +class StringLookup(string_lookup.StringLookup, index_lookup_v1.IndexLookup): + pass diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py index f5397da1f3e..05447f6e9ff 100644 --- a/tensorflow/python/keras/layers/preprocessing/table_utils.py +++ b/tensorflow/python/keras/layers/preprocessing/table_utils.py @@ -189,4 +189,3 @@ def convert_to_ndarray(x, dtype=None): if np.can_cast(array.dtype, np_dtype): array = array.astype(np_dtype, casting="safe") return array - diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 119e0b5ccff..4156ba50c02 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -32,7 +32,7 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine.base_preprocessing_layer import Combiner from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer from tensorflow.python.keras.layers.preprocessing import categorical_encoding -from tensorflow.python.keras.layers.preprocessing import index_lookup +from tensorflow.python.keras.layers.preprocessing import string_lookup from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -269,10 +269,6 @@ class TextVectorization(CombinerPreprocessingLayer): self._max_tokens = max_tokens - # In INT mode, we have two reserved values (PAD and OOV). However, non-INT - # modes don't have a PAD value, so we only need to reserve one value. - self._reserved_values = 2 if output_mode == INT else 1 - # In INT mode, the zero value is reserved for padding (per Keras standard # padding approaches). In non-INT modes, there is no padding so we can set # the OOV value to zero instead of one. @@ -303,9 +299,9 @@ class TextVectorization(CombinerPreprocessingLayer): self._max_vocab_size, compute_idf=output_mode == TFIDF), **kwargs) - reserve_zero = output_mode in [None, INT] + mask_token = "" if output_mode in [None, INT] else None self._index_lookup_layer = self._get_index_lookup_class()( - max_tokens=max_tokens, reserve_zero=reserve_zero, dtype=dtypes.string) + max_tokens=max_tokens, mask_token=mask_token) # If this layer is configured for string or integer output, we do not # create a vectorization layer (as the output is not vectorized). @@ -328,7 +324,7 @@ class TextVectorization(CombinerPreprocessingLayer): return (keys.numpy(), values.numpy()) def _get_index_lookup_class(self): - return index_lookup.IndexLookup + return string_lookup.StringLookup def _to_numpy(self, preprocessed_data): """Converts preprocessed inputs into numpy arrays.""" @@ -428,26 +424,21 @@ class TextVectorization(CombinerPreprocessingLayer): def set_vocabulary(self, vocab, df_data=None, - oov_df_value=None, - append=False): + oov_df_value=None): """Sets vocabulary (and optionally document frequency) data for this layer. 
This method sets the vocabulary and DF data for this layer directly, instead of analyzing a dataset through 'adapt'. It should be used whenever the vocab (and optionally document frequency) information is already known. If - vocabulary data is already present in the layer, this method will either - replace it, if 'append' is set to False, or append to it (if 'append' is set - to True). + vocabulary data is already present in the layer, this method will replace + it. Arguments: vocab: An array of string tokens. df_data: An array of document frequency data. Only necessary if the layer output_mode is TFIDF. oov_df_value: The document frequency of the OOV token. Only necessary if - output_mode is TFIDF. OOV data is optional when appending additional - data in TFIDF mode; if an OOV value is supplied it will overwrite the - existing OOV value. - append: Whether to overwrite or append any existing vocabulary data. + output_mode is TFIDF. Raises: ValueError: If there are too many inputs, the inputs do not match, or @@ -468,8 +459,7 @@ class TextVectorization(CombinerPreprocessingLayer): "be changed after the layer is " "called.").format(mode=self._output_mode)) - current_table_size = self._index_lookup_layer.vocab_size() - self._index_lookup_layer.set_vocabulary(vocab, append) + self._index_lookup_layer.set_vocabulary(vocab) # When doing raw or integer output, we don't have a Vectorize layer to # manage. In this case, we can return directly. @@ -477,14 +467,9 @@ class TextVectorization(CombinerPreprocessingLayer): return if not self._pad_to_max or self._max_tokens is None: - num_tokens = self._index_lookup_layer.vocab_size() + self._reserved_values + num_tokens = self._index_lookup_layer.vocab_size() self._vectorize_layer.set_num_elements(num_tokens) - # We're only _really_ appending if the table_size is nonzero. This is - # important for some sanity checks in tfidf mode (specifically, checking if - # oov_df_value is set or not) and handling existing tfidf weight data. - append = append if current_table_size > 0 else False - if self._output_mode == TFIDF: if df_data is None: raise ValueError("df_data must be set if output_mode is TFIDF") @@ -492,31 +477,14 @@ class TextVectorization(CombinerPreprocessingLayer): raise ValueError("df_data must be the same length as vocab. " "len(df_data) is %s, len(vocab) is %s" % (len(vocab), len(df_data))) - if not append and oov_df_value is None: - raise ValueError("You must pass an oov_df_value the first time " - "'set_vocabulary' is called when output_mode is " + if oov_df_value is None: + raise ValueError("You must pass an oov_df_value when output_mode is " "TFIDF.") df_data = self._convert_to_ndarray(df_data) - if append: - # The existing IDF data is stored in a Keras weight, so we can get it - # by calling K.get_value() on the weight object. Take the first - # table_size+1 values in case we're padding the weight with zeros - existing_df_data = K.get_value( - self._vectorize_layer.tf_idf_weights)[:current_table_size + 1] - df_data = np.append(existing_df_data, df_data, axis=0) - # If we are appending and need to replace the OOV DF value, we can - # assign it over the existing OOV DF value at index 0 of the (already- - # concatenated) DF value array. - if oov_df_value is not None: - df_data[0] = oov_df_value - else: - # If we are not appending (that is, we have only new data) we need to - # insert the OOV value to the front of the array. (This is a append to - # the head, not a replacement of the zeroth value.) 
- if not isinstance(oov_df_value, np.ndarray): - oov_df_value = np.array([oov_df_value]) - df_data = np.insert(df_data, 0, oov_df_value) + if not isinstance(oov_df_value, np.ndarray): + oov_df_value = np.array([oov_df_value]) + df_data = np.insert(df_data, 0, oov_df_value) self._vectorize_layer.set_tfidf_data(df_data) def build(self, input_shape): @@ -536,8 +504,10 @@ class TextVectorization(CombinerPreprocessingLayer): if not self.built: raise RuntimeError("_set_state_variables() must be called after build().") if self._output_mode == TFIDF: - self.set_vocabulary(updates[_VOCAB_NAME], updates[_IDF_NAME], - updates[_OOV_IDF_NAME]) + self.set_vocabulary( + updates[_VOCAB_NAME], + updates[_IDF_NAME], + updates[_OOV_IDF_NAME]) else: self.set_vocabulary(updates[_VOCAB_NAME]) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index d8325f39149..f8a1f5b9434 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -619,25 +619,6 @@ class TextVectorizationOutputTest( output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - def test_vocab_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.set_vocabulary(vocab_data[0]) - layer.set_vocabulary(vocab_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - def test_int_output_densifies_with_zeros(self): vocab_data = ["earth", "wind", "and", "fire"] # Create an input array that has 5 elements in the first example and 4 in @@ -1046,7 +1027,10 @@ class TextVectorizationOutputTest( split=None, output_mode=text_vectorization.TFIDF, pad_to_max_tokens=True) - layer.set_vocabulary(vocab_data, df_data=tfidf_data, oov_df_value=.05) + layer.set_vocabulary( + vocab_data, + df_data=tfidf_data, + oov_df_value=.05) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -1084,60 +1068,6 @@ class TextVectorizationOutputTest( output_dataset = model.predict(input_array) self.assertAllClose(expected_output, output_dataset) - def test_tfidf_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - tfidf_data = [[.5, .25], [.2, .125]] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, 1, .25, .2, 0], - [.1, .5, 0, 0, .125]] - # pylint: enable=bad-whitespace - # pyformat: enable - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05) - layer.set_vocabulary(vocab_data[1], df_data=tfidf_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = 
model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - - def test_tfidf_appending_with_oov_replacement(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - tfidf_data = [[.5, .25], [.2, .125]] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, 1, .25, .2, 0], - [1.5, .5, 0, 0, .125]] - # pylint: enable=bad-whitespace - # pyformat: enable - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05) - # Note that here we've replaced the OOV vaue. - layer.set_vocabulary( - vocab_data[1], df_data=tfidf_data[1], oov_df_value=.75, append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - def test_accept_1D_input(self): input_array = np.array(["earth wind and fire", "fire and earth michigan"]) @@ -1274,22 +1204,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase, "vocabulary larger than the maximum vocab.*"): layer.set_vocabulary(vocab_data) - def test_too_long_vocab_fails_in_multiple_settings(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - - layer = get_layer_class()( - max_tokens=4, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - - # The first time we call set_vocabulary, we're under the max_tokens limit - # so it should be fine. - layer.set_vocabulary(vocab_data[0]) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab.*"): - layer.set_vocabulary(vocab_data[1], append=True) - def test_setting_vocab_without_tfidf_data_fails_in_tfidf_mode(self): vocab_data = ["earth", "wind", "and", "fire"] @@ -1326,18 +1240,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase, "You must pass an oov_df_value.*"): layer.set_vocabulary(vocab_data, df_data) - def test_tfidf_set_vocab_with_no_oov_fails_with_append_set(self): - vocab_data = ["earth", "wind", "and", "fire"] - df_data = [1, 2, 3, 4] - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - with self.assertRaisesRegex(ValueError, - "You must pass an oov_df_value.*"): - layer.set_vocabulary(vocab_data, df_data, append=True) - def test_set_tfidf_in_non_tfidf_fails(self): vocab_data = ["earth", "wind", "and", "fire"] df_data = [1, 2, 3, 4] diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py index b869bee52ab..59cf2c61288 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py @@ -23,7 +23,7 @@ import numpy as np from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine import base_preprocessing_layer_v1 from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 -from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 from tensorflow.python.keras.layers.preprocessing import text_vectorization from tensorflow.python.ops.ragged import 
ragged_tensor_value from tensorflow.python.util.tf_export import keras_export @@ -84,7 +84,7 @@ class TextVectorization(text_vectorization.TextVectorization, return categorical_encoding_v1.CategoricalEncoding def _get_index_lookup_class(self): - return index_lookup_v1.IndexLookup + return string_lookup_v1.StringLookup def _to_numpy(self, data): """Converts preprocessed inputs into numpy arrays.""" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt index 47852865558..4f5b0f480e4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt @@ -221,7 +221,7 @@ tf_class { } member_method { name: "set_vocabulary" - argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt index 05154268354..a33f65189fd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt @@ -219,7 +219,7 @@ tf_class { } member_method { name: "set_vocabulary" - argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "set_weights" From 37df93331ee0ee9f00830ee223a79198edca8f89 Mon Sep 17 00:00:00 2001 From: David Rim Date: Thu, 14 May 2020 19:19:49 -0700 Subject: [PATCH 0631/1533] Adds utility methods for storing SignatureDefs in the metadata table in the flatbuffer PiperOrigin-RevId: 311652937 Change-Id: I397c7ce6fad843cff789dedb583d6df44545db3f --- tensorflow/lite/tools/signature/BUILD | 106 +++++++++++ .../tools/signature/signature_def_util.cc | 175 ++++++++++++++++++ .../lite/tools/signature/signature_def_util.h | 71 +++++++ .../signature/signature_def_util_test.cc | 167 +++++++++++++++++ .../signature_def_util_wrapper_pybind11.cc | 95 ++++++++++ .../tools/signature/signature_def_utils.py | 95 ++++++++++ .../signature/signature_def_utils_test.py | 76 ++++++++ 7 files changed, 785 insertions(+) create mode 100644 tensorflow/lite/tools/signature/BUILD create mode 100644 tensorflow/lite/tools/signature/signature_def_util.cc create mode 100644 tensorflow/lite/tools/signature/signature_def_util.h create mode 100644 tensorflow/lite/tools/signature/signature_def_util_test.cc create mode 100644 tensorflow/lite/tools/signature/signature_def_util_wrapper_pybind11.cc create mode 100644 tensorflow/lite/tools/signature/signature_def_utils.py create mode 100644 tensorflow/lite/tools/signature/signature_def_utils_test.py diff --git a/tensorflow/lite/tools/signature/BUILD 
b/tensorflow/lite/tools/signature/BUILD new file mode 100644 index 00000000000..cf28b2eab72 --- /dev/null +++ b/tensorflow/lite/tools/signature/BUILD @@ -0,0 +1,106 @@ +# Utilities for signature_defs in TFLite +load("//tensorflow:tensorflow.bzl", "pybind_extension") +load("//tensorflow:tensorflow.bzl", "if_not_windows") +load("//tensorflow/lite:build_def.bzl", "tflite_copts") +load("//tensorflow/lite/micro:build_def.bzl", "cc_library") +load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite") + +package( + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +TFLITE_DEFAULT_COPTS = if_not_windows([ + "-Wall", + "-Wno-comment", + "-Wno-extern-c-compat", +]) + +cc_library( + name = "signature_def_util", + srcs = ["signature_def_util.cc"], + hdrs = ["signature_def_util.h"], + copts = TFLITE_DEFAULT_COPTS + tflite_copts(), + deps = [ + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:protos_all_cc_impl", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "//tensorflow/lite:framework", + "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", + "@com_google_absl//absl/memory", + "@com_google_protobuf//:protobuf", + "@flatbuffers", + ], +) + +cc_test( + name = "signature_def_util_test", + size = "small", + srcs = ["signature_def_util_test.cc"], + data = [ + "//tensorflow/lite:testdata/add.bin", + ], + tags = [ + "tflite_not_portable", + ], + deps = [ + ":signature_def_util", + "//tensorflow/cc/saved_model:signature_constants", + "//tensorflow/core:tflite_portable_logging", + "//tensorflow/core/platform:errors", + "//tensorflow/lite:framework_lib", + "//tensorflow/lite/c:c_api", + "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) + +pybind_extension( + name = "_pywrap_signature_def_util_wrapper", + srcs = [ + "signature_def_util_wrapper_pybind11.cc", + ], + module_name = "_pywrap_signature_def_util_wrapper", + deps = [ + ":signature_def_util", + "//tensorflow/lite:framework_lib", + "//tensorflow/python:pybind11_lib", + "@pybind11", + ], +) + +py_library( + name = "signature_def_utils", + srcs = ["signature_def_utils.py"], + srcs_version = "PY2AND3", + deps = [ + ":_pywrap_signature_def_util_wrapper", + "//tensorflow/core:protos_all_py", + ], +) + +py_test( + name = "signature_def_utils_test", + srcs = ["signature_def_utils_test.py"], + data = ["//tensorflow/lite:testdata/add.bin"], + python_version = "PY3", + srcs_version = "PY2AND3", + tags = [ + "no_mac", + ], + visibility = ["//visibility:public"], + deps = [ + ":signature_def_utils", + "//tensorflow:tensorflow_py", + "//tensorflow/core:protos_all_py", + ], +) + +tflite_portable_test_suite() diff --git a/tensorflow/lite/tools/signature/signature_def_util.cc b/tensorflow/lite/tools/signature/signature_def_util.cc new file mode 100644 index 00000000000..e44fe98b3cc --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_util.cc @@ -0,0 +1,175 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/tools/signature/signature_def_util.h" + +#include + +#include "absl/memory/memory.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/lite/model_builder.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +namespace { + +using tensorflow::Status; +using SerializedSignatureDefMap = std::map; +using SignatureDefMap = std::map; + +const Metadata* GetSignatureDefMetadata(const Model* model) { + if (!model || !model->metadata()) { + return nullptr; + } + for (int i = 0; i < model->metadata()->size(); ++i) { + const Metadata* metadata = model->metadata()->Get(i); + if (metadata->name()->str() == kSignatureDefsMetadataName) { + return metadata; + } + } + return nullptr; +} + +Status ReadSignatureDefMap(const Model* model, const Metadata* metadata, + SerializedSignatureDefMap* map) { + if (!model || !metadata || !map) { + return tensorflow::errors::InvalidArgument("Arguments must not be nullptr"); + } + const flatbuffers::Vector* flatbuffer_data = + model->buffers()->Get(metadata->buffer())->data(); + const auto signature_defs = + flexbuffers::GetRoot(flatbuffer_data->data(), flatbuffer_data->size()) + .AsMap(); + for (int i = 0; i < signature_defs.Keys().size(); ++i) { + const std::string key = signature_defs.Keys()[i].AsString().c_str(); + (*map)[key] = signature_defs[key].AsString().c_str(); + } + return tensorflow::Status::OK(); +} + +} // namespace + +Status SetSignatureDefMap(const Model* model, + const SignatureDefMap& signature_def_map, + std::string* model_data_with_signature_def) { + if (!model || !model_data_with_signature_def) { + return tensorflow::errors::InvalidArgument("Arguments must not be nullptr"); + } + if (signature_def_map.empty()) { + return tensorflow::errors::InvalidArgument( + "signature_def_map should not be empty"); + } + flexbuffers::Builder fbb; + const size_t start_map = fbb.StartMap(); + auto mutable_model = absl::make_unique(); + model->UnPackTo(mutable_model.get(), nullptr); + int buffer_id = mutable_model->buffers.size(); + const Metadata* metadata = GetSignatureDefMetadata(model); + if (metadata) { + buffer_id = metadata->buffer(); + } else { + auto buffer = absl::make_unique(); + mutable_model->buffers.emplace_back(std::move(buffer)); + auto sigdef_metadata = absl::make_unique(); + sigdef_metadata->buffer = buffer_id; + sigdef_metadata->name = kSignatureDefsMetadataName; + mutable_model->metadata.emplace_back(std::move(sigdef_metadata)); + } + for (const auto& entry : signature_def_map) { + fbb.String(entry.first.c_str(), entry.second.SerializeAsString()); + } + fbb.EndMap(start_map); + fbb.Finish(); + mutable_model->buffers[buffer_id]->data = fbb.GetBuffer(); + flatbuffers::FlatBufferBuilder builder; + auto packed_model = Model::Pack(builder, mutable_model.get()); + FinishModelBuffer(builder, 
packed_model); + *model_data_with_signature_def = + std::string(reinterpret_cast(builder.GetBufferPointer()), + builder.GetSize()); + return Status::OK(); +} + +bool HasSignatureDef(const Model* model, const std::string& signature_key) { + if (!model) { + return false; + } + const Metadata* metadata = GetSignatureDefMetadata(model); + if (!metadata) { + return false; + } + SerializedSignatureDefMap signature_defs; + if (ReadSignatureDefMap(model, metadata, &signature_defs) != + tensorflow::Status::OK()) { + return false; + } + return (signature_defs.find(signature_key) != signature_defs.end()); +} + +Status GetSignatureDefMap(const Model* model, + SignatureDefMap* signature_def_map) { + if (!model || !signature_def_map) { + return tensorflow::errors::InvalidArgument("Arguments must not be nullptr"); + } + SignatureDefMap retrieved_signature_def_map; + const Metadata* metadata = GetSignatureDefMetadata(model); + if (metadata) { + SerializedSignatureDefMap signature_defs; + auto status = ReadSignatureDefMap(model, metadata, &signature_defs); + if (status != tensorflow::Status::OK()) { + return tensorflow::errors::Internal("Error reading signature def map: %s", + status.error_message()); + } + for (const auto& entry : signature_defs) { + tensorflow::SignatureDef signature_def; + if (!signature_def.ParseFromString(entry.second)) { + return tensorflow::errors::Internal( + "Cannot parse signature def found in flatbuffer."); + } + retrieved_signature_def_map[entry.first] = signature_def; + } + *signature_def_map = retrieved_signature_def_map; + } + return Status::OK(); +} + +Status ClearSignatureDefMap(const Model* model, std::string* model_data) { + if (!model || !model_data) { + return tensorflow::errors::InvalidArgument("Arguments must not be nullptr"); + } + auto mutable_model = absl::make_unique(); + model->UnPackTo(mutable_model.get(), nullptr); + for (int id = 0; id < model->metadata()->size(); ++id) { + const Metadata* metadata = model->metadata()->Get(id); + if (metadata->name()->str() == kSignatureDefsMetadataName) { + auto* buffers = &(mutable_model->buffers); + buffers->erase(buffers->begin() + metadata->buffer()); + mutable_model->metadata.erase(mutable_model->metadata.begin() + id); + break; + } + } + flatbuffers::FlatBufferBuilder builder; + auto packed_model = Model::Pack(builder, mutable_model.get()); + FinishModelBuffer(builder, packed_model); + *model_data = + std::string(reinterpret_cast(builder.GetBufferPointer()), + builder.GetSize()); + return Status::OK(); +} + +} // namespace tflite diff --git a/tensorflow/lite/tools/signature/signature_def_util.h b/tensorflow/lite/tools/signature/signature_def_util.h new file mode 100644 index 00000000000..7e9c96ffc43 --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_util.h @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_TOOLS_SIGNATURE_DEF_UTIL_H_ +#define TENSORFLOW_LITE_TOOLS_SIGNATURE_DEF_UTIL_H_ + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { + +// Constant for name of the Metadata entry associated with SignatureDefs. +constexpr char kSignatureDefsMetadataName[] = "signature_defs_metadata"; + +// The function `SetSignatureDefMap()` results in +// `model_data_with_signature_defs` containing a serialized TFLite model +// identical to `model` with a metadata and associated buffer containing +// a FlexBuffer::Map with `signature_def_map` keys and values serialized to +// String. +// +// If a Metadata entry containing a SignatureDef map exists, it will be +// overwritten. +// +// Returns error if `model_data_with_signature_defs` is null or +// `signature_def_map` is empty. +// +// On success, returns tensorflow::Status::OK() or error otherwise. +// On error, `model_data_with_signature_defs` is unchanged. +tensorflow::Status SetSignatureDefMap( + const Model* model, + const std::map& signature_def_map, + std::string* model_data_with_signature_defs); + +// The function `HasSignatureDef()` returns true if `model` contains a Metadata +// table pointing to a buffer containing a FlexBuffer::Map and the map has +// `signature_key` as a key, or false otherwise. +bool HasSignatureDef(const Model* model, const std::string& signature_key); + +// The function `GetSignatureDefMap()` results in `signature_def_map` +// pointing to a map +// parsed from `model`'s metadata buffer. +// +// If the Metadata entry does not exist, `signature_def_map` is unchanged. +// If the Metadata entry exists but cannot be parsed, returns an error. +tensorflow::Status GetSignatureDefMap( + const Model* model, + std::map* signature_def_map); + +// The function `ClearSignatureDefs` results in `model_data` +// containing a serialized Model identical to `model` omitting any +// SignatureDef-related metadata or buffers. +tensorflow::Status ClearSignatureDefMap(const Model* model, + std::string* model_data); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_TOOLS_SIGNATURE_DEF_UTIL_H_ diff --git a/tensorflow/lite/tools/signature/signature_def_util_test.cc b/tensorflow/lite/tools/signature/signature_def_util_test.cc new file mode 100644 index 00000000000..d4581e262a4 --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_util_test.cc @@ -0,0 +1,167 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/tools/signature/signature_def_util.h" + +#include +#include "tensorflow/cc/saved_model/signature_constants.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/lite/c/c_api.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/model_builder.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/testing/util.h" + +namespace tflite { +namespace { + +using tensorflow::kClassifyMethodName; +using tensorflow::kDefaultServingSignatureDefKey; +using tensorflow::kPredictMethodName; +using tensorflow::SignatureDef; +using tensorflow::Status; + +constexpr char kSignatureInput[] = "input"; +constexpr char kSignatureOutput[] = "output"; +constexpr char kTestFilePath[] = "tensorflow/lite/testdata/add.bin"; + +class SimpleSignatureDefUtilTest : public testing::Test { + protected: + void SetUp() override { + flatbuffer_model_ = FlatBufferModel::BuildFromFile(kTestFilePath); + ASSERT_NE(flatbuffer_model_, nullptr); + model_ = flatbuffer_model_->GetModel(); + ASSERT_NE(model_, nullptr); + } + + SignatureDef GetTestSignatureDef() { + auto signature_def = SignatureDef(); + tensorflow::TensorInfo input_tensor; + tensorflow::TensorInfo output_tensor; + *input_tensor.mutable_name() = kSignatureInput; + *output_tensor.mutable_name() = kSignatureOutput; + *signature_def.mutable_method_name() = kClassifyMethodName; + (*signature_def.mutable_inputs())[kSignatureInput] = input_tensor; + (*signature_def.mutable_outputs())[kSignatureOutput] = output_tensor; + return signature_def; + } + std::unique_ptr flatbuffer_model_; + const Model* model_; +}; + +TEST_F(SimpleSignatureDefUtilTest, SetSignatureDefTest) { + SignatureDef expected_signature_def = GetTestSignatureDef(); + std::string model_output; + const std::map expected_signature_def_map = { + {kDefaultServingSignatureDefKey, expected_signature_def}}; + EXPECT_EQ(Status::OK(), SetSignatureDefMap(model_, expected_signature_def_map, + &model_output)); + const Model* add_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_TRUE(HasSignatureDef(add_model, kDefaultServingSignatureDefKey)); + std::map test_signature_def_map; + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(add_model, &test_signature_def_map)); + SignatureDef test_signature_def = + test_signature_def_map[kDefaultServingSignatureDefKey]; + EXPECT_EQ(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); +} + +TEST_F(SimpleSignatureDefUtilTest, OverwriteSignatureDefTest) { + auto expected_signature_def = GetTestSignatureDef(); + std::string model_output; + std::map expected_signature_def_map = { + {kDefaultServingSignatureDefKey, expected_signature_def}}; + EXPECT_EQ(Status::OK(), SetSignatureDefMap(model_, expected_signature_def_map, + &model_output)); + const Model* add_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_TRUE(HasSignatureDef(add_model, kDefaultServingSignatureDefKey)); + std::map test_signature_def_map; + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(add_model, &test_signature_def_map)); + SignatureDef test_signature_def = + test_signature_def_map[kDefaultServingSignatureDefKey]; + EXPECT_EQ(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); + *expected_signature_def.mutable_method_name() = kPredictMethodName; + expected_signature_def_map.erase( + 
expected_signature_def_map.find(kDefaultServingSignatureDefKey)); + constexpr char kTestSignatureDefKey[] = "ServingTest"; + expected_signature_def_map[kTestSignatureDefKey] = expected_signature_def; + EXPECT_EQ( + Status::OK(), + SetSignatureDefMap(add_model, expected_signature_def_map, &model_output)); + const Model* final_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_FALSE(HasSignatureDef(final_model, kDefaultServingSignatureDefKey)); + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(final_model, &test_signature_def_map)); + EXPECT_NE(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); + EXPECT_TRUE(HasSignatureDef(final_model, kTestSignatureDefKey)); + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(final_model, &test_signature_def_map)); + test_signature_def = test_signature_def_map[kTestSignatureDefKey]; + EXPECT_EQ(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); +} + +TEST_F(SimpleSignatureDefUtilTest, GetSignatureDefTest) { + std::map test_signature_def_map; + EXPECT_EQ(Status::OK(), GetSignatureDefMap(model_, &test_signature_def_map)); + EXPECT_FALSE(HasSignatureDef(model_, kDefaultServingSignatureDefKey)); +} + +TEST_F(SimpleSignatureDefUtilTest, ClearSignatureDefTest) { + const int expected_num_buffers = model_->buffers()->size(); + auto expected_signature_def = GetTestSignatureDef(); + std::string model_output; + std::map expected_signature_def_map = { + {kDefaultServingSignatureDefKey, expected_signature_def}}; + EXPECT_EQ(Status::OK(), SetSignatureDefMap(model_, expected_signature_def_map, + &model_output)); + const Model* add_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_TRUE(HasSignatureDef(add_model, kDefaultServingSignatureDefKey)); + SignatureDef test_signature_def; + std::map test_signature_def_map; + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(add_model, &test_signature_def_map)); + test_signature_def = test_signature_def_map[kDefaultServingSignatureDefKey]; + EXPECT_EQ(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); + EXPECT_EQ(Status::OK(), ClearSignatureDefMap(add_model, &model_output)); + const Model* clear_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_FALSE(HasSignatureDef(clear_model, kDefaultServingSignatureDefKey)); + EXPECT_EQ(expected_num_buffers, clear_model->buffers()->size()); +} + +TEST_F(SimpleSignatureDefUtilTest, SetSignatureDefErrorsTest) { + std::map test_signature_def_map; + std::string model_output; + EXPECT_TRUE(tensorflow::errors::IsInvalidArgument( + SetSignatureDefMap(model_, test_signature_def_map, &model_output))); + SignatureDef test_signature_def; + test_signature_def_map[kDefaultServingSignatureDefKey] = test_signature_def; + EXPECT_TRUE(tensorflow::errors::IsInvalidArgument( + SetSignatureDefMap(model_, test_signature_def_map, nullptr))); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/lite/tools/signature/signature_def_util_wrapper_pybind11.cc b/tensorflow/lite/tools/signature/signature_def_util_wrapper_pybind11.cc new file mode 100644 index 00000000000..9477305d433 --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_util_wrapper_pybind11.cc @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "pybind11/pybind11.h" +#include "pybind11/pytypes.h" +#include "pybind11/stl.h" +#include "tensorflow/lite/model_builder.h" +#include "tensorflow/lite/tools/signature/signature_def_util.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" + +py::bytes WrappedSetSignatureDefMap( + const std::vector& model_buffer, + const std::map& serialized_signature_def_map) { + auto flatbuffer_model = tflite::FlatBufferModel::BuildFromBuffer( + reinterpret_cast(model_buffer.data()), model_buffer.size()); + auto* model = flatbuffer_model->GetModel(); + if (!model) { + throw std::invalid_argument("Invalid model"); + } + std::string data; + std::map signature_def_map; + for (const auto& entry : serialized_signature_def_map) { + tensorflow::SignatureDef signature_def; + if (!signature_def.ParseFromString(entry.second)) { + throw std::invalid_argument("Cannot parse signature def"); + } + signature_def_map[entry.first] = signature_def; + } + auto status = tflite::SetSignatureDefMap(model, signature_def_map, &data); + if (status != tensorflow::Status::OK()) { + throw std::invalid_argument(status.error_message()); + } + return py::bytes(data); +} + +std::map WrappedGetSignatureDefMap( + const std::vector& model_buffer) { + auto flatbuffer_model = tflite::FlatBufferModel::BuildFromBuffer( + reinterpret_cast(model_buffer.data()), model_buffer.size()); + auto* model = flatbuffer_model->GetModel(); + if (!model) { + throw std::invalid_argument("Invalid model"); + } + std::string content; + std::map signature_def_map; + auto status = tflite::GetSignatureDefMap(model, &signature_def_map); + if (status != tensorflow::Status::OK()) { + throw std::invalid_argument("Cannot parse signature def"); + } + std::map serialized_signature_def_map; + for (const auto& entry : signature_def_map) { + serialized_signature_def_map[entry.first] = + py::bytes(entry.second.SerializeAsString()); + } + return serialized_signature_def_map; +} + +py::bytes WrappedClearSignatureDefs(const std::vector& model_buffer) { + auto flatbuffer_model = tflite::FlatBufferModel::BuildFromBuffer( + reinterpret_cast(model_buffer.data()), model_buffer.size()); + auto* model = flatbuffer_model->GetModel(); + if (!model) { + throw std::invalid_argument("Invalid model"); + } + std::string content; + auto status = tflite::ClearSignatureDefMap(model, &content); + if (status != tensorflow::Status::OK()) { + throw std::invalid_argument("An unknown error occurred"); + } + return py::bytes(content); +} + +PYBIND11_MODULE(_pywrap_signature_def_util_wrapper, m) { + m.doc() = R"pbdoc( + _pywrap_signature_def_util_wrapper + ----- + )pbdoc"; + + m.def("SetSignatureDefMap", &WrappedSetSignatureDefMap); + + m.def("GetSignatureDefMap", &WrappedGetSignatureDefMap); + + m.def("ClearSignatureDefs", &WrappedClearSignatureDefs); +} diff --git a/tensorflow/lite/tools/signature/signature_def_utils.py b/tensorflow/lite/tools/signature/signature_def_utils.py new 
file mode 100644 index 00000000000..df25c651172 --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_utils.py @@ -0,0 +1,95 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility functions related to SignatureDefs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.core.protobuf import meta_graph_pb2 +from tensorflow.lite.tools.signature import _pywrap_signature_def_util_wrapper as signature_def_util + + +def set_signature_defs(tflite_model, signature_def_map): + """Sets SignatureDefs to the Metadata of a TfLite flatbuffer buffer. + + Args: + tflite_model: Binary TFLite model (bytes or bytes-like object) to which to + add signature_def. + signature_def_map: dict containing SignatureDefs to store in metadata. + Returns: + buffer: A TFLite model binary identical to model buffer with + metadata field containing SignatureDef. + + Raises: + ValueError: + tflite_model buffer does not contain a valid TFLite model. + signature_def_map is empty or does not contain a SignatureDef. + """ + model = tflite_model + if not isinstance(tflite_model, bytearray): + model = bytearray(tflite_model) + serialized_signature_def_map = { + k: v.SerializeToString() for k, v in signature_def_map.items()} + model_buffer = signature_def_util.SetSignatureDefMap( + model, serialized_signature_def_map) + return model_buffer + + +def get_signature_defs(tflite_model): + """Get SignatureDef dict from the Metadata of a TfLite flatbuffer buffer. + + Args: + tflite_model: TFLite model buffer to get the signature_def. + + Returns: + dict containing serving names to SignatureDefs if exists, otherwise, empty + dict. + + Raises: + ValueError: + tflite_model buffer does not contain a valid TFLite model. + DecodeError: + SignatureDef cannot be parsed from TfLite SignatureDef metadata. + """ + model = tflite_model + if not isinstance(tflite_model, bytearray): + model = bytearray(tflite_model) + serialized_signature_def_map = signature_def_util.GetSignatureDefMap(model) + def _deserialize(serialized): + signature_def = meta_graph_pb2.SignatureDef() + signature_def.ParseFromString(serialized) + return signature_def + return {k: _deserialize(v) for k, v in serialized_signature_def_map.items()} + + +def clear_signature_defs(tflite_model): + """Clears SignatureDefs from the Metadata of a TfLite flatbuffer buffer. + + Args: + tflite_model: TFLite model buffer to remove signature_defs. + + Returns: + buffer: A TFLite model binary identical to model buffer with + no SignatureDef metadata. + + Raises: + ValueError: + tflite_model buffer does not contain a valid TFLite model. 
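+
+  Example (a sketch of the round trip exercised by signature_def_utils_test.py
+  below; `tflite_model` is a serialized TFLite model buffer and
+  `signature_def_map` is assumed to be a dict mapping serving keys to
+  SignatureDef protos):
+
+    from tensorflow.lite.tools.signature import signature_def_utils
+
+    with_defs = signature_def_utils.set_signature_defs(
+        tflite_model, signature_def_map)
+    assert signature_def_utils.get_signature_defs(with_defs)
+    stripped = signature_def_utils.clear_signature_defs(with_defs)
+    assert not signature_def_utils.get_signature_defs(stripped)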
+ """ + model = tflite_model + if not isinstance(tflite_model, bytearray): + model = bytearray(tflite_model) + return signature_def_util.ClearSignatureDefs(model) diff --git a/tensorflow/lite/tools/signature/signature_def_utils_test.py b/tensorflow/lite/tools/signature/signature_def_utils_test.py new file mode 100644 index 00000000000..f7cb33188af --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_utils_test.py @@ -0,0 +1,76 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for signature_def_util.py. + + - Tests adding a SignatureDef to TFLite metadata. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tensorflow as tf +from tensorflow.core.protobuf import meta_graph_pb2 +from tensorflow.lite.tools.signature import signature_def_utils + + +class SignatureDefUtilsTest(tf.test.TestCase): + + def testAddSignatureDefToFlatbufferMetadata(self): + """Test a SavedModel conversion has correct Metadata.""" + filename = tf.compat.v1.resource_loader.get_path_to_datafile( + '../../testdata/add.bin') + if not os.path.exists(filename): + raise IOError('File "{0}" does not exist in {1}.'.format( + filename, + tf.compat.v1.resource_loader.get_root_dir_with_all_resources())) + + with tf.io.gfile.GFile(filename, 'rb') as fp: + tflite_model = bytearray(fp.read()) + + self.assertIsNotNone(tflite_model, 'TFLite model is none') + sig_input_tensor = meta_graph_pb2.TensorInfo( + dtype=tf.as_dtype(tf.float32).as_datatype_enum, + tensor_shape=tf.TensorShape([1, 8, 8, 3]).as_proto()) + sig_input_tensor_signature = {'x': sig_input_tensor} + sig_output_tensor = meta_graph_pb2.TensorInfo( + dtype=tf.as_dtype(tf.float32).as_datatype_enum, + tensor_shape=tf.TensorShape([1, 8, 8, 3]).as_proto()) + sig_output_tensor_signature = {'y': sig_output_tensor} + predict_signature_def = ( + tf.compat.v1.saved_model.build_signature_def( + sig_input_tensor_signature, sig_output_tensor_signature, + tf.saved_model.PREDICT_METHOD_NAME)) + serving_key = tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY + signature_def_map = {serving_key: predict_signature_def} + tflite_model = signature_def_utils.set_signature_defs( + tflite_model, signature_def_map) + saved_signature_def_map = signature_def_utils.get_signature_defs( + tflite_model) + signature_def = saved_signature_def_map.get(serving_key) + self.assertIsNotNone(signature_def, 'SignatureDef not found') + self.assertEqual(signature_def.SerializeToString(), + predict_signature_def.SerializeToString()) + remove_tflite_model = ( + signature_def_utils.clear_signature_defs(tflite_model)) + signature_def_map = signature_def_utils.get_signature_defs( + remove_tflite_model) + self.assertIsNone(signature_def_map.get(serving_key), + 'SignatureDef found, but should be missing') + + +if __name__ == '__main__': + tf.test.main() From 
3b225a9776de735ded763d227e0c4c869d8e85c6 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 14 May 2020 19:42:15 -0700 Subject: [PATCH 0632/1533] Rework kernel check for fully_connected. PiperOrigin-RevId: 311655034 Change-Id: Ic82fd9a9350cac89043db85d1ba1d4ec480435e5 --- tensorflow/lite/kernels/kernel_util.cc | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc index b30747eac61..ded536ab3a7 100644 --- a/tensorflow/lite/kernels/kernel_util.cc +++ b/tensorflow/lite/kernels/kernel_util.cc @@ -126,11 +126,27 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context, // pipeline. if (bias) { const double bias_scale = static_cast(bias->params.scale); - // Here we're making sure the input_product_scale & bias_scale the same. - // Normally this should be guaranteed by the training pipeline, we are - // setting the threshold to be 2e-6 to allow some numeric stability - // difference. - TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <= 2e-6); + // Here we're making sure the input_product_scale & bias_scale are about the + // same. Since we have: + // (output - output_zp) * output_scale = + // input_product_scale * input_product + bias * bias_scale ---- (0) + // + // (0) equals: + // (input_product + bias) * input_product_scale ----- (1) + // + + // bias * (bias_scale - input_product_scale) ------ (2) + // + // For the real kernel computation, we're doing (1), so we really need to + // make sure (2) has minimum impact on the output, so: + // bias * (bias_scale - input_product_scale) / output_scale should be + // a small number for an integer. + // Since normally bias should be within a small range. + // We should expect (bias_scale - input_product_scale) / output_scale to + // be a small number like 0.02. + const double scale_diff = std::abs(input_product_scale - bias_scale); + const double output_scale = static_cast(output->params.scale); + + TF_LITE_ENSURE(context, scale_diff / output_scale <= 0.02); } return GetQuantizedConvolutionMultipler(context, input, filter, output, multiplier); From a98948acf8c92f580db09fe739f028796985b9e3 Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Thu, 14 May 2020 19:43:19 -0700 Subject: [PATCH 0633/1533] Use CompactTextString instead of String for generating ops. PiperOrigin-RevId: 311655146 Change-Id: I57e5c595522b47dd9badbf0720569ffef69fed66 --- tensorflow/go/genop/internal/genop.go | 9 +- tensorflow/go/genop/internal/genop_test.go | 255 +++++++++++++++++++++ 2 files changed, 259 insertions(+), 5 deletions(-) diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go index 95547045111..c4ea8abb543 100644 --- a/tensorflow/go/genop/internal/genop.go +++ b/tensorflow/go/genop/internal/genop.go @@ -567,11 +567,10 @@ func isListAttr(attrdef *odpb.OpDef_AttrDef) bool { // This is useful when 's' corresponds to a "oneof" protocol buffer message. // For example, consider the protocol buffer message: // oneof value { bool b = 1; int64 i = 2; } -// String() on a Go corresponding object (using proto.CompactTextString) will -// print "b:true", or "i:7" etc. This function strips out the leading "b:" or -// "i:". -func stripLeadingColon(s fmt.Stringer) string { - x := s.String() +// proto.CompactTextString) will print "b:true", or "i:7" etc. This function +// strips out the leading "b:" or "i:". 
+func stripLeadingColon(m proto.Message) string { + x := proto.CompactTextString(m) y := strings.SplitN(x, ":", 2) if len(y) < 2 { return x diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go index a339d181e8d..b467efc7aea 100644 --- a/tensorflow/go/genop/internal/genop_test.go +++ b/tensorflow/go/genop/internal/genop_test.go @@ -533,6 +533,261 @@ func TestOp(scope *Scope, bb tf.Output, aa tf.Output, optional ...TestOpAttr) (c op := scope.AddOperation(opspec) return op.Output(0) } +`, + }, + { + tag: "SampleDistortedBoundingBox", + opdef: ` +name: "SampleDistortedBoundingBox" +input_arg { + name: "image_size" + type_attr: "T" +} +input_arg { + name: "bounding_boxes" + type: DT_FLOAT +} +output_arg { + name: "begin" + type_attr: "T" +} +output_arg { + name: "size" + type_attr: "T" +} +output_arg { + name: "bboxes" + type: DT_FLOAT +} +attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_UINT8 + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + } + } +} +attr { + name: "seed" + type: "int" + default_value { + i: 0 + } +} +attr { + name: "seed2" + type: "int" + default_value { + i: 0 + } +} +attr { + name: "min_object_covered" + type: "float" + default_value { + f: 0.1 + } +} +attr { + name: "aspect_ratio_range" + type: "list(float)" + default_value { + list { + f: 0.75 + f: 1.33 + } + } +} +attr { + name: "area_range" + type: "list(float)" + default_value { + list { + f: 0.05 + f: 1 + } + } +} +attr { + name: "max_attempts" + type: "int" + default_value { + i: 100 + } +} +attr { + name: "use_image_if_no_bounding_boxes" + type: "bool" + default_value { + b: false + } +} +is_stateful: true +`, + apidef: ` +op { + graph_op_name: "SampleDistortedBoundingBox" + in_arg { + name: "image_size" + description: "Blah blah" + } + in_arg { + name: "bounding_boxes" + description: "Blah blah" + } + out_arg { + name: "begin" + description: "Blah blah" + } + out_arg { + name: "size" + description: "Blah blah" + } + out_arg { + name: "bboxes" + description: "Blah blah" + } + attr { + name: "seed" + description: "Blah blah" + } + attr { + name: "seed2" + description: "Blah blah" + } + attr { + name: "min_object_covered" + description: "Blah blah" + } + attr { + name: "aspect_ratio_range" + description: "Blah blah" + } + attr { + name: "area_range" + description: "Blah blah" + } + attr { + name: "max_attempts" + description: "Blah blah" + } + attr { + name: "use_image_if_no_bounding_boxes" + description: "Blah blah" + } + summary: "Generate a single randomly distorted bounding box for an image." + description: "Blah blah" +} +`, + wanted: ` +// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox. +type SampleDistortedBoundingBoxAttr func(optionalAttr) + +// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value. +// +// value: Blah blah +// If not specified, defaults to 0 +func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value. +// +// value: Blah blah +// If not specified, defaults to 0 +func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value. 
+// +// value: Blah blah +// If not specified, defaults to 0.1 +func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["min_object_covered"] = value + } +} + +// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value. +// +// value: Blah blah +// If not specified, defaults to +func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["aspect_ratio_range"] = value + } +} + +// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value. +// +// value: Blah blah +// If not specified, defaults to +func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["area_range"] = value + } +} + +// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value. +// +// value: Blah blah +// If not specified, defaults to 100 +func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["max_attempts"] = value + } +} + +// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value. +// +// value: Blah blah +// If not specified, defaults to false +func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["use_image_if_no_bounding_boxes"] = value + } +} + +// Generate a single randomly distorted bounding box for an image. +// +// Blah blah +// +// Arguments: +// image_size: Blah blah +// bounding_boxes: Blah blah +// +// Returns: +// begin: Blah blah +// size: Blah blah +// bboxes: Blah blah +func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SampleDistortedBoundingBox", + Input: []tf.Input{ + image_size, bounding_boxes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} `, }, } From 9489fbca6759050ac9d4c9348a65d79b0c5c06ad Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 19:47:32 -0700 Subject: [PATCH 0634/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311655487 Change-Id: Ia4b492dc27139b316a3f8a5b90d68582c05efe4a --- tensorflow/go/op/wrappers.go | 206 +++++++++++++++++------------------ 1 file changed, 103 insertions(+), 103 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index e6725269279..04c36ed3399 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -1274,7 +1274,7 @@ type SqueezeAttr func(optionalAttr) // value: If specified, only squeezes the dimensions listed. The dimension // index starts at 0. It is an error to squeeze a dimension that is not 1. Must // be in the range `[-rank(input), rank(input))`. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func SqueezeAxis(value []int64) SqueezeAttr { @@ -1358,7 +1358,7 @@ type PlaceholderAttr func(optionalAttr) // // value: (Optional) The shape of the tensor. 
If the shape has 0 dimensions, the // shape is unconstrained. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func PlaceholderShape(value tf.Shape) PlaceholderAttr { return func(m optionalAttr) { m["shape"] = value @@ -4016,7 +4016,7 @@ func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSampler // // value: A list of unigram counts or probabilities, one per ID in sequential // order. Exactly one of vocab_file and unigrams should be passed to this op. -// If not specified, defaults to {} +// If not specified, defaults to <> func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr { return func(m optionalAttr) { m["unigrams"] = value @@ -7166,7 +7166,7 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source type TensorArrayV2Attr func(optionalAttr) // TensorArrayV2ElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr { return func(m optionalAttr) { m["element_shape"] = value @@ -7291,7 +7291,7 @@ type TensorArrayConcatV3Attr func(optionalAttr) // excluding the first dimension. Used to validate the shapes of // TensorArray elements. If this shape is not fully specified, concatenating // zero-size TensorArrays is an error. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr { return func(m optionalAttr) { m["element_shape_except0"] = value @@ -7350,7 +7350,7 @@ type TensorArrayGatherV3Attr func(optionalAttr) // value: The expected shape of an element, if known. Used to // validate the shapes of TensorArray elements. If this shape is not // fully specified, gathering zero-size TensorArrays is an error. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr { return func(m optionalAttr) { m["element_shape"] = value @@ -7841,7 +7841,7 @@ type PriorityQueueV2Attr func(optionalAttr) // PriorityQueueV2ComponentTypes sets the optional component_types attribute to value. // // value: The type of each component in a value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr { @@ -8148,7 +8148,7 @@ type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr) // MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value. // // value: The type list for the return values. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr { @@ -8160,7 +8160,7 @@ func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDe // MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value. // // value: The list of shapes being produced. 
-// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr { @@ -8516,7 +8516,7 @@ func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output type OptimizeDatasetAttr func(optionalAttr) // OptimizeDatasetOptimizationConfigs sets the optional optimization_configs attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func OptimizeDatasetOptimizationConfigs(value []string) OptimizeDatasetAttr { return func(m optionalAttr) { m["optimization_configs"] = value @@ -9292,7 +9292,7 @@ type RandomShuffleQueueV2Attr func(optionalAttr) // be either 0 or the same as the length of component_types. If the length of // this attr is 0, the shapes of queue elements are not constrained, and // only one element may be dequeued at a time. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr { @@ -9515,7 +9515,7 @@ func DebugIdentityV2TensorDebugMode(value int64) DebugIdentityV2Attr { // DebugIdentityV2DebugUrls sets the optional debug_urls attribute to value. // // value: List of URLs to debug targets, e.g., file:///foo/tfdbg_dump. -// If not specified, defaults to {} +// If not specified, defaults to <> func DebugIdentityV2DebugUrls(value []string) DebugIdentityV2Attr { return func(m optionalAttr) { m["debug_urls"] = value @@ -9580,7 +9580,7 @@ func DebugNanCountTensorName(value string) DebugNanCountAttr { // // value: List of URLs to debug targets, e.g., // file:///foo/tfdbg_dump, grpc:://localhost:11011. -// If not specified, defaults to {} +// If not specified, defaults to <> func DebugNanCountDebugUrls(value []string) DebugNanCountAttr { return func(m optionalAttr) { m["debug_urls"] = value @@ -9654,7 +9654,7 @@ func DebugIdentityTensorName(value string) DebugIdentityAttr { // // value: List of URLs to debug targets, e.g., // file:///foo/tfdbg_dump, grpc:://localhost:11011 -// If not specified, defaults to {} +// If not specified, defaults to <> func DebugIdentityDebugUrls(value []string) DebugIdentityAttr { return func(m optionalAttr) { m["debug_urls"] = value @@ -10521,7 +10521,7 @@ func ParseExampleDatasetV2Deterministic(value string) ParseExampleDatasetV2Attr } // ParseExampleDatasetV2RaggedKeys sets the optional ragged_keys attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetV2RaggedKeys(value []string) ParseExampleDatasetV2Attr { @@ -10531,7 +10531,7 @@ func ParseExampleDatasetV2RaggedKeys(value []string) ParseExampleDatasetV2Attr { } // ParseExampleDatasetV2RaggedValueTypes sets the optional ragged_value_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetV2RaggedValueTypes(value []tf.DataType) ParseExampleDatasetV2Attr { @@ -10541,7 +10541,7 @@ func ParseExampleDatasetV2RaggedValueTypes(value []tf.DataType) ParseExampleData } // ParseExampleDatasetV2RaggedSplitTypes sets the optional ragged_split_types attribute to value. 
-// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetV2RaggedSplitTypes(value []tf.DataType) ParseExampleDatasetV2Attr { @@ -12053,7 +12053,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12064,7 +12064,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -13310,7 +13310,7 @@ func ParseExampleDatasetSloppy(value bool) ParseExampleDatasetAttr { } // ParseExampleDatasetRaggedKeys sets the optional ragged_keys attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetRaggedKeys(value []string) ParseExampleDatasetAttr { @@ -13320,7 +13320,7 @@ func ParseExampleDatasetRaggedKeys(value []string) ParseExampleDatasetAttr { } // ParseExampleDatasetRaggedValueTypes sets the optional ragged_value_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetRaggedValueTypes(value []tf.DataType) ParseExampleDatasetAttr { @@ -13330,7 +13330,7 @@ func ParseExampleDatasetRaggedValueTypes(value []tf.DataType) ParseExampleDatase } // ParseExampleDatasetRaggedSplitTypes sets the optional ragged_split_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetRaggedSplitTypes(value []tf.DataType) ParseExampleDatasetAttr { @@ -13895,7 +13895,7 @@ func DebugNumericSummaryTensorName(value string) DebugNumericSummaryAttr { // // value: List of URLs to debug targets, e.g., // file:///foo/tfdbg_dump, grpc:://localhost:11011. -// If not specified, defaults to {} +// If not specified, defaults to <> func DebugNumericSummaryDebugUrls(value []string) DebugNumericSummaryAttr { return func(m optionalAttr) { m["debug_urls"] = value @@ -15139,7 +15139,7 @@ func TensorSummaryDescription(value string) TensorSummaryAttr { // TensorSummaryLabels sets the optional labels attribute to value. // // value: An unused list of strings. -// If not specified, defaults to {} +// If not specified, defaults to <> func TensorSummaryLabels(value []string) TensorSummaryAttr { return func(m optionalAttr) { m["labels"] = value @@ -15396,7 +15396,7 @@ func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableO } // MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value. 
-// If not specified, defaults to {} +// If not specified, defaults to <> func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr { return func(m optionalAttr) { m["value_shape"] = value @@ -16112,7 +16112,7 @@ type ParseSingleSequenceExampleAttr func(optionalAttr) // each context Feature given in context_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr { @@ -16122,7 +16122,7 @@ func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSing } // ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr { @@ -16138,7 +16138,7 @@ func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseS // The number of elements in the Feature corresponding to context_dense_key[j] // must always equal context_dense_shapes[j].NumEntries(). // The shape of context_dense_values[j] will match context_dense_shapes[j]. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr { @@ -16153,7 +16153,7 @@ func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleS // of data in each FeatureList given in feature_list_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr { @@ -16169,7 +16169,7 @@ func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) Parse // The shape of each Feature in the FeatureList corresponding to // feature_list_dense_key[j] must always equal // feature_list_dense_shapes[j].NumEntries(). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr { @@ -18969,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to > int_val:255 int_val:0 int_val:0 int_val:255 > func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21599,7 +21599,7 @@ func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr { // dimension, the amount of padding inserted before and after the dimension is // `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If // `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. -// If not specified, defaults to {} +// If not specified, defaults to <> func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -21627,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22307,7 +22307,7 @@ func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr { // dimension, the amount of padding inserted before and after the dimension is // `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If // `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. -// If not specified, defaults to {} +// If not specified, defaults to <> func Conv2DExplicitPaddings(value []int64) Conv2DAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -22335,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22539,7 +22539,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64 } // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizePaddingList sets the optional padding_list attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizePaddingList(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["padding_list"] = value @@ -22600,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22608,7 +22608,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDe } // QuantizedDepthwiseConv2DWithBiasAndReluPaddingList sets the optional padding_list attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func QuantizedDepthwiseConv2DWithBiasAndReluPaddingList(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["padding_list"] = value @@ -22715,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -23700,7 +23700,7 @@ func QuantizedMatMulWithBias(scope *Scope, a tf.Output, b tf.Output, bias tf.Out type TensorArrayGatherV2Attr func(optionalAttr) // TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr { return func(m optionalAttr) { m["element_shape"] = value @@ -23895,7 +23895,7 @@ func CopyTensorName(value string) CopyAttr { // ;;, wherein gated_grpc is boolean represented // as 0/1. E.g., "DebugIdentity;grpc://foo:3333;1", // "DebugIdentity;file:///tmp/tfdbg_1;0". -// If not specified, defaults to {} +// If not specified, defaults to <> func CopyDebugOpsSpec(value []string) CopyAttr { return func(m optionalAttr) { m["debug_ops_spec"] = value @@ -24127,7 +24127,7 @@ type FIFOQueueV2Attr func(optionalAttr) // be either 0 or the same as the length of component_types. If the length of // this attr is 0, the shapes of queue elements are not constrained, and // only one element may be dequeued at a time. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr { @@ -24471,7 +24471,7 @@ func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTable // MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value. // // value: The shape of each value. -// If not specified, defaults to {} +// If not specified, defaults to <> func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr { return func(m optionalAttr) { m["value_shape"] = value @@ -25648,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25934,7 +25934,7 @@ func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o * type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr) // DepthwiseConv2dNativeBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func DepthwiseConv2dNativeBackpropInputExplicitPaddings(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -25962,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27242,7 +27242,7 @@ func ParseSequenceExampleV2NcontextSparse(value int64) ParseSequenceExampleV2Att // each context Feature given in context_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2ContextSparseTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27254,7 +27254,7 @@ func ParseSequenceExampleV2ContextSparseTypes(value []tf.DataType) ParseSequence // ParseSequenceExampleV2ContextRaggedValueTypes sets the optional context_ragged_value_types attribute to value. // // value: RaggedTensor.value dtypes for the ragged context features. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2ContextRaggedValueTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27266,7 +27266,7 @@ func ParseSequenceExampleV2ContextRaggedValueTypes(value []tf.DataType) ParseSeq // ParseSequenceExampleV2ContextRaggedSplitTypes sets the optional context_ragged_split_types attribute to value. // // value: RaggedTensor.row_split dtypes for the ragged context features. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2ContextRaggedSplitTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27282,7 +27282,7 @@ func ParseSequenceExampleV2ContextRaggedSplitTypes(value []tf.DataType) ParseSeq // The number of elements in the Feature corresponding to context_dense_key[j] // must always equal context_dense_shapes[j].NumEntries(). // The shape of context_dense_values[j] will match context_dense_shapes[j]. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2ContextDenseShapes(value []tf.Shape) ParseSequenceExampleV2Attr { @@ -27312,7 +27312,7 @@ func ParseSequenceExampleV2NfeatureListDense(value int64) ParseSequenceExampleV2 } // ParseSequenceExampleV2FeatureListDenseTypes sets the optional feature_list_dense_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27327,7 +27327,7 @@ func ParseSequenceExampleV2FeatureListDenseTypes(value []tf.DataType) ParseSeque // of data in each FeatureList given in feature_list_sparse_keys. 
// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27339,7 +27339,7 @@ func ParseSequenceExampleV2FeatureListSparseTypes(value []tf.DataType) ParseSequ // ParseSequenceExampleV2FeatureListRaggedValueTypes sets the optional feature_list_ragged_value_types attribute to value. // // value: RaggedTensor.value dtypes for the ragged FeatureList features. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListRaggedValueTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27351,7 +27351,7 @@ func ParseSequenceExampleV2FeatureListRaggedValueTypes(value []tf.DataType) Pars // ParseSequenceExampleV2FeatureListRaggedSplitTypes sets the optional feature_list_ragged_split_types attribute to value. // // value: RaggedTensor.row_split dtypes for the ragged FeatureList features. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListRaggedSplitTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27367,7 +27367,7 @@ func ParseSequenceExampleV2FeatureListRaggedSplitTypes(value []tf.DataType) Pars // The shape of each Feature in the FeatureList corresponding to // feature_list_dense_key[j] must always equal // feature_list_dense_shapes[j].NumEntries(). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleV2Attr { @@ -28548,7 +28548,7 @@ func BatchMaxEnqueuedBatches(value int64) BatchAttr { } // BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func BatchAllowedBatchSizes(value []int64) BatchAttr { return func(m optionalAttr) { m["allowed_batch_sizes"] = value @@ -31175,7 +31175,7 @@ func VarHandleOpSharedName(value string) VarHandleOpAttr { // // value: The allowed devices containing the resource variable. Set when the output // ResourceHandle represents a per-replica/partitioned resource variable. -// If not specified, defaults to {} +// If not specified, defaults to <> func VarHandleOpAllowedDevices(value []string) VarHandleOpAttr { return func(m optionalAttr) { m["allowed_devices"] = value @@ -32522,7 +32522,7 @@ func CopyHostTensorName(value string) CopyHostAttr { // ;;, wherein gated_grpc is boolean represented // as 0/1. E.g., "DebugIdentity;grpc://foo:3333;1", // "DebugIdentity;file:///tmp/tfdbg_1;0". -// If not specified, defaults to {} +// If not specified, defaults to <> func CopyHostDebugOpsSpec(value []string) CopyHostAttr { return func(m optionalAttr) { m["debug_ops_spec"] = value @@ -32851,7 +32851,7 @@ type IteratorFromStringHandleAttr func(optionalAttr) // // value: If specified, defines the type of each tuple component in an // element produced by the resulting iterator. 
-// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr { @@ -32864,7 +32864,7 @@ func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromString // // value: If specified, defines the shape of each tuple component in an // element produced by the resulting iterator. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr { @@ -34520,7 +34520,7 @@ type TensorArrayV3Attr func(optionalAttr) // value: The expected shape of an element, if known. Used to // validate the shapes of TensorArray elements. If this shape is not // fully specified, gathering zero-size TensorArrays is an error. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr { return func(m optionalAttr) { m["element_shape"] = value @@ -36610,7 +36610,7 @@ func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr // each context Feature given in context_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr { @@ -36620,7 +36620,7 @@ func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceEx } // ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr { @@ -36636,7 +36636,7 @@ func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenc // The number of elements in the Feature corresponding to context_dense_key[j] // must always equal context_dense_shapes[j].NumEntries(). // The shape of context_dense_values[j] will match context_dense_shapes[j]. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr { @@ -36651,7 +36651,7 @@ func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExamp // of data in each FeatureList given in feature_list_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr { @@ -36667,7 +36667,7 @@ func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequen // The shape of each Feature in the FeatureList corresponding to // feature_list_dense_key[j] must always equal // feature_list_dense_shapes[j].NumEntries(). 
-// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr { @@ -39265,7 +39265,7 @@ type PrelinearizeTupleAttr func(optionalAttr) // tuple shapes in the order the shapes appear in the "shapes" input. The layout // elements for a sub-shape can be set to -1 in which case the corresponding layout // will be computed by the infeed operation. -// If not specified, defaults to {} +// If not specified, defaults to <> func PrelinearizeTupleLayouts(value []int64) PrelinearizeTupleAttr { return func(m optionalAttr) { m["layouts"] = value @@ -40936,7 +40936,7 @@ func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_ke type DatasetToGraphAttr func(optionalAttr) // DatasetToGraphStatefulWhitelist sets the optional stateful_whitelist attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func DatasetToGraphStatefulWhitelist(value []string) DatasetToGraphAttr { @@ -41345,7 +41345,7 @@ func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, l type TensorArrayConcatV2Attr func(optionalAttr) // TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr { return func(m optionalAttr) { m["element_shape_except0"] = value @@ -41660,7 +41660,7 @@ func TPUReplicateMetadataUseTpu(value bool) TPUReplicateMetadataAttr { // TPUReplicateMetadataDeviceAssignment sets the optional device_assignment attribute to value. // // value: The assignment of devices for the computation. -// If not specified, defaults to {} +// If not specified, defaults to <> func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAttr { return func(m optionalAttr) { m["device_assignment"] = value @@ -41670,7 +41670,7 @@ func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAtt // TPUReplicateMetadataComputationShape sets the optional computation_shape attribute to value. // // value: DEPRECATED. Use num_cores_per_replica instead. -// If not specified, defaults to {} +// If not specified, defaults to <> func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAttr { return func(m optionalAttr) { m["computation_shape"] = value @@ -41678,7 +41678,7 @@ func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAtt } // TPUReplicateMetadataHostComputeCore sets the optional host_compute_core attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAttr { return func(m optionalAttr) { m["host_compute_core"] = value @@ -41686,7 +41686,7 @@ func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAtt } // TPUReplicateMetadataPaddingMap sets the optional padding_map attribute to value. 
-// If not specified, defaults to {} +// If not specified, defaults to <> func TPUReplicateMetadataPaddingMap(value []string) TPUReplicateMetadataAttr { return func(m optionalAttr) { m["padding_map"] = value @@ -41737,7 +41737,7 @@ func TPUReplicateMetadata(scope *Scope, num_replicas int64, optional ...TPURepli type TensorListConcatAttr func(optionalAttr) // TensorListConcatElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr { return func(m optionalAttr) { m["element_shape"] = value @@ -42546,7 +42546,7 @@ func Cosh(scope *Scope, x tf.Output) (y tf.Output) { type CollectiveReduceAttr func(optionalAttr) // CollectiveReduceWaitFor sets the optional wait_for attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func CollectiveReduceWaitFor(value []int64) CollectiveReduceAttr { return func(m optionalAttr) { m["wait_for"] = value @@ -43077,7 +43077,7 @@ func DecodeCSVNaValue(value string) DecodeCSVAttr { } // DecodeCSVSelectCols sets the optional select_cols attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func DecodeCSVSelectCols(value []int64) DecodeCSVAttr { return func(m optionalAttr) { m["select_cols"] = value @@ -43622,7 +43622,7 @@ func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddin // the sum of the weights be 0 for 'mean' or the sum of the squared weights be // 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for // all tables. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr { return func(m optionalAttr) { m["combiners"] = value @@ -44714,7 +44714,7 @@ func EnqueueTPUEmbeddingRaggedTensorBatchDeviceOrdinal(value int64) EnqueueTPUEm // the sum of the weights be 0 for 'mean' or the sum of the squared weights be // 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for // all tables. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingRaggedTensorBatchCombiners(value []string) EnqueueTPUEmbeddingRaggedTensorBatchAttr { return func(m optionalAttr) { m["combiners"] = value @@ -44722,7 +44722,7 @@ func EnqueueTPUEmbeddingRaggedTensorBatchCombiners(value []string) EnqueueTPUEmb } // EnqueueTPUEmbeddingRaggedTensorBatchMaxSequenceLengths sets the optional max_sequence_lengths attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingRaggedTensorBatchMaxSequenceLengths(value []int64) EnqueueTPUEmbeddingRaggedTensorBatchAttr { return func(m optionalAttr) { m["max_sequence_lengths"] = value @@ -45506,7 +45506,7 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr) // DepthwiseConv2dNativeBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func DepthwiseConv2dNativeBackpropFilterExplicitPaddings(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -45534,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -46632,7 +46632,7 @@ type InfeedEnqueueTupleAttr func(optionalAttr) // all the tuple shapes, in the order the shapes appear in the "shapes" input. // The layout elements for a sub-shape can be set to -1, in which case the // corresponding layout will be computed by the infeed operation. -// If not specified, defaults to {} +// If not specified, defaults to <> func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr { return func(m optionalAttr) { m["layouts"] = value @@ -47474,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47517,7 +47517,7 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba type DepthwiseConv2dNativeAttr func(optionalAttr) // DepthwiseConv2dNativeExplicitPaddings sets the optional explicit_paddings attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func DepthwiseConv2dNativeExplicitPaddings(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -47545,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47634,7 +47634,7 @@ type PaddingFIFOQueueV2Attr func(optionalAttr) // zeros up to the maximum shape of all elements in the given batch. // If the length of this attr is 0, different queue elements may have // different ranks and shapes, but only one element may be dequeued at a time. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr { @@ -47886,7 +47886,7 @@ type InfeedEnqueueAttr func(optionalAttr) // InfeedEnqueueShape sets the optional shape attribute to value. // // value: The shape of the tensor. -// If not specified, defaults to {} +// If not specified, defaults to <> func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr { return func(m optionalAttr) { m["shape"] = value @@ -47898,7 +47898,7 @@ func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr { // value: A vector holding the requested layout in minor-to-major sequence. // If a layout attribute is passed, but its values are all -1, the layout will // be computed by the infeed operation. 
-// If not specified, defaults to {} +// If not specified, defaults to <> func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr { return func(m optionalAttr) { m["layout"] = value @@ -48506,7 +48506,7 @@ func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr { // dimension, the amount of padding inserted before and after the dimension is // `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If // `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. -// If not specified, defaults to {} +// If not specified, defaults to <> func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -48534,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48653,7 +48653,7 @@ type PrelinearizeAttr func(optionalAttr) // PrelinearizeShape sets the optional shape attribute to value. // // value: The shape of the tensor. -// If not specified, defaults to {} +// If not specified, defaults to <> func PrelinearizeShape(value tf.Shape) PrelinearizeAttr { return func(m optionalAttr) { m["shape"] = value @@ -48665,7 +48665,7 @@ func PrelinearizeShape(value tf.Shape) PrelinearizeAttr { // value: A vector holding the requested layout in minor-to-major sequence. If a layout // attribute is passed but its values are all -1 the layout will be computed by // the infeed operation. -// If not specified, defaults to {} +// If not specified, defaults to <> func PrelinearizeLayout(value []int64) PrelinearizeAttr { return func(m optionalAttr) { m["layout"] = value @@ -49084,7 +49084,7 @@ func EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal(value int64) EnqueueTPUEm // the sum of the weights be 0 for 'mean' or the sum of the squared weights be // 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for // all tables. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmbeddingSparseTensorBatchAttr { return func(m optionalAttr) { m["combiners"] = value @@ -49092,7 +49092,7 @@ func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmb } // EnqueueTPUEmbeddingSparseTensorBatchMaxSequenceLengths sets the optional max_sequence_lengths attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingSparseTensorBatchMaxSequenceLengths(value []int64) EnqueueTPUEmbeddingSparseTensorBatchAttr { return func(m optionalAttr) { m["max_sequence_lengths"] = value From a9c0ce87a68b30949da76bb921ee5985039f6fb8 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 14 May 2020 20:09:23 -0700 Subject: [PATCH 0635/1533] Check the minimum metadata parser version in the MetadataExtractor Java library. 
PiperOrigin-RevId: 311657605 Change-Id: I39169392214b8a70d5882c5ec4af93021480ce23 --- .../support/metadata/MetadataExtractor.java | 73 ++++++++++++++++++- .../support/metadata/ModelMetadataInfo.java | 13 ++++ .../support/metadata/metadata_schema.fbs | 33 ++++++++- .../lite/tools/versioning/runtime_version.h | 4 +- 4 files changed, 118 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java index 054ea0e9730..3ded50e5d95 100644 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java +++ b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java @@ -54,6 +54,11 @@ import org.tensorflow.lite.support.metadata.schema.TensorMetadata; * MetadataExtractor} omits subgraph index as an input in its methods. */ public class MetadataExtractor { + // TODO(b/156539454): remove the hardcode versioning number and populate the version through + // genrule. + /** The version of the metadata parser that this {@link MetadataExtractor} library depends on. */ + public static final String METADATA_PARSER_VERSION = "1.0.0"; + /** The helper class to load metadata from TFLite model FlatBuffer. */ private final ModelInfo modelInfo; @@ -76,6 +81,15 @@ public class MetadataExtractor { ByteBuffer metadataBuffer = modelInfo.getMetadataBuffer(); if (metadataBuffer != null) { metadataInfo = new ModelMetadataInfo(metadataBuffer); + + // Prints warning message if the minimum parser version is not satisfied. + if (!isMinimumParserVersionSatisfied()) { + System.err.printf( + " Some fields in the metadata belong to a future schema. The minimum parser" + + " version required is %s, but the version of the current metadata parser is %s", + metadataInfo.getMininumParserVersion(), METADATA_PARSER_VERSION); + } + checkArgument( modelInfo.getInputTensorCount() == metadataInfo.getInputTensorCount(), String.format( @@ -98,7 +112,7 @@ public class MetadataExtractor { } /** Returns {@code true} if the model has metadata. Otherwise, returns {@code false}. */ - public Boolean hasMetadata() { + public boolean hasMetadata() { return metadataInfo != null; } @@ -216,7 +230,31 @@ public class MetadataExtractor { } /** - * Asserts if {@link metdadataInfo} is not initialized. Some models may not have metadata and this + * Returns {@code true} if the minimum parser version required by the given metadata flatbuffer + * precedes or equals to the version of the metadata parser that this MetadataExtractor library is + * relying on. All fields in the metadata can be parsed correctly with this metadata extractor + * library in this case. Otherwise, it returns {@code false}. + * + *

<p>For example, assume the underlying metadata parser version is {@code 1.14.1},
+   *
+   * <ul>
+   *   <li>it returns {@code true}, if the required minimum parser version is the same or older,
+   *       such as {@code 1.14.1} or {@code 1.14.0}. Null version precedes all numeric versions,
+   *       because some metadata flatbuffers are generated before the first versioned release;
+   *   <li>it returns {@code false}, if the required minimum parser version is newer, such as {@code
+   *       1.14.2}.
+   * </ul>
+ */ + public final boolean isMinimumParserVersionSatisfied() { + String minVersion = metadataInfo.getMininumParserVersion(); + if (minVersion == null) { + return true; + } + return compareVersions(minVersion, METADATA_PARSER_VERSION) <= 0; + } + + /** + * Asserts if {@link #metadataInfo} is not initialized. Some models may not have metadata and this * is allowed. However, invoking methods that reads the metadata is not allowed. * * @throws IllegalStateException if this model does not contain model metadata @@ -260,4 +298,35 @@ public class MetadataExtractor { return null; } } + + /** + * Compares two semantic version numbers. + * + *

<p>Examples of comparing two versions:<br>
+   * {@code 1.9} precedes {@code 1.14};<br>
+   * {@code 1.14} precedes {@code 1.14.1};<br>
+ * {@code 1.14} and {@code 1.14.0} are euqal; + * + * @return the value {@code 0} if the two versions are equal; a value less than {@code 0} if + * {@code version1} precedes {@code version2}; a value greater than {@code 0} if {@code + * version2} precedes {@code version1}. + */ + private static int compareVersions(String version1, String version2) { + // Using String.split instead of the recommanded Guava Splitter because we've been avoiding + // depending on other third party libraries in this project. + String[] levels1 = version1.split("\\.", 0); + String[] levels2 = version2.split("\\.", 0); + + int length = Math.max(levels1.length, levels2.length); + for (int i = 0; i < length; i++) { + Integer v1 = i < levels1.length ? Integer.parseInt(levels1[i]) : 0; + Integer v2 = i < levels2.length ? Integer.parseInt(levels2[i]) : 0; + int compare = v1.compareTo(v2); + if (compare != 0) { + return compare; + } + } + + return 0; + } } diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java index 57fa7113c2a..751ed500dc2 100644 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java +++ b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java @@ -38,6 +38,9 @@ final class ModelMetadataInfo { /** Metadata array of output tensors. */ private final List outputsMetadata; + /** The minimum parser version required to fully understand the metadata flatbuffer. */ + private final String /* @Nullable */ minVersion; + /** * Creates a {@link ModelMetadataInfo} with the metadata FlatBuffer, {@code buffer}. * @@ -56,6 +59,7 @@ final class ModelMetadataInfo { inputsMetadata = getInputsMetadata(modelMetadata); outputsMetadata = getOutputsMetadata(modelMetadata); + minVersion = modelMetadata.minParserVersion(); } /** Gets the count of input tensors with metadata in the metadata FlatBuffer. */ @@ -77,6 +81,15 @@ final class ModelMetadataInfo { return inputsMetadata.get(inputIndex); } + /** + * Gets the minimum parser version of the metadata. It can be {@code null} if the version is not + * populated. + */ + @Nullable + String getMininumParserVersion() { + return minVersion; + } + /** Gets the root handler for the model metadata. */ ModelMetadata getModelMetadata() { return modelMetadata; diff --git a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs index 7e8d148d504..b8e529ad1c5 100644 --- a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs +++ b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs @@ -29,11 +29,31 @@ namespace tflite; // generate the model interface. It is recommended to fill in at least those // enties to boost the codegen performance. -// This corresponds to the schema version. +// LINT.IfChange + +// The Metadata schema is versioned by the Semantic versioning number, which +// tracks the schema changes according to the Semantic versioning rules. +// +// ModelMetadata.min_parser_version indicates the minimum necessary metadata +// parser version to fully understand all fields in a given metadata flatbuffer. +// +// New fields and types will have associated comments with the schema version for +// which they were added. 
+// +// Schema Semantic version: 1.0.0 + +// This indicates the flatbuffer compatibility. The number will bump up when a +// break change is applied to the schema, such as removing fields or adding new +// fields to the middle of a table. file_identifier "M001"; // File extension of any written files. file_extension "tflitemeta"; +// LINT.ThenChange(//tensorflow/lite/experimental/\ +// /supportmetadata/java/src/java/org/tensorflow/lite/support/metadata/\ +// MetadataExtractor.java) + +// LINT.IfChange enum AssociatedFileType : byte { UNKNOWN = 0, // Files such as readme.txt @@ -498,6 +518,17 @@ table ModelMetadata { // A list of associated files of this model. associated_files:[AssociatedFile]; + + // The minimum metadata parser version that can fully understand the fields in + // the metadata flatbuffer. The version is effectively the largest version + // number among the versions of all the fields populated and the smallest + // compatible version indicated by the file identifier. + // + // This field is automaticaly populated by the MetadataPopulator when + // the metadata is populated into a TFLite model. + min_parser_version:string; } +// LINT.ThenChange(//tensorflow/lite/experimental/\ +// support/metadata/cc/metadata_version.cc) root_type ModelMetadata; diff --git a/tensorflow/lite/tools/versioning/runtime_version.h b/tensorflow/lite/tools/versioning/runtime_version.h index e4c25221310..ad88bd2ab89 100644 --- a/tensorflow/lite/tools/versioning/runtime_version.h +++ b/tensorflow/lite/tools/versioning/runtime_version.h @@ -24,8 +24,8 @@ namespace tflite { void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer); // Returns true if the first version string precedes the second. -// For example, '1.14' should precede '1.9', also '1.14.1' should precede -// '1.14'. If two version string is equal, then false will be returned. +// For example, '1.9' should precede '1.14', also '1.14' should precede +// '1.14.1'. If two version string is equal, then false will be returned. bool CompareRuntimeVersion(const std::string&, const std::string&); } // namespace tflite From 97f2fffe7ef89d6eb1b013698538b1726345a7e5 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Thu, 14 May 2020 20:45:52 -0700 Subject: [PATCH 0636/1533] This is an internal change not visible to the public. PiperOrigin-RevId: 311661259 Change-Id: I8443f6037e3cabca1dca72ce1748eea8fd71770c --- tensorflow/core/tpu/tpu_config_c_api.h | 33 +++++++++++++++----------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/tpu/tpu_config_c_api.h b/tensorflow/core/tpu/tpu_config_c_api.h index 334a6a19325..b7caf0648b1 100644 --- a/tensorflow/core/tpu/tpu_config_c_api.h +++ b/tensorflow/core/tpu/tpu_config_c_api.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_TPU_TPU_CONFIG_C_API_H_ #include +#include #include "tensorflow/c/tf_status.h" @@ -26,29 +27,33 @@ extern "C" { bool TPUHostInitialized(); -// TODO(frankchn): Modify API to take in raw values instead of Tensors. 
-void ConfigureDistributedTpuOp_DoWork(size_t input_size, - TpuSerializedProto** inputs, - TpuSerializedProto* output, +void ConfigureDistributedTpuOp_DoWork(const size_t num_cores_per_host_size, + const int32_t* num_cores_per_host, + size_t* host_config_output_size, + char** host_config_output, TF_Status* status); -void WaitForDistributedTpuOp_DoWork(size_t input_size, - TpuSerializedProto** inputs, - TpuSerializedProto* output, - TF_Status* status); +void WaitForDistributedTpuOp_DoWork( + const size_t num_hosts, const size_t num_cores_per_host, + const int32_t** host_ordinal_to_global_core_id_map, + size_t* tpu_topology_output_size, char** tpu_topology_output, + TF_Status* status); void ShutdownDistributedTpuOp_DoWork(TF_Status* status); void InitializeHostForDistributedTpuOp_DoWork( - size_t input_size, TpuSerializedProto** inputs, - bool enable_whole_mesh_compilations, TpuSerializedProto* output, - TF_Status* status); + const size_t tpu_host_config_size, const char* tpu_host_config, + const bool enable_whole_mesh_compilations, size_t* core_id_output_size, + int32_t** core_id_output, TF_Status* status); -void SetGlobalTPUArrayOp_DoWork(size_t input_size, TpuSerializedProto** inputs, - TF_Status* status); +void SetGlobalTPUArrayOp_DoWork(const size_t tpu_topology_size, + const char* tpu_topology, TF_Status* status); -void DisconnectDistributedTpuChipsOp_DoWork(TpuSerializedProto* output, +void DisconnectDistributedTpuChipsOp_DoWork(int32_t* number_of_chips_output, TF_Status* status); + +void TpuConfigurationApi_FreeCharArray(char* output); +void TpuConfigurationApi_FreeInt32Array(int32_t* output); } #endif // TENSORFLOW_CORE_TPU_TPU_CONFIG_C_API_H_ From 5cf4311435e9087e0e9c7f4e1d4b415de6761530 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Thu, 14 May 2020 20:59:56 -0700 Subject: [PATCH 0637/1533] Fix a memory leak. PiperOrigin-RevId: 311662668 Change-Id: I59f9c9cdb8baed7a9828bb818ce1d293d185e6b6 --- tensorflow/c/eager/c_api_remote_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 544dffb664c..d04e4ef4212 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -447,6 +447,9 @@ void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { bool initialized = false; memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); EXPECT_EQ(initialized, true); + TF_DeleteTensor(t); + TFE_DeleteTensorHandle(is_initialized[0]); + TFE_DeleteOp(op); delete status; } From 28899d991f8f7443a04343fe9f308a1ea28a0795 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 14 May 2020 21:28:44 -0700 Subject: [PATCH 0638/1533] Optimize int8 broadcast min. 
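The fast paths added below must reproduce a plain broadcasting elementwise minimum over int8 tensors that share the same scale and zero point; a small NumPy sketch of that reference behaviour (values chosen only for illustration):

    import numpy as np

    # Elementwise path: both operands have identical shapes.
    a = np.array([1, -5, 7], dtype=np.int8)
    b = np.array([3, -8, 7], dtype=np.int8)
    assert np.array_equal(np.minimum(a, b), [1, -8, 7])

    # Scalar-broadcast path: one operand contributes a single value per inner run.
    assert np.array_equal(np.minimum(np.int8(2), b), [2, -8, 2])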
PiperOrigin-RevId: 311665392 Change-Id: I566547f44975d3d88cb7a17e8c6418a4a186ccda --- .../internal/optimized/optimized_ops.h | 109 ++++++++++++++---- tensorflow/lite/kernels/maximum_minimum.cc | 38 +++++- 2 files changed, 124 insertions(+), 23 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index c72400f33a5..b18f0f4bb5a 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -7963,14 +7963,59 @@ inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params, } } -inline void BroadcastMaximumFivefold( - const ArithmeticParams& unswitched_params, - const RuntimeShape& unswitched_input1_shape, - const int8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const int8* unswitched_input2_data, const RuntimeShape& output_shape, - int8* output_data) { - ruy::profiler::ScopeLabel label("BroadcastMaximumFivefoldInt8/8bit"); +// Assume input1 & input2 have the same scale & zero point. +inline void MinimumElementwise(int size, const ArithmeticParams& params, + const int8* input1_data, const int8* input2_data, + int8* output_data) { + int i = 0; +#ifdef USE_NEON + for (; i <= size - 16; i += 16) { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + const int8x16_t min_data = + vminq_s8(input1_val_original, input2_val_original); + vst1q_s8(output_data + i, min_data); + } +#endif // USE_NEON + for (; i < size; ++i) { + const int8 input1_val = input1_data[i]; + const int8 input2_val = input2_data[i]; + output_data[i] = std::min(input1_val, input2_val); + } +} + +inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params, + int8 input1_data, const int8* input2_data, + int8* output_data) { + int i = 0; + +#ifdef USE_NEON + const int8x16_t input1_val_original = vdupq_n_s8(input1_data); + for (; i <= size - 16; i += 16) { + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + const int8x16_t min_data = + vminq_s8(input1_val_original, input2_val_original); + vst1q_s8(output_data + i, min_data); + } +#endif // USE_NEON + for (; i < size; ++i) { + const int8 input2_val = input2_data[i]; + output_data[i] = std::min(input1_data, input2_val); + } +} + +template +inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const int8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const int8* unswitched_input2_data, + const RuntimeShape& output_shape, + int8* output_data, + ElementwiseF elementwise_f, + ScalarBroadcastF scalar_broadcast_f, + const std::string& label_name) { + ruy::profiler::ScopeLabel label(label_name); ArithmeticParams switched_params = unswitched_params; switched_params.input1_offset = unswitched_params.input2_offset; @@ -8000,9 +8045,8 @@ inline void BroadcastMaximumFivefold( const int8* input2_data_reset = input2_data; // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared // between input shapes. y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for input 2. - // Put another way, - // input1.shape.FlatSize = y0 * y1 * y2 * y4, + // dimension there is 1, whereas optionally y1 might be broadcast for + // input 2. 
Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, // input2.shape.FlatSize = y0 * y2 * y3 * y4. int y0 = params.broadcast_shape[0]; int y1 = params.broadcast_shape[1]; @@ -8018,8 +8062,8 @@ inline void BroadcastMaximumFivefold( input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { for (int i3 = 0; i3 < y3; ++i3) { - MaximumElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); input2_data_ptr += y4; output_data_ptr += y4; } @@ -8031,23 +8075,23 @@ inline void BroadcastMaximumFivefold( input2_data_reset = input2_data_ptr; } } else { - // Special case of y4 == 1, in which the innermost loop is a single element - // and can be combined with the next (y3) as an inner broadcast. + // Special case of y4 == 1, in which the innermost loop is a single + // element and can be combined with the next (y3) as an inner broadcast. // // Note that this handles the case of pure scalar broadcast when // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar // broadcast with batch (as y2 > 1). // - // NOTE The process is the same as the above general case except simplified - // for y4 == 1 and the loop over y3 is contained within the + // NOTE The process is the same as the above general case except + // simplified for y4 == 1 and the loop over y3 is contained within the // AddScalarBroadcast function. for (int i0 = 0; i0 < y0; ++i0) { const int8* input2_data_ptr = nullptr; for (int i1 = 0; i1 < y1; ++i1) { input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { - MaximumScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, + output_data_ptr); input2_data_ptr += y3; output_data_ptr += y3; input1_data_ptr += 1; @@ -8058,7 +8102,6 @@ inline void BroadcastMaximumFivefold( } } -// TODO(b/156140316): Try to unify the broadcast dispatch logic for binary ops. 
template inline void BroadcastMaximumDispatch(const ArithmeticParams& params, const RuntimeShape& input1_shape, @@ -8073,8 +8116,30 @@ inline void BroadcastMaximumDispatch(const ArithmeticParams& params, output_data, op); } - BroadcastMaximumFivefold(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + MaximumElementwise, MaximumScalarBroadcast, + "BroadcastMaximumFivefoldInt8/8bit"); +} + +template +inline void BroadcastMinimumDispatch(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int8* input1_data, + const RuntimeShape& input2_shape, + const int8* input2_data, + const RuntimeShape& output_shape, + int8* output_data, Op op) { + if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { + return reference_ops::MaximumMinimumBroadcastSlow( + input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data, op); + } + + BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + MinimumElementwise, MinimumScalarBroadcast, + "BroadcastMinimumFivefoldInt8/8bit"); } } // namespace optimized_ops diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc index abe9647f69e..cad86acd8dd 100644 --- a/tensorflow/lite/kernels/maximum_minimum.cc +++ b/tensorflow/lite/kernels/maximum_minimum.cc @@ -125,6 +125,31 @@ void TFLiteOperation( MaximumOp::template op); } +// Minimum generic opt int8. +template <> +void TFLiteOperation( + TfLiteContext* context, TfLiteNode* node, const OpContext& op_context) { + tflite::ArithmeticParams op_params; + const bool need_broadcast = optimized_ops::ProcessBroadcastShapes( + GetTensorShape(op_context.input1), GetTensorShape(op_context.input2), + &op_params); + if (need_broadcast) { + optimized_ops::BroadcastMinimumDispatch( + op_params, GetTensorShape(op_context.input1), + GetTensorData(op_context.input1), + GetTensorShape(op_context.input2), + GetTensorData(op_context.input2), + GetTensorShape(op_context.output), + GetTensorData(op_context.output), MinimumOp::template op); + return; + } + reference_ops::MaximumMinimumBroadcastSlow( + GetTensorShape(op_context.input1), GetTensorData(op_context.input1), + GetTensorShape(op_context.input2), GetTensorData(op_context.input2), + GetTensorShape(op_context.output), GetTensorData(op_context.output), + MinimumOp::template op); +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpContext op_context(context, node); @@ -186,10 +211,21 @@ TfLiteRegistration* Register_MINIMUM_REF() { maximum_minimum::MinimumOp>}; return &r; } + +TfLiteRegistration* Register_MINIMUM_GENERIC_OPT() { + static TfLiteRegistration r = { + nullptr, nullptr, maximum_minimum::Prepare, + maximum_minimum::Eval}; + return &r; +} + TfLiteRegistration* Register_MAXIMUM() { return Register_MAXIMUM_GENERIC_OPT(); } -TfLiteRegistration* Register_MINIMUM() { return Register_MINIMUM_REF(); } +TfLiteRegistration* Register_MINIMUM() { + return Register_MINIMUM_GENERIC_OPT(); +} } // namespace builtin } // namespace ops From 24c75ce5016efb4ab107f27b96aac07549d8617b Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Thu, 14 May 2020 22:18:50 -0700 Subject: [PATCH 0639/1533] Addresses https://github.com/tensorflow/tensorflow/issues/38694 by ensuring TensorShape v1/v2 backwards compatibility. 
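The change swaps direct dims[...] indexing for tensor_shape.dimension_at_index, the compatibility helper that behaves the same under V1 and V2 TensorShape semantics and also tolerates shapes whose rank is not yet known; a minimal sketch of that last property (assuming only the internal tensor_shape module already used by the patch):

    from tensorflow.python.framework import tensor_shape

    unknown = tensor_shape.TensorShape(None)  # rank not known yet
    # unknown.dims is None here, so unknown.dims[0].value would raise;
    # dimension_at_index degrades to an unknown Dimension instead.
    dim = tensor_shape.dimension_at_index(unknown, 0)
    print(dim.value)  # None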
PiperOrigin-RevId: 311670326 Change-Id: I0e7045ff4eb19cb0096d6fa41a494c8f9a6b85c5 --- tensorflow/python/ops/image_ops_impl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index e6a5cdbf4e8..52b65efad67 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1231,8 +1231,10 @@ def _resize_images_common(images, resizer_fn, size, preserve_aspect_ratio, name, name='size') size_const_as_shape = tensor_util.constant_value_as_shape(size) - new_height_const = size_const_as_shape.dims[0].value - new_width_const = size_const_as_shape.dims[1].value + new_height_const = tensor_shape.dimension_at_index(size_const_as_shape, + 0).value + new_width_const = tensor_shape.dimension_at_index(size_const_as_shape, + 1).value # If we can determine that the height and width will be unmodified by this # transformation, we avoid performing the resize. From 2b2e4412053fa3df5861cf1d9a8ff82061e06a65 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 00:27:20 -0700 Subject: [PATCH 0640/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/dad2e92eaf53 PiperOrigin-RevId: 311683742 Change-Id: I6177dd65ae548b719c656201ed1a7f9829acd745 --- third_party/mlir/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 8b61ce98dab..1bddf2180bc 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -564,6 +564,7 @@ cc_library( ":StandardOps", ":Support", ":Transforms", + ":VectorOps", ], ) From cca62cc73b19a017ab7e0a1b34690893f68cc9f5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 01:02:05 -0700 Subject: [PATCH 0641/1533] Add a new DelegateUtils::InvokeWithCPUFallback API for handling delegate failures automatically in Invoke(). This is especially useful for NNAPI usage, as drivers may evict clients. 
The fallback is suitable when callers: - Use CPU buffers only (no AHardwareBuffer or GL buffers) - Do not store pointers to tensor data across Invoke() calls, as they are invalidated - Do not have tensor state across multiple Invoke() calls PiperOrigin-RevId: 311687567 Change-Id: Ib3b6fd3fb6c4e85c9512d939d8e87efea03ac4fe --- tensorflow/lite/BUILD | 19 +- tensorflow/lite/core/subgraph.cc | 7 + tensorflow/lite/core/subgraph.h | 6 + tensorflow/lite/delegates/BUILD | 23 ++ .../lite/delegates/interpreter_utils.cc | 67 ++++ tensorflow/lite/delegates/interpreter_utils.h | 52 +++ .../lite/delegates/interpreter_utils_test.cc | 92 +++++ tensorflow/lite/delegates/utils.h | 2 + tensorflow/lite/interpreter.cc | 4 + tensorflow/lite/interpreter.h | 13 + tensorflow/lite/interpreter_test.cc | 309 +--------------- tensorflow/lite/interpreter_test.h | 331 ++++++++++++++++++ 12 files changed, 618 insertions(+), 307 deletions(-) create mode 100644 tensorflow/lite/delegates/interpreter_utils.cc create mode 100644 tensorflow/lite/delegates/interpreter_utils.h create mode 100644 tensorflow/lite/delegates/interpreter_utils_test.cc create mode 100644 tensorflow/lite/interpreter_test.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 14babee2da7..4d8c07aa15b 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -340,11 +340,27 @@ cc_test( ], ) +cc_library( + name = "interpreter_test_fixtures", + testonly = True, + hdrs = ["interpreter_test.h"], + deps = [ + ":framework", + "//tensorflow/lite/core/api", + "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/kernels:kernel_util", + "//tensorflow/lite/kernels/internal:compatibility", + "@com_google_googletest//:gtest", + ], +) + # Test main interpreter cc_test( name = "interpreter_test", size = "small", - srcs = ["interpreter_test.cc"], + srcs = [ + "interpreter_test.cc", + ], features = ["-dynamic_link_test_srcs"], # see go/dynamic_link_test_srcs tags = [ "tflite_not_portable_ios", # TODO(b/117786830) @@ -352,6 +368,7 @@ cc_test( deps = [ ":external_cpu_backend_context", ":framework", + ":interpreter_test_fixtures", ":string_util", ":version", "//tensorflow/lite/core/api", diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 7f4e0e286ea..81710df128b 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -533,6 +533,11 @@ void Subgraph::SetCancellationFunction(void* data, check_cancelled_func_ = check_cancelled_func; } +bool Subgraph::IsCancelled() { + return (check_cancelled_func_ != nullptr) && + (*check_cancelled_func_)(cancellation_data_); +} + void Subgraph::ReserveNodes(int count) { nodes_and_registration_.reserve(count); } @@ -1316,6 +1321,8 @@ TfLiteStatus Subgraph::RemoveAllDelegates() { return kTfLiteOk; } +bool Subgraph::HasDelegates() { return !delegates_applied_.empty(); } + TfLiteStatus Subgraph::EnsureMemoryAllocations() { if (memory_planner_) { state_ = kStateUninvokable; diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 0b0c1e31e89..d6067daaa6a 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -553,6 +553,9 @@ class Subgraph { // afterwards. TfLiteStatus RemoveAllDelegates(); + // Returns true if the subgraph has delegates applied. + bool HasDelegates(); + // Cleanups up data reserved for the given node. Does not remove the {node, // registration} pair from nodes_and_registrations_. 
void CleanupNode(int node_index); @@ -578,6 +581,9 @@ class Subgraph { // Ensures the memory required is planned and allocated. TfLiteStatus EnsureMemoryAllocations(); + // Returns true if cancellation function returns true. + bool IsCancelled(); + // The state of the Interpreter. enum State { // The interpreter isn't ready to be invoked. diff --git a/tensorflow/lite/delegates/BUILD b/tensorflow/lite/delegates/BUILD index df671675ec9..8d4c921576d 100644 --- a/tensorflow/lite/delegates/BUILD +++ b/tensorflow/lite/delegates/BUILD @@ -43,3 +43,26 @@ cc_test( "@com_google_googletest//:gtest_main", ], ) + +cc_library( + name = "interpreter_utils", + srcs = ["interpreter_utils.cc"], + hdrs = ["interpreter_utils.h"], + copts = tflite_copts(), + deps = [ + "//tensorflow/lite:framework", + ], +) + +cc_test( + name = "interpreter_utils_test", + srcs = ["interpreter_utils_test.cc"], + linkopts = tflite_linkopts(), + linkstatic = 1, + deps = [ + ":interpreter_utils", + "//tensorflow/lite:framework", + "//tensorflow/lite:interpreter_test_fixtures", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/lite/delegates/interpreter_utils.cc b/tensorflow/lite/delegates/interpreter_utils.cc new file mode 100644 index 00000000000..85d79d887fb --- /dev/null +++ b/tensorflow/lite/delegates/interpreter_utils.cc @@ -0,0 +1,67 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/interpreter_utils.h" + +namespace tflite { +namespace delegates { +TfLiteStatus InterpreterUtils::InvokeWithCPUFallback(Interpreter* interpreter) { + TfLiteStatus status = interpreter->Invoke(); + if (status == kTfLiteOk || interpreter->IsCancelled() || + !interpreter->HasDelegates()) { + return status; + } + // Retry without delegation. + // TODO(b/138706191): retry only if error is due to delegation. + TF_LITE_REPORT_ERROR( + interpreter->error_reporter(), + "Invoke() failed in the presence of delegation. Retrying without."); + + // Copy input data to a buffer. + // Input data is safe since Subgraph::PrepareOpsAndTensors() passes + // preserve_inputs=true to ArenaPlanner. + std::vector buf; + size_t input_size = 0; + + for (auto i : interpreter->inputs()) { + TfLiteTensor* t = interpreter->tensor(i); + input_size += t->bytes; + } + buf.reserve(input_size); + auto bufp = buf.begin(); + for (auto i : interpreter->inputs()) { + // TF_LITE_ENSURE_STATUS(interpreter->EnsureTensorDataIsReadable(i)); + TfLiteTensor* t = interpreter->tensor(i); + std::copy(t->data.raw, t->data.raw + t->bytes, bufp); + bufp += t->bytes; + } + + TF_LITE_ENSURE_STATUS(interpreter->RemoveAllDelegates()); + + // Copy inputs from buffer. + bufp = buf.begin(); + for (auto i : interpreter->inputs()) { + TfLiteTensor* t = interpreter->tensor(i); + std::copy(bufp, bufp + t->bytes, t->data.raw); + bufp += t->bytes; + } + + // Invoke again. 
+ TF_LITE_ENSURE_STATUS(interpreter->Invoke()); + return kTfLiteDelegateError; +} + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/delegates/interpreter_utils.h b/tensorflow/lite/delegates/interpreter_utils.h new file mode 100644 index 00000000000..f736c2db1f4 --- /dev/null +++ b/tensorflow/lite/delegates/interpreter_utils.h @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ +#define TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ + +#include "tensorflow/lite/interpreter.h" + +// Utility functions and classes for using delegates. + +namespace tflite { +namespace delegates { +#if !TFLITE_EXPERIMENTAL_RUNTIME_EAGER +class InterpreterUtils { + public: + /// Invokes an interpreter with automatic fallback from delegation to CPU. + /// + /// If using the delegate fails, the delegate is automatically undone and an + /// attempt made to return the interpreter to an invokable state. + /// + /// Allowing the fallback is suitable only if both of the following hold: + /// - The caller is known not to cache pointers to tensor data across Invoke() + /// calls. + /// - The model is not stateful (no variables, no LSTMs) or the state isn't + /// needed between batches. + /// + /// Returns one of the following three status codes: + /// 1. kTfLiteOk: Success. Output is valid. + /// 2. kTfLiteDelegateError: Delegate error but fallback succeeded. Output is + /// valid. + /// NOTE: This undoes all delegates previously applied to the Interpreter. + /// 3. kTfLiteError: Unexpected/runtime failure. Output is invalid. + /// WARNING: This is an experimental API and subject to change. + static TfLiteStatus InvokeWithCPUFallback(Interpreter* interpreter); +}; +#endif // !TFLITE_EXPERIMENTAL_RUNTIME_EAGER +} // namespace delegates +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ diff --git a/tensorflow/lite/delegates/interpreter_utils_test.cc b/tensorflow/lite/delegates/interpreter_utils_test.cc new file mode 100644 index 00000000000..8dc856d796c --- /dev/null +++ b/tensorflow/lite/delegates/interpreter_utils_test.cc @@ -0,0 +1,92 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/interpreter_utils.h" + +#include +#include +#include "tensorflow/lite/interpreter_test.h" + +namespace tflite { +namespace { + +TEST_F(TestDelegate, DelegateNodeInvokeFailureFallback) { + delegate_ = std::unique_ptr(new SimpleDelegate( + {0, 1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, + 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + // Delegation modified execution plan. + ASSERT_EQ(interpreter_->execution_plan().size(), 1); + + std::vector input = {1.0f, 2.0f, 3.0f}; + std::vector expected_output = {2.0f, 4.0f, 6.0f}; + constexpr int kOutputTensorIndex = 3; + + memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); + memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); + EXPECT_EQ( + delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), + kTfLiteDelegateError); + // Delegation removed, returning to original execution plan. + ASSERT_EQ(interpreter_->execution_plan().size(), 3); + // Check outputs. + TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; + } +} + +TEST_F(TestDelegate, TestFallbackWithMultipleDelegates) { + // First delegate only supports node 0. + // This delegate should support dynamic tensors, otherwise the second won't be + // applied. + delegate_ = std::unique_ptr( + new SimpleDelegate({0}, kTfLiteDelegateFlagsAllowDynamicTensors)); + // Second delegate supports nodes 1 & 2, and makes the graph immutable. + delegate2_ = std::unique_ptr(new SimpleDelegate( + {1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, + 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); + // Pre-delegation execution plan should have three nodes. + ASSERT_EQ(interpreter_->execution_plan().size(), 3); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate2_->get_tf_lite_delegate()), + kTfLiteOk); + // Should be two delegates nodes. + ASSERT_EQ(interpreter_->execution_plan().size(), 2); + + std::vector input = {1.0f, 2.0f, 3.0f}; + std::vector expected_output = {2.0f, 4.0f, 6.0f}; + constexpr int kOutputTensorIndex = 2; + TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); + + memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); + memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); + EXPECT_EQ( + delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), + kTfLiteDelegateError); + // All delegates should be undone. + EXPECT_EQ(interpreter_->execution_plan().size(), 3); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; + } +} + +} // namespace +} // namespace tflite diff --git a/tensorflow/lite/delegates/utils.h b/tensorflow/lite/delegates/utils.h index d6d22c4efa2..3b0668af04b 100644 --- a/tensorflow/lite/delegates/utils.h +++ b/tensorflow/lite/delegates/utils.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_UTILS_H_ #define TENSORFLOW_LITE_DELEGATES_UTILS_H_ +// Utility functions and classes for implementing delegates. 
+ #include #include #include diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index c8ccf671d60..167254a2a62 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -310,6 +310,8 @@ void Interpreter::SetCancellationFunction(void* data, } } +bool Interpreter::IsCancelled() { return primary_subgraph().IsCancelled(); } + TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { TfLiteStatus status = kTfLiteOk; for (auto& subgraph : subgraphs_) { @@ -340,6 +342,8 @@ TfLiteStatus Interpreter::RemoveAllDelegates() { return kTfLiteOk; } +bool Interpreter::HasDelegates() { return primary_subgraph().HasDelegates(); } + TfLiteStatus Interpreter::SetBufferHandle(int tensor_index, TfLiteBufferHandle buffer_handle, TfLiteDelegate* delegate) { diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index b93fd76c13b..aa9c54d295f 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -42,6 +42,9 @@ namespace tflite { class InterpreterTest; class TestDelegate; +namespace delegates { +class InterpreterUtils; // Class for friend declarations. +} // namespace delegates namespace impl { @@ -529,6 +532,7 @@ class Interpreter { friend class InterpreterBuilder; friend class tflite::InterpreterTest; friend class tflite::TestDelegate; + friend class tflite::delegates::InterpreterUtils; /// Set the value of an external context. static void SetExternalContext(struct TfLiteContext* context, @@ -542,6 +546,15 @@ class Interpreter { // afterwards. TfLiteStatus RemoveAllDelegates(); + // Returns true if delegates have been applied. + bool HasDelegates(); + + // Returns true if cancellation function returns true. + bool IsCancelled(); + + // Get the error reporter associated with this interpreter. + ErrorReporter* error_reporter() { return error_reporter_; } + // A pure C data structure used to communicate with the pure C plugin // interface. To avoid copying tensor metadata, this is also the definitive // structure to store tensors. diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc index abd92ad563d..1d8f82ef16a 100644 --- a/tensorflow/lite/interpreter_test.cc +++ b/tensorflow/lite/interpreter_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/external_cpu_backend_context.h" +#include "tensorflow/lite/interpreter_test.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -35,25 +36,6 @@ limitations under the License. namespace tflite { -// InterpreterTest is a friend of Interpreter, so it can access context_. 
-class InterpreterTest : public ::testing::Test { - public: - template - static TfLiteStatus ModifyGraphWithDelegate( - Interpreter* interpreter, std::unique_ptr delegate) { - Interpreter::TfLiteDelegatePtr tflite_delegate( - delegate.release(), [](TfLiteDelegate* delegate) { - delete reinterpret_cast(delegate); - }); - return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate)); - } - - protected: - TfLiteContext* GetInterpreterContext() { return interpreter_.context_; } - - Interpreter interpreter_; -}; - namespace ops { namespace builtin { TfLiteRegistration* Register_PADV2(); @@ -1304,291 +1286,6 @@ TEST_F(TestExecutionPlan, NullExecutionPlan) { ASSERT_EQ(run_order_, std::vector()); } -// Build a kernel registration for an op that copies its one input -// to an output -TfLiteRegistration AddOpRegistration() { - TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; - - reg.custom_name = "my_add"; - reg.builtin_code = tflite::BuiltinOperator_CUSTOM; - - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - // Set output size to input size - const TfLiteTensor* input1 = GetInput(context, node, 0); - const TfLiteTensor* input2 = GetInput(context, node, 1); - TfLiteTensor* output = GetOutput(context, node, 0); - - TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); - for (int i = 0; i < input1->dims->size; ++i) { - TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]); - } - - TF_LITE_ENSURE_STATUS(context->ResizeTensor( - context, output, TfLiteIntArrayCopy(input1->dims))); - return kTfLiteOk; - }; - - reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { - // Copy input data to output data. - const TfLiteTensor* a0 = GetInput(context, node, 0); - TF_LITE_ENSURE(context, a0); - TF_LITE_ENSURE(context, a0->data.f); - const TfLiteTensor* a1 = GetInput(context, node, 1); - TF_LITE_ENSURE(context, a1); - TF_LITE_ENSURE(context, a1->data.f); - TfLiteTensor* out = GetOutput(context, node, 0); - TF_LITE_ENSURE(context, out); - TF_LITE_ENSURE(context, out->data.f); - int num = a0->dims->data[0]; - for (int i = 0; i < num; i++) { - out->data.f[i] = a0->data.f[i] + a1->data.f[i]; - } - return kTfLiteOk; - }; - return reg; -} - -} // namespace - -// TestDelegate is a friend of Interpreter to access RemoveAllDelegates(). -class TestDelegate : public ::testing::Test { - protected: - void SetUp() override { - interpreter_.reset(new Interpreter); - interpreter_->AddTensors(5); - interpreter_->SetInputs({0, 1}); - interpreter_->SetOutputs({3, 4}); - TfLiteQuantizationParams quant; - interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, - quant); - TfLiteRegistration reg = AddOpRegistration(); - interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, ®); - interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, ®); - interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, ®); - } - - void TearDown() override { - // Interpreter relies on delegate to free the resources properly. Thus - // the life cycle of delegate must be longer than interpreter. 
- interpreter_.reset(); - delegate_.reset(); - } - - TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle; - - TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; } - - TfLiteStatus RemoveAllDelegates() { - return interpreter_->RemoveAllDelegates(); - } - - protected: - class SimpleDelegate { - public: - // Create a simple implementation of a TfLiteDelegate. We use the C++ class - // SimpleDelegate and it can produce a handle TfLiteDelegate that is - // value-copyable and compatible with TfLite. - // fail_node_prepare: To simulate failure of Delegate node's Prepare(). - // min_ops_per_subset: If >0, partitioning preview is used to choose only - // those subsets with min_ops_per_subset number of nodes. - // fail_node_invoke: To simulate failure of Delegate node's Invoke(). - explicit SimpleDelegate( - const std::vector& nodes, - TfLiteDelegateFlags delegate_flags = kTfLiteDelegateFlagsNone, - bool fail_node_prepare = false, int min_ops_per_subset = 0, - bool fail_node_invoke = false) - : nodes_(nodes), - fail_delegate_node_prepare_(fail_node_prepare), - min_ops_per_subset_(min_ops_per_subset), - fail_delegate_node_invoke_(fail_node_invoke) { - delegate_.Prepare = [](TfLiteContext* context, - TfLiteDelegate* delegate) -> TfLiteStatus { - auto* simple = static_cast(delegate->data_); - TfLiteIntArray* nodes_to_separate = - TfLiteIntArrayCreate(simple->nodes_.size()); - // Mark nodes that we want in TfLiteIntArray* structure. - int index = 0; - for (auto node_index : simple->nodes_) { - nodes_to_separate->data[index++] = node_index; - // make sure node is added - TfLiteNode* node; - TfLiteRegistration* reg; - context->GetNodeAndRegistration(context, node_index, &node, ®); - TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); - TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); - } - // Check that all nodes are available - TfLiteIntArray* execution_plan; - TF_LITE_ENSURE_STATUS( - context->GetExecutionPlan(context, &execution_plan)); - for (int exec_index = 0; exec_index < execution_plan->size; - exec_index++) { - int node_index = execution_plan->data[exec_index]; - TfLiteNode* node; - TfLiteRegistration* reg; - context->GetNodeAndRegistration(context, node_index, &node, ®); - if (exec_index == node_index) { - // Check op details only if it wasn't delegated already. - TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); - TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); - } - } - - // Get preview of delegate partitioning from the context. - TfLiteDelegateParams* params_array; - int num_partitions; - TFLITE_CHECK_EQ( - context->PreviewDelegatePartitioning( - context, nodes_to_separate, ¶ms_array, &num_partitions), - kTfLiteOk); - - if (simple->min_ops_per_subset() > 0) { - // Build a new vector of ops from subsets with atleast the minimum - // size. - std::vector allowed_ops; - for (int idx = 0; idx < num_partitions; ++idx) { - const auto* nodes_in_subset = params_array[idx].nodes_to_replace; - if (nodes_in_subset->size < simple->min_ops_per_subset()) continue; - allowed_ops.insert(allowed_ops.end(), nodes_in_subset->data, - nodes_in_subset->data + nodes_in_subset->size); - } - - // Free existing nodes_to_separate & initialize a new array with - // allowed_ops. 
- TfLiteIntArrayFree(nodes_to_separate); - nodes_to_separate = TfLiteIntArrayCreate(allowed_ops.size()); - memcpy(nodes_to_separate->data, allowed_ops.data(), - sizeof(int) * nodes_to_separate->size); - } - - // Another call to PreviewDelegateParitioning should be okay, since - // partitioning memory is managed by context. - TFLITE_CHECK_EQ( - context->PreviewDelegatePartitioning( - context, nodes_to_separate, ¶ms_array, &num_partitions), - kTfLiteOk); - - context->ReplaceNodeSubsetsWithDelegateKernels( - context, simple->FakeFusedRegistration(), nodes_to_separate, - delegate); - TfLiteIntArrayFree(nodes_to_separate); - return kTfLiteOk; - }; - delegate_.CopyToBufferHandle = [](TfLiteContext* context, - TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* tensor) -> TfLiteStatus { - // TODO(ycling): Implement tests to test buffer copying logic. - return kTfLiteOk; - }; - delegate_.CopyFromBufferHandle = - [](TfLiteContext* context, TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* output) -> TfLiteStatus { - TFLITE_CHECK_GE(buffer_handle, -1); - TFLITE_CHECK_EQ(output->buffer_handle, buffer_handle); - const float floats[] = {6., 6., 6.}; - int num = output->dims->data[0]; - for (int i = 0; i < num; i++) { - output->data.f[i] = floats[i]; - } - return kTfLiteOk; - }; - - delegate_.FreeBufferHandle = - [](TfLiteContext* context, TfLiteDelegate* delegate, - TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; }; - // Store type-punned data SimpleDelegate structure. - delegate_.data_ = static_cast(this); - delegate_.flags = delegate_flags; - } - - TfLiteRegistration FakeFusedRegistration() { - TfLiteRegistration reg = {nullptr}; - reg.custom_name = "fake_fused_op"; - - reg.invoke = [](TfLiteContext* context, - TfLiteNode* node) -> TfLiteStatus { - // Copy input data to output data. 
- const TfLiteTensor* a0; - const TfLiteTensor* a1; - if (node->inputs->size == 2) { - a0 = GetInput(context, node, 0); - a1 = GetInput(context, node, 1); - } else { - a0 = GetInput(context, node, 0); - a1 = a0; - } - TfLiteTensor* out = GetOutput(context, node, 0); - int num = 1; - for (int i = 0; i < a0->dims->size; ++i) { - num *= a0->dims->data[i]; - } - for (int i = 0; i < num; i++) { - out->data.f[i] = a0->data.f[i] + a1->data.f[i]; - } - // Make the data stale so that CopyFromBufferHandle can be invoked - out->data_is_stale = true; - return kTfLiteOk; - }; - if (fail_delegate_node_invoke_) { - reg.invoke = [](TfLiteContext* context, - TfLiteNode* node) -> TfLiteStatus { - return kTfLiteError; - }; - } - - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - // Set output size to input size - const TfLiteTensor* input1; - const TfLiteTensor* input2; - if (node->inputs->size == 2) { - input1 = GetInput(context, node, 0); - input2 = GetInput(context, node, 1); - } else { - input1 = GetInput(context, node, 0); - input2 = input1; - } - TfLiteTensor* output = GetOutput(context, node, 0); - - TF_LITE_ENSURE_STATUS(context->ResizeTensor( - context, output, TfLiteIntArrayCopy(input1->dims))); - return kTfLiteOk; - }; - if (fail_delegate_node_prepare_) { - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - return kTfLiteError; - }; - } - - return reg; - } - - TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } - - int min_ops_per_subset() { return min_ops_per_subset_; } - - private: - std::vector nodes_; - TfLiteDelegate delegate_; - bool fail_delegate_node_prepare_ = false; - int min_ops_per_subset_ = 0; - bool fail_delegate_node_invoke_ = false; - }; - - std::unique_ptr interpreter_; - std::unique_ptr delegate_, delegate2_; -}; -namespace { - TEST_F(TestDelegate, BasicDelegate) { delegate_ = std::unique_ptr(new SimpleDelegate({0, 1, 2})); interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()); @@ -1967,7 +1664,7 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { // Verify Invoke() behavior. memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); - interpreter_->Invoke(); + EXPECT_EQ(interpreter_->Invoke(), kTfLiteOk); for (int i = 0; i < 3; ++i) { EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; } @@ -1981,7 +1678,7 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { memcpy(interpreter_->typed_tensor(0), input.data(), 4 * sizeof(float)); memcpy(interpreter_->typed_tensor(1), input.data(), 4 * sizeof(float)); - interpreter_->Invoke(); + EXPECT_EQ(interpreter_->Invoke(), kTfLiteOk); for (int i = 0; i < 4; ++i) { EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; } diff --git a/tensorflow/lite/interpreter_test.h b/tensorflow/lite/interpreter_test.h new file mode 100644 index 00000000000..d4f0c8a05c5 --- /dev/null +++ b/tensorflow/lite/interpreter_test.h @@ -0,0 +1,331 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_INTERPRETER_TEST_H_ +#define TENSORFLOW_LITE_INTERPRETER_TEST_H_ + +#include +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/register.h" + +namespace tflite { +// InterpreterTest is a friend of Interpreter, so it can access context_. +class InterpreterTest : public ::testing::Test { + public: + template + static TfLiteStatus ModifyGraphWithDelegate( + Interpreter* interpreter, std::unique_ptr delegate) { + Interpreter::TfLiteDelegatePtr tflite_delegate( + delegate.release(), [](TfLiteDelegate* delegate) { + delete reinterpret_cast(delegate); + }); + return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate)); + } + + protected: + TfLiteContext* GetInterpreterContext() { return interpreter_.context_; } + + Interpreter interpreter_; +}; + +// Build a kernel registration for an op that copies its one input +// to an output +TfLiteRegistration AddOpRegistration() { + TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; + + reg.custom_name = "my_add"; + reg.builtin_code = tflite::BuiltinOperator_CUSTOM; + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input1 = GetInput(context, node, 0); + const TfLiteTensor* input2 = GetInput(context, node, 1); + TfLiteTensor* output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); + for (int i = 0; i < input1->dims->size; ++i) { + TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]); + } + + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input1->dims))); + return kTfLiteOk; + }; + + reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { + // Copy input data to output data. + const TfLiteTensor* a0 = GetInput(context, node, 0); + TF_LITE_ENSURE(context, a0); + TF_LITE_ENSURE(context, a0->data.f); + const TfLiteTensor* a1 = GetInput(context, node, 1); + TF_LITE_ENSURE(context, a1); + TF_LITE_ENSURE(context, a1->data.f); + TfLiteTensor* out = GetOutput(context, node, 0); + TF_LITE_ENSURE(context, out); + TF_LITE_ENSURE(context, out->data.f); + int num = a0->dims->data[0]; + for (int i = 0; i < num; i++) { + out->data.f[i] = a0->data.f[i] + a1->data.f[i]; + } + return kTfLiteOk; + }; + return reg; +} + +// TestDelegate is a friend of Interpreter to access RemoveAllDelegates(). 
+class TestDelegate : public ::testing::Test { + protected: + void SetUp() override { + interpreter_.reset(new Interpreter); + interpreter_->AddTensors(5); + interpreter_->SetInputs({0, 1}); + interpreter_->SetOutputs({3, 4}); + TfLiteQuantizationParams quant; + interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, + quant); + TfLiteRegistration reg = AddOpRegistration(); + interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, ®); + interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, ®); + interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, ®); + } + + void TearDown() override { + // Interpreter relies on delegate to free the resources properly. Thus + // the life cycle of delegate must be longer than interpreter. + interpreter_.reset(); + delegate_.reset(); + } + + TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle; + + TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; } + + TfLiteStatus RemoveAllDelegates() { + return interpreter_->RemoveAllDelegates(); + } + + protected: + class SimpleDelegate { + public: + // Create a simple implementation of a TfLiteDelegate. We use the C++ class + // SimpleDelegate and it can produce a handle TfLiteDelegate that is + // value-copyable and compatible with TfLite. + // fail_node_prepare: To simulate failure of Delegate node's Prepare(). + // min_ops_per_subset: If >0, partitioning preview is used to choose only + // those subsets with min_ops_per_subset number of nodes. + // fail_node_invoke: To simulate failure of Delegate node's Invoke(). + explicit SimpleDelegate( + const std::vector& nodes, + TfLiteDelegateFlags delegate_flags = kTfLiteDelegateFlagsNone, + bool fail_node_prepare = false, int min_ops_per_subset = 0, + bool fail_node_invoke = false) + : nodes_(nodes), + fail_delegate_node_prepare_(fail_node_prepare), + min_ops_per_subset_(min_ops_per_subset), + fail_delegate_node_invoke_(fail_node_invoke) { + delegate_.Prepare = [](TfLiteContext* context, + TfLiteDelegate* delegate) -> TfLiteStatus { + auto* simple = static_cast(delegate->data_); + TfLiteIntArray* nodes_to_separate = + TfLiteIntArrayCreate(simple->nodes_.size()); + // Mark nodes that we want in TfLiteIntArray* structure. + int index = 0; + for (auto node_index : simple->nodes_) { + nodes_to_separate->data[index++] = node_index; + // make sure node is added + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); + TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); + } + // Check that all nodes are available + TfLiteIntArray* execution_plan; + TF_LITE_ENSURE_STATUS( + context->GetExecutionPlan(context, &execution_plan)); + for (int exec_index = 0; exec_index < execution_plan->size; + exec_index++) { + int node_index = execution_plan->data[exec_index]; + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + if (exec_index == node_index) { + // Check op details only if it wasn't delegated already. 
+ TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); + TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); + } + } + + // Get preview of delegate partitioning from the context. + TfLiteDelegateParams* params_array; + int num_partitions; + TFLITE_CHECK_EQ( + context->PreviewDelegatePartitioning( + context, nodes_to_separate, ¶ms_array, &num_partitions), + kTfLiteOk); + + if (simple->min_ops_per_subset() > 0) { + // Build a new vector of ops from subsets with atleast the minimum + // size. + std::vector allowed_ops; + for (int idx = 0; idx < num_partitions; ++idx) { + const auto* nodes_in_subset = params_array[idx].nodes_to_replace; + if (nodes_in_subset->size < simple->min_ops_per_subset()) continue; + allowed_ops.insert(allowed_ops.end(), nodes_in_subset->data, + nodes_in_subset->data + nodes_in_subset->size); + } + + // Free existing nodes_to_separate & initialize a new array with + // allowed_ops. + TfLiteIntArrayFree(nodes_to_separate); + nodes_to_separate = TfLiteIntArrayCreate(allowed_ops.size()); + memcpy(nodes_to_separate->data, allowed_ops.data(), + sizeof(int) * nodes_to_separate->size); + } + + // Another call to PreviewDelegateParitioning should be okay, since + // partitioning memory is managed by context. + TFLITE_CHECK_EQ( + context->PreviewDelegatePartitioning( + context, nodes_to_separate, ¶ms_array, &num_partitions), + kTfLiteOk); + + context->ReplaceNodeSubsetsWithDelegateKernels( + context, simple->FakeFusedRegistration(), nodes_to_separate, + delegate); + TfLiteIntArrayFree(nodes_to_separate); + return kTfLiteOk; + }; + delegate_.CopyToBufferHandle = [](TfLiteContext* context, + TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* tensor) -> TfLiteStatus { + // TODO(b/156586986): Implement tests to test buffer copying logic. + return kTfLiteOk; + }; + delegate_.CopyFromBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* output) -> TfLiteStatus { + TFLITE_CHECK_GE(buffer_handle, -1); + TFLITE_CHECK_EQ(output->buffer_handle, buffer_handle); + const float floats[] = {6., 6., 6.}; + int num = output->dims->data[0]; + for (int i = 0; i < num; i++) { + output->data.f[i] = floats[i]; + } + return kTfLiteOk; + }; + + delegate_.FreeBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; }; + // Store type-punned data SimpleDelegate structure. + delegate_.data_ = static_cast(this); + delegate_.flags = delegate_flags; + } + + TfLiteRegistration FakeFusedRegistration() { + TfLiteRegistration reg = {nullptr}; + reg.custom_name = "fake_fused_op"; + + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + // Copy input data to output data. 
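+        // (In practice this kernel does not copy: like the my_add op it
+        // replaces, it sums its inputs element-wise, out = a0 + a1, and when
+        // only one input is present the tensor is added to itself.)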
+ const TfLiteTensor* a0; + const TfLiteTensor* a1; + if (node->inputs->size == 2) { + a0 = GetInput(context, node, 0); + a1 = GetInput(context, node, 1); + } else { + a0 = GetInput(context, node, 0); + a1 = a0; + } + TfLiteTensor* out = GetOutput(context, node, 0); + int num = 1; + for (int i = 0; i < a0->dims->size; ++i) { + num *= a0->dims->data[i]; + } + for (int i = 0; i < num; i++) { + out->data.f[i] = a0->data.f[i] + a1->data.f[i]; + } + // Make the data stale so that CopyFromBufferHandle can be invoked + if (out->buffer_handle != kTfLiteNullBufferHandle) { + out->data_is_stale = true; + } + return kTfLiteOk; + }; + if (fail_delegate_node_invoke_) { + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + return kTfLiteError; + }; + } + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input1; + const TfLiteTensor* input2; + if (node->inputs->size == 2) { + input1 = GetInput(context, node, 0); + input2 = GetInput(context, node, 1); + } else { + input1 = GetInput(context, node, 0); + input2 = input1; + } + TfLiteTensor* output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input1->dims))); + return kTfLiteOk; + }; + if (fail_delegate_node_prepare_) { + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + return kTfLiteError; + }; + } + + return reg; + } + + TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } + + int min_ops_per_subset() { return min_ops_per_subset_; } + + private: + std::vector nodes_; + TfLiteDelegate delegate_; + bool fail_delegate_node_prepare_ = false; + int min_ops_per_subset_ = 0; + bool fail_delegate_node_invoke_ = false; + }; + + std::unique_ptr interpreter_; + std::unique_ptr delegate_, delegate2_; +}; +} // namespace tflite + +#endif // TENSORFLOW_LITE_INTERPRETER_TEST_H_ From f6e2a28158eadfffc1e385abb054502ca4cf96d4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 02:02:46 -0700 Subject: [PATCH 0642/1533] Update GraphDef version to 402. PiperOrigin-RevId: 311693254 Change-Id: Ieb9a5f81784013ad6bb4a6b1fd1f119b34c68604 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index a534c0cf827..8f0967c1eaa 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 401 // Updated: 2020/5/14 +#define TF_GRAPH_DEF_VERSION 402 // Updated: 2020/5/15 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From ab70af78dcac29dc886456b475e114848fd74665 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 02:02:49 -0700 Subject: [PATCH 0643/1533] compat: Update forward compatibility horizon to 2020-05-15 PiperOrigin-RevId: 311693261 Change-Id: Id490a7dbe95ff4e493b3490d71c92a9b41f2b484 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 2a21590bb9a..29ba7317747 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. 
It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 14) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 15) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 6ccf21ef6d284fc1fc262789523cbece1b22ddad Mon Sep 17 00:00:00 2001 From: Dmitry Zakharov Date: Fri, 15 May 2020 12:46:49 +0300 Subject: [PATCH 0644/1533] =?UTF-8?q?Cleanup=20of=20TODO=E2=80=99s=20in=20?= =?UTF-8?q?ARC=20specific=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tensorflow/lite/micro/arc_emsdp/debug_log.cc | 1 - .../person_detection_experimental/arc_emsdp/emsdp.lcf | 3 --- tensorflow/lite/micro/kernels/arc_mli/conv.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc | 4 +--- .../micro/kernels/arc_mli/depthwise_conv_slicing_test.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/pooling.cc | 1 - tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc | 5 ++--- 9 files changed, 3 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/arc_emsdp/debug_log.cc b/tensorflow/lite/micro/arc_emsdp/debug_log.cc index b3b25f88ac1..fa9909f7372 100644 --- a/tensorflow/lite/micro/arc_emsdp/debug_log.cc +++ b/tensorflow/lite/micro/arc_emsdp/debug_log.cc @@ -55,7 +55,6 @@ typedef volatile struct dw_uart_reg { // to organize blocking loop for printing symbols. No input and no IRQ handling. // See embarc_osp repository for full EMSDP uart driver. // (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp) -// TODO: Consider U-Boot API to do it in a less "hacky" way. void DbgUartSendStr(const char* s) { DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE); const char* src = s; diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf index 2d7954217d3..95732d2a8b9 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf +++ b/tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf @@ -1,7 +1,6 @@ # Difference with common EMSDP LCF file (to reduce data access time): # - move data from external PSRAM to on-chip memory # - move text from SRAM to ICCM -# - TODO: Move tensor arena to DCCM to reduce data flow between fast and extrnal memory # # CCMWRAP memory regions indicate unusable portions of the address space # due to CCM memory wrapping into upper addresses beyond its size @@ -46,8 +45,6 @@ SECTIONS { } > SRAM GROUP BLOCK(4): { -# TODO: Move tensor arena to DCCM when it will be possible -# .tensor_arena? : {} .Zdata? : {} .heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {} .stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {} diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv.cc b/tensorflow/lite/micro/kernels/arc_mli/conv.cc index 4a2676821d9..b80d220a1cc 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv.cc @@ -52,7 +52,6 @@ struct OpData { int output_shift; // Per channel output multiplier and shift. - // TODO(b/141139247): Allocate these dynamically when possible. 
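+  // Fixed-size per-channel buffers: TFLite Micro kernels avoid dynamic
+  // allocation, so these arrays are sized for kMaxChannels up front.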
int32_t per_channel_output_multiplier[kMaxChannels]; int32_t per_channel_output_shift[kMaxChannels]; diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc index 9eb9d6499dd..7703bec3602 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc @@ -256,7 +256,6 @@ void TestConvQuantizedPerChannel( CreateQuantizedTensor(output_data, output_dims, output_scale, output_zero_point, "output_tensor"); - // TODO(njeff): Affine Quantization Params should be set on tensor creation. float input_scales[] = {1, input_scale}; int input_zero_points[] = {1, input_zero_point}; TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales), diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc index 2aad76bc042..e46f4766fce 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv.cc @@ -54,7 +54,6 @@ struct OpData { int output_shift; // Per channel output multiplier and shift. - // TODO(b/141139247): Allocate these dynamically when possible. int32_t per_channel_output_multiplier[kMaxChannels]; int32_t per_channel_output_shift[kMaxChannels]; @@ -74,9 +73,8 @@ bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, // MLI optimized version only supports int8 dataype, dilation factor of 1 and // per-axis quantization of weights (no broadcasting/per-tensor) - // TODO: ((in_ch == filters_num) || (in_ch == 1)) is a forbidding of + // (in_ch == filters_num) || (in_ch == 1)) is a forbidding of // channel multiplier logic for multichannel input. - // To be removed after it will be supported in MLI bool ret_val = (filter->type == kTfLiteInt8) && (input->type == kTfLiteInt8) && (bias->type == kTfLiteInt32) && diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc index e6a87ff82e6..03a9fcbb30b 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc @@ -152,7 +152,6 @@ void TestDepthwiseConvQuantizedPerChannel( CreateQuantizedTensor(output_data, output_dims, output_scale, input_zero_point, "output_tensor"); - // TODO(njeff): Affine Quantization Params should be set on tensor creation. 
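+  // Note: the leading 1 in these initializers is the length element expected
+  // by the FloatArrayFromFloats-style test helpers; a single
+  // (scale, zero_point) pair follows it.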
float input_scales[] = {1, input_scale}; int input_zero_points[] = {1, input_zero_point}; TfLiteAffineQuantization input_quant = {FloatArrayFromFloats(input_scales), diff --git a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc index 89eae356f51..c2e35dbc8dc 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/fully_connected.cc @@ -236,7 +236,6 @@ TfLiteStatus EvalQuantizedInt8(TfLiteContext* context, TfLiteNode* node, op_params.weights_offset = -filter->params.zero_point; op_params.output_offset = output->params.zero_point; op_params.output_multiplier = data->output_multiplier; - // TODO(b/138810107): Figure out whether output shift should be inverted op_params.output_shift = -data->output_shift; op_params.quantized_activation_min = data->output_activation_min; op_params.quantized_activation_max = data->output_activation_max; diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc index 79deacc23d9..0d79fc5dbcf 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling.cc @@ -46,7 +46,6 @@ enum MliPoolingType { AveragePooling = 0, MaxPooling = 1 }; bool IsMliApplicable(TfLiteContext* context, const TfLiteTensor* input, const TfLitePoolParams* params) { // MLI optimized version only supports int8 dataype and no fused Relu - // TODO: subject to add mli_saturate kernel return (input->type == kTfLiteInt8 && params->activation == kTfLiteActNone); } diff --git a/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc index 097908e30ab..1518513649f 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/scratch_buf_mgr.cc @@ -54,7 +54,6 @@ static void get_arc_two_buffer_sizes(int request_size_1, int request_size_2, int } else { // In case only one buffer is available, // use only the max buffer, and split it. - // TODO compute optimal split ratio based on request ratio. *grant_size_1 = maxavailable / 2; *grant_size_2 = maxavailable / 2; } @@ -228,7 +227,7 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( const int padding_bot, int *in_slice_height, int *out_slice_height) { - const int height_dimension = 1; // todo: compute from rank + const int height_dimension = 1; const int in_height = in->shape[height_dimension]; const int out_height = out->shape[height_dimension]; const int line_size_in = mli_hlp_count_elem_num(in, height_dimension + 1) * mli_hlp_tensor_element_size(in); @@ -250,7 +249,7 @@ TfLiteStatus arc_scratch_buffer_calc_slice_size_io( // in this case only two slices are needed, so both could benefit from padding. take the MIN to get the worst case. max_out_lines_for_input = (max_lines_in + std::min(padding_top, padding_bot) - kernel_height + 1) / stride_height; } else { - max_out_lines_for_input = (max_lines_in - kernel_height + 1) / stride_height; // TODO add padding exceptions and test by makin fit=false; + max_out_lines_for_input = (max_lines_in - kernel_height + 1) / stride_height; } // Ten compute how many ouput lines fit into the output tensor. max_lines_out = std::min(out_height, static_cast(out->capacity) / line_size_out); From 06c671cde8970541f3a8ef7604502adf9d4e5099 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 15 May 2020 02:46:38 -0700 Subject: [PATCH 0645/1533] [XLA] Verify statically shaped result type in `xla_hlo.reshape` operation The result type of the `xla_hlo.reshape` operation must have a static shape. This is now checked by the operation's verifier. PiperOrigin-RevId: 311697582 Change-Id: I90e8e513d205d62bb052a6cbecd7ebd88614db6d --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 26 ++++++----- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 +- .../compiler/mlir/xla/tests/legalize-tf.mlir | 43 ++++++++++--------- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 68eafb8b33e..b6036ee2130 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -1358,19 +1358,23 @@ static LogicalResult Verify(PadOp op) { //===----------------------------------------------------------------------===// static LogicalResult Verify(ReshapeOp op) { - auto operand_ty = op.operand().getType().cast(); + // If the operand type is dynamically shaped there is nothing to verify. + auto operand_ty = op.operand().getType().cast(); if (!operand_ty || !operand_ty.hasStaticShape()) return success(); - int64_t num_input_elements = operand_ty.getNumElements(); - auto out_ty = op.getType().cast(); - if (out_ty && out_ty.hasStaticShape()) { - int64_t num_output_elements = out_ty.getNumElements(); - if (num_input_elements != num_output_elements) - return op.emitOpError() - << "number of output elements (" << num_output_elements - << ") doesn't match expected number of elements (" - << num_input_elements << ")"; - } + // If the operand type is statically shaped (not required) the number of + // elements must match that of the result type. 
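+  // For example, reshaping tensor<2x3xf32> to tensor<6xf32> passes (both hold
+  // six elements), while tensor<2x3xf32> to tensor<4xf32> is rejected below.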
+ auto result_ty = op.getType().cast(); + assert(result_ty && result_ty.hasStaticShape() && + "result type must be statically shaped"); + int64_t num_result_elements = result_ty.getNumElements(); + int64_t num_operand_elements = operand_ty.getNumElements(); + if (num_result_elements != num_operand_elements) + return op.emitOpError() + << "number of output elements (" << num_result_elements + << ") doesn't match expected number of elements (" + << num_operand_elements << ")"; + return success(); } diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index f78ac7624d2..1ca4e0c5d82 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -1058,7 +1058,7 @@ def HLO_ReshapeOp: HLO_Op<"reshape", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ReshapeOp { let arguments = (ins HLO_Tensor:$operand); - let results = (outs HLO_Tensor); + let results = (outs HLO_StaticShapeTensor); let hasFolder = 1; let hasCustomHLOConverter = 1; diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 450910b2e4d..d5440a024ab 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -3555,30 +3555,31 @@ func @assert(%arg0: tensor, %arg1: tensor<*xf32>) { // tf.Unpack legalization //===----------------------------------------------------------------------===// -// CHECK-LABEL: @unpack -func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { - // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<4x1x6xf32>) -> tensor<4x?xf32> - // CHECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES2:.*]] = "xla_hlo.reshape"(%[[SLICE2]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[SLICE3:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES3:.*]] = "xla_hlo.reshape"(%[[SLICE3]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> +// TODO(b/156340000): Re-enable when fixed. 
+// // C-HECK-LABEL: @unpack +// func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { +// // C-HECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<4x1x6xf32>) -> tensor<4x?xf32> +// // C-HECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES2:.*]] = "xla_hlo.reshape"(%[[SLICE2]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> +// // C-HECK: %[[SLICE3:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES3:.*]] = "xla_hlo.reshape"(%[[SLICE3]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) - // return %[[RES1]], %[[RES2]], %[[RES3]] - return %0#0, %0#1, %0#2 : tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32> -} +// %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) +// // return %[[RES1]], %[[RES2]], %[[RES3]] +// return %0#0, %0#1, %0#2 : tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32> +// } -// CHECK-LABEL: @unpack_dynamic -func @unpack_dynamic(%input: tensor) -> (tensor, tensor) { - // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor - // CHECK: "xla_hlo.reshape"(%[[SLICE1]]) : (tensor) -> tensor - // CHECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 2]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor - // CHECK: "xla_hlo.reshape"(%[[SLICE2]]) : (tensor) -> tensor +// // C-HECK-LABEL: @unpack_dynamic +// func @unpack_dynamic(%input: tensor) -> (tensor, tensor) { +// // C-HECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor +// // C-HECK: "xla_hlo.reshape"(%[[SLICE1]]) : (tensor) -> tensor +// // C-HECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 2]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor +// // C-HECK: "xla_hlo.reshape"(%[[SLICE2]]) : (tensor) -> tensor - %0:2 = "tf.Unpack"(%input) {axis = -1} : (tensor) -> (tensor, tensor) - return %0#0, %0#1 : tensor, tensor -} +// %0:2 = "tf.Unpack"(%input) {axis = -1} : (tensor) -> (tensor, tensor) +// return %0#0, %0#1 : tensor, tensor +// } //===----------------------------------------------------------------------===// // tf.UnsortedSegment{Max|Min|Prod|Sum} legalization From bfe99ed9d61220cae4eb9f4a7e35cb113cd6fce9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 03:33:15 -0700 Subject: [PATCH 0646/1533] Remove default initialization in tflite::StatefulNnApiDelegate::Data struct. 
PiperOrigin-RevId: 311702143 Change-Id: I317473ef15a0ee8f31b1b99ee6e9f23f9f4f19cd --- tensorflow/lite/delegates/nnapi/nnapi_delegate.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index 1bd9fb5c49f..b94c6d66978 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -90,7 +90,7 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // of number of nodes and selecting them until the limit is reached. int max_number_delegated_partitions = 3; - // allow fp32 compuation to be run in fp16 + // allow fp32 compuation to be run in fp16. bool allow_fp16 = false; }; @@ -187,8 +187,8 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // Maximum number of NNAPI partition to delegate. Zero or negative means // no limit. Copied from StatefulNnApiDelegate::Options int max_number_delegated_partitions; - // allow fp32 computation to be run in fp32 - bool allow_fp16 = false; + // allow fp32 computation to be run in fp16. + bool allow_fp16; ~Data(); From df2c8d282320ad4c8a81f1ec631537ad4752cfeb Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Fri, 15 May 2020 03:46:01 -0700 Subject: [PATCH 0647/1533] Allow index typed values in `hlo_scalars_to_dimension_tensor`. The limitation stems from a time where tensors with index element type were not allowed in MLIR. With this change, we can remove many `index_cast` ops. PiperOrigin-RevId: 311703219 Change-Id: I56c7dba29e43b3ee13a1066c0974f72e696600ab --- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 +- .../compiler/mlir/xla/ir/hlo_ops_base.td | 4 ++- .../xla/tests/materialize-broadcasts.mlir | 28 +++++++--------- tensorflow/compiler/mlir/xla/tests/ops.mlir | 8 +++++ .../mlir/xla/tests/unfuse_batch_norm.mlir | 19 ++++------- .../xla/transforms/materialize_broadcasts.cc | 33 ++++++++----------- .../mlir/xla/transforms/unfuse_batch_norm.cc | 4 +-- 7 files changed, 45 insertions(+), 53 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 1ca4e0c5d82..0d7771b180e 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -785,7 +785,7 @@ def HLO_ScalarsToDimensionTensorOp : HLO_Op<"scalars_to_dimension_tensor", compute shape arguments to dynamic operations. }]; - let arguments = (ins Variadic:$scalars); + let arguments = (ins Variadic:$scalars); let results = (outs HLO_DimensionTensor); // Cannot be exported to legacy formats. diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index b5de675f13f..b5130eafd0e 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -62,9 +62,11 @@ def HLO_Tuple : NestedTupleOf<[HLO_Tensor, HLO_Token]>; def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; +def HLO_DimensionValue : AnyTypeOf<[Index, HLO_Pred, HLO_Int]>; + // Dynamic representation of a shape vector as a tensor. 
def HLO_DimensionTensor : ShapedContainerType< - [Index, HLO_Pred, HLO_Int], + [HLO_DimensionValue], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, "a 1D tensor of dimensions">; diff --git a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir index 2340650dda8..a7f4a5b4474 100644 --- a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir @@ -245,16 +245,14 @@ func @compareBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tenso // CHECK-LABEL: @dynamicCompareBroadcastRhs func @dynamicCompareBroadcastRhs(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 // CHECK-NEXT: %c1 = constant 1 : index // CHECK-NEXT: %[[DIM1_0:.*]] = dim %arg0, 1 : tensor // CHECK-NEXT: %[[DIM1_1:.*]] = dim %arg1, 0 : tensor // CHECK-NEXT: %[[CMPI:.*]] = cmpi "eq", %[[DIM1_0]], %c1 : index - // CHECK-NEXT: %[[SEL:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[SEL]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: %[[DIM1:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index + // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0]], %[[DIM1]]) : (index, index) -> tensor<2xindex> + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-NEXT: "xla_hlo.compare"(%[[BROADCAST0]], %[[BROADCAST1]]) {comparison_direction = "NE"} : (tensor, tensor) -> tensor %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor, tensor) -> tensor return %0 : tensor @@ -265,16 +263,14 @@ func @dynamicCompareBroadcastRhs(%arg0: tensor, %arg1: tensor) - // CHECK-LABEL: @dynamicBroadcastAdd func @dynamicBroadcastAdd(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 // CHECK-NEXT: %c1 = constant 1 : index // CHECK-NEXT: %[[DIM1_0:.*]] = dim %arg0, 1 : tensor // CHECK-NEXT: %[[DIM1_1:.*]] = dim %arg1, 0 : tensor // CHECK-NEXT: %[[CMPI:.*]] = cmpi "eq", %[[DIM1_0]], %c1 : index - // CHECK-NEXT: %[[SEL:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[SEL]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // 
CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: %[[DIM1:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index + // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0]], %[[DIM1]]) : (index, index) -> tensor<2xindex> + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor @@ -285,12 +281,10 @@ func @dynamicBroadcastAdd(%arg0: tensor, %arg1: tensor) -> tenso // CHECK-LABEL: @dynamicBroadcastAddScalar func @dynamicBroadcastAddScalar(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 // CHECK-NEXT: %[[DIM1:.*]] = dim %arg0, 1 : tensor - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[DIM1]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0]], %[[DIM1]]) : (index, index) -> tensor<2xindex> + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor return %0 : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir b/tensorflow/compiler/mlir/xla/tests/ops.mlir index 8cb63311657..f09ec62c8dc 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -461,6 +461,14 @@ func @scalars_to_dimension_tensor(%arg0: i32, %arg1: i32) -> tensor<2xi32> { // ----- +// CHECK-LABEL: @scalars_to_dimension_tensor_index +func @scalars_to_dimension_tensor_index(%arg0: index, %arg1: index) -> tensor<2xindex> { + %0 = "xla_hlo.scalars_to_dimension_tensor"(%arg0, %arg1) : (index, index) -> tensor<2xindex> + return %0 : tensor<2xindex> +} + +// ----- + // CHECK-LABEL: func @select func @select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> diff --git a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir 
b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir index 9778772e250..7a54de73db7 100644 --- a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir +++ b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir @@ -106,24 +106,19 @@ func @batchNormInference_dynamic_shape( -> tensor { // CHECK-DAG: %[[EPS:.+]] = xla_hlo.constant dense<1.000000e-03> : tensor // CHECK-DAG: %[[DIM:.+]] = dim %[[VARIANCE]], 0 : tensor - // CHECK-DAG: %[[INDEX_CAST:.+]] = index_cast %[[DIM]] : index to i32 - // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INDEX_CAST]]) : (i32) -> tensor<1xi32> - // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor + // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM]]) : (index) -> tensor<1xindex> + // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK-DAG: %[[VARIANCE_EPS:.+]] = xla_hlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor // CHECK-DAG: %[[STDDEV:.+]] = "xla_hlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor) -> tensor // CHECK-DAG: %[[INPUT_DIM_0:.+]] = dim %[[X]], 0 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_0:.+]] = index_cast %[[INPUT_DIM_0]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_1:.+]] = dim %[[X]], 1 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_1:.+]] = index_cast %[[INPUT_DIM_1]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_2:.+]] = dim %[[X]], 2 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_2:.+]] = index_cast %[[INPUT_DIM_2]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_3:.+]] = dim %[[X]], 3 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_3:.+]] = index_cast %[[INPUT_DIM_3]] : index to i32 - // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_INDEX_CAST_0]], %[[INPUT_INDEX_CAST_1]], %[[INPUT_INDEX_CAST_2]], %[[INPUT_INDEX_CAST_3]]) : (i32, i32, i32, i32) -> tensor<4xi32> - // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor + // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]]) : (index, index, index, index) -> tensor<4xindex> + // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[OFFSET_BCAST:.+]] = 
"xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[X_CENTER:.+]] = xla_hlo.subtract %[[X]], %[[MEAN_BCAST]] : tensor // CHECK-DAG: %[[X_SCALED:.+]] = xla_hlo.multiply %[[X_CENTER]], %[[SCALE_BCAST]] : tensor // CHECK-DAG: %[[X_NORMED:.+]] = xla_hlo.divide %[[X_SCALED]], %[[STDDEV_BCAST]] : tensor diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc index bf666400900..7b4262825f8 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc @@ -143,28 +143,23 @@ std::vector ComputeBroadcastedShape(SrcOp op, Value small, Value large, // either be the same in that dimension or it can be 1, in which case the // shape of the other operand is used. for (int i = 0; i < output_rank; ++i) { - Value index_value; if (indexes[i] == kExpandShape) { // The smaller shape gets expanded to the larger one in this case. - index_value = rewriter->create(loc, large, i); - } else { - // Compute the result shape depending on whether the rank of smaller is 1. - // This does not check that the broadcast operation actualy is correct. - // In particular, we do not check that both shapes are the same if the - // smaller ranked shape is not 1. - ConstantOp one = rewriter->create( - loc, rewriter->getIntegerAttr(rewriter->getIndexType(), 1)); - DimOp lrg_dim = rewriter->create(loc, large, i); - DimOp sml_dim = rewriter->create(loc, small, indexes[i]); - CmpIOp compare = - rewriter->create(loc, CmpIPredicate::eq, lrg_dim, one); - index_value = - rewriter->create(loc, compare, lrg_dim, sml_dim); + shape_values.push_back(rewriter->create(loc, large, i)); + continue; } - // Ideally, we would like to keep this on index but MLIR does not allow - // this. - shape_values.push_back(rewriter->create( - loc, index_value, rewriter->getIntegerType(32))); + // Compute the result shape depending on whether the rank of smaller is 1. + // This does not check that the broadcast operation actualy is correct. + // In particular, we do not check that both shapes are the same if the + // smaller ranked shape is not 1. 
+ ConstantOp one = rewriter->create( + loc, rewriter->getIntegerAttr(rewriter->getIndexType(), 1)); + DimOp lrg_dim = rewriter->create(loc, large, i); + DimOp sml_dim = rewriter->create(loc, small, indexes[i]); + CmpIOp compare = + rewriter->create(loc, CmpIPredicate::eq, lrg_dim, one); + shape_values.push_back( + rewriter->create(loc, compare, lrg_dim, sml_dim)); } return shape_values; diff --git a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc index 32d8b079c89..d53aaee3701 100644 --- a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc @@ -58,9 +58,7 @@ Value CalculateShapeValue(Location loc, Value operand, int64_t rank = result_type.getRank(); shape_values.reserve(rank); for (int64_t i = 0; i < rank; ++i) { - auto index_value = rewriter.create(loc, operand, i); - shape_values.push_back(rewriter.create( - loc, index_value, rewriter.getIntegerType(32))); + shape_values.push_back(rewriter.create(loc, operand, i)); } Type shape_element_type = shape_values.front().getType(); return rewriter.create( From 0c6f6f4776e132148cd7775492665c5456c28294 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 04:00:10 -0700 Subject: [PATCH 0648/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/854f5f332af4 PiperOrigin-RevId: 311704350 Change-Id: I2314bba4a4f72fdceda3a3439bac3e3fb96b811a --- third_party/mlir/BUILD | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 1bddf2180bc..93843d58f30 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -2458,6 +2458,7 @@ cc_library( ":LLVMTransforms", ":LinalgToLLVM", ":LinalgToSPIRV", + ":LinalgToStandard", ":NVVMDialect", ":Parser", ":Pass", @@ -2543,6 +2544,7 @@ cc_library( ":LinalgPassIncGen", ":LinalgToLLVM", ":LinalgToSPIRV", + ":LinalgToStandard", ":LinalgTransforms", ":LoopPassIncGen", ":LoopsToGPUPass", @@ -3121,6 +3123,31 @@ cc_library( ], ) +cc_library( + name = "LinalgToStandard", + srcs = glob([ + "lib/Conversion/LinalgToStandard/*.cpp", + "lib/Conversion/LinalgToStandard/*.h", + ]) + ["lib/Conversion/PassDetail.h"], + hdrs = glob([ + "include/mlir/Conversion/LinalgToStandard/*.h", + ]), + includes = ["include"], + deps = [ + ":Affine", + ":ConversionPassIncGen", + ":IR", + ":LinalgOps", + ":Pass", + ":SCFDialect", + ":StandardOps", + ":Support", + ":Transforms", + "@llvm-project//llvm:core", + "@llvm-project//llvm:support", + ], +) + cc_library( name = "LinalgToSPIRV", srcs = glob([ From 872e950b51edbf3430d547e2fe4ed15ba8b18f77 Mon Sep 17 00:00:00 2001 From: seo-inyoung <62606132+seo-inyoung@users.noreply.github.com> Date: Fri, 15 May 2020 20:05:11 +0900 Subject: [PATCH 0649/1533] Update SECURITY.md simple error correction --- SECURITY.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 6fc2c3aa9cc..f3a6c148b2e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -64,7 +64,7 @@ your model, and we recommend you run the TensorFlow process in a sandbox. It is possible to write models that are secure in a sense that they can safely process untrusted inputs assuming there are no bugs. 
There are two main reasons -to not rely on this: first, it is easy to write models which must not be exposed +to not rely on this: First, it is easy to write models which must not be exposed to untrusted inputs, and second, there are bugs in any software system of sufficient complexity. Letting users control inputs could allow them to trigger bugs either in TensorFlow or in dependent libraries. @@ -149,7 +149,7 @@ attack (or worse). Because TensorFlow behaves correctly, this is not a vulnerability in TensorFlow (although it would be a vulnerability of this hypothetical system). -As a general rule, it is incorrect behavior for Tensorflow to access memory it +As a general rule, it is incorrect behavior for TensorFlow to access memory it does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to such behaviors constitute a vulnerability. From 103bb013d4d4ba19da0445abd9b9c627af9df817 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 14:23:20 +0200 Subject: [PATCH 0650/1533] Verifiy differences with test annotation --- tensorflow/python/kernel_tests/map_fn_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 0bc3307e484..81dd817687a 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import numpy as np +from tensorflow.python.autograph.impl import api from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -186,7 +187,8 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) - @test_util.run_in_graph_and_eager_modes + #@test_util.run_in_graph_and_eager_modes + @test_util.run_deprecated_v1 def testMap_autograph_indirect(self): def test_function(x): cond = constant_op.constant(-1) @@ -195,6 +197,8 @@ class MapFnTest(test.TestCase): else: result = x return result + + @api.convert(recursive=False) def map_call(x): return map_fn.map_fn(test_function, x) From bbc2f3a190ff05a0bb8c30246dc71490587f434a Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 15:37:38 +0200 Subject: [PATCH 0651/1533] Let test to fail --- tensorflow/python/kernel_tests/map_fn_test.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 81dd817687a..8ead634aa11 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.python.autograph.impl import api +from tensorflow.python.eager import def_function from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -187,8 +187,7 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) - #@test_util.run_in_graph_and_eager_modes - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes def testMap_autograph_indirect(self): def test_function(x): cond = constant_op.constant(-1) @@ -198,7 +197,7 @@ class MapFnTest(test.TestCase): result = x return result - @api.convert(recursive=False) + @def_function.function def map_call(x): return 
map_fn.map_fn(test_function, x) From 8da4a14be31d4621208724b373730b2c4972f2f9 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 15 May 2020 07:34:14 -0700 Subject: [PATCH 0652/1533] Avoid overhead for creating executors if there is no change in execution mode. PiperOrigin-RevId: 311726778 Change-Id: I33a1e5085e1740504181bd6096229b6df12b26f8 --- tensorflow/python/eager/context.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 86b3d5cf95f..c6ef21402d2 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -2027,6 +2027,8 @@ def execution_mode(mode): """Context manager for setting execution mode for current thread.""" if mode is None: yield + elif (mode == ASYNC) == context().executor.is_async(): + yield else: ctx = context() executor_new = executor.new_executor(mode == ASYNC) From 560762e40d9bb085ea33f52b36b96a3851e1b3d2 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 16:49:53 +0200 Subject: [PATCH 0653/1533] Test autograph transform of fn --- tensorflow/python/ops/map_fn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 2c9c678336e..e39d35c36b0 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -22,6 +22,8 @@ from __future__ import print_function import re +from tensorflow.python.autograph.core import ag_ctx as autograph_ctx +from tensorflow.python.autograph.impl import api as autograph from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops @@ -477,6 +479,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) + autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) + result_value = autographed_fn(elems_value) result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) From 020a88ac127caa1e333ce36873ad2602abc5f7d7 Mon Sep 17 00:00:00 2001 From: "A. 
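The guard simply compares the requested mode against the current executor before
building a new one. A self-contained sketch of the pattern follows (toy stand-ins
only, not the real TensorFlow context/executor objects; the actual two-line change
is in the diff below):

    import contextlib

    ASYNC, SYNC = "async", "sync"

    class _Executor(object):
      def __init__(self, enable_async):
        self._async = enable_async

      def is_async(self):
        return self._async

    _current_executor = _Executor(enable_async=False)

    @contextlib.contextmanager
    def execution_mode(mode):
      """Context manager for setting the execution mode of the current thread."""
      global _current_executor
      if mode is None or (mode == ASYNC) == _current_executor.is_async():
        # Requested mode is already active: skip creating a new executor.
        yield
      else:
        previous = _current_executor
        _current_executor = _Executor(enable_async=(mode == ASYNC))
        try:
          yield
        finally:
          _current_executor = previous

With this guard, entering `execution_mode(ASYNC)` while the executor is already
asynchronous (or `execution_mode(None)`) becomes a near no-op.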
Unique TensorFlower" Date: Fri, 15 May 2020 08:09:03 -0700 Subject: [PATCH 0654/1533] Rollback to investigate failure PiperOrigin-RevId: 311731132 Change-Id: I109ce87f13bb1b1c06b3e110bafbdf9c014c8258 --- tensorflow/lite/BUILD | 19 +- tensorflow/lite/core/subgraph.cc | 7 - tensorflow/lite/core/subgraph.h | 6 - tensorflow/lite/delegates/BUILD | 23 -- .../lite/delegates/interpreter_utils.cc | 67 ---- tensorflow/lite/delegates/interpreter_utils.h | 52 --- .../lite/delegates/interpreter_utils_test.cc | 92 ----- tensorflow/lite/delegates/utils.h | 2 - tensorflow/lite/interpreter.cc | 4 - tensorflow/lite/interpreter.h | 13 - tensorflow/lite/interpreter_test.cc | 309 +++++++++++++++- tensorflow/lite/interpreter_test.h | 331 ------------------ 12 files changed, 307 insertions(+), 618 deletions(-) delete mode 100644 tensorflow/lite/delegates/interpreter_utils.cc delete mode 100644 tensorflow/lite/delegates/interpreter_utils.h delete mode 100644 tensorflow/lite/delegates/interpreter_utils_test.cc delete mode 100644 tensorflow/lite/interpreter_test.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 4d8c07aa15b..14babee2da7 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -340,27 +340,11 @@ cc_test( ], ) -cc_library( - name = "interpreter_test_fixtures", - testonly = True, - hdrs = ["interpreter_test.h"], - deps = [ - ":framework", - "//tensorflow/lite/core/api", - "//tensorflow/lite/kernels:builtin_ops", - "//tensorflow/lite/kernels:kernel_util", - "//tensorflow/lite/kernels/internal:compatibility", - "@com_google_googletest//:gtest", - ], -) - # Test main interpreter cc_test( name = "interpreter_test", size = "small", - srcs = [ - "interpreter_test.cc", - ], + srcs = ["interpreter_test.cc"], features = ["-dynamic_link_test_srcs"], # see go/dynamic_link_test_srcs tags = [ "tflite_not_portable_ios", # TODO(b/117786830) @@ -368,7 +352,6 @@ cc_test( deps = [ ":external_cpu_backend_context", ":framework", - ":interpreter_test_fixtures", ":string_util", ":version", "//tensorflow/lite/core/api", diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 81710df128b..7f4e0e286ea 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -533,11 +533,6 @@ void Subgraph::SetCancellationFunction(void* data, check_cancelled_func_ = check_cancelled_func; } -bool Subgraph::IsCancelled() { - return (check_cancelled_func_ != nullptr) && - (*check_cancelled_func_)(cancellation_data_); -} - void Subgraph::ReserveNodes(int count) { nodes_and_registration_.reserve(count); } @@ -1321,8 +1316,6 @@ TfLiteStatus Subgraph::RemoveAllDelegates() { return kTfLiteOk; } -bool Subgraph::HasDelegates() { return !delegates_applied_.empty(); } - TfLiteStatus Subgraph::EnsureMemoryAllocations() { if (memory_planner_) { state_ = kStateUninvokable; diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index d6067daaa6a..0b0c1e31e89 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -553,9 +553,6 @@ class Subgraph { // afterwards. TfLiteStatus RemoveAllDelegates(); - // Returns true if the subgraph has delegates applied. - bool HasDelegates(); - // Cleanups up data reserved for the given node. Does not remove the {node, // registration} pair from nodes_and_registrations_. void CleanupNode(int node_index); @@ -581,9 +578,6 @@ class Subgraph { // Ensures the memory required is planned and allocated. 
TfLiteStatus EnsureMemoryAllocations(); - // Returns true if cancellation function returns true. - bool IsCancelled(); - // The state of the Interpreter. enum State { // The interpreter isn't ready to be invoked. diff --git a/tensorflow/lite/delegates/BUILD b/tensorflow/lite/delegates/BUILD index 8d4c921576d..df671675ec9 100644 --- a/tensorflow/lite/delegates/BUILD +++ b/tensorflow/lite/delegates/BUILD @@ -43,26 +43,3 @@ cc_test( "@com_google_googletest//:gtest_main", ], ) - -cc_library( - name = "interpreter_utils", - srcs = ["interpreter_utils.cc"], - hdrs = ["interpreter_utils.h"], - copts = tflite_copts(), - deps = [ - "//tensorflow/lite:framework", - ], -) - -cc_test( - name = "interpreter_utils_test", - srcs = ["interpreter_utils_test.cc"], - linkopts = tflite_linkopts(), - linkstatic = 1, - deps = [ - ":interpreter_utils", - "//tensorflow/lite:framework", - "//tensorflow/lite:interpreter_test_fixtures", - "@com_google_googletest//:gtest_main", - ], -) diff --git a/tensorflow/lite/delegates/interpreter_utils.cc b/tensorflow/lite/delegates/interpreter_utils.cc deleted file mode 100644 index 85d79d887fb..00000000000 --- a/tensorflow/lite/delegates/interpreter_utils.cc +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/delegates/interpreter_utils.h" - -namespace tflite { -namespace delegates { -TfLiteStatus InterpreterUtils::InvokeWithCPUFallback(Interpreter* interpreter) { - TfLiteStatus status = interpreter->Invoke(); - if (status == kTfLiteOk || interpreter->IsCancelled() || - !interpreter->HasDelegates()) { - return status; - } - // Retry without delegation. - // TODO(b/138706191): retry only if error is due to delegation. - TF_LITE_REPORT_ERROR( - interpreter->error_reporter(), - "Invoke() failed in the presence of delegation. Retrying without."); - - // Copy input data to a buffer. - // Input data is safe since Subgraph::PrepareOpsAndTensors() passes - // preserve_inputs=true to ArenaPlanner. - std::vector buf; - size_t input_size = 0; - - for (auto i : interpreter->inputs()) { - TfLiteTensor* t = interpreter->tensor(i); - input_size += t->bytes; - } - buf.reserve(input_size); - auto bufp = buf.begin(); - for (auto i : interpreter->inputs()) { - // TF_LITE_ENSURE_STATUS(interpreter->EnsureTensorDataIsReadable(i)); - TfLiteTensor* t = interpreter->tensor(i); - std::copy(t->data.raw, t->data.raw + t->bytes, bufp); - bufp += t->bytes; - } - - TF_LITE_ENSURE_STATUS(interpreter->RemoveAllDelegates()); - - // Copy inputs from buffer. - bufp = buf.begin(); - for (auto i : interpreter->inputs()) { - TfLiteTensor* t = interpreter->tensor(i); - std::copy(bufp, bufp + t->bytes, t->data.raw); - bufp += t->bytes; - } - - // Invoke again. 
- TF_LITE_ENSURE_STATUS(interpreter->Invoke()); - return kTfLiteDelegateError; -} - -} // namespace delegates -} // namespace tflite diff --git a/tensorflow/lite/delegates/interpreter_utils.h b/tensorflow/lite/delegates/interpreter_utils.h deleted file mode 100644 index f736c2db1f4..00000000000 --- a/tensorflow/lite/delegates/interpreter_utils.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ -#define TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ - -#include "tensorflow/lite/interpreter.h" - -// Utility functions and classes for using delegates. - -namespace tflite { -namespace delegates { -#if !TFLITE_EXPERIMENTAL_RUNTIME_EAGER -class InterpreterUtils { - public: - /// Invokes an interpreter with automatic fallback from delegation to CPU. - /// - /// If using the delegate fails, the delegate is automatically undone and an - /// attempt made to return the interpreter to an invokable state. - /// - /// Allowing the fallback is suitable only if both of the following hold: - /// - The caller is known not to cache pointers to tensor data across Invoke() - /// calls. - /// - The model is not stateful (no variables, no LSTMs) or the state isn't - /// needed between batches. - /// - /// Returns one of the following three status codes: - /// 1. kTfLiteOk: Success. Output is valid. - /// 2. kTfLiteDelegateError: Delegate error but fallback succeeded. Output is - /// valid. - /// NOTE: This undoes all delegates previously applied to the Interpreter. - /// 3. kTfLiteError: Unexpected/runtime failure. Output is invalid. - /// WARNING: This is an experimental API and subject to change. - static TfLiteStatus InvokeWithCPUFallback(Interpreter* interpreter); -}; -#endif // !TFLITE_EXPERIMENTAL_RUNTIME_EAGER -} // namespace delegates -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ diff --git a/tensorflow/lite/delegates/interpreter_utils_test.cc b/tensorflow/lite/delegates/interpreter_utils_test.cc deleted file mode 100644 index 8dc856d796c..00000000000 --- a/tensorflow/lite/delegates/interpreter_utils_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/lite/delegates/interpreter_utils.h" - -#include -#include -#include "tensorflow/lite/interpreter_test.h" - -namespace tflite { -namespace { - -TEST_F(TestDelegate, DelegateNodeInvokeFailureFallback) { - delegate_ = std::unique_ptr(new SimpleDelegate( - {0, 1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, - 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); - ASSERT_EQ( - interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), - kTfLiteOk); - // Delegation modified execution plan. - ASSERT_EQ(interpreter_->execution_plan().size(), 1); - - std::vector input = {1.0f, 2.0f, 3.0f}; - std::vector expected_output = {2.0f, 4.0f, 6.0f}; - constexpr int kOutputTensorIndex = 3; - - memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); - memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); - EXPECT_EQ( - delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), - kTfLiteDelegateError); - // Delegation removed, returning to original execution plan. - ASSERT_EQ(interpreter_->execution_plan().size(), 3); - // Check outputs. - TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); - for (int i = 0; i < 3; ++i) { - EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; - } -} - -TEST_F(TestDelegate, TestFallbackWithMultipleDelegates) { - // First delegate only supports node 0. - // This delegate should support dynamic tensors, otherwise the second won't be - // applied. - delegate_ = std::unique_ptr( - new SimpleDelegate({0}, kTfLiteDelegateFlagsAllowDynamicTensors)); - // Second delegate supports nodes 1 & 2, and makes the graph immutable. - delegate2_ = std::unique_ptr(new SimpleDelegate( - {1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, - 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); - // Pre-delegation execution plan should have three nodes. - ASSERT_EQ(interpreter_->execution_plan().size(), 3); - ASSERT_EQ( - interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), - kTfLiteOk); - ASSERT_EQ( - interpreter_->ModifyGraphWithDelegate(delegate2_->get_tf_lite_delegate()), - kTfLiteOk); - // Should be two delegates nodes. - ASSERT_EQ(interpreter_->execution_plan().size(), 2); - - std::vector input = {1.0f, 2.0f, 3.0f}; - std::vector expected_output = {2.0f, 4.0f, 6.0f}; - constexpr int kOutputTensorIndex = 2; - TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); - - memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); - memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); - EXPECT_EQ( - delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), - kTfLiteDelegateError); - // All delegates should be undone. - EXPECT_EQ(interpreter_->execution_plan().size(), 3); - for (int i = 0; i < 3; ++i) { - EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; - } -} - -} // namespace -} // namespace tflite diff --git a/tensorflow/lite/delegates/utils.h b/tensorflow/lite/delegates/utils.h index 3b0668af04b..d6d22c4efa2 100644 --- a/tensorflow/lite/delegates/utils.h +++ b/tensorflow/lite/delegates/utils.h @@ -16,8 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_UTILS_H_ #define TENSORFLOW_LITE_DELEGATES_UTILS_H_ -// Utility functions and classes for implementing delegates. 
- #include #include #include diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index 167254a2a62..c8ccf671d60 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -310,8 +310,6 @@ void Interpreter::SetCancellationFunction(void* data, } } -bool Interpreter::IsCancelled() { return primary_subgraph().IsCancelled(); } - TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { TfLiteStatus status = kTfLiteOk; for (auto& subgraph : subgraphs_) { @@ -342,8 +340,6 @@ TfLiteStatus Interpreter::RemoveAllDelegates() { return kTfLiteOk; } -bool Interpreter::HasDelegates() { return primary_subgraph().HasDelegates(); } - TfLiteStatus Interpreter::SetBufferHandle(int tensor_index, TfLiteBufferHandle buffer_handle, TfLiteDelegate* delegate) { diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index aa9c54d295f..b93fd76c13b 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -42,9 +42,6 @@ namespace tflite { class InterpreterTest; class TestDelegate; -namespace delegates { -class InterpreterUtils; // Class for friend declarations. -} // namespace delegates namespace impl { @@ -532,7 +529,6 @@ class Interpreter { friend class InterpreterBuilder; friend class tflite::InterpreterTest; friend class tflite::TestDelegate; - friend class tflite::delegates::InterpreterUtils; /// Set the value of an external context. static void SetExternalContext(struct TfLiteContext* context, @@ -546,15 +542,6 @@ class Interpreter { // afterwards. TfLiteStatus RemoveAllDelegates(); - // Returns true if delegates have been applied. - bool HasDelegates(); - - // Returns true if cancellation function returns true. - bool IsCancelled(); - - // Get the error reporter associated with this interpreter. - ErrorReporter* error_reporter() { return error_reporter_; } - // A pure C data structure used to communicate with the pure C plugin // interface. To avoid copying tensor metadata, this is also the definitive // structure to store tensors. diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc index 1d8f82ef16a..cfc7c168aa5 100644 --- a/tensorflow/lite/interpreter_test.cc +++ b/tensorflow/lite/interpreter_test.cc @@ -24,7 +24,6 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/external_cpu_backend_context.h" -#include "tensorflow/lite/interpreter_test.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -36,6 +35,25 @@ limitations under the License. namespace tflite { +// InterpreterTest is a friend of Interpreter, so it can access context_. 
+class InterpreterTest : public ::testing::Test { + public: + template + static TfLiteStatus ModifyGraphWithDelegate( + Interpreter* interpreter, std::unique_ptr delegate) { + Interpreter::TfLiteDelegatePtr tflite_delegate( + delegate.release(), [](TfLiteDelegate* delegate) { + delete reinterpret_cast(delegate); + }); + return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate)); + } + + protected: + TfLiteContext* GetInterpreterContext() { return interpreter_.context_; } + + Interpreter interpreter_; +}; + namespace ops { namespace builtin { TfLiteRegistration* Register_PADV2(); @@ -1286,6 +1304,291 @@ TEST_F(TestExecutionPlan, NullExecutionPlan) { ASSERT_EQ(run_order_, std::vector()); } +// Build a kernel registration for an op that copies its one input +// to an output +TfLiteRegistration AddOpRegistration() { + TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; + + reg.custom_name = "my_add"; + reg.builtin_code = tflite::BuiltinOperator_CUSTOM; + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input1 = GetInput(context, node, 0); + const TfLiteTensor* input2 = GetInput(context, node, 1); + TfLiteTensor* output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); + for (int i = 0; i < input1->dims->size; ++i) { + TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]); + } + + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input1->dims))); + return kTfLiteOk; + }; + + reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { + // Copy input data to output data. + const TfLiteTensor* a0 = GetInput(context, node, 0); + TF_LITE_ENSURE(context, a0); + TF_LITE_ENSURE(context, a0->data.f); + const TfLiteTensor* a1 = GetInput(context, node, 1); + TF_LITE_ENSURE(context, a1); + TF_LITE_ENSURE(context, a1->data.f); + TfLiteTensor* out = GetOutput(context, node, 0); + TF_LITE_ENSURE(context, out); + TF_LITE_ENSURE(context, out->data.f); + int num = a0->dims->data[0]; + for (int i = 0; i < num; i++) { + out->data.f[i] = a0->data.f[i] + a1->data.f[i]; + } + return kTfLiteOk; + }; + return reg; +} + +} // namespace + +// TestDelegate is a friend of Interpreter to access RemoveAllDelegates(). +class TestDelegate : public ::testing::Test { + protected: + void SetUp() override { + interpreter_.reset(new Interpreter); + interpreter_->AddTensors(5); + interpreter_->SetInputs({0, 1}); + interpreter_->SetOutputs({3, 4}); + TfLiteQuantizationParams quant; + interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, + quant); + TfLiteRegistration reg = AddOpRegistration(); + interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, ®); + interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, ®); + interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, ®); + } + + void TearDown() override { + // Interpreter relies on delegate to free the resources properly. Thus + // the life cycle of delegate must be longer than interpreter. 
+ interpreter_.reset(); + delegate_.reset(); + } + + TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle; + + TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; } + + TfLiteStatus RemoveAllDelegates() { + return interpreter_->RemoveAllDelegates(); + } + + protected: + class SimpleDelegate { + public: + // Create a simple implementation of a TfLiteDelegate. We use the C++ class + // SimpleDelegate and it can produce a handle TfLiteDelegate that is + // value-copyable and compatible with TfLite. + // fail_node_prepare: To simulate failure of Delegate node's Prepare(). + // min_ops_per_subset: If >0, partitioning preview is used to choose only + // those subsets with min_ops_per_subset number of nodes. + // fail_node_invoke: To simulate failure of Delegate node's Invoke(). + explicit SimpleDelegate( + const std::vector& nodes, + TfLiteDelegateFlags delegate_flags = kTfLiteDelegateFlagsNone, + bool fail_node_prepare = false, int min_ops_per_subset = 0, + bool fail_node_invoke = false) + : nodes_(nodes), + fail_delegate_node_prepare_(fail_node_prepare), + min_ops_per_subset_(min_ops_per_subset), + fail_delegate_node_invoke_(fail_node_invoke) { + delegate_.Prepare = [](TfLiteContext* context, + TfLiteDelegate* delegate) -> TfLiteStatus { + auto* simple = static_cast(delegate->data_); + TfLiteIntArray* nodes_to_separate = + TfLiteIntArrayCreate(simple->nodes_.size()); + // Mark nodes that we want in TfLiteIntArray* structure. + int index = 0; + for (auto node_index : simple->nodes_) { + nodes_to_separate->data[index++] = node_index; + // make sure node is added + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); + TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); + } + // Check that all nodes are available + TfLiteIntArray* execution_plan; + TF_LITE_ENSURE_STATUS( + context->GetExecutionPlan(context, &execution_plan)); + for (int exec_index = 0; exec_index < execution_plan->size; + exec_index++) { + int node_index = execution_plan->data[exec_index]; + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + if (exec_index == node_index) { + // Check op details only if it wasn't delegated already. + TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); + TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); + } + } + + // Get preview of delegate partitioning from the context. + TfLiteDelegateParams* params_array; + int num_partitions; + TFLITE_CHECK_EQ( + context->PreviewDelegatePartitioning( + context, nodes_to_separate, ¶ms_array, &num_partitions), + kTfLiteOk); + + if (simple->min_ops_per_subset() > 0) { + // Build a new vector of ops from subsets with atleast the minimum + // size. + std::vector allowed_ops; + for (int idx = 0; idx < num_partitions; ++idx) { + const auto* nodes_in_subset = params_array[idx].nodes_to_replace; + if (nodes_in_subset->size < simple->min_ops_per_subset()) continue; + allowed_ops.insert(allowed_ops.end(), nodes_in_subset->data, + nodes_in_subset->data + nodes_in_subset->size); + } + + // Free existing nodes_to_separate & initialize a new array with + // allowed_ops. 
+ TfLiteIntArrayFree(nodes_to_separate); + nodes_to_separate = TfLiteIntArrayCreate(allowed_ops.size()); + memcpy(nodes_to_separate->data, allowed_ops.data(), + sizeof(int) * nodes_to_separate->size); + } + + // Another call to PreviewDelegateParitioning should be okay, since + // partitioning memory is managed by context. + TFLITE_CHECK_EQ( + context->PreviewDelegatePartitioning( + context, nodes_to_separate, ¶ms_array, &num_partitions), + kTfLiteOk); + + context->ReplaceNodeSubsetsWithDelegateKernels( + context, simple->FakeFusedRegistration(), nodes_to_separate, + delegate); + TfLiteIntArrayFree(nodes_to_separate); + return kTfLiteOk; + }; + delegate_.CopyToBufferHandle = [](TfLiteContext* context, + TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* tensor) -> TfLiteStatus { + // TODO(b/156586986): Implement tests to test buffer copying logic. + return kTfLiteOk; + }; + delegate_.CopyFromBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* output) -> TfLiteStatus { + TFLITE_CHECK_GE(buffer_handle, -1); + TFLITE_CHECK_EQ(output->buffer_handle, buffer_handle); + const float floats[] = {6., 6., 6.}; + int num = output->dims->data[0]; + for (int i = 0; i < num; i++) { + output->data.f[i] = floats[i]; + } + return kTfLiteOk; + }; + + delegate_.FreeBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; }; + // Store type-punned data SimpleDelegate structure. + delegate_.data_ = static_cast(this); + delegate_.flags = delegate_flags; + } + + TfLiteRegistration FakeFusedRegistration() { + TfLiteRegistration reg = {nullptr}; + reg.custom_name = "fake_fused_op"; + + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + // Copy input data to output data. 
+ const TfLiteTensor* a0; + const TfLiteTensor* a1; + if (node->inputs->size == 2) { + a0 = GetInput(context, node, 0); + a1 = GetInput(context, node, 1); + } else { + a0 = GetInput(context, node, 0); + a1 = a0; + } + TfLiteTensor* out = GetOutput(context, node, 0); + int num = 1; + for (int i = 0; i < a0->dims->size; ++i) { + num *= a0->dims->data[i]; + } + for (int i = 0; i < num; i++) { + out->data.f[i] = a0->data.f[i] + a1->data.f[i]; + } + // Make the data stale so that CopyFromBufferHandle can be invoked + out->data_is_stale = true; + return kTfLiteOk; + }; + if (fail_delegate_node_invoke_) { + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + return kTfLiteError; + }; + } + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input1; + const TfLiteTensor* input2; + if (node->inputs->size == 2) { + input1 = GetInput(context, node, 0); + input2 = GetInput(context, node, 1); + } else { + input1 = GetInput(context, node, 0); + input2 = input1; + } + TfLiteTensor* output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input1->dims))); + return kTfLiteOk; + }; + if (fail_delegate_node_prepare_) { + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + return kTfLiteError; + }; + } + + return reg; + } + + TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } + + int min_ops_per_subset() { return min_ops_per_subset_; } + + private: + std::vector nodes_; + TfLiteDelegate delegate_; + bool fail_delegate_node_prepare_ = false; + int min_ops_per_subset_ = 0; + bool fail_delegate_node_invoke_ = false; + }; + + std::unique_ptr interpreter_; + std::unique_ptr delegate_, delegate2_; +}; +namespace { + TEST_F(TestDelegate, BasicDelegate) { delegate_ = std::unique_ptr(new SimpleDelegate({0, 1, 2})); interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()); @@ -1664,7 +1967,7 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { // Verify Invoke() behavior. memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); - EXPECT_EQ(interpreter_->Invoke(), kTfLiteOk); + interpreter_->Invoke(); for (int i = 0; i < 3; ++i) { EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; } @@ -1678,7 +1981,7 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { memcpy(interpreter_->typed_tensor(0), input.data(), 4 * sizeof(float)); memcpy(interpreter_->typed_tensor(1), input.data(), 4 * sizeof(float)); - EXPECT_EQ(interpreter_->Invoke(), kTfLiteOk); + interpreter_->Invoke(); for (int i = 0; i < 4; ++i) { EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; } diff --git a/tensorflow/lite/interpreter_test.h b/tensorflow/lite/interpreter_test.h deleted file mode 100644 index d4f0c8a05c5..00000000000 --- a/tensorflow/lite/interpreter_test.h +++ /dev/null @@ -1,331 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_INTERPRETER_TEST_H_ -#define TENSORFLOW_LITE_INTERPRETER_TEST_H_ - -#include -#include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/internal/compatibility.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/register.h" - -namespace tflite { -// InterpreterTest is a friend of Interpreter, so it can access context_. -class InterpreterTest : public ::testing::Test { - public: - template - static TfLiteStatus ModifyGraphWithDelegate( - Interpreter* interpreter, std::unique_ptr delegate) { - Interpreter::TfLiteDelegatePtr tflite_delegate( - delegate.release(), [](TfLiteDelegate* delegate) { - delete reinterpret_cast(delegate); - }); - return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate)); - } - - protected: - TfLiteContext* GetInterpreterContext() { return interpreter_.context_; } - - Interpreter interpreter_; -}; - -// Build a kernel registration for an op that copies its one input -// to an output -TfLiteRegistration AddOpRegistration() { - TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; - - reg.custom_name = "my_add"; - reg.builtin_code = tflite::BuiltinOperator_CUSTOM; - - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - // Set output size to input size - const TfLiteTensor* input1 = GetInput(context, node, 0); - const TfLiteTensor* input2 = GetInput(context, node, 1); - TfLiteTensor* output = GetOutput(context, node, 0); - - TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); - for (int i = 0; i < input1->dims->size; ++i) { - TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]); - } - - TF_LITE_ENSURE_STATUS(context->ResizeTensor( - context, output, TfLiteIntArrayCopy(input1->dims))); - return kTfLiteOk; - }; - - reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { - // Copy input data to output data. - const TfLiteTensor* a0 = GetInput(context, node, 0); - TF_LITE_ENSURE(context, a0); - TF_LITE_ENSURE(context, a0->data.f); - const TfLiteTensor* a1 = GetInput(context, node, 1); - TF_LITE_ENSURE(context, a1); - TF_LITE_ENSURE(context, a1->data.f); - TfLiteTensor* out = GetOutput(context, node, 0); - TF_LITE_ENSURE(context, out); - TF_LITE_ENSURE(context, out->data.f); - int num = a0->dims->data[0]; - for (int i = 0; i < num; i++) { - out->data.f[i] = a0->data.f[i] + a1->data.f[i]; - } - return kTfLiteOk; - }; - return reg; -} - -// TestDelegate is a friend of Interpreter to access RemoveAllDelegates(). 
-class TestDelegate : public ::testing::Test { - protected: - void SetUp() override { - interpreter_.reset(new Interpreter); - interpreter_->AddTensors(5); - interpreter_->SetInputs({0, 1}); - interpreter_->SetOutputs({3, 4}); - TfLiteQuantizationParams quant; - interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, - quant); - TfLiteRegistration reg = AddOpRegistration(); - interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, ®); - interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, ®); - interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, ®); - } - - void TearDown() override { - // Interpreter relies on delegate to free the resources properly. Thus - // the life cycle of delegate must be longer than interpreter. - interpreter_.reset(); - delegate_.reset(); - } - - TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle; - - TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; } - - TfLiteStatus RemoveAllDelegates() { - return interpreter_->RemoveAllDelegates(); - } - - protected: - class SimpleDelegate { - public: - // Create a simple implementation of a TfLiteDelegate. We use the C++ class - // SimpleDelegate and it can produce a handle TfLiteDelegate that is - // value-copyable and compatible with TfLite. - // fail_node_prepare: To simulate failure of Delegate node's Prepare(). - // min_ops_per_subset: If >0, partitioning preview is used to choose only - // those subsets with min_ops_per_subset number of nodes. - // fail_node_invoke: To simulate failure of Delegate node's Invoke(). - explicit SimpleDelegate( - const std::vector& nodes, - TfLiteDelegateFlags delegate_flags = kTfLiteDelegateFlagsNone, - bool fail_node_prepare = false, int min_ops_per_subset = 0, - bool fail_node_invoke = false) - : nodes_(nodes), - fail_delegate_node_prepare_(fail_node_prepare), - min_ops_per_subset_(min_ops_per_subset), - fail_delegate_node_invoke_(fail_node_invoke) { - delegate_.Prepare = [](TfLiteContext* context, - TfLiteDelegate* delegate) -> TfLiteStatus { - auto* simple = static_cast(delegate->data_); - TfLiteIntArray* nodes_to_separate = - TfLiteIntArrayCreate(simple->nodes_.size()); - // Mark nodes that we want in TfLiteIntArray* structure. - int index = 0; - for (auto node_index : simple->nodes_) { - nodes_to_separate->data[index++] = node_index; - // make sure node is added - TfLiteNode* node; - TfLiteRegistration* reg; - context->GetNodeAndRegistration(context, node_index, &node, ®); - TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); - TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); - } - // Check that all nodes are available - TfLiteIntArray* execution_plan; - TF_LITE_ENSURE_STATUS( - context->GetExecutionPlan(context, &execution_plan)); - for (int exec_index = 0; exec_index < execution_plan->size; - exec_index++) { - int node_index = execution_plan->data[exec_index]; - TfLiteNode* node; - TfLiteRegistration* reg; - context->GetNodeAndRegistration(context, node_index, &node, ®); - if (exec_index == node_index) { - // Check op details only if it wasn't delegated already. 
- TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); - TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); - } - } - - // Get preview of delegate partitioning from the context. - TfLiteDelegateParams* params_array; - int num_partitions; - TFLITE_CHECK_EQ( - context->PreviewDelegatePartitioning( - context, nodes_to_separate, ¶ms_array, &num_partitions), - kTfLiteOk); - - if (simple->min_ops_per_subset() > 0) { - // Build a new vector of ops from subsets with atleast the minimum - // size. - std::vector allowed_ops; - for (int idx = 0; idx < num_partitions; ++idx) { - const auto* nodes_in_subset = params_array[idx].nodes_to_replace; - if (nodes_in_subset->size < simple->min_ops_per_subset()) continue; - allowed_ops.insert(allowed_ops.end(), nodes_in_subset->data, - nodes_in_subset->data + nodes_in_subset->size); - } - - // Free existing nodes_to_separate & initialize a new array with - // allowed_ops. - TfLiteIntArrayFree(nodes_to_separate); - nodes_to_separate = TfLiteIntArrayCreate(allowed_ops.size()); - memcpy(nodes_to_separate->data, allowed_ops.data(), - sizeof(int) * nodes_to_separate->size); - } - - // Another call to PreviewDelegateParitioning should be okay, since - // partitioning memory is managed by context. - TFLITE_CHECK_EQ( - context->PreviewDelegatePartitioning( - context, nodes_to_separate, ¶ms_array, &num_partitions), - kTfLiteOk); - - context->ReplaceNodeSubsetsWithDelegateKernels( - context, simple->FakeFusedRegistration(), nodes_to_separate, - delegate); - TfLiteIntArrayFree(nodes_to_separate); - return kTfLiteOk; - }; - delegate_.CopyToBufferHandle = [](TfLiteContext* context, - TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* tensor) -> TfLiteStatus { - // TODO(b/156586986): Implement tests to test buffer copying logic. - return kTfLiteOk; - }; - delegate_.CopyFromBufferHandle = - [](TfLiteContext* context, TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* output) -> TfLiteStatus { - TFLITE_CHECK_GE(buffer_handle, -1); - TFLITE_CHECK_EQ(output->buffer_handle, buffer_handle); - const float floats[] = {6., 6., 6.}; - int num = output->dims->data[0]; - for (int i = 0; i < num; i++) { - output->data.f[i] = floats[i]; - } - return kTfLiteOk; - }; - - delegate_.FreeBufferHandle = - [](TfLiteContext* context, TfLiteDelegate* delegate, - TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; }; - // Store type-punned data SimpleDelegate structure. - delegate_.data_ = static_cast(this); - delegate_.flags = delegate_flags; - } - - TfLiteRegistration FakeFusedRegistration() { - TfLiteRegistration reg = {nullptr}; - reg.custom_name = "fake_fused_op"; - - reg.invoke = [](TfLiteContext* context, - TfLiteNode* node) -> TfLiteStatus { - // Copy input data to output data. 
- const TfLiteTensor* a0; - const TfLiteTensor* a1; - if (node->inputs->size == 2) { - a0 = GetInput(context, node, 0); - a1 = GetInput(context, node, 1); - } else { - a0 = GetInput(context, node, 0); - a1 = a0; - } - TfLiteTensor* out = GetOutput(context, node, 0); - int num = 1; - for (int i = 0; i < a0->dims->size; ++i) { - num *= a0->dims->data[i]; - } - for (int i = 0; i < num; i++) { - out->data.f[i] = a0->data.f[i] + a1->data.f[i]; - } - // Make the data stale so that CopyFromBufferHandle can be invoked - if (out->buffer_handle != kTfLiteNullBufferHandle) { - out->data_is_stale = true; - } - return kTfLiteOk; - }; - if (fail_delegate_node_invoke_) { - reg.invoke = [](TfLiteContext* context, - TfLiteNode* node) -> TfLiteStatus { - return kTfLiteError; - }; - } - - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - // Set output size to input size - const TfLiteTensor* input1; - const TfLiteTensor* input2; - if (node->inputs->size == 2) { - input1 = GetInput(context, node, 0); - input2 = GetInput(context, node, 1); - } else { - input1 = GetInput(context, node, 0); - input2 = input1; - } - TfLiteTensor* output = GetOutput(context, node, 0); - - TF_LITE_ENSURE_STATUS(context->ResizeTensor( - context, output, TfLiteIntArrayCopy(input1->dims))); - return kTfLiteOk; - }; - if (fail_delegate_node_prepare_) { - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - return kTfLiteError; - }; - } - - return reg; - } - - TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } - - int min_ops_per_subset() { return min_ops_per_subset_; } - - private: - std::vector nodes_; - TfLiteDelegate delegate_; - bool fail_delegate_node_prepare_ = false; - int min_ops_per_subset_ = 0; - bool fail_delegate_node_invoke_ = false; - }; - - std::unique_ptr interpreter_; - std::unique_ptr delegate_, delegate2_; -}; -} // namespace tflite - -#endif // TENSORFLOW_LITE_INTERPRETER_TEST_H_ From d6dd56f74f228227dc9781bd389147df61d3784e Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 17:26:04 +0200 Subject: [PATCH 0655/1533] Remove original fn call --- tensorflow/python/ops/map_fn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index e39d35c36b0..b98b4ad10bc 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -481,7 +481,6 @@ def map_fn(fn, elems_value = elems_unflatten(elems_value_flat) autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) result_value = autographed_fn(elems_value) - result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable = _result_value_flat_to_batchable( From ec0026c8c38319c8ea1cc6ce80a1e5b6bb48c502 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Fri, 15 May 2020 08:38:22 -0700 Subject: [PATCH 0656/1533] Update ops_history for bincount. 
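This keeps the compat ops_history_v2 entries for DenseBincount, RaggedBincount, and SparseBincount in line with the op definitions by renaming the binary_count attr to binary_output.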
PiperOrigin-RevId: 311735482 Change-Id: I4bff5fdf6a840a5a5c692b5b906817815d41ba71 --- tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt | 2 +- tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt | 2 +- tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt index e26e1639e82..9bab6854e40 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt @@ -39,7 +39,7 @@ op { } } attr { - name: "binary_count" + name: "binary_output" type: "bool" default_value { b: false diff --git a/tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt index 9d94149cc09..4f5fb24109c 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt @@ -43,7 +43,7 @@ op { } } attr { - name: "binary_count" + name: "binary_output" type: "bool" default_value { b: false diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt index 333b71a5e1c..9bbc5132845 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt @@ -47,7 +47,7 @@ op { } } attr { - name: "binary_count" + name: "binary_output" type: "bool" default_value { b: false From e8d51ef6010faec838b34fe07cddb7369721d903 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Fri, 15 May 2020 08:50:01 -0700 Subject: [PATCH 0657/1533] Remove the unnecessary address-returning operator and lamda expression. 
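The deleters passed here are plain functions whose signatures already match what the smart pointer expects, so taking their address with & is redundant (a function name decays to a function pointer) and wrapping the call in a lambda only adds indirection. A minimal stand-alone sketch of the same pattern, using illustrative names that are not TFLite APIs and assuming a unique_ptr with a function-pointer deleter (roughly the shape of TfLiteDelegatePtr):

#include <memory>

struct Widget {};                              // stand-in for a delegate object
Widget* CreateWidget() { return new Widget; }  // stand-in for a *DelegateCreate call
void DeleteWidget(Widget* w) { delete w; }     // stand-in for a *DelegateDelete call

using WidgetPtr = std::unique_ptr<Widget, void (*)(Widget*)>;

WidgetPtr MakeWidget() {
  // Passing DeleteWidget directly is equivalent to passing &DeleteWidget or
  // [](Widget* w) { DeleteWidget(w); }, just without the extra syntax.
  return WidgetPtr(CreateWidget(), DeleteWidget);
}

int main() { WidgetPtr w = MakeWidget(); }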
PiperOrigin-RevId: 311737378 Change-Id: I55bf12bf66540ed32dd48d61da7f41bdf2ace5eb --- tensorflow/lite/tools/evaluation/utils.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/tools/evaluation/utils.cc b/tensorflow/lite/tools/evaluation/utils.cc index 33967b6f4ea..3807814fee1 100644 --- a/tensorflow/lite/tools/evaluation/utils.cc +++ b/tensorflow/lite/tools/evaluation/utils.cc @@ -119,7 +119,7 @@ TfLiteDelegatePtr CreateNNAPIDelegate(StatefulNnApiDelegate::Options options) { #if defined(__ANDROID__) TfLiteDelegatePtr CreateGPUDelegate(TfLiteGpuDelegateOptionsV2* options) { return TfLiteDelegatePtr(TfLiteGpuDelegateV2Create(options), - &TfLiteGpuDelegateV2Delete); + TfLiteGpuDelegateV2Delete); } #endif // defined(__ANDROID__) @@ -184,9 +184,7 @@ TfLiteDelegatePtr CreateXNNPACKDelegate() { TfLiteDelegatePtr CreateXNNPACKDelegate( const TfLiteXNNPackDelegateOptions* xnnpack_options) { auto xnnpack_delegate = TfLiteXNNPackDelegateCreate(xnnpack_options); - return TfLiteDelegatePtr(xnnpack_delegate, [](TfLiteDelegate* delegate) { - TfLiteXNNPackDelegateDelete(delegate); - }); + return TfLiteDelegatePtr(xnnpack_delegate, TfLiteXNNPackDelegateDelete); } TfLiteDelegatePtr CreateXNNPACKDelegate(int num_threads) { From 6bddca85b3f792cef733da529ea3fbb92fcb9522 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 15 May 2020 09:18:04 -0700 Subject: [PATCH 0658/1533] Use fully-qualified std::string in TraceMe PiperOrigin-RevId: 311741974 Change-Id: Ic9100c53ded4011b590651cbb5ca276b093a3fc2 --- .../core/profiler/internal/traceme_recorder.h | 5 +- tensorflow/core/profiler/lib/traceme.h | 54 +++++++++---------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/tensorflow/core/profiler/internal/traceme_recorder.h b/tensorflow/core/profiler/internal/traceme_recorder.h index 1da7d4cebb1..5fdea5bddbd 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder.h +++ b/tensorflow/core/profiler/internal/traceme_recorder.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_ #include +#include #include #include "absl/container/flat_hash_map.h" @@ -52,13 +53,13 @@ class TraceMeRecorder { // Times are in ns since the Unix epoch. struct Event { uint64 activity_id; - string name; + std::string name; uint64 start_time; // 0 = missing uint64 end_time; // 0 = missing }; struct ThreadInfo { uint32 tid; - string name; + std::string name; }; struct ThreadEvents { ThreadInfo thread; diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index af93ac11b1e..2c3e3ebe6cc 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -16,12 +16,10 @@ limitations under the License. #define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ #include +#include #include -#include "absl/strings/match.h" -#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" -#include "absl/strings/strip.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -78,20 +76,21 @@ inline int GetTFTraceMeLevel(bool is_expensive) { // auto id = ActivityStart("step"); // ... do some work ... // ActivityEnd(id); +// The two static methods should be called within the same thread. class TraceMe { public: - // Constructor that traces a user-defined activity labeled with activity_name + // Constructor that traces a user-defined activity labeled with name // in the UI. 
Level defines the trace priority, used for filtering TraceMe // events. By default, traces with TraceMe level <= 2 are recorded. Levels: // - Must be a positive integer. // - Can be a value in enum TraceMeLevel. // Users are welcome to use level > 3 in their code, if they wish to filter // out their host traces based on verbosity. - explicit TraceMe(absl::string_view activity_name, int level = 1) { + explicit TraceMe(absl::string_view name, int level = 1) { DCHECK_GE(level, 1); #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - new (&no_init_.name) string(activity_name); + new (&no_init_.name) std::string(name); start_time_ = EnvTime::NowNanos(); } #endif @@ -102,26 +101,26 @@ class TraceMe { // Note: We can't take the string by value because a) it would make the // overloads ambiguous, and b) we want lvalue strings to use the string_view // constructor so we avoid copying them when tracing is disabled. - explicit TraceMe(string &&activity_name, int level = 1) { + explicit TraceMe(std::string&& name, int level = 1) { DCHECK_GE(level, 1); #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - new (&no_init_.name) string(std::move(activity_name)); + new (&no_init_.name) std::string(std::move(name)); start_time_ = EnvTime::NowNanos(); } #endif } // Do not allow passing strings by reference or value since the caller - // may unintentionally maintain ownership of the activity_name. - // Explicitly std::move the activity_name or wrap it in a string_view if + // may unintentionally maintain ownership of the name. + // Explicitly std::move the name or wrap it in a string_view if // you really wish to maintain ownership. - explicit TraceMe(const string &activity_name, int level = 1) = delete; + explicit TraceMe(const std::string& name, int level = 1) = delete; // This overload is necessary to make TraceMe's with string literals work. // Otherwise, the string&& and the string_view constructor would be equally // good overload candidates. - explicit TraceMe(const char *raw, int level = 1) + explicit TraceMe(const char* raw, int level = 1) : TraceMe(absl::string_view(raw), level) {} // This overload only generates the activity name if tracing is enabled. @@ -136,12 +135,14 @@ class TraceMe { DCHECK_GE(level, 1); #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - new (&no_init_.name) string(name_generator()); + new (&no_init_.name) std::string(name_generator()); start_time_ = EnvTime::NowNanos(); } #endif } + ~TraceMe() { Stop(); } + // Stop tracing the activity. Called by the destructor, but exposed to allow // stopping tracing before the object goes out of scope. Only has an effect // the first time it is called. @@ -171,23 +172,21 @@ class TraceMe { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - absl::string_view orig = no_init_.name; - if (absl::EndsWith(orig, "#")) { - // orig does have metadata. - absl::ConsumeSuffix(&orig, "#"); - absl::ConsumePrefix(&new_metadata, "#"); - no_init_.name = absl::StrCat(orig, ",", new_metadata); - } else { - // orig does not have metadata. 
- absl::StrAppend(&no_init_.name, new_metadata); + std::string& name = no_init_.name; + DCHECK(!name.empty()); + DCHECK(!new_metadata.empty()); + if (name.back() == '#') { // name already has metadata + name.back() = ','; + if (TF_PREDICT_TRUE(new_metadata.front() == '#')) { + new_metadata.remove_prefix(1); + } } + name.append(new_metadata.data(), new_metadata.size()); } } #endif } - ~TraceMe() { Stop(); } - // Static API, for use when scoped objects are inconvenient. // Record the start time of an activity. @@ -196,7 +195,7 @@ class TraceMe { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { uint64 activity_id = TraceMeRecorder::NewActivityId(); - TraceMeRecorder::Record({activity_id, string(name), + TraceMeRecorder::Record({activity_id, std::string(name), /*start_time=*/EnvTime::NowNanos(), /*end_time=*/0}); return activity_id; @@ -211,7 +210,8 @@ class TraceMe { // We don't check the level again (see TraceMe::Stop()). if (TF_PREDICT_FALSE(activity_id != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - TraceMeRecorder::Record({activity_id, /*name=*/"", /*start_time=*/0, + TraceMeRecorder::Record({activity_id, /*name=*/std::string(), + /*start_time=*/0, /*end_time=*/EnvTime::NowNanos()}); } } @@ -239,7 +239,7 @@ class TraceMe { union NoInit { NoInit() {} ~NoInit() {} - string name; + std::string name; } no_init_; uint64 start_time_ = kUntracedActivity; From 64d839bb754b104e151bb49bb4ec46dbe690745d Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 18:21:51 +0200 Subject: [PATCH 0659/1533] Fix lint and improve readibility --- tensorflow/python/ops/map_fn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index b98b4ad10bc..40f8edfcdd1 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -479,7 +479,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) + ag_ctx = autograph_ctx.control_status_ctx() + autographed_fn = autograph.tf_convert(fn, ag_ctx) result_value = autographed_fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) From c568e0dd7fe372db4f5380f20ea2f96ebdd1b935 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 09:19:40 -0700 Subject: [PATCH 0660/1533] Avoid overhead for creating executors if there is no change in execution mode. PiperOrigin-RevId: 311742240 Change-Id: I8676ab711d3c3d9e64d4ec142e5d934f7c32ee73 --- tensorflow/python/eager/context.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index c6ef21402d2..86b3d5cf95f 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -2027,8 +2027,6 @@ def execution_mode(mode): """Context manager for setting execution mode for current thread.""" if mode is None: yield - elif (mode == ASYNC) == context().executor.is_async(): - yield else: ctx = context() executor_new = executor.new_executor(mode == ASYNC) From 02b5a6754bb9f62f1b415783e684ab8a69c4a01b Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 15 May 2020 09:32:35 -0700 Subject: [PATCH 0661/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/9d4b4f344d8e PiperOrigin-RevId: 311744575 Change-Id: Icdd7f018b188db8f8768f3b40d6411c2257547c1 --- .../compiler/xla/service/mlir_gpu/BUILD | 2 +- .../conv_emitter/conv_emitter_test.cc | 2 +- .../xla/service/mlir_gpu/kernel_lowering.cc | 6 ++-- third_party/mlir/BUILD | 34 +++++++++---------- third_party/mlir/test.BUILD | 2 +- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index a57e4300d6e..07655a61074 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -185,10 +185,10 @@ cc_library( "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgToLLVM", "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:LoopsToGPUPass", "@llvm-project//mlir:NVVMDialect", "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFToGPUPass", "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc index 56684b1f726..d5cad385324 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include "llvm/Support/raw_ostream.h" -#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 847ad918308..4645b084eb6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -19,8 +19,8 @@ limitations under the License. 
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" // from @llvm-project -#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" // from @llvm-project -#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project #include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project @@ -351,7 +351,7 @@ struct FixKernelFunctionSignatures struct MapParallelLoops : public mlir::PassWrapper { void runOnFunction() override { - mlir::greedilyMapParallelLoopsToGPU(getFunction().getBody()); + mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); } }; diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 93843d58f30..5636bc27cff 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -2005,9 +2005,9 @@ cc_library( ) cc_library( - name = "LoopsToGPU", - srcs = ["lib/Conversion/LoopsToGPU/LoopsToGPU.cpp"], - hdrs = ["include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h"], + name = "SCFToGPU", + srcs = ["lib/Conversion/SCFToGPU/SCFToGPU.cpp"], + hdrs = ["include/mlir/Conversion/SCFToGPU/SCFToGPU.h"], includes = ["include"], deps = [ ":Affine", @@ -2027,22 +2027,22 @@ cc_library( ) cc_library( - name = "LoopsToGPUPass", + name = "SCFToGPUPass", srcs = [ - "lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp", "lib/Conversion/PassDetail.h", + "lib/Conversion/SCFToGPU/SCFToGPUPass.cpp", ], hdrs = [ - "include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h", + "include/mlir/Conversion/SCFToGPU/SCFToGPUPass.h", ], includes = ["include"], deps = [ ":Affine", ":ConversionPassIncGen", ":GPUDialect", - ":LoopsToGPU", ":Pass", ":SCFDialect", + ":SCFToGPU", ":StandardOps", ":Support", ":Transforms", @@ -2053,11 +2053,11 @@ cc_library( cc_library( name = "CFGTransforms", srcs = [ - "lib/Conversion/LoopToStandard/LoopToStandard.cpp", "lib/Conversion/PassDetail.h", + "lib/Conversion/SCFToStandard/SCFToStandard.cpp", ], hdrs = [ - "include/mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h", + "include/mlir/Conversion/SCFToStandard/SCFToStandard.h", ], includes = ["include"], deps = [ @@ -2468,7 +2468,7 @@ cc_library( ":Support", ":Transforms", ":VectorToLLVM", - ":VectorToLoops", + ":VectorToSCF", "@llvm-project//llvm:support", "@llvm-project//mlir/test:TestAffine", "@llvm-project//mlir/test:TestDialect", @@ -2547,13 +2547,13 @@ cc_library( ":LinalgToStandard", ":LinalgTransforms", ":LoopPassIncGen", - ":LoopsToGPUPass", ":NVVMDialect", ":OpenMPDialect", ":QuantOps", ":QuantPassIncGen", ":ROCDLDialect", ":SCFDialect", + ":SCFToGPUPass", ":SCFTransforms", ":SDBM", ":SPIRVDialect", @@ -2602,11 +2602,11 @@ cc_binary( deps = [ ":Analysis", ":IR", - ":LoopsToGPUPass", ":MlirOptLib", ":MlirOptMain", ":OpenMPDialect", ":QuantOps", + ":SCFToGPUPass", ":Transforms", "@llvm-project//llvm:all_targets", "@llvm-project//llvm:support", @@ -3117,7 +3117,7 @@ cc_library( ":Support", ":Transforms", ":VectorToLLVM", - ":VectorToLoops", + ":VectorToSCF", "@llvm-project//llvm:core", "@llvm-project//llvm:support", ], @@ -3355,13 +3355,13 @@ cc_library( ) cc_library( - name = "VectorToLoops", + name = 
"VectorToSCF", srcs = glob([ - "lib/Conversion/VectorToLoops/*.cpp", - "lib/Conversion/VectorToLoops/*.h", + "lib/Conversion/VectorToSCF/*.cpp", + "lib/Conversion/VectorToSCF/*.h", ]), hdrs = glob([ - "include/mlir/Conversion/VectorToLoops/*.h", + "include/mlir/Conversion/VectorToSCF/*.h", ]), includes = ["include"], deps = [ diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index eb5d8a650eb..24b310f076e 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -171,7 +171,7 @@ cc_library( "@llvm-project//mlir:Transforms", "@llvm-project//mlir:VectorOps", "@llvm-project//mlir:VectorToLLVM", - "@llvm-project//mlir:VectorToLoops", + "@llvm-project//mlir:VectorToSCF", ], ) From 9957cb60a248ba1e61d5606a3d0a189290f36b37 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 15 May 2020 10:05:35 -0700 Subject: [PATCH 0662/1533] Bump open source llvm revision to 9d4b4f344d8ea917e082cf58d66b71c0171e1650 PiperOrigin-RevId: 311751290 Change-Id: Ie8366f82180116dd363c3ed7ece36f948196bf1b --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index c3d097a8362..949c6920e33 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "bfa200ebcf3706fde0dde335a3c1fa3fe1b3ba3f" - LLVM_SHA256 = "72deefcfe20434cb27a31ff9503c348dcf21065dbd27e9fa54c1fb3f5089b8e1" + LLVM_COMMIT = "9d4b4f344d8ea917e082cf58d66b71c0171e1650" + LLVM_SHA256 = "36e4470b5656cea3e0afb218edbdd96376fcb51dc2c5ed887b21237068baee41" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 53c634a6c150da732dcd6305478ffecd6a887668 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Fri, 15 May 2020 10:17:05 -0700 Subject: [PATCH 0663/1533] [MLIR/XLA] Constant sinking to control flow regions. This is necessary for exporting to XLA since functional control flow is expected. 
PiperOrigin-RevId: 311753796 Change-Id: If4e50a3b2fa668f162c9b30cc80e2bf743a9b641 --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../tensorflow/utils/compile_mlir_util.cc | 4 + tensorflow/compiler/mlir/xla/BUILD | 19 +++++ .../tests/sink-constants-to-control-flow.mlir | 60 +++++++++++++ .../compiler/mlir/xla/transforms/passes.h | 4 + .../sink_constants_to_control_flow.cc | 85 +++++++++++++++++++ 6 files changed, 173 insertions(+) create mode 100644 tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir create mode 100644 tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 54b560ed6ce..eb220a31f80 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1140,6 +1140,7 @@ COMPILE_MLIR_UTIL_DEPS = [ "//tensorflow/compiler/mlir/xla:type_to_shape", "//tensorflow/compiler/mlir/xla:xla_legalize_tf", "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/mlir/xla:xla_sink_constants_to_control_flow", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index e8ca691f961..03283da0112 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -305,6 +305,10 @@ Status ConvertMLIRToXlaComputation( // invocation. tf2xla.addNestedPass( mlir::xla_hlo::createLegalizeTFPass(false)); + // In order to export to XLA, we must sink constants to control flow regions, + // since XLA uses functional control flow. + tf2xla.addNestedPass( + mlir::xla_hlo::createSinkConstantsToControlFlowPass()); if (VLOG_IS_ON(1)) { // Print the whole module after each pass which requires disabling diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 12334e463fa..179a637ec7b 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -193,6 +193,24 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_sink_constants_to_control_flow", + srcs = [ + "transforms/sink_constants_to_control_flow.cc", + ], + deps = [ + ":hlo", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "map_xla_to_scalar_op", hdrs = ["transforms/map_xla_to_scalar_op.h"], @@ -873,6 +891,7 @@ cc_library( ":xla_legalize_to_standard", ":xla_lower", ":xla_materialize_broadcasts", + ":xla_sink_constants_to_control_flow", ":xla_test_passes", ], ) diff --git a/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir new file mode 100644 index 00000000000..c2fbad2faec --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir @@ -0,0 +1,60 @@ +// RUN: xla-opt %s -xla-hlo-sink-constants-to-control-flow | FileCheck %s --dump-input=fail + +// Tests sinking constants to a while loop. 
+ +// CHECK-LABEL: func @sink_const_to_while +func @sink_const_to_while(%arg0: tensor) -> tensor { + // CHECK-NEXT: xla_hlo.while + %c0 = xla_hlo.constant dense<1> : tensor + %c1 = xla_hlo.constant dense<2> : tensor + %0 = "xla_hlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + // CHECK: %[[ARG1A:.+]]: tensor + // CHECK: %[[C0:.+]] = xla_hlo.constant dense<1> : tensor + // CHECK: "xla_hlo.compare"(%[[C0]], %[[ARG1A]]) + %1 = "xla_hlo.compare"(%c0, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + // CHECK: %[[ARG1B:.+]]: tensor + // CHECK-DAG: %[[C1:.+]] = xla_hlo.constant dense<2> : tensor + // CHECK-DAG: %[[ADD0:.+]] = xla_hlo.add %[[ARG1B]], %[[ARG1B]] + %2 = xla_hlo.add %arg1, %arg1 : tensor + // CHECK: %[[ADD1:.+]] = xla_hlo.add %[[C1]], %[[ADD0]] + %3 = xla_hlo.add %c1, %2 : tensor + // CHECK: %[[ADD2:.+]] = xla_hlo.add %[[C1]], %[[ADD1]] + %4 = xla_hlo.add %c1, %3 : tensor + "xla_hlo.return"(%4) : (tensor) -> () + }) : (tensor) -> tensor + return %0 : tensor +} + +// Tests sinking constants to a conditional op. + +// CHECK-LABEL: func @sink_const_to_conditional +func @sink_const_to_conditional(%arg0: tensor) -> tensor { + %c0 = xla_hlo.constant dense<1> : tensor + %c1 = xla_hlo.constant dense<2> : tensor + %0 = "xla_hlo.compare"(%arg0, %c0) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + %1 = "xla_hlo.tuple"(%arg0) : (tensor) -> tuple> + // CHECK: xla_hlo.conditional + %2 = "xla_hlo.conditional"(%0, %1, %1) ( { + ^bb0(%arg1: tuple>): + // CHECK: %[[C0:.+]] = xla_hlo.constant dense<1> : tensor + %3 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor + // CHECK: %[[ADD0:.+]] = xla_hlo.add %[[C0]], + %4 = xla_hlo.add %c0, %3 : tensor + %5 = "xla_hlo.tuple"(%4) : (tensor) -> tuple> + "xla_hlo.return"(%5) : (tuple>) -> () + }, { + ^bb0(%arg1: tuple>): + // CHECK: %[[C1:.+]] = xla_hlo.constant dense<2> : tensor + %6 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor + // CHECK: %[[ADD1:.+]] = xla_hlo.add %[[C1]], + %7 = xla_hlo.add %c1, %6 : tensor + %8 = "xla_hlo.tuple"(%7) : (tensor) -> tuple> + "xla_hlo.return"(%8) : (tuple>) -> () + }) : (tensor, tuple>, tuple>) -> tuple> + %9 = "xla_hlo.get_tuple_element"(%2) {index = 0 : i32} : (tuple>) -> tensor + return %9 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 39375e210d5..b148eac4286 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -65,6 +65,10 @@ std::unique_ptr> createLegalizeToLhloPass(); // Lowers from HLO dialect to Linalg dialect. std::unique_ptr> createLegalizeHloToLinalgPass(); +// Sinks constants implicitly captured in control flow regions. This is +// necessary to export to XLA. +std::unique_ptr> createSinkConstantsToControlFlowPass(); + } // namespace xla_hlo namespace xla_lhlo { diff --git a/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc new file mode 100644 index 00000000000..29646465acd --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" + +namespace mlir { +namespace xla_hlo { + +namespace { + +// A pass that sinks constants implicitly captured in control flow regions. This +// is necessary to export to XLA. +class SinkConstantsToControlFlow + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([](Operation* op) { + if (auto while_op = llvm::dyn_cast(op)) { + SinkToRegion(&while_op.body()); + SinkToRegion(&while_op.cond()); + } else if (auto cond_op = llvm::dyn_cast(op)) { + SinkToRegion(&cond_op.true_branch()); + SinkToRegion(&cond_op.false_branch()); + } + }); + } + + private: + // Performs constant sinking into a region. + static void SinkToRegion(Region* region) { + llvm::DenseMap sunk_constant; + visitUsedValuesDefinedAbove({*region}, [&](OpOperand* use) { + Value constant = use->get(); + auto const_op = dyn_cast_or_null(constant.getDefiningOp()); + if (!const_op) return; + auto map_entry = sunk_constant.try_emplace(constant, nullptr); + if (!map_entry.second) { + // This constant has already been cloned into the region, reuse it. + use->set(map_entry.first->getSecond().getResult()); + if (constant.use_empty()) const_op.erase(); + return; + } + if (constant.hasOneUse()) { + const_op.getOperation()->moveBefore(®ion->front().front()); + return; + } + map_entry.first->getSecond() = const_op.clone(); + region->front().getOperations().insert(region->front().begin(), + map_entry.first->getSecond()); + use->set(map_entry.first->getSecond().getResult()); + }); + } +}; + +static mlir::PassRegistration pass( + "xla-hlo-sink-constants-to-control-flow", + "Sink constants implicitly captured in control flow regions. This is " + "necessary to export to XLA."); + +} // anonymous namespace + +std::unique_ptr> createSinkConstantsToControlFlowPass() { + return std::make_unique(); +} + +} // namespace xla_hlo +} // namespace mlir From 2540d202b5b798c7cea953b60247b834bef3ca07 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Fri, 15 May 2020 10:19:17 -0700 Subject: [PATCH 0664/1533] Fix TF2XLA's InitGraph for unused feeds. If a feed is not used, previously it would prune the placeholders and cause crashes. 
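(Editorial illustration, not part of the patch.) A hypothetical tfcompile-style
tf2xla Config in which one feed has no path to any fetch; the node names are
made up. Before this change, pruning the GraphDef could drop the placeholder
that backs such a feed, and the later rewrite that turns feeds into _Arg nodes
could then crash:

    feed { id { node_name: "x" } }
    feed { id { node_name: "y" } }
    feed { id { node_name: "unused" } }  # placeholder no fetch depends on
    fetch { id { node_name: "x_y_sum" } }

With this change, the placeholders backing all feeds are added as extra fetch
roots during pruning, so the unused placeholder survives and still becomes an
argument of the compiled computation.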
PiperOrigin-RevId: 311754319 Change-Id: Ie1ad67c21ffb83ba88aeabea94c416473df099a0 --- .../compiler/tf2xla/graph_compiler_util.cc | 27 ++++++++++---- tensorflow/compiler/tf2xla/tf2xla_test.cc | 37 +++++++++++++++++++ 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/tf2xla/graph_compiler_util.cc b/tensorflow/compiler/tf2xla/graph_compiler_util.cc index 57278eea292..a9385e05564 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler_util.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler_util.cc @@ -49,10 +49,12 @@ typedef std::unordered_map NodeMap; // Each feed id identifies the positional output of some node, which may consist // of multiple edges. AddPlaceholdersForFeeds has already replaced each fed // tensor with a placeholder. For each feed tensor, replaces all edges so they -// point from a new _Arg node instead. +// point from a new _Arg node instead. The newly created _Arg nodes are added to +// `arg_nodes`. Status AddArgNodes(Graph* graph, const NodeMap& node_map, const protobuf::RepeatedPtrField& feeds, - const std::unordered_map& feed_remapping) { + const std::unordered_map& feed_remapping, + std::unordered_set* arg_nodes) { for (int arg_index = 0; arg_index < feeds.size(); ++arg_index) { const tf2xla::Feed& feed = feeds[arg_index]; // All feeds have been replaced by placeholders. @@ -86,6 +88,7 @@ Status AddArgNodes(Graph* graph, const NodeMap& node_map, .Attr(kShapeAttr, TensorShape(feed.shape())) .Attr(kDebugNameAttr, feed.name()) .Finalize(graph, &arg_node)); + arg_nodes->insert(arg_node); // Collects out-edges from the feed node that have a matching edge index; // these will be replaced with edges from the arg node instead. @@ -149,13 +152,13 @@ Status RewriteAndPruneGraph( for (Node* n : graph->nodes()) { node_map[n->name()] = n; } + std::unordered_set nodes_to_keep; + TF_RETURN_IF_ERROR(AddArgNodes(graph, node_map, config.feed(), feed_remapping, + &nodes_to_keep)); TF_RETURN_IF_ERROR( - AddArgNodes(graph, node_map, config.feed(), feed_remapping)); - std::unordered_set retval_nodes; - TF_RETURN_IF_ERROR( - AddRetvalNodes(graph, node_map, config.fetch(), &retval_nodes)); + AddRetvalNodes(graph, node_map, config.fetch(), &nodes_to_keep)); VLOG(2) << "Post rewrite: " << DumpGraphToFile("tf2xla_post_rewrite", *graph); - PruneForReverseReachability(graph, std::move(retval_nodes)); + PruneForReverseReachability(graph, std::move(nodes_to_keep)); FixupSourceAndSinkEdges(graph); VLOG(2) << "Post prune: " << DumpGraphToFile("tfcompile_post_prune", *graph); // Sanity-check, to make sure the feeds and fetches still exist post-pruning. @@ -277,8 +280,16 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, // Prune the GraphDef first so that unknown ops that we aren't compiling get // filtered out. GraphDef second_copy_def; + // Add the placeholder nodes as "fetches" in prune_config, such that they will + // be preserved in PruneGraphDefInto. 
+ auto prune_config = config; + for (const auto& entry : feed_remapping) { + auto ph = prune_config.add_fetch(); + *ph->mutable_id()->mutable_node_name() = entry.second; + ph->mutable_id()->set_output_index(0); + } TF_RETURN_IF_ERROR( - PruneGraphDefInto(config, first_copy_def, &second_copy_def)); + PruneGraphDefInto(prune_config, first_copy_def, &second_copy_def)); TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef( &second_copy_def, *g->op_registry(), /*node_offset=*/0)); diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index 24afe595b18..7ea69f734c9 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -99,5 +99,42 @@ TEST(ConvertGraphDefToXla, Sum) { ConvertGraphDefToXla(graph_def, config, client, &computation))); } +TEST(ConvertGraphDefToXla, SumWithUnusedArgument) { + GraphDef graph_def = SumGraph(); + tf2xla::Config config = SumConfig(); + NodeDef* unused = graph_def.add_node(); + unused->set_name("unused"); + unused->set_op("Placeholder"); + (*unused->mutable_attr())["dtype"] = TypeAttrValue(DT_INT32); + config.add_feed()->mutable_id()->set_node_name("unused"); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + // Set up arguments. + auto x_literal = xla::LiteralUtil::CreateR0(10); + auto y_literal = xla::LiteralUtil::CreateR0(32); + auto x_global_or = client->TransferToServer(x_literal); + auto y_global_or = client->TransferToServer(y_literal); + auto unused_global_or = client->TransferToServer(y_literal); + TF_EXPECT_OK(x_global_or.status()); + TF_EXPECT_OK(y_global_or.status()); + TF_EXPECT_OK(unused_global_or.status()); + std::unique_ptr x_global = + std::move(x_global_or.ValueOrDie()); + std::unique_ptr y_global = + std::move(y_global_or.ValueOrDie()); + std::unique_ptr unused_global = + std::move(unused_global_or.ValueOrDie()); + + // Execute and check result. + auto result_or = client->ExecuteAndTransfer( + computation, {x_global.get(), y_global.get(), unused_global.get()}); + TF_EXPECT_OK(result_or.status()); + xla::Literal result = std::move(result_or.ValueOrDie()); + EXPECT_EQ("(\ns32[] 42\n)", result.ToString()); +} + } // namespace } // namespace tensorflow From 76d3d13b5ad112300796a2f78be26031f9c71571 Mon Sep 17 00:00:00 2001 From: Rajeshwar Reddy T <43972606+rthadur@users.noreply.github.com> Date: Fri, 15 May 2020 10:27:17 -0700 Subject: [PATCH 0665/1533] Create bot_config.yml --- .github/bot_config.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/bot_config.yml diff --git a/.github/bot_config.yml b/.github/bot_config.yml new file mode 100644 index 00000000000..d63bd2ce844 --- /dev/null +++ b/.github/bot_config.yml @@ -0,0 +1,29 @@ + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ # ============================================================================ + # + # THIS IS A GENERATED DOCKERFILE. + # + # This file was assembled from multiple pieces, whose use is documented + # throughout. Please refer to the TensorFlow dockerfiles documentation + # for more information. + +# A list of assignees +assignees: + - amahendrakar + - ravikyram + - Saduf2019 +# A list of assignees for +compiler_assignees: + - joker-eph From d62a22a30000f11bde298daa86d82004e8531767 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Fri, 15 May 2020 10:43:51 -0700 Subject: [PATCH 0666/1533] Extend Keras Lambda layers to work with functions of any signature rather than only functions that take one argument. Any *args and **kwargs passed when calling the lambda layer will be forwarded directly to the underlying lambda. PiperOrigin-RevId: 311759844 Change-Id: Ia5ffe17f2951e4fd42d9ee4020c7c8b35ef9122f --- tensorflow/python/keras/layers/core.py | 43 ++++++++++--------- tensorflow/python/keras/layers/core_test.py | 20 +++++++++ .../v1/tensorflow.keras.layers.-lambda.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-lambda.pbtxt | 2 +- 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index db9c47eca17..d1528c7ba59 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -53,7 +53,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging from tensorflow.python.training.tracking import base as trackable from tensorflow.python.util import nest -from tensorflow.python.util import tf_inspect +from tensorflow.python.util import tf_decorator from tensorflow.python.util.tf_export import keras_export @@ -738,7 +738,8 @@ class Lambda(Layer): models. `Lambda` layers are best suited for simple operations or quick experimentation. For more advanced use cases, follow [this guide](https://www.tensorflow.org/guide/keras/custom_layers_and_models) - for subclassing `tf.keras.layers.Layer`. + for subclassing `tf.keras.layers.Layer`. (Do not subclass + `tf.keras.layers.Lamba`.) The main reason to subclass `tf.keras.layers.Layer` instead of using a `Lambda` layer is saving and inspecting a Model. `Lambda` layers @@ -798,8 +799,7 @@ class Lambda(Layer): computation, but anything more complex should use a subclass Layer instead. Arguments: - function: The function to be evaluated. Takes input tensor as first - argument. + function: The function to evaluate when the layer is called. output_shape: Expected output shape from function. This argument can be inferred if not explicitly provided. Can be a tuple or function. If a tuple, it only specifies the first dimension onward; @@ -812,8 +812,8 @@ class Lambda(Layer): mask: Either None (indicating no masking) or a callable with the same signature as the `compute_mask` layer method, or a tensor that will be returned as output mask regardless of what the input is. - arguments: Optional dictionary of keyword arguments to be passed to the - function. + arguments: Optional dictionary of keyword arguments to pass by default to + the function when those arguments are not passed to the layer call. Input shape: Arbitrary. Use the keyword argument input_shape (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. 
@@ -823,11 +823,16 @@ class Lambda(Layer): @trackable.no_automatic_dependency_tracking def __init__(self, function, output_shape=None, mask=None, arguments=None, **kwargs): - super(Lambda, self).__init__(**kwargs) - self.arguments = arguments or {} self.function = function + # Decorate the function to produce this layer's call method + def _call_wrapper(*args, **kwargs): + return self._call_wrapper(*args, **kwargs) + self.call = tf_decorator.make_decorator(function, _call_wrapper) + + super(Lambda, self).__init__(**kwargs) + if mask is not None: self.supports_masking = True self.mask = mask @@ -836,9 +841,8 @@ class Lambda(Layer): # Warning on every invocation will be quite irksome in Eager mode. self._already_warned = False - function_args = tf_inspect.getfullargspec(function).args - self._fn_expects_training_arg = 'training' in function_args - self._fn_expects_mask_arg = 'mask' in function_args + self._expects_training_arg = 'training' in self._call_fn_args + self._expects_mask_arg = 'mask' in self._call_fn_args @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): @@ -869,23 +873,22 @@ class Lambda(Layer): output_shapes = tf_utils.convert_shapes(self._output_shape, to_tuples=False) return nest.map_structure(_add_batch, output_shapes) - def call(self, inputs, mask=None, training=None): + def _call_wrapper(self, *args, **kwargs): # We must copy for thread safety, but it only needs to be a shallow copy. - kwargs = {k: v for k, v in self.arguments.items()} - if self._fn_expects_mask_arg: - kwargs['mask'] = mask - if self._fn_expects_training_arg: - kwargs['training'] = training + call_kwargs = {k: v for k, v in self.arguments.items()} + + # override default kwargs with the args passed to the layer call + call_kwargs.update(kwargs) created_variables = [] - def _variable_creator(next_creator, **kwargs): - var = next_creator(**kwargs) + def _variable_creator(next_creator, **creator_kwargs): + var = next_creator(**creator_kwargs) created_variables.append(var) return var with backprop.GradientTape(watch_accessed_variables=True) as tape,\ variable_scope.variable_creator_scope(_variable_creator): - result = self.function(inputs, **kwargs) + result = self.function(*args, **call_kwargs) self._check_variables(created_variables, tape.watched_variables()) return result diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py index 3daa187f1ce..aa1192e12fc 100644 --- a/tensorflow/python/keras/layers/core_test.py +++ b/tensorflow/python/keras/layers/core_test.py @@ -139,6 +139,26 @@ class LambdaLayerTest(keras_parameterized.TestCase): out = ld([x1, x2]) self.assertAllEqual(out.shape, [3, 2]) + def test_lambda_multiple_args(self): + ld = keras.layers.Lambda(lambda x, y: x[0] + y) + x1 = np.ones([3, 2], np.float32) + x2 = np.ones([3, 5], np.float32) + + expected_result = x1 * 2 + self.assertAllEqual(ld([x1, x2], x1), expected_result) + self.assertAllEqual(ld([x1, x2], y=x1), expected_result) + self.assertAllEqual(ld(x=[x1, x2], y=x1), expected_result) + + def test_lambda_constructor_args_and_multiple_args(self): + x1 = np.ones([3, 2], np.float32) + x2 = np.ones([3, 5], np.float32) + ld = keras.layers.Lambda(lambda x, y: x[0] + y, arguments={'y': x1*2}) + + self.assertAllEqual(ld([x1, x2]), x1 * 3) + self.assertAllEqual(ld([x1, x2], y=x1), x1 * 2) + self.assertAllEqual(ld(x=[x1, x2]), x1 * 3) + self.assertAllEqual(ld(x=[x1, x2], y=x1), x1 * 2) + def test_lambda_output_shape(self): l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 
1)) l(keras.backend.variable(np.ones((1, 1)))) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt index 22fa730112f..d4dbe96d1ba 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt @@ -145,7 +145,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt index 22fa730112f..d4dbe96d1ba 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt @@ -145,7 +145,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" } member_method { name: "compute_mask" From bd010a095ee4eca62b39ac54e0d96e93adf49672 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 10:45:17 -0700 Subject: [PATCH 0667/1533] String formatting in assertAllEqual() fails in Python 3 because bytestring may be converted using %s. Use %r to fix this. PiperOrigin-RevId: 311760220 Change-Id: Ia46073b51bc38b8e88016edab37bc34ceebd5d7f --- tensorflow/python/framework/test_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index d5bbd889166..4981e1b68fd 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -2686,7 +2686,7 @@ class TensorFlowTestCase(googletest.TestCase): if (b.ndim <= 3 or b.size < 500): self.assertEqual( a.shape, b.shape, "Shape mismatch: expected %s, got %s." - " Contents: %s. \n%s." % (a.shape, b.shape, b, msg)) + " Contents: %r. \n%s." % (a.shape, b.shape, b, msg)) else: self.assertEqual( a.shape, b.shape, "Shape mismatch: expected %s, got %s." @@ -2709,8 +2709,8 @@ class TensorFlowTestCase(googletest.TestCase): else: # np.where is broken for scalars x, y = a, b - msgs.append("not equal lhs = {}".format(x)) - msgs.append("not equal rhs = {}".format(y)) + msgs.append("not equal lhs = %r" % x) + msgs.append("not equal rhs = %r" % y) # With Python 3, we need to make sure the dtype matches between a and b. b = b.astype(a.dtype) np.testing.assert_array_equal(a, b, err_msg="\n".join(msgs)) From 262e92804b465874927d48be30311147692dd7a9 Mon Sep 17 00:00:00 2001 From: Xunkai Zhang Date: Fri, 15 May 2020 10:46:10 -0700 Subject: [PATCH 0668/1533] [tfls.util] Remove tensorflow-lite-gpu from dependencies. 
PiperOrigin-RevId: 311760392 Change-Id: Ia8fe0682cfda037589f7546f1e70974c1be439c5 --- .../org/tensorflow/lite/gpu/GpuDelegate.java | 16 +++-- .../lite/experimental/support/java/BUILD | 19 ++++- .../lite/support/model/GpuDelegateProxy.java | 69 +++++++++++++++++++ .../tensorflow/lite/support/model/Model.java | 23 ++++--- 4 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java index 895f12f0233..78cab0d2cbf 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java +++ b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java @@ -17,18 +17,19 @@ package org.tensorflow.lite.gpu; import java.io.Closeable; import org.tensorflow.lite.Delegate; +import org.tensorflow.lite.annotations.UsedByReflection; /** * {@link Delegate} for GPU inference. * - *
<p>
Note: When calling {@code Interpreter.modifyGraphWithDelegate()}/ - * {@code Interpreter.Options.addDelegate()} and {@code Interpreter.run()}, the caller must have an - * {@code EGLContext} in the current thread and {@code Interpreter.run()} must be called from - * the same {@code EGLContext}. If an {@code EGLContext} does not exist, the delegate will - * internally create one, but then the developer must ensure that {@code Interpreter.run()} is - * always called from the same thread in which {@code Interpreter.modifyGraphWithDelegate()} was - * called. + *
<p>
Note: When calling {@code Interpreter.modifyGraphWithDelegate()}/ {@code + * Interpreter.Options.addDelegate()} and {@code Interpreter.run()}, the caller must have an {@code + * EGLContext} in the current thread and {@code Interpreter.run()} must be called from the + * same {@code EGLContext}. If an {@code EGLContext} does not exist, the delegate will internally + * create one, but then the developer must ensure that {@code Interpreter.run()} is always called + * from the same thread in which {@code Interpreter.modifyGraphWithDelegate()} was called. */ +@UsedByReflection("TFLiteSupport/model/GpuDelegateProxy") public class GpuDelegate implements Delegate, Closeable { private static final long INVALID_DELEGATE_HANDLE = 0; @@ -98,6 +99,7 @@ public class GpuDelegate implements Delegate, Closeable { options.inferencePreference); } + @UsedByReflection("TFLiteSupport/model/GpuDelegateProxy") public GpuDelegate() { this(new Options()); } diff --git a/tensorflow/lite/experimental/support/java/BUILD b/tensorflow/lite/experimental/support/java/BUILD index 43e984a0cb8..85f5da17193 100644 --- a/tensorflow/lite/experimental/support/java/BUILD +++ b/tensorflow/lite/experimental/support/java/BUILD @@ -9,7 +9,24 @@ package( licenses = ["notice"], # Apache 2.0 ) +# TODO(b/156482505): The NOGPU target is a temporary target. Internally, people +# may already depend on "tensorflow-lite-support" so we shouldn't remove GPU +# from its dependency. We will have CLs to help users migrate. After migration +# is done, the "NOGPU" target will be removed. +android_library( + name = "tensorflow-lite-support-nogpu", + srcs = glob(["src/java/org/tensorflow/lite/support/**/*.java"]), + javacopts = JAVACOPTS, + manifest = "AndroidManifest.xml", + deps = [ + "//tensorflow/lite/java:tensorflowlite", + "@org_checkerframework_qual", + ], +) + # TODO(138904786): Split Java part and Android part to make the support library usable by pure Java. +# For new users: Please use "tensorflow-lite-support-nogpu" if possible, and +# additionally depends on "tensorflowlite_gpu" if needed. android_library( name = "tensorflow-lite-support", srcs = glob(["src/java/org/tensorflow/lite/support/**/*.java"]), @@ -17,7 +34,7 @@ android_library( manifest = "AndroidManifest.xml", deps = [ "//tensorflow/lite/java:tensorflowlite", - "//tensorflow/lite/java:tensorflowlite_gpu", + "//tensorflow/lite/java:tensorflowlite_gpu", # unuseddeps: keep "@org_checkerframework_qual", ], ) diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java new file mode 100644 index 00000000000..9cfcf923ded --- /dev/null +++ b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +package org.tensorflow.lite.support.model; + +import android.util.Log; +import java.io.Closeable; +import java.io.IOException; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.tensorflow.lite.Delegate; + +/** + * Helper class to create and call necessary methods of {@code GpuDelegate} which is not a strict + * dependency. + */ +class GpuDelegateProxy implements Delegate, Closeable { + + private static final String TAG = "GpuDelegateProxy"; + + private final Delegate proxiedDelegate; + private final Closeable proxiedCloseable; + + @Nullable + public static GpuDelegateProxy maybeNewInstance() { + try { + Class clazz = Class.forName("org.tensorflow.lite.gpu.GpuDelegate"); + Object instance = clazz.getDeclaredConstructor().newInstance(); + return new GpuDelegateProxy(instance); + } catch (ReflectiveOperationException e) { + Log.e(TAG, "Failed to create the GpuDelegate dynamically.", e); + return null; + } + } + + /** Calls {@code close()} method of the delegate. */ + @Override + public void close() { + try { + proxiedCloseable.close(); + } catch (IOException e) { + // Should not trigger, because GpuDelegate#close never throws. The catch is required because + // of Closeable#close. + Log.e(TAG, "Failed to close the GpuDelegate.", e); + } + } + + /** Calls {@code getNativeHandle()} method of the delegate. */ + @Override + public long getNativeHandle() { + return proxiedDelegate.getNativeHandle(); + } + + private GpuDelegateProxy(Object instance) { + this.proxiedCloseable = (Closeable) instance; + this.proxiedDelegate = (Delegate) instance; + } +} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java index c7f9e83f692..40659e39848 100644 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java +++ b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java @@ -22,7 +22,6 @@ import java.util.Map; import org.checkerframework.checker.nullness.qual.NonNull; import org.checkerframework.checker.nullness.qual.Nullable; import org.tensorflow.lite.Interpreter; -import org.tensorflow.lite.gpu.GpuDelegate; import org.tensorflow.lite.support.common.FileUtil; import org.tensorflow.lite.support.common.SupportPreconditions; @@ -91,7 +90,7 @@ public class Model { /** The memory-mapped model data. */ private final MappedByteBuffer byteModel; - private final GpuDelegate gpuDelegate; + private final GpuDelegateProxy gpuDelegateProxy; /** * Builder for {@link Model}. @@ -181,24 +180,30 @@ public class Model { * @param modelPath The original path of the model. It can be fetched later by {@link * Model#getPath()}. * @param options The options for running the model. + * @throws IllegalArgumentException if {@code options.device} is {@link Device#GPU} but + * "tensorflow-lite-gpu" is not linked to the project. */ public static Model createModel( @NonNull MappedByteBuffer byteModel, @NonNull String modelPath, @NonNull Options options) { Interpreter.Options interpreterOptions = new Interpreter.Options(); - GpuDelegate gpuDelegate = options.device.equals(Device.GPU) ? 
new GpuDelegate() : null; + GpuDelegateProxy gpuDelegateProxy = null; switch (options.device) { case NNAPI: interpreterOptions.setUseNNAPI(true); break; case GPU: - interpreterOptions.addDelegate(gpuDelegate); + gpuDelegateProxy = GpuDelegateProxy.maybeNewInstance(); + SupportPreconditions.checkArgument( + gpuDelegateProxy != null, + "Cannot inference with GPU. Did you add \"tensorflow-lite-gpu\" as dependency?"); + interpreterOptions.addDelegate(gpuDelegateProxy); break; case CPU: break; } interpreterOptions.setNumThreads(options.numThreads); Interpreter interpreter = new Interpreter(byteModel, interpreterOptions); - return new Model(modelPath, byteModel, interpreter, gpuDelegate); + return new Model(modelPath, byteModel, interpreter, gpuDelegateProxy); } /** Returns the memory-mapped model data. */ @@ -243,8 +248,8 @@ public class Model { if (interpreter != null) { interpreter.close(); } - if (gpuDelegate != null) { - gpuDelegate.close(); + if (gpuDelegateProxy != null) { + gpuDelegateProxy.close(); } } @@ -252,10 +257,10 @@ public class Model { @NonNull String modelPath, @NonNull MappedByteBuffer byteModel, @NonNull Interpreter interpreter, - @Nullable GpuDelegate gpuDelegate) { + @Nullable GpuDelegateProxy gpuDelegateProxy) { this.modelPath = modelPath; this.byteModel = byteModel; this.interpreter = interpreter; - this.gpuDelegate = gpuDelegate; + this.gpuDelegateProxy = gpuDelegateProxy; } } From 26104505b8267c6f08493869e64e59af2ed62326 Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Fri, 15 May 2020 10:50:40 -0700 Subject: [PATCH 0669/1533] [XLA:Python] Expose new use_spmd_partitioning compile option. PiperOrigin-RevId: 311761373 Change-Id: I1f696e0c082295dc0e6896f05d1e88525de7ce70 --- tensorflow/compiler/xla/python/xla.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index f03595bf677..65fb5311994 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -1406,7 +1406,10 @@ PYBIND11_MODULE(xla_extension, m) { options.device_assignment()) : absl::nullopt; }, - &ExecutableBuildOptions::set_device_assignment); + &ExecutableBuildOptions::set_device_assignment) + .def_property("use_spmd_partitioning", + &ExecutableBuildOptions::use_spmd_partitioning, + &ExecutableBuildOptions::set_use_spmd_partitioning); py::class_(m, "XlaComputation") .def(py::init([](const py::bytes& serialized_hlo_module_proto) From 77245d07d13522a5cb5d060390fffa1894df5bbf Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Fri, 15 May 2020 10:58:42 -0700 Subject: [PATCH 0670/1533] Add dispatch support to more Python APIs. 
PiperOrigin-RevId: 311763060 Change-Id: Ib35371483aa083e245996508a82fd13d8ac43131 --- tensorflow/python/keras/activations.py | 16 +++ tensorflow/python/keras/backend.py | 104 ++++++++++++++++++ tensorflow/python/keras/backend_config.py | 3 + tensorflow/python/keras/losses.py | 16 +++ tensorflow/python/keras/metrics.py | 6 + tensorflow/python/ops/array_ops.py | 47 ++++++++ .../python/ops/candidate_sampling_ops.py | 6 + tensorflow/python/ops/check_ops.py | 42 +++++++ tensorflow/python/ops/clip_ops.py | 4 + tensorflow/python/ops/confusion_matrix.py | 3 + tensorflow/python/ops/control_flow_ops.py | 8 ++ tensorflow/python/ops/ctc_ops.py | 9 ++ tensorflow/python/ops/embedding_ops.py | 7 ++ tensorflow/python/ops/functional_ops.py | 7 ++ tensorflow/python/ops/histogram_ops.py | 3 + tensorflow/python/ops/image_ops_impl.py | 80 ++++++++++++-- tensorflow/python/ops/linalg/linalg_impl.py | 12 +- .../ops/linalg/sparse/conjugate_gradient.py | 2 + tensorflow/python/ops/linalg_ops.py | 12 ++ tensorflow/python/ops/logging_ops.py | 3 + tensorflow/python/ops/losses/losses_impl.py | 12 ++ tensorflow/python/ops/manip_ops.py | 2 + tensorflow/python/ops/math_ops.py | 56 +++++++++- tensorflow/python/ops/nn_impl.py | 33 ++++++ tensorflow/python/ops/nn_ops.py | 76 ++++++++++++- tensorflow/python/ops/numerics.py | 3 + tensorflow/python/ops/parsing_ops.py | 11 ++ tensorflow/python/ops/proto_ops.py | 5 +- .../python/ops/ragged/ragged_array_ops.py | 5 + .../python/ops/ragged/ragged_concat_ops.py | 2 + .../python/ops/ragged/ragged_factory_ops.py | 4 + .../ops/ragged/ragged_functional_ops.py | 2 + .../python/ops/ragged/ragged_math_ops.py | 2 + .../python/ops/ragged/ragged_string_ops.py | 11 ++ .../python/ops/ragged/segment_id_ops.py | 3 + tensorflow/python/ops/random_ops.py | 10 ++ tensorflow/python/ops/rnn.py | 7 ++ tensorflow/python/ops/script_ops.py | 4 + tensorflow/python/ops/sets_impl.py | 5 + tensorflow/python/ops/signal/dct_ops.py | 3 + tensorflow/python/ops/signal/fft_ops.py | 21 +++- tensorflow/python/ops/signal/mel_ops.py | 2 + tensorflow/python/ops/signal/mfcc_ops.py | 2 + .../python/ops/signal/reconstruction_ops.py | 2 + tensorflow/python/ops/signal/shape_ops.py | 2 + tensorflow/python/ops/signal/spectral_ops.py | 6 + tensorflow/python/ops/signal/window_ops.py | 6 + tensorflow/python/ops/sort_ops.py | 3 + tensorflow/python/ops/sparse_ops.py | 6 + tensorflow/python/ops/special_math_ops.py | 10 ++ tensorflow/python/ops/stateless_random_ops.py | 11 ++ tensorflow/python/ops/string_ops.py | 7 ++ 52 files changed, 696 insertions(+), 28 deletions(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 34d04d68c6c..0ee4a91f417 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -24,6 +24,7 @@ from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object from tensorflow.python.keras.utils.generic_utils import serialize_keras_object from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import keras_export # b/123041942 @@ -41,6 +42,7 @@ _TF_ACTIVATIONS_V2 = { @keras_export('keras.activations.softmax') +@dispatch.add_dispatch_support def softmax(x, axis=-1): """Softmax converts a real vector to a vector of categorical probabilities. @@ -82,6 +84,7 @@ def softmax(x, axis=-1): @keras_export('keras.activations.elu') +@dispatch.add_dispatch_support def elu(x, alpha=1.0): """Exponential linear unit. 
@@ -100,6 +103,7 @@ def elu(x, alpha=1.0): @keras_export('keras.activations.selu') +@dispatch.add_dispatch_support def selu(x): """Scaled Exponential Linear Unit (SELU). @@ -153,6 +157,7 @@ def selu(x): @keras_export('keras.activations.softplus') +@dispatch.add_dispatch_support def softplus(x): """Softplus activation function, `softplus(x) = log(exp(x) + 1)`. @@ -174,6 +179,7 @@ def softplus(x): @keras_export('keras.activations.softsign') +@dispatch.add_dispatch_support def softsign(x): """Softsign activation function, `softsign(x) = x / (abs(x) + 1)`. @@ -194,6 +200,7 @@ def softsign(x): @keras_export('keras.activations.swish') +@dispatch.add_dispatch_support def swish(x): """Swish activation function, `swish(x) = x * sigmoid(x)`. @@ -224,6 +231,7 @@ def swish(x): @keras_export('keras.activations.relu') +@dispatch.add_dispatch_support def relu(x, alpha=0., max_value=None, threshold=0): """Applies the rectified linear unit activation function. @@ -264,6 +272,7 @@ def relu(x, alpha=0., max_value=None, threshold=0): @keras_export('keras.activations.tanh') +@dispatch.add_dispatch_support def tanh(x): """Hyperbolic tangent activation function. @@ -285,6 +294,7 @@ def tanh(x): @keras_export('keras.activations.sigmoid') +@dispatch.add_dispatch_support def sigmoid(x): """Sigmoid activation function, `sigmoid(x) = 1 / (1 + exp(-x))`. @@ -314,6 +324,7 @@ def sigmoid(x): @keras_export('keras.activations.exponential') +@dispatch.add_dispatch_support def exponential(x): """Exponential activation function. @@ -334,6 +345,7 @@ def exponential(x): @keras_export('keras.activations.hard_sigmoid') +@dispatch.add_dispatch_support def hard_sigmoid(x): """Hard sigmoid activation function. @@ -360,6 +372,7 @@ def hard_sigmoid(x): @keras_export('keras.activations.linear') +@dispatch.add_dispatch_support def linear(x): """Linear activation function (pass-through). @@ -380,6 +393,7 @@ def linear(x): @keras_export('keras.activations.serialize') +@dispatch.add_dispatch_support def serialize(activation): """Returns the string identifier of an activation function. @@ -410,6 +424,7 @@ def serialize(activation): @keras_export('keras.activations.deserialize') +@dispatch.add_dispatch_support def deserialize(name, custom_objects=None): """Returns activation function given a string identifier. @@ -447,6 +462,7 @@ def deserialize(name, custom_objects=None): @keras_export('keras.activations.get') +@dispatch.add_dispatch_support def get(identifier): """Returns function. diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 11e53e032ae..11795625d06 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -76,6 +76,7 @@ from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import moving_averages from tensorflow.python.training.tracking import util as tracking_util +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util import object_identity from tensorflow.python.util import tf_contextlib @@ -173,6 +174,7 @@ def backend(): @keras_export('keras.backend.cast_to_floatx') +@dispatch.add_dispatch_support def cast_to_floatx(x): """Cast a Numpy array to the default Keras float type. @@ -799,6 +801,7 @@ def is_sparse(tensor): @keras_export('keras.backend.to_dense') +@dispatch.add_dispatch_support def to_dense(tensor): """Converts a sparse tensor into a dense tensor and returns it. 
@@ -1007,6 +1010,7 @@ def _initialize_variables(session): @keras_export('keras.backend.constant') +@dispatch.add_dispatch_support def constant(value, dtype=None, shape=None, name=None): """Creates a constant tensor. @@ -1163,6 +1167,7 @@ def is_placeholder(x): @keras_export('keras.backend.shape') +@dispatch.add_dispatch_support def shape(x): """Returns the symbolic shape of a tensor or variable. @@ -1245,6 +1250,7 @@ def ndim(x): @keras_export('keras.backend.dtype') +@dispatch.add_dispatch_support def dtype(x): """Returns the dtype of a Keras tensor or variable, as a string. @@ -1343,6 +1349,7 @@ def zeros(shape, dtype=None, name=None): @keras_export('keras.backend.ones') +@dispatch.add_dispatch_support def ones(shape, dtype=None, name=None): """Instantiates an all-ones variable and returns it. @@ -1377,6 +1384,7 @@ def ones(shape, dtype=None, name=None): @keras_export('keras.backend.eye') +@dispatch.add_dispatch_support def eye(size, dtype=None, name=None): """Instantiate an identity matrix and returns it. @@ -1433,6 +1441,7 @@ def zeros_like(x, dtype=None, name=None): @keras_export('keras.backend.ones_like') +@dispatch.add_dispatch_support def ones_like(x, dtype=None, name=None): """Instantiates an all-ones variable of the same shape as another tensor. @@ -1563,6 +1572,7 @@ def count_params(x): @keras_export('keras.backend.cast') +@dispatch.add_dispatch_support def cast(x, dtype): """Casts a tensor to a different dtype and returns it. @@ -1647,6 +1657,7 @@ def moving_average_update(x, value, momentum): @keras_export('keras.backend.dot') +@dispatch.add_dispatch_support def dot(x, y): """Multiplies 2 tensors (and/or variables) and returns a tensor. @@ -1707,6 +1718,7 @@ def dot(x, y): @keras_export('keras.backend.batch_dot') +@dispatch.add_dispatch_support def batch_dot(x, y, axes=None): """Batchwise dot product. @@ -1895,6 +1907,7 @@ def batch_dot(x, y, axes=None): @keras_export('keras.backend.transpose') +@dispatch.add_dispatch_support def transpose(x): """Transposes a tensor and returns it. @@ -1926,6 +1939,7 @@ def transpose(x): @keras_export('keras.backend.gather') +@dispatch.add_dispatch_support def gather(reference, indices): """Retrieves the elements of indices `indices` in the tensor `reference`. @@ -1961,6 +1975,7 @@ def gather(reference, indices): @keras_export('keras.backend.max') +@dispatch.add_dispatch_support def max(x, axis=None, keepdims=False): """Maximum value in a tensor. @@ -1979,6 +1994,7 @@ def max(x, axis=None, keepdims=False): @keras_export('keras.backend.min') +@dispatch.add_dispatch_support def min(x, axis=None, keepdims=False): """Minimum value in a tensor. @@ -1997,6 +2013,7 @@ def min(x, axis=None, keepdims=False): @keras_export('keras.backend.sum') +@dispatch.add_dispatch_support def sum(x, axis=None, keepdims=False): """Sum of the values in a tensor, alongside the specified axis. @@ -2015,6 +2032,7 @@ def sum(x, axis=None, keepdims=False): @keras_export('keras.backend.prod') +@dispatch.add_dispatch_support def prod(x, axis=None, keepdims=False): """Multiplies the values in a tensor, alongside the specified axis. @@ -2033,6 +2051,7 @@ def prod(x, axis=None, keepdims=False): @keras_export('keras.backend.cumsum') +@dispatch.add_dispatch_support def cumsum(x, axis=0): """Cumulative sum of the values in a tensor, alongside the specified axis. @@ -2047,6 +2066,7 @@ def cumsum(x, axis=0): @keras_export('keras.backend.cumprod') +@dispatch.add_dispatch_support def cumprod(x, axis=0): """Cumulative product of the values in a tensor, alongside the specified axis. 
@@ -2081,6 +2101,7 @@ def var(x, axis=None, keepdims=False): @keras_export('keras.backend.std') +@dispatch.add_dispatch_support def std(x, axis=None, keepdims=False): """Standard deviation of a tensor, alongside the specified axis. @@ -2107,6 +2128,7 @@ def std(x, axis=None, keepdims=False): @keras_export('keras.backend.mean') +@dispatch.add_dispatch_support def mean(x, axis=None, keepdims=False): """Mean of a tensor, alongside the specified axis. @@ -2127,6 +2149,7 @@ def mean(x, axis=None, keepdims=False): @keras_export('keras.backend.any') +@dispatch.add_dispatch_support def any(x, axis=None, keepdims=False): """Bitwise reduction (logical OR). @@ -2143,6 +2166,7 @@ def any(x, axis=None, keepdims=False): @keras_export('keras.backend.all') +@dispatch.add_dispatch_support def all(x, axis=None, keepdims=False): """Bitwise reduction (logical AND). @@ -2159,6 +2183,7 @@ def all(x, axis=None, keepdims=False): @keras_export('keras.backend.argmax') +@dispatch.add_dispatch_support def argmax(x, axis=-1): """Returns the index of the maximum value along an axis. @@ -2173,6 +2198,7 @@ def argmax(x, axis=-1): @keras_export('keras.backend.argmin') +@dispatch.add_dispatch_support def argmin(x, axis=-1): """Returns the index of the minimum value along an axis. @@ -2187,6 +2213,7 @@ def argmin(x, axis=-1): @keras_export('keras.backend.square') +@dispatch.add_dispatch_support def square(x): """Element-wise square. @@ -2200,6 +2227,7 @@ def square(x): @keras_export('keras.backend.abs') +@dispatch.add_dispatch_support def abs(x): """Element-wise absolute value. @@ -2213,6 +2241,7 @@ def abs(x): @keras_export('keras.backend.sqrt') +@dispatch.add_dispatch_support def sqrt(x): """Element-wise square root. @@ -2229,6 +2258,7 @@ def sqrt(x): @keras_export('keras.backend.exp') +@dispatch.add_dispatch_support def exp(x): """Element-wise exponential. @@ -2242,6 +2272,7 @@ def exp(x): @keras_export('keras.backend.log') +@dispatch.add_dispatch_support def log(x): """Element-wise log. @@ -2276,6 +2307,7 @@ def logsumexp(x, axis=None, keepdims=False): @keras_export('keras.backend.round') +@dispatch.add_dispatch_support def round(x): """Element-wise rounding to the closest integer. @@ -2291,6 +2323,7 @@ def round(x): @keras_export('keras.backend.sign') +@dispatch.add_dispatch_support def sign(x): """Element-wise sign. @@ -2304,6 +2337,7 @@ def sign(x): @keras_export('keras.backend.pow') +@dispatch.add_dispatch_support def pow(x, a): """Element-wise exponentiation. @@ -2318,6 +2352,7 @@ def pow(x, a): @keras_export('keras.backend.clip') +@dispatch.add_dispatch_support def clip(x, min_value, max_value): """Element-wise value clipping. @@ -2341,6 +2376,7 @@ def clip(x, min_value, max_value): @keras_export('keras.backend.equal') +@dispatch.add_dispatch_support def equal(x, y): """Element-wise equality between two tensors. @@ -2355,6 +2391,7 @@ def equal(x, y): @keras_export('keras.backend.not_equal') +@dispatch.add_dispatch_support def not_equal(x, y): """Element-wise inequality between two tensors. @@ -2369,6 +2406,7 @@ def not_equal(x, y): @keras_export('keras.backend.greater') +@dispatch.add_dispatch_support def greater(x, y): """Element-wise truth value of (x > y). @@ -2383,6 +2421,7 @@ def greater(x, y): @keras_export('keras.backend.greater_equal') +@dispatch.add_dispatch_support def greater_equal(x, y): """Element-wise truth value of (x >= y). @@ -2397,6 +2436,7 @@ def greater_equal(x, y): @keras_export('keras.backend.less') +@dispatch.add_dispatch_support def less(x, y): """Element-wise truth value of (x < y). 
@@ -2411,6 +2451,7 @@ def less(x, y): @keras_export('keras.backend.less_equal') +@dispatch.add_dispatch_support def less_equal(x, y): """Element-wise truth value of (x <= y). @@ -2425,6 +2466,7 @@ def less_equal(x, y): @keras_export('keras.backend.maximum') +@dispatch.add_dispatch_support def maximum(x, y): """Element-wise maximum of two tensors. @@ -2449,6 +2491,7 @@ def maximum(x, y): @keras_export('keras.backend.minimum') +@dispatch.add_dispatch_support def minimum(x, y): """Element-wise minimum of two tensors. @@ -2463,6 +2506,7 @@ def minimum(x, y): @keras_export('keras.backend.sin') +@dispatch.add_dispatch_support def sin(x): """Computes sin of x element-wise. @@ -2476,6 +2520,7 @@ def sin(x): @keras_export('keras.backend.cos') +@dispatch.add_dispatch_support def cos(x): """Computes cos of x element-wise. @@ -2621,6 +2666,7 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3): @keras_export('keras.backend.batch_normalization') +@dispatch.add_dispatch_support def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): """Applies batch normalization on x given mean, var, beta and gamma. @@ -2683,6 +2729,7 @@ def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): @keras_export('keras.backend.concatenate') +@dispatch.add_dispatch_support def concatenate(tensors, axis=-1): """Concatenates a list of tensors alongside the specified axis. @@ -2720,6 +2767,7 @@ def concatenate(tensors, axis=-1): @keras_export('keras.backend.reshape') +@dispatch.add_dispatch_support def reshape(x, shape): """Reshapes a tensor to the specified shape. @@ -2749,6 +2797,7 @@ def reshape(x, shape): @keras_export('keras.backend.permute_dimensions') +@dispatch.add_dispatch_support def permute_dimensions(x, pattern): """Permutes axes in a tensor. @@ -2780,6 +2829,7 @@ def permute_dimensions(x, pattern): @keras_export('keras.backend.resize_images') +@dispatch.add_dispatch_support def resize_images(x, height_factor, width_factor, data_format, interpolation='nearest'): """Resizes the images contained in a 4D tensor. @@ -2843,6 +2893,7 @@ def resize_images(x, height_factor, width_factor, data_format, @keras_export('keras.backend.resize_volumes') +@dispatch.add_dispatch_support def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): """Resizes the volume contained in a 5D tensor. @@ -2875,6 +2926,7 @@ def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): @keras_export('keras.backend.repeat_elements') +@dispatch.add_dispatch_support def repeat_elements(x, rep, axis): """Repeats the elements of a tensor along an axis, like `np.repeat`. @@ -2936,6 +2988,7 @@ def repeat_elements(x, rep, axis): @keras_export('keras.backend.repeat') +@dispatch.add_dispatch_support def repeat(x, n): """Repeats a 2D tensor. @@ -2971,6 +3024,7 @@ def repeat(x, n): @keras_export('keras.backend.arange') +@dispatch.add_dispatch_support def arange(start, stop=None, step=1, dtype='int32'): """Creates a 1D tensor containing a sequence of integers. @@ -3009,6 +3063,7 @@ def arange(start, stop=None, step=1, dtype='int32'): @keras_export('keras.backend.tile') +@dispatch.add_dispatch_support def tile(x, n): """Creates a tensor by tiling `x` by `n`. @@ -3026,6 +3081,7 @@ def tile(x, n): @keras_export('keras.backend.flatten') +@dispatch.add_dispatch_support def flatten(x): """Flatten a tensor. 
@@ -3051,6 +3107,7 @@ def flatten(x): @keras_export('keras.backend.batch_flatten') +@dispatch.add_dispatch_support def batch_flatten(x): """Turn a nD tensor into a 2D tensor with same 0th dimension. @@ -3076,6 +3133,7 @@ def batch_flatten(x): @keras_export('keras.backend.expand_dims') +@dispatch.add_dispatch_support def expand_dims(x, axis=-1): """Adds a 1-sized dimension at index "axis". @@ -3090,6 +3148,7 @@ def expand_dims(x, axis=-1): @keras_export('keras.backend.squeeze') +@dispatch.add_dispatch_support def squeeze(x, axis): """Removes a 1-dimension from the tensor at index "axis". @@ -3104,6 +3163,7 @@ def squeeze(x, axis): @keras_export('keras.backend.temporal_padding') +@dispatch.add_dispatch_support def temporal_padding(x, padding=(1, 1)): """Pads the middle dimension of a 3D tensor. @@ -3121,6 +3181,7 @@ def temporal_padding(x, padding=(1, 1)): @keras_export('keras.backend.spatial_2d_padding') +@dispatch.add_dispatch_support def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): """Pads the 2nd and 3rd dimensions of a 4D tensor. @@ -3152,6 +3213,7 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): @keras_export('keras.backend.spatial_3d_padding') +@dispatch.add_dispatch_support def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): """Pads 5D tensor with zeros along the depth, height, width dimensions. @@ -3196,6 +3258,7 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): @keras_export('keras.backend.stack') +@dispatch.add_dispatch_support def stack(x, axis=0): """Stacks a list of rank `R` tensors into a rank `R+1` tensor. @@ -3222,6 +3285,7 @@ def stack(x, axis=0): @keras_export('keras.backend.one_hot') +@dispatch.add_dispatch_support def one_hot(indices, num_classes): """Computes the one-hot representation of an integer tensor. @@ -3241,6 +3305,7 @@ def one_hot(indices, num_classes): @keras_export('keras.backend.reverse') +@dispatch.add_dispatch_support def reverse(x, axes): """Reverse a tensor along the specified axes. @@ -3321,6 +3386,7 @@ def get_value(x): @keras_export('keras.backend.batch_get_value') +@dispatch.add_dispatch_support def batch_get_value(tensors): """Returns the value of more than one tensor variable. @@ -3382,6 +3448,7 @@ def set_value(x, value): @keras_export('keras.backend.batch_set_value') +@dispatch.add_dispatch_support def batch_set_value(tuples): """Sets the values of many tensor variables at once. @@ -3424,6 +3491,7 @@ set_value.__doc__ = set_value.__doc__.format(snippet=_VALUE_SET_CODE_STRING) @keras_export('keras.backend.print_tensor') +@dispatch.add_dispatch_support def print_tensor(x, message=''): """Prints `message` and the tensor value when evaluated. @@ -3861,6 +3929,7 @@ def gradients(loss, variables): @keras_export('keras.backend.stop_gradient') +@dispatch.add_dispatch_support def stop_gradient(variables): """Returns `variables` but with zero gradient w.r.t. every other variable. @@ -3882,6 +3951,7 @@ def stop_gradient(variables): @keras_export('keras.backend.rnn') +@dispatch.add_dispatch_support def rnn(step_function, inputs, initial_states, @@ -4276,6 +4346,7 @@ def rnn(step_function, @keras_export('keras.backend.switch') +@dispatch.add_dispatch_support def switch(condition, then_expression, else_expression): """Switches between two operations depending on a scalar value. 
@@ -4409,6 +4480,7 @@ def in_test_phase(x, alt, training=None): @keras_export('keras.backend.relu') +@dispatch.add_dispatch_support def relu(x, alpha=0., max_value=None, threshold=0): """Rectified linear unit. @@ -4462,6 +4534,7 @@ def relu(x, alpha=0., max_value=None, threshold=0): @keras_export('keras.backend.elu') +@dispatch.add_dispatch_support def elu(x, alpha=1.): """Exponential linear unit. @@ -4480,6 +4553,7 @@ def elu(x, alpha=1.): @keras_export('keras.backend.softmax') +@dispatch.add_dispatch_support def softmax(x, axis=-1): """Softmax of a tensor. @@ -4495,6 +4569,7 @@ def softmax(x, axis=-1): @keras_export('keras.backend.softplus') +@dispatch.add_dispatch_support def softplus(x): """Softplus of a tensor. @@ -4508,6 +4583,7 @@ def softplus(x): @keras_export('keras.backend.softsign') +@dispatch.add_dispatch_support def softsign(x): """Softsign of a tensor. @@ -4527,6 +4603,7 @@ def _backtrack_identity(tensor): @keras_export('keras.backend.categorical_crossentropy') +@dispatch.add_dispatch_support def categorical_crossentropy(target, output, from_logits=False, axis=-1): """Categorical crossentropy between an output tensor and a target tensor. @@ -4595,6 +4672,7 @@ def categorical_crossentropy(target, output, from_logits=False, axis=-1): @keras_export('keras.backend.sparse_categorical_crossentropy') +@dispatch.add_dispatch_support def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1): """Categorical crossentropy with integer targets. @@ -4676,6 +4754,7 @@ def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1): @keras_export('keras.backend.binary_crossentropy') +@dispatch.add_dispatch_support def binary_crossentropy(target, output, from_logits=False): """Binary crossentropy between an output tensor and a target tensor. @@ -4712,6 +4791,7 @@ def binary_crossentropy(target, output, from_logits=False): @keras_export('keras.backend.sigmoid') +@dispatch.add_dispatch_support def sigmoid(x): """Element-wise sigmoid. @@ -4725,6 +4805,7 @@ def sigmoid(x): @keras_export('keras.backend.hard_sigmoid') +@dispatch.add_dispatch_support def hard_sigmoid(x): """Segment-wise linear approximation of sigmoid. @@ -4747,6 +4828,7 @@ def hard_sigmoid(x): @keras_export('keras.backend.tanh') +@dispatch.add_dispatch_support def tanh(x): """Element-wise tanh. @@ -4760,6 +4842,7 @@ def tanh(x): @keras_export('keras.backend.dropout') +@dispatch.add_dispatch_support def dropout(x, level, noise_shape=None, seed=None): """Sets entries in `x` to zero at random, while scaling the entire tensor. @@ -4780,6 +4863,7 @@ def dropout(x, level, noise_shape=None, seed=None): @keras_export('keras.backend.l2_normalize') +@dispatch.add_dispatch_support def l2_normalize(x, axis=None): """Normalizes a tensor wrt the L2 norm alongside the specified axis. @@ -4794,6 +4878,7 @@ def l2_normalize(x, axis=None): @keras_export('keras.backend.in_top_k') +@dispatch.add_dispatch_support def in_top_k(predictions, targets, k): """Returns whether the `targets` are in the top `k` `predictions`. 
@@ -4896,6 +4981,7 @@ def _preprocess_padding(padding): @keras_export('keras.backend.conv1d') +@dispatch.add_dispatch_support def conv1d(x, kernel, strides=1, @@ -4946,6 +5032,7 @@ def conv1d(x, @keras_export('keras.backend.conv2d') +@dispatch.add_dispatch_support def conv2d(x, kernel, strides=(1, 1), @@ -4989,6 +5076,7 @@ def conv2d(x, @keras_export('keras.backend.conv2d_transpose') +@dispatch.add_dispatch_support def conv2d_transpose(x, kernel, output_shape, @@ -5129,6 +5217,7 @@ def separable_conv1d(x, @keras_export('keras.backend.separable_conv2d') +@dispatch.add_dispatch_support def separable_conv2d(x, depthwise_kernel, pointwise_kernel, @@ -5186,6 +5275,7 @@ def separable_conv2d(x, @keras_export('keras.backend.depthwise_conv2d') +@dispatch.add_dispatch_support def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), @@ -5235,6 +5325,7 @@ def depthwise_conv2d(x, @keras_export('keras.backend.conv3d') +@dispatch.add_dispatch_support def conv3d(x, kernel, strides=(1, 1, 1), @@ -5337,6 +5428,7 @@ def conv3d_transpose(x, @keras_export('keras.backend.pool2d') +@dispatch.add_dispatch_support def pool2d(x, pool_size, strides=(1, 1), @@ -5396,6 +5488,7 @@ def pool2d(x, @keras_export('keras.backend.pool3d') +@dispatch.add_dispatch_support def pool3d(x, pool_size, strides=(1, 1, 1), @@ -5526,6 +5619,7 @@ def local_conv(inputs, @keras_export('keras.backend.local_conv1d') +@dispatch.add_dispatch_support def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): """Apply 1D conv with un-shared weights. @@ -5561,6 +5655,7 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): @keras_export('keras.backend.local_conv2d') +@dispatch.add_dispatch_support def local_conv2d(inputs, kernel, kernel_size, @@ -5602,6 +5697,7 @@ def local_conv2d(inputs, @keras_export('keras.backend.bias_add') +@dispatch.add_dispatch_support def bias_add(x, bias, data_format=None): """Adds a bias vector to a tensor. @@ -5646,6 +5742,7 @@ def bias_add(x, bias, data_format=None): @keras_export('keras.backend.random_normal') +@dispatch.add_dispatch_support def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): """Returns a tensor with normal distribution of values. @@ -5682,6 +5779,7 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): @keras_export('keras.backend.random_uniform') +@dispatch.add_dispatch_support def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None): """Returns a tensor with uniform distribution of values. @@ -5715,6 +5813,7 @@ def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None): @deprecated(None, 'Use `tf.keras.backend.random_bernoulli` instead.') @keras_export('keras.backend.random_binomial') +@dispatch.add_dispatch_support def random_binomial(shape, p=0.0, dtype=None, seed=None): """Returns a tensor with random binomial distribution of values. @@ -5751,6 +5850,7 @@ def random_binomial(shape, p=0.0, dtype=None, seed=None): @keras_export('keras.backend.random_bernoulli') +@dispatch.add_dispatch_support def random_bernoulli(shape, p=0.0, dtype=None, seed=None): """Returns a tensor with random bernoulli distribution of values. @@ -5767,6 +5867,7 @@ def random_bernoulli(shape, p=0.0, dtype=None, seed=None): @keras_export('keras.backend.truncated_normal') +@dispatch.add_dispatch_support def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): """Returns a tensor with truncated random normal distribution of values. 
@@ -5801,6 +5902,7 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): @keras_export('keras.backend.ctc_label_dense_to_sparse') +@dispatch.add_dispatch_support def ctc_label_dense_to_sparse(labels, label_lengths): """Converts CTC labels from dense to sparse. @@ -5847,6 +5949,7 @@ def ctc_label_dense_to_sparse(labels, label_lengths): @keras_export('keras.backend.ctc_batch_cost') +@dispatch.add_dispatch_support def ctc_batch_cost(y_true, y_pred, input_length, label_length): """Runs CTC loss algorithm on each batch element. @@ -5879,6 +5982,7 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length): @keras_export('keras.backend.ctc_decode') +@dispatch.add_dispatch_support def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1): """Decodes the output of a softmax. diff --git a/tensorflow/python/keras/backend_config.py b/tensorflow/python/keras/backend_config.py index c1bf163c444..cd1f1e4b423 100644 --- a/tensorflow/python/keras/backend_config.py +++ b/tensorflow/python/keras/backend_config.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import keras_export # The type of float to use throughout a session. @@ -30,6 +31,7 @@ _IMAGE_DATA_FORMAT = 'channels_last' @keras_export('keras.backend.epsilon') +@dispatch.add_dispatch_support def epsilon(): """Returns the value of the fuzz factor used in numeric expressions. @@ -110,6 +112,7 @@ def set_floatx(value): @keras_export('keras.backend.image_data_format') +@dispatch.add_dispatch_support def image_data_format(): """Returns the default image data format convention. diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index 99fb015288b..2bb53dcfaa5 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -38,6 +38,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops.losses import losses_impl from tensorflow.python.ops.losses import util as tf_losses_util +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls @@ -1164,6 +1165,7 @@ class Huber(LossFunctionWrapper): 'keras.losses.mean_squared_error', 'keras.losses.mse', 'keras.losses.MSE') +@dispatch.add_dispatch_support def mean_squared_error(y_true, y_pred): """Computes the mean squared error between labels and predictions. @@ -1199,6 +1201,7 @@ def mean_squared_error(y_true, y_pred): 'keras.losses.mean_absolute_error', 'keras.losses.mae', 'keras.losses.MAE') +@dispatch.add_dispatch_support def mean_absolute_error(y_true, y_pred): """Computes the mean absolute error between labels and predictions. @@ -1231,6 +1234,7 @@ def mean_absolute_error(y_true, y_pred): 'keras.losses.mean_absolute_percentage_error', 'keras.losses.mape', 'keras.losses.MAPE') +@dispatch.add_dispatch_support def mean_absolute_percentage_error(y_true, y_pred): """Computes the mean absolute percentage error between `y_true` and `y_pred`. @@ -1267,6 +1271,7 @@ def mean_absolute_percentage_error(y_true, y_pred): 'keras.losses.mean_squared_logarithmic_error', 'keras.losses.msle', 'keras.losses.MSLE') +@dispatch.add_dispatch_support def mean_squared_logarithmic_error(y_true, y_pred): """Computes the mean squared logarithmic error between `y_true` and `y_pred`. 
@@ -1315,6 +1320,7 @@ def _maybe_convert_labels(y_true): @keras_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge') +@dispatch.add_dispatch_support def squared_hinge(y_true, y_pred): """Computes the squared hinge loss between `y_true` and `y_pred`. @@ -1347,6 +1353,7 @@ def squared_hinge(y_true, y_pred): @keras_export('keras.metrics.hinge', 'keras.losses.hinge') +@dispatch.add_dispatch_support def hinge(y_true, y_pred): """Computes the hinge loss between `y_true` and `y_pred`. @@ -1378,6 +1385,7 @@ def hinge(y_true, y_pred): @keras_export('keras.losses.categorical_hinge') +@dispatch.add_dispatch_support def categorical_hinge(y_true, y_pred): """Computes the categorical hinge loss between `y_true` and `y_pred`. @@ -1410,6 +1418,7 @@ def categorical_hinge(y_true, y_pred): @keras_export('keras.losses.huber', v1=[]) +@dispatch.add_dispatch_support def huber(y_true, y_pred, delta=1.0): """Computes Huber loss value. @@ -1447,6 +1456,7 @@ def huber(y_true, y_pred, delta=1.0): @keras_export('keras.losses.log_cosh', 'keras.losses.logcosh') +@dispatch.add_dispatch_support def log_cosh(y_true, y_pred): """Logarithm of the hyperbolic cosine of the prediction error. @@ -1485,6 +1495,7 @@ def log_cosh(y_true, y_pred): @keras_export('keras.metrics.categorical_crossentropy', 'keras.losses.categorical_crossentropy') +@dispatch.add_dispatch_support def categorical_crossentropy(y_true, y_pred, from_logits=False, @@ -1525,6 +1536,7 @@ def categorical_crossentropy(y_true, @keras_export('keras.metrics.sparse_categorical_crossentropy', 'keras.losses.sparse_categorical_crossentropy') +@dispatch.add_dispatch_support def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1): """Computes the sparse categorical crossentropy loss. @@ -1556,6 +1568,7 @@ def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1): @keras_export('keras.metrics.binary_crossentropy', 'keras.losses.binary_crossentropy') +@dispatch.add_dispatch_support def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0): """Computes the binary crossentropy loss. @@ -1599,6 +1612,7 @@ def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0): 'keras.losses.kullback_leibler_divergence', 'keras.losses.kld', 'keras.losses.KLD') +@dispatch.add_dispatch_support def kl_divergence(y_true, y_pred): """Computes Kullback-Leibler divergence loss between `y_true` and `y_pred`. @@ -1635,6 +1649,7 @@ def kl_divergence(y_true, y_pred): @keras_export('keras.metrics.poisson', 'keras.losses.poisson') +@dispatch.add_dispatch_support def poisson(y_true, y_pred): """Computes the Poisson loss between y_true and y_pred. @@ -1676,6 +1691,7 @@ def poisson(y_true, y_pred): 'keras.losses.cosine', 'keras.losses.cosine_similarity', ]) +@dispatch.add_dispatch_support def cosine_similarity(y_true, y_pred, axis=-1): """Computes the cosine similarity between labels and predictions. 
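Once the keras.losses functions above carry the dispatch hook, a project that defines its own tensor-like container can override them for that type without patching Keras. The sketch below is a minimal example, assuming the internal dispatch_for_types helper in tensorflow.python.util.dispatch (the module this patch imports) is available; MaskedPair and _masked_mse are hypothetical names, and the handler's signature has to match the overridden API exactly or registration is rejected.

import tensorflow as tf
from tensorflow.python.util import dispatch


class MaskedPair(object):
  """Hypothetical tensor-like value that carries an explicit validity mask."""

  def __init__(self, values, mask):
    self.values = values  # tf.Tensor of values
    self.mask = mask      # boolean tf.Tensor with the same shape


@dispatch.dispatch_for_types(tf.keras.losses.mean_squared_error, MaskedPair)
def _masked_mse(y_true, y_pred):
  # Signature must match tf.keras.losses.mean_squared_error(y_true, y_pred).
  true_vals = tf.boolean_mask(y_true.values, y_true.mask)
  pred_vals = tf.boolean_mask(y_pred.values, y_true.mask)
  return tf.keras.losses.mean_squared_error(true_vals, pred_vals)


y_true = MaskedPair(tf.constant([1.0, 2.0, 3.0]), tf.constant([True, True, False]))
y_pred = MaskedPair(tf.constant([1.5, 2.0, 9.0]), tf.constant([True, True, False]))
print(tf.keras.losses.mean_squared_error(y_true, y_pred))  # handled by _masked_mse

Plain-Tensor calls still take the normal code path; the handler is only consulted after the original function rejects the MaskedPair arguments.
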
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py index 63cf7c578bc..a67755b9333 100644 --- a/tensorflow/python/keras/metrics.py +++ b/tensorflow/python/keras/metrics.py @@ -69,6 +69,7 @@ from tensorflow.python.ops import variables as tf_variables from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.ops.losses import util as tf_losses_utils from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import keras_export @@ -3212,6 +3213,7 @@ def accuracy(y_true, y_pred): @keras_export('keras.metrics.binary_accuracy') +@dispatch.add_dispatch_support def binary_accuracy(y_true, y_pred, threshold=0.5): """Calculates how often predictions matches binary labels. @@ -3239,6 +3241,7 @@ def binary_accuracy(y_true, y_pred, threshold=0.5): @keras_export('keras.metrics.categorical_accuracy') +@dispatch.add_dispatch_support def categorical_accuracy(y_true, y_pred): """Calculates how often predictions matches one-hot labels. @@ -3267,6 +3270,7 @@ def categorical_accuracy(y_true, y_pred): @keras_export('keras.metrics.sparse_categorical_accuracy') +@dispatch.add_dispatch_support def sparse_categorical_accuracy(y_true, y_pred): """Calculates how often predictions matches integer labels. @@ -3307,6 +3311,7 @@ def sparse_categorical_accuracy(y_true, y_pred): @keras_export('keras.metrics.top_k_categorical_accuracy') +@dispatch.add_dispatch_support def top_k_categorical_accuracy(y_true, y_pred, k=5): """Computes how often targets are in the top `K` predictions. @@ -3332,6 +3337,7 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5): @keras_export('keras.metrics.sparse_top_k_categorical_accuracy') +@dispatch.add_dispatch_support def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): """Computes how often integer targets are in the top `K` predictions. diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 9a5e95d8aad..a2640925a38 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -57,6 +57,7 @@ _BaseSlice = slice @tf_export("reshape", v1=["reshape", "manip.reshape"]) +@dispatch.add_dispatch_support def reshape(tensor, shape, name=None): # pylint: disable=redefined-outer-name r"""Reshapes a tensor. @@ -197,6 +198,7 @@ def reshape(tensor, shape, name=None): # pylint: disable=redefined-outer-name @tf_export("fill") +@dispatch.add_dispatch_support def fill(dims, value, name=None): r"""Creates a tensor filled with a scalar value. @@ -455,6 +457,7 @@ listdiff.__doc__ = gen_array_ops.list_diff.__doc__ + "\n" + listdiff.__doc__ "This op will be removed after the deprecation date. " "Please switch to tf.sets.difference().") @tf_export(v1=["setdiff1d"]) +@dispatch.add_dispatch_support def setdiff1d(x, y, index_dtype=dtypes.int32, name=None): """Computes the difference between two lists of numbers or strings. @@ -498,6 +501,7 @@ setdiff1d.__doc__ = gen_array_ops.list_diff.__doc__ @tf_export("broadcast_dynamic_shape") +@dispatch.add_dispatch_support def broadcast_dynamic_shape(shape_x, shape_y): """Computes the shape of a broadcast given symbolic shapes. @@ -523,6 +527,7 @@ def broadcast_dynamic_shape(shape_x, shape_y): @tf_export("broadcast_static_shape") +@dispatch.add_dispatch_support def broadcast_static_shape(shape_x, shape_y): """Computes the shape of a broadcast given known shapes. 
@@ -550,6 +555,7 @@ def broadcast_static_shape(shape_x, shape_y): @tf_export("shape", v1=[]) +@dispatch.add_dispatch_support def shape_v2(input, out_type=dtypes.int32, name=None): # pylint: disable=redefined-builtin """Returns the shape of a tensor. @@ -596,6 +602,7 @@ def shape_v2(input, out_type=dtypes.int32, name=None): @tf_export(v1=["shape"]) +@dispatch.add_dispatch_support def shape(input, name=None, out_type=dtypes.int32): # pylint: disable=redefined-builtin """Returns the shape of a tensor. @@ -650,6 +657,7 @@ def shape_internal(input, name=None, optimize=True, out_type=dtypes.int32): @tf_export("shape_n") +@dispatch.add_dispatch_support def shape_n(input, out_type=dtypes.int32, name=None): # pylint: disable=redefined-builtin """Returns shape of tensors. @@ -1007,6 +1015,7 @@ def _slice_helper(tensor, slice_spec, var=None): # pylint: disable=undefined-variable,protected-access,redefined-outer-name @tf_export("slice") +@dispatch.add_dispatch_support def slice(input_, begin, size, name=None): # pylint: disable=redefined-builtin """Extracts a slice from a tensor. @@ -1062,6 +1071,7 @@ def slice(input_, begin, size, name=None): # pylint: disable=invalid-name @tf_export("strided_slice") +@dispatch.add_dispatch_support def strided_slice(input_, begin, end, @@ -1253,6 +1263,7 @@ ops.Tensor._override_operator("__getitem__", _slice_helper) @tf_export("parallel_stack") +@dispatch.add_dispatch_support def parallel_stack(values, name="parallel_stack"): """Stacks a list of rank-`R` tensors into one rank-`(R+1)` tensor in parallel. @@ -1489,6 +1500,7 @@ ops.register_tensor_conversion_function((list, tuple), @tf_export("unstack") +@dispatch.add_dispatch_support def unstack(value, num=None, axis=0, name="unstack"): """Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. @@ -1632,6 +1644,7 @@ def concat(values, axis, name="concat"): @tf_export(v1=["boolean_mask"]) +@dispatch.add_dispatch_support def boolean_mask(tensor, mask, name="boolean_mask", axis=None): """Apply boolean mask to tensor. @@ -1824,6 +1837,7 @@ def sparse_mask(a, mask_indices, name=None): @tf_export("unique") +@dispatch.add_dispatch_support def unique(x, out_idx=dtypes.int32, name=None): """Finds unique elements in a 1-D tensor. @@ -1871,6 +1885,7 @@ unique.__doc__ = gen_array_ops.unique.__doc__ @tf_export("unique_with_counts") +@dispatch.add_dispatch_support def unique_with_counts(x, out_idx=dtypes.int32, name=None): """Finds unique elements in a 1-D tensor. @@ -1923,6 +1938,7 @@ unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__ @tf_export("split") +@dispatch.add_dispatch_support def split(value, num_or_size_splits, axis=0, num=None, name="split"): """Splits a tensor `value` into a list of sub tensors. @@ -2000,6 +2016,7 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): @tf_export("transpose", v1=[]) +@dispatch.add_dispatch_support def transpose_v2(a, perm=None, conjugate=False, name="transpose"): """Transposes `a`, where `a` is a Tensor. @@ -2080,6 +2097,7 @@ def transpose_v2(a, perm=None, conjugate=False, name="transpose"): @tf_export(v1=["transpose"]) +@dispatch.add_dispatch_support def transpose(a, perm=None, name="transpose", conjugate=False): """Transposes `a`. 
@@ -2170,6 +2188,7 @@ def transpose(a, perm=None, name="transpose", conjugate=False): @tf_export( "linalg.matrix_transpose", v1=["linalg.transpose", "linalg.matrix_transpose", "matrix_transpose"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("matrix_transpose", "linalg.transpose") def matrix_transpose(a, name="matrix_transpose", conjugate=False): """Transposes last two dimensions of tensor `a`. @@ -2248,6 +2267,7 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False): @tf_export("linalg.diag", v1=["linalg.diag", "matrix_diag"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("matrix_diag") def matrix_diag(diagonal, name="diag", @@ -2416,6 +2436,7 @@ def matrix_diag(diagonal, @tf_export("linalg.diag_part", v1=["linalg.diag_part", "matrix_diag_part"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("matrix_diag_part") @dispatch.add_dispatch_support def matrix_diag_part( @@ -2556,6 +2577,7 @@ def matrix_diag_part( @tf_export("linalg.set_diag", v1=["linalg.set_diag", "matrix_set_diag"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("matrix_set_diag") def matrix_set_diag( input, # pylint:disable=redefined-builtin @@ -2719,6 +2741,7 @@ def _tag_zeros_tensor(fun): @tf_export("zeros") +@dispatch.add_dispatch_support @_tag_zeros_tensor def zeros(shape, dtype=dtypes.float32, name=None): """Creates a tensor with all elements set to zero. @@ -2971,6 +2994,7 @@ def ones_like_impl(tensor, dtype, name, optimize=True): @tf_export("ones") +@dispatch.add_dispatch_support def ones(shape, dtype=dtypes.float32, name=None): """Creates a tensor with all elements set to one (1). @@ -3182,6 +3206,7 @@ def sparse_placeholder(dtype, shape=None, name=None): @tf_export("pad", v1=[]) +@dispatch.add_dispatch_support def pad_v2(tensor, paddings, mode="CONSTANT", constant_values=0, name=None): """Pads a tensor. @@ -3240,6 +3265,7 @@ def pad_v2(tensor, paddings, mode="CONSTANT", constant_values=0, name=None): @tf_export(v1=["pad"]) +@dispatch.add_dispatch_support def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0): # pylint: disable=invalid-name """Pads a tensor. @@ -3357,6 +3383,7 @@ def _get_paddings_constant(paddings): @tf_export("meshgrid") +@dispatch.add_dispatch_support def meshgrid(*args, **kwargs): """Broadcasts parameters for evaluation on an N-D grid. @@ -3500,6 +3527,7 @@ def _TileGradShape(op): @tf_export("edit_distance") +@dispatch.add_dispatch_support def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): """Computes the Levenshtein distance between sequences. 
@@ -3694,6 +3722,7 @@ def required_space_to_batch_paddings(input_shape, @tf_export(v1=["nn.space_to_batch", "space_to_batch"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("space_to_batch") def space_to_batch( # pylint: disable=missing-docstring input, # pylint: disable=redefined-builtin @@ -3717,6 +3746,7 @@ space_to_batch.__doc__ = gen_array_ops.space_to_batch.__doc__ @tf_export("space_to_batch", "nn.space_to_batch", v1=[]) +@dispatch.add_dispatch_support def space_to_batch_v2(input, block_shape, paddings, name=None): # pylint: disable=redefined-builtin return space_to_batch_nd(input, block_shape, paddings, name) @@ -3725,6 +3755,7 @@ space_to_batch_v2.__doc__ = gen_array_ops.space_to_batch_nd.__doc__ @tf_export(v1=["nn.space_to_depth", "space_to_depth"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("space_to_depth") def space_to_depth(input, block_size, name=None, data_format="NHWC"): # pylint: disable=redefined-builtin return gen_array_ops.space_to_depth(input, block_size, data_format, name=name) @@ -3734,6 +3765,7 @@ space_to_depth.__doc__ = gen_array_ops.space_to_depth.__doc__ @tf_export("nn.space_to_depth", v1=[]) +@dispatch.add_dispatch_support def space_to_depth_v2(input, block_size, data_format="NHWC", name=None): # pylint: disable=redefined-builtin return gen_array_ops.space_to_depth(input, block_size, data_format, name=name) @@ -3742,6 +3774,7 @@ space_to_depth_v2.__doc__ = gen_array_ops.space_to_depth.__doc__ @tf_export(v1=["nn.depth_to_space", "depth_to_space"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("depth_to_space") def depth_to_space(input, block_size, name=None, data_format="NHWC"): # pylint: disable=redefined-builtin return gen_array_ops.depth_to_space(input, block_size, data_format, name=name) @@ -3751,6 +3784,7 @@ depth_to_space.__doc__ = gen_array_ops.depth_to_space.__doc__ @tf_export("nn.depth_to_space", v1=[]) +@dispatch.add_dispatch_support def depth_to_space_v2(input, block_size, data_format="NHWC", name=None): # pylint: disable=redefined-builtin return gen_array_ops.depth_to_space(input, block_size, data_format, name=name) @@ -3759,6 +3793,7 @@ depth_to_space_v2.__doc__ = gen_array_ops.depth_to_space.__doc__ @tf_export(v1=["batch_to_space"]) +@dispatch.add_dispatch_support def batch_to_space(input, crops, block_size, name=None, block_shape=None): # pylint: disable=redefined-builtin,missing-docstring block_size = deprecation.deprecated_argument_lookup("block_shape", block_shape, "block_size", @@ -3776,6 +3811,7 @@ batch_to_space.__doc__ = gen_array_ops.batch_to_space.__doc__ @tf_export("batch_to_space", v1=[]) +@dispatch.add_dispatch_support def batch_to_space_v2(input, block_shape, crops, name=None): # pylint: disable=redefined-builtin """BatchToSpace for N-D tensors of type T. @@ -4091,6 +4127,7 @@ def _all_dimensions(x): @tf_export("sequence_mask") +@dispatch.add_dispatch_support def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None): """Returns a mask tensor representing the first N positions of each cell. @@ -4317,6 +4354,7 @@ def where(condition, x=None, y=None, name=None): @tf_export("where", v1=["where_v2"]) +@dispatch.add_dispatch_support def where_v2(condition, x=None, y=None, name=None): """Return the elements where `condition` is `True` (multiplexing `x` and `y`). @@ -5003,6 +5041,7 @@ def batch_gather_nd(params, indices, batch_dims, name=None): # because round_mode was added later. # (And also now because of 'axis' processing). 
@tf_export(v1=["quantize_v2"]) +@dispatch.add_dispatch_support @deprecation.deprecated( "2017-10-25", "`tf.quantize_v2` is deprecated, please use `tf.quantization.quantize` " @@ -5056,6 +5095,7 @@ quantize_v2.__doc__ = """Please use `tf.quantization.quantize` instead.""" # tf.quantization.quantize; we can deprecate tf.quantization.quantize in next # version of TensorFlow. @tf_export("quantization.quantize", v1=["quantization.quantize", "quantize"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("quantize") def quantize( input, # pylint: disable=redefined-builtin @@ -5095,6 +5135,7 @@ def quantize( @tf_export("quantization.dequantize", v1=["quantization.dequantize", "dequantize"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("dequantize") def dequantize( # pylint: disable=missing-docstring input, # pylint: disable=redefined-builtin @@ -5130,6 +5171,7 @@ dequantize.__doc__ = gen_array_ops.dequantize.__doc__ @tf_export("quantization.quantize_and_dequantize") +@dispatch.add_dispatch_support def quantize_and_dequantize( input, # pylint: disable=redefined-builtin input_min, @@ -5189,6 +5231,7 @@ def quantize_and_dequantize( @tf_export("searchsorted") +@dispatch.add_dispatch_support def searchsorted(sorted_sequence, values, side="left", @@ -5253,6 +5296,7 @@ quantize.__doc__ = gen_array_ops.quantize_v2.__doc__ @tf_export("image.extract_patches") +@dispatch.add_dispatch_support def extract_image_patches_v2(images, sizes, strides, rates, padding, name=None): r"""Extract `patches` from `images`. @@ -5374,6 +5418,7 @@ def extract_image_patches_v2(images, sizes, strides, rates, padding, name=None): @tf_export(v1=["image.extract_image_patches", "extract_image_patches"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "ksizes is deprecated, use sizes instead", "ksizes") def extract_image_patches( # pylint: disable=missing-docstring @@ -5422,6 +5467,7 @@ extract_image_patches.__doc__ = gen_array_ops.extract_image_patches.__doc__ @tf_export("fingerprint") +@dispatch.add_dispatch_support def fingerprint(data, method="farmhash64", name=None): r"""Generates fingerprint values. @@ -5668,6 +5714,7 @@ def _with_nonzero_rank(data): @tf_export("repeat") +@dispatch.add_dispatch_support def repeat(input, repeats, axis=None, name=None): # pylint: disable=redefined-builtin """Repeat elements of `input`. 
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py index 56f76a49d51..6c1a36e65c9 100644 --- a/tensorflow/python/ops/candidate_sampling_ops.py +++ b/tensorflow/python/ops/candidate_sampling_ops.py @@ -24,12 +24,14 @@ from tensorflow.python.ops import array_ops # pylint: disable=unused-import from tensorflow.python.ops import gen_candidate_sampling_ops from tensorflow.python.ops import math_ops # pylint: disable=unused-import from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export( 'random.uniform_candidate_sampler', v1=['random.uniform_candidate_sampler', 'nn.uniform_candidate_sampler']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('nn.uniform_candidate_sampler') def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique, range_max, seed=None, name=None): @@ -92,6 +94,7 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique, 'random.log_uniform_candidate_sampler', 'nn.log_uniform_candidate_sampler' ]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('nn.log_uniform_candidate_sampler') def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique, range_max, seed=None, name=None): @@ -154,6 +157,7 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique, @tf_export( 'random.learned_unigram_candidate_sampler', 'nn.learned_unigram_candidate_sampler') +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints(['nn.learned_unigram_candidate_sampler']) def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled, unique, range_max, seed=None, name=None): @@ -213,6 +217,7 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled, @tf_export('random.fixed_unigram_candidate_sampler', 'nn.fixed_unigram_candidate_sampler') +@dispatch.add_dispatch_support def fixed_unigram_candidate_sampler(true_classes, num_true, num_sampled, @@ -341,6 +346,7 @@ def all_candidate_sampler(true_classes, num_true, num_sampled, unique, @tf_export('nn.compute_accidental_hits') +@dispatch.add_dispatch_support def compute_accidental_hits(true_classes, sampled_candidates, num_true, seed=None, name=None): """Compute the position ids in `sampled_candidates` matching `true_classes`. diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index cefca5defae..9a5b86a1deb 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export NUMERIC_TYPES = frozenset( @@ -375,6 +376,7 @@ def _binary_assert(sym, opname, op_func, static_func, x, y, data, summarize, @tf_export( 'debugging.assert_proper_iterable', v1=['debugging.assert_proper_iterable', 'assert_proper_iterable']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_proper_iterable') def assert_proper_iterable(values): """Static assert that values is a "proper" iterable. 
@@ -404,6 +406,7 @@ def assert_proper_iterable(values): @tf_export('debugging.assert_negative', v1=[]) +@dispatch.add_dispatch_support def assert_negative_v2(x, message=None, summarize=None, name=None): """Assert the condition `x < 0` holds element-wise. @@ -436,6 +439,7 @@ def assert_negative_v2(x, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_negative', 'assert_negative']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_negative') @_unary_assert_doc('< 0', 'negative') def assert_negative(x, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring @@ -456,6 +460,7 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None): # p @tf_export('debugging.assert_positive', v1=[]) +@dispatch.add_dispatch_support def assert_positive_v2(x, message=None, summarize=None, name=None): """Assert the condition `x > 0` holds element-wise. @@ -488,6 +493,7 @@ def assert_positive_v2(x, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_positive', 'assert_positive']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_positive') @_unary_assert_doc('> 0', 'positive') def assert_positive(x, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring @@ -507,6 +513,7 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None): # p @tf_export('debugging.assert_non_negative', v1=[]) +@dispatch.add_dispatch_support def assert_non_negative_v2(x, message=None, summarize=None, name=None): """Assert the condition `x >= 0` holds element-wise. @@ -541,6 +548,7 @@ def assert_non_negative_v2(x, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_non_negative', 'assert_non_negative']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_non_negative') @_unary_assert_doc('>= 0', 'non-negative') def assert_non_negative(x, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring @@ -561,6 +569,7 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_non_positive', v1=[]) +@dispatch.add_dispatch_support def assert_non_positive_v2(x, message=None, summarize=None, name=None): """Assert the condition `x <= 0` holds element-wise. @@ -595,6 +604,7 @@ def assert_non_positive_v2(x, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_non_positive', 'assert_non_positive']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_non_positive') @_unary_assert_doc('<= 0', 'non-positive') def assert_non_positive(x, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring @@ -615,6 +625,7 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_equal', 'assert_equal', v1=[]) +@dispatch.add_dispatch_support def assert_equal_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x == y` holds element-wise. 
@@ -649,6 +660,7 @@ def assert_equal_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_equal', 'assert_equal']) +@dispatch.add_dispatch_support @_binary_assert_doc('==') def assert_equal(x, y, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring with ops.name_scope(name, 'assert_equal', [x, y, data]): @@ -660,6 +672,7 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None): # p @tf_export('debugging.assert_none_equal', v1=[]) +@dispatch.add_dispatch_support def assert_none_equal_v2(x, y, summarize=None, message=None, name=None): """Assert the condition `x != y` holds for all elements. @@ -698,6 +711,7 @@ def assert_none_equal_v2(x, y, summarize=None, message=None, name=None): @tf_export(v1=['debugging.assert_none_equal', 'assert_none_equal']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_none_equal') @_binary_assert_doc('!=') def assert_none_equal( @@ -707,6 +721,7 @@ def assert_none_equal( @tf_export('debugging.assert_near', v1=[]) +@dispatch.add_dispatch_support def assert_near_v2(x, y, rtol=None, atol=None, message=None, summarize=None, name=None): """Assert the condition `x` and `y` are close element-wise. @@ -760,6 +775,7 @@ def assert_near_v2(x, y, rtol=None, atol=None, message=None, summarize=None, @tf_export(v1=['debugging.assert_near', 'assert_near']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_near') def assert_near( x, y, rtol=None, atol=None, data=None, summarize=None, message=None, @@ -839,6 +855,7 @@ def assert_near( @tf_export('debugging.assert_less', 'assert_less', v1=[]) +@dispatch.add_dispatch_support def assert_less_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x < y` holds element-wise. @@ -874,6 +891,7 @@ def assert_less_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_less', 'assert_less']) +@dispatch.add_dispatch_support @_binary_assert_doc('<') def assert_less(x, y, data=None, summarize=None, message=None, name=None): return _binary_assert('<', 'assert_less', math_ops.less, np.less, x, y, data, @@ -881,6 +899,7 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_less_equal', v1=[]) +@dispatch.add_dispatch_support def assert_less_equal_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x <= y` holds element-wise. @@ -917,6 +936,7 @@ def assert_less_equal_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_less_equal', 'assert_less_equal']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_less_equal') @_binary_assert_doc('<=') def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None): @@ -925,6 +945,7 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_greater', 'assert_greater', v1=[]) +@dispatch.add_dispatch_support def assert_greater_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x > y` holds element-wise. 
@@ -961,6 +982,7 @@ def assert_greater_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_greater', 'assert_greater']) +@dispatch.add_dispatch_support @_binary_assert_doc('>') def assert_greater(x, y, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring return _binary_assert('>', 'assert_greater', math_ops.greater, np.greater, x, @@ -968,6 +990,7 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None): # @tf_export('debugging.assert_greater_equal', v1=[]) +@dispatch.add_dispatch_support def assert_greater_equal_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x >= y` holds element-wise. @@ -1005,6 +1028,7 @@ def assert_greater_equal_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_greater_equal', 'assert_greater_equal']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_greater_equal') @_binary_assert_doc('>=') def assert_greater_equal(x, y, data=None, summarize=None, message=None, @@ -1062,6 +1086,7 @@ def _assert_rank_condition( @tf_export('debugging.assert_rank', 'assert_rank', v1=[]) +@dispatch.add_dispatch_support def assert_rank_v2(x, rank, message=None, name=None): """Assert that `x` has rank equal to `rank`. @@ -1095,6 +1120,7 @@ def assert_rank_v2(x, rank, message=None, name=None): @tf_export(v1=['debugging.assert_rank', 'assert_rank']) +@dispatch.add_dispatch_support def assert_rank(x, rank, data=None, summarize=None, message=None, name=None): """Assert `x` has rank equal to `rank`. @@ -1157,6 +1183,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_rank_at_least', v1=[]) +@dispatch.add_dispatch_support def assert_rank_at_least_v2(x, rank, message=None, name=None): """Assert that `x` has rank of at least `rank`. @@ -1190,6 +1217,7 @@ def assert_rank_at_least_v2(x, rank, message=None, name=None): @tf_export(v1=['debugging.assert_rank_at_least', 'assert_rank_at_least']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_rank_at_least') def assert_rank_at_least( x, rank, data=None, summarize=None, message=None, name=None): @@ -1322,6 +1350,7 @@ def _assert_ranks_condition( @tf_export('debugging.assert_rank_in', v1=[]) +@dispatch.add_dispatch_support def assert_rank_in_v2(x, ranks, message=None, name=None): """Assert that `x` has a rank in `ranks`. @@ -1354,6 +1383,7 @@ def assert_rank_in_v2(x, ranks, message=None, name=None): @tf_export(v1=['debugging.assert_rank_in', 'assert_rank_in']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_rank_in') def assert_rank_in( x, ranks, data=None, summarize=None, message=None, name=None): @@ -1417,6 +1447,7 @@ def assert_rank_in( @tf_export('debugging.assert_integer', v1=[]) +@dispatch.add_dispatch_support def assert_integer_v2(x, message=None, name=None): """Assert that `x` is of integer dtype. @@ -1437,6 +1468,7 @@ def assert_integer_v2(x, message=None, name=None): @tf_export(v1=['debugging.assert_integer', 'assert_integer']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_integer') def assert_integer(x, message=None, name=None): """Assert that `x` is of integer dtype. @@ -1476,6 +1508,7 @@ def assert_integer(x, message=None, name=None): @tf_export('debugging.assert_type', v1=[]) +@dispatch.add_dispatch_support def assert_type_v2(tensor, tf_type, message=None, name=None): """Asserts that the given `Tensor` is of the specified type. 
@@ -1495,6 +1528,7 @@ def assert_type_v2(tensor, tf_type, message=None, name=None): @tf_export(v1=['debugging.assert_type', 'assert_type']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_type') def assert_type(tensor, tf_type, message=None, name=None): """Statically asserts that the given `Tensor` is of the specified type. @@ -1584,6 +1618,7 @@ _TensorDimSizes = collections.namedtuple( @tf_export('debugging.assert_shapes', v1=[]) +@dispatch.add_dispatch_support def assert_shapes_v2(shapes, data=None, summarize=None, message=None, name=None): """Assert tensor shapes and dimension size relationships between tensors. @@ -1650,6 +1685,7 @@ def assert_shapes_v2(shapes, data=None, summarize=None, message=None, @tf_export(v1=['debugging.assert_shapes']) +@dispatch.add_dispatch_support def assert_shapes(shapes, data=None, summarize=None, message=None, name=None): """Assert tensor shapes and dimension size relationships between tensors. @@ -1939,6 +1975,7 @@ def is_numeric_tensor(tensor): 'math.is_non_decreasing', 'debugging.is_non_decreasing', 'is_non_decreasing' ]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('debugging.is_non_decreasing', 'is_non_decreasing') def is_non_decreasing(x, name=None): @@ -1980,6 +2017,7 @@ def is_non_decreasing(x, name=None): 'math.is_strictly_increasing', 'debugging.is_strictly_increasing', 'is_strictly_increasing' ]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('debugging.is_strictly_increasing', 'is_strictly_increasing') def is_strictly_increasing(x, name=None): @@ -2066,6 +2104,7 @@ def _assert_same_base_type(items, expected_type=None): @tf_export( 'debugging.assert_same_float_dtype', v1=['debugging.assert_same_float_dtype', 'assert_same_float_dtype']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_same_float_dtype') def assert_same_float_dtype(tensors=None, dtype=None): """Validate and return float type based on `tensors` and `dtype`. @@ -2098,6 +2137,7 @@ def assert_same_float_dtype(tensors=None, dtype=None): @tf_export('debugging.assert_scalar', v1=[]) +@dispatch.add_dispatch_support def assert_scalar_v2(tensor, message=None, name=None): """Asserts that the given `tensor` is a scalar. @@ -2120,6 +2160,7 @@ def assert_scalar_v2(tensor, message=None, name=None): @tf_export(v1=['debugging.assert_scalar', 'assert_scalar']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_scalar') def assert_scalar(tensor, name=None, message=None): """Asserts that the given `tensor` is a scalar (i.e. zero-dimensional). @@ -2154,6 +2195,7 @@ def assert_scalar(tensor, name=None, message=None): @tf_export('ensure_shape') +@dispatch.add_dispatch_support def ensure_shape(x, shape, name=None): """Updates the shape of a tensor and checks at runtime that the shape holds. diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py index edb35afa52c..f7662516b4f 100644 --- a/tensorflow/python/ops/clip_ops.py +++ b/tensorflow/python/ops/clip_ops.py @@ -152,6 +152,7 @@ def _clip_by_value_grad(op, grad): @tf_export("clip_by_norm") +@dispatch.add_dispatch_support def clip_by_norm(t, clip_norm, axes=None, name=None): """Clips tensor values to a maximum L2-norm. 
@@ -235,6 +236,7 @@ def clip_by_norm(t, clip_norm, axes=None, name=None): @tf_export("linalg.global_norm", v1=["linalg.global_norm", "global_norm"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("global_norm") def global_norm(t_list, name=None): """Computes the global norm of multiple tensors. @@ -285,6 +287,7 @@ def global_norm(t_list, name=None): @tf_export("clip_by_global_norm") +@dispatch.add_dispatch_support def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None): """Clips values of multiple tensors by the ratio of the sum of their norms. @@ -382,6 +385,7 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None): "use clip_by_norm(t, clip_norm * tf.cast(tf.size(t), tf.float32), name) " "instead.") @tf_export(v1=["clip_by_average_norm"]) +@dispatch.add_dispatch_support def clip_by_average_norm(t, clip_norm, name=None): """Clips tensor values to a maximum average L2-norm. diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py index 3e885975b03..39177defe57 100644 --- a/tensorflow/python/ops/confusion_matrix.py +++ b/tensorflow/python/ops/confusion_matrix.py @@ -27,6 +27,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -93,6 +94,7 @@ def remove_squeezable_dimensions( @tf_export('math.confusion_matrix', v1=[]) +@dispatch.add_dispatch_support def confusion_matrix(labels, predictions, num_classes=None, @@ -202,6 +204,7 @@ def confusion_matrix(labels, @tf_export(v1=['math.confusion_matrix', 'confusion_matrix']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('confusion_matrix', 'train.confusion_matrix') def confusion_matrix_v1(labels, predictions, diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 58948f7d52a..918c989432d 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -54,6 +54,7 @@ from tensorflow.python.ops.gen_control_flow_ops import * from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util import tf_should_use from tensorflow.python.util.lazy_loader import LazyLoader @@ -110,6 +111,7 @@ def _summarize_eager(tensor, summarize=None): # Assert and Print are special symbols in python, so we must # use an upper-case version of them. @tf_export("debugging.Assert", "Assert") +@dispatch.add_dispatch_support @tf_should_use.should_use_result def Assert(condition, data, summarize=None, name=None): """Asserts that the given condition is true. @@ -1095,6 +1097,7 @@ def _UnpackIfSingleton(res): # pylint: disable=redefined-outer-name # pylint: disable=g-doc-args @tf_export(v1=["cond"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args( None, "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.", "fn1", "fn2") @@ -1318,6 +1321,7 @@ def _cast_indexed_slice_indices(a, b): @tf_export("cond", v1=[]) +@dispatch.add_dispatch_support def cond_for_tf_v2(pred, true_fn=None, false_fn=None, name=None): """Return `true_fn()` if the predicate `pred` is true else `false_fn()`. 
@@ -2942,6 +2946,7 @@ def group(*inputs, **kwargs): @tf_export("tuple", v1=[]) +@dispatch.add_dispatch_support def tuple_v2(tensors, control_inputs=None, name=None): """Group tensors together. @@ -2978,6 +2983,7 @@ def tuple_v2(tensors, control_inputs=None, name=None): @tf_export(v1=["tuple"]) +@dispatch.add_dispatch_support def tuple(tensors, name=None, control_inputs=None): # pylint: disable=redefined-builtin """Group tensors together. @@ -3312,6 +3318,7 @@ def _indexed_case_helper(branch_fns, default, branch_index, name): @tf_export("case", v1=[]) +@dispatch.add_dispatch_support def case_v2(pred_fn_pairs, default=None, exclusive=False, @@ -3416,6 +3423,7 @@ def case_v2(pred_fn_pairs, @tf_export(v1=["case"]) +@dispatch.add_dispatch_support def case(pred_fn_pairs, default=None, exclusive=False, diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py index d989bc0be44..6c9cdf1dd08 100644 --- a/tensorflow/python/ops/ctc_ops.py +++ b/tensorflow/python/ops/ctc_ops.py @@ -43,6 +43,7 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.nn_grad import _BroadcastMul from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -70,6 +71,7 @@ def _generate_defun_backend(unique_api_name, preferred_device, func): # pylint: disable=protected-access, invalid-name @tf_export(v1=["nn.ctc_loss"]) +@dispatch.add_dispatch_support def ctc_loss(labels, inputs=None, sequence_length=None, @@ -284,6 +286,7 @@ def _CTCLossV2Grad(op, grad_loss, _): @tf_export("nn.ctc_greedy_decoder") +@dispatch.add_dispatch_support def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True): """Performs greedy decoding on the logits given in input (best path). @@ -333,6 +336,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True): @tf_export(v1=["nn.ctc_beam_search_decoder"]) +@dispatch.add_dispatch_support def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100, @@ -395,6 +399,7 @@ def ctc_beam_search_decoder(inputs, @tf_export("nn.ctc_beam_search_decoder", v1=["nn.ctc_beam_search_decoder_v2"]) +@dispatch.add_dispatch_support def ctc_beam_search_decoder_v2(inputs, sequence_length, beam_width=100, @@ -731,6 +736,7 @@ def _ctc_loss_shape(op): # pylint: disable=protected-access, invalid-name @tf_export(v1=["nn.ctc_loss_v2"]) +@dispatch.add_dispatch_support def ctc_loss_v2(labels, logits, label_length, @@ -825,6 +831,7 @@ def ctc_loss_v2(labels, @tf_export("nn.ctc_loss", v1=[]) +@dispatch.add_dispatch_support def ctc_loss_v3(labels, logits, label_length, @@ -1056,6 +1063,7 @@ def ctc_loss_dense(labels, @tf_export("nn.collapse_repeated") +@dispatch.add_dispatch_support def collapse_repeated(labels, seq_length, name=None): """Merge repeated labels into single labels. @@ -1153,6 +1161,7 @@ def dense_labels_to_sparse(dense, length): @tf_export("nn.ctc_unique_labels") +@dispatch.add_dispatch_support def ctc_unique_labels(labels, name=None): """Get unique labels and indices for batched labels for `tf.nn.ctc_loss`. 
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index 2fdae49b1f6..1c7b204fa58 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -36,6 +36,7 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -250,6 +251,7 @@ def _embedding_lookup_and_transform(params, @tf_export(v1=["nn.embedding_lookup"]) +@dispatch.add_dispatch_support def embedding_lookup( params, ids, @@ -327,6 +329,7 @@ def embedding_lookup( @tf_export("nn.embedding_lookup", v1=[]) +@dispatch.add_dispatch_support def embedding_lookup_v2(params, ids, max_norm=None, name=None): """Looks up embeddings for the given `ids` from a list of tensors. @@ -392,6 +395,7 @@ def embedding_lookup_v2(params, ids, max_norm=None, name=None): @tf_export(v1=["nn.embedding_lookup_sparse"]) +@dispatch.add_dispatch_support def embedding_lookup_sparse(params, sp_ids, sp_weights, @@ -574,6 +578,7 @@ def embedding_lookup_sparse(params, @tf_export("nn.embedding_lookup_sparse", v1=[]) +@dispatch.add_dispatch_support def embedding_lookup_sparse_v2(params, sp_ids, sp_weights, @@ -664,6 +669,7 @@ def embedding_lookup_sparse_v2(params, @tf_export("nn.safe_embedding_lookup_sparse", v1=[]) +@dispatch.add_dispatch_support def safe_embedding_lookup_sparse_v2(embedding_weights, sparse_ids, sparse_weights=None, @@ -765,6 +771,7 @@ def safe_embedding_lookup_sparse_v2(embedding_weights, @tf_export(v1=["nn.safe_embedding_lookup_sparse"]) +@dispatch.add_dispatch_support def safe_embedding_lookup_sparse(embedding_weights, sparse_ids, sparse_weights=None, diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index 8ec925824de..37b41a55eb9 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -38,6 +38,7 @@ from tensorflow.python.ops.gen_functional_ops import remote_call from tensorflow.python.ops.gen_functional_ops import symbolic_gradient from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import function_utils from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -45,6 +46,7 @@ from tensorflow.python.util.tf_export import tf_export # TODO(yuanbyu, mrry): Handle stride to support sliding windows. @tf_export(v1=["foldl"]) +@dispatch.add_dispatch_support def foldl(fn, elems, initializer=None, @@ -162,6 +164,7 @@ def foldl(fn, @tf_export("foldl", v1=[]) +@dispatch.add_dispatch_support @deprecation.deprecated_arg_values( None, """back_prop=False is deprecated. Consider using tf.stop_gradient instead. @@ -238,6 +241,7 @@ def foldl_v2(fn, @tf_export(v1=["foldr"]) +@dispatch.add_dispatch_support def foldr(fn, elems, initializer=None, @@ -356,6 +360,7 @@ def foldr(fn, @tf_export("foldr", v1=[]) +@dispatch.add_dispatch_support @deprecation.deprecated_arg_values( None, """back_prop=False is deprecated. Consider using tf.stop_gradient instead. 
@@ -432,6 +437,7 @@ def foldr_v2(fn, @tf_export(v1=["scan"]) +@dispatch.add_dispatch_support def scan(fn, elems, initializer=None, @@ -686,6 +692,7 @@ def scan(fn, @tf_export("scan", v1=[]) +@dispatch.add_dispatch_support @deprecation.deprecated_arg_values( None, """back_prop=False is deprecated. Consider using tf.stop_gradient instead. diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 92f3e7a24ba..d88025d653c 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -26,10 +26,12 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('histogram_fixed_width_bins') +@dispatch.add_dispatch_support def histogram_fixed_width_bins(values, value_range, nbins=100, @@ -101,6 +103,7 @@ def histogram_fixed_width_bins(values, @tf_export('histogram_fixed_width') +@dispatch.add_dispatch_support def histogram_fixed_width(values, value_range, nbins=100, diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 52b65efad67..4920be213d8 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -39,6 +39,7 @@ from tensorflow.python.ops import sort_ops from tensorflow.python.ops import string_ops from tensorflow.python.ops import variables from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export ops.NotDifferentiable('RandomCrop') @@ -323,6 +324,7 @@ def fix_image_flip_shape(image, result): @tf_export('image.random_flip_up_down') +@dispatch.add_dispatch_support def random_flip_up_down(image, seed=None): """Randomly flips an image vertically (upside down). @@ -363,6 +365,7 @@ def random_flip_up_down(image, seed=None): @tf_export('image.random_flip_left_right') +@dispatch.add_dispatch_support def random_flip_left_right(image, seed=None): """Randomly flip an image horizontally (left to right). @@ -450,6 +453,7 @@ def _random_flip(image, flip_index, seed, scope_name): @tf_export('image.flip_left_right') +@dispatch.add_dispatch_support def flip_left_right(image): """Flip an image horizontally (left to right). @@ -484,6 +488,7 @@ def flip_left_right(image): @tf_export('image.flip_up_down') +@dispatch.add_dispatch_support def flip_up_down(image): """Flip an image vertically (upside down). @@ -549,6 +554,7 @@ def _flip(image, flip_index, scope_name): @tf_export('image.rot90') +@dispatch.add_dispatch_support def rot90(image, k=1, name=None): """Rotate image(s) counter-clockwise by 90 degrees. @@ -660,6 +666,7 @@ def _rot90_4D(images, k, name_scope): @tf_export('image.transpose', v1=['image.transpose', 'image.transpose_image']) +@dispatch.add_dispatch_support def transpose(image, name=None): """Transpose image(s) by swapping the height and width dimension. @@ -718,6 +725,7 @@ def transpose(image, name=None): @tf_export('image.central_crop') +@dispatch.add_dispatch_support def central_crop(image, central_fraction): """Crop the central region of the image(s). 
@@ -850,6 +858,7 @@ def central_crop(image, central_fraction): @tf_export('image.pad_to_bounding_box') +@dispatch.add_dispatch_support def pad_to_bounding_box(image, offset_height, offset_width, target_height, target_width): """Pad `image` with zeros to the specified `height` and `width`. @@ -959,6 +968,7 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height, @tf_export('image.crop_to_bounding_box') +@dispatch.add_dispatch_support def crop_to_bounding_box(image, offset_height, offset_width, target_height, target_width): """Crops an image to a specified bounding box. @@ -1041,6 +1051,7 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height, @tf_export( 'image.resize_with_crop_or_pad', v1=['image.resize_with_crop_or_pad', 'image.resize_image_with_crop_or_pad']) +@dispatch.add_dispatch_support def resize_image_with_crop_or_pad(image, target_height, target_width): """Crops and/or pads an image to a target width and height. @@ -1258,6 +1269,7 @@ def _resize_images_common(images, resizer_fn, size, preserve_aspect_ratio, name, @tf_export(v1=['image.resize_images', 'image.resize']) +@dispatch.add_dispatch_support def resize_images(images, size, method=ResizeMethodV1.BILINEAR, @@ -1343,6 +1355,7 @@ def resize_images(images, @tf_export('image.resize', v1=[]) +@dispatch.add_dispatch_support def resize_images_v2(images, size, method=ResizeMethod.BILINEAR, @@ -1594,6 +1607,7 @@ def _resize_image_with_pad_common(image, target_height, target_width, @tf_export(v1=['image.resize_image_with_pad']) +@dispatch.add_dispatch_support def resize_image_with_pad_v1(image, target_height, target_width, @@ -1636,6 +1650,7 @@ def resize_image_with_pad_v1(image, @tf_export('image.resize_with_pad', v1=[]) +@dispatch.add_dispatch_support def resize_image_with_pad_v2(image, target_height, target_width, @@ -1676,6 +1691,7 @@ def resize_image_with_pad_v2(image, @tf_export('image.per_image_standardization') +@dispatch.add_dispatch_support def per_image_standardization(image): """Linearly scales each image in `image` to have mean 0 and variance 1. @@ -1721,6 +1737,7 @@ def per_image_standardization(image): @tf_export('image.random_brightness') +@dispatch.add_dispatch_support def random_brightness(image, max_delta, seed=None): """Adjust the brightness of images by a random factor. @@ -1756,6 +1773,7 @@ def random_brightness(image, max_delta, seed=None): @tf_export('image.random_contrast') +@dispatch.add_dispatch_support def random_contrast(image, lower, upper, seed=None): """Adjust the contrast of an image or images by a random factor. @@ -1796,6 +1814,7 @@ def random_contrast(image, lower, upper, seed=None): @tf_export('image.adjust_brightness') +@dispatch.add_dispatch_support def adjust_brightness(image, delta): """Adjust the brightness of RGB or Grayscale images. @@ -1847,6 +1866,7 @@ def adjust_brightness(image, delta): @tf_export('image.adjust_contrast') +@dispatch.add_dispatch_support def adjust_contrast(images, contrast_factor): """Adjust contrast of RGB or grayscale images. @@ -1903,6 +1923,7 @@ def adjust_contrast(images, contrast_factor): @tf_export('image.adjust_gamma') +@dispatch.add_dispatch_support def adjust_gamma(image, gamma=1, gain=1): """Performs [Gamma Correction](http://en.wikipedia.org/wiki/Gamma_correction). 
@@ -1967,6 +1988,7 @@ def adjust_gamma(image, gamma=1, gain=1): @tf_export('image.convert_image_dtype') +@dispatch.add_dispatch_support def convert_image_dtype(image, dtype, saturate=False, name=None): """Convert `image` to `dtype`, scaling its values if needed. @@ -2066,6 +2088,7 @@ def convert_image_dtype(image, dtype, saturate=False, name=None): @tf_export('image.rgb_to_grayscale') +@dispatch.add_dispatch_support def rgb_to_grayscale(images, name=None): """Converts one or more images from RGB to Grayscale. @@ -2101,6 +2124,7 @@ def rgb_to_grayscale(images, name=None): @tf_export('image.grayscale_to_rgb') +@dispatch.add_dispatch_support def grayscale_to_rgb(images, name=None): """Converts one or more images from Grayscale to RGB. @@ -2137,6 +2161,7 @@ def grayscale_to_rgb(images, name=None): # pylint: disable=invalid-name @tf_export('image.random_hue') +@dispatch.add_dispatch_support def random_hue(image, max_delta, seed=None): """Adjust the hue of RGB images by a random factor. @@ -2179,6 +2204,7 @@ def random_hue(image, max_delta, seed=None): @tf_export('image.adjust_hue') +@dispatch.add_dispatch_support def adjust_hue(image, delta, name=None): """Adjust hue of RGB images. @@ -2246,6 +2272,7 @@ def adjust_hue(image, delta, name=None): # pylint: disable=invalid-name @tf_export('image.random_jpeg_quality') +@dispatch.add_dispatch_support def random_jpeg_quality(image, min_jpeg_quality, max_jpeg_quality, seed=None): """Randomly changes jpeg encoding quality for inducing jpeg noise. @@ -2293,6 +2320,7 @@ def random_jpeg_quality(image, min_jpeg_quality, max_jpeg_quality, seed=None): @tf_export('image.adjust_jpeg_quality') +@dispatch.add_dispatch_support def adjust_jpeg_quality(image, jpeg_quality, name=None): """Adjust jpeg encoding quality of an image. @@ -2343,6 +2371,7 @@ def adjust_jpeg_quality(image, jpeg_quality, name=None): @tf_export('image.random_saturation') +@dispatch.add_dispatch_support def random_saturation(image, lower, upper, seed=None): """Adjust the saturation of RGB images by a random factor. @@ -2389,6 +2418,7 @@ def random_saturation(image, lower, upper, seed=None): @tf_export('image.adjust_saturation') +@dispatch.add_dispatch_support def adjust_saturation(image, saturation_factor, name=None): """Adjust saturation of RGB images. 
@@ -2480,42 +2510,43 @@ tf_export( 'io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg', v1=['io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg'])( - gen_image_ops.decode_and_crop_jpeg) + dispatch.add_dispatch_support(gen_image_ops.decode_and_crop_jpeg)) tf_export( 'io.decode_bmp', 'image.decode_bmp', v1=['io.decode_bmp', 'image.decode_bmp'])( - gen_image_ops.decode_bmp) + dispatch.add_dispatch_support(gen_image_ops.decode_bmp)) tf_export( 'io.decode_gif', 'image.decode_gif', v1=['io.decode_gif', 'image.decode_gif'])( - gen_image_ops.decode_gif) + dispatch.add_dispatch_support(gen_image_ops.decode_gif)) tf_export( 'io.decode_jpeg', 'image.decode_jpeg', v1=['io.decode_jpeg', 'image.decode_jpeg'])( - gen_image_ops.decode_jpeg) + dispatch.add_dispatch_support(gen_image_ops.decode_jpeg)) tf_export( 'io.decode_png', 'image.decode_png', v1=['io.decode_png', 'image.decode_png'])( - gen_image_ops.decode_png) + dispatch.add_dispatch_support(gen_image_ops.decode_png)) tf_export( 'io.encode_jpeg', 'image.encode_jpeg', v1=['io.encode_jpeg', 'image.encode_jpeg'])( - gen_image_ops.encode_jpeg) + dispatch.add_dispatch_support(gen_image_ops.encode_jpeg)) tf_export( 'io.extract_jpeg_shape', 'image.extract_jpeg_shape', v1=['io.extract_jpeg_shape', 'image.extract_jpeg_shape'])( - gen_image_ops.extract_jpeg_shape) + dispatch.add_dispatch_support(gen_image_ops.extract_jpeg_shape)) @tf_export('io.encode_png', 'image.encode_png') +@dispatch.add_dispatch_support def encode_png(image, compression=-1, name=None): r"""PNG-encode an image. @@ -2548,6 +2579,7 @@ def encode_png(image, compression=-1, name=None): 'io.decode_image', 'image.decode_image', v1=['io.decode_image', 'image.decode_image']) +@dispatch.add_dispatch_support def decode_image(contents, channels=None, dtype=dtypes.uint8, @@ -2661,6 +2693,7 @@ def decode_image(contents, @tf_export('image.total_variation') +@dispatch.add_dispatch_support def total_variation(images, name=None): """Calculate and return the total variation for one or more images. @@ -2732,6 +2765,7 @@ def total_variation(images, name=None): @tf_export('image.sample_distorted_bounding_box', v1=[]) +@dispatch.add_dispatch_support def sample_distorted_bounding_box_v2(image_size, bounding_boxes, seed=0, @@ -2831,6 +2865,7 @@ def sample_distorted_bounding_box_v2(image_size, @tf_export(v1=['image.sample_distorted_bounding_box']) +@dispatch.add_dispatch_support @deprecation.deprecated( date=None, instructions='`seed2` arg is deprecated.' @@ -2945,6 +2980,7 @@ def sample_distorted_bounding_box(image_size, @tf_export('image.non_max_suppression') +@dispatch.add_dispatch_support def non_max_suppression(boxes, scores, max_output_size, @@ -2997,6 +3033,7 @@ def non_max_suppression(boxes, @tf_export('image.non_max_suppression_with_scores') +@dispatch.add_dispatch_support def non_max_suppression_with_scores(boxes, scores, max_output_size, @@ -3083,6 +3120,7 @@ def non_max_suppression_with_scores(boxes, @tf_export('image.non_max_suppression_overlaps') +@dispatch.add_dispatch_support def non_max_suppression_with_overlaps(overlaps, scores, max_output_size, @@ -3134,6 +3172,7 @@ _rgb_to_yiq_kernel = [[0.299, 0.59590059, 0.2115], @tf_export('image.rgb_to_yiq') +@dispatch.add_dispatch_support def rgb_to_yiq(images): """Converts one or more images from RGB to YIQ. @@ -3167,6 +3206,7 @@ _yiq_to_rgb_kernel = [[1, 1, 1], [0.95598634, -0.27201283, -1.10674021], @tf_export('image.yiq_to_rgb') +@dispatch.add_dispatch_support def yiq_to_rgb(images): """Converts one or more images from YIQ to RGB. 
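Two wiring patterns recur throughout these hunks. Ops defined in Python (pad_to_bounding_box, resize_images, encode_png, and so on) take @dispatch.add_dispatch_support as a decorator placed directly under @tf_export. Ops generated from C++ kernel registrations (the gen_image_ops symbols such as decode_png or encode_jpeg) have no def statement to decorate, so the patch instead wraps them at export time, passing dispatch.add_dispatch_support(...) into the callable returned by tf_export. A rough sketch of the two forms, using a stand-in function rather than a real generated op:

    # Illustrative sketch only, not TensorFlow source. `fake_generated_op` is a
    # stand-in for a gen_*_ops function; the dispatch module itself is the one
    # imported throughout this patch.
    from tensorflow.python.util import dispatch


    @dispatch.add_dispatch_support              # pattern 1: decorate a Python-defined op
    def python_defined_op(images, name=None):
      return images


    def fake_generated_op(images, name=None):   # stand-in for e.g. gen_image_ops.decode_png
      return images

    # pattern 2: generated ops cannot be decorated in place, so the wrapped
    # function is what gets registered; in the patch this wrapping happens
    # inside the tf_export(...)(...) call.
    wrapped_generated_op = dispatch.add_dispatch_support(fake_generated_op)

Either way, the object that ends up exported is the dispatch-enabled wrapper.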
@@ -3195,6 +3235,7 @@ _rgb_to_yuv_kernel = [[0.299, -0.14714119, 0.61497538], @tf_export('image.rgb_to_yuv') +@dispatch.add_dispatch_support def rgb_to_yuv(images): """Converts one or more images from RGB to YUV. @@ -3221,6 +3262,7 @@ _yuv_to_rgb_kernel = [[1, 1, 1], [0, -0.394642334, 2.03206185], @tf_export('image.yuv_to_rgb') +@dispatch.add_dispatch_support def yuv_to_rgb(images): """Converts one or more images from YUV to RGB. @@ -3314,6 +3356,7 @@ def _verify_compatible_image_shapes(img1, img2): @tf_export('image.psnr') +@dispatch.add_dispatch_support def psnr(a, b, max_val, name=None): """Returns the Peak Signal-to-Noise Ratio between a and b. @@ -3525,6 +3568,7 @@ def _ssim_per_channel(img1, @tf_export('image.ssim') +@dispatch.add_dispatch_support def ssim(img1, img2, max_val, @@ -3604,6 +3648,7 @@ _MSSSIM_WEIGHTS = (0.0448, 0.2856, 0.3001, 0.2363, 0.1333) @tf_export('image.ssim_multiscale') +@dispatch.add_dispatch_support def ssim_multiscale(img1, img2, max_val, @@ -3731,6 +3776,7 @@ def ssim_multiscale(img1, @tf_export('image.image_gradients') +@dispatch.add_dispatch_support def image_gradients(image): """Returns image gradients (dy, dx) for each color channel. @@ -3804,6 +3850,7 @@ def image_gradients(image): @tf_export('image.sobel_edges') +@dispatch.add_dispatch_support def sobel_edges(image): """Returns a tensor holding Sobel edge maps. @@ -3888,21 +3935,22 @@ resize_area_deprecation = deprecation.deprecated( instructions=( 'Use `tf.image.resize(...method=ResizeMethod.AREA...)` instead.')) tf_export(v1=['image.resize_area'])( - resize_area_deprecation(gen_image_ops.resize_area)) + resize_area_deprecation( + dispatch.add_dispatch_support(gen_image_ops.resize_area))) resize_bicubic_deprecation = deprecation.deprecated( date=None, instructions=( 'Use `tf.image.resize(...method=ResizeMethod.BICUBIC...)` instead.')) tf_export(v1=['image.resize_bicubic'])( - resize_bicubic_deprecation(resize_bicubic)) + dispatch.add_dispatch_support(resize_bicubic_deprecation(resize_bicubic))) resize_bilinear_deprecation = deprecation.deprecated( date=None, instructions=( 'Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.')) tf_export(v1=['image.resize_bilinear'])( - resize_bilinear_deprecation(resize_bilinear)) + dispatch.add_dispatch_support(resize_bilinear_deprecation(resize_bilinear))) resize_nearest_neighbor_deprecation = deprecation.deprecated( date=None, @@ -3910,10 +3958,12 @@ resize_nearest_neighbor_deprecation = deprecation.deprecated( 'Use `tf.image.resize(...method=ResizeMethod.NEAREST_NEIGHBOR...)` ' 'instead.')) tf_export(v1=['image.resize_nearest_neighbor'])( - resize_nearest_neighbor_deprecation(resize_nearest_neighbor)) + dispatch.add_dispatch_support( + resize_nearest_neighbor_deprecation(resize_nearest_neighbor))) @tf_export('image.crop_and_resize', v1=[]) +@dispatch.add_dispatch_support def crop_and_resize_v2(image, boxes, box_indices, @@ -3997,6 +4047,7 @@ def crop_and_resize_v2(image, @tf_export(v1=['image.crop_and_resize']) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, 'box_ind is deprecated, use box_indices instead', 'box_ind') @@ -4019,6 +4070,7 @@ crop_and_resize_v1.__doc__ = gen_image_ops.crop_and_resize.__doc__ @tf_export(v1=['image.extract_glimpse']) +@dispatch.add_dispatch_support def extract_glimpse( input, # pylint: disable=redefined-builtin size, @@ -4104,6 +4156,7 @@ def extract_glimpse( @tf_export('image.extract_glimpse', v1=[]) +@dispatch.add_dispatch_support def extract_glimpse_v2( input, # pylint: disable=redefined-builtin 
size, @@ -4190,6 +4243,7 @@ def extract_glimpse_v2( @tf_export('image.combined_non_max_suppression') +@dispatch.add_dispatch_support def combined_non_max_suppression(boxes, scores, max_output_size_per_class, @@ -4442,6 +4496,7 @@ def _suppression_loop_body(boxes, iou_threshold, output_size, idx, tile_size): @tf_export('image.non_max_suppression_padded') +@dispatch.add_dispatch_support def non_max_suppression_padded(boxes, scores, max_output_size, @@ -4816,6 +4871,7 @@ def non_max_suppression_padded_v1(boxes, @tf_export('image.draw_bounding_boxes', v1=[]) +@dispatch.add_dispatch_support def draw_bounding_boxes_v2(images, boxes, colors, name=None): """Draw bounding boxes on a batch of images. @@ -4870,6 +4926,7 @@ def draw_bounding_boxes_v2(images, boxes, colors, name=None): @tf_export(v1=['image.draw_bounding_boxes']) +@dispatch.add_dispatch_support def draw_bounding_boxes(images, boxes, name=None, colors=None): """Draw bounding boxes on a batch of images. @@ -4922,6 +4979,7 @@ def draw_bounding_boxes(images, boxes, name=None, colors=None): @tf_export('image.generate_bounding_box_proposals') +@dispatch.add_dispatch_support def generate_bounding_box_proposals(scores, bbox_deltas, image_info, diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py index f7617d83caf..82acd09caec 100644 --- a/tensorflow/python/ops/linalg/linalg_impl.py +++ b/tensorflow/python/ops/linalg/linalg_impl.py @@ -41,7 +41,7 @@ cholesky = linalg_ops.cholesky cholesky_solve = linalg_ops.cholesky_solve det = linalg_ops.matrix_determinant slogdet = gen_linalg_ops.log_matrix_determinant -tf_export('linalg.slogdet')(slogdet) +tf_export('linalg.slogdet')(dispatch.add_dispatch_support(slogdet)) diag = array_ops.matrix_diag diag_part = array_ops.matrix_diag_part eigh = linalg_ops.self_adjoint_eig @@ -51,7 +51,7 @@ eye = linalg_ops.eye inv = linalg_ops.matrix_inverse logm = gen_linalg_ops.matrix_logarithm lu = gen_linalg_ops.lu -tf_export('linalg.logm')(logm) +tf_export('linalg.logm')(dispatch.add_dispatch_support(logm)) lstsq = linalg_ops.matrix_solve_ls norm = linalg_ops.norm qr = linalg_ops.qr @@ -230,6 +230,7 @@ def _matrix_exp_pade13(matrix): @tf_export('linalg.expm') +@dispatch.add_dispatch_support def matrix_exponential(input, name=None): # pylint: disable=redefined-builtin r"""Computes the matrix exponential of one or more square matrices. @@ -340,6 +341,7 @@ def matrix_exponential(input, name=None): # pylint: disable=redefined-builtin @tf_export('linalg.tridiagonal_solve') +@dispatch.add_dispatch_support def tridiagonal_solve(diagonals, rhs, diagonals_format='compact', @@ -541,6 +543,7 @@ def _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs, @tf_export('linalg.tridiagonal_matmul') +@dispatch.add_dispatch_support def tridiagonal_matmul(diagonals, rhs, diagonals_format='compact', name=None): r"""Multiplies tridiagonal matrix by matrix. @@ -638,6 +641,7 @@ def _maybe_validate_matrix(a, validate_args): @tf_export('linalg.matrix_rank') +@dispatch.add_dispatch_support def matrix_rank(a, tol=None, validate_args=False, name=None): """Compute the matrix rank of one or more matrices. @@ -676,6 +680,7 @@ def matrix_rank(a, tol=None, validate_args=False, name=None): @tf_export('linalg.pinv') +@dispatch.add_dispatch_support def pinv(a, rcond=None, validate_args=False, name=None): """Compute the Moore-Penrose pseudo-inverse of one or more matrices. 
@@ -805,6 +810,7 @@ def pinv(a, rcond=None, validate_args=False, name=None): @tf_export('linalg.lu_solve') +@dispatch.add_dispatch_support def lu_solve(lower_upper, perm, rhs, validate_args=False, name=None): """Solves systems of linear eqns `A X = RHS`, given LU factorizations. @@ -902,6 +908,7 @@ def lu_solve(lower_upper, perm, rhs, validate_args=False, name=None): @tf_export('linalg.lu_matrix_inverse') +@dispatch.add_dispatch_support def lu_matrix_inverse(lower_upper, perm, validate_args=False, name=None): """Computes the inverse given the LU decomposition(s) of one or more matrices. @@ -966,6 +973,7 @@ def lu_matrix_inverse(lower_upper, perm, validate_args=False, name=None): @tf_export('linalg.lu_reconstruct') +@dispatch.add_dispatch_support def lu_reconstruct(lower_upper, perm, validate_args=False, name=None): """The reconstruct one or more matrices from their LU decomposition(s). diff --git a/tensorflow/python/ops/linalg/sparse/conjugate_gradient.py b/tensorflow/python/ops/linalg/sparse/conjugate_gradient.py index 613309f856d..6794636c3fd 100644 --- a/tensorflow/python/ops/linalg/sparse/conjugate_gradient.py +++ b/tensorflow/python/ops/linalg/sparse/conjugate_gradient.py @@ -27,10 +27,12 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.linalg import linalg_impl as linalg +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('linalg.experimental.conjugate_gradient') +@dispatch.add_dispatch_support def conjugate_gradient(operator, rhs, preconditioner=None, diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index abca7df19e0..03b7b98119d 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops.gen_linalg_ops import * # pylint: enable=wildcard-import from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export # Names below are lower_case. @@ -82,6 +83,7 @@ def _RegularizedGramianCholesky(matrix, l2_regularizer, first_kind): @tf_export( 'linalg.triangular_solve', v1=['linalg.triangular_solve', 'matrix_triangular_solve']) +@dispatch.add_dispatch_support def matrix_triangular_solve(matrix, rhs, lower=True, adjoint=False, name=None): """Solve systems of linear equations with upper or lower triangular matrices. @@ -143,6 +145,7 @@ def matrix_triangular_solve(matrix, rhs, lower=True, adjoint=False, name=None): @tf_export( 'linalg.cholesky_solve', v1=['linalg.cholesky_solve', 'cholesky_solve']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('cholesky_solve') def cholesky_solve(chol, rhs, name=None): """Solves systems of linear eqns `A X = RHS`, given Cholesky factorizations. @@ -187,6 +190,7 @@ def cholesky_solve(chol, rhs, name=None): @tf_export('eye', 'linalg.eye') +@dispatch.add_dispatch_support def eye(num_rows, num_columns=None, batch_shape=None, @@ -234,6 +238,7 @@ def eye(num_rows, @tf_export('linalg.lstsq', v1=['linalg.lstsq', 'matrix_solve_ls']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('matrix_solve_ls') def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None): r"""Solves one or more linear least-squares problems. 
@@ -371,6 +376,7 @@ def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None): @tf_export('linalg.eig', 'eig', v1=[]) +@dispatch.add_dispatch_support def eig(tensor, name=None): """Computes the eigen decomposition of a batch of matrices. @@ -401,6 +407,7 @@ def eig(tensor, name=None): @tf_export('linalg.eigvals', 'eigvals', v1=[]) +@dispatch.add_dispatch_support def eigvals(tensor, name=None): """Computes the eigenvalues of one or more matrices. @@ -427,6 +434,7 @@ def eigvals(tensor, name=None): @tf_export('linalg.eigh', v1=['linalg.eigh', 'self_adjoint_eig']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('self_adjoint_eig') def self_adjoint_eig(tensor, name=None): """Computes the eigen decomposition of a batch of self-adjoint matrices. @@ -450,6 +458,7 @@ def self_adjoint_eig(tensor, name=None): @tf_export('linalg.eigvalsh', v1=['linalg.eigvalsh', 'self_adjoint_eigvals']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('self_adjoint_eigvals') def self_adjoint_eigvals(tensor, name=None): """Computes the eigenvalues of one or more self-adjoint matrices. @@ -473,6 +482,7 @@ def self_adjoint_eigvals(tensor, name=None): @tf_export('linalg.svd', v1=['linalg.svd', 'svd']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('svd') def svd(tensor, full_matrices=False, compute_uv=True, name=None): r"""Computes the singular value decompositions of one or more matrices. @@ -544,6 +554,7 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None): # pylint: disable=redefined-builtin @tf_export('norm', 'linalg.norm', v1=[]) +@dispatch.add_dispatch_support def norm_v2(tensor, ord='euclidean', axis=None, @@ -615,6 +626,7 @@ def norm_v2(tensor, # pylint: disable=redefined-builtin @tf_export(v1=['norm', 'linalg.norm']) +@dispatch.add_dispatch_support @deprecation.deprecated_args( None, 'keep_dims is deprecated, use keepdims instead', 'keep_dims') def norm(tensor, diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 7e980a0dbb3..8ca63f55987 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -38,6 +38,7 @@ from tensorflow.python.ops import string_ops from tensorflow.python.ops.gen_logging_ops import * # pylint: enable=wildcard-import from tensorflow.python.platform import tf_logging +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util.deprecation import deprecated from tensorflow.python.util.tf_export import tf_export @@ -71,6 +72,7 @@ except NameError: "only a concern in graph mode. Below is an example " "of how to ensure tf.print executes in graph mode:\n") @tf_export(v1=["Print"]) +@dispatch.add_dispatch_support def Print(input_, data, message=None, first_n=None, summarize=None, name=None): """Prints a list of tensors. @@ -136,6 +138,7 @@ def _is_filepath(output_stream): # function definition. # pylint: disable=g-doc-args @tf_export("print") +@dispatch.add_dispatch_support def print_v2(*inputs, **kwargs): """Print the specified inputs. 
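At call time the added wrapper changes nothing for ordinary tensor inputs; it only matters when the wrapped API rejects its arguments. Roughly, the wrapper tries the original implementation, and if that raises a TypeError or ValueError it offers the call to any registered dispatchers (the hook that RaggedTensor and similar types rely on) before re-raising. The toy below re-implements that idea in plain Python to make the control flow visible; it is a simplification with invented names, not TensorFlow's actual dispatch module:

    # Simplified stand-in for the dispatch fallback behavior.
    _HANDLERS = []

    def add_dispatch_support_sketch(api):
      def wrapper(*args, **kwargs):
        try:
          return api(*args, **kwargs)            # normal path for plain inputs
        except (TypeError, ValueError):
          for handler in _HANDLERS:              # give registered handlers a chance
            result = handler(args, kwargs)
            if result is not NotImplemented:
              return result
          raise                                  # nobody claimed the call
      return wrapper

    @add_dispatch_support_sketch
    def scale(x, factor):
      return x * factor                          # fails for dict inputs

    def dict_handler(args, kwargs):
      x, factor = args
      if isinstance(x, dict):                    # a stand-in "custom type"
        return {k: v * factor for k, v in x.items()}
      return NotImplemented

    _HANDLERS.append(dict_handler)

    print(scale(2.0, 3.0))                       # 6.0, never touches the handlers
    print(scale({"a": 1, "b": 2}, 3))            # {'a': 3, 'b': 6} via dict_handler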
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py index 556c646f2a7..6a7b4b68420 100644 --- a/tensorflow/python/ops/losses/losses_impl.py +++ b/tensorflow/python/ops/losses/losses_impl.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import nn from tensorflow.python.ops import nn_ops from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.ops.losses import util +from tensorflow.python.util import dispatch from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.deprecation import deprecated_argument_lookup from tensorflow.python.util.tf_export import tf_export @@ -136,6 +137,7 @@ def _num_elements(losses): @tf_export(v1=["losses.compute_weighted_loss"]) +@dispatch.add_dispatch_support def compute_weighted_loss( losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -204,6 +206,7 @@ def compute_weighted_loss( @tf_export(v1=["losses.absolute_difference"]) +@dispatch.add_dispatch_support def absolute_difference( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -257,6 +260,7 @@ def absolute_difference( @tf_export(v1=["losses.cosine_distance"]) +@dispatch.add_dispatch_support @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def cosine_distance( labels, predictions, axis=None, weights=1.0, scope=None, @@ -313,6 +317,7 @@ def cosine_distance( @tf_export(v1=["losses.hinge_loss"]) +@dispatch.add_dispatch_support def hinge_loss(labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -363,6 +368,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None, @tf_export(v1=["losses.huber_loss"]) +@dispatch.add_dispatch_support def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -439,6 +445,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, @tf_export(v1=["losses.log_loss"]) +@dispatch.add_dispatch_support def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -496,6 +503,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, # TODO(b/37208492): Add reduction arg. 
@tf_export(v1=["losses.mean_pairwise_squared_error"]) +@dispatch.add_dispatch_support def mean_pairwise_squared_error( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES): @@ -592,6 +600,7 @@ def mean_pairwise_squared_error( @tf_export(v1=["losses.mean_squared_error"]) +@dispatch.add_dispatch_support def mean_squared_error( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -645,6 +654,7 @@ def mean_squared_error( @tf_export(v1=["losses.sigmoid_cross_entropy"]) +@dispatch.add_dispatch_support def sigmoid_cross_entropy( multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -709,6 +719,7 @@ def sigmoid_cross_entropy( @tf_export(v1=["losses.softmax_cross_entropy"]) +@dispatch.add_dispatch_support def softmax_cross_entropy( onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -831,6 +842,7 @@ def _remove_squeezable_dimensions( @tf_export(v1=["losses.sparse_softmax_cross_entropy"]) +@dispatch.add_dispatch_support def sparse_softmax_cross_entropy( labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py index 56e8a894c24..fe99696f82f 100644 --- a/tensorflow/python/ops/manip_ops.py +++ b/tensorflow/python/ops/manip_ops.py @@ -20,11 +20,13 @@ from __future__ import print_function from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export # pylint: disable=protected-access @tf_export('roll', v1=['roll', 'manip.roll']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('manip.roll') def roll(input, shift, axis, name=None): # pylint: disable=redefined-builtin return _gen_manip_ops.roll(input, shift, axis, name) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 4c4982c6fd5..31994c16ddd 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -104,6 +104,7 @@ nextafter = gen_math_ops.next_after @tf_export("linspace", v1=["lin_space", "linspace"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("lin_space") def linspace_nd(start, stop, num, name=None, axis=0): r"""Generates evenly-spaced values in an interval along a given axis. @@ -214,8 +215,8 @@ linspace = linspace_nd arg_max = deprecation.deprecated(None, "Use `tf.math.argmax` instead")(arg_max) # pylint: disable=used-before-assignment arg_min = deprecation.deprecated(None, "Use `tf.math.argmin` instead")(arg_min) # pylint: disable=used-before-assignment -tf_export(v1=["arg_max"])(arg_max) -tf_export(v1=["arg_min"])(arg_min) +tf_export(v1=["arg_max"])(dispatch.add_dispatch_support(arg_max)) +tf_export(v1=["arg_min"])(dispatch.add_dispatch_support(arg_min)) # This is set by resource_variable_ops.py. 
It is included in this way since @@ -234,6 +235,7 @@ def _set_doc(doc): # pylint: disable=redefined-builtin @tf_export(v1=["math.argmax", "argmax"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dimension") @_set_doc( @@ -250,6 +252,7 @@ def argmax(input, @tf_export("math.argmax", "argmax", v1=[]) +@dispatch.add_dispatch_support def argmax_v2(input, axis=None, output_type=dtypes.int64, name=None): """Returns the index with the largest value across axes of a tensor. @@ -283,6 +286,7 @@ def argmax_v2(input, axis=None, output_type=dtypes.int64, name=None): @tf_export(v1=["math.argmin", "argmin"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dimension") @_set_doc( @@ -299,6 +303,7 @@ def argmin(input, @tf_export("math.argmin", "argmin", v1=[]) +@dispatch.add_dispatch_support def argmin_v2(input, axis=None, output_type=dtypes.int64, name=None): """Returns the index with the smallest value across axes of a tensor. @@ -549,6 +554,7 @@ def _neg(x, name=None): @tf_export(v1=["math.scalar_mul", "scalar_mul"]) +@dispatch.add_dispatch_support def scalar_mul(scalar, x, name=None): """Multiplies a scalar times a `Tensor` or `IndexedSlices` object. @@ -581,6 +587,7 @@ def scalar_mul(scalar, x, name=None): @tf_export("math.scalar_mul", "scalar_mul", v1=[]) +@dispatch.add_dispatch_support @_set_doc(scalar_mul.__doc__) def scalar_mul_v2(scalar, x, name=None): with ops.name_scope(name, "scalar_mul", [x]) as name: @@ -701,6 +708,7 @@ def sign(x, name=None): @tf_export("math.real", v1=["math.real", "real"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("real") @dispatch.add_dispatch_support def real(input, name=None): @@ -735,6 +743,7 @@ def real(input, name=None): @tf_export("math.imag", v1=["math.imag", "imag"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("imag") @dispatch.add_dispatch_support def imag(input, name=None): @@ -768,6 +777,7 @@ def imag(input, name=None): @tf_export("math.angle", v1=["math.angle", "angle"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("angle") @dispatch.add_dispatch_support def angle(input, name=None): @@ -937,6 +947,7 @@ def saturate_cast(value, dtype, name=None): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_float"]) +@dispatch.add_dispatch_support def to_float(x, name="ToFloat"): """Casts a tensor to type `float32`. @@ -956,6 +967,7 @@ def to_float(x, name="ToFloat"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_double"]) +@dispatch.add_dispatch_support def to_double(x, name="ToDouble"): """Casts a tensor to type `float64`. @@ -975,6 +987,7 @@ def to_double(x, name="ToDouble"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_int32"]) +@dispatch.add_dispatch_support def to_int32(x, name="ToInt32"): """Casts a tensor to type `int32`. @@ -994,6 +1007,7 @@ def to_int32(x, name="ToInt32"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_int64"]) +@dispatch.add_dispatch_support def to_int64(x, name="ToInt64"): """Casts a tensor to type `int64`. @@ -1013,6 +1027,7 @@ def to_int64(x, name="ToInt64"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_bfloat16"]) +@dispatch.add_dispatch_support def to_bfloat16(x, name="ToBFloat16"): """Casts a tensor to type `bfloat16`. 
@@ -1032,6 +1047,7 @@ def to_bfloat16(x, name="ToBFloat16"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_complex64"]) +@dispatch.add_dispatch_support def to_complex64(x, name="ToComplex64"): """Casts a tensor to type `complex64`. @@ -1051,6 +1067,7 @@ def to_complex64(x, name="ToComplex64"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_complex128"]) +@dispatch.add_dispatch_support def to_complex128(x, name="ToComplex128"): """Casts a tensor to type `complex128`. @@ -1265,6 +1282,7 @@ def truediv(x, y, name=None): date=None, instructions="Deprecated in favor of operator or tf.math.divide.") @tf_export(v1=["div"]) +@dispatch.add_dispatch_support def div(x, y, name=None): """Divides x / y elementwise (using Python 2 division operator semantics). @@ -1288,6 +1306,7 @@ def div(x, y, name=None): @tf_export("math.divide_no_nan", v1=["math.divide_no_nan", "div_no_nan"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("div_no_nan") @dispatch.add_dispatch_support def div_no_nan(x, y, name=None): @@ -1620,6 +1639,7 @@ ops.Tensor._override_operator("__ne__", tensor_not_equals) @tf_export("range") +@dispatch.add_dispatch_support def range(start, limit=None, delta=1, dtype=None, name="range"): # pylint: disable=redefined-builtin """Creates a sequence of numbers. @@ -1751,6 +1771,7 @@ def _may_reduce_to_scalar(keepdims, axis, output): @tf_export(v1=["math.reduce_sum", "reduce_sum"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -1885,6 +1906,7 @@ def reduce_sum_with_dims(input_tensor, @tf_export("math.reduce_euclidean_norm") +@dispatch.add_dispatch_support def reduce_euclidean_norm(input_tensor, axis=None, keepdims=False, name=None): """Computes the Euclidean norm of elements across dimensions of a tensor. @@ -1928,6 +1950,7 @@ def reduce_euclidean_norm(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.count_nonzero", "count_nonzero"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2005,6 +2028,7 @@ def count_nonzero(input_tensor=None, @tf_export("math.count_nonzero", v1=[]) +@dispatch.add_dispatch_support def count_nonzero_v2( input, # pylint: disable=redefined-builtin axis=None, @@ -2072,6 +2096,7 @@ def count_nonzero_v2( @tf_export(v1=["math.reduce_mean", "reduce_mean"]) +@dispatch.add_dispatch_support def reduce_mean_v1(input_tensor, axis=None, keepdims=None, @@ -2198,6 +2223,7 @@ def reduce_mean(input_tensor, axis=None, keepdims=False, name=None): @tf_export("math.reduce_variance") +@dispatch.add_dispatch_support def reduce_variance(input_tensor, axis=None, keepdims=False, name=None): """Computes the variance of elements across dimensions of a tensor. @@ -2246,6 +2272,7 @@ def reduce_variance(input_tensor, axis=None, keepdims=False, name=None): @tf_export("math.reduce_std") +@dispatch.add_dispatch_support def reduce_std(input_tensor, axis=None, keepdims=False, name=None): """Computes the standard deviation of elements across dimensions of a tensor. 
@@ -2328,6 +2355,7 @@ def reduce_prod(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.reduce_prod", "reduce_prod"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2373,6 +2401,7 @@ def reduce_prod_v1(input_tensor, @tf_export(v1=["math.reduce_min", "reduce_min"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2459,6 +2488,7 @@ def reduce_min(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.reduce_max", "reduce_max"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2563,6 +2593,7 @@ def reduce_max_with_dims(input_tensor, @tf_export(v1=["math.reduce_all", "reduce_all"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2662,6 +2693,7 @@ def reduce_all(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.reduce_any", "reduce_any"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2761,6 +2793,7 @@ def reduce_any(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.reduce_logsumexp", "reduce_logsumexp"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2817,6 +2850,7 @@ def reduce_logsumexp_v1(input_tensor, @tf_export("math.reduce_logsumexp", "reduce_logsumexp", v1=[]) +@dispatch.add_dispatch_support def reduce_logsumexp(input_tensor, axis=None, keepdims=False, name=None): """Computes log(sum(exp(elements across dimensions of a tensor))). @@ -2877,6 +2911,7 @@ def reduce_logsumexp(input_tensor, axis=None, keepdims=False, name=None): @tf_export("linalg.trace", v1=["linalg.trace", "trace"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("trace") @dispatch.add_dispatch_support def trace(x, name=None): @@ -3116,6 +3151,7 @@ def matmul(a, @tf_export("linalg.matvec") +@dispatch.add_dispatch_support def matvec(a, b, transpose_a=False, @@ -3219,6 +3255,7 @@ _OverrideBinaryOperatorHelper(matmul, "matmul") sparse_matmul = deprecation.deprecated(None, "Use `tf.linalg.matmul` instead")( gen_math_ops.sparse_mat_mul) tf_export(v1=["sparse_matmul"])(sparse_matmul) +@dispatch.add_dispatch_support @ops.RegisterStatistics("MatMul", "flops") @@ -3371,6 +3408,7 @@ def add_n(inputs, name=None): @tf_export("math.accumulate_n", v1=["math.accumulate_n", "accumulate_n"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("accumulate_n") def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None): """Returns the element-wise sum of a list of tensors. @@ -3449,6 +3487,7 @@ def _accumulate_n_grad(op, grad): @tf_export("math.sigmoid", "nn.sigmoid", "sigmoid") +@dispatch.add_dispatch_support def sigmoid(x, name=None): r"""Computes sigmoid of `x` element-wise. 
@@ -3521,6 +3560,7 @@ def log_sigmoid(x, name=None): @tf_export("math.bincount", v1=[]) +@dispatch.add_dispatch_support def bincount(arr, weights=None, minlength=None, @@ -3596,6 +3636,7 @@ def bincount(arr, @tf_export(v1=["math.bincount", "bincount"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("bincount") def bincount_v1(arr, weights=None, @@ -3629,6 +3670,7 @@ def bincount_v1(arr, @tf_export("math.cumsum", "cumsum") +@dispatch.add_dispatch_support def cumsum(x, axis=0, exclusive=False, reverse=False, name=None): """Compute the cumulative sum of the tensor `x` along `axis`. @@ -3700,6 +3742,7 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None): @tf_export("math.cumprod", v1=["math.cumprod", "cumprod"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("cumprod") def cumprod(x, axis=0, exclusive=False, reverse=False, name=None): """Compute the cumulative product of the tensor `x` along `axis`. @@ -3753,6 +3796,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None): @tf_export("math.cumulative_logsumexp", v1=["math.cumulative_logsumexp"]) +@dispatch.add_dispatch_support def cumulative_logsumexp(x, axis=0, exclusive=False, reverse=False, name=None): """Compute the cumulative log-sum-exp of the tensor `x` along `axis`. @@ -3912,6 +3956,7 @@ def _unsorted_segment_N(data, segment_ids, num_segments): @tf_export( "math.unsorted_segment_mean", v1=["math.unsorted_segment_mean", "unsorted_segment_mean"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("unsorted_segment_mean") @dispatch.add_dispatch_support def unsorted_segment_mean(data, segment_ids, num_segments, name=None): @@ -3958,6 +4003,7 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None): @tf_export( "math.unsorted_segment_sqrt_n", v1=["math.unsorted_segment_sqrt_n", "unsorted_segment_sqrt_n"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("unsorted_segment_sqrt_n") @dispatch.add_dispatch_support def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None): @@ -4307,6 +4353,7 @@ def sparse_segment_sqrt_n_v2(data, @tf_export("tensordot", "linalg.tensordot") +@dispatch.add_dispatch_support def tensordot(a, b, axes, name=None): r"""Tensor contraction of a and b along specified axes and outer product. @@ -4493,6 +4540,7 @@ def tensordot(a, b, axes, name=None): @tf_export("math.polyval") +@dispatch.add_dispatch_support def polyval(coeffs, x, name=None): r"""Computes the elementwise value of a polynomial. @@ -4563,6 +4611,7 @@ def polyval(coeffs, x, name=None): @tf_export("math.reciprocal_no_nan") +@dispatch.add_dispatch_support def reciprocal_no_nan(x, name=None): """Performs a safe reciprocal operation, element wise. @@ -4665,6 +4714,7 @@ def ndtri(x, name=None): @tf_export("math.ceil", v1=["math.ceil", "ceil"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("ceil") @dispatch.add_dispatch_support def ceil(x, name=None): @@ -4778,6 +4828,7 @@ def exp(x, name=None): @tf_export("math.sobol_sample") +@dispatch.add_dispatch_support def sobol_sample(dim, num_results, skip=0, dtype=dtypes.float32, name=None): """Generates points from the Sobol sequence. 
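For a custom tensor-like class, the practical effect of these decorators is that the class can now claim calls to the decorated math ops. One way to register such an override in this era of the codebase is dispatch.dispatch_for_types, a private helper in tensorflow.python.util.dispatch. The sketch below assumes those internal helpers behave as described here, and uses an invented MaskedValue class and an invented scale op rather than one of the real tf.math functions touched above:

    # Hedged sketch: private TensorFlow utilities, with MaskedValue and scale
    # made up for illustration.
    from tensorflow.python.util import dispatch


    class MaskedValue(object):
      def __init__(self, value, mask):
        self.value, self.mask = value, mask


    @dispatch.add_dispatch_support
    def scale(x, factor, name=None):
      return x * factor                          # plain path; rejects MaskedValue


    @dispatch.dispatch_for_types(scale, MaskedValue)
    def scale_for_masked(x, factor, name=None):  # signature must match `scale`
      return MaskedValue(scale(x.value, factor), x.mask)


    print(scale(2.0, 3.0))                           # 6.0, plain path
    print(scale(MaskedValue(2.0, True), 3.0).value)  # 6.0, via the override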
@@ -4802,6 +4853,7 @@ def sobol_sample(dim, num_results, skip=0, dtype=dtypes.float32, name=None): @tf_export("math.rsqrt", v1=["math.rsqrt", "rsqrt"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("rsqrt") @dispatch.add_dispatch_support def rsqrt(x, name=None): diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 03c1289246e..4bda85077bc 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -39,12 +39,14 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import variables from tensorflow.python.ops.losses import util as losses_util from tensorflow.python.platform import device_context +from tensorflow.python.util import dispatch from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.deprecation import deprecated_argument_lookup from tensorflow.python.util.tf_export import tf_export @tf_export("nn.log_poisson_loss") +@dispatch.add_dispatch_support def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None): """Computes log Poisson loss given `log_input`. @@ -110,6 +112,7 @@ def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None): @tf_export(v1=["nn.sigmoid_cross_entropy_with_logits"]) +@dispatch.add_dispatch_support def sigmoid_cross_entropy_with_logits( # pylint: disable=invalid-name _sentinel=None, labels=None, @@ -192,6 +195,7 @@ def sigmoid_cross_entropy_with_logits( # pylint: disable=invalid-name # Note: intentionally calling this v2 to not allow existing code with indirect # imports to ignore the sentinel behavior. @tf_export("nn.sigmoid_cross_entropy_with_logits", v1=[]) +@dispatch.add_dispatch_support def sigmoid_cross_entropy_with_logits_v2( # pylint: disable=invalid-name labels=None, logits=None, @@ -242,6 +246,7 @@ def sigmoid_cross_entropy_with_logits_v2( # pylint: disable=invalid-name @tf_export("nn.weighted_cross_entropy_with_logits", v1=[]) +@dispatch.add_dispatch_support def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight, name=None): """Computes a weighted cross entropy. @@ -320,6 +325,7 @@ def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight, @tf_export(v1=["nn.weighted_cross_entropy_with_logits"]) +@dispatch.add_dispatch_support @deprecated_args(None, "targets is deprecated, use labels instead", "targets") def weighted_cross_entropy_with_logits(labels=None, logits=None, @@ -384,6 +390,7 @@ def weighted_cross_entropy_with_logits(labels=None, @tf_export("nn.compute_average_loss") +@dispatch.add_dispatch_support def compute_average_loss(per_example_loss, sample_weight=None, global_batch_size=None): @@ -440,6 +447,7 @@ def compute_average_loss(per_example_loss, @tf_export("nn.scale_regularization_loss") +@dispatch.add_dispatch_support def scale_regularization_loss(regularization_loss): """Scales the sum of the given regularization losses by number of replicas. @@ -478,6 +486,7 @@ def scale_regularization_loss(regularization_loss): @tf_export(v1=["nn.relu_layer"]) +@dispatch.add_dispatch_support def relu_layer(x, weights, biases, name=None): """Computes Relu(x * weight + biases). 
@@ -501,6 +510,7 @@ def relu_layer(x, weights, biases, name=None): @tf_export("nn.swish") +@dispatch.add_dispatch_support @custom_gradient.custom_gradient def swish(features): # pylint: disable=g-doc-args @@ -538,6 +548,7 @@ def swish(features): # pylint: disable=redefined-builtin @tf_export("linalg.normalize") +@dispatch.add_dispatch_support def normalize(tensor, ord="euclidean", axis=None, name=None): """Normalizes `tensor` along dimension `axis` using specified norm. @@ -590,6 +601,7 @@ def normalize(tensor, ord="euclidean", axis=None, name=None): @tf_export(v1=["math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize"]) +@dispatch.add_dispatch_support @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None): """Normalizes along dimension `axis` using an L2 norm. @@ -618,6 +630,7 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None): @tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize", v1=[]) +@dispatch.add_dispatch_support def l2_normalize_v2(x, axis=None, epsilon=1e-12, name=None): """Normalizes along dimension `axis` using an L2 norm. @@ -668,6 +681,7 @@ def _count_nonzero(input_tensor, dtype=dtypes.int64): @tf_export("math.zero_fraction", "nn.zero_fraction") +@dispatch.add_dispatch_support def zero_fraction(value, name=None): """Returns the fraction of zeros in `value`. @@ -710,6 +724,7 @@ def zero_fraction(value, name=None): # pylint: disable=redefined-builtin @tf_export(v1=["nn.depthwise_conv2d"]) +@dispatch.add_dispatch_support def depthwise_conv2d(input, filter, strides, @@ -838,6 +853,7 @@ def depthwise_conv2d(input, @tf_export("nn.depthwise_conv2d", v1=[]) +@dispatch.add_dispatch_support def depthwise_conv2d_v2(input, filter, strides, @@ -935,6 +951,7 @@ def depthwise_conv2d_v2(input, # pylint: disable=redefined-builtin,line-too-long @tf_export(v1=["nn.separable_conv2d"]) +@dispatch.add_dispatch_support def separable_conv2d(input, depthwise_filter, pointwise_filter, @@ -1042,6 +1059,7 @@ def separable_conv2d(input, @tf_export("nn.separable_conv2d", v1=[]) +@dispatch.add_dispatch_support def separable_conv2d_v2( input, depthwise_filter, @@ -1117,6 +1135,7 @@ def separable_conv2d_v2( @tf_export(v1=["nn.sufficient_statistics"]) +@dispatch.add_dispatch_support def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None, keepdims=None): """Calculate the sufficient statistics for the mean and variance of `x`. @@ -1174,6 +1193,7 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None, @tf_export("nn.sufficient_statistics", v1=[]) +@dispatch.add_dispatch_support def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None): """Calculate the sufficient statistics for the mean and variance of `x`. @@ -1203,6 +1223,7 @@ def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None): @tf_export("nn.normalize_moments") +@dispatch.add_dispatch_support def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance of based on the sufficient statistics. 
@@ -1235,6 +1256,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): @tf_export(v1=["nn.moments"]) +@dispatch.add_dispatch_support def moments( x, axes, @@ -1300,6 +1322,7 @@ def moments( @tf_export("nn.moments", v1=[]) +@dispatch.add_dispatch_support def moments_v2( x, axes, @@ -1336,6 +1359,7 @@ def moments_v2( @tf_export(v1=["nn.weighted_moments"]) +@dispatch.add_dispatch_support def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None, keepdims=None): """Returns the frequency-weighted mean and variance of `x`. @@ -1414,6 +1438,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None, @tf_export("nn.weighted_moments", v1=[]) +@dispatch.add_dispatch_support def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None): """Returns the frequency-weighted mean and variance of `x`. @@ -1438,6 +1463,7 @@ def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None): @tf_export("nn.batch_normalization") +@dispatch.add_dispatch_support def batch_normalization(x, mean, variance, @@ -1508,6 +1534,7 @@ def batch_normalization(x, @tf_export(v1=["nn.fused_batch_norm"]) +@dispatch.add_dispatch_support def fused_batch_norm( x, scale, @@ -1631,6 +1658,7 @@ def fused_batch_norm( @tf_export(v1=["nn.batch_norm_with_global_normalization"]) +@dispatch.add_dispatch_support def batch_norm_with_global_normalization(t=None, m=None, v=None, @@ -1685,6 +1713,7 @@ def batch_norm_with_global_normalization(t=None, # pylint: disable=redefined-builtin,line-too-long @tf_export("nn.batch_norm_with_global_normalization", v1=[]) +@dispatch.add_dispatch_support def batch_norm_with_global_normalization_v2(input, mean, variance, @@ -1934,6 +1963,7 @@ def _compute_sampled_logits(weights, @tf_export("nn.nce_loss", v1=[]) +@dispatch.add_dispatch_support def nce_loss_v2(weights, biases, labels, @@ -2038,6 +2068,7 @@ def nce_loss_v2(weights, @tf_export(v1=["nn.nce_loss"]) +@dispatch.add_dispatch_support def nce_loss(weights, biases, labels, @@ -2149,6 +2180,7 @@ def nce_loss(weights, @tf_export("nn.sampled_softmax_loss", v1=[]) +@dispatch.add_dispatch_support def sampled_softmax_loss_v2(weights, biases, labels, @@ -2240,6 +2272,7 @@ def sampled_softmax_loss_v2(weights, @tf_export(v1=["nn.sampled_softmax_loss"]) +@dispatch.add_dispatch_support def sampled_softmax_loss(weights, biases, labels, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 248c57c1ba5..e7955100b24 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -239,6 +239,7 @@ class _NonAtrousConvolution(object): @tf_export("nn.dilation2d", v1=[]) +@dispatch.add_dispatch_support def dilation2d_v2( input, # pylint: disable=redefined-builtin filters, # pylint: disable=redefined-builtin @@ -306,6 +307,7 @@ def dilation2d_v2( @tf_export(v1=["nn.dilation2d"]) +@dispatch.add_dispatch_support def dilation2d_v1( # pylint: disable=missing-docstring input, # pylint: disable=redefined-builtin filter=None, # pylint: disable=redefined-builtin @@ -324,6 +326,7 @@ dilation2d_v1.__doc__ = gen_nn_ops.dilation2d.__doc__ @tf_export("nn.with_space_to_batch") +@dispatch.add_dispatch_support def with_space_to_batch( input, # pylint: disable=redefined-builtin dilation_rate, @@ -772,6 +775,7 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate): @tf_export(v1=["nn.convolution"]) +@dispatch.add_dispatch_support def convolution( input, # pylint: disable=redefined-builtin filter, # pylint: 
disable=redefined-builtin @@ -907,7 +911,8 @@ def convolution( @tf_export("nn.convolution", v1=[]) -def convolution_v2( +@dispatch.add_dispatch_support +def convolution_v2( # pylint: disable=missing-docstring input, # pylint: disable=redefined-builtin filters, strides=None, @@ -1116,6 +1121,7 @@ class Convolution(object): @tf_export(v1=["nn.pool"]) +@dispatch.add_dispatch_support def pool( input, # pylint: disable=redefined-builtin window_shape, @@ -1290,6 +1296,7 @@ def pool( @tf_export("nn.pool", v1=[]) +@dispatch.add_dispatch_support def pool_v2( input, # pylint: disable=redefined-builtin window_shape, @@ -1389,6 +1396,7 @@ def pool_v2( @tf_export("nn.atrous_conv2d") +@dispatch.add_dispatch_support def atrous_conv2d(value, filters, rate, padding, name=None): """Atrous convolution (a.k.a. convolution with holes or dilated convolution). @@ -1576,6 +1584,7 @@ def convert_padding(padding): @tf_export(v1=["nn.conv1d"]) +@dispatch.add_dispatch_support @deprecation.deprecated_arg_values( None, "`NCHW` for data_format is deprecated, use `NCW` instead", @@ -1674,6 +1683,7 @@ def conv1d( @tf_export("nn.conv1d", v1=[]) +@dispatch.add_dispatch_support def conv1d_v2( input, # pylint: disable=redefined-builtin filters, @@ -1739,6 +1749,7 @@ def conv1d_v2( @tf_export("nn.conv1d_transpose") +@dispatch.add_dispatch_support def conv1d_transpose( input, # pylint: disable=redefined-builtin filters, @@ -1827,6 +1838,7 @@ def conv1d_transpose( @tf_export("nn.conv2d", v1=[]) +@dispatch.add_dispatch_support def conv2d_v2(input, # pylint: disable=redefined-builtin filters, strides, @@ -1927,6 +1939,7 @@ def conv2d_v2(input, # pylint: disable=redefined-builtin @tf_export(v1=["nn.conv2d"]) +@dispatch.add_dispatch_support def conv2d( # pylint: disable=redefined-builtin,dangerous-default-value input, filter=None, @@ -2024,6 +2037,7 @@ def conv2d( # pylint: disable=redefined-builtin,dangerous-default-value @tf_export(v1=["nn.conv2d_backprop_filter"]) +@dispatch.add_dispatch_support def conv2d_backprop_filter( # pylint: disable=redefined-builtin,dangerous-default-value input, filter_sizes, @@ -2084,6 +2098,7 @@ def conv2d_backprop_filter( # pylint: disable=redefined-builtin,dangerous-defau @tf_export(v1=["nn.conv2d_backprop_input"]) +@dispatch.add_dispatch_support def conv2d_backprop_input( # pylint: disable=redefined-builtin,dangerous-default-value input_sizes, filter=None, @@ -2148,6 +2163,7 @@ def conv2d_backprop_input( # pylint: disable=redefined-builtin,dangerous-defaul @tf_export(v1=["nn.conv2d_transpose"]) +@dispatch.add_dispatch_support def conv2d_transpose( value=None, filter=None, # pylint: disable=redefined-builtin @@ -2224,6 +2240,7 @@ def conv2d_transpose( @tf_export("nn.conv2d_transpose", v1=[]) +@dispatch.add_dispatch_support def conv2d_transpose_v2( input, # pylint: disable=redefined-builtin filters, # pylint: disable=redefined-builtin @@ -2301,6 +2318,7 @@ def conv2d_transpose_v2( @tf_export("nn.atrous_conv2d_transpose") +@dispatch.add_dispatch_support def atrous_conv2d_transpose(value, filters, output_shape, @@ -2459,6 +2477,7 @@ def atrous_conv2d_transpose(value, @tf_export(v1=["nn.depthwise_conv2d_native"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("nn.depthwise_conv2d_native") def depthwise_conv2d_native( # pylint: disable=redefined-builtin,dangerous-default-value input, @@ -2538,6 +2557,7 @@ def depthwise_conv2d_native( # pylint: disable=redefined-builtin,dangerous-defa "nn.depthwise_conv2d_native_backprop_input", "nn.depthwise_conv2d_backprop_input" ]) 
+@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("nn.depthwise_conv2d_native_backprop_input") def depthwise_conv2d_native_backprop_input( # pylint: disable=redefined-builtin,dangerous-default-value input_sizes, @@ -2607,6 +2627,7 @@ def depthwise_conv2d_native_backprop_input( # pylint: disable=redefined-builtin "nn.depthwise_conv2d_native_backprop_filter", "nn.depthwise_conv2d_backprop_filter" ]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("nn.depthwise_conv2d_native_backprop_filter") def depthwise_conv2d_native_backprop_filter( # pylint: disable=redefined-builtin,dangerous-default-value input, @@ -2672,6 +2693,7 @@ def depthwise_conv2d_native_backprop_filter( # pylint: disable=redefined-builti @tf_export("nn.conv3d", v1=[]) +@dispatch.add_dispatch_support def conv3d_v2(input, # pylint: disable=redefined-builtin,missing-docstring filters, strides, @@ -2691,6 +2713,7 @@ def conv3d_v2(input, # pylint: disable=redefined-builtin,missing-docstring @tf_export(v1=["nn.conv3d"]) +@dispatch.add_dispatch_support def conv3d_v1( # pylint: disable=missing-docstring,dangerous-default-value input, # pylint: disable=redefined-builtin filter=None, # pylint: disable=redefined-builtin @@ -2711,6 +2734,7 @@ conv3d_v1.__doc__ = gen_nn_ops.conv3d.__doc__ @tf_export(v1=["nn.conv3d_transpose"]) +@dispatch.add_dispatch_support def conv3d_transpose( value, filter=None, # pylint: disable=redefined-builtin @@ -2782,6 +2806,7 @@ def conv3d_transpose( @tf_export("nn.conv3d_transpose", v1=[]) +@dispatch.add_dispatch_support def conv3d_transpose_v2(input, # pylint: disable=redefined-builtin filters, output_shape, @@ -2861,6 +2886,7 @@ CONV_TRANSPOSE_OPS = ( @tf_export("nn.conv_transpose") +@dispatch.add_dispatch_support def conv_transpose(input, # pylint: disable=redefined-builtin filters, output_shape, @@ -2958,6 +2984,7 @@ _tf_deterministic_ops.value = None @tf_export("nn.bias_add") +@dispatch.add_dispatch_support def bias_add(value, bias, data_format=None, name=None): """Adds `bias` to `value`. @@ -3047,6 +3074,7 @@ def bias_add_v1(value, bias, name=None): @tf_export(v1=["nn.crelu"]) +@dispatch.add_dispatch_support def crelu(features, name=None, axis=-1): """Computes Concatenated ReLU. @@ -3079,12 +3107,14 @@ def crelu(features, name=None, axis=-1): @tf_export("nn.crelu", v1=[]) +@dispatch.add_dispatch_support def crelu_v2(features, axis=-1, name=None): return crelu(features, name=name, axis=axis) crelu_v2.__doc__ = crelu.__doc__ @tf_export("nn.relu6") +@dispatch.add_dispatch_support def relu6(features, name=None): """Computes Rectified Linear 6: `min(max(features, 0), 6)`. @@ -3107,6 +3137,7 @@ def relu6(features, name=None): @tf_export("nn.leaky_relu") +@dispatch.add_dispatch_support def leaky_relu(features, alpha=0.2, name=None): """Compute the Leaky ReLU activation function. @@ -3245,6 +3276,7 @@ def _softmax(logits, compute_op, dim=-1, name=None): @tf_export(v1=["nn.softmax", "math.softmax"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim") def softmax(logits, axis=None, name=None, dim=None): """Computes softmax activations. @@ -3289,6 +3321,7 @@ def softmax(logits, axis=None, name=None, dim=None): @tf_export("nn.softmax", "math.softmax", v1=[]) +@dispatch.add_dispatch_support def softmax_v2(logits, axis=None, name=None): """Computes softmax activations. 
@@ -3316,6 +3349,7 @@ def softmax_v2(logits, axis=None, name=None): @tf_export(v1=["nn.log_softmax", "math.log_softmax"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim") def log_softmax(logits, axis=None, name=None, dim=None): """Computes log softmax activations. @@ -3346,6 +3380,7 @@ def log_softmax(logits, axis=None, name=None, dim=None): @tf_export("nn.log_softmax", "math.log_softmax", v1=[]) +@dispatch.add_dispatch_support def log_softmax_v2(logits, axis=None, name=None): """Computes log softmax activations. @@ -3382,6 +3417,7 @@ def _ensure_xent_args(name, sentinel, labels, logits): @tf_export("nn.softmax_cross_entropy_with_logits", v1=[]) +@dispatch.add_dispatch_support def softmax_cross_entropy_with_logits_v2(labels, logits, axis=-1, name=None): """Computes softmax cross entropy between `logits` and `labels`. @@ -3444,6 +3480,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, axis=-1, name=None): @tf_export(v1=["nn.softmax_cross_entropy_with_logits_v2"]) +@dispatch.add_dispatch_support @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def softmax_cross_entropy_with_logits_v2_helper( labels, logits, axis=None, name=None, dim=None): @@ -3571,6 +3608,7 @@ See `tf.nn.softmax_cross_entropy_with_logits_v2`. @tf_export(v1=["nn.softmax_cross_entropy_with_logits"]) +@dispatch.add_dispatch_support @deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION) def softmax_cross_entropy_with_logits( _sentinel=None, # pylint: disable=invalid-name @@ -3639,6 +3677,7 @@ def softmax_cross_entropy_with_logits( @tf_export(v1=["nn.sparse_softmax_cross_entropy_with_logits"]) +@dispatch.add_dispatch_support def sparse_softmax_cross_entropy_with_logits( _sentinel=None, # pylint: disable=invalid-name labels=None, @@ -3764,6 +3803,7 @@ def sparse_softmax_cross_entropy_with_logits( @tf_export("nn.sparse_softmax_cross_entropy_with_logits", v1=[]) +@dispatch.add_dispatch_support def sparse_softmax_cross_entropy_with_logits_v2(labels, logits, name=None): """Computes sparse softmax cross entropy between `logits` and `labels`. @@ -3816,6 +3856,7 @@ def sparse_softmax_cross_entropy_with_logits_v2(labels, logits, name=None): @tf_export("nn.avg_pool", v1=["nn.avg_pool_v2"]) +@dispatch.add_dispatch_support def avg_pool_v2(input, ksize, strides, padding, data_format=None, name=None): # pylint: disable=redefined-builtin """Performs the avg pooling on the input. @@ -3878,6 +3919,7 @@ def avg_pool_v2(input, ksize, strides, padding, data_format=None, name=None): # @tf_export(v1=["nn.avg_pool", "nn.avg_pool2d"]) +@dispatch.add_dispatch_support def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None, input=None): # pylint: disable=redefined-builtin """Performs the average pooling on the input. @@ -3922,6 +3964,7 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", @tf_export("nn.avg_pool2d", v1=[]) +@dispatch.add_dispatch_support def avg_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None): # pylint: disable=redefined-builtin """Performs the average pooling on the input. @@ -3961,6 +4004,7 @@ def avg_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None): @tf_export("nn.avg_pool1d") +@dispatch.add_dispatch_support def avg_pool1d(input, ksize, strides, padding, data_format="NWC", name=None): # pylint: disable=redefined-builtin """Performs the average pooling on the input. 
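One detail worth noting about all of these stacks: decorators apply bottom-up, so the deprecation wrappers (deprecated_args, deprecated_endpoints, deprecated) wrap the op first, add_dispatch_support wraps that result, and tf_export, kept outermost, registers the fully wrapped callable as the public symbol, which is what leaves the exported tf.nn.* names dispatch-aware. The stand-in decorators below only record the wrapping order and are not TensorFlow code:

    # Invented stand-ins that tag each layer so the application order is
    # visible; the real stack is tf_export / add_dispatch_support / deprecation.
    def layer(tag):
      def deco(fn):
        def wrapper(*args, **kwargs):
          return fn(*args, **kwargs)
        wrapper.layers = [tag] + getattr(fn, "layers", [])
        return wrapper
      return deco


    @layer("tf_export")              # outermost: the object users import
    @layer("add_dispatch_support")   # wraps the deprecation-wrapped op
    @layer("deprecated_args")        # innermost wrapper around the op itself
    def fake_pool_op(value, name=None):
      return value


    print(fake_pool_op.layers)
    # ['tf_export', 'add_dispatch_support', 'deprecated_args']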
@@ -4006,6 +4050,7 @@ def avg_pool1d(input, ksize, strides, padding, data_format="NWC", name=None): # @tf_export("nn.avg_pool3d") +@dispatch.add_dispatch_support def avg_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None): # pylint: disable=redefined-builtin """Performs the average pooling on the input. @@ -4046,6 +4091,7 @@ def avg_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None): # pylint: disable=redefined-builtin @tf_export("nn.max_pool", v1=["nn.max_pool_v2"]) +@dispatch.add_dispatch_support def max_pool_v2(input, ksize, strides, padding, data_format=None, name=None): """Performs the max pooling on the input. @@ -4106,6 +4152,7 @@ def max_pool_v2(input, ksize, strides, padding, data_format=None, name=None): @tf_export(v1=["nn.max_pool"]) +@dispatch.add_dispatch_support def max_pool(value, ksize, strides, @@ -4155,6 +4202,7 @@ def max_pool(value, # pylint: disable=redefined-builtin @tf_export("nn.max_pool1d") +@dispatch.add_dispatch_support def max_pool1d(input, ksize, strides, padding, data_format="NWC", name=None): """Performs the max pooling on the input. @@ -4199,6 +4247,7 @@ def max_pool1d(input, ksize, strides, padding, data_format="NWC", name=None): # pylint: disable=redefined-builtin @tf_export("nn.max_pool2d") +@dispatch.add_dispatch_support def max_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None): """Performs the max pooling on the input. @@ -4237,6 +4286,7 @@ def max_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None): # pylint: disable=redefined-builtin @tf_export("nn.max_pool3d") +@dispatch.add_dispatch_support def max_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None): """Performs the max pooling on the input. @@ -4279,6 +4329,7 @@ def max_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None): @tf_export("nn.max_pool_with_argmax", v1=[]) +@dispatch.add_dispatch_support def max_pool_with_argmax_v2( input, # pylint: disable=redefined-builtin ksize, @@ -4348,6 +4399,7 @@ def max_pool_with_argmax_v2( @tf_export(v1=["nn.max_pool_with_argmax"]) +@dispatch.add_dispatch_support def max_pool_with_argmax_v1( # pylint: disable=missing-docstring,invalid-name input, # pylint: disable=redefined-builtin ksize, @@ -4442,6 +4494,7 @@ def _calc_bias_add_flops(graph, node): @tf_export(v1=["nn.xw_plus_b"]) +@dispatch.add_dispatch_support def xw_plus_b(x, weights, biases, name=None): # pylint: disable=invalid-name """Computes matmul(x, weights) + biases. @@ -4691,6 +4744,7 @@ def dropout_v2(x, rate, noise_shape=None, seed=None, name=None): @tf_export("math.top_k", "nn.top_k") +@dispatch.add_dispatch_support def top_k(input, k=1, sorted=True, name=None): # pylint: disable=redefined-builtin """Finds values and indices of the `k` largest entries for the last dimension. @@ -4751,6 +4805,7 @@ def nth_element(input, n, reverse=False, name=None): # pylint: disable=redefine @tf_export(v1=["nn.fractional_max_pool"]) +@dispatch.add_dispatch_support @deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` " "args are deprecated. 
Use fractional_max_pool_v2.") def fractional_max_pool(value, @@ -4837,6 +4892,7 @@ def fractional_max_pool(value, @tf_export("nn.fractional_max_pool", v1=[]) +@dispatch.add_dispatch_support def fractional_max_pool_v2(value, pooling_ratio, pseudo_random=False, @@ -4922,6 +4978,7 @@ def fractional_max_pool_v2(value, @tf_export(v1=["nn.fractional_avg_pool"]) +@dispatch.add_dispatch_support @deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` " "args are deprecated. Use fractional_avg_pool_v2.") def fractional_avg_pool(value, @@ -4987,6 +5044,7 @@ def fractional_avg_pool(value, @tf_export("nn.fractional_avg_pool", v1=[]) +@dispatch.add_dispatch_support def fractional_avg_pool_v2(value, pooling_ratio, pseudo_random=False, @@ -5065,6 +5123,7 @@ def _calc_dilation2d_flops(graph, node): @tf_export(v1=["nn.erosion2d"]) +@dispatch.add_dispatch_support def erosion2d(value, kernel, strides, rates, padding, name=None): """Computes the grayscale erosion of 4-D `value` and 3-D `kernel` tensors. @@ -5124,6 +5183,7 @@ def erosion2d(value, kernel, strides, rates, padding, name=None): @tf_export("nn.erosion2d", v1=[]) +@dispatch.add_dispatch_support def erosion2d_v2(value, filters, strides, @@ -5193,6 +5253,7 @@ def erosion2d_v2(value, @tf_export(v1=["math.in_top_k", "nn.in_top_k"]) +@dispatch.add_dispatch_support def in_top_k(predictions, targets, k, name=None): r"""Says whether the targets are in the top `K` predictions. @@ -5227,6 +5288,7 @@ def in_top_k(predictions, targets, k, name=None): @tf_export("math.in_top_k", "nn.in_top_k", v1=[]) +@dispatch.add_dispatch_support def in_top_k_v2(targets, predictions, k, name=None): return in_top_k(predictions, targets, k, name) @@ -5234,7 +5296,11 @@ def in_top_k_v2(targets, predictions, k, name=None): in_top_k_v2.__doc__ = in_top_k.__doc__ -tf_export(v1=["nn.quantized_avg_pool"])(gen_nn_ops.quantized_avg_pool) -tf_export(v1=["nn.quantized_conv2d"])(gen_nn_ops.quantized_conv2d) -tf_export(v1=["nn.quantized_relu_x"])(gen_nn_ops.quantized_relu_x) -tf_export(v1=["nn.quantized_max_pool"])(gen_nn_ops.quantized_max_pool) +tf_export(v1=["nn.quantized_avg_pool"])( + dispatch.add_dispatch_support(gen_nn_ops.quantized_avg_pool)) +tf_export(v1=["nn.quantized_conv2d"])( + dispatch.add_dispatch_support(gen_nn_ops.quantized_conv2d)) +tf_export(v1=["nn.quantized_relu_x"])( + dispatch.add_dispatch_support(gen_nn_ops.quantized_relu_x)) +tf_export(v1=["nn.quantized_max_pool"])( + dispatch.add_dispatch_support(gen_nn_ops.quantized_max_pool)) diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py index 9f9e7229442..81a532bb150 100644 --- a/tensorflow/python/ops/numerics.py +++ b/tensorflow/python/ops/numerics.py @@ -25,10 +25,12 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export(v1=["debugging.assert_all_finite", "verify_tensor_all_finite"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("verify_tensor_all_finite") def verify_tensor_all_finite(t=None, msg=None, name=None, x=None, message=None): """Assert that the tensor does not contain any NaN's or Inf's. 
@@ -50,6 +52,7 @@ def verify_tensor_all_finite(t=None, msg=None, name=None, x=None, message=None): @tf_export("debugging.assert_all_finite", v1=[]) +@dispatch.add_dispatch_support def verify_tensor_all_finite_v2(x, message, name=None): """Assert that the tensor does not contain any NaN's or Inf's. diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index 8e518e913be..edcae89aada 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -30,6 +30,7 @@ from tensorflow.python.ops import parsing_config from tensorflow.python.ops.gen_parsing_ops import * # pylint: enable=wildcard-import,undefined-variable from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -77,6 +78,7 @@ def _prepend_none_dimension(features): @tf_export("io.parse_example", v1=[]) +@dispatch.add_dispatch_support def parse_example_v2(serialized, features, example_names=None, name=None): # pylint: disable=line-too-long """Parses `Example` protos into a `dict` of tensors. @@ -314,6 +316,7 @@ def parse_example_v2(serialized, features, example_names=None, name=None): @tf_export(v1=["io.parse_example", "parse_example"]) +@dispatch.add_dispatch_support def parse_example(serialized, features, name=None, example_names=None): return parse_example_v2(serialized, features, example_names, name) @@ -373,6 +376,7 @@ def _parse_example_raw(serialized, names, params, name): @tf_export(v1=["io.parse_single_example", "parse_single_example"]) +@dispatch.add_dispatch_support def parse_single_example(serialized, features, name=None, example_names=None): """Parses a single `Example` proto. @@ -407,6 +411,7 @@ def parse_single_example(serialized, features, name=None, example_names=None): @tf_export("io.parse_single_example", v1=[]) +@dispatch.add_dispatch_support def parse_single_example_v2( serialized, features, example_names=None, name=None ): @@ -448,6 +453,7 @@ def parse_single_example_v2( @tf_export("io.parse_sequence_example") +@dispatch.add_dispatch_support def parse_sequence_example(serialized, context_features=None, sequence_features=None, @@ -692,6 +698,7 @@ def _parse_sequence_example_raw(serialized, @tf_export("io.parse_single_sequence_example", v1=["io.parse_single_sequence_example", "parse_single_sequence_example"]) +@dispatch.add_dispatch_support def parse_single_sequence_example( serialized, context_features=None, sequence_features=None, example_name=None, name=None): @@ -835,6 +842,7 @@ def _parse_single_sequence_example_raw(serialized, @tf_export("io.decode_raw", v1=[]) +@dispatch.add_dispatch_support def decode_raw(input_bytes, out_type, little_endian=True, @@ -877,6 +885,7 @@ def decode_raw(input_bytes, @tf_export(v1=["decode_raw", "io.decode_raw"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "bytes is deprecated, use input_bytes instead", "bytes") @@ -921,6 +930,7 @@ def decode_raw_v1( # Swap `name` and `na_value` for backward compatibility. 
@tf_export(v1=["io.decode_csv", "decode_csv"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("decode_csv") def decode_csv(records, record_defaults, @@ -970,6 +980,7 @@ def decode_csv(records, @tf_export("io.decode_csv", v1=[]) +@dispatch.add_dispatch_support def decode_csv_v2(records, record_defaults, field_delim=",", diff --git a/tensorflow/python/ops/proto_ops.py b/tensorflow/python/ops/proto_ops.py index 1f7300dbef9..0e19aad584c 100644 --- a/tensorflow/python/ops/proto_ops.py +++ b/tensorflow/python/ops/proto_ops.py @@ -22,10 +22,11 @@ from __future__ import print_function from tensorflow.python.framework import ops from tensorflow.python.ops.gen_decode_proto_ops import decode_proto_v2 as decode_proto from tensorflow.python.ops.gen_encode_proto_ops import encode_proto +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export -tf_export("io.decode_proto")(decode_proto) -tf_export("io.encode_proto")(encode_proto) +tf_export("io.decode_proto")(dispatch.add_dispatch_support(decode_proto)) +tf_export("io.encode_proto")(dispatch.add_dispatch_support(encode_proto)) ops.NotDifferentiable("DecodeProtoV2") ops.NotDifferentiable("EncodeProto") diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py index 7f971cd558f..782902f2f71 100644 --- a/tensorflow/python/ops/ragged/ragged_array_ops.py +++ b/tensorflow/python/ops/ragged/ragged_array_ops.py @@ -32,6 +32,7 @@ from tensorflow.python.ops.ragged import ragged_math_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_util from tensorflow.python.ops.ragged import segment_id_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export #=============================================================================== @@ -40,6 +41,7 @@ from tensorflow.python.util.tf_export import tf_export @tf_export('ragged.boolean_mask') +@dispatch.add_dispatch_support def boolean_mask(data, mask, name=None): """Applies a boolean mask to `data` without flattening the mask dimensions. @@ -538,6 +540,7 @@ def ragged_one_hot(indices, # ragged.stack_dynamic_partitions #=============================================================================== @tf_export('ragged.stack_dynamic_partitions') +@dispatch.add_dispatch_support def stack_dynamic_partitions(data, partitions, num_partitions, name=None): """Stacks dynamic partitions of a Tensor or RaggedTensor. @@ -699,6 +702,7 @@ def reverse(tensor, axis, name=None): @tf_export('ragged.cross') +@dispatch.add_dispatch_support def cross(inputs, name=None): """Generates feature cross from a list of tensors. @@ -725,6 +729,7 @@ def cross(inputs, name=None): @tf_export('ragged.cross_hashed') +@dispatch.add_dispatch_support def cross_hashed(inputs, num_buckets=0, hash_key=None, name=None): """Generates hashed feature cross from a list of tensors. 
diff --git a/tensorflow/python/ops/ragged/ragged_concat_ops.py b/tensorflow/python/ops/ragged/ragged_concat_ops.py index 9bcb1aa4765..cd710f449a6 100644 --- a/tensorflow/python/ops/ragged/ragged_concat_ops.py +++ b/tensorflow/python/ops/ragged/ragged_concat_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.ops.ragged import ragged_array_ops from tensorflow.python.ops.ragged import ragged_gather_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_util +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -71,6 +72,7 @@ def concat(values, axis, name=None): @tf_export('ragged.stack') +@dispatch.add_dispatch_support def stack(values, axis=0, name=None): """Stacks a list of rank-`R` tensors into one rank-`(R+1)` `RaggedTensor`. diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py index aa148ae7fe8..3a6f6231149 100644 --- a/tensorflow/python/ops/ragged/ragged_factory_ops.py +++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_tensor_value +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -34,6 +35,7 @@ from tensorflow.python.util.tf_export import tf_export # Op to construct a constant RaggedTensor from a nested Python list. #=============================================================================== @tf_export("ragged.constant") +@dispatch.add_dispatch_support def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None, row_splits_dtype=dtypes.int64): """Constructs a constant RaggedTensor from a nested Python list. @@ -86,6 +88,7 @@ def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, @tf_export(v1=["ragged.constant_value"]) +@dispatch.add_dispatch_support def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None, row_splits_dtype="int64"): """Constructs a RaggedTensorValue from a nested Python list. @@ -311,6 +314,7 @@ def _default_inner_shape_for_pylist(pylist, ragged_rank): @tf_export(v1=["ragged.placeholder"]) +@dispatch.add_dispatch_support def placeholder(dtype, ragged_rank, value_shape=None, name=None): """Creates a placeholder for a `tf.RaggedTensor` that will always be fed. diff --git a/tensorflow/python/ops/ragged/ragged_functional_ops.py b/tensorflow/python/ops/ragged/ragged_functional_ops.py index cc45f729e58..00b5ced6170 100644 --- a/tensorflow/python/ops/ragged/ragged_functional_ops.py +++ b/tensorflow/python/ops/ragged/ragged_functional_ops.py @@ -24,10 +24,12 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_config from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_util +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export("ragged.map_flat_values") +@dispatch.add_dispatch_support def map_flat_values(op, *args, **kwargs): """Applies `op` to the values of one or more RaggedTensors. 
diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py index 5483cda571c..73a53583ada 100644 --- a/tensorflow/python/ops/ragged/ragged_math_ops.py +++ b/tensorflow/python/ops/ragged/ragged_math_ops.py @@ -30,6 +30,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import segment_id_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -38,6 +39,7 @@ from tensorflow.python.util.tf_export import tf_export #=============================================================================== # pylint: disable=redefined-builtin @tf_export('ragged.range') +@dispatch.add_dispatch_support def range(starts, limits=None, deltas=1, dtype=None, name=None, row_splits_dtype=dtypes.int64): """Returns a `RaggedTensor` containing the specified sequences of numbers. diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py index d5f21832044..0d9c4d506f3 100755 --- a/tensorflow/python/ops/ragged/ragged_string_ops.py +++ b/tensorflow/python/ops/ragged/ragged_string_ops.py @@ -29,10 +29,12 @@ from tensorflow.python.ops.ragged import ragged_math_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import compat as util_compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export("strings.bytes_split") +@dispatch.add_dispatch_support def string_bytes_split(input, name=None): # pylint: disable=redefined-builtin """Split string elements of `input` into bytes. @@ -80,6 +82,7 @@ def string_bytes_split(input, name=None): # pylint: disable=redefined-builtin # pylint: disable=redefined-builtin @tf_export("strings.unicode_encode") +@dispatch.add_dispatch_support def unicode_encode(input, output_encoding, errors="replace", @@ -177,6 +180,7 @@ def unicode_encode(input, # pylint: disable=redefined-builtin @tf_export("strings.unicode_decode") +@dispatch.add_dispatch_support def unicode_decode(input, input_encoding, errors="replace", @@ -222,6 +226,7 @@ def unicode_decode(input, @tf_export("strings.unicode_decode_with_offsets") +@dispatch.add_dispatch_support def unicode_decode_with_offsets(input, input_encoding, errors="replace", @@ -283,6 +288,7 @@ def unicode_decode_with_offsets(input, @tf_export("strings.unicode_split") +@dispatch.add_dispatch_support def unicode_split(input, input_encoding, errors="replace", @@ -330,6 +336,7 @@ def unicode_split(input, @tf_export("strings.unicode_split_with_offsets") +@dispatch.add_dispatch_support def unicode_split_with_offsets(input, input_encoding, errors="replace", @@ -453,6 +460,7 @@ def _unicode_decode(input, input_encoding, errors, replacement_char, @tf_export("strings.split", v1=[]) +@dispatch.add_dispatch_support def string_split_v2(input, sep=None, maxsplit=-1, name=None): # pylint: disable=redefined-builtin """Split elements of `input` based on `sep` into a `RaggedTensor`. 
@@ -514,6 +522,7 @@ def string_split_v2(input, sep=None, maxsplit=-1, name=None): # pylint: disable @tf_export(v1=["string_split"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "delimiter is deprecated, please use sep instead.", "delimiter") @@ -578,6 +587,7 @@ def string_split(source, sep=None, skip_empty=True, delimiter=None, # In TensorFlow 1.x, "tf.strings.split" uses the new signature (with maxsplit), # but we need to add the result_type argument. @tf_export(v1=["strings.split"]) +@dispatch.add_dispatch_support def strings_split_v1(input=None, sep=None, maxsplit=-1, # pylint: disable=redefined-builtin result_type="SparseTensor", source=None, name=None): """Split elements of `input` based on `sep`. @@ -651,6 +661,7 @@ def reduce_join(inputs, axis=None, keepdims=None, separator="", name=None): @tf_export("strings.ngrams") +@dispatch.add_dispatch_support def ngrams(data, ngram_width, separator=" ", diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py index 5329860743e..0d4a58bfea4 100644 --- a/tensorflow/python/ops/ragged/segment_id_ops.py +++ b/tensorflow/python/ops/ragged/segment_id_ops.py @@ -25,12 +25,14 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_util +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export # For background on "segments" and "segment ids", see: # https://www.tensorflow.org/api_docs/python/tf/math#Segmentation @tf_export("ragged.row_splits_to_segment_ids") +@dispatch.add_dispatch_support def row_splits_to_segment_ids(splits, name=None, out_type=None): """Generates the segmentation corresponding to a RaggedTensor `row_splits`. @@ -74,6 +76,7 @@ def row_splits_to_segment_ids(splits, name=None, out_type=None): # For background on "segments" and "segment ids", see: # https://www.tensorflow.org/api_docs/python/tf/math#Segmentation @tf_export("ragged.segment_ids_to_row_splits") +@dispatch.add_dispatch_support def segment_ids_to_row_splits(segment_ids, num_segments=None, out_type=None, name=None): """Generates the RaggedTensor `row_splits` corresponding to a segmentation. 
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py index 83cb7fcc92a..1af91ed0dd3 100644 --- a/tensorflow/python/ops/random_ops.py +++ b/tensorflow/python/ops/random_ops.py @@ -36,10 +36,12 @@ from tensorflow.python.ops.gen_random_ops import * # pylint: enable=wildcard-import from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export("random.normal", v1=["random.normal", "random_normal"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_normal") def random_normal(shape, mean=0.0, @@ -155,6 +157,7 @@ def parameterized_truncated_normal(shape, @tf_export("random.truncated_normal", v1=["random.truncated_normal", "truncated_normal"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("truncated_normal") def truncated_normal(shape, mean=0.0, @@ -202,6 +205,7 @@ ops.NotDifferentiable("TruncatedNormal") @tf_export("random.uniform", v1=["random.uniform", "random_uniform"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_uniform") def random_uniform(shape, minval=0, @@ -313,6 +317,7 @@ ops.NotDifferentiable("RandomUniform") @tf_export("random.shuffle", v1=["random.shuffle", "random_shuffle"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_shuffle") def random_shuffle(value, seed=None, name=None): """Randomly shuffles a tensor along its first dimension. @@ -345,6 +350,7 @@ def random_shuffle(value, seed=None, name=None): @tf_export("image.random_crop", v1=["image.random_crop", "random_crop"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_crop") def random_crop(value, size, seed=None, name=None): """Randomly crops a tensor to a given size. @@ -389,6 +395,7 @@ def random_crop(value, size, seed=None, name=None): @tf_export(v1=["random.multinomial", "multinomial"]) +@dispatch.add_dispatch_support @deprecation.deprecated( date=None, instructions="Use `tf.random.categorical` instead.") def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None): @@ -468,6 +475,7 @@ def _maybe_set_static_shape_helper(tensor, shape, postfix_tensor): @tf_export("random.gamma", v1=["random.gamma", "random_gamma"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_gamma") def random_gamma(shape, alpha, @@ -561,6 +569,7 @@ def random_gamma(shape, @tf_export(v1=["random.poisson", "random_poisson"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_poisson") def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None): """Draws `shape` samples from each of the given Poisson distribution(s). @@ -601,6 +610,7 @@ def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None): @tf_export("random.poisson", v1=[]) +@dispatch.add_dispatch_support def random_poisson_v2(shape, lam, dtype=dtypes.float32, seed=None, name=None): """Draws `shape` samples from each of the given Poisson distribution(s). 
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py index b87e5d65a37..6c11ebefb1c 100644 --- a/tensorflow/python/ops/rnn.py +++ b/tensorflow/python/ops/rnn.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import rnn_cell_impl from tensorflow.python.ops import tensor_array_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -342,6 +343,7 @@ def _reverse_seq(input_seq, lengths): "keras.layers.RNN(cell))`, which is equivalent to " "this API") @tf_export(v1=["nn.bidirectional_dynamic_rnn"]) +@dispatch.add_dispatch_support def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, @@ -499,6 +501,7 @@ def bidirectional_dynamic_rnn(cell_fw, None, "Please use `keras.layers.RNN(cell)`, which is equivalent to this API") @tf_export(v1=["nn.dynamic_rnn"]) +@dispatch.add_dispatch_support def dynamic_rnn(cell, inputs, sequence_length=None, @@ -912,6 +915,7 @@ def _dynamic_rnn_loop(cell, @tf_export(v1=["nn.raw_rnn"]) +@dispatch.add_dispatch_support def raw_rnn(cell, loop_fn, parallel_iterations=None, @@ -1238,6 +1242,7 @@ def raw_rnn(cell, "Please use `keras.layers.RNN(cell, unroll=True)`, " "which is equivalent to this API") @tf_export(v1=["nn.static_rnn"]) +@dispatch.add_dispatch_support def static_rnn(cell, inputs, initial_state=None, @@ -1416,6 +1421,7 @@ def static_rnn(cell, "Please use `keras.layers.RNN(cell, stateful=True)`, " "which is equivalent to this API") @tf_export(v1=["nn.static_state_saving_rnn"]) +@dispatch.add_dispatch_support def static_state_saving_rnn(cell, inputs, state_saver, @@ -1510,6 +1516,7 @@ def static_state_saving_rnn(cell, "keras.layers.RNN(cell, unroll=True))`, which is " "equivalent to this API") @tf_export(v1=["nn.static_bidirectional_rnn"]) +@dispatch.add_dispatch_support def static_bidirectional_rnn(cell_fw, cell_bw, inputs, diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index bee85dc4a5b..7ee5a16ca9a 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -39,6 +39,7 @@ from tensorflow.python.ops import gen_script_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import lazy_loader from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect @@ -370,6 +371,7 @@ def _EagerPyFuncGrad(op, *dy): @tf_export("py_function") +@dispatch.add_dispatch_support def eager_py_func(func, inp, Tout, name=None): """Wraps a python function into a TensorFlow op that executes it eagerly. @@ -551,6 +553,7 @@ def py_func_common(func, inp, Tout, stateful=True, name=None): stateful argument making all functions stateful. """) @tf_export(v1=["py_func"]) +@dispatch.add_dispatch_support def py_func(func, inp, Tout, stateful=True, name=None): return py_func_common(func, inp, Tout, stateful, name=name) @@ -559,6 +562,7 @@ py_func.__doc__ = "%s" % py_func_common.__doc__ @tf_export("numpy_function") +@dispatch.add_dispatch_support def numpy_function(func, inp, Tout, name=None): """Wraps a python function and uses it as a TensorFlow op. 
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py index 988d437bae8..0b65033ce8c 100644 --- a/tensorflow/python/ops/sets_impl.py +++ b/tensorflow/python/ops/sets_impl.py @@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import gen_set_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -32,6 +33,7 @@ _VALID_DTYPES = set([ @tf_export("sets.size", v1=["sets.size", "sets.set_size"]) +@dispatch.add_dispatch_support def set_size(a, validate_indices=True): """Compute number of unique elements along last dimension of `a`. @@ -135,6 +137,7 @@ def _set_operation(a, b, set_operation, validate_indices=True): @tf_export( "sets.intersection", v1=["sets.intersection", "sets.set_intersection"]) +@dispatch.add_dispatch_support def set_intersection(a, b, validate_indices=True): """Compute set intersection of elements in last dimension of `a` and `b`. @@ -205,6 +208,7 @@ def set_intersection(a, b, validate_indices=True): @tf_export( "sets.difference", v1=["sets.difference", "sets.set_difference"]) +@dispatch.add_dispatch_support def set_difference(a, b, aminusb=True, validate_indices=True): """Compute set difference of elements in last dimension of `a` and `b`. @@ -286,6 +290,7 @@ def set_difference(a, b, aminusb=True, validate_indices=True): @tf_export( "sets.union", v1=["sets.union", "sets.set_union"]) +@dispatch.add_dispatch_support def set_union(a, b, validate_indices=True): """Compute set union of elements in last dimension of `a` and `b`. diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py index d628e54cdf9..18730743941 100644 --- a/tensorflow/python/ops/signal/dct_ops.py +++ b/tensorflow/python/ops/signal/dct_ops.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops as _array_ops from tensorflow.python.ops import math_ops as _math_ops from tensorflow.python.ops.signal import fft_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -50,6 +51,7 @@ def _validate_dct_arguments(input_tensor, dct_type, n, axis, norm): # TODO(rjryan): Implement `axis` parameter. @tf_export("signal.dct", v1=["signal.dct", "spectral.dct"]) +@dispatch.add_dispatch_support def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`. @@ -181,6 +183,7 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disabl # TODO(rjryan): Implement `n` and `axis` parameters. @tf_export("signal.idct", v1=["signal.idct", "spectral.idct"]) +@dispatch.add_dispatch_support def idct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`. 
diff --git a/tensorflow/python/ops/signal/fft_ops.py b/tensorflow/python/ops/signal/fft_ops.py index 6e9e8ef80e4..86a94cf5de7 100644 --- a/tensorflow/python/ops/signal/fft_ops.py +++ b/tensorflow/python/ops/signal/fft_ops.py @@ -26,6 +26,7 @@ from tensorflow.python.ops import array_ops as _array_ops from tensorflow.python.ops import gen_spectral_ops from tensorflow.python.ops import manip_ops from tensorflow.python.ops import math_ops as _math_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -181,17 +182,23 @@ ifft2d = gen_spectral_ops.ifft2d fft3d = gen_spectral_ops.fft3d ifft3d = gen_spectral_ops.ifft3d rfft = _rfft_wrapper(gen_spectral_ops.rfft, 1, "rfft") -tf_export("signal.rfft", v1=["signal.rfft", "spectral.rfft"])(rfft) +tf_export("signal.rfft", v1=["signal.rfft", "spectral.rfft"])( + dispatch.add_dispatch_support(rfft)) irfft = _irfft_wrapper(gen_spectral_ops.irfft, 1, "irfft") -tf_export("signal.irfft", v1=["signal.irfft", "spectral.irfft"])(irfft) +tf_export("signal.irfft", v1=["signal.irfft", "spectral.irfft"])( + dispatch.add_dispatch_support(irfft)) rfft2d = _rfft_wrapper(gen_spectral_ops.rfft2d, 2, "rfft2d") -tf_export("signal.rfft2d", v1=["signal.rfft2d", "spectral.rfft2d"])(rfft2d) +tf_export("signal.rfft2d", v1=["signal.rfft2d", "spectral.rfft2d"])( + dispatch.add_dispatch_support(rfft2d)) irfft2d = _irfft_wrapper(gen_spectral_ops.irfft2d, 2, "irfft2d") -tf_export("signal.irfft2d", v1=["signal.irfft2d", "spectral.irfft2d"])(irfft2d) +tf_export("signal.irfft2d", v1=["signal.irfft2d", "spectral.irfft2d"])( + dispatch.add_dispatch_support(irfft2d)) rfft3d = _rfft_wrapper(gen_spectral_ops.rfft3d, 3, "rfft3d") -tf_export("signal.rfft3d", v1=["signal.rfft3d", "spectral.rfft3d"])(rfft3d) +tf_export("signal.rfft3d", v1=["signal.rfft3d", "spectral.rfft3d"])( + dispatch.add_dispatch_support(rfft3d)) irfft3d = _irfft_wrapper(gen_spectral_ops.irfft3d, 3, "irfft3d") -tf_export("signal.irfft3d", v1=["signal.irfft3d", "spectral.irfft3d"])(irfft3d) +tf_export("signal.irfft3d", v1=["signal.irfft3d", "spectral.irfft3d"])( + dispatch.add_dispatch_support(irfft3d)) def _fft_size_for_grad(grad, rank): @@ -363,6 +370,7 @@ def _irfft_grad_helper(rank, rfft_fn): @tf_export("signal.fftshift") +@dispatch.add_dispatch_support def fftshift(x, axes=None, name=None): """Shift the zero-frequency component to the center of the spectrum. @@ -407,6 +415,7 @@ def fftshift(x, axes=None, name=None): @tf_export("signal.ifftshift") +@dispatch.add_dispatch_support def ifftshift(x, axes=None, name=None): """The inverse of fftshift. 
diff --git a/tensorflow/python/ops/signal/mel_ops.py b/tensorflow/python/ops/signal/mel_ops.py index b95876bc977..cf0bed9ef1b 100644 --- a/tensorflow/python/ops/signal/mel_ops.py +++ b/tensorflow/python/ops/signal/mel_ops.py @@ -24,6 +24,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.signal import shape_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -90,6 +91,7 @@ def _validate_arguments(num_mel_bins, sample_rate, @tf_export('signal.linear_to_mel_weight_matrix') +@dispatch.add_dispatch_support def linear_to_mel_weight_matrix(num_mel_bins=20, num_spectrogram_bins=129, sample_rate=8000, diff --git a/tensorflow/python/ops/signal/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py index 56cbff40bca..948b78a858e 100644 --- a/tensorflow/python/ops/signal/mfcc_ops.py +++ b/tensorflow/python/ops/signal/mfcc_ops.py @@ -22,10 +22,12 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.signal import dct_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('signal.mfccs_from_log_mel_spectrograms') +@dispatch.add_dispatch_support def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None): """Computes [MFCCs][mfcc] of `log_mel_spectrograms`. diff --git a/tensorflow/python/ops/signal/reconstruction_ops.py b/tensorflow/python/ops/signal/reconstruction_ops.py index fcdcf592f14..e340e97b3e5 100644 --- a/tensorflow/python/ops/signal/reconstruction_ops.py +++ b/tensorflow/python/ops/signal/reconstruction_ops.py @@ -23,10 +23,12 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export("signal.overlap_and_add") +@dispatch.add_dispatch_support def overlap_and_add(signal, frame_step, name=None): """Reconstructs a signal from a framed representation. diff --git a/tensorflow/python/ops/signal/shape_ops.py b/tensorflow/python/ops/signal/shape_ops.py index 1c95873fc3d..7a3acce3475 100644 --- a/tensorflow/python/ops/signal/shape_ops.py +++ b/tensorflow/python/ops/signal/shape_ops.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.signal import util_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -55,6 +56,7 @@ def _infer_frame_shape(signal, frame_length, frame_step, pad_end, axis): @tf_export("signal.frame") +@dispatch.add_dispatch_support def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1, name=None): """Expands `signal`'s `axis` dimension into frames of `frame_length`. 
diff --git a/tensorflow/python/ops/signal/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py index d096e53e8f8..7c4c5542b84 100644 --- a/tensorflow/python/ops/signal/spectral_ops.py +++ b/tensorflow/python/ops/signal/spectral_ops.py @@ -31,10 +31,12 @@ from tensorflow.python.ops.signal import fft_ops from tensorflow.python.ops.signal import reconstruction_ops from tensorflow.python.ops.signal import shape_ops from tensorflow.python.ops.signal import window_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('signal.stft') +@dispatch.add_dispatch_support def stft(signals, frame_length, frame_step, fft_length=None, window_fn=window_ops.hann_window, pad_end=False, name=None): @@ -95,6 +97,7 @@ def stft(signals, frame_length, frame_step, fft_length=None, @tf_export('signal.inverse_stft_window_fn') +@dispatch.add_dispatch_support def inverse_stft_window_fn(frame_step, forward_window_fn=window_ops.hann_window, name=None): @@ -156,6 +159,7 @@ def inverse_stft_window_fn(frame_step, @tf_export('signal.inverse_stft') +@dispatch.add_dispatch_support def inverse_stft(stfts, frame_length, frame_step, @@ -291,6 +295,7 @@ def _enclosing_power_of_two(value): @tf_export('signal.mdct') +@dispatch.add_dispatch_support def mdct(signals, frame_length, window_fn=window_ops.vorbis_window, pad_end=False, norm=None, name=None): """Computes the [Modified Discrete Cosine Transform][mdct] of `signals`. @@ -366,6 +371,7 @@ def mdct(signals, frame_length, window_fn=window_ops.vorbis_window, @tf_export('signal.inverse_mdct') +@dispatch.add_dispatch_support def inverse_mdct(mdcts, window_fn=window_ops.vorbis_window, norm=None, diff --git a/tensorflow/python/ops/signal/window_ops.py b/tensorflow/python/ops/signal/window_ops.py index bb10bdf4be5..eb33c3f3b58 100644 --- a/tensorflow/python/ops/signal/window_ops.py +++ b/tensorflow/python/ops/signal/window_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -52,6 +53,7 @@ def _check_params(window_length, dtype): @tf_export('signal.kaiser_window') +@dispatch.add_dispatch_support def kaiser_window(window_length, beta=12., dtype=dtypes.float32, name=None): """Generate a [Kaiser window][kaiser]. @@ -91,6 +93,7 @@ def kaiser_window(window_length, beta=12., dtype=dtypes.float32, name=None): @tf_export('signal.kaiser_bessel_derived_window') +@dispatch.add_dispatch_support def kaiser_bessel_derived_window(window_length, beta=12., dtype=dtypes.float32, name=None): """Generate a [Kaiser Bessel derived window][kbd]. @@ -118,6 +121,7 @@ def kaiser_bessel_derived_window(window_length, beta=12., @tf_export('signal.vorbis_window') +@dispatch.add_dispatch_support def vorbis_window(window_length, dtype=dtypes.float32, name=None): """Generate a [Vorbis power complementary window][vorbis]. @@ -142,6 +146,7 @@ def vorbis_window(window_length, dtype=dtypes.float32, name=None): @tf_export('signal.hann_window') +@dispatch.add_dispatch_support def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): """Generate a [Hann window][hann]. 
@@ -167,6 +172,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): @tf_export('signal.hamming_window') +@dispatch.add_dispatch_support def hamming_window(window_length, periodic=True, dtype=dtypes.float32, name=None): """Generate a [Hamming][hamming] window. diff --git a/tensorflow/python/ops/sort_ops.py b/tensorflow/python/ops/sort_ops.py index 92435e6bdef..4e66a80bc01 100644 --- a/tensorflow/python/ops/sort_ops.py +++ b/tensorflow/python/ops/sort_ops.py @@ -30,10 +30,12 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('sort') +@dispatch.add_dispatch_support def sort(values, axis=-1, direction='ASCENDING', name=None): """Sorts a tensor. @@ -67,6 +69,7 @@ def sort(values, axis=-1, direction='ASCENDING', name=None): @tf_export('argsort') +@dispatch.add_dispatch_support def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None): """Returns the indices of a tensor that give its sorted order along an axis. diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 844aa3c744c..c4c88ab86ef 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -1065,6 +1065,7 @@ def sparse_slice(sp_input, start, size, name=None): @tf_export(v1=["sparse_to_dense"]) +@dispatch.add_dispatch_support @deprecation.deprecated( None, "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.") @@ -1994,6 +1995,7 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None): @tf_export(v1=["io.serialize_sparse", "serialize_sparse"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("serialize_sparse") def serialize_sparse(sp_input, name=None, out_type=dtypes.string): """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object. @@ -2014,6 +2016,7 @@ def serialize_sparse(sp_input, name=None, out_type=dtypes.string): @tf_export("io.serialize_sparse", v1=[]) +@dispatch.add_dispatch_support def serialize_sparse_v2(sp_input, out_type=dtypes.string, name=None): """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object. @@ -2040,6 +2043,7 @@ def serialize_sparse_v2(sp_input, out_type=dtypes.string, name=None): @tf_export(v1=["io.serialize_many_sparse", "serialize_many_sparse"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("serialize_many_sparse") def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string): """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`. @@ -2069,6 +2073,7 @@ def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string): @tf_export("io.serialize_many_sparse", v1=[]) +@dispatch.add_dispatch_support def serialize_many_sparse_v2(sp_input, out_type=dtypes.string, name=None): """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`. @@ -2172,6 +2177,7 @@ def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None): @tf_export( "io.deserialize_many_sparse", v1=["io.deserialize_many_sparse", "deserialize_many_sparse"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("deserialize_many_sparse") def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None): """Deserialize and concatenate `SparseTensors` from a serialized minibatch. 
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py index a05a488408d..036346cdecd 100644 --- a/tensorflow/python/ops/special_math_ops.py +++ b/tensorflow/python/ops/special_math_ops.py @@ -42,11 +42,13 @@ from tensorflow.python.ops import gen_special_math_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export # TODO(b/27419586) Change docstring for required dtype of x once int allowed @tf_export('math.lbeta', v1=['math.lbeta', 'lbeta']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('lbeta') def lbeta(x, name=None): r"""Computes \\(ln(|Beta(x)|)\\), reducing along the last dimension. @@ -102,6 +104,7 @@ def lbeta(x, name=None): @tf_export('math.special.dawsn') +@dispatch.add_dispatch_support def dawsn(x, name=None): """Computes Dawson's integral of `x` element-wise. @@ -131,6 +134,7 @@ def dawsn(x, name=None): @tf_export('math.special.expint') +@dispatch.add_dispatch_support def expint(x, name=None): """Computes the Exponential integral of `x` element-wise. @@ -159,6 +163,7 @@ def expint(x, name=None): @tf_export('math.special.fresnel_cos') +@dispatch.add_dispatch_support def fresnel_cos(x, name=None): """Computes Fresnel's cosine integral of `x` element-wise. @@ -188,6 +193,7 @@ def fresnel_cos(x, name=None): @tf_export('math.special.fresnel_sin') +@dispatch.add_dispatch_support def fresnel_sin(x, name=None): """Computes Fresnel's sine integral of `x` element-wise. @@ -216,6 +222,7 @@ def fresnel_sin(x, name=None): @tf_export('math.special.spence') +@dispatch.add_dispatch_support def spence(x, name=None): """Computes Spence's integral of `x` element-wise. @@ -244,6 +251,7 @@ def spence(x, name=None): @tf_export('math.bessel_i0') +@dispatch.add_dispatch_support def bessel_i0(x, name=None): """Computes the Bessel i0 function of `x` element-wise. @@ -268,6 +276,7 @@ def bessel_i0(x, name=None): @tf_export('math.bessel_i1') +@dispatch.add_dispatch_support def bessel_i1(x, name=None): """Computes the Bessel i1 function of `x` element-wise. @@ -325,6 +334,7 @@ def _enclosing_tpu_context(): @tf_export('einsum', 'linalg.einsum') +@dispatch.add_dispatch_support def einsum(equation, *inputs, **kwargs): """Tensor contraction over specified indices and outer product. diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py index 2bf53d3a0f7..0ae29ba0219 100644 --- a/tensorflow/python/ops/stateless_random_ops.py +++ b/tensorflow/python/ops/stateless_random_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_stateless_random_ops from tensorflow.python.ops import math_ops from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export ops.NotDifferentiable("StatelessMultinomial") @@ -40,6 +41,7 @@ ops.NotDifferentiable("StatelessTruncatedNormal") @tf_export("random.experimental.stateless_split") +@dispatch.add_dispatch_support def split(seed, num=2): """Splits an RNG seed into `num` new seeds by adding a leading axis. @@ -73,6 +75,7 @@ def split(seed, num=2): @tf_export("random.experimental.stateless_fold_in") +@dispatch.add_dispatch_support def fold_in(seed, data): """Folds in data to an RNG seed to form a new RNG seed. 
@@ -111,6 +114,7 @@ def fold_in(seed, data): @tf_export("random.stateless_uniform") +@dispatch.add_dispatch_support def stateless_random_uniform(shape, seed, minval=0, @@ -205,6 +209,7 @@ def stateless_random_uniform(shape, @tf_export("random.stateless_binomial") +@dispatch.add_dispatch_support def stateless_random_binomial(shape, seed, counts, @@ -274,6 +279,7 @@ def stateless_random_binomial(shape, @tf_export("random.stateless_gamma") +@dispatch.add_dispatch_support def stateless_random_gamma(shape, seed, alpha, @@ -372,6 +378,7 @@ def stateless_random_gamma(shape, @tf_export("random.stateless_poisson") +@dispatch.add_dispatch_support def stateless_random_poisson(shape, seed, lam, @@ -434,6 +441,7 @@ def stateless_random_poisson(shape, @tf_export("random.stateless_normal") +@dispatch.add_dispatch_support def stateless_random_normal(shape, seed, mean=0.0, @@ -474,6 +482,7 @@ def stateless_random_normal(shape, @tf_export("random.stateless_truncated_normal") +@dispatch.add_dispatch_support def stateless_truncated_normal(shape, seed, mean=0.0, @@ -520,6 +529,7 @@ def stateless_truncated_normal(shape, @tf_export(v1=["random.stateless_multinomial"]) +@dispatch.add_dispatch_support @deprecation.deprecated( date=None, instructions="Use `tf.random.stateless_categorical` instead.") def stateless_multinomial(logits, @@ -562,6 +572,7 @@ def stateless_multinomial(logits, @tf_export("random.stateless_categorical") +@dispatch.add_dispatch_support def stateless_categorical(logits, num_samples, seed, diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index 09ba078383a..dd0ae223d9d 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -73,6 +73,7 @@ regex_full_match.__doc__ = gen_string_ops.regex_full_match.__doc__ @tf_export( "strings.regex_replace", v1=["strings.regex_replace", "regex_replace"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("regex_replace") @dispatch.add_dispatch_support def regex_replace(input, pattern, rewrite, replace_global=True, name=None): @@ -112,6 +113,7 @@ def regex_replace(input, pattern, rewrite, replace_global=True, name=None): @tf_export("strings.format") +@dispatch.add_dispatch_support def string_format(template, inputs, placeholder="{}", summarize=3, name=None): r"""Formats a string template using a list of tensors. 
@@ -300,6 +302,7 @@ def _reduce_join_reduction_dims(x, axis): @tf_export(v1=["strings.reduce_join", "reduce_join"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -412,6 +415,7 @@ string_length_v2.__doc__ = gen_string_ops.string_length.__doc__ @tf_export(v1=["substr"]) +@dispatch.add_dispatch_support @deprecation.deprecated(None, "Use `tf.strings.substr` instead of `tf.substr`.") def substr_deprecated(input, pos, len, name=None, unit="BYTE"): return substr(input, pos, len, name=name, unit=unit) @@ -476,6 +480,7 @@ def string_to_number(input, out_type=dtypes.float32, name=None): @tf_export(v1=["strings.to_number", "string_to_number"]) +@dispatch.add_dispatch_support def string_to_number_v1( string_tensor=None, out_type=dtypes.float32, @@ -519,6 +524,7 @@ def string_to_hash_bucket(input, num_buckets, name=None): @tf_export(v1=["strings.to_hash_bucket", "string_to_hash_bucket"]) +@dispatch.add_dispatch_support def string_to_hash_bucket_v1( string_tensor=None, num_buckets=None, @@ -532,6 +538,7 @@ string_to_hash_bucket_v1.__doc__ = gen_string_ops.string_to_hash_bucket.__doc__ @tf_export("strings.join", v1=["strings.join", "string_join"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("string_join") @dispatch.add_dispatch_support def string_join(inputs, separator="", name=None): From fdcdac12a72ead0128463dc029af58e896897cc9 Mon Sep 17 00:00:00 2001 From: Rajeshwar Reddy T <43972606+rthadur@users.noreply.github.com> Date: Fri, 15 May 2020 11:22:06 -0700 Subject: [PATCH 0671/1533] Update bot_config.yml --- .github/bot_config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/bot_config.yml b/.github/bot_config.yml index d63bd2ce844..fdb19d453c2 100644 --- a/.github/bot_config.yml +++ b/.github/bot_config.yml @@ -1,4 +1,4 @@ - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From ff17316b19d5958605cebf941e4302d60f405784 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Fri, 15 May 2020 11:20:09 -0700 Subject: [PATCH 0672/1533] Check for `_metrics` in case sublayer is resetting `_metrics` property. PiperOrigin-RevId: 311767501 Change-Id: I1f97904314a0f1912c918b89f461edd1183f4604 --- tensorflow/python/keras/engine/base_layer.py | 2 +- tensorflow/python/keras/engine/base_layer_v1.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 94b696d842b..0f4bec92e39 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -2588,7 +2588,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # Keep track of metric instance created in subclassed layer. from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top for val in nest.flatten(value): - if isinstance(val, metrics_module.Metric): + if isinstance(val, metrics_module.Metric) and hasattr(self, '_metrics'): self._metrics.append(val) # TODO(scottzhu): Need to track Module object as well for weight tracking. 
diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py index 4a277ec3a3e..80e0b4be2f1 100644 --- a/tensorflow/python/keras/engine/base_layer_v1.py +++ b/tensorflow/python/keras/engine/base_layer_v1.py @@ -2226,7 +2226,7 @@ class Layer(base_layer.Layer): # Keep track of metric instance created in subclassed layer. from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top for val in nest.flatten(value): - if isinstance(val, metrics_module.Metric): + if isinstance(val, metrics_module.Metric) and hasattr(self, '_metrics'): self._metrics.append(val) # TODO(scottzhu): Need to track Module object as well for weight tracking. From 6e2219518cf6351bd7067b98e75e2862b5c5b88a Mon Sep 17 00:00:00 2001 From: Rajeshwar Reddy T <43972606+rthadur@users.noreply.github.com> Date: Fri, 15 May 2020 11:23:59 -0700 Subject: [PATCH 0673/1533] Update bot_config.yml --- .github/bot_config.yml | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/bot_config.yml b/.github/bot_config.yml index fdb19d453c2..ee6037f4b94 100644 --- a/.github/bot_config.yml +++ b/.github/bot_config.yml @@ -1,23 +1,23 @@ - # Copyright 2019 The TensorFlow Authors. All Rights Reserved. - # - # Licensed under the Apache License, Version 2.0 (the "License"); - # you may not use this file except in compliance with the License. - # You may obtain a copy of the License at - # - # http://www.apache.org/licenses/LICENSE-2.0 - # - # Unless required by applicable law or agreed to in writing, software - # distributed under the License is distributed on an "AS IS" BASIS, - # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - # See the License for the specific language governing permissions and - # limitations under the License. - # ============================================================================ - # - # THIS IS A GENERATED DOCKERFILE. - # - # This file was assembled from multiple pieces, whose use is documented - # throughout. Please refer to the TensorFlow dockerfiles documentation - # for more information. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. # A list of assignees assignees: From 1f530076d15fe482234b83273e4432b14c353853 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 15 May 2020 11:21:31 -0700 Subject: [PATCH 0674/1533] Fix TFLite builds on Windows/MacOS Avoid using `--enable_platform_specific_config` when cross-compiling for iOS/Android, as this pulls in host build flags, which may not be appropriate (e.g., when cross-compiling for Android on a Windows host). 
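For example (an illustrative repro, not taken from the commit), a cross-compile such as the following would previously pick up the host's `--config=windows` options (e.g. `--copt=/w`), because `--enable_platform_specific_config` selects the config line for the host OS rather than the target platform:

    # hypothetical invocation from a Windows host
    bazel build --config=android_arm64 //tensorflow/lite/c:tensorflowlite_c

With this change, the android and ios configs opt out via `--noenable_platform_specific_config` and instead set their warning-suppression and C++14 flags explicitly, as the .bazelrc hunk below shows.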
Also fix an issue when building tensorflowlite_c for iOS. Fixes #38525. PiperOrigin-RevId: 311767770 Change-Id: I80b817fd89a6889dc78be50f1def8b899f091cb6 --- .bazelrc | 11 ++++++++++- configure.py | 1 - tensorflow/lite/c/BUILD | 3 +++ tensorflow/tensorflow.bzl | 3 +++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.bazelrc b/.bazelrc index 224238d7c0b..7e0f820b4c2 100644 --- a/.bazelrc +++ b/.bazelrc @@ -235,10 +235,15 @@ build:c++17 --cxxopt=-std=c++1z build:c++17 --cxxopt=-stdlib=libc++ build:c++1z --config=c++17 -# Enable using platform specific build settings +# Enable using platform specific build settings, except when cross-compiling for +# mobile platforms. build --enable_platform_specific_config +build:android --noenable_platform_specific_config +build:ios --noenable_platform_specific_config # Suppress C++ compiler warnings, otherwise build logs become 10s of MBs. +build:android --copt=-w +build:ios --copt=-w build:linux --copt=-w build:macos --copt=-w build:windows --copt=/w @@ -258,6 +263,10 @@ build:macos --define=INCLUDEDIR=$(PREFIX)/include # TF_SYSTEM_LIBS do not work on windows. # By default, build TF in C++ 14 mode. +build:android --cxxopt=-std=c++14 +build:android --host_cxxopt=-std=c++14 +build:ios --cxxopt=-std=c++14 +build:ios --host_cxxopt=-std=c++14 build:linux --cxxopt=-std=c++14 build:linux --host_cxxopt=-std=c++14 build:macos --cxxopt=-std=c++14 diff --git a/configure.py b/configure.py index 945c3036a8d..9154000d944 100644 --- a/configure.py +++ b/configure.py @@ -1387,7 +1387,6 @@ def main(): # Windows. environ_cp['TF_DOWNLOAD_CLANG'] = '0' environ_cp['TF_NEED_MPI'] = '0' - environ_cp['TF_SET_ANDROID_WORKSPACE'] = '0' if is_macos(): environ_cp['TF_NEED_TENSORRT'] = '0' diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD index e1702d40d5a..1aa043b7c0c 100644 --- a/tensorflow/lite/c/BUILD +++ b/tensorflow/lite/c/BUILD @@ -22,6 +22,9 @@ package( tflite_cc_shared_object( name = "tensorflowlite_c", linkopts = select({ + "//tensorflow:ios": [ + "-Wl,-exported_symbols_list,$(location //tensorflow/lite/c:exported_symbols.lds)", + ], "//tensorflow:macos": [ "-Wl,-exported_symbols_list,$(location //tensorflow/lite/c:exported_symbols.lds)", ], diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index f56330b428a..c029de9a4e8 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -615,6 +615,9 @@ def tf_cc_shared_object( linkshared = 1, data = data + data_extra, linkopts = linkopts + _rpath_linkopts(name_os_full) + select({ + clean_dep("//tensorflow:ios"): [ + "-Wl,-install_name,@rpath/" + soname, + ], clean_dep("//tensorflow:macos"): [ "-Wl,-install_name,@rpath/" + soname, ], From 9cb8d45b72233c19125c4ca8890fae5611110ec9 Mon Sep 17 00:00:00 2001 From: Lev Proleev Date: Fri, 15 May 2020 11:44:26 -0700 Subject: [PATCH 0675/1533] Add NNAPI delegate support for Elu PiperOrigin-RevId: 311772163 Change-Id: I94393872c9afa25aafc2fc55f688d47caa57ed14 --- .../lite/delegates/nnapi/acceleration_test_list.cc | 7 ++++--- tensorflow/lite/delegates/nnapi/nnapi_delegate.cc | 14 ++++++++++++-- .../lite/delegates/nnapi/nnapi_delegate_kernel.h | 1 + tensorflow/lite/kernels/lstm_test.cc | 2 +- tensorflow/lite/nnapi/NeuralNetworksTypes.h | 7 +++++++ tensorflow/lite/nnapi/nnapi_implementation.cc | 13 ------------- 6 files changed, 25 insertions(+), 19 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc index cc9e049123e..46a6a720d1e 100644 --- 
a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc +++ b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc @@ -56,6 +56,7 @@ FloatActivationsOpTest/PRelu,29 LogisticOpTest/LogisticOpTest/Sigmoid(.+nt8)?/\d+ LogisticOpTest/LogisticOpTest/Sigmoid/\d+ TanhOpTest/TanhOpTest/Tanh(.+nt8)?/\d+,29 +FloatActivationsOpTest/Elu,30 FloatActivationsOpTest/HardSwish QuantizedActivationsOpTest/HardSwish QuantizedActivationsOpTest/HardSwishBias @@ -301,14 +302,14 @@ VariedShapeSpec/ReshapeOpTest/WithStretchDimension/1 # resize_bilinear_test // align_corners & half_pixel_centers are not implemented in NNAPI before API 30 -ResizeBilinearOpTest/ResizeBilinearOpTest.+HalfPixelCenters.*,30 +ResizeBilinearOpTest/ResizeBilinearOpTest.+HalfPixelCenters.*/0,30 // Only models with constant size tensor are accelerated ResizeBilinearOpTest/ResizeBilinearOpTest/.+/0,29 # resize_nearest_neighbor_test // align_corners & half_pixel_centers are not implemented in NNAPI before API 30 -ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+AlignCorners.*,30 -ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+HalfPixelCenters.*,30 +ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+AlignCorners.*/0,30 +ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+HalfPixelCenters.*/0,30 // Only models with constant size tensor are accelerated ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest/.+/0,29 diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index ff6ad0dc0d9..e6faea62bf6 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -1623,7 +1624,7 @@ bool NNAPIDelegateKernel::Validate( } } break; case kTfLiteBuiltinResizeBilinear: { - ExpectMaxOpVersion(version, 2, &val_ctx); + ExpectMaxOpVersion(version, 3, &val_ctx); const auto& input = context->tensors[node->inputs->data[0]]; const auto output_dims = context->tensors[node->outputs->data[0]].dims; Expect(input.dims->size == 4, @@ -1663,7 +1664,7 @@ bool NNAPIDelegateKernel::Validate( } } break; case kTfLiteBuiltinResizeNearestNeighbor: { - ExpectMaxOpVersion(version, 2, &val_ctx); + ExpectMaxOpVersion(version, 3, &val_ctx); ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12, &val_ctx); ExpectIsFloatOrQuant8Operator(context, node, &val_ctx); @@ -2334,6 +2335,11 @@ bool NNAPIDelegateKernel::Validate( NNAPIValidationFailureType::kUnsupportedInputType, "NNAPI only supports floating point input.", &val_ctx); } break; + case kTfLiteBuiltinElu: { + ExpectOpVersion(version, 1, &val_ctx); + ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI13, + &val_ctx); + } break; default: // All other operators are not mapped. AddValidationFailure(NNAPIValidationFailureType::kUnsupportedOperator, @@ -3111,6 +3117,10 @@ TfLiteStatus NNAPIDelegateKernel::Map( mapping_args.builder->AddScalarBoolOperand(builtin->keep_dims); *nn_op_type = ANEURALNETWORKS_REDUCE_SUM; } break; + case kTfLiteBuiltinElu: { + mapping_args.builder->AddScalarFloat32Operand(1.0); + *nn_op_type = ANEURALNETWORKS_ELU; + } break; default: // All other operators are not mapped. 
return kTfLiteError; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 668fdf5b5f6..af93d9650c9 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -31,6 +31,7 @@ namespace nnapi { constexpr int32_t kMinSdkVersionForNNAPI = 27; constexpr int32_t kMinSdkVersionForNNAPI11 = 28; constexpr int32_t kMinSdkVersionForNNAPI12 = 29; +constexpr int32_t kMinSdkVersionForNNAPI13 = 30; // Track tensor indices to NN API tensor indices mapping. class OperandMapping { diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc index 2bd31eae8db..62634e6bfbd 100644 --- a/tensorflow/lite/kernels/lstm_test.cc +++ b/tensorflow/lite/kernels/lstm_test.cc @@ -2050,7 +2050,7 @@ TEST_P(CifgPeepholeProjectionNoClippingLayerNormLstmTest, }}; VerifyGoldens(lstm_input_, lstm_golden_output_, &layer_norm_lstm, - /*tolerance=*/0.000902065); + /*tolerance=*/0.0009021); } class CifgPeepholeProjectionNoClippingLayerNormLstmInt8Test diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index 851c1718e0a..a3dfd373405 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -136,6 +136,13 @@ enum { ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_LSTM = 92, ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_RNN = 93, ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR = 94, + ANEURALNETWORKS_QUANTIZED_LSTM = 95, + ANEURALNETWORKS_IF = 96, + ANEURALNETWORKS_WHILE = 97, + ANEURALNETWORKS_ELU = 98, + ANEURALNETWORKS_HARD_SWISH = 99, + ANEURALNETWORKS_FILL = 100, + ANEURALNETWORKS_RANK = 101, }; /** diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc index 71a4de53e9a..accdfb6c7da 100644 --- a/tensorflow/lite/nnapi/nnapi_implementation.cc +++ b/tensorflow/lite/nnapi/nnapi_implementation.cc @@ -45,19 +45,6 @@ int32_t GetAndroidSdkVersion() { } result = result * 10 + digit; } - // TODO(levp): remove once SDK gets updated to 29th level - // Upgrade SDK version for pre-release Q to be able to test functionality - // available from SDK level 29. - if (result == 28) { - char versionCodename[PROP_VALUE_MAX]; - const char* versionCodenameProp = "ro.build.version.codename"; - length = __system_property_get(versionCodenameProp, versionCodename); - if (length != 0) { - if (versionCodename[0] == 'Q') { - return 29; - } - } - } return result; } return 0; From 362818b71e540df53f909f09f55f5c31234c29ca Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Fri, 15 May 2020 11:52:10 -0700 Subject: [PATCH 0676/1533] Attempt to build libtensorflow GPU with the manylinux2010 toolchain. 
PiperOrigin-RevId: 311773671 Change-Id: I6a0a34852786fb2187ea7ad131a6e4878c84e089 --- tensorflow/tools/ci_build/builds/libtensorflow.sh | 2 +- tensorflow/tools/ci_build/linux/libtensorflow_docker.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh index 44180b8bf84..a281afe7442 100755 --- a/tensorflow/tools/ci_build/builds/libtensorflow.sh +++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh @@ -54,7 +54,7 @@ function build_libtensorflow_tarball() { BAZEL_OPTS="--config=opt --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0" export CC_OPT_FLAGS="-mavx -msse4.2" if [ "${TF_NEED_CUDA}" == "1" ]; then - BAZEL_OPTS="${BAZEL_OPTS} --config=cuda" + BAZEL_OPTS="${BAZEL_OPTS} --config=cuda --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain" export TF_NEED_ROCM=0 fi bazel clean --expunge diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh index 467b8dc8083..1b255682671 100755 --- a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh +++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh @@ -36,7 +36,7 @@ DOCKER_BINARY="docker" if [ "${TF_NEED_CUDA}" == "1" ]; then DOCKER_IMAGE="tf-tensorflow-gpu" DOCKER_BINARY="nvidia-docker" - DOCKER_FILE="Dockerfile.gpu" + DOCKER_FILE="Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010" fi if [ "${TF_NEED_ROCM}" == "1" ]; then DOCKER_IMAGE="tf-tensorflow-rocm" From 75132b735b4f0f0ec7f86f5d3db9b8e05209ab63 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 11:54:21 -0700 Subject: [PATCH 0677/1533] Minor cleanup for strings in xplane_to_profile_response. PiperOrigin-RevId: 311774079 Change-Id: I445cd1121c548dd2beb133057eeab4f434939df9 --- .../core/profiler/convert/xplane_to_profile_response.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc index e6fe74942fc..70a07171310 100644 --- a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc +++ b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" +#include + #include "absl/container/flat_hash_set.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -78,14 +80,14 @@ Status ConvertProtoToJson(const Proto& proto_output, std::string* json_output) { // tensorflow::StringPiece. auto error_msg = status.message(); return errors::Internal( - strings::StrCat("Could not convert proto to JSON string: ", - StringPiece(error_msg.data(), error_msg.length()))); + "Could not convert proto to JSON string: ", + absl::string_view(error_msg.data(), error_msg.length())); } return Status::OK(); } // Returns the tool name with extension. 
-string ToolName(absl::string_view tool) { +std::string ToolName(absl::string_view tool) { if (tool == kTraceViewer) return "trace.json.gz"; if (tool == kMemoryProfile) return "memory_profile.json.gz"; return absl::StrCat(tool, ".pb"); From b1fc80f4a199f353d5bce0a79689b08181d3d96d Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Fri, 15 May 2020 12:00:16 -0700 Subject: [PATCH 0678/1533] Reduce Functional.__call__ Python overhead by ~5-10% PiperOrigin-RevId: 311775071 Change-Id: I45dd0a1ce865d6c17f7b5e292799348e1e17a91c --- tensorflow/python/keras/engine/functional.py | 46 +++++++------------- tensorflow/python/keras/engine/node.py | 4 ++ 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index c79e2849c4f..f219e590daf 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -469,11 +469,11 @@ class Functional(training_lib.Model): mask: (Optional) Tensor or nested structure of Tensors. Returns: - Two lists: output_tensors, output_masks + output_tensors """ inputs = self._flatten_to_reference_inputs(inputs) if mask is None: - masks = [None for _ in range(len(inputs))] + masks = [None] * len(inputs) else: masks = self._flatten_to_reference_inputs(mask) for input_t, mask in zip(inputs, masks): @@ -481,55 +481,39 @@ class Functional(training_lib.Model): # Dictionary mapping reference tensors to computed tensors. tensor_dict = {} + tensor_usage_count = self._tensor_usage_count for x, y in zip(self.inputs, inputs): y = self._conform_to_reference_input(y, ref_input=x) x_id = str(id(x)) - tensor_dict[x_id] = [y] * self._tensor_usage_count[x_id] + tensor_dict[x_id] = [y] * tensor_usage_count[x_id] - depth_keys = list(self._nodes_by_depth.keys()) + nodes_by_depth = self._nodes_by_depth + depth_keys = list(nodes_by_depth.keys()) depth_keys.sort(reverse=True) for depth in depth_keys: - nodes = self._nodes_by_depth[depth] + nodes = nodes_by_depth[depth] for node in nodes: if node.is_input: continue # Input tensors already exist. - if not all( - str(id(tensor)) in tensor_dict - for tensor in nest.flatten(node.keras_inputs)): + if any(t_id not in tensor_dict for t_id in node.flat_input_ids): continue # Node is not computable, try skipping. - layer = node.layer args, kwargs = node.map_arguments(tensor_dict) - outputs = layer(*args, **kwargs) + outputs = node.layer(*args, **kwargs) # Update tensor_dict. - for x, y in zip(nest.flatten(node.outputs), nest.flatten(outputs)): - x_id = str(id(x)) - tensor_dict[x_id] = [y] * self._tensor_usage_count[x_id] + for x_id, y in zip(node.flat_output_ids, nest.flatten(outputs)): + tensor_dict[x_id] = [y] * tensor_usage_count[x_id] output_tensors = [] - output_shapes = [] for x in self.outputs: - assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x) - tensor = tensor_dict[str(id(x))].pop() - output_shapes.append(x.shape) - output_tensors.append(tensor) + x_id = str(id(x)) + assert x_id in tensor_dict, 'Could not compute output ' + str(x) + output_tensors.append(tensor_dict[x_id].pop()) - if output_shapes is not None: - input_shapes = [x.shape for x in inputs] - try: - cache_key = tuple(tf_utils.convert_shapes(input_shapes, to_tuples=True)) - self._output_shape_cache[cache_key] = nest.pack_sequence_as( - self._nested_outputs, output_shapes) - except ValueError: - # In case there are unknown TensorShape, eg for sparse tensor input, - # We skip the caching since the shape is unknown. 
- pass - - output_tensors = nest.pack_sequence_as(self._nested_outputs, output_tensors) - return output_tensors + return nest.pack_sequence_as(self._nested_outputs, output_tensors) def _flatten_to_reference_inputs(self, tensors): """Maps `tensors` to their respective `keras.Input`.""" diff --git a/tensorflow/python/keras/engine/node.py b/tensorflow/python/keras/engine/node.py index 945cf1c64bd..a9e0b621d75 100644 --- a/tensorflow/python/keras/engine/node.py +++ b/tensorflow/python/keras/engine/node.py @@ -102,6 +102,10 @@ class Node(object): tensor._keras_history = KerasHistory( layer=layer, node_index=node_index, tensor_index=i) + # Cached for performance. + self.flat_input_ids = [str(id(t)) for t in self._keras_inputs] + self.flat_output_ids = [str(id(t)) for t in nest.flatten(self.outputs)] + @property def keras_inputs(self): """Tensors input to this node that can be traced back to a `keras.Input`.""" From 0b59eaf0bf66b71dc108ec4f73c548fc48abc36d Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Fri, 15 May 2020 12:09:13 -0700 Subject: [PATCH 0679/1533] Provide builtin_op_kernels target with Ruy and GEMV caching unconditionally enabled PiperOrigin-RevId: 311776871 Change-Id: I948ea5524fdcf17c36e6219fb1ae18fafdecee4e --- tensorflow/lite/kernels/BUILD | 294 +++++++++++++++++++--------------- 1 file changed, 168 insertions(+), 126 deletions(-) diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 3a29fee5699..657b5d89a85 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -235,6 +235,15 @@ cc_library( visibility = ["//visibility:private"], ) +cc_library( + name = "tflite_with_ruy_and_caching_enabled", + defines = [ + "TFLITE_WITH_RUY", + "TFLITE_WITH_RUY_GEMV", + ], + visibility = ["//visibility:private"], +) + cc_library( name = "tflite_with_ruy_default", build_for_embedded = True, @@ -423,140 +432,157 @@ cc_library( ], ) +BUILTIN_KERNEL_SRCS = [ + "activations.cc", + "add.cc", + "add_n.cc", + "arg_min_max.cc", + "audio_spectrogram.cc", + "basic_rnn.cc", + "batch_matmul.cc", + "batch_to_space_nd.cc", + "bidirectional_sequence_lstm.cc", + "bidirectional_sequence_rnn.cc", + "cast.cc", + "ceil.cc", + "comparisons.cc", + "concatenation.cc", + "conv.cc", + "densify.cc", + "depth_to_space.cc", + "depthwise_conv.cc", + "dequantize.cc", + "detection_postprocess.cc", + "div.cc", + "elementwise.cc", + "embedding_lookup.cc", + "embedding_lookup_sparse.cc", + "exp.cc", + "expand_dims.cc", + "fake_quant.cc", + "fill.cc", + "floor.cc", + "floor_div.cc", + "floor_mod.cc", + "fully_connected.cc", + "gather.cc", + "gather_nd.cc", + "hashtable_lookup.cc", + "if.cc", + "l2norm.cc", + "local_response_norm.cc", + "logical.cc", + "lsh_projection.cc", + "lstm.cc", + "matrix_diag.cc", + "matrix_set_diag.cc", + "maximum_minimum.cc", + "mfcc.cc", + "mirror_pad.cc", + "mul.cc", + "neg.cc", + "non_max_suppression.cc", + "numeric_verify.cc", + "one_hot.cc", + "pack.cc", + "pad.cc", + "pooling.cc", + "pow.cc", + "quantize.cc", + "range.cc", + "rank.cc", + "reduce.cc", + "reshape.cc", + "resize_bilinear.cc", + "resize_nearest_neighbor.cc", + "reverse.cc", + "reverse_sequence.cc", + "round.cc", + "scatter_nd.cc", + "segment_sum.cc", + "select.cc", + "shape.cc", + "skip_gram.cc", + "slice.cc", + "space_to_batch_nd.cc", + "space_to_depth.cc", + "sparse_to_dense.cc", + "split.cc", + "split_v.cc", + "squared_difference.cc", + "squeeze.cc", + "strided_slice.cc", + "sub.cc", + "svdf.cc", + "tile.cc", + "topk_v2.cc", + "transpose.cc", + "transpose_conv.cc", + 
"unidirectional_sequence_lstm.cc", + "unidirectional_sequence_rnn.cc", + "unique.cc", + "unpack.cc", + "where.cc", + "while.cc", + "zeros_like.cc", +] + +BUILTIN_KERNEL_DEPS = [ + ":cpu_backend_context", + ":cpu_backend_gemm", + ":cpu_backend_threadpool", + ":eigen_support", + ":kernel_util", + ":lstm_eval", + ":lstm_shared", + ":op_macros", + ":padding", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "//third_party/eigen3", + "@flatbuffers", + "//tensorflow/lite:framework_lib", + "//tensorflow/lite:minimal_logging", + "//tensorflow/lite:string_util", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels/internal:audio_utils", + "//tensorflow/lite/kernels/internal:common", + "//tensorflow/lite/kernels/internal:compatibility", + "//tensorflow/lite/kernels/internal:cpu_check", + "//tensorflow/lite/kernels/internal:kernel_utils", + "//tensorflow/lite/kernels/internal:optimized", + "//tensorflow/lite/kernels/internal:optimized_base", + "//tensorflow/lite/kernels/internal:quantization_util", + "//tensorflow/lite/kernels/internal:reference_base", + "//tensorflow/lite/kernels/internal:strided_slice_logic", + "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/kernels/internal:tensor_utils", + "//tensorflow/lite/kernels/internal:types", +] + cc_library( name = "builtin_op_kernels", - srcs = [ - "activations.cc", - "add.cc", - "add_n.cc", - "arg_min_max.cc", - "audio_spectrogram.cc", - "basic_rnn.cc", - "batch_matmul.cc", - "batch_to_space_nd.cc", - "bidirectional_sequence_lstm.cc", - "bidirectional_sequence_rnn.cc", - "cast.cc", - "ceil.cc", - "comparisons.cc", - "concatenation.cc", - "conv.cc", - "densify.cc", - "depth_to_space.cc", - "depthwise_conv.cc", - "dequantize.cc", - "detection_postprocess.cc", - "div.cc", - "elementwise.cc", - "embedding_lookup.cc", - "embedding_lookup_sparse.cc", - "exp.cc", - "expand_dims.cc", - "fake_quant.cc", - "fill.cc", - "floor.cc", - "floor_div.cc", - "floor_mod.cc", - "fully_connected.cc", - "gather.cc", - "gather_nd.cc", - "hashtable_lookup.cc", - "if.cc", - "l2norm.cc", - "local_response_norm.cc", - "logical.cc", - "lsh_projection.cc", - "lstm.cc", - "matrix_diag.cc", - "matrix_set_diag.cc", - "maximum_minimum.cc", - "mfcc.cc", - "mirror_pad.cc", - "mul.cc", - "neg.cc", - "non_max_suppression.cc", - "numeric_verify.cc", - "one_hot.cc", - "pack.cc", - "pad.cc", - "pooling.cc", - "pow.cc", - "quantize.cc", - "range.cc", - "rank.cc", - "reduce.cc", - "reshape.cc", - "resize_bilinear.cc", - "resize_nearest_neighbor.cc", - "reverse.cc", - "reverse_sequence.cc", - "round.cc", - "scatter_nd.cc", - "segment_sum.cc", - "select.cc", - "shape.cc", - "skip_gram.cc", - "slice.cc", - "space_to_batch_nd.cc", - "space_to_depth.cc", - "sparse_to_dense.cc", - "split.cc", - "split_v.cc", - "squared_difference.cc", - "squeeze.cc", - "strided_slice.cc", - "sub.cc", - "svdf.cc", - "tile.cc", - "topk_v2.cc", - "transpose.cc", - "transpose_conv.cc", - "unidirectional_sequence_lstm.cc", - "unidirectional_sequence_rnn.cc", - "unique.cc", - "unpack.cc", - "where.cc", - "while.cc", - "zeros_like.cc", - ], + srcs = BUILTIN_KERNEL_SRCS, hdrs = [ "dequantize.h", ], copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS, visibility = ["//visibility:private"], - deps = [ - ":cpu_backend_context", - ":cpu_backend_gemm", - ":cpu_backend_threadpool", - ":eigen_support", - ":kernel_util", - ":lstm_eval", - ":lstm_shared", - ":op_macros", - ":padding", - "//tensorflow/lite:framework_lib", - "//tensorflow/lite:minimal_logging", - 
"//tensorflow/lite:string_util", - "//tensorflow/lite/c:common", - "//tensorflow/lite/kernels/internal:audio_utils", - "//tensorflow/lite/kernels/internal:common", - "//tensorflow/lite/kernels/internal:compatibility", - "//tensorflow/lite/kernels/internal:cpu_check", - "//tensorflow/lite/kernels/internal:kernel_utils", - "//tensorflow/lite/kernels/internal:optimized", - "//tensorflow/lite/kernels/internal:optimized_base", - "//tensorflow/lite/kernels/internal:quantization_util", - "//tensorflow/lite/kernels/internal:reference_base", - "//tensorflow/lite/kernels/internal:strided_slice_logic", - "//tensorflow/lite/kernels/internal:tensor", - "//tensorflow/lite/kernels/internal:tensor_utils", - "//tensorflow/lite/kernels/internal:types", - "//third_party/eigen3", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@farmhash_archive//:farmhash", - "@flatbuffers", + deps = BUILTIN_KERNEL_DEPS + ["@farmhash_archive//:farmhash"], +) + +# Creates a target where Ruy is unconditionally enabled along with caching +# on GEMV operations. This is useful for TF Lite deployments where custom +# copts are not allowed, e.g. b/156119344 +cc_library( + name = "builtin_op_kernels_ruy_and_caching", + srcs = BUILTIN_KERNEL_SRCS, + hdrs = [ + "dequantize.h", ], + copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS, + visibility = ["//visibility:private"], + deps = BUILTIN_KERNEL_DEPS + ["@farmhash_archive//:farmhash"] + [":tflite_with_ruy_and_caching_enabled"], ) cc_library( @@ -673,6 +699,22 @@ cc_library( ], ) +# TODO(b/156664104) Remove once runtime flag available. +cc_library( + name = "builtin_ops_ruy_and_caching_enabled", + srcs = ["register.cc"], + hdrs = [ + "builtin_op_kernels.h", + "fully_connected.h", + "register.h", + ], + deps = [ + ":builtin_op_kernels_ruy_and_caching", + "//tensorflow/lite:framework_lib", + "//tensorflow/lite/c:common", + ], +) + # The builtin_ops target will resolve to optimized kernels when available. This # target uses reference kernels only, and is useful for validation and testing. # It should *not* generally be used in production. From d968853cc6825c705a4443844319279c464b152e Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: Fri, 15 May 2020 12:12:51 -0700 Subject: [PATCH 0680/1533] Skip TFE_ContextAsyncWait for tfrt. In current TF-TFRT integration, all ops are executed synchronously. We will revisit this later. 
PiperOrigin-RevId: 311777624 Change-Id: I3a27805dcce53ccf572f3c500d6fd0a532b286b2 --- tensorflow/c/eager/c_api.cc | 4 +--- tensorflow/c/eager/context_interface.h | 3 +++ tensorflow/core/common_runtime/eager/context.h | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 5c01ccb82bb..f5535c80d30 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -899,9 +899,7 @@ TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context* ctx, #if defined(IS_MOBILE_PLATFORM) status->status = tensorflow::Status::OK(); #else // !defined(IS_MOBILE_PLATFORM) - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - status->status = context->SyncExecutors(); + status->status = tensorflow::unwrap(ctx)->AsyncWait(); #endif // !IS_MOBILE_PLATFORM } diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/context_interface.h index d21ab45e579..76f182f4945 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/context_interface.h @@ -101,6 +101,9 @@ class AbstractContextInterface { // Destroy the step resource container for a training step. virtual void EndStep() = 0; + // Block until all pending nodes are finished, + virtual Status AsyncWait() = 0; + protected: virtual ~AbstractContextInterface() {} }; diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index d034aaf2f9c..d03a91c817a 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -295,6 +295,8 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { // errors, and the error message will be combined from all executors. Status SyncExecutors(); + Status AsyncWait() override { return SyncExecutors(); } + core::RefCountPtr GetCachedKernel(Fprint128 cache_key); void AddKernelToCache(Fprint128 cache_key, KernelAndDevice* kernel); From 985275ea27f1e542da9d267a0a42b791d4159ac5 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 15 May 2020 12:15:58 -0700 Subject: [PATCH 0681/1533] When restoring a variable with an initializer, pass through restore metadata rather than forgetting it This avoids 2x memory usage when restoring with a distribution strategy, since otherwise variables are restored twice (with two live copies the second time). 
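A rough usage sketch of the behavior this enables (it mirrors the new test below; the strategy, path, shape and values are illustrative, and _preload_simple_restoration is an internal API): a variable created from the preloaded checkpoint value is already tagged as restored, so attaching it to the checkpoint afterwards does not load the value again:

    import tensorflow as tf

    # Save a checkpoint holding a single variable.
    save_ckpt = tf.train.Checkpoint(v=tf.Variable(tf.ones([5])))
    path = save_ckpt.save("/tmp/restore_metadata_example")

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
      # Restore by initializing a fresh variable from the checkpointed value.
      restore_ckpt = tf.train.Checkpoint()
      restore_ckpt.restore(path)
      initial_value = restore_ckpt._preload_simple_restoration("v", [5])
      v = tf.Variable(initial_value)

    # The variable now carries the restore metadata, so this assignment does
    # not trigger a second restore (and a second live copy of the data).
    restore_ckpt.v = v
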
PiperOrigin-RevId: 311778129 Change-Id: I60c1c23d0b554d30e3913f588e6f11a7c430fe71 --- .../python/distribute/checkpointing_test.py | 36 +++++++++++++++++++ .../python/distribute/distribute_lib.py | 20 ++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/distribute/checkpointing_test.py b/tensorflow/python/distribute/checkpointing_test.py index 040faf6f6ce..ad646905315 100644 --- a/tensorflow/python/distribute/checkpointing_test.py +++ b/tensorflow/python/distribute/checkpointing_test.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.keras.engine import training from tensorflow.python.keras.layers import core from tensorflow.python.keras.optimizer_v2 import adam +from tensorflow.python.ops import array_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training import adam as adam_v1 @@ -96,6 +97,41 @@ class TrainingCheckpointTests(test.TestCase, parameterized.TestCase): self.assertEqual((training_continuation + 1) * num_training_steps, root.optimizer_step.numpy()) + @combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_one_cpu, + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.tpu_strategy, + strategy_combinations.central_storage_strategy_with_two_gpus, + ], + mode=["eager"])) + def testInitializeFromCheckpoint(self, distribution): + variable_shape = [5] + save_checkpoint = trackable_utils.Checkpoint(v=variables_lib.Variable( + array_ops.ones(variable_shape))) + save_path = save_checkpoint.save( + os.path.join(self.get_temp_dir(), "checkpoint")) + with distribution.scope(): + restore_checkpoint = trackable_utils.Checkpoint() + restore_checkpoint.restore(save_path) + initial_value = restore_checkpoint._preload_simple_restoration( + "v", variable_shape) + v = variables_lib.Variable(initial_value) + # Check that the variable is now tagged as restored. `Checkpoint` then + # knows it doesn't have to restore `v`'s value when it's assigned to an + # object. + self.assertGreater(v._update_uid, 0) + self.assertAllClose(array_ops.ones(variable_shape), v) + v.assign(array_ops.zeros(variable_shape)) + # Assignment to an object should not trigger restoration, since we already + # restored the object through an initializer. This wouldn't be a + # correctness issue, but it would mean that models would use twice as much + # memory when loading (the buffer already assigned to the variable, and + # the new restoration). + restore_checkpoint.v = v + self.assertAllClose(array_ops.zeros(variable_shape), v) + @combinations.generate( combinations.combine( distribution=[ diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 6baa15f59c1..4531e922840 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1772,13 +1772,25 @@ class StrategyExtendedV2(object): kwargs["distribute_strategy"] = strategy # Unwrap `initial_value` if it is a `CheckpointInitialValue` to avoid - # dereferencing a `Tensor` that is without a `name`. - # TODO(b/138130844): Revisit the following check once - # `CheckpointInitialValue` class is removed. + # dereferencing a `Tensor` that is without a `name`. We still need to + # propagate the metadata it's holding. 
if isinstance(kwargs["initial_value"], trackable.CheckpointInitialValue): + checkpoint_restore_uid = kwargs[ + "initial_value"].checkpoint_position.restore_uid kwargs["initial_value"] = kwargs["initial_value"].wrapped_value + else: + checkpoint_restore_uid = None - return self._create_variable(next_creator, **kwargs) + created = self._create_variable(next_creator, **kwargs) + + if checkpoint_restore_uid is not None: + # pylint: disable=protected-access + # Let the checkpointing infrastructure know that the variable was + # already restored so it doesn't waste memory loading the value again. + created._maybe_initialize_trackable() + created._update_uid = checkpoint_restore_uid + # pylint: enable=protected-access + return created def distributed_getter(getter, *args, **kwargs): if not self._allow_variable_partition(): From 28229ffdbf8d996449bf2ad8289d18201f21ca7b Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 15 May 2020 12:20:44 -0700 Subject: [PATCH 0682/1533] Delete Tensor constructor that takes a pointer. Otherwise, say, std::make_unique(GetTensorSomewhereThatActuallyReturnsAPointer()) would construct boolean tensor without a compile time error. PiperOrigin-RevId: 311778946 Change-Id: Ibdb69ff7c4a9697028ed30ac40ffb0797b4493f9 --- tensorflow/core/framework/tensor.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h index 54541be0b4f..744a14e007e 100644 --- a/tensorflow/core/framework/tensor.h +++ b/tensorflow/core/framework/tensor.h @@ -18,6 +18,7 @@ limitations under the License. #include #include + #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -239,6 +240,12 @@ class Tensor { /// are not valid. Tensor(Tensor&& other); + // Explicitly delete constructor that take a pointer (except char*) + // so that the pointer doesn't get implicitly cast to bool. + template ::value, + T>::type* = nullptr> + explicit Tensor(T* t) = delete; + ~Tensor(); /// Returns the data type. From 321d3d9fd09b956e163e859909465690f73806a8 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 15 May 2020 12:37:37 -0700 Subject: [PATCH 0683/1533] Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/9b411757abd8458f9689b1384c6bf75da9b82357 PiperOrigin-RevId: 311782120 Change-Id: I8b68ee1dbc23f1e7861c17d8a7715867860124dc --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 949c6920e33..404d253e8bd 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -237,11 +237,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), - sha256 = "2c7c0aec4271dfca6b8a7707e2112f67c4cb3bdf7c89c0e98d3fcd39707c4468", # SHARED_EIGEN_SHA - strip_prefix = "eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51", + sha256 = "59f7cc665fff375f142d558e7c08c95ac254fa13d077cbecce757a556d30e0d9", # SHARED_EIGEN_SHA + strip_prefix = "eigen-9b411757abd8458f9689b1384c6bf75da9b82357", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/49f1aeb60d9f759859fce0d16aa5d1ecc7168d51/eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51.tar.gz", - "https://gitlab.com/libeigen/eigen/-/archive/49f1aeb60d9f759859fce0d16aa5d1ecc7168d51/eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/9b411757abd8458f9689b1384c6bf75da9b82357/eigen-9b411757abd8458f9689b1384c6bf75da9b82357.tar.gz", + "https://gitlab.com/libeigen/eigen/-/archive/9b411757abd8458f9689b1384c6bf75da9b82357/eigen-9b411757abd8458f9689b1384c6bf75da9b82357.tar.gz", ], ) From 2db0d85d05a3dad9153b9afa6bd2ed5ba7c24102 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 12:40:36 -0700 Subject: [PATCH 0684/1533] Fix case where embedding column -> use_safe_embedding = false is used with variable partitioning. PiperOrigin-RevId: 311782693 Change-Id: I38b59943a25adbe77e9f3f01c49a713876cc3f22 --- .../python/feature_column/feature_column.py | 4 +- .../feature_column/feature_column_test.py | 312 ++++++++++++------ .../feature_column/feature_column_v2.py | 4 +- .../feature_column/feature_column_v2_test.py | 193 +++++++---- 4 files changed, 336 insertions(+), 177 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 87420d0e850..07df4e914c9 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2546,7 +2546,7 @@ class _EmbeddingColumn( embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and sparse_id_rank <= 2): - embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse + embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 # Return embedding lookup result. return embedding_lookup_sparse( embedding_weights, @@ -2696,7 +2696,7 @@ class _SharedEmbeddingColumn( embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and sparse_id_rank <= 2): - embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse + embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 # Return embedding lookup result. 
return embedding_lookup_sparse( embedding_weights, diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 21def9cfa2c..38800fc2162 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import collections import copy +from absl.testing import parameterized import numpy as np from tensorflow.core.example import example_pb2 @@ -852,9 +853,9 @@ class HashedCategoricalColumnTest(test.TestCase): 'aaa': inputs }), weight_collections=('my_weights',)) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - self.assertItemsEqual([], ops.get_collection('my_weights')) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertCountEqual([], ops.get_collection('my_weights')) @test_util.run_deprecated_v1 def test_get_sparse_tensors_dense_input(self): @@ -1714,10 +1715,10 @@ class LinearModelTest(test.TestCase): # We check the mapping by checking that we have the right keys, # and that the values (output_tensors) were indeed the ones used to # form the input layer. - self.assertItemsEqual(all_cols, cols_to_output_tensors.keys()) + self.assertCountEqual(all_cols, cols_to_output_tensors.keys()) input_layer_inputs = [tensor for tensor in input_layer.op.inputs[:-1]] output_tensors = [tensor for tensor in cols_to_output_tensors.values()] - self.assertItemsEqual(input_layer_inputs, output_tensors) + self.assertCountEqual(input_layer_inputs, output_tensors) def test_dense_collection(self): price = fc._numeric_column('price') @@ -2841,7 +2842,7 @@ class FunctionalInputLayerTest(test.TestCase): cols_to_vars = {} all_cols = [price1, dense_feature_bucketized, some_embedding_column] fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(1, len(cols_to_vars[some_embedding_column])) @@ -2891,7 +2892,7 @@ class FunctionalInputLayerTest(test.TestCase): shared_embedding_a, shared_embedding_b ] fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(1, len(cols_to_vars[some_embedding_column])) @@ -2927,7 +2928,7 @@ class FunctionalInputLayerTest(test.TestCase): 'input_from_feature_columns', partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0)): fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(3, len(cols_to_vars[some_embedding_column])) @@ -3043,7 +3044,7 @@ class FunctionalInputLayerTest(test.TestCase): 'input_layer/sparse_feature_embedding/embedding_weights:0', 'input_layer_1/sparse_feature_embedding/embedding_weights:0' ] - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) @@ -3077,7 
+3078,7 @@ class FunctionalInputLayerTest(test.TestCase): # Make sure that only 1 variable gets created in this case. self.assertEqual(1, len( ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) - self.assertItemsEqual( + self.assertCountEqual( ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) @@ -3129,7 +3130,7 @@ class FunctionalInputLayerTest(test.TestCase): # Make sure that only 1 variable gets created in this case. self.assertEqual(1, len( ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) - self.assertItemsEqual( + self.assertCountEqual( ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) @@ -3618,9 +3619,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase): 'aaa': inputs }), weight_collections=('my_weights',)) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - self.assertItemsEqual([], ops.get_collection('my_weights')) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertCountEqual([], ops.get_collection('my_weights')) @test_util.run_deprecated_v1 def test_get_sparse_tensors_dense_input(self): @@ -4058,9 +4059,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase): 'aaa': inputs }), weight_collections=('my_weights',)) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - self.assertItemsEqual([], ops.get_collection('my_weights')) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertCountEqual([], ops.get_collection('my_weights')) @test_util.run_deprecated_v1 def test_get_sparse_tensors_dense_input(self): @@ -4363,9 +4364,9 @@ class IdentityCategoricalColumnTest(test.TestCase): 'aaa': inputs }), weight_collections=('my_weights',)) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - self.assertItemsEqual([], ops.get_collection('my_weights')) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertCountEqual([], ops.get_collection('my_weights')) @test_util.run_deprecated_v1 def test_get_sparse_tensors_dense_input(self): @@ -4820,7 +4821,7 @@ class IndicatorColumnTest(test.TestCase): self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) -class EmbeddingColumnTest(test.TestCase): +class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): @test_util.run_deprecated_v1 def test_defaults(self): @@ -4956,10 +4957,29 @@ class EmbeddingColumnTest(test.TestCase): _assert_sparse_tensor_value(self, self.evaluate(output_a), self.evaluate(output_embedded)) + @parameterized.named_parameters( + { + 'testcase_name': 'use_safe_embedding_lookup', + 'use_safe_embedding_lookup': True, + 'partition_variables': False, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup', + 'use_safe_embedding_lookup': False, + 'partition_variables': False, + }, { + 'testcase_name': 'use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': True, + 'partition_variables': True, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': False, + 'partition_variables': True, + }) @test_util.run_deprecated_v1 - def test_get_dense_tensor(self): + def test_get_dense_tensor(self, use_safe_embedding_lookup, + partition_variables): # Inputs. 
- vocabulary_size = 3 + vocabulary_size = 4 sparse_input = sparse_tensor.SparseTensorValue( # example 0, ids [2] # example 1, ids [0, 1] @@ -4974,12 +4994,20 @@ class EmbeddingColumnTest(test.TestCase): embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 - (7., 11.) # id 2 + (7., 11.), # id 2 + (9., 13.) # id 3 ) - def _initializer(shape, dtype, partition_info): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + + def _initializer(shape, dtype, partition_info=None): + if partition_variables: + self.assertEqual([vocabulary_size, embedding_dimension], + partition_info.full_shape) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertIsNone(partition_info) + self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. @@ -4997,25 +5025,43 @@ class EmbeddingColumnTest(test.TestCase): # Build columns. categorical_column = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc._embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer) + partitioner = None + if partition_variables: + partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0) + with variable_scope.variable_scope('vars', partitioner=partitioner): + embedding_column = fc._embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) - # Provide sparse input and get dense result. - embedding_lookup = embedding_column._get_dense_tensor( - _LazyBuilder({ - 'aaa': sparse_input - })) + # Provide sparse input and get dense result. + embedding_lookup = embedding_column._get_dense_tensor( + _LazyBuilder({'aaa': sparse_input})) # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), - tuple([v.name for v in global_vars])) + if partition_variables: + self.assertCountEqual(('vars/embedding_weights/part_0:0', + 'vars/embedding_weights/part_1:0'), + tuple([v.name for v in global_vars])) + else: + self.assertCountEqual(('vars/embedding_weights:0',), + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup)) + if use_safe_embedding_lookup: + self.assertIn('SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + else: + self.assertNotIn( + 'SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + @test_util.run_deprecated_v1 def test_get_dense_tensor_3d(self): # Inputs. @@ -5072,7 +5118,7 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) @@ -5102,11 +5148,11 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) my_vars = ops.get_collection('my_vars') - self.assertItemsEqual( - ('embedding_weights:0',), tuple([v.name for v in my_vars])) + self.assertCountEqual(('embedding_weights:0',), + tuple([v.name for v in my_vars])) @test_util.run_deprecated_v1 def test_get_dense_tensor_placeholder_inputs(self): @@ -5169,8 +5215,8 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ('embedding_weights:0',), tuple([v.name for v in global_vars])) + self.assertCountEqual(('embedding_weights:0',), + tuple([v.name for v in global_vars])) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) self.assertAllEqual(expected_lookups, embedding_lookup.eval( @@ -5233,8 +5279,8 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ('embedding_weights:0',), tuple([v.name for v in global_vars])) + self.assertCountEqual(('embedding_weights:0',), + tuple([v.name for v in global_vars])) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup)) @@ -5280,14 +5326,14 @@ class EmbeddingColumnTest(test.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -5361,14 +5407,14 @@ class EmbeddingColumnTest(test.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -5450,13 +5496,11 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ('input_layer/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertItemsEqual( - ('input_layer/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) with _initialized_session(): self.assertAllEqual(embedding_values, trainable_vars[0].eval()) self.assertAllEqual(expected_lookups, self.evaluate(input_layer)) @@ -5513,17 +5557,16 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ('input_layer/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) self.assertAllEqual(expected_lookups, self.evaluate(input_layer)) -class SharedEmbeddingColumnTest(test.TestCase): +class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): @test_util.run_deprecated_v1 def test_defaults(self): @@ -5772,33 +5815,59 @@ class SharedEmbeddingColumnTest(test.TestCase): _assert_sparse_tensor_value(self, self.evaluate(output_b), self.evaluate(output_b_embedded)) + @parameterized.named_parameters( + { + 'testcase_name': 'use_safe_embedding_lookup', + 'use_safe_embedding_lookup': True, + 'partition_variables': False, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup', + 'use_safe_embedding_lookup': False, + 'partition_variables': False, + }, { + 'testcase_name': 'use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': True, + 'partition_variables': True, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': False, + 'partition_variables': True, + }) @test_util.run_deprecated_v1 - def test_get_dense_tensor(self): + def test_get_dense_tensor(self, use_safe_embedding_lookup, + partition_variables): # Inputs. - vocabulary_size = 3 + vocabulary_size = 4 # -1 values are ignored. - input_a = np.array( - [[2, -1, -1], # example 0, ids [2] - [0, 1, -1]]) # example 1, ids [0, 1] - input_b = np.array( - [[0, -1, -1], # example 0, ids [0] - [-1, -1, -1]]) # example 1, ids [] - input_features = { - 'aaa': input_a, - 'bbb': input_b - } + input_a = np.array([ + [2, -1, -1], # example 0, ids [2] + [0, 1, -1] + ]) # example 1, ids [0, 1] + input_b = np.array([ + [0, -1, -1], # example 0, ids [0] + [-1, -1, -1] + ]) # example 1, ids [] + input_features = {'aaa': input_a, 'bbb': input_b} # Embedding variable. embedding_dimension = 2 embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 - (7., 11.) # id 2 + (7., 11.), # id 2 + (9., 13.) 
# id 3 ) - def _initializer(shape, dtype, partition_info): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + + def _initializer(shape, dtype, partition_info=None): + if partition_variables: + self.assertEqual([vocabulary_size, embedding_dimension], + partition_info.full_shape) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertIsNone(partition_info) + self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. @@ -5808,38 +5877,65 @@ class SharedEmbeddingColumnTest(test.TestCase): # example 1: (2., 3.5), # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] ) - expected_lookups_b = ( - # example 0: - (1., 2.), # ids [0], embedding = [1, 2] - # example 1: - (0., 0.), # ids [], embedding = [0, 0] - ) + if use_safe_embedding_lookup: + expected_lookups_b = ( + # example 0: + (1., 2.), # ids [0], embedding = [1, 2] + # example 1: + (0., 0.), # ids [], embedding = [0, 0] + ) + else: + expected_lookups_b = ( + # example 0: + (1., 2.), # ids [0], embedding = [1, 2] + ) # Build columns. categorical_column_a = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) categorical_column_b = fc._categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer) - # Provide sparse input and get dense result. - embedding_lookup_a = embedding_column_a._get_dense_tensor( - _LazyBuilder(input_features)) - embedding_lookup_b = embedding_column_b._get_dense_tensor( - _LazyBuilder(input_features)) + partitioner = None + if partition_variables: + partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0) + with variable_scope.variable_scope('vars', partitioner=partitioner): + embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) + # Provide sparse input and get dense result. + embedding_lookup_a = embedding_column_a._get_dense_tensor( + _LazyBuilder(input_features)) + embedding_lookup_b = embedding_column_b._get_dense_tensor( + _LazyBuilder(input_features)) # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), - tuple([v.name for v in global_vars])) + if partition_variables: + self.assertCountEqual(('vars/embedding_weights/part_0:0', + 'vars/embedding_weights/part_1:0'), + tuple([v.name for v in global_vars])) + else: + self.assertCountEqual(('vars/embedding_weights:0',), + tuple([v.name for v in global_vars])) embedding_var = global_vars[0] - with _initialized_session(): - self.assertAllEqual(embedding_values, self.evaluate(embedding_var)) - self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a)) - self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b)) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(embedding_var)) + self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a)) + self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b)) + + if use_safe_embedding_lookup: + self.assertIn('SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + else: + self.assertNotIn( + 'SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) @test_util.run_deprecated_v1 def test_get_dense_tensor_weight_collections(self): @@ -5886,11 +5982,11 @@ class SharedEmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( + self.assertCountEqual( ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',), tuple(v.name for v in global_vars)) my_vars = ops.get_collection('my_vars') - self.assertItemsEqual( + self.assertCountEqual( ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',), tuple(v.name for v in my_vars)) @@ -5997,14 +6093,14 @@ class SharedEmbeddingColumnTest(test.TestCase): 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0', 'linear_model/aaa_bbb_shared_embedding_1/weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0'] @@ -6091,14 +6187,14 @@ class SharedEmbeddingColumnTest(test.TestCase): 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0', 'linear_model/aaa_bbb_shared_embedding_1/weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0'] @@ -6195,16 +6291,16 @@ class SharedEmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( + self.assertCountEqual( ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], tuple([v.name for v in global_vars])) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) if trainable: - self.assertItemsEqual( + self.assertCountEqual( ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], tuple([v.name for v in trainable_vars])) else: - self.assertItemsEqual([], tuple([v.name for v in trainable_vars])) + self.assertCountEqual([], tuple([v.name for v in trainable_vars])) shared_embedding_vars = global_vars with _initialized_session(): self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval()) diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index 23a9861eb1b..b572987d52d 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -3263,7 +3263,7 @@ class EmbeddingColumn( embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and sparse_id_rank <= 2): - embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse + embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 # Return embedding lookup result. return embedding_lookup_sparse( embedding_weights, @@ -3558,7 +3558,7 @@ class SharedEmbeddingColumn( embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and sparse_id_rank <= 2): - embedding_lookup_sparse = (embedding_ops.embedding_lookup_sparse) + embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 # Return embedding lookup result. return embedding_lookup_sparse( embedding_weights, diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index fe769850fb0..cba87a51c23 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -2087,7 +2087,7 @@ class LinearModelTest(test.TestCase): for var in model.variables: self.assertIsInstance(var, variables_lib.VariableV1) variable_names = [var.name for var in model.variables] - self.assertItemsEqual([ + self.assertCountEqual([ 'linear_model/dense_feature_bucketized/weights:0', 'linear_model/price1/weights:0', 'linear_model/sparse_feature_embedding/embedding_weights:0', @@ -2731,10 +2731,10 @@ class OldLinearModelTest(test.TestCase): # We check the mapping by checking that we have the right keys, # and that the values (output_tensors) were indeed the ones used to # form the input layer. 
- self.assertItemsEqual(all_cols, cols_to_output_tensors.keys()) + self.assertCountEqual(all_cols, cols_to_output_tensors.keys()) input_layer_inputs = [tensor for tensor in input_layer.op.inputs[:-1]] output_tensors = [tensor for tensor in cols_to_output_tensors.values()] - self.assertItemsEqual(input_layer_inputs, output_tensors) + self.assertCountEqual(input_layer_inputs, output_tensors) def test_dense_collection(self): price = fc.numeric_column('price') @@ -3411,7 +3411,7 @@ class FunctionalInputLayerTest(test.TestCase): cols_to_vars = {} all_cols = [price1, dense_feature_bucketized, some_embedding_column] fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(1, len(cols_to_vars[some_embedding_column])) @@ -3461,7 +3461,7 @@ class FunctionalInputLayerTest(test.TestCase): shared_embedding_a, shared_embedding_b ] fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(1, len(cols_to_vars[some_embedding_column])) @@ -3497,7 +3497,7 @@ class FunctionalInputLayerTest(test.TestCase): 'input_from_feature_columns', partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0)): fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(3, len(cols_to_vars[some_embedding_column])) @@ -3616,7 +3616,7 @@ class FunctionalInputLayerTest(test.TestCase): 'input_layer/sparse_feature_embedding/embedding_weights:0', 'input_layer_1/sparse_feature_embedding/embedding_weights:0' ] - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) @@ -5904,7 +5904,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -5968,7 +5968,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6036,7 +6036,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6109,7 +6109,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6180,7 +6180,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6230,14 +6230,14 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -6274,15 +6274,25 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): @parameterized.named_parameters( { 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True + 'use_safe_embedding_lookup': True, + 'partition_variables': False, }, { 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False + 'use_safe_embedding_lookup': False, + 'partition_variables': False, + }, { + 'testcase_name': 'use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': True, + 'partition_variables': True, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': False, + 'partition_variables': True, }) @test_util.run_deprecated_v1 - def test_dense_features(self, use_safe_embedding_lookup): + def test_dense_features(self, use_safe_embedding_lookup, partition_variables): # Inputs. - vocabulary_size = 3 + vocabulary_size = 4 sparse_input = sparse_tensor.SparseTensorValue( # example 0, ids [2] # example 1, ids [0, 1] @@ -6297,13 +6307,20 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 - (7., 11.) # id 2 + (7., 11.), # id 2 + (9., 13.) # id 3 ) def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + if partition_variables: + self.assertEqual([vocabulary_size, embedding_dimension], + partition_info.full_shape) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertIsNone(partition_info) + self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. 
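For context on the shard shapes asserted by _initializer above, a minimal standalone sketch — using the TF1 compat API and the same sizes as this test (vocabulary_size=4, embedding_dimension=2, two shards) — of how fixed_size_partitioner splits the embedding table; the scope and variable names here are illustrative only and do not match the test's generated names:

    import tensorflow.compat.v1 as tf

    tf.disable_eager_execution()
    partitioner = tf.fixed_size_partitioner(2, axis=0)
    with tf.variable_scope('vars', partitioner=partitioner):
      # Created as two (2, 2) shards, 'vars/embedding_weights/part_0:0' and
      # 'vars/embedding_weights/part_1:0'; a custom initializer is invoked once
      # per shard with partition_info.full_shape == [4, 2], matching the
      # assertions made in _initializer above.
      table = tf.get_variable('embedding_weights', shape=[4, 2],
                              dtype=tf.float32)
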
@@ -6321,25 +6338,43 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Build columns. categorical_column = fc.categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) + partitioner = None + if partition_variables: + partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0) + with variable_scope.variable_scope('vars', partitioner=partitioner): + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) - # Provide sparse input and get dense result. - l = df.DenseFeatures((embedding_column,)) - dense_features = l({'aaa': sparse_input}) + # Provide sparse input and get dense result. + l = df.DenseFeatures((embedding_column,)) + dense_features = l({'aaa': sparse_input}) # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) + if partition_variables: + self.assertCountEqual( + ('vars/dense_features/aaa_embedding/embedding_weights/part_0:0', + 'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'), + tuple([v.name for v in global_vars])) + else: + self.assertCountEqual( + ('vars/dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) for v in global_vars: self.assertIsInstance(v, variables_lib.Variable) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) + if partition_variables: + self.assertCountEqual( + ('vars/dense_features/aaa_embedding/embedding_weights/part_0:0', + 'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'), + tuple([v.name for v in trainable_vars])) + else: + self.assertCountEqual( + ('vars/dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) @@ -6410,9 +6445,9 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), + self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), tuple([v.name for v in global_vars])) - self.assertItemsEqual([], + self.assertCountEqual([], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) self.evaluate(variables_lib.global_variables_initializer()) @@ -6475,10 +6510,10 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('input_layer/aaa_embedding/embedding_weights:0',), + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), tuple([v.name for v in global_vars])) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertItemsEqual(('input_layer/aaa_embedding/embedding_weights:0',), + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), tuple([v.name for v in trainable_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6528,14 +6563,14 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -6610,14 +6645,14 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -6972,15 +7007,26 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): @parameterized.named_parameters( { 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True + 'use_safe_embedding_lookup': True, + 'partition_variables': False, }, { 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False + 'use_safe_embedding_lookup': False, + 'partition_variables': False, + }, { + 'testcase_name': 'use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': True, + 'partition_variables': True, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': False, + 'partition_variables': True, }) @test_util.run_deprecated_v1 - def test_get_dense_tensor(self, use_safe_embedding_lookup): + def test_get_dense_tensor(self, use_safe_embedding_lookup, + partition_variables): # Inputs. - vocabulary_size = 3 + vocabulary_size = 4 # -1 values are ignored. input_a = np.array([ [2, -1, -1], # example 0, ids [2] @@ -6997,13 +7043,20 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 - (7., 11.) # id 2 + (7., 11.), # id 2 + (9., 13.) 
# id 3 ) def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + if partition_variables: + self.assertEqual([vocabulary_size, embedding_dimension], + partition_info.full_shape) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertIsNone(partition_info) + self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. @@ -7031,22 +7084,32 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): key='aaa', num_buckets=vocabulary_size) categorical_column_b = fc.categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) - # Provide sparse input and get dense result. - embedding_lookup_a = embedding_column_a.get_dense_tensor( - fc.FeatureTransformationCache(input_features), None) - embedding_lookup_b = embedding_column_b.get_dense_tensor( - fc.FeatureTransformationCache(input_features), None) + partitioner = None + if partition_variables: + partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0) + + with variable_scope.variable_scope('vars', partitioner=partitioner): + embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) + # Provide sparse input and get dense result. + embedding_lookup_a = embedding_column_a.get_dense_tensor( + fc.FeatureTransformationCache(input_features), None) + embedding_lookup_b = embedding_column_b.get_dense_tensor( + fc.FeatureTransformationCache(input_features), None) # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('aaa_bbb_shared_embedding:0',), - tuple([v.name for v in global_vars])) + if partition_variables: + self.assertCountEqual(('vars/aaa_bbb_shared_embedding/part_0:0', + 'vars/aaa_bbb_shared_embedding/part_1:0'), + tuple([v.name for v in global_vars])) + else: + self.assertCountEqual(('vars/aaa_bbb_shared_embedding:0',), + tuple([v.name for v in global_vars])) embedding_var = global_vars[0] self.evaluate(variables_lib.global_variables_initializer()) @@ -7279,14 +7342,14 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): 'aaa_bbb_shared_embedding:0', 'linear_model/bbb_shared_embedding/weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars['aaa_bbb_shared_embedding:0'] linear_weights_a = trainable_vars[ @@ -7420,18 +7483,18 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( + self.assertCountEqual( ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], tuple([v.name for v in global_vars])) for v in global_vars: self.assertIsInstance(v, variables_lib.Variable) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) if trainable: - self.assertItemsEqual( + self.assertCountEqual( ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], tuple([v.name for v in trainable_vars])) else: - self.assertItemsEqual([], tuple([v.name for v in trainable_vars])) + self.assertCountEqual([], tuple([v.name for v in trainable_vars])) shared_embedding_vars = global_vars self.evaluate(variables_lib.global_variables_initializer()) From 08968c30dcf6907cb2a9b0d8f56d0358cac39edf Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Fri, 15 May 2020 12:41:50 -0700 Subject: [PATCH 0685/1533] Clarify why we have *-gpu package. Fix https://github.com/tensorflow/tensorflow/issues/39581 PiperOrigin-RevId: 311782903 Change-Id: If002f2d2b112012e1c75e0c16f7a922546a9bba5 --- tensorflow/tools/pip_package/setup.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 4b8289a6202..8a5450d78b6 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -43,8 +43,6 @@ from setuptools import setup from setuptools.command.install import install as InstallCommandBase from setuptools.dist import Distribution -DOCLINES = __doc__.split('\n') - # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. @@ -93,6 +91,16 @@ if 'tf_nightly' in project_name: elif 'tensorflow_estimator' in pkg: REQUIRED_PACKAGES[i] = 'tf-estimator-nightly' +DOCLINES = __doc__.split('\n') +if project_name.endswith('-gpu'): + project_name_no_gpu = project_name[:-len('-gpu')] + _GPU_PACKAGE_NOTE = 'Note that %s package by default supports both CPU and '\ + 'GPU. %s has the same content and exists solely for backward '\ + 'compatiblity. 
Please migrate to %s for GPU support.'\ + % (project_name_no_gpu, project_name, project_name_no_gpu) + DOCLINES.append(_GPU_PACKAGE_NOTE) + + # pylint: disable=line-too-long CONSOLE_SCRIPTS = [ 'toco_from_protos = tensorflow.lite.toco.python.toco_from_protos:main', From 8dd28457699100145cad17aa4d44da81fddefda9 Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Fri, 15 May 2020 19:34:30 +0000 Subject: [PATCH 0686/1533] Reviewer requests --- tensorflow/stream_executor/rocm/rocm_gpu_executor.cc | 3 ++- third_party/gpus/cuda_configure.bzl | 10 +++++++--- third_party/gpus/rocm_configure.bzl | 3 +-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc index 216602a7597..fd3b5f19913 100644 --- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc +++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc @@ -133,8 +133,9 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) { GpuDriver::UnloadModule(context_, module); gpu_binary_to_module_.erase(module_it); const char* mem_it = nullptr; - for (auto x : in_memory_modules_) + for (auto x : in_memory_modules_) { if (x.second == module) mem_it = x.first; + } if (mem_it != nullptr) in_memory_modules_.erase(mem_it); } return true; diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index ce924fe4cd2..7e779a993e2 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -809,20 +809,24 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs): )""" % (name, "\n".join(outs), " && \\\n".join(cmds)) def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir, exceptions=None): - """Returns a rule to recursively copy a directory.""" + """Returns a rule to recursively copy a directory. + If exceptions is not None, it must be a list of files or directories in + 'src_dir'; these will be excluded from copying. + """ src_dir = _norm_path(src_dir) out_dir = _norm_path(out_dir) outs = read_dir(repository_ctx, src_dir) post_cmd='' if exceptions!=None: - outs = [x for x in outs if not any([x.startswith(y) for y in exceptions])] + outs = [x for x in outs if not any([x.startswith(src_dir+"/"+y) + for y in exceptions])] outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] # '@D' already contains the relative path for a single file, see # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" if exceptions!=None: for x in exceptions: - post_cmd+=" ; rm -fR " + x.replace(src_dir, out_dir) + post_cmd+=" ; rm -fR " + out_dir + "/" + x return """genrule( name = "%s", outs = [ diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 3f518fb05f1..4cfec2459e4 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -615,8 +615,7 @@ def _create_local_rocm_repository(repository_ctx): name = "rocm-include", src_dir = rocm_toolkit_path + "/include", out_dir = "rocm/include", - exceptions = [rocm_toolkit_path + "/include/gtest", - rocm_toolkit_path + "/include/gmock"], + exceptions = ["gtest", "gmock"], ), make_copy_dir_rule( repository_ctx, From 2bbf57217f277f20be3d4eabc0fb839011251ab5 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Fri, 15 May 2020 13:08:27 -0700 Subject: [PATCH 0687/1533] Change XLA's default to disable cpu_fast_math options with the exception of min_max behavior. 
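For users who relied on the previous defaults, the same flag set that tfcompile.bzl now passes explicitly can restore the old all-fast-math behavior at runtime. A minimal sketch, assuming the XLA_FLAGS environment variable is set before TensorFlow/XLA initializes:

    import os

    os.environ["XLA_FLAGS"] = (
        "--xla_cpu_enable_fast_math=true "
        "--xla_cpu_fast_math_honor_nans=false "
        "--xla_cpu_fast_math_honor_infs=false "
        "--xla_cpu_fast_math_honor_functions=false "
        "--xla_cpu_fast_math_honor_division=false "
        "--xla_cpu_enable_fast_min_max=true")
    import tensorflow as tf  # import only after XLA_FLAGS is set
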
This is due to issues around inf/nan behavior on the cpu. tf_library still enables all fast math options though with the observation that currently most users of this are desiring performance and have tested their code already. PiperOrigin-RevId: 311787817 Change-Id: Iab012d49435845dc5b7a5fcedca89bf159ec65a3 --- tensorflow/compiler/aot/tfcompile.bzl | 14 +++++++++++++- tensorflow/compiler/aot/tfcompile_main.cc | 2 ++ tensorflow/compiler/xla/debug_options_flags.cc | 17 +++++++++++++++-- tensorflow/compiler/xla/python/xla.cc | 6 +----- .../xla/service/cpu/tests/cpu_intrinsic_test.cc | 7 +++++++ .../service/cpu/tests/cpu_vectorization_test.cc | 7 +++++++ tensorflow/compiler/xla/service/llvm_ir/BUILD | 1 + .../xla/service/llvm_ir/alias_analysis_test.cc | 2 +- .../compiler/xla/service/llvm_ir/llvm_util.cc | 9 +++++++-- .../xla/tests/client_library_test_base.h | 1 + tensorflow/compiler/xla/tests/hlo_test_base.cc | 10 ++++++++++ tensorflow/compiler/xla/tests/hlo_test_base.h | 4 ++++ .../xla/tests/vector_ops_simple_test.cc | 3 +-- tensorflow/compiler/xla/xla.proto | 13 ++++++++++++- .../python/kernel_tests/betainc_op_test.py | 7 ++++--- tensorflow/python/ops/gradient_checker_test.py | 2 +- 16 files changed, 87 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index abccefbcdbb..f0c3e7da0ba 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -42,7 +42,8 @@ def tf_library( mlir_components = "None", deps = None, tags = []): - """Runs tfcompile to compile a TensorFlow graph into executable code. + """Runs tfcompile to compile a TensorFlow graph into executable code with fast + math enabled on cpu. Given an invocation of tf_library(name="foo", ...), generates the following build targets: @@ -207,6 +208,15 @@ def tf_library( srcs.append(debug_info) debug_info_flag = " --debug_info=$(location " + debug_info + ")" + default_fast_math_xla_flags = "XLA_FLAGS=\"\ + --xla_cpu_enable_fast_math=true \ + --xla_cpu_fast_math_honor_nans=false \ + --xla_cpu_fast_math_honor_infs=false \ + --xla_cpu_fast_math_honor_functions=false \ + --xla_cpu_fast_math_honor_division=false \ + --xla_cpu_enable_fast_min_max=true \ + $${XLA_FLAGS:-}\" " + native.genrule( name = ("gen_" + name), srcs = srcs, @@ -216,6 +226,7 @@ def tf_library( function_object_file, ], cmd = ( + default_fast_math_xla_flags + "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + @@ -256,6 +267,7 @@ def tf_library( session_module_pb, ], cmd = ( + default_fast_math_xla_flags + "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index f0cf8f2ded9..846947454bb 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -67,6 +67,8 @@ int main(int argc, char** argv) { flags.entry_point = "entry"; flags.debug_info_path_begin_marker = ""; + // Note that tfcompile.bzl's tf_library macro sets fast math flags as that is + // generally the preferred case. 
std::vector flag_list; AppendMainFlags(&flag_list, &flags); xla::AppendDebugOptionsFlags(&flag_list); diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 60a563ee956..4152982bf4c 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -55,9 +55,16 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { // b/77879207. opts.set_xla_gpu_disable_multi_streaming(true); - // TODO(jlebar): Disable fastmath once doing so is not a performance - // regression. + // Disable forms of fast math that have caused users problems in the past. opts.set_xla_cpu_enable_fast_math(true); + opts.set_xla_cpu_fast_math_honor_nans(true); + opts.set_xla_cpu_fast_math_honor_infs(true); + opts.set_xla_cpu_fast_math_honor_functions(true); + opts.set_xla_cpu_fast_math_honor_division(true); + + // By default, copy TF's Eigen style min_max behavior with nans. + opts.set_xla_cpu_enable_fast_min_max(false); + opts.set_xla_gpu_enable_fast_min_max(true); opts.set_xla_allow_excess_precision(true); @@ -261,6 +268,12 @@ static void AllocateFlags() { "When xla_cpu_enable_fast_math is true then this controls whether we " "forbid to approximate calculations for functions. Ignored when " "xla_cpu_enable_fast_math is false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_enable_fast_min_max", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_min_max), + flag_values->xla_cpu_enable_fast_min_max(), + "Enable fast floating point min/max lowering that always propagates " + "NaNs.")); flag_objects->push_back(tensorflow::Flag( "xla_gpu_enable_fast_min_max", bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max), diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 65fb5311994..f10ec978399 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -872,11 +872,7 @@ PYBIND11_MODULE(xla_extension, m) { DebugOptions* debug_options = options.executable_build_options.mutable_debug_options(); // Sets fast-math-disabling default options expected by JAX. - // TODO(phawkins): make these XLA-wide defaults. - debug_options->set_xla_cpu_fast_math_honor_infs(true); - debug_options->set_xla_cpu_fast_math_honor_nans(true); - debug_options->set_xla_cpu_fast_math_honor_division(true); - debug_options->set_xla_cpu_fast_math_honor_functions(true); + debug_options->set_xla_cpu_enable_fast_min_max(false); debug_options->set_xla_gpu_enable_fast_min_max(false); return options; })) diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc index b6d6de28bc5..efeab3bd31a 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -70,6 +70,13 @@ class CpuUnaryIntrinsicTest return absl::StrCat(opcode, "_On_", triple, (features.empty() ? 
"" : "_With"), features); } + + private: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest(); + HloTestBase::SetAotFastMathDebugOptions(&debug_options); + return debug_options; + } }; // Creates a module with a call to the unary op, and tests if the diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc index 8a72eb15487..757d878e224 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc @@ -69,6 +69,13 @@ class CpuVectorizationTest return absl::StrCat(opcode, "_On_", triple, (features.empty() ? "" : "_With"), features); } + + private: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest(); + HloTestBase::SetAotFastMathDebugOptions(&debug_options); + return debug_options; + } }; TEST_P(CpuVectorizationTest, DoIt) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 39399df7ad8..cabcc8e06ee 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -64,6 +64,7 @@ cc_library( srcs = ["llvm_util.cc"], hdrs = ["llvm_util.h"], deps = [ + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc index 453a5cd84b2..f7808773592 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc @@ -58,7 +58,7 @@ ENTRY while3 { CompileAndVerifyIr(hlo_string, R"( ; CHECK-LABEL: @body(i8* %retval -; CHECK: %[[add_result:.*]] = fadd fast float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] +; CHECK: %[[add_result:.*]] = fadd reassoc nsz contract float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] ; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], align 4, !alias.scope ![[alias_scope_md_for_store:[0-9]+]] ; ; CHECK-LABEL: @condition(i8* %retval, i8* noalias %run_options, i8** noalias %params diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index 4c9a8d3e004..c2b11819448 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -30,6 +30,7 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" @@ -90,7 +91,9 @@ llvm::CallInst* EmitCallToIntrinsic( llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::IRBuilder<>* b) { - if (b->getFastMathFlags().noNaNs()) { + // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. 
+ if (b->getFastMathFlags().noNaNs() || + GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { @@ -103,7 +106,9 @@ llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::IRBuilder<>* b) { - if (b->getFastMathFlags().noNaNs()) { + // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. + if (b->getFastMathFlags().noNaNs() || + GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { auto cmp = b->CreateFCmpULE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 5b83186ffa4..790497f888e 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -76,6 +76,7 @@ class ClientLibraryTestBase : public ::testing::Test { void SetFastMathDisabled(bool disabled) { auto* opts = execution_options_.mutable_debug_options(); opts->set_xla_cpu_enable_fast_math(!disabled); + opts->set_xla_cpu_enable_fast_min_max(!disabled); opts->set_xla_gpu_enable_fast_min_max(!disabled); } diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 8eed609a134..7b64be5597b 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -165,6 +165,16 @@ PrecisionConfig HloTestBase::DefaultPrecisionConfig(int operands) { return precision_config; } +void HloTestBase::SetAotFastMathDebugOptions(DebugOptions* options) { + options->set_xla_cpu_enable_fast_math(true); + options->set_xla_gpu_enable_fast_min_max(true); + options->set_xla_cpu_enable_fast_min_max(true); + options->set_xla_cpu_fast_math_honor_nans(false); + options->set_xla_cpu_fast_math_honor_infs(false); + options->set_xla_cpu_fast_math_honor_functions(false); + options->set_xla_cpu_fast_math_honor_division(false); +} + DebugOptions HloTestBase::GetDebugOptionsForTest() { auto debug_options = GetDebugOptionsFromFlags(); // TODO(b/38354253): Change tests to use Parameters instead of Constants. diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index d05776a0cb9..85b1876dd3c 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -100,6 +100,10 @@ class HloTestBase : public ::testing::Test { static PrecisionConfig DefaultPrecisionConfig(int operands); + // Sets most fath math options to be enabled to model the fast math flags + // generally used for CPU:AOT compilation. + static void SetAotFastMathDebugOptions(DebugOptions* options); + protected: // This uses the interpreter backend as the reference backend and // automatically finds another supported backend as the test backend. 
If the diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index 3407a68f709..40e226f9902 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -310,8 +310,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { XLA_TEST_F(VecOpsSimpleTest, ClampFloatEdgeCases) { XlaBuilder builder(TestName()); - mutable_debug_options()->set_xla_cpu_enable_fast_math(false); - mutable_debug_options()->set_xla_gpu_enable_fast_min_max(false); + SetFastMathDisabled(true); auto low = ConstantR1(&builder, {NAN, 1, 1}); auto high = ConstantR1(&builder, {3, NAN, 3}); auto x = ConstantR1(&builder, {2, 2, NAN}); diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index f4b08f454b9..9374b1fca6a 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -148,9 +148,20 @@ message DebugOptions { // xla_cpu_enable_fast_math is false. bool xla_cpu_fast_math_honor_functions = 129; + // When false we lower the Minimum and Maximum hlos in the CPU backend such + // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NaN. In other words, if flag + // this is false we always propagate NaNs through Min and Max. + // + // Note, this does not correspond to the exact same behavior as the gpu flag + // below! + bool xla_cpu_enable_fast_min_max = 140; + // When true we lower the Minimum and Maximum hlos in the GPU backend such // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN. In other words, if flag // this is true we don't propagate NaNs through Min and Max. + // + // Note, this does not correspond to the exact same behavior as the cpu flag + // above! bool xla_gpu_enable_fast_min_max = 100; // Allows xla to increase the output precision of floating point operations. @@ -280,7 +291,7 @@ message DebugOptions { // memory, or have bugs. bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_error = 139; - // Next id: 140 + // Next id: 141 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py index c4f70b5bc29..c564c822918 100644 --- a/tensorflow/python/kernel_tests/betainc_op_test.py +++ b/tensorflow/python/kernel_tests/betainc_op_test.py @@ -55,8 +55,8 @@ class BetaincTest(test.TestCase): # the scipy version of betainc uses a double-only implementation. 
# TODO(ebrevdo): identify reasons for (sometime) precision loss # with doubles - rtol = 1e-4 if dtype == dtypes.float32 else 5e-5 - atol = 9e-6 if dtype == dtypes.float32 else 3e-6 + rtol = 1e-4 + atol = 1e-5 self.assertAllCloseAccordingToType( scipy_out, tf_out, rtol=rtol, atol=atol) @@ -66,7 +66,8 @@ class BetaincTest(test.TestCase): with self.cached_session(): tf_comb = math_ops.betainc(a_comb, b_comb, x_comb).eval() scipy_comb = special.betainc(a_comb, b_comb, x_comb, dtype=np_dt) - self.assertAllCloseAccordingToType(scipy_comb, tf_comb) + self.assertAllCloseAccordingToType( + scipy_comb, tf_comb, rtol=rtol, atol=atol) # Test broadcasting between scalars and other shapes with self.cached_session(): diff --git a/tensorflow/python/ops/gradient_checker_test.py b/tensorflow/python/ops/gradient_checker_test.py index 92ca9c2971e..c8ebf12569a 100644 --- a/tensorflow/python/ops/gradient_checker_test.py +++ b/tensorflow/python/ops/gradient_checker_test.py @@ -149,7 +149,7 @@ class GradientCheckerTest(test.TestCase): self.assertAllEqual(correct, analytical) self.assertAllClose(correct, numerical, rtol=1e-4) self.assertLess( - gradient_checker.compute_gradient_error(x, size, y, size), 2e-4) + gradient_checker.compute_gradient_error(x, size, y, size), 3e-4) @test_util.run_deprecated_v1 def testComplexConj(self): From b3bf8bd856b7698bb84cdae07570cf0494ac9374 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 13:15:33 -0700 Subject: [PATCH 0688/1533] Extend Keras Lambda layers to work with functions of any signature rather than only functions that take one argument. Any *args and **kwargs passed when calling the lambda layer will be forwarded directly to the underlying lambda. PiperOrigin-RevId: 311789009 Change-Id: Ic072d2252038330cc944d7f565f14806753d7436 --- tensorflow/python/keras/layers/core.py | 43 +++++++++---------- tensorflow/python/keras/layers/core_test.py | 20 --------- .../v1/tensorflow.keras.layers.-lambda.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-lambda.pbtxt | 2 +- 4 files changed, 22 insertions(+), 45 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index d1528c7ba59..db9c47eca17 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -53,7 +53,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging from tensorflow.python.training.tracking import base as trackable from tensorflow.python.util import nest -from tensorflow.python.util import tf_decorator +from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import keras_export @@ -738,8 +738,7 @@ class Lambda(Layer): models. `Lambda` layers are best suited for simple operations or quick experimentation. For more advanced use cases, follow [this guide](https://www.tensorflow.org/guide/keras/custom_layers_and_models) - for subclassing `tf.keras.layers.Layer`. (Do not subclass - `tf.keras.layers.Lamba`.) + for subclassing `tf.keras.layers.Layer`. The main reason to subclass `tf.keras.layers.Layer` instead of using a `Lambda` layer is saving and inspecting a Model. `Lambda` layers @@ -799,7 +798,8 @@ class Lambda(Layer): computation, but anything more complex should use a subclass Layer instead. Arguments: - function: The function to evaluate when the layer is called. + function: The function to be evaluated. Takes input tensor as first + argument. output_shape: Expected output shape from function. 
This argument can be inferred if not explicitly provided. Can be a tuple or function. If a tuple, it only specifies the first dimension onward; @@ -812,8 +812,8 @@ class Lambda(Layer): mask: Either None (indicating no masking) or a callable with the same signature as the `compute_mask` layer method, or a tensor that will be returned as output mask regardless of what the input is. - arguments: Optional dictionary of keyword arguments to pass by default to - the function when those arguments are not passed to the layer call. + arguments: Optional dictionary of keyword arguments to be passed to the + function. Input shape: Arbitrary. Use the keyword argument input_shape (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. @@ -823,16 +823,11 @@ class Lambda(Layer): @trackable.no_automatic_dependency_tracking def __init__(self, function, output_shape=None, mask=None, arguments=None, **kwargs): + super(Lambda, self).__init__(**kwargs) + self.arguments = arguments or {} self.function = function - # Decorate the function to produce this layer's call method - def _call_wrapper(*args, **kwargs): - return self._call_wrapper(*args, **kwargs) - self.call = tf_decorator.make_decorator(function, _call_wrapper) - - super(Lambda, self).__init__(**kwargs) - if mask is not None: self.supports_masking = True self.mask = mask @@ -841,8 +836,9 @@ class Lambda(Layer): # Warning on every invocation will be quite irksome in Eager mode. self._already_warned = False - self._expects_training_arg = 'training' in self._call_fn_args - self._expects_mask_arg = 'mask' in self._call_fn_args + function_args = tf_inspect.getfullargspec(function).args + self._fn_expects_training_arg = 'training' in function_args + self._fn_expects_mask_arg = 'mask' in function_args @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): @@ -873,22 +869,23 @@ class Lambda(Layer): output_shapes = tf_utils.convert_shapes(self._output_shape, to_tuples=False) return nest.map_structure(_add_batch, output_shapes) - def _call_wrapper(self, *args, **kwargs): + def call(self, inputs, mask=None, training=None): # We must copy for thread safety, but it only needs to be a shallow copy. 
- call_kwargs = {k: v for k, v in self.arguments.items()} - - # override default kwargs with the args passed to the layer call - call_kwargs.update(kwargs) + kwargs = {k: v for k, v in self.arguments.items()} + if self._fn_expects_mask_arg: + kwargs['mask'] = mask + if self._fn_expects_training_arg: + kwargs['training'] = training created_variables = [] - def _variable_creator(next_creator, **creator_kwargs): - var = next_creator(**creator_kwargs) + def _variable_creator(next_creator, **kwargs): + var = next_creator(**kwargs) created_variables.append(var) return var with backprop.GradientTape(watch_accessed_variables=True) as tape,\ variable_scope.variable_creator_scope(_variable_creator): - result = self.function(*args, **call_kwargs) + result = self.function(inputs, **kwargs) self._check_variables(created_variables, tape.watched_variables()) return result diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py index aa1192e12fc..3daa187f1ce 100644 --- a/tensorflow/python/keras/layers/core_test.py +++ b/tensorflow/python/keras/layers/core_test.py @@ -139,26 +139,6 @@ class LambdaLayerTest(keras_parameterized.TestCase): out = ld([x1, x2]) self.assertAllEqual(out.shape, [3, 2]) - def test_lambda_multiple_args(self): - ld = keras.layers.Lambda(lambda x, y: x[0] + y) - x1 = np.ones([3, 2], np.float32) - x2 = np.ones([3, 5], np.float32) - - expected_result = x1 * 2 - self.assertAllEqual(ld([x1, x2], x1), expected_result) - self.assertAllEqual(ld([x1, x2], y=x1), expected_result) - self.assertAllEqual(ld(x=[x1, x2], y=x1), expected_result) - - def test_lambda_constructor_args_and_multiple_args(self): - x1 = np.ones([3, 2], np.float32) - x2 = np.ones([3, 5], np.float32) - ld = keras.layers.Lambda(lambda x, y: x[0] + y, arguments={'y': x1*2}) - - self.assertAllEqual(ld([x1, x2]), x1 * 3) - self.assertAllEqual(ld([x1, x2], y=x1), x1 * 2) - self.assertAllEqual(ld(x=[x1, x2]), x1 * 3) - self.assertAllEqual(ld(x=[x1, x2], y=x1), x1 * 2) - def test_lambda_output_shape(self): l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1)) l(keras.backend.variable(np.ones((1, 1)))) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt index d4dbe96d1ba..22fa730112f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt @@ -145,7 +145,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt index d4dbe96d1ba..22fa730112f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt @@ -145,7 +145,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compute_mask" From dc1c299833317401e14a1651ed906164464425c8 Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: 
Fri, 15 May 2020 13:32:17 -0700 Subject: [PATCH 0689/1533] Add Unsupported dtype in tfrt for backward compatibility. We will use this dtype to support legacy types (e.g. DT_RESOURCE, DT_VARIANT) that are not natively implemented in TFRT. PiperOrigin-RevId: 311791879 Change-Id: Ied0bfadf68f07e68fe8eb941c0d02bcb9f1a0b40 --- tensorflow/c/eager/context_interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/context_interface.h index 76f182f4945..2861fa43b66 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/context_interface.h @@ -101,7 +101,7 @@ class AbstractContextInterface { // Destroy the step resource container for a training step. virtual void EndStep() = 0; - // Block until all pending nodes are finished, + // Block until all pending nodes are finished. virtual Status AsyncWait() = 0; protected: From 31583920dcdeb54ffdb34acb1ce6b1db546ad33c Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Fri, 15 May 2020 13:34:26 -0700 Subject: [PATCH 0690/1533] Create _HostComputeMlir for use in TF MLIR. PiperOrigin-RevId: 311792286 Change-Id: I6ec57f9b23c17dd52e756ead4ddfad58ecdb2f76 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index aa1601c4032..82282bb925a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -10586,6 +10586,27 @@ def TF_ZerosLikeOp : TF_Op<"ZerosLike", [NoSideEffect, SameOperandsAndResultType TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF__HostComputeMlirOp : TF_Op<"_HostComputeMlir", []> { + let summary = "A host-side computation called from a TPU device."; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + + StrAttr:$key, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + def TF__RecvTPUEmbeddingActivationsOp : TF_Op<"_RecvTPUEmbeddingActivations", []> { let summary = "An op that receives embeddng activations on the TPU."; From 27ac446be5b10ee68900696a2c5184fce727e86d Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Fri, 15 May 2020 13:34:58 -0700 Subject: [PATCH 0691/1533] Enable MLIR saved model import by default in TFLiteConverterV2's saved model API PiperOrigin-RevId: 311792366 Change-Id: I98356499c0a1eb7c740104ca4b11af5d45c4a4a1 --- tensorflow/lite/python/lite.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 99be58f4376..ce59c56a1d0 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -386,13 +386,8 @@ class TFLiteConverterBase(object): return True return False - def _parse_saved_model_args(self, always_enable_saved_model_import=False): - """Parses SavedModel arguments from the given Keras/RNN SavedModel. - - Args: - always_enable_saved_model_import: Bool. When the value is true, it enables - MLIR saved model import path regardless of checking the conditions. 
- """ + def _parse_saved_model_args(self): + """Parses SavedModel arguments from the given Keras/RNN SavedModel.""" if not self.experimental_new_converter: self.saved_model_dir = None return @@ -405,17 +400,16 @@ class TFLiteConverterBase(object): # frozen graph def path. self.saved_model_dir = None return - if (not always_enable_saved_model_import and - not self._contains_function_with_implements_attr(saved_model_proto)): + if not self._contains_function_with_implements_attr(saved_model_proto): self.saved_model_dir = None - return - - if not self._saved_model_exported_names: - self._saved_model_exported_names = [] - self._saved_model_version = saved_model_proto.saved_model_schema_version - if self._saved_model_version not in [1, 2]: - raise ValueError("SavedModel file format({0}) is not supported".format( - self._saved_model_version)) + else: + if not self._saved_model_exported_names: + self._saved_model_exported_names = [] + self._saved_model_version = saved_model_proto.saved_model_schema_version + if self._saved_model_version not in [1, 2]: + raise ValueError( + "SavedModel file format({0}) is not supported".format( + self._saved_model_version)) class TFLiteConverterBaseV2(TFLiteConverterBase): @@ -548,7 +542,7 @@ class TFLiteSavedModelConverterV2(TFLiteConverterBaseV2): self._saved_model_tags = saved_model_tags self._saved_model_exported_names = saved_model_exported_names self._trackable_obj = trackable_obj - self._parse_saved_model_args(always_enable_saved_model_import=True) + self._parse_saved_model_args() def convert(self): """Converts a TensorFlow GraphDef based on instance variables. From 340ac1aedb082dbf3092608354c8f5a1d2d276d9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 13:49:39 -0700 Subject: [PATCH 0692/1533] Move GetDeviceCoordinates() function and related constants in tpu_rewrite pass to common utility file. 
PiperOrigin-RevId: 311795001 Change-Id: If86babf6656da132fb58b1a2266034f3b341e06d --- tensorflow/compiler/mlir/tensorflow/BUILD | 2 + .../tensorflow/transforms/tpu_rewrite_pass.cc | 63 +++++++------------ .../utils/tpu_rewrite_device_util.cc | 33 +++++++++- .../utils/tpu_rewrite_device_util.h | 10 +++ .../utils/tpu_rewrite_device_util_test.cc | 26 ++++++++ 5 files changed, 91 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index eb220a31f80..2bbdbb383a1 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1279,6 +1279,7 @@ cc_library( "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", ], ) @@ -1293,6 +1294,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/protobuf/tpu:topology_proto_cc", "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index f5e9da915c8..986736a9502 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -64,19 +64,14 @@ static llvm::cl::opt tpu_compile_metadata_debug( "'tf._TPUCompileMlir' op as a proto debug string")); constexpr char kNumReplicasAttr[] = "num_replicas"; -constexpr char kNumCoresPerReplicaAttr[] = "num_cores_per_replica"; constexpr char kStepMarkerLocationAttr[] = "step_marker_location"; constexpr char kPaddingMapAttr[] = "padding_map"; -constexpr char kTopologyAttr[] = "topology"; -constexpr char kDeviceAssignmentAttr[] = "device_assignment"; constexpr char kDeviceAttr[] = "device"; constexpr char kDevicesAttr[] = "devices"; constexpr char kVersionsAttr[] = "tf.versions"; constexpr char kBadStringArrayElementMsg[] = "bad '{0}' attribute at index {1}, not a string"; -constexpr char kBadIntArrayElementMsg[] = - "bad '{0}' attribute at index {1}, not an int"; constexpr char kBadArrayElementMsg[] = "bad '{0}' attribute at index {1} with value '{2}': failed to parse to {3}"; constexpr char kBadArrayAttrLengthMsg[] = @@ -163,32 +158,6 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, return success(); } -// Extracts device coordinates from a device assignment attribute on an op. -LogicalResult GetDeviceCoordinates( - tf_device::ClusterFuncOp op, - llvm::SmallVectorImpl* device_assignment) { - auto device_assignment_attr = - op.getAttrOfType(kDeviceAssignmentAttr); - if (!device_assignment_attr) - return op.emitOpError(CreateMissingAttributeMsg(kDeviceAssignmentAttr)); - - device_assignment->reserve(device_assignment_attr.size()); - - for (auto device_coordinate_and_idx : - llvm::enumerate(device_assignment_attr)) { - auto device_coordinate = - device_coordinate_and_idx.value().dyn_cast(); - if (!device_coordinate) - return op.emitOpError(llvm::formatv(kBadIntArrayElementMsg, - kDeviceAssignmentAttr, - device_coordinate_and_idx.index())); - - device_assignment->push_back(device_coordinate.getInt()); - } - - return success(); -} - // Populates a TPUCompileMetadataProto with StepMarkerLocation from a // `tf_device::ClusterFuncOp`. 
LogicalResult SetMetadataProtoStepMarkerLocation( @@ -661,27 +630,41 @@ LogicalResult Rewrite( : nullptr; if (replicate) num_replicas = replicate.n().getLimitedValue(); - auto num_cores_per_replica_attr = - cluster_func.getAttrOfType(kNumCoresPerReplicaAttr); + auto num_cores_per_replica_attr = cluster_func.getAttrOfType( + tensorflow::kNumCoresPerReplicaAttr); if (!num_cores_per_replica_attr) return cluster_func.emitOpError( - CreateMissingAttributeMsg(kNumCoresPerReplicaAttr)); + CreateMissingAttributeMsg(tensorflow::kNumCoresPerReplicaAttr)); int num_cores_per_replica = num_cores_per_replica_attr.getInt(); - auto topology_attr = cluster_func.getAttrOfType(kTopologyAttr); + auto topology_attr = + cluster_func.getAttrOfType(tensorflow::kTopologyAttr); if (!topology_attr) - return cluster_func.emitOpError(CreateMissingAttributeMsg(kTopologyAttr)); + return cluster_func.emitOpError( + CreateMissingAttributeMsg(tensorflow::kTopologyAttr)); - llvm::SmallVector device_assignment; - if (failed(GetDeviceCoordinates(cluster_func, &device_assignment))) - return failure(); + auto device_assignment_attr = cluster_func.getAttrOfType( + tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return cluster_func.emitOpError( + llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + if (!status_or_device_coodinates.ok()) + return cluster_func.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); // Determine compilation and execution devices. auto status_or_tpu_device_assignment = tensorflow::GetTPUCompilationAndExecutionDevices( devices, num_replicas, num_cores_per_replica, - topology_attr.getValue(), device_assignment); + topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); if (!status_or_tpu_device_assignment.ok()) return cluster_func.emitError() << "error in fetching TPU compilation/execution devices: " diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 06c10c26835..282b7ad3139 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -26,9 +26,9 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -39,6 +39,12 @@ limitations under the License. #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { + +const char* const kTPUReplicatedHost = "TPU_REPLICATED_HOST"; +const char* const kNumCoresPerReplicaAttr = "num_cores_per_replica"; +const char* const kTopologyAttr = "topology"; +const char* const kDeviceAssignmentAttr = "device_assignment"; + // Device coordinates are defined as (x, y, z, core), thus resulting in a rank 4 // topology. 
constexpr int kTPUTopologyRank = 4; @@ -46,8 +52,8 @@ constexpr int kTPUTopologyRank = 4; constexpr char kDeviceTPUSystem[] = "TPU_SYSTEM"; constexpr char kDeviceTPU[] = "TPU"; constexpr char kTPUReplicatedCore[] = "TPU_REPLICATED_CORE"; -constexpr char kTopologyAttr[] = "topology"; -constexpr char kDeviceAssignmentAttr[] = "device_assignment"; +constexpr char kBadIntArrayElementMsg[] = + "bad '{0}' attribute at index {1}, not an int"; using Device = DeviceNameUtils::ParsedName; using Devices = llvm::ArrayRef; @@ -417,6 +423,27 @@ GetGeneralTPUExecutionDeviceAssignment( } // anonymous namespace +StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr) { + llvm::SmallVector device_coordinates; + device_coordinates.reserve(device_assignment_attr.size()); + + for (auto device_coordinate_and_idx : + llvm::enumerate(device_assignment_attr)) { + auto device_coordinate = + device_coordinate_and_idx.value().dyn_cast(); + if (!device_coordinate) + return errors::InvalidArgument( + llvm::formatv(kBadIntArrayElementMsg, kDeviceAssignmentAttr, + device_coordinate_and_idx.index()) + .str()); + + device_coordinates.push_back(device_coordinate.getInt()); + } + + return device_coordinates; +} + StatusOr GetTPUCompilationAndExecutionDevices( Devices devices, int num_replicas, int num_cores_per_replica, llvm::StringRef topology_attr, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index 5fdb6b8768b..6bb541ab683 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -22,6 +22,7 @@ limitations under the License. #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/device_name_utils.h" @@ -30,6 +31,11 @@ limitations under the License. namespace tensorflow { using stream_executor::port::StatusOr; +extern const char* const kTPUReplicatedHost; +extern const char* const kNumCoresPerReplicaAttr; +extern const char* const kTopologyAttr; +extern const char* const kDeviceAssignmentAttr; + // A TPU device for execution alongside its associated host CPU device. struct TPUDeviceAndHost { TPUDeviceAndHost() {} @@ -67,6 +73,10 @@ struct TPUDeviceAssignment { llvm::Optional xla_device_assignment; }; +// Extracts device coordinates from a device assignment attribute on an op. +StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr); + // Finds the TPU compilation device and execution devices from `devices` for a // TPU computation subgraph. Compilation device is determined from looking up // all TPU_SYSTEM:0 devices and choosing the CPU device associated to the first diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 7ac5635a6e4..a70e93a0195 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -19,6 +19,8 @@ limitations under the License. 
#include #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/tpu/topology.pb.h" @@ -596,5 +598,29 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(computation_device_2.replica_device_ids(1), 3); } +TEST(TPURewriteDeviceUtilTest, TestGetDeviceCoordinates) { + mlir::MLIRContext context; + mlir::Builder builder(&context); + auto device_assignment_attr = builder.getI64ArrayAttr({1, 2, 3}); + auto status_or_device_coodinates = + GetDeviceCoordinates(device_assignment_attr); + ASSERT_TRUE(status_or_device_coodinates.ok()); + auto device_coordinates = status_or_device_coodinates.ConsumeValueOrDie(); + EXPECT_EQ(device_coordinates[0], 1); + EXPECT_EQ(device_coordinates[1], 2); + EXPECT_EQ(device_coordinates[2], 3); +} + +TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { + mlir::MLIRContext context; + mlir::Builder builder(&context); + auto device_assignment_attr = builder.getF32ArrayAttr({1.0, 2.0, 3.0}); + auto status_or_device_coodinates = + GetDeviceCoordinates(device_assignment_attr); + ASSERT_TRUE(!status_or_device_coodinates.ok()); + EXPECT_EQ(status_or_device_coodinates.status().error_message(), + "bad 'device_assignment' attribute at index 0, not an int"); +} + } // anonymous namespace } // namespace tensorflow From 20f064ffa6a0b40e76f8e7b37a0a647febe5a840 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 15 May 2020 14:30:51 -0700 Subject: [PATCH 0693/1533] optimize for int8 add. PiperOrigin-RevId: 311802413 Change-Id: I14cd70984ae7a8cad89b9c4a1a5216fcb7609c0e --- .../internal/optimized/integer_ops/add.h | 141 +++++++----------- 1 file changed, 50 insertions(+), 91 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index 8937fe2b26e..a9dae4feac5 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -35,99 +35,58 @@ inline void AddElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.input2_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); TFLITE_DCHECK_LT(params.input2_offset, 256); - #ifdef USE_NEON - const int8x16_t output_activation_min_vector = - vdupq_n_s8(params.quantized_activation_min); - const int8x16_t output_activation_max_vector = - vdupq_n_s8(params.quantized_activation_max); - - const int input1_left_shift = params.left_shift + params.input1_shift; - const int input2_left_shift = params.left_shift + params.input2_shift; - const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); - const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); - - for (; i <= size - 16; i += 16) { - const int8x16_t input1_val_original = vld1q_s8(input1_data + i); - const int8x16_t input2_val_original = vld1q_s8(input2_data + i); - - const int16x8_t input1_val_s16_high = - vmovl_s8(vget_high_s8(input1_val_original)); - const int16x8_t input1_val_s16_low = - vmovl_s8(vget_low_s8(input1_val_original)); - - const int16x8_t input2_val_s16_high = - vmovl_s8(vget_high_s8(input2_val_original)); - const int16x8_t input2_val_s16_low = - vmovl_s8(vget_low_s8(input2_val_original)); - const int16x8_t input1_val_high = - vaddq_s16(input1_val_s16_high, 
vdupq_n_s16(params.input1_offset)); - const int16x8_t input2_val_high = - vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); - const int16x8_t input1_val_low = - vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); - const int16x8_t input2_val_low = - vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); - const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); - const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); - const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); - const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); - const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); - const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); - const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); - const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); - int32x4_t x111 = vmovl_s16(input1_val_low_low); - int32x4_t x112 = vmovl_s16(input1_val_low_high); - int32x4_t x121 = vmovl_s16(input1_val_high_low); - int32x4_t x122 = vmovl_s16(input1_val_high_high); - int32x4_t x211 = vmovl_s16(input2_val_low_low); - int32x4_t x212 = vmovl_s16(input2_val_low_high); - int32x4_t x221 = vmovl_s16(input2_val_high_low); - int32x4_t x222 = vmovl_s16(input2_val_high_high); - - x111 = vshlq_s32(x111, input1_left_dup); - x112 = vshlq_s32(x112, input1_left_dup); - x121 = vshlq_s32(x121, input1_left_dup); - x122 = vshlq_s32(x122, input1_left_dup); - x211 = vshlq_s32(x211, input2_left_dup); - x212 = vshlq_s32(x212, input2_left_dup); - x221 = vshlq_s32(x221, input2_left_dup); - x222 = vshlq_s32(x222, input2_left_dup); - x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); - x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); - x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier); - x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); - x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); - x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); - x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); - x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); - int32x4_t s11 = vaddq_s32(x111, x211); - int32x4_t s12 = vaddq_s32(x112, x212); - int32x4_t s21 = vaddq_s32(x121, x221); - int32x4_t s22 = vaddq_s32(x122, x222); - s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); - s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); - s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); - s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); + const int8x8_t output_activation_min_vector = + vdup_n_s8(params.quantized_activation_min); + const int8x8_t output_activation_max_vector = + vdup_n_s8(params.quantized_activation_max); + for (; i <= size - 8; i += 8) { + const int8x8_t input1_val_original = vld1_s8(input1_data + i); + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); + const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); + const int16x8_t input1_val = + vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val = + vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); + const int16x4_t input1_val_high = vget_high_s16(input1_val); + const int16x4_t input1_val_low = vget_low_s16(input1_val); + const int16x4_t input2_val_high = vget_high_s16(input2_val); + const int16x4_t input2_val_low = vget_low_s16(input2_val); + int32x4_t x11 = vmovl_s16(input1_val_low); + int32x4_t x12 = vmovl_s16(input1_val_high); + int32x4_t x21 = 
vmovl_s16(input2_val_low); + int32x4_t x22 = vmovl_s16(input2_val_high); + const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); + x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); + const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); + const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); + x21 = vshlq_s32(x21, input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + int32x4_t s1 = vaddq_s32(x11, x21); + int32x4_t s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); + s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - s11 = RoundingDivideByPOT(s11, -params.output_shift); - s12 = RoundingDivideByPOT(s12, -params.output_shift); - s21 = RoundingDivideByPOT(s21, -params.output_shift); - s22 = RoundingDivideByPOT(s22, -params.output_shift); - const int16x4_t s11_narrowed = vmovn_s32(s11); - const int16x4_t s12_narrowed = vmovn_s32(s12); - const int16x4_t s21_narrowed = vmovn_s32(s21); - const int16x4_t s22_narrowed = vmovn_s32(s22); - const int16x8_t s1 = vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), - vdupq_n_s16(params.output_offset)); - const int16x8_t s2 = vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), - vdupq_n_s16(params.output_offset)); - const int16x8_t s = vcombine_s16(vqmovn_s16(s1), vqmovn_s16(s2)); - - const int8x16_t clamped = - vmaxq_s8(output_activation_min_vector, - vminq_s8(output_activation_max_vector, s)); - vst1q_s8(output_data + i, clamped); + s1 = RoundingDivideByPOT(s1, -params.output_shift); + s2 = RoundingDivideByPOT(s2, -params.output_shift); + const int16x4_t s1_narrowed = vmovn_s32(s1); + const int16x4_t s2_narrowed = vmovn_s32(s2); + const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), + vdupq_n_s16(params.output_offset)); + const int8x8_t clamped = + vmax_s8(output_activation_min_vector, + vmin_s8(output_activation_max_vector, vqmovn_s16(s))); + vst1_s8(output_data + i, clamped); } #endif // NEON From c77c31d45d849ebdf6ab53f9238137ffebe07829 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Fri, 15 May 2020 14:36:46 -0700 Subject: [PATCH 0694/1533] Enable TextVectorization to be called on lists of strings (or lists of list-wrapped strings). Using NumPy arrays of characters is generally a bad practice because of their extreme memory usage. 
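For illustration only (not part of the patch), a minimal sketch of the calling convention this change enables; it assumes the layer is exposed as `tf.keras.layers.experimental.preprocessing.TextVectorization`, as in the TF 2.x releases of that period, and that eager execution is enabled:

    import tensorflow as tf

    # Plain Python lists of strings (or lists of single-string lists) can now be
    # passed straight to adapt() and to the layer call, with no need to wrap them
    # in a NumPy array or a tf.data.Dataset first.
    vocab_data = ["two two two", "two three three", "three four four five"]
    layer = tf.keras.layers.experimental.preprocessing.TextVectorization()
    layer.adapt(vocab_data)                   # builds the vocabulary from the list
    print(layer(["two three", "four five"]))  # e.g. [[2, 3], [4, 5]] for this vocabulary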
PiperOrigin-RevId: 311803496 Change-Id: I179eee4a8a879f8871ef6cc1253c34c42da06983 --- .../keras/engine/base_preprocessing_layer.py | 14 +++-- .../python/keras/layers/preprocessing/BUILD | 1 + .../preprocessing/text_vectorization.py | 29 ++++++++--- .../preprocessing/text_vectorization_test.py | 52 +++++++++++++++++++ 4 files changed, 85 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/keras/engine/base_preprocessing_layer.py b/tensorflow/python/keras/engine/base_preprocessing_layer.py index 84138dd0a00..efd8a0e621f 100644 --- a/tensorflow/python/keras/engine/base_preprocessing_layer.py +++ b/tensorflow/python/keras/engine/base_preprocessing_layer.py @@ -143,9 +143,12 @@ class CombinerPreprocessingLayer(PreprocessingLayer): accumulator = self._combiner.restore(self._restore_updates()) if not isinstance(data, - (dataset_ops.DatasetV2, np.ndarray, ops.EagerTensor)): + (dataset_ops.DatasetV2, + np.ndarray, + ops.Tensor, + ragged_tensor.RaggedTensor)): raise ValueError( - '`adapt()` requires a batched Dataset, an EagerTensor, ' + '`adapt()` requires a batched Dataset, a Tensor, ' 'or a Numpy array as input, ' 'got {}'.format(type(data))) @@ -158,9 +161,14 @@ class CombinerPreprocessingLayer(PreprocessingLayer): 'elements. Please use `dataset.take(...)` to make the number ' 'of elements finite.') next_data = self._get_dataset_iterator(data) + # TODO(fchollet): consider checking if the dataset is already batched + # and otherwise batching it. + elif isinstance(data, (ops.Tensor, ragged_tensor.RaggedTensor)): + next_data = self._get_dataset_iterator( + dataset_ops.Dataset.from_tensor_slices(data).batch(512)) else: generator, _ = training_generator.convert_to_generator_like( - data, batch_size=len(data)) + data, batch_size=512) # If the data is not a dataset, we can iterate over it using next(foo); # here, we wrap that into a callable. next_data = lambda: next(generator) diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index c1e1d5573e5..052a57b52f3 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -521,6 +521,7 @@ tf_py_test( size = "medium", srcs = ["text_vectorization_test.py"], python_version = "PY3", + shard_count = 4, deps = [ ":preprocessing_test_utils", ":text_vectorization", diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 4156ba50c02..b1eff6e0bf3 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -346,11 +346,16 @@ class TextVectorization(CombinerPreprocessingLayer): return tensor_shape.TensorShape([input_shape[0], self._max_tokens]) if self._output_mode == INT and self._split is None: - return input_shape + if len(input_shape) == 1: + input_shape = tuple(input_shape) + (1,) + return tensor_shape.TensorShape(input_shape) if self._output_mode == INT and self._split is not None: input_shape = list(input_shape) - input_shape[1] = self._output_sequence_length + if len(input_shape) == 1: + input_shape = input_shape + [self._output_sequence_length] + else: + input_shape[1] = self._output_sequence_length return tensor_shape.TensorShape(input_shape) def compute_output_signature(self, input_spec): @@ -366,7 +371,7 @@ class TextVectorization(CombinerPreprocessingLayer): Arguments: data: The data to train on. 
It can be passed either as a tf.data Dataset, - or as a numpy array. + as a NumPy array, a string tensor, or as a list of texts. reset_state: Optional argument specifying whether to clear the state of the layer at the start of the call to `adapt`. This must be True for this layer, which does not support repeated calls to `adapt`. @@ -377,24 +382,30 @@ class TextVectorization(CombinerPreprocessingLayer): # Build the layer explicitly with the original data shape instead of relying # on an implicit call to `build` in the base layer's `adapt`, since # preprocessing changes the input shape. - if isinstance(data, np.ndarray): - if data.ndim == 1: - data = np.expand_dims(data, axis=-1) + if isinstance(data, (list, tuple, np.ndarray)): + data = ops.convert_to_tensor(data) + + if isinstance(data, ops.Tensor): + if data.shape.rank == 1: + data = array_ops.expand_dims(data, axis=-1) self.build(data.shape) - preprocessed_inputs = self._to_numpy(self._preprocess(data)) + preprocessed_inputs = self._preprocess(data) elif isinstance(data, dataset_ops.DatasetV2): # TODO(momernick): Replace this with a more V2-friendly API. shape = dataset_ops.get_legacy_output_shapes(data) if not isinstance(shape, tensor_shape.TensorShape): raise ValueError("The dataset passed to 'adapt' must contain a single " "tensor value.") + if shape.rank == 0: + data = data.map(lambda tensor: array_ops.expand_dims(tensor, 0)) + shape = dataset_ops.get_legacy_output_shapes(data) if shape.rank == 1: data = data.map(lambda tensor: array_ops.expand_dims(tensor, -1)) self.build(dataset_ops.get_legacy_output_shapes(data)) preprocessed_inputs = data.map(self._preprocess) else: raise ValueError( - "adapt() requires a Dataset or a Numpy array as input, got {}".format( + "adapt() requires a Dataset or an array as input, got {}".format( type(data))) super(TextVectorization, self).adapt(preprocessed_inputs, reset_state) @@ -561,6 +572,8 @@ class TextVectorization(CombinerPreprocessingLayer): return inputs def call(self, inputs): + if isinstance(inputs, (list, tuple, np.ndarray)): + inputs = ops.convert_to_tensor(inputs) if inputs.shape.rank == 1: inputs = array_ops.expand_dims(inputs, axis=-1) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index f8a1f5b9434..5a9762719d5 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -29,6 +29,7 @@ from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import one_device_strategy from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized @@ -286,6 +287,57 @@ class TextVectorizationLayerTest(keras_parameterized.TestCase, adapt_data=vocab_data) self.assertAllClose(expected_output, output_data) + def test_list_inputs_1d(self): + vocab_data = ["two two two", "two three three", "three four four five"] + input_data = ["two three", "four five"] + layer = get_layer_class()() + layer.adapt(vocab_data) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + layer.set_vocabulary(["two", "three", "four", "five"]) + out = layer(input_data) + if context.executing_eagerly(): + 
self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + + def test_tensor_inputs(self): + vocab_data = constant_op.constant( + ["two two two", "two three three", "three four four five"]) + input_data = constant_op.constant(["two three", "four five"]) + layer = get_layer_class()() + layer.adapt(vocab_data) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + layer.set_vocabulary(["two", "three", "four", "five"]) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + + def test_list_inputs_2d(self): + vocab_data = [ + ["two two two"], ["two three three"], ["three four four five"]] + input_data = [["two three"], ["four five"]] + layer = get_layer_class()() + layer.adapt(vocab_data) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + layer.set_vocabulary(["two", "three", "four", "five"]) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + + def test_dataset_of_single_strings(self): + vocab_data = ["two two two", "two three three", "three four four five"] + input_data = ["two three", "four five"] + vocab_ds = dataset_ops.Dataset.from_tensor_slices(vocab_data) # unbatched + layer = get_layer_class()() + layer.adapt(vocab_ds) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + @keras_parameterized.run_all_keras_modes class TextVectorizationPreprocessingTest( From cfb6d217c9963de69a31d543a373b9a39854108c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 14:41:08 -0700 Subject: [PATCH 0695/1533] Implement NNAPI QoS APIs in NNAPI delegate. PiperOrigin-RevId: 311804298 Change-Id: Ia018050ca90fbc2cc12f363b5bc52727734e4abf --- .../lite/delegates/nnapi/nnapi_delegate.cc | 37 +++++ .../lite/delegates/nnapi/nnapi_delegate.h | 36 +++++ .../delegates/nnapi/nnapi_delegate_test.cc | 17 ++ tensorflow/lite/nnapi/NeuralNetworksTypes.h | 24 +++ tensorflow/lite/nnapi/nnapi_implementation.cc | 11 ++ tensorflow/lite/nnapi/nnapi_implementation.h | 148 ++++++++++++++++++ 6 files changed, 273 insertions(+) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index e6faea62bf6..39ab19aed2d 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -3256,6 +3256,22 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context, RETURN_TFLITE_ERROR_IF_NN_ERROR(context, set_caching_result, "configuring NNAPI caching", nnapi_errno); } + // Set compilation timeout if applicable. 
+ if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI13) { + if (delegate_options.max_compilation_timeout_duration_ns > 0) { + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksCompilation_setTimeout( + compilation, + delegate_options.max_compilation_timeout_duration_ns), + "setting compilation timeout", nnapi_errno); + } + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksCompilation_setPriority( + compilation, delegate_options.execution_priority), + "setting compilation priority", nnapi_errno); + } const int finish_result = nnapi_->ANeuralNetworksCompilation_finish(compilation); if (finish_result != ANEURALNETWORKS_NO_ERROR) { @@ -3322,6 +3338,27 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, std::unique_ptr execution_unique_ptr(execution, NNFreeExecution(nnapi_)); + // Set compilation timeout if applicable. + const auto delegate_options = + StatefulNnApiDelegate::GetOptions(node->delegate); + if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI13) { + if (delegate_options.max_execution_timeout_duration_ns > 0) { + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksExecution_setTimeout( + execution, delegate_options.max_execution_timeout_duration_ns), + "setting execution timeout", nnapi_errno); + } + if (delegate_options.max_execution_loop_timeout_duration_ns > 0) { + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksExecution_setLoopTimeout( + execution, + delegate_options.max_execution_loop_timeout_duration_ns), + "setting execution loop timeout", nnapi_errno); + } + } + // Set the input tensor buffers. Note: we access tflite tensors using // absolute indices but NN api indices inputs by relative indices. int relative_input_index = 0; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index b94c6d66978..68c55e1aef4 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/types/optional.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h" #include "tensorflow/lite/nnapi/nnapi_implementation.h" typedef struct ANeuralNetworksMemory ANeuralNetworksMemory; @@ -92,6 +93,30 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // allow fp32 compuation to be run in fp16. bool allow_fp16 = false; + + // Specifies the relative priority for executions of the model. + // Available values are {ANEURALNETWORKS_PRIORITY_LOW, + // ANEURALNETWORKS_PRIORITY_MEDIUM, ANEURALNETWORKS_PRIORITY_HIGH, + // ANEURALNETWORKS_PRIORITY_DEFAULT}. + int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT; + + // Specifies the maximum expected duration in nanosecond for compiling the + // model. If the device is not able to complete the compilation within the + // specified duration, the compilation may be aborted. If set to 0, the + // timeout duration is considered infinite. + uint64_t max_compilation_timeout_duration_ns = 0; + + // Specifies the maximum expected duration in nanosecond for executing the + // model. If the device is not able to complete the execution within the + // specified duration, the execution may be aborted. If set to 0, the + // timeout duration is considered infinite. + uint64_t max_execution_timeout_duration_ns = 0; + + // Specifies the maximum expected duration in nanosecond for WHILE loops in + // the execution. 
If a WHILE loop condition model does not output false + // within the specified duration, the execution will be aborted. If set to + // 0, the default timeout for loops will be used. + uint64_t max_execution_loop_timeout_duration_ns = 0; }; // Uses default options. @@ -189,6 +214,17 @@ class StatefulNnApiDelegate : public TfLiteDelegate { int max_number_delegated_partitions; // allow fp32 computation to be run in fp16. bool allow_fp16; + // Specifies the relative priority for executions of the model. + int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT; + // Specifies the maximum expected duration in nanosecond for compiling the + // model. + uint64_t max_compilation_timeout_duration_ns = 0; + // Specifies the maximum expected duration in nanosecond for executing the + // model. + uint64_t max_execution_timeout_duration_ns = 0; + // Specifies the maximum expected duration in nanosecond for WHILE loops in + // the execution + uint64_t max_execution_loop_timeout_duration_ns = 0; ~Data(); diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc index ea9111c4567..acfa0c77d30 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc @@ -304,6 +304,23 @@ TEST(NNAPIDelegate, StatefulDelegateWithCompilationCaching) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3})); } +// Sanity check for the state-ful NNAPI delegate with QoS hints. +TEST(NNAPIDelegate, StatefulDelegateWithQoS) { + StatefulNnApiDelegate::Options options; + options.execution_priority = ANEURALNETWORKS_PRIORITY_HIGH; + options.max_compilation_timeout_duration_ns = UINT64_MAX; + options.max_execution_timeout_duration_ns = UINT64_MAX; + options.max_execution_loop_timeout_duration_ns = UINT64_MAX; + + FloatAddOpModel m(options, {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); + m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3})); +} + // Sanity check for the state-ful NNAPI delegate using TfLiteBufferHandle. TEST(NNAPIDelegate, StatefulDelegateWithBufferHandles) { // Skip the test if Android specific functions could not be found. diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index a3dfd373405..6739838e4d1 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -215,6 +215,18 @@ enum { ANEURALNETWORKS_DEVICE_ACCELERATOR = 4, }; +/** + * Relative execution priority. + * + * Available since API level 30. + */ +enum { + ANEURALNETWORKS_PRIORITY_LOW = 90, + ANEURALNETWORKS_PRIORITY_MEDIUM = 100, + ANEURALNETWORKS_PRIORITY_HIGH = 110, + ANEURALNETWORKS_PRIORITY_DEFAULT = ANEURALNETWORKS_PRIORITY_MEDIUM, +}; + /** * ANeuralNetworksMemory is an opaque type that represents memory. 
* @@ -528,9 +540,21 @@ typedef int (*ANeuralNetworksCompilation_setCaching_fn)( ANeuralNetworksCompilation* compilation, const char* cacheDir, const uint8_t* token); +typedef int (*ANeuralNetworksCompilation_setTimeout_fn)( + ANeuralNetworksCompilation* compilation, uint64_t duration); + +typedef int (*ANeuralNetworksCompilation_setPriority_fn)( + ANeuralNetworksCompilation* compilation, int priority); + typedef int (*ANeuralNetworksExecution_compute_fn)( ANeuralNetworksExecution* execution); +typedef int (*ANeuralNetworksExecution_setTimeout_fn)( + ANeuralNetworksExecution* execution, uint64_t duration); + +typedef int (*ANeuralNetworksExecution_setLoopTimeout_fn)( + ANeuralNetworksExecution* execution, uint64_t duration); + typedef int (*ANeuralNetworksExecution_getOutputOperandRank_fn)( ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank); diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc index accdfb6c7da..ad5869fec04 100644 --- a/tensorflow/lite/nnapi/nnapi_implementation.cc +++ b/tensorflow/lite/nnapi/nnapi_implementation.cc @@ -215,6 +215,17 @@ const NnApi LoadNnApi() { ANeuralNetworksModel_getExtensionOperationType); LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksModel_setOperandExtensionData); + + // API 30 (NNAPI 1.3) methods. + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksCompilation_setTimeout); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksCompilation_setPriority); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksExecution_setTimeout); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksExecution_setLoopTimeout); + return nnapi; } diff --git a/tensorflow/lite/nnapi/nnapi_implementation.h b/tensorflow/lite/nnapi/nnapi_implementation.h index a27f5ba661a..abee0fbdef3 100644 --- a/tensorflow/lite/nnapi/nnapi_implementation.h +++ b/tensorflow/lite/nnapi/nnapi_implementation.h @@ -789,6 +789,76 @@ struct NnApi { ANeuralNetworksCompilation* compilation, const char* cacheDir, const uint8_t* token); + /** + * Set the maximum expected duration for compiling the model. + * + * If the device is not able to complete the compilation within the specified + * duration, the compilation may be aborted. The timeout duration begins at + * the call to {@link ANeuralNetworksCompilation_finish}. + * + * This timeout duration acts as a hint to drivers, and can be used to both + * free up compute resources within the driver and return control back to the + * application quicker than is possible without the hint. It enables drivers + * that are able to estimate how long a compilation will take to abort the + * compilation before it has even started if the driver believes the + * compilation cannot be completed within the timeout duration. Similarly, it + * enables drivers to abort an ongoing compilation if it is taking too long. + * However, this call does not guarantee that the compilation will complete or + * abort within the timeout duration. + * + * By default (i.e., unless ANeuralNetworksCompilation_setTimeout is called), + * the timeout duration for compiling the model is considered infinite. + * + * The {@link ANeuralNetworksCompilation} must have been created with + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. 
If the + * device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then + * the timeout duration hint will be ignored. + * + * See {@link ANeuralNetworksCompilation} for information on multithreaded + * usage. + * + * @param compilation The compilation to be modified. + * @param duration The maximum amount of time in nanoseconds that is expected + * to be spent finishing a compilation. If this duration is exceeded, the + * compilation may be aborted. If set to 0, the timeout duration is + * considered infinite. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ + int (*ANeuralNetworksCompilation_setTimeout)( + ANeuralNetworksCompilation* compilation, uint64_t duration); + + /** + * Set the execution priority. + * + * Execution priorities are relative to other executions created by the same + * application (specifically same uid) for the same device. Specifically, + * priorities of executions from one application will not affect executions + * from another application. Similarly, priorities of executions on one device + * will not affect executions on another device. + * + * Higher priority executions may use more compute resources than lower + * priority executions, and may preempt or starve lower priority executions. + * + * See {@link ANeuralNetworksCompilation} for information on multithreaded + * usage. + * + * Available since API level 30. + * + * @param compilation The compilation to be modified. + * @param priority The relative priority of the execution compared to other + * executions created by the application. Must be one of + * ANEURALNETWORKS_PRIORITY_*. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ + int (*ANeuralNetworksCompilation_setPriority)( + ANeuralNetworksCompilation* compilation, int priority); + /** * Schedule synchronous evaluation of the execution. * @@ -813,6 +883,84 @@ struct NnApi { */ int (*ANeuralNetworksExecution_compute)(ANeuralNetworksExecution* execution); + /** + * Set the maximum expected duration of the specified execution. + * + * If the device is not able to complete the execution within the specified + * duration, the execution may be aborted. The timeout duration begins at a + * call to one of: + * - {@link ANeuralNetworksExecution_burstCompute} + * - {@link ANeuralNetworksExecution_compute} + * - {@link ANeuralNetworksExecution_startCompute} + * - {@link ANeuralNetworksExecution_startComputeWithDependencies} + * + * This timeout duration acts as a hint to drivers, and can be used to both + * free up compute resources within the driver and return control back to the + * application quicker than is possible without the hint. It enables drivers + * that are able to estimate how long an execution will take to abort the + * execution before it has even started if the driver believes the execution + * cannot be completed within the timeout duration. Similarly, it enables + * drivers to abort an ongoing execution if it is taking too long. However, + * this call does not guarantee that the execution will complete or abort + * within the timeout duration. + * + * By default (i.e., unless ANeuralNetworksExecution_setTimeout is called), + * the timeout duration for execution is considered infinite. 
+ * + * The {@link ANeuralNetworksExecution} must have been created from an + * {@link ANeuralNetworksCompilation} which in turn was created from + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If the + * device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then + * the timeout duration hint will be ignored. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded + * usage. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds that is expected + * to be spent executing a model. If this duration is exceeded, the execution + * may be aborted. If set to 0, the timeout duration is considered + * infinite. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ + int (*ANeuralNetworksExecution_setTimeout)( + ANeuralNetworksExecution* execution, uint64_t duration); + + /** + * Set the maximum duration of WHILE loops in the specified execution. + * + * This is a fuzzy per-loop timeout intended to prevent infinite loops. + * + * If a WHILE loop condition model does not output false within the specified + * duration, the execution will be aborted. + * + * See {@link ANeuralNetworks_getDefaultLoopTimeout} and + * {@link ANeuralNetworks_getMaximumLoopTimeout} for the default + * and maximum timeout values. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded + * usage. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds that can be spent + * executing a WHILE loop. If the specified duration value exceeds the + * value produced by {@link ANeuralNetworks_getMaximumLoopTimeout}, it will be + * overridden by that value. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * ANEURALNETWORKS_BAD_STATE if execution has started. + * ANEURALNETWORKS_UNEXPECTED_NULL if execution is NULL. + * + * Available since API level 30. 
+ */ + int (*ANeuralNetworksExecution_setLoopTimeout)( + ANeuralNetworksExecution* execution, uint64_t duration); + /** * Get the dimensional information of the specified output operand of the * model of the From 4ee27d9668f46c89b93fd5d306e25ede4e6a2f09 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Fri, 15 May 2020 14:46:00 -0700 Subject: [PATCH 0696/1533] Add benchmarks for scalar conversions PiperOrigin-RevId: 311805249 Change-Id: Id1f499cfe6a1aac7ab1fc8d8339ce7b4031d4b6b --- tensorflow/python/eager/benchmarks_test.py | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 227fca5ea6f..3056d1a98ea 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -1194,6 +1194,46 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(fn, 10000) + def _benchmark_convert_constant(self, value, cached): + global GLOBAL_TEST_VALUE + GLOBAL_TEST_VALUE = value + + def cached_func(): + ops.convert_to_tensor(value) + + def uncached_func(): + global GLOBAL_TEST_VALUE + GLOBAL_TEST_VALUE += 1 + ops.convert_to_tensor(GLOBAL_TEST_VALUE) + + func = cached_func if cached else uncached_func + + self._run(func, 10000) + + def benchmark_convert_python_int(self): + self._benchmark_convert_constant(42, cached=True) + + def benchmark_convert_python_int_uncached(self): + self._benchmark_convert_constant(42, cached=False) + + def benchmark_convert_python_float(self): + self._benchmark_convert_constant(42.0, cached=True) + + def benchmark_convert_python_float_uncached(self): + self._benchmark_convert_constant(42.0, cached=False) + + def benchmark_convert_numpy_int(self): + self._benchmark_convert_constant(np.array(42), cached=True) + + def benchmark_convert_numpy_int_uncached(self): + self._benchmark_convert_constant(np.array(42), cached=False) + + def benchmark_convert_numpy_float(self): + self._benchmark_convert_constant(np.array(42.0), cached=True) + + def benchmark_convert_numpy_float_uncached(self): + self._benchmark_convert_constant(np.array(42.0), cached=False) + @test_util.disable_tfrt("convert to tensor not supported") def benchmark_convert_3x_list_to_tensor(self): xs = [1, 2, 3] From c61bc6a4f32dc697b8eb51ef3bf490e8d0780228 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 15 May 2020 14:46:55 -0700 Subject: [PATCH 0697/1533] Support cancellation in multi-device and distributed function execution. In executing a multi-device or distributed function, one component function failure could cause other component functions to hang due to dependencies (e.g., they are pending receiving tensors from the failed component function). This can often lead to issues that are hard to debug especially with a large number of workers. This change cancels local and remote component functions in multi-device function execution if one component function fails, by cancelling the function rendezvous and the component function execution request RPCs. Since the cancelled errors are marked as derived, the original failure error message will be reported to users. 
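As a purely illustrative aside (not part of the patch, and not a TensorFlow API): the pattern this change implements -- register cancellation for every pending component function, trigger it on the first failure, and surface the original error rather than the derived cancellations -- can be sketched with standard-library Python. All names below (`run_components`, `Cancelled`) are invented for the sketch:

    import concurrent.futures
    import threading

    class Cancelled(Exception):
      """Raised by a component that gives up because a sibling failed."""

    def run_components(component_fns):
      # `cancel` stands in for the CancellationManager: components poll it while
      # waiting for tensors instead of blocking forever on a dead rendezvous.
      cancel = threading.Event()

      def run_one(fn):
        try:
          return fn(cancel)
        except Exception:
          cancel.set()  # the first failure cancels the sibling components
          raise

      with concurrent.futures.ThreadPoolExecutor() as pool:
        futures = [pool.submit(run_one, fn) for fn in component_fns]
      errors = [f.exception() for f in futures if f.exception() is not None]
      if errors:
        # Report the original failure, not a derived cancellation.
        original = [e for e in errors if not isinstance(e, Cancelled)]
        raise (original[0] if original else errors[0])
      return [f.result() for f in futures]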
PiperOrigin-RevId: 311805431 Change-Id: I2f0b819e2b0a228fdeb242361b41ef4cadc7e3d2 --- tensorflow/c/eager/BUILD | 3 + tensorflow/c/eager/c_api_remote_test.cc | 180 ++++++++++++++++++ .../process_function_library_runtime.cc | 99 ++++++---- .../core/distributed_runtime/eager/BUILD | 2 + .../eager/cluster_function_library_runtime.cc | 24 ++- .../distributed_runtime/eager/eager_client.h | 6 +- .../eager/eager_service_impl_test.cc | 3 +- .../core/distributed_runtime/rpc/eager/BUILD | 1 + .../rpc/eager/grpc_eager_client.cc | 14 +- .../rpc/rpc_rendezvous_mgr.cc | 14 ++ 10 files changed, 305 insertions(+), 41 deletions(-) diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index fe4d5ac6ffe..0180b4bdee2 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -357,10 +357,13 @@ tf_cuda_cc_test( ":c_api_test_util", ":tfe_tensorhandle_internal", "//tensorflow/c:c_test_util", + "//tensorflow/core:framework", + "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime:function_optimization_registry", "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "@com_google_absl//absl/strings", diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index d04e4ef4212..93d830d2c90 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -19,11 +19,16 @@ limitations under the License. #include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/function_optimization_registry.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/cluster.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" namespace { @@ -574,6 +579,181 @@ TEST(CAPI, TestRemoteFunctionWithPackedInput) { TestFunctionWithPackedInput(/*remote=*/true); } +string VariableAddFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'VariableAddFunction'" + " input_arg {" + " name: 'var0'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'var0_value'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read0:value:0'" + " device: '/job:localhost/task:1/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'identity'" + " op: 'Identity'" + " input: 'add:z:0'" + " device: '/job:localhost/task:0/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'var0_value'" + " value: 'identity:output:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +class FunctionErrorInjectionPass : public 
tensorflow::FunctionOptimizationPass { + public: + FunctionErrorInjectionPass(string error_node, string error_device) + : error_node_(error_node), error_device_(error_device) {} + tensorflow::Status Run(const tensorflow::DeviceSet& device_set, + const tensorflow::ConfigProto& config_proto, + std::unique_ptr* graph, + tensorflow::FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) override { + // Inject failure to function instantiation if finding a node that contains + // the given node name (error_node_) and requested device (error_device_). + for (const auto node : graph->get()->nodes()) { + if (node->name().find(error_node_) != string::npos && + node->requested_device() == error_device_) { + return tensorflow::errors::Internal("Injected graph pass error."); + } + } + return tensorflow::Status::OK(); + } + + private: + const string error_node_; + const string error_device_; +}; + +void TestDistributedFunctionCancellation(bool inject_error) { + tensorflow::ServerDef server_def = GetServerDef(3); + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + const char dev2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + if (inject_error) { + // Inject a function optimization pass failure when it sees the 'read0' op + // having a requested device `dev2_name`. 
During execution: + // * task:0 processes the main function `VariableAddFunction` and places + // the read0 op on task:2 + // * task:0 partitions the main function with a subgraph containing read0 + // sent to task:2 + // * task:2 graph pass reports an error when it sees read0 with dev2_name + tensorflow::function_optimization_registration:: + FunctionOptimizationPassRegistration register_test_pass( + std::make_unique("read0", dev2_name)); + } + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); + EXPECT_NE(var_handle, nullptr); + + const string function_def = VariableAddFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "VariableAddFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, var_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + + if (inject_error) { + ASSERT_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status); + } else { + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + ASSERT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + ASSERT_EQ(sum, 4.0); + } + + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(var_handle); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, DistributedFunctionNoError) { + TestDistributedFunctionCancellation(false); +} + +TEST(CAPI, DistributedFunctionCancelledOnError) { + TestDistributedFunctionCancellation(true); +} + void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index 271169f2a5e..364750b6679 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -31,6 +31,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/common_runtime/rendezvous_util.h" #include "tensorflow/core/common_runtime/replicate_per_replica_nodes.h" +#include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/op_kernel.h" @@ -230,7 +231,7 @@ FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR( Device* device = nullptr; if (device_name != kDefaultFLRDevice) { if (!device_mgr_->LookupDevice(device_name, &device).ok()) { - VLOG(1) << "Could not find device: " << device_name; + VLOG(4) << "Could not find device: " << device_name; return nullptr; } } @@ -1046,7 +1047,37 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( return; } - auto* refcounted_done = new ReffedStatusCallback(std::move(done)); + // A locally created cancellation manager, used only when the caller does not + // provide one in argument. + std::shared_ptr local_cm; + CancellationManager* cm = opts.cancellation_manager; + if (cm == nullptr) { + local_cm = std::make_shared(); + cm = local_cm.get(); + } + auto token = cm->get_cancellation_token(); + const auto cancelled_error = errors::Cancelled( + "ProcessFunctionLibraryRuntime::RunMultiDevice was cancelled."); + const bool already_cancelled = !cm->RegisterCallback( + token, + [rendez = opts.rendezvous, n_func = data->glue_.size(), cancelled_error] { + // Abort rendezvous only if there are more than one component functions + // to avoid reporting cancellation error directly to PartitionedCallOps + // that launch a single component function. + if (rendez && n_func > 1) { + rendez->StartAbort(cancelled_error); + } + }); + if (already_cancelled) { + done(cancelled_error); + return; + } + + auto* refcounted_done = new ReffedStatusCallback( + [cm, token, local_cm, done = std::move(done)](const Status& s) { + cm->TryDeregisterCallback(token); + done(s); + }); for (int i = 0; i < data->glue_.size(); ++i) { refcounted_done->Ref(); } @@ -1059,7 +1090,7 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( opts_copy.args_alloc_attrs = comp_data.arg_alloc_attrs; opts_copy.rets_alloc_attrs = comp_data.ret_alloc_attrs; - opts_copy.remote_execution = false; + opts_copy.cancellation_manager = cm; InternalArgs comp_args; Status s = get_component_args(comp_data, &comp_args); @@ -1067,13 +1098,39 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( VLOG(2) << "Failed to get component function arguments: " << s; refcounted_done->UpdateStatus(s); refcounted_done->Unref(); + cm->StartCancel(); continue; } std::vector* comp_rets = new std::vector; rets->resize(data->num_outputs_); + auto component_fn_callback = [comp_rets, rets, comp_data, refcounted_done, + cm, local_cm, data, + target](const Status& status) { + if (!status.ok()) { + VLOG(2) << "Component function execution on target " << target + << " failed: " << status; + const string function_and_msg = strings::StrCat( + errors::FormatFunctionForError(data->function_name_), " ", + status.error_message()); + refcounted_done->UpdateStatus(Status(status.code(), function_and_msg)); + // Cancel the execution of other component functions. 
+ cm->StartCancel(); + } else { + VLOG(2) << "Component function execution on target " << target + << " succeeded."; + for (int i = 0; i < comp_rets->size(); ++i) { + (*rets)[comp_data.ret_indices[i]] = (*comp_rets)[i]; + } + } + delete comp_rets; + // refcounted_done is thread-safe + refcounted_done->Unref(); + }; + FunctionLibraryRuntime* flr = GetFLR(target); if (flr != nullptr) { + opts_copy.remote_execution = false; // When target device has private thread pool, use the target device // runner thread::ThreadPool* pool = flr->device()->tensorflow_device_thread_pool(); @@ -1084,24 +1141,7 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( VLOG(4) << " with " << opts_copy.DebugString(); flr->Run(opts_copy, handle, GetLocalArgs(comp_args.args), comp_rets, - [comp_rets, rets, comp_data, refcounted_done, - data](const Status& status) { - if (!status.ok()) { - VLOG(2) << "Component function execution failed: " << status; - const string function_and_msg = strings::StrCat( - errors::FormatFunctionForError(data->function_name_), - " ", status.error_message()); - refcounted_done->UpdateStatus( - Status(status.code(), function_and_msg)); - } else { - for (int i = 0; i < comp_rets->size(); ++i) { - (*rets)[comp_data.ret_indices[i]] = (*comp_rets)[i]; - } - } - delete comp_rets; - // refcounted_done is thread-safe - refcounted_done->Unref(); - }); + std::move(component_fn_callback)); } else { opts_copy.remote_execution = true; @@ -1109,21 +1149,8 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( << " with handle " << handle; VLOG(4) << " with " << opts_copy.DebugString(); - RunInternal( - opts_copy, handle, comp_args.args, comp_rets, cleanup_items, - [comp_rets, rets, comp_data, refcounted_done](const Status& status) { - if (!status.ok()) { - VLOG(2) << "Component function execution failed: " << status; - refcounted_done->UpdateStatus(status); - } else { - for (int i = 0; i < comp_rets->size(); ++i) { - (*rets)[comp_data.ret_indices[i]] = (*comp_rets)[i]; - } - } - delete comp_rets; - // refcounted_done is thread-safe - refcounted_done->Unref(); - }); + RunInternal(opts_copy, handle, comp_args.args, comp_rets, cleanup_items, + std::move(component_fn_callback)); } } refcounted_done->Unref(); diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD index c7fdfa176b1..c27758cbb44 100644 --- a/tensorflow/core/distributed_runtime/eager/BUILD +++ b/tensorflow/core/distributed_runtime/eager/BUILD @@ -42,6 +42,7 @@ cc_library( "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/common_runtime/eager:tensor_handle", + "//tensorflow/core/distributed_runtime:call_options", "//tensorflow/core/distributed_runtime:worker_session", "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", @@ -68,6 +69,7 @@ cc_library( "//tensorflow/core:eager_service_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/distributed_runtime:call_options", ], ) diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc index ec129173833..55f0697d2b4 100644 --- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc +++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc @@ -20,9 +20,11 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/distributed_runtime/call_options.h" #include "tensorflow/core/distributed_runtime/eager/eager_client.h" #include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h" #include "tensorflow/core/distributed_runtime/eager/remote_mgr.h" +#include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/lib/core/errors.h" @@ -189,13 +191,31 @@ void EagerClusterFunctionLibraryRuntime::Run( op->Attrs().FillAttrValueMap(remote_op->mutable_attrs()); remote_op->set_device(function_data->target); + CancellationManager* cm = opts.cancellation_manager; + CancellationToken token = 0; + auto call_opts = std::make_shared(); + if (cm != nullptr) { + token = cm->get_cancellation_token(); + const bool already_cancelled = !cm->RegisterCallback( + token, + [call_opts, request, response, done]() { call_opts->StartCancel(); }); + if (already_cancelled) { + done(errors::Cancelled("EagerClusterFunctionLibraryRuntime::Run")); + return; + } + } + // Execute component function on remote worker using RunComponentFunction RPC. // Different from executing remote functions with Enqueue, this method runs // a function on remote worker without tying up a thread (i.e., pure // asynchronously). eager_client->RunComponentFunctionAsync( - request.get(), response.get(), - [request, response, rets, done = std::move(done)](const Status& s) { + call_opts.get(), request.get(), response.get(), + [request, response, rets, call_opts, cm, token, + done = std::move(done)](const Status& s) { + if (cm != nullptr) { + cm->TryDeregisterCallback(token); + } if (!s.ok()) { done(s); return; diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h index 9ca802d8a72..d6cf0943176 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_client.h +++ b/tensorflow/core/distributed_runtime/eager/eager_client.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ +#include "tensorflow/core/distributed_runtime/call_options.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env.h" @@ -38,12 +39,15 @@ class EagerClient : public core::RefCounted { CLIENT_METHOD(UpdateContext); CLIENT_METHOD(Enqueue); CLIENT_METHOD(WaitQueueDone); - CLIENT_METHOD(RunComponentFunction); CLIENT_METHOD(KeepAlive); CLIENT_METHOD(CloseContext); #undef CLIENT_METHOD + virtual void RunComponentFunctionAsync( + CallOptions* call_opts, const RunComponentFunctionRequest* request, + RunComponentFunctionResponse* response, StatusCallback done) = 0; + // Feeds `request` into the request stream of EagerService::StreamingEnqueue. // `response` will be filled with the response for this `request`. 
The // 1-to-1 correspondence between requests and responses is a property diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 46a6181cfa9..3c537d99a3a 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -90,7 +90,8 @@ class FakeEagerClient : public EagerClient { CLIENT_METHOD(CloseContext); #undef CLIENT_METHOD - void RunComponentFunctionAsync(const RunComponentFunctionRequest* request, + void RunComponentFunctionAsync(CallOptions* call_opts, + const RunComponentFunctionRequest* request, RunComponentFunctionResponse* response, StatusCallback done) override { impl_->RunComponentFunction(request, response, std::move(done)); diff --git a/tensorflow/core/distributed_runtime/rpc/eager/BUILD b/tensorflow/core/distributed_runtime/rpc/eager/BUILD index d7251029d10..c1deabc23cd 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/eager/BUILD @@ -29,6 +29,7 @@ cc_library( "//tensorflow/core:eager_service_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/distributed_runtime:call_options", "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/rpc:grpc_channel", "//tensorflow/core/distributed_runtime/rpc:grpc_client_cq_tag", diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index 752bfdf71a1..c8288f28c36 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h" #include "grpcpp/generic/generic_stub.h" +#include "tensorflow/core/distributed_runtime/call_options.h" #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_state.h" @@ -135,7 +136,6 @@ class GrpcEagerClient : public EagerClient { CLIENT_METHOD(UpdateContext); CLIENT_METHOD(Enqueue); CLIENT_METHOD(WaitQueueDone); - CLIENT_METHOD(RunComponentFunction); CLIENT_METHOD(KeepAlive); #undef CLIENT_METHOD @@ -164,6 +164,18 @@ class GrpcEagerClient : public EagerClient { } } + void RunComponentFunctionAsync(CallOptions* call_opts, + const RunComponentFunctionRequest* request, + RunComponentFunctionResponse* response, + StatusCallback done) override { + StatusCallback done_wrapped = callback_wrapper(std::move(done)); + new RPCState( + &stub_, cq_, "/tensorflow.eager.EagerService/RunComponentFunction", + *request, response, std::move(done_wrapped), call_opts, + /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true, + &target_); + } + void StreamingEnqueueAsync(const EnqueueRequest* request, EnqueueResponse* response, StatusCallback done) override { diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc index 5bb61eb8cc1..b973421efa4 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc @@ -144,6 +144,20 @@ class RpcRecvTensorCall : public BaseRecvTensorCall { recv_done(); }; wi_->RecvTensorAsync(&opts_, &req_, &resp_, std::move(cb)); + + // NOTE: Check if the rendezvous was aborted after sending out the RPC. The + // ordering is important because `StartAbort` could be called right before + // the `RecvTensorAsync` request registers its RPC cancellation to `opts_`. + // In that case, the previous `StartAbort` would not trigger the + // cancellation of this call. + Status s; + { + mutex_lock l(mu_); + s = status_; + } + if (!s.ok()) { + opts_.StartCancel(); + } } string src_worker_; From cf35170ceaebd332683582eb08b4315708b55f76 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 14:47:11 -0700 Subject: [PATCH 0698/1533] Fixes linkage error on pre-18 Android where GLESv3 is not available With this CL gpu delegate is linkable for Android Apps supporting pre-18 API level. This solution works because tflite gpu delegate only weak-imports OpenGL ES 3+ symbols. However, it may results in a runtime crash if gpu delegate tries to use GLES3 symbols on those devices. A reasonable solution for pre-18 API is refusing to delegate? Two symbols were behaving as strong symbols ("glUnmapBuffer", "glMapBufferRange") because they were defined in a template only class, which would get preprocessed before #define that redefines GLES symbols into weak symbols. 
PiperOrigin-RevId: 311805477 Change-Id: Ia217ebe64a975092a43869ece7d42f64c33bf795 --- tensorflow/lite/delegates/gpu/BUILD | 12 ++++++++++-- tensorflow/lite/delegates/gpu/gl/gl_buffer.cc | 13 +++++++++++++ tensorflow/lite/delegates/gpu/gl/gl_buffer.h | 6 ++---- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index 2581232bc2b..c667c2056f4 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -32,7 +32,11 @@ cc_library( linkopts = select({ "//tensorflow:android": [ "-lEGL", - "-lGLESv3", + # We don't need to link libGLESv3, because if it exists, + # it is a symlink to libGLESv2. + # See Compatibility Definition Document: + # https://source.android.com/compatibility/10/android-10-cdd#7_1_4_1_opengl_es + "-lGLESv2", ], "//conditions:default": [], }), @@ -220,7 +224,11 @@ cc_library( linkopts = select({ "//tensorflow:android": [ "-lEGL", - "-lGLESv3", + # We don't need to link libGLESv3, because if it exists, + # it is a symlink to libGLESv2. + # See Compatibility Definition Document: + # https://source.android.com/compatibility/10/android-10-cdd#7_1_4_1_opengl_es + "-lGLESv2", ], "//conditions:default": [], }), diff --git a/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc b/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc index 1de49676219..344e494690a 100644 --- a/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc +++ b/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc @@ -145,6 +145,19 @@ absl::Status CreatePersistentBuffer(size_t size, return absl::OkStatus(); } +namespace gl_buffer_internal { + +BufferMapper::BufferMapper(GLenum target, size_t offset, size_t bytes, + GLbitfield access) + : target_(target), + data_(glMapBufferRange(target_, offset, bytes, access)) {} + +BufferMapper::~BufferMapper() { + TFLITE_GPU_CALL_GL(glUnmapBuffer, target_).IgnoreError(); +} + +}; // namespace gl_buffer_internal + } // namespace gl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/gl/gl_buffer.h b/tensorflow/lite/delegates/gpu/gl/gl_buffer.h index 3225679ec5a..1877fb1f144 100644 --- a/tensorflow/lite/delegates/gpu/gl/gl_buffer.h +++ b/tensorflow/lite/delegates/gpu/gl/gl_buffer.h @@ -229,11 +229,9 @@ class BufferBinder { // RAII for mapping and unmapping a buffer. class BufferMapper { public: - BufferMapper(GLenum target, size_t offset, size_t bytes, GLbitfield access) - : target_(target), - data_(glMapBufferRange(target_, offset, bytes, access)) {} + BufferMapper(GLenum target, size_t offset, size_t bytes, GLbitfield access); - ~BufferMapper() { TFLITE_GPU_CALL_GL(glUnmapBuffer, target_).IgnoreError(); } + ~BufferMapper(); void* data() { return data_; } From bc96c17ece9424c9d1f3f4d80675a1faadad247d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 14:58:07 -0700 Subject: [PATCH 0699/1533] Handle head extract outside compilation cluster device assignment. 
PiperOrigin-RevId: 311807325 Change-Id: I0155a0d4e1aa62c29d0a58c3539b3eba22e7e85c --- ...extract_head_tail_outside_compilation.mlir | 195 +++++++++++------- ...u_extract_head_tail_outside_compilation.cc | 100 ++++++++- 2 files changed, 218 insertions(+), 77 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index eb67bdcc914..90fa8cff5dc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -2,80 +2,135 @@ // Tests extraction of a outside compiled ops at head of TPU computation. -func @single_head_outside_compilation(%arg0 : tensor) -> () { - // CHECK: tf_device.launch - // CHECK: "tf.A" - // CHECK-NEXT: tf_device.return - // - // CHECK: "tf_device.cluster" - // CHECK: "tf.C" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () - "tf.B"() : () -> () - "tf.C"() : () -> () - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @single_head_outside_compilation + func @single_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: tf_device.launch + // + // CHECK: "tf.A" + // CHECK-NEXT: tf_device.return + // + // CHECK: device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.B"() : () -> () + "tf.C"() : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } } -// CHECK-LABEL: func @multiple_head_outside_compilation -func @multiple_head_outside_compilation(%arg0 : tensor) -> () { - // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" - // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) - // CHECK: "tf.C" - // CHECK-NEXT: tf_device.return %[[B_OUT]] - // - // CHECK: "tf_device.cluster" - // CHECK: "tf.D"(%[[LAUNCH_OUT]]) - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () - "tf.D"(%1) : (tensor) -> () - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @multiple_head_outside_compilation + func @multiple_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK: 
device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.D"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.D"(%1) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } } -// CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle -func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { - // CHECK-NOT: tf_device.launch - // CHECK: "tf_device.cluster" - // CHECK-NEXT: "tf.A" - // CHECK-NEXT: "tf.B" - // CHECK-NEXT: "tf.C" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {} : (tensor) -> (tensor) - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) - "tf.C"(%1) : (tensor) -> () - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle + func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { + // CHECK-NOT: tf_device.launch + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + "tf.C"(%1) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } } -// CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted -func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { - // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" - // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) - // CHECK-NEXT: tf_device.return %[[D_OUT]] - // - // CHECK: "tf_device.cluster" - // CHECK: "tf.B" - // CHECK: "tf.C" - // CHECK: "tf.E" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %1 = "tf.B"() {} : () -> (tensor) - %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) - %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) - %4 = "tf.E"(%3) {} : (tensor) -> (tensor) - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted + func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: 
tf_device.return %[[D_OUT]] + // CHECK: device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.B" + // CHECK: "tf.C" + // CHECK: "tf.E" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"() {} : () -> (tensor) + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %4 = "tf.E"(%3) {} : (tensor) -> (tensor) + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } +} + +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_replicated_head_outside_compilation + func @test_replicated_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: tf_device.return %[[D_OUT]] + // CHECK: device + // CHECK-SAME: "TPU_REPLICATED_HOST" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.B" + // CHECK: "tf.C" + // CHECK: "tf.E" + // CHECK-NEXT: tf_device.return + tf_device.replicate() {n = 2 : i32} { + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"() {} : () -> (tensor) + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %4 = "tf.E"(%3) {} : (tensor) -> (tensor) + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + tf_device.return + } + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index b9e214470cd..02d0c3e849b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -20,17 +20,22 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" namespace mlir { namespace TFTPU { @@ -188,6 +193,82 @@ llvm::Optional IsolateHeadExtractedOpsToLaunchOp( return host_launch_op; } +// Parses TPU compilation and execution device form tpu cluster and assigns +// host device to `host_launch` device attribute. +LogicalResult SetCompilationDeviceToHostLaunch( + OpBuilder* builder, mlir::TF::RuntimeDevices devices, + tf_device::ClusterOp tpu_cluster, tf_device::LaunchOp host_launch) { + auto num_cores_per_replica_attr = tpu_cluster.getAttrOfType( + tensorflow::kNumCoresPerReplicaAttr); + if (!num_cores_per_replica_attr) + return tpu_cluster.emitOpError( + "cluster op missing `num_cores_per_replica` attribute"); + + if (num_cores_per_replica_attr.getInt() != 1) + return tpu_cluster.emitOpError( + "outside compilation is not supported with model parallelism."); + + auto topology_attr = + tpu_cluster.getAttrOfType(tensorflow::kTopologyAttr); + if (!topology_attr) + return tpu_cluster.emitOpError("cluster op missing `topology` attribute"); + + auto device_assignment_attr = tpu_cluster.getAttrOfType( + tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return tpu_cluster.emitOpError( + llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + + if (!status_or_device_coodinates.ok()) + return tpu_cluster.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); + + // Determine compilation and execution devices. + auto status_or_tpu_device_assignment = + tensorflow::GetTPUCompilationAndExecutionDevices( + devices.device_names(), /*num_replicas=*/1, + /*num_cores_per_replica=*/1, topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); + if (!status_or_tpu_device_assignment.ok()) + return tpu_cluster.emitError() + << "error in fetching TPU compilation/execution devices: " + << status_or_tpu_device_assignment.status().error_message(); + auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); + host_launch.deviceAttr( + builder->getStringAttr(tpu_device_assignment.tpu_devices[0][0].host)); + + return success(); +} + +// Assigns host device attribute to host launch op or enclosing +// tf_device.replicate op if TPU computation is replicated. 
+LogicalResult HandleHostLaunchDeviceAssignment( + OpBuilder* builder, mlir::TF::RuntimeDevices devices, + tf_device::ClusterOp tpu_cluster, tf_device::LaunchOp host_launch) { + auto parent_replicate_op = + llvm::dyn_cast_or_null(host_launch.getParentOp()); + // If computation is replicated, then add TPU_REPLICATED_HOST device alias + // to the host launch op. This device alias would later be a reference to + // host device string in the device map of tf_device.replicate op + // during tpu_rewrite pass. + if (parent_replicate_op) { + host_launch.deviceAttr( + builder->getStringAttr(tensorflow::kTPUReplicatedHost)); + } else { + if (failed(SetCompilationDeviceToHostLaunch(builder, devices, tpu_cluster, + host_launch))) + return failure(); + } + + return success(); +} + struct TPUExtractHeadTailOutsideCompilation : public PassWrapper> { @@ -202,17 +283,22 @@ void TPUExtractHeadTailOutsideCompilation::runOnOperation() { return signalPassFailure(); OpBuilder builder(&getContext()); - module.walk([&](tf_device::ClusterOp cluster) { + auto result = module.walk([&](tf_device::ClusterOp cluster) { auto head_outside_compiled_ops = IdentifyOutsideCompiledOpsAtHead(cluster); - IsolateHeadExtractedOpsToLaunchOp(&builder, cluster, - head_outside_compiled_ops); - - // TODO(b/156030523): Update device attribute of newly created host launch - // op as well as enclosing Replicate op (if TPU computation is replicated) - // with host device names. + auto host_launch_op = IsolateHeadExtractedOpsToLaunchOp( + &builder, cluster, head_outside_compiled_ops); + if (host_launch_op) { + if (failed(HandleHostLaunchDeviceAssignment(&builder, devices, cluster, + *host_launch_op))) { + return WalkResult::interrupt(); + } + } // TODO(b/155115766): Implement tail outside compiled op extraction. + return WalkResult::advance(); }); + + if (result.wasInterrupted()) signalPassFailure(); } } // anonymous namespace From 0c7e5ac6c9666ab1d9be3076a14ce128c8ef3403 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 15:02:32 -0700 Subject: [PATCH 0700/1533] Enable SVD shape test in eager mode. PiperOrigin-RevId: 311808080 Change-Id: I5d77485f0f17aae4647aca0b4512f231fd1f3290 --- tensorflow/python/kernel_tests/svd_op_test.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py index 6c2199cc591..eae42f55a3f 100644 --- a/tensorflow/python/kernel_tests/svd_op_test.py +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -46,16 +47,16 @@ def _AddTest(test_class, op_name, testcase_name, fn): class SvdOpTest(test.TestCase): - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The input to svd should be a tensor of at least rank 2. scalar = constant_op.constant(1.) 
- with self.assertRaisesRegexp(ValueError, - "Shape must be at least rank 2 but is rank 0"): + with self.assertRaisesRegexp((ValueError, errors_impl.InvalidArgumentError), + "rank.* 2.*0"): linalg_ops.svd(scalar) vector = constant_op.constant([1., 2.]) - with self.assertRaisesRegexp(ValueError, - "Shape must be at least rank 2 but is rank 1"): + with self.assertRaisesRegexp((ValueError, errors_impl.InvalidArgumentError), + "rank.* 2.*1"): linalg_ops.svd(vector) @test_util.run_in_graph_and_eager_modes(use_gpu=True) From f1471bd25c127d5352eeb6c61c832c35b129e280 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 15 May 2020 15:03:35 -0700 Subject: [PATCH 0701/1533] Include what you use for the micro_framework bazel target. PiperOrigin-RevId: 311808278 Change-Id: I2869b7c191c71461d44edc77dc1cd3999c6376d9 --- tensorflow/lite/kernels/op_macros.h | 1 + tensorflow/lite/micro/BUILD | 2 ++ tensorflow/lite/micro/memory_helpers.cc | 4 ++++ tensorflow/lite/micro/memory_helpers.h | 3 +++ tensorflow/lite/micro/micro_allocator.cc | 2 ++ tensorflow/lite/micro/micro_allocator.h | 6 +++++- tensorflow/lite/micro/micro_error_reporter.cc | 3 +++ tensorflow/lite/micro/micro_error_reporter.h | 3 ++- tensorflow/lite/micro/micro_interpreter.cc | 10 +++++++--- tensorflow/lite/micro/micro_interpreter.h | 4 ++++ tensorflow/lite/micro/micro_mutable_op_resolver.h | 3 +++ tensorflow/lite/micro/micro_optional_debug_tools.cc | 9 +++++++++ tensorflow/lite/micro/simple_memory_allocator.cc | 1 + tensorflow/lite/micro/simple_memory_allocator.h | 2 +- tensorflow/lite/micro/test_helpers.cc | 7 ++++++- tensorflow/lite/micro/test_helpers.h | 4 +++- 16 files changed, 56 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/kernels/op_macros.h b/tensorflow/lite/kernels/op_macros.h index 33d033b10b6..8c1a6b1be16 100644 --- a/tensorflow/lite/kernels/op_macros.h +++ b/tensorflow/lite/kernels/op_macros.h @@ -19,6 +19,7 @@ limitations under the License. // non-portable function. #ifdef TF_LITE_MCU_DEBUG_LOG +#include "tensorflow/lite/micro/debug_log.h" #include "tensorflow/lite/micro/micro_error_reporter.h" #define DEBUG_LOG(x) \ diff --git a/tensorflow/lite/micro/BUILD b/tensorflow/lite/micro/BUILD index 5742a383b0f..67471bc64a6 100644 --- a/tensorflow/lite/micro/BUILD +++ b/tensorflow/lite/micro/BUILD @@ -56,8 +56,10 @@ cc_library( "//tensorflow/lite/core/api", "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/micro/memory_planner", "//tensorflow/lite/micro/memory_planner:greedy_memory_planner", "//tensorflow/lite/schema:schema_fbs", + "@flatbuffers//:runtime_cc", ], ) diff --git a/tensorflow/lite/micro/memory_helpers.cc b/tensorflow/lite/micro/memory_helpers.cc index 302f160a235..c1b761bf088 100644 --- a/tensorflow/lite/micro/memory_helpers.cc +++ b/tensorflow/lite/micro/memory_helpers.cc @@ -15,8 +15,12 @@ limitations under the License. #include "tensorflow/lite/micro/memory_helpers.h" +#include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" namespace tflite { diff --git a/tensorflow/lite/micro/memory_helpers.h b/tensorflow/lite/micro/memory_helpers.h index ef8205c8038..f52da062271 100644 --- a/tensorflow/lite/micro/memory_helpers.h +++ b/tensorflow/lite/micro/memory_helpers.h @@ -15,6 +15,9 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_MICRO_MEMORY_HELPERS_H_ #define TENSORFLOW_LITE_MICRO_MEMORY_HELPERS_H_ +#include +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 54ce3383a08..1dd1fa4b63c 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" @@ -26,6 +27,7 @@ limitations under the License. #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/memory_planner/greedy_memory_planner.h" +#include "tensorflow/lite/micro/memory_planner/memory_planner.h" #include "tensorflow/lite/micro/simple_memory_allocator.h" namespace tflite { diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index 6a6e1e03e53..d05974f365a 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -15,9 +15,13 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_MICRO_ALLOCATOR_H_ #define TENSORFLOW_LITE_MICRO_MICRO_ALLOCATOR_H_ +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/micro/simple_memory_allocator.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/lite/micro/micro_error_reporter.cc b/tensorflow/lite/micro/micro_error_reporter.cc index bea3dc8db4c..6d8361cd25a 100644 --- a/tensorflow/lite/micro/micro_error_reporter.cc +++ b/tensorflow/lite/micro/micro_error_reporter.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/lite/micro/micro_error_reporter.h" +#include + #ifndef TF_LITE_STRIP_ERROR_STRINGS +#include "tensorflow/lite/micro/debug_log.h" #include "tensorflow/lite/micro/micro_string.h" #endif diff --git a/tensorflow/lite/micro/micro_error_reporter.h b/tensorflow/lite/micro/micro_error_reporter.h index b18c47f4ecb..e2c073a465d 100644 --- a/tensorflow/lite/micro/micro_error_reporter.h +++ b/tensorflow/lite/micro/micro_error_reporter.h @@ -15,9 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_MICRO_ERROR_REPORTER_H_ #define TENSORFLOW_LITE_MICRO_MICRO_ERROR_REPORTER_H_ +#include + #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/micro/compatibility.h" -#include "tensorflow/lite/micro/debug_log.h" namespace tflite { diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index 2d774d0a139..b46f9ecb9ea 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -14,12 +14,16 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/micro/micro_interpreter.h" +#include +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/core/api/tensor_utils.h" -#include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/micro_allocator.h" -#include "tensorflow/lite/micro/micro_optional_debug_tools.h" namespace tflite { namespace { diff --git a/tensorflow/lite/micro/micro_interpreter.h b/tensorflow/lite/micro/micro_interpreter.h index 15f53b681a6..180a557668e 100644 --- a/tensorflow/lite/micro/micro_interpreter.h +++ b/tensorflow/lite/micro/micro_interpreter.h @@ -15,6 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_MICRO_INTERPRETER_H_ #define TENSORFLOW_LITE_MICRO_MICRO_INTERPRETER_H_ +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/op_resolver.h" diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index ead9be490a3..6c3e9a3331e 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -15,7 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_ #define TENSORFLOW_LITE_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_ +#include + #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/lite/micro/micro_optional_debug_tools.cc b/tensorflow/lite/micro/micro_optional_debug_tools.cc index 42c42aea9f8..daa5d007cdf 100644 --- a/tensorflow/lite/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/micro/micro_optional_debug_tools.cc @@ -20,8 +20,17 @@ limitations under the License. #endif #include +#include +#include +#include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_interpreter.h" #include "tensorflow/lite/schema/schema_generated.h" + namespace tflite { namespace { diff --git a/tensorflow/lite/micro/simple_memory_allocator.cc b/tensorflow/lite/micro/simple_memory_allocator.cc index be7c469529e..911e1e404f7 100644 --- a/tensorflow/lite/micro/simple_memory_allocator.cc +++ b/tensorflow/lite/micro/simple_memory_allocator.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/micro/memory_helpers.h" namespace tflite { diff --git a/tensorflow/lite/micro/simple_memory_allocator.h b/tensorflow/lite/micro/simple_memory_allocator.h index ed73104a2c6..223ef8398a4 100644 --- a/tensorflow/lite/micro/simple_memory_allocator.h +++ b/tensorflow/lite/micro/simple_memory_allocator.h @@ -16,9 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_MICRO_SIMPLE_MEMORY_ALLOCATOR_H_ #define TENSORFLOW_LITE_MICRO_SIMPLE_MEMORY_ALLOCATOR_H_ +#include #include -#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" namespace tflite { diff --git a/tensorflow/lite/micro/test_helpers.cc b/tensorflow/lite/micro/test_helpers.cc index 77a1cc82f3b..c2607cd32c6 100644 --- a/tensorflow/lite/micro/test_helpers.cc +++ b/tensorflow/lite/micro/test_helpers.cc @@ -15,10 +15,15 @@ limitations under the License. #include "tensorflow/lite/micro/test_helpers.h" +#include +#include +#include #include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/tensor_utils.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/lite/micro/test_helpers.h b/tensorflow/lite/micro/test_helpers.h index 010e1f9e336..2d1d2895db0 100644 --- a/tensorflow/lite/micro/test_helpers.h +++ b/tensorflow/lite/micro/test_helpers.h @@ -18,8 +18,10 @@ limitations under the License. // Useful functions for writing tests. +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/schema/schema_generated.h" From 8f1e715482accc94f9859954ed8b334c88c2b0cb Mon Sep 17 00:00:00 2001 From: Michael Gester Date: Fri, 15 May 2020 15:04:55 -0700 Subject: [PATCH 0702/1533] Remove debug message PiperOrigin-RevId: 311808620 Change-Id: I3c1ded522e5e2a9487ee9b1c2307d5e72820c9e6 --- tensorflow/compiler/mlir/tensorflow/translate/import_model.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index a613ce1f920..37bbbbe5ee4 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -978,7 +978,6 @@ StatusOr ImporterBase::InferOutputType(const Node& node, int idx, if (dtype == DT_RESOURCE) { const AttrValue* dtype_attr = node.attrs().Find("_handle_dtypes"); const AttrValue* shape_attr = node.attrs().Find("_handle_shapes"); - LOG(INFO) << dtype_attr << " " << shape_attr; if (dtype_attr && shape_attr) { if (dtype_attr->list().type().empty()) { return errors::InvalidArgument( From 40e0712354815c9b4fd695e3ff7c231e55abb64c Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 15 May 2020 15:10:59 -0700 Subject: [PATCH 0703/1533] [tf.lite] Avoid designated initializers PiperOrigin-RevId: 311809643 Change-Id: I35af646b5e84d9ae7b25aa3cd52ae6b2eb5f0298 --- tensorflow/lite/delegates/nnapi/nnapi_delegate.cc | 7 ++++--- tensorflow/lite/delegates/nnapi/nnapi_delegate.h | 7 ++++--- tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc | 5 ++++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 39ab19aed2d..b3967800b44 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -4019,6 +4019,8 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( using 
::tflite::delegate::nnapi::NNAPIDelegateKernel; +StatefulNnApiDelegate::Data::Data(const NnApi* nnapi) : nnapi(nnapi) {} + StatefulNnApiDelegate::Data::~Data() { std::for_each(std::begin(delegate_state_cache), std::end(delegate_state_cache), @@ -4056,9 +4058,7 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(Options options) StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, Options options) - : TfLiteDelegate(TfLiteDelegateCreate()), - delegate_data_(Data{.execution_preference = options.execution_preference, - .nnapi = nnapi}) { + : TfLiteDelegate(TfLiteDelegateCreate()), delegate_data_(nnapi) { if (options.accelerator_name) { delegate_data_.accelerator_name = options.accelerator_name; } @@ -4068,6 +4068,7 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, if (options.model_token) { delegate_data_.model_token = options.model_token; } + delegate_data_.execution_preference = options.execution_preference; delegate_data_.disallow_nnapi_cpu = options.disallow_nnapi_cpu; delegate_data_.max_number_delegated_partitions = options.max_number_delegated_partitions; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index 68c55e1aef4..7ef02bc5107 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -181,8 +181,6 @@ class StatefulNnApiDelegate : public TfLiteDelegate { private: // Encapsulates all delegate data. struct Data { - // Preferred Power/perf trade-off. - Options::ExecutionPreference execution_preference; // Pointer to NNAPI implementation to be used by this delegate as // set when building the StatefulNnApiDelegate instance. // Will generally be the NnApiInstance() singleton but can be overridden @@ -190,6 +188,8 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // The ownership of the nnapi instance is left to the caller of // the StatefulNnApiDelegate constructor. const NnApi* nnapi; + // Preferred Power/perf trade-off. + Options::ExecutionPreference execution_preference; // Selected NNAPI accelerator name. std::string accelerator_name; // The cache dir for NNAPI model. @@ -202,7 +202,7 @@ class StatefulNnApiDelegate : public TfLiteDelegate { std::vector tensor_memory_map; // Contains a non zero value if any NNAPI method call // operation returned a non zero result code. - int nnapi_errno; + int nnapi_errno = ANEURALNETWORKS_NO_ERROR; // Cache of kernels already built in StatefulNnApiDelegate::DoPrepare // when trying to understand if all nodes are supported by the target // accelerators. @@ -226,6 +226,7 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // the execution uint64_t max_execution_loop_timeout_duration_ns = 0; + explicit Data(const NnApi* nnapi); ~Data(); // Caches an initialised NNAPIDelegateKernel. 
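For context on the constructor added above: designated initializers such as `Data{.execution_preference = ..., .nnapi = nnapi}` are standard C++ only from C++20 onward, so the removed form relied on a compiler extension. A minimal sketch of the replacement pattern, with field types simplified for illustration:

struct Data {
  const NnApi* nnapi;            // set through the explicit constructor
  int execution_preference = 0;  // simplified type; assigned after construction

  explicit Data(const NnApi* nnapi) : nnapi(nnapi) {}
};

// Usage sketch: construct with the pointer, then assign remaining options.
//   Data delegate_data(nnapi);
//   delegate_data.execution_preference = options.execution_preference;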
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc index 3c23054ea25..2bc7ae58449 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc @@ -27,7 +27,8 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(Options /* options */) : StatefulNnApiDelegate() {} StatefulNnApiDelegate::StatefulNnApiDelegate() - : TfLiteDelegate(TfLiteDelegateCreate()) { + : TfLiteDelegate(TfLiteDelegateCreate()), + delegate_data_(/*nnapi=*/nullptr) { Prepare = DoPrepare; } @@ -46,6 +47,8 @@ int StatefulNnApiDelegate::GetNnApiErrno() const { return 0; } using ::tflite::delegate::nnapi::NNAPIDelegateKernel; +StatefulNnApiDelegate::Data::Data(const NnApi* nnapi) : nnapi(nnapi) {} + StatefulNnApiDelegate::Data::~Data() {} void StatefulNnApiDelegate::Data::CacheDelegateKernel( From f7d5cb929b63ac6717f294f710cd235b5ec4ef75 Mon Sep 17 00:00:00 2001 From: Michael Gester Date: Fri, 15 May 2020 15:25:26 -0700 Subject: [PATCH 0704/1533] Print message about dumped MLIR modules PiperOrigin-RevId: 311811882 Change-Id: I6c85e75c87d3ca413631927d11aff61f7ed9b39f --- tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc index 06805e633e2..d7b511094d3 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc @@ -38,6 +38,7 @@ inline static void Log(BridgeLoggerConfig::PrintCallbackFn print_callback, std::unique_ptr os; std::string filepath; if (CreateFileForDumping(name, &os, &filepath).ok()) print_callback(*os); + VLOG(1) << "Dumped MLIR module to " << filepath; } void BridgeLoggerConfig::printBeforeIfEnabled(mlir::Pass* pass, From b6284742e41f0ce702e5a5bdefb18795f559568e Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 15 May 2020 15:40:20 -0700 Subject: [PATCH 0705/1533] Converting some of the dependencies in tensorflow/c/BUILD to use portable_tensorflow_lib_lite_no_runtime. 
PiperOrigin-RevId: 311814443 Change-Id: I42e75403c81babba32d4b9bb99ab4eed21e6ba44 --- tensorflow/c/BUILD | 10 ++++++++-- tensorflow/core/platform/default/BUILD | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 05d5f9a3ed2..e2781afc3e5 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -394,8 +394,14 @@ tf_cuda_library( deps = [ ":tf_status", ":tf_status_internal", - "//tensorflow/core:lib", - ], + ] + select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + ], + "//conditions:default": [ + "//tensorflow/core:lib", + ], + }), ) tf_cc_test( diff --git a/tensorflow/core/platform/default/BUILD b/tensorflow/core/platform/default/BUILD index 49318fd0811..89231b0f206 100644 --- a/tensorflow/core/platform/default/BUILD +++ b/tensorflow/core/platform/default/BUILD @@ -509,6 +509,7 @@ filegroup( filegroup( name = "mobile_srcs_no_runtime", srcs = [ + "casts.h", "context.h", "dynamic_annotations.h", "env.cc", From ec52e0fcd3107c060ee116781c73e1cad4d19219 Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Fri, 15 May 2020 15:52:55 -0700 Subject: [PATCH 0706/1533] added missing CHECK, whitespace cleanup PiperOrigin-RevId: 311816309 Change-Id: I4181bf3d82e82f4ee60f2a894f487c19522490d1 --- tensorflow/python/tf_program/tests/mlir_gen_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/tf_program/tests/mlir_gen_test.py b/tensorflow/python/tf_program/tests/mlir_gen_test.py index 5e1ca5b36e0..49737352d73 100644 --- a/tensorflow/python/tf_program/tests/mlir_gen_test.py +++ b/tensorflow/python/tf_program/tests/mlir_gen_test.py @@ -83,7 +83,7 @@ class MLIRGenTest(MLIRGenTestBase): CHECK-LABEL: func @test_fn(%arg0: i1, %arg1: i1) -> i1 CHECK: %[[r0:[0-9]+]] = "tfp.And"(%arg0, %arg0, %arg1) : (i1, i1, i1) -> tensor<*xi1> CHECK: %[[r1:[0-9]+]] = "tfp.Or"(%arg0, %arg1, %[[r0]]) : (i1, i1, tensor<*xi1>) -> tensor<*xi1> - return %[[r1]] : tensor<*xi1> + CHECK: return %[[r1]] : tensor<*xi1> """ self._check_code(mlir_code, exp_mlir_code) @@ -158,7 +158,7 @@ class MLIRGenTest(MLIRGenTestBase): mlir_code = mlir_gen(test_fn) exp_mlir_code = r""" CHECK-LABEL: func @test_fn(%arg0: tensor<*xi32>) -> i32 - + CHECK: %[[r1:[0-9]+]] = "tf.Greater"(%arg0, %{{[0-9]+}}) : (tensor<*xi32>, tensor) -> tensor<*xi1> CHECK-NEXT: %[[r2:[0-9]+]] = "tfp.If"(%[[r1]]) ( { CHECK: return %{{[0-9]+}} : tensor @@ -222,7 +222,7 @@ class MLIRGenTest(MLIRGenTestBase): CHECK: %[[r5:[0-9]+]] = "tf.Equal"(%arg0, %{{[0-9]+}}) {incompatible_shape_error = true} : (tensor<*xi32>, tensor) -> tensor<*xi1> CHECK: %[[r7:[0-9]+]] = "tf.Equal"(%arg0, %{{[0-9]+}}) {incompatible_shape_error = true} : (tensor<*xi32>, tensor) -> tensor<*xi1> CHECK: %[[r8:[0-9]+]] = "tfp.Or"(%[[r5]], %[[r7]]) : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> - + CHECK: %[[r9:[0-9]+]]:4 = "tfp.If"(%[[r8]]) ( { CHECK-NEXT: return %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32> CHECK-NEXT: }, { From da27ac6878d739eed3d3bebdaf9be260c47df14a Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Fri, 15 May 2020 15:54:21 -0700 Subject: [PATCH 0707/1533] Support int16 quantization type This patch is just changing a hard-coded 8 bits setting to be configured by the inference type. 
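The diff that follows replaces a hard-coded 8 with a width looked up from the configured inference type. As a rough illustration only (this helper is hypothetical, not the actual QuantizationSpecs::GetQuantizationTypeWidth implementation), the mapping amounts to:

#include "tensorflow/core/framework/types.pb.h"

// Hypothetical sketch: derive the quantized bit width from the inference type.
int QuantizationTypeWidth(tensorflow::DataType inference_type) {
  switch (inference_type) {
    case tensorflow::DT_QINT16:
    case tensorflow::DT_QUINT16:
      return 16;
    case tensorflow::DT_QINT8:
    case tensorflow::DT_QUINT8:
    default:
      return 8;
  }
}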
PiperOrigin-RevId: 311816528 Change-Id: I8da61fb0751122e29134d13e5f8200c89980e131 --- .../compiler/mlir/lite/transforms/prepare_quantize.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index a9e10a485bf..87cae3dd957 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -70,6 +70,7 @@ class PrepareQuantizePass : public PassWrapper { public: // Constructor used by the PassRegistration and enforce uint8 quantization. + // This is only used by test. explicit PrepareQuantizePass() { if (quantize_signed) quant_specs_.inference_type = tensorflow::DT_QINT8; @@ -257,15 +258,16 @@ void PrepareQuantizePass::runOnFunction() { // convert all of them to signed. OwningRewritePatternList patterns; bool is_signed = quant_specs_.IsSignedInferenceType(); + int bit_width = quant_specs_.GetQuantizationTypeWidth(); if (is_signed) { patterns.insert>(ctx); // Convert quant stats to int8 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. - patterns.insert(8, false, true, ctx); + patterns.insert(bit_width, false, true, ctx); } else { // Convert quant stats to uint8 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. - patterns.insert(8, false, false, ctx); + patterns.insert(bit_width, false, false, ctx); } applyPatternsAndFoldGreedily(func, patterns); From 55b36215c8b8b2223b079522938bbdde695bcaf5 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 15 May 2020 15:59:38 -0700 Subject: [PATCH 0708/1533] include-what-you-use for lite/core/api:api PiperOrigin-RevId: 311817318 Change-Id: If003542599d8901465dcafc605fd4df1d0c65add --- tensorflow/lite/core/api/BUILD | 1 + tensorflow/lite/core/api/flatbuffer_conversions.cc | 6 +++++- tensorflow/lite/core/api/flatbuffer_conversions.h | 5 ++++- tensorflow/lite/core/api/op_resolver.cc | 4 ++++ tensorflow/lite/core/api/tensor_utils.cc | 2 ++ 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD index 6681a3ed550..419a3b2486d 100644 --- a/tensorflow/lite/core/api/BUILD +++ b/tensorflow/lite/core/api/BUILD @@ -26,6 +26,7 @@ cc_library( deps = [ "//tensorflow/lite/c:common", "//tensorflow/lite/schema:schema_fbs", + "@flatbuffers//:runtime_cc", ], ) diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index 63e04899ca3..c52fc9f690b 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -15,10 +15,14 @@ limitations under the License. #include "tensorflow/lite/core/api/flatbuffer_conversions.h" -#include +#include +#include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.h b/tensorflow/lite/core/api/flatbuffer_conversions.h index d774afe8e85..2feddfaa8e6 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.h +++ b/tensorflow/lite/core/api/flatbuffer_conversions.h @@ -19,9 +19,12 @@ limitations under the License. 
// flatbuffer serialization format into in-memory values that are used by the // runtime API and interpreter. +#include +#include +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { diff --git a/tensorflow/lite/core/api/op_resolver.cc b/tensorflow/lite/core/api/op_resolver.cc index 6424071f371..c239d9ed23e 100644 --- a/tensorflow/lite/core/api/op_resolver.cc +++ b/tensorflow/lite/core/api/op_resolver.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/lite/core/api/op_resolver.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" + namespace tflite { TfLiteStatus GetRegistrationFromOpCode( diff --git a/tensorflow/lite/core/api/tensor_utils.cc b/tensorflow/lite/core/api/tensor_utils.cc index d8d6fc46a18..3aac16b6878 100644 --- a/tensorflow/lite/core/api/tensor_utils.cc +++ b/tensorflow/lite/core/api/tensor_utils.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include "tensorflow/lite/c/common.h" + namespace tflite { TfLiteStatus ResetVariableTensor(TfLiteTensor* tensor) { From eb07fd848a2f0cf45623799f43f372d07ae9a59b Mon Sep 17 00:00:00 2001 From: Yanhua Sun Date: Fri, 15 May 2020 16:02:58 -0700 Subject: [PATCH 0709/1533] Add a TFE_Py_Execute traceme, which is the entrance to TF c++ PiperOrigin-RevId: 311817887 Change-Id: If924b3f3273096c961e6cc24459a620ce3889963 --- tensorflow/python/eager/pywrap_tfe_src.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index 2d96ed57246..639f623bd1a 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -852,6 +852,8 @@ void TFE_Py_ExecuteCancelable(TFE_Context* ctx, const char* device_name, TFE_CancellationManager* cancellation_manager, TFE_OutputTensorHandles* outputs, TF_Status* out_status) { + tensorflow::profiler::TraceMe activity( + "TFE_Py_ExecuteCancelable", tensorflow::profiler::TraceMeLevel::kInfo); TFE_Op* op = GetOp(ctx, op_name, device_name, out_status); auto cleaner = tensorflow::gtl::MakeCleanup([ctx, op] { ReturnOp(ctx, op); }); if (!out_status->status.ok()) return; From 763710df31acf4b5da8f3c27f1bf0dd0ebb50c91 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 16:05:39 -0700 Subject: [PATCH 0710/1533] Update (non-gradient) tests for tf.linalg.qr to also run in eager mode. 
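Before the qr_op_test changes below, a brief note on the TFE_Py_ExecuteCancelable trace added a couple of commits above: profiler::TraceMe is a scoped RAII annotation, so the activity covers the enclosing scope and costs little when no profiler session is active. A minimal usage sketch (the traced function is hypothetical):

#include "tensorflow/core/profiler/lib/traceme.h"

void SomeEntryPoint() {
  // Recorded from construction to end of scope, only while a profiler session
  // is active and this trace level (kInfo here) is enabled.
  tensorflow::profiler::TraceMe activity(
      "SomeEntryPoint", tensorflow::profiler::TraceMeLevel::kInfo);
  // ... work to be measured ...
}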
PiperOrigin-RevId: 311818375 Change-Id: I70d721522f4060f1a4997c271837fbd6f3629e9f --- tensorflow/python/kernel_tests/qr_op_test.py | 61 +++++++++++--------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py index 4e0af934053..b1bbd0aaee3 100644 --- a/tensorflow/python/kernel_tests/qr_op_test.py +++ b/tensorflow/python/kernel_tests/qr_op_test.py @@ -20,9 +20,10 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -30,7 +31,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -45,35 +46,37 @@ def _AddTest(test_class, op_name, testcase_name, fn): class QrOpTest(test.TestCase): - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): - # The input to qr should be a tensor of at least rank 2. + # The input to svd should be a tensor of at least rank 2. scalar = constant_op.constant(1.) - with self.assertRaisesRegexp(ValueError, - "Shape must be at least rank 2 but is rank 0"): + with self.assertRaisesRegexp((ValueError, errors_impl.InvalidArgumentError), + "rank.* 2.*0"): linalg_ops.qr(scalar) vector = constant_op.constant([1., 2.]) - with self.assertRaisesRegexp(ValueError, - "Shape must be at least rank 2 but is rank 1"): + with self.assertRaisesRegexp((ValueError, errors_impl.InvalidArgumentError), + "rank.* 2.*1"): linalg_ops.qr(vector) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - with self.session(use_gpu=True) as sess: - all_ops = [] - for full_matrices_ in True, False: - for rows_ in 4, 5: - for cols_ in 4, 5: - matrix1 = random_ops.random_normal([rows_, cols_], seed=42) - matrix2 = random_ops.random_normal([rows_, cols_], seed=42) - q1, r1 = linalg_ops.qr(matrix1, full_matrices=full_matrices_) - q2, r2 = linalg_ops.qr(matrix2, full_matrices=full_matrices_) - all_ops += [q1, r1, q2, r2] - val = self.evaluate(all_ops) - for i in range(8): - q = 4 * i - self.assertAllClose(val[q], val[q + 2]) # q1 == q2 - self.assertAllClose(val[q + 1], val[q + 3]) # r1 == r2 + seed = [42, 24] + all_ops = [] + for full_matrices_ in True, False: + for rows_ in 4, 5: + for cols_ in 4, 5: + matrix_shape = [rows_, cols_] + matrix1 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed) + matrix2 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed) + self.assertAllEqual(matrix1, matrix2) + q1, r1 = linalg_ops.qr(matrix1, full_matrices=full_matrices_) + q2, r2 = linalg_ops.qr(matrix2, full_matrices=full_matrices_) + all_ops += [q1, q2, r1, r2] + val = self.evaluate(all_ops) + for i in range(0, len(val), 2): + self.assertAllClose(val[i], val[i + 1]) def _GetQrOpTest(dtype_, 
shape_, full_matrices_, use_static_shape_): @@ -121,8 +124,10 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_): tol = 1e-14 self.assertAllClose(identity, xx, atol=tol) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def Test(self): + if not use_static_shape_ and context.executing_eagerly(): + return np.random.seed(1) x_np = np.random.uniform( low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) @@ -131,7 +136,6 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_): low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) - with self.session(use_gpu=True) as sess: if use_static_shape_: x_tf = constant_op.constant(x_np) else: @@ -141,7 +145,8 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_): if use_static_shape_: q_tf_val, r_tf_val = self.evaluate([q_tf, r_tf]) else: - q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np}) + with self.session(use_gpu=True) as sess: + q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np}) q_dims = q_tf_val.shape np_q = np.ndarray(q_dims, dtype_) @@ -266,7 +271,7 @@ if __name__ == "__main__": for full_matrices in False, True: for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): # TF2 does not support placeholders under eager so we skip it - for use_static_shape in set([True, tf2.enabled()]): + for use_static_shape in [True, False]: shape = batch_dims + (rows, cols) name = "%s_%s_full_%s_static_%s" % (dtype.__name__, "_".join(map(str, shape)), From a133be3d31f215d669cfbfdc7df4f28edc99c50a Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 15 May 2020 16:08:10 -0700 Subject: [PATCH 0711/1533] Add TraceMeEncode helper PiperOrigin-RevId: 311818799 Change-Id: I437235c1603a1e5be99a410376801771cfda0c66 --- tensorflow/core/profiler/lib/BUILD | 10 +++ tensorflow/core/profiler/lib/traceme_encode.h | 82 +++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 tensorflow/core/profiler/lib/traceme_encode.h diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 6316fd118fc..0aa1a5d6b67 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -102,6 +102,16 @@ cc_library( ]), ) +cc_library( + name = "traceme_encode", + hdrs = ["traceme_encode.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "annotated_traceme", hdrs = ["annotated_traceme.h"], diff --git a/tensorflow/core/profiler/lib/traceme_encode.h b/tensorflow/core/profiler/lib/traceme_encode.h new file mode 100644 index 00000000000..772f56a2153 --- /dev/null +++ b/tensorflow/core/profiler/lib/traceme_encode.h @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ + +#include <string.h> + +#include <initializer_list> +#include <string> +#include <utility> + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { +namespace profiler { +namespace internal { + +// Copies the contents of str to the address pointed by out. +// Returns the address after the copy. +// REQUIRED: The address range [out, out + str.size()] must have been allocated. +TF_ATTRIBUTE_ALWAYS_INLINE inline char* Append(char* out, + absl::string_view str) { + const size_t str_size = str.size(); + if (str_size > 0) { + memcpy(out, str.data(), str_size); + out += str_size; + } + return out; +} + +} // namespace internal + +// Encodes an event name and arguments into a string stored by TraceMe. +// Use within a lambda to avoid expensive operations when tracing is inactive. +// Example Usage: +// TraceMe trace_me([&name, value1]() { +// return TraceMeEncode(name, {{"key1", value1}, {"key2", 42}}); +// }); +inline std::string TraceMeEncode( + std::string name, + std::initializer_list<std::pair<absl::string_view, absl::AlphaNum>> args) { + if (TF_PREDICT_TRUE(args.size() > 0)) { + const auto old_size = name.size(); + auto new_size = old_size + args.size() * 2 + 1; + for (const auto& arg : args) { + new_size += arg.first.size() + arg.second.size(); + } + name.resize(new_size); + char* const begin = &name[0]; + char* out = begin + old_size; + *out++ = '#'; + for (const auto& arg : args) { + out = internal::Append(out, arg.first); + *out++ = '='; + out = internal::Append(out, arg.second.Piece()); + *out++ = ','; + } + *(out - 1) = '#'; + DCHECK_EQ(out, begin + new_size); + } + return name; +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ From 22608ca0c293ec1a2976dcfdc6f02d5ce2173cb5 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 15 May 2020 16:13:31 -0700 Subject: [PATCH 0712/1533] Update TPU rewrite pass to populate replicated host devices on tf_device.replicate. Replicated host devices under data parallelism may be necessary if outside compilation is present. PiperOrigin-RevId: 311819706 Change-Id: Iad2775559374d481e3b39ba1a8681f660ee6787e --- .../mlir/tensorflow/tests/tpu_rewrite.mlir | 29 ++++++++++--------- .../tensorflow/transforms/tpu_rewrite_pass.cc | 12 ++++++++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index b8a48bbb379..332b46f427f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -747,7 +747,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests simple case of `tf_device.cluster_func` on TPU with replication. +// Tests simple case of `tf_device.cluster_func` on TPU with replication.
Under +// data parallelism replicated host devices are also added to the +// tf_device.replicate module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { // CHECK-LABEL: func @replicated_tpu_cluster_func @@ -758,7 +760,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK-SAME: ([%[[A_OUTPUT]], %[[ARG_0]]] as %[[RI_0:[a-z0-9]*]]: tensor) - // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]} // CHECK-SAME: n = 2 %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[RI_0]]) @@ -1222,7 +1224,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests simple case of `tf_device.cluster_func` on TPU with replication and parallel_execute. +// Tests simple case of `tf_device.cluster_func` on TPU with replication and +// parallel_execute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { // CHECK-LABEL: func @replicated_parallel_tpu_cluster_func @@ -1240,7 +1243,6 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor tf_device.return }, { %4 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor - tf_device.return %4 : tensor }) : () -> (tensor) tf_device.return %3 : tensor @@ -1317,15 +1319,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01" // ----- -// Tests devices are set properly for replicated model parallelism. +// Tests devices are set properly for replicated model parallelism. No +// replicated host device should be present. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @replicated_parallel_execute func @replicated_parallel_execute(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> (tensor<8xi32>, tensor<8xi32>) { // CHECK: tf_device.replicate - // CHECK-SAME: devices = - // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"] - // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"] + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"], TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]} %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} { // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() @@ -1357,8 +1358,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests that inputs are inputs with maximal and replicate sharding are set properly -// for replicated model parallelism. +// Tests that inputs are inputs with maximal and replicate sharding are set +// properly for replicated model parallelism. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @parallel_execute_with_input_with_sharding_configurations @@ -1392,8 +1393,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests devices are set properly for replicated model parallelism with -// outputs to TPU computation placed on logical device 0. +// Tests devices are set properly for replicated model parallelism with outputs +// to TPU computation placed on logical device 0. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @parallel_execute_with_different_outputs @@ -1469,8 +1470,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests inputs are correctly split and fed into TPU computation for -// tiled input sharding. +// Tests inputs are correctly split and fed into TPU computation for tiled input +// sharding. 
// The following OpSharding is used for TPU computation inputs in below test: // Proto debug string: diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 986736a9502..a7ad6a964b9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -437,6 +437,18 @@ void AssignDevicesToReplicate( builder->getStrArrayAttr(devices_by_core))); } + // For data parallelism, also add replicated host devices, as these are + // necessary for outside compilation. + if (num_cores_per_replica == 1) { + llvm::SmallVector hosts; + hosts.reserve(num_replicas); + for (int replica = 0; replica < num_replicas; ++replica) + hosts.push_back(tpu_devices[replica][0].host); + + device_attrs.push_back(builder->getNamedAttr( + tensorflow::kTPUReplicatedHost, builder->getStrArrayAttr(hosts))); + } + replicate.setAttr(kDevicesAttr, builder->getDictionaryAttr(device_attrs)); } From a3746cc77a95db8acea7f7fbd5495fbdf0563139 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 15 May 2020 16:29:51 -0700 Subject: [PATCH 0713/1533] Remove LazyLoader dependency from lite.interpreter The original motivation for using this when loading the native lite.Interpreter deps appears to no longer hold. PiperOrigin-RevId: 311822195 Change-Id: I2a6877dcd65cdc906d025722714fe209c8673d5d --- tensorflow/lite/python/interpreter.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py index ccbba9014c8..04863b12853 100644 --- a/tensorflow/lite/python/interpreter.py +++ b/tensorflow/lite/python/interpreter.py @@ -27,20 +27,8 @@ import numpy as np # pylint: disable=g-import-not-at-top if not __file__.endswith('tflite_runtime/interpreter.py'): # This file is part of tensorflow package. - from tensorflow.python.util.lazy_loader import LazyLoader + from tensorflow.lite.python.interpreter_wrapper import _pywrap_tensorflow_interpreter_wrapper as _interpreter_wrapper from tensorflow.python.util.tf_export import tf_export as _tf_export - - # Lazy load since some of the performance benchmark skylark rules - # break dependencies. Must use double quotes to match code internal rewrite - # rule. - # pylint: disable=g-inconsistent-quotes - _interpreter_wrapper = LazyLoader( - "_interpreter_wrapper", globals(), - "tensorflow.lite.python.interpreter_wrapper." - '_pywrap_tensorflow_interpreter_wrapper') - # pylint: enable=g-inconsistent-quotes - - del LazyLoader else: # This file is part of tflite_runtime package. 
from tflite_runtime import _pywrap_tensorflow_interpreter_wrapper as _interpreter_wrapper From 82519ad18676039327d29b80ed7dd098b61ce415 Mon Sep 17 00:00:00 2001 From: Ajay P Date: Fri, 15 May 2020 23:35:47 +0000 Subject: [PATCH 0714/1533] Fixed tests --- tensorflow/python/ops/custom_gradient.py | 6 +-- tensorflow/python/ops/gradients_test.py | 47 ++++++++++++------------ 2 files changed, 26 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index d57be41c3de..4a375e11554 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -28,12 +28,12 @@ from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import op_selector from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import tf_export -from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients VAR_OP_TYPES = [ @@ -487,7 +487,7 @@ def recompute_grad(f): result = f(*args, **kwargs) @custom_gradient - def grad(*dresult, **grad_kwargs): + def inner_recompute_grad(*dresult, **grad_kwargs): """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" # Gradient calculation for reverse mode autodiff. variables = grad_kwargs.get("variables") @@ -517,7 +517,7 @@ def recompute_grad(f): return (grads[:len(id_args)], grads[len(id_args):]), transpose - return result, grad + return result, inner_recompute_grad return inner diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 9b536136cb5..e1da54e6427 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -1369,9 +1369,8 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): sym_jac_back, num_jac = gradient_checker_v2.compute_gradient(f, inputs, delta=delta) - testcase.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) + self.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) - @test_util.run_in_graph_and_eager_modes def testCustomGradientRecomputeGradHigherOrder(self): @custom_gradient.recompute_grad @@ -1395,8 +1394,8 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): shape=10, trainable=True, ) - - test_input = constant(np.zeros((10, 10), dtype=np.float32)) + self.evaluate(test_var.assign(np.ones([10]))) + test_input = constant(np.ones((10, 10), dtype=np.float32)) grads_re, grads = self._TestFnVariablesGradient(test_input, TestFn, test_input) @@ -1432,24 +1431,24 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): def testFnRecomputeWithScopeGradientTape(self): """Checks that recompute_grad works with var scope and GradientTape.""" - def TestFn(input_t): - with variable_scope.variable_scope("inner_scope"): - test_var = variable_scope.get_variable( - name="test_var", - shape=10, - trainable=True, - ) - return input_t * test_var + def TestFn(input_t, test_var): + return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) with variable_scope.variable_scope( "output_scope", reuse=variable_scope.AUTO_REUSE, use_resource=True): + with variable_scope.variable_scope("inner_scope"): + test_var = 
variable_scope.get_variable( + name="test_var", shape=10, trainable=True, + ) + self.evaluate(test_var.assign(np.ones([10]))) + test_fn_re = custom_gradient.recompute_grad(TestFn) with backprop.GradientTape(persistent=True) as tape: - out_re = test_fn_re(test_input_t) - out = TestFn(test_input_t) + out_re = test_fn_re(test_input_t, test_var) + out = TestFn(test_input_t, test_var) grads_re = tape.gradient(out_re, variables.trainable_variables()) grads = tape.gradient(out, variables.trainable_variables()) @@ -1464,22 +1463,22 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): def testFnRecomputeWithScopeGradients(self): """Checks that recompute_grad works with var scope and gradients(..).""" - def TestFn(input_t): - with variable_scope.variable_scope("inner_scope"): - test_var = variable_scope.get_variable( - name="test_var", - shape=10, - trainable=True, - ) - return input_t * test_var + def TestFn(input_t, test_var): + return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) with variable_scope.variable_scope( "output_scope", reuse=variable_scope.AUTO_REUSE, use_resource=True): + with variable_scope.variable_scope("inner_scope"): + test_var = variable_scope.get_variable( + name="test_var", shape=10, trainable=True, + ) + self.evaluate(test_var.assign(np.ones([10]))) + test_fn_re = custom_gradient.recompute_grad(TestFn) - out_re = test_fn_re(test_input_t) - out = TestFn(test_input_t) + out_re = test_fn_re(test_input_t, test_var) + out = TestFn(test_input_t, test_var) grads_re = gradients.gradients(out_re, variables.trainable_variables()) grads = gradients.gradients(out, variables.trainable_variables()) From c26ac449e0c798e5527f565e95078e42c662952f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 17:18:30 -0700 Subject: [PATCH 0715/1533] Enable tf.linalg.matrix_solve tests in eager mode. 
PiperOrigin-RevId: 311829192 Change-Id: I8d8c0fb2e28c6dd497a99724d4e2bcd78f2d2ed6 --- .../kernel_tests/matrix_solve_op_test.py | 96 ++++++++++--------- 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_op_test.py index 0b6b403210c..bbd909c8e58 100644 --- a/tensorflow/python/kernel_tests/matrix_solve_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_solve_op_test.py @@ -21,14 +21,16 @@ from __future__ import print_function import numpy as np from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -56,19 +58,19 @@ class MatrixSolveOpTest(test.TestCase): a_np = np.tile(a_np, batch_dims + [1, 1]) b = np.tile(b, batch_dims + [1, 1]) np_ans = np.linalg.solve(a_np, b) - for use_placeholder in False, True: - with self.cached_session(use_gpu=True) as sess: - if use_placeholder: - a_ph = array_ops.placeholder(dtypes.as_dtype(np_type)) - b_ph = array_ops.placeholder(dtypes.as_dtype(np_type)) - tf_ans = linalg_ops.matrix_solve(a_ph, b_ph, adjoint=adjoint) + for use_placeholder in set((False, not context.executing_eagerly())): + if use_placeholder: + a_ph = array_ops.placeholder(dtypes.as_dtype(np_type)) + b_ph = array_ops.placeholder(dtypes.as_dtype(np_type)) + tf_ans = linalg_ops.matrix_solve(a_ph, b_ph, adjoint=adjoint) + with self.cached_session(use_gpu=True) as sess: out = sess.run(tf_ans, {a_ph: a, b_ph: b}) - else: - tf_ans = linalg_ops.matrix_solve(a, b, adjoint=adjoint) - out = self.evaluate(tf_ans) - self.assertEqual(tf_ans.get_shape(), out.shape) - self.assertEqual(np_ans.shape, out.shape) - self.assertAllClose(np_ans, out, atol=tol, rtol=tol) + else: + tf_ans = linalg_ops.matrix_solve(a, b, adjoint=adjoint) + out = self.evaluate(tf_ans) + self.assertEqual(tf_ans.get_shape(), out.shape) + self.assertEqual(np_ans.shape, out.shape) + self.assertAllClose(np_ans, out, atol=tol, rtol=tol) def _generateMatrix(self, m, n): matrix = (np.random.normal(-5, 5, @@ -77,7 +79,7 @@ class MatrixSolveOpTest(test.TestCase): [m, n])) return matrix - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testSolve(self): for n in 1, 2, 4, 9: matrix = self._generateMatrix(n, n) @@ -85,7 +87,7 @@ class MatrixSolveOpTest(test.TestCase): rhs = self._generateMatrix(n, nrhs) self._verifySolve(matrix, rhs) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testSolveBatch(self): for n in 2, 5: matrix = self._generateMatrix(n, n) @@ -94,48 +96,50 @@ class MatrixSolveOpTest(test.TestCase): for batch_dims in [[2], [2, 2], [7, 4]]: self._verifySolve(matrix, rhs, batch_dims=batch_dims) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNonSquareMatrix(self): # When the solve of a non-square matrix is attempted we should return # an error - 
with self.session(use_gpu=True): - with self.assertRaises(ValueError): - matrix = constant_op.constant([[1., 2., 3.], [3., 4., 5.]]) - linalg_ops.matrix_solve(matrix, matrix) + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): + matrix = constant_op.constant([[1., 2., 3.], [3., 4., 5.]]) + self.evaluate(linalg_ops.matrix_solve(matrix, matrix)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The matrix and right-hand sides should have the same number of rows. - with self.session(use_gpu=True): - matrix = constant_op.constant([[1., 0.], [0., 1.]]) - rhs = constant_op.constant([[1., 0.]]) - with self.assertRaises(ValueError): - linalg_ops.matrix_solve(matrix, rhs) + matrix = constant_op.constant([[1., 0.], [0., 1.]]) + rhs = constant_op.constant([[1., 0.]]) + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): + self.evaluate(linalg_ops.matrix_solve(matrix, rhs)) def testNotInvertible(self): # The input should be invertible. - with self.session(use_gpu=True): - with self.assertRaisesOpError("Input matrix is not invertible."): - # All rows of the matrix below add to zero - matrix = constant_op.constant([[1., 0., -1.], [-1., 1., 0.], - [0., -1., 1.]]) - linalg_ops.matrix_solve(matrix, matrix).eval() + with self.assertRaisesOpError("Input matrix is not invertible."): + # All rows of the matrix below add to zero + matrix = constant_op.constant([[1., 0., -1.], [-1., 1., 0.], + [0., -1., 1.]]) + self.evaluate(linalg_ops.matrix_solve(matrix, matrix)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrent(self): - with self.session(use_gpu=True) as sess: - all_ops = [] - for adjoint_ in False, True: - lhs1 = random_ops.random_normal([3, 3], seed=42) - lhs2 = random_ops.random_normal([3, 3], seed=42) - rhs1 = random_ops.random_normal([3, 3], seed=42) - rhs2 = random_ops.random_normal([3, 3], seed=42) - s1 = linalg_ops.matrix_solve(lhs1, rhs1, adjoint=adjoint_) - s2 = linalg_ops.matrix_solve(lhs2, rhs2, adjoint=adjoint_) - all_ops += [s1, s2] - val = self.evaluate(all_ops) - self.assertAllEqual(val[0], val[1]) - self.assertAllEqual(val[2], val[3]) + seed = [42, 24] + matrix_shape = [3, 3] + all_ops = [] + for adjoint_ in False, True: + lhs1 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed=seed) + lhs2 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed=seed) + rhs1 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed=seed) + rhs2 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed=seed) + s1 = linalg_ops.matrix_solve(lhs1, rhs1, adjoint=adjoint_) + s2 = linalg_ops.matrix_solve(lhs2, rhs2, adjoint=adjoint_) + all_ops += [s1, s2] + val = self.evaluate(all_ops) + for i in range(0, len(all_ops), 2): + self.assertAllEqual(val[i], val[i + 1]) class MatrixSolveBenchmark(test.Benchmark): From c1ac8f2b817ce772e0da53f017cd662143d8ec38 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 17:31:15 -0700 Subject: [PATCH 0716/1533] Enable tf.linalg.matrix_solve_ls tests in eager mode. 
PiperOrigin-RevId: 311830778 Change-Id: I63aca8ab80b63201b3fe12e5e0af31f5760b3fad --- .../kernel_tests/matrix_solve_ls_op_test.py | 92 ++++++++++--------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py index b99c8f6d256..b7a159e2eff 100644 --- a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py @@ -20,10 +20,11 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -89,6 +90,8 @@ class MatrixSolveLsOpTest(test_lib.TestCase): if not fast and l2_regularizer != 0: # The slow path does not support regularization. return + if use_placeholder and context.executing_eagerly(): + return maxdim = np.max(x.shape) if dtype == np.float32 or dtype == np.complex64: tol = maxdim * 5e-4 @@ -109,64 +112,70 @@ class MatrixSolveLsOpTest(test_lib.TestCase): b = np.tile(b, batch_shape + (1, 1)) np_ans = np.tile(np_ans, batch_shape + (1, 1)) np_r_norm = np.tile(np_r_norm, batch_shape) - with self.cached_session(use_gpu=fast) as sess: - if use_placeholder: - a_ph = array_ops.placeholder(dtypes.as_dtype(dtype)) - b_ph = array_ops.placeholder(dtypes.as_dtype(dtype)) - feed_dict = {a_ph: a, b_ph: b} - tf_ans = linalg_ops.matrix_solve_ls( - a_ph, b_ph, fast=fast, l2_regularizer=l2_regularizer) - else: - tf_ans = linalg_ops.matrix_solve_ls( - a, b, fast=fast, l2_regularizer=l2_regularizer) - feed_dict = {} - self.assertEqual(np_ans.shape, tf_ans.get_shape()) - if l2_regularizer == 0: - # The least squares solution should satisfy A^H * (b - A*x) = 0. - tf_r = b - math_ops.matmul(a, tf_ans) - tf_r = math_ops.matmul(a, tf_r, adjoint_a=True) - tf_r_norm = linalg_ops.norm(tf_r, ord="fro", axis=[-2, -1]) - tf_ans_val, tf_r_norm_val = sess.run( - [tf_ans, tf_r_norm], feed_dict=feed_dict) - self.assertAllClose(np_r_norm, tf_r_norm_val, atol=tol, rtol=tol) - else: + if use_placeholder: + a_ph = array_ops.placeholder(dtypes.as_dtype(dtype)) + b_ph = array_ops.placeholder(dtypes.as_dtype(dtype)) + feed_dict = {a_ph: a, b_ph: b} + tf_ans = linalg_ops.matrix_solve_ls( + a_ph, b_ph, fast=fast, l2_regularizer=l2_regularizer) + else: + tf_ans = linalg_ops.matrix_solve_ls( + a, b, fast=fast, l2_regularizer=l2_regularizer) + feed_dict = None + self.assertEqual(np_ans.shape, tf_ans.get_shape()) + if feed_dict: + with self.session(use_gpu=True) as sess: tf_ans_val = sess.run(tf_ans, feed_dict=feed_dict) - + else: + tf_ans_val = self.evaluate(tf_ans) self.assertEqual(np_ans.shape, tf_ans_val.shape) self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol) - @test_util.run_v1_only("b/120545219") + if l2_regularizer == 0: + # The least squares solution should satisfy A^H * (b - A*x) = 0. 
+ tf_r = b - math_ops.matmul(a, tf_ans) + tf_r = math_ops.matmul(a, tf_r, adjoint_a=True) + tf_r_norm = linalg_ops.norm(tf_r, ord="fro", axis=[-2, -1]) + if feed_dict: + with self.session(use_gpu=True) as sess: + tf_ans_val, tf_r_norm_val = sess.run([tf_ans, tf_r_norm], + feed_dict=feed_dict) + else: + tf_ans_val, tf_r_norm_val = self.evaluate([tf_ans, tf_r_norm]) + self.assertAllClose(np_r_norm, tf_r_norm_val, atol=tol, rtol=tol) + + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The matrix and right-hand sides should have the same number of rows. with self.session(use_gpu=True): matrix = constant_op.constant([[1., 0.], [0., 1.]]) rhs = constant_op.constant([[1., 0.]]) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.matrix_solve_ls(matrix, rhs) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): full = np.array([[1., 2.], [3., 4.], [5., 6.]]) empty0 = np.empty([3, 0]) empty1 = np.empty([0, 2]) for fast in [True, False]: - with self.cached_session(use_gpu=True): - tf_ans = self.evaluate( - linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast)) - self.assertEqual(tf_ans.shape, (0, 0)) - tf_ans = self.evaluate( - linalg_ops.matrix_solve_ls(empty0, full, fast=fast)) - self.assertEqual(tf_ans.shape, (0, 2)) - tf_ans = self.evaluate( - linalg_ops.matrix_solve_ls(full, empty0, fast=fast)) - self.assertEqual(tf_ans.shape, (2, 0)) - tf_ans = self.evaluate( - linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast)) - self.assertEqual(tf_ans.shape, (2, 2)) + tf_ans = self.evaluate( + linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast)) + self.assertEqual(tf_ans.shape, (0, 0)) + tf_ans = self.evaluate( + linalg_ops.matrix_solve_ls(empty0, full, fast=fast)) + self.assertEqual(tf_ans.shape, (0, 2)) + tf_ans = self.evaluate( + linalg_ops.matrix_solve_ls(full, empty0, fast=fast)) + self.assertEqual(tf_ans.shape, (2, 0)) + tf_ans = self.evaluate( + linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast)) + self.assertEqual(tf_ans.shape, (2, 2)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testBatchResultSize(self): # 3x3x3 matrices, 3x3x1 right-hand sides. - matrix = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9.] * 3).reshape(3, 3, 3) + matrix = np.array([1., 0., 0., 0., 1., 0., 0., 0., 1.] * 3).reshape(3, 3, 3) rhs = np.array([1., 2., 3.] 
* 3).reshape(3, 3, 1) answer = linalg_ops.matrix_solve(matrix, rhs) ls_answer = linalg_ops.matrix_solve_ls(matrix, rhs) @@ -358,8 +367,7 @@ if __name__ == "__main__": # ROCm does not support BLAS operations for complex types dtypes_to_test += [np.complex64, np.complex128] for dtype_ in dtypes_to_test: - # TF2 does not support placeholders under eager so we skip it - for use_placeholder_ in set([False, not tf2.enabled()]): + for use_placeholder_ in set([False, True]): for fast_ in [True, False]: l2_regularizers = [0] if dtype_ == np.complex128 else [0, 0.1] for l2_regularizer_ in l2_regularizers: From 96f1bbe90a58b94fe760ff748afa1aff20e16696 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Fri, 15 May 2020 18:04:14 -0700 Subject: [PATCH 0717/1533] [XLA:GPU] [NFC] Add more logging output to explain fusion decisions PiperOrigin-RevId: 311834483 Change-Id: I13a0c23f1da4f7080eff4852b0e470a9d86c26b5 --- tensorflow/compiler/xla/service/gpu/gpu_fusible.cc | 11 +++++++++++ .../compiler/xla/service/gpu/instruction_fusion.cc | 4 ++++ tensorflow/compiler/xla/service/instruction_fusion.cc | 6 +++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 1316e8ad1aa..bb4184ff76f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -351,6 +351,9 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, const HloInstruction& instr2) { if (SharedMemoryUsage(instr1) + SharedMemoryUsage(instr2) > kSharedMemoryBudgetInBytes) { + VLOG(5) << "Shared memory usage of fusion of " << instr1.ToString() + << " and " << instr2.ToString() << " would be over the budget of " + << kSharedMemoryBudgetInBytes << "B"; return true; } @@ -383,6 +386,14 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, num_output_buffers <= kMaxOperandsAndOutputsPerFusion) { return false; + } else { + VLOG(5) << "Operand count of " + << "(" << instr1.ToString() << " ) = " << instr1.operand_count() + << " and ( " << instr2.ToString() + << " ) = " << instr2.operand_count() + << " and num_output_buffers = " << num_output_buffers + << " is bigger than the bound of " + << kMaxOperandsAndOutputsPerFusion; } // Compute the precise number of operands to the new fusion. diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index fc1c1bb4ab1..a0580e2ab04 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -65,12 +65,16 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer, bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { if (!ShouldFuseInexpensiveChecks(consumer, operand_index)) { + VLOG(5) << "Not fusing inexpensive checks of operand " << operand_index + << " of " << consumer->ToString(); return false; } auto producer = consumer->operand(operand_index); // The following checks are potentially expensive. 
if (FusionWouldBeTooLarge(*consumer, *producer)) { + VLOG(5) << "Fusion of (" << producer->ToString() << ") into (" + << consumer->ToString() << ") would be too large"; return false; } if (consumer->opcode() != HloOpcode::kFusion) { diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 1bc3d24274c..5de081c6343 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -502,7 +502,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { while (true) { auto next_entry = fusion_queue->DequeueNextInstructionAndOperandsToFuseInOrder(); - auto instruction = next_entry.first; + HloInstruction* instruction = next_entry.first; if (instruction == nullptr) { break; } @@ -512,12 +512,14 @@ StatusOr InstructionFusion::Run(HloModule* module) { continue; } + VLOG(5) << "Considering fusion of: " << instruction->ToString(); std::vector& sorted_operand_numbers = next_entry.second; for (int64 i : sorted_operand_numbers) { HloInstruction* operand = instruction->mutable_operand(i); if (!operand->IsFusible()) { + VLOG(3) << "Operand (" << operand->ToString() << ") is not fusible"; continue; } @@ -691,6 +693,8 @@ bool InstructionFusion::ShouldFuse(HloInstruction* consumer, if (FusionWouldDuplicate(*producer, *consumer) && (!may_duplicate_ || is_expensive_(*producer)) && !IsAlwaysDuplicable(*producer)) { + VLOG(4) << "Stopping: fusion may duplicate operand (" + << producer->ToString() << ") , and this is expensive"; return false; } From cbc4d5442e946306ef5f2ed88ec1ec3c4c9ec765 Mon Sep 17 00:00:00 2001 From: Mehmet Deveci Date: Fri, 15 May 2020 18:07:38 -0700 Subject: [PATCH 0718/1533] Adding an option to tensor tracer to create a suffix folder based on the fingerprint of the tf.graph. If use_fingerprint_subdirectory is provided, then the TensorTracer summaries will be written under /. If there are changes to the graph, the changes will be listed under different fingerprints. PiperOrigin-RevId: 311834837 Change-Id: I9dfbabfeb7fbe58a2a47c2581474ed86647781dc --- tensorflow/python/tpu/tensor_tracer.proto | 4 +++ tensorflow/python/tpu/tensor_tracer.py | 21 +++++++++++- tensorflow/python/tpu/tensor_tracer_flags.py | 4 ++- tensorflow/python/tpu/tensor_tracer_report.py | 34 +++++++++++++++++-- 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/tpu/tensor_tracer.proto b/tensorflow/python/tpu/tensor_tracer.proto index ad5392d65fe..7b745f0f45b 100644 --- a/tensorflow/python/tpu/tensor_tracer.proto +++ b/tensorflow/python/tpu/tensor_tracer.proto @@ -21,6 +21,10 @@ message TensorTracerReport { // A map from tensor name to its TracedTensorDef. map tensordef = 3; + // The fingerprint of the TensorTracerReport (fingerprint calculation excludes + // this field and graphdef). + string fingerprint = 4; + message TensorTracerConfig { // Tensor tracer version, e.g. hostcall, outside compilation. 
string version = 1; diff --git a/tensorflow/python/tpu/tensor_tracer.py b/tensorflow/python/tpu/tensor_tracer.py index bd96de42f3a..b4f99897094 100644 --- a/tensorflow/python/tpu/tensor_tracer.py +++ b/tensorflow/python/tpu/tensor_tracer.py @@ -100,7 +100,7 @@ _TT_TENSORBOARD_PLUGIN_NAME = 'tensor_tracer' _TT_HOSTCALL_KEY = 'tensor_tracer_host_call' _TT_EVENT_FILE_SUFFIX = '.tensor_tracer' -_TT_SUMMARY_MAX_QUEUE = 100 +_TT_SUMMARY_MAX_QUEUE = 10 def set_parameters(tensor_tracer_params=None): @@ -206,6 +206,9 @@ def set_parameters(tensor_tracer_params=None): -> op2 -> op1 -> op0, if op0 has a NaN and trace_stack_size is 1, the result of op1 will also be printed. trace_stack_size is 2, the result of op1 and op2 will be printed. + - use_fingerprint_subdirectory: The trace directory will be chosen as + using the fingerprint of the trace metadata under the provided + trace_dir. """ flags = '--%s=1' % tensor_tracer_flags.FLAG_NAME_ENABLE if tensor_tracer_params: @@ -547,6 +550,7 @@ class TensorTracer(object): self._traced_op_names = set() self._report_proto = None self._temp_cache_var = [] + self._report_proto_path = '' def report_proto(self): """Getter for tensor_tracer.proto object for summary and full_tensor_summary modes. @@ -564,6 +568,14 @@ class TensorTracer(object): 'Report proto only exists for ' 'trace_mode=[summary|full_tensor_summary]') + def report_proto_path(self): + """Getter for path where tensor_tracer.proto object should be written. + + Returns: + A string path. + """ + return self._report_proto_path + def _get_all_cache_variables(self): return self._cache_variables @@ -1366,6 +1378,13 @@ class TensorTracer(object): self._report_proto = report_handler.create_report_proto( self._tt_config, self._parameters, tensor_trace_order, tensor_trace_points, self._signature_types()) + if self._parameters.use_fingerprint_subdir: + self._parameters.trace_dir = os.path.join( + self._parameters.trace_dir, self._report_proto.fingerprint) + logging.info('TensorTracer updating trace_dir to %s', + self._parameters.trace_dir) + self._report_proto_path = tensor_tracer_report.report_proto_path( + self._parameters.trace_dir) if self._parameters.report_file_path != _SKIP_REPORT_FILE: report_handler.write_report_proto(self._report_proto, self._parameters) else: diff --git a/tensorflow/python/tpu/tensor_tracer_flags.py b/tensorflow/python/tpu/tensor_tracer_flags.py index c5e3e88597b..4e412c46e82 100644 --- a/tensorflow/python/tpu/tensor_tracer_flags.py +++ b/tensorflow/python/tpu/tensor_tracer_flags.py @@ -74,6 +74,7 @@ FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS = 'dump_graphs' FLAG_NAME_SUMMARY_SIGNATURES = 'signatures' FLAG_NAME_SUMMARY_PER_CORE = 'collect_summary_per_core' FLAG_NAME_TEMP_CACHE_VAR = 'use_temp_cache' +FLAG_NAME_FINGERPRINT_DIR = 'use_fingerprint_subdirectory' _OP_RANGE_PAT = re.compile(r'(\d+):(\d+)') _TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR' @@ -127,6 +128,7 @@ class TTParameters(object): self.trace_scalar_ops = self.is_flag_on(FLAG_NAME_TRACE_SCALAR_OPS) self.use_compact_trace = self.is_flag_on(FLAG_NAME_USE_COMPACT_TRACE) self.use_temp_cache_var = self.is_flag_on(FLAG_NAME_TEMP_CACHE_VAR) + self.use_fingerprint_subdir = self.is_flag_on(FLAG_NAME_FINGERPRINT_DIR) # _trace_ops_before_included and _trace_ops_after_included denotes to depth # of tracing relative to the ops given in --included_opnames or @@ -274,7 +276,7 @@ class TTParameters(object): FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS, FLAG_NAME_OP_RANGE, FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS, FLAG_NAME_TRACE_LEVEL, 
FLAG_NAME_SUMMARY_SIGNATURES, FLAG_NAME_SUMMARY_PER_CORE, - FLAG_NAME_TEMP_CACHE_VAR + FLAG_NAME_TEMP_CACHE_VAR, FLAG_NAME_FINGERPRINT_DIR ] tensor_tracer_flags = self._env.get(FLAGS_ENV_VAR) if not tensor_tracer_flags: diff --git a/tensorflow/python/tpu/tensor_tracer_report.py b/tensorflow/python/tpu/tensor_tracer_report.py index e8a122d981f..3270b2a2fd3 100644 --- a/tensorflow/python/tpu/tensor_tracer_report.py +++ b/tensorflow/python/tpu/tensor_tracer_report.py @@ -19,8 +19,10 @@ from __future__ import division from __future__ import print_function import collections +import hashlib import os + from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.tpu import tensor_tracer_pb2 @@ -53,6 +55,18 @@ _CURRENT_VERSION = 'use-outside-compilation' _TT_REPORT_PROTO = 'tensor_tracer_report.report_pb' +def report_proto_path(trace_dir): + """Returns the path where report proto should be written. + + Args: + trace_dir: String denoting the trace directory. + + Returns: + A string denoting the path to the report proto. + """ + return os.path.join(trace_dir, _TT_REPORT_PROTO) + + def topological_sort(g): """Performs topological sort on the given graph. @@ -206,6 +220,12 @@ class OpenReportFile(object): self._report_file.close() +def proto_fingerprint(message_proto): + serialized_message = message_proto.SerializeToString() + hasher = hashlib.sha256(serialized_message) + return hasher.hexdigest() + + class TTReportHandle(object): """Utility class responsible from creating a tensor tracer report.""" @@ -255,8 +275,6 @@ class TTReportHandle(object): key=lambda x: x[1]): report.config.signatures.append(signature_name) - tf_graph = tensor_trace_order.graph_order.graph - report.graphdef.CopyFrom(tf_graph.as_graph_def()) for tensor in tensor_trace_order.graph_order.tensors: tensor_def = tensor_tracer_pb2.TensorTracerReport.TracedTensorDef() tensor_def.name = tensor.name @@ -265,6 +283,11 @@ class TTReportHandle(object): tensor_def.cache_index = ( tensor_trace_order.tensorname_to_cache_idx[tensor.name]) else: + # To prevent small changes affecting the fingerprint calculation, avoid + # writing the untraced tensors to metadata. Fingerprints will be + # different only when the list of the traced tensors are different. 
+ if tt_parameters.use_fingerprint_subdir: + continue tensor_def.is_traced = False if tensor.name in tensor_trace_points: @@ -274,12 +297,17 @@ class TTReportHandle(object): elif tensor.op.name in self.instrument_records: tensor_def.explanation = self.instrument_records[tensor.op.name] report.tensordef[tensor.name].CopyFrom(tensor_def) + report.fingerprint = proto_fingerprint(report) + logging.info('TensorTracerProto fingerprint is %s.', + report.fingerprint) + tf_graph = tensor_trace_order.graph_order.graph + report.graphdef.CopyFrom(tf_graph.as_graph_def()) return report def write_report_proto(self, report_proto, tt_parameters): """Writes the given report proto under trace_dir.""" gfile.MakeDirs(tt_parameters.trace_dir) - report_path = os.path.join(tt_parameters.trace_dir, _TT_REPORT_PROTO) + report_path = report_proto_path(tt_parameters.trace_dir) with gfile.GFile(report_path, 'wb') as f: f.write(report_proto.SerializeToString()) From 939b69e701c4ce749267e5b3d5d8b5557e3f1300 Mon Sep 17 00:00:00 2001 From: Ajay P Date: Sat, 16 May 2020 01:13:19 +0000 Subject: [PATCH 0719/1533] Added grad_wrapper to accomodate graph mode --- tensorflow/python/ops/custom_gradient.py | 60 +++++++++++++----------- tensorflow/python/ops/gradients_test.py | 42 +++++++++-------- 2 files changed, 54 insertions(+), 48 deletions(-) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index 4a375e11554..aa80756b859 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -486,38 +486,42 @@ def recompute_grad(f): with tape_lib.stop_recording(): result = f(*args, **kwargs) - @custom_gradient - def inner_recompute_grad(*dresult, **grad_kwargs): - """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" - # Gradient calculation for reverse mode autodiff. - variables = grad_kwargs.get("variables") - with backprop.GradientTape() as t: - id_args = [gen_array_ops.identity(x) for x in args] - t.watch(id_args) + def grad_wrapper(*wrapper_args, **grad_kwargs): + """Wrapper function to accomodate lack of kwargs in graph mode decorator.""" + @custom_gradient + def inner_recompute_grad(*dresult): + """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" + # Gradient calculation for reverse mode autodiff. + variables = grad_kwargs.get("variables") + with backprop.GradientTape() as t: + id_args = [gen_array_ops.identity(x) for x in args] + t.watch(id_args) + if variables is not None: + t.watch(variables) + with ops.control_dependencies(dresult): + with variable_scope.variable_scope(current_var_scope): + result = f(*id_args, **kwargs) + kw_vars = [] if variables is not None: - t.watch(variables) - with ops.control_dependencies(dresult): - with variable_scope.variable_scope(current_var_scope): - result = f(*id_args, **kwargs) - kw_vars = [] - if variables is not None: - kw_vars = list(variables) - grads = t.gradient(result, - list(id_args) + kw_vars, - output_gradients=dresult, - unconnected_gradients=UnconnectedGradients.ZERO) + kw_vars = list(variables) + grads = t.gradient(result, + list(id_args) + kw_vars, + output_gradients=dresult, + unconnected_gradients=UnconnectedGradients.ZERO) - def transpose(*t_args, **t_kwargs): - """Gradient function calculation for forward mode autodiff.""" - # Just throw an error since gradients / activations are not stored on tape for recompute. - raise NotImplementedError( - "recompute_grad tried to transpose grad of {}. 
" - "Consider not using recompute_grad in forward mode autodiff".format( - f.__name__)) + def transpose(*t_args, **t_kwargs): + """Gradient function calculation for forward mode autodiff.""" + # Just throw an error since gradients / activations are not stored on tape for recompute. + raise NotImplementedError( + "recompute_grad tried to transpose grad of {}. " + "Consider not using recompute_grad in forward mode" + "autodiff".format(f.__name__)) - return (grads[:len(id_args)], grads[len(id_args):]), transpose + return (grads[:len(id_args)], grads[len(id_args):]), transpose - return result, inner_recompute_grad + return inner_recompute_grad(*wrapper_args) + + return result, grad_wrapper return inner diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index e1da54e6427..57fb2f4ddb3 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -1431,24 +1431,25 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): def testFnRecomputeWithScopeGradientTape(self): """Checks that recompute_grad works with var scope and GradientTape.""" - def TestFn(input_t, test_var): - return input_t * test_var + def TestFn(input_t): + with variable_scope.variable_scope("inner_scope"): + test_var = variable_scope.get_variable( + name="test_var", + shape=10, + trainable=True, + ) + self.evaluate(test_var.assign(np.ones([10]))) + return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) with variable_scope.variable_scope( "output_scope", reuse=variable_scope.AUTO_REUSE, use_resource=True): - with variable_scope.variable_scope("inner_scope"): - test_var = variable_scope.get_variable( - name="test_var", shape=10, trainable=True, - ) - self.evaluate(test_var.assign(np.ones([10]))) - test_fn_re = custom_gradient.recompute_grad(TestFn) with backprop.GradientTape(persistent=True) as tape: - out_re = test_fn_re(test_input_t, test_var) - out = TestFn(test_input_t, test_var) + out_re = test_fn_re(test_input_t) + out = TestFn(test_input_t) grads_re = tape.gradient(out_re, variables.trainable_variables()) grads = tape.gradient(out, variables.trainable_variables()) @@ -1463,22 +1464,23 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): def testFnRecomputeWithScopeGradients(self): """Checks that recompute_grad works with var scope and gradients(..).""" - def TestFn(input_t, test_var): - return input_t * test_var + def TestFn(input_t): + with variable_scope.variable_scope("inner_scope"): + test_var = variable_scope.get_variable( + name="test_var", + shape=10, + trainable=True, + ) + self.evaluate(test_var.assign(np.ones([10]))) + return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) with variable_scope.variable_scope( "output_scope", reuse=variable_scope.AUTO_REUSE, use_resource=True): - with variable_scope.variable_scope("inner_scope"): - test_var = variable_scope.get_variable( - name="test_var", shape=10, trainable=True, - ) - self.evaluate(test_var.assign(np.ones([10]))) - test_fn_re = custom_gradient.recompute_grad(TestFn) - out_re = test_fn_re(test_input_t, test_var) - out = TestFn(test_input_t, test_var) + out_re = test_fn_re(test_input_t) + out = TestFn(test_input_t) grads_re = gradients.gradients(out_re, variables.trainable_variables()) grads = gradients.gradients(out, variables.trainable_variables()) From 4fc945e30a0dfaa53848ad4393cb78dcb3283ef4 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 15 May 2020 18:09:14 -0700 Subject: [PATCH 0720/1533] 
[tf.data] Reduce verbosity of a warning as it is not actionable and appears for all programs that use tf.data with tf.distribute. PiperOrigin-RevId: 311834993 Change-Id: Iafb60c31008369f48e986f9ff3b400a9d5ada36d --- tensorflow/core/kernels/data/captured_function.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index 28738e3e2fe..adba99d37a4 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -466,17 +466,15 @@ Status FunctionMetadata::Create( auto attr = fdef->attr().find(FunctionLibraryDefinition::kIntsOnDeviceAttr); if (attr != fdef->attr().end() && attr->second.b()) { - LOG(WARNING) - << "Disabling multi-device execution for a function that uses the " - << FunctionLibraryDefinition::kIntsOnDeviceAttr << " attribute."; + VLOG(1) << "Disabling multi-device execution for a function that uses the " + << FunctionLibraryDefinition::kIntsOnDeviceAttr << " attribute."; (*out_metadata)->use_multi_device_function_ = false; return Status::OK(); } auto validate_arg = [](const OpDef::ArgDef& arg) { if (!arg.number_attr().empty() || !arg.type_list_attr().empty()) { - LOG(WARNING) << "Disabling multi-device execution for a function with " - "a vector argument " - << arg.name() << "."; + VLOG(1) << "Disabling multi-device execution for a function with " + << "a vector argument " << arg.name() << "."; return false; } return true; From 312079996985b7a15ad7ff27c39ece6625e30121 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Fri, 15 May 2020 18:50:51 -0700 Subject: [PATCH 0721/1533] [XLA] Calculate and print statistics about prefetches and evictions. PiperOrigin-RevId: 311839276 Change-Id: Iec0b0318825c665cfca067d4edf30b56e8f9c833 --- .../xla/service/memory_space_assignment.cc | 40 +++++++++++++++---- .../xla/service/memory_space_assignment.h | 14 +++++-- .../service/memory_space_assignment_test.cc | 25 +++++++++--- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 742de71e74c..431e6af2dc0 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -1706,20 +1706,39 @@ AlternateMemoryBestFitHeap::FindBestChunkCandidate( return absl::nullopt; } -/*static*/ int64 MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies( - const HloModule& module) { - int64 max_copies = 0; +StatusOr +MemorySpaceAssignment::CalculateAsyncCopyStats() const { + AsyncCopyStats stats; + stats.max_outstanding_async_copies = 0; + stats.num_prefetches = 0; + stats.prefetch_bytes = 0; + stats.num_evictions = 0; + stats.eviction_bytes = 0; int64 current_copies = 0; - for (HloInstruction* instruction : - module.schedule().sequence(module.entry_computation()).instructions()) { + TF_ASSIGN_OR_RETURN(std::unique_ptr dataflow_analysis, + HloDataflowAnalysis::Run(*module_)); + for (HloInstruction* instruction : module_->schedule() + .sequence(module_->entry_computation()) + .instructions()) { if (instruction->opcode() == HloOpcode::kCopyStart) { current_copies++; } else if (instruction->opcode() == HloOpcode::kCopyDone) { current_copies--; + int64 size = + options_.size_fn(dataflow_analysis->GetUniqueValueAt(instruction)); + if (instruction->shape().layout().memory_space() == + options_.alternate_memory_space) { + 
++stats.num_prefetches; + stats.prefetch_bytes += size; + } else { + ++stats.num_evictions; + stats.eviction_bytes += size; + } } - max_copies = std::max(max_copies, current_copies); + stats.max_outstanding_async_copies = + std::max(stats.max_outstanding_async_copies, current_copies); } - return max_copies; + return stats; } /*static*/ MemorySpaceAssignment::BufferIntervalCompare @@ -1851,8 +1870,13 @@ MemorySpaceAssignment::RunMemorySpaceAssignment( VLOG(3) << "Module after memory space assignment: "; XLA_VLOG_LINES(3, module_->ToString()); TF_CHECK_OK(module_->schedule().Verify()); + TF_ASSIGN_OR_RETURN(AsyncCopyStats stats, CalculateAsyncCopyStats()); VLOG(1) << "Maximum number of outstanding async copies: " - << CountMaximumOutstandingAsyncCopies(*module_); + << stats.max_outstanding_async_copies; + VLOG(1) << "Number of prefetches: " << stats.num_prefetches + << ", in bytes: " << stats.prefetch_bytes; + VLOG(1) << "Number of evictions: " << stats.num_evictions + << ", in bytes: " << stats.eviction_bytes; TF_RETURN_IF_ERROR(VerifyAndExportHeapSimulatorTrace()); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index eb16db90600..727b8da6c08 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -604,6 +604,15 @@ class MemorySpaceAssignment { AllocationSequence allocation_sequence_; }; + // Statistics of asynchronous copies. + struct AsyncCopyStats { + int64 max_outstanding_async_copies; + int64 num_prefetches; + int64 prefetch_bytes; + int64 num_evictions; + int64 eviction_bytes; + }; + virtual ~MemorySpaceAssignment() = default; // Runs the MemorySpaceAssignment pass. @@ -611,9 +620,8 @@ class MemorySpaceAssignment { HloModule* module, const HloLiveRange& hlo_live_range, const HloAliasAnalysis& alias_analysis, const Options& options); - // Returns the maximum number of outstanding asynchronous copies in the - // module. - static int64 CountMaximumOutstandingAsyncCopies(const HloModule& module); + // Calculates asynchronous copy statistics. 
+ StatusOr CalculateAsyncCopyStats() const; static BufferIntervalCompare GetMemoryBoundednessBufferIntervalCompare( const MemorySpaceAssignmentCostAnalysis& cost_analysis); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index b2125d318d0..984f2e7b4ea 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -184,6 +184,22 @@ class MemorySpaceAssignmentTest : public HloTestBase, } } + /*static*/ int64 CountMaximumOutstandingAsyncCopies(const HloModule& module) { + int64 max_copies = 0; + int64 current_copies = 0; + for (HloInstruction* instruction : module.schedule() + .sequence(module.entry_computation()) + .instructions()) { + if (instruction->opcode() == HloOpcode::kCopyStart) { + current_copies++; + } else if (instruction->opcode() == HloOpcode::kCopyDone) { + current_copies--; + } + max_copies = std::max(max_copies, current_copies); + } + return max_copies; + } + std::unique_ptr CreateEvictAndPrefetchModule() { HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); @@ -391,8 +407,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies0) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/0); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 0); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 0); } TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { @@ -400,8 +415,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/1); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 1); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 1); } TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies2) { @@ -409,8 +423,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies2) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/2); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 2); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 2); } // TODO(berkin): This test is broken with some prefetch timing improvements. From 583b37ffc350ebc01312ccccc32895f0f9d1725e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 16 May 2020 02:48:47 +0000 Subject: [PATCH 0722/1533] Allow single element padding_values to be broadcasted to a structure for tf.data.Dataset.padded_batch This PR is a follow up based on comment in 35900. In 35900, the issue was raised where if padding_values is a single elment and the dataset has a structured shape, then tf.data.Dataset.padded_batch will return and error. This PR adds the support to "broadcast" padding_values to match the same structure as the dataset, which could be convenient in many cases (avoid repeat the padding_values into the stucture manually). 
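For illustration, a minimal sketch of the new behaviour with the public API (hypothetical example, not taken from this change; assumes two variable-length int64 components):

    import tensorflow as tf

    ds = tf.data.Dataset.zip((
        tf.data.Dataset.range(1, 4).map(lambda x: tf.fill([x], x)),
        tf.data.Dataset.range(1, 4).map(lambda x: tf.fill([x], x))))
    # A single scalar padding value is now broadcast to every component of the
    # structure, equivalent to passing padding_values=(-1, -1).
    ds = ds.padded_batch(2, padded_shapes=([None], [None]), padding_values=-1)
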
Signed-off-by: Yong Tang --- tensorflow/python/data/ops/dataset_ops.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 9ae560f71db..2e069689143 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -4027,6 +4027,12 @@ class PaddedBatchDataset(UnaryDataset): self._padded_shapes = nest.pack_sequence_as(input_shapes, flat_padded_shapes_as_tensors) + # If padding_values is a single element and input_shapes is a structure, + # "broadcast" padding_values to the same structure as input_shapes. + if nest.is_sequence(input_shapes) and not nest.is_sequence(padding_values): + padding_values = nest.map_structure( + lambda _: padding_values, input_shapes) + self._padding_values = nest.map_structure_up_to( input_shapes, _padding_value_to_tensor, padding_values, get_legacy_output_types(input_dataset)) From 0c4b0a2462a754a50d6b8fc4f95039e1dd36c294 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 16 May 2020 02:52:41 +0000 Subject: [PATCH 0723/1533] Add test case for "broadcast" padding_values to the same structure as dataset Signed-off-by: Yong Tang --- .../data/kernel_tests/padded_batch_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tensorflow/python/data/kernel_tests/padded_batch_test.py b/tensorflow/python/data/kernel_tests/padded_batch_test.py index 75d515484a4..c9f52182a75 100644 --- a/tensorflow/python/data/kernel_tests/padded_batch_test.py +++ b/tensorflow/python/data/kernel_tests/padded_batch_test.py @@ -306,6 +306,22 @@ class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( ds, expected_output=[[0.0, 1.0, 2.0, 3.0, 4.0]]) + @combinations.generate(test_base.default_test_combinations()) + def testDefaultPaddedValueShapes(self): + + def fill(x): + return array_ops.fill([x], x) + + dataset = dataset_ops.Dataset.zip(( + dataset_ops.Dataset.from_tensor_slices([1, 2, 3, 4]).map(fill), + dataset_ops.Dataset.from_tensor_slices([1, 2, 3, 4]).map(fill))) + dataset = dataset.padded_batch(batch_size=2, padding_values=-1) + self.assertDatasetProduces( + dataset, + expected_output=[ + ([[1, -1], [2, 2]], [[1, -1], [2, 2]]), + ([[3, 3, 3, -1], [4, 4, 4, 4]], [[3, 3, 3, -1], [4, 4, 4, 4]])]) + if __name__ == '__main__': test.main() From 90dc8696e7cc810a0c5df8f1cc6cbc4cd0d70ccf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 20:22:59 -0700 Subject: [PATCH 0724/1533] Fix the quantile accumulator to always return at most the number of requested boundaries. PiperOrigin-RevId: 311846622 Change-Id: I6a3d4bf3efbf4ce171f62e1af2dc29bd77cd4063 --- .../kernels/boosted_trees/quantile_ops.cc | 5 ++++- .../boosted_trees/quantile_ops_test.py | 22 ++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc index 0065bdd66aa..0de08bcff2d 100644 --- a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc +++ b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc @@ -65,7 +65,8 @@ std::vector GenerateBoundaries(const QuantileStream& stream, // Uniquify elements as we may get dupes. 
auto end_it = std::unique(boundaries.begin(), boundaries.end()); - boundaries.resize(std::distance(boundaries.begin(), end_it)); + boundaries.resize(std::min(std::distance(boundaries.begin(), end_it), + num_boundaries)); return boundaries; } @@ -421,6 +422,8 @@ class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel { generate_quantiles_ ? GenerateQuantiles(*stream, num_buckets) : GenerateBoundaries(*stream, num_buckets), stream_idx); + VLOG(1) << "Created " << stream_resource->boundaries(stream_idx).size() + << " boundaries."; } }; diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py index fb44c33d602..7c3a382c955 100644 --- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py +++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py @@ -82,7 +82,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): self.eps = 0.01 self.max_elements = 1 << 16 - self.num_quantiles = constant_op.constant(3, dtype=dtypes.int64) + self.num_quantiles = constant_op.constant(4, dtype=dtypes.int64) def testBasicQuantileBucketsSingleResource(self): with self.cached_session() as sess: @@ -183,7 +183,10 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") + num_streams=2, + num_quantiles=self.num_quantiles, + epsilon=self.eps, + name="q0") save = saver.Saver() resources.initialize_resources(resources.shared_resources()).run() @@ -202,7 +205,10 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.session(graph=ops.Graph()) as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") + num_streams=2, + num_quantiles=self.num_quantiles, + epsilon=self.eps, + name="q0") save = saver.Saver() save.restore(sess, save_path) buckets = accumulator.get_bucket_boundaries() @@ -215,7 +221,10 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") + num_streams=2, + num_quantiles=self.num_quantiles, + epsilon=self.eps, + name="q0") save = saver.Saver() resources.initialize_resources(resources.shared_resources()).run() @@ -233,7 +242,10 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.session(graph=ops.Graph()) as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") + num_streams=2, + num_quantiles=self.num_quantiles, + epsilon=self.eps, + name="q0") save = saver.Saver() save.restore(sess, save_path) buckets = accumulator.get_bucket_boundaries() From b60f79dad844f2b63d17d86ac46ff982b1e43057 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 21:18:33 -0700 Subject: [PATCH 0725/1533] Fix the quantile accumulator to always return at most the number of requested boundaries. 
PiperOrigin-RevId: 311850905 Change-Id: If188c30fdb6e9968c809c60bffdd0c8a31297cac --- .../kernels/boosted_trees/quantile_ops.cc | 5 +---- .../boosted_trees/quantile_ops_test.py | 22 +++++-------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc index 0de08bcff2d..0065bdd66aa 100644 --- a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc +++ b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc @@ -65,8 +65,7 @@ std::vector GenerateBoundaries(const QuantileStream& stream, // Uniquify elements as we may get dupes. auto end_it = std::unique(boundaries.begin(), boundaries.end()); - boundaries.resize(std::min(std::distance(boundaries.begin(), end_it), - num_boundaries)); + boundaries.resize(std::distance(boundaries.begin(), end_it)); return boundaries; } @@ -422,8 +421,6 @@ class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel { generate_quantiles_ ? GenerateQuantiles(*stream, num_buckets) : GenerateBoundaries(*stream, num_buckets), stream_idx); - VLOG(1) << "Created " << stream_resource->boundaries(stream_idx).size() - << " boundaries."; } }; diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py index 7c3a382c955..fb44c33d602 100644 --- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py +++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py @@ -82,7 +82,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): self.eps = 0.01 self.max_elements = 1 << 16 - self.num_quantiles = constant_op.constant(4, dtype=dtypes.int64) + self.num_quantiles = constant_op.constant(3, dtype=dtypes.int64) def testBasicQuantileBucketsSingleResource(self): with self.cached_session() as sess: @@ -183,10 +183,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, - num_quantiles=self.num_quantiles, - epsilon=self.eps, - name="q0") + num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") save = saver.Saver() resources.initialize_resources(resources.shared_resources()).run() @@ -205,10 +202,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.session(graph=ops.Graph()) as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, - num_quantiles=self.num_quantiles, - epsilon=self.eps, - name="q0") + num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") save = saver.Saver() save.restore(sess, save_path) buckets = accumulator.get_bucket_boundaries() @@ -221,10 +215,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, - num_quantiles=self.num_quantiles, - epsilon=self.eps, - name="q0") + num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") save = saver.Saver() resources.initialize_resources(resources.shared_resources()).run() @@ -242,10 +233,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.session(graph=ops.Graph()) as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, - num_quantiles=self.num_quantiles, - epsilon=self.eps, - name="q0") + num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") save = saver.Saver() save.restore(sess, save_path) buckets = accumulator.get_bucket_boundaries() From eb71191f7b03e8248d760ea02776582536b7492d Mon 
Sep 17 00:00:00 2001 From: Xunkai Zhang Date: Fri, 15 May 2020 21:32:46 -0700 Subject: [PATCH 0726/1533] [tfls.metadata] Use java7 version opts to build metadata lib. PiperOrigin-RevId: 311851755 Change-Id: I853e23a60f37cb89cc57653e85ff708ed467d512 --- third_party/flatbuffers/build_defs.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/flatbuffers/build_defs.bzl b/third_party/flatbuffers/build_defs.bzl index d07ad18630f..9be627119cf 100644 --- a/third_party/flatbuffers/build_defs.bzl +++ b/third_party/flatbuffers/build_defs.bzl @@ -472,6 +472,7 @@ def flatbuffer_java_library( native.java_library( name = name, srcs = [out_srcjar], + javacopts = ["-source 7 -target 7"], deps = [ "@flatbuffers//:runtime_java", ], @@ -562,7 +563,6 @@ def flatbuffer_android_library( srcs, custom_package = "", package_prefix = "", - javacopts = None, include_paths = DEFAULT_INCLUDE_PATHS, flatc_args = DEFAULT_FLATC_ARGS, visibility = None): @@ -575,7 +575,6 @@ def flatbuffer_android_library( namespace in the schema files will be used. (optional) package_prefix: like custom_package, but prefixes to the existing namespace. (optional) - javacopts: List of options to pass to javac. include_paths: List of paths that includes files can be found in. (optional) flatc_args: List of additional arguments to pass to flatc. (optional) visibility: Visibility setting for the android_library rule. (optional) @@ -604,6 +603,7 @@ def flatbuffer_android_library( android_library( name = name, srcs = [out_srcjar], + javacopts = ["-source 7 -target 7"], visibility = visibility, deps = [ "@flatbuffers//:runtime_android", From e234c0a44e526dd79d782ad5623ea9f3f3298139 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 23:01:01 -0700 Subject: [PATCH 0727/1533] [tf.data] Update output time functions to solve the stack overflow problem. Also update some mathematics computation in the code. PiperOrigin-RevId: 311858942 Change-Id: Iafa345b5d235c60a455671c924af594396a361ad --- tensorflow/core/framework/model.cc | 658 +++++++++++------- tensorflow/core/framework/model.h | 82 ++- tensorflow/core/framework/model_test.cc | 114 +-- .../python/data/kernel_tests/options_test.py | 2 +- 4 files changed, 533 insertions(+), 323 deletions(-) diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc index b4a54029a4f..658be94b9bb 100644 --- a/tensorflow/core/framework/model.cc +++ b/tensorflow/core/framework/model.cc @@ -25,10 +25,6 @@ namespace data { namespace model { namespace { -// Key of the derivative w.r.t. the last input time in the gradient of -// `OutputTime`. -constexpr char kInputTimeDerivativeKey[] = "last_input_time"; - // Wrapper for the square function to reduce verbosity. 
inline double Square(double x) { return x * x; } @@ -50,34 +46,60 @@ class InterleaveMany : public Node { Args{id_, name_, std::move(output)}); } + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double old_input_time; + if (output_) { + old_input_time = (*input_times)[output_->long_name()]; + } else { + old_input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + + if (num_inputs() <= 1) { + (*input_times)[long_name()] = old_input_time; + return; + } + double new_input_time = + old_input_time + + SelfProcessingTimeLocked() * static_cast(num_inputs() - 1); + (*input_times)[long_name()] = new_input_time; + } + // The output time is the sum of the self processing time and the average // output time of inputs comprising the interleave "cycle". - double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double self_processing_time = SelfProcessingTimeLocked(); if (num_inputs() <= 1) { - return SelfProcessingTimeLocked(); - } - double delta = SelfProcessingTimeLocked() * (num_inputs() - 1); - input_times->back() += delta; - auto cleanup = gtl::MakeCleanup( - [input_times, delta]() { input_times->back() -= delta; }); - double output_time; - if (gradient) { - absl::flat_hash_map inputs_gradient; - output_time = - (OutputTimeForInputs(input_times, &inputs_gradient) - - inputs_.front()->OutputTime(input_times, /*gradient=*/nullptr)) / - static_cast(num_inputs() - 1); - for (auto& pair : inputs_gradient) { - (*gradient)[pair.first] = - pair.second / static_cast(num_inputs() - 1); + (*output_times)[long_name()] = self_processing_time; + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); + } } - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + inputs_gradient[kInputTimeDerivativeKey] / - static_cast(num_inputs() - 1); + return; + } + + double output_time = (OutputTimeForInputs(*output_times) - + (*output_times)[inputs_.front()->long_name()]) / + static_cast(num_inputs() - 1); + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient /= static_cast(num_inputs() - 1); + } + } + + (*output_time_gradients)[long_name()] = + (OutputTimeGradientsForInputs(*output_time_gradients) - + (*output_time_gradients)[inputs_.front()->long_name()]) / + static_cast(num_inputs() - 1); + // Set derivatives w.r.t. tunable parameters of the subtree rooted in the // first input equal to 0 since its output time is excluded from // computations. 
@@ -85,15 +107,10 @@ class InterleaveMany : public Node { first_input_parameters; inputs_.front()->CollectTunableParameters(&first_input_parameters); for (auto& pair : first_input_parameters) { - (*gradient)[pair.first] = 0.0L; + (*gradients)[pair.first] = 0.0L; } - } else { - output_time = - (OutputTimeForInputs(input_times, /*gradient=*/nullptr) - - inputs_.front()->OutputTime(input_times, /*gradient=*/nullptr)) / - static_cast(num_inputs() - 1); } - return SelfProcessingTimeLocked() + output_time; + (*output_times)[long_name()] = self_processing_time + output_time; } // The processing time is the sum of the self processing time and the average @@ -107,16 +124,15 @@ class InterleaveMany : public Node { (*processing_times)[long_name()] = self_processing_time; } if (num_inputs() <= 1) { - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time)); + (*total_processing_times)[long_name()] = self_processing_time; return; } double processing_time = (TotalProcessingTimeForInputs(*total_processing_times) - (*total_processing_times)[inputs_.front()->long_name()]) / static_cast(num_inputs() - 1); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } }; @@ -148,55 +164,85 @@ class AsyncInterleaveMany : public Node { Args{id_, name_, std::move(output)}, parameters); } + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double input_time; + + if (num_inputs() <= 1) { + if (output_) { + input_time = (*input_times)[output_->long_name()]; + } else { + input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + } else { + input_time = + SelfProcessingTimeLocked() * static_cast(num_inputs() - 1); + } + (*input_times)[long_name()] = input_time; + } + // The output time is estimated using `ComputeWaitTime(output_time, // input_time, parallelism, ...)`, where `output_time` is the sum of the // self-processing time and the average output time of inputs comprising the // interleave "cycle", `input_time` is specified through `input_times` and // `buffer_size` is derived from parallelism. 
- double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double self_processing_time = SelfProcessingTimeLocked(); if (num_inputs() <= 1) { - return SelfProcessingTimeLocked(); + (*output_times)[long_name()] = self_processing_time; + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); + } + } + return; } - double old_input_time = input_times->back(); - double new_input_time = - SelfProcessingTimeLocked() * static_cast(num_inputs() - 1); - input_times->push_back(new_input_time); - auto cleanup = - gtl::MakeCleanup([input_times]() { input_times->pop_back(); }); + + double input_time; + if (output_) { + input_time = input_times.at(output_->long_name()); + } else { + input_time = gtl::FindWithDefault(input_times, kInputTimeKey, 0.0L); + } + double parallelism = num_inputs() - 1; // default to cycle length auto* parameter = gtl::FindOrNull(parameters_, kParallelism); if (parameter) { parallelism = std::min(parallelism, (*parameter)->value); } - if (gradient) { - absl::flat_hash_map inputs_gradient; - double output_time_for_inputs = - OutputTimeForInputs(input_times, &inputs_gradient) - - inputs_.front()->OutputTime(input_times, /*gradient=*/nullptr); - double output_time = output_time_for_inputs / - static_cast(num_inputs() - 1) / parallelism; + + double output_time_for_inputs = + OutputTimeForInputs(*output_times) - + (*output_times)[inputs_.front()->long_name()]; + double output_time = output_time_for_inputs / + static_cast(num_inputs() - 1) / parallelism; + double result; + + if (gradients) { double output_time_der = 0.0L; double input_time_der = 0.0L; double buffer_size_der = 0.0L; - double result = ComputeWaitTime( - SelfProcessingTimeLocked() + output_time, old_input_time, parallelism, - &output_time_der, &input_time_der, &buffer_size_der); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + input_time_der; + result = ComputeWaitTime(self_processing_time + output_time, input_time, + parallelism, &output_time_der, &input_time_der, + &buffer_size_der); + (*output_time_gradients)[long_name()] = input_time_der; double parallelism_der = -output_time_for_inputs / static_cast(num_inputs() - 1) / Square(parallelism); - for (auto& pair : inputs_gradient) { - if (pair.first != kInputTimeDerivativeKey) { - (*gradient)[pair.first] = output_time_der * pair.second / - static_cast(num_inputs() - 1) / - parallelism; + + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient *= (output_time_der / + static_cast(num_inputs() - 1) / parallelism); } } + // Set derivatives w.r.t. tunable parameters of the subtree rooted in the // first input equal to 0 since its output time is excluded from // computations. @@ -204,23 +250,21 @@ class AsyncInterleaveMany : public Node { first_input_parameters; inputs_.front()->CollectTunableParameters(&first_input_parameters); for (auto& pair : first_input_parameters) { - (*gradient)[pair.first] = 0.0L; + (*gradients)[pair.first] = 0.0L; } // Add derivative w.r.t. 
own parallelism parameter. if (parameter && (*parameter)->state->tunable) { - (*gradient)[long_name()] = + (*gradients)[long_name()] = output_time_der * parallelism_der + buffer_size_der; } - return result; + } else { + result = ComputeWaitTime(self_processing_time + output_time, input_time, + parallelism, + /*output_time_derivative=*/nullptr, + /*input_time_derivative=*/nullptr, + /*buffer_size_derivative=*/nullptr); } - double output_time = - (OutputTimeForInputs(input_times, /*gradient=*/nullptr) - - inputs_.front()->OutputTime(input_times, /*gradient=*/nullptr)) / - static_cast(num_inputs() - 1) / parallelism; - return ComputeWaitTime( - SelfProcessingTimeLocked() + output_time, old_input_time, parallelism, - /*output_time_derivative=*/nullptr, - /*input_time_derivative=*/nullptr, /*buffer_size_derivative=*/nullptr); + (*output_times)[long_name()] = result; } // The processing time is the sum of the self processing time and the average @@ -234,16 +278,15 @@ class AsyncInterleaveMany : public Node { (*processing_times)[long_name()] = self_processing_time; } if (num_inputs() <= 1) { - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time)); + (*total_processing_times)[long_name()] = self_processing_time; return; } double processing_time = (TotalProcessingTimeForInputs(*total_processing_times) - (*total_processing_times)[inputs_.front()->long_name()]) / static_cast(num_inputs() - 1); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } }; @@ -260,41 +303,55 @@ class KnownRatio : public Node { ratio_); } + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double old_input_time; + if (output_) { + old_input_time = (*input_times)[output_->long_name()]; + } else { + old_input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + + if (ratio_ == 0) { + (*input_times)[long_name()] = old_input_time; + return; + } + double new_input_time = + (old_input_time + SelfProcessingTimeLocked()) / ratio_; + (*input_times)[long_name()] = new_input_time; + } + // The output time is the sum of the self processing time and the product of // `ratio_` and the sum of output times of inputs. 
- double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double self_processing_time = SelfProcessingTimeLocked(); if (ratio_ == 0) { - return SelfProcessingTimeLocked(); - } - double old_input_time = input_times->back(); - input_times->back() = - (old_input_time + SelfProcessingTimeLocked()) / ratio_; - auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() { - input_times->back() = old_input_time; - }); - double result; - if (gradient) { - absl::flat_hash_map inputs_gradient; - result = SelfProcessingTimeLocked() + - ratio_ * OutputTimeForInputs(input_times, &inputs_gradient); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + ratio_ * - inputs_gradient[kInputTimeDerivativeKey] * - (1.0L + 1.0L / ratio_); - for (auto& pair : inputs_gradient) { - if (pair.first != kInputTimeDerivativeKey) { - (*gradient)[pair.first] = pair.second * ratio_; + (*output_times)[long_name()] = self_processing_time; + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); } } - } else { - result = SelfProcessingTimeLocked() + - ratio_ * OutputTimeForInputs(input_times, /*gradient=*/nullptr); + return; } - return result; + double result = + self_processing_time + ratio_ * OutputTimeForInputs(*output_times); + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient *= ratio_; + } + } + (*output_time_gradients)[long_name()] = + OutputTimeGradientsForInputs(*output_time_gradients); + } + (*output_times)[long_name()] = result; } // The processing time is the sum of the self processing time and the product @@ -309,8 +366,8 @@ class KnownRatio : public Node { } double processing_time = ratio_ * TotalProcessingTimeForInputs(*total_processing_times); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } private: @@ -340,6 +397,29 @@ class AsyncKnownRatio : public Node { Args{id_, name_, std::move(output)}, ratio_, parameters); } + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double input_time; + + if (ratio_ == 0.0) { + if (output_) { + input_time = (*input_times)[output_->long_name()]; + } else { + input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + (*input_times)[long_name()] = input_time; + return; + } + + double parallelism = 1.0; + auto* parallelism_parameter = gtl::FindOrNull(parameters_, kParallelism); + if (parallelism_parameter) { + parallelism = (*parallelism_parameter)->value; + } + input_time = SelfProcessingTimeLocked() / ratio_ / parallelism; + (*input_times)[long_name()] = input_time; + } + // The output time is estimated using `ComputeWaitTime(output_time, // input_time, parallelism, ...)`, where `output_time` is the sum of the self // processing time and the product of `ratio_` and the sum of output times of @@ -347,9 +427,12 @@ class AsyncKnownRatio : public Node 
{ // has parallelism parameter, then `buffer_size` is derived from parallelism. // // Current implementation assumes that there is at most 1 parameter per node. - double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { double parallelism = 1.0; double buffer_size = 0.0; auto* parallelism_parameter = gtl::FindOrNull(parameters_, kParallelism); @@ -361,80 +444,85 @@ class AsyncKnownRatio : public Node { buffer_size = (*buffer_size_parameter)->value; } double self_processing_time = SelfProcessingTimeLocked(); + double result; + double input_time; + if (output_) { + input_time = input_times.at(output_->long_name()); + } else { + input_time = gtl::FindWithDefault(input_times, kInputTimeKey, 0.0L); + } + if (ratio_ == 0.0) { double output_time = self_processing_time / parallelism; - if (gradient) { + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); + } + double output_time_der = 0.0L; double input_time_der = 0.0L; double buffer_size_der = 0.0L; - double result = ComputeWaitTime(output_time, input_times->back(), - buffer_size, &output_time_der, - &input_time_der, &buffer_size_der); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + input_time_der; + result = ComputeWaitTime(output_time, input_time, buffer_size, + &output_time_der, &input_time_der, + &buffer_size_der); + (*output_time_gradients)[long_name()] = input_time_der; // Add derivative w.r.t. own parameter if it's tunable. 
if (parallelism_parameter && (*parallelism_parameter)->state->tunable) { - (*gradient)[long_name()] = + (*gradients)[long_name()] = -output_time_der * self_processing_time / Square(parallelism) + buffer_size_der; } else if (buffer_size_parameter && (*buffer_size_parameter)->state->tunable) { - (*gradient)[long_name()] = buffer_size_der; + (*gradients)[long_name()] = buffer_size_der; } - return result; + } else { + result = ComputeWaitTime(output_time, input_time, buffer_size, + /*output_time_derivative=*/nullptr, + /*input_time_derivative=*/nullptr, + /*buffer_size_derivative=*/nullptr); } - return ComputeWaitTime(output_time, input_times->back(), buffer_size, - /*output_time_derivative=*/nullptr, - /*input_time_derivative=*/nullptr, - /*buffer_size_derivative=*/nullptr); + (*output_times)[long_name()] = result; + return; } - double old_input_time = input_times->back(); - double new_input_time = self_processing_time / ratio_ / parallelism; - input_times->push_back(new_input_time); - auto cleanup = - gtl::MakeCleanup([input_times]() { input_times->pop_back(); }); - if (gradient) { - absl::flat_hash_map inputs_gradient; + + double output_time = self_processing_time / parallelism + + ratio_ * OutputTimeForInputs(*output_times); + if (gradients) { double output_time_der = 0.0L; double input_time_der = 0.0L; double buffer_size_der = 0.0L; - double output_time = - self_processing_time / parallelism + - ratio_ * OutputTimeForInputs(input_times, &inputs_gradient); - double result = - ComputeWaitTime(output_time, old_input_time, buffer_size, + result = + ComputeWaitTime(output_time, input_time, buffer_size, &output_time_der, &input_time_der, &buffer_size_der); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + input_time_der; - for (auto& pair : inputs_gradient) { - if (pair.first != kInputTimeDerivativeKey) { - (*gradient)[pair.first] = pair.second * ratio_ * output_time_der; + (*output_time_gradients)[long_name()] = input_time_der; + + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient *= (ratio_ * output_time_der); } } + // Add derivative w.r.t. own parameter if it's tunable. 
if (parallelism_parameter && (*parallelism_parameter)->state->tunable) { - (*gradient)[long_name()] = + double inputs_time_der_sum = + OutputTimeGradientsForInputs(*output_time_gradients); + (*gradients)[long_name()] = -output_time_der * self_processing_time / Square(parallelism) + buffer_size_der - - output_time_der * inputs_gradient[kInputTimeDerivativeKey] * - self_processing_time / Square(parallelism); + output_time_der * inputs_time_der_sum * self_processing_time / + Square(parallelism); } else if (buffer_size_parameter && (*buffer_size_parameter)->state->tunable) { - (*gradient)[long_name()] = buffer_size_der; + (*gradients)[long_name()] = buffer_size_der; } - return result; + } else { + result = ComputeWaitTime(output_time, input_time, buffer_size, + /*output_time_derivative=*/nullptr, + /*input_time_derivative=*/nullptr, + /*buffer_size_derivative=*/nullptr); } - double output_time = - self_processing_time / parallelism + - ratio_ * OutputTimeForInputs(input_times, /*gradient=*/nullptr); - return ComputeWaitTime(output_time, old_input_time, buffer_size, - /*output_time_derivative=*/nullptr, - /*input_time_derivative=*/nullptr, - /*buffer_size_derivative=*/nullptr); + (*output_times)[long_name()] = result; } // The processing time is the sum of the self processing time and the product @@ -449,8 +537,8 @@ class AsyncKnownRatio : public Node { } double processing_time = ratio_ * TotalProcessingTimeForInputs(*total_processing_times); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } private: @@ -469,44 +557,64 @@ class UnknownRatio : public Node { return std::make_shared(Args{id_, name_, std::move(output)}); } - // The output time is the sum of the self processing time and the product of - // the ratio estimate and the sum of output times of inputs. - double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) + void InputTimeLocked(absl::flat_hash_map* input_times) const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double old_input_time; + if (output_) { + old_input_time = (*input_times)[output_->long_name()]; + } else { + old_input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + if (num_elements_ == 0 || inputs_.empty() || inputs_.front()->num_elements() == 0) { - return SelfProcessingTimeLocked(); + (*input_times)[long_name()] = old_input_time; + return; } - // TODO(jsimsa): The current implementation assumes that the number of input - // elements consumed per output is the same across all inputs. 
std::shared_ptr input = inputs_.front(); double ratio = static_cast(input->num_elements()) / static_cast(num_elements_); - double old_input_time = input_times->back(); - input_times->back() = (old_input_time + SelfProcessingTimeLocked()) / ratio; - auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() { - input_times->back() = old_input_time; - }); - if (gradient) { - absl::flat_hash_map inputs_gradient; - double result = - SelfProcessingTimeLocked() + - ratio * OutputTimeForInputs(input_times, &inputs_gradient); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + - inputs_gradient[kInputTimeDerivativeKey] / ratio; - for (auto& pair : inputs_gradient) { - if (pair.first != kInputTimeDerivativeKey) { - (*gradient)[pair.first] = pair.second * ratio; + double new_input_time = + (old_input_time + SelfProcessingTimeLocked()) / ratio; + (*input_times)[long_name()] = new_input_time; + } + + // The output time is the sum of the self processing time and the product of + // the ratio estimate and the sum of output times of inputs. + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double self_processing_time = SelfProcessingTimeLocked(); + if (num_elements_ == 0 || inputs_.empty() || + inputs_.front()->num_elements() == 0) { + (*output_times)[long_name()] = self_processing_time; + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); } } - return result; + return; } - return SelfProcessingTimeLocked() + - ratio * OutputTimeForInputs(input_times, /*gradient=*/nullptr); + // TODO(jsimsa): The current implementation assumes that the number of input + // elements consumed per output is the same across all inputs. + double ratio = static_cast(inputs_.front()->num_elements()) / + static_cast(num_elements_); + double result = + self_processing_time + ratio * OutputTimeForInputs(*output_times); + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient *= ratio; + } + } + (*output_time_gradients)[long_name()] = + OutputTimeGradientsForInputs(*output_time_gradients); + } + (*output_times)[long_name()] = result; } // The processing time is the sum of the self processing time and the product @@ -520,8 +628,7 @@ class UnknownRatio : public Node { (*processing_times)[long_name()] = self_processing_time; } if (inputs_.empty() || num_elements_ == 0) { - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time)); + (*total_processing_times)[long_name()] = self_processing_time; return; } // TODO(jsimsa): The current implementation assumes that the number of input @@ -531,8 +638,8 @@ class UnknownRatio : public Node { static_cast(num_elements_); double processing_time = ratio * TotalProcessingTimeForInputs(*total_processing_times); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } }; @@ -548,11 +655,30 @@ class Unknown : public Node { return std::make_shared(Args{id_, name_, std::move(output)}); } - // The output time is the sum of output times of inputs. 
- double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) + void InputTimeLocked(absl::flat_hash_map* input_times) const override TF_SHARED_LOCKS_REQUIRED(mu_) { - return OutputTimeForInputs(input_times, gradient); + double input_time; + if (output_) { + input_time = (*input_times)[output_->long_name()]; + } else { + input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + (*input_times)[long_name()] = input_time; + } + + // The output time is the sum of output times of inputs. + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double result = OutputTimeForInputs(*output_times); + (*output_times)[long_name()] = result; + if (gradients) { + (*output_time_gradients)[long_name()] = + OutputTimeGradientsForInputs(*output_time_gradients); + } } // The processing time is the sum of processing times of inputs. @@ -562,8 +688,7 @@ class Unknown : public Node { TF_SHARED_LOCKS_REQUIRED(mu_) { double processing_time = TotalProcessingTimeForInputs(*total_processing_times); - total_processing_times->insert( - std::make_pair(long_name(), processing_time)); + (*total_processing_times)[long_name()] = processing_time; } }; @@ -751,19 +876,21 @@ double Node::ComputeWaitTime(const double& output_time, void Node::CollectTunableParameters( absl::flat_hash_map>* parameters) const { - CollectTunableParametersHelper(parameters); - + tf_shared_lock l(mu_); // Collect tunable parameters from the leaves of the nodes tree to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + tf_shared_lock l(node->mu_); node->CollectTunableParametersHelper(parameters); } + CollectTunableParametersHelper(parameters); } string Node::DebugString() const { absl::flat_hash_map debug_strings; - + tf_shared_lock l(mu_); // Build up the debug string from the leaves of the nodes tree to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + tf_shared_lock l(node->mu_); node->DebugStringHelper(&debug_strings); } DebugStringHelper(&debug_strings); @@ -780,10 +907,35 @@ void Node::FlushMetrics() { metrics_.record_num_elements(num_elements_); } -double Node::OutputTime(std::vector* input_times, - absl::flat_hash_map* gradient) const { +double Node::OutputTime(absl::flat_hash_map* input_times, + absl::flat_hash_map* gradients) const { + // To store the output time gradient w.r.t. input time (if `gradients` is not + // `nullptr`) and the output time for each node. + absl::flat_hash_map output_time_gradients, output_times; tf_shared_lock l(mu_); - return OutputTimeLocked(input_times, gradient); + auto nodes = CollectNodes(TraversalOrder::BFS); + + // Computes and stores input time for each node from the root to leaves of the + // nodes tree. + InputTimeLocked(input_times); + for (const auto& node : nodes) { + tf_shared_lock l(node->mu_); + node->InputTimeLocked(input_times); + } + + std::reverse(nodes.begin(), nodes.end()); + // Computes and stores the output time and output time gradient w.r.t. input + // time (if `gradients` is not `nullptr`) for each node from leaves of the + // nodes tree to the root. 
+ for (const auto& node : nodes) { + tf_shared_lock l(node->mu_); + node->OutputTimeLocked(*input_times, gradients, &output_times, + &output_time_gradients); + } + OutputTimeLocked(*input_times, gradients, &output_times, + &output_time_gradients); + + return output_times[long_name()]; } std::shared_ptr Node::Snapshot(std::shared_ptr output) const { @@ -808,9 +960,10 @@ double Node::SelfProcessingTime() const { double Node::TotalBufferedBytes() const { absl::flat_hash_map total_bytes; - + tf_shared_lock l(mu_); // Compute total buffered bytes from the leaves of the nodes tree to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + tf_shared_lock l(node->mu_); node->TotalBufferedBytesHelper(&total_bytes); } TotalBufferedBytesHelper(&total_bytes); @@ -820,10 +973,11 @@ double Node::TotalBufferedBytes() const { double Node::TotalMaximumBufferedBytes() const { absl::flat_hash_map total_bytes; - + tf_shared_lock l(mu_); // Compute total maximum buffered bytes from the leaves of the nodes tree // to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + tf_shared_lock l(node->mu_); node->TotalMaximumBufferedBytesHelper(&total_bytes); } TotalMaximumBufferedBytesHelper(&total_bytes); @@ -836,17 +990,16 @@ double Node::TotalProcessingTime( // Create a hash map to store the per-element CPU time spent in the subtree // rooted in each node. absl::flat_hash_map total_processing_times; + tf_shared_lock l(mu_); // Computes per-element CPU time spent in the subtree rooted in the node from // the leaves of the nodes tree to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { tf_shared_lock l(node->mu_); node->TotalProcessingTimeLocked(processing_times, &total_processing_times); } - { - tf_shared_lock l(mu_); - TotalProcessingTimeLocked(processing_times, &total_processing_times); - } + TotalProcessingTimeLocked(processing_times, &total_processing_times); + return total_processing_times[long_name()]; } @@ -859,13 +1012,25 @@ double Node::AverageBufferedElementSize() const { } double Node::OutputTimeForInputs( - std::vector* input_times, - absl::flat_hash_map* gradient) const { + const absl::flat_hash_map& output_times) const { double sum = 0; for (auto& input : inputs_) { // Inputs for which autotuning is disabled are excluded. if (input->autotune()) { - sum += input->OutputTime(input_times, gradient); + sum += output_times.at(input->long_name()); + } + } + return sum; +} + +double Node::OutputTimeGradientsForInputs( + const absl::flat_hash_map& output_time_gradients) const { + double sum = 0; + for (auto& input : inputs_) { + // Inputs for which autotuning is disabled are excluded. 
+ if (input->autotune()) { + sum += + gtl::FindWithDefault(output_time_gradients, input->long_name(), 0.0L); } } return sum; @@ -919,12 +1084,12 @@ double Node::SelfProcessingTimeLocked() const { static_cast(num_elements_); } -Node::NodeVector Node::CollectNodes() const { +Node::NodeVector Node::CollectNodes(TraversalOrder order) const + TF_SHARED_LOCKS_REQUIRED(mu_) { NodeVector node_vector; std::list> temp_list; { - tf_shared_lock l(mu_); for (auto& input : inputs_) { node_vector.push_back(input); temp_list.push_back(input); @@ -942,16 +1107,19 @@ Node::NodeVector Node::CollectNodes() const { } } } - std::reverse(node_vector.begin(), node_vector.end()); + + if (order == TraversalOrder::REVERSE_BFS) { + std::reverse(node_vector.begin(), node_vector.end()); + } return node_vector; } void Node::CollectTunableParametersHelper( - absl::flat_hash_map>* parameters) const { + absl::flat_hash_map>* parameters) const + TF_SHARED_LOCKS_REQUIRED(mu_) { if (!autotune_) { return; } - tf_shared_lock l(mu_); for (auto& pair : parameters_) { if (pair.second->state->tunable) { parameters->insert(std::make_pair(long_name(), pair.second)); @@ -959,9 +1127,8 @@ void Node::CollectTunableParametersHelper( } } -void Node::DebugStringHelper( - absl::flat_hash_map* debug_strings) const { - tf_shared_lock l(mu_); +void Node::DebugStringHelper(absl::flat_hash_map* debug_strings) + const TF_SHARED_LOCKS_REQUIRED(mu_) { string result; strings::StrAppend(&result, long_name(), ":\n"); strings::StrAppend(&result, " autotune=", autotune_.load(), "\n"); @@ -1011,13 +1178,13 @@ std::shared_ptr Node::SnapshotHelper( } void Node::TotalBufferedBytesHelper( - absl::flat_hash_map* total_bytes) const { + absl::flat_hash_map* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_) { if (!autotune_) { total_bytes->insert(std::make_pair(long_name(), 0)); return; } - tf_shared_lock l(mu_); double result = 0; auto* parameter = gtl::FindOrNull(parameters_, kBufferSize); if (!parameter) { @@ -1033,13 +1200,13 @@ void Node::TotalBufferedBytesHelper( } void Node::TotalMaximumBufferedBytesHelper( - absl::flat_hash_map* total_bytes) const { + absl::flat_hash_map* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_) { if (!autotune_) { total_bytes->insert(std::make_pair(long_name(), 0)); return; } - tf_shared_lock l(mu_); double result = 0; auto* parameter = gtl::FindOrNull(parameters_, kBufferSize); if (!parameter) { @@ -1181,8 +1348,8 @@ void Model::OptimizeGradientDescent(int64 cpu_budget, int64 ram_budget) { double new_output_time; double new_value; for (int i = 0; i < kMaxIterations; ++i) { - absl::flat_hash_map gradient; - new_output_time = OutputTime(snapshot, &gradient); + absl::flat_hash_map gradients; + new_output_time = OutputTime(snapshot, &gradients); int64 model_parallelism = 0; for (auto& pair : essential_parameters) { model_parallelism += std::round(pair.second->value); @@ -1199,12 +1366,12 @@ void Model::OptimizeGradientDescent(int64 cpu_budget, int64 ram_budget) { for (auto& pair : parameters) { if (pair.second->value != pair.second->max) { max_abs_derivative = - std::max(max_abs_derivative, std::abs(gradient[pair.first])); + std::max(max_abs_derivative, std::abs(gradients[pair.first])); } } for (auto& pair : parameters) { new_value = pair.second->value - - kDescentStep * gradient[pair.first] / max_abs_derivative; + kDescentStep * gradients[pair.first] / max_abs_derivative; // Projection on a feasible interval. 
if (new_value > pair.second->max) { pair.second->value = pair.second->max; @@ -1248,7 +1415,7 @@ void Model::OptimizeHillClimb(int64 cpu_budget, int64 ram_budget) { pair.second->value = pair.second->min; } while (true) { - const double output_time = OutputTime(snapshot, /*gradient=*/nullptr); + const double output_time = OutputTime(snapshot, /*gradients=*/nullptr); bool all_max = true; for (auto& pair : parameters) { if (pair.second->value < pair.second->max) { @@ -1267,7 +1434,7 @@ void Model::OptimizeHillClimb(int64 cpu_budget, int64 ram_budget) { continue; } pair.second->value++; - double new_output_time = OutputTime(snapshot, /*gradient=*/nullptr); + double new_output_time = OutputTime(snapshot, /*gradients=*/nullptr); double delta = output_time - new_output_time; if (delta > best_delta && (delta > kBufferSizeMinDelta || pair.second->name != kBufferSize)) { @@ -1297,15 +1464,18 @@ void Model::OptimizeHillClimb(int64 cpu_budget, int64 ram_budget) { } double Model::OutputTime(std::shared_ptr node, - absl::flat_hash_map* gradient) { - std::vector input_times(1, 0); + absl::flat_hash_map* gradients) { + // To store the input time for each node. + absl::flat_hash_map input_times; + // TODO(jsimsa): Now that we are accounting for buffer size in wait time // computation, assuming that the input is infinitely fast will result in // inaccurate estimates of the output latency. // // We should compute the output latency as a fix-point of the following // equation: `output_time = node(OutputTime(input_times(1, output_time))`. - return node->OutputTime(&input_times, gradient); + + return node->OutputTime(&input_times, gradients); } double Model::TotalBufferedBytes(std::shared_ptr node) { diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index a4af549fad2..e325056f0c4 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -42,11 +42,19 @@ constexpr int64 kAutotune = -1; constexpr char kParallelism[] = "parallelism"; constexpr char kBufferSize[] = "buffer_size"; +// A key used to identify input time gradient. +constexpr char kInputTimeKey[] = "input_time"; + enum class AutotuneAlgorithm { HILL_CLIMB = 0, GRADIENT_DESCENT = 1, }; +enum class TraversalOrder { + BFS = 0, + REVERSE_BFS = 1, +}; + // Represents thread-safe state that can be shared between an input pipeline and // the performance model. struct SharedState { @@ -316,11 +324,11 @@ class Node { // Flushes the metrics recorded by this node. void FlushMetrics() TF_LOCKS_EXCLUDED(mu_); - // Returns the per-element output time for this node and if `gradient` is not - // `nullptr`, collects the gradient of the output time w.r.t. tunable - // parameters of the subtree rooted in this node and the last input time. - double OutputTime(std::vector* input_times, - absl::flat_hash_map* gradient) const + // Returns the per-element output time for this node and if `gradients` is not + // `nullptr`, collects the output time gradient w.r.t. tunable parameters of + // the subtree rooted in this node. + double OutputTime(absl::flat_hash_map* input_times, + absl::flat_hash_map* gradients) const TF_LOCKS_EXCLUDED(mu_); // Returns a copy of this node, making a deep copy of its inputs and a @@ -414,20 +422,34 @@ class Node { // Returns the average size of an element buffered in this node. 
double AverageBufferedElementSize() const TF_SHARED_LOCKS_REQUIRED(mu_); - // Returns the sum of per-element output time for the inputs of this node and - // if `gradient` is not `nullptr`, collects gradients of output times w.r.t. - // tunable parameters and the last input time. - double OutputTimeForInputs(std::vector* input_times, - absl::flat_hash_map* gradient) - const TF_SHARED_LOCKS_REQUIRED(mu_); + // Returns the sum of per-element output time for the tunable inputs of this + // node. + double OutputTimeForInputs( + const absl::flat_hash_map& output_times) const + TF_SHARED_LOCKS_REQUIRED(mu_); - // Returns the per-element output time for this node and if `gradient` is not - // `nullptr`, collects the gradient of the output time w.r.t. tunable - // parameters of the subtree rooted in this node and the last input time. - virtual double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) + // Returns the sum of output time gradient w.r.t. input time for the tunable + // inputs of this node. + double OutputTimeGradientsForInputs( + const absl::flat_hash_map& output_time_gradients) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + // Computes the input time for this node and stores it in `input_times`. + virtual void InputTimeLocked(absl::flat_hash_map* input_times) const TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + // Computes the per-element output time for this node and stores it in + // `output_times`. If `gradients` is not `nullptr`, computes the output time + // gradient w.r.t. tunable parameters of the subtree rooted in this node and + // stores it in `gradients`, also computes the output time gradient w.r.t. + // input time and stores it in `output_time_gradients`. + virtual void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const + TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + // Returns the sum of per-element processing time for the inputs of this node // by adding values for input nodes in `total_processing_times`. Processing // time for a given input is a weighted combination of a statistic based on @@ -452,18 +474,20 @@ class Node { absl::flat_hash_map* total_processing_times) TF_SHARED_LOCKS_REQUIRED(mu_) = 0; - // Returns a vector of nodes of the subtree rooted in this node. - // The nodes are in the reverse breadth-first search order. - NodeVector CollectNodes() const; + // Returns a vector of nodes of the subtree rooted in this node. The nodes are + // either in breadth-first search or reverse breadth-first search order + // depending on the `order` argument. The root node itself is not collected. + NodeVector CollectNodes(TraversalOrder order) const + TF_SHARED_LOCKS_REQUIRED(mu_); // Collect tunable parameters for the node. void CollectTunableParametersHelper( - absl::flat_hash_map>* parameters) - const; + absl::flat_hash_map>* parameters) const + TF_SHARED_LOCKS_REQUIRED(mu_); // Build up debug string for the node and store in the debug strings map. - void DebugStringHelper( - absl::flat_hash_map* debug_strings) const; + void DebugStringHelper(absl::flat_hash_map* debug_strings) + const TF_SHARED_LOCKS_REQUIRED(mu_); // Copy the node and add the (input, copy) pairs to the NodePairList. std::shared_ptr SnapshotHelper(std::shared_ptr clone_base, @@ -471,12 +495,14 @@ class Node { // Compute total buffered bytes for the node and store in the total bytes map. 
void TotalBufferedBytesHelper( - absl::flat_hash_map* total_bytes) const; + absl::flat_hash_map* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_); // Compute total maximum buffered bytes for the node and store in the total // bytes map. void TotalMaximumBufferedBytesHelper( - absl::flat_hash_map* total_bytes) const; + absl::flat_hash_map* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_); // Stores the time passed to the last call to `Node::record_start()` on the // current thread. @@ -619,11 +645,11 @@ class Model { // an element divided by CPU budget. void OptimizeGradientDescent(int64 cpu_budget, int64 ram_budget); - // Collects the output time and if `gradient` is not `nullptr`, the output + // Collects the output time and if `gradients` is not `nullptr`, the output // time gradient w.r.t. tunable parameters of the subtree rooted in the given - // node and the last input time. + // node. double OutputTime(std::shared_ptr node, - absl::flat_hash_map* gradient); + absl::flat_hash_map* gradients); // Collects the processing time for the given node. double TotalProcessingTime(std::shared_ptr node); diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc index 898594b7c81..688dd0083e9 100644 --- a/tensorflow/core/framework/model_test.cc +++ b/tensorflow/core/framework/model_test.cc @@ -44,18 +44,19 @@ TEST_P(AsyncInterleaveManyTest, Model) { async_interleave_many->remove_input(meta_source); }); std::shared_ptr source1 = - model::MakeSourceNode({1, "source1", async_interleave_many}); + model::MakeSourceNode({2, "source1", async_interleave_many}); async_interleave_many->add_input(source1); auto cleanup1 = gtl::MakeCleanup([async_interleave_many, source1]() { async_interleave_many->remove_input(source1); }); std::shared_ptr source2 = - model::MakeSourceNode({2, "source2", async_interleave_many}); + model::MakeSourceNode({3, "source2", async_interleave_many}); async_interleave_many->add_input(source2); auto cleanup2 = gtl::MakeCleanup([async_interleave_many, source2]() { async_interleave_many->remove_input(source2); }); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; EXPECT_EQ(async_interleave_many->TotalBufferedBytes(), 0); EXPECT_EQ(async_interleave_many->TotalMaximumBufferedBytes(), 0); async_interleave_many->record_buffer_event(110, 10); @@ -123,7 +124,8 @@ TEST_P(AsyncKnownRatioTest, Model) { std::shared_ptr source2 = model::MakeSourceNode({2, "source2", async_known_many}); async_known_many->add_input(source2); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; EXPECT_EQ(async_known_many->TotalBufferedBytes(), 0); EXPECT_EQ(async_known_many->TotalMaximumBufferedBytes(), 0); async_known_many->record_buffer_event(110, 10); @@ -194,12 +196,12 @@ TEST(InterleaveManyTest, Model) { model::MakeSourceNode({1, "meta_source", interleave_many}); interleave_many->add_input(meta_source); std::shared_ptr source1 = - model::MakeSourceNode({1, "source1", interleave_many}); + model::MakeSourceNode({2, "source1", interleave_many}); interleave_many->add_input(source1); std::shared_ptr source2 = - model::MakeSourceNode({2, "source2", interleave_many}); + model::MakeSourceNode({3, "source2", interleave_many}); interleave_many->add_input(source2); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; interleave_many->add_processing_time(100); EXPECT_EQ(interleave_many->processing_time(), 100); 
EXPECT_EQ(interleave_many->TotalProcessingTime(/*processing_times=*/nullptr), @@ -238,7 +240,7 @@ TEST_P(KnownRatioTest, Model) { std::shared_ptr source2 = model::MakeSourceNode({2, "source2", known_many}); known_many->add_input(source2); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; source1->add_processing_time(100); EXPECT_EQ(known_many->TotalProcessingTime(/*processing_times=*/nullptr), 0); EXPECT_EQ(known_many->OutputTime(&input_times, nullptr), 0); @@ -286,7 +288,7 @@ INSTANTIATE_TEST_SUITE_P(Test, KnownRatioTest, ::testing::Values(0, 1, 2, 4)); TEST(SourceTest, Model) { std::shared_ptr source = model::MakeSourceNode({0, "source", nullptr}); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; source->add_processing_time(100); EXPECT_EQ(source->processing_time(), 100); EXPECT_EQ(source->TotalProcessingTime(/*processing_times=*/nullptr), 0); @@ -310,7 +312,7 @@ TEST(UnknownRatioTest, Model) { std::shared_ptr source2 = model::MakeSourceNode({2, "source2", unknown_many}); unknown_many->add_input(source2); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; unknown_many->add_processing_time(100); EXPECT_EQ(unknown_many->processing_time(), 100); EXPECT_EQ(unknown_many->TotalProcessingTime(/*processing_times=*/nullptr), 0); @@ -345,7 +347,7 @@ TEST(UnknownTest, Model) { std::shared_ptr source2 = model::MakeSourceNode({2, "source2", unknown}); unknown->add_input(source2); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; source1->add_processing_time(100); EXPECT_EQ(unknown->TotalProcessingTime(/*processing_times=*/nullptr), 0); EXPECT_EQ(unknown->OutputTime(&input_times, nullptr), 0); @@ -390,17 +392,23 @@ class TestNode : public model::Node { return nullptr; } - double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { - return 0; + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) {} + + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + (*output_times)[long_name()] = 0; } void TotalProcessingTimeLocked( absl::flat_hash_map* processing_times, absl::flat_hash_map* total_processing_times) override TF_SHARED_LOCKS_REQUIRED(mu_) { - total_processing_times->insert(std::make_pair(long_name(), 0)); + (*total_processing_times)[long_name()] = 0; } }; @@ -504,7 +512,7 @@ TEST(AsyncInterleaveManyGradientTest, Model) { async_interleave_many->remove_input(meta_source); }); std::shared_ptr source1 = model::MakeAsyncInterleaveManyNode( - {0, "async_interleave_many", nullptr}, + {2, "async_interleave_many", async_interleave_many}, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, @@ -514,12 +522,13 @@ TEST(AsyncInterleaveManyGradientTest, Model) { async_interleave_many->remove_input(source1); }); std::shared_ptr source2 = - model::MakeSourceNode({2, "source2", async_interleave_many}); + model::MakeSourceNode({3, "source2", async_interleave_many}); async_interleave_many->add_input(source2); auto cleanup2 = gtl::MakeCleanup([async_interleave_many, source2]() { async_interleave_many->remove_input(source2); }); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; absl::flat_hash_map> parameters; 
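  // `parameters` maps the `long_name()` of each node that owns a tunable
  // parameter to that `Parameter`, so individual knobs (parallelism, buffer
  // size) can be perturbed one at a time when checking gradients below.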
async_interleave_many->CollectTunableParameters(¶meters); async_interleave_many->record_element(); @@ -532,13 +541,13 @@ TEST(AsyncInterleaveManyGradientTest, Model) { parameters[source1->long_name()]->value = 1; // Test gradient of own parameters. - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; double output_time = - async_interleave_many->OutputTime(&input_times, &gradient); + async_interleave_many->OutputTime(&input_times, &gradients); parameters[async_interleave_many->long_name()]->value += kParameterStep; double new_output_time = async_interleave_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_interleave_many->long_name()], + EXPECT_NEAR(gradients[async_interleave_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); @@ -546,7 +555,7 @@ TEST(AsyncInterleaveManyGradientTest, Model) { parameters[async_interleave_many->long_name()]->value -= kParameterStep; parameters[source1->long_name()]->value += kParameterStep; new_output_time = async_interleave_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[source1->long_name()], + EXPECT_NEAR(gradients[source1->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -565,7 +574,7 @@ TEST_P(AsyncKnownRatioGradientTest, Model) { std::make_shared(parameter_value, nullptr, nullptr), 1, parameter_value)}); std::shared_ptr source1 = model::MakeAsyncKnownRatioNode( - {0, "source1", nullptr}, num_inputs_per_output, + {1, "source1", async_known_many}, num_inputs_per_output, {model::MakeParameter( parameter_name, std::make_shared(parameter_value, nullptr, nullptr), 1, @@ -573,7 +582,8 @@ TEST_P(AsyncKnownRatioGradientTest, Model) { async_known_many->add_input(source1); std::shared_ptr source2 = model::MakeSourceNode({2, "source2", async_known_many}); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; async_known_many->add_input(source2); source1->record_element(); source1->add_processing_time(100); @@ -584,14 +594,14 @@ TEST_P(AsyncKnownRatioGradientTest, Model) { // Test gradient of own parameters. 
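  // The analytic gradient reported by `OutputTime()` is checked against a
  // one-sided finite difference, (new_output_time - output_time) /
  // kParameterStep, to within kComparisonPrecision.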
absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; async_known_many->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; parameters[source1->long_name()]->value = 1; - double output_time = async_known_many->OutputTime(&input_times, &gradient); + double output_time = async_known_many->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = async_known_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); @@ -599,7 +609,7 @@ TEST_P(AsyncKnownRatioGradientTest, Model) { parameters[async_known_many->long_name()]->value -= kParameterStep; parameters[source1->long_name()]->value += kParameterStep; new_output_time = async_known_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[source1->long_name()], + EXPECT_NEAR(gradients[source1->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -614,28 +624,29 @@ TEST(InterleaveManyGradientTest, Model) { std::shared_ptr interleave_many = model::MakeInterleaveManyNode({0, "interleave_many", nullptr}); std::shared_ptr async_known_many = model::MakeAsyncKnownRatioNode( - {0, "async_known_many", nullptr}, num_inputs_per_output, + {1, "async_known_many", interleave_many}, num_inputs_per_output, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, parallelism)}); std::shared_ptr source1 = - model::MakeSourceNode({2, "source1", async_known_many}); + model::MakeSourceNode({2, "source1", interleave_many}); interleave_many->record_element(); interleave_many->add_processing_time(100); interleave_many->add_input(source1); interleave_many->add_input(async_known_many); async_known_many->record_element(); async_known_many->add_processing_time(300); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; interleave_many->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; - double output_time = interleave_many->OutputTime(&input_times, &gradient); + double output_time = interleave_many->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = interleave_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -647,7 +658,7 @@ TEST(KnownRatioGradientTest, Model) { std::shared_ptr known_many = model::MakeKnownRatioNode( {0, "known_many", nullptr}, num_inputs_per_output); std::shared_ptr async_known_many = model::MakeAsyncKnownRatioNode( - {0, "async_known_many", nullptr}, num_inputs_per_output, + {1, "async_known_many", known_many}, num_inputs_per_output, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, @@ -657,15 +668,16 @@ TEST(KnownRatioGradientTest, Model) { known_many->add_input(async_known_many); async_known_many->record_element(); async_known_many->add_processing_time(300); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] 
= input_time; absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; known_many->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; - double output_time = known_many->OutputTime(&input_times, &gradient); + double output_time = known_many->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = known_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -677,7 +689,7 @@ TEST(UnknownRatioGradientTest, Model) { std::shared_ptr unknown_many = model::MakeUnknownRatioNode({0, "unknown_many", nullptr}); std::shared_ptr async_known_many = model::MakeAsyncKnownRatioNode( - {0, "async_known_many", nullptr}, num_inputs_per_output, + {1, "async_known_many", unknown_many}, num_inputs_per_output, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, @@ -687,15 +699,16 @@ TEST(UnknownRatioGradientTest, Model) { unknown_many->add_input(async_known_many); async_known_many->record_element(); async_known_many->add_processing_time(300); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; unknown_many->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; - double output_time = unknown_many->OutputTime(&input_times, &gradient); + double output_time = unknown_many->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = unknown_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -707,7 +720,7 @@ TEST(UnknownGradientTest, Model) { std::shared_ptr unknown = model::MakeUnknownNode({0, "unknown", nullptr}); std::shared_ptr async_known_many = model::MakeAsyncKnownRatioNode( - {0, "async_known_many", nullptr}, num_inputs_per_output, + {1, "async_known_many", unknown}, num_inputs_per_output, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, @@ -717,15 +730,16 @@ TEST(UnknownGradientTest, Model) { unknown->add_input(async_known_many); async_known_many->record_element(); async_known_many->add_processing_time(300); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; unknown->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; - double output_time = unknown->OutputTime(&input_times, &gradient); + double output_time = unknown->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = unknown->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } diff --git a/tensorflow/python/data/kernel_tests/options_test.py 
b/tensorflow/python/data/kernel_tests/options_test.py index dea217367dc..9ab3de788fc 100644 --- a/tensorflow/python/data/kernel_tests/options_test.py +++ b/tensorflow/python/data/kernel_tests/options_test.py @@ -108,7 +108,7 @@ class OptionsTest(test_base.DatasetTestBase, parameterized.TestCase): for _ in range(999): result = result.concatenate(ds) options = dataset_ops.Options() - options.experimental_optimization.autotune = False + options.experimental_optimization.autotune = True result = result.with_options(options) self.assertDatasetProduces(result, [0]*1000) From 47fbe120b00942287075c5175747f0023cc9409d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 16 May 2020 02:02:56 -0700 Subject: [PATCH 0728/1533] Update GraphDef version to 403. PiperOrigin-RevId: 311868879 Change-Id: I65b672d01f04083f61d3fe61f4c9778eedbd7d87 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 8f0967c1eaa..c3a1fe1ed16 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 402 // Updated: 2020/5/15 +#define TF_GRAPH_DEF_VERSION 403 // Updated: 2020/5/16 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 0102aeaa137467d5b273c61390795558d90c4b73 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 16 May 2020 02:02:57 -0700 Subject: [PATCH 0729/1533] compat: Update forward compatibility horizon to 2020-05-16 PiperOrigin-RevId: 311868882 Change-Id: I6909dffc42b26ede9ace7d810b234929a7275ca0 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 29ba7317747..f1c599c15c6 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 15) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 16) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 74396bcd3096c075e24a62f34c3f5d6c0ad3c454 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Sat, 16 May 2020 02:12:45 -0700 Subject: [PATCH 0730/1533] Fix windows build. 
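A brief aside on the gradient tests in model_test.cc above: each analytic gradient returned through the `gradients` map is compared against a one-sided finite difference of the output time. Below is a minimal, self-contained C++ sketch of that style of check; the cost function, step size, and tolerance are made-up placeholders standing in for the real Parameter/OutputTime machinery.

    // Finite-difference gradient check, mirroring the EXPECT_NEAR pattern in
    // model_test.cc. The cost function and constants below are illustrative
    // placeholders, not the real model code.
    #include <cmath>
    #include <cstdio>

    int main() {
      const double kParameterStep = 0.001;       // perturbation applied to one knob
      const double kComparisonPrecision = 0.05;  // tolerance for the comparison

      // Toy stand-in for OutputTime() as a function of a single tunable
      // parameter p (e.g. parallelism).
      auto output_time = [](double p) { return 100.0 / p + 3.0 * p; };

      const double p = 2.0;
      const double analytic = -100.0 / (p * p) + 3.0;  // hand-derived gradient

      // (new_output_time - output_time) / kParameterStep, as in the tests.
      const double estimate =
          (output_time(p + kParameterStep) - output_time(p)) / kParameterStep;

      std::printf("analytic=%.4f estimate=%.4f ok=%d\n", analytic, estimate,
                  std::fabs(analytic - estimate) < kComparisonPrecision);
      return 0;
    }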
PiperOrigin-RevId: 311869440 Change-Id: Ic2b9f5da404bb7049627271c291349c1ad1fec25 --- tensorflow/compiler/aot/tfcompile.bzl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index f0c3e7da0ba..208b01c49d5 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -208,14 +208,14 @@ def tf_library( srcs.append(debug_info) debug_info_flag = " --debug_info=$(location " + debug_info + ")" - default_fast_math_xla_flags = "XLA_FLAGS=\"\ - --xla_cpu_enable_fast_math=true \ - --xla_cpu_fast_math_honor_nans=false \ - --xla_cpu_fast_math_honor_infs=false \ - --xla_cpu_fast_math_honor_functions=false \ - --xla_cpu_fast_math_honor_division=false \ - --xla_cpu_enable_fast_min_max=true \ - $${XLA_FLAGS:-}\" " + default_fast_math_xla_flags = ("XLA_FLAGS='" + + "--xla_cpu_enable_fast_math=true " + + "--xla_cpu_fast_math_honor_nans=false " + + "--xla_cpu_fast_math_honor_infs=false " + + "--xla_cpu_fast_math_honor_functions=false " + + "--xla_cpu_fast_math_honor_division=false " + + "--xla_cpu_enable_fast_min_max=true " + + "$${XLA_FLAGS:-}' ") native.genrule( name = ("gen_" + name), From fd976b2defe66ac368b8cc5c96500bf5fe7b1d12 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Sat, 16 May 2020 02:21:00 -0700 Subject: [PATCH 0731/1533] optimize for int8 add. PiperOrigin-RevId: 311869888 Change-Id: I4009635592941be39aa5c71e185e3eecbc2ec49c --- .../internal/optimized/integer_ops/add.h | 141 +++++++++++------- 1 file changed, 91 insertions(+), 50 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index a9dae4feac5..ff8e4687d58 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -35,58 +35,99 @@ inline void AddElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.input2_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); TFLITE_DCHECK_LT(params.input2_offset, 256); + #ifdef USE_NEON - const int8x8_t output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); - const int8x8_t output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); - for (; i <= size - 8; i += 8) { - const int8x8_t input1_val_original = vld1_s8(input1_data + i); - const int8x8_t input2_val_original = vld1_s8(input2_data + i); - const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); - const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); - const int16x8_t input1_val = - vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); - const int16x8_t input2_val = - vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); - const int16x4_t input1_val_high = vget_high_s16(input1_val); - const int16x4_t input1_val_low = vget_low_s16(input1_val); - const int16x4_t input2_val_high = vget_high_s16(input2_val); - const int16x4_t input2_val_low = vget_low_s16(input2_val); - int32x4_t x11 = vmovl_s16(input1_val_low); - int32x4_t x12 = vmovl_s16(input1_val_high); - int32x4_t x21 = vmovl_s16(input2_val_low); - int32x4_t x22 = vmovl_s16(input2_val_high); - const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); - x11 = vshlq_s32(x11, left_shift_dup); - x12 = vshlq_s32(x12, left_shift_dup); - x21 = vshlq_s32(x21, left_shift_dup); - x22 = vshlq_s32(x22, left_shift_dup); - x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); - 
x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); - x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); - x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); - const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); - const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); - x11 = vshlq_s32(x11, input1_shift_dup); - x12 = vshlq_s32(x12, input1_shift_dup); - x21 = vshlq_s32(x21, input2_shift_dup); - x22 = vshlq_s32(x22, input2_shift_dup); - int32x4_t s1 = vaddq_s32(x11, x21); - int32x4_t s2 = vaddq_s32(x12, x22); - s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); - s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + const int8x16_t output_activation_min_vector = + vdupq_n_s8(params.quantized_activation_min); + const int8x16_t output_activation_max_vector = + vdupq_n_s8(params.quantized_activation_max); + + const int input1_left_shift = params.left_shift + params.input1_shift; + const int input2_left_shift = params.left_shift + params.input2_shift; + const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); + const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + + for (; i <= size - 16; i += 16) { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = + vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = + vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = + vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = + vaddq_s16(input1_val_s16_high, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val_high = + vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); + const int16x8_t input1_val_low = + vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val_low = + vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + int32x4_t x111 = vmovl_s16(input1_val_low_low); + int32x4_t x112 = vmovl_s16(input1_val_low_high); + int32x4_t x121 = vmovl_s16(input1_val_high_low); + int32x4_t x122 = vmovl_s16(input1_val_high_high); + int32x4_t x211 = vmovl_s16(input2_val_low_low); + int32x4_t x212 = vmovl_s16(input2_val_low_high); + int32x4_t x221 = vmovl_s16(input2_val_high_low); + int32x4_t x222 = vmovl_s16(input2_val_high_high); + + x111 = vshlq_s32(x111, input1_left_dup); + x112 = vshlq_s32(x112, input1_left_dup); + x121 = vshlq_s32(x121, input1_left_dup); + x122 = vshlq_s32(x122, input1_left_dup); + x211 = vshlq_s32(x211, input2_left_dup); + x212 = vshlq_s32(x212, input2_left_dup); + x221 = vshlq_s32(x221, input2_left_dup); + x222 = vshlq_s32(x222, input2_left_dup); + x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); + x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); + x121 = vqrdmulhq_n_s32(x121, 
params.input1_multiplier); + x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); + x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); + x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); + x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); + x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); + int32x4_t s11 = vaddq_s32(x111, x211); + int32x4_t s12 = vaddq_s32(x112, x212); + int32x4_t s21 = vaddq_s32(x121, x221); + int32x4_t s22 = vaddq_s32(x122, x222); + s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); + s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); + s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); + s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - s1 = RoundingDivideByPOT(s1, -params.output_shift); - s2 = RoundingDivideByPOT(s2, -params.output_shift); - const int16x4_t s1_narrowed = vmovn_s32(s1); - const int16x4_t s2_narrowed = vmovn_s32(s2); - const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), - vdupq_n_s16(params.output_offset)); - const int8x8_t clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(s))); - vst1_s8(output_data + i, clamped); + s11 = RoundingDivideByPOT(s11, -params.output_shift); + s12 = RoundingDivideByPOT(s12, -params.output_shift); + s21 = RoundingDivideByPOT(s21, -params.output_shift); + s22 = RoundingDivideByPOT(s22, -params.output_shift); + const int16x4_t s11_narrowed = vmovn_s32(s11); + const int16x4_t s12_narrowed = vmovn_s32(s12); + const int16x4_t s21_narrowed = vmovn_s32(s21); + const int16x4_t s22_narrowed = vmovn_s32(s22); + const int16x8_t s1 = vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), + vdupq_n_s16(params.output_offset)); + const int16x8_t s2 = vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), + vdupq_n_s16(params.output_offset)); + const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2)); + + const int8x16_t clamped = + vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, s)); + vst1q_s8(output_data + i, clamped); } #endif // NEON From d70dc548b58c56a6a510b8d676cbc08ffdad3189 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Sat, 16 May 2020 03:51:08 -0700 Subject: [PATCH 0732/1533] Optimize trivial RealDiv ops PiperOrigin-RevId: 311874492 Change-Id: I8084b4a0a913d4585420bff20a21688ae8d41286 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 2 ++ tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc | 15 ++++++++++----- .../mlir/tensorflow/tests/constant-fold.mlir | 9 +++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 82282bb925a..d53bafff638 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -6331,6 +6331,8 @@ If `x` and `y` are reals, this will return the floating-point division. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasCanonicalizer = 1; + + let hasFolder = 1; } def TF_ReciprocalOp : TF_Op<"Reciprocal", [NoSideEffect, SameOperandsAndResultType]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 2007824369c..78623ca3c61 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -110,7 +110,6 @@ static inline bool HasRankAtMost(Value value, int64_t rank) { return !type || type.getRank() <= rank; } - static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; } @@ -462,9 +461,10 @@ LogicalResult FoldOperandsPermutation( namespace { // Folder that returns LHS of an Arithmetic Op if the RHS is a constant // known to be Identity (e.g X+0) -template ::value>::type * = nullptr> +template < + typename OpT, + typename std::enable_if::value>::type * = nullptr> OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, ArrayRef operands) { auto result_op_type = arithmetic_op.getResult().getType(); @@ -479,7 +479,8 @@ OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, // Mul and Div ops have identity value one while AddV2 and SubOp have identity // value zero. int identity = - (std::is_same::value || std::is_same::value); + (std::is_same::value || std::is_same::value || + std::is_same::value); Type element_ty = lhs_type.getElementType(); Attribute identity_attr; @@ -2408,6 +2409,10 @@ void RealDivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +OpFoldResult RealDivOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index bccb8923134..32815956ff7 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -384,6 +384,15 @@ func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> } +func @RemoveTrivialRealDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.0> : tensor<2x2xf32> + %0 = "tf.RealDiv"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialRealDiv + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + func @RemoveTrivialDivBf16RHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { %cst = constant dense<1.0> : tensor<2x2xbf16> %0 = "tf.Div"(%arg0, %cst) : (tensor<2x2xbf16>, tensor<2x2xbf16>) -> tensor<2x2xbf16> From 766f2968fcecbd815e7090aea70ad79d471a1332 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Sat, 16 May 2020 15:00:44 -0700 Subject: [PATCH 0733/1533] Simplify some tests PiperOrigin-RevId: 311910223 Change-Id: I751aa9344c08a490261822dc8010d1704da95a7c --- .../mlir/tensorflow/tests/constant-fold.mlir | 70 ++++++++----------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index 32815956ff7..2119e78bd1e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -302,15 +302,13 @@ func @testTensorListElementShape(%arg0: tensor>>) -> return %0: tensor<2xi32> } -func @RemoveTrivialAdd(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialAdd(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.Add"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Add"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Add"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialAdd - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialAddBf16RHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { @@ -331,26 +329,22 @@ func @RemoveTrivialAddBf16LHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { // CHECK-NEXT: return %arg0 : tensor<2x2xbf16> } -func @RemoveTrivialAddV2(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialAddV2(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.AddV2"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialAddV2 - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } -func @RemoveTrivialSub(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialSub(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Sub"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Sub"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialSub - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialSubInt8(%arg0: tensor<2x2xi8>) -> tensor<2x2xi8> { @@ -362,26 +356,22 @@ func @RemoveTrivialSubInt8(%arg0: tensor<2x2xi8>) -> tensor<2x2xi8> { // CHECK-NEXT: return %arg0 : tensor<2x2xi8> } -func @RemoveTrivialMul(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialMul(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<1.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Mul"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Mul"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialMul - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, 
tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } -func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<1.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Div"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Div"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialDiv - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialRealDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { @@ -420,28 +410,24 @@ func @DivBf16LHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { // CHECK: tf.Div } -func @DontRemoveTrivialAdd(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<2x2xf32> { +func @DontRemoveTrivialAdd(%arg0: tensor<1x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> - %1 = "tf.AddV2"(%0, %cst) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: DontRemoveTrivialAdd // CHECK: %[[CONST:.*]] = constant dense<0.000000e+00> : tensor<2x2xf32> - // CHECK: %[[add:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%[[add]], %[[CONST]]) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %[[CONST]]) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK: return %[[RESULT]] : tensor<2x2xf32> } -func @DontRemoveTrivialAdd2(%arg0: tensor, %arg1: tensor<2x2xf32>) -> tensor { +func @DontRemoveTrivialAdd2(%arg0: tensor) -> tensor { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor<2x2xf32>) -> tensor - %1 = "tf.AddV2"(%0, %cst) : (tensor , tensor<2x2xf32>) -> tensor - return %1 :tensor + %0 = "tf.AddV2"(%arg0, %cst) : (tensor , tensor<2x2xf32>) -> tensor + return %0 :tensor // CHECK-LABEL: DontRemoveTrivialAdd2 // CHECK: %[[CONST:.*]] = constant dense<0.000000e+00> : tensor<2x2xf32> - // CHECK: %[[add:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor<2x2xf32>) -> tensor - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%[[add]], %[[CONST]]) : (tensor, tensor<2x2xf32>) -> tensor + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %[[CONST]]) : (tensor, tensor<2x2xf32>) -> tensor // CHECK: return %[[RESULT]] : tensor } From bf639d750bc3eb22a0ac2affb24772658641e1bd Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Sat, 16 May 2020 16:12:32 -0700 Subject: [PATCH 0734/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/7af0c8559b6d PiperOrigin-RevId: 311914136 Change-Id: I8a04df09178d9dfce79c13c8d14daf4f69048dee --- tensorflow/compiler/mlir/lite/ir/tfl_ops.h | 2 +- .../compiler/mlir/tensorflow/ir/control_flow_ops.h | 2 +- tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h | 2 +- .../mlir/tensorflow/transforms/constant_fold.cc | 2 +- tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h | 2 +- tensorflow/compiler/mlir/xla/ir/chlo_ops.h | 2 +- tensorflow/compiler/mlir/xla/ir/hlo_ops.h | 2 +- tensorflow/compiler/mlir/xla/ir/lhlo_ops.h | 2 +- tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 5 ++--- tensorflow/compiler/xla/service/gpu/ir_emitter.cc | 2 +- third_party/mlir/BUILD | 11 ++++------- 11 files changed, 15 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 0e6a3db1f1b..c7a1504c3b7 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -27,7 +27,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h index 15a4ecfc537..39245425a5a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h @@ -26,7 +26,7 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { namespace TFControlFlow { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 979f506b3b1..88307267ab4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -31,7 +31,7 @@ limitations under the License. 
#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index be35c6caa16..55a0b5c3fd3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -17,7 +17,7 @@ limitations under the License. #include -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/tf_status.h" diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h index 545183a052b..9c98c9b0e19 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h @@ -26,7 +26,7 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.h b/tensorflow/compiler/mlir/xla/ir/chlo_ops.h index 474d4b7d95a..a5337907579 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.h @@ -25,7 +25,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { namespace xla_chlo { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h index 25b2f009cc6..9725a0684f6 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h @@ -29,7 +29,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h index 190c5ff832d..1c4ccaae214 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h @@ -27,7 +27,7 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 5a4c6250293..70dde919afb 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -2875,9 +2875,8 @@ Status IrEmitter::HandleRngGetAndUpdateState(HloInstruction* rng_state) { old_state->getType()->getScalarType(), address->getType()->getPointerAddressSpace())); llvm::StoreInst* store = Store(old_state, address); - store->setAlignment( - llvm::MaybeAlign(IrEmitter::MinimumAlignmentForPrimitiveType( - rng_state->shape().element_type()))); + store->setAlignment(llvm::Align(IrEmitter::MinimumAlignmentForPrimitiveType( + rng_state->shape().element_type()))); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 011eb07d3bd..744cd7b56bf 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -222,7 +222,7 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( // Derive a minimum alignment from the type. The optimizer can increase it // later. store->setAlignment( - llvm::MaybeAlign(ShapeUtil::ByteSizeOfPrimitiveType(element_type))); + llvm::Align(ShapeUtil::ByteSizeOfPrimitiveType(element_type))); return true; } diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 5636bc27cff..58c932ea723 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -788,9 +788,6 @@ cc_library( "lib/Support/*.h", ], exclude = [ - # TODO(herhut): Move JitRunner out of Support so that Support does not - # depend on dialect. - "lib/Support/JitRunner.cpp", # TODO(jpienaar): Move this out, else Support depends on Analysis/ "lib/Support/MlirOptMain.cpp", ], @@ -2232,10 +2229,10 @@ gentbl( cc_library( name = "SideEffects", srcs = [ - "lib/Interfaces/SideEffects.cpp", + "lib/Interfaces/SideEffectInterfaces.cpp", ], hdrs = [ - "include/mlir/Interfaces/SideEffects.h", + "include/mlir/Interfaces/SideEffectInterfaces.h", ], includes = ["include"], deps = [ @@ -2621,8 +2618,8 @@ cc_binary( cc_library( name = "MlirJitRunner", - srcs = ["lib/Support/JitRunner.cpp"], - hdrs = ["include/mlir/Support/JitRunner.h"], + srcs = ["lib/ExecutionEngine/JitRunner.cpp"], + hdrs = ["include/mlir/ExecutionEngine/JitRunner.h"], includes = ["include"], deps = [ ":AllPassesAndDialectsNoRegistration", From 82d70b6763317e59ab84f42c095d96b676b6d4cd Mon Sep 17 00:00:00 2001 From: Anudhyan Boral Date: Sat, 16 May 2020 19:59:08 -0700 Subject: [PATCH 0735/1533] Add F64 Sqrt test. 
PiperOrigin-RevId: 311926087 Change-Id: I2f71e56825ad255a823c5a2fdd593231c474e6b1 --- tensorflow/compiler/xla/client/lib/math_test.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index 9b8156efe5b..cb79b2ef7db 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -236,6 +236,19 @@ XLA_TEST_F(MathTest, SqrtF32) { ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); } +XLA_TEST_F(MathTest, SqrtF64) { + XlaBuilder builder(TestName()); + Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F64); + + std::unique_ptr zero_data = + client_->TransferToServer(zero_literal).ConsumeValueOrDie(); + + XlaOp zero = Parameter(&builder, 0, zero_literal.shape(), "zero"); + Sqrt(zero); + + ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); +} + #ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64 XLA_TEST_F(MathTest, ErfInvF64) { XlaBuilder builder(TestName()); From 93955171ee302e272ef59e286cf8c5b3060112ec Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 17 May 2020 02:02:56 -0700 Subject: [PATCH 0736/1533] Update GraphDef version to 404. PiperOrigin-RevId: 311945310 Change-Id: Ieda91ca5df65ea1f26085b5d8420b954815f0c7d --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index c3a1fe1ed16..63501a14f56 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 403 // Updated: 2020/5/16 +#define TF_GRAPH_DEF_VERSION 404 // Updated: 2020/5/17 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From c4a9de96742d85a8772e3868fc2f13955a195e18 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 17 May 2020 02:02:57 -0700 Subject: [PATCH 0737/1533] compat: Update forward compatibility horizon to 2020-05-17 PiperOrigin-RevId: 311945312 Change-Id: I16d03e29d3c39925d112516edcb48d3f4c16c0d1 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index f1c599c15c6..2a99a0774ad 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 16) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 17) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From e480d8f7ff66dbab239019c9f202748f6fa1f661 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Sun, 17 May 2020 21:07:20 -0700 Subject: [PATCH 0738/1533] Add a compile flag (i.e. adding "--define tflite_with_xnnpack=true" when using bazel to build, which corresponds to defining macro TFLITE_BUILD_WITH_XNNPACK_DELEGATE) to apply XNNPACK delegate in TFLite runtime. 
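For context on what the flag enables: the delegate the runtime injects for float models can also be applied by hand through the public XNNPACK delegate API. A rough sketch follows; the model path, thread count, and error handling are placeholders, and ownership of the delegate is only noted in a comment.

    #include <memory>

    #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
    #include "tensorflow/lite/interpreter.h"
    #include "tensorflow/lite/kernels/register.h"
    #include "tensorflow/lite/model.h"

    std::unique_ptr<tflite::Interpreter> BuildWithXnnpack(const char* path) {
      auto model = tflite::FlatBufferModel::BuildFromFile(path);
      if (!model) return nullptr;
      tflite::ops::builtin::BuiltinOpResolver resolver;
      std::unique_ptr<tflite::Interpreter> interpreter;
      if (tflite::InterpreterBuilder(*model, resolver)(&interpreter) != kTfLiteOk)
        return nullptr;
      TfLiteXNNPackDelegateOptions opts = TfLiteXNNPackDelegateOptionsDefault();
      opts.num_threads = 2;  // 0 or 1 keeps XNNPACK single-threaded
      TfLiteDelegate* delegate = TfLiteXNNPackDelegateCreate(&opts);
      // The delegate must outlive the interpreter; pair this with
      // TfLiteXNNPackDelegateDelete() once the interpreter is destroyed.
      if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk)
        return nullptr;
      return interpreter;
    }

With the new --define, none of this is needed for float graphs: InterpreterBuilder applies the delegate itself via MaybeCreateXNNPACKDelegate().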
This is mainly to support Windows where weak symbols are not supported. PiperOrigin-RevId: 312011534 Change-Id: I27c6b206b8aa8ded2d2671c2fca843574f75752b --- tensorflow/lite/BUILD | 32 ++++++++++++ tensorflow/lite/interpreter_builder.cc | 26 +++------- .../lite/tflite_with_xnnpack_optional.cc | 52 +++++++++++++++++++ .../lite/tflite_with_xnnpack_optional.h | 26 ++++++++++ 4 files changed, 117 insertions(+), 19 deletions(-) create mode 100644 tensorflow/lite/tflite_with_xnnpack_optional.cc create mode 100644 tensorflow/lite/tflite_with_xnnpack_optional.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 14babee2da7..ef25f03562f 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -246,6 +246,7 @@ cc_library( ":minimal_logging", ":simple_memory_arena", ":string", + ":tflite_with_xnnpack_optional", ":type_to_tflitetype", ":util", ":version", @@ -311,6 +312,8 @@ cc_library( ], ) +# Link this library to inject XNNPACK delegate to TFLite runtime automatically +# by utilizing the weak symbols if they're supported by the platform. cc_library( name = "tflite_with_xnnpack", srcs = ["tflite_with_xnnpack.cc"], @@ -323,6 +326,35 @@ cc_library( alwayslink = 1, ) +# Enables applying XNNPACK delegate for float models in TFLite runtime. +# WARNING: This build flag is experimental and subject to change. +config_setting( + name = "tflite_with_xnnpack_enabled", + values = {"define": "tflite_with_xnnpack=true"}, +) + +cc_library( + name = "tflite_with_xnnpack_optional", + srcs = ["tflite_with_xnnpack_optional.cc"], + hdrs = [ + "core/macros.h", + "tflite_with_xnnpack_optional.h", + ], + copts = tflite_copts() + TFLITE_DEFAULT_COPTS, + defines = select({ + ":tflite_with_xnnpack_enabled": ["TFLITE_BUILD_WITH_XNNPACK_DELEGATE"], + "//conditions:default": [], + }), + deps = [ + "//tensorflow/lite/c:common", + ] + select({ + ":tflite_with_xnnpack_enabled": [ + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", + ], + "//conditions:default": [], + }), +) + cc_test( name = "string_util_test", size = "small", diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc index fb87702fd13..43d81ef0770 100644 --- a/tensorflow/lite/interpreter_builder.cc +++ b/tensorflow/lite/interpreter_builder.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/tflite_with_xnnpack_optional.h" #include "tensorflow/lite/util.h" #include "tensorflow/lite/version.h" @@ -108,27 +109,14 @@ TfLiteStatus ParseSparseIndexVector(const DimensionMetadata* src, const char* kEmptyTensorName = ""; -#if TFLITE_HAS_ATTRIBUTE_WEAK // Using weak symbols to create a delegate allows automatic injection of the // delegate simply by adding it as a dependency. - // For flex delegate, see also the strong override in // lite/delegates/flex/delegate.cc. TFLITE_ATTRIBUTE_WEAK Interpreter::TfLiteDelegatePtr AcquireFlexDelegate() { return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); } -// For XNNPACK delegate, see also the strong override in -// lite/tflite_with_xnnpack.cc. 
-TFLITE_ATTRIBUTE_WEAK Interpreter::TfLiteDelegatePtr AcquireXNNPACKDelegate( - int num_threads) { - return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); -} -#else -Interpreter::TfLiteDelegatePtr (*AcquireFlexDelegate)() = nullptr; -Interpreter::TfLiteDelegatePtr (*AcquireXNNPACKDelegate)(int) = nullptr; -#endif - namespace impl { InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model, @@ -541,17 +529,17 @@ TfLiteStatus InterpreterBuilder::ParseTensors( TfLiteStatus InterpreterBuilder::ApplyDelegates(Interpreter* interpreter, int num_threads) { // First, apply XNNPACK delegate if applicable. - if (AcquireXNNPACKDelegate && num_fp32_tensors_ > 0) { - if (auto xnnpack_delegate = AcquireXNNPACKDelegate(num_threads)) { - // The execution will fall back to default implementation if the XNNPACK - // delegate fails to be applied. Therefore, we ignore the return status - // here and let it fall through the rest of the code. + if (num_fp32_tensors_ > 0) { + // The execution will fall back to default implementation if the XNNPACK + // delegate fails to be applied. Therefore, we ignore the return status + // here and let it fall through the rest of the code. + if (auto xnnpack_delegate = MaybeCreateXNNPACKDelegate(num_threads)) { interpreter->ModifyGraphWithDelegate(std::move(xnnpack_delegate)); } } // Secondly, apply Flex delegate if applicable. - if (has_flex_op_ && AcquireFlexDelegate) { + if (has_flex_op_) { if (auto flex_delegate = AcquireFlexDelegate()) { return interpreter->ModifyGraphWithDelegate(std::move(flex_delegate)); } diff --git a/tensorflow/lite/tflite_with_xnnpack_optional.cc b/tensorflow/lite/tflite_with_xnnpack_optional.cc new file mode 100644 index 00000000000..31d4ff50f28 --- /dev/null +++ b/tensorflow/lite/tflite_with_xnnpack_optional.cc @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/tflite_with_xnnpack_optional.h" + +#include "tensorflow/lite/core/macros.h" + +#ifdef TFLITE_BUILD_WITH_XNNPACK_DELEGATE +#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" +#endif + +namespace tflite { + +using TfLiteDelegatePtr = + std::unique_ptr; + +#ifndef TFLITE_BUILD_WITH_XNNPACK_DELEGATE +// Using weak symbols to create a delegate allows automatic injection of the +// delegate simply by adding it as a dependency. See the strong override in +// lite/tflite_with_xnnpack.cc, +TFLITE_ATTRIBUTE_WEAK TfLiteDelegatePtr +AcquireXNNPACKDelegate(int num_threads) { + return TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); +} +#endif + +#ifdef TFLITE_BUILD_WITH_XNNPACK_DELEGATE +TfLiteDelegatePtr MaybeCreateXNNPACKDelegate(int num_threads) { + auto opts = TfLiteXNNPackDelegateOptionsDefault(); + // Note that we don't want to use the thread pool for num_threads == 1. + opts.num_threads = num_threads > 1 ? 
num_threads : 0; + return TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&opts), + TfLiteXNNPackDelegateDelete); +} +#else +TfLiteDelegatePtr MaybeCreateXNNPACKDelegate(int num_threads) { + return AcquireXNNPACKDelegate(num_threads); +} +#endif + +} // namespace tflite diff --git a/tensorflow/lite/tflite_with_xnnpack_optional.h b/tensorflow/lite/tflite_with_xnnpack_optional.h new file mode 100644 index 00000000000..afbdbd17356 --- /dev/null +++ b/tensorflow/lite/tflite_with_xnnpack_optional.h @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_TFLITE_WITH_XNNPACK_OPTIONAL_H_ +#define TENSORFLOW_LITE_TFLITE_WITH_XNNPACK_OPTIONAL_H_ +#include + +#include "tensorflow/lite/c/common.h" + +namespace tflite { +std::unique_ptr +MaybeCreateXNNPACKDelegate(int num_threads); +} // namespace tflite + +#endif // TENSORFLOW_LITE_TFLITE_WITH_XNNPACK_OPTIONAL_H_ From c5fbab166f3b983c39efc997e63a11c1bd7f549e Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Mon, 18 May 2020 00:02:44 -0700 Subject: [PATCH 0739/1533] Fix Core ML delegate framework's include PiperOrigin-RevId: 312025787 Change-Id: I00121199e2363d307cd52c1b2bfa4cbc66d36831 --- tensorflow/lite/experimental/ios/BUILD.apple | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index 5c954bc3de8..a29e8bd6ed5 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -11,17 +11,6 @@ package( licenses = ["notice"], # Apache 2.0 ) -genrule( - name = "strip_coreml_include_hdr", - srcs = ["//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h"], - outs = ["coreml_delegate.h"], - cmd = """ - sed 's/#include \".*common.h"/#include \"common.h\"/' \ - "$(location //tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h)" \ - > "$@" - """, -) - TFL_FRAMEWORK_HDRS = [ "//tensorflow/lite/delegates/gpu:metal_delegate.h", "//tensorflow/lite/c:c_api.h", @@ -57,6 +46,17 @@ ios_static_framework( ], ) +genrule( + name = "strip_coreml_include_hdr", + srcs = ["//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h"], + outs = ["coreml_delegate.h"], + cmd = """ + sed "s|#include \".*common.h\"|#include \"TensorFlowLiteC/common.h\"|"\ + "$(location //tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h)"\ + > "$@" + """, +) + # This target builds the Core ML delegate as a separate static framework, which # does not include the TensorFlow Lite runtime. 
As this target does not contain # TensorFlow Lite runtime, it is intended to be linked along with the From ea4ef0e6faf651c9f76ef90848dc62d8aa660ac1 Mon Sep 17 00:00:00 2001 From: David Rim Date: Mon, 18 May 2020 00:03:24 -0700 Subject: [PATCH 0740/1533] Bumps llvm version PiperOrigin-RevId: 312025889 Change-Id: I9c2a75e34bbfb2b9f6afaf0398c9cfde6870ac3b --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 404d253e8bd..452152efacf 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "9d4b4f344d8ea917e082cf58d66b71c0171e1650" - LLVM_SHA256 = "36e4470b5656cea3e0afb218edbdd96376fcb51dc2c5ed887b21237068baee41" + LLVM_COMMIT = "7af0c8559b6d9426dd5e977370516d2baa4c206f" + LLVM_SHA256 = "4c5efbc48755f9983a8522eddd6e448f0b93e3e75a56a507c1ecb44d367db6d5" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 344f8982507cd03ba79b7e21fef6f115451ee497 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 00:28:56 -0700 Subject: [PATCH 0741/1533] Slightly optimize quantized add. PiperOrigin-RevId: 312028385 Change-Id: Ie1fbb3071e4e258c24db78440e1275168694fda9 --- .../lite/kernels/internal/optimized/integer_ops/add.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index ff8e4687d58..95b78b3a6b3 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -47,6 +47,9 @@ inline void AddElementwise(int size, const ArithmeticParams& params, const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset); + for (; i <= size - 16; i += 16) { const int8x16_t input1_val_original = vld1q_s8(input1_data + i); const int8x16_t input2_val_original = vld1q_s8(input2_data + i); @@ -61,13 +64,13 @@ inline void AddElementwise(int size, const ArithmeticParams& params, const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); const int16x8_t input1_val_high = - vaddq_s16(input1_val_s16_high, vdupq_n_s16(params.input1_offset)); + vaddq_s16(input1_val_s16_high, input1_offset_dup); const int16x8_t input2_val_high = - vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); + vaddq_s16(input2_val_s16_high, input2_offset_dup); const int16x8_t input1_val_low = - vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); + vaddq_s16(input1_val_s16_low, input1_offset_dup); const int16x8_t input2_val_low = - vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); + vaddq_s16(input2_val_s16_low, input2_offset_dup); const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); From 76853076b382474ff35f4561fde231b06a5ccdfa Mon Sep 17 00:00:00 
2001 From: David Rim Date: Mon, 18 May 2020 01:32:19 -0700 Subject: [PATCH 0742/1533] Add optimized MatrixBatchVectorMultiplyAccumulate for asymmetric inputs for sse PiperOrigin-RevId: 312035618 Change-Id: I5ae85ae9b0b646d2fe1e665c25aae6b99622dd2b --- .../internal/optimized/neon_tensor_utils.cc | 35 +++-- .../internal/optimized/neon_tensor_utils.h | 10 -- .../optimized/neon_tensor_utils_impl.h | 6 - .../internal/optimized/sse_tensor_utils.cc | 129 ++++++++++-------- .../internal/optimized/sse_tensor_utils.h | 22 +-- .../optimized/sse_tensor_utils_impl.h | 10 +- .../reference/portable_tensor_utils.cc | 29 ---- .../reference/portable_tensor_utils.h | 10 -- .../reference/portable_tensor_utils_impl.h | 6 - .../kernels/internal/tensor_utils_test.cc | 8 +- 10 files changed, 110 insertions(+), 155 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 4c90cd86a56..c96f298370a 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -1466,16 +1466,20 @@ void NeonMatrixBatchVectorMultiplyAccumulate( int i = 0; int32_t* scratch_ptr = scratch; for (; i <= total_size - 8; i += 8, result += 8) { - float batch_scaling_factor0 = scaling_factors[i / m_rows]; - float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows]; - if (per_channel_scale) { - batch_scaling_factor0 *= per_channel_scale[i % m_rows]; - batch_scaling_factor1 *= per_channel_scale[(i + 4) % m_rows]; - } + const float batch_scaling_factor0 = scaling_factors[i / m_rows]; + const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows]; const int batch_input_offset0 = -input_offset[i / m_rows]; const int batch_input_offset1 = -input_offset[(i + 4) / m_rows]; - const float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0); - const float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1); + float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0); + float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1); + if (per_channel_scale) { + const float32x4_t per_channel_scale0 = + vld1q_f32(&per_channel_scale[i % m_rows]); + const float32x4_t per_channel_scale1 = + vld1q_f32(&per_channel_scale[(i + 4) % m_rows]); + scaling_factor0 = vmulq_f32(scaling_factor0, per_channel_scale0); + scaling_factor1 = vmulq_f32(scaling_factor1, per_channel_scale1); + } const int32x4_t input_offset0 = vdupq_n_s32(batch_input_offset0); const int32x4_t input_offset1 = vdupq_n_s32(batch_input_offset1); const int32x4_t row_sum0 = vld1q_s32(row_sums + (i % m_rows)); @@ -1498,7 +1502,10 @@ void NeonMatrixBatchVectorMultiplyAccumulate( scratch_ptr += i; for (; i < total_size; i++) { - const float batch_scaling_factor = scaling_factors[i / m_rows]; + float batch_scaling_factor = scaling_factors[i / m_rows]; + if (per_channel_scale) { + batch_scaling_factor *= per_channel_scale[i % m_rows]; + } const int32_t zero_point = input_offset[i / m_rows]; int32_t dotprod = *(scratch_ptr++); dotprod -= row_sums[i % m_rows] * zero_point; @@ -1514,16 +1521,6 @@ void NeonMatrixBatchVectorMultiplyAccumulate( per_channel_scale, input_offset, row_sums); } -void NeonMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - 
NeonMatrixBatchVectorMultiplyAccumulateImpl( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - per_channel_scale, input_offset, nullptr); -} - inline int64x2x2_t MulAdd(int32x4_t acc, int32x4_t lhs, int32x4_t rhs) { int64x2x2_t result; const int64x2_t lhs_low = vmovl_s32(vget_low_s32(lhs)); diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index b978bf5f3bb..86951fcd559 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -55,16 +55,6 @@ void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, vectors, scaling_factors, n_batch, scratch, result, context); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, - vectors, scaling_factors, n_batch, result, per_channel_scale, - input_offset); -} - void MatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* scaling_factors, diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h index 1b043390c22..1554d07a61c 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h @@ -62,12 +62,6 @@ void NeonMatrixBatchVectorMultiplyAccumulate( const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, CpuBackendContext* context); -void NeonMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset); - void NeonApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, const int32_t* bias, int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, int32_t variance_limit, diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc index 7fb69e7b4f4..80cc14c6d26 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/compatibility.h" namespace tflite { @@ -89,18 +90,24 @@ float GetFloatVectorElement(__m128 v) { } // namespace -void SseMatrixBatchVectorMultiplyAccumulate( +void SseMatrixBatchVectorMultiplyAccumulateImpl( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result) { + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, const int32_t* row_sums) { for (std::intptr_t batch = 0; batch < n_batch; ++batch) { const float batch_scaling_factor = scaling_factors[batch]; + const int32_t batch_offset = input_offset ? input_offset[batch] : 0; // Compute dot-product for every column. for (std::intptr_t row = 0; row < m_rows; ++row) { // Get the address of the first element of the row. const int8_t* __restrict__ row_ptr = matrix + row * m_cols; - + const float row_scale = + per_channel_scale ? per_channel_scale[row] * batch_scaling_factor + : batch_scaling_factor; + const int32_t row_offset = + row_sums && batch_offset ? batch_offset * row_sums[row] : 0; // Initialize the dot product sum for the row to 0. __m128i dotprod_32x4 = _mm_setzero_si128(); std::intptr_t col = 0; @@ -152,8 +159,10 @@ void SseMatrixBatchVectorMultiplyAccumulate( for (; col < m_cols; ++col) { sum += row_ptr[col] * vectors[col]; } // for col - - *result += sum * batch_scaling_factor; + if (row_offset) { + sum -= row_offset; + } + *result += sum * row_scale; ++result; } // for row @@ -165,56 +174,30 @@ void SseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset) { - if (input_offset == nullptr) { - SseMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, - scaling_factors, n_batch, result); - return; - } - static constexpr std::intptr_t kBlockSize = 16; - for (std::intptr_t batch = 0; batch < n_batch; ++batch) { - const float batch_scaling_factor = scaling_factors[batch]; - for (std::intptr_t row = 0; row < m_rows; ++row) { - const int8_t* __restrict__ row_ptr = matrix + row * m_cols; - float scale = batch_scaling_factor; - if (per_channel_scale != nullptr) { - scale *= per_channel_scale[row]; - } - __m128i dotprod_32x4 = _mm_setzero_si128(); - __m128i row_sum_16x8 = _mm_setzero_si128(); - std::intptr_t col = 0; - for (; col < (m_cols & ~(kBlockSize - 1)); col += kBlockSize) { - const __m128i vec_8x16 = - _mm_loadu_si128(reinterpret_cast(vectors + col)); - const __m128i row_8x16 = - _mm_loadu_si128(reinterpret_cast(row_ptr + col)); - // dotprod += vec · row - dotprod_32x4 = - _mm_add_epi32(dotprod_32x4, DotProdInt8x4x4(vec_8x16, row_8x16)); + float* __restrict__ result) { + SseMatrixBatchVectorMultiplyAccumulateImpl( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); +} - // Pairwise add 16x 8-bit values; equivalently, multipy-add with 1. - // Result is 8x 16-bit values. 
- const __m128i row_16x8 = _mm_maddubs_epi16(_mm_set1_epi8(1), row_8x16); - row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); - } // for col - // Pairwise add 8x 16-bit values; equivalently, multipy-add with 1. - // Result is 4x 32-bit values. - const __m128i row_sum_32x4 = - _mm_madd_epi16(row_sum_16x8, _mm_set1_epi16(1)); - int32_t sum = ReduceInt32x4(dotprod_32x4); - int32_t row_sum = ReduceInt32x4(row_sum_32x4); - // Postamble loop. - for (; col < m_cols; ++col) { - sum += row_ptr[col] * vectors[col]; - row_sum += row_ptr[col]; - } // for col - sum -= row_sum * input_offset[batch]; - *result += sum * scale; - ++result; - } // for row - vectors += m_cols; - } // for batch +void SseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context) { + if ((input_offset != nullptr) && (!compute_row_sums || *compute_row_sums)) { + memset(row_sums, 0, sizeof(int32_t) * m_rows); + SseReductionSumVector(matrix, row_sums, m_rows, m_cols); + if (compute_row_sums) { + *compute_row_sums = false; + } + } + SseMatrixBatchVectorMultiplyAccumulateImpl( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + per_channel_scale, input_offset, row_sums); } namespace { @@ -347,6 +330,44 @@ void SseSparseMatrixBatchVectorMultiplyAccumulate( } // for batch } +void SseReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + const int output_size, const int reduction_size) { + static constexpr std::intptr_t kBlockSize = 16; + for (std::intptr_t row = 0; row < output_size; ++row) { + const int8_t* __restrict__ row_ptr = input_vector + row * reduction_size; + __m128i row_sum_16x8 = _mm_setzero_si128(); + std::intptr_t col = 0; + for (; col < (reduction_size & ~(kBlockSize - 1)); col += kBlockSize) { + const __m128i row_8x16 = + _mm_loadu_si128(reinterpret_cast(row_ptr + col)); + const __m128i row_16x8 = _mm_maddubs_epi16(_mm_set1_epi8(1), row_8x16); + row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); + } // for col +#ifdef __SSE4_1__ + // Postamble for 8x 8-bit inputs. + if (col < (reduction_size & ~7)) { + // _mm_loadu_si64 not supported in gcc versions < 9, breaks kokoro build. + const __m128i row_16x8 = _mm_cvtepi8_epi16( + _mm_loadl_epi64(reinterpret_cast(row_ptr + col))); + // dotprod += vec · row + row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); + col += 8; + } +#endif + const __m128i row_sum_32x4 = + _mm_madd_epi16(row_sum_16x8, _mm_set1_epi16(1)); + int32_t row_sum = ReduceInt32x4(row_sum_32x4); +#if defined(__SSE4_1__) && defined(__clang__) + // SSE 4.1: Don't try to unroll and vectorize this, already done above. 
+#pragma clang loop unroll(disable) vectorize(disable) +#endif + for (; col < reduction_size; col++) { + row_sum += *(row_ptr + col); + } + *(output_vector + row) += row_sum; + } +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index 986e70a7823..224d811e862 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -59,10 +59,9 @@ void MatrixBatchVectorMultiplyAccumulate( int n_batch, float* __restrict__ result, const float* per_channel_scale, const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, CpuBackendContext* context) { - PortableMatrixBatchVectorMultiplyAccumulate( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - per_channel_scale, input_offset, scratch, row_sums, compute_row_sums, - context); + SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, + vectors, scaling_factors, n_batch, result, per_channel_scale, + input_offset, scratch, row_sums, compute_row_sums, context); } void MatrixBatchVectorMultiplyAccumulate( @@ -75,17 +74,6 @@ void MatrixBatchVectorMultiplyAccumulate( vectors, scaling_factors, n_batch, result); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, - const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset) { - SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, - vectors, scaling_factors, n_batch, result, per_channel_scale, - input_offset); -} - void SparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, @@ -315,8 +303,8 @@ void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, int output_size, int reduction_size) { - NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size, - reduction_size); + SSE_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size, + reduction_size); } void MeanStddevNormalization(const float* input_vector, float* output_vector, diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h index 1996b1f30a9..c5ede624762 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h @@ -17,6 +17,8 @@ limitations under the License. 
#include +#include "tensorflow/lite/kernels/cpu_backend_context.h" + #if defined(_MSC_VER) #define __restrict__ __restrict #endif @@ -38,8 +40,9 @@ void SseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset); + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context); // Matrix multiplication for quantized values using symmetric quantization. // Sparse version. @@ -49,6 +52,9 @@ void SseSparseMatrixBatchVectorMultiplyAccumulate( const float* __restrict__ scaling_factors, int n_batch, float* __restrict__ result); +void SseReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + const int output_size, const int reduction_size); + #endif // __SSSE3__ } // namespace tensor_utils diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 0e66dfee191..4f6db290d4f 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -161,35 +161,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate( } // for batch } -void PortableMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) { - const float batch_scaling_factor = scaling_factors[batch]; - const float batch_offset = input_offset[batch]; - const int8_t* row_ptr = matrix; - for (int row = 0; row < m_rows; ++row) { - int32_t dotprod = 0; - float scale = batch_scaling_factor; - if (per_channel_scale) { - scale *= per_channel_scale[row]; - } -#if defined(__GNUC__) - // Prefetch the row to cache. 
- __builtin_prefetch(row_ptr, 0 /* prefetch for read */, - 3 /* temporal locality */); -#endif - for (int col = 0; col < m_cols; ++col, ++row_ptr) { - dotprod += (*row_ptr) * (vectors[col] - batch_offset); - } // for col - *result += dotprod * scale; - ++result; - } // for row - } // for batch -} - void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* scaling_factors, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index f2e6c9b4f7d..0fd7a407595 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -98,16 +98,6 @@ void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, scaling_factors, n_batch, result); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, - scaling_factors, n_batch, result, - per_channel_scale, input_offset); -} - void SparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 6c15a6cd919..34767ccd942 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -83,12 +83,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate( int n_batch, int32_t* scratch, float* __restrict__ result, CpuBackendContext* context); -void PortableMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset); - void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index 3ad59acdb68..878cf0d2618 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -1136,11 +1136,15 @@ std::vector TestPerChannelDotprodMatrixBatchVectorMultiply( bool is_per_channel = true) { MatrixVectorData data = SetupMatrixVectorData(rows, cols, batch, negative, is_per_channel); - + std::vector scratch(rows * batch); + std::vector row_sums(rows); + bool compute_row_sums = true; + CpuBackendContext context; MatrixBatchVectorMultiplyAccumulate( data.matrix.data(), rows, cols, data.vectors.data(), data.scale_factors.data(), batch, &data.results[0], - data.per_channel_scales.data(), data.input_offsets.data()); + data.per_channel_scales.data(), data.input_offsets.data(), scratch.data(), + row_sums.data(), &compute_row_sums, &context); return data.results; 
} From de8a517f4068589fb5cd82c8a8a8dc3d5e101c0e Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Mon, 18 May 2020 01:58:56 -0700 Subject: [PATCH 0743/1533] fix escape in Core ML header processing PiperOrigin-RevId: 312038605 Change-Id: I422e343729a7f27808c3f9b908460faeeaa58ce5 --- tensorflow/lite/experimental/ios/BUILD.apple | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index a29e8bd6ed5..7e2a3623af1 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -51,7 +51,7 @@ genrule( srcs = ["//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h"], outs = ["coreml_delegate.h"], cmd = """ - sed "s|#include \".*common.h\"|#include \"TensorFlowLiteC/common.h\"|"\ + sed 's|#include ".*common.h"|#include "TensorFlowLiteC/common.h"|'\ "$(location //tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h)"\ > "$@" """, From 647ef2db28957b9cb1d0df66ee9a2a37ca21ca15 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 02:02:53 -0700 Subject: [PATCH 0744/1533] Update GraphDef version to 405. PiperOrigin-RevId: 312039077 Change-Id: I03ac966118084eb80d817cdfe98b175c75bf86aa --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 63501a14f56..7abbcd5474c 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 404 // Updated: 2020/5/17 +#define TF_GRAPH_DEF_VERSION 405 // Updated: 2020/5/18 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 72c50430aa5347e6c9bc1a1927a4e13db0dc766a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 02:02:54 -0700 Subject: [PATCH 0745/1533] compat: Update forward compatibility horizon to 2020-05-18 PiperOrigin-RevId: 312039082 Change-Id: I03c04d8d9a395087e866a67ca58a263150b3f754 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 2a99a0774ad..88a26661f82 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
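# Editorial aside, not part of compat.py or of this change: downstream code
# usually consumes this horizon through `compat.forward_compatible`, e.g.
#
#   from tensorflow.python.compat import compat
#   if compat.forward_compatible(2020, 6, 1):
#     pass  # rely on the newer op or attribute
#   else:
#     pass  # keep emitting the older form
#
# The guard starts returning True once the horizon below (shifted by any
# TF_FORWARD_COMPATIBILITY_DELTA_DAYS offset) moves past the given date;
# the date 2020, 6, 1 here is only an example.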
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 17) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 18) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From b2f3e8f5639a9370c9f8987a733ab3496eb87a97 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 18 May 2020 06:16:05 -0700 Subject: [PATCH 0746/1533] numerics_test.py: Move tfdbg2-specific test methods to debug_v2_ops_test.py PiperOrigin-RevId: 312065934 Change-Id: Idf576fd41ae96ed19f815bcce8848eabef036834 --- .../python/debug/lib/debug_v2_ops_test.py | 34 ++++++++++++++ .../python/kernel_tests/numerics_test.py | 46 ------------------- 2 files changed, 34 insertions(+), 46 deletions(-) diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index c76cbeeac6c..07721920f63 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_debug_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest @@ -680,6 +681,39 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): self.assertAllEqual(tensor_1, tensor_2) self.assertEqual(tensor_id_1, tensor_id_2) + def testCheckNumericsV2OpNegativeAndPositiveInf(self): + """Test that CheckNumericsV2 op distinguishes negative and positive infs.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([-1.0, 1.0]) + t2 = constant_op.constant([0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had -Inf and \+Inf values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + + def testCheckNumericsV2OpNegativeAndPositiveInfAndNaN(self): + """CheckNumericsV2 op distinguishes - & + infs when nan is present.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([-1.0, 1.0, 0.0]) + t2 = constant_op.constant([0.0, 0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had -Inf, \+Inf, and NaN values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + + def testCheckNumericsV2PositiveInfAndNaN(self): + """Test that CheckNumericsV2 op shows sign of inf when nan is present.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([0.0, 1.0]) + t2 = constant_op.constant([0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had \+Inf and NaN values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + if __name__ == "__main__": ops.enable_eager_execution() diff --git a/tensorflow/python/kernel_tests/numerics_test.py b/tensorflow/python/kernel_tests/numerics_test.py index 4d31cd45289..950658bc886 100644 --- a/tensorflow/python/kernel_tests/numerics_test.py +++ b/tensorflow/python/kernel_tests/numerics_test.py @@ -24,7 +24,6 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes -from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import 
test_util from tensorflow.python.ops import array_ops @@ -132,51 +131,6 @@ class NumericsTest(test.TestCase): r"or `tf.while_loop\(\)`\."): numerics.add_check_numerics_ops() - def testCheckNumericsV2OpNegativeAndPositiveInf(self): - """Test that CheckNumericsV2 op distinguishes negative and positive infs.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([-1.0, 1.0]) - t2 = constant_op.constant([0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had -Inf and +Inf values", caught.message) - self.assertIn("pass through test", caught.message) - - def testCheckNumericsV2OpNegativeAndPositiveInfAndNaN(self): - """CheckNumericsV2 op distinguishes - & + infs when nan is present.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([-1.0, 1.0, 0.0]) - t2 = constant_op.constant([0.0, 0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had -Inf, +Inf, and NaN values", caught.message) - self.assertIn("pass through test", caught.message) - - def testCheckNumericsV2PositiveInfAndNaN(self): - """Test that CheckNumericsV2 op shows sign of inf when nan is present.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([0.0, 1.0]) - t2 = constant_op.constant([0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had +Inf and NaN values", caught.message) - self.assertIn("pass through test", caught.message) - if __name__ == "__main__": # TODO(b/130689556): XLA CPU does not honor inf/nan which causes problems From fb416f16e2b01252326816bb311c3e6165d13bcf Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 18 May 2020 06:28:20 -0700 Subject: [PATCH 0747/1533] [tfdbg] Fix source_utils_test in Python 3.8+ This is related to https://bugs.python.org/issue12458 In python 3.8, traceback reports the first instead of last line in a multi-line continuation block. Certain parts of source_utils_test.py assume that traceback always returns the last line, which is true all the way up to 3.7. In order to fix this, we use the `ast` module to extract the lineno of the first line in a multi-line continuation block. 
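A minimal sketch of the behavior change this works around (illustrative only;
`call_site_lineno` is a hypothetical helper, not part of the patch, standing in
for what the test derives from `tf_inspect.stack()`):

    import traceback

    def call_site_lineno():
      # Line number attributed to the caller's frame while this call runs.
      return traceback.extract_stack()[-2].lineno

    lineno = call_site_lineno(
    )
    # Python <= 3.7 attributes the call to the closing-paren line, so `lineno`
    # points one line below the statement's first line; Python 3.8+ attributes
    # it to the `lineno = call_site_lineno(` line. Walking the parsed `ast`
    # recovers the first line of the preceding block on either version.
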
PiperOrigin-RevId: 312067389 Change-Id: I8a3ac129b3d75230a3eedd64c3605779dcab5336 --- tensorflow/python/debug/BUILD | 1 - .../python/debug/lib/source_utils_test.py | 38 ++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 956e90999c7..1ef0504ecb8 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -840,7 +840,6 @@ py_test( python_version = "PY3", srcs_version = "PY2AND3", tags = [ - "no_oss_py38", #TODO(b/151449908) "no_windows", ], deps = [ diff --git a/tensorflow/python/debug/lib/source_utils_test.py b/tensorflow/python/debug/lib/source_utils_test.py index faf2365fc9c..89964a21ba7 100644 --- a/tensorflow/python/debug/lib/source_utils_test.py +++ b/tensorflow/python/debug/lib/source_utils_test.py @@ -18,7 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import ast import os +import sys import tempfile import zipfile @@ -43,7 +45,41 @@ from tensorflow.python.util import tf_inspect def line_number_above(): - return tf_inspect.stack()[1][2] - 1 + """Get lineno of the AST node immediately above this function's call site. + + It is assumed that there is no empty line(s) between the call site and the + preceding AST node. + + Returns: + The lineno of the preceding AST node, at the same level of the AST. + If the preceding AST spans multiple lines: + - In Python 3.8+, the lineno of the first line is returned. + - In older Python versions, the lineno of the last line is returned. + """ + # https://bugs.python.org/issue12458: In Python 3.8, traceback started + # to return the lineno of the first line of a multi-line continuation block, + # instead of that of the last line. Therefore, in Python 3.8+, we use `ast` to + # get the lineno of the first line. + call_site_lineno = tf_inspect.stack()[1][2] + if sys.version_info < (3, 8): + return call_site_lineno - 1 + else: + with open(__file__, "rb") as f: + source_text = f.read().decode("utf-8") + source_tree = ast.parse(source_text) + prev_node = _find_preceding_ast_node(source_tree, call_site_lineno) + return prev_node.lineno + + +def _find_preceding_ast_node(node, lineno): + """Find the ast node immediately before and not including lineno.""" + for i, child_node in enumerate(node.body): + if child_node.lineno == lineno: + return node.body[i - 1] + if hasattr(child_node, "body"): + found_node = _find_preceding_ast_node(child_node, lineno) + if found_node: + return found_node class GuessIsTensorFlowLibraryTest(test_util.TensorFlowTestCase): From ff2019a216aed7bbb1e30432b47abcfe5567f0b4 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 07:06:15 -0700 Subject: [PATCH 0748/1533] Optimize multiply by quantize multiplier. PiperOrigin-RevId: 312072311 Change-Id: I7d01be9aa8f1a238c6887d4770a1090899337383 --- .../internal/optimized/optimized_ops.h | 82 ++++++------------- 1 file changed, 27 insertions(+), 55 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index b18f0f4bb5a..64598d70ee3 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -201,63 +201,35 @@ MatrixMap MapAsMatrixWithGivenNumberOfRows(Scalar* data, // MultiplyByQuantizedMultipler. 
#ifdef USE_NEON inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows( - int32x4x4_t input_val, int32 quantized_multiplier, int shift) { - using gemmlowp::RoundingDivideByPOT; - using gemmlowp::SaturatingRoundingDoublingHighMul; - const int left_shift = shift > 0 ? shift : 0; - const int right_shift = shift > 0 ? 0 : -shift; + int32x4x4_t input_val, int32 quantized_multiplier, int32 shift) { + const int left_shift = std::max(shift, 0); + const int right_shift = std::min(shift, 0); int32x4x4_t result; - // The vector type support for SaturatingRoundingDoublingHighMulth in gemmlowp - // is limited to NEON. -#ifdef GEMMLOWP_NEON - const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift); - result.val[0] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[0], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[1] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[1], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[2] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[2], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[3] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[3], left_shifted_one_dup), - quantized_multiplier), - right_shift); -#else - for (int i = 0; i < 4; ++i) { - int32_t vals[4]; - vals[0] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 0) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[1] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 1) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[2] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 2) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[3] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 3) * (1 << left_shift), - quantized_multiplier), - right_shift); - result.val[i] = vld1q_s32(reinterpret_cast(&vals)); - } -#endif + int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + result.val[0] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[1] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[2] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[3] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), + multiplier_dup), + right_shift_dup); + return result; } #endif From b5ed51fb220fa85b96268b392fe7f60804c004c3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 07:37:15 -0700 Subject: [PATCH 0749/1533] Resolve trivial aliases for portable TensorFlow targets. 
PiperOrigin-RevId: 312076343 Change-Id: I49adacfaea505bed1edb4ca51776057474d2a4ca --- tensorflow/tensorflow.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 9e89094f4e7..d72bdf58186 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -874,7 +874,7 @@ def tf_gen_op_wrappers_cc( clean_dep("//tensorflow/core:ops"), clean_dep("//tensorflow/core:protos_all_cc"), ]) + if_android([ - clean_dep("//tensorflow/core:android_tensorflow_lib"), + clean_dep("//tensorflow/core:portable_tensorflow_lib"), ]), copts = tf_copts(), alwayslink = 1, @@ -891,7 +891,7 @@ def tf_gen_op_wrappers_cc( clean_dep("//tensorflow/core:ops"), clean_dep("//tensorflow/core:protos_all_cc"), ]) + if_android([ - clean_dep("//tensorflow/core:android_tensorflow_lib"), + clean_dep("//tensorflow/core:portable_tensorflow_lib"), ]), copts = tf_copts(), alwayslink = 1, From ea113ef6cdbd34203f8f951af8621dbc1e4572e6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 07:41:37 -0700 Subject: [PATCH 0750/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/a2a4e5aae894 PiperOrigin-RevId: 312076934 Change-Id: I12015eb4ec1278668834ca8a687d290a00eba112 --- tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index c2b11819448..6375bf7341f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -292,7 +292,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type, llvm::AllocaInst* alloca = b->CreateAlloca(type, element_count, AsStringRef(name)); if (alignment != 0) { - alloca->setAlignment(llvm::MaybeAlign(alignment)); + alloca->setAlignment(llvm::Align(alignment)); } return alloca; } From f40a063d84df3f4e0ed2a2fc78d8b79f203a03b4 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 07:46:08 -0700 Subject: [PATCH 0751/1533] [TF:TRT] Enhance InstantiateBuildAndRun to support the case where the input type and output type are not the same. This is to prepare for a change to enhance the TF-TRT bridge to support the Cast operations that can be represented via IIdentityLayer. PiperOrigin-RevId: 312077452 Change-Id: Iab6bfb54d6a346eef158785f61a1311559cee855 --- .../tf2tensorrt/convert/convert_nodes_test.cc | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 884ed7a5771..82c02c17e93 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1712,7 +1712,7 @@ INSTANTIATE_TEST_CASE_P( // Builds and runs the converted network. Checks output tensor shape. Tests // output values using a matcher. -template +template void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, const TestParamBase& p, const std::vector& input_vec, @@ -1731,12 +1731,14 @@ void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, // runtime errors. 
return; } - typedef typename EnumToDataType::Type T; + typedef typename EnumToDataType::Type Tin; TensorShape shape; TF_EXPECT_OK(TensorShapeUtils::MakeShape(p.input_dims, &shape)); const DataVec input_data{ - {"input", test->AsTensor(CastTestVector(input_vec), shape)}}; - DataVec output_data{{name, test->ConstructTensor(6)}}; + {"input", + test->AsTensor(CastTestVector(input_vec), shape)}}; + typedef typename EnumToDataType::Type Tout; + DataVec output_data{{name, test->ConstructTensor(6)}}; test->BuildAndRun(input_data, &output_data); // Check the shape of the actual output tensor TF_EXPECT_OK(TensorShapeUtils::MakeShape(p.expected_output_dims, &shape)); @@ -1744,7 +1746,7 @@ void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, << "Expected shape: " << shape.DebugString() << ", actual shape" << output_data[0].tensor.shape().DebugString(); // Cast the output to float and compare to expected output - auto out_span = GetSpanForData(output_data[0]); + auto out_span = GetSpanForData(output_data[0]); std::vector casted_output(out_span.begin(), out_span.end()); EXPECT_THAT(casted_output, matcher); } @@ -1754,16 +1756,35 @@ void InstantiateBuildAndRun(DataType tf_dtype, const string& name, const std::vector& input_vec, const Matcher>& matcher) { if (tf_dtype == DT_FLOAT) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else if (tf_dtype == DT_HALF) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else if (tf_dtype == DT_INT32) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else { FAIL() << "Test not supported for " << tf_dtype; } } +void InstantiateBuildAndRun(DataType input_tf_dtype, DataType output_tf_dtype, + const string& name, OpConverterTest* test, + const TestParamBase& p, + const std::vector& input_vec, + const Matcher>& matcher) { + if (input_tf_dtype == output_tf_dtype) { + InstantiateBuildAndRun(input_tf_dtype, name, test, p, input_vec, matcher); + } else if (input_tf_dtype == DT_HALF && output_tf_dtype) { + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); + } else { + FAIL() << "Test not supported for input " << input_tf_dtype << " output " + << output_tf_dtype; + } +} + template void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { out->Clear(); From c4e877d94a0f3ea9506c6c641ecea816d6af6113 Mon Sep 17 00:00:00 2001 From: Marcin Sielski Date: Mon, 18 May 2020 16:50:03 +0200 Subject: [PATCH 0752/1533] Address issues identified during review Why: * Improve build instruction on RPI. This change addresses the need by: * --depth 1 removal for git clone, * change name of the directory from tensor_src to tensorflow_src, * improve PATH setup in case other cross-tools are installed, * change the compilator version used to build the tensorflow package. --- tensorflow/lite/g3doc/guide/build_rpi.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/build_rpi.md b/tensorflow/lite/g3doc/guide/build_rpi.md index a1724258118..4a39f4e7677 100644 --- a/tensorflow/lite/g3doc/guide/build_rpi.md +++ b/tensorflow/lite/g3doc/guide/build_rpi.md @@ -23,7 +23,7 @@ To cross compile TensorFlow Lite follow the steps: 1. 
Clone official Raspberry Pi cross-compilation toolchain: ```bash - git clone --depth 1 https://github.com/raspberrypi/tools.git rpi_tools + git clone https://github.com/raspberrypi/tools.git rpi_tools ``` 2. Clone TensorFlow repository: @@ -39,7 +39,7 @@ To cross compile TensorFlow Lite follow the steps: build dependencies: ```bash - cd tensor_src && ./tensorflow/lite/tools/make/download_dependencies.sh + cd tensorflow_src && ./tensorflow/lite/tools/make/download_dependencies.sh ``` **Note:** You only need to do this once. @@ -47,7 +47,7 @@ build dependencies: 4. To build ARMv7 binary for Raspberry Pi 2, 3 and 4 execute: ```bash - PATH=$PATH:../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/ ./tensorflow/lite/tools/make/build_rpi_lib.sh + PATH=../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin:$PATH ./tensorflow/lite/tools/make/build_rpi_lib.sh ``` **Note:** This should compile a static library in: @@ -56,7 +56,7 @@ build dependencies: 5. To build ARMv6 binary for Raspberry Pi Zero execute: ```bash - PATH=$PATH:../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin/ ./tensorflow/lite/tools/make/build_rpi_lib.sh TARGET_ARCH=armv6 + PATH=../rpi_tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf/bin:$PATH ./tensorflow/lite/tools/make/build_rpi_lib.sh TARGET_ARCH=armv6 ``` **Note:** This should compile a static library in: @@ -64,7 +64,7 @@ build dependencies: ## Compile natively on Raspberry Pi -Instruction has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc version 6.3.0 20170516 (Raspbian 6.3.0-18+rpi1): +Instruction has been tested on Raspberry Pi Zero, Raspbian GNU/Linux 10 (buster), gcc version 8.3.0 (Raspbian 8.3.0-6+rpi1): To natively compile TensorFlow Lite follow the steps: @@ -78,7 +78,7 @@ To natively compile TensorFlow Lite follow the steps: build dependencies: ```bash - cd tensor_src && ./tensorflow/lite/tools/make/download_dependencies.sh + cd tensorflow_src && ./tensorflow/lite/tools/make/download_dependencies.sh ``` **Note:** You only need to do this once. From 50fcac47a2652459a7f9b71255cfa1cf0077447b Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 07:49:05 -0700 Subject: [PATCH 0753/1533] Optimize quantized mul. 
PiperOrigin-RevId: 312077803 Change-Id: Ib6bbf261834a828590748e2c39ad146bad7d80ae --- .../internal/optimized/integer_ops/mul.h | 139 ++++++++++++------ 1 file changed, 97 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h index 18aeef4c8b5..0d385ec1656 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h @@ -38,49 +38,81 @@ inline void MulElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.output_offset, -256); TFLITE_DCHECK_LT(params.output_offset, 256); #ifdef USE_NEON - const auto input1_offset_vector = vdupq_n_s16(params.input1_offset); - const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); - const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset); + const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset); const auto output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); + vdupq_n_s8(params.quantized_activation_min); const auto output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); + vdupq_n_s8(params.quantized_activation_max); const int left_shift = std::max(0, params.output_shift); const int right_shift = std::max(0, -params.output_shift); const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); - for (; i <= size - 8; i += 8) { - // We load / store 8 at a time, multiplying as two sets of 4 int32s. - const auto input1_val_original = vld1_s8(input1_data + i); - const auto input2_val_original = vld1_s8(input2_data + i); - const auto input1_val_s16 = vmovl_s8(input1_val_original); - const auto input2_val_s16 = vmovl_s8(input2_val_original); - const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector); - const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); + for (; i <= size - 16; i += 16) { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. 
+ const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); - const auto input1_val_low = vget_low_s16(input1_val); - const auto input1_val_high = vget_high_s16(input1_val); - const auto input2_val_low = vget_low_s16(input2_val); - const auto input2_val_high = vget_high_s16(input2_val); + const int16x8_t input1_val_s16_high = + vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = + vmovl_s8(vget_low_s8(input1_val_original)); - auto p1 = vmull_s16(input2_val_low, input1_val_low); - auto p2 = vmull_s16(input2_val_high, input1_val_high); + const int16x8_t input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = + vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = + vaddq_s16(input1_val_s16_high, input1_offset_vector); + const int16x8_t input2_val_high = + vaddq_s16(input2_val_s16_high, input2_offset_vector); + const int16x8_t input1_val_low = + vaddq_s16(input1_val_s16_low, input1_offset_vector); + const int16x8_t input2_val_low = + vaddq_s16(input2_val_s16_low, input2_offset_vector); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + + auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high); + auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low); + auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high); + auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low); p1 = vshlq_s32(p1, left_shift_vec); p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; p1 = RoundingDivideByPOT(p1, right_shift); p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); - const auto p = - vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(p))); - vst1_s8(output_data + i, clamped); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON @@ -117,40 +149,63 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, const 
auto input2_offset_vector = vdupq_n_s16(params.input2_offset); const auto output_offset_vector = vdupq_n_s16(params.output_offset); const auto output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); + vdupq_n_s8(params.quantized_activation_min); const auto output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); + vdupq_n_s8(params.quantized_activation_max); const int left_shift = std::max(0, params.output_shift); const int right_shift = std::max(0, -params.output_shift); const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); - for (; i <= size - 8; i += 8) { - // We load / store 8 at a time, multiplying as two sets of 4 int32s. - const auto input2_val_original = vld1_s8(input2_data + i); - const auto input2_val_s16 = vmovl_s8(input2_val_original); - const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); + for (; i <= size - 16; i += 16) { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. + const auto input2_val_original = vld1q_s8(input2_data + i); + const auto input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); - const auto input2_val_low = vget_low_s16(input2_val); - const auto input2_val_high = vget_high_s16(input2_val); + const auto input2_val_high = + vaddq_s16(input2_val_s16_high, input2_offset_vector); + const auto input2_val_low = + vaddq_s16(input2_val_s16_low, input2_offset_vector); - auto p1 = vmull_n_s16(input2_val_low, input1_val); - auto p2 = vmull_n_s16(input2_val_high, input1_val); + const auto input2_val_low_low = vget_low_s16(input2_val_low); + const auto input2_val_low_high = vget_high_s16(input2_val_low); + const auto input2_val_high_low = vget_low_s16(input2_val_high); + const auto input2_val_high_high = vget_high_s16(input2_val_high); + + auto p1 = vmull_n_s16(input2_val_high_high, input1_val); + auto p2 = vmull_n_s16(input2_val_high_low, input1_val); + auto p3 = vmull_n_s16(input2_val_low_high, input1_val); + auto p4 = vmull_n_s16(input2_val_low_low, input1_val); p1 = vshlq_s32(p1, left_shift_vec); p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; p1 = RoundingDivideByPOT(p1, right_shift); p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); - const auto p = - vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(p))); - vst1_s8(output_data + i, clamped); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON From 
454195592520a68033aaf123c083e1ff7d9bb719 Mon Sep 17 00:00:00 2001 From: Marcin Sielski Date: Mon, 18 May 2020 16:58:01 +0200 Subject: [PATCH 0754/1533] Add clone step to native build instrunction. Why: * Improve the documentation. This change addresses the need by: * Add clone repository step, * Change rpi_armv7 to rpi_armv6 . --- tensorflow/lite/g3doc/guide/build_rpi.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/g3doc/guide/build_rpi.md b/tensorflow/lite/g3doc/guide/build_rpi.md index 4a39f4e7677..c75b39cd7e5 100644 --- a/tensorflow/lite/g3doc/guide/build_rpi.md +++ b/tensorflow/lite/g3doc/guide/build_rpi.md @@ -29,7 +29,7 @@ To cross compile TensorFlow Lite follow the steps: 2. Clone TensorFlow repository: ```bash - git clone --depth 1 https://github.com/tensorflow/tensorflow.git tensorflow_src + git clone https://github.com/tensorflow/tensorflow.git tensorflow_src ``` @@ -74,7 +74,14 @@ To natively compile TensorFlow Lite follow the steps: sudo apt-get install build-essential ``` -2. Run following script at the root of the TensorFlow repository to download all the +2. Clone TensorFlow repository: + + ```bash + git clone https://github.com/tensorflow/tensorflow.git tensorflow_src + + ``` + +3. Run following script at the root of the TensorFlow repository to download all the build dependencies: ```bash @@ -83,11 +90,11 @@ build dependencies: **Note:** You only need to do this once. -3. You should then be able to compile TensorFlow Lite with: +4. You should then be able to compile TensorFlow Lite with: ```bash ./tensorflow/lite/tools/make/build_rpi_lib.sh ``` **Note:** This should compile a static library in: - `tensorflow/lite/tools/make/gen/lib/rpi_armv7/libtensorflow-lite.a`. + `tensorflow/lite/tools/make/gen/lib/rpi_armv6/libtensorflow-lite.a`. From 55aee9e55084b309d5a01dae6685d4622482d6df Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 08:55:02 -0700 Subject: [PATCH 0755/1533] [TF:TRT] Add utilities for converting between TF types and TRT types. 
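For illustration, a converter call site can then propagate unsupported types as a Status instead of CHECK-failing. A minimal sketch, assuming it runs inside a Status-returning function and uses the helper signatures added to convert/utils.h below:

    // Sketch only; the surrounding converter code is an assumption, while the
    // helper signatures are the ones introduced by this patch.
    nvinfer1::DataType trt_type;
    TF_RETURN_IF_ERROR(TfTypeToTrtType(DT_HALF, &trt_type));
    // trt_type is now nvinfer1::DataType::kHALF.

    DataType tf_type;
    TF_RETURN_IF_ERROR(TrtTypeToTfType(nvinfer1::DataType::kINT32, &tf_type));
    // tf_type is now DT_INT32.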
PiperOrigin-RevId: 312087947 Change-Id: Ie4c47ab5c6aae97af5a83bba06e3de0637752ecf --- .../tf2tensorrt/convert/convert_nodes_test.cc | 32 ++++++----------- .../compiler/tf2tensorrt/convert/utils.cc | 35 +++++++++++++++++++ .../compiler/tf2tensorrt/convert/utils.h | 3 ++ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 82c02c17e93..964370af6be 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -137,30 +137,18 @@ std::ostream& operator<<(std::ostream& os, const std::vector& v) { return os; } -nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { - switch (tf_dtype) { - case DT_FLOAT: - return nvinfer1::DataType::kFLOAT; - case DT_HALF: - return nvinfer1::DataType::kHALF; - case DT_INT32: - return nvinfer1::DataType::kINT32; - default: - QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); - } +nvinfer1::DataType TfDataTypeToTrt(DataType tf_type) { + nvinfer1::DataType trt_type; + Status status = TfTypeToTrtType(tf_type, &trt_type); + EXPECT_EQ(status, Status::OK()); + return trt_type; } -DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return DT_FLOAT; - case nvinfer1::DataType::kHALF: - return DT_HALF; - case nvinfer1::DataType::kINT32: - return DT_INT32; - default: - QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); - } +DataType TrtDataTypeToTf(nvinfer1::DataType trt_type) { + DataType tf_type; + Status status = TrtTypeToTfType(trt_type, &tf_type); + EXPECT_EQ(status, Status::OK()); + return tf_type; } NodeDef MakeNodeDef(const string& name, const string& op, diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index fb3ae6943d3..a4b64ec0dc5 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace tensorrt { @@ -185,6 +186,40 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, return Status::OK(); } +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type) { + switch (tf_type) { + case DT_FLOAT: + *trt_type = nvinfer1::DataType::kFLOAT; + break; + case DT_HALF: + *trt_type = nvinfer1::DataType::kHALF; + break; + case DT_INT32: + *trt_type = nvinfer1::DataType::kINT32; + break; + default: + return errors::Internal("Unsupported tensorflow type"); + } + return Status::OK(); +} + +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type) { + switch (trt_type) { + case nvinfer1::DataType::kFLOAT: + *tf_type = DT_FLOAT; + break; + case nvinfer1::DataType::kHALF: + *tf_type = DT_HALF; + break; + case nvinfer1::DataType::kINT32: + *tf_type = DT_INT32; + break; + default: + return errors::Internal("Invalid TRT type"); + } + return Status::OK(); +} + int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { int n_bindings = engine->getNbBindings(); int n_input = 0; diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 5d4cf1bb851..59eeb420134 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -106,6 +106,9 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, bool use_implicit_batch, int batch_size, TensorShape& shape); +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); + // Returns a string that includes compile time TensorRT library version // information {Maj, Min, Patch}. string GetLinkedTensorRTVersion(); From 46f7108d78c6a3c0854fe66ce1cd92e5ebb3d6e2 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 18 May 2020 09:08:29 -0700 Subject: [PATCH 0756/1533] Internal change PiperOrigin-RevId: 312090528 Change-Id: I474709513b01db8c24c50fd670029451c51cb622 --- tensorflow/python/keras/layers/embeddings.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py index 3f57fd6cb63..e30e93f02dc 100644 --- a/tensorflow/python/keras/layers/embeddings.py +++ b/tensorflow/python/keras/layers/embeddings.py @@ -129,8 +129,10 @@ class Embedding(Layer): # since it knows all kernels using the variable only exist on CPU. # When eager execution is enabled, the placement decision has to be made # right now. Checking for the presence of GPUs to avoid complicating the - # TPU codepaths which can handle sparse optimizers. - if context.executing_eagerly() and context.context().num_gpus(): + # TPU codepaths which can handle sparse optimizers. But if we are within + # a tf.function, we go back the graph mode logic and rely on the placer. + if (context.executing_eagerly() and context.context().num_gpus() and + not ops.inside_function()): with ops.device('cpu:0'): self.embeddings = self.add_weight( shape=(self.input_dim, self.output_dim), From 32165792a3ae4705f50d82329db0733aa01bb6ed Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 09:23:09 -0700 Subject: [PATCH 0757/1533] [TF:TRT] Implement cast from fp16 to fp32 with IIdentityLayer. This is the first CL to implement the request in b/150285802. 
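The mechanism behind the cast is TensorRT's IIdentityLayer with an explicitly set output precision; in isolation it is roughly the following (sketch only, assuming an already-built INetworkDefinition* network and an fp16 ITensor* input):

    // An identity layer whose precision differs from its input performs the
    // type conversion; this mirrors the ConvertCast logic added below.
    nvinfer1::IIdentityLayer* identity = network->addIdentity(*input);
    identity->setPrecision(nvinfer1::DataType::kFLOAT);
    nvinfer1::ITensor* fp32_output = identity->getOutput(0);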
Add Cast op test to convert_nodes_test. PiperOrigin-RevId: 312093049 Change-Id: I77215cf6da104f51acc93de1b03e9a179db54f0a --- .../tf2tensorrt/convert/convert_nodes.cc | 106 +++++++++++++++--- .../tf2tensorrt/convert/convert_nodes.h | 2 + .../tf2tensorrt/convert/convert_nodes_test.cc | 21 +++- 3 files changed, 109 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index a43b16e9e6a..e791ff9ff60 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" @@ -795,6 +796,19 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { } } +Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const { + if (is_tensor()) { + nvinfer1::DataType trt_type = tensor()->getType(); + return TrtTypeToTfType(trt_type, tf_type); + } + + if (is_weights()) { + *tf_type = weights().GetTensor().dtype(); + return Status::OK(); + } + return errors::Internal("The object is probably not initialized"); +} + string TRT_TensorOrWeights::DebugString() const { string output = "TRT_TensorOrWeights(type="; if (is_tensor()) { @@ -1900,27 +1914,48 @@ Status CheckInputsWeights( return Status::OK(); } -Status AllowDataTypes(const OpConverterParams& params, - const std::set& allowed_dtypes, - const char* dtype_attr_name = "T") { - const auto& node_def = params.node_def; +Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type, + const char* type_attr_name) { TFAttrs attrs(node_def); - if (!attrs.count(dtype_attr_name)) { - return errors::InvalidArgument("Attribute with name ", dtype_attr_name, + if (!attrs.count(type_attr_name)) { + return errors::InvalidArgument("Attribute with name ", type_attr_name, " not found."); } - const auto op_dtype = attrs.get(dtype_attr_name); - if (!allowed_dtypes.count(op_dtype)) { - // Build string list of allowed types. 
- std::ostringstream ss; - for (auto it = allowed_dtypes.begin(); it != allowed_dtypes.end(); ++it) { - if (it != allowed_dtypes.begin()) ss << ", "; - ss << DataTypeString(*it); - } - return errors::Unimplemented("Data type ", DataTypeString(op_dtype), + *tf_type = attrs.get(type_attr_name); + return Status::OK(); +} + +Status GetInputTfType(const OpConverterParams& params, DataType* tf_type, + int pos) { + const std::vector& inputs = params.inputs; + if (inputs.size() <= pos) { + return errors::Internal("Invalid input position"); + } + + return inputs[pos].GetTfType(tf_type); +} + +constexpr const char kOutputTypeAttrName[] = "T"; + +Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) { + return GetNodeDefTfType(params.node_def, tf_type, kOutputTypeAttrName); +} + +Status AllowDataTypes(const OpConverterParams& params, + const std::set& allowed_types, + const char* type_attr_name = kOutputTypeAttrName) { + const auto& node_def = params.node_def; + DataType tf_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name)); + if (!allowed_types.count(tf_type)) { + string allowed_types_string = absl::StrJoin( + allowed_types, ", ", [](string* out, const DataType& type) { + absl::StrAppendFormat(out, "%s", DataTypeString(type)); + }); + return errors::Unimplemented("Data type ", DataTypeString(tf_type), " is not supported for ", node_def.op(), - ", must be one of [", ss.str(), "], at ", - node_def.name()); + ", must be one of [", allowed_types_string, + "], at ", node_def.name()); } return Status::OK(); } @@ -4598,6 +4633,42 @@ Status ConvertUnpack(OpConverterParams* params) { return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true); } +// Supports cast fp16=>fp32 through IIdentityLayer. +Status ConvertCast(OpConverterParams* params) { + const NodeDef& node_def = params->node_def; + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); + auto unsupport_cast_error = [&]() { + return errors::Unimplemented("Cast op: ", node_def.op(), + " not supported at: ", node_def.name()); + }; + + DataType input_type; + TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0)); + if (input_type != DataType::DT_HALF) { + return unsupport_cast_error(); + } + + DataType output_type; + TF_RETURN_IF_ERROR(GetOutputTfType(*params, &output_type)); + if (output_type != DataType::DT_FLOAT) { + return unsupport_cast_error(); + } + + if (params->validation_only) return Status::OK(); + + nvinfer1::ITensor* input = params->inputs.at(0).tensor(); + nvinfer1::IIdentityLayer* layer = + params->converter->network()->addIdentity(*input); + layer->setPrecision(nvinfer1::DataType::kFLOAT); + + if (layer->getOutput(0)->getType() != nvinfer1::DataType::kFLOAT) { + return errors::Internal("IIdentityLayer doesn't work as expected"); + } + + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); +} + Status ConvertConcat(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -5675,6 +5746,7 @@ static void RegisterValidatableOpConverters( (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS; #endif (*registration)["AddN"] = ConvertAddN; + (*registration)["Cast"] = ConvertCast; (*registration)["ConcatV2"] = ConvertConcat; (*registration)["Const"] = ConvertConst; (*registration)["Conv2D"] = ConvertConv2D; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 2092aecd657..2fe8eec9675 100644 
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -294,6 +294,8 @@ class TRT_TensorOrWeights { nvinfer1::Dims GetTrtDims() const; + Status GetTfType(DataType* tf_type) const; + int batch_size() const { return batch_size_; } string DebugString() const; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 964370af6be..1efc31f9e24 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -5147,6 +5147,14 @@ NodeDef CreateUnaryOp() { return T(s.WithOpName("my_unary"), input).operation.node()->def(); } +NodeDef CreateCastOp() { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); + return ops::Cast(s.WithOpName("my_unary"), input, DT_FLOAT) + .operation.node() + ->def(); +} + TEST_P(ParameterizedOpConverterTest, ConvertUnary) { const auto& spec = GetParam(); const TrtTestMode trt_mode = std::get<0>(spec); @@ -5174,6 +5182,7 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { ADD_OP("Asinh", ops::Asinh, std::asinh); ADD_OP("Atan", ops::Atan, std::atan); ADD_OP("Atanh", ops::Atanh, std::atanh); + op_map["Cast"] = std::make_pair(CreateCastOp, [](float x) { return x; }); ADD_OP("Ceil", ops::Ceil, std::ceil); ADD_OP("Cos", ops::Cos, std::cos); ADD_OP("Cosh", ops::Cosh, std::cosh); @@ -5212,7 +5221,13 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { } NodeDef node_def = op_map[op_name].first(); - AddTestTensor("input", p.input_dims, TfDataTypeToTrt(tf_dtype), trt_mode); + // TODO(bixia): we assume this test is only instantiated for DT_FLOAT for + // now. Need to find a better way to express input and output types. + DataType input_tf_dtype = op_name == "Cast" ? DT_HALF : tf_dtype; + DataType output_tf_dtype = tf_dtype; + + AddTestTensor("input", p.input_dims, TfDataTypeToTrt(input_tf_dtype), + trt_mode); RunValidationAndConversion(node_def, Status::OK(), "my_unary", p.expected_output_dims); @@ -5220,8 +5235,8 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { std::vector output; std::transform(input_values.begin(), input_values.end(), std::back_inserter(output), op_map[op_name].second); - InstantiateBuildAndRun(tf_dtype, "my_unary", this, p, input_values, - ArrayFloatNear(output, 0.0001, true)); + InstantiateBuildAndRun(input_tf_dtype, output_tf_dtype, "my_unary", this, p, + input_values, ArrayFloatNear(output, 0.0001, true)); } } From 9c49cda7d988680985aa194703edd72df60a57bc Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 18 May 2020 09:27:00 -0700 Subject: [PATCH 0758/1533] Update release notes for the 1.15.3, 2.0.2 and 2.1.1 patch releases. 
PiperOrigin-RevId: 312093793 Change-Id: I476369d7d3f8e8d54dd10f412f25049265fc688f --- RELEASE.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 6c8921cf492..f251f6ceffa 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,28 @@ +# Release 2.1.1 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) +* Fixes a versioning bug which causes Keras layers from TF 1.x to be used instead of those from TF 2.x + +# Release 2.0.2 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + +# Release 1.15.3 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + # Release 2.2.0 TensorFlow 
2.2 discontinues support for Python 2, [previously announced](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ) as following [Python 2's EOL on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). From cfdb9434054da65025c25d5dbcda029c16faf868 Mon Sep 17 00:00:00 2001 From: Ilya Tokar Date: Mon, 18 May 2020 09:35:23 -0700 Subject: [PATCH 0759/1533] Tweak round_to_bfloat16 to make it vectorizable. This simplifies control flow by handling positive and negative denormals separately. Should be ~40% faster. PiperOrigin-RevId: 312095390 Change-Id: I5b6388e48b8c217edb0fc4fe14c3add64fb52c65 --- tensorflow/core/lib/bfloat16/bfloat16.h | 327 ++++++++++++------------ 1 file changed, 163 insertions(+), 164 deletions(-) diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h index 4c38738593f..54d78480066 100644 --- a/tensorflow/core/lib/bfloat16/bfloat16.h +++ b/tensorflow/core/lib/bfloat16/bfloat16.h @@ -194,171 +194,170 @@ struct bfloat16 { input = f.u; bfloat16 output; + // Fast rounding algorithm that rounds a half value to nearest even. This + // reduces expected error when we convert a large number of floats. Here + // is how it works: + // + // Definitions: + // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits + // with the following tags: + // + // Sign | Exp (8 bits) | Frac (23 bits) + // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT + // + // S: Sign bit. + // E: Exponent bits. + // F: First 6 bits of fraction. + // L: Least significant bit of resulting bfloat16 if we truncate away the + // rest of the float32. This is also the 7th bit of fraction + // R: Rounding bit, 8th bit of fraction. + // T: Sticky bits, rest of fraction, 15 bits. + // + // To round half to nearest even, there are 3 cases where we want to round + // down (simply truncate the result of the bits away, which consists of + // rounding bit and sticky bits) and two cases where we want to round up + // (truncate then add one to the result). + // + // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of + // 1s) as the rounding bias, adds the rounding bias to the input, then + // truncates the last 16 bits away. + // + // To understand how it works, we can analyze this algorithm case by case: + // + // 1. L = 0, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input may create any carry, depending on + // whether there is any value set to 1 in T bits. + // - R may be set to 1 if there is a carry. + // - L remains 0. + // - Note that this case also handles Inf and -Inf, where all fraction + // bits, including L, R and Ts are all 0. The output remains Inf after + // this algorithm. + // + // 2. L = 1, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits but + // adds 1 to rounding bit. + // - L remains 1. + // + // 3. L = 0, R = 1, all of T are 0: + // Expect: round down, this is exactly at half, the result is already + // even (L=0). + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input sets all sticky bits to 1, but + // doesn't create a carry. + // - R remains 1. + // - L remains 0. + // + // 4. L = 1, R = 1: + // Expect: round up, this is exactly at half, the result needs to be + // round to the next even number. 
+ // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits, but + // creates a carry from rounding bit. + // - The carry sets L to 0, creates another carry bit and propagate + // forward to F bits. + // - If all the F bits are 1, a carry then propagates to the exponent + // bits, which then creates the minimum value with the next exponent + // value. Note that we won't have the case where exponents are all 1, + // since that's either a NaN (handled in the other if condition) or inf + // (handled in case 1). + // + // 5. L = 0, R = 1, any of T is 1: + // Expect: round up, this is greater than half. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input creates a carry from sticky bits, + // sets rounding bit to 0, then create another carry. + // - The second carry sets L to 1. + // + // Examples: + // + // Exact half value that is already even: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 + // + // This falls into case 3. We truncate the rest of 16 bits and no + // carry is created into F and L: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // Exact half value, round to next even number: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 + // + // This falls into case 4. We create a carry from R and T, + // which then propagates into L and F: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // + // Max denormal value round to min normal value: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 + // + // Max normal value round to Inf: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 + // + // + // Least significant bit of resulting bfloat. + uint32_t lsb = (input >> 16) & 1; + uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + output.value = static_cast(input >> 16); + if ((f.u & 0xff800000u) == 0) { + // Flush positive denormal to 0 + output.value = 0x0; + } + if ((f.u & 0xff800000u) == 0x80000000u) { + // Flush negative denormal to -0 + output.value = 0x8000; + } if (float_isnan(v)) { - // If the value is a NaN, squash it to a qNaN with msb of fraction set, - // this makes sure after truncation we don't end up with an inf. 
- // - // qNaN magic: All exponent bits set + most significant bit of fraction - // set. - output.value = 0x7fc0; - } else if (std::fabs(v) < std::numeric_limits::min()) { - // Flush denormal to +/- 0.0 - output.value = std::signbit(v) ? 0x8000 : 0; - } else { - // Fast rounding algorithm that rounds a half value to nearest even. This - // reduces expected error when we convert a large number of floats. Here - // is how it works: - // - // Definitions: - // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits - // with the following tags: - // - // Sign | Exp (8 bits) | Frac (23 bits) - // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT - // - // S: Sign bit. - // E: Exponent bits. - // F: First 6 bits of fraction. - // L: Least significant bit of resulting bfloat16 if we truncate away the - // rest of the float32. This is also the 7th bit of fraction - // R: Rounding bit, 8th bit of fraction. - // T: Sticky bits, rest of fraction, 15 bits. - // - // To round half to nearest even, there are 3 cases where we want to round - // down (simply truncate the result of the bits away, which consists of - // rounding bit and sticky bits) and two cases where we want to round up - // (truncate then add one to the result). - // - // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of - // 1s) as the rounding bias, adds the rounding bias to the input, then - // truncates the last 16 bits away. - // - // To understand how it works, we can analyze this algorithm case by case: - // - // 1. L = 0, R = 0: - // Expect: round down, this is less than half value. - // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input may create any carry, depending on - // whether there is any value set to 1 in T bits. - // - R may be set to 1 if there is a carry. - // - L remains 0. - // - Note that this case also handles Inf and -Inf, where all fraction - // bits, including L, R and Ts are all 0. The output remains Inf after - // this algorithm. - // - // 2. L = 1, R = 0: - // Expect: round down, this is less than half value. - // - // Algorithm: - // - Rounding bias: 0x7fff + 1 = 0x8000 - // - Adding rounding bias to input doesn't change sticky bits but - // adds 1 to rounding bit. - // - L remains 1. - // - // 3. L = 0, R = 1, all of T are 0: - // Expect: round down, this is exactly at half, the result is already - // even (L=0). - // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input sets all sticky bits to 1, but - // doesn't create a carry. - // - R remains 1. - // - L remains 0. - // - // 4. L = 1, R = 1: - // Expect: round up, this is exactly at half, the result needs to be - // round to the next even number. - // - // Algorithm: - // - Rounding bias: 0x7fff + 1 = 0x8000 - // - Adding rounding bias to input doesn't change sticky bits, but - // creates a carry from rounding bit. - // - The carry sets L to 0, creates another carry bit and propagate - // forward to F bits. - // - If all the F bits are 1, a carry then propagates to the exponent - // bits, which then creates the minimum value with the next exponent - // value. Note that we won't have the case where exponents are all 1, - // since that's either a NaN (handled in the other if condition) or inf - // (handled in case 1). - // - // 5. L = 0, R = 1, any of T is 1: - // Expect: round up, this is greater than half. 
- // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input creates a carry from sticky bits, - // sets rounding bit to 0, then create another carry. - // - The second carry sets L to 1. - // - // Examples: - // - // Exact half value that is already even: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 - // - // This falls into case 3. We truncate the rest of 16 bits and no - // carry is created into F and L: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - // - // Exact half value, round to next even number: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 - // - // This falls into case 4. We create a carry from R and T, - // which then propagates into L and F: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - // - // - // Max denormal value round to min normal value: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 - // - // This falls into case 4. We create a carry from R and T, - // propagate into L and F, which then propagates into exponent - // bits: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 - // - // Max normal value round to Inf: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 - // - // This falls into case 4. We create a carry from R and T, - // propagate into L and F, which then propagates into exponent - // bits: - // - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 - // - // - // Least significant bit of resulting bfloat. - uint32_t lsb = (input >> 16) & 1; - uint32_t rounding_bias = 0x7fff + lsb; - input += rounding_bias; - output.value = static_cast(input >> 16); + output.value = NAN_VALUE; } return output; } From dbc0fffedb506c12837a5eda0d87b01b659136ba Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Mon, 18 May 2020 09:35:47 -0700 Subject: [PATCH 0760/1533] Report remote target name for worker service RPCs. 
PiperOrigin-RevId: 312095453 Change-Id: I73fc7948f994426b8d62bdefd5573cfe3b5b793d --- .../rpc/grpc_remote_worker.cc | 16 ++++++++++------ .../distributed_runtime/rpc/grpc_remote_worker.h | 3 ++- .../distributed_runtime/rpc/grpc_worker_cache.cc | 6 +++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc index 85431acdf0c..6e706179863 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc @@ -45,7 +45,7 @@ class GrpcRemoteWorker : public WorkerInterface { explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger) + WorkerCacheLogger* logger, const string& target) : channel_(std::move(channel)), stub_(channel_), cq_(completion_queue), @@ -66,7 +66,8 @@ class GrpcRemoteWorker : public WorkerInterface { instancesource_(Method(GrpcWorkerMethod::kCompleteInstance)), getstepsequence_(Method(GrpcWorkerMethod::kGetStepSequence)), markrecvfinished_(Method(GrpcWorkerMethod::kMarkRecvFinished)), - logger_(logger) {} + logger_(logger), + target_(target) {} ~GrpcRemoteWorker() override {} @@ -273,7 +274,7 @@ class GrpcRemoteWorker : public WorkerInterface { bool fail_fast = true) { new RPCState( &stub_, cq_, method, *request, response, std::move(done), call_opts, - callback_threadpool_, /*max_retries=*/0, fail_fast); + callback_threadpool_, /*max_retries=*/0, fail_fast, &target_); } void IssueRequest(const protobuf::Message* request, TensorResponse* response, @@ -281,7 +282,8 @@ class GrpcRemoteWorker : public WorkerInterface { CallOptions* call_opts = nullptr) { new RPCState(&stub_, cq_, method, *request, response, std::move(done), call_opts, - callback_threadpool_); + callback_threadpool_, /*max_retries=*/0, + /*fail_fast=*/true, &target_); } void IssueMarkRecvFinishedRequest(int64 request_id) { @@ -321,6 +323,7 @@ class GrpcRemoteWorker : public WorkerInterface { // Support for logging. 
WorkerCacheLogger* logger_; + const string target_; TF_DISALLOW_COPY_AND_ASSIGN(GrpcRemoteWorker); }; @@ -328,9 +331,10 @@ class GrpcRemoteWorker : public WorkerInterface { WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger) { + WorkerCacheLogger* logger, + const string& target) { return new GrpcRemoteWorker(std::move(channel), completion_queue, - callback_threadpool, logger); + callback_threadpool, logger, target); } } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h index c0a49ecfc38..97e590e0ad1 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h @@ -29,7 +29,8 @@ class WorkerInterface; WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger); + WorkerCacheLogger* logger, + const string& target); } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc index f6b6e15a2ba..1d75728ddd2 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc @@ -69,9 +69,9 @@ class GrpcWorkerCache : public WorkerCachePartial { return nullptr; } size_t index = AssignWorkerToThread(target); - return NewGrpcRemoteWorker(channel, - worker_env_->GetCompletionQueue(index), - worker_env_->GetThreadPool(), &logger_); + return NewGrpcRemoteWorker( + channel, worker_env_->GetCompletionQueue(index), + worker_env_->GetThreadPool(), &logger_, target); } } From 1b2a65c15fed4a27bc94ebbce930feea455d927f Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 18 May 2020 09:46:53 -0700 Subject: [PATCH 0761/1533] Add legalization from hlo.dot to lhlo.dot PiperOrigin-RevId: 312097353 Change-Id: Ia8b0fef86c77426f54090354779c62163bf97426 --- .../mlir/xla/tests/hlo-legalize-to-lhlo.mlir | 12 ++++++++++++ .../mlir/xla/transforms/hlo_legalize_to_lhlo.cc | 1 + .../mlir/xla/transforms/map_hlo_to_lhlo_op.h | 1 + 3 files changed, 14 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 53296b257ae..68f6d172afc 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -395,3 +395,15 @@ func @tanh_dyn(%arg0: tensor) { // CHECK: "xla_lhlo.tanh"(%arg0, %[[RESULT]]) : (memref, memref) -> () return } + +// ----- + +// CHECK-LABEL: func @dot +func @dot(%arg0: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> { +// CHECK-SAME: (%[[ARG0:.*]]: [[TYPE:.*]], +// CHECK-SAME: %[[RESULT:.*]]: [[TYPE]]) +// CHECK: "xla_lhlo.dot"(%[[ARG0]], %[[ARG0]], %{{.*}}) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () + %dot = "xla_hlo.dot"(%arg0, %arg0) + : (tensor<1024x1024xf32>, tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + return %dot : tensor<1024x1024xf32> + } diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 10f35768bbd..11b2ae65d8e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ 
b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -362,6 +362,7 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h index fed21e9bafc..21b954a3eb4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h @@ -49,6 +49,7 @@ MAP_HLO_TO_LHLO(ConvertOp); MAP_HLO_TO_LHLO(CopyOp); MAP_HLO_TO_LHLO(CosOp); MAP_HLO_TO_LHLO(DivOp); +MAP_HLO_TO_LHLO(DotOp); MAP_HLO_TO_LHLO(ExpOp); MAP_HLO_TO_LHLO(ImagOp); MAP_HLO_TO_LHLO(IotaOp); From 0bf90cb2a8b241a728943d343f1cdd922e408c73 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 10:12:52 -0700 Subject: [PATCH 0762/1533] Enable (non-gradient) tests of tf.linalg.cholesky in eager mode. PiperOrigin-RevId: 312102967 Change-Id: Icefc46a8268413dfaec42109d4f57dd07f602a54 --- .../python/kernel_tests/cholesky_op_test.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index 7d5f7715eb1..01c497a37ed 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -32,7 +32,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.ops.linalg import linalg from tensorflow.python.platform import benchmark @@ -91,7 +91,7 @@ def TriAngInvCompositeGrad(l, grad): class CholeskyOpTest(test.TestCase): - def _verifyCholeskyBase(self, sess, x, chol, verification): + def _verifyCholeskyBase(self, x, chol, verification): chol_np, verification_np = self.evaluate([chol, verification]) self.assertAllClose(x, verification_np) self.assertShapeEqual(x, chol) @@ -106,11 +106,11 @@ class CholeskyOpTest(test.TestCase): def _verifyCholesky(self, x): # Verify that LL^T == x. 
- with self.cached_session(use_gpu=True) as sess: - chol = linalg_ops.cholesky(x) - verification = math_ops.matmul(chol, chol, adjoint_b=True) - self._verifyCholeskyBase(sess, x, chol, verification) + chol = linalg_ops.cholesky(x) + verification = math_ops.matmul(chol, chol, adjoint_b=True) + self._verifyCholeskyBase(x, chol, verification) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testBasic(self): data = np.array([[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]) for dtype in (np.float32, np.float64): @@ -123,6 +123,7 @@ class CholeskyOpTest(test.TestCase): complex_data += data self._verifyCholesky(complex_data) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testBatch(self): simple_array = np.array([[[1., 0.], [0., 5.]]]) # shape (1, 2, 2) self._verifyCholesky(simple_array) @@ -144,21 +145,21 @@ class CholeskyOpTest(test.TestCase): matrices[i] = np.dot(matrices[i].T.conj(), matrices[i]) self._verifyCholesky(matrices) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNonSquareMatrix(self): - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(np.array([[1., 2., 3.], [3., 4., 5.]])) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky( np.array([[[1., 2., 3.], [3., 4., 5.]], [[1., 2., 3.], [3., 4., 5.]] ])) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): tensor3 = constant_op.constant([1., 2.]) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(tensor3) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(tensor3) # The below invalid Cholesky call returns an error with TF Classic and just @@ -175,21 +176,23 @@ class CholeskyOpTest(test.TestCase): self._verifyCholesky( np.array([[1., -1., 0.], [-1., 1., -1.], [0., -1., 1.]])) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): self._verifyCholesky(np.empty([0, 2, 2])) self._verifyCholesky(np.empty([2, 0, 0])) @test_util.run_deprecated_v1 def testConcurrentExecutesWithoutError(self): - with self.session(use_gpu=True) as sess: - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) - matrix1 = math_ops.matmul(matrix1, matrix1, adjoint_a=True) - matrix2 = math_ops.matmul(matrix2, matrix2, adjoint_a=True) - c1 = linalg_ops.cholesky(matrix1) - c2 = linalg_ops.cholesky(matrix2) - c1_val, c2_val = self.evaluate([c1, c2]) - self.assertAllClose(c1_val, c2_val) + seed = [42, 24] + matrix_shape = [5, 5] + matrix1 = stateless_random_ops.stateless_random_normal(matrix_shape, seed) + matrix2 = stateless_random_ops.stateless_random_normal(matrix_shape, seed) + matrix1 = math_ops.matmul(matrix1, matrix1, adjoint_a=True) + matrix2 = math_ops.matmul(matrix2, matrix2, adjoint_a=True) + c1 = linalg_ops.cholesky(matrix1) + c2 = linalg_ops.cholesky(matrix2) + c1_val, c2_val = self.evaluate([c1, c2]) + self.assertAllClose(c1_val, c2_val) class CholeskyGradTest(test.TestCase): From 83b85568fb5a5aade46a41909ee9a1b6f3643b57 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Mon, 18 May 2020 10:23:36 -0700 Subject: [PATCH 0763/1533] Support int8 in tflite_convert PiperOrigin-RevId: 312105323 Change-Id: 
I161b9b324e37f42f2026592f7c5bec8ac568c3d6 --- tensorflow/lite/python/tflite_convert.py | 6 ++- tensorflow/lite/python/tflite_convert_test.py | 39 +++++++++++++++---- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py index d0dd7313df3..c7504a3a638 100644 --- a/tensorflow/lite/python/tflite_convert.py +++ b/tensorflow/lite/python/tflite_convert.py @@ -65,6 +65,8 @@ def _parse_inference_type(value, flag): return lite_constants.FLOAT if value == "QUANTIZED_UINT8": return lite_constants.QUANTIZED_UINT8 + if value == "INT8": + return lite_constants.INT8 raise ValueError("Unsupported value for --{0}. Only FLOAT and " "QUANTIZED_UINT8 are supported.".format(flag)) @@ -352,12 +354,12 @@ def _get_tf1_flags(parser): parser.add_argument( "--inference_type", type=str.upper, - choices=["FLOAT", "QUANTIZED_UINT8"], + choices=["FLOAT", "QUANTIZED_UINT8", "INT8"], help="Target data type of real-number arrays in the output file.") parser.add_argument( "--inference_input_type", type=str.upper, - choices=["FLOAT", "QUANTIZED_UINT8"], + choices=["FLOAT", "QUANTIZED_UINT8", "INT8"], help=("Target data type of real-number input arrays. Allows for a " "different type for input arrays in the case of quantization.")) diff --git a/tensorflow/lite/python/tflite_convert_test.py b/tensorflow/lite/python/tflite_convert_test.py index 1e80907edbd..d6a35ba9248 100644 --- a/tensorflow/lite/python/tflite_convert_test.py +++ b/tensorflow/lite/python/tflite_convert_test.py @@ -98,8 +98,8 @@ class TfLiteConvertV1Test(TestModels): sess.close() flags_str = ('--graph_def_file={0} --input_arrays={1} ' - '--output_arrays={2}'.format(graph_def_file, - 'Placeholder', 'add')) + '--output_arrays={2}'.format(graph_def_file, 'Placeholder', + 'add')) self._run(flags_str, should_succeed=True) os.remove(graph_def_file) @@ -137,8 +137,31 @@ class TfLiteConvertV1Test(TestModels): sess.close() flags_str = ('--graph_def_file={0} --input_arrays={1} ' - '--output_arrays={2}'.format(graph_def_file, - 'random', 'add')) + '--output_arrays={2}'.format(graph_def_file, 'random', 'add')) + self._run(flags_str, should_succeed=True) + os.remove(graph_def_file) + + def testQATFrozenGraphDefInt8(self): + with ops.Graph().as_default(): + in_tensor_1 = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputA') + in_tensor_2 = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputB') + _ = array_ops.fake_quant_with_min_max_args( + in_tensor_1 + in_tensor_2, min=0., max=1., name='output', + num_bits=16) # INT8 inference type works for 16 bits fake quant. + sess = session.Session() + + # Write graph to file. 
+ graph_def_file = self._getFilepath('model.pb') + write_graph(sess.graph_def, '', graph_def_file, False) + sess.close() + + flags_str = ('--inference_type=INT8 --std_dev_values=128,128 ' + '--mean_values=128,128 ' + '--graph_def_file={0} --input_arrays={1},{2} ' + '--output_arrays={3}'.format(graph_def_file, 'inputA', + 'inputB', 'output')) self._run(flags_str, should_succeed=True) os.remove(graph_def_file) @@ -166,8 +189,8 @@ class TfLiteConvertV1Test(TestModels): def testKerasFileMLIR(self): keras_file = self._getKerasModelFile() - flags_str = ('--keras_model_file={} --experimental_new_converter' - .format(keras_file)) + flags_str = ( + '--keras_model_file={} --experimental_new_converter'.format(keras_file)) self._run(flags_str, should_succeed=True) os.remove(keras_file) @@ -299,8 +322,8 @@ class TfLiteConvertV2Test(TestModels): def testKerasFileMLIR(self): keras_file = self._getKerasModelFile() - flags_str = ('--keras_model_file={} --experimental_new_converter' - .format(keras_file)) + flags_str = ( + '--keras_model_file={} --experimental_new_converter'.format(keras_file)) self._run(flags_str, should_succeed=True) os.remove(keras_file) From dec7430b13213974928ae395322feabc788b1664 Mon Sep 17 00:00:00 2001 From: Kibeom Kim Date: Mon, 18 May 2020 10:38:01 -0700 Subject: [PATCH 0764/1533] Ensure that tf_py_test tfrt test is not enabled for open source build by introducing tfrt_enabled_internal flag. PiperOrigin-RevId: 312108475 Change-Id: Ia73668bf1e8f097441ed23dd75fb1ac2c0327e1f --- tensorflow/python/data/service/BUILD | 2 ++ tensorflow/python/eager/BUILD | 2 +- tensorflow/python/keras/layers/preprocessing/BUILD | 2 ++ tensorflow/python/kernel_tests/BUILD | 5 ++++- tensorflow/python/kernel_tests/proto/BUILD | 2 +- tensorflow/python/saved_model/BUILD | 2 ++ tensorflow/tensorflow.bzl | 11 ++++++++++- 7 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/data/service/BUILD b/tensorflow/python/data/service/BUILD index 19bcaa3b952..18678230205 100644 --- a/tensorflow/python/data/service/BUILD +++ b/tensorflow/python/data/service/BUILD @@ -1,4 +1,6 @@ load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_py_test") package( diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index c08cb8cc1c3..394b929bf1b 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -1,7 +1,7 @@ -load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_test") # buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") load( diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 052a57b52f3..b580382f9d8 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -2,6 +2,8 @@ # Contains the Keras preprocess layers (internal TensorFlow version). 
load("//tensorflow:tensorflow.bzl", "tf_py_test") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") load("//tensorflow/core/platform/default:distribute.bzl", "distribute_py_test") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 13f59b74baf..cd03da9b179 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1,8 +1,11 @@ # Tests of TensorFlow kernels written using the Python API. -load("//tensorflow:tensorflow.bzl", "sycl_py_test", "tf_custom_op_library", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "sycl_py_test", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "cuda_py_test") +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_py_test") + package( default_visibility = ["//tensorflow:internal"], licenses = ["notice"], # Apache 2.0 diff --git a/tensorflow/python/kernel_tests/proto/BUILD b/tensorflow/python/kernel_tests/proto/BUILD index d9643f3d125..0e935dfe8c4 100644 --- a/tensorflow/python/kernel_tests/proto/BUILD +++ b/tensorflow/python/kernel_tests/proto/BUILD @@ -1,7 +1,7 @@ # Tests of tf.io.*proto. -load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object") +load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow/core/platform:build_config_root.bzl", "if_static") load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library") diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD index 2e5db7edd27..5c30d320fb7 100644 --- a/tensorflow/python/saved_model/BUILD +++ b/tensorflow/python/saved_model/BUILD @@ -2,6 +2,8 @@ # TensorFlow SavedModel. load("//tensorflow:tensorflow.bzl", "cuda_py_test") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_py_test") package( diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index d72bdf58186..70b03146f34 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2218,6 +2218,15 @@ def tf_py_test( xla_enabled = False, grpc_enabled = False, tfrt_enabled = False, + # `tfrt_enabled` is set for some test targets, and if we enable + # TFRT tests just by that, this will enable TFRT builds for open source. + # TFRT open source is not fully integrated yet so we need a temporary + # workaround to enable TFRT only for internal builds. `tfrt_enabled_internal` + # will be set by `tensorflow.google.bzl`'s `tf_py_test` target, which is + # only applied for internal builds. + # TODO(b/156911178): Revert this temporary workaround once TFRT open source + # is fully integrated with TF. 
+ tfrt_enabled_internal = False, **kwargs): """Create one or more python tests with extra tensorflow dependencies.""" xla_test_true_list = [] @@ -2261,7 +2270,7 @@ def tf_py_test( deps = depset(deps + xla_test_true_list), **kwargs ) - if tfrt_enabled: + if tfrt_enabled_internal: py_test( name = name + "_tfrt", size = size, From 95620005efbc52a446a232d5e74ee9fec793f918 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Mon, 18 May 2020 10:41:07 -0700 Subject: [PATCH 0765/1533] Document new methods to enable XNNPACK engine in TFLite PiperOrigin-RevId: 312109175 Change-Id: Iefcbb2ef5d7c83160ef2fc09d668c8e4ac440949 --- tensorflow/lite/delegates/xnnpack/README.md | 45 ++++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md index e0ef6f0899c..c4e3f540faf 100644 --- a/tensorflow/lite/delegates/xnnpack/README.md +++ b/tensorflow/lite/delegates/xnnpack/README.md @@ -1,15 +1,48 @@ # XNNPACK backend for TensorFlow Lite XNNPACK is a highly optimized library of floating-point neural network -inference operators for ARM, WebAssembly, and x86 platforms. This document -describes how to use the XNNPACK library as a backend for TensorFlow Lite. +inference operators for ARM, x86, and WebAssembly architectures in Android, iOS, +Windows, Linux, macOS, and Emscripten environments. This document describes how +to use the XNNPACK library as an inference engine for TensorFlow Lite. -## Enabling XNNPACK backend in TensorFlow Lite models +## Using XNNPACK engine with TensorFlow Lite interpreter XNNPACK integrates with TensorFlow Lite interpreter through the delegation -mechanism. To leverage XNNPACK library for acceleration, the users need to -create an XNNPACK delegate with the `TfLiteXNNPackDelegateCreate` function, -and call `Interpreter::ModifyGraphWithDelegate` to delegate supported parts of +mechanism. There are three methods to enable XNNPACK engine in TensorFlow Lite. + +### Enable XNNPACK via Bazel build flags (recommended) + +When building TensorFlow Lite with Bazel, add +`--define tflite_with_xnnpack=true`, and the TensorFlow Lite interpreter will +use XNNPACK engine by default. + +The exact command depends on the target platform, e.g. for Android AAR you'd use + +``` +bazel build -c opt --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \ + --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ + --define tflite_with_xnnpack=true \ + //tensorflow/lite/java:tensorflow-lite +``` + +### Enable XNNPACK via additional dependency + +Another way to enable XNNPACK is to build and link the +`//tensorflow/lite:tflite_with_xnnpack` target into your application alongside +the TensorFlow Lite framework. + +This method works on platforms which support POSIX-style weak symbols (Android, +iOS, Linux, Mac, but **NOT** Windows). + +### Enable XNNPACK via low-level delegate API (not recommended) + +While it is possible to use low-level delegate API to enable XNNPACK, this +method is **NOT RECOMMENDED** unless you need to use TensorFlow Lite both with +and without XNNPACK (e.g. for benchmarking). + +With low-level delegate API users create an XNNPACK delegate with the +`TfLiteXNNPackDelegateCreate` function, and then call +`Interpreter::ModifyGraphWithDelegate` to delegate supported parts of the model to the XNNPACK delegate. The users must destroy the delegate with `TfLiteXNNPackDelegateDelete` **after** releasing the TensorFlow Lite interpreter. 
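In outline, that low-level flow uses only the three calls named above. A minimal sketch; interpreter construction and error checking are omitted, and the thread count is an arbitrary assumption:

    // Sketch of the low-level delegate API flow described above.
    TfLiteXNNPackDelegateOptions xnnpack_options =
        TfLiteXNNPackDelegateOptionsDefault();
    xnnpack_options.num_threads = 4;  // assumption: tune per device
    TfLiteDelegate* xnnpack_delegate =
        TfLiteXNNPackDelegateCreate(&xnnpack_options);
    interpreter->ModifyGraphWithDelegate(xnnpack_delegate);
    // ... interpreter->Invoke() and result handling ...
    interpreter.reset();  // release the interpreter before the delegate
    TfLiteXNNPackDelegateDelete(xnnpack_delegate);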
The snippet below illustrates the typical usage: From 723b2b59946c3a0bfa83b0b5df408e4699c88016 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 10:44:42 -0700 Subject: [PATCH 0766/1533] enable device tracer test. PiperOrigin-RevId: 312109916 Change-Id: Ibf8f17dc7cfd95aeb991796880161567fcb9ebe4 --- tensorflow/core/profiler/internal/gpu/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index e6ee8514227..c6fe4d77031 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -55,7 +55,6 @@ tf_cc_test_gpu( linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + [ "nomac", - "notap", # b/154510273 "gpu_cupti", ], deps = [ From 9cf08f43e07c6bb47bd9d41b3c6b0f33811f77c6 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 18 May 2020 11:17:10 -0700 Subject: [PATCH 0767/1533] [XLA:Python] Delete deprecated methods from XLA:Python API. PiperOrigin-RevId: 312117146 Change-Id: I232b67b9c4955b7fa6ab7e3ced9446d5ca2ea0e8 --- tensorflow/compiler/xla/python/xla.cc | 114 ------------------- tensorflow/compiler/xla/python/xla_client.py | 10 +- 2 files changed, 5 insertions(+), 119 deletions(-) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index f10ec978399..0c4695cabf3 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -930,34 +930,6 @@ PYBIND11_MODULE(xla_extension, m) { "client", [](const ClientAndPtr& device) { return device.client; }) .def("__str__", &Device::DebugString) - // TODO(phawkins): remove capitalized names after updating callers. - .def("TransferToInfeed", - [](const Device& device, const LiteralSlice& literal) { - GlobalPyRefManager()->CollectGarbage(); - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - return local_device->client()->TransferToInfeedLocal( - literal, local_device->device_ordinal()); - }) - .def( - "TransferFromOutfeed", - [](const Device& device, const Shape& shape) -> StatusOr { - GlobalPyRefManager()->CollectGarbage(); - std::shared_ptr literal_shared; - { - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - TF_ASSIGN_OR_RETURN( - Literal literal, - local_device->client()->TransferFromOutfeedLocal( - shape, local_device->device_ordinal())); - - literal_shared = std::make_shared(std::move(literal)); - } - return LiteralToPython(std::move(literal_shared)); - }) .def("transfer_to_infeed", [](const Device& device, const LiteralSlice& literal) { GlobalPyRefManager()->CollectGarbage(); @@ -1244,28 +1216,6 @@ PYBIND11_MODULE(xla_extension, m) { .def("size_of_generated_code_in_bytes", &PjRtExecutable::SizeOfGeneratedCodeInBytes) .def("delete", &PjRtExecutable::Delete) - // TODO(phawkins): delete capitalized methods after updating callers. 
- .def("Delete", &PjRtExecutable::Delete) - .def( - "Execute", - [](const PjRtExecutable& executable, - absl::Span args) - -> StatusOr>> { - py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - TF_ASSIGN_OR_RETURN( - std::vector> output_buffers, - executable.Execute(args, options)); - std::vector> outputs; - outputs.reserve(output_buffers.size()); - for (auto& buffer : output_buffers) { - outputs.push_back(WrapWithClient( - executable.client()->shared_from_this(), std::move(buffer))); - } - return outputs; - }, - py::arg("arguments")) .def( "execute", [](const PjRtExecutable& executable, @@ -1286,33 +1236,6 @@ PYBIND11_MODULE(xla_extension, m) { return outputs; }, py::arg("arguments")) - // TODO(phawkins): delete capitalized methods after updating callers. - .def( - "ExecuteOnLocalDevices", - [](const PjRtExecutable& executable, - absl::Span> args) - -> StatusOr< - std::vector>>> { - py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - TF_ASSIGN_OR_RETURN( - std::vector>> - output_buffers, - executable.ExecuteOnLocalDevices(args, options)); - std::vector>> outputs; - outputs.resize(output_buffers.size()); - for (int computation = 0; computation < output_buffers.size(); - ++computation) { - for (auto& buffer : output_buffers[computation]) { - outputs[computation].push_back( - WrapWithClient(executable.client()->shared_from_this(), - std::move(buffer))); - } - } - return outputs; - }, - py::arg("arguments")) .def( "execute_on_local_devices", [](const PjRtExecutable& executable, @@ -1414,12 +1337,6 @@ PYBIND11_MODULE(xla_extension, m) { proto.ParseFromString(serialized_hlo_module_proto); return absl::make_unique(proto); })) - // TODO(phawkins): delete capitalized names after updating callers. - .def("GetProgramShape", &XlaComputation::GetProgramShape) - .def("GetSerializedProto", &GetComputationSerializedProto) - .def("GetHloText", &GetComputationHloText) - .def("GetHloDotGraph", &GetComputationHloDotGraph) - .def("Hash", &HashComputation) .def("get_hlo_module", &GetHloModule) .def("program_shape", &XlaComputation::GetProgramShape) .def("as_serialized_hlo_module_proto", &GetComputationSerializedProto) @@ -1512,28 +1429,7 @@ PYBIND11_MODULE(xla_extension, m) { }, "Builds a computation from the contents of the builder.", py::arg("root") = absl::nullopt) - .def("ClearOpMetadata", &XlaBuilder::ClearOpMetadata) .def("GetShape", &XlaBuilder::GetShape) - .def( - "GetProgramShape", - [](const XlaBuilder& builder, - absl::optional root) -> StatusOr { - return root ? 
builder.GetProgramShape(*root) - : builder.GetProgramShape(); - }, - py::arg("root") = absl::nullopt) - .def("IsConstant", &XlaBuilder::IsConstant) - .def("SetOpMetadata", &XlaBuilder::SetOpMetadata) - .def("SetSharding", &XlaBuilder::SetSharding) - .def("ClearSharding", &XlaBuilder::ClearSharding) - .def("SetUpAlias", - [](XlaBuilder& builder, const std::vector& output_index, - int64 param_number, const std::vector& param_index) { - builder.SetUpAlias( - ShapeIndex(output_index.begin(), output_index.end()), - param_number, - ShapeIndex(param_index.begin(), param_index.end())); - }) .def( "build", [](XlaBuilder& builder, absl::optional root) { @@ -1564,17 +1460,7 @@ PYBIND11_MODULE(xla_extension, m) { ShapeIndex(param_index.begin(), param_index.end())); }); - // TODO(phawkins): delete capitalized names after updating callers - m.def("BufferToDLPackManagedTensor", BufferToDLPackManagedTensor); m.def("buffer_to_dlpack_managed_tensor", BufferToDLPackManagedTensor); - m.def("DLPackManagedTensorToBuffer", - [](const py::capsule& tensor, std::shared_ptr client) - -> StatusOr> { - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer, - DLPackManagedTensorToBuffer(tensor, client.get())); - return WrapWithClient(std::move(client), std::move(buffer)); - }); m.def("dlpack_managed_tensor_to_buffer", [](const py::capsule& tensor, std::shared_ptr client) -> StatusOr> { diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index d9cd906939d..76c3bc33a91 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -300,13 +300,13 @@ CompileOptions = _xla.CompileOptions # An Executable is a C++ class that duck types with the following API: # class Executable(object): # def local_devices(self) -> [Device]: -# def Execute(self, arguments : [Buffer]) -> Buffer: +# def execute(self, arguments : [Buffer]) -> Buffer: # """Execute on one replica with Buffer arguments and return value.""" # -# def SizeOfGeneratedCodeInBytes(self) -> int: +# def size_of_generated_code_in_bytes(self) -> int: # """Return generated binary size, or -1 if not known.""" # -# def ExecuteOnLocalDevices(self, arguments: [[Buffer]]) -> [Buffer]: +# def execute_on_local_devices(self, arguments: [[Buffer]]) -> [Buffer]: # """Execute on many replicas with Buffer arguments and return value. # # Args: @@ -329,7 +329,7 @@ def execute_with_python_values(executable, arguments, backend): return backend.buffer_from_pyval(arg, device=executable.local_devices()[0]) arguments = [put(arg) for arg in arguments] - outputs = executable.Execute(arguments) + outputs = executable.execute(arguments) return [x.to_py() for x in outputs] @@ -359,7 +359,7 @@ def execute_with_python_values_replicated(executable, arguments, backend): flat_arg_buffers = flat_arg_buffers[len(replica_args):] return [[x.to_py() for x in xs] - for xs in executable.ExecuteOnLocalDevices(arg_buffers)] + for xs in executable.execute_on_local_devices(arg_buffers)] class PaddingType(enum.Enum): From ef45324fc62fc9a911e5771a40f9790900500de9 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 18 May 2020 11:26:48 -0700 Subject: [PATCH 0768/1533] Hexagon Delegate - Allow optional tensors as valid tensors in inputs. - Update fully connected builder to handle optional bias tensor. 
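For context, a small sketch (a hypothetical helper, not part of this change) of the optional-tensor convention that the builder and utils updates below rely on: an omitted bias is recorded as input index -1, so it must be checked before use.

```c++
#include "tensorflow/lite/c/common.h"

// Sketch: TFLite encodes an omitted optional input (such as the fully
// connected bias) as index -1, so builders must check the index before
// dereferencing context->tensors with it.
bool FullyConnectedHasBias(const TfLiteNode* node) {
  return node->inputs->size > 2 && node->inputs->data[2] != -1;
}
```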
PiperOrigin-RevId: 312119090 Change-Id: If905792a78f61abde0f269ed252aa2501ae60815 --- .../hexagon/builders/matmul_builder.cc | 68 +++++++++-------- .../hexagon/builders/tests/matmul_test.cc | 73 +++++++++++++++++-- .../experimental/delegates/hexagon/utils.cc | 21 ++++-- 3 files changed, 116 insertions(+), 46 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc index c53e62d27a7..c0c815ffdcc 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc @@ -129,35 +129,41 @@ TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, // Bias tensor. int bias_tensor_id = inputs->data[2]; - const auto& bias_tensor = context->tensors[bias_tensor_id]; - auto* const_bias_node = - graph_builder_->AddConstNodeWithData(bias_tensor_id, bias_tensor); - graph_builder_->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), 0); - ComputeMinAndMaxQuantValues(bias_tensor, &bias_min_, &bias_max_); - auto* bias_min_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&bias_min_), - sizeof(bias_min_)); - auto* bias_max_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&bias_max_), - sizeof(bias_max_)); + TensorID matmul_and_bias_out = matmul_out, + matmul_and_bias_out_min = matmul_out_min, + matmul_and_bias_out_max = matmul_out_max; + if (bias_tensor_id != -1) { + const auto& bias_tensor = context->tensors[bias_tensor_id]; + auto* const_bias_node = + graph_builder_->AddConstNodeWithData(bias_tensor_id, bias_tensor); + graph_builder_->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), + 0); + ComputeMinAndMaxQuantValues(bias_tensor, &bias_min_, &bias_max_); + auto* bias_min_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&bias_min_), + sizeof(bias_min_)); + auto* bias_max_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&bias_max_), + sizeof(bias_max_)); - // MatMul + Bias. - auto* bias_add_op = graph_builder_->AddNode(GetTFLiteNodeID()); - bias_add_op->SetOpType(OP_QuantizedBiasAdd_32p32to32); - bias_add_op->AddInput(matmul_out); - bias_add_op->AddInput(graph_builder_->GetHexagonTensorId(bias_tensor_id)); - bias_add_op->AddInput(matmul_out_min); - bias_add_op->AddInput(matmul_out_max); - bias_add_op->AddInput(TensorID(bias_min_const->GetID(), 0)); - bias_add_op->AddInput(TensorID(bias_max_const->GetID(), 0)); - const auto& bias_add_out = - bias_add_op->AddOutput(sizeof(int32_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - const auto& bias_add_out_min = - bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - const auto& bias_add_out_max = - bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + // MatMul + Bias. 
+ auto* bias_add_op = graph_builder_->AddNode(GetTFLiteNodeID()); + bias_add_op->SetOpType(OP_QuantizedBiasAdd_32p32to32); + bias_add_op->AddInput(matmul_out); + bias_add_op->AddInput(graph_builder_->GetHexagonTensorId(bias_tensor_id)); + bias_add_op->AddInput(matmul_out_min); + bias_add_op->AddInput(matmul_out_max); + bias_add_op->AddInput(TensorID(bias_min_const->GetID(), 0)); + bias_add_op->AddInput(TensorID(bias_max_const->GetID(), 0)); + matmul_and_bias_out = + bias_add_op->AddOutput(sizeof(int32_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + matmul_and_bias_out_min = + bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + matmul_and_bias_out_max = + bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + } // Quantize 32-bit result into 8-bit format using output tensor min/max. ComputeMinAndMaxQuantValues(context->tensors[outputs->data[0]], &output_min_, @@ -170,9 +176,9 @@ TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, sizeof(output_max_)); auto* quantize_biasadd_op = graph_builder_->AddNode(GetTFLiteNodeID()); quantize_biasadd_op->SetOpType(OP_Requantize_32to8); - quantize_biasadd_op->AddInput(bias_add_out); - quantize_biasadd_op->AddInput(bias_add_out_min); - quantize_biasadd_op->AddInput(bias_add_out_max); + quantize_biasadd_op->AddInput(matmul_and_bias_out); + quantize_biasadd_op->AddInput(matmul_and_bias_out_min); + quantize_biasadd_op->AddInput(matmul_and_bias_out_max); quantize_biasadd_op->AddInput(TensorID(output_min_const->GetID(), 0)); quantize_biasadd_op->AddInput(TensorID(output_max_const->GetID(), 0)); node_output_ = diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc index a16e22888dd..3a5f320a6a7 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc @@ -22,7 +22,7 @@ using testing::ElementsAreArray; class FullyConnectedOpModel : public SingleOpModelWithHexagon { public: FullyConnectedOpModel(int units, int batches, const TensorData& input, - const TensorData& output) + const TensorData& output, bool optional_bias = false) : batches_(batches), units_(units) { int total_input_size = 1; for (size_t i = 0; i < input.shape.size(); ++i) { @@ -34,9 +34,13 @@ class FullyConnectedOpModel : public SingleOpModelWithHexagon { weights_ = AddInput({input.type, {units_, input_size_}, input.min, input.max}); - auto bias_scale = GetScale(input_) * GetScale(weights_); - TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale}; - bias_ = AddInput(bias); + if (optional_bias) { + bias_ = AddNullInput(); + } else { + auto bias_scale = GetScale(input_) * GetScale(weights_); + TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale}; + bias_ = AddInput(bias); + } output_ = AddOutput(output); @@ -46,15 +50,16 @@ class FullyConnectedOpModel : public SingleOpModelWithHexagon { FullyConnectedOptionsWeightsFormat_DEFAULT, /*keep_num_dims=*/false) .Union()); - - BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)}); + BuildInterpreter({GetShape(input_), GetShape(weights_)}); // Weights & bias tensors need to be constant. // We don't use AddConstInput to allow setting filter values later. 
auto* weights_tensor = interpreter_->tensor(weights_); weights_tensor->allocation_type = kTfLiteMmapRo; - auto* bias_tensor = interpreter_->tensor(bias_); - bias_tensor->allocation_type = kTfLiteMmapRo; + if (!optional_bias) { + auto* bias_tensor = interpreter_->tensor(bias_); + bias_tensor->allocation_type = kTfLiteMmapRo; + } } void SetBias(const std::vector& data) { @@ -146,4 +151,56 @@ TEST(QuantizedFullyConnectedOpTest, TestQuantizedUint8) { ElementsAre(151, 152, 153, 185, 186, 187)); } +TEST(QuantizedFullyConnectedOpTest, TestQuantizedUint8_NoBias) { + FullyConnectedOpModel m( + /*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64}, + /*output=*/{TensorType_UINT8, {}, -127, 128}, /*optional_bias*/ true); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output))); +} + +TEST(QuantizedFullyConnectedOpTest, TestQuantizedInt8_NoBias) { + FullyConnectedOpModel m(/*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64}, + /*output=*/{TensorType_INT8, {}, -127, 128}, + /*optional_bias*/ true); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output))); +} + } // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index 8aff13549b8..ae7f6994657 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -116,6 +116,9 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, int tensor_id; for (int i = 0; i < node->inputs->size; ++i) { tensor_id = node->inputs->data[i]; + // Skip optional tensors. Builders should handle optional tensors + // not available. 
+ if (tensor_id == -1) continue; const auto& tensor = context->tensors[tensor_id]; if (tensor.dims->size > 4) return false; } @@ -191,19 +194,22 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, if (!InputsWithCorrectTypes(node, context, {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteUInt8, kTfLiteInt8}, - {kTfLiteInt32}})) + {kTfLiteInt32, kTfLiteNoType}})) return false; const auto& weights_tensor = context->tensors[node->inputs->data[1]]; - const auto& bias_tensor = context->tensors[node->inputs->data[2]]; - const bool weights_and_bias_const = - weights_tensor.allocation_type == kTfLiteMmapRo && - bias_tensor.allocation_type == kTfLiteMmapRo; + bool bias_const_or_no_bias = true; + if (node->inputs->data[2] != -1) { + const auto& bias_tensor = context->tensors[node->inputs->data[2]]; + bias_const_or_no_bias = bias_tensor.allocation_type == kTfLiteMmapRo; + } + const bool weights_const = + weights_tensor.allocation_type == kTfLiteMmapRo; const TfLiteFullyConnectedParams* matmul_params = reinterpret_cast( node->builtin_data); - return (weights_and_bias_const && + return (weights_const && bias_const_or_no_bias && IsActivationReluOrNone(matmul_params->activation) && matmul_params->keep_num_dims == false && matmul_params->weights_format == @@ -335,7 +341,8 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return false; const auto& input_tensor = context->tensors[node->inputs->data[1]]; const bool is_four_dim_or_less = input_tensor.dims->size < 5; - // We need splitting axis to be constant, so Hexagon knows output shapes. + // We need splitting axis to be constant, so Hexagon knows output + // shapes. return is_four_dim_or_less && IsConstantTensor(GetInput(context, node, 0)); } From 6f19d507f4955f571582349213c69991868379bb Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 18 May 2020 11:50:56 -0700 Subject: [PATCH 0769/1533] [XLA] Fix rendering of the RngBitGenerator description table PiperOrigin-RevId: 312123981 Change-Id: I9d1ecdf88dfb9f5689dcfc26f6243a192ab55dd6 --- .../compiler/xla/g3doc/operation_semantics.md | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 495701eaac2..002d07184a7 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -2299,20 +2299,26 @@ The output is guaranteed to be a deterministic function of the initial state but it is *not* guaranteed to be deterministic between backends and different compiler versions. -`RngBitGenerator(algorithm, key, shape)` | Arguments | Type | Semantics | -|---------------- | ----------------- | ------------------------------------- | -| `algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. | | -`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. | | `shape` | -`Shape` | Output shape for generated data. | +`RngBitGenerator(algorithm, key, shape)` -Available values for `algorithm`: * `rng_default`: Backend specific algorithm -with backend specific shape requirements. * `rng_three_fry`: ThreeFry -counter-based PRNG algorithm. The `initial_state` shape is `u64[2]` with -arbitrary values. -[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) -* `rng_philox`: Philox algorithm to generate random numbers in parallel. The -`initial_state` shape is `u64[3]` with arbitrary values. 
-[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) +Arguments | Type | Semantics +--------------- | ----------------- | ------------------------------------- +`algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. +`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. +`shape` | `Shape` | Output shape for generated data. + +Available values for `algorithm`: + +- `rng_default`: Backend specific algorithm with backend specific shape + requirements. + +- `rng_three_fry`: ThreeFry counter-based PRNG algorithm. The `initial_state` + shape is `u64[2]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + +- `rng_philox`: Philox algorithm to generate random numbers in parallel. The + `initial_state` shape is `u64[3]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) ## Scatter From 672e419c9f7e331fff4449799e8cd7c476ac4b7c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 12:35:23 -0700 Subject: [PATCH 0770/1533] Enable tests for tf.linalg.lu in eager mode. PiperOrigin-RevId: 312132817 Change-Id: I0dd5b96cc2b3462817e0637794a623c24bd0f989 --- tensorflow/python/kernel_tests/lu_op_test.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py index 7935b66f4af..de9d8c32cb5 100644 --- a/tensorflow/python/kernel_tests/lu_op_test.py +++ b/tensorflow/python/kernel_tests/lu_op_test.py @@ -30,7 +30,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import map_fn from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -214,15 +214,20 @@ class LuOpTest(test.TestCase): data = np.random.rand(n, n) + 1j * np.random.rand(n, n) self._verifyLu(data) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): self._verifyLu(np.empty([0, 2, 2])) self._verifyLu(np.empty([2, 0, 0])) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) + matrix_shape = [5, 5] + seed = [42, 24] + matrix1 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + matrix2 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + self.assertAllEqual(matrix1, matrix2) lu1, p1 = linalg_ops.lu(matrix1) lu2, p2 = linalg_ops.lu(matrix2) lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2]) From 7254343a10ba00d48f828981cec3e3587e667ca9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 12:37:47 -0700 Subject: [PATCH 0771/1533] Enable tests for tf.linalg.matrix_square_root in eager mode. 
PiperOrigin-RevId: 312133318 Change-Id: I541a94a21594384fba30a9198ad5a7300537c498 --- .../matrix_square_root_op_test.py | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py index c36d83e2530..6cf330ed981 100644 --- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py @@ -21,10 +21,11 @@ from __future__ import print_function import numpy as np from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util from tensorflow.python.ops import gen_linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.platform import test @@ -89,31 +90,35 @@ class SquareRootOpTest(test.TestCase): self._verifySquareRootReal(np.empty([0, 2, 2])) self._verifySquareRootReal(np.empty([2, 0, 0])) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The input to the square root should be at least a 2-dimensional tensor. tensor = constant_op.constant([1., 2.]) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): gen_linalg_ops.matrix_square_root(tensor) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNotSquare(self): - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): tensor = constant_op.constant([[1., 0., -1.], [-1., 1., 0.]]) self.evaluate(gen_linalg_ops.matrix_square_root(tensor)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - with test_util.use_gpu(): - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) - square1 = math_ops.matmul(matrix1, matrix1) - square2 = math_ops.matmul(matrix2, matrix2) - sqrt1 = gen_linalg_ops.matrix_square_root(square1) - sqrt2 = gen_linalg_ops.matrix_square_root(square2) - all_ops = [sqrt1, sqrt2] - sqrt = self.evaluate(all_ops) - self.assertAllClose(sqrt[0], sqrt[1]) + matrix_shape = [5, 5] + seed = [42, 24] + matrix1 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + matrix2 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + self.assertAllEqual(matrix1, matrix2) + square1 = math_ops.matmul(matrix1, matrix1) + square2 = math_ops.matmul(matrix2, matrix2) + sqrt1 = gen_linalg_ops.matrix_square_root(square1) + sqrt2 = gen_linalg_ops.matrix_square_root(square2) + all_ops = [sqrt1, sqrt2] + sqrt = self.evaluate(all_ops) + self.assertAllClose(sqrt[0], sqrt[1]) if __name__ == "__main__": From b5436f9d5fe7bdfc8e42f0b27328a8457d48ccf6 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 18 May 2020 12:43:30 -0700 Subject: [PATCH 0772/1533] Rename TraceMe::SetMetadata to TraceMe::AppendMetadata and add lambda overload. 
PiperOrigin-RevId: 312134462 Change-Id: Ia1a0f7de954fba6c0b05a6beae10cc08dc803cfc --- tensorflow/core/profiler/lib/BUILD | 2 + tensorflow/core/profiler/lib/traceme.h | 56 +++++++++----- tensorflow/core/profiler/lib/traceme_encode.h | 73 +++++++++++++++---- tensorflow/python/profiler/internal/BUILD | 1 + .../profiler/internal/traceme_wrapper.cc | 10 ++- 5 files changed, 107 insertions(+), 35 deletions(-) diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 0aa1a5d6b67..5bb9236efb3 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -94,6 +94,7 @@ cc_library( hdrs = ["traceme.h"], visibility = ["//visibility:public"], deps = [ + ":traceme_encode", "@com_google_absl//absl/strings", "//tensorflow/core:lib", "//tensorflow/core/platform", @@ -159,6 +160,7 @@ filegroup( "profiler_session.h", "scoped_annotation.h", "traceme.h", + "traceme_encode.h", ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index 2c3e3ebe6cc..ec5f6765afb 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -28,6 +28,7 @@ limitations under the License. #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/profiler/internal/traceme_recorder.h" #endif +#include "tensorflow/core/profiler/lib/traceme_encode.h" // IWYU pragma: export namespace tensorflow { namespace profiler { @@ -123,13 +124,20 @@ class TraceMe { explicit TraceMe(const char* raw, int level = 1) : TraceMe(absl::string_view(raw), level) {} - // This overload only generates the activity name if tracing is enabled. - // Useful for avoiding things like string concatenation when tracing is - // disabled. The |name_generator| may be a lambda or functor that returns a - // type that the string() constructor can take. + // This overload only generates the name (and possibly metadata) if tracing is + // enabled. Useful for avoiding expensive operations (e.g., string + // concatenation) when tracing is disabled. + // name_generator may be a lambda or functor that returns a type that the + // string() constructor can take, e.g., the result of TraceMeEncode. // name_generator is templated, rather than a std::function to avoid // allocations std::function might make even if never called. - // Usage: profiler::TraceMe([&]{ return StrCat(prefix, ":", postfix); }); + // Example Usage: + // TraceMe op_trace_me([&]() { + // return StrCat(op_name, ":", op_type); + // } + // TraceMe trace_me_with_metadata([&value1]() { + // return TraceMeEncode("my_trace", {{"key1", value1}, {"key2", 42}}); + // }); template explicit TraceMe(NameGeneratorT name_generator, int level = 1) { DCHECK_GE(level, 1); @@ -167,21 +175,35 @@ class TraceMe { #endif } - // Sets new_metadata in the metadata part of no_init_.name. - void SetMetadata(absl::string_view new_metadata) { + // Appends new_metadata to the TraceMe name passed to the constructor. + // metadata_generator may be a lambda or functor that returns a type that the + // string() constructor can take, e.g., the result of TraceMeEncode. + // metadata_generator is only evaluated when tracing is enabled. + // metadata_generator is templated, rather than a std::function to avoid + // allocations std::function might make even if never called. 
+ // Example Usage: + // trace_me.AppendMetadata([&value1]() { + // return TraceMeEncode({{"key1", value1}, {"key2", 42}}); + // }); + template + void AppendMetadata(MetadataGeneratorT metadata_generator) { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - std::string& name = no_init_.name; - DCHECK(!name.empty()); - DCHECK(!new_metadata.empty()); - if (name.back() == '#') { // name already has metadata - name.back() = ','; - if (TF_PREDICT_TRUE(new_metadata.front() == '#')) { - new_metadata.remove_prefix(1); - } - } - name.append(new_metadata.data(), new_metadata.size()); + traceme_internal::AppendMetadata(&no_init_.name, metadata_generator()); + } + } +#endif + } + + // Appends new_metadata to the payload. + // This overload should only be used by other TraceMe APIs. + // Prefer the overload above instead. + void AppendMetadata(absl::string_view new_metadata) { +#if !defined(IS_MOBILE_PLATFORM) + if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { + if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { + traceme_internal::AppendMetadata(&no_init_.name, new_metadata); } } #endif diff --git a/tensorflow/core/profiler/lib/traceme_encode.h b/tensorflow/core/profiler/lib/traceme_encode.h index 772f56a2153..2e23c6d878b 100644 --- a/tensorflow/core/profiler/lib/traceme_encode.h +++ b/tensorflow/core/profiler/lib/traceme_encode.h @@ -28,7 +28,7 @@ limitations under the License. namespace tensorflow { namespace profiler { -namespace internal { +namespace traceme_internal { // Copies the contents of str to the address pointed by out. // Returns the address after the copy. @@ -36,24 +36,18 @@ namespace internal { TF_ATTRIBUTE_ALWAYS_INLINE inline char* Append(char* out, absl::string_view str) { const size_t str_size = str.size(); - if (str_size > 0) { + if (TF_PREDICT_TRUE(str_size > 0)) { memcpy(out, str.data(), str_size); out += str_size; } return out; } -} // namespace internal - -// Encodes an event name and arguments into a string stored by TraceMe. -// Use within a lambda to avoid expensive operations when tracing is inactive. -// Example Usage: -// TraceMe trace_me([&name, value1]() { -// return TraceMeEncode(name, {{"key1", value1}, {"key2", 42}}); -// }); -inline std::string TraceMeEncode( +// Appends args encoded as TraceMe metadata to name. +TF_ATTRIBUTE_ALWAYS_INLINE inline std::string AppendArgs( std::string name, - std::initializer_list> args) { + const std::initializer_list>& + args) { if (TF_PREDICT_TRUE(args.size() > 0)) { const auto old_size = name.size(); auto new_size = old_size + args.size() * 2 + 1; @@ -65,9 +59,9 @@ inline std::string TraceMeEncode( char* out = begin + old_size; *out++ = '#'; for (const auto& arg : args) { - out = internal::Append(out, arg.first); + out = Append(out, arg.first); *out++ = '='; - out = internal::Append(out, arg.second.Piece()); + out = Append(out, arg.second.Piece()); *out++ = ','; } *(out - 1) = '#'; @@ -76,6 +70,57 @@ inline std::string TraceMeEncode( return name; } +// Appends new_metadata to the metadata part of name. 
+TF_ATTRIBUTE_ALWAYS_INLINE inline void AppendMetadata( + std::string* name, absl::string_view new_metadata) { + if (!TF_PREDICT_FALSE(new_metadata.empty())) { + if (!name->empty() && name->back() == '#') { // name already has metadata + name->back() = ','; + if (TF_PREDICT_TRUE(new_metadata.front() == '#')) { + new_metadata.remove_prefix(1); + } + } + name->append(new_metadata.data(), new_metadata.size()); + } +} + +} // namespace traceme_internal + +// Encodes an event name and arguments into TraceMe metadata. +// Use within a lambda to avoid expensive operations when tracing is disabled. +// Example Usage: +// TraceMe trace_me([value1]() { +// return TraceMeEncode("my_trace", {{"key1", value1}, {"key2", 42}}); +// }); +inline std::string TraceMeEncode( + std::string name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::move(name), args); +} +inline std::string TraceMeEncode( + absl::string_view name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(name), args); +} +inline std::string TraceMeEncode( + const char* name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(name), args); +} + +// Encodes arguments into TraceMe metadata. +// Use within a lambda to avoid expensive operations when tracing is disabled. +// Example Usage: +// TraceMe trace_me("my_trace"); +// ... +// trace_me.AppendMetadata([value1]() { +// return TraceMeEncode({{"key1", value1}, {"key2", 42}}); +// }); +inline std::string TraceMeEncode( + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(), args); +} + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD index d9f93c2fb21..9b0f216508e 100644 --- a/tensorflow/python/profiler/internal/BUILD +++ b/tensorflow/python/profiler/internal/BUILD @@ -89,6 +89,7 @@ tf_python_pybind_extension( deps = [ "//tensorflow/core:lib", "//tensorflow/core/profiler/lib:traceme_headers", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@pybind11", ], diff --git a/tensorflow/python/profiler/internal/traceme_wrapper.cc b/tensorflow/python/profiler/internal/traceme_wrapper.cc index a1b5370836b..6b0098e316d 100644 --- a/tensorflow/python/profiler/internal/traceme_wrapper.cc +++ b/tensorflow/python/profiler/internal/traceme_wrapper.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include +#include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "pybind11/pybind11.h" #include "tensorflow/core/platform/types.h" @@ -27,13 +29,13 @@ namespace { // Helper to implement TraceMe as a context manager in Python. 
 class TraceMeWrapper {
  public:
-  explicit TraceMeWrapper(const tensorflow::string& name) : name_(name) {}
+  explicit TraceMeWrapper(const std::string& name) : name_(name) {}
 
   void Enter() { traceme_.emplace(std::move(name_)); }
 
-  void SetMetadata(const tensorflow::string& new_metadata) {
+  void SetMetadata(const std::string& new_metadata) {
     if (TF_PREDICT_TRUE(traceme_)) {
-      traceme_->SetMetadata(new_metadata);
+      traceme_->AppendMetadata(absl::string_view(new_metadata));
     }
   }
 
@@ -50,7 +52,7 @@ class TraceMeWrapper {
 
 PYBIND11_MODULE(_pywrap_traceme, m) {
   py::class_ traceme_class(m, "TraceMe");
-  traceme_class.def(py::init())
+  traceme_class.def(py::init())
       .def("Enter", &TraceMeWrapper::Enter)
       .def("Exit", &TraceMeWrapper::Exit)
       .def("SetMetadata", &TraceMeWrapper::SetMetadata)
From 8e661af54d9787b2a3a2371cc6efcfa1d8db6a34 Mon Sep 17 00:00:00 2001
From: George Karpenkov
Date: Mon, 18 May 2020 13:03:24 -0700
Subject: [PATCH 0773/1533] [XLA] Simplify tautological compares (and (< x A)
 (< x B)) to (< x A) when `A <= B` holds.

This is required for figuring out the trip count of loops whose condition contains the conjunction.

Such conjunctions arise from TF when a for loop with `tf.range` is lowered, or when using `tf.while_loop` with `maximum_iterations` set.

PiperOrigin-RevId: 312138518
Change-Id: I12c5c7d0aeedbf0d375f3cff1d23b39aea89f64a
---
 .../xla/service/algebraic_simplifier.cc      | 65 +++++++++++++++++++
 .../xla/service/algebraic_simplifier_test.cc | 19 ++++++
 2 files changed, 84 insertions(+)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 55af8726dc8..ecbf2075abe 100755
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -508,6 +508,13 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor {
   // Tries to convert slice(reshape(X)) into reshape(slice(X))
   StatusOr TryToReorderSliceAndReshape(HloInstruction* slice);
 
+  // Tries to simplify `(and (< a N) (< a K))` in cases where `N <= K` into
+  // `(< a N)`. This is crucial for being able to figure out the loop trip
+  // count.
+  //
+  // Assumes that the input is conjunction.
+  StatusOr TrySimplifyTautologicalCompare(HloInstruction* conjunction);
+
   // Useful when we want to use the same visitor over multiple computations.
void ResetState(HloComputation* computation); @@ -856,6 +863,57 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { return Status::OK(); } +StatusOr AlgebraicSimplifierVisitor::TrySimplifyTautologicalCompare( + HloInstruction* conjunction) { + HloInstruction *lhs, *rhs; + if (!Match(conjunction, m::And(m::Op(&lhs), m::Op(&rhs)))) { + return false; + } + struct LessThanCompareInfo { // (LT var constant) + HloInstruction* var; + int64 constant; + }; + + auto get_compare_info_helper = + [&](HloInstruction* lhs, + HloInstruction* rhs) -> absl::optional { + if (!Match(rhs, m::Constant().WithShape( + m::Shape().IsEffectiveScalar().WithElementType( + PrimitiveType::S32)))) { + return absl::nullopt; + } + return {LessThanCompareInfo{lhs, *rhs->literal().GetFirstInteger()}}; + }; + + auto get_compare_info = + [&](HloInstruction* cmp) -> absl::optional { + HloInstruction *lhs, *rhs; + if (!Match(cmp, m::Compare(m::Op(&lhs), m::Op(&rhs)) + .WithComparisonDirection(ComparisonDirection::kLt))) { + return absl::nullopt; + } + if (auto match1 = get_compare_info_helper(lhs, rhs)) { + return match1; + } else if (auto match2 = get_compare_info_helper(rhs, lhs)) { + return match2; + } + return absl::nullopt; + }; + + absl::optional lhs_info = get_compare_info(lhs); + absl::optional rhs_info = get_compare_info(rhs); + if (lhs_info && rhs_info && lhs_info->var == rhs_info->var) { + int64 new_bound = std::min(lhs_info->constant, rhs_info->constant); + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + conjunction, + HloInstruction::CreateCompare(lhs->shape(), lhs_info->var, + MakeScalarLike(lhs_info->var, new_bound), + ComparisonDirection::kLt))); + return true; + } + return false; +} + Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { HloInstruction *lhs, *rhs; CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs)))); @@ -890,6 +948,13 @@ Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { return Status::OK(); } + // Simplify tautological conjunctions. 
+ TF_ASSIGN_OR_RETURN(bool found_tautological_compare, + TrySimplifyTautologicalCompare(logical_and)); + if (found_tautological_compare) { + return Status::OK(); + } + return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 6c8e80aa963..08a004e39fe 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -5761,6 +5761,25 @@ TEST_F(AlgebraicSimplifierTest, CompareSame) { GmockMatch(m::Broadcast(m::ConstantScalar(true)))); } +TEST_F(AlgebraicSimplifierTest, CompareSimplified) { + const char* kModuleStr = R"( + HloModule m + test { + param = s32[] parameter(0) + c1 = s32[] constant(10) + c2 = s32[] constant(100) + cmp1 = pred[] compare(param, c1), direction=LT + cmp2 = pred[] compare(param, c2), direction=LT + ROOT out = pred[] and(cmp1, cmp2) + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Compare(m::Op(), m::Op().IsConstantScalar(10)) + .WithComparisonDirection(ComparisonDirection::kLt))); +} + TEST_F(AlgebraicSimplifierTest, CanDisableDotToMultiplyRewrite) { // Some backends may have better performance by treating an outer product as a // Dot, rather than a broadcast Multiply From 869920697b243622073317ddc533bdff41684c41 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 18 May 2020 13:27:55 -0700 Subject: [PATCH 0774/1533] [tf.lite] Use in-process conversion when the new converter is used Out-of-process conversion was a workaround for the legacy converter, which would generally crash the process when conversion failed. However, out-of-process conversion also adds a good deal of complexity, so avoid it when using the new conversion backend. PiperOrigin-RevId: 312142994 Change-Id: I7ddc83df99ccf24be6e15f46d6a116dce8321933 --- tensorflow/lite/python/convert.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 6b7a32f1bcc..a5fbb88132e 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -169,9 +169,10 @@ def toco_convert_protos(model_flags_str, RuntimeError: When conversion fails, an exception is raised with the error message embedded. """ - # TODO(aselle): When toco does not use fatal errors for failure, we can - # switch this on. - if not _toco_from_proto_bin: + # Historically, TOCO conversion failures would trigger a crash, so we would + # attempt to run the converter out-of-process. The MLIR conversion pipeline + # surfaces errors instead, and can be safely run in-process. 
+ if enable_mlir_converter or not _toco_from_proto_bin: try: model_str = wrap_toco.wrapped_toco_convert(model_flags_str, toco_flags_str, input_data_str, From da67fcddef242a0c358f4acc5f263880c1863836 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Mon, 18 May 2020 13:36:18 -0700 Subject: [PATCH 0775/1533] Edit Hexagon documentation to reflect new supported models PiperOrigin-RevId: 312144610 Change-Id: I9c8b0d9ad6ea4b745b4bb985ca143cca660a5b14 --- .../g3doc/performance/hexagon_delegate.md | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/hexagon_delegate.md b/tensorflow/lite/g3doc/performance/hexagon_delegate.md index 60fe9465bf4..0e947d1d5e1 100644 --- a/tensorflow/lite/g3doc/performance/hexagon_delegate.md +++ b/tensorflow/lite/g3doc/performance/hexagon_delegate.md @@ -22,15 +22,15 @@ are supported, including: **Supported models:** -The Hexagon delegate currently supports quantized models generated using -[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize), -e.g., -[these quantized models](https://www.tensorflow.org/lite/guide/hosted_models#quantized_models) -hosted on the TensorFlow Lite repo. It does not (yet) support models with -[8-bit symmetric quantization spec](https://www.tensorflow.org/lite/performance/quantization_spec). -Sample models include -[MobileNet V1](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz), -[SSD Mobilenet](https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip). +The Hexagon delegate supports all models that conform to our +[8-bit symmetric quantization spec](https://www.tensorflow.org/lite/performance/quantization_spec), +including those generated using +[post-training integer quantization](https://www.tensorflow.org/lite/performance/post_training_integer_quant). +UInt8 models trained with the legacy +[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize) +path are also supported, for e.g., +[these quantized versions](https://www.tensorflow.org/lite/guide/hosted_models#quantized_models) +on our Hosted Models page. ## Hexagon Delegate Java API @@ -254,10 +254,6 @@ ro.board.platform`). ## FAQ -* Will the delegate support models created using - [post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization)? - * This is tentatively planned for a future release, though there is no - concrete timeline. * Which ops are supported by the delegate? * See the current list of [supported ops and constraints](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/delegates/hexagon/README.md) * How can I tell that the model is using the DSP when I enable the delegate? From d4f71ff132a1262f4a6b05f58807e8ba3d46b83d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 13:38:25 -0700 Subject: [PATCH 0776/1533] Enable tests for tf.linalg.tensordot in eager mode. 
PiperOrigin-RevId: 312144965 Change-Id: I2d75f7d9bd7f05aef6d1dee620dffcea66071b97 --- .../python/kernel_tests/tensordot_op_test.py | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py index 71e448f7855..7f8c5e9781b 100644 --- a/tensorflow/python/kernel_tests/tensordot_op_test.py +++ b/tensorflow/python/kernel_tests/tensordot_op_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl @@ -41,16 +41,19 @@ def _add_test(test, test_name, fn): class TensordotTest(test_lib.TestCase): - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_invalid_shape(self): a = [[1, 2], [3, 4]] b = [[1, 2], [3, 4], [5, 6]] a_axes = [1] b_axes = [0] # Invalid static shapes. - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): math_ops.tensordot(a, b, (a_axes, b_axes)) + # Invalid dynamic shapes. + if context.executing_eagerly(): + return with self.cached_session() as sess: with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "Matrix size-incompatible"): @@ -65,7 +68,7 @@ class TensordotTest(test_lib.TestCase): axes_ph: (a_axes, b_axes) }) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_invalid_axes(self): a = [[1, 2], [3, 4]] b = [[1, 2], [3, 4]] @@ -77,6 +80,8 @@ class TensordotTest(test_lib.TestCase): with self.assertRaises(IndexError): math_ops.tensordot(a, b, [[0], [7]]) + if context.executing_eagerly(): + return # Invalid dynamic axes. 
a_ph = array_ops.placeholder(dtypes.float32) b_ph = array_ops.placeholder(dtypes.float32) @@ -93,22 +98,22 @@ class TensordotTest(test_lib.TestCase): axes_ph: axes_value }) - # Test case for 11950 + # Test case for https://github.com/tensorflow/tensorflow/issues/11950 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_valid_axis(self): for axes_value in [1, 2], [[1], [2]], [[], []], 0: - with self.cached_session(): - np_a = np.ones((3, 3)) - np_b = np.array([2, 3, 1])[None, None] - np_ans = np.tensordot(np_a, np_b, axes_value) + np_a = np.ones((3, 3)) + np_b = np.array([2, 3, 1])[None, None] + np_ans = np.tensordot(np_a, np_b, axes_value) - tf_a = array_ops.ones((3, 3), dtype=dtypes.float32) - tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None] - tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value) + tf_a = array_ops.ones((3, 3), dtype=dtypes.float32) + tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None] + tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value) - self.assertAllEqual(tf_ans.shape, np_ans.shape) - self.assertAllEqual(tf_ans, np_ans) + self.assertAllEqual(tf_ans.shape, np_ans.shape) + self.assertAllEqual(self.evaluate(tf_ans), np_ans) - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("Shape inference test") def test_partial_shape_inference(self): for axes in ([1], [0]), 1: a = array_ops.placeholder(dtypes.float32) @@ -159,7 +164,10 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): size=np.prod(b_shape)).reshape(b_shape).astype(dtype_) return a, b, a_dims, b_dims + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_tensordot(self): + if dynamic_shape_ and context.executing_eagerly(): + self.skipTest("Placeholders not support in eager mode") num_trials = min(30, num_dims_ * num_dims_) if dtype_ == np.float16: tol = 0.05 @@ -187,7 +195,10 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol) self.assertAllEqual(tf_ans.shape, np_ans.shape) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_tensordot_scalar_axes(self): + if dynamic_shape_ and context.executing_eagerly(): + self.skipTest("Placeholders not support in eager mode") if num_dims_ < 1: self.skipTest("Not a test") if dtype_ == np.float16: @@ -229,7 +240,7 @@ if __name__ == "__main__": for rank_b in 1, 2, 4, 5: for num_dims in range(0, min(rank_a, rank_b) + 1): # TF2 does not support placeholders under eager so we skip it - for dynamic_shape in set([False, not tf2.enabled()]): + for dynamic_shape in set([False, True]): for testcase in _get_tensordot_tests(dtype, rank_a, rank_b, num_dims, dynamic_shape): name = "%s_%s_%s_%s_%s_%s" % (testcase.__name__, dtype.__name__, From ecf503380978e04e5e47f231fcc33a49d6c9d841 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Mon, 18 May 2020 13:38:32 -0700 Subject: [PATCH 0777/1533] Return a meaningful error for dynamic shape inputs with outside compilation head extraction in TPUs. 
PiperOrigin-RevId: 312144982 Change-Id: I187b58ac8759b391fdcb9649bffd979025350f55 --- .../python/distribute/tpu_strategy_test.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index de4c975d5ef..6c93e29c028 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -28,6 +28,7 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import remote from tensorflow.python.eager import test +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -140,6 +141,9 @@ class TPUStrategyTest(test.TestCase): # for non-local TPU. if FLAGS.tpu: self.skipTest("Recovery fails for non-local TPU, see b/148150981") + + # Disable automatic outside compilation. + config.set_soft_device_placement(False) strategy = get_tpu_strategy() @def_function.function @@ -164,6 +168,28 @@ class TPUStrategyTest(test.TestCase): good_run() + def test_dynamic_shape_with_outside_compilation_failure(self): + # Enable automatic outside compilation. + config.set_soft_device_placement(True) + strategy = get_tpu_strategy() + dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( + 2, drop_remainder=False) + dataset = strategy.experimental_distribute_dataset(dataset) + iterator = iter(dataset) + + @def_function.function + def train_fn(iterator): + + def step_fn(inputs): + _, inputs = inputs + return math_ops.reduce_sum(inputs) + + return strategy.experimental_local_results( + strategy.run(step_fn, args=(next(iterator),))) + + with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): + logging.info(train_fn(iterator)) + def test_computation_on_subset_cores(self): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) From 3d4c5d1b578397070d8cecbfe88d8fa06c183189 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 18 May 2020 14:06:53 -0700 Subject: [PATCH 0778/1533] NFC: Update canonicalize tests to use regex. 
PiperOrigin-RevId: 312150354 Change-Id: Ifed616606d5c8c708a3800256c4234b9bbb3ce3c --- .../mlir/lite/tests/canonicalize.mlir | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir index 1f067aae685..5c69130c939 100644 --- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir @@ -11,9 +11,9 @@ func @reshape_removeAdjacent(tensor<4x4x4xf32>) -> tensor<64xf32> { return %1 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacent -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE]] } // Checks that tfl.reshape should be removed if its output has more than one @@ -29,11 +29,11 @@ func @reshape_removeAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> tensor<64xf32> return %3 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacentWithMultipleUse -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %2 = addf %0, %1 -// CHECK: return %2 +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESULT:.*]] = addf %[[RESHAPE_1]], %[[RESHAPE_2]] +// CHECK: return %[[RESULT]] } // Checks that tfl.reshape should be kept if its output has more than one @@ -47,11 +47,11 @@ func @reshape_keepAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> (tensor<16x4xf32 return %0, %1 : tensor<16x4xf32>, tensor<64xf32> // CHECK-LABEL: func @reshape_keepAdjacentWithMultipleUse -// CHECK: %cst = constant dense<[16, 4]> : tensor<2xi32> -// CHECK: %cst_0 = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst_0) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return %0, %1 +// CHECK: %[[CST:.*]] = constant dense<[16, 4]> : tensor<2xi32> +// CHECK: %[[CST_0:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST_0]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE_1]], %[[RESHAPE_2]] } // Checks that tfl.reshape should be removed if its output type is the same From 6dcb7268bb28221134cd1151a730e89023d59623 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Mon, 18 May 2020 14:33:45 -0700 Subject: [PATCH 0779/1533] Rename `_get_closest` to more accurately reflect what it does. 
PiperOrigin-RevId: 312155516 Change-Id: I27d8dd110ace0150ea735f718ed94948a9a75a74 --- tensorflow/python/distribute/values.py | 22 +++++++++++----------- tensorflow/python/training/optimizer.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 444915aa123..84904f93104 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -139,7 +139,7 @@ class DistributedValues(object): "This method should be overridden by sub-classes which support cross-" "replica accesses.") - def _get_closest(self): + def _get_on_device_or_primary(self): """Returns value in same replica or device if possible, else the _primary.""" replica_id = _get_current_replica_id_as_int() if replica_id is None: @@ -379,7 +379,7 @@ class Mirrored(DistributedDelegate): """Holds a map from replica to values which are kept in sync.""" def _get_cross_replica(self): - return self._get_closest() + return self._get_on_device_or_primary() def _as_graph_element(self): obj = self._get() @@ -480,11 +480,11 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return init_op def initialized_value(self): - return self._get_closest().initialized_value() + return self._get_on_device_or_primary().initialized_value() @property def initial_value(self): - return self._get_closest().initial_value + return self._get_on_device_or_primary().initial_value @property def constraint(self): @@ -537,7 +537,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return self._values[replica_id].handle def eval(self, session=None): - return self._get_closest().eval(session) + return self._get_on_device_or_primary().eval(session) @property def _save_slice_info(self): @@ -552,7 +552,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, @property def device(self): - return self._get_closest().device + return self._get_on_device_or_primary().device @property def trainable(self): @@ -587,7 +587,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return array_ops.identity(self._get()) def value(self): - return self._get_closest().value() + return self._get_on_device_or_primary().value() def numpy(self): if context.executing_eagerly(): @@ -961,7 +961,7 @@ class MirroredVariable(DistributedVariable, Mirrored): return array_ops.identity(Mirrored._get_cross_replica(self)) def _as_graph_element(self): - return self._get_closest()._as_graph_element() # pylint: disable=protected-access + return self._get_on_device_or_primary()._as_graph_element() # pylint: disable=protected-access def _gather_saveables_for_checkpoint(self): """Overrides Trackable method. @@ -1067,7 +1067,7 @@ class SyncOnReadVariable(DistributedVariable): """Holds a map from replica to variables whose values are reduced on save.""" def _update_replica(self, update_fn, value, **kwargs): - return update_fn(self._get_closest(), value, **kwargs) + return update_fn(self._get_on_device_or_primary(), value, **kwargs) # TODO(b/154017756): Make assign behaivor in cross replica context consistent # with MirroredVariable. @@ -1146,8 +1146,8 @@ class SyncOnReadVariable(DistributedVariable): if ds_context.in_cross_replica_context(): return self._get_cross_replica() else: - # _get_closest() returns a Variable. - return self._get_closest().value() + # _get_on_device_or_primary() returns a Variable. 
+ return self._get_on_device_or_primary().value() def _get_cross_replica(self): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 9732ea04f26..1fe8a8c729b 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -768,7 +768,7 @@ class Optimizer( # pylint: enable=protected-access mirrored_slot = named_slots.get(key, None) if mirrored_slot is None: return None - return mirrored_slot._get_closest() # pylint: disable=protected-access + return mirrored_slot._get_on_device_or_primary() # pylint: disable=protected-access return named_slots.get(_var_key(var), None) From 756e66db61ec5b0a642be7381f65cc87d4e64802 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 15:03:26 -0700 Subject: [PATCH 0780/1533] Modify signature of layout_config(). PiperOrigin-RevId: 312161403 Change-Id: I9304d4839f6bcea6804dd959b131ffac7c0be6d6 --- tensorflow/compiler/xla/service/hlo_module_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index 833d0fe59d0..964f83322a4 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -204,7 +204,7 @@ class HloModuleConfig { std::vector>* mutable_dot_config() { return &dot_config_; } - absl::Span>> layout_config() const { + const std::vector>>& layout_config() const { return layout_config_; } From 1a07ecf8526bca5748bf447b16586b60889cdc36 Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: Mon, 18 May 2020 15:08:28 -0700 Subject: [PATCH 0781/1533] In TF-TFRT integration, C API will get dtype from underlying fallback tensor directly if the tfrt dtype is Unsupported. This is used to support dtypes that are not natively implemented in TFRT (e.g. DT_RESOURCE). Enable a few resnet50 tests. 
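For readers unfamiliar with where such unsupported dtypes come from: DT_RESOURCE is the canonical example, and every resource variable handle carries it. A small sketch in ordinary user-level TensorFlow (not the C API path this patch touches):

```python
import tensorflow as tf

v = tf.Variable(1.0)
# The variable's value is float32, but its handle is a DT_RESOURCE tensor.
# Resource tensors like this are the dtypes TFRT does not implement natively,
# which is why the dtype must be read from the underlying fallback tensor.
print(v.dtype)         # tf.float32
print(v.handle.dtype)  # tf.resource
```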
PiperOrigin-RevId: 312162457 Change-Id: Iece6d621120e8b20d0a0fe7b271a76dc29caa924 --- .../python/eager/benchmarks/resnet50/resnet50_test.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py b/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py index 9d049a6d59d..34ceb56d129 100644 --- a/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py +++ b/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py @@ -104,7 +104,6 @@ class ResNet50Test(tf.test.TestCase): context.async_wait() self.assertEqual((2, 1000), output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply(self): self._apply(defun=False) @@ -121,7 +120,6 @@ class ResNet50Test(tf.test.TestCase): def test_apply_with_defun_async(self): self._apply(defun=True, execution_mode=context.ASYNC) - @test_util.disable_tfrt('b/155260334') def test_apply_no_top(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format, include_top=False) @@ -132,7 +130,6 @@ class ResNet50Test(tf.test.TestCase): if data_format == 'channels_first' else (2, 1, 1, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_with_pooling(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format, include_top=False, pooling='avg') @@ -141,7 +138,6 @@ class ResNet50Test(tf.test.TestCase): output = model(images, training=False) self.assertEqual((2, 2048), output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_no_average_pooling(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -153,7 +149,6 @@ class ResNet50Test(tf.test.TestCase): (2, 7, 7, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_block3_strides(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -165,7 +160,6 @@ class ResNet50Test(tf.test.TestCase): (2, 1, 1, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_retrieve_intermediates(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -220,7 +214,6 @@ class ResNet50Test(tf.test.TestCase): self.assertEqual(len(events), 2) self.assertEqual(events[1].summary.value[0].tag, 'loss') - @test_util.disable_tfrt('b/155260334') def test_train(self): self._test_train() @@ -228,7 +221,6 @@ class ResNet50Test(tf.test.TestCase): def test_train_async(self): self._test_train(execution_mode=context.ASYNC) - @test_util.disable_tfrt('b/155260334') def test_no_garbage(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format) From 3c54ef5ab94813713ae538b76a78e1fac4ac424d Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Mon, 18 May 2020 15:17:54 -0700 Subject: [PATCH 0782/1533] Support running a tf.function with packed variable inputs both locally and remotely. - Support packing multiple EagerTensors of the same dtype and shape. - Create CompositeDevices on the same task as the local host CPU, in order to correctly trigger packed TensorHandle copy from a client to a remote worker. 
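A condensed sketch of the new usage, distilled from the function_test.py case added below. Note that pack_eager_tensors is an internal API (tensorflow.python.framework.ops), works only in eager mode, and this sketch assumes multiple logical CPU devices have been configured:

```python
import tensorflow as tf
from tensorflow.python.framework import ops
from tensorflow.python.ops import resource_variable_ops

with tf.device('/cpu:0'):
    v0 = tf.Variable(1.0)
with tf.device('/cpu:1'):
    v1 = tf.Variable(2.0)

# Both handles share dtype (DT_RESOURCE) and shape, so they can be packed into
# a single EagerTensor placed on a COMPOSITE device.
packed = ops.pack_eager_tensors([v0.handle, v1.handle])

@tf.function
def read_both():
    with tf.device('/cpu:0'):
        r0 = resource_variable_ops.read_variable_op(packed, dtype=tf.float32)
    with tf.device('/cpu:1'):
        r1 = resource_variable_ops.read_variable_op(packed, dtype=tf.float32)
    return r0 + r1

print(read_both())  # 3.0
```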
PiperOrigin-RevId: 312164194 Change-Id: Ia15718309c8c68eb645bfe0bf967ddd6d2551b3a --- .../core/common_runtime/composite_device.cc | 12 ++-- .../core/common_runtime/composite_device.h | 5 +- .../common_runtime/composite_device_test.cc | 11 ++-- .../core/common_runtime/eager/context.cc | 7 ++- .../core/common_runtime/eager/context_test.cc | 12 ++-- .../common_runtime/eager/execute_node_test.cc | 3 +- .../eager/tensor_handle_test.cc | 3 + .../process_function_library_runtime_test.cc | 3 +- tensorflow/python/eager/backprop.py | 13 ++++ tensorflow/python/eager/context.py | 16 +++++ tensorflow/python/eager/function_test.py | 37 ++++++++++++ tensorflow/python/eager/pywrap_tensor.cc | 15 ++++- tensorflow/python/eager/pywrap_tfe.h | 3 +- tensorflow/python/eager/remote_test.py | 31 ++++++++++ tensorflow/python/framework/ops.py | 59 +++++++++++++++++++ tensorflow/python/framework/ops_test.py | 47 +++++++++++++++ tensorflow/python/tfe_wrapper.cc | 20 +++++++ 17 files changed, 274 insertions(+), 23 deletions(-) diff --git a/tensorflow/core/common_runtime/composite_device.cc b/tensorflow/core/common_runtime/composite_device.cc index 3103fa37941..7fd41e00a04 100644 --- a/tensorflow/core/common_runtime/composite_device.cc +++ b/tensorflow/core/common_runtime/composite_device.cc @@ -24,7 +24,7 @@ const char* const kCompositeDeviceType = "COMPOSITE"; std::unique_ptr CompositeDevice::MakeDevice( const std::vector& underlying_devices, const int unique_device_id, - Status* status) { + const DeviceNameUtils::ParsedName& host_name, Status* status) { if (underlying_devices.empty()) { status->Update( errors::InvalidArgument("underlying_devices should not be empty.")); @@ -62,13 +62,15 @@ std::unique_ptr CompositeDevice::MakeDevice( return nullptr; } } + + DeviceNameUtils::ParsedName parsed_composite_name = host_name; DeviceAttributes device_attributes; - parsed_name.type = kCompositeDeviceType; - device_attributes.set_device_type(parsed_name.type); - parsed_name.id = unique_device_id; + parsed_composite_name.type = kCompositeDeviceType; + parsed_composite_name.id = unique_device_id; const string composite_name = - DeviceNameUtils::ParsedNameToString(parsed_name); + DeviceNameUtils::ParsedNameToString(parsed_composite_name); device_attributes.set_name(composite_name); + device_attributes.set_device_type(kCompositeDeviceType); return absl::WrapUnique( new CompositeDevice(device_attributes, underlying_devices)); diff --git a/tensorflow/core/common_runtime/composite_device.h b/tensorflow/core/common_runtime/composite_device.h index 127e5b8303a..850eae55e8d 100644 --- a/tensorflow/core/common_runtime/composite_device.h +++ b/tensorflow/core/common_runtime/composite_device.h @@ -42,10 +42,11 @@ class CompositeDevice : public Device { return &underlying_devices_; } - // Helper for creating a CompositeDevice. + // Helper for creating a CompositeDevice on the same task as the given host + // CPU. static std::unique_ptr MakeDevice( const std::vector& underlying_devices, const int unique_device_id, - Status* status); + const DeviceNameUtils::ParsedName& host_name, Status* status); private: CompositeDevice(const DeviceAttributes& device_attributes, diff --git a/tensorflow/core/common_runtime/composite_device_test.cc b/tensorflow/core/common_runtime/composite_device_test.cc index ac2f9108ecb..73a6ae44912 100644 --- a/tensorflow/core/common_runtime/composite_device_test.cc +++ b/tensorflow/core/common_runtime/composite_device_test.cc @@ -20,12 +20,15 @@ limitations under the License. 
namespace tensorflow { TEST(CompositeDeviceTest, Basic) { + const string host_name = "/job:localhost/replica:0/task:0/device:CPU:0"; + DeviceNameUtils::ParsedName parsed_host_name; + EXPECT_TRUE(DeviceNameUtils::ParseFullName(host_name, &parsed_host_name)); std::vector underlying_devices; { Status status; std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/0, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE(absl::StrContains(status.error_message(), @@ -41,7 +44,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:CPU:1"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/0, - &status); + parsed_host_name, &status); TF_ASSERT_OK(status); EXPECT_EQ(composite_device->device_type(), kCompositeDeviceType); EXPECT_EQ(underlying_devices, *composite_device->underlying_devices()); @@ -53,7 +56,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:CPU:0"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/1, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE( @@ -68,7 +71,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:GPU:0"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/1, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE(absl::StrContains(status.error_message(), diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index b8dfe92aac6..207c6a02d5b 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -935,8 +935,11 @@ Status EagerContext::FindOrCreateCompositeDevice( } Status s; - auto device = CompositeDevice::MakeDevice(underlying_devices, - composite_devices_.size(), &s); + // Create a CompositeDevice on the same task as the host CPU, in order to + // trigger packed TensorHandle copy from a client to a remote worker. 
+ auto device = + CompositeDevice::MakeDevice(underlying_devices, composite_devices_.size(), + HostCPU()->parsed_name(), &s); TF_RETURN_IF_ERROR(s); *composite_device = device.get(); pflr_->AddCompositeDevice(*composite_device); diff --git a/tensorflow/core/common_runtime/eager/context_test.cc b/tensorflow/core/common_runtime/eager/context_test.cc index f83e3f0b45d..c6ed61c80c4 100644 --- a/tensorflow/core/common_runtime/eager/context_test.cc +++ b/tensorflow/core/common_runtime/eager/context_test.cc @@ -31,7 +31,7 @@ static Device* CreateDevice(const string& type, int n) { Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; } }; DeviceAttributes attr; - attr.set_name("/job:a/replica:0/task:0/device:" + type + ":" + + attr.set_name("/job:localhost/replica:0/task:0/device:" + type + ":" + std::to_string(n)); attr.set_device_type(type); return new FakeDevice(attr); @@ -179,10 +179,10 @@ TEST_F(EagerContextTest, CompositeDevice) { TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, &composite_device_0)); EXPECT_EQ(composite_device_0->name(), - "/job:worker/replica:0/task:0/device:COMPOSITE:0"); + "/job:localhost/replica:0/task:0/device:COMPOSITE:0"); CompositeDevice* device = nullptr; TF_EXPECT_OK(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:0", &device)); + "/job:localhost/replica:0/task:0/device:COMPOSITE:0", &device)); EXPECT_EQ(device, composite_device_0); CompositeDevice* composite_device_1 = nullptr; TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, @@ -193,13 +193,13 @@ TEST_F(EagerContextTest, CompositeDevice) { TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, &composite_device_2)); EXPECT_EQ(composite_device_2->name(), - "/job:worker/replica:0/task:0/device:COMPOSITE:1"); + "/job:localhost/replica:0/task:0/device:COMPOSITE:1"); TF_EXPECT_OK(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:1", &device)); + "/job:localhost/replica:0/task:0/device:COMPOSITE:1", &device)); EXPECT_EQ(device, composite_device_2); EXPECT_TRUE(errors::IsNotFound(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:2", &device))); + "/job:localhost/replica:0/task:0/device:COMPOSITE:2", &device))); } } // namespace diff --git a/tensorflow/core/common_runtime/eager/execute_node_test.cc b/tensorflow/core/common_runtime/eager/execute_node_test.cc index 99f030322df..83fbcf5017e 100644 --- a/tensorflow/core/common_runtime/eager/execute_node_test.cc +++ b/tensorflow/core/common_runtime/eager/execute_node_test.cc @@ -61,7 +61,8 @@ TEST(ExecuteNodeTest, ExecuteNodeArgs) { Status s; std::unique_ptr composite_device = CompositeDevice::MakeDevice({device0->name(), device1->name()}, - /*unique_device_id=*/0, &s); + /*unique_device_id=*/0, + device_mgr.HostCPU()->parsed_name(), &s); TF_ASSERT_OK(s); auto ctx = new EagerContext( diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc index 779158375de..13b634bbec4 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc @@ -100,6 +100,7 @@ class PackedTensorHandleTest : public ::testing::Test { for (const char* name : device_names_) { devices.emplace_back(CreateDevice("GPU", name)); } + devices.emplace_back(CreateDevice("CPU", host_name_)); device_mgr_ = new StaticDeviceMgr(std::move(devices)); context_ = new 
EagerContext( @@ -132,6 +133,8 @@ class PackedTensorHandleTest : public ::testing::Test { "/job:worker/replica:0/task:1/device:GPU:0", "/job:worker/replica:0/task:1/device:GPU:1"}; + const char* host_name_ = "/job:worker/replica:0/task:0/device:CPU:0"; + StaticDeviceMgr* device_mgr_; EagerContext* context_; }; diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc index 247b94dc58c..5bdb4601d37 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc @@ -820,7 +820,8 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_CompositeDevice) { Status s; std::unique_ptr composite_device = CompositeDevice::MakeDevice({device0_->name(), device1_->name()}, - /*unique_device_id=*/0, &s); + /*unique_device_id=*/0, + device_mgr_->HostCPU()->parsed_name(), &s); TF_ASSERT_OK(s); AddCompositeDevice(composite_device.get()); diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index fb7c4055136..7a3dce7db4e 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -241,6 +241,11 @@ def implicit_val_and_grad(f): "function was being computed.") sources = [v.handle for v in variables] + for s in sources: + if getattr(s, "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors yet." + ) grad = imperative_grad.imperative_grad(this_tape, nest.flatten(end_node), sources) return end_node, list(zip(grad, variables)) @@ -548,6 +553,10 @@ def make_vjp(f, params=None, persistent=True): ] args = _ensure_unique_tensor_objects(parameter_positions, args) for i in parameter_positions: + if getattr(args[i], "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors" + "yet.") sources.append(args[i]) tape.watch(this_tape, args[i]) result = f(*args) @@ -1032,6 +1041,10 @@ class GradientTape(object): logging.WARN, "The dtype of the source tensor must be " "floating (e.g. tf.float32) when calling GradientTape.gradient, " "got %r", t.dtype) + if getattr(t, "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors yet." + ) if output_gradients is not None: output_gradients = [None if x is None else ops.convert_to_tensor(x) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 86b3d5cf95f..604a960afd5 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -1123,6 +1123,22 @@ class Context(object): pywrap_tfe.TFE_Py_RegisterCustomDevice(self._handle, device_capsule, device_name, device_info_capsule) + def pack_eager_tensors(self, tensors): + """Pack multiple `EagerTensor`s of the same dtype and shape. + + Args: + tensors: a list of EagerTensors to pack. + + Returns: + A packed EagerTensor. + """ + self.ensure_initialized() + if self._lazy_remote_inputs_copy is not None and ( + not self._lazy_remote_inputs_copy): + raise ValueError("Packing eager tensors is not supported when " + "lazy_remote_inputs_copy is disabled.") + return pywrap_tfe.TFE_Py_PackEagerTensors(self._handle, tensors) + def remove_function(self, name): """Remove a function from the context. 
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 4e68f1460d9..078ca8b8878 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -186,6 +186,43 @@ class FunctionTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegexp(AttributeError, 'no attribute'): add(c) + def testPackedVariable(self): + with ops.device('/cpu:0'): + v0_0 = resource_variable_ops.ResourceVariable(1.0) + with ops.device('/cpu:1'): + v0_1 = resource_variable_ops.ResourceVariable(2.0) + v1_0 = resource_variable_ops.ResourceVariable(3.0) + with ops.device('/cpu:2'): + v1_1 = resource_variable_ops.ResourceVariable(4.0) + + packed_var_0 = ops.pack_eager_tensors([v0_0.handle, v0_1.handle]) + packed_var_1 = ops.pack_eager_tensors([v1_0.handle, v1_1.handle]) + + # TODO(b/145922293): use ResourceVariable.assign_add and + # ResourceVariable.read_value directly once we support packing multiple + # ResourceVariable into one ResourceVariable. + @def_function.function + def read_var(): + resource_variable_ops.assign_add_variable_op( + packed_var_0, constant_op.constant(5.0)) + resource_variable_ops.assign_add_variable_op( + packed_var_1, constant_op.constant(6.0)) + with ops.device('/cpu:0'): + read0 = resource_variable_ops.read_variable_op( + packed_var_0, dtype=dtypes.float32) + with ops.device('/cpu:1'): + read1 = resource_variable_ops.read_variable_op( + packed_var_0, dtype=dtypes.float32) + read2 = resource_variable_ops.read_variable_op( + packed_var_1, dtype=dtypes.float32) + with ops.device('/cpu:2'): + read3 = resource_variable_ops.read_variable_op( + packed_var_1, dtype=dtypes.float32) + + return read0, read1, read2, read3 + + self.assertAllEqual(read_var(), (1 + 5, 2 + 5, 3 + 6, 4 + 6)) + def testImplementsAttributeBasic(self): v = def_function.function( experimental_implements='func')(lambda x, y: x + y) diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index a72f74b38b8..b209ddb6162 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -345,6 +345,8 @@ typedef struct EagerTensor { char unused[kMaxEagerTensorParentSize]; TFE_TensorHandle* handle; int64_t id; + // Indicates whether it's a packed tensor or not. + bool is_packed; // This mirrors tensorflow.core.framework.ops.Tensor._handle_data Which will // be None for tensors of type other than DT_RESOURCE. For DT_RESOURCE // tensors, this will contain a serialized HandleData proto with shape @@ -418,6 +420,7 @@ bool MaybeInvokeCreatedOnEagerTensorProfiler(EagerTensor* created_tensor) { int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { self->id = get_uid(); self->handle = nullptr; + self->is_packed = false; Py_INCREF(Py_None); self->handle_data = Py_None; Py_INCREF(Py_None); @@ -647,6 +650,11 @@ static PyObject* EagerTensor_backing_device(EagerTensor* self) { #endif } +// Getter `is_packed`. 
+static PyObject* EagerTensor_is_packed(EagerTensor* self) { + return PyBool_FromLong(self->is_packed); +} + static PyGetSetDef EagerTensor_getsetters[] = { {const_cast("_id"), (getter)EagerTensor_getid, nullptr, const_cast("Tensor ID."), nullptr}, @@ -655,6 +663,9 @@ static PyGetSetDef EagerTensor_getsetters[] = { {const_cast("backing_device"), (getter)EagerTensor_backing_device, nullptr, const_cast("Device on which tensor's memory is resident."), nullptr}, + {const_cast("is_packed"), (getter)EagerTensor_is_packed, nullptr, + const_cast("Whether the EagerTensor is a packed tensor or not."), + nullptr}, {const_cast("_handle_data"), (getter)EagerTensor_handle_data, (setter)EagerTensor_sethandle_data, const_cast("Shape/DType data if the EagerTensor is a DT_RESOURCE"), @@ -813,7 +824,8 @@ TFE_TensorHandle* EagerTensor_Handle(const PyObject* o) { return reinterpret_cast(o)->handle; } -PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) { +PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle, + const bool is_packed) { if (handle == nullptr) { return nullptr; } @@ -821,6 +833,7 @@ PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) { EagerTensorType->tp_new(EagerTensorType, EmptyTuple(), EmptyDict())); if (t != nullptr) { t->id = get_uid(); + t->is_packed = is_packed; Py_INCREF(Py_None); t->handle_data = Py_None; Py_INCREF(Py_None); diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h index 92a0a200e3d..a5c9c181539 100755 --- a/tensorflow/python/eager/pywrap_tfe.h +++ b/tensorflow/python/eager/pywrap_tfe.h @@ -129,7 +129,8 @@ void TFE_DeleteContextCapsule(PyObject* context); bool EagerTensor_CheckExact(const PyObject* o); // Helper function to construct a new EagerTensor from a TFE_TensorHandle. -PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle); +PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle, + const bool is_packed = false); // Extracts the handle inside EagerTensor object `o`. Returns nullptr on error. 
TFE_TensorHandle* EagerTensor_Handle(const PyObject* o); diff --git a/tensorflow/python/eager/remote_test.py b/tensorflow/python/eager/remote_test.py index 32fe6372f77..710e7bf5f9d 100644 --- a/tensorflow/python/eager/remote_test.py +++ b/tensorflow/python/eager/remote_test.py @@ -40,6 +40,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.training import server_lib from tensorflow.python.training.server_lib import ClusterSpec @@ -324,6 +325,36 @@ class MultiWorkersTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(remote_function(constant_op.constant([1.0])), [3.0]) + def testMultiDeviceFunctionWithPackedVariable(self): + with ops.device('/job:worker/replica:0/task:0/device:CPU:0'): + var0 = resource_variable_ops.ResourceVariable(1.0) + with ops.device('/job:worker/replica:0/task:1/device:CPU:0'): + var1 = resource_variable_ops.ResourceVariable(2.0) + + packed_var = ops.pack_eager_tensors([var0.handle, var1.handle]) + self.assertEqual(packed_var.device, + '/job:localhost/replica:0/task:0/device:COMPOSITE:0') + self.assertEqual(packed_var.backing_device, + '/job:localhost/replica:0/task:0/device:COMPOSITE:0') + + @def_function.function + def add_variables(): + with ops.device('/job:worker/replica:0/task:0/device:CPU:0'): + read0 = resource_variable_ops.read_variable_op( + packed_var, dtype=dtypes.float32) + with ops.device('/job:worker/replica:0/task:1/device:CPU:0'): + read1 = resource_variable_ops.read_variable_op( + packed_var, dtype=dtypes.float32) + + return read0 + read1 + + # Run the function on a remote device + with ops.device('/job:worker/replica:0/task:0'): + self.assertAllEqual(add_variables().numpy(), 3.0) + + # Run the function on a local worker + self.assertAllEqual(add_variables().numpy(), 3.0) + @test_util.eager_lazy_remote_copy_on_and_off def testMultiDeviceFunctionOnRemoteDeviceWithWait(self): with ops.device('/job:worker/replica:0/task:1'): diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 43652d51eae..5b6dac5be34 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1394,6 +1394,65 @@ def _error_prefix(name): return "" if name is None else "%s: " % name +def pack_eager_tensors(tensors, ctx=None): + """Pack multiple `EagerTensor`s of the same dtype and shape. + + Args: + tensors: a list of EagerTensors to pack. + ctx: context.context(). + + Returns: + A packed EagerTensor. 
+ """ + if not isinstance(tensors, list): + raise TypeError("tensors must be a list or a tuple: %s" % tensors) + + if not tensors: + raise ValueError("Empty tensors is unexpected for packing.") + + dtype = tensors[0].dtype + shape = tensors[0].shape + handle_data = tensors[0]._handle_data # pylint: disable=protected-access + is_resource = dtype == dtypes.resource + for i in range(len(tensors)): + t = tensors[i] + if not isinstance(t, EagerTensor): + raise TypeError("tensors must be a list of EagerTensors: %s" % t) + + if t.dtype != dtype: + raise ValueError( + "All tensors being packed should have the same dtype %s, " + "but the %d-th tensor is of dtype %s" % (dtype, i, t.dtype)) + if t.shape != shape: + raise ValueError( + "All tensors being packed should have the same shape %s, " + "but the %d-th tensor is of shape %s" % (shape, i, t.shape)) + # pylint: disable=protected-access + if is_resource and t._handle_data != handle_data: + raise ValueError( + "All tensors being packed should have the same handle data %s, " + "but the %d-th tensor is of handle data %s" % + (handle_data, i, t._handle_data)) + # pylint: enable=protected-access + + if ctx is None: + ctx = context.context() + + # Propogate handle data for resource variables + packed_tensor = ctx.pack_eager_tensors(tensors) + if handle_data is not None: + packed_tensor._handle_data = handle_data # pylint: disable=protected-access + + def grad_fun(_): + raise ValueError( + "Gradients through pack_eager_tensors are not supported yet.") + + tape.record_operation("pack_eager_tensors", [packed_tensor], tensors, + grad_fun) + + return packed_tensor + + def convert_to_tensor(value, dtype=None, name=None, diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 322df8ffac8..11193155999 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -34,6 +34,7 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import function as eager_function from tensorflow.python.eager import wrap_function +from tensorflow.python.framework import config from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op from tensorflow.python.framework import device as pydev @@ -3408,5 +3409,51 @@ class CustomConvertToCompositeTensorTest(test_util.TensorFlowTestCase): self.assertAllEqual(x_, tensor_util.constant_value(y_)) +@test_util.disable_tfrt("Packing EagerTensors is not supported yet.") +class PackEagerTensorTest(test_util.TensorFlowTestCase): + + def setUp(self): + super(PackEagerTensorTest, self).setUp() + context._reset_context() + cpus = config.list_physical_devices("CPU") + # Set 2 virtual CPUs + config.set_logical_device_configuration(cpus[0], [ + context.LogicalDeviceConfiguration(), + context.LogicalDeviceConfiguration(), + ]) + + def testPack(self): + with context.eager_mode(): + with ops.device("CPU:0"): + var0 = resource_variable_ops.ResourceVariable(1.0) + c0 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + with ops.device("CPU:1"): + var1 = resource_variable_ops.ResourceVariable(2.0) + var2 = resource_variable_ops.ResourceVariable([3.0]) + c1 = constant_op.constant([9.0]) + + packed_var0 = ops.pack_eager_tensors([var0.handle, var1.handle]) + self.assertTrue(packed_var0.is_packed) + self.assertEqual(packed_var0.dtype, var0.handle.dtype) + self.assertEqual(packed_var0.shape, var0.handle.shape) + self.assertEqual(packed_var0._handle_data, 
var0.handle._handle_data) + self.assertIn("COMPOSITE:0", packed_var0.device) + self.assertIn("COMPOSITE:0", packed_var0.backing_device) + with self.assertRaises(errors.InvalidArgumentError): + packed_var0.numpy() + + # Different dtypes + with self.assertRaises(ValueError): + ops.pack_eager_tensors([var0.handle, c1]) + + # Different shapes + with self.assertRaises(ValueError): + ops.pack_eager_tensors([c0, c1]) + + # Different handle data + with self.assertRaises(ValueError): + ops.pack_eager_tensors([var0.handle, var2.handle]) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc index 836cafbd494..efcd912f430 100644 --- a/tensorflow/python/tfe_wrapper.cc +++ b/tensorflow/python/tfe_wrapper.cc @@ -210,6 +210,22 @@ TFE_OutputTensorHandles InputTFE_OutputTensorHandles( return output_tensor_handles; } +// Packs multiple `EagerTensor`s of the same dtype and shape into one +// `EagerTensor`. +py::object TFE_Py_PackEagerTensors_wrapper(const py::handle& context, + const py::handle& tensors) { + TFE_Context* ctx = tensorflow::InputTFE_Context(context); + TFE_InputTensorHandles handles = InputTFE_InputTensorHandles(tensors); + tensorflow::Safe_TF_StatusPtr status = tensorflow::make_safe(TF_NewStatus()); + int size = handles.size(); + TFE_TensorHandle* packed_handle = + TFE_CreatePackedTensorHandle(ctx, handles.data(), &size, status.get()); + tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); + PyObject* packed_tensor = + EagerTensorFromHandle(packed_handle, /*is_packed=*/true); + return tensorflow::PyoOrThrow(packed_tensor); +} + // This function was created from fusing the typemap logic in platform/base.i. py::object TFE_Py_ExecuteCancelable_wrapper( const py::handle& context, const char* device_name, const char* op_name, @@ -558,6 +574,10 @@ PYBIND11_MODULE(_pywrap_tfe, m) { m.def("TFE_Py_InitEagerTensor", [](const py::handle& o) { return tensorflow::PyoOrThrow(TFE_Py_InitEagerTensor(o.ptr())); }); + m.def("TFE_Py_PackEagerTensors", + [](const py::handle& context, const py::handle& handles) { + return tensorflow::TFE_Py_PackEagerTensors_wrapper(context, handles); + }); m.def("TFE_Py_SetEagerTensorProfiler", &TFE_Py_SetEagerTensorProfiler); m.def("TFE_Py_RegisterJVPFunction", [](const py::handle& o) { return tensorflow::PyoOrThrow(TFE_Py_RegisterJVPFunction(o.ptr())); From 4001e3dad3c6340b0c2001d89b3954f189e9aeb5 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Mon, 18 May 2020 15:22:44 -0700 Subject: [PATCH 0783/1533] Updates GPU delegate documentation with experimental quant support PiperOrigin-RevId: 312165090 Change-Id: I8fb624f71101fce6a379ed24f6002f8f4b60245d --- tensorflow/lite/g3doc/performance/gpu.md | 2 +- .../lite/g3doc/performance/gpu_advanced.md | 189 ++++++++---------- .../g3doc/performance/model_optimization.md | 6 +- 3 files changed, 84 insertions(+), 113 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md index 8762afb4c83..b5abf46f845 100644 --- a/tensorflow/lite/g3doc/performance/gpu.md +++ b/tensorflow/lite/g3doc/performance/gpu.md @@ -31,7 +31,7 @@ models. For a step-by-step tutorial, watch the [GPU Delegate for Android](https://youtu.be/Xkhgre8r5G0) video. -Note: This requires OpenGL ES 3.1 or higher. +Note: This requires OpenCL or OpenGL ES (3.1 or higher). #### Step 1. 
Clone the TensorFlow source code and open it in Android Studio diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md index 9f47c2e55e8..dce3eb8db6b 100644 --- a/tensorflow/lite/g3doc/performance/gpu_advanced.md +++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md @@ -1,9 +1,9 @@ # TensorFlow Lite on GPU [TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/) supports several -hardware accelerators. This document describes how to use the GPU backend using -the TensorFlow Lite delegate APIs on Android (requires OpenGL ES 3.1 or higher) -and iOS (requires iOS 8 or later). +hardware accelerators. This document describes how to use the GPU backend using +the TensorFlow Lite delegate APIs on Android (requires OpenCL or OpenGL ES 3.1 +and higher) and iOS (requires iOS 8 or later). ## Benefits of GPU Acceleration @@ -35,25 +35,33 @@ power and generating less heat than the same task run on a CPU. TensorFlow Lite on GPU supports the following ops in 16-bit and 32-bit float precision: -* `ADD v1` -* `AVERAGE_POOL_2D v1` -* `CONCATENATION v1` -* `CONV_2D v1` -* `DEPTHWISE_CONV_2D v1-2` -* `FULLY_CONNECTED v1` -* `LOGISTIC v1` -* `MAX_POOL_2D v1` -* `MUL v1` -* `PAD v1` -* `PRELU v1` -* `RELU v1` -* `RELU6 v1` -* `RESHAPE v1` -* `RESIZE_BILINEAR v1` -* `SOFTMAX v1` -* `STRIDED_SLICE v1` -* `SUB v1` -* `TRANSPOSE_CONV v1` +* `ADD` +* `AVERAGE_POOL_2D` +* `CONCATENATION` +* `CONV_2D` +* `DEPTHWISE_CONV_2D v1-2` +* `EXP` +* `FULLY_CONNECTED` +* `LOGISTIC` +* `LSTM v2 (Basic LSTM only)` +* `MAX_POOL_2D` +* `MAXIMUM` +* `MINIMUM` +* `MUL` +* `PAD` +* `PRELU` +* `RELU` +* `RELU6` +* `RESHAPE` +* `RESIZE_BILINEAR v1-3` +* `SOFTMAX` +* `STRIDED_SLICE` +* `SUB` +* `TRANSPOSE_CONV` + +By default, all ops are only supported at version 1. Enabling the +[experimental quantization support](gpu_advanced.md#running-quantized-models-experimental-android-only) +allows the appropriate versions; for example, ADD v2. ## Basic Usage @@ -82,8 +90,8 @@ delegate.close(); ### Android (C/C++) For C/C++ usage of TensorFlow Lite GPU on Android, the GPU delegate can be -created with `TfLiteGpuDelegateCreate()` and destroyed with -`TfLiteGpuDelegateDelete()`. +created with `TfLiteGpuDelegateV2Create()` and destroyed with +`TfLiteGpuDelegateV2Delete()`. ```c++ // Set up interpreter. @@ -94,15 +102,7 @@ std::unique_ptr interpreter; InterpreterBuilder(*model, op_resolver)(&interpreter); // NEW: Prepare GPU delegate. -const TfLiteGpuDelegateOptions options = { - .metadata = NULL, - .compile_options = { - .precision_loss_allowed = 1, // FP16 - .preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST, - .dynamic_batch_enabled = 0, // Not fully functional yet - }, -}; -auto* delegate = TfLiteGpuDelegateCreate(&options); +auto* delegate = TfLiteGpuDelegateV2Create(/*default options=*/nullptr); if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; // Run inference. @@ -111,9 +111,13 @@ if (interpreter->Invoke() != kTfLiteOk) return false; ReadFromOutputTensor(interpreter->typed_output_tensor(0)); // NEW: Clean up. -TfLiteGpuDelegateDelete(delegate); +TfLiteGpuDelegateV2Delete(delegate); ``` +Take a look at `TfLiteGpuDelegateOptionsV2` to create a delegate instance with +custom options. You can initialize the default options with +`TfLiteGpuDelegateOptionsV2Default()` and then modify them as necessary. + TFLite GPU for Android C/C++ uses the [Bazel](https://bazel.io) build system. 
The delegate can be built, for example, using the following command: @@ -165,6 +169,43 @@ called. ## Advanced Usage +### Running quantized models (Experimental, Android only) + +The GPU delegate already supports +[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant) +models. There is experimental support on Android to run 8-bit quantized as well. +This includes all flavors of quantization, including: + +* Models trained with + [Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization) +* [Post-training dynamic-range quantization](https://www.tensorflow.org/lite/performance/post_training_quant) +* [Post-training full-integer quantization](https://www.tensorflow.org/lite/performance/post_training_integer_quant) + +To optimize performance, use models that have floating-point input & output +tensors. + +This feature can be enabled using delegate options as follows: + +**C++ API** + +```c++ +// NEW: Prepare custom options with feature enabled. +TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default(); +options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT; + +auto* delegate = TfLiteGpuDelegateV2Create(options); +if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; +``` + +**Java API** + +```java +// NEW: Prepare GPU delegate with feature turned on. +GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true)); + +Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate); +``` + ### Delegate Options for iOS `NewGpuDelegate()` accepts a `struct` of options. @@ -210,7 +251,7 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set the options, to avoid any unexpected behavior if default values are changed in the future. -### Input/Output Buffers +### Input/Output Buffers (iOS only) To do computation on the GPU, data must be made available to the GPU. This often requires performing a memory copy. It is desirable not to cross the CPU/GPU @@ -229,80 +270,10 @@ To achieve best performance, TensorFlow Lite makes it possible for users to directly read from and write to the TensorFlow hardware buffer and bypass avoidable memory copies. -#### Android - -Assuming the image input is in the GPU memory, it must first be converted to an -OpenGL Shader Storage Buffer Object (SSBO). You can associate a TfLiteTensor to -a user-prepared SSBO with `Interpreter.bindGlBufferToTensor()`. Note that -`Interpreter.bindGlBufferToTensor()` must be called before -`Interpreter.modifyGraphWithDelegate()`. - -```java -// Ensure a valid EGL rendering context. -EGLContext eglContext = eglGetCurrentContext(); -if (eglContext.equals(EGL_NO_CONTEXT)) return false; - -// Create an SSBO. -int[] id = new int[1]; -glGenBuffers(id.length, id, 0); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]); -glBufferData(GL_SHADER_STORAGE_BUFFER, inputSize, null, GL_STREAM_COPY); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind -int inputSsboId = id[0]; - -// Create interpreter. -Interpreter interpreter = new Interpreter(tfliteModel); -Tensor inputTensor = interpreter.getInputTensor(0); -GpuDelegate gpuDelegate = new GpuDelegate(); -// The buffer must be bound before the delegate is installed. -gpuDelegate.bindGlBufferToTensor(inputTensor, inputSsboId); -interpreter.modifyGraphWithDelegate(gpuDelegate); - -// Run inference; the null input argument indicates use of the bound buffer for input. 
-fillSsboWithCameraImageTexture(inputSsboId); -float[] outputArray = new float[outputSize]; -interpreter.runInference(null, outputArray); -``` - -A similar approach can be applied to the output tensor. In that case, -`Interpreter.Options.setAllowBufferHandleOutput(true)` should be passed on, to -disable the default copying of the network's output from GPU memory to CPU -memory. - -```java -// Ensure a valid EGL rendering context. -EGLContext eglContext = eglGetCurrentContext(); -if (eglContext.equals(EGL_NO_CONTEXT)) return false; - -// Create a SSBO. -int[] id = new int[1]; -glGenBuffers(id.length, id, 0); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]); -glBufferData(GL_SHADER_STORAGE_BUFFER, outputSize, null, GL_STREAM_COPY); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind -int outputSsboId = id[0]; - -// Create interpreter. -Interpreter.Options options = (new Interpreter.Options()).setAllowBufferHandleOutput(true); -Interpreter interpreter = new Interpreter(tfliteModel, options); -Tensor outputTensor = interpreter.getOutputTensor(0); -GpuDelegate gpuDelegate = new GpuDelegate(); -// The buffer must be bound before the delegate is installed. -gpuDelegate.bindGlBufferToTensor(outputTensor, outputSsboId); -interpreter.modifyGraphWithDelegate(gpuDelegate); - -// Run inference; the null output argument indicates use of the bound buffer for output. -ByteBuffer input = getCameraImageByteBuffer(); -interpreter.runInference(input, null); -renderOutputSsbo(outputSsboId); -``` - -#### iOS - Assuming the image input is in GPU memory, it must first be converted to a `MTLBuffer` object for Metal. You can associate a TfLiteTensor to a -user-prepared `MTLBuffer` with `BindMetalBufferToTensor()`. Note that -`BindMetalBufferToTensor()` must be called before +user-prepared `MTLBuffer` with `TFLGpuDelegateBindMetalBufferToTensor()`. Note +that `TFLGpuDelegateBindMetalBufferToTensor()` must be called before `Interpreter::ModifyGraphWithDelegate()`. Additionally, the inference output is, by default, copied from GPU memory to CPU memory. This behavior can be turned off by calling `Interpreter::SetAllowBufferHandleOutput(true)` during @@ -312,8 +283,8 @@ initialization. // Prepare GPU delegate. auto* delegate = NewGpuDelegate(nullptr); interpreter->SetAllowBufferHandleOutput(true); // disable default gpu->cpu copy -if (!BindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false; -if (!BindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false; +if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false; +if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false; if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; // Run inference. 
diff --git a/tensorflow/lite/g3doc/performance/model_optimization.md b/tensorflow/lite/g3doc/performance/model_optimization.md index feb6cfecea6..c66b06f9b59 100644 --- a/tensorflow/lite/g3doc/performance/model_optimization.md +++ b/tensorflow/lite/g3doc/performance/model_optimization.md @@ -89,9 +89,9 @@ The following types of quantization are available in TensorFlow Lite: Technique | Data requirements | Size reduction | Accuracy | Supported hardware ------------------------------------------------------------------------------------------------------- | -------------------------------- | -------------- | --------------------------- | ------------------ [Post-training float16 quantization](post_training_float16_quant.ipynb) | No data | Up to 50% | Insignificant accuracy loss | CPU, GPU -[Post-training dynamic range quantization](post_training_quant.ipynb) | No data | Up to 75% | Accuracy loss | CPU -[Post-training integer quantization](post_training_integer_quant.ipynb) | Unlabelled representative sample | Up to 75% | Smaller accuracy loss | CPU, EdgeTPU, Hexagon DSP -[Quantization-aware training](http://www.tensorflow.org/model_optimization/guide/quantization/training) | Labelled training data | Up to 75% | Smallest accuracy loss | CPU, EdgeTPU, Hexagon DSP +[Post-training dynamic range quantization](post_training_quant.ipynb) | No data | Up to 75% | Accuracy loss | CPU, GPU (Android) +[Post-training integer quantization](post_training_integer_quant.ipynb) | Unlabelled representative sample | Up to 75% | Smaller accuracy loss | CPU, GPU (Android), EdgeTPU, Hexagon DSP +[Quantization-aware training](http://www.tensorflow.org/model_optimization/guide/quantization/training) | Labelled training data | Up to 75% | Smallest accuracy loss | CPU, GPU (Android), EdgeTPU, Hexagon DSP Below are the latency and accuracy results for post-training quantization and quantization-aware training on a few models. All latency numbers are measured on From f5c5747f134b3dfd42b1d546f1842aa2e1e70670 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 18 May 2020 15:29:57 -0700 Subject: [PATCH 0784/1533] Re-enable signal kernel tests on py38 PiperOrigin-RevId: 312166420 Change-Id: Ie18cf2e29d8a05d57675ce3e75b06509205a4e61 --- tensorflow/python/kernel_tests/signal/BUILD | 1 - .../python/kernel_tests/signal/test_util.py | 4 +--- .../kernel_tests/signal/window_ops_test.py | 17 ++++++++--------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD index adb12a5e850..bd893184570 100644 --- a/tensorflow/python/kernel_tests/signal/BUILD +++ b/tensorflow/python/kernel_tests/signal/BUILD @@ -149,7 +149,6 @@ cuda_py_tests( python_version = "PY3", shard_count = 4, tags = [ - "no_oss_py38", #TODO(b/151631881) "no_windows_gpu", ], deps = [ diff --git a/tensorflow/python/kernel_tests/signal/test_util.py b/tensorflow/python/kernel_tests/signal/test_util.py index 1e95fe4b28f..e8d477a843b 100644 --- a/tensorflow/python/kernel_tests/signal/test_util.py +++ b/tensorflow/python/kernel_tests/signal/test_util.py @@ -50,7 +50,7 @@ def grappler_optimize(graph, fetches=None, config_proto=None): return tf_optimizer.OptimizeGraph(config_proto, metagraph) -def tflite_convert(fn, input_templates, use_mlir=False): +def tflite_convert(fn, input_templates): """Converts the provided fn to tf.lite model. 
Args: @@ -59,7 +59,6 @@ def tflite_convert(fn, input_templates, use_mlir=False): input_templates: A list of Tensors, ndarrays or TensorSpecs describing the inputs that fn expects. The actual values of the Tensors or ndarrays are unused. - use_mlir: Experimental. Whether to use the tf.lite MLIR converter. Returns: The serialized tf.lite model. @@ -67,7 +66,6 @@ def tflite_convert(fn, input_templates, use_mlir=False): fn = def_function.function(fn) concrete_func = fn.get_concrete_function(*input_templates) converter = lite.TFLiteConverterV2([concrete_func]) - converter.experimental_new_converter = use_mlir return converter.convert() diff --git a/tensorflow/python/kernel_tests/signal/window_ops_test.py b/tensorflow/python/kernel_tests/signal/window_ops_test.py index 9f5fe6f64c7..9432e70c7f2 100644 --- a/tensorflow/python/kernel_tests/signal/window_ops_test.py +++ b/tensorflow/python/kernel_tests/signal/window_ops_test.py @@ -156,15 +156,14 @@ class WindowOpsTest(test.TestCase, parameterized.TestCase): self.assertLen(rewritten_graph.node, 1) @parameterized.parameters( - # Due to control flow, only MLIR is supported. # Only float32 is supported. - (window_ops.hann_window, 10, False, dtypes.float32, True), - (window_ops.hann_window, 10, True, dtypes.float32, True), - (window_ops.hamming_window, 10, False, dtypes.float32, True), - (window_ops.hamming_window, 10, True, dtypes.float32, True), - (window_ops.vorbis_window, 12, None, dtypes.float32, True)) - def test_tflite_convert(self, window_fn, window_length, periodic, dtype, - use_mlir): + (window_ops.hann_window, 10, False, dtypes.float32), + (window_ops.hann_window, 10, True, dtypes.float32), + (window_ops.hamming_window, 10, False, dtypes.float32), + (window_ops.hamming_window, 10, True, dtypes.float32), + (window_ops.vorbis_window, 12, None, dtypes.float32)) + def test_tflite_convert(self, window_fn, window_length, periodic, dtype): + def fn(window_length): try: return window_fn(window_length, periodic=periodic, dtype=dtype) @@ -172,7 +171,7 @@ class WindowOpsTest(test.TestCase, parameterized.TestCase): return window_fn(window_length, dtype=dtype) tflite_model = test_util.tflite_convert( - fn, [tensor_spec.TensorSpec(shape=[], dtype=dtypes.int32)], use_mlir) + fn, [tensor_spec.TensorSpec(shape=[], dtype=dtypes.int32)]) window_length = np.array(window_length).astype(np.int32) actual_output, = test_util.evaluate_tflite_model( tflite_model, [window_length]) From 94108993a3adc322b67d35244c8488ead4034dee Mon Sep 17 00:00:00 2001 From: Michael Gester Date: Mon, 18 May 2020 15:35:17 -0700 Subject: [PATCH 0785/1533] Allow static result shape for unranked operand in shape verifier Previously, a static result shape for an unranked operand produced an error in shape verifier. This was too restrictive because shape inference is often incomplete at this point. 
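A plain-TensorFlow, graph-mode sketch (not MLIR) of how such IR can legitimately arise: the operand's rank is still unknown while the caller has already asserted a static shape on the result, so the verifier should not hard-fail before shape inference has had a chance to refine the operand.

```python
import tensorflow.compat.v1 as tf1

tf1.disable_eager_execution()

x = tf1.placeholder(tf1.float32, shape=None)  # unranked operand (tensor<*xf32>)
s = tf1.shape(x)                              # result length depends on x's rank
s.set_shape([2])                              # caller asserts a static result shape
print(s.shape)                                # (2,)
```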
PiperOrigin-RevId: 312167322 Change-Id: Ia198f07699174a4ea3c77099c9408def95e058be --- tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc | 9 ++++++--- tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 78623ca3c61..69b8f15320f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -2603,9 +2603,12 @@ LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, << variadic_idx_str << " to match rank of operand" << variadic_idx_str; } else if (result_ranked_type.hasStaticShape()) { - // The operand is an unranked tensor, verify that the result is dynamic. - return op->emitOpError("requires dynamic shape result") - << variadic_idx_str << " for unranked operand" << variadic_idx_str; + // The operand is an unranked tensor, print a warning if the result + // is static. + // Note: We do not handle this situation as an error, this would be too + // restrictive due to incompleteness of shape inference at this point. + op->emitWarning("has static shape result") + << variadic_idx_str << " for unranked operand" << variadic_idx_str; } Type element_type = result_ranked_type.getElementType(); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index ffa287e0e53..3560fec7b7d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1326,7 +1326,7 @@ func @testShapeMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor<2xi32> return %0 : tensor<2xi32> } @@ -1370,7 +1370,7 @@ func @testShapeNMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeNWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result #1 for unranked operand #1}} + // expected-warning @+1 {{has static shape result #1 for unranked operand #1}} %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<*xf32>, tensor<*xf32>) -> (tensor, tensor<2xi32>) return %0#1 : tensor<2xi32> } @@ -1428,7 +1428,7 @@ func @testVariableShapeMismatchDim(%arg0: tensor<*x!tf.resource>>) -> tensor<2xi32> { - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.VariableShape"(%arg0) {output = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>>) -> tensor<2xi32> return %0 : tensor<2xi32> } From 1acf6989bf72de324f61be20491a7c017a7da5c6 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 18 May 2020 15:51:05 -0700 Subject: [PATCH 0786/1533] Fix argument check tests to work in eager mode PiperOrigin-RevId: 312170271 Change-Id: Ie7ffb52cf63559255b5463d651eb72b924a3c3bf --- .../core/kernels/reverse_sequence_op.cc | 44 +++++----- .../kernel_tests/reverse_sequence_op_test.py | 83 +++++++++---------- tensorflow/python/ops/array_ops.py | 8 +- 3 files changed, 67 insertions(+), 68 deletions(-) diff --git 
a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index 0e112133915..b5b62bc76ca 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -43,9 +43,9 @@ typedef Eigen::GpuDevice GPUDevice; template void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); - auto seq_lens_t = seq_lens.vec(); + auto seq_lens_t = seq_lengths.vec(); std::vector seq_lens_vec(seq_lens_t.size()); @@ -56,15 +56,16 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); OP_REQUIRES(context, seq_dim < input.dims(), - errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + errors::InvalidArgument("seq_dim must be < input rank", " ( ", seq_dim, " vs. ", input.dims(), ")")); OP_REQUIRES(context, batch_dim < input.dims(), - errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + errors::InvalidArgument("batch_dim must be < input rank", " ( ", batch_dim, " vs. ", input.dims(), ")")); - OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), - errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, - "), ", "(", seq_lens.NumElements(), - " vs. ", input.dim_size(batch_dim), ")")); + OP_REQUIRES( + context, seq_lengths.NumElements() == input.dim_size(batch_dim), + errors::InvalidArgument("Length of seq_lengths != input.dims(", batch_dim, + "), ", "(", seq_lengths.NumElements(), " vs. ", + input.dim_size(batch_dim), ")")); for (size_t d = 0; d < seq_lens_vec.size(); ++d) { OP_REQUIRES(context, seq_lens_vec[d] >= 0, @@ -77,21 +78,22 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); OP_REQUIRES(context, seq_dim < input.dims(), - errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + errors::InvalidArgument("seq_dim must be < input rank", " ( ", seq_dim, " vs. ", input.dims(), ")")); OP_REQUIRES(context, batch_dim < input.dims(), - errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + errors::InvalidArgument("batch_dim must be < input rank", " ( ", batch_dim, " vs. ", input.dims(), ")")); - OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), - errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, - "), ", "(", seq_lens.NumElements(), - " vs. ", input.dim_size(batch_dim), ")")); + OP_REQUIRES( + context, seq_lengths.NumElements() == input.dim_size(batch_dim), + errors::InvalidArgument("Length of seq_lengths != input.dims(", batch_dim, + "), ", "(", seq_lengths.NumElements(), " vs. ", + input.dim_size(batch_dim), ")")); } template <> @@ -117,14 +119,14 @@ class ReverseSequenceOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); // Preliminary validation of sizes. 
- OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens.shape()), - errors::InvalidArgument("seq_lens input must be 1-dim, not ", - seq_lens.dims())); + OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lengths.shape()), + errors::InvalidArgument("seq_lengths must be 1-dim, not ", + seq_lengths.dims())); - auto seq_lens_t = seq_lens.vec(); + auto seq_lens_t = seq_lengths.vec(); CheckErrors(context, batch_dim_, seq_dim_); if (!context->status().ok()) return; @@ -186,7 +188,7 @@ namespace functor { void ReverseSequence::Compute( \ const GPUDevice& d, typename TTypes::ConstTensor input, \ int32 batch_dim, int32 seq_dim, \ - typename TTypes::ConstVec seq_lens, \ + typename TTypes::ConstVec seq_lengths, \ typename TTypes::Tensor output); \ extern template struct ReverseSequence; diff --git a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py index 05307c9834a..267decff38b 100644 --- a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py +++ b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py @@ -19,10 +19,11 @@ from __future__ import division from __future__ import print_function import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker @@ -135,56 +136,52 @@ class ReverseSequenceTest(test.TestCase): print("ReverseSequence gradient error = %g" % err) self.assertLess(err, 1e-8) - @test_util.run_deprecated_v1 def testShapeFunctionEdgeCases(self): - t = array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=None), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - batch_axis=0, - seq_axis=1) - self.assertIs(t.get_shape().ndims, None) + # Enter graph mode since we want to test partial shapes + with context.graph_mode(): + t = array_ops.reverse_sequence( + array_ops.placeholder(dtypes.float32, shape=None), + seq_lengths=array_ops.placeholder(dtypes.int64, shape=(32,)), + batch_axis=0, + seq_axis=1) + self.assertIs(t.get_shape().ndims, None) + def testInvalidArguments(self): # Batch size mismatched between input and seq_lengths. - with self.assertRaises(ValueError): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(33,)), - seq_axis=3) + # seq_length too long + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + (r"Dimensions must be equal|" + r"Length of seq_lengths != input.dims\(0\)")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2, 2], seq_axis=1) + + # seq_length too short + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + (r"Dimensions must be equal|" + r"Length of seq_lengths != input.dims\(0\)")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2], seq_axis=1) + + # Invalid seq_length shape + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + ("Shape must be rank 1 but is rank 2|" + "seq_lengths must be 1-dim")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [[2, 2]], seq_axis=1) # seq_axis out of bounds. 
- with self.assertRaisesRegexp(ValueError, "seq_dim must be < input rank"): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - seq_axis=3) + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + "seq_dim must be < input rank"): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], seq_axis=2) # batch_axis out of bounds. - with self.assertRaisesRegexp(ValueError, "batch_dim must be < input rank"): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - seq_axis=0, - batch_axis=3) + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + "batch_dim must be < input rank"): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], + seq_axis=1, + batch_axis=3) - with self.cached_session(): - inputs = array_ops.placeholder(dtypes.float32, shape=(32, 2, 3)) - seq_lengths = array_ops.placeholder(dtypes.int64, shape=(32,)) - output = array_ops.reverse_sequence( - inputs, seq_lengths=seq_lengths, - seq_axis=0) # batch_axis default is 0 - with self.assertRaisesOpError("batch_dim == seq_dim"): - output.eval(feed_dict={ - inputs: np.random.rand(32, 2, 3), - seq_lengths: xrange(32) - }) + with self.assertRaisesRegexp((errors.OpError, errors.InvalidArgumentError), + "batch_dim == seq_dim == 0"): + output = array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], seq_axis=0) + self.evaluate(output) if __name__ == "__main__": diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index a2640925a38..ce0755fc782 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -4473,8 +4473,8 @@ def reverse_sequence(input, dimension `seq_axis`. The elements of `seq_lengths` must obey `seq_lengths[i] <= - input.dims[seq_dim]`, and `seq_lengths` must be a vector of length - `input.dims[batch_dim]`. + input.dims[seq_axis]`, and `seq_lengths` must be a vector of length + `input.dims[batch_axis]`. The output slice `i` along dimension `batch_axis` is then given by input slice `i`, with the first `seq_lengths[i]` slices along @@ -4496,8 +4496,8 @@ def reverse_sequence(input, Args: input: A `Tensor`. The input to reverse. seq_lengths: A `Tensor`. Must be one of the following types: `int32`, - `int64`. 1-D with length `input.dims(batch_dim)` and `max(seq_lengths) <= - input.dims(seq_dim)` + `int64`. 1-D with length `input.dims(batch_axis)` and `max(seq_lengths) <= + input.dims(seq_axis)` seq_axis: An `int`. The dimension which is partially reversed. batch_axis: An optional `int`. Defaults to `0`. The dimension along which reversal is performed. From ad6e816328507f80c30d25d73b0c03219d339dd6 Mon Sep 17 00:00:00 2001 From: Hanhan Wang Date: Mon, 18 May 2020 16:06:46 -0700 Subject: [PATCH 0787/1533] Add lowering from xla_hlo/lhlo reverse op to Linalg. This is only supported for static shape. 
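For intuition, the lowering only rewrites the operand indexing map of the
generated linalg.generic: each reversed dimension of static size n reads
index (n - 1) - d instead of d, while the other dimensions keep the identity
map. A rough Python sketch of that index arithmetic (illustrative only, not
the MLIR lowering itself):

  # Mimics what the lowered linalg.generic reads for a reverse along `dims`
  # of a tensor with static shape `shape`.
  def reversed_index(index, shape, dims):
      return tuple((shape[i] - 1 - d) if i in dims else d
                   for i, d in enumerate(index))

  # For shape (2, 3) reversed along dim 1, output (0, 0) reads input (0, 2),
  # matching the affine map (d0, d1) -> (d0, -d1 + 2) in the tests below.
  assert reversed_index((0, 0), (2, 3), {1}) == (0, 2)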
PiperOrigin-RevId: 312173157 Change-Id: Iab149f02153597ef5a967628397fcac9a4db1329 --- .../xla/tests/hlo-legalize-to-linalg.mlir | 13 ++++++++ .../xla/tests/lhlo-legalize-to-linalg.mlir | 13 ++++++++ .../xla/transforms/xla_legalize_to_linalg.cc | 30 +++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir index a856ee5e83c..a27bf2cff79 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir @@ -542,3 +542,16 @@ func @convert_f32_to_i32(%input: tensor<2x2xf32>) -> tensor<2x2xi32> { // CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32): // CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 // CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%input: tensor<2x3xf32>) -> tensor<2x3xf32> { + %result = "xla_hlo.reverse"(%input) { + dimensions = dense<1> : tensor<1xi64> + } : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %result : tensor<2x3xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index bb8010b520c..626e905695c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -636,3 +636,16 @@ func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) { return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%arg0: memref<2x3xf32>, %arg1: memref<2x3xf32>) { + "xla_lhlo.reverse"(%arg0, %arg1) { + dimensions = dense<1> : tensor<1xi64> + } : (memref<2x3xf32>, memref<2x3xf32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index 799a20aa693..2b496677d62 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -573,6 +573,34 @@ class ConstConverter : public OpConversionPattern { } }; +// TODO(b/156787842): Support the lowering for dynamic shapes. 
+template +class ReverseConverter + : public DataMovementOpConverter, OpTy, + isLHLO> { + public: + using DataMovementOpConverter, OpTy, + isLHLO>::DataMovementOpConverter; + static ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) { + auto resultType = + getXLAOpResultType(op).template cast(); + auto nloops = resultType.getRank(); + SmallVector inputExprs; + inputExprs.reserve(nloops); + for (int i = 0; i < nloops; ++i) + inputExprs.push_back(b->getAffineDimExpr(i)); + for (auto dim : op.dimensions()) { + int i = dim.getZExtValue(); + if (resultType.isDynamicDim(i)) return {}; + int n = resultType.getShape()[i]; + inputExprs[i] = b->getAffineConstantExpr(n - 1) - inputExprs[i]; + } + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()), + b->getMultiDimIdentityMap(nloops)}); + } +}; + class SliceConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -642,6 +670,7 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, + ReverseConverter, ScalarPointwiseToStandardConverter, SliceConverter >(context); @@ -742,6 +771,7 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, ReshapeOpConverter, + ReverseConverter, TransposeConverter>(context); } From ad6798a2f62ae2cb7f433af7b721bf14b9850dde Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Mon, 18 May 2020 17:01:57 -0700 Subject: [PATCH 0788/1533] [XLA] Fix alternate memory allocation of conditional operands. Consider the following flattened HLO schedule of a conditional: 1: a = fusion() true_computation: 2: parameter = parameter(0) 3: ... 4: ... false_computation: 5: parameter = parameter(0) 6: ... 7: ... 8: conditional = conditional(pred, a, a) 9: b = fusion(a) When we had a tensor that was a conditional operand (e.g. "a" in the example), we reserved the alternate memory for the entire 1-8 range. This meant that when we tried to allocate inside the called computations of the conditional, the offset we picked wasn't available since it would fall within the 1-8 range. This CL now reserves the conditional until the parameter of the earliest called computations (1-2 range). To allow efficient use of alternate memory by avoiding a very large conditional from claiming the offset for the entire called computation, the conditional operand might die within the called computation, allowing other HLOs inside the called computations to reclaim that alternate memory offset. This creates a subtlety for subsequent uses of conditional operands (e.g. "a" is used by a fusion at 9). These subsequent uses will force evictions (and then do another prefetch). After optimization, the graph might look like the following: a (Alternate Mem) = fusion() cs0 = copy-start(a) # Must evict a because the allocation may die within # called computation. cd0 (Default Mem) = copy-done(cs0) true_computation: parameter (Alternate Mem) = parameter(0) ... # parameter's alternate memory allocation may die here and another tensor # might use the same offset. false_computation: parameter (Alternate Mem) = parameter(0) ... # parameter's alternate memory allocation may die here and another tensor # might use the same offset. conditional = conditional(pred, a, a) cs1 = copy-start(cd0) # May prefetch the value back to alternate memory. 
cd1 (Alternate Mem) = copy-done(cs1) b = fusion(cd1) PiperOrigin-RevId: 312182824 Change-Id: I3ff5d019025ef96ced1aed4f6d170df677273348 --- .../xla/service/memory_space_assignment.cc | 296 ++++++++++++---- .../xla/service/memory_space_assignment.h | 18 +- .../service/memory_space_assignment_test.cc | 321 +++++++++++++++++- 3 files changed, 563 insertions(+), 72 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 431e6af2dc0..81a8a102402 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -502,7 +502,8 @@ bool AlternateMemoryBestFitHeap::IsIntervalAllowedInAlternateMemory( } bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( - const HloUse& use) const { + const AllocationValue& value, const HloUse& use) const { + const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); if (use.instruction->opcode() == HloOpcode::kWhile) { HloComputation* while_body = use.instruction->while_body(); @@ -512,7 +513,6 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( HloValue* parameter_value = &alias_analysis_.dataflow_analysis().GetUniqueValueAt( while_body->parameter_instruction(0), use.operand_index); - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); int64 parameter_time = instruction_schedule.at(while_body->parameter_instruction(0)); int64 root_time = instruction_schedule.at(while_body->root_instruction()); @@ -567,7 +567,54 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( "there is a required default memory assignment."; return false; } + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // For any use of this conditional (the same value might be passed into + // multiple called computations), determine if the parameter->first use + // dependency is short. 
+ int64 conditional_time = instruction_schedule.at(use.instruction); + for (const HloUse& other_use : value.uses()) { + if (other_use.instruction != use.instruction) { + continue; + } + HloComputation* called_computation = + use.instruction->called_computations().at(other_use.operand_number - + 1); + const HloInstruction* parameter_instruction = + called_computation->parameter_instruction(0); + HloValue* parameter_value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt( + parameter_instruction, other_use.operand_index); + int64 parameter_time = instruction_schedule.at(parameter_instruction); + int64 min_use_time = conditional_time; + for (const HloUse& parameter_use : parameter_value->uses()) { + if (parameter_use.instruction->parent() == called_computation && + parameter_use.instruction->opcode() != + HloOpcode::kGetTupleElement && + parameter_use.instruction->opcode() != HloOpcode::kTuple && + parameter_use.instruction->opcode() != HloOpcode::kBitcast) { + min_use_time = std::min( + min_use_time, instruction_schedule.at(parameter_use.instruction)); + } + } + if (options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( + parameter_value->shape(), parameter_time, min_use_time)) { + VLOG(4) << "Conditional allocation allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + return true; + } else { + VLOG(4) << "Conditional allocation not allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + } + } + return false; } + return true; } @@ -769,20 +816,12 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { if (position.instruction->opcode() == HloOpcode::kConditional) { VLOG(3) << "Adding required assignment for condition output: " << value->ToShortString(); - required_assignments_[value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at(position.instruction), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(position.instruction, position.index, + MemorySpace::kDefault); for (const HloComputation* called_computation : position.instruction->called_computations()) { - HloValue* root_value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt( - called_computation->root_instruction(), position.index); - required_assignments_[root_value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at( - called_computation->root_instruction()), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(called_computation->root_instruction(), + position.index, MemorySpace::kDefault); } } } @@ -808,9 +847,13 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } // Iterate over the uses. - for (HloUse use : allocation_value.uses()) { + for (int use_idx = 0; use_idx < allocation_value.uses().size(); + ++use_idx) { + const HloUse& use = allocation_value.uses().at(use_idx); int64 use_time = instruction_schedule.at(use.instruction); int64 latest_prefetch_time = use_time; + bool allow_no_copy_alternate_mem_allocation = true; + absl::optional earliest_prefetch_time = absl::nullopt; // Sequential calls include kWhile, kCall, and kConditional opcodes. bool is_sequential_call = @@ -857,14 +900,41 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // when we look at uses within the while loop body. 
use_time = instruction_schedule.at(while_body->parameter_instruction(0)); + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // Replace the use time with the earliest parameter of called + // computations. + for (const HloComputation* called_computation : + use.instruction->called_computations()) { + use_time = std::min( + use_time, instruction_schedule.at( + called_computation->parameter_instruction(0))); + } } } // Add a required assignment in default memory if the use not allowed in // alternate memory. - if (!IsUseAllowedInAlternateMemory(use)) { - required_assignments_[allocation_value.value()].push_back( - {MemorySpace::kDefault, use_time, /*chunk=*/absl::nullopt}); + if (!IsUseAllowedInAlternateMemory(allocation_value, use)) { + AddRequiredAssignment(allocation_value.value(), use.instruction, + MemorySpace::kDefault, use_time); + } else if (use_idx > 0) { + // We allow buffers in alternate memory that are passed into + // conditionals to give up their alternate memory allocation inside + // the called computation. This means that if a conditional operator + // has an alternate memory allocation, subsequent uses cannot use the + // same alternate memory allocation in order not to clobber data. So + // we force default memory allocation for these subsequent uses. + const HloUse& previous_use = allocation_value.uses().at(use_idx - 1); + if (previous_use.instruction->opcode() == HloOpcode::kConditional && + previous_use.instruction != use.instruction) { + allow_no_copy_alternate_mem_allocation = false; + earliest_prefetch_time = + instruction_schedule.at(previous_use.instruction); + VLOG(3) << "Previous use (" << previous_use.ToString() + << ") of use (" << use.ToString() + << ") is a conditional, so this use will need to evict. " + << "Earliest prefetch time = " << *earliest_prefetch_time; + } } // Bitcasts don't define buffers and don't directly consume buffers. @@ -872,10 +942,16 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // bitcasts will be handled specially. if (use.instruction->opcode() != HloOpcode::kBitcast) { AllocationRequest request; - request.start_time = definition_time; + // Rarely, (e.g., when conditional true and false parameters are the + // same), definition time can be the time of the conditional and use + // time is the parameter use, which is less. 
+ request.start_time = std::min(definition_time, use_time); request.end_time = use_time; request.latest_prefetch_time = latest_prefetch_time; request.size = interval.size; + request.allow_no_copy_alternate_mem_allocation = + allow_no_copy_alternate_mem_allocation; + request.earliest_prefetch_time = earliest_prefetch_time; request.preferred_offset = preferred_offset; request.use = use; request.allocation_value = &allocation_value; @@ -1061,35 +1137,42 @@ void AlternateMemoryBestFitHeap::AddAliasedRequiredAssignment( if (aliased_allocation->memory_space() == MemorySpace::kAlternate) { chunk = aliased_allocation->chunk(); } - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); - HloValue* value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); - int64 instruction_time = instruction_schedule.at(instruction); + AddRequiredAssignment(instruction, index, aliased_allocation->memory_space(), + chunk); +} + +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloValue* value, const HloInstruction* instruction, + MemorySpaceAssignment::MemorySpace memory_space, int64 time, + absl::optional chunk) { // Check for existing required assignment at this time and make sure it is the // same as this if there is one. - auto existing_required_assignment = - RequiredMemoryAssignmentAt(value, instruction_time); + auto existing_required_assignment = RequiredMemoryAssignmentAt(value, time); if (existing_required_assignment) { - CHECK(aliased_allocation->memory_space() == - existing_required_assignment->memory_space); + CHECK(memory_space == existing_required_assignment->memory_space) + << "inst = " << instruction->ToString() << " at " << time; CHECK((!chunk && !existing_required_assignment->chunk) || chunk->offset == existing_required_assignment->chunk->offset); - VLOG(3) << "Not adding aliased required assignment because there is one " - "already: " - << value->ToShortString() << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? "def" - : "alt"); - return; + VLOG(3) << "Not adding required assignment because there is one already: " + << value->ToShortString() << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + } else { + VLOG(3) << "Adding required assignment: " << value->ToShortString() + << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + required_assignments_[value].push_back({memory_space, time, chunk}); } +} - required_assignments_[value].push_back( - {aliased_allocation->memory_space(), instruction_time, chunk}); - VLOG(3) << "Adding aliased required assignment: " << value->ToShortString() - << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? "def" - : "alt"); +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloInstruction* instruction, ShapeIndex index, + MemorySpace memory_space, absl::optional chunk) { + const HloValue* value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); + int64 instruction_time = + hlo_live_range_.instruction_schedule().at(instruction); + AddRequiredAssignment(value, instruction, memory_space, instruction_time, + chunk); } void AlternateMemoryBestFitHeap::AddInputAndOutputRequiredAssignments() { @@ -1289,6 +1372,7 @@ bool AlternateMemoryBestFitHeap::FindAllocation( // First try keeping the allocation entirely in the alternate memory. 
if (required_memory_space_at_start != MemorySpace::kDefault && required_memory_space_at_end != MemorySpace::kDefault && + request.allow_no_copy_alternate_mem_allocation && AllocateInAlternateMemoryNoCopy(request)) { return true; } @@ -1618,9 +1702,14 @@ bool AlternateMemoryBestFitHeap::Prefetch( // ^ ^ // Copy Copy // Start Done - options_.prefetch_interval_picker->Begin( - request.use, prev_allocation_in_default_mem.earliest_available_time(), - request.latest_prefetch_time); + int64 earliest_prefetch_time = + prev_allocation_in_default_mem.earliest_available_time(); + if (request.earliest_prefetch_time) { + earliest_prefetch_time = + std::max(earliest_prefetch_time, *request.earliest_prefetch_time); + } + options_.prefetch_interval_picker->Begin(request.use, earliest_prefetch_time, + request.latest_prefetch_time); VLOG(3) << "Trying prefetch picker = " << options_.prefetch_interval_picker->ToDebugString(); @@ -2435,6 +2524,34 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { std::tuple> events; + auto add_allocation_and_verify = [&](int64 start_time, int64 end_time, + const Chunk& chunk, + const HloValue* value) { + events[std::make_tuple(start_time, /*is_free=*/false, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); + events[std::make_tuple(end_time, /*is_free=*/true, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); + + // Get the chunks overlapping in time and search if they overlap in space + // as well. + // TODO(berkin): For now checking against end_time - 1 (exclusive), but we + // really should check against end_time (inclusive) for cases where the + // operand can't share buffer with user (see + // HloDataflowAnalysis::CanShareOperandBufferWithUser). + for (const Chunk& overlapping_chunk : + interval_tree.ChunksOverlappingInTime(start_time, end_time - 1)) { + if (chunk.OverlapsWith(overlapping_chunk)) { + return InternalError( + ("Value %s (%d, %d) off: %d size: %d overlaps with another chunk" + " off: %d size: %d"), + value->ToShortString(), start_time, end_time, chunk.offset, + chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + } + } + interval_tree.Add(start_time, end_time - 1, chunk); + return Status::OK(); + }; + // Go through all instructions in the module to ensure CopyStart/CopyDone // instructions copy between alternate memory and default memory. for (const HloComputation* computation : @@ -2470,34 +2587,73 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { for (const HloValue* value : buffer.values()) { const HloLiveRange::TimeBound& time_bound = hlo_live_range->buffer_live_ranges().at(value); - events[std::make_tuple(time_bound.start, /*is_free=*/false, - value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); - events[std::make_tuple(time_bound.end, /*is_free=*/true, value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); - - VLOG(3) << " buffer: " << buffer.ToString() - << " value: " << value->ToShortString() << ": (" - << time_bound.start << ", " << time_bound.end - << ") off: " << chunk.offset << ", size: " << chunk.size; - // Get the chunks overlapping in time and search if they overlap in space - // as well. - // TODO(berkin): For now checking against end_time - 1 (exclusive), but we - // really should check against end_time (inclusive) for cases where the - // operand can't share buffer with user (see - // HloDataflowAnalysis::CanShareOperandBufferWithUser). 
- for (const Chunk& overlapping_chunk : - interval_tree.ChunksOverlappingInTime(time_bound.start, - time_bound.end - 1)) { - if (chunk.OverlapsWith(overlapping_chunk)) { - return InternalError( - ("Buffer %s (%d, %d) off: %d size: %d overlaps with another chunk" - " off: %d size: %d"), - buffer.ToString(), time_bound.start, time_bound.end, chunk.offset, - chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + const HloInstruction* last_use_instruction = nullptr; + int64 last_use_time = time_bound.start; + for (const HloUse& use : value->uses()) { + int64 use_time = + hlo_live_range->instruction_schedule().at(use.instruction); + if (use_time > last_use_time) { + last_use_time = use_time; + last_use_instruction = use.instruction; } } - interval_tree.Add(time_bound.start, time_bound.end - 1, chunk); + + if (last_use_instruction && + last_use_instruction->opcode() == HloOpcode::kConditional) { + // Special case when verifying conditional: we internally split the use + // of alternate memory in conditionals, so fish them out from the + // conditionals. + VLOG(3) << " Splitting conditional buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + int64 earliest_computation_start_time = time_bound.end; + for (const HloComputation* called_computation : + last_use_instruction->called_computations()) { + earliest_computation_start_time = + std::min(earliest_computation_start_time, + hlo_live_range->computation_span_times() + .at(called_computation) + .start); + int64 parameter_time = -1; + int64 last_use_time = -1; + for (const HloPosition& position : value->positions()) { + if (position.instruction->opcode() == HloOpcode::kParameter && + position.instruction->parent() == called_computation) { + parameter_time = hlo_live_range->instruction_schedule().at( + position.instruction); + break; + } + } + for (const HloUse& use : value->uses()) { + if (use.instruction->parent() == called_computation) { + last_use_time = std::max( + last_use_time, + hlo_live_range->instruction_schedule().at(use.instruction)); + } + } + if (last_use_time != -1) { + CHECK_NE(parameter_time, -1); + VLOG(3) << " computation: " << called_computation->name() << ": (" + << parameter_time << ", " << last_use_time << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + parameter_time, last_use_time, chunk, value)); + } + } + VLOG(3) << " from beginning until first computation: (" + << time_bound.start << ", " + << (earliest_computation_start_time - 1) << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, earliest_computation_start_time - 1, chunk, + value)); + } else { + VLOG(3) << " buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, time_bound.end, chunk, value)); + } } } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index 727b8da6c08..340446d21dd 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -816,11 +816,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // use_times is a sorted sequence of the times of all uses. 
// latest_prefetch_time is the latest time we can schedule the CopyDone for a // prefetch. + // If allow_no_copy_alternate_mem_allocation is false, an eviction is forced. + // If earliest_prefetch_time is set, prefetches cannot start before this + // value. struct AllocationRequest { int64 start_time; int64 end_time; int64 latest_prefetch_time; int64 size; + bool allow_no_copy_alternate_mem_allocation; + absl::optional earliest_prefetch_time; absl::optional preferred_offset; HloUse use; MemorySpaceAssignment::AllocationValue* allocation_value; @@ -841,7 +846,8 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { bool IsIntervalAllowedInAlternateMemory(const BufferInterval& interval) const; // Returns true if the use is allowed in the alternate memory. - bool IsUseAllowedInAlternateMemory(const HloUse& use) const; + bool IsUseAllowedInAlternateMemory(const AllocationValue& value, + const HloUse& use) const; // Given an HloValue, creates AllocationValue objects and corresponding // AllocationSequences and appends them into allocation_sequence_list_. @@ -895,6 +901,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const HloInstruction* instruction, ShapeIndex index, const MemorySpaceAssignment::Allocation* aliased_allocation); + // This sets a required assignment. CHECK fails if there is a conflicting + // required assignment at the same time. + void AddRequiredAssignment(const HloValue* value, + const HloInstruction* instruction, + MemorySpace memory_space, int64 time, + absl::optional chunk = absl::nullopt); + void AddRequiredAssignment(const HloInstruction* instruction, + ShapeIndex index, MemorySpace memory_space, + absl::optional chunk = absl::nullopt); + // Adds input and outputs as required assignments. void AddInputAndOutputRequiredAssignments(); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index 984f2e7b4ea..a9be3850d89 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -1663,6 +1663,324 @@ TEST_P(MemorySpaceAssignmentTest, ControlPredecessorsBug) { AssignMemorySpace(module.get()); } +TEST_P(MemorySpaceAssignmentTest, ConditionalShouldBeAllocatedInAlternateMem) { + // Checks if simple conditionals get alternate memory allocations. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy and gtes got alternate memory allocations. 
+ auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("neg1"); + auto neg1_operand = neg1->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto neg2 = module->GetComputationWithName("false_computation") + ->GetInstructionWithName("neg2"); + auto neg2_operand = neg2->operand(0); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalAvoidsUnnecessaryPrefetch) { + // Checks if we avoid unnecessary allocation in alternate memory if the input + // won't be used in the computation for a long time. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + neg0 = f32[3]{0} negate(gte0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + neg9 = f32[3]{0} negate(neg8) + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + ROOT add = f32[3]{0} add(neg9, gte1) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + ROOT conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy1 doesn't get unnecessarily allocated in alternate mem + // (due to long negate chain in true_computation) but is prefetched before + // add. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kDefaultMemorySpace); + auto add = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add"); + auto add_operand = add->operand(1); + EXPECT_EQ(add_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUse) { + // Make sure there is an evict when there is a conditional use followed by + // another use. 
+ absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + add0 = f32[3]{0} add(gte0, gte1) + neg0 = f32[3]{0} negate(add0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + ROOT neg9 = f32[3]{0} negate(neg8) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + ROOT add1 = f32[3]{0} add(copy1, conditional) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure the copy1->add edge is in alternate memory. Before conditional, + // this should be evicted to default memory and neg uses the input from + // default memory. + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kAlternateMemorySpace); + auto add0 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add0"); + auto add0_operand = add0->operand(1); + EXPECT_EQ(add0_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto add1 = + module->GetComputationWithName("entry")->GetInstructionWithName("add1"); + auto add1_operand = add1->operand(0); + EXPECT_EQ(add1_operand->shape().layout().memory_space(), + kDefaultMemorySpace); + EXPECT_EQ(add1_operand->opcode(), HloOpcode::kCopyDone); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUseInWhile) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + while_cond { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + ROOT gte = pred[] get-tuple-element(p0), index=2 + } + + while_body { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + gte2 = pred[] get-tuple-element(p0), index=2 + cond_tuple = (f32[3]{0}) tuple(gte0) + conditional = f32[3]{0} conditional(gte2, cond_tuple, cond_tuple), true_computation=true_computation, false_computation=false_computation + add = f32[3]{0} add(conditional, gte1) + neg0 = f32[3]{0} negate(add) + neg1 = f32[3]{0} negate(neg0) + ROOT tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(gte0, neg1, gte2) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(copy0, copy1, p1) + while = (f32[3]{0}, f32[3]{0}, pred[]) while(tuple), 
condition=while_cond, body=while_body + ROOT gte = f32[3]{0} get-tuple-element(while), index=1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure copy1/while{0}/cond_tuple{0} gets alternate memory allocation. + // This will force an eviction and a prefetch for while body root. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto conditional = module->GetComputationWithName("while_body") + ->GetInstructionWithName("conditional"); + auto conditional_operand = conditional->operand(1); + EXPECT_EQ(ShapeUtil::GetSubshape(conditional_operand->shape(), {0}) + .layout() + .memory_space(), + kAlternateMemorySpace); + auto while_root = + module->GetComputationWithName("while_body")->root_instruction(); + auto while_root_operand = while_root->operand(0); + EXPECT_THAT( + while_root_operand, + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::AsyncCopy(kDefaultMemorySpace, kAlternateMemorySpace, + op::GetTupleElement(op::Parameter(0))))); + } +} + +TEST_P(MemorySpaceAssignmentTest, NestedConditional) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + true_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + slice = f32[1]{0} slice(gte), slice={[0:1]} + bitcast = f32[] bitcast(slice) + constant = f32[] constant(0.0) + compare = pred[] compare(bitcast, constant), direction=GT + ROOT conditional = f32[3]{0} conditional(compare, p0, p0), true_computation=true_computation2, false_computation=false_computation2 + } + + false_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg3 = f32[3]{0} negate(gte) + } + + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation1, false_computation=false_computation1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure alternate memory allocation gets propagated into both levels of + // conditional. 
+ auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1_operand = module->GetComputationWithName("true_computation2") + ->GetInstructionWithName("neg1") + ->operand(0); + auto neg2_operand = module->GetComputationWithName("false_computation2") + ->GetInstructionWithName("neg2") + ->operand(0); + auto neg3_operand = module->GetComputationWithName("false_computation1") + ->GetInstructionWithName("neg3") + ->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg3_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + TEST_P(MemorySpaceAssignmentTest, RequestIdentifierShouldNotBeAllocatedInAlternateMem) { // Ensure that request identifier returned by Send/Recv HLOs are not allocated @@ -2149,7 +2467,8 @@ TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule3) { AssignMemorySpace(module.get(), -1, 5); } -TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule4) { +// TODO(berkin): This might be an incorrect input graph, investigate. +TEST_P(MemorySpaceAssignmentTest, DISABLED_NonEntryComputationSchedule4) { auto module = CreateNewVerifiedModule(); Shape shape = ShapeUtil::MakeShape(xla::F32, {2, 3}); Shape shape2 = ShapeUtil::MakeShape(xla::F32, {3, 3}); From acaaab2504a94711a4c1084328c79c10b7c9a594 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 17:09:11 -0700 Subject: [PATCH 0789/1533] Rename TransformTensorV2 op to TransformTensorBilinearV2 op. PiperOrigin-RevId: 312184091 Change-Id: I5450142e1022f72705bc5fbdf6c99c94cdbb346b --- tensorflow/lite/delegates/gpu/common/model_builder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 46856a70a7c..964c8289f83 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2350,7 +2350,7 @@ class TransformTensorOperationParser : public TFLiteOperationParser { private: }; -class TransformTensorV2OperationParser : public TFLiteOperationParser { +class TransformTensorBilinearV2OperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, @@ -2368,7 +2368,7 @@ class TransformTensorV2OperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddInput(node, 1)); // bbox RETURN_IF_ERROR(reader->AddOutputs(node)); - std::string op_name = "transform_tensor_v2"; + std::string op_name = "transform_tensor_bilinear_v2"; node->operation.type = op_name; BHWC output_shape; RETURN_IF_ERROR( @@ -2731,8 +2731,8 @@ std::unique_ptr NewOperationParser( if (custom_name == "TransformTensor") { return std::make_unique(); } - if (custom_name == "TransformTensorV2") { - return std::make_unique(); + if (custom_name == "TransformTensorBilinearV2") { + return std::make_unique(); } if (custom_name == "TransformLandmarks") { return std::make_unique(); From 637c14abf840d83e0f6177694030455d6af35937 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 18 May 2020 17:25:05 -0700 Subject: [PATCH 0790/1533] Add SparseCrossV2 which supports strong_hash with salt, and fingerprint doens't take `hash_key`. hash function will be run before FingerprintCat. 
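For intuition, a hashed cross concatenates one value from each input feature
into a single key and maps that key to a bucket. A rough pure-Python sketch of
the bucketing behaviour (illustrative only; the real kernel combines
per-feature fingerprints with FingerprintCat64 and, when strong_hash is set, a
salted hash, rather than Python's hash):

  def cross_and_bucket(feature_values, num_buckets):
      # feature_values: one value per input feature, e.g. ("US", "cold").
      combined = "_X_".join(str(v) for v in feature_values)
      h = hash(combined)  # stand-in for the fingerprint / strong hash
      return h % num_buckets if num_buckets > 0 else h

  # ("US", "cold") and ("US", "hot") usually land in different buckets.
  print(cross_and_bucket(("US", "cold"), num_buckets=100))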
PiperOrigin-RevId: 312186543 Change-Id: I67a51645250b9d0714b757c85dabf1137e64b167 --- .../base_api/api_def_SparseCrossHashed.pbtxt | 104 +++ .../base_api/api_def_SparseCrossV2.pbtxt | 91 ++ .../api_def_SparseCrossHashed.pbtxt | 4 + .../python_api/api_def_SparseCrossV2.pbtxt | 4 + tensorflow/core/kernels/sparse_cross_op.cc | 805 ++++++++++++------ tensorflow/core/ops/sparse_ops.cc | 40 + .../kernel_tests/sparse_cross_op_test.py | 592 +++++++++++++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 8 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 8 + 9 files changed, 1417 insertions(+), 239 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_SparseCrossV2.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_SparseCrossHashed.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_SparseCrossV2.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt new file mode 100644 index 00000000000..2c4340cb9b7 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt @@ -0,0 +1,104 @@ +op { + graph_op_name: "SparseCrossHashed" + in_arg { + name: "indices" + description: < 0 else hashed_value. +END + } + in_arg { + name: "strong_hash" + description: <

Since per-channel quantization does not apply to input and output tensors, {@code scale} and
+   * {@code zero_point} are both single values instead of arrays.
+   *
+   * For tensors that are not quantized, the values of scale and zero_point are both 0.
+   *
+   * Given a quantized value q, the corresponding float value f should be:
+ * f = scale * (q - zero_point)
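+   * For example, with scale = 0.5 and zero_point = 128, a quantized value
+   * q = 130 corresponds to f = 0.5 * (130 - 128) = 1.0.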
+ */ + public static class QuantizationParams { + /** The scale value used in quantization. */ + private final float scale; + /** The zero point value used in quantization. */ + private final int zeroPoint; + + /** + * Creates a {@link QuantizationParams} with {@code scale} and {@code zero_point}. + * + * @param scale The scale value used in quantization. + * @param zeroPoint The zero point value used in quantization. + */ + public QuantizationParams(final float scale, final int zeroPoint) { + this.scale = scale; + this.zeroPoint = zeroPoint; + } + + /** Returns the scale value. */ + public float getScale() { + return scale; + } + + /** Returns the zero point value. */ + public int getZeroPoint() { + return zeroPoint; + } + } + /** Returns {@code true} if the model has metadata. Otherwise, returns {@code false}. */ public boolean hasMetadata() { return metadataInfo != null; @@ -166,11 +206,11 @@ public class MetadataExtractor { } /** - * Gets the {@link DataType} of the input tensor with {@code inputIndex}. + * Gets the {@link TensorType} of the input tensor with {@code inputIndex}. * * @param inputIndex the index of the desired input tensor */ - public DataType getInputTensorType(int inputIndex) { + public byte getInputTensorType(int inputIndex) { return modelInfo.getInputTensorType(inputIndex); } @@ -221,11 +261,11 @@ public class MetadataExtractor { } /** - * Gets the {@link DataType} of the output tensor with {@code outputIndex}. + * Gets the {@link TensorType} of the output tensor with {@code outputIndex}. * * @param outputIndex the index of the desired output tensor */ - public DataType getOutputTensorType(int outputIndex) { + public byte getOutputTensorType(int outputIndex) { return modelInfo.getOutputTensorType(outputIndex); } diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelInfo.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelInfo.java index e2905d108d7..309a3dbe774 100644 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelInfo.java +++ b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelInfo.java @@ -21,12 +21,8 @@ import static org.tensorflow.lite.support.metadata.Preconditions.checkNotNull; import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.List; -import java.util.Map; import org.checkerframework.checker.nullness.qual.Nullable; -import org.tensorflow.lite.DataType; -import org.tensorflow.lite.Tensor.QuantizationParams; import org.tensorflow.lite.schema.Buffer; import org.tensorflow.lite.schema.Metadata; import org.tensorflow.lite.schema.Model; @@ -34,6 +30,7 @@ import org.tensorflow.lite.schema.QuantizationParameters; import org.tensorflow.lite.schema.SubGraph; import org.tensorflow.lite.schema.Tensor; import org.tensorflow.lite.schema.TensorType; +import org.tensorflow.lite.support.metadata.MetadataExtractor.QuantizationParams; /** Extracts model information out of TFLite model FLatBuffer. */ final class ModelInfo { @@ -49,9 +46,6 @@ final class ModelInfo { /** Identifier of the TFLite model metadata in the Metadata array. */ static final String METADATA_FIELD_NAME = "TFLITE_METADATA"; - /** Maps from TensorType in TFlite FlatBuffer to {@link DataType} in Java. 
*/ - private final Map tensorTypeToDataTypeMap; - /** * Creates a {@link ModelInfo} with the model FlatBuffer, {@code buffer}. * @@ -74,7 +68,6 @@ final class ModelInfo { inputTensors = getInputTensors(model); outputTensors = getOutputTensors(model); - tensorTypeToDataTypeMap = createTensorTypeToDataTypeMap(); } /** @@ -106,13 +99,12 @@ final class ModelInfo { } /** - * Gets {@link DataType} of the input tensor with {@code inputIndex}. + * Gets the {@link TensorType} in byte of the input tensor with {@code inputIndex}. * * @param inputIndex The index of the desired intput tensor. */ - DataType getInputTensorType(int inputIndex) { - Tensor tensor = getInputTensor(inputIndex); - return getDataType(tensor.type()); + byte getInputTensorType(int inputIndex) { + return getInputTensor(inputIndex).type(); } /** Gets the metadata FlatBuffer from the model FlatBuffer. */ @@ -163,13 +155,12 @@ final class ModelInfo { } /** - * Gets {@link DataType} of the output tensor {@code outputIndex}. + * Gets the {@link TensorType} in byte of the output tensor {@code outputIndex}. * * @param outputIndex The index of the desired outtput tensor. */ - DataType getOutputTensorType(int outputIndex) { - Tensor tensor = getOutputTensor(outputIndex); - return getDataType(tensor.type()); + byte getOutputTensorType(int outputIndex) { + return getOutputTensor(outputIndex).type(); } /** @@ -233,29 +224,6 @@ final class ModelInfo { + " flatbuffer."); } - private static Map createTensorTypeToDataTypeMap() { - Map map = new HashMap<>(); - map.put(TensorType.FLOAT32, DataType.FLOAT32); - map.put(TensorType.INT32, DataType.INT32); - map.put(TensorType.UINT8, DataType.UINT8); - map.put(TensorType.INT64, DataType.INT64); - map.put(TensorType.STRING, DataType.STRING); - return Collections.unmodifiableMap(map); - } - - /** - * Transforms from TensorType in TFlite FlatBuffer to {@link DataType} in Java. - * - * @param tensorType The tensor type to be converted. - * @throws IllegalArgumentException if the tensor type is not supported. - */ - private DataType getDataType(byte tensorType) { - checkArgument( - tensorTypeToDataTypeMap.containsKey(tensorType), - String.format("Tensor type %d is not supported.", tensorType)); - return tensorTypeToDataTypeMap.get(tensorType); - } - /** * Gets the shape of a tensor. * From e56cf87b54d5968a18a52e240d3333dfdbe66be8 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 20 May 2020 19:27:32 -0700 Subject: [PATCH 0952/1533] Adds necessary hooks to load a TPU-specific shared library. 
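The hook is reached through the normal library-loading path: any shared object
that exports a TfTpu_Initialize symbol is treated as a TPU library and its
handle is passed to InitializeTPULibrary(). A minimal sketch of triggering
that path from Python (the .so path below is a placeholder, not a shipped
artifact):

  import tensorflow as tf

  # tf.load_op_library() ends up in LoadLibrary() in load_library.cc; if the
  # loaded .so exports TfTpu_Initialize, InitializeTPULibrary() runs on its
  # handle before the call returns.
  _ = tf.load_op_library("/path/to/libtpu_ops.so")  # placeholder path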
PiperOrigin-RevId: 312601701 Change-Id: I1ae43d253d1734c30ffefe4d4062c82639d7a4d1 --- tensorflow/core/BUILD | 1 + tensorflow/core/framework/load_library.cc | 14 +++++++++++ tensorflow/core/tpu/BUILD | 8 ++++++ tensorflow/core/tpu/tpu_library_loader.cc | 30 +++++++++++++++++++++++ tensorflow/core/tpu/tpu_library_loader.h | 29 ++++++++++++++++++++++ 5 files changed, 82 insertions(+) create mode 100644 tensorflow/core/tpu/tpu_library_loader.cc create mode 100644 tensorflow/core/tpu/tpu_library_loader.h diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 6b4874a8393..2b16801f6ed 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2254,6 +2254,7 @@ tf_cuda_library( "//tensorflow/core/platform/default/build_config:platformlib", "//tensorflow/core/profiler/lib:annotated_traceme", "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/tpu:tpu_library_loader", "//tensorflow/core/util:einsum_op_util", "//tensorflow/core/util:padding", "//tensorflow/core/util:port", diff --git a/tensorflow/core/framework/load_library.cc b/tensorflow/core/framework/load_library.cc index b9e33b148f7..c223eac4722 100644 --- a/tensorflow/core/framework/load_library.cc +++ b/tensorflow/core/framework/load_library.cc @@ -21,6 +21,9 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mem.h" +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/tpu/tpu_library_loader.h" +#endif // IS_MOBILE_PLATFORM namespace tensorflow { @@ -97,6 +100,17 @@ Status LoadLibrary(const char* library_filename, void** result, *buf = str_buf; *len = str.length(); +#if !defined(IS_MOBILE_PLATFORM) + // Determine if this library is a TPU library, and if so, calls the TPU + // initialization functions to populate function tables, etc... + void* unused_symbol; + if (env->GetSymbolFromLibrary(library.handle, "TfTpu_Initialize", + &unused_symbol) + .ok()) { + TF_RETURN_IF_ERROR(tensorflow::tpu::InitializeTPULibrary(library.handle)); + } +#endif // IS_MOBILE_PLATFORM + *result = library.handle; return Status::OK(); } diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index 48a9a229d2a..5d1b7e1101f 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -91,3 +91,11 @@ cc_library( "//tensorflow/c:tf_status", ], ) + +cc_library( + name = "tpu_library_loader", + srcs = ["tpu_library_loader.cc"], + hdrs = ["tpu_library_loader.h"], + visibility = ["//tensorflow:__subpackages__"], + deps = ["//tensorflow/core/platform:status"], +) diff --git a/tensorflow/core/tpu/tpu_library_loader.cc b/tensorflow/core/tpu/tpu_library_loader.cc new file mode 100644 index 00000000000..bfd9fe29efe --- /dev/null +++ b/tensorflow/core/tpu/tpu_library_loader.cc @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/tpu/tpu_library_loader.h" + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tpu { + +Status InitializeTPULibrary(void* library) { + // TODO(frankchn): dlsym the loaded library and populate a struct with the + // relevant C APIs necessary for TPUs. + return Status::OK(); +} + +} // namespace tpu +} // namespace tensorflow diff --git a/tensorflow/core/tpu/tpu_library_loader.h b/tensorflow/core/tpu/tpu_library_loader.h new file mode 100644 index 00000000000..35a7dd7c9be --- /dev/null +++ b/tensorflow/core/tpu/tpu_library_loader.h @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_LIBRARY_LOADER_H_ +#define TENSORFLOW_CORE_TPU_TPU_LIBRARY_LOADER_H_ + +#include "tensorflow/core/platform/status.h" + +namespace tensorflow { +namespace tpu { + +Status InitializeTPULibrary(void* library); + +} // namespace tpu +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_LIBRARY_LOADER_H_ From f24faa153ad31a4b51578f8181d3aaab77a1ddeb Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Wed, 20 May 2020 20:04:33 -0700 Subject: [PATCH 0953/1533] Add dataset element compression ops. These allow us to implement tf.data service compression/decompression as a part of the tf.data pipeline. 
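A rough sketch of the intended use inside a pipeline (the Python wrappers live
in the new compression_ops.py, which is not reproduced in full here, so treat
the exact argument names as assumptions):

  import tensorflow as tf
  from tensorflow.python.data.experimental.ops import compression_ops

  ds = tf.data.Dataset.range(10)
  spec = ds.element_spec
  # Compress each element into a scalar variant holding a CompressedElement...
  compressed = ds.map(compression_ops.compress)
  # ...and later restore it using the original element spec.
  restored = compressed.map(
      lambda x: compression_ops.uncompress(x, output_spec=spec))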
PiperOrigin-RevId: 312605093 Change-Id: I4a833bc89e602c8fd78abc4c1a0026c2a397449f --- .../base_api/api_def_CompressElement.pbtxt | 5 ++ .../base_api/api_def_UncompressElement.pbtxt | 5 ++ tensorflow/core/framework/common_shape_fns.cc | 19 +++++ tensorflow/core/framework/common_shape_fns.h | 3 + .../core/kernels/data/experimental/BUILD | 15 ++++ .../data/experimental/compression_ops.cc | 76 +++++++++++++++++ .../data/experimental/compression_ops.h | 49 +++++++++++ tensorflow/core/ops/dataset_ops.cc | 35 ++------ .../core/ops/experimental_dataset_ops.cc | 13 +++ .../data/experimental/kernel_tests/BUILD | 15 +++- .../kernel_tests/compression_ops_test.py | 81 +++++++++++++++++++ tensorflow/python/data/experimental/ops/BUILD | 10 +++ .../data/experimental/ops/compression_ops.py | 55 +++++++++++++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 8 ++ .../api/golden/v2/tensorflow.raw_ops.pbtxt | 8 ++ 15 files changed, 366 insertions(+), 31 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_CompressElement.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_UncompressElement.pbtxt create mode 100644 tensorflow/core/kernels/data/experimental/compression_ops.cc create mode 100644 tensorflow/core/kernels/data/experimental/compression_ops.h create mode 100644 tensorflow/python/data/experimental/kernel_tests/compression_ops_test.py create mode 100644 tensorflow/python/data/experimental/ops/compression_ops.py diff --git a/tensorflow/core/api_def/base_api/api_def_CompressElement.pbtxt b/tensorflow/core/api_def/base_api/api_def_CompressElement.pbtxt new file mode 100644 index 00000000000..17b63e4ab2f --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_CompressElement.pbtxt @@ -0,0 +1,5 @@ +op { + graph_op_name: "CompressElement" + visibility: HIDDEN + summary: "Compresses a dataset element." +} diff --git a/tensorflow/core/api_def/base_api/api_def_UncompressElement.pbtxt b/tensorflow/core/api_def/base_api/api_def_UncompressElement.pbtxt new file mode 100644 index 00000000000..e2039b674f0 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_UncompressElement.pbtxt @@ -0,0 +1,5 @@ +op { + graph_op_name: "UncompressElement" + visibility: HIDDEN + summary: "Uncompresses a compressed dataset element." +} diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc index 113adbdd432..216002ad8e7 100644 --- a/tensorflow/core/framework/common_shape_fns.cc +++ b/tensorflow/core/framework/common_shape_fns.cc @@ -468,6 +468,25 @@ Status CheckFormatConstraintsOnShape(const TensorFormat tensor_format, return Status::OK(); } +Status DatasetIteratorShape(shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused)); + std::vector output_shapes; + TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes)); + if (output_shapes.size() != c->num_outputs()) { + return errors::InvalidArgument( + "`output_shapes` must be the same length as `output_types` (", + output_shapes.size(), " vs. 
", c->num_outputs()); + } + for (size_t i = 0; i < output_shapes.size(); ++i) { + shape_inference::ShapeHandle output_shape_handle; + TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( + output_shapes[i], &output_shape_handle)); + c->set_output(static_cast(i), output_shape_handle); + } + return Status::OK(); +} + Status MakeShapeFromFormat(TensorFormat format, DimensionOrConstant N, const std::vector& spatial, DimensionOrConstant C, ShapeHandle* out, diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h index e1984abab7e..218400c2435 100644 --- a/tensorflow/core/framework/common_shape_fns.h +++ b/tensorflow/core/framework/common_shape_fns.h @@ -92,6 +92,9 @@ inline Status MergeBothInputsShapeFn(InferenceContext* c) { return Status::OK(); } +// Shape function for dataset iterators. +Status DatasetIteratorShape(shape_inference::InferenceContext* c); + // Returns a new shape with the specified dims arranged in the specified // format. The returned value is owned by this context. // Note: if format = "FORMAT_NCHW_VECT_C" then C represents the outer_depth. diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index 85f8af878ee..f4b9240ca31 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -109,6 +109,20 @@ tf_kernel_library( ], ) +tf_kernel_library( + name = "compression_ops", + srcs = ["compression_ops.cc"], + hdrs = ["compression_ops.h"], + deps = [ + "//tensorflow/core:experimental_dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/data:compression_utils", + "//tensorflow/core/data:dataset_proto_cc", + ], +) + tf_kernel_library( name = "csv_dataset_op", srcs = ["csv_dataset_op.cc"], @@ -681,6 +695,7 @@ tf_kernel_library( ":auto_shard_dataset_op", ":choose_fastest_branch_dataset_op", ":choose_fastest_dataset_op", + ":compression_ops", ":csv_dataset_op", ":dense_to_sparse_batch_dataset_op", ":directed_interleave_dataset_op", diff --git a/tensorflow/core/kernels/data/experimental/compression_ops.cc b/tensorflow/core/kernels/data/experimental/compression_ops.cc new file mode 100644 index 00000000000..efa7018acb6 --- /dev/null +++ b/tensorflow/core/kernels/data/experimental/compression_ops.cc @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/core/kernels/data/experimental/compression_ops.h"
+
+#include "tensorflow/core/data/compression_utils.h"
+#include "tensorflow/core/platform/errors.h"
+
+namespace tensorflow {
+namespace data {
+namespace experimental {
+
+CompressElementOp::CompressElementOp(OpKernelConstruction* ctx)
+    : OpKernel(ctx) {}
+
+void CompressElementOp::Compute(OpKernelContext* ctx) {
+  std::vector<Tensor> components;
+  for (size_t i = 0; i < ctx->num_inputs(); ++i) {
+    components.push_back(ctx->input(i));
+  }
+  CompressedElement compressed;
+  OP_REQUIRES_OK(ctx, CompressElement(components, &compressed));
+
+  Tensor* output;
+  OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+  output->scalar<Variant>()() = std::move(compressed);
+}
+
+UncompressElementOp::UncompressElementOp(OpKernelConstruction* ctx)
+    : OpKernel(ctx) {
+  OP_REQUIRES_OK(ctx, ctx->GetAttr(kOutputTypes, &output_types_));
+  OP_REQUIRES_OK(ctx, ctx->GetAttr(kOutputShapes, &output_shapes_));
+}
+
+void UncompressElementOp::Compute(OpKernelContext* ctx) {
+  Tensor tensor = ctx->input(0);
+  const Variant& variant = tensor.scalar<Variant>()();
+  const CompressedElement* compressed = variant.get<CompressedElement>();
+
+  std::vector<Tensor> components;
+  OP_REQUIRES_OK(ctx, UncompressElement(*compressed, &components));
+  OP_REQUIRES(ctx, components.size() == output_types_.size(),
+              errors::FailedPrecondition("Expected ", output_types_.size(),
+                                         " outputs from uncompress, but got ",
+                                         components.size()));
+  for (int i = 0; i < components.size(); ++i) {
+    OP_REQUIRES(
+        ctx, components[i].dtype() == output_types_[i],
+        errors::FailedPrecondition("Expected a tensor of type ",
+                                   DataTypeString(output_types_[i]),
+                                   " but got a tensor of type ",
+                                   DataTypeString(components[i].dtype())));
+    ctx->set_output(i, components[i]);
+  }
+}
+
+REGISTER_KERNEL_BUILDER(Name("CompressElement").Device(DEVICE_CPU),
+                        CompressElementOp);
+REGISTER_KERNEL_BUILDER(Name("UncompressElement").Device(DEVICE_CPU),
+                        UncompressElementOp);
+
+}  // namespace experimental
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/compression_ops.h b/tensorflow/core/kernels/data/experimental/compression_ops.h
new file mode 100644
index 00000000000..6dd89ea4e5d
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/compression_ops.h
@@ -0,0 +1,49 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_COMPRESSION_OPS_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_COMPRESSION_OPS_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +class CompressElementOp : public OpKernel { + public: + explicit CompressElementOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; +}; + +class UncompressElementOp : public OpKernel { + public: + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit UncompressElementOp(OpKernelConstruction* ctx); + + void Compute(OpKernelContext* ctx) override; + + private: + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_COMPRESSION_OPS_H_ diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc index 0122cbed087..6a633fb679d 100644 --- a/tensorflow/core/ops/dataset_ops.cc +++ b/tensorflow/core/ops/dataset_ops.cc @@ -731,42 +731,19 @@ REGISTER_OP("OneShotIterator") .SetIsStateful() .SetShapeFn(shape_inference::ScalarShape); -namespace { - -Status IteratorGetNextShapeFn(shape_inference::InferenceContext* c) { - shape_inference::ShapeHandle unused; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused)); - std::vector output_shapes; - TF_RETURN_IF_ERROR(c->GetAttr("output_shapes", &output_shapes)); - if (output_shapes.size() != c->num_outputs()) { - return errors::InvalidArgument( - "`output_shapes` must be the same length as `output_types` (", - output_shapes.size(), " vs. 
", c->num_outputs()); - } - for (size_t i = 0; i < output_shapes.size(); ++i) { - shape_inference::ShapeHandle output_shape_handle; - TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape( - output_shapes[i], &output_shape_handle)); - c->set_output(static_cast(i), output_shape_handle); - } - return Status::OK(); -} - -} // namespace - REGISTER_OP("IteratorGetNext") .Input("iterator: resource") .Output("components: output_types") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(IteratorGetNextShapeFn); + .SetShapeFn(shape_inference::DatasetIteratorShape); REGISTER_OP("IteratorGetNextSync") .Input("iterator: resource") .Output("components: output_types") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(IteratorGetNextShapeFn); + .SetShapeFn(shape_inference::DatasetIteratorShape); // TODO(b/124308596): Instead of conservatively marking this op as stateful, // implement a mechanism to determine whether `dataset` has a side-effect @@ -778,7 +755,7 @@ REGISTER_OP("DatasetToSingleElement") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") .SetIsStateful() - .SetShapeFn(IteratorGetNextShapeFn); + .SetShapeFn(shape_inference::DatasetIteratorShape); // TODO(b/124308596): Instead of conservatively marking this op as stateful, // implement a mechanism to determine whether `dataset` has a side-effect @@ -796,7 +773,7 @@ REGISTER_OP("ReduceDataset") .Attr("output_shapes: list(shape) >= 1") .Attr("use_inter_op_parallelism: bool = true") .SetIsStateful() - .SetShapeFn(IteratorGetNextShapeFn); + .SetShapeFn(shape_inference::DatasetIteratorShape); REGISTER_OP("IteratorToStringHandle") .Input("resource_handle: resource") @@ -875,7 +852,7 @@ REGISTER_OP("OptionalGetValue") .Output("components: output_types") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(IteratorGetNextShapeFn); + .SetShapeFn(shape_inference::DatasetIteratorShape); REGISTER_OP("IteratorGetNextAsOptional") .Input("iterator: resource") @@ -992,7 +969,7 @@ REGISTER_OP("MultiDeviceIteratorGetNextFromShard") .Output("components: output_types") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") - .SetShapeFn(IteratorGetNextShapeFn); + .SetShapeFn(shape_inference::DatasetIteratorShape); REGISTER_OP("MultiDeviceIteratorToStringHandle") .Input("multi_device_iterator: resource") diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc index 2c9cbe2f416..aa4bd64270a 100644 --- a/tensorflow/core/ops/experimental_dataset_ops.cc +++ b/tensorflow/core/ops/experimental_dataset_ops.cc @@ -132,6 +132,19 @@ REGISTER_OP("ExperimentalChooseFastestDataset") .Attr("output_shapes: list(shape) >= 1") .SetShapeFn(shape_inference::ScalarShape); +REGISTER_OP("CompressElement") + .Input("components: input_types") + .Output("compressed: variant") + .Attr("input_types: list(type) >= 1") + .SetShapeFn(shape_inference::ScalarShape); + +REGISTER_OP("UncompressElement") + .Input("compressed: variant") + .Output("components: output_types") + .Attr("output_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .SetShapeFn(shape_inference::DatasetIteratorShape); + REGISTER_OP("CSVDataset") .Input("filenames: string") .Input("compression_type: string") diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD index d5d6cb00733..1d5abb9871b 100644 --- 
a/tensorflow/python/data/experimental/kernel_tests/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/BUILD @@ -1,5 +1,5 @@ -load("//tensorflow:tensorflow.bzl", "tf_py_test") -load("//tensorflow:tensorflow.bzl", "cuda_py_test") +load("//tensorflow:tensorflow.bzl", "tf_py_test") # buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "cuda_py_test") # buildifier: disable=same-origin-load package( default_visibility = ["//tensorflow:internal"], @@ -87,6 +87,17 @@ tf_py_test( ], ) +tf_py_test( + name = "compression_ops_test", + srcs = ["compression_ops_test.py"], + deps = [ + "//tensorflow/python/data/experimental/ops:compression_ops", + "//tensorflow/python/data/kernel_tests:test_base", + "//tensorflow/python/data/ops:dataset_ops", + "@absl_py//absl/testing:parameterized", + ], +) + cuda_py_test( name = "copy_to_device_test", size = "small", diff --git a/tensorflow/python/data/experimental/kernel_tests/compression_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/compression_ops_test.py new file mode 100644 index 00000000000..a091bdca8b9 --- /dev/null +++ b/tensorflow/python/data/experimental/kernel_tests/compression_ops_test.py @@ -0,0 +1,81 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for compression ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.python.data.experimental.ops import compression_ops +from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.util import structure +from tensorflow.python.framework import combinations +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.platform import test + + +def _test_objects(): + return [ + combinations.NamedObject("int", 1), + combinations.NamedObject("string", "dog"), + combinations.NamedObject("tuple", (1, 1)), + combinations.NamedObject("int_string_tuple", (1, "dog")), + combinations.NamedObject( + "sparse", + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])), + combinations.NamedObject( + "sparse_structured", { + "a": + sparse_tensor.SparseTensorValue( + indices=[[0, 0], [1, 2]], + values=[1, 2], + dense_shape=[3, 4]), + "b": (1, 2, "dog") + }) + ] + + +class CompressionOpsTest(test_base.DatasetTestBase, parameterized.TestCase): + + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(element=_test_objects()))) + def testCompression(self, element): + element = element._obj + + compressed = compression_ops.compress(element) + uncompressed = compression_ops.uncompress( + compressed, structure.type_spec_from_value(element)) + self.assertValuesEqual(element, self.evaluate(uncompressed)) + + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(element=_test_objects()))) + def testDatasetCompression(self, element): + element = element._obj + + dataset = dataset_ops.Dataset.from_tensors(element) + element_spec = dataset.element_spec + + dataset = dataset.map(lambda *x: compression_ops.compress(x)) + dataset = dataset.map(lambda x: compression_ops.uncompress(x, element_spec)) + self.assertDatasetProduces(dataset, [element]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD index 50d095e46f6..2adf2a6362d 100644 --- a/tensorflow/python/data/experimental/ops/BUILD +++ b/tensorflow/python/data/experimental/ops/BUILD @@ -33,6 +33,15 @@ py_library( ], ) +py_library( + name = "compression_ops", + srcs = ["compression_ops.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:experimental_dataset_ops_gen", + ], +) + py_library( name = "counter", srcs = ["counter.py"], @@ -475,6 +484,7 @@ py_library( deps = [ ":batching", ":cardinality", + ":compression_ops", ":counter", ":data_service_ops", ":distribute", diff --git a/tensorflow/python/data/experimental/ops/compression_ops.py b/tensorflow/python/data/experimental/ops/compression_ops.py new file mode 100644 index 00000000000..1ef7c8b3f01 --- /dev/null +++ b/tensorflow/python/data/experimental/ops/compression_ops.py @@ -0,0 +1,55 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Ops for compressing and uncompressing dataset elements.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.util import structure +from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops + + +def compress(element): + """Compress a dataset element. + + Args: + element: A nested structure of types supported by Tensorflow. + + Returns: + A variant tensor representing the compressed element. This variant can be + passed to `uncompress` to get back the original element. + """ + element_spec = structure.type_spec_from_value(element) + tensor_list = structure.to_tensor_list(element_spec, element) + return ged_ops.compress_element(tensor_list) + + +def uncompress(element, output_spec): + """Uncompress a compressed dataset element. + + Args: + element: A scalar variant tensor to uncompress. The element should have been + created by calling `compress`. + output_spec: A nested structure of `tf.TypeSpec` representing the type(s) of + the uncompressed element. + + Returns: + The uncompressed element. + """ + flat_types = structure.get_flat_tensor_types(output_spec) + flat_shapes = structure.get_flat_tensor_shapes(output_spec) + tensor_list = ged_ops.uncompress_element( + element, output_types=flat_types, output_shapes=flat_shapes) + return structure.from_tensor_list(output_spec, tensor_list) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index f798ebf25fd..3db327300a9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -736,6 +736,10 @@ tf_module { name: "ComplexAbs" argspec: "args=[\'x\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "CompressElement" + argspec: "args=[\'components\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "ComputeAccidentalHits" argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], " @@ -4956,6 +4960,10 @@ tf_module { name: "UnbatchGrad" argspec: "args=[\'original_input\', \'batch_index\', \'grad\', \'id\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], " } + member_method { + name: "UncompressElement" + argspec: "args=[\'compressed\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "UnicodeDecode" argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \"\", \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 
f798ebf25fd..3db327300a9 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -736,6 +736,10 @@ tf_module { name: "ComplexAbs" argspec: "args=[\'x\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " } + member_method { + name: "CompressElement" + argspec: "args=[\'components\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "ComputeAccidentalHits" argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'None\'], " @@ -4956,6 +4960,10 @@ tf_module { name: "UnbatchGrad" argspec: "args=[\'original_input\', \'batch_index\', \'grad\', \'id\', \'container\', \'shared_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'None\'], " } + member_method { + name: "UncompressElement" + argspec: "args=[\'compressed\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "UnicodeDecode" argspec: "args=[\'input\', \'input_encoding\', \'errors\', \'replacement_char\', \'replace_control_characters\', \'Tsplits\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'False\', \"\", \'None\'], " From 1df42d1bf35cf15954434a5a804275638cae4440 Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Wed, 20 May 2020 20:38:13 -0700 Subject: [PATCH 0954/1533] Update person_detection_experimental model PiperOrigin-RevId: 312608330 Change-Id: I011d8e9e69f255d74375338e6c5444a6b41b3717 --- .../examples/person_detection_experimental/training_a_model.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/training_a_model.md b/tensorflow/lite/micro/examples/person_detection_experimental/training_a_model.md index 24067fc188f..beb743a2923 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/training_a_model.md +++ b/tensorflow/lite/micro/examples/person_detection_experimental/training_a_model.md @@ -372,6 +372,9 @@ tf.lite.TFLiteConverter.from_frozen_graph('vww_96_grayscale_frozen.pb', ['input'], ['MobilenetV1/Predictions/Reshape_1']) converter.optimizations = [tf.lite.Optimize.DEFAULT] converter.representative_dataset = representative_dataset_gen +converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] +converter.inference_input_type = tf.int8 +converter.inference_output_type = tf.int8 tflite_quant_model = converter.convert() open("vww_96_grayscale_quantized.tflite", "wb").write(tflite_quant_model) From 41224dad54657a6929a03c23193d6e81eab868cd Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 20 May 2020 21:16:15 -0700 Subject: [PATCH 0955/1533] Fuse tf.text WhitespaceTokenizer to tflite custom op PiperOrigin-RevId: 312612112 Change-Id: Ia7142d64948a4e41f795ee1f64ecd004bcbf9be0 --- tensorflow/compiler/mlir/lite/BUILD | 36 +++++ .../compiler/mlir/lite/tests/fuse-tftext.mlir | 14 ++ .../prepare_composite_functions_tf.cc | 11 ++ .../compiler/mlir/lite/utils/tftext_utils.cc | 127 ++++++++++++++++++ .../compiler/mlir/lite/utils/tftext_utils.h | 39 ++++++ 5 files changed, 227 insertions(+) create mode 100644 tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir create mode 100644 tensorflow/compiler/mlir/lite/utils/tftext_utils.cc create mode 100644 tensorflow/compiler/mlir/lite/utils/tftext_utils.h diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 9b5b0c209e5..6eff7dbd084 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -260,6 +260,41 @@ cc_library( ], ) +cc_library( + name = "tftext_utils", + srcs = [ + "utils/tftext_utils.cc", + ], + hdrs = [ + "utils/tftext_utils.h", + ], + copts = ["-std=c++14"], + deps = [ + ":tensorflow_lite", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "tftext_utils_test", + size = "small", + srcs = ["utils/lstm_utils_test.cc"], + deps = [ + ":lstm_utils", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + cc_library( name = "stateful_ops_utils", srcs = [ @@ -320,6 +355,7 @@ cc_library( ":lstm_utils", ":stateful_ops_utils", ":tensorflow_lite", + ":tftext_utils", ":validators", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", diff --git a/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir new file mode 100644 index 00000000000..f08ac0e1027 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/fuse-tftext.mlir @@ -0,0 +1,14 @@ +// RUN: tf-opt -tfl-prepare-composite-funcs-tf -tfl-fuse-tftext=true %s -split-input-file | FileCheck %s --dump-input-on-failure +module { + + func @_whitespace_func(%arg0: tensor<1x!tf.string>) -> (tensor, tensor) attributes {tf._GrapplerSpecializedFunc = true, tf._input_shapes = [#tf.shape<1>], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { + %0 = "tf.op1"(%arg0) : (tensor<1x!tf.string>) -> (tensor) + %1 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %2:2 = "tf.op2"(%arg0, %1) : (tensor<1x!tf.string>, tensor) -> (tensor, tensor) + return %2#0, %2#1 : tensor, tensor + } + + // CHECK: func @_whitespace_func(%arg0: tensor<1x!tf.string>) -> (tensor, tensor) attributes {tf._GrapplerSpecializedFunc = true, tf._input_shapes = [#tf.shape<1>], tf.api_implements = "tftext:WhitespaceTokenizer", tf.signature.is_stateful} { + // CHECK: "tfl.custom"(%arg0) {custom_code = "tftext:WhitespaceTokenizer", custom_option = opaque<"tfl", "0x"> : tensor<0xi8>} : (tensor<1x!tf.string>) -> (tensor, tensor) + // CHECK: return %0#0, %0#1 : tensor, tensor +} diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc index 
6179eb2ce64..56af68f6bbe 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_composite_functions_tf.cc @@ -41,15 +41,22 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/lstm_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/tftext_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +// The cmd line flag to turn on/off Tf.Text API fusion. // NOLINTNEXTLINE +static llvm::cl::opt fuse_tftext( + "tfl-fuse-tftext", llvm::cl::value_desc("bool"), + llvm::cl::desc("Fuse TF.Text API ops when it's true"), + llvm::cl::init(false)); namespace mlir { namespace TFL { namespace { constexpr char kTFAPIImplements[] = "tf.api_implements"; +constexpr char kTfTextAPIPRefix[] = "tftext:"; // Abstracts the conversion of the embedded lookup composite function. class ConvertEmbeddedLookupFunc { @@ -187,6 +194,10 @@ void PrepareCompositeFunctionsPass::ConvertTFAPIImplements(FuncOp func, OpBuilder builder(func.getBody()); if (failed(ConvertKerasLSTMLayer(func, &builder))) return signalPassFailure(); + } else if (fuse_tftext && attr.getValue().startswith(kTfTextAPIPRefix)) { + if (failed(ConvertTFTextAPI(func, attr.getValue()))) { + return signalPassFailure(); + } } } diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc new file mode 100644 index 00000000000..12929152d1e --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc @@ -0,0 +1,127 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/lite/utils/tftext_utils.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Identifier.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace TFL { + +namespace { + +constexpr char kWhitespaceTokenizer[] = "tftext:WhitespaceTokenizer"; +constexpr char kTFAPIImplements[] = "tf.api_implements"; + +inline OpaqueElementsAttr emptyCustomOption(OpBuilder* builder) { + std::string content = ""; + ShapedType type = RankedTensorType::get( + {static_cast(content.size())}, builder->getIntegerType(8)); + return OpaqueElementsAttr::get( + builder->getContext()->getRegisteredDialect("tfl"), type, content); +} + +inline RankedTensorType getInputType(mlir::FuncOp func, int idx) { + return func.getType() + .getInput(idx) + .dyn_cast_or_null(); +} + +inline RankedTensorType getResultType(mlir::FuncOp func, int idx) { + return func.getType() + .getResult(idx) + .dyn_cast_or_null(); +} + +LogicalResult VerifyWhitespaceTokenizer(mlir::FuncOp func) { + if (func.getNumResults() != 2) { + return failure(); + } + if (func.getNumArguments() != 1) { + return failure(); + } + auto input_type = getInputType(func, 0); + if (!input_type || input_type.getRank() != 1 || + !input_type.getElementType().isa()) { + return failure(); + } + auto value_type = getResultType(func, 0); + if (!value_type || value_type.getRank() != 1 || + !value_type.getElementType().isa()) { + return failure(); + } + auto offset_type = getResultType(func, 1); + if (offset_type.getRank() != 1 || + !offset_type.getElementType().isInteger(64)) { + return failure(); + } + return success(); +} + +LogicalResult ConvertWhitespaceTokenizer(mlir::FuncOp func, + llvm::StringRef api) { + func.eraseBody(); + func.addEntryBlock(); + func.setAttr(kTFAPIImplements, StringAttr::get(api, func.getContext())); + + Value text = func.getArgument(0); + auto output_type = func.getType().getResult(0); + auto offset_type = func.getType().getResult(1); + SmallVector shape = {output_type, offset_type}; + ArrayRef output_types(shape); + + OpBuilder builder(func.getBody()); + + auto op = builder.create(func.getLoc(), output_types, + ValueRange(text), api, + emptyCustomOption(&builder)); + + builder.create(func.getLoc(), op.getResults()); + return success(); +} +} // namespace + +LogicalResult ConvertTFTextAPI(mlir::FuncOp func, llvm::StringRef api) { + if (api.str() == kWhitespaceTokenizer) { + if (succeeded(VerifyWhitespaceTokenizer(func))) { + return 
ConvertWhitespaceTokenizer(func, api); + } + } + return failure(); +} + +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.h b/tensorflow/compiler/mlir/lite/utils/tftext_utils.h new file mode 100644 index 00000000000..283e57c179a --- /dev/null +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.h @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This header file defines common utils used by TFLite transformation +// passes to work with op attributes. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" + +namespace mlir { +namespace TFL { + +LogicalResult ConvertTFTextAPI(mlir::FuncOp func, llvm::StringRef api); + +} // end namespace TFL +} // end namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_TFTEXT_UTILS_H_ From 81b1778bcf79f8cc3a545ebc875ae2f65f030554 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 20 May 2020 21:18:22 -0700 Subject: [PATCH 0956/1533] Update ops-related pbtxt files. 
PiperOrigin-RevId: 312612303 Change-Id: I20a2efc1dab991cfd4ae1d464b4876ff7326208b --- .../ops_history_v2/CompressElement.pbtxt | 17 ++++++++ .../ops_history_v2/UncompressElement.pbtxt | 23 +++++++++++ tensorflow/core/ops/ops.pbtxt | 40 +++++++++++++++++++ 3 files changed, 80 insertions(+) create mode 100644 tensorflow/core/ops/compat/ops_history_v2/CompressElement.pbtxt create mode 100644 tensorflow/core/ops/compat/ops_history_v2/UncompressElement.pbtxt diff --git a/tensorflow/core/ops/compat/ops_history_v2/CompressElement.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CompressElement.pbtxt new file mode 100644 index 00000000000..07d8cb461af --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/CompressElement.pbtxt @@ -0,0 +1,17 @@ +op { + name: "CompressElement" + input_arg { + name: "components" + type_list_attr: "input_types" + } + output_arg { + name: "compressed" + type: DT_VARIANT + } + attr { + name: "input_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/UncompressElement.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/UncompressElement.pbtxt new file mode 100644 index 00000000000..68406e0e4bc --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/UncompressElement.pbtxt @@ -0,0 +1,23 @@ +op { + name: "UncompressElement" + input_arg { + name: "compressed" + type: DT_VARIANT + } + output_arg { + name: "components" + type_list_attr: "output_types" + } + attr { + name: "output_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index c951cb11778..75f8c0dadcb 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -7451,6 +7451,23 @@ op { } } } +op { + name: "CompressElement" + input_arg { + name: "components" + type_list_attr: "input_types" + } + output_arg { + name: "compressed" + type: DT_VARIANT + } + attr { + name: "input_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } +} op { name: "ComputeAccidentalHits" input_arg { @@ -52662,6 +52679,29 @@ op { type: "type" } } +op { + name: "UncompressElement" + input_arg { + name: "compressed" + type: DT_VARIANT + } + output_arg { + name: "components" + type_list_attr: "output_types" + } + attr { + name: "output_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } +} op { name: "UnicodeDecode" input_arg { From 203c1de5a4e54079304f154eee1745e6ee3eb3b2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 20 May 2020 21:46:54 -0700 Subject: [PATCH 0957/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 312614884 Change-Id: I0346b1bf51895ce50735c6ba3f87e04d80ba01f8 --- tensorflow/go/op/wrappers.go | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 7efdcf181d9..47f5c4952b6 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11417,6 +11417,32 @@ func DynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output) (merged return op.Output(0) } +// Uncompresses a compressed dataset element. 
+func UncompressElement(scope *Scope, compressed tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes} + opspec := tf.OpSpec{ + Type: "UncompressElement", + Input: []tf.Input{ + compressed, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if components, idx, err = makeOutputList(op, idx, "components"); err != nil { + scope.UpdateErr("UncompressElement", err) + return + } + return components +} + // Records the bytes size of each element of `input_dataset` in a StatsAggregator. func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) { if scope.Err() != nil { @@ -30410,6 +30436,21 @@ func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, upd return scope.AddOperation(opspec) } +// Compresses a dataset element. +func CompressElement(scope *Scope, components []tf.Output) (compressed tf.Output) { + if scope.Err() != nil { + return + } + opspec := tf.OpSpec{ + Type: "CompressElement", + Input: []tf.Input{ + tf.OutputList(components), + }, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // MatMulAttr is an optional argument to MatMul. type MatMulAttr func(optionalAttr) From ed0eb69b76f9ff7ac952a3f36692d2c86929a6bf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 20 May 2020 22:08:24 -0700 Subject: [PATCH 0958/1533] Remove some unused methods. PiperOrigin-RevId: 312617087 Change-Id: I60618ee25984825997c204740c12eddefbf9d398 --- .../layers/preprocessing/text_vectorization.py | 10 ---------- .../layers/preprocessing/text_vectorization_v1.py | 14 -------------- 2 files changed, 24 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 9d083cc8769..1abc37cb4c3 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -309,18 +309,8 @@ class TextVectorization(CombinerPreprocessingLayer): def _get_vectorization_class(self): return categorical_encoding.CategoricalEncoding - def _get_table_data(self): - keys, values = self._table.export() - return (keys.numpy(), values.numpy()) - def _get_index_lookup_class(self): return string_lookup.StringLookup - - def _to_numpy(self, preprocessed_data): - """Converts preprocessed inputs into numpy arrays.""" - if isinstance(preprocessed_data, np.ndarray): - return preprocessed_data - return np.array(preprocessed_data.to_list()) # End of V1/V2 shim points. 
def _assert_same_type(self, expected_type, values, value_name): diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py index 59cf2c61288..a7c7b9136f9 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py @@ -18,14 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import numpy as np - -from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine import base_preprocessing_layer_v1 from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 from tensorflow.python.keras.layers.preprocessing import text_vectorization -from tensorflow.python.ops.ragged import ragged_tensor_value from tensorflow.python.util.tf_export import keras_export @@ -85,13 +81,3 @@ class TextVectorization(text_vectorization.TextVectorization, def _get_index_lookup_class(self): return string_lookup_v1.StringLookup - - def _to_numpy(self, data): - """Converts preprocessed inputs into numpy arrays.""" - if isinstance(data, np.ndarray): - return data - session = K.get_session() - data = session.run(data) - if isinstance(data, ragged_tensor_value.RaggedTensorValue): - data = np.array(data.to_list()) - return data From a8001b9e8db92620603c3c0588d251192d327bae Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 20 May 2020 23:37:04 -0700 Subject: [PATCH 0959/1533] Take proto by value. PiperOrigin-RevId: 312626373 Change-Id: I2effeab7b0c97052f14b8f52b653f24a379dc7ee --- tensorflow/compiler/xla/client/xla_computation.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h index 3ccbfb28bd0..6a3b17a154a 100644 --- a/tensorflow/compiler/xla/client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_computation.h @@ -29,8 +29,8 @@ namespace xla { class XlaComputation { public: XlaComputation() : unique_id_(-1) {} - XlaComputation(const HloModuleProto& proto) - : unique_id_(proto.id()), proto_(proto) {} + XlaComputation(HloModuleProto proto) + : unique_id_(proto.id()), proto_(std::move(proto)) {} ~XlaComputation() {} From acc8ae3496a94a7ac9d32b6196ffc623f85381b9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 02:03:14 -0700 Subject: [PATCH 0960/1533] Update GraphDef version to 408. PiperOrigin-RevId: 312638937 Change-Id: I0f4e28e19b9950a791269b68294d1620366c8492 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 6c6c46980d9..9db20363349 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 407 // Updated: 2020/5/20 +#define TF_GRAPH_DEF_VERSION 408 // Updated: 2020/5/21 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From fa0721cfbd93a1506d39735296a260a877354e6c Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 21 May 2020 02:03:18 -0700 Subject: [PATCH 0961/1533] compat: Update forward compatibility horizon to 2020-05-21 PiperOrigin-RevId: 312638952 Change-Id: I8d2533185f0976f307bc26dfe50b90e12ad300ad --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 9bc9ca973c2..58b777a1310 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 20) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 21) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 880ea5d754b9253265c8b9782289c9d64a39674a Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Thu, 21 May 2020 03:51:24 -0700 Subject: [PATCH 0962/1533] Export CategoryEncoding keras preprocessing layer. PiperOrigin-RevId: 312647888 Change-Id: I7e117fab8995280246c6e77cb170d85bcf3040b3 --- tensorflow/python/keras/layers/__init__.py | 6 + .../python/keras/layers/preprocessing/BUILD | 22 +- .../layers/preprocessing/benchmarks/BUILD | 6 +- ...mark.py => category_encoding_benchmark.py} | 8 +- ...rical_encoding.py => category_encoding.py} | 117 ++++----- ...=> category_encoding_distribution_test.py} | 31 ++- ...ding_test.py => category_encoding_test.py} | 117 ++++----- ...encoding_v1.py => category_encoding_v1.py} | 13 +- .../preprocessing/text_vectorization.py | 12 +- .../preprocessing/text_vectorization_v1.py | 4 +- .../python/keras/layers/serialization.py | 16 +- ...ing.-category-encoding.__metaclass__.pbtxt | 14 ++ ...tal.preprocessing.-category-encoding.pbtxt | 234 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + ...ing.-category-encoding.__metaclass__.pbtxt | 14 ++ ...tal.preprocessing.-category-encoding.pbtxt | 232 +++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + 17 files changed, 677 insertions(+), 177 deletions(-) rename tensorflow/python/keras/layers/preprocessing/benchmarks/{categorical_encoding_benchmark.py => category_encoding_benchmark.py} (93%) rename tensorflow/python/keras/layers/preprocessing/{categorical_encoding.py => category_encoding.py} (82%) rename tensorflow/python/keras/layers/preprocessing/{categorical_encoding_distribution_test.py => category_encoding_distribution_test.py} (64%) rename tensorflow/python/keras/layers/preprocessing/{categorical_encoding_test.py => category_encoding_test.py} (88%) rename tensorflow/python/keras/layers/preprocessing/{categorical_encoding_v1.py => category_encoding_v1.py} (89%) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt diff --git 
a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py index 67ac91cb9be..e0f087b2453 100644 --- a/tensorflow/python/keras/layers/__init__.py +++ b/tensorflow/python/keras/layers/__init__.py @@ -44,6 +44,9 @@ from tensorflow.python.keras.layers.preprocessing.image_preprocessing import Res # Preprocessing layers. if tf2.enabled(): + from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding + from tensorflow.python.keras.layers.preprocessing.category_encoding_v1 import CategoryEncoding as CategoryEncodingV1 + CategoryEncodingV2 = CategoryEncoding from tensorflow.python.keras.layers.preprocessing.normalization import Normalization from tensorflow.python.keras.layers.preprocessing.normalization_v1 import Normalization as NormalizationV1 NormalizationV2 = Normalization @@ -51,6 +54,9 @@ if tf2.enabled(): from tensorflow.python.keras.layers.preprocessing.text_vectorization_v1 import TextVectorization as TextVectorizationV1 TextVectorizationV2 = TextVectorization else: + from tensorflow.python.keras.layers.preprocessing.category_encoding_v1 import CategoryEncoding + from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding as CategoryEncodingV2 + CategoryEncodingV1 = CategoryEncoding from tensorflow.python.keras.layers.preprocessing.normalization_v1 import Normalization from tensorflow.python.keras.layers.preprocessing.normalization import Normalization as NormalizationV2 NormalizationV1 = Normalization diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index b7fdc17b81d..af7f6392219 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -196,7 +196,7 @@ py_library( ], srcs_version = "PY2AND3", deps = [ - ":categorical_encoding", + ":category_encoding", ":string_lookup", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", @@ -216,10 +216,10 @@ py_library( ) py_library( - name = "categorical_encoding", + name = "category_encoding", srcs = [ - "categorical_encoding.py", - "categorical_encoding_v1.py", + "category_encoding.py", + "category_encoding_v1.py", ], srcs_version = "PY2AND3", deps = [ @@ -308,12 +308,12 @@ cuda_py_test( ) tf_py_test( - name = "categorical_encoding_test", + name = "category_encoding_test", size = "medium", - srcs = ["categorical_encoding_test.py"], + srcs = ["category_encoding_test.py"], python_version = "PY3", deps = [ - ":categorical_encoding", + ":category_encoding", ":preprocessing_test_utils", "//tensorflow/python:client_testlib", "//tensorflow/python/keras", @@ -324,9 +324,9 @@ tf_py_test( ) distribute_py_test( - name = "categorical_encoding_distribution_test", - srcs = ["categorical_encoding_distribution_test.py"], - main = "categorical_encoding_distribution_test.py", + name = "category_encoding_distribution_test", + srcs = ["category_encoding_distribution_test.py"], + main = "category_encoding_distribution_test.py", python_version = "PY3", tags = [ "multi_and_single_gpu", @@ -335,7 +335,7 @@ distribute_py_test( "no_oss", # b/155502591 ], deps = [ - ":categorical_encoding", + ":category_encoding", "//tensorflow/python/distribute:combinations", "//tensorflow/python/distribute:strategy_combinations", "//tensorflow/python/keras", diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD index 6d29126bc7e..7c976880059 100644 --- 
a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD @@ -11,12 +11,12 @@ package( exports_files(["LICENSE"]) tf_py_test( - name = "categorical_encoding_benchmark", - srcs = ["categorical_encoding_benchmark.py"], + name = "category_encoding_benchmark", + srcs = ["category_encoding_benchmark.py"], python_version = "PY3", deps = [ "//tensorflow:tensorflow_py", - "//tensorflow/python/keras/layers/preprocessing:categorical_encoding", + "//tensorflow/python/keras/layers/preprocessing:category_encoding", ], ) diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py b/tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py similarity index 93% rename from tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py rename to tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py index e68b77ebef9..71b4c7b6b61 100644 --- a/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for Keras categorical_encoding preprocessing layer.""" +"""Benchmark for Keras category_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -26,7 +26,7 @@ from tensorflow.python import keras from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes -from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding from tensorflow.python.ops import random_ops from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -42,7 +42,7 @@ class BenchmarkLayer(benchmark.Benchmark): def run_dataset_implementation(self, output_mode, batch_size, sequence_length, max_tokens): input_t = keras.Input(shape=(sequence_length,), dtype=dtypes.int32) - layer = categorical_encoding.CategoricalEncoding( + layer = category_encoding.CategoryEncoding( max_tokens=max_tokens, output_mode=output_mode) _ = layer(input_t) @@ -68,7 +68,7 @@ class BenchmarkLayer(benchmark.Benchmark): ends.append(time.time()) avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches - name = "categorical_encoding|batch_%s|seq_length_%s|%s_max_tokens" % ( + name = "category_encoding|batch_%s|seq_length_%s|%s_max_tokens" % ( batch_size, sequence_length, max_tokens) self.report_benchmark(iters=num_repeats, wall_time=avg_time, name=name) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py b/tensorflow/python/keras/layers/preprocessing/category_encoding.py similarity index 82% rename from tensorflow/python/keras/layers/preprocessing/categorical_encoding.py rename to tensorflow/python/keras/layers/preprocessing/category_encoding.py index 466405a27a9..b0a7e746074 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding.py @@ -12,7 +12,7 @@ # See the License for the specific language governing 
permissions and # limitations under the License. # ============================================================================== -"""Keras text CategoricalEncoding preprocessing layer.""" +"""Keras text CategoryEncoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -32,11 +32,13 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine import base_preprocessing_layer from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops +from tensorflow.python.ops import bincount_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import keras_export TFIDF = "tf-idf" INT = "int" @@ -49,14 +51,26 @@ _NUM_ELEMENTS_NAME = "num_elements" _IDF_NAME = "idf" -class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): - """Categorical encoding layer. +@keras_export("keras.layers.experimental.preprocessing.CategoryEncoding", v1=[]) +class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): + """Category encoding layer. This layer provides options for condensing data into a categorical encoding. It accepts integer values as inputs and outputs a dense representation (one sample = 1-index tensor of float values representing data about the sample's tokens) of those inputs. + Examples: + + >>> layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding( + ... max_tokens=4) + >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]]) + + Attributes: max_tokens: The maximum size of the vocabulary for this layer. If None, there is no cap on the size of the vocabulary. @@ -72,7 +86,6 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): sparse: Boolean. If true, returns a `SparseTensor` instead of a dense `Tensor`. Defaults to `False`. """ - # TODO(momernick): Add an examples section to the docstring. def __init__(self, max_tokens=None, @@ -83,7 +96,7 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): layer_utils.validate_string_arg( output_mode, allowable_strings=(COUNT, BINARY, TFIDF), - layer_name="CategoricalEncoding", + layer_name="CategoryEncoding", arg_name="output_mode") # If max_tokens is set, the value must be greater than 1 - otherwise we @@ -92,10 +105,10 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): raise ValueError("max_tokens must be > 1.") # We need to call super() before we call _add_state_variable(). - combiner = _CategoricalEncodingCombiner( + combiner = _CategoryEncodingCombiner( compute_max_element=max_tokens is None, compute_idf=output_mode == TFIDF) - super(CategoricalEncoding, self).__init__(combiner=combiner, **kwargs) + super(CategoryEncoding, self).__init__(combiner=combiner, **kwargs) self._max_tokens = max_tokens self._output_mode = output_mode @@ -158,13 +171,12 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): RuntimeError: if the layer cannot be adapted at this time. 
""" if not reset_state: - raise ValueError("CategoricalEncoding does not support streaming adapts.") + raise ValueError("CategoryEncoding does not support streaming adapts.") if self._called and self._max_tokens is None: - raise RuntimeError( - "CategoricalEncoding can't be adapted after being called " - "if max_tokens is None.") - super(CategoricalEncoding, self).adapt(data, reset_state) + raise RuntimeError("CategoryEncoding can't be adapted after being called " + "if max_tokens is None.") + super(CategoryEncoding, self).adapt(data, reset_state) def _set_state_variables(self, updates): if not self.built: @@ -180,7 +192,7 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): "output_mode": self._output_mode, "sparse": self._sparse, } - base_config = super(CategoricalEncoding, self).get_config() + base_config = super(CategoryEncoding, self).get_config() return dict(list(base_config.items()) + list(config.items())) def _convert_to_ndarray(self, x): @@ -237,65 +249,40 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): else: out_depth = self._max_tokens - if self._sparse: - if self._output_mode != COUNT: - raise ValueError("Only supports `sparse=True` when `output_mode` " - ' is \"count\", got {}'.format(self._output_mode)) - inputs = self._convert_to_sparse_inputs(inputs) - - # Consider having sparse.one_hot - # Append values to indices, and reduce sum to get the counts. - tokens = array_ops.expand_dims( - math_ops.cast(inputs.values, dtypes.int64), axis=1) - count_tokens = array_ops.concat([inputs.indices, tokens], axis=1) - count_values = array_ops.ones_like(inputs.values, dtype=dtypes.int64) - unreduced_count_shape = array_ops.concat( - [inputs.dense_shape, [out_depth]], axis=0) - counts = sparse_tensor.SparseTensor( - indices=count_tokens, - values=count_values, - dense_shape=unreduced_count_shape) - count_data = sparse_ops.sparse_reduce_sum_v2( - counts, axis=1, output_is_sparse=True) - return count_data - - # If the input is a sparse tensor, we densify it with the default value of - # -1. Because -1 is ignored by one_hot, this effectively drops the non-set - # positions from the output encoding. - if isinstance(inputs, sparse_tensor.SparseTensor): - inputs = sparse_ops.sparse_tensor_to_dense(inputs, default_value=-1) - - if self._output_mode == BINARY: - bool_one_hot_data = array_ops.one_hot( - inputs, depth=out_depth, on_value=True, off_value=False) - reduced_bool_data = math_ops.reduce_any(bool_one_hot_data, axis=1) - binary_data = math_ops.cast(reduced_bool_data, dtypes.int64) - binary_data.set_shape(tensor_shape.TensorShape((None, out_depth))) - return binary_data - - one_hot_data = array_ops.one_hot(inputs, depth=out_depth) - counts = math_ops.reduce_sum(one_hot_data, axis=1) - if self._output_mode == COUNT: - count_data = math_ops.cast(counts, dtypes.int64) - count_data.set_shape(tensor_shape.TensorShape((None, out_depth))) - return count_data - - tf_idf_data = math_ops.multiply(counts, self.tf_idf_weights) - tf_idf_data.set_shape(tensor_shape.TensorShape((None, out_depth))) if self._output_mode == TFIDF: + # If the input is a sparse tensor, we densify it with the default value of + # -1. Because -1 is ignored by one_hot, this effectively drops the non-set + # positions from the output encoding. 
+ if isinstance(inputs, sparse_tensor.SparseTensor): + inputs = sparse_ops.sparse_tensor_to_dense(inputs, default_value=-1) + one_hot_data = array_ops.one_hot(inputs, depth=out_depth) + counts = math_ops.reduce_sum(one_hot_data, axis=1) + tf_idf_data = math_ops.multiply(counts, self.tf_idf_weights) + tf_idf_data.set_shape(tensor_shape.TensorShape((None, out_depth))) return tf_idf_data - # We can only get here if we didn't recognize the passed mode. - raise ValueError("Unknown output mode %s" % self._output_mode) + binary_output = (self._output_mode == BINARY) + if self._sparse: + return bincount_ops.sparse_bincount( + inputs, minlength=out_depth, axis=-1, binary_output=binary_output) + else: + result = bincount_ops.bincount( + inputs, + minlength=out_depth, + dtype=dtypes.int64, + axis=-1, + binary_output=binary_output) + result.set_shape(tensor_shape.TensorShape((None, out_depth))) + return result -class _CategoricalEncodingAccumulator( +class _CategoryEncodingAccumulator( collections.namedtuple("Accumulator", ["data", "per_doc_count_dict"])): pass -class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): - """Combiner for the CategoricalEncoding preprocessing layer. +class _CategoryEncodingCombiner(base_preprocessing_layer.Combiner): + """Combiner for the CategoryEncoding preprocessing layer. This class encapsulates the logic for computing the number of elements in the input dataset and the document frequency for each element. @@ -411,7 +398,7 @@ class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): def restore(self, output): """Creates an accumulator based on 'output'.""" raise NotImplementedError( - "CategoricalEncoding does not restore or support streaming updates.") + "CategoryEncoding does not restore or support streaming updates.") def serialize(self, accumulator): """Serializes an accumulator for a remote call.""" @@ -452,4 +439,4 @@ class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): else: per_doc_count_dict = None data = [0, 0] - return _CategoricalEncodingAccumulator(data, per_doc_count_dict) + return _CategoryEncodingAccumulator(data, per_doc_count_dict) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py similarity index 64% rename from tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py rename to tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py index c5214533f94..011495b9314 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py @@ -21,39 +21,58 @@ from __future__ import print_function import numpy as np from tensorflow.python import keras +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.distribute import tpu_strategy +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.keras import keras_parameterized -from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils from tensorflow.python.platform import test +def 
batch_wrapper(dataset, batch_size, distribution, repeat=None): + if repeat: + dataset = dataset.repeat(repeat) + # TPUs currently require fully defined input shapes, drop_remainder ensures + # the input will have fully defined shapes. + if isinstance(distribution, + (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): + return dataset.batch(batch_size, drop_remainder=True) + else: + return dataset.batch(batch_size) + + @combinations.generate( combinations.combine( - distribution=strategy_combinations.all_strategies, + # (b/156783625): Outside compilation failed for eager mode only. + distribution=strategy_combinations.strategies_minus_tpu, mode=["eager", "graph"])) -class CategoricalEncodingDistributionTest( +class CategoryEncodingDistributionTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): def test_distribution(self, distribution): input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) + inp_dataset = dataset_ops.DatasetV2.from_tensor_slices(input_array) + inp_dataset = batch_wrapper(inp_dataset, 2, distribution) # pyformat: disable expected_output = [[0, 1, 1, 1, 0, 0], [1, 1, 0, 1, 0, 0]] # pyformat: enable max_tokens = 6 + config.set_soft_device_placement(True) with distribution.scope(): input_data = keras.Input(shape=(4,), dtype=dtypes.int32) - layer = categorical_encoding.CategoricalEncoding( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + layer = category_encoding.CategoryEncoding( + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) + output_dataset = model.predict(inp_dataset) self.assertAllEqual(expected_output, output_dataset) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py similarity index 88% rename from tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py rename to tensorflow/python/keras/layers/preprocessing/category_encoding_test.py index e21e95a0078..24eeda57b1f 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for Keras text categorical_encoding preprocessing layer.""" +"""Tests for Keras text category_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division @@ -32,8 +32,8 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras.layers import core -from tensorflow.python.keras.layers.preprocessing import categorical_encoding -from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 +from tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops @@ -44,15 +44,15 @@ from tensorflow.python.platform import test def get_layer_class(): if context.executing_eagerly(): - return categorical_encoding.CategoricalEncoding + return category_encoding.CategoryEncoding else: - return categorical_encoding_v1.CategoricalEncoding + return category_encoding_v1.CategoryEncoding @keras_parameterized.run_all_keras_modes(always_skip_v1=True) -class CategoricalEncodingInputTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class CategoryEncodingInputTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): def test_dense_input_sparse_output(self): input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]]) @@ -67,9 +67,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, - sparse=True) + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -80,7 +78,7 @@ class CategoricalEncodingInputTest( # Assert sparse output is same as dense output. layer = get_layer_class()( max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, + output_mode=category_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -103,7 +101,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -128,9 +126,7 @@ class CategoricalEncodingInputTest( max_tokens = 6 layer = get_layer_class()( - max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, - sparse=True) + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -141,7 +137,7 @@ class CategoricalEncodingInputTest( # Assert sparse output is same as dense output. 
layer = get_layer_class()( max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, + output_mode=category_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -163,7 +159,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -184,9 +180,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, - sparse=True) + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -197,7 +191,7 @@ class CategoricalEncodingInputTest( # Assert sparse output is same as dense output. layer = get_layer_class()( max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, + output_mode=category_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -214,9 +208,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) encoding_layer = get_layer_class()( - max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, - sparse=True) + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) int_data = encoding_layer(input_data) output_data = math_ops.cast(int_data, dtypes.float32) weights = variables.Variable([[.1], [.2], [.3], [.4]], dtype=dtypes.float32) @@ -228,9 +220,9 @@ class CategoricalEncodingInputTest( @keras_parameterized.run_all_keras_modes -class CategoricalEncodingAdaptTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class CategoryEncodingAdaptTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): def test_sparse_adapt(self): vocab_data = sparse_ops.from_dense( @@ -248,7 +240,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) layer.adapt(vocab_dataset) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -273,7 +265,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) layer.adapt(vocab_dataset) int_data = layer(input_data) @@ -296,7 +288,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) layer.adapt(vocab_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -306,7 +298,7 @@ class CategoricalEncodingAdaptTest( self.assertAllEqual(expected_output, output_dataset) def test_hard_maximum_set_state_variables_after_build(self): - state_variables = 
{categorical_encoding._NUM_ELEMENTS_NAME: 5} + state_variables = {category_encoding._NUM_ELEMENTS_NAME: 5} input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) # pyformat: disable @@ -318,7 +310,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) layer._set_state_variables(state_variables) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -339,7 +331,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) layer.build(input_data.shape) layer.set_num_elements(max_tokens) int_data = layer(input_data) @@ -351,8 +343,7 @@ class CategoricalEncodingAdaptTest( def test_set_weights_fails_on_wrong_size_weights(self): tfidf_data = [.05, .5, .25, .2, .125] - layer = get_layer_class()( - max_tokens=6, output_mode=categorical_encoding.TFIDF) + layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.TFIDF) with self.assertRaisesRegex(ValueError, ".*Layer weight shape.*"): layer.set_weights([np.array(tfidf_data)]) @@ -360,7 +351,7 @@ class CategoricalEncodingAdaptTest( def test_set_num_elements_after_call_fails(self): input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "num_elements cannot be changed"): layer.set_num_elements(5) @@ -370,17 +361,17 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "can't be adapted"): layer.adapt(vocab_data) def test_set_state_variables_after_call_fails(self): - state_variables = {categorical_encoding._NUM_ELEMENTS_NAME: 5} + state_variables = {category_encoding._NUM_ELEMENTS_NAME: 5} input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "num_elements cannot be changed"): layer._set_state_variables(state_variables) @@ -388,9 +379,9 @@ class CategoricalEncodingAdaptTest( @keras_parameterized.run_all_keras_modes @keras_parameterized.run_all_keras_modes -class CategoricalEncodingOutputTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class CategoryEncodingOutputTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): def test_binary_output_hard_maximum(self): input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) @@ -404,7 +395,7 @@ class CategoricalEncodingOutputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ 
-424,7 +415,7 @@ class CategoricalEncodingOutputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) layer.set_weights([np.array(max_tokens)]) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -444,8 +435,7 @@ class CategoricalEncodingOutputTest( expected_output_shape = [None, max_tokens] input_data = keras.Input(shape=(None,), dtype=dtypes.int32) - layer = get_layer_class()( - max_tokens=6, output_mode=categorical_encoding.COUNT) + layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.COUNT) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -465,7 +455,7 @@ class CategoricalEncodingOutputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.COUNT) + max_tokens=None, output_mode=category_encoding.COUNT) layer.set_weights([np.array(max_tokens)]) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -488,8 +478,7 @@ class CategoricalEncodingOutputTest( expected_output_shape = [None, max_tokens] input_data = keras.Input(shape=(None,), dtype=dtypes.int32) - layer = get_layer_class()( - max_tokens=6, output_mode=categorical_encoding.TFIDF) + layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.TFIDF) layer.set_tfidf_data(tfidf_data) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -513,7 +502,7 @@ class CategoricalEncodingOutputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.TFIDF) + max_tokens=None, output_mode=category_encoding.TFIDF) layer.set_num_elements(max_tokens) layer.set_tfidf_data(tfidf_data) int_data = layer(input_data) @@ -524,7 +513,7 @@ class CategoricalEncodingOutputTest( self.assertAllClose(expected_output, output_dataset) -class CategoricalEncodingModelBuildingTest( +class CategoryEncodingModelBuildingTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -532,27 +521,27 @@ class CategoricalEncodingModelBuildingTest( { "testcase_name": "count_hard_max", "max_tokens": 5, - "output_mode": categorical_encoding.COUNT + "output_mode": category_encoding.COUNT }, { "testcase_name": "count_soft_max", "max_tokens": None, - "output_mode": categorical_encoding.COUNT + "output_mode": category_encoding.COUNT }, { "testcase_name": "binary_hard_max", "max_tokens": 5, - "output_mode": categorical_encoding.BINARY + "output_mode": category_encoding.BINARY }, { "testcase_name": "binary_soft_max", "max_tokens": None, - "output_mode": categorical_encoding.BINARY + "output_mode": category_encoding.BINARY }, { "testcase_name": "tfidf_hard_max", "max_tokens": 5, - "output_mode": categorical_encoding.TFIDF + "output_mode": category_encoding.TFIDF }, { "testcase_name": "tfidf_soft_max", "max_tokens": None, - "output_mode": categorical_encoding.TFIDF + "output_mode": category_encoding.TFIDF }) def test_end_to_end_bagged_modeling(self, output_mode, max_tokens): tfidf_data = np.array([.03, .5, .25, .2, .125]) @@ -564,7 +553,7 @@ class CategoricalEncodingModelBuildingTest( weights = [] if max_tokens is None: weights.append(np.array(5)) - if output_mode == categorical_encoding.TFIDF: + if output_mode == 
category_encoding.TFIDF: weights.append(tfidf_data) layer.set_weights(weights) @@ -577,7 +566,7 @@ class CategoricalEncodingModelBuildingTest( @keras_parameterized.run_all_keras_modes -class CategoricalEncodingCombinerTest( +class CategoryEncodingCombinerTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -617,8 +606,7 @@ class CategoricalEncodingCombinerTest( def test_combiner_api_compatibility_int_mode(self): data = np.array([[1, 2, 3, 4], [1, 2, 3, 0]]) - combiner = categorical_encoding._CategoricalEncodingCombiner( - compute_idf=False) + combiner = category_encoding._CategoryEncodingCombiner(compute_idf=False) expected_accumulator_output = { "max_element": np.array(4), "num_documents": np.array(2), @@ -636,8 +624,7 @@ class CategoricalEncodingCombinerTest( def test_combiner_api_compatibility_tfidf_mode(self): data = np.array([[1, 2, 3, 4], [1, 2, 3, 0]]) - combiner = categorical_encoding._CategoricalEncodingCombiner( - compute_idf=True) + combiner = category_encoding._CategoryEncodingCombiner(compute_idf=True) expected_accumulator_output = { "max_element": np.array(4), "document_counts": np.array([1, 2, 2, 2, 1]), @@ -693,7 +680,7 @@ class CategoricalEncodingCombinerTest( expected_accumulator_output, expected_extract_output, compute_idf=True): - combiner = categorical_encoding._CategoricalEncodingCombiner( + combiner = category_encoding._CategoryEncodingCombiner( compute_idf=compute_idf) expected_accumulator = combiner._create_accumulator() expected_accumulator = self.update_accumulator(expected_accumulator, diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py similarity index 89% rename from tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py rename to tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py index 83128ed5095..3afb86b344f 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py @@ -12,20 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tensorflow V1 version of the text categorical_encoding preprocessing layer.""" +"""Tensorflow V1 version of the text category_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from tensorflow.python.keras.engine import base_preprocessing_layer_v1 -from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.util.tf_export import keras_export -class CategoricalEncoding(categorical_encoding.CategoricalEncoding, - base_preprocessing_layer_v1.CombinerPreprocessingLayer - ): - """CategoricalEncoding layer. +@keras_export(v1=["keras.layers.experimental.preprocessing.CategoryEncoding"]) +class CategoryEncoding(category_encoding.CategoryEncoding, + base_preprocessing_layer_v1.CombinerPreprocessingLayer): + """CategoryEncoding layer. This layer provides options for condensing input data into denser representations. 
It accepts either integer values or strings as inputs, diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 1abc37cb4c3..057575d4ecc 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -26,7 +26,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer -from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding from tensorflow.python.keras.layers.preprocessing import string_lookup from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops @@ -42,10 +42,10 @@ LOWER_AND_STRIP_PUNCTUATION = "lower_and_strip_punctuation" SPLIT_ON_WHITESPACE = "whitespace" -TFIDF = categorical_encoding.TFIDF -INT = categorical_encoding.INT -BINARY = categorical_encoding.BINARY -COUNT = categorical_encoding.COUNT +TFIDF = category_encoding.TFIDF +INT = category_encoding.INT +BINARY = category_encoding.BINARY +COUNT = category_encoding.COUNT # This is an explicit regex of all the tokens that will be stripped if # LOWER_AND_STRIP_PUNCTUATION is set. If an application requires other @@ -307,7 +307,7 @@ class TextVectorization(CombinerPreprocessingLayer): # These are V1/V2 shim points. There are V1 implementations in the V1 class. def _get_vectorization_class(self): - return categorical_encoding.CategoricalEncoding + return category_encoding.CategoryEncoding def _get_index_lookup_class(self): return string_lookup.StringLookup diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py index a7c7b9136f9..505cdc39547 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py @@ -19,7 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.keras.engine import base_preprocessing_layer_v1 -from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 +from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 from tensorflow.python.keras.layers.preprocessing import text_vectorization from tensorflow.python.util.tf_export import keras_export @@ -77,7 +77,7 @@ class TextVectorization(text_vectorization.TextVectorization, """ def _get_vectorization_class(self): - return categorical_encoding_v1.CategoricalEncoding + return category_encoding_v1.CategoryEncoding def _get_index_lookup_class(self): return string_lookup_v1.StringLookup diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 2eb7cff75bb..992ff562755 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -46,6 +46,8 @@ from tensorflow.python.keras.layers import recurrent_v2 from tensorflow.python.keras.layers import rnn_cell_wrapper_v2 from tensorflow.python.keras.layers import wrappers from tensorflow.python.keras.layers.preprocessing import category_crossing +from 
tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 from tensorflow.python.keras.layers.preprocessing import hashing from tensorflow.python.keras.layers.preprocessing import image_preprocessing from tensorflow.python.keras.layers.preprocessing import normalization as preprocessing_normalization @@ -61,15 +63,11 @@ ALL_MODULES = (base_layer, input_layer, advanced_activations, convolutional, convolutional_recurrent, core, cudnn_recurrent, dense_attention, embeddings, einsum_dense, local, merge, noise, normalization, pooling, image_preprocessing, preprocessing_normalization_v1, - preprocessing_text_vectorization_v1, - recurrent, wrappers, hashing, category_crossing) -ALL_V2_MODULES = ( - rnn_cell_wrapper_v2, - normalization_v2, - recurrent_v2, - preprocessing_normalization, - preprocessing_text_vectorization -) + preprocessing_text_vectorization_v1, recurrent, wrappers, + hashing, category_crossing, category_encoding_v1) +ALL_V2_MODULES = (rnn_cell_wrapper_v2, normalization_v2, recurrent_v2, + preprocessing_normalization, preprocessing_text_vectorization, + category_encoding) # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. LOCAL = threading.local() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt new file mode 100644 index 00000000000..e907d9a293b --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt new file mode 100644 index 00000000000..165a6de49a8 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt @@ -0,0 +1,234 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: 
"outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_tokens\', \'output_mode\', \'sparse\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'count\', \'False\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', 
\'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_num_elements" + argspec: "args=[\'self\', \'num_elements\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_tfidf_data" + argspec: "args=[\'self\', \'tfidf_data\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index c93b8a89fb8..a922b143910 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "CategoryCrossing" mtype: "" } + member { + name: "CategoryEncoding" + mtype: "" + } member { name: "CenterCrop" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt new file mode 100644 index 00000000000..e907d9a293b --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt new file mode 100644 index 00000000000..2edcfbb6487 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt @@ -0,0 +1,232 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } 
+ member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_tokens\', \'output_mode\', \'sparse\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'count\', \'False\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: 
"get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_num_elements" + argspec: "args=[\'self\', \'num_elements\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_tfidf_data" + argspec: "args=[\'self\', \'tfidf_data\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index c93b8a89fb8..a922b143910 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "CategoryCrossing" mtype: "" } + member { + name: "CategoryEncoding" + mtype: "" + } member { name: "CenterCrop" mtype: "" From 1d096667e9632dd07cba95e732aa3357315b4f5a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 03:56:23 -0700 Subject: [PATCH 0963/1533] Add Int8 PRelu operation to TFLM. 
PiperOrigin-RevId: 312648260 Change-Id: Ia1ad20f9b3ca4476d145bd0fdc1bd8f59d6a3c44 --- tensorflow/lite/micro/kernels/prelu.cc | 15 +++++++ tensorflow/lite/micro/kernels/prelu_test.cc | 44 +++++++++++++++++---- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/micro/kernels/prelu.cc b/tensorflow/lite/micro/kernels/prelu.cc index 2c575269cca..801181abba4 100644 --- a/tensorflow/lite/micro/kernels/prelu.cc +++ b/tensorflow/lite/micro/kernels/prelu.cc @@ -102,6 +102,21 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { GetTensorShape(output), GetTensorData(output)); return kTfLiteOk; } break; + case kTfLiteInt8: { + PreluParams op_params; + op_params.input_offset = -input->params.zero_point; + op_params.alpha_offset = -alpha->params.zero_point; + op_params.output_offset = output->params.zero_point; + op_params.output_multiplier_1 = output_multiplier_1; + op_params.output_shift_1 = output_shift_1; + op_params.output_multiplier_2 = output_multiplier_2; + op_params.output_shift_2 = output_shift_2; + reference_ops::BroadcastPrelu4DSlow( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; + } break; default: TF_LITE_KERNEL_LOG( context, "Only float32 and uint8 are supported currently, got %d.", diff --git a/tensorflow/lite/micro/kernels/prelu_test.cc b/tensorflow/lite/micro/kernels/prelu_test.cc index d6c851a2726..66c0a609e8a 100644 --- a/tensorflow/lite/micro/kernels/prelu_test.cc +++ b/tensorflow/lite/micro/kernels/prelu_test.cc @@ -82,16 +82,18 @@ void TestPreluFloat(std::initializer_list input_dims_data, } } +// Template argument T can be either uint8_t or int8_t depending on which type +// of quantization required to be tested. 
+template void TestPreluQuantized(std::initializer_list input_dims_data, - std::initializer_list input_data, - float input_min, float input_max, + std::initializer_list input_data, float input_min, + float input_max, std::initializer_list alpha_dims_data, - std::initializer_list alpha_data, - float alpha_min, float alpha_max, - std::initializer_list expected_output_data, + std::initializer_list alpha_data, float alpha_min, + float alpha_max, + std::initializer_list expected_output_data, std::initializer_list output_dims_data, - float output_min, float output_max, - uint8_t* output_data) { + float output_min, float output_max, T* output_data) { TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); TfLiteIntArray* alpha_dims = IntArrayFromInitializer(alpha_dims_data); TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); @@ -173,7 +175,7 @@ TF_LITE_MICRO_TEST(FloatPreluActivationsOpTest) { output_data); } -TF_LITE_MICRO_TEST(QuantizedPreluActivationsOpTest) { +TF_LITE_MICRO_TEST(QuantizedUint8PreluActivationsOpTest) { using tflite::testing::F2Q; const float kMin = -1; const float kMax = 127.f / 128.f; @@ -200,4 +202,30 @@ TF_LITE_MICRO_TEST(QuantizedPreluActivationsOpTest) { kMin, kMax, output_data); } +TF_LITE_MICRO_TEST(QuantizedInt8PreluActivationsOpTest) { + using tflite::testing::F2QS; + const float kMin = -1; + const float kMax = 127.f / 128.f; + const float kAlphaMin = -0.5f; + const float kAlphaMax = 0.5f; + const int output_dims_count = 12; + int8_t output_data[output_dims_count]; + tflite::testing::TestPreluQuantized( + {1, 2, 2, 3}, // input shape + {F2QS(0.0f, kMin, kMax), F2QS(0.0f, kMin, kMax), F2QS(0.0f, kMin, kMax), + F2QS(0.5f, kMin, kMax), F2QS(0.5f, kMin, kMax), F2QS(0.5f, kMin, kMax), + F2QS(-1.0f, kMin, kMax), F2QS(-1.0f, kMin, kMax), + F2QS(-1.0f, kMin, kMax), F2QS(-0.25f, kMin, kMax), + F2QS(-0.25f, kMin, kMax), F2QS(-0.25f, kMin, kMax)}, + kMin, kMax, {1, 1, 1, 3}, // alpha shape + {F2QS(0.0f, kMin, kMax), F2QS(0.5f, kMin, kMax), F2QS(-0.5f, kMin, kMax)}, + kMin, kMax, + {F2QS(0.0f, kMin, kMax), F2QS(0.0f, kMin, kMax), F2QS(0.0f, kMin, kMax), + F2QS(0.5f, kMin, kMax), F2QS(0.5f, kMin, kMax), F2QS(0.5f, kMin, kMax), + F2QS(0.0f, kMin, kMax), F2QS(-0.5f, kMin, kMax), F2QS(0.5f, kMin, kMax), + F2QS(0.0f, kMin, kMax), F2QS(-0.125f, kMin, kMax), + F2QS(0.125f, kMin, kMax)}, + {1, 2, 2, 3}, // output shape + kMin, kMax, output_data); +} TF_LITE_MICRO_TESTS_END From dabd045cad1cc555ffafd2797a43a6be576b46e6 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 21 May 2020 04:31:22 -0700 Subject: [PATCH 0964/1533] Internal change PiperOrigin-RevId: 312651124 Change-Id: Ibbc245cc3814be993ad13d661b3f555e78152a2e --- tensorflow/python/keras/layers/__init__.py | 6 - .../python/keras/layers/preprocessing/BUILD | 22 +- .../layers/preprocessing/benchmarks/BUILD | 6 +- ...k.py => categorical_encoding_benchmark.py} | 8 +- ...ry_encoding.py => categorical_encoding.py} | 117 +++++---- ...categorical_encoding_distribution_test.py} | 31 +-- ...g_test.py => categorical_encoding_test.py} | 117 +++++---- ...oding_v1.py => categorical_encoding_v1.py} | 13 +- .../preprocessing/text_vectorization.py | 12 +- .../preprocessing/text_vectorization_v1.py | 4 +- .../python/keras/layers/serialization.py | 16 +- ...ing.-category-encoding.__metaclass__.pbtxt | 14 -- ...tal.preprocessing.-category-encoding.pbtxt | 234 ------------------ ...as.layers.experimental.preprocessing.pbtxt | 4 - ...ing.-category-encoding.__metaclass__.pbtxt | 14 -- ...tal.preprocessing.-category-encoding.pbtxt | 232 ----------------- ...as.layers.experimental.preprocessing.pbtxt | 4 - 17 files changed, 177 insertions(+), 677 deletions(-) rename tensorflow/python/keras/layers/preprocessing/benchmarks/{category_encoding_benchmark.py => categorical_encoding_benchmark.py} (93%) rename tensorflow/python/keras/layers/preprocessing/{category_encoding.py => categorical_encoding.py} (82%) rename tensorflow/python/keras/layers/preprocessing/{category_encoding_distribution_test.py => categorical_encoding_distribution_test.py} (64%) rename tensorflow/python/keras/layers/preprocessing/{category_encoding_test.py => categorical_encoding_test.py} (88%) rename tensorflow/python/keras/layers/preprocessing/{category_encoding_v1.py => categorical_encoding_v1.py} (89%) delete mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt delete mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt delete mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py index e0f087b2453..67ac91cb9be 100644 --- a/tensorflow/python/keras/layers/__init__.py +++ b/tensorflow/python/keras/layers/__init__.py @@ -44,9 +44,6 @@ from tensorflow.python.keras.layers.preprocessing.image_preprocessing import Res # Preprocessing layers. 
if tf2.enabled(): - from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding - from tensorflow.python.keras.layers.preprocessing.category_encoding_v1 import CategoryEncoding as CategoryEncodingV1 - CategoryEncodingV2 = CategoryEncoding from tensorflow.python.keras.layers.preprocessing.normalization import Normalization from tensorflow.python.keras.layers.preprocessing.normalization_v1 import Normalization as NormalizationV1 NormalizationV2 = Normalization @@ -54,9 +51,6 @@ if tf2.enabled(): from tensorflow.python.keras.layers.preprocessing.text_vectorization_v1 import TextVectorization as TextVectorizationV1 TextVectorizationV2 = TextVectorization else: - from tensorflow.python.keras.layers.preprocessing.category_encoding_v1 import CategoryEncoding - from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding as CategoryEncodingV2 - CategoryEncodingV1 = CategoryEncoding from tensorflow.python.keras.layers.preprocessing.normalization_v1 import Normalization from tensorflow.python.keras.layers.preprocessing.normalization import Normalization as NormalizationV2 NormalizationV1 = Normalization diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index af7f6392219..b7fdc17b81d 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -196,7 +196,7 @@ py_library( ], srcs_version = "PY2AND3", deps = [ - ":category_encoding", + ":categorical_encoding", ":string_lookup", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", @@ -216,10 +216,10 @@ py_library( ) py_library( - name = "category_encoding", + name = "categorical_encoding", srcs = [ - "category_encoding.py", - "category_encoding_v1.py", + "categorical_encoding.py", + "categorical_encoding_v1.py", ], srcs_version = "PY2AND3", deps = [ @@ -308,12 +308,12 @@ cuda_py_test( ) tf_py_test( - name = "category_encoding_test", + name = "categorical_encoding_test", size = "medium", - srcs = ["category_encoding_test.py"], + srcs = ["categorical_encoding_test.py"], python_version = "PY3", deps = [ - ":category_encoding", + ":categorical_encoding", ":preprocessing_test_utils", "//tensorflow/python:client_testlib", "//tensorflow/python/keras", @@ -324,9 +324,9 @@ tf_py_test( ) distribute_py_test( - name = "category_encoding_distribution_test", - srcs = ["category_encoding_distribution_test.py"], - main = "category_encoding_distribution_test.py", + name = "categorical_encoding_distribution_test", + srcs = ["categorical_encoding_distribution_test.py"], + main = "categorical_encoding_distribution_test.py", python_version = "PY3", tags = [ "multi_and_single_gpu", @@ -335,7 +335,7 @@ distribute_py_test( "no_oss", # b/155502591 ], deps = [ - ":category_encoding", + ":categorical_encoding", "//tensorflow/python/distribute:combinations", "//tensorflow/python/distribute:strategy_combinations", "//tensorflow/python/keras", diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD index 7c976880059..6d29126bc7e 100644 --- a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD @@ -11,12 +11,12 @@ package( exports_files(["LICENSE"]) tf_py_test( - name = "category_encoding_benchmark", - srcs = ["category_encoding_benchmark.py"], + name = "categorical_encoding_benchmark", + srcs = 
["categorical_encoding_benchmark.py"], python_version = "PY3", deps = [ "//tensorflow:tensorflow_py", - "//tensorflow/python/keras/layers/preprocessing:category_encoding", + "//tensorflow/python/keras/layers/preprocessing:categorical_encoding", ], ) diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py b/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py similarity index 93% rename from tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py rename to tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py index 71b4c7b6b61..e68b77ebef9 100644 --- a/tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for Keras category_encoding preprocessing layer.""" +"""Benchmark for Keras categorical_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -26,7 +26,7 @@ from tensorflow.python import keras from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes -from tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.keras.layers.preprocessing import categorical_encoding from tensorflow.python.ops import random_ops from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -42,7 +42,7 @@ class BenchmarkLayer(benchmark.Benchmark): def run_dataset_implementation(self, output_mode, batch_size, sequence_length, max_tokens): input_t = keras.Input(shape=(sequence_length,), dtype=dtypes.int32) - layer = category_encoding.CategoryEncoding( + layer = categorical_encoding.CategoricalEncoding( max_tokens=max_tokens, output_mode=output_mode) _ = layer(input_t) @@ -68,7 +68,7 @@ class BenchmarkLayer(benchmark.Benchmark): ends.append(time.time()) avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches - name = "category_encoding|batch_%s|seq_length_%s|%s_max_tokens" % ( + name = "categorical_encoding|batch_%s|seq_length_%s|%s_max_tokens" % ( batch_size, sequence_length, max_tokens) self.report_benchmark(iters=num_repeats, wall_time=avg_time, name=name) diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding.py b/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py similarity index 82% rename from tensorflow/python/keras/layers/preprocessing/category_encoding.py rename to tensorflow/python/keras/layers/preprocessing/categorical_encoding.py index b0a7e746074..466405a27a9 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_encoding.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Keras text CategoryEncoding preprocessing layer.""" +"""Keras text CategoricalEncoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -32,13 +32,11 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine import base_preprocessing_layer from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops -from tensorflow.python.ops import bincount_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import compat -from tensorflow.python.util.tf_export import keras_export TFIDF = "tf-idf" INT = "int" @@ -51,26 +49,14 @@ _NUM_ELEMENTS_NAME = "num_elements" _IDF_NAME = "idf" -@keras_export("keras.layers.experimental.preprocessing.CategoryEncoding", v1=[]) -class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): - """Category encoding layer. +class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): + """Categorical encoding layer. This layer provides options for condensing data into a categorical encoding. It accepts integer values as inputs and outputs a dense representation (one sample = 1-index tensor of float values representing data about the sample's tokens) of those inputs. - Examples: - - >>> layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding( - ... max_tokens=4) - >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]]) - - Attributes: max_tokens: The maximum size of the vocabulary for this layer. If None, there is no cap on the size of the vocabulary. @@ -86,6 +72,7 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): sparse: Boolean. If true, returns a `SparseTensor` instead of a dense `Tensor`. Defaults to `False`. """ + # TODO(momernick): Add an examples section to the docstring. def __init__(self, max_tokens=None, @@ -96,7 +83,7 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): layer_utils.validate_string_arg( output_mode, allowable_strings=(COUNT, BINARY, TFIDF), - layer_name="CategoryEncoding", + layer_name="CategoricalEncoding", arg_name="output_mode") # If max_tokens is set, the value must be greater than 1 - otherwise we @@ -105,10 +92,10 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): raise ValueError("max_tokens must be > 1.") # We need to call super() before we call _add_state_variable(). - combiner = _CategoryEncodingCombiner( + combiner = _CategoricalEncodingCombiner( compute_max_element=max_tokens is None, compute_idf=output_mode == TFIDF) - super(CategoryEncoding, self).__init__(combiner=combiner, **kwargs) + super(CategoricalEncoding, self).__init__(combiner=combiner, **kwargs) self._max_tokens = max_tokens self._output_mode = output_mode @@ -171,12 +158,13 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): RuntimeError: if the layer cannot be adapted at this time. 
""" if not reset_state: - raise ValueError("CategoryEncoding does not support streaming adapts.") + raise ValueError("CategoricalEncoding does not support streaming adapts.") if self._called and self._max_tokens is None: - raise RuntimeError("CategoryEncoding can't be adapted after being called " - "if max_tokens is None.") - super(CategoryEncoding, self).adapt(data, reset_state) + raise RuntimeError( + "CategoricalEncoding can't be adapted after being called " + "if max_tokens is None.") + super(CategoricalEncoding, self).adapt(data, reset_state) def _set_state_variables(self, updates): if not self.built: @@ -192,7 +180,7 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): "output_mode": self._output_mode, "sparse": self._sparse, } - base_config = super(CategoryEncoding, self).get_config() + base_config = super(CategoricalEncoding, self).get_config() return dict(list(base_config.items()) + list(config.items())) def _convert_to_ndarray(self, x): @@ -249,40 +237,65 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): else: out_depth = self._max_tokens + if self._sparse: + if self._output_mode != COUNT: + raise ValueError("Only supports `sparse=True` when `output_mode` " + ' is \"count\", got {}'.format(self._output_mode)) + inputs = self._convert_to_sparse_inputs(inputs) + + # Consider having sparse.one_hot + # Append values to indices, and reduce sum to get the counts. + tokens = array_ops.expand_dims( + math_ops.cast(inputs.values, dtypes.int64), axis=1) + count_tokens = array_ops.concat([inputs.indices, tokens], axis=1) + count_values = array_ops.ones_like(inputs.values, dtype=dtypes.int64) + unreduced_count_shape = array_ops.concat( + [inputs.dense_shape, [out_depth]], axis=0) + counts = sparse_tensor.SparseTensor( + indices=count_tokens, + values=count_values, + dense_shape=unreduced_count_shape) + count_data = sparse_ops.sparse_reduce_sum_v2( + counts, axis=1, output_is_sparse=True) + return count_data + + # If the input is a sparse tensor, we densify it with the default value of + # -1. Because -1 is ignored by one_hot, this effectively drops the non-set + # positions from the output encoding. + if isinstance(inputs, sparse_tensor.SparseTensor): + inputs = sparse_ops.sparse_tensor_to_dense(inputs, default_value=-1) + + if self._output_mode == BINARY: + bool_one_hot_data = array_ops.one_hot( + inputs, depth=out_depth, on_value=True, off_value=False) + reduced_bool_data = math_ops.reduce_any(bool_one_hot_data, axis=1) + binary_data = math_ops.cast(reduced_bool_data, dtypes.int64) + binary_data.set_shape(tensor_shape.TensorShape((None, out_depth))) + return binary_data + + one_hot_data = array_ops.one_hot(inputs, depth=out_depth) + counts = math_ops.reduce_sum(one_hot_data, axis=1) + if self._output_mode == COUNT: + count_data = math_ops.cast(counts, dtypes.int64) + count_data.set_shape(tensor_shape.TensorShape((None, out_depth))) + return count_data + + tf_idf_data = math_ops.multiply(counts, self.tf_idf_weights) + tf_idf_data.set_shape(tensor_shape.TensorShape((None, out_depth))) if self._output_mode == TFIDF: - # If the input is a sparse tensor, we densify it with the default value of - # -1. Because -1 is ignored by one_hot, this effectively drops the non-set - # positions from the output encoding. 
- if isinstance(inputs, sparse_tensor.SparseTensor): - inputs = sparse_ops.sparse_tensor_to_dense(inputs, default_value=-1) - one_hot_data = array_ops.one_hot(inputs, depth=out_depth) - counts = math_ops.reduce_sum(one_hot_data, axis=1) - tf_idf_data = math_ops.multiply(counts, self.tf_idf_weights) - tf_idf_data.set_shape(tensor_shape.TensorShape((None, out_depth))) return tf_idf_data - binary_output = (self._output_mode == BINARY) - if self._sparse: - return bincount_ops.sparse_bincount( - inputs, minlength=out_depth, axis=-1, binary_output=binary_output) - else: - result = bincount_ops.bincount( - inputs, - minlength=out_depth, - dtype=dtypes.int64, - axis=-1, - binary_output=binary_output) - result.set_shape(tensor_shape.TensorShape((None, out_depth))) - return result + # We can only get here if we didn't recognize the passed mode. + raise ValueError("Unknown output mode %s" % self._output_mode) -class _CategoryEncodingAccumulator( +class _CategoricalEncodingAccumulator( collections.namedtuple("Accumulator", ["data", "per_doc_count_dict"])): pass -class _CategoryEncodingCombiner(base_preprocessing_layer.Combiner): - """Combiner for the CategoryEncoding preprocessing layer. +class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): + """Combiner for the CategoricalEncoding preprocessing layer. This class encapsulates the logic for computing the number of elements in the input dataset and the document frequency for each element. @@ -398,7 +411,7 @@ class _CategoryEncodingCombiner(base_preprocessing_layer.Combiner): def restore(self, output): """Creates an accumulator based on 'output'.""" raise NotImplementedError( - "CategoryEncoding does not restore or support streaming updates.") + "CategoricalEncoding does not restore or support streaming updates.") def serialize(self, accumulator): """Serializes an accumulator for a remote call.""" @@ -439,4 +452,4 @@ class _CategoryEncodingCombiner(base_preprocessing_layer.Combiner): else: per_doc_count_dict = None data = [0, 0] - return _CategoryEncodingAccumulator(data, per_doc_count_dict) + return _CategoricalEncodingAccumulator(data, per_doc_count_dict) diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py similarity index 64% rename from tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py rename to tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py index 011495b9314..c5214533f94 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py @@ -21,58 +21,39 @@ from __future__ import print_function import numpy as np from tensorflow.python import keras -from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations from tensorflow.python.distribute import strategy_combinations -from tensorflow.python.distribute import tpu_strategy -from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.keras import keras_parameterized -from tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.keras.layers.preprocessing import categorical_encoding from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils from tensorflow.python.platform import test -def 
batch_wrapper(dataset, batch_size, distribution, repeat=None): - if repeat: - dataset = dataset.repeat(repeat) - # TPUs currently require fully defined input shapes, drop_remainder ensures - # the input will have fully defined shapes. - if isinstance(distribution, - (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): - return dataset.batch(batch_size, drop_remainder=True) - else: - return dataset.batch(batch_size) - - @combinations.generate( combinations.combine( - # (b/156783625): Outside compilation failed for eager mode only. - distribution=strategy_combinations.strategies_minus_tpu, + distribution=strategy_combinations.all_strategies, mode=["eager", "graph"])) -class CategoryEncodingDistributionTest( +class CategoricalEncodingDistributionTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): def test_distribution(self, distribution): input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) - inp_dataset = dataset_ops.DatasetV2.from_tensor_slices(input_array) - inp_dataset = batch_wrapper(inp_dataset, 2, distribution) # pyformat: disable expected_output = [[0, 1, 1, 1, 0, 0], [1, 1, 0, 1, 0, 0]] # pyformat: enable max_tokens = 6 - config.set_soft_device_placement(True) with distribution.scope(): input_data = keras.Input(shape=(4,), dtype=dtypes.int32) - layer = category_encoding.CategoryEncoding( - max_tokens=max_tokens, output_mode=category_encoding.BINARY) + layer = categorical_encoding.CategoricalEncoding( + max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(inp_dataset) + output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py similarity index 88% rename from tensorflow/python/keras/layers/preprocessing/category_encoding_test.py rename to tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py index 24eeda57b1f..e21e95a0078 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for Keras text category_encoding preprocessing layer.""" +"""Tests for Keras text categorical_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division @@ -32,8 +32,8 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras.layers import core -from tensorflow.python.keras.layers.preprocessing import category_encoding -from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 +from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops @@ -44,15 +44,15 @@ from tensorflow.python.platform import test def get_layer_class(): if context.executing_eagerly(): - return category_encoding.CategoryEncoding + return categorical_encoding.CategoricalEncoding else: - return category_encoding_v1.CategoryEncoding + return categorical_encoding_v1.CategoricalEncoding @keras_parameterized.run_all_keras_modes(always_skip_v1=True) -class CategoryEncodingInputTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): +class CategoricalEncodingInputTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): def test_dense_input_sparse_output(self): input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]]) @@ -67,7 +67,9 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -78,7 +80,7 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, # Assert sparse output is same as dense output. layer = get_layer_class()( max_tokens=max_tokens, - output_mode=category_encoding.COUNT, + output_mode=categorical_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -101,7 +103,7 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.BINARY) + max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -126,7 +128,9 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, max_tokens = 6 layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -137,7 +141,7 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, # Assert sparse output is same as dense output. 
layer = get_layer_class()( max_tokens=max_tokens, - output_mode=category_encoding.COUNT, + output_mode=categorical_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -159,7 +163,7 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.BINARY) + max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -180,7 +184,9 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -191,7 +197,7 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, # Assert sparse output is same as dense output. layer = get_layer_class()( max_tokens=max_tokens, - output_mode=category_encoding.COUNT, + output_mode=categorical_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -208,7 +214,9 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) encoding_layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) + max_tokens=max_tokens, + output_mode=categorical_encoding.COUNT, + sparse=True) int_data = encoding_layer(input_data) output_data = math_ops.cast(int_data, dtypes.float32) weights = variables.Variable([[.1], [.2], [.3], [.4]], dtype=dtypes.float32) @@ -220,9 +228,9 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, @keras_parameterized.run_all_keras_modes -class CategoryEncodingAdaptTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): +class CategoricalEncodingAdaptTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): def test_sparse_adapt(self): vocab_data = sparse_ops.from_dense( @@ -240,7 +248,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.BINARY) + max_tokens=None, output_mode=categorical_encoding.BINARY) layer.adapt(vocab_dataset) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -265,7 +273,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.BINARY) + max_tokens=None, output_mode=categorical_encoding.BINARY) layer.adapt(vocab_dataset) int_data = layer(input_data) @@ -288,7 +296,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.BINARY) + max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) int_data = layer(input_data) layer.adapt(vocab_data) self.assertAllEqual(expected_output_shape, 
int_data.shape.as_list()) @@ -298,7 +306,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, self.assertAllEqual(expected_output, output_dataset) def test_hard_maximum_set_state_variables_after_build(self): - state_variables = {category_encoding._NUM_ELEMENTS_NAME: 5} + state_variables = {categorical_encoding._NUM_ELEMENTS_NAME: 5} input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) # pyformat: disable @@ -310,7 +318,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.BINARY) + max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) int_data = layer(input_data) layer._set_state_variables(state_variables) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -331,7 +339,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.BINARY) + max_tokens=None, output_mode=categorical_encoding.BINARY) layer.build(input_data.shape) layer.set_num_elements(max_tokens) int_data = layer(input_data) @@ -343,7 +351,8 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, def test_set_weights_fails_on_wrong_size_weights(self): tfidf_data = [.05, .5, .25, .2, .125] - layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.TFIDF) + layer = get_layer_class()( + max_tokens=6, output_mode=categorical_encoding.TFIDF) with self.assertRaisesRegex(ValueError, ".*Layer weight shape.*"): layer.set_weights([np.array(tfidf_data)]) @@ -351,7 +360,7 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, def test_set_num_elements_after_call_fails(self): input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.BINARY) + max_tokens=None, output_mode=categorical_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "num_elements cannot be changed"): layer.set_num_elements(5) @@ -361,17 +370,17 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.BINARY) + max_tokens=None, output_mode=categorical_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "can't be adapted"): layer.adapt(vocab_data) def test_set_state_variables_after_call_fails(self): - state_variables = {category_encoding._NUM_ELEMENTS_NAME: 5} + state_variables = {categorical_encoding._NUM_ELEMENTS_NAME: 5} input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.BINARY) + max_tokens=None, output_mode=categorical_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "num_elements cannot be changed"): layer._set_state_variables(state_variables) @@ -379,9 +388,9 @@ class CategoryEncodingAdaptTest(keras_parameterized.TestCase, @keras_parameterized.run_all_keras_modes @keras_parameterized.run_all_keras_modes -class CategoryEncodingOutputTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest - ): +class CategoricalEncodingOutputTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): def test_binary_output_hard_maximum(self): input_array = np.array([[1, 2, 
3, 1], [0, 3, 1, 0]]) @@ -395,7 +404,7 @@ class CategoryEncodingOutputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=category_encoding.BINARY) + max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -415,7 +424,7 @@ class CategoryEncodingOutputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.BINARY) + max_tokens=None, output_mode=categorical_encoding.BINARY) layer.set_weights([np.array(max_tokens)]) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -435,7 +444,8 @@ class CategoryEncodingOutputTest(keras_parameterized.TestCase, expected_output_shape = [None, max_tokens] input_data = keras.Input(shape=(None,), dtype=dtypes.int32) - layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.COUNT) + layer = get_layer_class()( + max_tokens=6, output_mode=categorical_encoding.COUNT) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -455,7 +465,7 @@ class CategoryEncodingOutputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.COUNT) + max_tokens=None, output_mode=categorical_encoding.COUNT) layer.set_weights([np.array(max_tokens)]) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -478,7 +488,8 @@ class CategoryEncodingOutputTest(keras_parameterized.TestCase, expected_output_shape = [None, max_tokens] input_data = keras.Input(shape=(None,), dtype=dtypes.int32) - layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.TFIDF) + layer = get_layer_class()( + max_tokens=6, output_mode=categorical_encoding.TFIDF) layer.set_tfidf_data(tfidf_data) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -502,7 +513,7 @@ class CategoryEncodingOutputTest(keras_parameterized.TestCase, input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=category_encoding.TFIDF) + max_tokens=None, output_mode=categorical_encoding.TFIDF) layer.set_num_elements(max_tokens) layer.set_tfidf_data(tfidf_data) int_data = layer(input_data) @@ -513,7 +524,7 @@ class CategoryEncodingOutputTest(keras_parameterized.TestCase, self.assertAllClose(expected_output, output_dataset) -class CategoryEncodingModelBuildingTest( +class CategoricalEncodingModelBuildingTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -521,27 +532,27 @@ class CategoryEncodingModelBuildingTest( { "testcase_name": "count_hard_max", "max_tokens": 5, - "output_mode": category_encoding.COUNT + "output_mode": categorical_encoding.COUNT }, { "testcase_name": "count_soft_max", "max_tokens": None, - "output_mode": category_encoding.COUNT + "output_mode": categorical_encoding.COUNT }, { "testcase_name": "binary_hard_max", "max_tokens": 5, - "output_mode": category_encoding.BINARY + "output_mode": categorical_encoding.BINARY }, { "testcase_name": "binary_soft_max", "max_tokens": None, - "output_mode": category_encoding.BINARY + "output_mode": categorical_encoding.BINARY }, { "testcase_name": 
"tfidf_hard_max", "max_tokens": 5, - "output_mode": category_encoding.TFIDF + "output_mode": categorical_encoding.TFIDF }, { "testcase_name": "tfidf_soft_max", "max_tokens": None, - "output_mode": category_encoding.TFIDF + "output_mode": categorical_encoding.TFIDF }) def test_end_to_end_bagged_modeling(self, output_mode, max_tokens): tfidf_data = np.array([.03, .5, .25, .2, .125]) @@ -553,7 +564,7 @@ class CategoryEncodingModelBuildingTest( weights = [] if max_tokens is None: weights.append(np.array(5)) - if output_mode == category_encoding.TFIDF: + if output_mode == categorical_encoding.TFIDF: weights.append(tfidf_data) layer.set_weights(weights) @@ -566,7 +577,7 @@ class CategoryEncodingModelBuildingTest( @keras_parameterized.run_all_keras_modes -class CategoryEncodingCombinerTest( +class CategoricalEncodingCombinerTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -606,7 +617,8 @@ class CategoryEncodingCombinerTest( def test_combiner_api_compatibility_int_mode(self): data = np.array([[1, 2, 3, 4], [1, 2, 3, 0]]) - combiner = category_encoding._CategoryEncodingCombiner(compute_idf=False) + combiner = categorical_encoding._CategoricalEncodingCombiner( + compute_idf=False) expected_accumulator_output = { "max_element": np.array(4), "num_documents": np.array(2), @@ -624,7 +636,8 @@ class CategoryEncodingCombinerTest( def test_combiner_api_compatibility_tfidf_mode(self): data = np.array([[1, 2, 3, 4], [1, 2, 3, 0]]) - combiner = category_encoding._CategoryEncodingCombiner(compute_idf=True) + combiner = categorical_encoding._CategoricalEncodingCombiner( + compute_idf=True) expected_accumulator_output = { "max_element": np.array(4), "document_counts": np.array([1, 2, 2, 2, 1]), @@ -680,7 +693,7 @@ class CategoryEncodingCombinerTest( expected_accumulator_output, expected_extract_output, compute_idf=True): - combiner = category_encoding._CategoryEncodingCombiner( + combiner = categorical_encoding._CategoricalEncodingCombiner( compute_idf=compute_idf) expected_accumulator = combiner._create_accumulator() expected_accumulator = self.update_accumulator(expected_accumulator, diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py b/tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py similarity index 89% rename from tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py rename to tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py index 3afb86b344f..83128ed5095 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py @@ -12,21 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tensorflow V1 version of the text category_encoding preprocessing layer.""" +"""Tensorflow V1 version of the text categorical_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from tensorflow.python.keras.engine import base_preprocessing_layer_v1 -from tensorflow.python.keras.layers.preprocessing import category_encoding -from tensorflow.python.util.tf_export import keras_export +from tensorflow.python.keras.layers.preprocessing import categorical_encoding -@keras_export(v1=["keras.layers.experimental.preprocessing.CategoryEncoding"]) -class CategoryEncoding(category_encoding.CategoryEncoding, - base_preprocessing_layer_v1.CombinerPreprocessingLayer): - """CategoryEncoding layer. +class CategoricalEncoding(categorical_encoding.CategoricalEncoding, + base_preprocessing_layer_v1.CombinerPreprocessingLayer + ): + """CategoricalEncoding layer. This layer provides options for condensing input data into denser representations. It accepts either integer values or strings as inputs, diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 057575d4ecc..1abc37cb4c3 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -26,7 +26,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer -from tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.keras.layers.preprocessing import categorical_encoding from tensorflow.python.keras.layers.preprocessing import string_lookup from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops @@ -42,10 +42,10 @@ LOWER_AND_STRIP_PUNCTUATION = "lower_and_strip_punctuation" SPLIT_ON_WHITESPACE = "whitespace" -TFIDF = category_encoding.TFIDF -INT = category_encoding.INT -BINARY = category_encoding.BINARY -COUNT = category_encoding.COUNT +TFIDF = categorical_encoding.TFIDF +INT = categorical_encoding.INT +BINARY = categorical_encoding.BINARY +COUNT = categorical_encoding.COUNT # This is an explicit regex of all the tokens that will be stripped if # LOWER_AND_STRIP_PUNCTUATION is set. If an application requires other @@ -307,7 +307,7 @@ class TextVectorization(CombinerPreprocessingLayer): # These are V1/V2 shim points. There are V1 implementations in the V1 class. 
def _get_vectorization_class(self): - return category_encoding.CategoryEncoding + return categorical_encoding.CategoricalEncoding def _get_index_lookup_class(self): return string_lookup.StringLookup diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py index 505cdc39547..a7c7b9136f9 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py @@ -19,7 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.keras.engine import base_preprocessing_layer_v1 -from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 +from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 from tensorflow.python.keras.layers.preprocessing import text_vectorization from tensorflow.python.util.tf_export import keras_export @@ -77,7 +77,7 @@ class TextVectorization(text_vectorization.TextVectorization, """ def _get_vectorization_class(self): - return category_encoding_v1.CategoryEncoding + return categorical_encoding_v1.CategoricalEncoding def _get_index_lookup_class(self): return string_lookup_v1.StringLookup diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 992ff562755..2eb7cff75bb 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -46,8 +46,6 @@ from tensorflow.python.keras.layers import recurrent_v2 from tensorflow.python.keras.layers import rnn_cell_wrapper_v2 from tensorflow.python.keras.layers import wrappers from tensorflow.python.keras.layers.preprocessing import category_crossing -from tensorflow.python.keras.layers.preprocessing import category_encoding -from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 from tensorflow.python.keras.layers.preprocessing import hashing from tensorflow.python.keras.layers.preprocessing import image_preprocessing from tensorflow.python.keras.layers.preprocessing import normalization as preprocessing_normalization @@ -63,11 +61,15 @@ ALL_MODULES = (base_layer, input_layer, advanced_activations, convolutional, convolutional_recurrent, core, cudnn_recurrent, dense_attention, embeddings, einsum_dense, local, merge, noise, normalization, pooling, image_preprocessing, preprocessing_normalization_v1, - preprocessing_text_vectorization_v1, recurrent, wrappers, - hashing, category_crossing, category_encoding_v1) -ALL_V2_MODULES = (rnn_cell_wrapper_v2, normalization_v2, recurrent_v2, - preprocessing_normalization, preprocessing_text_vectorization, - category_encoding) + preprocessing_text_vectorization_v1, + recurrent, wrappers, hashing, category_crossing) +ALL_V2_MODULES = ( + rnn_cell_wrapper_v2, + normalization_v2, + recurrent_v2, + preprocessing_normalization, + preprocessing_text_vectorization +) # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. 
LOCAL = threading.local() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt deleted file mode 100644 index e907d9a293b..00000000000 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt +++ /dev/null @@ -1,14 +0,0 @@ -path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding.__metaclass__" -tf_class { - is_instance: "" - member_method { - name: "__init__" - } - member_method { - name: "mro" - } - member_method { - name: "register" - argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt deleted file mode 100644 index 165a6de49a8..00000000000 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt +++ /dev/null @@ -1,234 +0,0 @@ -path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "activity_regularizer" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "dynamic" - mtype: "" - } - member { - name: "inbound_nodes" - mtype: "" - } - member { - name: "input" - mtype: "" - } - member { - name: "input_mask" - mtype: "" - } - member { - name: "input_shape" - mtype: "" - } - member { - name: "input_spec" - mtype: "" - } - member { - name: "losses" - mtype: "" - } - member { - name: "metrics" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "non_trainable_weights" - mtype: "" - } - member { - name: "outbound_nodes" - mtype: "" - } - member { - name: "output" - mtype: "" - } - member { - name: "output_mask" - mtype: "" - } - member { - name: "output_shape" - mtype: "" - } - member { - name: "stateful" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } - member { - name: "trainable_weights" - mtype: "" - } - member { - name: "updates" - mtype: "" - } - member { - name: "variables" - mtype: "" - } - member { - name: "weights" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'max_tokens\', \'output_mode\', \'sparse\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'count\', \'False\'], " - } - member_method { - name: "adapt" - argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " - } - member_method { - name: "add_loss" - argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "add_metric" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " - } - member_method { - name: "add_update" - argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, 
defaults=[\'None\'], " - } - member_method { - name: "add_variable" - argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" - } - member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " - } - member_method { - name: "apply" - argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" - } - member_method { - name: "build" - argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "call" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "compute_output_shape" - argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "compute_output_signature" - argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "count_params" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_input_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_input_mask_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_input_shape_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_losses_for" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_output_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_output_mask_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_output_shape_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_updates_for" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_weights" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "set_num_elements" - argspec: "args=[\'self\', \'num_elements\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "set_tfidf_data" - argspec: "args=[\'self\', \'tfidf_data\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "set_weights" - argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" - } -} diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index a922b143910..c93b8a89fb8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -4,10 +4,6 @@ tf_module { name: "CategoryCrossing" mtype: "" } - member { - name: "CategoryEncoding" - mtype: "" - } member { name: "CenterCrop" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt deleted file mode 100644 index e907d9a293b..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt +++ /dev/null @@ -1,14 +0,0 @@ -path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding.__metaclass__" -tf_class { - is_instance: "" - member_method { - name: "__init__" - } - member_method { - name: "mro" - } - member_method { - name: "register" - argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt deleted file mode 100644 index 2edcfbb6487..00000000000 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt +++ /dev/null @@ -1,232 +0,0 @@ -path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding" -tf_class { - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - is_instance: "" - member { - name: "activity_regularizer" - mtype: "" - } - member { - name: "dtype" - mtype: "" - } - member { - name: "dynamic" - mtype: "" - } - member { - name: "inbound_nodes" - mtype: "" - } - member { - name: "input" - mtype: "" - } - member { - name: "input_mask" - mtype: "" - } - member { - name: "input_shape" - mtype: "" - } - member { - name: "input_spec" - mtype: "" - } - member { - name: "losses" - mtype: "" - } - member { - name: "metrics" - mtype: "" - } - member { - name: "name" - mtype: "" - } - member { - name: "name_scope" - mtype: "" - } - member { - name: "non_trainable_variables" - mtype: "" - } - member { - name: "non_trainable_weights" - mtype: "" - } - member { - name: "outbound_nodes" - mtype: "" - } - member { - name: "output" - mtype: "" - } - member { - name: "output_mask" - mtype: "" - } - member { - name: "output_shape" - mtype: "" - } - member { - name: "stateful" - mtype: "" - } - member { - name: "submodules" - mtype: "" - } - member { - name: "trainable" - mtype: "" - } - member { - name: "trainable_variables" - mtype: "" - } - member { - name: "trainable_weights" - mtype: "" - } - member { - name: "updates" - mtype: "" - } - member { - name: "variables" - mtype: "" - } - member { - name: "weights" - mtype: "" - } - member_method { - name: "__init__" - argspec: "args=[\'self\', \'max_tokens\', \'output_mode\', \'sparse\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'count\', \'False\'], " - } - member_method { - name: "adapt" - argspec: 
"args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " - } - member_method { - name: "add_loss" - argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" - } - member_method { - name: "add_metric" - argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " - } - member_method { - name: "add_update" - argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "add_variable" - argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" - } - member_method { - name: "add_weight" - argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " - } - member_method { - name: "apply" - argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" - } - member_method { - name: "build" - argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "call" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " - } - member_method { - name: "compute_output_shape" - argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "compute_output_signature" - argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "count_params" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "from_config" - argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_config" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_input_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_input_mask_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_input_shape_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_losses_for" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_output_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_output_mask_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_output_shape_at" - argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_updates_for" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "get_weights" - argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "set_num_elements" - argspec: "args=[\'self\', 
\'num_elements\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "set_tfidf_data" - argspec: "args=[\'self\', \'tfidf_data\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "set_weights" - argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" - } - member_method { - name: "with_name_scope" - argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" - } -} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index a922b143910..c93b8a89fb8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -4,10 +4,6 @@ tf_module { name: "CategoryCrossing" mtype: "" } - member { - name: "CategoryEncoding" - mtype: "" - } member { name: "CenterCrop" mtype: "" From 52fd23939e65699f8c7d53850a17daab7cc83177 Mon Sep 17 00:00:00 2001 From: Khanh LeViet Date: Thu, 21 May 2020 05:10:17 -0700 Subject: [PATCH 0965/1533] Update TF Lite roadmap PiperOrigin-RevId: 312654285 Change-Id: I7652bbb1c83dc9cbe3d31d213cfadc90a1c85ec3 --- tensorflow/lite/g3doc/guide/roadmap.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/guide/roadmap.md b/tensorflow/lite/g3doc/guide/roadmap.md index 35ef44a7dbf..b762db12c44 100644 --- a/tensorflow/lite/g3doc/guide/roadmap.md +++ b/tensorflow/lite/g3doc/guide/roadmap.md @@ -1,4 +1,4 @@ -# TensorFlow Lite 2019 Roadmap +# TensorFlow Lite Roadmap **Updated: April 18, 2020** From 5e9effbbae7709aefd9be64180cdf7fb1bd6df87 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Thu, 21 May 2020 08:54:27 -0700 Subject: [PATCH 0966/1533] [TF:TRT] Delay the computation of GraphProperties used by ConvertAfterShapes. Replace input_graph_def and graph_properties in ConversionParams with grappler_item. Also move the computation of GraphProperties used by ConvertAfterShapes to the routine. This is to prepare for a change that transforms certain cast operations inside ConvertAfterShapes before computing the GraphProperties. PiperOrigin-RevId: 312678395 Change-Id: If0197c0880f02766481a2fdd8574cc2c1aafa015 --- .../tf2tensorrt/convert/convert_graph.cc | 37 +++++++++---------- .../tf2tensorrt/convert/convert_graph.h | 6 +-- .../tf2tensorrt/convert/convert_graph_test.cc | 3 +- .../convert/trt_optimization_pass.cc | 8 ++-- 4 files changed, 24 insertions(+), 30 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 806d930b76f..aed422a5627 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -41,14 +41,11 @@ limitations under the License. 
#include "tensorflow/core/grappler/clusters/virtual_cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/devices.h" -#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/optimizers/meta_optimizer.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT #include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT @@ -90,8 +87,6 @@ bool AllowDynamicNonBatchDimension(const ConversionParams& params) { GetEngineType(params) == EngineInfo::EngineType::TRTDynamic; } -} // namespace - struct EdgePtrCompare { bool operator()(const Edge* lhs, const Edge* rhs) const { return lhs->id() < rhs->id(); @@ -555,6 +550,13 @@ Status CreateTRTNode(const ConversionParams& params, return Status::OK(); } +int64 GetNextGraphSequenceNumber() { + static std::atomic graph_sequence_num; + return graph_sequence_num++; +} + +} // namespace + Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, Graph* graph, const string& engine_name) { Graph segment_graph(graph->flib_def()); @@ -629,11 +631,6 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, return std::make_pair(cuda_device_id, dev_allocator); } -int64 GetNextGraphSequenceNumber() { - static std::atomic graph_sequence_num; - return graph_sequence_num++; -} - // Entry function from optimization pass. Status ConvertAfterShapes(const ConversionParams& params) { // Sanity checks. @@ -643,12 +640,15 @@ Status ConvertAfterShapes(const ConversionParams& params) { "Calibration with FP32 or FP16 is not supported."); } + grappler::GraphProperties static_graph_properties(*params.grappler_item); + TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); + + const GraphDef& graph_def = params.grappler_item->graph; // Convert graphdef to graph. 
- FunctionLibraryDefinition flib(OpRegistry::Global(), - params.input_graph_def->library()); + FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library()); Graph graph(flib); - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(), - *params.input_graph_def, &graph)); + TF_RETURN_IF_ERROR( + ConvertGraphDefToGraph(GraphConstructorOptions(), graph_def, &graph)); // Segment the graph into subgraphs that can be converted to TensorRT segment::SegmentOptions segment_options; @@ -662,10 +662,10 @@ Status ConvertAfterShapes(const ConversionParams& params) { AllowDynamicNonBatchDimension(params); segment::SegmentNodesVector initial_segments; - TrtNodeValidator validator(*params.graph_properties, params.precision_mode, + TrtNodeValidator validator(static_graph_properties, params.precision_mode, params.use_calibration, params.use_implicit_batch); TF_RETURN_IF_ERROR(segment::SegmentGraph( - &graph, params.graph_properties, + &graph, &static_graph_properties, std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, std::placeholders::_1), // Input validation is already done by TrtNodeValidator, so we don't @@ -693,9 +693,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { auto& curr_segment = initial_segments.at(t); EngineInfo curr_engine; curr_engine.engine_name = StrCat(engine_name_prefix, t); - Status status = - GetEngineInfo(&graph, *params.graph_properties, curr_segment, node_map, - reverse_topo_order, &curr_engine); + Status status = GetEngineInfo(&graph, static_graph_properties, curr_segment, + node_map, reverse_topo_order, &curr_engine); if (!status.ok()) { LOG(WARNING) << "Failed to get engine info for segment " << t << ": " << status; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 2bfaa2a786c..53ab84a6fa9 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -18,10 +18,9 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" -#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/grappler/clusters/cluster.h" -#include "tensorflow/core/grappler/costs/graph_properties.h" +#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/types.h" @@ -33,7 +32,7 @@ namespace tensorrt { namespace convert { struct ConversionParams { - const GraphDef* input_graph_def = nullptr; + const grappler::GrapplerItem* grappler_item = nullptr; const std::vector* output_names = nullptr; string trt_logger_name; size_t max_batch_size = 1; @@ -41,7 +40,6 @@ struct ConversionParams { GraphDef* output_graph_def = nullptr; TrtPrecisionMode precision_mode = TrtPrecisionMode::FP32; int minimum_segment_size = 3; - const grappler::GraphProperties* graph_properties = nullptr; const grappler::Cluster* cluster = nullptr; // Whether to create engine on conversion or execution time bool is_dyn_op = false; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc index 2cfefd27a67..a1f523d6bfa 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph_test.cc @@ -162,12 +162,11 @@ class ConvertAfterShapesTest : public ::testing::Test { // Construct ConversionParams. 
const std::vector output_names{"output"}; ConversionParams params; - params.input_graph_def = &item.graph; params.output_names = &output_names; params.max_workspace_size_bytes = 8 << 20; params.output_graph_def = output_graph_def; params.minimum_segment_size = 1; - params.graph_properties = &graph_properties; + params.grappler_item = &item; params.use_calibration = false; params.trt_logger_name = "DefaultLogger"; diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 6ab719db54d..72f4fe5ef9b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -228,9 +228,6 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, << "This can result in poor performance."; } } - grappler::GraphProperties static_graph_properties(item); - TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); - ConversionParams cp; if (use_calibration_ && precision_mode_ != TrtPrecisionMode::INT8) { VLOG(1) << "Calibration with FP32 or FP16 is not implemented. " @@ -255,7 +252,9 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, } nodes_to_preserve.push_back(s); } - cp.input_graph_def = &item.graph; + + ConversionParams cp; + cp.grappler_item = &item; cp.output_names = &nodes_to_preserve; cp.trt_logger_name = trt_logger_name_; cp.max_batch_size = maximum_batch_size_; @@ -263,7 +262,6 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, cp.output_graph_def = optimized_graph; cp.precision_mode = precision_mode_; cp.minimum_segment_size = minimum_segment_size_; - cp.graph_properties = &static_graph_properties; cp.cluster = cluster; cp.is_dyn_op = is_dynamic_op_; cp.max_cached_engines = max_cached_batches_; From 511594e844b7176971487a2ae948b593244f4740 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 21 May 2020 08:56:37 -0700 Subject: [PATCH 0967/1533] Add tf.Yield and tf.IfRegion op to model non-functional conditionals PiperOrigin-RevId: 312678676 Change-Id: I53007078552347df678e063547823a76eab2c34c --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 2 +- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 50 +++++ .../compiler/mlir/tensorflow/ir/tf_ops.td | 64 ++++++ .../mlir/tensorflow/tests/tf-ops.mlir | 209 ++++++++++++++++++ 4 files changed, 324 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 44174f6b6a2..fb93bec5b56 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -4098,7 +4098,7 @@ def TFL_WhileOp : Op(parent)) + op.emitError() << " expects parent op " + << "'" << IfRegionOp::getOperationName() << "' but got '" + << parent->getName().getStringRef() << "'"; + return success(); +} + +//===----------------------------------------------------------------------===// +// IfRegionOp +//===----------------------------------------------------------------------===// + +LogicalResult VerifyRegionResults(Operation *op, Region ®ion, + StringRef region_name) { + auto op_name = op->getName().getStringRef(); + // verify that op outputs match yield inputs + YieldOp yield = cast(region.front().getTerminator()); + unsigned expected_num_results = op->getNumResults(); + if (yield.getNumOperands() != expected_num_results) + return op->emitError(region_name + " region should have " + + Twine(expected_num_results) + " results"); + + for (int idx : llvm::seq(0, expected_num_results)) { + auto op_result_type = op->getResult(idx).getType().cast(); + auto region_result_type = + yield.getOperand(idx).getType().cast(); + if (!AreCastCompatible({region_result_type, op_result_type})) + return op->emitError(llvm::formatv( + "{0} result type {1} is incompatible with {2} " + "result type {3} at index {4}", + region_name, region_result_type, op_name, op_result_type, idx)); + } + return success(); +} + +static LogicalResult Verify(IfRegionOp op) { + if (failed(VerifyRegionResults(op, op.then_branch(), "then"))) + return failure(); + if (failed(VerifyRegionResults(op, op.else_branch(), "else"))) + return failure(); + return success(); +} + //===----------------------------------------------------------------------===// // InvertOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 94b0c5f5e19..1b8f9eb4bb6 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -207,6 +207,70 @@ else_branch: A function that takes 'inputs' and returns a list of }]; } +def TF_YieldOp : TF_Op<"Yield", [Terminator]> { + let summary = "Yield operation"; + let description = [{ + The "yield" operation represents a return operation within the conditional + and body of structured control flow (e.g., if and while). The operation + takes a variable number of operands and produces no results. The number and + types of inputs must match the signature of the operation that contains the + region. + }]; + + let arguments = (ins Variadic:$operands); + + let verifier = [{ + return Verify(*this); + }]; +} + +def TF_IfRegionOp : TF_Op<"IfRegion", + [SingleBlockImplicitTerminator<"YieldOp">]> { + let summary = "output = cond ? 
then_branch output : else_branch output"; + + let description = [{ +"output = cond ? then_branch output : else_branch output" + +cond: A Tensor. If the tensor is a scalar of non-boolean type, the + scalar is converted to a boolean according to the + following rule: if the scalar is a numerical value, non-zero means + True and zero means False; if the scalar is a string, non-empty + means True and empty means False. If the tensor is not a scalar, + being empty means False and being non-empty means True. +input: A list of input tensors. +then_branch: A region that computes the outputs of the op if cond = true. + It returns a list of tensors using tf.yield (as the terminator). The + types of these returned tensors is same as that of the else_branch +else_branch: A region that computes the outputs of the op if cond = false. + It returns a list of tensors using tf.yield (as the terminator). The + types of these returned tensors is same as that of the then_branch + }]; + + let arguments = (ins + TF_Tensor:$cond, + Variadic:$input, + + DefaultValuedAttr:$output_shapes, + + // Used to map StatelessIf and If op defined in TensorFlow to a common op. + BoolAttr:$is_stateless + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeAttr Tcond = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + + let regions = (region SizedRegion<1>:$then_branch, SizedRegion<1>:$else_branch); + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_MeanOp : TF_Op<"Mean", [NoSideEffect, TF_FoldOperandsTransposeInterface]> { let summary = "Computes the mean of elements across dimensions of a tensor."; diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 82e60a08e2e..c0d1a914788 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -854,6 +854,215 @@ func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // ----- +// Test invalid tf.Yield operation (parent should be IfRegion) +func @testInvalidYieldOp(%arg0: f32) -> () { + // expected-error @+1 {{expects parent op 'tf.IfRegion'}} + "tf.Yield"(%arg0) : (f32) -> () +} + +// ----- + +// Test valid tf.IfRegion operation +// CHECK-LABEL: func @testValidIfRegionOp +func @testValidIfRegionOp(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// Test valid tf.IfRegion operation with multiple results +// CHECK-LABEL: func @testValidIfRegionOpWithMultipleResults +func @testValidIfRegionOpWithMultipleResults(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0, %1, %2 = "tf.IfRegion"(%arg0, %arg1) ({ + %t0 = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %t1 = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %t2 = "tf.Acosh"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t0, %t1, %t2) : (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> () + }, { + %e0 = "tf.Neg"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %e1 = "tf.Relu"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %e2 = "tf.Sin"(%arg1) 
: (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e0, %e1, %e2) : (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) + + %3 = "tf.Add"(%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + %4 = "tf.Add"(%2, %3) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> + return %4 : tensor<2xf32> +} + +// ----- + +// Test invalid type for operand #0 for tf.IfRegion operation +func @testInvalidIfRegionOpType0(%arg0: f32, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{operand #0 must be tensor of tf.dtype values}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (f32, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// Test invalid type for operand #1 for tf.IfRegion operation +func @testInvalidIfRegionOpType1(%arg0: tensor, %arg1: f32) -> f32 { + // expected-error @+1 {{operand #1 must be tensor of tf.dtype values}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = addf %arg1, %arg1 : f32 + "tf.Yield"(%t) : (f32) -> () + }, { + %e = mulf %arg1, %arg1 : f32 + "tf.Yield"(%e) : (f32) -> () + }) { is_stateless = false} : (tensor, f32) -> f32 + + return %0 : f32 +} + +// ----- + +// tf.IfRegion operation should have 2 regions +func @testInvalidIfRegionOp1Region(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{op expected 2 regions}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testInvalidIfRegionOpNoRegions(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{op expected 2 regions}} + %0 = "tf.IfRegion"(%arg0, %arg1) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testInvalidIfRegionOp3Regions(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{op expected 2 regions}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %te = "tf.Relu"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%te) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// tf.IfRegion regions should be terminated with a tf.Yield +func @testIfRegionThenTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+2 {{'tf.IfRegion' op expects regions to end with 'tf.Yield'}} + // expected-note @+1 {{in custom textual format, the absence of terminator implies 'tf.Yield'}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> 
tensor<2xf32> { + // expected-error @+2 {{'tf.IfRegion' op expects regions to end with 'tf.Yield'}} + // expected-note @+1 {{in custom textual format, the absence of terminator implies 'tf.Yield'}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// tf.Region yield number of results should match op number of results +func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{then region should have 1 result}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t, %t) : (tensor<2xf32>, tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{else region should have 1 result}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e, %e) : (tensor<2xf32>, tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +// tf.IfRegion yield types should match op result types +func @testIfRegionOpYieldMismatchThen(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{then result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + "tf.Yield"(%arg0) : (tensor) -> () + }, { + %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testIfRegionOpYieldMismatchElse(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + // expected-error @+1 {{else result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} + %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + "tf.Yield"(%arg0) : (tensor) -> () + }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + // Test valid tf.MatrixBandPart // CHECK-LABEL: func @testValidMatrixBandPartOp func @testValidMatrixBandPartOp(%arg0: tensor<64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { From c7ae49c070fc0ae6cd62711e2c238feaef2903ca Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 30 Apr 2020 19:47:08 +0000 Subject: [PATCH 0968/1533] Replace print(hist) with hist.numpy() to avoid print call, also avoid pylint too long (80) error Remove extra "```" as doctest use ">>>" and "..." 
instead, and fix pylint issue by shorten the output Signed-off-by: Yong Tang --- tensorflow/python/ops/histogram_ops.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index ffdd900ec71..da1411f8a1f 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -62,15 +62,14 @@ def histogram_fixed_width_bins(values, Examples: - ```python >>> # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - ... + ... >>> nbins = 5 >>> value_range = [0.0, 5.0] >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - >>> tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) - - ``` + >>> indices = tf.histogram_fixed_width_bins(new_values, value_range, nbins=5) + >>> indices.numpy() + array([0, 0, 1, 2, 4, 4], dtype=int32) """ with ops.name_scope(name, 'histogram_fixed_width_bins', [values, value_range, nbins]): @@ -129,15 +128,14 @@ def histogram_fixed_width(values, Examples: - ```python >>> # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) - ... + ... >>> nbins = 5 >>> value_range = [0.0, 5.0] >>> new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] - >>> tf.histogram_fixed_width(new_values, value_range, nbins=5) - - ``` + >>> hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) + >>> hist.numpy() + array([2, 1, 1, 0, 2], dtype=int32) """ with ops.name_scope(name, 'histogram_fixed_width', [values, value_range, nbins]) as name: From 988991b89219eb317242a348fc0394041229f95a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 21 May 2020 16:45:25 +0000 Subject: [PATCH 0969/1533] include tf_types.def as part of the tf-nightly pip install While trying to build mlir with tf-nightly, there are situations `tensorflow/compiler/mlir/tensorflow/ir/tf_types.h` needs to be included. However, this file implicitly includes `tensorflow/compiler/mlir/tensorflow/ir/tf_types.def` which is not included. The follow error thrown out: ``` In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h:37: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h:25: bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h:74:10: fatal error: 'tensorflow/compiler/mlir/tensorflow/ir/tf_types.def' file not found ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1 error generated. ``` This PR add `.def` file under `tensorflow/compiler` to be part of the pip install, so that `tf_types.h` could be used. 
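
For illustration only (this sketch is not part of the original change): with the `.def` files packaged alongside the headers, a downstream translation unit such as the minimal one below should no longer hit the `'tensorflow/compiler/mlir/tensorflow/ir/tf_types.def' file not found` error quoted above. The include-path setup (for example the `local_config_tf` repository seen in the error output) and the rest of the MLIR include chain are assumed here and are not provided by this change.

```cpp
// Minimal downstream sketch, assuming the tf-nightly pip headers are on the
// include path. tf_types.h expands tf_types.def through the preprocessor,
// so the .def file has to ship with the package for this include to resolve.
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"

int main() { return 0; }
```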
Signed-off-by: Yong Tang --- tensorflow/tools/pip_package/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 8a5450d78b6..01a3696823d 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -240,6 +240,7 @@ headers = ( list(find_files('*.proto', 'tensorflow/compiler')) + list(find_files('*.proto', 'tensorflow/core')) + list(find_files('*.proto', 'tensorflow/python')) + + list(find_files('*.def', 'tensorflow/compiler')) + list(find_files('*.h', 'tensorflow/c')) + list(find_files('*.h', 'tensorflow/cc')) + list(find_files('*.h', 'tensorflow/compiler')) + From f2515dc7d6363c935cbba7103a92b8dc086cecfa Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Thu, 21 May 2020 09:49:40 -0700 Subject: [PATCH 0970/1533] Prevent extra downloads for the xtensa-xpg target. PiperOrigin-RevId: 312687246 Change-Id: I6ee49dfeb08722ba5ce4475371a7cb3d71fef4cc --- tensorflow/lite/micro/tools/make/Makefile | 5 ++++- .../micro/tools/make/download_dependencies.sh | 20 ------------------- .../make/targets/xtensa_xpg_makefile.inc | 12 +++++++++++ 3 files changed, 16 insertions(+), 21 deletions(-) delete mode 100755 tensorflow/lite/micro/tools/make/download_dependencies.sh diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 1331163a410..a0a32728baf 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -94,8 +94,10 @@ endif # runtime that can be linked in to other programs. MICROLITE_LIB_NAME := libtensorflow-microlite.a +# These two must be defined before we include the target specific Makefile.inc +# because we filter out the examples that are not supported for those targets. +# See targets/xtensa_xpg_makefile.inc for an example. MICRO_LITE_EXAMPLE_TESTS := $(shell find tensorflow/lite/micro/examples/ -name Makefile.inc) - MICRO_LITE_BENCHMARKS := $(wildcard tensorflow/lite/micro/benchmarks/Makefile.inc) MICROLITE_TEST_SRCS := \ @@ -237,6 +239,7 @@ include $(MAKEFILE_DIR)/third_party_downloads.inc THIRD_PARTY_DOWNLOADS := $(eval $(call add_third_party_download,$(GEMMLOWP_URL),$(GEMMLOWP_MD5),gemmlowp,)) $(eval $(call add_third_party_download,$(FLATBUFFERS_URL),$(FLATBUFFERS_MD5),flatbuffers,)) +$(eval $(call add_third_party_download,$(RUY_URL),$(RUY_MD5),ruy,)) # These target-specific makefiles should modify or replace options like # CXXFLAGS or LIBS to work for a specific targeted architecture. All logic diff --git a/tensorflow/lite/micro/tools/make/download_dependencies.sh b/tensorflow/lite/micro/tools/make/download_dependencies.sh deleted file mode 100755 index df2caedb28d..00000000000 --- a/tensorflow/lite/micro/tools/make/download_dependencies.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== - -set -e - -echo "download_dependencies.sh is no longer needed, just use 'make -f tensorflow/lite/micro/tools/make/Makefile'." >&2 -exit 1 diff --git a/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc b/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc index 5ed601f8dd1..dba98b45cd9 100644 --- a/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/xtensa_xpg_makefile.inc @@ -30,4 +30,16 @@ ifeq ($(TARGET), xtensa-xpg) LDFLAGS += -Wl,-gc-sections TEST_SCRIPT := tensorflow/lite/micro/testing/test_xtensa_xpg_binary.sh + + # TODO(b/156962140): This manually maintained list of excluded examples is + # quite error prone. + EXCLUDED_EXAMPLE_TESTS := \ + tensorflow/lite/micro/examples/image_recognition_experimental/Makefile.inc \ + tensorflow/lite/micro/examples/magic_wand/Makefile.inc \ + tensorflow/lite/micro/examples/micro_speech/Makefile.inc \ + tensorflow/lite/micro/examples/network_tester/Makefile.inc \ + tensorflow/lite/micro/examples/person_detection/Makefile.inc \ + tensorflow/lite/micro/examples/person_detection_experimental/Makefile.inc + MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS)) + endif From 3d9ec6298a5d167c93e260605d3d2957d294fcb2 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 21 May 2020 09:53:22 -0700 Subject: [PATCH 0971/1533] Limit FillOp custom folder to int and float types to avoid crash DenseElementsAttr::getValue doesn't work for complex and string types and denseelementsAttr::get with Attribute only works for int and float types. It is possible to handle complex types in the custom folder but not doing that now as complex types are less common and it would be easier to handle those once we have an attribute type for complex types. PiperOrigin-RevId: 312687907 Change-Id: I4596e82d7b7e1d353bfb045b39b451785a7474e7 --- tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc | 8 +++++++- .../compiler/mlir/tensorflow/tests/canonicalize.mlir | 12 ++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index d166f7bace7..cbbb9fd5db3 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -1661,10 +1661,16 @@ void FillOp::build(OpBuilder &builder, OperationState &result, Value dims, OpFoldResult FillOp::fold(ArrayRef operands) { assert(operands.size() == 2 && "fill op has two operand"); + auto type = getType().cast(); + // DenseElementsAttr that is used in this folder only supports int and float + // types. + // TODO(hinsu): Handle complex types once there is a attribute kind for + // complex. 
+ if (!type.getElementType().isIntOrFloat()) return {}; + auto value = operands[1].dyn_cast_or_null(); if (!value) return {}; - auto type = getType().cast(); if (type.hasStaticShape()) return DenseElementsAttr::get(type, value.getValue({})); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 20f4dd79715..a77aa5b8346 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -526,12 +526,20 @@ func @testRankOfRankedTensor(%arg0 : tensor<4x3x2xf32>) -> tensor { } // CHECK-LABEL: @foldFill -func @foldFill() -> (tensor<3x2x1xf32>, tensor<*xf32>) { +func @foldFill() -> (tensor<3x2x1xf32>, tensor<*xf32>, tensor<*xcomplex>) { %0 = "tf.Const"() {value = dense<[3, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> %1 = "tf.Const"() {value = dense<23.0> : tensor} : () -> tensor // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} %2 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<3x2x1xf32> // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} %3 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<*xf32> - return %2, %3 : tensor<3x2x1xf32>, tensor<*xf32> + + %complex_cst = "tf.Const"() {value = dense<(0.000000e+00,1.000000e+00)> : tensor>} : () -> tensor> + // Here, custom folder doesn't handle complex dtypes and it is folded through + // the constant folding hook. + // TODO(hinsu): Handle complex dtypes in the custom folder for FillOp. + // CHECK: "tf.Const"() {value = dense<(0.000000e+00,1.000000e+00)> : tensor<3x2x1xcomplex>} : () -> tensor<*xcomplex> + %4 = "tf.Fill"(%0, %complex_cst) : (tensor<3xi32>, tensor>) -> tensor<*xcomplex> + + return %2, %3, %4 : tensor<3x2x1xf32>, tensor<*xf32>, tensor<*xcomplex> } From 50605edea5c4cfc0e1b9d04cdb1ef92fbf2be395 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sun, 3 May 2020 22:39:33 +0200 Subject: [PATCH 0972/1533] Enable Conv2D op conversion in dynamic shape mode --- .../tf2tensorrt/convert/convert_nodes.cc | 9 +- .../tf2tensorrt/convert/convert_nodes_test.cc | 89 +++++++++++-------- 2 files changed, 58 insertions(+), 40 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index e791ff9ff60..132c4d6dd68 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -2146,6 +2146,12 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, "Stride must be 1 for batch and channel dimensions, at ", node_def.name()); } + // Channel dim must be static for DepthwiseConv2dNative since we use that + // value for num_groups at build time. + if (!params->use_implicit_batch && tensor->getDimensions().d[c_index] == -1) { + return errors::InvalidArgument("Channel dimension must be static, at ", + node_def.name()); + } const nvinfer1::DimsHW stride(tf_stride[h_index], tf_stride[w_index]); if (params->validation_only) return Status::OK(); @@ -2157,11 +2163,12 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, } // Dimensions of transposed tensor. const auto tensor_dim = tensor->getDimensions(); + const int c_dim_size = tensor_dim.d[params->use_implicit_batch ? 0 : 1]; // group == 0 signifies that this is a depthwise convolution, so set // num_groups to size of input's channel dim. For a non-depthwise conv, // num_groups will be 1. 
- const int num_groups = (group == 0) ? tensor_dim.d[0] : group; + const int num_groups = (group == 0) ? c_dim_size : group; // For conv, TF weights are RSCK, and TRT expects KCRS. // For backprop, TF weights are RSKC, and TRT expects CKRS. diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index d4badd1cc03..48db355a494 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -4037,15 +4037,16 @@ TEST_F(OpConverterTest, ConvertSlice) { } } -TEST_F(OpConverterTest, ConvertConv2D) { +TEST_P(OpConverterTest1, ConvertConv2D) { // Get nodedef for Conv2D layer. + DataType tf_type = tf_dtype; auto get_conv2d_nodedef = - [](std::vector strides = {1, 1, 1, 1}, string padding = "SAME", - string data_format = "NCHW", - std::vector dilations = {1, 1, 1, 1}) -> NodeDef { + [tf_type](std::vector strides = {1, 1, 1, 1}, + string padding = "SAME", string data_format = "NCHW", + std::vector dilations = {1, 1, 1, 1}) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT); - auto filter = ops::Placeholder(s.WithOpName("weights"), DT_FLOAT); + auto input = ops::Placeholder(s.WithOpName("input"), tf_type); + auto filter = ops::Placeholder(s.WithOpName("weights"), tf_type); ops::Conv2D::Attrs attrs = ops::Conv2D::Attrs().DataFormat(data_format).Dilations(dilations); auto conv2d = ops::Conv2D(s.WithOpName("my_conv2d"), input, filter, strides, @@ -4067,7 +4068,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { // Filter is tensor, should fail. Reset(); NodeDef node_def = get_conv2d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {3, 1, 2, 1}); AddTestTensor("weights", {3, 3, 1, 1}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, @@ -4077,7 +4078,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { // Filter is not 4D, should fail. 
Reset(); NodeDef node_def = get_conv2d_nodedef(); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, @@ -4088,7 +4089,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, @@ -4099,7 +4100,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NCHW", {1, 2, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, "Dilation rate must be 1 for batch and channel " @@ -4110,7 +4111,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1, 1}, "SAME", "NHWC", {1, 1, 1, 2}); - AddTestTensor("input", {2, 3, 1}); + AddTestTensor("input", {1, 2, 3, 1}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion(node_def, error::UNIMPLEMENTED, "Dilation rate must be 1 for batch and channel " @@ -4121,7 +4122,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( node_def, error::INVALID_ARGUMENT, @@ -4132,12 +4133,23 @@ TEST_F(OpConverterTest, ConvertConv2D) { Reset(); NodeDef node_def = get_conv2d_nodedef({1, 2, 1, 1}, "SAME", "NCHW", {1, 1, 1, 1}); - AddTestTensor("input", {1, 2, 3}); + AddTestTensor("input", {1, 1, 2, 3}); AddTestWeights("weights", {3, 3, 1, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9}); RunValidationAndConversion( node_def, error::UNIMPLEMENTED, "Stride must be 1 for batch and channel dimensions, at my_conv2d"); } + if (trt_mode == TrtTestMode::kDynamicShape) { + Reset(); + NodeDef node_def = get_conv2d_nodedef(); + // Channel dim unknown, should fail. + AddTestTensorWithTFDims("input", {-1, -1, -1, -1}, + TfDataTypeToTrt(tf_type)); + AddTestWeights("weights", {1, 2, 1, 1}, {-1, 1}); + RunValidationAndConversion( + node_def, error::INVALID_ARGUMENT, + "Channel dimension must be static, at my_conv2d"); + } struct TestParams { std::vector input_dims; @@ -4155,7 +4167,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { // Ok. 
std::vector ok_params = { // Basic - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4163,10 +4175,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 1, 0, 1}}, // SAME padding (Asymmetric) - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4174,10 +4186,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 3}, + /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 1, -2, 0, 1, -4}}, // SAME padding (Symmetric) - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 3, 1, 1}, /*filter=*/{-1, 0, 1}, @@ -4185,10 +4197,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"SAME", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 3}, + /*expected_output_dims=*/{1, 1, 2, 3}, /*expected_output=*/{1, 2, -1, 3, 1, -3}}, // NHWC - TestParams{/*input_dims=*/{2, 3, 1}, + TestParams{/*input_dims=*/{1, 2, 3, 1}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4196,10 +4208,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NHWC", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{2, 2, 1}, + /*expected_output_dims=*/{1, 2, 2, 1}, /*expected_output=*/{1, 1, 0, 1}}, // Dilated - TestParams{/*input_dims=*/{1, 2, 3}, + TestParams{/*input_dims=*/{1, 1, 2, 3}, /*input=*/{0, 1, 2, 3, 3, 4}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4207,10 +4219,10 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 2}, - /*expected_output_dims=*/{1, 2, 1}, + /*expected_output_dims=*/{1, 1, 2, 1}, /*expected_output=*/{2, 1}}, // Strided - TestParams{/*input_dims=*/{1, 2, 4}, + TestParams{/*input_dims=*/{1, 1, 2, 4}, /*input=*/{0, 1, 2, 2, 3, 4, 4, 7}, /*filter_dims=*/{1, 2, 1, 1}, /*filter=*/{-1, 1}, @@ -4218,7 +4230,7 @@ TEST_F(OpConverterTest, ConvertConv2D) { /*padding=*/"VALID", /*data_format=*/"NCHW", /*dilations=*/{1, 1, 1, 1}, - /*expected_output_dims=*/{1, 2, 2}, + /*expected_output_dims=*/{1, 1, 2, 2}, /*expected_output=*/{1, 0, 1, 3}}, }; @@ -4227,23 +4239,22 @@ TEST_F(OpConverterTest, ConvertConv2D) { NodeDef node_def = get_conv2d_nodedef(ok_params[i].strides, ok_params[i].padding, ok_params[i].data_format, ok_params[i].dilations); - AddTestTensor("input", ok_params[i].input_dims); + std::vector partial_input_shape; + if (trt_mode == TrtTestMode::kDynamicShape) { + // The channel dim cannot have unknown size, fix that. + partial_input_shape.resize(ok_params[i].input_dims.size(), -1); + int channel_id = (ok_params[i].data_format == "NCHW") ? 
1 : 3; + partial_input_shape[channel_id] = ok_params[i].input_dims[channel_id]; + } + + AddTestTensor("input", ok_params[i].input_dims, tf_dtype, + ok_params[i].input, partial_input_shape); AddTestWeights("weights", ok_params[i].filter_dims, ok_params[i].filter); - RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(GetTensorOrWeights("my_conv2d", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims, - output.tensor()->getDimensions()); - const DataVec input_data{{"input", AsTensor(ok_params[i].input)}}; - DataVec output_data{ - {"my_conv2d", - ConstructTensor(ok_params[i].expected_output.size())}}; - TF_EXPECT_OK(BuildAndRun(input_data, &output_data)); - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAreArray(ok_params[i].expected_output)); + TestOpConverter("my_conv2d", node_def, ok_params[i].expected_output_dims, + Status::OK(), Status::OK(), + ElementsAreArray(ok_params[i].expected_output)); } } From bfe0b28c3726f5f64d0b70a03b7fc88cfec9bbad Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 10:28:05 -0700 Subject: [PATCH 0973/1533] Internal change PiperOrigin-RevId: 312694720 Change-Id: I04439efff13aabe38d18c98c025d45ae33d33f46 --- tensorflow/python/eager/forwardprop_test.py | 10 +- .../python/keras/integration_test/BUILD | 12 +- .../gradient_checkpoint_test.py | 158 ------------------ tensorflow/python/ops/custom_gradient.py | 56 ++----- tensorflow/python/ops/gradients_test.py | 48 +----- 5 files changed, 25 insertions(+), 259 deletions(-) delete mode 100644 tensorflow/python/keras/integration_test/gradient_checkpoint_test.py diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index dd0bad30cb8..4ddba6b9be3 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -199,6 +199,7 @@ def _test_gradients(testcase, # And the symbolic computations should be much closer. testcase.assertAllClose(sym_jac_back, sym_jac_fwd) + class ForwardpropTest(test.TestCase, parameterized.TestCase): def testJVPFunction(self): @@ -360,17 +361,14 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): _test_gradients(self, f, [constant_op.constant([1., 2.])], order=3) - # TODO(allenl): investigate why assert_no_new_pyobjects_executing_eagerly fails around this test? - def testExceptionCustomGradientRecomputeGradForward(self): + @test_util.assert_no_new_pyobjects_executing_eagerly + def testCustomGradientRecomputeGrad(self): @custom_gradient.recompute_grad def f(x): return math_ops.reduce_prod(math_ops.tanh(x)**2) - with self.assertRaisesRegexp(NotImplementedError, - "recompute_grad tried to transpose"): - primals = [constant_op.constant([1.])] - sym_jac_fwd = _jacfwd(f, primals) + _test_gradients(self, f, [constant_op.constant([1.])], order=3) def testExceptionInCustomGradientNotSwallowed(self): diff --git a/tensorflow/python/keras/integration_test/BUILD b/tensorflow/python/keras/integration_test/BUILD index 80d8fb86345..01c405a86ae 100644 --- a/tensorflow/python/keras/integration_test/BUILD +++ b/tensorflow/python/keras/integration_test/BUILD @@ -1,7 +1,7 @@ # Description: # Contains Keras integration tests that verify with other TF high level APIs. 
-load("//tensorflow:tensorflow.bzl", "cuda_py_test", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "tf_py_test") package( default_visibility = [ @@ -70,13 +70,3 @@ tf_py_test( "//tensorflow/python:extra_py_tests_deps", ], ) - -cuda_py_test( - name = "gradient_checkpoint_test", - srcs = ["gradient_checkpoint_test.py"], - python_version = "PY3", - deps = [ - "//tensorflow:tensorflow_py", - "//tensorflow/python:extra_py_tests_deps", - ], -) diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py deleted file mode 100644 index 9d9e0a062b3..00000000000 --- a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import tensorflow as tf -layers = tf.keras.layers -optimizers = tf.keras.optimizers - - -def _get_big_cnn_model(img_dim, n_channels, num_partitions, - blocks_per_partition): - """Creates a test model whose activations are significantly larger than model size.""" - model = tf.keras.Sequential() - model.add(layers.Input(shape=(img_dim, img_dim, n_channels))) - for _ in range(num_partitions): - for _ in range(blocks_per_partition): - model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Flatten()) - model.add(layers.Dense(32, activation=tf.nn.relu)) - model.add(layers.Dense(10)) - return model - - -def _get_split_cnn_model(img_dim, n_channels, num_partitions, - blocks_per_partition): - """Creates a test model that is split into `num_partitions` smaller models""" - models = [tf.keras.Sequential() for _ in range(num_partitions)] - models[0].add(layers.Input(shape=(img_dim, img_dim, n_channels))) - for i in range(num_partitions): - model = models[i] - if i > 0: - last_shape = models[i - 1].layers[-1].output_shape - model.add(layers.Input(shape=last_shape[1:])) - for _ in range(blocks_per_partition): - model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) - model.add(layers.MaxPooling2D((1, 1), padding='same')) - models[-1].add(layers.Flatten()) - models[-1].add(layers.Dense(32, activation=tf.nn.relu)) - 
models[-1].add(layers.Dense(10)) - return models - - -def _compute_loss(logits, labels): - return tf.reduce_mean( - tf.nn.sparse_softmax_cross_entropy_with_logits( - logits=logits, labels=labels)) - - -def _limit_gpu_memory(): - """Helper function to limit GPU memory for testing """ - gpus = tf.config.experimental.list_physical_devices('GPU') - if gpus: - tf.config.experimental.set_virtual_device_configuration( - gpus[0], - [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)]) - return True - return False - - -def _get_dummy_data(img_dim, n_channels, batch_size): - inputs = tf.ones([batch_size, img_dim, img_dim, n_channels]) - labels = tf.ones([batch_size], dtype=tf.int64) - return inputs, labels - - -def _train_no_recompute(n_steps): - """Trains a single large model without gradient checkpointing.""" - img_dim, n_channels, batch_size = 256, 1, 4 - x, y = _get_dummy_data(img_dim, n_channels, batch_size) - model = _get_big_cnn_model( - img_dim, n_channels, num_partitions=3, blocks_per_partition=2) - optimizer = optimizers.SGD() - losses = [] - tr_vars = model.trainable_variables - for _ in range(n_steps): - with tf.GradientTape() as tape: - logits = model(x) - loss = _compute_loss(logits, y) - losses.append(loss) - grads = tape.gradient(loss, tr_vars) # tr_vars - optimizer.apply_gradients(zip(grads, tr_vars)) - del grads - return losses - - -def _train_with_recompute(n_steps): - """Trains a single large model with gradient checkpointing using tf.recompute_grad.""" - img_dim, n_channels, batch_size = 256, 1, 4 - x, y = _get_dummy_data(img_dim, n_channels, batch_size) - # This model is the same model as _get_big_cnn_model but split into 3 parts. - models = _get_split_cnn_model( - img_dim, n_channels, num_partitions=3, blocks_per_partition=2) - model1, model2, model3 = models - # Apply gradient checkpointing to the submodels using tf.recompute_grad. 
- model1_re = tf.recompute_grad(model1) - model2_re = tf.recompute_grad(model2) - model3_re = tf.recompute_grad(model3) - optimizer = optimizers.SGD() - tr_vars = ( - model1.trainable_variables + model2.trainable_variables + - model3.trainable_variables) - losses = [] - for _ in range(n_steps): - with tf.GradientTape() as tape: - logits1 = model1_re(x) - logits2 = model2_re(logits1) - logits3 = model3_re(logits2) - loss = _compute_loss(logits3, y) - losses.append(loss) - grads = tape.gradient(loss, tr_vars) # tr_vars - optimizer.apply_gradients(zip(grads, tr_vars)) - del grads - return losses - - -class GradientCheckpointTest(tf.test.TestCase): - - def test_raises_oom_exception(self): - if not _limit_gpu_memory(): - self.skipTest('No virtual GPUs found') - with self.assertRaises(Exception) as context: - _train_no_recompute(1) - self.assertTrue( - context.exception.__class__.__name__ == 'ResourceExhaustedError') - - def test_does_not_raise_oom_exception(self): - if not _limit_gpu_memory(): - self.skipTest('No virtual GPUs found') - n_step = 2 - losses = _train_with_recompute(n_step) - self.assertTrue(len(losses) == n_step) - - -if __name__ == '__main__': - tf.test.main() diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index a3b336d66f2..4040a4db038 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -28,7 +28,6 @@ from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import op_selector from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope -from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest from tensorflow.python.util import tf_decorator @@ -483,47 +482,28 @@ def recompute_grad(f): def inner(*args, **kwargs): """Inner function closure for calculating gradients.""" current_var_scope = variable_scope.get_variable_scope() - with tape_lib.stop_recording(): - result = f(*args, **kwargs) - def grad_wrapper(*wrapper_args, **grad_kwargs): - """Wrapper function to accomodate lack of kwargs in graph mode decorator.""" + result = f(*args, **kwargs) - @custom_gradient - def inner_recompute_grad(*dresult): - """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" - # Gradient calculation for reverse mode autodiff. 
- variables = grad_kwargs.get("variables") - with backprop.GradientTape() as t: - id_args = [gen_array_ops.identity(x) for x in args] - t.watch(id_args) - if variables is not None: - t.watch(variables) - with ops.control_dependencies(dresult): - with variable_scope.variable_scope(current_var_scope): - result = f(*id_args, **kwargs) - kw_vars = [] + def grad(*dresult, **grad_kwargs): + """Gradient function calculation for inner function.""" + variables = grad_kwargs.get("variables") + with backprop.GradientTape() as t: + id_args = [gen_array_ops.identity(x) for x in args] + t.watch(id_args) if variables is not None: - kw_vars = list(variables) - grads = t.gradient( - result, - list(id_args) + kw_vars, - output_gradients=dresult, - unconnected_gradients=UnconnectedGradients.ZERO) + t.watch(variables) + with ops.control_dependencies(dresult): + with variable_scope.variable_scope(current_var_scope): + result = f(*id_args, **kwargs) + kw_vars = [] + if variables is not None: + kw_vars = list(variables) + grads = t.gradient( + result, list(id_args) + kw_vars, output_gradients=dresult) + return grads[:len(id_args)], grads[len(id_args):] - def transpose(*t_args, **t_kwargs): - """Gradient function calculation for forward mode autodiff.""" - # Just throw an error since gradients / activations are not stored on tape for recompute. - raise NotImplementedError( - "recompute_grad tried to transpose grad of {}. " - "Consider not using recompute_grad in forward mode" - "autodiff".format(f.__name__)) - - return (grads[:len(id_args)], grads[len(id_args):]), transpose - - return inner_recompute_grad(*wrapper_args) - - return result, grad_wrapper + return result, grad return inner diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index a06be7af74b..817d8a1adbe 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -59,7 +59,6 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.ops.nn_ops import bias_add from tensorflow.python.platform import googletest -from tensorflow.python.ops import gradient_checker_v2 class GradientsTest(test_util.TensorFlowTestCase, parameterized.TestCase): @@ -1341,46 +1340,6 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): return grads_re, grads - def _grad(self, f, argnums=0): - """Return a function which computes the gradient of `f`.""" - - def _f(*params): - with backprop.GradientTape() as tape: - tape.watch(params) - outputs = f(*params) - return tape.gradient( - outputs, - params[argnums], - unconnected_gradients=unconnected_gradients.UnconnectedGradients.ZERO) - - return _f - - def _test_gradients(self, f, inputs, order, delta=1e-3, rtol=1e-2, atol=1e-6): - """Tests backward jacobians of `f`'s [0, `order`)-order gradients.""" - if order < 1: - raise ValueError( - "`order` should be a positive integer, got '{}'.".format(order)) - if order > 1: - self._test_gradients( - f=self._grad(f), - inputs=inputs, - order=order - 1, - delta=delta, - rtol=rtol, - atol=atol) - sym_jac_back, num_jac = gradient_checker_v2.compute_gradient( - f, inputs, delta=delta) - self.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) - - @test_util.run_v2_only - def testCustomGradientRecomputeGradHigherOrder(self): - - @custom_gradient.recompute_grad - def f(x): - return math_ops.reduce_prod(math_ops.tanh(x)**2) - - self._test_gradients(f, [constant_op.constant([1.])], order=3) - @test_util.run_in_graph_and_eager_modes def 
testFnRecompute(self): """Checks that recompute_grad works grads of function args.""" @@ -1397,8 +1356,8 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): shape=10, trainable=True, ) - self.evaluate(test_var.assign(np.ones([10]))) - test_input = constant(np.ones((10, 10), dtype=np.float32)) + + test_input = constant(np.zeros((10, 10), dtype=np.float32)) grads_re, grads = self._TestFnVariablesGradient(test_input, TestFn, test_input) @@ -1441,7 +1400,6 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): shape=10, trainable=True, ) - self.evaluate(test_var.assign(np.ones([10]))) return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) @@ -1484,8 +1442,6 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): out_re = test_fn_re(test_input_t) out = TestFn(test_input_t) - init = variables.global_variables_initializer() - self.evaluate(init) grads_re = gradients.gradients(out_re, variables.trainable_variables()) grads = gradients.gradients(out, variables.trainable_variables()) From c2534e2336fb41d39738a3309c9829d0ecb6c375 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 21 May 2020 10:35:07 -0700 Subject: [PATCH 0974/1533] Remove illegal BroadcastTo op compiler tests BroadcastTo op requires input shape to be broadcast compatible with the required shape and can't modify dimensions of size greater than one. Added couple of legal tests to improve coverage. These were failing in shape inference function and then failing to get lowered in the MLIR bridge. PiperOrigin-RevId: 312696176 Change-Id: I42a85618b8bbf6ff9dce46de01e6ad3b319a269f --- tensorflow/compiler/tests/binary_ops_test.py | 29 +++++--------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 00ed6d83e2e..c7be2c55de7 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -1579,8 +1579,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([4, 5, 6], dtype=np.int32), expected=None) - @test_util.disable_mlir_bridge( - "Requires BroadcastInDim method in MlirHloBuilder") def testBroadcastTo(self): for dtype in self.all_types: x = np.random.randint(0, high=100, size=[2, 3]) @@ -1591,29 +1589,16 @@ class BinaryOpsTest(xla_test.XLATestCase): expected=x) self._testBinary( array_ops.broadcast_to, - x, - np.array([6, 6], dtype=np.int32), - expected=np.tile(x, [3, 2])) + np.zeros([2, 3], dtype=dtype), + np.array([2, 2, 3], dtype=np.int32), + expected=np.zeros([2, 2, 3], dtype=dtype)) + + x = np.arange(2).reshape((2, 1)).astype(dtype) self._testBinary( array_ops.broadcast_to, x, - np.array([7, 4, 3], dtype=np.int32), - expected=np.tile(x, [7, 2, 1])) - self._testBinary( - array_ops.broadcast_to, - x, - np.array([7, 0, 3], dtype=np.int32), - expected=np.zeros([7, 0, 3], dtype=dtype)) - self._testBinary( - array_ops.broadcast_to, - x, - np.array([7, 1, 2, 9], dtype=np.int32), - expected=np.tile(x, [7, 1, 1, 3])) - self._testBinary( - array_ops.broadcast_to, - np.zeros([2, 0], dtype=dtype), - np.array([4, 0], dtype=np.int32), - expected=np.zeros([4, 0], dtype=dtype)) + np.array([2, 2, 3], dtype=np.int32), + expected=np.tile(x, (2, 1, 3))) x = np.arange(3).reshape((3, 1, 1, 1)).astype(dtype) self._testBinary( From 37b60af53629d684bba47b8b863a20fc9caa0e87 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Thu, 21 May 2020 10:44:03 -0700 Subject: [PATCH 0975/1533] [TF] Add support for more than one outer batch dimension to 
tf.nn.convolution. This is part 2/N of adding outer batch dimension support to tf.nn.convXd and keras.layers.ConvXd. Also added support for batch_shape.ndims > 1 to nn_ops.Convolution and other internal libraries, so that we can use this in keras.layers.ConvXD. For now, using tf.nn.convolution with filter.shape == 3 or filter.shape == 5 (conv1d or conv3d) still raises an error deep in the ops, because i haven't yet added reshape wrappers for gen_nn_ops.conv{1d,3d} but those are gonna be easy to add once this is in. I wanted to make sure it works for conv2d first. No public signature changes. PiperOrigin-RevId: 312697999 Change-Id: I01107967101f28b9906074b3c88664a3a09e8c4b --- .../python/kernel_tests/conv_ops_test.py | 52 +++ tensorflow/python/ops/nn_ops.py | 325 +++++++++++++----- 2 files changed, 296 insertions(+), 81 deletions(-) diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 18b7a47fc8c..e01abc8133d 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -455,6 +455,58 @@ class Conv2DTest(test.TestCase): conv1, self.evaluate(conv2).reshape(conv1.shape)) + @test_util.run_in_graph_and_eager_modes + def testConvolutionClass2DExpandedBatch(self): + tensor_in_sizes_batch = [10, 2, 3, 3] + tensor_in_sizes_expanded_batch = [2, 5, 2, 3, 3] + filter_in_sizes = [1, 1, 3, 3] + filter_in = self._CreateNumpyTensor(filter_in_sizes) + x1 = self._CreateNumpyTensor(tensor_in_sizes_batch) + x2 = x1.reshape(tensor_in_sizes_expanded_batch) + convolver1 = nn_ops.Convolution( + input_shape=x1.shape, + filter_shape=filter_in.shape, + strides=[1, 1], + padding="VALID") + self.assertEqual(convolver1.num_batch_dims, 1) + convolver2 = nn_ops.Convolution( + input_shape=x2.shape, + filter_shape=filter_in.shape, + strides=[1, 1], + padding="VALID") + self.assertEqual(convolver2.num_batch_dims, 2) + conv1 = convolver1(x1, filter_in) + conv2 = convolver2(x2, filter_in) + self.assertEqual(conv1.shape, tensor_in_sizes_batch) + self.assertEqual(conv2.shape, tensor_in_sizes_expanded_batch) + self.assertAllEqual( + conv1, + self.evaluate(conv2).reshape(conv1.shape)) + + @test_util.run_in_graph_and_eager_modes + def testConvolutionWith2SpatialDimensionsAndExpandedBatch(self): + tensor_in_sizes_batch = [10, 2, 3, 3] + tensor_in_sizes_expanded_batch = [2, 5, 2, 3, 3] + filter_in_sizes = [1, 1, 3, 3] + filter_in = self._CreateNumpyTensor(filter_in_sizes) + x1 = self._CreateNumpyTensor(tensor_in_sizes_batch) + x2 = x1.reshape(tensor_in_sizes_expanded_batch) + conv1 = nn_ops.convolution( + x1, + filter_in, + strides=[1, 1], + padding="VALID") + conv2 = nn_ops.convolution( + x2, + filter_in, + strides=[1, 1], + padding="VALID") + self.assertEqual(conv1.shape, tensor_in_sizes_batch) + self.assertEqual(conv2.shape, tensor_in_sizes_expanded_batch) + self.assertAllEqual( + conv1, + self.evaluate(conv2).reshape(conv1.shape)) + @test_util.run_in_graph_and_eager_modes def testConv2D2x2Filter2x1Dilation(self): self._VerifyDilatedConvValues( diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 4c00d085f82..4c6efe61621 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -131,9 +131,9 @@ def _non_atrous_convolution( """ with ops.name_scope(name, "non_atrous_convolution", [input, filter]) as scope: input = ops.convert_to_tensor(input, name="input") # pylint: disable=redefined-builtin - input_shape = input.get_shape() + input_shape = input.shape 
filter = ops.convert_to_tensor(filter, name="filter") # pylint: disable=redefined-builtin - filter_shape = filter.get_shape() + filter_shape = filter.shape op = _NonAtrousConvolution( input_shape, filter_shape=filter_shape, @@ -148,36 +148,51 @@ class _NonAtrousConvolution(object): """Helper class for _non_atrous_convolution. Note that this class assumes that shapes of input and filter passed to - __call__ are compatible with input_shape and filter_shape passed to the + `__call__` are compatible with `input_shape` and filter_shape passed to the constructor. Arguments: - input_shape: static input shape, i.e. input.get_shape(). - filter_shape: static filter shape, i.e. filter.get_shape(). + input_shape: static input shape, i.e. input.shape. + filter_shape: static filter shape, i.e. filter.shape. padding: see _non_atrous_convolution. data_format: see _non_atrous_convolution. strides: see _non_atrous_convolution. name: see _non_atrous_convolution. + num_batch_dims: (Optional.) The number of batch dimensions in the input; + if not provided, the default of `1` is used. """ def __init__( self, input_shape, - filter_shape, # pylint: disable=redefined-builtin + filter_shape, padding, data_format=None, strides=None, - name=None): - filter_shape = filter_shape.with_rank(input_shape.ndims) + name=None, + num_batch_dims=1): + # filter shape is always rank num_spatial_dims + 2 + # and num_spatial_dims == input_shape.ndims - num_batch_dims - 1 + if input_shape.ndims is not None: + filter_shape = filter_shape.with_rank( + input_shape.ndims - num_batch_dims + 1) self.padding = padding self.name = name - input_shape = input_shape.with_rank(filter_shape.ndims) + # input shape is == num_spatial_dims + num_batch_dims + 1 + # and filter_shape is always rank num_spatial_dims + 2 + if filter_shape.ndims is not None: + input_shape = input_shape.with_rank( + filter_shape.ndims + num_batch_dims - 1) if input_shape.ndims is None: - raise ValueError("Rank of convolution must be known") - if input_shape.ndims < 3 or input_shape.ndims > 5: raise ValueError( - "`input` and `filter` must have rank at least 3 and at most 5") - conv_dims = input_shape.ndims - 2 + "Rank of convolution must be known, but saw input_shape.ndims == {}" + .format(input_shape.ndims)) + if input_shape.ndims < 3 or input_shape.ndims - num_batch_dims + 1 > 5: + raise ValueError( + "`input_shape.ndims - num_batch_dims + 1` must be at least 3 and at " + "most 5 but saw `input_shape.ndims == {}` and `num_batch_dims == {}`" + .format(input_shape.ndims, num_batch_dims)) + conv_dims = input_shape.ndims - num_batch_dims - 1 if strides is None: strides = [1] * conv_dims elif len(strides) != conv_dims: @@ -520,7 +535,7 @@ def with_space_to_batch( """ input = ops.convert_to_tensor(input, name="input") # pylint: disable=redefined-builtin - input_shape = input.get_shape() + input_shape = input.shape def build_op(num_spatial_dims, padding): return lambda inp, _: op(inp, num_spatial_dims, padding) @@ -540,18 +555,19 @@ class _WithSpaceToBatch(object): """Helper class for with_space_to_batch. Note that this class assumes that shapes of input and filter passed to - __call__ are compatible with input_shape and filter_shape passed to the - constructor. + `__call__` are compatible with `input_shape`, `filter_shape`, and + `spatial_dims` passed to the constructor. Arguments - input_shape: static shape of input. i.e. input.get_shape(). - dilation_rate: see with_space_to_batch - padding: see with_space_to_batch + input_shape: static shape of input. i.e. input.shape. 
+ dilation_rate: see `with_space_to_batch`. + padding: see `with_space_to_batch`. build_op: Function that maps (num_spatial_dims, paddings) -> (function that maps (input, filter) -> output). - filter_shape: see with_space_to_batch - spatial_dims: see with_space_to_batch - data_format: see with_space_to_batch + filter_shape: see `with_space_to_batch`. + spatial_dims: `see with_space_to_batch`. + data_format: see `with_space_to_batch`. + num_batch_dims: (Optional). Number of batch dims in `input_shape`. """ def __init__(self, @@ -561,24 +577,25 @@ class _WithSpaceToBatch(object): build_op, filter_shape=None, spatial_dims=None, - data_format=None): + data_format=None, + num_batch_dims=1): """Helper class for _with_space_to_batch.""" dilation_rate = ops.convert_to_tensor( dilation_rate, dtypes.int32, name="dilation_rate") - try: - rate_shape = dilation_rate.get_shape().with_rank(1) - except ValueError: - raise ValueError("rate must be rank 1") + if dilation_rate.shape.ndims not in (None, 1): + raise ValueError( + "rate must be rank 1 but saw {}".format(dilation_rate.shape.ndims)) - if not dilation_rate.get_shape().is_fully_defined(): - raise ValueError("rate must have known shape") + if not dilation_rate.shape.is_fully_defined(): + raise ValueError("rate must have known shape, but saw {}" + .format(dilation_rate.shape)) - num_spatial_dims = rate_shape.dims[0].value + num_spatial_dims = dilation_rate.shape.dims[0].value if data_format is not None and data_format.startswith("NC"): - starting_spatial_dim = 2 + starting_spatial_dim = num_batch_dims + 1 else: - starting_spatial_dim = 1 + starting_spatial_dim = num_batch_dims if spatial_dims is None: spatial_dims = range(starting_spatial_dim, @@ -588,7 +605,7 @@ class _WithSpaceToBatch(object): if spatial_dims != orig_spatial_dims or any(x < 1 for x in spatial_dims): raise ValueError( "spatial_dims must be a monotonically increasing sequence of " - "positive integers") + "positive integers, but saw: {}".format(orig_spatial_dims)) if data_format is not None and data_format.startswith("NC"): expected_input_rank = spatial_dims[-1] @@ -599,14 +616,16 @@ class _WithSpaceToBatch(object): input_shape.with_rank_at_least(expected_input_rank) except ValueError: raise ValueError( - "input tensor must have rank %d at least" % (expected_input_rank)) + "input tensor must have rank at least {}, but saw rank {}" + .format(expected_input_rank, input_shape.ndims)) const_rate = tensor_util.constant_value(dilation_rate) rate_or_const_rate = dilation_rate if const_rate is not None: rate_or_const_rate = const_rate if np.any(const_rate < 1): - raise ValueError("dilation_rate must be positive") + raise ValueError("dilation_rate must be positive, but saw: {}" + .format(const_rate)) if np.all(const_rate == 1): self.call = build_op(num_spatial_dims, padding) return @@ -672,6 +691,7 @@ class _WithSpaceToBatch(object): filter_shape = array_ops.shape(filter) base_paddings = _with_space_to_batch_base_paddings( filter_shape, self.num_spatial_dims, self.rate_or_const_rate) + paddings, crops = array_ops.required_space_to_batch_paddings( input_shape=input_spatial_shape, base_paddings=base_paddings, @@ -994,31 +1014,83 @@ def convolution_internal( data_format=None, dilations=None, name=None, - call_from_convolution=True): - """Internal function which performs rank agnostic convolution.""" - if isinstance(input.shape, tensor_shape.TensorShape) and \ - input.shape.rank is not None: - n = len(input.shape) - 2 - elif not isinstance(input.shape, tensor_shape.TensorShape) and \ - 
input.shape is not None: - n = len(input.shape) - 2 - elif isinstance(filters.shape, tensor_shape.TensorShape) and \ - filters.shape.rank is not None: + call_from_convolution=True, + num_spatial_dims=None): + """Internal function which performs rank agnostic convolution. + + Args: + input: See `convolution`. + filters: See `convolution`. + strides: See `convolution`. + padding: See `convolution`. + data_format: See `convolution`. + dilations: See `convolution`. + name: See `convolution`. + call_from_convolution: See `convolution`. + num_spatial_dims: (Optional.). It is a integer describing the + rank of the spatial dimensions. For `1-D`, `2-D` and `3-D` convolutions, + the value of `num_spatial_dims` is `1`, `2`, and `3`, respectively. + This argument is only required to disambiguate the rank of `batch_shape` + when `filter_shape.ndims is None` and `len(batch_shape) > 1`. For + backwards compatibility, if `num_spatial_dims is None` and + `filter_shape.ndims is None`, then `len(batch_shape)` is assumed to be + `1` (i.e., the input is expected to be + `[batch_size, num_channels] + input_spatial_shape` + or `[batch_size] + input_spatial_shape + [num_channels]`. + + Returns: + A tensor of shape and dtype matching that of `input`. + + Raises: + ValueError: If input and filter both have unknown shapes, or if + `num_spatial_dims` is provided and incompatible with the value + estimated from `filters.shape`. + """ + n = None + if isinstance(filters, (list, tuple)): + filters = np.asarray(filters) + if (isinstance(filters.shape, tensor_shape.TensorShape) + and filters.shape.rank is not None): n = len(filters.shape) - 2 - elif not isinstance(filters.shape, tensor_shape.TensorShape) and \ - filters.shape is not None: + elif (not isinstance(filters.shape, tensor_shape.TensorShape) + and filters.shape is not None): n = len(filters.shape) - 2 + + if (isinstance(input.shape, tensor_shape.TensorShape) + and input.shape.rank is not None): + if n is None: + n = (num_spatial_dims if num_spatial_dims is not None + else len(input.shape) - 2) + num_batch_dims = len(input.shape) - n - 1 + elif (not isinstance(input.shape, tensor_shape.TensorShape) + and input.shape is not None): + if n is None: + n = (num_spatial_dims if num_spatial_dims is not None + else len(input.shape) - 2) + num_batch_dims = len(input.shape) - n - 1 else: + num_batch_dims = 1 # Default behavior if it cannot be estimated. + + if n is None: raise ValueError("rank of input or filter must be known") + if num_spatial_dims is not None and n != num_spatial_dims: + raise ValueError( + "inconsistent estimate of spatial dims ({}) vs. actual passed " + "num_spatial_dims ({}). n was estimated as len(filters.shape) - 2, " + "but filters shape is: {}".format(n, num_spatial_dims, filters.shape)) + if not 1 <= n <= 3: raise ValueError( - "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2)) + "num_spatial_dims (input.shape.ndims - num_batch_dims - 1) must be one " + "of 1, 2 or 3 but saw {}. num_batch_dims: {}." 
+ .format(n, num_batch_dims)) if data_format is None: - channel_index = n + 1 + channel_index = num_batch_dims + n else: - channel_index = 1 if data_format.startswith("NC") else n + 1 + channel_index = ( + num_batch_dims if data_format.startswith("NC") else n + num_batch_dims) strides = _get_sequence(strides, n, channel_index, "strides") dilations = _get_sequence(dilations, n, channel_index, "dilations") @@ -1031,7 +1103,7 @@ def convolution_internal( scope = "convolution" with ops.name_scope(name, scope, [input, filters]) as name: - conv_ops = {1: conv1d, 2: gen_nn_ops.conv2d, 3: gen_nn_ops.conv3d} + conv_ops = {1: conv1d, 2: _conv2d_expanded_batch, 3: gen_nn_ops.conv3d} if device_context.enclosing_tpu_context() is not None or all( i == 1 for i in dilations): @@ -1061,7 +1133,8 @@ def convolution_internal( strides=strides, dilation_rate=dilations, name=name, - data_format=data_format) + data_format=data_format, + num_spatial_dims=n) return op(input, filters) @@ -1069,17 +1142,34 @@ class Convolution(object): """Helper class for convolution. Note that this class assumes that shapes of input and filter passed to - __call__ are compatible with input_shape and filter_shape passed to the - constructor. + `__call__` are compatible with `input_shape`, `filter_shape`, and + `num_spatial_dims` passed to the constructor. Arguments - input_shape: static shape of input. i.e. input.get_shape(). - filter_shape: static shape of the filter. i.e. filter.get_shape(). - padding: see convolution. + input_shape: static shape of input. i.e. input.shape. Its length is + `batch_shape + input_spatial_shape + [num_channels]` if `data_format` + does not start with `NC`, or + `batch_shape + [num_channels] + input_spatial_shape` if `data_format` + starts with `NC`. + filter_shape: static shape of the filter. i.e. filter.shape. + padding: The padding algorithm, must be "SAME" or "VALID". strides: see convolution. dilation_rate: see convolution. name: see convolution. - data_format: see convolution. + data_format: A string or `None`. Specifies whether the channel dimension of + the `input` and output is the last dimension (if `data_format` is `None` + or does not start with `NC`), or the first post-batch dimension (i.e. if + `data_format` starts with `NC`). + num_spatial_dims: (Usually optional.) Python integer, the rank of the + spatial and channel dimensions. For `1-D`, `2-D` and `3-D` convolutions, + the value of `num_spatial_dims` is `1`, `2`, and `3`, respectively. + This argument is only required to disambiguate the rank of `batch_shape` + when `filter_shape.ndims is None` and `len(batch_shape) > 1`. For + backwards compatibility, if `num_spatial_dims is None` and + `filter_shape.ndims is None`, then `len(batch_shape)` is assumed to be + `1` (i.e., the input is expected to be + `[batch_size, num_channels] + input_spatial_shape` + or `[batch_size] + input_spatial_shape + [num_channels]`. 
""" def __init__(self, @@ -1089,40 +1179,72 @@ class Convolution(object): strides=None, dilation_rate=None, name=None, - data_format=None): + data_format=None, + num_spatial_dims=None): """Helper function for convolution.""" - num_total_dims = filter_shape.ndims - if num_total_dims is None: - num_total_dims = input_shape.ndims - if num_total_dims is None: - raise ValueError("rank of input or filter must be known") + num_batch_dims = None + filter_shape = tensor_shape.as_shape(filter_shape) + input_shape = tensor_shape.as_shape(input_shape) - num_spatial_dims = num_total_dims - 2 + if filter_shape.ndims is not None: + if (num_spatial_dims is not None and + filter_shape.ndims != num_spatial_dims + 2): + raise ValueError( + "Expected filter_shape.ndims == num_spatial_dims + 2, " + "but saw filter_shape.ndims == {} and num_spatial_dims == {}" + .format(filter_shape.ndims, num_spatial_dims)) + else: + num_spatial_dims = filter_shape.ndims - 2 - try: - input_shape.with_rank(num_spatial_dims + 2) - except ValueError: + if input_shape.ndims is not None and num_spatial_dims is not None: + num_batch_dims = input_shape.ndims - num_spatial_dims - 1 + + if num_spatial_dims is None: + num_spatial_dims = input_shape.ndims - 2 + else: + if input_shape.ndims is not None: + if input_shape.ndims < num_spatial_dims + 2: + raise ValueError( + "Expected input_shape.ndims >= num_spatial_dims + 2, but saw " + "input_shape.ndims == {} and num_spatial_dims == {}" + .format(input_shape.ndims, num_spatial_dims)) + else: + if num_batch_dims is None: + num_batch_dims = input_shape.ndims - num_spatial_dims - 1 + + if num_spatial_dims is None: raise ValueError( - "input tensor must have rank %d" % (num_spatial_dims + 2)) + "Cannot estimate num_spatial_dims since input_shape.ndims is None, " + "filter_shape.ndims is None, and argument num_spatial_dims is also " + "None.") - try: - filter_shape.with_rank(num_spatial_dims + 2) - except ValueError: + if num_batch_dims is None: + num_batch_dims = 1 + + if num_batch_dims < 1: raise ValueError( - "filter tensor must have rank %d" % (num_spatial_dims + 2)) + "num_batch_dims should be >= 1, but saw {}. num_batch_dims was " + "estimated as `input_shape.ndims - num_spatial_dims - 1` and " + "num_spatial_dims was either provided or estimated as " + "`filter_shape.ndims - 2`. 
input_shape.ndims: {}, " + "num_spatial_dims: {}, filter_shape.ndims: {}" + .format(num_batch_dims, input_shape.ndims, num_spatial_dims, + filter_shape.ndims)) if data_format is None or not data_format.startswith("NC"): input_channels_dim = tensor_shape.dimension_at_index( - input_shape, num_spatial_dims + 1) - spatial_dims = range(1, num_spatial_dims + 1) + input_shape, num_spatial_dims + num_batch_dims) + spatial_dims = range(num_batch_dims, num_spatial_dims + num_batch_dims) else: - input_channels_dim = tensor_shape.dimension_at_index(input_shape, 1) - spatial_dims = range(2, num_spatial_dims + 2) + input_channels_dim = tensor_shape.dimension_at_index( + input_shape, num_batch_dims) + spatial_dims = range( + num_batch_dims + 1, num_spatial_dims + num_batch_dims + 1) if not input_channels_dim.is_compatible_with( filter_shape[num_spatial_dims]): raise ValueError( - "number of input channels does not match corresponding dimension of " + "Number of input channels does not match corresponding dimension of " "filter, {} != {}".format(input_channels_dim, filter_shape[num_spatial_dims])) @@ -1136,6 +1258,8 @@ class Convolution(object): self.padding = padding self.name = name self.dilation_rate = dilation_rate + self.num_batch_dims = num_batch_dims + self.num_spatial_dims = num_spatial_dims self.conv_op = _WithSpaceToBatch( input_shape, dilation_rate=dilation_rate, @@ -1143,7 +1267,8 @@ class Convolution(object): build_op=self._build_op, filter_shape=filter_shape, spatial_dims=spatial_dims, - data_format=data_format) + data_format=data_format, + num_batch_dims=num_batch_dims) def _build_op(self, _, padding): return _NonAtrousConvolution( @@ -1152,7 +1277,8 @@ class Convolution(object): padding=padding, data_format=self.data_format, strides=self.strides, - name=self.name) + name=self.name, + num_batch_dims=self.num_batch_dims) def __call__(self, inp, filter): # pylint: disable=redefined-builtin # TPU convolution supports dilations greater than 1. @@ -1165,7 +1291,8 @@ class Convolution(object): data_format=self.data_format, dilations=self.dilation_rate, name=self.name, - call_from_convolution=False) + call_from_convolution=False, + num_spatial_dims=self.num_spatial_dims) else: return self.conv_op(inp, filter) @@ -2392,6 +2519,42 @@ def conv2d_transpose_v2( name=name) +def _conv2d_expanded_batch( + input, # pylint: disable=redefined-builtin + filters, + strides, + padding, + data_format, + dilations, + name): + """Helper function for `convolution_internal`; handles expanded batches.""" + # Try really hard to avoid modifying the legacy name scopes - return early. 
+ shape = getattr(input, "shape", None) + if shape is not None: + ndims = getattr(shape, "ndims", -1) + if ndims == -1: ndims = len(shape) + if ndims in (4, 3, 2, 1, 0, None): + return gen_nn_ops.conv2d( + input, + filter=filters, + strides=strides, + padding=padding, + data_format=data_format, + dilations=dilations, + name=name) + return _squeeze_batch_dims( + input, + functools.partial( + gen_nn_ops.conv2d, + filter=filters, + strides=strides, + padding=padding, + data_format=data_format, + dilations=dilations), + inner_rank=3, + name=name) + + @tf_export("nn.atrous_conv2d_transpose") @dispatch.add_dispatch_support def atrous_conv2d_transpose(value, From 31eeaec3b450b2cbc6780e4087d7512d8cd66c43 Mon Sep 17 00:00:00 2001 From: Priya Gupta Date: Thu, 21 May 2020 10:58:17 -0700 Subject: [PATCH 0976/1533] Enable skipped test as Variable.assign(name=xxx) now works with CentralStorage PiperOrigin-RevId: 312701125 Change-Id: I2c43d1da9cf97b359293498085f491908a3ad4ab --- tensorflow/python/keras/distribute/keras_utils_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/python/keras/distribute/keras_utils_test.py b/tensorflow/python/keras/distribute/keras_utils_test.py index 702d89d95f8..0f65bbbf917 100644 --- a/tensorflow/python/keras/distribute/keras_utils_test.py +++ b/tensorflow/python/keras/distribute/keras_utils_test.py @@ -26,7 +26,6 @@ import numpy as np from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import parameter_server_strategy from tensorflow.python.distribute import strategy_combinations from tensorflow.python.distribute import tpu_strategy from tensorflow.python.distribute import values @@ -398,9 +397,6 @@ class TestDistributionStrategyWithNormalizationLayer(test.TestCase, optimizer=strategy_combinations .gradient_descent_optimizer_keras_v2_fn))) def test_batchnorm_correctness(self, distribution, fused, optimizer): - if isinstance(distribution.extended, - parameter_server_strategy.ParameterServerStrategyExtended): - self.skipTest('b/152353796') with self.cached_session(): with distribution.scope(): model = keras.models.Sequential() From 808b545c382505dfe36ff8f3c65e3ab34f2c49bf Mon Sep 17 00:00:00 2001 From: Chenkai Kuang Date: Thu, 21 May 2020 10:58:55 -0700 Subject: [PATCH 0977/1533] Support ShardedVariable in `tf.keras.layers.Embedding`. A typical usage is user / dist strategy can define a variable_strategy_scope that creates ShardedVariable and build embedding layer under that scope. In this way `add_weights` returns ShardedVariable. Note that this CL also switches to use embedding_lookup_v2, which always use "div" partition_strategy whereas embedding_lookup defaults to"mod". I expect this to be a safe change as we don't explicitly support sharded embedding lookup yet. 
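
For illustration, a minimal sketch of the intended usage (it mirrors the new test added in embeddings_test.py in this change; the shard values and ids are only examples):

```python
import numpy as np

from tensorflow.python import keras
from tensorflow.python.distribute import sharded_variable
from tensorflow.python.ops import variables

# Embedding table of 5 rows split across three variable shards;
# embedding_lookup_v2 recombines them using the "div" partition strategy.
layer = keras.layers.Embedding(input_dim=5, output_dim=2)
model = keras.models.Sequential([layer])
layer.embeddings = sharded_variable.ShardedVariable([
    variables.Variable([[1., 2.], [3., 4.]]),
    variables.Variable([[5., 6.], [7., 8.]]),
    variables.Variable([[9., 10.]]),
])
# Ids 0, 2 and 4 resolve to rows in the first, second and third shard.
outputs = model.predict(np.array([[0, 2, 4]], dtype='int32'))
# outputs ~= [[[1., 2.], [5., 6.], [9., 10.]]]
```
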
PiperOrigin-RevId: 312701263 Change-Id: Ic76ed454244ed4d77f7ee9ae9a07a8b663956458 --- tensorflow/python/distribute/sharded_variable.py | 4 ++++ tensorflow/python/keras/engine/BUILD | 1 + tensorflow/python/keras/engine/base_layer.py | 5 ++++- tensorflow/python/keras/layers/BUILD | 11 +++++++++-- tensorflow/python/keras/layers/embeddings.py | 6 +++++- .../python/keras/layers/embeddings_test.py | 16 ++++++++++++++++ 6 files changed, 39 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/distribute/sharded_variable.py b/tensorflow/python/distribute/sharded_variable.py index 9886e42a8b3..7accc066d8a 100644 --- a/tensorflow/python/distribute/sharded_variable.py +++ b/tensorflow/python/distribute/sharded_variable.py @@ -96,6 +96,10 @@ class ShardedVariable(trackable.Trackable): 'to the order of the `Variable`s in the list passed to ' 'the constructor. Found {}'.format(save_slice_info)) + def __iter__(self): + """Return an iterable for accessing the underlying sharded variables.""" + return iter(self._variables) + @property def variables(self): """The list of `Variable`s that make up the shards of this object.""" diff --git a/tensorflow/python/keras/engine/BUILD b/tensorflow/python/keras/engine/BUILD index 1ff15d7e2e1..231ab7661f0 100644 --- a/tensorflow/python/keras/engine/BUILD +++ b/tensorflow/python/keras/engine/BUILD @@ -118,6 +118,7 @@ py_library( "//tensorflow/python/distribute:distribute_lib", "//tensorflow/python/distribute:input_lib", "//tensorflow/python/distribute:reduce_util", + "//tensorflow/python/distribute:sharded_variable", "//tensorflow/python/eager:monitoring", "//tensorflow/python/keras:backend", "//tensorflow/python/keras:constraints", diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 0f4bec92e39..0421772a75a 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -34,6 +34,7 @@ from tensorflow.python import tf2 from tensorflow.python.autograph.core import ag_ctx from tensorflow.python.autograph.impl import api as autograph from tensorflow.python.distribute import distribution_strategy_context as ds_context +from tensorflow.python.distribute import sharded_variable from tensorflow.python.eager import context from tensorflow.python.eager import execute from tensorflow.python.eager import function @@ -590,7 +591,9 @@ class Layer(module.Module, version_utils.LayerVersionSelector): self._handle_weight_regularization(name_in_scope, variable, regularizer) - if isinstance(variable, tf_variables.PartitionedVariable): + if isinstance( + variable, + (tf_variables.PartitionedVariable, sharded_variable.ShardedVariable)): for v in variable: backend.track_variable(v) if trainable: diff --git a/tensorflow/python/keras/layers/BUILD b/tensorflow/python/keras/layers/BUILD index 46ac88754a8..10a9fe088ab 100644 --- a/tensorflow/python/keras/layers/BUILD +++ b/tensorflow/python/keras/layers/BUILD @@ -213,12 +213,13 @@ py_library( "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", "//tensorflow/python:util", + "//tensorflow/python/distribute:sharded_variable", "//tensorflow/python/eager:context", "//tensorflow/python/keras:backend", - "//tensorflow/python/keras:base_layer", "//tensorflow/python/keras:constraints", "//tensorflow/python/keras:initializers", "//tensorflow/python/keras:regularizers", + "//tensorflow/python/keras/engine:base_layer", "//tensorflow/python/keras/utils:tf_utils", ], ) @@ -593,9 +594,15 @@ cuda_py_test( python_version = "PY3", deps 
= [ "//tensorflow/python:client_testlib", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:training_lib", + "//tensorflow/python:variables", + "//tensorflow/python/eager:backprop", "//tensorflow/python/keras", "//tensorflow/python/keras:combinations", - "@absl_py//absl/testing:parameterized", + "//tensorflow/python/keras:testing_utils", + "//tensorflow/python/ops/ragged:ragged_factory_ops", ], ) diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py index e30e93f02dc..3444b3a7665 100644 --- a/tensorflow/python/keras/layers/embeddings.py +++ b/tensorflow/python/keras/layers/embeddings.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.distribute import sharded_variable from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.keras import backend as K @@ -183,7 +184,10 @@ class Embedding(Layer): dtype = K.dtype(inputs) if dtype != 'int32' and dtype != 'int64': inputs = math_ops.cast(inputs, 'int32') - out = embedding_ops.embedding_lookup(self.embeddings, inputs) + if isinstance(self.embeddings, sharded_variable.ShardedVariable): + out = embedding_ops.embedding_lookup_v2(self.embeddings.variables, inputs) + else: + out = embedding_ops.embedding_lookup_v2(self.embeddings, inputs) return out def get_config(self): diff --git a/tensorflow/python/keras/layers/embeddings_test.py b/tensorflow/python/keras/layers/embeddings_test.py index 661b29cd7bf..6aa873b2bd7 100644 --- a/tensorflow/python/keras/layers/embeddings_test.py +++ b/tensorflow/python/keras/layers/embeddings_test.py @@ -21,12 +21,14 @@ from __future__ import print_function import numpy as np from tensorflow.python import keras +from tensorflow.python.distribute import sharded_variable from tensorflow.python.eager import backprop from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util as tf_test_util from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils +from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test from tensorflow.python.training import adagrad @@ -130,6 +132,20 @@ class EmbeddingTest(keras_parameterized.TestCase): [[[1., 1.], [2., 2.], [2., 2.]], [[0., 0.]], [[1., 1.], [2., 2.]]], ragged_rank=1)) + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) + def test_embedding_with_sharded_variable(self): + layer = keras.layers.Embedding(input_dim=5, output_dim=2) + v = [ + variables.Variable([[1., 2.], [3., 4.]]), + variables.Variable([[5., 6.], [7., 8.]]), + variables.Variable([[9., 10.]]) + ] + model = keras.models.Sequential([layer]) + layer.embeddings = sharded_variable.ShardedVariable(v) + model.run_eagerly = testing_utils.should_run_eagerly() + outputs = model.predict(np.array([[0, 2, 4]], dtype='int32')) + self.assertAllClose(outputs, [[[1., 2.], [5., 6.], [9., 10.]]]) + if __name__ == '__main__': test.main() From c53757b09d8f7cf9bcee7afd0cc537f7cd50b14b Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Thu, 21 May 2020 11:03:42 -0700 Subject: [PATCH 0978/1533] Fix makefile benchmark script to build on mac os. Mac linker does not have -Wl,--whole-archive. 
PiperOrigin-RevId: 312702308 Change-Id: Ie0a4b9e8453cea948f884c36c6d7cee96bb9ba86 --- tensorflow/lite/tools/make/Makefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index 41f87fb033d..3635ac95167 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -339,11 +339,18 @@ $(BENCHMARK_LIB) : $(LIB_PATH) $(BENCHMARK_LIB_OBJS) benchmark_lib: $(BENCHMARK_LIB) +BENCHMARK_LINKOPTS := +ifeq ($(HOST_OS),osx) + BENCHMARK_LINKOPTS += $(LIBFLAGS) -Wl,-force_load $(BENCHMARK_LIB) $(LIBS) $(LDFLAGS) -framework CoreFoundation +else + BENCHMARK_LINKOPTS += $(LIBFLAGS) -Wl,--whole-archive $(BENCHMARK_LIB) -Wl,--no-whole-archive $(LDFLAGS) $(LIBS) +endif + $(BENCHMARK_BINARY) : $(BENCHMARK_MAIN_OBJ) $(BENCHMARK_LIB) @mkdir -p $(dir $@) $(CXX) $(CXXFLAGS) $(INCLUDES) \ -o $(BENCHMARK_BINARY) $(BENCHMARK_MAIN_OBJ) \ - $(LIBFLAGS) -Wl,--whole-archive $(BENCHMARK_LIB) -Wl,--no-whole-archive $(LDFLAGS) $(LIBS) + $(LIBFLAGS) $(BENCHMARK_LINKOPTS) $(BENCHMARK_PERF_OPTIONS_BINARY) : $(BENCHMARK_PERF_OPTIONS_OBJ) $(BENCHMARK_LIB) @mkdir -p $(dir $@) From e164659e5b607f2e7fc54fbc090894c78745d544 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 21 May 2020 11:04:45 -0700 Subject: [PATCH 0979/1533] Move the feature_column.LinearModel to estimator which is the only caller for it. PiperOrigin-RevId: 312702513 Change-Id: Iac4cb6970ddb0e46fbdf1f043c4d11bf6ebc4429 --- .../feature_column/feature_column_v2.py | 262 ---- .../feature_column/feature_column_v2_test.py | 1269 ----------------- .../feature_column/serialization_test.py | 55 - 3 files changed, 1586 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index 7db4f17c10d..a03e4da0fae 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -145,8 +145,6 @@ from tensorflow.python.framework import tensor_shape # TODO(b/118385027): Dependency on keras can be problematic if Keras moves out # of the main repo. from tensorflow.python.keras import initializers -from tensorflow.python.keras.engine import training as keras_training -from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.utils import generic_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops @@ -154,7 +152,6 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import embedding_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import nn_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import string_ops @@ -383,265 +380,6 @@ class _StateManagerImplV2(_StateManagerImpl): return var -class _LinearModelLayer(Layer): - """Layer that contains logic for `LinearModel`.""" - - def __init__(self, - feature_columns, - units=1, - sparse_combiner='sum', - trainable=True, - name=None, - **kwargs): - super(_LinearModelLayer, self).__init__( - name=name, trainable=trainable, **kwargs) - - self._feature_columns = _normalize_feature_columns(feature_columns) - for column in self._feature_columns: - if not isinstance(column, (DenseColumn, CategoricalColumn)): - raise ValueError( - 'Items of feature_columns must be either a ' - 'DenseColumn or CategoricalColumn. 
Given: {}'.format(column)) - - self._units = units - self._sparse_combiner = sparse_combiner - - self._state_manager = _StateManagerImpl(self, self.trainable) - self.bias = None - - def build(self, _): - # We need variable scopes for now because we want the variable partitioning - # information to percolate down. We also use _pure_variable_scope's here - # since we want to open up a name_scope in the `call` method while creating - # the ops. - with variable_scope._pure_variable_scope(self.name): # pylint: disable=protected-access - for column in self._feature_columns: - with variable_scope._pure_variable_scope( # pylint: disable=protected-access - _sanitize_column_name_for_variable_scope(column.name)): - # Create the state for each feature column - column.create_state(self._state_manager) - - # Create a weight variable for each column. - if isinstance(column, CategoricalColumn): - first_dim = column.num_buckets - else: - first_dim = column.variable_shape.num_elements() - self._state_manager.create_variable( - column, - name='weights', - dtype=dtypes.float32, - shape=(first_dim, self._units), - initializer=initializers.zeros(), - trainable=self.trainable) - - # Create a bias variable. - self.bias = self.add_variable( - name='bias_weights', - dtype=dtypes.float32, - shape=[self._units], - initializer=initializers.zeros(), - trainable=self.trainable, - use_resource=True, - # TODO(rohanj): Get rid of this hack once we have a mechanism for - # specifying a default partitioner for an entire layer. In that case, - # the default getter for Layers should work. - getter=variable_scope.get_variable) - - super(_LinearModelLayer, self).build(None) - - def call(self, features): - if not isinstance(features, dict): - raise ValueError('We expected a dictionary here. Instead we got: {}' - .format(features)) - with ops.name_scope(self.name): - transformation_cache = FeatureTransformationCache(features) - weighted_sums = [] - for column in self._feature_columns: - with ops.name_scope( - _sanitize_column_name_for_variable_scope(column.name)): - # All the weights used in the linear model are owned by the state - # manager associated with this Linear Model. - weight_var = self._state_manager.get_variable(column, 'weights') - - weighted_sum = _create_weighted_sum( - column=column, - transformation_cache=transformation_cache, - state_manager=self._state_manager, - sparse_combiner=self._sparse_combiner, - weight_var=weight_var) - weighted_sums.append(weighted_sum) - - _verify_static_batch_size_equality(weighted_sums, self._feature_columns) - predictions_no_bias = math_ops.add_n( - weighted_sums, name='weighted_sum_no_bias') - predictions = nn_ops.bias_add( - predictions_no_bias, self.bias, name='weighted_sum') - return predictions - - def get_config(self): - # Import here to avoid circular imports. - from tensorflow.python.feature_column import serialization # pylint: disable=g-import-not-at-top - column_configs = serialization.serialize_feature_columns( - self._feature_columns) - config = { - 'feature_columns': column_configs, - 'units': self._units, - 'sparse_combiner': self._sparse_combiner - } - - base_config = super( # pylint: disable=bad-super-call - _LinearModelLayer, self).get_config() - return dict(list(base_config.items()) + list(config.items())) - - @classmethod - def from_config(cls, config, custom_objects=None): - # Import here to avoid circular imports. 
- from tensorflow.python.feature_column import serialization # pylint: disable=g-import-not-at-top - config_cp = config.copy() - columns = serialization.deserialize_feature_columns( - config_cp['feature_columns'], custom_objects=custom_objects) - - del config_cp['feature_columns'] - return cls(feature_columns=columns, **config_cp) - - -# TODO(tanzheny): Cleanup it with respect to Premade model b/132690565. -class LinearModel(keras_training.Model): - """Produces a linear prediction `Tensor` based on given `feature_columns`. - - This layer generates a weighted sum based on output dimension `units`. - Weighted sum refers to logits in classification problems. It refers to the - prediction itself for linear regression problems. - - Note on supported columns: `LinearLayer` treats categorical columns as - `indicator_column`s. To be specific, assume the input as `SparseTensor` looks - like: - - ```python - shape = [2, 2] - { - [0, 0]: "a" - [1, 0]: "b" - [1, 1]: "c" - } - ``` - `linear_model` assigns weights for the presence of "a", "b", "c' implicitly, - just like `indicator_column`, while `input_layer` explicitly requires wrapping - each of categorical columns with an `embedding_column` or an - `indicator_column`. - - Example of usage: - - ```python - price = numeric_column('price') - price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.]) - keywords = categorical_column_with_hash_bucket("keywords", 10K) - keywords_price = crossed_column('keywords', price_buckets, ...) - columns = [price_buckets, keywords, keywords_price ...] - linear_model = LinearLayer(columns) - - features = tf.io.parse_example(..., features=make_parse_example_spec(columns)) - prediction = linear_model(features) - ``` - """ - - def __init__(self, - feature_columns, - units=1, - sparse_combiner='sum', - trainable=True, - name=None, - **kwargs): - """Constructs a LinearLayer. - - Args: - feature_columns: An iterable containing the FeatureColumns to use as - inputs to your model. All items should be instances of classes derived - from `_FeatureColumn`s. - units: An integer, dimensionality of the output space. Default value is 1. - sparse_combiner: A string specifying how to reduce if a categorical column - is multivalent. Except `numeric_column`, almost all columns passed to - `linear_model` are considered as categorical columns. It combines each - categorical column independently. Currently "mean", "sqrtn" and "sum" - are supported, with "sum" the default for linear model. "sqrtn" often - achieves good accuracy, in particular with bag-of-words columns. - * "sum": do not normalize features in the column - * "mean": do l1 normalization on features in the column - * "sqrtn": do l2 normalization on features in the column - For example, for two features represented as the categorical columns: - - ```python - # Feature 1 - - shape = [2, 2] - { - [0, 0]: "a" - [0, 1]: "b" - [1, 0]: "c" - } - - # Feature 2 - - shape = [2, 3] - { - [0, 0]: "d" - [1, 0]: "e" - [1, 1]: "f" - [1, 2]: "g" - } - ``` - - with `sparse_combiner` as "mean", the linear model outputs conceptually - are - ``` - y_0 = 1.0 / 2.0 * ( w_a + w_ b) + w_c + b_0 - y_1 = w_d + 1.0 / 3.0 * ( w_e + w_ f + w_g) + b_1 - ``` - where `y_i` is the output, `b_i` is the bias, and `w_x` is the weight - assigned to the presence of `x` in the input features. - trainable: If `True` also add the variable to the graph collection - `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). - name: Name to give to the Linear Model. 
All variables and ops created will - be scoped by this name. - **kwargs: Keyword arguments to construct a layer. - - Raises: - ValueError: if an item in `feature_columns` is neither a `DenseColumn` - nor `CategoricalColumn`. - """ - - super(LinearModel, self).__init__(name=name, **kwargs) - self.layer = _LinearModelLayer( - feature_columns, - units, - sparse_combiner, - trainable, - name=self.name, - **kwargs) - - def call(self, features): - """Returns a `Tensor` the represents the predictions of a linear model. - - Args: - features: A mapping from key to tensors. `_FeatureColumn`s look up via - these keys. For example `numeric_column('price')` will look at 'price' - key in this dict. Values are `Tensor` or `SparseTensor` depending on - corresponding `_FeatureColumn`. - - Returns: - A `Tensor` which represents predictions/logits of a linear model. Its - shape is (batch_size, units) and its dtype is `float32`. - - Raises: - ValueError: If features are not a dictionary. - """ - return self.layer(features) - - @property - def bias(self): - return self.layer.bias - - def _transform_features_v2(features, feature_columns, state_manager): """Returns transformed features based on features columns passed in. diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index 076515c84b8..91fb7eadb89 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -48,7 +48,6 @@ from tensorflow.python.ops import partitioned_variables from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as variables_lib from tensorflow.python.platform import test -from tensorflow.python.training import rmsprop def _initialized_session(config=None): @@ -439,36 +438,6 @@ class NumericColumnTest(test.TestCase): 'aaa', shape=[1, 2], default_value=np.array([[3., 2.]])) self.assertEqual(a.default_value, ((3., 2.),)) - @test_util.run_deprecated_v1 - def test_linear_model(self): - price = fc.numeric_column('price') - with ops.Graph().as_default(): - features = {'price': [[1.], [5.]]} - model = fc.LinearModel([price]) - predictions = model(features) - price_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias)) - self.assertAllClose([[0.]], self.evaluate(price_var)) - self.assertAllClose([[0.], [0.]], self.evaluate(predictions)) - sess.run(price_var.assign([[10.]])) - self.assertAllClose([[10.], [50.]], self.evaluate(predictions)) - - @test_util.run_deprecated_v1 - def test_linear_model_sanitizes_scope_names(self): - price = fc.numeric_column('price > 100') - with ops.Graph().as_default(): - features = {'price > 100': [[1.], [5.]]} - model = fc.LinearModel([price]) - predictions = model(features) - price_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias)) - self.assertAllClose([[0.]], self.evaluate(price_var)) - self.assertAllClose([[0.], [0.]], self.evaluate(predictions)) - sess.run(price_var.assign([[10.]])) - self.assertAllClose([[10.], [50.]], self.evaluate(predictions)) - def test_old_linear_model(self): price = fc.numeric_column('price') with ops.Graph().as_default(): @@ -705,63 +674,6 @@ class BucketizedColumnTest(test.TestCase): self.assertAllEqual(a_bucketized_copy.variable_shape, (2, 3)) self.assertEqual(a_bucketized_copy.boundaries, (0, 1)) - def test_linear_model_one_input_value(self): - """Tests 
linear_model() for input with shape=[1].""" - price = fc.numeric_column('price', shape=[1]) - bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) - with ops.Graph().as_default(): - features = {'price': [[-1.], [1.], [5.], [6.]]} - model = fc.LinearModel([bucketized_price]) - predictions = model(features) - bucketized_price_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias)) - # One weight variable per bucket, all initialized to zero. - self.assertAllClose([[0.], [0.], [0.], [0.], [0.]], - self.evaluate(bucketized_price_var)) - self.assertAllClose([[0.], [0.], [0.], [0.]], - self.evaluate(predictions)) - sess.run( - bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]])) - # price -1. is in the 0th bucket, whose weight is 10. - # price 1. is in the 1st bucket, whose weight is 20. - # price 5. is in the 3rd bucket, whose weight is 40. - # price 6. is in the 4th bucket, whose weight is 50. - self.assertAllClose([[10.], [20.], [40.], [50.]], - self.evaluate(predictions)) - sess.run(bias.assign([1.])) - self.assertAllClose([[11.], [21.], [41.], [51.]], - self.evaluate(predictions)) - - def test_linear_model_two_input_values(self): - """Tests linear_model() for input with shape=[2].""" - price = fc.numeric_column('price', shape=[2]) - bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6]) - with ops.Graph().as_default(): - features = {'price': [[-1., 1.], [5., 6.]]} - model = fc.LinearModel([bucketized_price]) - predictions = model(features) - bucketized_price_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias)) - # One weight per bucket per input column, all initialized to zero. - self.assertAllClose( - [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]], - self.evaluate(bucketized_price_var)) - self.assertAllClose([[0.], [0.]], self.evaluate(predictions)) - sess.run( - bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.], - [60.], [70.], [80.], [90.], [100.]])) - # 1st example: - # price -1. is in the 0th bucket, whose weight is 10. - # price 1. is in the 6th bucket, whose weight is 70. - # 2nd example: - # price 5. is in the 3rd bucket, whose weight is 40. - # price 6. is in the 9th bucket, whose weight is 100. 
- self.assertAllClose([[80.], [140.]], self.evaluate(predictions)) - sess.run(bias.assign([1.])) - self.assertAllClose([[81.], [141.]], self.evaluate(predictions)) - def test_old_linear_model_one_input_value(self): """Tests linear_model() for input with shape=[1].""" price = fc.numeric_column('price', shape=[1]) @@ -1070,32 +982,6 @@ class HashedCategoricalColumnTest(test.TestCase): self.assertEqual( transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor) - @test_util.run_deprecated_v1 - def test_linear_model(self): - wire_column = fc.categorical_column_with_hash_bucket('wire', 4) - self.assertEqual(4, wire_column.num_buckets) - with ops.Graph().as_default(): - model = fc.LinearModel((wire_column,)) - predictions = model({ - wire_column.name: - sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=('marlo', 'skywalker', 'omar'), - dense_shape=(2, 2)) - }) - wire_var, bias = model.variables - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose((0.,), self.evaluate(bias)) - self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var)) - self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions)) - self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,)))) - # 'marlo' -> 3: wire_var[3] = 4 - # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6 - self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions)) - def test_old_linear_model(self): wire_column = fc.categorical_column_with_hash_bucket('wire', 4) self.assertEqual(4, wire_column.num_buckets) @@ -1364,101 +1250,6 @@ class CrossedColumnTest(test.TestCase): self.assertAllEqual(expected_values, id_tensor_eval.values) self.assertAllEqual((2, 4), id_tensor_eval.dense_shape) - @test_util.run_deprecated_v1 - def test_linear_model(self): - """Tests linear_model. - - Uses data from test_get_sparse_tensors_simple. 
- """ - a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,)) - b = fc.bucketized_column(a, boundaries=(0, 1)) - crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5) - with ops.Graph().as_default(): - model = fc.LinearModel((crossed,)) - predictions = model({ - 'a': - constant_op.constant(((-1., .5), (.5, 1.))), - 'c': - sparse_tensor.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=['cA', 'cB', 'cC'], - dense_shape=(2, 2)), - }) - crossed_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose((0.,), self.evaluate(bias)) - self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)), - self.evaluate(crossed_var)) - self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions)) - sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,)))) - # Expected ids after cross = (1, 0, 1, 3, 4, 2) - self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions)) - sess.run(bias.assign((.1,))) - self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions)) - - def test_linear_model_with_weights(self): - - class _TestColumnWithWeights(BaseFeatureColumnForTests, - fc.CategoricalColumn): - """Produces sparse IDs and sparse weights.""" - - @property - def _is_v2_column(self): - return True - - @property - def name(self): - return 'test_column' - - @property - def parse_example_spec(self): - return { - self.name: - parsing_ops.VarLenFeature(dtypes.int32), - '{}_weights'.format(self.name): - parsing_ops.VarLenFeature(dtypes.float32), - } - - @property - def num_buckets(self): - return 5 - - def transform_feature(self, transformation_cache, state_manager): - return (transformation_cache.get(self.name, state_manager), - transformation_cache.get('{}_weights'.format(self.name), - state_manager)) - - def get_sparse_tensors(self, transformation_cache, state_manager): - """Populates both id_tensor and weight_tensor.""" - ids_and_weights = transformation_cache.get(self, state_manager) - return fc.CategoricalColumn.IdWeightPair( - id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1]) - - t = _TestColumnWithWeights() - crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5) - with ops.Graph().as_default(): - with self.assertRaisesRegexp( - ValueError, - 'crossed_column does not support weight_tensor.*{}'.format(t.name)): - model = fc.LinearModel((crossed,)) - model({ - t.name: - sparse_tensor.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=[0, 1, 2], - dense_shape=(2, 2)), - '{}_weights'.format(t.name): - sparse_tensor.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=[1., 10., 2.], - dense_shape=(2, 2)), - 'c': - sparse_tensor.SparseTensor( - indices=((0, 0), (1, 0), (1, 1)), - values=['cA', 'cB', 'cC'], - dense_shape=(2, 2)), - }) - def test_old_linear_model(self): """Tests linear_model. 
@@ -1643,668 +1434,6 @@ class CrossedColumnTest(test.TestCase): self.assertIs(b, new_crossed.keys[0]) -class LinearModelTest(test.TestCase): - - def test_raises_if_empty_feature_columns(self): - with self.assertRaisesRegexp(ValueError, - 'feature_columns must not be empty'): - fc.LinearModel(feature_columns=[]) - - def test_should_be_feature_column(self): - with self.assertRaisesRegexp(ValueError, 'must be a FeatureColumn'): - fc.LinearModel(feature_columns='NotSupported') - - def test_should_be_dense_or_categorical_column(self): - - class NotSupportedColumn(BaseFeatureColumnForTests): - - @property - def _is_v2_column(self): - return True - - @property - def name(self): - return 'NotSupportedColumn' - - def transform_feature(self, transformation_cache, state_manager): - pass - - @property - def parse_example_spec(self): - pass - - with self.assertRaisesRegexp( - ValueError, 'must be either a DenseColumn or CategoricalColumn'): - fc.LinearModel(feature_columns=[NotSupportedColumn()]) - - def test_does_not_support_dict_columns(self): - with self.assertRaisesRegexp( - ValueError, 'Expected feature_columns to be iterable, found dict.'): - fc.LinearModel(feature_columns={'a': fc.numeric_column('a')}) - - def test_raises_if_duplicate_name(self): - with self.assertRaisesRegexp( - ValueError, 'Duplicate feature column name found for columns'): - fc.LinearModel( - feature_columns=[fc.numeric_column('a'), - fc.numeric_column('a')]) - - def test_not_dict_input_features(self): - price = fc.numeric_column('price') - with ops.Graph().as_default(): - features = [[1.], [5.]] - model = fc.LinearModel([price]) - with self.assertRaisesRegexp(ValueError, 'We expected a dictionary here'): - model(features) - - def test_dense_bias(self): - price = fc.numeric_column('price') - with ops.Graph().as_default(): - features = {'price': [[1.], [5.]]} - model = fc.LinearModel([price]) - predictions = model(features) - price_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias)) - sess.run(price_var.assign([[10.]])) - sess.run(bias.assign([5.])) - self.assertAllClose([[15.], [55.]], self.evaluate(predictions)) - - def test_sparse_bias(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - with ops.Graph().as_default(): - wire_tensor = sparse_tensor.SparseTensor( - values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] - indices=[[0, 0], [1, 0], [1, 1]], - dense_shape=[2, 2]) - features = {'wire_cast': wire_tensor} - model = fc.LinearModel([wire_cast]) - predictions = model(features) - wire_cast_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias)) - self.assertAllClose([[0.], [0.], [0.], [0.]], - self.evaluate(wire_cast_var)) - sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) - sess.run(bias.assign([5.])) - self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions)) - - def test_dense_and_sparse_bias(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - price = fc.numeric_column('price') - with ops.Graph().as_default(): - wire_tensor = sparse_tensor.SparseTensor( - values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] - indices=[[0, 0], [1, 0], [1, 1]], - dense_shape=[2, 2]) - features = {'wire_cast': wire_tensor, 'price': [[1.], [5.]]} - model = fc.LinearModel([wire_cast, price]) - predictions = model(features) - price_var, wire_cast_var, bias = model.variables - with _initialized_session() as sess: - 
sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) - sess.run(bias.assign([5.])) - sess.run(price_var.assign([[10.]])) - self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions)) - - def test_dense_and_sparse_column(self): - """When the column is both dense and sparse, uses sparse tensors.""" - - class _DenseAndSparseColumn(BaseFeatureColumnForTests, fc.DenseColumn, - fc.CategoricalColumn): - - @property - def _is_v2_column(self): - return True - - @property - def name(self): - return 'dense_and_sparse_column' - - @property - def parse_example_spec(self): - return {self.name: parsing_ops.VarLenFeature(self.dtype)} - - def transform_feature(self, transformation_cache, state_manager): - return transformation_cache.get(self.name, state_manager) - - @property - def variable_shape(self): - raise ValueError('Should not use this method.') - - def get_dense_tensor(self, transformation_cache, state_manager): - raise ValueError('Should not use this method.') - - @property - def num_buckets(self): - return 4 - - def get_sparse_tensors(self, transformation_cache, state_manager): - sp_tensor = sparse_tensor.SparseTensor( - indices=[[0, 0], [1, 0], [1, 1]], - values=[2, 0, 3], - dense_shape=[2, 2]) - return fc.CategoricalColumn.IdWeightPair(sp_tensor, None) - - dense_and_sparse_column = _DenseAndSparseColumn() - with ops.Graph().as_default(): - sp_tensor = sparse_tensor.SparseTensor( - values=['omar', 'stringer', 'marlo'], - indices=[[0, 0], [1, 0], [1, 1]], - dense_shape=[2, 2]) - features = {dense_and_sparse_column.name: sp_tensor} - model = fc.LinearModel([dense_and_sparse_column]) - predictions = model(features) - dense_and_sparse_column_var, bias = model.variables - with _initialized_session() as sess: - sess.run( - dense_and_sparse_column_var.assign([[10.], [100.], [1000.], - [10000.]])) - sess.run(bias.assign([5.])) - self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions)) - - def test_dense_multi_output(self): - price = fc.numeric_column('price') - with ops.Graph().as_default(): - features = {'price': [[1.], [5.]]} - model = fc.LinearModel([price], units=3) - predictions = model(features) - price_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose(np.zeros((3,)), self.evaluate(bias)) - self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var)) - sess.run(price_var.assign([[10., 100., 1000.]])) - sess.run(bias.assign([5., 6., 7.])) - self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]], - self.evaluate(predictions)) - - def test_sparse_multi_output(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - with ops.Graph().as_default(): - wire_tensor = sparse_tensor.SparseTensor( - values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] - indices=[[0, 0], [1, 0], [1, 1]], - dense_shape=[2, 2]) - features = {'wire_cast': wire_tensor} - model = fc.LinearModel([wire_cast], units=3) - predictions = model(features) - wire_cast_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose(np.zeros((3,)), self.evaluate(bias)) - self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var)) - sess.run( - wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], - [1000., 1100., 1200.], - [10000., 11000., 12000.]])) - sess.run(bias.assign([5., 6., 7.])) - self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]], - self.evaluate(predictions)) - - def test_dense_multi_dimension(self): - price = fc.numeric_column('price', shape=2) - with 
ops.Graph().as_default(): - features = {'price': [[1., 2.], [5., 6.]]} - model = fc.LinearModel([price]) - predictions = model(features) - price_var, _ = model.variables - with _initialized_session() as sess: - self.assertAllClose([[0.], [0.]], self.evaluate(price_var)) - sess.run(price_var.assign([[10.], [100.]])) - self.assertAllClose([[210.], [650.]], self.evaluate(predictions)) - - def test_sparse_multi_rank(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - with ops.Graph().as_default(): - wire_tensor = array_ops.sparse_placeholder(dtypes.string) - wire_value = sparse_tensor.SparseTensorValue( - values=['omar', 'stringer', 'marlo', 'omar'], # hashed = [2, 0, 3, 2] - indices=[[0, 0, 0], [0, 1, 0], [1, 0, 0], [1, 0, 1]], - dense_shape=[2, 2, 2]) - features = {'wire_cast': wire_tensor} - model = fc.LinearModel([wire_cast]) - predictions = model(features) - wire_cast_var, _ = model.variables - with _initialized_session() as sess: - self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var)) - self.assertAllClose( - np.zeros((2, 1)), - predictions.eval(feed_dict={wire_tensor: wire_value})) - sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) - self.assertAllClose( - [[1010.], [11000.]], - predictions.eval(feed_dict={wire_tensor: wire_value})) - - def test_sparse_combiner(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - with ops.Graph().as_default(): - wire_tensor = sparse_tensor.SparseTensor( - values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] - indices=[[0, 0], [1, 0], [1, 1]], - dense_shape=[2, 2]) - features = {'wire_cast': wire_tensor} - model = fc.LinearModel([wire_cast], sparse_combiner='mean') - predictions = model(features) - wire_cast_var, bias = model.variables - with _initialized_session() as sess: - sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) - sess.run(bias.assign([5.])) - self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions)) - - def test_sparse_combiner_sqrtn(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - with ops.Graph().as_default(): - wire_tensor = sparse_tensor.SparseTensor( - values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] - indices=[[0, 0], [1, 0], [1, 1]], - dense_shape=[2, 2]) - features = {'wire_cast': wire_tensor} - model = fc.LinearModel([wire_cast], sparse_combiner='sqrtn') - predictions = model(features) - wire_cast_var, bias = model.variables - with _initialized_session() as sess: - self.evaluate(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) - self.evaluate(bias.assign([5.])) - self.assertAllClose([[1005.], [7083.139]], self.evaluate(predictions)) - - def test_sparse_combiner_with_negative_weights(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights') - - with ops.Graph().as_default(): - wire_tensor = sparse_tensor.SparseTensor( - values=['omar', 'stringer', 'marlo'], # hashed to = [2, 0, 3] - indices=[[0, 0], [1, 0], [1, 1]], - dense_shape=[2, 2]) - features = { - 'wire_cast': wire_tensor, - 'weights': constant_op.constant([[1., 1., -1.0]]) - } - model = fc.LinearModel([wire_cast_weights], sparse_combiner='sum') - predictions = model(features) - wire_cast_var, bias = model.variables - with _initialized_session() as sess: - sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]])) - sess.run(bias.assign([5.])) - self.assertAllClose([[1005.], [-9985.]], 
self.evaluate(predictions)) - - def test_dense_multi_dimension_multi_output(self): - price = fc.numeric_column('price', shape=2) - with ops.Graph().as_default(): - features = {'price': [[1., 2.], [5., 6.]]} - model = fc.LinearModel([price], units=3) - predictions = model(features) - price_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose(np.zeros((3,)), self.evaluate(bias)) - self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var)) - sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]])) - sess.run(bias.assign([2., 3., 4.])) - self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]], - self.evaluate(predictions)) - - def test_raises_if_shape_mismatch(self): - price = fc.numeric_column('price', shape=2) - with ops.Graph().as_default(): - features = {'price': [[1.], [5.]]} - with self.assertRaisesRegexp( - Exception, - r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'): - model = fc.LinearModel([price]) - model(features) - - def test_dense_reshaping(self): - price = fc.numeric_column('price', shape=[1, 2]) - with ops.Graph().as_default(): - features = {'price': [[[1., 2.]], [[5., 6.]]]} - model = fc.LinearModel([price]) - predictions = model(features) - price_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias)) - self.assertAllClose([[0.], [0.]], self.evaluate(price_var)) - self.assertAllClose([[0.], [0.]], self.evaluate(predictions)) - sess.run(price_var.assign([[10.], [100.]])) - self.assertAllClose([[210.], [650.]], self.evaluate(predictions)) - - def test_dense_multi_column(self): - price1 = fc.numeric_column('price1', shape=2) - price2 = fc.numeric_column('price2') - with ops.Graph().as_default(): - features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]} - model = fc.LinearModel([price1, price2]) - predictions = model(features) - price1_var, price2_var, bias = model.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias)) - self.assertAllClose([[0.], [0.]], self.evaluate(price1_var)) - self.assertAllClose([[0.]], self.evaluate(price2_var)) - self.assertAllClose([[0.], [0.]], self.evaluate(predictions)) - sess.run(price1_var.assign([[10.], [100.]])) - sess.run(price2_var.assign([[1000.]])) - sess.run(bias.assign([7.])) - self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions)) - - def test_dense_trainable_default(self): - price = fc.numeric_column('price') - with ops.Graph().as_default() as g: - features = {'price': [[1.], [5.]]} - model = fc.LinearModel([price]) - model(features) - price_var, bias = model.variables - trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertIn(bias, trainable_vars) - self.assertIn(price_var, trainable_vars) - - def test_sparse_trainable_default(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - with ops.Graph().as_default() as g: - wire_tensor = sparse_tensor.SparseTensor( - values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) - features = {'wire_cast': wire_tensor} - model = fc.LinearModel([wire_cast]) - model(features) - trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - wire_cast_var, bias = model.variables - self.assertIn(bias, trainable_vars) - self.assertIn(wire_cast_var, trainable_vars) - - def test_dense_trainable_false(self): - price = fc.numeric_column('price') - with ops.Graph().as_default() as g: - features = {'price': [[1.], [5.]]} - model = fc.LinearModel([price], 
trainable=False) - model(features) - trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertEqual([], trainable_vars) - - def test_sparse_trainable_false(self): - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - with ops.Graph().as_default() as g: - wire_tensor = sparse_tensor.SparseTensor( - values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) - features = {'wire_cast': wire_tensor} - model = fc.LinearModel([wire_cast], trainable=False) - model(features) - trainable_vars = g.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertEqual([], trainable_vars) - - def test_column_order(self): - price_a = fc.numeric_column('price_a') - price_b = fc.numeric_column('price_b') - wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4) - with ops.Graph().as_default(): - features = { - 'price_a': [[1.]], - 'price_b': [[3.]], - 'wire_cast': - sparse_tensor.SparseTensor( - values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) - } - model = fc.LinearModel([price_a, wire_cast, price_b]) - model(features) - - my_vars = model.variables - self.assertIn('price_a', my_vars[0].name) - self.assertIn('price_b', my_vars[1].name) - self.assertIn('wire_cast', my_vars[2].name) - - with ops.Graph().as_default(): - features = { - 'price_a': [[1.]], - 'price_b': [[3.]], - 'wire_cast': - sparse_tensor.SparseTensor( - values=['omar'], indices=[[0, 0]], dense_shape=[1, 1]) - } - model = fc.LinearModel([wire_cast, price_b, price_a]) - model(features) - - my_vars = model.variables - self.assertIn('price_a', my_vars[0].name) - self.assertIn('price_b', my_vars[1].name) - self.assertIn('wire_cast', my_vars[2].name) - - def test_variable_names(self): - price1 = fc.numeric_column('price1') - dense_feature = fc.numeric_column('dense_feature') - dense_feature_bucketized = fc.bucketized_column( - dense_feature, boundaries=[0.]) - some_sparse_column = fc.categorical_column_with_hash_bucket( - 'sparse_feature', hash_bucket_size=5) - some_embedding_column = fc.embedding_column( - some_sparse_column, dimension=10) - all_cols = [price1, dense_feature_bucketized, some_embedding_column] - - with ops.Graph().as_default(): - model = fc.LinearModel(all_cols) - features = { - 'price1': [[3.], [4.]], - 'dense_feature': [[-1.], [4.]], - 'sparse_feature': [['a'], ['x']], - } - model(features) - for var in model.variables: - self.assertIsInstance(var, variables_lib.VariableV1) - variable_names = [var.name for var in model.variables] - self.assertCountEqual([ - 'linear_model/dense_feature_bucketized/weights:0', - 'linear_model/price1/weights:0', - 'linear_model/sparse_feature_embedding/embedding_weights:0', - 'linear_model/sparse_feature_embedding/weights:0', - 'linear_model/bias_weights:0', - ], variable_names) - - def test_fit_and_predict(self): - columns = [fc.numeric_column('a')] - - model = fc.LinearModel(columns) - model.compile( - optimizer=rmsprop.RMSPropOptimizer(1e-3), - loss='binary_crossentropy', - metrics=['accuracy']) - - x = {'a': np.random.random((10, 1))} - y = np.random.randint(0, 2, size=(10, 1)) - model.fit(x, y, epochs=1, batch_size=5) - model.fit(x, y, epochs=1, batch_size=5) - model.evaluate(x, y, batch_size=5) - model.predict(x, batch_size=5) - - def test_static_batch_size_mismatch(self): - price1 = fc.numeric_column('price1') - price2 = fc.numeric_column('price2') - with ops.Graph().as_default(): - features = { - 'price1': [[1.], [5.], [7.]], # batchsize = 3 - 'price2': [[3.], [4.]] # batchsize = 2 - } - with self.assertRaisesRegexp( - ValueError, - 
r'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string - model = fc.LinearModel([price1, price2]) - model(features) - - def test_subset_of_static_batch_size_mismatch(self): - price1 = fc.numeric_column('price1') - price2 = fc.numeric_column('price2') - price3 = fc.numeric_column('price3') - with ops.Graph().as_default(): - features = { - 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 - 'price2': [[3.], [4.]], # batchsize = 2 - 'price3': [[3.], [4.], [5.]] # batchsize = 3 - } - with self.assertRaisesRegexp( - ValueError, - r'Batch size \(first dimension\) of each feature must be same.'): # pylint: disable=anomalous-backslash-in-string - model = fc.LinearModel([price1, price2, price3]) - model(features) - - def test_runtime_batch_size_mismatch(self): - price1 = fc.numeric_column('price1') - price2 = fc.numeric_column('price2') - with ops.Graph().as_default(): - features = { - 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 3 - 'price2': [[3.], [4.]] # batchsize = 2 - } - model = fc.LinearModel([price1, price2]) - predictions = model(features) - with _initialized_session() as sess: - with self.assertRaisesRegexp(errors.OpError, - 'must have the same size and shape'): - sess.run( - predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]}) - - def test_runtime_batch_size_matches(self): - price1 = fc.numeric_column('price1') - price2 = fc.numeric_column('price2') - with ops.Graph().as_default(): - features = { - 'price1': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 - 'price2': array_ops.placeholder(dtype=dtypes.int64), # batchsize = 2 - } - model = fc.LinearModel([price1, price2]) - predictions = model(features) - with _initialized_session() as sess: - sess.run( - predictions, - feed_dict={ - features['price1']: [[1.], [5.]], - features['price2']: [[1.], [5.]], - }) - - @test_util.run_deprecated_v1 - def test_with_1d_sparse_tensor(self): - price = fc.numeric_column('price') - price_buckets = fc.bucketized_column( - price, boundaries=[ - 0., - 10., - 100., - ]) - body_style = fc.categorical_column_with_vocabulary_list( - 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) - - # Provides 1-dim tensor and dense tensor. - features = { - 'price': - constant_op.constant([ - -1., - 12., - ]), - 'body-style': - sparse_tensor.SparseTensor( - indices=((0,), (1,)), - values=('sedan', 'hardtop'), - dense_shape=(2,)), - } - self.assertEqual(1, features['price'].shape.ndims) - self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0]) - - model = fc.LinearModel([price_buckets, body_style]) - net = model(features) - with _initialized_session() as sess: - body_style_var, price_buckets_var, bias = model.variables - - sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]])) - sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]])) - sess.run(bias.assign([5.])) - - self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], - self.evaluate(net)) - - @test_util.run_deprecated_v1 - def test_with_1d_unknown_shape_sparse_tensor(self): - price = fc.numeric_column('price') - price_buckets = fc.bucketized_column( - price, boundaries=[ - 0., - 10., - 100., - ]) - body_style = fc.categorical_column_with_vocabulary_list( - 'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan']) - country = fc.categorical_column_with_vocabulary_list( - 'country', vocabulary_list=['US', 'JP', 'CA']) - - # Provides 1-dim tensor and dense tensor. 
- features = { - 'price': array_ops.placeholder(dtypes.float32), - 'body-style': array_ops.sparse_placeholder(dtypes.string), - 'country': array_ops.placeholder(dtypes.string), - } - self.assertIsNone(features['price'].shape.ndims) - self.assertIsNone(features['body-style'].get_shape().ndims) - - price_data = np.array([-1., 12.]) - body_style_data = sparse_tensor.SparseTensorValue( - indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,)) - country_data = np.array(['US', 'CA']) - - model = fc.LinearModel([price_buckets, body_style, country]) - net = model(features) - body_style_var, _, price_buckets_var, bias = model.variables - with _initialized_session() as sess: - sess.run(price_buckets_var.assign([[10.], [100.], [1000.], [10000.]])) - sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]])) - sess.run(bias.assign([5.])) - - self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], - sess.run( - net, - feed_dict={ - features['price']: price_data, - features['body-style']: body_style_data, - features['country']: country_data - })) - - @test_util.run_deprecated_v1 - def test_with_rank_0_feature(self): - price = fc.numeric_column('price') - features = { - 'price': constant_op.constant(0), - } - self.assertEqual(0, features['price'].shape.ndims) - - # Static rank 0 should fail - with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'): - model = fc.LinearModel([price]) - model(features) - - # Dynamic rank 0 should fail - features = { - 'price': array_ops.placeholder(dtypes.float32), - } - model = fc.LinearModel([price]) - net = model(features) - self.assertEqual(1, net.shape[1]) - with _initialized_session() as sess: - with self.assertRaisesOpError('Feature .* cannot have rank 0'): - sess.run(net, feed_dict={features['price']: np.array(1)}) - - def test_multiple_linear_models(self): - price = fc.numeric_column('price') - with ops.Graph().as_default(): - features1 = {'price': [[1.], [5.]]} - features2 = {'price': [[2.], [10.]]} - model1 = fc.LinearModel([price]) - model2 = fc.LinearModel([price]) - predictions1 = model1(features1) - predictions2 = model2(features2) - price_var1, bias1 = model1.variables - price_var2, bias2 = model2.variables - with _initialized_session() as sess: - self.assertAllClose([0.], self.evaluate(bias1)) - sess.run(price_var1.assign([[10.]])) - sess.run(bias1.assign([5.])) - self.assertAllClose([[15.], [55.]], self.evaluate(predictions1)) - self.assertAllClose([0.], self.evaluate(bias2)) - sess.run(price_var2.assign([[10.]])) - sess.run(bias2.assign([5.])) - self.assertAllClose([[25.], [105.]], self.evaluate(predictions2)) - - class OldLinearModelTest(test.TestCase): def test_raises_if_empty_feature_columns(self): @@ -4361,36 +3490,6 @@ class VocabularyFileCategoricalColumnTest(test.TestCase): dense_shape=inputs.dense_shape), self.evaluate(id_weight_pair.id_tensor)) - @test_util.run_deprecated_v1 - def test_linear_model(self): - wire_column = fc.categorical_column_with_vocabulary_file( - key='wire', - vocabulary_file=self._wire_vocabulary_file_name, - vocabulary_size=self._wire_vocabulary_size, - num_oov_buckets=1) - self.assertEqual(4, wire_column.num_buckets) - with ops.Graph().as_default(): - model = fc.LinearModel((wire_column,)) - predictions = model({ - wire_column.name: - sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=('marlo', 'skywalker', 'omar'), - dense_shape=(2, 2)) - }) - wire_var, bias = model.variables - - self.evaluate(variables_lib.global_variables_initializer()) - 
self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose((0.,), self.evaluate(bias)) - self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var)) - self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions)) - self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,)))) - # 'marlo' -> 2: wire_var[2] = 3 - # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5 - self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions)) - def test_old_linear_model(self): wire_column = fc.categorical_column_with_vocabulary_file( key='wire', @@ -4827,35 +3926,6 @@ class VocabularyListCategoricalColumnTest(test.TestCase): dense_shape=inputs.dense_shape), self.evaluate(id_weight_pair.id_tensor)) - @test_util.run_deprecated_v1 - def test_linear_model(self): - wire_column = fc.categorical_column_with_vocabulary_list( - key='aaa', - vocabulary_list=('omar', 'stringer', 'marlo'), - num_oov_buckets=1) - self.assertEqual(4, wire_column.num_buckets) - with ops.Graph().as_default(): - model = fc.LinearModel((wire_column,)) - predictions = model({ - wire_column.name: - sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=('marlo', 'skywalker', 'omar'), - dense_shape=(2, 2)) - }) - wire_var, bias = model.variables - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose((0.,), self.evaluate(bias)) - self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var)) - self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions)) - self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,)))) - # 'marlo' -> 2: wire_var[2] = 3 - # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5 - self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions)) - def test_old_linear_model(self): wire_column = fc.categorical_column_with_vocabulary_list( key='aaa', @@ -5195,32 +4265,6 @@ class IdentityCategoricalColumnTest(test.TestCase): input_shape: (2, 2), })) - @test_util.run_deprecated_v1 - def test_linear_model(self): - column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) - self.assertEqual(3, column.num_buckets) - with ops.Graph().as_default(): - model = fc.LinearModel((column,)) - predictions = model({ - column.name: - sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 2, 1), - dense_shape=(2, 2)) - }) - weight_var, bias = model.variables - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose((0.,), self.evaluate(bias)) - self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var)) - self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions)) - self.evaluate(weight_var.assign(((1.,), (2.,), (3.,)))) - # weight_var[0] = 1 - # weight_var[2] + weight_var[1] = 3+2 = 5 - self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions)) - def test_old_linear_model(self): column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) self.assertEqual(3, column.num_buckets) @@ -5513,30 +4557,6 @@ class IndicatorColumnTest(test.TestCase): self.assertAllEqual([[0., 1., 1.]], self.evaluate(indicator_tensor)) - @test_util.run_deprecated_v1 - def test_linear_model(self): - animal = fc.indicator_column( - fc.categorical_column_with_identity('animal', num_buckets=4)) - with ops.Graph().as_default(): - features = { - 'animal': - sparse_tensor.SparseTensor( - indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) - } 
- - model = fc.LinearModel([animal]) - predictions = model(features) - weight_var, _ = model.variables - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - # All should be zero-initialized. - self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var)) - self.assertAllClose([[0.]], self.evaluate(predictions)) - self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]])) - self.assertAllClose([[2. + 3.]], self.evaluate(predictions)) - def test_old_linear_model(self): animal = fc.indicator_column( fc.categorical_column_with_identity('animal', num_buckets=4)) @@ -6171,88 +5191,6 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup)) - @test_util.run_deprecated_v1 - def test_linear_model(self): - # Inputs. - batch_size = 4 - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(batch_size, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_shape = (vocabulary_size, embedding_dimension) - zeros_embedding_values = np.zeros(embedding_shape) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual(embedding_shape, shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return zeros_embedding_values - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer) - - with ops.Graph().as_default(): - model = fc.LinearModel((embedding_column,)) - predictions = model({categorical_column.name: sparse_input}) - expected_var_names = ( - 'linear_model/bias_weights:0', - 'linear_model/aaa_embedding/weights:0', - 'linear_model/aaa_embedding/embedding_weights:0', - ) - self.assertCountEqual( - expected_var_names, - [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) - trainable_vars = { - v.name: v - for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - } - self.assertCountEqual(expected_var_names, trainable_vars.keys()) - bias = trainable_vars['linear_model/bias_weights:0'] - embedding_weights = trainable_vars[ - 'linear_model/aaa_embedding/embedding_weights:0'] - linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0'] - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - # Predictions with all zero weights. - self.assertAllClose(np.zeros((1,)), self.evaluate(bias)) - self.assertAllClose(zeros_embedding_values, - self.evaluate(embedding_weights)) - self.assertAllClose( - np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights)) - self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions)) - - # Predictions with all non-zero weights. - self.evaluate( - embedding_weights.assign(( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) 
# id 2 - ))) - self.evaluate(linear_weights.assign(((4.,), (6.,)))) - # example 0, ids [2], embedding[0] = [7, 11] - # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5] - # example 2, ids [], embedding[2] = [0, 0] - # example 3, ids [1], embedding[3] = [3, 5] - # sum(embeddings * linear_weights) - # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42] - self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), - self.evaluate(predictions)) - @test_util.run_deprecated_v1 def test_input_layer(self): # Inputs. @@ -7088,104 +6026,6 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): with _initialized_session() as sess: sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict) - @test_util.run_deprecated_v1 - def test_linear_model(self): - # Inputs. - batch_size = 2 - vocabulary_size = 3 - # -1 values are ignored. - input_a = np.array([ - [2, -1, -1], # example 0, ids [2] - [0, 1, -1] - ]) # example 1, ids [0, 1] - input_b = np.array([ - [0, -1, -1], # example 0, ids [0] - [-1, -1, -1] - ]) # example 1, ids [] - - # Embedding variable. - embedding_dimension = 2 - embedding_shape = (vocabulary_size, embedding_dimension) - zeros_embedding_values = np.zeros(embedding_shape) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual(embedding_shape, shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return zeros_embedding_values - - # Build columns. - categorical_column_a = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc.categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer) - - with ops.Graph().as_default(): - model = fc.LinearModel((embedding_column_a, embedding_column_b)) - predictions = model({ - categorical_column_a.name: input_a, - categorical_column_b.name: input_b - }) - - # Linear weights do not follow the column name. But this is a rare use - # case, and fixing it would add too much complexity to the code. - expected_var_names = ( - 'linear_model/bias_weights:0', - 'linear_model/aaa_shared_embedding/weights:0', - 'aaa_bbb_shared_embedding:0', - 'linear_model/bbb_shared_embedding/weights:0', - ) - self.assertCountEqual( - expected_var_names, - [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) - trainable_vars = { - v.name: v - for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - } - self.assertCountEqual(expected_var_names, trainable_vars.keys()) - bias = trainable_vars['linear_model/bias_weights:0'] - embedding_weights = trainable_vars['aaa_bbb_shared_embedding:0'] - linear_weights_a = trainable_vars[ - 'linear_model/aaa_shared_embedding/weights:0'] - linear_weights_b = trainable_vars[ - 'linear_model/bbb_shared_embedding/weights:0'] - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - # Predictions with all zero weights. 
- self.assertAllClose(np.zeros((1,)), self.evaluate(bias)) - self.assertAllClose(zeros_embedding_values, - self.evaluate(embedding_weights)) - self.assertAllClose( - np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a)) - self.assertAllClose( - np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b)) - self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions)) - - # Predictions with all non-zero weights. - self.evaluate( - embedding_weights.assign(( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ))) - self.evaluate(linear_weights_a.assign(((4.,), (6.,)))) - # example 0, ids [2], embedding[0] = [7, 11] - # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5] - # sum(embeddings * linear_weights) - # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29] - self.evaluate(linear_weights_b.assign(((3.,), (5.,)))) - # example 0, ids [0], embedding[0] = [1, 2] - # example 1, ids [], embedding[1] = 0, 0] - # sum(embeddings * linear_weights) - # = [3*1 + 5*2, 3*0 +5*0] = [13, 0] - self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions)) - @test_util.run_deprecated_v1 def test_serialization(self): @@ -7424,115 +6264,6 @@ class WeightedCategoricalColumnTest(test.TestCase): values=np.array((.5, 1., .1), dtype=np.float32), dense_shape=(2, 2)), self.evaluate(weight_tensor)) - @test_util.run_deprecated_v1 - def test_linear_model(self): - column = fc.weighted_categorical_column( - categorical_column=fc.categorical_column_with_identity( - key='ids', num_buckets=3), - weight_feature_key='values') - with ops.Graph().as_default(): - model = fc.LinearModel((column,)) - predictions = model({ - 'ids': - sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 2, 1), - dense_shape=(2, 2)), - 'values': - sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=(.5, 1., .1), - dense_shape=(2, 2)) - }) - weight_var, bias = model.variables - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose((0.,), self.evaluate(bias)) - self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var)) - self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions)) - self.evaluate(weight_var.assign(((1.,), (2.,), (3.,)))) - # weight_var[0] * weights[0, 0] = 1 * .5 = .5 - # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1] - # = 3*1 + 2*.1 = 3+.2 = 3.2 - self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions)) - - def test_linear_model_mismatched_shape(self): - column = fc.weighted_categorical_column( - categorical_column=fc.categorical_column_with_identity( - key='ids', num_buckets=3), - weight_feature_key='values') - with ops.Graph().as_default(): - with self.assertRaisesRegexp(ValueError, - r'Dimensions.*are not compatible'): - model = fc.LinearModel((column,)) - model({ - 'ids': - sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 2, 1), - dense_shape=(2, 2)), - 'values': - sparse_tensor.SparseTensorValue( - indices=((0, 0), (0, 1), (1, 0), (1, 1)), - values=(.5, 11., 1., .1), - dense_shape=(2, 2)) - }) - - def test_linear_model_mismatched_dense_values(self): - column = fc.weighted_categorical_column( - categorical_column=fc.categorical_column_with_identity( - key='ids', num_buckets=3), - weight_feature_key='values') - with ops.Graph().as_default(): - model = fc.LinearModel((column,), sparse_combiner='mean') - predictions = model({ - 'ids': - sparse_tensor.SparseTensorValue( - 
indices=((0, 0), (1, 0), (1, 1)), - values=(0, 2, 1), - dense_shape=(2, 2)), - 'values': ((.5,), (1.,)) - }) - # Disabling the constant folding optimizer here since it changes the - # error message differently on CPU and GPU. - config = config_pb2.ConfigProto() - config.graph_options.rewrite_options.constant_folding = ( - rewriter_config_pb2.RewriterConfig.OFF) - with _initialized_session(config): - with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'): - self.evaluate(predictions) - - def test_linear_model_mismatched_dense_shape(self): - column = fc.weighted_categorical_column( - categorical_column=fc.categorical_column_with_identity( - key='ids', num_buckets=3), - weight_feature_key='values') - with ops.Graph().as_default(): - model = fc.LinearModel((column,)) - predictions = model({ - 'ids': - sparse_tensor.SparseTensorValue( - indices=((0, 0), (1, 0), (1, 1)), - values=(0, 2, 1), - dense_shape=(2, 2)), - 'values': ((.5,), (1.,), (.1,)) - }) - weight_var, bias = model.variables - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose((0.,), self.evaluate(bias)) - self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var)) - self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions)) - self.evaluate(weight_var.assign(((1.,), (2.,), (3.,)))) - # weight_var[0] * weights[0, 0] = 1 * .5 = .5 - # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1] - # = 3*1 + 2*.1 = 3+.2 = 3.2 - self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions)) - def test_old_linear_model(self): column = fc.weighted_categorical_column( categorical_column=fc.categorical_column_with_identity( diff --git a/tensorflow/python/feature_column/serialization_test.py b/tensorflow/python/feature_column/serialization_test.py index 881ca0cca5e..69b954022af 100644 --- a/tensorflow/python/feature_column/serialization_test.py +++ b/tensorflow/python/feature_column/serialization_test.py @@ -18,11 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization -from tensorflow.python.framework import test_util from tensorflow.python.platform import test @@ -113,58 +111,5 @@ class FeatureColumnSerializationTest(test.TestCase): self.assertIs(new_price.normalizer_fn, _custom_fn) -@test_util.run_all_in_graph_and_eager_modes -class LinearModelLayerSerializationTest(test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - ('default', 1, 'sum', None, None), - ('trainable', 6, 'mean', True, 'trainable'), - ('not_trainable', 10, 'sum', False, 'frozen')) - def test_get_config(self, units, sparse_combiner, trainable, name): - cols = [fc.numeric_column('a'), - fc.categorical_column_with_identity(key='b', num_buckets=3)] - layer = fc._LinearModelLayer( - cols, units=units, sparse_combiner=sparse_combiner, - trainable=trainable, name=name) - config = layer.get_config() - - self.assertEqual(config['name'], layer.name) - self.assertEqual(config['trainable'], trainable) - self.assertEqual(config['units'], units) - self.assertEqual(config['sparse_combiner'], sparse_combiner) - self.assertLen(config['feature_columns'], 2) - self.assertEqual( - config['feature_columns'][0]['class_name'], 'NumericColumn') - self.assertEqual( - config['feature_columns'][1]['class_name'], 
'IdentityCategoricalColumn') - - @parameterized.named_parameters( - ('default', 1, 'sum', None, None), - ('trainable', 6, 'mean', True, 'trainable'), - ('not_trainable', 10, 'sum', False, 'frozen')) - def test_from_config(self, units, sparse_combiner, trainable, name): - cols = [fc.numeric_column('a'), - fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=('1', '2', '3')), - fc.categorical_column_with_hash_bucket( - key='c', hash_bucket_size=3)] - orig_layer = fc._LinearModelLayer( - cols, units=units, sparse_combiner=sparse_combiner, - trainable=trainable, name=name) - config = orig_layer.get_config() - - new_layer = fc._LinearModelLayer.from_config(config) - - self.assertEqual(new_layer.name, orig_layer.name) - self.assertEqual(new_layer._units, units) - self.assertEqual(new_layer._sparse_combiner, sparse_combiner) - self.assertEqual(new_layer.trainable, trainable) - self.assertLen(new_layer._feature_columns, 3) - self.assertEqual(new_layer._feature_columns[0].name, 'a') - self.assertEqual( - new_layer._feature_columns[1].vocabulary_list, ('1', '2', '3')) - self.assertEqual(new_layer._feature_columns[2].num_buckets, 3) - - if __name__ == '__main__': test.main() From 2b7ed8a3458f268231b8c50896624c07f8259db9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 11:05:50 -0700 Subject: [PATCH 0980/1533] cupti tracer allow device synchronization before gputracer stop. PiperOrigin-RevId: 312702777 Change-Id: Ied5df1fb045c6e2c35ae0b63dc73fb04be54104f --- .../profiler/internal/gpu/cupti_tracer.cc | 34 +++++++++++++++++-- .../core/profiler/internal/gpu/cupti_tracer.h | 4 ++- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 9119c3d5d0b..51f89bd7b0a 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h" #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/container/node_hash_map.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" @@ -614,15 +615,42 @@ class CuptiDriverApiHookWithActivityApi : public CuptiDriverApiHook { // Grab timestamp for API exit. API entry timestamp saved in cbdata. uint64 end_tsc = CuptiTracer::GetTimestamp(); uint64 start_tsc = *cbdata->correlationData; + TrackContext(cbid, cbdata->context); return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id, start_tsc, end_tsc, domain, cbid, cbdata); } - Status Flush() override { return Status::OK(); } + Status SyncAndFlush() override { + if (option_.sync_devices_before_stop) { + CuptiApiTracingDisabler disabler; + absl::MutexLock lock(&mutex_); + for (auto &ctx : contexts_) { + cuCtxPushCurrent(ctx); + cuCtxSynchronize(); // Ignore error here for best effort. 
+ CUcontext current; + cuCtxPopCurrent(¤t); + } + } + return Status::OK(); + } private: + void TrackContext(CUpti_CallbackId cbid, CUcontext ctx) { + if (!option_.sync_devices_before_stop) return; + if (ctx == NULL) return; + absl::MutexLock lock(&mutex_); + if (cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy_v2 || + cbid == CUPTI_DRIVER_TRACE_CBID_cuCtxDestroy) { + contexts_.erase(ctx); + } else { + contexts_.emplace(ctx); + } + } + const CuptiTracerOptions option_; CuptiInterface *cupti_interface_; CuptiTraceCollector *collector_; + absl::Mutex mutex_; + absl::flat_hash_set contexts_ TF_GUARDED_BY(mutex_); TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithActivityApi); }; @@ -1158,7 +1186,7 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook { return AddDriverApiCallbackEvent(collector_, cupti_interface_, device_id, start_tsc, end_tsc, domain, cbid, cbdata); } - Status Flush() override { + Status SyncAndFlush() override { for (auto &recorder : cuda_event_recorders_) { TF_RETURN_IF_ERROR(recorder->Stop()); } @@ -1397,7 +1425,7 @@ void CuptiTracer::Disable() { } cupti_interface_->CleanUp(); Finalize().IgnoreError(); - cupti_driver_api_hook_->Flush().IgnoreError(); + cupti_driver_api_hook_->SyncAndFlush().IgnoreError(); collector_->Flush(); collector_ = nullptr; option_.reset(); diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h index e236afc5c41..a62c08013e8 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h @@ -147,6 +147,8 @@ struct CuptiTracerOptions { std::vector activities_selected; // Whether to call cuptiFinalize. bool cupti_finalize = false; + // Whether to call cuCtxSynchronize for each device before Stop(). + bool sync_devices_before_stop = false; }; struct CuptiTracerCollectorOptions { @@ -219,7 +221,7 @@ class CuptiDriverApiHook { virtual Status OnDriverApiExit(int device_id, CUpti_CallbackDomain domain, CUpti_CallbackId cbid, const CUpti_CallbackData* callback_info) = 0; - virtual Status Flush() = 0; + virtual Status SyncAndFlush() = 0; protected: static Status AddDriverApiCallbackEvent( From c9c8ac3cb9077a066ae47d7f3ab9cb96375ec734 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 21 May 2020 11:09:38 -0700 Subject: [PATCH 0981/1533] [tf.data service] Perform tf.data service compression within tf.data. Instead of explicitly compressing/decompressing tensors within the tf.data service, we now amend the user-defined dataset with compression/decompression transformations. This allows us to use tf.data infrastructure to prefetch compressed elements on tf.data workers. 
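
For illustration only (not part of this change), the round trip can be sketched
with the same compression helpers this change wires in; the toy dataset below is
hypothetical, and the tf.data service hop between the two maps is elided:

    import tensorflow as tf
    from tensorflow.python.data.experimental.ops import compression_ops

    ds = tf.data.Dataset.range(10).map(lambda x: (x, tf.fill([2], x)))
    uncompressed_spec = ds.element_spec

    # Client side: compress each element into a scalar variant tensor and
    # prefetch one compressed element, mirroring what _distribute now does
    # before registering the dataset with the service.
    compressed = ds.map(lambda *x: compression_ops.compress(x)).prefetch(1)

    # ... workers iterate the compressed dataset and return its elements verbatim ...

    # Consumer side: undo the compression to restore the original element spec.
    restored = compressed.map(
        lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec))

    for element in restored.take(2):
      print(element)
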
PiperOrigin-RevId: 312703584 Change-Id: I6234200ce7c214ee9d529484449dc8f5c9ff74c6 --- tensorflow/core/data/service/BUILD | 2 +- .../core/data/service/data_service_test.cc | 113 ------------------ tensorflow/core/data/service/worker_impl.cc | 32 ++++- .../experimental/data_service_dataset_op.cc | 6 +- .../data/experimental/ops/data_service_ops.py | 23 +++- 5 files changed, 55 insertions(+), 121 deletions(-) diff --git a/tensorflow/core/data/service/BUILD b/tensorflow/core/data/service/BUILD index b87f4f171cd..b76f93c454e 100644 --- a/tensorflow/core/data/service/BUILD +++ b/tensorflow/core/data/service/BUILD @@ -98,7 +98,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/data:compression_utils", + "//tensorflow/core/data:dataset_proto_cc", "//tensorflow/core/data:standalone", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", diff --git a/tensorflow/core/data/service/data_service_test.cc b/tensorflow/core/data/service/data_service_test.cc index bd01cb90a66..a4505d8965f 100644 --- a/tensorflow/core/data/service/data_service_test.cc +++ b/tensorflow/core/data/service/data_service_test.cc @@ -35,9 +35,6 @@ limitations under the License. namespace tensorflow { namespace data { -namespace { -constexpr const char kProtocol[] = "grpc+local"; - TEST(DataService, ParseParallelEpochsProcessingMode) { ProcessingMode mode; TF_ASSERT_OK(ParseProcessingMode("parallel_epochs", &mode)); @@ -62,115 +59,5 @@ TEST(DataService, ProcessingModeToString) { EXPECT_EQ("one_epoch", ProcessingModeToString(ProcessingMode::ONE_EPOCH)); } -Status CheckWorkerOutput(const std::string& worker_address, int64 task_id, - std::vector> expected_output) { - DataServiceWorkerClient worker(worker_address, kProtocol); - for (std::vector& expected : expected_output) { - bool end_of_sequence; - CompressedElement compressed; - TF_RETURN_IF_ERROR( - worker.GetElement(task_id, &compressed, &end_of_sequence)); - if (end_of_sequence) { - return errors::Internal("Reached end of sequence too early."); - } - std::vector element; - TF_RETURN_IF_ERROR(UncompressElement(compressed, &element)); - TF_RETURN_IF_ERROR(DatasetOpsTestBase::ExpectEqual(element, expected, - /*compare_order=*/true)); - } - // Call GetElement a couple more times to verify tha end_of_sequence keeps - // returning true. 
- bool end_of_sequence; - CompressedElement compressed; - TF_RETURN_IF_ERROR(worker.GetElement(task_id, &compressed, &end_of_sequence)); - if (!end_of_sequence) { - return errors::Internal("Expected end_of_sequence to be true"); - } - TF_RETURN_IF_ERROR(worker.GetElement(task_id, &compressed, &end_of_sequence)); - if (!end_of_sequence) { - return errors::Internal("Expected end_of_sequence to be true"); - } - return Status::OK(); -} - -} // namespace - -TEST(DataService, IterateDatasetOneWorker) { - TestCluster cluster(1); - TF_ASSERT_OK(cluster.Initialize()); - test_util::GraphDefTestCase test_case; - TF_ASSERT_OK(test_util::map_test_case(&test_case)); - DataServiceMasterClient master(cluster.MasterAddress(), kProtocol); - - int64 dataset_id; - TF_ASSERT_OK(master.RegisterDataset(test_case.graph_def, &dataset_id)); - int64 job_id; - TF_ASSERT_OK( - master.CreateJob(dataset_id, ProcessingMode::PARALLEL_EPOCHS, &job_id)); - std::vector tasks; - bool job_finished; - TF_ASSERT_OK(master.GetTasks(job_id, &tasks, &job_finished)); - ASSERT_EQ(tasks.size(), 1); - EXPECT_EQ(tasks[0].worker_address(), cluster.WorkerAddress(0)); - EXPECT_FALSE(job_finished); - - TF_EXPECT_OK(CheckWorkerOutput(tasks[0].worker_address(), tasks[0].id(), - test_case.output)); -} - -TEST(DataService, IterateDatasetTwoWorkers) { - TestCluster cluster(2); - TF_ASSERT_OK(cluster.Initialize()); - test_util::GraphDefTestCase test_case; - TF_ASSERT_OK(test_util::map_test_case(&test_case)); - DataServiceMasterClient master(cluster.MasterAddress(), kProtocol); - - int64 dataset_id; - TF_ASSERT_OK(master.RegisterDataset(test_case.graph_def, &dataset_id)); - int64 job_id; - TF_ASSERT_OK( - master.CreateJob(dataset_id, ProcessingMode::PARALLEL_EPOCHS, &job_id)); - std::vector tasks; - bool job_finished; - TF_EXPECT_OK(master.GetTasks(job_id, &tasks, &job_finished)); - EXPECT_EQ(tasks.size(), 2); - EXPECT_FALSE(job_finished); - - // Each worker produces the full dataset. - for (TaskInfo task : tasks) { - TF_EXPECT_OK( - CheckWorkerOutput(task.worker_address(), task.id(), test_case.output)); - } -} - -TEST(DataService, AddWorkerMidEpoch) { - TestCluster cluster(1); - TF_ASSERT_OK(cluster.Initialize()); - test_util::GraphDefTestCase test_case; - TF_ASSERT_OK(test_util::map_test_case(&test_case)); - DataServiceMasterClient master(cluster.MasterAddress(), kProtocol); - - int64 dataset_id; - TF_ASSERT_OK(master.RegisterDataset(test_case.graph_def, &dataset_id)); - int64 job_id; - TF_ASSERT_OK( - master.CreateJob(dataset_id, ProcessingMode::PARALLEL_EPOCHS, &job_id)); - std::vector tasks; - bool job_finished; - TF_ASSERT_OK(master.GetTasks(job_id, &tasks, &job_finished)); - EXPECT_EQ(tasks.size(), 1); - EXPECT_FALSE(job_finished); - TF_ASSERT_OK(cluster.AddWorker()); - TF_EXPECT_OK(master.GetTasks(job_id, &tasks, &job_finished)); - EXPECT_EQ(tasks.size(), 2); - EXPECT_FALSE(job_finished); - - // Each worker produces the full dataset. - for (TaskInfo task : tasks) { - TF_EXPECT_OK( - CheckWorkerOutput(task.worker_address(), task.id(), test_case.output)); - } -} - } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/data/service/worker_impl.cc b/tensorflow/core/data/service/worker_impl.cc index b4be18ebccd..151410bb219 100644 --- a/tensorflow/core/data/service/worker_impl.cc +++ b/tensorflow/core/data/service/worker_impl.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/tf_status_helper.h" -#include "tensorflow/core/data/compression_utils.h" +#include "tensorflow/core/data/dataset.pb.h" #include "tensorflow/core/data/service/credentials_factory.h" #include "tensorflow/core/data/service/grpc_util.h" #include "tensorflow/core/data/service/master.grpc.pb.h" @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/io/zlib_outputbuffer.h" #include "tensorflow/core/lib/monitoring/gauge.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/snappy.h" #include "tensorflow/core/public/session_options.h" @@ -135,8 +136,33 @@ Status DataServiceWorkerImpl::GetElement(const GetElementRequest* request, if (!end_of_sequence) { VLOG(3) << "Producing an element for task " << request->task_id(); - TF_RETURN_IF_ERROR( - CompressElement(outputs, response->mutable_compressed_element())); + if (outputs.size() != 1) { + return errors::FailedPrecondition( + "Expected dataset to produce a single scalar variant tensor, but the " + "dataset produced ", + outputs.size(), " outputs"); + } + if (outputs[0].dtype() != DT_VARIANT) { + return errors::FailedPrecondition( + "Expected dataset to produce a single scalar variant tensor, but " + "the dataset produced a tensor with type ", + DataTypeString(outputs[0].dtype())); + } + if (!TensorShapeUtils::IsScalar(outputs[0].shape())) { + return errors::FailedPrecondition( + "Expected dataset to produce a single scalar variant tensor, but " + "the dataset produced a tensor with shape ", + outputs[0].shape()); + } + Variant& variant = outputs[0].scalar()(); + CompressedElement* compressed = variant.get(); + if (compressed == nullptr) { + return errors::FailedPrecondition( + "Expected dataset to produce a CompressedElement variant tensor, but " + "it produced ", + variant.TypeName()); + } + compressed->Swap(response->mutable_compressed_element()); } response->set_end_of_sequence(end_of_sequence); diff --git a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc index 3f8e778d1d8..a106bcb0a7c 100644 --- a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc @@ -21,7 +21,6 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/strings/str_cat.h" -#include "tensorflow/core/data/compression_utils.h" #include "tensorflow/core/data/dataset.pb.h" #include "tensorflow/core/data/service/data_service.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" @@ -29,6 +28,7 @@ limitations under the License. 
#include "tensorflow/core/framework/model.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/kernels/data/dataset_utils.h" #include "tensorflow/core/kernels/data/name_utils.h" #include "tensorflow/core/kernels/data/serialization_utils.h" @@ -496,7 +496,9 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { std::vector element; if (!end_of_sequence) { - TF_RETURN_IF_ERROR(UncompressElement(compressed, &element)); + Tensor tensor(DT_VARIANT, TensorShape{}); + tensor.scalar()() = std::move(compressed); + element.push_back(tensor); } mutex_lock l(mu_); if (end_of_sequence) { diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py index 67dfadb4841..f2ebd51d187 100644 --- a/tensorflow/python/data/experimental/ops/data_service_ops.py +++ b/tensorflow/python/data/experimental/ops/data_service_ops.py @@ -22,6 +22,7 @@ import functools import six from tensorflow.python import tf2 +from tensorflow.python.data.experimental.ops import compression_ops from tensorflow.python.data.experimental.ops.distribute_options import ExternalStatePolicy from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes @@ -84,6 +85,7 @@ class _DataServiceDatasetV2(dataset_ops.DatasetSource): if task_refresh_interval_hint_ms is None: task_refresh_interval_hint_ms = dataset_ops.AUTOTUNE + self._input_dataset = input_dataset self._dataset_id = ops.convert_to_tensor( dataset_id, dtype=dtypes.int64, name="dataset_id") self._processing_mode = ops.convert_to_tensor( @@ -201,16 +203,28 @@ def _distribute(processing_mode, protocol = ops.convert_to_tensor( protocol, dtype=dtypes.string, name="protocol") - def _apply_fn(dataset): + def _apply_fn(dataset): # pylint: disable=missing-docstring external_state_policy = dataset.options().experimental_external_state_policy if external_state_policy is None: external_state_policy = ExternalStatePolicy.WARN + + uncompressed_spec = dataset.element_spec + # Compress the dataset elements to reduce the amount of data that needs to + # be sent over the network. + # TODO(b/157105111): Make this an autotuned parallel map when we have a way + # to limit memory usage. + dataset = dataset.map(lambda *x: compression_ops.compress(x)) + # Prefetch one compressed element to reduce latency when requesting data + # from tf.data workers. + # TODO(b/157105111): Set this to autotune when we have a way to limit + # memory usage + dataset = dataset.prefetch(1) dataset_id = gen_experimental_dataset_ops.register_dataset( dataset._variant_tensor, # pylint: disable=protected-access address=address, protocol=protocol, external_state_policy=external_state_policy.value) - return _DataServiceDataset( + dataset = _DataServiceDataset( input_dataset=dataset, dataset_id=dataset_id, processing_mode=processing_mode, @@ -219,6 +233,11 @@ def _distribute(processing_mode, job_name=job_name, max_outstanding_requests=max_outstanding_requests, task_refresh_interval_hint_ms=task_refresh_interval_hint_ms) + # TODO(b/157105111): Make this an autotuned parallel map when we have a way + # to limit memory usage. 
+ dataset = dataset.map( + lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec)) + return dataset return _apply_fn From c030682e9c15ee1121f20b77580c4ac54e04b885 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Thu, 21 May 2020 11:15:27 -0700 Subject: [PATCH 0982/1533] Implement a pass that converts readonly reference variables to the corresponding resource variables. It converts (VariableV2 -> Identity) to (VarHandle -> ReadVariable). For the background, this pass is a part of hoisting VariableV2 ops by re-using the pipeline for hoisting (VarHandle -> ReadVariable) cases, which can be done by the following passes: - Capturing resource values into global tensors (importing saved model). - Promoting VarHandle ops to function input/outputs. - Freezing global tensor pass. This path assumes that all the VariableV2 ops is read-only via verifying the heuristic method that assumes that all the users of them is Identity op, fed directly. PiperOrigin-RevId: 312704760 Change-Id: I89ac4c0543a7954f6b27d418da63f7f1418490cd --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../mlir/tensorflow/ir/tf_executor.cc | 34 +--- .../compiler/mlir/tensorflow/ir/tf_types.cc | 22 +++ .../compiler/mlir/tensorflow/ir/tf_types.h | 10 + .../readonly_references_to_resources.mlir | 85 +++++++++ .../mlir/tensorflow/transforms/passes.h | 5 + .../readonly_references_to_resources.cc | 179 ++++++++++++++++++ 7 files changed, 305 insertions(+), 31 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 9b2e6f0292b..b2b4c09df3b 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -430,6 +430,7 @@ cc_library( "transforms/parallel_execute_to_islands.cc", "transforms/promote_resources_to_args.cc", "transforms/raise_control_flow.cc", + "transforms/readonly_references_to_resources.cc", "transforms/replicate_invariant_op_hoisting.cc", "transforms/replicate_to_island.cc", "transforms/resource_device_inference.cc", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc index d5ecbf3e292..9daebc22ba1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor.cc @@ -47,37 +47,6 @@ limitations under the License. namespace mlir { namespace tf_executor { -namespace { - -// If the given tensor has elements of type with subtypes, then returns a new -// type after dropping subtypes info. Otherwise, returns the original type as -// is. -ShapedType DropTypeSubTypes(ShapedType ty) { - Type element_ty = ty.getElementType(); - auto subtype_ty = element_ty.dyn_cast(); - if (!subtype_ty) return ty; - - Type default_ty = GetDefaultTypeOf(subtype_ty); - if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); - - return UnrankedTensorType::get(default_ty); -} - -// If the given tensor has elements of type ref, then returns a new type -// of the shape, but corresponding non-ref type as element type. Otherwise, -// returns the original type as is. 
-ShapedType DropRefType(ShapedType ty) { - Type element_ty = ty.getElementType(); - auto ref_ty = element_ty.dyn_cast(); - if (!ref_ty) return ty; - - Type default_ty = GetDefaultTypeOf(ref_ty); - if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); - - return UnrankedTensorType::get(default_ty); -} - -} // namespace //===----------------------------------------------------------------------===// // TF Executor Dialect @@ -85,6 +54,9 @@ ShapedType DropRefType(ShapedType ty) { namespace { +using TF::DropRefType; +using TF::DropTypeSubTypes; + struct TensorFlowExecutorInlinerInterface : public DialectInlinerInterface { using DialectInlinerInterface::DialectInlinerInterface; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index d312e5e409b..994378ea1cf 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -366,5 +366,27 @@ bool AreCastCompatible(ArrayRef types) { return true; } +ShapedType DropTypeSubTypes(ShapedType ty) { + Type element_ty = ty.getElementType(); + auto subtype_ty = element_ty.dyn_cast(); + if (!subtype_ty) return ty; + + Type default_ty = GetDefaultTypeOf(subtype_ty); + if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); + + return UnrankedTensorType::get(default_ty); +} + +ShapedType DropRefType(ShapedType ty) { + Type element_ty = ty.getElementType(); + TF::TensorFlowRefType ref_ty = element_ty.dyn_cast(); + if (!ref_ty) return ty; + + Type default_ty = TF::GetDefaultTypeOf(ref_ty); + if (ty.hasRank()) return RankedTensorType::get(ty.getShape(), default_ty); + + return UnrankedTensorType::get(default_ty); +} + } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index 4c99aae4706..5f108e834a9 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -319,6 +319,16 @@ bool HasCompatibleElementTypes(Type lhs, Type rhs, // compatible. bool AreCastCompatible(ArrayRef types); +// If the given tensor has elements of type with subtypes, then returns a new +// type after dropping subtypes info. Otherwise, returns the original type as +// is. +ShapedType DropTypeSubTypes(ShapedType ty); + +// If the given tensor has elements of type ref, then returns a new type +// of the shape, but corresponding non-ref type as element type. Otherwise, +// returns the original type as is. +ShapedType DropRefType(ShapedType ty); + } // end namespace TF } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir b/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir new file mode 100644 index 00000000000..2970e31c3c9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/readonly_references_to_resources.mlir @@ -0,0 +1,85 @@ +// RUN: tf-opt -verify-diagnostics -readonly-references-to-resources -split-input-file %s | FileCheck %s --dump-input=fail + +// Test case: Basic converting. + +func @f() { + // CHECK: "tf.VarHandleOp" + // CHECK: "tf.ReadVariableOp" + %val0 = "tf.VariableV2"() {_class = ["loc:@v"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: Two ReadVariable ops. 
+ +func @f() { + // CHECK: "tf.VarHandleOp" + + // During lowering to resource variables, this pass will preserve the + // locations of the ReadVariableOps as Identity ops to keep the original graph + // composition and order. + + // CHECK: "tf.ReadVariableOp" + // CHECK: "tf.ReadVariableOp" + %val0 = "tf.VariableV2"() {_class = ["loc:@v"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + %val2 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: No follow-up ReadVariable case. + +func @f() { + // CHECK-NOT: "tf.VariableV2" + // CHECK-NOT: "tf.VarHandleOp" + %val0 = "tf.VariableV2"() {_class = ["loc:@v"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + return +} + +// ----- + +// Test case: No converting when there is another use case. + +func @f() { + // expected-error @+1 {{'tf.VariableV2' op expects all users to be 'tf.Identity', but got user tf.CustomIdentity}} + %val0 = "tf.VariableV2"() {_class = ["loc:@v"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.CustomIdentity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: No class attribute on VariableV2 op. + +func @f() { + // expected-error @+1 {{'tf.VariableV2' op has no '_class' attribute}} + %val0 = "tf.VariableV2"() {container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: No named location found on VariableV2 op. + +func @f() { + // expected-error @+1 {{'tf.VariableV2' op expects variable name in '_class' attribute, but got ["unrelated_class"]}} + %val0 = "tf.VariableV2"() {_class = ["unrelated_class"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} + +// ----- + +// Test case: Invalid multiple location information in a class attribute on VariableV2 op. + +func @f() { + // expected-error @+1 {{'tf.VariableV2' op expects only one named location in '_class' attribute, but got ["loc:@v1", "loc:@v2"]}} + %val0 = "tf.VariableV2"() {_class = ["loc:@v1", "loc:@v2"], container = "", device = "", shape = #tf.shape<96>, shared_name = ""} : () -> tensor<96x!tf.f32ref> + %val1 = "tf.Identity"(%val0) : (tensor<96x!tf.f32ref>) -> tensor<96xf32> + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 81d0259d2d6..5c140ddd6aa 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -95,6 +95,11 @@ std::unique_ptr> CreatePromoteResourcesToArgsPass(); // functions. std::unique_ptr> CreatePromoteVarHandlesToArgsPass(); +// Creates a pass that converts readonly reference variables to the +// corresponding resource variables. +std::unique_ptr> +CreateConvertReadonlyReferenceVariablesToResourceVariablesPass(); + // Marks function visibility using tf.entry_function specification. That is, // functions with tf.entry_function attributes are marked with public // visibility while the other functions are marked with private visibility. 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc b/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc new file mode 100644 index 00000000000..a80b84ddeda --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/readonly_references_to_resources.cc @@ -0,0 +1,179 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +namespace mlir { +namespace TF { +namespace { + +// Location attribute. +constexpr StringRef kClassAttr = "_class"; +constexpr StringRef kLocationPrefix = "loc:@"; + +// A pass that converts readonly reference variables to the corresponding +// resource variables. +// +// It converts (VariableV2 -> Identity) to (VarHandle -> ReadVariable). +// +// For the background, this pass is a part of hoisting VariableV2 ops by +// re-using the pipeline for hoisting (VarHandle -> ReadVariable) cases, which +// can be done by the following passes: +// - Capturing resource values into global tensors (importing saved model). +// - Promoting VarHandle ops to function input/outputs. +// - Freezing global tensor pass. +// +// This path assumes that all the VariableV2 ops is read-only via verifying the +// heuristic method that assumes that all the users of them is Identity op, +// fed directly. +class ConvertReadonlyReferenceVariablesToResourceVariablesPass + : public PassWrapper< + ConvertReadonlyReferenceVariablesToResourceVariablesPass, + FunctionPass> { + public: + void runOnFunction() override; +}; + +// Parse node name from "_class" attribute. +StringRef GetNodeNameFromClassAttr(Operation *op) { + ArrayAttr classes_attr = op->getAttrOfType(kClassAttr); + if (!classes_attr) { + op->emitOpError() << "has no '_class' attribute"; + return StringRef(); + } + + StringRef result; + for (Attribute class_attr : classes_attr) { + StringRef node_name = class_attr.cast().getValue(); + if (!node_name.startswith(kLocationPrefix)) { + continue; + } + if (!result.empty()) { + // Invalid case since there are multiple loc:@ attributes. 
+ op->emitOpError() + << "expects only one named location in '_class' attribute, but got " + << classes_attr; + return StringRef(); + } + result = node_name.drop_front(kLocationPrefix.size()); + } + if (result.empty()) { + op->emitOpError() << "expects variable name in '_class' attribute, but got " + << classes_attr; + } + return result; +} + +void ConvertReadonlyReferenceVariablesToResourceVariablesPass::runOnFunction() { + FuncOp func = getFunction(); + + OpBuilder builder(func.getContext()); + SmallVector variable_v2s_to_replace; + + // Checks all the VariableV2 ops is read-only via verifying the heuristic + // method that assumes that all the users of them is Identity op, feeded + // directly. + auto read_only_vars_fn = [&variable_v2s_to_replace]( + VariableV2Op variable_v2_op) { + if (variable_v2_op.getResult().use_empty()) { + // Erase the op when there is no user. + variable_v2_op.erase(); + return mlir::WalkResult::advance(); + } + if (!all_of(variable_v2_op.getResult().getUsers(), [&variable_v2_op]( + Operation *user) { + if (!isa(user)) { + variable_v2_op.emitOpError() + << "expects all users to be 'tf.Identity', but got user " + << user->getName(); + return false; + } + return true; + })) { + return mlir::WalkResult::interrupt(); + } + variable_v2s_to_replace.push_back(variable_v2_op); + return mlir::WalkResult::advance(); + }; + + WalkResult walk_res = func.walk(read_only_vars_fn); + if (walk_res.wasInterrupted()) return signalPassFailure(); + + for (VariableV2Op variable_v2_op : variable_v2s_to_replace) { + builder.setInsertionPoint(variable_v2_op); + ShapedType shaped_type = + variable_v2_op.getResult().getType().cast(); + TensorType tensor_type = DropRefType(shaped_type).cast(); + StringAttr device_attr = variable_v2_op.getAttrOfType("device"); + if (!device_attr) device_attr = builder.getStringAttr(""); + StringRef variable_name = GetNodeNameFromClassAttr(variable_v2_op); + if (variable_name.empty()) { + return signalPassFailure(); + } + VarHandleOp var_handle_op = builder.create( + variable_v2_op.getLoc(), + ArrayRef{RankedTensorType::get( + {}, TF::ResourceType::get(ArrayRef{tensor_type}, + builder.getContext()))}, + ArrayRef{}, + ArrayRef{ + builder.getNamedAttr("device", device_attr), + builder.getNamedAttr("container", variable_v2_op.containerAttr()), + builder.getNamedAttr("shared_name", + builder.getStringAttr(variable_name))}); + for (Operation *user : + make_early_inc_range(variable_v2_op.getResult().getUsers())) { + builder.setInsertionPoint(user); + ReadVariableOp read_variable_op = builder.create( + user->getLoc(), ArrayRef{tensor_type}, + ArrayRef{var_handle_op}, ArrayRef{}); + user->getResult(0).replaceAllUsesWith(read_variable_op.getResult()); + user->erase(); + } + variable_v2_op.erase(); + } +} + +} // namespace + +std::unique_ptr> +CreateConvertReadonlyReferenceVariablesToResourceVariablesPass() { + return std::make_unique< + ConvertReadonlyReferenceVariablesToResourceVariablesPass>(); +} + +static PassRegistration< + ConvertReadonlyReferenceVariablesToResourceVariablesPass> + pass("readonly-references-to-resources", + "Convert readonly reference variables to resource variables."); + +} // namespace TF + +} // namespace mlir From 5d3ad40d114bab9c9c6911e469304e6948bc1975 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Thu, 21 May 2020 11:27:43 -0700 Subject: [PATCH 0983/1533] Fix for hello_world memory error PiperOrigin-RevId: 312707164 Change-Id: Ibdc1b5bd2161daa093cc79f165c03ac5a6c99acc --- .../lite/micro/examples/hello_world/main_functions.cc | 8 
++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/lite/micro/examples/hello_world/main_functions.cc b/tensorflow/lite/micro/examples/hello_world/main_functions.cc
index 404c8542432..d1c2cafe850 100644
--- a/tensorflow/lite/micro/examples/hello_world/main_functions.cc
+++ b/tensorflow/lite/micro/examples/hello_world/main_functions.cc
@@ -34,8 +34,12 @@ TfLiteTensor* output = nullptr;
 int inference_count = 0;

 // Create an area of memory to use for input, output, and intermediate arrays.
-// Finding the minimum value for your model may require some trial and error.
-constexpr int kTensorArenaSize = 2 * 1024;
+// Minimum arena size, at the time of writing. After allocating tensors
+// you can retrieve this value by invoking interpreter.arena_used_bytes().
+const int kModelArenaSize = 2352;
+// Extra headroom for model + alignment + future interpreter changes.
+const int kExtraArenaSize = 560 + 16 + 100;
+const int kTensorArenaSize = kModelArenaSize + kExtraArenaSize;
 uint8_t tensor_arena[kTensorArenaSize];
 }  // namespace

From 1d8bc7222d341a28f0002589f910d432d2c2add0 Mon Sep 17 00:00:00 2001
From: Tomer Kaftan
Date: Thu, 21 May 2020 11:50:37 -0700
Subject: [PATCH 0984/1533] Switches keras.backend.placeholder +
 keras.backend.function to build a keras model when running eagerly (instead
 of trying to directly lift ops out of a graph into a ConcreteFunction).

Allows us to strip most of EagerExecutionFunction from the keras backend.

This has the effect of making keras.backend.placeholder + backend.function use
the same codepaths as the rest of Keras. This may have the following impact on
user code:

- keras.backend.function no longer supports the `updates` argument when eager
  execution is enabled.
- keras.backend.placeholder + keras.backend.function now have the same
  limitations as TF op layers when manipulating the placeholders directly with
  tf ops. This means no support outside of a layer for sparse ops & ops that
  operate on composite tensors.
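As an illustrative sketch only (not part of this patch; it assumes TF 2.x with
eager execution enabled, and the names, shapes, and values are arbitrary), the
new eager path serves a placeholder/function pair through a functional Keras
model built under the hood:

    import numpy as np
    import tensorflow as tf

    # With eager execution enabled, `placeholder` returns a Keras input tensor
    # and `function` builds a `Model(inputs=..., outputs=...)` internally.
    x = tf.keras.backend.placeholder(shape=(None, 4), dtype='float32')
    y = x * 2.0 + 1.0  # plain TF ops on the placeholder are captured as TF op layers
    f = tf.keras.backend.function([x], [y])
    print(f([np.ones((2, 4), dtype='float32')]))  # -> [array of shape (2, 4) filled with 3.0]
    # Passing `updates=...` to `function` now raises a ValueError when eager
    # execution is enabled.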
PiperOrigin-RevId: 312711373 Change-Id: Ie4bab440b83ea2becf1c83b83837771eba185ff5 --- tensorflow/python/keras/BUILD | 1 + tensorflow/python/keras/backend.py | 230 ++++++------------ tensorflow/python/keras/backend_test.py | 11 +- .../python/keras/engine/base_layer_utils.py | 5 +- tensorflow/python/keras/engine/input_layer.py | 7 +- .../python/keras/engine/training_utils.py | 2 +- .../keras/layers/tensorflow_op_layer_test.py | 7 +- .../python/keras/layers/wrappers_test.py | 8 +- tensorflow/python/keras/losses_test.py | 4 +- 9 files changed, 113 insertions(+), 162 deletions(-) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index 4cd0af07c74..78e360c8354 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -584,6 +584,7 @@ tf_py_test( deps = [ ":backend", ":combinations", + ":engine", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 11795625d06..d0c3eb03342 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -294,7 +294,6 @@ def clear_session(): global _GRAPH_VARIABLES # pylint: disable=global-variable-not-assigned global _GRAPH_TF_OPTIMIZERS # pylint: disable=global-variable-not-assigned global _GRAPH - global _FREEZABLE_VARS _GRAPH.graph = None ops.reset_default_graph() reset_uids() @@ -307,7 +306,6 @@ def clear_session(): _GRAPH_LEARNING_PHASES.setdefault(graph) _GRAPH_VARIABLES.pop(graph, None) _GRAPH_TF_OPTIMIZERS.pop(graph, None) - _FREEZABLE_VARS.pop(graph, None) @keras_export('keras.backend.manual_variable_initialization') @@ -1059,9 +1057,9 @@ def is_keras_tensor(x): >>> tf.keras.backend.is_keras_tensor(keras_var) False >>> keras_placeholder = tf.keras.backend.placeholder(shape=(2, 4, 5)) - >>> # A placeholder is not a Keras tensor. + >>> # A placeholder is a Keras tensor. >>> tf.keras.backend.is_keras_tensor(keras_placeholder) - False + True >>> keras_input = tf.keras.layers.Input([10]) >>> # An Input is a Keras tensor. >>> tf.keras.backend.is_keras_tensor(keras_input) @@ -1144,6 +1142,14 @@ def placeholder(shape=None, expand_composites=True) else: x = array_ops.placeholder(dtype, shape=shape, name=name) + + if context.executing_eagerly(): + # Add keras_history connectivity information to the placeholder + # when the placeholder is built in a top-level eager context + # (intended to be used with keras.backend.function) + from tensorflow.python.keras.engine import input_layer # pylint: disable=g-import-not-at-top + return input_layer.Input(tensor=x) + return x @@ -3379,7 +3385,7 @@ def get_value(x): if ops.executing_eagerly_outside_functions(): # This method of evaluating works inside the Keras FuncGraph. - return function([], x)(x) + return eval_in_eager_or_function(x) with x.graph.as_default(): return x.eval(session=get_session((x,))) @@ -3722,161 +3728,74 @@ class GraphExecutionFunction(object): return nest.map_structure(self._eval_if_composite, output_structure) -class EagerExecutionFunction(object): - """Helper class for constructing a TF graph function from the Keras graph. +def eval_in_eager_or_function(outputs): + """Method to evaluate a tensor in eager or in a tf.function. + + In the case of a tf.function, it will lift the tensor out of the function + and try to evaluate that piece of the graph. + + Warning: Do not add new usages of this function. 
+ TODO(b/150169018): delete this function once _keras_history_helper is no + longer needed, after Keras switches to KerasTensors and op layers + work via dispatch. Arguments: - inputs: Feed placeholders to the computation graph. - outputs: Output tensors to fetch. - updates: Additional update ops to be run at function call. - name: A name to help users identify what this function does. - session_kwargs: Unsupported. + outputs: tensors to fetch. + Returns: + The value of the tensors (as numpy arrays). """ + outputs_structure = outputs + outputs = nest.flatten(outputs, expand_composites=True) - def __init__(self, inputs, outputs, updates=None, name=None): - self.name = name - self._inputs_structure = inputs - inputs = nest.flatten(inputs, expand_composites=True) - self._outputs_structure = outputs - outputs = nest.flatten(outputs, expand_composites=True) + graphs = { + i.graph + for i in nest.flatten([outputs]) + if hasattr(i, 'graph') + } + if len(graphs) > 1: + raise ValueError('Cannot create an execution function which is comprised ' + 'of elements from multiple graphs.') - updates = updates or [] - if not isinstance(updates, (list, tuple)): - raise TypeError('`updates` in a Keras backend function ' - 'should be a list or tuple.') + source_graph = graphs.pop() - if updates and not outputs: - # Edge case; never happens in practice - raise ValueError('Cannot create a Keras backend function with updates' - ' but no outputs during eager execution.') - graphs = { - i.graph - for i in nest.flatten([inputs, outputs, updates]) - if hasattr(i, 'graph') - } - if len(graphs) > 1: - raise ValueError('Cannot create an execution function which is comprised ' - 'of elements from multiple graphs.') - - source_graph = graphs.pop() + with _scratch_graph() as exec_graph: global_graph = get_graph() + if source_graph not in (exec_graph, global_graph): + raise ValueError('Unknown graph. Aborting.') - updates_ops = [] - legacy_update_ops = [] - for update in updates: - # For legacy reasons it is allowed to pass an update as a tuple - # `(variable, new_value)` (this maps to an assign op). Otherwise it - # is assumed to already be an op -- we cannot control its execution - # order. - if isinstance(update, tuple): - legacy_update_ops.append(update) - else: - if hasattr(update, 'op'): - update = update.op - if update is not None: - # `update.op` may have been None in certain cases. - updates_ops.append(update) + if source_graph is global_graph and exec_graph is not global_graph: + init_tensors = outputs + lifted_map = lift_to_graph.lift_to_graph( + tensors=init_tensors, + graph=exec_graph, + sources=[], + add_sources=True, + handle_captures=True, + base_graph=source_graph) - self._freezable_vars_to_feed = [] - self._freezable_vars_values = [] - freezable_vars_from_keras_graph = object_identity.ObjectIdentitySet( - _FREEZABLE_VARS.get(global_graph, {})) - with _scratch_graph() as exec_graph: - global_graph = get_graph() - if source_graph not in (exec_graph, global_graph): - raise ValueError('Unknown graph. 
Aborting.') + outputs = [lifted_map[i] for i in outputs] - if source_graph is global_graph and exec_graph is not global_graph: - init_tensors = ( - outputs + updates_ops + [p for [p, _] in legacy_update_ops] + - [p_new for [_, p_new] in legacy_update_ops - if isinstance(p_new, ops.Tensor)]) - lifted_map = lift_to_graph.lift_to_graph( - tensors=init_tensors, - graph=exec_graph, - sources=inputs, - add_sources=True, - handle_captures=True, - base_graph=source_graph) + # Consolidate updates + with exec_graph.as_default(): + outputs = cast_variables_to_tensor(outputs) - inputs = [lifted_map[i] for i in inputs] - outputs = [lifted_map[i] for i in outputs] - updates_ops = [lifted_map[i] for i in updates_ops] - legacy_update_ops = [(lifted_map[p], lifted_map.get(p_new, p_new)) - for p, p_new in legacy_update_ops] + exec_graph.inputs = exec_graph.internal_captures + exec_graph.outputs = outputs + graph_fn = eager_function.ConcreteFunction(exec_graph) - # Keep track of the value to feed to any "freezable variables" - # created in this graph. - for old_op, new_op in lifted_map.items(): - if old_op in freezable_vars_from_keras_graph: - frozen_var = old_op - if frozen_var._initial_value != frozen_var._current_value: - # We only feed a frozen_variable if its value has changed; - # otherwise it can rely on the default value of the - # underlying placeholder_with_default. - self._freezable_vars_to_feed.append(new_op) - self._freezable_vars_values.append(frozen_var._current_value) + graph_fn._num_positional_args = 0 + graph_fn._arg_keywords = [] - # Consolidate updates - with exec_graph.as_default(): - outputs = cast_variables_to_tensor(outputs) - with ops.control_dependencies(outputs): - for p, p_new in legacy_update_ops: - updates_ops.append(state_ops.assign(p, p_new)) + outputs = graph_fn() - self.inputs, self.outputs = inputs, outputs - self._input_references = self.inputs + self._freezable_vars_to_feed - with ops.control_dependencies(updates_ops): - self.outputs[0] = array_ops.identity(self.outputs[0]) - - exec_graph.inputs = self._input_references + exec_graph.internal_captures - exec_graph.outputs = self.outputs - graph_fn = eager_function.ConcreteFunction(exec_graph) - - graph_fn._num_positional_args = len(self._input_references) - graph_fn._arg_keywords = [] - self._graph_fn = graph_fn - - # Handle placeholders with default - # (treated as required placeholder by graph functions) - self._placeholder_default_values = {} - with exec_graph.as_default(): - for x in self.inputs: - if x.op.type == 'PlaceholderWithDefault': - self._placeholder_default_values[ops.tensor_id( - x)] = tensor_util.constant_value(x.op.inputs[0]) - - def __call__(self, inputs): - input_values = nest.flatten(inputs, expand_composites=True) - - if self._freezable_vars_values: - input_values = input_values + self._freezable_vars_values - converted_inputs = [] - for tensor, value in zip(self._input_references, input_values): - if value is None: - # Assume `value` is a placeholder with default - value = self._placeholder_default_values.get( - ops.tensor_id(tensor), None) - if value is None: - raise ValueError( - 'You must feed a value for placeholder %s' % (tensor,)) - if not isinstance(value, ops.Tensor): - value = ops.convert_to_tensor_v2(value, dtype=tensor.dtype) - if value.dtype != tensor.dtype: - # Temporary workaround due to `convert_to_tensor` not casting floats. 
- # See b/119637405 - value = math_ops.cast(value, tensor.dtype) - converted_inputs.append(value) - outputs = self._graph_fn(*converted_inputs) - - # EagerTensor.numpy() will often make a copy to ensure memory safety. - # However in this case `outputs` is not directly returned, so it is always - # safe to reuse the underlying buffer without checking. In such a case the - # private numpy conversion method is preferred to guarantee performance. - return nest.pack_sequence_as( - self._outputs_structure, - [x._numpy() for x in outputs], # pylint: disable=protected-access - expand_composites=True) + # EagerTensor.numpy() will often make a copy to ensure memory safety. + # However in this case `outputs` is not directly returned, so it is always + # safe to reuse the underlying buffer without checking. In such a case the + # private numpy conversion method is preferred to guarantee performance. + return nest.pack_sequence_as( + outputs_structure, + [x._numpy() for x in outputs], # pylint: disable=protected-access + expand_composites=True) @keras_export('keras.backend.function') @@ -3900,7 +3819,20 @@ def function(inputs, outputs, updates=None, name=None, **kwargs): if kwargs: raise ValueError('Session keyword arguments are not support during ' 'eager execution. You passed: %s' % (kwargs,)) - return EagerExecutionFunction(inputs, outputs, updates=updates, name=name) + if updates: + raise ValueError('`updates` argument is not support during ' + 'eager execution. You passed: %s' % (updates,)) + from tensorflow.python.keras import models # pylint: disable=g-import-not-at-top + from tensorflow.python.keras.utils import tf_utils # pylint: disable=g-import-not-at-top + model = models.Model(inputs=inputs, outputs=outputs) + + wrap_outputs = isinstance(outputs, list) and len(outputs) == 1 + def func(model_inputs): + outs = model(model_inputs) + if wrap_outputs: + outs = [outs] + return tf_utils.to_numpy_or_python_type(outs) + return func if kwargs: for key in kwargs: @@ -6344,10 +6276,6 @@ class ContextValueCache(weakref.WeakKeyDictionary): # either train mode (learning_phase == 1) or test mode (learning_phase == 0). _GRAPH_LEARNING_PHASES = ContextValueCache(_default_learning_phase) -# This dictionary holds a mapping {graph: set_of_freezable_variables}. -# Each set tracks objects created via `freezable_variable` in the graph. -_FREEZABLE_VARS = ContextValueCache(object_identity.ObjectIdentityWeakSet) - # This dictionary holds a mapping between a graph and variables to initialize # in the graph. _GRAPH_VARIABLES = ContextValueCache(object_identity.ObjectIdentityWeakSet) diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py index 1adc20652b2..20547c570c7 100644 --- a/tensorflow/python/keras/backend_test.py +++ b/tensorflow/python/keras/backend_test.py @@ -1677,8 +1677,10 @@ class BackendCrossEntropyLossesTest(test.TestCase, parameterized.TestCase): t, p, from_logits=True, axis=0), self.assertArrayNear(self.evaluate(result)[0], [.002, 0, .17], 1e-3) - @combinations.generate(combinations.combine(mode=['graph', 'eager'])) + @combinations.generate(combinations.combine(mode=['graph'])) def test_sparse_categorical_crossentropy_loss_with_unknown_rank_tensor(self): + # This test only runs in graph because the TF op layer is not supported yet + # for sparse ops. 
t = backend.placeholder() p = backend.placeholder() o = backend.sparse_categorical_crossentropy(t, p) @@ -1870,6 +1872,8 @@ class TestRandomOps(test.TestCase): class FunctionTest(test.TestCase): def test_function_basics(self): + if context.executing_eagerly(): + self.skipTest('eager backend.function does not support updates') x1 = backend.placeholder(shape=(), dtype='float32') x2 = backend.placeholder(shape=(), dtype='int32') v = backend.variable(10.) @@ -1916,6 +1920,9 @@ class FunctionTest(test.TestCase): self.assertEqual(result, 4.) def test_tuple_updates(self): + if context.executing_eagerly(): + self.skipTest('eager backend.function does not support updates') + x_ph = backend.placeholder(ndim=2) v = backend.variable(np.ones((4, 2))) output = x_ph ** 2 + v @@ -1929,7 +1936,7 @@ class FunctionTest(test.TestCase): class BackendGraphTests(test.TestCase, parameterized.TestCase): - @combinations.generate(combinations.combine(mode=['graph', 'eager'])) + @combinations.generate(combinations.combine(mode=['graph'])) def test_function_placeholder_with_default(self): with backend.get_graph().as_default(): x1 = array_ops.placeholder_with_default( diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index 5980eeaf115..7e4e0e5da4a 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -248,7 +248,10 @@ def _create_keras_history_helper(tensors, processed_ops, created_layers): constants[i] = op_input else: with ops.init_scope(): - constants[i] = backend.function([], op_input)([]) + if ops.executing_eagerly_outside_functions(): + constants[i] = backend.eval_in_eager_or_function(op_input) + else: + constants[i] = backend.function([], op_input)([]) layer_inputs = unnest_if_single_tensor(layer_inputs) processed_ops, created_layers = _create_keras_history_helper( layer_inputs, processed_ops, created_layers) diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py index ed715f61897..1fa380815fc 100644 --- a/tensorflow/python/keras/engine/input_layer.py +++ b/tensorflow/python/keras/engine/input_layer.py @@ -161,8 +161,11 @@ class InputLayer(base_layer.Layer): 'InputLayer, you should instantiate your model and ' 'directly call it on your input.') self.is_placeholder = False - self._batch_input_shape = tuple(input_tensor.shape.as_list()) - + try: + self._batch_input_shape = tuple(input_tensor.shape.as_list()) + except ValueError: + # If the shape cannot be represented as a tuple (e.g. unknown rank) + self._batch_input_shape = None # Create an input node. 
input_tensor._keras_mask = None node_module.Node(layer=self, outputs=input_tensor) diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py index 680f33f75a5..0d7637cb98c 100644 --- a/tensorflow/python/keras/engine/training_utils.py +++ b/tensorflow/python/keras/engine/training_utils.py @@ -1935,7 +1935,7 @@ def get_input_shape_and_dtype(layer): raise ValueError('An empty Model cannot be used as a Layer.') layer = layer.layers[0] - if hasattr(layer, '_batch_input_shape'): + if getattr(layer, '_batch_input_shape', None): return layer._batch_input_shape, layer.dtype return None, None diff --git a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py index 73e395f5715..1a328995a80 100644 --- a/tensorflow/python/keras/layers/tensorflow_op_layer_test.py +++ b/tensorflow/python/keras/layers/tensorflow_op_layer_test.py @@ -288,9 +288,10 @@ class AutoLambdaTest(keras_parameterized.TestCase): constant_op.constant(40.0, shape=(1, 1))) def test_no_tracking(self): - x = keras.backend.placeholder((10, 10)) - keras.layers.Dense(1)(x) - self.assertTrue(x._keras_history_checked) + if not context.executing_eagerly(): + x = constant_op.constant(1.0, shape=(10, 10)) + keras.layers.Dense(1)(x) + self.assertTrue(x._keras_history_checked) def test_timing_scales_linearly(self): diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py index bb22db25591..a73177fff12 100644 --- a/tensorflow/python/keras/layers/wrappers_test.py +++ b/tensorflow/python/keras/layers/wrappers_test.py @@ -33,6 +33,7 @@ from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import base_layer_utils +from tensorflow.python.keras.layers import core from tensorflow.python.keras.layers.rnn_cell_wrapper_v2 import ResidualWrapper from tensorflow.python.keras.utils import generic_utils from tensorflow.python.ops import array_ops @@ -1213,9 +1214,14 @@ class BidirectionalTest(test.TestCase, parameterized.TestCase): f_merged = keras.backend.function([inputs], layer(inputs)) f_forward = keras.backend.function([inputs], layer.forward_layer(inputs)) + + # TODO(kaftan): after KerasTensor refactor TF op layers should work + # with many composite tensors, and this shouldn't need to be a lambda + # layer. + reverse_layer = core.Lambda(array_ops.reverse, arguments=dict(axis=[1])) f_backward = keras.backend.function( [inputs], - array_ops.reverse(layer.backward_layer(inputs), axis=[1])) + reverse_layer(layer.backward_layer(inputs))) y_merged = f_merged(x) y_expected = merge_func( diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py index 574d3d3f756..26a586b872b 100644 --- a/tensorflow/python/keras/losses_test.py +++ b/tensorflow/python/keras/losses_test.py @@ -125,8 +125,10 @@ class KerasLossesTest(test.TestCase, parameterized.TestCase): backend.eval(output_from_softmax), atol=1e-5) - @combinations.generate(combinations.combine(mode=['graph', 'eager'])) + @combinations.generate(combinations.combine(mode=['graph'])) def test_sparse_categorical_crossentropy_loss_with_unknown_rank_tensor(self): + # This test only runs in graph because the TF op layer is not supported yet + # for sparse ops. 
t = backend.placeholder() p = backend.placeholder() o = losses.sparse_categorical_crossentropy(t, p) From d3cd2a76cc1984b7c1aa6efca74e4e26c359f460 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 11:54:42 -0700 Subject: [PATCH 0985/1533] Internal change PiperOrigin-RevId: 312712086 Change-Id: Iba2311e8ac40ebe73765f273ef48f5550d76fcfc --- .../python/kernel_tests/conv_ops_test.py | 52 --- tensorflow/python/ops/nn_ops.py | 325 +++++------------- 2 files changed, 81 insertions(+), 296 deletions(-) diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index e01abc8133d..18b7a47fc8c 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -455,58 +455,6 @@ class Conv2DTest(test.TestCase): conv1, self.evaluate(conv2).reshape(conv1.shape)) - @test_util.run_in_graph_and_eager_modes - def testConvolutionClass2DExpandedBatch(self): - tensor_in_sizes_batch = [10, 2, 3, 3] - tensor_in_sizes_expanded_batch = [2, 5, 2, 3, 3] - filter_in_sizes = [1, 1, 3, 3] - filter_in = self._CreateNumpyTensor(filter_in_sizes) - x1 = self._CreateNumpyTensor(tensor_in_sizes_batch) - x2 = x1.reshape(tensor_in_sizes_expanded_batch) - convolver1 = nn_ops.Convolution( - input_shape=x1.shape, - filter_shape=filter_in.shape, - strides=[1, 1], - padding="VALID") - self.assertEqual(convolver1.num_batch_dims, 1) - convolver2 = nn_ops.Convolution( - input_shape=x2.shape, - filter_shape=filter_in.shape, - strides=[1, 1], - padding="VALID") - self.assertEqual(convolver2.num_batch_dims, 2) - conv1 = convolver1(x1, filter_in) - conv2 = convolver2(x2, filter_in) - self.assertEqual(conv1.shape, tensor_in_sizes_batch) - self.assertEqual(conv2.shape, tensor_in_sizes_expanded_batch) - self.assertAllEqual( - conv1, - self.evaluate(conv2).reshape(conv1.shape)) - - @test_util.run_in_graph_and_eager_modes - def testConvolutionWith2SpatialDimensionsAndExpandedBatch(self): - tensor_in_sizes_batch = [10, 2, 3, 3] - tensor_in_sizes_expanded_batch = [2, 5, 2, 3, 3] - filter_in_sizes = [1, 1, 3, 3] - filter_in = self._CreateNumpyTensor(filter_in_sizes) - x1 = self._CreateNumpyTensor(tensor_in_sizes_batch) - x2 = x1.reshape(tensor_in_sizes_expanded_batch) - conv1 = nn_ops.convolution( - x1, - filter_in, - strides=[1, 1], - padding="VALID") - conv2 = nn_ops.convolution( - x2, - filter_in, - strides=[1, 1], - padding="VALID") - self.assertEqual(conv1.shape, tensor_in_sizes_batch) - self.assertEqual(conv2.shape, tensor_in_sizes_expanded_batch) - self.assertAllEqual( - conv1, - self.evaluate(conv2).reshape(conv1.shape)) - @test_util.run_in_graph_and_eager_modes def testConv2D2x2Filter2x1Dilation(self): self._VerifyDilatedConvValues( diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 4c6efe61621..4c00d085f82 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -131,9 +131,9 @@ def _non_atrous_convolution( """ with ops.name_scope(name, "non_atrous_convolution", [input, filter]) as scope: input = ops.convert_to_tensor(input, name="input") # pylint: disable=redefined-builtin - input_shape = input.shape + input_shape = input.get_shape() filter = ops.convert_to_tensor(filter, name="filter") # pylint: disable=redefined-builtin - filter_shape = filter.shape + filter_shape = filter.get_shape() op = _NonAtrousConvolution( input_shape, filter_shape=filter_shape, @@ -148,51 +148,36 @@ class _NonAtrousConvolution(object): """Helper class for 
_non_atrous_convolution. Note that this class assumes that shapes of input and filter passed to - `__call__` are compatible with `input_shape` and filter_shape passed to the + __call__ are compatible with input_shape and filter_shape passed to the constructor. Arguments: - input_shape: static input shape, i.e. input.shape. - filter_shape: static filter shape, i.e. filter.shape. + input_shape: static input shape, i.e. input.get_shape(). + filter_shape: static filter shape, i.e. filter.get_shape(). padding: see _non_atrous_convolution. data_format: see _non_atrous_convolution. strides: see _non_atrous_convolution. name: see _non_atrous_convolution. - num_batch_dims: (Optional.) The number of batch dimensions in the input; - if not provided, the default of `1` is used. """ def __init__( self, input_shape, - filter_shape, + filter_shape, # pylint: disable=redefined-builtin padding, data_format=None, strides=None, - name=None, - num_batch_dims=1): - # filter shape is always rank num_spatial_dims + 2 - # and num_spatial_dims == input_shape.ndims - num_batch_dims - 1 - if input_shape.ndims is not None: - filter_shape = filter_shape.with_rank( - input_shape.ndims - num_batch_dims + 1) + name=None): + filter_shape = filter_shape.with_rank(input_shape.ndims) self.padding = padding self.name = name - # input shape is == num_spatial_dims + num_batch_dims + 1 - # and filter_shape is always rank num_spatial_dims + 2 - if filter_shape.ndims is not None: - input_shape = input_shape.with_rank( - filter_shape.ndims + num_batch_dims - 1) + input_shape = input_shape.with_rank(filter_shape.ndims) if input_shape.ndims is None: + raise ValueError("Rank of convolution must be known") + if input_shape.ndims < 3 or input_shape.ndims > 5: raise ValueError( - "Rank of convolution must be known, but saw input_shape.ndims == {}" - .format(input_shape.ndims)) - if input_shape.ndims < 3 or input_shape.ndims - num_batch_dims + 1 > 5: - raise ValueError( - "`input_shape.ndims - num_batch_dims + 1` must be at least 3 and at " - "most 5 but saw `input_shape.ndims == {}` and `num_batch_dims == {}`" - .format(input_shape.ndims, num_batch_dims)) - conv_dims = input_shape.ndims - num_batch_dims - 1 + "`input` and `filter` must have rank at least 3 and at most 5") + conv_dims = input_shape.ndims - 2 if strides is None: strides = [1] * conv_dims elif len(strides) != conv_dims: @@ -535,7 +520,7 @@ def with_space_to_batch( """ input = ops.convert_to_tensor(input, name="input") # pylint: disable=redefined-builtin - input_shape = input.shape + input_shape = input.get_shape() def build_op(num_spatial_dims, padding): return lambda inp, _: op(inp, num_spatial_dims, padding) @@ -555,19 +540,18 @@ class _WithSpaceToBatch(object): """Helper class for with_space_to_batch. Note that this class assumes that shapes of input and filter passed to - `__call__` are compatible with `input_shape`, `filter_shape`, and - `spatial_dims` passed to the constructor. + __call__ are compatible with input_shape and filter_shape passed to the + constructor. Arguments - input_shape: static shape of input. i.e. input.shape. - dilation_rate: see `with_space_to_batch`. - padding: see `with_space_to_batch`. + input_shape: static shape of input. i.e. input.get_shape(). + dilation_rate: see with_space_to_batch + padding: see with_space_to_batch build_op: Function that maps (num_spatial_dims, paddings) -> (function that maps (input, filter) -> output). - filter_shape: see `with_space_to_batch`. - spatial_dims: `see with_space_to_batch`. 
- data_format: see `with_space_to_batch`. - num_batch_dims: (Optional). Number of batch dims in `input_shape`. + filter_shape: see with_space_to_batch + spatial_dims: see with_space_to_batch + data_format: see with_space_to_batch """ def __init__(self, @@ -577,25 +561,24 @@ class _WithSpaceToBatch(object): build_op, filter_shape=None, spatial_dims=None, - data_format=None, - num_batch_dims=1): + data_format=None): """Helper class for _with_space_to_batch.""" dilation_rate = ops.convert_to_tensor( dilation_rate, dtypes.int32, name="dilation_rate") - if dilation_rate.shape.ndims not in (None, 1): - raise ValueError( - "rate must be rank 1 but saw {}".format(dilation_rate.shape.ndims)) + try: + rate_shape = dilation_rate.get_shape().with_rank(1) + except ValueError: + raise ValueError("rate must be rank 1") - if not dilation_rate.shape.is_fully_defined(): - raise ValueError("rate must have known shape, but saw {}" - .format(dilation_rate.shape)) + if not dilation_rate.get_shape().is_fully_defined(): + raise ValueError("rate must have known shape") - num_spatial_dims = dilation_rate.shape.dims[0].value + num_spatial_dims = rate_shape.dims[0].value if data_format is not None and data_format.startswith("NC"): - starting_spatial_dim = num_batch_dims + 1 + starting_spatial_dim = 2 else: - starting_spatial_dim = num_batch_dims + starting_spatial_dim = 1 if spatial_dims is None: spatial_dims = range(starting_spatial_dim, @@ -605,7 +588,7 @@ class _WithSpaceToBatch(object): if spatial_dims != orig_spatial_dims or any(x < 1 for x in spatial_dims): raise ValueError( "spatial_dims must be a monotonically increasing sequence of " - "positive integers, but saw: {}".format(orig_spatial_dims)) + "positive integers") if data_format is not None and data_format.startswith("NC"): expected_input_rank = spatial_dims[-1] @@ -616,16 +599,14 @@ class _WithSpaceToBatch(object): input_shape.with_rank_at_least(expected_input_rank) except ValueError: raise ValueError( - "input tensor must have rank at least {}, but saw rank {}" - .format(expected_input_rank, input_shape.ndims)) + "input tensor must have rank %d at least" % (expected_input_rank)) const_rate = tensor_util.constant_value(dilation_rate) rate_or_const_rate = dilation_rate if const_rate is not None: rate_or_const_rate = const_rate if np.any(const_rate < 1): - raise ValueError("dilation_rate must be positive, but saw: {}" - .format(const_rate)) + raise ValueError("dilation_rate must be positive") if np.all(const_rate == 1): self.call = build_op(num_spatial_dims, padding) return @@ -691,7 +672,6 @@ class _WithSpaceToBatch(object): filter_shape = array_ops.shape(filter) base_paddings = _with_space_to_batch_base_paddings( filter_shape, self.num_spatial_dims, self.rate_or_const_rate) - paddings, crops = array_ops.required_space_to_batch_paddings( input_shape=input_spatial_shape, base_paddings=base_paddings, @@ -1014,83 +994,31 @@ def convolution_internal( data_format=None, dilations=None, name=None, - call_from_convolution=True, - num_spatial_dims=None): - """Internal function which performs rank agnostic convolution. - - Args: - input: See `convolution`. - filters: See `convolution`. - strides: See `convolution`. - padding: See `convolution`. - data_format: See `convolution`. - dilations: See `convolution`. - name: See `convolution`. - call_from_convolution: See `convolution`. - num_spatial_dims: (Optional.). It is a integer describing the - rank of the spatial dimensions. 
For `1-D`, `2-D` and `3-D` convolutions, - the value of `num_spatial_dims` is `1`, `2`, and `3`, respectively. - This argument is only required to disambiguate the rank of `batch_shape` - when `filter_shape.ndims is None` and `len(batch_shape) > 1`. For - backwards compatibility, if `num_spatial_dims is None` and - `filter_shape.ndims is None`, then `len(batch_shape)` is assumed to be - `1` (i.e., the input is expected to be - `[batch_size, num_channels] + input_spatial_shape` - or `[batch_size] + input_spatial_shape + [num_channels]`. - - Returns: - A tensor of shape and dtype matching that of `input`. - - Raises: - ValueError: If input and filter both have unknown shapes, or if - `num_spatial_dims` is provided and incompatible with the value - estimated from `filters.shape`. - """ - n = None - if isinstance(filters, (list, tuple)): - filters = np.asarray(filters) - if (isinstance(filters.shape, tensor_shape.TensorShape) - and filters.shape.rank is not None): + call_from_convolution=True): + """Internal function which performs rank agnostic convolution.""" + if isinstance(input.shape, tensor_shape.TensorShape) and \ + input.shape.rank is not None: + n = len(input.shape) - 2 + elif not isinstance(input.shape, tensor_shape.TensorShape) and \ + input.shape is not None: + n = len(input.shape) - 2 + elif isinstance(filters.shape, tensor_shape.TensorShape) and \ + filters.shape.rank is not None: n = len(filters.shape) - 2 - elif (not isinstance(filters.shape, tensor_shape.TensorShape) - and filters.shape is not None): + elif not isinstance(filters.shape, tensor_shape.TensorShape) and \ + filters.shape is not None: n = len(filters.shape) - 2 - - if (isinstance(input.shape, tensor_shape.TensorShape) - and input.shape.rank is not None): - if n is None: - n = (num_spatial_dims if num_spatial_dims is not None - else len(input.shape) - 2) - num_batch_dims = len(input.shape) - n - 1 - elif (not isinstance(input.shape, tensor_shape.TensorShape) - and input.shape is not None): - if n is None: - n = (num_spatial_dims if num_spatial_dims is not None - else len(input.shape) - 2) - num_batch_dims = len(input.shape) - n - 1 else: - num_batch_dims = 1 # Default behavior if it cannot be estimated. - - if n is None: raise ValueError("rank of input or filter must be known") - if num_spatial_dims is not None and n != num_spatial_dims: - raise ValueError( - "inconsistent estimate of spatial dims ({}) vs. actual passed " - "num_spatial_dims ({}). n was estimated as len(filters.shape) - 2, " - "but filters shape is: {}".format(n, num_spatial_dims, filters.shape)) - if not 1 <= n <= 3: raise ValueError( - "num_spatial_dims (input.shape.ndims - num_batch_dims - 1) must be one " - "of 1, 2 or 3 but saw {}. num_batch_dims: {}." 
- .format(n, num_batch_dims)) + "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2)) if data_format is None: - channel_index = num_batch_dims + n + channel_index = n + 1 else: - channel_index = ( - num_batch_dims if data_format.startswith("NC") else n + num_batch_dims) + channel_index = 1 if data_format.startswith("NC") else n + 1 strides = _get_sequence(strides, n, channel_index, "strides") dilations = _get_sequence(dilations, n, channel_index, "dilations") @@ -1103,7 +1031,7 @@ def convolution_internal( scope = "convolution" with ops.name_scope(name, scope, [input, filters]) as name: - conv_ops = {1: conv1d, 2: _conv2d_expanded_batch, 3: gen_nn_ops.conv3d} + conv_ops = {1: conv1d, 2: gen_nn_ops.conv2d, 3: gen_nn_ops.conv3d} if device_context.enclosing_tpu_context() is not None or all( i == 1 for i in dilations): @@ -1133,8 +1061,7 @@ def convolution_internal( strides=strides, dilation_rate=dilations, name=name, - data_format=data_format, - num_spatial_dims=n) + data_format=data_format) return op(input, filters) @@ -1142,34 +1069,17 @@ class Convolution(object): """Helper class for convolution. Note that this class assumes that shapes of input and filter passed to - `__call__` are compatible with `input_shape`, `filter_shape`, and - `num_spatial_dims` passed to the constructor. + __call__ are compatible with input_shape and filter_shape passed to the + constructor. Arguments - input_shape: static shape of input. i.e. input.shape. Its length is - `batch_shape + input_spatial_shape + [num_channels]` if `data_format` - does not start with `NC`, or - `batch_shape + [num_channels] + input_spatial_shape` if `data_format` - starts with `NC`. - filter_shape: static shape of the filter. i.e. filter.shape. - padding: The padding algorithm, must be "SAME" or "VALID". + input_shape: static shape of input. i.e. input.get_shape(). + filter_shape: static shape of the filter. i.e. filter.get_shape(). + padding: see convolution. strides: see convolution. dilation_rate: see convolution. name: see convolution. - data_format: A string or `None`. Specifies whether the channel dimension of - the `input` and output is the last dimension (if `data_format` is `None` - or does not start with `NC`), or the first post-batch dimension (i.e. if - `data_format` starts with `NC`). - num_spatial_dims: (Usually optional.) Python integer, the rank of the - spatial and channel dimensions. For `1-D`, `2-D` and `3-D` convolutions, - the value of `num_spatial_dims` is `1`, `2`, and `3`, respectively. - This argument is only required to disambiguate the rank of `batch_shape` - when `filter_shape.ndims is None` and `len(batch_shape) > 1`. For - backwards compatibility, if `num_spatial_dims is None` and - `filter_shape.ndims is None`, then `len(batch_shape)` is assumed to be - `1` (i.e., the input is expected to be - `[batch_size, num_channels] + input_spatial_shape` - or `[batch_size] + input_spatial_shape + [num_channels]`. + data_format: see convolution. 
""" def __init__(self, @@ -1179,72 +1089,40 @@ class Convolution(object): strides=None, dilation_rate=None, name=None, - data_format=None, - num_spatial_dims=None): + data_format=None): """Helper function for convolution.""" - num_batch_dims = None - filter_shape = tensor_shape.as_shape(filter_shape) - input_shape = tensor_shape.as_shape(input_shape) + num_total_dims = filter_shape.ndims + if num_total_dims is None: + num_total_dims = input_shape.ndims + if num_total_dims is None: + raise ValueError("rank of input or filter must be known") - if filter_shape.ndims is not None: - if (num_spatial_dims is not None and - filter_shape.ndims != num_spatial_dims + 2): - raise ValueError( - "Expected filter_shape.ndims == num_spatial_dims + 2, " - "but saw filter_shape.ndims == {} and num_spatial_dims == {}" - .format(filter_shape.ndims, num_spatial_dims)) - else: - num_spatial_dims = filter_shape.ndims - 2 + num_spatial_dims = num_total_dims - 2 - if input_shape.ndims is not None and num_spatial_dims is not None: - num_batch_dims = input_shape.ndims - num_spatial_dims - 1 - - if num_spatial_dims is None: - num_spatial_dims = input_shape.ndims - 2 - else: - if input_shape.ndims is not None: - if input_shape.ndims < num_spatial_dims + 2: - raise ValueError( - "Expected input_shape.ndims >= num_spatial_dims + 2, but saw " - "input_shape.ndims == {} and num_spatial_dims == {}" - .format(input_shape.ndims, num_spatial_dims)) - else: - if num_batch_dims is None: - num_batch_dims = input_shape.ndims - num_spatial_dims - 1 - - if num_spatial_dims is None: + try: + input_shape.with_rank(num_spatial_dims + 2) + except ValueError: raise ValueError( - "Cannot estimate num_spatial_dims since input_shape.ndims is None, " - "filter_shape.ndims is None, and argument num_spatial_dims is also " - "None.") + "input tensor must have rank %d" % (num_spatial_dims + 2)) - if num_batch_dims is None: - num_batch_dims = 1 - - if num_batch_dims < 1: + try: + filter_shape.with_rank(num_spatial_dims + 2) + except ValueError: raise ValueError( - "num_batch_dims should be >= 1, but saw {}. num_batch_dims was " - "estimated as `input_shape.ndims - num_spatial_dims - 1` and " - "num_spatial_dims was either provided or estimated as " - "`filter_shape.ndims - 2`. 
input_shape.ndims: {}, " - "num_spatial_dims: {}, filter_shape.ndims: {}" - .format(num_batch_dims, input_shape.ndims, num_spatial_dims, - filter_shape.ndims)) + "filter tensor must have rank %d" % (num_spatial_dims + 2)) if data_format is None or not data_format.startswith("NC"): input_channels_dim = tensor_shape.dimension_at_index( - input_shape, num_spatial_dims + num_batch_dims) - spatial_dims = range(num_batch_dims, num_spatial_dims + num_batch_dims) + input_shape, num_spatial_dims + 1) + spatial_dims = range(1, num_spatial_dims + 1) else: - input_channels_dim = tensor_shape.dimension_at_index( - input_shape, num_batch_dims) - spatial_dims = range( - num_batch_dims + 1, num_spatial_dims + num_batch_dims + 1) + input_channels_dim = tensor_shape.dimension_at_index(input_shape, 1) + spatial_dims = range(2, num_spatial_dims + 2) if not input_channels_dim.is_compatible_with( filter_shape[num_spatial_dims]): raise ValueError( - "Number of input channels does not match corresponding dimension of " + "number of input channels does not match corresponding dimension of " "filter, {} != {}".format(input_channels_dim, filter_shape[num_spatial_dims])) @@ -1258,8 +1136,6 @@ class Convolution(object): self.padding = padding self.name = name self.dilation_rate = dilation_rate - self.num_batch_dims = num_batch_dims - self.num_spatial_dims = num_spatial_dims self.conv_op = _WithSpaceToBatch( input_shape, dilation_rate=dilation_rate, @@ -1267,8 +1143,7 @@ class Convolution(object): build_op=self._build_op, filter_shape=filter_shape, spatial_dims=spatial_dims, - data_format=data_format, - num_batch_dims=num_batch_dims) + data_format=data_format) def _build_op(self, _, padding): return _NonAtrousConvolution( @@ -1277,8 +1152,7 @@ class Convolution(object): padding=padding, data_format=self.data_format, strides=self.strides, - name=self.name, - num_batch_dims=self.num_batch_dims) + name=self.name) def __call__(self, inp, filter): # pylint: disable=redefined-builtin # TPU convolution supports dilations greater than 1. @@ -1291,8 +1165,7 @@ class Convolution(object): data_format=self.data_format, dilations=self.dilation_rate, name=self.name, - call_from_convolution=False, - num_spatial_dims=self.num_spatial_dims) + call_from_convolution=False) else: return self.conv_op(inp, filter) @@ -2519,42 +2392,6 @@ def conv2d_transpose_v2( name=name) -def _conv2d_expanded_batch( - input, # pylint: disable=redefined-builtin - filters, - strides, - padding, - data_format, - dilations, - name): - """Helper function for `convolution_internal`; handles expanded batches.""" - # Try really hard to avoid modifying the legacy name scopes - return early. - shape = getattr(input, "shape", None) - if shape is not None: - ndims = getattr(shape, "ndims", -1) - if ndims == -1: ndims = len(shape) - if ndims in (4, 3, 2, 1, 0, None): - return gen_nn_ops.conv2d( - input, - filter=filters, - strides=strides, - padding=padding, - data_format=data_format, - dilations=dilations, - name=name) - return _squeeze_batch_dims( - input, - functools.partial( - gen_nn_ops.conv2d, - filter=filters, - strides=strides, - padding=padding, - data_format=data_format, - dilations=dilations), - inner_rank=3, - name=name) - - @tf_export("nn.atrous_conv2d_transpose") @dispatch.add_dispatch_support def atrous_conv2d_transpose(value, From 37ab9c3bfcbb278ae003cf32f08b3d41a78401a7 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Thu, 21 May 2020 12:18:17 -0700 Subject: [PATCH 0986/1533] Take device locality into account during prioritization. 
After this CL, if multiple devices with identical device type are viable for a placement of an op, the local device (if available) will be selected. (Prior to this change, the device whose job name comes first alphabetically would be selected.) PiperOrigin-RevId: 312716604 Change-Id: I484c00cf0d34acc23c32ab8dd1cc5c394d32f0f3 --- tensorflow/core/common_runtime/device_set.cc | 7 +++++-- tensorflow/core/common_runtime/device_set.h | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/device_set.cc b/tensorflow/core/common_runtime/device_set.cc index b062529a3ff..902ca2c2ee2 100644 --- a/tensorflow/core/common_runtime/device_set.cc +++ b/tensorflow/core/common_runtime/device_set.cc @@ -116,12 +116,15 @@ void DeviceSet::SortPrioritizedDeviceVector(PrioritizedDeviceVector* vector) { if (a_type_name != b_type_name) { auto a_priority = DeviceFactory::DevicePriority(a_type_name); auto b_priority = DeviceFactory::DevicePriority(b_type_name); - // First sort by prioritized device type (higher is preferred) and - // then by device name (lexicographically). if (a_priority != b_priority) { return a_priority > b_priority; } } + + if (a.first->IsLocal() != b.first->IsLocal()) { + return a.first->IsLocal(); + } + return StringPiece(a.first->name()) < StringPiece(b.first->name()); }; std::sort(vector->begin(), vector->end(), device_sort); diff --git a/tensorflow/core/common_runtime/device_set.h b/tensorflow/core/common_runtime/device_set.h index 608705c32f7..f59f84c2066 100644 --- a/tensorflow/core/common_runtime/device_set.h +++ b/tensorflow/core/common_runtime/device_set.h @@ -90,8 +90,8 @@ class DeviceSet { // // After a call to this function, the argument vector will be sorted by // explicit priority (the second element in the `std::pair`), then by `DeviceTypeOrder` of the device type, and lastly - // by device name. + // int32>`), then by `DeviceTypeOrder` of the device type, then by device + // locality, and lastly by device name. static void SortPrioritizedDeviceVector(PrioritizedDeviceVector* vector); // Sorts a PrioritizedDeviceTypeVector according to types and explicit From 49fd845a78752a672479fcad803763fc72ccba2d Mon Sep 17 00:00:00 2001 From: Robert David Date: Thu, 21 May 2020 12:36:24 -0700 Subject: [PATCH 0987/1533] Nit: Fix some typos. PiperOrigin-RevId: 312719982 Change-Id: I8e911c38bf2416b961ef2f4ddd8eb888504d73bf --- .../lite/delegates/gpu/common/testing/feature_parity/utils.cc | 2 +- .../lite/delegates/gpu/common/testing/feature_parity/utils.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.cc b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.cc index 8f6e3cc64bf..bdcbf7ed62e 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.cc +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.cc @@ -116,7 +116,7 @@ absl::optional CoordinateToString(TfLiteIntArray* shape, return result; } -// Builds intepreter for a model, allocates tensors. +// Builds interpreter for a model, allocates tensors. 
absl::Status BuildInterpreter(const Model* model, std::unique_ptr* interpreter) { TfLiteStatus status = diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h index 68c4a1a0d1e..7c34978fb55 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/utils.h @@ -115,7 +115,7 @@ class TensorEqMatcher { return false; } - // 4. Proceed to data comparison. Iterate throught elements as they lay + // 4. Proceed to data comparison. Iterate through elements as they lay // flat. If some pair of elements don't match, deduct the coordinate // basing on the dimensions, then return. absl::Span lhs_span(lhs.data.f, lhs.bytes / sizeof(float)); @@ -163,7 +163,7 @@ class TensorEqMatcher { const TfLiteTensor rhs_; }; -// Builds intepreter for a model, allocates tensors. +// Builds interpreter for a model, allocates tensors. absl::Status BuildInterpreter(const Model* model, std::unique_ptr* interpreter); From 97528c31757797f97a8b57b1d0e024a4ffd42422 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 12:53:37 -0700 Subject: [PATCH 0988/1533] [NFC] Fix typos and adopt Google style variable names PiperOrigin-RevId: 312723375 Change-Id: I4eb23a8b34de55fb35960af7fcca8350cfb8e1c7 --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 119 +++++++++--------- .../transforms/resource_device_inference.cc | 2 +- .../transforms/resource_op_lifting.cc | 4 +- .../tensorflow/transforms/shape_inference.cc | 4 +- 4 files changed, 65 insertions(+), 64 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index cbbb9fd5db3..389be0d3b2b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -1821,71 +1821,71 @@ static LogicalResult Verify(GatherV2Op op) { static LogicalResult Verify(IfOp op) { auto module = op.getParentOfType(); - auto thenFn = module.lookupSymbol(op.then_branch()); - if (!thenFn) + auto then_fn = module.lookupSymbol(op.then_branch()); + if (!then_fn) return op.emitOpError("then_branch refers to an undefined function : ") << op.then_branch(); - auto elseFn = module.lookupSymbol(op.else_branch()); - if (!elseFn) + auto else_fn = module.lookupSymbol(op.else_branch()); + if (!else_fn) return op.emitOpError("else_branch refers to an undefined function : ") << op.else_branch(); - auto thenFuncType = thenFn.getType(); - auto elseFuncType = elseFn.getType(); + auto then_fn_type = then_fn.getType(); + auto else_fn_type = else_fn.getType(); // Non-conditional operands starting with the second operand are passed to // branches and should be pair-wise compatible with branches' inputs. 
- unsigned expectedNumInputs = op.getNumOperands() - 1; - if (thenFuncType.getNumInputs() != expectedNumInputs || - elseFuncType.getNumInputs() != expectedNumInputs) - return op.emitError("branches should have " + Twine(expectedNumInputs) + + unsigned expected_num_inputs = op.getNumOperands() - 1; + if (then_fn_type.getNumInputs() != expected_num_inputs || + else_fn_type.getNumInputs() != expected_num_inputs) + return op.emitError("branches should have " + Twine(expected_num_inputs) + " inputs"); - for (unsigned i = 0; i < expectedNumInputs; ++i) { - auto operandType = op.getOperand(i + 1).getType().cast(); - auto thenInputType = thenFuncType.getInput(i).cast(); - if (!AreCastCompatible({operandType, thenInputType})) + for (unsigned i = 0; i < expected_num_inputs; ++i) { + auto operand_type = op.getOperand(i + 1).getType().cast(); + auto then_input_type = then_fn_type.getInput(i).cast(); + if (!AreCastCompatible({operand_type, then_input_type})) return op.emitError( llvm::formatv("then branch input type {0} is incompatible with " "operand type {1} at index {2}", - thenInputType, operandType, i)); + then_input_type, operand_type, i)); - auto elseInputType = elseFuncType.getInput(i).cast(); - if (!AreCastCompatible({operandType, elseInputType})) + auto else_input_type = else_fn_type.getInput(i).cast(); + if (!AreCastCompatible({operand_type, else_input_type})) return op.emitError( llvm::formatv("else branch input type {0} is incompatible with " "operand type {1} at index {2}", - elseInputType, operandType, i)); + else_input_type, operand_type, i)); // If branches have incompatible input types that means that no tensor can // serve as input to both the functions. Hence, the op is invalid. - if (!AreCastCompatible({thenInputType, elseInputType})) + if (!AreCastCompatible({then_input_type, else_input_type})) return op.emitError(llvm::formatv( "branches inputs have incompatible types {0} and {1} at index {2}", - thenInputType, elseInputType, i)); + then_input_type, else_input_type, i)); } // Branches' results should be pair-wise compatible with the op results. 
- unsigned expectedNumResults = op.getNumResults(); - if (thenFuncType.getNumResults() != expectedNumResults || - elseFuncType.getNumResults() != expectedNumResults) - return op.emitError("branches should have " + Twine(expectedNumResults) + + unsigned expected_num_results = op.getNumResults(); + if (then_fn_type.getNumResults() != expected_num_results || + else_fn_type.getNumResults() != expected_num_results) + return op.emitError("branches should have " + Twine(expected_num_results) + " results"); - for (unsigned i = 0; i < expectedNumResults; ++i) { - auto resultType = op.getResult(i).getType().cast(); - auto thenResultType = thenFuncType.getResult(i).cast(); - if (!AreCastCompatible({thenResultType, resultType})) + for (unsigned i = 0; i < expected_num_results; ++i) { + auto result_type = op.getResult(i).getType().cast(); + auto then_result_type = then_fn_type.getResult(i).cast(); + if (!AreCastCompatible({then_result_type, result_type})) return op.emitError( llvm::formatv("then branch result type {0} is incompatible with op " "result type {1} at index {2}", - thenResultType, resultType, i)); + then_result_type, result_type, i)); - auto elseResultType = elseFuncType.getResult(i).cast(); - if (!AreCastCompatible({elseResultType, resultType})) + auto else_result_type = else_fn_type.getResult(i).cast(); + if (!AreCastCompatible({else_result_type, result_type})) return op.emitError( llvm::formatv("else branch result type {0} is incompatible with op " "result type {1} at index {2}", - elseResultType, resultType, i)); + else_result_type, result_type, i)); } return success(); } @@ -3887,36 +3887,37 @@ OpFoldResult VariableShapeOp::fold(ArrayRef operands) { static LogicalResult Verify(WhileOp op) { auto module = op.getParentOfType(); - auto condFn = module.lookupSymbol(op.cond()); - auto bodyFn = module.lookupSymbol(op.body()); - if (!condFn) { + auto cond_fn = module.lookupSymbol(op.cond()); + auto body_fn = module.lookupSymbol(op.body()); + if (!cond_fn) { return op.emitOpError("cond refers to an undefined function : ") << op.cond(); } - if (!bodyFn) { + if (!body_fn) { return op.emitOpError("body refers to an undefined function : ") << op.body(); } - auto condFuncType = condFn.getType(); - auto bodyFuncType = bodyFn.getType(); + auto cond_fn_type = cond_fn.getType(); + auto body_fn_type = body_fn.getType(); // Verify that the cond function has exactly one result. - if (condFuncType.getNumResults() != 1) + if (cond_fn_type.getNumResults() != 1) return op.emitOpError("requires cond function to have exactly one result"); SmallVector operands(op.getOperandTypes()); // Collect all the type lists for the op so that different pairs of type lists // can be compared for the compatibility. 
- int numTypeLists = 5; - std::pair> typeLists[] = { - {"operand", operands}, - {"body function result", bodyFuncType.getResults()}, - {"result", op.getResultTypes()}, - {"cond function input", condFuncType.getInputs()}, - {"body function input", bodyFuncType.getInputs()}, - }; + constexpr int kNumTypeLists = 5; + const std::array>, kNumTypeLists> + type_lists = {{ + {"operand", operands}, + {"body function result", body_fn_type.getResults()}, + {"result", op.getResultTypes()}, + {"cond function input", cond_fn_type.getInputs()}, + {"body function input", body_fn_type.getInputs()}, + }}; // A pair of type lists should be cast compatible with each other if one is // converted to the another for a function call or assignment or there is a @@ -3940,28 +3941,28 @@ static LogicalResult Verify(WhileOp op) { // never converted from one to the another nor there is a common source // tensors. Compatibility requirement is not transitive. - for (int i = 0; i < numTypeLists; ++i) { + for (int i = 0; i < kNumTypeLists; ++i) { // Skip the first pair as the While op operands and body function results // does not need to be compatible with each other. - for (int j = std::max(2, i + 1); j < numTypeLists; ++j) { - auto &a = typeLists[i]; - auto &b = typeLists[j]; + for (int j = std::max(2, i + 1); j < kNumTypeLists; ++j) { + auto &a = type_lists[i]; + auto &b = type_lists[j]; - int aSize = a.second.size(); - if (aSize != b.second.size()) + int a_size = a.second.size(); + if (a_size != b.second.size()) return op.emitOpError( llvm::formatv("requires the number of {0}s to be equal to the " "number of {1}s. Found {2} and {3}, respectively", - a.first, b.first, aSize, b.second.size())); + a.first, b.first, a_size, b.second.size())); - for (int idx = 0; idx < aSize; ++idx) { - auto aType = a.second[idx]; - auto bType = b.second[idx]; + for (int idx = 0; idx < a_size; ++idx) { + auto a_type = a.second[idx]; + auto b_type = b.second[idx]; - if (!AreCastCompatible({aType, bType})) + if (!AreCastCompatible({a_type, b_type})) return op.emitError(llvm::formatv( "{0} type {1} is incompatible with {2} type {3} at index {4}", - a.first, aType, b.first, bType, idx)); + a.first, a_type, b.first, b_type, idx)); } } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc index d37dfd14590..21d74d81b20 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc @@ -149,7 +149,7 @@ LogicalResult ComputeResourceDevicesInComputation(FuncOp func_op, } auto walk_res = func_op.walk([&](Operation* op) { if (auto var_handle = llvm::dyn_cast(op)) { - // Record VarHanldeOp's device attribute. + // Record VarHandleOp's device attribute. auto device_attr = var_handle.getAttrOfType(kDeviceAttr); if (!device_attr || device_attr.getValue().empty()) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 611c4d2725a..82bc612b1f8 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -571,7 +571,7 @@ void AddLoadsStoresOutsideControlFlowOp( } // Lifts loads/stores from while loop's body and cond functions. 
-LogicalResult HanldeWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { +LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) { // Remove identity nodes to avoid aliasing. RemoveIdentity(&body.front()); RemoveIdentity(&cond.front()); @@ -985,7 +985,7 @@ LogicalResult HoistForFunctionalControlFlow( lifted_partitioned_call_callees); HoistForFunctionalControlFlow(&cond.front(), module, lifted_partitioned_call_callees); - if (failed(HanldeWhileLoop(while_op, body, cond))) return failure(); + if (failed(HandleWhileLoop(while_op, body, cond))) return failure(); } else if (auto if_op = llvm::dyn_cast(&op)) { auto then_branch = llvm::cast(module.lookupSymbol(if_op.then_branch())); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 5fa810eea33..1e9be76aa66 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -562,7 +562,7 @@ class ShapeInference { private: // Mapping between ValuePort (which corresponds to an OpResult or smaller, - // e.g., first element of OpResult produded) to an Attribute if the ValuePort + // e.g., first element of OpResult produced) to an Attribute if the ValuePort // corresponds to a constant value. ValuePortResultMap results_; int64_t graph_version_; @@ -1144,7 +1144,7 @@ LogicalResult InferShapeForFunction(FuncOp func, ArrayRef shape = arg_shapes[i]; Type element_type; if (auto input_ty = func_type.getInput(i).dyn_cast()) { - if (!input_ty || input_ty.getShape().size() != shape.size()) { + if (input_ty.getRank() != shape.size()) { return failure(); } element_type = input_ty.getElementType(); From 8fdb54ea98602e0286fd71dc3836b5f8a35a27f1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 12:53:51 -0700 Subject: [PATCH 0989/1533] Enable gradient tests for tf.linalg.cholesky in eager mode. 
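The test rewrite below switches from the graph-only gradient_checker to the function-based gradient_checker_v2, which also runs eagerly. Through the public API the same check looks roughly like this (a minimal standalone sketch, not taken from the test file; the delta and tolerances mirror the float64 case below):

    import numpy as np
    import tensorflow as tf

    def cholesky_of_gram(x):
        # Build a symmetric positive-definite matrix from x, then factor it,
        # so the check exercises the tf.linalg.cholesky gradient end to end.
        a = tf.matmul(x, x, adjoint_b=True) / x.shape[0]
        return tf.linalg.cholesky(a)

    x = tf.constant(np.random.randn(5, 5), dtype=tf.float64)
    delta = np.finfo(np.float64).eps ** (1.0 / 3.0)
    theoretical, numerical = tf.test.compute_gradient(cholesky_of_gram, [x],
                                                      delta=delta)
    np.testing.assert_allclose(theoretical[0], numerical[0],
                               atol=1e-5, rtol=1e-5)
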
PiperOrigin-RevId: 312723423 Change-Id: I47d52dc14638301504ef8eccf481c7d7e3a60f48 --- .../python/kernel_tests/cholesky_op_test.py | 113 +++++++++--------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index 01c497a37ed..5dc334c897b 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -29,7 +29,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import gradient_checker_v2 from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import stateless_random_ops @@ -37,7 +37,6 @@ from tensorflow.python.ops import variables from tensorflow.python.ops.linalg import linalg from tensorflow.python.platform import benchmark from tensorflow.python.platform import test -from tensorflow.python.platform import tf_logging # Different gradient implementations for benchmark purposes @@ -181,7 +180,7 @@ class CholeskyOpTest(test.TestCase): self._verifyCholesky(np.empty([0, 2, 2])) self._verifyCholesky(np.empty([2, 0, 0])) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): seed = [42, 24] matrix_shape = [5, 5] @@ -196,108 +195,106 @@ class CholeskyOpTest(test.TestCase): class CholeskyGradTest(test.TestCase): - _backprop_block_size = 32 + _backprop_block_size = 16 def getShapes(self, shapeList): return ((elem, int(np.floor(1.2 * elem))) for elem in shapeList) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testSmallMatrices(self): np.random.seed(0) shapes = self.getShapes([1, 2, 10]) self.runFiniteDifferences( shapes, dtypes=(dtypes_lib.float32, dtypes_lib.float64)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testSmallMatricesComplex(self): np.random.seed(0) shapes = self.getShapes([1, 2, 10]) self.runFiniteDifferences( shapes, dtypes=(dtypes_lib.complex64, dtypes_lib.complex128)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testOneBlockMatrices(self): np.random.seed(0) shapes = self.getShapes([self._backprop_block_size + 1]) self.runFiniteDifferences( shapes, dtypes=(dtypes_lib.float32, dtypes_lib.float64), - scalarTest=True) + scalar_test=True) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testTwoBlockMatrixFloat(self): np.random.seed(0) shapes = self.getShapes([2 * self._backprop_block_size + 1]) self.runFiniteDifferences( - shapes, dtypes=(dtypes_lib.float32,), scalarTest=True) + shapes, dtypes=(dtypes_lib.float32,), scalar_test=True) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testTwoBlockMatrixDouble(self): np.random.seed(0) shapes = self.getShapes([2 * self._backprop_block_size + 1]) self.runFiniteDifferences( - shapes, dtypes=(dtypes_lib.float64,), scalarTest=True) + shapes, dtypes=(dtypes_lib.float64,), scalar_test=True) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testTwoBlockMatrixComplexFloat(self): np.random.seed(0) shapes = self.getShapes([2 * self._backprop_block_size + 1]) 
self.runFiniteDifferences( - shapes, dtypes=(dtypes_lib.complex64,), scalarTest=True) + shapes, dtypes=(dtypes_lib.complex64,), scalar_test=True) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testTwoBlockMatrixComplexDouble(self): np.random.seed(0) shapes = self.getShapes([2 * self._backprop_block_size + 1]) self.runFiniteDifferences( - shapes, dtypes=(dtypes_lib.complex128,), scalarTest=True) + shapes, dtypes=(dtypes_lib.complex128,), scalar_test=True) + + def _runOneTest(self, shape, dtype, batch, scalar_test): + if dtype == dtypes_lib.float64: + tol = 1e-5 + elif dtype == dtypes_lib.complex128: + tol = 5e-5 + else: + tol = 5e-3 + epsilon = np.finfo(dtype.as_numpy_dtype).eps + delta = epsilon**(1.0 / 3.0) + + def RandomInput(): + a = np.random.randn(shape[0], shape[1]).astype(dtype.as_numpy_dtype) + if dtype.is_complex: + a += 1j * np.random.randn(shape[0], shape[1]).astype( + dtype.as_numpy_dtype) + return a + + def Compute(x): + # Turn the random matrix x into a Hermitian matrix by + # computing the quadratic form x * x^H. + a = math_ops.matmul(x, math_ops.conj( + array_ops.matrix_transpose(x))) / shape[0] + if batch: + a = array_ops.tile(array_ops.expand_dims(a, 0), [2, 1, 1]) + # Finally take the cholesky decomposition of the Hermitian matrix. + c = linalg_ops.cholesky(a) + if scalar_test: + # Reduce to a single scalar output to speed up test. + c = math_ops.reduce_mean(c) + return c + + theoretical, numerical = gradient_checker_v2.compute_gradient( + Compute, [RandomInput()], delta=delta) + self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) def runFiniteDifferences(self, shapes, dtypes=(dtypes_lib.float32, dtypes_lib.float64, dtypes_lib.complex64, dtypes_lib.complex128), - scalarTest=False): - with self.session(use_gpu=True): - for shape in shapes: - for batch in False, True: - for dtype in dtypes: - if not scalarTest: - data = np.random.randn(shape[0], shape[1]) - if dtype.is_complex: - data = data.astype(np.complex64) - data += 1j * np.random.randn(shape[0], shape[1]) - x = constant_op.constant(data, dtype) - tensor = math_ops.matmul( - x, math_ops.conj(array_ops.transpose(x))) / shape[0] - else: - # This is designed to be a faster test for larger matrices. - data = np.random.randn() - if dtype.is_complex: - data = np.complex64(data) - data += 1j * np.random.randn() - x = constant_op.constant(data, dtype) - R = constant_op.constant( - np.random.randn(shape[0], shape[1]), dtype) - e = math_ops.multiply(R, x) - tensor = math_ops.matmul( - e, math_ops.conj(array_ops.transpose(e))) / shape[0] - - # Inner-most matrices in tensor are positive definite. - if batch: - tensor = array_ops.tile( - array_ops.expand_dims(tensor, 0), [4, 1, 1]) - y = linalg_ops.cholesky(tensor) - if scalarTest: - y = math_ops.reduce_mean(y) - error = gradient_checker.compute_gradient_error( - x, x._shape_as_list(), y, y._shape_as_list()) - tf_logging.info("error = %f", error) - if dtype == dtypes_lib.float64: - self.assertLess(error, 1e-5) - elif dtype == dtypes_lib.complex128: - self.assertLess(error, 5e-5) - else: - self.assertLess(error, 5e-3) + scalar_test=False): + for shape_ in shapes: + for dtype_ in dtypes: + for batch_ in False, True: + self._runOneTest(shape_, dtype_, batch_, scalar_test) class CholeskyBenchmark(test.Benchmark): From 17895acf34048c8492f02b25f10434592af37787 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Thu, 21 May 2020 13:13:26 -0700 Subject: [PATCH 0990/1533] Exporting CategoryEncoding layer. 
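Once exported, the layer is reachable from the public Keras namespace. A minimal usage sketch (assuming a build that contains this change; the expected counts are written out by hand):

    import tensorflow as tf

    layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding(
        max_tokens=4, output_mode="count")
    counts = layer([[0, 1, 1], [2, 3, 3]])
    # Row 0 holds token 0 once and token 1 twice  -> [1, 2, 0, 0]
    # Row 1 holds token 2 once and token 3 twice  -> [0, 0, 1, 2]
    print(counts)
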
PiperOrigin-RevId: 312727421 Change-Id: I62552e9b85398a27c5f584b2ea265d915c9661bb --- tensorflow/python/keras/layers/__init__.py | 6 + .../python/keras/layers/preprocessing/BUILD | 22 +- .../layers/preprocessing/benchmarks/BUILD | 6 +- ...mark.py => category_encoding_benchmark.py} | 8 +- ...rical_encoding.py => category_encoding.py} | 117 ++++----- ...=> category_encoding_distribution_test.py} | 31 ++- ...ding_test.py => category_encoding_test.py} | 118 ++++----- ...encoding_v1.py => category_encoding_v1.py} | 13 +- .../preprocessing/text_vectorization.py | 12 +- .../preprocessing/text_vectorization_v1.py | 4 +- .../python/keras/layers/serialization.py | 16 +- ...ing.-category-encoding.__metaclass__.pbtxt | 14 ++ ...tal.preprocessing.-category-encoding.pbtxt | 234 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + ...ing.-category-encoding.__metaclass__.pbtxt | 14 ++ ...tal.preprocessing.-category-encoding.pbtxt | 232 +++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + 17 files changed, 677 insertions(+), 178 deletions(-) rename tensorflow/python/keras/layers/preprocessing/benchmarks/{categorical_encoding_benchmark.py => category_encoding_benchmark.py} (93%) rename tensorflow/python/keras/layers/preprocessing/{categorical_encoding.py => category_encoding.py} (82%) rename tensorflow/python/keras/layers/preprocessing/{categorical_encoding_distribution_test.py => category_encoding_distribution_test.py} (64%) rename tensorflow/python/keras/layers/preprocessing/{categorical_encoding_test.py => category_encoding_test.py} (88%) rename tensorflow/python/keras/layers/preprocessing/{categorical_encoding_v1.py => category_encoding_v1.py} (89%) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py index 67ac91cb9be..e0f087b2453 100644 --- a/tensorflow/python/keras/layers/__init__.py +++ b/tensorflow/python/keras/layers/__init__.py @@ -44,6 +44,9 @@ from tensorflow.python.keras.layers.preprocessing.image_preprocessing import Res # Preprocessing layers. 
if tf2.enabled(): + from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding + from tensorflow.python.keras.layers.preprocessing.category_encoding_v1 import CategoryEncoding as CategoryEncodingV1 + CategoryEncodingV2 = CategoryEncoding from tensorflow.python.keras.layers.preprocessing.normalization import Normalization from tensorflow.python.keras.layers.preprocessing.normalization_v1 import Normalization as NormalizationV1 NormalizationV2 = Normalization @@ -51,6 +54,9 @@ if tf2.enabled(): from tensorflow.python.keras.layers.preprocessing.text_vectorization_v1 import TextVectorization as TextVectorizationV1 TextVectorizationV2 = TextVectorization else: + from tensorflow.python.keras.layers.preprocessing.category_encoding_v1 import CategoryEncoding + from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding as CategoryEncodingV2 + CategoryEncodingV1 = CategoryEncoding from tensorflow.python.keras.layers.preprocessing.normalization_v1 import Normalization from tensorflow.python.keras.layers.preprocessing.normalization import Normalization as NormalizationV2 NormalizationV1 = Normalization diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index b7fdc17b81d..af7f6392219 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -196,7 +196,7 @@ py_library( ], srcs_version = "PY2AND3", deps = [ - ":categorical_encoding", + ":category_encoding", ":string_lookup", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", @@ -216,10 +216,10 @@ py_library( ) py_library( - name = "categorical_encoding", + name = "category_encoding", srcs = [ - "categorical_encoding.py", - "categorical_encoding_v1.py", + "category_encoding.py", + "category_encoding_v1.py", ], srcs_version = "PY2AND3", deps = [ @@ -308,12 +308,12 @@ cuda_py_test( ) tf_py_test( - name = "categorical_encoding_test", + name = "category_encoding_test", size = "medium", - srcs = ["categorical_encoding_test.py"], + srcs = ["category_encoding_test.py"], python_version = "PY3", deps = [ - ":categorical_encoding", + ":category_encoding", ":preprocessing_test_utils", "//tensorflow/python:client_testlib", "//tensorflow/python/keras", @@ -324,9 +324,9 @@ tf_py_test( ) distribute_py_test( - name = "categorical_encoding_distribution_test", - srcs = ["categorical_encoding_distribution_test.py"], - main = "categorical_encoding_distribution_test.py", + name = "category_encoding_distribution_test", + srcs = ["category_encoding_distribution_test.py"], + main = "category_encoding_distribution_test.py", python_version = "PY3", tags = [ "multi_and_single_gpu", @@ -335,7 +335,7 @@ distribute_py_test( "no_oss", # b/155502591 ], deps = [ - ":categorical_encoding", + ":category_encoding", "//tensorflow/python/distribute:combinations", "//tensorflow/python/distribute:strategy_combinations", "//tensorflow/python/keras", diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD index 6d29126bc7e..7c976880059 100644 --- a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD @@ -11,12 +11,12 @@ package( exports_files(["LICENSE"]) tf_py_test( - name = "categorical_encoding_benchmark", - srcs = ["categorical_encoding_benchmark.py"], + name = "category_encoding_benchmark", + srcs = 
["category_encoding_benchmark.py"], python_version = "PY3", deps = [ "//tensorflow:tensorflow_py", - "//tensorflow/python/keras/layers/preprocessing:categorical_encoding", + "//tensorflow/python/keras/layers/preprocessing:category_encoding", ], ) diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py b/tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py similarity index 93% rename from tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py rename to tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py index e68b77ebef9..71b4c7b6b61 100644 --- a/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_encoding_benchmark.py +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/category_encoding_benchmark.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Benchmark for Keras categorical_encoding preprocessing layer.""" +"""Benchmark for Keras category_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -26,7 +26,7 @@ from tensorflow.python import keras from tensorflow.python.compat import v2_compat from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes -from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding from tensorflow.python.ops import random_ops from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -42,7 +42,7 @@ class BenchmarkLayer(benchmark.Benchmark): def run_dataset_implementation(self, output_mode, batch_size, sequence_length, max_tokens): input_t = keras.Input(shape=(sequence_length,), dtype=dtypes.int32) - layer = categorical_encoding.CategoricalEncoding( + layer = category_encoding.CategoryEncoding( max_tokens=max_tokens, output_mode=output_mode) _ = layer(input_t) @@ -68,7 +68,7 @@ class BenchmarkLayer(benchmark.Benchmark): ends.append(time.time()) avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches - name = "categorical_encoding|batch_%s|seq_length_%s|%s_max_tokens" % ( + name = "category_encoding|batch_%s|seq_length_%s|%s_max_tokens" % ( batch_size, sequence_length, max_tokens) self.report_benchmark(iters=num_repeats, wall_time=avg_time, name=name) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py b/tensorflow/python/keras/layers/preprocessing/category_encoding.py similarity index 82% rename from tensorflow/python/keras/layers/preprocessing/categorical_encoding.py rename to tensorflow/python/keras/layers/preprocessing/category_encoding.py index 466405a27a9..b0a7e746074 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Keras text CategoricalEncoding preprocessing layer.""" +"""Keras text CategoryEncoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -32,11 +32,13 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine import base_preprocessing_layer from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops +from tensorflow.python.ops import bincount_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import compat +from tensorflow.python.util.tf_export import keras_export TFIDF = "tf-idf" INT = "int" @@ -49,14 +51,26 @@ _NUM_ELEMENTS_NAME = "num_elements" _IDF_NAME = "idf" -class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): - """Categorical encoding layer. +@keras_export("keras.layers.experimental.preprocessing.CategoryEncoding", v1=[]) +class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): + """Category encoding layer. This layer provides options for condensing data into a categorical encoding. It accepts integer values as inputs and outputs a dense representation (one sample = 1-index tensor of float values representing data about the sample's tokens) of those inputs. + Examples: + + >>> layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding( + ... max_tokens=4) + >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]]) + + Attributes: max_tokens: The maximum size of the vocabulary for this layer. If None, there is no cap on the size of the vocabulary. @@ -72,7 +86,6 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): sparse: Boolean. If true, returns a `SparseTensor` instead of a dense `Tensor`. Defaults to `False`. """ - # TODO(momernick): Add an examples section to the docstring. def __init__(self, max_tokens=None, @@ -83,7 +96,7 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): layer_utils.validate_string_arg( output_mode, allowable_strings=(COUNT, BINARY, TFIDF), - layer_name="CategoricalEncoding", + layer_name="CategoryEncoding", arg_name="output_mode") # If max_tokens is set, the value must be greater than 1 - otherwise we @@ -92,10 +105,10 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): raise ValueError("max_tokens must be > 1.") # We need to call super() before we call _add_state_variable(). - combiner = _CategoricalEncodingCombiner( + combiner = _CategoryEncodingCombiner( compute_max_element=max_tokens is None, compute_idf=output_mode == TFIDF) - super(CategoricalEncoding, self).__init__(combiner=combiner, **kwargs) + super(CategoryEncoding, self).__init__(combiner=combiner, **kwargs) self._max_tokens = max_tokens self._output_mode = output_mode @@ -158,13 +171,12 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): RuntimeError: if the layer cannot be adapted at this time. 
""" if not reset_state: - raise ValueError("CategoricalEncoding does not support streaming adapts.") + raise ValueError("CategoryEncoding does not support streaming adapts.") if self._called and self._max_tokens is None: - raise RuntimeError( - "CategoricalEncoding can't be adapted after being called " - "if max_tokens is None.") - super(CategoricalEncoding, self).adapt(data, reset_state) + raise RuntimeError("CategoryEncoding can't be adapted after being called " + "if max_tokens is None.") + super(CategoryEncoding, self).adapt(data, reset_state) def _set_state_variables(self, updates): if not self.built: @@ -180,7 +192,7 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): "output_mode": self._output_mode, "sparse": self._sparse, } - base_config = super(CategoricalEncoding, self).get_config() + base_config = super(CategoryEncoding, self).get_config() return dict(list(base_config.items()) + list(config.items())) def _convert_to_ndarray(self, x): @@ -237,65 +249,40 @@ class CategoricalEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): else: out_depth = self._max_tokens - if self._sparse: - if self._output_mode != COUNT: - raise ValueError("Only supports `sparse=True` when `output_mode` " - ' is \"count\", got {}'.format(self._output_mode)) - inputs = self._convert_to_sparse_inputs(inputs) - - # Consider having sparse.one_hot - # Append values to indices, and reduce sum to get the counts. - tokens = array_ops.expand_dims( - math_ops.cast(inputs.values, dtypes.int64), axis=1) - count_tokens = array_ops.concat([inputs.indices, tokens], axis=1) - count_values = array_ops.ones_like(inputs.values, dtype=dtypes.int64) - unreduced_count_shape = array_ops.concat( - [inputs.dense_shape, [out_depth]], axis=0) - counts = sparse_tensor.SparseTensor( - indices=count_tokens, - values=count_values, - dense_shape=unreduced_count_shape) - count_data = sparse_ops.sparse_reduce_sum_v2( - counts, axis=1, output_is_sparse=True) - return count_data - - # If the input is a sparse tensor, we densify it with the default value of - # -1. Because -1 is ignored by one_hot, this effectively drops the non-set - # positions from the output encoding. - if isinstance(inputs, sparse_tensor.SparseTensor): - inputs = sparse_ops.sparse_tensor_to_dense(inputs, default_value=-1) - - if self._output_mode == BINARY: - bool_one_hot_data = array_ops.one_hot( - inputs, depth=out_depth, on_value=True, off_value=False) - reduced_bool_data = math_ops.reduce_any(bool_one_hot_data, axis=1) - binary_data = math_ops.cast(reduced_bool_data, dtypes.int64) - binary_data.set_shape(tensor_shape.TensorShape((None, out_depth))) - return binary_data - - one_hot_data = array_ops.one_hot(inputs, depth=out_depth) - counts = math_ops.reduce_sum(one_hot_data, axis=1) - if self._output_mode == COUNT: - count_data = math_ops.cast(counts, dtypes.int64) - count_data.set_shape(tensor_shape.TensorShape((None, out_depth))) - return count_data - - tf_idf_data = math_ops.multiply(counts, self.tf_idf_weights) - tf_idf_data.set_shape(tensor_shape.TensorShape((None, out_depth))) if self._output_mode == TFIDF: + # If the input is a sparse tensor, we densify it with the default value of + # -1. Because -1 is ignored by one_hot, this effectively drops the non-set + # positions from the output encoding. 
+ if isinstance(inputs, sparse_tensor.SparseTensor): + inputs = sparse_ops.sparse_tensor_to_dense(inputs, default_value=-1) + one_hot_data = array_ops.one_hot(inputs, depth=out_depth) + counts = math_ops.reduce_sum(one_hot_data, axis=1) + tf_idf_data = math_ops.multiply(counts, self.tf_idf_weights) + tf_idf_data.set_shape(tensor_shape.TensorShape((None, out_depth))) return tf_idf_data - # We can only get here if we didn't recognize the passed mode. - raise ValueError("Unknown output mode %s" % self._output_mode) + binary_output = (self._output_mode == BINARY) + if self._sparse: + return bincount_ops.sparse_bincount( + inputs, minlength=out_depth, axis=-1, binary_output=binary_output) + else: + result = bincount_ops.bincount( + inputs, + minlength=out_depth, + dtype=dtypes.int64, + axis=-1, + binary_output=binary_output) + result.set_shape(tensor_shape.TensorShape((None, out_depth))) + return result -class _CategoricalEncodingAccumulator( +class _CategoryEncodingAccumulator( collections.namedtuple("Accumulator", ["data", "per_doc_count_dict"])): pass -class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): - """Combiner for the CategoricalEncoding preprocessing layer. +class _CategoryEncodingCombiner(base_preprocessing_layer.Combiner): + """Combiner for the CategoryEncoding preprocessing layer. This class encapsulates the logic for computing the number of elements in the input dataset and the document frequency for each element. @@ -411,7 +398,7 @@ class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): def restore(self, output): """Creates an accumulator based on 'output'.""" raise NotImplementedError( - "CategoricalEncoding does not restore or support streaming updates.") + "CategoryEncoding does not restore or support streaming updates.") def serialize(self, accumulator): """Serializes an accumulator for a remote call.""" @@ -452,4 +439,4 @@ class _CategoricalEncodingCombiner(base_preprocessing_layer.Combiner): else: per_doc_count_dict = None data = [0, 0] - return _CategoricalEncodingAccumulator(data, per_doc_count_dict) + return _CategoryEncodingAccumulator(data, per_doc_count_dict) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py similarity index 64% rename from tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py rename to tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py index c5214533f94..011495b9314 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_distribution_test.py @@ -21,39 +21,58 @@ from __future__ import print_function import numpy as np from tensorflow.python import keras +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.distribute import tpu_strategy +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.keras import keras_parameterized -from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils from tensorflow.python.platform import test +def 
batch_wrapper(dataset, batch_size, distribution, repeat=None): + if repeat: + dataset = dataset.repeat(repeat) + # TPUs currently require fully defined input shapes, drop_remainder ensures + # the input will have fully defined shapes. + if isinstance(distribution, + (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): + return dataset.batch(batch_size, drop_remainder=True) + else: + return dataset.batch(batch_size) + + @combinations.generate( combinations.combine( - distribution=strategy_combinations.all_strategies, + # (b/156783625): Outside compilation failed for eager mode only. + distribution=strategy_combinations.strategies_minus_tpu, mode=["eager", "graph"])) -class CategoricalEncodingDistributionTest( +class CategoryEncodingDistributionTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): def test_distribution(self, distribution): input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) + inp_dataset = dataset_ops.DatasetV2.from_tensor_slices(input_array) + inp_dataset = batch_wrapper(inp_dataset, 2, distribution) # pyformat: disable expected_output = [[0, 1, 1, 1, 0, 0], [1, 1, 0, 1, 0, 0]] # pyformat: enable max_tokens = 6 + config.set_soft_device_placement(True) with distribution.scope(): input_data = keras.Input(shape=(4,), dtype=dtypes.int32) - layer = categorical_encoding.CategoricalEncoding( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + layer = category_encoding.CategoryEncoding( + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) + output_dataset = model.predict(inp_dataset) self.assertAllEqual(expected_output, output_dataset) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py similarity index 88% rename from tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py rename to tensorflow/python/keras/layers/preprocessing/category_encoding_test.py index e21e95a0078..08aa6d4871b 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_test.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Tests for Keras text categorical_encoding preprocessing layer.""" +"""Tests for Keras text category_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division @@ -32,8 +32,8 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras.layers import core -from tensorflow.python.keras.layers.preprocessing import categorical_encoding -from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 +from tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops @@ -44,15 +44,15 @@ from tensorflow.python.platform import test def get_layer_class(): if context.executing_eagerly(): - return categorical_encoding.CategoricalEncoding + return category_encoding.CategoryEncoding else: - return categorical_encoding_v1.CategoricalEncoding + return category_encoding_v1.CategoryEncoding @keras_parameterized.run_all_keras_modes(always_skip_v1=True) -class CategoricalEncodingInputTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class CategoryEncodingInputTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): def test_dense_input_sparse_output(self): input_array = constant_op.constant([[1, 2, 3], [3, 3, 0]]) @@ -67,9 +67,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, - sparse=True) + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -80,7 +78,7 @@ class CategoricalEncodingInputTest( # Assert sparse output is same as dense output. layer = get_layer_class()( max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, + output_mode=category_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -103,7 +101,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -128,9 +126,7 @@ class CategoricalEncodingInputTest( max_tokens = 6 layer = get_layer_class()( - max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, - sparse=True) + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -141,7 +137,7 @@ class CategoricalEncodingInputTest( # Assert sparse output is same as dense output. 
layer = get_layer_class()( max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, + output_mode=category_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -163,7 +159,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -184,9 +180,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, - sparse=True) + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -197,7 +191,7 @@ class CategoricalEncodingInputTest( # Assert sparse output is same as dense output. layer = get_layer_class()( max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, + output_mode=category_encoding.COUNT, sparse=False) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -214,9 +208,7 @@ class CategoricalEncodingInputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) encoding_layer = get_layer_class()( - max_tokens=max_tokens, - output_mode=categorical_encoding.COUNT, - sparse=True) + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) int_data = encoding_layer(input_data) output_data = math_ops.cast(int_data, dtypes.float32) weights = variables.Variable([[.1], [.2], [.3], [.4]], dtype=dtypes.float32) @@ -228,9 +220,9 @@ class CategoricalEncodingInputTest( @keras_parameterized.run_all_keras_modes -class CategoricalEncodingAdaptTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class CategoryEncodingAdaptTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): def test_sparse_adapt(self): vocab_data = sparse_ops.from_dense( @@ -248,7 +240,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) layer.adapt(vocab_dataset) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -273,7 +265,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) layer.adapt(vocab_dataset) int_data = layer(input_data) @@ -296,7 +288,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) layer.adapt(vocab_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -306,7 +298,7 @@ class CategoricalEncodingAdaptTest( self.assertAllEqual(expected_output, output_dataset) def test_hard_maximum_set_state_variables_after_build(self): - state_variables = 
{categorical_encoding._NUM_ELEMENTS_NAME: 5} + state_variables = {category_encoding._NUM_ELEMENTS_NAME: 5} input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) # pyformat: disable @@ -318,7 +310,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) layer._set_state_variables(state_variables) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -339,7 +331,7 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) layer.build(input_data.shape) layer.set_num_elements(max_tokens) int_data = layer(input_data) @@ -351,8 +343,7 @@ class CategoricalEncodingAdaptTest( def test_set_weights_fails_on_wrong_size_weights(self): tfidf_data = [.05, .5, .25, .2, .125] - layer = get_layer_class()( - max_tokens=6, output_mode=categorical_encoding.TFIDF) + layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.TFIDF) with self.assertRaisesRegex(ValueError, ".*Layer weight shape.*"): layer.set_weights([np.array(tfidf_data)]) @@ -360,7 +351,7 @@ class CategoricalEncodingAdaptTest( def test_set_num_elements_after_call_fails(self): input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "num_elements cannot be changed"): layer.set_num_elements(5) @@ -370,17 +361,17 @@ class CategoricalEncodingAdaptTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "can't be adapted"): layer.adapt(vocab_data) def test_set_state_variables_after_call_fails(self): - state_variables = {categorical_encoding._NUM_ELEMENTS_NAME: 5} + state_variables = {category_encoding._NUM_ELEMENTS_NAME: 5} input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) _ = layer(input_data) with self.assertRaisesRegex(RuntimeError, "num_elements cannot be changed"): layer._set_state_variables(state_variables) @@ -388,9 +379,9 @@ class CategoricalEncodingAdaptTest( @keras_parameterized.run_all_keras_modes @keras_parameterized.run_all_keras_modes -class CategoricalEncodingOutputTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class CategoryEncodingOutputTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): def test_binary_output_hard_maximum(self): input_array = np.array([[1, 2, 3, 1], [0, 3, 1, 0]]) @@ -404,7 +395,7 @@ class CategoricalEncodingOutputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=max_tokens, output_mode=categorical_encoding.BINARY) + max_tokens=max_tokens, output_mode=category_encoding.BINARY) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ 
-424,7 +415,7 @@ class CategoricalEncodingOutputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.BINARY) + max_tokens=None, output_mode=category_encoding.BINARY) layer.set_weights([np.array(max_tokens)]) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -444,8 +435,7 @@ class CategoricalEncodingOutputTest( expected_output_shape = [None, max_tokens] input_data = keras.Input(shape=(None,), dtype=dtypes.int32) - layer = get_layer_class()( - max_tokens=6, output_mode=categorical_encoding.COUNT) + layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.COUNT) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -465,7 +455,7 @@ class CategoricalEncodingOutputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.COUNT) + max_tokens=None, output_mode=category_encoding.COUNT) layer.set_weights([np.array(max_tokens)]) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -488,8 +478,7 @@ class CategoricalEncodingOutputTest( expected_output_shape = [None, max_tokens] input_data = keras.Input(shape=(None,), dtype=dtypes.int32) - layer = get_layer_class()( - max_tokens=6, output_mode=categorical_encoding.TFIDF) + layer = get_layer_class()(max_tokens=6, output_mode=category_encoding.TFIDF) layer.set_tfidf_data(tfidf_data) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -513,7 +502,7 @@ class CategoricalEncodingOutputTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int32) layer = get_layer_class()( - max_tokens=None, output_mode=categorical_encoding.TFIDF) + max_tokens=None, output_mode=category_encoding.TFIDF) layer.set_num_elements(max_tokens) layer.set_tfidf_data(tfidf_data) int_data = layer(input_data) @@ -524,7 +513,7 @@ class CategoricalEncodingOutputTest( self.assertAllClose(expected_output, output_dataset) -class CategoricalEncodingModelBuildingTest( +class CategoryEncodingModelBuildingTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -532,27 +521,27 @@ class CategoricalEncodingModelBuildingTest( { "testcase_name": "count_hard_max", "max_tokens": 5, - "output_mode": categorical_encoding.COUNT + "output_mode": category_encoding.COUNT }, { "testcase_name": "count_soft_max", "max_tokens": None, - "output_mode": categorical_encoding.COUNT + "output_mode": category_encoding.COUNT }, { "testcase_name": "binary_hard_max", "max_tokens": 5, - "output_mode": categorical_encoding.BINARY + "output_mode": category_encoding.BINARY }, { "testcase_name": "binary_soft_max", "max_tokens": None, - "output_mode": categorical_encoding.BINARY + "output_mode": category_encoding.BINARY }, { "testcase_name": "tfidf_hard_max", "max_tokens": 5, - "output_mode": categorical_encoding.TFIDF + "output_mode": category_encoding.TFIDF }, { "testcase_name": "tfidf_soft_max", "max_tokens": None, - "output_mode": categorical_encoding.TFIDF + "output_mode": category_encoding.TFIDF }) def test_end_to_end_bagged_modeling(self, output_mode, max_tokens): tfidf_data = np.array([.03, .5, .25, .2, .125]) @@ -564,7 +553,7 @@ class CategoricalEncodingModelBuildingTest( weights = [] if max_tokens is None: weights.append(np.array(5)) - if output_mode == categorical_encoding.TFIDF: + if output_mode == 
category_encoding.TFIDF: weights.append(tfidf_data) layer.set_weights(weights) @@ -577,7 +566,7 @@ class CategoricalEncodingModelBuildingTest( @keras_parameterized.run_all_keras_modes -class CategoricalEncodingCombinerTest( +class CategoryEncodingCombinerTest( keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -617,8 +606,7 @@ class CategoricalEncodingCombinerTest( def test_combiner_api_compatibility_int_mode(self): data = np.array([[1, 2, 3, 4], [1, 2, 3, 0]]) - combiner = categorical_encoding._CategoricalEncodingCombiner( - compute_idf=False) + combiner = category_encoding._CategoryEncodingCombiner(compute_idf=False) expected_accumulator_output = { "max_element": np.array(4), "num_documents": np.array(2), @@ -636,8 +624,7 @@ class CategoricalEncodingCombinerTest( def test_combiner_api_compatibility_tfidf_mode(self): data = np.array([[1, 2, 3, 4], [1, 2, 3, 0]]) - combiner = categorical_encoding._CategoricalEncodingCombiner( - compute_idf=True) + combiner = category_encoding._CategoryEncodingCombiner(compute_idf=True) expected_accumulator_output = { "max_element": np.array(4), "document_counts": np.array([1, 2, 2, 2, 1]), @@ -693,7 +680,7 @@ class CategoricalEncodingCombinerTest( expected_accumulator_output, expected_extract_output, compute_idf=True): - combiner = categorical_encoding._CategoricalEncodingCombiner( + combiner = category_encoding._CategoryEncodingCombiner( compute_idf=compute_idf) expected_accumulator = combiner._create_accumulator() expected_accumulator = self.update_accumulator(expected_accumulator, @@ -702,6 +689,5 @@ class CategoricalEncodingCombinerTest( self.validate_accumulator_extract(combiner, data, expected_extract_output) - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py similarity index 89% rename from tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py rename to tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py index 83128ed5095..3afb86b344f 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_encoding_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_v1.py @@ -12,20 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tensorflow V1 version of the text categorical_encoding preprocessing layer.""" +"""Tensorflow V1 version of the text category_encoding preprocessing layer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from tensorflow.python.keras.engine import base_preprocessing_layer_v1 -from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.util.tf_export import keras_export -class CategoricalEncoding(categorical_encoding.CategoricalEncoding, - base_preprocessing_layer_v1.CombinerPreprocessingLayer - ): - """CategoricalEncoding layer. +@keras_export(v1=["keras.layers.experimental.preprocessing.CategoryEncoding"]) +class CategoryEncoding(category_encoding.CategoryEncoding, + base_preprocessing_layer_v1.CombinerPreprocessingLayer): + """CategoryEncoding layer. This layer provides options for condensing input data into denser representations. 
It accepts either integer values or strings as inputs, diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 1abc37cb4c3..057575d4ecc 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -26,7 +26,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer -from tensorflow.python.keras.layers.preprocessing import categorical_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding from tensorflow.python.keras.layers.preprocessing import string_lookup from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops @@ -42,10 +42,10 @@ LOWER_AND_STRIP_PUNCTUATION = "lower_and_strip_punctuation" SPLIT_ON_WHITESPACE = "whitespace" -TFIDF = categorical_encoding.TFIDF -INT = categorical_encoding.INT -BINARY = categorical_encoding.BINARY -COUNT = categorical_encoding.COUNT +TFIDF = category_encoding.TFIDF +INT = category_encoding.INT +BINARY = category_encoding.BINARY +COUNT = category_encoding.COUNT # This is an explicit regex of all the tokens that will be stripped if # LOWER_AND_STRIP_PUNCTUATION is set. If an application requires other @@ -307,7 +307,7 @@ class TextVectorization(CombinerPreprocessingLayer): # These are V1/V2 shim points. There are V1 implementations in the V1 class. def _get_vectorization_class(self): - return categorical_encoding.CategoricalEncoding + return category_encoding.CategoryEncoding def _get_index_lookup_class(self): return string_lookup.StringLookup diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py index a7c7b9136f9..505cdc39547 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py @@ -19,7 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.keras.engine import base_preprocessing_layer_v1 -from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 +from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 from tensorflow.python.keras.layers.preprocessing import text_vectorization from tensorflow.python.util.tf_export import keras_export @@ -77,7 +77,7 @@ class TextVectorization(text_vectorization.TextVectorization, """ def _get_vectorization_class(self): - return categorical_encoding_v1.CategoricalEncoding + return category_encoding_v1.CategoryEncoding def _get_index_lookup_class(self): return string_lookup_v1.StringLookup diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 2eb7cff75bb..992ff562755 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -46,6 +46,8 @@ from tensorflow.python.keras.layers import recurrent_v2 from tensorflow.python.keras.layers import rnn_cell_wrapper_v2 from tensorflow.python.keras.layers import wrappers from tensorflow.python.keras.layers.preprocessing import category_crossing +from 
tensorflow.python.keras.layers.preprocessing import category_encoding +from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 from tensorflow.python.keras.layers.preprocessing import hashing from tensorflow.python.keras.layers.preprocessing import image_preprocessing from tensorflow.python.keras.layers.preprocessing import normalization as preprocessing_normalization @@ -61,15 +63,11 @@ ALL_MODULES = (base_layer, input_layer, advanced_activations, convolutional, convolutional_recurrent, core, cudnn_recurrent, dense_attention, embeddings, einsum_dense, local, merge, noise, normalization, pooling, image_preprocessing, preprocessing_normalization_v1, - preprocessing_text_vectorization_v1, - recurrent, wrappers, hashing, category_crossing) -ALL_V2_MODULES = ( - rnn_cell_wrapper_v2, - normalization_v2, - recurrent_v2, - preprocessing_normalization, - preprocessing_text_vectorization -) + preprocessing_text_vectorization_v1, recurrent, wrappers, + hashing, category_crossing, category_encoding_v1) +ALL_V2_MODULES = (rnn_cell_wrapper_v2, normalization_v2, recurrent_v2, + preprocessing_normalization, preprocessing_text_vectorization, + category_encoding) # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. LOCAL = threading.local() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt new file mode 100644 index 00000000000..e907d9a293b --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt new file mode 100644 index 00000000000..165a6de49a8 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt @@ -0,0 +1,234 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: 
"outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_tokens\', \'output_mode\', \'sparse\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'count\', \'False\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', 
\'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_num_elements" + argspec: "args=[\'self\', \'num_elements\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_tfidf_data" + argspec: "args=[\'self\', \'tfidf_data\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index c93b8a89fb8..a922b143910 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "CategoryCrossing" mtype: "" } + member { + name: "CategoryEncoding" + mtype: "" + } member { name: "CenterCrop" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt new file mode 100644 index 00000000000..e907d9a293b --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt new file mode 100644 index 00000000000..2edcfbb6487 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt @@ -0,0 +1,232 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryEncoding" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } 
+ member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_tokens\', \'output_mode\', \'sparse\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'count\', \'False\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: 
"get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_num_elements" + argspec: "args=[\'self\', \'num_elements\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_tfidf_data" + argspec: "args=[\'self\', \'tfidf_data\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index c93b8a89fb8..a922b143910 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -4,6 +4,10 @@ tf_module { name: "CategoryCrossing" mtype: "" } + member { + name: "CategoryEncoding" + mtype: "" + } member { name: "CenterCrop" mtype: "" From 57c5d33f895a166a50c923d902ddc1500a3fc933 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 13:26:41 -0700 Subject: [PATCH 0991/1533] Support the pybind11 `_dtypes.DType` in `tf.as_dtype`. PiperOrigin-RevId: 312729982 Change-Id: I6905b81e7bae6d684236ac570220c88803e345ca --- tensorflow/python/framework/dtypes.py | 3 +++ tensorflow/python/framework/dtypes_test.py | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py index 73fb034f061..994a7eea494 100644 --- a/tensorflow/python/framework/dtypes.py +++ b/tensorflow/python/framework/dtypes.py @@ -640,5 +640,8 @@ def as_dtype(type_value): except (KeyError, TypeError): pass + if isinstance(type_value, _dtypes.DType): + return _INTERN_TABLE[type_value.as_datatype_enum] + raise TypeError("Cannot convert value %r to a TensorFlow DType." 
% (type_value,)) diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py index dd2ea446b78..041cc5280cd 100644 --- a/tensorflow/python/framework/dtypes_test.py +++ b/tensorflow/python/framework/dtypes_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import numpy as np from tensorflow.core.framework import types_pb2 +from tensorflow.python import _dtypes from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util from tensorflow.python.platform import googletest @@ -64,6 +65,13 @@ class TypesTest(test_util.TensorFlowTestCase): dtypes.as_dtype(datatype_enum).base_dtype, dtypes.as_dtype(numpy_dtype)) + def testAllPybind11DTypeConvertibleToDType(self): + for datatype_enum in types_pb2.DataType.values(): + if datatype_enum == types_pb2.DT_INVALID: + continue + dtype = _dtypes.DType(datatype_enum) + self.assertEqual(dtypes.as_dtype(datatype_enum), dtype) + def testInvalid(self): with self.assertRaises(TypeError): dtypes.DType(types_pb2.DT_INVALID) From 7d9d943192dc837105ba90684eb3190f38619db1 Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Thu, 21 May 2020 13:53:55 -0700 Subject: [PATCH 0992/1533] [TF] Add support for more than one outer batch dimension to tf.nn.convolution. This is part 2/N of adding outer batch dimension support to tf.nn.convXd and keras.layers.ConvXd. Also added support for batch_shape.ndims > 1 to nn_ops.Convolution and other internal libraries, so that we can use this in keras.layers.ConvXD. For now, using tf.nn.convolution with filter.shape == 3 or filter.shape == 5 (conv1d or conv3d) still raises an error deep in the ops, because i haven't yet added reshape wrappers for gen_nn_ops.conv{1d,3d} but those are gonna be easy to add once this is in. I wanted to make sure it works for conv2d first. No public signature changes. Rollback of rollback with fixes. 
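For illustration only (this snippet is not part of the change; the shapes mirror the new
testConvolutionWith2SpatialDimensionsAndExpandedBatch case and eager execution is assumed):

  import numpy as np
  import tensorflow as tf

  # Two leading batch dimensions: batch_shape = [2, 5], spatial shape 2x3, 3 channels.
  x = np.ones([2, 5, 2, 3, 3], np.float32)
  # conv2d filter: [filter_height, filter_width, in_channels, out_channels].
  filters = np.ones([1, 1, 3, 3], np.float32)

  # The extra batch dimension is now handled directly, without reshaping to
  # [10, 2, 3, 3] and back by hand.
  y = tf.nn.convolution(x, filters, strides=[1, 1], padding="VALID")
  print(y.shape)  # (2, 5, 2, 3, 3)

Internally the leading batch dimensions are squeezed into one before calling
gen_nn_ops.conv2d and restored afterwards (see the _conv2d_expanded_batch helper added
below, which falls back to _squeeze_batch_dims for inputs of rank greater than 4).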
PiperOrigin-RevId: 312735044 Change-Id: I4b4497a2925a965fa45f1812d7bd25d7a2c087ac --- .../python/kernel_tests/conv_ops_test.py | 52 +++ tensorflow/python/ops/nn_ops.py | 326 +++++++++++++----- 2 files changed, 297 insertions(+), 81 deletions(-) diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py index 18b7a47fc8c..e01abc8133d 100644 --- a/tensorflow/python/kernel_tests/conv_ops_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_test.py @@ -455,6 +455,58 @@ class Conv2DTest(test.TestCase): conv1, self.evaluate(conv2).reshape(conv1.shape)) + @test_util.run_in_graph_and_eager_modes + def testConvolutionClass2DExpandedBatch(self): + tensor_in_sizes_batch = [10, 2, 3, 3] + tensor_in_sizes_expanded_batch = [2, 5, 2, 3, 3] + filter_in_sizes = [1, 1, 3, 3] + filter_in = self._CreateNumpyTensor(filter_in_sizes) + x1 = self._CreateNumpyTensor(tensor_in_sizes_batch) + x2 = x1.reshape(tensor_in_sizes_expanded_batch) + convolver1 = nn_ops.Convolution( + input_shape=x1.shape, + filter_shape=filter_in.shape, + strides=[1, 1], + padding="VALID") + self.assertEqual(convolver1.num_batch_dims, 1) + convolver2 = nn_ops.Convolution( + input_shape=x2.shape, + filter_shape=filter_in.shape, + strides=[1, 1], + padding="VALID") + self.assertEqual(convolver2.num_batch_dims, 2) + conv1 = convolver1(x1, filter_in) + conv2 = convolver2(x2, filter_in) + self.assertEqual(conv1.shape, tensor_in_sizes_batch) + self.assertEqual(conv2.shape, tensor_in_sizes_expanded_batch) + self.assertAllEqual( + conv1, + self.evaluate(conv2).reshape(conv1.shape)) + + @test_util.run_in_graph_and_eager_modes + def testConvolutionWith2SpatialDimensionsAndExpandedBatch(self): + tensor_in_sizes_batch = [10, 2, 3, 3] + tensor_in_sizes_expanded_batch = [2, 5, 2, 3, 3] + filter_in_sizes = [1, 1, 3, 3] + filter_in = self._CreateNumpyTensor(filter_in_sizes) + x1 = self._CreateNumpyTensor(tensor_in_sizes_batch) + x2 = x1.reshape(tensor_in_sizes_expanded_batch) + conv1 = nn_ops.convolution( + x1, + filter_in, + strides=[1, 1], + padding="VALID") + conv2 = nn_ops.convolution( + x2, + filter_in, + strides=[1, 1], + padding="VALID") + self.assertEqual(conv1.shape, tensor_in_sizes_batch) + self.assertEqual(conv2.shape, tensor_in_sizes_expanded_batch) + self.assertAllEqual( + conv1, + self.evaluate(conv2).reshape(conv1.shape)) + @test_util.run_in_graph_and_eager_modes def testConv2D2x2Filter2x1Dilation(self): self._VerifyDilatedConvValues( diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 4c00d085f82..24ee94fac48 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -131,9 +131,9 @@ def _non_atrous_convolution( """ with ops.name_scope(name, "non_atrous_convolution", [input, filter]) as scope: input = ops.convert_to_tensor(input, name="input") # pylint: disable=redefined-builtin - input_shape = input.get_shape() + input_shape = input.shape filter = ops.convert_to_tensor(filter, name="filter") # pylint: disable=redefined-builtin - filter_shape = filter.get_shape() + filter_shape = filter.shape op = _NonAtrousConvolution( input_shape, filter_shape=filter_shape, @@ -148,36 +148,51 @@ class _NonAtrousConvolution(object): """Helper class for _non_atrous_convolution. Note that this class assumes that shapes of input and filter passed to - __call__ are compatible with input_shape and filter_shape passed to the + `__call__` are compatible with `input_shape` and filter_shape passed to the constructor. 
Arguments: - input_shape: static input shape, i.e. input.get_shape(). - filter_shape: static filter shape, i.e. filter.get_shape(). + input_shape: static input shape, i.e. input.shape. + filter_shape: static filter shape, i.e. filter.shape. padding: see _non_atrous_convolution. data_format: see _non_atrous_convolution. strides: see _non_atrous_convolution. name: see _non_atrous_convolution. + num_batch_dims: (Optional.) The number of batch dimensions in the input; + if not provided, the default of `1` is used. """ def __init__( self, input_shape, - filter_shape, # pylint: disable=redefined-builtin + filter_shape, padding, data_format=None, strides=None, - name=None): - filter_shape = filter_shape.with_rank(input_shape.ndims) + name=None, + num_batch_dims=1): + # filter shape is always rank num_spatial_dims + 2 + # and num_spatial_dims == input_shape.ndims - num_batch_dims - 1 + if input_shape.ndims is not None: + filter_shape = filter_shape.with_rank( + input_shape.ndims - num_batch_dims + 1) self.padding = padding self.name = name - input_shape = input_shape.with_rank(filter_shape.ndims) + # input shape is == num_spatial_dims + num_batch_dims + 1 + # and filter_shape is always rank num_spatial_dims + 2 + if filter_shape.ndims is not None: + input_shape = input_shape.with_rank( + filter_shape.ndims + num_batch_dims - 1) if input_shape.ndims is None: - raise ValueError("Rank of convolution must be known") - if input_shape.ndims < 3 or input_shape.ndims > 5: raise ValueError( - "`input` and `filter` must have rank at least 3 and at most 5") - conv_dims = input_shape.ndims - 2 + "Rank of convolution must be known, but saw input_shape.ndims == {}" + .format(input_shape.ndims)) + if input_shape.ndims < 3 or input_shape.ndims - num_batch_dims + 1 > 5: + raise ValueError( + "`input_shape.ndims - num_batch_dims + 1` must be at least 3 and at " + "most 5 but saw `input_shape.ndims == {}` and `num_batch_dims == {}`" + .format(input_shape.ndims, num_batch_dims)) + conv_dims = input_shape.ndims - num_batch_dims - 1 if strides is None: strides = [1] * conv_dims elif len(strides) != conv_dims: @@ -520,7 +535,7 @@ def with_space_to_batch( """ input = ops.convert_to_tensor(input, name="input") # pylint: disable=redefined-builtin - input_shape = input.get_shape() + input_shape = input.shape def build_op(num_spatial_dims, padding): return lambda inp, _: op(inp, num_spatial_dims, padding) @@ -540,18 +555,19 @@ class _WithSpaceToBatch(object): """Helper class for with_space_to_batch. Note that this class assumes that shapes of input and filter passed to - __call__ are compatible with input_shape and filter_shape passed to the - constructor. + `__call__` are compatible with `input_shape`, `filter_shape`, and + `spatial_dims` passed to the constructor. Arguments - input_shape: static shape of input. i.e. input.get_shape(). - dilation_rate: see with_space_to_batch - padding: see with_space_to_batch + input_shape: static shape of input. i.e. input.shape. + dilation_rate: see `with_space_to_batch`. + padding: see `with_space_to_batch`. build_op: Function that maps (num_spatial_dims, paddings) -> (function that maps (input, filter) -> output). - filter_shape: see with_space_to_batch - spatial_dims: see with_space_to_batch - data_format: see with_space_to_batch + filter_shape: see `with_space_to_batch`. + spatial_dims: `see with_space_to_batch`. + data_format: see `with_space_to_batch`. + num_batch_dims: (Optional). Number of batch dims in `input_shape`. 
""" def __init__(self, @@ -561,24 +577,25 @@ class _WithSpaceToBatch(object): build_op, filter_shape=None, spatial_dims=None, - data_format=None): + data_format=None, + num_batch_dims=1): """Helper class for _with_space_to_batch.""" dilation_rate = ops.convert_to_tensor( dilation_rate, dtypes.int32, name="dilation_rate") - try: - rate_shape = dilation_rate.get_shape().with_rank(1) - except ValueError: - raise ValueError("rate must be rank 1") + if dilation_rate.shape.ndims not in (None, 1): + raise ValueError( + "rate must be rank 1 but saw {}".format(dilation_rate.shape.ndims)) - if not dilation_rate.get_shape().is_fully_defined(): - raise ValueError("rate must have known shape") + if not dilation_rate.shape.is_fully_defined(): + raise ValueError("rate must have known shape, but saw {}" + .format(dilation_rate.shape)) - num_spatial_dims = rate_shape.dims[0].value + num_spatial_dims = dilation_rate.shape.dims[0].value if data_format is not None and data_format.startswith("NC"): - starting_spatial_dim = 2 + starting_spatial_dim = num_batch_dims + 1 else: - starting_spatial_dim = 1 + starting_spatial_dim = num_batch_dims if spatial_dims is None: spatial_dims = range(starting_spatial_dim, @@ -588,7 +605,7 @@ class _WithSpaceToBatch(object): if spatial_dims != orig_spatial_dims or any(x < 1 for x in spatial_dims): raise ValueError( "spatial_dims must be a monotonically increasing sequence of " - "positive integers") + "positive integers, but saw: {}".format(orig_spatial_dims)) if data_format is not None and data_format.startswith("NC"): expected_input_rank = spatial_dims[-1] @@ -599,14 +616,16 @@ class _WithSpaceToBatch(object): input_shape.with_rank_at_least(expected_input_rank) except ValueError: raise ValueError( - "input tensor must have rank %d at least" % (expected_input_rank)) + "input tensor must have rank at least {}, but saw rank {}" + .format(expected_input_rank, input_shape.ndims)) const_rate = tensor_util.constant_value(dilation_rate) rate_or_const_rate = dilation_rate if const_rate is not None: rate_or_const_rate = const_rate if np.any(const_rate < 1): - raise ValueError("dilation_rate must be positive") + raise ValueError("dilation_rate must be positive, but saw: {}" + .format(const_rate)) if np.all(const_rate == 1): self.call = build_op(num_spatial_dims, padding) return @@ -672,6 +691,7 @@ class _WithSpaceToBatch(object): filter_shape = array_ops.shape(filter) base_paddings = _with_space_to_batch_base_paddings( filter_shape, self.num_spatial_dims, self.rate_or_const_rate) + paddings, crops = array_ops.required_space_to_batch_paddings( input_shape=input_spatial_shape, base_paddings=base_paddings, @@ -994,31 +1014,84 @@ def convolution_internal( data_format=None, dilations=None, name=None, - call_from_convolution=True): - """Internal function which performs rank agnostic convolution.""" - if isinstance(input.shape, tensor_shape.TensorShape) and \ - input.shape.rank is not None: - n = len(input.shape) - 2 - elif not isinstance(input.shape, tensor_shape.TensorShape) and \ - input.shape is not None: - n = len(input.shape) - 2 - elif isinstance(filters.shape, tensor_shape.TensorShape) and \ - filters.shape.rank is not None: + call_from_convolution=True, + num_spatial_dims=None): + """Internal function which performs rank agnostic convolution. + + Args: + input: See `convolution`. + filters: See `convolution`. + strides: See `convolution`. + padding: See `convolution`. + data_format: See `convolution`. + dilations: See `convolution`. + name: See `convolution`. 
+ call_from_convolution: See `convolution`. + num_spatial_dims: (Optional.). It is a integer describing the + rank of the spatial dimensions. For `1-D`, `2-D` and `3-D` convolutions, + the value of `num_spatial_dims` is `1`, `2`, and `3`, respectively. + This argument is only required to disambiguate the rank of `batch_shape` + when `filter_shape.ndims is None` and `len(batch_shape) > 1`. For + backwards compatibility, if `num_spatial_dims is None` and + `filter_shape.ndims is None`, then `len(batch_shape)` is assumed to be + `1` (i.e., the input is expected to be + `[batch_size, num_channels] + input_spatial_shape` + or `[batch_size] + input_spatial_shape + [num_channels]`. + + Returns: + A tensor of shape and dtype matching that of `input`. + + Raises: + ValueError: If input and filter both have unknown shapes, or if + `num_spatial_dims` is provided and incompatible with the value + estimated from `filters.shape`. + """ + n = None + if getattr(filters, 'shape', None) is None: + with ops.name_scope(name, 'convolution_internal', [filters, input]): + filters = ops.convert_to_tensor(filters, name='filters') + if (isinstance(filters.shape, tensor_shape.TensorShape) + and filters.shape.rank is not None): n = len(filters.shape) - 2 - elif not isinstance(filters.shape, tensor_shape.TensorShape) and \ - filters.shape is not None: + elif (not isinstance(filters.shape, tensor_shape.TensorShape) + and filters.shape is not None): n = len(filters.shape) - 2 + + if (isinstance(input.shape, tensor_shape.TensorShape) + and input.shape.rank is not None): + if n is None: + n = (num_spatial_dims if num_spatial_dims is not None + else len(input.shape) - 2) + num_batch_dims = len(input.shape) - n - 1 + elif (not isinstance(input.shape, tensor_shape.TensorShape) + and input.shape is not None): + if n is None: + n = (num_spatial_dims if num_spatial_dims is not None + else len(input.shape) - 2) + num_batch_dims = len(input.shape) - n - 1 else: + num_batch_dims = 1 # Default behavior if it cannot be estimated. + + if n is None: raise ValueError("rank of input or filter must be known") + if num_spatial_dims is not None and n != num_spatial_dims: + raise ValueError( + "inconsistent estimate of spatial dims ({}) vs. actual passed " + "num_spatial_dims ({}). n was estimated as len(filters.shape) - 2, " + "but filters shape is: {}".format(n, num_spatial_dims, filters.shape)) + if not 1 <= n <= 3: raise ValueError( - "Input tensor must be of rank 3, 4 or 5 but was {}.".format(n + 2)) + "num_spatial_dims (input.shape.ndims - num_batch_dims - 1) must be one " + "of 1, 2 or 3 but saw {}. num_batch_dims: {}." 
+ .format(n, num_batch_dims)) if data_format is None: - channel_index = n + 1 + channel_index = num_batch_dims + n else: - channel_index = 1 if data_format.startswith("NC") else n + 1 + channel_index = ( + num_batch_dims if data_format.startswith("NC") else n + num_batch_dims) strides = _get_sequence(strides, n, channel_index, "strides") dilations = _get_sequence(dilations, n, channel_index, "dilations") @@ -1031,7 +1104,7 @@ def convolution_internal( scope = "convolution" with ops.name_scope(name, scope, [input, filters]) as name: - conv_ops = {1: conv1d, 2: gen_nn_ops.conv2d, 3: gen_nn_ops.conv3d} + conv_ops = {1: conv1d, 2: _conv2d_expanded_batch, 3: gen_nn_ops.conv3d} if device_context.enclosing_tpu_context() is not None or all( i == 1 for i in dilations): @@ -1061,7 +1134,8 @@ def convolution_internal( strides=strides, dilation_rate=dilations, name=name, - data_format=data_format) + data_format=data_format, + num_spatial_dims=n) return op(input, filters) @@ -1069,17 +1143,34 @@ class Convolution(object): """Helper class for convolution. Note that this class assumes that shapes of input and filter passed to - __call__ are compatible with input_shape and filter_shape passed to the - constructor. + `__call__` are compatible with `input_shape`, `filter_shape`, and + `num_spatial_dims` passed to the constructor. Arguments - input_shape: static shape of input. i.e. input.get_shape(). - filter_shape: static shape of the filter. i.e. filter.get_shape(). - padding: see convolution. + input_shape: static shape of input. i.e. input.shape. Its length is + `batch_shape + input_spatial_shape + [num_channels]` if `data_format` + does not start with `NC`, or + `batch_shape + [num_channels] + input_spatial_shape` if `data_format` + starts with `NC`. + filter_shape: static shape of the filter. i.e. filter.shape. + padding: The padding algorithm, must be "SAME" or "VALID". strides: see convolution. dilation_rate: see convolution. name: see convolution. - data_format: see convolution. + data_format: A string or `None`. Specifies whether the channel dimension of + the `input` and output is the last dimension (if `data_format` is `None` + or does not start with `NC`), or the first post-batch dimension (i.e. if + `data_format` starts with `NC`). + num_spatial_dims: (Usually optional.) Python integer, the rank of the + spatial and channel dimensions. For `1-D`, `2-D` and `3-D` convolutions, + the value of `num_spatial_dims` is `1`, `2`, and `3`, respectively. + This argument is only required to disambiguate the rank of `batch_shape` + when `filter_shape.ndims is None` and `len(batch_shape) > 1`. For + backwards compatibility, if `num_spatial_dims is None` and + `filter_shape.ndims is None`, then `len(batch_shape)` is assumed to be + `1` (i.e., the input is expected to be + `[batch_size, num_channels] + input_spatial_shape` + or `[batch_size] + input_spatial_shape + [num_channels]`. 
""" def __init__(self, @@ -1089,40 +1180,72 @@ class Convolution(object): strides=None, dilation_rate=None, name=None, - data_format=None): + data_format=None, + num_spatial_dims=None): """Helper function for convolution.""" - num_total_dims = filter_shape.ndims - if num_total_dims is None: - num_total_dims = input_shape.ndims - if num_total_dims is None: - raise ValueError("rank of input or filter must be known") + num_batch_dims = None + filter_shape = tensor_shape.as_shape(filter_shape) + input_shape = tensor_shape.as_shape(input_shape) - num_spatial_dims = num_total_dims - 2 + if filter_shape.ndims is not None: + if (num_spatial_dims is not None and + filter_shape.ndims != num_spatial_dims + 2): + raise ValueError( + "Expected filter_shape.ndims == num_spatial_dims + 2, " + "but saw filter_shape.ndims == {} and num_spatial_dims == {}" + .format(filter_shape.ndims, num_spatial_dims)) + else: + num_spatial_dims = filter_shape.ndims - 2 - try: - input_shape.with_rank(num_spatial_dims + 2) - except ValueError: + if input_shape.ndims is not None and num_spatial_dims is not None: + num_batch_dims = input_shape.ndims - num_spatial_dims - 1 + + if num_spatial_dims is None: + num_spatial_dims = input_shape.ndims - 2 + else: + if input_shape.ndims is not None: + if input_shape.ndims < num_spatial_dims + 2: + raise ValueError( + "Expected input_shape.ndims >= num_spatial_dims + 2, but saw " + "input_shape.ndims == {} and num_spatial_dims == {}" + .format(input_shape.ndims, num_spatial_dims)) + else: + if num_batch_dims is None: + num_batch_dims = input_shape.ndims - num_spatial_dims - 1 + + if num_spatial_dims is None: raise ValueError( - "input tensor must have rank %d" % (num_spatial_dims + 2)) + "Cannot estimate num_spatial_dims since input_shape.ndims is None, " + "filter_shape.ndims is None, and argument num_spatial_dims is also " + "None.") - try: - filter_shape.with_rank(num_spatial_dims + 2) - except ValueError: + if num_batch_dims is None: + num_batch_dims = 1 + + if num_batch_dims < 1: raise ValueError( - "filter tensor must have rank %d" % (num_spatial_dims + 2)) + "num_batch_dims should be >= 1, but saw {}. num_batch_dims was " + "estimated as `input_shape.ndims - num_spatial_dims - 1` and " + "num_spatial_dims was either provided or estimated as " + "`filter_shape.ndims - 2`. 
input_shape.ndims: {}, " + "num_spatial_dims: {}, filter_shape.ndims: {}" + .format(num_batch_dims, input_shape.ndims, num_spatial_dims, + filter_shape.ndims)) if data_format is None or not data_format.startswith("NC"): input_channels_dim = tensor_shape.dimension_at_index( - input_shape, num_spatial_dims + 1) - spatial_dims = range(1, num_spatial_dims + 1) + input_shape, num_spatial_dims + num_batch_dims) + spatial_dims = range(num_batch_dims, num_spatial_dims + num_batch_dims) else: - input_channels_dim = tensor_shape.dimension_at_index(input_shape, 1) - spatial_dims = range(2, num_spatial_dims + 2) + input_channels_dim = tensor_shape.dimension_at_index( + input_shape, num_batch_dims) + spatial_dims = range( + num_batch_dims + 1, num_spatial_dims + num_batch_dims + 1) if not input_channels_dim.is_compatible_with( filter_shape[num_spatial_dims]): raise ValueError( - "number of input channels does not match corresponding dimension of " + "Number of input channels does not match corresponding dimension of " "filter, {} != {}".format(input_channels_dim, filter_shape[num_spatial_dims])) @@ -1136,6 +1259,8 @@ class Convolution(object): self.padding = padding self.name = name self.dilation_rate = dilation_rate + self.num_batch_dims = num_batch_dims + self.num_spatial_dims = num_spatial_dims self.conv_op = _WithSpaceToBatch( input_shape, dilation_rate=dilation_rate, @@ -1143,7 +1268,8 @@ class Convolution(object): build_op=self._build_op, filter_shape=filter_shape, spatial_dims=spatial_dims, - data_format=data_format) + data_format=data_format, + num_batch_dims=num_batch_dims) def _build_op(self, _, padding): return _NonAtrousConvolution( @@ -1152,7 +1278,8 @@ class Convolution(object): padding=padding, data_format=self.data_format, strides=self.strides, - name=self.name) + name=self.name, + num_batch_dims=self.num_batch_dims) def __call__(self, inp, filter): # pylint: disable=redefined-builtin # TPU convolution supports dilations greater than 1. @@ -1165,7 +1292,8 @@ class Convolution(object): data_format=self.data_format, dilations=self.dilation_rate, name=self.name, - call_from_convolution=False) + call_from_convolution=False, + num_spatial_dims=self.num_spatial_dims) else: return self.conv_op(inp, filter) @@ -2392,6 +2520,42 @@ def conv2d_transpose_v2( name=name) +def _conv2d_expanded_batch( + input, # pylint: disable=redefined-builtin + filters, + strides, + padding, + data_format, + dilations, + name): + """Helper function for `convolution_internal`; handles expanded batches.""" + # Try really hard to avoid modifying the legacy name scopes - return early. + shape = getattr(input, "shape", None) + if shape is not None: + ndims = getattr(shape, "ndims", -1) + if ndims == -1: ndims = len(shape) + if ndims in (4, 3, 2, 1, 0, None): + return gen_nn_ops.conv2d( + input, + filter=filters, + strides=strides, + padding=padding, + data_format=data_format, + dilations=dilations, + name=name) + return _squeeze_batch_dims( + input, + functools.partial( + gen_nn_ops.conv2d, + filter=filters, + strides=strides, + padding=padding, + data_format=data_format, + dilations=dilations), + inner_rank=3, + name=name) + + @tf_export("nn.atrous_conv2d_transpose") @dispatch.add_dispatch_support def atrous_conv2d_transpose(value, From a66142c8e978cc09bb1f808855fc9e69d00f1bac Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 21 May 2020 13:58:24 -0700 Subject: [PATCH 0993/1533] [tf.data service] Apply dataset options to tf.data service side datasets. 
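For context, a rough sketch of the user-facing flow this affects. The snippet is
illustrative only: the API names follow the experimental tf.data service endpoints in
this tree, and the dispatcher address "grpc://localhost:5000" is a placeholder.

  import tensorflow as tf

  dataset = tf.data.Dataset.range(1000).map(lambda x: x + 1)

  options = tf.data.Options()
  options.experimental_optimization.apply_default_optimizations = True
  dataset = dataset.with_options(options)

  # With this change, the options set above are also applied to the copy of
  # the dataset that executes inside the tf.data service, so it is optimized
  # and can autotune there as well.
  dataset = dataset.apply(
      tf.data.experimental.service.distribute(
          processing_mode="parallel_epochs",
          service="grpc://localhost:5000"))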
PiperOrigin-RevId: 312735892 Change-Id: I29cd704823e9fe275c18f75dd1e35ac118abd18a --- tensorflow/python/data/experimental/ops/data_service_ops.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py index f2ebd51d187..782f438c701 100644 --- a/tensorflow/python/data/experimental/ops/data_service_ops.py +++ b/tensorflow/python/data/experimental/ops/data_service_ops.py @@ -219,6 +219,9 @@ def _distribute(processing_mode, # TODO(b/157105111): Set this to autotune when we have a way to limit # memory usage dataset = dataset.prefetch(1) + # Apply options so that the dataset executed in the tf.data service will + # be optimized and support autotuning. + dataset = dataset._apply_options() # pylint: disable=protected-access dataset_id = gen_experimental_dataset_ops.register_dataset( dataset._variant_tensor, # pylint: disable=protected-access address=address, From d30e05003ceccc3c6ddb95ab3b2978a21dcd9b96 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 21 May 2020 14:02:35 -0700 Subject: [PATCH 0994/1533] Disable two tests on windows due to image issue. PiperOrigin-RevId: 312736691 Change-Id: I3922e98d6bd8154d087b9f567e4c909b62a39c1d --- tensorflow/core/platform/BUILD | 1 + tensorflow/python/distribute/BUILD | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index f78b738247d..7f7ca0f06cd 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -386,6 +386,7 @@ py_test( name = "ram_file_system_test", srcs = ["ram_file_system_test.py"], python_version = "PY3", + tags = ["no_windows"], # TODO(b/156428279): reenable this test once the image is updated. deps = [ "//tensorflow:tensorflow_py", ], diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index a7e62a2dc7c..acbffb84089 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1546,6 +1546,7 @@ cuda_py_test( srcs = ["parameter_server_strategy_test.py"], tags = [ "multi_and_single_gpu", + "no_windows", # TODO(b/156428279): reenable this test once the image is updated. ], # b/141096229: Non-atomic AssignAdd xla_enable_strict_auto_jit = False, From 7315b275c05154c6e2701e0c934d11788e671d62 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 21 May 2020 14:14:42 -0700 Subject: [PATCH 0995/1533] Disable flaky test. PiperOrigin-RevId: 312739001 Change-Id: I7a7a9ad5cc7cf8ad63919d6473c15eb7c274692a --- tensorflow/python/keras/distribute/BUILD | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index 87625446e2f..50ed6086195 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -431,10 +431,11 @@ py_test( python_version = "PY3", shard_count = 5, tags = [ - "noasan", - "nomsan", - "notsan", - ], # TODO(b/156029134) + "noasan", # TODO(b/156029134) + "nomsan", # TODO(b/156029134) + "notap", # TODO(b/157253858) + "notsan", # TODO(b/156029134) + ], deps = [ "//tensorflow/python:platform", "//tensorflow/python/data/ops:dataset_ops", From d0a5894b58be100c698a2f49d3371a7c5e273d2f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 14:15:43 -0700 Subject: [PATCH 0996/1533] switch capture_tpu_profile to new api of profiler_client. because some options is dropped, therefore two flags are deprecated. 
also allow it specify host trace level. PiperOrigin-RevId: 312739183 Change-Id: I4e4712441877e697956d539055e333baf8a8d7bd --- tensorflow/python/tpu/profiler/BUILD | 3 +- .../tpu/profiler/capture_tpu_profile.py | 41 ++++++++++--------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/tpu/profiler/BUILD b/tensorflow/python/tpu/profiler/BUILD index b505262c6a2..84ffb4234c0 100644 --- a/tensorflow/python/tpu/profiler/BUILD +++ b/tensorflow/python/tpu/profiler/BUILD @@ -38,7 +38,8 @@ py_library( "//tensorflow/python:platform", "//tensorflow/python:versions", "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", - "//tensorflow/python/eager:profiler_client", + "//tensorflow/python/profiler:profiler_client", + "//tensorflow/python/profiler:profiler_v2", "@absl_py//absl:app", "@absl_py//absl/flags", ], diff --git a/tensorflow/python/tpu/profiler/capture_tpu_profile.py b/tensorflow/python/tpu/profiler/capture_tpu_profile.py index f0d22027e4e..0068dc402c0 100644 --- a/tensorflow/python/tpu/profiler/capture_tpu_profile.py +++ b/tensorflow/python/tpu/profiler/capture_tpu_profile.py @@ -25,7 +25,8 @@ from absl import flags from distutils.version import LooseVersion from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver as resolver -from tensorflow.python.eager import profiler_client +from tensorflow.python.profiler import profiler_client +from tensorflow.python.profiler import profiler_v2 as profiler from tensorflow.python.framework import errors from tensorflow.python.framework import versions from tensorflow.python.platform import gfile @@ -65,9 +66,10 @@ flags.DEFINE_integer('duration_ms', 0, flags.DEFINE_integer( 'num_tracing_attempts', 3, 'Automatically retry N times when no trace ' 'event is collected.') -flags.DEFINE_boolean('include_dataset_ops', True, - 'Set to false to profile longer TPU ' - 'device traces.') +flags.DEFINE_boolean('include_dataset_ops', True, 'Deprecated.') +flags.DEFINE_integer( + 'host_tracer_level', 2, 'Adjust host tracer level to control the verbosity ' + ' of the TraceMe event being collected.') # Monitoring parameters flags.DEFINE_integer( @@ -77,8 +79,7 @@ flags.DEFINE_integer( flags.DEFINE_integer( 'num_queries', 100, 'This script will run monitoring for num_queries before it stops.') -flags.DEFINE_boolean('display_timestamp', False, - 'Set to true to display timestamp in monitoring results.') +flags.DEFINE_boolean('display_timestamp', True, 'Deprecated.') def get_workers_list(cluster_resolver): @@ -111,8 +112,7 @@ def get_workers_list(cluster_resolver): return ','.join(workers_list) -def monitoring_helper(service_addr, duration_ms, monitoring_level, - display_timestamp, num_queries): +def monitoring_helper(service_addr, duration_ms, monitoring_level, num_queries): """Helper function to print monitoring results. Helper function to print monitoring results for num_queries times. @@ -122,15 +122,13 @@ def monitoring_helper(service_addr, duration_ms, monitoring_level, duration_ms: Duration of one monitoring sample in milliseconds. monitoring_level: An integer between 1 and 2. Level 2 is more verbose than level 1 and shows more metrics. - display_timestamp: Set to true to display timestamp in monitoring. num_queries: Number of monitoring samples to collect. 
""" if monitoring_level <= 0 or monitoring_level > 2: sys.exit('Please choose a monitoring level between 1 and 2.') for query in range(0, num_queries): - res = profiler_client.monitor(service_addr, duration_ms, monitoring_level, - display_timestamp) + res = profiler_client.monitor(service_addr, duration_ms, monitoring_level) print('Cloud TPU Monitoring Results (Sample ', query, '):\n\n', res) @@ -144,8 +142,8 @@ def main(unused_argv=None): print('TensorFlow version %s detected' % tf_version) print('Welcome to the Cloud TPU Profiler v%s' % profiler_version.__version__) - if LooseVersion(tf_version) < LooseVersion('1.14.0'): - sys.exit('You must install tensorflow >= 1.14.0 to use this plugin.') + if LooseVersion(tf_version) < LooseVersion('2.2.0'): + sys.exit('You must install tensorflow >= 2.2.0 to use this plugin.') if not FLAGS.service_addr and not FLAGS.tpu: sys.exit('You must specify either --service_addr or --tpu.') @@ -184,7 +182,7 @@ def main(unused_argv=None): FLAGS.duration_ms, ' ms and show metrics for ', FLAGS.num_queries, ' time(s).') monitoring_helper(service_addr, duration_ms, FLAGS.monitoring_level, - FLAGS.display_timestamp, FLAGS.num_queries) + FLAGS.num_queries) else: if not FLAGS.logdir: sys.exit('You must specify either --logdir or --monitoring_level.') @@ -193,11 +191,16 @@ def main(unused_argv=None): gfile.MakeDirs(FLAGS.logdir) try: - profiler_client.start_tracing(service_addr, - os.path.expanduser(FLAGS.logdir), - duration_ms, workers_list, - FLAGS.include_dataset_ops, - FLAGS.num_tracing_attempts) + if LooseVersion(tf_version) < LooseVersion('2.3.0'): + profiler_client.trace(service_addr, os.path.expanduser(FLAGS.logdir), + duration_ms, workers_list, + FLAGS.num_tracing_attempts) + else: + options = profiler.ProfilerOptions( + host_tracer_level=FLAGS.host_tracer_level) + profiler_client.trace(service_addr, os.path.expanduser(FLAGS.logdir), + duration_ms, workers_list, + FLAGS.num_tracing_attempts, options) except errors.UnavailableError: sys.exit(0) From 8d7f18b250a6356623509dee7a4d0636b8937784 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Thu, 21 May 2020 14:28:20 -0700 Subject: [PATCH 0997/1533] Add memory cost breakdown per operation type and memory space to OpMetrics. PiperOrigin-RevId: 312741555 Change-Id: Id6666a8c6b9d67fe443154fb135cadeeeaecebdc --- .../core/profiler/protobuf/op_metrics.proto | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/protobuf/op_metrics.proto b/tensorflow/core/profiler/protobuf/op_metrics.proto index c0f34773e02..af38795b7b2 100644 --- a/tensorflow/core/profiler/protobuf/op_metrics.proto +++ b/tensorflow/core/profiler/protobuf/op_metrics.proto @@ -26,7 +26,7 @@ message LayoutAnalysis { } // Metrics for an operation (accumulated over all occurrences). -// Next ID: 19 +// Next ID: 20 message OpMetrics { // HLO module id. 0 for TF ops. uint64 hlo_module_id = 13; @@ -50,6 +50,19 @@ message OpMetrics { uint64 flops = 2; // Total bytes accessed. uint64 bytes_accessed = 5; + // Breakdown of memory accessed by operation type and memory space. + message MemoryAccessed { + enum OperationType { + UNKNOWN = 0; + READ = 1; + WRITE = 2; + } + OperationType operation_type = 1; + // Device-specific id of memory space. + uint64 memory_space = 2; + uint64 bytes_accessed = 3; + } + repeated MemoryAccessed memory_accessed_breakdown = 19; // Total dma stall time in picoseconds. uint64 dma_stall_ps = 10; // The data layout for this op. Only set for convolution ops for now. 
From b6d6b451aaf59cb11d65c20480cdd10c95df7902 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Thu, 21 May 2020 14:33:48 -0700 Subject: [PATCH 0998/1533] PR #39548: [INTEL MKL] Fix conv_ops_test and remapper_test Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/39548 Fix two C++ test failures related to MKL ops. 1. conv_ops_test // MklConvOp does not support EXPLICIT padding 2. remapper_test // Fusion of MKL Conv and Mkl FusedBatchNorm is not supported The fix is to disable the related tests with MKL build. Copybara import of the project: -- 5d92849778771a475fe339d2954db12c3d4ecc2b by Guozhong Zhu... *** PiperOrigin-RevId: 312742653 Change-Id: I0393c00589c3d2bc04965e390c2b2ba249da0432 --- .../core/grappler/optimizers/remapper_test.cc | 2 - tensorflow/core/kernels/conv_ops_test.cc | 46 +++++-------------- 2 files changed, 12 insertions(+), 36 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc index 1946b864b9a..35e09b28205 100644 --- a/tensorflow/core/grappler/optimizers/remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/remapper_test.cc @@ -607,7 +607,6 @@ TEST_F(RemapperTest, FuseMatMulWithBiasAndActivation) { } } -#ifndef INTEL_MKL TEST_F(RemapperTest, FuseConv2DWithBatchNorm) { using ops::Placeholder; @@ -851,7 +850,6 @@ TEST_F(RemapperTest, FuseConv2DWithSqueezeAndBias) { ASSERT_EQ(tensors.size(), 1); test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } -#endif } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc index 308ec4053c3..21dffa3cc5e 100644 --- a/tensorflow/core/kernels/conv_ops_test.cc +++ b/tensorflow/core/kernels/conv_ops_test.cc @@ -1028,14 +1028,12 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolution) { this->VerifyConv2DWithBias(filter_size, filter_count); } -#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolution) { const int filter_size = 3; const int filter_count = 12; this->VerifyConv2DWithBias(filter_size, filter_count, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } -#endif TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndActivation) { const int filter_size = 1; @@ -1064,7 +1062,6 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolutionAndActivation) { } } -#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolutionAndActivation) { const int filter_size = 3; @@ -1075,7 +1072,6 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } } -#endif // -------------------------------------------------------------------------- // // Conv2D + FusedBatchNorm + {Activation} // @@ -1099,7 +1095,6 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolution) { this->VerifyConv2DWithBatchNorm(filter_size, filter_count); } -#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) { const int filter_size = 3; const int filter_count = 12; @@ -1107,7 +1102,6 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) { filter_size, filter_count, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } -#endif TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndActivation) { const int filter_size = 1; @@ -1137,7 +1131,6 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndActivation) { } } -#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, 
ExplicitPaddingConvolutionAndActivation) { const int filter_size = 3; @@ -1148,49 +1141,34 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } } -#endif -REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest, // - OneByOneConvolution, // - ImageSizeConvolution, // - SpatialConvolution, // -#ifndef INTEL_MKL - ExplicitPaddingConvolution, // -#endif +REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest, // + OneByOneConvolution, // + ImageSizeConvolution, // + SpatialConvolution, // + ExplicitPaddingConvolution, // OneByOneConvolutionAndActivation, // ImageSizeConvolutionAndActivation, // -#ifndef INTEL_MKL - SpatialConvolutionAndActivation, // + SpatialConvolutionAndActivation, // ExplicitPaddingConvolutionAndActivation); -#else - SpatialConvolutionAndActivation); -#endif -REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest, // - OneByOneConvolution, // - ImageSizeConvolution, // - SpatialConvolution, // -#ifndef INTEL_MKL - ExplicitPaddingConvolution, // -#endif +REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest, // + OneByOneConvolution, // + ImageSizeConvolution, // + SpatialConvolution, // + ExplicitPaddingConvolution, // OneByOneConvolutionAndActivation, // ImageSizeConvolutionAndActivation, // -#ifndef INTEL_MKL - SpatialConvolutionAndActivation, // + SpatialConvolutionAndActivation, // ExplicitPaddingConvolutionAndActivation); -#else - SpatialConvolutionAndActivation); -#endif using FusedBiasAddDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest, FusedBiasAddDataTypes); -#ifndef INTEL_MKL using FusedBatchNormDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBatchNormOpTest, FusedBatchNormDataTypes); -#endif #endif // TENSORFLOW_USE_ROCM } // namespace tensorflow From ba7f59aadc94461bc356ecc3c19831d6fbbe5a62 Mon Sep 17 00:00:00 2001 From: Trevor Gale Date: Thu, 21 May 2020 14:34:03 -0700 Subject: [PATCH 0999/1533] Adding uint32 support for more variable related operations. PiperOrigin-RevId: 312742706 Change-Id: Ifc6958496ad999d517f997012fb81fd839e3166d --- tensorflow/core/kernels/resource_variable_ops.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index ccd1e3c835d..b606d411a3d 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -282,6 +282,7 @@ REGISTER_KERNEL_BUILDER( TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); TF_CALL_int64(REGISTER_GPU_KERNELS); TF_CALL_variant(REGISTER_GPU_KERNELS); +TF_CALL_uint32(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS REGISTER_KERNEL_BUILDER(Name("_VarHandlesOp") @@ -511,6 +512,7 @@ class AssignVariableOp : public OpKernel { TF_CALL_ALL_TYPES(REGISTER_KERNELS); TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); +TF_CALL_uint32(REGISTER_KERNELS); #undef REGISTER_KERNELS #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -524,6 +526,7 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_KERNELS); TF_CALL_GPU_ALL_TYPES(REGISTER_GPU_KERNELS); TF_CALL_int64(REGISTER_GPU_KERNELS); TF_CALL_variant(REGISTER_GPU_KERNELS); +TF_CALL_uint32(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM From 7273b46195ea0407b08745ae517592a796cd6fe7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 14:35:35 -0700 Subject: [PATCH 1000/1533] Update ops-related pbtxt files. 
PiperOrigin-RevId: 312742989 Change-Id: I9a182872d968af3c34822e792634332f3f04c35f --- .../ops_history_v2/ExtractGlimpseV2.pbtxt | 47 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 47 +++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 tensorflow/core/ops/compat/ops_history_v2/ExtractGlimpseV2.pbtxt diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExtractGlimpseV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExtractGlimpseV2.pbtxt new file mode 100644 index 00000000000..08725f4504c --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/ExtractGlimpseV2.pbtxt @@ -0,0 +1,47 @@ +op { + name: "ExtractGlimpseV2" + input_arg { + name: "input" + type: DT_FLOAT + } + input_arg { + name: "size" + type: DT_INT32 + } + input_arg { + name: "offsets" + type: DT_FLOAT + } + output_arg { + name: "glimpse" + type: DT_FLOAT + } + attr { + name: "centered" + type: "bool" + default_value { + b: true + } + } + attr { + name: "normalized" + type: "bool" + default_value { + b: true + } + } + attr { + name: "uniform_noise" + type: "bool" + default_value { + b: true + } + } + attr { + name: "noise" + type: "string" + default_value { + s: "uniform" + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 75f8c0dadcb..2f6e0dc0d4c 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -15111,6 +15111,53 @@ op { } } } +op { + name: "ExtractGlimpseV2" + input_arg { + name: "input" + type: DT_FLOAT + } + input_arg { + name: "size" + type: DT_INT32 + } + input_arg { + name: "offsets" + type: DT_FLOAT + } + output_arg { + name: "glimpse" + type: DT_FLOAT + } + attr { + name: "centered" + type: "bool" + default_value { + b: true + } + } + attr { + name: "normalized" + type: "bool" + default_value { + b: true + } + } + attr { + name: "uniform_noise" + type: "bool" + default_value { + b: true + } + } + attr { + name: "noise" + type: "string" + default_value { + s: "uniform" + } + } +} op { name: "ExtractImagePatches" input_arg { From dbef0933ebe4d3d85be73e88cfe5f83cac0ae1d6 Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Thu, 21 May 2020 14:40:15 -0700 Subject: [PATCH 1001/1533] Reapplying #39042 to avoid breaking some internal users. This improves recompute_grad when variables are present. PiperOrigin-RevId: 312743821 Change-Id: I2debf8f80b036c053ef3325adeb56b78f32dd859 --- tensorflow/python/eager/forwardprop_test.py | 10 +- .../python/keras/integration_test/BUILD | 12 +- .../gradient_checkpoint_test.py | 158 ++++++++++++++++++ tensorflow/python/ops/custom_gradient.py | 65 ++++--- tensorflow/python/ops/gradients_test.py | 48 +++++- 5 files changed, 261 insertions(+), 32 deletions(-) create mode 100644 tensorflow/python/keras/integration_test/gradient_checkpoint_test.py diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 4ddba6b9be3..dd0bad30cb8 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -199,7 +199,6 @@ def _test_gradients(testcase, # And the symbolic computations should be much closer. 
testcase.assertAllClose(sym_jac_back, sym_jac_fwd) - class ForwardpropTest(test.TestCase, parameterized.TestCase): def testJVPFunction(self): @@ -361,14 +360,17 @@ class ForwardpropTest(test.TestCase, parameterized.TestCase): _test_gradients(self, f, [constant_op.constant([1., 2.])], order=3) - @test_util.assert_no_new_pyobjects_executing_eagerly - def testCustomGradientRecomputeGrad(self): + # TODO(allenl): investigate why assert_no_new_pyobjects_executing_eagerly fails around this test? + def testExceptionCustomGradientRecomputeGradForward(self): @custom_gradient.recompute_grad def f(x): return math_ops.reduce_prod(math_ops.tanh(x)**2) - _test_gradients(self, f, [constant_op.constant([1.])], order=3) + with self.assertRaisesRegexp(NotImplementedError, + "recompute_grad tried to transpose"): + primals = [constant_op.constant([1.])] + sym_jac_fwd = _jacfwd(f, primals) def testExceptionInCustomGradientNotSwallowed(self): diff --git a/tensorflow/python/keras/integration_test/BUILD b/tensorflow/python/keras/integration_test/BUILD index 01c405a86ae..80d8fb86345 100644 --- a/tensorflow/python/keras/integration_test/BUILD +++ b/tensorflow/python/keras/integration_test/BUILD @@ -1,7 +1,7 @@ # Description: # Contains Keras integration tests that verify with other TF high level APIs. -load("//tensorflow:tensorflow.bzl", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "cuda_py_test", "tf_py_test") package( default_visibility = [ @@ -70,3 +70,13 @@ tf_py_test( "//tensorflow/python:extra_py_tests_deps", ], ) + +cuda_py_test( + name = "gradient_checkpoint_test", + srcs = ["gradient_checkpoint_test.py"], + python_version = "PY3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python:extra_py_tests_deps", + ], +) diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py new file mode 100644 index 00000000000..9d9e0a062b3 --- /dev/null +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -0,0 +1,158 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +layers = tf.keras.layers +optimizers = tf.keras.optimizers + + +def _get_big_cnn_model(img_dim, n_channels, num_partitions, + blocks_per_partition): + """Creates a test model whose activations are significantly larger than model size.""" + model = tf.keras.Sequential() + model.add(layers.Input(shape=(img_dim, img_dim, n_channels))) + for _ in range(num_partitions): + for _ in range(blocks_per_partition): + model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Flatten()) + model.add(layers.Dense(32, activation=tf.nn.relu)) + model.add(layers.Dense(10)) + return model + + +def _get_split_cnn_model(img_dim, n_channels, num_partitions, + blocks_per_partition): + """Creates a test model that is split into `num_partitions` smaller models""" + models = [tf.keras.Sequential() for _ in range(num_partitions)] + models[0].add(layers.Input(shape=(img_dim, img_dim, n_channels))) + for i in range(num_partitions): + model = models[i] + if i > 0: + last_shape = models[i - 1].layers[-1].output_shape + model.add(layers.Input(shape=last_shape[1:])) + for _ in range(blocks_per_partition): + model.add(layers.Conv2D(10, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(40, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + model.add(layers.Conv2D(20, 5, padding='same', activation=tf.nn.relu)) + model.add(layers.MaxPooling2D((1, 1), padding='same')) + models[-1].add(layers.Flatten()) + models[-1].add(layers.Dense(32, activation=tf.nn.relu)) + models[-1].add(layers.Dense(10)) + return models + + +def _compute_loss(logits, labels): + return tf.reduce_mean( + tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=labels)) + + +def _limit_gpu_memory(): + """Helper function to limit GPU memory for testing """ + gpus = tf.config.experimental.list_physical_devices('GPU') + if gpus: + tf.config.experimental.set_virtual_device_configuration( + gpus[0], + [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)]) + return True + return False + + +def _get_dummy_data(img_dim, n_channels, batch_size): + inputs = tf.ones([batch_size, img_dim, img_dim, n_channels]) + labels = tf.ones([batch_size], dtype=tf.int64) + return inputs, labels + + +def _train_no_recompute(n_steps): + """Trains a single large model without gradient checkpointing.""" + img_dim, n_channels, batch_size = 256, 1, 4 + x, y = _get_dummy_data(img_dim, n_channels, batch_size) + model = _get_big_cnn_model( + img_dim, n_channels, num_partitions=3, blocks_per_partition=2) + optimizer = optimizers.SGD() + losses = [] + tr_vars = model.trainable_variables + for _ in range(n_steps): + with tf.GradientTape() as tape: + logits = model(x) + loss = _compute_loss(logits, y) + losses.append(loss) + grads = tape.gradient(loss, tr_vars) # tr_vars + optimizer.apply_gradients(zip(grads, tr_vars)) + del grads + return losses + + +def 
_train_with_recompute(n_steps): + """Trains a single large model with gradient checkpointing using tf.recompute_grad.""" + img_dim, n_channels, batch_size = 256, 1, 4 + x, y = _get_dummy_data(img_dim, n_channels, batch_size) + # This model is the same model as _get_big_cnn_model but split into 3 parts. + models = _get_split_cnn_model( + img_dim, n_channels, num_partitions=3, blocks_per_partition=2) + model1, model2, model3 = models + # Apply gradient checkpointing to the submodels using tf.recompute_grad. + model1_re = tf.recompute_grad(model1) + model2_re = tf.recompute_grad(model2) + model3_re = tf.recompute_grad(model3) + optimizer = optimizers.SGD() + tr_vars = ( + model1.trainable_variables + model2.trainable_variables + + model3.trainable_variables) + losses = [] + for _ in range(n_steps): + with tf.GradientTape() as tape: + logits1 = model1_re(x) + logits2 = model2_re(logits1) + logits3 = model3_re(logits2) + loss = _compute_loss(logits3, y) + losses.append(loss) + grads = tape.gradient(loss, tr_vars) # tr_vars + optimizer.apply_gradients(zip(grads, tr_vars)) + del grads + return losses + + +class GradientCheckpointTest(tf.test.TestCase): + + def test_raises_oom_exception(self): + if not _limit_gpu_memory(): + self.skipTest('No virtual GPUs found') + with self.assertRaises(Exception) as context: + _train_no_recompute(1) + self.assertTrue( + context.exception.__class__.__name__ == 'ResourceExhaustedError') + + def test_does_not_raise_oom_exception(self): + if not _limit_gpu_memory(): + self.skipTest('No virtual GPUs found') + n_step = 2 + losses = _train_with_recompute(n_step) + self.assertTrue(len(losses) == n_step) + + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index 4040a4db038..2a9194fb146 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -28,6 +28,7 @@ from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import op_selector from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest from tensorflow.python.util import tf_decorator @@ -351,13 +352,8 @@ def _graph_mode_decorator(f, args, kwargs): "argument 'variables'.") if variables_in_signature and not variables: # User seems to intend to use variables but none were captured. 
- if not variable_scope.get_variable_scope().use_resource: - raise TypeError("If using @custom_gradient with a function that " - "uses variables, the enclosing variable scope must " - "have use_resource=True.") - else: - logging.warn("@custom_gradient grad_fn has 'variables' in signature, but " - "no ResourceVariables were used on the forward pass.") + logging.warn("@custom_gradient grad_fn has 'variables' in signature, but " + "no ResourceVariables were used on the forward pass.") flat_result = nest.flatten(result) flat_result_len = len(flat_result) @@ -482,28 +478,47 @@ def recompute_grad(f): def inner(*args, **kwargs): """Inner function closure for calculating gradients.""" current_var_scope = variable_scope.get_variable_scope() + with tape_lib.stop_recording(): + result = f(*args, **kwargs) - result = f(*args, **kwargs) + def grad_wrapper(*wrapper_args, **grad_kwargs): + """Wrapper function to accomodate lack of kwargs in graph mode decorator.""" - def grad(*dresult, **grad_kwargs): - """Gradient function calculation for inner function.""" - variables = grad_kwargs.get("variables") - with backprop.GradientTape() as t: - id_args = [gen_array_ops.identity(x) for x in args] - t.watch(id_args) + @custom_gradient + def inner_recompute_grad(*dresult): + """Nested custom gradient function for computing grads in reverse and forward mode autodiff.""" + # Gradient calculation for reverse mode autodiff. + variables = grad_kwargs.get("variables") + with backprop.GradientTape() as t: + id_args = [gen_array_ops.identity(x) for x in args] + t.watch(id_args) + if variables is not None: + t.watch(variables) + with ops.control_dependencies(dresult): + with variable_scope.variable_scope(current_var_scope): + result = f(*id_args, **kwargs) + kw_vars = [] if variables is not None: - t.watch(variables) - with ops.control_dependencies(dresult): - with variable_scope.variable_scope(current_var_scope): - result = f(*id_args, **kwargs) - kw_vars = [] - if variables is not None: - kw_vars = list(variables) - grads = t.gradient( - result, list(id_args) + kw_vars, output_gradients=dresult) - return grads[:len(id_args)], grads[len(id_args):] + kw_vars = list(variables) + grads = t.gradient( + result, + list(id_args) + kw_vars, + output_gradients=dresult, + unconnected_gradients=UnconnectedGradients.ZERO) - return result, grad + def transpose(*t_args, **t_kwargs): + """Gradient function calculation for forward mode autodiff.""" + # Just throw an error since gradients / activations are not stored on tape for recompute. + raise NotImplementedError( + "recompute_grad tried to transpose grad of {}. 
" + "Consider not using recompute_grad in forward mode" + "autodiff".format(f.__name__)) + + return (grads[:len(id_args)], grads[len(id_args):]), transpose + + return inner_recompute_grad(*wrapper_args) + + return result, grad_wrapper return inner diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 817d8a1adbe..a06be7af74b 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -59,6 +59,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.ops.nn_ops import bias_add from tensorflow.python.platform import googletest +from tensorflow.python.ops import gradient_checker_v2 class GradientsTest(test_util.TensorFlowTestCase, parameterized.TestCase): @@ -1340,6 +1341,46 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): return grads_re, grads + def _grad(self, f, argnums=0): + """Return a function which computes the gradient of `f`.""" + + def _f(*params): + with backprop.GradientTape() as tape: + tape.watch(params) + outputs = f(*params) + return tape.gradient( + outputs, + params[argnums], + unconnected_gradients=unconnected_gradients.UnconnectedGradients.ZERO) + + return _f + + def _test_gradients(self, f, inputs, order, delta=1e-3, rtol=1e-2, atol=1e-6): + """Tests backward jacobians of `f`'s [0, `order`)-order gradients.""" + if order < 1: + raise ValueError( + "`order` should be a positive integer, got '{}'.".format(order)) + if order > 1: + self._test_gradients( + f=self._grad(f), + inputs=inputs, + order=order - 1, + delta=delta, + rtol=rtol, + atol=atol) + sym_jac_back, num_jac = gradient_checker_v2.compute_gradient( + f, inputs, delta=delta) + self.assertAllClose(num_jac, sym_jac_back, rtol=rtol, atol=atol) + + @test_util.run_v2_only + def testCustomGradientRecomputeGradHigherOrder(self): + + @custom_gradient.recompute_grad + def f(x): + return math_ops.reduce_prod(math_ops.tanh(x)**2) + + self._test_gradients(f, [constant_op.constant([1.])], order=3) + @test_util.run_in_graph_and_eager_modes def testFnRecompute(self): """Checks that recompute_grad works grads of function args.""" @@ -1356,8 +1397,8 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): shape=10, trainable=True, ) - - test_input = constant(np.zeros((10, 10), dtype=np.float32)) + self.evaluate(test_var.assign(np.ones([10]))) + test_input = constant(np.ones((10, 10), dtype=np.float32)) grads_re, grads = self._TestFnVariablesGradient(test_input, TestFn, test_input) @@ -1400,6 +1441,7 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): shape=10, trainable=True, ) + self.evaluate(test_var.assign(np.ones([10]))) return input_t * test_var test_input_t = constant(np.zeros((10, 10), dtype=np.float32)) @@ -1442,6 +1484,8 @@ class VariablesGradientTest(test_util.TensorFlowTestCase): out_re = test_fn_re(test_input_t) out = TestFn(test_input_t) + init = variables.global_variables_initializer() + self.evaluate(init) grads_re = gradients.gradients(out_re, variables.trainable_variables()) grads = gradients.gradients(out, variables.trainable_variables()) From b91ea0f96e26a37dde2c7290c0e19e9e5ead2602 Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Thu, 21 May 2020 14:44:05 -0700 Subject: [PATCH 1002/1533] Fix SideEffecting semantics of xla_hlo ops without regions. AfterAll, DynamicReshape and Return ops do not have side-effects and Trace op does have side effects. 
This is compatible with the behavior described in HloInstruction::HasSideEffectNoRecurse(). PiperOrigin-RevId: 312744557 Change-Id: Ie3b476841fcc486e3b76b23f665f002bce262738 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 093e79a8613..ed57ded47e7 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -461,7 +461,7 @@ def HLO_ReplicaIdOp : HLO_Op<"replica_id", [NoSideEffect]>, // XLA control flow op definitions. //===----------------------------------------------------------------------===// -def HLO_AfterAllOp : HLO_Op<"after_all", []> { +def HLO_AfterAllOp : HLO_Op<"after_all", [NoSideEffect]> { string summary = "AfterAll operator"; @@ -1076,7 +1076,7 @@ def HLO_ReshapeOp: HLO_Op<"reshape", let hasCustomHLOConverter = 1; } -def HLO_DynamicReshapeOp: HLO_Op<"dynamic_reshape", []> { +def HLO_DynamicReshapeOp: HLO_Op<"dynamic_reshape", [NoSideEffect]> { let summary = "Reshape a tensor to a given, possibly dynamic, shape."; let description = [{ Reshapes `operand` to `output_shape`. @@ -1212,7 +1212,7 @@ def HLO_PadOp: HLO_Op<"pad", let hasCustomHLOConverter = 1; } -def HLO_TraceOp: HLO_Op<"trace", [NoSideEffect]>, BASE_HLO_TraceOp { +def HLO_TraceOp: HLO_Op<"trace", []>, BASE_HLO_TraceOp { let arguments = (ins HLO_Tensor:$operand, StrAttr:$tag @@ -1277,7 +1277,7 @@ def HLO_ReduceWindowOp: HLO_Op<"reduce_window", [ // TODO(hinsu): Implement custom printer and parser. } -def HLO_ReturnOp : HLO_Op<"return", [Terminator]> { +def HLO_ReturnOp : HLO_Op<"return", [NoSideEffect, Terminator]> { let summary = [{ The `hlo.return` operation terminates a region and returns values. }]; From 7d4fe3dd9c77953630aa9a07c4218b323ebafe87 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 14:45:45 -0700 Subject: [PATCH 1003/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 312744859 Change-Id: If95744addb0d945bfd6f10f72d5bb906dd75b1cc --- tensorflow/go/op/wrappers.go | 102 +++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 47f5c4952b6..530ea2fad58 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11935,6 +11935,108 @@ func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxe return op.Output(0) } +// ExtractGlimpseV2Attr is an optional argument to ExtractGlimpseV2. +type ExtractGlimpseV2Attr func(optionalAttr) + +// ExtractGlimpseV2Centered sets the optional centered attribute to value. +// +// value: indicates if the offset coordinates are centered relative to +// the image, in which case the (0, 0) offset is relative to the center +// of the input images. If false, the (0,0) offset corresponds to the +// upper left corner of the input images. +// If not specified, defaults to true +func ExtractGlimpseV2Centered(value bool) ExtractGlimpseV2Attr { + return func(m optionalAttr) { + m["centered"] = value + } +} + +// ExtractGlimpseV2Normalized sets the optional normalized attribute to value. +// +// value: indicates if the offset coordinates are normalized. 
+// If not specified, defaults to true +func ExtractGlimpseV2Normalized(value bool) ExtractGlimpseV2Attr { + return func(m optionalAttr) { + m["normalized"] = value + } +} + +// ExtractGlimpseV2UniformNoise sets the optional uniform_noise attribute to value. +// +// value: indicates if the noise should be generated using a +// uniform distribution or a Gaussian distribution. +// If not specified, defaults to true +func ExtractGlimpseV2UniformNoise(value bool) ExtractGlimpseV2Attr { + return func(m optionalAttr) { + m["uniform_noise"] = value + } +} + +// ExtractGlimpseV2Noise sets the optional noise attribute to value. +// +// value: indicates if the noise should `uniform`, `gaussian`, or +// `zero`. The default is `uniform` which means the the noise type +// will be decided by `uniform_noise`. +// If not specified, defaults to "uniform" +func ExtractGlimpseV2Noise(value string) ExtractGlimpseV2Attr { + return func(m optionalAttr) { + m["noise"] = value + } +} + +// Extracts a glimpse from the input tensor. +// +// Returns a set of windows called glimpses extracted at location +// `offsets` from the input tensor. If the windows only partially +// overlaps the inputs, the non overlapping areas will be filled with +// random noise. +// +// The result is a 4-D tensor of shape `[batch_size, glimpse_height, +// glimpse_width, channels]`. The channels and batch dimensions are the +// same as that of the input tensor. The height and width of the output +// windows are specified in the `size` parameter. +// +// The argument `normalized` and `centered` controls how the windows are built: +// +// * If the coordinates are normalized but not centered, 0.0 and 1.0 +// correspond to the minimum and maximum of each height and width +// dimension. +// * If the coordinates are both normalized and centered, they range from +// -1.0 to 1.0. The coordinates (-1.0, -1.0) correspond to the upper +// left corner, the lower right corner is located at (1.0, 1.0) and the +// center is at (0, 0). +// * If the coordinates are not normalized they are interpreted as +// numbers of pixels. +// +// Arguments: +// input: A 4-D float tensor of shape `[batch_size, height, width, channels]`. +// size: A 1-D tensor of 2 elements containing the size of the glimpses +// to extract. The glimpse height must be specified first, following +// by the glimpse width. +// offsets: A 2-D integer tensor of shape `[batch_size, 2]` containing +// the y, x locations of the center of each window. +// +// Returns A tensor representing the glimpses `[batch_size, +// glimpse_height, glimpse_width, channels]`. +func ExtractGlimpseV2(scope *Scope, input tf.Output, size tf.Output, offsets tf.Output, optional ...ExtractGlimpseV2Attr) (glimpse tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "ExtractGlimpseV2", + Input: []tf.Input{ + input, size, offsets, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // ExtractGlimpseAttr is an optional argument to ExtractGlimpse. type ExtractGlimpseAttr func(optionalAttr) From 7bfbd3f7be0725ee9c220047fe85032cf126d92b Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 21 May 2020 15:05:37 -0700 Subject: [PATCH 1004/1533] Fix infinite loop in GetMatchingPaths in GCS Tested by creating a custom bucket with an object starting with `/` and running the following queries. 
Without this patch, both of the queries should result in a hanging behavior, but now they return the proper results instead: ``` (env2) mihaimaruseac@ankh:/tmp/tf$ python -c 'import tensorflow as tf; print(tf.io.gfile.glob("gs://bucket/folder/*"))' [] (env2) mihaimaruseac@ankh:/tmp/tf$ python -c 'import tensorflow as tf; print(tf.io.gfile.glob("gs://bucket/folder/\/*"))' ['gs://bucket/folder//foo.txt'] ``` Fixes #36394 PiperOrigin-RevId: 312748645 Change-Id: I4e2eb82a8be31643cd3ce745451f9e8f2d32173e --- .../core/platform/cloud/gcs_file_system.cc | 19 ++++++- .../platform/cloud/gcs_file_system_test.cc | 50 +++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc index e4047c78998..92210498b01 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system.cc @@ -158,12 +158,17 @@ string JoinGcsPath(const string& path, const string& subpath) { /// For example: /// - for 'a/b/c/d' it will append 'a', 'a/b' and 'a/b/c' /// - for 'a/b/c/' it will append 'a', 'a/b' and 'a/b/c' +/// - for 'a//b/c/' it will append 'a', 'a//b' and 'a//b/c' +/// - for '/a/b/c/' it will append '/a', '/a/b' and '/a/b/c' std::set AddAllSubpaths(const std::vector& paths) { std::set result; result.insert(paths.begin(), paths.end()); for (const string& path : paths) { StringPiece subpath = io::Dirname(path); - while (!subpath.empty()) { + // If `path` starts with `/`, `subpath` will be `/` and then we get into an + // infinite loop. Same behavior happens if there is a `//` pattern in + // `path`, so we check for that and leave the loop quicker. + while (!(subpath.empty() || subpath == "/")) { result.emplace(string(subpath)); subpath = io::Dirname(subpath); } @@ -1349,9 +1354,19 @@ Status GcsFileSystem::GetMatchingPaths(const string& pattern, const auto& files_and_folders = AddAllSubpaths(all_files); + // To handle `/` in the object names, we need to remove it from `dir` + // and then use `StrCat` to insert it back. + const StringPiece dir_no_slash = str_util::StripSuffix(dir, "/"); + // Match all obtained paths to the input pattern. for (const auto& path : files_and_folders) { - const string& full_path = this->JoinPath(dir, path); + // Manually construct the path instead of using `JoinPath` for the + // cases where `path` starts with a `/` (which is a valid character in + // the filenames of GCS objects). `JoinPath` canonicalizes the result, + // removing duplicate slashes. We know that `dir_no_slash` does not + // end in `/`, so we are safe inserting the new `/` here as the path + // separator. + const string full_path = strings::StrCat(dir_no_slash, "/", path); if (this->Match(full_path, pattern)) { results->push_back(full_path); } diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc index 802f18a31ae..14af9f979e6 100644 --- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc +++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc @@ -1969,6 +1969,56 @@ TEST(GcsFileSystemTest, GetMatchingPaths_SelfDirectoryMarker) { EXPECT_EQ(std::vector({"gs://bucket/path/file3.txt"}), result); } +TEST(GcsFileSystemTest, GetMatchingPaths_SlashInObjectName) { + std::vector requests({new FakeHttpRequest( + "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?" 
+ "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n" + "Auth Token: fake_token\n" + "Timeouts: 5 1 10\n", + "{\"items\": [ " + " { \"name\": \"path/\" }," + " { \"name\": \"path//foo.txt\" }]}")}); + GcsFileSystem fs(std::unique_ptr(new FakeAuthProvider), + std::unique_ptr( + new FakeHttpRequestFactory(&requests)), + std::unique_ptr(new FakeZoneProvider), + 0 /* block size */, 0 /* max bytes */, 0 /* max staleness */, + 0 /* stat cache max age */, 0 /* stat cache max entries */, + 0 /* matching paths cache max age */, + 0 /* matching paths cache max entries */, kTestRetryConfig, + kTestTimeoutConfig, *kAllowedLocationsDefault, + nullptr /* gcs additional header */); + + std::vector result; + TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/*", &result)); + EXPECT_EQ(std::vector(), result); +} + +TEST(GcsFileSystemTest, GetMatchingPaths_SlashInObjectNameEscaped) { + std::vector requests({new FakeHttpRequest( + "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?" + "fields=items%2Fname%2CnextPageToken&prefix=path%2F\n" + "Auth Token: fake_token\n" + "Timeouts: 5 1 10\n", + "{\"items\": [ " + " { \"name\": \"path/\" }," + " { \"name\": \"path//foo.txt\" }]}")}); + GcsFileSystem fs(std::unique_ptr(new FakeAuthProvider), + std::unique_ptr( + new FakeHttpRequestFactory(&requests)), + std::unique_ptr(new FakeZoneProvider), + 0 /* block size */, 0 /* max bytes */, 0 /* max staleness */, + 0 /* stat cache max age */, 0 /* stat cache max entries */, + 0 /* matching paths cache max age */, + 0 /* matching paths cache max entries */, kTestRetryConfig, + kTestTimeoutConfig, *kAllowedLocationsDefault, + nullptr /* gcs additional header */); + + std::vector result; + TF_EXPECT_OK(fs.GetMatchingPaths("gs://bucket/path/\\/*", &result)); + EXPECT_EQ(std::vector({"gs://bucket/path//foo.txt"}), result); +} + TEST(GcsFileSystemTest, GetMatchingPaths_FolderAndWildcard_NoMatches) { std::vector requests({new FakeHttpRequest( "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?" From 85c637969a25228065a276044691dab020984361 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Thu, 21 May 2020 15:16:46 -0700 Subject: [PATCH 1005/1533] Handle case when input is complex64 type. Fixes #38932 PiperOrigin-RevId: 312750937 Change-Id: Icfe1baa83bfb8916277c15b8d8fb254841fb2d38 --- .../compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index 6dd44e666fb..0effcdc5e4e 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -121,6 +121,8 @@ DataType ConvertIODataTypeToDataType(toco::IODataType dtype) { return DT_STRING; case toco::IODataType::BOOL: return DT_BOOL; + case toco::IODataType::COMPLEX64: + return DT_COMPLEX64; default: return DT_INVALID; } From 60fb5dcc7db0b65d1147358df19101eeafb387ce Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Thu, 21 May 2020 15:26:09 -0700 Subject: [PATCH 1006/1533] Simplify and address missing features in TPU Extract Head Outside Compilation pass. This updates the TPUExtractHeadTailOutsideCompilation in preparation for outside compilation tail extraction. Certain parts from outside compilation head extraction can be reused. Support for ops with no operands and pruning of aliased results in the cluster is also added. 
PiperOrigin-RevId: 312752658 Change-Id: I7b07773b59d2dd009ac694dea083caf4eca74c00 --- ...extract_head_tail_outside_compilation.mlir | 98 ++++- ...u_extract_head_tail_outside_compilation.cc | 335 ++++++++---------- 2 files changed, 238 insertions(+), 195 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 90fa8cff5dc..3e8ade180b1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -6,12 +6,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-LABEL: func @single_head_outside_compilation func @single_head_outside_compilation(%arg0 : tensor) -> () { // CHECK: tf_device.launch - // // CHECK: "tf.A" // CHECK-NEXT: tf_device.return - // - // CHECK: device - // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: "tf_device.cluster" // CHECK: "tf.C" @@ -28,6 +25,88 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @ops_no_operands + func @ops_no_operands() -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.B"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> (tensor) + %1 = "tf.B"(%0) {}: (tensor) -> (tensor) + "tf.C"(%1) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } +} + +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @aliased_output + func @aliased_output() -> (tensor, tensor, tensor) { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C" + // CHECK-NEXT: tf_device.return %[[C_OUT]], %[[B_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: return %[[LAUNCH_OUT]], %[[CLUSTER_OUT]]#0, %[[CLUSTER_OUT]]#1 + %0:3 = "tf_device.cluster"() ( { + %1 = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> (tensor) + %2 = "tf.B"(%1) {}: (tensor) -> (tensor) + %3 = "tf.C"(%2) : (tensor) -> (tensor) + tf_device.return %1, %3, %2 : tensor, tensor, tensor + }) {num_cores_per_replica 
= 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor) + return %0#0, %0#1, %0#2 : tensor, tensor, tensor + } +} + +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @all_head_computation_ops + func @all_head_computation_ops(%arg0 : tensor) -> (tensor) { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK: %[[C_OUT:.*]] = "tf.C"(%[[B_OUT]], %arg0) + // CHECK-NEXT: tf_device.return %[[C_OUT]] + // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: tf_device.return + // + // CHECK: return %[[LAUNCH_OUT]] + %0 = "tf_device.cluster"() ( { + %1 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %2 = "tf.B"(%1) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %3 = "tf.C"(%2, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + tf_device.return %3 : tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor) + return %0 : tensor + } +} + +// ----- + module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @multiple_head_outside_compilation func @multiple_head_outside_compilation(%arg0 : tensor) -> () { @@ -36,8 +115,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) // CHECK: "tf.C" // CHECK-NEXT: tf_device.return %[[B_OUT]] - // CHECK: device - // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: "tf_device.cluster" // CHECK: "tf.D"(%[[LAUNCH_OUT]]) @@ -83,8 +161,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[A_OUT:.*]] = "tf.A" // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) // CHECK-NEXT: tf_device.return %[[D_OUT]] - // CHECK: device - // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: "tf_device.cluster" // CHECK: "tf.B" @@ -105,15 +182,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @test_replicated_head_outside_compilation func @test_replicated_head_outside_compilation(%arg0 : tensor) -> () { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() // CHECK: %[[A_OUT:.*]] = "tf.A" // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) // CHECK-NEXT: tf_device.return %[[D_OUT]] - // CHECK: device - // CHECK-SAME: "TPU_REPLICATED_HOST" + // CHECK: device = 
"TPU_REPLICATED_HOST" // // CHECK: "tf_device.cluster" // CHECK: "tf.B" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 02d0c3e849b..5a059ce507c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -14,9 +14,10 @@ limitations under the License. ==============================================================================*/ #include +#include #include -#include "llvm/ADT/Optional.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" @@ -26,6 +27,7 @@ limitations under the License. #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project @@ -51,181 +53,84 @@ bool HasOutsideCompilationAttribute(Operation* op) { return op->getAttrOfType(kXlaOutsideCompilationAttr) != nullptr; } -// Returns whether all operands of `op` are from values inside the -// `input_value_set`. -bool OpContainsOperandsFromSet(Operation* op, - const llvm::SetVector& input_value_set) { - for (auto operand : op->getOperands()) - if (input_value_set.count(operand) == 0) return false; +Operation* GetOpOfValue(Value value) { + if (auto block_arg = value.dyn_cast()) + return block_arg.getOwner()->getParentOp(); - return true; + return value.getDefiningOp(); } -void RecordOutsideCompiledOpsAndUsages( - Operation* op, llvm::SmallSetVector* outside_compiled_ops, - llvm::SetVector* outside_compiled_op_usages) { - if (HasOutsideCompilationAttribute(op) && - OpContainsOperandsFromSet(op, *outside_compiled_op_usages)) { - outside_compiled_ops->insert(op); - outside_compiled_op_usages->insert(op->getResults().begin(), - op->getResults().end()); - } -} +// Returns a set of ops that are outside compiled and can be extracted to before +// the TPU computation. These ops are either connected to the inputs of the TPU +// computation or other ops that can be extracted, and have no dependencies with +// other ops in the TPU computation that cannot be extracted. +llvm::SmallVector FindOutsideCompiledOpsAtHead( + tf_device::ClusterOp cluster) { + llvm::SmallSetVector head_outside_compiled_ops; -// Traverses the MLIR graph and returns a set of ops that -// are connected to inputs of TPU computation and outside compiled. -void ExtractOutsideCompiledOpsConnectedToHead( - Value input_value, llvm::SetVector* values_used_in_host_cluster, - llvm::SmallSetVector* outside_compiled_ops) { - llvm::SmallSetVector parent_outside_compiled_ops_at_head; - for (auto& usage : input_value.getUses()) { - auto head_operation = usage.getOwner(); - RecordOutsideCompiledOpsAndUsages(head_operation, - &parent_outside_compiled_ops_at_head, - values_used_in_host_cluster); - } + auto cluster_ops = cluster.GetBody().without_terminator(); + for (Operation& cluster_op : cluster_ops) { + if (!HasOutsideCompilationAttribute(&cluster_op)) continue; + // An outside compiled op can be extracted if its operands are not from + // other ops in the cluster that cannot be extracted. 
+ auto result = cluster_op.walk([&](Operation* op) { + for (Value operand : op->getOperands()) { + Operation* operand_op = GetOpOfValue(operand); + if (operand_op->isProperAncestor(cluster) || + cluster_op.isAncestor(operand_op) || + head_outside_compiled_ops.count(operand_op)) + continue; - // Traverse the graph and find all outside compiled ops connected from - // the `input_value`. - while (!parent_outside_compiled_ops_at_head.empty()) { - llvm::SmallSetVector connected_outside_compiled_ops; - for (auto head_outside_compiled_op : parent_outside_compiled_ops_at_head) { - auto op_results = head_outside_compiled_op->getOpResults(); - for (auto op_result : op_results) { - for (auto& use : op_result.getUses()) { - auto connected_op = use.getOwner(); - RecordOutsideCompiledOpsAndUsages(connected_op, - &connected_outside_compiled_ops, - values_used_in_host_cluster); - } + return WalkResult::interrupt(); } - } + return WalkResult::advance(); + }); - outside_compiled_ops->insert(parent_outside_compiled_ops_at_head.begin(), - parent_outside_compiled_ops_at_head.end()); - std::swap(parent_outside_compiled_ops_at_head, - connected_outside_compiled_ops); - } -} - -// TODO(hongjunchoi): Also handle ops without inputs that are outside -// compiled. -// -// Returns set of ops that are outside compiled and are directly connected -// to inputs to the TPU computation. -llvm::SmallSetVector IdentifyOutsideCompiledOpsAtHead( - tf_device::ClusterOp tpu_cluster) { - llvm::SmallSetVector outside_compiled_at_head_ops; - llvm::SetVector values_used_in_cluster; - auto& cluster_region = tpu_cluster.body(); - getUsedValuesDefinedAbove(cluster_region, cluster_region, - values_used_in_cluster); - - auto input_value_list = llvm::to_vector<8>(values_used_in_cluster); - for (auto input_value : input_value_list) - ExtractOutsideCompiledOpsConnectedToHead( - input_value, &values_used_in_cluster, &outside_compiled_at_head_ops); - return outside_compiled_at_head_ops; -} - -// Returns output values of extracted outside compiled cluster at head that -// are used by the TPU computation. -llvm::SmallVector GetHeadExtractedClusterOutputs( - const llvm::SmallSetVector& head_outside_compiled_ops) { - llvm::SmallVector outputs; - outputs.reserve(head_outside_compiled_ops.size()); - - for (auto op : head_outside_compiled_ops) { - for (Operation* user : op->getUsers()) { - if (!head_outside_compiled_ops.count(user)) { - outputs.append(op->result_begin(), op->result_end()); - break; - } - } + if (!result.wasInterrupted()) head_outside_compiled_ops.insert(&cluster_op); } - return outputs; + return head_outside_compiled_ops.takeVector(); } -// Creates new tf_device.launch op with outside compiled ops extracted -// from the head of TPU computation. -llvm::Optional IsolateHeadExtractedOpsToLaunchOp( - OpBuilder* builder, tf_device::ClusterOp cluster, - const llvm::SmallSetVector& head_outside_compiled_ops) { - if (head_outside_compiled_ops.empty()) - return llvm::Optional(); - - // Create tf_device.launch op to separate all extracted outside compiled ops - // before the tf_device.cluster. 
- auto output_values = - GetHeadExtractedClusterOutputs(head_outside_compiled_ops); - - llvm::SmallVector output_return_types; - output_return_types.reserve(output_values.size()); - for (auto output : output_values) - output_return_types.emplace_back(output.getType()); - - builder->setInsertionPoint(cluster); - auto host_launch_op = builder->create( - cluster.getLoc(), builder->getStringAttr(""), output_return_types); - - // Replace all usages of outside compiled ops that are used in TPU - // computation with the results of the above created launch op. - for (auto output_and_index : llvm::enumerate(output_values)) { - auto output_index = output_and_index.index(); - auto output = output_and_index.value(); - for (auto& use : output.getUses()) { - if (!head_outside_compiled_ops.count(use.getOwner())) - use.set(host_launch_op.getResult(output_index)); - } +// Parses TPU compilation and execution devices from a TPU cluster and returns +// the host device for the head and tail computations. If the TPU computation is +// replicated, kTPUReplicatedHost is returned instead. +LogicalResult GetHostDeviceForHeadTailComputation( + mlir::TF::RuntimeDevices devices, tf_device::ClusterOp cluster, + std::string* host_device) { + auto replicate = cluster.getParentOfType(); + if (replicate) { + *host_device = tensorflow::kTPUReplicatedHost; + return success(); } - // Create terminator op for the newly created launch op. - host_launch_op.body().push_back(new Block()); - builder->setInsertionPointToEnd(&host_launch_op.GetBody()); - auto terminator = builder->create( - host_launch_op.getLoc(), output_values); - - // Move all outside compile ops from cluster op to launch op. - for (auto outside_compiled_op : head_outside_compiled_ops) - outside_compiled_op->moveBefore(terminator); - - return host_launch_op; -} - -// Parses TPU compilation and execution device form tpu cluster and assigns -// host device to `host_launch` device attribute. 
-LogicalResult SetCompilationDeviceToHostLaunch( - OpBuilder* builder, mlir::TF::RuntimeDevices devices, - tf_device::ClusterOp tpu_cluster, tf_device::LaunchOp host_launch) { - auto num_cores_per_replica_attr = tpu_cluster.getAttrOfType( - tensorflow::kNumCoresPerReplicaAttr); + auto num_cores_per_replica_attr = + cluster.getAttrOfType(tensorflow::kNumCoresPerReplicaAttr); if (!num_cores_per_replica_attr) - return tpu_cluster.emitOpError( + return cluster.emitOpError( "cluster op missing `num_cores_per_replica` attribute"); if (num_cores_per_replica_attr.getInt() != 1) - return tpu_cluster.emitOpError( + return cluster.emitOpError( "outside compilation is not supported with model parallelism."); auto topology_attr = - tpu_cluster.getAttrOfType(tensorflow::kTopologyAttr); + cluster.getAttrOfType(tensorflow::kTopologyAttr); if (!topology_attr) - return tpu_cluster.emitOpError("cluster op missing `topology` attribute"); + return cluster.emitOpError("cluster op missing `topology` attribute"); - auto device_assignment_attr = tpu_cluster.getAttrOfType( - tensorflow::kDeviceAssignmentAttr); + auto device_assignment_attr = + cluster.getAttrOfType(tensorflow::kDeviceAssignmentAttr); if (!device_assignment_attr) - return tpu_cluster.emitOpError( - llvm::formatv("requires attribute '{0}'", - tensorflow::kDeviceAssignmentAttr) - .str()); + return cluster.emitOpError(llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); auto status_or_device_coodinates = tensorflow::GetDeviceCoordinates(device_assignment_attr); if (!status_or_device_coodinates.ok()) - return tpu_cluster.emitError() + return cluster.emitError() << "error in fetching tpu device coordinates: " << status_or_device_coodinates.status().error_message(); @@ -236,37 +141,96 @@ LogicalResult SetCompilationDeviceToHostLaunch( /*num_cores_per_replica=*/1, topology_attr.getValue(), status_or_device_coodinates.ConsumeValueOrDie()); if (!status_or_tpu_device_assignment.ok()) - return tpu_cluster.emitError() + return cluster.emitError() << "error in fetching TPU compilation/execution devices: " << status_or_tpu_device_assignment.status().error_message(); auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); - host_launch.deviceAttr( - builder->getStringAttr(tpu_device_assignment.tpu_devices[0][0].host)); + *host_device = tpu_device_assignment.tpu_devices[0][0].host; return success(); } -// Assigns host device attribute to host launch op or enclosing -// tf_device.replicate op if TPU computation is replicated. -LogicalResult HandleHostLaunchDeviceAssignment( - OpBuilder* builder, mlir::TF::RuntimeDevices devices, - tf_device::ClusterOp tpu_cluster, tf_device::LaunchOp host_launch) { - auto parent_replicate_op = - llvm::dyn_cast_or_null(host_launch.getParentOp()); - // If computation is replicated, then add TPU_REPLICATED_HOST device alias - // to the host launch op. This device alias would later be a reference to - // host device string in the device map of tf_device.replicate op - // during tpu_rewrite pass. - if (parent_replicate_op) { - host_launch.deviceAttr( - builder->getStringAttr(tensorflow::kTPUReplicatedHost)); - } else { - if (failed(SetCompilationDeviceToHostLaunch(builder, devices, tpu_cluster, - host_launch))) - return failure(); +// Moves head outside compiled ops into its own `tf_device.LaunchOp` +// computation. 
+tf_device::LaunchOp CreateHeadComputation( + OpBuilder* builder, tf_device::ClusterOp cluster, + llvm::ArrayRef head_outside_compiled_ops, + llvm::StringRef host_device) { + Block* launch_block = new Block; + for (Operation* head_outside_compiled_op : head_outside_compiled_ops) + head_outside_compiled_op->moveBefore(launch_block, launch_block->end()); + + // Find results of ops in head computation that needs to returned. + llvm::SmallVector launch_results; + llvm::SmallVector launch_result_types; + for (Operation& head_outside_compiled_op : *launch_block) { + for (Value result : head_outside_compiled_op.getResults()) { + bool has_uses_in_cluster = false; + for (Operation* user : result.getUsers()) { + if (user->getParentRegion() && + cluster.body().isAncestor(user->getParentRegion())) { + has_uses_in_cluster = true; + break; + } + } + if (has_uses_in_cluster) { + launch_results.push_back(result); + launch_result_types.push_back(result.getType()); + } + } } - return success(); + builder->setInsertionPoint(cluster); + auto launch = builder->create( + cluster.getLoc(), builder->getStringAttr(host_device), + launch_result_types); + launch.body().push_back(launch_block); + + builder->setInsertionPointToEnd(&launch.GetBody()); + builder->create(cluster.getLoc(), launch_results); + + for (auto result : llvm::zip(launch_results, launch.getResults())) + replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), + cluster.body()); + + return launch; +} + +// Removes aliased outputs in cluster from head computation after head +// computation has been extracted. +void RemoveHeadComputationAliasedOutputs(OpBuilder* builder, + tf_device::LaunchOp head_computation, + tf_device::ClusterOp cluster) { + llvm::SmallVector used_old_cluster_results; + llvm::SmallVector new_cluster_results; + llvm::SmallVector new_cluster_result_types; + Operation* cluster_terminator = cluster.GetBody().getTerminator(); + for (auto result : + llvm::zip(cluster_terminator->getOperands(), cluster.getResults())) { + Value cluster_terminator_operand = std::get<0>(result); + if (cluster_terminator_operand.getDefiningOp() == head_computation) { + std::get<1>(result).replaceAllUsesWith(cluster_terminator_operand); + } else { + new_cluster_results.push_back(cluster_terminator_operand); + new_cluster_result_types.push_back(cluster_terminator_operand.getType()); + used_old_cluster_results.push_back(std::get<1>(result)); + } + } + + if (new_cluster_results.size() == cluster.getNumResults()) return; + + builder->setInsertionPoint(cluster); + auto new_cluster = builder->create( + cluster.getLoc(), new_cluster_result_types, + /*operands=*/llvm::ArrayRef{}, cluster.getAttrs()); + new_cluster.body().takeBody(cluster.body()); + new_cluster.GetBody().getTerminator()->setOperands(new_cluster_results); + + for (auto result : + llvm::zip(used_old_cluster_results, new_cluster.getResults())) + std::get<0>(result).replaceAllUsesWith(std::get<1>(result)); + + cluster.erase(); } struct TPUExtractHeadTailOutsideCompilation @@ -283,22 +247,25 @@ void TPUExtractHeadTailOutsideCompilation::runOnOperation() { return signalPassFailure(); OpBuilder builder(&getContext()); - auto result = module.walk([&](tf_device::ClusterOp cluster) { - auto head_outside_compiled_ops = IdentifyOutsideCompiledOpsAtHead(cluster); - auto host_launch_op = IsolateHeadExtractedOpsToLaunchOp( - &builder, cluster, head_outside_compiled_ops); - if (host_launch_op) { - if (failed(HandleHostLaunchDeviceAssignment(&builder, devices, cluster, - *host_launch_op))) { - return 
WalkResult::interrupt(); - } - } + llvm::SmallVector clusters; + module.walk( + [&](tf_device::ClusterOp cluster) { clusters.push_back(cluster); }); - // TODO(b/155115766): Implement tail outside compiled op extraction. - return WalkResult::advance(); - }); + for (tf_device::ClusterOp cluster : clusters) { + llvm::SmallVector head_outside_compiled_ops = + FindOutsideCompiledOpsAtHead(cluster); + if (head_outside_compiled_ops.empty()) continue; + std::string host_device; + if (failed(GetHostDeviceForHeadTailComputation(devices, cluster, + &host_device))) + return signalPassFailure(); - if (result.wasInterrupted()) signalPassFailure(); + tf_device::LaunchOp head_computation = CreateHeadComputation( + &builder, cluster, head_outside_compiled_ops, host_device); + RemoveHeadComputationAliasedOutputs(&builder, head_computation, cluster); + + // TODO(b/157160906): Implement tail outside compiled op extraction. + } } } // anonymous namespace From e312350702aa8ab87b6fac5dec1a285d3da6a7b0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 15:51:02 -0700 Subject: [PATCH 1007/1533] Enable 1st order gradient tests for tf.linalg.svd in eager mode. PiperOrigin-RevId: 312756858 Change-Id: I20d73e8972014b96bc90952949820390ae77e08d --- tensorflow/python/kernel_tests/svd_op_test.py | 55 +++++++++---------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py index eae42f55a3f..cad131dda74 100644 --- a/tensorflow/python/kernel_tests/svd_op_test.py +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -29,6 +29,7 @@ from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import gradient_checker_v2 from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops @@ -225,45 +226,41 @@ def _NormalizingSvd(tf_a, full_matrices_): def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_): - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def Test(self): - np.random.seed(42) - a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_) - if dtype_ in [np.complex64, np.complex128]: - a += 1j * np.random.uniform( - low=-1.0, high=1.0, size=shape_).astype(dtype_) + + def RandomInput(): + np.random.seed(42) + a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_) + if dtype_ in [np.complex64, np.complex128]: + a += 1j * np.random.uniform( + low=-1.0, high=1.0, size=shape_).astype(dtype_) + return a + # Optimal stepsize for central difference is O(epsilon^{1/3}). # See Equation (21) in: # http://www.karenkopecky.net/Teaching/eco613614/Notes_NumericalDifferentiation.pdf # TODO(rmlarsen): Move step size control to gradient checker. 
epsilon = np.finfo(dtype_).eps - delta = 0.1 * epsilon**(1.0 / 3.0) + delta = 0.25 * epsilon**(1.0 / 3.0) if dtype_ in [np.float32, np.complex64]: tol = 3e-2 else: tol = 1e-6 - with self.session(use_gpu=True): - tf_a = constant_op.constant(a) - if compute_uv_: - tf_s, tf_u, tf_v = _NormalizingSvd(tf_a, full_matrices_) - outputs = [tf_s, tf_u, tf_v] - else: - tf_s = linalg_ops.svd(tf_a, compute_uv=False) - outputs = [tf_s] - for b in outputs: - x_init = np.random.uniform( - low=-1.0, high=1.0, size=shape_).astype(dtype_) - if dtype_ in [np.complex64, np.complex128]: - x_init += 1j * np.random.uniform( - low=-1.0, high=1.0, size=shape_).astype(dtype_) - theoretical, numerical = gradient_checker.compute_gradient( - tf_a, - tf_a.get_shape().as_list(), - b, - b.get_shape().as_list(), - x_init_value=x_init, - delta=delta) - self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) + if compute_uv_: + funcs = [ + lambda a: _NormalizingSvd(a, full_matrices_)[0], + lambda a: _NormalizingSvd(a, full_matrices_)[1], + lambda a: _NormalizingSvd(a, full_matrices_)[2] + ] + else: + funcs = [lambda a: linalg_ops.svd(a, compute_uv=False)] + + for f in funcs: + theoretical, numerical = gradient_checker_v2.compute_gradient( + f, [RandomInput()], delta=delta) + self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) + return Test From 7221ad6edae6dd32c779a5e073e08f8a7fec8214 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 15:54:05 -0700 Subject: [PATCH 1008/1533] Enable tests for tf.linalg.matrix_logarithm in eager mode. PiperOrigin-RevId: 312757336 Change-Id: I0323132c43830f37bbb2480be700d6c2bc65f175 --- .../kernel_tests/matrix_logarithm_op_test.py | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py index fa466d975f8..8cc230d2806 100644 --- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py @@ -23,12 +23,13 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gen_linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.ops.linalg import linalg_impl from tensorflow.python.platform import benchmark @@ -57,7 +58,7 @@ class LogarithmOpTest(test.TestCase): matrix_batch = np.tile(matrix_batch, [2, 3, 1, 1]) return matrix_batch - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNonsymmetric(self): # 2x2 matrices matrix1 = np.array([[1., 2.], [3., 4.]]) @@ -71,7 +72,7 @@ class LogarithmOpTest(test.TestCase): # Complex batch self._verifyLogarithmComplex(self._makeBatch(matrix1, matrix2)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testSymmetricPositiveDefinite(self): # 2x2 matrices matrix1 = np.array([[2., 1.], [1., 2.]]) @@ -85,27 +86,27 @@ class LogarithmOpTest(test.TestCase): # Complex batch 
self._verifyLogarithmComplex(self._makeBatch(matrix1, matrix2)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNonSquareMatrix(self): # When the logarithm of a non-square matrix is attempted we should return # an error - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): gen_linalg_ops.matrix_logarithm( np.array([[1., 2., 3.], [3., 4., 5.]], dtype=np.complex64)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The input to the logarithm should be at least a 2-dimensional tensor. tensor3 = constant_op.constant([1., 2.], dtype=dtypes.complex64) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): gen_linalg_ops.matrix_logarithm(tensor3) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): self._verifyLogarithmComplex(np.empty([0, 2, 2], dtype=np.complex64)) self._verifyLogarithmComplex(np.empty([2, 0, 0], dtype=np.complex64)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testRandomSmallAndLargeComplex64(self): np.random.seed(42) for batch_dims in [(), (1,), (3,), (2, 2)]: @@ -116,7 +117,7 @@ class LogarithmOpTest(test.TestCase): size=np.prod(shape)).reshape(shape).astype(np.complex64) self._verifyLogarithmComplex(matrix) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testRandomSmallAndLargeComplex128(self): np.random.seed(42) for batch_dims in [(), (1,), (3,), (2, 2)]: @@ -127,17 +128,21 @@ class LogarithmOpTest(test.TestCase): size=np.prod(shape)).reshape(shape).astype(np.complex128) self._verifyLogarithmComplex(matrix) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - with self.session(use_gpu=True) as sess: - matrix1 = math_ops.cast( - random_ops.random_normal([5, 5], seed=42), dtypes.complex64) - matrix2 = math_ops.cast( - random_ops.random_normal([5, 5], seed=42), dtypes.complex64) - logm1 = gen_linalg_ops.matrix_logarithm(matrix1) - logm2 = gen_linalg_ops.matrix_logarithm(matrix2) - logm = self.evaluate([logm1, logm2]) - self.assertAllEqual(logm[0], logm[1]) + matrix_shape = [5, 5] + seed = [42, 24] + matrix1 = math_ops.cast( + stateless_random_ops.stateless_random_normal(matrix_shape, seed=seed), + dtypes.complex64) + matrix2 = math_ops.cast( + stateless_random_ops.stateless_random_normal(matrix_shape, seed=seed), + dtypes.complex64) + self.assertAllEqual(matrix1, matrix2) + logm1 = gen_linalg_ops.matrix_logarithm(matrix1) + logm2 = gen_linalg_ops.matrix_logarithm(matrix2) + logm = self.evaluate([logm1, logm2]) + self.assertAllEqual(logm[0], logm[1]) class MatrixLogarithmBenchmark(test.Benchmark): From ed39014cf6c7e0fcd7a08ce445a52ec27949c251 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 21 May 2020 15:56:56 -0700 Subject: [PATCH 1009/1533] Don't disable all TPU tests just the ones that fail. * Skips a test that segfaults sometimes when run on TPUs. * Skips a test on TPU that fails with a different error message. 
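As an illustration of the pattern (not code from the real test suite), a
conditional skip inside the test body replaces a target-wide "notpu" tag, so
only the failing TPU/eager combination is excluded while every other
configuration keeps running. Class, attribute, and message names below are
made up for the sketch:

import unittest

class ExampleDistributeTest(unittest.TestCase):
  # Stand-ins for the configuration the parameterized test framework provides.
  running_on_tpu = True
  eager_mode = True

  def test_unknown_cardinality(self):
    if self.running_on_tpu and self.eager_mode:
      self.skipTest('occasionally segfaults on TPU in eager mode')
    self.assertEqual(1 + 1, 2)  # placeholder for the real test body

if __name__ == '__main__':
  unittest.main()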
PiperOrigin-RevId: 312757787 Change-Id: I662c28c55a9f3f907c7f6a8f217506bb17c3a8c7 --- tensorflow/python/keras/distribute/BUILD | 1 - .../python/keras/distribute/distribute_strategy_test.py | 8 ++++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index 50ed6086195..b7fe3b5bda6 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -128,7 +128,6 @@ distribute_py_test( "multi_and_single_gpu", "no_rocm", # times out on ROCm "no_windows_gpu", - "notpu", # TODO(b/155867206) flaky segfault "notsan", ], tpu_tags = [ diff --git a/tensorflow/python/keras/distribute/distribute_strategy_test.py b/tensorflow/python/keras/distribute/distribute_strategy_test.py index f6a83c499fe..eac1e2feb8b 100644 --- a/tensorflow/python/keras/distribute/distribute_strategy_test.py +++ b/tensorflow/python/keras/distribute/distribute_strategy_test.py @@ -575,8 +575,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, @combinations.generate( combinations.combine( - distribution=[strategy_combinations.one_device_strategy] + - tpu_strategies, + distribution=[strategy_combinations.one_device_strategy], mode=['graph', 'eager'])) def test_optimizer_in_cross_replica_context_raises_error(self, distribution): @@ -1070,6 +1069,11 @@ class TestDistributionStrategyWithDatasets(test.TestCase, @combinations.generate(all_strategy_combinations()) def test_on_dataset_with_unknown_cardinality_without_steps( self, distribution, mode): + # TODO(b/155867206): Investigate why this test occasionally segfaults on TPU + # in eager mode. + if mode == 'eager' and isinstance( + distribution, (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): + self.skipTest('caused segfault with TPU in eager mode.') if mode == 'graph' and isinstance( distribution, (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): From 19ed4a9ccfca2565f130df523e630fedec68728d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 16:04:08 -0700 Subject: [PATCH 1010/1533] Fix issues where index_lookup was improperly handling hard vocab caps. Add tests. 
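For context, the arithmetic behind the fix as a standalone snippet (variable
names mirror the layer arguments, but this is not the layer's code): with a
hard cap, the slots reserved for the mask and OOV tokens must be subtracted
before the combiner decides how many learned terms to keep.

max_tokens = 5
num_oov_indices = 1
num_mask_tokens = 1  # 0 if no mask token is configured
vocab_size = max_tokens - (num_oov_indices + num_mask_tokens)
print(vocab_size)  # 3 -> adapt() keeps only the 3 most frequent terms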
PiperOrigin-RevId: 312759072 Change-Id: Id24687eee01a6898473e128b8c2cfeb13be89547 --- .../layers/preprocessing/index_lookup.py | 9 ++- .../layers/preprocessing/index_lookup_test.py | 75 ++++++++++++++++++- .../preprocessing/text_vectorization_test.py | 34 +++++++++ 3 files changed, 115 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py index c0d0d266ad3..7d11feae341 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py @@ -118,9 +118,14 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): else: self._oov_value = -1 + if max_tokens is not None: + num_mask_tokens = (0 if mask_token is None else 1) + vocab_size = max_tokens - (num_oov_indices + num_mask_tokens) + else: + vocab_size = None + super(IndexLookup, self).__init__( - combiner=_IndexLookupCombiner(self.max_tokens, self.mask_token), - **kwargs) + combiner=_IndexLookupCombiner(vocab_size, self.mask_token), **kwargs) self._output_dtype = dtypes.int64 diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py index 73189d9b9f1..a61cef6121f 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py @@ -86,7 +86,8 @@ def _get_end_to_end_test_cases(): "vocab_data": np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), - "input_data": np.array([[1], [2], [3], [4], [4], [3], [1], [5]]), + "input_data": + np.array([[1], [2], [3], [4], [4], [3], [1], [5]]), "kwargs": { "max_tokens": None, "num_oov_indices": 1, @@ -125,6 +126,78 @@ def _get_end_to_end_test_cases(): "input_dtype": dtypes.int64 }, + { + "testcase_name": + "test_strings_hard_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed by + # 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. + "vocab_data": + np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], + ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), + "input_data": + np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], + ["and"], ["earth"], ["michigan"]]), + "kwargs": { + "max_tokens": 5, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "dtype": dtypes.string, + }, + "expected_output": [[2], [3], [4], [1], [1], [4], [2], [1]], + "input_dtype": + dtypes.string + }, + { + "testcase_name": + "test_inverse_strings_hard_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed by + # 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. 
+ "vocab_data": + np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], + ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), + "input_data": + np.array([[1], [2], [3], [4], [4], [3], [1], [5]]), + "kwargs": { + "max_tokens": 5, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "dtype": dtypes.string, + "invert": True + }, + "expected_output": + np.array([[b"earth"], [b"wind"], [b"and"], [b"[OOV]"], [b"[OOV]"], + [b"and"], [b"earth"], [b"[OOV]"]]), + "input_dtype": + dtypes.int64 + }, + { + "testcase_name": + "test_ints_hard_vocab_cap", + # Create an array where 1138 is the most frequent term, followed by + # 1729, then 725, then 42. This ensures that the vocab accumulator + # is sorting by frequency. + "vocab_data": + np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729], + [1729], [725], [725]], + dtype=np.int64), + "input_data": + np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]], + dtype=np.int64), + "kwargs": { + "max_tokens": 5, + "num_oov_indices": 1, + "mask_token": 0, + "oov_token": -1, + "dtype": dtypes.int64, + }, + "expected_output": [[2], [3], [4], [1], [1], [4], [2], [1]], + "input_dtype": + dtypes.int64 + }, ) crossed_test_cases = [] diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index affa392e42b..5d909498d8a 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -1510,5 +1510,39 @@ class TextVectorizationSavingTest( self.assertAllEqual(expected_output, new_output_dataset) +@keras_parameterized.run_all_keras_modes +class TextVectorizationE2ETest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_keras_vocab_trimming_example(self): + vocab_data = np.array([ + "earth", "earth", "earth", "earth", "wind", "wind", "wind", "and", + "and", "fire" + ]) + input_array = np.array([["earth", "wind", "and", "earth"], + ["ohio", "and", "earth", "michigan"]]) + + # pyformat: disable + expected_output = [[1, 2, 1], + [3, 1, 0]] + # pyformat: enable + max_tokens = 3 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()( + max_tokens=max_tokens, + standardize=None, + split=None, + output_mode=text_vectorization.COUNT, + pad_to_max_tokens=True) + int_data = layer(input_data) + layer.adapt(vocab_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + model = keras.Model(input_data, int_data) + output = model.predict(input_array) + self.assertAllEqual(expected_output, output) + + if __name__ == "__main__": test.main() From 09243a984d47a01a1c9a1a75edcc37be0ec3b31e Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 21 May 2020 16:37:52 -0700 Subject: [PATCH 1011/1533] Add bfloat16 support for SparseSegmentMean*/SparseSegmentSqrtN* PiperOrigin-RevId: 312764313 Change-Id: I1e5de7e48f6e42a5c22012954b59ba1fea304441 --- .../core/kernels/segment_reduction_ops_impl.h | 267 ++++++++++-------- .../kernels/segment_reduction_ops_impl_5.cc | 2 + tensorflow/core/ops/math_ops.cc | 8 +- 3 files changed, 154 insertions(+), 123 deletions(-) diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl.h b/tensorflow/core/kernels/segment_reduction_ops_impl.h index 8954dcd4681..ccd775b7ef2 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_impl.h +++ b/tensorflow/core/kernels/segment_reduction_ops_impl.h @@ -508,6 +508,12 @@ class SparseSegmentReductionOpBase : public OpKernel { errors::InvalidArgument("segment ids must be >= 0")); auto output_flat = output->flat_outer_dims(); + Tensor temp; + if constexpr (std::is_same::value) { + temp = tensorflow::Tensor(DT_FLOAT, output_shape); + } + auto temp_flat = temp.flat_outer_dims(); + int64 start = 0, end = 1; // Index from which the output is not initialized. SegmentId uninitialized_index = 0; @@ -546,8 +552,9 @@ class SparseSegmentReductionOpBase : public OpKernel { } auto out = output_flat.template chip<0>(out_index); + auto temp = temp_flat.template chip<0>(out_index); const int bad_offset = - Reduce(input_flat, indices_vec, start, end - start, out); + Reduce(input_flat, indices_vec, start, end - start, out, temp); OP_REQUIRES(context, bad_offset < 0, errors::InvalidArgument( "Bad: indices[", start + bad_offset, @@ -572,130 +579,152 @@ class SparseSegmentReductionOpBase : public OpKernel { } private: - int64 Reduce(const typename TTypes::ConstMatrix& input_flat, - const typename TTypes::ConstVec& indices_vec, int64 start, - int64 num, - Eigen::TensorChippingOp<0, typename TTypes::Matrix> out) { + // TODO(jaideepsi): re-write without macros, simplify Reduce b/157240265 + int64 Reduce( + const typename TTypes::ConstMatrix& input_flat, + const typename TTypes::ConstVec& indices_vec, int64 start, + int64 num, Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> temp) { +#define REDUCE \ + if (num == 1) { \ + INDEX(0, 0); \ + OUT = L(0); \ + } else { \ + int64 r = num & 7; \ + DT m(1); \ + if (is_mean_ && (num < 10)) { \ + m = DT(num); \ + } \ + if (is_sqrtn_ && (num < 10)) { \ + m = DT(sqrt(num)); \ + } \ + switch (r) { \ + case 2: { \ + INDEX(0, 0); \ + INDEX(1, 1); \ + OUT = (L(0) + L(1)) / m; \ + break; \ + } \ + case 3: { \ + INDEX(0, 0); \ + INDEX(1, 1); \ + INDEX(2, 2); \ + OUT = (L(0) + L(1) + L(2)) / m; \ + break; \ + } \ + case 4: { \ + INDEX(0, 0); \ + INDEX(1, 1); \ + INDEX(2, 2); \ + INDEX(3, 3); \ + OUT = (L(0) + L(1) + L(2) + L(3)) / m; \ + break; \ + } \ + case 5: { \ + INDEX(0, 0); \ + INDEX(1, 1); \ + INDEX(2, 2); \ + INDEX(3, 3); \ + INDEX(4, 4); \ + OUT = (L(0) + L(1) + L(2) + L(3) + L(4)) / m; \ + break; \ + } \ + case 6: { \ + INDEX(0, 0); \ + INDEX(1, 1); \ + INDEX(2, 2); \ + INDEX(3, 3); \ + INDEX(4, 4); \ + INDEX(5, 5); \ + OUT = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) / m; \ + break; \ + } \ + case 7: { \ + INDEX(0, 0); \ + INDEX(1, 1); \ + INDEX(2, 2); \ + INDEX(3, 3); \ + INDEX(4, 4); \ + INDEX(5, 5); \ + INDEX(6, 6); \ + OUT = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) / m; \ + break; \ + } \ + case 0: { \ + INDEX(0, 0); \ + INDEX(1, 1); \ + INDEX(2, 2); \ + INDEX(3, 3); \ + INDEX(4, 4); \ + INDEX(5, 5); \ + INDEX(6, 6); \ + INDEX(7, 7); \ 
+ OUT = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7)) / m; \ + r = 8; \ + break; \ + } \ + case 1: { \ + INDEX(0, 0); \ + INDEX(1, 1); \ + INDEX(2, 2); \ + INDEX(3, 3); \ + INDEX(4, 4); \ + INDEX(5, 5); \ + INDEX(6, 6); \ + INDEX(7, 7); \ + INDEX(8, 8); \ + OUT = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) / \ + m; \ + r = 9; \ + break; \ + } \ + } \ + for (; r < num; r += 8) { \ + INDEX(0, r); \ + INDEX(1, r + 1); \ + INDEX(2, r + 2); \ + INDEX(3, r + 3); \ + INDEX(4, r + 4); \ + INDEX(5, r + 5); \ + INDEX(6, r + 6); \ + INDEX(7, r + 7); \ + OUT += L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7); \ + } \ + if (is_mean_ && num >= 10) { \ + OUT = OUT / static_cast

(num); \ + } \ + if (is_sqrtn_ && num >= 10) { \ + OUT = OUT / static_cast
(sqrt(num)); \ + } \ + } + #define INDEX(n, i) \ const auto index##n = indices_vec(start + (i)); \ if (!FastBoundsCheck(index##n, input_flat.dimension(0))) return (i); -#define L(n) input_flat.template chip<0>(index##n) + if constexpr (std::is_same::value) { +#define L(n) input_flat.template chip<0>(index##n).template cast() +#define OUT temp +#define DT float - if (num == 1) { - INDEX(0, 0); - out = L(0); - } else { - int64 r = num % 8; - T m(1); - if (is_mean_ && (num < 10)) { - m = T(num); - } - if (is_sqrtn_ && (num < 10)) { - m = T(sqrt(num)); - } - switch (r) { - case 2: { - INDEX(0, 0); - INDEX(1, 1); - out = (L(0) + L(1)) / m; - break; - } - case 3: { - INDEX(0, 0); - INDEX(1, 1); - INDEX(2, 2); - out = (L(0) + L(1) + L(2)) / m; - break; - } - case 4: { - INDEX(0, 0); - INDEX(1, 1); - INDEX(2, 2); - INDEX(3, 3); - out = (L(0) + L(1) + L(2) + L(3)) / m; - break; - } - case 5: { - INDEX(0, 0); - INDEX(1, 1); - INDEX(2, 2); - INDEX(3, 3); - INDEX(4, 4); - out = (L(0) + L(1) + L(2) + L(3) + L(4)) / m; - break; - } - case 6: { - INDEX(0, 0); - INDEX(1, 1); - INDEX(2, 2); - INDEX(3, 3); - INDEX(4, 4); - INDEX(5, 5); - out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) / m; - break; - } - case 7: { - INDEX(0, 0); - INDEX(1, 1); - INDEX(2, 2); - INDEX(3, 3); - INDEX(4, 4); - INDEX(5, 5); - INDEX(6, 6); - out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) / m; - break; - } - case 0: { - INDEX(0, 0); - INDEX(1, 1); - INDEX(2, 2); - INDEX(3, 3); - INDEX(4, 4); - INDEX(5, 5); - INDEX(6, 6); - INDEX(7, 7); - out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7)) / m; - r = 8; - break; - } - case 1: { - INDEX(0, 0); - INDEX(1, 1); - INDEX(2, 2); - INDEX(3, 3); - INDEX(4, 4); - INDEX(5, 5); - INDEX(6, 6); - INDEX(7, 7); - INDEX(8, 8); - out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) / - m; - r = 9; - break; - } - } - for (; r < num; r += 8) { - INDEX(0, r); - INDEX(1, r + 1); - INDEX(2, r + 2); - INDEX(3, r + 3); - INDEX(4, r + 4); - INDEX(5, r + 5); - INDEX(6, r + 6); - INDEX(7, r + 7); - out += L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7); - } - if (is_mean_ && num >= 10) { - out = out / static_cast(num); - } - if (is_sqrtn_ && num >= 10) { - out = out / static_cast(sqrt(num)); - } - } - - return -1; + REDUCE; + out = temp.template cast(); +#undef DT +#undef OUT #undef L + } else { +#define L(n) input_flat.template chip<0>(index##n) +#define OUT out +#define DT T + + REDUCE; + +#undef DT +#undef OUT +#undef L + } + return -1; +#undef REDUCE #undef INDEX } diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc b/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc index fee0f818c5e..03a448e52b3 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc @@ -64,6 +64,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE); segment_ids_type>); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(float); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(double); +REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(bfloat16); #undef REGISTER_CPU_SPARSE_KERNELS #define REGISTER_CPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \ @@ -85,6 +86,7 @@ REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(double); CPUDevice, type, index_type, segment_ids_type>); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(float); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(double); +REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(bfloat16); #undef REGISTER_CPU_SPARSE_KERNELS 
#define REGISTER_CPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \ diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index cbf03d7b045..5327995e0a4 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1337,7 +1337,7 @@ REGISTER_OP("SparseSegmentMean") .Input("indices: Tidx") .Input("segment_ids: Tsegmentids") .Output("output: T") - .Attr("T: {float, double}") + .Attr("T: {bfloat16, float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") .SetShapeFn(SparseSegmentReductionShapeFn); @@ -1348,7 +1348,7 @@ REGISTER_OP("SparseSegmentMeanWithNumSegments") .Input("segment_ids: Tsegmentids") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: {float, double}") + .Attr("T: {bfloat16, float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") @@ -1370,7 +1370,7 @@ REGISTER_OP("SparseSegmentSqrtN") .Input("indices: Tidx") .Input("segment_ids: Tsegmentids") .Output("output: T") - .Attr("T: {float, double}") + .Attr("T: {bfloat16, float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") .SetShapeFn(SparseSegmentReductionShapeFn); @@ -1381,7 +1381,7 @@ REGISTER_OP("SparseSegmentSqrtNWithNumSegments") .Input("segment_ids: Tsegmentids") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: {float, double}") + .Attr("T: {bfloat16, float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") From 50dc3262ea2f605df5774c65343fb4c0e4860951 Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Thu, 21 May 2020 16:56:44 -0700 Subject: [PATCH 1012/1533] Improve diagnostic when a mutable global tensor is found PiperOrigin-RevId: 312767251 Change-Id: I5392241d6b3a3c965b547d7fc44b7665b480d20b --- .../mlir/tensorflow/transforms/freeze_global_tensors.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc index d3b064f3efa..9d2a7e787ff 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc @@ -66,7 +66,9 @@ void FreezeGlobalTensorsPass::runOnOperation() { // previous optimize global tensors pass). If not, this pass has to fail // since it cannot perform one of its goals. if (global_tensor.is_mutable()) { - global_tensor.emitError() << "is not immutable"; + global_tensor.emitError() << "is not immutable, try running " + "tf-saved-model-optimize-global-tensors " + "to prove tensors are immutable"; return signalPassFailure(); } From 27d373215c554bdbccc654f14b1f05738ab381d1 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Thu, 21 May 2020 17:00:17 -0700 Subject: [PATCH 1013/1533] Reduce Layer.__call__ overhead by ~5%. Layer._call_arg_was_passed now has a shortcut for the common case. 
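A minimal standalone sketch of the shortcut (function and argument names are
illustrative, not the Keras API): the common case is a call like layer(inputs)
with no extra positional or keyword arguments, so the dict construction can be
skipped entirely.

def arg_was_passed(arg_name, call_fn_args, args, kwargs):
  if not args and not kwargs:  # fast path: nothing besides `inputs` was passed
    return False
  if arg_name in kwargs:
    return True
  return arg_name in dict(zip(call_fn_args, args))

print(arg_was_passed('training', ['training'], (), {}))       # False (fast path)
print(arg_was_passed('training', ['training'], (True,), {}))  # True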
PiperOrigin-RevId: 312767781 Change-Id: I97c926cf266e814f2d75c2beac63023faa715b7d --- tensorflow/python/keras/engine/base_layer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 0421772a75a..53d8cc5ab34 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -2308,15 +2308,17 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return input_masks def _call_arg_was_passed(self, arg_name, args, kwargs, inputs_in_args=False): + # Performance optimization: do no work in most common case. + if not args and not kwargs: + return False + if arg_name in kwargs: return True call_fn_args = self._call_fn_args if not inputs_in_args: # Ignore `inputs` arg. call_fn_args = call_fn_args[1:] - if arg_name in dict(zip(call_fn_args, args)): - return True - return False + return arg_name in dict(zip(call_fn_args, args)) def _get_call_arg_value(self, arg_name, args, kwargs, inputs_in_args=False): if arg_name in kwargs: From 8d021e40304100ca0ec6a26fd1528919144f72ed Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Thu, 21 May 2020 17:02:18 -0700 Subject: [PATCH 1014/1533] Test hello world example binary can run PiperOrigin-RevId: 312768125 Change-Id: I69e07f5ad797ae963e1802083f0fb50867a21713 --- .../lite/micro/examples/hello_world/BUILD | 6 ++++ .../hello_world/hello_world_binary_test.sh | 33 +++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100755 tensorflow/lite/micro/examples/hello_world/hello_world_binary_test.sh diff --git a/tensorflow/lite/micro/examples/hello_world/BUILD b/tensorflow/lite/micro/examples/hello_world/BUILD index 155aaafd98c..8da319f3095 100644 --- a/tensorflow/lite/micro/examples/hello_world/BUILD +++ b/tensorflow/lite/micro/examples/hello_world/BUILD @@ -91,3 +91,9 @@ cc_binary( "//tensorflow/lite/schema:schema_fbs", ], ) + +sh_test( + name = "hello_world_binary_test", + srcs = ["hello_world_binary_test.sh"], + data = [":hello_world"], +) diff --git a/tensorflow/lite/micro/examples/hello_world/hello_world_binary_test.sh b/tensorflow/lite/micro/examples/hello_world/hello_world_binary_test.sh new file mode 100755 index 00000000000..fe7683e5c4f --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/hello_world_binary_test.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Bash unit tests for the example binary. + +set -e + +OUTPUT_LOG_FILE=${TEST_TMPDIR}/output_log.txt + +# Needed for copybara compatibility. +SCRIPT_BASE_DIR=/org_"tensor"flow +${TEST_SRCDIR}${SCRIPT_BASE_DIR}/tensorflow/lite/micro/examples/hello_world/hello_world 2>&1 | head > ${OUTPUT_LOG_FILE} + +if ! 
grep -q 'x_value:.*y_value:' ${OUTPUT_LOG_FILE}; then + echo "ERROR: Expected logs not found in output '${OUTPUT_LOG_FILE}'" + exit 1 +fi + +echo +echo "SUCCESS: hello_world_binary_test PASSED" From b8b2dd9609489d237cc5299885744a5855cd3d26 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Thu, 21 May 2020 17:16:30 -0700 Subject: [PATCH 1015/1533] Temporarily blacklist keras model_coverage_lib_tests from TF2 testing This target appears to have never been executed under TF2 conditions, and the Keras-specific tests are now failing. PiperOrigin-RevId: 312770146 Change-Id: Icfc2ac6c7c73dda1db2c29a0a022d2ea8ea9c0da --- .../lite/testing/model_coverage/model_coverage_lib_test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py index 9236181f840..03a0004b2fc 100644 --- a/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py +++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py @@ -31,6 +31,7 @@ from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -178,18 +179,21 @@ class EvaluateKerasModel(test.TestCase): os.close(fd) return keras_file + @test_util.run_v1_only('Keras test fails under v2, see b/157266669') def testFloat(self): model = self._getSingleInputKerasModel() keras_file = self._saveKerasModel(model) model_coverage.test_keras_model(keras_file) + @test_util.run_v1_only('Keras test fails under v2, see b/157266669') def testPostTrainingQuantize(self): model = self._getSingleInputKerasModel() keras_file = self._saveKerasModel(model) model_coverage.test_keras_model(keras_file, post_training_quantize=True) + @test_util.run_v1_only('Keras test fails under v2, see b/157266669') def testTargetOps(self): model = self._getSingleInputKerasModel() keras_file = self._saveKerasModel(model) From e8e5b32a9eb9446cc936e2a1c50f46581d5fde23 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 21 May 2020 17:21:30 -0700 Subject: [PATCH 1016/1533] Add vlogging for compression data size. 
PiperOrigin-RevId: 312770839 Change-Id: I4726f9a96af369e6997d4b153b600fd584b203a4 --- tensorflow/core/data/compression_utils.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/data/compression_utils.cc b/tensorflow/core/data/compression_utils.cc index ea06a082128..3fd4a4078b4 100644 --- a/tensorflow/core/data/compression_utils.cc +++ b/tensorflow/core/data/compression_utils.cc @@ -72,6 +72,8 @@ Status CompressElement(const std::vector& element, out->mutable_data())) { return errors::Internal("Failed to compress using snappy."); } + VLOG(3) << "Compressed element from " << total_size << " bytes to " + << out->data().size() << " bytes"; return Status::OK(); } From 13bd111c01dc1c436f1590210f18fd7d71167ab1 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Thu, 21 May 2020 17:22:23 -0700 Subject: [PATCH 1017/1533] Test that micro speech example binary can run PiperOrigin-RevId: 312770984 Change-Id: I8953a37e2a9d7522cdf6714ebb68b72196e15e93 --- .../lite/micro/examples/micro_speech/BUILD | 6 ++++ .../micro_speech_binary_mock_test.sh | 33 +++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100755 tensorflow/lite/micro/examples/micro_speech/micro_speech_binary_mock_test.sh diff --git a/tensorflow/lite/micro/examples/micro_speech/BUILD b/tensorflow/lite/micro/examples/micro_speech/BUILD index d724972fbed..e0e1ca4ad10 100644 --- a/tensorflow/lite/micro/examples/micro_speech/BUILD +++ b/tensorflow/lite/micro/examples/micro_speech/BUILD @@ -381,3 +381,9 @@ cc_binary( "//tensorflow/lite/schema:schema_fbs", ], ) + +sh_test( + name = "micro_speech_binary_mock_test", + srcs = ["micro_speech_binary_mock_test.sh"], + data = [":micro_speech_mock"], +) diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_binary_mock_test.sh b/tensorflow/lite/micro/examples/micro_speech/micro_speech_binary_mock_test.sh new file mode 100755 index 00000000000..f18b7fa2dff --- /dev/null +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_binary_mock_test.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Bash unit tests for the example binary. + +set -e + +OUTPUT_LOG_FILE=${TEST_TMPDIR}/output_log.txt + +# Needed for copybara compatibility. +SCRIPT_BASE_DIR=/org_"tensor"flow +${TEST_SRCDIR}${SCRIPT_BASE_DIR}/tensorflow/lite/micro/examples/micro_speech/micro_speech_mock 2>&1 | head > ${OUTPUT_LOG_FILE} + +if ! 
grep -q 'Heard ' ${OUTPUT_LOG_FILE}; then + echo "ERROR: Expected logs not found in output '${OUTPUT_LOG_FILE}'" + exit 1 +fi + +echo +echo "SUCCESS: micro_speech_binary_mock_test PASSED" From 523269e12be2b468e0b5283f9dcc4860d4500b45 Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Thu, 21 May 2020 17:25:13 -0700 Subject: [PATCH 1018/1533] FloorDiv and FloorMod dynamic shape support PiperOrigin-RevId: 312771368 Change-Id: I053191bca7f885f0146fa84772205ad19e150999 --- .../compiler/mlir/xla/tests/legalize-tf.mlir | 45 ++++++++++++++----- .../xla/transforms/legalize_tf_patterns.td | 14 +++--- 2 files changed, 42 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 74c5e23dc5f..363e60eb341 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -505,8 +505,8 @@ func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> te // CHECK-DAG: [[DIV1:%.+]] = xla_chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ABS1:%.+]] = "xla_hlo.abs"(%arg0) // CHECK-DAG: [[ABS2:%.+]] = "xla_hlo.abs"(%arg1) - // CHECK-DAG: [[ZEROS3:%.+]] = xla_hlo.constant dense<1> - // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ZEROS3]] + // CHECK-DAG: [[ONES:%.+]] = xla_hlo.constant dense<1> + // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ONES]] // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add [[ABS1]], [[SUB]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[NEG:%.+]] = "xla_hlo.negate"([[ADD]]) // CHECK-DAG: [[ABS3:%.+]] = "xla_hlo.abs"(%arg1) @@ -527,8 +527,8 @@ func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32 // CHECK-DAG: [[DIV1:%.+]] = xla_chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[ABS1:%.+]] = "xla_hlo.abs"(%arg0) // CHECK-DAG: [[ABS2:%.+]] = "xla_hlo.abs"(%arg1) - // CHECK-DAG: [[ZEROS3:%.+]] = xla_hlo.constant dense<1> - // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ZEROS3]] + // CHECK-DAG: [[ONES:%.+]] = xla_hlo.constant dense<1> + // CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ONES]] // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add [[ABS1]], [[SUB]] {broadcast_dimensions = dense<1> : tensor<1xi64>} // CHECK-DAG: [[NEG:%.+]] = "xla_hlo.negate"([[ADD]]) // CHECK-DAG: [[ABS3:%.+]] = "xla_hlo.abs"(%arg1) @@ -571,7 +571,22 @@ func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> te // CHECK-LABEL: func @floordiv_dynamic func @floordiv_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: tf.FloorDiv + // CHECK-DAG: [[ZEROS1:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = "LT"} + // CHECK-DAG: [[ZEROS2:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[CMP1]], [[CMP2]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} + // CHECK-DAG: [[DIV1:%.+]] = xla_chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[ABS1:%.+]] = "xla_hlo.abs"(%arg0) + // CHECK-DAG: [[ABS2:%.+]] = "xla_hlo.abs"(%arg1) + // CHECK-DAG: [[ONES:%.+]] = xla_hlo.constant dense<1> + // 
CHECK-DAG: [[SUB:%.+]] = xla_chlo.broadcast_subtract [[ABS2]], [[ONES]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add [[ABS1]], [[SUB]] {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[NEG:%.+]] = "xla_hlo.negate"([[ADD]]) + // CHECK-DAG: [[ABS3:%.+]] = "xla_hlo.abs"(%arg1) + // CHECK-DAG: [[DIV2:%.+]] = xla_chlo.broadcast_divide [[NEG]], [[ABS3]] {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[CMP3]], [[DIV1]], [[DIV2]]) + // CHECK: return [[SELECT]] %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } @@ -589,8 +604,8 @@ func @floormod_broadcast_numerator(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) // CHECK-DAG: [[ZL:%.+]] = xla_hlo.constant dense<0> // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZL]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} // CHECK-DAG: [[ZR:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR:%.+]] {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM:%.+]], [[ZR]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "LT"} // CHECK-DAG: [[CMP4:%.+]] = xla_chlo.broadcast_compare [[CMP2]], [[CMP3]] {comparison_direction = "NE"} // CHECK-DAG: [[AND:%.+]] = xla_chlo.broadcast_and [[CMP1]], [[CMP4]] // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add %arg1, [[REM]] @@ -606,8 +621,8 @@ func @floormod_broadcast_denominator(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32 // CHECK-DAG: [[ZL:%.+]] = xla_hlo.constant dense<0> // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = "NE"} // CHECK-DAG: [[ZR:%.+]] = xla_hlo.constant dense<0> - // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR:%.+]] {comparison_direction = "LT"} - // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM:%.+]], [[ZR]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "LT"} // CHECK-DAG: [[CMP4:%.+]] = xla_chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} // CHECK-DAG: [[AND:%.+]] = xla_chlo.broadcast_and [[CMP1]], [[CMP4]] // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = dense<1> : tensor<1xi64>} @@ -619,7 +634,17 @@ func @floormod_broadcast_denominator(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32 // CHECK-LABEL: func @floormod_dynamic func @floormod_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: tf.FloorMod + // CHECK-DAG: [[REM:%.+]] = xla_chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[ZL:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[CMP1:%.+]] = xla_chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = "NE"} + // CHECK-DAG: [[ZR:%.+]] = xla_hlo.constant dense<0> + // CHECK-DAG: [[CMP2:%.+]] = xla_chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = "LT"} + // CHECK-DAG: [[CMP3:%.+]] = 
xla_chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = "LT"} + // CHECK-DAG: [[CMP4:%.+]] = xla_chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} + // CHECK-DAG: [[AND:%.+]] = xla_chlo.broadcast_and [[CMP1]], [[CMP4]] + // CHECK-DAG: [[ADD:%.+]] = xla_chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[SELECT:%.+]] = "xla_hlo.select"([[AND]], [[ADD]], [[REM]]) + // CHECK-NEXT: return [[SELECT]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 4989d97a360..ef5a8356a32 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -135,19 +135,19 @@ def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), // NOTE: This should be optimized for unsigned integers. // Requires static shaped inputs to create constant splats and computation of // broadcast attributes. -def : Pat<(TF_FloorDivOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), +def : Pat<(TF_FloorDivOp AnyRankedTensor:$l, AnyRankedTensor:$r), (HLO_SelectOp (HLOClient_BroadcastCompareOp - (HLOClient_BroadcastCompareOp $l, (HLO_ConstOp (ConstantSplat<"0"> $l)), + (HLOClient_BroadcastCompareOp $l, (HLO_ConstOp (GetScalarOfType<0> $l)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), - (HLOClient_BroadcastCompareOp $r, (HLO_ConstOp (ConstantSplat<"0"> $r)), + (HLOClient_BroadcastCompareOp $r, (HLO_ConstOp (GetScalarOfType<0> $r)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), (BinBroadcastDimensions $l, $r), HLO_COMPARISON_DIRECTION_EQ), (HLOClient_BroadcastDivOp $l, $r, (BinBroadcastDimensions $l, $r)), (HLOClient_BroadcastDivOp (HLO_NegOp:$neg (HLOClient_BroadcastAddOp (HLO_AbsOp $l), (HLOClient_BroadcastSubOp (HLO_AbsOp $r), - (HLO_ConstOp (ConstantSplat<"1"> $r)), + (HLO_ConstOp (GetScalarOfType<1> $r)), (NullDenseIntElementsAttr)), (BinBroadcastDimensions $l, $r))), (HLO_AbsOp:$abs $r), (BinBroadcastDimensions $neg, $abs))), @@ -160,16 +160,16 @@ def : Pat<(TF_FloorDivOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), // return trunc_mod != 0 && (y < 0 != trunc_mod < 0) ? trunc_mod + y // Requires static shaped inputs to create constant splats and computation of // broadcast attributes. 
-def : Pat<(TF_FloorModOp AnyStaticShapeTensor:$l, AnyStaticShapeTensor:$r), +def : Pat<(TF_FloorModOp AnyRankedTensor:$l, AnyRankedTensor:$r), (HLO_SelectOp (HLOClient_BroadcastAndOp (HLOClient_BroadcastCompareOp (HLOClient_BroadcastRemOp:$rem $l, $r, (BinBroadcastDimensions $l, $r)), - (HLO_ConstOp:$l_zeros (ConstantSplat<"0"> $l)), + (HLO_ConstOp:$l_zeros (GetScalarOfType<0> $l)), (BinBroadcastDimensions $l, $rem), HLO_COMPARISON_DIRECTION_NE), (HLOClient_BroadcastCompareOp (HLOClient_BroadcastCompareOp:$r_cmp $r, - (HLO_ConstOp:$r_zeros (ConstantSplat<"0"> $r)), + (HLO_ConstOp:$r_zeros (GetScalarOfType<0> $r)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_LT), (HLOClient_BroadcastCompareOp:$rem_cmp $rem, $r_zeros, (BinBroadcastDimensions $rem, $r_zeros), HLO_COMPARISON_DIRECTION_LT), From 4eb90ac98fa3eb86cfa0dcc3c063783f304947cb Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Thu, 21 May 2020 17:34:03 -0700 Subject: [PATCH 1019/1533] Improve side-effecting semantics of xla_hlo ops with regions. Add RecursiveSideEffects trait for ops with regions. For xla_hlo.all_reduce, the side-effecting behavior depends on if channel_id has value. Since, it is a dynamic property, we conservatively assume it has side-effects. This follows XLA semantics defined in HloInstruction::HasSideEffect(). PiperOrigin-RevId: 312772441 Change-Id: Iabacfb49451640e23129338c6555774128021cbd --- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 24 ++++++++++--------- .../compiler/mlir/xla/tests/canonicalize.mlir | 20 ++++++++++++++-- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index ed57ded47e7..6c54e3fbf90 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -481,7 +481,7 @@ def HLO_AfterAllOp : HLO_Op<"after_all", [NoSideEffect]> { // Xla Client API has two separate calls for indexed and predicated conditional, // although both eventually map to kConditional HLO. IfOp maps to predicated // conditional use of kConditional HLO. -def HLO_IfOp: HLO_Op<"if", []> { +def HLO_IfOp: HLO_Op<"if", [RecursiveSideEffects]> { string summary = "If operator"; string description = [{ @@ -509,7 +509,7 @@ def HLO_IfOp: HLO_Op<"if", []> { // Xla Client API has two separate calls for indexed and predicated conditional, // although both eventually map to kConditional HLO. CaseOp maps to indexed // conditional use of kConditional HLO. 
-def HLO_CaseOp: HLO_Op<"case", []>, +def HLO_CaseOp: HLO_Op<"case", [RecursiveSideEffects]>, BASE_HLO_CaseOp { let arguments = (ins @@ -525,7 +525,8 @@ def HLO_CaseOp: HLO_Op<"case", []>, } -def HLO_WhileOp: HLO_Op<"while", [SameOperandsAndResultType]> { +def HLO_WhileOp: HLO_Op<"while", [RecursiveSideEffects, + SameOperandsAndResultType]> { string summary = "While operator"; string description = [{ @@ -546,7 +547,7 @@ def HLO_WhileOp: HLO_Op<"while", [SameOperandsAndResultType]> { } def HLO_AllReduceOp : HLO_Op<"all_reduce", - [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_AllReduceOp { + [SameOperandsAndResultType]>, BASE_HLO_AllReduceOp { let arguments = (ins HLO_Tensor:$operand, @@ -573,7 +574,7 @@ def HLO_AllToAllOp : HLO_Op<"all_to_all", } def HLO_ReduceOp: HLO_Op<"reduce", [ - NoSideEffect, + RecursiveSideEffects, SameVariadicOperandSize, SingleBlockImplicitTerminator<"ReturnOp"> ]>, BASE_HLO_ReduceOp { @@ -1054,8 +1055,8 @@ def HLO_GetDimensionSizeOp: HLO_Op<"get_dimension_size", [NoSideEffect]>, } def HLO_MapOp: HLO_Op<"map", - [NoSideEffect, SameOperandsElementType, SameOperandsAndResultShape, - SingleBlockImplicitTerminator<"ReturnOp">]>, + [RecursiveSideEffects, SameOperandsElementType, + SameOperandsAndResultShape, SingleBlockImplicitTerminator<"ReturnOp">]>, BASE_HLO_MapOp { let arguments = (ins Variadic:$operands, @@ -1104,7 +1105,8 @@ def ScatterDimensionNumbers : StructAttr<"ScatterDimensionNumbers", HLO_Dialect, let description = "Structure of dimension information for scatter"; } -def HLO_ScatterOp: HLO_Op<"scatter", [NoSideEffect]>, BASE_HLO_ScatterOp { +def HLO_ScatterOp: HLO_Op<"scatter", [RecursiveSideEffects]>, + BASE_HLO_ScatterOp { let arguments = (ins HLO_Tensor:$operand, HLO_Tensor:$scatter_indices, @@ -1133,7 +1135,7 @@ def HLO_SelectOp: HLO_Op<"select", [NoSideEffect, DeclareOpInterfaceMethods, BASE_HLO_SelectAndScatterOp { + [RecursiveSideEffects]>, BASE_HLO_SelectAndScatterOp { let arguments = (ins HLO_Tensor:$operand, HLO_Tensor:$source, @@ -1160,7 +1162,7 @@ def HLO_SetDimensionSizeOp: HLO_Op<"set_dimension_size", [NoSideEffect]>, let results = (outs HLO_Tensor); } -def HLO_SortOp : HLO_Op<"sort", [NoSideEffect]>, BASE_HLO_SortOp { +def HLO_SortOp : HLO_Op<"sort", [RecursiveSideEffects]>, BASE_HLO_SortOp { let arguments = (ins Variadic:$operands, DefaultValuedAttr:$dimension, @@ -1246,7 +1248,7 @@ def HLO_TriangularSolveOp: HLO_Op<"triangular_solve", } def HLO_ReduceWindowOp: HLO_Op<"reduce_window", [ - NoSideEffect, + RecursiveSideEffects, SingleBlockImplicitTerminator<"ReturnOp"> ]>, BASE_HLO_ReduceWindowOp { diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index 30255586002..afe3e1b73a5 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -387,8 +387,8 @@ func @dynamic_reshape_not_actually_dynamic(%arg0: tensor<4xf32>, %shape: tensor< return %0 : tensor<4x1xf32> } -// CHECK-LABEL: do_not_dce_while -func @do_not_dce_while(%arg0: tensor) -> tensor { +// CHECK-LABEL: do_not_dce_while_with_outfeed +func @do_not_dce_while_with_outfeed(%arg0: tensor) -> tensor { // CHECK: xla_hlo.while %0 = "xla_hlo.while"(%arg0) ( { ^bb0(%arg1: tensor): @@ -404,3 +404,19 @@ func @do_not_dce_while(%arg0: tensor) -> tensor { return %arg0 : tensor } + +// CHECK-LABEL: dce_while_without_side_effect +func @dce_while_without_side_effect(%arg0: tensor) -> tensor { + // CHECK-NOT: xla_hlo.while + %0 = "xla_hlo.while"(%arg0) ( { + 
^bb0(%arg1: tensor): + %1 = "xla_hlo.compare"(%arg1, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + %1 = "xla_hlo.create_token"() : () -> !xla_hlo.token + "xla_hlo.return"(%arg1) : (tensor) -> () + }) : (tensor) -> tensor + + return %arg0 : tensor +} From 89860c9173ac610df1c27682526a8d6eaacf2e3d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 17:39:10 -0700 Subject: [PATCH 1020/1533] Pfor: support vectorizing tensorlists captured by while_loop. PiperOrigin-RevId: 312773003 Change-Id: I863301d39b85486907dadf5c1ac340db1fbe1e6b --- .../ops/parallel_for/control_flow_ops_test.py | 21 +++++++++++++ tensorflow/python/ops/parallel_for/pfor.py | 31 +++++++++++++++++-- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 7faba3241a6..243471553d9 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -993,6 +993,27 @@ class TensorListTest(PForTestCase): self._test_loop_fn(loop_fn, 2) + def test_tensor_list_reserve_while_loop(self): + # Here a loop invariant TensorList is captured by a while_loop, which then + # performs loop dependent operations on it, resulting in a loop variant + # output. This forces stacking of the variant handle captured by the + # while_loop. + # We handle this particular case by forcing vectorization of + # TensorListReserve operation. + v2_enabled = control_flow_v2_toggles.control_flow_v2_enabled() + control_flow_v2_toggles.enable_control_flow_v2() + def loop_fn(i): + handle = list_ops.tensor_list_reserve([], 2, dtypes.int32) + _, out_handle = control_flow_ops.while_loop( + lambda j, _: j < 2, + lambda j, h: (j + 1, list_ops.tensor_list_set_item(h, j, i)), + (0, handle)) + return list_ops.tensor_list_stack(out_handle, dtypes.int32) + + self._test_loop_fn(loop_fn, 2) + if not v2_enabled: + control_flow_v2_toggles.disable_control_flow_v2() + class StackTest(PForTestCase): diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index bd6ff9a0bd1..582bfecdc76 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -24,6 +24,7 @@ import string import sys import traceback +import numpy as np import six from tensorflow.compiler.tf2xla.python import xla @@ -75,7 +76,19 @@ flags.DEFINE_bool( def _stack(t, length): """stacks `t` `length` times.""" - assert t.dtype != dtypes.variant + # Note that this stacking may currently be triggered, for example, when a + # loop invariant tensor with dtype variant is input to a while_loop which then + # produces a loop dependent output. Simply stacking the variants may not be + # suitable since operations on stacked handles may expect a vectorized version + # of the variant. + # Given that variant types are generic, we are currently unable to figure out + # which particular variant type is being considered here and hence it may not + # be safe to allow stacking it. + if t.dtype == dtypes.variant: + raise NotImplementedError( + "Vectorization tried to stack variant tensor %s. " + "This is likely because vectorization of that variant " + "is not fully supported yet." 
% t) ones = array_ops.ones_like(array_ops.shape(t)) ones = array_ops.reshape(ones, [-1]) length = array_ops.reshape(length, [-1]) @@ -104,6 +117,15 @@ passthrough_stateful_ops = set([ ]) +# Ops which we will treat like stateful for the purpose of vectorization. +# Typically this is used to force pfor converters to run for these ops. +force_stateful_ops = set([ + # We vectorize this since we need to change the element shape set on the + # list. + "TensorListReserve", +]) + + def _is_stateful_pfor_op(op): if isinstance(op, WhileOp): return op.is_stateful @@ -112,6 +134,8 @@ def _is_stateful_pfor_op(op): return False if op.type in passthrough_stateful_ops: return False + if op.type in force_stateful_ops: + return True assert hasattr(op, "op_def") and op.op_def is not None, op return op.op_def.is_stateful @@ -3481,9 +3505,10 @@ def _stack_tensor_list_shape(shape, pfor_input): # Note that negative values in the shape are used to signify unknown shapes # and are handled in a special way. if shape_value is not None: - if shape_value == -1 or -1 in shape_value: + shape_value = np.asarray(shape_value) + if -1 in shape_value: return constant_op.constant(-1) - elif not shape_value: + elif not shape_value.size: return first_dim else: shape = array_ops.reshape(shape, [-1]) From 063a0402871bb4e3d09d57adbd51bf1b683e440a Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Thu, 21 May 2020 17:49:27 -0700 Subject: [PATCH 1021/1533] Make tf_saved_model-related passes check their input invariants. These passes rely on the module to have tf_saved_model.semantics, and previously they would spuriously try to do things on modules that did not have those semantics. PiperOrigin-RevId: 312774289 Change-Id: I50b6596f1692bbb9fec3942e9031644b82dd768f --- .../tests/tf_saved_model_freeze_global_tensors.mlir | 6 ++++++ .../tests/tf_saved_model_optimize_global_tensors.mlir | 6 ++++++ .../mlir/tensorflow/transforms/freeze_global_tensors.cc | 3 +++ .../mlir/tensorflow/transforms/optimize_global_tensors.cc | 4 ++++ 4 files changed, 19 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir index 38aa078358b..961039e7968 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir @@ -104,3 +104,9 @@ module attributes {tf_saved_model.semantics} { return } } + +// ----- + +// Test running the pass on a module that does not have +// tf_saved_model.semantics. +module {} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir index f985be16ab8..80d9a498253 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_optimize_global_tensors.mlir @@ -136,3 +136,9 @@ module attributes {tf_saved_model.semantics} { } } + +// ----- + +// Test running the pass on a module that does not have +// tf_saved_model.semantics. 
+module {} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc index 9d2a7e787ff..a0cf9c8eb9a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/freeze_global_tensors.cc @@ -48,6 +48,9 @@ struct FreezeGlobalTensorsPass void FreezeGlobalTensorsPass::runOnOperation() { auto module = getOperation(); + if (!tf_saved_model::HasTfSavedModelSemantics(module)) { + return; + } SymbolTable symbol_table(module); DenseSet frozen_global_tensors; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc index 550100c8ebf..cd8f988fd5f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/optimize_global_tensors.cc @@ -278,6 +278,10 @@ void EraseUnusedBoundInputs(ModuleOp module) { void OptimizeGlobalTensorsPass::runOnOperation() { auto module = getOperation(); + if (!tf_saved_model::HasTfSavedModelSemantics(module)) { + return; + } + EraseUnusedBoundInputs(module); ResourceAnalyzer resource_analyzer(module); From c7229fcabb56c4455d5342146ea595f0e8a62d3e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 17:59:10 -0700 Subject: [PATCH 1022/1533] Update ops-related pbtxt files. PiperOrigin-RevId: 312775359 Change-Id: Iffdf619b6352f62ea92362d7419c7bc16a423685 --- .../ops_history_v2/SparseSegmentMean.pbtxt | 56 ++++++++++++++ .../SparseSegmentMeanWithNumSegments.pbtxt | 73 +++++++++++++++++++ .../ops_history_v2/SparseSegmentSqrtN.pbtxt | 56 ++++++++++++++ .../SparseSegmentSqrtNWithNumSegments.pbtxt | 73 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 4 + 5 files changed, 262 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt index a3fde8699b1..5f362b97cb0 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt @@ -95,3 +95,59 @@ op { } } } +op { + name: "SparseSegmentMean" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tidx" + } + input_arg { + name: "segment_ids" + type_attr: "Tsegmentids" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tsegmentids" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt index 2d1d816200a..60f9c4bbd00 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt @@ -129,3 +129,76 @@ op { } } } +op { + name: "SparseSegmentMeanWithNumSegments" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tidx" + } + 
input_arg { + name: "segment_ids" + type_attr: "Tsegmentids" + } + input_arg { + name: "num_segments" + type_attr: "Tnumsegments" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tnumsegments" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tsegmentids" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt index 6ab44de93ec..68359ea0c08 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt @@ -95,3 +95,59 @@ op { } } } +op { + name: "SparseSegmentSqrtN" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tidx" + } + input_arg { + name: "segment_ids" + type_attr: "Tsegmentids" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tsegmentids" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt index 038a5a2bd28..d16063dca08 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt @@ -129,3 +129,76 @@ op { } } } +op { + name: "SparseSegmentSqrtNWithNumSegments" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tidx" + } + input_arg { + name: "segment_ids" + type_attr: "Tsegmentids" + } + input_arg { + name: "num_segments" + type_attr: "Tnumsegments" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tnumsegments" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tsegmentids" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 2f6e0dc0d4c..98a1b9328be 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -46097,6 +46097,7 @@ op { type: "type" allowed_values { list { + type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46215,6 
+46216,7 @@ op { type: "type" allowed_values { list { + type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46283,6 +46285,7 @@ op { type: "type" allowed_values { list { + type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46401,6 +46404,7 @@ op { type: "type" allowed_values { list { + type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } From 8fc976574e66186e3b4c4b94a6477eb090618cab Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 18:03:31 -0700 Subject: [PATCH 1023/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/1108f5c737db PiperOrigin-RevId: 312775865 Change-Id: Iee2170660e6b2cd0a81695e8843bebfb311c480b --- tensorflow/compiler/mlir/xla/ir/lhlo_ops.td | 4 ++ .../xla/transforms/hlo_legalize_to_lhlo.cc | 4 +- .../mlir/xla/transforms/lhlo_fuse_linalg.cc | 26 ++++++---- third_party/mlir/BUILD | 49 ++++++++++++++++++- 4 files changed, 70 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 020859aa0bf..9a2168d3088 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -444,6 +444,10 @@ def TerminatorOp : let description = [{ Terminator operation for the LHLO dialect. }]; + let builders = [OpBuilder< + "OpBuilder &b, OperationState &result, ValueRange operands", + [{ build(b, result, llvm::None, operands, llvm::None); }] + >]; } #endif // LHLO_OPS diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 11b2ae65d8e..5851bad4565 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -43,8 +43,8 @@ constexpr StringRef kTempBufferAttr = "temp"; template using BaseOpConversion = BufferAssignmentOpConversionPattern; using StdReturnOpConverter = - NonVoidToVoidReturnOpConverter; + NoBufferOperandsReturnOpConverter; Value InsertDynamicAllocAndDealloc(Location loc, Value result, Value shape_operand, diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index 43c0911a4a6..ddbb672c70a 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -57,8 +57,9 @@ class LhloFuseLinalg : public PassWrapper { for (auto func_arg : func.getArguments()) { func_args.insert(func_arg); } + MLIRContext* ctx = func.getContext(); OpBuilder b(func); - OperationFolder folder(func.getContext()); + OperationFolder folder(ctx); func.walk([&](linalg::GenericOp generic_op) { SmallVector tile_sizes(tile_sizes_.begin(), tile_sizes_.end()); @@ -68,12 +69,14 @@ class LhloFuseLinalg : public PassWrapper { auto op = cast(generic_op.getOperation()); for (const Value result : op.getOutputBuffers()) { if (!func_args.count(result)) continue; - if (tileGenericOp(op, tile_sizes, &b, &folder)) { + if (tileGenericOp(op, tile_sizes, &b)) { generic_op.erase(); return; } } }); + auto patterns = linalg::getLinalgTilingCanonicalizationPatterns(ctx); + applyPatternsAndFoldGreedily(func, patterns); // Fuse producers of tiled linalg ops. 
llvm::SmallDenseSet erase_set; @@ -92,19 +95,22 @@ class LhloFuseLinalg : public PassWrapper { *originalOpInLinalgOpsVector = info->fusedProducer.getOperation(); } } + + auto patterns = linalg::getLinalgTilingCanonicalizationPatterns(ctx); + applyPatternsAndFoldGreedily(func, patterns); } for (auto* e : erase_set) e->erase(); } private: - bool tileGenericOp(LinalgOp op, ArrayRef tile_sizes, OpBuilder* b, - OperationFolder* folder) { - auto tiled_generic_op = - use_parallel_loops_ - ? linalg::tileLinalgOpToParallelLoops(*b, op, tile_sizes, - /*permutation=*/{}, folder) - : linalg::tileLinalgOp(*b, op, tile_sizes, - /*permutation=*/{}, folder); + bool tileGenericOp(LinalgOp op, ArrayRef tile_sizes, OpBuilder* b) { + auto loopType = use_parallel_loops_ + ? linalg::LinalgTilingLoopType::ParallelLoops + : linalg::LinalgTilingLoopType::Loops; + auto tiled_generic_op = linalg::tileLinalgOp(*b, op, + linalg::LinalgTilingOptions() + .setTileSizes(tile_sizes) + .setLoopType(loopType)); return tiled_generic_op.hasValue(); } diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 5ebcbb6e3d2..a57088432e2 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -175,6 +175,7 @@ filegroup( filegroup( name = "AffineOpsTdFiles", srcs = [ + "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td", "include/mlir/Dialect/Affine/IR/AffineOps.td", "include/mlir/Dialect/Affine/IR/AffineOpsBase.td", "include/mlir/Interfaces/LoopLikeInterface.td", @@ -207,6 +208,26 @@ gentbl( ], ) +gentbl( + name = "AffineMemoryOpInterfacesIncGen", + strip_include_prefix = "include", + tbl_outs = [ + ( + "-gen-op-interface-decls", + "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h.inc", + ), + ( + "-gen-op-interface-defs", + "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.cpp.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.td", + td_srcs = [ + ":AffineOpsTdFiles", + ], +) + ##---------------------------------------------------------------------------## # AVX512 dialect. 
##---------------------------------------------------------------------------## @@ -462,6 +483,7 @@ cc_library( ]), includes = ["include"], deps = [ + ":AffineMemoryOpInterfacesIncGen", ":AffineOpsIncGen", ":EDSC", ":IR", @@ -677,6 +699,7 @@ cc_library( deps = [ ":CallOpInterfaces", ":CommonFolders", + ":ControlFlowInterfaces", ":Dialect", ":IR", ":InferTypeOpInterface", @@ -1153,6 +1176,28 @@ cc_library( ], ) +cc_library( + name = "GPURuntimeTransforms", + srcs = [ + "lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp", + "lib/Conversion/PassDetail.h", + ], + hdrs = [ + "include/mlir/Conversion/GPUCommon/GPUCommonPass.h", + ], + includes = ["include"], + deps = [ + ":ConversionPassIncGen", + ":GPUDialect", + ":IR", + ":LLVMDialect", + ":Pass", + ":Support", + "@llvm-project//llvm:core", + "@llvm-project//llvm:support", + ], +) + gentbl( name = "GPUToNVVMGen", strip_include_prefix = "lib/Conversion/GPUToNVVM", @@ -1265,7 +1310,6 @@ cc_library( name = "GPUToCUDATransforms", srcs = [ "lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp", - "lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp", "lib/Conversion/PassDetail.h", ], hdrs = ["include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"], @@ -2446,6 +2490,7 @@ cc_library( includes = ["include"], deps = [ ":Analysis", + ":GPURuntimeTransforms", ":GPUToNVVMTransforms", ":GPUToROCDLTransforms", ":GPUToSPIRVTransforms", @@ -2525,6 +2570,7 @@ cc_library( ":ConversionPassIncGen", ":GPUDialect", ":GPUPassIncGen", + ":GPURuntimeTransforms", ":GPUToCUDATransforms", ":GPUToNVVMTransforms", ":GPUToROCDLTransforms", @@ -2730,6 +2776,7 @@ cc_binary( ":AllPassesAndDialectsNoRegistration", ":ExecutionEngineUtils", ":GPUDialect", + ":GPURuntimeTransforms", ":GPUToNVVMTransforms", ":GPUToROCDLTransforms", ":GPUTransforms", From 6b715d723897fd1a1f26893143b7e7781d99f42c Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 21 May 2020 18:08:24 -0700 Subject: [PATCH 1024/1533] Remove superfluous tracemes in compression_utils. Now that we execute compression as a tensorflow op, a traceme will automatically be generated for the op, so we don't need a second traceme inside the compression util. PiperOrigin-RevId: 312776419 Change-Id: I709c89b6a7fafaf41dbc8a64c8c28025cd7cd287 --- tensorflow/core/data/BUILD | 1 - tensorflow/core/data/compression_utils.cc | 6 ------ 2 files changed, 7 deletions(-) diff --git a/tensorflow/core/data/BUILD b/tensorflow/core/data/BUILD index e42c46d6348..1b6e6790559 100644 --- a/tensorflow/core/data/BUILD +++ b/tensorflow/core/data/BUILD @@ -29,7 +29,6 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/profiler/lib:traceme", "@com_google_absl//absl/memory", ], ) diff --git a/tensorflow/core/data/compression_utils.cc b/tensorflow/core/data/compression_utils.cc index 3fd4a4078b4..d132bdca8da 100644 --- a/tensorflow/core/data/compression_utils.cc +++ b/tensorflow/core/data/compression_utils.cc @@ -17,16 +17,12 @@ limitations under the License. #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/platform/snappy.h" -#include "tensorflow/core/profiler/lib/traceme.h" namespace tensorflow { namespace data { Status CompressElement(const std::vector& element, CompressedElement* out) { - tensorflow::profiler::TraceMe activity( - "CompressElement", tensorflow::profiler::TraceMeLevel::kInfo); - // Step 1: Determine the total uncompressed size. 
This requires serializing // non-memcopyable tensors, which we save to use again later. std::vector non_memcpy_components; @@ -79,8 +75,6 @@ Status CompressElement(const std::vector& element, Status UncompressElement(const CompressedElement& compressed, std::vector* out) { - tensorflow::profiler::TraceMe activity( - "UncompressElement", tensorflow::profiler::TraceMeLevel::kInfo); int num_components = compressed.component_metadata_size(); out->clear(); out->reserve(num_components); From e3290d584e4df09588768a12b3491733d68ff246 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Thu, 21 May 2020 18:34:56 -0700 Subject: [PATCH 1025/1533] Bump open source llvm revision to 1108f5c737dbdab0277874a7e5b237491839c43a PiperOrigin-RevId: 312779244 Change-Id: I76c921236ebc962b3907994d9a26456b995183ed --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f82aa7caa37..b7682468998 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "7af0c8559b6d9426dd5e977370516d2baa4c206f" - LLVM_SHA256 = "4c5efbc48755f9983a8522eddd6e448f0b93e3e75a56a507c1ecb44d367db6d5" + LLVM_COMMIT = "1108f5c737dbdab0277874a7e5b237491839c43a" + LLVM_SHA256 = "bbdaaa145a5a8eed8e6a0f06a3b9965f32b03286eddea5f50c5af2d1f3d008df" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 86309a4f4e58282d547a8d6a21296c9c0fdf24bf Mon Sep 17 00:00:00 2001 From: Russell Power Date: Thu, 21 May 2020 18:45:15 -0700 Subject: [PATCH 1026/1533] Internal TF change. 
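Stepping back briefly to the compression_utils change above: the explicit TraceMes can go because every executed op already emits its own trace event, so hand-written annotations only pay off when they carry information the op-level events lack (such as a training step number). A rough Python-side sketch of that layering, using the public profiler API; the log directory, step count and tensor sizes below are placeholders for illustration, not part of this patch:

```py
import tensorflow as tf

logdir = "/tmp/profile_demo"  # placeholder scratch location for this sketch

@tf.function
def train_step(x):
  # Every op executed here is traced automatically by the profiler, just as
  # the compression op now provides the trace event for (Un)CompressElement.
  return tf.reduce_sum(tf.square(x))

tf.profiler.experimental.start(logdir)
for step in range(3):
  # A manual annotation is only worthwhile for context the op-level events
  # do not already carry, e.g. which training step the ops belong to.
  with tf.profiler.experimental.Trace("train", step_num=step, _r=1):
    train_step(tf.random.normal([128, 128]))
tf.profiler.experimental.stop()
```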
PiperOrigin-RevId: 312780274 Change-Id: I1ec0ecccc732b04a838f2b845c0f69e4f94f5a1f --- tensorflow/compiler/xla/literal.cc | 26 ++++++++++++++++++++++++++ tensorflow/compiler/xla/literal.h | 4 ++++ 2 files changed, 30 insertions(+) diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index cbbad741ce3..73c37d6b2f3 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -2104,6 +2104,32 @@ MutableBorrowingLiteral::MutableBorrowingLiteral(const char* src_buf_ptr, root_piece_->set_subshape(shape_.get()); } +MutableBorrowingLiteral::MutableBorrowingLiteral(absl::Span src_buf_ptrs, + const Shape& shape) + : MutableLiteralBase() { + shape_ = absl::make_unique(shape); + if (!shape_->IsTuple()) { + CHECK_EQ(src_buf_ptrs.size(), 1); + root_piece_ = new Piece(); + root_piece_->set_buffer(const_cast(src_buf_ptrs[0])); + root_piece_->set_subshape(shape_.get()); + } else { + CHECK(!ShapeUtil::IsNestedTuple(*shape_)); + CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(*shape_)); + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + + for (int i = 0; i < src_buf_ptrs.size(); ++i) { + Piece child_piece; + const auto& src_shape = shape_->tuple_shapes(i); + CHECK(src_shape.IsArray()); + child_piece.set_subshape(&src_shape); + child_piece.set_buffer(src_buf_ptrs[i]); + root_piece_->emplace_back(std::move(child_piece)); + } + } +} + MutableBorrowingLiteral::~MutableBorrowingLiteral() { if (root_piece_ != nullptr) { delete root_piece_; diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h index 1553d042e80..a2be92fbf5b 100644 --- a/tensorflow/compiler/xla/literal.h +++ b/tensorflow/compiler/xla/literal.h @@ -776,6 +776,10 @@ class MutableBorrowingLiteral : public MutableLiteralBase { const ShapeIndex& view_root); MutableBorrowingLiteral(const char* src_buf_ptr, const Shape& shape); + // Create a literal from a list of buffers and a shape. + // Returns a tuple literal if `shape` is a tuple type. + MutableBorrowingLiteral(absl::Span src_buf_ptrs, const Shape& shape); + private: // Recursively copies the subtree from the `src_piece` at the given child // index to the `dest_piece`. 
For buffers only the pointers are copied, but From 138c8e71459274c7a1cb2dc5f177ff69a9cf752f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Thu, 21 May 2020 18:59:16 -0700 Subject: [PATCH 1027/1533] [Grappler] Do not add data inputs after control inputs in ImplementationSelector PiperOrigin-RevId: 312781559 Change-Id: I5fa39b5c4f4250604274a7c39f23de71cf3d7608 --- tensorflow/core/grappler/optimizers/BUILD | 1 + .../optimizers/implementation_selector.cc | 15 +++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index b880055b47d..030064e49fb 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -1064,6 +1064,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", + "//tensorflow/core/grappler:utils", "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/core/grappler/utils:graph_view", "@com_google_absl//absl/strings", diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc index 37dda6ab6a3..9c4f74d7268 100644 --- a/tensorflow/core/grappler/optimizers/implementation_selector.cc +++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" #include "tensorflow/core/grappler/optimizers/function_api_info.h" +#include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/grappler/utils/graph_view.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/stringpiece.h" @@ -159,6 +160,15 @@ Status UpdateNodeDef(utils::MutableNodeView* node_view, const string& funcName, } if (apiInfo.function_type() == FunctionApiInfo::BACKWARD) { + // Strip node control dependencies. We'll add them back after updating + // all the data inputs. + std::vector control_deps; + for (int i = node_def->input_size() - 1; i >= 0; --i) { + if (!IsControlInput(node_def->input(i))) break; + control_deps.push_back(node_def->input(i)); + node_def->mutable_input()->RemoveLast(); + } + // For step 4 above. const int prev_input_size = node_def->input_size(); const int diff = prev_input_size - apiInfo.input_arg_dtypes().size(); @@ -194,6 +204,11 @@ Status UpdateNodeDef(utils::MutableNodeView* node_view, const string& funcName, for (int i = 1; i <= -diff; ++i) node_def->add_input(strings::StrCat(node_name, ":", i + last_index)); } + + // Add control dependencies back. + for (std::string& control : control_deps) + node_def->add_input(std::move(control)); + } else if (apiInfo.function_type() == FunctionApiInfo::FORWARD) { // For forward function, since the DTYPE of the intermediate state might // have been changed, we want to update the down stream Identity node if From b0b60f9141e49930f14133adb83f4137f6dc6893 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 21 May 2020 19:07:16 -0700 Subject: [PATCH 1028/1533] Add bfloat16 support for SparseSegmentMean*/SparseSegmentSqrtN* PiperOrigin-RevId: 312782379 Change-Id: If5cb060bbbb5f624f69c5ac1350f9e9ef2e6a920 --- .../core/kernels/segment_reduction_ops_impl.h | 263 ++++++++---------- .../kernels/segment_reduction_ops_impl_5.cc | 2 - tensorflow/core/ops/math_ops.cc | 8 +- 3 files changed, 121 insertions(+), 152 deletions(-) diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl.h b/tensorflow/core/kernels/segment_reduction_ops_impl.h index ccd775b7ef2..8954dcd4681 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_impl.h +++ b/tensorflow/core/kernels/segment_reduction_ops_impl.h @@ -508,12 +508,6 @@ class SparseSegmentReductionOpBase : public OpKernel { errors::InvalidArgument("segment ids must be >= 0")); auto output_flat = output->flat_outer_dims(); - Tensor temp; - if constexpr (std::is_same::value) { - temp = tensorflow::Tensor(DT_FLOAT, output_shape); - } - auto temp_flat = temp.flat_outer_dims(); - int64 start = 0, end = 1; // Index from which the output is not initialized. SegmentId uninitialized_index = 0; @@ -552,9 +546,8 @@ class SparseSegmentReductionOpBase : public OpKernel { } auto out = output_flat.template chip<0>(out_index); - auto temp = temp_flat.template chip<0>(out_index); const int bad_offset = - Reduce(input_flat, indices_vec, start, end - start, out, temp); + Reduce(input_flat, indices_vec, start, end - start, out); OP_REQUIRES(context, bad_offset < 0, errors::InvalidArgument( "Bad: indices[", start + bad_offset, @@ -579,152 +572,130 @@ class SparseSegmentReductionOpBase : public OpKernel { } private: - // TODO(jaideepsi): re-write without macros, simplify Reduce b/157240265 - int64 Reduce( - const typename TTypes::ConstMatrix& input_flat, - const typename TTypes::ConstVec& indices_vec, int64 start, - int64 num, Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, - Eigen::TensorChippingOp<0, typename TTypes::Matrix> temp) { -#define REDUCE \ - if (num == 1) { \ - INDEX(0, 0); \ - OUT = L(0); \ - } else { \ - int64 r = num & 7; \ - DT m(1); \ - if (is_mean_ && (num < 10)) { \ - m = DT(num); \ - } \ - if (is_sqrtn_ && (num < 10)) { \ - m = DT(sqrt(num)); \ - } \ - switch (r) { \ - case 2: { \ - INDEX(0, 0); \ - INDEX(1, 1); \ - OUT = (L(0) + L(1)) / m; \ - break; \ - } \ - case 3: { \ - INDEX(0, 0); \ - INDEX(1, 1); \ - INDEX(2, 2); \ - OUT = (L(0) + L(1) + L(2)) / m; \ - break; \ - } \ - case 4: { \ - INDEX(0, 0); \ - INDEX(1, 1); \ - INDEX(2, 2); \ - INDEX(3, 3); \ - OUT = (L(0) + L(1) + L(2) + L(3)) / m; \ - break; \ - } \ - case 5: { \ - INDEX(0, 0); \ - INDEX(1, 1); \ - INDEX(2, 2); \ - INDEX(3, 3); \ - INDEX(4, 4); \ - OUT = (L(0) + L(1) + L(2) + L(3) + L(4)) / m; \ - break; \ - } \ - case 6: { \ - INDEX(0, 0); \ - INDEX(1, 1); \ - INDEX(2, 2); \ - INDEX(3, 3); \ - INDEX(4, 4); \ - INDEX(5, 5); \ - OUT = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) / m; \ - break; \ - } \ - case 7: { \ - INDEX(0, 0); \ - INDEX(1, 1); \ - INDEX(2, 2); \ - INDEX(3, 3); \ - INDEX(4, 4); \ - INDEX(5, 5); \ - INDEX(6, 6); \ - OUT = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) / m; \ - break; \ - } \ - case 0: { \ - INDEX(0, 0); \ - INDEX(1, 1); \ - INDEX(2, 2); \ - INDEX(3, 3); \ - INDEX(4, 4); \ - INDEX(5, 5); \ - INDEX(6, 6); \ - INDEX(7, 7); \ - OUT = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7)) / m; \ - r = 8; \ - break; \ - } \ - case 1: { \ - INDEX(0, 0); \ - INDEX(1, 1); \ - INDEX(2, 2); \ - INDEX(3, 3); \ - INDEX(4, 4); \ 
- INDEX(5, 5); \ - INDEX(6, 6); \ - INDEX(7, 7); \ - INDEX(8, 8); \ - OUT = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) / \ - m; \ - r = 9; \ - break; \ - } \ - } \ - for (; r < num; r += 8) { \ - INDEX(0, r); \ - INDEX(1, r + 1); \ - INDEX(2, r + 2); \ - INDEX(3, r + 3); \ - INDEX(4, r + 4); \ - INDEX(5, r + 5); \ - INDEX(6, r + 6); \ - INDEX(7, r + 7); \ - OUT += L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7); \ - } \ - if (is_mean_ && num >= 10) { \ - OUT = OUT / static_cast
(num); \ - } \ - if (is_sqrtn_ && num >= 10) { \ - OUT = OUT / static_cast
(sqrt(num)); \ - } \ - } - + int64 Reduce(const typename TTypes::ConstMatrix& input_flat, + const typename TTypes::ConstVec& indices_vec, int64 start, + int64 num, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> out) { #define INDEX(n, i) \ const auto index##n = indices_vec(start + (i)); \ if (!FastBoundsCheck(index##n, input_flat.dimension(0))) return (i); - if constexpr (std::is_same::value) { -#define L(n) input_flat.template chip<0>(index##n).template cast() -#define OUT temp -#define DT float - - REDUCE; - out = temp.template cast(); -#undef DT -#undef OUT -#undef L - } else { #define L(n) input_flat.template chip<0>(index##n) -#define OUT out -#define DT T - REDUCE; - -#undef DT -#undef OUT -#undef L + if (num == 1) { + INDEX(0, 0); + out = L(0); + } else { + int64 r = num % 8; + T m(1); + if (is_mean_ && (num < 10)) { + m = T(num); + } + if (is_sqrtn_ && (num < 10)) { + m = T(sqrt(num)); + } + switch (r) { + case 2: { + INDEX(0, 0); + INDEX(1, 1); + out = (L(0) + L(1)) / m; + break; + } + case 3: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + out = (L(0) + L(1) + L(2)) / m; + break; + } + case 4: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + out = (L(0) + L(1) + L(2) + L(3)) / m; + break; + } + case 5: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + out = (L(0) + L(1) + L(2) + L(3) + L(4)) / m; + break; + } + case 6: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + INDEX(5, 5); + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) / m; + break; + } + case 7: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + INDEX(5, 5); + INDEX(6, 6); + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) / m; + break; + } + case 0: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + INDEX(5, 5); + INDEX(6, 6); + INDEX(7, 7); + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7)) / m; + r = 8; + break; + } + case 1: { + INDEX(0, 0); + INDEX(1, 1); + INDEX(2, 2); + INDEX(3, 3); + INDEX(4, 4); + INDEX(5, 5); + INDEX(6, 6); + INDEX(7, 7); + INDEX(8, 8); + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) / + m; + r = 9; + break; + } + } + for (; r < num; r += 8) { + INDEX(0, r); + INDEX(1, r + 1); + INDEX(2, r + 2); + INDEX(3, r + 3); + INDEX(4, r + 4); + INDEX(5, r + 5); + INDEX(6, r + 6); + INDEX(7, r + 7); + out += L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7); + } + if (is_mean_ && num >= 10) { + out = out / static_cast(num); + } + if (is_sqrtn_ && num >= 10) { + out = out / static_cast(sqrt(num)); + } } + return -1; -#undef REDUCE +#undef L #undef INDEX } diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc b/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc index 03a448e52b3..fee0f818c5e 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc @@ -64,7 +64,6 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE); segment_ids_type>); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(float); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(double); -REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(bfloat16); #undef REGISTER_CPU_SPARSE_KERNELS #define REGISTER_CPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \ @@ -86,7 +85,6 @@ REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(bfloat16); CPUDevice, type, index_type, segment_ids_type>); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(float); 
REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(double); -REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(bfloat16); #undef REGISTER_CPU_SPARSE_KERNELS #define REGISTER_CPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \ diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 5327995e0a4..cbf03d7b045 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1337,7 +1337,7 @@ REGISTER_OP("SparseSegmentMean") .Input("indices: Tidx") .Input("segment_ids: Tsegmentids") .Output("output: T") - .Attr("T: {bfloat16, float, double}") + .Attr("T: {float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") .SetShapeFn(SparseSegmentReductionShapeFn); @@ -1348,7 +1348,7 @@ REGISTER_OP("SparseSegmentMeanWithNumSegments") .Input("segment_ids: Tsegmentids") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: {bfloat16, float, double}") + .Attr("T: {float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") @@ -1370,7 +1370,7 @@ REGISTER_OP("SparseSegmentSqrtN") .Input("indices: Tidx") .Input("segment_ids: Tsegmentids") .Output("output: T") - .Attr("T: {bfloat16, float, double}") + .Attr("T: {float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") .SetShapeFn(SparseSegmentReductionShapeFn); @@ -1381,7 +1381,7 @@ REGISTER_OP("SparseSegmentSqrtNWithNumSegments") .Input("segment_ids: Tsegmentids") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: {bfloat16, float, double}") + .Attr("T: {float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") From 08b5c94d57ed1aed1120ffc0ec0a2450be61a144 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 19:18:46 -0700 Subject: [PATCH 1029/1533] Internal change PiperOrigin-RevId: 312783332 Change-Id: Ic995d972eebdb07a3473d7778addeec73d3193fb --- tensorflow/core/kernels/mkl_tmp_bf16_ops.cc | 4 +--- tensorflow/core/kernels/reduction_ops.h | 24 --------------------- tensorflow/core/ops/nn_grad.cc | 4 ---- tensorflow/python/ops/math_ops_test.py | 10 --------- tensorflow/python/ops/nn_grad_test.py | 16 -------------- tensorflow/python/ops/nn_test.py | 15 ------------- 6 files changed, 1 insertion(+), 72 deletions(-) diff --git a/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc b/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc index ed5fec677e8..9b2d09fb827 100644 --- a/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc +++ b/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc @@ -58,9 +58,7 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER( \ Name("_FusedMatMul").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); \ REGISTER_KERNEL_BUILDER( \ - Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("Softmax").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); + Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); TF_CALL_bfloat16(REGISTER_CPU); #undef REGISTER_CPU diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h index e492f4b9cdd..46d8051fff1 100644 --- a/tensorflow/core/kernels/reduction_ops.h +++ b/tensorflow/core/kernels/reduction_ops.h @@ -19,7 +19,6 @@ limitations under the License. // Functor definitions for Reduction ops, must be compilable by nvcc. 
#include - #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" @@ -59,29 +58,6 @@ struct ReduceEigenImpl { } }; -// Specialization for BF16 Reducer to fix accuracy. -// TODO: all BF16 Reducer should have specialization to fix accuracy. -#define CASTING_SPECIALIZATION(Reducer, ScalarType, IntermediateType) \ - template \ - struct ReduceEigenImpl> { \ - void operator()(const Device& d, OUT_T out, IN_T in, \ - const ReductionAxes& reduction_axes, \ - const Reducer& reducer) { \ - static_assert(std::is_same::value, \ - ""); \ - Reducer intermediate_reducer; \ - auto in_as_intermediate = in.template cast(); \ - out.device(d) = \ - in_as_intermediate.reduce(reduction_axes, intermediate_reducer) \ - .template cast(); \ - } \ - }; - -CASTING_SPECIALIZATION(Eigen::internal::SumReducer, bfloat16, float); -#undef CASTING_SPECIALIZATION - template struct ReduceEigenImpl Date: Thu, 21 May 2020 19:28:11 -0700 Subject: [PATCH 1030/1533] [XLA] Fixup the bug in tautological compare simplifier, as spotted by Sanjoy PiperOrigin-RevId: 312784003 Change-Id: I5f55e0f74cca1750679deb1d791d2fb6a84a929b --- .../xla/service/algebraic_simplifier.cc | 31 +++++++------------ .../xla/service/algebraic_simplifier_test.cc | 19 ++++++++++++ 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 2fbfd156844..440e04c9205 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -874,28 +874,21 @@ StatusOr AlgebraicSimplifierVisitor::TrySimplifyTautologicalCompare( int64 constant; }; - auto get_compare_info_helper = - [&](HloInstruction* lhs, - HloInstruction* rhs) -> absl::optional { - if (!Match(rhs, m::Constant().WithShape( - m::Shape().IsEffectiveScalar().WithElementType( - PrimitiveType::S32)))) { - return absl::nullopt; - } - return {LessThanCompareInfo{lhs, *rhs->literal().GetFirstInteger()}}; - }; - auto get_compare_info = [&](HloInstruction* cmp) -> absl::optional { HloInstruction *lhs, *rhs; - if (!Match(cmp, m::Compare(m::Op(&lhs), m::Op(&rhs)) - .WithComparisonDirection(ComparisonDirection::kLt))) { - return absl::nullopt; - } - if (auto match1 = get_compare_info_helper(lhs, rhs)) { - return match1; - } else if (auto match2 = get_compare_info_helper(rhs, lhs)) { - return match2; + auto scalar_shape_matcher = + m::Shape().IsEffectiveScalar().WithElementType(PrimitiveType::S32); + if (Match(cmp, m::Compare(m::Op(&lhs), + m::Constant(&rhs).WithShape(scalar_shape_matcher)) + .WithComparisonDirection(ComparisonDirection::kLt))) { + return {LessThanCompareInfo{lhs, *rhs->literal().GetFirstInteger()}}; + } else if (Match( + cmp, + m::Compare(m::Constant(&lhs).WithShape(scalar_shape_matcher), + m::Op(&rhs)) + .WithComparisonDirection(ComparisonDirection::kGt))) { + return {LessThanCompareInfo{rhs, *lhs->literal().GetFirstInteger()}}; } return absl::nullopt; }; diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 0260a925b63..9f823c76d80 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -5780,6 +5780,25 @@ TEST_F(AlgebraicSimplifierTest, CompareSimplified) { .WithComparisonDirection(ComparisonDirection::kLt))); } 
+TEST_F(AlgebraicSimplifierTest, CompareSimplifiedReversed) { + const char* kModuleStr = R"( + HloModule m + test { + param = s32[] parameter(0) + c1 = s32[] constant(10) + c2 = s32[] constant(100) + cmp1 = pred[] compare(param, c1), direction=LT + cmp2 = pred[] compare(c2, param), direction=GT + ROOT out = pred[] and(cmp1, cmp2) + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Compare(m::Op(), m::Op().IsConstantScalar(10)) + .WithComparisonDirection(ComparisonDirection::kLt))); +} + TEST_F(AlgebraicSimplifierTest, CanDisableDotToMultiplyRewrite) { // Some backends may have better performance by treating an outer product as a // Dot, rather than a broadcast Multiply From 0f178c37083daaeeaef156e79c673b9018e4df6e Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Thu, 21 May 2020 19:44:32 -0700 Subject: [PATCH 1031/1533] [TF/XLA] Support F64 conversion for tf.cumsum PiperOrigin-RevId: 312785189 Change-Id: I88b4bfe7c2448218230c09eb11eb672e3a40a85a --- tensorflow/compiler/tf2xla/kernels/scan_ops.cc | 6 ++---- tensorflow/python/eager/def_function_xla_jit_test.py | 9 +++++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc index 8431724f438..beb8e7aa174 100644 --- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc @@ -36,10 +36,8 @@ limitations under the License. namespace tensorflow { namespace { -// TODO(phawkins): implement double-sized windowed reductions in XLA and remove -// the type constraint. -constexpr std::array kScanOpTypes = { - {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_INT32}}; +constexpr std::array kScanOpTypes = { + {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_INT32}}; class ScanOp : public XlaOpKernel { public: diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py index 0e89887647a..5fdf0487333 100644 --- a/tensorflow/python/eager/def_function_xla_jit_test.py +++ b/tensorflow/python/eager/def_function_xla_jit_test.py @@ -355,6 +355,15 @@ class DefFunctionTest(test.TestCase): self.assertAllClose([5.0, 5.0, 5.0], g()) self.assertAllClose(compiled_g(), g()) + def testCumsum(self): + + @def_function.function(experimental_compile=True) + def f(x): + return math_ops.cumsum(x) + + f64_input = constant_op.constant([1.1, 2.2, 3.3], dtype=dtypes.float64) + self.assertAllClose([1.1, 3.3, 6.6], f(f64_input)) + if __name__ == '__main__': ops.enable_eager_execution() From 4ce6280b4812308946cded072b379964f850654a Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Thu, 21 May 2020 19:46:14 -0700 Subject: [PATCH 1032/1533] Fix TFLiteConverter2 API Documentation to read frozen_graphs. 
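One user-visible note on the cumsum change above before moving on: with the relaxed kScanOpTypes constraint, a float64 cumulative sum no longer needs a cast inside an XLA-compiled function. A minimal sketch (input values are arbitrary):

```py
import tensorflow as tf

@tf.function(experimental_compile=True)
def running_total(x):
  # Lowers to an XLA windowed reduction; float64 inputs are now accepted
  # directly instead of being rejected by the scan kernel's type constraint.
  return tf.cumsum(x)

x = tf.constant([1.1, 2.2, 3.3], dtype=tf.float64)
print(running_total(x))  # [1.1, 3.3, 6.6]
```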
PiperOrigin-RevId: 312785311 Change-Id: I6f5ec2dd5ee0d5796e3fd8c0c35fb50f78d56fab --- tensorflow/lite/g3doc/convert/1x_compatibility.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/convert/1x_compatibility.md b/tensorflow/lite/g3doc/convert/1x_compatibility.md index 9f9f277a8d9..ceb99bad5e2 100644 --- a/tensorflow/lite/g3doc/convert/1x_compatibility.md +++ b/tensorflow/lite/g3doc/convert/1x_compatibility.md @@ -34,7 +34,7 @@ input_arrays = ['input_name'] # A list of the names of the model's output tensors output_arrays = ['output_name'] # Load and convert the frozen graph -converter = tf.lite.TFLiteConverter.from_frozen_graph( +converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph( graph_def_file, input_arrays, output_arrays) tflite_model = converter.convert() # Write the converted model to disk From 221af69be04e5b580add966991da598d48257f5e Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Thu, 21 May 2020 19:52:05 -0700 Subject: [PATCH 1033/1533] [tfdbg2] Document the need to call set_soft_device_placement(True) on TPUs PiperOrigin-RevId: 312785683 Change-Id: I388cb0a4d0d9eac1005bf4e52d10153d7bcd200f --- .../python/debug/lib/check_numerics_callback.py | 15 +++++++++++++++ tensorflow/python/debug/lib/dumping_callback.py | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/tensorflow/python/debug/lib/check_numerics_callback.py b/tensorflow/python/debug/lib/check_numerics_callback.py index edcafad201e..440dc758e76 100644 --- a/tensorflow/python/debug/lib/check_numerics_callback.py +++ b/tensorflow/python/debug/lib/check_numerics_callback.py @@ -410,6 +410,21 @@ def enable_check_numerics(stack_height_limit=30, z = tf.matmul(y, y) ``` + NOTE: If your code is running on TPUs, be sure to call + `tf.config.set_soft_device_placement(True)` before calling + `tf.debugging.enable_check_numerics()` as this API uses automatic outside + compilation on TPUs. For example: + + ```py + tf.config.set_soft_device_placement(True) + tf.debugging.enable_check_numerics() + + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + strategy = tf.distribute.experimental.TPUStrategy(resolver) + with strategy.scope(): + # ... + ``` + Args: stack_height_limit: Limit to the height of the printed stack trace. Applicable only to ops in `tf.function`s (graphs). diff --git a/tensorflow/python/debug/lib/dumping_callback.py b/tensorflow/python/debug/lib/dumping_callback.py index 5f7fe5e7ea4..f012faf5f3c 100644 --- a/tensorflow/python/debug/lib/dumping_callback.py +++ b/tensorflow/python/debug/lib/dumping_callback.py @@ -721,6 +721,22 @@ def enable_dump_debug_info(dump_root, # Code to build, train and run your TensorFlow model... ``` + NOTE: If your code is running on TPUs, be sure to call + `tf.config.set_soft_device_placement(True)` before calling + `tf.debugging.experimental.enable_dump_debug_info()` as this API uses + automatic outside compilation on TPUs. For example: + + ```py + tf.config.set_soft_device_placement(True) + tf.debugging.experimental.enable_dump_debug_info( + logdir, tensor_debug_mode="FULL_HEALTH") + + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + strategy = tf.distribute.experimental.TPUStrategy(resolver) + with strategy.scope(): + # ... + ``` + Args: dump_root: The directory path where the dumping information will be written. tensor_debug_mode: Debug mode for tensor values, as a string. 
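Both debugger entry points touched by this change are single-call opt-ins; outside the TPU case documented above no further setup is needed. A minimal sketch of what tf.debugging.enable_check_numerics reports once enabled (values are arbitrary, and the offending call is left commented out so the snippet runs cleanly):

```py
import tensorflow as tf

tf.debugging.enable_check_numerics()

@tf.function
def log_ratio(x, y):
  # Any op in this graph that produces a NaN or Inf is reported with the op
  # name and a stack trace pointing back at this source line.
  return tf.math.log(x / y)

print(log_ratio(tf.constant(4.0), tf.constant(2.0)))  # fine: log(2.0)
# log_ratio(tf.constant(4.0), tf.constant(0.0)) would raise an
# InvalidArgumentError, because 4.0 / 0.0 produces an Inf.
```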
From 21fdbbb07f8ff7d27d3545d740c0bace5a3f23eb Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 21 May 2020 20:41:46 -0700 Subject: [PATCH 1034/1533] Add tf.function test for device placement logging PiperOrigin-RevId: 312789434 Change-Id: I26b4f34546cfe759a484a7f2b5b0bb234512d333 --- tensorflow/python/client/session_test.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index 1c244c1b297..074b50bf69b 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -34,6 +34,7 @@ from tensorflow.core.lib.core import error_codes_pb2 from tensorflow.core.protobuf import config_pb2 from tensorflow.python.client import session from tensorflow.python.eager import context +from tensorflow.python.eager import def_function from tensorflow.python.framework import config from tensorflow.python.framework import constant_op from tensorflow.python.framework import device as framework_device_lib @@ -1911,8 +1912,8 @@ class SessionTest(test_util.TensorFlowTestCase): def __str__(self): return self._output + context.set_log_device_placement(True) if context.executing_eagerly(): - context.set_log_device_placement(True) with CaptureStderr() as log: a = constant_op.constant(1) b = constant_op.constant(2) @@ -1939,6 +1940,22 @@ class SessionTest(test_util.TensorFlowTestCase): add_executions = [l for l in str(log).splitlines() if 'AddV2' in l] self.assertEqual(len(add_executions), 2) + @def_function.function + def fn(): + a = constant_op.constant(1) + b = constant_op.constant(2) + c = a + b + d = a + b + return c, d + + with CaptureStderr() as log: + c, d = self.evaluate(fn()) + self.assertEqual(c, 3) + self.assertEqual(d, 3) + # Ensure that we did log device placement. + add_executions = [l for l in str(log).splitlines() if 'AddV2' in l] + self.assertEqual(len(add_executions), 2) + @test_util.run_v1_only('b/120545219') def testLocalMasterSessionTimeout(self): # Test that the timeout passed in a config to the session works correctly. From 7047ceec37a3f004386621e8e56b825ab0d648a3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 20:44:46 -0700 Subject: [PATCH 1035/1533] Update sparse input documentation. PiperOrigin-RevId: 312789707 Change-Id: I09410e9adc25cfe6099cf1fd1a77edc3680a3a59 --- tensorflow/python/keras/engine/input_layer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py index 1fa380815fc..02e43110697 100644 --- a/tensorflow/python/keras/engine/input_layer.py +++ b/tensorflow/python/keras/engine/input_layer.py @@ -218,7 +218,9 @@ def Input( # pylint: disable=invalid-name dtype: The data type expected by the input, as a string (`float32`, `float64`, `int32`...) sparse: A boolean specifying whether the placeholder to be created is - sparse. Only one of 'ragged' and 'sparse' can be True. + sparse. Only one of 'ragged' and 'sparse' can be True. Note that, + if `sparse` is False, sparse tensors can still be passed into the + input - they will be densified with a default value of 0. tensor: Optional existing tensor to wrap into the `Input` layer. If set, the layer will not create a placeholder tensor. 
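The sentence added to the Input docstring describes behaviour that is easy to miss: a SparseTensor fed to a non-sparse input is densified with a default value of 0 before downstream layers see it. A small sketch of the documented behaviour; layer sizes and values are arbitrary, and the explicit tf.sparse.to_dense call shows the equivalent manual densification:

```py
import tensorflow as tf

inputs = tf.keras.Input(shape=(3,))            # sparse=False is the default
outputs = tf.keras.layers.Dense(1)(inputs)
model = tf.keras.Model(inputs, outputs)

sp = tf.sparse.SparseTensor(indices=[[0, 0], [1, 2]],
                            values=[1.0, 2.0],
                            dense_shape=[2, 3])

# Missing entries are filled with 0, so feeding the SparseTensor is
# equivalent to densifying it by hand first:
dense_equivalent = tf.sparse.to_dense(sp, default_value=0.0)
print(model.predict(sp))
print(model.predict(dense_equivalent))  # same values
```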
ragged: A boolean specifying whether the placeholder to be created is From 42273e6b297870747bff2fa0d1ad38181003fb4b Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Thu, 21 May 2020 20:55:24 -0700 Subject: [PATCH 1036/1533] Rearrange the binary __operator__ code in TensorFlow to be more dispatch-friendly. PiperOrigin-RevId: 312790610 Change-Id: I5f95a71c1cf49a612e3d37016e59343ced006587 --- tensorflow/python/ops/math_ops.py | 58 ++++++++++++++------------ tensorflow/python/ops/math_ops_test.py | 39 +++++++++++++++++ 2 files changed, 71 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 18dda547cbe..ed1db4f539d 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1110,21 +1110,26 @@ def _OverrideBinaryOperatorHelper(func, op_name, clazz_object=ops.Tensor): def binary_op_wrapper(x, y): with ops.name_scope(None, op_name, [x, y]) as name: - if isinstance(x, ops.Tensor) and isinstance(y, ops.Tensor): + try: return func(x, y, name=name) - elif not isinstance(y, sparse_tensor.SparseTensor): - try: - y = ops.convert_to_tensor_v2( - y, dtype_hint=x.dtype.base_dtype, name="y") - except TypeError: - # If the RHS is not a tensor, it might be a tensor aware object - # that can implement the operator with knowledge of itself - # and the tensor. - if hasattr(type(y), "__r%s__" % op_name): - return NotImplemented - else: - raise - return func(x, y, name=name) + except (TypeError, ValueError) as e: + # Even if dispatching the op failed, the RHS may be a tensor aware + # object that can implement the operator with knowledge of itself + # and the tensor. + # If the RHS is not tensor aware we still want to raise the + # original error from the LHS, because it may be more + # informative. + if hasattr(type(y), "__r%s__" % op_name): + try: + r_op = getattr(y, "__r%s__" % op_name) + out = r_op(x) + if out == NotImplemented: + raise + return out + except (TypeError, ValueError): + raise e + else: + raise def binary_op_wrapper_sparse(sp_x, y): with ops.name_scope(None, op_name, [sp_x, y]) as name: @@ -1204,7 +1209,7 @@ def _sparse_dense_truediv(sp_indices, sp_values, sp_shape, y, name=None): def _truediv_python3(x, y, name=None): with ops.name_scope(name, "truediv", [x, y]) as name: x = ops.convert_to_tensor(x, name="x") - y = ops.convert_to_tensor(y, name="y") + y = ops.convert_to_tensor(y, dtype_hint=x.dtype.base_dtype, name="y") x_dtype = x.dtype.base_dtype y_dtype = y.dtype.base_dtype if x_dtype != y_dtype: @@ -1402,6 +1407,9 @@ floormod = gen_math_ops.floor_mod def _add_dispatch(x, y, name=None): """Dispatches to add for strings and add_v2 for all other types.""" + if not isinstance(y, ops.Tensor) and not isinstance( + y, sparse_tensor.SparseTensor): + y = ops.convert_to_tensor(y, dtype_hint=x.dtype.base_dtype, name="y") if x.dtype == dtypes.string: return gen_math_ops.add(x, y, name=name) else: @@ -1410,14 +1418,12 @@ def _add_dispatch(x, y, name=None): def _mul_dispatch(x, y, name=None): """Dispatches cwise mul for "Dense*Dense" and "Dense*Sparse".""" - is_tensor_y = isinstance(y, ops.Tensor) - if is_tensor_y: - return gen_math_ops.mul(x, y, name=name) - else: - assert isinstance(y, sparse_tensor.SparseTensor) # Case: Dense * Sparse. + if isinstance(y, sparse_tensor.SparseTensor): # Case: Dense * Sparse. 
new_vals = gen_sparse_ops.sparse_dense_cwise_mul(y.indices, y.values, y.dense_shape, x, name) return sparse_tensor.SparseTensor(y.indices, new_vals, y.dense_shape) + else: + return multiply(x, y, name=name) # NOTE(aselle): When integer division is added for sparse_dense_cwise, @@ -1431,10 +1437,10 @@ _OverrideBinaryOperatorHelper(gen_sparse_ops.sparse_dense_cwise_mul, "mul", sparse_tensor.SparseTensor) _OverrideBinaryOperatorHelper(_add_dispatch, "add") -_OverrideBinaryOperatorHelper(gen_math_ops.sub, "sub") +_OverrideBinaryOperatorHelper(subtract, "sub") _OverrideBinaryOperatorHelper(_mul_dispatch, "mul") -_OverrideBinaryOperatorHelper(_div_python2, "div") -_OverrideBinaryOperatorHelper(_truediv_python3, "truediv") +_OverrideBinaryOperatorHelper(div, "div") +_OverrideBinaryOperatorHelper(truediv, "truediv") _OverrideBinaryOperatorHelper(floordiv, "floordiv") _OverrideBinaryOperatorHelper(gen_math_ops.floor_mod, "mod") _OverrideBinaryOperatorHelper(pow, "pow") @@ -1531,7 +1537,7 @@ def logical_and(x, y, name=None): return gen_math_ops.logical_and(x, y, name) -_OverrideBinaryOperatorHelper(gen_math_ops.logical_and, "and") +_OverrideBinaryOperatorHelper(logical_and, "and") _OverrideBinaryOperatorHelper(gen_math_ops.logical_or, "or") _OverrideBinaryOperatorHelper(logical_xor, "xor") @@ -3088,10 +3094,10 @@ def matmul(a, if not isinstance(a, (ops.EagerTensor, _resource_variable_type)): a = ops.convert_to_tensor(a, name="a") if not isinstance(b, (ops.EagerTensor, _resource_variable_type)): - b = ops.convert_to_tensor(b, name="b") + b = ops.convert_to_tensor(b, dtype_hint=a.dtype.base_dtype, name="b") else: a = ops.convert_to_tensor(a, name="a") - b = ops.convert_to_tensor(b, name="b") + b = ops.convert_to_tensor(b, dtype_hint=a.dtype.base_dtype, name="b") # TODO(apassos) remove _shape_tuple here when it is not needed. 
a_shape = a._shape_tuple() # pylint: disable=protected-access diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 2405eec9e49..9093a06b84a 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -682,6 +682,45 @@ class BinaryOpsTest(test_util.TensorFlowTestCase): a = array_ops.ones([1], dtype=dtypes.int32) + 1.0 self.evaluate(a) + def testRHSDispatchingAndErrorRaising(self): + if context.executing_eagerly(): + error = ValueError + error_message = ( + r"Attempt to convert a value .* with an unsupported type") + else: + error = TypeError + error_message = ( + r"Failed to convert object of type .* to Tensor") + + class RHSReturnsTrue(object): + + def __radd__(self, other): + return True + a = array_ops.ones([1], dtype=dtypes.int32) + RHSReturnsTrue() + self.assertEqual(a, True) + + class RHSRaisesError(object): + + def __radd__(self, other): + raise TypeError("RHS not implemented") + with self.assertRaisesRegexp(error, error_message): + a = array_ops.ones([1], dtype=dtypes.int32) + RHSRaisesError() + self.evaluate(a) + + class RHSReturnsNotImplemented(object): + + def __radd__(self, other): + return NotImplemented + with self.assertRaisesRegexp(error, error_message): + a = array_ops.ones([1], dtype=dtypes.int32) + RHSReturnsNotImplemented() + self.evaluate(a) + + class RHSNotImplemented(object): + pass + with self.assertRaisesRegexp(error, error_message): + a = array_ops.ones([1], dtype=dtypes.int32) + RHSNotImplemented() + self.evaluate(a) + class SignTest(test_util.TensorFlowTestCase): From 987a095f856046f9c088657dd8666f500770279d Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Thu, 21 May 2020 21:11:39 -0700 Subject: [PATCH 1037/1533] Disable multi-threaded Conv optimizations w/ non-const filters The non-ruy, multi-threaded conv implementation performs a filter repack that is cached. This is only correct if the filter itself is constant. Disable this path if the filter is non-const. Fixes #31205. PiperOrigin-RevId: 312792024 Change-Id: I38013b449e52fa96e89f32b553edbd804e793f4b --- tensorflow/lite/kernels/conv.cc | 4 +- tensorflow/lite/kernels/conv_test.cc | 103 +++++++++++++++++++++++++-- 2 files changed, 99 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc index 403adc725eb..154ecfdb96d 100644 --- a/tensorflow/lite/kernels/conv.cc +++ b/tensorflow/lite/kernels/conv.cc @@ -370,8 +370,10 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, } } - // The multi-threaded kernel supports neither dilation nor hybrid kernels. + // The multi-threaded kernel supports neither dilation nor hybrid kernels, and + // requires a constant input filter. data->supports_multithreaded_kernel = + (filter->allocation_type == kTfLiteMmapRo) && (kernel_type == kMultithreadOptimized) && (context->recommended_num_threads != 1) && !is_hybrid && (params->dilation_width_factor == 1) && diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc index 8569809df75..a2201835195 100644 --- a/tensorflow/lite/kernels/conv_test.cc +++ b/tensorflow/lite/kernels/conv_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #include +#include #include #include "absl/memory/memory.h" @@ -39,6 +40,7 @@ namespace { using ::testing::ElementsAreArray; +template class BaseConvolutionOpModel : public SingleOpModel { public: BaseConvolutionOpModel( @@ -47,9 +49,15 @@ class BaseConvolutionOpModel : public SingleOpModel { int stride_height = 2, enum Padding padding = Padding_VALID, enum ActivationFunctionType activation = ActivationFunctionType_NONE, int dilation_width_factor = 1, int dilation_height_factor = 1, - int num_threads = -1) { + int num_threads = -1, + std::initializer_list filter_data = {}) { input_ = AddInput(input); - filter_ = AddInput(filter); + + if (filter_data.size()) { + filter_ = AddConstInput(filter, filter_data); + } else { + filter_ = AddInput(filter); + } int bias_size = GetShape(filter_)[0]; if (input.type == TensorType_FLOAT32) { @@ -115,7 +123,7 @@ class BaseConvolutionOpModel : public SingleOpModel { int output_; }; -class ConvolutionOpModel : public BaseConvolutionOpModel { +class ConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -553,6 +561,85 @@ TEST_P(ConvolutionOpTest, HandCalculatedFloat32) { 234, 261, 121})); } } + + // Change the filter to ensure non-const filter behavior is correct. + m.SetFilter({2, 4, 7, 2, 5, 8, 3, 6, 9}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 313, 359, + 181, 187, 239, 267, 128})); +} + +// TODO(b/157263074): Ideally using a const filter would be a parameterization +// of the test, so we ensure full test coverage with all the different +// types and backends. +TEST_P(ConvolutionOpTest, HandCalculatedFloat32WithConstFilter) { + const int depth = 1; + const int image_width = 4; + const int image_height = 3; + const int image_batch_count = 1; + const int filter_size = 3; + const int filter_count = 1; + const int stride_width = 1; + const int stride_height = 1; + const Padding padding = Padding_SAME; + // The filter matrix is: + // | 1 | 4 | 7 | + // | 2 | 5 | 8 | + // | 3 | 6 | 9 | + const std::initializer_list filter_data = {1, 4, 7, 2, 5, 8, 3, 6, 9}; + ConvolutionOpModel m( + GetRegistration(), + {TensorType_FLOAT32, + {image_batch_count, image_height, image_width, depth}}, + {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, + {TensorType_FLOAT32, {}}, stride_width, stride_height, padding, + ActivationFunctionType_NONE, + /*dilation_width_factor=*/1, + /*dilation_height_factor=*/1, + /*num_threads=*/-1, filter_data); + + // The image matrix is: + // | 1 | 2 | 3 | 4 | + // | 5 | 6 | 7 | 8 | + // | 9 | 10 | 11 | 12 | + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + // No bias for this test. + m.SetBias({0}); + + m.Invoke(); + // We're sliding the 3x3 filter across the 3x4 image, with accesses outside + // the input set to zero because we're using the 'SAME' padding mode. 
+ // The calculations behind the expected output are: + // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105 + // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150 + // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183 + // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95 + // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235 + // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312 + // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357 + // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178 + // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187 + // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234 + // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261 + // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121 + // This means we should end up with this matrix: + // | 105 | 150 | 183 | 95 | + // | 235 | 312 | 357 | 178 | + // | 187 | 234 | 261 | 121 | + EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 312, 357, + 178, 187, 234, 261, 121})); + + // Add an additional test for the multi-threaded case, ensuring stability + // under different thread counts. + if (GetParam() == "MultithreadedOptimized") { + for (int i = 1; i < 4; ++i) { + m.SetNumThreads(i); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({105, 150, 183, 95, 235, 312, 357, 178, 187, + 234, 261, 121})); + } + } } TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { @@ -766,7 +853,7 @@ TEST_P(ConvolutionOpTest, SimpleTestFloatWithDilation) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); } -class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { +class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -986,7 +1073,7 @@ TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithDilation) { ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); } -class HybridConvolutionOpModel : public BaseConvolutionOpModel { +class HybridConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -1325,7 +1412,8 @@ TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterHybrid) { 0.0474))); } -class PerChannelQuantizedConvolutionOpModel : public BaseConvolutionOpModel { +class PerChannelQuantizedConvolutionOpModel + : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -1442,7 +1530,8 @@ TEST_P(ConvolutionOpTest, SimplePerChannelTest) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({61, 127, -115, -93})); } -class HybridPerChannelConvolutionOpModel : public BaseConvolutionOpModel { +class HybridPerChannelConvolutionOpModel + : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; From 18ab11e1465f5f1ef6d323d29569f777dfea87f1 Mon Sep 17 00:00:00 2001 From: Jinliang Wei Date: Thu, 21 May 2020 21:44:14 -0700 Subject: [PATCH 1038/1533] [XLA] Introduce asynchronous collective-permute (CollectivePermuteStart and CollectivePermuteDone) HLO opcodes. 
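Splitting the op into a start/done pair lets the scheduler overlap independent work with the in-flight data movement: the start opcode kicks off the permute and returns a handle-shaped tuple, and the done opcode consumes that tuple once the result is actually needed. The snippet below is a framework-free Python analogy of that start/done contract, not XLA code; the names collective_permute_start/_done and the thread-pool backend are illustrative assumptions only.

from concurrent.futures import ThreadPoolExecutor

def collective_permute(shards, source_target_pairs):
    # Each (source, target) pair moves the source replica's shard to the
    # target position; replicas that are never a target receive zeros.
    result = [0] * len(shards)
    for source, target in source_target_pairs:
        result[target] = shards[source]
    return result

_executor = ThreadPoolExecutor(max_workers=1)

def collective_permute_start(shards, source_target_pairs):
    # "Start" launches the data movement and immediately returns a handle.
    return _executor.submit(collective_permute, shards, source_target_pairs)

def collective_permute_done(handle):
    # "Done" blocks until the movement finishes and yields the permuted data.
    return handle.result()

handle = collective_permute_start([10, 20, 30, 40], [(0, 1), (1, 2), (2, 3)])
# Independent work can run here, overlapping with the in-flight permute.
print(collective_permute_done(handle))  # [0, 10, 20, 30]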
PiperOrigin-RevId: 312794240 Change-Id: I0afa0ed1920fb97ac509ff2075559525265a28e2 --- .../compiler/xla/service/dfs_hlo_visitor.h | 2 + .../service/dfs_hlo_visitor_with_default.h | 6 ++ .../compiler/xla/service/hlo_cost_analysis.cc | 10 ++ .../compiler/xla/service/hlo_cost_analysis.h | 2 + .../compiler/xla/service/hlo_graph_dumper.cc | 2 + .../compiler/xla/service/hlo_instruction.cc | 38 +++++++- .../compiler/xla/service/hlo_instruction.h | 9 +- .../compiler/xla/service/hlo_instructions.cc | 9 +- .../compiler/xla/service/hlo_instructions.h | 2 +- tensorflow/compiler/xla/service/hlo_opcode.h | 2 + tensorflow/compiler/xla/service/hlo_parser.cc | 20 +++- .../compiler/xla/service/hlo_parser_test.cc | 14 +++ .../compiler/xla/service/hlo_verifier.cc | 91 ++++++++++++++----- .../compiler/xla/service/hlo_verifier.h | 2 + .../compiler/xla/service/hlo_verifier_test.cc | 87 +++++++++++++++++- .../xla/service/instruction_fusion.cc | 2 + .../compiler/xla/service/layout_assignment.cc | 2 + 17 files changed, 263 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index caea9d9095a..bdaac32a0e5 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -120,6 +120,8 @@ class DfsHloVisitorBase { virtual Status HandleAllReduce(HloInstructionPtr hlo) = 0; virtual Status HandleAllToAll(HloInstructionPtr hlo) = 0; virtual Status HandleCollectivePermute(HloInstructionPtr hlo) = 0; + virtual Status HandleCollectivePermuteStart(HloInstructionPtr hlo) = 0; + virtual Status HandleCollectivePermuteDone(HloInstructionPtr hlo) = 0; virtual Status HandleReplicaId(HloInstructionPtr hlo) = 0; virtual Status HandlePartitionId(HloInstructionPtr hlo) = 0; virtual Status HandleGetDimensionSize(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index 9cd220245ba..b1d674fe467 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -110,6 +110,12 @@ class DfsHloVisitorWithDefaultBase Status HandleCollectivePermute(HloInstructionPtr hlo) override { return DefaultAction(hlo); } + Status HandleCollectivePermuteStart(HloInstructionPtr hlo) override { + return DefaultAction(hlo); + } + Status HandleCollectivePermuteDone(HloInstructionPtr hlo) override { + return DefaultAction(hlo); + } Status HandleReplicaId(HloInstructionPtr hlo) override { return DefaultAction(hlo); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 32a9038b15a..50ba2077411 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -736,6 +736,16 @@ Status HloCostAnalysis::HandleCollectivePermute(const HloInstruction* /*hlo*/) { return Status::OK(); } +Status HloCostAnalysis::HandleCollectivePermuteStart( + const HloInstruction* /*hlo*/) { + return Status::OK(); +} + +Status HloCostAnalysis::HandleCollectivePermuteDone( + const HloInstruction* /*hlo*/) { + return Status::OK(); +} + Status HloCostAnalysis::HandlePartitionId(const HloInstruction* /*hlo*/) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 9fdb42185fb..634a6c0572c 100644 --- 
a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -80,6 +80,8 @@ class HloCostAnalysis : public ConstDfsHloVisitor { Status HandleAllReduce(const HloInstruction* crs) override; Status HandleAllToAll(const HloInstruction* hlo) override; Status HandleCollectivePermute(const HloInstruction* hlo) override; + Status HandleCollectivePermuteStart(const HloInstruction* hlo) override; + Status HandleCollectivePermuteDone(const HloInstruction* hlo) override; Status HandleReplicaId(const HloInstruction* hlo) override; Status HandlePartitionId(const HloInstruction* hlo) override; Status HandleInfeed(const HloInstruction* infeed) override; diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index cd2a61d7eff..3930898d665 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -1061,6 +1061,8 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kPartitionId: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 9e9c8b0913b..0aadd21d0a1 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -452,7 +452,8 @@ StatusOr> HloInstruction::CreateFromProto( /*channel_id=*/channel_id, split_dimension); break; } - case HloOpcode::kCollectivePermute: { + case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: { std::vector> source_target_pairs( proto.source_target_pairs_size()); absl::optional channel_id; @@ -463,8 +464,17 @@ StatusOr> HloInstruction::CreateFromProto( source_target_pairs[i].first = proto.source_target_pairs(i).source(); source_target_pairs[i].second = proto.source_target_pairs(i).target(); } - instruction = CreateCollectivePermute(shape, operands(0), - source_target_pairs, channel_id); + + if (opcode == HloOpcode::kCollectivePermute) { + instruction = CreateCollectivePermute(shape, operands(0), + source_target_pairs, channel_id); + } else if (opcode == HloOpcode::kCollectivePermuteStart) { + instruction = CreateCollectivePermuteStart( + shape, operands(0), source_target_pairs, channel_id); + } else { + LOG(FATAL) << "Expect CollectivePermute or CollectivePermuteStart, " + << "but got " << HloOpcodeString(opcode); + } break; } case HloOpcode::kReplicaId: { @@ -805,6 +815,7 @@ HloInstruction::CreateRngBitGenerator(const Shape& shape, HloInstruction* state, case HloOpcode::kRoundNearestAfz: case HloOpcode::kBitcast: case HloOpcode::kCeil: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: @@ -982,7 +993,18 @@ HloInstruction::CreateCollectivePermute( const std::vector>& source_target_pairs, const absl::optional& channel_id) { return absl::make_unique( - shape, operand, source_target_pairs, channel_id); + HloOpcode::kCollectivePermute, shape, operand, source_target_pairs, + channel_id); +} + +/* static */ std::unique_ptr +HloInstruction::CreateCollectivePermuteStart( + const Shape& shape, HloInstruction* operand, + const std::vector>& source_target_pairs, + const absl::optional& channel_id) { + return 
absl::make_unique( + HloOpcode::kCollectivePermuteStart, shape, operand, source_target_pairs, + channel_id); } /* static */ std::unique_ptr HloInstruction::CreateReplicaId() { @@ -1549,6 +1571,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: case HloOpcode::kInfeed: case HloOpcode::kOutfeed: case HloOpcode::kConvolution: @@ -1575,6 +1598,7 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kBitcast: case HloOpcode::kCeil: case HloOpcode::kClz: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: @@ -1928,6 +1952,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kCeil: case HloOpcode::kClamp: case HloOpcode::kClz: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kComplex: case HloOpcode::kConvert: case HloOpcode::kCopy: @@ -2029,6 +2054,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: case HloOpcode::kConvolution: case HloOpcode::kCustomCall: case HloOpcode::kReduceWindow: @@ -2888,6 +2914,10 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleAllToAll(this); case HloOpcode::kCollectivePermute: return visitor->HandleCollectivePermute(this); + case HloOpcode::kCollectivePermuteStart: + return visitor->HandleCollectivePermuteStart(this); + case HloOpcode::kCollectivePermuteDone: + return visitor->HandleCollectivePermuteDone(this); case HloOpcode::kReplicaId: return visitor->HandleReplicaId(this); case HloOpcode::kPartitionId: diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 8be7a034877..c6cfda8e505 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -681,7 +681,7 @@ class HloInstruction { const absl::optional& channel_id, const absl::optional& split_dimension = absl::nullopt); - // Creates a communication instructions that permutes data cross replicas. + // Creates a communication instruction that permutes data cross replicas. // Data is sent/received according to the (source_replica_id, // target_replica_id) pairs in `source_target_pairs`. If a replica id is not a // target_replica_id in any pair, the output on that replica is a tensor @@ -691,6 +691,13 @@ class HloInstruction { const std::vector>& source_target_pairs, const absl::optional& channel_id); + // Creates a communication instruction that initiates the start of + // CollectivePermute. + static std::unique_ptr CreateCollectivePermuteStart( + const Shape& shape, HloInstruction* operand, + const std::vector>& source_target_pairs, + const absl::optional& channel_id); + // Creates an instruction that returns a U32 replica ID. 
static std::unique_ptr CreateReplicaId(); diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index d5bdd674563..e33d5960894 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -703,10 +703,10 @@ bool HloAllToAllInstruction::IdenticalSlowPath( } HloCollectivePermuteInstruction::HloCollectivePermuteInstruction( - const Shape& shape, HloInstruction* operand, + HloOpcode opcode, const Shape& shape, HloInstruction* operand, const std::vector>& source_target_pairs, const absl::optional& channel_id) - : HloChannelInstruction(HloOpcode::kCollectivePermute, shape, channel_id), + : HloChannelInstruction(opcode, shape, channel_id), source_target_pairs_(source_target_pairs) { AppendOperand(operand); } @@ -738,6 +738,9 @@ bool HloCollectivePermuteInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& eq_computations) const { + if (opcode() != other.opcode()) { + return false; + } const auto& casted_other = static_cast(other); return HloChannelInstruction::IdenticalSlowPath(other, eq_computations) && @@ -752,7 +755,7 @@ HloCollectivePermuteInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* /*context*/) const { return absl::make_unique( - shape, new_operands[0], source_target_pairs(), channel_id()); + opcode(), shape, new_operands[0], source_target_pairs(), channel_id()); } HloReverseInstruction::HloReverseInstruction(const Shape& shape, diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index ae78d365cfa..7f06c801e38 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -463,7 +463,7 @@ class HloAllToAllInstruction : public HloCollectiveInstruction { class HloCollectivePermuteInstruction : public HloChannelInstruction { public: explicit HloCollectivePermuteInstruction( - const Shape& shape, HloInstruction* operand, + HloOpcode opcode, const Shape& shape, HloInstruction* operand, const std::vector>& source_target_pairs, const absl::optional& channel_id); diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index 664fa10a990..92359bcbdac 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -63,6 +63,8 @@ namespace xla { V(kCholesky, "cholesky", 1) \ V(kClamp, "clamp", 3) \ V(kCollectivePermute, "collective-permute", 1) \ + V(kCollectivePermuteStart, "collective-permute-start", 1) \ + V(kCollectivePermuteDone, "collective-permute-done", 1) \ V(kClz, "count-leading-zeros", 1) \ V(kCompare, "compare", 2) \ V(kComplex, "complex", 2) \ diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index f1908bcb996..d52a60d2555 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -765,6 +765,7 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, case HloOpcode::kBitcast: case HloOpcode::kCeil: case HloOpcode::kClz: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kCopy: case HloOpcode::kCopyStart: case HloOpcode::kCopyDone: @@ -938,7 +939,8 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, split_dimension)); break; } - case HloOpcode::kCollectivePermute: { + case 
HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteStart: { optional>> source_targets; attrs["source_target_pairs"] = { /*required=*/true, AttrTy::kBracedInt64ListList, &source_targets}; @@ -957,9 +959,19 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, pairs[i].first = (*source_targets)[i][0]; pairs[i].second = (*source_targets)[i][1]; } - instruction = - builder->AddInstruction(HloInstruction::CreateCollectivePermute( - shape, operands[0], pairs, channel_id)); + if (opcode == HloOpcode::kCollectivePermute) { + instruction = + builder->AddInstruction(HloInstruction::CreateCollectivePermute( + shape, operands[0], pairs, channel_id)); + } else if (opcode == HloOpcode::kCollectivePermuteStart) { + instruction = builder->AddInstruction( + HloInstruction::CreateCollectivePermuteStart(shape, operands[0], + pairs, channel_id)); + } else { + LOG(FATAL) << "Expect opcode to be CollectivePermute or " + "CollectivePermuteStart, but got " + << HloOpcodeString(opcode); + } break; } case HloOpcode::kReplicaId: { diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index 8f63835b43d..a687d0e1921 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -1553,6 +1553,20 @@ ENTRY CollectivePermute { ROOT root = f32[128,32]{0,1} collective-permute(input), source_target_pairs={{0,1},{1,2},{2,3}} } +)", +/*replica_count=*/4 +}, +// collective-permute-start and -done +{ +"CollectivePermuteStartAndDone", +R"(HloModule CollectivePermuteStartAndDone + +ENTRY CollectivePermuteStartAndDone { + input = f32[128,32]{0,1} parameter(0) + collective-permute-start.1 = (f32[128,32]{0,1}, f32[128,32]{0,1}, u32[], u32[]) collective-permute-start(input), source_target_pairs={{0,1},{1,2},{2,3}} + ROOT collective-permute-done.1 = f32[128,32]{0,1} collective-permute-done(collective-permute-start.1) +} + )", /*replica_count=*/4 }, diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index d15a36532eb..4661b8fd9e3 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -74,7 +74,6 @@ Status CheckParameterCount(const HloInstruction* calling_instruction, } return Status::OK(); } - } // namespace Status ShapeVerifier::Preprocess(HloInstruction* hlo) { @@ -332,7 +331,9 @@ Status ShapeVerifier::HandleReplicaId(HloInstruction* hlo) { return CheckShape(hlo, ShapeUtil::MakeShape(U32, {})); } -Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) { +namespace { + +Status CheckDuplicatedSourceOrTarget(HloInstruction* hlo) { // A source or target cannot appear twice in the collective-permute's // source-target pairs. 
absl::flat_hash_set seen_sources; @@ -351,10 +352,30 @@ Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) { p.second, hlo->ToString()); } } + return Status::OK(); +} + +} // namespace + +Status ShapeVerifier::HandleCollectivePermute(HloInstruction* hlo) { + TF_RETURN_IF_ERROR(CheckDuplicatedSourceOrTarget(hlo)); return CheckShape(hlo, ShapeInference::InferCollectivePermuteShape( hlo->operand(0)->shape())); } +Status ShapeVerifier::HandleCollectivePermuteStart(HloInstruction* hlo) { + TF_RETURN_IF_ERROR(CheckDuplicatedSourceOrTarget(hlo)); + return CheckShape( + hlo, ShapeUtil::MakeTupleShape( + {hlo->operand(0)->shape(), hlo->operand(0)->shape(), + ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeShape(U32, {})})); +} + +Status ShapeVerifier::HandleCollectivePermuteDone(HloInstruction* hlo) { + return CheckShape( + hlo, ShapeUtil::GetTupleElementShape(hlo->operand(0)->shape(), 0)); +} + Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) { return CheckShape(reduce_precision, ShapeInference::InferReducePrecisionShape( reduce_precision->operand(0)->shape(), @@ -1375,32 +1396,60 @@ Status CheckSameIsHostTransfer(const HloInstruction* instr1, return Status::OK(); } -// Checks CopyStart and CopyDone nodes. -Status VerifyAsynchronousCopies(const HloModule& module) { +Status VerifySingleUser(const HloInstruction* instruction, + HloOpcode expected_user) { + TF_RET_CHECK(instruction->users().size() == 1) + << "The " << HloOpcodeString(instruction->opcode()) + << " instruction requires one consumer, found " + << instruction->users().size(); + + const HloInstruction* user = instruction->users().front(); + TF_RET_CHECK(user->opcode() == expected_user) + << "The consumer of a " << HloOpcodeString(instruction->opcode()) + << " instruction needs to be " << HloOpcodeString(expected_user) + << ", found " << HloOpcodeString(user->opcode()); + return Status::OK(); +} + +Status VerifySingleOperand(const HloInstruction* instruction, + HloOpcode expected_operand) { + TF_RET_CHECK(instruction->operands().size() == 1) + << "The " << HloOpcodeString(instruction->opcode()) + << " instruction requires one consumer, found " + << instruction->users().size(); + + const HloInstruction* operand = instruction->operand(0); + TF_RET_CHECK(operand->opcode() == expected_operand) + << "The operand of a " << HloOpcodeString(instruction->opcode()) + << " instruction needs to be " << HloOpcodeString(expected_operand) + << ", found " << HloOpcodeString(operand->opcode()); + return Status::OK(); +} + +// Checks asynchronous instruction pairs. +Status VerifyAsynchronousInstructionPairs(const HloModule& module) { // CopyStart must have a single CopyDone user. 
for (const HloComputation* computation : module.computations()) { for (const HloInstruction* instruction : computation->instructions()) { switch (instruction->opcode()) { case HloOpcode::kCopyStart: { - TF_RET_CHECK(instruction->users().size() == 1) - << "CopyStart instruction requires one consumer, found " - << instruction->users().size(); - const HloInstruction* copy_done = instruction->users().front(); - TF_RET_CHECK(copy_done->opcode() == HloOpcode::kCopyDone) - << "The consumer of a CopyStart instruction needs to be " - "CopyDone, found " - << HloOpcodeString(copy_done->opcode()); + TF_RETURN_IF_ERROR( + VerifySingleUser(instruction, HloOpcode::kCopyDone)); break; } case HloOpcode::kCopyDone: { - TF_RET_CHECK(instruction->operands().size() == 1) - << "CopyDone instruction requires one operand, found " - << instruction->operands().size(); - const HloInstruction* copy_start = instruction->operand(0); - TF_RET_CHECK(copy_start->opcode() == HloOpcode::kCopyStart) - << "The operand of a CopyDone instruction needs to be CopyStart, " - "found " - << HloOpcodeString(copy_start->opcode()); + TF_RETURN_IF_ERROR( + VerifySingleOperand(instruction, HloOpcode::kCopyStart)); + break; + } + case HloOpcode::kCollectivePermuteStart: { + TF_RETURN_IF_ERROR( + VerifySingleUser(instruction, HloOpcode::kCollectivePermuteDone)); + break; + } + case HloOpcode::kCollectivePermuteDone: { + TF_RETURN_IF_ERROR(VerifySingleOperand( + instruction, HloOpcode::kCollectivePermuteStart)); break; } default: @@ -1815,7 +1864,7 @@ StatusOr HloVerifier::Run(HloModule* module) { } TF_RETURN_IF_ERROR(VerifyHloStructure(module)); - TF_RETURN_IF_ERROR(VerifyAsynchronousCopies(*module)); + TF_RETURN_IF_ERROR(VerifyAsynchronousInstructionPairs(*module)); TF_RETURN_IF_ERROR(VerifyChannels(*module)); std::unique_ptr shape_verifier = diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 7a2d3dc2e6c..85b02e0518c 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -60,6 +60,8 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleAllReduce(HloInstruction* crs) override; Status HandleAllToAll(HloInstruction* hlo) override; Status HandleCollectivePermute(HloInstruction* hlo) override; + Status HandleCollectivePermuteStart(HloInstruction* hlo) override; + Status HandleCollectivePermuteDone(HloInstruction* hlo) override; Status HandlePartitionId(HloInstruction* hlo) override; Status HandleReplicaId(HloInstruction* hlo) override; Status HandleReducePrecision(HloInstruction* reduce_precision) override; diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index e2c363e40c5..294dfbf66fa 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -710,7 +710,7 @@ TEST_F(HloVerifierTest, CopyStartMultipleCopyDone) { ASSERT_FALSE(status.ok()); EXPECT_THAT( status.error_message(), - HasSubstr("CopyStart instruction requires one consumer, found 2")); + HasSubstr("copy-start instruction requires one consumer, found 2")); } TEST_F(HloVerifierTest, CopyDoneNoCopyStart) { @@ -730,8 +730,8 @@ TEST_F(HloVerifierTest, CopyDoneNoCopyStart) { auto status = verifier().Run(module.get()).status(); ASSERT_FALSE(status.ok()); EXPECT_THAT(status.error_message(), - HasSubstr("The operand of a CopyDone instruction needs to be " - "CopyStart, found tuple")); + HasSubstr("The operand of a copy-done 
instruction needs to be " + "copy-start, found tuple")); } TEST_F(HloVerifierTest, IotaNonArrayResult) { @@ -1134,5 +1134,86 @@ TEST_F(HloVerifierTest, CollectiveChannelVerifier) { HasSubstr("used for different types of channel instructions")); } +TEST_F(HloVerifierTestLayoutSensitive, CollectivePermuteStartAndDone) { + const char* const kModuleStr = R"( + HloModule Module + + ENTRY CollectivePermuteStartAndDone { + p0 = f32[2,3]{1,0:S(1)} parameter(0) + collective-permute-start.1 = (f32[2,3]{1,0:S(1)}, f32[2,3]{1,0:S(1)}, u32[], u32[]) collective-permute-start(p0), source_target_pairs={{0,1},{1,0}}, channel_id=1 + ROOT collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(collective-permute-start.1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_TRUE(status.ok()); +} + +TEST_F(HloVerifierTest, CollectivePermuteStartAndDoneWrongType) { + const char* const kModuleStr = R"( + HloModule Module + + ENTRY CollectivePermuteStartAndDoneWrongType { + p0 = f32[2,3]{1,0:S(1)} parameter(0) + collective-permute-start.1 = f32[2,3]{1,0:S(1)} collective-permute-start(p0), source_target_pairs={{0,1},{1,0}}, channel_id=1 + ROOT collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(collective-permute-start.1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("Expected instruction to have shape equal to " + "(f32[2,3], f32[2,3], u32[], u32[])")); +} + +TEST_F(HloVerifierTest, CollectivePermuteStartAndMultipleDone) { + const char* const kModuleStr = R"( + HloModule Module + + ENTRY CollectivePermuteStartAndMultipleDone { + p0 = f32[2,3]{1,0:S(1)} parameter(0) + collective-permute-start.1 = (f32[2,3]{1,0:S(1)}, f32[2,3]{1,0:S(1)}, u32[], u32[]) collective-permute-start(p0), source_target_pairs={{0,1},{1,0}}, channel_id=1 + collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(collective-permute-start.1) + ROOT collective-permute-done.2 = f32[2,3]{1,0:S(1)} collective-permute-done(collective-permute-start.1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT( + status.error_message(), + HasSubstr("collective-permute-start instruction requires one consumer, " + "found 2")); +} + +TEST_F(HloVerifierTest, CollectivePermuteDoneNoCollectivePermuteStart) { + const char* const kModuleStr = R"( + HloModule Module + + ENTRY CollectivePermuteDoneNoCollectivePermuteStart { + p0 = f32[2,3]{1,0:S(1)} parameter(0) + p1 = f32[2,3]{1,0:S(1)} parameter(1) + p2 = u32[] parameter(2) + tuple.1 = (f32[2,3], f32[2,3], u32[], u32[]) tuple(p0, p1, p2) + ROOT collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(tuple.1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + + auto status = verifier().Run(module.get()).status(); + ASSERT_FALSE(status.ok()); + EXPECT_THAT(status.error_message(), + HasSubstr("The operand of a collective-permute-done instruction " + "needs to be collective-permute-start, found tuple")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 
5de081c6343..02966cc2bf2 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -149,6 +149,8 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) { case HloOpcode::kAllReduce: case HloOpcode::kAllToAll: case HloOpcode::kCollectivePermute: + case HloOpcode::kCollectivePermuteDone: + case HloOpcode::kCollectivePermuteStart: case HloOpcode::kCustomCall: case HloOpcode::kDomain: case HloOpcode::kDot: diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 13699f3adf9..82c30f1a710 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -2234,6 +2234,8 @@ bool LayoutAssignment::InstructionCanChangeLayout( case HloOpcode::kBitcast: case HloOpcode::kBroadcast: case HloOpcode::kCall: + case HloOpcode::kCollectivePermuteStart: + case HloOpcode::kCollectivePermuteDone: case HloOpcode::kConstant: case HloOpcode::kConvolution: case HloOpcode::kCopy: From dade83541f5d009e7d3a52191837f3fb3a1fd8ae Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 21 May 2020 22:04:08 -0700 Subject: [PATCH 1039/1533] Disable multi-threaded Conv optimizations w/ non-const filters The non-ruy, multi-threaded conv implementation performs a filter repack that is cached. This is only correct if the filter itself is constant. Disable this path if the filter is non-const. Fixes #31205. PiperOrigin-RevId: 312795693 Change-Id: I08ddfd2449247d427b860e5678494f9cb88cbef2 --- tensorflow/lite/kernels/conv.cc | 4 +- tensorflow/lite/kernels/conv_test.cc | 103 ++------------------------- 2 files changed, 8 insertions(+), 99 deletions(-) diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc index 154ecfdb96d..403adc725eb 100644 --- a/tensorflow/lite/kernels/conv.cc +++ b/tensorflow/lite/kernels/conv.cc @@ -370,10 +370,8 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, } } - // The multi-threaded kernel supports neither dilation nor hybrid kernels, and - // requires a constant input filter. + // The multi-threaded kernel supports neither dilation nor hybrid kernels. data->supports_multithreaded_kernel = - (filter->allocation_type == kTfLiteMmapRo) && (kernel_type == kMultithreadOptimized) && (context->recommended_num_threads != 1) && !is_hybrid && (params->dilation_width_factor == 1) && diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc index a2201835195..8569809df75 100644 --- a/tensorflow/lite/kernels/conv_test.cc +++ b/tensorflow/lite/kernels/conv_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #include -#include #include #include "absl/memory/memory.h" @@ -40,7 +39,6 @@ namespace { using ::testing::ElementsAreArray; -template class BaseConvolutionOpModel : public SingleOpModel { public: BaseConvolutionOpModel( @@ -49,15 +47,9 @@ class BaseConvolutionOpModel : public SingleOpModel { int stride_height = 2, enum Padding padding = Padding_VALID, enum ActivationFunctionType activation = ActivationFunctionType_NONE, int dilation_width_factor = 1, int dilation_height_factor = 1, - int num_threads = -1, - std::initializer_list filter_data = {}) { + int num_threads = -1) { input_ = AddInput(input); - - if (filter_data.size()) { - filter_ = AddConstInput(filter, filter_data); - } else { - filter_ = AddInput(filter); - } + filter_ = AddInput(filter); int bias_size = GetShape(filter_)[0]; if (input.type == TensorType_FLOAT32) { @@ -123,7 +115,7 @@ class BaseConvolutionOpModel : public SingleOpModel { int output_; }; -class ConvolutionOpModel : public BaseConvolutionOpModel { +class ConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -561,85 +553,6 @@ TEST_P(ConvolutionOpTest, HandCalculatedFloat32) { 234, 261, 121})); } } - - // Change the filter to ensure non-const filter behavior is correct. - m.SetFilter({2, 4, 7, 2, 5, 8, 3, 6, 9}); - m.Invoke(); - EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 313, 359, - 181, 187, 239, 267, 128})); -} - -// TODO(b/157263074): Ideally using a const filter would be a parameterization -// of the test, so we ensure full test coverage with all the different -// types and backends. -TEST_P(ConvolutionOpTest, HandCalculatedFloat32WithConstFilter) { - const int depth = 1; - const int image_width = 4; - const int image_height = 3; - const int image_batch_count = 1; - const int filter_size = 3; - const int filter_count = 1; - const int stride_width = 1; - const int stride_height = 1; - const Padding padding = Padding_SAME; - // The filter matrix is: - // | 1 | 4 | 7 | - // | 2 | 5 | 8 | - // | 3 | 6 | 9 | - const std::initializer_list filter_data = {1, 4, 7, 2, 5, 8, 3, 6, 9}; - ConvolutionOpModel m( - GetRegistration(), - {TensorType_FLOAT32, - {image_batch_count, image_height, image_width, depth}}, - {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, - {TensorType_FLOAT32, {}}, stride_width, stride_height, padding, - ActivationFunctionType_NONE, - /*dilation_width_factor=*/1, - /*dilation_height_factor=*/1, - /*num_threads=*/-1, filter_data); - - // The image matrix is: - // | 1 | 2 | 3 | 4 | - // | 5 | 6 | 7 | 8 | - // | 9 | 10 | 11 | 12 | - m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); - // No bias for this test. - m.SetBias({0}); - - m.Invoke(); - // We're sliding the 3x3 filter across the 3x4 image, with accesses outside - // the input set to zero because we're using the 'SAME' padding mode. 
- // The calculations behind the expected output are: - // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105 - // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150 - // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183 - // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95 - // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235 - // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312 - // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357 - // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178 - // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187 - // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234 - // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261 - // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121 - // This means we should end up with this matrix: - // | 105 | 150 | 183 | 95 | - // | 235 | 312 | 357 | 178 | - // | 187 | 234 | 261 | 121 | - EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 312, 357, - 178, 187, 234, 261, 121})); - - // Add an additional test for the multi-threaded case, ensuring stability - // under different thread counts. - if (GetParam() == "MultithreadedOptimized") { - for (int i = 1; i < 4; ++i) { - m.SetNumThreads(i); - m.Invoke(); - EXPECT_THAT(m.GetOutput(), - ElementsAreArray({105, 150, 183, 95, 235, 312, 357, 178, 187, - 234, 261, 121})); - } - } } TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { @@ -853,7 +766,7 @@ TEST_P(ConvolutionOpTest, SimpleTestFloatWithDilation) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); } -class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { +class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -1073,7 +986,7 @@ TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithDilation) { ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); } -class HybridConvolutionOpModel : public BaseConvolutionOpModel { +class HybridConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -1412,8 +1325,7 @@ TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterHybrid) { 0.0474))); } -class PerChannelQuantizedConvolutionOpModel - : public BaseConvolutionOpModel { +class PerChannelQuantizedConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -1530,8 +1442,7 @@ TEST_P(ConvolutionOpTest, SimplePerChannelTest) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({61, 127, -115, -93})); } -class HybridPerChannelConvolutionOpModel - : public BaseConvolutionOpModel { +class HybridPerChannelConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; From e761103744b40474bd85caa58d02fbfcecc118f2 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Thu, 21 May 2020 22:24:50 -0700 Subject: [PATCH 1040/1533] [XLA] Add AllGather to the HLO matchers. 
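For context on the op the new matcher targets: an all-gather concatenates the per-replica shards of a tensor along one dimension and hands every replica the same concatenated result. The sketch below is a plain NumPy illustration of that semantics for a 1-D sharding; it is an assumption-laden stand-in, not the XLA implementation or the matcher API.

import numpy as np

def all_gather(per_replica_shards, dim):
    # Every replica ends up with the same concatenation of all shards.
    gathered = np.concatenate(per_replica_shards, axis=dim)
    return [gathered.copy() for _ in per_replica_shards]

shards = [np.array([[0, 1]]), np.array([[2, 3]]), np.array([[4, 5]])]
outputs = all_gather(shards, dim=0)
print(outputs[0])  # every replica sees [[0 1] [2 3] [4 5]]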
PiperOrigin-RevId: 312797129 Change-Id: I6a862f34a3b2331d99fe0bd242e21f26da7ed99e --- tensorflow/compiler/xla/service/hlo_matchers.h | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index ec048bef9e8..cb1b1d0dae4 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -203,6 +203,7 @@ HLO_MATCHER(Abs); HLO_MATCHER(Add); HLO_MATCHER(AddDependency); HLO_MATCHER(AfterAll); +HLO_MATCHER(AllGather); HLO_MATCHER(AllReduce); HLO_MATCHER(AllToAll); HLO_MATCHER(And); From c27d431c86d6d144d01d047effb2941bac943512 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Thu, 21 May 2020 23:04:00 -0700 Subject: [PATCH 1041/1533] [XLA] CopyStart/CopyDone times should use exclusive indices, not inclusive. PiperOrigin-RevId: 312800046 Change-Id: I6eebe507125841004b504d8fa1b680d69bcb4789 --- .../compiler/xla/service/memory_space_assignment.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index bd7a10248b6..e07431bf46f 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -290,7 +290,7 @@ void CostAnalysisPrefetchIntervalPicker::Begin(const HloUse& use, end_logical_time_ = end_time; // Find the earliest time we're allowed to start prefetching. for (current_logical_prefetch_time_ = start_time; - current_logical_prefetch_time_ <= end_logical_time_ && + current_logical_prefetch_time_ < end_logical_time_ && max_async_copy_to_overlap_ratio_ * async_copy_elapsed_ < GetLogicalIntervalElapsed(current_logical_prefetch_time_, end_logical_time_); @@ -305,9 +305,9 @@ int64 CostAnalysisPrefetchIntervalPicker::Next() { } bool CostAnalysisPrefetchIntervalPicker::Done() const { - // The end time is inclusive, so we're done if the prefetch time is greater - // than that. - if (current_logical_prefetch_time_ > end_logical_time_) { + // The end time is exclusive, so we're done if the prefetch time is greater + // than or equal to the end time. + if (current_logical_prefetch_time_ >= end_logical_time_) { return true; } float logical_interval_elapsed = GetLogicalIntervalElapsed( @@ -1473,6 +1473,7 @@ void AlternateMemoryBestFitHeap::AddAsyncCopy( : "alternate") << " memory between " << start_time << " and " << copy_done_schedule_before_time << " keeping until " << end_time; + CHECK_LT(start_time, copy_done_schedule_before_time); allocations->push_back( absl::make_unique( @@ -1760,6 +1761,7 @@ bool AlternateMemoryBestFitHeap::Prefetch( alternate_mem_interval.size = request.size; while (!options_.prefetch_interval_picker->Done()) { alternate_mem_interval.start = options_.prefetch_interval_picker->Next(); + CHECK_LT(alternate_mem_interval.start, request.latest_prefetch_time); VLOG(4) << "Trying alternate memory allocation (" << alternate_mem_interval.start << ", " << request.end_time << ")"; // If this additional asynchronous copy would violate the limit, try a From 63f70b5611d7f50512ea26295d26016c2704901b Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Thu, 21 May 2020 23:11:46 -0700 Subject: [PATCH 1042/1533] Reduce Layer.__call__ overhead by ~5-10%. Autocasting now only calls expensive nest.map_structure when Tensors need to be autocast. In the common case where Tensors are passed with the correct dtype, minimal work is performed. 
PiperOrigin-RevId: 312800528 Change-Id: I25cc00c3309ea48b6fdc5ce6915701b960907008 --- tensorflow/python/keras/engine/base_layer.py | 48 +++++++++++--------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 53d8cc5ab34..b34616632e3 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -912,7 +912,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # Build layer if applicable (if the `build` method has been # overridden). self._maybe_build(inputs) - cast_inputs = self._maybe_cast_inputs(inputs) + cast_inputs = self._maybe_cast_inputs(inputs, input_list) if not self.dynamic: # Wrapping `call` function in autograph to allow for dynamic control @@ -982,7 +982,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # Eager execution on data tensors. with backend.name_scope(self._name_scope()): self._maybe_build(inputs) - cast_inputs = self._maybe_cast_inputs(inputs) + cast_inputs = self._maybe_cast_inputs(inputs, input_list) with base_layer_utils.autocast_context_manager( self._compute_dtype): outputs = self.call(cast_inputs, *args, **kwargs) @@ -2117,7 +2117,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): """ return self._dtype_policy.compute_dtype - def _maybe_cast_inputs(self, inputs): + def _maybe_cast_inputs(self, inputs, input_list): """Maybe casts the inputs to the compute dtype. If self._compute_dtype is floating-point, and self_autocast is True, @@ -2125,32 +2125,38 @@ class Layer(module.Module, version_utils.LayerVersionSelector): Args: inputs: Input tensor, or structure of input tensors. + input_list: Flat list of input tensors. Returns: `inputs`, but tensors may have been casted to self._compute_dtype """ compute_dtype = self._compute_dtype - if (self._autocast and compute_dtype and - dtypes.as_dtype(compute_dtype).is_floating): - def f(x): - """Cast a single Tensor or TensorSpec to the compute dtype.""" - cast_types = (ops.Tensor, sparse_tensor.SparseTensor, - ragged_tensor.RaggedTensor) - if (isinstance(x, cast_types) and x.dtype.is_floating and - x.dtype.base_dtype.name != compute_dtype): - if self._dtype_defaulted_to_floatx: - self._warn_about_input_casting(x.dtype.base_dtype) - return math_ops.cast(x, compute_dtype) - elif isinstance(x, tensor_spec.TensorSpec) and x.dtype.is_floating: - # Inputs may be TensorSpecs when this function is called from - # model._set_inputs. - return tensor_spec.TensorSpec(x.shape, compute_dtype, x.name) - else: - return x - return nest.map_structure(f, inputs) + should_autocast = ( + self._autocast and compute_dtype and + dtypes.as_dtype(compute_dtype).is_floating) + + if (should_autocast and + any(self._should_cast_single_input(x) for x in input_list)): + # Only perform expensive `nest` operation when needed. 
+ return nest.map_structure(self._cast_single_input, inputs) else: return inputs + def _should_cast_single_input(self, x): + cast_types = (ops.Tensor, sparse_tensor.SparseTensor, + ragged_tensor.RaggedTensor) + return (isinstance(x, cast_types) and x.dtype.is_floating and + x.dtype.base_dtype.name != self._compute_dtype) + + def _cast_single_input(self, x): + """Cast a single Tensor or TensorSpec to the compute dtype.""" + if self._should_cast_single_input(x): + if self._dtype_defaulted_to_floatx: + self._warn_about_input_casting(x.dtype.base_dtype) + return math_ops.cast(x, self._compute_dtype) + else: + return x + def _warn_about_input_casting(self, input_dtype): # self._already_warned_about_input_casting is only retrieved or set in this # function. From a68b15fee7a9ad6e2f7d1932dcb155f0ed697aba Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Fri, 22 May 2020 00:47:27 -0700 Subject: [PATCH 1043/1533] [XLA] Use all-gather in SPMD to replicate a tiled tensor. PiperOrigin-RevId: 312806463 Change-Id: If0fde80b91f1302256694554fe0cd645ad210df0 --- .../xla/service/spmd/spmd_partitioner.cc | 191 +++++++++++++----- .../xla/service/spmd/spmd_partitioner.h | 24 ++- .../xla/service/spmd/spmd_partitioner_test.cc | 8 +- 3 files changed, 175 insertions(+), 48 deletions(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index b857c8bdbe6..090fcd48893 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -670,26 +670,34 @@ PartitionedHlo PartitionedHlo::Replicate() { } // 'Tiled' to 'Replicated'. + HloInstruction* result = nullptr; + if (state_.collective_ops_creator.create_cross_partition_all_gather) { + result = state_.partitioner->AllGatherShards(state_.b, hlo_, sharding, + NewChannel()); + } Shape padded_base_shape = shape; for (int64 i = 0; i < padded_base_shape.rank(); ++i) { padded_base_shape.set_dimensions( i, shape.dimensions(i) * sharding.tile_assignment().dim(i)); } - auto zero = state_.b->AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); - auto zero_bcast = state_.b->AddInstruction( - HloInstruction::CreateBroadcast(padded_base_shape, zero, {})); - auto dus = state_.b->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( - padded_base_shape, zero_bcast, hlo_, - MakePartitionOffsets(padded_base_shape, sharding, state_.partition_id, - state_.b))); - HloComputation* reduction = - MakeBinaryAdd(shape.element_type(), state_.module); + if (result == nullptr) { + auto zero = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(padded_base_shape, zero, {})); + auto dus = + state_.b->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + padded_base_shape, zero_bcast, hlo_, + MakePartitionOffsets(padded_base_shape, sharding, + state_.partition_id, state_.b))); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); - auto all_reduce = - state_.collective_ops_creator.create_cross_partition_all_reduce( - state_.b, dus, reduction, NewChannel()); - HloInstruction* result = all_reduce; + auto all_reduce = + state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, dus, reduction, NewChannel()); + result = all_reduce; + } if (!ShapeUtil::Compatible(base_shape_, padded_base_shape)) { std::vector start_indices(shape.rank(), 
0); std::vector strides(shape.rank(), 1); @@ -4449,42 +4457,133 @@ Status SpmdPartitioningVisitor::HandlePartitionId(HloInstruction* hlo) { "the data is replicated, and if the latter which data is replicated."); } +SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64 num_partitions, + int64 num_replicas) { + return { + [](SpmdBuilder* b) { + return b->AddInstruction(HloInstruction::CreatePartitionId()); + }, + [num_replicas](SpmdBuilder* b, HloInstruction* operand, + HloComputation* reduction, int64 channel_id) { + return b->AddInstruction(HloInstruction::CreateAllReduce( + operand->shape(), {operand}, reduction, + CreateReplicaGroups(num_replicas), + /*constrain_layout=*/false, channel_id, + /*use_global_device_ids=*/false)); + }, + [](SpmdBuilder* b, HloInstruction* operand, + std::vector>& src_dst_pairs, + int64 channel_id) { + return b->AddInstruction(HloInstruction::CreateCollectivePermute( + operand->shape(), operand, src_dst_pairs, channel_id)); + }, + [](SpmdBuilder* b, absl::Span operands, + const std::vector& replica_groups, int64 channel_id, + absl::optional split_dimension) { + std::vector shapes(operands.size(), operands[0]->shape()); + const Shape output_shape = (shapes.size() == 1) + ? shapes[0] + : ShapeUtil::MakeTupleShape(shapes); + return b->AddInstruction(HloInstruction::CreateAllToAll( + output_shape, operands, replica_groups, + /*constrain_layout=*/false, channel_id, split_dimension)); + }, + [num_replicas, num_partitions]( + SpmdBuilder* b, HloInstruction* operand, const Shape& ag_shape, + const std::vector>& partition_subgroups, + int64 channel_id, int64 all_gather_dimension) { + std::vector device_groups; + device_groups.reserve(partition_subgroups.size() * num_replicas); + for (int64 i = 0; i < num_replicas; ++i) { + for (const auto& pgroup : partition_subgroups) { + device_groups.emplace_back(); + for (int64 pid : pgroup) { + device_groups.back().add_replica_ids(i * num_partitions + pid); + } + } + } + return b->AddInstruction(HloInstruction::CreateAllGather( + ag_shape, operand, all_gather_dimension, device_groups, + /*constrain_layout=*/false, channel_id, + /*use_global_device_ids=*/true)); + }, + }; +} + SpmdPartitioner::SpmdPartitioner(int64 num_partitions, int64 num_replicas, SpmdPartitionerOptions options) : SpmdPartitioner( num_partitions, num_replicas, std::move(options), - SPMDCollectiveOpsCreator{ - [](SpmdBuilder* b) { - return b->AddInstruction(HloInstruction::CreatePartitionId()); - }, - [num_replicas](SpmdBuilder* b, HloInstruction* operand, - HloComputation* reduction, int64 channel_id) { - return b->AddInstruction(HloInstruction::CreateAllReduce( - operand->shape(), {operand}, reduction, - CreateReplicaGroups(num_replicas), - /*constrain_layout=*/false, channel_id, - /*use_global_device_ids=*/false)); - }, - [](SpmdBuilder* b, HloInstruction* operand, - std::vector>& src_dst_pairs, - int64 channel_id) { - return b->AddInstruction( - HloInstruction::CreateCollectivePermute( - operand->shape(), operand, src_dst_pairs, channel_id)); - }, - [](SpmdBuilder* b, absl::Span operands, - const std::vector& replica_groups, - int64 channel_id, absl::optional split_dimension) { - std::vector shapes(operands.size(), - operands[0]->shape()); - const Shape output_shape = - (shapes.size() == 1) ? 
shapes[0] - : ShapeUtil::MakeTupleShape(shapes); - return b->AddInstruction(HloInstruction::CreateAllToAll( - output_shape, operands, replica_groups, - /*constrain_layout=*/false, channel_id, split_dimension)); - }, - }) {} + GetDefaultCollectiveOpsCreator(num_partitions, num_replicas)) {} + +HloInstruction* SpmdPartitioner::AllGatherShards(SpmdBuilder* b, + HloInstruction* operand, + const HloSharding& sharding, + int64 channel_id) { + CHECK(!sharding.IsTileMaximal()); + // Add one leading dimension to gather all partitions. + std::vector shape; + shape.push_back(1); + for (int64 dim : operand->shape().dimensions()) { + shape.push_back(dim); + } + auto reshape = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(operand->shape().element_type(), shape), operand)); + std::vector> partition_subgroups(1); + for (int64 pid : sharding.tile_assignment()) { + partition_subgroups[0].push_back(pid); + } + shape[0] = sharding.tile_assignment().num_elements(); + auto result = collective_ops_creator_.create_cross_partition_all_gather( + b, reshape, ShapeUtil::MakeShape(operand->shape().element_type(), shape), + partition_subgroups, channel_id, /*all_gather_dimension=*/0); + // If n > 1 dimensions are partitioned, split the leading dimension to n. + std::vector tiled_dims; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (sharding.tile_assignment().dim(i) > 1) { + tiled_dims.push_back(i); + } + } + if (tiled_dims.size() > 1) { + std::vector split_dim_shape; + split_dim_shape.reserve(tiled_dims.size() + operand->shape().rank()); + for (int64 i : tiled_dims) { + split_dim_shape.push_back(sharding.tile_assignment().dim(i)); + } + for (int64 dim : operand->shape().dimensions()) { + split_dim_shape.push_back(dim); + } + result = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(operand->shape().element_type(), split_dim_shape), + result)); + } + // Transpose the gathered dimensions to next to their corresponding + // partitioned dimensions. + std::vector xpose_permutation(result->shape().rank()); + int64 split_dims_added = 0; + for (int64 i = 0; i < xpose_permutation.size(); ++i) { + if (sharding.tile_assignment().dim(i - split_dims_added) == 1) { + xpose_permutation[i] = i + tiled_dims.size() - split_dims_added; + } else { + xpose_permutation[i] = split_dims_added; + split_dims_added++; + xpose_permutation[i + 1] = i + tiled_dims.size(); + i++; + } + } + result = b->AddInstruction(HloInstruction::CreateTranspose( + ShapeInference::InferTransposeShape(result->shape(), xpose_permutation) + .ValueOrDie(), + result, xpose_permutation)); + // Reshape to the desired shape. 
+ auto ag_shape = operand->shape(); + for (int64 i : tiled_dims) { + ag_shape.set_dimensions( + i, ag_shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + result = b->AddInstruction(HloInstruction::CreateReshape(ag_shape, result)); + return result; +} StatusOr SpmdPartitioner::PartitionComputation( HloComputation* computation, const HloSharding& root_sharding, diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h index f22f564be73..2918cd1ef58 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -99,8 +99,20 @@ struct SPMDCollectiveOpsCreator { const std::vector& replica_groups, int64 channel_id, absl::optional split_dimension)> create_cross_partition_all_to_all; + + // Function used to create a cross-partition all-gather HLO. This is optional: + // if it is nullptr, the partitioner will use all-reduce instead. + std::function>& partition_subgroups, + int64 channel_id, int64 all_gather_dimension)> + create_cross_partition_all_gather; }; +// Create a default SPMDCollectiveOpsCreator. +SPMDCollectiveOpsCreator GetDefaultCollectiveOpsCreator(int64 num_partitions, + int64 num_replicas); + // Logger to report memory usage during SPMD partitioning. class SpmdLogger { public: @@ -153,6 +165,15 @@ class SpmdPartitioner : public HloModulePass { int64* next_channel_id, SpmdLogger* logger); + // Creates all-gather based on HloSharding. Can be overridden to customize. + // The default uses a single all-gather even if there are multiple sharded + // dimensions, and adds potential reshapes and transposes to achieve that. + // If it returns false, the partitioner will fall back to all-reduce. + virtual HloInstruction* AllGatherShards(SpmdBuilder* b, + HloInstruction* operand, + const HloSharding& sharding, + int64 channel_id); + protected: virtual std::unique_ptr CreateVisitor( HloComputation* computation, int64 num_partitions, int64 num_replicas, @@ -160,7 +181,6 @@ class SpmdPartitioner : public HloModulePass { int64* next_channel_id, SpmdLogger* logger, SpmdPartitionerOptions options); - private: // Verify that the sharding of instructions in the module are valid, and also // fill in missing sharding information. 
Status PreprocessSharding(HloModule* module); @@ -205,6 +225,7 @@ class PartitionedHlo { SPMDCollectiveOpsCreator collective_ops_creator; int64* next_channel_id; ReshardCache* reshard_cache; + SpmdPartitioner* partitioner; }; PartitionedHlo(HloInstruction* hlo, Shape base_shape, PartitioningState state) : hlo_(hlo), base_shape_(base_shape), state_(std::move(state)) { @@ -378,6 +399,7 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { state.collective_ops_creator = collective_ops_creator_; state.next_channel_id = next_channel_id_; state.reshard_cache = &reshard_cache_; + state.partitioner = partitioner_; return state; } diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index ca1afc816b0..55d7dc43785 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -41,13 +41,19 @@ class SpmdPartitioningTest : public HloTestBase { SpmdPartitionerOptions options; options.conv_halo_exchange_always_on_lhs = conv_halo_exchange_always_on_lhs; options.allow_module_signature_change = true; + auto collective_ops_creator = + GetDefaultCollectiveOpsCreator(num_devices, /*num_replicas=*/1); + // Do not use all-gather for pattern-matching purpose, as the partitioner + // might create reshape/transposes around it. + collective_ops_creator.create_cross_partition_all_gather = nullptr; TF_ASSIGN_OR_RETURN(auto module, ParseAndReturnVerifiedModule( hlo_module, GetModuleConfigForTest())); HloPassPipeline pass("spmd-partitioning"); pass.AddPass(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - pass.AddPass(num_devices, /*num_replicas=*/1, options); + pass.AddPass(num_devices, /*num_replicas=*/1, options, + collective_ops_creator); pass.AddPass(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); TF_RETURN_IF_ERROR(pass.Run(module.get()).status()); From 1342841b40da5e4e411a3d8b11b2808af9501327 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 22 May 2020 02:02:20 -0700 Subject: [PATCH 1044/1533] compat: Update forward compatibility horizon to 2020-05-22 PiperOrigin-RevId: 312811070 Change-Id: I98bf33db520a718880cb042abd3e9c0b2a765654 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 58b777a1310..56bf2894db7 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 21) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 22) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From f8c0e68a8aa5d575a19129ec67c9ed6262652082 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 22 May 2020 02:02:20 -0700 Subject: [PATCH 1045/1533] Update GraphDef version to 409. 
PiperOrigin-RevId: 312811071 Change-Id: I7733dc25650d03e4480efc48294576937e5736f8 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 9db20363349..3724f06ba4b 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 408 // Updated: 2020/5/21 +#define TF_GRAPH_DEF_VERSION 409 // Updated: 2020/5/22 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 0c8327245139da454bedeee08d7bf5cb3b181aab Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 22 May 2020 07:31:23 -0700 Subject: [PATCH 1046/1533] [tf.data] Switching to using multi-device function by default. PiperOrigin-RevId: 312830323 Change-Id: I9e1ae4aea3ab230f06a26dc79a17fc3aa66ca422 --- .../core/kernels/data/captured_function.cc | 75 +------------------ .../core/kernels/data/captured_function.h | 4 - 2 files changed, 1 insertion(+), 78 deletions(-) diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index adba99d37a4..dd64475d7d6 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -560,8 +560,7 @@ Status CapturedFunction::Instantiate( if (!metadata_->use_inter_op_parallelism()) { inst_opts.executor_type = "SINGLE_THREADED_EXECUTOR"; } - bool is_multi_device = false; - TF_RETURN_IF_ERROR(IsMultiDevice(ctx, &is_multi_device)); + bool is_multi_device = metadata_->use_multi_device_function(); inst_opts.is_multi_device_function = is_multi_device; // We infer the target device from the function library runtime. @@ -864,77 +863,5 @@ CapturedFunction::CapturedFunction( : metadata_(std::move(metadata)), captured_inputs_(std::move(captured_inputs)) {} -Status CapturedFunction::IsMultiDevice(IteratorContext* ctx, - bool* is_multi_device) { - if (!metadata_->use_multi_device_function()) { - *is_multi_device = false; - return Status::OK(); - } - - const FunctionDef* fdef; - TF_RETURN_IF_ERROR( - LookupFunction(*metadata_->lib_def(), metadata_->func().name(), &fdef)); - - Device* current_device = ctx->flr()->device(); - DeviceType current_device_type(current_device->device_type()); - DeviceNameUtils::ParsedName current_device_name; - if (!DeviceNameUtils::ParseFullName(current_device->name(), - ¤t_device_name)) { - return errors::InvalidArgument("Failed to parse device name: ", - current_device->name()); - } - - // Check if any of the captured inputs are placed on a device not compatible - // with the current device. For non-captured inputs, we assume they are placed - // on the current device. - for (const auto& input : captured_inputs_) { - DataType dtype = input.dtype(); - if (dtype == DT_RESOURCE) { - const ResourceHandle& handle = input.flat()(0); - DeviceNameUtils::ParsedName resource_device_name; - if (!DeviceNameUtils::ParseFullName(handle.device(), - &resource_device_name)) { - return errors::InvalidArgument("Failed to parse device name: ", - handle.device()); - } - if (!DeviceNameUtils::AreCompatibleDevNames(current_device_name, - resource_device_name)) { - *is_multi_device = true; - return Status::OK(); - } - } - } - - // Check if all ops could be placed on the current device. 
- for (const auto& name : metadata_->lib_def()->ListFunctionNames()) { - const FunctionDef* fdef; - TF_RETURN_IF_ERROR(LookupFunction(*metadata_->lib_def(), name, &fdef)); - for (const auto& node : fdef->node_def()) { - // Check if the op has a kernel available for the current device. - if (!KernelDefAvailable(current_device_type, node)) { - *is_multi_device = true; - return Status::OK(); - } - // If the op has a requested device, check if the requested device is - // compatible with the current device. - if (!node.device().empty()) { - DeviceNameUtils::ParsedName node_device_name; - if (!DeviceNameUtils::ParseFullName(node.device(), &node_device_name)) { - return errors::InvalidArgument("Failed to parse device name: ", - node.device()); - } - if (!DeviceNameUtils::AreCompatibleDevNames(current_device_name, - node_device_name)) { - *is_multi_device = true; - return Status::OK(); - } - } - } - } - - *is_multi_device = false; - return Status::OK(); -} - } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h index 284a02091dd..de424fc547c 100644 --- a/tensorflow/core/kernels/data/captured_function.h +++ b/tensorflow/core/kernels/data/captured_function.h @@ -256,10 +256,6 @@ class CapturedFunction { CapturedFunction(std::shared_ptr metadata, std::vector captured_inputs); - // Determines whether the captured function requires the use of the - // multi-device function backend. - Status IsMultiDevice(IteratorContext* ctx, bool* is_multi_device); - const std::shared_ptr metadata_; const std::vector captured_inputs_; From e0913946055cc13fc78f114150b6f8d0ef4e7930 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 22 May 2020 08:02:52 -0700 Subject: [PATCH 1047/1533] [tf.data] Switching to using multi-device function by default. PiperOrigin-RevId: 312831784 Change-Id: Icf0c5b26bcd751220e97882ea8e2cc699265d5ab --- .../core/kernels/data/captured_function.cc | 75 ++++++++++++++++++- .../core/kernels/data/captured_function.h | 4 + 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index dd64475d7d6..adba99d37a4 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -560,7 +560,8 @@ Status CapturedFunction::Instantiate( if (!metadata_->use_inter_op_parallelism()) { inst_opts.executor_type = "SINGLE_THREADED_EXECUTOR"; } - bool is_multi_device = metadata_->use_multi_device_function(); + bool is_multi_device = false; + TF_RETURN_IF_ERROR(IsMultiDevice(ctx, &is_multi_device)); inst_opts.is_multi_device_function = is_multi_device; // We infer the target device from the function library runtime. 
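The check restored in the following hunk decides whether tf.data must instantiate a captured function through the multi-device function backend: roughly, that is needed when a captured resource (such as a variable) lives on a device that is not compatible with the device the iterator runs on, or when some op in the function has no kernel registered for that device. A minimal Python sketch of a case that would take the multi-device path; the device placement and the presence of a GPU are illustrative assumptions, not part of this patch:

    import tensorflow as tf

    # A GPU-resident variable captured by a tf.data map function. The dataset
    # iterator runs on the CPU, so the captured resource's device is not
    # compatible with the iterator's device and the captured function has to
    # be instantiated as a multi-device function.
    with tf.device('/GPU:0'):   # assumes a GPU is available
        scale = tf.Variable(2.0)

    ds = tf.data.Dataset.range(4).map(lambda x: tf.cast(x, tf.float32) * scale)
    for v in ds:
        print(v.numpy())
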
@@ -863,5 +864,77 @@ CapturedFunction::CapturedFunction( : metadata_(std::move(metadata)), captured_inputs_(std::move(captured_inputs)) {} +Status CapturedFunction::IsMultiDevice(IteratorContext* ctx, + bool* is_multi_device) { + if (!metadata_->use_multi_device_function()) { + *is_multi_device = false; + return Status::OK(); + } + + const FunctionDef* fdef; + TF_RETURN_IF_ERROR( + LookupFunction(*metadata_->lib_def(), metadata_->func().name(), &fdef)); + + Device* current_device = ctx->flr()->device(); + DeviceType current_device_type(current_device->device_type()); + DeviceNameUtils::ParsedName current_device_name; + if (!DeviceNameUtils::ParseFullName(current_device->name(), + ¤t_device_name)) { + return errors::InvalidArgument("Failed to parse device name: ", + current_device->name()); + } + + // Check if any of the captured inputs are placed on a device not compatible + // with the current device. For non-captured inputs, we assume they are placed + // on the current device. + for (const auto& input : captured_inputs_) { + DataType dtype = input.dtype(); + if (dtype == DT_RESOURCE) { + const ResourceHandle& handle = input.flat()(0); + DeviceNameUtils::ParsedName resource_device_name; + if (!DeviceNameUtils::ParseFullName(handle.device(), + &resource_device_name)) { + return errors::InvalidArgument("Failed to parse device name: ", + handle.device()); + } + if (!DeviceNameUtils::AreCompatibleDevNames(current_device_name, + resource_device_name)) { + *is_multi_device = true; + return Status::OK(); + } + } + } + + // Check if all ops could be placed on the current device. + for (const auto& name : metadata_->lib_def()->ListFunctionNames()) { + const FunctionDef* fdef; + TF_RETURN_IF_ERROR(LookupFunction(*metadata_->lib_def(), name, &fdef)); + for (const auto& node : fdef->node_def()) { + // Check if the op has a kernel available for the current device. + if (!KernelDefAvailable(current_device_type, node)) { + *is_multi_device = true; + return Status::OK(); + } + // If the op has a requested device, check if the requested device is + // compatible with the current device. + if (!node.device().empty()) { + DeviceNameUtils::ParsedName node_device_name; + if (!DeviceNameUtils::ParseFullName(node.device(), &node_device_name)) { + return errors::InvalidArgument("Failed to parse device name: ", + node.device()); + } + if (!DeviceNameUtils::AreCompatibleDevNames(current_device_name, + node_device_name)) { + *is_multi_device = true; + return Status::OK(); + } + } + } + } + + *is_multi_device = false; + return Status::OK(); +} + } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h index de424fc547c..284a02091dd 100644 --- a/tensorflow/core/kernels/data/captured_function.h +++ b/tensorflow/core/kernels/data/captured_function.h @@ -256,6 +256,10 @@ class CapturedFunction { CapturedFunction(std::shared_ptr metadata, std::vector captured_inputs); + // Determines whether the captured function requires the use of the + // multi-device function backend. 
+ Status IsMultiDevice(IteratorContext* ctx, bool* is_multi_device); + const std::shared_ptr metadata_; const std::vector captured_inputs_; From 2662f079df1fcbc6995c26443d5c362c20d905be Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Tue, 24 Mar 2020 06:55:33 +0000 Subject: [PATCH 1048/1533] Use new transfer manager to improve read performance Finish multi part download implementatione xcept error near end of file Fix bug in get Fix build error Add test Fix build error Fix test Fix test modify test Reenable test Add override for tfrecord dataset buffer, Recognize the error when end of file is reached as a special case of error Fix build error Cleanup Allow testing to compare old and new function behaviors Remove logs in the test Fix build error Update test to improve time log Remove new lines Fix uploads due to them being too small. Made chunk size 5MB Use separate transfer managers for upload and download, with different chunk sizes --- .../core/kernels/data/tf_record_dataset_op.cc | 14 +- tensorflow/core/platform/s3/s3_file_system.cc | 203 +++++++++++++++--- tensorflow/core/platform/s3/s3_file_system.h | 27 ++- .../core/platform/s3/s3_file_system_test.cc | 95 +++++++- third_party/aws/workspace.bzl | 9 +- 5 files changed, 299 insertions(+), 49 deletions(-) diff --git a/tensorflow/core/kernels/data/tf_record_dataset_op.cc b/tensorflow/core/kernels/data/tf_record_dataset_op.cc index 94d523b5bfb..42d1e8b345b 100644 --- a/tensorflow/core/kernels/data/tf_record_dataset_op.cc +++ b/tensorflow/core/kernels/data/tf_record_dataset_op.cc @@ -39,7 +39,9 @@ namespace data { constexpr char kCurrentFileIndex[] = "current_file_index"; constexpr char kOffset[] = "offset"; constexpr char kGcsFsPrefix[] = "gs://"; +constexpr char kS3FsPrefix[] = "s3://"; constexpr int64 kCloudTpuBlockSize = 127LL << 20; // 127MB. +constexpr int64 kS3BlockSize = kCloudTpuBlockSize; bool is_cloud_tpu_gcs_fs() { #if defined(PLATFORM_CLOUD_TPU) && defined(TPU_GCS_FS) @@ -237,12 +239,14 @@ void TFRecordDatasetOp::MakeDataset(OpKernelContext* ctx, errors::InvalidArgument("`filenames` must be a scalar or a vector.")); bool is_gcs_fs = true; + bool is_s3_fs = true; std::vector filenames; filenames.reserve(filenames_tensor->NumElements()); for (int i = 0; i < filenames_tensor->NumElements(); ++i) { VLOG(2) << "Reading file: " << filenames_tensor->flat()(i); filenames.push_back(filenames_tensor->flat()(i)); is_gcs_fs &= absl::StartsWith(filenames[i], kGcsFsPrefix); + is_s3_fs &= absl::StartsWith(filenames[i], kS3FsPrefix); } tstring compression_type; @@ -255,7 +259,7 @@ void TFRecordDatasetOp::MakeDataset(OpKernelContext* ctx, OP_REQUIRES(ctx, buffer_size >= 0, errors::InvalidArgument( "`buffer_size` must be >= 0 (0 == no buffering)")); - + if (is_gcs_fs && is_cloud_tpu_gcs_fs() && buffer_size < kCloudTpuBlockSize) { VLOG(2) << "User buffer size is too small for reading Cloud TPU " << "TFRecords stored in GCS. Overriding " << buffer_size @@ -264,6 +268,14 @@ void TFRecordDatasetOp::MakeDataset(OpKernelContext* ctx, buffer_size = kCloudTpuBlockSize; } + if (is_s3_fs && buffer_size < kS3BlockSize) { + VLOG(2) << "User buffer size is too small for reading " + << "TFRecords stored in S3. 
Overriding " << buffer_size + << " to the minimum recommended buffer_size = " + << kS3BlockSize; + buffer_size = kS3BlockSize; + } + *output = new Dataset(ctx, std::move(filenames), compression_type, buffer_size); } diff --git a/tensorflow/core/platform/s3/s3_file_system.cc b/tensorflow/core/platform/s3/s3_file_system.cc index 1726c9fbc6c..db3887b1ade 100644 --- a/tensorflow/core/platform/s3/s3_file_system.cc +++ b/tensorflow/core/platform/s3/s3_file_system.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -58,10 +59,16 @@ static const char* kS3TempFileTemplate = "/tmp/s3_filesystem_XXXXXX"; static const char* kS3FileSystemAllocationTag = "S3FileSystemAllocation"; static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024; static const int64 kS3TimeoutMsec = 300000; // 5 min -static const uint64 kS3MultiPartCopyPartSize = 50 * 1024 * 1024; // 50MB +static const uint64 kS3MultiPartUploadChunkSize = 50 * 1024 * 1024; // 50 MB +static const uint64 kS3MultiPartDownloadChunkSize = 2 * 1024 * 1024; // 50 MB static const int kS3GetChildrenMaxKeys = 100; -static const int kExecutorPoolSize = 5; -static const int kUploadRetries = 5; + +// With this change multiple threads are used in one single download. +// Increasing the thread pool size since multiple downloads +// and uploads can occur in parallel. +static const int kExecutorPoolSize = 25; +static const int kUploadRetries = 3; +static const int kDownloadRetries = 3; static const char* kExecutorTag = "TransferManagerExecutor"; Aws::Client::ClientConfiguration& GetDefaultClientConfig() { @@ -223,10 +230,15 @@ static Status CreateStatusFromAwsError( class S3RandomAccessFile : public RandomAccessFile { public: - S3RandomAccessFile(const string& bucket, const string& object, + S3RandomAccessFile(const string& bucket, const string& object, + const bool use_multi_part_download, + std::shared_ptr transfer_manager, std::shared_ptr s3_client) - : bucket_(bucket), object_(object), s3_client_(s3_client) {} - + : bucket_(bucket), object_(object), + use_multi_part_download_(use_multi_part_download), + transfer_manager_(transfer_manager), + s3_client_(s3_client) {} + Status Name(StringPiece* result) const override { return errors::Unimplemented("S3RandomAccessFile does not support Name()"); } @@ -235,6 +247,66 @@ class S3RandomAccessFile : public RandomAccessFile { char* scratch) const override { VLOG(1) << "ReadFilefromS3 s3://" << bucket_ << "/" << object_ << " from " << offset << " for n:" << n; + if (use_multi_part_download_) { + return ReadS3TransferManager(offset, n, result, scratch); + } else { + return ReadS3Client(offset, n, result, scratch); + } + } + + Status ReadS3TransferManager(uint64 offset, size_t n, StringPiece* result, + char* scratch) const { + VLOG(3) << "Using TransferManager"; + + auto create_stream_fn = [&]() { // create stream lambda fn + return Aws::New( + "S3ReadStream", + Aws::New( + "S3ReadStream", reinterpret_cast(scratch), n)); + }; + + VLOG(3) << "Created stream to read with transferManager"; + + std::shared_ptr handle = + transfer_manager_.get()->DownloadFile( + bucket_.c_str(), object_.c_str(), offset, n, create_stream_fn); + handle->WaitUntilFinished(); + + // todo change this + int retries = 0; + + while ( + handle->GetStatus() == Aws::Transfer::TransferStatus::FAILED && + handle->GetLastError().GetResponseCode() != Aws::Http::HttpResponseCode::REQUESTED_RANGE_NOT_SATISFIABLE && + retries++ < kDownloadRetries) { + // only failed parts will be 
downloaded again + VLOG(1) << "Retrying read of s3://" << bucket_ << "/" << object_ + << " after failure. Current retry count:" << retries; + transfer_manager_.get()->RetryDownload(handle); + handle->WaitUntilFinished(); + } + + if (handle->GetStatus() != Aws::Transfer::TransferStatus::COMPLETED) { + auto error = handle->GetLastError(); + if (error.GetResponseCode() == + Aws::Http::HttpResponseCode::REQUESTED_RANGE_NOT_SATISFIABLE) { + // expected when end of file is reached + n = 0; + *result = StringPiece(scratch, n); + return Status(error::OUT_OF_RANGE, "Read less bytes than requested"); + } + return CreateStatusFromAwsError(error); + } else { + n = handle->GetBytesTotalSize(); + *result = StringPiece(scratch, handle->GetBytesTransferred()); + return Status::OK(); + } + } + + Status ReadS3Client(uint64 offset, size_t n, StringPiece* result, + char* scratch) const { + VLOG(3) << "ReadFile using S3Client s3://" << bucket_ << "/" << object_; + Aws::S3::Model::GetObjectRequest getObjectRequest; getObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str()); string bytes = strings::StrCat("bytes=", offset, "-", offset + n - 1); @@ -242,6 +314,7 @@ class S3RandomAccessFile : public RandomAccessFile { getObjectRequest.SetResponseStreamFactory([]() { return Aws::New(kS3FileSystemAllocationTag); }); + auto getObjectOutcome = this->s3_client_->GetObject(getObjectRequest); if (!getObjectOutcome.IsSuccess()) { auto error = getObjectOutcome.GetError(); @@ -252,18 +325,21 @@ class S3RandomAccessFile : public RandomAccessFile { return Status(error::OUT_OF_RANGE, "Read less bytes than requested"); } return CreateStatusFromAwsError(error); - } - n = getObjectOutcome.GetResult().GetContentLength(); - getObjectOutcome.GetResult().GetBody().read(scratch, n); + } else { + n = getObjectOutcome.GetResult().GetContentLength(); + getObjectOutcome.GetResult().GetBody().read(scratch, n); - *result = StringPiece(scratch, n); - return Status::OK(); + *result = StringPiece(scratch, n); + return Status::OK(); + } } private: string bucket_; string object_; std::shared_ptr s3_client_; + std::shared_ptr transfer_manager_; + bool use_multi_part_download_; }; class S3WritableFile : public WritableFile { @@ -375,16 +451,51 @@ class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion { S3FileSystem::S3FileSystem() : s3_client_(nullptr, ShutdownClient), initialization_lock_(), - transfer_manager_(nullptr, ShutdownTransferManager), executor_(nullptr, ShutdownExecutor) { - const char* part_size_str = getenv("S3_MULTI_PART_COPY_PART_SIZE"); - multi_part_copy_part_size_ = kS3MultiPartCopyPartSize; + + const char* part_size_str = getenv("S3_MULTI_PART_UPLOAD_CHUNK_SIZE"); + multi_part_chunk_size_[Aws::Transfer::TransferDirection::UPLOAD] = kS3MultiPartUploadChunkSize; if (part_size_str) { uint64 part_size_num; if (strings::safe_strtou64(part_size_str, &part_size_num)) { - multi_part_copy_part_size_ = part_size_num; + multi_part_chunk_size_[Aws::Transfer::TransferDirection::UPLOAD] = part_size_num; } } + + // Different TensorFlow APIs call the download API with different + // buffer size. Download performance depends on that size and this chunk size. 
+ part_size_str = getenv("S3_MULTI_PART_DOWNLOAD_CHUNK_SIZE"); + multi_part_chunk_size_[Aws::Transfer::TransferDirection::DOWNLOAD] = kS3MultiPartDownloadChunkSize; + if (part_size_str) { + uint64 part_size_num; + if (strings::safe_strtou64(part_size_str, &part_size_num)) { + multi_part_chunk_size_[Aws::Transfer::TransferDirection::DOWNLOAD] = part_size_num; + } + } + + use_multi_part_download_ = true; + const char* disable_transfer_mgr = getenv("S3_DISABLE_MULTI_PART_DOWNLOAD"); + if (disable_transfer_mgr) { + if (disable_transfer_mgr[0] == '1') { + use_multi_part_download_ = false; + } + } + + auto upload_pair = + std::pair > + (Aws::Transfer::TransferDirection::UPLOAD, + std::shared_ptr + (nullptr, ShutdownTransferManager)); + auto download_pair = + std::pair > + (Aws::Transfer::TransferDirection::DOWNLOAD, + std::shared_ptr + (nullptr, ShutdownTransferManager)); + + this->transfer_managers_.insert(upload_pair); + this->transfer_managers_.insert(download_pair); } S3FileSystem::~S3FileSystem() {} @@ -424,20 +535,19 @@ std::shared_ptr S3FileSystem::GetS3Client() { } std::shared_ptr -S3FileSystem::GetTransferManager() { +S3FileSystem::GetTransferManager(const Aws::Transfer::TransferDirection& direction) { std::shared_ptr s3_client = this->GetS3Client(); std::lock_guard lock(this->initialization_lock_); - if (this->transfer_manager_.get() == nullptr) { - Aws::Transfer::TransferManagerConfiguration config( - this->GetExecutor().get()); + if (this->transfer_managers_[direction].get() == nullptr) { + Aws::Transfer::TransferManagerConfiguration config(this->GetExecutor().get()); config.s3Client = s3_client; - config.bufferSize = this->multi_part_copy_part_size_; - // must be larger than pool size * multi_part_copy_part_size + config.bufferSize = this->multi_part_chunk_size_[direction]; + // must be larger than pool size * multi part chunk size config.transferBufferMaxHeapSize = - (kExecutorPoolSize + 1) * this->multi_part_copy_part_size_; - this->transfer_manager_ = Aws::Transfer::TransferManager::Create(config); + (kExecutorPoolSize + 1) * this->multi_part_chunk_size_[direction]; + this->transfer_managers_[direction] = Aws::Transfer::TransferManager::Create(config); } - return this->transfer_manager_; + return this->transfer_managers_[direction]; } std::shared_ptr @@ -452,9 +562,22 @@ S3FileSystem::GetExecutor() { Status S3FileSystem::NewRandomAccessFile( const string& fname, std::unique_ptr* result) { + return NewRandomAccessFile(fname, result, true); +} + +Status S3FileSystem::NewRandomAccessFile( + const string& fname, std::unique_ptr* result, + bool use_multi_part_download) { string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - result->reset(new S3RandomAccessFile(bucket, object, this->GetS3Client())); + + // check if an override was defined for this file. 
used for testing + bool use_mpd = this->use_multi_part_download_ && use_multi_part_download; + result->reset(new S3RandomAccessFile( + bucket, object, use_mpd, + this->GetTransferManager( + Aws::Transfer::TransferDirection::DOWNLOAD), + this->GetS3Client())); return Status::OK(); } @@ -462,8 +585,12 @@ Status S3FileSystem::NewWritableFile(const string& fname, std::unique_ptr* result) { string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - result->reset(new S3WritableFile(bucket, object, this->GetTransferManager(), - this->GetS3Client())); + result->reset(new S3WritableFile( + bucket, object, + this->GetTransferManager( + Aws::Transfer::TransferDirection::UPLOAD), + this->GetS3Client())); + return Status::OK(); } @@ -478,8 +605,11 @@ Status S3FileSystem::NewAppendableFile(const string& fname, string bucket, object; TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object)); - result->reset(new S3WritableFile(bucket, object, this->GetTransferManager(), - this->GetS3Client())); + result->reset(new S3WritableFile( + bucket, object, + this->GetTransferManager( + Aws::Transfer::TransferDirection::UPLOAD), + this->GetS3Client())); while (true) { status = reader->Read(offset, kS3ReadAppendableFileBufferSize, &read_chunk, @@ -773,10 +903,10 @@ Status S3FileSystem::CopyFile(const Aws::String& source_bucket, TF_RETURN_IF_ERROR( this->GetFileSize(string(source_full_path.c_str()), &file_length)); int num_parts; - if (file_length <= multi_part_copy_part_size_) { + if (file_length <= multi_part_chunk_size_[Aws::Transfer::TransferDirection::UPLOAD]) { num_parts = 1; } else { - num_parts = ceil((float)file_length / multi_part_copy_part_size_); + num_parts = ceil((float)file_length / multi_part_chunk_size_[Aws::Transfer::TransferDirection::UPLOAD]); } if (num_parts == 1) { @@ -786,7 +916,8 @@ Status S3FileSystem::CopyFile(const Aws::String& source_bucket, "MultiPartCopy with number of parts more than 10000 is not supported. " "Your object ", source, " required ", num_parts, - " as multi_part_copy_part_size is set to ", multi_part_copy_part_size_, + " as multi_part_copy_part_size is set to ", + multi_part_chunk_size_[Aws::Transfer::TransferDirection::UPLOAD], ". 
You can control this part size using the environment variable ", "S3_MULTI_PART_COPY_PART_SIZE to increase it."); return tensorflow::errors::Unimplemented(message); @@ -831,7 +962,9 @@ Status S3FileSystem::MultiPartCopy(const Aws::String& source, Aws::String uploadID = multipartUploadOutcome.GetResult().GetUploadId(); VLOG(1) << "Copying from " << source << " in " << num_parts - << " parts of size " << multi_part_copy_part_size_ << " each"; + << " parts of size " + << multi_part_chunk_size_[Aws::Transfer::TransferDirection::UPLOAD] + << " each"; Aws::S3::Model::CompletedMultipartUpload completedMPURequest; // passed to each callback keyed by partNumber @@ -859,8 +992,8 @@ Status S3FileSystem::MultiPartCopy(const Aws::String& source, for (std::map::iterator it = incompletePartStates.begin(); it != incompletePartStates.end(); it++) { int partNumber = it->first; - uint64 startPos = (partNumber - 1) * multi_part_copy_part_size_; - uint64 endPos = startPos + kS3MultiPartCopyPartSize - 1; + uint64 startPos = (partNumber - 1) * multi_part_chunk_size_[Aws::Transfer::TransferDirection::UPLOAD]; + uint64 endPos = startPos + multi_part_chunk_size_[Aws::Transfer::TransferDirection::UPLOAD] - 1; if (endPos >= file_length) { endPos = file_length - 1; } diff --git a/tensorflow/core/platform/s3/s3_file_system.h b/tensorflow/core/platform/s3/s3_file_system.h index 7b70c374a51..dd4cdb8ad3e 100644 --- a/tensorflow/core/platform/s3/s3_file_system.h +++ b/tensorflow/core/platform/s3/s3_file_system.h @@ -52,6 +52,10 @@ class S3FileSystem : public FileSystem { Status NewRandomAccessFile( const string& fname, std::unique_ptr* result) override; + Status NewRandomAccessFile( + const string& fname, std::unique_ptr* result, + bool use_multi_part_download); + Status NewWritableFile(const string& fname, std::unique_ptr* result) override; @@ -101,8 +105,9 @@ class S3FileSystem : public FileSystem { std::shared_ptr s3_client_; // Returns the member transfer manager, initializing as-needed. - std::shared_ptr GetTransferManager(); - std::shared_ptr transfer_manager_; + std::shared_ptr GetTransferManager(const Aws::Transfer::TransferDirection& direction); + void InitializeTransferManagers(); + std::map >transfer_managers_; // Returns the member executor for transfer manager, initializing as-needed. std::shared_ptr GetExecutor(); @@ -132,8 +137,11 @@ class S3FileSystem : public FileSystem { // Lock held when checking for s3_client_ and transfer_manager_ initialization mutex initialization_lock_; - // size to split objects during multipart copy - uint64 multi_part_copy_part_size_; + // size to split objects during multipart upload/download/copy + std::map multi_part_chunk_size_; + + bool use_multi_part_download_; + }; /// S3 implementation of a file system with retry on failures. 
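The new download path and its chunk sizes are configured through environment variables that are read when the S3 filesystem object is constructed, so they generally have to be set before the first S3 access. A hypothetical usage sketch from Python; the bucket and file names are placeholders, and it assumes a TensorFlow build with the S3 filesystem enabled:

    import os
    # 8 MB download chunks instead of the 2 MB default introduced here.
    os.environ['S3_MULTI_PART_DOWNLOAD_CHUNK_SIZE'] = str(8 * 1024 * 1024)
    os.environ['S3_MULTI_PART_UPLOAD_CHUNK_SIZE'] = str(50 * 1024 * 1024)
    # Setting S3_DISABLE_MULTI_PART_DOWNLOAD=1 would fall back to the plain
    # single GetObject read path.

    import tensorflow as tf
    dataset = tf.data.TFRecordDataset('s3://my-bucket/train-00001-of-01024')
    for record in dataset.take(1):
        print(len(record.numpy()))
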
@@ -147,6 +155,17 @@ class RetryingS3FileSystem : public RetryingFileSystem { )) {} }; + + // AWS Streams destroy the buffer (buf) passed, so creating a new + // IOStream that retains the buffer so the calling function + // can control it's lifecycle +class TFS3UnderlyingStream : public Aws::IOStream { + public: + using Base = Aws::IOStream; + TFS3UnderlyingStream(std::streambuf* buf) : Base(buf) {} + virtual ~TFS3UnderlyingStream() = default; +}; + } // namespace tensorflow #endif // TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_ diff --git a/tensorflow/core/platform/s3/s3_file_system_test.cc b/tensorflow/core/platform/s3/s3_file_system_test.cc index 224e30c6bb3..52feece88df 100644 --- a/tensorflow/core/platform/s3/s3_file_system_test.cc +++ b/tensorflow/core/platform/s3/s3_file_system_test.cc @@ -14,7 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/platform/s3/s3_file_system.h" - +#include #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/file_system.h" #include "tensorflow/core/platform/path.h" @@ -62,6 +62,93 @@ class S3FileSystemTest : public ::testing::Test { return Status::OK(); } + Status ReadAllInChunks(const string& fname, string* content, bool use_multi_part_download=true) { + std::unique_ptr reader; + + TF_RETURN_IF_ERROR(s3fs.NewRandomAccessFile(fname, &reader, use_multi_part_download)); + + uint64 file_size = 0; + TF_RETURN_IF_ERROR(s3fs.GetFileSize(fname, &file_size)); + + content->resize(file_size); + + uint64 buffer_size = 16 * 1024 * 1024; + + std::size_t part_count = (std::max)( + static_cast((file_size + buffer_size - 1) / buffer_size), + static_cast(1)); + VLOG(1) << "buffersize:" << buffer_size << " file_size:" << file_size + << " part_count=" << part_count; + std::unique_ptr buffer{new char[buffer_size]}; + std::stringstream ss; + + int offset = 0; + int result_size = 0; + + using namespace std::chrono; + auto start = high_resolution_clock::now(); + + for (int i = 0; i < part_count; i++) { + StringPiece result; + offset = i * buffer_size; + TF_RETURN_IF_ERROR( + reader->Read(offset, buffer_size, &result, buffer.get())); + + if (result.size() != 0) { + ss.write(result.data(), result.size()); + result_size += result.size(); + } + if (result_size == file_size) { + break; + } + if (result.size() != buffer_size) { + VLOG(1) << "Result size and buffer size did not match"; + if (result.empty()) { + return errors::OutOfRange("eof"); + } else { + return errors::DataLoss("truncated record at ", offset); + } + } + } + + if (file_size != result_size) { + return errors::DataLoss("expected ", file_size, " got ", result_size, + " bytes"); + } + + auto stop = high_resolution_clock::now(); + duration time_taken = duration_cast>(stop - start); + VLOG(1) << "Time Taken" << " : " << time_taken.count() << "seconds"; + + memcpy((char*)(content->data()), ss.str().data(), + static_cast(file_size)); + + return Status::OK(); + } + + Status ReadLargeFile() { + // const string fname = TmpDir("train-00001-of-01024"); + auto large_file_name = getenv("LARGE_DOWNLOAD_FILE_NAME"); + const string fname = TmpDir(large_file_name); + string content_xfer; + string content_s3client; + + // Read using Chunked Transfer Manager + VLOG(1) << "Using transfer manager"; + TF_RETURN_IF_ERROR(ReadAllInChunks(fname, &content_xfer)); + + VLOG(1) << "Without transfer manager"; + // Read using old S3 API and see if the contents match with TransferManager + 
TF_RETURN_IF_ERROR(ReadAllInChunks(fname, &content_s3client, false)); + + if (content_xfer == content_s3client) { + return Status::OK(); + } else { + VLOG(1) << "ReadLargeFile contents DO NOT match"; + return Status(error::OUT_OF_RANGE, "ReadLargeFile contents DO NOT match"); + } + } + S3FileSystem s3fs; }; @@ -73,7 +160,7 @@ TEST_F(S3FileSystemTest, NewRandomAccessFile) { std::unique_ptr reader; TF_EXPECT_OK(s3fs.NewRandomAccessFile(fname, &reader)); - + string got; got.resize(content.size()); StringPiece result; @@ -236,5 +323,9 @@ TEST_F(S3FileSystemTest, HasAtomicMove) { EXPECT_EQ(has_atomic_move, false); } +TEST_F(S3FileSystemTest, NewRandomAccessBigFile) { + TF_EXPECT_OK(ReadLargeFile()); +} + } // namespace } // namespace tensorflow diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl index 13c573337d9..c96a85c960c 100644 --- a/third_party/aws/workspace.bzl +++ b/third_party/aws/workspace.bzl @@ -6,14 +6,9 @@ load("//third_party:repo.bzl", "third_party_http_archive") # the copts field of the //third_party/aws:aws target def repo(): - third_party_http_archive( + native.new_local_repository( name = "aws", - urls = [ - "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.7.266.tar.gz", - "https://github.com/aws/aws-sdk-cpp/archive/1.7.266.tar.gz", - ], - sha256 = "39fd8a2999260d2b8fcbc8187f1ed5299972c2b8bd14adb7850fd674fea67fb7", - strip_prefix = "aws-sdk-cpp-1.7.266", + path="/home/ubuntu/aws-sdk-cpp", build_file = "//third_party/aws:BUILD.bazel", ) From f5ece2f397172edc1d9dc6f529c459682cafcafc Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 22 May 2020 09:40:58 -0700 Subject: [PATCH 1049/1533] Use new release of aws sdk --- third_party/aws/workspace.bzl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl index c96a85c960c..cb8733c91df 100644 --- a/third_party/aws/workspace.bzl +++ b/third_party/aws/workspace.bzl @@ -6,9 +6,14 @@ load("//third_party:repo.bzl", "third_party_http_archive") # the copts field of the //third_party/aws:aws target def repo(): - native.new_local_repository( + third_party_http_archive( name = "aws", - path="/home/ubuntu/aws-sdk-cpp", + urls = [ + "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.7.336.tar.gz", + "https://github.com/aws/aws-sdk-cpp/archive/1.7.336.tar.gz", + ], + sha256 = "758174f9788fed6cc1e266bcecb20bf738bd5ef1c3d646131c9ed15c2d6c5720", + strip_prefix = "aws-sdk-cpp-1.7.336", build_file = "//third_party/aws:BUILD.bazel", ) From ae6061da8311d40faf9ee91a4466a15811c7efbe Mon Sep 17 00:00:00 2001 From: Rahul Huilgol Date: Fri, 22 May 2020 10:02:09 -0700 Subject: [PATCH 1050/1533] Whitespace changes --- tensorflow/core/kernels/data/tf_record_dataset_op.cc | 2 +- tensorflow/core/platform/s3/s3_file_system_test.cc | 2 +- third_party/aws/workspace.bzl | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/data/tf_record_dataset_op.cc b/tensorflow/core/kernels/data/tf_record_dataset_op.cc index 42d1e8b345b..b0319b941cf 100644 --- a/tensorflow/core/kernels/data/tf_record_dataset_op.cc +++ b/tensorflow/core/kernels/data/tf_record_dataset_op.cc @@ -259,7 +259,7 @@ void TFRecordDatasetOp::MakeDataset(OpKernelContext* ctx, OP_REQUIRES(ctx, buffer_size >= 0, errors::InvalidArgument( "`buffer_size` must be >= 0 (0 == no buffering)")); - + if (is_gcs_fs && is_cloud_tpu_gcs_fs() && buffer_size < kCloudTpuBlockSize) { VLOG(2) << "User buffer size is too small for reading Cloud TPU 
" << "TFRecords stored in GCS. Overriding " << buffer_size diff --git a/tensorflow/core/platform/s3/s3_file_system_test.cc b/tensorflow/core/platform/s3/s3_file_system_test.cc index 52feece88df..35013917fab 100644 --- a/tensorflow/core/platform/s3/s3_file_system_test.cc +++ b/tensorflow/core/platform/s3/s3_file_system_test.cc @@ -160,7 +160,7 @@ TEST_F(S3FileSystemTest, NewRandomAccessFile) { std::unique_ptr reader; TF_EXPECT_OK(s3fs.NewRandomAccessFile(fname, &reader)); - + string got; got.resize(content.size()); StringPiece result; diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl index cb8733c91df..658aaaff00d 100644 --- a/third_party/aws/workspace.bzl +++ b/third_party/aws/workspace.bzl @@ -9,9 +9,9 @@ def repo(): third_party_http_archive( name = "aws", urls = [ - "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.7.336.tar.gz", - "https://github.com/aws/aws-sdk-cpp/archive/1.7.336.tar.gz", - ], + "https://mirror.bazel.build/github.com/aws/aws-sdk-cpp/archive/1.7.336.tar.gz", + "https://github.com/aws/aws-sdk-cpp/archive/1.7.336.tar.gz", + ], sha256 = "758174f9788fed6cc1e266bcecb20bf738bd5ef1c3d646131c9ed15c2d6c5720", strip_prefix = "aws-sdk-cpp-1.7.336", build_file = "//third_party/aws:BUILD.bazel", From c64097cb5f68c28491fd6e2b954d203b2fb5eca5 Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Fri, 22 May 2020 10:43:42 -0700 Subject: [PATCH 1051/1533] Re-enable multi_worker_tutorial_test (by fixing the timeout on shorter epoch/steps) PiperOrigin-RevId: 312842049 Change-Id: If0d7f0fcb4463c718f5532f62cca17ac23cab99a --- tensorflow/python/keras/distribute/BUILD | 1 - .../python/keras/distribute/multi_worker_tutorial_test.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index b7fe3b5bda6..6a39ebc5007 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -432,7 +432,6 @@ py_test( tags = [ "noasan", # TODO(b/156029134) "nomsan", # TODO(b/156029134) - "notap", # TODO(b/157253858) "notsan", # TODO(b/156029134) ], deps = [ diff --git a/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py b/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py index 1a46bcd7499..3f9ab18f89c 100644 --- a/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py +++ b/tensorflow/python/keras/distribute/multi_worker_tutorial_test.py @@ -120,8 +120,8 @@ class MultiWorkerTutorialTest(parameterized.TestCase, test.TestCase): multi_worker_model.fit( multi_worker_dataset, - epochs=3, - steps_per_epoch=70, + epochs=2, + steps_per_epoch=20, callbacks=callbacks) with test_util.skip_if_error(self, errors_impl.UnavailableError): From 227024b31adaabe4b9950578fb96924689941998 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Fri, 22 May 2020 10:49:01 -0700 Subject: [PATCH 1052/1533] Use `is` instead of equality when checking for whitelisted modules, to avoid triggering side effects. 
PiperOrigin-RevId: 312842395 Change-Id: Ie8294cdedb657adf69af90130ac354dff77220dc --- tensorflow/python/autograph/impl/api.py | 16 +++++++++++----- .../python/autograph/impl/api_py3_test.py | 18 ++++++++++++++++++ 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py index 3ebb5824b7f..98e19fdde86 100644 --- a/tensorflow/python/autograph/impl/api.py +++ b/tensorflow/python/autograph/impl/api.py @@ -18,13 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections -import copy import functools import inspect import os -import pdb -import re import sys import textwrap import traceback @@ -344,6 +340,15 @@ def _call_unconverted(f, args, kwargs, options, update_cache=True): return f(*args) +def _is_of_known_loaded_module(f, module_name): + mod = sys.modules.get(module_name, None) + if mod is None: + return False + if any(v is not None for v in mod.__dict__.values() if f is v): + return True + return False + + def _is_known_loaded_type(f, module_name, entity_name): """Tests whether the function or method is an instance of a known type.""" if (module_name not in sys.modules or @@ -511,7 +516,8 @@ def converted_call(f, # Other built-in modules are permanently whitelisted. # TODO(mdan): Figure out how to do this consistently for all stdlib modules. if any( - f in m.__dict__.values() for m in (collections, pdb, copy, inspect, re)): + _is_of_known_loaded_module(f, m) + for m in ('collections', 'pdb', 'copy', 'inspect', 're')): logging.log(2, 'Permanently whitelisted: %s: part of builtin module', f) return _call_unconverted(f, args, kwargs, options) diff --git a/tensorflow/python/autograph/impl/api_py3_test.py b/tensorflow/python/autograph/impl/api_py3_test.py index df6544928bf..c460e478008 100644 --- a/tensorflow/python/autograph/impl/api_py3_test.py +++ b/tensorflow/python/autograph/impl/api_py3_test.py @@ -19,6 +19,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import os from tensorflow.python.autograph.core import converter @@ -60,6 +61,23 @@ class ApiTest(test.TestCase): self.assertEqual(5, tc.no_arg(2)) + def test_converted_call_avoids_triggering_operators(self): + + test_self = self + + class Pair(collections.namedtuple('Pair', ['a', 'b'])): + + def __call__(self): + return self.a + self.b + + def __eq__(self, other): + test_self.fail('Triggered operator') + + p = Pair(constant_op.constant(1), constant_op.constant(2)) + + x = api.converted_call(p, (), {}, options=DEFAULT_RECURSIVE) + self.assertIsNotNone(self.evaluate(x), 3) + if __name__ == '__main__': os.environ['AUTOGRAPH_STRICT_CONVERSION'] = '1' From c2aa840a2eaa6068497275a5ddc1b99da3fc0960 Mon Sep 17 00:00:00 2001 From: Robert David Date: Fri, 22 May 2020 11:51:36 -0700 Subject: [PATCH 1053/1533] GPU delegate: Add Android to tflite_extra_gles_deps, instead of manually linking against system libraries for EGL and GLESv3. 
PiperOrigin-RevId: 312846221 Change-Id: I2179e81d026144c092573a254099838605b8648b --- tensorflow/lite/delegates/gpu/BUILD | 42 +++---------------- tensorflow/lite/delegates/gpu/cl/BUILD | 17 ++------ .../gpu/common/testing/feature_parity/BUILD | 4 -- .../lite/delegates/gpu/gl/kernels/BUILD | 27 ++++-------- .../delegates/gpu/java/src/main/native/BUILD | 7 ---- 5 files changed, 17 insertions(+), 80 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index c667c2056f4..5604c16132f 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -29,17 +29,6 @@ cc_library( name = "gl_delegate", srcs = ["gl_delegate.cc"], hdrs = ["gl_delegate.h"], - linkopts = select({ - "//tensorflow:android": [ - "-lEGL", - # We don't need to link libGLESv3, because if it exists, - # it is a symlink to libGLESv2. - # See Compatibility Definition Document: - # https://source.android.com/compatibility/10/android-10-cdd#7_1_4_1_opengl_es - "-lGLESv2", - ], - "//conditions:default": [], - }), deps = [ "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/types:span", @@ -120,11 +109,6 @@ cc_binary( linkopts = [ "-Wl,-soname=libtensorflowlite_gpu_gl.so", ] + select({ - "//tensorflow:android": [ - "-lEGL", - "-lGLESv3", - "-fvisibility=hidden", - ], "//tensorflow:windows": [], "//conditions:default": [ "-fvisibility=hidden", @@ -136,7 +120,7 @@ cc_binary( "nobuilder", "notap", ], - deps = [":gl_delegate"], + deps = [":gl_delegate"] + tflite_extra_gles_deps(), ) # build -c opt --config android_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt --linkopt -s --strip always :libtensorflowlite_gpu_delegate.so @@ -145,11 +129,6 @@ cc_binary( linkopts = [ "-Wl,-soname=libtensorflowlite_gpu_delegate.so", ] + select({ - "//tensorflow:android": [ - "-lEGL", - "-lGLESv3", - "-fvisibility=hidden", - ], "//tensorflow:windows": [], "//conditions:default": [ "-fvisibility=hidden", @@ -161,7 +140,7 @@ cc_binary( "nobuilder", "notap", ], - deps = [":delegate"], + deps = [":delegate"] + tflite_extra_gles_deps(), ) # bazel build -c opt --cpu ios_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt -fvisibility=hidden --linkopt -s --strip always --cxxopt=-std=c++14 :libtensorflowlite_gpu_metal --apple_platform_type=ios @@ -221,18 +200,9 @@ cc_library( name = "delegate", srcs = ["delegate.cc"], hdrs = ["delegate.h"], - linkopts = select({ - "//tensorflow:android": [ - "-lEGL", - # We don't need to link libGLESv3, because if it exists, - # it is a symlink to libGLESv2. 
- # See Compatibility Definition Document: - # https://source.android.com/compatibility/10/android-10-cdd#7_1_4_1_opengl_es - "-lGLESv2", - ], - "//conditions:default": [], - }), deps = [ + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", "//tensorflow/lite:kernel_api", "//tensorflow/lite:minimal_logging", "//tensorflow/lite/c:common", @@ -246,7 +216,5 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/gl:api2", "//tensorflow/lite/kernels/internal:optimized_base", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/types:span", - ], + ] + tflite_extra_gles_deps(), ) diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 2e686810767..134148d084d 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -1,8 +1,6 @@ load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") -load( - "//tensorflow/core/platform:build_config_root.bzl", - "tf_gpu_tests_tags", -) +load("//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags") +load("//tensorflow/lite:special_rules.bzl", "tflite_extra_gles_deps") package( default_visibility = ["//visibility:public"], @@ -267,17 +265,11 @@ cc_library( name = "gpu_api_delegate", srcs = ["gpu_api_delegate.cc"], hdrs = ["gpu_api_delegate.h"], - linkopts = select({ - "//tensorflow:android": [ - "-lEGL", - "-lGLESv3", - ], - "//conditions:default": [], - }), deps = [ ":api", ":opencl_wrapper", ":tensor_type_util", + "@com_google_absl//absl/types:span", "//tensorflow/lite:kernel_api", "//tensorflow/lite/c:common", "//tensorflow/lite/delegates/gpu:api", @@ -287,8 +279,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common/transformations:general_transformations", - "@com_google_absl//absl/types:span", - ], + ] + tflite_extra_gles_deps(), ) cc_library( diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD index b5ceff30d1e..96dc61ed1e7 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD @@ -35,10 +35,6 @@ cc_library( cc_test( name = "opengl_test", srcs = ["opengl_test.cc"], - linkopts = [ - "-lEGL", - "-lGLESv3", - ], tags = tf_gpu_tests_tags() + [ "local", "nobuilder", diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD index 700a553a125..e5bd97c7182 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD @@ -1,8 +1,5 @@ -load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite_combined") -load( - "//tensorflow/core/platform:build_config_root.bzl", - "tf_gpu_tests_tags", -) +load("//tensorflow/lite:special_rules.bzl", "tflite_extra_gles_deps", "tflite_portable_test_suite_combined") +load("//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags") package( default_visibility = ["//visibility:public"], @@ -32,10 +29,6 @@ cc_test( name = "converter_test", size = "small", srcs = ["converter_test.cc"], - linkopts = [ - "-lEGL", - "-lGLESv3", - ], tags = tf_gpu_tests_tags() + [ "local", "nobuilder", @@ -44,15 +37,15 @@ cc_test( ], deps = [ ":converter", + "@com_google_googletest//:gtest_main", + "@com_google_absl//absl/types:span", 
"//tensorflow/lite/delegates/gpu/common:convert", "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/gl:egl_environment", "//tensorflow/lite/delegates/gpu/gl:gl_buffer", "//tensorflow/lite/delegates/gpu/gl:portable", - "@com_google_absl//absl/types:span", - "@com_google_googletest//:gtest_main", - ], + ] + tflite_extra_gles_deps(), ) cc_library( @@ -655,11 +648,9 @@ cc_library( testonly = 1, srcs = ["test_util.cc"], hdrs = ["test_util.h"], - linkopts = [ - "-lEGL", - "-lGLESv3", - ], deps = [ + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", @@ -673,9 +664,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/gl:request_gpu_info", "//tensorflow/lite/delegates/gpu/gl:runtime_options", "//tensorflow/lite/delegates/gpu/gl/workgroups:default_calculator", - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", - ], + ] + tflite_extra_gles_deps(), ) cc_library( diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD b/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD index 774fd417758..695cb58381a 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD +++ b/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD @@ -13,13 +13,6 @@ cc_library( name = "native", srcs = ["gpu_delegate_jni.cc"], copts = tflite_copts(), - linkopts = select({ - "//tensorflow:android": [ - "-lGLESv3", - "-lEGL", - ], - "//conditions:default": [], - }), tags = [ "manual", "notap", From 4797b3b90859c9eb825428a7a6a46eb86bc03772 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Fri, 22 May 2020 12:12:28 -0700 Subject: [PATCH 1054/1533] When calling `strategy.reduce` in eager mode, wrap the `strategy.run` calls inside with `tf.function` so it is compatible with TPUStrategy. 
PiperOrigin-RevId: 312847673 Change-Id: I6db92c34ba24e160689da3fca2fe0a3c26223d52 --- tensorflow/python/distribute/BUILD | 17 ++++++ .../custom_training_loop_models_test.py | 30 +++++++++++ .../python/distribute/distribute_lib.py | 43 +++++++++++++-- .../python/distribute/strategy_reduce_test.py | 52 +++++++++++++++++++ 4 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 tensorflow/python/distribute/strategy_reduce_test.py diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index acbffb84089..01ae1b61f6a 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1181,6 +1181,23 @@ distribute_py_test( ], ) +distribute_py_test( + name = "strategy_reduce_test", + srcs = ["strategy_reduce_test.py"], + main = "strategy_reduce_test.py", + tags = [ + "multi_and_single_gpu", + ], + deps = [ + ":combinations", + ":strategy_combinations", + "//tensorflow/python:errors", + "//tensorflow/python:variables", + "//tensorflow/python/eager:test", + "@absl_py//absl/testing:parameterized", + ], +) + distribute_py_test( name = "minimize_loss_test", srcs = ["minimize_loss_test.py"], diff --git a/tensorflow/python/distribute/custom_training_loop_models_test.py b/tensorflow/python/distribute/custom_training_loop_models_test.py index 48f2af0349a..5a9384bb7e0 100644 --- a/tensorflow/python/distribute/custom_training_loop_models_test.py +++ b/tensorflow/python/distribute/custom_training_loop_models_test.py @@ -26,6 +26,7 @@ import numpy as np from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_combinations from tensorflow.python.eager import backprop from tensorflow.python.eager import def_function @@ -448,6 +449,35 @@ class KerasModelsTest(test.TestCase, parameterized.TestCase): train_step(input_iterator) + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.all_strategies, mode=["eager"])) + def test_reduce_loss(self, distribution): + inputs = np.zeros((10, 4), dtype=np.float32) + targets = np.zeros((10, 1), dtype=np.float32) + dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) + dataset = dataset.batch(10, drop_remainder=False) + input_iterator = iter(distribution.experimental_distribute_dataset(dataset)) + + with distribution.scope(): + x = keras.layers.Input(shape=(4), name="input") + y = keras.layers.Dense(3, name="dense")(x) + model = keras.Model(x, y) + + @def_function.function + def train_step(iterator): + + def step_fn(inputs): + images, targets = inputs + outputs = model(images) + loss = keras.losses.sparse_categorical_crossentropy(targets, outputs) + return loss + + return distribution.run(step_fn, args=(next(iterator),)) + + loss = train_step(input_iterator) + loss = distribution.reduce(reduce_util.ReduceOp.MEAN, loss, axis=0) + @combinations.generate( combinations.combine( distribution=strategy_combinations.tpu_strategies, mode=["eager"])) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 4531e922840..ecdc4fad159 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -114,6 +114,7 @@ from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.distribute import numpy_dataset from tensorflow.python.distribute import 
reduce_util from tensorflow.python.eager import context as eager_context +from tensorflow.python.eager import def_function from tensorflow.python.eager import monitoring from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -628,6 +629,10 @@ class StrategyBase(object): # a sensible value. extended._retrace_functions_for_each_device = True + # Below are the dicts of axis(int) -> `tf.function`. + self._mean_reduce_helper_fns = {} + self._reduce_sum_fns = {} + @property def extended(self): """`tf.distribute.StrategyExtended` with additional methods.""" @@ -1014,8 +1019,25 @@ class StrategyBase(object): if axis is None: return self._extended._reduce(reduce_op, value) # pylint: disable=protected-access if reduce_op == reduce_util.ReduceOp.SUM: - value = self.run( - lambda v: math_ops.reduce_sum(v, axis=axis), args=(value,)) + + def reduce_sum(v): + return math_ops.reduce_sum(v, axis=axis) + + if eager_context.executing_eagerly(): + # As some strategies (e.g. TPUStrategy) doesn't support pure eager + # execution, wrap the `reduce_sum_fn` with a `tf.function` so it can be + # run from eager mode. Cache the tf.function by `axis` to avoid the + # same function to be traced again. + if axis not in self._reduce_sum_fns: + + def reduce_sum_fn(v): + return self.run(reduce_sum, args=(v,)) + + self._reduce_sum_fns[axis] = def_function.function(reduce_sum_fn) + value = self._reduce_sum_fns[axis](value) + else: + value = self.run(reduce_sum, args=(value,)) + return self._extended._reduce(reduce_op, value) # pylint: disable=protected-access if reduce_op != reduce_util.ReduceOp.MEAN: raise TypeError("Expected `reduce_op` to be a `tf.distribute.ReduceOp`, " @@ -1062,7 +1084,22 @@ class StrategyBase(object): # reduce is complete? return numer, denom - numer, denom = self.run(mean_reduce_helper, args=(value,)) + if eager_context.executing_eagerly(): + # As some strategies (e.g. TPUStrategy) doesn't support pure eager + # execution, wrap the `mean_reduce_helper` with a `tf.function` so it can + # be run from eager mode. Cache the tf.function by `axis` to avoid the + # same function to be traced again. + if axis not in self._mean_reduce_helper_fns: + + def mean_reduce_fn(v): + return self.run(mean_reduce_helper, args=(v,)) + + self._mean_reduce_helper_fns[axis] = def_function.function( + mean_reduce_fn) + numer, denom = self._mean_reduce_helper_fns[axis](value) + else: + numer, denom = self.run(mean_reduce_helper, args=(value,)) + # TODO(josh11b): Should batch reduce here instead of doing two. numer = self._extended._reduce(reduce_util.ReduceOp.SUM, numer) # pylint: disable=protected-access denom = self._extended._reduce(reduce_util.ReduceOp.SUM, denom) # pylint: disable=protected-access diff --git a/tensorflow/python/distribute/strategy_reduce_test.py b/tensorflow/python/distribute/strategy_reduce_test.py new file mode 100644 index 00000000000..a87cce2f0b8 --- /dev/null +++ b/tensorflow/python/distribute/strategy_reduce_test.py @@ -0,0 +1,52 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for `strategy.reduce`.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.eager import def_function +from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op + + +class StrategyReduceTest(test.TestCase, parameterized.TestCase): + + @combinations.generate( + combinations.combine( + distribution=strategy_combinations.all_strategies, + mode=["eager"] + )) + def test_reduce_with_axis(self, distribution): + + @def_function.function + def fn(): + return constant_op.constant([1., 2.]) + x = distribution.run(fn) + + x_m = distribution.reduce(reduce_util.ReduceOp.MEAN, x, axis=0) + self.assertEqual(1.5, self.evaluate(x_m)) + x_s = distribution.reduce(reduce_util.ReduceOp.SUM, x, axis=0) + self.assertEqual(3 * distribution.num_replicas_in_sync, self.evaluate(x_s)) + + +if __name__ == "__main__": + test.main() From f29c4058a47d433b1dcfa8f963ed988da38b1803 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 22 May 2020 12:16:21 -0700 Subject: [PATCH 1055/1533] GPU delegate: Add Android to tflite_extra_gles_deps, instead of manually linking against system libraries for EGL and GLESv3. PiperOrigin-RevId: 312847902 Change-Id: I6859765a7baa3180897855670672241e71887d87 --- tensorflow/lite/delegates/gpu/BUILD | 42 ++++++++++++++++--- tensorflow/lite/delegates/gpu/cl/BUILD | 17 ++++++-- .../gpu/common/testing/feature_parity/BUILD | 4 ++ .../lite/delegates/gpu/gl/kernels/BUILD | 27 ++++++++---- .../delegates/gpu/java/src/main/native/BUILD | 7 ++++ 5 files changed, 80 insertions(+), 17 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index 5604c16132f..c667c2056f4 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -29,6 +29,17 @@ cc_library( name = "gl_delegate", srcs = ["gl_delegate.cc"], hdrs = ["gl_delegate.h"], + linkopts = select({ + "//tensorflow:android": [ + "-lEGL", + # We don't need to link libGLESv3, because if it exists, + # it is a symlink to libGLESv2. 
+ # See Compatibility Definition Document: + # https://source.android.com/compatibility/10/android-10-cdd#7_1_4_1_opengl_es + "-lGLESv2", + ], + "//conditions:default": [], + }), deps = [ "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/types:span", @@ -109,6 +120,11 @@ cc_binary( linkopts = [ "-Wl,-soname=libtensorflowlite_gpu_gl.so", ] + select({ + "//tensorflow:android": [ + "-lEGL", + "-lGLESv3", + "-fvisibility=hidden", + ], "//tensorflow:windows": [], "//conditions:default": [ "-fvisibility=hidden", @@ -120,7 +136,7 @@ cc_binary( "nobuilder", "notap", ], - deps = [":gl_delegate"] + tflite_extra_gles_deps(), + deps = [":gl_delegate"], ) # build -c opt --config android_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt --linkopt -s --strip always :libtensorflowlite_gpu_delegate.so @@ -129,6 +145,11 @@ cc_binary( linkopts = [ "-Wl,-soname=libtensorflowlite_gpu_delegate.so", ] + select({ + "//tensorflow:android": [ + "-lEGL", + "-lGLESv3", + "-fvisibility=hidden", + ], "//tensorflow:windows": [], "//conditions:default": [ "-fvisibility=hidden", @@ -140,7 +161,7 @@ cc_binary( "nobuilder", "notap", ], - deps = [":delegate"] + tflite_extra_gles_deps(), + deps = [":delegate"], ) # bazel build -c opt --cpu ios_arm64 --copt -Os --copt -DTFLITE_GPU_BINARY_RELEASE --copt -fvisibility=hidden --linkopt -s --strip always --cxxopt=-std=c++14 :libtensorflowlite_gpu_metal --apple_platform_type=ios @@ -200,9 +221,18 @@ cc_library( name = "delegate", srcs = ["delegate.cc"], hdrs = ["delegate.h"], + linkopts = select({ + "//tensorflow:android": [ + "-lEGL", + # We don't need to link libGLESv3, because if it exists, + # it is a symlink to libGLESv2. + # See Compatibility Definition Document: + # https://source.android.com/compatibility/10/android-10-cdd#7_1_4_1_opengl_es + "-lGLESv2", + ], + "//conditions:default": [], + }), deps = [ - "@com_google_absl//absl/memory", - "@com_google_absl//absl/types:span", "//tensorflow/lite:kernel_api", "//tensorflow/lite:minimal_logging", "//tensorflow/lite/c:common", @@ -216,5 +246,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/gl:api2", "//tensorflow/lite/kernels/internal:optimized_base", - ] + tflite_extra_gles_deps(), + "@com_google_absl//absl/memory", + "@com_google_absl//absl/types:span", + ], ) diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 134148d084d..2e686810767 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -1,6 +1,8 @@ load("@flatbuffers//:build_defs.bzl", "flatbuffer_cc_library") -load("//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags") -load("//tensorflow/lite:special_rules.bzl", "tflite_extra_gles_deps") +load( + "//tensorflow/core/platform:build_config_root.bzl", + "tf_gpu_tests_tags", +) package( default_visibility = ["//visibility:public"], @@ -265,11 +267,17 @@ cc_library( name = "gpu_api_delegate", srcs = ["gpu_api_delegate.cc"], hdrs = ["gpu_api_delegate.h"], + linkopts = select({ + "//tensorflow:android": [ + "-lEGL", + "-lGLESv3", + ], + "//conditions:default": [], + }), deps = [ ":api", ":opencl_wrapper", ":tensor_type_util", - "@com_google_absl//absl/types:span", "//tensorflow/lite:kernel_api", "//tensorflow/lite/c:common", "//tensorflow/lite/delegates/gpu:api", @@ -279,7 +287,8 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model_transformer", "//tensorflow/lite/delegates/gpu/common:status", 
"//tensorflow/lite/delegates/gpu/common/transformations:general_transformations", - ] + tflite_extra_gles_deps(), + "@com_google_absl//absl/types:span", + ], ) cc_library( diff --git a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD index 96dc61ed1e7..b5ceff30d1e 100644 --- a/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD +++ b/tensorflow/lite/delegates/gpu/common/testing/feature_parity/BUILD @@ -35,6 +35,10 @@ cc_library( cc_test( name = "opengl_test", srcs = ["opengl_test.cc"], + linkopts = [ + "-lEGL", + "-lGLESv3", + ], tags = tf_gpu_tests_tags() + [ "local", "nobuilder", diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD index e5bd97c7182..700a553a125 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/gl/kernels/BUILD @@ -1,5 +1,8 @@ -load("//tensorflow/lite:special_rules.bzl", "tflite_extra_gles_deps", "tflite_portable_test_suite_combined") -load("//tensorflow/core/platform:build_config_root.bzl", "tf_gpu_tests_tags") +load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite_combined") +load( + "//tensorflow/core/platform:build_config_root.bzl", + "tf_gpu_tests_tags", +) package( default_visibility = ["//visibility:public"], @@ -29,6 +32,10 @@ cc_test( name = "converter_test", size = "small", srcs = ["converter_test.cc"], + linkopts = [ + "-lEGL", + "-lGLESv3", + ], tags = tf_gpu_tests_tags() + [ "local", "nobuilder", @@ -37,15 +44,15 @@ cc_test( ], deps = [ ":converter", - "@com_google_googletest//:gtest_main", - "@com_google_absl//absl/types:span", "//tensorflow/lite/delegates/gpu/common:convert", "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/gl:egl_environment", "//tensorflow/lite/delegates/gpu/gl:gl_buffer", "//tensorflow/lite/delegates/gpu/gl:portable", - ] + tflite_extra_gles_deps(), + "@com_google_absl//absl/types:span", + "@com_google_googletest//:gtest_main", + ], ) cc_library( @@ -648,9 +655,11 @@ cc_library( testonly = 1, srcs = ["test_util.cc"], hdrs = ["test_util.h"], + linkopts = [ + "-lEGL", + "-lGLESv3", + ], deps = [ - "@com_google_googletest//:gtest", - "@com_google_googletest//:gtest_main", "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:status", @@ -664,7 +673,9 @@ cc_library( "//tensorflow/lite/delegates/gpu/gl:request_gpu_info", "//tensorflow/lite/delegates/gpu/gl:runtime_options", "//tensorflow/lite/delegates/gpu/gl/workgroups:default_calculator", - ] + tflite_extra_gles_deps(), + "@com_google_googletest//:gtest", + "@com_google_googletest//:gtest_main", + ], ) cc_library( diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD b/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD index 695cb58381a..774fd417758 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD +++ b/tensorflow/lite/delegates/gpu/java/src/main/native/BUILD @@ -13,6 +13,13 @@ cc_library( name = "native", srcs = ["gpu_delegate_jni.cc"], copts = tflite_copts(), + linkopts = select({ + "//tensorflow:android": [ + "-lGLESv3", + "-lEGL", + ], + "//conditions:default": [], + }), tags = [ "manual", "notap", From 3ae3c3b92ea4168307bfb8b1fba4469cae928f30 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 22 May 2020 19:28:12 +0000 Subject: [PATCH 1056/1533] Add 
tile_functor_cpu_uint[32|64].cc to android_extended_ops_group2 BAZEL rule tile_functor_cpu_uint[32|64].cc are files that were added to tile_ops in BUILD. However, this might also be needed in android_extended_ops_group2 (like other similiar functor cc sources). This commit adds tile_functor_cpu_uint[32|64].cc to android_extended_ops_group2 BAZEL rule just in case they could solve the following issues: ``` Undefined symbols for architecture x86_64: "tensorflow::functor::Tile::operator()(Eigen::ThreadPoolDevice const&, tensorflow::Tensor*, tensorflow::Tensor const&, absl::Span) const", referenced from: void tensorflow::TileOp::HandleCaseImpl<(tensorflow::DataType)22>(tensorflow::OpKernelContext*, absl::Span const&, tensorflow::Tensor*) in libportable_tensorflow_kernels.lo(tile_ops_8df9402447fb5c99fae84bd69e6311f0.o) "tensorflow::functor::Tile::operator()(Eigen::ThreadPoolDevice const&, tensorflow::Tensor*, tensorflow::Tensor const&, absl::Span) const", referenced from: void tensorflow::TileOp::HandleCaseImpl<(tensorflow::DataType)22>(tensorflow::OpKernelContext*, absl::Span const&, tensorflow::Tensor*) in libportable_tensorflow_kernels.lo(tile_ops_8df9402447fb5c99fae84bd69e6311f0.o) "tensorflow::functor::Tile::operator()(Eigen::ThreadPoolDevice const&, tensorflow::Tensor*, tensorflow::Tensor const&, absl::Span) const", referenced from: void tensorflow::TileOp::HandleCaseImpl<(tensorflow::DataType)23>(tensorflow::OpKernelContext*, absl::Span const&, tensorflow::Tensor*) in libportable_tensorflow_kernels.lo(tile_ops_8df9402447fb5c99fae84bd69e6311f0.o) "tensorflow::functor::Tile::operator()(Eigen::ThreadPoolDevice const&, tensorflow::Tensor*, tensorflow::Tensor const&, absl::Span) const", referenced from: void tensorflow::TileOp::HandleCaseImpl<(tensorflow::DataType)23>(tensorflow::OpKernelContext*, absl::Span const&, tensorflow::Tensor*) in libportable_tensorflow_kernels.lo(tile_ops_8df9402447fb5c99fae84bd69e6311f0.o) ld: symbol(s) not found for architecture x86_64 clang++: error: linker command failed with exit code 1 (use -v to see invocation) ``` Signed-off-by: Yong Tang --- tensorflow/core/kernels/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index daa6093a460..5e8b1fa7b0a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -6905,6 +6905,8 @@ filegroup( "tile_functor_cpu_int64.cc", "tile_functor_cpu_int8.cc", "tile_functor_cpu_tstring.cc", + "tile_functor_cpu_uint32.cc", + "tile_functor_cpu_uint64.cc", "tile_functor_cpu_uint8.cc", "tile_ops.cc", "tile_ops_cpu_impl_1.cc", From e9654dfbc94632bf24252bdde2cc5a56dc92cb9e Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 22 May 2020 13:51:16 -0700 Subject: [PATCH 1057/1533] [tf.data] Switch to use multi-device function backend by default. 
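For context, a hedged sketch of the kind of pipeline the multi-device function backend handles: the map function captures a variable whose resource may live on a different device than the host-side input pipeline. The device string and variable name below are illustrative only.

```python
import tensorflow as tf

# Place a variable on an accelerator if one is present, else fall back to CPU.
device = "/GPU:0" if tf.config.list_physical_devices("GPU") else "/CPU:0"
with tf.device(device):
  scale = tf.Variable(2.0)

# The mapped function captures `scale`; with a GPU present this is the
# captured-resource-on-another-device case that needs a multi-device function.
ds = tf.data.Dataset.range(4).map(lambda x: tf.cast(x, tf.float32) * scale)
print(list(ds.as_numpy_iterator()))  # [0.0, 2.0, 4.0, 6.0]
```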
PiperOrigin-RevId: 312853967 Change-Id: I45984631cf0ae730ed5a222638a7334197161bbf --- .../core/kernels/data/captured_function.cc | 75 +------------------ .../core/kernels/data/captured_function.h | 4 - 2 files changed, 1 insertion(+), 78 deletions(-) diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index adba99d37a4..dd64475d7d6 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -560,8 +560,7 @@ Status CapturedFunction::Instantiate( if (!metadata_->use_inter_op_parallelism()) { inst_opts.executor_type = "SINGLE_THREADED_EXECUTOR"; } - bool is_multi_device = false; - TF_RETURN_IF_ERROR(IsMultiDevice(ctx, &is_multi_device)); + bool is_multi_device = metadata_->use_multi_device_function(); inst_opts.is_multi_device_function = is_multi_device; // We infer the target device from the function library runtime. @@ -864,77 +863,5 @@ CapturedFunction::CapturedFunction( : metadata_(std::move(metadata)), captured_inputs_(std::move(captured_inputs)) {} -Status CapturedFunction::IsMultiDevice(IteratorContext* ctx, - bool* is_multi_device) { - if (!metadata_->use_multi_device_function()) { - *is_multi_device = false; - return Status::OK(); - } - - const FunctionDef* fdef; - TF_RETURN_IF_ERROR( - LookupFunction(*metadata_->lib_def(), metadata_->func().name(), &fdef)); - - Device* current_device = ctx->flr()->device(); - DeviceType current_device_type(current_device->device_type()); - DeviceNameUtils::ParsedName current_device_name; - if (!DeviceNameUtils::ParseFullName(current_device->name(), - ¤t_device_name)) { - return errors::InvalidArgument("Failed to parse device name: ", - current_device->name()); - } - - // Check if any of the captured inputs are placed on a device not compatible - // with the current device. For non-captured inputs, we assume they are placed - // on the current device. - for (const auto& input : captured_inputs_) { - DataType dtype = input.dtype(); - if (dtype == DT_RESOURCE) { - const ResourceHandle& handle = input.flat()(0); - DeviceNameUtils::ParsedName resource_device_name; - if (!DeviceNameUtils::ParseFullName(handle.device(), - &resource_device_name)) { - return errors::InvalidArgument("Failed to parse device name: ", - handle.device()); - } - if (!DeviceNameUtils::AreCompatibleDevNames(current_device_name, - resource_device_name)) { - *is_multi_device = true; - return Status::OK(); - } - } - } - - // Check if all ops could be placed on the current device. - for (const auto& name : metadata_->lib_def()->ListFunctionNames()) { - const FunctionDef* fdef; - TF_RETURN_IF_ERROR(LookupFunction(*metadata_->lib_def(), name, &fdef)); - for (const auto& node : fdef->node_def()) { - // Check if the op has a kernel available for the current device. - if (!KernelDefAvailable(current_device_type, node)) { - *is_multi_device = true; - return Status::OK(); - } - // If the op has a requested device, check if the requested device is - // compatible with the current device. 
- if (!node.device().empty()) { - DeviceNameUtils::ParsedName node_device_name; - if (!DeviceNameUtils::ParseFullName(node.device(), &node_device_name)) { - return errors::InvalidArgument("Failed to parse device name: ", - node.device()); - } - if (!DeviceNameUtils::AreCompatibleDevNames(current_device_name, - node_device_name)) { - *is_multi_device = true; - return Status::OK(); - } - } - } - } - - *is_multi_device = false; - return Status::OK(); -} - } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h index 284a02091dd..de424fc547c 100644 --- a/tensorflow/core/kernels/data/captured_function.h +++ b/tensorflow/core/kernels/data/captured_function.h @@ -256,10 +256,6 @@ class CapturedFunction { CapturedFunction(std::shared_ptr metadata, std::vector captured_inputs); - // Determines whether the captured function requires the use of the - // multi-device function backend. - Status IsMultiDevice(IteratorContext* ctx, bool* is_multi_device); - const std::shared_ptr metadata_; const std::vector captured_inputs_; From cdeac44eac3967570d61cbee335373040a455f6d Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Fri, 22 May 2020 23:39:03 +0100 Subject: [PATCH 1058/1533] Revert "Keras grouped convolutions rollback" This reverts commit dd2ea875d92eeb83e81b1cb92e29e61d488e98b2. --- .../python/keras/layers/convolutional.py | 45 +++++++++- .../python/keras/layers/convolutional_test.py | 85 ++++++++++++++++--- tensorflow/python/ops/nn_ops.py | 18 ++-- .../v1/tensorflow.keras.layers.-conv1-d.pbtxt | 2 +- .../v1/tensorflow.keras.layers.-conv2-d.pbtxt | 2 +- .../v1/tensorflow.keras.layers.-conv3-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution1-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution2-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution3-d.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-conv1-d.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-conv2-d.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-conv3-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution1-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution2-d.pbtxt | 2 +- ...sorflow.keras.layers.-convolution3-d.pbtxt | 2 +- 15 files changed, 138 insertions(+), 34 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index f7148ccd4e9..2e77dc2523b 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -78,6 +78,11 @@ class Conv(Layer): the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. + Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied. use_bias: Boolean, whether the layer uses a bias. @@ -100,13 +105,15 @@ class Conv(Layer): name: A string, the name of the layer. 
""" - def __init__(self, rank, + def __init__(self, + rank, filters, kernel_size, strides=1, padding='valid', data_format=None, dilation_rate=1, + groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -128,6 +135,11 @@ class Conv(Layer): if filters is not None and not isinstance(filters, int): filters = int(filters) self.filters = filters + self.groups = groups or 1 + if filters is not None and filters % self.groups != 0: + raise ValueError( + 'The number of filters must be evenly divisible by the number of ' + 'groups. Received: groups={}, filters={}'.format(groups, filters)) self.kernel_size = conv_utils.normalize_tuple( kernel_size, rank, 'kernel_size') if not all(self.kernel_size): @@ -155,7 +167,14 @@ class Conv(Layer): def build(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) input_channel = self._get_input_channel(input_shape) - kernel_shape = self.kernel_size + (input_channel, self.filters) + if input_channel % self.groups != 0: + raise ValueError( + 'The number of input channels must be evenly divisible by the number ' + 'of groups. Received groups={}, but the input has {} channels ' + '(full input shape is {}).'.format(self.groups, input_channel, + input_shape)) + kernel_shape = self.kernel_size + (input_channel // self.groups, + self.filters) self.kernel = self.add_weight( name='kernel', @@ -256,6 +275,7 @@ class Conv(Layer): 'padding': self.padding, 'data_format': self.data_format, 'dilation_rate': self.dilation_rate, + 'groups': self.groups, 'activation': activations.serialize(self.activation), 'use_bias': self.use_bias, 'kernel_initializer': initializers.serialize(self.kernel_initializer), @@ -371,6 +391,11 @@ class Conv1D(Conv): the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. + Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). @@ -413,6 +438,7 @@ class Conv1D(Conv): padding='valid', data_format='channels_last', dilation_rate=1, + groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -431,6 +457,7 @@ class Conv1D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, + groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), @@ -517,6 +544,11 @@ class Conv2D(Conv): all spatial dimensions. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. + Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). 
@@ -566,6 +598,7 @@ class Conv2D(Conv): padding='valid', data_format=None, dilation_rate=(1, 1), + groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -584,6 +617,7 @@ class Conv2D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, + groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), @@ -655,6 +689,11 @@ class Conv3D(Conv): all spatial dimensions. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. + groups: A positive integer specifying the number of groups in which the + input is split along the channel axis. Each group is convolved + separately with `filters / groups` filters. The output is the + concatenation of all the `groups` results along the channel axis. + Input channels and `filters` must both be divisible by `groups`. activation: Activation function to use. If you don't specify anything, no activation is applied ( see `keras.activations`). @@ -710,6 +749,7 @@ class Conv3D(Conv): padding='valid', data_format=None, dilation_rate=(1, 1, 1), + groups=1, activation=None, use_bias=True, kernel_initializer='glorot_uniform', @@ -728,6 +768,7 @@ class Conv3D(Conv): padding=padding, data_format=data_format, dilation_rate=dilation_rate, + groups=groups, activation=activations.get(activation), use_bias=use_bias, kernel_initializer=initializers.get(kernel_initializer), diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index 528bc14adf4..9b3afc2274c 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -26,6 +26,9 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import random_ops from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils from tensorflow.python.platform import test @@ -56,11 +59,15 @@ class Conv1DTest(keras_parameterized.TestCase): ('padding_causal', {'padding': 'causal'}, (None, 7, 2)), ('strides', {'strides': 2}, (None, 3, 2)), ('dilation_rate', {'dilation_rate': 2}, (None, 3, 2)), + # Only runs on GPU with CUDA, groups are not supported on CPU. + # https://github.com/tensorflow/tensorflow/issues/29005 + ('group', {'groups': 3, 'filters': 6}, (None, 5, 6), True), ) - def test_conv1d(self, kwargs, expected_output_shape): - kwargs['filters'] = 2 + def test_conv1d(self, kwargs, expected_output_shape, requires_gpu=False): + kwargs['filters'] = kwargs.get('filters', 2) kwargs['kernel_size'] = 3 - self._run_test(kwargs, expected_output_shape) + if not requires_gpu or test.is_gpu_available(cuda_only=True): + self._run_test(kwargs, expected_output_shape) def test_conv1d_regularizers(self): kwargs = { @@ -156,12 +163,15 @@ class Conv2DTest(keras_parameterized.TestCase): ('dilation_rate', {'dilation_rate': (2, 2)}, (None, 3, 2, 2)), # Only runs on GPU with CUDA, channels_first is not supported on CPU. # TODO(b/62340061): Support channels_first on CPU. - ('data_format', {'data_format': 'channels_first'}), + ('data_format', {'data_format': 'channels_first'}, None, True), + # Only runs on GPU with CUDA, groups are not supported on CPU. 
+ # https://github.com/tensorflow/tensorflow/issues/29005 + ('group', {'groups': 3, 'filters': 6}, (None, 5, 4, 6), True), ) - def test_conv2d(self, kwargs, expected_output_shape=None): - kwargs['filters'] = 2 + def test_conv2d(self, kwargs, expected_output_shape=None, requires_gpu=False): + kwargs['filters'] = kwargs.get('filters', 2) kwargs['kernel_size'] = (3, 3) - if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True): + if not requires_gpu or test.is_gpu_available(cuda_only=True): self._run_test(kwargs, expected_output_shape) def test_conv2d_regularizers(self): @@ -229,12 +239,15 @@ class Conv3DTest(keras_parameterized.TestCase): ('dilation_rate', {'dilation_rate': (2, 2, 2)}, (None, 1, 3, 2, 2)), # Only runs on GPU with CUDA, channels_first is not supported on CPU. # TODO(b/62340061): Support channels_first on CPU. - ('data_format', {'data_format': 'channels_first'}), + ('data_format', {'data_format': 'channels_first'}, None, True), + # Only runs on GPU with CUDA, groups are not supported on CPU. + # https://github.com/tensorflow/tensorflow/issues/29005 + ('group', {'groups': 3, 'filters': 6}, (None, 3, 5, 4, 6), True), ) - def test_conv3d(self, kwargs, expected_output_shape=None): - kwargs['filters'] = 2 + def test_conv3d(self, kwargs, expected_output_shape=None, requires_gpu=False): + kwargs['filters'] = kwargs.get('filters', 2) kwargs['kernel_size'] = (3, 3, 3) - if 'data_format' not in kwargs or test.is_gpu_available(cuda_only=True): + if not requires_gpu or test.is_gpu_available(cuda_only=True): self._run_test(kwargs, expected_output_shape) def test_conv3d_regularizers(self): @@ -298,6 +311,56 @@ class Conv3DTest(keras_parameterized.TestCase): input_data=input_data) +class GroupedConvTest(keras_parameterized.TestCase): + + @parameterized.named_parameters( + ('Conv1D', keras.layers.Conv1D), + ('Conv2D', keras.layers.Conv2D), + ('Conv3D', keras.layers.Conv3D), + ) + def test_group_conv_incorrect_use(self, layer): + with self.assertRaisesRegexp(ValueError, 'The number of filters'): + layer(16, 3, groups=3) + with self.assertRaisesRegexp(ValueError, 'The number of input channels'): + layer(16, 3, groups=4).build((32, 12, 12, 3)) + + @parameterized.named_parameters( + ('Conv1D', keras.layers.Conv1D, (32, 12, 32)), + ('Conv2D', keras.layers.Conv2D, (32, 12, 12, 32)), + ('Conv3D', keras.layers.Conv3D, (32, 12, 12, 12, 32)), + ) + def test_group_conv(self, layer_cls, input_shape): + if test.is_gpu_available(cuda_only=True): + with test_util.use_gpu(): + inputs = random_ops.random_uniform(shape=input_shape) + + layer = layer_cls(16, 3, groups=4, use_bias=False) + layer.build(input_shape) + + input_slices = array_ops.split(inputs, 4, axis=-1) + weight_slices = array_ops.split(layer.kernel, 4, axis=-1) + expected_outputs = array_ops.concat([ + nn.convolution_v2(inputs, weights) + for inputs, weights in zip(input_slices, weight_slices) + ], axis=-1) + + self.assertAllClose(layer(inputs), expected_outputs, rtol=1e-5) + + def test_group_conv_depthwise(self): + if test.is_gpu_available(cuda_only=True): + with test_util.use_gpu(): + inputs = random_ops.random_uniform(shape=(3, 27, 27, 32)) + + layer = keras.layers.Conv2D(32, 3, groups=32, use_bias=False) + layer.build((3, 27, 27, 32)) + + weights_dw = array_ops.reshape(layer.kernel, [3, 3, 32, 1]) + expected_outputs = nn.depthwise_conv2d( + inputs, weights_dw, strides=[1, 1, 1, 1], padding='VALID') + + self.assertAllClose(layer(inputs), expected_outputs, rtol=1e-5) + + @keras_parameterized.run_all_keras_modes class 
Conv1DTransposeTest(keras_parameterized.TestCase): diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 24ee94fac48..0c153b7277d 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -1242,12 +1242,12 @@ class Convolution(object): spatial_dims = range( num_batch_dims + 1, num_spatial_dims + num_batch_dims + 1) - if not input_channels_dim.is_compatible_with( - filter_shape[num_spatial_dims]): - raise ValueError( - "Number of input channels does not match corresponding dimension of " - "filter, {} != {}".format(input_channels_dim, - filter_shape[num_spatial_dims])) + filter_dim = tensor_shape.dimension_at_index(filter_shape, num_spatial_dims) + if not (input_channels_dim % filter_dim).is_compatible_with(0): + raise ValueError("The number of input channels is not divisible by the " + "corresponding number of output filters. Received: " + "input channels={}, output filters={}".format( + input_channels_dim, filter_dim)) strides, dilation_rate = _get_strides_and_dilation_rate( num_spatial_dims, strides, dilation_rate) @@ -2051,9 +2051,9 @@ def conv2d_v2(input, # pylint: disable=redefined-builtin Must have `strides[0] = strides[3] = 1`. For the most common case of the same horizontal and vertical strides, `strides = [1, stride, stride, 1]`. - + Usage Example: - + >>> x_in = np.array([[ ... [[2], [1], [2], [0], [1]], ... [[1], [3], [2], [2], [3]], @@ -3551,7 +3551,7 @@ def softmax(logits, axis=None, name=None, dim=None): Tensor. RuntimeError: If a registered conversion function returns an invalid value. - + """ axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim) if axis is None: diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt index cb2f7f03e56..3319122f50b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt index 9d847c759a1..30736595ce5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: 
"__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt index 68cbf32998e..84527b26a39 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt index 76d66200fbc..ccb783c33bf 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', 
\'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt index 8d874ede685..179acdc7966 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt index f97c7617dbd..b7db8afd065 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt index cb2f7f03e56..3319122f50b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt index 9d847c759a1..30736595ce5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt index 68cbf32998e..84527b26a39 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', 
\'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt index 76d66200fbc..ccb783c33bf 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'valid\', \'channels_last\', \'1\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt index 8d874ede685..179acdc7966 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', 
\'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1)\', \'valid\', \'None\', \'(1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt index f97c7617dbd..b7db8afd065 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt @@ -114,7 +114,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'filters\', \'kernel_size\', \'strides\', \'padding\', \'data_format\', \'dilation_rate\', \'groups\', \'activation\', \'use_bias\', \'kernel_initializer\', \'bias_initializer\', \'kernel_regularizer\', \'bias_regularizer\', \'activity_regularizer\', \'kernel_constraint\', \'bias_constraint\'], varargs=None, keywords=kwargs, defaults=[\'(1, 1, 1)\', \'valid\', \'None\', \'(1, 1, 1)\', \'1\', \'None\', \'True\', \'glorot_uniform\', \'zeros\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "add_loss" From 18aaa18cf12f145841639cbcebeab508446cad33 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Fri, 22 May 2020 17:45:02 -0700 Subject: [PATCH 1059/1533] Added proper Batch dimension support in CalculateOutputShape for Concat. 
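The change below is to the TFLite GPU delegate's C++ shape calculation; as a hedged illustration of the same rule through the TF Python API, concatenation along the batch axis sums that axis and requires every other dimension to match. Shapes below are illustrative only.

```python
import tensorflow as tf

a = tf.zeros((2, 8, 8, 3))                 # BHWC
b = tf.zeros((4, 8, 8, 3))
print(tf.concat([a, b], axis=0).shape)     # (6, 8, 8, 3): batch sizes add up

c = tf.zeros((4, 8, 8, 5))                 # channel count differs
try:
  tf.concat([a, c], axis=0)
except (tf.errors.InvalidArgumentError, ValueError):
  print("rejected: non-concat dimensions must match")
```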
PiperOrigin-RevId: 312867341 Change-Id: I089c71c5e913d089488f80a923caa81f6f156f7b --- .../lite/delegates/gpu/common/operations.cc | 58 ++++++++++++++----- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc index bdcf6f605cc..8fcbe379e11 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.cc +++ b/tensorflow/lite/delegates/gpu/common/operations.cc @@ -534,9 +534,10 @@ absl::Status CalculateOutputShape(const std::vector& input, switch (attr.axis) { case Axis::CHANNELS: for (int i = 1; i < input.size(); i++) { - if (input[i].h != new_shape.h || input[i].w != new_shape.w) { + if (input[i].h != new_shape.h || input[i].w != new_shape.w || + input[i].b != new_shape.b) { return absl::InvalidArgumentError( - "Height and Width must be the same when concatenating " + "Height, Width and Batch must be the same when concatenating " "by channels axis"); } new_shape.c += input[i].c; @@ -544,9 +545,10 @@ absl::Status CalculateOutputShape(const std::vector& input, break; case Axis::HEIGHT: for (int i = 1; i < input.size(); i++) { - if (input[i].w != new_shape.w || input[i].c != new_shape.c) { + if (input[i].w != new_shape.w || input[i].c != new_shape.c || + input[i].b != new_shape.b) { return absl::InvalidArgumentError( - "Channels and Width must be the same when concatenating " + "Channels, Width and Batch must be the same when concatenating " "by height axis"); } new_shape.h += input[i].h; @@ -554,14 +556,26 @@ absl::Status CalculateOutputShape(const std::vector& input, break; case Axis::WIDTH: for (int i = 1; i < input.size(); i++) { - if (input[i].h != new_shape.h || input[i].c != new_shape.c) { + if (input[i].h != new_shape.h || input[i].c != new_shape.c || + input[i].b != new_shape.b) { return absl::InvalidArgumentError( - "Height and Channels must be the same when concatenating " + "Height, Channels and Batch must be the same when concatenating " "by width axis"); } new_shape.w += input[i].w; } break; + case Axis::BATCH: + for (int i = 1; i < input.size(); i++) { + if (input[i].h != new_shape.h || input[i].c != new_shape.c || + input[i].w != new_shape.w) { + return absl::InvalidArgumentError( + "Width, Height and Channels must be the same when concatenating " + "by batch axis"); + } + new_shape.b += input[i].b; + } + break; default: return absl::InvalidArgumentError("Invalid axis"); break; @@ -578,9 +592,10 @@ absl::Status CalculateOutputShape(const std::vector& input, case Axis::CHANNELS: for (int i = 1; i < input.size(); ++i) { if (input[i].h != new_shape.h || input[i].w != new_shape.w || - input[i].d != new_shape.d) { + input[i].d != new_shape.d || input[i].b != new_shape.b) { return absl::InvalidArgumentError( - "Height, Width and Depth must be the same when concatenating " + "Height, Width, Batch and Depth must be the same when " + "concatenating " "by channels axis"); } new_shape.c += input[i].c; @@ -589,9 +604,10 @@ absl::Status CalculateOutputShape(const std::vector& input, case Axis::HEIGHT: for (int i = 1; i < input.size(); ++i) { if (input[i].w != new_shape.w || input[i].c != new_shape.c || - input[i].d != new_shape.d) { + input[i].d != new_shape.d || input[i].b != new_shape.b) { return absl::InvalidArgumentError( - "Width, Depth and Channels must be the same when concatenating " + "Width, Depth, Batch and Channels must be the same when " + "concatenating " "by height axis"); } new_shape.h += input[i].h; @@ -600,9 +616,10 @@ absl::Status CalculateOutputShape(const 
std::vector& input, case Axis::WIDTH: for (int i = 1; i < input.size(); ++i) { if (input[i].h != new_shape.h || input[i].c != new_shape.c || - input[i].d != new_shape.d) { + input[i].d != new_shape.d || input[i].b != new_shape.b) { return absl::InvalidArgumentError( - "Height, Depth and Channels must be the same when concatenating " + "Height, Depth, Batch and Channels must be the same when " + "concatenating " "by width axis"); } new_shape.w += input[i].w; @@ -611,14 +628,27 @@ absl::Status CalculateOutputShape(const std::vector& input, case Axis::DEPTH: for (int i = 1; i < input.size(); ++i) { if (input[i].w != new_shape.w || input[i].h != new_shape.h || - input[i].c != new_shape.c) { + input[i].c != new_shape.c || input[i].b != new_shape.b) { return absl::InvalidArgumentError( - "Width, Height and Channels must be the same when concatenating " + "Width, Height, Batch and Channels must be the same when " + "concatenating " "by depth axis"); } new_shape.d += input[i].d; } break; + case Axis::BATCH: + for (int i = 1; i < input.size(); ++i) { + if (input[i].w != new_shape.w || input[i].h != new_shape.h || + input[i].c != new_shape.c || input[i].d != new_shape.d) { + return absl::InvalidArgumentError( + "Width, Height, Depth and Channels must be the same when " + "concatenating " + "by batch axis"); + } + new_shape.b += input[i].b; + } + break; default: return absl::InvalidArgumentError("Invalid axis"); } From 215161ff7a7cd136feac27686561bcedbdaef493 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Sat, 23 May 2020 02:07:32 +0100 Subject: [PATCH 1060/1533] Disable 3d group conv test on XLA:GPU --- tensorflow/python/keras/layers/convolutional_test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index 9b3afc2274c..92ec0c0dc0b 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -218,7 +218,7 @@ class Conv2DTest(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes class Conv3DTest(keras_parameterized.TestCase): - def _run_test(self, kwargs, expected_output_shape): + def _run_test(self, kwargs, expected_output_shape, validate_training=True): num_samples = 2 stack_size = 3 num_row = 7 @@ -230,7 +230,8 @@ class Conv3DTest(keras_parameterized.TestCase): keras.layers.Conv3D, kwargs=kwargs, input_shape=(num_samples, depth, num_row, num_col, stack_size), - expected_output_shape=expected_output_shape) + expected_output_shape=expected_output_shape, + validate_training=validate_training) @parameterized.named_parameters( ('padding_valid', {'padding': 'valid'}, (None, 3, 5, 4, 2)), @@ -247,8 +248,10 @@ class Conv3DTest(keras_parameterized.TestCase): def test_conv3d(self, kwargs, expected_output_shape=None, requires_gpu=False): kwargs['filters'] = kwargs.get('filters', 2) kwargs['kernel_size'] = (3, 3, 3) + # train_on_batch currently fails with XLA enabled on GPUs + test_training = 'groups' not in kwargs or not test_util.is_xla_enabled() if not requires_gpu or test.is_gpu_available(cuda_only=True): - self._run_test(kwargs, expected_output_shape) + self._run_test(kwargs, expected_output_shape, test_training) def test_conv3d_regularizers(self): kwargs = { From f46014143459ba1ada182172ea06a9db9bce9808 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Fri, 22 May 2020 18:14:56 -0700 Subject: [PATCH 1061/1533] [XLA] Move sharding propagation to third party This also moves some 
utilities of interpreting convolutions as dots. PiperOrigin-RevId: 312868839 Change-Id: I90bdc30217edf6dfb301a9c80b7155653391fa1a --- tensorflow/compiler/xla/service/BUILD | 60 + .../xla/service/dot_as_convolution_util.cc | 139 ++ .../xla/service/dot_as_convolution_util.h | 68 + .../xla/service/sharding_propagation.cc | 1478 +++++++++++++++++ .../xla/service/sharding_propagation.h | 50 + .../xla/service/sharding_propagation_test.cc | 1329 +++++++++++++++ tensorflow/compiler/xla/service/spmd/BUILD | 1 + .../xla/service/spmd/spmd_partitioner.cc | 41 + 8 files changed, 3166 insertions(+) create mode 100644 tensorflow/compiler/xla/service/dot_as_convolution_util.cc create mode 100644 tensorflow/compiler/xla/service/dot_as_convolution_util.h create mode 100644 tensorflow/compiler/xla/service/sharding_propagation.cc create mode 100644 tensorflow/compiler/xla/service/sharding_propagation.h create mode 100644 tensorflow/compiler/xla/service/sharding_propagation_test.cc diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 1591b3a95ba..125a42bb2f9 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -491,6 +491,66 @@ tf_cc_test( ], ) +cc_library( + name = "sharding_propagation", + srcs = [ + "sharding_propagation.cc", + ], + hdrs = [ + "sharding_propagation.h", + ], + deps = [ + ":dot_as_convolution_util", + ":hlo", + ":hlo_graph_dumper", + ":hlo_pass", + ":hlo_sharding_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/core:lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "sharding_propagation_test", + srcs = [ + "sharding_propagation_test.cc", + ], + deps = [ + "hlo_matchers", + ":hlo_parser", + ":sharding_propagation", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + +cc_library( + name = "dot_as_convolution_util", + srcs = [ + "dot_as_convolution_util.cc", + ], + hdrs = [ + "dot_as_convolution_util.h", + ], + deps = [ + ":hlo", + ":shape_inference", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "@com_google_absl//absl/types:optional", + ], +) + tf_cc_test( name = "dynamic_parameter_binding_test", srcs = ["dynamic_parameter_binding_test.cc"], diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.cc b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc new file mode 100644 index 00000000000..fcdf85d5ecb --- /dev/null +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.cc @@ -0,0 +1,139 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
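For intuition about the new `dot_as_convolution_util` helper, a hedged NumPy sketch of the underlying observation: a convolution whose spatial windows are trivial (here a 1x1 kernel with no padding or dilation) computes the same result as a dot over the channel dimension. Shapes and names below are illustrative only.

```python
import numpy as np

batch, h, w, cin, cout = 2, 4, 4, 3, 5
x = np.random.rand(batch, h, w, cin).astype(np.float32)
k = np.random.rand(1, 1, cin, cout).astype(np.float32)   # 1x1 kernel

# With a 1x1 window the "convolution" is just a matmul over channels.
as_dot = (x.reshape(-1, cin) @ k.reshape(cin, cout)).reshape(batch, h, w, cout)

# Reference: the same contraction written convolution-style as an einsum.
ref = np.einsum("bhwc,ijcd->bhwd", x, k)
print(np.allclose(as_dot, ref, atol=1e-5))                # True
```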
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/status_macros.h" + +namespace xla { +namespace dot_as_convolution_util { + +/* static */ absl::optional +ParseDotGeneralFromConvolution(const HloInstruction* conv) { + CHECK_EQ(conv->opcode(), HloOpcode::kConvolution); + if (conv->feature_group_count() != 1 || conv->batch_group_count() != 1) { + return absl::nullopt; + } + const auto& conv_dims = conv->convolution_dimension_numbers(); + DotGeneralAsConvolutionDimsInfo dims; + dims.lhs_non_contracting_dims.push_back( + {conv_dims.input_batch_dimension(), -1, + conv_dims.output_batch_dimension(), -1}); + dims.rhs_non_contracting_dims.push_back( + {-1, conv_dims.kernel_output_feature_dimension(), + conv_dims.output_feature_dimension(), -1}); + dims.contracting_dims.push_back({conv_dims.input_feature_dimension(), + conv_dims.kernel_input_feature_dimension(), + -1, -1}); + + for (int64 i = 0; i < conv_dims.input_spatial_dimensions_size(); ++i) { + int64 lhs = conv_dims.input_spatial_dimensions(i); + int64 lhs_size = conv->operand(0)->shape().dimensions(lhs); + int64 rhs = conv_dims.kernel_spatial_dimensions(i); + int64 rhs_size = conv->operand(1)->shape().dimensions(rhs); + int64 output = conv_dims.output_spatial_dimensions(i); + const auto& wd = conv->window().dimensions(i); + if (lhs_size == wd.size() && + std::max(1, lhs_size - 1) == wd.stride() && + lhs_size == wd.base_dilation() && wd.window_dilation() == 1 && + wd.padding_high() == 0 && wd.padding_low() == 0 && + !wd.window_reversal()) { + // A batch dimension in DotGeneral is represented as a spatial dimension + // with window size B (batch dimension size), stride B - 1, and base + // dilation B. + dims.batch_dims.push_back({lhs, rhs, output, i}); + } else if (lhs_size == wd.size() && wd.base_dilation() == 1 && + wd.window_dilation() == 1 && wd.padding_high() == 0 && + wd.padding_low() == 0 && !wd.window_reversal()) { + // A contracting dimension be represented as a spatial dimension with + // window size C (contracting dimension size). Stride can be any size + // since there is only one window. + dims.contracting_dims.push_back({lhs, rhs, output, i}); + } else if (wd.stride() == 1 && wd.window_dilation() == 1 && + wd.base_dilation() == 1) { + if (rhs_size == 1 && wd.size() == 1 && wd.padding_high() == 0 && + wd.padding_low() == 0 && !wd.window_reversal()) { + // A LHS non-contracting dimension can be represented as a spatial + // dimension with window size 1. + dims.lhs_non_contracting_dims.push_back({lhs, rhs, output, i}); + } else if (lhs_size == 1 && wd.size() == rhs_size && + wd.padding_high() == rhs_size - 1 && + wd.padding_low() == rhs_size - 1 && wd.window_reversal()) { + // A RHS non-contracting dimension can be represented as a spatial + // dimension with window size N (non-contracting dimension size), low + // padding N - 1, high padding N - 1 and window reversal. 
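To make the mapping above concrete: under the rules just described, a dot batch dimension of size 8 would appear in the convolution window as size 8, stride 7 and base dilation 8, while a contracting dimension of size 16 would appear as a plain window of size 16. A small standalone sketch of that arithmetic (illustrative only; the WindowDim struct below is hypothetical and not part of this patch):

#include <cassert>
#include <cstdint>

// Hypothetical mirror of the few window fields the parser above inspects.
struct WindowDim {
  int64_t size;
  int64_t stride;
  int64_t base_dilation;
};

// Window parameters that encode a dot batch dimension of size `b`.
WindowDim BatchDimWindow(int64_t b) {
  return {/*size=*/b, /*stride=*/b > 1 ? b - 1 : 1, /*base_dilation=*/b};
}

// Window parameters that encode a dot contracting dimension of size `c`.
WindowDim ContractingDimWindow(int64_t c) {
  return {/*size=*/c, /*stride=*/1, /*base_dilation=*/1};
}

int main() {
  WindowDim batch = BatchDimWindow(8);
  assert(batch.size == 8 && batch.stride == 7 && batch.base_dilation == 8);
  WindowDim contracting = ContractingDimWindow(16);
  assert(contracting.size == 16 && contracting.base_dilation == 1);
  return 0;
}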
+ dims.rhs_non_contracting_dims.push_back({lhs, rhs, output, i}); + } else { + return absl::nullopt; + } + } else { + return absl::nullopt; + } + } + + return dims; +} + +StatusOr> +CreateShardedConvForDotGeneralConvolution( + const HloInstruction& conv, + const DotGeneralAsConvolutionDimsInfo& dot_dnums, + HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo) { + CHECK_EQ(conv.opcode(), HloOpcode::kConvolution); + const auto& conv_dnums = conv.convolution_dimension_numbers(); + auto window = conv.window(); + for (const auto& dim : dot_dnums.batch_dims) { + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_lhs_hlo->shape().dimensions( + conv_dnums.input_spatial_dimensions(dim.spatial_dim))); + wd->set_stride(std::max(1, wd->size() - 1)); + wd->set_base_dilation(wd->size()); + } + for (const auto& dim : dot_dnums.contracting_dims) { + if (dim.spatial_dim < 0) { + continue; + } + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_lhs_hlo->shape().dimensions( + conv_dnums.input_spatial_dimensions(dim.spatial_dim))); + } + for (const auto& dim : dot_dnums.rhs_non_contracting_dims) { + if (dim.spatial_dim < 0) { + continue; + } + auto wd = window.mutable_dimensions(dim.spatial_dim); + wd->set_size(sharded_rhs_hlo->shape().dimensions( + conv_dnums.kernel_spatial_dimensions(dim.spatial_dim))); + wd->set_padding_high(wd->size() - 1); + wd->set_padding_low(wd->size() - 1); + } + TF_ASSIGN_OR_RETURN(Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + sharded_lhs_hlo->shape(), sharded_rhs_hlo->shape(), + /*feature_group_count=*/1, + /*batch_group_count=*/1, window, conv_dnums)); + *sharded_conv_shape.mutable_layout() = conv.shape().layout(); + return HloInstruction::CreateConvolve( + sharded_conv_shape, sharded_lhs_hlo, sharded_rhs_hlo, + /*feature_group_count=*/1, + /*batch_group_count=*/1, window, conv_dnums, conv.precision_config()); +} + +} // namespace dot_as_convolution_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/dot_as_convolution_util.h b/tensorflow/compiler/xla/service/dot_as_convolution_util.h new file mode 100644 index 00000000000..a3e829a3d31 --- /dev/null +++ b/tensorflow/compiler/xla/service/dot_as_convolution_util.h @@ -0,0 +1,68 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DOT_AS_CONVOLUTION_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_DOT_AS_CONVOLUTION_UTIL_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" + +namespace xla { +namespace dot_as_convolution_util { + +// Describes the dimensions of a convolution that can be interpreted as a dot. +struct DotGeneralAsConvolutionDimsInfo { + // The dimension numbers for the operands and output corresponding to a + // logical dimension (e.g., batch, contracting, non-contracting). 
If an + // operand or the output doesn't have the logical dimension, it is set to + // -1. + struct DimNums { + int64 lhs; + int64 rhs; + int64 output; + // The corresponding spatial dimension in the convolution's config. Set to + // -1 if it's not mapped to a spatial dimension. + int64 spatial_dim; + }; + std::vector batch_dims; + std::vector contracting_dims; + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; +}; + +// Parses a convolution and returns a DotGeneralAsConvolutionDimsInfo if it can +// be interpreted as a dot, or absl::nullopt otherwise. +absl::optional ParseDotGeneralFromConvolution( + const HloInstruction* conv); + +// Creates sharded convolution instruction that can be interpreted as a dot. +// This is a utility for per-op partitioners. +// - 'conv' is the original convolution instruction. +// - 'dot_dnums' is the result of ParseDotGeneralFromConvolution() for 'conv'. +// - 'sharded_lhs_hlo' and 'sharded_rhs_hlo' are sharded inputs for the result +// convolution instruction. +StatusOr> +CreateShardedConvForDotGeneralConvolution( + const HloInstruction& conv, + const DotGeneralAsConvolutionDimsInfo& dot_dnums, + HloInstruction* sharded_lhs_hlo, HloInstruction* sharded_rhs_hlo); + +} // namespace dot_as_convolution_util +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_DOT_AS_CONVOLUTION_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/sharding_propagation.cc b/tensorflow/compiler/xla/service/sharding_propagation.cc new file mode 100644 index 00000000000..bee2e04fabf --- /dev/null +++ b/tensorflow/compiler/xla/service/sharding_propagation.cc @@ -0,0 +1,1478 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/sharding_propagation.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" +#include "absl/strings/str_split.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +using ComputationMap = + absl::flat_hash_map; + +// Returns true iff the specified hlo or sharding has a spatially partitioned +// sharding (tiled or replicated) what can be propagated by sharding +// propagation. +bool IsSpatiallyPartitioned(const HloSharding& sharding) { + if (sharding.IsTuple()) { + return absl::c_any_of(sharding.tuple_elements(), IsSpatiallyPartitioned); + } else { + return !sharding.IsTileMaximal() || sharding.IsReplicated(); + } +} +bool IsSpatiallyPartitioned(const HloInstruction* hlo) { + return hlo->has_sharding() && IsSpatiallyPartitioned(hlo->sharding()); +} + +// Returns true if the lhs sharding is preferable over the rhs sharding. +// The most specific sharding is tile maximal followed by single device tile +// maximal and finally replicated. This order aims to primarily reduce memory +// usage and secondly reduce total compute. +// Note: This does NOT provide a total ordering as we can have 2 different +// sharding with same preference level. +bool IsShardingMoreSpecific(const HloSharding& lhs, const HloSharding& rhs) { + CHECK_EQ(lhs.IsTuple(), rhs.IsTuple()); + if (lhs.IsTuple()) { + // For tuples we consider lhs to have a better sharding if none of the + // elements are worse and at least one element is better then in rhs + // sharding. + const auto& lhs_shardings = lhs.tuple_elements(); + const auto& rhs_shardings = rhs.tuple_elements(); + CHECK_EQ(lhs_shardings.size(), rhs_shardings.size()); + bool is_better = false; + for (int64 i = 0; i < lhs_shardings.size(); ++i) { + if (IsShardingMoreSpecific(rhs_shardings[i], lhs_shardings[i])) { + return false; + } + if (IsShardingMoreSpecific(lhs_shardings[i], rhs_shardings[i])) { + is_better = true; + } + } + return is_better; + } + if (!rhs.IsTileMaximal()) { + // If we already have a non-tile-maximal sharding then we can't improve + // that. + return false; + } else if (!rhs.IsReplicated()) { + // If we are not replicated then only tiled (not tile maximal) shardings + // can improve us. + return !lhs.IsTileMaximal(); + } else { + // If we are replicated then any non-replicated sharding can improve us. 
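Read as an ordering, the checks in IsShardingMoreSpecific amount to: a tiled (non-tile-maximal) sharding beats a single-device assignment, which in turn beats replication, and equal levels never improve on each other. A standalone sketch of just that ordering (plain C++, not XLA code; the enum is illustrative only):

#include <cassert>

// Illustrative only: the three coarse levels the comparison distinguishes.
enum class Level { kReplicated = 0, kSingleDevice = 1, kTiled = 2 };

bool MoreSpecific(Level lhs, Level rhs) {
  return static_cast<int>(lhs) > static_cast<int>(rhs);
}

int main() {
  assert(MoreSpecific(Level::kTiled, Level::kSingleDevice));
  assert(MoreSpecific(Level::kSingleDevice, Level::kReplicated));
  assert(!MoreSpecific(Level::kReplicated, Level::kReplicated));
  return 0;
}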
+ return !lhs.IsReplicated(); + } +} + +// Returns a sharding where each tuple element is chosen as the more specific +// one of the corresponding elements in a and b. Requires a and b to have the +// same tuple nesting. +HloSharding MergeForMoreSpecificSharding(const HloSharding& a, + const HloSharding& b) { + if (a.IsTuple()) { + HloSharding result = a; + CHECK(b.IsTuple()); + CHECK_EQ(a.tuple_elements().size(), b.tuple_elements().size()); + for (int64 i = 0; i < result.tuple_elements().size(); ++i) { + result.tuple_elements()[i] = MergeForMoreSpecificSharding( + a.tuple_elements()[i], b.tuple_elements()[i]); + } + return result; + } + return IsShardingMoreSpecific(a, b) ? a : b; +} + +// Updates the sharding of the specified instruction with the specified sharding +// if it is better than the current one and returns true if a new sharding has +// been applied. +bool MaybeImproveInstructionSharding(const HloSharding& sharding, + HloInstruction* instruction) { + // We don't want to propagate tile maximal shardings. + if (!IsSpatiallyPartitioned(sharding)) { + return false; + } + // Any sharding is better than no sharding. + if (!instruction->has_sharding()) { + instruction->set_sharding(sharding); + return true; + } + if (IsShardingMoreSpecific(sharding, instruction->sharding())) { + instruction->set_sharding(sharding); + return true; + } + return false; +} + +// Sets the sharding for every element within a tuple to replicated (default +// sharding). This is necessary because there is no way to represent a tuple +// sharding when only some of the elements are sharded. +void SetDefaultTupleSharding(HloInstruction* instruction) { + instruction->set_sharding( + HloSharding::SingleTuple(instruction->shape(), HloSharding::Replicate())); +} + +// We consider a convolution kernel to be small iff it is smaller along all +// spatial dimensions than the output of the convolution. The rationale is that +// we can either shard the kernel or the output and we want to shard the larger +// one for better efficiency. +bool IsConvolutionKernelSmall(const HloInstruction* instruction) { + CHECK_EQ(instruction->opcode(), HloOpcode::kConvolution); + const HloInstruction* rhs = instruction->operand(1); + const auto& dnums = instruction->convolution_dimension_numbers(); + for (int64 i = 0; i < dnums.input_spatial_dimensions().size(); ++i) { + int64 kernel_dim = + rhs->shape().dimensions(dnums.kernel_spatial_dimensions(i)); + int64 output_dim = + instruction->shape().dimensions(dnums.output_spatial_dimensions(i)); + if (kernel_dim >= output_dim) { + return false; + } + } + return true; +} + +// Return the operand which is the most suitable for determining the sharding +// for the specified instruction or nullptr if there isn't any suitable operand. +const HloInstruction* PickRepresentativeOperand( + const HloInstruction* instruction) { + switch (instruction->opcode()) { + case HloOpcode::kMap: + case HloOpcode::kPad: + case HloOpcode::kPower: + case HloOpcode::kReverse: + case HloOpcode::kSlice: + case HloOpcode::kShiftLeft: + case HloOpcode::kShiftRightArithmetic: + case HloOpcode::kShiftRightLogical: + // For these opcodes the output sharding has to be determined by the + // sharding of the first operand but we can only determine sharding based + // on it if it already has a sharding.
+ if (instruction->operand(0)->has_sharding()) { + return instruction->operand(0); + } + return nullptr; + case HloOpcode::kAbs: + case HloOpcode::kAdd: + case HloOpcode::kAnd: + case HloOpcode::kAtan2: + case HloOpcode::kBitcastConvert: + case HloOpcode::kCeil: + case HloOpcode::kClamp: + case HloOpcode::kClz: + case HloOpcode::kCompare: + case HloOpcode::kComplex: + case HloOpcode::kConcatenate: + case HloOpcode::kConvert: + case HloOpcode::kCopy: + case HloOpcode::kCos: + case HloOpcode::kAllGather: + case HloOpcode::kAllReduce: + case HloOpcode::kAllToAll: + case HloOpcode::kCollectivePermute: + case HloOpcode::kDivide: + case HloOpcode::kExp: + case HloOpcode::kExpm1: + case HloOpcode::kFloor: + case HloOpcode::kImag: + case HloOpcode::kIsFinite: + case HloOpcode::kLog: + case HloOpcode::kLog1p: + case HloOpcode::kMaximum: + case HloOpcode::kMinimum: + case HloOpcode::kMultiply: + case HloOpcode::kNegate: + case HloOpcode::kNot: + case HloOpcode::kOr: + case HloOpcode::kPopulationCount: + case HloOpcode::kReal: + case HloOpcode::kReducePrecision: + case HloOpcode::kRemainder: + case HloOpcode::kRoundNearestAfz: + case HloOpcode::kRsqrt: + case HloOpcode::kSelect: + case HloOpcode::kSign: + case HloOpcode::kSin: + case HloOpcode::kSort: + case HloOpcode::kSqrt: + case HloOpcode::kCbrt: + case HloOpcode::kSubtract: + case HloOpcode::kTanh: + case HloOpcode::kTupleSelect: + case HloOpcode::kWhile: + case HloOpcode::kXor: { + // For these opcodes the output sharding can be determined by any operand + // so we find the operand with the most specific sharding. + const HloInstruction* best_operand = nullptr; + for (const HloInstruction* operand : instruction->operands()) { + if (operand->has_sharding() && + (best_operand == nullptr || + IsShardingMoreSpecific(operand->sharding(), + best_operand->sharding()))) { + best_operand = operand; + } + } + return best_operand; + } + + // There is no suitable operand for the rest of the opcodes. 
+ case HloOpcode::kAddDependency: + case HloOpcode::kAfterAll: + case HloOpcode::kBatchNormGrad: + case HloOpcode::kBatchNormInference: + case HloOpcode::kBatchNormTraining: + case HloOpcode::kBitcast: + case HloOpcode::kBroadcast: + case HloOpcode::kCall: + case HloOpcode::kCholesky: + case HloOpcode::kCollectivePermuteDone: + case HloOpcode::kCollectivePermuteStart: + case HloOpcode::kConditional: + case HloOpcode::kConstant: + case HloOpcode::kConvolution: + case HloOpcode::kCopyDone: + case HloOpcode::kCopyStart: + case HloOpcode::kCustomCall: + case HloOpcode::kDomain: + case HloOpcode::kDot: + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kFft: + case HloOpcode::kFusion: + case HloOpcode::kGather: + case HloOpcode::kGetTupleElement: + case HloOpcode::kInfeed: + case HloOpcode::kIota: + case HloOpcode::kOutfeed: + case HloOpcode::kParameter: + case HloOpcode::kPartitionId: + case HloOpcode::kRecv: + case HloOpcode::kRecvDone: + case HloOpcode::kReduce: + case HloOpcode::kReduceWindow: + case HloOpcode::kReplicaId: + case HloOpcode::kReshape: + case HloOpcode::kRng: + case HloOpcode::kRngGetAndUpdateState: + case HloOpcode::kRngBitGenerator: + case HloOpcode::kScatter: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kSend: + case HloOpcode::kSendDone: + case HloOpcode::kTrace: + case HloOpcode::kTranspose: + case HloOpcode::kTriangularSolve: + case HloOpcode::kTuple: + case HloOpcode::kGetDimensionSize: + case HloOpcode::kSetDimensionSize: + return nullptr; + } +} + +bool SupportSpatialPartitioning(const HloInstruction* instruction, + const ComputationMap& computation_map, + bool is_spmd) { + if (instruction->parent()->root_instruction() == instruction && + computation_map.find(instruction->parent()) == computation_map.end()) { + // We don't support sharding the root instruction of a computation yet, + // unless the computation is a while body. + return false; + } + + if (instruction->IsElementwise() && + (instruction->opcode() != HloOpcode::kRng || is_spmd)) { + return true; + } + switch (instruction->opcode()) { + case HloOpcode::kBroadcast: + case HloOpcode::kConcatenate: + case HloOpcode::kConditional: + case HloOpcode::kConstant: + case HloOpcode::kConvolution: + case HloOpcode::kDot: + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: + case HloOpcode::kGather: + case HloOpcode::kGetTupleElement: + case HloOpcode::kInfeed: + case HloOpcode::kIota: + case HloOpcode::kPad: + case HloOpcode::kReduceWindow: + case HloOpcode::kReshape: + case HloOpcode::kScatter: + case HloOpcode::kSelectAndScatter: + case HloOpcode::kSlice: + case HloOpcode::kSort: + case HloOpcode::kTranspose: + case HloOpcode::kTuple: + case HloOpcode::kWhile: + case HloOpcode::kReduce: + return true; + case HloOpcode::kAllReduce: + // Only if channel_id is not specified. + return instruction->channel_id() == absl::nullopt; + case HloOpcode::kParameter: + return computation_map.find(instruction->parent()) != + computation_map.end(); + case HloOpcode::kReverse: + return is_spmd; + default: + return false; + } +} + +// Tries to update the sharding of the specified instruction based on its +// operands and returns true if the sharding of the instruction have been +// changed and false otherwise. 
+bool InferShardingFromOperands(HloInstruction* instruction, + const ComputationMap& computation_map, + bool is_spmd, bool aggressive_prop) { + if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) { + // If an array shaped HLO doesn't support spatial partitioning but at least + // one of its operand is replicated then we make the HLO replicated as well. + if (instruction->shape().IsTuple() || instruction->operand_count() == 0 || + instruction == instruction->parent()->root_instruction() || + instruction->HasSideEffect()) { + return false; + } + if (absl::c_any_of(instruction->operands(), [](const HloInstruction* op) { + return op->has_sharding() && op->sharding().IsReplicated(); + })) { + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + return false; + } + + switch (instruction->opcode()) { + case HloOpcode::kGetTupleElement: { + const HloInstruction* operand = instruction->operand(0); + if (!IsSpatiallyPartitioned(operand)) { + return false; + } + HloSharding new_sharding = operand->sharding().GetSubSharding( + operand->shape(), {instruction->tuple_index()}); + return MaybeImproveInstructionSharding(new_sharding, instruction); + } + case HloOpcode::kTuple: { + if (absl::c_none_of(instruction->operands(), + [](const HloInstruction* hlo) { + return IsSpatiallyPartitioned(hlo); + })) { + // None of the operands have a spatially partitioned sharding. + return false; + } + bool changed = false; + if (!instruction->has_sharding()) { + // Set the sharding for all elements in the tuple because it isn't + // possible to set a partial sharding. + SetDefaultTupleSharding(instruction); + changed = true; + } + // Go through each operand and if the operand has a sharding that is + // better than the current sharding for that tuple element then update + // it. + const Shape& shape = instruction->shape(); + std::vector sub_shardings = + instruction->sharding().tuple_elements(); + int64 sub_sharding_index = 0; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + const HloInstruction* operand = instruction->operand(i); + if (operand->has_sharding()) { + if (operand->shape().IsTuple()) { + for (int64 i = 0, e = ShapeUtil::GetLeafCount(operand->shape()); + i < e; ++i) { + if (IsShardingMoreSpecific( + operand->sharding().tuple_elements()[i], + sub_shardings[sub_sharding_index + i])) { + sub_shardings[sub_sharding_index + i] = + operand->sharding().tuple_elements()[i]; + } + } + } else { + if (IsShardingMoreSpecific(operand->sharding(), + sub_shardings[sub_sharding_index])) { + sub_shardings[sub_sharding_index] = operand->sharding(); + } + } + } + sub_sharding_index += ShapeUtil::GetLeafCount(operand->shape()); + } + + HloSharding new_sharding = HloSharding::Tuple(shape, sub_shardings); + if (new_sharding != instruction->sharding()) { + instruction->set_sharding(new_sharding); + return true; + } + return changed; + } + case HloOpcode::kReduce: { + // Reduce could have a tuple shape, where the first half of operands are + // the arrays to reduce, and the second half of operands are the init + // values. 
+ bool changed = false; + for (int64 operand_id = 0; operand_id < instruction->operand_count() / 2; + ++operand_id) { + const HloInstruction* operand = instruction->operand(operand_id); + if (!IsSpatiallyPartitioned(operand)) { + continue; + } + auto get_maybe_tuple_sharding = [&](const HloSharding& sharding) { + if (instruction->operand_count() == 2) { + return sharding; + } + std::vector tuple(instruction->operand_count() / 2, + sharding); + return HloSharding::Tuple(instruction->shape(), tuple); + }; + if (operand->sharding().IsReplicated()) { + changed |= MaybeImproveInstructionSharding( + get_maybe_tuple_sharding(HloSharding::Replicate()), instruction); + continue; + } + if (absl::c_any_of(instruction->dimensions(), [operand](int64 dim) { + return operand->sharding().tile_assignment().dim(dim) > 1; + })) { + // We are reducing along one of the sharded dimensions. We don't + // support tiled sharding in this case. + changed |= MaybeImproveInstructionSharding( + get_maybe_tuple_sharding(HloSharding::Replicate()), instruction); + } else { + // We are reducing along some of the non-sharded dimensions. The + // result sharding should be the same as the operand sharding with the + // reduction dimensions removed as they are removed from the result + // shape. + std::vector target_tile_assignment_dimensions; + const auto& dimensions = instruction->dimensions(); + for (int64 i = 0; i < operand->shape().rank(); ++i) { + if (absl::c_find(dimensions, i) == dimensions.end()) { + target_tile_assignment_dimensions.push_back( + operand->sharding().tile_assignment().dim(i)); + } + } + Array new_tile_assignment = + operand->sharding().tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + // Use the same sharding for all tuple elements, because they are part + // of the same reduce instruction. + HloSharding new_sharding = + get_maybe_tuple_sharding(HloSharding::Tile(new_tile_assignment)); + changed |= MaybeImproveInstructionSharding(new_sharding, instruction); + } + } + return changed; + } + case HloOpcode::kBroadcast: { + const HloInstruction* op = instruction->operand(0); + if (!IsSpatiallyPartitioned(op) || op->sharding().IsReplicated()) { + return false; + } + // Heuristic: If an operand is more than 8 times fewer elements than its + // output, do not propagate sharding. + if (ShapeUtil::ElementsIn(instruction->shape()) > + 8 * ShapeUtil::ElementsIn(op->shape())) { + return false; + } + // The output will be tiled along the broadcasted dimension the same way + // as the input for the broadcast while the other dimensions are kept + // non-tiled. 
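The loop that follows is plain dimension bookkeeping: every output dimension named in the broadcast's dimensions attribute inherits the operand's tile count, and every newly introduced dimension stays untiled. A standalone sketch with assumed shapes (not XLA code): an operand tiled 4 ways broadcast into a rank-2 output along dimension 0 ends up tiled {4, 1}.

#include <algorithm>
#include <cassert>
#include <vector>

// Illustrative helper mirroring the loop below: broadcast_dims[j] is the
// output dimension that operand dimension j maps to.
std::vector<int> BroadcastTileDims(const std::vector<int>& operand_tiles,
                                   const std::vector<int>& broadcast_dims,
                                   int output_rank) {
  std::vector<int> out(output_rank, 1);  // new dimensions stay untiled
  for (int i = 0; i < output_rank; ++i) {
    auto it = std::find(broadcast_dims.begin(), broadcast_dims.end(), i);
    if (it != broadcast_dims.end()) {
      out[i] = operand_tiles[it - broadcast_dims.begin()];
    }
  }
  return out;
}

int main() {
  assert((BroadcastTileDims({4}, {0}, 2) == std::vector<int>{4, 1}));
  return 0;
}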
+ std::vector target_tile_assignment_dimensions; + const auto& dimensions = instruction->dimensions(); + for (int64 i = 0; i < instruction->shape().rank(); ++i) { + auto it = absl::c_find(dimensions, i); + if (it == dimensions.end()) { + target_tile_assignment_dimensions.push_back(1); + } else { + const int64 source_dim = std::distance(dimensions.begin(), it); + target_tile_assignment_dimensions.push_back( + op->sharding().tile_assignment().dim(source_dim)); + } + } + Array new_tile_assignment = op->sharding().tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + HloSharding new_sharding = HloSharding::Tile(new_tile_assignment); + return MaybeImproveInstructionSharding(new_sharding, instruction); + } + case HloOpcode::kConvolution: { + const auto& dnums = instruction->convolution_dimension_numbers(); + const HloInstruction* lhs = instruction->operand(0); + const HloInstruction* rhs = instruction->operand(1); + auto get_tiled_sharding_based_on_lhs = [&] { + CHECK(!lhs->sharding().IsTileMaximal()); + std::vector output_to_lhs_indices(instruction->shape().rank()); + output_to_lhs_indices[dnums.output_batch_dimension()] = + dnums.input_batch_dimension(); + output_to_lhs_indices[dnums.output_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + output_to_lhs_indices[dnums.output_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + return hlo_sharding_util::TransposeSharding(lhs->sharding(), + output_to_lhs_indices); + }; + auto get_tiled_sharding_based_on_rhs = [&] { + CHECK(!rhs->sharding().IsTileMaximal()); + std::vector output_to_rhs_indices(instruction->shape().rank()); + output_to_rhs_indices[dnums.output_batch_dimension()] = + dnums.kernel_input_feature_dimension(); + output_to_rhs_indices[dnums.output_feature_dimension()] = + dnums.kernel_output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + output_to_rhs_indices[dnums.output_spatial_dimensions(i)] = + dnums.kernel_spatial_dimensions(i); + } + return hlo_sharding_util::TransposeSharding(rhs->sharding(), + output_to_rhs_indices); + }; + if (auto dot_dims = + dot_as_convolution_util::ParseDotGeneralFromConvolution( + instruction)) { + // lhs_or_rhs: lhs is 0 and rhs is 1. + auto partitioned_only_along = + [&](const HloSharding& sharding, + std::vector& dims, + int64 lhs_or_rhs) { + if (sharding.IsTileMaximal()) { + return false; + } + int64 partition_count = 1; + for (const auto& dim : dims) { + if (lhs_or_rhs == 0) { + partition_count *= sharding.tile_assignment().dim(dim.lhs); + } else { + CHECK_EQ(lhs_or_rhs, 1); + partition_count *= sharding.tile_assignment().dim(dim.rhs); + } + } + return partition_count == + sharding.tile_assignment().num_elements(); + }; + // If LHS/RHS is partitioned only along the batch dimensions, propagate + // the sharding to the output, since batch dimensions are the easiest to + // partition. + if (IsSpatiallyPartitioned(lhs) && + partitioned_only_along(lhs->sharding(), dot_dims->batch_dims, 0)) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_lhs(), instruction); + } + if (IsSpatiallyPartitioned(rhs) && + partitioned_only_along(rhs->sharding(), dot_dims->batch_dims, 1)) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_rhs(), instruction); + } + if (aggressive_prop) { + // If LHS/RHS is partitioned only along the non-contracting + // dimensions, propagate the sharding to the output. 
+ const bool can_propagate_from_lhs = + IsSpatiallyPartitioned(lhs) && + partitioned_only_along(lhs->sharding(), + dot_dims->lhs_non_contracting_dims, 0); + const bool can_propagate_from_rhs = + IsSpatiallyPartitioned(rhs) && + partitioned_only_along(rhs->sharding(), + dot_dims->rhs_non_contracting_dims, 1); + // If we can propagate from both operands, choose the larger one which + // should help us reduce communications. + if (can_propagate_from_lhs && can_propagate_from_rhs) { + if (Product(lhs->shape().dimensions()) >= + Product(rhs->shape().dimensions())) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_lhs(), instruction); + } else { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_rhs(), instruction); + } + } + if (can_propagate_from_lhs) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_lhs(), instruction); + } + if (can_propagate_from_rhs) { + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_rhs(), instruction); + } + } + } + + if (!IsSpatiallyPartitioned(lhs)) { + return false; + } + if (lhs->sharding().IsReplicated()) { + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + + if (IsConvolutionKernelSmall(instruction)) { + // If the kernel is small compared to the input then we can generate an + // output what is sharded the same way as the input. + const auto& tile_assignment = lhs->sharding().tile_assignment(); + if (tile_assignment.dim(dnums.input_feature_dimension()) > 1) { + return false; + } + return MaybeImproveInstructionSharding( + get_tiled_sharding_based_on_lhs(), instruction); + } + // If the kernel is large (e.g backward convolution) then we only support + // replicated output. + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + case HloOpcode::kTranspose: { + const HloInstruction* input = instruction->operand(0); + if (!IsSpatiallyPartitioned(input)) { + return false; + } + HloSharding sharding = hlo_sharding_util::TransposeSharding( + input->sharding(), instruction->dimensions()); + return MaybeImproveInstructionSharding(sharding, instruction); + } + case HloOpcode::kReduceWindow: { + const HloInstruction* lhs = instruction->operand(0); + if (!IsSpatiallyPartitioned(lhs)) { + return false; + } + + auto has_dilation = [](const WindowDimension& dimensions) { + return dimensions.base_dilation() > 1 || + dimensions.window_dilation() > 1; + }; + if (absl::c_any_of(instruction->window().dimensions(), has_dilation)) { + VLOG(2) << "Not applying sharding to reduce window because dilatation " + "isn't supported yet: " + << instruction->ToString(); + return false; + } + return MaybeImproveInstructionSharding(lhs->sharding(), instruction); + } + case HloOpcode::kSelectAndScatter: { + // Shard according to first operand, as output keeps the same shape. 
+ const HloInstruction* lhs = instruction->operand(0); + if (!IsSpatiallyPartitioned(lhs)) { + return false; + } + + auto has_base_dilation = [](const WindowDimension& dimensions) { + return dimensions.base_dilation() > 1; + }; + if (absl::c_any_of(instruction->window().dimensions(), + has_base_dilation)) { + VLOG(2) << "Not applying sharding to select-and-scatter because " + "base dilation isn't supported yet: " + << instruction->ToString(); + return false; + } + return MaybeImproveInstructionSharding(lhs->sharding(), instruction); + } + case HloOpcode::kReshape: { + if (!IsSpatiallyPartitioned(instruction->operand(0))) { + return false; + } + absl::optional new_sharding = + hlo_sharding_util::ReshapeSharding( + instruction->operand(0)->shape(), instruction->shape(), + instruction->operand(0)->sharding()); + if (new_sharding.has_value()) { + return MaybeImproveInstructionSharding(new_sharding.value(), + instruction); + } + return false; + } + case HloOpcode::kDot: { + auto& dot_dim_numbs = instruction->dot_dimension_numbers(); + // Batch dimensions are the same for lhs and rhs on dot operations. + int64 num_batch_dims = dot_dim_numbs.lhs_batch_dimensions_size(); + std::vector contracting_dims(2); + contracting_dims[0] = dot_dim_numbs.lhs_contracting_dimensions(0); + contracting_dims[1] = dot_dim_numbs.rhs_contracting_dimensions(0); + std::vector ops_sharding(2, nullptr); + for (int64 op_num = 0; op_num < 2; ++op_num) { + const HloInstruction* op = instruction->operand(op_num); + if (IsSpatiallyPartitioned(op)) { + ops_sharding[op_num] = &op->sharding(); + } + } + if (ops_sharding[0] == nullptr && ops_sharding[1] == nullptr) { + return false; + } + + // Select representative operand. + int64 representative_op = -1; + if (ops_sharding[0] == nullptr) { + representative_op = 1; + } else if (ops_sharding[1] == nullptr) { + representative_op = 0; + } else if (ops_sharding[0]->IsReplicated() && + ops_sharding[1]->IsReplicated()) { + // Both replicated -> replicate + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } else if (!ops_sharding[0]->IsReplicated() && + !ops_sharding[1]->IsReplicated()) { + // Both tile sharded. The dot spatial partitioning implementation + // replicates the operand corresponding to the non-tiled dimension: + // dot(lhs, rhs), sharding={devices=[1, ..., n, 1]} replicates rhs + // dot(lhs, rhs), sharding={devices=[1, ..., 1, n]} replicates lhs + // so set sharding in order to replicate the smaller of lhs and rhs + representative_op = + ShapeUtil::ByteSizeOf(instruction->operand(0)->shape()) < + ShapeUtil::ByteSizeOf(instruction->operand(1)->shape()) + ? 1 + : 0; + } else { + // One is replicated and the other is tiled - pick the tiled one. + representative_op = ops_sharding[0]->IsReplicated() ? 1 : 0; + } + + if (ops_sharding[representative_op]->IsReplicated()) { + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } else { + // Tile-shard instruction according to representative op. + auto sharding = *ops_sharding[representative_op]; + if (instruction->shape().dimensions_size() != + sharding.tile_assignment().num_dimensions()) { + // It is necessarily the case of a matrix x vector, with + // representative_op being the matrix, because the vector op has the + // same shape as instruction. + CHECK_EQ(sharding.tile_assignment().num_dimensions(), + instruction->shape().dimensions_size() + 1); + // Reshape sharding so that last dimension is 1, and then remove + // last dimension. 
+ std::vector non_batch_dims( + sharding.tile_assignment().num_dimensions() - num_batch_dims); + absl::c_iota(non_batch_dims, num_batch_dims); + sharding = hlo_sharding_util::ReshapeToTileDimension( + sharding, num_batch_dims, non_batch_dims); + auto tile_assignment = sharding.tile_assignment(); + auto dimensions = tile_assignment.dimensions(); + CHECK_EQ(dimensions.back(), 1); + dimensions.pop_back(); + tile_assignment.Reshape(dimensions); + sharding = HloSharding::Tile(tile_assignment); + } + return MaybeImproveInstructionSharding(sharding, instruction); + } + } + case HloOpcode::kParameter: { + auto parent_it = computation_map.find(instruction->parent()); + if (parent_it == computation_map.end()) { + return false; + } + const HloInstruction* parent = parent_it->second; + switch (parent->opcode()) { + case HloOpcode::kConditional: { + for (int64 i = 1; i < parent->operand_count(); ++i) { + if (parent->called_computations()[i - 1] == instruction->parent()) { + if (parent->operand(i)->has_sharding()) { + return MaybeImproveInstructionSharding( + parent->operand(i)->sharding(), instruction); + } + return false; + } + } + return false; + } + default: + return false; + } + } + case HloOpcode::kSort: { + const HloInstruction* operand = PickRepresentativeOperand(instruction); + if (!operand || !IsSpatiallyPartitioned(operand)) { + return false; + } + + if (!operand->sharding().IsTileMaximal() && + operand->sharding().tile_assignment().dim( + instruction->dimensions(0)) != 1) { + // Doesn't support sharding the sorting dimension. + return false; + } + + if (instruction->shape().IsTuple()) { + return MaybeImproveInstructionSharding( + HloSharding::SingleTuple(instruction->shape(), operand->sharding()), + instruction); + } else { + return MaybeImproveInstructionSharding(operand->sharding(), + instruction); + } + } + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: { + auto propagate_slicing = [instruction]() { + const HloInstruction* operand = + instruction->opcode() == HloOpcode::kDynamicSlice + ? 
instruction->operand(0) + : instruction->operand(1); + if (!IsSpatiallyPartitioned(operand)) { + return false; + } + + if (operand->sharding().IsReplicated()) { + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + + const auto& tile_assignment = operand->sharding().tile_assignment(); + for (int64 i = 0; i < instruction->shape().rank(); ++i) { + if (tile_assignment.dim(i) > 1 && + instruction->shape().dimensions(i) != + operand->shape().dimensions(i)) { + return false; + } + } + return MaybeImproveInstructionSharding(operand->sharding(), + instruction); + }; + auto propagate_base = [instruction]() { + if (instruction->opcode() != HloOpcode::kDynamicUpdateSlice) { + return false; + } + if (!IsSpatiallyPartitioned(instruction->operand(0))) { + return false; + } + return MaybeImproveInstructionSharding( + instruction->operand(0)->sharding(), instruction); + }; + return propagate_slicing() || propagate_base(); + } + case HloOpcode::kGather: { + if (!IsSpatiallyPartitioned(instruction->operand(1))) { + return false; + } + HloSharding new_sharding = hlo_sharding_util::GatherOutputSharding( + instruction->operand(1)->sharding(), instruction); + return MaybeImproveInstructionSharding(new_sharding, instruction); + } + case HloOpcode::kScatter: { + if (!IsSpatiallyPartitioned(instruction->operand(1)) && + !IsSpatiallyPartitioned(instruction->operand(2))) { + return false; + } + return MaybeImproveInstructionSharding(HloSharding::Replicate(), + instruction); + } + case HloOpcode::kWhile: { + if (!instruction->operand(0)->has_sharding()) { + return false; + } + auto sharding = instruction->operand(0)->sharding(); + if (instruction->has_sharding()) { + sharding = + MergeForMoreSpecificSharding(sharding, instruction->sharding()); + } + return MaybeImproveInstructionSharding(sharding, instruction); + } + default: { + const HloInstruction* operand = PickRepresentativeOperand(instruction); + if (!operand || !IsSpatiallyPartitioned(operand)) { + return false; + } + return MaybeImproveInstructionSharding(operand->sharding(), instruction); + } + } + return false; +} + +// Return the sharding that should be propagated from user to instruction. +absl::optional GetShardingFromUser( + const HloInstruction& instruction, const HloInstruction& user, + bool aggressive_prop, bool is_spmd) { + if (!IsSpatiallyPartitioned(&user)) { + return absl::nullopt; + } + switch (user.opcode()) { + case HloOpcode::kBroadcast: { + if (user.sharding().IsReplicated()) { + return user.sharding(); + } + // Only support when none of the partitioned dimensions in the broadcast + // output belong to new dimensions. + for (int64 i = 0; i < user.shape().rank(); ++i) { + if (user.sharding().tile_assignment().dim(i) > 1 && + absl::c_count(user.dimensions(), i) == 0) { + return absl::nullopt; + } + } + + // The instruction (operand of broadcast) will be tiled the same way + // as the output. 
+ std::vector target_tile_assignment_dimensions; + for (int64 output_dim : user.dimensions()) { + target_tile_assignment_dimensions.push_back( + user.sharding().tile_assignment().dim(output_dim)); + } + Array new_tile_assignment = user.sharding().tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + return HloSharding::Tile(new_tile_assignment); + } + case HloOpcode::kConcatenate: { + if (user.sharding().IsReplicated()) { + return user.sharding(); + } + + const int64 cdim = user.concatenate_dimension(); + const Array& tile_assignment = user.sharding().tile_assignment(); + if (tile_assignment.dim(cdim) == 1) { + // If we are concatenating along a non-sharded dimension then the + // operands should have the same sharding as the result. + return user.sharding(); + } + + if (is_spmd) { + // SPMD doesn't support tiling with part of the devices. Return the same + // sharding. + return user.sharding(); + } + + // If we are concatenating along a sharded dimension then we want the + // operands to be distributed among the devices their data is used. + int64 start_offset = 0; + for (HloInstruction* op : user.operands()) { + if (op == &instruction) { + break; + } + start_offset += op->shape().dimensions(cdim); + } + const int64 tile_shape = CeilOfRatio(user.shape().dimensions(cdim), + tile_assignment.dimensions()[cdim]); + std::vector start_indices(tile_assignment.num_dimensions()); + std::vector end_indices = tile_assignment.dimensions(); + start_indices[cdim] = start_offset / tile_shape; + end_indices[cdim] = CeilOfRatio( + start_offset + instruction.shape().dimensions(cdim), tile_shape); + auto new_tile_assignment = + tile_assignment.Slice(start_indices, end_indices); + if (new_tile_assignment.num_elements() == 1) { + return HloSharding::AssignDevice(*new_tile_assignment.begin()); + } + return HloSharding::Tile(new_tile_assignment); + } + case HloOpcode::kConvolution: { + if (auto dot_dims = + dot_as_convolution_util::ParseDotGeneralFromConvolution(&user)) { + const auto& dnums = user.convolution_dimension_numbers(); + auto partitioned_only_along = + [&](const HloSharding& sharding, + std::vector& + dims) { + if (sharding.IsTileMaximal()) { + return false; + } + int64 partition_count = 1; + for (const auto& dim : dims) { + partition_count *= sharding.tile_assignment().dim(dim.output); + } + return partition_count == + sharding.tile_assignment().num_elements(); + }; + // If output is partitioned only along the batch dimensions, or only + // along the non-contracting dimensions, propagate the sharding to the + // operand. 
+ if (&instruction == user.operand(0) && + (partitioned_only_along(user.sharding(), dot_dims->batch_dims) || + partitioned_only_along(user.sharding(), + dot_dims->lhs_non_contracting_dims))) { + std::vector lhs_to_output_indices(user.shape().rank()); + lhs_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + lhs_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + lhs_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + return hlo_sharding_util::TransposeSharding(user.sharding(), + lhs_to_output_indices); + } + if (&instruction == user.operand(1) && + (partitioned_only_along(user.sharding(), dot_dims->batch_dims) || + partitioned_only_along(user.sharding(), + dot_dims->rhs_non_contracting_dims))) { + std::vector rhs_to_output_indices(user.shape().rank()); + rhs_to_output_indices[dnums.kernel_input_feature_dimension()] = + dnums.output_batch_dimension(); + rhs_to_output_indices[dnums.kernel_output_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_output_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + return hlo_sharding_util::TransposeSharding(user.sharding(), + rhs_to_output_indices); + } + } + return absl::nullopt; + } + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: { + if (user.sharding().IsReplicated()) { + return user.sharding(); + } + if (user.opcode() == HloOpcode::kDynamicUpdateSlice && + &instruction == user.operand(0)) { + return user.sharding(); + } + const HloInstruction* operand = user.opcode() == HloOpcode::kDynamicSlice + ? user.operand(0) + : user.operand(1); + if (&instruction != operand) { + return absl::nullopt; + } + + const auto& tile_assignment = user.sharding().tile_assignment(); + for (int64 i = 0; i < user.shape().rank(); ++i) { + if (tile_assignment.dim(i) > 1 && + user.shape().dimensions(i) != operand->shape().dimensions(i)) { + return absl::nullopt; + } + } + return user.sharding(); + } + case HloOpcode::kReduceWindow: { + if (&instruction != user.operand(0)) { + return absl::nullopt; + } + return user.sharding(); + } + case HloOpcode::kReshape: { + return hlo_sharding_util::ReshapeSharding( + user.shape(), instruction.shape(), user.sharding()); + } + case HloOpcode::kTranspose: { + // Calculate the dimension numbers for reversing the current transpose + // and then use TransposeSharding to convert the output sharding to an + // input sharding. + std::vector reverse_dimensions(user.dimensions().size()); + for (int64 i = 0; i < user.dimensions().size(); ++i) { + reverse_dimensions[user.dimensions(i)] = i; + } + return hlo_sharding_util::TransposeSharding(user.sharding(), + reverse_dimensions); + } + case HloOpcode::kTuple: { + return user.sharding().GetSubSharding(user.shape(), + {user.operand_index(&instruction)}); + } + case HloOpcode::kGetTupleElement: { + HloSharding new_sharding = + instruction.has_sharding() + ? 
instruction.sharding() + : HloSharding::SingleTuple(instruction.shape(), + HloSharding::Replicate()); + int64 sharding_index = 0; + for (int64 i = 0; i < instruction.shape().tuple_shapes_size(); ++i) { + if (i == user.tuple_index()) { + break; + } + if (instruction.shape().tuple_shapes(i).IsArray()) { + sharding_index += 1; + } else { + sharding_index += + instruction.shape().tuple_shapes(i).tuple_shapes_size(); + } + } + if (user.shape().IsArray()) { + new_sharding.tuple_elements()[sharding_index] = user.sharding(); + } + for (int64 i = 0; i < user.sharding().tuple_elements().size(); ++i) { + new_sharding.tuple_elements()[sharding_index + i] = + user.sharding().tuple_elements()[i]; + } + return new_sharding; + } + case HloOpcode::kDot: { + if (user.sharding().IsReplicated()) { + return user.sharding(); + } + auto& dim_numbers = user.dot_dimension_numbers(); + int64 op_idx = user.operand_index(&instruction); + // Batch dimensions are the same on lhs and rhs for dot operations. + int64 num_batch_dims = dim_numbers.lhs_batch_dimensions_size(); + int64 num_spatial_dims = + instruction.shape().dimensions_size() - num_batch_dims; + if (num_spatial_dims == 1) { + // This is the vector of a matrix x vector operation -> replicate, + // since tiling on the vector would necessarily be on the contracting + // dimension, which we don't support. + CHECK_EQ(op_idx, 1); + return HloSharding::Replicate(); + } + // Instruction is necessarily a matrix because it is one of the operands + // of a matrix x matrix operation. + CHECK_EQ(num_spatial_dims, 2); + // Propagate tile sharding to the bigger operand, and replicate the other. + auto other_op = user.operand(op_idx ^ 1); + if (ShapeUtil::ByteSizeOf(instruction.shape()) > + ShapeUtil::ByteSizeOf(other_op->shape())) { + return user.sharding(); + } else { + return HloSharding::Replicate(); + } + } + case HloOpcode::kReduce: { + if (instruction.shape().rank() == 0) { + return absl::nullopt; + } + auto user_sharding = + user.shape().IsTuple() + ? user.sharding().GetSubSharding( + user.shape(), {user.operand_index(&instruction)}) + : user.sharding(); + if (user_sharding.IsTileMaximal()) { + return user_sharding; + } + std::vector target_tile_assignment_dimensions( + instruction.shape().rank()); + const auto& dimensions = user.dimensions(); + int64 next_output_dim = 0; + for (int64 i = 0; i < instruction.shape().rank(); ++i) { + if (absl::c_find(dimensions, i) == dimensions.end()) { + target_tile_assignment_dimensions[i] = + user_sharding.tile_assignment().dim(next_output_dim++); + } else { + target_tile_assignment_dimensions[i] = 1; + } + } + auto tile_assignment = user_sharding.tile_assignment(); + tile_assignment.Reshape(target_tile_assignment_dimensions); + return HloSharding::Tile(tile_assignment); + } + case HloOpcode::kSort: { + if (user.sharding().IsTuple()) { + return user.sharding().GetSubSharding( + user.shape(), {user.operand_index(&instruction)}); + } else { + return user.sharding(); + } + } + default: { + // If the user output shape is compatible with the current instruction + // shape excluding element type and the current instruction is supported + // by spatial partitioning, then the user sharding can be used for + // propagation to the current instruction. 
+ if (ShapeUtil::CompatibleIgnoringElementType(instruction.shape(), + user.shape())) { + return user.sharding(); + } + return absl::nullopt; + } + } +} + +// Tries to update the sharding of the specified instruction based on its users +// and returns true if the sharding of the instruction has been changed and +// false otherwise. +bool InferShardingFromUsers(HloInstruction* instruction, + const ComputationMap& computation_map, + bool aggressive_prop, bool is_spmd) { + if (!SupportSpatialPartitioning(instruction, computation_map, is_spmd)) { + return false; + } + bool improved_sharding = false; + for (const HloInstruction* user : instruction->users()) { + absl::optional<HloSharding> user_sharding = + GetShardingFromUser(*instruction, *user, aggressive_prop, is_spmd); + if (user_sharding) { + improved_sharding |= + MaybeImproveInstructionSharding(*user_sharding, instruction); + } + } + return improved_sharding; +} + +// Removes a Sharding custom-call instruction by folding the sharding attribute +// to its operand. If the operand already has a different sharding, insert a +// copy node for reshard. +StatusOr<bool> ProcessShardingInstruction(HloModule* module) { + bool changed = false; + + for (HloComputation* computation : module->computations()) { + auto instructions = computation->MakeInstructionPostOrder(); + std::reverse(instructions.begin(), instructions.end()); + for (HloInstruction* instruction : instructions) { + if (instruction->opcode() != HloOpcode::kCustomCall) { + continue; + } + if (instruction->custom_call_target() != "Sharding") { + continue; + } + TF_RET_CHECK(instruction->has_sharding()) + << "Sharding instruction must have a sharding attribute"; + const HloSharding& sharding = instruction->sharding(); + + // If the operand has a different sharding from the current sharding + // instruction, create a copy node. Otherwise, just remove the sharding + // instruction and set the operand sharding.
+ if (instruction->operand(0)->has_sharding() && + instruction->operand(0)->sharding() != sharding) { + auto copy = computation->AddInstruction( + HloInstruction::CreateUnary(instruction->shape(), HloOpcode::kCopy, + instruction->mutable_operand(0))); + TF_RETURN_IF_ERROR(computation->ReplaceInstruction(instruction, copy)); + copy->set_sharding(sharding); + } else { + instruction->mutable_operand(0)->set_sharding(sharding); + TF_RETURN_IF_ERROR( + instruction->ReplaceAllUsesWith(instruction->mutable_operand(0))); + TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction)); + } + changed = true; + } + } + return changed; +} + +} // namespace + +/*static*/ Status ShardingPropagation::NormalizeDomain( + const DomainMetadata::Domain& domain, const DomainMetadata* metadata) { + if (metadata != nullptr) { + TF_ASSIGN_OR_RETURN(const auto& sharding_metadata, + ShardingMetadata::ToShardingMetadata(metadata)); + const auto& sharding = sharding_metadata->sharding(); + if (sharding != nullptr) { + bool is_spatially_partitioned = !sharding->HasUniqueDevice(); + if (sharding->IsTuple()) { + is_spatially_partitioned = absl::c_any_of( + sharding->tuple_elements(), + [](const HloSharding& s) { return !s.HasUniqueDevice(); }); + } + if (is_spatially_partitioned) { + for (HloInstruction* domain : domain.exit_domains) { + domain->mutable_operand(0)->set_sharding(*sharding); + } + return Status::OK(); + } + } + } + return ShardingMetadata::NormalizeShardingDomain(domain, metadata); +} + +StatusOr ShardingPropagation::Run(HloModule* module) { + TF_ASSIGN_OR_RETURN(bool any_changed, ProcessShardingInstruction(module)); + + // Association of partitionable embedded computations with their parent + // instruction. + ComputationMap computation_map; + + // Instructions that are related through a computation and need to share the + // same sharding. + auto get_related_instructions = [](HloInstruction* inst) { + if (inst->opcode() == HloOpcode::kWhile) { + return std::vector{ + inst, inst->while_body()->root_instruction(), + inst->while_body()->parameter_instruction(0), + inst->while_condition()->parameter_instruction(0)}; + } else if (inst->opcode() == HloOpcode::kConditional) { + std::vector comps{inst}; + for (HloComputation* c : inst->called_computations()) { + comps.push_back(c->root_instruction()); + } + return comps; + } else { + CHECK(false); + } + }; + + // If instruction is a while, or the root or a parameter of a while body, + // then propagate its sharding to the while instruction, to its body root, + // and to its condition parameter. 
+ std::function maybe_computation_propagation = + [&](HloInstruction* instruction) { + auto propagate_to_instruction = [&](HloInstruction* search_inst) { + auto related_instructions = get_related_instructions(search_inst); + if (absl::c_count(related_instructions, instruction)) { + for (HloInstruction* inst : related_instructions) { + if (!inst->has_sharding() || + inst->sharding() != instruction->sharding()) { + VLOG(2) << "Add computation sharding: " << inst->name(); + inst->set_sharding(instruction->sharding()); + maybe_computation_propagation(inst); + } + } + } + }; + + if (instruction->opcode() == HloOpcode::kConditional || + instruction->opcode() == HloOpcode::kWhile) { + propagate_to_instruction(instruction); + } + + if (instruction->opcode() == HloOpcode::kParameter || + instruction->parent()->root_instruction() == instruction) { + auto it = computation_map.find(instruction->parent()); + if (it != computation_map.end()) { + propagate_to_instruction(it->second); + } + } + }; + + // Populate computation_map in order to associate while bodies to their + // while instructions. + for (auto computation : module->computations()) { + for (auto instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kWhile || + instruction->opcode() == HloOpcode::kConditional) { + // Check if any of the related instructions has sharding, in which case + // propagate it to the other instructions, so they all share the same + // sharding, in case the user didn't shard all of them. We don't check + // that user shardings are consistent, because such check is already + // done by HloShardingVerifier. + const HloInstruction* sharded_inst = nullptr; + auto related_instructions = get_related_instructions(instruction); + for (auto inst : related_instructions) { + if (inst->has_sharding()) { + sharded_inst = inst; + break; + } + } + if (sharded_inst != nullptr) { + // Set the same sharding to all the other related instructions. + for (auto inst : related_instructions) { + inst->set_sharding(sharded_inst->sharding()); + } + } + } + if (instruction->opcode() == HloOpcode::kWhile) { + computation_map[instruction->while_body()] = instruction; + } else if (instruction->opcode() == HloOpcode::kConditional) { + for (HloComputation* c : instruction->called_computations()) { + computation_map[c] = instruction; + } + } + } + } + + // Collect all pre-sharded instructions as we aren't allowed to modify their + // sharding. + absl::flat_hash_set provided_shardings; + for (const HloComputation* computation : module->computations()) { + for (const HloInstruction* inst : computation->instructions()) { + if (inst->has_sharding()) { + provided_shardings.insert(inst); + } + } + } + + // Consider the root instruction of the entry module as one with provided + // sharding as its sharding have to match with the one expected by the host. + provided_shardings.insert(module->entry_computation()->root_instruction()); + + // Iterate to a fixpoint that is guaranteed to be reached because we only + // strictly improve the sharding of the graph and it can't be improved + // indefinitely. 
+  int64 iterations = 0;
+  auto run_to_fix_point = [&](bool aggressive_prop) {
+    bool changed = true;
+    while (changed) {
+      changed = false;
+      int64 inferred_from_operand_counter = 0;
+      int64 inferred_from_user_counter = 0;
+      int64 instruction_counter = 0;
+      int64 already_sharded_counter = 0;
+      for (const HloComputation* computation : module->computations()) {
+        std::vector<HloInstruction*> instructions =
+            computation->MakeInstructionPostOrder();
+
+        instruction_counter += instructions.size();
+        for (const HloInstruction* instruction : instructions) {
+          already_sharded_counter += (instruction->has_sharding() ? 1 : 0);
+        }
+
+        // Remove the instructions where the sharding was provided from the
+        // outside so we don't modify them.
+        instructions.erase(
+            std::remove_if(instructions.begin(), instructions.end(),
+                           [&](HloInstruction* instruction) {
+                             return provided_shardings.contains(instruction);
+                           }),
+            instructions.end());
+
+        // First iterate the HLO graph in post order taking shardings from
+        // operands.
+        for (HloInstruction* instruction : instructions) {
+          if (InferShardingFromOperands(instruction, computation_map, is_spmd_,
+                                        aggressive_prop)) {
+            ++inferred_from_operand_counter;
+            changed = true;
+            VLOG(2) << "Add sharding (forward-pass): "
+                    << instruction->ToString();
+            maybe_computation_propagation(instruction);
+          }
+        }
+
+        // Then iterate the HLO graph in reverse post order taking shardings
+        // from users.
+        for (auto it = instructions.rbegin(); it != instructions.rend(); ++it) {
+          if (InferShardingFromUsers(*it, computation_map, aggressive_prop,
+                                     is_spmd_)) {
+            ++inferred_from_user_counter;
+            changed = true;
+            VLOG(2) << "Add sharding (backward-pass): " << (*it)->ToString();
+            maybe_computation_propagation(*it);
+          }
+        }
+      }
+      any_changed |= changed;
+      VLOG(1) << "Sharding propagation iteration " << iterations << ";";
+      VLOG(1) << "  total instructions: " << instruction_counter;
+      VLOG(1) << "  instructions already sharded: " << already_sharded_counter;
+      VLOG(1) << "  shardings inferred from operands: "
+              << inferred_from_operand_counter;
+      VLOG(1) << "  shardings inferred from users: "
+              << inferred_from_user_counter;
+      ++iterations;
+    }
+  };
+  run_to_fix_point(false);
+  run_to_fix_point(true);
+
+  VLOG(1) << "Sharding propagation completed after " << iterations
+          << " iterations";
+  return any_changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/sharding_propagation.h b/tensorflow/compiler/xla/service/sharding_propagation.h
new file mode 100644
index 00000000000..2c07a4a6a31
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sharding_propagation.h
@@ -0,0 +1,50 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SHARDING_PROPAGATION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SHARDING_PROPAGATION_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Propagates sharding information around the graph. HLOs that have shardings
+// are kept as-is, those that do not have shardings are given shardings based
+// on a simple local greedy heuristic.
+class ShardingPropagation : public HloModulePass {
+ public:
+  explicit ShardingPropagation(bool is_spmd = false) : is_spmd_(is_spmd) {}
+  absl::string_view name() const override { return "sharding-propagation"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+  // Function which can be used to apply a spatially partitioned sharding onto
+  // a given domain. It will apply the sharding into the exit edges of the
+  // domain and then rely on the rest of sharding propagation to ensure that
+  // the intermediate nodes get the correct sharding.
+  static Status NormalizeDomain(const DomainMetadata::Domain& domain,
+                                const DomainMetadata* metadata);
+
+ private:
+  bool is_spmd_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SHARDING_PROPAGATION_H_
diff --git a/tensorflow/compiler/xla/service/sharding_propagation_test.cc b/tensorflow/compiler/xla/service/sharding_propagation_test.cc
new file mode 100644
index 00000000000..a9d685a7a93
--- /dev/null
+++ b/tensorflow/compiler/xla/service/sharding_propagation_test.cc
@@ -0,0 +1,1329 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/sharding_propagation.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace op = xla::testing::opcode_matchers; + +namespace xla { +namespace { + +using ShardingPropagationTest = HloTestBase; + +TEST_F(ShardingPropagationTest, ElementwiseOperationForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %elementwise { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={devices=[1,2,2,1]0,1,2,3} + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1) + %add = f32[5,7,11,13]{3,2,1,0} add(%param0, %param1) + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%add) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, ElementwiseOperationBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %elementwise { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0) + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1) + %add = f32[5,7,11,13]{3,2,1,0} add(%param0, %param1) + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%add), + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, BroadcastForwardPassNoSharding) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[7,11]{1,0} parameter(0), + sharding={devices=[2,2]0,1,2,3} + %broadcast = f32[5,7,11,13]{3,2,1,0} broadcast(%param0), dimensions={1,2} + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%broadcast) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_FALSE(changed); +} + +// Regression Test for b/129569657. 
+TEST_F(ShardingPropagationTest, BroadcastForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[3,2048,2048]{2,1,0} parameter(0), + sharding={devices=[1,2,2]0,1,2,3} + %broadcast = f32[3,2048,2048,3]{3,2,1,0} broadcast(%param0), dimensions={0,1,2} + ROOT %copy = f32[3,2048,2048,3]{3,2,1,0} copy(%broadcast) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "broadcast"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, BroadcastBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[13]{0} parameter(0) + %broadcast = f32[5,7,11,13]{3,2,1,0} broadcast(%param0), dimensions={3} + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%broadcast), + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "broadcast"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, BroadcastUser) { + const char* const hlo_string = R"( +HloModule module +ENTRY %broadcast { + %param0 = f32[24,8]{0,1} parameter(0) + %copy = f32[24,8]{0,1} copy(%param0) + ROOT %broadcast = f32[4,24,6,8]{3,2,1,0} broadcast(%copy), dimensions={1,3}, + sharding={devices=[1,2,1,4]0,1,2,3,4,5,6,7} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,4]0,1,2,3,4,5,6,7}")); +} + +TEST_F(ShardingPropagationTest, MaximalReduceForwardPass) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={devices=[1,2,2,1]0,1,2,3} + %init = f32[] parameter(1) + %reduce = f32[5,7]{1,0} reduce(%param0, %init), dimensions={2,3}, to_apply=%add + ROOT %copy = f32[5,7]{0,1} copy(%reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reduce"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, ShardedReduceForwardPass) { + const char* const hlo_string = R"( +HloModule module +%add { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={devices=[1,2,2,1]0,1,2,3} + %init = f32[] parameter(1) + %reduce = f32[7,11]{1,0} reduce(%param0, %init), dimensions={0,3}, to_apply=%add + ROOT %copy = f32[7,11]{0,1} copy(f32[7,11]{1,0} %reduce) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reduce"), + op::Sharding("{devices=[2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, 
ShardedTupleReduceForwardAndBackwardPass) { + const char* const hlo_string = R"( +HloModule module + +%minmax_func { + %lhs_value = f32[] parameter(0) + %rhs_value = f32[] parameter(2) + %compare.2 = pred[] compare(%lhs_value, %rhs_value), direction=GT + %select.4 = f32[] select(%compare.2, %lhs_value, %rhs_value) + %lhs_index = s32[] parameter(1) + %rhs_index = s32[] parameter(3) + %select.5 = s32[] select(%compare.2, %lhs_index, %rhs_index) + ROOT %tuple.2 = (f32[], s32[]) tuple(%select.4, %select.5) +} + +ENTRY %main { + %param0 = f32[28,10] parameter(0) + %param1 = s32[28,10] parameter(1), sharding={devices=[2,1]0,1} + %copy_param0 = f32[28,10] copy(%param0) + %init0 = f32[] parameter(2) + %init1 = s32[] parameter(3) + %reduce = (f32[28], s32[28]) reduce(%copy_param0, %param1, %init0, %init1), + dimensions={1}, to_apply=%minmax_func + %gte0 = f32[28] get-tuple-element(%reduce), index=0 + %gte1 = s32[28] get-tuple-element(%reduce), index=1 + %copy0 = f32[28] copy(%gte0) + %copy1 = s32[28] copy(%gte1) + ROOT %tuple = (f32[28], s32[28]) tuple(%copy0, %copy1) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN( + bool changed, ShardingPropagation(/*is_spmd=*/true).Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reduce"), + op::Sharding("{{devices=[2]0,1},{devices=[2]0,1}}")); + EXPECT_THAT(FindInstruction(module.get(), "copy_param0"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, GetTupleElementForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %gte { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0) + %tuple = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) tuple( + %param0, %param0) + %tuple.1 = (f32[5,7,11,13]{3,2,1,0}, + (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0})) tuple( + %param0, %tuple), + sharding={{devices=[1,2,2,1]0,1,2,3}, + {replicated}, + {devices=[1,2,2,1]0,1,2,3}} + %gte = f32[5,7,11,13]{3,2,1,0} get-tuple-element(%tuple.1), index=0 + %gte.1 = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) get-tuple-element( + %tuple.1), index=1 + %gte.2 = f32[5,7,11,13]{3,2,1,0} get-tuple-element(%gte.1), index=0 + ROOT %copy = f32[5,7,11,13]{3,2,1,0} copy(%gte.2) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "gte"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "gte.1"), + op::Sharding("{{replicated}," + " {devices=[1,2,2,1]0,1,2,3}}")); + EXPECT_THAT(FindInstruction(module.get(), "gte.2"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, TupleForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %tuple { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={replicated} + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1), + sharding={devices=[1,2,2,1]0,1,2,3} + %param2 = f32[5,7,11,13]{3,2,1,0} parameter(2) + %tuple = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) tuple( + %param1, %param2) + %tuple.1 = (f32[5,7,11,13]{3,2,1,0}, + (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0})) tuple( + %param0, %tuple) + ROOT %copy = (f32[5,7,11,13]{3,2,1,0}, + (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0})) copy( + %tuple.1) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + 
TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "tuple"), + op::Sharding("{{devices=[1,2,2,1]0,1,2,3}," + " {replicated}}")); + EXPECT_THAT(FindInstruction(module.get(), "tuple.1"), + op::Sharding("{{replicated}," + " {devices=[1,2,2,1]0,1,2,3}," + " {replicated}}")); +} + +TEST_F(ShardingPropagationTest, ForwardConvolutionForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %lhs = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={devices=[2,2,2,1]0,1,2,3,4,5,6,7} + %rhs = f32[3,3,13,17]{3,2,1,0} parameter(1) + %convolution = f32[5,7,11,17]{3,2,1,0} convolution(%lhs, %rhs), + window={size=3x3 pad=1_1x1_1}, dim_labels=b01f_01io->b01f + ROOT %copy = f32[5,7,11,17]{3,2,1,0} copy(%convolution) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "convolution"), + op::Sharding("{devices=[2,2,2,1]0,1,2,3,4,5,6,7}")); +} + +TEST_F(ShardingPropagationTest, ForwardConvolutionLargeDilationForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %lhs = f32[8,64,2]{2,1,0} parameter(0), + sharding={devices=[1,4,1]0,1,2,3} + %rhs = f32[3,2,2]{2,1,0} parameter(1) + %convolution = f32[8,32,2]{2,1,0} convolution(%lhs, %rhs), + window={size=3 rhs_dilate=16}, dim_labels=b0f_0io->b0f + ROOT %copy = f32[8,32,2]{2,1,0} copy(%convolution) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "convolution"), + op::Sharding("{devices=[1,4,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, TransposeForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %transpose { + %param = f32[7,11,13]{2,1,0} parameter(0), + sharding={devices=[2,1,2]0,1,2,3} + %transpose = f32[11,13,7]{2,1,0} transpose(%param), dimensions={1,2,0} + ROOT %copy = f32[11,13,7]{2,1,0} copy(%transpose) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "transpose"), + op::Sharding("{devices=[1,2,2]0,2,1,3}")); +} + +TEST_F(ShardingPropagationTest, TransposeBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %transpose { + %param = f32[7,11,13]{2,1,0} parameter(0) + %copy = f32[7,11,13]{2,1,0} copy(%param) + ROOT %transpose = f32[11,13,7]{2,1,0} transpose(%copy), dimensions={1,2,0}, + sharding={devices=[1,2,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,1,2]0,2,1,3}")); +} + +TEST_F(ShardingPropagationTest, ReshapeForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %reshape { + %param0 = f32[1430,1]{1,0} parameter(0), + sharding={devices=[2,1]0,1} + %reshape = f32[10,11,13]{2,1,0} reshape(%param0) + ROOT %copy = f32[10,11,13]{2,1,0} copy(%reshape) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + 
ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "reshape"), + op::Sharding("{devices=[2,1,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, ReshapeBackwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %reshape { + %param0 = f32[2002,1]{1,0} parameter(0) + %copy = f32[2002,1]{1,0} copy(f32[2002,1]{1,0} %param0) + ROOT %reshape = f32[14,11,13]{2,1,0} reshape(%copy), + sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, PadForwardPass) { + const char* const hlo_string = R"( +HloModule module +ENTRY %pad { + %input = f32[11,17]{1,0} parameter(0), + sharding={devices=[2,2]0,1,2,3} + %pad_value = f32[] parameter(1) + %pad = f32[27,51]{1,0} pad(%input, %pad_value), padding=2_4_1x1_1_2 + ROOT %copy = f32[27,51]{1,0} copy(%pad) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "pad"), + op::Sharding("{devices=[2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, ShardedPreferredOverReplicated) { + const char* const hlo_string = R"( +HloModule module +ENTRY %replicated { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={replicated} + %copy = f32[5,7,11,13]{3,2,1,0} copy(%param0) + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1), + sharding={devices=[1,2,2,1]0,1,2,3} + %copy.1 = f32[5,7,11,13]{3,2,1,0} copy(%param1) + %add = f32[5,7,11,13]{3,2,1,0} add(%copy, %copy.1) + ROOT %copy.2 = f32[5,7,11,13]{3,2,1,0} copy(%add) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "copy.1"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[1,2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, DontShardTuplesIfAllInputIsMaximal) { + const char* const hlo_string = R"( +HloModule module +ENTRY %tuple { + %param0 = f32[5,7,11,13]{3,2,1,0} parameter(0), + sharding={maximal device=0} + %param1 = f32[5,7,11,13]{3,2,1,0} parameter(1), + sharding={maximal device=1} + %tuple = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) tuple( + %param0, %param1) + ROOT %copy = (f32[5,7,11,13]{3,2,1,0}, f32[5,7,11,13]{3,2,1,0}) copy(%tuple) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_FALSE(changed); + EXPECT_THAT(FindInstruction(module.get(), "tuple"), op::NoSharding()); +} + +TEST_F(ShardingPropagationTest, ValidConvolution) { + const char* const hlo_string = R"( +HloModule module + +ENTRY conv { + %lhs = f32[13,17,19]{2,1,0} parameter(0), + sharding={devices=[1,2,1]0,1} + %rhs = f32[19,5,19]{2,1,0} parameter(1) + %conv = f32[13,13,19]{2,1,0} convolution(%lhs, 
%rhs), + window={size=5}, dim_labels=b0f_i0o->b0f + ROOT %tuple = (f32[13,13,19]{2,1,0}) tuple(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, StridedSlice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY %slice { + %param = f32[17,13]{1,0} parameter(0), + sharding={devices=[2,1]0,1} + %slice = f32[7,5]{1,0} slice(%param), slice={[1:15:2], [5:10:1]} + ROOT %tuple = (f32[7,5]{1,0}) tuple(%slice) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "slice"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, ReduceWindowBackwardPass) { + const char* const hlo_string = R"( +HloModule module +%add (lhs: f32[], rhs: f32[]) -> f32[] { + %lhs = f32[] parameter(0) + %rhs = f32[] parameter(1) + ROOT %add = f32[] add(%lhs, %rhs) +} +ENTRY %reduce_window { + %param = f32[13,17]{1,0} parameter(0) + %param.copy = f32[13,17]{1,0} copy(%param) + %init = f32[] parameter(1) + ROOT %reduce-window = f32[7,17]{1,0} reduce-window(%param.copy, %init), + window={size=3x2 stride=2x1 pad=1_1x0_1}, to_apply=%add, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "param.copy"), + op::Sharding("{devices=[2,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "reduce-window"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, ReplicatedConvolutionLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY conv { + %lhs = f32[3,2,3]{2,1,0} parameter(0), sharding={replicated} + %rhs = f32[2,2,1]{2,1,0} parameter(1) + %conv = f32[3,2,3]{2,1,0} convolution(%lhs, %rhs), + window={size=1}, dim_labels=bf0_oi0->bf0 + ROOT %tuple = f32[3,2,3]{2,1,0} tuple(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs"), + op::Sharding("{replicated}")); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, ConvolutionShardedFeature) { + const char* const hlo_string = R"( +HloModule module + +ENTRY conv { + %lhs = f32[3,2,3]{2,1,0} parameter(0), + sharding={devices=[1,2,1]0,1} + %rhs = f32[2,2,1]{2,1,0} parameter(1) + %conv = f32[3,2,3]{2,1,0} convolution(%lhs, %rhs), + window={size=1}, dim_labels=bf0_oi0->bf0 + ROOT %tuple = f32[3,2,3]{2,1,0} tuple(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_FALSE(changed); +} + +TEST_F(ShardingPropagationTest, ConvolutionDifferentDimensionNumbers) { + const char* const hlo_string = R"( +HloModule module + +ENTRY conv { + %lhs = f32[8,16,512] parameter(0), + sharding={devices=[1,2,1]0,1} + %rhs = f32[8,2,512] parameter(1) + 
%conv = f32[3,512,512] convolution(%lhs, %rhs), + window={size=2 stride=5}, + dim_labels=f0b_i0o->0bf + ROOT %tuple = f32[3,512,512] tuple(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[2,1,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, Concatenate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY %concat { + %param.0 = f32[5,7] parameter(0), + sharding={devices=[2,1]0,1} + %param.1 = f32[5,9] parameter(1), + sharding={devices=[2,1]0,1} + %concat = f32[5,16] concatenate(%param.0, %param.1), + dimensions={1} + ROOT %tuple = (f32[5,16]) tuple(%concat) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "concat"), + op::Sharding("{devices=[2,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, TupleBackwardPass) { + const char* const hlo_string = R"( +HloModule module + +ENTRY %tuple { + %param.0 = f32[1] parameter(0) + %param.1 = f32[3] parameter(1) + %copy.0 = f32[1] copy(%param.0) + %copy.1 = f32[3] copy(param.1) + ROOT %tuple = (f32[1], f32[3]) tuple(%copy.0, %copy.1), + sharding={{replicated}, {devices=[2]0,1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "copy.0"), + op::Sharding("{replicated}")); + EXPECT_THAT(FindInstruction(module.get(), "copy.1"), + op::Sharding("{devices=[2]0,1}")); +} + +TEST_F(ShardingPropagationTest, AllReduce) { + const char* const hlo_string = R"( +HloModule module + +%add (lhs: f32[], rhs: f32[]) -> f32[] { + %add_lhs = f32[] parameter(0) + %add_rhs = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %add_lhs, f32[] %add_rhs) +} + +ENTRY %entry { + %param.0 = f32[3] parameter(0) + %param.1 = f32[3] parameter(1) + + %copy_f_t = f32[3] copy(%param.1), sharding={devices=[2]0,1} + %crs_f.tiled = f32[3] all-reduce(%copy_f_t), to_apply=%add + %crs_f.none = f32[3] all-reduce(%copy_f_t), to_apply=%add, + channel_id=1 + + %crs_b.replicated = f32[3] all-reduce(%param.0), to_apply=%add + %copy_b_r = f32[3] copy(%crs_b.replicated), sharding={replicated} + + ROOT %tuple = (f32[3], f32[3], f32[3], f32[3]) tuple( + %crs_f.tiled, crs_f.none, %copy_b_r) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "crs_f.tiled"), + op::Sharding("{devices=[2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "crs_f.none"), op::NoSharding()); + + EXPECT_THAT(FindInstruction(module.get(), "crs_b.replicated"), + op::Sharding("{replicated}")); +} + +TEST_F(ShardingPropagationTest, While) { + const char* const hlo_string = R"( +HloModule module + +%cond { + %vars.cond = (u32[], f32[10]{0}) parameter(0) + %count.cond = u32[] get-tuple-element((u32[], f32[10]{0}) %vars.cond), index=0 + %limit = u32[] constant(10) + ROOT %lt = pred[] compare(u32[] %count.cond, u32[] %limit), direction=LT +} + +%body { + %vars = (u32[], f32[10]{0}) parameter(0) + %count = u32[] 
get-tuple-element(%vars), index=0
+  %acc = f32[10]{0} get-tuple-element((u32[], f32[10]{0}) %vars), index=1
+
+  %one = u32[] constant(1)
+  %count.1 = u32[] add(u32[] %count, u32[] %one), sharding={replicated}
+  %acc.1 = f32[10]{0} add(f32[10]{0} %acc, f32[10]{0} %acc)
+  ROOT %tuple = (u32[], f32[10]{0}) tuple(u32[] %count.1, f32[10]{0} %acc.1)
+}
+
+ENTRY %entry {
+  %p0 = f32[10]{0} parameter(0)
+  %p0.copy = f32[10]{0} copy(f32[10]{0} %p0)
+  %p1 = f32[10]{0} parameter(1)
+  %zero = u32[] constant(0)
+  %init = (u32[], f32[10]{0}) tuple(u32[] %zero, f32[10]{0} %p0.copy)
+  %while = (u32[], f32[10]{0}) while((u32[], f32[10]{0}) %init),
+    body=%body, condition=%cond
+  %res = f32[10]{0} get-tuple-element((u32[], f32[10]{0}) %while), index=1
+  %prev = f32[10]{0} get-tuple-element((u32[], f32[10]{0}) %init), index=1
+  %res.1 = f32[10]{0} multiply(f32[10]{0} %res, %prev)
+  ROOT %res_tuple = (f32[10]{0}) tuple(f32[10]{0} %res.1)
+})";
+
+  auto while_is_sharded = [this](HloModule* module,
+                                 const HloSharding& sharding) {
+    TF_ASSERT_OK_AND_ASSIGN(bool changed, ShardingPropagation().Run(module));
+    EXPECT_TRUE(changed);
+    auto while_instr = FindInstruction(module, "while");
+    EXPECT_NE(nullptr, while_instr);
+    std::vector<HloInstruction*> instructions{
+        while_instr, while_instr->while_body()->root_instruction(),
+        while_instr->while_body()->parameter_instruction(0),
+        while_instr->while_condition()->parameter_instruction(0)};
+
+    for (auto instr : instructions) {
+      EXPECT_TRUE(instr->has_sharding());
+      EXPECT_EQ(sharding, instr->sharding());
+    }
+  };
+  {
+    // Propagation of user-defined partial sharding of while-related instruction
+    // (body root in this test).
+    TF_ASSERT_OK_AND_ASSIGN(auto module,
+                            ParseAndReturnVerifiedModule(hlo_string));
+    auto body_root = FindInstruction(module.get(), "tuple");
+    EXPECT_NE(nullptr, body_root);
+    auto sharding =
+        ParseSharding("{{replicated}, {devices=[2]0,1}}").ConsumeValueOrDie();
+    body_root->set_sharding(sharding);
+    while_is_sharded(module.get(), sharding);
+  }
+  {
+    // Propagation from acc.1 to the rest of the loop.
+ TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + auto acc_1 = FindInstruction(module.get(), "acc.1"); + EXPECT_NE(nullptr, acc_1); + acc_1->set_sharding(ParseSharding("{devices=[2]0,1}").ConsumeValueOrDie()); + + while_is_sharded( + module.get(), + ParseSharding("{{replicated}, {devices=[2]0,1}}").ConsumeValueOrDie()); + } +} + +TEST_F(ShardingPropagationTest, Dot) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %param.0 = f32[8,256,128] parameter(0) + %param.1 = f32[8,128,512] parameter(1) + %param.2 = f32[8,128] parameter(2) + + %p0_copy_0 = f32[8,256,128] copy(%param.0), + sharding={devices=[1,4,1]0,1,2,3} + %p1_copy_0 = f32[8,128,512] copy(%param.1), + sharding={devices=[1,2,2]0,1,2,3} + %p2_copy = f32[8,128] copy(%param.2) + %dot_prop_rhs = f32[8,256,512] dot(%p0_copy_0, %p1_copy_0), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} + %dot_prop_lhs = f32[8,512,256] dot(%p1_copy_0, %p0_copy_0), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={1}, rhs_contracting_dims={2} + %dot_mat_vec = f32[8,256] dot(%p0_copy_0, %p2_copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} + + %p0_copy_1 = f32[8,256,128] copy(%param.0) + %p1_copy_1 = f32[8,128,512] copy(%param.1) + %dot_back_prop_rhs = f32[8,256,512] dot(%p0_copy_1, %p1_copy_1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} + %copy_back_prop_rhs = f32[8,256,512] copy(%dot_back_prop_rhs), + sharding={devices=[1,2,2]0,1,2,3} + + ROOT %tuple = (f32[8,256,256], f32[8,256,256], f32[8,256]) + tuple(%dot_prop_lhs, %dot_prop_rhs, %dot_mat_vec, %copy_back_prop_rhs) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "dot_prop_rhs"), + op::Sharding("{devices=[1,2,2]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "dot_prop_lhs"), + op::Sharding("{devices=[1,2,2]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "dot_mat_vec"), + op::Sharding("{devices=[1,4]0,1,2,3}")); + + EXPECT_THAT(FindInstruction(module.get(), "p0_copy_1"), + op::Sharding("{replicated}")); + EXPECT_THAT(FindInstruction(module.get(), "p1_copy_1"), + op::Sharding("{devices=[1,2,2]0,1,2,3}")); + EXPECT_THAT(FindInstruction(module.get(), "dot_back_prop_rhs"), + op::Sharding("{devices=[1,2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, DotTiledBatchDim) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,256,512] parameter(0) + %p1 = f32[8,512,128] parameter(1) + + %add = f32[8,256,512] add(%p0, %p0) + %dot = f32[8,256,128] dot(%add, %p1), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={1} + %res = f32[8,32768] reshape(%dot), sharding={devices=[2,2]0,1,2,3} + + ROOT %tuple = (f32[8,32768]) tuple(%res) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "add"), + op::Sharding("{devices=[2,2,1]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, ConcatFromUserUnshardedDim) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,128] parameter(0) + %p1 = f32[8,128] 
parameter(1) + %c0 = f32[8,128] copy(%p0) + %c1 = f32[8,128] copy(%p1) + + %concat = f32[16,128] concatenate(%c0, %c1), + dimensions={0}, + sharding={devices=[1,2]0,1} + ROOT %tuple = (f32[16,128]) tuple(%concat) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "c0"), + op::Sharding("{devices=[1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, ConcatFromUserShardedDim) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,128] parameter(0) + %p1 = f32[8,128] parameter(1) + %c0 = f32[8,128] copy(%p0) + %c1 = f32[8,128] copy(%p1) + + %concat = f32[16,128] concatenate(%c0, %c1), + dimensions={0}, + sharding={devices=[3,1]0,1,2} + ROOT %tuple = (f32[16,128]) tuple(%concat) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "c0"), + op::Sharding("{devices=[2,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[2,1]1,2}")); +} + +TEST_F(ShardingPropagationTest, ConcatFromUserShardedDimMaximalOperand) { + const char* const hlo_string = R"( +HloModule module +ENTRY %conv { + %p0 = f32[8,128] parameter(0) + %p1 = f32[24,128] parameter(1) + %c0 = f32[8,128] copy(%p0) + %c1 = f32[24,128] copy(%p1) + + %concat = f32[32,128] concatenate(%c0, %c1), + dimensions={0}, + sharding={devices=[4,1]0,1,2,3} + ROOT %tuple = (f32[32,128]) tuple(%concat) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "c0"), op::NoSharding()); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[3,1]1,2,3}")); +} + +TEST_F(ShardingPropagationTest, ReplicatedToSideEffecting) { + const char* const hlo_string = R"( +HloModule module +ENTRY entry_computation { + %const.0 = s32[] constant(0), sharding={replicated} + %const.1 = s32[] constant(2147483647), sharding={replicated} + %rng = s32[4]{0} rng(%const.0, %const.1), + distribution=rng_uniform + ROOT %root = (s32[4]{0}) tuple(%rng) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_FALSE(changed); + EXPECT_THAT(FindInstruction(module.get(), "rng"), op::NoSharding()); +} + +TEST_F(ShardingPropagationTest, PartReplicatedTupleUser) { + const char* const hlo_string = R"( +HloModule module +ENTRY entry_computation { + %param.0 = f32[5] parameter(0) + %param.1 = f32[7] parameter(1) + %param.2 = f32[9] parameter(2) + %tuple.0 = (f32[5], f32[7]) tuple(%param.0, %param.1) + ROOT %tuple.1 = ((f32[5], f32[7]), f32[9]) tuple(%tuple.0, %param.2), + sharding={{maximal device=0}, {replicated}, {maximal device=1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "tuple.0"), + op::Sharding("{{maximal device=0}, {replicated}}")); +} + 
+TEST_F(ShardingPropagationTest, Conditional) { + const char* const hlo_string = R"( +HloModule module + +%true_comp { + %tp = (f32[3,5]) parameter(0) + %tgte = f32[3,5] get-tuple-element(%tp), index=0 + %ttr = f32[5,3] transpose(%tgte), dimensions={1,0} + ROOT %tr = (f32[5,3]) tuple(%ttr) +} + +%false_comp { + %fp = (f32[5,3]) parameter(0) + %fgte = f32[5,3] get-tuple-element(%fp), index=0 + ROOT %fr = (f32[5,3]) tuple(%fgte) +} + +ENTRY entry { + %cond = pred[] parameter(0) + %true_param = (f32[3,5]) parameter(1), sharding={{devices=[1,2]0,1}} + %false_param = (f32[5,3]) parameter(2), sharding={{devices=[1,3]0,1,2}} + %conditional = (f32[5,3]) conditional( + %cond, %true_param, %false_param), + true_computation=%true_comp, + false_computation=%false_comp + ROOT %root = f32[5,3] get-tuple-element(%conditional), index=0 +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "tp"), + op::Sharding("{{devices=[1,2]0,1}}")); + EXPECT_THAT(FindInstruction(module.get(), "tgte"), + op::Sharding("{devices=[1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "ttr"), + op::Sharding("{devices=[2,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "tr"), + op::Sharding("{{devices=[2,1]0,1}}")); + EXPECT_THAT(FindInstruction(module.get(), "fp"), + op::Sharding("{{devices=[1,3]0,1,2}}")); + EXPECT_THAT(FindInstruction(module.get(), "fgte"), + op::Sharding("{devices=[1,3]0,1,2}")); + EXPECT_THAT(FindInstruction(module.get(), "fr"), + op::Sharding("{{devices=[2,1]0,1}}")); + EXPECT_THAT(FindInstruction(module.get(), "conditional"), + op::Sharding("{{devices=[2,1]0,1}}")); +} + +TEST_F(ShardingPropagationTest, TupleFromUser) { + const char* const hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[13] parameter(0) + %p1 = f32[15] parameter(1) + %p2 = f32[17] parameter(2) + %t0 = (f32[13], f32[15]) tuple(%p0, %p1) + %t1 = ((f32[13], f32[15]), f32[17]) tuple(%t0, %p2) + %gte.0 = (f32[13], f32[15]) get-tuple-element(%t1), index=0 + %gte.1 = f32[13] get-tuple-element(%gte.0), index=0 + %gte.2 = f32[15] get-tuple-element(%gte.0), index=1 + %gte.3 = f32[17] get-tuple-element(%t1), index=1 + ROOT %t2 = (f32[13], f32[15], f32[17]) tuple(%gte.1, %gte.2, %gte.3), + sharding={{replicated}, {devices=[2]0,1}, {devices=[3]1,2,3}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "t0"), + op::Sharding("{{replicated}, {devices=[2]0,1}}")); + EXPECT_THAT( + FindInstruction(module.get(), "t1"), + op::Sharding("{{replicated}, {devices=[2]0,1}, {devices=[3]1,2,3}}")); +} + +TEST_F(ShardingPropagationTest, DynamicSliceForwardPass) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0), sharding={devices=[1,1,2]0,1} + %p1 = s32[] parameter(1) + %i0 = s32[] constant(0) + %ds = f32[11,1,15] dynamic-slice(%c0, %i0, %p1, %i0), + dynamic_slice_sizes={11,1,15} + ROOT %root = (f32[11,1,15]) tuple(%ds) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "ds"), + 
op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DynamicSliceBackwardPass) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0) + %p1 = s32[] parameter(1) + %i0 = s32[] constant(0) + %ds = f32[11,1,15] dynamic-slice(%c0, %i0, %p1, %i0), + dynamic_slice_sizes={11,1,15}, + sharding={devices=[1,1,2]0,1} + ROOT %root = (f32[11,1,15]) tuple(%ds) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "ds"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DynamicUpdateSliceForwardPassBase) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0), sharding={devices=[1,1,2]0,1} + %p1 = f32[11,1,15] parameter(1) + %c1 = f32[11,1,15] copy(%p1) + %p2 = s32[] parameter(2) + %i0 = s32[] constant(0) + %dus = f32[11,13,15] dynamic-update-slice(%c0, %c1, %i0, %p2, %i0) + ROOT %root = (f32[11,13,15]) tuple(%dus) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "dus"), + op::Sharding("{devices=[1,1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DynamicUpdateSliceForwardPassUpdate) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0) + %p1 = f32[11,1,15] parameter(1) + %c1 = f32[11,1,15] copy(%p1), sharding={devices=[1,1,2]0,1} + %p2 = s32[] parameter(2) + %i0 = s32[] constant(0) + %dus = f32[11,13,15] dynamic-update-slice(%c0, %c1, %i0, %p2, %i0) + ROOT %root = (f32[11,13,15]) tuple(%dus) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "dus"), + op::Sharding("{devices=[1,1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c0"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, DynamicUpdateSliceBackwardPass) { + const char* hlo_string = R"( +HloModule module +ENTRY %entry { + %p0 = f32[11,13,15] parameter(0) + %c0 = f32[11,13,15] copy(%p0) + %p1 = f32[11,1,15] parameter(1) + %c1 = f32[11,1,15] copy(%p1) + %p2 = s32[] parameter(2) + %i0 = s32[] constant(0) + %dus = f32[11,13,15] dynamic-update-slice(%c0, %c1, %i0, %p2, %i0), + sharding={devices=[1,1,2]0,1} + ROOT %root = (f32[11,13,15]) tuple(%dus) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "c0"), + op::Sharding("{devices=[1,1,2]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "c1"), + op::Sharding("{devices=[1,1,2]0,1}")); +} + +TEST_F(ShardingPropagationTest, EinsumLHSBatchPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = 
f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs) + %conv = f32[32,24,39296] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf_0oi->0bf, window={size=32 stride=31 lhs_dilate=32} + ROOT %copy = f32[32,24,39296] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "rhs.copy"), + op::Sharding("{devices=[2,1,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[2,1,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, EinsumOutputBatchPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs) + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs) + %conv = f32[32,24,39296] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf_0oi->0bf, window={size=32 stride=31 lhs_dilate=32}, + sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs.copy"), + op::Sharding("{devices=[2,1,1]0,1}")); + EXPECT_THAT(FindInstruction(module.get(), "rhs.copy"), + op::Sharding("{devices=[2,1,1]0,1}")); +} + +TEST_F(ShardingPropagationTest, EinsumLHSNonContractingPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[32,39296,64,1] parameter(1) + %rhs.copy = f32[32,39296,64,1] copy(%rhs) + %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, window={size=32x1 stride=31x1 lhs_dilate=32x1} + ROOT %copy = f32[32,24,39296,128] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumOutputLHSNonContractingPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs) + %rhs = f32[32,39296,64,1] parameter(1) + %rhs.copy = f32[32,39296,64,1] copy(%rhs) + ROOT %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, window={size=32x1 stride=31x1 lhs_dilate=32x1}, + sharding={devices=[1,2,1,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "lhs.copy"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumRHSNonContractingPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,1] parameter(0) + %lhs.copy = f32[32,24,64,1] copy(%lhs) + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,2,1,2]0,1,2,3} + %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + 
dim_labels=0bf1_0oi1->0bf1, + window={size=32x128 stride=31x1 pad=0_0x127_127 lhs_dilate=32x1 rhs_reversal=0x1} + ROOT %copy = f32[32,24,39296,128] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,1,2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumOutputRHSNonContractingPartitioned) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,1] parameter(0) + %lhs.copy = f32[32,24,64,1] copy(%lhs) + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs) + ROOT %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, + window={size=32x128 stride=31x1 pad=0_0x127_127 lhs_dilate=32x1 rhs_reversal=0x1}, + sharding={devices=[1,1,2,2]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "rhs.copy"), + op::Sharding("{devices=[1,2,1,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumChooseLargerOperand) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,1] parameter(0) + %lhs.copy = f32[32,24,64,1] copy(%lhs), sharding={devices=[1,4,1,1]0,1,2,3} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,2,1,2]0,1,2,3} + %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, + window={size=32x128 stride=31x1 pad=0_0x127_127 lhs_dilate=32x1 rhs_reversal=0x1} + ROOT %copy = f32[32,24,39296,128] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[1,1,2,2]0,1,2,3}")); +} + +TEST_F(ShardingPropagationTest, EinsumChooseBatchFirst) { + const char* hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,1] parameter(0) + %lhs.copy = f32[32,24,64,1] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + %conv = f32[32,24,39296,128] convolution(%lhs.copy, %rhs.copy), + dim_labels=0bf1_0oi1->0bf1, + window={size=32x128 stride=31x1 pad=0_0x127_127 lhs_dilate=32x1 rhs_reversal=0x1} + ROOT %copy = f32[32,24,39296,128] copy(%conv) +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, + ShardingPropagation().Run(module.get())); + EXPECT_TRUE(changed); + EXPECT_THAT(FindInstruction(module.get(), "conv"), + op::Sharding("{devices=[2,1,1,1]0,1}")); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD index 5be6a04f934..280af2246bb 100644 --- a/tensorflow/compiler/xla/service/spmd/BUILD +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -33,6 +33,7 @@ cc_library( "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/client/lib:comparators", + 
"//tensorflow/compiler/xla/service:dot_as_convolution_util", "//tensorflow/compiler/xla/service:flatten_call_graph", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index 090fcd48893..8eee452328e 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/dot_as_convolution_util.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -2905,6 +2906,46 @@ Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( } Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { + auto dot_dnums = dot_as_convolution_util::ParseDotGeneralFromConvolution(hlo); + if (dot_dnums) { + // Use HandleDotHelper() for convs that are actually einsums. + spmd::DotGeneralDimsMapping mapping; + for (const auto& dims : dot_dnums->batch_dims) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dims.lhs; + mapping.batch_dims.back().rhs = dims.rhs; + mapping.batch_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->contracting_dims) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dims.lhs; + mapping.contracting_dims.back().rhs = dims.rhs; + mapping.contracting_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->lhs_non_contracting_dims) { + mapping.lhs_non_contracting_dims.emplace_back(); + mapping.lhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.lhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.lhs_non_contracting_dims.back().output = dims.output; + } + for (const auto& dims : dot_dnums->rhs_non_contracting_dims) { + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = dims.lhs; + mapping.rhs_non_contracting_dims.back().rhs = dims.rhs; + mapping.rhs_non_contracting_dims.back().output = dims.output; + } + auto create_sharded_conv = + [&](HloInstruction* lhs_hlo, HloInstruction* rhs_hlo, + spmd::SpmdBuilder* b) -> StatusOr { + TF_ASSIGN_OR_RETURN( + auto sharded_conv, + dot_as_convolution_util::CreateShardedConvForDotGeneralConvolution( + *hlo, *dot_dnums, lhs_hlo, rhs_hlo)); + return b->AddInstruction(std::move(sharded_conv)); + }; + return HandleDotHelper(hlo, mapping, create_sharded_conv); + } + auto lhs = GetPartitionedHlo(hlo->operand(0)); auto rhs = GetPartitionedHlo(hlo->operand(1)); const HloSharding& sharding = hlo->sharding(); From a33f9b4404f2afd50e08e42ca441c86d5146bfc0 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Fri, 22 May 2020 19:07:28 -0700 Subject: [PATCH 1062/1533] Reduce Layer.__call__ overhead by ~20% This is achieved by improving the way masks are handled for inputs and outputs. For the common case where masks are not input and are not output, minimal work is done now. For the masking case, the work done is about the same. 
PiperOrigin-RevId: 312871996 Change-Id: I2e122551bec27d075193e1881bf236d570d25ce4 --- tensorflow/python/keras/engine/base_layer.py | 74 +++++++++++--------- tensorflow/python/keras/engine/functional.py | 3 +- tensorflow/python/keras/engine/sequential.py | 4 +- 3 files changed, 43 insertions(+), 38 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index b34616632e3..9dd05e53df7 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -386,6 +386,11 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # might want to turn it off, like Sequential model. self._auto_track_sub_layers = True + # Will compute masking if `compute_mask` is overridden or `supports_masking` + # is set. + self._compute_mask_overridden = (not getattr(self.compute_mask, + '_is_default', False)) + @trackable.no_automatic_dependency_tracking @generic_utils.default def build(self, input_shape): @@ -844,7 +849,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # setting the `_keras_mask` attribute on the inputs to a Layer. Masks passed # explicitly take priority. mask_arg_passed_by_framework = False - input_masks = self._collect_input_masks(inputs, args, kwargs) + input_masks = self._collect_input_masks(inputs, input_list, args, kwargs) if (self._expects_mask_arg and input_masks is not None and not self._call_arg_was_passed('mask', args, kwargs)): mask_arg_passed_by_framework = True @@ -973,7 +978,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): outputs = self._set_connectivity_metadata((inputs,) + args, kwargs, outputs) self._handle_activity_regularization(inputs, outputs) - self._set_mask_metadata(inputs, outputs, input_masks) + self._set_mask_metadata(inputs, outputs, input_masks, build_graph) if hasattr(self, '_set_inputs') and not self.inputs: # Subclassed network: explicitly set metadata normally set by # a call to self._set_inputs(). @@ -987,7 +992,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): self._compute_dtype): outputs = self.call(cast_inputs, *args, **kwargs) self._handle_activity_regularization(inputs, outputs) - self._set_mask_metadata(inputs, outputs, input_masks) + self._set_mask_metadata(inputs, outputs, input_masks, build_graph) if hasattr(self, '_set_save_spec'): self._set_save_spec(cast_inputs) @@ -2259,47 +2264,45 @@ class Layer(module.Module, version_utils.LayerVersionSelector): mean_activity_loss = activity_loss / batch_size self.add_loss(mean_activity_loss) - def _set_mask_metadata(self, inputs, outputs, previous_mask): + def _set_mask_metadata(self, inputs, outputs, previous_mask, build_graph): + # Many `Layer`s don't need to call `compute_mask`. + # This method is optimized to do as little work as needed for the common + # case. + if not self.supports_masking and not self._compute_mask_overridden: + return + flat_outputs = nest.flatten(outputs) mask_already_computed = ( getattr(self, '_compute_output_and_mask_jointly', False) or all(getattr(x, '_keras_mask', None) is not None for x in flat_outputs)) - - # Only compute the mask if the Layer explicitly supports masking or has - # overridden `compute_mask`. 
- should_compute_mask = ( - hasattr(self, 'compute_mask') and - (self.supports_masking or - not getattr(self.compute_mask, '_is_default', False))) - if mask_already_computed: - flat_masks = [getattr(x, '_keras_mask', None) for x in flat_outputs] - elif not should_compute_mask: - flat_masks = [None for _ in flat_outputs] - else: - output_masks = self.compute_mask(inputs, previous_mask) - # `compute_mask` can return a single `None` even when a Layer - # has multiple outputs. - if output_masks is None: - flat_masks = [None for _ in flat_outputs] - else: - flat_masks = nest.flatten(output_masks) + if build_graph: + self._set_mask_keras_history_checked(flat_outputs) + return - for output, mask in zip(flat_outputs, flat_masks): + output_masks = self.compute_mask(inputs, previous_mask) + if output_masks is None: + return + + flat_masks = nest.flatten(output_masks) + for tensor, mask in zip(flat_outputs, flat_masks): try: - output._keras_mask = mask + tensor._keras_mask = mask except AttributeError: # C Type such as np.ndarray. pass - if tf_utils.are_all_symbolic_tensors(flat_outputs): - for output in flat_outputs: - if getattr(output, '_keras_mask', None) is not None: - # Do not track masks for `TensorFlowOpLayer` construction. - output._keras_mask._keras_history_checked = True + if build_graph: + self._set_mask_keras_history_checked(flat_outputs) - def _collect_input_masks(self, inputs, args, kwargs): + def _set_mask_keras_history_checked(self, flat_outputs): + for output in flat_outputs: + if getattr(output, '_keras_mask', None) is not None: + # Do not track masks for `TensorFlowOpLayer` construction. + output._keras_mask._keras_history_checked = True + + def _collect_input_masks(self, inputs, input_list, args, kwargs): """Checks if `mask` argument was passed, else gathers mask from inputs.""" if self._call_arg_was_passed('mask', args, kwargs): return self._get_call_arg_value('mask', args, kwargs) @@ -2307,11 +2310,12 @@ class Layer(module.Module, version_utils.LayerVersionSelector): if not self._should_compute_mask: return None - input_masks = nest.map_structure(lambda t: getattr(t, '_keras_mask', None), - inputs) - if generic_utils.is_all_none(input_masks): + input_masks = [getattr(t, '_keras_mask', None) for t in input_list] + if all(mask is None for mask in input_masks): return None - return input_masks + + # Only do expensive `nest` operation when masking is actually being used. + return nest.pack_sequence_as(inputs, input_masks) def _call_arg_was_passed(self, arg_name, args, kwargs, inputs_in_args=False): # Performance optimization: do no work in most common case. diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index f219e590daf..761955100ea 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -358,7 +358,8 @@ class Functional(training_lib.Model): # by itself because it will duplicate any updates and losses in graph # mode by `call`ing the Layers again. output_tensors = self._run_internal_graph(inputs, mask=mask) - return nest.map_structure(lambda t: t._keras_mask, output_tensors) + return nest.map_structure(lambda t: getattr(t, '_keras_mask', None), + output_tensors) def call(self, inputs, training=None, mask=None): """Calls the model on new inputs. 
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index d07ed477ba9..d8325b98504 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -397,7 +397,7 @@ class Sequential(functional.Functional):
 raise ValueError(SINGLE_LAYER_OUTPUT_ERROR_MSG)
 # `outputs` will be the inputs to the next layer.
 inputs = outputs
- mask = outputs._keras_mask
+ mask = getattr(outputs, '_keras_mask', None)
 return outputs
 def compute_output_shape(self, input_shape):
@@ -411,7 +411,7 @@ class Sequential(functional.Functional):
 # by itself because it will duplicate any updates and losses in graph
 # mode by `call`ing the Layers again.
 outputs = self.call(inputs, mask=mask)
- return outputs._keras_mask
+ return getattr(outputs, '_keras_mask', None)
 @deprecated('2021-01-01', 'Please use `model.predict()` instead.')
 def predict_proba(self, x, batch_size=32, verbose=0):

From 2544e4e277c7142a5a803558ffd1e7dc27d3c1c2 Mon Sep 17 00:00:00 2001
From: Raman Sarokin
Date: Fri, 22 May 2020 19:17:58 -0700
Subject: [PATCH 1063/1533] Added new attributes for 3D operations:
 Reshape3DAttributes, Slice3DAttributes, Transpose3DAttributes. Added methods
 for shape calculation for these attributes.

PiperOrigin-RevId: 312872498
Change-Id: Ia2539ad880bae0869f8d1e379d4aedad9a10095a
---
 .../lite/delegates/gpu/common/operations.cc | 15 +++++++++
 .../lite/delegates/gpu/common/operations.h | 31 +++++++++++++++++++
 2 files changed, 46 insertions(+)

diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc
index 8fcbe379e11..c3861ca2baa 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.cc
+++ b/tensorflow/lite/delegates/gpu/common/operations.cc
@@ -499,6 +499,14 @@ BHWC CalculateOutputShape(const BHWC& input, const SliceAttributes& attr) {
 StridedSize(attr.ends.c - attr.starts.c, attr.strides.c));
 }
+BHWDC CalculateOutputShape(const BHWDC& input, const Slice3DAttributes& attr) {
+ return BHWDC(StridedSize(attr.ends.b - attr.starts.b, attr.strides.b),
+ StridedSize(attr.ends.h - attr.starts.h, attr.strides.h),
+ StridedSize(attr.ends.w - attr.starts.w, attr.strides.w),
+ StridedSize(attr.ends.d - attr.starts.d, attr.strides.d),
+ StridedSize(attr.ends.c - attr.starts.c, attr.strides.c));
+}
+
 BHWC CalculateOutputShape(const BHWC& input, const PadAttributes& attr) {
 return BHWC(attr.appended.b + attr.prepended.b + input.b,
 attr.appended.h + attr.prepended.h + input.h,
@@ -734,5 +742,12 @@ BHWC CalculateOutputShape(const BHWC& input, const TransposeAttributes& attr) {
 input.get(attr.perm.w), input.get(attr.perm.c));
 }
+BHWDC CalculateOutputShape(const BHWDC& input,
+ const Transpose3DAttributes& attr) {
+ return BHWDC(input.get(attr.perm.b), input.get(attr.perm.h),
+ input.get(attr.perm.w), input.get(attr.perm.d),
+ input.get(attr.perm.c));
+}
+
 } // namespace gpu
 } // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h
index d0268eee585..9d714d9bc55 100644
--- a/tensorflow/lite/delegates/gpu/common/operations.h
+++ b/tensorflow/lite/delegates/gpu/common/operations.h
@@ -399,6 +399,9 @@ struct Resize3DAttributes {
 // If true, the centers of the 8 corner pixels of the input and output tensors
 // are aligned, preserving the values at the corner pixels. Defaults to false. 
bool align_corners = false; + // half_pixel_centers assumes pixels are of half the actual dimensions, and + // yields more accurate resizes. Only applicable to BILINEAR sampling. + bool half_pixel_centers = false; }; float CalculateResizeScale(int32_t input_size, int32_t output_size, @@ -460,6 +463,20 @@ struct SliceAttributes { // input. BHWC CalculateOutputShape(const BHWC& input, const SliceAttributes& attr); +// Simple slicing without advanced support for shrinking, reverse slicing etc. +struct Slice3DAttributes { + // Specifies start and end dimensions for slicing. + BHWDC starts; + BHWDC ends; + + // Stride should be >= 1. + BHWDC strides; +}; + +// @return shape of a tensor after Slice3D operation is applied to the given +// input. +BHWDC CalculateOutputShape(const BHWDC& input, const Slice3DAttributes& attr); + struct AddAttributes { TensorOrScalar param; }; @@ -485,6 +502,10 @@ struct ReshapeAttributes { BHWC new_shape; }; +struct Reshape3DAttributes { + BHWDC new_shape; +}; + struct TransposeAttributes { // A permutation of the dimensions of input tensor BHWC perm; @@ -494,6 +515,16 @@ struct TransposeAttributes { // the given input. BHWC CalculateOutputShape(const BHWC& input, const TransposeAttributes& attr); +struct Transpose3DAttributes { + // A permutation of the dimensions of input tensor + BHWDC perm; +}; + +// @return shape of a tensor after Transpose3D operation is applied to +// the given input. +BHWDC CalculateOutputShape(const BHWDC& input, + const Transpose3DAttributes& attr); + struct SpaceToDepthAttributes { int block_size; }; From f654ac48a62b580ea05fe451593a0b6b698275c3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 23 May 2020 02:02:34 -0700 Subject: [PATCH 1064/1533] Update GraphDef version to 410. PiperOrigin-RevId: 312894360 Change-Id: I57f76e7bdd6225631c89c976abe7d082fb196c7f --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 3724f06ba4b..a003c62e2d5 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 409 // Updated: 2020/5/22 +#define TF_GRAPH_DEF_VERSION 410 // Updated: 2020/5/23 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 73b9acd1438857c12798e56cdb2b8bf5fd94c878 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 23 May 2020 02:02:38 -0700 Subject: [PATCH 1065/1533] compat: Update forward compatibility horizon to 2020-05-23 PiperOrigin-RevId: 312894373 Change-Id: If60600d496f720422ed7cbb769626a07455845da --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 56bf2894db7..09dfe2cc91a 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 22) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 23) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From df5e319d05778e6773e7703dc61fa6baaf4fe3b3 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Sat, 23 May 2020 09:43:17 -0700 Subject: [PATCH 1066/1533] Make the tf2xla "tensor list size not set" error message a bit more ergonomic PiperOrigin-RevId: 312916547 Change-Id: Idbbe406a35205a0fb6dc5e620e04cf3bccefa43d --- .../compiler/tf2xla/kernels/tensor_list_ops.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index d01f094dc2e..976ff91f6ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -136,8 +136,11 @@ class TensorListReserveOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements)); OP_REQUIRES( ctx, num_elements >= 0, - errors::InvalidArgument("XLA compilation requires a fixed tensor list " - "size. Set the number of elements.")); + errors::InvalidArgument( + "XLA compilation requires a fixed tensor list size. Set the number " + "of elements. This could also happen if you're using a TensorArray " + "in a while loop that does not have its maximum_iteration set, you " + "can fix this by setting maximum_iteration to a suitable value.")); // If element shape is compile time constant and it's not "unknown rank" // shape (-1), create an initialized TensorList. Otherwise create an @@ -197,10 +200,13 @@ class EmptyTensorListOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { int64 max_num_elements; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements)); - OP_REQUIRES( - ctx, max_num_elements >= 0, - errors::InvalidArgument("XLA compilation requires a fixed tensor list " - "size. Set the max number of elements.")); + OP_REQUIRES(ctx, max_num_elements >= 0, + errors::InvalidArgument( + "XLA compilation requires a fixed tensor list size. Set " + "the max number of elements. This could also happen if " + "you're using a TensorArray in a while loop that does not " + "have its maximum_iteration set, you can fix this by " + "setting maximum_iteration to a suitable value.")); if (dtype_ != DT_VARIANT) { // We are creating a non-nested TensorList. From 144b3dc7902c05078341b1942fd1312a28f15003 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 23 May 2020 09:59:52 -0700 Subject: [PATCH 1067/1533] Make the tf2xla "tensor list size not set" error message a bit more ergonomic PiperOrigin-RevId: 312917264 Change-Id: I14c373860aafed5050ac42510d341fab95307c8d --- .../compiler/tf2xla/kernels/tensor_list_ops.cc | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index 976ff91f6ce..d01f094dc2e 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -136,11 +136,8 @@ class TensorListReserveOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements)); OP_REQUIRES( ctx, num_elements >= 0, - errors::InvalidArgument( - "XLA compilation requires a fixed tensor list size. Set the number " - "of elements. 
This could also happen if you're using a TensorArray " - "in a while loop that does not have its maximum_iteration set, you " - "can fix this by setting maximum_iteration to a suitable value.")); + errors::InvalidArgument("XLA compilation requires a fixed tensor list " + "size. Set the number of elements.")); // If element shape is compile time constant and it's not "unknown rank" // shape (-1), create an initialized TensorList. Otherwise create an @@ -200,13 +197,10 @@ class EmptyTensorListOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { int64 max_num_elements; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements)); - OP_REQUIRES(ctx, max_num_elements >= 0, - errors::InvalidArgument( - "XLA compilation requires a fixed tensor list size. Set " - "the max number of elements. This could also happen if " - "you're using a TensorArray in a while loop that does not " - "have its maximum_iteration set, you can fix this by " - "setting maximum_iteration to a suitable value.")); + OP_REQUIRES( + ctx, max_num_elements >= 0, + errors::InvalidArgument("XLA compilation requires a fixed tensor list " + "size. Set the max number of elements.")); if (dtype_ != DT_VARIANT) { // We are creating a non-nested TensorList. From 913f1c7013cbde912d08b1530bad325812dcdcaf Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Sat, 23 May 2020 15:13:44 -0400 Subject: [PATCH 1068/1533] [Lite]: Fix memory leak from model Signed-off-by: Gaurav Singh --- .../lite/experimental/delegates/coreml/builders/op_builder.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc index 2581b58f1e4..4cdfd519daf 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc @@ -95,6 +95,7 @@ CoreML::Specification::Model* GraphBuilder::BuildModel() { CoreML::Specification::EXACT_ARRAY_MAPPING); } else { fprintf(stderr, "Unsupported Core ML version: %d\n", coreml_version_); + delete(model); return nullptr; } auto* neural_network = model->mutable_neuralnetwork(); From 7738aca0dcf9f2d2d27b7c3bb1b17c0fb41bbb10 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 23 May 2020 22:56:38 +0000 Subject: [PATCH 1069/1533] Add complex tensor support for tf.debugging.assert_near This PR tries to address the issue raised in 39815 where tf.debugging.assert_near does not support complex tensors as was specified in docstring. This PR adds complex tensor support for tf.debugging.assert_near. This PR fixes 39815. Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/check_ops_test.py | 10 ++++++++++ tensorflow/python/ops/check_ops.py | 9 ++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py index 47f392d7438..6a1b5c1f952 100644 --- a/tensorflow/python/kernel_tests/check_ops_test.py +++ b/tensorflow/python/kernel_tests/check_ops_test.py @@ -528,6 +528,16 @@ class AssertAllCloseTest(test.TestCase): x = check_ops.assert_near(t1, t2) assert x is None + @test_util.run_in_graph_and_eager_modes + def test_doesnt_raise_complex(self): + x = constant_op.constant(1. 
+ 0.1j, name="x") + y = constant_op.constant(1.1 + 0.1j, name="y") + with ops.control_dependencies( + [check_ops.assert_near(x, y, atol=0., rtol=0.5, + message="failure message")]): + out = array_ops.identity(x) + self.evaluate(out) + class AssertLessTest(test.TestCase): diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 3085e05eaf6..c1a17bc13ab 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -812,12 +812,15 @@ def assert_near( x = ops.convert_to_tensor(x, name='x') y = ops.convert_to_tensor(y, name='y', dtype=x.dtype) - eps = np.finfo(x.dtype.as_numpy_dtype).eps + dtype = x.dtype + if dtype.is_complex: + dtype = dtype.real_dtype + eps = np.finfo(dtype.as_numpy_dtype).eps rtol = 10 * eps if rtol is None else rtol atol = 10 * eps if atol is None else atol - rtol = ops.convert_to_tensor(rtol, name='rtol', dtype=x.dtype) - atol = ops.convert_to_tensor(atol, name='atol', dtype=x.dtype) + rtol = ops.convert_to_tensor(rtol, name='rtol', dtype=dtype) + atol = ops.convert_to_tensor(atol, name='atol', dtype=dtype) if context.executing_eagerly(): x_name = _shape_and_dtype_str(x) From b9f941a53fa9490fee3306c8f448aeb56bed9ce3 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 23 May 2020 23:02:55 +0000 Subject: [PATCH 1070/1533] Fix incorrect reference of np.assert_allclose (should be np.testing.assert_allclose) In the docstring of tf.debugging.assert_near, the numpy compatibility part incorrectly uses np.assert_allclose. This should be np.testing.assert_allclose instead. This PR fixes the incorrect docstring. Signed-off-by: Yong Tang --- tensorflow/python/ops/check_ops.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 3085e05eaf6..bbb7ebdf8be 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -750,9 +750,9 @@ def assert_near_v2(x, y, rtol=None, atol=None, message=None, summarize=None, statically known. @compatibility(numpy) - Similar to `numpy.assert_allclose`, except tolerance depends on data type. - This is due to the fact that `TensorFlow` is often used with `32bit`, `64bit`, - and even `16bit` data. + Similar to `numpy.testing.assert_allclose`, except tolerance depends on data + type. This is due to the fact that `TensorFlow` is often used with `32bit`, + `64bit`, and even `16bit` data. @end_compatibility """ return assert_near(x=x, y=y, rtol=rtol, atol=atol, summarize=summarize, @@ -802,9 +802,9 @@ def assert_near( Op that raises `InvalidArgumentError` if `x` and `y` are not close enough. @compatibility(numpy) - Similar to `numpy.assert_allclose`, except tolerance depends on data type. - This is due to the fact that `TensorFlow` is often used with `32bit`, `64bit`, - and even `16bit` data. + Similar to `numpy.testing.assert_allclose`, except tolerance depends on data + type. This is due to the fact that `TensorFlow` is often used with `32bit`, + `64bit`, and even `16bit` data. @end_compatibility """ message = message or '' From c76a8d14b1465710618e3262ef7c84bc4677b152 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Sat, 23 May 2020 20:28:54 -0700 Subject: [PATCH 1071/1533] Rewrite `del` to treat undefinedness in a consistent manner. 
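Conceptually, the converter turns a `del` of a bare name into an assignment of an Undefined sentinel, so that a later read of the name is reported as "used before assignment"; composite targets such as `del l[0]` or `del obj.a` are left untouched. A runnable sketch of that behavior, using a stand-in Undefined class instead of AutoGraph's real `ag__.Undefined` and `ag__.ld`:

class Undefined(object):
  """Stand-in for AutoGraph's Undefined sentinel."""

  def __init__(self, symbol_name):
    self.symbol_name = symbol_name


def ld(value):
  """Stand-in for the converted variable read (`ag__.ld`)."""
  if isinstance(value, Undefined):
    raise NameError("'%s' is used before assignment" % value.symbol_name)
  return value


def test_fn(l):
  # The original body was `del l; return l`; the converter rewrites the
  # bare-name del roughly into the following assignment:
  l = Undefined('l')
  return ld(l)


try:
  test_fn([1, 2])
except NameError as e:
  print(e)  # -> 'l' is used before assignment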
PiperOrigin-RevId: 312947175 Change-Id: Ida4cb8c97ff280cb1011e33edc20a5c524fb8f8a --- .../python/autograph/converters/variables.py | 25 ++++++ .../autograph/converters/variables_test.py | 84 +++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/tensorflow/python/autograph/converters/variables.py b/tensorflow/python/autograph/converters/variables.py index 3028a65a69b..9784f50ed56 100644 --- a/tensorflow/python/autograph/converters/variables.py +++ b/tensorflow/python/autograph/converters/variables.py @@ -60,6 +60,31 @@ class VariableAccessTransformer(converter.Base): node = templates.replace_as_expression('ag__.ld(var_)', var_=node) return node + def visit_Delete(self, node): + node = self.generic_visit(node) + + rewrite_targets = [] + for tgt in node.targets: + # Don't rewrite composites like `del a[0]`. + if isinstance(tgt, gast.Name): + rewrite_targets.append(tgt) + + if not rewrite_targets: + return node + + results = [] + for tgt in rewrite_targets: + template = """ + var_ = ag__.Undefined(var_name) + """ + results.extend(templates.replace( + template, var_=tgt, var_name=gast.Constant(tgt.id, kind=None))) + remaining_targets = [n for n in node.targets if n not in rewrite_targets] + if remaining_targets: + results.append(gast.Delete(targets=remaining_targets)) + + return results + def visit_AugAssign(self, node): if isinstance(node.target, gast.Name): template = """ diff --git a/tensorflow/python/autograph/converters/variables_test.py b/tensorflow/python/autograph/converters/variables_test.py index 556dafbaa8a..93a31e63de3 100644 --- a/tensorflow/python/autograph/converters/variables_test.py +++ b/tensorflow/python/autograph/converters/variables_test.py @@ -51,6 +51,90 @@ class VariablesTest(converter_testing.TestCase): with self.apply_add_one_conversion(test_fn) as result: self.assertEqual(result.test_fn(1), (1 + 1) * 10 + 1) # two reads + def test_del(self): + + def test_fn(l): + del l + return l + + with self.converted(test_fn, variables, {}) as result: + with self.assertRaisesRegex( + NameError, "'l' is used before assignment"): + result.test_fn(1) + + def test_del_getitem_ignored(self): + + def basic_slice(l): + del l[0] + return l + + with self.converted(basic_slice, variables, {}) as result: + self.assertListEqual([2], result.basic_slice([1, 2])) + + def range_slice(l): + del l[0:2] + return l + + with self.converted(range_slice, variables, {}) as result: + self.assertListEqual([], result.range_slice([1, 2])) + + def test_del_getattr_ignored(self): + + def test_fn(l): + del l.a + return l + + class TestClass(object): + + def __init__(self): + self.a = 1 + self.b = 2 + + with self.converted(test_fn, variables, {}) as result: + self.assertFalse(hasattr(result.test_fn(TestClass()), 'a')) + self.assertEqual(result.test_fn(TestClass()).b, 2) + + def test_del_packing_ignored(self): + # Note: test for UnboundLocalError, not NameError because in this case we + # don't rewrite the del. 
+ + def list_(a, b): + del [a, b] + return a + + with self.converted(list_, variables, {}) as result: + with self.assertRaises(UnboundLocalError): + result.list_(1, 2) + + def nested(a, b, c): + del [a, (b, c)] + return c + + with self.converted(nested, variables, {}) as result: + with self.assertRaises(UnboundLocalError): + result.nested(1, 2, 3) + + def test_del_item_multiple_mixed(self): + + def test_fn_failing(a, b, c): + del a, b, c[0] + a = 1 + return a, b, c + + with self.converted(test_fn_failing, variables, {}) as result: + with self.assertRaisesRegex( + NameError, "'b' is used before assignment"): + result.test_fn_failing(1, 2, [1, 2]) + + def test_fn_passing(a, b, c): + del a, b, c[0] + a = 1 + b = 2 + return c + + with self.converted(test_fn_passing, variables, {}) as result: + self.assertListEqual([2], result.test_fn_passing(1, 2, [1, 2])) + def test_attribute(self): class TestClass(object): From b3701aac80622dde6529486ad118008c626eed65 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 24 May 2020 02:02:18 -0700 Subject: [PATCH 1072/1533] Update GraphDef version to 411. PiperOrigin-RevId: 312963337 Change-Id: I9b9db44aa0010e1dea95442a4e5ff0ae88aef128 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index a003c62e2d5..1ccd1d446cd 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 410 // Updated: 2020/5/23 +#define TF_GRAPH_DEF_VERSION 411 // Updated: 2020/5/24 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 1727b70d6ad5a58377588786d704c68bac511db5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 24 May 2020 02:02:21 -0700 Subject: [PATCH 1073/1533] compat: Update forward compatibility horizon to 2020-05-24 PiperOrigin-RevId: 312963339 Change-Id: I8f115adb0b1d206ec0b363db228eb8b2f884ec59 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 09dfe2cc91a..ede137a73bd 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 23)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 24)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None

From 9dcafbffd9ffe0e9ccc41d2a048d5c2e6c1cf87a Mon Sep 17 00:00:00 2001
From: Rishit Dagli <39672672+Rishit-dagli@users.noreply.github.com>
Date: Sun, 24 May 2020 17:01:16 +0530
Subject: [PATCH 1074/1533] Fixed a minor typo in resources section

Fixed Typo in the resources section from Machine Learning with TensorFLow on GCP to Machine Learning with TensorFlow on GCP

Co-authored-by: Kilaru Yasaswi Sri Chandra Gandhi
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d1bc88b8dbc..ea6baec4081 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,7 @@ Build Type | Status
 * [Getting Started with TensorFlow 2 from Coursera](https://www.coursera.org/learn/getting-started-with-tensor-flow2)
 * [Intro to TensorFlow for Deep Learning from Udacity](https://www.udacity.com/course/intro-to-tensorflow-for-deep-learning--ud187)
 * [Introduction to TensorFlow Lite from Udacity](https://www.udacity.com/course/intro-to-tensorflow-lite--ud190)
-* [Machine Learning with TensorFLow on GCP](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp)
+* [Machine Learning with TensorFlow on GCP](https://www.coursera.org/specializations/machine-learning-tensorflow-gcp)
 * [TensorFlow Blog](https://blog.tensorflow.org)
 * [Learn ML with TensorFlow](https://www.tensorflow.org/resources/learn-ml)
 * [TensorFlow Twitter](https://twitter.com/tensorflow)

From a1f496664ed89e7c23072093cdccef739c2f8014 Mon Sep 17 00:00:00 2001
From: Shanqing Cai
Date: Sun, 24 May 2020 12:49:21 -0700
Subject: [PATCH 1075/1533] [tfdbg2] Fix graph-mode path_length_limit and
 stack_height_limit in enable_check_numerics()

Cause of the bug:
- Previously, the helper method get_check_numerics_error_message() was called
  with the proper kwargs only under eager mode. The graph mode code path
  incorrectly omitted the kwargs. This CL fixes that.

The fix is covered by mock-based unit tests. 
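For reference, the limits are supplied through the public API roughly as follows (the values here are only illustrative); with this fix they are honored for ops created inside a tf.function as well as in eager execution:

import tensorflow as tf

tf.debugging.enable_check_numerics(stack_height_limit=30, path_length_limit=50)

@tf.function
def log1p(x):
  return tf.math.log(1. + x)

print(log1p(tf.constant([0.5, 1.0])))
# A call that produces an inf or NaN, e.g. log1p(tf.constant([-2.0])), now
# raises InvalidArgumentError with the reported traceback truncated to the
# stack-height and path-length limits set above.

tf.debugging.disable_check_numerics()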
PiperOrigin-RevId: 312994212 Change-Id: I8800ec85741da6efe8fb8f3115ea7f57a38f0882 --- .../debug/lib/check_numerics_callback.py | 4 +- .../debug/lib/check_numerics_callback_test.py | 38 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/debug/lib/check_numerics_callback.py b/tensorflow/python/debug/lib/check_numerics_callback.py index 440dc758e76..796fabae301 100644 --- a/tensorflow/python/debug/lib/check_numerics_callback.py +++ b/tensorflow/python/debug/lib/check_numerics_callback.py @@ -275,7 +275,9 @@ class CheckNumericsCallback(object): output, inputs, graph=graph, - traceback=output.op.traceback)) + traceback=output.op.traceback, + stack_height_limit=self._stack_height_limit, + path_length_limit=self._path_length_limit)) _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output instrumented_outputs.append(self._get_output_tensor( op_type_bytes, output, checked_output, is_v1_graph_mode)) diff --git a/tensorflow/python/debug/lib/check_numerics_callback_test.py b/tensorflow/python/debug/lib/check_numerics_callback_test.py index 5f578da03c3..5c0cc6394ac 100644 --- a/tensorflow/python/debug/lib/check_numerics_callback_test.py +++ b/tensorflow/python/debug/lib/check_numerics_callback_test.py @@ -39,6 +39,7 @@ from tensorflow.python.ops import math_grad # pylint: disable=unused-import from tensorflow.python.ops import math_ops from tensorflow.python.ops import variables from tensorflow.python.platform import googletest +from tensorflow.python.platform import test class LimitStringLengthTest(test_util.TensorFlowTestCase): @@ -105,6 +106,27 @@ class CheckNumericsCallbackTest(test_util.TensorFlowTestCase): self.assertAllClose(batches[0], np.log([1.25, 2])) self.assertAllClose(batches[1], np.log([3.25, 5])) + @test_util.run_in_graph_and_eager_modes + def testGraphModeUsesCorrectPathLengthAndStackHeightLimits(self): + check_numerics_callback.enable_check_numerics( + stack_height_limit=123, path_length_limit=1200) + + @def_function.function + def add_fn(x, y): + return x + y + + fake_get_check_numerics_error_message = test.mock.MagicMock( + return_value="dummy_message") + with test.mock.patch.object(check_numerics_callback, + "get_check_numerics_error_message", + fake_get_check_numerics_error_message): + x = constant_op.constant(2.0) + y = constant_op.constant(3.0) + self.assertAllClose(self.evaluate(add_fn(x, y)), 5.0) + (_, call_kwargs) = fake_get_check_numerics_error_message.call_args + self.assertEqual(call_kwargs["stack_height_limit"], 123) + self.assertEqual(call_kwargs["path_length_limit"], 1200) + class CheckNumericsCallbackUnhealthyTest(test_util.TensorFlowTestCase): """Test for cases in which enable_check_numerics() catches infs or nans.""" @@ -372,6 +394,22 @@ class CheckNumericsCallbackUnhealthyTest(test_util.TensorFlowTestCase): re.search(r"graph op.*\"Xdivy\"", message))) self.assertTrue(re.search(r"dtype.*float32", message)) + def testEagerModeUsesCorrectPathLengthAndStackHeightLimits(self): + check_numerics_callback.enable_check_numerics( + stack_height_limit=123, path_length_limit=1200) + fake_get_check_numerics_error_message = test.mock.MagicMock( + return_value="dummy_message") + with test.mock.patch.object(check_numerics_callback, + "get_check_numerics_error_message", + fake_get_check_numerics_error_message): + x = constant_op.constant(2.0) + y = constant_op.constant(0.0) + self._assertRaisesInvalidArgumentErrorAndGetMessage( + lambda: x / y) # Expected to generate an inf. 
+ (_, call_kwargs) = fake_get_check_numerics_error_message.call_args + self.assertEqual(call_kwargs["stack_height_limit"], 123) + self.assertEqual(call_kwargs["path_length_limit"], 1200) + @test_util.run_in_graph_and_eager_modes def testExpectedNaNOpOutputs(self): """Test calling operations with benign NaN output.""" From f8e0e915abb78f4a9c0293072a2edf530f10bed1 Mon Sep 17 00:00:00 2001 From: Will Battel Date: Sun, 24 May 2020 18:55:23 -0500 Subject: [PATCH 1076/1533] Fix typo in Core ML Delegate docs --- tensorflow/lite/g3doc/performance/coreml_delegate.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/performance/coreml_delegate.md b/tensorflow/lite/g3doc/performance/coreml_delegate.md index c267347cf3f..c3d72b2e01f 100644 --- a/tensorflow/lite/g3doc/performance/coreml_delegate.md +++ b/tensorflow/lite/g3doc/performance/coreml_delegate.md @@ -160,7 +160,7 @@ devices using other libraries such as ### Using older Core ML version -Although iOS 13 supprots Core ML 3, the model might work better when it is +Although iOS 13 supports Core ML 3, the model might work better when it is converted with Core ML 2 model specification. The target conversion version is set to the latest version by default, but you can change this by setting `coreMLVersion` (in Swift, `coreml_version` in C API) in the delegate option to From 2cc80a74f239817971b1e8669fa2d597ffde2cff Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Sun, 24 May 2020 21:49:20 -0700 Subject: [PATCH 1077/1533] Add missing kernels for flex delegate whitelisted ops BroadcastTo, Ceil, FusedPadConv2D and FusedResizeAndPadConv2D ops are whitelisted but their kernels are missing from the library. PiperOrigin-RevId: 313025060 Change-Id: I82359019e52fcba546454cf771376e8429c6ffe7 --- tensorflow/core/kernels/BUILD | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 492cf0b9fd6..20df4202371 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -6614,6 +6614,7 @@ filegroup( "avgpooling_op.h", "batch_matmul_op_impl.h", "batch_norm_op.h", + "broadcast_to_op.h", "control_flow_ops.h", "conv_2d.h", "conv_3d.h", @@ -6703,6 +6704,7 @@ filegroup( "conv_ops_fused_float.cc", "conv_ops_fused_half.cc", "conv_ops_fused_impl.h", + "conv_ops_fused_image_transform.cc", "conv_ops_using_gemm.cc", "crop_and_resize_op.cc", "crop_and_resize_op.h", @@ -6712,6 +6714,7 @@ filegroup( "cwise_op_bitwise_and.cc", "cwise_op_bitwise_or.cc", "cwise_op_bitwise_xor.cc", + "cwise_op_ceil.cc", "cwise_op_conj.cc", "cwise_op_cos.cc", "cwise_op_cosh.cc", @@ -6804,6 +6807,7 @@ filegroup( name = "android_extended_ops_group2", srcs = [ "batchtospace_op.cc", + "broadcast_to_op.cc", "ctc_decoder_ops.cc", "decode_bmp_op.cc", "depthtospace_op.cc", From a814cfb7c0e1d2aac129634965fd8b45a8808760 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 25 May 2020 02:02:29 -0700 Subject: [PATCH 1078/1533] Update GraphDef version to 412. PiperOrigin-RevId: 313044598 Change-Id: I9c856d49e89c62c3c79602de813a52c3412109d8 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 1ccd1d446cd..8cdf617144d 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. 
#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 411 // Updated: 2020/5/24 +#define TF_GRAPH_DEF_VERSION 412 // Updated: 2020/5/25 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 41f6863695f0887310d8a43c22d73c4e95a2d7f7 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 25 May 2020 02:02:34 -0700 Subject: [PATCH 1079/1533] compat: Update forward compatibility horizon to 2020-05-25 PiperOrigin-RevId: 313044618 Change-Id: Id140d3407f3aca2380a0f32ea47d3567bdb53a9e --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index ede137a73bd..c8c481c2b76 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 24) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 25) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From bdef91bcfff2ff27e6745262f867624e237d8c96 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 25 May 2020 02:46:49 -0700 Subject: [PATCH 1080/1533] Remove dependence from service/gpu:multi_output_fusion to service:multi_output_fusion PiperOrigin-RevId: 313048943 Change-Id: I570c5300a3a1f3ef55329d6bf13b5f679a364886 --- tensorflow/compiler/xla/service/gpu/BUILD | 15 +++++++++------ .../xla/service/gpu/multi_output_fusion.h | 10 +++++++++- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 0f6b2cb72e6..958100ecc03 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -17,15 +17,15 @@ load( "tf_cuda_library", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm", + "if_rocm_is_configured", +) load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) -load( - "@local_config_rocm//rocm:build_defs.bzl", - "if_rocm_is_configured", -) load("//tensorflow:tensorflow.bzl", "if_nccl") package( @@ -901,12 +901,15 @@ cc_library( ":ir_emission_utils", "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/compiler/xla/service:hlo_reachability", - "//tensorflow/compiler/xla/service:multi_output_fusion", "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h index 8d2ef53bfa9..e60f3bc3c14 100644 --- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h +++ 
b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h @@ -16,7 +16,15 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_ -#include "tensorflow/compiler/xla/service/multi_output_fusion.h" +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_reachability.h" +#include "tensorflow/compiler/xla/statusor.h" namespace xla { namespace gpu { From bb4c751414c3562ab3ab4298f866f47438078c37 Mon Sep 17 00:00:00 2001 From: Marcel Hlopko Date: Mon, 25 May 2020 12:15:22 +0200 Subject: [PATCH 1081/1533] Move -no-as-needed to the top of the linking command line `-no-as-needed` linker flag is position sensitive (it's only effecting following -l flags), therefore we need to move it before libraries to link. This change uncovered that nccl doesn't properly declare it's dependency on `-lrt`, which is fixed. I suspect this started to be a problem in https://github.com/tensorflow/tensorflow/commit/f819114a2d9d393a60e954d3a3e42d8700ff3b19. This change also uncovered that some tests don't need to depend on nccl. While `-no-as-needed` wasn't taking effect, nccl was just left out as not needed. --- .../crosstool/cc_toolchain_config.bzl.tpl | 30 ++++++++----------- third_party/nccl/archive.BUILD | 1 + 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl index 4acc05ff88c..a336673a307 100644 --- a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl +++ b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl @@ -293,7 +293,7 @@ def _cuda_set(cuda_path, actions): return [] def _nologo(): - return flag_group(flags = ["/nologo"]) + return flag_group(flags = ["/nologo"]) def _features(cpu, compiler, ctx): if cpu in ["local", "darwin"]: @@ -497,6 +497,11 @@ def _features(cpu, compiler, ctx): flag_set( actions = all_link_actions(), flag_groups = [ + flag_group(flags = ( + ["-Wl,-no-as-needed"] if cpu == "local" else [] + ) + [ + "-B" + ctx.attr.linker_bin_path, + ]), flag_group( flags = ["@%{linker_param_file}"], expand_if_available = "linker_param_file", @@ -551,27 +556,17 @@ def _features(cpu, compiler, ctx): "-Wl,-z,relro,-z,now", ])], ), - ] if cpu == "local" else []) + [ - flag_set( - actions = all_link_actions(), - flag_groups = [flag_group(flags = ["-Wl,-no-as-needed"])], - with_features = [with_feature_set(features = ["alwayslink"])], - ), + ] if cpu == "local" else []) + ([ flag_set( actions = all_link_actions(), flag_groups = [ - flag_group(flags = ["-B" + ctx.attr.linker_bin_path]), + flag_group(flags = ["-Wl,--gc-sections"]), + flag_group( + flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], + ), ], ), - ] + ([flag_set( - actions = all_link_actions(), - flag_groups = [ - flag_group(flags = ["-Wl,--gc-sections"]), - flag_group( - flags = ["-Wl,--build-id=md5", "-Wl,--hash-style=gnu"], - ), - ], - )] if cpu == "local" else []) + ([ + ] if cpu == "local" else []) + ([ flag_set( actions = all_link_actions(), flag_groups = [flag_group(flags = ["-undefined", "dynamic_lookup"])], @@ -588,7 +583,6 @@ def _features(cpu, compiler, ctx): ), ], ), - feature(name = "alwayslink", enabled = cpu == "local"), feature(name = "opt"), feature(name = "fastbuild"), 
feature(name = "dbg"), diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD index 4936844b6b2..65c95a2a502 100644 --- a/third_party/nccl/archive.BUILD +++ b/third_party/nccl/archive.BUILD @@ -90,6 +90,7 @@ cc_library( include_prefix = "third_party/nccl", strip_include_prefix = "src", visibility = ["//visibility:public"], + linkopts = ["-lrt"], deps = [ ":device", ":include_hdrs", From b583e81bd4fa50fff84d73559b3d4855cee6bf21 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 25 May 2020 05:07:13 -0700 Subject: [PATCH 1082/1533] [XLA] algsimplify: Cache scalar add computations per type Otherwise we'd generate invalid HLO if there's a dot of different types being strength reduced in one run of algsimplify. PiperOrigin-RevId: 313060898 Change-Id: I6e0c3332654f4bfad7590297b66f839c3538115b --- .../xla/service/algebraic_simplifier.cc | 13 +++++++------ .../xla/service/algebraic_simplifier_test.cc | 18 ++++++++++++++++++ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 440e04c9205..e0a8b87c83b 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -472,8 +472,9 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { HloInstruction* dot); HloComputation* GetOrCreateScalarAddComputation(PrimitiveType type) { - if (scalar_add_computation_) { - return scalar_add_computation_; + HloComputation*& scalar_add_computation = scalar_add_computations_[type]; + if (scalar_add_computation) { + return scalar_add_computation; } HloComputation::Builder b("scalar_add_computation"); @@ -485,9 +486,9 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { HloInstruction::CreateParameter(1, shape, "scalar_rhs")); auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary( shape, HloOpcode::kAdd, scalar_lhs, scalar_rhs)); - scalar_add_computation_ = + scalar_add_computation = computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); - return scalar_add_computation_; + return scalar_add_computation; } // Tries to fold a kPad in the input or filter into the convolution @@ -528,8 +529,8 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { // Whether algebraic simplification has occurred. bool changed_ = false; - // Cached computation for adding two scalar F32. - HloComputation* scalar_add_computation_ = nullptr; + // Cached computation for adding two scalars of a given type. 
+ absl::flat_hash_map scalar_add_computations_; AlgebraicSimplifier* simplifier_ = nullptr; }; diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 9f823c76d80..3ac47821654 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -6520,5 +6520,23 @@ TEST_F(AlgebraicSimplifierTest, ScalarDividePredicate) { m::Broadcast(m::Divide(m::ConstantScalar(1), m::Parameter(1)))))); } +TEST_F(AlgebraicSimplifierTest, MultipleDotStrengthReductions) { + constexpr char kModuleStr[] = R"( + HloModule test + ENTRY test { + a = c64[2,2] parameter(0) + b = c64[2] parameter(1) + cd = c64[2] dot(a, b), lhs_contracting_dims={1}, rhs_contracting_dims={0} + c = f64[2,2] parameter(2) + d = f64[2] parameter(3) + dd = f64[2] dot(c, d), lhs_contracting_dims={1}, rhs_contracting_dims={0} + ROOT tuple = (c64[2], f64[2]) tuple(cd, dd) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_EQ(3, m->computation_count()); +} + } // namespace } // namespace xla From dfda5bc01e10f744680a4bffa93cc0e2fc49c6b5 Mon Sep 17 00:00:00 2001 From: Khanh LeViet Date: Mon, 25 May 2020 05:51:03 -0700 Subject: [PATCH 1083/1533] Fix TF Lite text classification tutorial's incorrect indent. PiperOrigin-RevId: 313064347 Change-Id: I563ddec306e69c6b775cae5306ed32c293282f83 --- .../g3doc/tutorials/model_maker_text_classification.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/g3doc/tutorials/model_maker_text_classification.ipynb b/tensorflow/lite/g3doc/tutorials/model_maker_text_classification.ipynb index 8261d6c9e34..e10507ccac7 100644 --- a/tensorflow/lite/g3doc/tutorials/model_maker_text_classification.ipynb +++ b/tensorflow/lite/g3doc/tutorials/model_maker_text_classification.ipynb @@ -632,7 +632,7 @@ "id": "EoWiA_zX8rxE" }, "source": [ - "# Advanced Usage\n", + "## Advanced Usage\n", "\n", "The `create` function is the critical part of this library in which parameter `model_spec` defines the specification of the model, currently `AverageWordVecModelSpec` and `BertModelSpec` is supported. The `create` function contains the following steps for `AverageWordVecModelSpec`:\n", "\n", @@ -651,7 +651,7 @@ "id": "mwtiksguDfhl" }, "source": [ - "# Adjust the model\n", + "## Adjust the model\n", "\n", "We could adjust the model infrastructure like variables `wordvec_dim`, `seq_len` in `AverageWordVecModelSpec` class.\n" ] @@ -736,7 +736,7 @@ "id": "LvQuy7RSDir3" }, "source": [ - "## Change the training hyperparameters\n", + "### Change the training hyperparameters\n", "We could also change the training hyperparameters like `epochs` and `batch_size` that could affect the model accuracy. For instance,\n", "\n", "* `epochs`: more epochs could achieve better accuracy, but may lead to overfitting.\n", @@ -788,7 +788,7 @@ "id": "Eq6B9lKMfhS6" }, "source": [ - "## Change the Model\n", + "### Change the Model\n", "\n", "We could change the model by changing the `model_spec`. The following shows how we change to BERT-base model.\n", "\n", From 0a63948f00030b090d08e12bd496c94c626794b4 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Mon, 25 May 2020 10:04:12 -0700 Subject: [PATCH 1084/1533] Reduce Functional.call internal per-Layer overhead by 95% when single Tensor passed. 
Node now has a performance optimization for the common case where a single Tensor is passed to a Layer during the Functional API construction phase. This means there is almost no overhead imposed by Functional.call in this case, and the remaining per-layer overhead is all from Layer.__call__. For a Model like ResNet, this optimization will be active for all Layers except for the Add() layers. PiperOrigin-RevId: 313085525 Change-Id: I0b9821e954d17f1f101617449fd5f0851d6ba9c3 --- tensorflow/python/keras/engine/node.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/engine/node.py b/tensorflow/python/keras/engine/node.py index a9e0b621d75..708904853b2 100644 --- a/tensorflow/python/keras/engine/node.py +++ b/tensorflow/python/keras/engine/node.py @@ -24,6 +24,7 @@ import json import numpy as np from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util from tensorflow.python.keras import backend from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.utils import tf_utils @@ -73,6 +74,9 @@ class Node(object): # Cached for performance. self._flat_arguments = nest.flatten((self.call_args, self.call_kwargs)) + # Used to avoid expensive `nest` operations in the most common case. + self._single_positional_tensor_passed = (not self.call_kwargs and len( + self.call_args) == 1 and tensor_util.is_tensor(self.call_args[0])) # Create TensorFlowOpLayers if needed. for obj in self._flat_arguments: @@ -137,13 +141,18 @@ class Node(object): def map_arguments(self, tensor_dict): """Maps Keras Tensors to computed Tensors using `tensor_dict`.""" - flat_arguments = copy.copy(self._flat_arguments) - for kt_id, kt_index in self._keras_inputs_ids_and_indices: - flat_arguments[kt_index] = tensor_dict[kt_id].pop() + if self._single_positional_tensor_passed: + # Performance optimization for most common case. + kt_id, _ = self._keras_inputs_ids_and_indices[0] + return (tensor_dict[kt_id].pop(),), {} + else: + flat_arguments = copy.copy(self._flat_arguments) + for kt_id, kt_index in self._keras_inputs_ids_and_indices: + flat_arguments[kt_index] = tensor_dict[kt_id].pop() - args, kwargs = nest.pack_sequence_as( - (self.call_args, self.call_kwargs), flat_arguments) - return args, kwargs + args, kwargs = nest.pack_sequence_as((self.call_args, self.call_kwargs), + flat_arguments) + return args, kwargs def serialize(self, make_node_key, node_conversion_map): """Serializes `Node` for Functional API's `get_config`.""" From f0609b8f272c066f548baf2359d706850e5650d9 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Mon, 25 May 2020 10:04:50 -0700 Subject: [PATCH 1085/1533] Reduce Layer.__call__ overhead by ~5%. Uses ops.name_scope_v2 directly when in eager mode, since it does not increment in eager mode. Also when clearing losses, only checks for is_in_tf_function when build_graph=True. When build_graph=False in the new Layer class, we know that we are in eager mode. 
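A rough way to observe the overhead these changes shave off is to time repeated eager calls of a small, already-built layer. The snippet below is only an illustrative micro-benchmark; absolute numbers depend on hardware and the TensorFlow build.

import timeit

import tensorflow as tf

layer = tf.keras.layers.Dense(4)
x = tf.ones((1, 4))
layer(x)  # build once, so the timing loop measures only __call__ overhead

n = 1000
seconds = timeit.timeit(lambda: layer(x), number=n)
print("~%.1f us per eager Layer.__call__" % (seconds / n * 1e6))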
PiperOrigin-RevId: 313085589 Change-Id: I081ab1c592137445f918403c1293ae6a05758b38 --- tensorflow/python/keras/engine/base_layer.py | 25 +++++++++++++------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 9dd05e53df7..594bf656cfd 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -822,6 +822,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): inputs, args, kwargs = self._split_out_first_arg(args, kwargs) call_context = base_layer_utils.call_context() + in_call = call_context.in_call input_list = nest.flatten(inputs) # We will attempt to build a TF graph if & only if all inputs are symbolic. @@ -896,16 +897,15 @@ class Layer(module.Module, version_utils.LayerVersionSelector): if build_graph and base_layer_utils.needs_keras_history(inputs): base_layer_utils.create_keras_history(inputs) - # Clear eager losses on top level model call. - # We are clearing the losses only on the top level model call and not on - # every layer/model call because layer/model may be reused. - if (base_layer_utils.is_in_eager_or_tf_function() and - not call_context.in_call): - self._clear_losses() - with call_context.enter(self, inputs, build_graph, training_value): # Check input assumptions set after layer building, e.g. input shape. if build_graph: + # Losses are cleared for all Layers when the outermost layer is called. + # Losses are not cleared each time an inner layer is called, bc inner + # Layers can be reused in a Model. + if not in_call and base_layer_utils.is_in_tf_function(): + self._clear_losses() + # Symbolic execution on symbolic tensors. We will attempt to build # the corresponding TF subgraph inside `backend.get_graph()` # TODO(reedwm): We should assert input compatibility after the inputs @@ -913,6 +913,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): input_spec.assert_input_compatibility(self.input_spec, inputs, self.name) graph = backend.get_graph() + # Use `self._name_scope()` to avoid auto-incrementing the name. with graph.as_default(), backend.name_scope(self._name_scope()): # Build layer if applicable (if the `build` method has been # overridden). @@ -985,7 +986,15 @@ class Layer(module.Module, version_utils.LayerVersionSelector): self._set_inputs(cast_inputs, outputs) else: # Eager execution on data tensors. - with backend.name_scope(self._name_scope()): + + # Losses are cleared for all Layers when the outermost layer is called. + # Losses are not cleared each time an inner layer is called, bc inner + # Layers can be reused in a Model. + if not in_call: + self._clear_losses() + + # In Eager mode, `ops.name_scope_v2` does not autoincrement the name. 
+ with ops.name_scope_v2(self.name): self._maybe_build(inputs) cast_inputs = self._maybe_cast_inputs(inputs, input_list) with base_layer_utils.autocast_context_manager( From 86db9c0581616666df559ce10afa13d40fca7693 Mon Sep 17 00:00:00 2001 From: Abhineet Choudhary Date: Mon, 25 May 2020 22:44:01 +0530 Subject: [PATCH 1086/1533] execute eagerly --- tensorflow/python/eager/forwardprop_test.py | 33 +++++++++++---------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 2533db8c232..337bf46fbad 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -91,25 +91,26 @@ def _jacfwd(f, primals): def _jvp_batch(f, primal, tangents): + tf_function = def_function.function(f) return control_flow_ops.vectorized_map( - functools.partial(_jvp, f, primal), + functools.partial(_jvp, tf_function, primal), tangents - ) + ) def _jvp_batch_matmul(f, primals, tangent_batch): - """Compute the jacobian of `f` at `primals` multiplied by `tangents`.""" - jac_fwd = _jacfwd(f, primals) - def jac_mul(tangent): - flat_tangent = array_ops.reshape(tangent, shape=[-1]) - tangent_vector = array_ops.expand_dims(flat_tangent, 1) - jvp_vector = math_ops.matmul(jac_fwd, tangent_vector) - return array_ops.reshape(jvp_vector, tangent.shape) - - return control_flow_ops.vectorized_map( - jac_mul, - tangent_batch) + """Compute the jacobian of `f` at `primals` multiplied by `tangents`.""" + jac_fwd = _jacfwd(f, primals) + def jac_mul(tangent): + flat_tangent = array_ops.reshape(tangent, shape=[-1]) + tangent_vector = array_ops.expand_dims(flat_tangent, 1) + jvp_vector = math_ops.matmul(jac_fwd, tangent_vector) + return array_ops.reshape(jvp_vector, tangent.shape) + return control_flow_ops.vectorized_map( + jac_mul, + tangent_batch + ) def _grad(f, argnums=0): @@ -962,10 +963,10 @@ class HessianTests(test.TestCase, parameterized.TestCase): class JacobianTests(test.TestCase, parameterized.TestCase): - + @parameterized.parameters([ - (math_ops.sin, (2, 3), 5), - (math_ops.sin, (2, 3, 4), 10), + (math_ops.sin, (2, 3), 5), + (math_ops.sin, (2, 3, 4), 10) ]) def testJVPBatchCorrectness(self, f, primal_shape, batch_size): primals = [random_ops.random_uniform(primal_shape)] From 02a1d07063bb2836b8b0016a0e22604b5d86933e Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 25 May 2020 13:07:23 -0700 Subject: [PATCH 1087/1533] Fix code relying on implicit bool conversion of mlir::Value https://github.com/llvm/llvm-project/commit/a9b5edc5e2c4ec9d506b2c30465ee9f2dc21e5cc breaks this. PiperOrigin-RevId: 313098804 Change-Id: I047283df456989f398823a322931228beab279a2 --- .../tensorflow/transforms/tpu_variable_runtime_reformatting.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index 9e8745918e3..ec4a25c6fdd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -229,7 +229,7 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( mapping.emplace_back(it->second, std::move(while_args)); } // Sort the mapping according to execute operand order. 
- llvm::sort(mapping); + llvm::sort(mapping, llvm::less_first()); // Populate the `retval_index_for_sharding` field of the argument metadate. for (auto entry : llvm::enumerate(execute.device_var_reads_indices())) { int64_t arg_index = entry.value().cast().getInt(); From ff893b4b5f98a05b311d859538fa2bbbc054dab2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 25 May 2020 13:31:43 -0700 Subject: [PATCH 1088/1533] Fix code relying on implicit bool conversion of mlir::Value https://github.com/llvm/llvm-project/commit/a9b5edc5e2c4ec9d506b2c30465ee9f2dc21e5cc breaks this. PiperOrigin-RevId: 313100179 Change-Id: Ic92a577c11387b96c955c4b27444b245a27f8098 --- .../tensorflow/transforms/tpu_variable_runtime_reformatting.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index ec4a25c6fdd..9e8745918e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -229,7 +229,7 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( mapping.emplace_back(it->second, std::move(while_args)); } // Sort the mapping according to execute operand order. - llvm::sort(mapping, llvm::less_first()); + llvm::sort(mapping); // Populate the `retval_index_for_sharding` field of the argument metadate. for (auto entry : llvm::enumerate(execute.device_var_reads_indices())) { int64_t arg_index = entry.value().cast().getInt(); From a7ed5a542ec51c02648e3db5e6ba0c120671225a Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 25 May 2020 13:39:46 -0700 Subject: [PATCH 1089/1533] Add a show_fusion_subcomputations command to interactive_graphviz Hiding fusion subcomputations is useful when we want to only investigate the connectivity of the computation that contains the fusion instructions. 
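
A minimal usage sketch of the new API (not part of the diff below; the `computation` and `debug_options` variables are illustrative assumptions, and `kUrl` output additionally requires a registered URL renderer): callers that previously passed a bare `show_backend_config` bool now fill in an `HloRenderOptions` struct.

```c++
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"

// Hypothetical call site: hide fusion subcomputations while rendering,
// keeping backend configs visible.
xla::HloRenderOptions options;
options.show_backend_config = true;           // previously a standalone bool
options.show_fusion_subcomputations = false;  // new field added by this change
auto rendered = xla::RenderGraph(
    *computation, /*label=*/"", debug_options,
    xla::RenderedGraphFormat::kUrl,
    /*hlo_execution_profile=*/nullptr, options);
```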
PiperOrigin-RevId: 313100588 Change-Id: I6b28eef0852baaa6e74bf8c96597d4e69300e1dc --- .../compiler/xla/service/hlo_graph_dumper.cc | 25 ++++++----- .../compiler/xla/service/hlo_graph_dumper.h | 14 ++++-- .../xla/tools/interactive_graphviz.cc | 44 +++++++++++++------ 3 files changed, 56 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 3930898d665..ad21efa13c9 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -312,12 +312,13 @@ optional MatchTrivialComputation(const HloComputation* computation) { class HloDotDumper { public: HloDotDumper(const HloComputation* computation, absl::string_view label, - const DebugOptions& debug_options, bool show_backend_config, + const DebugOptions& debug_options, + HloRenderOptions hlo_render_options, const HloExecutionProfile* profile, NodeFilter filter) : computation_(computation), label_(label), debug_options_(debug_options), - show_backend_config_(show_backend_config), + hlo_render_options_(hlo_render_options), profile_(profile), filter_(std::move(filter)) {} @@ -384,7 +385,7 @@ class HloDotDumper { const HloComputation* computation_; // never null const string label_; // overall name for the graph const DebugOptions& debug_options_; - const bool show_backend_config_; + const HloRenderOptions hlo_render_options_; const HloExecutionProfile* profile_; // may be null const NodeFilter filter_; @@ -565,7 +566,8 @@ bool HloDotDumper::ShouldShowFusionSubcomputation(const HloInstruction* instr) { bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) { if (subcomp->IsFusionComputation()) { const HloInstruction* fusion = subcomp->FusionInstruction(); - if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion)) { + if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion) || + !hlo_render_options_.show_fusion_subcomputations) { return false; } } @@ -1133,7 +1135,8 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { string HloDotDumper::GetInstructionNodeBackendConfig( const HloInstruction* instr) { - if (!show_backend_config_ || instr->raw_backend_config_string().empty()) { + if (!hlo_render_options_.show_backend_config || + instr->raw_backend_config_string().empty()) { return ""; } @@ -1604,14 +1607,14 @@ StatusOr RenderGraph(const HloComputation& computation, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile, - bool show_backend_config) { + HloRenderOptions hlo_render_options) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return Unavailable("Can't render as URL; no URL renderer was registered."); } string rendered_dot = - HloDotDumper(&computation, label, debug_options, show_backend_config, + HloDotDumper(&computation, label, debug_options, hlo_render_options, hlo_execution_profile, NodeFilter()) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1619,7 +1622,7 @@ StatusOr RenderGraph(const HloComputation& computation, StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - bool show_backend_config, + HloRenderOptions hlo_render_options, const absl::flat_hash_set& boundary) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { @@ -1632,7 +1635,7 @@ StatusOr 
RenderNeighborhoodAround( string rendered_dot = HloDotDumper(node.parent(), label, node.GetModule()->config().debug_options(), - show_backend_config, /*profile=*/nullptr, + hlo_render_options, /*profile=*/nullptr, MakeNodeRadiusAroundFilter(&node, radius, boundary)) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1641,7 +1644,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - bool show_backend_config) { + HloRenderOptions hlo_render_options) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return FailedPrecondition( @@ -1663,7 +1666,7 @@ StatusOr RenderAllPathsFromTo(const HloInstruction& from, "NODES***

"); } string rendered_dot = - HloDotDumper(from.parent(), label, debug_options, show_backend_config, + HloDotDumper(from.parent(), label, debug_options, hlo_render_options, /*profile=*/nullptr, filter) .Dump(); return WrapDotInFormat(rendered_dot, format); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h index 324ac67a6dd..528de77e4e6 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h @@ -50,6 +50,14 @@ enum class RenderedGraphFormat { kUrl, }; +struct HloRenderOptions { + // Include the backend config string in the rendered graph. + bool show_backend_config = false; + + // Include the fusion subcomputations in the rendered graph. + bool show_fusion_subcomputations = true; +}; + // Renders an HLO module as a human-readable visual graph. // // Note that this only works well for relatively small graphs (no more than a @@ -61,7 +69,7 @@ StatusOr RenderGraph( const HloComputation& computation, absl::string_view label, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile = nullptr, - bool show_backend_config = false); + HloRenderOptions hlo_render_options = {}); // Like RenderGraph, but renders only nodes "near" the given node in the graph. // @@ -73,7 +81,7 @@ StatusOr RenderGraph( // will be omitted even if they are within the radius. StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - bool show_backend_config = false, + HloRenderOptions hlo_render_options = {}, const absl::flat_hash_set& boundary = {}); // Renders nodes on any of the paths from `from` to `to`. If there are more @@ -82,7 +90,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - bool show_backend_config = false); + HloRenderOptions hlo_render_options = {}); // Registers a function which implements RenderedGraphFormat::kUrl. // diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc index 4f8a6b43314..b6c62beff74 100644 --- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc +++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc @@ -112,8 +112,7 @@ constexpr int64 kDefaultMaxNumNodesInAllPaths = 100; using absl::EqualsIgnoreCase; -// A global control for whether backend configuration display is enabled. -bool show_backend_config = true; +HloRenderOptions hlo_render_options; HloInstruction* FindInstruction(const HloModule& module, string node_name) { if (absl::StartsWith(node_name, "%")) { @@ -160,6 +159,8 @@ void DoHelpCommand() { Renders all nodes in . backend_config [on|off] Controls whether backend operation configuration information is printed. + show_fusion_subcomputations [on|off] + Controls whether fusion subcomputations are shown. list [name|op_name|op_type] Lists all instructions whose name, metadata op_name, or metadata op_type contains as a substring. @@ -182,15 +183,32 @@ void DoHelpCommand() { // Turn metadata-printing on or off. 
void DoBackendConfigCommand(const std::vector& tokens) { if (tokens.size() == 2 && tokens[1] == "on") { - show_backend_config = true; + hlo_render_options.show_backend_config = true; } else if (tokens.size() == 2 && tokens[1] == "off") { - show_backend_config = false; + hlo_render_options.show_backend_config = false; } else if (tokens.size() != 1) { std::cerr << "(Illegal backend_config value. Use either 'on' or 'off'.)" << std::endl; } std::cout << "Backend configuration display " - << (show_backend_config ? "ON" : "OFF") << std::endl; + << (hlo_render_options.show_backend_config ? "ON" : "OFF") + << std::endl; +} + +// Turn fusion computation display on or off. +void DoShowFusionSubcomputationsCommand(const std::vector& tokens) { + if (tokens.size() == 2 && tokens[1] == "on") { + hlo_render_options.show_fusion_subcomputations = true; + } else if (tokens.size() == 2 && tokens[1] == "off") { + hlo_render_options.show_fusion_subcomputations = false; + } else if (tokens.size() != 1) { + std::cerr << "(Illegal show_fusion_subcomputations value. Use either " + "'on' or 'off'.)" + << std::endl; + } + std::cout << "Fusion subcomputations display " + << (hlo_render_options.show_fusion_subcomputations ? "ON" : "OFF") + << std::endl; } // List all computations in the module. @@ -373,7 +391,7 @@ void DoExtractCommand(const HloModule& module, auto extracted_module = ExtractModule(instr, height); std::cout << extracted_module->ToString( HloPrintOptions::ShortParsable().set_print_backend_config( - show_backend_config)) + hlo_render_options.show_backend_config)) << std::endl; } @@ -517,7 +535,7 @@ void DoAllPathsCommand(const Options& opts, const HloModule& module, } RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderAllPathsFromTo(*from, *to, max_nodes, format, - /*show_backend_config=*/show_backend_config); + hlo_render_options); }); } @@ -582,15 +600,13 @@ void DoPlotCommand(const Options& opts, const HloModule& module, RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderGraph(*comp, /*label=*/"", comp->parent()->config().debug_options(), format, - /*hlo_execution_profile=*/nullptr, - /*show_backend_config=*/show_backend_config); + /*hlo_execution_profile=*/nullptr, hlo_render_options); }); } else { RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { - return RenderNeighborhoodAround( - *instr, graph_width, format, - /*show_backend_config=*/show_backend_config, - /*boundary=*/boundary); + return RenderNeighborhoodAround(*instr, graph_width, format, + hlo_render_options, + /*boundary=*/boundary); }); } } @@ -617,6 +633,8 @@ void InteractiveDumpGraphs(const Options& opts, const HloModule& module) { DoHelpCommand(); } else if (tokens[0] == "backend_config") { DoBackendConfigCommand(tokens); + } else if (tokens[0] == "show_fusion_subcomputations") { + DoShowFusionSubcomputationsCommand(tokens); } else if (tokens[0] == "list") { if (tokens.size() > 1 && tokens[1] == "computations") { DoListComputationsCommand(module, tokens); From 02177117cb88d6993671710f22cddd77abcc257a Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Mon, 25 May 2020 16:47:59 -0400 Subject: [PATCH 1090/1533] Code review changes Signed-off-by: Gaurav Singh --- .../lite/experimental/delegates/coreml/builders/op_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc index 4cdfd519daf..46634d6970a 100644 --- 
a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc @@ -95,7 +95,7 @@ CoreML::Specification::Model* GraphBuilder::BuildModel() { CoreML::Specification::EXACT_ARRAY_MAPPING); } else { fprintf(stderr, "Unsupported Core ML version: %d\n", coreml_version_); - delete(model); + delete model; return nullptr; } auto* neural_network = model->mutable_neuralnetwork(); From 55c1176fe232b607163352d2a2e6a2f0e4aa284c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 25 May 2020 13:53:10 -0700 Subject: [PATCH 1091/1533] Add a show_fusion_subcomputations command to interactive_graphviz Hiding fusion subcomputations is useful when we want to only investigate the connectivity of the computation that contains the fusion instructions. PiperOrigin-RevId: 313101238 Change-Id: I25e9cfb5857d0cc90e07f45cfa1617fc6d378558 --- .../compiler/xla/service/hlo_graph_dumper.cc | 25 +++++------ .../compiler/xla/service/hlo_graph_dumper.h | 14 ++---- .../xla/tools/interactive_graphviz.cc | 44 ++++++------------- 3 files changed, 27 insertions(+), 56 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index ad21efa13c9..3930898d665 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -312,13 +312,12 @@ optional MatchTrivialComputation(const HloComputation* computation) { class HloDotDumper { public: HloDotDumper(const HloComputation* computation, absl::string_view label, - const DebugOptions& debug_options, - HloRenderOptions hlo_render_options, + const DebugOptions& debug_options, bool show_backend_config, const HloExecutionProfile* profile, NodeFilter filter) : computation_(computation), label_(label), debug_options_(debug_options), - hlo_render_options_(hlo_render_options), + show_backend_config_(show_backend_config), profile_(profile), filter_(std::move(filter)) {} @@ -385,7 +384,7 @@ class HloDotDumper { const HloComputation* computation_; // never null const string label_; // overall name for the graph const DebugOptions& debug_options_; - const HloRenderOptions hlo_render_options_; + const bool show_backend_config_; const HloExecutionProfile* profile_; // may be null const NodeFilter filter_; @@ -566,8 +565,7 @@ bool HloDotDumper::ShouldShowFusionSubcomputation(const HloInstruction* instr) { bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) { if (subcomp->IsFusionComputation()) { const HloInstruction* fusion = subcomp->FusionInstruction(); - if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion) || - !hlo_render_options_.show_fusion_subcomputations) { + if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion)) { return false; } } @@ -1135,8 +1133,7 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { string HloDotDumper::GetInstructionNodeBackendConfig( const HloInstruction* instr) { - if (!hlo_render_options_.show_backend_config || - instr->raw_backend_config_string().empty()) { + if (!show_backend_config_ || instr->raw_backend_config_string().empty()) { return ""; } @@ -1607,14 +1604,14 @@ StatusOr RenderGraph(const HloComputation& computation, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile, - HloRenderOptions hlo_render_options) { + bool show_backend_config) { tensorflow::mutex_lock lock(url_renderer_mu); if 
(format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return Unavailable("Can't render as URL; no URL renderer was registered."); } string rendered_dot = - HloDotDumper(&computation, label, debug_options, hlo_render_options, + HloDotDumper(&computation, label, debug_options, show_backend_config, hlo_execution_profile, NodeFilter()) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1622,7 +1619,7 @@ StatusOr RenderGraph(const HloComputation& computation, StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - HloRenderOptions hlo_render_options, + bool show_backend_config, const absl::flat_hash_set& boundary) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { @@ -1635,7 +1632,7 @@ StatusOr RenderNeighborhoodAround( string rendered_dot = HloDotDumper(node.parent(), label, node.GetModule()->config().debug_options(), - hlo_render_options, /*profile=*/nullptr, + show_backend_config, /*profile=*/nullptr, MakeNodeRadiusAroundFilter(&node, radius, boundary)) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1644,7 +1641,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - HloRenderOptions hlo_render_options) { + bool show_backend_config) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return FailedPrecondition( @@ -1666,7 +1663,7 @@ StatusOr RenderAllPathsFromTo(const HloInstruction& from, "NODES***

"); } string rendered_dot = - HloDotDumper(from.parent(), label, debug_options, hlo_render_options, + HloDotDumper(from.parent(), label, debug_options, show_backend_config, /*profile=*/nullptr, filter) .Dump(); return WrapDotInFormat(rendered_dot, format); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h index 528de77e4e6..324ac67a6dd 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h @@ -50,14 +50,6 @@ enum class RenderedGraphFormat { kUrl, }; -struct HloRenderOptions { - // Include the backend config string in the rendered graph. - bool show_backend_config = false; - - // Include the fusion subcomputations in the rendered graph. - bool show_fusion_subcomputations = true; -}; - // Renders an HLO module as a human-readable visual graph. // // Note that this only works well for relatively small graphs (no more than a @@ -69,7 +61,7 @@ StatusOr RenderGraph( const HloComputation& computation, absl::string_view label, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile = nullptr, - HloRenderOptions hlo_render_options = {}); + bool show_backend_config = false); // Like RenderGraph, but renders only nodes "near" the given node in the graph. // @@ -81,7 +73,7 @@ StatusOr RenderGraph( // will be omitted even if they are within the radius. StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - HloRenderOptions hlo_render_options = {}, + bool show_backend_config = false, const absl::flat_hash_set& boundary = {}); // Renders nodes on any of the paths from `from` to `to`. If there are more @@ -90,7 +82,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - HloRenderOptions hlo_render_options = {}); + bool show_backend_config = false); // Registers a function which implements RenderedGraphFormat::kUrl. // diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc index b6c62beff74..4f8a6b43314 100644 --- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc +++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc @@ -112,7 +112,8 @@ constexpr int64 kDefaultMaxNumNodesInAllPaths = 100; using absl::EqualsIgnoreCase; -HloRenderOptions hlo_render_options; +// A global control for whether backend configuration display is enabled. +bool show_backend_config = true; HloInstruction* FindInstruction(const HloModule& module, string node_name) { if (absl::StartsWith(node_name, "%")) { @@ -159,8 +160,6 @@ void DoHelpCommand() { Renders all nodes in . backend_config [on|off] Controls whether backend operation configuration information is printed. - show_fusion_subcomputations [on|off] - Controls whether fusion subcomputations are shown. list [name|op_name|op_type] Lists all instructions whose name, metadata op_name, or metadata op_type contains as a substring. @@ -183,32 +182,15 @@ void DoHelpCommand() { // Turn metadata-printing on or off. 
void DoBackendConfigCommand(const std::vector& tokens) { if (tokens.size() == 2 && tokens[1] == "on") { - hlo_render_options.show_backend_config = true; + show_backend_config = true; } else if (tokens.size() == 2 && tokens[1] == "off") { - hlo_render_options.show_backend_config = false; + show_backend_config = false; } else if (tokens.size() != 1) { std::cerr << "(Illegal backend_config value. Use either 'on' or 'off'.)" << std::endl; } std::cout << "Backend configuration display " - << (hlo_render_options.show_backend_config ? "ON" : "OFF") - << std::endl; -} - -// Turn fusion computation display on or off. -void DoShowFusionSubcomputationsCommand(const std::vector& tokens) { - if (tokens.size() == 2 && tokens[1] == "on") { - hlo_render_options.show_fusion_subcomputations = true; - } else if (tokens.size() == 2 && tokens[1] == "off") { - hlo_render_options.show_fusion_subcomputations = false; - } else if (tokens.size() != 1) { - std::cerr << "(Illegal show_fusion_subcomputations value. Use either " - "'on' or 'off'.)" - << std::endl; - } - std::cout << "Fusion subcomputations display " - << (hlo_render_options.show_fusion_subcomputations ? "ON" : "OFF") - << std::endl; + << (show_backend_config ? "ON" : "OFF") << std::endl; } // List all computations in the module. @@ -391,7 +373,7 @@ void DoExtractCommand(const HloModule& module, auto extracted_module = ExtractModule(instr, height); std::cout << extracted_module->ToString( HloPrintOptions::ShortParsable().set_print_backend_config( - hlo_render_options.show_backend_config)) + show_backend_config)) << std::endl; } @@ -535,7 +517,7 @@ void DoAllPathsCommand(const Options& opts, const HloModule& module, } RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderAllPathsFromTo(*from, *to, max_nodes, format, - hlo_render_options); + /*show_backend_config=*/show_backend_config); }); } @@ -600,13 +582,15 @@ void DoPlotCommand(const Options& opts, const HloModule& module, RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderGraph(*comp, /*label=*/"", comp->parent()->config().debug_options(), format, - /*hlo_execution_profile=*/nullptr, hlo_render_options); + /*hlo_execution_profile=*/nullptr, + /*show_backend_config=*/show_backend_config); }); } else { RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { - return RenderNeighborhoodAround(*instr, graph_width, format, - hlo_render_options, - /*boundary=*/boundary); + return RenderNeighborhoodAround( + *instr, graph_width, format, + /*show_backend_config=*/show_backend_config, + /*boundary=*/boundary); }); } } @@ -633,8 +617,6 @@ void InteractiveDumpGraphs(const Options& opts, const HloModule& module) { DoHelpCommand(); } else if (tokens[0] == "backend_config") { DoBackendConfigCommand(tokens); - } else if (tokens[0] == "show_fusion_subcomputations") { - DoShowFusionSubcomputationsCommand(tokens); } else if (tokens[0] == "list") { if (tokens.size() > 1 && tokens[1] == "computations") { DoListComputationsCommand(module, tokens); From 83ed5aad57de972ffc0708fae63772e5e62df69c Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Mon, 25 May 2020 14:02:01 -0700 Subject: [PATCH 1092/1533] Add `offset` argument to `Rescaling`. 
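
Illustrative sketch only (not part of the diff; follows the example given in the updated docstring below): with the new argument, a single layer both scales and shifts its inputs.

```python
import numpy as np
import tensorflow as tf

# Rescale image-like inputs from [0, 255] into [-1, 1], i.e. x * (1/127.5) - 1,
# using the new `offset` argument.
rescale = tf.keras.layers.experimental.preprocessing.Rescaling(
    scale=1. / 127.5, offset=-1.)
images = np.random.uniform(0, 255, size=(2, 4, 5, 3)).astype("float32")
outputs = rescale(images)  # float32 values now lie in [-1, 1]
```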
PiperOrigin-RevId: 313101675 Change-Id: Id59e6dcbe4f038d627c7d71fdf4dfeb58e8e05cd --- .../preprocessing/image_preprocessing.py | 18 ++++++++++++++---- .../preprocessing/image_preprocessing_test.py | 10 +++++----- ...experimental.preprocessing.-rescaling.pbtxt | 2 +- ...experimental.preprocessing.-rescaling.pbtxt | 2 +- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py index 832915dac68..e4b92e44e69 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py @@ -292,11 +292,16 @@ class RandomCrop(Layer): @keras_export('keras.layers.experimental.preprocessing.Rescaling') class Rescaling(Layer): - """Multiply inputs by `scale`. + """Multiply inputs by `scale` and adds `offset`. - For instance, to rescale an input in the `[0, 255]` range + For instance: + + 1. To rescale an input in the `[0, 255]` range to be in the `[0, 1]` range, you would pass `scale=1./255`. + 2. To rescale an input in the `[0, 255]` range to be in the `[-1, 1]` range, + you would pass `scale=1./127.5, offset=-1`. + The rescaling is applied both during training and inference. Input shape: @@ -307,16 +312,20 @@ class Rescaling(Layer): Arguments: scale: Float, the scale to apply to the inputs. + offset: Float, the offset to apply to the inputs. name: A string, the name of the layer. """ - def __init__(self, scale, name=None, **kwargs): + def __init__(self, scale, offset=0., name=None, **kwargs): self.scale = scale + self.offset = offset super(Rescaling, self).__init__(name=name, **kwargs) def call(self, inputs): dtype = self._compute_dtype - return math_ops.cast(inputs, dtype) * math_ops.cast(self.scale, dtype) + scale = math_ops.cast(self.scale, dtype) + offset = math_ops.cast(self.offset, dtype) + return math_ops.cast(inputs, dtype) * scale + offset def compute_output_shape(self, input_shape): return input_shape @@ -324,6 +333,7 @@ class Rescaling(Layer): def get_config(self): config = { 'scale': self.scale, + 'offset': self.offset, } base_config = super(Rescaling, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 38d2d25916a..14720d3541d 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -306,7 +306,7 @@ class RescalingTest(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_rescaling_base(self): - kwargs = {'scale': 0.004} + kwargs = {'scale': 1./127.5, 'offset': -1.} testing_utils.layer_test( image_preprocessing.Rescaling, kwargs=kwargs, @@ -315,18 +315,18 @@ class RescalingTest(keras_parameterized.TestCase): @tf_test_util.run_v2_only def test_rescaling_correctness_float(self): - layer = image_preprocessing.Rescaling(0.004) + layer = image_preprocessing.Rescaling(scale=1./127.5, offset=-1.) 
inputs = random_ops.random_uniform((2, 4, 5, 3)) outputs = layer(inputs) - self.assertAllClose(outputs.numpy(), inputs.numpy() * 0.004) + self.assertAllClose(outputs.numpy(), inputs.numpy() * (1./127.5) - 1) @tf_test_util.run_v2_only def test_rescaling_correctness_int(self): - layer = image_preprocessing.Rescaling(0.004) + layer = image_preprocessing.Rescaling(scale=1./127.5, offset=-1) inputs = random_ops.random_uniform((2, 4, 5, 3), 0, 100, dtype='int32') outputs = layer(inputs) self.assertEqual(outputs.dtype.name, 'float32') - self.assertAllClose(outputs.numpy(), inputs.numpy() * 0.004) + self.assertAllClose(outputs.numpy(), inputs.numpy() * (1./127.5) - 1) def test_config_with_custom_name(self): layer = image_preprocessing.Rescaling(0.5, name='rescaling') diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt index 7036fb926a8..60c0bc92f81 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt @@ -113,7 +113,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'scale\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + argspec: "args=[\'self\', \'scale\', \'offset\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.0\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt index 7036fb926a8..60c0bc92f81 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt @@ -113,7 +113,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'scale\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + argspec: "args=[\'self\', \'scale\', \'offset\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.0\', \'None\'], " } member_method { name: "add_loss" From 291125835ea056c6a1621d9fd83054178e5eaedc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 25 May 2020 14:48:04 -0700 Subject: [PATCH 1093/1533] Add `offset` argument to `Rescaling`. PiperOrigin-RevId: 313104348 Change-Id: I5472da4856a6040e74286a5dc174a5897b8955df --- .../preprocessing/image_preprocessing.py | 18 ++++-------------- .../preprocessing/image_preprocessing_test.py | 10 +++++----- ...experimental.preprocessing.-rescaling.pbtxt | 2 +- ...experimental.preprocessing.-rescaling.pbtxt | 2 +- 4 files changed, 11 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py index e4b92e44e69..832915dac68 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py @@ -292,16 +292,11 @@ class RandomCrop(Layer): @keras_export('keras.layers.experimental.preprocessing.Rescaling') class Rescaling(Layer): - """Multiply inputs by `scale` and adds `offset`. + """Multiply inputs by `scale`. - For instance: - - 1. 
To rescale an input in the `[0, 255]` range + For instance, to rescale an input in the `[0, 255]` range to be in the `[0, 1]` range, you would pass `scale=1./255`. - 2. To rescale an input in the `[0, 255]` range to be in the `[-1, 1]` range, - you would pass `scale=1./127.5, offset=-1`. - The rescaling is applied both during training and inference. Input shape: @@ -312,20 +307,16 @@ class Rescaling(Layer): Arguments: scale: Float, the scale to apply to the inputs. - offset: Float, the offset to apply to the inputs. name: A string, the name of the layer. """ - def __init__(self, scale, offset=0., name=None, **kwargs): + def __init__(self, scale, name=None, **kwargs): self.scale = scale - self.offset = offset super(Rescaling, self).__init__(name=name, **kwargs) def call(self, inputs): dtype = self._compute_dtype - scale = math_ops.cast(self.scale, dtype) - offset = math_ops.cast(self.offset, dtype) - return math_ops.cast(inputs, dtype) * scale + offset + return math_ops.cast(inputs, dtype) * math_ops.cast(self.scale, dtype) def compute_output_shape(self, input_shape): return input_shape @@ -333,7 +324,6 @@ class Rescaling(Layer): def get_config(self): config = { 'scale': self.scale, - 'offset': self.offset, } base_config = super(Rescaling, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 14720d3541d..38d2d25916a 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -306,7 +306,7 @@ class RescalingTest(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_rescaling_base(self): - kwargs = {'scale': 1./127.5, 'offset': -1.} + kwargs = {'scale': 0.004} testing_utils.layer_test( image_preprocessing.Rescaling, kwargs=kwargs, @@ -315,18 +315,18 @@ class RescalingTest(keras_parameterized.TestCase): @tf_test_util.run_v2_only def test_rescaling_correctness_float(self): - layer = image_preprocessing.Rescaling(scale=1./127.5, offset=-1.) 
+ layer = image_preprocessing.Rescaling(0.004) inputs = random_ops.random_uniform((2, 4, 5, 3)) outputs = layer(inputs) - self.assertAllClose(outputs.numpy(), inputs.numpy() * (1./127.5) - 1) + self.assertAllClose(outputs.numpy(), inputs.numpy() * 0.004) @tf_test_util.run_v2_only def test_rescaling_correctness_int(self): - layer = image_preprocessing.Rescaling(scale=1./127.5, offset=-1) + layer = image_preprocessing.Rescaling(0.004) inputs = random_ops.random_uniform((2, 4, 5, 3), 0, 100, dtype='int32') outputs = layer(inputs) self.assertEqual(outputs.dtype.name, 'float32') - self.assertAllClose(outputs.numpy(), inputs.numpy() * (1./127.5) - 1) + self.assertAllClose(outputs.numpy(), inputs.numpy() * 0.004) def test_config_with_custom_name(self): layer = image_preprocessing.Rescaling(0.5, name='rescaling') diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt index 60c0bc92f81..7036fb926a8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt @@ -113,7 +113,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'scale\', \'offset\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.0\', \'None\'], " + argspec: "args=[\'self\', \'scale\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt index 60c0bc92f81..7036fb926a8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt @@ -113,7 +113,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'scale\', \'offset\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.0\', \'None\'], " + argspec: "args=[\'self\', \'scale\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " } member_method { name: "add_loss" From 5dbc34f565304a89216038a751302c32207377b3 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Mon, 25 May 2020 15:49:01 -0700 Subject: [PATCH 1094/1533] Support Pad with static paddings in XNNPACK delegate PiperOrigin-RevId: 313107760 Change-Id: I7b04b9977081e760e9604f72d9da5f499ada88f3 --- tensorflow/lite/delegates/xnnpack/BUILD | 31 ++ tensorflow/lite/delegates/xnnpack/README.md | 37 +-- tensorflow/lite/delegates/xnnpack/pad_test.cc | 279 ++++++++++++++++++ .../lite/delegates/xnnpack/pad_tester.cc | 187 ++++++++++++ .../lite/delegates/xnnpack/pad_tester.h | 89 ++++++ .../delegates/xnnpack/xnnpack_delegate.cc | 199 ++++++++++++- tensorflow/workspace.bzl | 8 +- 7 files changed, 782 insertions(+), 48 deletions(-) create mode 100644 tensorflow/lite/delegates/xnnpack/pad_test.cc create mode 100644 tensorflow/lite/delegates/xnnpack/pad_tester.cc create mode 100644 tensorflow/lite/delegates/xnnpack/pad_tester.h diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index e8e6c061160..6edb757e83f 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -91,6 
+91,22 @@ cc_library( ], ) +cc_library( + name = "pad_tester", + testonly = 1, + srcs = ["pad_tester.cc"], + hdrs = ["pad_tester.h"], + deps = [ + "//tensorflow/lite:framework", + "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/schema:schema_fbs", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + cc_library( name = "pool_2d_tester", testonly = 1, @@ -293,6 +309,21 @@ cc_test( ], ) +cc_test( + name = "pad_test", + srcs = ["pad_test.cc"], + linkopts = select({ + "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS, + "//conditions:default": [], + }), + deps = [ + ":pad_tester", + ":test_main", + ":xnnpack_delegate_test_mode", + "@com_google_googletest//:gtest", + ], +) + cc_test( name = "relu_test", srcs = ["relu_test.cc"], diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md index c4e3f540faf..98a08a4f647 100644 --- a/tensorflow/lite/delegates/xnnpack/README.md +++ b/tensorflow/lite/delegates/xnnpack/README.md @@ -92,8 +92,6 @@ Below is the list of current operators and limitations: * Only addition with two inputs is supported. * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported, but fused `TANH` and `SIGN_BIT` activations are not. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### `AVERAGE_POOL_2D` @@ -101,8 +99,6 @@ Below is the list of current operators and limitations: * 1x1 pooling is not supported. * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported, but fused `TANH` and `SIGN_BIT` activations are not. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### `CONV_2D` @@ -111,8 +107,6 @@ Below is the list of current operators and limitations: * Both filter and bias must be static (use `kTfLiteMmapRo` allocation type). * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported, but fused `TANH` and `SIGN_BIT` activations are not. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) input and output - are not supported. ### `DEPTHWISE_CONV_2D` @@ -121,8 +115,6 @@ Below is the list of current operators and limitations: * Both filter and bias must be static (use `kTfLiteMmapRo` allocation type). * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported, but fused `TANH` and `SIGN_BIT` activations are not. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) input and output - are not supported. ### `FULLY_CONNECTED` @@ -131,20 +123,14 @@ Below is the list of current operators and limitations: * Both filter and bias must be static (use `kTfLiteMmapRo` allocation type). * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported, but fused `TANH` and `SIGN_BIT` activations are not. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) input and output - are not supported. ### `HARD_SWISH` * Inputs and outputs must be in 32-bit floating-point format. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### `LOGISTIC` * Inputs and outputs must be in 32-bit floating-point format. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### `MAX_POOL_2D` @@ -152,16 +138,19 @@ Below is the list of current operators and limitations: * 1x1 pooling is not supported. 
* Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported, but fused `TANH` and `SIGN_BIT` activations are not. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### `MUL` * Inputs and outputs must be in 32-bit floating-point format. * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported, but fused `TANH` and `SIGN_BIT` activations are not. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. + +### `PAD` + +* The first input and the output must be in 32-bit floating-point format. +* The second input (the input with the padding specification) must be static + (use `kTfLiteMmapRo` allocation type). +* The numbers of padding elements must be non-negative. ### `PRELU` @@ -169,36 +158,28 @@ Below is the list of current operators and limitations: * Slope must be static (use `kTfLiteMmapRo` allocation type). * Slope must be either a 1D tensor, or have all its non-channel dimensions equal 1. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) input and output - are not supported. ### `RELU` * Inputs and outputs must be in 32-bit floating-point format. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### `RELU6` * Inputs and outputs must be in 32-bit floating-point format. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### `RELU_N1_TO_1` * Inputs and outputs must be in 32-bit floating-point format. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### `SOFTMAX` * Inputs and outputs must be in 32-bit floating-point format. * Only `beta = 1.0` is supported. -* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and - output are not supported. ### Other limitations +* Dynamically allocated (with `kTfLiteDynamic` allocation type) inputs and + outputs are not supported. * Resizing model inputs (via `Interpreter::ResizeInputTensor`) is supported, but cause a complete reinitialization of the delegate instance, which has considerable overhead. diff --git a/tensorflow/lite/delegates/xnnpack/pad_test.cc b/tensorflow/lite/delegates/xnnpack/pad_test.cc new file mode 100644 index 00000000000..c93ff8ab661 --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/pad_test.cc @@ -0,0 +1,279 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include +#include "tensorflow/lite/delegates/xnnpack/pad_tester.h" +#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" + +namespace tflite { +namespace xnnpack { + +TEST(Pad, Full4D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({pad_rng(), pad_rng(), pad_rng(), pad_rng()}) + .InputPostPaddings({pad_rng(), pad_rng(), pad_rng(), pad_rng()}) + .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Batch4D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({pad_rng(), 0, 0, 0}) + .InputPostPaddings({pad_rng(), 0, 0, 0}) + .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, HeightAndWidth4D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({0, pad_rng(), pad_rng(), 0}) + .InputPostPaddings({0, pad_rng(), pad_rng(), 0}) + .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Channels4D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({0, 0, 0, pad_rng()}) + .InputPostPaddings({0, 0, 0, pad_rng()}) + .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Full3D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({pad_rng(), pad_rng(), pad_rng()}) + .InputPostPaddings({pad_rng(), pad_rng(), pad_rng()}) + .InputShape({shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Batch3D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + 
std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({pad_rng(), 0, 0}) + .InputPostPaddings({pad_rng(), 0, 0}) + .InputShape({shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Width3D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({0, pad_rng(), 0}) + .InputPostPaddings({0, pad_rng(), 0}) + .InputShape({shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Channels3D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({0, 0, pad_rng()}) + .InputPostPaddings({0, 0, pad_rng()}) + .InputShape({shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Full2D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({pad_rng(), pad_rng()}) + .InputPostPaddings({pad_rng(), pad_rng()}) + .InputShape({shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Batch2D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({pad_rng(), 0}) + .InputPostPaddings({pad_rng(), 0}) + .InputShape({shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, Channels2D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({0, pad_rng()}) + .InputPostPaddings({0, pad_rng()}) + .InputShape({shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, 1D) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({pad_rng(), 
pad_rng()}) + .InputPostPaddings({pad_rng(), pad_rng()}) + .InputShape({shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +TEST(Pad, MultiThreading) { + TfLiteXNNPackDelegateOptions delegate_options = + TfLiteXNNPackDelegateOptionsDefault(); + delegate_options.num_threads = 2; + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto pad_rng = + std::bind(std::uniform_int_distribution(1, 3), std::ref(rng)); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + + PadTester() + .InputPrePaddings({0, 0, 0, pad_rng()}) + .InputPostPaddings({0, 0, 0, pad_rng()}) + .InputShape({shape_rng(), shape_rng(), shape_rng(), shape_rng()}) + .Test(xnnpack_delegate.get()); +} + +} // namespace xnnpack +} // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/pad_tester.cc b/tensorflow/lite/delegates/xnnpack/pad_tester.cc new file mode 100644 index 00000000000..e364b880124 --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/pad_tester.cc @@ -0,0 +1,187 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/xnnpack/pad_tester.h" + +#include +#include +#include +#include +#include +#include + +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/version.h" + +namespace tflite { +namespace xnnpack { + +std::vector PadTester::OutputShape() const { + std::vector output_shape; + output_shape.reserve(InputShape().size()); + for (size_t i = 0; i < InputShape().size(); i++) { + int32_t output_dim = InputShape()[i]; + if (i < InputPrePaddings().size()) { + output_dim += InputPrePaddings()[i]; + } + if (i < InputPostPaddings().size()) { + output_dim += InputPostPaddings()[i]; + } + output_shape.push_back(output_dim); + } + return output_shape; +} + +void PadTester::Test(TfLiteDelegate* delegate) const { + ASSERT_EQ(InputPrePaddings().size(), InputPostPaddings().size()); + ASSERT_LE(InputPrePaddings().size(), InputShape().size()); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto input_rng = + std::bind(std::uniform_real_distribution(), std::ref(rng)); + + std::vector buffer = CreateTfLiteModel(); + const Model* model = GetModel(buffer.data()); + + std::unique_ptr delegate_interpreter; + ASSERT_EQ( + InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())( + &delegate_interpreter), + kTfLiteOk); + std::unique_ptr default_interpreter; + ASSERT_EQ( + InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())( + &default_interpreter), + kTfLiteOk); + + 
ASSERT_TRUE(delegate_interpreter); + ASSERT_TRUE(default_interpreter); + + ASSERT_EQ(delegate_interpreter->inputs().size(), 1); + ASSERT_EQ(default_interpreter->inputs().size(), 1); + + ASSERT_EQ(delegate_interpreter->outputs().size(), 1); + ASSERT_EQ(default_interpreter->outputs().size(), 1); + + ASSERT_EQ(delegate_interpreter->AllocateTensors(), kTfLiteOk); + ASSERT_EQ(default_interpreter->AllocateTensors(), kTfLiteOk); + + ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), kTfLiteOk); + + float* default_input_data = default_interpreter->typed_tensor( + default_interpreter->inputs()[0]); + std::generate(default_input_data, + default_input_data + ComputeSize(InputShape()), + std::ref(input_rng)); + + float* delegate_input_data = delegate_interpreter->typed_tensor( + delegate_interpreter->inputs()[0]); + std::copy(default_input_data, default_input_data + ComputeSize(InputShape()), + delegate_input_data); + + ASSERT_EQ(default_interpreter->Invoke(), kTfLiteOk); + ASSERT_EQ(delegate_interpreter->Invoke(), kTfLiteOk); + + float* default_output_data = default_interpreter->typed_tensor( + default_interpreter->outputs()[0]); + float* delegate_output_data = delegate_interpreter->typed_tensor( + delegate_interpreter->outputs()[0]); + + for (size_t i = 0; i < ComputeSize(OutputShape()); i++) { + ASSERT_EQ(default_output_data[i], delegate_output_data[i]); + } +} + +std::vector PadTester::CreateTfLiteModel() const { + flatbuffers::FlatBufferBuilder builder; + flatbuffers::Offset operator_code = + CreateOperatorCode(builder, BuiltinOperator_PAD); + + std::vector paddings(InputPrePaddings().size() + + InputPostPaddings().size()); + for (size_t i = 0; i < InputPrePaddings().size(); i++) { + paddings[i * 2] = InputPrePaddings()[i]; + paddings[i * 2 + 1] = InputPostPaddings()[i]; + } + const std::array, 2> buffers{{ + CreateBuffer(builder, builder.CreateVector({})), + CreateBuffer(builder, + builder.CreateVector( + reinterpret_cast(paddings.data()), + sizeof(float) * paddings.size())), + }}; + + const std::vector output_shape = OutputShape(); + const std::array paddings_shape{ + {static_cast(InputPrePaddings().size()), 2}}; + const std::array, 3> tensors{{ + CreateTensor(builder, + builder.CreateVector(InputShape().data(), + InputShape().size()), + TensorType_FLOAT32), + CreateTensor(builder, + builder.CreateVector(paddings_shape.data(), + paddings_shape.size()), + TensorType_INT32, /*buffer=*/1), + CreateTensor(builder, + builder.CreateVector(output_shape.data(), + output_shape.size()), + TensorType_FLOAT32), + }}; + + const std::array op_inputs{{0, 1}}; + const std::array op_outputs{{2}}; + flatbuffers::Offset op = CreateOperator( + builder, /*opcode_index=*/0, + builder.CreateVector(op_inputs.data(), op_inputs.size()), + builder.CreateVector(op_outputs.data(), op_outputs.size())); + + const std::array subgraph_inputs{{0}}; + const std::array subgraph_outputs{{2}}; + flatbuffers::Offset subgraph = CreateSubGraph( + builder, builder.CreateVector(tensors.data(), tensors.size()), + builder.CreateVector(subgraph_inputs.data(), + subgraph_inputs.size()), + builder.CreateVector(subgraph_outputs.data(), + subgraph_outputs.size()), + builder.CreateVector(&op, 1)); + + flatbuffers::Offset description = + builder.CreateString("Pad model"); + + flatbuffers::Offset model_buffer = CreateModel( + builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&operator_code, 1), + builder.CreateVector(&subgraph, 1), description, + builder.CreateVector(buffers.data(), buffers.size())); + + 
builder.Finish(model_buffer); + + return std::vector(builder.GetBufferPointer(), + builder.GetBufferPointer() + builder.GetSize()); +} + +int32_t PadTester::ComputeSize(const std::vector& shape) { + return std::accumulate(shape.cbegin(), shape.cend(), 1, + std::multiplies()); +} + +} // namespace xnnpack +} // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/pad_tester.h b/tensorflow/lite/delegates/xnnpack/pad_tester.h new file mode 100644 index 00000000000..ffcd47e05e9 --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/pad_tester.h @@ -0,0 +1,89 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_PAD_TESTER_H_ +#define TENSORFLOW_LITE_DELEGATES_XNNPACK_PAD_TESTER_H_ + +#include +#include +#include +#include + +#include +#include "tensorflow/lite/c/common.h" + +namespace tflite { +namespace xnnpack { + +class PadTester { + public: + PadTester() = default; + PadTester(const PadTester&) = delete; + PadTester& operator=(const PadTester&) = delete; + + inline PadTester& InputShape(std::initializer_list shape) { + for (auto it = shape.begin(); it != shape.end(); ++it) { + EXPECT_GT(*it, 0); + } + input_shape_ = std::vector(shape.begin(), shape.end()); + return *this; + } + + inline const std::vector& InputShape() const { return input_shape_; } + + inline PadTester& InputPrePaddings(std::initializer_list paddings) { + for (auto it = paddings.begin(); it != paddings.end(); ++it) { + EXPECT_GE(*it, 0); + } + input_pre_paddings_ = + std::vector(paddings.begin(), paddings.end()); + return *this; + } + + inline const std::vector InputPrePaddings() const { + return input_pre_paddings_; + } + + inline PadTester& InputPostPaddings(std::initializer_list paddings) { + for (auto it = paddings.begin(); it != paddings.end(); ++it) { + EXPECT_GE(*it, 0); + } + input_post_paddings_ = + std::vector(paddings.begin(), paddings.end()); + return *this; + } + + inline const std::vector InputPostPaddings() const { + return input_post_paddings_; + } + + std::vector OutputShape() const; + + void Test(TfLiteDelegate* delegate) const; + + private: + std::vector CreateTfLiteModel() const; + + static int32_t ComputeSize(const std::vector& shape); + + std::vector input_shape_; + std::vector input_pre_paddings_; + std::vector input_post_paddings_; +}; + +} // namespace xnnpack +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_XNNPACK_PAD_TESTER_H_ diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index 6d9b4dac8f8..2beaa16255d 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" #include +#include #include #include #include @@ -120,9 +121,22 @@ class Subgraph { return nullptr; } - for (int k = 0; k < node->inputs->size; k++) { - const int t = node->inputs->data[k]; - tensors[t] = t; + switch (registration->builtin_code) { + case kTfLiteBuiltinPad: + // Ignore the second input (static padding), because it is + // represented as parameters of the XNNPACK operator rather than + // extra input. + { + const int t = node->inputs->data[0]; + tensors[t] = t; + } + break; + default: + // All other operators: process all inputs + for (int k = 0; k < node->inputs->size; k++) { + const int t = node->inputs->data[k]; + tensors[t] = t; + } } for (int k = 0; k < node->outputs->size; k++) { const int t = node->outputs->data[k]; @@ -532,10 +546,11 @@ class Subgraph { return kTfLiteOk; } - static TfLiteStatus CheckTensorFloatType(TfLiteContext* context, - const TfLiteTensor& tensor, - int tensor_index, int node_index) { - if (tensor.type != kTfLiteFloat32) { + static TfLiteStatus CheckTensorType(TfLiteContext* context, + const TfLiteTensor& tensor, + TfLiteType expected_type, + int tensor_index, int node_index) { + if (tensor.type != expected_type) { TF_LITE_MAYBE_KERNEL_LOG( context, "unsupported type %s in tensor #%d in node #%d", TfLiteTypeGetName(tensor.type), tensor_index, node_index); @@ -544,28 +559,64 @@ class Subgraph { return kTfLiteOk; } + static TfLiteStatus CheckTensorFloatType(TfLiteContext* context, + const TfLiteTensor& tensor, + int tensor_index, int node_index) { + return CheckTensorType(context, tensor, kTfLiteFloat32, tensor_index, + node_index); + } + static TfLiteStatus CheckTensorShape(TfLiteContext* context, const TfLiteTensor& tensor, - int expected_num_dims, + int min_num_dims, int max_num_dims, int tensor_index) { - if (tensor.dims->size != expected_num_dims) { - TF_LITE_MAYBE_KERNEL_LOG( - context, - "unexpected number of shape dimensions (%d != %d) in tensor #%d", - tensor.dims->size, expected_num_dims, tensor_index); - return kTfLiteError; + if (min_num_dims == max_num_dims) { + if (tensor.dims->size != min_num_dims) { + TF_LITE_MAYBE_KERNEL_LOG( + context, + "unsupported number of shape dimensions (%d) in tensor #%d: " + "%d dimensions expected", + tensor.dims->size, tensor_index, min_num_dims); + return kTfLiteError; + } + } else { + if (tensor.dims->size < min_num_dims) { + TF_LITE_MAYBE_KERNEL_LOG( + context, + "unsupported number of shape dimensions (%d) in tensor #%d: " + "at least %d dimensions expected", + tensor.dims->size, tensor_index, min_num_dims); + return kTfLiteError; + } + if (tensor.dims->size > max_num_dims) { + TF_LITE_MAYBE_KERNEL_LOG( + context, + "unsupported number of shape dimensions (%d) in tensor #%d: " + "at most %d dimensions expected", + tensor.dims->size, tensor_index, max_num_dims); + return kTfLiteError; + } } for (int i = 0; i < tensor.dims->size; i++) { if (tensor.dims->data[i] <= 0) { TF_LITE_MAYBE_KERNEL_LOG(context, - "invalid dimension #%d (%d) in tensor #%d", i, - tensor.dims->data[i], tensor_index); + "invalid num of elements (%d) in " + "dimension #%d in tensor #%d", + tensor.dims->data[i], i, tensor_index); return kTfLiteError; } } return kTfLiteOk; } + static TfLiteStatus CheckTensorShape(TfLiteContext* context, + const TfLiteTensor& tensor, + int expected_num_dims, + int tensor_index) { + return CheckTensorShape(context, tensor, expected_num_dims, + expected_num_dims, tensor_index); + } + static TfLiteStatus CheckSlopeTensorShape(TfLiteContext* 
context, const TfLiteTensor& tensor, int tensor_index, int node_index) { @@ -592,6 +643,39 @@ class Subgraph { return kTfLiteOk; } + static TfLiteStatus CheckPaddingsTensorShape(TfLiteContext* context, + const TfLiteTensor& tensor, + int expected_rows, + int tensor_index, + int node_index) { + if (tensor.dims->size != 2) { + TF_LITE_MAYBE_KERNEL_LOG(context, + "unexpected number of shape dimensions (%d) in " + "padding tensor #%d in node #%d: " + "expected a 2D tensor", + tensor.dims->size, tensor_index, node_index); + return kTfLiteError; + } + if (tensor.dims->data[0] != expected_rows) { + TF_LITE_MAYBE_KERNEL_LOG(context, + "unexpected number of rows (%d) in " + "padding tensor #%d in node #%d: " + "%d rows expected", + tensor.dims->size, tensor_index, node_index, + expected_rows); + return kTfLiteError; + } + if (tensor.dims->data[1] != 2) { + TF_LITE_MAYBE_KERNEL_LOG(context, + "unexpected number of columns (%d) in " + "padding tensor #%d in node #%d: " + "2 columns expected", + tensor.dims->size, tensor_index, node_index); + return kTfLiteError; + } + return kTfLiteOk; + } + static TfLiteStatus CheckTensorNonDynamicAllocation( TfLiteContext* context, const TfLiteTensor& tensor, int tensor_index, int node_index) { @@ -693,6 +777,9 @@ class Subgraph { return VisitMulNode(subgraph, logging_context, node_index, node, context->tensors, mul_params, xnnpack_tensors); } + case kTfLiteBuiltinPad: + return VisitPadNode(subgraph, logging_context, node_index, node, + context->tensors, xnnpack_tensors); case kTfLiteBuiltinPrelu: return VisitPreluNode(subgraph, logging_context, node_index, node, context->tensors, xnnpack_tensors); @@ -1565,6 +1652,86 @@ class Subgraph { return kTfLiteOk; } + static TfLiteStatus VisitPadNode( + xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index, + TfLiteNode* node, const TfLiteTensor* tensors, + const std::vector& xnnpack_tensors) { + TF_LITE_ENSURE_STATUS( + CheckNumInputsAndOutputs(logging_context, node, 2, 1, node_index)); + + const TfLiteTensor& input_tensor = tensors[node->inputs->data[0]]; + TF_LITE_ENSURE_STATUS(CheckTensorFloatType( + logging_context, input_tensor, node->inputs->data[0], node_index)); + TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, input_tensor, 1, + XNN_MAX_TENSOR_DIMS, + node->inputs->data[0])); + TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation( + logging_context, input_tensor, node->inputs->data[0], node_index)); + + const TfLiteTensor& paddings_tensor = tensors[node->inputs->data[1]]; + TF_LITE_ENSURE_STATUS(CheckTensorType(logging_context, paddings_tensor, + kTfLiteInt32, node->inputs->data[1], + node_index)); + TF_LITE_ENSURE_STATUS(CheckPaddingsTensorShape( + logging_context, paddings_tensor, input_tensor.dims->size, + node->inputs->data[1], node_index)); + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, paddings_tensor, node->inputs->data[1], node_index)); + + const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]]; + TF_LITE_ENSURE_STATUS(CheckTensorFloatType( + logging_context, output_tensor, node->outputs->data[0], node_index)); + TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor, 1, + XNN_MAX_TENSOR_DIMS, + node->outputs->data[0])); + TF_LITE_ENSURE_STATUS(CheckTensorNonDynamicAllocation( + logging_context, output_tensor, node->outputs->data[0], node_index)); + + const int32_t* paddings_data = + reinterpret_cast(paddings_tensor.data.data); + for (int i = 0; i < paddings_tensor.dims->size; i++) { + const int32_t pre_padding = 
paddings_data[i * 2 + 0]; + if (pre_padding < 0) { + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "invalid pre-padding %d for dimension #%d in node %d", pre_padding, + i, node_index); + return kTfLiteError; + } + + const int32_t post_padding = paddings_data[i * 2 + 1]; + if (post_padding < 0) { + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "invalid post-padding %d for dimension #%d in node %d", pre_padding, + i, node_index); + return kTfLiteError; + } + } + + if (subgraph != nullptr) { + std::array pre_paddings{}; + std::array post_paddings{}; + for (int i = 0; i < paddings_tensor.dims->data[0]; i++) { + pre_paddings[i] = static_cast(paddings_data[i * 2 + 0]); + post_paddings[i] = static_cast(paddings_data[i * 2 + 1]); + } + + const xnn_status status = xnn_define_static_constant_pad( + subgraph, pre_paddings.data(), post_paddings.data(), + /*padding_value=*/0.0f, + /*input_id=*/xnnpack_tensors[node->inputs->data[0]], + /*output_id=*/xnnpack_tensors[node->outputs->data[0]], /*flags=*/0); + if (status != xnn_status_success) { + TF_LITE_KERNEL_LOG(logging_context, "failed to delegate PAD node #%d", + node_index); + return kTfLiteError; + } + } + + return kTfLiteOk; + } + static TfLiteStatus VisitPreluNode( xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index, TfLiteNode* node, const TfLiteTensor* tensors, diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index b7682468998..d196675b518 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "0440d9ad632945f10992664be84eb0c0c76581f8474df3c124aa30350981126c", - strip_prefix = "XNNPACK-d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee", + sha256 = "589acbfe90093c690a2817068fadfd7868000509304b5316d5c8d692b605b379", + strip_prefix = "XNNPACK-f5c4625a40ee296d47be936ff5e7b0809858627b", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee.zip", - "https://github.com/google/XNNPACK/archive/d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/f5c4625a40ee296d47be936ff5e7b0809858627b.zip", + "https://github.com/google/XNNPACK/archive/f5c4625a40ee296d47be936ff5e7b0809858627b.zip", ], ) From 256332096c08e67ecf080cae457b8d5287e241cc Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Mon, 25 May 2020 17:04:14 -0700 Subject: [PATCH 1095/1533] Make RandomFourierFeatures state saveable. 
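In practice this change lets models containing the layer round-trip through SavedModel, since the unscaled kernel, bias and kernel scale become tracked weights. A minimal sketch of that behavior, assuming the public `tf.keras` API of the same era (`tf.keras.layers.experimental.RandomFourierFeatures`); the patch's own test exercises the internal `tensorflow.python.keras` modules instead:

    import numpy as np
    import tensorflow as tf

    # Tiny functional model around RandomFourierFeatures.
    inputs = tf.keras.Input(shape=(2,))
    outputs = tf.keras.layers.experimental.RandomFourierFeatures(
        output_dim=10, scale=3.0)(inputs)
    model = tf.keras.Model(inputs, outputs)

    x = np.random.random((1, 2)).astype("float32")
    before = model.predict(x)

    # The unscaled kernel, bias and kernel scale are now tracked weights,
    # so they survive the SavedModel round trip.
    model.save("/tmp/rff_model")
    restored = tf.keras.models.load_model("/tmp/rff_model")
    after = restored.predict(x)

    np.testing.assert_allclose(before, after, atol=1e-4)
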
PiperOrigin-RevId: 313112328 Change-Id: I21c8881b84d8d40e90e3dc82bb38154bc928b5f4 --- tensorflow/python/keras/layers/kernelized.py | 14 +++++------ .../python/keras/layers/kernelized_test.py | 23 ++++++++++++++++++- 2 files changed, 29 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/layers/kernelized.py b/tensorflow/python/keras/layers/kernelized.py index ce53334ebc7..5f401899bec 100644 --- a/tensorflow/python/keras/layers/kernelized.py +++ b/tensorflow/python/keras/layers/kernelized.py @@ -191,15 +191,15 @@ class RandomFourierFeatures(base_layer.Layer): kernel_initializer = _get_random_features_initializer( self.kernel_initializer, shape=(input_dim, self.output_dim)) - unscaled_kernel = self.add_weight( - name='unscaled_random_features', + self.unscaled_kernel = self.add_weight( + name='unscaled_kernel', shape=(input_dim, self.output_dim), dtype=dtypes.float32, initializer=kernel_initializer, trainable=False) self.bias = self.add_weight( - name='random_features_bias', + name='bias', shape=(self.output_dim,), dtype=dtypes.float32, initializer=init_ops.random_uniform_initializer( @@ -208,20 +208,20 @@ class RandomFourierFeatures(base_layer.Layer): if self.scale is None: self.scale = _get_default_scale(self.kernel_initializer, input_dim) - scale = self.add_weight( - name='random_features_scale', + self.kernel_scale = self.add_weight( + name='kernel_scale', shape=(1,), dtype=dtypes.float32, initializer=init_ops.constant_initializer(self.scale), trainable=True, constraint='NonNeg') - self.kernel = (1.0 / scale) * unscaled_kernel super(RandomFourierFeatures, self).build(input_shape) def call(self, inputs): inputs = ops.convert_to_tensor_v2(inputs, dtype=self.dtype) inputs = gen_math_ops.cast(inputs, dtypes.float32) - outputs = gen_math_ops.mat_mul(inputs, self.kernel) + kernel = (1.0 / self.kernel_scale) * self.unscaled_kernel + outputs = gen_math_ops.mat_mul(inputs, kernel) outputs = nn.bias_add(outputs, self.bias) return gen_math_ops.cos(outputs) diff --git a/tensorflow/python/keras/layers/kernelized_test.py b/tensorflow/python/keras/layers/kernelized_test.py index edb58f77868..a6a9d88423f 100644 --- a/tensorflow/python/keras/layers/kernelized_test.py +++ b/tensorflow/python/keras/layers/kernelized_test.py @@ -20,6 +20,8 @@ from __future__ import print_function import functools import math +import os +import shutil from absl.testing import parameterized import numpy as np @@ -35,7 +37,10 @@ from tensorflow.python.keras import backend as keras_backend from tensorflow.python.keras import combinations from tensorflow.python.keras import initializers from tensorflow.python.keras.engine import base_layer_utils +from tensorflow.python.keras.engine import input_layer +from tensorflow.python.keras.engine import training from tensorflow.python.keras.layers import kernelized as kernel_layers +from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils import kernelized_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import init_ops @@ -65,6 +70,22 @@ class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase): else: self.assertAllClose(expected, actual, atol=atol) + @test_util.run_v2_only + def test_state_saving_and_loading(self): + input_data = np.random.random((1, 2)) + rff_layer = kernel_layers.RandomFourierFeatures(output_dim=10, scale=3.0) + inputs = input_layer.Input((2,)) + outputs = rff_layer(inputs) + model = training.Model(inputs, outputs) + output_data = model.predict(input_data) + temp_dir = 
self.get_temp_dir() + self.addCleanup(shutil.rmtree, temp_dir) + saved_model_dir = os.path.join(temp_dir, 'rff_model') + model.save(saved_model_dir) + new_model = save.load_model(saved_model_dir) + new_output_data = new_model.predict(input_data) + self.assertAllClose(output_data, new_output_data, atol=1e-4) + def test_invalid_output_dim(self): with self.assertRaisesRegexp( ValueError, r'`output_dim` should be a positive integer. Given: -3.'): @@ -246,7 +267,7 @@ class RandomFourierFeaturesTest(test.TestCase, parameterized.TestCase): num_trainable_vars = 1 if trainable else 0 self.assertLen(rff_layer.trainable_variables, num_trainable_vars) if trainable: - self.assertEqual('random_fourier_features/random_features_scale:0', + self.assertEqual('random_fourier_features/kernel_scale:0', rff_layer.trainable_variables[0].name) self.assertLen(rff_layer.non_trainable_variables, 3 - num_trainable_vars) From e12a7fb0327218ea2bd8bf8819595775e4abad16 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Mon, 25 May 2020 17:05:42 -0700 Subject: [PATCH 1096/1533] Fix the string representation of string literal subscripts to be `x['a']` instead of `x[a]`. PiperOrigin-RevId: 313112458 Change-Id: Ia4a5c8c846c11215c3064b49828cde37594bc6e2 --- .../python/autograph/pyct/qual_names.py | 31 +++++++------------ .../python/autograph/pyct/qual_names_test.py | 9 +++--- .../eager/gradient_input_output_exclusions.py | 3 +- 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/tensorflow/python/autograph/pyct/qual_names.py b/tensorflow/python/autograph/pyct/qual_names.py index f97e595d1dc..d9491691567 100644 --- a/tensorflow/python/autograph/pyct/qual_names.py +++ b/tensorflow/python/autograph/pyct/qual_names.py @@ -41,21 +41,13 @@ class Symbol(collections.namedtuple('Symbol', ['name'])): """Represents a Python symbol.""" -class StringLiteral(collections.namedtuple('StringLiteral', ['value'])): - """Represents a Python string literal.""" - - def __str__(self): - return '\'%s\'' % self.value - - def __repr__(self): - return str(self) - - -class NumberLiteral(collections.namedtuple('NumberLiteral', ['value'])): +class Literal(collections.namedtuple('Literal', ['value'])): """Represents a Python numeric literal.""" def __str__(self): - return '%s' % self.value + if isinstance(self.value, str): + return "'{}'".format(self.value) + return str(self.value) def __repr__(self): return str(self) @@ -91,7 +83,7 @@ class QN(object): self._has_subscript = True else: - if not isinstance(base, (str, StringLiteral, NumberLiteral)): + if not isinstance(base, (str, Literal)): # TODO(mdan): Require Symbol instead of string. 
raise ValueError( 'for simple QNs, base must be a string or a Literal object;' @@ -169,12 +161,13 @@ class QN(object): self.has_attr() == other.has_attr()) def __str__(self): + root = self.qn[0] if self.has_subscript(): - return str(self.qn[0]) + '[' + str(self.qn[1]) + ']' + return '{}[{}]'.format(root, self.qn[1]) if self.has_attr(): return '.'.join(map(str, self.qn)) else: - return str(self.qn[0]) + return str(root) def __repr__(self): return str(self) @@ -207,13 +200,11 @@ class QN(object): if isinstance(base, str): return gast.Name( base, ctx=CallerMustSetThis, annotation=None, type_comment=None) - elif isinstance(base, StringLiteral): - return gast.Constant(base.value, kind=None) - elif isinstance(base, NumberLiteral): + elif isinstance(base, Literal): return gast.Constant(base.value, kind=None) else: assert False, ('the constructor should prevent types other than ' - 'str, StringLiteral and NumberLiteral') + 'str and Literal') class QnResolver(gast.NodeTransformer): @@ -243,7 +234,7 @@ class QnResolver(gast.NodeTransformer): # Continuing silently because some demos use these. return node if isinstance(s.value, gast.Constant): - subscript = QN(NumberLiteral(s.value.value)) + subscript = QN(Literal(s.value.value)) else: # The index may be an expression, case in which a name doesn't make sense. if anno.hasanno(node.slice.value, anno.Basic.QN): diff --git a/tensorflow/python/autograph/pyct/qual_names_test.py b/tensorflow/python/autograph/pyct/qual_names_test.py index ce17aecc024..6addb0a7179 100644 --- a/tensorflow/python/autograph/pyct/qual_names_test.py +++ b/tensorflow/python/autograph/pyct/qual_names_test.py @@ -75,9 +75,7 @@ class QNTest(test.TestCase): b_sub_c = QN(b, subscript=c) a_sub_b_sub_c = QN(a, subscript=b_sub_c) self.assertEqual(a_sub_b_sub_c.qn, (a, b_sub_c)) - self.assertTrue(a_sub_b.is_composite()) self.assertTrue(a_sub_b_sub_c.is_composite()) - self.assertTrue(a_sub_b.has_subscript()) self.assertTrue(a_sub_b_sub_c.has_subscript()) self.assertEqual(b_sub_c.qn, (b, c)) self.assertEqual(str(a_sub_b_sub_c), 'a[b[c]]') @@ -154,14 +152,17 @@ class QNTest(test.TestCase): def test_literals(self): a = QN('a') - a_sub_str_b = QN(a, subscript=QN(qual_names.StringLiteral('b'))) + a_sub_str_b = QN(a, subscript=QN(qual_names.Literal('b'))) a_sub_b = QN(a, subscript=QN('b')) self.assertNotEqual(a_sub_str_b, a_sub_b) self.assertNotEqual(hash(a_sub_str_b), hash(a_sub_b)) + self.assertEqual(a_sub_str_b.ast().slice.value.value, 'b') + self.assertEqual(str(a_sub_str_b), "a['b']") - a_sub_three = QN(a, subscript=QN(qual_names.NumberLiteral(3))) + a_sub_three = QN(a, subscript=QN(qual_names.Literal(3))) self.assertEqual(a_sub_three.ast().slice.value.value, 3) + self.assertEqual(str(a_sub_three), "a[3]") def test_support_set(self): a = QN('a') diff --git a/tensorflow/python/eager/gradient_input_output_exclusions.py b/tensorflow/python/eager/gradient_input_output_exclusions.py index 94962bf6135..442151f667e 100644 --- a/tensorflow/python/eager/gradient_input_output_exclusions.py +++ b/tensorflow/python/eager/gradient_input_output_exclusions.py @@ -253,7 +253,8 @@ def _live_tensors(f, attr_name="inputs"): # Not a number, assuming it can be anything. return _ALL subscript_val, = subscript.qn - if not isinstance(subscript_val, qual_names.NumberLiteral): + if (not isinstance(subscript_val, qual_names.Literal) and + not isinstance(subscript_val.value, int)): # Not a number, assuming it can be anything. 
return _ALL input_output_indices.add(subscript_val.value) From 0097e04b243b10c0b6117e01b74497ff95aaf5e9 Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Mon, 25 May 2020 19:44:57 -0700 Subject: [PATCH 1097/1533] Skip kOptionalTensor(-1) in IsAllAllowedTensors() PiperOrigin-RevId: 313121707 Change-Id: I04062b6413f5db677294e3a78495d954c9307fff --- tensorflow/lite/delegates/gpu/common/model_builder.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 18b48583295..64b335f10a5 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2763,10 +2763,13 @@ absl::Status IsSupported(const TfLiteContext* context, TfLiteNode* node, ->IsSupported(context, node, registration); } -bool IsAllAllowedTensors(TfLiteContext* context, const TfLiteIntArray* array, +bool IsAllAllowedTensors(TfLiteContext* context, + const TfLiteIntArray* tensor_indices, bool allow_quant_ops = false) { - for (int i = 0; i < array->size; ++i) { - const TfLiteTensor* t = context->tensors + array->data[i]; + for (int i = 0; i < tensor_indices->size; ++i) { + int tensor_idx = tensor_indices->data[i]; + if (tensor_idx == kTfLiteOptionalTensor) continue; + const TfLiteTensor* t = &context->tensors[tensor_idx]; bool type_supported = (t->type == kTfLiteFloat32 || t->type == kTfLiteFloat16); if (allow_quant_ops) { From 7b48dab3ac85a79921f29bd093cedda8ab09e213 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Mon, 25 May 2020 20:21:16 -0700 Subject: [PATCH 1098/1533] Prune unused includes in XNNPACK tester headers PiperOrigin-RevId: 313123993 Change-Id: I69549e97cc1c4926ea5c2cab7fb56f3aa1e28b0d --- tensorflow/lite/delegates/xnnpack/BUILD | 5 +++++ .../lite/delegates/xnnpack/binary_elementwise_tester.h | 8 +------- .../lite/delegates/xnnpack/depthwise_conv_2d_tester.h | 8 +------- .../lite/delegates/xnnpack/fully_connected_tester.h | 2 -- tensorflow/lite/delegates/xnnpack/pad_tester.h | 2 -- tensorflow/lite/delegates/xnnpack/pool_2d_tester.h | 8 +------- tensorflow/lite/delegates/xnnpack/softmax_tester.h | 8 +------- .../lite/delegates/xnnpack/unary_elementwise_tester.h | 8 +------- 8 files changed, 10 insertions(+), 39 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index 6edb757e83f..1cdba72b615 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -53,6 +53,7 @@ cc_library( deps = [ "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", @@ -68,6 +69,7 @@ cc_library( deps = [ "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", @@ -115,6 +117,7 @@ cc_library( deps = [ "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", @@ -130,6 +133,7 @@ cc_library( deps = [ "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", 
"@com_google_googletest//:gtest", @@ -145,6 +149,7 @@ cc_library( deps = [ "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", diff --git a/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.h b/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.h index 6d9a8b6caa9..15c99c3148d 100644 --- a/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.h +++ b/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.h @@ -17,17 +17,11 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_XNNPACK_BINARY_ELEMENTWISE_TESTER_H_ #include -#include -#include #include #include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/version.h" namespace tflite { namespace xnnpack { diff --git a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h index ec8e4cea429..16dc5920229 100644 --- a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h +++ b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h @@ -17,17 +17,11 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_XNNPACK_DEPTHWISE_CONV_2D_TESTER_H_ #include -#include -#include #include #include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/version.h" namespace tflite { namespace xnnpack { diff --git a/tensorflow/lite/delegates/xnnpack/fully_connected_tester.h b/tensorflow/lite/delegates/xnnpack/fully_connected_tester.h index 1c8e3d5d60c..cf1d5513d46 100644 --- a/tensorflow/lite/delegates/xnnpack/fully_connected_tester.h +++ b/tensorflow/lite/delegates/xnnpack/fully_connected_tester.h @@ -17,8 +17,6 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_XNNPACK_FULLY_CONNECTED_TESTER_H_ #include -#include -#include #include #include diff --git a/tensorflow/lite/delegates/xnnpack/pad_tester.h b/tensorflow/lite/delegates/xnnpack/pad_tester.h index ffcd47e05e9..a6951fdf156 100644 --- a/tensorflow/lite/delegates/xnnpack/pad_tester.h +++ b/tensorflow/lite/delegates/xnnpack/pad_tester.h @@ -17,8 +17,6 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_XNNPACK_PAD_TESTER_H_ #include -#include -#include #include #include diff --git a/tensorflow/lite/delegates/xnnpack/pool_2d_tester.h b/tensorflow/lite/delegates/xnnpack/pool_2d_tester.h index 3125e9231f6..a84be10ad45 100644 --- a/tensorflow/lite/delegates/xnnpack/pool_2d_tester.h +++ b/tensorflow/lite/delegates/xnnpack/pool_2d_tester.h @@ -17,17 +17,11 @@ limitations under the License. 
#define TENSORFLOW_LITE_DELEGATES_XNNPACK_POOL_2D_TESTER_H_ #include -#include -#include #include #include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/version.h" namespace tflite { namespace xnnpack { diff --git a/tensorflow/lite/delegates/xnnpack/softmax_tester.h b/tensorflow/lite/delegates/xnnpack/softmax_tester.h index 9f930a6f21e..674dc9a443e 100644 --- a/tensorflow/lite/delegates/xnnpack/softmax_tester.h +++ b/tensorflow/lite/delegates/xnnpack/softmax_tester.h @@ -17,17 +17,11 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_XNNPACK_SOFTMAX_TESTER_H_ #include -#include -#include #include #include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/version.h" namespace tflite { namespace xnnpack { diff --git a/tensorflow/lite/delegates/xnnpack/unary_elementwise_tester.h b/tensorflow/lite/delegates/xnnpack/unary_elementwise_tester.h index 88508ccd1c1..e3c210fd6b3 100644 --- a/tensorflow/lite/delegates/xnnpack/unary_elementwise_tester.h +++ b/tensorflow/lite/delegates/xnnpack/unary_elementwise_tester.h @@ -17,17 +17,11 @@ limitations under the License. #define TENSORFLOW_LITE_DELEGATES_XNNPACK_UNARY_ELEMENTWISE_TESTER_H_ #include -#include -#include #include #include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/version.h" namespace tflite { namespace xnnpack { From 07bb0db8627defaafdadd458e4fbaa5b4a4bfcab Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Mon, 25 May 2020 22:04:08 -0700 Subject: [PATCH 1099/1533] Remove the run_deprecated_v1 annotation PiperOrigin-RevId: 313131936 Change-Id: I6a7dadc51ea399438e80c3fcf90c8ba7df59c0e2 --- tensorflow/lite/python/util_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/python/util_test.py b/tensorflow/lite/python/util_test.py index 51a0c57260a..f3c287dd7fc 100644 --- a/tensorflow/lite/python/util_test.py +++ b/tensorflow/lite/python/util_test.py @@ -174,7 +174,6 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase): str(error.exception)) self.assertEqual([None, 3, 5], tensor.shape.as_list()) - @test_util.run_deprecated_v1 def testSetTensorShapeDimensionInvalid(self): # Tests set_tensor_shape where the shape passed in is incompatible. 
with ops.Graph().as_default(): From 0b0e35dccf2cd4f58e89236167e0bff55999392c Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Mon, 25 May 2020 22:56:17 -0700 Subject: [PATCH 1100/1533] Separate metal delegate to a subspec Cocoapods PiperOrigin-RevId: 313135776 Change-Id: I271177d71220b23c8671edd761006d0ea313996a --- tensorflow/lite/delegates/gpu/BUILD | 1 + tensorflow/lite/experimental/ios/BUILD.apple | 26 +++++++++++++++---- .../ios/TensorFlowLiteC.podspec.template | 6 +++++ .../lite/experimental/swift/BUILD.apple | 1 + .../swift/Sources/MetalDelegate.swift | 2 +- .../TensorFlowLiteSwift.podspec.template | 9 ++++++- 6 files changed, 38 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index c667c2056f4..bb509610c7a 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -80,6 +80,7 @@ objc_library( name = "metal_delegate", srcs = ["metal_delegate.mm"], hdrs = ["metal_delegate.h"], + module_name = "TensorFlowLiteCMetal", sdk_frameworks = ["Metal"], deps = [ "//tensorflow/lite:kernel_api", diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index 7e2a3623af1..ddbfc0dec5b 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -78,15 +78,32 @@ ios_static_framework( ], ) +# This target builds the Metal delegate as a separate static framework, which +# does not include the TensorFlow Lite runtime. As this target does not contain +# TensorFlow Lite runtime, it is intended to be linked along with the +# TensorFlowLiteC framework above in a composable way. +# +# bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteCMetal_framework +ios_static_framework( + name = "TensorFlowLiteCMetal_framework", + hdrs = [ + "//tensorflow/lite/delegates/gpu:metal_delegate.h", + ], + avoid_deps = [ + ":tensorflow_lite_c", + ], + bundle_name = "TensorFlowLiteCMetal", + minimum_os_version = TFL_MINIMUM_OS_VERSION, + deps = [ + "//tensorflow/lite/delegates/gpu:metal_delegate", + ], +) + cc_library( name = "tensorflow_lite_c", hdrs = [ "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", - "//tensorflow/lite/delegates/gpu:metal_delegate.h", - ], - linkopts = [ - "-Wl,-weak_framework,Metal", ], tags = [ "nobuilder", @@ -94,7 +111,6 @@ cc_library( ], deps = [ "//tensorflow/lite/c:c_api", - "//tensorflow/lite/delegates/gpu:metal_delegate", ], ) diff --git a/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template b/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template index d8a5ef8f2e1..3f0517e1fe6 100644 --- a/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template +++ b/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template @@ -31,4 +31,10 @@ Pod::Spec.new do |s| coreml.dependency 'TensorFlowLiteC/Core' coreml.vendored_frameworks = 'Frameworks/TensorFlowLiteCCoreML.framework' end + + s.subspec 'Metal' do |metal| + metal.weak_framework = 'Metal' + metal.dependency 'TensorFlowLiteC/Core' + metal.vendored_frameworks = 'Frameworks/TensorFlowLiteCMetal.framework' + end end diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple index e671721dd1c..d5aeafe4c01 100644 --- a/tensorflow/lite/experimental/swift/BUILD.apple +++ b/tensorflow/lite/experimental/swift/BUILD.apple @@ -34,6 +34,7 @@ swift_library( tags = TFL_DEFAULT_TAGS, visibility = ios_visibility_whitelist(), deps = [ + 
"//tensorflow/lite/delegates/gpu:metal_delegate", "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate", "//tensorflow/lite/experimental/ios:tensorflow_lite_c", ], diff --git a/tensorflow/lite/experimental/swift/Sources/MetalDelegate.swift b/tensorflow/lite/experimental/swift/Sources/MetalDelegate.swift index 8fd15f303da..6cde2533f95 100644 --- a/tensorflow/lite/experimental/swift/Sources/MetalDelegate.swift +++ b/tensorflow/lite/experimental/swift/Sources/MetalDelegate.swift @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -import TensorFlowLiteC +import TensorFlowLiteCMetal /// A delegate that uses the `Metal` framework for performing TensorFlow Lite graph operations with /// GPU acceleration. diff --git a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template index a925112f539..b87b4c97d67 100644 --- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template +++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template @@ -26,7 +26,7 @@ Pod::Spec.new do |s| s.subspec 'Core' do |core| core.dependency 'TensorFlowLiteC', "#{s.version}" core.source_files = swift_dir + 'Sources/*.swift' - core.exclude_files = swift_dir + 'Sources/CoreMLDelegate.swift' + core.exclude_files = swift_dir + 'Sources/*Delegate.swift' end s.subspec 'CoreML' do |coreml| @@ -35,6 +35,13 @@ Pod::Spec.new do |s| coreml.dependency 'TensorFlowLiteSwift/Core', "#{s.version}" end + s.subspec 'Metal' do |metal| + metal.source_files = swift_dir + 'Sources/MetalDelegate.swift' + metal.dependency 'TensorFlowLiteC/Metal', "#{s.version}" + metal.dependency 'TensorFlowLiteSwift/Core', "#{s.version}" + end + + s.test_spec 'Tests' do |ts| ts.source_files = swift_dir + 'Tests/*.swift' ts.resources = [ From b0ad9c1aed6d5762acdd84d9f09b27642ff6ebd5 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 26 May 2020 01:15:30 -0700 Subject: [PATCH 1101/1533] Fix code relying on implicit bool conversion of mlir::Value https://github.com/llvm/llvm-project/commit/a9b5edc5e2c4ec9d506b2c30465ee9f2dc21e5cc breaks this. PiperOrigin-RevId: 313147609 Change-Id: I7c46ddd3539b97c40c4ad0300f074702b7064da2 --- .../tensorflow/transforms/tpu_variable_runtime_reformatting.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc index 9e8745918e3..ec4a25c6fdd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_variable_runtime_reformatting.cc @@ -229,7 +229,7 @@ AnnotateCompileOpAndGetExecuteArgToWhileArgsMapping( mapping.emplace_back(it->second, std::move(while_args)); } // Sort the mapping according to execute operand order. - llvm::sort(mapping); + llvm::sort(mapping, llvm::less_first()); // Populate the `retval_index_for_sharding` field of the argument metadate. for (auto entry : llvm::enumerate(execute.device_var_reads_indices())) { int64_t arg_index = entry.value().cast().getInt(); From 0fa5426f997856b2ee1055e4d74984bde2d1fc9c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 01:57:24 -0700 Subject: [PATCH 1102/1533] Replacing the call to base::SpecifiedOnCommandLine with base::WasPresentOnCommandLine. 
PiperOrigin-RevId: 313151744 Change-Id: I13adf6964cf38fc1c6fab07c9f426fe5b44768fa --- tensorflow/lite/toco/model_cmdline_flags.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/toco/model_cmdline_flags.cc b/tensorflow/lite/toco/model_cmdline_flags.cc index 2434481272f..86a1cedd612 100644 --- a/tensorflow/lite/toco/model_cmdline_flags.cc +++ b/tensorflow/lite/toco/model_cmdline_flags.cc @@ -204,7 +204,7 @@ void ReadModelFlagsFromCommandLineFlags( } #ifdef PLATFORM_GOOGLE - CHECK(!((base::SpecifiedOnCommandLine("batch") && + CHECK(!((base::WasPresentOnCommandLine("batch") && parsed_model_flags.variable_batch.specified()))) << "The --batch and --variable_batch flags are mutually exclusive."; #endif From f0d0fbd7bdd9c50305f13bbac56e8f8e239a5cd8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 02:02:33 -0700 Subject: [PATCH 1103/1533] compat: Update forward compatibility horizon to 2020-05-26 PiperOrigin-RevId: 313152232 Change-Id: I9a2ffc73f3a3c2b03ac583a2e1f65ede8f672b39 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index c8c481c2b76..927256bc55d 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 25) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 26) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 64a37a9028f9d527ef3e1a137a1c4ba9c8254622 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 02:02:44 -0700 Subject: [PATCH 1104/1533] Update GraphDef version to 413. PiperOrigin-RevId: 313152254 Change-Id: Ic0dce11fd9e1e6c24390446de3c72f4e6367b26f --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 8cdf617144d..b02f78a9dc3 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 412 // Updated: 2020/5/25 +#define TF_GRAPH_DEF_VERSION 413 // Updated: 2020/5/26 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From be46769cee1f15f4b439331bc325a31804dad5ae Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 26 May 2020 02:54:20 -0700 Subject: [PATCH 1105/1533] [XLA:CPU] Switch parallel_task_assignment to a blacklist so it doesn't parallelize HLOs it doesn't know about The remaining list is roughly identical to what can go into a loop fusion. Add a test that we don't parallelize allreduce. 
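Paraphrased, the pass now consults the cost model only for an explicit allowlist of operations (plus elementwise ops and loop fusions) and leaves everything else, including all-reduce, single-threaded. A rough Python sketch of that control flow, with a made-up Instr stand-in for the real HloInstruction type (the authoritative logic is the C++ diff that follows):

    from dataclasses import dataclass

    PARALLELIZABLE = {
        "broadcast", "concatenate", "dynamic-slice", "dynamic-update-slice",
        "gather", "iota", "pad", "reduce", "reduce-window", "reshape",
        "reverse", "slice", "transpose",
    }

    @dataclass
    class Instr:                      # stand-in for HloInstruction
        opcode: str
        is_elementwise: bool = False
        is_loop_fusion: bool = False
        is_tuple_shaped: bool = False

    def target_parallel_task_count(instr, cost_model_count):
        # Tuple-shaped results and rng never get split across threads
        # (the real pass also bails out on in-place dynamic-update-slice).
        if instr.is_tuple_shaped or instr.opcode == "rng":
            return 1
        # Only known-good ops are handed to the cost model.
        if (instr.is_elementwise or instr.is_loop_fusion
                or instr.opcode in PARALLELIZABLE):
            return cost_model_count(instr)
        return 1

    print(target_parallel_task_count(Instr("all-reduce"), lambda i: 8))  # 1
    print(target_parallel_task_count(Instr("reduce"), lambda i: 8))      # 8
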
PiperOrigin-RevId: 313157848 Change-Id: I5e7c85c11d78ba8b9b8a75a15c80eb67cd151064 --- .../service/cpu/parallel_task_assignment.cc | 35 +++++++++++-------- .../cpu/parallel_task_assignment_test.cc | 21 +++++++++++ 2 files changed, 41 insertions(+), 15 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc index 14afe770ede..225102e6ae6 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc @@ -142,24 +142,29 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount( // in-place will only touch the updated elements). // TODO(b/27458679) Parallelize instructions which are skipped here. auto opcode = instruction->opcode(); - if (opcode == HloOpcode::kParameter || opcode == HloOpcode::kConstant || - opcode == HloOpcode::kCall || opcode == HloOpcode::kCustomCall || - opcode == HloOpcode::kDot || opcode == HloOpcode::kSelectAndScatter || - opcode == HloOpcode::kGetTupleElement || opcode == HloOpcode::kBitcast || - opcode == HloOpcode::kFft || opcode == HloOpcode::kInfeed || - opcode == HloOpcode::kOutfeed || opcode == HloOpcode::kRng || - opcode == HloOpcode::kSort || - (opcode == HloOpcode::kConvolution && - PotentiallyImplementedAsEigenConvolution(*instruction, - target_machine_features_)) || - (opcode == HloOpcode::kFusion && !instruction->IsLoopFusion()) || - llvm_ir::MayBeImplementedAsInPlaceDynamicUpdateSlice(instruction) || - instruction->shape().IsTuple()) { + if (llvm_ir::MayBeImplementedAsInPlaceDynamicUpdateSlice(instruction) || + instruction->shape().IsTuple() || opcode == HloOpcode::kRng) { return 1; } - // Consult 'cost_model_' to compute target parallel task count. - return cost_model_->GetParallelTaskCount(instruction); + // Only allow known good instructions. + if (instruction->IsElementwise() || instruction->IsLoopFusion() || + opcode == HloOpcode::kBroadcast || opcode == HloOpcode::kConcatenate || + opcode == HloOpcode::kDynamicSlice || + opcode == HloOpcode::kDynamicUpdateSlice || + opcode == HloOpcode::kGather || opcode == HloOpcode::kIota || + opcode == HloOpcode::kPad || opcode == HloOpcode::kReduce || + opcode == HloOpcode::kReduceWindow || opcode == HloOpcode::kReshape || + opcode == HloOpcode::kReverse || opcode == HloOpcode::kSlice || + opcode == HloOpcode::kTranspose || + (opcode == HloOpcode::kConvolution && + !PotentiallyImplementedAsEigenConvolution(*instruction, + target_machine_features_))) { + // Consult 'cost_model_' to compute target parallel task count. 
+ return cost_model_->GetParallelTaskCount(instruction); + } + + return 1; } StatusOr ParallelTaskAssigner::Run(HloModule* module) { diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc index e2c93568b74..e22210a61f2 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc @@ -170,5 +170,26 @@ TEST_F(ParallelTaskAssignmentTest, InPlaceDynamicUpdateSliceNotParallelized) { EXPECT_FALSE(changed); } +TEST_F(ParallelTaskAssignmentTest, AllReduceNotParallelized) { + constexpr char hlo_string[] = R"( + HloModule TestTaskParallel_allreduce + add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) + } + + ENTRY CRS { + input = f32[1234567] parameter(0) + ROOT crs = f32[1234567] all-reduce(input), replica_groups={}, to_apply=add + } + )"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr m, + ParseAndReturnVerifiedModule(hlo_string)); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(m.get())); + EXPECT_FALSE(changed); +} + } // namespace } // namespace xla From e142390e50b6a281fa782f3e3898871cf3238453 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 26 May 2020 03:23:11 -0700 Subject: [PATCH 1106/1533] Fix stable sort for tensors with rank > 1. Although we used a call to stable_sort, we didn't reset the indices back to the initial order. For tensors with rank > 1 we need to do several calls to stable_sort, so only the first call to stable_sort actually resulted in a stable order relative to the original order. PiperOrigin-RevId: 313160903 Change-Id: I9bd63b333c05c67c5067204fb010ecd92c9cf113 --- .../compiler/xla/service/cpu/runtime_key_value_sort.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc index 7831c1b1b5b..0d4e7055ddb 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc @@ -60,6 +60,11 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_KeyValueSort( std::unique_ptr reordered_values( new std::string[sort_dimension_elements]); for (int64 index = 0; index < num_iteration_elements; ++index) { + // If the sort should be stable, we have to reinitialize indices to iota to + // guarantee that we still keep the relative order in case of ties. + if (is_stable && index > 0) { + std::iota(indices.get(), indices.get() + sort_dimension_elements, 0); + } // 'index' can be split into two values which index into the 'c' dimension // and the 'a' dimension, respectively. 'index' % 'c' is the index into the // 'c' dimension, 'index' / 'c' is the index into the 'a' dimension. When From e3e0ba57815094e58c6df16ef1b68281418bf4a6 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Tue, 26 May 2020 03:33:56 -0700 Subject: [PATCH 1107/1533] [XLA:GPU] Use device specific tanh functions for f64 inputs to improve accuracy. 
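The effect is easiest to see outside the runtime: sorting a rank-2 tensor row by row through a shared index buffer is only stable if that buffer is reset to 0..n-1 before every row; otherwise ties in later rows are broken by the previous row's permutation rather than the original order. A small NumPy/Python illustration of the fixed behavior (not the actual C++ runtime code):

    import numpy as np

    def stable_sort_rows(keys):
        """Sorts each row, keeping the original order among equal values."""
        out = np.empty_like(keys)
        for r in range(keys.shape[0]):
            # The fix: start from the identity permutation for *every* row.
            # Reusing the previous row's permutation breaks stability for
            # rank > 1 inputs.
            indices = list(range(keys.shape[1]))
            indices.sort(key=lambda i: keys[r, i])  # Python's sort is stable
            out[r] = keys[r, indices]
        return out

    x = np.array([[2.0, 1.0, 1.0],
                  [3.0, 0.0, 0.0]])
    print(stable_sort_rows(x))
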
PiperOrigin-RevId: 313161798 Change-Id: I5cc9cbf5e48cc0632f396ef2a05df1db7011fadb --- .../compiler/xla/python/xla_client_test.py | 16 +++++- .../xla/service/gpu/elemental_ir_emitter.cc | 7 +++ .../compiler/xla/service/gpu/target_util.cc | 53 ++++++++++--------- .../compiler/xla/service/gpu/target_util.h | 19 +++---- 4 files changed, 59 insertions(+), 36 deletions(-) diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index fbdd9921a40..000db2cb16b 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -115,6 +115,10 @@ def TestFactory(xla_backend, cloud_tpu=False): """Convenience wrapper to create Numpy arrays with a np.float32 dtype.""" return np.array(*args, dtype=np.float32, **kwargs) + def NumpyArrayF64(*args, **kwargs): + """Convenience wrapper to create Numpy arrays with a np.float64 dtype.""" + return np.array(*args, dtype=np.float64, **kwargs) + def NumpyArrayS32(*args, **kwargs): """Convenience wrapper to create Numpy arrays with a np.int32 dtype.""" return np.array(*args, dtype=np.int32, **kwargs) @@ -882,12 +886,20 @@ def TestFactory(xla_backend, cloud_tpu=False): ops.Abs(ops.Constant(c, arr)) self._ExecuteAndCompareClose(c, expected=[np.abs(arr)]) - def testTanh(self): + def testTanhF32(self): c = self._NewComputation() - arr = NumpyArrayF32([3.3, 12.1]) + arr = NumpyArrayF32([-0.2, 3.3, 12.1, 0.1, 0.0001]) ops.Tanh(ops.Constant(c, arr)) self._ExecuteAndCompareClose(c, expected=[np.tanh(arr)]) + def testTanhF64(self): + if self.backend.platform == "tpu": + self.skipTest("TPU doesn't support 64bit tanh") + c = self._NewComputation() + arr = NumpyArrayF64([-0.2, 3.3, 12.1, 0.1, 0.0001]) + ops.Tanh(ops.Constant(c, arr)) + self._ExecuteAndCompareClose(c, expected=[np.tanh(arr)], rtol=1e-12) + def testTranspose(self): def _TransposeAndTest(array, permutation): diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index 1be0b1b4e7b..eee0fc83481 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -260,6 +260,13 @@ StatusOr GpuElementalIrEmitter::EmitAtan2(PrimitiveType prim_type, StatusOr GpuElementalIrEmitter::EmitTanh(PrimitiveType prim_type, llvm::Value* value) { + // When F64 is being requested, assume performance is less important and use + // the more numerically precise tanh function. + if (prim_type == F64) { + return EmitDeviceMathCall(TargetDeviceFunctionID::kTanh, {value}, + {prim_type}, prim_type); + } + // Emit a fast approximation of tanh instead of calling __nv_tanh. 
// __nv_tanh is particularly bad because it contains branches, thus // preventing LLVM's load-store vectorizer from working its magic across a diff --git a/tensorflow/compiler/xla/service/gpu/target_util.cc b/tensorflow/compiler/xla/service/gpu/target_util.cc index 49eadd8c6be..31b590a19ff 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.cc +++ b/tensorflow/compiler/xla/service/gpu/target_util.cc @@ -111,47 +111,50 @@ struct TargetDeviceFunction { struct TargetDeviceFunction GetDeviceFunctionRoot( TargetDeviceFunctionID func_id) { switch (func_id) { - case TargetDeviceFunctionID::kPow: { - return {"__nv_pow", "__ocml_pow"}; - } - case TargetDeviceFunctionID::kErfcinv: { - return {"__nv_erfcinv", "__ocml_erfcinv"}; - } - case TargetDeviceFunctionID::kLog: { - return {"__nv_log", "__ocml_log"}; - } - case TargetDeviceFunctionID::kLog1p: { - return {"__nv_log1p", "__ocml_log1p"}; - } - case TargetDeviceFunctionID::kSin: { - return {"__nv_sin", "__ocml_sin"}; + case TargetDeviceFunctionID::kAtan2: { + return {"__nv_atan2", "__ocml_atan2"}; } case TargetDeviceFunctionID::kCos: { return {"__nv_cos", "__ocml_cos"}; } + case TargetDeviceFunctionID::kErfcinv: { + return {"__nv_erfcinv", "__ocml_erfcinv"}; + } case TargetDeviceFunctionID::kExp: { return {"__nv_exp", "__ocml_exp"}; } case TargetDeviceFunctionID::kExpm1: { return {"__nv_expm1", "__ocml_expm1"}; } - case TargetDeviceFunctionID::kSqrt: { - return {"__nv_sqrt", "__ocml_sqrt"}; - } - case TargetDeviceFunctionID::kRsqrt: { - return {"__nv_rsqrt", "__ocml_rsqrt"}; - } - case TargetDeviceFunctionID::kAtan2: { - return {"__nv_atan2", "__ocml_atan2"}; - } case TargetDeviceFunctionID::kFmod: { return {"__nv_fmod", "__ocml_fmod"}; } + case TargetDeviceFunctionID::kHypot: { + return {"__nv_hypot", "__ocml_hypot"}; + } + case TargetDeviceFunctionID::kLog: { + return {"__nv_log", "__ocml_log"}; + } + case TargetDeviceFunctionID::kLog1p: { + return {"__nv_log1p", "__ocml_log1p"}; + } + case TargetDeviceFunctionID::kPow: { + return {"__nv_pow", "__ocml_pow"}; + } case TargetDeviceFunctionID::kRound: { return {"__nv_round", "__ocml_round"}; } - case TargetDeviceFunctionID::kHypot: { - return {"__nv_hypot", "__ocml_hypot"}; + case TargetDeviceFunctionID::kRsqrt: { + return {"__nv_rsqrt", "__ocml_rsqrt"}; + } + case TargetDeviceFunctionID::kSin: { + return {"__nv_sin", "__ocml_sin"}; + } + case TargetDeviceFunctionID::kSqrt: { + return {"__nv_sqrt", "__ocml_sqrt"}; + } + case TargetDeviceFunctionID::kTanh: { + return {"__nv_tanh", "__ocml_tanh"}; } } } diff --git a/tensorflow/compiler/xla/service/gpu/target_util.h b/tensorflow/compiler/xla/service/gpu/target_util.h index 4355ed21136..2bdaea7734a 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.h +++ b/tensorflow/compiler/xla/service/gpu/target_util.h @@ -46,20 +46,21 @@ enum class TargetIntrinsicID { // Enumeration to get target specific device math function. enum class TargetDeviceFunctionID { - kPow = 0, - kErfcinv, - kLog, - kLog1p, - kSin, + kAtan2 = 0, kCos, + kErfcinv, kExp, kExpm1, - kSqrt, - kRsqrt, - kAtan2, kFmod, + kHypot, + kLog, + kLog1p, + kPow, kRound, - kHypot + kRsqrt, + kSin, + kSqrt, + kTanh, }; // Emits IR to call a device function named "callee_name" on the given From 86ea22210462f42c0c85920f5962f603b81a5e55 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Tue, 26 May 2020 05:46:06 -0700 Subject: [PATCH 1108/1533] Improve DotDecomposer to not add unnecessary non-contracting dimensions. 
These would be removed by AlgebraicSimplifier, then DotDecomposer would add them again, which makes the HloPassFix iterate until it hits the maximum number of iterations. Also consider operands of dots without non-contracting dimension to be canonical. PiperOrigin-RevId: 313174496 Change-Id: I8e8ac404198a9df01378820ad16834c9893336a5 --- .../compiler/xla/service/dot_decomposer.cc | 25 ++++--- .../xla/service/dot_decomposer_test.cc | 70 +++++++++++++++++++ 2 files changed, 87 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/dot_decomposer.cc b/tensorflow/compiler/xla/service/dot_decomposer.cc index 353a7f5cebc..40354dec3c6 100644 --- a/tensorflow/compiler/xla/service/dot_decomposer.cc +++ b/tensorflow/compiler/xla/service/dot_decomposer.cc @@ -31,7 +31,7 @@ namespace { // Convert a dot into a canonical form where non-contracting and contracting // dimensions are reshaped together and batch dimensions are the most major -// dimensions. The requires transposing and reshapes the lhs and rhs and +// dimensions. This requires transposing and reshapes of the lhs and rhs and // reshaping the output batch to the original shape. Status CanonicalizeDot(HloInstruction* original_dot) { auto computation = original_dot->parent(); @@ -80,7 +80,9 @@ Status CanonicalizeDot(HloInstruction* original_dot) { lhs_shape), original_dot->mutable_operand(0), lhs_transpose)); std::vector lhs_reshape_dims = batch_dim_sizes; - lhs_reshape_dims.push_back(lhs_non_contracting_size); + if (lhs_non_contracting_size > 1) { + lhs_reshape_dims.push_back(lhs_non_contracting_size); + } lhs_reshape_dims.push_back(lhs_contracting_size); // Reshape the contracting and non-contracting dimensions together. HloInstruction* reshaped_lhs = @@ -126,7 +128,9 @@ Status CanonicalizeDot(HloInstruction* original_dot) { std::vector rhs_reshape_dims = batch_dim_sizes; rhs_reshape_dims.push_back(rhs_contracting_size); - rhs_reshape_dims.push_back(rhs_non_contracting_size); + if (rhs_non_contracting_size > 1) { + rhs_reshape_dims.push_back(rhs_non_contracting_size); + } // Reshape the contracting and non-contracting dimensions together. HloInstruction* reshaped_rhs = computation->AddInstruction(HloInstruction::CreateReshape( @@ -134,15 +138,20 @@ Status CanonicalizeDot(HloInstruction* original_dot) { transposed_rhs)); std::vector dot_dims = batch_dim_sizes; - dot_dims.push_back(lhs_non_contracting_size); - dot_dims.push_back(rhs_non_contracting_size); + if (lhs_non_contracting_size > 1) { + dot_dims.push_back(lhs_non_contracting_size); + } + if (rhs_non_contracting_size > 1) { + dot_dims.push_back(rhs_non_contracting_size); + } DotDimensionNumbers dot_dnums; for (int64 i = 0; i < num_batch_dims; ++i) { dot_dnums.add_lhs_batch_dimensions(i); dot_dnums.add_rhs_batch_dimensions(i); } - dot_dnums.add_lhs_contracting_dimensions(num_batch_dims + 1); + dot_dnums.add_lhs_contracting_dimensions( + num_batch_dims + (lhs_non_contracting_size > 1 ? 1 : 0)); dot_dnums.add_rhs_contracting_dimensions(num_batch_dims); HloInstruction* dot = computation->AddInstruction(HloInstruction::CreateDot( @@ -174,9 +183,9 @@ StatusOr DotDecomposer::Run(HloModule* module) { } // A dot is not canonical if it has more than one non-contracting // dimension. 
- if (dnums.lhs_batch_dimensions_size() + 2 != + if (dnums.lhs_batch_dimensions_size() + 2 < instruction->operand(0)->shape().rank() || - dnums.rhs_batch_dimensions_size() + 2 != + dnums.rhs_batch_dimensions_size() + 2 < instruction->operand(1)->shape().rank()) { non_canonical_dots.push_back(instruction); continue; diff --git a/tensorflow/compiler/xla/service/dot_decomposer_test.cc b/tensorflow/compiler/xla/service/dot_decomposer_test.cc index 67fff50eaf6..c4152393933 100644 --- a/tensorflow/compiler/xla/service/dot_decomposer_test.cc +++ b/tensorflow/compiler/xla/service/dot_decomposer_test.cc @@ -50,5 +50,75 @@ TEST_F(DotDecomposerTest, CanonicalizeMultipleNonContractingDims) { op::Shape("f32[4032,512]")))); } +TEST_F(DotDecomposerTest, DontCanonicalizeIfNoNoncontractingDims) { + absl::string_view module_string = R"( + HloModule module + + ENTRY main { + p0 = f32[64,4]{1,0} parameter(0) + p1 = f32[64,4]{1,0} parameter(1) + ROOT dot = f32[64]{0} dot(p0, p1), lhs_batch_dims={0}, + lhs_contracting_dims={1}, + rhs_batch_dims={0}, + rhs_contracting_dims={1} + })"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN(bool canonicalized, + DotDecomposer().Run(module.get())); + EXPECT_FALSE(canonicalized); +} + +TEST_F(DotDecomposerTest, DontAddLhsNonContractingDimIfOne) { + absl::string_view module_string = R"( + HloModule module + + ENTRY main { + p0 = f32[64,4]{1,0} parameter(0) + p1 = f32[64,4,2,1]{3,2,1,0} parameter(1) + ROOT dot = f32[64,2,1]{2,1,0} dot(p0, p1), lhs_batch_dims={0}, + lhs_contracting_dims={1}, + rhs_batch_dims={0}, + rhs_contracting_dims={1} + })"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN(bool canonicalized, + DotDecomposer().Run(module.get())); + EXPECT_TRUE(canonicalized); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Reshape(AllOf(op::Dot(op::Reshape(), op::Reshape(), + /*lhs_contracting_dim=*/1, + /*rhs_contracting_dim=*/1), + op::Shape("f32[64,2]")))); +} + +TEST_F(DotDecomposerTest, DontAddRhsNonContractingDimIfOne) { + absl::string_view module_string = R"( + HloModule module + + ENTRY main { + p0 = f32[64,4,2,1]{3,2,1,0} parameter(0) + p1 = f32[64,4]{1,0} parameter(1) + ROOT dot = f32[64,2,1]{2,1,0} dot(p0, p1), lhs_batch_dims={0}, + lhs_contracting_dims={1}, + rhs_batch_dims={0}, + rhs_contracting_dims={1} + })"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseAndReturnVerifiedModule(module_string)); + TF_ASSERT_OK_AND_ASSIGN(bool canonicalized, + DotDecomposer().Run(module.get())); + EXPECT_TRUE(canonicalized); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Reshape(AllOf(op::Dot(op::Reshape(), op::Reshape(), + /*lhs_contracting_dim=*/2, + /*rhs_contracting_dim=*/1), + op::Shape("f32[64,2]")))); +} + } // namespace } // namespace xla From fd98070b2daece57e96d41f211e97fb16cf431e9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 06:24:06 -0700 Subject: [PATCH 1109/1533] Expose more XLA debug settings in Python. 
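The newly bound properties are xla_backend_optimization_level, xla_cpu_enable_xprof_traceme, xla_llvm_disable_expensive_passes and xla_test_all_input_layouts. A minimal sketch of what this enables from Python (hypothetical usage; how the DebugOptions instance is obtained, for example from the client's compile options, is an assumption and not part of this change):

  def tweak_debug_options(debug_options):
    # `debug_options` is assumed to be a DebugOptions object exposed by the
    # xla_extension module; only the newly bound properties are set here.
    debug_options.xla_backend_optimization_level = 2
    debug_options.xla_cpu_enable_xprof_traceme = True
    debug_options.xla_llvm_disable_expensive_passes = True
    debug_options.xla_test_all_input_layouts = False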
PiperOrigin-RevId: 313178400 Change-Id: I236f4f2180f4efd334cb58b445bc6f1ba47401d4 --- tensorflow/compiler/xla/python/xla.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index c75586c92a7..4cf2b36db27 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -1273,7 +1273,19 @@ PYBIND11_MODULE(xla_extension, m) { &DebugOptions::set_xla_cpu_fast_math_honor_functions) .def_property("xla_gpu_enable_fast_min_max", &DebugOptions::xla_gpu_enable_fast_min_max, - &DebugOptions::set_xla_gpu_enable_fast_min_max); + &DebugOptions::set_xla_gpu_enable_fast_min_max) + .def_property("xla_backend_optimization_level", + &DebugOptions::xla_backend_optimization_level, + &DebugOptions::set_xla_backend_optimization_level) + .def_property("xla_cpu_enable_xprof_traceme", + &DebugOptions::xla_cpu_enable_xprof_traceme, + &DebugOptions::set_xla_cpu_enable_xprof_traceme) + .def_property("xla_llvm_disable_expensive_passes", + &DebugOptions::xla_llvm_disable_expensive_passes, + &DebugOptions::set_xla_llvm_disable_expensive_passes) + .def_property("xla_test_all_input_layouts", + &DebugOptions::xla_test_all_input_layouts, + &DebugOptions::set_xla_test_all_input_layouts); py::class_(m, "ExecutableBuildOptions") .def(py::init<>()) From 2e842db3cca65aa5a88e5ac518243023f8e8c32b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 08:58:47 -0700 Subject: [PATCH 1110/1533] Allow normalize to accept integer data. PiperOrigin-RevId: 313200397 Change-Id: I382c3a9e986ffcfb419537bceeefc9d53d7fcb25 --- .../python/keras/layers/preprocessing/normalization.py | 4 ++++ .../python/keras/layers/preprocessing/normalization_test.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/tensorflow/python/keras/layers/preprocessing/normalization.py b/tensorflow/python/keras/layers/preprocessing/normalization.py index cf9600a63ab..2ae6fcb7ec2 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization.py @@ -107,6 +107,10 @@ class Normalization(CombinerPreprocessingLayer): super(Normalization, self).build(input_shape) def call(self, inputs): + # If the inputs are not floats, cast them to floats. This avoids issues + # with int-float multiplication and division below. + if inputs.dtype != K.floatx(): + inputs = math_ops.cast(inputs, K.floatx()) # We need to reshape the mean and variance data to ensure that Tensorflow # broadcasts the data correctly. 
mean = array_ops.reshape(self.mean, self._broadcast_shape) diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_test.py index 2e6f4990cc5..3503659f919 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization_test.py @@ -48,6 +48,12 @@ def _get_layer_computation_test_cases(): "test_data": np.array([[1.], [2.], [3.]], np.float32), "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), "testcase_name": "2d_single_element" + }, { + "adapt_data": np.array([[1], [2], [3], [4], [5]], dtype=np.int32), + "axis": -1, + "test_data": np.array([[1], [2], [3]], np.int32), + "expected": np.array([[-1.414214], [-.707107], [0]], np.float32), + "testcase_name": "2d_int_data" }, { "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32), "axis": None, From 09af9319d90e8eb4a8122f9dd535029936905f12 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Tue, 26 May 2020 09:21:15 -0700 Subject: [PATCH 1111/1533] Make sure the rendezvous abort check is finished before triggering the callback. PiperOrigin-RevId: 313204522 Change-Id: I88f38391d9ee2296fac9a6e86bb9f9d2c477f1c8 --- tensorflow/core/distributed_runtime/rpc/BUILD | 2 + .../rpc/rpc_rendezvous_mgr.cc | 10 +- .../rpc/rpc_rendezvous_mgr_test.cc | 109 +++++++++++++++++- .../core/distributed_runtime/test_utils.h | 16 +-- 4 files changed, 125 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD index 96e1a63e5a6..60d7172c2fc 100644 --- a/tensorflow/core/distributed_runtime/rpc/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/BUILD @@ -462,6 +462,8 @@ tf_cuda_cc_tests( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/distributed_runtime:test_utils", + "//tensorflow/core/platform:blocking_counter", ], ) diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc index b973421efa4..512c17fcfcf 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/notification.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { @@ -136,7 +137,12 @@ class RpcRecvTensorCall : public BaseRecvTensorCall { // Start the main RecvTensor call, checking for an async abort. void StartRTCall(std::function recv_done) { resp_.InitAlloc(dst_device_, alloc_attrs_); - auto cb = [this, recv_done = std::move(recv_done)](const Status& s) { + auto abort_checked = std::make_shared(); + auto cb = [this, abort_checked, + recv_done = std::move(recv_done)](const Status& s) { + // Make sure the Rendezvous abort checking is finished before running the + // callback, which might destroy the current call object. + abort_checked->WaitForNotification(); if (!s.ok()) { mutex_lock l(mu_); status_.Update(s); @@ -158,6 +164,8 @@ class RpcRecvTensorCall : public BaseRecvTensorCall { if (!s.ok()) { opts_.StartCancel(); } + // Notify that the abort check has finished. 
+ abort_checked->Notify(); } string src_worker_; diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc index 85923542f73..7c5779246bd 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr_test.cc @@ -16,13 +16,16 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h" #include "tensorflow/core/common_runtime/process_util.h" +#include "tensorflow/core/distributed_runtime/test_utils.h" #include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/control_flow.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/blocking_counter.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/random.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -48,13 +51,34 @@ Rendezvous::ParsedKey MakeKey(const string& s) { } namespace { +// A dummy worker interface implementation that simply triggers the callback +// with OK status for RecvTensor request. +class DummyWorker : public TestWorkerInterface { + public: + void RecvTensorAsync(CallOptions* opts, const RecvTensorRequest* request, + TensorResponse* response, StatusCallback done) override { + SchedClosure([done = std::move(done)]() { + // Simulate a random delay for RPC. This is needed to fill the entire + // object buffer in `RpcRecvTensorFreeList` and trigger the destruction of + // RPC call objects. + const int64 t_us = random::New64() % 100 * 1000; + Env::Default()->SleepForMicroseconds(t_us); + done(Status::OK()); + }); + } +}; + // Fake cache implementation for WorkerEnv. 
class DummyWorkerCache : public WorkerCacheInterface { void ListWorkers(std::vector* workers) const override {} void ListWorkersInJob(const string& job_name, std::vector* workers) const override {} WorkerInterface* GetOrCreateWorker(const string& target) override { - return nullptr; + if (dummy_remote_worker_ == nullptr) { + // Ownership transferred to WorkerFreeList + dummy_remote_worker_ = new DummyWorker; + } + return dummy_remote_worker_; } Status GetEagerClientCache( std::unique_ptr* eager_client_cache) override { @@ -66,7 +90,31 @@ class DummyWorkerCache : public WorkerCacheInterface { } void GetDeviceLocalityAsync(const string& device, DeviceLocality* locality, StatusCallback done) override {} + + private: + DummyWorker* dummy_remote_worker_ = nullptr; }; + +static Device* CreateDevice(const char* type, const char* name) { + class FakeDevice : public Device { + public: + explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {} + Status Sync() override { return Status::OK(); } + Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; } + }; + DeviceAttributes attr; + attr.set_name(name); + attr.set_device_type(type); + return new FakeDevice(attr); +} + +static DeviceMgr* CreateDeviceMgr() { + std::unique_ptr d0( + CreateDevice("CPU", "/job:mnist/replica:1/task:2/cpu:1")); + std::vector> devices; + devices.emplace_back(std::move(d0)); + return new StaticDeviceMgr(std::move(devices)); +} } // namespace class RpcRendezvousMgrTest : public ::testing::Test { @@ -75,7 +123,7 @@ class RpcRendezvousMgrTest : public ::testing::Test { : cache_(new DummyWorkerCache), worker_session_("rpc_session", "/job:mnist/replica:1/task:2", std::unique_ptr(cache_), - std::unique_ptr(), + std::unique_ptr(CreateDeviceMgr()), std::unique_ptr(), nullptr), rmgr_(&env) { env.env = Env::Default(); @@ -193,6 +241,7 @@ TEST_F(RpcRendezvousMgrTest, CancelAfterReceived) { delete cm; } +namespace { class DummyDeviceContext : public DeviceContext { public: explicit DummyDeviceContext(int stream_id) : stream_id_(stream_id) {} @@ -202,6 +251,7 @@ class DummyDeviceContext : public DeviceContext { private: const int stream_id_; }; +} // namespace TEST_F(RpcRendezvousMgrTest, TransferDummyDeviceContext) { DummyDeviceContext* dc = new DummyDeviceContext(123); @@ -237,6 +287,59 @@ TEST_F(RpcRendezvousMgrTest, TransferDummyDeviceContext) { dc->Unref(); } -// NOTE: Remote Send/Recv is better tested in worker_test.cc +TEST_F(RpcRendezvousMgrTest, RemoteRecvOne) { + const int64 step_id = 123; + const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey( + "/job:worker/replica:1/task:2/cpu:0", 7890, + "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0))); + { + RemoteRendezvous* rendez = rmgr_.Find(step_id); + TF_ASSERT_OK(rendez->Initialize(&worker_session_)); + core::ScopedUnref unref(rendez); + Rendezvous::Args args; + + Tensor val(DT_STRING); + bool val_dead = false; + + TF_ASSERT_OK(rendez->Recv(key, args, &val, &val_dead)); + } + rmgr_.Cleanup(step_id); +} + +TEST_F(RpcRendezvousMgrTest, RemoteRecvAsyncMany) { + const int64 step_id = 123; + const Rendezvous::ParsedKey key = MakeKey(Rendezvous::CreateKey( + "/job:worker/replica:1/task:2/cpu:0", 7890, + "/job:mnist/replica:1/task:2/cpu:1", "foo", FrameAndIter(0, 0))); + { + RemoteRendezvous* rendez = rmgr_.Find(step_id); + TF_ASSERT_OK(rendez->Initialize(&worker_session_)); + core::ScopedUnref unref(rendez); + Rendezvous::Args args; + + // Send a large number of async RPC requests to fill up the buffer in + // 
`RpcRecvTensorFreeList`, in order to test deleting RPC call objects. + int num_requests = 10000; + Tensor val(DT_STRING); + mutex mu_; + Status status = Status::OK(); + BlockingCounter counter(num_requests); + + for (int i = 0; i < num_requests; i++) { + rendez->RecvAsync( + key, args, + [&mu_, &status, &counter](const Status& s, const Rendezvous::Args&, + const Rendezvous::Args&, const Tensor&, + const bool) { + mutex_lock l(mu_); + status.Update(s); + counter.DecrementCount(); + }); + } + counter.Wait(); + TF_ASSERT_OK(status); + } + rmgr_.Cleanup(step_id); +} } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/test_utils.h b/tensorflow/core/distributed_runtime/test_utils.h index a93c78e62fd..cec09775469 100644 --- a/tensorflow/core/distributed_runtime/test_utils.h +++ b/tensorflow/core/distributed_runtime/test_utils.h @@ -70,28 +70,28 @@ class TestWorkerInterface : public WorkerInterface { void CleanupGraphAsync(const CleanupGraphRequest* request, CleanupGraphResponse* response, StatusCallback done) override { - done(errors::Unimplemented("RunGraphAsync")); + done(errors::Unimplemented("CleanupGraphAsync")); } void CleanupAllAsync(const CleanupAllRequest* request, CleanupAllResponse* response, StatusCallback done) override { - done(errors::Unimplemented("RunGraphAsync")); + done(errors::Unimplemented("CleanupAllAsync")); } void RecvTensorAsync(CallOptions* opts, const RecvTensorRequest* request, TensorResponse* response, StatusCallback done) override { - done(errors::Unimplemented("RunGraphAsync")); + done(errors::Unimplemented("RecvTensorAsync")); } void LoggingAsync(const LoggingRequest* request, LoggingResponse* response, StatusCallback done) override { - done(errors::Unimplemented("RunGraphAsync")); + done(errors::Unimplemented("LoggingAsync")); } void TracingAsync(const TracingRequest* request, TracingResponse* response, StatusCallback done) override { - done(errors::Unimplemented("RunGraphAsync")); + done(errors::Unimplemented("TracingAsync")); } void RecvBufAsync(CallOptions* opts, const RecvBufRequest* request, @@ -103,20 +103,20 @@ class TestWorkerInterface : public WorkerInterface { const CompleteGroupRequest* request, CompleteGroupResponse* response, StatusCallback done) override { - done(errors::Unimplemented("RunGraphAsync")); + done(errors::Unimplemented("CompleteGroupAsync")); } void CompleteInstanceAsync(CallOptions* ops, const CompleteInstanceRequest* request, CompleteInstanceResponse* response, StatusCallback done) override { - done(errors::Unimplemented("RunGraphAsync")); + done(errors::Unimplemented("CompleteInstanceAsync")); } void GetStepSequenceAsync(const GetStepSequenceRequest* request, GetStepSequenceResponse* response, StatusCallback done) override { - done(errors::Unimplemented("RunGraphAsync")); + done(errors::Unimplemented("GetStepSequenceAsync")); } }; From a92ff929b818c7dbca2d0c2648ae17e8d6ae3a40 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 09:31:32 -0700 Subject: [PATCH 1112/1533] Fix dimensionality handling issues in TextVectorization. 
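The layer no longer requires string inputs of shape (batch, 1): 1D, 2D and ragged 3D inputs are indexed while keeping their outer dimensions, and splitting now squeezes the innermost dimension rather than axis 1. A short sketch based on the new test cases in text_vectorization_test.py (the vocabulary and inputs are copied from the tests; the public import path and eager printing are assumptions, not part of this change):

  import tensorflow as tf

  layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
      max_tokens=None, standardize=None, split=None)
  layer.set_vocabulary(["a", "b", "c", "d"])
  # 2D input: each row is indexed independently; "0" and "f" map to the OOV index.
  print(layer(tf.ragged.constant([["0", "a", "b", "c", "d"], ["f"]])))
  # Ragged 3D string input is now accepted too, preserving the nested structure.
  print(layer(tf.ragged.constant([[["0", "a", "b"], ["c", "d"]], [["f"]]])))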
PiperOrigin-RevId: 313206360 Change-Id: I5929e83b26011c975561e525b90aef6949a185b2 --- .../keras/layers/preprocessing/table_utils.py | 2 + .../preprocessing/text_vectorization.py | 24 ++++---- .../preprocessing/text_vectorization_test.py | 57 ++++++++++++++++++- 3 files changed, 68 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py index 05447f6e9ff..16ac633f8dd 100644 --- a/tensorflow/python/keras/layers/preprocessing/table_utils.py +++ b/tensorflow/python/keras/layers/preprocessing/table_utils.py @@ -87,6 +87,8 @@ class TableHandler(object): self.table.lookup, inputs) indexed_data = ragged_functional_ops.map_flat_values( self._replace_oov_buckets, inputs, indexed_data) + # table.lookup is not shape-preserving, so we need to set the shape here. + indexed_data._set_shape(inputs.shape) # pylint: disable=protected-access # Composite tensors can pass tensor values through, which will cause # errors if all operations in the TF graph do so. We can break this chain # with an identity here. diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 057575d4ecc..28d339ea5b1 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -490,11 +490,12 @@ class TextVectorization(CombinerPreprocessingLayer): # in None for undefined shape axes. If using 'and !=', this causes the # expression to evaluate to False instead of True if the shape is undefined; # the expression needs to evaluate to True in that case. - if self._split is not None and not input_shape[1] == 1: # pylint: disable=g-comparison-negation - raise RuntimeError( - "When using TextVectorization to tokenize strings, the first " - "dimension of the input array must be 1, got shape " - "{}".format(input_shape)) + if self._split is not None: + if input_shape.ndims > 1 and not input_shape[-1] == 1: # pylint: disable=g-comparison-negation + raise RuntimeError( + "When using TextVectorization to tokenize strings, the innermost " + "dimension of the input array must be 1, got shape " + "{}".format(input_shape)) super(TextVectorization, self).build(input_shape) @@ -536,7 +537,8 @@ class TextVectorization(CombinerPreprocessingLayer): # If we are splitting, we validate that the 1st axis is of dimension 1 and # so can be squeezed out. We do this here instead of after splitting for # performance reasons - it's more expensive to squeeze a ragged tensor. - inputs = array_ops.squeeze(inputs, axis=1) + if inputs.shape.ndims > 1: + inputs = array_ops.squeeze(inputs, axis=-1) if self._split == SPLIT_ON_WHITESPACE: # This treats multiple whitespaces as one whitespace, and strips leading # and trailing whitespace. @@ -561,8 +563,6 @@ class TextVectorization(CombinerPreprocessingLayer): def call(self, inputs): if isinstance(inputs, (list, tuple, np.ndarray)): inputs = ops.convert_to_tensor(inputs) - if inputs.shape.rank == 1: - inputs = array_ops.expand_dims(inputs, axis=-1) self._called = True inputs = self._preprocess(inputs) @@ -570,9 +570,7 @@ class TextVectorization(CombinerPreprocessingLayer): # If we're not doing any output processing, return right away. 
if self._output_mode is None: return inputs - indexed_data = self._index_lookup_layer(inputs) - if self._output_mode == INT: # Once we have the dense tensor, we can return it if we weren't given a # fixed output sequence length. If we were, though, we have to dynamically @@ -585,7 +583,6 @@ class TextVectorization(CombinerPreprocessingLayer): dense_data = indexed_data if self._output_sequence_length is None: - dense_data.set_shape(tensor_shape.TensorShape((None, None))) return dense_data else: sequence_len = K.shape(dense_data)[1] @@ -596,8 +593,9 @@ class TextVectorization(CombinerPreprocessingLayer): sequence_len < self._output_sequence_length, true_fn=pad_fn, false_fn=slice_fn) - output_tensor.set_shape( - tensor_shape.TensorShape((None, self._output_sequence_length))) + output_shape = output_tensor.shape.as_list() + output_shape[-1] = self._output_sequence_length + output_tensor.set_shape(tensor_shape.TensorShape(output_shape)) return output_tensor # If we're not returning integers here, we rely on the vectorization layer diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index 5d909498d8a..508f222eac7 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -355,6 +355,59 @@ class TextVectorizationLayerTest(keras_parameterized.TestCase, if context.executing_eagerly(): self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + @parameterized.named_parameters( + { + "testcase_name": "1d", + "data": ["0", "a", "b", "c", "d", "e", "a", "b", "c", "d", "f"], + "expected": [1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1] + }, + { + "testcase_name": "2d", + "data": [["0", "a", "b", "c", "d"], ["e", "a", "b", "c", "d"], ["f"]], + "expected": [[1, 2, 3, 4, 5], [1, 2, 3, 4, 5], [1, 0, 0, 0, 0]] + }, + { + "testcase_name": + "3d", + "data": [[["0", "a", "b"], ["c", "d"]], [["e", "a"], ["b", "c", "d"]], + [["f"]]], + "expected": [[[1, 2, 3], [4, 5, 0]], [[1, 2, 0], [3, 4, 5]], + [[1, 0, 0], [0, 0, 0]]] + }, + ) + def test_layer_dimensionality_handling(self, data, expected): + vocab = ["a", "b", "c", "d"] + vectorization = get_layer_class()( + max_tokens=None, standardize=None, split=None, pad_to_max_tokens=False) + vectorization.set_vocabulary(vocab) + output = vectorization(ragged_factory_ops.constant(data)) + self.assertAllEqual(expected, output) + + @parameterized.named_parameters( + { + "testcase_name": "1d", + "data": ["0 a b c d e a b c d f"], + "expected": [[1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1]] + }, + { + "testcase_name": + "3d", + "data": [[["0 a b"], ["c d"]], [["e a"], ["b c d"]], [["f"]]], + "expected": [[[1, 2, 3], [4, 5, 0]], [[1, 2, 0], [3, 4, 5]], + [[1, 0, 0], [0, 0, 0]]] + }, + ) + def test_layer_dimensionality_handling_with_split(self, data, expected): + vocab = ["a", "b", "c", "d"] + vectorization = get_layer_class()( + max_tokens=None, + standardize=None, + split=text_vectorization.SPLIT_ON_WHITESPACE, + pad_to_max_tokens=False) + vectorization.set_vocabulary(vocab) + output = vectorization(ragged_factory_ops.constant(data, inner_shape=(1,))) + self.assertAllEqual(expected, output) + @keras_parameterized.run_all_keras_modes class TextVectorizationPreprocessingTest( @@ -580,7 +633,7 @@ class TextVectorizationPreprocessingTest( split=text_vectorization.SPLIT_ON_WHITESPACE, output_mode=None) with self.assertRaisesRegex(RuntimeError, - ".*tokenize strings, the first dimension.*"): + 
".*tokenize strings, the innermost dime.*"): _ = layer(input_data) def test_string_splitting_with_non_1d_raggedarray_fails(self): @@ -591,7 +644,7 @@ class TextVectorizationPreprocessingTest( split=text_vectorization.SPLIT_ON_WHITESPACE, output_mode=None) with self.assertRaisesRegex(RuntimeError, - ".*tokenize strings, the first dimension.*"): + ".*tokenize strings, the innermost dime.*"): _ = layer(input_data) def test_standardization_with_invalid_standardize_arg(self): From f684ae97cd895f2d150f6e41a9012c2f9a5a40e9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 09:55:14 -0700 Subject: [PATCH 1113/1533] Add a clear error message when users attempt to pass multi-dimensional arrays to table_utils. PiperOrigin-RevId: 313210934 Change-Id: Id45d84de3061efc9c1f17e6523512c4c41054e8b --- .../python/keras/layers/preprocessing/table_utils.py | 6 ++++++ .../keras/layers/preprocessing/table_utils_test.py | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py index 16ac633f8dd..cf1bfd741c9 100644 --- a/tensorflow/python/keras/layers/preprocessing/table_utils.py +++ b/tensorflow/python/keras/layers/preprocessing/table_utils.py @@ -21,6 +21,7 @@ import collections import numpy as np from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.keras import backend as K from tensorflow.python.ops import array_ops @@ -60,6 +61,11 @@ class TableHandler(object): raise RuntimeError("Size mismatch between values and key arrays. " "Keys had size %s, values had size %s." % (len(keys), len(values))) + keys = ops.convert_to_tensor(keys, dtype=self.table._key_dtype) # pylint: disable=protected-access + values = ops.convert_to_tensor(values, dtype=self.table._value_dtype) # pylint: disable=protected-access + if values.shape.ndims != 1: + raise ValueError("`values` must be 1-dimensional, got an input with " + " %s dimensions." % values.shape.ndims) self._run(self.table.insert(keys, values)) def _replace_oov_buckets(self, inputs, lookups): diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils_test.py b/tensorflow/python/keras/layers/preprocessing/table_utils_test.py index 60a891f6ba8..ab7e80b628c 100644 --- a/tensorflow/python/keras/layers/preprocessing/table_utils_test.py +++ b/tensorflow/python/keras/layers/preprocessing/table_utils_test.py @@ -108,6 +108,15 @@ class CategoricalEncodingInputTest( self.assertAllEqual(expected_output, output_data) + def test_tensor_multi_dim_values_fails(self): + key_data = np.array([0, 1], dtype=np.int64) + value_data = np.array([[11, 12], [21, 22]]) + + table = get_table(dtype=dtypes.int64, oov_tokens=[1, 2]) + + with self.assertRaisesRegexp(ValueError, "must be 1-dimensional"): + table.insert(key_data, value_data) + @keras_parameterized.run_all_keras_modes class CategoricalEncodingMultiOOVTest( From 6aece71ebf756d32ea730576a7ff12d2cfc7b242 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Tue, 26 May 2020 09:55:55 -0700 Subject: [PATCH 1114/1533] Reduce nest.map_structure and nest.pack_sequence_as time by ~20% for common use cases (nested structures of list, tuple, dict). Places relatively cheap type checks for list, tuple, and dict before other more expensive checks. 
Specifically, this avoids calling expensive checks like isinstance(structure, collections.abc.Mapping) and nest._is_named_tuple in the most common cases (since these abc isinstance checks take ~10x as long as normal isinstance checks). This reduces the Python overhead of a sample 10-layer Keras Functional Model __call__ by ~5%. PiperOrigin-RevId: 313211095 Change-Id: I227a3dc379eefef31060698d8c5be5f4bf2c1f50 --- tensorflow/python/util/nest.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/util/nest.py b/tensorflow/python/util/nest.py index 695cc4cc909..b4736bee142 100644 --- a/tensorflow/python/util/nest.py +++ b/tensorflow/python/util/nest.py @@ -215,7 +215,15 @@ def _yield_sorted_items(iterable): Yields: The iterable's (key, value) pairs, in order of sorted keys. """ - if isinstance(iterable, _collections_abc.Mapping): + # Ordered to check common structure types (list, tuple, dict) first. + if isinstance(iterable, list): + for item in enumerate(iterable): + yield item + # namedtuples handled separately to avoid expensive namedtuple check. + elif type(iterable) == tuple: # pylint: disable=unidiomatic-typecheck + for item in enumerate(iterable): + yield item + elif isinstance(iterable, (dict, _collections_abc.Mapping)): # Iterate through dictionaries in a deterministic order by sorting the # keys. Notice this means that we ignore the original order of `OrderedDict` # instances. This is intentional, to avoid potential bugs caused by mixing From 51504ec873c6d670253e106e325fd8ba965dcf0c Mon Sep 17 00:00:00 2001 From: RJ Skerry-Ryan Date: Tue, 26 May 2020 09:57:48 -0700 Subject: [PATCH 1115/1533] Expand CompositeTensors in GradientTape.watch. For built-in and user-defined CompositeTensors this is useful to be able to watch the composite without having to manually pick specific tensors within it to watch. PiperOrigin-RevId: 313211503 Change-Id: I16a3fa178aa39a4e06d9b35e9fe40f06b10adcac --- tensorflow/python/eager/backprop.py | 2 +- tensorflow/python/eager/backprop_test.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index 7a3dce7db4e..dc7bb7c4b11 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -882,7 +882,7 @@ class GradientTape(object): Raises: ValueError: if it encounters something that is not a tensor. 
""" - for t in nest.flatten(tensor): + for t in nest.flatten(tensor, expand_composites=True): if not (_pywrap_utils.IsTensor(t) or _pywrap_utils.IsVariable(t)): raise ValueError("Passed in object of type {}, not tf.Tensor".format( type(t))) diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py index b28aaa3a626..a0f98fc0a44 100644 --- a/tensorflow/python/eager/backprop_test.py +++ b/tensorflow/python/eager/backprop_test.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util @@ -48,6 +49,7 @@ from tensorflow.python.ops import nn_grad # pylint: disable=unused-import from tensorflow.python.ops import nn_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variables from tensorflow.python.training import training @@ -1484,6 +1486,19 @@ class BackpropTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegexp(ValueError, 'ndarray'): g.watch(np.array(1.)) + def testWatchComposite(self): + """Test that tape.watch expands composites and watches component Tensors.""" + with backprop.GradientTape() as t: + values = constant_op.constant([1.0, 2.0], dtypes.float32) + s = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=values, + dense_shape=[3, 4]) + t.watch(s) + z = sparse_ops.sparse_reduce_sum_v2(s) + result = t.gradient(z, values) + self.assertAllEqual(result, [1.0, 1.0]) + def testWatchedVariablesAfterNonPersistentGradientCall(self): with backprop.GradientTape(persistent=False) as tape: x = resource_variable_ops.ResourceVariable(1.0) From 956278ab3dd157a531111f12a92a4785fd238f46 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Tue, 26 May 2020 09:59:28 -0700 Subject: [PATCH 1116/1533] Make GrpcEagerClientCache::GetClient thread safe. 
PiperOrigin-RevId: 313211894 Change-Id: I3195db70af77816183cf041d024f694c32613164 --- .../core/distributed_runtime/rpc/eager/BUILD | 19 ++++++ .../rpc/eager/grpc_eager_client.cc | 5 +- .../rpc/eager/grpc_eager_client_test.cc | 58 +++++++++++++++++++ 3 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc diff --git a/tensorflow/core/distributed_runtime/rpc/eager/BUILD b/tensorflow/core/distributed_runtime/rpc/eager/BUILD index c1deabc23cd..ff362c3411f 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/eager/BUILD @@ -1,4 +1,5 @@ load("//tensorflow:tensorflow.bzl", "tf_grpc_cc_dependency") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") package( default_visibility = [ @@ -57,3 +58,21 @@ cc_library( tf_grpc_cc_dependency(), ], ) + +tf_cc_test( + name = "grpc_eager_client_test", + size = "small", + srcs = [ + "grpc_eager_client_test.cc", + ], + deps = [ + ":grpc_eager_client", + "//tensorflow/c:tf_status_headers", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/distributed_runtime/rpc:grpc_channel", + "//tensorflow/core/platform:blocking_counter", + "//tensorflow/core/platform:status", + "//tensorflow/core/platform:strcat", + ], +) diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index c8288f28c36..4e3da8b00e0 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -240,6 +240,7 @@ class GrpcEagerClientCache : public EagerClientCache { Status GetClient(const string& target, core::RefCountPtr* client) override { + mutex_lock l(clients_mu_); auto it = clients_.find(target); if (it == clients_.end()) { tensorflow::SharedGrpcChannelPtr shared = @@ -281,7 +282,9 @@ class GrpcEagerClientCache : public EagerClientCache { } std::shared_ptr cache_; - std::unordered_map> clients_; + mutable mutex clients_mu_; + std::unordered_map> clients_ + TF_GUARDED_BY(clients_mu_); std::vector> threads_; }; diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc new file mode 100644 index 00000000000..a6da56eca13 --- /dev/null +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client_test.cc @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h" + +#include "tensorflow/c/tf_status.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace eager { + +TEST(GrpcEagerClientCache, TestGetClientThreadSafety) { + GrpcChannelSpec spec; + TF_ASSERT_OK(spec.AddHostPortsJob( + "worker", {"a:1", "b:2", "c:3", "d:4", "e:5", "f:6"})); + ChannelCreationFunction channel_func = + ConvertToChannelCreationFunction(NewHostPortGrpcChannel); + auto channel_cache = std::shared_ptr( + NewGrpcChannelCache(spec, channel_func)); + std::unique_ptr client_cache( + NewGrpcEagerClientCache(channel_cache)); + const int num_calls = 10; + BlockingCounter counter(num_calls); + + for (int i = 0; i < num_calls; i++) { + Env::Default()->SchedClosure([&client_cache, i, &counter]() { + string target = strings::StrCat("/job:worker/replica:0/task:", i); + core::RefCountPtr eager_client; + Status s = client_cache->GetClient(target, &eager_client); + // With 6 tasks added to the job, querying client for 0--5 should be OK, + // and querying client for 6+ should give invalid argument error. + error::Code expected_code = i <= 5 ? error::OK : error::INVALID_ARGUMENT; + EXPECT_EQ(expected_code, s.code()); + counter.DecrementCount(); + }); + } + counter.Wait(); +} + +} // namespace eager +} // namespace tensorflow From 0e80859784a3d00db0fb815e4c08666120b2e806 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 10:14:32 -0700 Subject: [PATCH 1117/1533] add a context manager and function decorator for monitoring time. PiperOrigin-RevId: 313215897 Change-Id: I42aff9a8b95079a3c8929d32d747e778eba3c6dd --- tensorflow/python/eager/monitoring.py | 45 ++++++++++++++++++++++ tensorflow/python/eager/monitoring_test.py | 22 +++++++++++ 2 files changed, 67 insertions(+) diff --git a/tensorflow/python/eager/monitoring.py b/tensorflow/python/eager/monitoring.py index 26d4d8a55b3..74d98558192 100644 --- a/tensorflow/python/eager/monitoring.py +++ b/tensorflow/python/eager/monitoring.py @@ -19,6 +19,8 @@ from __future__ import division from __future__ import print_function import collections +import functools +import time from tensorflow.core.framework import summary_pb2 from tensorflow.python import pywrap_tfe @@ -428,3 +430,46 @@ class Sampler(Metric): def get_cell(self, *labels): """Retrieves the cell.""" return SamplerCell(super(Sampler, self).get_cell(*labels)) + + +class MonitoredTimer(object): + """A context manager to measure the walltime and increment a Counter cell.""" + + def __init__(self, cell): + """Creates a new MonitoredTimer. + + Args: + cell: the cell associated with the time metric that will be inremented. + """ + self.cell = cell + + def __enter__(self): + self.t = time.time() + return self + + def __exit__(self, exception_type, exception_value, traceback): + del exception_type, exception_value, traceback + micro_seconds = (time.time() - self.t) * 1000000 + self.cell.increase_by(int(micro_seconds)) + + +def monitored_timer(cell): + """A function decorator for adding MonitoredTimer support. + + Arguments: + cell: the cell associated with the time metric that will be inremented. 
+ Returns: + A decorator that measure the function runtime and increment the specified + counter cell. + """ + + def actual_decorator(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + with MonitoredTimer(cell): + return func(*args, **kwargs) + + return wrapper + + return actual_decorator diff --git a/tensorflow/python/eager/monitoring_test.py b/tensorflow/python/eager/monitoring_test.py index 3f601735ef2..7cb8c0c2cd1 100644 --- a/tensorflow/python/eager/monitoring_test.py +++ b/tensorflow/python/eager/monitoring_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import time + from tensorflow.python.eager import monitoring from tensorflow.python.eager import test from tensorflow.python.framework import errors @@ -100,6 +102,26 @@ class MonitoringTest(test_util.TensorFlowTestCase): self.assertEqual(histogram_proto1.num, 2.0) self.assertEqual(histogram_proto1.sum, 6.0) + def test_context_manager(self): + counter = monitoring.Counter('test/ctxmgr', 'test context manager', 'slot') + with monitoring.MonitoredTimer(counter.get_cell('short')): + time.sleep(0.001) + with monitoring.MonitoredTimer(counter.get_cell('long')): + time.sleep(0.02) + self.assertGreater( + counter.get_cell('long').value(), + counter.get_cell('short').value()) + + def test_function_decorator(self): + counter = monitoring.Counter('test/funcdecorator', 'test func decorator') + + @monitoring.monitored_timer(counter.get_cell()) + def timed_function(seconds): + time.sleep(seconds) + + timed_function(0.001) + self.assertGreater(counter.get_cell().value(), 1000) + if __name__ == '__main__': test.main() From 831a55584749593400807e0baa7478476b5dbc70 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Tue, 26 May 2020 10:17:29 -0700 Subject: [PATCH 1118/1533] Add an attribute "is_packed" to TPUReplicatedInput op which indicates whether the per-replica inputs are packed into one input. PiperOrigin-RevId: 313216599 Change-Id: I9e9a38ee0fcb64caca9f2d1e2de268c9576ca6c8 --- tensorflow/core/ops/tpu_replication_ops.cc | 2 ++ tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt | 2 +- tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/ops/tpu_replication_ops.cc b/tensorflow/core/ops/tpu_replication_ops.cc index 3bb94044e14..a729d3c3b7b 100644 --- a/tensorflow/core/ops/tpu_replication_ops.cc +++ b/tensorflow/core/ops/tpu_replication_ops.cc @@ -44,6 +44,8 @@ REGISTER_OP("TPUReplicatedInput") .Attr("is_mirrored_variable: bool = false") // Index of the input. If is_mirrored_variable is true, this is ignored. 
.Attr("index: int = -1") + // All inputs are packed into one input + .Attr("is_packed: bool = false") .SetShapeFn([](InferenceContext* c) { ShapeHandle cur = c->input(c->num_inputs() - 1); for (int i = c->num_inputs() - 2; i >= 0; --i) { diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index a5fe83e713e..37a95cc88d1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -4606,7 +4606,7 @@ tf_module { } member_method { name: "TPUReplicatedInput" - argspec: "args=[\'inputs\', \'is_mirrored_variable\', \'index\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'None\'], " + argspec: "args=[\'inputs\', \'is_mirrored_variable\', \'index\', \'is_packed\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'False\', \'None\'], " } member_method { name: "TPUReplicatedOutput" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index a5fe83e713e..37a95cc88d1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -4606,7 +4606,7 @@ tf_module { } member_method { name: "TPUReplicatedInput" - argspec: "args=[\'inputs\', \'is_mirrored_variable\', \'index\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'None\'], " + argspec: "args=[\'inputs\', \'is_mirrored_variable\', \'index\', \'is_packed\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'-1\', \'False\', \'None\'], " } member_method { name: "TPUReplicatedOutput" From 74e98c29aa24c9eccc44ca41b8e85a02235a4db7 Mon Sep 17 00:00:00 2001 From: Lev Proleev Date: Tue, 26 May 2020 10:19:14 -0700 Subject: [PATCH 1119/1533] Add NNAPI delegate support for fused HardSwish PiperOrigin-RevId: 313217072 Change-Id: I492a1d6c7b2b5968a29a24b0cef1c82e15898dad --- .../lite/delegates/nnapi/nnapi_delegate.cc | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index b3967800b44..fd6703bd46a 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -660,8 +660,10 @@ class NNAPIOpBuilder { // Lower hardswish according to the following equation: // hard_swish[x] = x (ReLU6(x + 3)) / 6 == x * (Relu_N1_to_1(x/3) * 3 + 3) / 6 // = 0.5x * Relu_N1_to_1(x/3) + 0.5x - TfLiteStatus AddHardSwish(int lite_input_index, int lite_output_index, - bool need_int8_conversion, int lite_node_index) { + TfLiteStatus TransformHardSwishIntoSupportedOps(int lite_input_index, + int lite_output_index, + bool need_int8_conversion, + int lite_node_index) { const TfLiteTensor& tensor = context_->tensors[lite_input_index]; float input_scale = tensor.params.scale; int input_zero_point = tensor.params.zero_point; @@ -2425,6 +2427,9 @@ TfLiteStatus NNAPIDelegateKernel::Map( mapping_args.builder->AddScalarInt32Operand(builtin->activation); *nn_op_type = ANEURALNETWORKS_FULLY_CONNECTED; } break; + case kTfLiteBuiltinHardSwish: { + *nn_op_type = ANEURALNETWORKS_HARD_SWISH; + } break; case kTfLiteBuiltinSoftmax: { auto builtin = reinterpret_cast( mapping_args.node->builtin_data); @@ -3635,10 +3640,14 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, input_tensor_flags |= NN_TENSOR_FLAG_SCALAR_AS_TENSOR; 
} - // h_swish will be lowered into supported NNAPI operations. - if (reg->builtin_code == kTfLiteBuiltinHardSwish) { - builder.AddHardSwish(node->inputs->data[0], node->outputs->data[0], - need_int8_conversion, node_index); + // On SDK level less than 30, h_swish will be lowered into supported NNAPI + // operations. Since SDK level 30, h_swish is supported as a single + // operation. + if (reg->builtin_code == kTfLiteBuiltinHardSwish && + nnapi_->android_sdk_version < kMinSdkVersionForNNAPI13) { + builder.TransformHardSwishIntoSupportedOps( + node->inputs->data[0], node->outputs->data[0], need_int8_conversion, + node_index); continue; } // Map inputs to NN API tensor indices. From e8786b80d7b14f174ce56d408cc1f0dda2a2f303 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 10:19:32 -0700 Subject: [PATCH 1120/1533] Add doctests to Normalization, TextVectorization, and Discretization layers. PiperOrigin-RevId: 313217146 Change-Id: I463399f0cf792f25b82168263e24463c96328e2c --- .../layers/preprocessing/discretization.py | 10 +++ .../layers/preprocessing/normalization.py | 15 ++++ .../preprocessing/normalization_test.py | 1 + .../preprocessing/text_vectorization.py | 69 ++++++++++--------- 4 files changed, 61 insertions(+), 34 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/discretization.py b/tensorflow/python/keras/layers/preprocessing/discretization.py index 003b6e64f90..3052cfb4369 100644 --- a/tensorflow/python/keras/layers/preprocessing/discretization.py +++ b/tensorflow/python/keras/layers/preprocessing/discretization.py @@ -52,6 +52,16 @@ class Discretization(Layer): exclude the right boundary, so `bins=[0., 1., 2.]` generates bins `(-inf, 0.)`, `[0., 1.)`, `[1., 2.)`, and `[2., +inf)`. output_mode: One of 'int', 'binary'. Defaults to 'int'. + + Examples: + + Bucketize float values based on provided buckets. + >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) + >>> layer = Discretization(bins=[0., 1., 2.]) + >>> layer(input) + """ def __init__(self, bins, output_mode=INTEGER, **kwargs): diff --git a/tensorflow/python/keras/layers/preprocessing/normalization.py b/tensorflow/python/keras/layers/preprocessing/normalization.py index 2ae6fcb7ec2..be04e9947b8 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization.py @@ -55,6 +55,21 @@ class Normalization(CombinerPreprocessingLayer): in the specified axis. If set to 'None', the layer will perform scalar normalization (diving the input by a single scalar value). 0 (the batch axis) is not allowed. + + + Examples: + + Calculate the mean and variance by analyzing the dataset in `adapt`. 
+ + >>> adapt_data = np.array([[1.], [2.], [3.], [4.], [5.]], dtype=np.float32) + >>> input_data = np.array([[1.], [2.], [3.]], np.float32) + >>> layer = Normalization() + >>> layer.adapt(adapt_data) + >>> layer(input_data) + """ def __init__(self, axis=-1, dtype=None, **kwargs): diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_test.py index 3503659f919..e5a429751f4 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization_test.py @@ -146,6 +146,7 @@ class NormalizationTest(keras_parameterized.TestCase, self.validate_accumulator_extract(combiner, data, expected) self.validate_accumulator_extract_and_restore(combiner, data, expected) + @parameterized.named_parameters( { "data": np.array([[1], [2], [3], [4], [5]]), diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 28d339ea5b1..c80f998fe46 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -157,42 +157,43 @@ class TextVectorization(CombinerPreprocessingLayer): Example: This example instantiates a TextVectorization layer that lowercases text, splits on whitespace, strips punctuation, and outputs integer vocab indices. - ``` - max_features = 5000 # Maximum vocab size. - max_len = 40 # Sequence length to pad the outputs to. - # Create the layer. - vectorize_layer = text_vectorization.TextVectorization( - max_tokens=max_features, - output_mode='int', - output_sequence_length=max_len) + >>> text_dataset = tf.data.Dataset.from_tensor_slices(["foo", "bar", "baz"]) + >>> max_features = 5000 # Maximum vocab size. + >>> max_len = 4 # Sequence length to pad the outputs to. + >>> embedding_dims = 2 + >>> + >>> # Create the layer. + >>> vectorize_layer = TextVectorization( + ... max_tokens=max_features, + ... output_mode='int', + ... output_sequence_length=max_len) + >>> + >>> # Now that the vocab layer has been created, call `adapt` on the text-only + >>> # dataset to create the vocabulary. You don't have to batch, but for large + >>> # datasets this means we're not keeping spare copies of the dataset. + >>> vectorize_layer.adapt(text_dataset.batch(64)) + >>> + >>> # Create the model that uses the vectorize text layer + >>> model = tf.keras.models.Sequential() + >>> + >>> # Start by creating an explicit input layer. It needs to have a shape of + >>> # (1,) (because we need to guarantee that there is exactly one string + >>> # input per batch), and the dtype needs to be 'string'. + >>> model.add(tf.keras.Input(shape=(1,), dtype=tf.string)) + >>> + >>> # The first layer in our model is the vectorization layer. After this + >>> # layer, we have a tensor of shape (batch_size, max_len) containing vocab + >>> # indices. + >>> model.add(vectorize_layer) + >>> + >>> # Now, the model can map strings to integers, and you can add an embedding + >>> # layer to map these integers to learned embeddings. + >>> input_data = [["foo qux bar"], ["qux baz"]] + >>> model.predict(input_data) + array([[2, 1, 4, 0], + [1, 3, 0, 0]]) - # Now that the vocab layer has been created, call `adapt` on the text-only - # dataset to create the vocabulary. You don't have to batch, but for large - # datasets this means we're not keeping spare copies of the dataset in memory. 
- vectorize_layer.adapt(text_dataset.batch(64)) - - # Create the model that uses the vectorize text layer - model = tf.keras.models.Sequential() - - # Start by creating an explicit input layer. It needs to have a shape of (1,) - # (because we need to guarantee that there is exactly one string input per - # batch), and the dtype needs to be 'string'. - model.add(tf.keras.Input(shape=(1,), dtype=tf.string)) - - # The first layer in our model is the vectorization layer. After this layer, - # we have a tensor of shape (batch_size, max_len) containing vocab indices. - model.add(vectorize_layer) - - # Next, we add a layer to map those vocab indices into a space of - # dimensionality 'embedding_dims'. Note that we're using max_features+1 here, - # since there's an OOV token that gets added to the vocabulary in - # vectorize_layer. - model.add(tf.keras.layers.Embedding(max_features+1, embedding_dims)) - - # At this point, you have embedded float data representing your tokens, and - # can add whatever other layers you need to create your model. - ``` """ # TODO(momernick): Add an examples section to the docstring. From fbb92d9e276eadce6edda8e65a070b5520bdeb93 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Tue, 26 May 2020 10:21:29 -0700 Subject: [PATCH 1121/1533] Add offset arg in Rescaling layer. PiperOrigin-RevId: 313217664 Change-Id: I1c962740eef3d16fcf6272fd274f0c54450159a2 --- .../preprocessing/image_preprocessing.py | 18 ++++++++++++++---- .../preprocessing/image_preprocessing_test.py | 10 +++++----- ...experimental.preprocessing.-rescaling.pbtxt | 2 +- ...experimental.preprocessing.-rescaling.pbtxt | 2 +- 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py index 832915dac68..e4b92e44e69 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py @@ -292,11 +292,16 @@ class RandomCrop(Layer): @keras_export('keras.layers.experimental.preprocessing.Rescaling') class Rescaling(Layer): - """Multiply inputs by `scale`. + """Multiply inputs by `scale` and adds `offset`. - For instance, to rescale an input in the `[0, 255]` range + For instance: + + 1. To rescale an input in the `[0, 255]` range to be in the `[0, 1]` range, you would pass `scale=1./255`. + 2. To rescale an input in the `[0, 255]` range to be in the `[-1, 1]` range, + you would pass `scale=1./127.5, offset=-1`. + The rescaling is applied both during training and inference. Input shape: @@ -307,16 +312,20 @@ class Rescaling(Layer): Arguments: scale: Float, the scale to apply to the inputs. + offset: Float, the offset to apply to the inputs. name: A string, the name of the layer. 
""" - def __init__(self, scale, name=None, **kwargs): + def __init__(self, scale, offset=0., name=None, **kwargs): self.scale = scale + self.offset = offset super(Rescaling, self).__init__(name=name, **kwargs) def call(self, inputs): dtype = self._compute_dtype - return math_ops.cast(inputs, dtype) * math_ops.cast(self.scale, dtype) + scale = math_ops.cast(self.scale, dtype) + offset = math_ops.cast(self.offset, dtype) + return math_ops.cast(inputs, dtype) * scale + offset def compute_output_shape(self, input_shape): return input_shape @@ -324,6 +333,7 @@ class Rescaling(Layer): def get_config(self): config = { 'scale': self.scale, + 'offset': self.offset, } base_config = super(Rescaling, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 38d2d25916a..14720d3541d 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -306,7 +306,7 @@ class RescalingTest(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_rescaling_base(self): - kwargs = {'scale': 0.004} + kwargs = {'scale': 1./127.5, 'offset': -1.} testing_utils.layer_test( image_preprocessing.Rescaling, kwargs=kwargs, @@ -315,18 +315,18 @@ class RescalingTest(keras_parameterized.TestCase): @tf_test_util.run_v2_only def test_rescaling_correctness_float(self): - layer = image_preprocessing.Rescaling(0.004) + layer = image_preprocessing.Rescaling(scale=1./127.5, offset=-1.) inputs = random_ops.random_uniform((2, 4, 5, 3)) outputs = layer(inputs) - self.assertAllClose(outputs.numpy(), inputs.numpy() * 0.004) + self.assertAllClose(outputs.numpy(), inputs.numpy() * (1./127.5) - 1) @tf_test_util.run_v2_only def test_rescaling_correctness_int(self): - layer = image_preprocessing.Rescaling(0.004) + layer = image_preprocessing.Rescaling(scale=1./127.5, offset=-1) inputs = random_ops.random_uniform((2, 4, 5, 3), 0, 100, dtype='int32') outputs = layer(inputs) self.assertEqual(outputs.dtype.name, 'float32') - self.assertAllClose(outputs.numpy(), inputs.numpy() * 0.004) + self.assertAllClose(outputs.numpy(), inputs.numpy() * (1./127.5) - 1) def test_config_with_custom_name(self): layer = image_preprocessing.Rescaling(0.5, name='rescaling') diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt index 7036fb926a8..60c0bc92f81 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt @@ -113,7 +113,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'scale\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + argspec: "args=[\'self\', \'scale\', \'offset\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.0\', \'None\'], " } member_method { name: "add_loss" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt index 7036fb926a8..60c0bc92f81 100644 --- 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-rescaling.pbtxt @@ -113,7 +113,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'scale\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + argspec: "args=[\'self\', \'scale\', \'offset\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'0.0\', \'None\'], " } member_method { name: "add_loss" From 1e053782142362d7853c491b0ea7f032a374810c Mon Sep 17 00:00:00 2001 From: Robert David Date: Tue, 26 May 2020 10:25:21 -0700 Subject: [PATCH 1122/1533] Fix typo: space between 'on' and 'Android' PiperOrigin-RevId: 313218645 Change-Id: I13f1d8195b267db23ec3dc6d8aacaff5fa6184e7 --- tensorflow/lite/tools/delegates/gpu_delegate_provider.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/tools/delegates/gpu_delegate_provider.cc b/tensorflow/lite/tools/delegates/gpu_delegate_provider.cc index db1f32b2282..62805b2644b 100644 --- a/tensorflow/lite/tools/delegates/gpu_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/gpu_delegate_provider.cc @@ -154,8 +154,8 @@ TfLiteDelegatePtr GpuDelegateProvider::CreateTfLiteDelegate( delegate = TfLiteDelegatePtr(TFLGpuDelegateCreate(&gpu_opts), &TFLGpuDelegateDelete); #else - TFLITE_LOG(WARN) << "The GPU delegate compile options are only supported on" - "Android or iOS platforms."; + TFLITE_LOG(WARN) << "The GPU delegate compile options are only supported " + "on Android or iOS platforms."; delegate = evaluation::CreateGPUDelegate(); #endif From 64ce2afd2f5026b41683efdc821b8653787cac67 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Tue, 26 May 2020 10:42:18 -0700 Subject: [PATCH 1123/1533] Add flow support to XPlane PiperOrigin-RevId: 313222922 Change-Id: I35cba1610d0512fceb220c7abfb61a7d777db5d8 --- .../core/profiler/utils/xplane_schema.cc | 1 + .../core/profiler/utils/xplane_schema.h | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 28d5d303940..3705a4786fa 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -163,6 +163,7 @@ const StatTypeMap& GetStatTypeMap() { {"stream", kStream}, // Stats added when processing traces. {"group_id", kGroupId}, + {"flow", kFlow}, {"step_name", kStepName}, {"level 0", kLevel0}, {"tf_op", kTfOp}, diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index 98264c3d6e4..de8dc32a4f1 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -155,6 +155,7 @@ enum StatType { kStream, // Stats added when processing traces. kGroupId, + kFlow, kStepName, kLevel0, kTfOp, @@ -209,6 +210,38 @@ inline bool IsInternalStat(absl::optional stat_type) { stat_type == StatType::kLevel0; } +// Support for flow events: +// This class enables encoding/decoding the flow id and direction, stored as +// XStat value. 
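+// For example (illustrative values): a flow with id 5 flowing out of an
+// event is stored as XFlow(5, XFlow::kFlowOut).ToStatValue(), i.e.
+// (5 << 2) | 0x2 == 22, and XFlow::FromStatValue(22) recovers Id() == 5
+// and Direction() == kFlowOut.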
+class XFlow { + public: + enum FlowDirection { + kFlowUnspecified = 0x0, + kFlowIn = 0x1, + kFlowOut = 0x2, + kFlowInOut = 0x3, + }; + + XFlow(uint64 flow_id, FlowDirection direction) + : encoded_((flow_id << 2) | (direction & 0x3)) { + DCHECK_NE(Direction(), kFlowUnspecified); + } + + // Encoding + uint64 ToStatValue() const { return encoded_; } + + // Decoding + static XFlow FromStatValue(uint64 encoded) { return XFlow(encoded); } + + uint64 Id() const { return (encoded_ >> 2); } + FlowDirection Direction() const { return FlowDirection(encoded_ & 0x3); } + + private: + explicit XFlow(uint64 encoded) : encoded_(encoded) {} + + uint64 encoded_; +}; + } // namespace profiler } // namespace tensorflow From 509325e1b12df34e5d06117ac58242de58bd7798 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 10:48:43 -0700 Subject: [PATCH 1124/1533] Adding memcached_file_block_cache to gstpufs PiperOrigin-RevId: 313224456 Change-Id: I19892a14a57bd4172f7434b47f946c88ba5eeaa9 --- tensorflow/core/platform/cloud/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD index 101d7ac5807..2440549a353 100644 --- a/tensorflow/core/platform/cloud/BUILD +++ b/tensorflow/core/platform/cloud/BUILD @@ -20,6 +20,7 @@ package_group( packages = [ "//learning/brain/tfrc/...", "//tensorflow/...", + "//third_party/gstpufs/...", ], ) From 444ea7fa7f6ef7f0a4cfe15d2e78f107b61b198e Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Tue, 26 May 2020 11:10:11 -0700 Subject: [PATCH 1125/1533] Add tf.Case op to TF dialect and its legalization to xla_hlo.case. PiperOrigin-RevId: 313229441 Change-Id: Idcf29c834eefccdce33fed25e923d334d03da931 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 45 ++++++++++++++-- .../xla/tests/legalize-tf-control-flow.mlir | 51 ++++++++++++++++-- .../transforms/legalize_tf_control_flow.cc | 52 +++++++++++++++---- 3 files changed, 133 insertions(+), 15 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 957ba4909a9..1df8f7fd519 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1195,6 +1195,46 @@ subsequent operation and then be optimized away, however.) }]; } +def TF_CaseOp : TF_Op<"Case", []> { + let summary = [{ +An n-way switch statement which calls a single branch function. + }]; + + let description = [{ +An n-way switch statement, implementing the following: + ``` + switch (branch_index) { + case 0: + output = branches[0](input); + break; + case 1: + output = branches[1](input); + break; + ... 
+ case [[nbranches-1]]: + default: + output = branches[nbranches-1](input); + break; + } + ``` + }]; + + let arguments = (ins + I32Tensor:$branch_index, + Variadic:$input, + + Confined]>:$branches, + DefaultValuedAttr:$output_shapes + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; +} + def TF_CastOp : TF_Op<"Cast", [NoSideEffect, SameOperandsAndResultShape]> { let summary = "Cast x of type SrcT to y of DstT."; @@ -8317,9 +8357,8 @@ def TF_StackV2Op : TF_Op<"StackV2", []> { ); } -def TF_StopGradientOp : TF_Op<"StopGradient", - [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { - let summary = "Stops gradient computation"; +def TF_StopGradientOp : TF_Op<"StopGradient", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { + let summary = "Stops gradient computation."; let description = [{ When executed in a graph, this op outputs its input tensor as-is. diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir index 61f82fcad19..b3307a8f52a 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-control-flow.mlir @@ -1,7 +1,7 @@ // RUN: tf-opt -xla-legalize-tf-control-flow %s | FileCheck %s --dump-input-on-failure -// CHECK-LABEL: @conditional -func @conditional(%arg0: tensor, %arg1: tensor) -> (tensor) +// CHECK-LABEL: @if +func @if(%arg0: tensor, %arg1: tensor) -> (tensor) attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { // CHECK: [[VAL0:%.+]] = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor %0 = "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "GT"} : (tensor, tensor) -> tensor @@ -40,7 +40,52 @@ attributes {tf._input_shapes = ["tfshape$", "tfshape$"]} { return %0 : tensor } -// CHECK-LABEL: @while + +// CHECK-LABEL: func @case +// CHECK-SAME: %[[BRANCH_INDEX:.*]]: tensor, %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: tensor) -> (tensor, tensor) +func @case(%index: tensor, %arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0:2 = "tf.Case"(%index, %arg0, %arg1) {branches = [@exponential, @log, @floor]} : (tensor, tensor, tensor) -> (tensor, tensor) + // CHECK: %[[TUPLE_INPUT:.*]] = "xla_hlo.tuple"(%[[ARG0]], %[[ARG1]]) : (tensor, tensor) -> tuple, tensor> + // CHECK: %[[CASE:.*]]:2 = "xla_hlo.case"(%[[BRANCH_INDEX]], %[[TUPLE_INPUT]], %[[TUPLE_INPUT]], %[[TUPLE_INPUT]]) ( { + // CHECK: ^bb0(%[[TUPLE_ARG:.*]]: tuple, tensor>): + // CHECK: %[[TUPLE_ELEMENT_0:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 0 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[TUPLE_ELEMENT_1:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 1 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[CALL_EXP:.*]]:2 = call @exponential(%[[TUPLE_ELEMENT_0]], %[[TUPLE_ELEMENT_1]]) : (tensor, tensor) -> (tensor, tensor) + // CHECK: "xla_hlo.return"(%[[CALL_EXP]]#0, %[[CALL_EXP]]#1) : (tensor, tensor) -> () + // CHECK: }, { + // CHECK: ^bb0(%[[TUPLE_ARG:.*]]: tuple, tensor>): + // CHECK: %[[TUPLE_ELEMENT_0:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 0 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[TUPLE_ELEMENT_1:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 1 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[CALL_LOG:.*]]:2 = call @log(%[[TUPLE_ELEMENT_0]], %[[TUPLE_ELEMENT_1]]) : (tensor, tensor) 
-> (tensor, tensor) + // CHECK: "xla_hlo.return"(%[[CALL_LOG]]#0, %[[CALL_LOG]]#1) : (tensor, tensor) -> () + // CHECK: }, { + // CHECK: ^bb0(%[[TUPLE_ARG:.*]]: tuple, tensor>): + // CHECK: %[[TUPLE_ELEMENT_0:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 0 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[TUPLE_ELEMENT_1:.*]] = "xla_hlo.get_tuple_element"(%[[TUPLE_ARG]]) {index = 1 : i32} : (tuple, tensor>) -> tensor + // CHECK: %[[CALL_FLOOR:.*]]:2 = call @floor(%[[TUPLE_ELEMENT_0]], %[[TUPLE_ELEMENT_1]]) : (tensor, tensor) -> (tensor, tensor) + // CHECK: "xla_hlo.return"(%[[CALL_FLOOR]]#0, %[[CALL_FLOOR]]#1) : (tensor, tensor) -> () + // CHECK: }) : (tensor, tuple, tensor>, tuple, tensor>, tuple, tensor>) -> (tensor, tensor) + return %0#0, %0#1 : tensor, tensor +// CHECK: return %[[CASE]]#0, %[[CASE]]#1 : tensor, tensor +} + +func @exponential(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0 = "xla_hlo.exponential"(%arg1) : (tensor) -> tensor + return %0, %arg1 : tensor, tensor +} + +func @log(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0 = "xla_hlo.log"(%arg0) : (tensor) -> tensor + return %0, %arg1 : tensor, tensor +} + +func @floor(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0 = "xla_hlo.floor"(%arg0) : (tensor) -> tensor + return %0, %arg1 : tensor, tensor +} + + +// CHECK-LABEL: func @while func @while(%arg0: tensor {tf_saved_model.index_path = [0]}) -> (tensor {tf_saved_model.index_path = []}) attributes {tf._input_shapes = ["tfshape$"]} { // CHECK: [[VAL0:%.+]] = xla_hlo.constant dense<0> diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc index ef13e66568d..d5e5b6f5a71 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_control_flow.cc @@ -81,7 +81,6 @@ void Detuple(Value tuple, Operation::result_range replace, OpBuilder* builder) { // results of the if operation are tupled together. void ImportXlaRegion(mlir::FuncOp func, Region* dest_region, Location loc, bool tuple_return = true) { - BlockAndValueMapping mapper; OpBuilder builder(dest_region); auto entry_block = builder.createBlock(dest_region); @@ -111,11 +110,9 @@ void LowerIf(TF::IfOp op, ModuleOp module) { // XLA prefers tuple arguments for control flow due to XLA not supporting // multiple return values. SmallVector inputs(op.input()); - builder.setInsertionPoint(op); auto tuple_input = builder.create(loc, inputs); // Create the new if op with tuple inputs. - SmallVector operands(op.getOperands()); auto result_type = builder.getTupleType(op.getResultTypes()); auto if_op = builder.create(loc, result_type, op.cond(), tuple_input, tuple_input); @@ -123,18 +120,45 @@ void LowerIf(TF::IfOp op, ModuleOp module) { // Import the regions for both the true and false cases. These regions // must be updated to tuple the return results together and use the xla hlo // return op. - BlockAndValueMapping mapper; auto then_branch = module.lookupSymbol(op.then_branch()); auto else_branch = module.lookupSymbol(op.else_branch()); ImportXlaRegion(then_branch, &if_op.true_branch(), loc); ImportXlaRegion(else_branch, &if_op.false_branch(), loc); // De-tuple the results of the xla hlo if result. 
- builder.setInsertionPointAfter(op); Detuple(if_op.getResult(), op.getResults(), &builder); op.erase(); } +void LowerCase(TF::CaseOp op, ModuleOp module) { + Location loc = op.getLoc(); + OpBuilder builder(op); + + // XLA requires one argument per branch so we create a tuple of inputs to pass + // to each branch. + SmallVector inputs(op.input()); + auto tuple_input = builder.create(loc, inputs); + + // Create replica of input tuple for each branch + SmallVector n_tuple_inputs(op.branches().size(), tuple_input); + + // Create the new case op with tuple inputs. + auto case_op = builder.create( + loc, op.getResultTypes(), op.branch_index(), n_tuple_inputs, + op.branches().size()); + + // Import the regions for all branches. + for (unsigned i = 0; i < op.branches().size(); ++i) { + mlir::FuncOp branch_func = module.lookupSymbol( + op.branches()[i].cast()); + ImportXlaRegion(branch_func, &case_op.branches()[i], loc, + /*tuple_return=*/false); + } + + op.replaceAllUsesWith(case_op.getResults()); + op.erase(); +} + void LowerWhile(TF::WhileOp op, ModuleOp module) { Location loc = op.getLoc(); OpBuilder builder(op); @@ -146,7 +170,6 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { Value tuple_input = builder.create(loc, inputs); // Create the new while op with tuple inputs. - SmallVector operands(op.getOperands()); auto while_op = builder.create( loc, builder.getTupleType(op.getResultTypes()), tuple_input); @@ -159,7 +182,6 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { ImportXlaRegion(cond_branch, &while_op.cond(), loc, /*tuple_return=*/false); // De-tuple the results of the xla hlo while. - builder.setInsertionPointAfter(op); Detuple(while_op.getResult(), op.getResults(), &builder); op.erase(); } @@ -168,8 +190,20 @@ void LowerWhile(TF::WhileOp op, ModuleOp module) { void LegalizeTFControlFlow::runOnOperation() { auto module = getOperation(); - module.walk([&](TF::WhileOp op) -> void { LowerWhile(op, module); }); - module.walk([&](TF::IfOp op) -> void { LowerIf(op, module); }); + module.walk([&](Operation* op) { + if (auto while_op = dyn_cast(op)) { + LowerWhile(while_op, module); + return; + } + if (auto if_op = dyn_cast(op)) { + LowerIf(if_op, module); + return; + } + if (auto case_op = dyn_cast(op)) { + LowerCase(case_op, module); + return; + } + }); } } // namespace xla_hlo } // namespace mlir From 48296300d6194e4622c0ea6447ffc3c22d4f5329 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 11:20:59 -0700 Subject: [PATCH 1126/1533] [XLA] Add support for sinking broadcasts through ops with multiple broadcasts operands. 
Add support for sinking this kind of pattern: p0 = f32[4] parameter(0) p1 = f32[4] parameter(1) b0 = f32[4,2] broadcast(p0), dimensions={0} b1 = f32[4,2] broadcast(p1), dimensions={0} ROOT multiply = f32[4,2] multiply(b1, b0) into: p0 = f32[4] parameter(0) p1 = f32[4] parameter(1) multiply = f32[4] multiply(p1, p0) ROOT out = f32[4,2] broadcast(multiply) PiperOrigin-RevId: 313231737 Change-Id: Ic508b3cf30daaf1a2aa9246886ef63ad49be6a01 --- .../xla/service/algebraic_simplifier.cc | 54 ++++++++++---- .../xla/service/algebraic_simplifier_test.cc | 73 +++++++++++++++++++ 2 files changed, 113 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index e0a8b87c83b..4025cb46f18 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -3058,6 +3058,20 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( return false; } HloInstruction* operand = broadcast->mutable_operand(0); + auto is_scalar_broadcast = [](const HloInstruction* instruction) { + return instruction->opcode() == HloOpcode::kBroadcast && + ShapeUtil::IsScalar(instruction->operand(0)->shape()); + }; + auto is_equal_broadcast = [operand, + broadcast](const HloInstruction* instruction) { + return instruction->opcode() == HloOpcode::kBroadcast && + ShapeUtil::Equal(operand->shape(), + instruction->operand(0)->shape()) && + broadcast->dimensions() == instruction->dimensions(); + }; + auto is_compatible_broadcast = [&](const HloInstruction* instruction) { + return is_scalar_broadcast(instruction) || is_equal_broadcast(instruction); + }; for (HloInstruction* user : broadcast->users()) { if (user->user_count() == 0 && user != computation_->root_instruction()) { continue; @@ -3076,18 +3090,20 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( continue; } - // Find the unique non-scalar operand or continue if there isn't one. - int64 scalar_broadcast_count = 0; + // Check if all the operands of the user are compatible broadcasts for + // sinking. 
(They are either scalar broadcasts or broadcasts casting + // from/to the same shape/dimensions) + int64 compatible_broadcast_count = 0; int64 broadcast_use_count = 0; for (HloInstruction* user_operand : user->operands()) { - if (user_operand->opcode() == HloOpcode::kBroadcast && - ShapeUtil::IsScalar(user_operand->operand(0)->shape())) { - ++scalar_broadcast_count; + if (is_compatible_broadcast(user_operand)) { + ++compatible_broadcast_count; } else if (broadcast == user_operand) { ++broadcast_use_count; } } - if (scalar_broadcast_count + broadcast_use_count != user->operand_count()) { + if (compatible_broadcast_count + broadcast_use_count != + user->operand_count()) { continue; } std::vector new_operands; @@ -3095,14 +3111,24 @@ AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( Shape changed_shape; for (HloInstruction* user_operand : user->operands()) { - if (user_operand->opcode() == HloOpcode::kBroadcast && - ShapeUtil::IsScalar(user_operand->operand(0)->shape())) { - changed_shape = ShapeUtil::ChangeElementType( - operand->shape(), user_operand->shape().element_type()); - simplifier_->UpdateLayout(&changed_shape); - new_operands.push_back( - computation_->AddInstruction(HloInstruction::CreateBroadcast( - changed_shape, user_operand->mutable_operand(0), {}))); + // If this is a broadcast operand that is not our original broadcast input + // to this function then we might need to change the input. + if (is_compatible_broadcast(user_operand)) { + // If this is a broadcast from a scalar value rewrite a broadcast from + // the scalar to the new shape enforced from the other broadcast + // operands. + if (is_scalar_broadcast(user_operand)) { + changed_shape = ShapeUtil::ChangeElementType( + operand->shape(), user_operand->shape().element_type()); + simplifier_->UpdateLayout(&changed_shape); + new_operands.push_back( + computation_->AddInstruction(HloInstruction::CreateBroadcast( + changed_shape, user_operand->mutable_operand(0), {}))); + } else { + // For the non-scalar broadcasts we guarantee that the shape of the + // operand of the broadcast needs to be already a compatible shape. 
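+          // (This holds because is_equal_broadcast above only accepts
+          // broadcasts whose operand shape and broadcast dimensions match
+          // those of `broadcast`, so its operand can be reused directly.)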
+ new_operands.push_back(user_operand->mutable_operand(0)); + } } else { CHECK_EQ(broadcast, user_operand); new_operands.push_back(operand); diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 3ac47821654..bcfc2fdc740 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -338,6 +338,79 @@ TEST_F(AlgebraicSimplifierTest, MultiplyReassociateMergeBroadcastedConstants) { m::ConstantScalar(3.0)))))); } +TEST_F(AlgebraicSimplifierTest, ElementwiseSinkMultipleBroadcastsScalar) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[] parameter(0) + p1 = f32[] parameter(1) + b0 = f32[4] broadcast(p0), dimensions={} + b1 = f32[4] broadcast(p1), dimensions={} + ROOT multiply = f32[4] multiply(b1, b0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast(m::Multiply(m::Broadcast(m::Parameter(1)), + m::Broadcast(m::Parameter(0)))))); +} + +TEST_F(AlgebraicSimplifierTest, ElementwiseSinkMultipleBroadcastsConstantMix) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + c0 = f32[] constant(2.0) + b0 = f32[4,2] broadcast(c0), dimensions={} + b1 = f32[4,2] broadcast(p0), dimensions={0} + ROOT multiply = f32[4,2] multiply(b1, b0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast(m::Multiply( + m::Parameter(0), m::Broadcast(m::ConstantScalar(2.0)))))); +} + +TEST_F(AlgebraicSimplifierTest, ElementwiseSinkMultipleBroadcastsNonScalar) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + p1 = f32[4] parameter(1) + b0 = f32[4,2] broadcast(p0), dimensions={0} + b1 = f32[4,2] broadcast(p1), dimensions={0} + ROOT multiply = f32[4,2] multiply(b1, b0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Broadcast(m::Multiply(m::Parameter(1), m::Parameter(0))))); +} + +TEST_F(AlgebraicSimplifierTest, ElementwiseNoSinkBroadcastsDifferentDims) { + const char* kModuleStr = R"( + HloModule m + test { + p0 = f32[4] parameter(0) + p1 = f32[8] parameter(1) + b0 = f32[4,8] broadcast(p0), dimensions={0} + b1 = f32[4,8] broadcast(p1), dimensions={1} + ROOT multiply = f32[4,8] multiply(b1, b0) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT(m->entry_computation()->root_instruction(), + GmockMatch(m::Multiply(m::Broadcast(m::Parameter(1)), + m::Broadcast(m::Parameter(0))))); +} + TEST_F(AlgebraicSimplifierTest, MultiplyReassociateMultiplyOfConstantAndBroadcast) { const char* kModuleStr = R"( From 24879e0878b9724562bc26855e1f916efcd2aec3 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Tue, 26 May 2020 23:09:29 +0530 Subject: [PATCH 1127/1533] Fix LHLO to affine conversion to use affine.load/stores The conversions from LHLO to affine on pointwise binary ops were 
generating std dialect load/stores instead of affine.load/stores. These ops' memref access functions are identity functions on surrounding loop induction variables and as such are always affine. Fix this. Signed-off-by: Uday Bondhugula --- .../xla/tests/lhlo-legalize-to-affine.mlir | 18 +++++++++--------- .../xla/transforms/lhlo_legalize_to_affine.cc | 6 +++--- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir index 08ba9f02f3e..aaf65b5a38a 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-affine.mlir @@ -5,15 +5,15 @@ func @min_op(%lhs: memref<4x3x2x1xf32>, %rhs: memref<4x3x2x1xf32>, %result: memref<4x3x2x1xf32>) -> () { // CHECK-NEXT: affine.for %[[I:.*]] = 0 to 4 { - // CHECK-NEXT: affine.for %[[J:.*]] = 0 to 3 { - // CHECK-NEXT: affine.for %[[K:.*]] = 0 to 2 { - // CHECK-NEXT: affine.for %[[L:.*]] = 0 to 1 { - // CHECK-NEXT: %[[LHS:.*]] = load %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> - // CHECK-NEXT: %[[RHS:.*]] = load %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> - // CHECK-NEXT: %[[MIN_PREDICATE:.*]] = cmpf "olt", %[[LHS]], %[[RHS]] : f32 - // CHECK-NEXT: %[[MIN:.*]] = select %[[MIN_PREDICATE]], %[[LHS]], %[[RHS]] : f32 - // CHECK-NEXT: store %[[MIN]], %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> - // CHECK: return + // CHECK-NEXT: affine.for %[[J:.*]] = 0 to 3 { + // CHECK-NEXT: affine.for %[[K:.*]] = 0 to 2 { + // CHECK-NEXT: affine.for %[[L:.*]] = 0 to 1 { + // CHECK-NEXT: %[[LHS:.*]] = affine.load %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> + // CHECK-NEXT: %[[RHS:.*]] = affine.load %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> + // CHECK-NEXT: %[[MIN_PREDICATE:.*]] = cmpf "olt", %[[LHS]], %[[RHS]] : f32 + // CHECK-NEXT: %[[MIN:.*]] = select %[[MIN_PREDICATE]], %[[LHS]], %[[RHS]] : f32 + // CHECK-NEXT: affine.store %[[MIN]], %{{.*}}[%[[I]], %[[J]], %[[K]], %[[L]]] : memref<4x3x2x1xf32> + // CHECK: return "xla_lhlo.minimum"(%lhs, %rhs, %result) {name = "min.1"} : (memref<4x3x2x1xf32>, memref<4x3x2x1xf32>, memref<4x3x2x1xf32>) -> () return diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc index 2921a49ba70..f7f5537f882 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_affine.cc @@ -54,14 +54,14 @@ struct BinaryOpConverter : public OpRewritePattern { induction_vars.push_back(forOp.getInductionVar()); rewriter.setInsertionPointToStart(forOp.getBody()); } - auto l = rewriter.create(loc, lhs, induction_vars); - auto r = rewriter.create(loc, rhs, induction_vars); + auto l = rewriter.create(loc, lhs, induction_vars); + auto r = rewriter.create(loc, rhs, induction_vars); Value opResult = xla_lhlo::XlaOpToStdScalarOp::map( op, element_type, {l, r}, &rewriter); if (opResult == nullptr) { return failure(); } - rewriter.create(loc, opResult, op.out(), induction_vars); + rewriter.create(loc, opResult, op.out(), induction_vars); rewriter.eraseOp(op); return success(); } From dd7849ed4c4c304ce15f6a95ff4d95c9f4af97bb Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Tue, 26 May 2020 11:50:13 -0700 Subject: [PATCH 1128/1533] Fix issue where calling plot_model on Functional model that uses 
add_loss would crash due to model._layers containing DictWrapper objects. PiperOrigin-RevId: 313237777 Change-Id: I1e9685242f3c5d887340fbcfed6f4709681c7cb7 --- tensorflow/python/keras/utils/vis_utils.py | 3 ++- .../python/keras/utils/vis_utils_test.py | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py index 158f6c83748..e56f07e4bb7 100644 --- a/tensorflow/python/keras/utils/vis_utils.py +++ b/tensorflow/python/keras/utils/vis_utils.py @@ -129,6 +129,7 @@ def model_to_dot(model, sub_w_first_node = {} sub_w_last_node = {} + layers = model.layers if not model._is_graph_network: node = pydot.Node(str(id(model)), label=model.name) dot.add_node(node) @@ -136,7 +137,7 @@ def model_to_dot(model, elif isinstance(model, sequential.Sequential): if not model.built: model.build() - layers = model._layers + layers = super(sequential.Sequential, model).layers # Create graph nodes. for i, layer in enumerate(layers): diff --git a/tensorflow/python/keras/utils/vis_utils_test.py b/tensorflow/python/keras/utils/vis_utils_test.py index 34bc835da32..984014216be 100644 --- a/tensorflow/python/keras/utils/vis_utils_test.py +++ b/tensorflow/python/keras/utils/vis_utils_test.py @@ -21,6 +21,7 @@ from __future__ import print_function from tensorflow.python import keras from tensorflow.python.keras.utils import vis_utils from tensorflow.python.lib.io import file_io +from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -67,6 +68,32 @@ class ModelToDotFormatTest(test.TestCase): except ImportError: pass + def test_plot_model_with_add_loss(self): + inputs = keras.Input(shape=(None, 3)) + outputs = keras.layers.Dense(1)(inputs) + model = keras.Model(inputs, outputs) + model.add_loss(math_ops.reduce_mean(outputs)) + dot_img_file = 'model_3.png' + try: + vis_utils.plot_model( + model, to_file=dot_img_file, show_shapes=True, expand_nested=True) + self.assertTrue(file_io.file_exists(dot_img_file)) + file_io.delete_file(dot_img_file) + except ImportError: + pass + + model = keras.Sequential([ + keras.Input(shape=(None, 3)), keras.layers.Dense(1)]) + model.add_loss(math_ops.reduce_mean(model.output)) + dot_img_file = 'model_4.png' + try: + vis_utils.plot_model( + model, to_file=dot_img_file, show_shapes=True, expand_nested=True) + self.assertTrue(file_io.file_exists(dot_img_file)) + file_io.delete_file(dot_img_file) + except ImportError: + pass + if __name__ == '__main__': test.main() From 8c8dc2699bc8d91345ae7ea6d38a20f15efb8f31 Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Tue, 26 May 2020 12:12:38 -0700 Subject: [PATCH 1129/1533] Cleanup and refactor all allocations in MicroAllocator to function calls. This change is a precursor to adding a new memory logging MicroAllocator subclass that will enable TFLM to keep track of tensor arena tail allocations. Outside of moving all arena allocations to utility methods - I also cleaned up the organization of the methods inside of the cc file. 
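
A rough sketch of the resulting call sequence (method names are from this
change; `allocator`, `op_resolver` and `ctx` stand in for whatever objects
the caller, e.g. MicroInterpreter::AllocateTensors, already holds):

  NodeAndRegistration* node_and_registrations = nullptr;
  // Allocates the node/registration array and fills it from the flatbuffer,
  // including any builtin/custom operator data.
  TF_LITE_ENSURE_OK(ctx, allocator.InitializeFromFlatbuffer(
                             op_resolver, &node_and_registrations));
  // Afterwards, plans and allocates the input/output/intermediate tensors.
  TF_LITE_ENSURE_OK(ctx, allocator.FinishTensorAllocation());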
PiperOrigin-RevId: 313242666 Change-Id: Icddcc07187419fe314bc57708170cda8cd35690a --- tensorflow/lite/micro/micro_allocator.cc | 284 ++++++++++-------- tensorflow/lite/micro/micro_allocator.h | 58 +++- tensorflow/lite/micro/micro_allocator_test.cc | 6 +- tensorflow/lite/micro/micro_interpreter.cc | 2 +- 4 files changed, 202 insertions(+), 148 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 1dd1fa4b63c..b67e158980d 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -258,7 +258,7 @@ TfLiteStatus CommitPlan(ErrorReporter* error_reporter, MemoryPlanner* planner, namespace internal { -TfLiteStatus InitializeRuntimeTensor( +TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( SimpleMemoryAllocator* allocator, const tflite::Tensor& flatbuffer_tensor, const flatbuffers::Vector>* buffers, ErrorReporter* error_reporter, TfLiteTensor* result) { @@ -380,58 +380,9 @@ TfLiteStatus InitializeRuntimeTensor( } return kTfLiteOk; } + } // namespace internal -TfLiteStatus MicroAllocator::Init() { - auto* subgraphs = model_->subgraphs(); - if (subgraphs->size() != 1) { - TF_LITE_REPORT_ERROR(error_reporter_, - "Only 1 subgraph is currently supported.\n"); - return kTfLiteError; - } - subgraph_ = (*subgraphs)[0]; - - context_->tensors_size = subgraph_->tensors()->size(); - context_->tensors = - reinterpret_cast(memory_allocator_->AllocateFromTail( - sizeof(TfLiteTensor) * context_->tensors_size, - alignof(TfLiteTensor))); - if (context_->tensors == nullptr) { - TF_LITE_REPORT_ERROR( - error_reporter_, - "Failed to allocate memory for context->tensors, %d bytes required", - sizeof(TfLiteTensor) * context_->tensors_size); - return kTfLiteError; - } - - // Initialize runtime tensors in context_ using the flatbuffer. - for (size_t i = 0; i < subgraph_->tensors()->size(); ++i) { - TfLiteStatus status = internal::InitializeRuntimeTensor( - memory_allocator_, *subgraph_->tensors()->Get(i), model_->buffers(), - error_reporter_, &context_->tensors[i]); - if (status != kTfLiteOk) { - TF_LITE_REPORT_ERROR(error_reporter_, "Failed to initialize tensor %d", - i); - return kTfLiteError; - } - } - - return kTfLiteOk; -} - -size_t MicroAllocator::used_bytes() const { - if (active_) { - return 0; - } - TF_LITE_REPORT_ERROR(error_reporter_, "Total buffer usage: %d bytes", - memory_allocator_->GetUsedBytes()); - TF_LITE_REPORT_ERROR(error_reporter_, "Head usage: %d bytes", - memory_allocator_->GetHeadUsedBytes()); - TF_LITE_REPORT_ERROR(error_reporter_, "Tail usage: %d bytes", - memory_allocator_->GetTailUsedBytes()); - return memory_allocator_->GetUsedBytes(); -} - MicroAllocator::MicroAllocator(TfLiteContext* context, const Model* model, uint8_t* tensor_arena, size_t arena_size, ErrorReporter* error_reporter) @@ -450,7 +401,8 @@ MicroAllocator::MicroAllocator(TfLiteContext* context, const Model* model, // destructed as it's the root allocator. memory_allocator_ = CreateInPlaceSimpleMemoryAllocator( error_reporter, aligned_arena, aligned_arena_size); - TfLiteStatus status = Init(); + + TfLiteStatus status = InitGraphAndContextTensorData(); // TODO(b/147871299): Consider improving this code. A better way of handling // failures in the constructor is to have a static function that returns a // pointer to the class. If allocation failed, a nullptr will be returned. 
@@ -463,88 +415,15 @@ MicroAllocator::MicroAllocator(TfLiteContext* context, const Model* model, } } -TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( +TfLiteStatus MicroAllocator::InitializeFromFlatbuffer( const OpResolver& op_resolver, NodeAndRegistration** node_and_registrations) { if (!active_) { return kTfLiteError; } - - auto* output = reinterpret_cast( - memory_allocator_->AllocateFromTail( - sizeof(NodeAndRegistration) * subgraph_->operators()->size(), - alignof(NodeAndRegistration))); - if (output == nullptr) { - TF_LITE_REPORT_ERROR( - error_reporter_, - "Failed to allocate memory for node_and_registrations."); - return kTfLiteError; - } - TfLiteStatus status = kTfLiteOk; - auto* opcodes = model_->operator_codes(); - MicroBuiltinDataAllocator builtin_data_allocator(memory_allocator_); - for (size_t i = 0; i < subgraph_->operators()->size(); ++i) { - const auto* op = subgraph_->operators()->Get(i); - size_t index = op->opcode_index(); - if (index >= opcodes->size()) { - TF_LITE_REPORT_ERROR(error_reporter_, - "Missing registration for opcode_index %d\n", index); - return kTfLiteError; - } - auto* opcode = (*opcodes)[index]; - status = GetRegistrationFromOpCode(opcode, op_resolver, error_reporter_, - &(output[i].registration)); - if (status != kTfLiteOk) { - TF_LITE_REPORT_ERROR(error_reporter_, - "Failed to get registration from op code %s\n ", - EnumNameBuiltinOperator(opcode->builtin_code())); - return status; - } - const auto* registration = output[i].registration; - if (registration == nullptr) { - TF_LITE_REPORT_ERROR(error_reporter_, "Skipping op for opcode_index %d\n", - index); - return kTfLiteError; - } - BuiltinOperator op_type = - static_cast(registration->builtin_code); - - if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) { - TF_LITE_REPORT_ERROR( - error_reporter_, - "Unsupported behavior: found builtin operator %s with custom " - "options.\n", - EnumNameBuiltinOperator(op_type)); - return kTfLiteError; - } - - const char* custom_data = nullptr; - size_t custom_data_size = 0; - unsigned char* builtin_data = nullptr; - if (op->custom_options()) { - custom_data = reinterpret_cast(op->custom_options()->data()); - custom_data_size = op->custom_options()->size(); - } else { - TF_LITE_ENSURE_STATUS(ParseOpData(op, op_type, error_reporter_, - &builtin_data_allocator, - (void**)(&builtin_data))); - } - - // Disregard const qualifier to workaround with existing API. 
- TfLiteIntArray* inputs_array = const_cast( - reinterpret_cast(op->inputs())); - TfLiteIntArray* outputs_array = const_cast( - reinterpret_cast(op->outputs())); - - TfLiteNode* node = &(output[i].node); - *node = {}; - node->inputs = inputs_array; - node->outputs = outputs_array; - node->builtin_data = reinterpret_cast(builtin_data); - node->custom_initial_data = custom_data; - node->custom_initial_data_size = custom_data_size; - } - *node_and_registrations = output; + TF_LITE_ENSURE_STATUS(AllocateNodeAndRegistrations(node_and_registrations)); + TF_LITE_ENSURE_STATUS(PrepareNodeAndRegistrationDataFromFlatbuffer( + op_resolver, *node_and_registrations)); return kTfLiteOk; } @@ -679,4 +558,151 @@ void* MicroAllocator::GetScratchBuffer(int buffer_idx) const { return scratch_buffer_handles_[scratch_buffer_count_ - buffer_idx - 1].data; } +size_t MicroAllocator::used_bytes() const { + if (active_) { + return 0; + } + TF_LITE_REPORT_ERROR(error_reporter_, "Total buffer usage: %d bytes", + memory_allocator_->GetUsedBytes()); + TF_LITE_REPORT_ERROR(error_reporter_, "Head usage: %d bytes", + memory_allocator_->GetHeadUsedBytes()); + TF_LITE_REPORT_ERROR(error_reporter_, "Tail usage: %d bytes", + memory_allocator_->GetTailUsedBytes()); + return memory_allocator_->GetUsedBytes(); +} + +TfLiteStatus MicroAllocator::InitGraphAndContextTensorData() { + auto* subgraphs = model_->subgraphs(); + if (subgraphs->size() != 1) { + TF_LITE_REPORT_ERROR(error_reporter_, + "Only 1 subgraph is currently supported.\n"); + return kTfLiteError; + } + subgraph_ = (*subgraphs)[0]; + + TF_LITE_ENSURE_STATUS(AllocateTfLiteTensorArray()); + TF_LITE_ENSURE_STATUS(PopulateTfLiteTensorArrayFromFlatbuffer()); + + return kTfLiteOk; +} + +TfLiteStatus MicroAllocator::AllocateTfLiteTensorArray() { + context_->tensors_size = subgraph_->tensors()->size(); + context_->tensors = + reinterpret_cast(memory_allocator_->AllocateFromTail( + sizeof(TfLiteTensor) * context_->tensors_size, + alignof(TfLiteTensor))); + if (context_->tensors == nullptr) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "Failed to allocate memory for context->tensors, %d bytes required", + sizeof(TfLiteTensor) * context_->tensors_size); + return kTfLiteError; + } + return kTfLiteOk; +} + +TfLiteStatus MicroAllocator::PopulateTfLiteTensorArrayFromFlatbuffer() { + // Initialize tensors in context_ using the flatbuffer for quantization data. 
+ for (size_t i = 0; i < subgraph_->tensors()->size(); ++i) { + TfLiteStatus status = internal::InitializeTfLiteTensorFromFlatbuffer( + memory_allocator_, *subgraph_->tensors()->Get(i), model_->buffers(), + error_reporter_, &context_->tensors[i]); + if (status != kTfLiteOk) { + TF_LITE_REPORT_ERROR(error_reporter_, "Failed to initialize tensor %d", + i); + return kTfLiteError; + } + } + return kTfLiteOk; +} + +TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( + NodeAndRegistration** node_and_registrations) { + NodeAndRegistration* output = reinterpret_cast( + memory_allocator_->AllocateFromTail( + sizeof(NodeAndRegistration) * subgraph_->operators()->size(), + alignof(NodeAndRegistration))); + if (output == nullptr) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "Failed to allocate memory for node_and_registrations."); + return kTfLiteError; + } + *node_and_registrations = output; + return kTfLiteOk; +} + +TfLiteStatus MicroAllocator::PrepareNodeAndRegistrationDataFromFlatbuffer( + const OpResolver& op_resolver, + NodeAndRegistration* node_and_registrations) { + TfLiteStatus status = kTfLiteOk; + auto* opcodes = model_->operator_codes(); + MicroBuiltinDataAllocator builtin_data_allocator(memory_allocator_); + for (size_t i = 0; i < subgraph_->operators()->size(); ++i) { + const auto* op = subgraph_->operators()->Get(i); + const size_t index = op->opcode_index(); + if (index >= opcodes->size()) { + TF_LITE_REPORT_ERROR(error_reporter_, + "Missing registration for opcode_index %d\n", index); + return kTfLiteError; + } + auto* opcode = (*opcodes)[index]; + status = + GetRegistrationFromOpCode(opcode, op_resolver, error_reporter_, + &(node_and_registrations[i].registration)); + if (status != kTfLiteOk) { + TF_LITE_REPORT_ERROR(error_reporter_, + "Failed to get registration from op code %s\n ", + EnumNameBuiltinOperator(opcode->builtin_code())); + return status; + } + const auto* registration = node_and_registrations[i].registration; + if (registration == nullptr) { + TF_LITE_REPORT_ERROR(error_reporter_, "Skipping op for opcode_index %d\n", + index); + return kTfLiteError; + } + BuiltinOperator op_type = + static_cast(registration->builtin_code); + + if (op_type != BuiltinOperator_CUSTOM && op->custom_options()) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "Unsupported behavior: found builtin operator %s with custom " + "options.\n", + EnumNameBuiltinOperator(op_type)); + return kTfLiteError; + } + + const char* custom_data = nullptr; + size_t custom_data_size = 0; + unsigned char* builtin_data = nullptr; + if (op->custom_options()) { + custom_data = reinterpret_cast(op->custom_options()->data()); + custom_data_size = op->custom_options()->size(); + } else { + TF_LITE_ENSURE_STATUS(ParseOpData(op, op_type, error_reporter_, + &builtin_data_allocator, + (void**)(&builtin_data))); + } + + // Disregard const qualifier to workaround with existing API. 
+ TfLiteIntArray* inputs_array = const_cast( + reinterpret_cast(op->inputs())); + TfLiteIntArray* outputs_array = const_cast( + reinterpret_cast(op->outputs())); + + TfLiteNode* node = &(node_and_registrations[i].node); + *node = {}; + node->inputs = inputs_array; + node->outputs = outputs_array; + node->builtin_data = reinterpret_cast(builtin_data); + node->custom_initial_data = custom_data; + node->custom_initial_data_size = custom_data_size; + } + + return kTfLiteOk; +} + } // namespace tflite diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index d05974f365a..1dd90c36a4d 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -30,9 +30,9 @@ namespace tflite { // Namespace used for unittests. namespace internal { -// Sets up all of the data structure members for a runtime tensor -// based on the contents of a serialized tensor. -TfLiteStatus InitializeRuntimeTensor( +// Sets up all of the data structure members for a TfLiteTensor based on the +// contents of a serialized tensor in the flatbuffer. +TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( SimpleMemoryAllocator* allocator, const tflite::Tensor& flatbuffer_tensor, const flatbuffers::Vector>* buffers, ErrorReporter* error_reporter, TfLiteTensor* result); @@ -86,6 +86,15 @@ class MicroAllocator { uint8_t* tensor_arena, size_t arena_size, ErrorReporter* error_reporter); + // Run through the model flatbuffer data (loaded from the TfLiteModel + // instance) to allocate nodes and registrations. We need to keep them for the + // entire life time of the model to allow persistent tensors. This method + // needs to be called before FinishTensorAllocation method. This method also + // allocates any internal Op data that is required from the flatbuffer. + TfLiteStatus InitializeFromFlatbuffer( + const OpResolver& op_resolver, + NodeAndRegistration** node_and_registrations); + // Runs through the model and allocates all necessary input, output and // intermediate tensors. // WARNING: doing any allocation after calling this method has the risk of @@ -93,17 +102,6 @@ class MicroAllocator { // called in this class. TfLiteStatus FinishTensorAllocation(); - // Returns the arena usage in bytes, only available after - // `FinishTensorAllocation`. Otherwise, it will return 0. - size_t used_bytes() const; - - // Run through the model to allocate nodes and registrations. We need to keep - // them for the entire life time of the model to allow persistent tensors. - // This method needs to be called before FinishTensorAllocation method. - TfLiteStatus AllocateNodeAndRegistrations( - const OpResolver& op_resolver, - NodeAndRegistration** node_and_registrations); - // Allocates persistent buffer which has the same life time as the allocator. // The memory is immediately available and is allocated from the tail of the // arena. @@ -120,8 +118,38 @@ class MicroAllocator { // Returns the pointer to the planned scratch buffer. void* GetScratchBuffer(int buffer_idx) const; + // Returns the arena usage in bytes, only available after + // `FinishTensorAllocation`. Otherwise, it will return 0. + size_t used_bytes() const; + + protected: + // Allocates an array in the arena to hold pointers to the tensors required + // to initialize and prepare a model. These allocations are stored and + // populated on the context. 
+ TfLiteStatus AllocateTfLiteTensorArray(); + + // Populates content on the list of tensor pointers required to initialize and + // prepare a model from data in the flatbuffer (loaded from the TfLiteModel + // instance). Persistent data (e.g. quantization params) is allocated from the + // arena. + TfLiteStatus PopulateTfLiteTensorArrayFromFlatbuffer(); + + // Allocates an array in the arena to hold pointers to the node and + // registration pointers required to represent the inference graph of the + // model. + TfLiteStatus AllocateNodeAndRegistrations( + NodeAndRegistration** node_and_registrations); + + // Populates node and registration pointers representing the inference graph + // of the model from values inside the flatbuffer (loaded from the TfLiteModel + // instance). Persistent data (e.g. operator data) is allocated from the + // arena. + TfLiteStatus PrepareNodeAndRegistrationDataFromFlatbuffer( + const OpResolver& op_resolver, + NodeAndRegistration* node_and_registrations); + private: - TfLiteStatus Init(); + TfLiteStatus InitGraphAndContextTensorData(); const Model* model_; // A simple memory allocator that always allocate from the arena tail. diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 78419edbbf9..b34b2dc2866 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -77,7 +77,7 @@ TF_LITE_MICRO_TEST(TestInitializeRuntimeTensor) { TfLiteTensor allocated_tensor; TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, tflite::internal::InitializeRuntimeTensor( + kTfLiteOk, tflite::internal::InitializeTfLiteTensorFromFlatbuffer( &simple_allocator, *tensor, buffers, micro_test::reporter, &allocated_tensor)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, allocated_tensor.type); @@ -103,7 +103,7 @@ TF_LITE_MICRO_TEST(TestInitializeQuantizedTensor) { TfLiteTensor allocated_tensor; TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, tflite::internal::InitializeRuntimeTensor( + kTfLiteOk, tflite::internal::InitializeTfLiteTensorFromFlatbuffer( &simple_allocator, *tensor, buffers, micro_test::reporter, &allocated_tensor)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, allocated_tensor.type); @@ -129,7 +129,7 @@ TF_LITE_MICRO_TEST(TestMissingQuantization) { TfLiteTensor allocated_tensor; TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, tflite::internal::InitializeRuntimeTensor( + kTfLiteOk, tflite::internal::InitializeTfLiteTensorFromFlatbuffer( &simple_allocator, *tensor, buffers, micro_test::reporter, &allocated_tensor)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, allocated_tensor.type); diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index b46f9ecb9ea..6b78966020e 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -165,7 +165,7 @@ void MicroInterpreter::CorrectTensorDataEndianness(T* data, int32_t size) { } TfLiteStatus MicroInterpreter::AllocateTensors() { - TF_LITE_ENSURE_OK(&context_, allocator_.AllocateNodeAndRegistrations( + TF_LITE_ENSURE_OK(&context_, allocator_.InitializeFromFlatbuffer( op_resolver_, &node_and_registrations_)); // Only allow AllocatePersistentBuffer in Init stage. From 580219546f81b50a0e627d29d8e36ab2245578a5 Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Tue, 26 May 2020 12:29:06 -0700 Subject: [PATCH 1130/1533] Clean up micro build rules and split out micro_error_reporter and micro_debug_log from micro_framework. 
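
Downstream targets that only need error reporting or debug logging can now
depend on the smaller ":micro_error_reporter" or ":debug_log" libraries
instead of all of ":micro_framework"; the BUILD changes below apply exactly
that substitution, for example swapping the
"//tensorflow/lite/micro:micro_framework" dep for
"//tensorflow/lite/micro:micro_error_reporter" in targets that only use the
error reporter.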
PiperOrigin-RevId: 313245894 Change-Id: Ib6332590887b6f3d0ab7d78ccd71a0011b720408 --- tensorflow/lite/micro/BUILD | 37 ++++++++++++++++--- tensorflow/lite/micro/benchmarks/BUILD | 2 + .../lite/micro/examples/hello_world/BUILD | 4 +- .../lite/micro/examples/magic_wand/BUILD | 7 +++- .../lite/micro/examples/micro_speech/BUILD | 27 ++++++++++---- .../micro_speech/micro_features/BUILD | 3 +- .../micro/examples/person_detection/BUILD | 7 ++-- .../person_detection_experimental/BUILD | 6 ++- tensorflow/lite/micro/kernels/BUILD | 36 ++---------------- tensorflow/lite/micro/testing/BUILD | 4 +- tensorflow/lite/micro/tools/make/Makefile | 1 + 11 files changed, 77 insertions(+), 57 deletions(-) diff --git a/tensorflow/lite/micro/BUILD b/tensorflow/lite/micro/BUILD index 67471bc64a6..3b05aee30f4 100644 --- a/tensorflow/lite/micro/BUILD +++ b/tensorflow/lite/micro/BUILD @@ -25,20 +25,16 @@ cc_library( cc_library( name = "micro_framework", srcs = [ - "debug_log.cc", "memory_helpers.cc", "micro_allocator.cc", - "micro_error_reporter.cc", "micro_interpreter.cc", "micro_optional_debug_tools.cc", "simple_memory_allocator.cc", "test_helpers.cc", ], hdrs = [ - "debug_log.h", "memory_helpers.h", "micro_allocator.h", - "micro_error_reporter.h", "micro_interpreter.h", "micro_mutable_op_resolver.h", "micro_optional_debug_tools.h", @@ -49,7 +45,6 @@ cc_library( copts = micro_copts(), deps = [ ":micro_compatibility", - ":micro_string", ":micro_utils", "//tensorflow/lite:type_to_tflitetype", "//tensorflow/lite/c:common", @@ -63,6 +58,36 @@ cc_library( ], ) +cc_library( + name = "debug_log", + srcs = [ + "debug_log.cc", + ], + hdrs = [ + "debug_log.h", + ], + build_for_embedded = True, + copts = micro_copts(), +) + +cc_library( + name = "micro_error_reporter", + srcs = [ + "micro_error_reporter.cc", + ], + hdrs = [ + "micro_error_reporter.h", + ], + build_for_embedded = True, + copts = micro_copts(), + deps = [ + ":debug_log", + ":micro_compatibility", + ":micro_string", + "//tensorflow/lite/core/api", + ], +) + cc_library( name = "micro_string", srcs = [ @@ -111,7 +136,7 @@ tflite_micro_cc_test( "micro_error_reporter_test.cc", ], deps = [ - ":micro_framework", + ":micro_error_reporter", ], ) diff --git a/tensorflow/lite/micro/benchmarks/BUILD b/tensorflow/lite/micro/benchmarks/BUILD index 4af3267d769..73b288d2bc1 100644 --- a/tensorflow/lite/micro/benchmarks/BUILD +++ b/tensorflow/lite/micro/benchmarks/BUILD @@ -46,6 +46,7 @@ cc_binary( deps = [ ":keyword_scrambled_model_data", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/kernels:micro_ops", "//tensorflow/lite/micro/testing:micro_benchmark", @@ -58,6 +59,7 @@ cc_binary( deps = [ "//tensorflow/lite:schema_fbs_version", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro:micro_utils", "//tensorflow/lite/micro/examples/person_detection:model_settings", diff --git a/tensorflow/lite/micro/examples/hello_world/BUILD b/tensorflow/lite/micro/examples/hello_world/BUILD index 8da319f3095..4488c192abb 100644 --- a/tensorflow/lite/micro/examples/hello_world/BUILD +++ b/tensorflow/lite/micro/examples/hello_world/BUILD @@ -35,6 +35,7 @@ tflite_micro_cc_test( deps = [ ":model", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/kernels:all_ops_resolver", 
"//tensorflow/lite/micro/kernels:micro_ops", @@ -54,7 +55,7 @@ cc_library( copts = micro_copts(), deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -86,6 +87,7 @@ cc_binary( ":model", ":output_handler", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/kernels:all_ops_resolver", "//tensorflow/lite/schema:schema_fbs", diff --git a/tensorflow/lite/micro/examples/magic_wand/BUILD b/tensorflow/lite/micro/examples/magic_wand/BUILD index 7d6f3cdcecd..b0be47c1eeb 100644 --- a/tensorflow/lite/micro/examples/magic_wand/BUILD +++ b/tensorflow/lite/micro/examples/magic_wand/BUILD @@ -41,6 +41,7 @@ tflite_micro_cc_test( ":magic_wand_model_data", ":sample_feature_data", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/kernels:all_ops_resolver", "//tensorflow/lite/micro/kernels:micro_ops", @@ -66,7 +67,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -78,6 +79,7 @@ tflite_micro_cc_test( deps = [ ":accelerometer_handler", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], @@ -119,7 +121,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -155,6 +157,7 @@ cc_binary( ":magic_wand_model_data", ":output_handler", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/kernels:micro_ops", "//tensorflow/lite/schema:schema_fbs", diff --git a/tensorflow/lite/micro/examples/micro_speech/BUILD b/tensorflow/lite/micro/examples/micro_speech/BUILD index e0e1ca4ad10..b487b895f7a 100644 --- a/tensorflow/lite/micro/examples/micro_speech/BUILD +++ b/tensorflow/lite/micro/examples/micro_speech/BUILD @@ -50,6 +50,7 @@ tflite_micro_cc_test( ], deps = [ "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_features_test_data", "//tensorflow/lite/micro/examples/micro_speech/micro_features:model", @@ -107,7 +108,7 @@ cc_library( deps = [ ":simple_model_settings", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -122,6 +123,7 @@ tflite_micro_cc_test( ":simple_features_generator_test_data", ":simple_model_settings", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], @@ -138,7 +140,7 @@ cc_library( deps = [ ":simple_model_settings", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -153,6 +155,7 @@ tflite_micro_cc_test( ":simple_features_generator_test_data", ":simple_model_settings", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], @@ -168,7 
+171,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", ], ) @@ -184,7 +187,7 @@ cc_library( deps = [ ":audio_large_sample_test_data", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", ], ) @@ -197,6 +200,7 @@ tflite_micro_cc_test( deps = [ ":audio_provider", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", "//tensorflow/lite/micro/testing:micro_test", @@ -212,6 +216,7 @@ tflite_micro_cc_test( ":audio_large_sample_test_data", ":audio_provider_mock", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", "//tensorflow/lite/micro/testing:micro_test", @@ -229,7 +234,7 @@ cc_library( deps = [ ":audio_provider", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_features_generator", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", ], @@ -244,6 +249,7 @@ tflite_micro_cc_test( ":audio_provider", ":feature_provider", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", "//tensorflow/lite/micro/testing:micro_test", @@ -261,7 +267,7 @@ cc_library( deps = [ ":audio_provider_mock", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_features_generator", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", ], @@ -275,6 +281,7 @@ tflite_micro_cc_test( deps = [ ":feature_provider_mock", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_features_test_data", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", @@ -292,7 +299,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", ], ) @@ -308,6 +315,7 @@ tflite_micro_cc_test( deps = [ ":recognize_commands", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], @@ -323,7 +331,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -335,6 +343,7 @@ tflite_micro_cc_test( deps = [ ":command_responder", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", 
"//tensorflow/lite/micro/testing:micro_test", ], @@ -353,6 +362,7 @@ cc_binary( ":feature_provider", ":recognize_commands", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", "//tensorflow/lite/micro/examples/micro_speech/micro_features:model", @@ -374,6 +384,7 @@ cc_binary( ":feature_provider", ":recognize_commands", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/examples/micro_speech/micro_features:micro_model_settings", "//tensorflow/lite/micro/examples/micro_speech/micro_features:model", diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/BUILD b/tensorflow/lite/micro/examples/micro_speech/micro_features/BUILD index 71010493102..0aa7ff14f73 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/BUILD +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/BUILD @@ -59,7 +59,7 @@ cc_library( ":micro_model_settings", "//tensorflow/lite/c:common", "//tensorflow/lite/experimental/microfrontend/lib:frontend", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -85,6 +85,7 @@ tflite_micro_cc_test( ":micro_features_generator_test_data", ":micro_model_settings", "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/examples/micro_speech:audio_sample_test_data", "//tensorflow/lite/micro/testing:micro_test", diff --git a/tensorflow/lite/micro/examples/person_detection/BUILD b/tensorflow/lite/micro/examples/person_detection/BUILD index cb9fdb80c33..75c1bf61fa8 100644 --- a/tensorflow/lite/micro/examples/person_detection/BUILD +++ b/tensorflow/lite/micro/examples/person_detection/BUILD @@ -56,7 +56,7 @@ cc_library( deps = [ ":model_settings", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -69,7 +69,7 @@ tflite_micro_cc_test( ":image_provider", ":model_settings", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -84,7 +84,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -112,6 +112,7 @@ cc_binary( ":model_settings", ":person_detect_model_data", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/kernels:micro_ops", "//tensorflow/lite/schema:schema_fbs", diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/BUILD b/tensorflow/lite/micro/examples/person_detection_experimental/BUILD index cb9fdb80c33..49f10c814cb 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/BUILD +++ b/tensorflow/lite/micro/examples/person_detection_experimental/BUILD @@ -56,7 +56,7 @@ cc_library( deps = [ ":model_settings", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -69,6 +69,7 @@ tflite_micro_cc_test( ":image_provider", ":model_settings", "//tensorflow/lite/c:common", + 
"//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], @@ -84,7 +85,7 @@ cc_library( ], deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", ], ) @@ -112,6 +113,7 @@ cc_binary( ":model_settings", ":person_detect_model_data", "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/kernels:micro_ops", "//tensorflow/lite/schema:schema_fbs", diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD index 50a0a4f9190..b6c6054d604 100644 --- a/tensorflow/lite/micro/kernels/BUILD +++ b/tensorflow/lite/micro/kernels/BUILD @@ -201,7 +201,7 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:debug_log", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -214,7 +214,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -228,7 +227,6 @@ tflite_micro_cc_test( ":all_ops_resolver", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:tensor", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -242,7 +240,6 @@ tflite_micro_cc_test( ":portable_optimized_ops_resolver", "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:tensor", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -269,7 +266,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -282,7 +278,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -295,7 +290,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -308,7 +302,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro:micro_utils", "//tensorflow/lite/micro/testing:micro_test", ], @@ -322,7 +315,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -335,7 +327,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -348,7 +339,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -361,7 +351,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -374,7 +363,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) 
@@ -387,7 +375,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -400,7 +387,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -412,9 +398,7 @@ tflite_micro_cc_test( ], deps = [ ":all_ops_resolver", - ":micro_utils", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -426,9 +410,7 @@ tflite_micro_cc_test( ], deps = [ ":all_ops_resolver", - ":micro_utils", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -441,7 +423,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -454,7 +435,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -467,7 +447,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -480,7 +459,7 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:debug_log", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -493,7 +472,7 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:debug_log", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -506,7 +485,7 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:debug_log", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -552,7 +531,6 @@ tflite_micro_cc_test( ], deps = [ ":all_ops_resolver", - ":micro_utils", "//tensorflow/lite/c:common", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", @@ -566,7 +544,6 @@ tflite_micro_cc_test( ], deps = [ ":all_ops_resolver", - ":micro_utils", "//tensorflow/lite/c:common", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", @@ -602,7 +579,6 @@ tflite_micro_cc_test( deps = [ ":all_ops_resolver", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -614,9 +590,7 @@ tflite_micro_cc_test( ], deps = [ ":all_ops_resolver", - ":micro_utils", "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro/testing:micro_test", ], ) @@ -666,7 +640,6 @@ tflite_micro_cc_test( ], deps = [ ":all_ops_resolver", - ":micro_ops", "//tensorflow/lite/c:common", "//tensorflow/lite/micro/testing:micro_test", ], @@ -679,7 +652,6 @@ tflite_micro_cc_test( ], deps = [ ":all_ops_resolver", - ":micro_ops", "//tensorflow/lite/c:common", "//tensorflow/lite/micro/testing:micro_test", ], diff --git a/tensorflow/lite/micro/testing/BUILD b/tensorflow/lite/micro/testing/BUILD index 245e919bb05..8db93c6eeac 100644 --- a/tensorflow/lite/micro/testing/BUILD +++ b/tensorflow/lite/micro/testing/BUILD @@ -22,6 +22,7 @@ 
cc_library( deps = [ "//tensorflow/lite/c:common", "//tensorflow/lite/core/api", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_framework", "//tensorflow/lite/micro:micro_utils", ], @@ -43,8 +44,7 @@ cc_library( "micro_benchmark.h", ], deps = [ - "//tensorflow/lite/c:common", - "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro:micro_error_reporter", "//tensorflow/lite/micro:micro_time", ], ) diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index a0a32728baf..13761cca28b 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -139,6 +139,7 @@ tensorflow/lite/c/common.h \ tensorflow/lite/core/api/error_reporter.h \ tensorflow/lite/core/api/flatbuffer_conversions.h \ tensorflow/lite/core/api/op_resolver.h \ +tensorflow/lite/core/api/profiler.h \ tensorflow/lite/core/api/tensor_utils.h \ tensorflow/lite/kernels/internal/common.h \ tensorflow/lite/kernels/internal/compatibility.h \ From 741ef7999c7603225afec347ca95ef3e4b098c5b Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Tue, 26 May 2020 12:37:17 -0700 Subject: [PATCH 1131/1533] Fix ops pbtxt PiperOrigin-RevId: 313247584 Change-Id: I0cdc44f0913984c53865ffabb5d5dabe290f492c --- .../ops_history_v2/SparseSegmentMean.pbtxt | 98 ------------- .../SparseSegmentMeanWithNumSegments.pbtxt | 132 ------------------ .../ops_history_v2/SparseSegmentSqrtN.pbtxt | 98 ------------- .../SparseSegmentSqrtNWithNumSegments.pbtxt | 132 ------------------ tensorflow/core/ops/ops.pbtxt | 4 - 5 files changed, 464 deletions(-) diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt index 5f362b97cb0..526c2c25c04 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt @@ -1,45 +1,3 @@ -op { - name: "SparseSegmentMean" - input_arg { - name: "data" - type_attr: "T" - } - input_arg { - name: "indices" - type_attr: "Tidx" - } - input_arg { - name: "segment_ids" - type: DT_INT32 - } - output_arg { - name: "output" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_FLOAT - type: DT_DOUBLE - } - } - } - attr { - name: "Tidx" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } -} op { name: "SparseSegmentMean" input_arg { @@ -95,59 +53,3 @@ op { } } } -op { - name: "SparseSegmentMean" - input_arg { - name: "data" - type_attr: "T" - } - input_arg { - name: "indices" - type_attr: "Tidx" - } - input_arg { - name: "segment_ids" - type_attr: "Tsegmentids" - } - output_arg { - name: "output" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_BFLOAT16 - type: DT_FLOAT - type: DT_DOUBLE - } - } - } - attr { - name: "Tidx" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } - attr { - name: "Tsegmentids" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } -} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt index 60f9c4bbd00..b9984f8df25 100644 --- 
a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt @@ -1,62 +1,3 @@ -op { - name: "SparseSegmentMeanWithNumSegments" - input_arg { - name: "data" - type_attr: "T" - } - input_arg { - name: "indices" - type_attr: "Tidx" - } - input_arg { - name: "segment_ids" - type: DT_INT32 - } - input_arg { - name: "num_segments" - type_attr: "Tnumsegments" - } - output_arg { - name: "output" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_FLOAT - type: DT_DOUBLE - } - } - } - attr { - name: "Tidx" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } - attr { - name: "Tnumsegments" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } -} op { name: "SparseSegmentMeanWithNumSegments" input_arg { @@ -129,76 +70,3 @@ op { } } } -op { - name: "SparseSegmentMeanWithNumSegments" - input_arg { - name: "data" - type_attr: "T" - } - input_arg { - name: "indices" - type_attr: "Tidx" - } - input_arg { - name: "segment_ids" - type_attr: "Tsegmentids" - } - input_arg { - name: "num_segments" - type_attr: "Tnumsegments" - } - output_arg { - name: "output" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_BFLOAT16 - type: DT_FLOAT - type: DT_DOUBLE - } - } - } - attr { - name: "Tidx" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } - attr { - name: "Tnumsegments" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } - attr { - name: "Tsegmentids" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } -} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt index 68359ea0c08..17562d4f333 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt @@ -1,45 +1,3 @@ -op { - name: "SparseSegmentSqrtN" - input_arg { - name: "data" - type_attr: "T" - } - input_arg { - name: "indices" - type_attr: "Tidx" - } - input_arg { - name: "segment_ids" - type: DT_INT32 - } - output_arg { - name: "output" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_FLOAT - type: DT_DOUBLE - } - } - } - attr { - name: "Tidx" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } -} op { name: "SparseSegmentSqrtN" input_arg { @@ -95,59 +53,3 @@ op { } } } -op { - name: "SparseSegmentSqrtN" - input_arg { - name: "data" - type_attr: "T" - } - input_arg { - name: "indices" - type_attr: "Tidx" - } - input_arg { - name: "segment_ids" - type_attr: "Tsegmentids" - } - output_arg { - name: "output" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_BFLOAT16 - type: DT_FLOAT - type: DT_DOUBLE - } - } - } - attr { - name: "Tidx" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } - attr { - name: "Tsegmentids" - type: "type" - default_value { - type: DT_INT32 - } 
- allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } -} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt index d16063dca08..1f24446a587 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt @@ -1,62 +1,3 @@ -op { - name: "SparseSegmentSqrtNWithNumSegments" - input_arg { - name: "data" - type_attr: "T" - } - input_arg { - name: "indices" - type_attr: "Tidx" - } - input_arg { - name: "segment_ids" - type: DT_INT32 - } - input_arg { - name: "num_segments" - type_attr: "Tnumsegments" - } - output_arg { - name: "output" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_FLOAT - type: DT_DOUBLE - } - } - } - attr { - name: "Tidx" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } - attr { - name: "Tnumsegments" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } -} op { name: "SparseSegmentSqrtNWithNumSegments" input_arg { @@ -129,76 +70,3 @@ op { } } } -op { - name: "SparseSegmentSqrtNWithNumSegments" - input_arg { - name: "data" - type_attr: "T" - } - input_arg { - name: "indices" - type_attr: "Tidx" - } - input_arg { - name: "segment_ids" - type_attr: "Tsegmentids" - } - input_arg { - name: "num_segments" - type_attr: "Tnumsegments" - } - output_arg { - name: "output" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_BFLOAT16 - type: DT_FLOAT - type: DT_DOUBLE - } - } - } - attr { - name: "Tidx" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } - attr { - name: "Tnumsegments" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } - attr { - name: "Tsegmentids" - type: "type" - default_value { - type: DT_INT32 - } - allowed_values { - list { - type: DT_INT32 - type: DT_INT64 - } - } - } -} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 98a1b9328be..2f6e0dc0d4c 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -46097,7 +46097,6 @@ op { type: "type" allowed_values { list { - type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46216,7 +46215,6 @@ op { type: "type" allowed_values { list { - type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46285,7 +46283,6 @@ op { type: "type" allowed_values { list { - type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46404,7 +46401,6 @@ op { type: "type" allowed_values { list { - type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } From c64cdf3098f98de70bb9a916313ffaf6501b9002 Mon Sep 17 00:00:00 2001 From: Nick Kreeger Date: Tue, 26 May 2020 12:37:41 -0700 Subject: [PATCH 1132/1533] Move all header method implementations of SimpleMemoryAllocator to the cc file. This is a cleanup to prepare for some recording and logging subclasses of this class to help TF Micro keep tracking of tail/head buffer allocations. 
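
For context, a minimal sketch of the two-ended arena this class manages (not part of the original change; the AllocateFromHead call and the 1 KB arena below are assumptions for illustration, while the constructor and the Get* accessors mirror the API shown in the diff that follows):

  #include <cstddef>
  #include <cstdint>

  #include "tensorflow/lite/micro/micro_error_reporter.h"
  #include "tensorflow/lite/micro/simple_memory_allocator.h"

  void ArenaSketch() {
    static uint8_t arena[1024];
    tflite::MicroErrorReporter reporter;
    tflite::SimpleMemoryAllocator allocator(&reporter, arena, sizeof(arena));
    // The head cursor grows upward from arena[0]...
    uint8_t* head_block = allocator.AllocateFromHead(/*size=*/64, /*alignment=*/4);
    // ...and the tail cursor grows downward from arena[1024].
    uint8_t* tail_block = allocator.AllocateFromTail(/*size=*/32, /*alignment=*/4);
    // Whatever is left between the two cursors is still available:
    // GetAvailableMemory() == GetTail() - GetHead().
    size_t remaining = allocator.GetAvailableMemory();
    (void)head_block;
    (void)tail_block;
    (void)remaining;
  }

The recording and logging subclasses mentioned above would presumably intercept calls like these to track how many bytes each allocation consumes from the head or the tail.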
PiperOrigin-RevId: 313247671 Change-Id: Id6766c02467e1829961addfcc449fdf6990ce684 --- .../lite/micro/simple_memory_allocator.cc | 38 +++++++++++++++++++ .../lite/micro/simple_memory_allocator.h | 25 +++++------- 2 files changed, 48 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/micro/simple_memory_allocator.cc b/tensorflow/lite/micro/simple_memory_allocator.cc index 911e1e404f7..d55e7e87640 100644 --- a/tensorflow/lite/micro/simple_memory_allocator.cc +++ b/tensorflow/lite/micro/simple_memory_allocator.cc @@ -23,6 +23,20 @@ limitations under the License. namespace tflite { +SimpleMemoryAllocator::SimpleMemoryAllocator(ErrorReporter* error_reporter, + uint8_t* buffer_head, + uint8_t* buffer_tail) + : error_reporter_(error_reporter), + buffer_head_(buffer_head), + buffer_tail_(buffer_tail), + head_(buffer_head), + tail_(buffer_tail) {} + +SimpleMemoryAllocator::SimpleMemoryAllocator(ErrorReporter* error_reporter, + uint8_t* buffer, + size_t buffer_size) + : SimpleMemoryAllocator(error_reporter, buffer, buffer + buffer_size) {} + SimpleMemoryAllocator* CreateInPlaceSimpleMemoryAllocator( ErrorReporter* error_reporter, uint8_t* buffer, size_t buffer_size) { SimpleMemoryAllocator tmp = @@ -64,4 +78,28 @@ uint8_t* SimpleMemoryAllocator::AllocateFromTail(size_t size, return aligned_result; } +uint8_t* SimpleMemoryAllocator::GetHead() const { return head_; } + +uint8_t* SimpleMemoryAllocator::GetTail() const { return tail_; } + +size_t SimpleMemoryAllocator::GetHeadUsedBytes() const { + return head_ - buffer_head_; +} + +size_t SimpleMemoryAllocator::GetTailUsedBytes() const { + return buffer_tail_ - tail_; +} + +size_t SimpleMemoryAllocator::GetAvailableMemory() const { + return tail_ - head_; +} + +size_t SimpleMemoryAllocator::GetUsedBytes() const { + return GetBufferSize() - GetAvailableMemory(); +} + +size_t SimpleMemoryAllocator::GetBufferSize() const { + return buffer_tail_ - buffer_head_; +} + } // namespace tflite diff --git a/tensorflow/lite/micro/simple_memory_allocator.h b/tensorflow/lite/micro/simple_memory_allocator.h index 223ef8398a4..5be260f9ed2 100644 --- a/tensorflow/lite/micro/simple_memory_allocator.h +++ b/tensorflow/lite/micro/simple_memory_allocator.h @@ -29,15 +29,9 @@ namespace tflite { class SimpleMemoryAllocator { public: SimpleMemoryAllocator(ErrorReporter* error_reporter, uint8_t* buffer_head, - uint8_t* buffer_tail) - : error_reporter_(error_reporter), - buffer_head_(buffer_head), - buffer_tail_(buffer_tail), - head_(buffer_head), - tail_(buffer_tail) {} + uint8_t* buffer_tail); SimpleMemoryAllocator(ErrorReporter* error_reporter, uint8_t* buffer, - size_t buffer_size) - : SimpleMemoryAllocator(error_reporter, buffer, buffer + buffer_size) {} + size_t buffer_size); // Allocates memory starting at the head of the arena (lowest address and // moving upwards). @@ -46,16 +40,17 @@ class SimpleMemoryAllocator { // moving downwards). 
uint8_t* AllocateFromTail(size_t size, size_t alignment); - uint8_t* GetHead() const { return head_; } - uint8_t* GetTail() const { return tail_; } - size_t GetAvailableMemory() const { return tail_ - head_; } - size_t GetUsedBytes() const { return GetBufferSize() - GetAvailableMemory(); } + uint8_t* GetHead() const; + uint8_t* GetTail() const; - size_t GetHeadUsedBytes() const { return head_ - buffer_head_; } - size_t GetTailUsedBytes() const { return buffer_tail_ - tail_; } + size_t GetHeadUsedBytes() const; + size_t GetTailUsedBytes() const; + + size_t GetAvailableMemory() const; + size_t GetUsedBytes() const; private: - size_t GetBufferSize() const { return buffer_tail_ - buffer_head_; } + size_t GetBufferSize() const; ErrorReporter* error_reporter_; uint8_t* buffer_head_; From 13de0f1c981aac2d76cdd4c47f274c6331d2ed68 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Tue, 26 May 2020 12:38:33 -0700 Subject: [PATCH 1133/1533] Update XNNPACK and its dependencies PiperOrigin-RevId: 313247842 Change-Id: Ifa1fde5bcecf39308611af346cfce8424e7822e9 --- tensorflow/workspace.bzl | 16 ++++++++-------- third_party/FP16/workspace.bzl | 8 ++++---- third_party/cpuinfo/workspace.bzl | 8 ++++---- third_party/psimd/workspace.bzl | 8 ++++---- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index d196675b518..217edee0f86 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "589acbfe90093c690a2817068fadfd7868000509304b5316d5c8d692b605b379", - strip_prefix = "XNNPACK-f5c4625a40ee296d47be936ff5e7b0809858627b", + sha256 = "05904bb15b7a5abadc261c16e6be3ac2314d6d4384aa16349b7354d9fa8bbb4f", + strip_prefix = "XNNPACK-1e5f80293b3c0197aaf44f3adb9329401fd36ed4", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/f5c4625a40ee296d47be936ff5e7b0809858627b.zip", - "https://github.com/google/XNNPACK/archive/f5c4625a40ee296d47be936ff5e7b0809858627b.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/1e5f80293b3c0197aaf44f3adb9329401fd36ed4.zip", + "https://github.com/google/XNNPACK/archive/1e5f80293b3c0197aaf44f3adb9329401fd36ed4.zip", ], ) @@ -184,11 +184,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "pthreadpool", - sha256 = "c4b148fba41fc937fdf96bc195caadf0cf0be83f1c3e335ef5355934d4501f83", - strip_prefix = "pthreadpool-e918b206d26b1f3b2100b0edabf445c18708d2b7", + sha256 = "9f5fb7f87dc778d9c1d638826344b762afa23884d0252526337ae710264faef3", + strip_prefix = "pthreadpool-18a7156cb9be8e534acefade42e46d4209600c35", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/pthreadpool/archive/e918b206d26b1f3b2100b0edabf445c18708d2b7.zip", - "https://github.com/Maratyszcza/pthreadpool/archive/e918b206d26b1f3b2100b0edabf445c18708d2b7.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/pthreadpool/archive/18a7156cb9be8e534acefade42e46d4209600c35.zip", + "https://github.com/Maratyszcza/pthreadpool/archive/18a7156cb9be8e534acefade42e46d4209600c35.zip", ], ) diff --git a/third_party/FP16/workspace.bzl b/third_party/FP16/workspace.bzl index 441ef6b15e1..31746d6c371 100644 --- a/third_party/FP16/workspace.bzl +++ b/third_party/FP16/workspace.bzl @@ -5,11 +5,11 @@ load("//third_party:repo.bzl", "third_party_http_archive") def 
repo(): third_party_http_archive( name = "FP16", - strip_prefix = "FP16-3c54eacb74f6f5e39077300c5564156c424d77ba", - sha256 = "0d56bb92f649ec294dbccb13e04865e3c82933b6f6735d1d7145de45da700156", + strip_prefix = "FP16-4dfe081cf6bcd15db339cf2680b9281b8451eeb3", + sha256 = "d973501a40c55126b31accc2d9f08d931ec3cc190c0430309a5e341d3c0ce32a", urls = [ - "https://mirror.bazel.build/github.com/Maratyszcza/FP16/archive/3c54eacb74f6f5e39077300c5564156c424d77ba.zip", - "https://github.com/Maratyszcza/FP16/archive/3c54eacb74f6f5e39077300c5564156c424d77ba.zip", + "https://mirror.bazel.build/github.com/Maratyszcza/FP16/archive/4dfe081cf6bcd15db339cf2680b9281b8451eeb3.zip", + "https://github.com/Maratyszcza/FP16/archive/4dfe081cf6bcd15db339cf2680b9281b8451eeb3.zip", ], build_file = "//third_party/FP16:BUILD.bazel", ) diff --git a/third_party/cpuinfo/workspace.bzl b/third_party/cpuinfo/workspace.bzl index 922ab022486..e7aff433892 100644 --- a/third_party/cpuinfo/workspace.bzl +++ b/third_party/cpuinfo/workspace.bzl @@ -5,11 +5,11 @@ load("//third_party:repo.bzl", "third_party_http_archive") def repo(): third_party_http_archive( name = "cpuinfo", - strip_prefix = "cpuinfo-0cc563acb9baac39f2c1349bc42098c4a1da59e3", - sha256 = "80625d0b69a3d69b70c2236f30db2c542d0922ccf9bb51a61bc39c49fac91a35", + strip_prefix = "cpuinfo-19b9316c71e4e45b170a664bf62ddefd7ac9feb5", + sha256 = "e0a485c072de957668eb324c49d726dc0fd736cfb9436b334325f20d93085003", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/pytorch/cpuinfo/archive/0cc563acb9baac39f2c1349bc42098c4a1da59e3.tar.gz", - "https://github.com/pytorch/cpuinfo/archive/0cc563acb9baac39f2c1349bc42098c4a1da59e3.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/pytorch/cpuinfo/archive/19b9316c71e4e45b170a664bf62ddefd7ac9feb5.zip", + "https://github.com/pytorch/cpuinfo/archive/19b9316c71e4e45b170a664bf62ddefd7ac9feb5.zip", ], build_file = "//third_party/cpuinfo:BUILD.bazel", ) diff --git a/third_party/psimd/workspace.bzl b/third_party/psimd/workspace.bzl index 03d010c3db8..768fd6da839 100644 --- a/third_party/psimd/workspace.bzl +++ b/third_party/psimd/workspace.bzl @@ -5,11 +5,11 @@ load("//third_party:repo.bzl", "third_party_http_archive") def repo(): third_party_http_archive( name = "psimd", - strip_prefix = "psimd-85427dd4c8521cc037a1ffa6fcd25c55fafc8a00", - sha256 = "db23c2bc4a58d6f40c181797e43103300edac7cf9d286ca81590543f66ab95d2", + strip_prefix = "psimd-072586a71b55b7f8c584153d223e95687148a900", + sha256 = "dc615342bcbe51ca885323e51b68b90ed9bb9fa7df0f4419dbfa0297d5e837b7", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/psimd/archive/85427dd4c8521cc037a1ffa6fcd25c55fafc8a00.zip", - "https://github.com/Maratyszcza/psimd/archive/85427dd4c8521cc037a1ffa6fcd25c55fafc8a00.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip", + "https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip", ], build_file = "//third_party/psimd:BUILD.bazel", ) From c068a625c5bc4f56f8ef683353afd66e8e7064cf Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Tue, 26 May 2020 12:51:59 -0700 Subject: [PATCH 1134/1533] In tensorflow/core/util/, introduce a IncrementalBarrier library. 
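
Roughly, the call pattern the new library supports looks like this (a hedged, self-contained sketch based on the usage comment in the new incremental_barrier.h below; the two-task loop and the RunTasksThenDone name are illustrative, not part of the change):

  #include <functional>
  #include <thread>
  #include <vector>

  #include "tensorflow/core/util/incremental_barrier.h"

  void RunTasksThenDone(std::function<void()> done) {
    std::vector<std::thread> workers;
    {
      tensorflow::IncrementalBarrier barrier(done);
      for (int i = 0; i < 2; ++i) {
        // Each Inc() registers one pending task; the returned callback must
        // eventually be invoked, or `done` will never run.
        tensorflow::IncrementalBarrier::BarrierCallback signal = barrier.Inc();
        workers.emplace_back([signal] {
          // ... do the actual work here ...
          signal();
        });
      }
    }  // Destroying `barrier` releases the reference it has held since construction.
    // `done` runs only after the destructor above and both signal() calls.
    for (std::thread& w : workers) {
      w.join();
    }
  }

Unlike a plain barrier closure, callers never have to pre-compute the number of participating tasks; Inc() may be called any number of times before the barrier goes out of scope.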
PiperOrigin-RevId: 313250473 Change-Id: I0cbb2d263d1639b1ea444b05ae7f5ea29fa252ce --- tensorflow/core/util/BUILD | 24 ++++ tensorflow/core/util/incremental_barrier.cc | 64 +++++++++ tensorflow/core/util/incremental_barrier.h | 81 +++++++++++ .../core/util/incremental_barrier_test.cc | 133 ++++++++++++++++++ 4 files changed, 302 insertions(+) create mode 100644 tensorflow/core/util/incremental_barrier.cc create mode 100644 tensorflow/core/util/incremental_barrier.h create mode 100644 tensorflow/core/util/incremental_barrier_test.cc diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD index de2dce9c0c2..8e878c2464d 100644 --- a/tensorflow/core/util/BUILD +++ b/tensorflow/core/util/BUILD @@ -505,6 +505,16 @@ cc_library( ], ) +cc_library( + name = "incremental_barrier", + srcs = ["incremental_barrier.cc"], + hdrs = ["incremental_barrier.h"], + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/functional:bind_front", + ], +) + # Tests. tf_cc_test( @@ -632,6 +642,20 @@ tf_cc_test( ], ) +tf_cc_test( + name = "incremental_barrier_test", + srcs = ["incremental_barrier_test.cc"], + deps = [ + ":incremental_barrier", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/platform", + "@com_google_absl//absl/functional:bind_front", + "@com_google_absl//absl/time", + ], +) + # Proto libraries. tf_proto_library( name = "test_log_proto_impl", diff --git a/tensorflow/core/util/incremental_barrier.cc b/tensorflow/core/util/incremental_barrier.cc new file mode 100644 index 00000000000..cbea7f25cc5 --- /dev/null +++ b/tensorflow/core/util/incremental_barrier.cc @@ -0,0 +1,64 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/util/incremental_barrier.h" + +#include +#include + +#include "absl/functional/bind_front.h" +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +class InternalIncrementalBarrier { + public: + explicit InternalIncrementalBarrier(IncrementalBarrier::DoneCallback callback) + : left_(1), done_callback_(std::move(callback)) {} + + void operator()() { + DCHECK_GE(left_.load(std::memory_order_relaxed), 0); + + if (left_.fetch_sub(1, std::memory_order_acq_rel) - 1 == 0) { + IncrementalBarrier::DoneCallback done_callback = + std::move(done_callback_); + delete this; + done_callback(); + } + } + + IncrementalBarrier::BarrierCallback Inc() { + left_.fetch_add(1, std::memory_order_acq_rel); + + // std::bind_front is only available ever since C++20. 
+ return absl::bind_front(&InternalIncrementalBarrier::operator(), this); + } + + private: + std::atomic left_; + IncrementalBarrier::DoneCallback done_callback_; +}; + +IncrementalBarrier::IncrementalBarrier(DoneCallback done_callback) + : internal_barrier_( + new InternalIncrementalBarrier(std::move(done_callback))) {} + +IncrementalBarrier::~IncrementalBarrier() { (*internal_barrier_)(); } + +IncrementalBarrier::BarrierCallback IncrementalBarrier::Inc() { + return internal_barrier_->Inc(); +} + +} // namespace tensorflow diff --git a/tensorflow/core/util/incremental_barrier.h b/tensorflow/core/util/incremental_barrier.h new file mode 100644 index 00000000000..be45e9d4d8b --- /dev/null +++ b/tensorflow/core/util/incremental_barrier.h @@ -0,0 +1,81 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_INCREMENTAL_BARRIER_H_ +#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_INCREMENTAL_BARRIER_H_ + +#include +#include + +namespace tensorflow { + +class InternalIncrementalBarrier; + +// BarrierClosure (see +// https://github.com/chromium/chromium/blob/master/base/barrier_closure.h) +// executes a callback after it has been invoked |num_closures| times. +// Plus, `BarrierClosure` is a continuation-passing style abstraction and self- +// deleting. + +// IncrementalBarrier is a convenience class to be used in place of a barrier +// closure, which is particularly helpful (e.g. simplify code) because callers +// don't need to calculate the |num_closures| beforehand. +// +// Example Usage: +// void MakeCalls() { +// typedef std::function Callback; +// typedef std::function OtherCallback; +// Callback done_callback = ... +// OtherCallback cb1 = ... +// OtherCallback cb2 = ... +// std::thread threads[2]; +// { +// IncrementalBarrier barrier(done_callback); +// threads[0] = std::thread(cb1(barrier.Inc()); +// threads[1] = std::thread(cb2(barrier.Inc()); +// ... at this moment, `barrier` is incremented twice, and then +// destructed.... +// } +// threads[0].join(); +// threads[1].join(); +// } +// +// `done_callback` will be called when both conditions are true: +// 1) after `barrier` is destructed. +// 2) Each `BarrierCallback` returned by `Inc` is called. +// This class is thread-safe. +class IncrementalBarrier { + public: + typedef std::function DoneCallback; + typedef std::function BarrierCallback; + explicit IncrementalBarrier(DoneCallback callback); + + ~IncrementalBarrier(); + + // Returns a BarrierCallback (std::function) that individual task call to + // signal its completeness. + // The returned BarrierCallback outlives this `IncrementalBarrier` instance. + // Furthermore, each task should eventually call the returned function, or + // else done_callback wouldn't be called. + BarrierCallback Inc(); + + private: + // self-deleting, thereby not owned by 'IncrementalBarrier'. 
+ InternalIncrementalBarrier* internal_barrier_; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_INCREMENTAL_BARRIER_H_ diff --git a/tensorflow/core/util/incremental_barrier_test.cc b/tensorflow/core/util/incremental_barrier_test.cc new file mode 100644 index 00000000000..020cb9ece32 --- /dev/null +++ b/tensorflow/core/util/incremental_barrier_test.cc @@ -0,0 +1,133 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/util/incremental_barrier.h" + +#include + +#include "absl/functional/bind_front.h" +#include "absl/time/time.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/threadpool.h" + +namespace tensorflow { +namespace { + +// A thread-safe counter class. +class Counter { + public: + void Increment() TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + ++count_; + } + + int GetCount() TF_LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + return count_; + } + + private: + mutex mu_; + int count_ = 0; +}; + +TEST(IncrementalBarrierTest, RunInstantlyWhenZeroClosure) { + Counter counter; + EXPECT_EQ(counter.GetCount(), 0); + { + IncrementalBarrier::DoneCallback done_callback = + absl::bind_front(&Counter::Increment, &counter); + IncrementalBarrier barrier(done_callback); + EXPECT_EQ(counter.GetCount(), 0); + } + EXPECT_EQ(counter.GetCount(), 1); +} + +TEST(IncrementalBarrierTest, RunAfterNumClosuresOneNowTwoLater) { + Counter counter; + + IncrementalBarrier::BarrierCallback bc1, bc2; + { + IncrementalBarrier::DoneCallback done_callback = + absl::bind_front(&Counter::Increment, &counter); + IncrementalBarrier barrier(done_callback); + + CHECK_EQ(counter.GetCount(), 0); + + bc1 = barrier.Inc(); + bc2 = barrier.Inc(); + + IncrementalBarrier::BarrierCallback bc3 = barrier.Inc(); + bc3(); + + CHECK_EQ(counter.GetCount(), 0); + } + + CHECK_EQ(counter.GetCount(), 0); + bc1(); + CHECK_EQ(counter.GetCount(), 0); + bc2(); + CHECK_EQ(counter.GetCount(), 1); +} + +TEST(IncrementalBarrierTest, RunAfterNumClosuresConcurrency) { + const int num_closure = 100, num_thread = 2; + std::atomic schedule_count{0}; + Counter counter; + + { + IncrementalBarrier::DoneCallback done_callback = + absl::bind_front(&Counter::Increment, &counter); + IncrementalBarrier barrier(done_callback); + + CHECK_EQ(counter.GetCount(), 0); + + tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), + "BarrierClosure", num_thread); + for (int i = 0; i < num_closure; ++i) { + pool.Schedule([&barrier, &schedule_count]() { + schedule_count.fetch_add(1); + IncrementalBarrier::BarrierCallback bc = barrier.Inc(); + + Env::Default()->SleepForMicroseconds(100); + bc(); + }); + } + + 
CHECK_EQ(counter.GetCount(), 0); + } + + CHECK_EQ(schedule_count.load(std::memory_order_relaxed), 100); + CHECK_EQ(counter.GetCount(), 1); +} + +#if defined(PLATFORM_GOOGLE) +void BM_FunctionInc(benchmark::State& state) { + IncrementalBarrier barrier([] {}); + for (auto _ : state) { + barrier.Inc()(); + } +} + +BENCHMARK(BM_FunctionInc); +#endif // PLATFORM_GOOGLE + +} // namespace +} // namespace tensorflow From 15bf2a7e76087abbaeb845f432632c0af74b4632 Mon Sep 17 00:00:00 2001 From: Yanhua Sun Date: Tue, 26 May 2020 13:07:22 -0700 Subject: [PATCH 1135/1533] tf.function invocation optimization, remove redundant list converter The input is already a list, remove redundant convert, which is also expensive PiperOrigin-RevId: 313253733 Change-Id: I0a50c04fbf4416ae6ca71fe5d147b4d56b129641 --- tensorflow/python/eager/function.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 97708f056c2..ce495d772d0 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -1831,9 +1831,9 @@ class ConcreteFunction(object): `args` and `kwargs`. """ return self._call_flat( - (t for t in nest.flatten((args, kwargs), expand_composites=True) + [t for t in nest.flatten((args, kwargs), expand_composites=True) if isinstance(t, (ops.Tensor, - resource_variable_ops.BaseResourceVariable))), + resource_variable_ops.BaseResourceVariable))], captured_inputs=self.captured_inputs, cancellation_manager=cancellation_manager) @@ -1854,7 +1854,6 @@ class ConcreteFunction(object): Raises: ValueError: If `args` contains anything other than Tensors or Variables. """ - args = list(args) ctx = context.context() executing_eagerly = ctx.executing_eagerly() From bb34d65cd7e435065967f5089e2b7f12bf619aa6 Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Tue, 26 May 2020 13:15:12 -0700 Subject: [PATCH 1136/1533] Speed up creation of visualizer html page for TensorFlow Lite. Use the NumPy functionality of the object based flatbuffer API. This speeds up a model that took 15 minutes to visualize. PiperOrigin-RevId: 313255207 Change-Id: Ic9d43cbd97c6d5026d903ee947a0a56a0732f150 --- tensorflow/lite/tools/BUILD | 5 ++++- tensorflow/lite/tools/visualize.py | 34 ++++++++++++++++++++---------- 2 files changed, 27 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD index a96c1c3ede3..6ae5c1dda18 100644 --- a/tensorflow/lite/tools/BUILD +++ b/tensorflow/lite/tools/BUILD @@ -17,7 +17,10 @@ py_binary( srcs = ["visualize.py"], python_version = "PY3", srcs_version = "PY2AND3", - deps = ["//tensorflow/lite/python:schema_py"], + deps = [ + "//tensorflow/lite/python:schema_py", + "//third_party/py/numpy", + ], ) py_test( diff --git a/tensorflow/lite/tools/visualize.py b/tensorflow/lite/tools/visualize.py index 1f89f9c5448..3d22d1bb05b 100644 --- a/tensorflow/lite/tools/visualize.py +++ b/tensorflow/lite/tools/visualize.py @@ -28,6 +28,7 @@ import json import os import re import sys +import numpy as np from tensorflow.lite.python import schema_py_generated as schema_fb @@ -377,23 +378,34 @@ def CamelCaseToSnakeCase(camel_case_input): return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower() -def FlatbufferToDict(fb): - """Converts a hierarchy of FB objects into a nested dict.""" - if hasattr(fb, "__dict__"): +def FlatbufferToDict(fb, preserve_as_numpy): + """Converts a hierarchy of FB objects into a nested dict. 
+ + We avoid transforming big parts of the flat buffer into python arrays. This + speeds conversion from ten minutes to a few seconds on big graphs. + + Args: + fb: a flat buffer structure. (i.e. ModelT) + preserve_as_numpy: true if all downstream np.arrays should be preserved. + false if all downstream np.array should become python arrays + Returns: + A dictionary representing the flatbuffer rather than a flatbuffer object. + """ + if isinstance(fb, int) or isinstance(fb, float) or isinstance(fb, str): + return fb + elif hasattr(fb, "__dict__"): result = {} for attribute_name in dir(fb): attribute = fb.__getattribute__(attribute_name) if not callable(attribute) and attribute_name[0] != "_": snake_name = CamelCaseToSnakeCase(attribute_name) - result[snake_name] = FlatbufferToDict(attribute) + preserve = True if attribute_name == "buffers" else preserve_as_numpy + result[snake_name] = FlatbufferToDict(attribute, preserve) return result - elif isinstance(fb, str): - return fb + elif isinstance(fb, np.ndarray): + return fb if preserve_as_numpy else fb.tolist() elif hasattr(fb, "__len__"): - result = [] - for entry in fb: - result.append(FlatbufferToDict(entry)) - return result + return [FlatbufferToDict(entry, preserve_as_numpy) for entry in fb] else: return fb @@ -401,7 +413,7 @@ def FlatbufferToDict(fb): def CreateDictFromFlatbuffer(buffer_data): model_obj = schema_fb.Model.GetRootAsModel(buffer_data, 0) model = schema_fb.ModelT.InitFromObj(model_obj) - return FlatbufferToDict(model) + return FlatbufferToDict(model, preserve_as_numpy=False) def CreateHtmlFile(tflite_input, html_output): From e7cc47384f4d57cc04ec550dbc2c08e467e42a4a Mon Sep 17 00:00:00 2001 From: Anudhyan Boral Date: Tue, 26 May 2020 13:15:31 -0700 Subject: [PATCH 1137/1533] [TF:XLA] Small change in tf2xla matmul to use BatchDot instead of Transpose + Dot. This has the advantage that we can more easily detect symmetric matmuls (e.g. A * At) before the algebraic simplifier passes. BatchDot simply moves around contract_dims instead of adding a Transpose op. Benchmarks (JF) --------------- Summary of changes: Compile time 0.99x geomean, range [ 0.80x, 1.58x], 1.00x arith mean Host memory 1.00x geomean, range [ 0.77x, 1.25x] SMEM usage 1.00x geomean, range [ 0.98x, 1.02x] Benchmark runtime 1.00x geomean, range [ 0.99x, 2.43x] No changes after rounding in HBM usage, VMEM usage, Bundle count, Overlay wait time, Static throttling PiperOrigin-RevId: 313255256 Change-Id: I13d781161fad9d685c7bfcb96e511130b2b9e182 --- tensorflow/compiler/tf2xla/kernels/matmul_op.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc index a3fcb4d4b8f..bd6f58453df 100644 --- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -81,9 +82,7 @@ class MatMulOp : public XlaOpKernel { b = xla::ConvertElementType(b, xla::F32); } } - auto lhs = (transpose_a_) ? xla::Transpose(a, {1, 0}) : a; - auto rhs = (transpose_b_) ? 
xla::Transpose(b, {1, 0}) : b; - ctx->SetOutput(0, xla::Dot(lhs, rhs)); + ctx->SetOutput(0, xla::BatchDot(a, transpose_a_, b, transpose_b_)); } private: From ee53e70b81c5866465ad41766d28ef6333093452 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Tue, 26 May 2020 13:22:05 -0700 Subject: [PATCH 1138/1533] Converted speech example to int8 model PiperOrigin-RevId: 313256375 Change-Id: I716b53e8f663dbbd0d78701996f416ed122d1f23 --- .../examples/micro_speech/feature_provider.cc | 8 +- .../examples/micro_speech/feature_provider.h | 4 +- .../feature_provider_mock_test.cc | 4 +- .../micro_speech/feature_provider_test.cc | 2 +- .../examples/micro_speech/main_functions.cc | 31 +- .../micro_features_generator.cc | 28 +- .../micro_features/micro_features_generator.h | 2 +- .../micro_features_generator_test.cc | 21 +- .../micro_speech/micro_features/model.cc | 3080 +++++++++-------- .../micro_features/no_feature_data_slice.cc | 8 +- .../micro_features/no_feature_data_slice.h | 2 +- .../micro_features/no_micro_features_data.cc | 311 +- .../micro_features/no_micro_features_data.h | 2 +- .../micro_features/yes_feature_data_slice.cc | 8 +- .../micro_features/yes_feature_data_slice.h | 2 +- .../micro_features/yes_micro_features_data.cc | 311 +- .../micro_features/yes_micro_features_data.h | 2 +- .../micro_speech/micro_speech_test.cc | 53 +- .../micro_speech/recognize_commands.cc | 12 +- .../micro_speech/recognize_commands.h | 8 +- .../micro_speech/recognize_commands_test.cc | 32 +- 21 files changed, 2021 insertions(+), 1910 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/feature_provider.cc b/tensorflow/lite/micro/examples/micro_speech/feature_provider.cc index 7d917085845..fc2b1420a89 100644 --- a/tensorflow/lite/micro/examples/micro_speech/feature_provider.cc +++ b/tensorflow/lite/micro/examples/micro_speech/feature_provider.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include "tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h" #include "tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h" -FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data) +FeatureProvider::FeatureProvider(int feature_size, int8_t* feature_data) : feature_size_(feature_size), feature_data_(feature_data), is_first_run_(true) { @@ -77,10 +77,10 @@ TfLiteStatus FeatureProvider::PopulateFeatureData( // +-----------+ +-----------+ if (slices_to_keep > 0) { for (int dest_slice = 0; dest_slice < slices_to_keep; ++dest_slice) { - uint8_t* dest_slice_data = + int8_t* dest_slice_data = feature_data_ + (dest_slice * kFeatureSliceSize); const int src_slice = dest_slice + slices_to_drop; - const uint8_t* src_slice_data = + const int8_t* src_slice_data = feature_data_ + (src_slice * kFeatureSliceSize); for (int i = 0; i < kFeatureSliceSize; ++i) { dest_slice_data[i] = src_slice_data[i]; @@ -106,7 +106,7 @@ TfLiteStatus FeatureProvider::PopulateFeatureData( audio_samples_size, kMaxAudioSampleSize); return kTfLiteError; } - uint8_t* new_slice_data = feature_data_ + (new_slice * kFeatureSliceSize); + int8_t* new_slice_data = feature_data_ + (new_slice * kFeatureSliceSize); size_t num_samples_read; TfLiteStatus generate_status = GenerateMicroFeatures( error_reporter, audio_samples, audio_samples_size, kFeatureSliceSize, diff --git a/tensorflow/lite/micro/examples/micro_speech/feature_provider.h b/tensorflow/lite/micro/examples/micro_speech/feature_provider.h index fc634ec108d..d086e013dc3 100644 --- a/tensorflow/lite/micro/examples/micro_speech/feature_provider.h +++ b/tensorflow/lite/micro/examples/micro_speech/feature_provider.h @@ -32,7 +32,7 @@ class FeatureProvider { // remain accessible for the lifetime of the provider object, since subsequent // calls will fill it with feature data. The provider does no memory // management of this data. - FeatureProvider(int feature_size, uint8_t* feature_data); + FeatureProvider(int feature_size, int8_t* feature_data); ~FeatureProvider(); // Fills the feature data with information from audio inputs, and returns how @@ -43,7 +43,7 @@ class FeatureProvider { private: int feature_size_; - uint8_t* feature_data_; + int8_t* feature_data_; // Make sure we don't try to use cached information if this is the first call // into the provider. 
bool is_first_run_; diff --git a/tensorflow/lite/micro/examples/micro_speech/feature_provider_mock_test.cc b/tensorflow/lite/micro/examples/micro_speech/feature_provider_mock_test.cc index 6dcf3da9a3f..aae556bf6e0 100644 --- a/tensorflow/lite/micro/examples/micro_speech/feature_provider_mock_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/feature_provider_mock_test.cc @@ -27,7 +27,7 @@ TF_LITE_MICRO_TEST(TestFeatureProviderMockYes) { tflite::MicroErrorReporter micro_error_reporter; tflite::ErrorReporter* error_reporter = µ_error_reporter; - uint8_t feature_data[kFeatureElementCount]; + int8_t feature_data[kFeatureElementCount]; FeatureProvider feature_provider(kFeatureElementCount, feature_data); int how_many_new_slices = 0; @@ -47,7 +47,7 @@ TF_LITE_MICRO_TEST(TestFeatureProviderMockNo) { tflite::MicroErrorReporter micro_error_reporter; tflite::ErrorReporter* error_reporter = µ_error_reporter; - uint8_t feature_data[kFeatureElementCount]; + int8_t feature_data[kFeatureElementCount]; FeatureProvider feature_provider(kFeatureElementCount, feature_data); int how_many_new_slices = 0; diff --git a/tensorflow/lite/micro/examples/micro_speech/feature_provider_test.cc b/tensorflow/lite/micro/examples/micro_speech/feature_provider_test.cc index 8e0e1f47d15..5d6816a91e4 100644 --- a/tensorflow/lite/micro/examples/micro_speech/feature_provider_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/feature_provider_test.cc @@ -26,7 +26,7 @@ TF_LITE_MICRO_TEST(TestFeatureProvider) { tflite::MicroErrorReporter micro_error_reporter; tflite::ErrorReporter* error_reporter = µ_error_reporter; - uint8_t feature_data[kFeatureElementCount]; + int8_t feature_data[kFeatureElementCount]; FeatureProvider feature_provider(kFeatureElementCount, feature_data); int how_many_new_slices = 0; diff --git a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc index d3989c07333..e5e6aa7c1f7 100644 --- a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc +++ b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc @@ -43,8 +43,8 @@ int32_t previous_time = 0; // determined by experimentation. constexpr int kTensorArenaSize = 10 * 1024; uint8_t tensor_arena[kTensorArenaSize]; -uint8_t feature_buffer[kFeatureElementCount]; -uint8_t* model_input_buffer = nullptr; +int8_t feature_buffer[kFeatureElementCount]; +int8_t* model_input_buffer = nullptr; } // namespace // The name of this function is important for Arduino compatibility. 
@@ -74,19 +74,28 @@ void setup() { // // tflite::ops::micro::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroOpResolver<3> micro_op_resolver(error_reporter); + static tflite::MicroOpResolver<4> micro_op_resolver(error_reporter); if (micro_op_resolver.AddBuiltin( tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()) != kTfLiteOk) { + tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), + tflite::MicroOpResolverAnyVersion()) != kTfLiteOk) { return; } if (micro_op_resolver.AddBuiltin( tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()) != kTfLiteOk) { + tflite::ops::micro::Register_FULLY_CONNECTED(), + tflite::MicroOpResolverAnyVersion()) != kTfLiteOk) { return; } if (micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()) != + tflite::ops::micro::Register_SOFTMAX(), + tflite::MicroOpResolverAnyVersion()) != + kTfLiteOk) { + return; + } + if (micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, + tflite::ops::micro::Register_RESHAPE(), + tflite::MicroOpResolverAnyVersion()) != kTfLiteOk) { return; } @@ -105,15 +114,15 @@ void setup() { // Get information about the memory area to use for the model's input. model_input = interpreter->input(0); - if ((model_input->dims->size != 4) || (model_input->dims->data[0] != 1) || - (model_input->dims->data[1] != kFeatureSliceCount) || - (model_input->dims->data[2] != kFeatureSliceSize) || - (model_input->type != kTfLiteUInt8)) { + if ((model_input->dims->size != 2) || (model_input->dims->data[0] != 1) || + (model_input->dims->data[1] != + (kFeatureSliceCount * kFeatureSliceSize)) || + (model_input->type != kTfLiteInt8)) { TF_LITE_REPORT_ERROR(error_reporter, "Bad input tensor parameters in model"); return; } - model_input_buffer = model_input->data.uint8; + model_input_buffer = model_input->data.int8; // Prepare to access the audio spectrograms from a microphone or other source // that will provide the inputs to the neural network. diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.cc b/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.cc index 6a01124ed86..fbb6e6e4a9f 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.cc @@ -69,7 +69,7 @@ void SetMicroFeaturesNoiseEstimates(const uint32_t* estimate_presets) { TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter, const int16_t* input, int input_size, - int output_size, uint8_t* output, + int output_size, int8_t* output, size_t* num_samples_read) { const int16_t* frontend_input; if (g_is_first_time) { @@ -84,16 +84,30 @@ TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter, for (int i = 0; i < frontend_output.size; ++i) { // These scaling values are derived from those used in input_data.py in the // training pipeline. - constexpr int32_t value_scale = (10 * 255); - constexpr int32_t value_div = (256 * 26); + // The feature pipeline outputs 16-bit signed integers in roughly a 0 to 670 + // range. In training, these are then arbitrarily divided by 25.6 to get + // float values in the rough range of 0.0 to 26.0. This scaling is performed + // for historical reasons, to match up with the output of other feature + // generators. 
+    // The process is then further complicated when we quantize the model. This
+    // means we have to scale the 0.0 to 26.0 real values to the -128 to 127
+    // signed integer numbers.
+    // All this means that to get matching values from our integer feature
+    // output into the tensor input, we have to perform:
+    // input = (((feature / 25.6) / 26.0) * 256) - 128
+    // To simplify this and perform it in 32-bit integer math, we rearrange to:
+    // input = (feature * 256) / (25.6 * 26.0) - 128
+    constexpr int32_t value_scale = 256;
+    constexpr int32_t value_div = static_cast<int32_t>((25.6f * 26.0f) + 0.5f);
     int32_t value = ((frontend_output.values[i] * value_scale) +
                      (value_div / 2)) /
                     value_div;
-    if (value < 0) {
-      value = 0;
+    value -= 128;
+    if (value < -128) {
+      value = -128;
     }
-    if (value > 255) {
-      value = 255;
+    if (value > 127) {
+      value = 127;
     }
     output[i] = value;
   }
diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h b/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h
index 7b9bc5faec8..29304239332 100644
--- a/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h
+++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator.h
@@ -26,7 +26,7 @@ TfLiteStatus InitializeMicroFeatures(tflite::ErrorReporter* error_reporter);
 // feeding into a neural network.
 TfLiteStatus GenerateMicroFeatures(tflite::ErrorReporter* error_reporter,
                                    const int16_t* input, int input_size,
-                                   int output_size, uint8_t* output,
+                                   int output_size, int8_t* output,
                                    size_t* num_samples_read);
 
 #endif  // TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_MICRO_FEATURES_GENERATOR_H_
diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc b/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
index f88f12a5562..ee3ee03763f 100644
--- a/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
+++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_features_generator_test.cc
@@ -48,7 +48,7 @@ TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorYes) {
   };
   SetMicroFeaturesNoiseEstimates(yes_estimate_presets);
 
-  uint8_t yes_calculated_data[g_yes_feature_data_slice_size];
+  int8_t yes_calculated_data[g_yes_feature_data_slice_size];
   size_t num_samples_read;
   TfLiteStatus yes_status = GenerateMicroFeatures(
       error_reporter, g_yes_30ms_sample_data, g_yes_30ms_sample_data_size,
@@ -56,11 +56,12 @@ TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorYes) {
   TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, yes_status);
 
   for (int i = 0; i < g_yes_feature_data_slice_size; ++i) {
-    TF_LITE_MICRO_EXPECT_EQ(g_yes_feature_data_slice[i],
-                            yes_calculated_data[i]);
-    if (g_yes_feature_data_slice[i] != yes_calculated_data[i]) {
+    const int expected = g_yes_feature_data_slice[i];
+    const int actual = yes_calculated_data[i];
+    TF_LITE_MICRO_EXPECT_EQ(expected, actual);
+    if (expected != actual) {
       TF_LITE_REPORT_ERROR(error_reporter, "Expected value %d but found %d",
-                           g_yes_feature_data_slice[i], yes_calculated_data[i]);
+                           expected, actual);
     }
   }
 }
@@ -81,7 +82,7 @@ TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorNo) {
   };
   SetMicroFeaturesNoiseEstimates(no_estimate_presets);
 
-  uint8_t no_calculated_data[g_no_feature_data_slice_size];
+  int8_t no_calculated_data[g_no_feature_data_slice_size];
   size_t num_samples_read;
   TfLiteStatus no_status = GenerateMicroFeatures(
       error_reporter, g_no_30ms_sample_data,
g_no_30ms_sample_data_size, @@ -89,10 +90,12 @@ TF_LITE_MICRO_TEST(TestMicroFeaturesGeneratorNo) { TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, no_status); for (int i = 0; i < g_no_feature_data_slice_size; ++i) { - TF_LITE_MICRO_EXPECT_EQ(g_no_feature_data_slice[i], no_calculated_data[i]); - if (g_no_feature_data_slice[i] != no_calculated_data[i]) { + const int expected = g_no_feature_data_slice[i]; + const int actual = no_calculated_data[i]; + TF_LITE_MICRO_EXPECT_EQ(expected, actual); + if (expected != actual) { TF_LITE_REPORT_ERROR(error_reporter, "Expected value %d but found %d", - g_no_feature_data_slice[i], no_calculated_data[i]); + expected, actual); } } } diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc b/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc index 45198c781b2..d1e797fcf7d 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc @@ -33,1528 +33,1564 @@ limitations under the License. #endif const unsigned char g_model[] DATA_ALIGN_ATTRIBUTE = { - 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x12, 0x00, - 0x1c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, - 0x00, 0x00, 0x18, 0x00, 0x12, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x1c, 0x47, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, - 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0xc0, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, - 0x0f, 0x00, 0x00, 0x00, 0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e, - 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x00, 0x0a, 0x00, 0x00, 0x00, - 0x60, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, - 0x3c, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, - 0x20, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x0e, 0xba, 0xff, 0xff, 0x38, 0x00, 0x00, 0x00, - 0xbc, 0xb9, 0xff, 0xff, 0xc0, 0xb9, 0xff, 0xff, 0x1e, 0xba, 0xff, 0xff, - 0xe0, 0x01, 0x00, 0x00, 0xcc, 0xb9, 0xff, 0xff, 0xd0, 0xb9, 0xff, 0xff, - 0x2e, 0xba, 0xff, 0xff, 0x60, 0x03, 0x00, 0x00, 0x36, 0xba, 0xff, 0xff, - 0x7c, 0x06, 0x00, 0x00, 0x3e, 0xba, 0xff, 0xff, 0x68, 0x45, 0x00, 0x00, - 0xec, 0xb9, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, - 0x30, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74, - 0x69, 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00, - 0x10, 0xfa, 0xff, 0xff, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x2c, 0x45, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x05, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x9c, 0x44, 0x00, 0x00, - 0x8c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0xdc, 0x01, 0x00, 0x00, - 0x68, 0x01, 0x00, 0x00, 0x3c, 0x02, 0x00, 0x00, 0x50, 0x05, 0x00, 0x00, - 0x8e, 0xbb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x00, 0x00, - 0x08, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, - 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f, 0x32, 0x00, 0x00, 0x00, - 0x94, 0xfa, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 
0x00, + 0x20, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x12, 0x00, 0x1c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, + 0x10, 0x00, 0x14, 0x00, 0x00, 0x00, 0x18, 0x00, 0x12, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x94, 0x48, 0x00, 0x00, 0x34, 0x42, 0x00, 0x00, + 0x1c, 0x42, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, + 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, + 0x72, 0x75, 0x6e, 0x74, 0x69, 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, + 0x69, 0x6f, 0x6e, 0x00, 0x0c, 0x00, 0x00, 0x00, 0xd4, 0x41, 0x00, 0x00, + 0xb4, 0x41, 0x00, 0x00, 0x24, 0x03, 0x00, 0x00, 0xf4, 0x02, 0x00, 0x00, + 0xec, 0x02, 0x00, 0x00, 0xe4, 0x02, 0x00, 0x00, 0xc4, 0x02, 0x00, 0x00, + 0xbc, 0x02, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0xbd, 0xff, 0xff, + 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, + 0x30, 0x00, 0x00, 0x00, 0x94, 0xba, 0xff, 0xff, 0x98, 0xba, 0xff, 0xff, + 0x32, 0xbd, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, + 0xfa, 0xee, 0x28, 0xc4, 0xee, 0xfe, 0xcf, 0x0f, 0x1e, 0xf7, 0x1f, 0x06, + 0x0d, 0xed, 0xe9, 0x83, 0x5c, 0xc9, 0x18, 0xe3, 0xf9, 0x14, 0x28, 0x2a, + 0x09, 0xf2, 0x18, 0x34, 0x62, 0xea, 0xef, 0xd6, 0x36, 0xb7, 0x1e, 0xf7, + 0x3b, 0x22, 0x28, 0x39, 0xc2, 0x9d, 0xf1, 0x07, 0x5e, 0x0b, 0x1e, 0x2c, + 0x07, 0xdd, 0xfd, 0xc3, 0xd8, 0x4a, 0xf3, 0x28, 0xa7, 0x16, 0xd5, 0xf1, + 0xc3, 0x05, 0xfd, 0x27, 0xcc, 0xba, 0x1e, 0xcb, 0xd7, 0x3d, 0xd4, 0x29, + 0x00, 0xfd, 0x28, 0x44, 0xfb, 0xf2, 0xf3, 0xb6, 0x4f, 0xcf, 0x09, 0xf0, + 0xfa, 0x45, 0x41, 0x49, 0x05, 0xc5, 0x17, 0x5d, 0x64, 0x00, 0xf8, 0xee, + 0x48, 0x17, 0xf4, 0xe9, 0x2e, 0x4b, 0x2e, 0x3f, 0xdf, 0xee, 0xe4, 0x08, + 0x38, 0xf1, 0x16, 0x13, 0x2f, 0x2a, 0xed, 0xc2, 0xbf, 0x36, 0xf4, 0x02, + 0xcf, 0xaa, 0xd2, 0xfa, 0xac, 0x13, 0xf6, 0xe8, 0xb5, 0x68, 0x12, 0xb6, + 0xce, 0x0e, 0xdf, 0x58, 0xe4, 0x49, 0x14, 0x15, 0x03, 0xed, 0xfa, 0xd4, + 0x40, 0xa7, 0xf6, 0xca, 0xfb, 0x00, 0x4d, 0x5e, 0xe4, 0x55, 0x1d, 0x30, + 0x45, 0xe2, 0xfc, 0x01, 0x48, 0x81, 0xe9, 0xf1, 0x1e, 0xfc, 0x21, 0x32, + 0xed, 0x4b, 0xed, 0xfa, 0x2f, 0xd2, 0xfa, 0xfb, 0x4d, 0xa7, 0xed, 0xc7, + 0x92, 0xdf, 0xe6, 0xdb, 0xf8, 0x1f, 0xd9, 0xfa, 0x91, 0xf5, 0xe5, 0xc5, + 0x8c, 0x17, 0x0f, 0xb9, 0xd2, 0xc7, 0xfe, 0x68, 0xd3, 0x51, 0x2e, 0x49, + 0x1f, 0xbd, 0x01, 0xeb, 0x31, 0x17, 0xf0, 0xef, 0xff, 0xb8, 0x5d, 0x62, + 0x02, 0x0f, 0x1f, 0x78, 0x6a, 0xb0, 0xf9, 0xfe, 0x4f, 0xcc, 0xd3, 0xff, + 0x0a, 0x96, 0x1e, 0x2c, 0xed, 0xbc, 0xf4, 0x0b, 0x42, 0xc8, 0xf1, 0xea, + 0x6e, 0x58, 0xec, 0xc4, 0x99, 0xae, 0xdc, 0xd7, 0x12, 0x87, 0xd8, 0x06, + 0xa2, 0xc2, 0xe6, 0xa2, 0x81, 0x24, 0xe9, 0xac, 0xce, 0xb6, 0x15, 0x6b, + 0xba, 0x00, 0x19, 0x58, 0x29, 0xb6, 0xfe, 0x01, 0x25, 0x96, 0xd2, 0xec, + 0x0e, 0x9c, 0x60, 0x5f, 0xe9, 0xf4, 0xf5, 0x69, 0x6b, 0xb5, 0xe1, 0xf6, + 0x5e, 0xb7, 0xb1, 0xe5, 0x11, 0x9b, 0x18, 0x10, 0xe3, 0xe1, 0xe0, 0x0d, + 0x4f, 0xa5, 0xde, 0xe5, 0x6f, 0xe2, 0xfb, 0x99, 0x82, 0xa5, 0xc9, 0xb6, + 0x1f, 0x46, 0xf3, 0x04, 0xc6, 0xca, 0xd6, 0x97, 0x90, 0x1d, 0xc0, 0x95, + 0xf0, 0x19, 0x30, 0x77, 0xc2, 0x3c, 0xfa, 0x24, 0x02, 0x4d, 0x06, 0x07, + 0x15, 0x02, 0xb0, 0xe7, 0x27, 0x22, 0x67, 0x4d, 0xf1, 0xc2, 0xf4, 0x64, + 0x38, 0x40, 0xdf, 0xf6, 0x3a, 0x43, 0xb8, 0xe1, 0x0d, 0x15, 0x11, 0xfe, + 0xf5, 0xec, 0xf9, 0xe5, 0x22, 0x36, 0xe4, 0xfd, 0x6d, 0xbf, 0x0d, 
0x8e, + 0xb7, 0x15, 0xbf, 0x9f, 0x16, 0xad, 0x0a, 0x02, 0x8e, 0x14, 0xda, 0x9b, + 0x8e, 0xc3, 0xa6, 0xca, 0xf5, 0x7f, 0x51, 0x56, 0xc1, 0xb3, 0xd9, 0x35, + 0xf8, 0x7f, 0x04, 0x0a, 0x03, 0x3f, 0xbe, 0xee, 0x19, 0x68, 0x78, 0x50, + 0xf9, 0xa7, 0xf7, 0x7f, 0x1d, 0x76, 0xdb, 0xe8, 0x33, 0xb9, 0xd7, 0xe7, + 0xe8, 0x69, 0x15, 0xf7, 0xf5, 0xb2, 0xfe, 0xe8, 0xf3, 0x5b, 0xe2, 0x06, + 0x6e, 0x09, 0x36, 0xb7, 0xcc, 0x38, 0xbf, 0x8a, 0x28, 0x14, 0x2e, 0x18, + 0xa7, 0x26, 0xcb, 0xb2, 0x95, 0x37, 0xac, 0xcd, 0xd7, 0x51, 0x67, 0x44, + 0xcd, 0x31, 0xde, 0x04, 0xe9, 0x6a, 0x00, 0x13, 0x0a, 0x0c, 0xdd, 0x16, + 0xe0, 0x24, 0x7e, 0x49, 0xf1, 0xb5, 0x04, 0x52, 0x01, 0x50, 0xdd, 0xf5, + 0x26, 0xc9, 0xf4, 0xf8, 0xd6, 0x31, 0x1b, 0xd0, 0xef, 0x03, 0x0a, 0xc0, + 0xd4, 0x4f, 0xe2, 0xfd, 0x72, 0xf4, 0x5a, 0xc9, 0xd7, 0x31, 0xc0, 0x8e, + 0x17, 0x5e, 0x57, 0x00, 0xb4, 0x3a, 0xc8, 0xd2, 0x92, 0x32, 0xcb, 0xd8, + 0xc3, 0xa6, 0x63, 0x26, 0xcf, 0xbc, 0xe8, 0x57, 0x9b, 0xe9, 0xf7, 0x1c, + 0xea, 0x12, 0xf1, 0xf7, 0xdb, 0xb9, 0x7f, 0x16, 0xf6, 0xe0, 0x08, 0x70, + 0xa2, 0xed, 0xcc, 0xf1, 0x1e, 0x10, 0x04, 0xf7, 0xa9, 0xb7, 0x34, 0xaa, + 0x0a, 0xdb, 0x2a, 0xa6, 0xb6, 0x10, 0xea, 0xf8, 0x5e, 0x06, 0x72, 0xdd, + 0xd0, 0xb9, 0xd6, 0xa0, 0x10, 0x9f, 0x5a, 0x17, 0xb1, 0xe7, 0xc0, 0x01, + 0x9d, 0x01, 0xe0, 0xe0, 0xaf, 0x9c, 0x46, 0xd8, 0xaf, 0xe8, 0xce, 0x02, + 0x8a, 0xbb, 0xe4, 0xf6, 0xf3, 0x36, 0x07, 0xca, 0xcb, 0x87, 0x6e, 0xcc, + 0xd6, 0x9e, 0x0a, 0x2a, 0x81, 0xd7, 0xcf, 0xc0, 0x04, 0xeb, 0x24, 0xcc, + 0xc9, 0x95, 0x33, 0x81, 0xf7, 0xad, 0x1c, 0x9c, 0xa4, 0xd6, 0xf9, 0xe6, + 0x3d, 0x84, 0x7f, 0xcc, 0xd4, 0xb0, 0xf4, 0xa2, 0xe9, 0x3c, 0x36, 0xee, + 0xd5, 0xcf, 0xcd, 0x2d, 0x28, 0xbd, 0xff, 0xff, 0xc2, 0xbf, 0xff, 0xff, + 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, + 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x48, 0xbd, 0xff, 0xff, 0x4c, 0xbd, 0xff, 0xff, 0xe6, 0xbf, 0xff, 0xff, + 0x04, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x8a, 0xfe, 0xff, 0xff, + 0xa9, 0x00, 0x00, 0x00, 0xd0, 0xff, 0xff, 0xff, 0xd0, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x4f, 0xfb, 0xff, 0xff, + 0x4a, 0xfd, 0xff, 0xff, 0x12, 0xc0, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, + 0x80, 0x3e, 0x00, 0x00, 0xff, 0xf9, 0xfd, 0x0a, 0x07, 0x08, 0x07, 0x03, + 0x07, 0xf2, 0xd1, 0x09, 0xf0, 0xe9, 0x28, 0x09, 0xdf, 0x05, 0xfa, 0xf0, + 0xe8, 0xe3, 0x13, 0x0e, 0x08, 0xef, 0xd3, 0xee, 0x0f, 0xe8, 0xeb, 0x14, + 0xf7, 0xed, 0xfd, 0x1f, 0xe8, 0xd5, 0xeb, 0xfc, 0x0e, 0xf4, 0xf7, 0x07, + 0x05, 0xea, 0xf6, 0x1f, 0xf8, 0xdb, 0xdc, 0x0b, 0x03, 0xdd, 0xd8, 0xf3, + 0x0f, 0x19, 0xe1, 0x09, 0xfc, 0xe4, 0x02, 0x04, 0xf1, 0x04, 0xeb, 0xf3, + 0x1e, 0x06, 0xfd, 0x11, 0xfc, 0xfa, 0xf6, 0x1f, 0x0f, 0x02, 0xf5, 0xf7, + 0xff, 0x24, 0xdf, 0xf7, 0xf8, 0xf3, 0xf6, 0xe9, 0xef, 0x03, 0xdd, 0xf2, + 0x28, 0xe1, 0xf2, 0x22, 0xf4, 0x09, 0xf7, 0xf9, 0xf0, 0xd4, 0xf9, 0xee, + 0xff, 0x14, 0xda, 0xf3, 0x11, 0xe2, 0xf6, 0x0c, 0xf2, 0xeb, 0xf8, 0xe8, + 0xe3, 0x08, 0x02, 0x17, 0xf4, 0x0b, 0x0c, 0x27, 0xe6, 0x02, 0x03, 0xf9, + 0x14, 0x18, 0xf6, 0xeb, 0x1f, 0x0c, 0xf1, 0xee, 0xfc, 0x08, 0xf0, 0xfe, + 0xfd, 0xee, 0x17, 0xfd, 0x1c, 0xef, 0xfd, 0xde, 0x04, 0x05, 0xf0, 0x31, + 0xfa, 0x0b, 0xdc, 0x0d, 0xed, 0xf5, 0xfa, 0xf4, 0x08, 0x0c, 0xd7, 0x1e, + 0x15, 0x03, 0xf5, 0x02, 0xf4, 0xfb, 0xed, 0x01, 0xfe, 0xd6, 0x1f, 0xfd, + 0xfd, 0x0e, 0xfa, 0x06, 0xf1, 0xf9, 0xe2, 0x16, 0xe9, 0xf1, 0x03, 0x0d, + 0x0d, 0xdf, 0xf9, 0x1a, 0x0e, 0xf6, 0xfc, 0x0a, 0x19, 0xe2, 0xe0, 0x09, + 0x15, 0xf0, 0xf1, 0x06, 0xf1, 0xe1, 0xef, 0x1a, 0x08, 0xe8, 0xfd, 
0x12, + 0x14, 0x06, 0xf1, 0xfc, 0xea, 0xfb, 0xf7, 0xea, 0x1d, 0x09, 0xfa, 0xf6, + 0x08, 0xf2, 0xe7, 0xf8, 0xfc, 0x16, 0xf5, 0x0e, 0x08, 0xf9, 0x0a, 0x03, + 0x26, 0xd8, 0x02, 0xf5, 0xf6, 0xf6, 0xef, 0x1f, 0xe4, 0xe2, 0xfb, 0x02, + 0x1b, 0xe6, 0xde, 0x00, 0xf2, 0xed, 0xfb, 0x18, 0xe4, 0x16, 0x1a, 0x1d, + 0xf1, 0xf6, 0xea, 0x16, 0x05, 0xde, 0xfb, 0x18, 0xf5, 0xe4, 0xfe, 0xe2, + 0x1b, 0x1c, 0x0c, 0xe8, 0x02, 0xee, 0xfb, 0x07, 0x24, 0xf2, 0xe9, 0xfa, + 0x0d, 0x05, 0xf1, 0x03, 0xfe, 0xf6, 0x19, 0x06, 0xff, 0xf9, 0x04, 0xfb, + 0x15, 0xef, 0xf1, 0xf8, 0xe9, 0xe1, 0x10, 0x04, 0xfc, 0xe6, 0x1f, 0xed, + 0x0b, 0xef, 0x00, 0x1e, 0xe6, 0x16, 0xf3, 0x09, 0xfd, 0x08, 0x08, 0x06, + 0x06, 0x23, 0xdf, 0xfc, 0x08, 0xf4, 0xea, 0x0c, 0xf2, 0xe6, 0x18, 0xf5, + 0x02, 0xf9, 0x50, 0x09, 0x01, 0xda, 0x0b, 0x05, 0x12, 0x18, 0xef, 0x04, + 0x0e, 0xd9, 0xff, 0xdc, 0xf6, 0x16, 0xf9, 0xf4, 0xec, 0xff, 0xea, 0xe6, + 0xfa, 0x0a, 0xed, 0xef, 0x02, 0xf0, 0x25, 0x21, 0xf1, 0x26, 0xf5, 0xed, + 0x09, 0xea, 0xea, 0x24, 0xfa, 0x11, 0xfc, 0xdf, 0xf3, 0x0a, 0x28, 0x0c, + 0x19, 0xff, 0xf5, 0xd6, 0x0e, 0xe2, 0x2a, 0x06, 0xfa, 0x03, 0xf9, 0xe6, + 0xef, 0x23, 0xf9, 0xfa, 0xe6, 0xfe, 0xfc, 0x03, 0x06, 0x1a, 0xf9, 0x08, + 0xe0, 0xe5, 0xff, 0x05, 0x01, 0xe7, 0x12, 0x02, 0x1d, 0x05, 0x03, 0x05, + 0x0b, 0xee, 0xed, 0xfc, 0x0f, 0xf3, 0x02, 0xe0, 0x15, 0xdf, 0x02, 0xed, + 0x10, 0x26, 0xef, 0x0d, 0x06, 0xee, 0xef, 0xf6, 0xeb, 0x11, 0x09, 0xf4, + 0xf7, 0x06, 0x0f, 0x01, 0x2a, 0x0b, 0x01, 0xdd, 0xfc, 0xf4, 0xf1, 0x17, + 0x03, 0x04, 0x07, 0xfc, 0x22, 0xfc, 0xde, 0xfe, 0x0b, 0x03, 0xf3, 0xfb, + 0x0c, 0x25, 0x04, 0x19, 0x04, 0x03, 0x01, 0xfa, 0xfb, 0xf7, 0xf6, 0x0e, + 0x15, 0x0e, 0x09, 0xff, 0x06, 0xfa, 0xfb, 0x1e, 0xfb, 0x05, 0x22, 0xf9, + 0xfe, 0xf7, 0x1d, 0xed, 0xdf, 0x18, 0x09, 0xeb, 0xef, 0x04, 0x12, 0xea, + 0xdf, 0xfb, 0xda, 0xf6, 0xdf, 0x17, 0xef, 0xef, 0xe1, 0x1a, 0xd9, 0xe2, + 0xe2, 0xfc, 0x05, 0x11, 0xf6, 0xee, 0xe8, 0xf2, 0xe1, 0x08, 0x26, 0x04, + 0xed, 0x03, 0xe0, 0xfb, 0xee, 0x0c, 0xee, 0xf6, 0x04, 0x2d, 0xf2, 0xd3, + 0xf4, 0xe0, 0xf8, 0x0c, 0xfe, 0x11, 0x0b, 0xd7, 0xfd, 0x18, 0x07, 0x0d, + 0x07, 0x08, 0xf4, 0xc6, 0x0a, 0x0a, 0x1f, 0x0c, 0xf4, 0x1d, 0x02, 0x0b, + 0x09, 0x0e, 0x21, 0xff, 0x17, 0x0b, 0x0d, 0xf2, 0xed, 0xd7, 0x0a, 0xf8, + 0x03, 0x06, 0xfa, 0xe5, 0xfd, 0x03, 0x14, 0x0f, 0xe9, 0x1a, 0xf4, 0xda, + 0x01, 0xe6, 0x09, 0x06, 0x11, 0x0d, 0xfd, 0xeb, 0x16, 0x23, 0xfa, 0x00, + 0x0b, 0x17, 0xf7, 0xda, 0xd7, 0x1b, 0xfa, 0x01, 0x03, 0x05, 0xfe, 0xd6, + 0x02, 0xee, 0xee, 0x02, 0xf3, 0x06, 0xed, 0x03, 0xec, 0x01, 0xf2, 0x0f, + 0x05, 0x17, 0x0b, 0xfb, 0x0f, 0x05, 0x03, 0x13, 0xff, 0x06, 0x02, 0xf5, + 0xf4, 0x18, 0x2b, 0xf0, 0x00, 0x17, 0xfc, 0xfd, 0x05, 0x0b, 0x0e, 0x14, + 0xe1, 0x24, 0x08, 0x24, 0xe6, 0xeb, 0x21, 0x12, 0xfb, 0x12, 0xe7, 0xf4, + 0xe8, 0x0e, 0x18, 0xee, 0xf5, 0xf3, 0xd9, 0xf3, 0xdb, 0xec, 0x0c, 0x1e, + 0xcf, 0x14, 0xdb, 0xe3, 0xdc, 0x02, 0x0c, 0xfb, 0xdb, 0x1b, 0xd0, 0xfe, + 0xf9, 0xfe, 0x2a, 0xf5, 0x00, 0x0b, 0xcd, 0xe0, 0xe2, 0x0e, 0x04, 0xf8, + 0xda, 0x1c, 0xe5, 0x0f, 0xe8, 0xf4, 0xf7, 0x15, 0x06, 0xf8, 0x02, 0xf7, + 0x0f, 0xfb, 0x17, 0xf9, 0xda, 0x01, 0xda, 0xd1, 0xf6, 0x02, 0xfd, 0x16, + 0xf1, 0xe4, 0xfa, 0x07, 0xee, 0x0a, 0xf3, 0xfd, 0xf2, 0x23, 0xf0, 0xe1, + 0x0a, 0x1a, 0x12, 0x1f, 0xef, 0x27, 0x09, 0xf1, 0x0c, 0x13, 0x23, 0xfd, + 0xf5, 0x03, 0xfe, 0x09, 0xfd, 0x16, 0xf8, 0x07, 0x08, 0x25, 0x08, 0xf8, + 0xf6, 0x0a, 0xf1, 0xf5, 0x07, 0x09, 0x05, 0xcc, 0xf8, 0x08, 0x13, 0xf9, + 0x1d, 0x11, 0x0f, 0xdc, 0xee, 0xf3, 0x27, 0xf9, 0xf9, 0x22, 0xfa, 0x0d, + 0xe2, 0x13, 0xfb, 0x11, 0x03, 0x1e, 0xff, 0xfb, 0xed, 0xf1, 0x0e, 
0x0b, + 0x0f, 0x00, 0x06, 0xe0, 0x15, 0xf3, 0x13, 0xfc, 0x18, 0xf9, 0xff, 0x09, + 0xfa, 0x1f, 0x12, 0xe5, 0xe2, 0x06, 0xf9, 0xf4, 0x07, 0x15, 0x0b, 0x04, + 0xdb, 0x0d, 0xeb, 0xf3, 0xe6, 0x06, 0xe5, 0xee, 0xd8, 0x22, 0xd8, 0x10, + 0xea, 0xf9, 0x1c, 0xf7, 0xd3, 0x11, 0xc3, 0xf8, 0xde, 0x05, 0x00, 0xe6, + 0x07, 0xfd, 0xd3, 0x03, 0xea, 0xe0, 0x13, 0x14, 0xcf, 0xeb, 0xcd, 0xd3, + 0xde, 0xf5, 0xf0, 0x0c, 0x0c, 0xfa, 0xeb, 0xd3, 0xfb, 0xfd, 0x08, 0xf9, + 0xf4, 0x10, 0xfa, 0xd3, 0xf4, 0x11, 0x11, 0xf8, 0xef, 0xf8, 0xf8, 0xf1, + 0xfc, 0xe1, 0xf7, 0x12, 0x04, 0xf4, 0xfb, 0xed, 0xef, 0x0c, 0xfd, 0x1c, + 0xfe, 0x0e, 0xfd, 0xe2, 0xfe, 0x0a, 0x02, 0xfe, 0xe6, 0x1f, 0xef, 0xe5, + 0xe6, 0xf8, 0x16, 0x27, 0xe8, 0x20, 0x05, 0xe3, 0xf1, 0xef, 0xee, 0xed, + 0x0d, 0x11, 0x16, 0xfb, 0xf3, 0xff, 0x14, 0x01, 0xff, 0x15, 0x10, 0x02, + 0xe5, 0x28, 0x29, 0x13, 0x13, 0x16, 0xe6, 0x00, 0xd2, 0x26, 0xfd, 0x03, + 0x04, 0x05, 0x07, 0x06, 0xf1, 0x0e, 0x05, 0x0d, 0xe2, 0x0f, 0x02, 0xe1, + 0x07, 0xf7, 0x1c, 0xfa, 0x14, 0x30, 0xf7, 0xee, 0x00, 0xfa, 0x3d, 0x06, + 0x1c, 0x04, 0x06, 0x07, 0x05, 0x1a, 0x10, 0xf6, 0xee, 0x0a, 0xeb, 0x04, + 0xeb, 0xdf, 0x1d, 0x09, 0xd5, 0xe8, 0xd6, 0xf4, 0xf0, 0x0f, 0x1d, 0xea, + 0xf2, 0xf8, 0xa6, 0x0b, 0xdc, 0x09, 0x08, 0x24, 0xee, 0x24, 0xaa, 0xe4, + 0xcb, 0x15, 0xef, 0xe7, 0xe9, 0x0c, 0xcf, 0x06, 0xe3, 0x12, 0x11, 0x00, + 0x07, 0x14, 0xd7, 0xde, 0xf6, 0x0f, 0x0b, 0x04, 0xfb, 0x0d, 0xf8, 0x0d, + 0xf6, 0x1b, 0xf1, 0x21, 0xdd, 0xfc, 0xf4, 0xe9, 0xf8, 0xe8, 0xf7, 0x06, + 0x03, 0x1e, 0xce, 0xe1, 0xea, 0xf6, 0x05, 0xf9, 0x16, 0x15, 0x04, 0xe0, + 0x14, 0xf7, 0x1e, 0x1c, 0x0a, 0x27, 0xef, 0xf3, 0x0f, 0xf3, 0xee, 0x04, + 0xf8, 0xf1, 0x07, 0xe3, 0x05, 0x0b, 0x00, 0x1c, 0x15, 0x27, 0x07, 0xf7, + 0xfa, 0x0b, 0xfa, 0xfa, 0x17, 0x13, 0xe1, 0xf5, 0xfb, 0x0c, 0x21, 0x2f, + 0xd7, 0xfb, 0xf5, 0xfd, 0xd3, 0xf4, 0x07, 0x0e, 0xfd, 0x0b, 0xfc, 0xfa, + 0xf5, 0x0e, 0x02, 0xfa, 0xfa, 0x19, 0xfd, 0xfa, 0xfc, 0x13, 0x24, 0x0c, + 0xe4, 0x31, 0xf8, 0x12, 0xf4, 0x04, 0x18, 0x29, 0x27, 0x19, 0xfc, 0x08, + 0x11, 0xe3, 0x07, 0xfe, 0x26, 0x40, 0x05, 0x02, 0x04, 0x02, 0x0f, 0xee, + 0xf4, 0x27, 0xea, 0xf4, 0xf5, 0x11, 0x26, 0x0b, 0xe7, 0x05, 0xd2, 0xf6, + 0xea, 0xfa, 0x0b, 0xf9, 0xfa, 0x16, 0xba, 0x00, 0xfb, 0x0d, 0x0b, 0xf9, + 0xe6, 0xf6, 0xc5, 0xf8, 0xf6, 0x01, 0x0f, 0xed, 0xed, 0x13, 0xcd, 0x0d, + 0xda, 0x06, 0x17, 0xee, 0x07, 0x1d, 0xb8, 0xfa, 0xe2, 0xea, 0xf2, 0xee, + 0x04, 0x00, 0xdc, 0xd0, 0xfb, 0xf5, 0xec, 0xfe, 0xf1, 0x0d, 0xf0, 0xdb, + 0xf9, 0x0d, 0x03, 0x03, 0x0e, 0x0a, 0xda, 0xd6, 0x01, 0xf2, 0x06, 0x14, + 0x1c, 0x1f, 0xe8, 0xe8, 0x0e, 0xfd, 0x0c, 0xf5, 0xf3, 0x3d, 0xf3, 0x05, + 0x10, 0xfa, 0x1b, 0x18, 0x08, 0x36, 0x09, 0xf1, 0xeb, 0xf9, 0x22, 0x01, + 0xf3, 0xf7, 0xff, 0xf0, 0x0c, 0xe9, 0x01, 0x29, 0x21, 0x15, 0x03, 0xee, + 0xe9, 0x1a, 0xf7, 0x15, 0x06, 0x25, 0xfa, 0xf0, 0xe4, 0xf1, 0x1f, 0x01, + 0xdc, 0x2d, 0xce, 0xe9, 0xea, 0x0b, 0x06, 0x2c, 0x0a, 0x30, 0xe7, 0x09, + 0xf4, 0xf0, 0x10, 0x29, 0xf9, 0x3d, 0xe7, 0xdc, 0xe4, 0xf7, 0x3b, 0x27, + 0x23, 0x3a, 0x0a, 0x06, 0x0e, 0xfd, 0x2c, 0x07, 0x2b, 0x1c, 0xfa, 0x00, + 0xf9, 0x11, 0xea, 0x14, 0xeb, 0xfc, 0x18, 0x03, 0xf1, 0x16, 0x12, 0x04, + 0xcf, 0x12, 0xdd, 0xe4, 0x0e, 0xf0, 0x09, 0xe8, 0xf3, 0xfb, 0xa8, 0xf9, + 0xee, 0xfb, 0x1e, 0x1d, 0xfd, 0x05, 0xab, 0xe5, 0xff, 0x01, 0xfe, 0x04, + 0xf9, 0x02, 0xb9, 0xdc, 0xdf, 0x05, 0xf1, 0xef, 0xf1, 0x1e, 0xc7, 0xee, + 0xf7, 0x1e, 0x00, 0x00, 0xf8, 0x10, 0xec, 0xe8, 0x04, 0x0f, 0xf6, 0xff, + 0x04, 0x09, 0xe0, 0x0a, 0x0e, 0xe4, 0xf0, 0xf1, 0x16, 0x2b, 0xd3, 0xe1, + 0x0a, 0xef, 0xf9, 0xfe, 0x0b, 0x22, 0xf5, 0x01, 0x0a, 0xf8, 0x02, 
0x00, + 0x17, 0x19, 0xf3, 0x05, 0x21, 0xfa, 0xee, 0xee, 0x12, 0xf2, 0xfa, 0xf5, + 0x05, 0x12, 0xee, 0xe4, 0x28, 0xfa, 0xf1, 0x03, 0x15, 0x16, 0x18, 0xfd, + 0x0f, 0x21, 0x04, 0xf4, 0xe5, 0x0c, 0x06, 0x13, 0xde, 0x36, 0xe8, 0xfb, + 0xe7, 0xfd, 0xf6, 0x12, 0x0e, 0x1d, 0xea, 0xf8, 0xd4, 0xe8, 0x19, 0x07, + 0xe5, 0x1c, 0xf7, 0x0c, 0xef, 0x05, 0x0f, 0x09, 0xdd, 0x1a, 0xea, 0xd7, + 0xf9, 0xf9, 0x12, 0x17, 0x2e, 0x10, 0x08, 0xfe, 0x14, 0xf5, 0x1d, 0xfa, + 0x06, 0x33, 0xed, 0xfe, 0xf7, 0x11, 0xf0, 0x15, 0xe2, 0x24, 0xf6, 0x0a, + 0xe2, 0xfc, 0x23, 0x12, 0xdd, 0x11, 0xfd, 0xe5, 0x08, 0xff, 0x15, 0xf6, + 0xf1, 0x1b, 0xae, 0xfe, 0xe6, 0x15, 0x2c, 0x2d, 0x15, 0x15, 0xc5, 0xf8, + 0xea, 0xe7, 0x07, 0x04, 0xfe, 0x28, 0xa1, 0xf2, 0xe1, 0xf9, 0xf8, 0xff, + 0xf4, 0x22, 0xb4, 0xdb, 0x03, 0x20, 0xe6, 0xf3, 0x0e, 0x19, 0xe3, 0x0a, + 0xfa, 0xee, 0xf3, 0xe5, 0xd8, 0xf9, 0xf1, 0xde, 0x06, 0x05, 0xf2, 0xf5, + 0xe7, 0x16, 0xd8, 0xfe, 0x07, 0xea, 0xee, 0x0e, 0xfa, 0xff, 0xdb, 0xe7, + 0x03, 0xed, 0x01, 0xfd, 0x09, 0x1a, 0xfa, 0xe6, 0x05, 0x10, 0xe9, 0x01, + 0x1f, 0x13, 0xf7, 0xf6, 0xfb, 0x13, 0xff, 0xdb, 0xed, 0xfe, 0x0a, 0x10, + 0x09, 0x29, 0xf5, 0x04, 0xf5, 0x26, 0x0d, 0x0c, 0xf9, 0x16, 0xfa, 0x02, + 0xf4, 0x2e, 0xde, 0xf5, 0xe1, 0x1d, 0xfb, 0x02, 0x0b, 0x23, 0x07, 0xea, + 0xd9, 0x0a, 0xf3, 0x0a, 0x0f, 0x1e, 0xe7, 0xf1, 0xd7, 0x0b, 0xf6, 0xff, + 0x0d, 0x24, 0xcc, 0x0a, 0xee, 0xda, 0x14, 0x12, 0x11, 0x29, 0xf4, 0x1a, + 0xef, 0x0b, 0xfa, 0xec, 0x0c, 0x1b, 0xf4, 0xff, 0xf5, 0xef, 0x0f, 0x10, + 0xd4, 0x04, 0xf9, 0xf8, 0xec, 0xf9, 0x21, 0x05, 0xd3, 0x27, 0xf3, 0x17, + 0xff, 0xf6, 0x15, 0xf9, 0xed, 0x0a, 0xac, 0x02, 0xfd, 0xfb, 0x04, 0x29, + 0x06, 0x03, 0xb8, 0xe6, 0xd5, 0x17, 0x09, 0x1b, 0xf6, 0x1b, 0xab, 0xdc, + 0xdf, 0xfd, 0x06, 0x09, 0x09, 0x37, 0xbb, 0xed, 0x19, 0xd7, 0xe2, 0xdd, + 0x05, 0x01, 0xec, 0xfb, 0xe4, 0x0e, 0xeb, 0xf0, 0x03, 0x17, 0x04, 0xeb, + 0x09, 0xee, 0xeb, 0xe7, 0x0c, 0x16, 0xcb, 0x0e, 0x17, 0xd8, 0xe1, 0xf8, + 0x2b, 0x19, 0xde, 0xeb, 0x10, 0xf2, 0xff, 0xf8, 0xee, 0x0e, 0xe7, 0xf0, + 0x15, 0x08, 0xf8, 0xdf, 0x06, 0x0d, 0xf9, 0x14, 0xfa, 0x0b, 0x04, 0xfd, + 0x15, 0x23, 0x20, 0xff, 0xfd, 0x1d, 0x0c, 0xf1, 0xfe, 0x15, 0x0a, 0x02, + 0xed, 0xfe, 0xfb, 0x04, 0xfb, 0x1e, 0xdd, 0x05, 0xe0, 0x16, 0xf9, 0xf6, + 0xfd, 0x32, 0xdc, 0xf2, 0xd3, 0x08, 0xf4, 0xec, 0x17, 0x25, 0xe2, 0xf0, + 0xee, 0xf1, 0x0d, 0xfe, 0x13, 0x2d, 0x01, 0x11, 0xd4, 0xe4, 0x07, 0xfb, + 0x32, 0x11, 0x14, 0x07, 0xd7, 0x02, 0x10, 0xeb, 0x2b, 0x1d, 0x01, 0xfc, + 0xf3, 0xf0, 0x13, 0x1a, 0xdb, 0x20, 0x00, 0xf0, 0xf0, 0x05, 0x16, 0x03, + 0xd4, 0xe3, 0xc2, 0xf0, 0x06, 0x02, 0x1e, 0x0a, 0xec, 0x1f, 0xab, 0xea, + 0xfa, 0xe3, 0x20, 0x22, 0x03, 0x1b, 0xb3, 0x0e, 0xe3, 0xf3, 0x1d, 0x27, + 0xe3, 0x10, 0xa7, 0xda, 0xf3, 0x00, 0x0a, 0x0a, 0x04, 0xfb, 0xb2, 0x0f, + 0x0c, 0xf5, 0x07, 0xff, 0x13, 0x1e, 0xdb, 0xf6, 0xf9, 0xef, 0xe8, 0xe7, + 0xfb, 0x18, 0xeb, 0xec, 0x09, 0xda, 0xf1, 0xf0, 0x0b, 0x04, 0xe1, 0xfa, + 0x1c, 0x25, 0xee, 0x01, 0x0b, 0x29, 0xd7, 0x0c, 0x04, 0x0b, 0xef, 0xfd, + 0x1c, 0xfc, 0xf1, 0xfb, 0x0b, 0x0f, 0xdf, 0xed, 0x17, 0x38, 0x0c, 0xd7, + 0xff, 0xfd, 0x01, 0xfc, 0xfb, 0xfb, 0x18, 0x1a, 0x18, 0xe3, 0xf9, 0xf4, + 0xfa, 0x20, 0x06, 0x09, 0x11, 0x08, 0x1d, 0xf8, 0xfa, 0x1d, 0xf5, 0x1c, + 0xf5, 0xfe, 0x03, 0x07, 0xe4, 0x33, 0xc8, 0x0c, 0xe1, 0x13, 0xff, 0xe5, + 0x10, 0x2c, 0xd3, 0xf0, 0xed, 0x04, 0x07, 0x01, 0xf1, 0x16, 0xe0, 0x13, + 0xfa, 0x11, 0x07, 0xfa, 0x19, 0x16, 0x01, 0x00, 0x07, 0x26, 0x00, 0xec, + 0x1d, 0x23, 0x05, 0xf4, 0x07, 0x17, 0x2c, 0x1d, 0xee, 0xf0, 0x0c, 0x09, + 0xe3, 0x1a, 0x24, 0x0b, 0xf3, 0x1e, 0xce, 0xfe, 0xfe, 0x12, 0x21, 
0x1a, + 0xf6, 0x23, 0xc3, 0x03, 0xf4, 0x10, 0x1a, 0x2a, 0xf4, 0x08, 0xbf, 0xff, + 0x04, 0xf4, 0x0b, 0x1d, 0x1a, 0xf8, 0xcc, 0x00, 0xf7, 0x13, 0xf4, 0xfd, + 0xf4, 0x19, 0xbd, 0xef, 0x0c, 0x0d, 0x02, 0xfc, 0x12, 0x13, 0xe9, 0xe7, + 0xf5, 0xfa, 0xfa, 0xf6, 0x1a, 0x2e, 0xce, 0xd4, 0x01, 0x12, 0xfd, 0xfc, + 0x26, 0x10, 0xcc, 0xe7, 0xee, 0x13, 0xee, 0xff, 0xef, 0xea, 0x00, 0x0e, + 0x1a, 0x17, 0x04, 0x0c, 0x04, 0x0c, 0xe6, 0xf3, 0xf6, 0xdb, 0xdd, 0x04, + 0xf4, 0x22, 0x11, 0x16, 0xf3, 0x07, 0xec, 0xf8, 0xf2, 0x07, 0x03, 0x02, + 0xf5, 0x0a, 0xf6, 0x02, 0x1d, 0x1b, 0x11, 0x06, 0xf8, 0x06, 0x02, 0xea, + 0xf3, 0x1d, 0xce, 0x00, 0xed, 0xf9, 0xef, 0xf6, 0xec, 0x22, 0xc7, 0xf0, + 0xed, 0xdb, 0xe0, 0x02, 0x11, 0x07, 0xe8, 0xf0, 0xd1, 0xed, 0xff, 0xfd, + 0x0c, 0x2e, 0xd4, 0xed, 0xec, 0x0e, 0xf1, 0x07, 0x01, 0x0e, 0x0e, 0xfe, + 0xda, 0x0b, 0x0a, 0x0a, 0x1f, 0x2e, 0x13, 0x07, 0x00, 0x07, 0x14, 0x21, + 0xe9, 0xfc, 0xf0, 0x1e, 0xd7, 0xea, 0x34, 0x07, 0xc6, 0x0c, 0xd4, 0xec, + 0xfd, 0x06, 0x24, 0x0a, 0xf3, 0x15, 0xaf, 0xff, 0xe9, 0xf1, 0x0d, 0x3e, + 0xe9, 0x18, 0xba, 0x13, 0xed, 0xd7, 0x0b, 0x31, 0x05, 0x0e, 0xaf, 0x13, + 0xd6, 0x0e, 0x10, 0x02, 0x02, 0x14, 0xcb, 0xd5, 0xf9, 0x0c, 0xf9, 0x0e, + 0x1f, 0x24, 0xd5, 0xeb, 0xff, 0xf1, 0xf5, 0x0c, 0x08, 0x07, 0xf4, 0xd7, + 0x06, 0x10, 0xe8, 0xef, 0xfc, 0x2f, 0xee, 0xf1, 0x18, 0xf8, 0xf4, 0x02, + 0x11, 0x21, 0xd3, 0x12, 0x14, 0xe4, 0xf4, 0x02, 0x05, 0x24, 0xca, 0xf2, + 0xf3, 0xeb, 0xe7, 0xf8, 0x16, 0x1a, 0xeb, 0x0d, 0x05, 0x16, 0xf1, 0xec, + 0x11, 0x1c, 0x09, 0x1e, 0xe0, 0xe6, 0xfa, 0x0e, 0x0d, 0x2a, 0xea, 0x2e, + 0xed, 0xf9, 0xf7, 0x16, 0x09, 0x05, 0xdd, 0xd6, 0x02, 0xeb, 0xf5, 0xf3, + 0xe4, 0x3b, 0xed, 0x04, 0xe0, 0x0e, 0xfd, 0x09, 0xfd, 0x35, 0xdc, 0x18, + 0xf3, 0x04, 0xfa, 0x05, 0x15, 0x34, 0xe5, 0xe1, 0xe4, 0xf4, 0xe0, 0xf9, + 0x08, 0x32, 0x04, 0x08, 0xf4, 0x0f, 0xff, 0x08, 0x09, 0x2f, 0x06, 0x02, + 0xfd, 0x05, 0x0c, 0x24, 0xe3, 0x1e, 0xf5, 0x0c, 0xdd, 0xf8, 0x18, 0x20, + 0xd8, 0x14, 0xef, 0xf4, 0x17, 0x08, 0x25, 0x14, 0x04, 0x06, 0xb0, 0xf5, + 0xf5, 0x09, 0x0f, 0x3e, 0xff, 0x28, 0xb3, 0xf5, 0x19, 0xd8, 0x14, 0x21, + 0xd9, 0xf7, 0xb7, 0xe5, 0xfe, 0xe7, 0x07, 0x1e, 0x04, 0x15, 0xc5, 0xf9, + 0x14, 0x20, 0xeb, 0x01, 0x01, 0x18, 0xce, 0x00, 0xe6, 0xe2, 0xf7, 0xfb, + 0xf3, 0x0d, 0xd3, 0xf3, 0x04, 0xf8, 0xf0, 0x03, 0xf1, 0x25, 0xb5, 0xef, + 0x05, 0xe0, 0x01, 0xf6, 0x04, 0x16, 0xd1, 0x01, 0x0a, 0x21, 0x01, 0x05, + 0x0e, 0x01, 0xf0, 0x0a, 0xf3, 0x00, 0x03, 0xf8, 0xfa, 0x03, 0x0b, 0xde, + 0xfe, 0xff, 0xfb, 0xea, 0x09, 0x02, 0xf5, 0xe8, 0xe7, 0x08, 0x00, 0xf5, + 0xf8, 0x0f, 0x13, 0xfa, 0xeb, 0xe8, 0xfb, 0x1f, 0x08, 0x16, 0xe6, 0xfa, + 0xe1, 0x00, 0x03, 0xdd, 0xf1, 0x26, 0xe5, 0x1d, 0xd9, 0xff, 0xf2, 0xf8, + 0xff, 0x33, 0xea, 0xe5, 0x03, 0x0c, 0x07, 0xf9, 0xf8, 0x0f, 0xe1, 0x1e, + 0xdd, 0x0f, 0x00, 0xf1, 0x06, 0x21, 0x09, 0x05, 0xf3, 0xec, 0xe6, 0x04, + 0x07, 0x32, 0xf1, 0xf9, 0xf2, 0x01, 0x18, 0x1f, 0xd2, 0xe2, 0x0a, 0xf4, + 0xca, 0xfc, 0x28, 0x16, 0xc2, 0x10, 0xf2, 0xfc, 0x08, 0xe9, 0x2a, 0x0f, + 0xfa, 0xf5, 0xa9, 0x07, 0xec, 0xe9, 0x19, 0x43, 0x0b, 0x1c, 0xa6, 0xe9, + 0xf4, 0x16, 0x0d, 0x2b, 0xfc, 0x11, 0x9a, 0xe1, 0xf1, 0x1c, 0xf5, 0x0f, + 0xe4, 0x18, 0xc0, 0xd9, 0x14, 0x26, 0xe6, 0xf8, 0x0a, 0x17, 0xec, 0xfb, + 0xe1, 0x22, 0xdf, 0xf2, 0xfe, 0x1e, 0xd4, 0xeb, 0xd7, 0x0e, 0x08, 0xf6, + 0xef, 0xfc, 0xe6, 0xd4, 0xf7, 0x0b, 0xfb, 0xf5, 0x01, 0x25, 0xd7, 0xfb, + 0x0d, 0xfe, 0xff, 0xf3, 0x1d, 0x32, 0xfe, 0xee, 0x12, 0xf2, 0x0c, 0xec, + 0x02, 0x10, 0xef, 0x01, 0xf2, 0x0b, 0xf3, 0xf7, 0xfa, 0x25, 0xfb, 0x0d, + 0x11, 0x15, 0x04, 0xfc, 0x0c, 0x21, 0x12, 0x29, 0x00, 0xfa, 0xf6, 
0xf5, + 0x06, 0x22, 0xea, 0xe2, 0xee, 0x00, 0xfd, 0xf0, 0x0b, 0x1d, 0xd3, 0xe4, + 0xe4, 0x0a, 0xfc, 0xe8, 0xea, 0x2c, 0xed, 0xed, 0xef, 0xe8, 0xf2, 0x05, + 0xfd, 0x15, 0xd8, 0xda, 0xca, 0xee, 0xfa, 0x00, 0xfe, 0x0e, 0xf2, 0xf0, + 0x0e, 0xf5, 0x04, 0x03, 0x1d, 0x2b, 0xee, 0x05, 0x0f, 0x10, 0x13, 0x35, + 0xe2, 0x04, 0x10, 0xdf, 0xcf, 0xeb, 0x40, 0x26, 0xe4, 0x03, 0xf3, 0xf9, + 0xf5, 0x14, 0x24, 0x2a, 0xdf, 0xfe, 0xab, 0xe5, 0xfe, 0x1c, 0x27, 0x35, + 0xdb, 0xff, 0xac, 0x01, 0xf6, 0xfc, 0x19, 0x1a, 0x11, 0x1f, 0xa8, 0xf5, + 0x02, 0x0f, 0x1a, 0x1f, 0xf7, 0xf2, 0xa2, 0x00, 0x15, 0x22, 0xe4, 0x13, + 0x00, 0x09, 0xd9, 0xd5, 0x02, 0x19, 0xfd, 0xf8, 0xe7, 0xff, 0xfb, 0xe0, + 0xef, 0xf7, 0xee, 0xf3, 0xf3, 0x19, 0xb0, 0xdf, 0x00, 0x0f, 0x08, 0xf3, + 0x15, 0x17, 0xec, 0x0f, 0x11, 0x14, 0x02, 0x08, 0x10, 0x17, 0xe6, 0x08, + 0xf7, 0x00, 0xed, 0xf7, 0x29, 0x07, 0x10, 0x05, 0x05, 0xe7, 0xed, 0xf4, + 0xf9, 0x15, 0xf9, 0xf0, 0x08, 0x00, 0x03, 0x09, 0x21, 0x28, 0xf6, 0x0e, + 0xfb, 0xf3, 0x03, 0xf7, 0x0f, 0x0c, 0xf0, 0xf5, 0xe3, 0xd8, 0xf8, 0xf2, + 0x09, 0x1c, 0xe7, 0xfb, 0xe4, 0xf6, 0xfa, 0xf8, 0xf1, 0x42, 0xf6, 0xda, + 0xdd, 0xd7, 0xfa, 0xff, 0x2f, 0x2c, 0xda, 0x0a, 0xde, 0xec, 0xf1, 0x14, + 0xfb, 0x1d, 0xeb, 0xee, 0xf2, 0xeb, 0xf3, 0xed, 0x0e, 0x35, 0xf0, 0x06, + 0x19, 0x04, 0x2f, 0x23, 0xe2, 0x07, 0x13, 0x0f, 0xe9, 0xf0, 0x22, 0x2e, + 0xd9, 0x1a, 0xcb, 0xed, 0xfd, 0x04, 0x27, 0x1e, 0xf6, 0x07, 0x96, 0xd6, + 0xd8, 0x11, 0x18, 0x56, 0xd2, 0xfb, 0x92, 0xfc, 0x0b, 0x0a, 0x17, 0x2c, + 0xe5, 0x04, 0xa2, 0xf8, 0xe2, 0x04, 0x1a, 0x0d, 0xeb, 0x11, 0xa2, 0xe5, + 0xe5, 0xf8, 0x02, 0xf7, 0x17, 0x03, 0xca, 0xe9, 0x0c, 0x1f, 0xfe, 0xf5, + 0x18, 0x12, 0xdd, 0x08, 0x15, 0xff, 0xfc, 0xf6, 0xe1, 0x1d, 0xe2, 0xe1, + 0xfe, 0xfc, 0x03, 0xff, 0xf2, 0x23, 0xd2, 0x01, 0x13, 0xdd, 0xf3, 0xf4, + 0xf2, 0x07, 0xef, 0x03, 0x15, 0x21, 0xd8, 0xf8, 0x09, 0xf3, 0xe8, 0xea, + 0xe8, 0xf2, 0x08, 0xf0, 0x04, 0x1a, 0xf2, 0x19, 0xfb, 0x1b, 0x15, 0xfc, + 0x1d, 0x30, 0xe5, 0x1e, 0x09, 0xe8, 0xe9, 0x09, 0xf7, 0x2a, 0xe1, 0x0e, + 0x00, 0x21, 0xf3, 0xff, 0xfb, 0x01, 0xdf, 0xf2, 0xfe, 0xf4, 0xfc, 0xf0, + 0x0b, 0x0b, 0xdd, 0xe4, 0xd2, 0x14, 0xf7, 0xfe, 0x0b, 0x39, 0x01, 0xe6, + 0xe4, 0x27, 0xfa, 0xe4, 0x04, 0x2c, 0xe2, 0x04, 0xf5, 0x07, 0xf2, 0x03, + 0xf0, 0x10, 0xf5, 0xf6, 0xfc, 0x16, 0x22, 0x1b, 0xf8, 0x11, 0xe4, 0x09, + 0xf6, 0xf0, 0x41, 0x1e, 0xcf, 0x04, 0xea, 0xee, 0x0e, 0xf6, 0x1b, 0x2f, + 0xc7, 0xf1, 0xba, 0xef, 0x0f, 0x16, 0x1e, 0x39, 0x05, 0x1e, 0x90, 0xe6, + 0x0d, 0xfa, 0x22, 0x3f, 0xe3, 0x23, 0xa5, 0xe3, 0xe9, 0x0f, 0x05, 0x27, + 0x02, 0x11, 0x99, 0x05, 0xfa, 0x05, 0x03, 0x01, 0xff, 0x26, 0xd3, 0xf7, + 0xf7, 0xf9, 0x05, 0xf4, 0xef, 0x23, 0xd2, 0xdd, 0x05, 0x08, 0xfa, 0xff, + 0x03, 0x04, 0xbd, 0xd7, 0x14, 0x06, 0xef, 0x06, 0xe5, 0x05, 0xea, 0xea, + 0x02, 0xfd, 0x0d, 0x00, 0x08, 0xff, 0xe7, 0xfb, 0xfe, 0x13, 0xfe, 0xec, + 0xf9, 0x02, 0xf3, 0xff, 0xff, 0x08, 0x04, 0xed, 0x19, 0x1d, 0xfa, 0x0a, + 0x0d, 0xf2, 0x0f, 0xec, 0x25, 0x1c, 0xec, 0x0b, 0x01, 0xff, 0x01, 0xf6, + 0x08, 0x09, 0xe8, 0xe2, 0xec, 0x23, 0xe5, 0xe9, 0xf0, 0x2e, 0xbd, 0xe1, + 0xef, 0x14, 0xe9, 0xf6, 0xf5, 0x1d, 0xdc, 0xe3, 0xd7, 0xfc, 0xf9, 0xf2, + 0xfe, 0x24, 0xf2, 0x05, 0xd5, 0xed, 0xe9, 0xf9, 0xfa, 0x2d, 0xf0, 0xfe, + 0xee, 0xf2, 0xe8, 0xf7, 0x06, 0x14, 0x01, 0x10, 0x06, 0xf3, 0x0e, 0x0e, + 0xc2, 0x1d, 0xf2, 0x1c, 0xed, 0xe3, 0x53, 0x21, 0xb8, 0x0c, 0xde, 0x03, + 0x15, 0xeb, 0x46, 0x39, 0xdf, 0xf6, 0xa3, 0xee, 0xf6, 0xe0, 0x33, 0x50, + 0xdd, 0x27, 0x9f, 0x07, 0x13, 0xe2, 0x1f, 0x35, 0xed, 0x1f, 0xb7, 0x07, + 0x11, 0xed, 0x17, 0x28, 0xf4, 0x20, 0xc1, 0xec, 0xef, 0x16, 0x02, 
0xfa, + 0xe0, 0x1b, 0xf7, 0xdb, 0xfd, 0x0a, 0xe7, 0xfb, 0xe7, 0x25, 0xe2, 0xe7, + 0xf8, 0xf0, 0xee, 0xe9, 0x02, 0x06, 0xc9, 0xe4, 0x14, 0xe3, 0xe2, 0xf7, + 0xf8, 0xfd, 0xdd, 0xe2, 0x08, 0x0a, 0xe4, 0x05, 0xf5, 0x16, 0xe7, 0x01, + 0x00, 0x1c, 0xe7, 0xf0, 0xf6, 0x19, 0xfe, 0x0c, 0xf2, 0x06, 0x03, 0xe8, + 0x0b, 0xfe, 0xe3, 0x19, 0x08, 0x1a, 0x10, 0xfd, 0x00, 0x21, 0xf0, 0xeb, + 0x18, 0x02, 0xf3, 0x04, 0xf0, 0x18, 0xdb, 0x05, 0x01, 0xde, 0xed, 0xe9, + 0x23, 0x15, 0xaf, 0xe6, 0xf1, 0x0a, 0xe6, 0xea, 0x01, 0x18, 0xd8, 0xfd, + 0xf1, 0xe6, 0xec, 0xf5, 0x0e, 0x1e, 0xcc, 0xfc, 0xe7, 0x00, 0xe9, 0x11, + 0x00, 0x30, 0xf9, 0x14, 0xf4, 0x19, 0xdd, 0xf7, 0xf7, 0x2f, 0xf4, 0xf2, + 0xff, 0x27, 0x15, 0x1c, 0xbc, 0x2f, 0xe9, 0x14, 0xf5, 0xe8, 0x44, 0x30, + 0xe8, 0x1d, 0xe4, 0x18, 0x11, 0x00, 0x0c, 0x2b, 0xf3, 0x29, 0x96, 0xe0, + 0x06, 0xee, 0x3e, 0x55, 0xdc, 0x13, 0x98, 0xdf, 0xf0, 0xfe, 0x17, 0x33, + 0xe8, 0x09, 0xa3, 0x07, 0xef, 0x0e, 0x1d, 0x37, 0xdd, 0xfe, 0xb5, 0x00, + 0xf7, 0xe0, 0xea, 0xfd, 0xfd, 0x19, 0xbc, 0xfd, 0x15, 0xfe, 0x01, 0xf3, + 0xd5, 0x20, 0xbf, 0xe3, 0x15, 0x0e, 0xf0, 0xf6, 0xf2, 0x14, 0xcc, 0xf0, + 0xf7, 0x04, 0xf2, 0xff, 0x0b, 0x02, 0xd2, 0xd8, 0xfa, 0xfc, 0xe5, 0x02, + 0x00, 0xfb, 0xf0, 0xdc, 0x1e, 0x10, 0x02, 0x01, 0x00, 0x18, 0xe9, 0xdb, + 0x1e, 0xf6, 0xfc, 0x03, 0xef, 0x0a, 0x00, 0x16, 0x00, 0x0f, 0xf4, 0x16, + 0xfa, 0x0b, 0xe2, 0xfa, 0xe0, 0x07, 0xfb, 0x02, 0x21, 0x0e, 0xdd, 0x0b, + 0xea, 0xf0, 0xeb, 0xfb, 0x19, 0x09, 0xd4, 0xf2, 0xef, 0x0b, 0x00, 0xeb, + 0x1a, 0x2f, 0xea, 0x06, 0x03, 0xf6, 0xf8, 0xfb, 0xfe, 0x1d, 0xea, 0xdd, + 0xed, 0xfd, 0xfb, 0xe7, 0xfe, 0x18, 0xf4, 0xfc, 0x0b, 0xf6, 0xfc, 0x0b, + 0xfb, 0x28, 0x07, 0xff, 0x07, 0x1e, 0x03, 0x21, 0xcf, 0x22, 0x05, 0xe6, + 0xea, 0xe7, 0x43, 0x2e, 0xe7, 0x14, 0xfb, 0x0a, 0x1e, 0xfe, 0x2c, 0x24, + 0xd5, 0xfd, 0x9e, 0xd1, 0xf2, 0x1c, 0x32, 0x51, 0x01, 0xf3, 0xac, 0xe1, + 0xf4, 0xe5, 0x1c, 0x37, 0xf1, 0x0f, 0xa7, 0xdb, 0x00, 0xf6, 0x0f, 0x18, + 0xe1, 0x10, 0xc9, 0xc5, 0xe8, 0xeb, 0xf2, 0xfd, 0xf6, 0x02, 0xc2, 0xff, + 0x00, 0x19, 0x03, 0x0f, 0x02, 0x22, 0xd4, 0xe7, 0x07, 0x0f, 0xe5, 0x1a, + 0x09, 0x0b, 0xdc, 0xd2, 0x00, 0x05, 0xee, 0xf8, 0xdc, 0x14, 0xd0, 0x0a, + 0x0a, 0xfa, 0xeb, 0x04, 0xf3, 0x06, 0xde, 0x05, 0xfb, 0xfd, 0xe3, 0xec, + 0xfd, 0x14, 0xd7, 0x11, 0x0e, 0xe6, 0x06, 0xec, 0xde, 0x22, 0xd7, 0x00, + 0x03, 0xf5, 0xf5, 0x0d, 0x01, 0x05, 0xea, 0x0b, 0x16, 0x04, 0xff, 0x13, + 0xf3, 0x12, 0xd2, 0xdf, 0x0b, 0xe4, 0x06, 0xf6, 0x08, 0x2d, 0xd3, 0xd6, + 0xe7, 0x0a, 0xec, 0xff, 0xfe, 0x01, 0xdf, 0xf4, 0xdf, 0x1c, 0xfe, 0xf9, + 0xf7, 0x13, 0xca, 0xff, 0x03, 0x06, 0xe9, 0xf7, 0x06, 0x08, 0xd7, 0xf3, + 0xed, 0x08, 0xe3, 0xfd, 0x0c, 0x11, 0x15, 0xfb, 0x15, 0x08, 0x28, 0x40, + 0xe7, 0x0d, 0x08, 0xec, 0xe8, 0x16, 0x67, 0x46, 0xc8, 0x16, 0xf1, 0x02, + 0x24, 0x00, 0x3a, 0x43, 0xd6, 0x12, 0xae, 0xe7, 0xf4, 0xf8, 0x3a, 0x65, + 0xe4, 0x0c, 0xb2, 0xef, 0x1f, 0xe8, 0x29, 0x59, 0xf8, 0x11, 0xc4, 0xe1, + 0xfe, 0xfa, 0x27, 0x43, 0xc9, 0x1e, 0xbb, 0xfb, 0xf3, 0x13, 0x15, 0x0d, + 0xf1, 0x13, 0xcd, 0xf0, 0x07, 0x19, 0x07, 0x00, 0xd8, 0xeb, 0xbf, 0xf0, + 0xfc, 0xf6, 0xef, 0x16, 0x01, 0x02, 0xc1, 0xdf, 0xfd, 0xe9, 0x06, 0x06, + 0xf1, 0x08, 0xd7, 0xcc, 0xfb, 0x0e, 0xfc, 0x14, 0xf2, 0x1a, 0xe2, 0x0d, + 0xeb, 0x09, 0x07, 0x10, 0xe6, 0x13, 0xeb, 0xf5, 0x15, 0x14, 0xeb, 0xfe, + 0xf9, 0x17, 0xd2, 0xe3, 0x1e, 0xf5, 0x04, 0x0a, 0xf1, 0x0e, 0xde, 0xe7, + 0x01, 0x20, 0x0c, 0xfc, 0xdc, 0xf9, 0xe5, 0xe9, 0xff, 0x1d, 0x0a, 0xfe, + 0xec, 0x25, 0xaf, 0xd2, 0x01, 0x16, 0xfc, 0x17, 0xe8, 0x1e, 0xcd, 0xd9, + 0xe2, 0xf1, 0xeb, 0x08, 0xff, 0x33, 0xe5, 0xfb, 0xeb, 0x04, 0xfe, 
0xf7, + 0xfd, 0x1f, 0xee, 0xff, 0xed, 0xf8, 0xe0, 0xff, 0xfd, 0x2b, 0x0a, 0xf5, + 0x15, 0x1d, 0xf3, 0x3f, 0x16, 0xf6, 0xf2, 0xee, 0xf4, 0xef, 0xf0, 0x56, + 0x0a, 0x1a, 0xbc, 0xfc, 0x2f, 0xfb, 0xf0, 0x56, 0x1e, 0x0e, 0xc6, 0xe8, + 0x06, 0x0b, 0x11, 0x62, 0x3e, 0xf9, 0xb8, 0xc9, 0xed, 0xeb, 0x02, 0x63, + 0x2c, 0xfd, 0xc5, 0xe9, 0x00, 0x17, 0x0f, 0x37, 0xfe, 0x20, 0xcc, 0xe0, + 0xe0, 0x0e, 0xe6, 0x20, 0x0a, 0xfd, 0xdf, 0xee, 0x0b, 0x02, 0xee, 0x1f, + 0xfb, 0x06, 0xd2, 0xed, 0xfe, 0xeb, 0xfc, 0x12, 0xfd, 0x14, 0x00, 0xd8, + 0x08, 0xf6, 0xec, 0x17, 0xf9, 0x10, 0x00, 0xd9, 0x18, 0xf1, 0xee, 0x0f, + 0xf4, 0x03, 0xee, 0xeb, 0xf0, 0xef, 0xf2, 0x06, 0x04, 0x00, 0xf4, 0x0f, + 0x09, 0x06, 0xf7, 0x0b, 0xfd, 0x01, 0x03, 0x03, 0xf4, 0xf6, 0xdd, 0x14, + 0x1c, 0xef, 0xf1, 0xdd, 0xf7, 0x13, 0xd9, 0x15, 0xef, 0x02, 0xd2, 0xe7, + 0x05, 0x05, 0xe2, 0x09, 0xf2, 0x11, 0xf5, 0xba, 0xf0, 0x04, 0xe0, 0x01, + 0x06, 0x10, 0xe6, 0xef, 0xfc, 0x12, 0xf9, 0xf4, 0x1b, 0x2f, 0xe3, 0x0f, + 0xd7, 0xf6, 0x0b, 0x11, 0xf7, 0x0c, 0x00, 0x06, 0x18, 0xef, 0x06, 0x03, + 0x0a, 0x09, 0xf6, 0x1a, 0x0d, 0xed, 0xfe, 0x2c, 0x43, 0xf4, 0xe5, 0xde, + 0xf5, 0x02, 0x25, 0x5a, 0x49, 0xd4, 0xe6, 0x24, 0x1e, 0xf7, 0x0e, 0x5c, + 0x5d, 0xf0, 0xf9, 0xe4, 0x1c, 0xeb, 0x28, 0x7f, 0x5b, 0xec, 0xfa, 0xdb, + 0x0c, 0xf5, 0x20, 0x49, 0x51, 0xe1, 0xed, 0xe6, 0x0e, 0x26, 0x28, 0x33, + 0x35, 0x05, 0xe1, 0xe4, 0x1f, 0xfc, 0xf9, 0x39, 0x18, 0x04, 0xed, 0xed, + 0x01, 0xe7, 0xe6, 0x08, 0x09, 0x03, 0xe7, 0xf9, 0x0e, 0x06, 0xec, 0x08, + 0x12, 0x1a, 0xda, 0xef, 0xdf, 0xf9, 0xe2, 0x1e, 0x1c, 0x00, 0x12, 0xd7, + 0x01, 0xf7, 0x21, 0x17, 0x13, 0x19, 0xde, 0xe0, 0xec, 0x16, 0x01, 0x1b, + 0x06, 0x0c, 0xf0, 0xe8, 0x18, 0x03, 0x06, 0x0e, 0x09, 0xfa, 0x03, 0xf3, + 0xdd, 0x01, 0xfb, 0x0a, 0x2a, 0xf4, 0xf6, 0xda, 0xe9, 0xfe, 0xe9, 0x12, + 0x19, 0xe9, 0x05, 0xdf, 0x00, 0xeb, 0xf2, 0x10, 0x0c, 0xe1, 0xcd, 0xcb, + 0xf2, 0x1f, 0xd9, 0x0c, 0xfa, 0xfb, 0xe8, 0xde, 0x00, 0xfc, 0xe5, 0x00, + 0x11, 0x02, 0xe6, 0x17, 0x14, 0x00, 0xf2, 0xfd, 0x00, 0xe1, 0x10, 0x24, + 0x12, 0xec, 0xed, 0x1e, 0x09, 0x18, 0x03, 0x0c, 0x04, 0xf4, 0x15, 0x0f, + 0x10, 0x18, 0xd6, 0x29, 0x10, 0x04, 0x1c, 0xef, 0x0f, 0x0c, 0xc7, 0x04, + 0xfe, 0xeb, 0xff, 0xf5, 0xe3, 0x15, 0xfe, 0xcb, 0x10, 0xff, 0x12, 0xfb, + 0xe4, 0xeb, 0xf9, 0x00, 0x02, 0xf1, 0x14, 0x13, 0x01, 0x02, 0xf9, 0x01, + 0x06, 0x0c, 0xf5, 0x0a, 0x1e, 0x01, 0x19, 0x0e, 0x05, 0xf5, 0x0a, 0xff, + 0xff, 0xf2, 0xfb, 0xdb, 0xf8, 0x06, 0x17, 0xf2, 0xf7, 0x0d, 0x0e, 0xf4, + 0xfa, 0xf7, 0x14, 0xdb, 0xe0, 0xfd, 0x08, 0x16, 0xf7, 0x16, 0xfc, 0x09, + 0x27, 0x07, 0x09, 0xfb, 0x0a, 0xfc, 0x0c, 0xe4, 0xdb, 0xee, 0xff, 0x10, + 0xf3, 0x09, 0xfa, 0xf4, 0x23, 0xf3, 0xf4, 0x19, 0xff, 0xfa, 0xff, 0x19, + 0x0f, 0x11, 0xed, 0xec, 0xf8, 0x0f, 0x10, 0xf3, 0xff, 0x0b, 0xf7, 0x06, + 0x0b, 0x0e, 0x07, 0xe4, 0x18, 0x0a, 0x08, 0x0e, 0x02, 0x0a, 0x05, 0x19, + 0x02, 0xf3, 0xfe, 0xfe, 0x0b, 0x0f, 0xfc, 0xfa, 0x05, 0xf9, 0xe2, 0xf9, + 0x1b, 0xf7, 0x0f, 0x07, 0xfc, 0x12, 0xfe, 0x01, 0xfd, 0xf0, 0x04, 0xf4, + 0xfd, 0x07, 0xf2, 0x04, 0x04, 0x07, 0xef, 0x0c, 0xed, 0x0e, 0xf6, 0xef, + 0x08, 0x07, 0x04, 0xe9, 0xf3, 0x20, 0xda, 0x15, 0xf8, 0xff, 0xec, 0xe0, + 0xf6, 0xff, 0xe9, 0x08, 0x01, 0x10, 0xf0, 0xfc, 0xe9, 0x08, 0xe8, 0xf5, + 0xf8, 0xe5, 0x17, 0xe6, 0x03, 0xfc, 0x09, 0xf5, 0xdd, 0xf2, 0xff, 0x05, + 0xf6, 0xf8, 0xf5, 0x07, 0xfc, 0xf1, 0x04, 0xf3, 0x13, 0xe1, 0x0f, 0xf2, + 0x0a, 0xf9, 0xfd, 0x1c, 0xe0, 0x11, 0x1b, 0xe6, 0xef, 0x05, 0x05, 0x0c, + 0x23, 0x10, 0x09, 0xfe, 0xf7, 0x1a, 0xf1, 0xfc, 0x11, 0x1d, 0xff, 0x03, + 0x03, 0xe6, 0x07, 0x11, 0x0c, 0x0d, 0x16, 0x05, 0x05, 0x25, 0xf3, 
0x10, + 0x10, 0x06, 0x09, 0xe8, 0x1a, 0xf0, 0xee, 0x09, 0xff, 0x24, 0xf7, 0xfb, + 0xe6, 0x06, 0xfa, 0x08, 0x03, 0x00, 0xf2, 0x04, 0xf0, 0xeb, 0x14, 0x1c, + 0x03, 0x21, 0x14, 0x1d, 0xfe, 0x03, 0xf6, 0x02, 0x09, 0xff, 0x00, 0x13, + 0xef, 0x10, 0x1e, 0x0b, 0x1d, 0x1c, 0xf1, 0xf6, 0xe7, 0xfd, 0x14, 0x01, + 0xff, 0x13, 0xf7, 0xfc, 0x00, 0x21, 0xe3, 0xeb, 0x07, 0x0e, 0x09, 0xf1, + 0xf8, 0xfd, 0x03, 0xee, 0x19, 0xfd, 0xff, 0xfb, 0xff, 0xea, 0xfb, 0x07, + 0xf0, 0x0a, 0x04, 0x04, 0x0b, 0x12, 0xfe, 0x0b, 0xe0, 0xff, 0xf6, 0xe5, + 0xfc, 0x11, 0xed, 0xfd, 0x15, 0x03, 0xdd, 0xdb, 0x04, 0xfe, 0xff, 0x0e, + 0xff, 0xfa, 0xfb, 0xe5, 0xef, 0xf6, 0xfe, 0x22, 0x0f, 0xe8, 0xfe, 0xf4, + 0xfd, 0xd9, 0x03, 0x0a, 0xdf, 0xcf, 0xf1, 0x14, 0x05, 0xfd, 0xfb, 0xf3, + 0xfb, 0xfb, 0x0f, 0xf8, 0x05, 0x09, 0x03, 0xf7, 0x05, 0x05, 0x13, 0xfb, + 0xeb, 0x23, 0xe7, 0x18, 0xfb, 0x00, 0xfe, 0xdd, 0xe9, 0xea, 0xd3, 0xe8, + 0x1a, 0xef, 0x01, 0xf1, 0x09, 0x1d, 0xd8, 0xfc, 0xda, 0x19, 0x03, 0xec, + 0xe5, 0xf3, 0xed, 0x0a, 0xf4, 0x13, 0x0b, 0xf7, 0x0c, 0x00, 0xf9, 0xea, + 0xe3, 0xfe, 0xff, 0x0d, 0x0a, 0x1b, 0xd7, 0x17, 0xeb, 0xe9, 0x00, 0x0e, + 0xee, 0x24, 0xef, 0x09, 0x07, 0xf0, 0xf5, 0x07, 0xf5, 0xf5, 0x10, 0x17, + 0x06, 0xf7, 0xfc, 0x02, 0xfb, 0xf9, 0xe7, 0x0a, 0x26, 0xf3, 0x01, 0x01, + 0x09, 0x0b, 0x02, 0x27, 0xf8, 0xee, 0xfd, 0x1c, 0xf8, 0xf2, 0x0f, 0xfc, + 0x0d, 0xe0, 0xea, 0x02, 0x0b, 0x00, 0xe0, 0x08, 0xfe, 0x10, 0x04, 0xfe, + 0xeb, 0x13, 0x01, 0x0c, 0x0e, 0xed, 0x09, 0x01, 0x0c, 0xe3, 0x10, 0xdf, + 0xd1, 0x14, 0xf3, 0xef, 0x09, 0xf0, 0xee, 0xe5, 0x11, 0xf4, 0xf6, 0x00, + 0xe8, 0x20, 0x0a, 0xfc, 0xea, 0xf7, 0x02, 0x16, 0xe7, 0xf3, 0x0d, 0xe4, + 0x04, 0xe6, 0xef, 0xf8, 0x0f, 0x23, 0x02, 0xe0, 0x01, 0x01, 0x01, 0x05, + 0xf5, 0x0d, 0xf5, 0xf5, 0xe1, 0xff, 0x04, 0x00, 0xf4, 0x0d, 0xee, 0xf1, + 0xef, 0xf7, 0x0b, 0xff, 0x1b, 0xec, 0x05, 0xe7, 0xf3, 0x13, 0x12, 0xf2, + 0xf3, 0xfc, 0xea, 0x06, 0xfe, 0x13, 0x12, 0xdb, 0x11, 0xe2, 0xfc, 0x0d, + 0x1c, 0xe8, 0x1d, 0xfc, 0xf2, 0xe2, 0x13, 0x1d, 0xda, 0xf6, 0x1c, 0x18, + 0x1e, 0xf4, 0xfa, 0x03, 0xdc, 0x0f, 0xff, 0xff, 0x18, 0x0b, 0xed, 0xf1, + 0xf8, 0x02, 0xf4, 0x10, 0xf9, 0xeb, 0x0b, 0x0e, 0x0f, 0x01, 0x02, 0x1b, + 0x06, 0x10, 0x00, 0xe7, 0x23, 0x0d, 0xf6, 0x11, 0x08, 0xf5, 0x0f, 0x05, + 0x13, 0xf7, 0x01, 0x01, 0x0c, 0xf6, 0xf9, 0xf0, 0x29, 0x01, 0xe9, 0x11, + 0x02, 0xfa, 0xeb, 0x16, 0x0e, 0x10, 0x09, 0x0e, 0x1c, 0x0a, 0xe3, 0xd3, + 0x01, 0xe3, 0x00, 0x06, 0xe2, 0xe9, 0x19, 0xef, 0x12, 0xf3, 0xfc, 0x02, + 0x0b, 0x0c, 0x0d, 0xed, 0xfd, 0xf6, 0xf9, 0xe9, 0xf2, 0x28, 0xfe, 0x03, + 0xec, 0x03, 0x00, 0xf8, 0xde, 0x0d, 0x25, 0x07, 0x1a, 0xe7, 0xfd, 0x29, + 0xd8, 0xf7, 0xfb, 0xde, 0x0c, 0x08, 0x06, 0x22, 0xee, 0x1d, 0x05, 0x07, + 0xf0, 0xfb, 0xfe, 0x07, 0xf1, 0x04, 0xe9, 0x01, 0xfc, 0xf1, 0x00, 0xeb, + 0xe3, 0x08, 0xec, 0xfe, 0x04, 0xeb, 0xfc, 0x01, 0xf6, 0x0e, 0xdf, 0xf8, + 0x12, 0xe3, 0x16, 0xdc, 0x21, 0x0a, 0xe6, 0x06, 0xe5, 0x10, 0x07, 0xf7, + 0x1e, 0xde, 0xe3, 0x07, 0x16, 0xed, 0x23, 0xf2, 0x12, 0x0d, 0xe9, 0xf9, + 0xe8, 0xfe, 0x0e, 0x02, 0x18, 0x0a, 0xea, 0xec, 0xfb, 0xfe, 0x0c, 0x1b, + 0x19, 0x20, 0xfa, 0x07, 0xe5, 0x0c, 0x04, 0x27, 0xdb, 0xe6, 0xfe, 0x0d, + 0x0a, 0x0a, 0xfe, 0x39, 0xdd, 0xde, 0x05, 0xec, 0x09, 0x05, 0x0a, 0x2c, + 0xf4, 0x02, 0x1f, 0xd3, 0x24, 0xee, 0x0f, 0x3c, 0xf5, 0xfd, 0xf8, 0xf8, + 0x12, 0xf5, 0xf3, 0x19, 0xf9, 0xda, 0xf6, 0x0a, 0x0a, 0xf4, 0x09, 0x0f, + 0xfc, 0x00, 0x01, 0x01, 0xf3, 0xf8, 0x05, 0xf3, 0x0c, 0x19, 0x0e, 0xfd, + 0xfa, 0xe1, 0xfc, 0x0c, 0x03, 0xfb, 0x1b, 0x06, 0xcc, 0xe4, 0x08, 0xf9, + 0x10, 0xe9, 0x06, 0x00, 0x17, 0xe8, 0x0d, 0x12, 0xca, 0xf5, 0x23, 
0xe4, + 0x21, 0xf6, 0x19, 0x33, 0xdd, 0xfa, 0x0c, 0x01, 0x14, 0x07, 0x00, 0x34, + 0xda, 0x05, 0x07, 0x01, 0x07, 0xe4, 0x06, 0x24, 0x02, 0xff, 0xf0, 0x09, + 0xfc, 0xf4, 0x03, 0x06, 0xee, 0x08, 0xe2, 0x1d, 0xfa, 0x0c, 0xfc, 0x02, + 0x03, 0xe5, 0xf0, 0xe2, 0x0a, 0x18, 0x12, 0x0c, 0x1e, 0x20, 0xed, 0x20, + 0xe4, 0x01, 0x2a, 0x09, 0x0d, 0x0e, 0xd0, 0xf4, 0xdd, 0xfd, 0x2b, 0xf2, + 0x08, 0x0c, 0xf8, 0xf7, 0xfc, 0xf9, 0x15, 0xef, 0x19, 0x1c, 0x01, 0xff, + 0xe2, 0x01, 0xf3, 0x30, 0x0e, 0xfb, 0x15, 0xe8, 0x1c, 0x00, 0xfa, 0x16, + 0xef, 0xea, 0xfb, 0x05, 0xf0, 0x0e, 0x02, 0x13, 0xf4, 0x01, 0x03, 0xe5, + 0x29, 0x07, 0x09, 0x24, 0xf9, 0xe3, 0xf8, 0xde, 0x2d, 0xf4, 0xf5, 0x40, + 0xed, 0xdf, 0x07, 0xef, 0x0f, 0x0a, 0x0b, 0x32, 0x0d, 0xe8, 0x00, 0xe6, + 0xf6, 0xfc, 0xfd, 0x19, 0x11, 0x09, 0xf3, 0x03, 0xea, 0xf1, 0xfb, 0x02, + 0xfd, 0x06, 0xff, 0xfe, 0x09, 0xec, 0x06, 0x0c, 0x15, 0xf9, 0x06, 0xd7, + 0xe3, 0xf7, 0xed, 0x01, 0x03, 0xfd, 0x14, 0x01, 0x0e, 0xe0, 0x37, 0x0d, + 0xd2, 0x18, 0x2f, 0xea, 0x12, 0x0d, 0x05, 0x3a, 0xd5, 0x07, 0x1e, 0xf2, + 0x21, 0x11, 0xf9, 0x36, 0xd3, 0xf5, 0x12, 0xf6, 0xfb, 0xf6, 0x06, 0x0f, + 0xde, 0xf9, 0x06, 0x09, 0xdf, 0xff, 0x0b, 0xf3, 0xf5, 0x01, 0xf1, 0xea, + 0xf2, 0x02, 0x12, 0xfc, 0x0e, 0xee, 0xf8, 0xeb, 0x00, 0xef, 0x21, 0x0f, + 0x09, 0xef, 0xeb, 0x1e, 0xef, 0xf2, 0x26, 0xf9, 0x17, 0xf1, 0xf1, 0xf0, + 0x0c, 0x10, 0x1d, 0xff, 0x1d, 0x06, 0x03, 0xf6, 0xfb, 0x14, 0x1b, 0x03, + 0x22, 0xfd, 0xec, 0x03, 0xfa, 0xf8, 0x01, 0x2b, 0x1e, 0x1b, 0x09, 0x09, + 0x07, 0xff, 0xf0, 0x20, 0xee, 0x14, 0xfb, 0xf6, 0xf8, 0x11, 0xd9, 0x29, + 0xf4, 0xfa, 0x07, 0xef, 0x20, 0xf9, 0xf2, 0x30, 0xee, 0xf0, 0xf3, 0xd6, + 0x0d, 0xfe, 0x03, 0x36, 0xf5, 0xd7, 0x01, 0xe6, 0x04, 0xf0, 0x05, 0x1f, + 0x0f, 0xdd, 0xff, 0xf8, 0x1f, 0xf2, 0x04, 0x37, 0xfa, 0x00, 0xfd, 0xf8, + 0x10, 0xe1, 0xfb, 0x0d, 0xed, 0xf6, 0xe2, 0xfe, 0x08, 0xfe, 0x07, 0x08, + 0x08, 0x11, 0x0a, 0xf0, 0xf8, 0xf5, 0x04, 0xea, 0x08, 0x12, 0x06, 0x0d, + 0x0f, 0x10, 0x40, 0x28, 0xc0, 0xfb, 0x3f, 0x08, 0x1d, 0x09, 0x1b, 0x3d, + 0xee, 0xf4, 0x29, 0x13, 0x20, 0xfc, 0x11, 0x4c, 0xdb, 0x02, 0x15, 0x05, + 0xec, 0xeb, 0x0a, 0x22, 0xe7, 0x00, 0x02, 0x01, 0xd4, 0xea, 0x0a, 0xf3, + 0xe3, 0xf8, 0xf5, 0xfa, 0x01, 0x0d, 0x19, 0x06, 0x24, 0x13, 0x02, 0xf5, + 0xf1, 0xf1, 0x1b, 0x0f, 0x19, 0x04, 0xe3, 0xf9, 0xe7, 0x02, 0x29, 0xfc, + 0x29, 0xec, 0xe9, 0x04, 0xdc, 0x22, 0x1d, 0xfd, 0x1f, 0x01, 0xec, 0xe8, + 0xf5, 0x14, 0x1b, 0x19, 0x06, 0x0e, 0x02, 0x0d, 0xf9, 0x06, 0xfc, 0x15, + 0x07, 0xfa, 0x0c, 0xe1, 0x18, 0x1a, 0xe8, 0x1b, 0xe9, 0xef, 0x0a, 0x18, + 0xfc, 0x05, 0xf9, 0x14, 0xdc, 0x04, 0x01, 0xff, 0x07, 0xfd, 0xf0, 0x2c, + 0xf2, 0xec, 0x0e, 0xe7, 0x1a, 0x05, 0xe8, 0x35, 0x13, 0x09, 0xf9, 0x07, + 0xfe, 0xfa, 0x0d, 0x40, 0x0c, 0xea, 0xf4, 0x04, 0x01, 0x11, 0xfc, 0x23, + 0xeb, 0xf4, 0xe9, 0x04, 0xeb, 0xe7, 0x07, 0x09, 0xfb, 0xf1, 0xf6, 0xfd, + 0x02, 0xfa, 0x02, 0xff, 0x00, 0xff, 0xf1, 0xf1, 0x1a, 0xe9, 0x10, 0xe3, + 0x0b, 0x0c, 0x08, 0x04, 0x1b, 0x0a, 0x2b, 0x10, 0xe1, 0x01, 0x1f, 0x06, + 0x04, 0xec, 0x19, 0x49, 0xee, 0xf8, 0x22, 0x0c, 0x20, 0x02, 0x07, 0x31, + 0xe7, 0xff, 0x0f, 0xf0, 0xfd, 0xea, 0x13, 0x26, 0xce, 0xfa, 0xff, 0xee, + 0xe9, 0xfe, 0x15, 0x08, 0x04, 0x05, 0x0d, 0xfa, 0xdd, 0xf8, 0x07, 0x0b, + 0x33, 0xef, 0xec, 0xf9, 0xd9, 0xe6, 0x1d, 0x10, 0x41, 0xf6, 0xdf, 0x11, + 0xe3, 0x14, 0x1d, 0xfb, 0x2b, 0x15, 0xdc, 0x09, 0xf6, 0x05, 0x16, 0x00, + 0x1c, 0x27, 0xe4, 0xfc, 0xf7, 0x16, 0x08, 0x08, 0x2f, 0xdd, 0xf8, 0xfa, + 0xe9, 0x0e, 0x0b, 0x0b, 0x02, 0x12, 0x02, 0xfd, 0x19, 0x03, 0xeb, 0x11, + 0xf4, 0x09, 0x09, 0x15, 0x12, 0x0d, 0xef, 0x1c, 0xe4, 0xfe, 0x17, 
0x0c, + 0x09, 0x04, 0xea, 0x2f, 0xf2, 0x1e, 0x02, 0xfb, 0xfe, 0xe3, 0x00, 0x2e, + 0x04, 0xf9, 0x0c, 0x05, 0x27, 0x0c, 0x07, 0x2d, 0xf7, 0x0b, 0xfb, 0xf9, + 0x1c, 0xdf, 0x11, 0x36, 0x05, 0xf2, 0x02, 0xf8, 0x0b, 0x07, 0x05, 0xfb, + 0xfc, 0x0e, 0x13, 0xfa, 0xfb, 0x09, 0xf5, 0xfd, 0x06, 0x15, 0xf9, 0x03, + 0x18, 0xfd, 0x1a, 0x0a, 0x03, 0xe2, 0xfb, 0x00, 0x1e, 0xfe, 0x4f, 0x27, + 0xe1, 0xf7, 0x31, 0xf0, 0x1b, 0xec, 0x07, 0x5f, 0xe2, 0xf8, 0x40, 0x05, + 0x17, 0x24, 0x0c, 0x3c, 0xf3, 0x10, 0x13, 0xf8, 0x0b, 0xf3, 0xf9, 0x36, + 0xe1, 0xf3, 0xf4, 0xe8, 0xef, 0xf8, 0xfc, 0xeb, 0xe3, 0xfb, 0xf0, 0xee, + 0xdb, 0x06, 0x0c, 0x11, 0x1e, 0x10, 0xe2, 0xe9, 0xeb, 0x0d, 0x34, 0x0f, + 0x43, 0xd9, 0xef, 0x08, 0xec, 0x05, 0x1d, 0x02, 0x33, 0xef, 0xf4, 0xf7, + 0xe6, 0xf9, 0x22, 0x07, 0x04, 0x06, 0xe9, 0x02, 0xf0, 0xfc, 0x24, 0x20, + 0x24, 0x17, 0xe6, 0x0f, 0x05, 0xf6, 0xfc, 0x1f, 0xf2, 0x01, 0x0d, 0xe7, + 0xff, 0x1d, 0xf0, 0xfa, 0xd0, 0x00, 0xff, 0x0e, 0x23, 0xf9, 0xf3, 0x11, + 0xde, 0x0d, 0x05, 0x04, 0x0b, 0x0b, 0xfb, 0x26, 0x0d, 0x0d, 0xff, 0xe8, + 0x16, 0xe8, 0x0b, 0x3c, 0x18, 0xe4, 0x04, 0xff, 0xfa, 0xf3, 0xff, 0x40, + 0xee, 0x06, 0xfc, 0x0d, 0x00, 0xf7, 0x13, 0x3f, 0xf7, 0x13, 0x06, 0x08, + 0xf9, 0x13, 0xf2, 0x19, 0xfd, 0xf9, 0xf3, 0xe6, 0xfc, 0x07, 0xf6, 0xfd, + 0x0a, 0x22, 0x00, 0x01, 0x19, 0xff, 0xe7, 0xff, 0x08, 0xfd, 0x03, 0xfd, + 0x1f, 0xe7, 0x28, 0x08, 0xde, 0xf3, 0x43, 0xf6, 0x0c, 0xfe, 0x1e, 0x52, + 0xf2, 0x04, 0x17, 0xf2, 0x08, 0x0d, 0x04, 0x38, 0xde, 0x0c, 0x10, 0xef, + 0xdf, 0x0f, 0x01, 0x24, 0xde, 0xe1, 0x0d, 0xfd, 0xd4, 0xf6, 0x12, 0x0e, + 0xed, 0x01, 0xf0, 0xf3, 0xfd, 0xff, 0x18, 0xf3, 0x36, 0xda, 0xf6, 0xef, + 0xe8, 0xef, 0x37, 0x27, 0x4e, 0xf8, 0xf4, 0xff, 0xe5, 0xf3, 0x32, 0x0b, + 0x36, 0x08, 0xe9, 0xf6, 0xe2, 0x13, 0x21, 0xfe, 0x12, 0xed, 0xdd, 0xfb, + 0xf8, 0x05, 0x0f, 0x03, 0x1c, 0x04, 0xfc, 0xf2, 0x23, 0x0e, 0x03, 0xfc, + 0xf9, 0x18, 0xf7, 0x01, 0x1b, 0x03, 0xf5, 0xfd, 0xde, 0xf3, 0x19, 0xfc, + 0x11, 0x02, 0xe7, 0x13, 0xde, 0xd8, 0xf2, 0x05, 0x28, 0x02, 0x02, 0x27, + 0x07, 0x08, 0xff, 0x07, 0x27, 0x0e, 0x19, 0x40, 0xfb, 0x02, 0x0c, 0xf6, + 0x0d, 0x07, 0x0f, 0x47, 0xf8, 0x05, 0x0e, 0xfd, 0x03, 0x1e, 0x07, 0x32, + 0xe7, 0xf6, 0x24, 0x01, 0x01, 0x02, 0x0a, 0xff, 0xf6, 0x26, 0x15, 0xf0, + 0x04, 0x13, 0x03, 0xfa, 0xfe, 0xf6, 0xf1, 0x09, 0x2a, 0xe6, 0xea, 0xf6, + 0x17, 0x13, 0xeb, 0xff, 0x15, 0xeb, 0x23, 0x06, 0xc8, 0xf6, 0x33, 0xeb, + 0xf4, 0xe7, 0x12, 0x2a, 0xe3, 0xe6, 0x32, 0xfa, 0x16, 0x15, 0x17, 0x40, + 0xf1, 0x08, 0x1a, 0xf3, 0xf6, 0x0c, 0x0c, 0x11, 0xd0, 0x22, 0x02, 0xee, + 0xea, 0xf4, 0xf8, 0xf9, 0x13, 0x10, 0x17, 0xf5, 0xf1, 0x0a, 0x0e, 0xfd, + 0x32, 0xda, 0xf1, 0xe2, 0xdb, 0xf2, 0x34, 0x1f, 0x53, 0xfc, 0xe4, 0xf2, + 0xf6, 0xf2, 0x1d, 0x04, 0x4a, 0xec, 0xee, 0x06, 0xdf, 0x01, 0x1a, 0x04, + 0x27, 0xfc, 0xe6, 0xfd, 0xd9, 0xfd, 0x0e, 0x00, 0x0c, 0x16, 0xf3, 0x03, + 0xf7, 0xfc, 0x0e, 0x0f, 0x09, 0x06, 0x06, 0x04, 0x08, 0x02, 0xed, 0xf5, + 0xe4, 0xe6, 0x07, 0x06, 0x03, 0x18, 0xea, 0x13, 0xe2, 0xfa, 0x10, 0xf2, + 0x02, 0xec, 0x03, 0x3c, 0xf6, 0xf6, 0x0a, 0x10, 0x09, 0xf8, 0x15, 0x24, + 0xfd, 0x0d, 0x09, 0x01, 0x00, 0xff, 0x00, 0x1a, 0xf0, 0xee, 0x08, 0x03, + 0x1d, 0x05, 0x16, 0x46, 0xe6, 0xf8, 0x08, 0x00, 0x09, 0x09, 0xff, 0x01, + 0xfc, 0x20, 0xfc, 0xec, 0x05, 0x1b, 0x03, 0xf1, 0x12, 0xe4, 0xfa, 0x24, + 0x1c, 0xf5, 0xf2, 0x05, 0x11, 0xe7, 0xfa, 0x02, 0x20, 0xea, 0x31, 0x10, + 0xcf, 0xd8, 0x33, 0xee, 0xff, 0x09, 0x20, 0x3f, 0xe2, 0x0a, 0x29, 0xee, + 0x3a, 0xf2, 0x1e, 0x39, 0x02, 0x1e, 0xfe, 0xf2, 0xef, 0xe2, 0x0d, 0x0f, + 0xf1, 0x19, 0x02, 0xe7, 0xec, 0xff, 0xfe, 0xe4, 0xfe, 0xfb, 0x02, 
0xf6, + 0xf1, 0xf4, 0x07, 0x1a, 0x2a, 0xf9, 0x06, 0xf9, 0xda, 0xf4, 0x22, 0x02, + 0x4f, 0x0a, 0xf3, 0xfc, 0xf3, 0xf6, 0x25, 0x0a, 0x28, 0x01, 0xf7, 0x09, + 0xe6, 0x05, 0x28, 0xf7, 0x1e, 0xf2, 0xee, 0x13, 0xee, 0x05, 0x0f, 0x0a, + 0x09, 0xe8, 0xe8, 0x0e, 0x05, 0x12, 0x0f, 0x15, 0x02, 0xec, 0xf8, 0x02, + 0xf7, 0x05, 0xf8, 0xff, 0xdc, 0x00, 0x01, 0x00, 0x12, 0x17, 0xec, 0x19, + 0xfa, 0x09, 0xfa, 0xf3, 0x1d, 0x0b, 0x07, 0x25, 0xea, 0x0c, 0xf5, 0xfa, + 0x04, 0xf7, 0xfe, 0x33, 0xfe, 0x14, 0xef, 0x04, 0xf0, 0x00, 0x00, 0x3a, + 0xea, 0xfa, 0x10, 0x01, 0xe4, 0x00, 0xff, 0x23, 0xe9, 0x26, 0x15, 0x10, + 0x04, 0x14, 0x0d, 0x08, 0xf8, 0xfd, 0x10, 0xfb, 0x00, 0x21, 0x06, 0xfa, + 0x0f, 0x08, 0xf1, 0x09, 0x28, 0xf0, 0xd8, 0x0d, 0x08, 0x09, 0x02, 0xfb, + 0x12, 0x03, 0x0e, 0xfb, 0xce, 0xf0, 0x39, 0xe5, 0x09, 0xf6, 0x1f, 0x35, + 0xdd, 0x1c, 0x25, 0xef, 0x17, 0x0c, 0xf6, 0x3e, 0xf0, 0x21, 0x08, 0xff, + 0xd7, 0xfc, 0xfd, 0x1f, 0xe5, 0x18, 0x12, 0xe9, 0xf5, 0xe9, 0x12, 0xf6, + 0x02, 0x13, 0xf4, 0x0a, 0xfd, 0x03, 0x09, 0x08, 0x2f, 0x07, 0xee, 0xfd, + 0xd7, 0x00, 0x2b, 0x29, 0x3b, 0xdb, 0xde, 0xf1, 0xe1, 0xf7, 0x47, 0x12, + 0x35, 0x0c, 0xe4, 0x09, 0xef, 0x17, 0x2b, 0xea, 0x2d, 0xf8, 0xe8, 0x18, + 0xef, 0x03, 0x11, 0x0a, 0x10, 0xff, 0xe8, 0x07, 0x0c, 0x07, 0x03, 0x18, + 0x05, 0x08, 0xf8, 0xf8, 0x06, 0x18, 0xe9, 0xf9, 0xe0, 0x0f, 0x0d, 0x18, + 0x04, 0x01, 0xf0, 0x1c, 0xf6, 0x14, 0xfd, 0x12, 0x0c, 0x0c, 0x02, 0x34, + 0xf6, 0xe6, 0xfd, 0xf9, 0xf9, 0xfd, 0x00, 0x2a, 0xfc, 0xf9, 0xff, 0x0a, + 0xfe, 0x1b, 0xf5, 0x34, 0xdc, 0xf9, 0x15, 0x13, 0xe7, 0x1b, 0xf7, 0x25, + 0xfd, 0x09, 0x08, 0x0a, 0xf0, 0x17, 0x0f, 0x04, 0xf4, 0xe9, 0x06, 0x07, + 0xf5, 0x02, 0xfc, 0xf5, 0x09, 0xee, 0xf1, 0x07, 0x38, 0x03, 0x05, 0x0f, + 0x16, 0x0f, 0xed, 0xff, 0x21, 0xf8, 0x34, 0x07, 0xd1, 0xf9, 0x27, 0x00, + 0x0c, 0x21, 0x18, 0x42, 0xe6, 0x02, 0x1a, 0xf1, 0x2f, 0xf1, 0x0e, 0x3b, + 0xee, 0xf8, 0x08, 0xea, 0xfe, 0xf9, 0x03, 0x18, 0xf5, 0xf8, 0x0d, 0xeb, + 0x01, 0x10, 0x09, 0x02, 0x15, 0xfb, 0xf1, 0x0b, 0xf2, 0x06, 0x08, 0x09, + 0x2f, 0x19, 0x02, 0xfe, 0xe4, 0x06, 0x1f, 0x17, 0x49, 0xf2, 0xe2, 0x02, + 0xef, 0x04, 0x26, 0x16, 0x3f, 0x08, 0xf1, 0x0a, 0xfd, 0xf9, 0x28, 0x01, + 0x15, 0x0b, 0xf9, 0x10, 0xdc, 0x02, 0x20, 0xf7, 0x16, 0xe6, 0x09, 0x03, + 0xf1, 0xf5, 0x12, 0x1c, 0xfb, 0x2a, 0x08, 0xfa, 0x0a, 0x16, 0xf6, 0x15, + 0xf0, 0x06, 0x11, 0xfd, 0x0e, 0xf9, 0xf6, 0x12, 0xed, 0xf3, 0xfd, 0x1f, + 0x0b, 0xfa, 0x08, 0x30, 0xf8, 0xff, 0x0b, 0xeb, 0x10, 0xff, 0x07, 0x22, + 0x0d, 0x07, 0x09, 0x03, 0xf6, 0xf8, 0xfc, 0x26, 0xf8, 0xee, 0x11, 0x02, + 0x03, 0x0a, 0xef, 0x38, 0xfe, 0x13, 0x1b, 0x09, 0xfe, 0x06, 0x05, 0xf3, + 0x04, 0xdf, 0xfc, 0x00, 0xe7, 0x15, 0xec, 0xf1, 0xf8, 0xfc, 0xed, 0x05, + 0x0e, 0xf3, 0x15, 0x09, 0x01, 0x0d, 0xfd, 0x00, 0x24, 0xe2, 0x31, 0x13, + 0xd5, 0x1b, 0x2b, 0xe8, 0x03, 0x08, 0x1d, 0x33, 0xdc, 0xfd, 0x24, 0xe4, + 0x20, 0xfa, 0x07, 0x33, 0x01, 0x12, 0x06, 0xf5, 0xef, 0xf7, 0xfa, 0x13, + 0x01, 0xec, 0xee, 0xe0, 0xfd, 0x0d, 0xff, 0x09, 0xf6, 0x00, 0xed, 0x07, + 0xea, 0x0e, 0xff, 0x0e, 0x26, 0xfc, 0xf0, 0xe7, 0xe7, 0xfe, 0x30, 0xff, + 0x24, 0x04, 0x06, 0xf4, 0xf5, 0xf8, 0x23, 0x0e, 0x3d, 0xf2, 0xfd, 0x04, + 0xe8, 0xfb, 0x23, 0xfe, 0x33, 0xe1, 0x01, 0xfd, 0xdc, 0xfb, 0x0e, 0xfa, + 0x22, 0xfb, 0x11, 0xfa, 0xff, 0x08, 0x21, 0x30, 0x13, 0x03, 0xf2, 0x03, + 0xf8, 0x0f, 0xec, 0x0d, 0xef, 0x0f, 0x10, 0x10, 0x0f, 0xf6, 0xf9, 0x1e, + 0xf7, 0xe5, 0x08, 0xfa, 0x09, 0xff, 0x00, 0x15, 0x02, 0x00, 0x08, 0xfe, + 0xfb, 0x0e, 0x15, 0x28, 0xfa, 0xfb, 0x13, 0x06, 0xfb, 0x05, 0xf6, 0x11, + 0xf6, 0x0b, 0x06, 0x15, 0xe1, 0x00, 0xe9, 0x0f, 0xe1, 0x1d, 0x18, 
0xfd, + 0x0b, 0x0f, 0xff, 0xf2, 0xf5, 0xfd, 0x14, 0xff, 0xf4, 0xfe, 0xe2, 0xf8, + 0x14, 0x0b, 0xeb, 0x07, 0x35, 0xe2, 0xeb, 0x0b, 0x04, 0x22, 0xfe, 0x0e, + 0x1d, 0xf2, 0x24, 0x11, 0xcc, 0xec, 0x25, 0xf7, 0xff, 0xf9, 0x06, 0x29, + 0xe4, 0x07, 0x1c, 0xdb, 0xf8, 0x1d, 0xfa, 0x44, 0xf2, 0x01, 0x0f, 0xe6, + 0x11, 0x03, 0xee, 0x17, 0x06, 0xe0, 0x0c, 0xd8, 0xe9, 0xfd, 0x11, 0xfe, + 0x07, 0xdd, 0xea, 0xff, 0xde, 0xdd, 0x0a, 0x09, 0x30, 0xf2, 0x01, 0xe4, + 0xe0, 0xeb, 0x2d, 0x12, 0x2d, 0xeb, 0xfc, 0xf0, 0xe8, 0xf9, 0x1f, 0x08, + 0x3f, 0xeb, 0x0e, 0x13, 0xf9, 0x0c, 0x1c, 0x02, 0x25, 0xec, 0xf6, 0x05, + 0xf3, 0xf4, 0x18, 0x08, 0x12, 0xe9, 0xfb, 0xfd, 0xf9, 0x08, 0x13, 0x1c, + 0x08, 0xec, 0xfe, 0x02, 0xf1, 0x19, 0xf3, 0x1d, 0xf1, 0x07, 0x11, 0x12, + 0xfa, 0xf2, 0xf6, 0x0d, 0xff, 0x17, 0x0a, 0xfb, 0x1f, 0xf8, 0x11, 0x24, + 0xf6, 0xfc, 0xfe, 0x07, 0xed, 0x05, 0x1c, 0x21, 0xfe, 0xfe, 0x16, 0x0d, + 0x08, 0x0f, 0x09, 0x33, 0xf4, 0x1f, 0x14, 0x0c, 0xfe, 0xf5, 0xeb, 0x2a, + 0xee, 0xf3, 0x12, 0x19, 0xec, 0x01, 0x06, 0xf7, 0x05, 0x22, 0x0b, 0xeb, + 0xeb, 0x06, 0xe1, 0xf5, 0x0d, 0xee, 0xfb, 0x0a, 0x31, 0xff, 0xe3, 0xea, + 0x18, 0x09, 0xe3, 0x07, 0x1a, 0xf8, 0x15, 0xfc, 0xcc, 0xf2, 0x2a, 0xe5, + 0x01, 0xea, 0x10, 0x1f, 0xd9, 0x02, 0x13, 0xf6, 0x16, 0x01, 0x0e, 0x3c, + 0x02, 0x17, 0x04, 0xf1, 0xf7, 0x02, 0x07, 0x0c, 0x02, 0x1f, 0xf4, 0xe6, + 0xf0, 0xe9, 0x05, 0xf4, 0xfd, 0xe4, 0xf7, 0xe9, 0xfc, 0xef, 0x06, 0x02, + 0x26, 0xf1, 0xf1, 0xeb, 0xe9, 0xe6, 0x30, 0x1c, 0x38, 0x0f, 0x03, 0xf1, + 0x10, 0x04, 0x30, 0x19, 0x1f, 0xfb, 0xfc, 0x05, 0xe2, 0xfe, 0x18, 0xf2, + 0x1c, 0xf2, 0xf5, 0x0e, 0xf2, 0x05, 0x1d, 0x28, 0x12, 0xf0, 0xf0, 0x0f, + 0x0a, 0x03, 0x1a, 0x1a, 0xf3, 0x08, 0x13, 0xef, 0xf5, 0x1c, 0x06, 0x00, + 0xee, 0x12, 0x1d, 0x03, 0x18, 0x06, 0x0a, 0x0e, 0xf0, 0xeb, 0xfa, 0x0d, + 0x08, 0xff, 0x06, 0x24, 0x0f, 0x03, 0x0a, 0x0f, 0x0e, 0xff, 0x08, 0x33, + 0xfc, 0x00, 0x0e, 0xfb, 0xfb, 0x05, 0x07, 0x19, 0xe8, 0xe7, 0x12, 0x11, + 0x15, 0xf7, 0x0c, 0x1a, 0xf6, 0x28, 0x08, 0xeb, 0xf2, 0x25, 0xee, 0x01, + 0x03, 0xec, 0xed, 0xfa, 0xf0, 0xf2, 0xef, 0xf1, 0x02, 0x23, 0xef, 0x01, + 0x41, 0xfa, 0xf4, 0xf4, 0x15, 0xf5, 0xf5, 0xf9, 0x28, 0xde, 0x20, 0xf6, + 0xc7, 0xde, 0x21, 0xe4, 0xfe, 0xec, 0x0d, 0x2c, 0xee, 0x24, 0x10, 0xf0, + 0x1d, 0x12, 0x0e, 0x2b, 0x06, 0xf8, 0xfd, 0x01, 0x08, 0xef, 0xfd, 0x0f, + 0xeb, 0xed, 0xe1, 0xdf, 0xf1, 0xe5, 0x16, 0xe3, 0x08, 0xfc, 0xf6, 0xf6, + 0xd8, 0xf0, 0x23, 0xfc, 0x2b, 0xf5, 0xff, 0xe7, 0xf4, 0xe9, 0x29, 0x09, + 0x2b, 0x0c, 0xff, 0x08, 0x0b, 0xed, 0x29, 0x14, 0x3c, 0xf5, 0xeb, 0x18, + 0xf6, 0x10, 0x22, 0xf9, 0x17, 0x23, 0x02, 0x0c, 0xf6, 0xfa, 0x2f, 0xfe, + 0x1e, 0xeb, 0xfd, 0x03, 0xf0, 0x07, 0x1c, 0x09, 0xfa, 0xe1, 0x0d, 0x0f, + 0x18, 0x03, 0xfe, 0xf0, 0xec, 0x0b, 0x10, 0x02, 0x14, 0x06, 0xef, 0xf7, + 0xea, 0x0b, 0x05, 0xfe, 0x1f, 0x06, 0x0e, 0x07, 0x00, 0xe1, 0x01, 0x01, + 0x07, 0x05, 0x09, 0xf7, 0xef, 0x15, 0xf7, 0x12, 0x05, 0x03, 0x04, 0x1d, + 0x04, 0x10, 0x12, 0x06, 0x05, 0x00, 0x08, 0x18, 0xd6, 0xf2, 0xfa, 0x07, + 0xf8, 0x12, 0x07, 0xfd, 0xdd, 0x00, 0x04, 0xfb, 0xf8, 0x09, 0xf3, 0x09, + 0xfb, 0xf0, 0xe8, 0x09, 0x27, 0xf5, 0xf8, 0x06, 0x01, 0x02, 0x0e, 0xf6, + 0x1f, 0xfa, 0x29, 0xf8, 0xd6, 0x01, 0x22, 0xf8, 0x1d, 0xe3, 0x1a, 0x39, + 0x0a, 0x0d, 0x19, 0xf5, 0x12, 0xfb, 0x1d, 0x2a, 0x03, 0xf6, 0x0c, 0xf2, + 0xfd, 0xec, 0x18, 0x13, 0xfe, 0x1a, 0xe8, 0xdd, 0x01, 0xf8, 0x30, 0x01, + 0xf8, 0xfe, 0xe4, 0xe7, 0xff, 0xeb, 0x23, 0xfa, 0x2c, 0xf0, 0xfc, 0xe7, + 0x0a, 0xf8, 0x18, 0x10, 0x23, 0x01, 0xfa, 0xe8, 0xf1, 0xfa, 0x1d, 0x0e, + 0x17, 0xe7, 0xe4, 0xf5, 0xf9, 0x0c, 0x17, 0x0c, 0x13, 0xe8, 0xe1, 
0x17, + 0x19, 0x05, 0x0b, 0x0f, 0x23, 0xed, 0xff, 0xfe, 0xe0, 0x14, 0x16, 0x00, + 0x0d, 0x1c, 0x0b, 0xf5, 0xfb, 0x18, 0xee, 0xff, 0xff, 0xf3, 0x18, 0x0c, + 0x05, 0xfa, 0xf6, 0xfe, 0xfe, 0xf8, 0xf8, 0x09, 0xef, 0xf8, 0x0e, 0xf0, + 0x00, 0xf8, 0x0c, 0xf8, 0xf6, 0x07, 0x16, 0x11, 0xf8, 0xea, 0xff, 0xff, + 0x01, 0x20, 0x07, 0x08, 0xfd, 0x1c, 0xfc, 0x06, 0xed, 0x0d, 0x08, 0x15, + 0xf0, 0x25, 0x01, 0x1b, 0x00, 0x02, 0xfe, 0x01, 0x05, 0x01, 0xfd, 0xf1, + 0xe5, 0x0c, 0xe4, 0xe1, 0xf0, 0xfa, 0xee, 0x0e, 0x35, 0xee, 0x15, 0xef, + 0x0a, 0xf9, 0x01, 0xf5, 0x1f, 0x05, 0x1f, 0x0d, 0xe1, 0xf4, 0xff, 0xf5, + 0x23, 0x02, 0x18, 0x30, 0xfc, 0xf0, 0x0d, 0x04, 0x0d, 0x06, 0x29, 0x1d, + 0xf9, 0x08, 0x06, 0xe5, 0x13, 0xfd, 0x0d, 0x26, 0xef, 0x09, 0xdc, 0xf2, + 0x05, 0xdf, 0x0c, 0xf6, 0xf3, 0xd9, 0xf8, 0x08, 0xef, 0xeb, 0x0f, 0xf9, + 0x3a, 0x03, 0xff, 0xe0, 0xf7, 0xf0, 0x15, 0x12, 0x41, 0x0b, 0xf1, 0x04, + 0x04, 0xe2, 0x0e, 0x0b, 0x2c, 0x03, 0xea, 0x02, 0xfb, 0xe7, 0x08, 0xe9, + 0x22, 0xf3, 0xf2, 0x1c, 0xfa, 0xf3, 0x11, 0x04, 0x1f, 0xf5, 0x02, 0x0f, + 0x1a, 0x1f, 0x24, 0x0b, 0x06, 0x1f, 0xf3, 0x06, 0x00, 0x02, 0xe8, 0xf6, + 0xf4, 0xe8, 0x07, 0x2e, 0xfb, 0xf8, 0x10, 0x09, 0xf0, 0x0e, 0xff, 0xfe, + 0x1c, 0x14, 0x17, 0x06, 0xe2, 0xf1, 0xfa, 0x01, 0x11, 0x13, 0x12, 0x29, + 0xf1, 0x0f, 0x1f, 0xfa, 0xfd, 0xfd, 0x02, 0x07, 0x0e, 0xfb, 0x0e, 0x04, + 0x01, 0x01, 0xed, 0xfe, 0xde, 0xfd, 0x08, 0xef, 0xf6, 0x0a, 0xff, 0x0f, + 0xe7, 0xf2, 0x0f, 0x02, 0xea, 0x10, 0xf9, 0xec, 0xfd, 0x09, 0xea, 0x1f, + 0x46, 0xdd, 0xe2, 0xf7, 0x08, 0xf5, 0xf7, 0xe9, 0x33, 0xfb, 0x2f, 0xf6, + 0xb5, 0x1d, 0x15, 0xeb, 0x11, 0xf7, 0x2a, 0x2e, 0x08, 0x1d, 0xf4, 0xfb, + 0x15, 0xfa, 0x22, 0x34, 0xff, 0x06, 0xf6, 0xfd, 0xfa, 0xf9, 0x03, 0xf5, + 0xf4, 0xf4, 0xd5, 0xea, 0x01, 0x08, 0x22, 0xf1, 0xf2, 0x06, 0xd1, 0xe5, + 0x0c, 0xef, 0x12, 0x03, 0x08, 0x02, 0xf7, 0x05, 0x1b, 0x07, 0x39, 0x34, + 0x21, 0xe2, 0xe3, 0x0b, 0x0c, 0xf6, 0x29, 0xf7, 0x24, 0x0a, 0xfc, 0xff, + 0x1a, 0xfd, 0x05, 0xff, 0xff, 0x0e, 0x0a, 0x1a, 0x09, 0xfb, 0x15, 0x04, + 0x03, 0xf7, 0xfe, 0x00, 0xfc, 0xfb, 0x11, 0xfa, 0x1d, 0x0e, 0x06, 0xed, + 0xfc, 0x23, 0xd8, 0xf2, 0x04, 0xe5, 0x0f, 0x16, 0x29, 0xfe, 0xf5, 0xec, + 0xe2, 0x0e, 0xeb, 0x09, 0x1d, 0x11, 0x05, 0x11, 0xe4, 0x29, 0x12, 0x02, + 0x12, 0x19, 0x0e, 0x1a, 0xee, 0xf9, 0x05, 0x09, 0xf5, 0xfd, 0x05, 0x04, + 0xe4, 0xf1, 0x17, 0x01, 0xf2, 0xfe, 0x0b, 0xf4, 0x0d, 0x04, 0x06, 0xfe, + 0xff, 0xec, 0xe9, 0x00, 0xff, 0x03, 0x03, 0xfd, 0xf1, 0x15, 0xfc, 0xf3, + 0xff, 0xfe, 0x09, 0xee, 0x3c, 0x01, 0xec, 0x02, 0xf0, 0xf6, 0x20, 0xeb, + 0x16, 0x07, 0x32, 0xf3, 0xce, 0xf0, 0x02, 0xd4, 0x11, 0xe6, 0x28, 0x0e, + 0xe3, 0x21, 0xee, 0xce, 0x1e, 0xd9, 0x23, 0x26, 0x06, 0xfa, 0xf9, 0xf1, + 0x01, 0xe6, 0x0b, 0x07, 0xdc, 0x21, 0xbc, 0xe3, 0xef, 0xf8, 0x12, 0xfc, + 0xe6, 0xfe, 0xf5, 0xd4, 0x15, 0x0a, 0x00, 0x13, 0xfc, 0xec, 0xf3, 0xd6, + 0x1a, 0xe3, 0x21, 0x36, 0x2a, 0x03, 0xe9, 0xe3, 0xff, 0x00, 0x13, 0x1c, + 0x0e, 0x20, 0xe5, 0xf5, 0x24, 0x0b, 0x20, 0x14, 0x13, 0xf8, 0x04, 0x1b, + 0x2f, 0x0a, 0x15, 0x00, 0xf4, 0x1a, 0x11, 0x0d, 0x03, 0x18, 0x0f, 0x18, + 0x04, 0x1f, 0xfb, 0xf2, 0x1f, 0x15, 0x03, 0xfb, 0x0b, 0x17, 0xfb, 0x0b, + 0x1b, 0x1f, 0xf4, 0x07, 0xf9, 0xf9, 0xf8, 0xf4, 0x14, 0x0f, 0xf6, 0xfe, + 0xdd, 0x0b, 0xff, 0x01, 0x18, 0x04, 0x1b, 0x0a, 0xed, 0xe7, 0xf9, 0x16, + 0x02, 0x01, 0x00, 0xf7, 0xf1, 0x07, 0xf0, 0x06, 0xf8, 0x0b, 0x02, 0xf3, + 0xff, 0x20, 0xfd, 0x01, 0x04, 0xf5, 0xd9, 0xf4, 0xf4, 0xf2, 0xe8, 0xff, + 0x04, 0x00, 0xf0, 0xe2, 0xfe, 0xed, 0x1b, 0xef, 0x20, 0xfa, 0xfb, 0xf4, + 0x02, 0x18, 0x07, 0xfb, 0xef, 0xe4, 0x08, 0x0d, 0xe1, 0x0e, 0x25, 
0xc6, + 0xfd, 0x0c, 0x1c, 0x0b, 0xf0, 0x01, 0x1c, 0xd4, 0x11, 0xf5, 0x1b, 0x09, + 0xfb, 0xda, 0x13, 0xe3, 0xf9, 0x10, 0x14, 0xf0, 0xf0, 0xfd, 0x1f, 0xcf, + 0xf4, 0xe4, 0xfb, 0x0e, 0x0a, 0x11, 0xed, 0xdc, 0xfc, 0xe6, 0xf7, 0xfc, + 0x13, 0xe1, 0x0b, 0xe4, 0x04, 0x11, 0xee, 0x21, 0x14, 0xe1, 0x07, 0xe4, + 0xfb, 0x08, 0x03, 0x2b, 0x27, 0xf6, 0x0d, 0x02, 0x1b, 0x09, 0x09, 0xf8, + 0x14, 0x19, 0x0f, 0x0b, 0x01, 0x10, 0x09, 0x12, 0x03, 0xf5, 0x18, 0xf3, + 0xfb, 0xf5, 0x02, 0x0e, 0x0d, 0x00, 0x07, 0xfc, 0x18, 0x25, 0x0b, 0xf0, + 0xf9, 0xe6, 0x08, 0x01, 0x24, 0x14, 0xfa, 0xed, 0xe5, 0x1f, 0x09, 0xfe, + 0x08, 0xee, 0x1a, 0x1a, 0x05, 0x00, 0xff, 0x0c, 0xfe, 0xf9, 0x11, 0x11, + 0xea, 0xfe, 0x08, 0xf9, 0xf0, 0xe4, 0x01, 0x0d, 0xf1, 0x00, 0x0b, 0xea, + 0x19, 0xea, 0xf3, 0xf8, 0x08, 0x12, 0x1c, 0x1f, 0xfb, 0xef, 0xf0, 0xf2, + 0x14, 0xe1, 0x03, 0xfa, 0xf9, 0xda, 0xe9, 0xfc, 0xf3, 0xff, 0x12, 0x04, + 0xf7, 0xfc, 0x17, 0x0f, 0xfc, 0x29, 0x03, 0xe5, 0xf2, 0xee, 0x1e, 0xfa, + 0x04, 0xed, 0x25, 0xf4, 0xe1, 0x15, 0x10, 0x1e, 0xef, 0x1c, 0x04, 0xde, + 0xe5, 0x08, 0x21, 0xfd, 0xfd, 0xea, 0x03, 0xca, 0xda, 0x26, 0x00, 0x0a, + 0xfd, 0x05, 0xf0, 0xd4, 0xe1, 0x1a, 0xe4, 0xf5, 0x07, 0xe7, 0xfa, 0xdf, + 0xd4, 0x03, 0xf0, 0x10, 0x15, 0x0c, 0xf4, 0xed, 0xe3, 0xfb, 0x0f, 0x1e, + 0x16, 0x09, 0x00, 0xec, 0xea, 0x13, 0x16, 0x0b, 0x01, 0xfb, 0xff, 0x00, + 0xfb, 0x07, 0x13, 0x08, 0xf4, 0xe4, 0x12, 0x00, 0xfb, 0xfa, 0xfc, 0x08, + 0xeb, 0x19, 0x02, 0x1c, 0xe8, 0x26, 0xf3, 0x10, 0x09, 0x0f, 0x19, 0x02, + 0xfb, 0xec, 0xf7, 0xe2, 0xfb, 0xfa, 0x11, 0xf3, 0x0b, 0x08, 0xff, 0xd9, + 0xf8, 0x12, 0x18, 0x06, 0x07, 0x22, 0xff, 0x19, 0xf5, 0x0b, 0x0a, 0x13, + 0xf2, 0xfa, 0x02, 0x21, 0xeb, 0x11, 0x17, 0x17, 0xec, 0xe1, 0x0e, 0xf7, + 0xe8, 0xd8, 0x0e, 0x01, 0xf1, 0xed, 0xed, 0xf0, 0x09, 0xf7, 0xe7, 0xfd, + 0xf0, 0xf9, 0xdb, 0xee, 0xdc, 0xfb, 0xf8, 0x0a, 0xf5, 0x0b, 0xd4, 0xd7, + 0x08, 0x06, 0x18, 0x06, 0x0c, 0x13, 0xfd, 0x09, 0x13, 0x26, 0x12, 0xf4, + 0xef, 0x00, 0xf5, 0x28, 0x18, 0xfe, 0x04, 0x0e, 0x21, 0x1a, 0x0a, 0x1e, + 0x09, 0xf0, 0x0d, 0x0f, 0xec, 0xf3, 0x17, 0x22, 0x00, 0xec, 0x0e, 0x01, + 0xe9, 0x08, 0x09, 0xf2, 0xf2, 0x08, 0xf0, 0x0b, 0xd9, 0x09, 0x14, 0xf5, + 0xf6, 0x04, 0x19, 0xf4, 0x11, 0xe9, 0xf2, 0x0d, 0x20, 0x17, 0x0a, 0x05, + 0x0c, 0x04, 0x01, 0xfd, 0xf4, 0xfb, 0x1b, 0x0c, 0xf2, 0x0b, 0xff, 0xfe, + 0x01, 0xd8, 0xfa, 0x0e, 0xf5, 0x14, 0xf9, 0x01, 0x04, 0xf8, 0xfa, 0x02, + 0xe8, 0xf9, 0xf9, 0xea, 0xf1, 0x07, 0xff, 0x1e, 0x01, 0x0b, 0xf7, 0x0a, + 0xf7, 0x0c, 0xfd, 0xec, 0xf3, 0x05, 0xf8, 0xda, 0x0b, 0x15, 0xf6, 0xee, + 0xf9, 0x10, 0xfa, 0xfe, 0x08, 0xf0, 0xe6, 0xec, 0x05, 0xff, 0x15, 0x19, + 0x1f, 0x11, 0xfc, 0x09, 0x08, 0x01, 0x06, 0xfe, 0x04, 0x08, 0xfb, 0xfb, + 0x08, 0xf4, 0xf6, 0x28, 0x10, 0xf9, 0x28, 0x0b, 0xf8, 0x0d, 0x01, 0x00, + 0xff, 0x02, 0x05, 0x08, 0xea, 0xe9, 0xf4, 0xf6, 0x01, 0xea, 0xdf, 0x1f, + 0xfe, 0x0a, 0xf9, 0xf7, 0x0c, 0x1b, 0x06, 0xed, 0xf6, 0xf2, 0x03, 0x03, + 0xfd, 0x04, 0xf5, 0x10, 0x0a, 0x0b, 0xf4, 0xf8, 0xf1, 0xe7, 0x05, 0xfe, + 0xe7, 0x0b, 0xf1, 0xec, 0xf4, 0xec, 0x06, 0xee, 0xde, 0x05, 0x1b, 0xfe, + 0x13, 0xf3, 0xd9, 0xea, 0x04, 0x10, 0x05, 0xed, 0x15, 0x02, 0x0b, 0x10, + 0xfa, 0x02, 0x05, 0x0b, 0x02, 0x07, 0xfc, 0xf5, 0x15, 0x14, 0x05, 0xf7, + 0x0c, 0xfe, 0xf6, 0xf4, 0xfa, 0x06, 0xfc, 0x13, 0xdc, 0xe4, 0x09, 0xfa, + 0x02, 0x23, 0xec, 0x06, 0x11, 0x13, 0xf8, 0xfa, 0x27, 0x28, 0x0b, 0x23, + 0xec, 0xf1, 0x09, 0x17, 0x0f, 0x13, 0xff, 0xf2, 0xfc, 0x0a, 0xf5, 0x0d, + 0x03, 0x26, 0x01, 0x0f, 0xfe, 0xf1, 0xfb, 0xe6, 0xf0, 0x02, 0xf2, 0xff, + 0x02, 0x11, 0xff, 0xfd, 0x1c, 0x02, 0x0b, 0xf6, 0x14, 0x0c, 0x0b, 
0x21, + 0x28, 0xf0, 0x11, 0x05, 0x06, 0xed, 0xf9, 0x0a, 0xf2, 0xef, 0xf8, 0xf1, + 0xfe, 0x0d, 0xf9, 0xf7, 0xea, 0x00, 0x08, 0xdb, 0x02, 0x0f, 0xfe, 0x04, + 0xef, 0x20, 0x16, 0x01, 0xe8, 0xed, 0xe4, 0x22, 0xf6, 0x19, 0x00, 0x04, + 0x01, 0x13, 0xeb, 0x0d, 0xec, 0x01, 0x08, 0x05, 0x0c, 0x0e, 0xfe, 0x02, + 0x12, 0xf7, 0x27, 0xf9, 0xfd, 0x18, 0xfe, 0x24, 0xf7, 0x13, 0xed, 0x1e, + 0x09, 0xff, 0xd8, 0xf4, 0x12, 0xf8, 0x04, 0x0c, 0x1c, 0x11, 0xfd, 0x17, + 0x1d, 0x01, 0x13, 0xee, 0x11, 0xf3, 0xf8, 0x06, 0xf6, 0x16, 0xfe, 0x15, + 0x16, 0xdc, 0x1f, 0x00, 0x25, 0xee, 0xff, 0xf7, 0xf6, 0x02, 0xdd, 0x15, + 0xf1, 0x14, 0x08, 0xe8, 0xe5, 0x21, 0xea, 0xf0, 0x1a, 0x07, 0xea, 0x08, + 0xea, 0xe4, 0x1e, 0x00, 0x13, 0x17, 0xec, 0x11, 0xd6, 0x11, 0x18, 0x17, + 0x04, 0x15, 0x03, 0x3a, 0xd6, 0x02, 0x07, 0x04, 0xe6, 0xe5, 0xfe, 0x0e, + 0xff, 0xed, 0xfc, 0xfb, 0xff, 0x1c, 0x06, 0x0a, 0xfb, 0xf9, 0xea, 0x1a, + 0x21, 0xf5, 0x04, 0x06, 0x0a, 0xe3, 0x16, 0xea, 0x04, 0xe2, 0xf9, 0xf9, + 0xe6, 0xfb, 0x0f, 0xfc, 0x06, 0xfb, 0x10, 0x07, 0x07, 0x13, 0x07, 0xfc, + 0x16, 0xef, 0x07, 0xdc, 0x12, 0x1f, 0x08, 0xf4, 0xe9, 0x14, 0x06, 0xf7, + 0xf1, 0x0c, 0x01, 0x0c, 0xe6, 0x04, 0xf3, 0xf2, 0xe5, 0xf3, 0xef, 0x1d, + 0xf6, 0x20, 0x07, 0xfe, 0xf4, 0x05, 0xee, 0x10, 0xfd, 0x0e, 0x0b, 0x02, + 0x0d, 0xd8, 0x07, 0xfb, 0x26, 0x0a, 0x1c, 0x21, 0x06, 0x1f, 0xf4, 0x06, + 0x37, 0x18, 0xfa, 0x16, 0x1e, 0x24, 0xfb, 0xf0, 0x12, 0xf9, 0x02, 0x09, + 0x17, 0x16, 0xf3, 0xf9, 0x17, 0xf2, 0x02, 0x0a, 0x2d, 0xe7, 0xe3, 0x25, + 0xf0, 0xf9, 0x0f, 0xdd, 0x15, 0xe6, 0x04, 0xfc, 0xf1, 0x17, 0x0a, 0xea, + 0x24, 0x07, 0xf1, 0x11, 0x13, 0x29, 0xf4, 0xc5, 0xfb, 0x07, 0xef, 0x13, + 0x0b, 0xe1, 0xf1, 0xeb, 0xf8, 0x1b, 0x09, 0x08, 0x1f, 0x15, 0xf2, 0x05, + 0x02, 0xdd, 0x09, 0x0f, 0x16, 0x10, 0x01, 0x30, 0xf2, 0xe0, 0x27, 0xfe, + 0xf1, 0x0e, 0x0e, 0x07, 0xe6, 0x07, 0x0b, 0x18, 0xfe, 0x0f, 0x01, 0x07, + 0xf4, 0x07, 0x10, 0xe7, 0xfb, 0xf3, 0xf7, 0x0b, 0xf9, 0x15, 0x18, 0x25, + 0x0c, 0x14, 0x02, 0x08, 0x0a, 0x0f, 0x10, 0xec, 0xee, 0x1a, 0x03, 0x14, + 0x0f, 0xfa, 0x25, 0xff, 0x18, 0x0d, 0x0b, 0xea, 0x1f, 0x28, 0x10, 0x0c, + 0xe7, 0xee, 0xf7, 0xfa, 0x03, 0x15, 0x0c, 0x1d, 0x01, 0x00, 0x12, 0xee, + 0x01, 0xf1, 0xf8, 0x0b, 0xf3, 0xfd, 0x04, 0xf8, 0x02, 0x1e, 0x0e, 0xf3, + 0x02, 0x10, 0xfd, 0x07, 0x0b, 0x09, 0x03, 0x10, 0x3e, 0x08, 0x0e, 0x0c, + 0xf4, 0xe7, 0xfd, 0x1c, 0x27, 0x1a, 0xed, 0xe1, 0x08, 0xdc, 0xd9, 0xf1, + 0x1e, 0x07, 0x12, 0xf1, 0x10, 0xfb, 0xc8, 0x08, 0x0f, 0x03, 0x1d, 0xdc, + 0x23, 0x04, 0xf9, 0x0a, 0xff, 0x08, 0x0e, 0xc9, 0x39, 0x0a, 0x01, 0x07, + 0xec, 0xe0, 0x05, 0xe8, 0x14, 0xd8, 0xe1, 0xfa, 0xd6, 0xf8, 0xed, 0xdb, + 0xff, 0x1d, 0xf5, 0x17, 0x0f, 0x1c, 0xdc, 0xed, 0xff, 0xff, 0x04, 0x13, + 0xf5, 0xe7, 0xd2, 0x12, 0xdb, 0xe1, 0x13, 0x11, 0x23, 0x0e, 0xf9, 0x31, + 0xdc, 0xef, 0x07, 0x0a, 0x20, 0xf2, 0xf9, 0x13, 0xff, 0x1c, 0x2a, 0xdf, + 0xdb, 0xe7, 0x11, 0xf2, 0xfd, 0xfb, 0x28, 0x00, 0x15, 0x03, 0x02, 0x20, + 0x07, 0xf7, 0x19, 0x13, 0x13, 0xf6, 0x09, 0xfe, 0xfd, 0x20, 0x14, 0xf5, + 0xf5, 0xfc, 0x14, 0x0e, 0x17, 0xfe, 0x15, 0x04, 0xf9, 0xf6, 0x1d, 0xf6, + 0x1b, 0xe4, 0xee, 0xfd, 0x00, 0xe9, 0xee, 0xce, 0x0f, 0x20, 0x05, 0x02, + 0x0d, 0x06, 0x05, 0xf8, 0xef, 0xdf, 0x16, 0x17, 0xe6, 0xf1, 0x10, 0xf3, + 0x06, 0x04, 0xdb, 0xfb, 0xe7, 0xf8, 0x02, 0x11, 0xff, 0x0d, 0x0a, 0xfa, + 0x27, 0x0a, 0xfc, 0xe8, 0x11, 0x17, 0xf0, 0x0d, 0x0d, 0xee, 0xdf, 0xdd, + 0xf1, 0x15, 0xd6, 0xf7, 0x00, 0xef, 0x2e, 0xe6, 0x24, 0xfd, 0xd5, 0x04, + 0xf0, 0x08, 0x08, 0xed, 0x22, 0x07, 0xe1, 0x09, 0xd0, 0x0b, 0x18, 0xe6, + 0x3f, 0x0a, 0xe5, 0xe2, 0xf9, 0x08, 0x02, 0xd6, 0x13, 0x15, 0xbd, 
0x00, + 0x0e, 0xf8, 0xe2, 0xca, 0xec, 0x0e, 0xe6, 0xef, 0x15, 0x11, 0xcb, 0xdf, + 0xf9, 0x03, 0x22, 0x10, 0xfb, 0xf9, 0xe5, 0x08, 0xe1, 0x11, 0x10, 0xfc, + 0xfa, 0x00, 0xf8, 0x30, 0xe5, 0x08, 0x14, 0xe8, 0x12, 0xe2, 0x04, 0x19, + 0x0b, 0xfa, 0x33, 0xf3, 0xec, 0xfe, 0xf8, 0x25, 0xf8, 0x21, 0x28, 0xef, + 0x00, 0xde, 0xff, 0x2b, 0x03, 0xfc, 0x10, 0x0c, 0xcf, 0xfd, 0x19, 0x0a, + 0x0c, 0xf2, 0xf7, 0x0c, 0xfd, 0x02, 0x1c, 0xdf, 0x26, 0x0d, 0xf0, 0x0b, + 0xce, 0x15, 0xfb, 0xec, 0x27, 0xf6, 0xf9, 0xe5, 0xe2, 0xfb, 0xfd, 0xd8, + 0x28, 0xec, 0xe9, 0xf2, 0xca, 0x09, 0x02, 0x06, 0x0c, 0xfa, 0x05, 0x01, + 0xd5, 0x0a, 0x02, 0xfb, 0x04, 0x17, 0xdd, 0xfe, 0xeb, 0xf1, 0x09, 0x10, + 0x12, 0xff, 0x00, 0xe0, 0x26, 0xf7, 0xed, 0xf4, 0x00, 0xf2, 0xfa, 0x07, + 0x02, 0xf5, 0x06, 0xe8, 0x03, 0xfd, 0xdc, 0xf2, 0xc2, 0xff, 0x0b, 0xd6, + 0x25, 0x04, 0xe9, 0xf0, 0xd9, 0x08, 0x09, 0xc5, 0x23, 0x12, 0xf6, 0x13, + 0x11, 0xf3, 0x18, 0xf0, 0x34, 0xfe, 0xfe, 0xed, 0xea, 0x02, 0x17, 0xdc, + 0x1b, 0x1b, 0xea, 0xfe, 0xea, 0xfe, 0xf2, 0xc4, 0xfd, 0x04, 0xe9, 0x0d, + 0x0d, 0x09, 0xca, 0xd4, 0xe1, 0x04, 0x1e, 0xff, 0x0f, 0xef, 0xd6, 0x0f, + 0xd5, 0xf8, 0x26, 0xd6, 0x33, 0xe8, 0xf5, 0x3b, 0xf1, 0xe8, 0x39, 0xe8, + 0x08, 0xe5, 0x01, 0x02, 0x04, 0xf6, 0x19, 0x0a, 0xd0, 0xeb, 0x0b, 0x15, + 0xf7, 0x0e, 0x23, 0xf6, 0xf4, 0xd8, 0xf4, 0x17, 0x23, 0x25, 0x14, 0x01, + 0xd7, 0xfd, 0xf9, 0x1f, 0x1b, 0x11, 0x0a, 0x18, 0xf5, 0xf5, 0x0f, 0xe0, + 0x2e, 0x01, 0xe5, 0xdb, 0xe2, 0xf2, 0x14, 0xfa, 0x2a, 0x00, 0xe2, 0xea, + 0xfd, 0x0e, 0xfc, 0xc1, 0x35, 0x08, 0xf6, 0xf9, 0xec, 0x00, 0x06, 0x00, + 0x0b, 0xf6, 0x01, 0xfe, 0xea, 0x0b, 0x08, 0x05, 0xe4, 0xea, 0xd7, 0xfd, + 0xee, 0xf3, 0x0c, 0x0c, 0x0d, 0x02, 0xfd, 0xee, 0x17, 0x10, 0x13, 0xfd, + 0x07, 0x03, 0xf8, 0x0c, 0xd4, 0xed, 0xfe, 0x07, 0xf4, 0xee, 0xf4, 0x03, + 0xc2, 0x18, 0x2c, 0xd1, 0x33, 0xd8, 0xdb, 0xfa, 0xed, 0x10, 0x1c, 0xe3, + 0x37, 0x0a, 0xea, 0xfe, 0xf6, 0xef, 0x20, 0xed, 0x32, 0xf7, 0xf5, 0xf3, + 0xca, 0xfd, 0x0a, 0xcf, 0x0d, 0x10, 0xde, 0x07, 0x18, 0x10, 0xf0, 0xd6, + 0x0c, 0x04, 0xeb, 0x1a, 0xf9, 0x08, 0xc4, 0xcb, 0xe4, 0x0b, 0x19, 0xfc, + 0x29, 0xf6, 0xec, 0x07, 0xf3, 0xed, 0x2b, 0xe9, 0xfa, 0x02, 0xec, 0x2b, + 0xf0, 0xf2, 0x2d, 0xe8, 0xed, 0x00, 0x12, 0x13, 0xed, 0x1a, 0x3d, 0xf0, + 0x05, 0x04, 0xfc, 0x13, 0x10, 0x01, 0x40, 0xf2, 0x06, 0x02, 0xf9, 0x22, + 0x24, 0xff, 0x18, 0x00, 0xeb, 0xe8, 0x14, 0xf9, 0x25, 0xe0, 0xff, 0x03, + 0xe5, 0xfd, 0x08, 0xea, 0x2e, 0x0b, 0x05, 0xe7, 0xde, 0xe4, 0xf5, 0xea, + 0x3a, 0xf4, 0xf4, 0xe7, 0xed, 0xec, 0xf8, 0xee, 0x30, 0x0a, 0xdb, 0x05, + 0xf7, 0x16, 0xff, 0xf7, 0xfa, 0x1f, 0xef, 0xe4, 0xce, 0xf8, 0x13, 0x04, + 0xf9, 0x01, 0xe1, 0x03, 0xf9, 0xf9, 0x08, 0x04, 0xfa, 0xe4, 0xe7, 0xf7, + 0x28, 0xfd, 0xfd, 0x00, 0xfc, 0xfb, 0xef, 0x0a, 0xec, 0x0c, 0x0a, 0xd2, + 0x05, 0xfb, 0xcd, 0xfb, 0x9d, 0xea, 0x1c, 0xe5, 0x25, 0xe8, 0xea, 0x0b, + 0xf0, 0xf3, 0x0d, 0xab, 0x49, 0x0e, 0xeb, 0x00, 0xe2, 0x03, 0x29, 0xe0, + 0x3d, 0x06, 0xf7, 0xf8, 0xcf, 0x0c, 0x1a, 0xd6, 0x1f, 0xef, 0xfd, 0xff, + 0xef, 0x0c, 0xdb, 0xe0, 0x20, 0x06, 0xdf, 0x1a, 0xe7, 0xfc, 0xb2, 0xd1, + 0xdf, 0x13, 0x07, 0x1f, 0x0c, 0xf7, 0xde, 0x0a, 0xdb, 0xdf, 0x1a, 0xf5, + 0x29, 0x0d, 0xeb, 0x2c, 0xcf, 0x0e, 0x26, 0xfe, 0xef, 0x04, 0xf5, 0x14, + 0x09, 0x13, 0x34, 0xff, 0xfe, 0x0e, 0x06, 0x0e, 0x10, 0xf9, 0x2a, 0x0b, + 0xe6, 0xfe, 0xf1, 0x1a, 0x36, 0x29, 0x29, 0x05, 0x05, 0xd8, 0x14, 0x12, + 0x26, 0x0b, 0x18, 0xff, 0xd7, 0xdf, 0x0f, 0xed, 0x31, 0xf7, 0xfc, 0xec, + 0x0b, 0xef, 0x0c, 0xd2, 0x30, 0xf9, 0x04, 0xfe, 0xef, 0xe4, 0xfb, 0xd1, + 0x32, 0xe5, 0xee, 0xf0, 0x0c, 0xe6, 0x13, 0xed, 0x1e, 0x0b, 0xe4, 
0xe0, + 0xfa, 0xf4, 0x14, 0xf4, 0x18, 0xf7, 0xd9, 0xf6, 0xed, 0xea, 0xfc, 0x06, + 0xfc, 0xf5, 0xed, 0xeb, 0x05, 0x03, 0x1b, 0x0b, 0xff, 0x0b, 0xef, 0x01, + 0xf1, 0x16, 0x05, 0x00, 0xee, 0x0a, 0xdb, 0x10, 0xb4, 0x14, 0x0f, 0xe1, + 0x1c, 0xfd, 0xf0, 0xf8, 0xc3, 0x11, 0x17, 0xba, 0x47, 0x15, 0xe6, 0x01, + 0xea, 0xf1, 0x0c, 0x08, 0x4a, 0x15, 0xf0, 0xf7, 0xea, 0x00, 0xf5, 0xd4, + 0xf1, 0xff, 0xe0, 0x0c, 0xf4, 0x17, 0xd8, 0xea, 0x03, 0xff, 0xd5, 0x18, + 0xfb, 0x07, 0xc7, 0xc9, 0xdd, 0xf3, 0x15, 0x0d, 0x22, 0xea, 0xdb, 0x0a, + 0xd6, 0x09, 0x1d, 0xe5, 0x2d, 0x04, 0xfc, 0x35, 0xc6, 0x0e, 0x33, 0xf1, + 0xd7, 0xea, 0x01, 0x1b, 0x0e, 0x01, 0x2a, 0xff, 0xef, 0xf1, 0xf7, 0x0f, + 0xff, 0x00, 0x3b, 0xe8, 0x0a, 0xff, 0xf4, 0x0d, 0x1f, 0x04, 0x17, 0xf7, + 0xdf, 0xec, 0x12, 0x26, 0x36, 0x07, 0x0c, 0x06, 0xe7, 0xd6, 0x13, 0xe3, + 0x30, 0x09, 0x00, 0xf5, 0xe0, 0xf3, 0x11, 0xe2, 0x38, 0x0d, 0xf6, 0x05, + 0xec, 0x05, 0x00, 0xe5, 0x24, 0xef, 0xfe, 0xf8, 0x00, 0xd8, 0x18, 0xf1, + 0x26, 0x0b, 0xf2, 0xfc, 0xe0, 0xe4, 0x06, 0x0b, 0x1a, 0x05, 0xc6, 0xf6, + 0xe8, 0xde, 0xfe, 0x0c, 0x03, 0x09, 0xfe, 0xe2, 0x18, 0x1b, 0xfb, 0xf7, + 0x06, 0xf1, 0xfe, 0xf6, 0xef, 0x1b, 0x07, 0x0d, 0x01, 0x0a, 0xed, 0xf0, + 0xad, 0x1a, 0x17, 0xd6, 0x37, 0xfd, 0xd8, 0xec, 0xca, 0xf1, 0x15, 0xc4, + 0x33, 0xf1, 0xed, 0xf0, 0xe9, 0x15, 0x0d, 0xf2, 0x36, 0xde, 0xfd, 0x0e, + 0xfb, 0x10, 0x0f, 0xf6, 0xf9, 0x0c, 0xea, 0xf0, 0xe5, 0x0b, 0xee, 0xc1, + 0x10, 0xf4, 0xe8, 0x1f, 0xee, 0x00, 0xd0, 0xe4, 0xe7, 0x13, 0x07, 0x27, + 0x12, 0xea, 0xea, 0x0f, 0xea, 0xf4, 0x14, 0xee, 0xfe, 0x09, 0xfb, 0x31, + 0xdb, 0x1b, 0x1c, 0xe7, 0xef, 0xf5, 0xf7, 0x1a, 0x06, 0x01, 0x2c, 0xed, + 0xfb, 0x04, 0xfa, 0x07, 0x19, 0xec, 0x2b, 0x0d, 0xfc, 0xd8, 0xfc, 0x0f, + 0x1f, 0xfc, 0x2d, 0xf3, 0xc9, 0xda, 0x0a, 0xfe, 0x29, 0x00, 0xfa, 0x09, + 0xe8, 0xf6, 0x21, 0xf3, 0x4a, 0x1a, 0xf8, 0x00, 0xe7, 0xf0, 0x21, 0x01, + 0x22, 0xf3, 0x00, 0xe9, 0x06, 0xe3, 0x15, 0xd7, 0x3d, 0x0c, 0x07, 0xf1, + 0xf3, 0xec, 0x17, 0xdf, 0x29, 0x1b, 0xfd, 0xfe, 0xeb, 0xed, 0x17, 0xf6, + 0x23, 0x0a, 0xea, 0xee, 0xf9, 0xf3, 0x0f, 0x0c, 0xf8, 0xf5, 0xed, 0xe8, + 0x1c, 0x14, 0x07, 0x17, 0x0b, 0x0d, 0xed, 0xf7, 0xed, 0x10, 0x07, 0xd5, + 0xf2, 0x09, 0xd6, 0xf7, 0xb5, 0xf6, 0x19, 0xc9, 0x25, 0x15, 0xe8, 0xf5, + 0xc4, 0xf9, 0x2a, 0xb0, 0x39, 0x0e, 0x02, 0x11, 0xf0, 0xf7, 0x1d, 0xeb, + 0x39, 0x10, 0x02, 0x15, 0xe0, 0x08, 0x01, 0xee, 0x1c, 0x1e, 0x08, 0x04, + 0xf2, 0x02, 0xe8, 0xda, 0xfa, 0xfb, 0xe0, 0xfe, 0x05, 0x02, 0xd3, 0xca, + 0xf4, 0xec, 0x10, 0x16, 0x05, 0x0d, 0xd7, 0x09, 0xdc, 0xf6, 0x1e, 0xf8, + 0x10, 0xed, 0xf7, 0x27, 0xf5, 0x08, 0x28, 0xee, 0xec, 0xe0, 0xf8, 0x17, + 0xfb, 0x23, 0x2e, 0xf1, 0xfa, 0xf5, 0xfc, 0x1a, 0x10, 0xf7, 0x32, 0xfb, + 0xfb, 0xe8, 0xf1, 0x03, 0x24, 0xeb, 0x25, 0xf9, 0xca, 0xf1, 0xfe, 0x01, + 0x2e, 0x07, 0x18, 0x03, 0xe5, 0xea, 0x10, 0xfa, 0x3b, 0x07, 0x0f, 0x11, + 0x04, 0xf7, 0x1d, 0xf1, 0x24, 0xd9, 0x08, 0xef, 0x02, 0xdd, 0x07, 0xc8, + 0x2c, 0x0d, 0x06, 0xec, 0x17, 0xda, 0x21, 0xdf, 0x34, 0xd9, 0xfb, 0xf2, + 0xf4, 0xec, 0x0e, 0x0a, 0x0f, 0x0f, 0xdb, 0xf0, 0xfb, 0xe6, 0x0f, 0x00, + 0x04, 0xf9, 0x01, 0x05, 0x05, 0xfe, 0x08, 0xf3, 0x0e, 0xf2, 0xfb, 0x01, + 0xfd, 0x18, 0x1d, 0xf6, 0xee, 0x06, 0xcf, 0xfc, 0xae, 0x27, 0x21, 0xd2, + 0x33, 0x03, 0xe0, 0xe0, 0xc9, 0xfb, 0x3a, 0xbd, 0x4d, 0x04, 0xe8, 0xf5, + 0xe6, 0xeb, 0x19, 0xf2, 0x4b, 0x1d, 0xfc, 0xf7, 0xd9, 0xff, 0xfe, 0xea, + 0x0f, 0x04, 0x0e, 0x00, 0xed, 0x19, 0xe9, 0xe9, 0xff, 0x11, 0xef, 0x14, + 0x01, 0x17, 0xbc, 0xb5, 0xef, 0x0c, 0x22, 0x27, 0x0f, 0x01, 0xd4, 0x03, + 0xce, 0x01, 0x25, 0xff, 0xf9, 0xf0, 0x0a, 0x1c, 0xe5, 0x0f, 0x1c, 
0xee, + 0xf4, 0xf1, 0xf4, 0x0c, 0x00, 0x08, 0x1c, 0xf4, 0xd5, 0xf1, 0xfc, 0x1f, + 0x11, 0x00, 0x18, 0x03, 0xf7, 0xe4, 0xff, 0x07, 0x09, 0x1a, 0x18, 0xff, + 0xea, 0xec, 0xfd, 0x13, 0x2b, 0xf8, 0x0c, 0xfa, 0xdf, 0xf6, 0x11, 0xda, + 0x2a, 0xdc, 0xfc, 0xff, 0xff, 0xec, 0x12, 0xe1, 0x37, 0xfd, 0xeb, 0xfe, + 0xea, 0xd1, 0x12, 0xfa, 0x28, 0x1a, 0x0d, 0xf0, 0xf7, 0xe0, 0x0c, 0xeb, + 0x35, 0x14, 0xeb, 0x00, 0xeb, 0xe7, 0x1b, 0xfc, 0x09, 0x00, 0xf2, 0x04, + 0xf9, 0xe5, 0x1a, 0x0e, 0x08, 0x12, 0xf8, 0xfe, 0x09, 0x0f, 0x0d, 0xea, + 0x03, 0xe1, 0xfe, 0xf2, 0xec, 0x0d, 0x02, 0xdb, 0x04, 0x1d, 0xd4, 0x01, + 0xca, 0x13, 0x29, 0xca, 0x28, 0x04, 0xe2, 0xf1, 0xdb, 0x0b, 0x2c, 0xcd, + 0x44, 0x00, 0xe7, 0xf4, 0xd0, 0x12, 0x15, 0xff, 0x42, 0x11, 0x05, 0xfd, + 0xd9, 0x11, 0x1c, 0xf4, 0x15, 0xec, 0xf2, 0x24, 0xd6, 0x1d, 0xec, 0xda, + 0xf5, 0xec, 0xe5, 0x22, 0xf2, 0x0b, 0xbd, 0xd0, 0xeb, 0x05, 0x07, 0x1b, + 0x01, 0xed, 0xf5, 0x02, 0xcf, 0x08, 0x15, 0xfd, 0x1c, 0xe5, 0x04, 0x19, + 0xc7, 0x25, 0x22, 0xf3, 0xde, 0xfb, 0xfb, 0x20, 0xf6, 0xeb, 0x25, 0xfe, + 0xf5, 0x08, 0xf5, 0x17, 0x0e, 0x04, 0x1c, 0xf9, 0xee, 0xec, 0xe1, 0x06, + 0x12, 0xff, 0x2a, 0x13, 0xed, 0xfe, 0x05, 0x18, 0x25, 0x20, 0x09, 0x13, + 0xea, 0xd7, 0x05, 0x06, 0x33, 0x25, 0xff, 0x0a, 0xf0, 0xea, 0x17, 0xe1, + 0x30, 0xfa, 0x0d, 0x0a, 0x04, 0x00, 0x0e, 0xe9, 0x16, 0x20, 0x0d, 0x02, + 0xe8, 0xed, 0x07, 0xe8, 0x3c, 0xf1, 0xd9, 0xfa, 0xe1, 0xed, 0x18, 0xfc, + 0xf0, 0x09, 0xe3, 0x05, 0xfe, 0xd1, 0x0b, 0x0e, 0xf5, 0x25, 0xfd, 0xfb, + 0x30, 0x1e, 0x08, 0xfc, 0x0c, 0x21, 0xea, 0xfc, 0xe5, 0x1e, 0x16, 0xf5, + 0xf4, 0xfc, 0xf0, 0xea, 0xc4, 0x21, 0x27, 0xe9, 0x2b, 0xdb, 0xdb, 0xec, + 0xe5, 0xfe, 0x37, 0xe2, 0x46, 0x25, 0xfa, 0xec, 0xe4, 0xf3, 0x19, 0xf2, + 0x4c, 0x06, 0x00, 0xfb, 0xeb, 0x10, 0x10, 0xf7, 0x2a, 0xf8, 0xe9, 0x18, + 0xee, 0x21, 0xe8, 0xd5, 0xf4, 0x0a, 0xed, 0x24, 0xfe, 0xf9, 0xb2, 0xbc, + 0xf3, 0x1d, 0x00, 0x2f, 0x07, 0x08, 0xe1, 0xf1, 0xed, 0x27, 0x27, 0xfe, + 0x22, 0xfd, 0x02, 0x20, 0xd8, 0x05, 0x25, 0xec, 0xf1, 0xff, 0x0a, 0x0f, + 0xe6, 0xfe, 0x46, 0xfd, 0xe1, 0xca, 0xf7, 0x22, 0x03, 0x08, 0x21, 0xf5, + 0x0f, 0xf7, 0xfb, 0x0c, 0xfb, 0x14, 0x2d, 0x03, 0xe5, 0xe4, 0x09, 0x0b, + 0x1a, 0xe6, 0x01, 0x28, 0xe9, 0xd6, 0x0b, 0xf7, 0x2c, 0xfb, 0x11, 0xee, + 0x0b, 0xed, 0x17, 0xf0, 0x3c, 0xf5, 0x08, 0xfa, 0xf8, 0xcd, 0x17, 0xfa, + 0x39, 0xea, 0x11, 0xf5, 0xed, 0xee, 0x0a, 0xec, 0x41, 0xd6, 0xe7, 0xf9, + 0xfa, 0xc8, 0x15, 0xf7, 0x08, 0x0e, 0xe3, 0x08, 0xe8, 0xec, 0xfd, 0xfe, + 0xf1, 0x00, 0xe9, 0xf4, 0x09, 0x26, 0x02, 0x16, 0xf0, 0x01, 0xef, 0x01, + 0xff, 0x03, 0x22, 0xdb, 0xfc, 0xf5, 0xde, 0xe5, 0xc4, 0x01, 0x28, 0xd4, + 0x38, 0x08, 0xd0, 0xec, 0xd5, 0x04, 0x2f, 0xce, 0x4e, 0xeb, 0xf9, 0xe7, + 0xdf, 0xf0, 0x1b, 0xf5, 0x42, 0xf1, 0xf6, 0x09, 0xd5, 0x0a, 0x0d, 0x08, + 0x04, 0x05, 0xe2, 0x0e, 0xd7, 0x19, 0xdb, 0xda, 0xe1, 0x25, 0xde, 0x15, + 0x0e, 0x14, 0xbd, 0xb0, 0xe3, 0xe5, 0x24, 0x1e, 0xf8, 0x0d, 0xd8, 0xf7, + 0xf2, 0xff, 0x18, 0xf5, 0x07, 0xf0, 0x02, 0x25, 0xd5, 0x1e, 0x2e, 0xdf, + 0xe7, 0x05, 0xef, 0x11, 0xe8, 0xe7, 0x47, 0xf4, 0xe1, 0xde, 0x09, 0x36, + 0x1a, 0x11, 0x11, 0xf5, 0x12, 0xe5, 0xe7, 0x18, 0x01, 0x17, 0x2a, 0x03, + 0x05, 0xea, 0x09, 0x0b, 0x12, 0x04, 0x17, 0xf0, 0xee, 0xd7, 0x11, 0xed, + 0x3c, 0x17, 0x16, 0xff, 0x02, 0xdc, 0x21, 0xf3, 0x2e, 0xe5, 0x13, 0xef, + 0xec, 0xe2, 0x10, 0xd0, 0x2e, 0xee, 0xff, 0x01, 0xe0, 0xe5, 0x0b, 0xda, + 0x1f, 0xf8, 0xf6, 0xfb, 0x07, 0xdb, 0x05, 0xf6, 0x0c, 0xf3, 0xf0, 0x10, + 0xf9, 0xf5, 0xf2, 0x0d, 0x10, 0xf7, 0xf6, 0xff, 0x2b, 0x0d, 0x06, 0x1e, + 0xf3, 0x0c, 0xe9, 0x01, 0xf2, 0x23, 0xfe, 0xe9, 0xdd, 0x12, 0xdd, 
0xf7, + 0xbb, 0x22, 0x1b, 0xd4, 0x38, 0x29, 0xd4, 0xcf, 0xf5, 0xf9, 0x27, 0xdd, + 0x47, 0x00, 0xf2, 0xe5, 0x09, 0xfc, 0x0e, 0xf9, 0x34, 0x0a, 0x02, 0xfd, + 0xec, 0x25, 0x1d, 0x03, 0x15, 0x09, 0xf1, 0x1b, 0xd0, 0x17, 0xda, 0xda, + 0xe7, 0x07, 0xe3, 0x15, 0xf1, 0x02, 0xb9, 0xce, 0xe6, 0x0c, 0x10, 0x31, + 0xfe, 0xf7, 0xd9, 0xfa, 0xed, 0xed, 0x33, 0xf4, 0x19, 0xe7, 0xfe, 0x3f, + 0xe5, 0x06, 0x2e, 0xe6, 0xf2, 0xdc, 0xf5, 0x18, 0xe6, 0x01, 0x2f, 0xee, + 0xe7, 0xe4, 0xfe, 0x2c, 0x03, 0xf7, 0x20, 0x05, 0x07, 0xe2, 0x06, 0x1e, + 0x05, 0xed, 0x2f, 0x03, 0xea, 0xf8, 0x0e, 0x0c, 0x1f, 0xff, 0x20, 0xf4, + 0xe8, 0xe1, 0x1c, 0xec, 0x22, 0x1e, 0x05, 0xfd, 0xf5, 0xca, 0x30, 0xe9, + 0x30, 0xe4, 0x14, 0xff, 0xf2, 0xdc, 0x17, 0xf8, 0x26, 0xe1, 0x0b, 0x01, + 0x11, 0xc2, 0x02, 0xf1, 0x36, 0x10, 0x02, 0x05, 0xed, 0xf1, 0x15, 0xfa, + 0x17, 0xf8, 0xf7, 0xf1, 0xe8, 0xd3, 0xfd, 0x08, 0xfb, 0x27, 0xf5, 0xf5, + 0x13, 0x06, 0x0b, 0xf0, 0x01, 0xf9, 0xd7, 0x0e, 0xec, 0x12, 0xfe, 0xfd, + 0xee, 0x25, 0xd8, 0xf1, 0xb2, 0x09, 0x1c, 0xbf, 0x34, 0xea, 0xc8, 0xea, + 0xdb, 0x0e, 0x24, 0xde, 0x47, 0xfe, 0xdc, 0xe0, 0xf3, 0x06, 0x20, 0xfe, + 0x2b, 0xf6, 0x18, 0x14, 0xcd, 0x19, 0x16, 0xfe, 0x1a, 0x15, 0xf8, 0x11, + 0xf4, 0x22, 0xd7, 0xcc, 0xdd, 0x15, 0xdc, 0x14, 0xf9, 0x02, 0xbb, 0xca, + 0xe3, 0xf3, 0x0d, 0x1e, 0x2a, 0x0c, 0xe4, 0x05, 0xe0, 0x18, 0x2a, 0x07, + 0x20, 0xed, 0xf6, 0x17, 0xcf, 0xf4, 0x2a, 0xd6, 0xfb, 0xce, 0x03, 0x37, + 0xe2, 0xfd, 0x1d, 0xfb, 0xe5, 0xe0, 0x05, 0x29, 0xef, 0x16, 0x23, 0xf7, + 0x01, 0xf4, 0x0c, 0x14, 0xff, 0xee, 0x31, 0xf9, 0x12, 0xf9, 0x14, 0xf6, + 0x0c, 0xf6, 0x0b, 0x0f, 0xd8, 0xdc, 0xfe, 0x0f, 0x37, 0xfa, 0x01, 0x09, + 0x04, 0xd1, 0x0b, 0x0c, 0x29, 0xf3, 0x0a, 0xf9, 0xed, 0xc2, 0x18, 0xf4, + 0x25, 0x18, 0x0f, 0x08, 0xf7, 0xed, 0x1f, 0xf7, 0x4f, 0x0e, 0xf0, 0xe4, + 0x00, 0xeb, 0xfa, 0x1a, 0x0c, 0x03, 0xe9, 0xfc, 0xf0, 0xcc, 0x06, 0x05, + 0xf2, 0x12, 0x04, 0xe2, 0x16, 0x0a, 0x0a, 0xf3, 0x0b, 0xf3, 0xdc, 0xfd, + 0x10, 0xfc, 0x0e, 0xe2, 0xe0, 0xfe, 0xf0, 0xff, 0xb1, 0x06, 0x1b, 0xe4, + 0x30, 0x13, 0xc6, 0xc3, 0xfa, 0x0c, 0x1e, 0xd9, 0x57, 0x11, 0xe1, 0xd6, + 0xfa, 0xee, 0x1d, 0xf7, 0x37, 0xea, 0xf0, 0x05, 0xef, 0x24, 0x1e, 0xf1, + 0x10, 0xe8, 0xeb, 0x19, 0xd1, 0x18, 0xf5, 0xc8, 0xf8, 0xec, 0xf5, 0x1f, + 0xf2, 0xff, 0xb3, 0xd2, 0xe6, 0x0e, 0x06, 0x2e, 0x07, 0x17, 0xe0, 0xf5, + 0x02, 0xf9, 0x20, 0x07, 0x16, 0x08, 0xe8, 0x1d, 0xd3, 0x08, 0x34, 0xda, + 0xf2, 0xce, 0xfb, 0x1f, 0xe1, 0x00, 0x2d, 0xdb, 0xdf, 0xcc, 0x05, 0xfb, + 0xf7, 0x00, 0x33, 0xf9, 0x0b, 0x01, 0x13, 0x28, 0xf8, 0x07, 0x24, 0xf8, + 0x0f, 0x03, 0x0d, 0xe9, 0x06, 0xfe, 0x18, 0xf9, 0xed, 0xf5, 0x0c, 0xe0, + 0x2c, 0x0e, 0xf9, 0x06, 0xfb, 0xce, 0x27, 0xe8, 0x29, 0x19, 0xf9, 0x01, + 0x0e, 0xc8, 0x25, 0xed, 0x30, 0xeb, 0x01, 0xfe, 0x10, 0xdc, 0x1e, 0x00, + 0x1e, 0x10, 0xf9, 0x00, 0xfc, 0xc8, 0x0e, 0x04, 0x13, 0x04, 0xf0, 0x02, + 0xfe, 0xd8, 0x0f, 0x1b, 0xf7, 0xe1, 0xf8, 0xde, 0x12, 0xe2, 0xef, 0x0a, + 0x02, 0xe0, 0xdd, 0xf1, 0x0e, 0x2a, 0x25, 0x15, 0xeb, 0x02, 0xf4, 0xf0, + 0xbf, 0xfc, 0x27, 0xdc, 0x42, 0x0f, 0xe9, 0xbf, 0xe8, 0x20, 0x33, 0xc9, + 0x3f, 0x10, 0xec, 0xf3, 0x03, 0x02, 0x2c, 0x04, 0x38, 0x06, 0x0a, 0xf9, + 0xe5, 0x1c, 0x3f, 0x0f, 0x0c, 0x25, 0xe2, 0x06, 0xe6, 0x03, 0xf4, 0xd7, + 0xfe, 0xf6, 0xe7, 0x2f, 0xfa, 0x03, 0xb6, 0xcb, 0xf1, 0x11, 0x0a, 0x2c, + 0xfc, 0x1e, 0xe0, 0xff, 0xc2, 0xdd, 0x1d, 0xf3, 0x10, 0xfa, 0x07, 0x1e, + 0xf6, 0x20, 0x07, 0xe6, 0xf1, 0x0a, 0xe8, 0x27, 0xf1, 0xf5, 0x24, 0xed, + 0xfd, 0xee, 0x13, 0x15, 0xe9, 0xe2, 0x22, 0xe5, 0xf9, 0xdd, 0x1d, 0x32, + 0x04, 0xfa, 0x25, 0x00, 0xee, 0xfd, 0x0b, 0x0e, 0x23, 0xfa, 0x0f, 
0x01, + 0xf8, 0xf0, 0x15, 0xe4, 0x21, 0xf7, 0x10, 0xf9, 0xe7, 0xc3, 0x19, 0xe1, + 0x34, 0xff, 0xed, 0xf4, 0xef, 0xd7, 0x21, 0x01, 0x31, 0xee, 0xf7, 0xf2, + 0xf3, 0xe5, 0x0a, 0xee, 0x2e, 0x1e, 0xf2, 0x0c, 0x07, 0xc2, 0x08, 0x0a, + 0x14, 0x14, 0x00, 0xfc, 0xf9, 0xd6, 0xfb, 0xf8, 0xe5, 0xf1, 0xfa, 0xe0, + 0x15, 0x21, 0xef, 0x06, 0xf9, 0x00, 0xf5, 0xf4, 0x0b, 0x0b, 0x18, 0x02, + 0xf5, 0x04, 0xdb, 0xfd, 0xcc, 0x32, 0x1d, 0xc9, 0x3b, 0x12, 0xd9, 0xaf, + 0xcf, 0x0f, 0x26, 0xde, 0x35, 0xe4, 0xdb, 0xd3, 0x22, 0x11, 0x2e, 0xfb, + 0x36, 0xfa, 0xfd, 0x02, 0xeb, 0x0f, 0x37, 0x0b, 0x14, 0x1d, 0xdd, 0x18, + 0xe0, 0x10, 0xe0, 0xdf, 0x14, 0xf9, 0xf0, 0x19, 0xf7, 0xfb, 0xc4, 0xe5, + 0xe7, 0x11, 0x01, 0x31, 0x1a, 0xf7, 0xd8, 0xf1, 0xe9, 0xf3, 0x21, 0xf9, + 0xfe, 0xe4, 0xe9, 0x02, 0xd0, 0x06, 0x14, 0xd7, 0xfc, 0xec, 0x06, 0x10, + 0xfc, 0xf0, 0x1c, 0xe7, 0xec, 0xe3, 0x03, 0x21, 0xe4, 0x04, 0x12, 0xf0, + 0xf3, 0xed, 0x16, 0x36, 0x02, 0xfd, 0x13, 0x11, 0xdf, 0xeb, 0x19, 0x07, + 0x10, 0x0c, 0xf9, 0x08, 0xf8, 0xf4, 0x1d, 0xfd, 0x1d, 0x16, 0xf4, 0x0a, + 0x08, 0xec, 0x0c, 0x09, 0x3d, 0xe0, 0x0b, 0xee, 0x10, 0xd1, 0x1e, 0x15, + 0x43, 0xeb, 0xfa, 0xf3, 0x05, 0xc7, 0xf2, 0xd9, 0x25, 0x20, 0xee, 0xe9, + 0xfd, 0xce, 0x16, 0x0c, 0x27, 0x06, 0x0a, 0x06, 0xf9, 0xd6, 0x0b, 0x05, + 0xe8, 0x02, 0xe8, 0xd2, 0x10, 0x01, 0xf2, 0x15, 0x09, 0x04, 0xd3, 0xe2, + 0xfe, 0xf0, 0x32, 0x1b, 0xd9, 0xf5, 0xea, 0xcc, 0xcb, 0x10, 0x1c, 0xf1, + 0x3b, 0x02, 0xd4, 0xbf, 0xca, 0xfe, 0x12, 0xdb, 0x3b, 0xf8, 0xd5, 0xe7, + 0x13, 0x10, 0x1a, 0xf4, 0x38, 0x09, 0x08, 0xee, 0xf4, 0xf4, 0x3c, 0xf7, + 0x15, 0x04, 0xe4, 0xfa, 0xf4, 0x04, 0xee, 0xf4, 0x07, 0xf8, 0xe9, 0x3b, + 0xe2, 0x1f, 0xd5, 0xed, 0xe6, 0xfd, 0x18, 0x49, 0x21, 0x06, 0xd8, 0xde, + 0xfa, 0xf0, 0x1b, 0xfe, 0xde, 0x08, 0xf7, 0x14, 0xc7, 0x0f, 0x1d, 0xcf, + 0x00, 0xea, 0xff, 0x1b, 0xd5, 0x08, 0x0d, 0xd9, 0xf1, 0xf4, 0x16, 0x23, + 0xd8, 0x0c, 0x29, 0xdc, 0xf1, 0xf2, 0x21, 0x49, 0xfc, 0xe2, 0x08, 0x01, + 0xf0, 0xf8, 0x17, 0xf9, 0x0f, 0xf5, 0xfa, 0x1a, 0xef, 0xec, 0x09, 0xeb, + 0x1a, 0x0c, 0x17, 0x09, 0x11, 0xe9, 0x1a, 0xf7, 0x29, 0xf9, 0xfd, 0x07, + 0x01, 0xdd, 0x0a, 0xec, 0x22, 0x15, 0x03, 0xfd, 0xe2, 0xd2, 0x15, 0xec, + 0x4d, 0xd7, 0xfc, 0xf6, 0x0b, 0xcc, 0x0e, 0x04, 0x03, 0xf7, 0xfb, 0xfb, + 0x0d, 0xeb, 0x19, 0x07, 0xf4, 0xf4, 0xe5, 0xde, 0x22, 0x07, 0xea, 0xf7, + 0xeb, 0x23, 0xc8, 0xee, 0x03, 0x04, 0x0f, 0x19, 0xc3, 0xf8, 0x06, 0xd0, + 0xf7, 0xfe, 0x0e, 0xe7, 0x0a, 0x02, 0xb0, 0xb8, 0x00, 0xfb, 0x18, 0x0f, + 0x22, 0xf7, 0xe9, 0xdc, 0x09, 0x15, 0x23, 0x0d, 0x22, 0x13, 0xe2, 0xed, + 0xeb, 0x18, 0x20, 0x0b, 0x12, 0xfc, 0x02, 0xf1, 0xdb, 0x0e, 0xe1, 0x04, + 0xdb, 0x0f, 0xf3, 0x1a, 0x06, 0xef, 0xdb, 0xdc, 0xdd, 0xfb, 0x00, 0x2a, + 0x20, 0xfd, 0xc1, 0xe3, 0xef, 0x01, 0x14, 0xf2, 0x14, 0x00, 0x0f, 0x28, + 0xd9, 0xff, 0xf4, 0xdc, 0x09, 0xfa, 0x1c, 0x08, 0xd1, 0x03, 0x0a, 0xf4, + 0xe4, 0xdb, 0x20, 0x30, 0xea, 0x06, 0x11, 0xe2, 0x26, 0xf7, 0x16, 0x22, + 0xf9, 0x07, 0x02, 0xf5, 0xf6, 0xfb, 0x1d, 0x0c, 0x16, 0x0a, 0x07, 0xf9, + 0x11, 0xde, 0x20, 0x08, 0x19, 0x04, 0x0a, 0x0b, 0x0c, 0xf7, 0xf4, 0xfc, + 0x41, 0xf1, 0xf8, 0x16, 0x09, 0xdc, 0x0e, 0x1a, 0x2b, 0x1f, 0xe7, 0xfe, + 0x01, 0xe0, 0xfd, 0xe2, 0x34, 0xec, 0xf3, 0xf5, 0x03, 0xec, 0x0b, 0xfb, + 0x04, 0xf6, 0xdd, 0xfd, 0x06, 0x14, 0x0d, 0xfa, 0xfc, 0xf1, 0x0a, 0xca, + 0x01, 0xec, 0x0e, 0x0e, 0xec, 0xd7, 0xee, 0xd4, 0xf2, 0xfe, 0x16, 0xfa, + 0xbd, 0x0d, 0xef, 0xcb, 0xc4, 0xee, 0xed, 0x13, 0x10, 0x19, 0xf8, 0xb1, + 0xf1, 0xe3, 0x00, 0xf3, 0x0c, 0xf6, 0xde, 0xc6, 0x15, 0x27, 0x14, 0x29, + 0x15, 0xf6, 0xf4, 0xf5, 0xe7, 0x00, 0x0b, 0x2f, 0x0c, 0xef, 0x03, 
0x0f, + 0xfd, 0x08, 0xf3, 0xf9, 0xf9, 0x05, 0x0d, 0x34, 0x15, 0x1b, 0xc8, 0xd1, + 0xf2, 0x1b, 0x0a, 0x22, 0x12, 0x11, 0xe9, 0xf4, 0xe1, 0x2a, 0x20, 0x03, + 0xf2, 0xf8, 0x14, 0x0b, 0xd0, 0xf4, 0x0e, 0xbf, 0xc6, 0xd8, 0x04, 0x05, + 0xf8, 0xf4, 0x04, 0xc9, 0xea, 0xfd, 0xf7, 0xfa, 0xe3, 0x1b, 0x11, 0xde, + 0x0c, 0x11, 0x25, 0x29, 0xe5, 0x02, 0xef, 0xef, 0x02, 0xfa, 0x1a, 0x21, + 0x19, 0x09, 0x08, 0x05, 0x04, 0xe5, 0xfa, 0xed, 0x2d, 0x26, 0xfa, 0x17, + 0xf6, 0xe8, 0x12, 0x12, 0x31, 0xfc, 0x0d, 0x00, 0xf7, 0xeb, 0x19, 0xf1, + 0x2a, 0x06, 0x14, 0xec, 0x08, 0xd3, 0x21, 0x07, 0x32, 0xe3, 0x02, 0x0b, + 0xfb, 0xd8, 0x27, 0x07, 0x05, 0xe6, 0xf5, 0xf5, 0x0a, 0xf7, 0x2c, 0x2a, + 0xd8, 0x1b, 0xda, 0xf7, 0xea, 0xf6, 0xf9, 0x0e, 0xf8, 0x0c, 0x05, 0xc7, + 0xd6, 0x06, 0x12, 0xe3, 0xe1, 0xe1, 0xd8, 0xdb, 0xc6, 0xf8, 0xe6, 0xfa, + 0x0c, 0x07, 0xf8, 0xe7, 0xe1, 0x0f, 0x00, 0xf3, 0x03, 0xf0, 0xde, 0xcc, + 0xf5, 0xfc, 0xef, 0x1e, 0x16, 0x13, 0xfb, 0xf4, 0x03, 0xe9, 0xfc, 0xfa, + 0x15, 0xe8, 0x15, 0x09, 0xf1, 0x0d, 0xdb, 0x0a, 0xe8, 0x09, 0xf5, 0x1a, + 0x04, 0xf8, 0xd8, 0xd4, 0x04, 0xee, 0x25, 0x29, 0x09, 0xfe, 0xf3, 0xf5, + 0xd4, 0x0a, 0x15, 0x19, 0xf5, 0x12, 0xfe, 0x04, 0xe7, 0x01, 0xeb, 0xde, + 0xbe, 0xfe, 0x09, 0x12, 0xdf, 0x13, 0xe0, 0xef, 0xc7, 0xff, 0x03, 0x08, + 0xfe, 0xf2, 0x19, 0xe0, 0xe4, 0x0c, 0x22, 0x1e, 0x05, 0xf7, 0x16, 0xf2, + 0xf9, 0x06, 0x17, 0xf6, 0x0c, 0x1e, 0x23, 0x08, 0xfe, 0xdc, 0xfd, 0x17, + 0x11, 0xdf, 0xf5, 0x0f, 0x01, 0x03, 0x08, 0xee, 0x1b, 0x02, 0x0b, 0x1b, + 0x0c, 0x16, 0x1a, 0x00, 0x0f, 0x26, 0x14, 0xf8, 0xf4, 0xf3, 0x19, 0x16, + 0x22, 0x0a, 0xd0, 0xf9, 0xf1, 0x05, 0x2b, 0x1e, 0x1e, 0xef, 0xf5, 0x06, + 0x05, 0xe7, 0x3f, 0x2a, 0x06, 0xf0, 0x15, 0x14, 0x13, 0x20, 0x1b, 0xde, + 0x10, 0x05, 0x33, 0xf8, 0x08, 0x04, 0x17, 0x0d, 0x0f, 0xf6, 0x01, 0xed, + 0x28, 0x25, 0x1c, 0x13, 0xfb, 0xea, 0xfb, 0xf3, 0x1c, 0xf9, 0x1f, 0xf0, + 0xfb, 0x17, 0xf8, 0xff, 0x10, 0xf7, 0x0b, 0x24, 0x04, 0x00, 0x0d, 0x0c, + 0xf7, 0x0a, 0x16, 0x13, 0xf8, 0x05, 0x0a, 0xf1, 0xf5, 0xee, 0xf8, 0x14, + 0x0e, 0xed, 0xfe, 0x1b, 0xfe, 0x17, 0x13, 0x10, 0x12, 0x21, 0x1c, 0xfa, + 0xe5, 0x0b, 0x08, 0x0c, 0x10, 0x1b, 0x03, 0xef, 0x0d, 0x05, 0x0a, 0xf0, + 0x04, 0x11, 0x15, 0x00, 0xfd, 0xef, 0x02, 0x18, 0xf4, 0x09, 0xfa, 0xf6, + 0x02, 0xf7, 0xfd, 0x13, 0xef, 0x13, 0xf7, 0xf9, 0x17, 0x0f, 0xfa, 0xf8, + 0x15, 0xff, 0x04, 0xef, 0xf0, 0x15, 0xfa, 0xfe, 0xf0, 0xf4, 0xed, 0x06, + 0x1c, 0x02, 0xfb, 0xf7, 0x05, 0xfb, 0x0c, 0xef, 0xf4, 0xf0, 0xf6, 0xec, + 0x17, 0xf3, 0xf5, 0xef, 0x02, 0xfd, 0xe5, 0x21, 0x0c, 0xf1, 0x1e, 0x08, + 0xf1, 0x0b, 0xf7, 0x09, 0x1d, 0xf2, 0xf9, 0xf2, 0xfb, 0x0e, 0xed, 0xf8, + 0xfa, 0xdd, 0xf0, 0xfd, 0xdb, 0x1a, 0xf4, 0xef, 0x0c, 0x06, 0x0f, 0xdf, + 0xe2, 0x06, 0x06, 0xee, 0xfa, 0x0d, 0x17, 0xfc, 0xf9, 0x15, 0x1a, 0xe4, + 0xfb, 0x0c, 0x1a, 0xfc, 0x1b, 0x04, 0x07, 0x20, 0xff, 0x09, 0x0f, 0xf2, + 0x26, 0x19, 0x1f, 0x0d, 0x02, 0x16, 0x03, 0x03, 0xfd, 0x05, 0x01, 0x1b, + 0x0a, 0x11, 0xfa, 0x21, 0x13, 0xfb, 0x0c, 0x05, 0xf3, 0xdd, 0xe4, 0xdc, + 0x22, 0x1b, 0x15, 0x14, 0x0e, 0xe8, 0x00, 0xf7, 0xf8, 0xf4, 0x0b, 0x0b, + 0xfd, 0x21, 0xe3, 0x0f, 0xe1, 0x22, 0x01, 0x21, 0x0b, 0x1f, 0x09, 0x10, + 0xe2, 0x18, 0x11, 0x0e, 0xed, 0x01, 0x14, 0x12, 0xfd, 0x11, 0xf6, 0xe9, + 0x20, 0xe1, 0xf5, 0x1b, 0x27, 0x22, 0xfa, 0xf7, 0xfe, 0x13, 0xf6, 0xdc, + 0x06, 0x0d, 0xf4, 0x05, 0x20, 0x0d, 0x0b, 0xe4, 0x15, 0x28, 0x0c, 0x00, + 0xf5, 0x07, 0x0c, 0x0a, 0x06, 0x0e, 0xf3, 0xfb, 0xfe, 0x04, 0x08, 0xf4, + 0xef, 0x03, 0xe4, 0xeb, 0x06, 0xee, 0xed, 0xdb, 0xeb, 0x1d, 0xf4, 0xfa, + 0x0c, 0xfc, 0xfe, 0x11, 0xf7, 0xf8, 0xf5, 0xef, 0xe7, 0xfc, 0x1b, 
0xdc, + 0x17, 0xfd, 0xfe, 0x00, 0xea, 0xf4, 0xf1, 0xf7, 0x0f, 0x21, 0x04, 0xfd, + 0x0d, 0x0c, 0x0a, 0x14, 0xfd, 0x19, 0x09, 0x01, 0xfd, 0xe2, 0x0c, 0x0c, + 0xe0, 0x25, 0xfb, 0xff, 0x0d, 0x18, 0xf6, 0x0b, 0x19, 0x12, 0x10, 0x09, + 0x0b, 0x06, 0x12, 0x1c, 0x10, 0x03, 0x13, 0x0a, 0x05, 0x0f, 0x09, 0x01, + 0x21, 0xe4, 0x01, 0x26, 0xf9, 0xf4, 0x05, 0x19, 0x00, 0xff, 0x0b, 0xff, + 0x16, 0x09, 0xe7, 0xee, 0xed, 0xf5, 0x0f, 0x2f, 0xee, 0x19, 0x03, 0x0a, + 0x10, 0xee, 0xf7, 0x2e, 0xf4, 0x08, 0xf7, 0xee, 0x07, 0x00, 0xfc, 0x0e, + 0xf0, 0x12, 0x08, 0x05, 0xed, 0x11, 0xfc, 0xfb, 0xf7, 0x25, 0xf1, 0x05, + 0x0c, 0xf9, 0xfa, 0x03, 0x0c, 0x16, 0x04, 0x25, 0xf8, 0xe7, 0xfc, 0x11, + 0x0d, 0x19, 0xd8, 0xfa, 0x0b, 0x06, 0xfd, 0xef, 0x13, 0xf6, 0xff, 0x0e, + 0xf9, 0x04, 0xf1, 0xdc, 0xfb, 0xe1, 0xf6, 0x0b, 0x15, 0x07, 0xf7, 0x02, + 0x0e, 0xf1, 0xfd, 0xe3, 0xeb, 0x07, 0xf1, 0xef, 0x03, 0xfe, 0xf8, 0x07, + 0x10, 0xf7, 0x00, 0xf9, 0xf2, 0x0e, 0xf9, 0xf2, 0x1d, 0xf5, 0xd8, 0xff, + 0xe6, 0x18, 0x2a, 0x1b, 0x03, 0x16, 0xfe, 0xf4, 0xf5, 0xfd, 0x04, 0x01, + 0xfe, 0xfe, 0x07, 0xfc, 0x0e, 0xfa, 0x15, 0xeb, 0x02, 0x15, 0xea, 0xfd, + 0x04, 0xe5, 0xfe, 0xed, 0xfe, 0x1a, 0x09, 0x2a, 0x1b, 0xdf, 0xfb, 0xf8, + 0xf1, 0x04, 0x1a, 0x34, 0x07, 0xf9, 0x0d, 0xf5, 0xef, 0xec, 0x10, 0x1a, + 0x0b, 0x0f, 0x13, 0xfe, 0x10, 0x22, 0x1e, 0x02, 0xe6, 0xf7, 0x11, 0xfa, + 0x11, 0xfc, 0x1b, 0x21, 0x12, 0xf4, 0x18, 0x16, 0x29, 0xe4, 0x0c, 0x2e, + 0x12, 0x07, 0x20, 0xf6, 0x1d, 0xf4, 0x12, 0x33, 0xf4, 0xee, 0xfe, 0x05, + 0x06, 0xfb, 0x13, 0x0c, 0x0e, 0xf0, 0x00, 0xf8, 0xee, 0xf3, 0x17, 0x00, + 0xf7, 0xfb, 0xfc, 0x0f, 0xf4, 0xd5, 0x0a, 0xed, 0xeb, 0xf5, 0xe9, 0xef, + 0xd8, 0xf0, 0xf8, 0xe2, 0x19, 0xf7, 0xf8, 0x0a, 0x0b, 0x09, 0xfa, 0xe7, + 0x0f, 0xfc, 0xe8, 0x02, 0x00, 0x1a, 0xfe, 0xfd, 0x1b, 0xe6, 0xef, 0x0f, + 0xe3, 0x10, 0xf1, 0xe2, 0x0b, 0x0e, 0x06, 0x29, 0x00, 0x01, 0xf3, 0x00, + 0x11, 0x04, 0xf2, 0xf7, 0xea, 0xf8, 0xe0, 0x09, 0x0e, 0x13, 0xf4, 0x00, + 0x09, 0xfa, 0xf5, 0x0c, 0xff, 0x18, 0x08, 0x0d, 0xfa, 0xde, 0xfa, 0x03, + 0xf2, 0xf3, 0x1b, 0xeb, 0x06, 0xea, 0xfb, 0xff, 0x0d, 0xf5, 0x10, 0x17, + 0xf8, 0xe8, 0xf1, 0xf1, 0xf5, 0x00, 0x03, 0x0a, 0x09, 0x0a, 0xf3, 0xfb, + 0x33, 0x26, 0xe7, 0x17, 0xe3, 0xfa, 0x1f, 0x24, 0xfc, 0x07, 0x02, 0xe2, + 0xeb, 0x08, 0x2c, 0xf8, 0x02, 0x1f, 0x04, 0xeb, 0x0b, 0x04, 0x17, 0xf7, + 0xff, 0x1c, 0xed, 0x00, 0x3f, 0xd5, 0x17, 0x1d, 0xfe, 0x03, 0xf1, 0x1c, + 0x17, 0xec, 0x0e, 0x54, 0xee, 0xf5, 0x25, 0xfa, 0x08, 0xee, 0x13, 0x32, + 0x0e, 0xd8, 0x09, 0x0f, 0xee, 0xe5, 0x06, 0x10, 0xf4, 0xfb, 0xe4, 0xfb, + 0x09, 0xde, 0x13, 0xff, 0x02, 0xf9, 0xec, 0x0a, 0x00, 0xe9, 0xfd, 0xdc, + 0x06, 0x04, 0xdb, 0x06, 0x01, 0xf8, 0x09, 0xe2, 0x0c, 0x14, 0xda, 0xfe, + 0x20, 0xe3, 0x09, 0xda, 0x14, 0x12, 0xe1, 0x05, 0xff, 0xf3, 0x00, 0x08, + 0xfb, 0xf1, 0xfd, 0xf3, 0x04, 0xfa, 0x08, 0xff, 0x01, 0x1d, 0x0b, 0xfd, + 0x0a, 0xf4, 0xfb, 0xfc, 0xf9, 0x19, 0xed, 0xfc, 0xf2, 0x06, 0xe7, 0x02, + 0xf6, 0x0c, 0xfc, 0xfb, 0x01, 0x0c, 0xeb, 0x1b, 0xff, 0xff, 0x08, 0x1d, + 0xf7, 0xe8, 0xfc, 0xf4, 0x0c, 0xfa, 0xf1, 0xee, 0xed, 0xdd, 0xfc, 0x06, + 0x05, 0xdc, 0x1a, 0xfc, 0xf9, 0x07, 0xdf, 0x1b, 0x14, 0x0c, 0xfc, 0x01, + 0x16, 0xe1, 0xed, 0x09, 0x34, 0xee, 0xe4, 0x1c, 0x1b, 0xfc, 0x3b, 0x03, + 0x15, 0xf2, 0xeb, 0x14, 0x00, 0xdd, 0x24, 0x04, 0xf1, 0xed, 0xfd, 0xe6, + 0x32, 0xf9, 0x24, 0x04, 0x0e, 0x22, 0x03, 0x14, 0x2f, 0xf5, 0x1a, 0x37, + 0xf4, 0x18, 0x03, 0x0f, 0x4b, 0xe6, 0x0d, 0x5c, 0xf7, 0x1f, 0x1c, 0xe6, + 0x23, 0x0c, 0x15, 0x4e, 0xe0, 0x05, 0x1c, 0xec, 0xff, 0x04, 0x13, 0x15, + 0xee, 0x07, 0xec, 0x0c, 0xdd, 0xf8, 0x0e, 0x03, 0x0c, 0x1f, 0xe8, 
0x0e, + 0xf5, 0xec, 0xfc, 0xe2, 0xe8, 0xfb, 0xf6, 0x00, 0xe5, 0xea, 0xf3, 0xd3, + 0xf5, 0xfd, 0xd2, 0xfd, 0x1b, 0xed, 0x09, 0xd1, 0x23, 0xfa, 0xd4, 0xf7, + 0xe9, 0xf0, 0x0a, 0xd6, 0x14, 0x03, 0xe6, 0x10, 0xf4, 0x18, 0xfe, 0xe1, + 0x0b, 0x25, 0xf5, 0xfc, 0xe9, 0xf2, 0xe9, 0xf4, 0x0d, 0xf5, 0x00, 0xf9, + 0x17, 0x02, 0xfd, 0x03, 0x04, 0xf8, 0xf5, 0x14, 0xe3, 0xd3, 0xeb, 0xe7, + 0x09, 0xf3, 0x14, 0x17, 0xee, 0xe6, 0xf6, 0xff, 0x11, 0x26, 0xf4, 0xf7, + 0x02, 0xfa, 0x05, 0x08, 0x16, 0xff, 0x0d, 0xf7, 0xf1, 0xf7, 0xe6, 0xfb, + 0x04, 0x04, 0x07, 0x02, 0x04, 0x09, 0xf5, 0xfc, 0x5f, 0xd6, 0xe7, 0x2a, + 0x23, 0xf4, 0x1b, 0x06, 0x01, 0xea, 0xe7, 0x05, 0x25, 0xe3, 0x25, 0x07, + 0xea, 0xfb, 0xfb, 0x09, 0x25, 0xde, 0x37, 0x04, 0x07, 0xe5, 0xff, 0x14, + 0x2f, 0x0a, 0x30, 0x23, 0x04, 0xf0, 0x23, 0xfe, 0x1c, 0xd2, 0x2b, 0x55, + 0x01, 0xe5, 0x26, 0xfe, 0x14, 0xed, 0x24, 0x46, 0xe6, 0xee, 0x0f, 0xfd, + 0xed, 0xef, 0x0e, 0x1e, 0x05, 0x0a, 0x12, 0xff, 0xe4, 0xf5, 0x0c, 0xed, + 0xfd, 0xea, 0x0d, 0x13, 0x1a, 0xe5, 0xfc, 0xc2, 0xef, 0x0a, 0xe2, 0x0f, + 0xfe, 0xff, 0x0c, 0xf0, 0xff, 0xdf, 0xea, 0x00, 0xf6, 0xe1, 0x04, 0xd8, + 0x26, 0x20, 0xdc, 0xf4, 0x19, 0x06, 0xe8, 0xd2, 0x10, 0x04, 0xf1, 0x02, + 0x0c, 0x06, 0xf0, 0xf0, 0x04, 0x1f, 0xf4, 0xf5, 0xed, 0xf1, 0xfa, 0xf1, + 0x04, 0x02, 0xf8, 0xfb, 0x04, 0xf1, 0xe5, 0xe4, 0x0a, 0xf0, 0xfe, 0xef, + 0x1c, 0xe3, 0xeb, 0xf3, 0x00, 0x17, 0x01, 0x13, 0x19, 0xda, 0xf8, 0x06, + 0xde, 0x11, 0xea, 0xf7, 0xf4, 0xef, 0x03, 0x04, 0x0b, 0xe8, 0x08, 0x0e, + 0xe2, 0xee, 0xde, 0x06, 0x0e, 0x29, 0xfb, 0xfa, 0x00, 0x02, 0xec, 0x1b, + 0x52, 0xff, 0xde, 0x3a, 0x2f, 0x13, 0x30, 0xe9, 0xff, 0xf6, 0xe7, 0x15, + 0x1d, 0xd9, 0x3c, 0x0f, 0xe6, 0x14, 0xee, 0x13, 0x1f, 0xe7, 0x33, 0x08, + 0xfc, 0x06, 0x0c, 0x08, 0x19, 0xd9, 0x2b, 0x1f, 0x07, 0x10, 0x24, 0x16, + 0x29, 0xfc, 0x31, 0x4d, 0xf0, 0xd9, 0x3f, 0xf2, 0x20, 0xe2, 0x25, 0x49, + 0xe5, 0xec, 0x0a, 0xf5, 0xf2, 0xd9, 0x22, 0x1f, 0xed, 0x22, 0x02, 0x0a, + 0x16, 0x08, 0xf7, 0xfb, 0x0e, 0xfb, 0xfb, 0x1d, 0xf3, 0x1c, 0xf6, 0xe1, + 0xcf, 0x19, 0xf4, 0x0f, 0xee, 0xf9, 0x04, 0xd1, 0xf9, 0xe2, 0xda, 0xf1, + 0x24, 0xf5, 0x07, 0xdf, 0x1d, 0xf9, 0xdb, 0x18, 0x0b, 0xea, 0x08, 0xca, + 0xf2, 0xfa, 0xec, 0x04, 0x0e, 0x17, 0xed, 0xf1, 0x06, 0x15, 0xfc, 0xfd, + 0x08, 0xfa, 0xe3, 0xe4, 0x0a, 0xfc, 0xee, 0x08, 0xf5, 0x09, 0xef, 0xee, + 0x06, 0xef, 0xe1, 0x19, 0x07, 0xe8, 0xe6, 0xdf, 0xea, 0x0d, 0xf1, 0x16, + 0xee, 0xed, 0xf8, 0x09, 0xfa, 0xfb, 0x0c, 0xf8, 0xeb, 0xda, 0x00, 0xfc, + 0x04, 0xfe, 0xf5, 0xff, 0xf6, 0xe1, 0x0c, 0x0a, 0x13, 0x0d, 0xf6, 0xf5, + 0x15, 0x07, 0xca, 0xec, 0x50, 0x0e, 0xd0, 0x26, 0x4c, 0xf8, 0x23, 0xeb, + 0xff, 0x08, 0xe3, 0x11, 0x2c, 0xf9, 0x2a, 0xf1, 0xe9, 0x0b, 0xe9, 0x0f, + 0x15, 0xec, 0x33, 0x11, 0x0c, 0x0d, 0x01, 0x01, 0x32, 0xe3, 0x41, 0x27, + 0x11, 0x02, 0x2e, 0x07, 0x09, 0xe3, 0x22, 0x4d, 0xf1, 0x05, 0x27, 0x03, + 0x25, 0xf5, 0x2c, 0x3b, 0xf4, 0x00, 0x16, 0x0b, 0xec, 0xfe, 0x17, 0x0d, + 0xff, 0xe7, 0xfe, 0x24, 0x06, 0xee, 0xf0, 0xe9, 0xfa, 0x1c, 0xf2, 0x19, + 0x08, 0xfa, 0xff, 0xd2, 0x01, 0x02, 0xea, 0x05, 0xf2, 0xf4, 0x0b, 0xd2, + 0xf9, 0x0d, 0xcd, 0x0d, 0x12, 0xf2, 0x0e, 0xe1, 0x1f, 0x00, 0xe7, 0x14, + 0x04, 0xff, 0x09, 0xdb, 0xfc, 0xd9, 0x06, 0xf9, 0xeb, 0x01, 0xef, 0xfa, + 0xfb, 0xf5, 0xfc, 0xfb, 0x14, 0xe2, 0xf9, 0xf5, 0x02, 0xfd, 0xfc, 0x01, + 0xf7, 0xf3, 0x00, 0xec, 0xe7, 0xf2, 0x00, 0xf1, 0x11, 0xec, 0xf0, 0xe9, + 0x11, 0x0a, 0x07, 0x04, 0x01, 0xee, 0xfb, 0xf2, 0x14, 0x01, 0x12, 0xf0, + 0xf2, 0xf1, 0xf0, 0xfb, 0x08, 0x03, 0xf8, 0x01, 0xe8, 0xf9, 0x17, 0x26, + 0x0f, 0xea, 0xf7, 0xf8, 0x1e, 0xfe, 0xf2, 0xf8, 0x3f, 0x00, 0xd4, 
0x1c, + 0x53, 0xfe, 0x1e, 0x0f, 0xef, 0xdd, 0xed, 0x10, 0x19, 0xe7, 0x34, 0x0e, + 0xde, 0xdf, 0xfa, 0x0e, 0x29, 0xe3, 0x16, 0x09, 0x06, 0x12, 0xeb, 0xf9, + 0x32, 0xe0, 0x1a, 0x1d, 0xf3, 0xed, 0x10, 0x07, 0x31, 0xf2, 0x12, 0x52, + 0xeb, 0xf7, 0x1e, 0xf7, 0x1a, 0xdc, 0x3e, 0x33, 0xe3, 0xfb, 0x1f, 0x0b, + 0x08, 0xfe, 0x13, 0x1a, 0xf4, 0xf8, 0xfe, 0x08, 0xfc, 0xe9, 0xfe, 0xeb, + 0xe6, 0xf6, 0x02, 0x18, 0x02, 0xe8, 0xfb, 0xf3, 0x01, 0x08, 0xd7, 0x13, + 0x04, 0xe6, 0x02, 0xe6, 0xd7, 0x01, 0xd4, 0xf0, 0x0e, 0x05, 0x18, 0xe5, + 0x08, 0xe5, 0xd2, 0x16, 0x12, 0xfe, 0x0e, 0xd3, 0xfc, 0x1f, 0xe9, 0xf8, + 0x11, 0x06, 0xf3, 0xd5, 0xf8, 0xff, 0xf0, 0x04, 0x0a, 0xd9, 0xf8, 0xfd, + 0xf5, 0x12, 0xff, 0x06, 0x1b, 0xe6, 0xfe, 0xfe, 0xde, 0xee, 0xf6, 0x18, + 0xf1, 0xf8, 0x06, 0xf3, 0x02, 0xea, 0x04, 0x14, 0xfc, 0xee, 0xe6, 0x09, + 0xf9, 0xee, 0xe3, 0xe7, 0xfc, 0xd9, 0xef, 0xfc, 0x0a, 0x0c, 0x03, 0xf6, + 0xe2, 0x11, 0x0f, 0x19, 0x18, 0x10, 0xef, 0xe5, 0x22, 0xf5, 0xe5, 0xe9, + 0x4b, 0xf7, 0xdb, 0x0c, 0x4f, 0xde, 0x22, 0x16, 0x09, 0x16, 0xd1, 0xf8, + 0x19, 0xe0, 0x24, 0xfe, 0xb8, 0xfb, 0xe5, 0x12, 0x1c, 0xe3, 0x22, 0x09, + 0x05, 0x29, 0xf7, 0x10, 0x31, 0xe1, 0x33, 0x3f, 0xfd, 0xed, 0x04, 0x03, + 0x2e, 0xed, 0x30, 0x36, 0xee, 0x16, 0x2f, 0xf5, 0x1b, 0xdc, 0x3a, 0x56, + 0xe5, 0xef, 0x26, 0xff, 0x03, 0xd7, 0x31, 0x16, 0xef, 0xf1, 0x08, 0x13, + 0x01, 0x02, 0x03, 0xf1, 0xf2, 0x08, 0xff, 0x05, 0x12, 0xf2, 0xee, 0xda, + 0xed, 0xec, 0xea, 0xf7, 0x0c, 0xf1, 0x09, 0xe6, 0xe6, 0x00, 0xcc, 0x10, + 0x0d, 0x0d, 0x20, 0xf4, 0x18, 0x23, 0xec, 0xf9, 0x00, 0xe4, 0x07, 0xd4, + 0xfb, 0x16, 0xd2, 0x01, 0xe6, 0x01, 0x06, 0xf0, 0xfe, 0x03, 0xf3, 0x09, + 0x01, 0x0d, 0x05, 0xf7, 0xd4, 0x02, 0xfb, 0xfb, 0x08, 0xf0, 0x1f, 0xf3, + 0xfe, 0xeb, 0x02, 0x0e, 0x1b, 0x0f, 0x04, 0xf5, 0xf0, 0x1f, 0x14, 0xf7, + 0x06, 0xdc, 0xf9, 0xe9, 0x01, 0xff, 0x08, 0xf2, 0x06, 0xff, 0xff, 0xf3, + 0x05, 0x1a, 0xfc, 0xfa, 0xeb, 0xfb, 0xfa, 0x12, 0x20, 0xf6, 0xe0, 0xe8, + 0x1c, 0xfa, 0xd6, 0x0d, 0x2c, 0x04, 0xe1, 0x09, 0x3b, 0xd3, 0x2a, 0xee, + 0xf7, 0xed, 0xf1, 0xf7, 0x0d, 0xf0, 0x32, 0x0f, 0xc9, 0x0e, 0x00, 0x10, + 0x24, 0xfb, 0x31, 0xf0, 0xf4, 0xdd, 0xf5, 0x04, 0x25, 0xc7, 0x27, 0x25, + 0x16, 0x11, 0x2e, 0x09, 0x30, 0xd1, 0x2c, 0x34, 0xe6, 0xf0, 0x21, 0xf5, + 0x21, 0xc8, 0x40, 0x39, 0xde, 0xf0, 0x12, 0xf3, 0x10, 0xe8, 0x1f, 0x18, + 0xfa, 0xea, 0x07, 0x11, 0xdf, 0xed, 0xfa, 0xf0, 0x07, 0xef, 0xf3, 0x05, + 0x10, 0xe5, 0xf3, 0xe9, 0xe9, 0xe8, 0xd6, 0x01, 0xf9, 0x05, 0x0b, 0xee, + 0xf9, 0x12, 0xe3, 0x05, 0xfd, 0xe6, 0x16, 0xe2, 0x1b, 0x12, 0xc5, 0x00, + 0xfd, 0x02, 0x04, 0xd2, 0xff, 0xec, 0xf6, 0xfd, 0x00, 0xe4, 0xf7, 0xf3, + 0xeb, 0xfa, 0xf8, 0x0d, 0x03, 0xfa, 0xfe, 0xe4, 0xdb, 0xe3, 0x06, 0xff, + 0xf4, 0xf2, 0x1b, 0xf1, 0xf7, 0x02, 0x01, 0x04, 0x13, 0xe5, 0x0c, 0x05, + 0xf7, 0x0a, 0x03, 0x03, 0x0b, 0x03, 0xee, 0xf7, 0x21, 0x20, 0xff, 0xf3, + 0x09, 0xe5, 0xff, 0xec, 0x17, 0x00, 0x06, 0x14, 0xeb, 0xf2, 0x18, 0x16, + 0x1f, 0xec, 0xee, 0xe1, 0x1e, 0x03, 0xfa, 0xfe, 0x28, 0x03, 0xc9, 0x0c, + 0x3f, 0xd8, 0x30, 0x16, 0x03, 0xf8, 0xe9, 0xfb, 0x28, 0xe1, 0x36, 0x0a, + 0xdf, 0xe5, 0xeb, 0x08, 0x1c, 0xcd, 0x29, 0xf2, 0xfc, 0x0a, 0xed, 0x01, + 0x29, 0xf1, 0x20, 0x13, 0x04, 0xec, 0x17, 0x0a, 0x35, 0xc3, 0x1a, 0x46, + 0xe0, 0xd7, 0x3c, 0x09, 0x28, 0xd1, 0x22, 0x20, 0xd5, 0xfa, 0x28, 0xfa, + 0xff, 0xea, 0x1d, 0x23, 0xe0, 0x07, 0x07, 0x0f, 0xf1, 0xf1, 0x08, 0xf0, + 0xf8, 0xff, 0x05, 0x1b, 0x05, 0xfa, 0xf0, 0xfb, 0xe3, 0xe4, 0xcc, 0x1a, + 0xf9, 0x09, 0x06, 0xee, 0xf4, 0x03, 0xd0, 0x14, 0xf4, 0xff, 0x1d, 0xe8, + 0x11, 0xf4, 0xd1, 0xf4, 0x04, 0x0b, 0xfb, 0xdc, 0x0a, 0x0c, 0xeb, 
0xed, + 0x06, 0xf3, 0x04, 0xdd, 0xdf, 0xf9, 0xea, 0xfc, 0xf5, 0xf2, 0xfb, 0xea, + 0xe3, 0x03, 0xee, 0x0e, 0xff, 0xdb, 0x1e, 0x04, 0xf7, 0x1a, 0x04, 0x0c, + 0x0d, 0xda, 0x04, 0xe9, 0xff, 0x04, 0x00, 0x0c, 0xf9, 0xe4, 0xfb, 0xf6, + 0x14, 0xde, 0x1b, 0x00, 0x0b, 0xfe, 0x06, 0xf8, 0x0f, 0xdc, 0x01, 0xef, + 0xef, 0x0d, 0xf8, 0xf1, 0x0f, 0xf9, 0xf9, 0xdf, 0x0d, 0xe4, 0xd9, 0xf9, + 0x2b, 0xee, 0xe8, 0x09, 0x40, 0xf9, 0x2f, 0x0a, 0xfa, 0xe8, 0xe9, 0x01, + 0x0e, 0xe7, 0x23, 0x0a, 0xd0, 0x19, 0xd3, 0x0e, 0x04, 0xda, 0x2b, 0x0f, + 0xe7, 0xe6, 0xf3, 0xfb, 0x2c, 0xd3, 0x36, 0x19, 0x0e, 0xfe, 0x03, 0x1a, + 0x2e, 0xd0, 0x23, 0x32, 0xf1, 0xe1, 0x2a, 0x09, 0x1b, 0xf6, 0x29, 0x3e, + 0xce, 0x15, 0x0a, 0xe8, 0xec, 0xdf, 0x44, 0x28, 0xd9, 0xfd, 0xfa, 0x09, + 0xff, 0xe7, 0x08, 0xec, 0xf4, 0xef, 0x01, 0x19, 0x11, 0xf3, 0xeb, 0xeb, + 0xed, 0x1a, 0xdd, 0x15, 0x0f, 0x07, 0xfe, 0xeb, 0xff, 0xd6, 0xd5, 0x04, + 0xf5, 0x07, 0x10, 0xe6, 0x0c, 0xe4, 0xda, 0x0c, 0x08, 0xee, 0x06, 0xd8, + 0xf8, 0xf1, 0xe0, 0x01, 0x08, 0xfe, 0xf9, 0xf3, 0xdf, 0x03, 0xe6, 0xf4, + 0x0a, 0xff, 0xf2, 0xe0, 0xd9, 0xeb, 0x01, 0x10, 0x02, 0xfc, 0x0d, 0x14, + 0xea, 0xf8, 0x03, 0x18, 0xf3, 0x09, 0xfc, 0x0c, 0x0b, 0x1f, 0xf5, 0x05, + 0xf7, 0xf9, 0x00, 0xfd, 0x04, 0xfc, 0x16, 0x07, 0x00, 0xdf, 0xf9, 0xfa, + 0x0c, 0xfb, 0xf4, 0xf7, 0xf0, 0xeb, 0x07, 0x17, 0x20, 0xfb, 0xf0, 0xec, + 0x04, 0x00, 0xf8, 0xf2, 0x2d, 0xf9, 0xd9, 0x0b, 0x55, 0xec, 0x33, 0x26, + 0xf8, 0x0a, 0xf2, 0x0b, 0x25, 0xdf, 0x29, 0x05, 0xd1, 0x14, 0xe2, 0xf2, + 0x12, 0xdd, 0x28, 0xfc, 0xec, 0x08, 0xfd, 0x02, 0x3a, 0xe6, 0x29, 0x25, + 0x0d, 0x10, 0x09, 0x0a, 0x32, 0xf5, 0x17, 0x2d, 0xea, 0xfb, 0x35, 0xfc, + 0x28, 0xd0, 0x29, 0x2f, 0xcb, 0x06, 0x0f, 0x04, 0xf2, 0xf3, 0x34, 0x1c, + 0xf4, 0x08, 0x05, 0xfc, 0xfd, 0xed, 0x0f, 0xf8, 0xe9, 0xf0, 0x09, 0x16, + 0xfe, 0x02, 0xff, 0xd4, 0xea, 0x0a, 0xeb, 0x0c, 0xf8, 0xf4, 0x09, 0xf4, + 0xf2, 0x07, 0xd9, 0x0b, 0xfd, 0xe4, 0x1a, 0xef, 0x14, 0x08, 0xd8, 0xfc, + 0xf5, 0xe1, 0x03, 0xcf, 0xf1, 0x11, 0xdb, 0x15, 0x07, 0x10, 0xf8, 0xfc, + 0xe2, 0xf1, 0xf5, 0xde, 0xff, 0xe7, 0x01, 0xea, 0xee, 0xe9, 0x02, 0x0a, + 0x18, 0xec, 0xfe, 0xf9, 0x09, 0xf3, 0x0e, 0x02, 0xf1, 0xfc, 0xf9, 0x16, + 0x05, 0x07, 0x09, 0x0d, 0x0e, 0xf7, 0x04, 0xed, 0x04, 0xdb, 0x04, 0x04, + 0xf6, 0xdc, 0xee, 0xec, 0xf5, 0xfe, 0xf4, 0x02, 0xe4, 0x0b, 0xe0, 0x17, + 0x0a, 0xe0, 0xf7, 0xdc, 0x11, 0xd6, 0xfe, 0xfa, 0x35, 0xde, 0xe6, 0x06, + 0x44, 0xf9, 0x35, 0x0a, 0xfb, 0xff, 0xec, 0xfb, 0x16, 0xd9, 0x23, 0x0f, + 0xd4, 0xef, 0xdf, 0x06, 0x0b, 0xd9, 0x25, 0xff, 0xf8, 0xeb, 0xf4, 0x0a, + 0x20, 0xe5, 0x22, 0x1c, 0xeb, 0xf4, 0x0d, 0x0c, 0x19, 0xe1, 0x1e, 0x31, + 0xe9, 0xfb, 0x20, 0xf0, 0x23, 0xfe, 0x35, 0x28, 0xb4, 0x06, 0x28, 0xe7, + 0xfb, 0xe9, 0x2a, 0x1a, 0xef, 0x15, 0x0c, 0xed, 0xf1, 0x04, 0x0e, 0x0a, + 0xff, 0x16, 0x01, 0x04, 0x17, 0xea, 0xec, 0xdc, 0xf4, 0xf7, 0x04, 0x16, + 0x1f, 0x0a, 0x11, 0xef, 0x12, 0xdf, 0xd9, 0x0c, 0xf5, 0x10, 0x02, 0xf3, + 0x10, 0x03, 0xd3, 0xf5, 0x0b, 0x02, 0x00, 0xcb, 0xf6, 0x23, 0xf6, 0xf1, + 0x1f, 0xf9, 0xfc, 0xf0, 0xf6, 0xfe, 0xfa, 0xf8, 0xf9, 0xf4, 0xfb, 0x0a, + 0xd6, 0x29, 0x09, 0x02, 0x00, 0xfc, 0xfc, 0xee, 0xf5, 0x05, 0xfb, 0x1e, + 0xf1, 0xf1, 0xf3, 0x02, 0xec, 0x1c, 0x0c, 0x0e, 0x0b, 0x04, 0xf6, 0xe7, + 0x14, 0x08, 0x27, 0x01, 0xfe, 0xe5, 0xe7, 0x01, 0x1b, 0xf0, 0xf6, 0xff, + 0xf4, 0xe7, 0xee, 0x18, 0x0d, 0x08, 0xf8, 0xd6, 0x07, 0xf4, 0x08, 0xff, + 0x1d, 0x13, 0xe7, 0x0b, 0x42, 0xef, 0x28, 0x00, 0xf9, 0xf0, 0xf3, 0x00, + 0x15, 0xfd, 0x1a, 0x22, 0xc1, 0xf5, 0xe0, 0xf8, 0x09, 0xe6, 0x0e, 0x05, + 0xf9, 0xf6, 0x01, 0x01, 0x13, 0xdc, 0x1f, 0x0d, 0xfb, 0x04, 0x08, 
0x0b, + 0x15, 0xdb, 0x28, 0x34, 0xed, 0x0b, 0x3a, 0xed, 0x16, 0xe3, 0x39, 0x32, + 0xc4, 0x0b, 0x20, 0xe7, 0xf7, 0x02, 0x35, 0x24, 0xfc, 0xe8, 0x1c, 0xf8, + 0xf1, 0xfa, 0x0c, 0x1d, 0xf2, 0x05, 0xff, 0x12, 0x0f, 0x01, 0xec, 0xea, + 0xf0, 0x03, 0xe7, 0x15, 0xfd, 0x05, 0x08, 0xe0, 0x1b, 0xf8, 0xe1, 0x1e, + 0xed, 0xdc, 0x11, 0xeb, 0xfd, 0x1a, 0xeb, 0x09, 0xf9, 0xf3, 0x00, 0xe8, + 0xe6, 0x08, 0xf7, 0xde, 0x1e, 0x00, 0x00, 0x00, 0xe4, 0x09, 0xf2, 0xf8, + 0xe7, 0xf2, 0x0d, 0xfa, 0xe2, 0x0f, 0x04, 0x08, 0xf2, 0x13, 0xf8, 0xf9, + 0xf1, 0xff, 0x03, 0x11, 0x12, 0xe9, 0xf4, 0x13, 0x07, 0x0c, 0x13, 0x2b, + 0xf7, 0xdd, 0xf9, 0xe9, 0xfa, 0xdb, 0x1d, 0xf6, 0xf6, 0xf9, 0xe4, 0xf6, + 0x0d, 0xeb, 0x0d, 0x08, 0xe7, 0xe7, 0xf2, 0x03, 0x1d, 0xd9, 0xd8, 0xe4, + 0xf7, 0xea, 0xdc, 0xdc, 0x26, 0x02, 0xee, 0xfa, 0x38, 0xfc, 0x1a, 0xef, + 0xda, 0xf1, 0xdf, 0x0b, 0x1a, 0xe0, 0x16, 0x16, 0xdc, 0x04, 0xfa, 0xf7, + 0xee, 0x02, 0x25, 0x02, 0xf5, 0xfb, 0x08, 0xf6, 0x11, 0xf5, 0x12, 0x08, + 0xf4, 0xe3, 0x1b, 0xf5, 0x3a, 0xdc, 0x20, 0x2e, 0xe0, 0xf5, 0x30, 0xe4, + 0x09, 0xf8, 0x3c, 0x45, 0xd3, 0x08, 0x23, 0xd8, 0x09, 0xe4, 0x35, 0x30, + 0xe4, 0xfe, 0x07, 0xf6, 0x05, 0x01, 0x05, 0xff, 0xf6, 0x0d, 0x02, 0xfd, + 0x03, 0x05, 0x0d, 0x00, 0xf5, 0xd6, 0xcf, 0x19, 0x06, 0xee, 0x0d, 0xf2, + 0x01, 0x18, 0xef, 0x12, 0x04, 0x02, 0x21, 0xd9, 0x02, 0x0d, 0xeb, 0xe9, + 0x13, 0x08, 0x15, 0xf0, 0xee, 0x03, 0xec, 0x06, 0x17, 0xed, 0x00, 0x1a, + 0xee, 0xf2, 0xfc, 0x09, 0xec, 0xf8, 0xf8, 0x18, 0xf4, 0x13, 0x04, 0xf6, + 0x02, 0xf0, 0xfc, 0xfe, 0xe3, 0x01, 0x0a, 0x1c, 0x1b, 0xec, 0x0e, 0x01, + 0xfb, 0x08, 0x11, 0xf5, 0x00, 0x14, 0xe6, 0x12, 0x07, 0xf4, 0x15, 0x07, + 0xfc, 0xfb, 0xf5, 0xf1, 0x01, 0x21, 0x01, 0xe9, 0xe8, 0xef, 0xdb, 0xdf, + 0x1f, 0x0a, 0xdd, 0xd1, 0x16, 0x04, 0xfd, 0xe1, 0x24, 0xf0, 0xec, 0xf4, + 0x38, 0xe1, 0x16, 0xfd, 0xe0, 0xec, 0xe7, 0x0c, 0x2a, 0x04, 0x0c, 0x17, + 0xdc, 0xe8, 0xf2, 0x03, 0xec, 0xfd, 0x19, 0xfe, 0xf3, 0xf0, 0xf3, 0xfb, + 0x18, 0xdf, 0x1c, 0x00, 0x09, 0xf4, 0x18, 0x0b, 0x1f, 0xf6, 0x34, 0x22, + 0xf4, 0x22, 0x45, 0xeb, 0x23, 0xcf, 0x32, 0x34, 0xf2, 0xf9, 0x29, 0xd4, + 0xf7, 0x0b, 0x38, 0x2a, 0x09, 0xe6, 0x05, 0x01, 0x0b, 0xfe, 0x17, 0xfb, + 0x00, 0xeb, 0x08, 0xfd, 0x0c, 0x02, 0x1d, 0xea, 0xfa, 0x0b, 0xeb, 0x09, + 0xfe, 0xfe, 0x10, 0xe0, 0xf6, 0x06, 0xf0, 0x15, 0xf3, 0x09, 0x11, 0xe4, + 0xf9, 0x07, 0xe1, 0xed, 0x17, 0x05, 0x0c, 0xe1, 0xdb, 0xf2, 0xf8, 0xea, + 0x22, 0xe9, 0x02, 0x00, 0xfd, 0xe7, 0xf2, 0xf8, 0xf9, 0xfc, 0xfa, 0xe8, + 0xe8, 0xeb, 0xe9, 0x0d, 0x04, 0xf8, 0xf8, 0xf7, 0xf8, 0x0d, 0x03, 0x0c, + 0x13, 0xf2, 0x0f, 0xf9, 0xe6, 0xfd, 0x0f, 0x19, 0x08, 0xf7, 0xfa, 0x01, + 0xf3, 0x12, 0x1e, 0x05, 0x0a, 0x09, 0xfd, 0x0b, 0x07, 0x08, 0x02, 0xfc, + 0xd6, 0xe8, 0x14, 0x01, 0x13, 0x19, 0xef, 0xda, 0x0e, 0x0a, 0x07, 0xef, + 0x34, 0xe0, 0x05, 0x1e, 0x4e, 0xe9, 0x19, 0xff, 0xe1, 0x04, 0xfb, 0x0e, + 0x11, 0x05, 0x1f, 0x15, 0xd4, 0xec, 0xf9, 0xe7, 0xf9, 0xfc, 0x25, 0xff, + 0x06, 0xf2, 0x01, 0xf6, 0x2a, 0x17, 0x24, 0x11, 0xf3, 0x1a, 0x1f, 0xfb, + 0x32, 0xeb, 0x33, 0x2f, 0x00, 0x08, 0x2c, 0xf0, 0x26, 0xf4, 0x25, 0x36, + 0xd9, 0xf1, 0x1a, 0xd5, 0xec, 0xf9, 0x32, 0x27, 0xfc, 0xf4, 0xf0, 0xe3, + 0xfa, 0x0c, 0x16, 0x17, 0xfa, 0xf9, 0xe5, 0x1f, 0x1f, 0xfa, 0xff, 0xfd, + 0x0d, 0x02, 0xe9, 0x0e, 0xf0, 0x12, 0x09, 0xda, 0x02, 0xea, 0xe5, 0x0a, + 0xff, 0x03, 0x13, 0xf0, 0x0a, 0xf9, 0xe9, 0xff, 0x10, 0xfc, 0x1a, 0xf3, + 0xf7, 0x0f, 0xf4, 0xfa, 0xf4, 0x05, 0x10, 0x0a, 0xdd, 0x09, 0xf7, 0xf0, + 0xe5, 0x07, 0x07, 0xfa, 0x02, 0xd7, 0xf8, 0xf7, 0x01, 0xfb, 0x0e, 0xf8, + 0x07, 0x0f, 0xfe, 0x03, 0x12, 0x05, 0x09, 0x13, 0xf8, 0xdc, 0xfd, 
0x27, + 0x0f, 0xec, 0xf7, 0x07, 0x00, 0xfc, 0x12, 0xf8, 0xfb, 0xea, 0xe4, 0xe9, + 0xe9, 0xe0, 0xff, 0xdc, 0xd6, 0xeb, 0xf2, 0xf7, 0x0d, 0x1b, 0xe9, 0xc4, + 0x06, 0x00, 0xfd, 0x04, 0x46, 0xf9, 0xe9, 0x13, 0x2d, 0x0c, 0x1f, 0xf8, + 0xd3, 0x0c, 0x14, 0x11, 0x05, 0xe5, 0x27, 0x08, 0xc5, 0xef, 0xdf, 0xdd, + 0x04, 0xf8, 0x11, 0x10, 0xf0, 0xe7, 0xfb, 0x03, 0x3c, 0xe7, 0x14, 0x0c, + 0xf4, 0xf6, 0x1b, 0x0a, 0x23, 0xf2, 0x2d, 0x1a, 0x08, 0xff, 0x32, 0xe7, + 0x1a, 0x05, 0x2b, 0x34, 0xf1, 0x0a, 0x00, 0xe8, 0x02, 0xdf, 0x2c, 0x2a, + 0x03, 0xe6, 0xfc, 0xef, 0xfc, 0xe4, 0x03, 0x01, 0x03, 0xee, 0xe9, 0x15, + 0x05, 0x03, 0x13, 0x11, 0x0e, 0xee, 0xf5, 0x22, 0x1b, 0x0e, 0xfd, 0xf3, + 0x0a, 0x02, 0xdd, 0x20, 0xeb, 0x06, 0xf8, 0xe2, 0x06, 0x0e, 0xde, 0x0d, + 0xf9, 0x16, 0x1c, 0x0c, 0xe0, 0xf0, 0xec, 0x0c, 0x0f, 0xf2, 0x27, 0x1d, + 0xde, 0xe6, 0xf0, 0xf9, 0xf0, 0x02, 0x0a, 0x07, 0x06, 0xf9, 0x0f, 0xfa, + 0xf0, 0xee, 0xf1, 0xf7, 0xff, 0x02, 0x0b, 0x0d, 0x1b, 0xee, 0xf6, 0x05, + 0xff, 0x1c, 0x17, 0x04, 0x05, 0x17, 0x00, 0xff, 0x0d, 0xf3, 0x23, 0x10, + 0xfd, 0x05, 0xfb, 0xea, 0x03, 0x10, 0x07, 0xd7, 0xf7, 0xff, 0xf3, 0xf1, + 0x17, 0xed, 0xd3, 0xcb, 0x14, 0x1c, 0xf5, 0x03, 0x47, 0xf6, 0xf7, 0xf2, + 0x3e, 0xf2, 0x22, 0xf4, 0xed, 0xfc, 0xee, 0x0b, 0xf4, 0xf1, 0x25, 0x10, + 0xd0, 0xf6, 0x00, 0xef, 0x10, 0xfc, 0x15, 0xe5, 0xdb, 0xf3, 0xea, 0x10, + 0x22, 0xf2, 0x2b, 0x11, 0xf9, 0x0a, 0xfc, 0xf5, 0x53, 0x16, 0x25, 0x43, + 0xe0, 0x0e, 0x13, 0xfc, 0x2d, 0xe2, 0x55, 0x65, 0xf4, 0x08, 0x01, 0xdf, + 0x0a, 0x00, 0x49, 0x1c, 0xfe, 0xdf, 0xef, 0xf2, 0xf9, 0xf6, 0xfd, 0xff, + 0xf3, 0x02, 0xf6, 0x14, 0x0b, 0xe8, 0x09, 0xfc, 0xfc, 0xe2, 0xe5, 0x11, + 0x03, 0x09, 0xfb, 0x06, 0x10, 0x1a, 0xf3, 0x0d, 0xfa, 0x0a, 0xd5, 0xf5, + 0x1a, 0x11, 0xf2, 0xfc, 0x1f, 0xfe, 0x0e, 0xe4, 0xef, 0xd7, 0xee, 0x06, + 0x1e, 0x04, 0x12, 0x28, 0xf7, 0x0e, 0x06, 0xf8, 0xee, 0xf0, 0x1a, 0x01, + 0xf7, 0xfd, 0x03, 0x11, 0x19, 0x10, 0x04, 0xfb, 0xd7, 0xfa, 0x16, 0x06, + 0x07, 0x23, 0xfa, 0x14, 0x11, 0xf1, 0x12, 0x10, 0x04, 0xe1, 0xee, 0xf7, + 0x21, 0x0e, 0x0a, 0x0a, 0xf8, 0x07, 0x0a, 0xee, 0x03, 0x1f, 0xfa, 0xc4, + 0xec, 0x12, 0x01, 0x1e, 0xfd, 0xf1, 0xe8, 0xcc, 0xf4, 0x17, 0xff, 0xdd, + 0x45, 0x10, 0xee, 0xfa, 0x3d, 0xe7, 0x27, 0xdd, 0xd7, 0xf9, 0xf4, 0xf6, + 0x06, 0xf8, 0x1e, 0x13, 0xe7, 0xe2, 0xf1, 0xe3, 0xf3, 0xf7, 0x18, 0x12, + 0xe4, 0x0a, 0xdb, 0xff, 0xff, 0xfe, 0x20, 0x09, 0x00, 0xf7, 0x23, 0xf6, + 0x2d, 0x14, 0x26, 0x28, 0xe5, 0xff, 0x0f, 0xe3, 0x1d, 0xe8, 0x56, 0x43, + 0xe7, 0xfb, 0xf9, 0xe6, 0xe9, 0xe2, 0x19, 0x19, 0x08, 0xfa, 0xf3, 0xe5, + 0x23, 0x07, 0x0f, 0xf8, 0xf8, 0xf3, 0xfc, 0x11, 0x2a, 0x05, 0xf4, 0xf1, + 0xfa, 0xfb, 0xf1, 0x1e, 0x13, 0x0f, 0xf9, 0xf5, 0xfa, 0x09, 0xf9, 0x03, + 0xf0, 0xf0, 0xe7, 0xec, 0xf1, 0x0c, 0xe6, 0xee, 0xf6, 0x20, 0x0f, 0xe9, + 0x00, 0xf4, 0xfe, 0xf0, 0x13, 0x0a, 0x17, 0x13, 0xee, 0x13, 0xfb, 0xff, + 0xf8, 0xfd, 0xf4, 0xe2, 0xe8, 0x06, 0xfc, 0x14, 0x03, 0x17, 0x00, 0x03, + 0xe6, 0xfd, 0xf2, 0x12, 0x12, 0x20, 0xeb, 0x10, 0x02, 0xf7, 0x13, 0x0d, + 0x11, 0xfd, 0xde, 0xf5, 0x07, 0xf3, 0x04, 0xff, 0x06, 0x05, 0xfb, 0xea, + 0xf0, 0x0a, 0x00, 0xb5, 0xe8, 0x1a, 0x03, 0xfe, 0x0d, 0x1a, 0xe7, 0xc0, + 0xd6, 0xdc, 0xf6, 0xf8, 0x39, 0xf5, 0xd5, 0xf8, 0x22, 0xfa, 0x22, 0x05, + 0xd0, 0xf4, 0x2d, 0xfc, 0x00, 0x0a, 0x1b, 0xfc, 0xe6, 0x09, 0x14, 0xfa, + 0x00, 0x1d, 0x1a, 0xfd, 0xf3, 0x18, 0xfc, 0xeb, 0x15, 0xf5, 0x0e, 0x0a, + 0xf3, 0xf1, 0x1b, 0x05, 0x14, 0x03, 0x2d, 0x27, 0xfb, 0x18, 0x22, 0xef, + 0xf6, 0x06, 0x28, 0x2b, 0xde, 0xec, 0xef, 0xe8, 0xd3, 0xfe, 0x17, 0x12, + 0x01, 0x13, 0x05, 0xf7, 0x00, 0xde, 0xf3, 0xe5, 0x03, 0xfb, 0x07, 
0x0b, + 0xfd, 0xdc, 0xdf, 0x03, 0x0c, 0x00, 0xfa, 0x06, 0x0e, 0x02, 0x05, 0xfa, + 0xfd, 0xed, 0x09, 0x0c, 0xfd, 0xfb, 0x0c, 0xf0, 0xe4, 0x04, 0xd6, 0xf3, + 0x09, 0x0a, 0xf9, 0xf8, 0xe2, 0xef, 0xdf, 0xf0, 0xf8, 0x03, 0x0f, 0x20, + 0xf4, 0xe3, 0xf8, 0x02, 0xe2, 0xe5, 0x25, 0x0f, 0xeb, 0xf8, 0xe9, 0xfd, + 0x04, 0x0c, 0x0c, 0xfe, 0x01, 0x08, 0xfc, 0xfc, 0x1b, 0x01, 0xe5, 0x13, + 0xf9, 0xe8, 0x07, 0x20, 0xfe, 0x06, 0xec, 0xfe, 0x09, 0xef, 0x14, 0x04, + 0x0b, 0xf5, 0xe7, 0xff, 0x0a, 0x02, 0x09, 0xe9, 0xc4, 0x16, 0x0d, 0xe7, + 0x15, 0x14, 0xf1, 0xd0, 0xec, 0xe7, 0xf0, 0xf0, 0x33, 0x05, 0xda, 0xf2, + 0x0b, 0x08, 0x38, 0x01, 0x07, 0xfd, 0xd8, 0x06, 0xd9, 0xf0, 0x16, 0x1f, + 0xff, 0xf7, 0xe0, 0xd8, 0xf3, 0xf7, 0x12, 0x08, 0x0e, 0x05, 0xf6, 0x03, + 0xef, 0x1b, 0x12, 0xf4, 0xe8, 0x0f, 0x02, 0xfd, 0xf2, 0x16, 0x26, 0x22, + 0xe0, 0x07, 0xf7, 0xe6, 0xeb, 0x16, 0x22, 0x1a, 0x0b, 0x01, 0xf5, 0xea, + 0xd2, 0x22, 0x0f, 0x13, 0x15, 0x08, 0xf0, 0xfb, 0xed, 0x11, 0xf3, 0xe9, + 0xff, 0xde, 0x0a, 0x18, 0x0f, 0x02, 0xfb, 0xf9, 0xfb, 0xe8, 0x12, 0x18, + 0x01, 0xf4, 0xf6, 0xf8, 0xf0, 0x1f, 0x24, 0x15, 0xf5, 0x00, 0x1c, 0xf9, + 0x01, 0x0a, 0x11, 0xd5, 0x01, 0x12, 0x02, 0xec, 0xfd, 0x07, 0xf2, 0xea, + 0xf9, 0xff, 0xf7, 0xfb, 0x15, 0xec, 0xe5, 0x01, 0xeb, 0x05, 0xf9, 0x10, + 0xfe, 0x28, 0xe5, 0x0a, 0xeb, 0x1b, 0x0e, 0xf9, 0xde, 0x02, 0x15, 0x0a, + 0xff, 0xfe, 0x11, 0x24, 0x03, 0xf8, 0x00, 0x08, 0xfd, 0x0e, 0xeb, 0xf3, + 0xf6, 0xf7, 0x14, 0x0e, 0xfc, 0xf5, 0xde, 0xf5, 0x9e, 0xfe, 0xff, 0xff, + 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0xab, 0x01, 0x00, 0x00, + 0xfa, 0xfd, 0xff, 0xff, 0xa2, 0xff, 0xff, 0xff, 0xba, 0x00, 0x00, 0x00, + 0x24, 0xfc, 0xff, 0xff, 0x0f, 0x00, 0x00, 0x00, 0x54, 0x4f, 0x43, 0x4f, + 0x20, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x24, 0xfb, 0xff, 0xff, + 0x68, 0x01, 0x00, 0x00, 0x5c, 0x01, 0x00, 0x00, 0x50, 0x01, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf4, 0x00, 0x00, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0xce, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x03, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x1a, 0xff, 0xff, 0xff, 0x00, 0x00, 0x80, 0x3f, 0x01, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, + 0x07, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, + 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xc4, 0xfc, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x10, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x38, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, + 0x1a, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, 0x14, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x02, 0x00, 0x00, 
[Binary hunk omitted: this part of the patch replaces a large embedded model-data byte array (hex C array) in place. The removed bytes contain the tensor-name strings "Relu", "Conv2D_bias", "add_1", "labels_softmax", "weights_quant/FakeQuantWithMinMaxVars" and "weights_quant_1/FakeQuantWithMinMaxVars/transpose"; the added bytes contain "labels_softmax", "first_weights/read" and "final_fc_weights/read/transpose". The remaining lines of the hunk are raw hex weight values with no further recoverable information.]
0x20, 0x9c, 0x91, 0x55, 0x55, 0xa4, 0x5b, 0x84, 0x30, 0xc6, 0x8a, 0x51, - 0x31, 0xc3, 0x72, 0x6b, 0x65, 0xb9, 0x79, 0x7d, 0x62, 0xad, 0x88, 0x75, - 0x37, 0xb0, 0x76, 0x8a, 0x7d, 0x85, 0x7f, 0xb4, 0x46, 0x9c, 0x83, 0x7b, - 0x79, 0x78, 0x56, 0xac, 0x8d, 0xa2, 0xa9, 0x54, 0x44, 0x5a, 0x63, 0xb2, - 0xa8, 0x72, 0xa4, 0x6b, 0x5d, 0x4d, 0x8e, 0x95, 0x9e, 0x4a, 0x98, 0x8c, - 0xb0, 0x5c, 0xa5, 0x75, 0x83, 0x3b, 0x46, 0x92, 0xa7, 0x3b, 0x6a, 0x75, - 0x59, 0x57, 0x52, 0xa1, 0xab, 0x54, 0x68, 0x7c, 0x94, 0x6e, 0x5b, 0x9a, - 0xa3, 0x5d, 0x73, 0x74, 0x5a, 0x63, 0x56, 0x9e, 0xc1, 0x71, 0x82, 0x79, - 0x49, 0x92, 0x63, 0xa6, 0x99, 0x7d, 0x71, 0x81, 0x5e, 0x90, 0x5c, 0x8b, - 0x7e, 0xb4, 0xa0, 0x8c, 0x67, 0x93, 0x4e, 0x72, 0x65, 0x83, 0xb5, 0x77, - 0x83, 0x92, 0x43, 0x67, 0x8c, 0x81, 0xb1, 0x75, 0x6a, 0x61, 0x66, 0x6f, - 0x5d, 0x7f, 0x8d, 0x7b, 0x6b, 0x68, 0x6f, 0x85, 0x6e, 0x87, 0x97, 0x89, - 0x9b, 0x81, 0x7e, 0x7e, 0x9d, 0x83, 0x6b, 0x6a, 0xa5, 0x92, 0x7e, 0x70, - 0x60, 0x8f, 0x6f, 0x8b, 0x15, 0xa6, 0x66, 0x4e, 0x61, 0xbc, 0x38, 0x67, - 0x46, 0xab, 0x84, 0x5e, 0x3a, 0xac, 0x74, 0x58, 0x76, 0xc4, 0x7a, 0x76, - 0x67, 0xc0, 0x76, 0x6f, 0x52, 0xa6, 0xa2, 0x97, 0x76, 0xa6, 0x7f, 0x99, - 0x5d, 0xa5, 0x5f, 0x60, 0x58, 0x88, 0x3f, 0x9e, 0x7d, 0x81, 0x71, 0x63, - 0x42, 0x55, 0x3e, 0xbd, 0xa9, 0x7a, 0xa5, 0x67, 0x62, 0x7a, 0x80, 0x9e, - 0xc3, 0x54, 0x7f, 0x9f, 0x93, 0x73, 0xbd, 0x79, 0x74, 0x2e, 0x54, 0x9e, - 0xaa, 0x76, 0x68, 0x80, 0x78, 0x64, 0x57, 0x93, 0xa4, 0x56, 0x75, 0x72, - 0x81, 0x7f, 0x48, 0xad, 0x89, 0x67, 0x60, 0x7e, 0x7a, 0x83, 0x6e, 0x95, - 0xb0, 0x57, 0x89, 0x91, 0x4d, 0x86, 0x78, 0x7b, 0x74, 0x8c, 0x8f, 0x8d, - 0x67, 0xa4, 0x64, 0x8d, 0x77, 0x9a, 0xa1, 0x88, 0x6e, 0x94, 0x33, 0x95, - 0x81, 0x76, 0xc6, 0x7d, 0x7d, 0x85, 0x5a, 0x6e, 0x8e, 0x69, 0x9e, 0x71, - 0x82, 0x81, 0x59, 0x5b, 0x71, 0x9a, 0x91, 0x8e, 0x80, 0x69, 0x71, 0x73, - 0x6e, 0x9a, 0x95, 0x94, 0x7b, 0x80, 0x82, 0x7e, 0x76, 0x84, 0x70, 0x72, - 0x9c, 0xa0, 0x77, 0x66, 0x55, 0xa1, 0x8c, 0x73, 0x35, 0xa0, 0x68, 0x4d, - 0x3b, 0xaa, 0x44, 0x6f, 0x3c, 0xc0, 0x96, 0x78, 0x33, 0xbd, 0x64, 0x5b, - 0x75, 0xd2, 0x83, 0x87, 0x59, 0xbd, 0x80, 0x80, 0x6e, 0x8e, 0x65, 0x7a, - 0x87, 0xb6, 0x8d, 0x94, 0x39, 0x95, 0x8b, 0x5d, 0x66, 0x71, 0x4e, 0x9f, - 0x96, 0x8a, 0x98, 0x47, 0x41, 0x6c, 0x4c, 0xac, 0x95, 0x81, 0x90, 0x75, - 0x59, 0x4c, 0xa2, 0x93, 0x99, 0x58, 0x7b, 0xaf, 0xa3, 0x52, 0xb0, 0x6c, - 0x5f, 0x47, 0x6e, 0x8e, 0xae, 0x3d, 0x81, 0x6d, 0x78, 0x52, 0x4f, 0x81, - 0x80, 0x68, 0x4b, 0x81, 0x74, 0x71, 0x67, 0xa7, 0x9a, 0x55, 0x84, 0x72, - 0x64, 0x6b, 0x6e, 0x9d, 0xab, 0x76, 0x79, 0x85, 0x40, 0x84, 0x80, 0x85, - 0x70, 0x91, 0x9a, 0x81, 0x5b, 0x89, 0x6b, 0x8a, 0x92, 0x8c, 0xa4, 0x7b, - 0x75, 0x89, 0x54, 0x76, 0x69, 0x69, 0xb3, 0x6c, 0x47, 0x7d, 0x4c, 0x7f, - 0x81, 0x86, 0x8f, 0x63, 0x71, 0x6a, 0x63, 0x67, 0x7c, 0x8f, 0xa0, 0x68, - 0x86, 0x58, 0x5b, 0x87, 0x6a, 0x82, 0x89, 0x78, 0x9d, 0x8d, 0xaa, 0x82, - 0x6e, 0xa4, 0x6f, 0x6d, 0x70, 0x9f, 0x7f, 0x77, 0x41, 0xa5, 0x86, 0x61, - 0x2d, 0x99, 0xa9, 0x5f, 0x5a, 0xb3, 0x51, 0x70, 0x5a, 0xce, 0x77, 0x68, - 0x2c, 0xb8, 0x90, 0x44, 0x58, 0xb9, 0x74, 0x8e, 0x70, 0xb3, 0x9a, 0x75, - 0x6d, 0xc0, 0x9e, 0x8e, 0x8d, 0xa8, 0x7b, 0xa8, 0x4a, 0x89, 0x6e, 0x7f, - 0x5d, 0x6e, 0x46, 0x91, 0x6d, 0x81, 0x89, 0x3e, 0x35, 0x69, 0x44, 0xaf, - 0x99, 0x8d, 0x94, 0x54, 0x60, 0x5b, 0xaf, 0x97, 0x92, 0x4e, 0x80, 0xae, - 0x9e, 0x62, 0xa3, 0x77, 0x6e, 0x5d, 0x71, 0xa0, 0xa6, 0x59, 0x84, 0x5d, - 0x65, 0x4a, 0x69, 0xa1, 0xa1, 0x40, 0x75, 0x65, 0x6b, 0x68, 0x60, 0xb3, - 0x92, 0x27, 0x70, 0x67, 0x9b, 0x5e, 0x50, 0xaf, 0xae, 0x64, 0x7a, 0x6e, - 
0x61, 0x94, 0x3b, 0x8f, 0x86, 0x7f, 0x98, 0x88, 0x7a, 0x7f, 0x61, 0x7b, - 0x64, 0x96, 0x96, 0x79, 0x5c, 0x96, 0x52, 0x92, 0x76, 0x7e, 0xc4, 0x60, - 0x6d, 0x7b, 0x41, 0x8c, 0x7b, 0x8e, 0x9a, 0x66, 0x79, 0x95, 0x67, 0x6a, - 0x7a, 0x9b, 0xa9, 0x85, 0x6d, 0x66, 0x55, 0x65, 0x76, 0x8b, 0x90, 0x86, - 0x88, 0x8b, 0x8f, 0x7e, 0x83, 0x7c, 0x75, 0x5f, 0x78, 0x96, 0x76, 0x47, - 0x54, 0x9c, 0x8d, 0x7d, 0x24, 0x9f, 0x79, 0x5c, 0x55, 0xb2, 0x3b, 0x67, - 0x4e, 0xd2, 0x90, 0x79, 0x3c, 0xc3, 0x8b, 0x4a, 0x7c, 0xd7, 0x70, 0x75, - 0x5b, 0xaf, 0xa8, 0x6b, 0x59, 0xc1, 0x6d, 0x5f, 0x5d, 0x96, 0x87, 0x9a, - 0x5d, 0x7f, 0x8e, 0x6d, 0x5c, 0x75, 0x3f, 0xb6, 0x8e, 0x81, 0x7b, 0x31, - 0x47, 0x67, 0x56, 0xb6, 0x90, 0x71, 0x89, 0x63, 0x61, 0x75, 0x8d, 0x8b, - 0x97, 0x62, 0x62, 0x85, 0x9c, 0x64, 0xb7, 0x61, 0x71, 0x3f, 0x6c, 0x8b, - 0xaa, 0x43, 0x82, 0x70, 0x52, 0x52, 0x80, 0xaa, 0x9e, 0x5d, 0x90, 0x69, - 0x8a, 0x77, 0x6d, 0x9f, 0x9e, 0x5f, 0x84, 0x61, 0x87, 0x70, 0x43, 0xab, - 0x97, 0x6e, 0x84, 0x6c, 0x5d, 0x82, 0x64, 0x85, 0x83, 0x7e, 0x82, 0x7c, - 0x7b, 0x91, 0x55, 0x7e, 0x77, 0x88, 0xba, 0x71, 0x6d, 0x7b, 0x71, 0x8a, - 0x7f, 0x84, 0xb5, 0x63, 0x4a, 0x9a, 0x3c, 0x70, 0x7a, 0x99, 0xa3, 0x50, - 0x84, 0x82, 0x56, 0x4c, 0x74, 0x8e, 0xa3, 0x77, 0x8f, 0x4e, 0x5f, 0x6d, - 0x97, 0x89, 0xa0, 0x6b, 0x7c, 0x8c, 0x85, 0x82, 0x8e, 0xa1, 0x89, 0x5b, - 0x7f, 0x8b, 0x8f, 0x5e, 0x74, 0x96, 0x8a, 0x7d, 0x15, 0x7b, 0x8f, 0x88, - 0x5f, 0xa7, 0x63, 0x5b, 0x39, 0xbd, 0x96, 0x56, 0x4c, 0xb4, 0x7b, 0x53, - 0x5a, 0xaf, 0x79, 0x7b, 0x5c, 0xa6, 0xaa, 0x74, 0x5f, 0xa0, 0x76, 0x9e, - 0x71, 0x9a, 0x60, 0xa4, 0x33, 0x87, 0x66, 0x66, 0x64, 0x7d, 0x6d, 0xac, - 0x9e, 0x8c, 0x78, 0x4f, 0x3d, 0x7b, 0x53, 0xb1, 0x97, 0x8a, 0x96, 0x6e, - 0x60, 0x4b, 0xa9, 0x9e, 0x93, 0x6e, 0x93, 0xb7, 0xae, 0x46, 0xb9, 0x60, - 0x72, 0x46, 0x80, 0x95, 0xb5, 0x57, 0x82, 0x53, 0x6e, 0x4e, 0x5b, 0xa2, - 0x9a, 0x3d, 0x8b, 0x6c, 0x84, 0x65, 0x69, 0xa1, 0x8c, 0x60, 0x83, 0x74, - 0x73, 0x53, 0x5d, 0x7e, 0x7f, 0x79, 0x6e, 0x81, 0x89, 0x8f, 0x51, 0x81, - 0x99, 0x97, 0x81, 0x8a, 0x87, 0x83, 0x43, 0x90, 0x89, 0x94, 0x93, 0x7a, - 0x66, 0x80, 0x82, 0x82, 0x79, 0x85, 0xb0, 0x6b, 0x87, 0x7b, 0x53, 0x89, - 0x79, 0x9d, 0xab, 0x6e, 0x82, 0x84, 0x50, 0x8f, 0x7e, 0x74, 0x90, 0x74, - 0x6e, 0x65, 0x84, 0x70, 0x82, 0x7a, 0x9e, 0x6d, 0x8f, 0x62, 0xb2, 0x84, - 0x78, 0x7e, 0x72, 0x5a, 0x7a, 0x85, 0x8c, 0x4b, 0x70, 0x99, 0x87, 0x78, - 0x26, 0x95, 0xb9, 0x77, 0x4d, 0xb6, 0x51, 0x6a, 0x41, 0xbf, 0x76, 0x68, - 0x56, 0xb6, 0x80, 0x53, 0x83, 0xaf, 0x87, 0x79, 0x79, 0xb4, 0x89, 0x7d, - 0x47, 0x9d, 0xa0, 0x86, 0x89, 0xc3, 0x6d, 0x99, 0x41, 0x89, 0x9a, 0x59, - 0x54, 0x83, 0x79, 0x9d, 0x7b, 0x73, 0x88, 0x4a, 0x42, 0x64, 0x7a, 0x9f, - 0x7b, 0x6e, 0x71, 0x7b, 0x6a, 0x61, 0xae, 0xa3, 0xa0, 0x68, 0x95, 0x9d, - 0x94, 0x49, 0x8b, 0x70, 0x8a, 0x5f, 0x49, 0xbb, 0xa7, 0x4a, 0xa1, 0x59, - 0x59, 0x59, 0x6d, 0xa0, 0x9f, 0x50, 0xa0, 0x7b, 0x75, 0x49, 0x5a, 0x8c, - 0x84, 0x68, 0x78, 0x57, 0x7a, 0x6e, 0x6b, 0x87, 0x9c, 0x7b, 0x84, 0x83, - 0x79, 0x7d, 0x5a, 0x77, 0x77, 0x6f, 0x6f, 0x7c, 0x8f, 0x83, 0x40, 0x62, - 0x6a, 0x87, 0xab, 0x74, 0x86, 0x96, 0x7a, 0x7d, 0x7b, 0x81, 0x9a, 0x65, - 0x60, 0x82, 0x61, 0x73, 0x71, 0x77, 0xa7, 0x79, 0x87, 0x8c, 0x4e, 0x72, - 0x8d, 0x89, 0x94, 0x6d, 0x75, 0x6d, 0x6e, 0x82, 0x7a, 0x8d, 0xa9, 0x77, - 0x77, 0x7c, 0x74, 0xa7, 0xb7, 0x67, 0x75, 0x67, 0x7e, 0x9f, 0x73, 0x60, - 0x6c, 0x95, 0x7f, 0x62, 0x31, 0x70, 0x85, 0x7a, 0x5f, 0xc0, 0x69, 0x66, - 0x71, 0xb0, 0x81, 0x5d, 0x48, 0xc9, 0x86, 0x39, 0x93, 0xa4, 0x8e, 0x7c, - 0x5e, 0xbb, 0x98, 0x5c, 0x74, 0x9c, 0x89, 0x6d, 0x74, 0xbd, 0x8e, 0x6e, - 
0x5f, 0x9a, 0x6d, 0x70, 0x57, 0x9c, 0x58, 0xb7, 0x8e, 0x94, 0xa0, 0x3f, - 0x39, 0x75, 0x6f, 0xb4, 0xa2, 0x94, 0xa9, 0x70, 0x61, 0x8a, 0x70, 0x92, - 0xa7, 0x7f, 0x7f, 0x8d, 0x7a, 0x73, 0xa1, 0x5f, 0x8a, 0x4a, 0x65, 0xaa, - 0x92, 0x6e, 0x98, 0x51, 0x81, 0x47, 0x57, 0xb8, 0x89, 0x50, 0x8a, 0x6d, - 0x8b, 0x50, 0x8a, 0x86, 0x9b, 0x7d, 0x5b, 0x4a, 0x68, 0x74, 0x53, 0x9b, - 0x94, 0x74, 0x7c, 0x6f, 0x62, 0x86, 0x5b, 0x8f, 0x82, 0x96, 0x6e, 0x7c, - 0x80, 0x8f, 0x47, 0x5b, 0x70, 0x95, 0x97, 0x77, 0x8d, 0x8e, 0x69, 0x62, - 0x78, 0x8f, 0xbf, 0x5e, 0x76, 0xae, 0x4d, 0x84, 0x73, 0x76, 0xab, 0x6f, - 0x7f, 0x8c, 0x4b, 0x7d, 0x96, 0x7d, 0xb3, 0x55, 0x78, 0x8d, 0x76, 0x73, - 0x8d, 0x8e, 0x98, 0x6a, 0x91, 0x86, 0x6d, 0x8c, 0x7d, 0x93, 0x97, 0x56, - 0x79, 0x8f, 0xa3, 0x7f, 0x7e, 0x82, 0xa0, 0x63, 0x3d, 0x6b, 0x88, 0x5e, - 0x61, 0xc0, 0x45, 0x5f, 0x66, 0xb0, 0x6c, 0x6d, 0x29, 0xd5, 0x95, 0x3b, - 0x77, 0xaa, 0x62, 0x70, 0x63, 0xce, 0x8c, 0x6e, 0x56, 0xaa, 0x77, 0x6e, - 0x90, 0xcc, 0x6d, 0x7e, 0x41, 0x9f, 0x88, 0x4f, 0x5d, 0xb4, 0x4c, 0x9b, - 0x80, 0x97, 0x98, 0x59, 0x4c, 0x71, 0x53, 0xb4, 0x90, 0x97, 0x93, 0x90, - 0x46, 0x63, 0xa6, 0x87, 0x9d, 0x56, 0x7f, 0xab, 0x8e, 0x68, 0xc6, 0x5d, - 0x6e, 0x58, 0x4b, 0x85, 0xa1, 0x70, 0x8a, 0x60, 0x84, 0x44, 0x68, 0x8e, - 0x9b, 0x3a, 0x8c, 0x57, 0x91, 0x4c, 0x6b, 0x9c, 0xa7, 0x64, 0x82, 0x5f, - 0x68, 0x6d, 0x4d, 0xa1, 0x6c, 0x91, 0x6c, 0x6b, 0x64, 0x97, 0x86, 0x81, - 0x8d, 0x8e, 0x80, 0x72, 0x88, 0x96, 0x5d, 0x6e, 0x7c, 0x67, 0x97, 0x69, - 0x95, 0x93, 0x61, 0x8b, 0x9b, 0x7d, 0xc8, 0x6f, 0x85, 0x80, 0x67, 0x68, - 0x90, 0x6b, 0xcc, 0x7c, 0xa3, 0xa0, 0x58, 0x81, 0x7a, 0x8d, 0x9f, 0x65, - 0x81, 0x82, 0x78, 0x6b, 0x85, 0x7b, 0x9b, 0x69, 0x86, 0x6c, 0x83, 0x6c, - 0x8e, 0x59, 0xab, 0x56, 0x7c, 0x7f, 0x7b, 0x84, 0x71, 0x63, 0x7d, 0x73, - 0x60, 0x8b, 0x7a, 0x7b, 0x5e, 0xbb, 0x4b, 0x40, 0x30, 0xcc, 0x80, 0x65, - 0x6c, 0xb7, 0x80, 0x35, 0x7d, 0xa3, 0x5c, 0x6c, 0x49, 0xa6, 0x9b, 0x7b, - 0x53, 0xba, 0x62, 0x76, 0x78, 0xa0, 0x72, 0x80, 0x78, 0x93, 0x87, 0x62, - 0x64, 0x84, 0x6f, 0xa1, 0x70, 0x90, 0x9a, 0x6b, 0x42, 0x55, 0x6d, 0xc5, - 0xa6, 0x8a, 0x79, 0x64, 0x4c, 0x72, 0x7b, 0xa9, 0xa3, 0x70, 0x84, 0x8f, - 0x63, 0x7a, 0x9c, 0x4e, 0x5a, 0x76, 0x91, 0x67, 0xaf, 0x76, 0xbf, 0x46, - 0x62, 0x3f, 0x7d, 0xa7, 0x8d, 0x62, 0x90, 0x5b, 0x9a, 0x44, 0x51, 0x80, - 0xa6, 0x7e, 0x8d, 0x6a, 0x73, 0x65, 0x72, 0x82, 0x99, 0xb4, 0x6a, 0x75, - 0x85, 0x90, 0x47, 0x62, 0x9e, 0x95, 0x94, 0x78, 0x89, 0x74, 0x5d, 0xa3, - 0x7f, 0x9d, 0x7d, 0x63, 0x96, 0x86, 0x8d, 0xa2, 0x95, 0xab, 0xae, 0x5d, - 0x93, 0x8d, 0x3d, 0x76, 0x9e, 0x9c, 0xc4, 0x71, 0x7d, 0xa3, 0x75, 0x7e, - 0x6d, 0x9d, 0xa3, 0x7f, 0x94, 0x89, 0x47, 0x71, 0x8b, 0x95, 0xb1, 0x72, - 0x90, 0x53, 0x7e, 0x8f, 0x8c, 0x90, 0xa1, 0x4d, 0x59, 0x62, 0x73, 0xa0, - 0x69, 0x88, 0x86, 0x71, 0x60, 0x3b, 0x81, 0x57, 0x7d, 0x86, 0x58, 0x63, - 0x7d, 0x98, 0x74, 0x67, 0x5d, 0xb0, 0x67, 0x45, 0x9b, 0xa9, 0x94, 0x68, - 0x43, 0x8b, 0x85, 0x56, 0x63, 0x96, 0x87, 0x78, 0x88, 0xbf, 0x92, 0x8d, - 0x60, 0xa8, 0x7e, 0x7e, 0x78, 0x80, 0x66, 0x92, 0x6e, 0x97, 0xab, 0x7f, - 0x4f, 0x65, 0x59, 0xb0, 0x9b, 0x6b, 0x9f, 0x70, 0x6f, 0x5c, 0xac, 0x95, - 0xa3, 0x54, 0x8e, 0xa9, 0x9e, 0x8c, 0xa5, 0x66, 0x5f, 0x5b, 0x6c, 0x83, - 0x90, 0x73, 0x85, 0x64, 0x61, 0x51, 0x4a, 0x63, 0xa1, 0x96, 0x7e, 0x4e, - 0x87, 0x60, 0x68, 0xb5, 0x9a, 0x8d, 0x75, 0x4e, 0x8a, 0x7a, 0x5f, 0x9f, - 0x74, 0x80, 0x69, 0x6d, 0x73, 0x92, 0x79, 0x7e, 0x85, 0x68, 0x83, 0x9d, - 0xb6, 0x9d, 0x6e, 0x8f, 0x78, 0x91, 0xaf, 0x8f, 0xa0, 0x9d, 0x73, 0x55, - 0x91, 0x8f, 0xb2, 0x76, 0x97, 0xab, 0x63, 0x63, 0x68, 0x7b, 0xab, 0x5c, - 
0x77, 0xae, 0x4c, 0x72, 0x6e, 0x93, 0xb8, 0x51, 0x79, 0x84, 0x7d, 0x6b, - 0x7f, 0x8a, 0xba, 0x68, 0x7a, 0x43, 0x9a, 0x8d, 0x77, 0x8a, 0x6d, 0x56, - 0x79, 0x41, 0x7a, 0x4b, 0x81, 0x7a, 0x5c, 0x68, 0x58, 0x36, 0x6f, 0x6f, - 0x9f, 0xa6, 0x5f, 0x60, 0x4e, 0x67, 0x70, 0x4c, 0x69, 0x69, 0x94, 0x63, - 0x6d, 0x7b, 0x88, 0x9e, 0x6d, 0x98, 0x69, 0x68, 0x88, 0x80, 0x80, 0x7a, - 0x8e, 0x78, 0x5e, 0x8d, 0x7e, 0x91, 0x76, 0x64, 0x7e, 0x7f, 0x4e, 0xc9, - 0x79, 0x8f, 0x9c, 0x82, 0x3d, 0x62, 0x63, 0xc3, 0xb8, 0x7b, 0x72, 0x7b, - 0x50, 0x56, 0x95, 0x72, 0x8f, 0x6b, 0x90, 0x9d, 0x76, 0xa4, 0xa5, 0x79, - 0x54, 0x4f, 0x59, 0x85, 0xc5, 0x92, 0x97, 0x4d, 0x6f, 0x69, 0x77, 0x7f, - 0x71, 0x7c, 0x87, 0x59, 0x98, 0x61, 0x80, 0x81, 0x88, 0x6b, 0x6d, 0x7f, - 0x7f, 0x77, 0x60, 0xa2, 0x96, 0x73, 0x69, 0x86, 0x83, 0x8d, 0x60, 0x66, - 0x88, 0x8c, 0x93, 0x67, 0x98, 0x82, 0x7e, 0x91, 0x99, 0x59, 0x8e, 0x6e, - 0x90, 0xa1, 0x62, 0x8a, 0x98, 0x7b, 0xc8, 0x67, 0x85, 0x8d, 0x6c, 0xa1, - 0xa1, 0x92, 0xd0, 0x49, 0x85, 0x76, 0x89, 0x75, 0x88, 0x83, 0xa3, 0x77, - 0x85, 0x68, 0x82, 0x83, 0x7f, 0x79, 0xae, 0x85, 0x76, 0x84, 0x80, 0x9a, - 0x9d, 0x7b, 0x83, 0x90, 0x79, 0x88, 0x79, 0x9a, 0x93, 0x6c, 0x69, 0x79, - 0x5f, 0x90, 0x81, 0x7b, 0x87, 0x9d, 0x86, 0x82, 0x7a, 0x77, 0x71, 0x85, - 0x8b, 0x99, 0x8f, 0x7b, 0x58, 0x98, 0x84, 0x6e, 0x9a, 0xa1, 0x7a, 0x8c, - 0x77, 0xa8, 0x86, 0x93, 0x7b, 0x90, 0x79, 0x8a, 0x85, 0x8f, 0x84, 0x97, - 0x73, 0x83, 0x7b, 0x76, 0x8e, 0xa1, 0x89, 0x8a, 0x83, 0x9c, 0x65, 0x68, - 0x7b, 0x89, 0x92, 0x84, 0x6d, 0x90, 0x61, 0x78, 0x98, 0x8c, 0x8d, 0x87, - 0xa0, 0x99, 0x79, 0x7b, 0x69, 0xa4, 0x7a, 0x8d, 0x73, 0x71, 0x70, 0x80, - 0x82, 0x77, 0x81, 0x67, 0x75, 0x97, 0x71, 0x73, 0x85, 0x6d, 0x8e, 0x86, - 0x6e, 0x80, 0x86, 0x9e, 0x6f, 0x70, 0x67, 0x59, 0x65, 0x89, 0x67, 0x8b, - 0x7d, 0x68, 0x69, 0x7a, 0x5b, 0x7e, 0x87, 0xa1, 0x92, 0x7b, 0x64, 0x7e, - 0x76, 0x72, 0x71, 0xab, 0x7c, 0x83, 0x6f, 0xa1, 0x86, 0x76, 0x71, 0x6f, - 0x91, 0x77, 0x6c, 0x71, 0x92, 0x78, 0x70, 0x7f, 0x6e, 0x65, 0x77, 0x93, - 0x7e, 0x6c, 0x85, 0x9d, 0x78, 0x8b, 0x7c, 0x5f, 0x94, 0x86, 0x7c, 0x7f, - 0x83, 0x6e, 0x72, 0x9e, 0x6e, 0x6b, 0x8d, 0x91, 0x97, 0x8b, 0x7b, 0x72, - 0x86, 0x75, 0x7f, 0x96, 0x7d, 0x81, 0xa1, 0x55, 0xa6, 0x88, 0x96, 0x87, - 0x93, 0x68, 0x89, 0x72, 0x6f, 0x9c, 0x75, 0x7c, 0x79, 0x6c, 0x74, 0x84, - 0x7d, 0xa4, 0x86, 0x84, 0x84, 0x8d, 0x63, 0x7a, 0x63, 0xbc, 0x7e, 0x93, - 0x80, 0x8d, 0x71, 0x7a, 0x5f, 0x8c, 0x74, 0x96, 0x7e, 0x9b, 0x9d, 0x8d, - 0x5b, 0xa4, 0x71, 0x5e, 0x83, 0x78, 0x86, 0x7f, 0x70, 0x99, 0x87, 0x85, - 0x8e, 0x81, 0x93, 0x80, 0x89, 0xa0, 0x7a, 0x77, 0x8e, 0x73, 0x5f, 0x80, - 0x6d, 0x87, 0x5b, 0x7a, 0x85, 0x7c, 0x85, 0x63, 0x61, 0x9d, 0x6f, 0x68, - 0x77, 0x86, 0x61, 0x6d, 0x84, 0x98, 0x7c, 0x78, 0x69, 0x84, 0x91, 0x6d, - 0x81, 0xa1, 0x6c, 0x62, 0x95, 0x6d, 0x86, 0x8b, 0x95, 0x8f, 0x5e, 0x86, - 0x73, 0xa1, 0x83, 0x58, 0x5f, 0x8e, 0x76, 0x79, 0x9e, 0x92, 0x7c, 0x7b, - 0x81, 0x8b, 0x83, 0x7b, 0x78, 0x75, 0x70, 0x83, 0x70, 0x5a, 0x6a, 0x59, - 0xa3, 0x82, 0x7a, 0x91, 0x8b, 0x6e, 0x82, 0x8e, 0x70, 0x73, 0x91, 0x76, - 0xa5, 0x7f, 0x70, 0x81, 0x6f, 0x85, 0x94, 0xa6, 0x8c, 0x50, 0x76, 0x6e, - 0x64, 0x95, 0xa0, 0x64, 0x6c, 0x68, 0x8e, 0x8b, 0xa1, 0x7d, 0xa0, 0x7f, - 0x76, 0x8b, 0x7b, 0x93, 0x7b, 0x6e, 0x7e, 0x64, 0x8a, 0xa7, 0x78, 0x64, - 0x93, 0x67, 0x7d, 0x68, 0x5c, 0xa0, 0x76, 0x98, 0xaf, 0x80, 0x55, 0x96, - 0x97, 0x9c, 0x78, 0x75, 0x87, 0x85, 0x77, 0x77, 0x62, 0x93, 0x76, 0x68, - 0xa0, 0x80, 0x81, 0x7f, 0x9a, 0x68, 0x74, 0x69, 0x94, 0x77, 0x77, 0x72, - 0x90, 0x9a, 0x6f, 0x95, 0x89, 0x6b, 0x6b, 0x94, 0x7e, 0x9c, 0x6f, 0x67, - 
0x8f, 0x82, 0x80, 0x92, 0x76, 0x80, 0x65, 0x9b, 0x6a, 0x7c, 0x75, 0x5a, - 0x87, 0xa1, 0x69, 0x7a, 0x79, 0x9e, 0x9a, 0x58, 0x81, 0x92, 0x72, 0x67, - 0x90, 0x80, 0x82, 0x61, 0x9f, 0x9e, 0x6a, 0x8d, 0x8d, 0x8a, 0x73, 0x81, - 0x68, 0x7f, 0x5b, 0x59, 0x98, 0x89, 0x71, 0x72, 0x58, 0x7b, 0x94, 0x5d, - 0xa9, 0x8b, 0x72, 0x7b, 0x65, 0x73, 0x5b, 0x8b, 0x7d, 0x86, 0x6e, 0x8c, - 0x66, 0x6f, 0x6b, 0x8b, 0x71, 0x80, 0x7f, 0x70, 0x70, 0x88, 0x70, 0x7e, - 0x84, 0x89, 0x7f, 0x81, 0x87, 0x77, 0x71, 0x88, 0x7f, 0x8f, 0x5e, 0x80, - 0x5d, 0xa1, 0x89, 0x77, 0x93, 0x8e, 0x55, 0x64, 0x88, 0x9a, 0x8b, 0x80, - 0x77, 0x6f, 0x91, 0x83, 0x6b, 0x9b, 0x85, 0x5c, 0x57, 0x7e, 0xa9, 0x63, - 0x83, 0xaa, 0x7c, 0xa1, 0x91, 0x5f, 0x68, 0x76, 0x7a, 0x97, 0x96, 0x84, - 0xca, 0x8d, 0x8c, 0x8b, 0x71, 0x81, 0x88, 0x92, 0xaa, 0x74, 0x49, 0x7a, - 0x90, 0x93, 0x7a, 0x61, 0x8c, 0x66, 0x71, 0xa0, 0xab, 0x7d, 0x86, 0x6c, - 0x9f, 0x77, 0x67, 0x6a, 0x89, 0x89, 0x88, 0x70, 0xad, 0x88, 0x69, 0x84, - 0x70, 0x8f, 0x79, 0x7c, 0x66, 0xa6, 0x71, 0x8d, 0x77, 0x99, 0x69, 0x76, - 0x79, 0x7d, 0x9c, 0x6f, 0x64, 0x8b, 0x70, 0x82, 0x69, 0xa4, 0x65, 0x6e, - 0x7f, 0x9e, 0x7e, 0x84, 0x8c, 0x9c, 0x6c, 0x5b, 0x6e, 0xa7, 0x6d, 0x7a, - 0x92, 0x78, 0x9a, 0x6f, 0x81, 0x91, 0x71, 0x7d, 0x6b, 0x99, 0x6b, 0x92, - 0x5e, 0x7e, 0x64, 0x95, 0x78, 0x90, 0x6f, 0x68, 0x8a, 0x85, 0x6f, 0x88, - 0x64, 0x66, 0x7f, 0x78, 0x7c, 0x95, 0x66, 0x6c, 0x76, 0x6a, 0x9b, 0x8f, - 0x9d, 0x78, 0x86, 0x95, 0x73, 0x66, 0x6d, 0x71, 0x8b, 0x7f, 0x6f, 0x70, - 0x64, 0x94, 0xa0, 0x83, 0x6b, 0x6d, 0x85, 0x89, 0x68, 0x92, 0x8e, 0x51, - 0x81, 0x85, 0x86, 0x6e, 0x83, 0x85, 0x8a, 0x5e, 0x68, 0xbf, 0xc4, 0xa5, - 0x8b, 0x67, 0x86, 0x59, 0x85, 0x9e, 0x96, 0x67, 0x82, 0x7c, 0x6c, 0x80, - 0x84, 0xae, 0x9d, 0x80, 0xc2, 0x58, 0x5d, 0x95, 0x85, 0x8b, 0x7f, 0x5d, - 0xc7, 0x75, 0x75, 0x87, 0xa2, 0x8c, 0x62, 0x71, 0x9c, 0x61, 0x7f, 0x9c, - 0xca, 0x8d, 0x89, 0x6e, 0x7c, 0x71, 0x81, 0x99, 0x95, 0xa4, 0x76, 0x6f, - 0x64, 0x7b, 0x6c, 0x72, 0x8b, 0x83, 0x70, 0x70, 0x8b, 0xa4, 0x69, 0x76, - 0x6e, 0x8d, 0x7a, 0x80, 0x8f, 0x9e, 0x73, 0x4b, 0x75, 0x78, 0x77, 0x7b, - 0x8e, 0x92, 0x88, 0x49, 0x54, 0x9f, 0x7a, 0x7f, 0x68, 0x9f, 0x7f, 0x57, - 0x6b, 0xad, 0x85, 0x6f, 0x81, 0xa1, 0x96, 0x6f, 0x73, 0x8d, 0x5e, 0x65, - 0x7a, 0x8c, 0x7c, 0x6a, 0x7e, 0x7a, 0x6a, 0x97, 0x59, 0x86, 0x62, 0x77, - 0x70, 0x7a, 0x68, 0x62, 0x68, 0x86, 0x7e, 0x76, 0x9a, 0x7f, 0x6c, 0x7e, - 0x8a, 0x76, 0x65, 0x8f, 0x7d, 0x65, 0x76, 0xa4, 0x95, 0x62, 0x78, 0x97, - 0x7a, 0x6e, 0x7a, 0x7a, 0x7e, 0x91, 0x8c, 0x8a, 0x91, 0x82, 0x89, 0x6d, - 0x87, 0x90, 0x69, 0x71, 0x96, 0xa6, 0x7c, 0x7c, 0xa8, 0xa8, 0x62, 0x77, - 0x76, 0x99, 0xdd, 0x76, 0x8a, 0x5c, 0x86, 0x6a, 0x69, 0x9c, 0xa5, 0x7d, - 0x78, 0x6a, 0x88, 0x77, 0x77, 0xae, 0x8a, 0x99, 0xcb, 0x85, 0x59, 0x84, - 0x7b, 0x97, 0x8a, 0x82, 0xc5, 0x65, 0x8c, 0x93, 0xc3, 0x8c, 0x87, 0x64, - 0x91, 0x41, 0x70, 0xa8, 0xd1, 0x8b, 0x82, 0x71, 0x9c, 0x71, 0x4e, 0x86, - 0x98, 0x86, 0x7f, 0x7e, 0x69, 0x99, 0x79, 0x78, 0x77, 0xb3, 0x6b, 0x80, - 0x84, 0x8b, 0x56, 0x73, 0x84, 0x95, 0x82, 0x94, 0x5b, 0x92, 0x83, 0x46, - 0x66, 0x89, 0x6d, 0x61, 0x99, 0xa6, 0x99, 0x3f, 0x6c, 0xab, 0x5d, 0x5f, - 0x6c, 0x8e, 0x6b, 0x4a, 0x72, 0xb6, 0x6c, 0x75, 0x78, 0xa6, 0x6f, 0x5b, - 0x56, 0x8b, 0x57, 0x74, 0x8f, 0xab, 0x53, 0x56, 0x5d, 0x63, 0x63, 0x8b, - 0x65, 0x78, 0x71, 0x67, 0x7a, 0x62, 0x8d, 0x78, 0x99, 0x76, 0x94, 0x7a, - 0xa3, 0x70, 0x55, 0x87, 0x7e, 0x7c, 0x57, 0x57, 0x6e, 0x79, 0x94, 0x8f, - 0x86, 0x80, 0x90, 0x7d, 0x7d, 0x7f, 0x7f, 0x68, 0x41, 0x86, 0x8c, 0x6f, - 0x8a, 0x7f, 0x87, 0x8a, 0x7e, 0x7f, 0x5d, 0x71, 0x91, 0x81, 0x93, 0x71, - 
0x91, 0xc6, 0x70, 0x4a, 0x74, 0xa8, 0xf3, 0x72, 0xa7, 0x80, 0x7e, 0x41, - 0x84, 0xa3, 0xb6, 0x94, 0xba, 0x84, 0x70, 0x74, 0x71, 0xac, 0x9f, 0x9d, - 0xe4, 0x67, 0x6a, 0x87, 0x92, 0x8e, 0x92, 0x82, 0xdb, 0x5e, 0x9b, 0x90, - 0xd5, 0x87, 0x8d, 0x7c, 0x9c, 0x3c, 0x6c, 0xab, 0xc2, 0x86, 0x83, 0x79, - 0x6c, 0x61, 0x51, 0xa9, 0x99, 0x79, 0x72, 0x80, 0x6f, 0x85, 0x57, 0x6c, - 0x81, 0x86, 0x6e, 0x88, 0x87, 0x8d, 0x8e, 0x81, 0x67, 0x88, 0x62, 0x99, - 0x87, 0xab, 0x8f, 0x57, 0x60, 0x77, 0x64, 0x81, 0x96, 0xa3, 0x81, 0x3d, - 0x4e, 0xb9, 0x57, 0x6e, 0x99, 0xad, 0x6a, 0x3e, 0x74, 0x96, 0x7e, 0x79, - 0x65, 0xa4, 0x7c, 0x6a, 0x53, 0x87, 0x56, 0x6f, 0x5e, 0x97, 0x85, 0x42, - 0x56, 0x6b, 0x67, 0x78, 0x7d, 0xa6, 0x7c, 0x7c, 0x7d, 0x78, 0x7b, 0x84, - 0x99, 0x7b, 0x89, 0x71, 0x76, 0x8b, 0x76, 0x73, 0x7d, 0x83, 0x56, 0x4f, - 0x86, 0x72, 0x83, 0x88, 0x6a, 0x93, 0x69, 0x90, 0x6c, 0x73, 0x6f, 0x63, - 0x55, 0x88, 0x6b, 0x88, 0x7c, 0x86, 0x87, 0x7b, 0x6c, 0x7e, 0x60, 0x57, - 0xa8, 0x81, 0xa3, 0x72, 0xba, 0xbf, 0x66, 0x65, 0x70, 0xb9, 0xe4, 0x78, - 0x99, 0x67, 0x8c, 0x72, 0x88, 0x96, 0xb5, 0x72, 0x8a, 0x66, 0x81, 0x39, - 0x85, 0x93, 0xa0, 0x9c, 0xdf, 0x74, 0x8a, 0x6d, 0x93, 0xa1, 0x8c, 0x7a, - 0xb5, 0x4b, 0x89, 0xae, 0xba, 0x9c, 0x96, 0x9a, 0xb4, 0x33, 0x5a, 0xb1, - 0xcd, 0x88, 0x84, 0x63, 0x8c, 0x5e, 0x71, 0x6d, 0xa7, 0x8a, 0x62, 0x85, - 0x77, 0x75, 0x62, 0x79, 0x96, 0x73, 0x4f, 0x7d, 0x93, 0x8a, 0x88, 0x7e, - 0x59, 0x6c, 0x7f, 0x87, 0x6f, 0x91, 0x88, 0x59, 0x6d, 0x83, 0x70, 0x7c, - 0x7f, 0x8d, 0x7f, 0x26, 0x41, 0xcf, 0x6b, 0x6e, 0x75, 0xa3, 0x90, 0x5e, - 0x3a, 0x94, 0x61, 0x9a, 0x6f, 0x9f, 0x69, 0x7d, 0x55, 0x8c, 0x60, 0x7c, - 0x93, 0x85, 0x85, 0x4b, 0x54, 0x71, 0x60, 0x8a, 0x6d, 0x8c, 0x9c, 0x7e, - 0x5b, 0x79, 0x74, 0x7b, 0x7b, 0x9d, 0x5b, 0x65, 0x81, 0x82, 0x66, 0x89, - 0x82, 0x72, 0x77, 0x78, 0x75, 0x76, 0x6b, 0x74, 0x89, 0x73, 0x6c, 0x6b, - 0x77, 0x7e, 0x67, 0x84, 0x41, 0x90, 0x58, 0x87, 0x98, 0x60, 0x96, 0x81, - 0x6b, 0x74, 0x7d, 0x56, 0x72, 0x71, 0x9a, 0x7d, 0xc5, 0xd0, 0x88, 0x6e, - 0x4d, 0xbe, 0xef, 0x8a, 0xa7, 0x92, 0x82, 0x67, 0x7f, 0x91, 0xc5, 0x7d, - 0xad, 0x77, 0x6b, 0x4e, 0x8e, 0x99, 0x9b, 0x8e, 0xc7, 0x7f, 0x8a, 0x8e, - 0x8f, 0x87, 0x9c, 0x75, 0xb0, 0x53, 0x75, 0x97, 0xc7, 0x98, 0xa4, 0xa4, - 0x80, 0x41, 0x79, 0xc3, 0xdb, 0x86, 0x9d, 0x75, 0x7f, 0x67, 0x7a, 0x96, - 0xc3, 0x83, 0x54, 0x8e, 0x6f, 0xa8, 0x7c, 0x65, 0x78, 0x7e, 0x59, 0xa3, - 0x8a, 0x97, 0x8b, 0x82, 0x5e, 0x66, 0x82, 0x9b, 0x9e, 0x9f, 0x70, 0x49, - 0x55, 0x88, 0x8a, 0x7e, 0x90, 0xa7, 0x6b, 0x3b, 0x28, 0xc0, 0x63, 0x7e, - 0x60, 0x90, 0x7c, 0x3f, 0x54, 0x9c, 0x7d, 0x8a, 0x6a, 0xa9, 0x6f, 0x61, - 0x76, 0x86, 0x64, 0x88, 0x72, 0xa5, 0x6b, 0x4d, 0x56, 0x6c, 0x52, 0xa1, - 0x84, 0x69, 0x69, 0x5b, 0x71, 0x84, 0x76, 0x9b, 0x92, 0x70, 0x86, 0x8b, - 0x71, 0x68, 0x56, 0x92, 0x76, 0x8f, 0x8f, 0x72, 0x5a, 0x77, 0x6f, 0x92, - 0x72, 0x72, 0x5e, 0x7a, 0x70, 0x73, 0x60, 0x7d, 0x5a, 0x93, 0x7f, 0x6b, - 0x89, 0x6b, 0xa1, 0x85, 0x5c, 0x8d, 0x76, 0x7c, 0x6f, 0x73, 0x96, 0x6d, - 0xbb, 0xad, 0x53, 0x53, 0x5f, 0x9a, 0xe2, 0x8d, 0xa7, 0x6d, 0x8a, 0x5b, - 0x85, 0x9c, 0xb4, 0x7b, 0xb3, 0x52, 0x75, 0x7f, 0x7a, 0x8c, 0x91, 0x7e, - 0xca, 0x5f, 0x64, 0x71, 0x85, 0x9a, 0x91, 0x72, 0xbd, 0x6e, 0x9b, 0x81, - 0x8f, 0xa8, 0xac, 0x7d, 0xb4, 0x5f, 0x45, 0xc5, 0xc8, 0x7a, 0x93, 0x8e, - 0x7b, 0x41, 0x69, 0x94, 0x8b, 0x76, 0x59, 0x81, 0x73, 0x92, 0x8e, 0x63, - 0x8e, 0x74, 0x33, 0xa5, 0x9c, 0xa2, 0x88, 0x48, 0x5d, 0x8c, 0x7d, 0xa6, - 0x68, 0x9a, 0x6f, 0x58, 0x6c, 0x8f, 0x77, 0x65, 0x97, 0x9d, 0x7a, 0x37, - 0x59, 0xab, 0x6e, 0x8f, 0x7a, 0xae, 0x65, 0x3e, 0x46, 0xa9, 0x82, 0x82, - 
0x9c, 0x9d, 0x62, 0x79, 0x66, 0x7f, 0x5e, 0x88, 0x9e, 0x8f, 0x84, 0x71, - 0x5d, 0x6d, 0x70, 0xa0, 0x69, 0x92, 0x7f, 0x70, 0x66, 0x6f, 0x75, 0x8c, - 0x96, 0x7a, 0x85, 0x6a, 0x5a, 0x7c, 0x72, 0x8a, 0x8d, 0x7b, 0x8b, 0x5c, - 0x76, 0x69, 0x70, 0x7f, 0x74, 0xa1, 0x71, 0x91, 0x5a, 0x8c, 0x6e, 0x83, - 0x52, 0x78, 0x71, 0x6d, 0xa9, 0x63, 0x9d, 0x81, 0x52, 0x9e, 0x5d, 0x60, - 0x76, 0x93, 0x97, 0x67, 0xce, 0xc1, 0x75, 0x5e, 0x5f, 0x8c, 0xea, 0x76, - 0xad, 0x7a, 0x7d, 0x62, 0x85, 0x92, 0xd0, 0x6a, 0xbc, 0x53, 0x55, 0x5c, - 0x6d, 0x89, 0x9e, 0x71, 0xd2, 0x8b, 0x64, 0x61, 0x85, 0x9a, 0x77, 0x75, - 0xb9, 0x67, 0x8a, 0xac, 0x90, 0x8a, 0xb4, 0x91, 0xbb, 0x58, 0x94, 0xaf, - 0xb2, 0x76, 0xa2, 0x71, 0x95, 0x5e, 0x73, 0xa5, 0x92, 0x8c, 0x52, 0x96, - 0x53, 0x95, 0x84, 0x91, 0x93, 0x7a, 0x40, 0x88, 0xab, 0xa5, 0x63, 0x70, - 0x66, 0x88, 0x7e, 0x92, 0x89, 0x84, 0x78, 0x57, 0x3d, 0x8d, 0x84, 0x77, - 0x9b, 0x87, 0x5e, 0x4e, 0x42, 0xa0, 0x76, 0x8a, 0x77, 0x90, 0x83, 0x4c, - 0x42, 0x9b, 0x75, 0x7a, 0x88, 0x94, 0x98, 0x69, 0x4c, 0xa2, 0x6b, 0x7b, - 0x6e, 0x9b, 0x5d, 0x5f, 0x53, 0x6a, 0x63, 0x95, 0x69, 0x8a, 0x61, 0x75, - 0x6c, 0x7a, 0x58, 0x89, 0x84, 0x8f, 0x6b, 0x5a, 0x71, 0x6f, 0x59, 0x89, - 0x7d, 0x87, 0x5f, 0x77, 0x4b, 0x61, 0x77, 0x92, 0x67, 0x8e, 0x5c, 0x6f, - 0x5b, 0x77, 0x76, 0x6b, 0x44, 0x9d, 0x9f, 0x7f, 0x8b, 0x94, 0x9e, 0x7c, - 0x62, 0x94, 0x60, 0x55, 0x77, 0x8f, 0xa6, 0x62, 0xb5, 0xb2, 0x3c, 0x61, - 0x5c, 0x99, 0xeb, 0x5b, 0x90, 0x6c, 0x7f, 0x5f, 0x75, 0xa6, 0xcf, 0x77, - 0x98, 0x5d, 0x75, 0x69, 0x7f, 0x8a, 0xa7, 0x73, 0xc8, 0x74, 0x70, 0x82, - 0x76, 0x8f, 0xa2, 0x7a, 0xa4, 0x7a, 0x66, 0x81, 0x9b, 0x8f, 0x9e, 0x8b, - 0xa1, 0x51, 0x7b, 0xba, 0xc8, 0x90, 0xab, 0x92, 0x72, 0x57, 0x5b, 0xa3, - 0xb0, 0x7f, 0x4c, 0x7d, 0x5f, 0x8e, 0x6c, 0x7d, 0x71, 0x7e, 0x4e, 0x87, - 0xb7, 0x97, 0x7a, 0x4c, 0x5f, 0x72, 0x78, 0x84, 0x82, 0x7e, 0x63, 0x65, - 0x68, 0x78, 0x73, 0x85, 0x90, 0x99, 0x80, 0x57, 0x42, 0x8b, 0x8a, 0x77, - 0x71, 0x97, 0x6d, 0x44, 0x41, 0x8f, 0x78, 0x7d, 0x95, 0x81, 0x95, 0x5f, - 0x64, 0x87, 0x66, 0x80, 0x89, 0x9a, 0x61, 0x4d, 0x68, 0x7b, 0x72, 0x73, - 0x85, 0x92, 0x77, 0x7d, 0x73, 0x77, 0x54, 0x7a, 0x77, 0x7d, 0x7d, 0x7a, - 0x6e, 0x8e, 0x4f, 0x7d, 0x80, 0x9a, 0x79, 0x8b, 0x7b, 0x68, 0x6e, 0x86, - 0x7f, 0x93, 0x7a, 0x76, 0x72, 0x85, 0x6a, 0x7b, 0x57, 0x84, 0x96, 0x9a, - 0x8f, 0x91, 0x9b, 0x72, 0x73, 0x91, 0x53, 0x66, 0x76, 0x80, 0xae, 0x63, - 0xbf, 0x99, 0x5e, 0x77, 0x73, 0x9c, 0xd8, 0x74, 0xa7, 0x79, 0x52, 0x64, - 0x82, 0x95, 0xc7, 0x4f, 0xa8, 0x4f, 0x6d, 0x42, 0x7c, 0x89, 0xab, 0x83, - 0xc0, 0x82, 0x6a, 0x5f, 0x83, 0x92, 0xa8, 0x76, 0xc1, 0x77, 0x6e, 0x7b, - 0xa3, 0x9b, 0xaf, 0x87, 0xab, 0x60, 0x8d, 0xc2, 0xd2, 0x83, 0xb2, 0x78, - 0x8d, 0x39, 0x57, 0x9c, 0x90, 0x8e, 0x6e, 0x6a, 0x74, 0x79, 0x81, 0x6d, - 0x6f, 0x8e, 0x77, 0x92, 0x93, 0x7d, 0x5f, 0x68, 0x6a, 0x6c, 0x80, 0x8f, - 0x99, 0x84, 0x4f, 0x64, 0x5c, 0x93, 0x7c, 0x91, 0x98, 0x82, 0x62, 0x3f, - 0x41, 0x9f, 0x5d, 0x89, 0x98, 0x89, 0x73, 0x50, 0x32, 0xa8, 0xa0, 0x7a, - 0xa0, 0x95, 0x78, 0x69, 0x74, 0x7c, 0x89, 0x7b, 0x80, 0x65, 0x56, 0x6b, - 0x69, 0x78, 0x62, 0x87, 0xaf, 0x94, 0x7a, 0x64, 0x53, 0x86, 0x45, 0x99, - 0x88, 0x79, 0x4d, 0x74, 0x59, 0x91, 0x5f, 0x7b, 0x88, 0x90, 0x80, 0x86, - 0x7d, 0x7b, 0x64, 0xa3, 0x7f, 0x74, 0x89, 0x80, 0x7d, 0x7c, 0x7a, 0x87, - 0x5f, 0x8a, 0x5a, 0x72, 0x79, 0x74, 0x8c, 0x7c, 0x86, 0x91, 0x6e, 0x5d, - 0x61, 0x8e, 0xa2, 0x68, 0xd4, 0x92, 0x67, 0x66, 0x62, 0xa1, 0xf3, 0x63, - 0x91, 0x81, 0x74, 0x5f, 0x88, 0x98, 0xbb, 0x5a, 0x9b, 0x54, 0x6a, 0x5c, - 0x75, 0x88, 0xad, 0x7c, 0xb4, 0x7c, 0x69, 0x74, 0x84, 0x76, 0x9d, 0x9a, - 
0xb0, 0x91, 0x5d, 0xa3, 0xa4, 0x7f, 0xbb, 0x80, 0xa4, 0x5d, 0x83, 0xaf, - 0xb7, 0x66, 0xb0, 0x7f, 0x89, 0x4b, 0x72, 0x9e, 0x99, 0x7c, 0x66, 0x71, - 0x6a, 0x6f, 0x6d, 0x67, 0x8d, 0x6d, 0x46, 0xa5, 0x9b, 0x84, 0x7a, 0x61, - 0x64, 0x5c, 0x88, 0x89, 0x95, 0x8c, 0x70, 0x4b, 0x6c, 0x85, 0x83, 0x8b, - 0x98, 0x87, 0x6a, 0x44, 0x4d, 0x9d, 0x78, 0x71, 0x78, 0x7e, 0x91, 0x5b, - 0x3f, 0x9f, 0x80, 0x62, 0xa7, 0x95, 0x5d, 0x74, 0x65, 0x9c, 0x6d, 0x7a, - 0x98, 0x79, 0x80, 0x61, 0x49, 0x82, 0x65, 0x92, 0x80, 0x96, 0x7c, 0x72, - 0x4f, 0x76, 0x5e, 0x8d, 0x97, 0xa5, 0x72, 0x57, 0x79, 0x87, 0x67, 0x87, - 0x80, 0x84, 0x7c, 0x6f, 0x66, 0x6b, 0x70, 0x9b, 0x64, 0x90, 0x59, 0x96, - 0x7a, 0x6f, 0x75, 0x89, 0x4e, 0x8a, 0x62, 0x6e, 0x9c, 0x8c, 0x9a, 0x78, - 0x8e, 0x91, 0x3d, 0x50, 0x72, 0x92, 0x9f, 0x63, 0xda, 0x92, 0x72, 0x60, - 0x59, 0xa6, 0xd0, 0x56, 0xc1, 0x6b, 0x5e, 0x76, 0x6e, 0x81, 0xbb, 0x4b, - 0xbb, 0x59, 0x68, 0x4f, 0x77, 0x87, 0xa1, 0x73, 0xbf, 0x65, 0x56, 0x67, - 0x77, 0x84, 0x8a, 0x7e, 0xb8, 0x85, 0x66, 0xa6, 0x99, 0xa0, 0xa5, 0x73, - 0x8d, 0x4a, 0x7d, 0xab, 0xb0, 0x6a, 0x94, 0x84, 0x87, 0x4c, 0x74, 0xa3, - 0xb3, 0xa9, 0x62, 0x7a, 0x71, 0x7f, 0x53, 0x79, 0x7a, 0x7c, 0x5e, 0x8f, - 0xa0, 0x90, 0x5c, 0x76, 0x6c, 0x92, 0x70, 0x9c, 0xb3, 0x8b, 0x7e, 0x57, - 0x5b, 0x9d, 0x96, 0x85, 0x70, 0x93, 0x8b, 0x67, 0x4c, 0x9c, 0x6a, 0x83, - 0x84, 0x90, 0x8e, 0x60, 0x56, 0xb3, 0x87, 0x7d, 0x86, 0x88, 0x79, 0x5b, - 0x58, 0x94, 0x92, 0x8e, 0x90, 0x76, 0x58, 0x51, 0x52, 0x63, 0x57, 0x88, - 0x9b, 0x7a, 0x85, 0x6c, 0x8b, 0x87, 0x5f, 0x8b, 0x90, 0x92, 0x81, 0x64, - 0x52, 0x8b, 0x77, 0x94, 0x96, 0x98, 0x69, 0x5b, 0x79, 0x87, 0x61, 0x96, - 0x7b, 0x9a, 0x61, 0x74, 0x7e, 0x8b, 0x82, 0x92, 0x4f, 0x87, 0x7f, 0x80, - 0x74, 0x97, 0x98, 0x7a, 0x79, 0x97, 0x65, 0x67, 0x66, 0xb1, 0xb1, 0x49, - 0xd6, 0x97, 0x58, 0x47, 0x62, 0x94, 0xd5, 0x82, 0xa0, 0x60, 0x3f, 0x67, - 0x6c, 0x9d, 0xb6, 0x58, 0xb1, 0x6e, 0x58, 0x4e, 0x7c, 0x83, 0x8b, 0x83, - 0xd5, 0x62, 0x8d, 0x84, 0x84, 0x8c, 0xa9, 0x6e, 0xac, 0x7f, 0x6d, 0x88, - 0xab, 0x8b, 0xb1, 0x77, 0x9b, 0x46, 0x76, 0xa7, 0xb8, 0x7b, 0xc5, 0x6e, - 0x73, 0x62, 0x68, 0x95, 0xab, 0x7c, 0x6f, 0x74, 0x56, 0x71, 0x61, 0x83, - 0x8a, 0x73, 0x54, 0x94, 0x86, 0x91, 0x60, 0x69, 0x65, 0x6b, 0x76, 0x85, - 0xae, 0x87, 0x8f, 0x55, 0x41, 0x98, 0x68, 0x87, 0x5e, 0x7a, 0x80, 0x38, - 0x50, 0xaf, 0x93, 0x79, 0x57, 0x96, 0x7b, 0x53, 0x4e, 0xc0, 0xa0, 0x85, - 0x87, 0x95, 0x86, 0x70, 0x4c, 0x9f, 0x77, 0x7d, 0x8b, 0x7a, 0x7b, 0x6d, - 0x57, 0x74, 0x81, 0x7d, 0xa2, 0x79, 0x64, 0x6c, 0x55, 0x70, 0x3c, 0x88, - 0x8a, 0x7a, 0x58, 0x72, 0x71, 0x7d, 0x6a, 0x8d, 0x78, 0x7e, 0x95, 0x8b, - 0x84, 0x7e, 0x73, 0x7c, 0x7e, 0x67, 0x89, 0x8b, 0x6d, 0x68, 0x66, 0x73, - 0x5a, 0x93, 0x82, 0x85, 0x97, 0x6b, 0x9a, 0x72, 0x51, 0xa2, 0x4f, 0x67, - 0x67, 0x7e, 0xbb, 0x37, 0xe3, 0x9c, 0x57, 0x5b, 0x6f, 0xa0, 0xdc, 0x5c, - 0xa6, 0x7c, 0x71, 0x77, 0x72, 0x88, 0xd0, 0x4d, 0x93, 0x58, 0x74, 0x6d, - 0x8f, 0x77, 0xa3, 0x76, 0xb7, 0x76, 0x6d, 0x6d, 0x6f, 0x7b, 0xaa, 0x6d, - 0xaa, 0x6a, 0x72, 0x98, 0x8d, 0x98, 0xb0, 0x52, 0x76, 0x5d, 0x61, 0xb7, - 0xac, 0x90, 0xa5, 0x75, 0x7e, 0x3d, 0x5b, 0x9a, 0xbf, 0x81, 0x83, 0x7b, - 0x5c, 0x77, 0x74, 0x82, 0x8d, 0x7e, 0x4f, 0x9f, 0x8f, 0x97, 0x7c, 0x75, - 0x5b, 0x73, 0x97, 0x73, 0x85, 0x7f, 0x70, 0x5a, 0x53, 0x81, 0x81, 0x89, - 0x73, 0x8d, 0x8a, 0x5c, 0x5f, 0x84, 0x86, 0x6f, 0x76, 0x78, 0x82, 0x6d, - 0x4f, 0xbb, 0x91, 0x61, 0x7e, 0x97, 0x6c, 0x67, 0x62, 0x83, 0x61, 0x7d, - 0x89, 0x76, 0x7b, 0x67, 0x56, 0x74, 0x49, 0x7b, 0x6b, 0x8b, 0x89, 0x74, - 0x5b, 0x7f, 0x78, 0x7b, 0x80, 0x7e, 0x63, 0x71, 0x5e, 0x91, 0x81, 0x92, - 
0x7b, 0x90, 0x9c, 0x7a, 0x73, 0x85, 0x79, 0x9b, 0x66, 0x93, 0x60, 0x87, - 0x79, 0x69, 0x73, 0x8b, 0x53, 0x8c, 0x8d, 0x68, 0x93, 0xa0, 0x91, 0x65, - 0x57, 0x8d, 0x71, 0x65, 0x6c, 0x7e, 0xb3, 0x4f, 0xc7, 0xaa, 0x5a, 0x77, - 0x6e, 0x85, 0xe4, 0x6c, 0xa3, 0x89, 0x69, 0x54, 0x6d, 0x99, 0xb9, 0x77, - 0xa0, 0x80, 0x85, 0x71, 0x70, 0x78, 0x99, 0x66, 0xaf, 0x8a, 0x59, 0x64, - 0x54, 0x62, 0xbf, 0x5c, 0xbd, 0x77, 0x7f, 0xab, 0x95, 0x85, 0xaa, 0x6e, - 0xaa, 0x5a, 0x7b, 0x9f, 0xc3, 0x65, 0x93, 0x64, 0x7c, 0x2d, 0x4e, 0x8f, - 0xb2, 0x5f, 0x4e, 0x61, 0x64, 0x73, 0x56, 0x75, 0x79, 0x90, 0x5c, 0x81, - 0x8a, 0x8c, 0x70, 0x64, 0x74, 0x86, 0x86, 0x82, 0xab, 0x7e, 0x62, 0x4f, - 0x51, 0x89, 0x7b, 0x88, 0x73, 0x97, 0x77, 0x75, 0x5c, 0x9e, 0x97, 0x70, - 0x5a, 0x98, 0x7a, 0x54, 0x47, 0x99, 0xab, 0x5d, 0x91, 0xa0, 0x64, 0x51, - 0x57, 0x88, 0x88, 0x85, 0x81, 0x83, 0xa1, 0x89, 0x6a, 0x88, 0x69, 0x81, - 0x92, 0x63, 0x6a, 0x71, 0x72, 0x6a, 0x75, 0x8e, 0x90, 0x9d, 0x69, 0x60, - 0x73, 0x95, 0x79, 0x7b, 0x79, 0x7f, 0x77, 0x6e, 0x69, 0x63, 0x60, 0xa0, - 0x84, 0x91, 0x80, 0x96, 0x92, 0x70, 0x69, 0x7c, 0x3f, 0x90, 0x5c, 0x79, - 0x82, 0x63, 0x8d, 0x63, 0x56, 0x8a, 0x8e, 0x7a, 0x5c, 0x8d, 0xb8, 0x4e, - 0xb6, 0x84, 0x57, 0x79, 0x59, 0x79, 0xe8, 0x7e, 0xa8, 0x71, 0x61, 0x62, - 0x89, 0x71, 0xb7, 0x83, 0x7b, 0x53, 0x86, 0x88, 0x74, 0x71, 0xb1, 0x61, - 0xae, 0x7e, 0x8f, 0x69, 0x6b, 0x69, 0xb2, 0x6d, 0xb1, 0x7f, 0x5c, 0x9f, - 0xaa, 0x8c, 0xbd, 0x74, 0xaa, 0x5b, 0x7f, 0xa5, 0xb0, 0x6e, 0xc1, 0x5c, - 0x94, 0x34, 0x5b, 0xa6, 0xbc, 0x49, 0x75, 0x5b, 0x6e, 0x74, 0x7a, 0x92, - 0x92, 0x79, 0x78, 0x8a, 0x9e, 0x97, 0x7c, 0x5f, 0x76, 0x86, 0x59, 0x81, - 0x83, 0x7a, 0x65, 0x5b, 0x42, 0x95, 0x84, 0x99, 0x81, 0x8d, 0x6a, 0x5e, - 0x59, 0xb7, 0x96, 0x8a, 0x77, 0x86, 0x7a, 0x67, 0x3b, 0xa8, 0xae, 0x7a, - 0xa0, 0x97, 0x6c, 0x73, 0x5b, 0x9b, 0x77, 0x84, 0x7a, 0x77, 0x75, 0x6f, - 0x7d, 0x7a, 0x71, 0x86, 0x6c, 0x6f, 0x7d, 0x71, 0x68, 0x60, 0x64, 0x86, - 0x90, 0x75, 0x6a, 0x61, 0x60, 0x87, 0x68, 0x99, 0x87, 0x7e, 0x92, 0x87, - 0x87, 0x5f, 0x60, 0x91, 0x68, 0x8c, 0x7b, 0x67, 0x79, 0x5d, 0x67, 0x77, - 0x47, 0x72, 0x76, 0x88, 0x82, 0xa2, 0x7a, 0x5d, 0x64, 0x87, 0x75, 0x78, - 0x5e, 0x6f, 0xa4, 0x52, 0xc2, 0x9d, 0x81, 0x89, 0x55, 0x86, 0xc9, 0x6f, - 0x95, 0x71, 0x9d, 0x87, 0x95, 0x74, 0xac, 0x7f, 0x95, 0x6c, 0x68, 0x66, - 0x8a, 0x5f, 0x96, 0x69, 0x95, 0x79, 0x7f, 0x71, 0x86, 0x7e, 0x98, 0x71, - 0xac, 0x8f, 0x75, 0xa5, 0xac, 0x7a, 0xca, 0x63, 0xa0, 0x63, 0x69, 0xbf, - 0xae, 0x62, 0xc9, 0x46, 0x74, 0x2c, 0x66, 0x96, 0xb7, 0x70, 0x7c, 0x6b, - 0x7b, 0x90, 0x72, 0x74, 0x8d, 0x5f, 0x63, 0x93, 0x97, 0x78, 0x79, 0x64, - 0x67, 0x84, 0x64, 0x82, 0x90, 0x83, 0x91, 0x5f, 0x72, 0x93, 0x91, 0xae, - 0x6d, 0x99, 0x5b, 0x69, 0x54, 0x9f, 0x97, 0x80, 0x80, 0xa4, 0x91, 0x66, - 0x65, 0xa4, 0xa7, 0x7b, 0x97, 0x87, 0x72, 0x68, 0x6a, 0x96, 0x7b, 0x79, - 0x69, 0x83, 0x6f, 0x85, 0x6b, 0x92, 0x7f, 0x71, 0x84, 0x87, 0x6a, 0x7b, - 0x63, 0x72, 0x5f, 0x87, 0x98, 0x7b, 0x96, 0x71, 0x62, 0x90, 0x71, 0xa3, - 0x8c, 0x77, 0x90, 0x6f, 0x83, 0x76, 0x65, 0x87, 0x72, 0x8a, 0x64, 0x87, - 0x75, 0x75, 0x6d, 0x84, 0x54, 0x89, 0x88, 0xa0, 0x87, 0x73, 0x7f, 0x6f, - 0x5f, 0x90, 0x5e, 0x94, 0x5d, 0x61, 0xa6, 0x56, 0xb3, 0x91, 0x95, 0x75, - 0x4d, 0x74, 0xd9, 0x87, 0x92, 0x74, 0x7f, 0x79, 0x97, 0x6e, 0x90, 0x54, - 0x84, 0x5d, 0x5f, 0x75, 0x8b, 0x84, 0xa6, 0x75, 0xb4, 0x77, 0x78, 0x85, - 0x90, 0x76, 0xbd, 0x78, 0xd1, 0xa0, 0x5d, 0x96, 0xa9, 0x7c, 0xc1, 0x61, - 0xc2, 0x71, 0x8b, 0xa5, 0xa5, 0x5b, 0xc8, 0x50, 0x7b, 0x4b, 0x93, 0x99, - 0xae, 0x72, 0x67, 0x54, 0x81, 0x89, 0x96, 0x81, 0x6e, 0x68, 0x55, 0x7f, - 
0x93, 0x8c, 0x5e, 0x65, 0x6c, 0x84, 0x7f, 0x8f, 0x9e, 0x7b, 0x73, 0x7f, - 0x51, 0x63, 0x8a, 0x8b, 0x6b, 0x9b, 0x9d, 0x57, 0x68, 0x89, 0x98, 0x70, - 0x73, 0xa3, 0x7f, 0x69, 0x44, 0x89, 0xae, 0x68, 0x89, 0x80, 0x7e, 0x6d, - 0x70, 0x95, 0x85, 0x65, 0x91, 0x7f, 0x66, 0x74, 0x96, 0x72, 0x60, 0x7a, - 0x87, 0x85, 0x79, 0x54, 0x53, 0x6c, 0x88, 0x87, 0xa9, 0x90, 0x75, 0x8b, - 0x69, 0x98, 0x7d, 0x95, 0x85, 0x7a, 0x8b, 0x82, 0x87, 0x6f, 0x86, 0x7f, - 0x74, 0xab, 0x93, 0x6c, 0x8a, 0x78, 0x68, 0x81, 0x62, 0x88, 0x78, 0x91, - 0x8b, 0x55, 0xa7, 0x58, 0x64, 0x88, 0x71, 0x93, 0x7d, 0x69, 0xbc, 0x58, - 0xbe, 0x9a, 0x6f, 0x74, 0x6f, 0x7f, 0xeb, 0x9e, 0xb7, 0x60, 0x63, 0x98, - 0x82, 0x77, 0x94, 0x63, 0x80, 0x6f, 0x7d, 0x8f, 0x8b, 0x85, 0xa5, 0x62, - 0xad, 0x86, 0x5f, 0x76, 0x88, 0x74, 0xa5, 0x66, 0xa5, 0x94, 0x88, 0x9b, - 0x87, 0x9e, 0xa8, 0x5a, 0xc9, 0x81, 0x92, 0xcd, 0xb5, 0x67, 0xb9, 0x63, - 0x86, 0x65, 0x8d, 0xad, 0x98, 0x7c, 0x8a, 0x40, 0x67, 0x65, 0x60, 0x71, - 0x8e, 0x84, 0x73, 0x64, 0x98, 0x80, 0x73, 0x81, 0x48, 0x75, 0x71, 0x9e, - 0x73, 0x89, 0x89, 0x68, 0x73, 0xa6, 0x84, 0x8a, 0x7e, 0x9f, 0x78, 0x83, - 0x60, 0x77, 0xa1, 0x87, 0x76, 0xab, 0x74, 0x57, 0x6d, 0x99, 0xa5, 0x5e, - 0x9d, 0x91, 0x6d, 0x6a, 0x76, 0x9c, 0x7b, 0x66, 0x96, 0x84, 0x85, 0x6e, - 0x6c, 0x75, 0x86, 0x6a, 0x71, 0x67, 0x8a, 0x66, 0x66, 0x68, 0x73, 0x90, - 0x92, 0x68, 0x8f, 0x71, 0x82, 0x7e, 0x71, 0xad, 0x9f, 0x84, 0x9e, 0x7d, - 0x77, 0x6b, 0x67, 0x8f, 0x73, 0x9a, 0x91, 0x74, 0x8a, 0x74, 0x5a, 0x87, - 0x37, 0x80, 0x8c, 0x8f, 0x7f, 0x75, 0xa8, 0x49, 0x63, 0x9b, 0x67, 0x68, - 0x4f, 0x87, 0xbf, 0x59, 0x9c, 0xbe, 0x93, 0x7e, 0x6f, 0x8a, 0xea, 0x77, - 0x83, 0x7a, 0x75, 0x8e, 0x7d, 0x50, 0x95, 0x60, 0x74, 0x60, 0x6f, 0x97, - 0x72, 0x5c, 0xa3, 0x6d, 0xb9, 0x86, 0x7b, 0x89, 0x9a, 0x76, 0xc7, 0x56, - 0xba, 0x86, 0x8d, 0x93, 0xa9, 0x98, 0xbb, 0x6a, 0x97, 0x74, 0x68, 0x84, - 0xc3, 0x65, 0xb6, 0x68, 0x89, 0x58, 0x87, 0xa1, 0xac, 0x60, 0x65, 0x68, - 0x7d, 0x98, 0x67, 0x8f, 0x8e, 0x84, 0x50, 0x75, 0x83, 0x91, 0x8a, 0x90, - 0x66, 0x74, 0x96, 0x89, 0x81, 0x7a, 0x7a, 0x64, 0x7f, 0x73, 0x8f, 0x95, - 0x8c, 0x89, 0x96, 0x76, 0x7a, 0x6c, 0x89, 0x91, 0x6d, 0x84, 0x68, 0x8d, - 0x47, 0x94, 0x9a, 0x67, 0x8f, 0x89, 0x8e, 0x79, 0x73, 0xa8, 0x7f, 0x6c, - 0x80, 0x64, 0x75, 0x81, 0x96, 0x9c, 0x68, 0x65, 0x76, 0x68, 0x74, 0x72, - 0x68, 0x76, 0x62, 0x6d, 0x6e, 0x6a, 0x84, 0x65, 0x8a, 0x73, 0x76, 0x91, - 0x78, 0x7c, 0x7a, 0x88, 0x6a, 0x87, 0x60, 0x99, 0x88, 0x75, 0x7b, 0x71, - 0x81, 0x7b, 0x76, 0x7d, 0x58, 0x75, 0x65, 0xa3, 0x95, 0x7e, 0x96, 0x3e, - 0x4c, 0x97, 0x86, 0x7a, 0x62, 0x92, 0xd1, 0x72, 0x8e, 0xaa, 0x85, 0x8e, - 0x59, 0x5f, 0xec, 0x77, 0x96, 0x66, 0x91, 0x9a, 0x89, 0x6c, 0xa2, 0x69, - 0x7d, 0x6e, 0x76, 0x63, 0x82, 0x72, 0x9c, 0x72, 0xa3, 0x75, 0x85, 0x7b, - 0x6d, 0x96, 0xc2, 0x69, 0xa7, 0x6a, 0x6b, 0x83, 0xa2, 0x7d, 0xce, 0x5c, - 0x94, 0x61, 0x7d, 0xae, 0xc3, 0x6d, 0x9f, 0x3c, 0x52, 0x4d, 0x8e, 0x92, - 0xae, 0x6e, 0x70, 0x5a, 0x76, 0x84, 0x7f, 0x72, 0x92, 0x72, 0x76, 0x5e, - 0x73, 0x8e, 0x82, 0x6d, 0x72, 0x81, 0x79, 0x94, 0x81, 0x88, 0x8b, 0x81, - 0x72, 0x72, 0x69, 0x84, 0x59, 0x6e, 0x74, 0x7d, 0x66, 0x74, 0x8d, 0x7b, - 0x7d, 0x7e, 0x7a, 0x83, 0x4d, 0x7e, 0x6a, 0x5a, 0x87, 0x66, 0x84, 0xa5, - 0x50, 0x5d, 0x6a, 0x8e, 0x87, 0x74, 0x88, 0x7c, 0x7d, 0x6c, 0x93, 0x98, - 0x8c, 0x76, 0x7f, 0xa3, 0x6e, 0x5d, 0x7d, 0x9f, 0x7c, 0x7a, 0x98, 0x88, - 0x74, 0x73, 0x50, 0x8c, 0x78, 0x8b, 0x71, 0x77, 0x9d, 0x56, 0x71, 0x85, - 0x6b, 0x8a, 0x93, 0x82, 0x8c, 0x79, 0x68, 0x8b, 0x57, 0x7b, 0x7c, 0x8a, - 0x6c, 0x87, 0x98, 0x54, 0x63, 0x7e, 0x78, 0x6b, 0x63, 0x77, 0xc1, 0x52, - 
0xcd, 0xab, 0x75, 0x8e, 0x64, 0x68, 0xce, 0x68, 0x88, 0x6d, 0x67, 0x6d, - 0x68, 0x76, 0xa7, 0x78, 0x83, 0x67, 0x65, 0x5b, 0x8f, 0x63, 0x90, 0x5b, - 0xa1, 0x6f, 0x6a, 0x88, 0x70, 0x5c, 0x78, 0x49, 0xbc, 0x85, 0x8d, 0x8e, - 0xa3, 0x90, 0x97, 0x84, 0xa2, 0x46, 0x7a, 0x8e, 0x9e, 0xb1, 0xaa, 0x53, - 0x7d, 0x6b, 0x72, 0x86, 0x8c, 0x67, 0x6b, 0x48, 0x6f, 0x9c, 0x51, 0x94, - 0x6d, 0x66, 0x8e, 0x90, 0x79, 0x81, 0x66, 0x9f, 0x82, 0x9f, 0x98, 0x97, - 0x7c, 0x86, 0x7f, 0x57, 0x57, 0x83, 0x97, 0x8f, 0x73, 0x6f, 0x75, 0x6c, - 0x56, 0x8f, 0x7f, 0x73, 0x71, 0x84, 0x7d, 0x5f, 0x69, 0x69, 0x8e, 0x67, - 0x8a, 0x7f, 0x8c, 0x5a, 0x7a, 0x67, 0x82, 0x5a, 0x7a, 0x68, 0x73, 0x58, - 0x84, 0x83, 0x8d, 0x6d, 0x83, 0x72, 0x80, 0x7a, 0x8e, 0x7a, 0x68, 0x88, - 0x65, 0x74, 0x78, 0x73, 0x83, 0x97, 0x7b, 0x84, 0x77, 0x6d, 0x95, 0x99, - 0x76, 0x69, 0x5f, 0x9b, 0x7c, 0x75, 0x91, 0x80, 0x7b, 0x73, 0x6f, 0x9f, - 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00, 0x07, 0x00, 0x0c, 0x00, - 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, - 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, - 0x0b, 0x00, 0x00, 0x00, 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x5f, 0x62, - 0x69, 0x61, 0x73, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0xaa, 0xcc, 0xe2, 0x37, 0x10, 0x00, 0x00, 0x00, 0xd6, 0x01, 0x00, 0x00, - 0xfd, 0xfd, 0xff, 0xff, 0x53, 0xfe, 0xff, 0xff, 0x74, 0x01, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0xc0, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, - 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3f, 0x14, 0x00, 0x1c, 0x00, - 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x18, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, - 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, - 0x28, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x14, 0x00, 0x18, 0x00, - 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x10, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x14, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, - 0x10, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, - 0x1c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, - 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x10, 0x00, - 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, - 0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, - 0x04, 0x00, 0x00, 0x00, 0xfa, 0xff, 0xff, 0xff, 0x00, 0x19, 0x06, 0x00, - 0x06, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x09, 0x06, 0x00, - 
0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04}; -const int g_model_len = 18288; + 0x01, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xf2, 0xdd, 0xbb, 0x3d, + 0x01, 0x00, 0x00, 0x00, 0x32, 0xa3, 0x25, 0x41, 0x01, 0x00, 0x00, 0x00, + 0xf6, 0xa0, 0x50, 0xc1, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f, + 0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x0e, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02, + 0x2c, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f, + 0x32, 0x2f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x4a, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, + 0x5c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x1c, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00, + 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x50, 0x50, 0xd0, 0x3d, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68, + 0x61, 0x70, 0x65, 0x5f, 0x32, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0xc2, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, + 0x58, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x94, 0xff, 0xff, 0xff, 0x2c, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x50, 0x50, 0xd0, 0x3d, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x80, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f, + 0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0xa8, 0x07, 0x00, 0x00, 0x2e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, + 0x60, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x14, 0x00, 0x04, 0x00, 0x08, 0x00, + 0x0c, 0x00, 0x10, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x01, 0x00, 0x00, 0x00, 0x3a, 0x6a, 0xac, 0x3d, 0x01, 0x00, 0x00, 0x00, + 0xd0, 0xbd, 0xab, 0x41, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0xaa, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x02, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x9c, 0xff, 0xff, 0xff, + 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x96, 0x08, 0x29, 0x38, 0x0b, 0x00, 0x00, 0x00, + 0x4d, 0x61, 0x74, 0x4d, 0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x0e, 0x00, + 0x18, 0x00, 0x08, 0x00, 0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xa0, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x0c, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, + 0x0c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x9a, 0xbb, 0x84, 0x38, 0x83, 0x84, 0x73, 0x37, 0x5b, 0xa3, 0xa0, 0x38, + 0x16, 0x41, 0x3a, 0x38, 0xc7, 0x9a, 0x70, 0x38, 0xed, 0x70, 0x4e, 0x38, + 0x54, 0x4f, 0xac, 0x38, 0xfd, 0x07, 0x8d, 0x38, 0x0b, 0x00, 0x00, 0x00, + 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0xe6, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x19, + 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x00, 0x16, 0x0a, 0x00, 0x0e, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09, + 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, + 0x03, 0x00, 0x00, 0x00}; +const int g_model_len = 18712; diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc b/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc index b523a8185d4..80f2b62546b 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.cc @@ -17,8 +17,8 @@ limitations under the License. #include "tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h" -const uint8_t g_no_feature_data_slice[g_no_feature_data_slice_size] = { - 216, 195, 223, 211, 238, 223, 243, 215, 226, 204, 232, 211, 232, 213, - 240, 218, 235, 214, 238, 205, 207, 173, 149, 201, 215, 200, 230, 213, - 208, 195, 175, 151, 195, 175, 182, 163, 235, 217, 218, 190, +const int8_t g_no_feature_data_slice[g_no_feature_data_slice_size] = { + 89, 68, 96, 83, 111, 96, 115, 87, 99, 76, 105, 84, 105, 86, + 113, 91, 108, 87, 110, 78, 80, 46, 22, 74, 88, 72, 103, 86, + 80, 68, 48, 24, 68, 48, 55, 36, 108, 90, 90, 63, }; diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h b/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h index 234e7efc388..7c27379f6de 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/no_feature_data_slice.h @@ -24,6 +24,6 @@ limitations under the License. 
#include constexpr int g_no_feature_data_slice_size = 40; -extern const uint8_t g_no_feature_data_slice[]; +extern const int8_t g_no_feature_data_slice[]; #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_FEATURE_DATA_SLICE_H_ diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.cc b/tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.cc index d7a923364a7..2fa4556a273 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.cc @@ -15,151 +15,174 @@ limitations under the License. #include "tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h" -/* File automatically created by - * tensorflow/examples/speech_commands/wav_to_features.py \ - * --sample_rate=16000 \ - * --clip_duration_ms=1000 \ - * --window_size_ms=30 \ - * --window_stride_ms=20 \ - * --feature_bin_count=40 \ - * --quantize=1 \ - * --preprocess="micro" \ - * --input_wav="speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav" \ - * --output_c_file="/tmp/no_micro_features_data.cc" \ - */ +// Golden test values for the expected spectrogram from a "no" sample file +// speech_commands_test_set_v0.02/no/f9643d42_nohash_4.wav. const int g_no_micro_f9643d42_nohash_4_width = 40; const int g_no_micro_f9643d42_nohash_4_height = 49; -const unsigned char g_no_micro_f9643d42_nohash_4_data[] = { - 230, 205, 191, 203, 202, 181, 180, 194, 205, 187, 183, 197, 203, 198, 196, - 186, 202, 159, 151, 126, 110, 138, 141, 142, 137, 148, 133, 120, 110, 126, - 117, 110, 117, 116, 137, 134, 95, 116, 123, 110, 184, 144, 183, 189, 197, - 172, 188, 164, 194, 179, 175, 174, 182, 173, 184, 174, 200, 145, 154, 148, - 147, 135, 143, 122, 127, 138, 116, 99, 122, 105, 110, 125, 127, 133, 131, - 123, 116, 119, 127, 114, 193, 176, 185, 170, 175, 146, 166, 167, 185, 185, - 185, 183, 195, 185, 176, 178, 197, 155, 137, 144, 164, 132, 153, 132, 138, - 137, 134, 95, 120, 116, 131, 122, 99, 120, 120, 110, 116, 110, 126, 127, - 128, 159, 187, 119, 178, 187, 197, 167, 199, 184, 180, 165, 194, 176, 144, - 134, 187, 136, 142, 134, 145, 132, 145, 105, 119, 123, 125, 116, 125, 102, - 129, 138, 130, 99, 99, 90, 120, 123, 134, 95, 194, 172, 187, 123, 191, - 179, 195, 182, 201, 137, 167, 142, 185, 161, 187, 146, 167, 152, 154, 107, - 152, 112, 134, 144, 117, 116, 105, 85, 105, 105, 99, 90, 123, 112, 112, - 68, 107, 105, 117, 99, 116, 143, 139, 90, 154, 142, 188, 172, 178, 135, - 175, 149, 177, 110, 173, 160, 169, 162, 173, 119, 132, 110, 85, 85, 117, - 129, 117, 112, 117, 51, 112, 95, 139, 102, 105, 90, 128, 119, 112, 99, - 170, 168, 195, 152, 174, 173, 180, 0, 157, 130, 169, 149, 149, 123, 170, - 130, 170, 133, 159, 102, 134, 90, 85, 105, 126, 119, 130, 90, 78, 68, - 127, 120, 95, 51, 122, 110, 112, 78, 116, 95, 180, 135, 179, 146, 179, - 162, 197, 153, 172, 135, 154, 0, 149, 95, 145, 114, 166, 0, 114, 110, - 145, 107, 114, 90, 136, 68, 95, 95, 95, 85, 116, 99, 116, 0, 95, - 68, 102, 51, 102, 78, 185, 157, 138, 158, 180, 117, 173, 142, 145, 117, - 169, 130, 159, 99, 138, 123, 169, 90, 78, 0, 123, 85, 107, 51, 114, - 102, 95, 0, 116, 85, 119, 95, 95, 68, 85, 51, 116, 68, 102, 78, - 167, 105, 164, 163, 178, 126, 164, 154, 154, 51, 177, 120, 156, 85, 134, - 139, 168, 90, 161, 102, 114, 116, 122, 95, 112, 102, 107, 51, 114, 85, - 119, 78, 114, 90, 102, 51, 102, 51, 114, 99, 177, 68, 152, 102, 184, - 166, 179, 129, 177, 129, 180, 
110, 158, 105, 139, 0, 145, 85, 148, 102, - 117, 102, 116, 0, 78, 68, 90, 51, 107, 85, 78, 0, 51, 0, 51, - 0, 95, 51, 107, 68, 180, 117, 90, 0, 138, 0, 187, 146, 119, 140, - 164, 90, 136, 0, 131, 51, 159, 99, 141, 138, 116, 51, 90, 51, 90, - 68, 105, 0, 85, 78, 112, 51, 122, 95, 128, 68, 85, 0, 112, 68, - 147, 126, 178, 146, 171, 130, 190, 147, 188, 123, 170, 78, 132, 0, 130, - 125, 159, 95, 102, 0, 110, 0, 95, 85, 120, 68, 78, 51, 99, 51, - 105, 0, 112, 102, 105, 68, 90, 51, 90, 0, 127, 95, 166, 175, 187, - 133, 135, 0, 171, 139, 132, 128, 140, 51, 126, 107, 161, 0, 95, 51, - 119, 0, 114, 0, 95, 110, 116, 51, 112, 0, 90, 0, 116, 51, 68, - 0, 105, 68, 105, 0, 164, 78, 173, 0, 194, 166, 145, 114, 116, 51, - 107, 122, 151, 0, 156, 102, 148, 51, 122, 95, 129, 0, 85, 0, 127, - 78, 90, 0, 78, 0, 95, 0, 110, 0, 68, 119, 120, 68, 68, 0, - 122, 99, 147, 127, 200, 167, 85, 114, 161, 85, 161, 125, 143, 99, 156, - 85, 147, 68, 99, 0, 107, 102, 132, 51, 112, 68, 95, 78, 99, 0, - 68, 0, 51, 0, 90, 78, 128, 51, 95, 0, 166, 136, 174, 138, 189, - 144, 130, 129, 138, 134, 132, 120, 134, 0, 51, 78, 147, 51, 51, 0, - 51, 0, 78, 0, 68, 68, 95, 78, 90, 0, 0, 0, 68, 0, 90, - 68, 110, 0, 95, 51, 165, 151, 157, 0, 0, 0, 112, 0, 112, 95, - 149, 107, 119, 68, 126, 68, 138, 0, 78, 0, 78, 0, 99, 51, 112, - 0, 102, 0, 78, 51, 85, 0, 0, 0, 78, 0, 95, 0, 95, 78, - 105, 0, 152, 0, 0, 51, 132, 105, 159, 0, 129, 102, 114, 0, 138, - 51, 123, 0, 129, 78, 119, 51, 51, 51, 105, 0, 78, 85, 95, 0, - 85, 0, 0, 0, 85, 0, 78, 0, 0, 0, 172, 142, 141, 0, 137, - 0, 148, 128, 157, 120, 146, 120, 120, 0, 95, 78, 141, 68, 68, 0, - 68, 0, 90, 0, 85, 0, 107, 0, 78, 0, 85, 51, 102, 0, 68, - 78, 68, 0, 51, 0, 125, 0, 141, 51, 102, 138, 175, 51, 120, 51, - 173, 85, 116, 141, 164, 68, 150, 123, 133, 51, 114, 0, 117, 68, 150, - 51, 116, 68, 78, 0, 68, 0, 68, 0, 85, 0, 78, 0, 51, 78, - 155, 90, 161, 0, 132, 99, 123, 78, 107, 0, 134, 90, 95, 0, 78, - 0, 162, 143, 85, 0, 107, 78, 125, 90, 90, 51, 51, 0, 85, 0, - 0, 0, 132, 102, 102, 154, 128, 0, 99, 68, 162, 102, 151, 0, 99, - 51, 147, 141, 156, 0, 112, 120, 158, 127, 145, 139, 187, 171, 135, 138, - 146, 0, 95, 68, 127, 0, 85, 0, 105, 0, 0, 0, 187, 170, 162, - 188, 165, 51, 51, 78, 243, 215, 225, 196, 205, 181, 205, 168, 176, 134, - 157, 110, 126, 114, 133, 139, 193, 163, 159, 116, 160, 126, 122, 127, 171, - 99, 114, 68, 123, 85, 90, 0, 157, 146, 166, 179, 136, 0, 116, 90, - 242, 219, 240, 204, 216, 164, 188, 171, 176, 164, 154, 158, 190, 157, 190, - 141, 182, 177, 169, 128, 172, 145, 105, 129, 157, 90, 78, 51, 119, 68, - 137, 68, 116, 78, 141, 132, 151, 122, 156, 140, 234, 206, 229, 201, 216, - 174, 191, 144, 162, 85, 122, 157, 194, 167, 204, 149, 180, 166, 166, 139, - 122, 133, 156, 126, 145, 85, 128, 0, 99, 51, 145, 0, 126, 51, 166, - 162, 166, 162, 177, 157, 228, 198, 221, 197, 214, 177, 173, 166, 173, 139, - 185, 191, 202, 163, 205, 172, 206, 189, 135, 68, 166, 134, 149, 134, 135, - 90, 127, 107, 175, 90, 136, 117, 135, 140, 172, 167, 166, 149, 177, 152, - 221, 191, 215, 194, 211, 0, 156, 147, 182, 178, 208, 163, 190, 157, 208, - 200, 195, 164, 179, 154, 181, 150, 143, 99, 132, 137, 185, 143, 163, 85, - 51, 107, 132, 134, 164, 127, 167, 159, 175, 141, 216, 195, 223, 211, 238, - 223, 243, 215, 226, 204, 232, 211, 232, 213, 240, 218, 235, 214, 238, 205, - 207, 173, 149, 201, 215, 200, 230, 213, 208, 195, 175, 151, 195, 175, 182, - 163, 235, 217, 218, 190, 211, 191, 215, 191, 217, 220, 241, 215, 229, 206, - 236, 210, 227, 216, 236, 188, 183, 149, 202, 189, 208, 172, 191, 201, 220, - 193, 221, 
207, 216, 208, 201, 131, 170, 187, 229, 197, 211, 194, 226, 201, - 205, 184, 206, 177, 221, 210, 226, 184, 204, 197, 218, 198, 212, 209, 213, - 141, 172, 110, 175, 167, 180, 156, 213, 188, 192, 179, 213, 205, 204, 174, - 200, 147, 162, 181, 203, 167, 198, 187, 210, 164, 196, 169, 189, 168, 224, - 198, 213, 204, 198, 195, 230, 211, 221, 197, 208, 0, 0, 0, 85, 90, - 167, 130, 175, 173, 203, 164, 193, 144, 170, 145, 185, 148, 154, 139, 198, - 159, 180, 171, 216, 174, 178, 161, 166, 136, 216, 184, 215, 197, 199, 190, - 228, 195, 208, 51, 117, 0, 0, 0, 0, 0, 140, 51, 135, 154, 188, - 155, 168, 0, 90, 0, 156, 85, 110, 0, 174, 90, 172, 154, 179, 99, - 142, 166, 179, 157, 177, 95, 192, 142, 204, 198, 217, 147, 173, 0, 112, - 0, 0, 0, 0, 0, 0, 0, 110, 0, 107, 0, 160, 0, 148, 95, - 172, 0, 0, 0, 116, 0, 122, 114, 170, 0, 0, 0, 0, 0, 179, - 110, 196, 85, 205, 183, 169, 0, 99, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 141, 0, 112, 0, 0, 0, 134, 0, 0, 0, 0, - 0, 0, 0, 139, 0, 0, 0, 0, 112, 186, 78, 163, 0, 169, 128, - 174, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 95, - 0, 105, 0, 0, 0, 105, 0, 0, 0, 0, 0, 0, 0, 95, 0, - 0, 0, 0, 0, 0, 0, 119, 0, 164, 78, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 90, 0, 0, 68, - 117, 0, 0, 0, 0, 0, 0, 0, 148, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, - 0, 0, 0, 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +const signed char g_no_micro_f9643d42_nohash_4_data[] = { + 103, 78, 64, 76, 75, 54, 53, 67, 77, 60, 56, 70, + 76, 71, 68, 58, 74, 32, 23, -2, -18, 11, 13, 15, + 9, 20, 5, -7, -18, -2, -10, -18, -10, -12, 9, 7, + -33, -12, -4, -18, 57, 17, 55, 62, 70, 45, 61, 37, + 67, 52, 48, 47, 55, 46, 57, 47, 73, 17, 27, 20, + 19, 8, 15, -6, -1, 10, -12, -29, -6, -23, -18, -3, + -1, 5, 3, -4, -12, -8, -1, -14, 65, 48, 58, 43, + 48, 19, 39, 39, 57, 57, 58, 55, 67, 58, 49, 50, + 70, 27, 9, 16, 37, 4, 25, 4, 11, 9, 7, -33, + -7, -12, 3, -6, -29, -7, -7, -18, -12, -18, -2, -1, + 0, 31, 60, -8, 51, 59, 70, 40, 71, 57, 52, 38, + 66, 48, 17, 6, 59, 8, 15, 7, 18, 4, 18, -23, + -8, -4, -3, -12, -3, -26, 1, 10, 2, -29, -29, -37, + -7, -4, 6, -33, 67, 44, 59, -4, 64, 51, 68, 55, + 74, 9, 40, 15, 57, 33, 
60, 18, 40, 25, 27, -20, + 25, -16, 6, 17, -10, -12, -23, -43, -23, -23, -29, -37, + -4, -16, -16, -60, -20, -23, -10, -29, -12, 15, 12, -37, + 27, 15, 61, 44, 50, 8, 48, 22, 49, -18, 46, 33, + 42, 34, 46, -8, 4, -18, -43, -43, -10, 1, -10, -16, + -10, -77, -16, -33, 11, -26, -23, -37, 0, -8, -16, -29, + 42, 40, 68, 24, 47, 46, 53, -128, 30, 2, 42, 21, + 21, -4, 43, 2, 43, 5, 32, -26, 7, -37, -43, -23, + -2, -8, 2, -37, -50, -60, -1, -7, -33, -77, -6, -18, + -16, -50, -12, -33, 53, 8, 52, 18, 51, 35, 69, 26, + 44, 8, 27, -128, 21, -33, 17, -14, 38, -128, -14, -18, + 17, -20, -14, -37, 8, -60, -33, -33, -33, -43, -12, -29, + -12, -128, -33, -60, -26, -77, -26, -50, 57, 29, 11, 30, + 53, -10, 45, 15, 18, -10, 42, 2, 31, -29, 10, -4, + 42, -37, -50, -128, -4, -43, -20, -77, -14, -26, -33, -128, + -12, -43, -8, -33, -33, -60, -43, -77, -12, -60, -26, -50, + 40, -23, 36, 35, 50, -2, 37, 27, 26, -77, 49, -7, + 28, -43, 6, 11, 41, -37, 33, -26, -14, -12, -6, -33, + -16, -26, -20, -77, -14, -43, -8, -50, -14, -37, -26, -77, + -26, -77, -14, -29, 50, -60, 25, -26, 57, 38, 51, 1, + 50, 1, 53, -18, 30, -23, 11, -128, 18, -43, 20, -26, + -10, -26, -12, -128, -50, -60, -37, -77, -20, -43, -50, -128, + -77, -128, -77, -128, -33, -77, -20, -60, 53, -10, -37, -128, + 10, -128, 60, 18, -8, 13, 37, -37, 8, -128, 3, -77, + 32, -29, 14, 10, -12, -77, -37, -77, -37, -60, -23, -128, + -43, -50, -16, -77, -6, -33, 0, -60, -43, -128, -16, -60, + 20, -2, 51, 19, 43, 2, 63, 20, 60, -4, 42, -50, + 4, -128, 2, -3, 32, -33, -26, -128, -18, -128, -33, -43, + -7, -60, -50, -77, -29, -77, -23, -128, -16, -26, -23, -60, + -37, -77, -37, -128, -1, -33, 39, 48, 60, 5, 8, -128, + 44, 11, 4, 0, 13, -77, -2, -20, 33, -128, -33, -77, + -8, -128, -14, -128, -33, -18, -12, -77, -16, -128, -37, -128, + -12, -77, -60, -128, -23, -60, -23, -128, 36, -50, 46, -128, + 66, 39, 18, -14, -12, -77, -20, -6, 24, -128, 28, -26, + 21, -77, -6, -33, 1, -128, -43, -128, -1, -50, -37, -128, + -50, -128, -33, -128, -18, -128, -60, -8, -7, -60, -60, -128, + -6, -29, 20, -1, 73, 40, -43, -14, 33, -43, 33, -3, + 15, -29, 29, -43, 20, -60, -29, -128, -20, -26, 4, -77, + -16, -60, -33, -50, -29, -128, -60, -128, -77, -128, -37, -50, + 0, -77, -33, -128, 39, 8, 47, 10, 62, 16, 2, 1, + 10, 7, 4, -7, 6, -128, -77, -50, 19, -77, -77, -128, + -77, -128, -50, -128, -60, -60, -33, -50, -37, -128, -128, -128, + -60, -128, -37, -60, -18, -128, -33, -77, 37, 23, 29, -128, + -128, -128, -16, -128, -16, -33, 21, -20, -8, -60, -2, -60, + 11, -128, -50, -128, -50, -128, -29, -77, -16, -128, -26, -128, + -50, -77, -43, -128, -128, -128, -50, -128, -33, -128, -33, -50, + -23, -128, 24, -128, -128, -77, 4, -23, 32, -128, 1, -26, + -14, -128, 10, -77, -4, -128, 1, -50, -8, -77, -77, -77, + -23, -128, -50, -43, -33, -128, -43, -128, -128, -128, -43, -128, + -50, -128, -128, -128, 44, 15, 14, -128, 9, -128, 21, 0, + 29, -7, 18, -7, -7, -128, -33, -50, 14, -60, -60, -128, + -60, -128, -37, -128, -43, -128, -20, -128, -50, -128, -43, -77, + -26, -128, -60, -50, -60, -128, -77, -128, -3, -128, 14, -77, + -26, 11, 47, -77, -7, -77, 45, -43, -12, 14, 37, -60, + 22, -4, 5, -77, -14, -128, -10, -60, 22, -77, -12, -60, + -50, -128, -60, -128, -60, -128, -43, -128, -50, -128, -77, -50, + 27, -37, 33, -128, 4, -29, -4, -50, -20, -128, 6, -37, + -33, -128, -50, -128, 34, 15, -43, -128, -20, -50, -3, -37, + -37, -77, -77, -128, -43, -128, -128, -128, 4, -26, -26, 27, + 0, -128, -29, -60, 35, -26, 23, -128, -29, -77, 19, 14, + 28, -128, -16, -7, 31, -1, 17, 11, 60, 44, 8, 11, + 18, 
-128, -33, -60, -1, -128, -43, -128, -23, -128, -128, -128, + 59, 43, 35, 61, 37, -77, -77, -50, 116, 88, 98, 69, + 78, 53, 78, 40, 48, 7, 29, -18, -2, -14, 5, 12, + 65, 35, 31, -12, 33, -2, -6, -1, 44, -29, -14, -60, + -4, -43, -37, -128, 29, 18, 38, 51, 8, -128, -12, -37, + 115, 91, 113, 77, 89, 36, 60, 44, 49, 36, 27, 31, + 63, 30, 62, 14, 55, 49, 42, 0, 45, 17, -23, 1, + 30, -37, -50, -77, -8, -60, 9, -60, -12, -50, 13, 4, + 23, -6, 28, 13, 107, 78, 101, 73, 89, 46, 63, 17, + 34, -43, -6, 30, 67, 40, 77, 21, 53, 39, 38, 12, + -6, 5, 28, -2, 18, -43, 0, -128, -29, -77, 18, -128, + -2, -77, 39, 35, 38, 35, 50, 29, 100, 70, 94, 69, + 86, 50, 45, 38, 45, 12, 58, 64, 74, 36, 77, 45, + 78, 62, 8, -60, 38, 6, 21, 7, 8, -37, -1, -20, + 48, -37, 8, -10, 8, 13, 45, 39, 38, 22, 49, 25, + 94, 63, 87, 66, 84, -128, 29, 20, 55, 51, 80, 36, + 62, 30, 81, 72, 68, 37, 51, 27, 54, 22, 16, -29, + 4, 9, 57, 15, 35, -43, -77, -20, 4, 6, 37, -1, + 40, 31, 47, 14, 89, 68, 96, 83, 111, 96, 115, 87, + 99, 76, 105, 84, 105, 86, 113, 91, 108, 87, 110, 78, + 80, 46, 22, 74, 88, 72, 103, 86, 80, 68, 48, 24, + 68, 48, 55, 36, 108, 90, 90, 63, 83, 63, 87, 64, + 90, 92, 113, 88, 102, 79, 109, 83, 100, 89, 109, 60, + 56, 21, 75, 62, 81, 45, 63, 73, 93, 65, 94, 80, + 89, 81, 73, 3, 43, 60, 102, 70, 84, 67, 99, 74, + 78, 57, 79, 50, 93, 82, 98, 56, 77, 70, 91, 71, + 85, 82, 86, 13, 45, -18, 48, 40, 53, 28, 85, 60, + 65, 52, 86, 78, 76, 46, 73, 19, 35, 54, 75, 40, + 71, 60, 82, 37, 69, 42, 62, 40, 96, 70, 85, 77, + 70, 68, 103, 84, 94, 69, 81, -128, -128, -128, -43, -37, + 40, 2, 48, 45, 76, 37, 65, 16, 43, 18, 58, 20, + 27, 12, 71, 31, 53, 44, 88, 47, 50, 33, 39, 8, + 89, 57, 88, 69, 72, 63, 100, 68, 81, -77, -10, -128, + -128, -128, -128, -128, 13, -77, 8, 27, 60, 28, 41, -128, + -37, -128, 28, -43, -18, -128, 47, -37, 45, 27, 51, -29, + 15, 39, 52, 30, 49, -33, 65, 15, 76, 71, 90, 19, + 46, -128, -16, -128, -128, -128, -128, -128, -128, -128, -18, -128, + -20, -128, 32, -128, 21, -33, 45, -128, -128, -128, -12, -128, + -6, -14, 43, -128, -128, -128, -128, -128, 52, -18, 69, -43, + 78, 55, 42, -128, -29, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, 14, -128, -16, -128, -128, -128, 7, -128, + -128, -128, -128, -128, -128, -128, 12, -128, -128, -128, -128, -16, + 59, -50, 35, -128, 42, 0, 47, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -33, -128, -23, -128, + -128, -128, -23, -128, -128, -128, -128, -128, -128, -128, -33, -128, + -128, -128, -128, -128, -128, -128, -8, -128, 36, -50, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -37, -128, -128, -60, -10, -128, -128, -128, -128, -128, + -128, -128, 21, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -12, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -77, -128, -128, -128, -29, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -29, -128, -128, -128, -128, -128, -128, -128, -128, -128, -50, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, 
-128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, }; diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h b/tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h index dc4d45b237e..8c1b6d5b57b 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/no_micro_features_data.h @@ -18,6 +18,6 @@ limitations under the License. extern const int g_no_micro_f9643d42_nohash_4_width; extern const int g_no_micro_f9643d42_nohash_4_height; -extern const unsigned char g_no_micro_f9643d42_nohash_4_data[]; +extern const signed char g_no_micro_f9643d42_nohash_4_data[]; #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_NO_MICRO_FEATURES_DATA_H_ diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc b/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc index 7597b043d9b..7f077b5ffef 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.cc @@ -17,8 +17,8 @@ limitations under the License. 
#include "tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h" -const uint8_t g_yes_feature_data_slice[g_yes_feature_data_slice_size] = { - 214, 215, 236, 202, 235, 203, 225, 191, 203, 188, 199, 194, 212, 127, - 51, 0, 174, 188, 219, 196, 228, 221, 240, 207, 235, 220, 241, 219, - 237, 207, 212, 142, 95, 0, 139, 78, 162, 177, 197, 183, +const int8_t g_yes_feature_data_slice[g_yes_feature_data_slice_size] = { + 86, 88, 108, 75, 108, 76, 98, 64, 75, 61, 71, 66, 85, -1, + -77, -128, 46, 61, 92, 69, 100, 93, 113, 80, 108, 93, 113, 91, + 110, 80, 85, 15, -33, -128, 12, -50, 34, 50, 70, 55, }; diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h b/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h index 1515449b2c2..2427ee70063 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_feature_data_slice.h @@ -24,6 +24,6 @@ limitations under the License. #include constexpr int g_yes_feature_data_slice_size = 40; -extern const uint8_t g_yes_feature_data_slice[]; +extern const int8_t g_yes_feature_data_slice[]; #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_FEATURE_DATA_SLICE_H_ diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc b/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc index 9c1fb8be0bb..6d9137af2da 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.cc @@ -15,151 +15,174 @@ limitations under the License. #include "tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h" -/* File automatically created by - * tensorflow/examples/speech_commands/wav_to_features.py \ - * --sample_rate=16000 \ - * --clip_duration_ms=1000 \ - * --window_size_ms=30 \ - * --window_stride_ms=20 \ - * --feature_bin_count=40 \ - * --quantize=1 \ - * --preprocess="micro" \ - * --input_wav="speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav" \ - * --output_c_file="yes_micro_features_data.cc" \ - */ +// Golden test values for the expected spectrogram from a "yes" sample file +// speech_commands_test_set_v0.02/yes/f2e59fea_nohash_1.wav. 
const int g_yes_micro_f2e59fea_nohash_1_width = 40; const int g_yes_micro_f2e59fea_nohash_1_height = 49; -const unsigned char g_yes_micro_f2e59fea_nohash_1_data[] = { - 244, 226, 245, 223, 234, 213, 228, 208, 194, 110, 95, 116, 102, 0, 137, - 161, 183, 173, 137, 116, 133, 157, 151, 156, 128, 110, 128, 0, 68, 78, - 78, 90, 68, 68, 78, 102, 95, 78, 95, 78, 210, 188, 209, 183, 204, - 188, 201, 191, 166, 119, 90, 107, 110, 107, 175, 157, 179, 168, 182, 145, - 152, 164, 171, 165, 136, 143, 122, 68, 0, 78, 90, 90, 110, 90, 102, - 99, 90, 68, 78, 68, 223, 186, 179, 123, 182, 110, 196, 171, 159, 110, - 102, 95, 90, 99, 160, 134, 125, 136, 153, 152, 164, 134, 164, 151, 141, - 136, 99, 90, 90, 90, 78, 78, 102, 119, 102, 90, 110, 90, 68, 51, - 177, 175, 211, 172, 183, 0, 95, 68, 129, 102, 68, 85, 114, 105, 110, - 85, 102, 95, 140, 51, 85, 51, 95, 90, 143, 116, 90, 78, 78, 51, - 107, 85, 68, 0, 68, 51, 90, 51, 68, 0, 164, 117, 193, 120, 156, - 0, 138, 51, 90, 0, 51, 0, 51, 85, 0, 0, 51, 0, 0, 0, - 0, 0, 114, 0, 85, 78, 90, 51, 0, 0, 51, 85, 99, 85, 107, - 68, 90, 85, 78, 0, 51, 0, 110, 0, 68, 0, 0, 0, 51, 0, - 51, 0, 0, 0, 68, 90, 107, 0, 68, 0, 0, 0, 68, 0, 51, - 68, 0, 78, 68, 0, 51, 0, 78, 68, 90, 68, 78, 51, 51, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 90, 0, 0, 0, 0, - 0, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 68, - 0, 0, 78, 0, 78, 0, 78, 0, 51, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 51, 0, 51, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 0, 51, - 0, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, - 0, 0, 0, 0, 51, 78, 0, 0, 51, 51, 0, 0, 0, 78, 0, - 213, 170, 192, 180, 196, 188, 173, 131, 173, 116, 137, 105, 159, 127, 0, - 0, 0, 0, 127, 164, 165, 161, 170, 164, 185, 197, 195, 167, 134, 138, - 159, 134, 136, 105, 51, 0, 99, 0, 51, 0, 228, 215, 229, 218, 237, - 215, 228, 210, 237, 222, 239, 211, 208, 211, 234, 218, 220, 209, 225, 219, - 235, 222, 245, 225, 245, 224, 243, 223, 241, 218, 237, 224, 234, 213, 221, - 193, 197, 164, 157, 128, 227, 188, 232, 196, 220, 220, 240, 219, 234, 213, - 234, 211, 231, 218, 233, 213, 239, 215, 228, 207, 229, 206, 224, 208, 226, - 207, 232, 210, 225, 208, 230, 199, 227, 206, 210, 205, 218, 174, 178, 141, - 235, 208, 220, 206, 225, 203, 233, 203, 225, 167, 205, 199, 208, 190, 221, - 204, 223, 207, 225, 188, 225, 197, 215, 188, 199, 183, 225, 195, 224, 200, - 216, 178, 208, 188, 215, 202, 214, 183, 176, 140, 198, 150, 211, 194, 203, - 120, 175, 188, 204, 189, 219, 192, 223, 202, 216, 186, 203, 185, 210, 182, - 214, 183, 204, 170, 204, 125, 184, 187, 206, 185, 198, 182, 210, 161, 202, - 198, 218, 173, 145, 120, 188, 183, 205, 168, 200, 170, 210, 177, 187, 190, - 209, 193, 193, 166, 210, 162, 175, 119, 174, 147, 182, 161, 181, 134, 176, - 143, 187, 165, 186, 149, 185, 141, 192, 181, 202, 123, 170, 143, 144, 78, - 149, 0, 208, 182, 170, 78, 170, 0, 117, 51, 156, 99, 195, 170, 200, - 130, 152, 68, 175, 141, 173, 134, 194, 132, 189, 164, 198, 134, 173, 117, - 171, 149, 183, 181, 185, 99, 153, 117, 125, 0, 166, 0, 173, 117, 144, - 0, 117, 102, 188, 120, 193, 166, 197, 68, 163, 119, 169, 99, 134, 0, - 162, 0, 164, 68, 171, 116, 126, 0, 120, 68, 68, 0, 105, 0, 159, - 95, 150, 51, 90, 85, 0, 0, 131, 0, 105, 0, 145, 51, 170, 51, - 120, 0, 107, 0, 145, 85, 160, 0, 85, 0, 0, 51, 149, 0, 78, - 0, 0, 0, 0, 0, 0, 0, 90, 0, 112, 0, 78, 102, 122, 0, - 0, 0, 0, 0, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 112, - 0, 164, 120, 143, 0, 0, 0, 0, 0, 51, 0, 90, 0, 78, 0, - 0, 0, 0, 0, 110, 0, 139, 0, 112, 51, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 
102, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 107, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 78, 0, 51, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 51, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 127, 110, 133, 0, 167, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 132, 0, 190, - 194, 202, 0, 197, 187, 161, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 214, 213, 223, 203, 218, 189, 200, 122, 78, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 191, 210, 231, 197, 226, 217, 238, 216, 236, 207, - 199, 0, 0, 0, 0, 0, 107, 122, 155, 160, 214, 215, 236, 202, 235, - 203, 225, 191, 203, 188, 199, 194, 212, 127, 51, 0, 174, 188, 219, 196, - 228, 221, 240, 207, 235, 220, 241, 219, 237, 207, 212, 142, 95, 0, 139, - 78, 162, 177, 197, 183, 211, 199, 235, 208, 238, 215, 227, 207, 211, 201, - 224, 213, 226, 192, 213, 170, 223, 205, 234, 221, 245, 225, 242, 220, 245, - 221, 239, 221, 238, 213, 226, 180, 159, 112, 176, 159, 208, 202, 213, 191, - 205, 191, 225, 197, 238, 219, 224, 201, 227, 200, 221, 201, 225, 203, 212, - 195, 229, 210, 228, 210, 239, 216, 226, 212, 233, 205, 225, 200, 229, 207, - 222, 151, 147, 119, 179, 185, 230, 218, 223, 192, 202, 136, 205, 177, 223, - 204, 228, 215, 232, 209, 221, 189, 221, 205, 209, 200, 226, 209, 229, 205, - 235, 192, 209, 198, 228, 190, 206, 185, 207, 187, 214, 175, 177, 184, 220, - 195, 214, 207, 230, 184, 205, 159, 208, 184, 189, 169, 224, 213, 219, 199, - 229, 203, 216, 205, 222, 204, 224, 206, 231, 208, 231, 176, 197, 184, 216, - 193, 211, 139, 212, 195, 231, 164, 166, 195, 217, 182, 208, 190, 217, 179, - 205, 68, 182, 119, 195, 168, 182, 136, 204, 179, 193, 158, 182, 140, 188, - 154, 197, 169, 190, 99, 184, 0, 125, 0, 131, 0, 99, 68, 179, 85, - 190, 184, 213, 203, 223, 202, 212, 190, 209, 138, 178, 0, 159, 51, 128, - 51, 105, 0, 139, 51, 179, 125, 185, 114, 171, 128, 175, 132, 181, 174, - 155, 0, 0, 0, 90, 0, 125, 0, 176, 188, 227, 217, 244, 215, 234, - 221, 239, 192, 224, 210, 0, 0, 134, 0, 51, 0, 105, 0, 105, 0, - 143, 90, 192, 119, 175, 147, 141, 51, 184, 110, 85, 0, 0, 0, 0, - 0, 0, 0, 151, 139, 201, 203, 232, 203, 226, 208, 236, 206, 230, 212, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 169, 0, 119, - 0, 78, 0, 0, 0, 0, 0, 0, 0, 0, 0, 68, 0, 0, 133, - 200, 180, 220, 197, 228, 201, 221, 184, 213, 193, 110, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 78, 0, 164, 0, 0, 0, 0, 0, 107, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 164, 202, 182, 224, - 197, 211, 179, 212, 193, 134, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 85, 0, 150, 0, 85, 0, 95, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 102, 90, 193, 160, 203, 164, 200, 178, 205, 174, - 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 120, 114, 123, 0, 114, - 0, 145, 68, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 102, 68, 199, 170, 195, 180, 208, 176, 200, 164, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 110, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 142, 102, 172, 110, 186, - 167, 185, 147, 189, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 177, 0, 158, 136, 197, 155, 189, 166, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 85, 0, 155, 90, 175, 117, 175, 138, 202, 165, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 51, 0, 139, - 0, 120, 68, 51, 123, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 119, 0, 78, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +const signed char g_yes_micro_f2e59fea_nohash_1_data[] = { + 116, 98, 118, 95, 106, 85, 101, 81, 67, -18, -33, -12, + -26, -128, 9, 34, 56, 45, 9, -12, 5, 30, 23, 28, + 0, -18, 0, -128, -60, -50, -50, -37, -60, -60, -50, -26, + -33, -50, -33, -50, 83, 61, 81, 55, 76, 61, 73, 64, + 38, -8, -37, -20, -18, -20, 48, 29, 52, 41, 55, 18, + 25, 37, 44, 37, 8, 15, -6, -60, -128, -50, -37, -37, + -18, -37, -26, -29, -37, -60, -50, -60, 95, 59, 52, -4, + 54, -18, 68, 43, 31, -18, -26, -33, -37, -29, 33, 7, + -3, 8, 26, 24, 36, 6, 36, 23, 14, 8, -29, -37, + -37, -37, -50, -50, -26, -8, -26, -37, -18, -37, -60, -77, + 50, 48, 83, 44, 56, -128, -33, -60, 1, -26, -60, -43, + -14, -23, -18, -43, -26, -33, 13, -77, -43, -77, -33, -37, + 16, -12, -37, -50, -50, -77, -20, -43, -60, -128, -60, -77, + -37, -77, -60, -128, 37, -10, 65, -7, 28, -128, 10, -77, + -37, -128, -77, -128, -77, -43, -128, -128, -77, -128, -128, -128, + -128, -128, -14, -128, -43, -50, -37, -77, -128, -128, -77, -43, + -29, -43, -20, -60, -37, -43, -50, -128, -77, -128, -18, -128, + -60, -128, -128, -128, -77, -128, -77, -128, -128, -128, -60, -37, + -20, -128, -60, -128, -128, -128, -60, -128, -77, -60, -128, -50, + -60, -128, -77, -128, -50, -60, -37, -60, -50, -77, -77, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -37, -128, + -128, -128, -128, -128, -77, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -77, -60, -128, -128, -50, -128, -50, -128, + -50, -128, -77, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -77, -128, -77, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -77, -128, -77, -128, -77, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -77, -128, -128, -128, + -128, -77, -50, -128, -128, -77, -77, -128, -128, -128, -50, -128, + 85, 43, 65, 53, 69, 60, 45, 3, 46, -12, 9, -23, + 32, -1, -128, -128, -128, -128, -1, 37, 38, 33, 43, 36, + 58, 70, 68, 39, 6, 10, 32, 6, 8, -23, -77, -128, + -29, -128, -77, -128, 101, 87, 102, 91, 110, 88, 101, 83, + 110, 95, 111, 83, 81, 84, 106, 90, 93, 82, 98, 91, + 108, 95, 118, 97, 118, 97, 116, 96, 113, 90, 110, 96, + 107, 85, 94, 66, 69, 36, 29, 0, 100, 60, 105, 68, + 92, 93, 113, 92, 107, 85, 107, 83, 104, 91, 105, 85, + 112, 88, 101, 80, 101, 79, 96, 80, 98, 80, 105, 83, + 98, 81, 103, 71, 100, 79, 83, 78, 91, 47, 50, 13, + 108, 81, 93, 78, 98, 76, 105, 76, 98, 
40, 77, 72, + 81, 62, 93, 77, 96, 80, 98, 61, 97, 69, 88, 61, + 71, 56, 98, 68, 97, 72, 89, 51, 81, 61, 88, 75, + 86, 56, 48, 13, 71, 22, 84, 66, 76, -7, 48, 61, + 77, 62, 91, 65, 95, 74, 88, 59, 75, 58, 83, 55, + 87, 55, 76, 43, 76, -3, 56, 60, 79, 57, 71, 54, + 82, 33, 74, 71, 91, 45, 18, -7, 61, 56, 77, 41, + 73, 42, 82, 49, 59, 63, 82, 65, 66, 38, 83, 34, + 48, -8, 46, 20, 54, 33, 54, 6, 48, 16, 60, 37, + 58, 22, 58, 14, 65, 53, 75, -4, 42, 16, 16, -50, + 22, -128, 80, 54, 43, -50, 42, -128, -10, -77, 28, -29, + 68, 43, 73, 2, 25, -60, 47, 14, 45, 7, 66, 4, + 62, 37, 71, 7, 46, -10, 44, 22, 55, 53, 57, -29, + 26, -10, -3, -128, 38, -128, 46, -10, 16, -128, -10, -26, + 60, -7, 65, 38, 70, -60, 35, -8, 42, -29, 6, -128, + 34, -128, 36, -60, 44, -12, -2, -128, -7, -60, -60, -128, + -23, -128, 31, -33, 22, -77, -37, -43, -128, -128, 3, -128, + -23, -128, 17, -77, 43, -77, -7, -128, -20, -128, 17, -43, + 32, -128, -43, -128, -128, -77, 21, -128, -50, -128, -128, -128, + -128, -128, -128, -128, -37, -128, -16, -128, -50, -26, -6, -128, + -128, -128, -128, -128, -23, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -16, -128, 36, -7, 16, -128, -128, -128, -128, -128, + -77, -128, -37, -128, -50, -128, -128, -128, -128, -128, -18, -128, + 11, -128, -16, -77, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -26, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -20, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -50, -128, -77, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -77, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -1, -18, 5, -128, + 40, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, 4, -128, 63, 66, 75, -128, + 70, 60, 34, -128, -128, -128, -128, -128, -128, -128, -128, -128, + 87, 86, 95, 76, 91, 62, 72, -6, -50, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, 64, 83, 104, 70, + 98, 90, 111, 89, 109, 80, 71, -128, -128, -128, -128, -128, + -20, -6, 27, 33, 86, 88, 108, 75, 108, 76, 98, 64, + 75, 61, 71, 66, 85, -1, -77, -128, 46, 61, 92, 69, + 100, 93, 113, 80, 108, 93, 113, 91, 110, 80, 85, 15, + -33, -128, 12, -50, 34, 50, 70, 55, 84, 72, 108, 81, + 111, 88, 100, 80, 84, 73, 97, 86, 99, 65, 85, 43, + 96, 78, 107, 94, 118, 98, 115, 92, 118, 94, 111, 93, + 111, 86, 99, 52, 32, -16, 48, 31, 81, 74, 85, 64, + 78, 64, 98, 70, 110, 92, 96, 73, 100, 72, 94, 73, + 98, 76, 85, 67, 101, 83, 101, 83, 112, 89, 98, 85, + 105, 78, 98, 72, 102, 80, 95, 23, 19, -8, 52, 57, + 103, 91, 95, 65, 74, 8, 77, 49, 96, 76, 100, 87, + 105, 81, 94, 62, 94, 78, 81, 72, 99, 82, 101, 78, + 108, 65, 82, 70, 100, 63, 79, 58, 80, 59, 87, 48, + 50, 57, 93, 67, 86, 80, 103, 56, 77, 31, 81, 57, + 62, 41, 96, 85, 91, 71, 101, 76, 89, 78, 95, 76, + 96, 79, 103, 81, 103, 48, 70, 57, 88, 66, 84, 11, + 85, 67, 104, 37, 38, 67, 90, 54, 81, 
62, 90, 52, + 78, -60, 54, -8, 68, 40, 55, 8, 77, 52, 66, 31, + 55, 13, 60, 26, 69, 42, 63, -29, 57, -128, -3, -128, + 3, -128, -29, -60, 52, -43, 63, 56, 86, 75, 95, 75, + 85, 63, 82, 10, 50, -128, 31, -77, 0, -77, -23, -128, + 12, -77, 51, -3, 58, -14, 44, 0, 48, 4, 53, 47, + 28, -128, -128, -128, -37, -128, -3, -128, 49, 61, 100, 90, + 117, 88, 107, 94, 112, 64, 96, 83, -128, -128, 7, -128, + -77, -128, -23, -128, -23, -128, 16, -37, 65, -8, 48, 20, + 14, -77, 57, -18, -43, -128, -128, -128, -128, -128, -128, -128, + 24, 12, 74, 76, 105, 76, 99, 80, 108, 79, 103, 85, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + 42, -128, -8, -128, -50, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -60, -128, -128, 5, 73, 53, 93, 70, 101, 73, + 94, 57, 86, 66, -18, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -50, -128, 36, -128, -128, -128, -128, -128, -20, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 23, 37, + 75, 54, 97, 70, 83, 52, 85, 65, 7, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -43, -128, 23, -128, -43, -128, + -33, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -26, -37, 65, 33, 76, 37, 73, 50, 77, 47, + -12, -128, -128, -128, -128, -128, -128, -128, -128, -128, -7, -14, + -4, -128, -14, -128, 18, -60, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -26, -60, 71, 42, 68, 53, + 81, 49, 73, 36, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -18, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 15, -26, + 44, -18, 59, 39, 57, 20, 62, 26, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, 49, -128, 30, 8, 69, 27, 62, 38, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -43, -128, 28, -37, 48, -10, + 48, 11, 74, 37, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -77, -128, 11, -128, -7, -60, -77, -4, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -8, -128, -50, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, 
-128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, }; diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h b/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h index 07eccc35f4e..cd1ad10888e 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h +++ b/tensorflow/lite/micro/examples/micro_speech/micro_features/yes_micro_features_data.h @@ -18,6 +18,6 @@ limitations under the License. extern const int g_yes_micro_f2e59fea_nohash_1_width; extern const int g_yes_micro_f2e59fea_nohash_1_height; -extern const unsigned char g_yes_micro_f2e59fea_nohash_1_data[]; +extern const signed char g_yes_micro_f2e59fea_nohash_1_data[]; #endif // TENSORFLOW_LITE_MICRO_EXAMPLES_MICRO_SPEECH_MICRO_FEATURES_YES_MICRO_FEATURES_DATA_H_ diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc index ca090ec9524..a6e011b1224 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc @@ -48,14 +48,19 @@ TF_LITE_MICRO_TEST(TestInvoke) { // needed by this graph. // // tflite::ops::micro::AllOpsResolver resolver; - tflite::MicroOpResolver<3> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); + tflite::MicroOpResolver<4> micro_op_resolver; + micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), + tflite::MicroOpResolverAnyVersion()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); + tflite::ops::micro::Register_FULLY_CONNECTED(), + tflite::MicroOpResolverAnyVersion()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + tflite::ops::micro::Register_SOFTMAX(), + tflite::MicroOpResolverAnyVersion()); + micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, + tflite::ops::micro::Register_RESHAPE(), + tflite::MicroOpResolverAnyVersion()); // Create an area of memory to use for input, output, and intermediate arrays. const int tensor_arena_size = 10 * 1024; @@ -71,18 +76,16 @@ TF_LITE_MICRO_TEST(TestInvoke) { // Make sure the input has the properties we expect. 
TF_LITE_MICRO_EXPECT_NE(nullptr, input); - TF_LITE_MICRO_EXPECT_EQ(4, input->dims->size); + TF_LITE_MICRO_EXPECT_EQ(2, input->dims->size); TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[0]); - TF_LITE_MICRO_EXPECT_EQ(49, input->dims->data[1]); - TF_LITE_MICRO_EXPECT_EQ(40, input->dims->data[2]); - TF_LITE_MICRO_EXPECT_EQ(1, input->dims->data[3]); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, input->type); + TF_LITE_MICRO_EXPECT_EQ(1960, input->dims->data[1]); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt8, input->type); // Copy a spectrogram created from a .wav audio file of someone saying "Yes", // into the memory area used for the input. - const uint8_t* yes_features_data = g_yes_micro_f2e59fea_nohash_1_data; + const int8_t* yes_features_data = g_yes_micro_f2e59fea_nohash_1_data; for (int i = 0; i < input->bytes; ++i) { - input->data.uint8[i] = yes_features_data[i]; + input->data.int8[i] = yes_features_data[i]; } // Run the model on this input and make sure it succeeds. @@ -98,7 +101,7 @@ TF_LITE_MICRO_TEST(TestInvoke) { TF_LITE_MICRO_EXPECT_EQ(2, output->dims->size); TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]); TF_LITE_MICRO_EXPECT_EQ(4, output->dims->data[1]); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, output->type); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt8, output->type); // There are four possible classes in the output, each with a score. const int kSilenceIndex = 0; @@ -107,18 +110,18 @@ TF_LITE_MICRO_TEST(TestInvoke) { const int kNoIndex = 3; // Make sure that the expected "Yes" score is higher than the other classes. - uint8_t silence_score = output->data.uint8[kSilenceIndex]; - uint8_t unknown_score = output->data.uint8[kUnknownIndex]; - uint8_t yes_score = output->data.uint8[kYesIndex]; - uint8_t no_score = output->data.uint8[kNoIndex]; + uint8_t silence_score = output->data.uint8[kSilenceIndex] + 128; + uint8_t unknown_score = output->data.uint8[kUnknownIndex] + 128; + uint8_t yes_score = output->data.int8[kYesIndex] + 128; + uint8_t no_score = output->data.int8[kNoIndex] + 128; TF_LITE_MICRO_EXPECT_GT(yes_score, silence_score); TF_LITE_MICRO_EXPECT_GT(yes_score, unknown_score); TF_LITE_MICRO_EXPECT_GT(yes_score, no_score); // Now test with a different input, from a recording of "No". - const uint8_t* no_features_data = g_no_micro_f9643d42_nohash_4_data; + const int8_t* no_features_data = g_no_micro_f9643d42_nohash_4_data; for (int i = 0; i < input->bytes; ++i) { - input->data.uint8[i] = no_features_data[i]; + input->data.int8[i] = no_features_data[i]; } // Run the model on this "No" input. @@ -134,13 +137,13 @@ TF_LITE_MICRO_TEST(TestInvoke) { TF_LITE_MICRO_EXPECT_EQ(2, output->dims->size); TF_LITE_MICRO_EXPECT_EQ(1, output->dims->data[0]); TF_LITE_MICRO_EXPECT_EQ(4, output->dims->data[1]); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteUInt8, output->type); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt8, output->type); // Make sure that the expected "No" score is higher than the other classes. 
- silence_score = output->data.uint8[kSilenceIndex]; - unknown_score = output->data.uint8[kUnknownIndex]; - yes_score = output->data.uint8[kYesIndex]; - no_score = output->data.uint8[kNoIndex]; + silence_score = output->data.int8[kSilenceIndex] + 128; + unknown_score = output->data.int8[kUnknownIndex] + 128; + yes_score = output->data.int8[kYesIndex] + 128; + no_score = output->data.int8[kNoIndex] + 128; TF_LITE_MICRO_EXPECT_GT(no_score, silence_score); TF_LITE_MICRO_EXPECT_GT(no_score, unknown_score); TF_LITE_MICRO_EXPECT_GT(no_score, yes_score); diff --git a/tensorflow/lite/micro/examples/micro_speech/recognize_commands.cc b/tensorflow/lite/micro/examples/micro_speech/recognize_commands.cc index 96f35984051..47bd10074d3 100644 --- a/tensorflow/lite/micro/examples/micro_speech/recognize_commands.cc +++ b/tensorflow/lite/micro/examples/micro_speech/recognize_commands.cc @@ -47,10 +47,10 @@ TfLiteStatus RecognizeCommands::ProcessLatestResults( return kTfLiteError; } - if (latest_results->type != kTfLiteUInt8) { + if (latest_results->type != kTfLiteInt8) { TF_LITE_REPORT_ERROR( error_reporter_, - "The results for recognition should be uint8 elements, but are %d", + "The results for recognition should be int8 elements, but are %d", latest_results->type); return kTfLiteError; } @@ -66,7 +66,7 @@ TfLiteStatus RecognizeCommands::ProcessLatestResults( } // Add the latest results to the head of the queue. - previous_results_.push_back({current_time_ms, latest_results->data.uint8}); + previous_results_.push_back({current_time_ms, latest_results->data.int8}); // Prune any earlier results that are too old for the averaging window. const int64_t time_limit = current_time_ms - average_window_duration_ms_; @@ -93,12 +93,12 @@ TfLiteStatus RecognizeCommands::ProcessLatestResults( for (int offset = 0; offset < previous_results_.size(); ++offset) { PreviousResultsQueue::Result previous_result = previous_results_.from_front(offset); - const uint8_t* scores = previous_result.scores_; + const int8_t* scores = previous_result.scores; for (int i = 0; i < kCategoryCount; ++i) { if (offset == 0) { - average_scores[i] = scores[i]; + average_scores[i] = scores[i] + 128; } else { - average_scores[i] += scores[i]; + average_scores[i] += scores[i] + 128; } } } diff --git a/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h b/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h index 059d567fb20..67bdb31bed9 100644 --- a/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h +++ b/tensorflow/lite/micro/examples/micro_speech/recognize_commands.h @@ -36,14 +36,14 @@ class PreviousResultsQueue { // Data structure that holds an inference result, and the time when it // was recorded. 
struct Result { - Result() : time_(0), scores_() {} - Result(int32_t time, uint8_t* scores) : time_(time) { + Result() : time_(0), scores() {} + Result(int32_t time, int8_t* input_scores) : time_(time) { for (int i = 0; i < kCategoryCount; ++i) { - scores_[i] = scores[i]; + scores[i] = input_scores[i]; } } int32_t time_; - uint8_t scores_[kCategoryCount]; + int8_t scores[kCategoryCount]; }; int size() { return size_; } diff --git a/tensorflow/lite/micro/examples/micro_speech/recognize_commands_test.cc b/tensorflow/lite/micro/examples/micro_speech/recognize_commands_test.cc index 70911a81776..dcff73cf7ee 100644 --- a/tensorflow/lite/micro/examples/micro_speech/recognize_commands_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/recognize_commands_test.cc @@ -27,13 +27,13 @@ TF_LITE_MICRO_TEST(PreviousResultsQueueBasic) { PreviousResultsQueue queue(error_reporter); TF_LITE_MICRO_EXPECT_EQ(0, queue.size()); - uint8_t scores_a[4] = {0, 0, 0, 1}; + int8_t scores_a[4] = {0, 0, 0, 1}; queue.push_back({0, scores_a}); TF_LITE_MICRO_EXPECT_EQ(1, queue.size()); TF_LITE_MICRO_EXPECT_EQ(0, queue.front().time_); TF_LITE_MICRO_EXPECT_EQ(0, queue.back().time_); - uint8_t scores_b[4] = {0, 0, 1, 0}; + int8_t scores_b[4] = {0, 0, 1, 0}; queue.push_back({1, scores_b}); TF_LITE_MICRO_EXPECT_EQ(2, queue.size()); TF_LITE_MICRO_EXPECT_EQ(0, queue.front().time_); @@ -45,7 +45,7 @@ TF_LITE_MICRO_TEST(PreviousResultsQueueBasic) { TF_LITE_MICRO_EXPECT_EQ(1, queue.front().time_); TF_LITE_MICRO_EXPECT_EQ(1, queue.back().time_); - uint8_t scores_c[4] = {0, 1, 0, 0}; + int8_t scores_c[4] = {0, 1, 0, 0}; queue.push_back({2, scores_c}); TF_LITE_MICRO_EXPECT_EQ(2, queue.size()); TF_LITE_MICRO_EXPECT_EQ(1, queue.front().time_); @@ -60,7 +60,7 @@ TF_LITE_MICRO_TEST(PreviousResultsQueuePushPop) { TF_LITE_MICRO_EXPECT_EQ(0, queue.size()); for (int i = 0; i < 123; ++i) { - uint8_t scores[4] = {0, 0, 0, 1}; + int8_t scores[4] = {0, 0, 0, 1}; queue.push_back({i, scores}); TF_LITE_MICRO_EXPECT_EQ(1, queue.size()); TF_LITE_MICRO_EXPECT_EQ(i, queue.front().time_); @@ -78,11 +78,11 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestBasic) { RecognizeCommands recognize_commands(error_reporter); - std::initializer_list result_data = {255, 0, 0, 0}; + std::initializer_list result_data = {127, -128, -128, -128}; auto result_dims = {2, 1, 4}; TfLiteTensor results = tflite::testing::CreateQuantizedTensor( result_data, tflite::testing::IntArrayFromInitializer(result_dims), - "input_tensor", 0.0f, 128.0f); + "input_tensor", -128.0f, 127.0f); const char* found_command; uint8_t score; @@ -98,11 +98,11 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) { RecognizeCommands recognize_commands(error_reporter, 1000, 51); - std::initializer_list yes_data = {0, 0, 255, 0}; + std::initializer_list yes_data = {-128, -128, 127, -128}; auto yes_dims = {2, 1, 4}; TfLiteTensor yes_results = tflite::testing::CreateQuantizedTensor( yes_data, tflite::testing::IntArrayFromInitializer(yes_dims), - "input_tensor", 0.0f, 128.0f); + "input_tensor", -128.0f, 127.0f); bool has_found_new_command = false; const char* new_command; @@ -126,11 +126,11 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestFindCommands) { TF_LITE_MICRO_EXPECT_EQ(0, tflite::testing::TestStrcmp("yes", new_command)); } - std::initializer_list no_data = {0, 0, 0, 255}; + std::initializer_list no_data = {-128, -128, -128, 127}; auto no_dims = {2, 1, 4}; TfLiteTensor no_results = tflite::testing::CreateQuantizedTensor( no_data, tflite::testing::IntArrayFromInitializer(no_dims), - "input_tensor", 
0.0f, 128.0f); + "input_tensor", -128.0f, 127.0f); has_found_new_command = false; new_command = ""; uint8_t score; @@ -161,11 +161,11 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputLength) { RecognizeCommands recognize_commands(error_reporter, 1000, 51); - std::initializer_list bad_data = {0, 0, 255}; + std::initializer_list bad_data = {-128, -128, 127}; auto bad_dims = {2, 1, 3}; TfLiteTensor bad_results = tflite::testing::CreateQuantizedTensor( bad_data, tflite::testing::IntArrayFromInitializer(bad_dims), - "input_tensor", 0.0f, 128.0f); + "input_tensor", -128.0f, 127.0f); const char* found_command; uint8_t score; @@ -181,11 +181,11 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestBadInputTimes) { RecognizeCommands recognize_commands(error_reporter, 1000, 51); - std::initializer_list result_data = {0, 0, 255, 0}; + std::initializer_list result_data = {-128, -128, 127, -128}; auto result_dims = {2, 1, 4}; TfLiteTensor results = tflite::testing::CreateQuantizedTensor( result_data, tflite::testing::IntArrayFromInitializer(result_dims), - "input_tensor", 0.0f, 128.0f); + "input_tensor", -128.0f, 127.0f); const char* found_command; uint8_t score; @@ -204,11 +204,11 @@ TF_LITE_MICRO_TEST(RecognizeCommandsTestTooFewInputs) { RecognizeCommands recognize_commands(error_reporter, 1000, 51); - std::initializer_list result_data = {0, 0, 255, 0}; + std::initializer_list result_data = {-128, -128, 127, -128}; auto result_dims = {2, 1, 4}; TfLiteTensor results = tflite::testing::CreateQuantizedTensor( result_data, tflite::testing::IntArrayFromInitializer(result_dims), - "input_tensor", 0.0f, 128.0f); + "input_tensor", -128.0f, 127.0f); const char* found_command; uint8_t score; From 0e4e0c593bf7957aefd29818e2d24caee00c841a Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Tue, 26 May 2020 13:22:10 -0700 Subject: [PATCH 1139/1533] [TF/XLA] Ignore _noinline inside force-compiled clusters The code surrounding the handling of _noinline functions is very rarely hit, and as a result is not well tested. For now, the better approach is to follow a more well-lit codepath and try to minimize the use of _noinline functions. As a starting point, inline blocks even with _noinline inside force-compiled blocks. PiperOrigin-RevId: 313256383 Change-Id: If2f60aac933ac8e27f3dcb65bf6b389611c45bd7 --- tensorflow/compiler/tf2xla/BUILD | 1 + tensorflow/compiler/tf2xla/xla_compiler.cc | 7 +++++++ .../core/common_runtime/graph_optimizer.cc | 17 +++++++++------ .../core/common_runtime/graph_optimizer.h | 6 +++++- .../python/eager/def_function_xla_jit_test.py | 21 +++++++++++++++++++ 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 55341c0a01f..37110442b26 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -350,6 +350,7 @@ cc_library( ":sharding_util", ":side_effect_util", ":tf2xla_util", + "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", "//tensorflow/compiler/jit:xla_cluster_util", diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 3d6083621f4..24ad1e1e311 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/types/variant.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" @@ -571,6 +572,10 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { std::unique_ptr graph(new Graph(options_.flib_def)); CopyGraph(*fbody->graph, graph.get()); + bool is_inside_mustcompile; + TryGetNodeAttr(AttrSlice(&fbody->fdef.attr()), kXlaMustCompileAttr, + &is_inside_mustcompile); + // Performs a first function inlining pass before shape inference, since // otherwise shape inference can't see inside functions and a comprehensive // shape_map, including function ops, is needed to constant-propagate Shape @@ -622,6 +627,8 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { graph_optimizer_options.inline_multi_device_functions = true; graph_optimizer_options.inline_impl_selection_group_functions = true; graph_optimizer_options.inline_with_single_device_body_placer = true; + graph_optimizer_options.ignore_noinline = is_inside_mustcompile; + optimizer.Optimize(flib_runtime_, flib_runtime_->env(), /*device=*/nullptr, &graph, graph_optimizer_options); diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index 746930750ad..ae1a2daa788 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -42,7 +42,7 @@ void GraphOptimizer::Optimize( const NodePredicate& cse_consider_fn, const NodePredicate& cf_consider_fn, bool inline_multi_device_functions, bool inline_impl_selection_group_functions, - bool inline_with_single_device_body_placer) { + bool inline_with_single_device_body_placer, bool ignore_noinline) { Graph* g = graph->get(); DumpGraph("Initial", g); @@ -116,6 +116,11 @@ void GraphOptimizer::Optimize( .inline_impl_selection_group_functions = true; } + if (ignore_noinline) { + expand_inline_opts.multi_device_options.ignore_noinline = true; + expand_inline_opts.native_options.ignore_noinline = true; + } + bool was_mutated = ExpandInlineFunctions(runtime, g, expand_inline_opts); if (was_mutated) { DumpGraph("ExpandInlineFunctions", g); @@ -138,11 +143,11 @@ void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Env* env, const Device* device, std::unique_ptr* graph, const Options& options) { - Optimize(runtime, env, device, graph, options.shape_map, - options.cse_consider_fn, options.cf_consider_fn, - options.inline_multi_device_functions, - options.inline_impl_selection_group_functions, - options.inline_with_single_device_body_placer); + Optimize( + runtime, env, device, graph, options.shape_map, options.cse_consider_fn, + options.cf_consider_fn, options.inline_multi_device_functions, + options.inline_impl_selection_group_functions, + options.inline_with_single_device_body_placer, options.ignore_noinline); } void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr* g, diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h index 099ea8efa12..53bf532bd9c 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.h +++ b/tensorflow/core/common_runtime/graph_optimizer.h @@ -58,6 +58,9 @@ class GraphOptimizer { // If true all functions will be inlined with a single device function // body placer strategy. 
bool inline_with_single_device_body_placer = false; + + // If true, the _noinline attribute on functions and callers is ignored. + bool ignore_noinline = false; }; explicit GraphOptimizer(const OptimizerOptions& opts); @@ -81,7 +84,8 @@ class GraphOptimizer { const NodePredicate& cf_consider_fn = nullptr, bool inline_multi_device_functions = false, bool inline_impl_selection_group_functions = false, - bool inline_with_single_device_body_placer = false); + bool inline_with_single_device_body_placer = false, + bool ignore_noinline = false); const OptimizerOptions& options() { return opts_; } diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py index 5fdf0487333..b63a3b434d4 100644 --- a/tensorflow/python/eager/def_function_xla_jit_test.py +++ b/tensorflow/python/eager/def_function_xla_jit_test.py @@ -355,6 +355,27 @@ class DefFunctionTest(test.TestCase): self.assertAllClose([5.0, 5.0, 5.0], g()) self.assertAllClose(compiled_g(), g()) + def testTensorListConcatGradNestedCompile(self): + + @def_function.function(experimental_compile=True) + def f(x): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, size=2, element_shape=[3]) + ta = ta.write(0, 2 * x) + ta = ta.write(1, 3 * x) + return ta.concat() + + @def_function.function(experimental_compile=True) + def g(): + x = constant_op.constant([3.14, 2.68, 7.69]) + with backprop.GradientTape() as tape: + tape.watch(x) + y = f(x) + out = tape.gradient(y, x) + return out + + self.assertAllClose([5.0, 5.0, 5.0], g()) + def testCumsum(self): @def_function.function(experimental_compile=True) From ee7bffc10cf9e8d56e51d8b9ae3212f7e3c27b09 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 26 May 2020 20:32:18 +0000 Subject: [PATCH 1140/1533] Update update_confusion_matrix_variables to always cast to variables_to_update dtype (vs. explicit float32) This commit updates the function update_confusion_matrix_variables to always cast to a dtype based on variables_to_update (previously the values were cast to float32 explicitly, and that causes issues when the Keras backend uses non-float32). Signed-off-by: Yong Tang --- .../python/keras/utils/metrics_utils.py | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/keras/utils/metrics_utils.py b/tensorflow/python/keras/utils/metrics_utils.py index 5cb6fc5f9f8..5f9b57c095e 100644 --- a/tensorflow/python/keras/utils/metrics_utils.py +++ b/tensorflow/python/keras/utils/metrics_utils.py @@ -299,9 +299,19 @@ def update_confusion_matrix_variables(variables_to_update, '`multi_label` is True.') if variables_to_update is None: return - y_true = math_ops.cast(y_true, dtype=dtypes.float32) - y_pred = math_ops.cast(y_pred, dtype=dtypes.float32) - thresholds = ops.convert_to_tensor_v2(thresholds, dtype=dtypes.float32) + if not any( + key for key in variables_to_update if key in list(ConfusionMatrix)): + raise ValueError( + 'Please provide at least one valid confusion matrix ' + 'variable to update. Valid variable key options are: "{}". 
' + 'Received: "{}"'.format( + list(ConfusionMatrix), variables_to_update.keys())) + + variable_dtype = list(variables_to_update.values())[0].dtype + + y_true = math_ops.cast(y_true, dtype=variable_dtype) + y_pred = math_ops.cast(y_pred, dtype=variable_dtype) + thresholds = ops.convert_to_tensor_v2(thresholds, dtype=variable_dtype) num_thresholds = thresholds.shape[0] if multi_label: one_thresh = math_ops.equal( @@ -314,14 +324,6 @@ def update_confusion_matrix_variables(variables_to_update, sample_weight) one_thresh = math_ops.cast(True, dtype=dtypes.bool) - if not any( - key for key in variables_to_update if key in list(ConfusionMatrix)): - raise ValueError( - 'Please provide at least one valid confusion matrix ' - 'variable to update. Valid variable key options are: "{}". ' - 'Received: "{}"'.format( - list(ConfusionMatrix), variables_to_update.keys())) - invalid_keys = [ key for key in variables_to_update if key not in list(ConfusionMatrix) ] @@ -401,7 +403,7 @@ def update_confusion_matrix_variables(variables_to_update, if sample_weight is not None: sample_weight = weights_broadcast_ops.broadcast_weights( - math_ops.cast(sample_weight, dtype=dtypes.float32), y_pred) + math_ops.cast(sample_weight, dtype=variable_dtype), y_pred) weights_tiled = array_ops.tile( array_ops.reshape(sample_weight, thresh_tiles), data_tiles) else: From 7280cb31337d73a2fc869d7ba7cdcaf5295ad26e Mon Sep 17 00:00:00 2001 From: Robert David Date: Tue, 26 May 2020 13:48:18 -0700 Subject: [PATCH 1141/1533] EGL extension functions must be opened using eglGetProcAddress, as they might not be supported at runtime. PiperOrigin-RevId: 313260643 Change-Id: Ic7756dba33fde7b7150abc4652aa503f7e99e310 --- tensorflow/lite/delegates/gpu/cl/egl_sync.cc | 40 +++++++++++++++++--- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/egl_sync.cc b/tensorflow/lite/delegates/gpu/cl/egl_sync.cc index ddc373bce31..f50bc75b8be 100644 --- a/tensorflow/lite/delegates/gpu/cl/egl_sync.cc +++ b/tensorflow/lite/delegates/gpu/cl/egl_sync.cc @@ -22,8 +22,15 @@ namespace gpu { namespace cl { absl::Status EglSync::NewFence(EGLDisplay display, EglSync* sync) { + static auto* egl_create_sync_khr = + reinterpret_cast( + eglGetProcAddress("eglCreateSyncKHR")); + if (egl_create_sync_khr == nullptr) { + // Needs extension: EGL_KHR_fence_sync (EGL) / GL_OES_EGL_sync (OpenGL ES). + return absl::InternalError("Not supported: eglCreateSyncKHR."); + } EGLSyncKHR egl_sync; - RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglCreateSyncKHR, &egl_sync, display, + RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(*egl_create_sync_khr, &egl_sync, display, EGL_SYNC_FENCE_KHR, nullptr)); if (egl_sync == EGL_NO_SYNC_KHR) { return absl::InternalError("Returned empty KHR EGL sync"); @@ -43,25 +50,46 @@ EglSync& EglSync::operator=(EglSync&& sync) { void EglSync::Invalidate() { if (sync_ != EGL_NO_SYNC_KHR) { - eglDestroySyncKHR(display_, sync_); + static auto* egl_destroy_sync_khr = + reinterpret_cast( + eglGetProcAddress("eglDestroySyncKHR")); + // Needs extension: EGL_KHR_fence_sync (EGL) / GL_OES_EGL_sync (OpenGL ES). + if (egl_destroy_sync_khr) { + // Note: we're doing nothing when the function pointer is nullptr, or the + // call returns EGL_FALSE. 
+ (*egl_destroy_sync_khr)(display_, sync_); + } sync_ = EGL_NO_SYNC_KHR; } } absl::Status EglSync::ServerWait() { + static auto* egl_wait_sync_khr = reinterpret_cast( + eglGetProcAddress("eglWaitSyncKHR")); + if (egl_wait_sync_khr == nullptr) { + // Needs extension: EGL_KHR_wait_sync + return absl::InternalError("Not supported: eglWaitSyncKHR."); + } EGLint result; RETURN_IF_ERROR( - TFLITE_GPU_CALL_EGL(eglWaitSyncKHR, &result, display_, sync_, 0)); + TFLITE_GPU_CALL_EGL(*egl_wait_sync_khr, &result, display_, sync_, 0)); return result == EGL_TRUE ? absl::OkStatus() : absl::InternalError("eglWaitSync failed"); } absl::Status EglSync::ClientWait() { + static auto* egl_client_wait_sync_khr = + reinterpret_cast( + eglGetProcAddress("eglClientWaitSyncKHR")); + if (egl_client_wait_sync_khr == nullptr) { + // Needs extension: EGL_KHR_fence_sync (EGL) / GL_OES_EGL_sync (OpenGL ES). + return absl::InternalError("Not supported: eglClientWaitSyncKHR."); + } EGLint result; // TODO(akulik): make it active wait for better performance - RETURN_IF_ERROR(TFLITE_GPU_CALL_EGL(eglClientWaitSyncKHR, &result, display_, - sync_, EGL_SYNC_FLUSH_COMMANDS_BIT_KHR, - EGL_FOREVER_KHR)); + RETURN_IF_ERROR( + TFLITE_GPU_CALL_EGL(*egl_client_wait_sync_khr, &result, display_, sync_, + EGL_SYNC_FLUSH_COMMANDS_BIT_KHR, EGL_FOREVER_KHR)); return result == EGL_CONDITION_SATISFIED_KHR ? absl::OkStatus() : absl::InternalError("eglClientWaitSync failed"); From d847948f59a89ffab2182511944869fe996d715d Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Thu, 7 May 2020 18:17:52 +0200 Subject: [PATCH 1142/1533] Enable BiasAdd op conversion in dynamic shape mode --- .../tf2tensorrt/convert/convert_nodes.cc | 35 ++++++- .../tf2tensorrt/convert/convert_nodes_test.cc | 92 ++++++++----------- 2 files changed, 70 insertions(+), 57 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index e791ff9ff60..4aca5efd6c9 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -404,6 +404,18 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, // Compare broadcast feasibility if (check_feasibility) { for (int i = 0; i < broadcast_num_dims; ++i) { + if (!use_implicit_batch && (output_l[i] == -1 || output_r[i] == -1)) { + // If the condition is true then we are in explicit batch mode and (at + // least) one of the input dimensions are unknown. In other words we + // are in dynamic shape mode. During conversion time we only see -1 for + // the unknown shapes, therefore we cannot decide on the feasibility of + // broadcast over the unknown dimensions. Therefore we just continue for + // the next dimension. In dynamic shape mode TRT can only check the + // feasibility of the broadcast when the actual input dimensions are + // specified by SetTrtEngineInputs and the inference job is launched by + // TrtEnque. + continue; + } if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && (output_r[i] != 1)) { return errors::InvalidArgument("Infeasible broadcast scheme (", @@ -3888,11 +3900,26 @@ Status ConvertBiasAdd(OpConverterParams* params) { nvinfer1::Dims input_shape = inputs.at(0).GetTrtDims(); nvinfer1::Dims bias_shape = inputs.at(1).GetTrtDims(); - // If the input is NCHW, then we need to unsqueeze the bias such that its last - // dimensions are 1s (and the first dimension is C). + // The bias input arg is a 1-D tensor with length C. 
If the input is NCHW, + // then we need to unsqueeze the bias such that its shape is [1, C, 1, 1]. if (data_format == "NCHW") { - bias_shape.nbDims = input_shape.nbDims; - std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1); + if (params->use_implicit_batch) { + // The batch dim is not included in implicit batch mode, so the shape of + // the bias tensor is [C, 1, 1]. + bias_shape.nbDims = input_shape.nbDims; + std::fill(bias_shape.d + 1, bias_shape.d + bias_shape.nbDims, 1); + } else { + // In explicit batch mode we create a tensor with shape [1, C, 1, 1]. + std::vector bias_shape_vec(bias_shape.d, + bias_shape.d + bias_shape.nbDims); + // Insert 1 before for batch dim + bias_shape_vec.insert(bias_shape_vec.begin(), 1); + // Trail with 1s to match input_shape size + bias_shape_vec.insert(bias_shape_vec.end(), + input_shape.nbDims - bias_shape_vec.size(), 1); + TF_RETURN_IF_ERROR( + TensorShapeArrayToTrtDims(bias_shape_vec, &bias_shape)); + } } else { // Next, broadcast the bias across the input. TF_RETURN_IF_ERROR(GetTrtBroadcastShape(inputs.at(0), inputs.at(1), diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index d4badd1cc03..c7b69818d3d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1862,6 +1862,13 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(DT_FLOAT), ::testing::Values(TrtPrecisionMode::FP32))); +// Base class for tests that need to be tested for both FP32 and FP16. +class OpConverterTest2 : public ParameterizedOpConverterTestBase {}; +INSTANTIATE_TEST_CASE_P( + OpConvTestInstantiation, OpConverterTest2, + ::testing::Combine(::testing::ValuesIn(ValidTrtModes), + ::testing::Values(DT_FLOAT, DT_HALF), + ::testing::Values(TrtPrecisionMode::FP32))); template void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { out->Clear(); @@ -2396,91 +2403,70 @@ TEST_F(OpConverterTest, ConvertBatchMatMul) { TestMatMulHelper(this, get_batch_matmul_nodedef, "BatchMatMul"); } -template -void TestConvertBiasAdd(OpConverterTest* test) { +TEST_P(OpConverterTest2, ConvertBiasAdd) { + // Note that kINT32 is not supported by IScaleLayer, so we don't test + // DT_INT32 type here. DT_FLOAT and DT_HALF are tested. // Get the NodeDef for BiasAdd. 
- auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef { + auto get_biasadd_nodedef = [](const string& data_format, + DataType tf_dtype) -> NodeDef { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName("input"), dtype); - auto weights = ops::Placeholder(s.WithOpName("weights"), dtype); + auto input = ops::Placeholder(s.WithOpName("input"), tf_dtype); + auto weights = ops::Placeholder(s.WithOpName("weights"), tf_dtype); const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format); auto biasadd = ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs); return biasadd.operation.node()->def(); }; - typedef typename EnumToDataType::Type CType; for (const string& data_format : {"NHWC", "NCHW"}) { for (const int trt_input_rank : {1, 2, 3, 4}) { - test->Reset(); - NodeDef node_def = get_biasadd_nodedef(data_format); + Reset(); + NodeDef node_def = get_biasadd_nodedef(data_format, tf_dtype); // Add input, dims_array will be like {2, 1, ..., 1, 3} - std::vector dims_array(trt_input_rank, 1); + std::vector dims_array(trt_input_rank + 1, 1); if (trt_input_rank == 1) { - dims_array[0] = (data_format == "NHWC" ? 3 : 2); + dims_array[1] = (data_format == "NHWC" ? 3 : 2); } else { - dims_array[0] = 2; - dims_array[trt_input_rank - 1] = 3; + dims_array[1] = 2; + dims_array[trt_input_rank] = 3; } - test->AddTestTensor("input", dims_array, /*batch_size=*/1, - TfDataTypeToTrt(dtype)); - - // Add bias weights. - const int channel_size = (data_format == "NHWC" ? 3 : 2); - std::vector bias(channel_size); - for (int i = 0; i < channel_size; ++i) { - bias[i] = CType(i + 1); // bias will be {1, 2, 3, ...} - } - test->AddTestWeights("weights", {channel_size}, bias); - - // Run the conversion. - test->RunValidationAndConversion(node_def); - TRT_TensorOrWeights output; - TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output)); - ASSERT_TRUE(output.is_tensor()); - ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions()); - - // Build and run the engine. const int num_input = TrtTensorDimsNumElements(GetTestDims(dims_array)); ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2), num_input); + std::vector input_data(num_input, 0); + + AddTestTensor("input", dims_array, input_data); + + const int channel_size = (data_format == "NHWC" ? 3 : 2); + std::vector bias(channel_size); + for (int i = 0; i < channel_size; ++i) { + bias[i] = i + 1; // bias will be {1, 2, 3, ...} + } + AddTestWeights("weights", {channel_size}, bias, tf_dtype); + + // Build and run the engine. 
+ std::vector output_data; - const DataVec input_data{ - {"input", test->ConstructTensor(num_input, CType(0))}}; - DataVec output_data{ - {"my_biasadd", test->ConstructTensor(num_input)}}; - TF_EXPECT_OK(test->BuildAndRun(input_data, &output_data)); if (trt_input_rank == 1) { if (data_format == "NHWC") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2), CType(3))); + output_data = {1, 2, 3}; } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2))); + output_data = {1, 2}; } } else { if (data_format == "NHWC") { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(2), CType(3), CType(1), - CType(2), CType(3))); + output_data = {1, 2, 3, 1, 2, 3}; } else { - EXPECT_THAT(GetSpanForData(output_data[0]), - ElementsAre(CType(1), CType(1), CType(1), CType(2), - CType(2), CType(2))); + output_data = {1, 1, 1, 2, 2, 2}; } } + TestOpConverter("my_biasadd", node_def, dims_array, Status::OK(), + Status::OK(), ElementsAreArray(output_data)); } } } -TEST_F(OpConverterTest, ConvertBiasAdd) { - // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test - // DT_INT32 type here. - TestConvertBiasAdd(this); - TestConvertBiasAdd(this); -} - template NodeDef GetBinaryOpNodeDef(const string& input_name_l, const string& input_name_r, DataType dtype) { From 7f2d8106f5d1dafde01a02c5bc442f5e9e20357f Mon Sep 17 00:00:00 2001 From: Fabio Riccardi Date: Tue, 26 May 2020 13:58:36 -0700 Subject: [PATCH 1143/1533] Introduce Vulkan API with integration tests. PiperOrigin-RevId: 313262552 Change-Id: I7d56bba03b7752938bd5f0cf5b08315941118369 --- tensorflow/lite/delegates/gpu/api.cc | 12 ++++++++++ tensorflow/lite/delegates/gpu/api.h | 33 ++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/api.cc b/tensorflow/lite/delegates/gpu/api.cc index 6c299e4965c..1a18fcb87f2 100644 --- a/tensorflow/lite/delegates/gpu/api.cc +++ b/tensorflow/lite/delegates/gpu/api.cc @@ -31,6 +31,12 @@ struct ObjectTypeGetter { ObjectType operator()(OpenClTexture) const { return ObjectType::OPENCL_TEXTURE; } + ObjectType operator()(VulkanBuffer) const { + return ObjectType::VULKAN_BUFFER; + } + ObjectType operator()(VulkanTexture) const { + return ObjectType::VULKAN_TEXTURE; + } ObjectType operator()(CpuMemory) const { return ObjectType::CPU_MEMORY; } }; @@ -42,6 +48,8 @@ struct ObjectValidityChecker { } bool operator()(OpenClBuffer obj) const { return obj.memobj; } bool operator()(OpenClTexture obj) const { return obj.memobj; } + bool operator()(VulkanBuffer obj) const { return obj.memory; } + bool operator()(VulkanTexture obj) const { return obj.memory; } bool operator()(CpuMemory obj) const { return obj.data != nullptr && obj.size_bytes > 0 && (data_type == DataType::UNKNOWN || @@ -81,6 +89,10 @@ bool IsObjectPresent(ObjectType type, const TensorObject& obj) { return absl::get_if(&obj); case ObjectType::OPENCL_TEXTURE: return absl::get_if(&obj); + case ObjectType::VULKAN_BUFFER: + return absl::get_if(&obj); + case ObjectType::VULKAN_TEXTURE: + return absl::get_if(&obj); case ObjectType::UNKNOWN: return false; } diff --git a/tensorflow/lite/delegates/gpu/api.h b/tensorflow/lite/delegates/gpu/api.h index 2a531f1f81b..1dfeeebd700 100644 --- a/tensorflow/lite/delegates/gpu/api.h +++ b/tensorflow/lite/delegates/gpu/api.h @@ -71,6 +71,8 @@ enum class ObjectType { CPU_MEMORY, OPENCL_TEXTURE, OPENCL_BUFFER, + VULKAN_BUFFER, + VULKAN_TEXTURE }; struct OpenGlBuffer { @@ -104,11 +106,37 @@ 
struct OpenClTexture { // TODO(akulik): should it specify texture format? }; +struct VulkanBuffer { + VulkanBuffer() = default; + explicit VulkanBuffer(VkBuffer buffer_, VkDeviceSize size_, + VkDeviceMemory memory_, VkDeviceSize offset_) + : buffer(buffer_), size(size_), memory(memory_), offset(offset_) {} + + VkBuffer buffer; + VkDeviceSize size; + VkDeviceMemory memory; + VkDeviceSize offset; +}; + +struct VulkanTexture { + VulkanTexture() = default; + explicit VulkanTexture(VkDeviceMemory new_memory) : memory(new_memory) {} + + VkImage image; + VkImageView image_view; + VkFormat format; + VkExtent3D extent; + VkDeviceMemory memory; + VkDeviceSize offset; +}; + struct VulkanMemory { VulkanMemory() = default; explicit VulkanMemory(VkDeviceMemory new_memory) : memory(new_memory) {} VkDeviceMemory memory; + VkDeviceSize size; + VkDeviceSize offset; }; struct CpuMemory { @@ -195,8 +223,9 @@ bool IsValid(const TensorObjectDef& def); // @return the number of elements in a tensor object. uint32_t NumElements(const TensorObjectDef& def); -using TensorObject = absl::variant; +using TensorObject = + absl::variant; // @return true if object is set and corresponding values are defined. bool IsValid(const TensorObjectDef& def, const TensorObject& object); From 68adba436cd987d637b46caa90333f7b809ad419 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Tue, 26 May 2020 13:59:25 -0700 Subject: [PATCH 1144/1533] Remove arch/*.c from arch64 builds in aws-c-common. These files include immintrin and emmintrin, which are x86(64) platform extensions PiperOrigin-RevId: 313262716 Change-Id: Iced220a4caee3d42a25e4fbfa420316a686f6be2 --- third_party/aws/aws-c-common.bazel | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/aws/aws-c-common.bazel b/third_party/aws/aws-c-common.bazel index a66fbcb1164..ab9406805c2 100644 --- a/third_party/aws/aws-c-common.bazel +++ b/third_party/aws/aws-c-common.bazel @@ -14,7 +14,6 @@ cc_library( srcs = select({ "@org_tensorflow//tensorflow:linux_aarch64": glob([ "source/posix/*.c", - "source/arch/*.c" ]), "@org_tensorflow//tensorflow:linux_x86_64": glob([ "source/posix/*.c", From 8182ab3bfc11eb0ef450ec66e2c4407c27aff3ec Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Tue, 26 May 2020 14:10:23 -0700 Subject: [PATCH 1145/1533] Stamp the minimum metadata parser version in MetadataPopulator. 
PiperOrigin-RevId: 313264741 Change-Id: I823cff6f816aa8667ac351ca0fbb0f72178617b3 --- .../lite/experimental/support/metadata/BUILD | 1 + .../experimental/support/metadata/cc/BUILD | 16 +++++ .../support/metadata/cc/metadata_version.cc | 50 ++++++++++++++ .../support/metadata/cc/metadata_version.h | 35 ++++++++++ .../support/metadata/cc/python/BUILD | 22 +++++++ .../metadata/cc/python/metadata_version.cc | 55 ++++++++++++++++ .../support/metadata/cc/test/BUILD | 15 +++++ .../metadata/cc/test/metadata_version_test.cc | 65 +++++++++++++++++++ .../experimental/support/metadata/metadata.py | 26 +++++++- .../support/metadata/metadata_test.py | 25 ++++++- .../metadata/testdata/golden_json.json | 3 +- 11 files changed, 308 insertions(+), 5 deletions(-) create mode 100644 tensorflow/lite/experimental/support/metadata/cc/BUILD create mode 100644 tensorflow/lite/experimental/support/metadata/cc/metadata_version.cc create mode 100644 tensorflow/lite/experimental/support/metadata/cc/metadata_version.h create mode 100644 tensorflow/lite/experimental/support/metadata/cc/python/BUILD create mode 100644 tensorflow/lite/experimental/support/metadata/cc/python/metadata_version.cc create mode 100644 tensorflow/lite/experimental/support/metadata/cc/test/BUILD create mode 100644 tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc diff --git a/tensorflow/lite/experimental/support/metadata/BUILD b/tensorflow/lite/experimental/support/metadata/BUILD index d6417a1bfcf..4621c8c55d2 100644 --- a/tensorflow/lite/experimental/support/metadata/BUILD +++ b/tensorflow/lite/experimental/support/metadata/BUILD @@ -62,6 +62,7 @@ py_library( deps = [ ":metadata_schema_py", ":schema_py", + "//tensorflow/lite/experimental/support/metadata/cc/python:_pywrap_metadata_version", "//tensorflow/lite/experimental/support/metadata/flatbuffers_lib:_pywrap_flatbuffers", "//tensorflow/python:platform", "@flatbuffers//:runtime_py", diff --git a/tensorflow/lite/experimental/support/metadata/cc/BUILD b/tensorflow/lite/experimental/support/metadata/cc/BUILD new file mode 100644 index 00000000000..2b288abe368 --- /dev/null +++ b/tensorflow/lite/experimental/support/metadata/cc/BUILD @@ -0,0 +1,16 @@ +package( + default_visibility = ["//tensorflow/lite/experimental/support:users"], + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "metadata_version", + srcs = ["metadata_version.cc"], + hdrs = ["metadata_version.h"], + deps = [ + "//tensorflow/lite/c:common", + "//tensorflow/lite/experimental/support/metadata:metadata_schema_cc", + "//tensorflow/lite/tools:logging", + "@flatbuffers", + ], +) diff --git a/tensorflow/lite/experimental/support/metadata/cc/metadata_version.cc b/tensorflow/lite/experimental/support/metadata/cc/metadata_version.cc new file mode 100644 index 00000000000..4f43c1431a7 --- /dev/null +++ b/tensorflow/lite/experimental/support/metadata/cc/metadata_version.cc @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/experimental/support/metadata/cc/metadata_version.h" + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/experimental/support/metadata/metadata_schema_generated.h" +#include "tensorflow/lite/tools/logging.h" + +namespace tflite { +namespace metadata { + +TfLiteStatus GetMinimumMetadataParserVersion(const uint8_t* buffer_data, + size_t buffer_size, + std::string* min_version) { + flatbuffers::Verifier verifier = + flatbuffers::Verifier(buffer_data, buffer_size); + if (!tflite::VerifyModelMetadataBuffer(verifier)) { + TFLITE_LOG(ERROR) << "The model metadata is not a valid FlatBuffer buffer."; + return kTfLiteError; + } + + // Returns the version as the initial default one, "1.0.0", because it is the + // first version ever for metadata_schema.fbs. + // + // Later, when new fields are added to the schema, we'll update the logic of + // getting the minimum metadata parser version. To be more specific, we'll + // have a table that records the new fields and the versions of the schema + // they are added to. And the minimum metadata parser version will be the + // largest version number of all fields that has been added to a metadata + // flatbuffer. + // TODO(b/156539454): replace the hardcoded version with template + genrule. + static constexpr char kDefaultVersion[] = "1.0.0"; + *min_version = kDefaultVersion; + return kTfLiteOk; +} + +} // namespace metadata +} // namespace tflite diff --git a/tensorflow/lite/experimental/support/metadata/cc/metadata_version.h b/tensorflow/lite/experimental/support/metadata/cc/metadata_version.h new file mode 100644 index 00000000000..71e90788af4 --- /dev/null +++ b/tensorflow/lite/experimental/support/metadata/cc/metadata_version.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_VERSION_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_VERSION_H_ + +#include + +#include "tensorflow/lite/c/common.h" + +namespace tflite { +namespace metadata { + +// Gets the minimum metadata parser version that can fully understand all fields +// in a given metadata flatbuffer. TFLite Metadata follows Semantic Versioning +// 2.0. Each release version has the form MAJOR.MINOR.PATCH. 
+TfLiteStatus GetMinimumMetadataParserVersion(const uint8_t* buffer_data, + size_t buffer_size, + std::string* min_version); + +} // namespace metadata +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_SUPPORT_METADATA_CC_METADATA_VERSION_H_ diff --git a/tensorflow/lite/experimental/support/metadata/cc/python/BUILD b/tensorflow/lite/experimental/support/metadata/cc/python/BUILD new file mode 100644 index 00000000000..4128f0ac9d1 --- /dev/null +++ b/tensorflow/lite/experimental/support/metadata/cc/python/BUILD @@ -0,0 +1,22 @@ +load("//tensorflow:tensorflow.bzl", "pybind_extension") + +package( + default_visibility = [ + "//tensorflow/lite/experimental/support/metadata:__pkg__", + ], + licenses = ["notice"], # Apache 2.0 +) + +pybind_extension( + name = "_pywrap_metadata_version", + srcs = [ + "metadata_version.cc", + ], + features = ["-use_header_modules"], + module_name = "_pywrap_metadata_version", + deps = [ + "//tensorflow/lite/c:common", + "//tensorflow/lite/experimental/support/metadata/cc:metadata_version", + "@pybind11", + ], +) diff --git a/tensorflow/lite/experimental/support/metadata/cc/python/metadata_version.cc b/tensorflow/lite/experimental/support/metadata/cc/python/metadata_version.cc new file mode 100644 index 00000000000..7d1f9d1e122 --- /dev/null +++ b/tensorflow/lite/experimental/support/metadata/cc/python/metadata_version.cc @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/experimental/support/metadata/cc/metadata_version.h" + +#include "pybind11/pybind11.h" +#include "tensorflow/lite/c/common.h" + +namespace tflite { +namespace metadata { + +PYBIND11_MODULE(_pywrap_metadata_version, m) { + m.doc() = R"pbdoc( + _pywrap_metadata_version + A module that returns the minimum metadata parser version of a given + metadata flatbuffer. + )pbdoc"; + + // Using pybind11 type conversions to convert between Python and native + // C++ types. There are other options to provide access to native Python types + // in C++ and vice versa. See the pybind 11 instrcution [1] for more details. + // Type converstions is recommended by pybind11, though the main downside + // is that a copy of the data must be made on every Python to C++ transition: + // this is needed since the C++ and Python versions of the same type generally + // won’t have the same memory layout. 
+ // + // [1]: https://pybind11.readthedocs.io/en/stable/advanced/cast/index.html + m.def("GetMinimumMetadataParserVersion", + [](const std::string& buffer_data) -> std::string { + std::string min_version; + if (GetMinimumMetadataParserVersion( + reinterpret_cast(buffer_data.c_str()), + buffer_data.length(), &min_version) != kTfLiteOk) { + pybind11::value_error( + "Error occurred when getting the minimum metadata parser " + "version of the metadata flatbuffer."); + } + return min_version; + }); +} + +} // namespace metadata +} // namespace tflite diff --git a/tensorflow/lite/experimental/support/metadata/cc/test/BUILD b/tensorflow/lite/experimental/support/metadata/cc/test/BUILD new file mode 100644 index 00000000000..fd829124c73 --- /dev/null +++ b/tensorflow/lite/experimental/support/metadata/cc/test/BUILD @@ -0,0 +1,15 @@ +package( + default_visibility = ["//visibility:public"], + licenses = ["notice"], # Apache 2.0 +) + +cc_test( + name = "metadata_version_test", + srcs = ["metadata_version_test.cc"], + deps = [ + "//tensorflow/lite/experimental/support/metadata:metadata_schema_cc", + "//tensorflow/lite/experimental/support/metadata/cc:metadata_version", + "@com_google_googletest//:gtest_main", + "@flatbuffers", + ], +) diff --git a/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc b/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc new file mode 100644 index 00000000000..00d9c0902c6 --- /dev/null +++ b/tensorflow/lite/experimental/support/metadata/cc/test/metadata_version_test.cc @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/experimental/support/metadata/cc/metadata_version.h" + +#include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/experimental/support/metadata/metadata_schema_generated.h" + +namespace tflite { +namespace metadata { +namespace { + +using ::testing::MatchesRegex; + +TEST(MetadataVersionTest, + GetMinimumMetadataParserVersionSucceedsWithValidMetadata) { + // Creates a dummy metadata flatbuffer for test. + flatbuffers::FlatBufferBuilder builder(1024); + auto name = builder.CreateString("Foo"); + ModelMetadataBuilder metadata_builder(builder); + metadata_builder.add_name(name); + auto metadata = metadata_builder.Finish(); + FinishModelMetadataBuffer(builder, metadata); + + // Gets the mimimum metadata parser version. + std::string min_version; + EXPECT_EQ(GetMinimumMetadataParserVersion(builder.GetBufferPointer(), + builder.GetSize(), &min_version), + kTfLiteOk); + // Validates that the version is well-formed (x.y.z). + EXPECT_THAT(min_version, MatchesRegex("[0-9]*\\.[0-9]*\\.[0-9]")); +} + +TEST(MetadataVersionTest, + GetMinimumMetadataParserVersionSucceedsWithInvalidIdentifier) { + // Creates a dummy metadata flatbuffer without identifier. 
+ flatbuffers::FlatBufferBuilder builder(1024); + ModelMetadataBuilder metadata_builder(builder); + auto metadata = metadata_builder.Finish(); + builder.Finish(metadata); + + // Gets the mimimum metadata parser version and triggers error. + std::string min_version; + EXPECT_EQ(GetMinimumMetadataParserVersion(builder.GetBufferPointer(), + builder.GetSize(), &min_version), + kTfLiteError); + EXPECT_TRUE(min_version.empty()); +} + +} // namespace +} // namespace metadata +} // namespace tflite diff --git a/tensorflow/lite/experimental/support/metadata/metadata.py b/tensorflow/lite/experimental/support/metadata/metadata.py index 25ca57bb4cc..b3d8d28806b 100644 --- a/tensorflow/lite/experimental/support/metadata/metadata.py +++ b/tensorflow/lite/experimental/support/metadata/metadata.py @@ -28,6 +28,7 @@ import zipfile from flatbuffers.python import flatbuffers from tensorflow.lite.experimental.support.metadata import metadata_schema_py_generated as _metadata_fb from tensorflow.lite.experimental.support.metadata import schema_py_generated as _schema_fb +from tensorflow.lite.experimental.support.metadata.cc.python import _pywrap_metadata_version from tensorflow.lite.experimental.support.metadata.flatbuffers_lib import _pywrap_flatbuffers from tensorflow.python.platform import resource_loader @@ -55,7 +56,7 @@ class MetadataPopulator(object): classifer model using Flatbuffers API. Attach the label file onto the ouput tensor (the tensor of probabilities) in the metadata. - Then, pack the metadata and lable file into the model as follows. + Then, pack the metadata and label file into the model as follows. ```python # Populating a metadata file (or a metadta buffer) and associated files to @@ -78,6 +79,9 @@ class MetadataPopulator(object): with open("updated_model.tflite", "wb") as f: f.write(updated_model_buf) ``` + + Note that existing metadata buffer (if applied) will be overridden by the new + metadata buffer. """ # As Zip API is used to concatenate associated files after tflite model file, # the populating operation is developed based on a model file. For in-memory @@ -218,12 +222,27 @@ class MetadataPopulator(object): Raises: ValueError: The metadata to be populated is empty. ValueError: The metadata does not have the expected flatbuffer identifer. + ValueError: Error occurs when getting the minimum metadata parser version. """ if not metadata_buf: raise ValueError("The metadata to be populated is empty.") _assert_metadata_buffer_identifier(metadata_buf) - self._metadata_buf = metadata_buf + + # Gets the minimum metadata parser version of the metadata_buf. + min_version = _pywrap_metadata_version.GetMinimumMetadataParserVersion( + bytes(metadata_buf)) + + # Inserts in the minimum metadata parser version into the metadata_buf. + metadata = _metadata_fb.ModelMetadataT.InitFromObj( + _metadata_fb.ModelMetadata.GetRootAsModelMetadata(metadata_buf, 0)) + metadata.minParserVersion = min_version + + b = flatbuffers.Builder(0) + b.Finish(metadata.Pack(b), self.METADATA_FILE_IDENTIFIER) + metadata_buf_with_version = b.Output() + + self._metadata_buf = metadata_buf_with_version def load_metadata_file(self, metadata_file): """Loads the metadata file to be populated. @@ -325,6 +344,9 @@ class MetadataPopulator(object): Inserts metadata_buf into the metadata field of schema.Model. If the MetadataPopulator object is created using the method, with_model_file(model_file), the model file will be updated. + + Existing metadata buffer (if applied) will be overridden by the new metadata + buffer. 
""" with open(self._model_file, "rb") as f: diff --git a/tensorflow/lite/experimental/support/metadata/metadata_test.py b/tensorflow/lite/experimental/support/metadata/metadata_test.py index 81b3eef62f9..28395041746 100644 --- a/tensorflow/lite/experimental/support/metadata/metadata_test.py +++ b/tensorflow/lite/experimental/support/metadata/metadata_test.py @@ -43,6 +43,8 @@ class MetadataTest(test_util.TensorFlowTestCase): f.write(self._empty_model_buf) self._model_file = self._create_model_file_with_metadata_and_buf_fields() self._metadata_file = self._create_metadata_file() + self._metadata_file_with_version = self._create_metadata_file_with_version( + self._metadata_file, "1.0.0") self._file1 = self.create_tempfile("file1").full_path self._file2 = self.create_tempfile("file2").full_path self._file3 = self.create_tempfile("file3").full_path @@ -135,6 +137,25 @@ class MetadataTest(test_util.TensorFlowTestCase): b.Finish(model.Pack(b), identifier) return b.Output() + def _create_metadata_file_with_version(self, metadata_file, min_version): + # Creates a new metadata file with the specified min_version for testing + # purposes. + with open(metadata_file, "rb") as f: + metadata_buf = bytearray(f.read()) + + metadata = _metadata_fb.ModelMetadataT.InitFromObj( + _metadata_fb.ModelMetadata.GetRootAsModelMetadata(metadata_buf, 0)) + metadata.minParserVersion = min_version + + b = flatbuffers.Builder(0) + b.Finish( + metadata.Pack(b), _metadata.MetadataPopulator.METADATA_FILE_IDENTIFIER) + + metadata_file_with_version = self.create_tempfile().full_path + with open(metadata_file_with_version, "wb") as f: + f.write(b.Output()) + return metadata_file_with_version + class MetadataPopulatorTest(MetadataTest): @@ -245,7 +266,7 @@ class MetadataPopulatorTest(MetadataTest): buffer_data = model.Buffers(buffer_index) metadata_buf_np = buffer_data.DataAsNumpy() metadata_buf = metadata_buf_np.tobytes() - with open(self._metadata_file, "rb") as f: + with open(self._metadata_file_with_version, "rb") as f: expected_metadata_buf = bytearray(f.read()) self.assertEqual(metadata_buf, expected_metadata_buf) @@ -293,7 +314,7 @@ class MetadataPopulatorTest(MetadataTest): buffer_data = model.Buffers(buffer_index) metadata_buf_np = buffer_data.DataAsNumpy() metadata_buf = metadata_buf_np.tobytes() - with open(self._metadata_file, "rb") as f: + with open(self._metadata_file_with_version, "rb") as f: expected_metadata_buf = bytearray(f.read()) self.assertEqual(metadata_buf, expected_metadata_buf) diff --git a/tensorflow/lite/experimental/support/metadata/testdata/golden_json.json b/tensorflow/lite/experimental/support/metadata/testdata/golden_json.json index bc3001e685a..9ff5581fbff 100644 --- a/tensorflow/lite/experimental/support/metadata/testdata/golden_json.json +++ b/tensorflow/lite/experimental/support/metadata/testdata/golden_json.json @@ -17,5 +17,6 @@ { "name": "file1" } - ] + ], + "min_parser_version": "1.0.0" } From bc99898e990a1f714732ec40a0a924aa0ee2fee5 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 26 May 2020 22:15:07 +0100 Subject: [PATCH 1146/1533] Prefer generator expressions over list comprehensions --- tensorflow/lite/python/lite.py | 2 +- tensorflow/python/eager/function.py | 2 +- tensorflow/python/framework/subscribe.py | 3 +-- tensorflow/python/framework/test_util.py | 2 +- tensorflow/python/keras/callbacks.py | 2 +- .../python/keras/layers/preprocessing/category_crossing.py | 4 ++-- tensorflow/python/keras/layers/preprocessing/hashing.py | 2 +- 
tensorflow/python/keras/saving/hdf5_format.py | 2 +- tensorflow/tools/dockerfiles/assembler.py | 2 +- third_party/gpus/check_cuda_libs.py | 2 +- 10 files changed, 11 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index ce59c56a1d0..53814bb0c43 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -279,7 +279,7 @@ class QuantizationMode(object): }) for node_def in self._graph_def.node: - if any([op in node_def.name for op in training_quant_ops]): + if any(op in node_def.name for op in training_quant_ops): return True return False diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index ce495d772d0..37c802b9aa6 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -726,7 +726,7 @@ class _DelayedRewriteGradientFunctions(object): # pylint: enable=protected-access capture_mapping = dict( - zip([ops.tensor_id(t) for t in self._func_graph.outputs], op.outputs)) + zip((ops.tensor_id(t) for t in self._func_graph.outputs), op.outputs)) remapped_captures = [ capture_mapping.get(ops.tensor_id(capture), capture) for capture in backwards_function.captured_inputs diff --git a/tensorflow/python/framework/subscribe.py b/tensorflow/python/framework/subscribe.py index 8c3f91f62d8..c7cf8ce6070 100644 --- a/tensorflow/python/framework/subscribe.py +++ b/tensorflow/python/framework/subscribe.py @@ -58,8 +58,7 @@ def _recursive_apply(tensors, apply_fn): return tuple(tensors) return tensors_type(*tensors) # collections.namedtuple elif tensors_type is dict: - return dict([(k, _recursive_apply(v, apply_fn)) for k, v in tensors.items() - ]) + return dict((k, _recursive_apply(v, apply_fn)) for k, v in tensors.items()) else: raise TypeError('_recursive_apply argument %r has invalid type %r' % (tensors, tensors_type)) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 4981e1b68fd..36b73c8ebc6 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -486,7 +486,7 @@ def skip_if_error(test_obj, error_type, messages=None): try: yield except error_type as e: - if not messages or any([message in str(e) for message in messages]): + if not messages or any(message in str(e) for message in messages): test_obj.skipTest("Skipping error: {}".format(str(e))) else: raise diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index db326ea32f0..58c24afbb61 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -2341,7 +2341,7 @@ class CSVLogger(Callback): if self.model.stop_training: # We set NA so that csv parsers do not fail for this last epoch. 
- logs = dict([(k, logs[k]) if k in logs else (k, 'NA') for k in self.keys]) + logs = dict((k, logs[k]) if k in logs else (k, 'NA') for k in self.keys) if not self.writer: diff --git a/tensorflow/python/keras/layers/preprocessing/category_crossing.py b/tensorflow/python/keras/layers/preprocessing/category_crossing.py index 79c27d9ec36..84e5332bea5 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_crossing.py +++ b/tensorflow/python/keras/layers/preprocessing/category_crossing.py @@ -140,9 +140,9 @@ class CategoryCrossing(Layer): def call(self, inputs): depth_tuple = self._depth_tuple if self.depth else (len(inputs),) ragged_out = sparse_out = False - if any([ragged_tensor.is_ragged(inp) for inp in inputs]): + if any(ragged_tensor.is_ragged(inp) for inp in inputs): ragged_out = True - elif any([isinstance(inp, sparse_tensor.SparseTensor) for inp in inputs]): + elif any(isinstance(inp, sparse_tensor.SparseTensor) for inp in inputs): sparse_out = True outputs = [] diff --git a/tensorflow/python/keras/layers/preprocessing/hashing.py b/tensorflow/python/keras/layers/preprocessing/hashing.py index 05b4445829a..f4a4ae0ccc8 100644 --- a/tensorflow/python/keras/layers/preprocessing/hashing.py +++ b/tensorflow/python/keras/layers/preprocessing/hashing.py @@ -168,7 +168,7 @@ class Hashing(Layer): def _process_input_list(self, inputs): # TODO(momernick): support ragged_cross_hashed with corrected fingerprint # and siphash. - if any([isinstance(inp, ragged_tensor.RaggedTensor) for inp in inputs]): + if any(isinstance(inp, ragged_tensor.RaggedTensor) for inp in inputs): raise ValueError('Hashing with ragged input is not supported yet.') sparse_inputs = [ inp for inp in inputs if isinstance(inp, sparse_tensor.SparseTensor) diff --git a/tensorflow/python/keras/saving/hdf5_format.py b/tensorflow/python/keras/saving/hdf5_format.py index f3adb2d0695..800d609fe99 100644 --- a/tensorflow/python/keras/saving/hdf5_format.py +++ b/tensorflow/python/keras/saving/hdf5_format.py @@ -876,7 +876,7 @@ def _legacy_weights(layer): non_trainable_weights. """ weights = layer.trainable_weights + layer.non_trainable_weights - if any([not isinstance(w, variables_module.Variable) for w in weights]): + if any(not isinstance(w, variables_module.Variable) for w in weights): raise NotImplementedError( 'Save or restore weights that is not an instance of `tf.Variable` is ' 'not supported in h5, use `save_format=\'tf\'` instead. 
Got a model ' diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py index 7b3dcbd33c0..d2135f38ab4 100644 --- a/tensorflow/tools/dockerfiles/assembler.py +++ b/tensorflow/tools/dockerfiles/assembler.py @@ -558,7 +558,7 @@ def main(argv): # Only build images for host architecture proc_arch = platform.processor() is_x86 = proc_arch.startswith('x86') - if (is_x86 and any([arch in tag for arch in ['ppc64le']]) or + if (is_x86 and any(arch in tag for arch in ['ppc64le']) or not is_x86 and proc_arch not in tag): continue diff --git a/third_party/gpus/check_cuda_libs.py b/third_party/gpus/check_cuda_libs.py index 479380da975..686d36f5c77 100644 --- a/third_party/gpus/check_cuda_libs.py +++ b/third_party/gpus/check_cuda_libs.py @@ -62,7 +62,7 @@ def check_cuda_lib(path, check_soname=True): output = subprocess.check_output([objdump, "-p", path]).decode("utf-8") output = [line for line in output.splitlines() if "SONAME" in line] sonames = [line.strip().split(" ")[-1] for line in output] - if not any([soname == os.path.basename(path) for soname in sonames]): + if not any(soname == os.path.basename(path) for soname in sonames): raise ConfigError("None of the libraries match their SONAME: " + path) From 280665cb81e01691959b478f883cdf5ac89bd152 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Tue, 26 May 2020 14:16:22 -0700 Subject: [PATCH 1147/1533] Include shape dialect registration Registering it everywhere where TF dialect is as this will be used for dynamic shape lowering. PiperOrigin-RevId: 313265819 Change-Id: Ic14f19324d043f52699052f3c3ce3ac3ea0302ff --- tensorflow/compiler/mlir/BUILD | 1 + tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc | 4 +++- tensorflow/compiler/mlir/tensorflow/BUILD | 3 +-- .../compiler/mlir/tensorflow/ir/dialect_registration.cc | 2 ++ .../compiler/mlir/tensorflow/utils/compile_mlir_util.cc | 6 ++++-- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index c0066ecda03..c4472e1185c 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -104,6 +104,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", ], alwayslink = 1, diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index 11d3e7332db..b2225ec1c4f 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_os_ostream.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -93,9 +94,10 @@ MlirOptimizationPassRegistry& MlirOptimizationPassRegistry::Global() { static void RegisterDialects() { static bool init_once = []() { mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); mlir::registerDialect(); mlir::registerDialect(); - mlir::registerDialect(); return true; }(); (void)init_once; diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index b2b4c09df3b..de0af94f0cb 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -559,8 +559,7 @@ cc_library( srcs = ["ir/dialect_registration.cc"], deps = [ ":tensorflow", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:SCFTransforms", + "@llvm-project//mlir:Shape", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc index ac468d9810c..c95d7b7ca7c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/dialect_registration.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -31,5 +32,6 @@ static DialectRegistration tf_device_dialect; static DialectRegistration tf_saved_model_dialect; +static DialectRegistration shape_dialect; } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index 03283da0112..fd1ba3b1901 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -20,6 +20,7 @@ limitations under the License. #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/Function.h" // from @llvm-project @@ -247,9 +248,10 @@ Status RefineShapes(llvm::ArrayRef arg_shapes, static void RegisterDialects() { static bool init_once = []() { - mlir::registerDialect(); - mlir::registerDialect(); mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); mlir::registerDialect(); return true; }(); From 92c9a894bd5284c82750c8c7f63dba6d15fe2efc Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 26 May 2020 14:17:41 -0700 Subject: [PATCH 1148/1533] Removed mentioning of libOpenCL-pixel.so. 
PiperOrigin-RevId: 313266044 Change-Id: Iecd6b7b55e1e39a103d624c8c08bce89bed05ad2 --- tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc index be551bc9973..fadaabe32a0 100644 --- a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc +++ b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include + #include "absl/strings/str_cat.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -37,6 +39,8 @@ absl::Status LoadOpenCL() { LoadOpenCLFunctions(libopencl, false); return absl::OkStatus(); } else { + // record error + std::string error(dlerror()); // Pixel phone? libopencl = dlopen("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL); if (libopencl) { @@ -48,7 +52,7 @@ absl::Status LoadOpenCL() { return absl::OkStatus(); } else { return absl::UnknownError( - absl::StrCat("OpenCL library not loaded - ", dlerror())); + absl::StrCat("Can not open OpenCL library on this device - ", error)); } } } From 3258ebf5e18e898a11f9d2bde25efd3224738e43 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 14:27:26 -0700 Subject: [PATCH 1149/1533] Reuse the rendezvous provided by the OpKernelContext for PartitionedCallOp. This will allow send/recv across different tf.functions. PiperOrigin-RevId: 313267770 Change-Id: I28fb8e43cb7b3374feeca9b0f203a968a338ec9e --- tensorflow/core/kernels/partitioned_function_ops.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc index 3045fd050d5..a85f3f449fd 100644 --- a/tensorflow/core/kernels/partitioned_function_ops.cc +++ b/tensorflow/core/kernels/partitioned_function_ops.cc @@ -245,7 +245,6 @@ void PartitionedCallOp::RunFunction(FunctionLibraryRuntime::Handle handle, run_opts.source_device = lib->device() == nullptr ? "" : lib->device()->name(); run_opts.allow_dead_tensors = true; - run_opts.rendezvous = ctx->rendezvous(); std::vector* rets = new std::vector; const string& func_name = func_->name(); From 1a93f37e2614b38b3a12f82c9bc25aea9eda3953 Mon Sep 17 00:00:00 2001 From: Robert David Date: Tue, 26 May 2020 14:28:34 -0700 Subject: [PATCH 1150/1533] Add SelectFullyConnectedGeneric for selecting a non-device specific implementation. Currently same as SelectFullyConnectedAdreno. 
PiperOrigin-RevId: 313267998 Change-Id: I749fdfa626840cb5e0b78a0c2570e9a5e054f1fe --- .../cl/selectors/fully_connected_selector.cc | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc index 2a04a04460d..12a1d726368 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/fully_connected_selector.cc @@ -27,6 +27,22 @@ namespace tflite { namespace gpu { namespace cl { +absl::Status SelectFullyConnectedGeneric( + const FullyConnectedAttributes& attr, + const CreationContext& creation_context, const OperationDef& op_def, + int batch_size, std::unique_ptr* ptr) { + if (op_def.IsBatchSupported()) { + ConvTexture conv; + RETURN_IF_ERROR(CreateConvTexture(creation_context, op_def, attr, &conv)); + *ptr = absl::make_unique(std::move(conv)); + } else { + FullyConnected fc; + RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc)); + *ptr = absl::make_unique(std::move(fc)); + } + return absl::OkStatus(); +} + absl::Status SelectFullyConnectedAdreno(const FullyConnectedAttributes& attr, const CreationContext& creation_context, const OperationDef& op_def, @@ -38,8 +54,7 @@ absl::Status SelectFullyConnectedAdreno(const FullyConnectedAttributes& attr, *ptr = absl::make_unique(std::move(conv)); } else { FullyConnected fc; - RETURN_IF_ERROR( - CreateFullyConnected(creation_context, op_def, attr, &fc)); + RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc)); *ptr = absl::make_unique(std::move(fc)); } return absl::OkStatus(); @@ -55,8 +70,7 @@ absl::Status SelectFullyConnectedPowerVR( *ptr = absl::make_unique(std::move(conv)); } else { FullyConnected fc; - RETURN_IF_ERROR( - CreateFullyConnected(creation_context, op_def, attr, &fc)); + RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc)); *ptr = absl::make_unique(std::move(fc)); } return absl::OkStatus(); @@ -80,8 +94,7 @@ absl::Status SelectFullyConnectedMali(const FullyConnectedAttributes& attr, } } else { FullyConnected fc; - RETURN_IF_ERROR( - CreateFullyConnected(creation_context, op_def, attr, &fc)); + RETURN_IF_ERROR(CreateFullyConnected(creation_context, op_def, attr, &fc)); *ptr = absl::make_unique(std::move(fc)); } return absl::OkStatus(); @@ -102,8 +115,8 @@ absl::Status SelectFullyConnected(const FullyConnectedAttributes& attr, return SelectFullyConnectedMali(attr, creation_context, op_def, batch_size, ptr); default: - return SelectFullyConnectedAdreno(attr, creation_context, op_def, - batch_size, ptr); + return SelectFullyConnectedGeneric(attr, creation_context, op_def, + batch_size, ptr); } } From 4fec047a168820387a100d99cf931ad665e4f3f7 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Tue, 26 May 2020 14:34:06 -0700 Subject: [PATCH 1151/1533] Update TPUExtractHeadTailOutsideCompilation tests to be under the same module (NFC). 
PiperOrigin-RevId: 313269129 Change-Id: Ic9192d15d6d4d697ea369f87a573e9e28183c298 --- ...extract_head_tail_outside_compilation.mlir | 28 ------------------- 1 file changed, 28 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 3e8ade180b1..9af75255202 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -21,11 +21,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } -} -// ----- - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @ops_no_operands func @ops_no_operands() -> () { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() @@ -45,11 +41,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } -} -// ----- - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @aliased_output func @aliased_output() -> (tensor, tensor, tensor) { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() @@ -77,11 +69,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor) return %0#0, %0#1, %0#2 : tensor, tensor, tensor } -} -// ----- - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @all_head_computation_ops func @all_head_computation_ops(%arg0 : tensor) -> (tensor) { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() @@ -103,11 +91,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor) return %0 : tensor } -} -// ----- - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @multiple_head_outside_compilation func @multiple_head_outside_compilation(%arg0 : tensor) -> () { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() @@ -129,11 +113,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } -} -// ----- - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", 
"/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { // CHECK-NOT: tf_device.launch @@ -150,11 +130,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } -} -// ----- - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() @@ -178,11 +154,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } -} -// ----- - -module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @test_replicated_head_outside_compilation func @test_replicated_head_outside_compilation(%arg0 : tensor) -> () { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() From 13f50c2b7af684fe99c61c90023bdb98640370ea Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Tue, 26 May 2020 14:36:58 -0700 Subject: [PATCH 1152/1533] Update server_lib API to be more consistent with tf.distribute.Server. This is in preparation for exporting MasterServer and WorkerServer in the public API. This CL also addresses an issue where we relied on counting the number of tasks in job 0 to determine when a worker has registered with the master. Now we directly query the master for how many workers are registered. 
PiperOrigin-RevId: 313269683 Change-Id: Ie4284a8ca91bd87fd366761d055761384654aef3 --- tensorflow/core/data/service/data_service.cc | 16 ++ tensorflow/core/data/service/data_service.h | 4 + .../core/data/service/data_service_test.cc | 13 + .../core/data/service/grpc_master_impl.cc | 1 + .../core/data/service/grpc_master_impl.h | 1 + .../core/data/service/grpc_worker_impl.cc | 1 - tensorflow/core/data/service/master.proto | 15 ++ tensorflow/core/data/service/master_impl.cc | 14 ++ tensorflow/core/data/service/master_impl.h | 2 + tensorflow/core/data/service/server_lib.cc | 29 ++- tensorflow/core/data/service/server_lib.h | 6 +- .../kernel_tests/data_service_ops_test.py | 105 ++++---- tensorflow/python/data/service/server_lib.py | 229 +++++++++++++----- .../python/data/service/server_lib_test.py | 67 ++++- .../python/data/service/server_lib_wrapper.cc | 47 ++-- 15 files changed, 391 insertions(+), 159 deletions(-) diff --git a/tensorflow/core/data/service/data_service.cc b/tensorflow/core/data/service/data_service.cc index 915435d8fcb..d4e08c77f35 100644 --- a/tensorflow/core/data/service/data_service.cc +++ b/tensorflow/core/data/service/data_service.cc @@ -132,6 +132,22 @@ Status DataServiceMasterClient::GetTasks(int64 job_id, return Status::OK(); } +Status DataServiceMasterClient::GetWorkers(std::vector* workers) { + TF_RETURN_IF_ERROR(EnsureInitialized()); + GetWorkersRequest req; + GetWorkersResponse resp; + grpc_impl::ClientContext ctx; + grpc::Status s = stub_->GetWorkers(&ctx, req, &resp); + if (!s.ok()) { + return grpc_util::WrapError("Failed to get workers", s); + } + workers->clear(); + for (auto& worker : resp.workers()) { + workers->push_back(worker); + } + return Status::OK(); +} + Status DataServiceMasterClient::EnsureInitialized() { std::shared_ptr credentials; TF_RETURN_IF_ERROR( diff --git a/tensorflow/core/data/service/data_service.h b/tensorflow/core/data/service/data_service.h index d205b4d9ebf..bb5a8a470f0 100644 --- a/tensorflow/core/data/service/data_service.h +++ b/tensorflow/core/data/service/data_service.h @@ -96,6 +96,10 @@ class DataServiceMasterClient : public DataServiceClientBase { Status GetTasks(int64 job_id, std::vector* tasks, bool* job_finished); + // Queries the master for its registered workers. The worker info will be + // stored in `*workers`. + Status GetWorkers(std::vector* workers); + protected: Status EnsureInitialized() override; diff --git a/tensorflow/core/data/service/data_service_test.cc b/tensorflow/core/data/service/data_service_test.cc index a4505d8965f..19392393eeb 100644 --- a/tensorflow/core/data/service/data_service_test.cc +++ b/tensorflow/core/data/service/data_service_test.cc @@ -35,6 +35,10 @@ limitations under the License. 
namespace tensorflow { namespace data { +namespace { +constexpr const char kProtocol[] = "grpc+local"; +} + TEST(DataService, ParseParallelEpochsProcessingMode) { ProcessingMode mode; TF_ASSERT_OK(ParseProcessingMode("parallel_epochs", &mode)); @@ -59,5 +63,14 @@ TEST(DataService, ProcessingModeToString) { EXPECT_EQ("one_epoch", ProcessingModeToString(ProcessingMode::ONE_EPOCH)); } +TEST(DataService, GetWorkers) { + TestCluster cluster(1); + TF_ASSERT_OK(cluster.Initialize()); + DataServiceMasterClient master(cluster.MasterAddress(), kProtocol); + std::vector workers; + TF_EXPECT_OK(master.GetWorkers(&workers)); + EXPECT_EQ(1, workers.size()); +} + } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/data/service/grpc_master_impl.cc b/tensorflow/core/data/service/grpc_master_impl.cc index ba27959fee7..20ad58a0115 100644 --- a/tensorflow/core/data/service/grpc_master_impl.cc +++ b/tensorflow/core/data/service/grpc_master_impl.cc @@ -44,6 +44,7 @@ HANDLER(GetOrRegisterDataset); HANDLER(CreateJob); HANDLER(GetOrCreateJob); HANDLER(GetTasks); +HANDLER(GetWorkers); #undef HANDLER } // namespace data diff --git a/tensorflow/core/data/service/grpc_master_impl.h b/tensorflow/core/data/service/grpc_master_impl.h index 32eb0f3fc6a..d29bb6759f0 100644 --- a/tensorflow/core/data/service/grpc_master_impl.h +++ b/tensorflow/core/data/service/grpc_master_impl.h @@ -48,6 +48,7 @@ class GrpcMasterImpl : public MasterService::Service { HANDLER(CreateJob); HANDLER(GetOrCreateJob); HANDLER(GetTasks); + HANDLER(GetWorkers); #undef HANDLER private: diff --git a/tensorflow/core/data/service/grpc_worker_impl.cc b/tensorflow/core/data/service/grpc_worker_impl.cc index a5d005d6c6e..7884fa063ba 100644 --- a/tensorflow/core/data/service/grpc_worker_impl.cc +++ b/tensorflow/core/data/service/grpc_worker_impl.cc @@ -30,7 +30,6 @@ GrpcWorkerImpl::GrpcWorkerImpl(ServerBuilder* server_builder, const std::string& protocol) : impl_(master_address, protocol) { server_builder->RegisterService(this); - LOG(INFO) << "GrpcWorkerImpl: master address is " << master_address; VLOG(1) << "Registered data service worker"; } diff --git a/tensorflow/core/data/service/master.proto b/tensorflow/core/data/service/master.proto index 005e5affb7d..661264cc41b 100644 --- a/tensorflow/core/data/service/master.proto +++ b/tensorflow/core/data/service/master.proto @@ -98,6 +98,18 @@ message GetTasksResponse { bool job_finished = 2; } +message WorkerInfo { + string address = 1; + int64 id = 2; +} + +message GetWorkersRequest {} + +message GetWorkersResponse { + // A list of all workers. + repeated WorkerInfo workers = 1; +} + service MasterService { // Registers a worker with the master. rpc RegisterWorker(RegisterWorkerRequest) returns (RegisterWorkerResponse); @@ -121,4 +133,7 @@ service MasterService { // Reports a list of all tasks for a job. rpc GetTasks(GetTasksRequest) returns (GetTasksResponse); + + // Reports a list of all workers registered with the master. 
+ rpc GetWorkers(GetWorkersRequest) returns (GetWorkersResponse); } diff --git a/tensorflow/core/data/service/master_impl.cc b/tensorflow/core/data/service/master_impl.cc index 336ab068c40..37a884d540e 100644 --- a/tensorflow/core/data/service/master_impl.cc +++ b/tensorflow/core/data/service/master_impl.cc @@ -315,5 +315,19 @@ Status DataServiceMasterImpl::GetTasks(const GetTasksRequest* request, return Status::OK(); } +Status DataServiceMasterImpl::GetWorkers(const GetWorkersRequest* request, + GetWorkersResponse* response) { + mutex_lock l(mu_); + VLOG(3) << "Enter GetWorkers"; + for (auto& worker : workers_) { + WorkerInfo* info = response->add_workers(); + info->set_address(worker.address()); + info->set_id(worker.worker_id()); + } + VLOG(3) << "Returning list of " << workers_.size() + << " workers from GetWorkers"; + return Status::OK(); +} + } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/data/service/master_impl.h b/tensorflow/core/data/service/master_impl.h index e8b70e84d0f..0dc049a389c 100644 --- a/tensorflow/core/data/service/master_impl.h +++ b/tensorflow/core/data/service/master_impl.h @@ -60,6 +60,8 @@ class DataServiceMasterImpl { Status GetOrCreateJob(const GetOrCreateJobRequest* request, GetOrCreateJobResponse* response); Status GetTasks(const GetTasksRequest* request, GetTasksResponse* response); + Status GetWorkers(const GetWorkersRequest* request, + GetWorkersResponse* response); private: class Worker { diff --git a/tensorflow/core/data/service/server_lib.cc b/tensorflow/core/data/service/server_lib.cc index 66fc1e20603..33c2232f4dc 100644 --- a/tensorflow/core/data/service/server_lib.cc +++ b/tensorflow/core/data/service/server_lib.cc @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/core/data/service/grpc_master_impl.h" #include "tensorflow/core/data/service/grpc_util.h" #include "tensorflow/core/data/service/grpc_worker_impl.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace data { @@ -31,6 +32,13 @@ GrpcDataServerBase::GrpcDataServerBase(int port, const std::string& protocol) : requested_port_(port), protocol_(protocol) {} Status GrpcDataServerBase::Start() { + if (stopped_) { + return errors::FailedPrecondition( + "Server cannot be started after it has been stopped."); + } + if (started_) { + return Status::OK(); + } ::grpc::ServerBuilder builder; std::shared_ptr<::grpc::ServerCredentials> credentials; TF_RETURN_IF_ERROR( @@ -47,11 +55,18 @@ Status GrpcDataServerBase::Start() { TF_RETURN_IF_ERROR(StartServiceInternal()); + started_ = true; VLOG(1) << "Started tf.data service running at 0.0.0.0:" << BoundPort(); return Status::OK(); } -void GrpcDataServerBase::Stop() { server_->Shutdown(); } +void GrpcDataServerBase::Stop() { + if (stopped_) { + return; + } + server_->Shutdown(); + stopped_ = true; +} void GrpcDataServerBase::Join() { server_->Wait(); } @@ -68,15 +83,15 @@ void MasterGrpcDataServer::AddServiceToBuilder(grpc::ServerBuilder* builder) { service_ = service.release(); } -Status MasterGrpcDataServer::NumTasks(int* num_tasks) { - GetTasksRequest req; - GetTasksResponse resp; +Status MasterGrpcDataServer::NumWorkers(int* num_workers) { + GetWorkersRequest req; + GetWorkersResponse resp; grpc::ServerContext ctx; - grpc::Status s = service_->GetTasks(&ctx, &req, &resp); + grpc::Status s = service_->GetWorkers(&ctx, &req, &resp); if (!s.ok()) { - return grpc_util::WrapError("Failed to get num tasks", s); + return grpc_util::WrapError("Failed to get workers", s); } - *num_tasks = 
resp.task_info_size(); + *num_workers = resp.workers_size(); return Status::OK(); } diff --git a/tensorflow/core/data/service/server_lib.h b/tensorflow/core/data/service/server_lib.h index 0ef305db89a..72bec665c8e 100644 --- a/tensorflow/core/data/service/server_lib.h +++ b/tensorflow/core/data/service/server_lib.h @@ -64,6 +64,8 @@ class GrpcDataServerBase { private: int bound_port_; + bool started_ = false; + bool stopped_ = false; std::unique_ptr server_; }; @@ -73,8 +75,8 @@ class MasterGrpcDataServer : public GrpcDataServerBase { MasterGrpcDataServer(int requested_port, const std::string& protocol); ~MasterGrpcDataServer() override; - // Returns the number of tasks created by the master. - Status NumTasks(int* num_tasks); + // Returns the number of workers registerd with the master. + Status NumWorkers(int* num_workers); protected: void AddServiceToBuilder(grpc::ServerBuilder* builder) override; diff --git a/tensorflow/python/data/kernel_tests/data_service_ops_test.py b/tensorflow/python/data/kernel_tests/data_service_ops_test.py index 217c586caef..726f0dc1530 100644 --- a/tensorflow/python/data/kernel_tests/data_service_ops_test.py +++ b/tensorflow/python/data/kernel_tests/data_service_ops_test.py @@ -37,12 +37,12 @@ from tensorflow.python.platform import test PROTOCOL = "grpc" -def _make_distributed_dataset(dataset, service, job_name=None): +def _make_distributed_dataset(dataset, address, job_name=None): """Creates a distributed dataset with a short task refresh interval.""" return dataset.apply( data_service_ops._distribute( "parallel_epochs", - service, + "{0}://{1}".format(PROTOCOL, address), job_name=job_name, task_refresh_interval_hint_ms=20)) @@ -56,34 +56,32 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): num_workers: The number of workers in the cluster. Returns: - A target for connecting to the service, e.g. - "grpc+local://localhost:2000". + The address of the master. 
""" - self._master = server_lib.MasterServer(PROTOCOL) - master_address = self._master.target[len(PROTOCOL + "://"):] - + self._master = server_lib.MasterServer(port=0, protocol=PROTOCOL) self._servers = [] for _ in range(num_workers): self._servers.append( - server_lib.WorkerServer(PROTOCOL, master_address=master_address)) + server_lib.WorkerServer( + port=0, master_address=self._master._address, protocol=PROTOCOL)) - return self._master.target + return self._master._address @combinations.generate(test_base.eager_only_combinations()) def testDistributeBasic(self): num_elements = 10 - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, service) + ds = _make_distributed_dataset(ds, master_address) results = [elem.numpy() for elem in ds] self.assertEqual(list(range(num_elements)), results) @combinations.generate(test_base.eager_only_combinations()) def testMultipleEpochs(self): num_elements = 3 - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, service) + ds = _make_distributed_dataset(ds, master_address) for _ in range(10): self.assertEqual(list(range(num_elements)), [elem.numpy() for elem in ds]) @@ -91,9 +89,9 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): def testRepeatedDataset(self): num_elements = 10 num_repetitions = 5 - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, service) + ds = _make_distributed_dataset(ds, master_address) ds = ds.repeat(num_repetitions) self.assertDatasetProduces( ds, expected_output=num_repetitions * list(range(num_elements))) @@ -102,12 +100,12 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): def testConcurrentEpoch(self): num_elements = 10 num_datasets = 3 - service = self.create_cluster(1) + master_address = self.create_cluster(1) iterators = [] results = [] for _ in range(num_datasets): ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, service) + ds = _make_distributed_dataset(ds, master_address) iterators.append(iter(ds)) results.append([]) @@ -123,9 +121,9 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): self.skipTest("Not yet implemented") num_elements = 10 num_iterators = 3 - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, service) + ds = _make_distributed_dataset(ds, master_address) result = [] iterators = [] for _ in range(num_iterators): @@ -147,21 +145,20 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): def testMultiWorker(self): num_workers = 3 num_elements = 10 - service = self.create_cluster(num_workers) + master_address = self.create_cluster(num_workers) ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, service) + ds = _make_distributed_dataset(ds, master_address) results = [elem.numpy() for elem in ds] self.assertCountEqual(num_workers * list(range(num_elements)), results) @combinations.generate(test_base.eager_only_combinations()) def testAddWorkerMidJob(self): - self._master = server_lib.MasterServer(PROTOCOL) - master_address = self._master.target[len(PROTOCOL + "://"):] + self._master = server_lib.MasterServer(port=0, 
protocol=PROTOCOL) self._worker = server_lib.WorkerServer( - PROTOCOL, master_address=master_address) + port=0, master_address=self._master._address, protocol=PROTOCOL) num_elements = 100 ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, self._master.target) + ds = _make_distributed_dataset(ds, self._master._address) iterator = iter(ds) results = [] # Read halfway through the dataset. @@ -169,10 +166,10 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): results.append(next(iterator).numpy()) self._new_worker = server_lib.WorkerServer( - PROTOCOL, master_address=master_address) + port=0, master_address=self._master._address, protocol=PROTOCOL) # Wait for the new worker to register with the master. - while self._master.num_tasks() < 2: + while self._master._num_workers() < 2: time.sleep(10 / 1000) # 10ms for elem in iterator: @@ -184,13 +181,12 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): combinations.times(test_base.eager_only_combinations(), combinations.combine(use_same_port=[True, False]))) def testRestartWorker(self, use_same_port): - self._master = server_lib.MasterServer(PROTOCOL) - master_address = self._master.target[len(PROTOCOL + "://"):] + self._master = server_lib.MasterServer(port=0, protocol=PROTOCOL) self._worker = server_lib.WorkerServer( - PROTOCOL, master_address=master_address) + port=0, master_address=self._master._address, protocol=PROTOCOL) num_elements = 100 ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, self._master.target) + ds = _make_distributed_dataset(ds, self._master._address) iterator = iter(ds) # Read halfway through the dataset. midpoint = num_elements // 2 @@ -200,11 +196,10 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): # Stop the original worker and start a new one. port = 0 if use_same_port: - worker_address = self._worker.target[len(PROTOCOL + "://"):] - port = int(worker_address.split(":")[1]) - self._worker.stop() + port = int(self._worker._address.split(":")[1]) + self._worker._stop() self._new_worker = server_lib.WorkerServer( - PROTOCOL, master_address=master_address, port=port) + port=port, master_address=self._master._address, protocol=PROTOCOL) # The dataset starts over now that we read from the new worker. 
for i in range(num_elements): @@ -219,12 +214,12 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): def testMaxOutstandingRequests(self): num_elements = 10 num_workers = 3 - service = self.create_cluster(num_workers) + address = self.create_cluster(num_workers) ds = dataset_ops.Dataset.range(num_elements) ds = ds.apply( data_service_ops._distribute( "parallel_epochs", - service, + "{0}://{1}".format(PROTOCOL, address), max_outstanding_requests=1, task_refresh_interval_hint_ms=20)) self.assertCountEqual(num_workers * list(range(num_elements)), @@ -234,12 +229,12 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): def testInsideFunction(self): num_workers = 3 num_elements = 10 - service = self.create_cluster(num_workers) + master_address = self.create_cluster(num_workers) @def_function.function def f(): ds = dataset_ops.Dataset.range(num_elements) - ds = _make_distributed_dataset(ds, service) + ds = _make_distributed_dataset(ds, master_address) result = tensor_array_ops.TensorArray( dtypes.int64, size=num_workers * num_elements, dynamic_size=True) i = 0 @@ -254,10 +249,10 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.eager_only_combinations()) def testSharedJobName(self): num_elements = 10 - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) - ds1 = _make_distributed_dataset(ds, service, job_name="job_name") - ds2 = _make_distributed_dataset(ds, service, job_name="job_name") + ds1 = _make_distributed_dataset(ds, master_address, job_name="job_name") + ds2 = _make_distributed_dataset(ds, master_address, job_name="job_name") iter1 = iter(ds1) iter2 = iter(ds2) results = [] @@ -273,20 +268,20 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.eager_only_combinations()) def testDifferentJobNames(self): num_elements = 10 - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) - ds1 = _make_distributed_dataset(ds, service, job_name="job_name1") - ds2 = _make_distributed_dataset(ds, service, job_name="job_name2") + ds1 = _make_distributed_dataset(ds, master_address, job_name="job_name1") + ds2 = _make_distributed_dataset(ds, master_address, job_name="job_name2") self.assertDatasetProduces(ds1, list(range(num_elements))) self.assertDatasetProduces(ds2, list(range(num_elements))) @combinations.generate(test_base.eager_only_combinations()) def testSharedJobNameMultiIteration(self): num_elements = 10 - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) - ds1 = _make_distributed_dataset(ds, service, job_name="job_name") - ds2 = _make_distributed_dataset(ds, service, job_name="job_name") + ds1 = _make_distributed_dataset(ds, master_address, job_name="job_name") + ds2 = _make_distributed_dataset(ds, master_address, job_name="job_name") # iteration 1 self.assertDatasetProduces(ds1, list(range(num_elements))) self.assertDatasetProduces(ds2, []) @@ -298,11 +293,11 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): def testSharedJobNameRepeat(self): num_elements = 10 num_repetitions = 3 - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(num_elements) - ds1 = _make_distributed_dataset(ds, service, job_name="job_name") + ds1 = 
_make_distributed_dataset(ds, master_address, job_name="job_name") ds1 = ds1.repeat(num_repetitions) - ds2 = _make_distributed_dataset(ds, service, job_name="job_name") + ds2 = _make_distributed_dataset(ds, master_address, job_name="job_name") ds2 = ds2.repeat(num_repetitions) results = [] iter1 = iter(ds1) @@ -326,8 +321,8 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): options.experimental_external_state_policy = external_state_policy ds = ds.with_options(options) - service = self.create_cluster(3) - ds = _make_distributed_dataset(ds, service) + master_address = self.create_cluster(3) + ds = _make_distributed_dataset(ds, master_address) next(iter(ds)) @combinations.generate( @@ -347,12 +342,12 @@ class DataServiceOpsTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate(test_base.eager_only_combinations()) def testDistributeFromInterleave(self): - service = self.create_cluster(1) + master_address = self.create_cluster(1) ds = dataset_ops.Dataset.range(2) def interleave_fn(_): ds = dataset_ops.Dataset.range(2) - _make_distributed_dataset(ds, service) + _make_distributed_dataset(ds, master_address) return ds with self.assertRaisesRegex( diff --git a/tensorflow/python/data/service/server_lib.py b/tensorflow/python/data/service/server_lib.py index b8f6e673f2e..df65508e6b2 100644 --- a/tensorflow/python/data/service/server_lib.py +++ b/tensorflow/python/data/service/server_lib.py @@ -24,93 +24,208 @@ from tensorflow.python.data.service import _pywrap_server_lib class MasterServer(object): - """An in-process tf.data service master, for use in testing.""" + """An in-process tf.data service master server. - def __init__(self, protocol): - """Creates and starts a new tf.data master server. + A `tf.data.experimental.service.MasterServer` coordinates a cluster of + `tf.data.experimental.service.WorkerServer`s. When the workers start, they + register themselves with the master. - The server will choose an available port. Use `target()` to get the string - for connecting to the server. + ``` + master_server = tf.data.experimental.service.MasterServer(port=5050) + worker_server = tf.data.experimental.service.WorkerServer( + port=0, master_address="localhost:5050") + dataset = tf.data.Dataset.range(10) + dataset = dataset.apply(tf.data.experimental.service.distribute( + processing_mode="parallel_epochs", service="grpc://localhost:5050")) + ``` + + When starting a dedicated tf.data master process, use join() to block + indefinitely after starting up the server. + + ``` + master_server = tf.data.experimental.service.MasterServer(port=5050) + master_server.join() + ``` + """ + + def __init__(self, port, protocol=None, start=True): + """Creates a new master server. Args: - protocol: A string representing the type of protocol to use when creating - channels. For no security, use "grpc". For local credentials, use - "grpc+local", and make sure your binary links in - `data/service:local_credentials`. + port: Specifies the port to bind to. + protocol: (Optional.) Specifies the protocol to be used by the server. + Acceptable values include `"grpc", "grpc+local"`. Defaults to `"grpc"`. + start: (Optional.) Boolean, indicating whether to start the server after + creating it. Defaults to `True`. + + Raises: + tf.errors.OpError: Or one of its subclasses if an error occurs while + creating the TensorFlow server. 
""" + if protocol is None: + protocol = "grpc" self._protocol = protocol - self._server = _pywrap_server_lib.TF_DATA_NewMasterServer(0, protocol) - self._running = True + self._server = _pywrap_server_lib.TF_DATA_NewMasterServer(port, protocol) + if start: + self._server.start() - @property - def target(self): - """Returns the target for connecting to this server. + def start(self): + """Starts this server. - The returned string will be in the form protocol://address:port, e.g. - "grpc://localhost:1000". + Raises: + tf.errors.OpError: Or one of its subclasses if an error occurs while + starting the server. """ - port = _pywrap_server_lib.TF_DATA_MasterServerBoundPort(self._server) - return "{0}://localhost:{1}".format(self._protocol, port) + self._server.start() - def num_tasks(self): - """Returns the number of tasks on the master.""" - return _pywrap_server_lib.TF_DATA_MasterServerNumTasks(self._server) + def join(self): + """Blocks until the server has shut down. - def stop(self): - """Shuts down and deletes the server. + This is useful when starting a dedicated master process. - This method will block until all outstanding rpcs have completed and the - server has been shut down. + ``` + master_server = tf.data.experimental.service.MasterServer(port=5050) + master_server.join() + ``` + + Raises: + tf.errors.OpError: Or one of its subclasses if an error occurs while + joining the server. """ - if self._running: - self._running = False - _pywrap_server_lib.TF_DATA_DeleteMasterServer(self._server) + self._server.join() + + def _stop(self): + """Stops the server. + + Raises: + tf.errors.OpError: Or one of its subclasses if an error occurs while + stopping the server. + """ + self._server.stop() def __del__(self): - self.stop() + self._stop() + + @property + def _address(self): + """Returns the address of the server. + + The returned string will be in the form address:port, e.g. "localhost:1000". + """ + return "localhost:{0}".format(self._server.bound_port()) + + def _num_workers(self): + """Returns the number of workers registered with the master.""" + return self._server.num_workers() class WorkerServer(object): - """An in-process tf.data service worker, for use in testing.""" + """An in-process tf.data service worker server. - def __init__(self, protocol, master_address, port=0): - """Creates and starts a new tf.data worker server. + A `tf.data.experimental.service.WorkerServer` performs `tf.data.Dataset` + processing for user-defined datasets, and provides the resulting elements over + RPC. A worker is associated with a single + `tf.data.experimental.service.MasterServer`. - The server will choose an available port. Use `target()` to get the string - for connecting to the server. + ``` + master_server = tf.data.experimental.service.MasterServer(port=5050) + worker_server = tf.data.experimental.service.WorkerServer( + port=0, master_address="localhost:5050") + dataset = tf.data.Dataset.range(10) + dataset = dataset.apply(tf.data.experimental.service.distribute( + processing_mode="parallel_epochs", service="grpc://localhost:5050")) + ``` + + When starting a dedicated tf.data worker process, use join() to block + indefinitely after starting up the server. + + ``` + worker_server = tf.data.experimental.service.WorkerServer( + port=5050, master_address="grpc://localhost:5050") + worker_server.join() + ``` + """ + + def __init__(self, + port, + master_address, + worker_address=None, + protocol=None, + start=True): + """Creates a new worker server. 
Args: - protocol: A string representing the type of protocol to use when creating - channels. For no security, use "grpc". For local credentials, use - "grpc+local", and make sure your binary links in - `data/service:local_credentials`. - master_address: The address of the tf.data master server to register with. - port: The port to bind to. + port: Specifies the port to bind to. A value of 0 indicates that the + worker can bind to any available port. + master_address: Specifies the address of the master server. + worker_address: (Optional.) Specifies the address of the worker server. + This address is passed to the master server so that the master can tell + clients how to connect to this worker. Defaults to `"localhost:%port%"`, + where `%port%` will be replaced with the port used by the worker. + protocol: (Optional.) Specifies the protocol to be used by the server. + Acceptable values include `"grpc", "grpc+local"`. Defaults to `"grpc"`. + start: (Optional.) Boolean, indicating whether to start the server after + creating it. Defaults to `True`. + + Raises: + tf.errors.OpError: Or one of its subclasses if an error occurs while + creating the TensorFlow server. """ + if worker_address is None: + worker_address = "localhost:%port%" + if protocol is None: + protocol = "grpc" + self._protocol = protocol self._server = _pywrap_server_lib.TF_DATA_NewWorkerServer( - port, protocol, master_address, "localhost:%port%") - self._running = True + port, protocol, master_address, worker_address) + if start: + self._server.start() - @property - def target(self): - """Returns the target for connecting to this server. + def start(self): + """Starts this server. - The returned string will be in the form protocol://address:port, e.g. - "grpc://localhost:1000". + Raises: + tf.errors.OpError: Or one of its subclasses if an error occurs while + starting the server. """ - port = _pywrap_server_lib.TF_DATA_WorkerServerBoundPort(self._server) - return "{0}://localhost:{1}".format(self._protocol, port) + self._server.start() - def stop(self): - """Shuts down and deletes the server. + def join(self): + """Blocks until the server has shut down. - This method will block until all outstanding rpcs have completed and the - server has been shut down. + This is useful when starting a dedicated worker process. + + ``` + worker_server = tf.data.experimental.service.WorkerServer( + port=5050, master_address="grpc://localhost:5050") + worker_server.join() + ``` + + This method currently blocks forever. + + Raises: + tf.errors.OpError: Or one of its subclasses if an error occurs while + joining the server. """ - if self._running: - self._running = False - _pywrap_server_lib.TF_DATA_DeleteWorkerServer(self._server) + self._server.join() + + def _stop(self): + """Stops the server. + + Raises: + tf.errors.OpError: Or one of its subclasses if an error occurs while + stopping the server. + """ + self._server.stop() def __del__(self): - self.stop() + self._stop() + + @property + def _address(self): + """Returns the address of the server. + + The returned string will be in the form address:port, e.g. "localhost:1000". 
+ """ + return "localhost:{0}".format(self._server.bound_port()) diff --git a/tensorflow/python/data/service/server_lib_test.py b/tensorflow/python/data/service/server_lib_test.py index b18262bf52b..59bb731d98e 100644 --- a/tensorflow/python/data/service/server_lib_test.py +++ b/tensorflow/python/data/service/server_lib_test.py @@ -22,20 +22,71 @@ from tensorflow.python.data.service import server_lib from tensorflow.python.platform import test -PROTOCOL = "grpc" - class ServerLibTest(test.TestCase): def testStartMaster(self): - master = server_lib.MasterServer(PROTOCOL) - self.assertRegex(master.target, PROTOCOL + "://.*:.*") + master = server_lib.MasterServer(0, start=False) + master.start() + + def testMultipleStartMaster(self): + master = server_lib.MasterServer(0, start=True) + master.start() def testStartWorker(self): - master = server_lib.MasterServer(PROTOCOL) - worker = server_lib.WorkerServer(PROTOCOL, - master.target[len(PROTOCOL + "://"):]) - self.assertRegex(worker.target, PROTOCOL + "://.*:.*") + master = server_lib.MasterServer(0) + worker = server_lib.WorkerServer(0, master._address, start=False) + worker.start() + + def testMultipleStartWorker(self): + master = server_lib.MasterServer(0) + worker = server_lib.WorkerServer(0, master._address, start=True) + worker.start() + + def testStopMaster(self): + master = server_lib.MasterServer(0) + master._stop() + master._stop() + + def testStopWorker(self): + master = server_lib.MasterServer(0) + worker = server_lib.WorkerServer(0, master._address) + worker._stop() + worker._stop() + + def testStopStartMaster(self): + master = server_lib.MasterServer(0) + master._stop() + with self.assertRaisesRegex( + RuntimeError, "Server cannot be started after it has been stopped"): + master.start() + + def testStopStartWorker(self): + master = server_lib.MasterServer(0) + worker = server_lib.WorkerServer(0, master._address) + worker._stop() + with self.assertRaisesRegex( + RuntimeError, "Server cannot be started after it has been stopped"): + worker.start() + + def testJoinMaster(self): + master = server_lib.MasterServer(0) + master._stop() + master.join() + + def testJoinWorker(self): + master = server_lib.MasterServer(0) + worker = server_lib.WorkerServer(0, master._address) + worker._stop() + worker.join() + + def testMasterNumWorkers(self): + master = server_lib.MasterServer(0) + self.assertEqual(0, master._num_workers()) + worker1 = server_lib.WorkerServer(0, master._address) # pylint: disable=unused-variable + self.assertEqual(1, master._num_workers()) + worker2 = server_lib.WorkerServer(0, master._address) # pylint: disable=unused-variable + self.assertEqual(2, master._num_workers()) if __name__ == "__main__": diff --git a/tensorflow/python/data/service/server_lib_wrapper.cc b/tensorflow/python/data/service/server_lib_wrapper.cc index 8325d74a768..03453a56c7f 100644 --- a/tensorflow/python/data/service/server_lib_wrapper.cc +++ b/tensorflow/python/data/service/server_lib_wrapper.cc @@ -28,8 +28,24 @@ limitations under the License. 
namespace py = pybind11; PYBIND11_MODULE(_pywrap_server_lib, m) { - py::class_(m, "MasterGrpcDataServer"); - py::class_(m, "WorkerGrpcDataServer"); + py::class_(m, "MasterGrpcDataServer") + .def("start", &tensorflow::data::MasterGrpcDataServer::Start) + .def("stop", &tensorflow::data::MasterGrpcDataServer::Stop) + .def("join", &tensorflow::data::MasterGrpcDataServer::Join) + .def("bound_port", &tensorflow::data::MasterGrpcDataServer::BoundPort) + .def("num_workers", + [](tensorflow::data::MasterGrpcDataServer* server) -> int { + int num_workers; + tensorflow::Status status = server->NumWorkers(&num_workers); + tensorflow::MaybeRaiseFromStatus(status); + return num_workers; + }); + + py::class_(m, "WorkerGrpcDataServer") + .def("start", &tensorflow::data::WorkerGrpcDataServer::Start) + .def("stop", &tensorflow::data::WorkerGrpcDataServer::Stop) + .def("join", &tensorflow::data::WorkerGrpcDataServer::Join) + .def("bound_port", &tensorflow::data::WorkerGrpcDataServer::BoundPort); m.def( "TF_DATA_NewMasterServer", @@ -39,27 +55,9 @@ PYBIND11_MODULE(_pywrap_server_lib, m) { tensorflow::Status status = tensorflow::data::NewMasterServer(port, protocol, &server); tensorflow::MaybeRaiseFromStatus(status); - server->Start(); return server; }, py::return_value_policy::reference); - m.def( - "TF_DATA_MasterServerBoundPort", - [](tensorflow::data::MasterGrpcDataServer* server) -> int { - return server->BoundPort(); - }, - py::return_value_policy::copy); - m.def("TF_DATA_DeleteMasterServer", - [](tensorflow::data::MasterGrpcDataServer* server) { server->Stop(); }); - m.def( - "TF_DATA_MasterServerNumTasks", - [](tensorflow::data::MasterGrpcDataServer* server) -> int { - int num_tasks; - tensorflow::Status status = server->NumTasks(&num_tasks); - tensorflow::MaybeRaiseFromStatus(status); - return num_tasks; - }, - py::return_value_policy::copy); m.def( "TF_DATA_NewWorkerServer", @@ -70,16 +68,7 @@ PYBIND11_MODULE(_pywrap_server_lib, m) { tensorflow::Status status = tensorflow::data::NewWorkerServer( port, protocol, master_address, worker_address, &server); tensorflow::MaybeRaiseFromStatus(status); - server->Start(); return server; }, py::return_value_policy::reference); - m.def( - "TF_DATA_WorkerServerBoundPort", - [](tensorflow::data::WorkerGrpcDataServer* server) -> int { - return server->BoundPort(); - }, - py::return_value_policy::copy); - m.def("TF_DATA_DeleteWorkerServer", - [](tensorflow::data::WorkerGrpcDataServer* server) { server->Stop(); }); }; From 4c90f20c0d371c0fe4db6f045ceb2eda147f56d6 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 26 May 2020 22:43:28 +0100 Subject: [PATCH 1153/1533] Replace list concatenation with generators for auto control deps --- tensorflow/python/framework/auto_control_deps.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py index dfe84f14f26..51dcb248b11 100644 --- a/tensorflow/python/framework/auto_control_deps.py +++ b/tensorflow/python/framework/auto_control_deps.py @@ -535,8 +535,10 @@ def _get_resource_inputs(op): # Note: A resource handle that is not written to is treated as read-only. We # don't have a special way of denoting an unused resource. 
- return ([(t, ResourceType.READ_ONLY) for t in reads] + - [(t, ResourceType.READ_WRITE) for t in writes]) + for t in reads: + yield (t, ResourceType.READ_ONLY) + for t in writes: + yield (t, ResourceType.READ_WRITE) def automatic_control_dependencies(f): From aff44e4ca1f54cc0d840191775e76e4a72f76c3f Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Tue, 26 May 2020 14:48:04 -0700 Subject: [PATCH 1154/1533] Speedup python TraceMe PiperOrigin-RevId: 313271773 Change-Id: I6358253077190f43059fed416399852bab29dae6 --- tensorflow/compiler/xla/python/BUILD | 2 +- tensorflow/compiler/xla/python/xla.cc | 20 +++---- tensorflow/python/profiler/internal/BUILD | 6 +- .../profiler/internal/traceme_wrapper.cc | 18 +++--- ...me_context_manager.h => traceme_wrapper.h} | 58 +++++++++---------- tensorflow/python/profiler/trace.py | 8 +-- 6 files changed, 51 insertions(+), 61 deletions(-) rename tensorflow/python/profiler/internal/{traceme_context_manager.h => traceme_wrapper.h} (53%) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 863296c681c..5b4182b75e1 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -261,7 +261,7 @@ pybind_extension( "//tensorflow/core/profiler/lib:profiler_backends", "//tensorflow/core/profiler/lib:profiler_session", "//tensorflow/core/profiler/rpc:profiler_server", - "//tensorflow/python/profiler/internal:traceme_context_manager", + "//tensorflow/python/profiler/internal:traceme_wrapper", "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:platform", ] + select({ diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 4cf2b36db27..abf0937d057 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -64,7 +64,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/profiler/rpc/profiler_server.h" -#include "tensorflow/python/profiler/internal/traceme_context_manager.h" +#include "tensorflow/python/profiler/internal/traceme_wrapper.h" #include "tensorflow/stream_executor/platform.h" namespace xla { @@ -72,7 +72,7 @@ namespace { namespace py = pybind11; -using ::tensorflow::profiler::TraceMeContextManager; +using ::tensorflow::profiler::TraceMeWrapper; struct Uniquer { absl::Mutex mu; @@ -637,23 +637,19 @@ void BuildProfilerSubmodule(py::module* m) { }, py::arg("port")); - py::class_ traceme_class(profiler, "TraceMe", - py::module_local()); + py::class_ traceme_class(profiler, "TraceMe", + py::module_local()); traceme_class.def(py::init()) - .def("__enter__", - [](py::object self) -> py::object { - py::cast(self)->Enter(); - return self; - }) + .def("__enter__", [](py::object self) -> py::object { return self; }) .def("__exit__", [](py::object self, const py::object& ex_type, const py::object& ex_value, const py::object& traceback) -> py::object { - py::cast(self)->Exit(); + py::cast(self)->Stop(); return py::none(); }) - .def("set_metadata", &TraceMeContextManager::SetMetadata) - .def_static("is_enabled", &TraceMeContextManager::IsEnabled); + .def("set_metadata", &TraceMeWrapper::SetMetadata) + .def_static("is_enabled", &TraceMeWrapper::IsEnabled); } } // namespace diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD index b6648462224..6f7193b3207 100644 --- a/tensorflow/python/profiler/internal/BUILD +++ b/tensorflow/python/profiler/internal/BUILD @@ -86,14 +86,14 @@ tf_python_pybind_extension( "//tensorflow/python/profiler:__subpackages__", ], deps = [ - ":traceme_context_manager", + ":traceme_wrapper", "@pybind11", ], ) cc_library( - name = "traceme_context_manager", - hdrs = ["traceme_context_manager.h"], + name = "traceme_wrapper", + hdrs = ["traceme_wrapper.h"], features = ["-layering_check"], visibility = [ "//tensorflow/compiler/xla/python:__pkg__", diff --git a/tensorflow/python/profiler/internal/traceme_wrapper.cc b/tensorflow/python/profiler/internal/traceme_wrapper.cc index b3403fa298f..32a1f423918 100644 --- a/tensorflow/python/profiler/internal/traceme_wrapper.cc +++ b/tensorflow/python/profiler/internal/traceme_wrapper.cc @@ -13,18 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/python/profiler/internal/traceme_wrapper.h" + #include "pybind11/attr.h" #include "pybind11/pybind11.h" -#include "tensorflow/python/profiler/internal/traceme_context_manager.h" -using ::tensorflow::profiler::TraceMeContextManager; +namespace py = ::pybind11; + +using ::tensorflow::profiler::TraceMeWrapper; PYBIND11_MODULE(_pywrap_traceme, m) { - py::class_ traceme_class(m, "TraceMe", - py::module_local()); - traceme_class.def(py::init()) - .def("Enter", &TraceMeContextManager::Enter) - .def("Exit", &TraceMeContextManager::Exit) - .def("SetMetadata", &TraceMeContextManager::SetMetadata) - .def_static("IsEnabled", &TraceMeContextManager::IsEnabled); + py::class_(m, "TraceMe", py::module_local()) + .def(py::init()) + .def("SetMetadata", &TraceMeWrapper::SetMetadata) + .def_static("IsEnabled", &TraceMeWrapper::IsEnabled); }; diff --git a/tensorflow/python/profiler/internal/traceme_context_manager.h b/tensorflow/python/profiler/internal/traceme_wrapper.h similarity index 53% rename from tensorflow/python/profiler/internal/traceme_context_manager.h rename to tensorflow/python/profiler/internal/traceme_wrapper.h index fd281684de8..c074e909640 100644 --- a/tensorflow/python/profiler/internal/traceme_context_manager.h +++ b/tensorflow/python/profiler/internal/traceme_wrapper.h @@ -12,46 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_PYTHON_PROFILER_INTERNAL_TRACEME_CONTEXT_MANAGER_ -#define TENSORFLOW_PYTHON_PROFILER_INTERNAL_TRACEME_CONTEXT_MANAGER_ +#ifndef TENSORFLOW_PYTHON_PROFILER_INTERNAL_TRACEME_WRAPPER_ +#define TENSORFLOW_PYTHON_PROFILER_INTERNAL_TRACEME_WRAPPER_ #include #include #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" -#include "absl/types/optional.h" #include "pybind11/pytypes.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/traceme.h" -namespace py = pybind11; - namespace tensorflow { namespace profiler { -// Helper to implement TraceMe as a context manager in Python. -class TraceMeContextManager { +// Wraps TraceMe with an interface that takes python types. +class TraceMeWrapper { public: - explicit TraceMeContextManager(py::str name, py::kwargs kwargs) - : name_(std::move(name)), kwargs_(std::move(kwargs)) {} + // pybind11::str and pybind11::kwargs are taken by const reference to avoid + // python reference-counting overhead. + TraceMeWrapper(const pybind11::str& name, const pybind11::kwargs& kwargs) + : traceme_([&]() { + std::string name_and_metadata(name); + if (!kwargs.empty()) { + AppendMetadata(&name_and_metadata, kwargs); + } + return name_and_metadata; + }) {} - void Enter() { - if (IsEnabled()) { - traceme_.emplace([this]() { - std::string name(name_); - if (!kwargs_.empty()) { - AppendMetadata(&name, kwargs_); - } - return name; - }); - } - } - - void SetMetadata(py::kwargs kwargs) { - if (TF_PREDICT_TRUE(traceme_.has_value() && !kwargs.empty())) { - traceme_->AppendMetadata([&kwargs]() { + // pybind11::kwargs is taken by const reference to avoid python + // reference-counting overhead. 
+ void SetMetadata(const pybind11::kwargs& kwargs) { + if (TF_PREDICT_FALSE(!kwargs.empty())) { + traceme_.AppendMetadata([&]() { std::string metadata; AppendMetadata(&metadata, kwargs); return metadata; @@ -59,28 +54,27 @@ class TraceMeContextManager { } } - void Exit() { traceme_.reset(); } + void Stop() { traceme_.Stop(); } static bool IsEnabled() { return tensorflow::profiler::TraceMe::Active(); } private: // Converts kwargs to strings and appends them to name encoded as TraceMe // metadata. - static void AppendMetadata(std::string* name, const py::kwargs& kwargs) { + static void AppendMetadata(std::string* name, + const pybind11::kwargs& kwargs) { name->push_back('#'); for (const auto& kv : kwargs) { - absl::StrAppend(name, std::string(py::str(kv.first)), "=", - std::string(py::str(kv.second)), ","); + absl::StrAppend(name, std::string(pybind11::str(kv.first)), "=", + std::string(pybind11::str(kv.second)), ","); } name->back() = '#'; } - py::str name_; - py::kwargs kwargs_; - absl::optional traceme_; + tensorflow::profiler::TraceMe traceme_; }; } // namespace profiler } // namespace tensorflow -#endif // TENSORFLOW_PYTHON_PROFILER_INTERNAL_TRACEME_CONTEXT_MANAGER_ +#endif // TENSORFLOW_PYTHON_PROFILER_INTERNAL_TRACEME_WRAPPER_ diff --git a/tensorflow/python/profiler/trace.py b/tensorflow/python/profiler/trace.py index 2cdbad5118c..ea4eb060488 100644 --- a/tensorflow/python/profiler/trace.py +++ b/tensorflow/python/profiler/trace.py @@ -73,13 +73,13 @@ class Trace(object): training step being traced. """ if _pywrap_traceme.TraceMe.IsEnabled(): + # Creating _pywrap_traceme.TraceMe starts the clock. self._traceme = _pywrap_traceme.TraceMe(name, **kwargs) else: self._traceme = None def __enter__(self): - if self._traceme: - self._traceme.Enter() + # Starting the TraceMe clock here would require an extra Python->C++ call. return self def set_metadata(self, **kwargs): @@ -117,5 +117,5 @@ class Trace(object): self._traceme.SetMetadata(**kwargs) def __exit__(self, exc_type, exc_val, exc_tb): - if self._traceme: - self._traceme.Exit() + # Deallocating _pywrap_traceme.TraceMe stops the clock. + self._traceme = None From 2b58bb4025df1afe47cd9b523a988d4b75f3f89f Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Tue, 26 May 2020 14:58:36 -0700 Subject: [PATCH 1155/1533] Reduce 1-Layer Functional.__call__ overhead by ~25%. Improvements: - Layer._clear_losses - Functional._conform_to_reference_inputs PiperOrigin-RevId: 313273624 Change-Id: I7eccf5f0b984805e3966e1f40281c535b9cb867d --- tensorflow/python/keras/engine/base_layer.py | 11 +++-- tensorflow/python/keras/engine/functional.py | 49 +++++++++++--------- 2 files changed, 33 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 594bf656cfd..b986f9a405e 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -1353,13 +1353,14 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # Possible a loss was added in a Layer's `build`. self._losses.append(symbolic_loss) - @trackable.no_automatic_dependency_tracking def _clear_losses(self): """Used every step in eager to reset losses.""" - self._eager_losses = [] - if hasattr(self, '_layers'): - for layer in trackable_layer_utils.filter_empty_layer_containers( - self._layers): + # Set to thread local directly to avoid Layer.__setattr__ overhead. 
+ self._thread_local._eager_losses = [] + sublayers = getattr(self, '_layers', []) + if sublayers: + sublayers = trackable_layer_utils.filter_empty_layer_containers(sublayers) + for layer in sublayers: layer._clear_losses() @property diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index 761955100ea..4958990ad66 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -25,6 +25,7 @@ import itertools from six.moves import zip # pylint: disable=redefined-builtin +from tensorflow.python.eager import context from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import ops from tensorflow.python.keras import backend @@ -535,34 +536,38 @@ class Functional(training_lib.Model): def _conform_to_reference_input(self, tensor, ref_input): """Set shape and dtype based on `keras.Input`s.""" - # Shape handling (only for non-CompositeTensors). - if isinstance(tensor, ops.Tensor) and isinstance(ref_input, ops.Tensor): + if isinstance(tensor, ops.Tensor): # Allow (None,) and (None, 1) Tensors to be passed interchangably. Use the # shape specified by the `keras.Input`. - if tensor.shape.rank is not None and ref_input.shape.rank is not None: - should_squeeze_last_dim = ( - tensor.shape.rank == ref_input.shape.rank + 1 and - tensor.shape[-1] == 1) - should_expand_last_dim = ( - tensor.shape.rank == ref_input.shape.rank - 1 and - ref_input.shape[-1] == 1) - if should_squeeze_last_dim: + t_shape = tensor.shape + t_rank = t_shape.rank + ref_shape = ref_input.shape + ref_rank = ref_shape.rank + if t_rank is not None and ref_rank is not None: + # Should squeeze last dimension. + # True if tensor is (BATCH, ..., 1) and reference is (BATCH, ...). + if (t_rank == ref_rank + 1 and t_shape[-1] == 1): tensor = array_ops.squeeze_v2(tensor, axis=-1) - elif should_expand_last_dim: + # Should expand last_dimension. + # True if tensor is (BATCH, ...) and reference is (BATCH, ..., 1). + elif (t_rank == ref_rank - 1 and ref_shape[-1] == 1): tensor = array_ops.expand_dims_v2(tensor, axis=-1) - # Add shape hints to Tensors that might have None shape dims but have - # shapes defined by the `keras.Input`. - try: - tensor.set_shape(tensor.shape.merge_with(ref_input.shape)) - except ValueError: - logging.warning( - 'Model was constructed with shape {} for input {}, but it was ' - 'called on an input with incompatible shape {}.'.format( - ref_input.shape, ref_input, tensor.shape)) + # Add shape hints to Tensors that may have None shape dims but have shapes + # defined by the `keras.Input` (not applicable in eager mode). + if not context.executing_eagerly(): + try: + tensor.set_shape(tensor.shape.merge_with(ref_input.shape)) + except ValueError: + logging.warning( + 'Model was constructed with shape {} for input {}, but it was ' + 'called on an input with incompatible shape {}.'.format( + ref_input.shape, ref_input, tensor.shape)) - # Dtype handling. - if isinstance(ref_input, (ops.Tensor, composite_tensor.CompositeTensor)): + # Dtype casting. + tensor = math_ops.cast(tensor, dtype=ref_input.dtype) + elif isinstance(tensor, composite_tensor.CompositeTensor): + # Dtype casting. tensor = math_ops.cast(tensor, dtype=ref_input.dtype) return tensor From c3ded069abd157c5a311e970d943c6f93c5318d0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 15:04:43 -0700 Subject: [PATCH 1156/1533] Surface libcupti errors to OSS overview page. 
PiperOrigin-RevId: 313274858 Change-Id: Ib65176246a378e0fbb8c43ec3eb369555dd43189 --- tensorflow/core/profiler/convert/BUILD | 1 + .../profiler/convert/op_stats_to_overview_page.cc | 1 + .../core/profiler/convert/xplane_to_op_stats.cc | 9 +++++++++ .../core/profiler/convert/xplane_to_op_stats.h | 3 +++ .../profiler/convert/xplane_to_op_stats_test.cc | 12 ++++++++++++ .../core/profiler/internal/gpu/cupti_tracer.cc | 14 +++++++++++--- .../core/profiler/internal/gpu/device_tracer.cc | 6 +++++- 7 files changed, 42 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 369d26a92d9..390f94157c3 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -242,6 +242,7 @@ cc_library( "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", ], ) diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc index bec92e0d998..330b488dc8f 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -294,6 +294,7 @@ OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, bottleneck.input_classification(), bottleneck.input_statement(), "", hardware_type, TfFunctionRecommendationHtml(op_stats.tf_function_db()), overview_page.mutable_recommendation()); + *overview_page.mutable_errors() = op_stats.errors(); return overview_page; } diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc index f008219cbd2..4d2a45747e0 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h" @@ -109,12 +110,20 @@ void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events, } // namespace +void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats) { + if (space.errors().empty()) return; + absl::flat_hash_set unique_errors; + unique_errors.insert(space.errors().begin(), space.errors().end()); + *op_stats->mutable_errors() = {unique_errors.begin(), unique_errors.end()}; +} + OpStats ConvertXSpaceToOpStats(const XSpace& space) { const XPlane* host_plane = FindPlaneWithName(space, kHostThreads); std::vector device_planes = FindPlanesWithPrefix(space, kGpuPlanePrefix); OpStats op_stats; StepEvents step_events; + PropagateXSpaceErrorsToOpStats(space, &op_stats); // Convert device planes. OpMetricsDbCombiner op_metrics_db_combiner( op_stats.mutable_device_op_metrics_db()); diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.h b/tensorflow/core/profiler/convert/xplane_to_op_stats.h index 2d30a5d5fad..4708caa5aae 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.h +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.h @@ -25,6 +25,9 @@ namespace profiler { // NOTE: call GroupTfEvents before if OpStats.step_db needs to be generated. 
OpStats ConvertXSpaceToOpStats(const XSpace& space); +// Propagate and dedup the errors in XSpace and add to OpStats. +void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats); + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc index 7b4652f6c0b..67901e83dd3 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc @@ -185,6 +185,18 @@ TEST(ConcertXPlaneToOpStats, TfFunctionTest) { EXPECT_EQ(not_traced_mode.self_time_ps(), 20); } +TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) { + XSpace space; + static constexpr char kError[] = "host: error"; + *space.add_errors() = kError; + *space.add_errors() = kError; + + OpStats op_stats = ConvertXSpaceToOpStats(space); + + EXPECT_EQ(1, op_stats.errors_size()); + EXPECT_EQ(kError, op_stats.errors(/*index=*/0)); +} + } // namespace } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 51f89bd7b0a..ab16693deae 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/container/node_hash_map.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/host_info.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mem.h" @@ -1264,6 +1265,11 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook { std::vector> cuda_event_recorders_; TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent); }; + +/*static*/ std::string ErrorWithHostname(absl::string_view error_message) { + return absl::StrCat(port::Hostname(), ": ", error_message); +} + } // namespace /*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent( @@ -1669,11 +1675,13 @@ Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id, /*static*/ std::string CuptiTracer::ErrorIfAny() { if (CuptiTracer::NumGpus() == 0) { - return "No GPU detected."; + return ErrorWithHostname("No GPU detected."); } else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) { - return "Insufficient privilege to run libcupti (you need root permission)."; + return ErrorWithHostname( + "Insufficient privilege to run libcupti (you need root permission)."); } else if (CuptiTracer::GetTimestamp() == 0) { - return "Failed to load libcupti (is it installed and accessible?)"; + return ErrorWithHostname( + "Failed to load libcupti (is it installed and accessible?)"); } return ""; } diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index ac6662c8432..0370f6a51f9 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -659,12 +659,16 @@ Status GpuTracer::CollectData(XSpace* space) { case State::kStartedOk: return errors::FailedPrecondition("Cannot collect trace before stopping"); case State::kStartedError: - LOG(ERROR) << "Cannot collect, xprof failed to start"; + LOG(ERROR) << "Cannot collect, profiler failed to start"; return Status::OK(); case State::kStoppedError: VLOG(1) << "No trace data 
collected"; return Status::OK(); case State::kStoppedOk: { + std::string cupti_error = CuptiTracer::ErrorIfAny(); + if (!cupti_error.empty()) { + space->add_errors(cupti_error); + } if (cupti_collector_) { cupti_collector_->Export(space); } From 1de7105aeb3358a290f09c3ee46c5fe760a90c75 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Tue, 26 May 2020 15:11:12 -0700 Subject: [PATCH 1157/1533] Test that person detection example binary can run PiperOrigin-RevId: 313275958 Change-Id: Ie128cccabb6e168b85920f72618530e15477a026 --- WORKSPACE | 8 +++++ .../micro/examples/person_detection/BUILD | 8 ++++- .../person_detection_binary_test.sh | 33 +++++++++++++++++++ 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100755 tensorflow/lite/micro/examples/person_detection/person_detection_binary_test.sh diff --git a/WORKSPACE b/WORKSPACE index 021ed6d2542..ea741c31c7f 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -114,6 +114,14 @@ http_archive( ], ) +http_archive( + name = "person_detect_data", + sha256 = "170542270da256994ce24d1e357f6e84a54fdaf7d28ff2b74725a40b70b082cf", + urls = [ + "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_grayscale_2020_05_24.zip", + ], +) + # Required for dependency @com_github_grpc_grpc load("@com_github_grpc_grpc//bazel:grpc_deps.bzl", "grpc_deps") diff --git a/tensorflow/lite/micro/examples/person_detection/BUILD b/tensorflow/lite/micro/examples/person_detection/BUILD index 75c1bf61fa8..84eddba73d4 100644 --- a/tensorflow/lite/micro/examples/person_detection/BUILD +++ b/tensorflow/lite/micro/examples/person_detection/BUILD @@ -23,7 +23,7 @@ cc_library( cc_library( name = "person_detect_model_data", srcs = [ - "person_detect_model_data.cc", + "@person_detect_data//:person_detect_model_data", ], hdrs = [ "person_detect_model_data.h", @@ -118,3 +118,9 @@ cc_binary( "//tensorflow/lite/schema:schema_fbs", ], ) + +sh_test( + name = "person_detection_binary_test", + srcs = ["person_detection_binary_test.sh"], + data = [":person_detection"], +) diff --git a/tensorflow/lite/micro/examples/person_detection/person_detection_binary_test.sh b/tensorflow/lite/micro/examples/person_detection/person_detection_binary_test.sh new file mode 100755 index 00000000000..00d985d19bf --- /dev/null +++ b/tensorflow/lite/micro/examples/person_detection/person_detection_binary_test.sh @@ -0,0 +1,33 @@ +#!/bin/bash +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Bash unit tests for the example binary. + +set -e + +OUTPUT_LOG_FILE=${TEST_TMPDIR}/output_log.txt + +# Needed for copybara compatibility. +SCRIPT_BASE_DIR=/org_"tensor"flow +${TEST_SRCDIR}${SCRIPT_BASE_DIR}/tensorflow/lite/micro/examples/person_detection/person_detection 2>&1 | head > ${OUTPUT_LOG_FILE} + +if ! 
grep -q 'person score' ${OUTPUT_LOG_FILE}; then + echo "ERROR: Expected logs not found in output '${OUTPUT_LOG_FILE}'" + exit 1 +fi + +echo +echo "SUCCESS: person_detection_binary_test PASSED" From 0e0ea56c3afbacdfa02f6d0706d7ea21f4f67827 Mon Sep 17 00:00:00 2001 From: Teng Lu Date: Thu, 23 Apr 2020 20:57:25 +0800 Subject: [PATCH 1158/1533] Implement new DNNL1.x MatMul primitive cache. --- tensorflow/core/kernels/mkl_matmul_op.cc | 9 +- .../core/kernels/mkl_matmul_ops_common.h | 268 ++++++++++++++---- 2 files changed, 218 insertions(+), 59 deletions(-) diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc index 3eccf97f53c..687f67f6283 100644 --- a/tensorflow/core/kernels/mkl_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_matmul_op.cc @@ -168,14 +168,13 @@ class MklMatMulOp : public OpKernel { const int index_transa = transa ? 1 : 0; const int index_transb = transb ? 1 : 0; - Tensor c_float; - OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, {m, n}, &c_float)); #ifdef ENABLE_MKLDNN_V1 const char ftrans[] = {'N', 'T', 'C'}; dnnl_gemm(ftrans[index_transa], ftrans[index_transb], m, n, k, - alpha, a, lda, b, ldb, beta, - c_float.flat().data(), ldc); + alpha, a, lda, b, ldb, beta, c, ldc); #else + Tensor c_float; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, {m, n}, &c_float)); const char* const ftrans[] = {"N", "T", "C"}; // MKL-DNN only supports the Fortran API and requires column major while @@ -185,8 +184,8 @@ class MklMatMulOp : public OpKernel { reinterpret_cast(b), &ldb, reinterpret_cast(a), &lda, &beta, c_float.flat().data(), &ldc); -#endif // ENABLE_MKLDNN_V1 FloatToBFloat16(c_float.flat().data(), c, c_float.NumElements()); +#endif // ENABLE_MKLDNN_V1 } #endif // ENABLE_INTEL_MKL_BFLOAT16 }; diff --git a/tensorflow/core/kernels/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl_matmul_ops_common.h index ab816ce73fa..c069e0e6b59 100644 --- a/tensorflow/core/kernels/mkl_matmul_ops_common.h +++ b/tensorflow/core/kernels/mkl_matmul_ops_common.h @@ -516,33 +516,191 @@ class MklDnnMatMulOpBase : public OpKernel { // MatMul support for bfloat16 and int8 types is introduced in DNNLv1.2. #ifdef ENABLE_MKLDNN_V1 + +using mkldnn::matmul; + namespace { -void dnnl_gemm_exec(const memory::desc& a_md, const memory::desc& b_md, - const memory::desc& c_md, const void* a, const void* b, - void* c, const primitive_attr& attr) { - // Create a MatMul primitive - mkldnn::engine cpu_engine = mkldnn::engine(ENGINE_CPU, 0); - mkldnn::matmul::desc matmul_desc(a_md, b_md, c_md); - mkldnn::matmul::primitive_desc matmul_pd(matmul_desc, attr, cpu_engine); - mkldnn::matmul matmul_prim(matmul_pd); - // Wrap raw pointers into DNNL memory objects - mkldnn::memory a_memory(a_md, cpu_engine, const_cast(a)); - mkldnn::memory b_memory(b_md, cpu_engine, const_cast(b)); - mkldnn::memory c_memory(c_md, cpu_engine, c); - // Execute the MatMul primitive. - // Since here all shapes and parameters are static, please note that we - // don't need to pass alpha (scales) again, as they are already hard-coded - // in the primitive descriptor. Also, we are not allowed to change the - // shapes of matrices A, B, and C -- they should exactly match - // the memory descriptors passed to MatMul operation descriptor. 
- mkldnn::stream s(cpu_engine); - matmul_prim.execute(s, {{DNNL_ARG_SRC, a_memory}, - {DNNL_ARG_WEIGHTS, b_memory}, - { DNNL_ARG_DST, - c_memory }}); - s.wait(); -} +struct MklMatMulParams { + memory::dims a_dims; + memory::dims b_dims; + memory::dims c_dims; + memory::dims a_strides; + memory::dims b_strides; + memory::dims c_strides; + + MklMatMulParams(memory::dims a_dims, memory::dims b_dims, memory::dims c_dims, + memory::dims a_strides, memory::dims b_strides, + memory::dims c_strides) + : a_dims(a_dims), + b_dims(b_dims), + c_dims(c_dims), + a_strides(a_strides), + b_strides(b_strides), + c_strides(c_strides) {} +}; + +template +class MklMatMulPrimitive : public MklPrimitive { + public: + explicit MklMatMulPrimitive(const MklMatMulParams& params) + : cpu_engine_(ENGINE_CPU, 0) { + context_.stream.reset(new CPU_STREAM(cpu_engine_)); + // Create matmul primitive + if (context_.matmul_prim == nullptr) { + Setup(params); + } + } + + ~MklMatMulPrimitive() {} + + void Execute(const T* a_data, const T* b_data, T* c_data) { + context_.a_mem->set_data_handle(static_cast(const_cast(a_data))); + context_.b_mem->set_data_handle(static_cast(const_cast(b_data))); + context_.c_mem->set_data_handle(static_cast(const_cast(c_data))); + + execute_primitives(context_.matmul_primtimives, context_.stream, + context_.net_args); + + // After execution, set data handle back + context_.a_mem->set_data_handle(DummyData); + context_.b_mem->set_data_handle(DummyData); + context_.c_mem->set_data_handle(DummyData); + } + + private: + // Primitive reuse context for MatMul op + struct MklMatMulContext { + // MKL-DNN memory. + std::shared_ptr a_mem; + std::shared_ptr b_mem; + std::shared_ptr c_mem; + + // Descriptor and primitive-descriptor for MatMul. + std::shared_ptr desc; + std::shared_ptr prim_desc; + + // Memory descriptors. + std::shared_ptr a_md; + std::shared_ptr b_md; + std::shared_ptr c_md; + + // MatMul primitive. + std::shared_ptr matmul_prim; + std::shared_ptr stream; + std::vector matmul_primtimives; + std::vector> net_args; + + MklMatMulContext() + : a_mem(nullptr), + b_mem(nullptr), + c_mem(nullptr), + desc(nullptr), + prim_desc(nullptr), + a_md(nullptr), + b_md(nullptr), + c_md(nullptr), + matmul_prim(nullptr), + stream(nullptr) {} + }; + + void Setup(const MklMatMulParams& params) { + // Create MatMul descriptor and primitive descriptor. + context_.a_md.reset( + new memory::desc({params.a_dims}, MklDnnType(), params.a_strides)); + + context_.b_md.reset( + new memory::desc({params.b_dims}, MklDnnType(), params.b_strides)); + + context_.c_md.reset( + new memory::desc({params.c_dims}, MklDnnType(), params.c_strides)); + + // Create matmul. + context_.desc.reset( + new matmul::desc(*context_.a_md, *context_.b_md, *context_.c_md)); + context_.prim_desc.reset( + new matmul::primitive_desc(*context_.desc, cpu_engine_)); + + // Create memory primitive based on dummy data. + context_.a_mem.reset( + new mkldnn::memory(*context_.a_md, cpu_engine_, DummyData)); + context_.b_mem.reset( + new mkldnn::memory(*context_.b_md, cpu_engine_, DummyData)); + context_.c_mem.reset( + new mkldnn::memory(*context_.b_md, cpu_engine_, DummyData)); + + // Create matmul primitive. 
+ context_.matmul_prim.reset(new mkldnn::matmul(*context_.prim_desc)); + context_.net_args.push_back({{MKLDNN_ARG_SRC, *context_.a_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.b_mem}, + {MKLDNN_ARG_DST, *context_.c_mem}}); + + context_.matmul_primtimives.push_back(*context_.matmul_prim); + return; + } + + struct MklMatMulContext context_; + engine cpu_engine_; +}; + +template +class MklMatMulPrimitiveFactory : public MklPrimitiveFactory { + public: + static MklMatMulPrimitive* Get(const MklMatMulParams& params, + bool do_not_cache) { + MklMatMulPrimitive* matmul_prim = nullptr; + + if (do_not_cache) { + // Always create new primitive + matmul_prim = new MklMatMulPrimitive(params); + } else { + // Try to find a suitable one in pool + matmul_prim = dynamic_cast*>( + MklMatMulPrimitiveFactory::GetInstance().GetMklMatMul(params)); + if (matmul_prim == nullptr) { + matmul_prim = new MklMatMulPrimitive(params); + MklMatMulPrimitiveFactory::GetInstance().SetMklMatMul(params, + matmul_prim); + } + } + + return matmul_prim; + } + + private: + MklMatMulPrimitiveFactory() {} + ~MklMatMulPrimitiveFactory() {} + + static MklMatMulPrimitiveFactory& GetInstance() { + static MklMatMulPrimitiveFactory instance_; + return instance_; + } + + static string CreateKey(const MklMatMulParams& params) { + string prefix = "matmul_"; + FactoryKeyCreator key_creator; + key_creator.AddAsKey(prefix); + key_creator.AddAsKey(params.a_dims); + key_creator.AddAsKey(params.b_dims); + key_creator.AddAsKey(params.c_dims); + key_creator.AddAsKey(params.a_strides); + key_creator.AddAsKey(params.b_strides); + key_creator.AddAsKey(params.c_strides); + key_creator.AddAsKey(typeid(T).name()); + + return key_creator.GetKey(); + } + + MklPrimitive* GetMklMatMul(const MklMatMulParams& params) { + string key = CreateKey(params); + return this->GetOp(key); + } + + void SetMklMatMul(const MklMatMulParams& params, MklPrimitive* op) { + string key = CreateKey(params); + this->SetOp(key, op); + } +}; template void dnnl_gemm_batch(const std::vector& transa, @@ -589,45 +747,47 @@ void dnnl_gemm_batch(const std::vector& transa, !transb[0] ? dims{k[0] * n[0], n[0], 1} : dims{n[0] * k[0], 1, k[0]}; dims c_strides = dims{m[0] * n[0], n[0], 1}; - // Prepare memory descriptors - memory::desc a_md(a_sizes, MklDnnType(), a_strides); - memory::desc b_md(b_sizes, MklDnnType(), b_strides); - memory::desc c_md(c_sizes, MklDnnType(), c_strides); - // Create attributes (to handle alpha and beta if necessary) - mkldnn::primitive_attr attr; - if (alpha[0] != 1.f) attr.set_output_scales(/* mask */ 0, {alpha[0]}); - if (beta[0] != 0.f) { - mkldnn::post_ops po; - po.append_sum(beta[0]); - attr.set_post_ops(po); - } - dnnl_gemm_exec(a_md, b_md, c_md, static_cast(a), - static_cast(b), static_cast(c), attr); + // MklMatMul uses const alpha and beta, make guarantee here to ensure + // they are never changed. + DCHECK_EQ(alpha, 1.0f); + DCHECK_EQ(beta, 0.f); + + MklMatMulParams params(a_sizes, b_sizes, c_sizes, a_strides, b_strides, + c_strides); + MklMatMulPrimitive* matmul_prim = + MklMatMulPrimitiveFactory::Get(params, 0); + + // Execute matmul primitive. 
+ matmul_prim->Execute(a, b, c); } template void dnnl_gemm(char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const T* a, int64_t lda, const T* b, int64_t ldb, - float beta, float* c, int64_t ldc) { + float beta, T* c, int64_t ldc) { using dims = mkldnn::memory::dims; + // Prepare strides based on the transa and transb flags: transposed // matrices have strides swapped + dims a_dims = dims{m, k}; + dims b_dims = dims{k, n}; + dims c_dims = dims{m, n}; dims a_strides = tolower(transa) == 'n' ? dims{lda, 1} : dims{1, lda}; dims b_strides = tolower(transb) == 'n' ? dims{ldb, 1} : dims{1, ldb}; - // Prepare memory descriptors - memory::desc a_md({m, k}, MklDnnType(), a_strides); - memory::desc b_md({k, n}, MklDnnType(), b_strides); - memory::desc c_md({m, n}, MklDnnType(), {ldc, 1}); - // Create attributes (to handle alpha and beta if necessary) - mkldnn::primitive_attr attr; - if (alpha != 1.f) attr.set_output_scales(/* mask */ 0, {alpha}); - if (beta != 0.f) { - mkldnn::post_ops po; - po.append_sum(beta); - attr.set_post_ops(po); - } - dnnl_gemm_exec(a_md, b_md, c_md, static_cast(a), - static_cast(b), static_cast(c), attr); + dims c_strides = dims{ldc, 1}; + + // MklMatMul uses const alpha and beta, make guarantee here to ensure + // they are never changed. + DCHECK_EQ(alpha, 1.0f); + DCHECK_EQ(beta, 0.f); + + MklMatMulParams params(a_dims, b_dims, c_dims, a_strides, b_strides, + c_strides); + MklMatMulPrimitive* matmul_prim = + MklMatMulPrimitiveFactory::Get(params, 0); + + // Execute matmul primitive. + matmul_prim->Execute(a, b, c); } } // anonymous namespace From 53037dcd6612709c6b58367dada2850ff0e5ed60 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 15:34:15 -0700 Subject: [PATCH 1159/1533] [TF/XLA] Ignore _noinline inside force-compiled clusters The code surrounding the handling of _noinline functions is very rarely hit, and as a result is not well tested. For now, the better approach is to follow a more well-lit codepath and try to minimize the use of _noinline functions. As a starting point, inline blocks even with _noinline inside force-compiled blocks. PiperOrigin-RevId: 313280139 Change-Id: I9f2d9b95d4bfe15eb2acea2a3d101b82355c14d5 --- tensorflow/compiler/tf2xla/BUILD | 1 - tensorflow/compiler/tf2xla/xla_compiler.cc | 7 ------- .../core/common_runtime/graph_optimizer.cc | 17 ++++++--------- .../core/common_runtime/graph_optimizer.h | 6 +----- .../python/eager/def_function_xla_jit_test.py | 21 ------------------- 5 files changed, 7 insertions(+), 45 deletions(-) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 37110442b26..55341c0a01f 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -350,7 +350,6 @@ cc_library( ":sharding_util", ":side_effect_util", ":tf2xla_util", - "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", "//tensorflow/compiler/jit:xla_cluster_util", diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 24ad1e1e311..3d6083621f4 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -20,7 +20,6 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/types/variant.h" -#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" @@ -572,10 +571,6 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { std::unique_ptr graph(new Graph(options_.flib_def)); CopyGraph(*fbody->graph, graph.get()); - bool is_inside_mustcompile; - TryGetNodeAttr(AttrSlice(&fbody->fdef.attr()), kXlaMustCompileAttr, - &is_inside_mustcompile); - // Performs a first function inlining pass before shape inference, since // otherwise shape inference can't see inside functions and a comprehensive // shape_map, including function ops, is needed to constant-propagate Shape @@ -627,8 +622,6 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { graph_optimizer_options.inline_multi_device_functions = true; graph_optimizer_options.inline_impl_selection_group_functions = true; graph_optimizer_options.inline_with_single_device_body_placer = true; - graph_optimizer_options.ignore_noinline = is_inside_mustcompile; - optimizer.Optimize(flib_runtime_, flib_runtime_->env(), /*device=*/nullptr, &graph, graph_optimizer_options); diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index ae1a2daa788..746930750ad 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -42,7 +42,7 @@ void GraphOptimizer::Optimize( const NodePredicate& cse_consider_fn, const NodePredicate& cf_consider_fn, bool inline_multi_device_functions, bool inline_impl_selection_group_functions, - bool inline_with_single_device_body_placer, bool ignore_noinline) { + bool inline_with_single_device_body_placer) { Graph* g = graph->get(); DumpGraph("Initial", g); @@ -116,11 +116,6 @@ void GraphOptimizer::Optimize( .inline_impl_selection_group_functions = true; } - if (ignore_noinline) { - expand_inline_opts.multi_device_options.ignore_noinline = true; - expand_inline_opts.native_options.ignore_noinline = true; - } - bool was_mutated = ExpandInlineFunctions(runtime, g, expand_inline_opts); if (was_mutated) { DumpGraph("ExpandInlineFunctions", g); @@ -143,11 +138,11 @@ void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Env* env, const Device* device, std::unique_ptr* graph, const Options& options) { - Optimize( - runtime, env, device, graph, options.shape_map, options.cse_consider_fn, - options.cf_consider_fn, options.inline_multi_device_functions, - options.inline_impl_selection_group_functions, - options.inline_with_single_device_body_placer, options.ignore_noinline); + Optimize(runtime, env, device, graph, options.shape_map, + options.cse_consider_fn, options.cf_consider_fn, + options.inline_multi_device_functions, + options.inline_impl_selection_group_functions, + options.inline_with_single_device_body_placer); } void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr* g, diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h index 53bf532bd9c..099ea8efa12 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.h +++ b/tensorflow/core/common_runtime/graph_optimizer.h @@ -58,9 +58,6 @@ class GraphOptimizer { // If true all functions will be inlined with a single device function // body placer strategy. 
bool inline_with_single_device_body_placer = false; - - // If true, the _noinline attribute on functions and callers is ignored. - bool ignore_noinline = false; }; explicit GraphOptimizer(const OptimizerOptions& opts); @@ -84,8 +81,7 @@ class GraphOptimizer { const NodePredicate& cf_consider_fn = nullptr, bool inline_multi_device_functions = false, bool inline_impl_selection_group_functions = false, - bool inline_with_single_device_body_placer = false, - bool ignore_noinline = false); + bool inline_with_single_device_body_placer = false); const OptimizerOptions& options() { return opts_; } diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py index b63a3b434d4..5fdf0487333 100644 --- a/tensorflow/python/eager/def_function_xla_jit_test.py +++ b/tensorflow/python/eager/def_function_xla_jit_test.py @@ -355,27 +355,6 @@ class DefFunctionTest(test.TestCase): self.assertAllClose([5.0, 5.0, 5.0], g()) self.assertAllClose(compiled_g(), g()) - def testTensorListConcatGradNestedCompile(self): - - @def_function.function(experimental_compile=True) - def f(x): - ta = tensor_array_ops.TensorArray( - dtype=dtypes.float32, size=2, element_shape=[3]) - ta = ta.write(0, 2 * x) - ta = ta.write(1, 3 * x) - return ta.concat() - - @def_function.function(experimental_compile=True) - def g(): - x = constant_op.constant([3.14, 2.68, 7.69]) - with backprop.GradientTape() as tape: - tape.watch(x) - y = f(x) - out = tape.gradient(y, x) - return out - - self.assertAllClose([5.0, 5.0, 5.0], g()) - def testCumsum(self): @def_function.function(experimental_compile=True) From bba3595ebf353fe56b37d0913e3fb7d25ada8d5d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 15:35:21 -0700 Subject: [PATCH 1160/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 313280323 Change-Id: Ic02129a6e8c2684b823122096517c941a3fd2ba6 --- tensorflow/go/op/wrappers.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 530ea2fad58..33eba9a734f 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -33265,6 +33265,14 @@ func TPUReplicatedInputIndex(value int64) TPUReplicatedInputAttr { } } +// TPUReplicatedInputIsPacked sets the optional is_packed attribute to value. +// If not specified, defaults to false +func TPUReplicatedInputIsPacked(value bool) TPUReplicatedInputAttr { + return func(m optionalAttr) { + m["is_packed"] = value + } +} + // Connects N inputs to an N-way replicated TPU computation. // // This operation holds a replicated input to a `tpu.replicate()` computation subgraph. From fe523d826dd6e2843058c6ff9ef6217bc450de0f Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Tue, 26 May 2020 15:59:48 -0700 Subject: [PATCH 1161/1533] [XLA:TPU] Move per-memory-space bytes read/written code to HloCostAnalysis. 
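A rough usage sketch of the accessors this moves into HloCostAnalysis
(illustrative only: the byte-size callback, the computation variable, and the
memory-space id 1 below are assumptions for the example, not part of this
change):

    HloCostAnalysis analysis([](const Shape& shape) {
      return ShapeUtil::ByteSizeOf(shape, /*pointer_size=*/8);
    });
    TF_RETURN_IF_ERROR(computation->Accept(&analysis));
    for (const HloInstruction* instr : computation->instructions()) {
      // Total traffic for this HLO, and traffic against memory space 1 only.
      int64 read = analysis.GetBytesRead(*instr);
      int64 alt_read = analysis.GetBytesRead(*instr, /*memory_space=*/1);
      int64 written = analysis.GetBytesWritten(*instr);
      VLOG(2) << instr->name() << ": read=" << read
              << " alt_read=" << alt_read << " written=" << written;
    }

When no memory space is passed, the result is the sum over all leaf shapes of
the operands (for reads) or of the output (for writes), so the two calls above
differ only by the layout-based memory-space filter.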
PiperOrigin-RevId: 313284279 Change-Id: I544c7089c51cb4dad733732149e5bb8fb3b05fa9 --- .../compiler/xla/service/hlo_cost_analysis.cc | 36 +++++++++++++++++++ .../compiler/xla/service/hlo_cost_analysis.h | 8 +++++ 2 files changed, 44 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc index 50ba2077411..8a31bc5fef4 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -1041,6 +1041,42 @@ float HloCostAnalysis::optimal_seconds(const HloInstruction& hlo) const { return GetPropertyForHlo(hlo, kOptimalSecondsKey, hlo_properties_); } +int64 HloCostAnalysis::GetBytesRead(const HloInstruction& hlo, + absl::optional memory_space) const { + int64 bytes_read = 0; + for (int operand_number = 0; operand_number < hlo.operand_count(); + ++operand_number) { + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(hlo.operand(operand_number)->shape())) { + absl::optional index_memory_space; + if (indexed_shape.shape.has_layout()) { + index_memory_space = indexed_shape.shape.layout().memory_space(); + } + if (!memory_space || memory_space == index_memory_space) { + bytes_read += + operand_bytes_accessed(hlo, operand_number, indexed_shape.index); + } + } + } + return bytes_read; +} + +int64 HloCostAnalysis::GetBytesWritten( + const HloInstruction& hlo, absl::optional memory_space) const { + int64 bytes_written = 0; + for (const ShapeUtil::IndexedShape& indexed_shape : + ShapeUtil::GetLeafShapes(hlo.shape())) { + absl::optional index_memory_space; + if (indexed_shape.shape.has_layout()) { + index_memory_space = indexed_shape.shape.layout().memory_space(); + } + if (!memory_space || memory_space == index_memory_space) { + bytes_written += output_bytes_accessed(hlo, indexed_shape.index); + } + } + return bytes_written; +} + StatusOr HloCostAnalysis::ProcessSubcomputation( HloComputation* computation) { auto visitor = CreateNestedCostAnalysis(shape_size_, per_second_rates_); diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h index 634a6c0572c..d9085dd7785 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h @@ -164,6 +164,14 @@ class HloCostAnalysis : public ConstDfsHloVisitor { ShapeIndex index = {}) const; float optimal_seconds(const HloInstruction& hlo) const; + // Get bytes read/written by this HLO. If memory_space is provided, it returns + // the bytes read/written from/to the given memory space only. 
+ int64 GetBytesRead(const HloInstruction& hlo, + absl::optional memory_space = absl::nullopt) const; + int64 GetBytesWritten( + const HloInstruction& hlo, + absl::optional memory_space = absl::nullopt) const; + const Properties& properties() const { return properties_sum_; } const float property(const string& key) const { return GetProperty(key, properties()); From 6a0bdbdb7c48a3491ae1277083ae3dafb4ab4d7a Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Tue, 26 May 2020 16:00:59 -0700 Subject: [PATCH 1162/1533] Remove timeout="long" in model_coverage_test PiperOrigin-RevId: 313284525 Change-Id: I10f224f331119911b44048389ff2f4f240309fb0 --- tensorflow/lite/build_def.bzl | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index 4af4bd4aae8..f6cdb981328 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -702,7 +702,6 @@ def gen_model_coverage_test(src, model_name, data, failure_type, tags, size = "m "//tensorflow/lite/python:lite", "//tensorflow/python:client_testlib", ] + flex_dep(target_op_sets), - timeout = "long", ) def if_tflite_experimental_runtime(if_eager, if_non_eager, if_none = []): From 3f423f882b7dd975799fdc6872e00172676c0b54 Mon Sep 17 00:00:00 2001 From: Robert David Date: Tue, 26 May 2020 16:23:56 -0700 Subject: [PATCH 1163/1533] Exclude Pixel-specific code from compilation when not on Android. PiperOrigin-RevId: 313288675 Change-Id: I5883edf3fc9f90eb6d7bdeac02bb32bd2949ff7b --- .../lite/delegates/gpu/cl/opencl_wrapper.cc | 39 +++++++++++-------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc index fadaabe32a0..bdaa807d83c 100644 --- a/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc +++ b/tensorflow/lite/delegates/gpu/cl/opencl_wrapper.cc @@ -26,44 +26,51 @@ namespace tflite { namespace gpu { namespace cl { +#ifdef __ANDROID__ #define LoadFunction(function) \ if (is_pixel) { \ function = reinterpret_cast(loadOpenCLPointer(#function)); \ } else { \ function = reinterpret_cast(dlsym(libopencl, #function)); \ } +#else +#define LoadFunction(function) \ + function = reinterpret_cast(dlsym(libopencl, #function)); +#endif absl::Status LoadOpenCL() { void* libopencl = dlopen("libOpenCL.so", RTLD_NOW | RTLD_LOCAL); if (libopencl) { LoadOpenCLFunctions(libopencl, false); return absl::OkStatus(); - } else { - // record error - std::string error(dlerror()); - // Pixel phone? - libopencl = dlopen("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL); - if (libopencl) { - typedef void (*enableOpenCL_t)(); - enableOpenCL_t enableOpenCL = - reinterpret_cast(dlsym(libopencl, "enableOpenCL")); - enableOpenCL(); - LoadOpenCLFunctions(libopencl, true); - return absl::OkStatus(); - } else { - return absl::UnknownError( - absl::StrCat("Can not open OpenCL library on this device - ", error)); - } } + // record error + std::string error(dlerror()); +#ifdef __ANDROID__ + // Pixel phone? 
+ libopencl = dlopen("libOpenCL-pixel.so", RTLD_NOW | RTLD_LOCAL); + if (libopencl) { + typedef void (*enableOpenCL_t)(); + enableOpenCL_t enableOpenCL = + reinterpret_cast(dlsym(libopencl, "enableOpenCL")); + enableOpenCL(); + LoadOpenCLFunctions(libopencl, true); + return absl::OkStatus(); + } +#endif + return absl::UnknownError( + absl::StrCat("Can not open OpenCL library on this device - ", error)); } void LoadOpenCLFunctions(void* libopencl, bool is_pixel) { +#ifdef __ANDROID__ typedef void* (*loadOpenCLPointer_t)(const char* name); loadOpenCLPointer_t loadOpenCLPointer; if (is_pixel) { loadOpenCLPointer = reinterpret_cast( dlsym(libopencl, "loadOpenCLPointer")); } +#endif LoadFunction(clGetPlatformIDs); LoadFunction(clGetPlatformInfo); From ba074621688ae499311175612694fcdae8b7809d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 16:26:09 -0700 Subject: [PATCH 1164/1533] Update ops-related pbtxt files. PiperOrigin-RevId: 313289048 Change-Id: I7fd33d96318f8f0c03b538defbba16eb1044c38b --- .../ops_history_v2/TPUReplicatedInput.pbtxt | 43 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 7 +++ 2 files changed, 50 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history_v2/TPUReplicatedInput.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TPUReplicatedInput.pbtxt index a293537e36d..b549b570c13 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/TPUReplicatedInput.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/TPUReplicatedInput.pbtxt @@ -56,3 +56,46 @@ op { } } } +op { + name: "TPUReplicatedInput" + input_arg { + name: "inputs" + type_attr: "T" + number_attr: "N" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "N" + type: "int" + has_minimum: true + minimum: 1 + } + attr { + name: "T" + type: "type" + } + attr { + name: "is_mirrored_variable" + type: "bool" + default_value { + b: false + } + } + attr { + name: "index" + type: "int" + default_value { + i: -1 + } + } + attr { + name: "is_packed" + type: "bool" + default_value { + b: false + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 2f6e0dc0d4c..e2f2e3d00fa 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -49929,6 +49929,13 @@ op { i: -1 } } + attr { + name: "is_packed" + type: "bool" + default_value { + b: false + } + } } op { name: "TPUReplicatedOutput" From 3f0957496a2eadf3a07a164114e1224b61a22a5c Mon Sep 17 00:00:00 2001 From: tenglu Date: Wed, 27 May 2020 07:49:28 +0800 Subject: [PATCH 1165/1533] Change class member to local var for MatMul primitive cache. 
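For reference, the way a cached primitive is obtained and run does not change
(sketch with placeholder dims, strides and data pointers; the float
instantiation is only an example):

    MklMatMulParams params(a_dims, b_dims, c_dims, a_strides, b_strides,
                           c_strides);
    MklMatMulPrimitive<float>* matmul_prim =
        MklMatMulPrimitiveFactory<float>::Get(params, /*do_not_cache=*/false);
    matmul_prim->Execute(a, b, c);

Execute() only replays the primitives recorded in the context during Setup(),
so the shared_ptr that briefly owns the mkldnn::matmul inside Setup() does not
need to be a class member.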
--- tensorflow/core/kernels/mkl_matmul_ops_common.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl_matmul_ops_common.h index c069e0e6b59..201ca773f45 100644 --- a/tensorflow/core/kernels/mkl_matmul_ops_common.h +++ b/tensorflow/core/kernels/mkl_matmul_ops_common.h @@ -547,9 +547,7 @@ class MklMatMulPrimitive : public MklPrimitive { : cpu_engine_(ENGINE_CPU, 0) { context_.stream.reset(new CPU_STREAM(cpu_engine_)); // Create matmul primitive - if (context_.matmul_prim == nullptr) { - Setup(params); - } + Setup(params); } ~MklMatMulPrimitive() {} @@ -559,7 +557,7 @@ class MklMatMulPrimitive : public MklPrimitive { context_.b_mem->set_data_handle(static_cast(const_cast(b_data))); context_.c_mem->set_data_handle(static_cast(const_cast(c_data))); - execute_primitives(context_.matmul_primtimives, context_.stream, + execute_primitives(context_.matmul_primitives, context_.stream, context_.net_args); // After execution, set data handle back @@ -586,9 +584,8 @@ class MklMatMulPrimitive : public MklPrimitive { std::shared_ptr c_md; // MatMul primitive. - std::shared_ptr matmul_prim; std::shared_ptr stream; - std::vector matmul_primtimives; + std::vector matmul_primitives; std::vector> net_args; MklMatMulContext() @@ -600,11 +597,12 @@ class MklMatMulPrimitive : public MklPrimitive { a_md(nullptr), b_md(nullptr), c_md(nullptr), - matmul_prim(nullptr), stream(nullptr) {} }; void Setup(const MklMatMulParams& params) { + std::shared_ptr matmul_primitive = nullptr; + // Create MatMul descriptor and primitive descriptor. context_.a_md.reset( new memory::desc({params.a_dims}, MklDnnType(), params.a_strides)); @@ -630,12 +628,12 @@ class MklMatMulPrimitive : public MklPrimitive { new mkldnn::memory(*context_.b_md, cpu_engine_, DummyData)); // Create matmul primitive. - context_.matmul_prim.reset(new mkldnn::matmul(*context_.prim_desc)); + matmul_primitive.reset(new mkldnn::matmul(*context_.prim_desc)); context_.net_args.push_back({{MKLDNN_ARG_SRC, *context_.a_mem}, {MKLDNN_ARG_WEIGHTS, *context_.b_mem}, {MKLDNN_ARG_DST, *context_.c_mem}}); - context_.matmul_primtimives.push_back(*context_.matmul_prim); + context_.matmul_primitives.push_back(*matmul_primitive); return; } From b6f542de704c4f1b8897f2a8c7c359cddb9bd043 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 17:26:27 -0700 Subject: [PATCH 1166/1533] [Profiler} Add link to a doc that describes how to use the profiler to debug tf.data performance. 
PiperOrigin-RevId: 313298827 Change-Id: Idb1378b1efcb4f09225af5d23044e94737dd92ce --- tensorflow/core/profiler/convert/op_stats_to_overview_page.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc index 330b488dc8f..666463fc0bb 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -97,6 +97,9 @@ void ComputeFaqTips(OverviewPageRecommendation* re) { } void ComputeDocumentationTips(OverviewPageRecommendation* re) { + *re->add_documentation_tips() = MakeOverviewPageTipDocLink( + "https://www.tensorflow.org/guide/data_performance_analysis", + "Analyze tf.data performance with the TF Profiler"); *re->add_documentation_tips() = MakeOverviewPageTipDocLink( "https://www.tensorflow.org/guide/" "data_performance", From b100b185eecacef9990525e9a712b5547fa20689 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 17:28:20 -0700 Subject: [PATCH 1167/1533] Don't crash in 3D pooling ops with empty batch size on GPU. PiperOrigin-RevId: 313299099 Change-Id: I40ce8f57efc386ae820460a325cfebee1be14d77 --- tensorflow/core/kernels/pooling_ops_3d.cc | 1 + tensorflow/python/kernel_tests/pooling_ops_3d_test.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/pooling_ops_3d.cc b/tensorflow/core/kernels/pooling_ops_3d.cc index 31ead11dd34..532d861e615 100644 --- a/tensorflow/core/kernels/pooling_ops_3d.cc +++ b/tensorflow/core/kernels/pooling_ops_3d.cc @@ -192,6 +192,7 @@ class Pooling3DOp : public UnaryOp { {{out[2], out[1], out[0]}}, depth); Tensor* output; OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + if (out_shape.num_elements() == 0) return; LaunchPoolingOp::launch(context, tensor_in, window, stride, padding, data_format_, padding_, output); diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py index d5331dcb3e9..051f7e1168a 100644 --- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py +++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py @@ -205,14 +205,14 @@ class PoolingTest(test.TestCase): padding="VALID", expected=[29.5, 32.5, 50.5, 53.5, 176.5, 179.5, 197.5, 200.5]) - def _MaxPool3DEmptyTensorOutputShape(self): + def testMaxPool3DEmptyTensorOutputShape(self): """Verifies the output shape of the max pooling function when tensor is empty. Args: none """ input_sizes = [0, 112, 112, 112, 64] - input_data = 1 + input_data = 1. input_tensor = constant_op.constant( input_data, shape=input_sizes, name="input") max_pool_3d = nn_ops.max_pool3d( From 9a2ac3f89c620eaebc9b260952958b1d9a0e06a9 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Tue, 26 May 2020 17:38:21 -0700 Subject: [PATCH 1168/1533] Hexagon Delegate: Support Fully Connected with non constant weights. 
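Roughly, the delegate now keys the FULLY_CONNECTED builder on the weights
tensor's allocation type (simplified sketch of the dispatch added in
op_builder.cc):

    const TfLiteTensor& weights = context_->tensors[node->inputs->data[1]];
    if (weights.allocation_type == kTfLiteMmapRo) {
      // Constant weights: keep the existing const-weights MatMul builder.
      return CreateMatMulWithConstWeightsOpBuilder(this,
                                                   OP_QuantizedMatMul_8x8to32);
    }
    // Non-constant weights: transpose them on the DSP first, then matmul.
    return CreateMatMulOpBuilder(this, OP_Transpose_8);

The non-const path transposes the weights with permutation {0, 1, 3, 2} before
feeding them to OP_QuantizedMatMul_8x8to32, so both paths end in the same
matmul + bias-add + requantize sequence.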
PiperOrigin-RevId: 313300444 Change-Id: I578c6e769ec38ae9c8ae2c54e8c4d6a515672689 --- .../hexagon/builders/matmul_builder.cc | 277 ++++++++++++------ .../hexagon/builders/matmul_builder.h | 32 +- .../delegates/hexagon/builders/op_builder.cc | 15 +- .../delegates/hexagon/builders/op_builder.h | 2 +- .../delegates/hexagon/builders/op_factory.h | 4 +- .../hexagon/builders/tests/matmul_test.cc | 66 ++++- .../experimental/delegates/hexagon/utils.cc | 8 +- 7 files changed, 295 insertions(+), 109 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc index c0c815ffdcc..894f98269ce 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc @@ -18,7 +18,9 @@ limitations under the License. #include +#include "hexagon/hexagon_nn_ops.h" #include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/hexagon_nn.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -27,9 +29,124 @@ namespace tflite { namespace delegates { namespace hexagon { namespace { +void GetDims(int* batch_size, int* height_size, int* width_size, + int* depth_size, const TfLiteIntArray* dims) { + int* dim[] = {batch_size, height_size, width_size, depth_size}; + for (int i = 0; i < 4; ++i) *(dim[i]) = 1; + for (int i = 4 - dims->size; i < 4; ++i) { + *dim[i] = dims->data[i - (4 - dims->size)]; + } +} constexpr uint8_t k8BitSignFlipConstant = 0x80; +TfLiteStatus AddFullyConnectedHelper(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + const OpBuilder::TensorID weights_id, + const OpBuilder::TensorID weights_min_id, + const OpBuilder::TensorID weights_max_id, + GraphBuilder* graph_builder, + TfLiteContext* context, + OpBuilder* matmul_op, + OpBuilder::TensorID* node_output) { + static int scalar_shape[] = {1, 1, 1, 1}; + // Data tensor. + int data_tensor_id = inputs->data[0]; + const auto& data_tensor = context->tensors[data_tensor_id]; + float data_min, data_max; + TF_LITE_ENSURE_STATUS(OpBuilder::ComputeMinAndMaxQuantValues( + data_tensor, &data_min, &data_max)); + auto* data_min_const = graph_builder->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&data_min), sizeof(data_min)); + auto* data_max_const = graph_builder->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&data_max), sizeof(data_max)); + + // Data and weight tensors in required order. + matmul_op->AddInput(graph_builder->GetHexagonTensorId(data_tensor_id)); + matmul_op->AddInput(weights_id); + matmul_op->AddInput(OpBuilder::TensorID(data_min_const->GetID(), 0)); + matmul_op->AddInput(OpBuilder::TensorID(data_max_const->GetID(), 0)); + matmul_op->AddInput(weights_min_id); + matmul_op->AddInput(weights_max_id); + + // Outputs for the MatMul node, which are in int32 format. + // Output shape should still be the same. 
+ int output_batch_size, output_height_size, output_width_size, + output_depth_size; + GetDims(&output_batch_size, &output_height_size, &output_width_size, + &output_depth_size, context->tensors[outputs->data[0]].dims); + const auto& matmul_out = + matmul_op->AddOutput(sizeof(int32_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + const auto& matmul_out_min = + matmul_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + const auto& matmul_out_max = + matmul_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + + // Bias tensor. + int bias_tensor_id = inputs->data[2]; + OpBuilder::TensorID matmul_and_bias_out = matmul_out, + matmul_and_bias_out_min = matmul_out_min, + matmul_and_bias_out_max = matmul_out_max; + if (bias_tensor_id != -1) { + const auto& bias_tensor = context->tensors[bias_tensor_id]; + auto* const_bias_node = + graph_builder->AddConstNodeWithData(bias_tensor_id, bias_tensor); + float bias_min, bias_max; + graph_builder->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), 0); + OpBuilder::ComputeMinAndMaxQuantValues(bias_tensor, &bias_min, &bias_max); + auto* bias_min_const = graph_builder->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&bias_min), sizeof(bias_min)); + auto* bias_max_const = graph_builder->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&bias_max), sizeof(bias_max)); + + // MatMul + Bias. + auto* bias_add_op = graph_builder->AddNode(matmul_op->GetTFLiteNodeID()); + bias_add_op->SetOpType(OP_QuantizedBiasAdd_32p32to32); + bias_add_op->AddInput(matmul_out); + bias_add_op->AddInput(graph_builder->GetHexagonTensorId(bias_tensor_id)); + bias_add_op->AddInput(matmul_out_min); + bias_add_op->AddInput(matmul_out_max); + bias_add_op->AddInput(OpBuilder::TensorID(bias_min_const->GetID(), 0)); + bias_add_op->AddInput(OpBuilder::TensorID(bias_max_const->GetID(), 0)); + matmul_and_bias_out = + bias_add_op->AddOutput(sizeof(int32_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + matmul_and_bias_out_min = + bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + matmul_and_bias_out_max = + bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + } + + float output_min, output_max; + // Quantize 32-bit result into 8-bit format using output tensor min/max. 
+ OpBuilder::ComputeMinAndMaxQuantValues(context->tensors[outputs->data[0]], + &output_min, &output_max); + auto* output_min_const = graph_builder->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&output_min), sizeof(output_min)); + auto* output_max_const = graph_builder->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&output_max), sizeof(output_max)); + auto* quantize_biasadd_op = + graph_builder->AddNode(matmul_op->GetTFLiteNodeID()); + quantize_biasadd_op->SetOpType(OP_Requantize_32to8); + quantize_biasadd_op->AddInput(matmul_and_bias_out); + quantize_biasadd_op->AddInput(matmul_and_bias_out_min); + quantize_biasadd_op->AddInput(matmul_and_bias_out_max); + quantize_biasadd_op->AddInput( + OpBuilder::TensorID(output_min_const->GetID(), 0)); + quantize_biasadd_op->AddInput( + OpBuilder::TensorID(output_max_const->GetID(), 0)); + *node_output = + quantize_biasadd_op->AddOutput(sizeof(uint8_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + quantize_biasadd_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + quantize_biasadd_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + return kTfLiteOk; +} + } // namespace // The TFLite 'Fully-connected' quantized op corresponds to the following @@ -38,27 +155,14 @@ constexpr uint8_t k8BitSignFlipConstant = 0x80; // MatMul out (int32), Bias (int32) => QuantizedBiasAdd => BiasAdd out (int32) // BiasAdd out (int32) => Requantize_32to8 => Output (8-bit) // TODO(b/129276536): Add activation support. -TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, - const TfLiteIntArray* outputs, - TfLiteContext* context) { +TfLiteStatus MatMulWithConstWeightsOpBuilder::PopulateSubGraph( + const TfLiteIntArray* inputs, const TfLiteIntArray* outputs, + TfLiteContext* context) { static int quant_bound_shape[] = {1, 1, 1, 1}; - // Data tensor. - int data_tensor_id = inputs->data[0]; - const auto& data_tensor = context->tensors[data_tensor_id]; - TF_LITE_ENSURE_STATUS( - ComputeMinAndMaxQuantValues(data_tensor, &data_min_, &data_max_)); - auto* data_min_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&data_min_), - sizeof(data_min_)); - auto* data_max_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&data_max_), - sizeof(data_max_)); - // Weights vector. int weights_tensor_id = inputs->data[1]; const auto& weights_tensor = context->tensors[weights_tensor_id]; - // TODO(srjoglekar): Abstract out. if (weights_tensor.allocation_type != kTfLiteMmapRo) { context->ReportError( context, "Weights tensor doesn't have correct allocation type: %s", @@ -107,90 +211,74 @@ TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, quant_bound_shape, reinterpret_cast(&weights_max_), sizeof(weights_max_)); - // Data and weight tensors in required order. - AddInput(graph_builder_->GetHexagonTensorId(data_tensor_id)); + return AddFullyConnectedHelper( + inputs, outputs, graph_builder_->GetHexagonTensorId(weights_tensor_id), + TensorID(weights_min_const->GetID(), 0), + TensorID(weights_max_const->GetID(), 0), graph_builder_, context, this, + &node_output_); +} + +TfLiteStatus MatMulWithConstWeightsOpBuilder::RegisterOutputs( + const TfLiteIntArray* outputs, TfLiteContext* context) { + // Should be only 1 output. 
+ graph_builder_->AddTensorWithID(outputs->data[0], node_output_.first, + node_output_.second); + return kTfLiteOk; +} + +TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + TfLiteContext* context) { + static int scalar_shape[] = {1, 1, 1, 1}; + const int weights_tensor_id = inputs->data[1]; + const auto& weights_tensor = context->tensors[weights_tensor_id]; + int batch_size, height_size, width_size, depth_size; + GetDims(&batch_size, &height_size, &width_size, &depth_size, + weights_tensor.dims); + weights_shape_ = {batch_size, height_size, depth_size, width_size}; + // Permutation for transposing. + int permutation[] = {0, 1, 3, 2}; + const int permutation_shape[] = {1, 1, 1, 4}; + auto permutation_node = graph_builder_->AddConstNodeWithData( + permutation_shape, reinterpret_cast(permutation), + 4 * sizeof(permutation[0])); AddInput(graph_builder_->GetHexagonTensorId(weights_tensor_id)); - AddInput(TensorID(data_min_const->GetID(), 0)); - AddInput(TensorID(data_max_const->GetID(), 0)); + AddInput(TensorID(permutation_node->GetID(), 0)); + + ComputeMinAndMaxQuantValues(weights_tensor, &weights_min_, &weights_max_); + auto* weights_min_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&weights_min_), + sizeof(weights_min_)); + auto* weights_max_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&weights_max_), + sizeof(weights_max_)); AddInput(TensorID(weights_min_const->GetID(), 0)); AddInput(TensorID(weights_max_const->GetID(), 0)); - // Outputs for the MatMul node, which are in int32 format. - // Output shape should still be the same. - int output_batch_size, output_height_size, output_width_size, - output_depth_size; - GetDims(&output_batch_size, &output_height_size, &output_width_size, - &output_depth_size, context->tensors[outputs->data[0]].dims); - const auto& matmul_out = AddOutput(sizeof(int32_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - const auto& matmul_out_min = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - const auto& matmul_out_max = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + auto transposed_weights = AddOutput(sizeof(uint8_t), 4, weights_shape_); + auto transposed_weights_min = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + auto transposed_weights_max = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - // Bias tensor. - int bias_tensor_id = inputs->data[2]; - TensorID matmul_and_bias_out = matmul_out, - matmul_and_bias_out_min = matmul_out_min, - matmul_and_bias_out_max = matmul_out_max; - if (bias_tensor_id != -1) { - const auto& bias_tensor = context->tensors[bias_tensor_id]; - auto* const_bias_node = - graph_builder_->AddConstNodeWithData(bias_tensor_id, bias_tensor); - graph_builder_->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), - 0); - ComputeMinAndMaxQuantValues(bias_tensor, &bias_min_, &bias_max_); - auto* bias_min_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&bias_min_), - sizeof(bias_min_)); - auto* bias_max_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&bias_max_), - sizeof(bias_max_)); - - // MatMul + Bias. 
- auto* bias_add_op = graph_builder_->AddNode(GetTFLiteNodeID()); - bias_add_op->SetOpType(OP_QuantizedBiasAdd_32p32to32); - bias_add_op->AddInput(matmul_out); - bias_add_op->AddInput(graph_builder_->GetHexagonTensorId(bias_tensor_id)); - bias_add_op->AddInput(matmul_out_min); - bias_add_op->AddInput(matmul_out_max); - bias_add_op->AddInput(TensorID(bias_min_const->GetID(), 0)); - bias_add_op->AddInput(TensorID(bias_max_const->GetID(), 0)); - matmul_and_bias_out = - bias_add_op->AddOutput(sizeof(int32_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - matmul_and_bias_out_min = - bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - matmul_and_bias_out_max = - bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - } - - // Quantize 32-bit result into 8-bit format using output tensor min/max. - ComputeMinAndMaxQuantValues(context->tensors[outputs->data[0]], &output_min_, - &output_max_); - auto* output_min_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&output_min_), - sizeof(output_min_)); - auto* output_max_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&output_max_), - sizeof(output_max_)); - auto* quantize_biasadd_op = graph_builder_->AddNode(GetTFLiteNodeID()); - quantize_biasadd_op->SetOpType(OP_Requantize_32to8); - quantize_biasadd_op->AddInput(matmul_and_bias_out); - quantize_biasadd_op->AddInput(matmul_and_bias_out_min); - quantize_biasadd_op->AddInput(matmul_and_bias_out_max); - quantize_biasadd_op->AddInput(TensorID(output_min_const->GetID(), 0)); - quantize_biasadd_op->AddInput(TensorID(output_max_const->GetID(), 0)); - node_output_ = - quantize_biasadd_op->AddOutput(sizeof(uint8_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - quantize_biasadd_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - quantize_biasadd_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + auto* matmul_op = graph_builder_->AddNode(GetTFLiteNodeID()); + matmul_op->SetOpType(OP_QuantizedMatMul_8x8to32); + AddFullyConnected(inputs, outputs, transposed_weights, transposed_weights_min, + transposed_weights_max, context, matmul_op); return kTfLiteOk; } +TfLiteStatus MatMulOpBuilder::AddFullyConnected(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + const TensorID weights_id, + const TensorID weights_min_id, + const TensorID weights_max_id, + TfLiteContext* context, + OpBuilder* matmul_op) { + return AddFullyConnectedHelper(inputs, outputs, weights_id, weights_min_id, + weights_max_id, graph_builder_, context, + matmul_op, &node_output_); +} + TfLiteStatus MatMulOpBuilder::RegisterOutputs(const TfLiteIntArray* outputs, TfLiteContext* context) { // Should be only 1 output. 
@@ -199,9 +287,12 @@ TfLiteStatus MatMulOpBuilder::RegisterOutputs(const TfLiteIntArray* outputs, return kTfLiteOk; } -MatMulOpBuilder::~MatMulOpBuilder() {} +OpBuilder* CreateMatMulWithConstWeightsOpBuilder(GraphBuilder* graph_builder, + int op_type) { + return new MatMulWithConstWeightsOpBuilder(graph_builder, op_type); +} -OpBuilder* CreateMatMulBuilder(GraphBuilder* graph_builder, int op_type) { +OpBuilder* CreateMatMulOpBuilder(GraphBuilder* graph_builder, int op_type) { return new MatMulOpBuilder(graph_builder, op_type); } diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.h b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.h index 212ea7be7a3..89f3c1273d7 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.h +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.h @@ -23,6 +23,28 @@ namespace tflite { namespace delegates { namespace hexagon { +// Builder for FullyConnected op in Hexagon with weights as const. +class MatMulWithConstWeightsOpBuilder : public OpBuilder { + public: + explicit MatMulWithConstWeightsOpBuilder(GraphBuilder* graph_builder, + int op_type) + : OpBuilder(graph_builder, op_type) {} + TfLiteStatus PopulateSubGraph(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + TfLiteContext* context) override; + + TfLiteStatus RegisterOutputs(const TfLiteIntArray* outputs, + TfLiteContext* context) override; + + private: + TensorID node_output_; + std::vector weights_shape_, bias_shape_; + std::vector transposed_weights_; + float data_min_, data_max_, weights_min_, weights_max_, bias_min_, bias_max_, + output_min_, output_max_; +}; + +// Builder for FullyConnected op in Hexagon with non const weights. class MatMulOpBuilder : public OpBuilder { public: explicit MatMulOpBuilder(GraphBuilder* graph_builder, int op_type) @@ -34,9 +56,15 @@ class MatMulOpBuilder : public OpBuilder { TfLiteStatus RegisterOutputs(const TfLiteIntArray* outputs, TfLiteContext* context) override; - ~MatMulOpBuilder() override; - private: + // Adds Fully connected op related ops to the graph. 
+ TfLiteStatus AddFullyConnected(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + const TensorID weights_id, + const TensorID weights_min_id, + const TensorID weights_max_id, + TfLiteContext* context, OpBuilder* matmul_op); + TensorID node_output_; std::vector weights_shape_, bias_shape_; std::vector transposed_weights_; diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc index ba264313805..d851f8cf824 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc @@ -23,7 +23,8 @@ namespace tflite { namespace delegates { namespace hexagon { -OpBuilder* GraphBuilder::CreateOpBuilderFromTfLiteOp(int op_type) { +OpBuilder* GraphBuilder::CreateOpBuilderFromTfLiteOp(int op_type, + TfLiteNode* node) { switch (op_type) { case kTfLiteBuiltinAdd: return CreateArithmeticBuilder(this, OP_QuantizedAdd_8p8to8); @@ -45,8 +46,14 @@ OpBuilder* GraphBuilder::CreateOpBuilderFromTfLiteOp(int op_type) { return CreatePadBuilder(this, OP_QuantizedPad_8); case kTfLiteBuiltinMirrorPad: return CreateMirrorPadBuilder(this, OP_MirrorPad_8); - case kTfLiteBuiltinFullyConnected: - return CreateMatMulBuilder(this, OP_QuantizedMatMul_8x8to32); + case kTfLiteBuiltinFullyConnected: { + const auto& weights_tensor = context_->tensors[node->inputs->data[1]]; + if (weights_tensor.allocation_type == kTfLiteMmapRo) + return CreateMatMulWithConstWeightsOpBuilder( + this, OP_QuantizedMatMul_8x8to32); + else + return CreateMatMulOpBuilder(this, OP_Transpose_8); + } case kTfLiteBuiltinAveragePool2d: return CreatePool2DBuilder(this, OP_QuantizedAvgPool_8); case kTfLiteBuiltinMaxPool2d: @@ -271,7 +278,7 @@ OpBuilder* GraphBuilder::AddNode(int tflite_node_index) { OpBuilder* GraphBuilder::AddNodeFromTfLiteOp(int op_type, TfLiteNode* node, int tflite_node_index) { - OpBuilder* op = CreateOpBuilderFromTfLiteOp(op_type); + OpBuilder* op = CreateOpBuilderFromTfLiteOp(op_type, node); builders_.emplace_back(op); op->SetNodeId(builders_.size()); op->SetTFLiteNodeId(tflite_node_index); diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h index 267fc818ca1..743323c8bd3 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h @@ -197,7 +197,7 @@ class GraphBuilder { // Same as above but takes shape of the tensor that will holds the data. OpBuilder* AddConstNodeWithData(const int shape[], char* data, int data_size); - OpBuilder* CreateOpBuilderFromTfLiteOp(int op_type); + OpBuilder* CreateOpBuilderFromTfLiteOp(int op_type, TfLiteNode* node); // Construct Input node with 'input_tensors' as output. 
TfLiteStatus AddInputTensors(const TfLiteIntArray* input_tensors, diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h b/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h index e44bf78992d..33b56e91f0a 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h @@ -26,7 +26,8 @@ class OpBuilder; OpBuilder* CreateArgMinMaxOpBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateActivationBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateArithmeticBuilder(GraphBuilder* graph_builder, int op_type); -OpBuilder* CreateMatMulBuilder(GraphBuilder* graph_builder, int op_type); +OpBuilder* CreateMatMulWithConstWeightsOpBuilder(GraphBuilder* graph_builder, + int op_type); OpBuilder* CreateConcatBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateConv2DBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateTransposeConv2DBuilder(GraphBuilder* graph_builder, @@ -57,6 +58,7 @@ OpBuilder* CreateCastBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateMinMaxBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateSliceOpBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreatePackBuilder(GraphBuilder* graph_builder, int op_type); +OpBuilder* CreateMatMulOpBuilder(GraphBuilder* graph_builder, int op_type); } // namespace hexagon } // namespace delegates diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc index 3a5f320a6a7..ff2c71946e7 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc @@ -22,7 +22,8 @@ using testing::ElementsAreArray; class FullyConnectedOpModel : public SingleOpModelWithHexagon { public: FullyConnectedOpModel(int units, int batches, const TensorData& input, - const TensorData& output, bool optional_bias = false) + const TensorData& output, bool optional_bias = false, + bool const_weights = true) : batches_(batches), units_(units) { int total_input_size = 1; for (size_t i = 0; i < input.shape.size(); ++i) { @@ -54,8 +55,10 @@ class FullyConnectedOpModel : public SingleOpModelWithHexagon { // Weights & bias tensors need to be constant. // We don't use AddConstInput to allow setting filter values later. 
- auto* weights_tensor = interpreter_->tensor(weights_); - weights_tensor->allocation_type = kTfLiteMmapRo; + if (const_weights) { + auto* weights_tensor = interpreter_->tensor(weights_); + weights_tensor->allocation_type = kTfLiteMmapRo; + } if (!optional_bias) { auto* bias_tensor = interpreter_->tensor(bias_); bias_tensor->allocation_type = kTfLiteMmapRo; @@ -203,4 +206,61 @@ TEST(QuantizedFullyConnectedOpTest, TestQuantizedInt8_NoBias) { ElementsAreArray(ArrayFloatNear(reference_output))); } +TEST(QuantizedFullyConnectedOpTest, TestQuantizedInt8_NonConstWeights) { + FullyConnectedOpModel m(/*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64}, + /*output=*/{TensorType_INT8, {}, -127, 128}, + /*optional_bias=*/false, /*const_weights=*/false); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + m.SetBias({1, 2, 3}); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output))); +} + +TEST(QuantizedFullyConnectedOpTest, TestQuantizedUint8_NonConstWeights) { + FullyConnectedOpModel m( + /*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64}, + /*output=*/{TensorType_UINT8, {}, -127, 128}, /*optional_bias=*/false, + /*const_weights=*/false); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + m.SetBias({1, 2, 3}); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({ + 24, 25, 26, // + 58, 59, 60, // + }))); + EXPECT_THAT(m.GetOutput(), + ElementsAre(151, 152, 153, 185, 186, 187)); +} + } // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index f75447f8ea6..80f82749e80 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -198,22 +198,20 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, if (!InputsWithCorrectTypes(node, context, {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteUInt8, kTfLiteInt8}, - {kTfLiteInt32, kTfLiteNoType}})) + {kTfLiteInt32, kTfLiteNoType}})) { return false; + } - const auto& weights_tensor = context->tensors[node->inputs->data[1]]; bool bias_const_or_no_bias = true; if (node->inputs->data[2] != -1) { const auto& bias_tensor = context->tensors[node->inputs->data[2]]; bias_const_or_no_bias = bias_tensor.allocation_type == kTfLiteMmapRo; } - const bool weights_const = - weights_tensor.allocation_type == kTfLiteMmapRo; const TfLiteFullyConnectedParams* matmul_params = reinterpret_cast( node->builtin_data); - return (weights_const && bias_const_or_no_bias && + return (bias_const_or_no_bias && matmul_params->activation == kTfLiteActNone && matmul_params->keep_num_dims == false && matmul_params->weights_format == From 52155ca469c3ded99821105821a2c031fbc723a7 Mon Sep 17 00:00:00 2001 From: Robert David Date: Tue, 26 May 2020 17:46:37 -0700 Subject: [PATCH 1169/1533] Nit: Use holds_alternative instead of get_if on variants when the result is 
only used as a bool. PiperOrigin-RevId: 313301420 Change-Id: Ib1b69e94c777759f8029253f758d378987db83a0 --- tensorflow/lite/delegates/gpu/api.cc | 14 +++++++------- tensorflow/lite/delegates/gpu/cl/gl_interop.cc | 2 +- .../gpu/common/transformations/fuse_add_to_conv.cc | 10 ++++++---- .../gpu/common/transformations/fuse_mul_to_conv.cc | 11 ++++++----- .../common/transformations/merge_padding_with.cc | 9 +++++---- .../gpu/common/transformations/remove_noop.cc | 6 +++--- tensorflow/lite/delegates/gpu/gl/object.h | 2 +- tensorflow/lite/delegates/gpu/gl/runtime.cc | 2 +- 8 files changed, 30 insertions(+), 26 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/api.cc b/tensorflow/lite/delegates/gpu/api.cc index 1a18fcb87f2..cddd14b6855 100644 --- a/tensorflow/lite/delegates/gpu/api.cc +++ b/tensorflow/lite/delegates/gpu/api.cc @@ -80,19 +80,19 @@ bool IsValid(const TensorObjectDef& def, const TensorObject& object) { bool IsObjectPresent(ObjectType type, const TensorObject& obj) { switch (type) { case ObjectType::CPU_MEMORY: - return absl::get_if(&obj); + return absl::holds_alternative(obj); case ObjectType::OPENGL_SSBO: - return absl::get_if(&obj); + return absl::holds_alternative(obj); case ObjectType::OPENGL_TEXTURE: - return absl::get_if(&obj); + return absl::holds_alternative(obj); case ObjectType::OPENCL_BUFFER: - return absl::get_if(&obj); + return absl::holds_alternative(obj); case ObjectType::OPENCL_TEXTURE: - return absl::get_if(&obj); + return absl::holds_alternative(obj); case ObjectType::VULKAN_BUFFER: - return absl::get_if(&obj); + return absl::holds_alternative(obj); case ObjectType::VULKAN_TEXTURE: - return absl::get_if(&obj); + return absl::holds_alternative(obj); case ObjectType::UNKNOWN: return false; } diff --git a/tensorflow/lite/delegates/gpu/cl/gl_interop.cc b/tensorflow/lite/delegates/gpu/cl/gl_interop.cc index eaeff2cda07..599e6766301 100644 --- a/tensorflow/lite/delegates/gpu/cl/gl_interop.cc +++ b/tensorflow/lite/delegates/gpu/cl/gl_interop.cc @@ -273,7 +273,7 @@ GlClBufferCopier::GlClBufferCopier(const TensorObjectDef& input_def, absl::Status GlClBufferCopier::Convert(const TensorObject& input_obj, const TensorObject& output_obj) { - if (absl::get_if(&input_obj)) { + if (absl::holds_alternative(input_obj)) { auto ssbo = absl::get_if(&input_obj); auto cl_mem = absl::get_if(&output_obj); RETURN_IF_ERROR( diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc index 4efb98a6847..b279e49e40c 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_add_to_conv.cc @@ -48,8 +48,9 @@ class MergeConvolutionWithAdd : public SequenceTransformation { } AddAttributes add_attr = absl::any_cast(add_node.operation.attributes); - if (!absl::get_if>(&add_attr.param) && - !absl::get_if(&add_attr.param)) { + if (!absl::holds_alternative>( + add_attr.param) && + !absl::holds_alternative(add_attr.param)) { return {TransformStatus::DECLINED, "This fuse applicable only for broadcast or scalar addition."}; } @@ -104,8 +105,9 @@ class MergeAddWithConvolution : public SequenceTransformation { } AddAttributes add_attr = absl::any_cast(add_node.operation.attributes); - if (!absl::get_if>(&add_attr.param) && - !absl::get_if(&add_attr.param)) { + if (!absl::holds_alternative>( + add_attr.param) && + !absl::holds_alternative(add_attr.param)) { return {TransformStatus::DECLINED, "This fuse applicable 
only for broadcast or scalar addition."}; } diff --git a/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.cc b/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.cc index 749382c3417..f4ace3c0d41 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/fuse_mul_to_conv.cc @@ -45,8 +45,9 @@ class MergeConvolutionWithMul : public SequenceTransformation { MultiplyAttributes mul_attr = absl::any_cast(mul_node.operation.attributes); - if (!absl::get_if>(&mul_attr.param) && - !absl::get_if(&mul_attr.param)) { + if (!absl::holds_alternative>( + mul_attr.param) && + !absl::holds_alternative(mul_attr.param)) { return { TransformStatus::DECLINED, "This fuse applicable only for broadcast or scalar multiplication."}; @@ -108,9 +109,9 @@ class MergeMulWithConvolution : public SequenceTransformation { MultiplyAttributes mul_attr = absl::any_cast(mul_node.operation.attributes); - if (!absl::get_if>( - &mul_attr.param) && - !absl::get_if(&mul_attr.param)) { + if (!absl::holds_alternative>( + mul_attr.param) && + !absl::holds_alternative(mul_attr.param)) { return { TransformStatus::DECLINED, "This fuse applicable only for broadcast or scalar multiplication."}; diff --git a/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.cc b/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.cc index 23e99bc3305..2f1621eb34b 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/merge_padding_with.cc @@ -146,10 +146,11 @@ class MergePaddingWithAddOperation : public NodeTransformation { AddAttributes add_attr = absl::any_cast(add_node->operation.attributes); - const auto add_broadcast = - absl::get_if>(&add_attr.param); - const float* add_scalar = absl::get_if(&add_attr.param); - if (add_broadcast || add_scalar) { + const bool is_add_broadcast = + absl::holds_alternative>( + add_attr.param); + const bool is_add_scalar = absl::holds_alternative(add_attr.param); + if (is_add_broadcast || is_add_scalar) { return {TransformStatus::SKIPPED, "Cannot remove padding when this broadcast/scalar ADD"}; } diff --git a/tensorflow/lite/delegates/gpu/common/transformations/remove_noop.cc b/tensorflow/lite/delegates/gpu/common/transformations/remove_noop.cc index e80b244b34f..b4cdd87109a 100644 --- a/tensorflow/lite/delegates/gpu/common/transformations/remove_noop.cc +++ b/tensorflow/lite/delegates/gpu/common/transformations/remove_noop.cc @@ -77,9 +77,9 @@ std::unique_ptr NewRemoveSingleInputAdd() { } auto& attr = absl::any_cast(node->operation.attributes); - return absl::get_if>(&attr.param) == - nullptr && - absl::get_if(&attr.param) == nullptr; + return !absl::holds_alternative>( + attr.param) && + !absl::holds_alternative(attr.param); }); } diff --git a/tensorflow/lite/delegates/gpu/gl/object.h b/tensorflow/lite/delegates/gpu/gl/object.h index 3463d0678b6..0c2a2326356 100644 --- a/tensorflow/lite/delegates/gpu/gl/object.h +++ b/tensorflow/lite/delegates/gpu/gl/object.h @@ -70,7 +70,7 @@ struct Object { // @return true if object is a reference. 
inline bool IsRef(const Object& object) { - return !absl::get_if(&object.object); + return !absl::holds_alternative(object.object); } inline ObjectRef GetRef(const Object& object) { diff --git a/tensorflow/lite/delegates/gpu/gl/runtime.cc b/tensorflow/lite/delegates/gpu/gl/runtime.cc index 2a48b59c8d9..b7e01a33570 100644 --- a/tensorflow/lite/delegates/gpu/gl/runtime.cc +++ b/tensorflow/lite/delegates/gpu/gl/runtime.cc @@ -483,7 +483,7 @@ absl::Status ApplyTexturesAssignment( Object* object = global_ref_to_object_ptr[global_ref]; if (usage_rec_id == kNotAssigned || object == nullptr || object->object_type != ObjectType::TEXTURE || - !absl::get_if(&object->size)) { + !absl::holds_alternative(object->size)) { // Skip objects with other data type, non-textures and textures with wrong // number of dimensions. continue; From 3ad4f18cfd67146b5194f365b31234a51988f462 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Tue, 26 May 2020 18:12:03 -0700 Subject: [PATCH 1170/1533] Remove unused items_to_destroy vector in EagerExecutor::Shutdown(). PiperOrigin-RevId: 313304658 Change-Id: Ide3b00a0f794a0d582ac11d6181e411697101021 --- tensorflow/core/common_runtime/eager/eager_executor.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc index 7850978410f..ddfdabf9472 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.cc +++ b/tensorflow/core/common_runtime/eager/eager_executor.cc @@ -50,7 +50,6 @@ EagerExecutor::~EagerExecutor() { Status EagerExecutor::ShutDown() { { - std::vector> items_to_destroy; bool has_thread; Status status; { @@ -72,9 +71,6 @@ Status EagerExecutor::ShutDown() { nodes_pending_.notify_all(); } } - for (auto& item : items_to_destroy) { - item->node->Abort(status); - } if (!has_thread) { return status; } From a2f840d54e5588a1225443bc4442415e86be13de Mon Sep 17 00:00:00 2001 From: Chuan He Date: Tue, 26 May 2020 18:24:56 -0700 Subject: [PATCH 1171/1533] Legalize from "xla_hlo.slice" to "tf.Slice". 
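
With strides of 1, the new ConvertSliceOp pattern rewrites an `xla_hlo.slice`
into a `tf.Slice` whose start and size operands are materialized as `tf.Const`
ops, with size computed as limit_indices - start_indices. For example, the
convert_slice test added below,

    func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> {
      %0 = "xla_hlo.slice"(%arg0) {limit_indices = dense<[1, 4672]> : tensor<2xi64>,
                                   start_indices = dense<[0, 4153]> : tensor<2xi64>,
                                   strides = dense<1> : tensor<2xi64>}
           : (tensor<1x4672xf32>) -> tensor<1x519xf32>
      return %0 : tensor<1x519xf32>
    }

becomes a `tf.Slice` fed by constants [0, 4153] (start) and [1, 519] (size),
still producing tensor<1x519xf32>. Slices with any stride other than 1 are
left unconverted, since `tf.Slice` has no stride operand.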
PiperOrigin-RevId: 313306321 Change-Id: Ic4a34a4fb7c306c2581d517bf4411b37a8c0539d --- .../mlir/tensorflow/tests/legalize_hlo.mlir | 14 +++++++ .../tensorflow/transforms/legalize_hlo.cc | 40 +++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index 7691a6bd6e8..abc12b2d89c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -682,6 +682,11 @@ func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { return %0 : tensor<2xf32> } +func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { + %0 = "xla_hlo.slice"(%arg0) {limit_indices = dense<[1, 4672]> : tensor<2xi64>, start_indices = dense<[0, 4153]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<1x4672xf32>) -> tensor<1x519xf32> + return %0 : tensor<1x519xf32> +} + // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py // CHECK-LABEL: func @biasAdd_NHWC( @@ -1493,3 +1498,12 @@ func @convert_i32_f32(%arg0: tensor<2xi32>) -> tensor<2xf32> { // CHECK: [[VAL_371:%.*]] = "tf.Cast"([[VAL_370]]) {Truncate = false} : (tensor<2xi32>) -> tensor<2xf32> // CHECK: return [[VAL_371]] : tensor<2xf32> // CHECK: } + +// CHECK-LABEL: func @convert_slice( +// CHECK-SAME: [[VAL_372:%.*]]: tensor<1x4672xf32>) -> tensor<1x519xf32> { +// CHECK: [[VAL_373:%.*]] = "tf.Const"() {value = dense<[0, 4153]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: [[VAL_374:%.*]] = "tf.Const"() {value = dense<[1, 519]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: [[VAL_375:%.*]] = "tf.Slice"([[VAL_372]], [[VAL_373]], [[VAL_374]]) : (tensor<1x4672xf32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x519xf32> +// CHECK: return [[VAL_375]] : tensor<1x519xf32> +// CHECK: } + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index b1cbc41a03e..524b3e4f4b7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -16,12 +16,16 @@ limitations under the License. // This file implements logic for legalizing HLO to TensorFlow. #include +#include +#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -32,6 +36,41 @@ namespace mlir { namespace TF { namespace { +class ConvertSliceOp : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + xla_hlo::SliceOp slice_op, ArrayRef args, + ConversionPatternRewriter &rewriter) const final { + DenseIntElementsAttr strides = slice_op.strides(); + // Strides must be 1 otherwise we cannot legalize this `xla_hlo.slice` op. 
+ if (!strides.isSplat() || + strides.getSplatValue().cast().getInt() != 1) + return failure(); + + rewriter.setInsertionPointAfter(slice_op); + auto start_indices = slice_op.start_indices(); + auto limit_indices = slice_op.limit_indices(); + std::vector size_values; + for (auto pair : llvm::zip(start_indices.getValues(), + limit_indices.getValues())) { + size_values.emplace_back(std::get<1>(pair).getSExtValue() - + std::get<0>(pair).getSExtValue()); + } + + RankedTensorType ty = + RankedTensorType::get({static_cast(size_values.size())}, + rewriter.getIntegerType(64)); + auto start = rewriter.create(slice_op.getLoc(), start_indices); + auto size = rewriter.create( + slice_op.getLoc(), DenseIntElementsAttr::get(ty, size_values)); + rewriter.replaceOpWithNewOp(slice_op, slice_op.getType(), + slice_op.operand(), start, size); + return success(); + }; +}; + class LegalizeHloToTf : public PassWrapper { public: LegalizeHloToTf() = default; @@ -64,6 +103,7 @@ void LegalizeHloToTf::runOnFunction() { // Add legalization patterns to the list. OwningRewritePatternList patterns; populateWithGenerated(&context, &patterns); + patterns.insert(&context); ConversionTarget target(context); target.addLegalDialect(); From 05653928da6ad7fd74dda297e55348c9bfcdff42 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Tue, 26 May 2020 18:27:40 -0700 Subject: [PATCH 1172/1533] Don't flag unsupported variant type ops in tensorlist pass TensorList pass need not know about all DT_VARIANT uses, so instead just use partial conversion. This would still flag/fail if one of the explicitly marked illegal ops are encountered. PiperOrigin-RevId: 313306614 Change-Id: I1e56d2ea8f82bf5a7b72f6507efa9310b04e1cad --- .../mlir/lite/transforms/lower_static_tensor_list.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 49be29065fe..45b8c9e5fb2 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -838,7 +838,8 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( // TensorFlow operations that doesn't have operands and results of type // variant are legal. Here, we don't distinguish between variants encoding // TensorList or some other type as that information is not available here. - // This constraint should be relaxed to support other variant types in TFLite. + // Partial legalization is used below to still allow ops with variant types + // still. auto is_legal = [](Operation *op) { auto is_not_variant = [](Type ty) { return !ty.cast().getElementType().isa(); @@ -873,7 +874,7 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( ConvertTensorListPushBack, ConvertTensorListReserve, ConvertTensorListSetItem, ConvertTensorListStack, ConvertTensorListResize, ConvertWhile>(context); - return applyFullConversion(func, target, patterns); + return applyPartialConversion(func, target, patterns); } void LowerStaticTensorListPass::runOnOperation() { From 518423dd27e1673a9cafa76507178c11c83de560 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 26 May 2020 18:34:20 -0700 Subject: [PATCH 1173/1533] [XLA:Python] Split bindings for XLA ops into a separate file. No functional changes. This is partially to make xla.cc shorter and partially to parallelize its build time. 
PiperOrigin-RevId: 313307447 Change-Id: I4f6de5723dbef4464599813bc9284b4ac9e271d7 --- tensorflow/compiler/xla/python/BUILD | 33 ++- tensorflow/compiler/xla/python/ops.cc | 356 ++++++++++++++++++++++++++ tensorflow/compiler/xla/python/ops.h | 27 ++ tensorflow/compiler/xla/python/xla.cc | 322 +---------------------- 4 files changed, 411 insertions(+), 327 deletions(-) create mode 100644 tensorflow/compiler/xla/python/ops.cc create mode 100644 tensorflow/compiler/xla/python/ops.h diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 5b4182b75e1..3dcdc46040a 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -186,6 +186,32 @@ cc_library( ], ) +cc_library( + name = "ops", + srcs = ["ops.cc"], + hdrs = ["ops.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + deps = [ + ":types", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/client/lib:comparators", + "//tensorflow/compiler/xla/client/lib:math", + "//tensorflow/compiler/xla/client/lib:qr", + "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", + "//tensorflow/compiler/xla/client/lib:sorting", + "//tensorflow/compiler/xla/client/lib:svd", + "@com_google_absl//absl/types:optional", + "@com_google_absl//absl/types:span", + "@pybind11", + ], +) + config_setting( name = "enable_gpu", values = {"define": "xla_python_enable_gpu=true"}, @@ -205,6 +231,7 @@ pybind_extension( deps = [ ":bfloat16", ":dlpack", + ":ops", ":python_ref_manager", ":types", "@com_google_absl//absl/base", @@ -228,12 +255,6 @@ pybind_extension( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/client/lib:comparators", - "//tensorflow/compiler/xla/client/lib:math", - "//tensorflow/compiler/xla/client/lib:qr", - "//tensorflow/compiler/xla/client/lib:self_adjoint_eig", - "//tensorflow/compiler/xla/client/lib:sorting", - "//tensorflow/compiler/xla/client/lib:svd", "//tensorflow/compiler/xla/pjrt:cpu_device", "//tensorflow/compiler/xla/pjrt:nvidia_gpu_device", "//tensorflow/compiler/xla/pjrt:pjrt_client", diff --git a/tensorflow/compiler/xla/python/ops.cc b/tensorflow/compiler/xla/python/ops.cc new file mode 100644 index 00000000000..89891d39f78 --- /dev/null +++ b/tensorflow/compiler/xla/python/ops.cc @@ -0,0 +1,356 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/python/ops.h" + +#include +#include + +#include "absl/types/optional.h" +#include "absl/types/span.h" +#include "pybind11/attr.h" +#include "pybind11/pybind11.h" +#include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/client/lib/math.h" +#include "tensorflow/compiler/xla/client/lib/qr.h" +#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" +#include "tensorflow/compiler/xla/client/lib/sorting.h" +#include "tensorflow/compiler/xla/client/lib/svd.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/python/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { + +namespace py = pybind11; + +void BuildOpsSubmodule(py::module* m) { + // ops submodule, containing free functions that add operators to an + // XlaBuilder. + py::module ops = m->def_submodule("ops", "XLA operations"); + + py::enum_( + ops, "TriangularSolveOptions_Transpose") + .value("TRANSPOSE_INVALID", TriangularSolveOptions::TRANSPOSE_INVALID) + .value("NO_TRANSPOSE", TriangularSolveOptions::NO_TRANSPOSE) + .value("TRANSPOSE", TriangularSolveOptions::TRANSPOSE) + .value("ADJOINT", TriangularSolveOptions::ADJOINT); + + ops.def("AfterAll", &AfterAll, py::arg("builder"), py::arg("tokens")); + ops.def( + "AllReduce", + static_cast, + const absl::optional&, const absl::optional&)>( + &AllReduce), + py::arg("operand"), py::arg("computation"), + py::arg("replica_groups") = py::list(), + py::arg("channel_id") = absl::nullopt, + py::arg("shape_with_layout") = absl::nullopt); + ops.def("AllToAll", &AllToAll, py::arg("operand"), py::arg("split_dimension"), + py::arg("concat_dimension"), py::arg("split_count"), + py::arg("replica_groups") = py::list(), + py::arg("layout") = absl::nullopt); + ops.def("CollectivePermute", &CollectivePermute, py::arg("operand"), + py::arg("source_target_pairs")); + ops.def("CreateToken", &CreateToken, py::arg("builder")); + ops.def("CrossReplicaSum", + static_cast)>( + &CrossReplicaSum), + py::arg("operand"), py::arg("replica_groups") = py::list()); + ops.def("BitcastConvertType", &BitcastConvertType, py::arg("operand"), + py::arg("new_element_type")); + ops.def("Broadcast", &Broadcast, py::arg("operand"), py::arg("sizes")); + ops.def("BroadcastInDim", &BroadcastInDim, py::arg("operand"), + py::arg("shape"), py::arg("broadcast_dimensions")); + ops.def("Call", &Call, py::arg("builder"), py::arg("computation"), + py::arg("operands")); + ops.def("Cholesky", &Cholesky, py::arg("a"), py::arg("lower") = true); + ops.def("Clamp", &Clamp, py::arg("min"), py::arg("operand"), py::arg("max")); + ops.def("Collapse", &Collapse, py::arg("operand"), py::arg("dimensions")); + ops.def("ConcatInDim", &ConcatInDim, py::arg("builder"), py::arg("operands"), + py::arg("dimension")); + ops.def("Conditional", + static_cast, + absl::Span)>(&Conditional), + py::arg("branch_index"), py::arg("branch_computations"), + py::arg("branch_operands")); + ops.def("Conditional", + static_cast(&Conditional), + py::arg("predicate"), py::arg("true_operand"), + py::arg("true_computation"), py::arg("false_operand"), + py::arg("false_computation")); + ops.def("Constant", &ConstantLiteral, py::arg("builder"), py::arg("literal")); + ops.def("ConstantLiteral", &ConstantLiteral, py::arg("builder"), + py::arg("literal")); + ops.def("ConvGeneralDilated", 
&ConvGeneralDilated, py::arg("lhs"), + py::arg("rhs"), py::arg("window_strides"), py::arg("padding"), + py::arg("lhs_dilation"), py::arg("rhs_dilation"), + py::arg("dimension_numbers"), py::arg("feature_group_count") = 1, + py::arg("batch_group_count") = 1, + py::arg("precision_config") = nullptr); + ops.def("ConvertElementType", &ConvertElementType, py::arg("operand"), + py::arg("new_element_type")); + ops.def( + "CustomCall", + [](XlaBuilder* builder, const py::bytes& call_target_name, + absl::Span operands, const Shape& shape, + const py::bytes& opaque) -> XlaOp { + return CustomCall(builder, call_target_name, operands, shape, opaque); + }, + py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), + py::arg("shape"), py::arg("opaque") = py::bytes("")); + ops.def( + "CustomCallWithLayout", + [](XlaBuilder* builder, const py::bytes& call_target_name, + absl::Span operands, const Shape& shape_with_layout, + absl::Span operand_shapes_with_layout, + const py::bytes& opaque) -> XlaOp { + return CustomCallWithLayout(builder, call_target_name, operands, + shape_with_layout, + operand_shapes_with_layout, opaque); + }, + py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), + py::arg("shape_with_layout"), py::arg("operand_shapes_with_layout"), + py::arg("opaque") = py::bytes("")); + ops.def("Dot", &Dot, py::arg("lhs"), py::arg("rhs"), + py::arg("precision_config") = nullptr); + ops.def("DotGeneral", &DotGeneral, py::arg("lhs"), py::arg("rhs"), + py::arg("dimension_numbers"), py::arg("precision_config") = nullptr); + ops.def("DynamicSlice", + static_cast, + absl::Span)>(&DynamicSlice), + py::arg("operand"), py::arg("start_indices"), py::arg("slice_sizes")); + ops.def("DynamicUpdateSlice", + static_cast)>( + &DynamicUpdateSlice), + py::arg("operand"), py::arg("update"), py::arg("start_indices")); + + ops.def("Fft", &Fft, py::arg("operand"), py::arg("fft_type"), + py::arg("fft_length")); + + ops.def("Gather", &Gather, py::arg("a"), py::arg("start_indices"), + py::arg("dimension_numbers"), py::arg("slice_sizes"), + py::arg("indices_are_sorted") = false); + ops.def("GetTupleElement", &GetTupleElement, py::arg("tuple_data"), + py::arg("index")); + ops.def("InfeedWithToken", &InfeedWithToken, py::arg("token"), + py::arg("shape"), py::arg("config") = ""); + ops.def("Iota", + static_cast(&Iota), + py::arg("builder"), py::arg("shape"), py::arg("iota_dimension")); + ops.def("Iota", + static_cast(&Iota), + py::arg("builder"), py::arg("type"), py::arg("size")); + ops.def("Map", &Map, py::arg("builder"), py::arg("operands"), + py::arg("computation"), py::arg("dimensions"), + py::arg("static_operands") = py::list()); + ops.def("NextAfter", &NextAfter, py::arg("from"), py::arg("to")); + ops.def("OutfeedWithToken", &OutfeedWithToken, py::arg("operand"), + py::arg("token"), py::arg("shape_with_layout"), + py::arg("outfeed_config") = ""); + ops.def("Pad", &Pad, py::arg("operand"), py::arg("padding_value"), + py::arg("padding_config")); + ops.def("Parameter", + static_cast&)>( + &Parameter), + py::arg("builder"), py::arg("parameter_number"), py::arg("shape"), + py::arg("name") = "", + py::arg("replicated_at_leaf_buffers") = std::vector()); + ops.def( + "QR", + [](XlaOp a, bool full_matrices) -> StatusOr> { + TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, full_matrices)); + return std::make_pair(qr.q, qr.r); + }, + py::arg("operand"), py::arg("full_matrices")); + ops.def( + "Eigh", + [](XlaOp a, bool lower, int64 max_iter, + float epsilon) -> std::pair { + auto eigh = SelfAdjointEig(a, 
lower, max_iter, epsilon); + return std::make_pair(eigh.v, eigh.w); + }, + py::arg("a"), py::arg("lower") = true, py::arg("max_iter") = 100, + py::arg("epsilon") = 1e-6); + ops.def( + "SVD", + [](XlaOp a, int64 max_iter, + float epsilon) -> std::tuple { + auto svd = SVD(a, max_iter, epsilon); + return std::make_tuple(svd.u, svd.d, svd.v); + }, + py::arg("a"), py::arg("max_iter") = 100, py::arg("epsilon") = 1e-6); + ops.def("Reduce", + static_cast, + absl::Span, const XlaComputation&, + absl::Span)>(&Reduce), + py::arg("builder"), py::arg("operands"), py::arg("init_values"), + py::arg("computation"), py::arg("dimensions_to_reduce")); + ops.def("ReducePrecision", &ReducePrecision, py::arg("operand"), + py::arg("exponent_bits"), py::arg("mantissa_bits")); + ops.def("ReduceWindowWithGeneralPadding", &ReduceWindowWithGeneralPadding, + py::arg("operand"), py::arg("init_value"), py::arg("computation"), + py::arg("window_dimensions"), py::arg("window_strides"), + py::arg("base_dilations"), py::arg("window_dilations"), + py::arg("padding")); + ops.def("ReplicaId", &ReplicaId, py::arg("builder")); + ops.def("Reshape", + static_cast, + absl::Span)>(&Reshape), + py::arg("operand"), py::arg("dimensions"), py::arg("new_sizes")); + ops.def("Reshape", + static_cast)>(&Reshape), + py::arg("operand"), py::arg("new_sizes")); + ops.def("Rev", &Rev, py::arg("operand"), py::arg("dimensions")); + ops.def("RngNormal", &RngNormal, py::arg("mu"), py::arg("sigma"), + py::arg("shape")); + ops.def("RngUniform", &RngUniform, py::arg("a"), py::arg("b"), + py::arg("shape")); + ops.def("Scatter", &Scatter, py::arg("input"), py::arg("scatter_indices"), + py::arg("updates"), py::arg("update_computation"), + py::arg("dimension_numbers"), py::arg("indices_are_sorted") = false, + py::arg("unique_indices") = false); + ops.def("Select", &Select, py::arg("pred"), py::arg("on_true"), + py::arg("on_false")); + ops.def("SelectAndScatterWithGeneralPadding", + &SelectAndScatterWithGeneralPadding, py::arg("operand"), + py::arg("select"), py::arg("window_dimensions"), + py::arg("window_strides"), py::arg("padding"), py::arg("source"), + py::arg("init_value"), py::arg("scatter")); + ops.def("Slice", &Slice, py::arg("operand"), py::arg("start_indices"), + py::arg("limit_indices"), py::arg("strides")); + ops.def("SliceInDim", &SliceInDim, py::arg("operand"), py::arg("start_index"), + py::arg("limit_index"), py::arg("stride"), py::arg("dimno")); + ops.def( + "Sort", + [](XlaBuilder* builder, absl::Span operands, + absl::optional comparator, int64 dimension, + bool is_stable) -> XlaOp { + return builder->ReportErrorOrReturn([&]() -> StatusOr { + std::vector operand_types; + for (const auto& operand : operands) { + TF_ASSIGN_OR_RETURN(auto operand_shape, builder->GetShape(operand)); + operand_types.push_back(operand_shape.element_type()); + } + + if (comparator) { + return Sort(operands, **comparator, dimension, is_stable); + } else { + return Sort(operands, + CreateScalarLtComputation(operand_types, builder), + dimension, is_stable); + } + }); + }, + py::arg("builder"), py::arg("operands"), + py::arg("comparator") = absl::nullopt, py::arg("dimension") = -1, + py::arg("is_stable") = false); + ops.def("TopK", &TopK, py::arg("input"), py::arg("k")); + ops.def("Transpose", &Transpose, py::arg("operand"), py::arg("permutation")); + ops.def("TriangularSolve", &TriangularSolve, py::arg("a"), py::arg("b"), + py::arg("left_side"), py::arg("lower"), py::arg("unit_diagonal"), + py::arg("transpose_a")); + ops.def("Tuple", &Tuple, py::arg("builder"), 
py::arg("elements")); + ops.def("While", &While, py::arg("condition"), py::arg("body"), + py::arg("init")); + + ops.def("Igamma", &Igamma, py::arg("a"), py::arg("x")); + ops.def("Igammac", &Igammac, py::arg("a"), py::arg("x")); + ops.def("IgammaGradA", &IgammaGradA, py::arg("a"), py::arg("x")); + ops.def("RandomGammaGrad", &RandomGammaGrad, py::arg("a"), py::arg("x")); + ops.def("RegularizedIncompleteBeta", &RegularizedIncompleteBeta, py::arg("a"), + py::arg("b"), py::arg("x")); + +#define BINARY_OP(op) \ + ops.def( \ + #op, \ + [](XlaOp a, XlaOp b, absl::optional> dims) { \ + return dims ? op(a, b, *dims) : op(a, b); \ + }, \ + py::arg("lhs"), py::arg("rhs"), \ + py::arg("broadcast_dimensions") = absl::nullopt) + BINARY_OP(Eq); + BINARY_OP(Ne); + BINARY_OP(Ge); + BINARY_OP(Gt); + BINARY_OP(Lt); + BINARY_OP(Le); + BINARY_OP(Add); + BINARY_OP(Sub); + BINARY_OP(Mul); + BINARY_OP(Div); + BINARY_OP(Rem); + BINARY_OP(Max); + BINARY_OP(Min); + BINARY_OP(And); + BINARY_OP(Or); + BINARY_OP(Xor); + BINARY_OP(ShiftLeft); + BINARY_OP(ShiftRightArithmetic); + BINARY_OP(ShiftRightLogical); + BINARY_OP(Atan2); + BINARY_OP(Pow); + BINARY_OP(Complex); +#undef BINARY_OP + +#define UNARY_OP(op) ops.def(#op, &op) + UNARY_OP(Not); + UNARY_OP(PopulationCount); + UNARY_OP(Clz); + UNARY_OP(Abs); + UNARY_OP(Exp); + UNARY_OP(Expm1); + UNARY_OP(Floor); + UNARY_OP(Ceil); + UNARY_OP(Round); + UNARY_OP(Log); + UNARY_OP(Log1p); + UNARY_OP(Sign); + UNARY_OP(Cos); + UNARY_OP(Sin); + UNARY_OP(Tanh); + UNARY_OP(IsFinite); + UNARY_OP(Neg); + UNARY_OP(Sqrt); + UNARY_OP(Rsqrt); + UNARY_OP(Square); + UNARY_OP(Reciprocal); + UNARY_OP(Erfc); + UNARY_OP(Erf); + UNARY_OP(ErfInv); + UNARY_OP(Lgamma); + UNARY_OP(Digamma); + UNARY_OP(BesselI0e); + UNARY_OP(BesselI1e); + UNARY_OP(Acos); + UNARY_OP(Asin); + UNARY_OP(Atan); + UNARY_OP(Tan); + UNARY_OP(Acosh); + UNARY_OP(Asinh); + UNARY_OP(Atanh); + UNARY_OP(Cosh); + UNARY_OP(Sinh); + UNARY_OP(Real); + UNARY_OP(Imag); + UNARY_OP(Conj); +#undef UNARY_OP +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/ops.h b/tensorflow/compiler/xla/python/ops.h new file mode 100644 index 00000000000..7fe34e941ba --- /dev/null +++ b/tensorflow/compiler/xla/python/ops.h @@ -0,0 +1,27 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_OPS_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_OPS_H_ + +#include "pybind11/pybind11.h" + +namespace xla { + +void BuildOpsSubmodule(pybind11::module* m); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_OPS_H_ diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index abf0937d057..fb7d7df58f7 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -30,12 +30,6 @@ limitations under the License. 
#include "pybind11/pybind11.h" #include "pybind11/pytypes.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/lib/comparators.h" -#include "tensorflow/compiler/xla/client/lib/math.h" -#include "tensorflow/compiler/xla/client/lib/qr.h" -#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" -#include "tensorflow/compiler/xla/client/lib/sorting.h" -#include "tensorflow/compiler/xla/client/lib/svd.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -48,6 +42,7 @@ limitations under the License. #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/python/bfloat16.h" #include "tensorflow/compiler/xla/python/dlpack.h" +#include "tensorflow/compiler/xla/python/ops.h" #include "tensorflow/compiler/xla/python/python_ref_manager.h" #include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" @@ -306,321 +301,6 @@ StatusOr PjRtBufferCudaArrayInterface(const PjRtBuffer& buffer) { return result; } -void BuildOpsSubmodule(py::module* m) { - // ops submodule, containing free functions that add operators to an - // XlaBuilder. - py::module ops = m->def_submodule("ops", "XLA operations"); - - py::enum_( - ops, "TriangularSolveOptions_Transpose") - .value("TRANSPOSE_INVALID", TriangularSolveOptions::TRANSPOSE_INVALID) - .value("NO_TRANSPOSE", TriangularSolveOptions::NO_TRANSPOSE) - .value("TRANSPOSE", TriangularSolveOptions::TRANSPOSE) - .value("ADJOINT", TriangularSolveOptions::ADJOINT); - - ops.def("AfterAll", &AfterAll, py::arg("builder"), py::arg("tokens")); - ops.def( - "AllReduce", - static_cast, - const absl::optional&, const absl::optional&)>( - &AllReduce), - py::arg("operand"), py::arg("computation"), - py::arg("replica_groups") = py::list(), - py::arg("channel_id") = absl::nullopt, - py::arg("shape_with_layout") = absl::nullopt); - ops.def("AllToAll", &AllToAll, py::arg("operand"), py::arg("split_dimension"), - py::arg("concat_dimension"), py::arg("split_count"), - py::arg("replica_groups") = py::list(), - py::arg("layout") = absl::nullopt); - ops.def("CollectivePermute", &CollectivePermute, py::arg("operand"), - py::arg("source_target_pairs")); - ops.def("CreateToken", &CreateToken, py::arg("builder")); - ops.def("CrossReplicaSum", - static_cast)>( - &CrossReplicaSum), - py::arg("operand"), py::arg("replica_groups") = py::list()); - ops.def("BitcastConvertType", &BitcastConvertType, py::arg("operand"), - py::arg("new_element_type")); - ops.def("Broadcast", &Broadcast, py::arg("operand"), py::arg("sizes")); - ops.def("BroadcastInDim", &BroadcastInDim, py::arg("operand"), - py::arg("shape"), py::arg("broadcast_dimensions")); - ops.def("Call", &Call, py::arg("builder"), py::arg("computation"), - py::arg("operands")); - ops.def("Cholesky", &Cholesky, py::arg("a"), py::arg("lower") = true); - ops.def("Clamp", &Clamp, py::arg("min"), py::arg("operand"), py::arg("max")); - ops.def("Collapse", &Collapse, py::arg("operand"), py::arg("dimensions")); - ops.def("ConcatInDim", &ConcatInDim, py::arg("builder"), py::arg("operands"), - py::arg("dimension")); - ops.def("Conditional", - static_cast, - absl::Span)>(&Conditional), - py::arg("branch_index"), py::arg("branch_computations"), - py::arg("branch_operands")); - ops.def("Conditional", - static_cast(&Conditional), - py::arg("predicate"), py::arg("true_operand"), - 
py::arg("true_computation"), py::arg("false_operand"), - py::arg("false_computation")); - ops.def("Constant", &ConstantLiteral, py::arg("builder"), py::arg("literal")); - ops.def("ConstantLiteral", &ConstantLiteral, py::arg("builder"), - py::arg("literal")); - ops.def("ConvGeneralDilated", &ConvGeneralDilated, py::arg("lhs"), - py::arg("rhs"), py::arg("window_strides"), py::arg("padding"), - py::arg("lhs_dilation"), py::arg("rhs_dilation"), - py::arg("dimension_numbers"), py::arg("feature_group_count") = 1, - py::arg("batch_group_count") = 1, - py::arg("precision_config") = nullptr); - ops.def("ConvertElementType", &ConvertElementType, py::arg("operand"), - py::arg("new_element_type")); - ops.def( - "CustomCall", - [](XlaBuilder* builder, const py::bytes& call_target_name, - absl::Span operands, const Shape& shape, - const py::bytes& opaque) -> XlaOp { - return CustomCall(builder, call_target_name, operands, shape, opaque); - }, - py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), - py::arg("shape"), py::arg("opaque") = py::bytes("")); - ops.def( - "CustomCallWithLayout", - [](XlaBuilder* builder, const py::bytes& call_target_name, - absl::Span operands, const Shape& shape_with_layout, - absl::Span operand_shapes_with_layout, - const py::bytes& opaque) -> XlaOp { - return CustomCallWithLayout(builder, call_target_name, operands, - shape_with_layout, - operand_shapes_with_layout, opaque); - }, - py::arg("builder"), py::arg("call_target_name"), py::arg("operands"), - py::arg("shape_with_layout"), py::arg("operand_shapes_with_layout"), - py::arg("opaque") = py::bytes("")); - ops.def("Dot", &Dot, py::arg("lhs"), py::arg("rhs"), - py::arg("precision_config") = nullptr); - ops.def("DotGeneral", &DotGeneral, py::arg("lhs"), py::arg("rhs"), - py::arg("dimension_numbers"), py::arg("precision_config") = nullptr); - ops.def("DynamicSlice", - static_cast, - absl::Span)>(&DynamicSlice), - py::arg("operand"), py::arg("start_indices"), py::arg("slice_sizes")); - ops.def("DynamicUpdateSlice", - static_cast)>( - &DynamicUpdateSlice), - py::arg("operand"), py::arg("update"), py::arg("start_indices")); - - ops.def("Fft", &Fft, py::arg("operand"), py::arg("fft_type"), - py::arg("fft_length")); - - ops.def("Gather", &Gather, py::arg("a"), py::arg("start_indices"), - py::arg("dimension_numbers"), py::arg("slice_sizes"), - py::arg("indices_are_sorted") = false); - ops.def("GetTupleElement", &GetTupleElement, py::arg("tuple_data"), - py::arg("index")); - ops.def("InfeedWithToken", &InfeedWithToken, py::arg("token"), - py::arg("shape"), py::arg("config") = ""); - ops.def("Iota", - static_cast(&Iota), - py::arg("builder"), py::arg("shape"), py::arg("iota_dimension")); - ops.def("Iota", - static_cast(&Iota), - py::arg("builder"), py::arg("type"), py::arg("size")); - ops.def("Map", &Map, py::arg("builder"), py::arg("operands"), - py::arg("computation"), py::arg("dimensions"), - py::arg("static_operands") = py::list()); - ops.def("NextAfter", &NextAfter, py::arg("from"), py::arg("to")); - ops.def("OutfeedWithToken", &OutfeedWithToken, py::arg("operand"), - py::arg("token"), py::arg("shape_with_layout"), - py::arg("outfeed_config") = ""); - ops.def("Pad", &Pad, py::arg("operand"), py::arg("padding_value"), - py::arg("padding_config")); - ops.def("Parameter", - static_cast&)>( - &Parameter), - py::arg("builder"), py::arg("parameter_number"), py::arg("shape"), - py::arg("name") = "", - py::arg("replicated_at_leaf_buffers") = std::vector()); - ops.def( - "QR", - [](XlaOp a, bool full_matrices) -> 
StatusOr> { - TF_ASSIGN_OR_RETURN(auto qr, QRDecomposition(a, full_matrices)); - return std::make_pair(qr.q, qr.r); - }, - py::arg("operand"), py::arg("full_matrices")); - ops.def( - "Eigh", - [](XlaOp a, bool lower, int64 max_iter, - float epsilon) -> std::pair { - auto eigh = SelfAdjointEig(a, lower, max_iter, epsilon); - return std::make_pair(eigh.v, eigh.w); - }, - py::arg("a"), py::arg("lower") = true, py::arg("max_iter") = 100, - py::arg("epsilon") = 1e-6); - ops.def( - "SVD", - [](XlaOp a, int64 max_iter, - float epsilon) -> std::tuple { - auto svd = SVD(a, max_iter, epsilon); - return std::make_tuple(svd.u, svd.d, svd.v); - }, - py::arg("a"), py::arg("max_iter") = 100, py::arg("epsilon") = 1e-6); - ops.def("Reduce", - static_cast, - absl::Span, const XlaComputation&, - absl::Span)>(&Reduce), - py::arg("builder"), py::arg("operands"), py::arg("init_values"), - py::arg("computation"), py::arg("dimensions_to_reduce")); - ops.def("ReducePrecision", &ReducePrecision, py::arg("operand"), - py::arg("exponent_bits"), py::arg("mantissa_bits")); - ops.def("ReduceWindowWithGeneralPadding", &ReduceWindowWithGeneralPadding, - py::arg("operand"), py::arg("init_value"), py::arg("computation"), - py::arg("window_dimensions"), py::arg("window_strides"), - py::arg("base_dilations"), py::arg("window_dilations"), - py::arg("padding")); - ops.def("ReplicaId", &ReplicaId, py::arg("builder")); - ops.def("Reshape", - static_cast, - absl::Span)>(&Reshape), - py::arg("operand"), py::arg("dimensions"), py::arg("new_sizes")); - ops.def("Reshape", - static_cast)>(&Reshape), - py::arg("operand"), py::arg("new_sizes")); - ops.def("Rev", &Rev, py::arg("operand"), py::arg("dimensions")); - ops.def("RngNormal", &RngNormal, py::arg("mu"), py::arg("sigma"), - py::arg("shape")); - ops.def("RngUniform", &RngUniform, py::arg("a"), py::arg("b"), - py::arg("shape")); - ops.def("Scatter", &Scatter, py::arg("input"), py::arg("scatter_indices"), - py::arg("updates"), py::arg("update_computation"), - py::arg("dimension_numbers"), py::arg("indices_are_sorted") = false, - py::arg("unique_indices") = false); - ops.def("Select", &Select, py::arg("pred"), py::arg("on_true"), - py::arg("on_false")); - ops.def("SelectAndScatterWithGeneralPadding", - &SelectAndScatterWithGeneralPadding, py::arg("operand"), - py::arg("select"), py::arg("window_dimensions"), - py::arg("window_strides"), py::arg("padding"), py::arg("source"), - py::arg("init_value"), py::arg("scatter")); - ops.def("Slice", &Slice, py::arg("operand"), py::arg("start_indices"), - py::arg("limit_indices"), py::arg("strides")); - ops.def("SliceInDim", &SliceInDim, py::arg("operand"), py::arg("start_index"), - py::arg("limit_index"), py::arg("stride"), py::arg("dimno")); - ops.def( - "Sort", - [](XlaBuilder* builder, absl::Span operands, - absl::optional comparator, int64 dimension, - bool is_stable) -> XlaOp { - return builder->ReportErrorOrReturn([&]() -> StatusOr { - std::vector operand_types; - for (const auto& operand : operands) { - TF_ASSIGN_OR_RETURN(auto operand_shape, builder->GetShape(operand)); - operand_types.push_back(operand_shape.element_type()); - } - - if (comparator) { - return Sort(operands, **comparator, dimension, is_stable); - } else { - return Sort(operands, - CreateScalarLtComputation(operand_types, builder), - dimension, is_stable); - } - }); - }, - py::arg("builder"), py::arg("operands"), - py::arg("comparator") = absl::nullopt, py::arg("dimension") = -1, - py::arg("is_stable") = false); - ops.def("TopK", &TopK, py::arg("input"), py::arg("k")); - 
ops.def("Transpose", &Transpose, py::arg("operand"), py::arg("permutation")); - ops.def("TriangularSolve", &TriangularSolve, py::arg("a"), py::arg("b"), - py::arg("left_side"), py::arg("lower"), py::arg("unit_diagonal"), - py::arg("transpose_a")); - ops.def("Tuple", &Tuple, py::arg("builder"), py::arg("elements")); - ops.def("While", &While, py::arg("condition"), py::arg("body"), - py::arg("init")); - - ops.def("Igamma", &Igamma, py::arg("a"), py::arg("x")); - ops.def("Igammac", &Igammac, py::arg("a"), py::arg("x")); - ops.def("IgammaGradA", &IgammaGradA, py::arg("a"), py::arg("x")); - ops.def("RandomGammaGrad", &RandomGammaGrad, py::arg("a"), py::arg("x")); - ops.def("RegularizedIncompleteBeta", &RegularizedIncompleteBeta, py::arg("a"), - py::arg("b"), py::arg("x")); - -#define BINARY_OP(op) \ - ops.def( \ - #op, \ - [](XlaOp a, XlaOp b, absl::optional> dims) { \ - return dims ? op(a, b, *dims) : op(a, b); \ - }, \ - py::arg("lhs"), py::arg("rhs"), \ - py::arg("broadcast_dimensions") = absl::nullopt) - BINARY_OP(Eq); - BINARY_OP(Ne); - BINARY_OP(Ge); - BINARY_OP(Gt); - BINARY_OP(Lt); - BINARY_OP(Le); - BINARY_OP(Add); - BINARY_OP(Sub); - BINARY_OP(Mul); - BINARY_OP(Div); - BINARY_OP(Rem); - BINARY_OP(Max); - BINARY_OP(Min); - BINARY_OP(And); - BINARY_OP(Or); - BINARY_OP(Xor); - BINARY_OP(ShiftLeft); - BINARY_OP(ShiftRightArithmetic); - BINARY_OP(ShiftRightLogical); - BINARY_OP(Atan2); - BINARY_OP(Pow); - BINARY_OP(Complex); -#undef BINARY_OP - -#define UNARY_OP(op) ops.def(#op, &op) - UNARY_OP(Not); - UNARY_OP(PopulationCount); - UNARY_OP(Clz); - UNARY_OP(Abs); - UNARY_OP(Exp); - UNARY_OP(Expm1); - UNARY_OP(Floor); - UNARY_OP(Ceil); - UNARY_OP(Round); - UNARY_OP(Log); - UNARY_OP(Log1p); - UNARY_OP(Sign); - UNARY_OP(Cos); - UNARY_OP(Sin); - UNARY_OP(Tanh); - UNARY_OP(IsFinite); - UNARY_OP(Neg); - UNARY_OP(Sqrt); - UNARY_OP(Rsqrt); - UNARY_OP(Square); - UNARY_OP(Reciprocal); - UNARY_OP(Erfc); - UNARY_OP(Erf); - UNARY_OP(ErfInv); - UNARY_OP(Lgamma); - UNARY_OP(Digamma); - UNARY_OP(BesselI0e); - UNARY_OP(BesselI1e); - UNARY_OP(Acos); - UNARY_OP(Asin); - UNARY_OP(Atan); - UNARY_OP(Tan); - UNARY_OP(Acosh); - UNARY_OP(Asinh); - UNARY_OP(Atanh); - UNARY_OP(Cosh); - UNARY_OP(Sinh); - UNARY_OP(Real); - UNARY_OP(Imag); - UNARY_OP(Conj); -#undef UNARY_OP -} void BuildProfilerSubmodule(py::module* m) { py::module profiler = From 301e6c8003445d2820a05687192a5fcfcf83a4c6 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 26 May 2020 19:19:51 -0700 Subject: [PATCH 1174/1533] Modify reader datasets to support thread directories instead of singular snapshot files PiperOrigin-RevId: 313312280 Change-Id: I03f336a2c98e6c156a611239c1e1eb7379f41c4b --- .../core/kernels/data/experimental/BUILD | 1 + .../data/experimental/snapshot_util.cc | 96 ++++++++++++++----- .../kernels/data/experimental/snapshot_util.h | 5 +- 3 files changed, 76 insertions(+), 26 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index f4b9240ca31..a9790fd99a4 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -537,6 +537,7 @@ cc_library( "//tensorflow/core/platform:random", "//tensorflow/core/profiler/lib:traceme", "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/core/kernels/data/experimental/snapshot_util.cc b/tensorflow/core/kernels/data/experimental/snapshot_util.cc index 877d05ebb3f..31d1a87087e 100644 --- 
a/tensorflow/core/kernels/data/experimental/snapshot_util.cc +++ b/tensorflow/core/kernels/data/experimental/snapshot_util.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "absl/memory/memory.h" +#include "absl/strings/str_format.h" #include "tensorflow/core/common_runtime/dma_helper.h" #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/graph.pb.h" @@ -31,6 +32,7 @@ limitations under the License. #include "tensorflow/core/lib/io/zlib_inputstream.h" #include "tensorflow/core/lib/io/zlib_outputbuffer.h" #include "tensorflow/core/platform/coding.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/file_system.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/random.h" @@ -44,6 +46,12 @@ namespace snapshot_util { /* static */ constexpr const int64 Reader::kSnappyReaderInputBufferSizeBytes; /* static */ constexpr const int64 Reader::kSnappyReaderOutputBufferSizeBytes; +std::string GetCurrentCheckpointFile(const std::string& shard_directory, + const uint64 current_checkpoint_id) { + return io::JoinPath(shard_directory, + absl::StrFormat("%08d.snapshot", current_checkpoint_id)); +} + Writer::Writer(const std::string& filename, const std::string& compression_type, int version, const DataTypeVector& dtypes) : filename_(filename), @@ -225,12 +233,12 @@ Status Reader::Create(Env* env, const std::string& filename, class Reader::Dataset : public DatasetBase { public: - explicit Dataset(const std::string& filename, const std::string& compression, + explicit Dataset(const std::string& shard_dir, const std::string& compression, const int64 version, const DataTypeVector& dtypes, const std::vector& shapes, const int64 start_index, DatasetContext::Params params) : DatasetBase(DatasetContext(std::move(params))), - filename_(filename), + shard_dir_(shard_dir), compression_(compression), version_(version), dtypes_(dtypes), @@ -253,7 +261,8 @@ class Reader::Dataset : public DatasetBase { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** node) const override { - // TODO(frankchn): Implement for serialization and checkpointing. + // Not necessary perform any serialization as this dataset is only + // constructed at runtime in C++ and will be reconstructed every time. return Status::OK(); } @@ -264,23 +273,29 @@ class Reader::Dataset : public DatasetBase { } private: - std::string filename_; - std::string compression_; - int64 version_; - DataTypeVector dtypes_; - std::vector shapes_; + const std::string shard_dir_; + const std::string compression_; + const int64 version_; + const DataTypeVector dtypes_; + const std::vector shapes_; const int64 start_index_; class Iterator : public DatasetIterator { public: explicit Iterator(const Params& params) - : DatasetIterator(params) {} + : DatasetIterator(params), current_checkpoint_id_(0) {} Status Initialize(IteratorContext* ctx) override { TF_RETURN_IF_ERROR(Reader::Create( - ctx->env(), dataset()->filename_, dataset()->compression_, + ctx->env(), GetCurrentFilename(), dataset()->compression_, dataset()->version_, dataset()->dtypes_, &reader_)); - return reader_->SkipRecords(dataset()->start_index_); + bool end_of_sequence; + for (int64 i = 0; i < dataset()->start_index_; ++i) { + // TODO(frankchn): Optimize this to not parse every single element. 
+ std::vector unused; + TF_RETURN_IF_ERROR(GetNextInternal(ctx, &unused, &end_of_sequence)); + } + return Status::OK(); } protected: @@ -289,27 +304,53 @@ class Reader::Dataset : public DatasetBase { bool* end_of_sequence) override { *end_of_sequence = false; Status s = reader_->ReadTensors(out_tensors); - if (errors::IsOutOfRange(s)) { + if (!errors::IsOutOfRange(s)) { + return s; + } + Status status = AdvanceToNextFile(ctx->env()); + if (errors::IsNotFound(status)) { *end_of_sequence = true; return Status::OK(); + } else { + return status; } - return s; } Status SaveInternal(SerializationContext* ctx, IteratorStateWriter* writer) override { - // TODO(frankchn): Implement for serialization and checkpointing. + // Not necessary to save any state as this iterator will be reconstructed + // from scratch when the parent snapshot dataset is restored from + // checkpoint. return Status::OK(); } Status RestoreInternal(IteratorContext* ctx, IteratorStateReader* reader) override { - // TODO(frankchn): Implement for serialization and checkpointing. + // Not necessary to restore any state as this iterator will be + // reconstructed from scratch when the parent snapshot dataset is restored + // from checkpoint. return Status::OK(); } private: std::unique_ptr reader_; + + // Stores the id current checkpoint file that we are in the process of + // reading (e.g. if the file is currently 00000001.snapshot, then this will + // be 1). + uint64 current_checkpoint_id_; + + std::string GetCurrentFilename() { + return GetCurrentCheckpointFile(dataset()->shard_dir_, + current_checkpoint_id_); + } + + Status AdvanceToNextFile(Env* env) { + current_checkpoint_id_++; + TF_RETURN_IF_ERROR(env->FileExists(GetCurrentFilename())); + return Reader::Create(env, GetCurrentFilename(), dataset()->compression_, + dataset()->version_, dataset()->dtypes_, &reader_); + } }; }; @@ -340,7 +381,8 @@ class Reader::NestedDataset : public DatasetBase { Status AsGraphDefInternal(SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** node) const override { - // TODO(frankchn): Implement for serialization and checkpointing. + // Not necessary perform any serialization as this dataset is only + // constructed at runtime in C++ and will be reconstructed every time. return Status::OK(); } @@ -380,13 +422,17 @@ class Reader::NestedDataset : public DatasetBase { Status SaveInternal(SerializationContext* ctx, IteratorStateWriter* writer) override { - // TODO(frankchn): Implement for serialization and checkpointing. + // Not necessary to save any state as this iterator will be reconstructed + // from scratch when the parent snapshot dataset is restored from + // checkpoint. return Status::OK(); } Status RestoreInternal(IteratorContext* ctx, IteratorStateReader* reader) override { - // TODO(frankchn): Implement for serialization and checkpointing. + // Not necessary to restore any state as this iterator will be + // reconstructed from scratch when the parent snapshot dataset is restored + // from checkpoint. 
return Status::OK(); } @@ -396,7 +442,7 @@ class Reader::NestedDataset : public DatasetBase { }; Status Reader::MakeNestedDataset(Env* env, - const std::vector& filenames, + const std::vector& shard_dirs, const string& compression_type, int version, const DataTypeVector& dtypes, const std::vector& shapes, @@ -404,17 +450,17 @@ Status Reader::MakeNestedDataset(Env* env, DatasetBase** output) { std::vector datasets; - datasets.reserve(filenames.size()); - for (const auto& filename : filenames) { + datasets.reserve(shard_dirs.size()); + for (const auto& shard_dir : shard_dirs) { // TODO(frankchn): The reading pattern could be controlled in a non-round // robin fashion, so we cannot assume a round-robin manner when restoring. - int64 dataset_start_index = start_index / filenames.size(); - if (start_index % filenames.size() > datasets.size()) { + int64 dataset_start_index = start_index / shard_dirs.size(); + if (start_index % shard_dirs.size() > datasets.size()) { dataset_start_index++; } datasets.push_back( - new Dataset(filename, compression_type, version, dtypes, shapes, + new Dataset(shard_dir, compression_type, version, dtypes, shapes, dataset_start_index, DatasetContext::Params({"snapshot_util::Reader::Dataset", "snapshot_util_reader_Dataset"}))); @@ -423,7 +469,7 @@ Status Reader::MakeNestedDataset(Env* env, // Rotate the vector such that the first dataset contains the next element // to be produced. std::rotate(datasets.begin(), - datasets.begin() + (start_index % filenames.size()), + datasets.begin() + (start_index % shard_dirs.size()), datasets.end()); *output = new NestedDataset( diff --git a/tensorflow/core/kernels/data/experimental/snapshot_util.h b/tensorflow/core/kernels/data/experimental/snapshot_util.h index 79299bb79b4..a6455a85393 100644 --- a/tensorflow/core/kernels/data/experimental/snapshot_util.h +++ b/tensorflow/core/kernels/data/experimental/snapshot_util.h @@ -49,6 +49,9 @@ constexpr char kModePassthrough[] = "passthrough"; enum Mode { READER = 0, WRITER = 1, PASSTHROUGH = 2 }; +std::string GetCurrentCheckpointFile(const std::string& shard_directory, + const uint64 current_checkpoint_id); + class Writer { public: static constexpr const size_t kHeaderSize = sizeof(uint64); @@ -126,7 +129,7 @@ class Reader { // dataset. Each element within the nested dataset is itself a dataset, and // contains all the elements written out to each individual snapshot file. static Status MakeNestedDataset(Env* env, - const std::vector& filenames, + const std::vector& shard_dirs, const string& compression_type, int version, const DataTypeVector& dtypes, const std::vector& shapes, From 3d333927a31d11a24971cacca9f6b726a0f68fd9 Mon Sep 17 00:00:00 2001 From: Yash Katariya Date: Tue, 26 May 2020 19:23:25 -0700 Subject: [PATCH 1175/1533] Internal change for notebook tests. PiperOrigin-RevId: 313312575 Change-Id: I3e18d0103bc40cb6bf7ee27bbd0a144a95ce48a7 --- tensorflow/python/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index f9645786f8b..13c58c74583 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -56,6 +56,7 @@ visibility = [ "//third_party/py/tf_slim:__subpackages__", # TODO(aselle): to pass open source test. 
"//bazel_pip/tensorflow/lite/toco/python:__pkg__", + "//third_party/py/tensorflow_docs:__subpackages__", ] package( From fa0a9c876a960ec4fe9e768c1259a943cc91a4d5 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Tue, 26 May 2020 20:39:18 -0700 Subject: [PATCH 1176/1533] [XLA] Preserve replication info when cloning a parameter PiperOrigin-RevId: 313319423 Change-Id: Ic92f71d5bc78e0b0ab04264ba1ea0b4416c24159 --- tensorflow/compiler/xla/service/hlo_instructions.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index e33d5960894..9c5a66f0040 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -1867,8 +1867,14 @@ std::unique_ptr HloParameterInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* context) const { - return absl::make_unique(parameter_number_, shape, - name()); + auto clone = absl::make_unique(parameter_number_, + shape, name()); + if (parameter_replicated_at_leaf_buffers_ && + ShapeUtil::Equal(shape, this->shape())) { + clone->set_parameter_replicated_at_leaf_buffers( + *parameter_replicated_at_leaf_buffers_); + } + return clone; } HloGetTupleElementInstruction::HloGetTupleElementInstruction( From 0dda89c61ed5ecc72aa28368ff9c1230434424fb Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Tue, 26 May 2020 20:42:09 -0700 Subject: [PATCH 1177/1533] [TF/XLA] Rollback of rollback of 313256383, with a UB fix. PiperOrigin-RevId: 313319715 Change-Id: I4b73f95a228b3e6e4fed524492c9389a19629f02 --- tensorflow/compiler/tf2xla/BUILD | 1 + tensorflow/compiler/tf2xla/xla_compiler.cc | 7 +++++++ .../core/common_runtime/graph_optimizer.cc | 17 +++++++++------ .../core/common_runtime/graph_optimizer.h | 6 +++++- .../python/eager/def_function_xla_jit_test.py | 21 +++++++++++++++++++ 5 files changed, 45 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 55341c0a01f..37110442b26 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -350,6 +350,7 @@ cc_library( ":sharding_util", ":side_effect_util", ":tf2xla_util", + "//tensorflow/compiler/jit:common", "//tensorflow/compiler/jit:flags", "//tensorflow/compiler/jit:shape_inference", "//tensorflow/compiler/jit:xla_cluster_util", diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 3d6083621f4..1cf3e10b774 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/types/variant.h" +#include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" @@ -571,6 +572,10 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { std::unique_ptr graph(new Graph(options_.flib_def)); CopyGraph(*fbody->graph, graph.get()); + bool is_inside_mustcompile = false; + TryGetNodeAttr(AttrSlice(&fbody->fdef.attr()), kXlaMustCompileAttr, + &is_inside_mustcompile); + // Performs a first function inlining pass before shape inference, since // otherwise shape inference can't see inside functions and a comprehensive // shape_map, including function ops, is needed to constant-propagate Shape @@ -622,6 +627,8 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { graph_optimizer_options.inline_multi_device_functions = true; graph_optimizer_options.inline_impl_selection_group_functions = true; graph_optimizer_options.inline_with_single_device_body_placer = true; + graph_optimizer_options.ignore_noinline = is_inside_mustcompile; + optimizer.Optimize(flib_runtime_, flib_runtime_->env(), /*device=*/nullptr, &graph, graph_optimizer_options); diff --git a/tensorflow/core/common_runtime/graph_optimizer.cc b/tensorflow/core/common_runtime/graph_optimizer.cc index 746930750ad..ae1a2daa788 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.cc +++ b/tensorflow/core/common_runtime/graph_optimizer.cc @@ -42,7 +42,7 @@ void GraphOptimizer::Optimize( const NodePredicate& cse_consider_fn, const NodePredicate& cf_consider_fn, bool inline_multi_device_functions, bool inline_impl_selection_group_functions, - bool inline_with_single_device_body_placer) { + bool inline_with_single_device_body_placer, bool ignore_noinline) { Graph* g = graph->get(); DumpGraph("Initial", g); @@ -116,6 +116,11 @@ void GraphOptimizer::Optimize( .inline_impl_selection_group_functions = true; } + if (ignore_noinline) { + expand_inline_opts.multi_device_options.ignore_noinline = true; + expand_inline_opts.native_options.ignore_noinline = true; + } + bool was_mutated = ExpandInlineFunctions(runtime, g, expand_inline_opts); if (was_mutated) { DumpGraph("ExpandInlineFunctions", g); @@ -138,11 +143,11 @@ void GraphOptimizer::Optimize(FunctionLibraryRuntime* runtime, Env* env, const Device* device, std::unique_ptr* graph, const Options& options) { - Optimize(runtime, env, device, graph, options.shape_map, - options.cse_consider_fn, options.cf_consider_fn, - options.inline_multi_device_functions, - options.inline_impl_selection_group_functions, - options.inline_with_single_device_body_placer); + Optimize( + runtime, env, device, graph, options.shape_map, options.cse_consider_fn, + options.cf_consider_fn, options.inline_multi_device_functions, + options.inline_impl_selection_group_functions, + options.inline_with_single_device_body_placer, options.ignore_noinline); } void OptimizeGraph(FunctionLibraryRuntime* lib, std::unique_ptr* g, diff --git a/tensorflow/core/common_runtime/graph_optimizer.h b/tensorflow/core/common_runtime/graph_optimizer.h index 099ea8efa12..53bf532bd9c 100644 --- a/tensorflow/core/common_runtime/graph_optimizer.h +++ b/tensorflow/core/common_runtime/graph_optimizer.h @@ -58,6 +58,9 @@ class GraphOptimizer { // If true all functions will be inlined with a single device function // body placer strategy. 
bool inline_with_single_device_body_placer = false; + + // If true, the _noinline attribute on functions and callers is ignored. + bool ignore_noinline = false; }; explicit GraphOptimizer(const OptimizerOptions& opts); @@ -81,7 +84,8 @@ class GraphOptimizer { const NodePredicate& cf_consider_fn = nullptr, bool inline_multi_device_functions = false, bool inline_impl_selection_group_functions = false, - bool inline_with_single_device_body_placer = false); + bool inline_with_single_device_body_placer = false, + bool ignore_noinline = false); const OptimizerOptions& options() { return opts_; } diff --git a/tensorflow/python/eager/def_function_xla_jit_test.py b/tensorflow/python/eager/def_function_xla_jit_test.py index 5fdf0487333..b63a3b434d4 100644 --- a/tensorflow/python/eager/def_function_xla_jit_test.py +++ b/tensorflow/python/eager/def_function_xla_jit_test.py @@ -355,6 +355,27 @@ class DefFunctionTest(test.TestCase): self.assertAllClose([5.0, 5.0, 5.0], g()) self.assertAllClose(compiled_g(), g()) + def testTensorListConcatGradNestedCompile(self): + + @def_function.function(experimental_compile=True) + def f(x): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.float32, size=2, element_shape=[3]) + ta = ta.write(0, 2 * x) + ta = ta.write(1, 3 * x) + return ta.concat() + + @def_function.function(experimental_compile=True) + def g(): + x = constant_op.constant([3.14, 2.68, 7.69]) + with backprop.GradientTape() as tape: + tape.watch(x) + y = f(x) + out = tape.gradient(y, x) + return out + + self.assertAllClose([5.0, 5.0, 5.0], g()) + def testCumsum(self): @def_function.function(experimental_compile=True) From 0a1449a983e6e88e62e175d0f34564414d26c4dd Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Tue, 26 May 2020 20:51:52 -0700 Subject: [PATCH 1178/1533] Fix bugs in docstring of conv3d_transpose. Seems like depth was missing and NCHW and NHWC were mentioned instead of NCDHW and NDHWC. PiperOrigin-RevId: 313320657 Change-Id: Id599e6e7b91a18f193b952153c50c27839c5693f --- tensorflow/python/ops/nn_ops.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 24ee94fac48..b7dd1d20aae 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -3061,12 +3061,12 @@ def conv3d_transpose_v2(input, # pylint: disable=redefined-builtin rather than an actual deconvolution. Args: - input: A 5-D `Tensor` of type `float` and shape `[batch, height, width, - in_channels]` for `NHWC` data format or `[batch, in_channels, height, - width]` for `NCHW` data format. - filters: A 5-D `Tensor` with the same type as `value` and shape `[height, - width, output_channels, in_channels]`. `filter`'s `in_channels` dimension - must match that of `value`. + input: A 5-D `Tensor` of type `float` and shape `[batch, depth, height, + width, in_channels]` for `NDHWC` data format or `[batch, in_channels, + depth, height, width]` for `NCDHW` data format. + filters: A 5-D `Tensor` with the same type as `value` and shape `[depth, + height, width, output_channels, in_channels]`. `filter`'s `in_channels` + dimension must match that of `value`. output_shape: A 1-D `Tensor` representing the output shape of the deconvolution op. strides: An int or list of `ints` that has length `1`, `3` or `5`. 
The From ddb921bf7703ab04c8f16347484ced95b7f579ee Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Tue, 26 May 2020 21:00:47 -0700 Subject: [PATCH 1179/1533] Move test_spec in TensorFlowLiteSwift.podspec.template into subspec, so that the Delegate tests are only done in corresponding delegates. PiperOrigin-RevId: 313321621 Change-Id: Ibf39faa449315b75e9b61a4b6a91fac14e454232 --- .../TensorFlowLiteSwift.podspec.template | 26 ++++++++++++------- .../swift/Tests/InterpreterTests.swift | 24 ++--------------- .../swift/Tests/MetalDelegateTests.swift | 20 ++++++++++++++ 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template index b87b4c97d67..1e414f1959f 100644 --- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template +++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template @@ -26,7 +26,16 @@ Pod::Spec.new do |s| s.subspec 'Core' do |core| core.dependency 'TensorFlowLiteC', "#{s.version}" core.source_files = swift_dir + 'Sources/*.swift' - core.exclude_files = swift_dir + 'Sources/*Delegate.swift' + core.exclude_files = swift_dir + 'Sources/{CoreML,Metal}Delegate.swift' + + core.test_spec 'Tests' do |ts| + ts.source_files = swift_dir + 'Tests/*.swift' + ts.exclude_files = swift_dir + 'Tests/MetalDelegateTests.swift' + ts.resources = [ + tfl_dir + 'testdata/add.bin', + tfl_dir + 'testdata/add_quantized.bin', + ] + end end s.subspec 'CoreML' do |coreml| @@ -39,14 +48,13 @@ Pod::Spec.new do |s| metal.source_files = swift_dir + 'Sources/MetalDelegate.swift' metal.dependency 'TensorFlowLiteC/Metal', "#{s.version}" metal.dependency 'TensorFlowLiteSwift/Core', "#{s.version}" - end - - s.test_spec 'Tests' do |ts| - ts.source_files = swift_dir + 'Tests/*.swift' - ts.resources = [ - tfl_dir + 'testdata/add.bin', - tfl_dir + 'testdata/add_quantized.bin', - ] + metal.test_spec 'Tests' do |ts| + ts.source_files = swift_dir + 'Tests/{Interpreter,MetalDelegate}Tests.swift' + ts.resources = [ + tfl_dir + 'testdata/add.bin', + tfl_dir + 'testdata/add_quantized.bin', + ] + end end end diff --git a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift index 09b001cb0cb..8d0140279af 100644 --- a/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift +++ b/tensorflow/lite/experimental/swift/Tests/InterpreterTests.swift @@ -50,26 +50,6 @@ class InterpreterTests: XCTestCase { XCTAssertNil(interpreter.delegates) } - func testInitWithDelegate() throws { - let metalDelegate = MetalDelegate() - let interpreter = try Interpreter(modelPath: AddQuantizedModel.path, delegates: [metalDelegate]) - XCTAssertEqual(interpreter.delegates?.count, 1) - XCTAssertNil(interpreter.options) - } - - func testInitWithOptionsAndDelegate() throws { - var options = Interpreter.Options() - options.threadCount = 1 - let metalDelegate = MetalDelegate() - let interpreter = try Interpreter( - modelPath: AddQuantizedModel.path, - options: options, - delegates: [metalDelegate] - ) - XCTAssertNotNil(interpreter.options) - XCTAssertEqual(interpreter.delegates?.count, 1) - } - func testInputTensorCount() { XCTAssertEqual(interpreter.inputTensorCount, AddModel.inputTensorCount) } @@ -268,7 +248,7 @@ class InterpreterOptionsTests: XCTestCase { // MARK: - Constants /// Values for the `add.bin` model. 
-private enum AddModel { +enum AddModel { static let info = (name: "add", extension: "bin") static let inputTensorCount = 1 static let outputTensorCount = 1 @@ -301,7 +281,7 @@ private enum AddModel { } /// Values for the `add_quantized.bin` model. -private enum AddQuantizedModel { +enum AddQuantizedModel { static let info = (name: "add_quantized", extension: "bin") static let inputOutputIndex = 0 static let shape: Tensor.Shape = [2] diff --git a/tensorflow/lite/experimental/swift/Tests/MetalDelegateTests.swift b/tensorflow/lite/experimental/swift/Tests/MetalDelegateTests.swift index 6daa429e2f0..8af43842d7a 100644 --- a/tensorflow/lite/experimental/swift/Tests/MetalDelegateTests.swift +++ b/tensorflow/lite/experimental/swift/Tests/MetalDelegateTests.swift @@ -32,6 +32,26 @@ class MetalDelegateTests: XCTestCase { XCTAssertTrue(delegate.options.allowsPrecisionLoss) XCTAssertEqual(delegate.options.waitType, .active) } + + func testInitInterpreterWithDelegate() throws { + let metalDelegate = MetalDelegate() + let interpreter = try Interpreter(modelPath: AddQuantizedModel.path, delegates: [metalDelegate]) + XCTAssertEqual(interpreter.delegates?.count, 1) + XCTAssertNil(interpreter.options) + } + + func testInitInterpreterWithOptionsAndDelegate() throws { + var options = Interpreter.Options() + options.threadCount = 1 + let metalDelegate = MetalDelegate() + let interpreter = try Interpreter( + modelPath: AddQuantizedModel.path, + options: options, + delegates: [metalDelegate] + ) + XCTAssertNotNil(interpreter.options) + XCTAssertEqual(interpreter.delegates?.count, 1) + } } class MetalDelegateOptionsTests: XCTestCase { From 692e52c10fef03cf02f667eac4d2526416b98597 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 26 May 2020 21:03:33 -0700 Subject: [PATCH 1180/1533] Add overview errors to gviz. Move the RunEnvironmentErrorMessage processing to OpStatsToOverviewPage. This way, overview_page to gviz is a simple datable insert. PiperOrigin-RevId: 313321999 Change-Id: I81444d999ab8133c5986d834da4f6ced0a5e1d01 --- .../profiler/convert/op_stats_to_overview_page.cc | 15 ++++++++++++++- .../profiler/convert/op_stats_to_overview_page.h | 3 +++ tensorflow/core/profiler/utils/errors.cc | 5 +++++ tensorflow/core/profiler/utils/errors.h | 2 ++ 4 files changed, 24 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc index 666463fc0bb..62f37c50155 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -297,7 +297,7 @@ OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, bottleneck.input_classification(), bottleneck.input_statement(), "", hardware_type, TfFunctionRecommendationHtml(op_stats.tf_function_db()), overview_page.mutable_recommendation()); - *overview_page.mutable_errors() = op_stats.errors(); + SetOverviewPageErrorMessage(op_stats, &overview_page); return overview_page; } @@ -314,5 +314,18 @@ void SetRemarks(const OpStats& op_stats, OverviewPageAnalysis* analysis) { } } +void SetOverviewPageErrorMessage(const OpStats& op_stats, + OverviewPage* overview_page) { + *overview_page->mutable_errors() = op_stats.errors(); + absl::c_sort(*overview_page->mutable_errors()); + if (overview_page->errors().empty()) { + // Shows run-environment error only if there is no other existing error. 
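The error-message rule being added in op_stats_to_overview_page.cc is small but easy to misread in diff form: errors are sorted, and the "no device trace collected" message is appended only as a fallback, when nothing else was recorded and the run environment reports a non-CPU device type with no device cores. A self-contained sketch of just that rule, with plain strings in place of the proto types:

    #include <algorithm>
    #include <cstdio>
    #include <string>
    #include <vector>

    void SetErrorMessages(std::vector<std::string>& errors,
                          const std::string& device_type, int device_core_count) {
      std::sort(errors.begin(), errors.end());
      // Fallback error only when no other error exists and no device was seen.
      if (errors.empty() && device_type != "CPU" && device_core_count <= 0) {
        errors.push_back("No device trace was collected.");
      }
    }

    int main() {
      std::vector<std::string> errors;
      SetErrorMessages(errors, "GPU", /*device_core_count=*/0);
      std::printf("%zu error(s): %s\n", errors.size(), errors[0].c_str());
    }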
+ if (op_stats.run_environment().device_type() != "CPU" && + op_stats.run_environment().device_core_count() <= 0) { + *overview_page->add_errors() = std::string(kNoDeviceTraceCollected); + } + } +} + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.h b/tensorflow/core/profiler/convert/op_stats_to_overview_page.h index b4b3991a18d..d4d75c03454 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.h +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.h @@ -48,6 +48,9 @@ OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats); OverviewPageRunEnvironment ComputeRunEnvironment( const RunEnvironment& run_environment); +void SetOverviewPageErrorMessage(const OpStats& op_stats, + OverviewPage* overview_page); + OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, HardwareType hardware_type); diff --git a/tensorflow/core/profiler/utils/errors.cc b/tensorflow/core/profiler/utils/errors.cc index 9c678e98a43..1851c624e5c 100644 --- a/tensorflow/core/profiler/utils/errors.cc +++ b/tensorflow/core/profiler/utils/errors.cc @@ -33,5 +33,10 @@ const absl::string_view kErrorNoStepMarker = " than the step time. For (1), you need to add step instrumentation;" " for (2), you may try to profile longer."; +const absl::string_view kNoDeviceTraceCollected = + "No device trace was collected. This might happen if your job hadn't been " + "run on the device when sampling was turned on. You could try the sampling" + " again later."; + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/utils/errors.h b/tensorflow/core/profiler/utils/errors.h index b213fd05c71..2dcb60e6899 100644 --- a/tensorflow/core/profiler/utils/errors.h +++ b/tensorflow/core/profiler/utils/errors.h @@ -28,6 +28,8 @@ ABSL_CONST_INIT extern const absl::string_view kErrorIncompleteStep; // step info. ABSL_CONST_INIT extern const absl::string_view kErrorNoStepMarker; +ABSL_CONST_INIT extern const absl::string_view kNoDeviceTraceCollected; + } // namespace profiler } // namespace tensorflow From 8f31b06f53b92cdd172587dc3300e23c846d1973 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Tue, 26 May 2020 21:50:49 -0700 Subject: [PATCH 1181/1533] Added generic arguments to abstract int/float uniforms. 
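The idea, stripped of the OpenCL specifics: kernels declare named int/float arguments, the generated source refers to them as "args.<name>", and a resolve pass rewrites each reference to an indexed access into shared packed uniforms while recording which values are actually used. A greatly simplified, self-contained sketch of that mechanism (ArgsSketch and shared_ints are illustrative names, not the real Arguments API):

    #include <cctype>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    class ArgsSketch {
     public:
      void AddInt(const std::string& name, int value) { values_[name] = value; }

      // Rewrites every "args.<name>" in `code` to "shared_ints[<index>]" and
      // packs the referenced value at that index.
      std::string Resolve(std::string code) {
        const std::string prefix = "args.";
        size_t pos = code.find(prefix);
        while (pos != std::string::npos) {
          size_t start = pos + prefix.size();
          size_t end = start;
          while (end < code.size() &&
                 (std::isalnum(static_cast<unsigned char>(code[end])) ||
                  code[end] == '_')) {
            ++end;
          }
          const std::string name = code.substr(start, end - start);
          const int index = Pack(name);
          const std::string replacement =
              "shared_ints[" + std::to_string(index) + "]";
          code.replace(pos, end - pos, replacement);
          pos = code.find(prefix, pos + replacement.size());
        }
        return code;
      }

      const std::vector<int>& packed() const { return packed_; }

     private:
      // Only arguments that appear in the code get a slot, mirroring the
      // "active" flag in the real implementation.
      int Pack(const std::string& name) {
        auto it = offsets_.find(name);
        if (it != offsets_.end()) return it->second;
        const int index = static_cast<int>(packed_.size());
        packed_.push_back(values_[name]);
        offsets_[name] = index;
        return index;
      }

      std::map<std::string, int> values_;
      std::map<std::string, int> offsets_;
      std::vector<int> packed_;
    };

    int main() {
      ArgsSketch args;
      args.AddInt("dst_width", 128);
      args.AddInt("dst_height", 64);
      std::string code =
          "if (X >= args.dst_width || Y >= args.dst_height) return;";
      std::printf("%s\n", args.Resolve(code).c_str());
      // -> if (X >= shared_ints[0] || Y >= shared_ints[1]) return;
    }

In the real code the packed values are additionally grouped into int4/float4 uniforms and bound with clSetKernelArg, and SetInt/SetFloat update both the stored value and its packed copy.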
PiperOrigin-RevId: 313327440 Change-Id: I12c82d0499b3ed9eb4f839cf8016a87bd0ea4807 --- tensorflow/lite/delegates/gpu/cl/BUILD | 14 ++ tensorflow/lite/delegates/gpu/cl/arguments.cc | 173 ++++++++++++++++++ tensorflow/lite/delegates/gpu/cl/arguments.h | 88 +++++++++ tensorflow/lite/delegates/gpu/cl/cl_kernel.h | 1 + .../lite/delegates/gpu/cl/kernels/BUILD | 2 + .../delegates/gpu/cl/kernels/transpose.cc | 72 +++++--- .../lite/delegates/gpu/cl/kernels/transpose.h | 2 + 7 files changed, 326 insertions(+), 26 deletions(-) create mode 100644 tensorflow/lite/delegates/gpu/cl/arguments.cc create mode 100644 tensorflow/lite/delegates/gpu/cl/arguments.h diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 2e686810767..c149479ae4c 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -38,6 +38,20 @@ cc_library( ], ) +cc_library( + name = "arguments", + srcs = ["arguments.cc"], + hdrs = ["arguments.h"], + deps = [ + ":opencl_wrapper", + ":util", + "//tensorflow/lite/delegates/gpu/common:status", + "//tensorflow/lite/delegates/gpu/common:types", + "//tensorflow/lite/delegates/gpu/common:util", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "buffer", srcs = ["buffer.cc"], diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc new file mode 100644 index 00000000000..26d9fc778b3 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -0,0 +1,173 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" + +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" + +namespace tflite { +namespace gpu { +namespace cl { +namespace { +std::string GetNextWord(const std::string& code, size_t first_position) { + size_t pos = first_position; + char t = code[pos]; + while (absl::ascii_isalnum(t) || t == '_') { + pos++; + t = code[pos]; + } + return code.substr(first_position, pos - first_position); +} +} // namespace + +Arguments::Arguments(Arguments&& args) + : int_values_(std::move(args.int_values_)), + shared_int4s_data_(std::move(args.shared_int4s_data_)), + float_values_(std::move(args.float_values_)), + shared_float4s_data_(std::move(args.shared_float4s_data_)) {} +Arguments& Arguments::operator=(Arguments&& args) { + if (this != &args) { + int_values_ = std::move(args.int_values_); + shared_int4s_data_ = std::move(args.shared_int4s_data_); + float_values_ = std::move(args.float_values_); + shared_float4s_data_ = std::move(args.shared_float4s_data_); + } + return *this; +} + +void Arguments::AddFloat(const std::string& name, float value) { + float_values_[name].value = value; +} +void Arguments::AddInt(const std::string& name, int value) { + int_values_[name].value = value; +} + +absl::Status Arguments::SetInt(const std::string& name, int value) { + auto ii = int_values_.find(name); + if (ii == int_values_.end()) { + return absl::NotFoundError(absl::StrCat("No argument with name - ", name)); + } + ii->second.value = value; + if (ii->second.active) { + shared_int4s_data_[ii->second.offset] = value; + } + return absl::OkStatus(); +} + +absl::Status Arguments::SetFloat(const std::string& name, float value) { + auto fi = float_values_.find(name); + if (fi == float_values_.end()) { + return absl::NotFoundError(absl::StrCat("No argument with name - ", name)); + } + fi->second.value = value; + if (fi->second.active) { + shared_float4s_data_[fi->second.offset] = value; + } + return absl::OkStatus(); +} + +std::string Arguments::GetListOfArgs() { + std::string result; + for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) { + absl::StrAppend(&result, ",\n int4 shared_int4_", i); + } + for (int i = 0; i < shared_float4s_data_.size() / 4; ++i) { + absl::StrAppend(&result, ",\n float4 shared_float4_", i); + } + return result; +} + +absl::Status Arguments::Bind(cl_kernel kernel, int offset) { + for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) { + const int error_code = clSetKernelArg(kernel, offset, sizeof(int32_t) * 4, + &shared_int4s_data_[i * 4]); + if (error_code != CL_SUCCESS) { + return absl::UnknownError(absl::StrCat( + "Failed to set kernel arguments - ", CLErrorCodeToString(error_code), + "(at index - ", offset, ")")); + } + offset++; + } + for (int i = 0; i < shared_float4s_data_.size() / 4; ++i) { + const int error_code = clSetKernelArg(kernel, offset, sizeof(int32_t) * 4, + &shared_float4s_data_[i * 4]); + if (error_code != CL_SUCCESS) { + return absl::UnknownError(absl::StrCat( + "Failed to set kernel arguments - ", CLErrorCodeToString(error_code), + "(at index - ", offset, ")")); + } + offset++; + } + return absl::OkStatus(); +} + +std::string Arguments::AddActiveArgument(const std::string& arg_name) { + if (auto it = int_values_.find(arg_name); it != int_values_.end()) { + int int_index; + if (it->second.active) { + int_index = it->second.offset; + } else { + 
it->second.active = true; + it->second.offset = shared_int4s_data_.size(); + int_index = it->second.offset; + shared_int4s_data_.push_back(it->second.value); + } + std::string index = std::to_string(int_index / 4); + std::string postfixes[4] = {"x", "y", "z", "w"}; + return "shared_int4_" + index + "." + postfixes[int_index % 4]; + } + if (auto it = float_values_.find(arg_name); it != float_values_.end()) { + int float_index; + if (it->second.active) { + float_index = it->second.offset; + } else { + it->second.active = true; + it->second.offset = shared_float4s_data_.size(); + float_index = it->second.offset; + shared_float4s_data_.push_back(it->second.value); + } + std::string index = std::to_string(float_index / 4); + std::string postfixes[4] = {"x", "y", "z", "w"}; + return "shared_float4_" + index + "." + postfixes[float_index % 4]; + } + return arg_name; +} + +void Arguments::ResolveArgsPass(std::string* code) { + std::string result; + constexpr char kPrefix[] = "args."; + size_t position = 0; + size_t next_position = code->find(kPrefix); + while (next_position != std::string::npos) { + size_t arg_pos = next_position; + next_position += strlen(kPrefix); + std::string object_name = GetNextWord(*code, next_position); + std::string new_name = AddActiveArgument(object_name); + code->replace(arg_pos, object_name.size() + strlen(kPrefix), new_name); + position = arg_pos + new_name.size(); + next_position = code->find(kPrefix, position); + } + + int shared_int4s_aligned_size = AlignByN(shared_int4s_data_.size(), 4); + shared_int4s_data_.resize(shared_int4s_aligned_size); + int shared_float4s_aligned_size = AlignByN(shared_float4s_data_.size(), 4); + shared_float4s_data_.resize(shared_float4s_aligned_size); +} + +} // namespace cl +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.h b/tensorflow/lite/delegates/gpu/cl/arguments.h new file mode 100644 index 00000000000..274532d0199 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/arguments.h @@ -0,0 +1,88 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
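One detail of ResolveArgsPass that is easy to gloss over: the packed scalars are padded up to a multiple of four because they are bound to the kernel as int4/float4 uniforms. The arithmetic, as a standalone check (AlignByN here is a local re-implementation for illustration, not the TFLite helper):

    #include <cstdio>

    int AlignByN(int value, int n) { return ((value + n - 1) / n) * n; }

    int main() {
      const int active_ints = 6;                    // six active int arguments
      const int padded = AlignByN(active_ints, 4);  // 8 -> two int4 uniforms
      std::printf("%d scalars -> %d int4 uniform(s)\n", active_ints, padded / 4);
    }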
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_ARGUMENTS_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_CL_ARGUMENTS_H_ + +#include +#include +#include + +#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" +#include "tensorflow/lite/delegates/gpu/cl/util.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/types.h" +#include "tensorflow/lite/delegates/gpu/common/util.h" + +namespace tflite { +namespace gpu { +namespace cl { + +class Arguments { + public: + Arguments() = default; + void AddFloat(const std::string& name, float value = 0.0f); + void AddInt(const std::string& name, int value = 0); + + absl::Status SetInt(const std::string& name, int value); + absl::Status SetFloat(const std::string& name, float value); + + std::string GetListOfArgs(); + + absl::Status Bind(cl_kernel kernel, int offset); + + void ResolveArgsPass(std::string* code); + + // Move only + Arguments(Arguments&& args); + Arguments& operator=(Arguments&& args); + Arguments(const Arguments&) = delete; + Arguments& operator=(const Arguments&) = delete; + + private: + std::string AddActiveArgument(const std::string& arg_name); + + struct IntValue { + int value; + + // many uniforms generated automatically and not used + // to reduce amount of data transferred we adding this optimization + bool active = false; + + // offset to shared uniform storage. + uint32_t offset = -1; + }; + std::map int_values_; + std::vector shared_int4s_data_; + + struct FloatValue { + float value; + + // many uniforms generated automatically and not used + // to reduce amount of data transferred we adding this optimization + bool active = false; + + // offset to shared uniform storage. + uint32_t offset = -1; + }; + std::map float_values_; + std::vector shared_float4s_data_; +}; + +} // namespace cl +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_ARGUMENTS_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/cl_kernel.h b/tensorflow/lite/delegates/gpu/cl/cl_kernel.h index b575684d2b4..be9dc6dbf03 100644 --- a/tensorflow/lite/delegates/gpu/cl/cl_kernel.h +++ b/tensorflow/lite/delegates/gpu/cl/cl_kernel.h @@ -65,6 +65,7 @@ class CLKernel { int GetPrivateMemorySize() const { return private_memory_size_; } int GetMaxWorkGroupSize() const { return max_work_group_size_; } + int GetBindingCounter() const { return binding_counter_; } void ResetBindingCounter() { binding_counter_ = 0; } // Do not use this function diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index ff6f06eeb68..b5510b3e8df 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -1290,8 +1290,10 @@ cc_library( ":gpu_operation", ":util", ":work_group_picking", + "//tensorflow/lite/delegates/gpu/cl:arguments", "//tensorflow/lite/delegates/gpu/common:operations", "//tensorflow/lite/delegates/gpu/common:types", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc index 66a272fa2da..fc3efe32c3b 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc @@ -17,6 +17,8 @@ limitations under the License. 
#include +#include "absl/strings/substitute.h" +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" @@ -27,37 +29,45 @@ namespace { std::string GetTransposeCode( const OperationDef& op_def, const TransposeAttributes& attr, - const std::vector& linked_operations) { - TensorCodeGenerator src_tensor( - "src_data", - WHSBPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, - op_def.src_tensors[0]); - TensorCodeGenerator dst_tensor( - "dst_data", - WHSBPoint{"dst_size.x", "dst_size.y", "dst_size.z", "dst_size.w"}, - op_def.dst_tensors[0]); + const std::vector& linked_operations, + Arguments* args) { + TensorCodeGenerator src_tensor("src_data", + WHSBPoint{"args.src_width", "args.src_height", + "args.src_slices", "args.src_batch"}, + op_def.src_tensors[0]); + TensorCodeGenerator dst_tensor("dst_data", + WHSBPoint{"args.dst_width", "args.dst_height", + "args.dst_slices", "args.dst_batch"}, + op_def.dst_tensors[0]); + + args->AddInt("src_width"); + args->AddInt("src_height"); + args->AddInt("src_slices"); + args->AddInt("src_batch"); + args->AddInt("dst_width"); + args->AddInt("dst_height"); + args->AddInt("dst_slices"); + args->AddInt("dst_batch"); + args->AddInt("dst_channels"); const std::string batch_id = op_def.IsBatchSupported() ? "B" : ""; std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; c += src_tensor.GetDeclaration(AccessType::READ); c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; - c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int src_channels, \n"; - c += " int dst_channels \n"; - c += ") {\n"; + c += dst_tensor.GetDeclaration(AccessType::WRITE); + c += "$0) {\n"; if (op_def.IsBatchSupported()) { c += " int linear_id = get_global_id(0);\n"; - c += " int X = linear_id / dst_size.w;\n"; - c += " int B = linear_id % dst_size.w;\n"; + c += " int X = linear_id / args.dst_batch;\n"; + c += " int B = linear_id % args.dst_batch;\n"; } else { c += " int X = get_global_id(0);\n"; } c += " int Y = get_global_id(1);\n"; c += " int Z = get_global_id(2);\n"; - c += " if (X >= dst_size.x || Y >= dst_size.y || Z >= dst_size.z) { \n"; + c += " if (X >= args.dst_width || Y >= args.dst_height || Z >= " + "args.dst_slices) { \n"; c += " return; \n"; c += " } \n"; c += " FLT temps[4];\n"; @@ -83,7 +93,7 @@ std::string GetTransposeCode( } else { c += " for (int i = 0; i < 4; ++i) {\n"; c += " int dst_channel = Z * 4 + i;\n"; - c += " if (dst_channel < dst_channels) {;\n"; + c += " if (dst_channel < args.dst_channels) {;\n"; const std::string bhwc[] = {"B", "Y", "X", "dst_channel"}; std::string src_b = op_def.IsBatchSupported() ? bhwc[remap[0]] : ""; c += " int s_y = " + bhwc[remap[1]] + ";\n"; @@ -100,24 +110,27 @@ std::string GetTransposeCode( } c += " FLT4 result = (FLT4)(temps[0], temps[1], temps[2], temps[3]);\n"; std::string x_3dcoord = - op_def.IsBatchSupported() ? "X * dst_size.w + B" : "X"; + op_def.IsBatchSupported() ? 
"X * args.dst_batch + B" : "X"; const LinkingContext context{"result", x_3dcoord, "Y", "Z"}; c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("result", "X", "Y", "Z", batch_id); c += "}\n"; - return c; + args->ResolveArgsPass(&c); + return absl::Substitute(c, args->GetListOfArgs()); } } // namespace Transpose::Transpose(Transpose&& operation) : GPUOperation(std::move(operation)), attr_(operation.attr_), + args_(std::move(operation.args_)), kernel_(std::move(operation.kernel_)), work_group_size_(operation.work_group_size_) {} Transpose& Transpose::operator=(Transpose&& operation) { if (this != &operation) { attr_ = operation.attr_; + args_ = std::move(operation.args_); kernel_ = std::move(operation.kernel_); std::swap(work_group_size_, operation.work_group_size_); GPUOperation::operator=(std::move(operation)); @@ -126,21 +139,28 @@ Transpose& Transpose::operator=(Transpose&& operation) { } absl::Status Transpose::Compile(const CreationContext& creation_context) { - const auto code = GetTransposeCode(definition_, attr_, linked_operations_); + const auto code = + GetTransposeCode(definition_, attr_, linked_operations_, &args_); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); } absl::Status Transpose::BindArguments() { + RETURN_IF_ERROR(args_.SetInt("src_width", src_[0]->Width())); + RETURN_IF_ERROR(args_.SetInt("src_height", src_[0]->Height())); + RETURN_IF_ERROR(args_.SetInt("src_slices", src_[0]->Slices())); + RETURN_IF_ERROR(args_.SetInt("src_batch", src_[0]->Batch())); + RETURN_IF_ERROR(args_.SetInt("dst_width", dst_[0]->Width())); + RETURN_IF_ERROR(args_.SetInt("dst_height", dst_[0]->Height())); + RETURN_IF_ERROR(args_.SetInt("dst_slices", dst_[0]->Slices())); + RETURN_IF_ERROR(args_.SetInt("dst_batch", dst_[0]->Batch())); + RETURN_IF_ERROR(args_.SetInt("dst_channels", dst_[0]->Channels())); kernel_.ResetBindingCounter(); RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->Channels())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->Channels())); + RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter())); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h index 61038b1e0ca..13f06281012 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_TRANSPOSE_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_TRANSPOSE_H_ +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" #include "tensorflow/lite/delegates/gpu/common/types.h" @@ -43,6 +44,7 @@ class Transpose : public GPUOperation { int3 GetGridSize() const; TransposeAttributes attr_; + Arguments args_; CLKernel kernel_; int3 work_group_size_; }; From 94be4e6db5982e116eea0e1a33b257586b99dcab Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 27 May 2020 12:58:59 +0800 Subject: [PATCH 1182/1533] add @Deprecated to setAllowFp16PrecisionForFp32() --- .../lite/java/src/main/java/org/tensorflow/lite/Interpreter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index 0faf3c008f4..7b220d348cf 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -106,6 +106,7 @@ public final class Interpreter implements AutoCloseable { * {@link org.tensorflow.lite.nnapi.NnApiDelegate.Options#setAllowFp16(boolean enable)}. * */ + @Deprecated public Options setAllowFp16PrecisionForFp32(boolean allow) { this.allowFp16PrecisionForFp32 = allow; return this; From ca47cbd37c8f9483c1fbb1713f4a539230a3a7cb Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Tue, 26 May 2020 22:36:44 -0700 Subject: [PATCH 1183/1533] Migrate int8 quantized add to reuse BinaryBroadcastFiveFold func. PiperOrigin-RevId: 313331967 Change-Id: I122ff676bfc49a023bdfd95a555e58f4709d800e --- .../internal/optimized/integer_ops/add.h | 101 +----------------- 1 file changed, 4 insertions(+), 97 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index 95b78b3a6b3..44479d93a31 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h" #include "tensorflow/lite/kernels/internal/types.h" @@ -275,101 +276,6 @@ inline void Add(const ArithmeticParams& params, AddElementwise(flat_size, params, input1_data, input2_data, output_data); } -inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, - const RuntimeShape& unswitched_input1_shape, - const int8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const int8* unswitched_input2_data, - const RuntimeShape& output_shape, - int8* output_data) { - ruy::profiler::ScopeLabel label("BroadcastAddFivefoldInt8/8bit"); - - ArithmeticParams switched_params = unswitched_params; - switched_params.input1_offset = unswitched_params.input2_offset; - switched_params.input1_multiplier = unswitched_params.input2_multiplier; - switched_params.input1_shift = unswitched_params.input2_shift; - switched_params.input2_offset = unswitched_params.input1_offset; - switched_params.input2_multiplier = unswitched_params.input1_multiplier; - switched_params.input2_shift = unswitched_params.input1_shift; - - const bool use_unswitched = - unswitched_params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const ArithmeticParams& params = - use_unswitched ? unswitched_params : switched_params; - const int8* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const int8* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise add of - // sections of the arrays. - int8* output_data_ptr = output_data; - const int8* input1_data_ptr = input1_data; - const int8* input2_data_reset = input2_data; - // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared - // between input shapes. y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for input 2. - // Put another way, - // input1.shape.FlatSize = y0 * y1 * y2 * y4, - // input2.shape.FlatSize = y0 * y2 * y3 * y4. - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner - // dimension. - for (int i0 = 0; i0 < y0; ++i0) { - const int8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - AddElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - // We have broadcast y4 of input1 data y3 times, and now move on. - input1_data_ptr += y4; - } - } - // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. 
- input2_data_reset = input2_data_ptr; - } - } else { - // Special case of y4 == 1, in which the innermost loop is a single element - // and can be combined with the next (y3) as an inner broadcast. - // - // Note that this handles the case of pure scalar broadcast when - // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar - // broadcast with batch (as y2 > 1). - // - // NOTE The process is the same as the above general case except simplified - // for y4 == 1 and the loop over y3 is contained within the - // AddScalarBroadcast function. - for (int i0 = 0; i0 < y0; ++i0) { - const int8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y3; - output_data_ptr += y3; - input1_data_ptr += 1; - } - } - input2_data_reset = input2_data_ptr; - } - } -} - inline void BroadcastAddDispatch(const ArithmeticParams& params, const RuntimeShape& input1_shape, const int8* input1_data, @@ -383,8 +289,9 @@ inline void BroadcastAddDispatch(const ArithmeticParams& params, output_shape, output_data); } - BroadcastAddFivefold(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + optimized_ops::BinaryBroadcastFiveFold( + params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, AddElementwise, AddScalarBroadcast); } } // namespace optimized_integer_ops From a1b64bb516f8eb089d53e3ceb216d1826b8e9ecd Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Tue, 26 May 2020 22:59:59 -0700 Subject: [PATCH 1184/1533] Check PAD tensor shape in IsSupported() phase PiperOrigin-RevId: 313333989 Change-Id: I5a47cfaf2f5aedca919d737274e2d94c1b5825ce --- .../lite/delegates/gpu/common/model_builder.cc | 12 ++++++++++++ tensorflow/lite/kernels/kernel_util.h | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 64b335f10a5..daedc277869 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -1348,6 +1348,17 @@ class PadOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, /*outputs=*/1)); RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); + auto pad_tensor = tflite::GetInput(context, tflite_node, 1); + if (pad_tensor->dims->size != 2) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid paddings tensor dimension: expected 2 dim, got ", + pad_tensor->dims->size, " dim")); + } + if (pad_tensor->dims->data[0] != 4 || pad_tensor->dims->data[1] != 2) { + return absl::InvalidArgumentError(absl::StrCat( + "Invalid paddings tensor shape: expected 4x2, got ", + pad_tensor->dims->data[0], "x", pad_tensor->dims->data[1])); + } return absl::OkStatus(); } @@ -1371,6 +1382,7 @@ class PadOperationParser : public TFLiteOperationParser { // 4x2 tensor with paddings. if (paddings.shape.h != 4 || paddings.shape.w != 2) { + // It shouldn't fail here since it's checked at IsSupported(). 
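The shape check added to PadOperationParser::IsSupported in the next patch reduces to one predicate: the paddings tensor must be a 2-D, 4x2 matrix (one (before, after) pair per dimension of a 4-D tensor). Stated as a standalone function with a plain dims vector instead of TfLiteIntArray:

    #include <cstdio>
    #include <vector>

    bool PaddingsShapeIsSupported(const std::vector<int>& dims) {
      return dims.size() == 2 && dims[0] == 4 && dims[1] == 2;
    }

    int main() {
      std::printf("%d\n", PaddingsShapeIsSupported({4, 2}));  // 1: accepted
      std::printf("%d\n", PaddingsShapeIsSupported({3, 2}));  // 0: rejected early
    }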
return absl::InvalidArgumentError( "Paddings tensor has unexpected shape."); } diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index 5793b08616d..d6a2dac8583 100644 --- a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -28,7 +28,7 @@ inline int NumDimensions(const TfLiteTensor* t) { return t->dims->size; } inline int SizeOfDimension(const TfLiteTensor* t, int dim) { return t->dims->data[dim]; } -inline const TfLiteTensor* GetInput(TfLiteContext* context, +inline const TfLiteTensor* GetInput(const TfLiteContext* context, const TfLiteNode* node, int index) { return &context ->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; From 65264bfc9842f573a2627249fc7018c85e5c6583 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 26 May 2020 23:06:35 -0700 Subject: [PATCH 1185/1533] Format generated CUDA stub files. PiperOrigin-RevId: 313334624 Change-Id: Ie7fc91e0c59754b9d9a7db686f3876577c8164e7 --- .../stream_executor/cuda/cublas_10_2.inc | 5305 +++++++-------- .../stream_executor/cuda/cublas_9_0.inc | 5956 ++++++++--------- tensorflow/stream_executor/cuda/cudnn_6_0.inc | 2310 +++---- tensorflow/stream_executor/cuda/cudnn_7_0.inc | 2507 +++---- tensorflow/stream_executor/cuda/cudnn_7_1.inc | 2916 ++++---- tensorflow/stream_executor/cuda/cudnn_7_3.inc | 3239 ++++----- tensorflow/stream_executor/cuda/cudnn_7_4.inc | 3443 +++++----- tensorflow/stream_executor/cuda/cudnn_7_6.inc | 4107 ++++++------ .../stream_executor/cuda/cusparse_10_1.inc | 2 +- .../stream_executor/cuda/cusparse_10_2.inc | 2 +- 10 files changed, 14875 insertions(+), 14912 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cublas_10_2.inc b/tensorflow/stream_executor/cuda/cublas_10_2.inc index 42c4e5fef3b..067ba675288 100644 --- a/tensorflow/stream_executor/cuda/cublas_10_2.inc +++ b/tensorflow/stream_executor/cuda/cublas_10_2.inc @@ -2,29 +2,31 @@ extern "C" { -cublasStatus_t CUBLASWINAPI cublasCreate_v2 (cublasHandle_t *handle) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t *); +cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t *); static auto func_ptr = LoadSymbol("cublasCreate_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cublasStatus_t CUBLASWINAPI cublasDestroy_v2 (cublasHandle_t handle) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t); +cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t); static auto func_ptr = LoadSymbol("cublasDestroy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int *version) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int *); +cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, + int *version) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int *); static auto func_ptr = LoadSymbol("cublasGetVersion_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, version); } -cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int *value) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(libraryPropertyType, int *); +cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI 
*)(libraryPropertyType, int *); static auto func_ptr = LoadSymbol("cublasGetProperty"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(type, value); @@ -37,57 +39,71 @@ size_t CUBLASWINAPI cublasGetCudartVersion(void) { return func_ptr(); } -cublasStatus_t CUBLASWINAPI cublasSetStream_v2 (cublasHandle_t handle, cudaStream_t streamId) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t); static auto func_ptr = LoadSymbol("cublasSetStream_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cublasStatus_t CUBLASWINAPI cublasGetStream_v2 (cublasHandle_t handle, cudaStream_t *streamId) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *); +cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *); static auto func_ptr = LoadSymbol("cublasGetStream_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t *mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *); +cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, + cublasPointerMode_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *); static auto func_ptr = LoadSymbol("cublasGetPointerMode_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t); +cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, + cublasPointerMode_t mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t); static auto func_ptr = LoadSymbol("cublasSetPointerMode_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t *mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *); +cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, + cublasAtomicsMode_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *); static auto func_ptr = LoadSymbol("cublasGetAtomicsMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t); +cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, + cublasAtomicsMode_t mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t); static auto func_ptr = LoadSymbol("cublasSetAtomicsMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *); 
+cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, + cublasMath_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *); static auto func_ptr = LoadSymbol("cublasGetMathMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t); +cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, + cublasMath_t mode) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t); static auto func_ptr = LoadSymbol("cublasSetMathMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); @@ -118,399 +134,384 @@ cublasGetLoggerCallback(cublasLogCallback *userCallback) { return func_ptr(userCallback); } -cublasStatus_t CUBLASWINAPI cublasSetVector (int n, int elemSize, const void *x, - int incx, void *devicePtr, int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int); +cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void *x, + int incx, void *devicePtr, + int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int); static auto func_ptr = LoadSymbol("cublasSetVector"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, x, incx, devicePtr, incy); } -cublasStatus_t CUBLASWINAPI cublasGetVector (int n, int elemSize, const void *x, - int incx, void *y, int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int); +cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void *x, + int incx, void *y, int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int); static auto func_ptr = LoadSymbol("cublasGetVector"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSetMatrix (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int); +cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *, + int, void *, int); static auto func_ptr = LoadSymbol("cublasSetMatrix"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rows, cols, elemSize, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasGetMatrix (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int); +cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *, + int, void *, int); static auto func_ptr = LoadSymbol("cublasGetMatrix"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rows, cols, elemSize, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasSetVectorAsync (int n, int elemSize, - const void *hostPtr, int incx, - void *devicePtr, int incy, - cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, 
cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(int n, int elemSize, + const void *hostPtr, int incx, + void *devicePtr, int incy, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, + void *, int, cudaStream_t); static auto func_ptr = LoadSymbol("cublasSetVectorAsync"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream); } -cublasStatus_t CUBLASWINAPI cublasGetVectorAsync (int n, int elemSize, - const void *devicePtr, int incx, - void *hostPtr, int incy, - cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(int n, int elemSize, + const void *devicePtr, + int incx, void *hostPtr, + int incy, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, + void *, int, cudaStream_t); static auto func_ptr = LoadSymbol("cublasGetVectorAsync"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream); } -cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb, cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync(int rows, int cols, + int elemSize, const void *A, + int lda, void *B, int ldb, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + int, int, int, const void *, int, void *, int, cudaStream_t); static auto func_ptr = LoadSymbol("cublasSetMatrixAsync"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream); } -cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb, cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync(int rows, int cols, + int elemSize, const void *A, + int lda, void *B, int ldb, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + int, int, int, const void *, int, void *, int, cudaStream_t); static auto func_ptr = LoadSymbol("cublasGetMatrixAsync"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream); } -void CUBLASWINAPI cublasXerbla (const char *srName, int info) { - using FuncPtr = void (CUBLASWINAPI *)(const char *, int); +void CUBLASWINAPI cublasXerbla(const char *srName, int info) { + using FuncPtr = void(CUBLASWINAPI *)(const char *, int); static auto func_ptr = LoadSymbol("cublasXerbla"); if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla"); return func_ptr(srName, info); } -cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int incx, - void *result, - cudaDataType resultType, - cudaDataType executionType) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); +cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, void *result, + cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI 
*)( + cublasHandle_t, int, const void *, cudaDataType, int, void *, + cudaDataType, cudaDataType); static auto func_ptr = LoadSymbol("cublasNrm2Ex"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, xType, incx, result, resultType, executionType); } -cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, int n, + const float *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSnrm2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, int n, + const double *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDnrm2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *); +cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, float *); static auto func_ptr = LoadSymbol("cublasScnrm2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, double *); static auto func_ptr = LoadSymbol("cublasDznrm2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDotEx (cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int incx, - const void *y, - cudaDataType yType, - int incy, - void *result, - cudaDataType resultType, - cudaDataType executionType) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); +cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, const void *y, + cudaDataType yType, int incy, + void *result, cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, const void *, + cudaDataType, int, void 
*, cudaDataType, cudaDataType); static auto func_ptr = LoadSymbol("cublasDotEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, + executionType); } -cublasStatus_t CUBLASWINAPI cublasDotcEx (cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int incx, - const void *y, - cudaDataType yType, - int incy, - void *result, - cudaDataType resultType, - cudaDataType executionType) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); +cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, const void *y, + cudaDataType yType, int incy, + void *result, cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, const void *, + cudaDataType, int, void *, cudaDataType, cudaDataType); static auto func_ptr = LoadSymbol("cublasDotcEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, + executionType); } -cublasStatus_t CUBLASWINAPI cublasSdot_v2 (cublasHandle_t handle, - int n, - const float *x, - int incx, - const float *y, - int incy, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, int n, + const float *x, int incx, + const float *y, int incy, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, int, const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSdot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasDdot_v2 (cublasHandle_t handle, - int n, - const double *x, - int incx, - const double *y, - int incy, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, int n, + const double *x, int incx, + const double *y, int incy, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, int, const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDdot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasCdotu_v2 (cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + const cuComplex *y, int incy, + cuComplex *result) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + int, const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasCdotu_v2"); if (!func_ptr) 
return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasCdotc_v2 (cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + const cuComplex *y, int incy, + cuComplex *result) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + int, const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasCdotc_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasZdotu_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZdotu_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasZdotc_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZdotc_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, - int n, - const void *alpha, /* host or device pointer */ - cudaDataType alphaType, - void *x, - cudaDataType xType, - int incx, - cudaDataType executionType) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType, int, cudaDataType); +cublasStatus_t CUBLASWINAPI +cublasScalEx(cublasHandle_t handle, int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, void *x, cudaDataType xType, int incx, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType, + int, cudaDataType); static auto func_ptr = LoadSymbol("cublasScalEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType); } -cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - float *x, - 
int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSscal_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDscal_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, - int n, - const cuComplex *alpha, /* host or device pointer */ - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCscal_v2(cublasHandle_t handle, int n, + const cuComplex *alpha, /* host or device pointer */ + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCsscal_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZscal_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, cuDoubleComplex *, int); 
+cublasStatus_t CUBLASWINAPI +cublasZdscal_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasAxpyEx (cublasHandle_t handle, - int n, - const void *alpha, /* host or device pointer */ - cudaDataType alphaType, - const void *x, - cudaDataType xType, - int incx, - void *y, - cudaDataType yType, - int incy, - cudaDataType executiontype) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, const void *, cudaDataType, int, void *, cudaDataType, int, cudaDataType); +cublasStatus_t CUBLASWINAPI cublasAxpyEx( + cublasHandle_t handle, int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, const void *x, cudaDataType xType, int incx, + void *y, cudaDataType yType, int incy, cudaDataType executiontype) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, const void *, + cudaDataType, int, void *, cudaDataType, int, cudaDataType); static auto func_ptr = LoadSymbol("cublasAxpyEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy, executiontype); + return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy, + executiontype); } -cublasStatus_t CUBLASWINAPI cublasSaxpy_v2 (cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSaxpy_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSaxpy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDaxpy_v2 (cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDaxpy_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDaxpy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCaxpy_v2 (cublasHandle_t handle, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCaxpy_v2(cublasHandle_t handle, int n, + 
const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *y, int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCaxpy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZaxpy_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZaxpy_v2( + cublasHandle_t handle, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZaxpy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx, y, incy); @@ -528,97 +529,82 @@ cublasStatus_t CUBLASWINAPI cublasCopyEx(cublasHandle_t handle, int n, return func_ptr(handle, n, x, xType, incx, y, yType, incy); } -cublasStatus_t CUBLASWINAPI cublasScopy_v2 (cublasHandle_t handle, - int n, - const float *x, - int incx, - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasScopy_v2(cublasHandle_t handle, int n, + const float *x, int incx, float *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasScopy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDcopy_v2 (cublasHandle_t handle, - int n, - const double *x, - int incx, - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDcopy_v2(cublasHandle_t handle, int n, + const double *x, int incx, double *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDcopy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCcopy_v2 (cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCcopy_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCcopy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZcopy_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI 
*)(cublasHandle_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZcopy_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZcopy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSswap_v2 (cublasHandle_t handle, - int n, - float *x, - int incx, - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasSswap_v2(cublasHandle_t handle, int n, + float *x, int incx, float *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, + int, float *, int); static auto func_ptr = LoadSymbol("cublasSswap_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDswap_v2 (cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDswap_v2(cublasHandle_t handle, int n, + double *x, int incx, double *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, double *, + int, double *, int); static auto func_ptr = LoadSymbol("cublasDswap_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCswap_v2 (cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCswap_v2(cublasHandle_t handle, int n, + cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCswap_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZswap_v2 (cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZswap_v2(cublasHandle_t handle, int n, + cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZswap_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); @@ -635,45 +621,41 @@ cublasStatus_t CUBLASWINAPI cublasSwapEx(cublasHandle_t handle, int n, void *x, return func_ptr(handle, n, x, xType, incx, y, yType, incy); } -cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, int n, + const float *x, int incx, + int *result) { + using FuncPtr = 
cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, int *); static auto func_ptr = LoadSymbol("cublasIsamax_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, int n, + const double *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, int *); static auto func_ptr = LoadSymbol("cublasIdamax_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const cuComplex *, int, int *); static auto func_ptr = LoadSymbol("cublasIcamax_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, int *); static auto func_ptr = LoadSymbol("cublasIzamax_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); @@ -690,45 +672,41 @@ cublasStatus_t CUBLASWINAPI cublasIamaxEx( return func_ptr(handle, n, x, xType, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, int n, + const float *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, int *); static auto func_ptr = LoadSymbol("cublasIsamin_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, int n, + const double *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, int *); static auto func_ptr = LoadSymbol("cublasIdamin_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI 
*)(cublasHandle_t, int, const cuComplex *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const cuComplex *, int, int *); static auto func_ptr = LoadSymbol("cublasIcamin_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, int *); static auto func_ptr = LoadSymbol("cublasIzamin_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); @@ -757,129 +735,113 @@ cublasStatus_t CUBLASWINAPI cublasAsumEx( return func_ptr(handle, n, x, xType, incx, result, resultType, executiontype); } -cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, int n, + const float *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSasum_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, int n, + const double *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDasum_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *); +cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, float *); static auto func_ptr = LoadSymbol("cublasScasum_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, double *); static auto func_ptr = 
LoadSymbol("cublasDzasum_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasSrot_v2 (cublasHandle_t handle, - int n, - float *x, - int incx, - float *y, - int incy, - const float *c, /* host or device pointer */ - const float *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *, const float *); +cublasStatus_t CUBLASWINAPI +cublasSrot_v2(cublasHandle_t handle, int n, float *x, int incx, float *y, + int incy, const float *c, /* host or device pointer */ + const float *s) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, + int, const float *, const float *); static auto func_ptr = LoadSymbol("cublasSrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasDrot_v2 (cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy, - const double *c, /* host or device pointer */ - const double *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *, const double *); +cublasStatus_t CUBLASWINAPI +cublasDrot_v2(cublasHandle_t handle, int n, double *x, int incx, double *y, + int incy, const double *c, /* host or device pointer */ + const double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, double *, int, double *, int, const double *, + const double *); static auto func_ptr = LoadSymbol("cublasDrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasCrot_v2 (cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy, - const float *c, /* host or device pointer */ - const cuComplex *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const cuComplex *); +cublasStatus_t CUBLASWINAPI cublasCrot_v2( + cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y, + int incy, const float *c, /* host or device pointer */ + const cuComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, + const cuComplex *); static auto func_ptr = LoadSymbol("cublasCrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy, - const float *c, /* host or device pointer */ - const float *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const float *); +cublasStatus_t CUBLASWINAPI cublasCsrot_v2( + cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y, + int incy, const float *c, /* host or device pointer */ + const float *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, + const float *); static auto func_ptr = LoadSymbol("cublasCsrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasZrot_v2 (cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy, - const double *c, /* 
host or device pointer */ - const cuDoubleComplex *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZrot_v2( + cublasHandle_t handle, int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */ + const cuDoubleComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, + const double *, const cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy, - const double *c, /* host or device pointer */ - const double *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const double *); +cublasStatus_t CUBLASWINAPI cublasZdrot_v2( + cublasHandle_t handle, int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */ + const double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, + const double *, const double *); static auto func_ptr = LoadSymbol("cublasZdrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); @@ -899,45 +861,50 @@ cublasRotEx(cublasHandle_t handle, int n, void *x, cudaDataType xType, int incx, executiontype); } -cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, - float *a, /* host or device pointer */ - float *b, /* host or device pointer */ - float *c, /* host or device pointer */ - float *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, float *); +cublasStatus_t CUBLASWINAPI +cublasSrotg_v2(cublasHandle_t handle, float *a, /* host or device pointer */ + float *b, /* host or device pointer */ + float *c, /* host or device pointer */ + float *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, float *, + float *, float *, float *); static auto func_ptr = LoadSymbol("cublasSrotg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, a, b, c, s); } -cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, - double *a, /* host or device pointer */ - double *b, /* host or device pointer */ - double *c, /* host or device pointer */ - double *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, double *); +cublasStatus_t CUBLASWINAPI +cublasDrotg_v2(cublasHandle_t handle, double *a, /* host or device pointer */ + double *b, /* host or device pointer */ + double *c, /* host or device pointer */ + double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, double *, + double *, double *, double *); static auto func_ptr = LoadSymbol("cublasDrotg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, a, b, c, s); } -cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle, - cuComplex *a, /* host or device pointer */ - cuComplex *b, /* host or device pointer */ - float *c, /* host or device pointer */ - cuComplex *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI 
*)(cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *); +cublasStatus_t CUBLASWINAPI +cublasCrotg_v2(cublasHandle_t handle, cuComplex *a, /* host or device pointer */ + cuComplex *b, /* host or device pointer */ + float *c, /* host or device pointer */ + cuComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *); static auto func_ptr = LoadSymbol("cublasCrotg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, a, b, c, s); } -cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, - cuDoubleComplex *a, /* host or device pointer */ - cuDoubleComplex *b, /* host or device pointer */ - double *c, /* host or device pointer */ - cuDoubleComplex *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZrotg_v2( + cublasHandle_t handle, cuDoubleComplex *a, /* host or device pointer */ + cuDoubleComplex *b, /* host or device pointer */ + double *c, /* host or device pointer */ + cuDoubleComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *, + cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZrotg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, a, b, c, s); @@ -959,27 +926,21 @@ cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle, return func_ptr(handle, a, b, abType, c, s, csType, executiontype); } -cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, - int n, - float *x, - int incx, - float *y, - int incy, - const float* param) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *); +cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, int n, + float *x, int incx, float *y, + int incy, const float *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, float *, int, float *, int, const float *); static auto func_ptr = LoadSymbol("cublasSrotm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, param); } -cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy, - const double* param) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *); +cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, int n, + double *x, int incx, double *y, + int incy, const double *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, double *, int, double *, int, const double *); static auto func_ptr = LoadSymbol("cublasDrotm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, param); @@ -999,25 +960,27 @@ cublasRotmEx(cublasHandle_t handle, int n, void *x, cudaDataType xType, executiontype); } -cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, - float *d1, /* host or device pointer */ - float *d2, /* host or device pointer */ - float *x1, /* host or device pointer */ - const float *y1, /* host or device pointer */ - float *param) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, const float *, float *); +cublasStatus_t CUBLASWINAPI +cublasSrotmg_v2(cublasHandle_t handle, float *d1, /* host or device pointer */ + float *d2, /* 
host or device pointer */ + float *x1, /* host or device pointer */ + const float *y1, /* host or device pointer */ + float *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, float *, float *, float *, const float *, float *); static auto func_ptr = LoadSymbol("cublasSrotmg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, d1, d2, x1, y1, param); } -cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, - double *d1, /* host or device pointer */ - double *d2, /* host or device pointer */ - double *x1, /* host or device pointer */ - const double *y1, /* host or device pointer */ - double *param) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, const double *, double *); +cublasStatus_t CUBLASWINAPI +cublasDrotmg_v2(cublasHandle_t handle, double *d1, /* host or device pointer */ + double *d2, /* host or device pointer */ + double *x1, /* host or device pointer */ + const double *y1, /* host or device pointer */ + double *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, double *, double *, double *, const double *, double *); static auto func_ptr = LoadSymbol("cublasDrotmg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, d1, d2, x1, y1, param); @@ -1040,2031 +1003,1701 @@ cublasRotmgEx(cublasHandle_t handle, void *d1, /* host or device pointer */ paramType, executiontype); } -cublasStatus_t CUBLASWINAPI cublasSgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const float *, const float *, + int, const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSgemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const double *, + const double *, int, const double *, int, const double *, double 
*, int); static auto func_ptr = LoadSymbol("cublasDgemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZgemv_v2( + cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int kl, int ku, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using 
FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const float *, + const float *, int, const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSgbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); } -cublasStatus_t CUBLASWINAPI cublasDgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int kl, int ku, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const double *, + const double *, int, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDgbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); } -cublasStatus_t CUBLASWINAPI cublasCgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCgbmv_v2( + cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, + int ku, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); } -cublasStatus_t CUBLASWINAPI cublasZgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI 
*)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZgbmv_v2( + cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, + int ku, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZgbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); } -cublasStatus_t CUBLASWINAPI cublasStrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *A, - int lda, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStrmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *A, - int lda, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *A, - int lda, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, 
cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const float *A, - int lda, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const float *A, int lda, float *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const double *A, - int lda, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuComplex *A, - int lda, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t 
CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *AP, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasStpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *AP, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *AP, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const double *AP, double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDtpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *AP, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t 
(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *AP, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *A, - int lda, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStrsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *A, - int lda, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - 
cublasDiagType_t diag, - int n, - const cuComplex *A, - int lda, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *AP, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasStpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *AP, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *AP, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const double *AP, double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDtpsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } 
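
Every hunk in this stub file makes the same purely line-wrapping change to wrappers that share one structure: on first call the stub resolves the real cuBLAS entry point by name, caches it in a function-local `static` pointer, and forwards its arguments unchanged; if the symbol cannot be resolved it returns an error status instead of crashing. Below is a minimal, self-contained sketch of that pattern, assuming a POSIX `dlopen`/`dlsym` environment. The `LoadSymbolFromCublas` helper, the stand-in cuBLAS typedefs, and the error constant are placeholders for TensorFlow's internal `LoadSymbol`/`GetSymbolNotFoundError` helpers and the real CUDA headers (the `CUBLASWINAPI` calling-convention macro is also omitted); it is an illustration of the idiom, not the code this patch touches.

```cpp
// Sketch only: the lazy symbol-resolution pattern used by the generated
// cuBLAS stubs, written against plain POSIX dlopen/dlsym.
#include <dlfcn.h>

#include <cstdio>

// Stand-ins for the cuBLAS types so the sketch compiles without CUDA headers.
using cublasStatus_t = int;
constexpr cublasStatus_t CUBLAS_STATUS_NOT_INITIALIZED = 1;
using cublasHandle_t = void*;

// Resolve a symbol from the real cuBLAS shared library at first use.
template <typename T>
static T LoadSymbolFromCublas(const char* name) {
  static void* handle = dlopen("libcublas.so", RTLD_LAZY | RTLD_GLOBAL);
  if (!handle) return nullptr;
  return reinterpret_cast<T>(dlsym(handle, name));
}

// A stub with the same shape as the generated wrappers: a cached function
// pointer, an error path when the symbol is missing, and a plain forward.
cublasStatus_t cublasSscal_v2(cublasHandle_t handle, int n, const float* alpha,
                              float* x, int incx) {
  using FuncPtr =
      cublasStatus_t (*)(cublasHandle_t, int, const float*, float*, int);
  static auto func_ptr = LoadSymbolFromCublas<FuncPtr>("cublasSscal_v2");
  if (!func_ptr) {
    // Stand-in for GetSymbolNotFoundError().
    std::fprintf(stderr, "cublasSscal_v2 could not be resolved\n");
    return CUBLAS_STATUS_NOT_INITIALIZED;
  }
  return func_ptr(handle, n, alpha, x, incx);
}
```

Caching the pointer in a function-local `static` means the lookup happens at most once per wrapper and is thread-safe under C++11 static initialization, which is presumably why every generated stub in this file uses the same idiom.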
-cublasStatus_t CUBLASWINAPI cublasCtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *AP, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *AP, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const float *A, - int lda, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const float *A, int lda, float *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStbsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const double *A, - int lda, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const double *, int, double *, int); 
static auto func_ptr = LoadSymbol("cublasDtbsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuComplex *A, - int lda, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtbsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtbsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasSsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSsymv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = 
cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDsymv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsymv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZsymv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsymv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasChemv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - 
cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZhemv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZhemv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSsbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *, + int, const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSsbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDsbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double 
*beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const double *, + const double *, int, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDsbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasChbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZhbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZhbmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSspmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, 
/* host or device pointer */ - const float *AP, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *AP, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, + const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSspmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDspmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *AP, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *AP, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDspmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasChpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *AP, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *AP, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, + int); static auto func_ptr = LoadSymbol("cublasChpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZhpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *AP, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, 
cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *AP, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSger_v2 (cublasHandle_t handle, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const float *, const float *, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasSger_v2( + cublasHandle_t handle, int m, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float *y, int incy, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const float *, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSger_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasDger_v2 (cublasHandle_t handle, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const double *, const double *, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDger_v2( + cublasHandle_t handle, int m, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const double *, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDger_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCgeru_v2 (cublasHandle_t handle, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCgeru_v2(cublasHandle_t handle, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgeru_v2"); if (!func_ptr) return 
GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCgerc_v2 (cublasHandle_t handle, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCgerc_v2(cublasHandle_t handle, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgerc_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZgeru_v2 (cublasHandle_t handle, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZgeru_v2(cublasHandle_t handle, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgeru_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZgerc_v2 (cublasHandle_t handle, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZgerc_v2(cublasHandle_t handle, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgerc_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasSsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI 
+cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + float *, int); static auto func_ptr = LoadSymbol("cublasSsyr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasDsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, double *, int); static auto func_ptr = LoadSymbol("cublasDsyr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCher_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *, int); 
+cublasStatus_t CUBLASWINAPI +cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, + int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZher_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasSspr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *); +cublasStatus_t CUBLASWINAPI +cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + float *); static auto func_ptr = LoadSymbol("cublasSspr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, AP); } -cublasStatus_t CUBLASWINAPI cublasDspr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *); +cublasStatus_t CUBLASWINAPI +cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, double *); static auto func_ptr = LoadSymbol("cublasDspr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, AP); } -cublasStatus_t CUBLASWINAPI cublasChpr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI +cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* 
host or device pointer */ + const cuComplex *x, int incx, cuComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, + int, cuComplex *); static auto func_ptr = LoadSymbol("cublasChpr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, AP); } -cublasStatus_t CUBLASWINAPI cublasZhpr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI +cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, + const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZhpr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, AP); } -cublasStatus_t CUBLASWINAPI cublasSsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsyr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float *y, int incy, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSsyr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasDsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsyr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDsyr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const 
cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCher2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZher2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + 
using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasSspr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI +cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float *y, int incy, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSspr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); } -cublasStatus_t CUBLASWINAPI cublasDspr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDspr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDspr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); } -cublasStatus_t CUBLASWINAPI cublasChpr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI cublasChpr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, cuComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasChpr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); } -cublasStatus_t CUBLASWINAPI cublasZhpr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI 
*)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI +cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZhpr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); } -cublasStatus_t CUBLASWINAPI cublasSgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); static auto func_ptr = LoadSymbol("cublasSgemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasDgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDgemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCgemm_v2 (cublasHandle_t 
handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCgemm3m (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCgemm3m( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemm3m"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCgemm3mEx (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCgemm3mEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex 
*alpha, const void *A, + cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, + const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const cuComplex *, void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCgemm3mEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasZgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZgemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasZgemm3m (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, + int ldb, const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, 
cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZgemm3m"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasSgemmEx (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const float *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const void *, cudaDataType, int, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasSgemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, const void *B, + cudaDataType Btype, int ldb, const float *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const float *, void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasSgemmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasGemmEx (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const void *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc, - cudaDataType computeType, - cublasGemmAlgo_t algo) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *, cudaDataType, int, const void *, cudaDataType, int, const void *, void *, cudaDataType, int, cudaDataType, cublasGemmAlgo_t); +cublasStatus_t CUBLASWINAPI cublasGemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const void *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, const void *B, + cudaDataType Btype, int ldb, const void *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, + cublasGemmAlgo_t algo) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const void *, const void *, cudaDataType, int, const void *, cudaDataType, + int, const void *, void *, cudaDataType, int, cudaDataType, + cublasGemmAlgo_t); static auto func_ptr = LoadSymbol("cublasGemmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo); + 
return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc, computeType, algo); } -cublasStatus_t CUBLASWINAPI cublasCgemmEx (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCgemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, const void *A, + cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, + const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const cuComplex *, void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCgemmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasUint8gemmBias (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, - int m, int n, int k, - const unsigned char *A, int A_bias, int lda, - const unsigned char *B, int B_bias, int ldb, - unsigned char *C, int C_bias, int ldc, - int C_mult, int C_shift) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t, int, int, int, const unsigned char *, int, int, const unsigned char *, int, int, unsigned char *, int, int, int, int); +cublasStatus_t CUBLASWINAPI cublasUint8gemmBias( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + cublasOperation_t transc, int m, int n, int k, const unsigned char *A, + int A_bias, int lda, const unsigned char *B, int B_bias, int ldb, + unsigned char *C, int C_bias, int ldc, int C_mult, int C_shift) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t, + int, int, int, const unsigned char *, int, int, const unsigned char *, + int, int, unsigned char *, int, int, int, int); static auto func_ptr = LoadSymbol("cublasUint8gemmBias"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B, B_bias, ldb, C, C_bias, ldc, C_mult, C_shift); + return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B, + B_bias, ldb, C, C_bias, ldc, C_mult, C_shift); } -cublasStatus_t CUBLASWINAPI cublasSsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, 
int, const float *, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSsyrk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasDsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDsyrk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, + int); static auto func_ptr = LoadSymbol("cublasCsyrk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasZsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex 
*, int); +cublasStatus_t CUBLASWINAPI cublasZsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyrk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyrkEx ( cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const cuComplex *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCsyrkEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, + const cuComplex *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const void *, cudaDataType, int, const cuComplex *, + void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCsyrkEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, const void *A, cudaDataType Atype, + int lda, const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const void *, cudaDataType, int, const cuComplex *, + void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCsyrk3mEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasCherk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or 
device pointer */ - const cuComplex *A, - int lda, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const cuComplex *, int, const float *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCherk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const cuComplex *, int, const float *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCherk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasZherk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZherk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const cuDoubleComplex *, int, const double *, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZherk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCherkEx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const float *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCherkEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, + const float *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const void *, cudaDataType, int, const float *, void *, + cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCherkEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); } -cublasStatus_t CUBLASWINAPI 
cublasCherk3mEx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, - const void *A, cudaDataType Atype, - int lda, - const float *beta, - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCherk3mEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, const void *A, cudaDataType Atype, + int lda, const float *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const void *, cudaDataType, int, const float *, void *, + cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCherk3mEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); static auto func_ptr = LoadSymbol("cublasSsyr2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, int, const double *, + 
double *, int); static auto func_ptr = LoadSymbol("cublasDsyr2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyr2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZsyr2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCher2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t 
(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCher2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const float *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZher2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZher2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasSsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); static auto func_ptr = LoadSymbol("cublasSsyrkx"); if 
(!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasDsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDsyrkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyrkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t 
CUBLASWINAPI cublasZsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZsyrkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCherkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCherkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const float *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCherkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZherkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZherkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZherkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, 
trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasSsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); static auto func_ptr = LoadSymbol("cublasSsymm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasDsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDsymm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + 
const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsymm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZsymm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasChemm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasChemm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZhemm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t 
uplo, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZhemm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZhemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasStrsm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - float *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, float *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const float *, const float *, int, float *, + int); static auto func_ptr = LoadSymbol("cublasStrsm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasDtrsm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - double *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, double *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const double *, const double *, int, double *, + int); static auto func_ptr = 
LoadSymbol("cublasDtrsm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - cuComplex *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, cuComplex *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrsm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrsm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasStrmm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, 
cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const float *, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrmm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasDtrmm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const double *, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrmm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrmm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - 
cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrmm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); } cublasStatus_t CUBLASWINAPI cublasSgemmBatched( @@ -3079,7 +2712,8 @@ cublasStatus_t CUBLASWINAPI cublasSgemmBatched( const float *, float *const[], int, int); static auto func_ptr = LoadSymbol("cublasSgemmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } cublasStatus_t CUBLASWINAPI cublasDgemmBatched( @@ -3094,7 +2728,8 @@ cublasStatus_t CUBLASWINAPI cublasDgemmBatched( const double *, double *const[], int, int); static auto func_ptr = LoadSymbol("cublasDgemmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } cublasStatus_t CUBLASWINAPI cublasCgemmBatched( @@ -3110,7 +2745,8 @@ cublasStatus_t CUBLASWINAPI cublasCgemmBatched( int); static auto func_ptr = LoadSymbol("cublasCgemmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched( @@ -3126,7 +2762,8 @@ cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched( int); static auto func_ptr = LoadSymbol("cublasCgemm3mBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } cublasStatus_t CUBLASWINAPI @@ -3144,7 +2781,8 @@ cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, cuDoubleComplex *const[], int, int); static auto func_ptr = LoadSymbol("cublasZgemmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, 
alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx( @@ -3188,200 +2826,155 @@ cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx( batchCount, computeType, algo); } -cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - long long int strideA, /* purposely signed */ - const float *B, - int ldb, - long long int strideB, - const float *beta, /* host or device pointer */ - float *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, long long, const float *, int, long long, const float *, float *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, long long int strideA, /* purposely signed */ + const float *B, int ldb, long long int strideB, + const float *beta, /* host or device pointer */ + float *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const float *, int, long long, const float *, int, + long long, const float *, float *, int, long long, int); static auto func_ptr = LoadSymbol("cublasSgemmStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - long long int strideA, /* purposely signed */ - const double *B, - int ldb, - long long int strideB, - const double *beta, /* host or device pointer */ - double *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, long long, const double *, int, long long, const double *, double *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, long long int strideA, /* purposely signed */ + const double *B, int ldb, long long int strideB, + const double *beta, /* host or device pointer */ + double *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const double *, const double *, int, long long, const double *, int, + long long, const double *, double *, int, long long, int); static auto func_ptr = LoadSymbol("cublasDgemmStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, 
transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - long long int strideA, /* purposely signed */ - const cuComplex *B, - int ldb, - long long int strideB, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, long long int strideA, /* purposely signed */ + const cuComplex *B, int ldb, long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, long long, const cuComplex *, + int, long long, const cuComplex *, cuComplex *, int, long long, int); static auto func_ptr = LoadSymbol("cublasCgemmStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - long long int strideA, /* purposely signed */ - const cuComplex *B, - int ldb, - long long int strideB, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, long long int strideA, /* purposely signed */ + const cuComplex *B, int ldb, long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, long long, const cuComplex *, + int, long long, const cuComplex *, cuComplex *, int, long long, int); 
static auto func_ptr = LoadSymbol("cublasCgemm3mStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - long long int strideA, /* purposely signed */ - const cuDoubleComplex *B, - int ldb, - long long int strideB, - const cuDoubleComplex *beta, /* host or device poi */ - cuDoubleComplex *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, cuDoubleComplex *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + long long int strideA, /* purposely signed */ + const cuDoubleComplex *B, int ldb, long long int strideB, + const cuDoubleComplex *beta, /* host or device poi */ + cuDoubleComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, long long, + const cuDoubleComplex *, int, long long, const cuDoubleComplex *, + cuDoubleComplex *, int, long long, int); static auto func_ptr = LoadSymbol("cublasZgemmStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *beta , /* host or device pointer */ - const float *B, - int ldb, - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasSgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *beta, /* host or device pointer */ + const float *B, int ldb, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, const float *, int, + float *, int); static auto func_ptr = LoadSymbol("cublasSgeam"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); 
+ return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *beta, /* host or device pointer */ - const double *B, - int ldb, - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *beta, /* host or device pointer */ + const double *B, int ldb, double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, const double *, int, + double *, int); static auto func_ptr = LoadSymbol("cublasDgeam"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *beta, /* host or device pointer */ - const cuComplex *B, - int ldb, - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, + const cuComplex *beta, /* host or device pointer */ + const cuComplex *B, int ldb, cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgeam"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *beta, /* host or device pointer */ - const cuDoubleComplex *B, - int ldb, - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, 
int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + const cuDoubleComplex *B, int ldb, cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZgeam"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); } cublasStatus_t CUBLASWINAPI cublasSgetrfBatched( @@ -3494,7 +3087,8 @@ cublasStatus_t CUBLASWINAPI cublasSgetrsBatched( const int *, float *const[], int, int *, int); static auto func_ptr = LoadSymbol("cublasSgetrsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, + info, batchSize); } cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( @@ -3506,7 +3100,8 @@ cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( const int *, double *const[], int, int *, int); static auto func_ptr = LoadSymbol("cublasDgetrsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, + info, batchSize); } cublasStatus_t CUBLASWINAPI cublasCgetrsBatched( @@ -3518,7 +3113,8 @@ cublasStatus_t CUBLASWINAPI cublasCgetrsBatched( int, const int *, cuComplex *const[], int, int *, int); static auto func_ptr = LoadSymbol("cublasCgetrsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, + info, batchSize); } cublasStatus_t CUBLASWINAPI cublasZgetrsBatched( @@ -3531,7 +3127,8 @@ cublasStatus_t CUBLASWINAPI cublasZgetrsBatched( cuDoubleComplex *const[], int, int *, int); static auto func_ptr = LoadSymbol("cublasZgetrsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, + info, batchSize); } cublasStatus_t CUBLASWINAPI cublasStrsmBatched( @@ -3546,7 +3143,8 @@ cublasStatus_t CUBLASWINAPI cublasStrsmBatched( float *const[], int, int); static auto func_ptr = LoadSymbol("cublasStrsmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + batchCount); } cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( @@ -3561,7 +3159,8 @@ cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( double *const[], int, int); static auto func_ptr = LoadSymbol("cublasDtrsmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + batchCount); } cublasStatus_t CUBLASWINAPI 
cublasCtrsmBatched( @@ -3576,7 +3175,8 @@ cublasStatus_t CUBLASWINAPI cublasCtrsmBatched( int, cuComplex *const[], int, int); static auto func_ptr = LoadSymbol("cublasCtrsmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + batchCount); } cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( @@ -3591,7 +3191,8 @@ cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( const cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int); static auto func_ptr = LoadSymbol("cublasZtrsmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + batchCount); } cublasStatus_t CUBLASWINAPI cublasSmatinvBatched( @@ -3710,7 +3311,8 @@ cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, float *const[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasSgelsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); } cublasStatus_t CUBLASWINAPI @@ -3724,7 +3326,8 @@ cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, double *const[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasDgelsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); } cublasStatus_t CUBLASWINAPI @@ -3737,7 +3340,8 @@ cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, cuComplex *const[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasCgelsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); } cublasStatus_t CUBLASWINAPI @@ -3751,1467 +3355,1666 @@ cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, int *, int); static auto func_ptr = LoadSymbol("cublasZgelsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); } cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const float *A, - int lda, - const float *x, - int incx, - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const float *, int, const float *, int, float *, int); + cublasSideMode_t mode, int m, int n, + const float *A, int lda, const float *x, + int incx, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSdgmm"); if (!func_ptr) return 
GetSymbolNotFoundError(); return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); } cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const double *A, - int lda, - const double *x, - int incx, - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const double *, int, const double *, int, double *, int); + cublasSideMode_t mode, int m, int n, + const double *A, int lda, + const double *x, int incx, double *C, + int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDdgmm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); } cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + cublasSideMode_t mode, int m, int n, + const cuComplex *A, int lda, + const cuComplex *x, int incx, + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCdgmm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); } cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + cublasSideMode_t mode, int m, int n, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdgmm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasStpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *AP, - float *A, - int lda ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasStpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const float *AP, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpttr"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, AP, A, lda); } -cublasStatus_t CUBLASWINAPI cublasDtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *AP, - double *A, - int lda ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI 
cublasDtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const double *AP, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDtpttr"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, AP, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *AP, - cuComplex *A, - int lda ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *AP, cuComplex *A, + int lda) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpttr"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, AP, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *AP, - cuDoubleComplex *A, - int lda ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpttr"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, AP, A, lda); } -cublasStatus_t CUBLASWINAPI cublasStrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *A, - int lda, - float *AP ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI cublasStrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const float *A, int lda, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, int, float *); static auto func_ptr = LoadSymbol("cublasStrttp"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, A, lda, AP); } -cublasStatus_t CUBLASWINAPI cublasDtrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *A, - int lda, - double *AP ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDtrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const double *A, int lda, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDtrttp"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, A, lda, AP); } -cublasStatus_t CUBLASWINAPI cublasCtrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *A, - int lda, - cuComplex *AP ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI cublasCtrttp(cublasHandle_t handle, + cublasFillMode_t 
uplo, int n, + const cuComplex *A, int lda, + cuComplex *AP) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, + const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasCtrttp"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, A, lda, AP); } -cublasStatus_t CUBLASWINAPI cublasZtrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *AP ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZtrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, + cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZtrttp"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, A, lda, AP); } -cublasStatus CUBLASWINAPI cublasInit (void) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(); +cublasStatus CUBLASWINAPI cublasInit(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); static auto func_ptr = LoadSymbol("cublasInit"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(); } -cublasStatus CUBLASWINAPI cublasShutdown (void) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(); +cublasStatus CUBLASWINAPI cublasShutdown(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); static auto func_ptr = LoadSymbol("cublasShutdown"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(); } -cublasStatus CUBLASWINAPI cublasGetError (void) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(); +cublasStatus CUBLASWINAPI cublasGetError(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); static auto func_ptr = LoadSymbol("cublasGetError"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(); } cublasStatus CUBLASWINAPI cublasGetVersion(int *version) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int *); + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int *); static auto func_ptr = LoadSymbol("cublasGetVersion"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(version); } -cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, void **); +cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void **devicePtr) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, void **); static auto func_ptr = LoadSymbol("cublasAlloc"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, devicePtr); } -cublasStatus CUBLASWINAPI cublasFree (void *devicePtr) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(void *); +cublasStatus CUBLASWINAPI cublasFree(void *devicePtr) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(void *); static auto func_ptr = LoadSymbol("cublasFree"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(devicePtr); } -cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cudaStream_t); +cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cudaStream_t); static auto func_ptr = LoadSymbol("cublasSetKernelStream"); if (!func_ptr) return GetSymbolNotFoundError(); 
return func_ptr(stream); } -float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx) { - using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int); +float CUBLASWINAPI cublasSnrm2(int n, const float *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int); static auto func_ptr = LoadSymbol("cublasSnrm2"); if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2"); return func_ptr(n, x, incx); } -double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx) { - using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int); +double CUBLASWINAPI cublasDnrm2(int n, const double *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int); static auto func_ptr = LoadSymbol("cublasDnrm2"); if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2"); return func_ptr(n, x, incx); } -float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx) { - using FuncPtr = float (CUBLASWINAPI *)(int, const cuComplex *, int); +float CUBLASWINAPI cublasScnrm2(int n, const cuComplex *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasScnrm2"); if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2"); return func_ptr(n, x, incx); } -double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx) { - using FuncPtr = double (CUBLASWINAPI *)(int, const cuDoubleComplex *, int); +double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasDznrm2"); if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2"); return func_ptr(n, x, incx); } -float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y, - int incy) { - using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int, const float *, int); +float CUBLASWINAPI cublasSdot(int n, const float *x, int incx, const float *y, + int incy) { + using FuncPtr = + float(CUBLASWINAPI *)(int, const float *, int, const float *, int); static auto func_ptr = LoadSymbol("cublasSdot"); if (!func_ptr) LogFatalSymbolNotFound("cublasSdot"); return func_ptr(n, x, incx, y, incy); } -double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y, - int incy) { - using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int, const double *, int); +double CUBLASWINAPI cublasDdot(int n, const double *x, int incx, + const double *y, int incy) { + using FuncPtr = + double(CUBLASWINAPI *)(int, const double *, int, const double *, int); static auto func_ptr = LoadSymbol("cublasDdot"); if (!func_ptr) LogFatalSymbolNotFound("cublasDdot"); return func_ptr(n, x, incx, y, incy); } -cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y, - int incy) { - using FuncPtr = cuComplex (CUBLASWINAPI *)(int, const cuComplex *, int, const cuComplex *, int); +cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex *x, int incx, + const cuComplex *y, int incy) { + using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int, + const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCdotu"); if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu"); return func_ptr(n, x, incx, y, incy); } -cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y, - int incy) { - using FuncPtr = cuComplex (CUBLASWINAPI *)(int, const cuComplex *, int, const cuComplex *, int); +cuComplex CUBLASWINAPI cublasCdotc(int n, const 
cuComplex *x, int incx, + const cuComplex *y, int incy) { + using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int, + const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCdotc"); if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc"); return func_ptr(n, x, incx, y, incy); } -cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy) { - using FuncPtr = cuDoubleComplex (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); +cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, + int incy) { + using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)( + int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdotu"); if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu"); return func_ptr(n, x, incx, y, incy); } -cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy) { - using FuncPtr = cuDoubleComplex (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); +cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, + int incy) { + using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)( + int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdotc"); if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, float, float *, int); +void CUBLASWINAPI cublasSscal(int n, float alpha, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasSscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, double, double *, int); +void CUBLASWINAPI cublasDscal(int n, double alpha, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasDscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasCscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex *x, + int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasZscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, float, cuComplex *, int); +void CUBLASWINAPI 
cublasCsscal(int n, float alpha, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, float, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, double, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, double, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx, - float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, float, const float *, int, float *, int); +void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float *x, int incx, + float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSaxpy"); if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy"); return func_ptr(n, alpha, x, incx, y, incy); } -void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x, - int incx, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, double, const double *, int, double *, int); +void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double *x, int incx, + double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDaxpy"); if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy"); return func_ptr(n, alpha, x, incx, y, incy); } -void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x, - int incx, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex *x, + int incx, cuComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCaxpy"); if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy"); return func_ptr(n, alpha, x, incx, y, incy); } -void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZaxpy(int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZaxpy"); if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy"); return func_ptr(n, alpha, x, incx, y, incy); } -void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, const float *, int, float *, int); +void CUBLASWINAPI cublasScopy(int n, const float *x, int incx, float *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasScopy"); if (!func_ptr) LogFatalSymbolNotFound("cublasScopy"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI 
cublasDcopy (int n, const double *x, int incx, double *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, const double *, int, double *, int); +void CUBLASWINAPI cublasDcopy(int n, const double *x, int incx, double *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDcopy"); if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCcopy(int n, const cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCcopy"); if (!func_ptr) LogFatalSymbolNotFound("cublasCcopy"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZcopy"); if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int); +void CUBLASWINAPI cublasSswap(int n, float *x, int incx, float *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSswap"); if (!func_ptr) LogFatalSymbolNotFound("cublasSswap"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int); +void CUBLASWINAPI cublasDswap(int n, double *x, int incx, double *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDswap"); if (!func_ptr) LogFatalSymbolNotFound("cublasDswap"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCswap(int n, cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCswap"); if (!func_ptr) LogFatalSymbolNotFound("cublasCswap"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZswap"); if (!func_ptr) LogFatalSymbolNotFound("cublasZswap"); return func_ptr(n, x, incx, y, incy); } -int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx) { - using 
FuncPtr = int (CUBLASWINAPI *)(int, const float *, int); +int CUBLASWINAPI cublasIsamax(int n, const float *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int); static auto func_ptr = LoadSymbol("cublasIsamax"); if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const double *, int); +int CUBLASWINAPI cublasIdamax(int n, const double *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int); static auto func_ptr = LoadSymbol("cublasIdamax"); if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const cuComplex *, int); +int CUBLASWINAPI cublasIcamax(int n, const cuComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasIcamax"); if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const cuDoubleComplex *, int); +int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasIzamax"); if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const float *, int); +int CUBLASWINAPI cublasIsamin(int n, const float *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int); static auto func_ptr = LoadSymbol("cublasIsamin"); if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const double *, int); +int CUBLASWINAPI cublasIdamin(int n, const double *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int); static auto func_ptr = LoadSymbol("cublasIdamin"); if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const cuComplex *, int); +int CUBLASWINAPI cublasIcamin(int n, const cuComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasIcamin"); if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const cuDoubleComplex *, int); +int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasIzamin"); if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin"); return func_ptr(n, x, incx); } -float CUBLASWINAPI cublasSasum (int n, const float *x, int incx) { - using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int); +float CUBLASWINAPI cublasSasum(int n, const float *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int); static auto func_ptr = LoadSymbol("cublasSasum"); 
if (!func_ptr) LogFatalSymbolNotFound("cublasSasum"); return func_ptr(n, x, incx); } -double CUBLASWINAPI cublasDasum (int n, const double *x, int incx) { - using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int); +double CUBLASWINAPI cublasDasum(int n, const double *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int); static auto func_ptr = LoadSymbol("cublasDasum"); if (!func_ptr) LogFatalSymbolNotFound("cublasDasum"); return func_ptr(n, x, incx); } -float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx) { - using FuncPtr = float (CUBLASWINAPI *)(int, const cuComplex *, int); +float CUBLASWINAPI cublasScasum(int n, const cuComplex *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasScasum"); if (!func_ptr) LogFatalSymbolNotFound("cublasScasum"); return func_ptr(n, x, incx); } -double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx) { - using FuncPtr = double (CUBLASWINAPI *)(int, const cuDoubleComplex *, int); +double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasDzasum"); if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum"); return func_ptr(n, x, incx); } -void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy, - float sc, float ss) { - using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int, float, float); +void CUBLASWINAPI cublasSrot(int n, float *x, int incx, float *y, int incy, + float sc, float ss) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float *, int, float *, int, float, float); static auto func_ptr = LoadSymbol("cublasSrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasSrot"); return func_ptr(n, x, incx, y, incy, sc, ss); } -void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy, - double sc, double ss) { - using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int, double, double); +void CUBLASWINAPI cublasDrot(int n, double *x, int incx, double *y, int incy, + double sc, double ss) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double *, int, double *, int, double, double); static auto func_ptr = LoadSymbol("cublasDrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasDrot"); return func_ptr(n, x, incx, y, incy, sc, ss); } -void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, - int incy, float c, cuComplex s) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, float, cuComplex); +void CUBLASWINAPI cublasCrot(int n, cuComplex *x, int incx, cuComplex *y, + int incy, float c, cuComplex s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, + float, cuComplex); static auto func_ptr = LoadSymbol("cublasCrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasCrot"); return func_ptr(n, x, incx, y, incy, c, s); } -void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx, - cuDoubleComplex *y, int incy, double sc, - cuDoubleComplex cs) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, double, cuDoubleComplex); +void CUBLASWINAPI cublasZrot(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, double sc, + cuDoubleComplex cs) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, + double, cuDoubleComplex); static auto func_ptr = 
LoadSymbol("cublasZrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasZrot"); return func_ptr(n, x, incx, y, incy, sc, cs); } -void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y, - int incy, float c, float s) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, float, float); +void CUBLASWINAPI cublasCsrot(int n, cuComplex *x, int incx, cuComplex *y, + int incy, float c, float s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, + float, float); static auto func_ptr = LoadSymbol("cublasCsrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot"); return func_ptr(n, x, incx, y, incy, c, s); } -void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx, - cuDoubleComplex *y, int incy, double c, double s) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, double, double); +void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, double c, + double s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, + cuDoubleComplex *, int, double, double); static auto func_ptr = LoadSymbol("cublasZdrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot"); return func_ptr(n, x, incx, y, incy, c, s); } -void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss) { - using FuncPtr = void (CUBLASWINAPI *)(float *, float *, float *, float *); +void CUBLASWINAPI cublasSrotg(float *sa, float *sb, float *sc, float *ss) { + using FuncPtr = void(CUBLASWINAPI *)(float *, float *, float *, float *); static auto func_ptr = LoadSymbol("cublasSrotg"); if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg"); return func_ptr(sa, sb, sc, ss); } -void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss) { - using FuncPtr = void (CUBLASWINAPI *)(double *, double *, double *, double *); +void CUBLASWINAPI cublasDrotg(double *sa, double *sb, double *sc, double *ss) { + using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, double *); static auto func_ptr = LoadSymbol("cublasDrotg"); if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg"); return func_ptr(sa, sb, sc, ss); } -void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc, - cuComplex *cs) { - using FuncPtr = void (CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *); +void CUBLASWINAPI cublasCrotg(cuComplex *ca, cuComplex cb, float *sc, + cuComplex *cs) { + using FuncPtr = + void(CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *); static auto func_ptr = LoadSymbol("cublasCrotg"); if (!func_ptr) LogFatalSymbolNotFound("cublasCrotg"); return func_ptr(ca, cb, sc, cs); } -void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc, - cuDoubleComplex *cs) { - using FuncPtr = void (CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex, double *, cuDoubleComplex *); +void CUBLASWINAPI cublasZrotg(cuDoubleComplex *ca, cuDoubleComplex cb, + double *sc, cuDoubleComplex *cs) { + using FuncPtr = void(CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex, + double *, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZrotg"); if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg"); return func_ptr(ca, cb, sc, cs); } -void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, - const float* sparam) { - using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int, const float *); +void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, + 
const float *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float *, int, float *, int, const float *); static auto func_ptr = LoadSymbol("cublasSrotm"); if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm"); return func_ptr(n, x, incx, y, incy, sparam); } -void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, - const double* sparam) { - using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int, const double *); +void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, + const double *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double *, int, double *, int, const double *); static auto func_ptr = LoadSymbol("cublasDrotm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm"); return func_ptr(n, x, incx, y, incy, sparam); } -void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, - const float *sy1, float* sparam) { - using FuncPtr = void (CUBLASWINAPI *)(float *, float *, float *, const float *, float *); +void CUBLASWINAPI cublasSrotmg(float *sd1, float *sd2, float *sx1, + const float *sy1, float *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(float *, float *, float *, const float *, float *); static auto func_ptr = LoadSymbol("cublasSrotmg"); if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg"); return func_ptr(sd1, sd2, sx1, sy1, sparam); } -void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, - const double *sy1, double* sparam) { - using FuncPtr = void (CUBLASWINAPI *)(double *, double *, double *, const double *, double *); +void CUBLASWINAPI cublasDrotmg(double *sd1, double *sd2, double *sx1, + const double *sy1, double *sparam) { + using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, + const double *, double *); static auto func_ptr = LoadSymbol("cublasDrotmg"); if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg"); return func_ptr(sd1, sd2, sx1, sy1, sparam); } -void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha, - const float *A, int lda, const float *x, int incx, - float beta, float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSgemv(char trans, int m, int n, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSgemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv"); return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha, - const double *A, int lda, const double *x, int incx, - double beta, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDgemv(char trans, int m, int n, double alpha, + const double *A, int lda, const double *x, + int incx, double beta, double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDgemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv"); return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha, - const cuComplex *A, int lda, 
const cuComplex *x, int incx, - cuComplex beta, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCgemv(char trans, int m, int n, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv"); return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, - cuDoubleComplex beta, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgemv(char trans, int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv"); return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku, - float alpha, const float *A, int lda, - const float *x, int incx, float beta, float *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSgbmv(char trans, int m, int n, int kl, int ku, + float alpha, const float *A, int lda, + const float *x, int incx, float beta, float *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSgbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv"); return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku, - double alpha, const double *A, int lda, - const double *x, int incx, double beta, double *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDgbmv(char trans, int m, int n, int kl, int ku, + double alpha, const double *A, int lda, + const double *x, int incx, double beta, double *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, int, int, double, const double *, + int, const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDgbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv"); return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *x, int incx, cuComplex beta, cuComplex *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI 
*)(char, int, int, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCgbmv(char trans, int m, int n, int kl, int ku, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *x, int incx, cuComplex beta, + cuComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv"); return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgbmv(char trans, int m, int n, int kl, int ku, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv"); return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n, - const float *A, int lda, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, int, float *, int); +void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *, + int, float *, int); static auto func_ptr = LoadSymbol("cublasStrmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n, - const double *A, int lda, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n, - const cuComplex *A, int lda, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtrmv(char uplo, char trans, char diag, int n, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasZtrmv 
(char uplo, char trans, char diag, int n, - const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtrmv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, - const float *A, int lda, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const float *, int, float *, int); +void CUBLASWINAPI cublasStbmv(char uplo, char trans, char diag, int n, int k, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k, - const double *A, int lda, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDtbmv(char uplo, char trans, char diag, int n, int k, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k, - const cuComplex *A, int lda, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtbmv(char uplo, char trans, char diag, int n, int k, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k, - const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtbmv(char uplo, char trans, char diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); +void CUBLASWINAPI cublasStpmv(char uplo, char 
trans, char diag, int n, + const float *AP, float *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, double *, int); +void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, + const double *AP, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDtpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, cuComplex *, int); +void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, + const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *AP, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, int, float *, int); +void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *, + int, float *, int); static auto func_ptr = LoadSymbol("cublasStrsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const 
cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda, +void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP, - float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); +void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, + const float *AP, float *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, double *, int); +void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, + const double *AP, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDtpsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, cuComplex *, int); +void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, + const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, - cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *AP, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasStbsv(char uplo, char trans, - char 
diag, int n, int k, const float *A, - int lda, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const float *, int, float *, int); +void CUBLASWINAPI cublasStbsv(char uplo, char trans, char diag, int n, int k, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStbsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasDtbsv(char uplo, char trans, - char diag, int n, int k, const double *A, - int lda, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDtbsv(char uplo, char trans, char diag, int n, int k, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtbsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasCtbsv(char uplo, char trans, - char diag, int n, int k, const cuComplex *A, - int lda, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtbsv(char uplo, char trans, char diag, int n, int k, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtbsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasZtbsv(char uplo, char trans, - char diag, int n, int k, const cuDoubleComplex *A, - int lda, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtbsv(char uplo, char trans, char diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtbsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A, - int lda, const float *x, int incx, float beta, - float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsymv(char uplo, int n, float alpha, const float *A, + int lda, const float *x, int incx, float beta, + float *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsymv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv"); return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A, - int lda, const double *x, int incx, double beta, - double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, 
int, double, double *, int); +void CUBLASWINAPI cublasDsymv(char uplo, int n, double alpha, const double *A, + int lda, const double *x, int incx, double beta, + double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsymv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv"); return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A, - int lda, const cuComplex *x, int incx, cuComplex beta, - cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasChemv(char uplo, int n, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasChemv"); return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A, - int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta, - cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZhemv(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhemv"); return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha, - const float *A, int lda, const float *x, int incx, - float beta, float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsbmv(char uplo, int n, int k, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv"); return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha, - const double *A, int lda, const double *x, int incx, - double beta, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsbmv(char uplo, int n, int k, double alpha, + const double *A, int lda, const double *x, + int incx, double beta, double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, double, const double *, int, + const double *, int, double, double *, int); static auto 
func_ptr = LoadSymbol("cublasDsbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv"); return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, - const cuComplex *A, int lda, const cuComplex *x, int incx, - cuComplex beta, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasChbmv(char uplo, int n, int k, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasChbmv"); return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, - cuDoubleComplex beta, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZhbmv(char uplo, int n, int k, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv"); return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, - const float *AP, const float *x, - int incx, float beta, float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, const float *AP, + const float *x, int incx, float beta, float *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSspmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv"); return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, - const double *AP, const double *x, - int incx, double beta, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, const double *AP, + const double *x, int incx, double beta, double *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, const double *, + int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDspmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv"); return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); } void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha, - const cuComplex *AP, const cuComplex *x, - int incx, cuComplex beta, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, 
const cuComplex *, const cuComplex *, int, cuComplex, cuComplex *, int); + const cuComplex *AP, const cuComplex *x, int incx, + cuComplex beta, cuComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv"); return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); } void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha, - const cuDoubleComplex *AP, const cuDoubleComplex *x, - int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + const cuDoubleComplex *AP, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv"); return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx, - const float *y, int incy, float *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, float, const float *, int, const float *, int, float *, int); +void CUBLASWINAPI cublasSger(int m, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(int, int, float, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSger"); if (!func_ptr) LogFatalSymbolNotFound("cublasSger"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx, - const double *y, int incy, double *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, double, const double *, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDger(int m, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(int, int, double, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDger"); if (!func_ptr) LogFatalSymbolNotFound("cublasDger"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, - cuComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCgeru(int m, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgeru"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, - cuComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, cuComplex, 
const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCgerc(int m, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgerc"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, - cuDoubleComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgeru(int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgeru"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, - cuDoubleComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgerc(int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgerc"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x, - int incx, float *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, float *, int); +void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float *x, + int incx, float *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, float, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSsyr"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr"); return func_ptr(uplo, n, alpha, x, incx, A, lda); } -void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x, - int incx, double *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, double *, int); +void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double *x, + int incx, double *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + double *, int); static auto func_ptr = LoadSymbol("cublasDsyr"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr"); return func_ptr(uplo, n, alpha, x, incx, A, lda); } -void CUBLASWINAPI cublasCher (char uplo, int n, float alpha, - const cuComplex *x, int incx, cuComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCher(char uplo, 
int n, float alpha, const cuComplex *x, + int incx, cuComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher"); if (!func_ptr) LogFatalSymbolNotFound("cublasCher"); return func_ptr(uplo, n, alpha, x, incx, A, lda); } -void CUBLASWINAPI cublasZher (char uplo, int n, double alpha, - const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZher(char uplo, int n, double alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher"); if (!func_ptr) LogFatalSymbolNotFound("cublasZher"); return func_ptr(uplo, n, alpha, x, incx, A, lda); } -void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x, - int incx, float *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, float *); +void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float *x, + int incx, float *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, float, const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSspr"); if (!func_ptr) LogFatalSymbolNotFound("cublasSspr"); return func_ptr(uplo, n, alpha, x, incx, AP); } -void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x, - int incx, double *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, double *); +void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double *x, + int incx, double *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDspr"); if (!func_ptr) LogFatalSymbolNotFound("cublasDspr"); return func_ptr(uplo, n, alpha, x, incx, AP); } -void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x, - int incx, cuComplex *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const cuComplex *, int, cuComplex *); +void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex *x, + int incx, cuComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int, + cuComplex *); static auto func_ptr = LoadSymbol("cublasChpr"); if (!func_ptr) LogFatalSymbolNotFound("cublasChpr"); return func_ptr(uplo, n, alpha, x, incx, AP); } -void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x, - int incx, cuDoubleComplex *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *); +void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZhpr"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr"); return func_ptr(uplo, n, alpha, x, incx, AP); } -void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x, - int incx, const float *y, int incy, float *A, - int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float *, int); +void CUBLASWINAPI cublasSsyr2(char uplo, int n, float 
alpha, const float *x, + int incx, const float *y, int incy, float *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSsyr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x, - int incx, const double *y, int incy, double *A, - int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDsyr2(char uplo, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDsyr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, cuComplex *A, - int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCher2(char uplo, int n, cuComplex alpha, + const cuComplex *x, int incx, const cuComplex *y, + int incy, cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher2"); if (!func_ptr) LogFatalSymbolNotFound("cublasCher2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A, - int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZher2(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher2"); if (!func_ptr) LogFatalSymbolNotFound("cublasZher2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x, - int incx, const float *y, int incy, float *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float *); +void CUBLASWINAPI cublasSspr2(char uplo, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSspr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); } -void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha, - const double *x, int incx, const double *y, - int incy, double *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double *); +void CUBLASWINAPI cublasDspr2(char uplo, int 
n, double alpha, const double *x, + int incx, const double *y, int incy, double *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDspr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); } -void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha, - const cuComplex *x, int incx, const cuComplex *y, - int incy, cuComplex *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *); +void CUBLASWINAPI cublasChpr2(char uplo, int n, cuComplex alpha, + const cuComplex *x, int incx, const cuComplex *y, + int incy, cuComplex *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasChpr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); } -void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha, - const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy, cuDoubleComplex *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); +void CUBLASWINAPI cublasZhpr2(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZhpr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); } -void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k, - float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, - int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSgemm(char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSgemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm"); return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k, - double alpha, const double *A, int lda, - const double *B, int ldb, double beta, double *C, - int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDgemm(char transa, char transb, int m, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, double *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, int, double, const double *, + int, const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDgemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm"); return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI 
cublasCgemm (char transa, char transb, int m, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCgemm(char transa, char transb, int m, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm"); return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n, - int k, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - cuDoubleComplex beta, cuDoubleComplex *C, - int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgemm(char transa, char transb, int m, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm"); return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha, - const float *A, int lda, float beta, float *C, - int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsyrk(char uplo, char trans, int n, int k, float alpha, + const float *A, int lda, float beta, float *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, float, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsyrk"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k, - double alpha, const double *A, int lda, - double beta, double *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsyrk(char uplo, char trans, int n, int k, double alpha, + const double *A, int lda, double beta, double *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, double, const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsyrk"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - cuComplex beta, cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, 
cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCsyrk(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + cuComplex beta, cuComplex *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, + int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyrk"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZsyrk(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, + const cuDoubleComplex *, int, + cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyrk"); if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k, - float alpha, const cuComplex *A, int lda, - float beta, cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int, float, cuComplex *, int); +void CUBLASWINAPI cublasCherk(char uplo, char trans, int n, int k, float alpha, + const cuComplex *A, int lda, float beta, + cuComplex *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int, + float, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCherk"); if (!func_ptr) LogFatalSymbolNotFound("cublasCherk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k, - double alpha, - const cuDoubleComplex *A, int lda, - double beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const cuDoubleComplex *, int, double, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZherk(char uplo, char trans, int n, int k, double alpha, + const cuDoubleComplex *A, int lda, double beta, + cuDoubleComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, double, + const cuDoubleComplex *, int, double, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZherk"); if (!func_ptr) LogFatalSymbolNotFound("cublasZherk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha, - const float *A, int lda, const float *B, int ldb, - float beta, float *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsyr2k(char uplo, char trans, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsyr2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k"); return 
func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k, - double alpha, const double *A, int lda, - const double *B, int ldb, double beta, - double *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsyr2k(char uplo, char trans, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, double *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsyr2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCsyr2k(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyr2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, cuDoubleComplex beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZsyr2k(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyr2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, float beta, - cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, float, cuComplex *, int); +void CUBLASWINAPI cublasCher2k(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, float beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, float, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k"); return func_ptr(uplo, trans, n, k, 
alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, double beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, double, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZher2k(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + double beta, cuDoubleComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, double, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha, - const float *A, int lda, const float *B, int ldb, - float beta, float *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsymm(char side, char uplo, int m, int n, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsymm"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha, - const double *A, int lda, const double *B, int ldb, - double beta, double *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsymm(char side, char uplo, int m, int n, double alpha, + const double *A, int lda, const double *B, + int ldb, double beta, double *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsymm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha, - const cuComplex *A, int lda, const cuComplex *B, int ldb, - cuComplex beta, cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCsymm(char side, char uplo, int m, int n, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsymm"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int 
lda, const cuDoubleComplex *B, int ldb, - cuDoubleComplex beta, cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZsymm(char side, char uplo, int m, int n, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsymm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasChemm(char side, char uplo, int m, int n, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasChemm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, cuDoubleComplex beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZhemm(char side, char uplo, int m, int n, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag, - int m, int n, float alpha, const float *A, int lda, - float *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, float, const float *, int, float *, int); +void CUBLASWINAPI cublasStrsm(char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, + int lda, float *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrsm"); if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa, - char diag, int m, int n, double alpha, - const double *A, 
int lda, double *B, - int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, double, const double *, int, double *, int); +void CUBLASWINAPI cublasDtrsm(char side, char uplo, char transa, char diag, + int m, int n, double alpha, const double *A, + int lda, double *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrsm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag, - int m, int n, cuComplex alpha, const cuComplex *A, - int lda, cuComplex *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtrsm(char side, char uplo, char transa, char diag, + int m, int n, cuComplex alpha, const cuComplex *A, + int lda, cuComplex *B, int ldb) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrsm"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa, - char diag, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtrsm(char side, char uplo, char transa, char diag, + int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, + cuDoubleComplex, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrsm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag, - int m, int n, float alpha, const float *A, int lda, - float *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, float, const float *, int, float *, int); +void CUBLASWINAPI cublasStrmm(char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, + int lda, float *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrmm"); if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa, - char diag, int m, int n, double alpha, - const double *A, int lda, double *B, - int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, double, const double *, int, double *, int); +void CUBLASWINAPI cublasDtrmm(char side, char uplo, char transa, char diag, + int m, int n, double alpha, const double *A, + int lda, double *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrmm"); if 
(!func_ptr) LogFatalSymbolNotFound("cublasDtrmm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag, - int m, int n, cuComplex alpha, const cuComplex *A, - int lda, cuComplex *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtrmm(char side, char uplo, char transa, char diag, + int m, int n, cuComplex alpha, const cuComplex *A, + int lda, cuComplex *B, int ldb) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrmm"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa, - char diag, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, cuDoubleComplex *B, - int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtrmm(char side, char uplo, char transa, char diag, + int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, + cuDoubleComplex, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrmm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); diff --git a/tensorflow/stream_executor/cuda/cublas_9_0.inc b/tensorflow/stream_executor/cuda/cublas_9_0.inc index ba46426878f..5e716114b23 100644 --- a/tensorflow/stream_executor/cuda/cublas_9_0.inc +++ b/tensorflow/stream_executor/cuda/cublas_9_0.inc @@ -2,5120 +2,4814 @@ extern "C" { -cublasStatus_t CUBLASWINAPI cublasCreate_v2 (cublasHandle_t *handle) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t *); +cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t *); static auto func_ptr = LoadSymbol("cublasCreate_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cublasStatus_t CUBLASWINAPI cublasDestroy_v2 (cublasHandle_t handle) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t); +cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t); static auto func_ptr = LoadSymbol("cublasDestroy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int *version) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int *); +cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, + int *version) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int *); static auto func_ptr = LoadSymbol("cublasGetVersion_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, version); } -cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int *value) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(libraryPropertyType, int *); +cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, + int 
*value) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(libraryPropertyType, int *); static auto func_ptr = LoadSymbol("cublasGetProperty"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(type, value); } -cublasStatus_t CUBLASWINAPI cublasSetStream_v2 (cublasHandle_t handle, cudaStream_t streamId) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t); static auto func_ptr = LoadSymbol("cublasSetStream_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cublasStatus_t CUBLASWINAPI cublasGetStream_v2 (cublasHandle_t handle, cudaStream_t *streamId) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *); +cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *); static auto func_ptr = LoadSymbol("cublasGetStream_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t *mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *); +cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, + cublasPointerMode_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *); static auto func_ptr = LoadSymbol("cublasGetPointerMode_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2 (cublasHandle_t handle, cublasPointerMode_t mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t); +cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, + cublasPointerMode_t mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t); static auto func_ptr = LoadSymbol("cublasSetPointerMode_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t *mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *); +cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, + cublasAtomicsMode_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *); static auto func_ptr = LoadSymbol("cublasGetAtomicsMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t); +cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, + cublasAtomicsMode_t mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t); static auto func_ptr = LoadSymbol("cublasSetAtomicsMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t *mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *); +cublasStatus_t CUBLASWINAPI 
cublasGetMathMode(cublasHandle_t handle, + cublasMath_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *); static auto func_ptr = LoadSymbol("cublasGetMathMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasMath_t); +cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, + cublasMath_t mode) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t); static auto func_ptr = LoadSymbol("cublasSetMathMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode); } -cublasStatus_t CUBLASWINAPI cublasSetVector (int n, int elemSize, const void *x, - int incx, void *devicePtr, int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int); +cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void *x, + int incx, void *devicePtr, + int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int); static auto func_ptr = LoadSymbol("cublasSetVector"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, x, incx, devicePtr, incy); } -cublasStatus_t CUBLASWINAPI cublasGetVector (int n, int elemSize, const void *x, - int incx, void *y, int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int); +cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void *x, + int incx, void *y, int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int); static auto func_ptr = LoadSymbol("cublasGetVector"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSetMatrix (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int); +cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *, + int, void *, int); static auto func_ptr = LoadSymbol("cublasSetMatrix"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rows, cols, elemSize, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasGetMatrix (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int); +cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *, + int, void *, int); static auto func_ptr = LoadSymbol("cublasGetMatrix"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rows, cols, elemSize, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasSetVectorAsync (int n, int elemSize, - const void *hostPtr, int incx, - void *devicePtr, int incy, - cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(int n, int elemSize, + const void *hostPtr, int incx, + void *devicePtr, int incy, + 
cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, + void *, int, cudaStream_t); static auto func_ptr = LoadSymbol("cublasSetVectorAsync"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream); } -cublasStatus_t CUBLASWINAPI cublasGetVectorAsync (int n, int elemSize, - const void *devicePtr, int incx, - void *hostPtr, int incy, - cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, const void *, int, void *, int, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(int n, int elemSize, + const void *devicePtr, + int incx, void *hostPtr, + int incy, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, + void *, int, cudaStream_t); static auto func_ptr = LoadSymbol("cublasGetVectorAsync"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream); } -cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb, cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync(int rows, int cols, + int elemSize, const void *A, + int lda, void *B, int ldb, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + int, int, int, const void *, int, void *, int, cudaStream_t); static auto func_ptr = LoadSymbol("cublasSetMatrixAsync"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream); } -cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync (int rows, int cols, int elemSize, - const void *A, int lda, void *B, - int ldb, cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, int, const void *, int, void *, int, cudaStream_t); +cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync(int rows, int cols, + int elemSize, const void *A, + int lda, void *B, int ldb, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + int, int, int, const void *, int, void *, int, cudaStream_t); static auto func_ptr = LoadSymbol("cublasGetMatrixAsync"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream); } -void CUBLASWINAPI cublasXerbla (const char *srName, int info) { - using FuncPtr = void (CUBLASWINAPI *)(const char *, int); +void CUBLASWINAPI cublasXerbla(const char *srName, int info) { + using FuncPtr = void(CUBLASWINAPI *)(const char *, int); static auto func_ptr = LoadSymbol("cublasXerbla"); if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla"); return func_ptr(srName, info); } -cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int incx, - void *result, - cudaDataType resultType, - cudaDataType executionType) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); +cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, void *result, + cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, void *, + cudaDataType, cudaDataType); static auto func_ptr = LoadSymbol("cublasNrm2Ex"); 
if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, xType, incx, result, resultType, executionType); } -cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, int n, + const float *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSnrm2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, int n, + const double *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDnrm2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *); +cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, float *); static auto func_ptr = LoadSymbol("cublasScnrm2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, double *); static auto func_ptr = LoadSymbol("cublasDznrm2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDotEx (cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int incx, - const void *y, - cudaDataType yType, - int incy, - void *result, - cudaDataType resultType, - cudaDataType executionType) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); +cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, const void *y, + cudaDataType yType, int incy, + void *result, cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, const void *, + cudaDataType, int, void *, cudaDataType, cudaDataType); static auto func_ptr = LoadSymbol("cublasDotEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return 
func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, + executionType); } -cublasStatus_t CUBLASWINAPI cublasDotcEx (cublasHandle_t handle, - int n, - const void *x, - cudaDataType xType, - int incx, - const void *y, - cudaDataType yType, - int incy, - void *result, - cudaDataType resultType, - cudaDataType executionType) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, int, const void *, cudaDataType, int, void *, cudaDataType, cudaDataType); +cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, const void *y, + cudaDataType yType, int incy, + void *result, cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, const void *, + cudaDataType, int, void *, cudaDataType, cudaDataType); static auto func_ptr = LoadSymbol("cublasDotcEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, executionType); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, + executionType); } -cublasStatus_t CUBLASWINAPI cublasSdot_v2 (cublasHandle_t handle, - int n, - const float *x, - int incx, - const float *y, - int incy, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, int n, + const float *x, int incx, + const float *y, int incy, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, int, const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSdot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasDdot_v2 (cublasHandle_t handle, - int n, - const double *x, - int incx, - const double *y, - int incy, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, int n, + const double *x, int incx, + const double *y, int incy, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, int, const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDdot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasCdotu_v2 (cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + const cuComplex *y, int incy, + cuComplex *result) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + int, const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasCdotu_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasCdotc_v2 
(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + const cuComplex *y, int incy, + cuComplex *result) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + int, const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasCdotc_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasZdotu_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZdotu_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasZdotc_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZdotc_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, result); } -cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, - int n, - const void *alpha, /* host or device pointer */ - cudaDataType alphaType, - void *x, - cudaDataType xType, - int incx, - cudaDataType executionType) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType, int, cudaDataType); +cublasStatus_t CUBLASWINAPI +cublasScalEx(cublasHandle_t handle, int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, void *x, cudaDataType xType, int incx, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType, + int, cudaDataType); static auto func_ptr = LoadSymbol("cublasScalEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType); } -cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, float *, int); +cublasStatus_t 
CUBLASWINAPI +cublasSscal_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDscal_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, - int n, - const cuComplex *alpha, /* host or device pointer */ - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCscal_v2(cublasHandle_t handle, int n, + const cuComplex *alpha, /* host or device pointer */ + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCsscal_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZscal_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZdscal_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + 
cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdscal_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx); } -cublasStatus_t CUBLASWINAPI cublasAxpyEx (cublasHandle_t handle, - int n, - const void *alpha, /* host or device pointer */ - cudaDataType alphaType, - const void *x, - cudaDataType xType, - int incx, - void *y, - cudaDataType yType, - int incy, - cudaDataType executiontype) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const void *, cudaDataType, const void *, cudaDataType, int, void *, cudaDataType, int, cudaDataType); +cublasStatus_t CUBLASWINAPI cublasAxpyEx( + cublasHandle_t handle, int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, const void *x, cudaDataType xType, int incx, + void *y, cudaDataType yType, int incy, cudaDataType executiontype) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, const void *, + cudaDataType, int, void *, cudaDataType, int, cudaDataType); static auto func_ptr = LoadSymbol("cublasAxpyEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy, executiontype); + return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy, + executiontype); } -cublasStatus_t CUBLASWINAPI cublasSaxpy_v2 (cublasHandle_t handle, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSaxpy_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSaxpy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDaxpy_v2 (cublasHandle_t handle, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDaxpy_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDaxpy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCaxpy_v2 (cublasHandle_t handle, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCaxpy_v2(cublasHandle_t handle, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *y, int incy) { + using FuncPtr = + 
cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCaxpy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZaxpy_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZaxpy_v2( + cublasHandle_t handle, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZaxpy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, alpha, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasScopy_v2 (cublasHandle_t handle, - int n, - const float *x, - int incx, - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasScopy_v2(cublasHandle_t handle, int n, + const float *x, int incx, float *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasScopy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDcopy_v2 (cublasHandle_t handle, - int n, - const double *x, - int incx, - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDcopy_v2(cublasHandle_t handle, int n, + const double *x, int incx, double *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDcopy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCcopy_v2 (cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCcopy_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCcopy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZcopy_v2 (cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZcopy_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, 
int, + const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZcopy_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSswap_v2 (cublasHandle_t handle, - int n, - float *x, - int incx, - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasSswap_v2(cublasHandle_t handle, int n, + float *x, int incx, float *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, + int, float *, int); static auto func_ptr = LoadSymbol("cublasSswap_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDswap_v2 (cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDswap_v2(cublasHandle_t handle, int n, + double *x, int incx, double *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, double *, + int, double *, int); static auto func_ptr = LoadSymbol("cublasDswap_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCswap_v2 (cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCswap_v2(cublasHandle_t handle, int n, + cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCswap_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZswap_v2 (cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZswap_v2(cublasHandle_t handle, int n, + cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZswap_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy); } -cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, int n, + const float *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, int *); static auto func_ptr = LoadSymbol("cublasIsamax_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *); 
+cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, int n, + const double *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, int *); static auto func_ptr = LoadSymbol("cublasIdamax_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const cuComplex *, int, int *); static auto func_ptr = LoadSymbol("cublasIcamax_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, int *); static auto func_ptr = LoadSymbol("cublasIzamax_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, int n, + const float *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, int *); static auto func_ptr = LoadSymbol("cublasIsamin_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, int n, + const double *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, int *); static auto func_ptr = LoadSymbol("cublasIdamin_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - int *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const cuComplex *, int, int *); static auto func_ptr = LoadSymbol("cublasIcamin_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - int *result) { - using 
FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, int *); +cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, int *); static auto func_ptr = LoadSymbol("cublasIzamin_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, - int n, - const float *x, - int incx, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, int n, + const float *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSasum_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, - int n, - const double *x, - int incx, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, int n, + const double *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDasum_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, - int n, - const cuComplex *x, - int incx, - float *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, int, float *); +cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, float *); static auto func_ptr = LoadSymbol("cublasScasum_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, - int n, - const cuDoubleComplex *x, - int incx, - double *result) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, double *); static auto func_ptr = LoadSymbol("cublasDzasum_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, result); } -cublasStatus_t CUBLASWINAPI cublasSrot_v2 (cublasHandle_t handle, - int n, - float *x, - int incx, - float *y, - int incy, - const float *c, /* host or device pointer */ - const float *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *, const float *); +cublasStatus_t CUBLASWINAPI +cublasSrot_v2(cublasHandle_t handle, int n, float *x, int incx, float *y, + int incy, const float *c, /* host or device pointer */ + const float *s) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, + 
int, const float *, const float *); static auto func_ptr = LoadSymbol("cublasSrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasDrot_v2 (cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy, - const double *c, /* host or device pointer */ - const double *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *, const double *); +cublasStatus_t CUBLASWINAPI +cublasDrot_v2(cublasHandle_t handle, int n, double *x, int incx, double *y, + int incy, const double *c, /* host or device pointer */ + const double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, double *, int, double *, int, const double *, + const double *); static auto func_ptr = LoadSymbol("cublasDrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasCrot_v2 (cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy, - const float *c, /* host or device pointer */ - const cuComplex *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const cuComplex *); +cublasStatus_t CUBLASWINAPI cublasCrot_v2( + cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y, + int incy, const float *c, /* host or device pointer */ + const cuComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, + const cuComplex *); static auto func_ptr = LoadSymbol("cublasCrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, - int n, - cuComplex *x, - int incx, - cuComplex *y, - int incy, - const float *c, /* host or device pointer */ - const float *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, const float *); +cublasStatus_t CUBLASWINAPI cublasCsrot_v2( + cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y, + int incy, const float *c, /* host or device pointer */ + const float *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, + const float *); static auto func_ptr = LoadSymbol("cublasCsrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasZrot_v2 (cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy, - const double *c, /* host or device pointer */ - const cuDoubleComplex *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZrot_v2( + cublasHandle_t handle, int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */ + const cuDoubleComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, + const double *, const cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, 
incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, - int n, - cuDoubleComplex *x, - int incx, - cuDoubleComplex *y, - int incy, - const double *c, /* host or device pointer */ - const double *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, const double *, const double *); +cublasStatus_t CUBLASWINAPI cublasZdrot_v2( + cublasHandle_t handle, int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */ + const double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, + const double *, const double *); static auto func_ptr = LoadSymbol("cublasZdrot_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, c, s); } -cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, - float *a, /* host or device pointer */ - float *b, /* host or device pointer */ - float *c, /* host or device pointer */ - float *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, float *); +cublasStatus_t CUBLASWINAPI +cublasSrotg_v2(cublasHandle_t handle, float *a, /* host or device pointer */ + float *b, /* host or device pointer */ + float *c, /* host or device pointer */ + float *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, float *, + float *, float *, float *); static auto func_ptr = LoadSymbol("cublasSrotg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, a, b, c, s); } -cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, - double *a, /* host or device pointer */ - double *b, /* host or device pointer */ - double *c, /* host or device pointer */ - double *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, double *); +cublasStatus_t CUBLASWINAPI +cublasDrotg_v2(cublasHandle_t handle, double *a, /* host or device pointer */ + double *b, /* host or device pointer */ + double *c, /* host or device pointer */ + double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, double *, + double *, double *, double *); static auto func_ptr = LoadSymbol("cublasDrotg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, a, b, c, s); } -cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle, - cuComplex *a, /* host or device pointer */ - cuComplex *b, /* host or device pointer */ - float *c, /* host or device pointer */ - cuComplex *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *); +cublasStatus_t CUBLASWINAPI +cublasCrotg_v2(cublasHandle_t handle, cuComplex *a, /* host or device pointer */ + cuComplex *b, /* host or device pointer */ + float *c, /* host or device pointer */ + cuComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *); static auto func_ptr = LoadSymbol("cublasCrotg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, a, b, c, s); } -cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, - cuDoubleComplex *a, /* host or device pointer */ - cuDoubleComplex *b, /* host or device pointer */ - double *c, /* host or device pointer */ - cuDoubleComplex *s) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cuDoubleComplex *, 
cuDoubleComplex *, double *, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZrotg_v2( + cublasHandle_t handle, cuDoubleComplex *a, /* host or device pointer */ + cuDoubleComplex *b, /* host or device pointer */ + double *c, /* host or device pointer */ + cuDoubleComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *, + cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZrotg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, a, b, c, s); } -cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, - int n, - float *x, - int incx, - float *y, - int incy, - const float* param) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, int, const float *); +cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, int n, + float *x, int incx, float *y, + int incy, const float *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, float *, int, float *, int, const float *); static auto func_ptr = LoadSymbol("cublasSrotm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, param); } -cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, - int n, - double *x, - int incx, - double *y, - int incy, - const double* param) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *, int, double *, int, const double *); +cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, int n, + double *x, int incx, double *y, + int incy, const double *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, double *, int, double *, int, const double *); static auto func_ptr = LoadSymbol("cublasDrotm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, x, incx, y, incy, param); } -cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, - float *d1, /* host or device pointer */ - float *d2, /* host or device pointer */ - float *x1, /* host or device pointer */ - const float *y1, /* host or device pointer */ - float *param) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, float *, float *, float *, const float *, float *); +cublasStatus_t CUBLASWINAPI +cublasSrotmg_v2(cublasHandle_t handle, float *d1, /* host or device pointer */ + float *d2, /* host or device pointer */ + float *x1, /* host or device pointer */ + const float *y1, /* host or device pointer */ + float *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, float *, float *, float *, const float *, float *); static auto func_ptr = LoadSymbol("cublasSrotmg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, d1, d2, x1, y1, param); } -cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, - double *d1, /* host or device pointer */ - double *d2, /* host or device pointer */ - double *x1, /* host or device pointer */ - const double *y1, /* host or device pointer */ - double *param) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, double *, double *, double *, const double *, double *); +cublasStatus_t CUBLASWINAPI +cublasDrotmg_v2(cublasHandle_t handle, double *d1, /* host or device pointer */ + double *d2, /* host or device pointer */ + double *x1, /* host or device pointer */ + const double *y1, /* host or device pointer */ + double *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + 
cublasHandle_t, double *, double *, double *, const double *, double *); static auto func_ptr = LoadSymbol("cublasDrotmg_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, d1, d2, x1, y1, param); } -cublasStatus_t CUBLASWINAPI cublasSgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const float *, const float *, + int, const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSgemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const double *, + const double *, int, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDgemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = 
LoadSymbol("cublasCgemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZgemv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZgemv_v2( + cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int kl, int ku, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const float *, + const float *, int, const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSgbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); } -cublasStatus_t CUBLASWINAPI cublasDgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int kl, int ku, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or 
device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const double *, + const double *, int, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDgbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); } -cublasStatus_t CUBLASWINAPI cublasCgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCgbmv_v2( + cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, + int ku, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); } -cublasStatus_t CUBLASWINAPI cublasZgbmv_v2 (cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int kl, - int ku, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZgbmv_v2( + cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, + int ku, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZgbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); } -cublasStatus_t CUBLASWINAPI cublasStrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *A, - int 
lda, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStrmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *A, - int lda, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *A, - int lda, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtrmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } 
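
Every wrapper in this stub follows the same lazy-loading shape: alias the exact cuBLAS function-pointer type, resolve the symbol once into a function-local static, bail out with GetSymbolNotFoundError() when the library or the entry point is missing, and otherwise forward the arguments untouched. The self-contained sketch below illustrates that shape outside of TensorFlow, assuming a Linux system where dlopen/dlsym and glibc's libm.so.6 are available; LoadSymbolOrNull and the use of libm's cos as the resolved symbol are illustrative stand-ins, not part of this patch.

#include <dlfcn.h>  // dlopen/dlsym; link with -ldl on older glibc
#include <cstdio>

// Illustrative stand-in for the stub's symbol-loading helper: resolve a
// symbol from an already-opened shared library and cast it to the expected
// function-pointer type. Returns nullptr when the symbol cannot be found.
template <typename FuncPtr>
FuncPtr LoadSymbolOrNull(void *library, const char *symbol_name) {
  if (!library) return nullptr;
  return reinterpret_cast<FuncPtr>(dlsym(library, symbol_name));
}

// Wrapper with the same shape as the cublas*_v2 stubs in this diff. libm's
// cos() stands in for a cuBLAS entry point; the real stubs return
// GetSymbolNotFoundError() instead of a sentinel value.
double my_cos(double x) {
  using FuncPtr = double (*)(double);  // exact signature of the target symbol
  static void *libm = dlopen("libm.so.6", RTLD_LAZY);  // opened once (glibc assumed)
  static auto func_ptr = LoadSymbolOrNull<FuncPtr>(libm, "cos");
  if (!func_ptr) return 0.0;  // stand-in for the error path
  return func_ptr(x);         // forward the call unchanged
}

int main() {
  std::printf("cos(0.0) resolved at runtime: %f\n", my_cos(0.0));
  return 0;
}

As far as the hunks above and below show, this patch only re-wraps these wrappers to clang-format's line-breaking conventions; the resolve-once-and-forward behaviour itself is unchanged.
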
-cublasStatus_t CUBLASWINAPI cublasStbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const float *A, - int lda, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const float *A, int lda, float *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const double *A, - int lda, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuComplex *A, - int lda, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + 
cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *AP, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasStpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *AP, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *AP, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const double *AP, double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDtpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *AP, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *AP, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, int 
incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *A, - int lda, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStrsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *A, - int lda, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *A, - int lda, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtrsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI 
cublasZtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const float *AP, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasStpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *AP, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const double *AP, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const double *AP, double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDtpsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuComplex *AP, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtpsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - const cuDoubleComplex *AP, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, const cuDoubleComplex 
*, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); } -cublasStatus_t CUBLASWINAPI cublasStbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const float *A, - int lda, - float *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const float *A, int lda, float *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStbsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasDtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const double *A, - int lda, - double *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtbsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasCtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int n, - int k, - const cuComplex *A, - int lda, - cuComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtbsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasZtbsv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - 
cublasDiagType_t diag, - int n, - int k, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *x, - int incx) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtbsv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); } -cublasStatus_t CUBLASWINAPI cublasSsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSsymv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDsymv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasCsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, 
int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsymv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZsymv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZsymv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsymv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasChemv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZhemv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = 
cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZhemv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhemv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSsbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *, + int, const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSsbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDsbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const double *, + const double *, int, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDsbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasChbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t 
(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZhbmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZhbmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhbmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSspmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *AP, - const float *x, - int incx, - const float *beta, /* host or device pointer */ - float *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *AP, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, + const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSspmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasDspmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *AP, - const double *x, - int incx, - const double *beta, /* host or device pointer */ - double *y, - int incy) { - using FuncPtr = cublasStatus_t 
(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *AP, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDspmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasChpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *AP, - const cuComplex *x, - int incx, - const cuComplex *beta, /* host or device pointer */ - cuComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *AP, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, + int); static auto func_ptr = LoadSymbol("cublasChpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasZhpmv_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *AP, - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *y, - int incy) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *AP, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhpmv_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); } -cublasStatus_t CUBLASWINAPI cublasSger_v2 (cublasHandle_t handle, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const float *, const float *, int, const float *, int, float *, int); +cublasStatus_t 
CUBLASWINAPI cublasSger_v2( + cublasHandle_t handle, int m, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float *y, int incy, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const float *, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSger_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasDger_v2 (cublasHandle_t handle, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const double *, const double *, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDger_v2( + cublasHandle_t handle, int m, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const double *, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDger_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCgeru_v2 (cublasHandle_t handle, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCgeru_v2(cublasHandle_t handle, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgeru_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCgerc_v2 (cublasHandle_t handle, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCgerc_v2(cublasHandle_t handle, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgerc_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZgeru_v2 (cublasHandle_t handle, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - 
int incy, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZgeru_v2(cublasHandle_t handle, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgeru_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZgerc_v2 (cublasHandle_t handle, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZgerc_v2(cublasHandle_t handle, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgerc_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasSsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI +cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + float *, int); static auto func_ptr = LoadSymbol("cublasSsyr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasDsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI +cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, double *, int); static auto func_ptr = LoadSymbol("cublasDsyr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return 
func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZsyr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCher_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, + int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZher_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, + const cuDoubleComplex *, int, cuDoubleComplex *, 
int); static auto func_ptr = LoadSymbol("cublasZher_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); } -cublasStatus_t CUBLASWINAPI cublasSspr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - float *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, float *); +cublasStatus_t CUBLASWINAPI +cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + float *); static auto func_ptr = LoadSymbol("cublasSspr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, AP); } -cublasStatus_t CUBLASWINAPI cublasDspr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - double *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, double *); +cublasStatus_t CUBLASWINAPI +cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, double *); static auto func_ptr = LoadSymbol("cublasDspr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, AP); } -cublasStatus_t CUBLASWINAPI cublasChpr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - cuComplex *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI +cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, + int, cuComplex *); static auto func_ptr = LoadSymbol("cublasChpr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, AP); } -cublasStatus_t CUBLASWINAPI cublasZhpr_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI +cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, + const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZhpr_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, AP); } 
-cublasStatus_t CUBLASWINAPI cublasSsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsyr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float *y, int incy, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSsyr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasDsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsyr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDsyr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZsyr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, 
/* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCher2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZher2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *A, - int lda) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); } -cublasStatus_t CUBLASWINAPI cublasSspr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *alpha, /* host or device pointer */ - const float *x, - int incx, - const float *y, - int incy, - float *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI +cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float *y, int incy, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSspr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, 
y, incy, AP); } -cublasStatus_t CUBLASWINAPI cublasDspr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *alpha, /* host or device pointer */ - const double *x, - int incx, - const double *y, - int incy, - double *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, const double *, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDspr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDspr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); } -cublasStatus_t CUBLASWINAPI cublasChpr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *x, - int incx, - const cuComplex *y, - int incy, - cuComplex *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI cublasChpr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, cuComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasChpr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); } -cublasStatus_t CUBLASWINAPI cublasZhpr2_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *x, - int incx, - const cuDoubleComplex *y, - int incy, - cuDoubleComplex *AP) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI +cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZhpr2_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); } -cublasStatus_t CUBLASWINAPI cublasSgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, const float *, int, const float *, 
float *, int); +cublasStatus_t CUBLASWINAPI cublasSgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); static auto func_ptr = LoadSymbol("cublasSgemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasDgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDgemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return 
func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCgemm3m (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCgemm3m( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemm3m"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCgemm3mEx (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCgemm3mEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, const void *A, + cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, + const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const cuComplex *, void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCgemm3mEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasZgemm_v2 (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const 
cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZgemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasZgemm3m (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI +cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, + int ldb, const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZgemm3m"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasSgemmEx (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const float *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const void *, cudaDataType, int, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasSgemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, const void *B, + cudaDataType Btype, int ldb, const float *beta, /* host or device pointer */ + void *C, 
cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const float *, void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasSgemmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasGemmEx (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const void *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const void *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc, - cudaDataType computeType, - cublasGemmAlgo_t algo) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const void *, const void *, cudaDataType, int, const void *, cudaDataType, int, const void *, void *, cudaDataType, int, cudaDataType, cublasGemmAlgo_t); +cublasStatus_t CUBLASWINAPI cublasGemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const void *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, const void *B, + cudaDataType Btype, int ldb, const void *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc, cudaDataType computeType, + cublasGemmAlgo_t algo) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const void *, const void *, cudaDataType, int, const void *, cudaDataType, + int, const void *, void *, cudaDataType, int, cudaDataType, + cublasGemmAlgo_t); static auto func_ptr = LoadSymbol("cublasGemmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc, computeType, algo); } -cublasStatus_t CUBLASWINAPI cublasCgemmEx (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, - int m, int n, int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const void *B, - cudaDataType Btype, - int ldb, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const void *, cudaDataType, int, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCgemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, const void *A, + cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, + const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const cuComplex *, void *, cudaDataType, 
int); static auto func_ptr = LoadSymbol("cublasCgemmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, beta, C, Ctype, ldc); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasUint8gemmBias (cublasHandle_t handle, - cublasOperation_t transa, cublasOperation_t transb, cublasOperation_t transc, - int m, int n, int k, - const unsigned char *A, int A_bias, int lda, - const unsigned char *B, int B_bias, int ldb, - unsigned char *C, int C_bias, int ldc, - int C_mult, int C_shift) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t, int, int, int, const unsigned char *, int, int, const unsigned char *, int, int, unsigned char *, int, int, int, int); +cublasStatus_t CUBLASWINAPI cublasUint8gemmBias( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + cublasOperation_t transc, int m, int n, int k, const unsigned char *A, + int A_bias, int lda, const unsigned char *B, int B_bias, int ldb, + unsigned char *C, int C_bias, int ldc, int C_mult, int C_shift) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, cublasOperation_t, + int, int, int, const unsigned char *, int, int, const unsigned char *, + int, int, unsigned char *, int, int, int, int); static auto func_ptr = LoadSymbol("cublasUint8gemmBias"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B, B_bias, ldb, C, C_bias, ldc, C_mult, C_shift); + return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B, + B_bias, ldb, C, C_bias, ldc, C_mult, C_shift); } -cublasStatus_t CUBLASWINAPI cublasSsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasSsyrk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasDsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, 
cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDsyrk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, + int); static auto func_ptr = LoadSymbol("cublasCsyrk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasZsyrk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyrk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyrkEx ( cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const cuComplex *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, 
int); +cublasStatus_t CUBLASWINAPI cublasCsyrkEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, + const cuComplex *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const void *, cudaDataType, int, const cuComplex *, + void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCsyrkEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, - const void *A, - cudaDataType Atype, - int lda, - const cuComplex *beta, - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const void *, cudaDataType, int, const cuComplex *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, const void *A, cudaDataType Atype, + int lda, const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const void *, cudaDataType, int, const cuComplex *, + void *, cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCsyrk3mEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasCherk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const cuComplex *, int, const float *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCherk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const cuComplex *, int, const float *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCherk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasZherk_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int 
ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZherk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const cuDoubleComplex *, int, const double *, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZherk_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCherkEx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const void *A, - cudaDataType Atype, - int lda, - const float *beta, /* host or device pointer */ - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCherkEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, + const float *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const void *, cudaDataType, int, const float *, void *, + cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCherkEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasCherk3mEx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, - const void *A, cudaDataType Atype, - int lda, - const float *beta, - void *C, - cudaDataType Ctype, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const void *, cudaDataType, int, const float *, void *, cudaDataType, int); +cublasStatus_t CUBLASWINAPI cublasCherk3mEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, const void *A, cudaDataType Atype, + int lda, const float *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const void *, cudaDataType, int, const float *, void *, + cudaDataType, int); static auto func_ptr = LoadSymbol("cublasCherk3mEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, Ctype, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); } -cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2 (cublasHandle_t handle, - 
cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); static auto func_ptr = LoadSymbol("cublasSsyr2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDsyr2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, 
cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyr2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZsyr2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCher2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCher2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const float *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZher2k_v2 (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, 
- int ldb, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZher2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher2k_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasSsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); static auto func_ptr = LoadSymbol("cublasSsyrkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasDsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); static 
auto func_ptr = LoadSymbol("cublasDsyrkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyrkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZsyrkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZsyrkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCherkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const float *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, 
cublasFillMode_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const float *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCherkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const float *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCherkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZherkx (cublasHandle_t handle, - cublasFillMode_t uplo, - cublasOperation_t trans, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const double *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZherkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZherkx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasSsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - const float *beta, /* host or device pointer */ - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const float *, const float *, int, const float *, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasSsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); static auto func_ptr = LoadSymbol("cublasSsymm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - 
return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasDsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - const double *beta, /* host or device pointer */ - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const double *, const double *, int, const double *, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDsymm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsymm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZsymm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZsymm_v2( + cublasHandle_t handle, 
cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZsymm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasChemm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasChemm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZhemm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZhemm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZhemm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, m, 
n, alpha, A, lda, B, ldb, beta, C, ldc); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasStrsm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - float *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, float *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const float *, const float *, int, float *, + int); static auto func_ptr = LoadSymbol("cublasStrsm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasDtrsm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - double *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, double *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const double *, const double *, int, double *, + int); static auto func_ptr = LoadSymbol("cublasDtrsm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - cuComplex *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, cuComplex *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrsm_v2"); if (!func_ptr) return 
GetSymbolNotFoundError(); return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *B, - int ldb) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrsm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); } -cublasStatus_t CUBLASWINAPI cublasStrmm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *B, - int ldb, - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *, int, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasStrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const float *, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrmm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasDtrmm_v2 (cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *B, - int ldb, - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *, int, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const 
double *B, int ldb, double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const double *, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrmm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle, - cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *B, - int ldb, - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrmm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, cublasSideMode_t side, - cublasFillMode_t uplo, - cublasOperation_t trans, - cublasDiagType_t diag, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *B, - int ldb, - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrmm_v2"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, C, ldc); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); } -cublasStatus_t CUBLASWINAPI cublasSgemmBatched (cublasHandle_t 
handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *Aarray[], - int lda, - const float *Barray[], - int ldb, - const float *beta, /* host or device pointer */ - float *Carray[], - int ldc, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *[], int, const float *[], int, const float *, float *[], int, int); +cublasStatus_t CUBLASWINAPI cublasSgemmBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const float *Aarray[], int lda, const float *Barray[], int ldb, + const float *beta, /* host or device pointer */ + float *Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const float *[], int, const float *[], int, const float *, + float *[], int, int); static auto func_ptr = LoadSymbol("cublasSgemmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } -cublasStatus_t CUBLASWINAPI cublasDgemmBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *Aarray[], - int lda, - const double *Barray[], - int ldb, - const double *beta, /* host or device pointer */ - double *Carray[], - int ldc, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *[], int, const double *[], int, const double *, double *[], int, int); +cublasStatus_t CUBLASWINAPI cublasDgemmBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, /* host or device pointer */ + const double *Aarray[], int lda, const double *Barray[], int ldb, + const double *beta, /* host or device pointer */ + double *Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const double *, const double *[], int, const double *[], int, + const double *, double *[], int, int); static auto func_ptr = LoadSymbol("cublasDgemmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } -cublasStatus_t CUBLASWINAPI cublasCgemmBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *Aarray[], - int lda, - const cuComplex *Barray[], - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *Carray[], - int ldc, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *[], int, const 
cuComplex *[], int, const cuComplex *, cuComplex *[], int, int); +cublasStatus_t CUBLASWINAPI cublasCgemmBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *Aarray[], int lda, const cuComplex *Barray[], int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *[], int, const cuComplex *[], int, + const cuComplex *, cuComplex *[], int, int); static auto func_ptr = LoadSymbol("cublasCgemmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } -cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *Aarray[], - int lda, - const cuComplex *Barray[], - int ldb, - const cuComplex *beta, /* host or device pointer */ - cuComplex *Carray[], - int ldc, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *[], int, const cuComplex *[], int, const cuComplex *, cuComplex *[], int, int); +cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *Aarray[], int lda, const cuComplex *Barray[], int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *[], int, const cuComplex *[], int, + const cuComplex *, cuComplex *[], int, int); static auto func_ptr = LoadSymbol("cublasCgemm3mBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } -cublasStatus_t CUBLASWINAPI cublasZgemmBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *Aarray[], - int lda, - const cuDoubleComplex *Barray[], - int ldb, - const cuDoubleComplex *beta, /* host or device pointer */ - cuDoubleComplex *Carray[], - int ldc, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *[], int, const cuDoubleComplex *[], int, const cuDoubleComplex *, cuDoubleComplex *[], int, int); +cublasStatus_t CUBLASWINAPI cublasZgemmBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex 
*Aarray[], int lda, const cuDoubleComplex *Barray[], + int ldb, const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *[], int, + const cuDoubleComplex *[], int, const cuDoubleComplex *, + cuDoubleComplex *[], int, int); static auto func_ptr = LoadSymbol("cublasZgemmBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, ldb, beta, Carray, ldc, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); } -cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - long long int strideA, /* purposely signed */ - const float *B, - int ldb, - long long int strideB, - const float *beta, /* host or device pointer */ - float *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, const float *, int, long long, const float *, int, long long, const float *, float *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, long long int strideA, /* purposely signed */ + const float *B, int ldb, long long int strideB, + const float *beta, /* host or device pointer */ + float *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const float *, int, long long, const float *, int, + long long, const float *, float *, int, long long, int); static auto func_ptr = LoadSymbol("cublasSgemmStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - long long int strideA, /* purposely signed */ - const double *B, - int ldb, - long long int strideB, - const double *beta, /* host or device pointer */ - double *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const double *, const double *, int, long long, const double *, int, long long, const double *, double *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, long long int strideA, /* purposely signed */ + 
const double *B, int ldb, long long int strideB, + const double *beta, /* host or device pointer */ + double *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const double *, const double *, int, long long, const double *, int, + long long, const double *, double *, int, long long, int); static auto func_ptr = LoadSymbol("cublasDgemmStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - long long int strideA, /* purposely signed */ - const cuComplex *B, - int ldb, - long long int strideB, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, long long int strideA, /* purposely signed */ + const cuComplex *B, int ldb, long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, long long, const cuComplex *, + int, long long, const cuComplex *, cuComplex *, int, long long, int); static auto func_ptr = LoadSymbol("cublasCgemmStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - long long int strideA, /* purposely signed */ - const cuComplex *B, - int ldb, - long long int strideB, - const cuComplex *beta, /* host or device pointer */ - cuComplex *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuComplex *, const cuComplex *, int, long long, const cuComplex *, int, long long, const cuComplex *, cuComplex *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex 
*alpha, /* host or device pointer */ + const cuComplex *A, int lda, long long int strideA, /* purposely signed */ + const cuComplex *B, int ldb, long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, long long, const cuComplex *, + int, long long, const cuComplex *, cuComplex *, int, long long, int); static auto func_ptr = LoadSymbol("cublasCgemm3mStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched (cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - int k, - const cuDoubleComplex *alpha, /* host or device pointer */ - const cuDoubleComplex *A, - int lda, - long long int strideA, /* purposely signed */ - const cuDoubleComplex *B, - int ldb, - long long int strideB, - const cuDoubleComplex *beta, /* host or device poi */ - cuDoubleComplex *C, - int ldc, - long long int strideC, - int batchCount) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int, long long, const cuDoubleComplex *, cuDoubleComplex *, int, long long, int); +cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + long long int strideA, /* purposely signed */ + const cuDoubleComplex *B, int ldb, long long int strideB, + const cuDoubleComplex *beta, /* host or device poi */ + cuDoubleComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, long long, + const cuDoubleComplex *, int, long long, const cuDoubleComplex *, + cuDoubleComplex *, int, long long, int); static auto func_ptr = LoadSymbol("cublasZgemmStridedBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); } -cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const float *alpha, /* host or device pointer */ - const float *A, - int lda, - const float *beta , /* host or device pointer */ - const float *B, - int ldb, - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const float *, const float *, int, const float *, const float *, int, float *, int); +cublasStatus_t CUBLASWINAPI cublasSgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const 
float *alpha, /* host or device pointer */ + const float *A, int lda, const float *beta, /* host or device pointer */ + const float *B, int ldb, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, const float *, int, + float *, int); static auto func_ptr = LoadSymbol("cublasSgeam"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const double *alpha, /* host or device pointer */ - const double *A, - int lda, - const double *beta, /* host or device pointer */ - const double *B, - int ldb, - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const double *, const double *, int, const double *, const double *, int, double *, int); +cublasStatus_t CUBLASWINAPI cublasDgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *beta, /* host or device pointer */ + const double *B, int ldb, double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, const double *, int, + double *, int); static auto func_ptr = LoadSymbol("cublasDgeam"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const cuComplex *alpha, /* host or device pointer */ - const cuComplex *A, - int lda, - const cuComplex *beta, /* host or device pointer */ - const cuComplex *B, - int ldb, - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuComplex *, const cuComplex *, int, const cuComplex *, const cuComplex *, int, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, + const cuComplex *beta, /* host or device pointer */ + const cuComplex *B, int ldb, cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgeam"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle, - cublasOperation_t transa, - cublasOperation_t transb, - int m, - int n, - const cuDoubleComplex *alpha, /* host or device 
pointer */ - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *beta, /* host or device pointer */ - const cuDoubleComplex *B, - int ldb, - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *, int, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + const cuDoubleComplex *B, int ldb, cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, + int); static auto func_ptr = LoadSymbol("cublasZgeam"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, ldc); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); } -cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle, - int n, - float *A[], /*Device pointer*/ - int lda, - int *P, /*Device Pointer*/ - int *info, /*Device Pointer*/ - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, float *[], int, int *, int *, int); +cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle, int n, + float *A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *[], + int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasSgetrfBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, A, lda, P, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle, - int n, - double *A[], /*Device pointer*/ - int lda, - int *P, /*Device Pointer*/ - int *info, /*Device Pointer*/ - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, double *[], int, int *, int *, int); +cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle, int n, + double *A[], /*Device pointer*/ + int lda, + int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, double *[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasDgetrfBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, A, lda, P, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle, - int n, - cuComplex *A[], /*Device pointer*/ - int lda, - int *P, /*Device Pointer*/ - int *info, /*Device Pointer*/ - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuComplex *[], int, int *, int *, int); +cublasStatus_t CUBLASWINAPI cublasCgetrfBatched( + cublasHandle_t handle, int n, cuComplex *A[], /*Device pointer*/ + int lda, int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *[], int, int *, int *, int); static auto 
func_ptr = LoadSymbol("cublasCgetrfBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, A, lda, P, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle, - int n, - cuDoubleComplex *A[], /*Device pointer*/ - int lda, - int *P, /*Device Pointer*/ - int *info, /*Device Pointer*/ - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, cuDoubleComplex *[], int, int *, int *, int); +cublasStatus_t CUBLASWINAPI cublasZgetrfBatched( + cublasHandle_t handle, int n, cuDoubleComplex *A[], /*Device pointer*/ + int lda, int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasZgetrfBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, A, lda, P, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle, - int n, - const float *A[], /*Device pointer*/ - int lda, - const int *P, /*Device pointer*/ - float *C[], /*Device pointer*/ - int ldc, - int *info, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *[], int, const int *, float *[], int, int *, int); +cublasStatus_t CUBLASWINAPI cublasSgetriBatched( + cublasHandle_t handle, int n, const float *A[], /*Device pointer*/ + int lda, const int *P, /*Device pointer*/ + float *C[], /*Device pointer*/ + int ldc, int *info, int batchSize) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const float *[], int, + const int *, float *[], int, int *, int); static auto func_ptr = LoadSymbol("cublasSgetriBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle, - int n, - const double *A[], /*Device pointer*/ - int lda, - const int *P, /*Device pointer*/ - double *C[], /*Device pointer*/ - int ldc, - int *info, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *[], int, const int *, double *[], int, int *, int); +cublasStatus_t CUBLASWINAPI cublasDgetriBatched( + cublasHandle_t handle, int n, const double *A[], /*Device pointer*/ + int lda, const int *P, /*Device pointer*/ + double *C[], /*Device pointer*/ + int ldc, int *info, int batchSize) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const double *[], int, + const int *, double *[], int, int *, int); static auto func_ptr = LoadSymbol("cublasDgetriBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle, - int n, - const cuComplex *A[], /*Device pointer*/ - int lda, - const int *P, /*Device pointer*/ - cuComplex *C[], /*Device pointer*/ - int ldc, - int *info, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *[], int, const int *, cuComplex *[], int, int *, int); +cublasStatus_t CUBLASWINAPI cublasCgetriBatched( + cublasHandle_t handle, int n, const cuComplex *A[], /*Device pointer*/ + int lda, const int *P, /*Device pointer*/ + cuComplex *C[], /*Device pointer*/ + int ldc, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const 
cuComplex *[], int, const int *, cuComplex *[],
+ int, int *, int);
 static auto func_ptr = LoadSymbol("cublasCgetriBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
 return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle,
- int n,
- const cuDoubleComplex *A[], /*Device pointer*/
- int lda,
- const int *P, /*Device pointer*/
- cuDoubleComplex *C[], /*Device pointer*/
- int ldc,
- int *info,
- int batchSize) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *[], int, const int *, cuDoubleComplex *[], int, int *, int);
+cublasStatus_t CUBLASWINAPI cublasZgetriBatched(
+ cublasHandle_t handle, int n, const cuDoubleComplex *A[], /*Device pointer*/
+ int lda, const int *P, /*Device pointer*/
+ cuDoubleComplex *C[], /*Device pointer*/
+ int ldc, int *info, int batchSize) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, int, const cuDoubleComplex *[], int, const int *,
+ cuDoubleComplex *[], int, int *, int);
 static auto func_ptr = LoadSymbol("cublasZgetriBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
 return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasSgetrsBatched( cublasHandle_t handle,
- cublasOperation_t trans,
- int n,
- int nrhs,
- const float *Aarray[],
- int lda,
- const int *devIpiv,
- float *Barray[],
- int ldb,
- int *info,
- int batchSize) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const float *[], int, const int *, float *[], int, int *, int);
+cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(cublasHandle_t handle,
+ cublasOperation_t trans, int n,
+ int nrhs, const float *Aarray[],
+ int lda, const int *devIpiv,
+ float *Barray[], int ldb,
+ int *info, int batchSize) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, cublasOperation_t, int, int, const float *[], int,
+ const int *, float *[], int, int *, int);
 static auto func_ptr = LoadSymbol("cublasSgetrsBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
- return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+ return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
+ info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( cublasHandle_t handle,
- cublasOperation_t trans,
- int n,
- int nrhs,
- const double *Aarray[],
- int lda,
- const int *devIpiv,
- double *Barray[],
- int ldb,
- int *info,
- int batchSize) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const double *[], int, const int *, double *[], int, int *, int);
+cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(
+ cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
+ const double *Aarray[], int lda, const int *devIpiv, double *Barray[],
+ int ldb, int *info, int batchSize) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, cublasOperation_t, int, int, const double *[], int,
+ const int *, double *[], int, int *, int);
 static auto func_ptr = LoadSymbol("cublasDgetrsBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
- return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+ return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
+ info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasCgetrsBatched( cublasHandle_t handle,
- cublasOperation_t trans,
- int n,
- int nrhs,
- const cuComplex *Aarray[],
- int lda,
- const int *devIpiv,
- cuComplex *Barray[],
- int ldb,
- int *info,
- int batchSize) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuComplex *[], int, const int *, cuComplex *[], int, int *, int);
+cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(
+ cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
+ const cuComplex *Aarray[], int lda, const int *devIpiv, cuComplex *Barray[],
+ int ldb, int *info, int batchSize) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, cublasOperation_t, int, int, const cuComplex *[], int,
+ const int *, cuComplex *[], int, int *, int);
 static auto func_ptr = LoadSymbol("cublasCgetrsBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
- return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+ return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
+ info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasZgetrsBatched( cublasHandle_t handle,
- cublasOperation_t trans,
- int n,
- int nrhs,
- const cuDoubleComplex *Aarray[],
- int lda,
- const int *devIpiv,
- cuDoubleComplex *Barray[],
- int ldb,
- int *info,
- int batchSize) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *[], int, const int *, cuDoubleComplex *[], int, int *, int);
+cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(
+ cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs,
+ const cuDoubleComplex *Aarray[], int lda, const int *devIpiv,
+ cuDoubleComplex *Barray[], int ldb, int *info, int batchSize) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *[],
+ int, const int *, cuDoubleComplex *[], int, int *, int);
 static auto func_ptr = LoadSymbol("cublasZgetrsBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
- return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
+ return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb,
+ info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasStrsmBatched( cublasHandle_t handle,
- cublasSideMode_t side,
- cublasFillMode_t uplo,
- cublasOperation_t trans,
- cublasDiagType_t diag,
- int m,
- int n,
- const float *alpha, /*Host or Device Pointer*/
- const float *A[],
- int lda,
- float *B[],
- int ldb,
- int batchCount) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const float *, const float *[], int, float *[], int, int);
+cublasStatus_t CUBLASWINAPI cublasStrsmBatched(
+ cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
+ cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
+ const float *alpha, /*Host or Device Pointer*/
+ const float *A[], int lda, float *B[], int ldb, int batchCount) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
+ cublasDiagType_t, int, int, const float *, const float *[], int,
+ float *[], int, int);
 static auto func_ptr = LoadSymbol("cublasStrsmBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
- return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+ return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
+ batchCount);
 }
-cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( cublasHandle_t handle,
- cublasSideMode_t side,
- cublasFillMode_t uplo,
- cublasOperation_t trans,
- cublasDiagType_t diag,
- int m,
- int n,
- const double *alpha, /*Host or Device Pointer*/
- const double *A[],
- int lda,
- double *B[],
- int ldb,
- int batchCount) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const double *, const double *[], int, double *[], int, int);
+cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(
+ cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
+ cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
+ const double *alpha, /*Host or Device Pointer*/
+ const double *A[], int lda, double *B[], int ldb, int batchCount) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
+ cublasDiagType_t, int, int, const double *, const double *[], int,
+ double *[], int, int);
 static auto func_ptr = LoadSymbol("cublasDtrsmBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
- return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+ return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
+ batchCount);
 }
-cublasStatus_t CUBLASWINAPI cublasCtrsmBatched( cublasHandle_t handle,
- cublasSideMode_t side,
- cublasFillMode_t uplo,
- cublasOperation_t trans,
- cublasDiagType_t diag,
- int m,
- int n,
- const cuComplex *alpha, /*Host or Device Pointer*/
- const cuComplex *A[],
- int lda,
- cuComplex *B[],
- int ldb,
- int batchCount) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuComplex *, const cuComplex *[], int, cuComplex *[], int, int);
+cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(
+ cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
+ cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
+ const cuComplex *alpha, /*Host or Device Pointer*/
+ const cuComplex *A[], int lda, cuComplex *B[], int ldb, int batchCount) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
+ cublasDiagType_t, int, int, const cuComplex *, const cuComplex *[], int,
+ cuComplex *[], int, int);
 static auto func_ptr = LoadSymbol("cublasCtrsmBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
- return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+ return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
+ batchCount);
 }
-cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( cublasHandle_t handle,
- cublasSideMode_t side,
- cublasFillMode_t uplo,
- cublasOperation_t trans,
- cublasDiagType_t diag,
- int m,
- int n,
- const cuDoubleComplex *alpha, /*Host or Device Pointer*/
- const cuDoubleComplex *A[],
- int lda,
- cuDoubleComplex *B[],
- int ldb,
- int batchCount) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, int, int, const cuDoubleComplex *, const cuDoubleComplex *[], int, cuDoubleComplex *[], int, int);
+cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(
+ cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo,
+ cublasOperation_t trans, cublasDiagType_t diag, int m, int n,
+ const cuDoubleComplex *alpha, /*Host or Device Pointer*/
+ const cuDoubleComplex *A[], int lda, cuDoubleComplex *B[], int ldb,
+ int batchCount) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t,
+ cublasDiagType_t, int, int, const cuDoubleComplex *,
+ const cuDoubleComplex *[], int, cuDoubleComplex *[], int, int);
 static auto func_ptr = LoadSymbol("cublasZtrsmBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
- return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, batchCount);
+ return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb,
+ batchCount);
 }
-cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle,
- int n,
- const float *A[], /*Device pointer*/
- int lda,
- float *Ainv[], /*Device pointer*/
- int lda_inv,
- int *info, /*Device Pointer*/
- int batchSize) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const float *[], int, float *[], int, int *, int);
+cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(
+ cublasHandle_t handle, int n, const float *A[], /*Device pointer*/
+ int lda, float *Ainv[], /*Device pointer*/
+ int lda_inv, int *info, /*Device Pointer*/
+ int batchSize) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, int, const float *[], int, float *[], int, int *, int);
 static auto func_ptr = LoadSymbol("cublasSmatinvBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
 return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle,
- int n,
- const double *A[], /*Device pointer*/
- int lda,
- double *Ainv[], /*Device pointer*/
- int lda_inv,
- int *info, /*Device Pointer*/
- int batchSize) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const double *[], int, double *[], int, int *, int);
+cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(
+ cublasHandle_t handle, int n, const double *A[], /*Device pointer*/
+ int lda, double *Ainv[], /*Device pointer*/
+ int lda_inv, int *info, /*Device Pointer*/
+ int batchSize) {
+ using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(
+ cublasHandle_t, int, const double *[], int, double *[], int, int *, int);
 static auto func_ptr = LoadSymbol("cublasDmatinvBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
 return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle,
- int n,
- const cuComplex *A[], /*Device pointer*/
- int lda,
- cuComplex *Ainv[], /*Device pointer*/
- int lda_inv,
- int *info, /*Device Pointer*/
- int batchSize) {
- using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *[], int, cuComplex *[], int, int *, int);
+cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(
+ cublasHandle_t handle, int n, const cuComplex *A[], /*Device pointer*/
+ int lda, cuComplex *Ainv[], /*Device pointer*/
+ int lda_inv, int *info, /*Device Pointer*/
+ int batchSize) {
+ using FuncPtr =
+ cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *[],
+ int, cuComplex *[], int, int *, int);
 static auto func_ptr = LoadSymbol("cublasCmatinvBatched");
 if (!func_ptr) return GetSymbolNotFoundError();
 return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize);
 }
-cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle,
- int n,
- const cuDoubleComplex *A[], /*Device pointer*/
- int lda,
- cuDoubleComplex *Ainv[], /*Device pointer*/
- int lda_inv,
- int *info, /*Device Pointer*/
- int batchSize) {
- using
FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, const cuDoubleComplex *[], int, cuDoubleComplex *[], int, int *, int); +cublasStatus_t CUBLASWINAPI cublasZmatinvBatched( + cublasHandle_t handle, int n, const cuDoubleComplex *A[], /*Device pointer*/ + int lda, cuDoubleComplex *Ainv[], /*Device pointer*/ + int lda_inv, int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *[], int, cuDoubleComplex *[], + int, int *, int); static auto func_ptr = LoadSymbol("cublasZmatinvBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched( cublasHandle_t handle, - int m, - int n, - float *Aarray[], /*Device pointer*/ - int lda, - float *TauArray[], /* Device pointer*/ - int *info, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, float *[], int, float *[], int *, int); +cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched( + cublasHandle_t handle, int m, int n, float *Aarray[], /*Device pointer*/ + int lda, float *TauArray[], /* Device pointer*/ + int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, float *[], int, float *[], int *, int); static auto func_ptr = LoadSymbol("cublasSgeqrfBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched( cublasHandle_t handle, - int m, - int n, - double *Aarray[], /*Device pointer*/ - int lda, - double *TauArray[], /* Device pointer*/ - int *info, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, double *[], int, double *[], int *, int); +cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched( + cublasHandle_t handle, int m, int n, double *Aarray[], /*Device pointer*/ + int lda, double *TauArray[], /* Device pointer*/ + int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, double *[], int, double *[], int *, int); static auto func_ptr = LoadSymbol("cublasDgeqrfBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched( cublasHandle_t handle, - int m, - int n, - cuComplex *Aarray[], /*Device pointer*/ - int lda, - cuComplex *TauArray[], /* Device pointer*/ - int *info, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, cuComplex *[], int, cuComplex *[], int *, int); +cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched( + cublasHandle_t handle, int m, int n, cuComplex *Aarray[], /*Device pointer*/ + int lda, cuComplex *TauArray[], /* Device pointer*/ + int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, cuComplex *[], int, cuComplex *[], int *, int); static auto func_ptr = LoadSymbol("cublasCgeqrfBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched( cublasHandle_t handle, - int m, - int n, - cuDoubleComplex *Aarray[], /*Device pointer*/ - int lda, - cuDoubleComplex *TauArray[], /* Device pointer*/ - int *info, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, int, int, 
cuDoubleComplex *[], int, cuDoubleComplex *[], int *, int); +cublasStatus_t CUBLASWINAPI +cublasZgeqrfBatched(cublasHandle_t handle, int m, int n, + cuDoubleComplex *Aarray[], /*Device pointer*/ + int lda, cuDoubleComplex *TauArray[], /* Device pointer*/ + int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, cuDoubleComplex *[], int, cuDoubleComplex *[], + int *, int); static auto func_ptr = LoadSymbol("cublasZgeqrfBatched"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); } -cublasStatus_t CUBLASWINAPI cublasSgelsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int nrhs, - float *Aarray[], /*Device pointer*/ - int lda, - float *Carray[], /* Device pointer*/ - int ldc, - int *info, - int *devInfoArray, /* Device pointer*/ - int batchSize ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, float *[], int, float *[], int, int *, int *, int); +cublasStatus_t CUBLASWINAPI +cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int nrhs, float *Aarray[], /*Device pointer*/ + int lda, float *Carray[], /* Device pointer*/ + int ldc, int *info, int *devInfoArray, /* Device pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, float *[], int, + float *[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasSgelsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); } -cublasStatus_t CUBLASWINAPI cublasDgelsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int nrhs, - double *Aarray[], /*Device pointer*/ - int lda, - double *Carray[], /* Device pointer*/ - int ldc, - int *info, - int *devInfoArray, /* Device pointer*/ - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, double *[], int, double *[], int, int *, int *, int); +cublasStatus_t CUBLASWINAPI +cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int nrhs, double *Aarray[], /*Device pointer*/ + int lda, double *Carray[], /* Device pointer*/ + int ldc, int *info, int *devInfoArray, /* Device pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, double *[], int, + double *[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasDgelsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); } -cublasStatus_t CUBLASWINAPI cublasCgelsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int nrhs, - cuComplex *Aarray[], /*Device pointer*/ - int lda, - cuComplex *Carray[], /* Device pointer*/ - int ldc, - int *info, - int *devInfoArray, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *[], int, cuComplex *[], int, int *, int *, int); +cublasStatus_t CUBLASWINAPI +cublasCgelsBatched(cublasHandle_t 
handle, cublasOperation_t trans, int m, int n, + int nrhs, cuComplex *Aarray[], /*Device pointer*/ + int lda, cuComplex *Carray[], /* Device pointer*/ + int ldc, int *info, int *devInfoArray, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *[], int, + cuComplex *[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasCgelsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); } -cublasStatus_t CUBLASWINAPI cublasZgelsBatched( cublasHandle_t handle, - cublasOperation_t trans, - int m, - int n, - int nrhs, - cuDoubleComplex *Aarray[], /*Device pointer*/ - int lda, - cuDoubleComplex *Carray[], /* Device pointer*/ - int ldc, - int *info, - int *devInfoArray, - int batchSize) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasOperation_t, int, int, int, cuDoubleComplex *[], int, cuDoubleComplex *[], int, int *, int *, int); +cublasStatus_t CUBLASWINAPI +cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int nrhs, cuDoubleComplex *Aarray[], /*Device pointer*/ + int lda, cuDoubleComplex *Carray[], /* Device pointer*/ + int ldc, int *info, int *devInfoArray, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, cuDoubleComplex *[], + int, cuDoubleComplex *[], int, int *, int *, int); static auto func_ptr = LoadSymbol("cublasZgelsBatched"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, devInfoArray, batchSize); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); } cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const float *A, - int lda, - const float *x, - int incx, - float *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const float *, int, const float *, int, float *, int); + cublasSideMode_t mode, int m, int n, + const float *A, int lda, const float *x, + int incx, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSdgmm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); } cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const double *A, - int lda, - const double *x, - int incx, - double *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const double *, int, const double *, int, double *, int); + cublasSideMode_t mode, int m, int n, + const double *A, int lda, + const double *x, int incx, double *C, + int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDdgmm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); } cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle, - 
cublasSideMode_t mode, - int m, - int n, - const cuComplex *A, - int lda, - const cuComplex *x, - int incx, - cuComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + cublasSideMode_t mode, int m, int n, + const cuComplex *A, int lda, + const cuComplex *x, int incx, + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCdgmm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); } cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle, - cublasSideMode_t mode, - int m, - int n, - const cuDoubleComplex *A, - int lda, - const cuDoubleComplex *x, - int incx, - cuDoubleComplex *C, - int ldc) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + cublasSideMode_t mode, int m, int n, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdgmm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); } -cublasStatus_t CUBLASWINAPI cublasStpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *AP, - float *A, - int lda ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, float *, int); +cublasStatus_t CUBLASWINAPI cublasStpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const float *AP, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpttr"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, AP, A, lda); } -cublasStatus_t CUBLASWINAPI cublasDtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *AP, - double *A, - int lda ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, double *, int); +cublasStatus_t CUBLASWINAPI cublasDtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const double *AP, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, double *, int); static auto func_ptr = LoadSymbol("cublasDtpttr"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, AP, A, lda); } -cublasStatus_t CUBLASWINAPI cublasCtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *AP, - cuComplex *A, - int lda ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, cuComplex *, int); +cublasStatus_t CUBLASWINAPI cublasCtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *AP, cuComplex *A, + int lda) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, + const cuComplex *, cuComplex *, int); 
static auto func_ptr = LoadSymbol("cublasCtpttr"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, AP, A, lda); } -cublasStatus_t CUBLASWINAPI cublasZtpttr ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *AP, - cuDoubleComplex *A, - int lda ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); +cublasStatus_t CUBLASWINAPI cublasZtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpttr"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, AP, A, lda); } -cublasStatus_t CUBLASWINAPI cublasStrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const float *A, - int lda, - float *AP ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const float *, int, float *); +cublasStatus_t CUBLASWINAPI cublasStrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const float *A, int lda, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, int, float *); static auto func_ptr = LoadSymbol("cublasStrttp"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, A, lda, AP); } -cublasStatus_t CUBLASWINAPI cublasDtrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const double *A, - int lda, - double *AP ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const double *, int, double *); +cublasStatus_t CUBLASWINAPI cublasDtrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const double *A, int lda, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDtrttp"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, A, lda, AP); } -cublasStatus_t CUBLASWINAPI cublasCtrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuComplex *A, - int lda, - cuComplex *AP ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuComplex *, int, cuComplex *); +cublasStatus_t CUBLASWINAPI cublasCtrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *A, int lda, + cuComplex *AP) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, + const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasCtrttp"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, A, lda, AP); } -cublasStatus_t CUBLASWINAPI cublasZtrttp ( cublasHandle_t handle, - cublasFillMode_t uplo, - int n, - const cuDoubleComplex *A, - int lda, - cuDoubleComplex *AP ) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, cuDoubleComplex *); +cublasStatus_t CUBLASWINAPI cublasZtrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, + 
cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZtrttp"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, uplo, n, A, lda, AP); } -cublasStatus CUBLASWINAPI cublasInit (void) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(); +cublasStatus CUBLASWINAPI cublasInit(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); static auto func_ptr = LoadSymbol("cublasInit"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(); } -cublasStatus CUBLASWINAPI cublasShutdown (void) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(); +cublasStatus CUBLASWINAPI cublasShutdown(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); static auto func_ptr = LoadSymbol("cublasShutdown"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(); } -cublasStatus CUBLASWINAPI cublasGetError (void) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(); +cublasStatus CUBLASWINAPI cublasGetError(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); static auto func_ptr = LoadSymbol("cublasGetError"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(); } cublasStatus CUBLASWINAPI cublasGetVersion(int *version) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int *); + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int *); static auto func_ptr = LoadSymbol("cublasGetVersion"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(version); } -cublasStatus CUBLASWINAPI cublasAlloc (int n, int elemSize, void **devicePtr) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(int, int, void **); +cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void **devicePtr) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, void **); static auto func_ptr = LoadSymbol("cublasAlloc"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(n, elemSize, devicePtr); } -cublasStatus CUBLASWINAPI cublasFree (void *devicePtr) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(void *); +cublasStatus CUBLASWINAPI cublasFree(void *devicePtr) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(void *); static auto func_ptr = LoadSymbol("cublasFree"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(devicePtr); } -cublasStatus CUBLASWINAPI cublasSetKernelStream (cudaStream_t stream) { - using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cudaStream_t); +cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cudaStream_t); static auto func_ptr = LoadSymbol("cublasSetKernelStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stream); } -float CUBLASWINAPI cublasSnrm2 (int n, const float *x, int incx) { - using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int); +float CUBLASWINAPI cublasSnrm2(int n, const float *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int); static auto func_ptr = LoadSymbol("cublasSnrm2"); if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2"); return func_ptr(n, x, incx); } -double CUBLASWINAPI cublasDnrm2 (int n, const double *x, int incx) { - using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int); +double CUBLASWINAPI cublasDnrm2(int n, const double *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int); static auto func_ptr = LoadSymbol("cublasDnrm2"); if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2"); return func_ptr(n, x, incx); } -float CUBLASWINAPI cublasScnrm2 (int n, const cuComplex *x, int incx) { - using 
FuncPtr = float (CUBLASWINAPI *)(int, const cuComplex *, int); +float CUBLASWINAPI cublasScnrm2(int n, const cuComplex *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasScnrm2"); if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2"); return func_ptr(n, x, incx); } -double CUBLASWINAPI cublasDznrm2 (int n, const cuDoubleComplex *x, int incx) { - using FuncPtr = double (CUBLASWINAPI *)(int, const cuDoubleComplex *, int); +double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasDznrm2"); if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2"); return func_ptr(n, x, incx); } -float CUBLASWINAPI cublasSdot (int n, const float *x, int incx, const float *y, - int incy) { - using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int, const float *, int); +float CUBLASWINAPI cublasSdot(int n, const float *x, int incx, const float *y, + int incy) { + using FuncPtr = + float(CUBLASWINAPI *)(int, const float *, int, const float *, int); static auto func_ptr = LoadSymbol("cublasSdot"); if (!func_ptr) LogFatalSymbolNotFound("cublasSdot"); return func_ptr(n, x, incx, y, incy); } -double CUBLASWINAPI cublasDdot (int n, const double *x, int incx, const double *y, - int incy) { - using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int, const double *, int); +double CUBLASWINAPI cublasDdot(int n, const double *x, int incx, + const double *y, int incy) { + using FuncPtr = + double(CUBLASWINAPI *)(int, const double *, int, const double *, int); static auto func_ptr = LoadSymbol("cublasDdot"); if (!func_ptr) LogFatalSymbolNotFound("cublasDdot"); return func_ptr(n, x, incx, y, incy); } -cuComplex CUBLASWINAPI cublasCdotu (int n, const cuComplex *x, int incx, const cuComplex *y, - int incy) { - using FuncPtr = cuComplex (CUBLASWINAPI *)(int, const cuComplex *, int, const cuComplex *, int); +cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex *x, int incx, + const cuComplex *y, int incy) { + using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int, + const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCdotu"); if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu"); return func_ptr(n, x, incx, y, incy); } -cuComplex CUBLASWINAPI cublasCdotc (int n, const cuComplex *x, int incx, const cuComplex *y, - int incy) { - using FuncPtr = cuComplex (CUBLASWINAPI *)(int, const cuComplex *, int, const cuComplex *, int); +cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex *x, int incx, + const cuComplex *y, int incy) { + using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int, + const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCdotc"); if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc"); return func_ptr(n, x, incx, y, incy); } -cuDoubleComplex CUBLASWINAPI cublasZdotu (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy) { - using FuncPtr = cuDoubleComplex (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); +cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, + int incy) { + using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)( + int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdotu"); if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu"); return func_ptr(n, x, incx, 
y, incy); } -cuDoubleComplex CUBLASWINAPI cublasZdotc (int n, const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy) { - using FuncPtr = cuDoubleComplex (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); +cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, + int incy) { + using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)( + int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdotc"); if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasSscal (int n, float alpha, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, float, float *, int); +void CUBLASWINAPI cublasSscal(int n, float alpha, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasSscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasDscal (int n, double alpha, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, double, double *, int); +void CUBLASWINAPI cublasDscal(int n, double alpha, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasDscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasCscal (int n, cuComplex alpha, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasCscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasZscal (int n, cuDoubleComplex alpha, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex *x, + int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasZscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasCsscal (int n, float alpha, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, float, cuComplex *, int); +void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, float, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasZdscal (int n, double alpha, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(int, double, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, double, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZdscal"); if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal"); return func_ptr(n, alpha, x, incx); } -void CUBLASWINAPI cublasSaxpy (int n, float alpha, const float *x, int incx, - float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, float, const float *, int, float *, 
int); +void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float *x, int incx, + float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSaxpy"); if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy"); return func_ptr(n, alpha, x, incx, y, incy); } -void CUBLASWINAPI cublasDaxpy (int n, double alpha, const double *x, - int incx, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, double, const double *, int, double *, int); +void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double *x, int incx, + double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDaxpy"); if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy"); return func_ptr(n, alpha, x, incx, y, incy); } -void CUBLASWINAPI cublasCaxpy (int n, cuComplex alpha, const cuComplex *x, - int incx, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex *x, + int incx, cuComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCaxpy"); if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy"); return func_ptr(n, alpha, x, incx, y, incy); } -void CUBLASWINAPI cublasZaxpy (int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZaxpy(int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZaxpy"); if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy"); return func_ptr(n, alpha, x, incx, y, incy); } -void CUBLASWINAPI cublasScopy (int n, const float *x, int incx, float *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, const float *, int, float *, int); +void CUBLASWINAPI cublasScopy(int n, const float *x, int incx, float *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasScopy"); if (!func_ptr) LogFatalSymbolNotFound("cublasScopy"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasDcopy (int n, const double *x, int incx, double *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, const double *, int, double *, int); +void CUBLASWINAPI cublasDcopy(int n, const double *x, int incx, double *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDcopy"); if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasCcopy (int n, const cuComplex *x, int incx, cuComplex *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCcopy(int n, const cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCcopy"); if (!func_ptr) 
LogFatalSymbolNotFound("cublasCcopy"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasZcopy (int n, const cuDoubleComplex *x, int incx, cuDoubleComplex *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZcopy"); if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasSswap (int n, float *x, int incx, float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int); +void CUBLASWINAPI cublasSswap(int n, float *x, int incx, float *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSswap"); if (!func_ptr) LogFatalSymbolNotFound("cublasSswap"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasDswap (int n, double *x, int incx, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int); +void CUBLASWINAPI cublasDswap(int n, double *x, int incx, double *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDswap"); if (!func_ptr) LogFatalSymbolNotFound("cublasDswap"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasCswap (int n, cuComplex *x, int incx, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCswap(int n, cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCswap"); if (!func_ptr) LogFatalSymbolNotFound("cublasCswap"); return func_ptr(n, x, incx, y, incy); } -void CUBLASWINAPI cublasZswap (int n, cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZswap"); if (!func_ptr) LogFatalSymbolNotFound("cublasZswap"); return func_ptr(n, x, incx, y, incy); } -int CUBLASWINAPI cublasIsamax (int n, const float *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const float *, int); +int CUBLASWINAPI cublasIsamax(int n, const float *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int); static auto func_ptr = LoadSymbol("cublasIsamax"); if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIdamax (int n, const double *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const double *, int); +int CUBLASWINAPI cublasIdamax(int n, const double *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int); static auto func_ptr = LoadSymbol("cublasIdamax"); if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIcamax (int n, const cuComplex *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const cuComplex *, int); +int CUBLASWINAPI cublasIcamax(int n, 
const cuComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasIcamax"); if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIzamax (int n, const cuDoubleComplex *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const cuDoubleComplex *, int); +int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasIzamax"); if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIsamin (int n, const float *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const float *, int); +int CUBLASWINAPI cublasIsamin(int n, const float *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int); static auto func_ptr = LoadSymbol("cublasIsamin"); if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIdamin (int n, const double *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const double *, int); +int CUBLASWINAPI cublasIdamin(int n, const double *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int); static auto func_ptr = LoadSymbol("cublasIdamin"); if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIcamin (int n, const cuComplex *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const cuComplex *, int); +int CUBLASWINAPI cublasIcamin(int n, const cuComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasIcamin"); if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin"); return func_ptr(n, x, incx); } -int CUBLASWINAPI cublasIzamin (int n, const cuDoubleComplex *x, int incx) { - using FuncPtr = int (CUBLASWINAPI *)(int, const cuDoubleComplex *, int); +int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasIzamin"); if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin"); return func_ptr(n, x, incx); } -float CUBLASWINAPI cublasSasum (int n, const float *x, int incx) { - using FuncPtr = float (CUBLASWINAPI *)(int, const float *, int); +float CUBLASWINAPI cublasSasum(int n, const float *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int); static auto func_ptr = LoadSymbol("cublasSasum"); if (!func_ptr) LogFatalSymbolNotFound("cublasSasum"); return func_ptr(n, x, incx); } -double CUBLASWINAPI cublasDasum (int n, const double *x, int incx) { - using FuncPtr = double (CUBLASWINAPI *)(int, const double *, int); +double CUBLASWINAPI cublasDasum(int n, const double *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int); static auto func_ptr = LoadSymbol("cublasDasum"); if (!func_ptr) LogFatalSymbolNotFound("cublasDasum"); return func_ptr(n, x, incx); } -float CUBLASWINAPI cublasScasum (int n, const cuComplex *x, int incx) { - using FuncPtr = float (CUBLASWINAPI *)(int, const cuComplex *, int); +float CUBLASWINAPI cublasScasum(int n, const cuComplex *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int); static auto func_ptr = LoadSymbol("cublasScasum"); if (!func_ptr) LogFatalSymbolNotFound("cublasScasum"); return 
func_ptr(n, x, incx); } -double CUBLASWINAPI cublasDzasum (int n, const cuDoubleComplex *x, int incx) { - using FuncPtr = double (CUBLASWINAPI *)(int, const cuDoubleComplex *, int); +double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasDzasum"); if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum"); return func_ptr(n, x, incx); } -void CUBLASWINAPI cublasSrot (int n, float *x, int incx, float *y, int incy, - float sc, float ss) { - using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int, float, float); +void CUBLASWINAPI cublasSrot(int n, float *x, int incx, float *y, int incy, + float sc, float ss) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float *, int, float *, int, float, float); static auto func_ptr = LoadSymbol("cublasSrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasSrot"); return func_ptr(n, x, incx, y, incy, sc, ss); } -void CUBLASWINAPI cublasDrot (int n, double *x, int incx, double *y, int incy, - double sc, double ss) { - using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int, double, double); +void CUBLASWINAPI cublasDrot(int n, double *x, int incx, double *y, int incy, + double sc, double ss) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double *, int, double *, int, double, double); static auto func_ptr = LoadSymbol("cublasDrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasDrot"); return func_ptr(n, x, incx, y, incy, sc, ss); } -void CUBLASWINAPI cublasCrot (int n, cuComplex *x, int incx, cuComplex *y, - int incy, float c, cuComplex s) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, float, cuComplex); +void CUBLASWINAPI cublasCrot(int n, cuComplex *x, int incx, cuComplex *y, + int incy, float c, cuComplex s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, + float, cuComplex); static auto func_ptr = LoadSymbol("cublasCrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasCrot"); return func_ptr(n, x, incx, y, incy, c, s); } -void CUBLASWINAPI cublasZrot (int n, cuDoubleComplex *x, int incx, - cuDoubleComplex *y, int incy, double sc, - cuDoubleComplex cs) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, double, cuDoubleComplex); +void CUBLASWINAPI cublasZrot(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, double sc, + cuDoubleComplex cs) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, + double, cuDoubleComplex); static auto func_ptr = LoadSymbol("cublasZrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasZrot"); return func_ptr(n, x, incx, y, incy, sc, cs); } -void CUBLASWINAPI cublasCsrot (int n, cuComplex *x, int incx, cuComplex *y, - int incy, float c, float s) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, float, float); +void CUBLASWINAPI cublasCsrot(int n, cuComplex *x, int incx, cuComplex *y, + int incy, float c, float s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, + float, float); static auto func_ptr = LoadSymbol("cublasCsrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot"); return func_ptr(n, x, incx, y, incy, c, s); } -void CUBLASWINAPI cublasZdrot (int n, cuDoubleComplex *x, int incx, - cuDoubleComplex *y, int incy, double c, double s) { - using FuncPtr = void (CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, 
int, double, double); +void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, double c, + double s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, + cuDoubleComplex *, int, double, double); static auto func_ptr = LoadSymbol("cublasZdrot"); if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot"); return func_ptr(n, x, incx, y, incy, c, s); } -void CUBLASWINAPI cublasSrotg (float *sa, float *sb, float *sc, float *ss) { - using FuncPtr = void (CUBLASWINAPI *)(float *, float *, float *, float *); +void CUBLASWINAPI cublasSrotg(float *sa, float *sb, float *sc, float *ss) { + using FuncPtr = void(CUBLASWINAPI *)(float *, float *, float *, float *); static auto func_ptr = LoadSymbol("cublasSrotg"); if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg"); return func_ptr(sa, sb, sc, ss); } -void CUBLASWINAPI cublasDrotg (double *sa, double *sb, double *sc, double *ss) { - using FuncPtr = void (CUBLASWINAPI *)(double *, double *, double *, double *); +void CUBLASWINAPI cublasDrotg(double *sa, double *sb, double *sc, double *ss) { + using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, double *); static auto func_ptr = LoadSymbol("cublasDrotg"); if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg"); return func_ptr(sa, sb, sc, ss); } -void CUBLASWINAPI cublasCrotg (cuComplex *ca, cuComplex cb, float *sc, - cuComplex *cs) { - using FuncPtr = void (CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *); +void CUBLASWINAPI cublasCrotg(cuComplex *ca, cuComplex cb, float *sc, + cuComplex *cs) { + using FuncPtr = + void(CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *); static auto func_ptr = LoadSymbol("cublasCrotg"); if (!func_ptr) LogFatalSymbolNotFound("cublasCrotg"); return func_ptr(ca, cb, sc, cs); } -void CUBLASWINAPI cublasZrotg (cuDoubleComplex *ca, cuDoubleComplex cb, double *sc, - cuDoubleComplex *cs) { - using FuncPtr = void (CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex, double *, cuDoubleComplex *); +void CUBLASWINAPI cublasZrotg(cuDoubleComplex *ca, cuDoubleComplex cb, + double *sc, cuDoubleComplex *cs) { + using FuncPtr = void(CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex, + double *, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZrotg"); if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg"); return func_ptr(ca, cb, sc, cs); } -void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, - const float* sparam) { - using FuncPtr = void (CUBLASWINAPI *)(int, float *, int, float *, int, const float *); +void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, + const float *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float *, int, float *, int, const float *); static auto func_ptr = LoadSymbol("cublasSrotm"); if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm"); return func_ptr(n, x, incx, y, incy, sparam); } -void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, - const double* sparam) { - using FuncPtr = void (CUBLASWINAPI *)(int, double *, int, double *, int, const double *); +void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, + const double *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double *, int, double *, int, const double *); static auto func_ptr = LoadSymbol("cublasDrotm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm"); return func_ptr(n, x, incx, y, incy, sparam); } -void CUBLASWINAPI cublasSrotmg (float *sd1, float *sd2, float *sx1, - const float 
*sy1, float* sparam) { - using FuncPtr = void (CUBLASWINAPI *)(float *, float *, float *, const float *, float *); +void CUBLASWINAPI cublasSrotmg(float *sd1, float *sd2, float *sx1, + const float *sy1, float *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(float *, float *, float *, const float *, float *); static auto func_ptr = LoadSymbol("cublasSrotmg"); if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg"); return func_ptr(sd1, sd2, sx1, sy1, sparam); } -void CUBLASWINAPI cublasDrotmg (double *sd1, double *sd2, double *sx1, - const double *sy1, double* sparam) { - using FuncPtr = void (CUBLASWINAPI *)(double *, double *, double *, const double *, double *); +void CUBLASWINAPI cublasDrotmg(double *sd1, double *sd2, double *sx1, + const double *sy1, double *sparam) { + using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, + const double *, double *); static auto func_ptr = LoadSymbol("cublasDrotmg"); if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg"); return func_ptr(sd1, sd2, sx1, sy1, sparam); } -void CUBLASWINAPI cublasSgemv (char trans, int m, int n, float alpha, - const float *A, int lda, const float *x, int incx, - float beta, float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSgemv(char trans, int m, int n, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSgemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv"); return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDgemv (char trans, int m, int n, double alpha, - const double *A, int lda, const double *x, int incx, - double beta, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDgemv(char trans, int m, int n, double alpha, + const double *A, int lda, const double *x, + int incx, double beta, double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDgemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv"); return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasCgemv (char trans, int m, int n, cuComplex alpha, - const cuComplex *A, int lda, const cuComplex *x, int incx, - cuComplex beta, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCgemv(char trans, int m, int n, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv"); return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasZgemv (char trans, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, - cuDoubleComplex 
beta, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgemv(char trans, int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv"); return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasSgbmv (char trans, int m, int n, int kl, int ku, - float alpha, const float *A, int lda, - const float *x, int incx, float beta, float *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSgbmv(char trans, int m, int n, int kl, int ku, + float alpha, const float *A, int lda, + const float *x, int incx, float beta, float *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSgbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv"); return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDgbmv (char trans, int m, int n, int kl, int ku, - double alpha, const double *A, int lda, - const double *x, int incx, double beta, double *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDgbmv(char trans, int m, int n, int kl, int ku, + double alpha, const double *A, int lda, + const double *x, int incx, double beta, double *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, int, int, double, const double *, + int, const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDgbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv"); return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasCgbmv (char trans, int m, int n, int kl, int ku, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *x, int incx, cuComplex beta, cuComplex *y, - int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCgbmv(char trans, int m, int n, int kl, int ku, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *x, int incx, cuComplex beta, + cuComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv"); return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasZgbmv (char trans, int m, int n, int kl, int ku, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *x, int incx, cuDoubleComplex beta, cuDoubleComplex *y, - int incy) { - using 
FuncPtr = void (CUBLASWINAPI *)(char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgbmv(char trans, int m, int n, int kl, int ku, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv"); return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasStrmv (char uplo, char trans, char diag, int n, - const float *A, int lda, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, int, float *, int); +void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *, + int, float *, int); static auto func_ptr = LoadSymbol("cublasStrmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasDtrmv (char uplo, char trans, char diag, int n, - const double *A, int lda, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasCtrmv (char uplo, char trans, char diag, int n, - const cuComplex *A, int lda, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtrmv(char uplo, char trans, char diag, int n, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasZtrmv (char uplo, char trans, char diag, int n, - const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtrmv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasStbmv (char uplo, char trans, char diag, int n, int k, - const float *A, int lda, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const float *, int, float *, int); +void 
CUBLASWINAPI cublasStbmv(char uplo, char trans, char diag, int n, int k, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasDtbmv (char uplo, char trans, char diag, int n, int k, - const double *A, int lda, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDtbmv(char uplo, char trans, char diag, int n, int k, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasCtbmv (char uplo, char trans, char diag, int n, int k, - const cuComplex *A, int lda, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtbmv(char uplo, char trans, char diag, int n, int k, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasZtbmv (char uplo, char trans, char diag, int n, int k, - const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtbmv(char uplo, char trans, char diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float *AP, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); +void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, + const float *AP, float *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, double *, int); +void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, + const double *AP, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDtpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } 
-void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, cuComplex *, int); +void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, + const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *AP, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float *A, int lda, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, int, float *, int); +void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *, + int, float *, int); static auto func_ptr = LoadSymbol("cublasStrsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double *A, int lda, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex *A, int lda, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *A, int lda, +void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *A, int lda, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, + 
cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv"); return func_ptr(uplo, trans, diag, n, A, lda, x, incx); } -void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float *AP, - float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); +void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, + const float *AP, float *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); static auto func_ptr = LoadSymbol("cublasStpsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double *AP, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const double *, double *, int); +void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, + const double *AP, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + double *, int); static auto func_ptr = LoadSymbol("cublasDtpsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex *AP, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuComplex *, cuComplex *, int); +void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, + const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtpsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex *AP, - cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *AP, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtpsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv"); return func_ptr(uplo, trans, diag, n, AP, x, incx); } -void CUBLASWINAPI cublasStbsv(char uplo, char trans, - char diag, int n, int k, const float *A, - int lda, float *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const float *, int, float *, int); +void CUBLASWINAPI cublasStbsv(char uplo, char trans, char diag, int n, int k, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStbsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasDtbsv(char uplo, char trans, - char diag, int n, int k, const double *A, - int lda, double *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDtbsv(char uplo, char trans, char diag, int n, int k, + const double *A, 
int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtbsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasCtbsv(char uplo, char trans, - char diag, int n, int k, const cuComplex *A, - int lda, cuComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtbsv(char uplo, char trans, char diag, int n, int k, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, int, const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtbsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasZtbsv(char uplo, char trans, - char diag, int n, int k, const cuDoubleComplex *A, - int lda, cuDoubleComplex *x, int incx) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtbsv(char uplo, char trans, char diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtbsv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv"); return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); } -void CUBLASWINAPI cublasSsymv (char uplo, int n, float alpha, const float *A, - int lda, const float *x, int incx, float beta, - float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsymv(char uplo, int n, float alpha, const float *A, + int lda, const float *x, int incx, float beta, + float *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsymv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv"); return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDsymv (char uplo, int n, double alpha, const double *A, - int lda, const double *x, int incx, double beta, - double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsymv(char uplo, int n, double alpha, const double *A, + int lda, const double *x, int incx, double beta, + double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsymv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv"); return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasChemv (char uplo, int n, cuComplex alpha, const cuComplex *A, - int lda, const cuComplex *x, int incx, cuComplex beta, - cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasChemv(char uplo, int n, cuComplex alpha, + const cuComplex *A, int lda, const 
cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasChemv"); return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasZhemv (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *A, - int lda, const cuDoubleComplex *x, int incx, cuDoubleComplex beta, - cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZhemv(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhemv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhemv"); return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasSsbmv (char uplo, int n, int k, float alpha, - const float *A, int lda, const float *x, int incx, - float beta, float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsbmv(char uplo, int n, int k, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv"); return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDsbmv (char uplo, int n, int k, double alpha, - const double *A, int lda, const double *x, int incx, - double beta, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsbmv(char uplo, int n, int k, double alpha, + const double *A, int lda, const double *x, + int incx, double beta, double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv"); return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasChbmv (char uplo, int n, int k, cuComplex alpha, - const cuComplex *A, int lda, const cuComplex *x, int incx, - cuComplex beta, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasChbmv(char uplo, int n, int k, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChbmv"); if (!func_ptr) 
LogFatalSymbolNotFound("cublasChbmv"); return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasZhbmv (char uplo, int n, int k, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, - cuDoubleComplex beta, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZhbmv(char uplo, int n, int k, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhbmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv"); return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, - const float *AP, const float *x, - int incx, float beta, float *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, const float *AP, + const float *x, int incx, float beta, float *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSspmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv"); return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, - const double *AP, const double *x, - int incx, double beta, double *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, const double *AP, + const double *x, int incx, double beta, double *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, const double *, + int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDspmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv"); return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); } void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha, - const cuComplex *AP, const cuComplex *x, - int incx, cuComplex beta, cuComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, const cuComplex *, int, cuComplex, cuComplex *, int); + const cuComplex *AP, const cuComplex *x, int incx, + cuComplex beta, cuComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv"); return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); } void CUBLASWINAPI cublasZhpmv(char uplo, int n, cuDoubleComplex alpha, - const cuDoubleComplex *AP, const cuDoubleComplex *x, - int incx, cuDoubleComplex beta, cuDoubleComplex *y, int incy) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + const cuDoubleComplex *AP, + const 
cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhpmv"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv"); return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); } -void CUBLASWINAPI cublasSger (int m, int n, float alpha, const float *x, int incx, - const float *y, int incy, float *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, float, const float *, int, const float *, int, float *, int); +void CUBLASWINAPI cublasSger(int m, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(int, int, float, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSger"); if (!func_ptr) LogFatalSymbolNotFound("cublasSger"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasDger (int m, int n, double alpha, const double *x, int incx, - const double *y, int incy, double *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, double, const double *, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDger(int m, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(int, int, double, const double *, int, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDger"); if (!func_ptr) LogFatalSymbolNotFound("cublasDger"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasCgeru (int m, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, - cuComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCgeru(int m, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgeru"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasCgerc (int m, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, - cuComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCgerc(int m, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgerc"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasZgeru (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, - cuDoubleComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgeru(int 
m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgeru"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasZgerc (int m, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, - cuDoubleComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgerc(int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgerc"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc"); return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasSsyr (char uplo, int n, float alpha, const float *x, - int incx, float *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, float *, int); +void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float *x, + int incx, float *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, float, const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSsyr"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr"); return func_ptr(uplo, n, alpha, x, incx, A, lda); } -void CUBLASWINAPI cublasDsyr (char uplo, int n, double alpha, const double *x, - int incx, double *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, double *, int); +void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double *x, + int incx, double *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + double *, int); static auto func_ptr = LoadSymbol("cublasDsyr"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr"); return func_ptr(uplo, n, alpha, x, incx, A, lda); } -void CUBLASWINAPI cublasCher (char uplo, int n, float alpha, - const cuComplex *x, int incx, cuComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex *x, + int incx, cuComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int, + cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher"); if (!func_ptr) LogFatalSymbolNotFound("cublasCher"); return func_ptr(uplo, n, alpha, x, incx, A, lda); } -void CUBLASWINAPI cublasZher (char uplo, int n, double alpha, - const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZher(char uplo, int n, double alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = 
LoadSymbol("cublasZher"); if (!func_ptr) LogFatalSymbolNotFound("cublasZher"); return func_ptr(uplo, n, alpha, x, incx, A, lda); } -void CUBLASWINAPI cublasSspr (char uplo, int n, float alpha, const float *x, - int incx, float *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, float *); +void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float *x, + int incx, float *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, float, const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSspr"); if (!func_ptr) LogFatalSymbolNotFound("cublasSspr"); return func_ptr(uplo, n, alpha, x, incx, AP); } -void CUBLASWINAPI cublasDspr (char uplo, int n, double alpha, const double *x, - int incx, double *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, double *); +void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double *x, + int incx, double *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDspr"); if (!func_ptr) LogFatalSymbolNotFound("cublasDspr"); return func_ptr(uplo, n, alpha, x, incx, AP); } -void CUBLASWINAPI cublasChpr (char uplo, int n, float alpha, const cuComplex *x, - int incx, cuComplex *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const cuComplex *, int, cuComplex *); +void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex *x, + int incx, cuComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int, + cuComplex *); static auto func_ptr = LoadSymbol("cublasChpr"); if (!func_ptr) LogFatalSymbolNotFound("cublasChpr"); return func_ptr(uplo, n, alpha, x, incx, AP); } -void CUBLASWINAPI cublasZhpr (char uplo, int n, double alpha, const cuDoubleComplex *x, - int incx, cuDoubleComplex *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *); +void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZhpr"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr"); return func_ptr(uplo, n, alpha, x, incx, AP); } -void CUBLASWINAPI cublasSsyr2 (char uplo, int n, float alpha, const float *x, - int incx, const float *y, int incy, float *A, - int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float *, int); +void CUBLASWINAPI cublasSsyr2(char uplo, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasSsyr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasDsyr2 (char uplo, int n, double alpha, const double *x, - int incx, const double *y, int incy, double *A, - int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double *, int); +void CUBLASWINAPI cublasDsyr2(char uplo, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + const 
double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDsyr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasCher2 (char uplo, int n, cuComplex alpha, const cuComplex *x, - int incx, const cuComplex *y, int incy, cuComplex *A, - int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCher2(char uplo, int n, cuComplex alpha, + const cuComplex *x, int incx, const cuComplex *y, + int incy, cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher2"); if (!func_ptr) LogFatalSymbolNotFound("cublasCher2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasZher2 (char uplo, int n, cuDoubleComplex alpha, const cuDoubleComplex *x, - int incx, const cuDoubleComplex *y, int incy, cuDoubleComplex *A, - int lda) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZher2(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZher2"); if (!func_ptr) LogFatalSymbolNotFound("cublasZher2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); } -void CUBLASWINAPI cublasSspr2 (char uplo, int n, float alpha, const float *x, - int incx, const float *y, int incy, float *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, float, const float *, int, const float *, int, float *); +void CUBLASWINAPI cublasSspr2(char uplo, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float *); static auto func_ptr = LoadSymbol("cublasSspr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); } -void CUBLASWINAPI cublasDspr2 (char uplo, int n, double alpha, - const double *x, int incx, const double *y, - int incy, double *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, double, const double *, int, const double *, int, double *); +void CUBLASWINAPI cublasDspr2(char uplo, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + const double *, int, double *); static auto func_ptr = LoadSymbol("cublasDspr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); } -void CUBLASWINAPI cublasChpr2 (char uplo, int n, cuComplex alpha, - const cuComplex *x, int incx, const cuComplex *y, - int incy, cuComplex *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex *); +void CUBLASWINAPI cublasChpr2(char uplo, int n, cuComplex alpha, + const cuComplex *x, int incx, const cuComplex *y, + int incy, cuComplex *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const 
cuComplex *, int, + const cuComplex *, int, cuComplex *); static auto func_ptr = LoadSymbol("cublasChpr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); } -void CUBLASWINAPI cublasZhpr2 (char uplo, int n, cuDoubleComplex alpha, - const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, - int incy, cuDoubleComplex *AP) { - using FuncPtr = void (CUBLASWINAPI *)(char, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex *); +void CUBLASWINAPI cublasZhpr2(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); static auto func_ptr = LoadSymbol("cublasZhpr2"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2"); return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); } -void CUBLASWINAPI cublasSgemm (char transa, char transb, int m, int n, int k, - float alpha, const float *A, int lda, - const float *B, int ldb, float beta, float *C, - int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSgemm(char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSgemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm"); return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasDgemm (char transa, char transb, int m, int n, int k, - double alpha, const double *A, int lda, - const double *B, int ldb, double beta, double *C, - int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDgemm(char transa, char transb, int m, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, double *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, int, double, const double *, + int, const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDgemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm"); return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasCgemm (char transa, char transb, int m, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCgemm(char transa, char transb, int m, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCgemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm"); return func_ptr(transa, transb, m, n, k, 
alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZgemm (char transa, char transb, int m, int n, - int k, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - cuDoubleComplex beta, cuDoubleComplex *C, - int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZgemm(char transa, char transb, int m, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZgemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm"); return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasSsyrk (char uplo, char trans, int n, int k, float alpha, - const float *A, int lda, float beta, float *C, - int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsyrk(char uplo, char trans, int n, int k, float alpha, + const float *A, int lda, float beta, float *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, float, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsyrk"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasDsyrk (char uplo, char trans, int n, int k, - double alpha, const double *A, int lda, - double beta, double *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsyrk(char uplo, char trans, int n, int k, double alpha, + const double *A, int lda, double beta, double *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, double, const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsyrk"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasCsyrk (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - cuComplex beta, cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCsyrk(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + cuComplex beta, cuComplex *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, + int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyrk"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasZsyrk (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI 
cublasZsyrk(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, + const cuDoubleComplex *, int, + cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyrk"); if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasCherk (char uplo, char trans, int n, int k, - float alpha, const cuComplex *A, int lda, - float beta, cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int, float, cuComplex *, int); +void CUBLASWINAPI cublasCherk(char uplo, char trans, int n, int k, float alpha, + const cuComplex *A, int lda, float beta, + cuComplex *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int, + float, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCherk"); if (!func_ptr) LogFatalSymbolNotFound("cublasCherk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasZherk (char uplo, char trans, int n, int k, - double alpha, - const cuDoubleComplex *A, int lda, - double beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const cuDoubleComplex *, int, double, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZherk(char uplo, char trans, int n, int k, double alpha, + const cuDoubleComplex *A, int lda, double beta, + cuDoubleComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, double, + const cuDoubleComplex *, int, double, + cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZherk"); if (!func_ptr) LogFatalSymbolNotFound("cublasZherk"); return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); } -void CUBLASWINAPI cublasSsyr2k (char uplo, char trans, int n, int k, float alpha, - const float *A, int lda, const float *B, int ldb, - float beta, float *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsyr2k(char uplo, char trans, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsyr2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasDsyr2k (char uplo, char trans, int n, int k, - double alpha, const double *A, int lda, - const double *B, int ldb, double beta, - double *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsyr2k(char uplo, char trans, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, double *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsyr2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, 
ldb, beta, C, ldc); } -void CUBLASWINAPI cublasCsyr2k (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCsyr2k(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsyr2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZsyr2k (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, cuDoubleComplex beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZsyr2k(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsyr2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasCher2k (char uplo, char trans, int n, int k, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, float beta, - cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, float, cuComplex *, int); +void CUBLASWINAPI cublasCher2k(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, float beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, float, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCher2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZher2k (char uplo, char trans, int n, int k, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, double beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, double, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZher2k(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + double beta, cuDoubleComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, double, cuDoubleComplex *, int); static auto func_ptr = 
LoadSymbol("cublasZher2k"); if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k"); return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasSsymm (char side, char uplo, int m, int n, float alpha, - const float *A, int lda, const float *B, int ldb, - float beta, float *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, float, const float *, int, const float *, int, float, float *, int); +void CUBLASWINAPI cublasSsymm(char side, char uplo, int m, int n, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int, + const float *, int, float, float *, int); static auto func_ptr = LoadSymbol("cublasSsymm"); if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasDsymm (char side, char uplo, int m, int n, double alpha, - const double *A, int lda, const double *B, int ldb, - double beta, double *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, double, const double *, int, const double *, int, double, double *, int); +void CUBLASWINAPI cublasDsymm(char side, char uplo, int m, int n, double alpha, + const double *A, int lda, const double *B, + int ldb, double beta, double *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int, + const double *, int, double, double *, int); static auto func_ptr = LoadSymbol("cublasDsymm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasCsymm (char side, char uplo, int m, int n, cuComplex alpha, - const cuComplex *A, int lda, const cuComplex *B, int ldb, - cuComplex beta, cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasCsymm(char side, char uplo, int m, int n, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCsymm"); if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZsymm (char side, char uplo, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, - cuDoubleComplex beta, cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZsymm(char side, char uplo, int m, int n, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZsymm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm"); return func_ptr(side, uplo, m, n, alpha, A, lda, 
B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasChemm (char side, char uplo, int m, int n, - cuComplex alpha, const cuComplex *A, int lda, - const cuComplex *B, int ldb, cuComplex beta, - cuComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, int, const cuComplex *, int, cuComplex, cuComplex *, int); +void CUBLASWINAPI cublasChemm(char side, char uplo, int m, int n, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasChemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasChemm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasZhemm (char side, char uplo, int m, int n, - cuDoubleComplex alpha, const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, cuDoubleComplex beta, - cuDoubleComplex *C, int ldc) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZhemm(char side, char uplo, int m, int n, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZhemm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm"); return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); } -void CUBLASWINAPI cublasStrsm (char side, char uplo, char transa, char diag, - int m, int n, float alpha, const float *A, int lda, - float *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, float, const float *, int, float *, int); +void CUBLASWINAPI cublasStrsm(char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, + int lda, float *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrsm"); if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasDtrsm (char side, char uplo, char transa, - char diag, int m, int n, double alpha, - const double *A, int lda, double *B, - int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, double, const double *, int, double *, int); +void CUBLASWINAPI cublasDtrsm(char side, char uplo, char transa, char diag, + int m, int n, double alpha, const double *A, + int lda, double *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrsm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasCtrsm (char side, char uplo, char transa, char diag, - int m, int n, cuComplex alpha, const cuComplex *A, - int lda, cuComplex *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, 
char, int, int, cuComplex, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtrsm(char side, char uplo, char transa, char diag, + int m, int n, cuComplex alpha, const cuComplex *A, + int lda, cuComplex *B, int ldb) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrsm"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasZtrsm (char side, char uplo, char transa, - char diag, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtrsm(char side, char uplo, char transa, char diag, + int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, + cuDoubleComplex, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrsm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasStrmm (char side, char uplo, char transa, char diag, - int m, int n, float alpha, const float *A, int lda, - float *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, float, const float *, int, float *, int); +void CUBLASWINAPI cublasStrmm(char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, + int lda, float *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float, + const float *, int, float *, int); static auto func_ptr = LoadSymbol("cublasStrmm"); if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasDtrmm (char side, char uplo, char transa, - char diag, int m, int n, double alpha, - const double *A, int lda, double *B, - int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, double, const double *, int, double *, int); +void CUBLASWINAPI cublasDtrmm(char side, char uplo, char transa, char diag, + int m, int n, double alpha, const double *A, + int lda, double *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double, + const double *, int, double *, int); static auto func_ptr = LoadSymbol("cublasDtrmm"); if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasCtrmm (char side, char uplo, char transa, char diag, - int m, int n, cuComplex alpha, const cuComplex *A, - int lda, cuComplex *B, int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, const cuComplex *, int, cuComplex *, int); +void CUBLASWINAPI cublasCtrmm(char side, char uplo, char transa, char diag, + int m, int n, cuComplex alpha, const cuComplex *A, + int lda, cuComplex *B, int ldb) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, + const cuComplex *, int, cuComplex *, int); static auto func_ptr = LoadSymbol("cublasCtrmm"); if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm"); return 
func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); } -void CUBLASWINAPI cublasZtrmm (char side, char uplo, char transa, - char diag, int m, int n, cuDoubleComplex alpha, - const cuDoubleComplex *A, int lda, cuDoubleComplex *B, - int ldb) { - using FuncPtr = void (CUBLASWINAPI *)(char, char, char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, cuDoubleComplex *, int); +void CUBLASWINAPI cublasZtrmm(char side, char uplo, char transa, char diag, + int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, + cuDoubleComplex, const cuDoubleComplex *, + int, cuDoubleComplex *, int); static auto func_ptr = LoadSymbol("cublasZtrmm"); if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm"); return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); diff --git a/tensorflow/stream_executor/cuda/cudnn_6_0.inc b/tensorflow/stream_executor/cuda/cudnn_6_0.inc index 6ac7a695d9f..11288983a4a 100644 --- a/tensorflow/stream_executor/cuda/cudnn_6_0.inc +++ b/tensorflow/stream_executor/cuda/cudnn_6_0.inc @@ -3,1771 +3,1823 @@ extern "C" { size_t CUDNNWINAPI cudnnGetVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetVersion"); if (!func_ptr) return 0; return func_ptr(); } size_t CUDNNWINAPI cudnnGetCudartVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetCudartVersion"); if (!func_ptr) return 0; return func_ptr(); } -const char * CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { - using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t); +const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { + using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t); static auto func_ptr = LoadSymbol("cudnnGetErrorString"); if (!func_ptr) return "cudnnGetErrorString symbol not found."; return func_ptr(status); } -cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, int *value) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *); static auto func_ptr = LoadSymbol("cudnnGetProperty"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(type, value); } -cudnnStatus_t CUDNNWINAPI cudnnCreate (cudnnHandle_t *handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *); static auto func_ptr = LoadSymbol("cudnnCreate"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI cudnnDestroy (cudnnHandle_t handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t); +cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t); static auto func_ptr = LoadSymbol("cudnnDestroy"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI cudnnSetStream (cudnnHandle_t handle, cudaStream_t streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); +cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle, + cudaStream_t 
streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); static auto func_ptr = LoadSymbol("cudnnSetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI cudnnGetStream (cudnnHandle_t handle, cudaStream_t *streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); static auto func_ptr = LoadSymbol("cudnnGetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor( - cudnnTensorDescriptor_t *tensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, // image data type - int n, // number of inputs (batch size) - int c, // number of input feature maps - int h, // height of input section - int w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int); + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, // image data type + int n, // number of inputs (batch size) + int c, // number of input feature maps + int h, // height of input section + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, // image data type - int n, // number of inputs (batch size) - int c, // number of input feature maps - int h, // height of input section - int w, // width of input section - int nStride, - int cStride, - int hStride, - int wStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int); + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, // image data type + int n, // number of inputs (batch size) + int c, // number of input feature maps + int h, // height of input section + int w, // width of input section + int nStride, int cStride, int hStride, int wStride) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, + int, int, int, int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, // image data type - int *n, // 
number of inputs (batch size) - int *c, // number of input feature maps - int *h, // height of input section - int *w, // width of input section - int *nStride, - int *cStride, - int *hStride, - int *wStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *); + const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, // image data type + int *n, // number of inputs (batch size) + int *c, // number of input feature maps + int *h, // height of input section + int *w, // width of input section + int *nStride, int *cStride, int *hStride, int *wStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, + int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, - int nbDims, - const int dimA[], - const int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []); + cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, + const int dimA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, - int nbDims, - const int dimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []); + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, int nbDims, const int dimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, nbDims, dimA); } cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, - int *nbDims, - int dimA[], - int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []); + const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, + cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, + cudnnDataType_t *, int *, int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( - const cudnnTensorDescriptor_t tensorDesc, - size_t *size) { - using FuncPtr = cudnnStatus_t 
(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); + const cudnnTensorDescriptor_t tensorDesc, size_t *size) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetTensorSizeInBytes"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, size); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor( - cudnnTensorDescriptor_t tensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnTransformTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnTransformTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI cudnnAddTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnAddTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C); } -cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor( - cudnnOpTensorDescriptor_t *opTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t opTensorOp, - cudnnDataType_t opTensorCompType, - cudnnNanPropagation_t opTensorNanOpt ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t); + 
cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, + cudnnDataType_t, cudnnNanPropagation_t); static auto func_ptr = LoadSymbol("cudnnSetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( - const cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t *opTensorOp, - cudnnDataType_t *opTensorCompType, - cudnnNanPropagation_t *opTensorNanOpt ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *); + const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, + cudnnNanPropagation_t *); static auto func_ptr = LoadSymbol("cudnnGetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnOpTensor( - cudnnHandle_t handle, - const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const cudnnTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A, + const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B, + const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnOpTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C); + return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, + beta, cDesc, C); } cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t *reduceTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); - static auto func_ptr = 
LoadSymbol("cudnnCreateReduceTensorDescriptor"); + cudnnReduceTensorDescriptor_t *reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t reduceTensorOp, - cudnnDataType_t reduceTensorCompType, - cudnnNanPropagation_t reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); + cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, + cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); static auto func_ptr = LoadSymbol("cudnnSetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t *reduceTensorOp, - cudnnDataType_t *reduceTensorCompType, - cudnnNanPropagation_t *reduceTensorNanOpt, - cudnnReduceTensorIndices_t *reduceTensorIndices, - cudnnIndicesType_t *reduceTensorIndicesType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *); + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, + cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, + cudnnIndicesType_t *); static auto func_ptr = LoadSymbol("cudnnGetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyReduceTensorDescriptor"); + cudnnReduceTensorDescriptor_t reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI 
*)(cudnnReduceTensorDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionIndicesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - void *indices, - size_t indicesSizeInBytes, - void *workspace, - size_t workspaceSizeInBytes, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, size_t indicesSizeInBytes, void *workspace, + size_t workspaceSizeInBytes, const void *alpha, + const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, + void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnReduceTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C); + return 
func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, + workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, + C); } -cudnnStatus_t CUDNNWINAPI cudnnSetTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *valuePtr ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *valuePtr) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnSetTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, valuePtr); } -cudnnStatus_t CUDNNWINAPI cudnnScaleTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *alpha ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *alpha) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnScaleTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, alpha); } -cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor( - cudnnFilterDescriptor_t *filterDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } -cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, // image data type - cudnnTensorFormat_t format, - int k, // number of output feature maps - int c, // number of input feature maps - int h, // height of each input filter - int w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int); +cudnnStatus_t CUDNNWINAPI +cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, // image data type + cudnnTensorFormat_t format, + int k, // number of output feature maps + int c, // number of input feature maps + int h, // height of each input filter + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } -cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( - const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, // image data type - cudnnTensorFormat_t *format, - int *k, // number of output feature maps - int *c, // number of input feature maps - int *h, // height of each input filter - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI +cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t 
filterDesc, + cudnnDataType_t *dataType, // image data type + cudnnTensorFormat_t *format, + int *k, // number of output feature maps + int *c, // number of input feature maps + int *h, // height of each input filter + int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, + int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, // image data type - cudnnTensorFormat_t format, - int nbDims, - const int filterDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []); + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, // image data type + cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, nbDims, filterDimA); } cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( - const cudnnFilterDescriptor_t filterDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, // image data type - cudnnTensorFormat_t *format, - int *nbDims, - int filterDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []); + const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, + cudnnDataType_t *dataType, // image data type + cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, int, cudnnDataType_t *, + cudnnTensorFormat_t *, int *, int[]); static auto func_ptr = LoadSymbol("cudnnGetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA); + return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, + filterDimA); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor( - cudnnFilterDescriptor_t filterDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } -cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor( - cudnnConvolutionDescriptor_t *convDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateConvolutionDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } -cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc, - int pad_h, // 
zero-padding height - int pad_w, // zero-padding width - int u, // vertical filter stride - int v, // horizontal filter stride - int dilation_h, // filter dilation in the vertical dimension - int dilation_w, // filter dilation in the horizontal dimension - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( + cudnnConvolutionDescriptor_t convDesc, + int pad_h, // zero-padding height + int pad_w, // zero-padding width + int u, // vertical filter stride + int v, // horizontal filter stride + int dilation_h, // filter dilation in the vertical dimension + int dilation_w, // filter dilation in the horizontal dimension + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, int, int, int, int, int, + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( const cudnnConvolutionDescriptor_t convDesc, - int* pad_h, // zero-padding height - int* pad_w, // zero-padding width - int* u, // vertical filter stride - int* v, // horizontal filter stride - int* dilation_h, // filter dilation in the vertical dimension - int* dilation_w, // filter dilation in the horizontal dimension - cudnnConvolutionMode_t* mode, - cudnnDataType_t *computeType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, // zero-padding height + int *pad_w, // zero-padding width + int *u, // vertical filter stride + int *v, // horizontal filter stride + int *dilation_h, // filter dilation in the vertical dimension + int *dilation_w, // filter dilation in the horizontal dimension + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, + int *, cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t 
filterDesc, int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( - cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int dilationA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t); + cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */ + const int padA[], const int filterStrideA[], const int dilationA[], + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[], + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, + computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( - const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int dilationA[], - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *); + const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested, + int *arrayLength, int padA[], int strideA[], int dilationA[], + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[], + cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, + dilationA, mode, computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int nbDims, - int tensorOutputDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int nbDims, + int tensorOutputDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int, int[]); + static auto func_ptr = + 
LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOutputDimA); + return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, + tensorOutputDimA); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor( - cudnnConvolutionDescriptor_t convDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyConvolutionDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - void *y, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount, + int *returnedAlgoCount, 
cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionFwdAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, + cudnnConvolutionFwdAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, + memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); + static auto 
func_ptr = + LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y); + return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( - cudnnHandle_t handle, - const void *alpha1, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *alpha2, - const cudnnTensorDescriptor_t zDesc, - const void *z, - const cudnnTensorDescriptor_t biasDesc, - const void *bias, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBiasActivationForward"); + cudnnHandle_t handle, const void *alpha1, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2, + const cudnnTensorDescriptor_t zDesc, const void *z, + const cudnnTensorDescriptor_t biasDesc, const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = 
cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnConvolutionBiasActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y); + return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, + activationDesc, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dbDesc, - void *db ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dbDesc, void *db) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardBias"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( - cudnnHandle_t handle, - const 
cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, void *dw, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdFilterAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, + size_t, cudnnConvolutionBwdFilterAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, + 
memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t dwDesc, - void *dw ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnFilterDescriptor_t dwDesc, void *dw) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, + void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardFilter"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); + return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults ) { - using 
FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, void *dx, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t 
memoryLimitInBytes, - cudnnConvolutionBwdDataAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, + size_t, cudnnConvolutionBwdDataAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, + memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( - cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void 
*dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); + return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnIm2Col( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - void *colBuffer ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const void *x, const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, + const void *, const cudnnFilterDescriptor_t, + const cudnnConvolutionDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnIm2Col"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer); } cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const 
void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx); + return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, + dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor( - cudnnPoolingDescriptor_t *poolingDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreatePoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int); + cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, + int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); + const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight, + int *windowWidth, int *verticalPadding, int *horizontalPadding, + int *verticalStride, int *horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); static 
auto func_ptr = LoadSymbol("cudnnGetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - const cudnnPoolingMode_t mode, - const cudnnNanPropagation_t maxpoolingNanOpt, - int nbDims, - const int windowDimA[], - const int paddingA[], - const int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []); + cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, + const int windowDimA[], const int paddingA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, + const cudnnNanPropagation_t, int, const int[], const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, + paddingA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - int nbDimsRequested, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *nbDims, - int windowDimA[], - int paddingA[], - int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []); + const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, + cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, int windowDimA[], int paddingA[], int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int[], int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, + windowDimA, paddingA, strideA); } -cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, - int outputTensorDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, int outputTensorDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return 
func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA); } -cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int *n, - int *c, - int *h, - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetPooling2dForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, + int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetPooling2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor( - cudnnPoolingDescriptor_t poolingDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyPoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const 
cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor( - cudnnActivationDescriptor_t *activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( - cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t mode, - cudnnNanPropagation_t reluNanOpt, - double coef ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double); + cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, double coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t, + cudnnActivationMode_t, + cudnnNanPropagation_t, double); static auto func_ptr = LoadSymbol("cudnnSetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); } -cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor( - const cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, - double* coef ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *); +cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, double *coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnActivationDescriptor_t, cudnnActivationMode_t *, + cudnnNanPropagation_t *, double *); static auto func_ptr = LoadSymbol("cudnnGetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor( - cudnnActivationDescriptor_t activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyActivationDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } cudnnStatus_t CUDNNWINAPI cudnnActivationForward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const 
cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor( - cudnnLRNDescriptor_t *normDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc); } -cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned lrnN, - double lrnAlpha, - double lrnBeta, - double lrnK ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double); +cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned lrnN, double lrnAlpha, + double lrnBeta, double lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int, double, double, double); static auto 
func_ptr = LoadSymbol("cudnnSetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned* lrnN, - double* lrnAlpha, - double* lrnBeta, - double* lrnK ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned *lrnN, + double *lrnAlpha, + double *lrnBeta, double *lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); static auto func_ptr = LoadSymbol("cudnnGetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(lrnDesc); } cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + 
cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, + x, beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, // same desc for means, temp, temp2 - const void *x, - const void *means, // if NULL, means are assumed to be zero - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationForward"); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t xDesc, // same desc for means, temp, temp2 + const void *x, + const void *means, // if NULL, means are assumed to be zero + void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, + const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, + beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, // same desc for x, means, dy, temp, temp2 - const void *x, - const void *means, // if NULL, means are assumed to be zero - const void *dy, - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, // same desc for dx, dMeans - void *dx, // output x differential - void *dMeans ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationBackward"); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t + xDesc, // same desc for x, means, dy, temp, temp2 + const void *x, + const void *means, // if NULL, means are assumed to be zero + const void *dy, void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, // same desc for dx, dMeans + void *dx, // output x 
differential + void *dMeans) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *, void *, const void *, const cudnnTensorDescriptor_t, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, + temp2, beta, dXdMeansDesc, dx, dMeans); } cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( - cudnnTensorDescriptor_t derivedBnDesc, - const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t); + cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, + cudnnBatchNormMode_t); static auto func_ptr = LoadSymbol("cudnnDeriveBNTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(derivedBnDesc, xDesc, mode); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, + cudnnHandle_t handle, cudnnBatchNormMode_t mode, - const void *alpha, // alpha[0] = result blend factor - const void *beta, // beta[0] = dest layer blend factor + const void *alpha, // alpha[0] = result blend factor + const void *beta, // beta[0] = dest layer blend factor - const cudnnTensorDescriptor_t xDesc, - const void *x, // NxCxHxW - const cudnnTensorDescriptor_t yDesc, - void *y, // NxCxHxW + const cudnnTensorDescriptor_t xDesc, + const void *x, // NxCxHxW + const cudnnTensorDescriptor_t yDesc, + void *y, // NxCxHxW - /* Shared desc for the next 6 tensors in the argument list. - Data type to be set as follows: - type = (typeOf(x) == double) ? double : float - Dimensions for this descriptor depend on normalization mode - - Spatial Normalization : tensors are expected to have dims 1xCx1x1 - (normalization is performed across NxHxW) - - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW - (normalization is performed across N) */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of + 1xCxHxW (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - // 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation - const void *bnScale, - const void *bnBias, + // 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation + const void *bnScale, const void *bnBias, - /* MUST use factor=1 in the very first call of a complete training cycle. 
- Use a factor=1/(1+n) at N-th call to the function to get - Cumulative Moving Average (CMA) behavior - CMA[n] = (x[1]+...+x[n])/n - Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = - ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = - CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ - double exponentialAverageFactor, + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, - /* Used in Training phase only. - runningMean = newMean*factor + runningMean*(1-factor) */ - void *resultRunningMean, - /* Output in training mode, input in inference. Is the moving average - of variance[x] (factor is applied in the same way as for runningMean) */ - void *resultRunningVariance, + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ - double epsilon, + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and + backward functions. */ + double epsilon, - /* Optionally save intermediate results from the forward pass here - - can be reused to speed up backward pass. NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTraining"); + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. 
NULL if unused */ + void *resultSaveMean, void *resultSaveInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, double, void *, void *, double, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); + return func_ptr( + handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, + bnScale, bnBias, exponentialAverageFactor, resultRunningMean, + resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alpha, // alpha[0] = result blend factor - const void *beta, // beta[0] = dest layer blend factor - const cudnnTensorDescriptor_t xDesc, - const void *x, // NxCxHxW - const cudnnTensorDescriptor_t yDesc, - void *y, // NxCxHxW - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, - const void *bnBias, - const void *estimatedMean, - const void *estimatedVariance, - double epsilon ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardInference"); + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + const void *alpha, // alpha[0] = result blend factor + const void *beta, // beta[0] = dest layer blend factor + const cudnnTensorDescriptor_t xDesc, + const void *x, // NxCxHxW + const cudnnTensorDescriptor_t yDesc, + void *y, // NxCxHxW + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, + const void *bnBias, const void *estimatedMean, + const void *estimatedVariance, double epsilon) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, const void *, const void *, double); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon); + return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, + estimatedVariance, epsilon); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, // same desc for x, dx, dy - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - /* Shared tensor desc for the 4 
tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, // bnBias doesn't affect backpropagation - /* scale and bias diff are not backpropagated below this layer */ - void *dBnScaleResult, - void *dBnBiasResult, - /* Same epsilon as forward pass */ - double epsilon, + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff, + const void *betaDataDiff, const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, // same desc for x, dx, dy + const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t dxDesc, void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, // bnBias doesn't affect backpropagation + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *); + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, const void *savedInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const void *, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, void *, void *, double, const void *, const void *); static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance); + return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, + betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, + dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance); } -cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( +cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t *stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); + cudnnSpatialTransformerDescriptor_t *stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc, - cudnnSamplerType_t samplerType, - cudnnDataType_t dataType, - const int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, 
const int []); - static auto func_ptr = LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); + cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, const int nbDims, const int dimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, + const int, const int[]); + static auto func_ptr = + LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc, samplerType, dataType, nbDims, dimA); } cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); + cudnnSpatialTransformerDescriptor_t stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( - cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *theta, - void *grid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorForward"); + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, void *grid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, theta, grid); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( - cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *dgrid, - void *dtheta) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, void *dtheta) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, dgrid, dtheta); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( - cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *grid, - const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, 
cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward( - cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const void *alphaDgrid, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *grid, - const void *betaDgrid, - void *dgrid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *); + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx, + const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc, + const void *dy, const void *grid, const void *betaDgrid, void *dgrid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid); + return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, betaDgrid, dgrid); } -cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetStatesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return 
func_ptr(handle, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize( + cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetReserveSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(xdesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void * states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnSetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t xdesc, - const void * x, - const cudnnTensorDescriptor_t ydesc, - void * y, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutForward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, const void *x, + const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t dydesc, - const void * dy, - const cudnnTensorDescriptor_t dxdesc, - void * dx, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, const void *dy, + const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const 
cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } -cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - const int minibatch, - const cudnnDataType_t dataType, - cudnnPersistentRNNPlan_t * plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, const int minibatch, + const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, + const cudnnDataType_t, + cudnnPersistentRNNPlan_t *); static auto func_ptr = LoadSymbol("cudnnCreatePersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, minibatch, dataType, plan); } -cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnSetPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, plan); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnDestroyPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(plan); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, // Between layers, not between recurrent steps. 
- cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t + dropoutDesc, // Between layers, not between recurrent steps. + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v6"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(cudnnRNNDescriptor_t rnnDesc, - int hiddenSize, - int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, // Between layers, not between recurrent steps. - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor( + cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers, + cudnnDropoutDescriptor_t + dropoutDesc, // Between layers, not between recurrent steps. 
+ cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, + cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, + cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType); + return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, + direction, mode, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNTrainingReserveSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNParamsSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes, - cudnnDataType_t dataType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes, + cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, + size_t *, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnGetRNNParamsSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int layer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const int linLayerID, - 
cudnnFilterDescriptor_t linLayerMatDesc, - void ** linLayerMat - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, const int layer, + const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, + const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerMatrixParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat); + return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, + linLayerMatDesc, linLayerMat); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int layer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerBiasDesc, - void ** linLayerBias - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, const int layer, + const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, + const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerBiasParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias); + return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, + linLayerBiasDesc, linLayerBias); } -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t *yDesc, - void * y, - const cudnnTensorDescriptor_t hyDesc, - void * hy, - const cudnnTensorDescriptor_t cyDesc, - void * cy, - void * workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const 
cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t *yDesc, - void * y, - const cudnnTensorDescriptor_t hyDesc, - void * hy, - const cudnnTensorDescriptor_t cyDesc, - void * cy, - void * workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const 
cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * yDesc, - const void * y, - const cudnnTensorDescriptor_t * dyDesc, - const void * dy, - const cudnnTensorDescriptor_t dhyDesc, - const void * dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void * dcy, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnTensorDescriptor_t * dxDesc, - void * dx, - const cudnnTensorDescriptor_t dhxDesc, - void * dhx, - const cudnnTensorDescriptor_t dcxDesc, - void * dcx, - void * workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, + const void *y, const cudnnTensorDescriptor_t *dyDesc, + const void *dy, const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, const cudnnFilterDescriptor_t wDesc, + const void *w, const cudnnTensorDescriptor_t hxDesc, + const void *hx, const cudnnTensorDescriptor_t cxDesc, + const void *cx, const cudnnTensorDescriptor_t *dxDesc, + void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, + void *workspace, size_t workSpaceSizeInBytes, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, 
wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t * yDesc, - const void * y, - const void * workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void * dw, - const void * reserveSpace, - size_t reserveSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + const void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeights"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor_v4( - cudnnConvolutionDescriptor_t convDesc, - int pad_h, // zero-padding height - int pad_w, // zero-padding width - int u, // vertical filter stride - int v, // horizontal filter stride - int dilation_h, // filter dilation in the vertical dimension - int dilation_w, // filter dilation in the horizontal dimension - cudnnConvolutionMode_t mode ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t); - static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor_v4"); + cudnnConvolutionDescriptor_t convDesc, + int pad_h, // zero-padding height + int pad_w, // zero-padding width + int u, // vertical filter stride + int v, // horizontal filter stride + int dilation_h, // filter dilation in the vertical dimension + int dilation_w, // filter dilation in the horizontal dimension + cudnnConvolutionMode_t mode) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, + int, int, int, cudnnConvolutionMode_t); + 
static auto func_ptr = + LoadSymbol("cudnnSetConvolution2dDescriptor_v4"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode); } -cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor_v5( cudnnConvolutionDescriptor_t convDesc, - int pad_h, // zero-padding height - int pad_w, // zero-padding width - int u, // vertical filter stride - int v, // horizontal filter stride - int dilation_h, // filter dilation in the vertical dimension - int dilation_w, // filter dilation in the horizontal dimension - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t); - static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor_v5"); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor_v5( + cudnnConvolutionDescriptor_t convDesc, + int pad_h, // zero-padding height + int pad_w, // zero-padding width + int u, // vertical filter stride + int v, // horizontal filter stride + int dilation_h, // filter dilation in the vertical dimension + int dilation_w, // filter dilation in the horizontal dimension + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, int, int, int, int, int, + cudnnConvolutionMode_t, cudnnDataType_t); + static auto func_ptr = + LoadSymbol("cudnnSetConvolution2dDescriptor_v5"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor_v4( - const cudnnConvolutionDescriptor_t convDesc, - int *pad_h, // zero-padding height - int *pad_w, // zero-padding width - int *u, // vertical filter stride - int *v, // horizontal filter stride - int *dilation_h, // filter dilation in the vertical dimension - int *dilation_w, // filter dilation in the horizontal dimension - cudnnConvolutionMode_t *mode ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor_v4"); + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, // zero-padding height + int *pad_w, // zero-padding width + int *u, // vertical filter stride + int *v, // horizontal filter stride + int *dilation_h, // filter dilation in the vertical dimension + int *dilation_w, // filter dilation in the horizontal dimension + cudnnConvolutionMode_t *mode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, + int *, cudnnConvolutionMode_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolution2dDescriptor_v4"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor_v5( const cudnnConvolutionDescriptor_t convDesc, - int* pad_h, // zero-padding height - int* pad_w, // zero-padding width - int* u, // vertical filter stride - int* v, // horizontal filter stride - int* dilation_h, // filter dilation in the vertical dimension - int* dilation_w, // filter dilation in the horizontal dimension - cudnnConvolutionMode_t* 
mode, - cudnnDataType_t *computeType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor_v5"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor_v5( + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, // zero-padding height + int *pad_w, // zero-padding width + int *u, // vertical filter stride + int *v, // horizontal filter stride + int *dilation_h, // filter dilation in the vertical dimension + int *dilation_w, // filter dilation in the horizontal dimension + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, + int *, cudnnConvolutionMode_t *, cudnnDataType_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolution2dDescriptor_v5"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } } // extern "C" diff --git a/tensorflow/stream_executor/cuda/cudnn_7_0.inc b/tensorflow/stream_executor/cuda/cudnn_7_0.inc index d2ea31e366b..008ae9099c0 100644 --- a/tensorflow/stream_executor/cuda/cudnn_7_0.inc +++ b/tensorflow/stream_executor/cuda/cudnn_7_0.inc @@ -3,1944 +3,2025 @@ extern "C" { size_t CUDNNWINAPI cudnnGetVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetVersion"); if (!func_ptr) return 0; return func_ptr(); } size_t CUDNNWINAPI cudnnGetCudartVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetCudartVersion"); if (!func_ptr) return 0; return func_ptr(); } -const char * CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { - using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t); +const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { + using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t); static auto func_ptr = LoadSymbol("cudnnGetErrorString"); if (!func_ptr) return "cudnnGetErrorString symbol not found."; return func_ptr(status); } -cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError( - cudnnHandle_t handle, - cudnnStatus_t *rstatus, - cudnnErrQueryMode_t mode, - cudnnRuntimeTag_t *tag ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); +cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle, + cudnnStatus_t *rstatus, + cudnnErrQueryMode_t mode, + cudnnRuntimeTag_t *tag) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); static auto func_ptr = LoadSymbol("cudnnQueryRuntimeError"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rstatus, mode, tag); } -cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, int *value) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *); static auto func_ptr = LoadSymbol("cudnnGetProperty"); if (!func_ptr) return 
GetSymbolNotFoundError(); return func_ptr(type, value); } -cudnnStatus_t CUDNNWINAPI cudnnCreate (cudnnHandle_t *handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *); static auto func_ptr = LoadSymbol("cudnnCreate"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI cudnnDestroy (cudnnHandle_t handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t); +cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t); static auto func_ptr = LoadSymbol("cudnnDestroy"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI cudnnSetStream (cudnnHandle_t handle, cudaStream_t streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); +cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); static auto func_ptr = LoadSymbol("cudnnSetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI cudnnGetStream (cudnnHandle_t handle, cudaStream_t *streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); static auto func_ptr = LoadSymbol("cudnnGetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor( - cudnnTensorDescriptor_t *tensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int); + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int 
w, /* width of input section */ - int nStride, - int cStride, - int hStride, - int wStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int); + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, int cStride, int hStride, int wStride) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, + int, int, int, int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, /* image data type */ - int *n, /* number of inputs (batch size) */ - int *c, /* number of input feature maps */ - int *h, /* height of input section */ - int *w, /* width of input section */ - int *nStride, - int *cStride, - int *hStride, - int *wStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *); + const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, int *cStride, int *hStride, int *wStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, + int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, - int nbDims, - const int dimA[], - const int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []); + cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, + const int dimA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, - int nbDims, - const int dimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []); + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, int nbDims, const int dimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + 
cudnnDataType_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, nbDims, dimA); } cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, - int *nbDims, - int dimA[], - int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []); + const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, + cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, + cudnnDataType_t *, int *, int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( - const cudnnTensorDescriptor_t tensorDesc, - size_t *size) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); + const cudnnTensorDescriptor_t tensorDesc, size_t *size) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetTensorSizeInBytes"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, size); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor( - cudnnTensorDescriptor_t tensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnTransformTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnTransformTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI cudnnAddTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + 
cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnAddTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C); } -cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor( - cudnnOpTensorDescriptor_t *opTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t opTensorOp, - cudnnDataType_t opTensorCompType, - cudnnNanPropagation_t opTensorNanOpt ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t); + cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, + cudnnDataType_t, cudnnNanPropagation_t); static auto func_ptr = LoadSymbol("cudnnSetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( - const cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t *opTensorOp, - cudnnDataType_t *opTensorCompType, - cudnnNanPropagation_t *opTensorNanOpt ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *); + const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, + cudnnNanPropagation_t *); static auto func_ptr = LoadSymbol("cudnnGetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnOpTensor( - cudnnHandle_t handle, - const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const cudnnTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const 
void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A, + const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B, + const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnOpTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C); + return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, + beta, cDesc, C); } cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t *reduceTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateReduceTensorDescriptor"); + cudnnReduceTensorDescriptor_t *reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t reduceTensorOp, - cudnnDataType_t reduceTensorCompType, - cudnnNanPropagation_t reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); + cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, + cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); static auto func_ptr = LoadSymbol("cudnnSetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t *reduceTensorOp, - cudnnDataType_t *reduceTensorCompType, - cudnnNanPropagation_t *reduceTensorNanOpt, - cudnnReduceTensorIndices_t *reduceTensorIndices, - cudnnIndicesType_t *reduceTensorIndicesType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *); + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + 
cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, + cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, + cudnnIndicesType_t *); static auto func_ptr = LoadSymbol("cudnnGetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyReduceTensorDescriptor"); + cudnnReduceTensorDescriptor_t reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionIndicesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - void *indices, - size_t indicesSizeInBytes, - void *workspace, - size_t workspaceSizeInBytes, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void 
*beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, size_t indicesSizeInBytes, void *workspace, + size_t workspaceSizeInBytes, const void *alpha, + const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, + void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnReduceTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C); + return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, + workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, + C); } -cudnnStatus_t CUDNNWINAPI cudnnSetTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *valuePtr ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *valuePtr) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnSetTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, valuePtr); } -cudnnStatus_t CUDNNWINAPI cudnnScaleTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *alpha ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *alpha) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnScaleTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, alpha); } -cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor( - cudnnFilterDescriptor_t *filterDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int); + cudnnFilterDescriptor_t 
filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( - const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *); + const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, + int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int nbDims, - const int filterDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []); + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, nbDims, filterDimA); } cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( - const cudnnFilterDescriptor_t filterDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *nbDims, - int filterDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []); + const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, int, cudnnDataType_t *, + cudnnTensorFormat_t *, int *, int[]); static auto func_ptr = LoadSymbol("cudnnGetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA); + return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, + filterDimA); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor( - cudnnFilterDescriptor_t filterDesc ) { - using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } -cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor( - cudnnConvolutionDescriptor_t *convDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateConvolutionDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc, - cudnnMathType_t mathType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc, - cudnnMathType_t *mathType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc, - int groupCount ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); static auto func_ptr = LoadSymbol("cudnnSetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc, - int *groupCount ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int *groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc, - int pad_h, /* zero-padding height */ - int pad_w, /* zero-padding width */ - int u, /* vertical filter stride */ - int v, /* horizontal filter stride */ - int 
dilation_h, /* filter dilation in the vertical dimension */ - int dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( + cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, int, int, int, int, int, + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( const cudnnConvolutionDescriptor_t convDesc, - int* pad_h, /* zero-padding height */ - int* pad_w, /* zero-padding width */ - int* u, /* vertical filter stride */ - int* v, /* horizontal filter stride */ - int* dilation_h, /* filter dilation in the vertical dimension */ - int* dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t* mode, - cudnnDataType_t *computeType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, + int *, cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) { + using FuncPtr = 
cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( - cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int dilationA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t); + cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */ + const int padA[], const int filterStrideA[], const int dilationA[], + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[], + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, + computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( - const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int dilationA[], - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *); + const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested, + int *arrayLength, int padA[], int strideA[], int dilationA[], + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[], + cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, + dilationA, mode, computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int nbDims, - int tensorOutputDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int nbDims, + int tensorOutputDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); if 
(!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOutputDimA); + return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, + tensorOutputDimA); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor( - cudnnConvolutionDescriptor_t convDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyConvolutionDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount( cudnnHandle_t handle, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - void *y, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void 
*, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionFwdAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, + cudnnConvolutionFwdAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, + memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnFilterDescriptor_t filterDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t destDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, 
int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const 
cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y); + return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( - cudnnHandle_t handle, - const void *alpha1, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *alpha2, - const cudnnTensorDescriptor_t zDesc, - const void *z, - const cudnnTensorDescriptor_t biasDesc, - const void *bias, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBiasActivationForward"); + cudnnHandle_t handle, const void *alpha1, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2, + const cudnnTensorDescriptor_t zDesc, const void *z, + const cudnnTensorDescriptor_t biasDesc, const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnConvolutionBiasActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y); + return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, + activationDesc, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dbDesc, - void *db ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t dyDesc, const void *dy, 
const void *beta, + const cudnnTensorDescriptor_t dbDesc, void *db) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardBias"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnnHandle_t handle, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); 
+ cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, void *dw, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdFilterAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, + size_t, cudnnConvolutionBwdFilterAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, + memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); + cudnnHandle_t 
handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t dwDesc, - void *dw ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnFilterDescriptor_t dwDesc, void *dw) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + 
const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, + void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardFilter"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); + return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( cudnnHandle_t handle, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); - static auto func_ptr = 
LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, void *dx, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdDataAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, + size_t, cudnnConvolutionBwdDataAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, + memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = 
LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( - cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, 
+ const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); + return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnIm2Col( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - void *colBuffer ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const void *x, const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, + const void *, const cudnnFilterDescriptor_t, + const cudnnConvolutionDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnIm2Col"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer); } cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using 
FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx); + return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, + dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor( - cudnnPoolingDescriptor_t *poolingDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreatePoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int); + cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, + int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); + const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight, + int *windowWidth, int *verticalPadding, int *horizontalPadding, + int *verticalStride, int *horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + 
windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - const cudnnPoolingMode_t mode, - const cudnnNanPropagation_t maxpoolingNanOpt, - int nbDims, - const int windowDimA[], - const int paddingA[], - const int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []); + cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, + const int windowDimA[], const int paddingA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, + const cudnnNanPropagation_t, int, const int[], const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, + paddingA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - int nbDimsRequested, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *nbDims, - int windowDimA[], - int paddingA[], - int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []); + const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, + cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, int windowDimA[], int paddingA[], int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int[], int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, + windowDimA, paddingA, strideA); } -cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, - int outputTensorDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, int outputTensorDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA); } -cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int *n, - int *c, - int *h, - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const 
cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetPooling2dForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, + int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetPooling2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor( - cudnnPoolingDescriptor_t poolingDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyPoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, 
void *); static auto func_ptr = LoadSymbol("cudnnPoolingBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor( - cudnnActivationDescriptor_t *activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( - cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t mode, - cudnnNanPropagation_t reluNanOpt, - double coef ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double); + cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, double coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t, + cudnnActivationMode_t, + cudnnNanPropagation_t, double); static auto func_ptr = LoadSymbol("cudnnSetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); } -cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor( - const cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, - double* coef ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *); +cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, double *coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnActivationDescriptor_t, cudnnActivationMode_t *, + cudnnNanPropagation_t *, double *); static auto func_ptr = LoadSymbol("cudnnGetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor( - cudnnActivationDescriptor_t activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyActivationDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } cudnnStatus_t CUDNNWINAPI cudnnActivationForward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, 
cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor( - cudnnLRNDescriptor_t *normDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc); } -cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned lrnN, - double lrnAlpha, - double lrnBeta, - double lrnK ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double); +cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned lrnN, double lrnAlpha, + double lrnBeta, double lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int, double, double, double); static auto func_ptr = LoadSymbol("cudnnSetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned* lrnN, - double* lrnAlpha, - double* lrnBeta, - double* lrnK ) { - using FuncPtr = cudnnStatus_t 
(CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned *lrnN, + double *lrnAlpha, + double *lrnBeta, double *lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); static auto func_ptr = LoadSymbol("cudnnGetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(lrnDesc); } cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelBackward"); if (!func_ptr) 
return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, + x, beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationForward"); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, + const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, + beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - const void *dy, - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ - void *dx, /* output x differential */ - void *dMeans ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationBackward"); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t + xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *, void *, const void *, const cudnnTensorDescriptor_t, void *, + void *); + 
static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, + temp2, beta, dXdMeansDesc, dx, dMeans); } cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( - cudnnTensorDescriptor_t derivedBnDesc, - const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t); + cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, + cudnnBatchNormMode_t); static auto func_ptr = LoadSymbol("cudnnDeriveBNTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(derivedBnDesc, xDesc, mode); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, + cudnnHandle_t handle, cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ - /* Shared desc for the next 6 tensors in the argument list. - Data type to be set as follows: - type = (typeOf(x) == double) ? double : float - Dimensions for this descriptor depend on normalization mode - - Spatial Normalization : tensors are expected to have dims 1xCx1x1 - (normalization is performed across NxHxW) - - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW - (normalization is performed across N) */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of + 1xCxHxW (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ - const void *bnScale, - const void *bnBias, + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation + */ + const void *bnScale, const void *bnBias, - /* MUST use factor=1 in the very first call of a complete training cycle. - Use a factor=1/(1+n) at N-th call to the function to get - Cumulative Moving Average (CMA) behavior - CMA[n] = (x[1]+...+x[n])/n - Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = - ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = - CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ - double exponentialAverageFactor, + /* MUST use factor=1 in the very first call of a complete training cycle. 
+ Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, - /* Used in Training phase only. - runningMean = newMean*factor + runningMean*(1-factor) */ - void *resultRunningMean, - /* Output in training mode, input in inference. Is the moving average - of variance[x] (factor is applied in the same way as for runningMean) */ - void *resultRunningVariance, + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ - double epsilon, + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and + backward functions. */ + double epsilon, - /* Optionally save intermediate results from the forward pass here - - can be reused to speed up backward pass. NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTraining"); + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. 
NULL if unused */ + void *resultSaveMean, void *resultSaveInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, double, void *, void *, double, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); + return func_ptr( + handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, + bnScale, bnBias, exponentialAverageFactor, resultRunningMean, + resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, - const void *bnBias, - const void *estimatedMean, - const void *estimatedVariance, - double epsilon ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardInference"); + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, + const void *bnBias, const void *estimatedMean, + const void *estimatedVariance, double epsilon) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, const void *, const void *, double); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon); + return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, + estimatedVariance, epsilon); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - /* Shared 
tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, /* bnBias doesn't affect backpropagation */ - /* scale and bias diff are not backpropagated below this layer */ - void *dBnScaleResult, - void *dBnBiasResult, - /* Same epsilon as forward pass */ - double epsilon, + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff, + const void *betaDataDiff, const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t dxDesc, void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *); + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, const void *savedInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const void *, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, void *, void *, double, const void *, const void *); static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance); + return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, + betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, + dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance); } cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t *stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); + cudnnSpatialTransformerDescriptor_t *stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc, - cudnnSamplerType_t samplerType, - cudnnDataType_t dataType, - const int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []); - static auto func_ptr = 
LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); + cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, const int nbDims, const int dimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, + const int, const int[]); + static auto func_ptr = + LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc, samplerType, dataType, nbDims, dimA); } cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); + cudnnSpatialTransformerDescriptor_t stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( - cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *theta, - void *grid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorForward"); + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, void *grid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, theta, grid); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( - cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *dgrid, - void *dtheta) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, void *dtheta) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, dgrid, dtheta); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( - cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *grid, - const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, 
const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward( - cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const void *alphaDgrid, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *grid, - const void *betaDgrid, - void *dgrid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *); + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx, + const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc, + const void *dy, const void *grid, const void *betaDgrid, void *dgrid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid); + return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, betaDgrid, dgrid); } -cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetStatesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, sizeInBytes); } 
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize( + cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetReserveSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(xdesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void * states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnSetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void * states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnRestoreDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float * dropout, - void ** states, - unsigned long long * seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *); +cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout, + void **states, unsigned long long *seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float *, void **, unsigned long long *); static auto func_ptr = LoadSymbol("cudnnGetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, seed); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t xdesc, - const void * x, - const cudnnTensorDescriptor_t ydesc, - void * y, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, 
void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutForward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, const void *x, + const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t dydesc, - const void * dy, - const cudnnTensorDescriptor_t dxdesc, - void * dx, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, const void *dy, + const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } -cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - const int minibatch, - const cudnnDataType_t dataType, - cudnnPersistentRNNPlan_t * plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, const int minibatch, + const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI 
*)(cudnnRNNDescriptor_t, const int, + const cudnnDataType_t, + cudnnPersistentRNNPlan_t *); static auto func_ptr = LoadSymbol("cudnnCreatePersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, minibatch, dataType, plan); } -cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnSetPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, plan); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnDestroyPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(plan); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t + dropoutDesc, /* Between layers, not between recurrent steps. 
*/ + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(cudnnHandle_t cudnnHandle, - cudnnRNNDescriptor_t rnnDesc, - int * hiddenSize, - int * numLayers, - cudnnDropoutDescriptor_t * dropoutDesc, - cudnnRNNInputMode_t * inputMode, - cudnnDirectionMode_t * direction, - cudnnRNNMode_t * mode, - cudnnRNNAlgo_t * algo, - cudnnDataType_t * dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor( + cudnnHandle_t cudnnHandle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize, + int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc, + cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction, + cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, + cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, + cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(cudnnHandle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(cudnnHandle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType (cudnnRNNDescriptor_t desc, cudnnMathType_t math) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t desc, + cudnnMathType_t math) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(desc, math); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return 
func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNTrainingReserveSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNParamsSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes, + cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, + size_t *, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnGetRNNParamsSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int layer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerMatDesc, - void ** linLayerMat) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, const int layer, + const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, + const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerMatrixParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat); + return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, + linLayerMatDesc, linLayerMat); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int layer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerBiasDesc, 
- void ** linLayerBias) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, const int layer, + const cudnnTensorDescriptor_t xDesc, const cudnnFilterDescriptor_t wDesc, + const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerBiasParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias); + return func_ptr(handle, rnnDesc, layer, xDesc, wDesc, w, linLayerID, + linLayerBiasDesc, linLayerBias); } -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t *yDesc, - void * y, - const cudnnTensorDescriptor_t hyDesc, - void * hy, - const cudnnTensorDescriptor_t cyDesc, - void * cy, - void * workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes); } 
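
For readers skimming these hunks: every stub in the cudnn_*.inc files follows the same dispatch shape that the reformatting preserves, so a single standalone sketch covers them all. Below is a minimal illustration of that pattern assuming a plain dlopen/dlsym loader; the names GetCudnnDsoHandle, LoadCudnnSymbol and cudnnGetVersionStub are illustrative only, while the real files resolve symbols through stream_executor's LoadSymbol and GetSymbolNotFoundError helpers.

// Minimal sketch (not part of the patch) of the lazy-dispatch pattern used
// by each generated stub: declare a function-pointer type matching the real
// cuDNN entry point, resolve it once into a function-local static, fail
// gracefully if the symbol is absent, and forward the arguments unchanged.
#include <dlfcn.h>

#include <cstddef>

namespace {

void* GetCudnnDsoHandle() {
  // Resolve and cache the cuDNN shared library exactly once.
  static void* handle = dlopen("libcudnn.so", RTLD_LAZY);
  return handle;
}

template <typename FuncPtr>
FuncPtr LoadCudnnSymbol(const char* symbol_name) {
  void* dso = GetCudnnDsoHandle();
  if (!dso) return nullptr;
  // Casting the void* from dlsym to a function pointer is the usual POSIX
  // idiom, though it is only conditionally supported by the C++ standard.
  return reinterpret_cast<FuncPtr>(dlsym(dso, symbol_name));
}

}  // namespace

extern "C" size_t cudnnGetVersionStub() {
  using FuncPtr = size_t (*)();
  static auto func_ptr = LoadCudnnSymbol<FuncPtr>("cudnnGetVersion");
  if (!func_ptr) return 0;  // mirrors the real cudnnGetVersion stub's fallback
  return func_ptr();
}

The function-local static makes the one-time symbol resolution thread-safe under C++11 magic statics, which is why each wrapper can be called freely without any explicit initialization step.
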
-cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t *yDesc, - void * y, - const cudnnTensorDescriptor_t hyDesc, - void * hy, - const cudnnTensorDescriptor_t cyDesc, - void * cy, - void * workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * yDesc, - const void * y, - const cudnnTensorDescriptor_t * dyDesc, - const void * dy, - const cudnnTensorDescriptor_t dhyDesc, - const void * dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void * dcy, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnTensorDescriptor_t * dxDesc, - void * dx, - const cudnnTensorDescriptor_t dhxDesc, - void * dhx, - const cudnnTensorDescriptor_t dcxDesc, - void * dcx, - void * workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const 
cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, + const void *y, const cudnnTensorDescriptor_t *dyDesc, + const void *dy, const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, const cudnnFilterDescriptor_t wDesc, + const void *w, const cudnnTensorDescriptor_t hxDesc, + const void *hx, const cudnnTensorDescriptor_t cxDesc, + const void *cx, const cudnnTensorDescriptor_t *dxDesc, + void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, + void *workspace, size_t workSpaceSizeInBytes, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t * yDesc, - const void * y, - const void * workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void * dw, - const void * reserveSpace, - size_t reserveSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const 
cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + const void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeights"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor( cudnnCTCLossDescriptor_t* ctcLossDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor( - cudnnCTCLossDescriptor_t ctcLossDesc, - cudnnDataType_t compType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); } cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor( - cudnnCTCLossDescriptor_t ctcLossDesc, - cudnnDataType_t* compType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor( cudnnCTCLossDescriptor_t ctcLossDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } -cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */ - const void * probs, /* probabilities after softmax, in GPU memory */ - const int * labels, /* labels, in CPU memory */ - const int * labelLengths, /* the length of each label, in CPU memory */ - const int * inputLengths, /* the lengths of timing steps in 
each batch, in CPU memory */ - void * costs, /* the returned costs of CTC, in GPU memory */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ - const void * gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - void * workspace, /* pointer to the workspace, in GPU memory */ - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the timing steps, N is the mini batch size, A + is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the dimensions are + T,N,A */ + const void *gradients, /* the returned CTC gradients, in GPU memory, to + compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void *workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, + const int *, const int *, void *, const cudnnTensorDescriptor_t, + const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnCTCLoss"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes); + return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A. 
To compute costs only, set it to NULL */ - const int * labels, /* labels, in CPU memory */ - const int * labelLengths, /* the length of each label, in CPU memory */ - const int * inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); + cudnnHandle_t handle, + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the timing steps, N is the mini batch size, A + is the alphabet size) */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the dimensions are + T,N,A. To compute costs only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const int *, const int *, const int *, + cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes); + return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, + inputLengths, algo, ctcLossDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t + dropoutDesc, /* Between layers, not between recurrent steps. 
*/ + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v6"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc, - int hiddenSize, - int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5( + cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers, + cudnnDropoutDescriptor_t + dropoutDesc, /* Between layers, not between recurrent steps. */ + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, + cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, + cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v5"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType); + return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, + direction, mode, dataType); } } // extern "C" diff --git a/tensorflow/stream_executor/cuda/cudnn_7_1.inc b/tensorflow/stream_executor/cuda/cudnn_7_1.inc index 9f4b28f3fe3..5330e6d0584 100644 --- a/tensorflow/stream_executor/cuda/cudnn_7_1.inc +++ b/tensorflow/stream_executor/cuda/cudnn_7_1.inc @@ -3,2279 +3,2359 @@ extern "C" { size_t CUDNNWINAPI cudnnGetVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetVersion"); if (!func_ptr) return 0; return func_ptr(); } size_t CUDNNWINAPI cudnnGetCudartVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetCudartVersion"); if (!func_ptr) return 0; return func_ptr(); } -const char * CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { - using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t); +const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { + using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t); static auto func_ptr = LoadSymbol("cudnnGetErrorString"); if (!func_ptr) return "cudnnGetErrorString symbol not found."; return func_ptr(status); } -cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError( - cudnnHandle_t handle, - cudnnStatus_t *rstatus, - cudnnErrQueryMode_t mode, - cudnnRuntimeTag_t *tag ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); +cudnnStatus_t CUDNNWINAPI 
cudnnQueryRuntimeError(cudnnHandle_t handle, + cudnnStatus_t *rstatus, + cudnnErrQueryMode_t mode, + cudnnRuntimeTag_t *tag) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); static auto func_ptr = LoadSymbol("cudnnQueryRuntimeError"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rstatus, mode, tag); } -cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, int *value) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *); static auto func_ptr = LoadSymbol("cudnnGetProperty"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(type, value); } -cudnnStatus_t CUDNNWINAPI cudnnCreate (cudnnHandle_t *handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *); static auto func_ptr = LoadSymbol("cudnnCreate"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI cudnnDestroy (cudnnHandle_t handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t); +cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t); static auto func_ptr = LoadSymbol("cudnnDestroy"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI cudnnSetStream (cudnnHandle_t handle, cudaStream_t streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); +cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); static auto func_ptr = LoadSymbol("cudnnSetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI cudnnGetStream (cudnnHandle_t handle, cudaStream_t *streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); static auto func_ptr = LoadSymbol("cudnnGetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor( - cudnnTensorDescriptor_t *tensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int); + cudnnTensorDescriptor_t 
tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w, /* width of input section */ - int nStride, - int cStride, - int hStride, - int wStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int); + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, int cStride, int hStride, int wStride) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, + int, int, int, int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, /* image data type */ - int *n, /* number of inputs (batch size) */ - int *c, /* number of input feature maps */ - int *h, /* height of input section */ - int *w, /* width of input section */ - int *nStride, - int *cStride, - int *hStride, - int *wStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *); + const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, int *cStride, int *hStride, int *wStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, + int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, - int nbDims, - const int dimA[], - const int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []); + cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, + const int dimA[], const int strideA[]) { + using FuncPtr = 
cudnnStatus_t(CUDNNWINAPI *)( + cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, - int nbDims, - const int dimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []); + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, int nbDims, const int dimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, nbDims, dimA); } cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, - int *nbDims, - int dimA[], - int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []); + const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, + cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, + cudnnDataType_t *, int *, int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( - const cudnnTensorDescriptor_t tensorDesc, - size_t *size) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); + const cudnnTensorDescriptor_t tensorDesc, size_t *size) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetTensorSizeInBytes"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, size); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor( - cudnnTensorDescriptor_t tensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnTransformTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void 
*); static auto func_ptr = LoadSymbol("cudnnTransformTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI cudnnAddTensor( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnAddTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C); } -cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor( - cudnnOpTensorDescriptor_t *opTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t opTensorOp, - cudnnDataType_t opTensorCompType, - cudnnNanPropagation_t opTensorNanOpt ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t); + cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, + cudnnDataType_t, cudnnNanPropagation_t); static auto func_ptr = LoadSymbol("cudnnSetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( - const cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t *opTensorOp, - cudnnDataType_t *opTensorCompType, - cudnnNanPropagation_t *opTensorNanOpt ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *); + const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, + cudnnNanPropagation_t *); static auto func_ptr = LoadSymbol("cudnnGetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); +cudnnStatus_t CUDNNWINAPI 
+cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnOpTensor( - cudnnHandle_t handle, - const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const cudnnTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A, + const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B, + const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnOpTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C); + return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, + beta, cDesc, C); } cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t *reduceTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateReduceTensorDescriptor"); + cudnnReduceTensorDescriptor_t *reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t reduceTensorOp, - cudnnDataType_t reduceTensorCompType, - cudnnNanPropagation_t reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); + cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, + cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); static auto func_ptr = LoadSymbol("cudnnSetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, 
reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t *reduceTensorOp, - cudnnDataType_t *reduceTensorCompType, - cudnnNanPropagation_t *reduceTensorNanOpt, - cudnnReduceTensorIndices_t *reduceTensorIndices, - cudnnIndicesType_t *reduceTensorIndicesType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *); + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, + cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, + cudnnIndicesType_t *); static auto func_ptr = LoadSymbol("cudnnGetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyReduceTensorDescriptor"); + cudnnReduceTensorDescriptor_t reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionIndicesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t 
*); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( - cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - void *indices, - size_t indicesSizeInBytes, - void *workspace, - size_t workspaceSizeInBytes, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, size_t indicesSizeInBytes, void *workspace, + size_t workspaceSizeInBytes, const void *alpha, + const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, + void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnReduceTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C); + return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, + workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, + C); } -cudnnStatus_t CUDNNWINAPI cudnnSetTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *valuePtr ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *valuePtr) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnSetTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, valuePtr); } -cudnnStatus_t CUDNNWINAPI cudnnScaleTensor( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, - const void *alpha ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *alpha) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnScaleTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, alpha); } -cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor( - cudnnFilterDescriptor_t *filterDesc ) { - using 
FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int); + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( - const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *); + const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, + int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int nbDims, - const int filterDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []); + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, nbDims, filterDimA); } cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( - const cudnnFilterDescriptor_t filterDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *nbDims, - int filterDimA[] ) { - using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []); + const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, int, cudnnDataType_t *, + cudnnTensorFormat_t *, int *, int[]); static auto func_ptr = LoadSymbol("cudnnGetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA); + return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, + filterDimA); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor( - cudnnFilterDescriptor_t filterDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } -cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor( - cudnnConvolutionDescriptor_t *convDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateConvolutionDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc, - cudnnMathType_t mathType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( cudnnConvolutionDescriptor_t convDesc, - cudnnMathType_t *mathType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc, - int groupCount ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); static auto func_ptr = LoadSymbol("cudnnSetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return 
func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( cudnnConvolutionDescriptor_t convDesc, - int *groupCount ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int *groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( cudnnConvolutionDescriptor_t convDesc, - int pad_h, /* zero-padding height */ - int pad_w, /* zero-padding width */ - int u, /* vertical filter stride */ - int v, /* horizontal filter stride */ - int dilation_h, /* filter dilation in the vertical dimension */ - int dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( + cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, int, int, int, int, int, + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( const cudnnConvolutionDescriptor_t convDesc, - int* pad_h, /* zero-padding height */ - int* pad_w, /* zero-padding width */ - int* u, /* vertical filter stride */ - int* v, /* horizontal filter stride */ - int* dilation_h, /* filter dilation in the vertical dimension */ - int* dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t* mode, - cudnnDataType_t *computeType - ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, + int *, cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, 
pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( - cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int dilationA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t); + cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */ + const int padA[], const int filterStrideA[], const int dilationA[], + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[], + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, + computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( - const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int dilationA[], - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *); + const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested, + int *arrayLength, int padA[], int strideA[], int dilationA[], + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[], + cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, + dilationA, mode, computeType); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( - const 
cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int nbDims, - int tensorOutputDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int nbDims, + int tensorOutputDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOutputDimA); + return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, + tensorOutputDimA); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor( - cudnnConvolutionDescriptor_t convDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyConvolutionDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount( cudnnHandle_t handle, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI +cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + 
LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - void *y, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionFwdAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, + cudnnConvolutionFwdAlgo_t *); + static 
auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, + memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnFilterDescriptor_t filterDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t destDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - 
size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y); + return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( - cudnnHandle_t handle, - const void *alpha1, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *alpha2, - const cudnnTensorDescriptor_t zDesc, - const void *z, - const cudnnTensorDescriptor_t biasDesc, - const void *bias, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBiasActivationForward"); + cudnnHandle_t handle, const void *alpha1, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2, + const cudnnTensorDescriptor_t zDesc, const void *z, + const cudnnTensorDescriptor_t biasDesc, const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnConvolutionBiasActivationForward"); if 
(!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y); + return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, + activationDesc, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dbDesc, - void *db ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dbDesc, void *db) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardBias"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( cudnnHandle_t handle, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, + 
returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, void *dw, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdFilterAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, + size_t, cudnnConvolutionBwdFilterAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, 
convDesc, dwDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, + memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( - cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, 
- const cudnnFilterDescriptor_t dwDesc, - void *dw ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnFilterDescriptor_t dwDesc, void *dw) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, + void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardFilter"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); + return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); } -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( cudnnHandle_t handle, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, 
wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, void *dx, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdDataAlgo_t *algo ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, + size_t, cudnnConvolutionBwdDataAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return 
func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, + memoryLimitInBytes, algo); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( - cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( - cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void 
*beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const void *alpha, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); + return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnIm2Col( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - void *colBuffer ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI +cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const void *x, const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, + const void *, const cudnnFilterDescriptor_t, + const cudnnConvolutionDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnIm2Col"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer); } cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI 
cudnnSoftmaxBackward( - cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx); + return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, + dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor( - cudnnPoolingDescriptor_t *poolingDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreatePoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int); + cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, + int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const 
cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); + const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight, + int *windowWidth, int *verticalPadding, int *horizontalPadding, + int *verticalStride, int *horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( - cudnnPoolingDescriptor_t poolingDesc, - const cudnnPoolingMode_t mode, - const cudnnNanPropagation_t maxpoolingNanOpt, - int nbDims, - const int windowDimA[], - const int paddingA[], - const int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []); + cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, + const int windowDimA[], const int paddingA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, + const cudnnNanPropagation_t, int, const int[], const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, + paddingA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, - int nbDimsRequested, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *nbDims, - int windowDimA[], - int paddingA[], - int strideA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []); + const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, + cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, int windowDimA[], int paddingA[], int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int[], int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, + windowDimA, paddingA, strideA); } -cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, - int outputTensorDimA[] ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, 
int []); - static auto func_ptr = LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI +cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int nbDims, int outputTensorDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA); } -cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim( - const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int *n, - int *c, - int *h, - int *w ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetPooling2dForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI +cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, + int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetPooling2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor( - cudnnPoolingDescriptor_t poolingDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyPoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( - cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI 
*)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor( - cudnnActivationDescriptor_t *activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( - cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t mode, - cudnnNanPropagation_t reluNanOpt, - double coef ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double); + cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, double coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t, + cudnnActivationMode_t, + cudnnNanPropagation_t, double); static auto func_ptr = LoadSymbol("cudnnSetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); } -cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor( - const cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, - double* coef ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *); +cudnnStatus_t CUDNNWINAPI +cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, + cudnnActivationMode_t *mode, + cudnnNanPropagation_t *reluNanOpt, double *coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnActivationDescriptor_t, cudnnActivationMode_t *, + cudnnNanPropagation_t *, double *); static auto func_ptr = LoadSymbol("cudnnGetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor( - cudnnActivationDescriptor_t activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t); - static auto func_ptr = 
LoadSymbol("cudnnDestroyActivationDescriptor"); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } cudnnStatus_t CUDNNWINAPI cudnnActivationForward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( - cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor( - cudnnLRNDescriptor_t *normDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc); } 
-cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned lrnN, - double lrnAlpha, - double lrnBeta, - double lrnK ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double); +cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned lrnN, double lrnAlpha, + double lrnBeta, double lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int, double, double, double); static auto func_ptr = LoadSymbol("cudnnSetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor( - cudnnLRNDescriptor_t normDesc, - unsigned* lrnN, - double* lrnAlpha, - double* lrnBeta, - double* lrnK ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned *lrnN, + double *lrnAlpha, + double *lrnBeta, double *lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); static auto func_ptr = LoadSymbol("cudnnGetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor( cudnnLRNDescriptor_t lrnDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(lrnDesc); } cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void* alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const 
void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, + x, beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationForward"); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, + const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, + beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( - cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - const void *dy, - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ - void *dx, /* output x differential */ - void *dMeans ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const 
void *, const cudnnTensorDescriptor_t, void *, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationBackward"); + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t + xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *, void *, const void *, const cudnnTensorDescriptor_t, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, + temp2, beta, dXdMeansDesc, dx, dMeans); } cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( - cudnnTensorDescriptor_t derivedBnDesc, - const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t); + cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, + cudnnBatchNormMode_t); static auto func_ptr = LoadSymbol("cudnnDeriveBNTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(derivedBnDesc, xDesc, mode); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, + cudnnHandle_t handle, cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ - /* Shared desc for the next 6 tensors in the argument list. - Data type to be set as follows: - type = (typeOf(x) == double) ? double : float - Dimensions for this descriptor depend on normalization mode - - Spatial Normalization : tensors are expected to have dims 1xCx1x1 - (normalization is performed across NxHxW) - - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW - (normalization is performed across N) */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + /* Shared desc for the next 6 tensors in the argument list. + Data type to be set as follows: + type = (typeOf(x) == double) ? 
double : float + Dimensions for this descriptor depend on normalization mode + - Spatial Normalization : tensors are expected to have dims 1xCx1x1 + (normalization is performed across NxHxW) + - Per-Activation Normalization : tensors are expected to have dims of + 1xCxHxW (normalization is performed across N) */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ - const void *bnScale, - const void *bnBias, + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation + */ + const void *bnScale, const void *bnBias, - /* MUST use factor=1 in the very first call of a complete training cycle. - Use a factor=1/(1+n) at N-th call to the function to get - Cumulative Moving Average (CMA) behavior - CMA[n] = (x[1]+...+x[n])/n - Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = - ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = - CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ - double exponentialAverageFactor, + /* MUST use factor=1 in the very first call of a complete training cycle. + Use a factor=1/(1+n) at N-th call to the function to get + Cumulative Moving Average (CMA) behavior + CMA[n] = (x[1]+...+x[n])/n + Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = + ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = + CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ + double exponentialAverageFactor, - /* Used in Training phase only. - runningMean = newMean*factor + runningMean*(1-factor) */ - void *resultRunningMean, - /* Output in training mode, input in inference. Is the moving average - of variance[x] (factor is applied in the same way as for runningMean) */ - void *resultRunningVariance, + /* Used in Training phase only. + runningMean = newMean*factor + runningMean*(1-factor) */ + void *resultRunningMean, + /* Output in training mode, input in inference. Is the moving average + of variance[x] (factor is applied in the same way as for runningMean) */ + void *resultRunningVariance, - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ - double epsilon, + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and + backward functions. */ + double epsilon, - /* Optionally save intermediate results from the forward pass here - - can be reused to speed up backward pass. NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTraining"); + /* Optionally save intermediate results from the forward pass here + - can be reused to speed up backward pass. 
NULL if unused */ + void *resultSaveMean, void *resultSaveInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, double, void *, void *, double, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); + return func_ptr( + handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, + bnScale, bnBias, exponentialAverageFactor, resultRunningMean, + resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, - const void *bnBias, - const void *estimatedMean, - const void *estimatedVariance, - double epsilon ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardInference"); + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, + const void *bnBias, const void *estimatedMean, + const void *estimatedVariance, double epsilon) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, const void *, const void *, double); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon); + return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, + estimatedVariance, epsilon); } cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - /* Shared 
tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, /* bnBias doesn't affect backpropagation */ - /* scale and bias diff are not backpropagated below this layer */ - void *dBnScaleResult, - void *dBnBiasResult, - /* Same epsilon as forward pass */ - double epsilon, + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff, + const void *betaDataDiff, const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t dxDesc, void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *); + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, const void *savedInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const void *, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, void *, void *, double, const void *, const void *); static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance); + return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, + betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, + dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance); } cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t *stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); + cudnnSpatialTransformerDescriptor_t *stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc, - cudnnSamplerType_t samplerType, - cudnnDataType_t dataType, - const int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []); - static auto func_ptr = 
LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); + cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, const int nbDims, const int dimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, + const int, const int[]); + static auto func_ptr = + LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc, samplerType, dataType, nbDims, dimA); } cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); + cudnnSpatialTransformerDescriptor_t stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( - cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *theta, - void *grid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorForward"); + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, void *grid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, theta, grid); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( - cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *dgrid, - void *dtheta) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, void *dtheta) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, dgrid, dtheta); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( - cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *grid, - const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *); + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, 
const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y); } cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward( - cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const void *alphaDgrid, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *grid, - const void *betaDgrid, - void *dgrid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *); + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx, + const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc, + const void *dy, const void *grid, const void *betaDgrid, void *dgrid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid); + return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, betaDgrid, dgrid); } -cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t * dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t * sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetStatesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, sizeInBytes); } 
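The dropout entry points wrapped in this stretch of the file forward to the standard cuDNN dropout API whose signatures appear in the diff itself. Below is a rough usage sketch of how a caller typically strings them together, assuming those cuDNN 7.x signatures; the SetupDropout helper name is invented for illustration and error checking is omitted.

```cpp
// Rough usage sketch for the dropout entry points wrapped here, assuming the
// cuDNN signatures shown in this file; status codes are ignored for brevity.
#include <cudnn.h>
#include <cuda_runtime.h>

void SetupDropout(cudnnHandle_t handle, cudnnDropoutDescriptor_t* dropout_desc,
                  void** states, float dropout_prob, unsigned long long seed) {
  // Ask cuDNN how much device memory its dropout RNG state needs.
  size_t state_size = 0;
  cudnnDropoutGetStatesSize(handle, &state_size);

  // Allocate the state buffer on the GPU and bind it to a new descriptor.
  cudaMalloc(states, state_size);
  cudnnCreateDropoutDescriptor(dropout_desc);
  cudnnSetDropoutDescriptor(*dropout_desc, handle, dropout_prob, *states,
                            state_size, seed);
}
```

Because the wrappers only change where the symbols are resolved, a caller written against the regular cuDNN headers, as above, works unmodified against these stubs.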
-cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t * sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize( + cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetReserveSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(xdesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void * states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnSetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void * states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnRestoreDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float * dropout, - void ** states, - unsigned long long * seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *); +cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout, + void **states, unsigned long long *seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float *, void **, unsigned long long *); static auto func_ptr = LoadSymbol("cudnnGetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, seed); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutForward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t xdesc, - const void * x, - const cudnnTensorDescriptor_t ydesc, - void * y, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, 
void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutForward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, const void *x, + const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t dydesc, - const void * dy, - const cudnnTensorDescriptor_t dxdesc, - void * dx, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, const void *dy, + const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t * rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount( - cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + 
LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t *yDesc, - void * y, - const cudnnTensorDescriptor_t hyDesc, - void * hy, - const cudnnTensorDescriptor_t cyDesc, - void * cy, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void * workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount( - cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); 
+ cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t *yDesc, - void * y, - const cudnnTensorDescriptor_t hyDesc, - void * hy, - const cudnnTensorDescriptor_t cyDesc, - void * cy, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void * workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, + requestedAlgoCount, returnedAlgoCount, 
perfResults, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount( - cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * yDesc, - const void * y, - const cudnnTensorDescriptor_t * dyDesc, - const void * dy, - const cudnnTensorDescriptor_t dhyDesc, - const void * dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void * dcy, - const cudnnFilterDescriptor_t wDesc, - const void * w, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t cxDesc, - const void * cx, - const cudnnTensorDescriptor_t * dxDesc, - void * dx, - const cudnnTensorDescriptor_t dhxDesc, - void * dhx, - const cudnnTensorDescriptor_t dcxDesc, - void * dcx, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void * workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y, + const cudnnTensorDescriptor_t *dyDesc, const void *dy, + const cudnnTensorDescriptor_t dhyDesc, const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, const void *dcy, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnTensorDescriptor_t *dxDesc, void *dx, + const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const 
cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount( - cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t * xDesc, - const void * x, - const cudnnTensorDescriptor_t hxDesc, - const void * hx, - const cudnnTensorDescriptor_t * yDesc, - const void * y, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - const void * workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void * dw, - const void * reserveSpace, - size_t reserveSpaceSizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, + const float findIntensity, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults, + const void *workspace, size_t workSpaceSizeInBytes, + const 
cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + findIntensity, requestedAlgoCount, returnedAlgoCount, + perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - const int minibatch, - const cudnnDataType_t dataType, - cudnnPersistentRNNPlan_t * plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, const int minibatch, + const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, + const cudnnDataType_t, + cudnnPersistentRNNPlan_t *); static auto func_ptr = LoadSymbol("cudnnCreatePersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, minibatch, dataType, plan); } -cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnSetPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, plan); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnDestroyPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(plan); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. 
*/ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t + dropoutDesc, /* Between layers, not between recurrent steps. */ + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNProjectionLayers(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int recProjSize, - const int outProjSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, + const int recProjSize, const int outProjSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); static auto func_ptr = LoadSymbol("cudnnSetRNNProjectionLayers"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, recProjSize, outProjSize); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - int *recProjSize, - int *outProjSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize, + int *outProjSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetRNNProjectionLayers"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, recProjSize, outProjSize); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnSetRNNAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, algoDesc); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - int * 
hiddenSize, - int * numLayers, - cudnnDropoutDescriptor_t * dropoutDesc, - cudnnRNNInputMode_t * inputMode, - cudnnDirectionMode_t * direction, - cudnnRNNMode_t * mode, - cudnnRNNAlgo_t * algo, - cudnnDataType_t * dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize, + int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc, + cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction, + cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, + cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, + cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); +cudnnStatus_t CUDNNWINAPI +cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, mType); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t* mType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType( + cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, mType); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNTrainingReserveSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNParamsSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI +cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes, + cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, + size_t *, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnGetRNNParamsSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerMatDesc, - void **linLayerMat) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerMatrixParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, + linLayerMatDesc, linLayerMat); } -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerBiasDesc, - void **linLayerBias) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); 
+cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerBiasParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, + linLayerBiasDesc, linLayerBias); } -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void 
*hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - void *workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardData( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const cudnnTensorDescriptor_t *dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnTensorDescriptor_t *dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - void *workspace, - size_t workSpaceSizeInBytes, - void * reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const 
cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI +cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, + const void *y, const cudnnTensorDescriptor_t *dyDesc, + const void *dy, const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, const cudnnFilterDescriptor_t wDesc, + const void *w, const cudnnTensorDescriptor_t hxDesc, + const void *hx, const cudnnTensorDescriptor_t cxDesc, + const void *cx, const cudnnTensorDescriptor_t *dxDesc, + void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, + void *workspace, size_t workSpaceSizeInBytes, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const void *workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + const void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const 
cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeights"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor( cudnnCTCLossDescriptor_t* ctcLossDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor( - cudnnCTCLossDescriptor_t ctcLossDesc, - cudnnDataType_t compType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); } cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor( - cudnnCTCLossDescriptor_t ctcLossDesc, - cudnnDataType_t* compType ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor( cudnnCTCLossDescriptor_t ctcLossDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } -cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */ - const void * probs, /* probabilities after softmax, in GPU memory */ - const int * labels, /* labels, in CPU memory */ - const int * labelLengths, /* the length of each label, in CPU memory */ - const int * inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - void * costs, /* the returned costs of CTC, in GPU memory */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ - const void * gradients, /* the returned CTC gradients, in GPU memory, to 
compute costs only, set it to NULL */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - void * workspace, /* pointer to the workspace, in GPU memory */ - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( + cudnnHandle_t handle, + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the timing steps, N is the mini batch size, A + is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the dimensions are + T,N,A */ + const void *gradients, /* the returned CTC gradients, in GPU memory, to + compute costs only, set it to NULL */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, + void *workspace, /* pointer to the workspace, in GPU memory */ + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, + const int *, const int *, void *, const cudnnTensorDescriptor_t, + const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnCTCLoss"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes); + return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, + workSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the mini batch size, A is the alphabet size) */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A. 
To compute costs only, set it to NULL */ - const int * labels, /* labels, in CPU memory */ - const int * labelLengths, /* the length of each label, in CPU memory */ - const int * inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - size_t *sizeInBytes ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); + cudnnHandle_t handle, + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the timing steps, N is the mini batch size, A + is the alphabet size) */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the dimensions are + T,N,A. To compute costs only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const int *, const int *, const int *, + cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes); + return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, + inputLengths, algo, ctcLossDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmDescriptor( - cudnnAlgorithmDescriptor_t *algoDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); +cudnnStatus_t CUDNNWINAPI +cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc); } cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor( - cudnnAlgorithmDescriptor_t algoDesc, - cudnnAlgorithm_t algorithm) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t); + cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, + cudnnAlgorithm_t); static auto func_ptr = LoadSymbol("cudnnSetAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc, algorithm); } cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor( - const cudnnAlgorithmDescriptor_t algoDesc, - cudnnAlgorithm_t* algorithm) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t *); + const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, + cudnnAlgorithm_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc, algorithm); } cudnnStatus_t 
CUDNNWINAPI cudnnCopyAlgorithmDescriptor( - const cudnnAlgorithmDescriptor_t src, - cudnnAlgorithmDescriptor_t dest) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithmDescriptor_t); + const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, + cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnCopyAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(src, dest); } -cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmDescriptor( - cudnnAlgorithmDescriptor_t algoDesc ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI +cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc); } cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance( - cudnnAlgorithmPerformance_t* algoPerf, - int numberToCreate ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); + cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, numberToCreate); } cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance( - cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t algoDesc, - cudnnStatus_t status, - float time, - size_t memory ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t, cudnnStatus_t, float, size_t); + cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc, + cudnnStatus_t status, float time, size_t memory) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, + cudnnAlgorithmDescriptor_t, + cudnnStatus_t, float, size_t); static auto func_ptr = LoadSymbol("cudnnSetAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, algoDesc, status, time, memory); } cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance( - const cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t* algoDesc, - cudnnStatus_t* status, - float* time, - size_t* memory ) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, cudnnStatus_t *, float *, size_t *); + const cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time, + size_t *memory) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, + cudnnStatus_t *, float *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, algoDesc, status, time, memory); } cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance( - cudnnAlgorithmPerformance_t* algoPerf, - int numberToDestroy) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); - static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmPerformance"); + cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { + using FuncPtr = + 
cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); + static auto func_ptr = + LoadSymbol("cudnnDestroyAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, numberToDestroy); } cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize( - cudnnHandle_t handle, - cudnnAlgorithmDescriptor_t algoDesc, - size_t* algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); + cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, + size_t *algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoDesc, algoSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI cudnnSaveAlgorithm( - cudnnHandle_t handle, - cudnnAlgorithmDescriptor_t algoDesc, - void* algoSpace, - size_t algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI +cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, + void *algoSpace, size_t algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); static auto func_ptr = LoadSymbol("cudnnSaveAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm( - cudnnHandle_t handle, - void* algoSpace, - size_t algoSpaceSizeInBytes, - cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, cudnnAlgorithmDescriptor_t); + cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, + cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnRestoreAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc); } -cudnnStatus_t CUDNNWINAPI cudnnSetCallback( - unsigned mask, - void *udata, - cudnnCallback_t fptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); +cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata, + cudnnCallback_t fptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); static auto func_ptr = LoadSymbol("cudnnSetCallback"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(mask, udata, fptr); } -cudnnStatus_t CUDNNWINAPI cudnnGetCallback( - unsigned *mask, - void **udata, - cudnnCallback_t *fptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata, + cudnnCallback_t *fptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); static auto func_ptr = LoadSymbol("cudnnGetCallback"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(mask, udata, fptr); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, 
- cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, cudnnDropoutDescriptor_t dropoutDesc, + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v6"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc, - int hiddenSize, - int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5( + cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, cudnnRNNMode_t mode, + cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, + cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, + cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v5"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType); + return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, + direction, mode, dataType); } } // extern "C" diff --git a/tensorflow/stream_executor/cuda/cudnn_7_3.inc b/tensorflow/stream_executor/cuda/cudnn_7_3.inc index 0ee8e1492d5..f1c25c74d0c 100644 --- a/tensorflow/stream_executor/cuda/cudnn_7_3.inc +++ b/tensorflow/stream_executor/cuda/cudnn_7_3.inc @@ -2,73 +2,71 @@ extern "C" { -size_t CUDNNWINAPI -cudnnGetVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); +size_t CUDNNWINAPI cudnnGetVersion(void) { + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetVersion"); if (!func_ptr) return 0; return func_ptr(); } -size_t CUDNNWINAPI -cudnnGetCudartVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); +size_t CUDNNWINAPI cudnnGetCudartVersion(void) { + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetCudartVersion"); if (!func_ptr) return 0; return func_ptr(); } -const char *CUDNNWINAPI -cudnnGetErrorString(cudnnStatus_t status) { - using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t); +const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { + using FuncPtr = const char 
*(CUDNNWINAPI *)(cudnnStatus_t); static auto func_ptr = LoadSymbol("cudnnGetErrorString"); if (!func_ptr) return "cudnnGetErrorString symbol not found."; return func_ptr(status); } -cudnnStatus_t CUDNNWINAPI -cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); +cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle, + cudnnStatus_t *rstatus, + cudnnErrQueryMode_t mode, + cudnnRuntimeTag_t *tag) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); static auto func_ptr = LoadSymbol("cudnnQueryRuntimeError"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rstatus, mode, tag); } -cudnnStatus_t CUDNNWINAPI -cudnnGetProperty(libraryPropertyType type, int *value) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *); static auto func_ptr = LoadSymbol("cudnnGetProperty"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(type, value); } -cudnnStatus_t CUDNNWINAPI -cudnnCreate(cudnnHandle_t *handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *); static auto func_ptr = LoadSymbol("cudnnCreate"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroy(cudnnHandle_t handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t); +cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t); static auto func_ptr = LoadSymbol("cudnnDestroy"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI -cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); +cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); static auto func_ptr = LoadSymbol("cudnnSetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI -cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); static auto func_ptr = LoadSymbol("cudnnGetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); @@ -76,100 +74,97 @@ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) { cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } -cudnnStatus_t 
CUDNNWINAPI -cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w, /* width of input section */ - int nStride, - int cStride, - int hStride, - int wStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, int cStride, int hStride, int wStride) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, + int, int, int, int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, /* image data type */ - int *n, /* number of inputs (batch size) */ - int *c, /* number of input feature maps */ - int *h, /* height of input section */ - int *w, /* width of input section */ - int *nStride, - int *cStride, - int *hStride, - int *wStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( + const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, int *cStride, int *hStride, int *wStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, + int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return 
func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, - int nbDims, - const int dimA[], - const int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( + cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, + const int dimA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, - int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, int nbDims, const int dimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, nbDims, dimA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, - int *nbDims, - int dimA[], - int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []); +cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( + const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, + cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, + cudnnDataType_t *, int *, int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( + const cudnnTensorDescriptor_t tensorDesc, size_t *size) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetTensorSizeInBytes"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, size); @@ -177,35 +172,33 @@ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t); static auto func_ptr = 
LoadSymbol("cudnnDestroyTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnTransformTensor(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnTransformTensor( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnTransformTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnAddTensor(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnAddTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C); @@ -213,29 +206,29 @@ cudnnAddTensor(cudnnHandle_t handle, cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t opTensorOp, - cudnnDataType_t opTensorCompType, - cudnnNanPropagation_t opTensorNanOpt) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t); +cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( + cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, + cudnnDataType_t, cudnnNanPropagation_t); static auto func_ptr = LoadSymbol("cudnnSetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } -cudnnStatus_t CUDNNWINAPI -cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t *opTensorOp, - cudnnDataType_t *opTensorCompType, - cudnnNanPropagation_t 
*opTensorNanOpt) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( + const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, + cudnnNanPropagation_t *); static auto func_ptr = LoadSymbol("cudnnGetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); @@ -243,126 +236,136 @@ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnOpTensor(cudnnHandle_t handle, - const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const cudnnTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnOpTensor( + cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A, + const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B, + const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnOpTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C); + return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, + beta, cDesc, C); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateReduceTensorDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t *reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t reduceTensorOp, - cudnnDataType_t reduceTensorCompType, - cudnnNanPropagation_t 
reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, + cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); static auto func_ptr = LoadSymbol("cudnnSetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t *reduceTensorOp, - cudnnDataType_t *reduceTensorCompType, - cudnnNanPropagation_t *reduceTensorNanOpt, - cudnnReduceTensorIndices_t *reduceTensorIndices, - cudnnIndicesType_t *reduceTensorIndicesType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, + cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, + cudnnIndicesType_t *); static auto func_ptr = LoadSymbol("cudnnGetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyReduceTensorDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReductionIndicesSize(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const 
cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionIndicesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnReduceTensor(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - void *indices, - size_t indicesSizeInBytes, - void *workspace, - size_t workspaceSizeInBytes, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, size_t indicesSizeInBytes, void *workspace, + size_t workspaceSizeInBytes, const void *alpha, + const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, + void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnReduceTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C); + return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, + workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, + C); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t 
yDesc, void *y, const void *valuePtr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *valuePtr) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnSetTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, valuePtr); } -cudnnStatus_t CUDNNWINAPI -cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *alpha) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnScaleTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, alpha); @@ -370,68 +373,70 @@ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( + const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input 
filter */ + int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, + int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int nbDims, - const int filterDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, nbDims, filterDimA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *nbDims, - int filterDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []); +cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( + const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, int, cudnnDataType_t *, + cudnnTensorFormat_t *, int *, int[]); static auto func_ptr = LoadSymbol("cudnnGetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA); + return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, + filterDimA); } cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); @@ -439,622 +444,657 @@ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateConvolutionDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, 
cudnnMathType_t mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); static auto func_ptr = LoadSymbol("cudnnSetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int *groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, - int pad_h, /* zero-padding height */ - int pad_w, /* zero-padding width */ - int u, /* vertical filter stride */ - int v, /* horizontal filter stride */ - int dilation_h, /* filter dilation in the vertical dimension */ - int dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( + cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, int, int, int, int, int, + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI 
-cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, - int *pad_h, /* zero-padding height */ - int *pad_w, /* zero-padding width */ - int *u, /* vertical filter stride */ - int *v, /* horizontal filter stride */ - int *dilation_h, /* filter dilation in the vertical dimension */ - int *dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, + int *, cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int dilationA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( + cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */ + const int padA[], const int filterStrideA[], const int dilationA[], + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[], + cudnnConvolutionMode_t, 
cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int dilationA[], - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( + const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested, + int *arrayLength, int padA[], int strideA[], int dilationA[], + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[], + cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, + dilationA, mode, computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int nbDims, - int tensorOutputDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int nbDims, + int tensorOutputDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOutputDimA); + return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, + tensorOutputDimA); } cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyConvolutionDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto 
func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - void *y, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, 
y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionFwdAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, + cudnnConvolutionFwdAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnFilterDescriptor_t filterDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t destDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7( + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, + requestedAlgoCount, 
returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionForward(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y); + return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, - const void *alpha1, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *alpha2, 
- const cudnnTensorDescriptor_t zDesc, - const void *z, - const cudnnTensorDescriptor_t biasDesc, - const void *bias, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBiasActivationForward"); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( + cudnnHandle_t handle, const void *alpha1, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2, + const cudnnTensorDescriptor_t zDesc, const void *z, + const cudnnTensorDescriptor_t biasDesc, const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnConvolutionBiasActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y); + return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, + activationDesc, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardBias(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dbDesc, - void *db) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dbDesc, void *db) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardBias"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( 
+ cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, void *dw, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, + const int, int *, 
cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdFilterAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, + size_t, cudnnConvolutionBwdFilterAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7( + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, 
+ cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardFilter(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t dwDesc, - void *dw) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnFilterDescriptor_t dwDesc, void *dw) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, + void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardFilter"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, 
dwDesc, dw); + return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + 
const cudnnTensorDescriptor_t dxDesc, void *dx, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdDataAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, + size_t, cudnnConvolutionBwdDataAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7( + cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, + const 
cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardData(cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( + cudnnHandle_t handle, const void *alpha, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, + 
const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); + return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI -cudnnIm2Col(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - void *colBuffer) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *); +cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const void *x, const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, + const void *, const cudnnFilterDescriptor_t, + const cudnnConvolutionDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnIm2Col"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer); } -cudnnStatus_t CUDNNWINAPI -cudnnSoftmaxForward(cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnSoftmaxBackward(cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward( + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, 
+ const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx); + return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, + dx); } cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreatePoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( + cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, + int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } -cudnnStatus_t CUDNNWINAPI -cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight, + int *windowWidth, int *verticalPadding, int *horizontalPadding, + int *verticalStride, int *horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return 
func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, - const cudnnPoolingMode_t mode, - const cudnnNanPropagation_t maxpoolingNanOpt, - int nbDims, - const int windowDimA[], - const int paddingA[], - const int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( + cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, + const int windowDimA[], const int paddingA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, + const cudnnNanPropagation_t, int, const int[], const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, + paddingA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, - int nbDimsRequested, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *nbDims, - int windowDimA[], - int paddingA[], - int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []); +cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, + cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, int windowDimA[], int paddingA[], int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int[], int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, + windowDimA, paddingA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, - int outputTensorDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); + int nbDims, int outputTensorDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA); } @@ -1062,72 +1102,69 @@ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, - int *n, - 
int *c, - int *h, - int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetPooling2dForwardOutputDim"); + int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, + int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetPooling2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyPoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnPoolingForward(cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnPoolingBackward(cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingBackward"); if 
(!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t mode, - cudnnNanPropagation_t reluNanOpt, - double coef) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double); +cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( + cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, double coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t, + cudnnActivationMode_t, + cudnnNanPropagation_t, double); static auto func_ptr = LoadSymbol("cudnnSetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); @@ -1136,9 +1173,10 @@ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, - double *coef) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *); + cudnnNanPropagation_t *reluNanOpt, double *coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnActivationDescriptor_t, cudnnActivationMode_t *, + cudnnNanPropagation_t *, double *); static auto func_ptr = LoadSymbol("cudnnGetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); @@ -1146,65 +1184,68 @@ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyActivationDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnActivationForward(cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnActivationForward( + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const 
cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnActivationBackward(cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double); +cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned lrnN, double lrnAlpha, + double lrnBeta, double lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int, double, double, double); static auto func_ptr = LoadSymbol("cudnnSetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI -cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned *lrnN, + double *lrnAlpha, + double 
*lrnBeta, double *lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); static auto func_ptr = LoadSymbol("cudnnGetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); @@ -1212,110 +1253,104 @@ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrn cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(lrnDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnLRNCrossChannelForward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnLRNCrossChannelBackward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, lrnMode, alpha, 
yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, + x, beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI -cudnnDivisiveNormalizationForward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationForward"); +cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, + const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, + beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - const void *dy, - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ - void *dx, /* output x differential */ - void *dMeans) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationBackward"); +cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t + xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *, void *, const void *, const 
cudnnTensorDescriptor_t, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, + temp2, beta, dXdMeansDesc, dx, dMeans); } -cudnnStatus_t CUDNNWINAPI -cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, - const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t); +cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( + cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, + cudnnBatchNormMode_t); static auto func_ptr = LoadSymbol("cudnnDeriveBNTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(derivedBnDesc, xDesc, mode); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alpha, /* alpha[0] = result blend factor */ const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ /* Shared desc for the next 6 tensors in the argument list. Data type to be set as follows: @@ -1323,13 +1358,13 @@ cudnnBatchNormalizationForwardTraining( Dimensions for this descriptor depend on normalization mode - Spatial Normalization : tensors are expected to have dims 1xCx1x1 (normalization is performed across NxHxW) - - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW - (normalization is performed across N) */ + - Per-Activation Normalization : tensors are expected to have dims of + 1xCxHxW (normalization is performed across N) */ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ - const void *bnScale, - const void *bnBias, + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation + */ + const void *bnScale, const void *bnBias, /* MUST use factor=1 in the very first call of a complete training cycle. Use a factor=1/(1+n) at N-th call to the function to get @@ -1347,162 +1382,173 @@ cudnnBatchNormalizationForwardTraining( of variance[x] (factor is applied in the same way as for runningMean) */ void *resultRunningVariance, - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and + backward functions. */ double epsilon, /* Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. 
NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTraining"); + void *resultSaveMean, void *resultSaveInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, double, void *, void *, double, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); + return func_ptr( + handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, + bnScale, bnBias, exponentialAverageFactor, resultRunningMean, + resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, - const void *bnBias, - const void *estimatedMean, - const void *estimatedVariance, - double epsilon) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardInference"); +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, + const void *bnBias, const void *estimatedMean, + const void *estimatedVariance, double epsilon) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, const void *, const void *, double); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon); + return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, + 
bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, + estimatedVariance, epsilon); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationBackward(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - /* Shared tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, /* bnBias doesn't affect backpropagation */ - /* scale and bias diff are not backpropagated below this layer */ - void *dBnScaleResult, - void *dBnBiasResult, - /* Same epsilon as forward pass */ - double epsilon, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff, + const void *betaDataDiff, const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t dxDesc, void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *); + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, const void *savedInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const void *, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, void *, void *, double, const void *, const void *); static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance); + return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, + betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, + dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( + 
cudnnSpatialTransformerDescriptor_t *stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, - cudnnSamplerType_t samplerType, - cudnnDataType_t dataType, - const int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []); - static auto func_ptr = LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, const int nbDims, const int dimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, + const int, const int[]); + static auto func_ptr = + LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc, samplerType, dataType, nbDims, dimA); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *theta, - void *grid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorForward"); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, void *grid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, theta, grid); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *dgrid, - void *dtheta) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, void *dtheta) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + 
LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, dgrid, dtheta); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfSamplerForward(cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *grid, - const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const void *alphaDgrid, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *grid, - const void *betaDgrid, - void *dgrid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward( + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx, + const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc, + const void *dy, const void *grid, const void *betaDgrid, void *dgrid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid); + return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, betaDgrid, dgrid); } cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return 
func_ptr(dropoutDesc); @@ -1510,99 +1556,95 @@ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetStatesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize( + cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetReserveSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(xdesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void *states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnSetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void *states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnRestoreDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float *dropout, - void **states, - unsigned long long *seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI 
*)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *); +cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout, + void **states, unsigned long long *seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float *, void **, unsigned long long *); static auto func_ptr = LoadSymbol("cudnnGetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutForward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t xdesc, - const void *x, - const cudnnTensorDescriptor_t ydesc, - void *y, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutForward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, const void *x, + const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutBackward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t dydesc, - const void *dy, - const cudnnTensorDescriptor_t dxdesc, - void *dx, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, const void *dy, + const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, + reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); @@ -1610,184 +1652,192 @@ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t 
*rnnDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int 
*, cudnnAlgorithmPerformance_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, 
const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const cudnnTensorDescriptor_t *dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnTensorDescriptor_t *dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI 
cudnnFindRNNBackwardDataAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y, + const cudnnTensorDescriptor_t *dyDesc, const void *dy, + const cudnnTensorDescriptor_t dhyDesc, const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, const void *dcy, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnTensorDescriptor_t *dxDesc, void *dx, + const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - const void *workspace, - size_t 
workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, + const float findIntensity, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults, + const void *workspace, size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + findIntensity, requestedAlgoCount, returnedAlgoCount, + perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - const int minibatch, - const cudnnDataType_t dataType, - cudnnPersistentRNNPlan_t *plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, const int minibatch, + const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, + const cudnnDataType_t, + cudnnPersistentRNNPlan_t *); static auto func_ptr = LoadSymbol("cudnnCreatePersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, minibatch, dataType, plan); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnSetPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return 
func_ptr(rnnDesc, plan); @@ -1795,289 +1845,285 @@ cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnDestroyPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(plan); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t + dropoutDesc, /* Between layers, not between recurrent steps. */ + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } cudnnStatus_t CUDNNWINAPI -cudnnSetRNNProjectionLayers(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int recProjSize, - const int outProjSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); +cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, + const int recProjSize, const int outProjSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); static auto func_ptr = LoadSymbol("cudnnSetRNNProjectionLayers"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, recProjSize, outProjSize); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNProjectionLayers(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - int *recProjSize, - int *outProjSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize, + int *outProjSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetRNNProjectionLayers"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, recProjSize, outProjSize); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, 
cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnSetRNNAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - int *hiddenSize, - int *numLayers, - cudnnDropoutDescriptor_t *dropoutDesc, - cudnnRNNInputMode_t *inputMode, - cudnnDirectionMode_t *direction, - cudnnRNNMode_t *mode, - cudnnRNNAlgo_t *algo, - cudnnDataType_t *dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize, + int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc, + cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction, + cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, + cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, + cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, mType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType( + cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, mType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNWorkspaceSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const 
cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNTrainingReserveSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnGetRNNParamsSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes, +cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes, cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, + size_t *, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnGetRNNParamsSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerMatDesc, - void **linLayerMat) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerMatrixParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, + linLayerMatDesc, linLayerMat); } 
-cudnnStatus_t CUDNNWINAPI -cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerBiasDesc, - void **linLayerBias) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerBiasParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, + linLayerBiasDesc, linLayerBias); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardInference(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return 
func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardTraining(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardData(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const cudnnTensorDescriptor_t *dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnTensorDescriptor_t *dxDesc, - void *dx, - const 
cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, + const void *y, const cudnnTensorDescriptor_t *dyDesc, + const void *dy, const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, const cudnnFilterDescriptor_t wDesc, + const void *w, const cudnnTensorDescriptor_t hxDesc, + const void *hx, const cudnnTensorDescriptor_t cxDesc, + const void *cx, const cudnnTensorDescriptor_t *dxDesc, + void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, + void *workspace, size_t workSpaceSizeInBytes, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardWeights(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const void *workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); +cudnnStatus_t 
CUDNNWINAPI cudnnRNNBackwardWeights( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + const void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeights"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor( + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor( + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); @@ -2085,82 +2131,102 @@ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnCTCLoss( +cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, const cudnnTensorDescriptor_t - probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the - mini batch size, A is 
the alphabet size) */ - const void *probs, /* probabilities after softmax, in GPU memory */ - const int *labels, /* labels, in CPU memory */ - const int *labelLengths, /* the length of each label, in CPU memory */ - const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - void *costs, /* the returned costs of CTC, in GPU memory */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ - const void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the timing steps, N is the + mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the dimensions are + T,N,A */ + const void *gradients, /* the returned CTC gradients, in GPU memory, to + compute costs only, set it to NULL */ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ cudnnCTCLossDescriptor_t ctcLossDesc, - void *workspace, /* pointer to the workspace, in GPU memory */ + void *workspace, /* pointer to the workspace, in GPU memory */ size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, + const int *, const int *, void *, const cudnnTensorDescriptor_t, + const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnCTCLoss"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes); + return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCTCLossWorkspaceSize( +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize( cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the - timing steps, N is the mini batch size, A is the alphabet size) */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the - dimensions are T,N,A. 
To compute costs - only, set it to NULL */ - const int *labels, /* labels, in CPU memory */ - const int *labelLengths, /* the length of each label, in CPU memory */ - const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet + size) */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const int *, const int *, const int *, + cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes); + return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, + inputLengths, algo, ctcLossDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t); +cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor( + cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, + cudnnAlgorithm_t); static auto func_ptr = LoadSymbol("cudnnSetAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc, algorithm); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor( + const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, + cudnnAlgorithm_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc, algorithm); } -cudnnStatus_t CUDNNWINAPI 
-cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor( + const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, + cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnCopyAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(src, dest); @@ -2168,135 +2234,141 @@ cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorith cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); +cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance( + cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, numberToCreate); } -cudnnStatus_t CUDNNWINAPI -cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t algoDesc, - cudnnStatus_t status, - float time, - size_t memory) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t, cudnnStatus_t, float, size_t); +cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance( + cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc, + cudnnStatus_t status, float time, size_t memory) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, + cudnnAlgorithmDescriptor_t, + cudnnStatus_t, float, size_t); static auto func_ptr = LoadSymbol("cudnnSetAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, algoDesc, status, time, memory); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t *algoDesc, - cudnnStatus_t *status, - float *time, - size_t *memory) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, cudnnStatus_t *, float *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance( + const cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time, + size_t *memory) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, + cudnnStatus_t *, float *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, algoDesc, status, time, memory); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { - using FuncPtr = cudnnStatus_t 
(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); - static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmPerformance"); +cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance( + cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); + static auto func_ptr = + LoadSymbol("cudnnDestroyAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, numberToDestroy); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize( + cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, + size_t *algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoDesc, algoSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnSaveAlgorithm(cudnnHandle_t handle, - cudnnAlgorithmDescriptor_t algoDesc, - void *algoSpace, - size_t algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); +cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, + void *algoSpace, size_t algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); static auto func_ptr = LoadSymbol("cudnnSaveAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRestoreAlgorithm(cudnnHandle_t handle, - void *algoSpace, - size_t algoSpaceSizeInBytes, - cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm( + cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, + cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnRestoreAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNSetClip(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - cudnnRNNClipMode_t clipMode, - cudnnNanPropagation_t clipNanOpt, - double lclip, - double rclip) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, cudnnNanPropagation_t, double, double); +cudnnStatus_t CUDNNWINAPI cudnnRNNSetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, double rclip) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, + cudnnNanPropagation_t, double, double); static auto func_ptr = LoadSymbol("cudnnRNNSetClip"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNGetClip(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - 
cudnnRNNClipMode_t *clipMode, - cudnnNanPropagation_t *clipNanOpt, - double *lclip, - double *rclip) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, cudnnNanPropagation_t *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnRNNGetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, double *rclip) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, + cudnnNanPropagation_t *, double *, double *); static auto func_ptr = LoadSymbol("cudnnRNNGetClip"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); } -cudnnStatus_t CUDNNWINAPI -cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); +cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata, + cudnnCallback_t fptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); static auto func_ptr = LoadSymbol("cudnnSetCallback"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(mask, udata, fptr); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata, + cudnnCallback_t *fptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); static auto func_ptr = LoadSymbol("cudnnGetCallback"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(mask, udata, fptr); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNPaddingMode( + cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t); static auto func_ptr = LoadSymbol("cudnnSetRNNPaddingMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, paddingMode); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNPaddingMode( + cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnRNNPaddingMode_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNPaddingMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, paddingMode); @@ -2304,7 +2376,7 @@ cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *padd cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *RNNDataDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(RNNDataDesc); @@ -2312,199 +2384,202 @@ 
cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *RNNDataDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(RNNDataDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc, - cudnnDataType_t dataType, - cudnnRNNDataLayout_t layout, - int maxSeqLength, - int batchSize, - int vectorSize, - const int seqLengthArray[], /* length of each sequence in the batch */ - void *paddingFill) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, int, const int [], void *); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDataDescriptor( + cudnnRNNDataDescriptor_t RNNDataDesc, cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize, + int vectorSize, + const int seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, + int, const int[], void *); static auto func_ptr = LoadSymbol("cudnnSetRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, seqLengthArray, paddingFill); + return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, + vectorSize, seqLengthArray, paddingFill); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc, - cudnnDataType_t *dataType, - cudnnRNNDataLayout_t *layout, - int *maxSeqLength, - int *batchSize, - int *vectorSize, - int arrayLengthRequested, - int seqLengthArray[], - void *paddingFill) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, int *, int *, int *, int, int [], void *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDataDescriptor( + cudnnRNNDataDescriptor_t RNNDataDesc, cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, int *maxSeqLength, int *batchSize, + int *vectorSize, int arrayLengthRequested, int seqLengthArray[], + void *paddingFill) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, + int *, int *, int *, int, int[], void *); static auto func_ptr = LoadSymbol("cudnnGetRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, arrayLengthRequested, seqLengthArray, paddingFill); + return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, + vectorSize, arrayLengthRequested, seqLengthArray, + paddingFill); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardTrainingEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnRNNDataDescriptor_t yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const cudnnRNNDataDescriptor_t kDesc, /* reserved, should 
pass NULL */ - const void *keys, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ - void *cAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ - void *iAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ - void *queries, /* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTrainingEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnRNNDataDescriptor_t yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, + void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTrainingEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, + yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, + iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardInferenceEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const 
cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnRNNDataDescriptor_t yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ - const void *keys, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ - void *cAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ - void *iAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ - void *queries, /* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInferenceEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnRNNDataDescriptor_t yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, + void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInferenceEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, + yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, 
cDesc, cAttn, + iDesc, iAttn, qDesc, queries, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardDataEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t yDesc, - const void *y, - const cudnnRNNDataDescriptor_t dyDesc, - const void *dy, - const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ - const void *dcAttn, /* reserved, should pass NULL */ - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnRNNDataDescriptor_t dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */ - void *dkeys, /* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardDataEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t yDesc, const void *y, + const cudnnRNNDataDescriptor_t dyDesc, const void *dy, + const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ + const void *dcAttn, /* reserved, should pass NULL */ + const cudnnTensorDescriptor_t dhyDesc, const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, const void *dcy, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnRNNDataDescriptor_t dxDesc, void *dx, + const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, + const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */ + void *dkeys, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardDataEx"); if (!func_ptr) return 
GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, + dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, + dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, + workSpace, workSpaceSizeInBytes, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardWeightsEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnRNNDataDescriptor_t yDesc, - const void *y, - void *workSpace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, void *, size_t, const cudnnFilterDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeightsEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnRNNDataDescriptor_t yDesc, const void *y, void *workSpace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, const void *, void *, size_t, + const cudnnFilterDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeightsEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, + workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, cudnnDropoutDescriptor_t dropoutDesc, + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + 
cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v6"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc, - int hiddenSize, - int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5( + cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, cudnnRNNMode_t mode, + cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, + cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, + cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v5"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType); + return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, + direction, mode, dataType); } } // extern "C" diff --git a/tensorflow/stream_executor/cuda/cudnn_7_4.inc b/tensorflow/stream_executor/cuda/cudnn_7_4.inc index bd9f49f9780..883c8ba8812 100644 --- a/tensorflow/stream_executor/cuda/cudnn_7_4.inc +++ b/tensorflow/stream_executor/cuda/cudnn_7_4.inc @@ -2,73 +2,71 @@ extern "C" { -size_t CUDNNWINAPI -cudnnGetVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); +size_t CUDNNWINAPI cudnnGetVersion(void) { + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetVersion"); if (!func_ptr) return 0; return func_ptr(); } -size_t CUDNNWINAPI -cudnnGetCudartVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); +size_t CUDNNWINAPI cudnnGetCudartVersion(void) { + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetCudartVersion"); if (!func_ptr) return 0; return func_ptr(); } -const char *CUDNNWINAPI -cudnnGetErrorString(cudnnStatus_t status) { - using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t); +const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { + using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t); static auto func_ptr = LoadSymbol("cudnnGetErrorString"); if (!func_ptr) return "cudnnGetErrorString symbol not found."; return func_ptr(status); } -cudnnStatus_t CUDNNWINAPI -cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); +cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle, + cudnnStatus_t *rstatus, + cudnnErrQueryMode_t mode, + cudnnRuntimeTag_t *tag) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); static auto func_ptr = LoadSymbol("cudnnQueryRuntimeError"); if (!func_ptr) return 
GetSymbolNotFoundError(); return func_ptr(handle, rstatus, mode, tag); } -cudnnStatus_t CUDNNWINAPI -cudnnGetProperty(libraryPropertyType type, int *value) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *); static auto func_ptr = LoadSymbol("cudnnGetProperty"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(type, value); } -cudnnStatus_t CUDNNWINAPI -cudnnCreate(cudnnHandle_t *handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *); static auto func_ptr = LoadSymbol("cudnnCreate"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroy(cudnnHandle_t handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t); +cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t); static auto func_ptr = LoadSymbol("cudnnDestroy"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI -cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); +cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); static auto func_ptr = LoadSymbol("cudnnSetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI -cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); static auto func_ptr = LoadSymbol("cudnnGetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); @@ -76,100 +74,97 @@ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) { cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, 
cudnnTensorFormat_t, + cudnnDataType_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w, /* width of input section */ - int nStride, - int cStride, - int hStride, - int wStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w, /* width of input section */ + int nStride, int cStride, int hStride, int wStride) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, + int, int, int, int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, /* image data type */ - int *n, /* number of inputs (batch size) */ - int *c, /* number of input feature maps */ - int *h, /* height of input section */ - int *w, /* width of input section */ - int *nStride, - int *cStride, - int *hStride, - int *wStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( + const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, int *cStride, int *hStride, int *wStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, + int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, - int nbDims, - const int dimA[], - const int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( + cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, + const int dimA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]); static auto func_ptr = 
LoadSymbol("cudnnSetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, - int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, int nbDims, const int dimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, nbDims, dimA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, - int *nbDims, - int dimA[], - int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []); +cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( + const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, + cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, + cudnnDataType_t *, int *, int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( + const cudnnTensorDescriptor_t tensorDesc, size_t *size) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetTensorSizeInBytes"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, size); @@ -177,35 +172,33 @@ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnTransformTensor(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnTransformTensor( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const 
cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnTransformTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnAddTensor(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnAddTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C); @@ -213,29 +206,29 @@ cudnnAddTensor(cudnnHandle_t handle, cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t opTensorOp, - cudnnDataType_t opTensorCompType, - cudnnNanPropagation_t opTensorNanOpt) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t); +cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( + cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, + cudnnDataType_t, cudnnNanPropagation_t); static auto func_ptr = LoadSymbol("cudnnSetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } -cudnnStatus_t CUDNNWINAPI -cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t *opTensorOp, - cudnnDataType_t *opTensorCompType, - cudnnNanPropagation_t *opTensorNanOpt) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( + const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, + cudnnNanPropagation_t *); static auto func_ptr = LoadSymbol("cudnnGetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); @@ -243,126 +236,136 @@ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, 
cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnOpTensor(cudnnHandle_t handle, - const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const cudnnTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnOpTensor( + cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A, + const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B, + const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnOpTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C); + return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, + beta, cDesc, C); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateReduceTensorDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t *reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t reduceTensorOp, - cudnnDataType_t reduceTensorCompType, - cudnnNanPropagation_t reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, + cudnnNanPropagation_t, cudnnReduceTensorIndices_t, 
cudnnIndicesType_t); static auto func_ptr = LoadSymbol("cudnnSetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t *reduceTensorOp, - cudnnDataType_t *reduceTensorCompType, - cudnnNanPropagation_t *reduceTensorNanOpt, - cudnnReduceTensorIndices_t *reduceTensorIndices, - cudnnIndicesType_t *reduceTensorIndicesType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, + cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, + cudnnIndicesType_t *); static auto func_ptr = LoadSymbol("cudnnGetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyReduceTensorDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReductionIndicesSize(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionIndicesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, 
sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnReduceTensor(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - void *indices, - size_t indicesSizeInBytes, - void *workspace, - size_t workspaceSizeInBytes, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, size_t indicesSizeInBytes, void *workspace, + size_t workspaceSizeInBytes, const void *alpha, + const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, + void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnReduceTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C); + return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, + workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, + C); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *valuePtr) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnSetTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, valuePtr); } -cudnnStatus_t CUDNNWINAPI -cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); 
+cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *alpha) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnScaleTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, alpha); @@ -370,68 +373,70 @@ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( + const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, + int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int nbDims, - const int filterDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t 
dataType, /* image data type */ + cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, nbDims, filterDimA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *nbDims, - int filterDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []); +cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( + const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, int, cudnnDataType_t *, + cudnnTensorFormat_t *, int *, int[]); static auto func_ptr = LoadSymbol("cudnnGetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA); + return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, + filterDimA); } cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); @@ -439,622 +444,657 @@ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateConvolutionDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionMathType"); if (!func_ptr) 
return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); static auto func_ptr = LoadSymbol("cudnnSetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int *groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, - int pad_h, /* zero-padding height */ - int pad_w, /* zero-padding width */ - int u, /* vertical filter stride */ - int v, /* horizontal filter stride */ - int dilation_h, /* filter dilation in the vertical dimension */ - int dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( + cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, int, int, int, int, int, + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, - int *pad_h, /* zero-padding height */ - int *pad_w, /* zero-padding width */ - int *u, /* vertical filter stride */ - int *v, /* horizontal filter stride */ - int *dilation_h, /* filter dilation in the vertical dimension */ - int *dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical 
filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, + int *, cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int dilationA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( + cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */ + const int padA[], const int filterStrideA[], const int dilationA[], + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[], + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int dilationA[], - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( + const 
cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested, + int *arrayLength, int padA[], int strideA[], int dilationA[], + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[], + cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, + dilationA, mode, computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int nbDims, - int tensorOutputDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int nbDims, + int tensorOutputDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOutputDimA); + return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, + tensorOutputDimA); } cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyConvolutionDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); 
+cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - void *y, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionFwdAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, 
cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, + cudnnConvolutionFwdAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnFilterDescriptor_t filterDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t destDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7( + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const 
cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionForward(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y); + return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, - const void *alpha1, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *alpha2, - const cudnnTensorDescriptor_t zDesc, - const void *z, - const cudnnTensorDescriptor_t biasDesc, - const void *bias, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBiasActivationForward"); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( + 
cudnnHandle_t handle, const void *alpha1, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2, + const cudnnTensorDescriptor_t zDesc, const void *z, + const cudnnTensorDescriptor_t biasDesc, const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnConvolutionBiasActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y); + return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, + activationDesc, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardBias(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dbDesc, - void *db) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dbDesc, void *db) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardBias"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const 
cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, void *dw, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - 
cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdFilterAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, + size_t, cudnnConvolutionBwdFilterAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7( + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardFilter(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t dwDesc, - void *dw) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnFilterDescriptor_t dwDesc, void *dw) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, + void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardFilter"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); + return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } 
-cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, void *dx, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, 
workSpace, workSpaceSizeInBytes); + return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdDataAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, + size_t, cudnnConvolutionBwdDataAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7( + cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, filterDesc, diffDesc, 
convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardData(cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( + cudnnHandle_t handle, const void *alpha, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); + return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI -cudnnIm2Col(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - void *colBuffer) { - using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *); +cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const void *x, const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, + const void *, const cudnnFilterDescriptor_t, + const cudnnConvolutionDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnIm2Col"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer); } -cudnnStatus_t CUDNNWINAPI -cudnnSoftmaxForward(cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnSoftmaxBackward(cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward( + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx); + return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, + dx); } cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); + using 
FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreatePoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( + cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, + int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } -cudnnStatus_t CUDNNWINAPI -cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight, + int *windowWidth, int *verticalPadding, int *horizontalPadding, + int *verticalStride, int *horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, - const cudnnPoolingMode_t mode, - const cudnnNanPropagation_t maxpoolingNanOpt, - int nbDims, - const int windowDimA[], - const int paddingA[], - const int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( + cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, + const int 
windowDimA[], const int paddingA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, + const cudnnNanPropagation_t, int, const int[], const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, + paddingA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, - int nbDimsRequested, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *nbDims, - int windowDimA[], - int paddingA[], - int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []); +cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, + cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, int windowDimA[], int paddingA[], int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int[], int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, + windowDimA, paddingA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, - int outputTensorDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); + int nbDims, int outputTensorDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA); } @@ -1062,72 +1102,69 @@ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, - int *n, - int *c, - int *h, - int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetPooling2dForwardOutputDim"); + int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, + int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetPooling2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t); + using 
FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyPoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnPoolingForward(cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnPoolingBackward(cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t mode, - 
cudnnNanPropagation_t reluNanOpt, - double coef) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double); +cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( + cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, double coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t, + cudnnActivationMode_t, + cudnnNanPropagation_t, double); static auto func_ptr = LoadSymbol("cudnnSetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); @@ -1136,9 +1173,10 @@ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, - double *coef) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *); + cudnnNanPropagation_t *reluNanOpt, double *coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnActivationDescriptor_t, cudnnActivationMode_t *, + cudnnNanPropagation_t *, double *); static auto func_ptr = LoadSymbol("cudnnGetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); @@ -1146,65 +1184,68 @@ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyActivationDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnActivationForward(cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnActivationForward( + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnActivationBackward(cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const 
cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double); +cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned lrnN, double lrnAlpha, + double lrnBeta, double lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int, double, double, double); static auto func_ptr = LoadSymbol("cudnnSetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI -cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned *lrnN, + double *lrnAlpha, + double *lrnBeta, double *lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); static auto func_ptr = LoadSymbol("cudnnGetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); @@ -1212,157 +1253,157 @@ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrn cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(lrnDesc); 
} -cudnnStatus_t CUDNNWINAPI -cudnnLRNCrossChannelForward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnLRNCrossChannelBackward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, + x, beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI -cudnnDivisiveNormalizationForward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const 
void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationForward"); +cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, + const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, + beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - const void *dy, - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ - void *dx, /* output x differential */ - void *dMeans) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationBackward"); +cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t + xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *, void *, const void *, const cudnnTensorDescriptor_t, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, + temp2, beta, dXdMeansDesc, dx, dMeans); } -cudnnStatus_t CUDNNWINAPI -cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, - const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t); +cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( + cudnnTensorDescriptor_t derivedBnDesc, 
const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, + cudnnBatchNormMode_t); static auto func_ptr = LoadSymbol("cudnnDeriveBNTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(derivedBnDesc, xDesc, mode); } cudnnStatus_t CUDNNWINAPI -cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t zDesc, - const cudnnTensorDescriptor_t yDesc, - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const cudnnActivationDescriptor_t activationDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize"); +cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnActivationDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol( + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc, bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes); + return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc, + bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t yDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnTensorDescriptor_t dzDesc, - const cudnnTensorDescriptor_t dxDesc, - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const cudnnActivationDescriptor_t activationDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationBackwardExWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationBackwardExWorkspaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t 
activationDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnActivationDescriptor_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetBatchNormalizationBackwardExWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc, dBnScaleBiasDesc, activationDesc, sizeInBytes); + return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc, + dBnScaleBiasDesc, activationDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationTrainingExReserveSpaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, + size_t *); + static auto func_ptr = LoadSymbol( + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alpha, /* alpha[0] = result blend factor */ const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ /* Shared desc for the next 6 tensors in the argument list. 
Data type to be set as follows: @@ -1370,13 +1411,13 @@ cudnnBatchNormalizationForwardTraining( Dimensions for this descriptor depend on normalization mode - Spatial Normalization : tensors are expected to have dims 1xCx1x1 (normalization is performed across NxHxW) - - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW - (normalization is performed across N) */ + - Per-Activation Normalization : tensors are expected to have dims of + 1xCxHxW (normalization is performed across N) */ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ - const void *bnScale, - const void *bnBias, + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation + */ + const void *bnScale, const void *bnBias, /* MUST use factor=1 in the very first call of a complete training cycle. Use a factor=1/(1+n) at N-th call to the function to get @@ -1394,248 +1435,261 @@ cudnnBatchNormalizationForwardTraining( of variance[x] (factor is applied in the same way as for runningMean) */ void *resultRunningVariance, - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and + backward functions. */ double epsilon, /* Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTraining"); + void *resultSaveMean, void *resultSaveInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, double, void *, void *, double, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); + return func_ptr( + handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, + bnScale, bnBias, exponentialAverageFactor, resultRunningMean, + resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationForwardTrainingEx( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, const void *alpha, /* alpha[0] = result blend factor */ const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *xData, - const cudnnTensorDescriptor_t zDesc, - const void *zData, - const cudnnTensorDescriptor_t yDesc, - void *yData, + const cudnnTensorDescriptor_t xDesc, const void *xData, + const cudnnTensorDescriptor_t zDesc, const void *zData, + 
const cudnnTensorDescriptor_t yDesc, void *yData, - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, const void *bnBias, - double exponentialAverageFactor, - void *resultRunningMean, + double exponentialAverageFactor, void *resultRunningMean, void *resultRunningVariance, - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and + backward functions. */ double epsilon, /* Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance, + void *resultSaveMean, void *resultSaveInvVariance, - cudnnActivationDescriptor_t activationDesc, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, + cudnnActivationDescriptor_t activationDesc, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTrainingEx"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, double, void *, void *, double, void *, + void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardTrainingEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData, yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance, activationDesc, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData, + yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias, + exponentialAverageFactor, resultRunningMean, + resultRunningVariance, epsilon, resultSaveMean, + resultSaveInvVariance, activationDesc, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, - const void *bnBias, - const void *estimatedMean, - const void *estimatedVariance, - double epsilon) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const 
cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardInference"); +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, + const void *bnBias, const void *estimatedMean, + const void *estimatedVariance, double epsilon) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, const void *, const void *, double); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon); + return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, + estimatedVariance, epsilon); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationBackward(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - /* Shared tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, /* bnBias doesn't affect backpropagation */ - /* scale and bias diff are not backpropagated below this layer */ - void *dBnScaleResult, - void *dBnBiasResult, - /* Same epsilon as forward pass */ - double epsilon, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff, + const void *betaDataDiff, const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t dxDesc, void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *); + /* Optionally 
cached intermediate results from + forward pass */ + const void *savedMean, const void *savedInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const void *, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, void *, void *, double, const void *, const void *); static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance); + return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, + betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, + dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackwardEx( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, - const void *xData, - const cudnnTensorDescriptor_t yDesc, - const void *yData, - const cudnnTensorDescriptor_t dyDesc, - const void *dyData, - const cudnnTensorDescriptor_t dzDesc, - void *dzData, - const cudnnTensorDescriptor_t dxDesc, - void *dxData, + const void *alphaDataDiff, const void *betaDataDiff, + const void *alphaParamDiff, const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, const void *xData, + const cudnnTensorDescriptor_t yDesc, const void *yData, + const cudnnTensorDescriptor_t dyDesc, const void *dyData, + const cudnnTensorDescriptor_t dzDesc, void *dzData, + const cudnnTensorDescriptor_t dxDesc, void *dxData, - /* Shared tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScaleData, - const void *bnBiasData, /* needed if there is activation */ - void *dBnScaleData, - void *dBnBiasData, - double epsilon, /* Same epsilon as forward pass */ + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, const void *bnScaleData, + const void *bnBiasData, /* needed if there is activation */ + void *dBnScaleData, void *dBnBiasData, + double epsilon, /* Same epsilon as forward pass */ - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance, - cudnnActivationDescriptor_t activationDesc, - void *workSpace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, double, const void *, const void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); - static auto 
func_ptr = LoadSymbol("cudnnBatchNormalizationBackwardEx"); + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, void *workSpace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, + const void *, const void *, const void *, const cudnnTensorDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, + void *, double, const void *, const void *, cudnnActivationDescriptor_t, + void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationBackwardEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData, dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr( + handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff, + betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, + dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData, + dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc, + workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( + cudnnSpatialTransformerDescriptor_t *stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, - cudnnSamplerType_t samplerType, - cudnnDataType_t dataType, - const int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []); - static auto func_ptr = LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, const int nbDims, const int dimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, + const int, const int[]); + static auto func_ptr = + LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc, samplerType, dataType, nbDims, dimA); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc) { - using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *theta, - void *grid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorForward"); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, void *grid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, theta, grid); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *dgrid, - void *dtheta) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, void *dtheta) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, dgrid, dtheta); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfSamplerForward(cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *grid, - const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, - 
cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const void *alphaDgrid, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *grid, - const void *betaDgrid, - void *dgrid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward( + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx, + const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc, + const void *dy, const void *grid, const void *betaDgrid, void *dgrid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid); + return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, betaDgrid, dgrid); } cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); @@ -1643,99 +1697,95 @@ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetStatesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize( + cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); static auto func_ptr = 
LoadSymbol("cudnnDropoutGetReserveSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(xdesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void *states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnSetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void *states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnRestoreDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float *dropout, - void **states, - unsigned long long *seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *); +cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout, + void **states, unsigned long long *seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float *, void **, unsigned long long *); static auto func_ptr = LoadSymbol("cudnnGetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutForward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t xdesc, - const void *x, - const cudnnTensorDescriptor_t ydesc, - void *y, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutForward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, const void *x, + const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, 
const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutBackward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t dydesc, - const void *dy, - const cudnnTensorDescriptor_t dxdesc, - void *dx, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, const void *dy, + const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, + reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); @@ -1743,184 +1793,192 @@ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - 
const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI 
-cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = 
LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const cudnnTensorDescriptor_t *dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnTensorDescriptor_t *dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y, + const cudnnTensorDescriptor_t *dyDesc, const void *dy, + const cudnnTensorDescriptor_t dhyDesc, const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, const void *dcy, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnTensorDescriptor_t *dxDesc, void *dx, + const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const 
cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - const void *workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, + const float findIntensity, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults, + const void *workspace, size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const 
cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + findIntensity, requestedAlgoCount, returnedAlgoCount, + perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - const int minibatch, - const cudnnDataType_t dataType, - cudnnPersistentRNNPlan_t *plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, const int minibatch, + const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, + const cudnnDataType_t, + cudnnPersistentRNNPlan_t *); static auto func_ptr = LoadSymbol("cudnnCreatePersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, minibatch, dataType, plan); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnSetPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, plan); @@ -1928,289 +1986,285 @@ cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnDestroyPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(plan); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, /* Between layers, not between recurrent steps. */ - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, + cudnnDropoutDescriptor_t + dropoutDesc, /* Between layers, not between recurrent steps. 
*/ + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } cudnnStatus_t CUDNNWINAPI -cudnnSetRNNProjectionLayers(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int recProjSize, - const int outProjSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); +cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, + const int recProjSize, const int outProjSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); static auto func_ptr = LoadSymbol("cudnnSetRNNProjectionLayers"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, recProjSize, outProjSize); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNProjectionLayers(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - int *recProjSize, - int *outProjSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize, + int *outProjSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetRNNProjectionLayers"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, recProjSize, outProjSize); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnSetRNNAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - int *hiddenSize, - int *numLayers, - cudnnDropoutDescriptor_t *dropoutDesc, - cudnnRNNInputMode_t *inputMode, - cudnnDirectionMode_t *direction, - cudnnRNNMode_t *mode, - cudnnRNNAlgo_t *algo, - cudnnDataType_t *dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize, + int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc, + 
cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction, + cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, + cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, + cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, mType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType( + cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, mType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNWorkspaceSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNTrainingReserveSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnGetRNNParamsSize(cudnnHandle_t handle, - const 
cudnnRNNDescriptor_t rnnDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes, +cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes, cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, + size_t *, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnGetRNNParamsSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerMatDesc, - void **linLayerMat) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerMatrixParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, + linLayerMatDesc, linLayerMat); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerBiasDesc, - void **linLayerBias) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerBiasParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias); + return func_ptr(handle, rnnDesc, 
pseudoLayer, xDesc, wDesc, w, linLayerID, + linLayerBiasDesc, linLayerBias); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardInference(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardTraining(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); 
+cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardData(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const cudnnTensorDescriptor_t *dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnTensorDescriptor_t *dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, + const void *y, const cudnnTensorDescriptor_t *dyDesc, + const void *dy, const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, const cudnnFilterDescriptor_t wDesc, + const void *w, const cudnnTensorDescriptor_t hxDesc, + const void *hx, const cudnnTensorDescriptor_t cxDesc, + const void *cx, const cudnnTensorDescriptor_t *dxDesc, + void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx, + 
const cudnnTensorDescriptor_t dcxDesc, void *dcx, + void *workspace, size_t workSpaceSizeInBytes, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardWeights(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const void *workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + const void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeights"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI 
*)(cudnnCTCLossDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor( + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor( + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); @@ -2218,82 +2272,102 @@ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnCTCLoss( +cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, const cudnnTensorDescriptor_t - probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the - mini batch size, A is the alphabet size) */ - const void *probs, /* probabilities after softmax, in GPU memory */ - const int *labels, /* labels, in CPU memory */ - const int *labelLengths, /* the length of each label, in CPU memory */ - const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - void *costs, /* the returned costs of CTC, in GPU memory */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ - const void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the timing steps, N is the + mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the dimensions are + T,N,A */ + const void *gradients, /* the returned CTC gradients, in GPU memory, to + compute costs only, set it to NULL */ cudnnCTCLossAlgo_t algo, /* algorithm 
selected, supported now 0 and 1 */ cudnnCTCLossDescriptor_t ctcLossDesc, - void *workspace, /* pointer to the workspace, in GPU memory */ + void *workspace, /* pointer to the workspace, in GPU memory */ size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, + const int *, const int *, void *, const cudnnTensorDescriptor_t, + const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnCTCLoss"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes); + return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCTCLossWorkspaceSize( +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize( cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the - timing steps, N is the mini batch size, A is the alphabet size) */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the - dimensions are T,N,A. To compute costs - only, set it to NULL */ - const int *labels, /* labels, in CPU memory */ - const int *labelLengths, /* the length of each label, in CPU memory */ - const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet + size) */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. 
To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const int *, const int *, const int *, + cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes); + return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, + inputLengths, algo, ctcLossDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t); +cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor( + cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, + cudnnAlgorithm_t); static auto func_ptr = LoadSymbol("cudnnSetAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc, algorithm); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor( + const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, + cudnnAlgorithm_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc, algorithm); } -cudnnStatus_t CUDNNWINAPI -cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor( + const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, + cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnCopyAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(src, dest); @@ -2301,135 +2375,141 @@ cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorith cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI 
*)(cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); +cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance( + cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, numberToCreate); } -cudnnStatus_t CUDNNWINAPI -cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t algoDesc, - cudnnStatus_t status, - float time, - size_t memory) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t, cudnnStatus_t, float, size_t); +cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance( + cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc, + cudnnStatus_t status, float time, size_t memory) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, + cudnnAlgorithmDescriptor_t, + cudnnStatus_t, float, size_t); static auto func_ptr = LoadSymbol("cudnnSetAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, algoDesc, status, time, memory); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t *algoDesc, - cudnnStatus_t *status, - float *time, - size_t *memory) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, cudnnStatus_t *, float *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance( + const cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time, + size_t *memory) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, + cudnnStatus_t *, float *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, algoDesc, status, time, memory); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); - static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmPerformance"); +cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance( + cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); + static auto func_ptr = + LoadSymbol("cudnnDestroyAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, numberToDestroy); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize( + cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, + size_t *algoSpaceSizeInBytes) { + using FuncPtr = 
cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoDesc, algoSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnSaveAlgorithm(cudnnHandle_t handle, - cudnnAlgorithmDescriptor_t algoDesc, - void *algoSpace, - size_t algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); +cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, + void *algoSpace, size_t algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); static auto func_ptr = LoadSymbol("cudnnSaveAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRestoreAlgorithm(cudnnHandle_t handle, - void *algoSpace, - size_t algoSpaceSizeInBytes, - cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm( + cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, + cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnRestoreAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNSetClip(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - cudnnRNNClipMode_t clipMode, - cudnnNanPropagation_t clipNanOpt, - double lclip, - double rclip) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, cudnnNanPropagation_t, double, double); +cudnnStatus_t CUDNNWINAPI cudnnRNNSetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, double rclip) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, + cudnnNanPropagation_t, double, double); static auto func_ptr = LoadSymbol("cudnnRNNSetClip"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNGetClip(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - cudnnRNNClipMode_t *clipMode, - cudnnNanPropagation_t *clipNanOpt, - double *lclip, - double *rclip) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, cudnnNanPropagation_t *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnRNNGetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, double *rclip) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, + cudnnNanPropagation_t *, double *, double *); static auto func_ptr = LoadSymbol("cudnnRNNGetClip"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); } -cudnnStatus_t CUDNNWINAPI -cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI 
*)(unsigned int, void *, cudnnCallback_t); +cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata, + cudnnCallback_t fptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); static auto func_ptr = LoadSymbol("cudnnSetCallback"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(mask, udata, fptr); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata, + cudnnCallback_t *fptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); static auto func_ptr = LoadSymbol("cudnnGetCallback"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(mask, udata, fptr); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNPaddingMode( + cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t); static auto func_ptr = LoadSymbol("cudnnSetRNNPaddingMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, paddingMode); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNPaddingMode( + cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnRNNPaddingMode_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNPaddingMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, paddingMode); @@ -2437,7 +2517,7 @@ cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *padd cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *RNNDataDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(RNNDataDesc); @@ -2445,199 +2525,202 @@ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *RNNDataDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(RNNDataDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc, - cudnnDataType_t dataType, - cudnnRNNDataLayout_t layout, - int maxSeqLength, - int batchSize, - int vectorSize, - const int seqLengthArray[], /* length of each sequence in the batch */ - void *paddingFill) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, int, const int [], void *); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDataDescriptor( + 
cudnnRNNDataDescriptor_t RNNDataDesc, cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize, + int vectorSize, + const int seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, + int, const int[], void *); static auto func_ptr = LoadSymbol("cudnnSetRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, seqLengthArray, paddingFill); + return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, + vectorSize, seqLengthArray, paddingFill); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t RNNDataDesc, - cudnnDataType_t *dataType, - cudnnRNNDataLayout_t *layout, - int *maxSeqLength, - int *batchSize, - int *vectorSize, - int arrayLengthRequested, - int seqLengthArray[], - void *paddingFill) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, int *, int *, int *, int, int [], void *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDataDescriptor( + cudnnRNNDataDescriptor_t RNNDataDesc, cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, int *maxSeqLength, int *batchSize, + int *vectorSize, int arrayLengthRequested, int seqLengthArray[], + void *paddingFill) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, + int *, int *, int *, int, int[], void *); static auto func_ptr = LoadSymbol("cudnnGetRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, arrayLengthRequested, seqLengthArray, paddingFill); + return func_ptr(RNNDataDesc, dataType, layout, maxSeqLength, batchSize, + vectorSize, arrayLengthRequested, seqLengthArray, + paddingFill); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardTrainingEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnRNNDataDescriptor_t yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ - const void *keys, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ - void *cAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ - void *iAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ - void *queries, /* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void 
*, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTrainingEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnRNNDataDescriptor_t yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, + void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTrainingEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, + yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, + iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardInferenceEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnRNNDataDescriptor_t yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ - const void *keys, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ - void *cAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ - void *iAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ - void *queries, /* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, 
const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInferenceEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnRNNDataDescriptor_t yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, + void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInferenceEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, + yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, + iDesc, iAttn, qDesc, queries, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardDataEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t yDesc, - const void *y, - const cudnnRNNDataDescriptor_t dyDesc, - const void *dy, - const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ - const void *dcAttn, /* reserved, should pass NULL */ - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnRNNDataDescriptor_t dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */ - void *dkeys, 
/* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardDataEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t yDesc, const void *y, + const cudnnRNNDataDescriptor_t dyDesc, const void *dy, + const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ + const void *dcAttn, /* reserved, should pass NULL */ + const cudnnTensorDescriptor_t dhyDesc, const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, const void *dcy, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnRNNDataDescriptor_t dxDesc, void *dx, + const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, + const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */ + void *dkeys, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardDataEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, + dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, + dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, + workSpace, workSpaceSizeInBytes, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardWeightsEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnRNNDataDescriptor_t yDesc, - const void *y, - void *workSpace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - void *reserveSpace, - size_t 
reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, void *, size_t, const cudnnFilterDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeightsEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnRNNDataDescriptor_t yDesc, const void *y, void *workSpace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, const void *, void *, size_t, + const cudnnFilterDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeightsEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, + workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, cudnnDropoutDescriptor_t dropoutDesc, + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v6"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, dataType); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, dataType); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc, - int hiddenSize, - int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5( + cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers, + 
cudnnDropoutDescriptor_t dropoutDesc, cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, cudnnRNNMode_t mode, + cudnnDataType_t dataType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, + cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, + cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v5"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, dataType); + return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, + direction, mode, dataType); } } // extern "C" diff --git a/tensorflow/stream_executor/cuda/cudnn_7_6.inc b/tensorflow/stream_executor/cuda/cudnn_7_6.inc index 7a5f1c9751d..9dd420a9022 100644 --- a/tensorflow/stream_executor/cuda/cudnn_7_6.inc +++ b/tensorflow/stream_executor/cuda/cudnn_7_6.inc @@ -2,73 +2,71 @@ extern "C" { -size_t CUDNNWINAPI -cudnnGetVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); +size_t CUDNNWINAPI cudnnGetVersion(void) { + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetVersion"); if (!func_ptr) return 0; return func_ptr(); } -size_t CUDNNWINAPI -cudnnGetCudartVersion(void) { - using FuncPtr = size_t (CUDNNWINAPI *)(); +size_t CUDNNWINAPI cudnnGetCudartVersion(void) { + using FuncPtr = size_t(CUDNNWINAPI *)(); static auto func_ptr = LoadSymbol("cudnnGetCudartVersion"); if (!func_ptr) return 0; return func_ptr(); } -const char *CUDNNWINAPI -cudnnGetErrorString(cudnnStatus_t status) { - using FuncPtr = const char * (CUDNNWINAPI *)(cudnnStatus_t); +const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { + using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t); static auto func_ptr = LoadSymbol("cudnnGetErrorString"); if (!func_ptr) return "cudnnGetErrorString symbol not found."; return func_ptr(status); } -cudnnStatus_t CUDNNWINAPI -cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); +cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle, + cudnnStatus_t *rstatus, + cudnnErrQueryMode_t mode, + cudnnRuntimeTag_t *tag) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); static auto func_ptr = LoadSymbol("cudnnQueryRuntimeError"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rstatus, mode, tag); } -cudnnStatus_t CUDNNWINAPI -cudnnGetProperty(libraryPropertyType type, int *value) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(libraryPropertyType, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *); static auto func_ptr = LoadSymbol("cudnnGetProperty"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(type, value); } -cudnnStatus_t CUDNNWINAPI -cudnnCreate(cudnnHandle_t *handle) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *); static auto func_ptr = LoadSymbol("cudnnCreate"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroy(cudnnHandle_t handle) 
{ - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t); +cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t); static auto func_ptr = LoadSymbol("cudnnDestroy"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle); } -cudnnStatus_t CUDNNWINAPI -cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); +cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); static auto func_ptr = LoadSymbol("cudnnSetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); } -cudnnStatus_t CUDNNWINAPI -cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); static auto func_ptr = LoadSymbol("cudnnGetStream"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, streamId); @@ -76,100 +74,97 @@ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId) { cudnnStatus_t CUDNNWINAPI cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* height of input section */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w, /* width of input section */ - int nStride, - int cStride, - int hStride, - int wStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, int, int, int, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( + cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t dataType, /* image data type */ + int n, /* number of inputs (batch size) */ + int c, /* number of input feature maps */ + int h, /* 
height of input section */ + int w, /* width of input section */ + int nStride, int cStride, int hStride, int wStride) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, + int, int, int, int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, /* image data type */ - int *n, /* number of inputs (batch size) */ - int *c, /* number of input feature maps */ - int *h, /* height of input section */ - int *w, /* width of input section */ - int *nStride, - int *cStride, - int *hStride, - int *wStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, int *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( + const cudnnTensorDescriptor_t tensorDesc, + cudnnDataType_t *dataType, /* image data type */ + int *n, /* number of inputs (batch size) */ + int *c, /* number of input feature maps */ + int *h, /* height of input section */ + int *w, /* width of input section */ + int *nStride, int *cStride, int *hStride, int *wStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, + int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetTensor4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, wStride); + return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, + wStride); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, - int nbDims, - const int dimA[], - const int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int [], const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( + cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, + const int dimA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, - cudnnTensorFormat_t format, - cudnnDataType_t dataType, - int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( + cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, int nbDims, const int dimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, + cudnnDataType_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, format, dataType, nbDims, dimA); } -cudnnStatus_t CUDNNWINAPI 
-cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, - int *nbDims, - int dimA[], - int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, cudnnDataType_t *, int *, int [], int []); +cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( + const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, + cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, + cudnnDataType_t *, int *, int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetTensorNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( + const cudnnTensorDescriptor_t tensorDesc, size_t *size) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetTensorSizeInBytes"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc, size); @@ -177,126 +172,141 @@ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(tensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc, - const cudnnTensorDescriptor_t srcDesc, - cudnnTensorDescriptor_t destDesc, - size_t *destSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnTensorTransformDescriptor_t, const cudnnTensorDescriptor_t, cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnInitTransformDest( + const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, cudnnTensorDescriptor_t destDesc, + size_t *destSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnTensorTransformDescriptor_t, const cudnnTensorDescriptor_t, + cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnInitTransformDest"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(transformDesc, srcDesc, destDesc, destSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateTensorTransformDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnCreateTensorTransformDescriptor( + cudnnTensorTransformDescriptor_t *transformDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateTensorTransformDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(transformDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, - const uint32_t nbDims, - 
const cudnnTensorFormat_t destFormat, - const int32_t padBeforeA[], - const int32_t padAfterA[], - const uint32_t foldA[], - const cudnnFoldingDirection_t direction) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t, const uint32_t, const cudnnTensorFormat_t, const int32_t [], const int32_t [], const uint32_t [], const cudnnFoldingDirection_t); - static auto func_ptr = LoadSymbol("cudnnSetTensorTransformDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnSetTensorTransformDescriptor( + cudnnTensorTransformDescriptor_t transformDesc, const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, const int32_t padBeforeA[], + const int32_t padAfterA[], const uint32_t foldA[], + const cudnnFoldingDirection_t direction) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnTensorTransformDescriptor_t, const uint32_t, + const cudnnTensorFormat_t, const int32_t[], const int32_t[], + const uint32_t[], const cudnnFoldingDirection_t); + static auto func_ptr = + LoadSymbol("cudnnSetTensorTransformDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(transformDesc, nbDims, destFormat, padBeforeA, padAfterA, foldA, direction); + return func_ptr(transformDesc, nbDims, destFormat, padBeforeA, padAfterA, + foldA, direction); } -cudnnStatus_t CUDNNWINAPI -cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc, - uint32_t nbDimsRequested, - cudnnTensorFormat_t *destFormat, - int32_t padBeforeA[], - int32_t padAfterA[], - uint32_t foldA[], - cudnnFoldingDirection_t *direction) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t, uint32_t, cudnnTensorFormat_t *, int32_t [], int32_t [], uint32_t [], cudnnFoldingDirection_t *); - static auto func_ptr = LoadSymbol("cudnnGetTensorTransformDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnGetTensorTransformDescriptor( + cudnnTensorTransformDescriptor_t transformDesc, uint32_t nbDimsRequested, + cudnnTensorFormat_t *destFormat, int32_t padBeforeA[], int32_t padAfterA[], + uint32_t foldA[], cudnnFoldingDirection_t *direction) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnTensorTransformDescriptor_t, uint32_t, cudnnTensorFormat_t *, + int32_t[], int32_t[], uint32_t[], cudnnFoldingDirection_t *); + static auto func_ptr = + LoadSymbol("cudnnGetTensorTransformDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(transformDesc, nbDimsRequested, destFormat, padBeforeA, padAfterA, foldA, direction); + return func_ptr(transformDesc, nbDimsRequested, destFormat, padBeforeA, + padAfterA, foldA, direction); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyTensorTransformDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorTransformDescriptor( + cudnnTensorTransformDescriptor_t transformDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyTensorTransformDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(transformDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnTransformTensor(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, 
const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnTransformTensor( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnTransformTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnTransformTensorEx(cudnnHandle_t handle, - const cudnnTensorTransformDescriptor_t transDesc, - const void *alpha, - const cudnnTensorDescriptor_t srcDesc, - const void *srcData, - const void *beta, - const cudnnTensorDescriptor_t destDesc, - void *destData) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnTransformTensorEx( + cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, const cudnnTensorDescriptor_t srcDesc, + const void *srcData, const void *beta, + const cudnnTensorDescriptor_t destDesc, void *destData) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnTransformTensorEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, destData); + return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, + destData); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFoldedConvBackwardDataDescriptors(const cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - const cudnnTensorFormat_t transformFormat, - cudnnFilterDescriptor_t foldedFilterDesc, - cudnnTensorDescriptor_t paddedDiffDesc, - cudnnConvolutionDescriptor_t foldedConvDesc, - cudnnTensorDescriptor_t foldedGradDesc, - cudnnTensorTransformDescriptor_t filterFoldTransDesc, - cudnnTensorTransformDescriptor_t diffPadTransDesc, - cudnnTensorTransformDescriptor_t gradFoldTransDesc, - cudnnTensorTransformDescriptor_t gradUnfoldTransDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorFormat_t, cudnnFilterDescriptor_t, cudnnTensorDescriptor_t, cudnnConvolutionDescriptor_t, cudnnTensorDescriptor_t, cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnGetFoldedConvBackwardDataDescriptors"); +cudnnStatus_t CUDNNWINAPI cudnnGetFoldedConvBackwardDataDescriptors( + const cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, + const cudnnTensorFormat_t transformFormat, + 
cudnnFilterDescriptor_t foldedFilterDesc, + cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, + cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, + cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, + cudnnTensorTransformDescriptor_t gradUnfoldTransDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorFormat_t, + cudnnFilterDescriptor_t, cudnnTensorDescriptor_t, + cudnnConvolutionDescriptor_t, cudnnTensorDescriptor_t, + cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t, + cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnGetFoldedConvBackwardDataDescriptors"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, transformFormat, foldedFilterDesc, paddedDiffDesc, foldedConvDesc, foldedGradDesc, filterFoldTransDesc, diffPadTransDesc, gradFoldTransDesc, gradUnfoldTransDesc); + return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, + transformFormat, foldedFilterDesc, paddedDiffDesc, + foldedConvDesc, foldedGradDesc, filterFoldTransDesc, + diffPadTransDesc, gradFoldTransDesc, gradUnfoldTransDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnAddTensor(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle, + const void *alpha, + const cudnnTensorDescriptor_t aDesc, + const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, + void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnAddTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C); @@ -304,29 +314,29 @@ cudnnAddTensor(cudnnHandle_t handle, cudnnStatus_t CUDNNWINAPI cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t opTensorOp, - cudnnDataType_t opTensorCompType, - cudnnNanPropagation_t opTensorNanOpt) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t); +cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( + cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, + cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, + cudnnDataType_t, 
cudnnNanPropagation_t); static auto func_ptr = LoadSymbol("cudnnSetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); } -cudnnStatus_t CUDNNWINAPI -cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, - cudnnOpTensorOp_t *opTensorOp, - cudnnDataType_t *opTensorCompType, - cudnnNanPropagation_t *opTensorNanOpt) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( + const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp, + cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, + cudnnNanPropagation_t *); static auto func_ptr = LoadSymbol("cudnnGetOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); @@ -334,126 +344,136 @@ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc, cudnnStatus_t CUDNNWINAPI cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyOpTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(opTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnOpTensor(cudnnHandle_t handle, - const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *alpha2, - const cudnnTensorDescriptor_t bDesc, - const void *B, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnOpTensor( + cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc, + const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A, + const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B, + const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnOpTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C); + return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, + beta, cDesc, C); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateReduceTensorDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t *reduceTensorDesc) { + using FuncPtr = 
cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t reduceTensorOp, - cudnnDataType_t reduceTensorCompType, - cudnnNanPropagation_t reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, + cudnnNanPropagation_t reduceTensorNanOpt, + cudnnReduceTensorIndices_t reduceTensorIndices, + cudnnIndicesType_t reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, + cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); static auto func_ptr = LoadSymbol("cudnnSetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t *reduceTensorOp, - cudnnDataType_t *reduceTensorCompType, - cudnnNanPropagation_t *reduceTensorNanOpt, - cudnnReduceTensorIndices_t *reduceTensorIndices, - cudnnIndicesType_t *reduceTensorIndicesType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, cudnnIndicesType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( + const cudnnReduceTensorDescriptor_t reduceTensorDesc, + cudnnReduceTensorOp_t *reduceTensorOp, + cudnnDataType_t *reduceTensorCompType, + cudnnNanPropagation_t *reduceTensorNanOpt, + cudnnReduceTensorIndices_t *reduceTensorIndices, + cudnnIndicesType_t *reduceTensorIndicesType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, + cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, + cudnnIndicesType_t *); static auto func_ptr = LoadSymbol("cudnnGetReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, reduceTensorNanOpt, reduceTensorIndices, reduceTensorIndicesType); + return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, + reduceTensorNanOpt, reduceTensorIndices, + reduceTensorIndicesType); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyReduceTensorDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( + cudnnReduceTensorDescriptor_t 
reduceTensorDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyReduceTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(reduceTensorDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReductionIndicesSize(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionIndicesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetReductionWorkspaceSize(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, - const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetReductionWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnReduceTensor(cudnnHandle_t handle, - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - void *indices, - size_t indicesSizeInBytes, - void *workspace, - size_t workspaceSizeInBytes, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, - const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( + cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, + void *indices, size_t indicesSizeInBytes, void *workspace, + size_t workspaceSizeInBytes, const void *alpha, + const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta, + const cudnnTensorDescriptor_t cDesc, void *C) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, + void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = 
LoadSymbol("cudnnReduceTensor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, C); + return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, + workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, + C); } -cudnnStatus_t CUDNNWINAPI -cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *valuePtr) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnSetTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, valuePtr); } -cudnnStatus_t CUDNNWINAPI -cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); +cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle, + const cudnnTensorDescriptor_t yDesc, + void *y, const void *alpha) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); static auto func_ptr = LoadSymbol("cudnnScaleTensor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, yDesc, y, alpha); @@ -461,745 +481,785 @@ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void cudnnStatus_t CUDNNWINAPI cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int k, /* number of output feature maps */ + int c, /* number of input feature maps */ + int h, /* height of each input filter */ + int w) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - 
int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( + const cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *k, /* number of output feature maps */ + int *c, /* number of input feature maps */ + int *h, /* height of each input filter */ + int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, + int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetFilter4dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, k, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, - int nbDims, - const int filterDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( + cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, /* image data type */ + cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, + cudnnTensorFormat_t, int, const int[]); static auto func_ptr = LoadSymbol("cudnnSetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, dataType, format, nbDims, filterDimA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc, - int nbDimsRequested, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, - int *nbDims, - int filterDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, int, cudnnDataType_t *, cudnnTensorFormat_t *, int *, int []); +cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( + const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, + cudnnDataType_t *dataType, /* image data type */ + cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFilterDescriptor_t, int, cudnnDataType_t *, + cudnnTensorFormat_t *, int *, int[]); static auto func_ptr = LoadSymbol("cudnnGetFilterNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, filterDimA); + return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, + filterDimA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFilterDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetFilterSizeInBytes( + const cudnnFilterDescriptor_t filterDesc, size_t *size) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnFilterDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetFilterSizeInBytes"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc, size); } -cudnnStatus_t CUDNNWINAPI -cudnnTransformFilter(cudnnHandle_t handle, - const cudnnTensorTransformDescriptor_t transDesc, - const void *alpha, - const cudnnFilterDescriptor_t srcDesc, - const void *srcData, - const void *beta, - const cudnnFilterDescriptor_t 
destDesc, - void *destData) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const void *, const cudnnFilterDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnTransformFilter( + cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, const cudnnFilterDescriptor_t srcDesc, + const void *srcData, const void *beta, + const cudnnFilterDescriptor_t destDesc, void *destData) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, const void *, + const cudnnFilterDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnTransformFilter"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, destData); + return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, + destData); } cudnnStatus_t CUDNNWINAPI cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFilterDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyFilterDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(filterDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnReorderFilterAndBias(cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - cudnnReorderType_t reorderType, - const void *filterData, - void *reorderedFilterData, - int reorderBias, - const void *biasData, - void *reorderedBiasData) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, cudnnReorderType_t, const void *, void *, int, const void *, void *); +cudnnStatus_t CUDNNWINAPI cudnnReorderFilterAndBias( + cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, + cudnnReorderType_t reorderType, const void *filterData, + void *reorderedFilterData, int reorderBias, const void *biasData, + void *reorderedBiasData) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, cudnnReorderType_t, + const void *, void *, int, const void *, void *); static auto func_ptr = LoadSymbol("cudnnReorderFilterAndBias"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, reorderType, filterData, reorderedFilterData, reorderBias, biasData, reorderedBiasData); + return func_ptr(handle, filterDesc, reorderType, filterData, + reorderedFilterData, reorderBias, biasData, + reorderedBiasData); } cudnnStatus_t CUDNNWINAPI cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateConvolutionDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { + using FuncPtr = 
cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( + cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, mathType); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int groupCount) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); static auto func_ptr = LoadSymbol("cudnnSetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, int *groupCount) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( + cudnnConvolutionDescriptor_t convDesc, int *groupCount) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionGroupCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, groupCount); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnReorderType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionReorderType( + cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnReorderType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionReorderType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, reorderType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, cudnnReorderType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionReorderType( + cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, + cudnnReorderType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionReorderType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, reorderType); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolution2dDescriptor(cudnnConvolutionDescriptor_t convDesc, - int pad_h, /* zero-padding height */ - int pad_w, /* zero-padding width */ - int u, /* vertical filter stride */ - int v, /* horizontal filter stride */ - int dilation_h, /* filter dilation in the vertical dimension */ - int dilation_w, /* 
filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, int, int, int, int, int, cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( + cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */ + int pad_w, /* zero-padding width */ + int u, /* vertical filter stride */ + int v, /* horizontal filter stride */ + int dilation_h, /* filter dilation in the vertical dimension */ + int dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, int, int, int, int, int, + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolution2dDescriptor(const cudnnConvolutionDescriptor_t convDesc, - int *pad_h, /* zero-padding height */ - int *pad_w, /* zero-padding width */ - int *u, /* vertical filter stride */ - int *v, /* horizontal filter stride */ - int *dilation_h, /* filter dilation in the vertical dimension */ - int *dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, int *, cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( + const cudnnConvolutionDescriptor_t convDesc, + int *pad_h, /* zero-padding height */ + int *pad_w, /* zero-padding width */ + int *u, /* vertical filter stride */ + int *v, /* horizontal filter stride */ + int *dilation_h, /* filter dilation in the vertical dimension */ + int *dilation_w, /* filter dilation in the horizontal dimension */ + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, + int *, cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, computeType); + return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolution2dForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int *n, - int *c, - int *h, - int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const 
cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w); } -cudnnStatus_t CUDNNWINAPI -cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, - int arrayLength, /* nbDims-2 size */ - const int padA[], - const int filterStrideA[], - const int dilationA[], - cudnnConvolutionMode_t mode, - cudnnDataType_t computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int, const int [], const int [], const int [], cudnnConvolutionMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( + cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */ + const int padA[], const int filterStrideA[], const int dilationA[], + cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[], + cudnnConvolutionMode_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, + computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionNdDescriptor(const cudnnConvolutionDescriptor_t convDesc, - int arrayLengthRequested, - int *arrayLength, - int padA[], - int strideA[], - int dilationA[], - cudnnConvolutionMode_t *mode, - cudnnDataType_t *computeType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, int, int *, int [], int [], int [], cudnnConvolutionMode_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( + const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested, + int *arrayLength, int padA[], int strideA[], int dilationA[], + cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[], + cudnnConvolutionMode_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, dilationA, mode, computeType); + return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, + dilationA, mode, computeType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionNdForwardOutputDim(const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, - int nbDims, - int tensorOutputDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t inputTensorDesc, + const cudnnFilterDescriptor_t filterDesc, int nbDims, + int tensorOutputDimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnConvolutionDescriptor_t, const 
cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, tensorOutputDimA); + return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, + tensorOutputDimA); } cudnnStatus_t CUDNNWINAPI cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyConvolutionDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyConvolutionDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(convDesc); } cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionForwardAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionForwardAlgorithmEx(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - void *y, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, 
const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults, + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionFwdAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, cudnnConvolutionFwdAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionFwdAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdPreference_t, size_t, + cudnnConvolutionFwdAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardAlgorithm_v7(cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnFilterDescriptor_t filterDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t destDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const 
cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7( + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnFilterDescriptor_t filterDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionFwdAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionForwardWorkspaceSize(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, - cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionForward(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const 
cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y); + return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBiasActivationForward(cudnnHandle_t handle, - const void *alpha1, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionFwdAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *alpha2, - const cudnnTensorDescriptor_t zDesc, - const void *z, - const cudnnTensorDescriptor_t biasDesc, - const void *bias, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBiasActivationForward"); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( + cudnnHandle_t handle, const void *alpha1, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2, + const cudnnTensorDescriptor_t zDesc, const void *z, + const cudnnTensorDescriptor_t biasDesc, const void *bias, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnConvolutionBiasActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y); + return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, + workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, + activationDesc, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardBias(cudnnHandle_t handle, - const void *alpha, - const 
cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dbDesc, - void *db) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dbDesc, void *db) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardBias"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardFilterAlgorithmEx(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const int 
requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *y, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, void *dw, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterAlgorithm(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, - cudnnConvolutionBwdFilterPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdFilterAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, size_t, cudnnConvolutionBwdFilterAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdFilterAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterPreference_t, + size_t, cudnnConvolutionBwdFilterAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI 
-cudnnGetConvolutionBackwardFilterAlgorithm_v7(cudnnHandle_t handle, - const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7( + cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, const int, int *, + cudnnConvolutionBwdFilterAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardFilterWorkspaceSize(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t gradDesc, + cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardFilter(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnFilterDescriptor_t dwDesc, - 
void *dw) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnFilterDescriptor_t dwDesc, void *dw) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, + void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardFilter"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); + return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( + cudnnHandle_t handle, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, 
dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, + returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnFindConvolutionBackwardDataAlgorithmEx(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, void *dx, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, + const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, + requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithm(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataPreference_t preference, - size_t memoryLimitInBytes, - cudnnConvolutionBwdDataAlgo_t *algo) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataPreference_t, size_t, cudnnConvolutionBwdDataAlgo_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataPreference_t preference, size_t memoryLimitInBytes, + cudnnConvolutionBwdDataAlgo_t *algo) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, 
cudnnConvolutionBwdDataPreference_t, + size_t, cudnnConvolutionBwdDataAlgo_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, memoryLimitInBytes, algo); + return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, preference, + memoryLimitInBytes, algo); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataAlgorithm_v7(cudnnHandle_t handle, - const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7( + cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, const int, int *, + cudnnConvolutionBwdDataAlgoPerf_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, requestedAlgoCount, returnedAlgoCount, perfResults); + return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, + requestedAlgoCount, returnedAlgoCount, perfResults); } -cudnnStatus_t CUDNNWINAPI -cudnnGetConvolutionBackwardDataWorkspaceSize(cudnnHandle_t handle, - const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, - cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnFilterDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, + const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); } 
-cudnnStatus_t CUDNNWINAPI -cudnnConvolutionBackwardData(cudnnHandle_t handle, - const void *alpha, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, - void *workSpace, - size_t workSpaceSizeInBytes, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( + cudnnHandle_t handle, const void *alpha, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, void *workSpace, + size_t workSpaceSizeInBytes, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, + size_t, const void *, const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); + return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI -cudnnIm2Col(cudnnHandle_t handle, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - void *colBuffer) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, void *); +cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const void *x, const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, + const void *, const cudnnFilterDescriptor_t, + const cudnnConvolutionDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnIm2Col"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer); } -cudnnStatus_t CUDNNWINAPI -cudnnSoftmaxForward(cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = 
cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnSoftmaxBackward(cudnnHandle_t handle, - cudnnSoftmaxAlgorithm_t algo, - cudnnSoftmaxMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward( + cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSoftmaxBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, dx); + return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, + dx); } cudnnStatus_t CUDNNWINAPI cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreatePoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, - int windowHeight, - int windowWidth, - int verticalPadding, - int horizontalPadding, - int verticalStride, - int horizontalStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, int, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( + cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, + cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, + int, int, int, int, int); static auto func_ptr = LoadSymbol("cudnnSetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + 
verticalStride, horizontalStride); } -cudnnStatus_t CUDNNWINAPI -cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *windowHeight, - int *windowWidth, - int *verticalPadding, - int *horizontalPadding, - int *verticalStride, - int *horizontalStride) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, + cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight, + int *windowWidth, int *verticalPadding, int *horizontalPadding, + int *verticalStride, int *horizontalStride) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetPooling2dDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, windowWidth, verticalPadding, horizontalPadding, verticalStride, horizontalStride); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, + windowWidth, verticalPadding, horizontalPadding, + verticalStride, horizontalStride); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc, - const cudnnPoolingMode_t mode, - const cudnnNanPropagation_t maxpoolingNanOpt, - int nbDims, - const int windowDimA[], - const int paddingA[], - const int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, const cudnnNanPropagation_t, int, const int [], const int [], const int []); +cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( + cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, + const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, + const int windowDimA[], const int paddingA[], const int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, + const cudnnNanPropagation_t, int, const int[], const int[], const int[]); static auto func_ptr = LoadSymbol("cudnnSetPoolingNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, + paddingA, strideA); } -cudnnStatus_t CUDNNWINAPI -cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc, - int nbDimsRequested, - cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, - int *nbDims, - int windowDimA[], - int paddingA[], - int strideA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, cudnnNanPropagation_t *, int *, int [], int [], int []); +cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( + const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, + cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, + int *nbDims, int windowDimA[], int paddingA[], int strideA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, + cudnnNanPropagation_t *, int *, int[], int[], int[]); static auto func_ptr = LoadSymbol("cudnnGetPoolingNdDescriptor"); if (!func_ptr) return 
GetSymbolNotFoundError(); - return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, windowDimA, paddingA, strideA); + return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, + windowDimA, paddingA, strideA); } cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, - int outputTensorDimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int, int []); - static auto func_ptr = LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); + int nbDims, int outputTensorDimA[]) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, int, int[]); + static auto func_ptr = + LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA); } @@ -1207,72 +1267,69 @@ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, const cudnnTensorDescriptor_t inputTensorDesc, - int *n, - int *c, - int *h, - int *w) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, const cudnnTensorDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetPooling2dForwardOutputDim"); + int *n, int *c, int *h, int *w) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, + const cudnnTensorDescriptor_t, + int *, int *, int *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetPooling2dForwardOutputDim"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w); } cudnnStatus_t CUDNNWINAPI cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPoolingDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyPoolingDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(poolingDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnPoolingForward(cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnPoolingBackward(cudnnHandle_t handle, - const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, - const 
cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( + cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnPoolingBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t mode, - cudnnNanPropagation_t reluNanOpt, - double coef) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t, cudnnActivationMode_t, cudnnNanPropagation_t, double); +cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( + cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, + cudnnNanPropagation_t reluNanOpt, double coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t, + cudnnActivationMode_t, + cudnnNanPropagation_t, double); static auto func_ptr = LoadSymbol("cudnnSetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); @@ -1281,9 +1338,10 @@ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc, cudnnStatus_t CUDNNWINAPI cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, - double *coef) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnActivationDescriptor_t, cudnnActivationMode_t *, cudnnNanPropagation_t *, double *); + cudnnNanPropagation_t *reluNanOpt, double *coef) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnActivationDescriptor_t, cudnnActivationMode_t *, + cudnnNanPropagation_t *, double *); static auto func_ptr = LoadSymbol("cudnnGetActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc, mode, reluNanOpt, coef); @@ -1291,65 
+1349,68 @@ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, cudnnStatus_t CUDNNWINAPI cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnActivationDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyActivationDescriptor"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyActivationDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(activationDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnActivationForward(cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnActivationForward( + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnActivationBackward(cudnnHandle_t handle, - cudnnActivationDescriptor_t activationDesc, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnActivationDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( + cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnActivationDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnActivationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, + beta, dxDesc, dx); } cudnnStatus_t CUDNNWINAPI cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t *); + using 
FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int, double, double, double); +cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned lrnN, double lrnAlpha, + double lrnBeta, double lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int, double, double, double); static auto func_ptr = LoadSymbol("cudnnSetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); } -cudnnStatus_t CUDNNWINAPI -cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, + unsigned *lrnN, + double *lrnAlpha, + double *lrnBeta, double *lrnK) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); static auto func_ptr = LoadSymbol("cudnnGetLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); @@ -1357,157 +1418,157 @@ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrn cudnnStatus_t CUDNNWINAPI cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnLRNDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyLRNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(lrnDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnLRNCrossChannelForward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnLRNCrossChannelBackward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnLRNMode_t lrnMode, - const void *alpha, - const cudnnTensorDescriptor_t yDesc, - const void *y, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t 
xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, + const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, + const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx); + return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, + x, beta, dxDesc, dx); } -cudnnStatus_t CUDNNWINAPI -cudnnDivisiveNormalizationForward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - void *temp, - void *temp2, - const void *beta, - const cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationForward"); +cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, + const void *, const cudnnTensorDescriptor_t, void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, beta, yDesc, y); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, + beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnDivisiveNormalizationBackward(cudnnHandle_t handle, - cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - const void *dy, - void *temp, - void *temp2, - 
const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ - void *dx, /* output x differential */ - void *dMeans) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *, void *, const void *, const cudnnTensorDescriptor_t, void *, void *); - static auto func_ptr = LoadSymbol("cudnnDivisiveNormalizationBackward"); +cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( + cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, + cudnnDivNormMode_t mode, const void *alpha, + const cudnnTensorDescriptor_t + xDesc, /* same desc for x, means, dy, temp, temp2 */ + const void *x, + const void *means, /* if NULL, means are assumed to be zero */ + const void *dy, void *temp, void *temp2, const void *beta, + const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ + void *dx, /* output x differential */ + void *dMeans) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *, void *, const void *, const cudnnTensorDescriptor_t, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnDivisiveNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, temp2, beta, dXdMeansDesc, dx, dMeans); + return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, + temp2, beta, dXdMeansDesc, dx, dMeans); } -cudnnStatus_t CUDNNWINAPI -cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc, - const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, cudnnBatchNormMode_t); +cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( + cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, + cudnnBatchNormMode_t mode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, + cudnnBatchNormMode_t); static auto func_ptr = LoadSymbol("cudnnDeriveBNTensorDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(derivedBnDesc, xDesc, mode); } cudnnStatus_t CUDNNWINAPI -cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t zDesc, - const cudnnTensorDescriptor_t yDesc, - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const cudnnActivationDescriptor_t activationDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize"); +cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc, + const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t 
activationDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnActivationDescriptor_t, size_t *); + static auto func_ptr = LoadSymbol( + "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc, bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes); + return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc, + bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, - const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t yDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnTensorDescriptor_t dzDesc, - const cudnnTensorDescriptor_t dxDesc, - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const cudnnActivationDescriptor_t activationDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationBackwardExWorkspaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationBackwardExWorkspaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc, + const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnActivationDescriptor_t, size_t *); + static auto func_ptr = + LoadSymbol("cudnnGetBatchNormalizationBackwardExWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc, dBnScaleBiasDesc, activationDesc, sizeInBytes); + return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc, + dBnScaleBiasDesc, activationDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetBatchNormalizationTrainingExReserveSpaceSize"); +cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const 
cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, + size_t *); + static auto func_ptr = LoadSymbol( + "cudnnGetBatchNormalizationTrainingExReserveSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alpha, /* alpha[0] = result blend factor */ const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ /* Shared desc for the next 6 tensors in the argument list. Data type to be set as follows: @@ -1515,13 +1576,13 @@ cudnnBatchNormalizationForwardTraining( Dimensions for this descriptor depend on normalization mode - Spatial Normalization : tensors are expected to have dims 1xCx1x1 (normalization is performed across NxHxW) - - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW - (normalization is performed across N) */ + - Per-Activation Normalization : tensors are expected to have dims of + 1xCxHxW (normalization is performed across N) */ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */ - const void *bnScale, - const void *bnBias, + /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation + */ + const void *bnScale, const void *bnBias, /* MUST use factor=1 in the very first call of a complete training cycle. Use a factor=1/(1+n) at N-th call to the function to get @@ -1539,248 +1600,261 @@ cudnnBatchNormalizationForwardTraining( of variance[x] (factor is applied in the same way as for runningMean) */ void *resultRunningVariance, - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and + backward functions. */ double epsilon, /* Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. 
NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTraining"); + void *resultSaveMean, void *resultSaveInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, double, void *, void *, double, void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); + return func_ptr( + handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, + bnScale, bnBias, exponentialAverageFactor, resultRunningMean, + resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationForwardTrainingEx( - cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, const void *alpha, /* alpha[0] = result blend factor */ const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *xData, - const cudnnTensorDescriptor_t zDesc, - const void *zData, - const cudnnTensorDescriptor_t yDesc, - void *yData, + const cudnnTensorDescriptor_t xDesc, const void *xData, + const cudnnTensorDescriptor_t zDesc, const void *zData, + const cudnnTensorDescriptor_t yDesc, void *yData, - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, const void *bnBias, - double exponentialAverageFactor, - void *resultRunningMean, + double exponentialAverageFactor, void *resultRunningMean, void *resultRunningVariance, - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */ + /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and + backward functions. */ double epsilon, /* Optionally save intermediate results from the forward pass here - can be reused to speed up backward pass. 
NULL if unused */ - void *resultSaveMean, - void *resultSaveInvVariance, + void *resultSaveMean, void *resultSaveInvVariance, - cudnnActivationDescriptor_t activationDesc, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, + cudnnActivationDescriptor_t activationDesc, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, void *, void *, double, void *, void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardTrainingEx"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, double, void *, void *, double, void *, + void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardTrainingEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData, yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance, activationDesc, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData, + yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias, + exponentialAverageFactor, resultRunningMean, + resultRunningVariance, epsilon, resultSaveMean, + resultSaveInvVariance, activationDesc, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationForwardInference(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, - const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, - void *y, /* NxCxHxW */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const void *bnScale, - const void *bnBias, - const void *estimatedMean, - const void *estimatedVariance, - double epsilon) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, const void *, double); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationForwardInference"); +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + const void *alpha, /* alpha[0] = result blend factor */ + const void *beta, /* beta[0] = dest layer blend factor */ + const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ + const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, + const 
void *bnBias, const void *estimatedMean, + const void *estimatedVariance, double epsilon) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, const void *, const void *, const void *, double); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon); + return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, + estimatedVariance, epsilon); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationBackward(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ - const void *x, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - /* Shared tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, /* bnBias doesn't affect backpropagation */ - /* scale and bias diff are not backpropagated below this layer */ - void *dBnScaleResult, - void *dBnBiasResult, - /* Same epsilon as forward pass */ - double epsilon, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff, + const void *betaDataDiff, const void *alphaParamDiff, + const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ + const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy, + const cudnnTensorDescriptor_t dxDesc, void *dx, + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScale, /* bnBias doesn't affect backpropagation */ + /* scale and bias diff are not backpropagated below this layer */ + void *dBnScaleResult, void *dBnBiasResult, + /* Same epsilon as forward pass */ + double epsilon, - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, void *, void *, double, const void *, const void *); + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, const void *savedInvVariance) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const void *, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, + const void *, void *, void *, double, const void *, const void *); static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, x, 
dyDesc, dy, dxDesc, dx, dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, epsilon, savedMean, savedInvVariance); + return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, + betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, + dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance); } -cudnnStatus_t CUDNNWINAPI -cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle, - cudnnBatchNormMode_t mode, - cudnnBatchNormOps_t bnOps, +cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackwardEx( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, - const void *alphaDataDiff, - const void *betaDataDiff, - const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, - const void *xData, - const cudnnTensorDescriptor_t yDesc, - const void *yData, - const cudnnTensorDescriptor_t dyDesc, - const void *dyData, - const cudnnTensorDescriptor_t dzDesc, - void *dzData, - const cudnnTensorDescriptor_t dxDesc, - void *dxData, + const void *alphaDataDiff, const void *betaDataDiff, + const void *alphaParamDiff, const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, const void *xData, + const cudnnTensorDescriptor_t yDesc, const void *yData, + const cudnnTensorDescriptor_t dyDesc, const void *dyData, + const cudnnTensorDescriptor_t dzDesc, void *dzData, + const cudnnTensorDescriptor_t dxDesc, void *dxData, - /* Shared tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScaleData, - const void *bnBiasData, /* needed if there is activation */ - void *dBnScaleData, - void *dBnBiasData, - double epsilon, /* Same epsilon as forward pass */ + /* Shared tensor desc for the 4 tensors below */ + const cudnnTensorDescriptor_t dBnScaleBiasDesc, const void *bnScaleData, + const void *bnBiasData, /* needed if there is activation */ + void *dBnScaleData, void *dBnBiasData, + double epsilon, /* Same epsilon as forward pass */ - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, - const void *savedInvVariance, - cudnnActivationDescriptor_t activationDesc, - void *workSpace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const void *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, void *, double, const void *, const void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackwardEx"); + /* Optionally cached intermediate results from + forward pass */ + const void *savedMean, const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, void *workSpace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, + const void *, const void *, const void *, const cudnnTensorDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const 
cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, + void *, double, const void *, const void *, cudnnActivationDescriptor_t, + void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnBatchNormalizationBackwardEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData, dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr( + handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff, + betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, + dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData, + dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc, + workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( + cudnnSpatialTransformerDescriptor_t *stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); + static auto func_ptr = + LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc, - cudnnSamplerType_t samplerType, - cudnnDataType_t dataType, - const int nbDims, - const int dimA[]) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, const int, const int []); - static auto func_ptr = LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType, + cudnnDataType_t dataType, const int nbDims, const int dimA[]) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, + const int, const int[]); + static auto func_ptr = + LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc, samplerType, dataType, nbDims, dimA); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); +cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( + cudnnSpatialTransformerDescriptor_t stDesc) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); + static auto func_ptr = + LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(stDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *theta, - void *grid) { 
- using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorForward"); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *theta, void *grid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, theta, grid); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle, - const cudnnSpatialTransformerDescriptor_t stDesc, - const void *dgrid, - void *dtheta) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( + cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, + const void *dgrid, void *dtheta) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, + void *); + static auto func_ptr = + LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, dgrid, dtheta); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfSamplerForward(cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *grid, - const void *beta, - cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, cudnnTensorDescriptor_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( + cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc, + void *y) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + cudnnTensorDescriptor_t, void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerForward"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y); } -cudnnStatus_t CUDNNWINAPI -cudnnSpatialTfSamplerBackward(cudnnHandle_t handle, - cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, - const cudnnTensorDescriptor_t xDesc, - const void *x, - const void *beta, - const cudnnTensorDescriptor_t dxDesc, - void *dx, - const void *alphaDgrid, - const cudnnTensorDescriptor_t dyDesc, - const void *dy, - const void *grid, - const void *betaDgrid, - void *dgrid) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const cudnnTensorDescriptor_t, void *, const void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, void *); +cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerBackward( + 
cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, + const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, + const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx, + const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc, + const void *dy, const void *grid, const void *betaDgrid, void *dgrid) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, const void *, + void *); static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, dyDesc, dy, grid, betaDgrid, dgrid); + return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, betaDgrid, dgrid); } cudnnStatus_t CUDNNWINAPI cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); @@ -1788,99 +1862,95 @@ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetStatesSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize( + cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnDropoutGetReserveSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(xdesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void *states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI 
*)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnSetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float dropout, - void *states, - size_t stateSizeInBytes, - unsigned long long seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float, void *, size_t, unsigned long long); +cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, + void *states, size_t stateSizeInBytes, unsigned long long seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float, void *, size_t, unsigned long long); static auto func_ptr = LoadSymbol("cudnnRestoreDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc, - cudnnHandle_t handle, - float *dropout, - void **states, - unsigned long long *seed) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, float *, void **, unsigned long long *); +cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor( + cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout, + void **states, unsigned long long *seed) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, + float *, void **, unsigned long long *); static auto func_ptr = LoadSymbol("cudnnGetDropoutDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(dropoutDesc, handle, dropout, states, seed); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutForward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t xdesc, - const void *x, - const cudnnTensorDescriptor_t ydesc, - void *y, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutForward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t xdesc, const void *x, + const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnDropoutBackward(cudnnHandle_t handle, - const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t dydesc, - const void *dy, - const cudnnTensorDescriptor_t dxdesc, - void *dx, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI 
*)(cudnnHandle_t, const cudnnDropoutDescriptor_t, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward( + cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, + const cudnnTensorDescriptor_t dydesc, const void *dy, + const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnDropoutDescriptor_t, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnDropoutBackward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, + reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); @@ -1888,132 +1958,130 @@ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t mathPrec) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, cudnnDropoutDescriptor_t dropoutDesc, + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t mathPrec) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, mathPrec); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, mathPrec); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNDescriptor(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - int *hiddenSize, - int *numLayers, - cudnnDropoutDescriptor_t *dropoutDesc, - cudnnRNNInputMode_t *inputMode, - cudnnDirectionMode_t *direction, - cudnnRNNMode_t *mode, - cudnnRNNAlgo_t *algo, - cudnnDataType_t *mathPrec) { 
- using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize, + int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc, + cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction, + cudnnRNNMode_t *mode, cudnnRNNAlgo_t *algo, cudnnDataType_t *mathPrec) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, + cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, + cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, mathPrec); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, mathPrec); } cudnnStatus_t CUDNNWINAPI cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, mType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType( + cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNMatrixMathType"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, mType); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t biasMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNBiasMode_t biasMode) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t); static auto func_ptr = LoadSymbol("cudnnSetRNNBiasMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, biasMode); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t *biasMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, + cudnnRNNBiasMode_t *biasMode) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNBiasMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, biasMode); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNSetClip(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - cudnnRNNClipMode_t clipMode, - cudnnNanPropagation_t clipNanOpt, - double lclip, - double rclip) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, cudnnNanPropagation_t, double, double); 
+cudnnStatus_t CUDNNWINAPI cudnnRNNSetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t clipMode, + cudnnNanPropagation_t clipNanOpt, + double lclip, double rclip) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, + cudnnNanPropagation_t, double, double); static auto func_ptr = LoadSymbol("cudnnRNNSetClip"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNGetClip(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - cudnnRNNClipMode_t *clipMode, - cudnnNanPropagation_t *clipNanOpt, - double *lclip, - double *rclip) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, cudnnNanPropagation_t *, double *, double *); +cudnnStatus_t CUDNNWINAPI cudnnRNNGetClip(cudnnHandle_t handle, + cudnnRNNDescriptor_t rnnDesc, + cudnnRNNClipMode_t *clipMode, + cudnnNanPropagation_t *clipNanOpt, + double *lclip, double *rclip) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, + cudnnNanPropagation_t *, double *, double *); static auto func_ptr = LoadSymbol("cudnnRNNGetClip"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); } cudnnStatus_t CUDNNWINAPI -cudnnSetRNNProjectionLayers(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int recProjSize, - const int outProjSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); +cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, + const int recProjSize, const int outProjSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); static auto func_ptr = LoadSymbol("cudnnSetRNNProjectionLayers"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, recProjSize, outProjSize); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNProjectionLayers(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - int *recProjSize, - int *outProjSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize, + int *outProjSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetRNNProjectionLayers"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, recProjSize, outProjSize); } -cudnnStatus_t CUDNNWINAPI -cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, - const int minibatch, - const cudnnDataType_t dataType, - cudnnPersistentRNNPlan_t *plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, const cudnnDataType_t, cudnnPersistentRNNPlan_t *); +cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, const int minibatch, + const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, + const cudnnDataType_t, + cudnnPersistentRNNPlan_t *); static auto func_ptr = LoadSymbol("cudnnCreatePersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return 
func_ptr(rnnDesc, minibatch, dataType, plan); @@ -2021,209 +2089,206 @@ cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnStatus_t CUDNNWINAPI cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnDestroyPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(plan); } -cudnnStatus_t CUDNNWINAPI -cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnPersistentRNNPlan_t); +cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan( + cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnPersistentRNNPlan_t); static auto func_ptr = LoadSymbol("cudnnSetPersistentRNNPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, plan); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNWorkspaceSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, + size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNTrainingReserveSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnGetRNNParamsSize(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnTensorDescriptor_t xDesc, - size_t *sizeInBytes, +cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes, cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, size_t *, cudnnDataType_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, + size_t *, cudnnDataType_t); static auto func_ptr = 
LoadSymbol("cudnnGetRNNParamsSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerMatDesc, - void **linLayerMat) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerMatrixParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerMatDesc, linLayerMat); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, + linLayerMatDesc, linLayerMat); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, - const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const int linLayerID, - cudnnFilterDescriptor_t linLayerBiasDesc, - void **linLayerBias) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const void *, const int, cudnnFilterDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, + cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, + const void *, const int, cudnnFilterDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerBiasParams"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, linLayerBiasDesc, linLayerBias); + return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, + linLayerBiasDesc, linLayerBias); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardInference(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - 
void *cy, - void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInference"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardTraining(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t 
reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTraining"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardData(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const cudnnTensorDescriptor_t *dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnTensorDescriptor_t *dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, + const void *y, const cudnnTensorDescriptor_t *dyDesc, + const void *dy, const cudnnTensorDescriptor_t dhyDesc, + const void *dhy, const cudnnTensorDescriptor_t dcyDesc, + const void *dcy, const cudnnFilterDescriptor_t wDesc, + const void *w, const cudnnTensorDescriptor_t hxDesc, + const void *hx, const cudnnTensorDescriptor_t cxDesc, + const void *cx, const cudnnTensorDescriptor_t *dxDesc, + void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, + void *workspace, size_t workSpaceSizeInBytes, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const 
void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardWeights(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const void *workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + const void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeights"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNPaddingMode( + cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t paddingMode) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t); static auto func_ptr = LoadSymbol("cudnnSetRNNPaddingMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, paddingMode); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI 
*)(cudnnRNNDescriptor_t, cudnnRNNPaddingMode_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNPaddingMode( + cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *paddingMode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, + cudnnRNNPaddingMode_t *); static auto func_ptr = LoadSymbol("cudnnGetRNNPaddingMode"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDesc, paddingMode); @@ -2231,7 +2296,7 @@ cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNPaddingMode_t *padd cudnnStatus_t CUDNNWINAPI cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDataDesc); @@ -2239,338 +2304,352 @@ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(rnnDataDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, - cudnnDataType_t dataType, - cudnnRNNDataLayout_t layout, - int maxSeqLength, - int batchSize, - int vectorSize, - const int seqLengthArray[], /* length of each sequence in the batch */ - void *paddingFill) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, int, const int [], void *); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDataDescriptor( + cudnnRNNDataDescriptor_t rnnDataDesc, cudnnDataType_t dataType, + cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize, + int vectorSize, + const int seqLengthArray[], /* length of each sequence in the batch */ + void *paddingFill) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, + int, const int[], void *); static auto func_ptr = LoadSymbol("cudnnSetRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, seqLengthArray, paddingFill); + return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize, + vectorSize, seqLengthArray, paddingFill); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc, - cudnnDataType_t *dataType, - cudnnRNNDataLayout_t *layout, - int *maxSeqLength, - int *batchSize, - int *vectorSize, - int arrayLengthRequested, - int seqLengthArray[], - void *paddingFill) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, int *, int *, int *, int, int [], void *); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNDataDescriptor( + cudnnRNNDataDescriptor_t rnnDataDesc, cudnnDataType_t *dataType, + cudnnRNNDataLayout_t *layout, int *maxSeqLength, int *batchSize, + int *vectorSize, int arrayLengthRequested, int seqLengthArray[], + void *paddingFill) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, + int *, int *, int *, int, int[], void *); static auto 
func_ptr = LoadSymbol("cudnnGetRNNDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize, vectorSize, arrayLengthRequested, seqLengthArray, paddingFill); + return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize, + vectorSize, arrayLengthRequested, seqLengthArray, + paddingFill); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardTrainingEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnRNNDataDescriptor_t yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ - const void *keys, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ - void *cAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ - void *iAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ - void *queries, /* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTrainingEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnRNNDataDescriptor_t yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + 
const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, + void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardTrainingEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, + yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, + iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNForwardInferenceEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnRNNDataDescriptor_t yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ - const void *keys, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ - void *cAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ - void *iAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ - void *queries, /* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInferenceEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnRNNDataDescriptor_t yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, + const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ + const void *keys, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ + void *cAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ + void *iAttn, /* reserved, should pass NULL */ + const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ + void *queries, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const 
cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, + void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNForwardInferenceEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, + yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, + iDesc, iAttn, qDesc, queries, workSpace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardDataEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t yDesc, - const void *y, - const cudnnRNNDataDescriptor_t dyDesc, - const void *dy, - const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ - const void *dcAttn, /* reserved, should pass NULL */ - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnRNNDataDescriptor_t dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */ - void *dkeys, /* reserved, should pass NULL */ - void *workSpace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardDataEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t yDesc, const void *y, + const cudnnRNNDataDescriptor_t dyDesc, const void *dy, + const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ + const void *dcAttn, /* reserved, should pass NULL */ + const cudnnTensorDescriptor_t dhyDesc, const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, const void *dcy, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnRNNDataDescriptor_t dxDesc, void *dx, + const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, + const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should 
pass NULL */ + void *dkeys, /* reserved, should pass NULL */ + void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnRNNDataDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, + const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardDataEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, + dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, + dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, + workSpace, workSpaceSizeInBytes, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardWeightsEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnRNNDataDescriptor_t yDesc, - const void *y, - void *workSpace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnRNNDataDescriptor_t, const void *, void *, size_t, const cudnnFilterDescriptor_t, void *, void *, size_t); +cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeightsEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const cudnnRNNDataDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnRNNDataDescriptor_t yDesc, const void *y, void *workSpace, + size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, + const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnRNNDataDescriptor_t, const void *, void *, size_t, + const cudnnFilterDescriptor_t, void *, void *, size_t); static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeightsEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, + workSpaceSizeInBytes, dwDesc, dw, reserveSpace, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = 
cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnSetRNNAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, 
+ const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNForwardTrainingAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNForwardTrainingAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t *yDesc, - void *y, - const cudnnTensorDescriptor_t hyDesc, - void *hy, - const cudnnTensorDescriptor_t cyDesc, - void *cy, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t *yDesc, void *y, + const cudnnTensorDescriptor_t hyDesc, void *hy, + const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, 
+ size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, + wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNBackwardDataAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNBackwardDataAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - const cudnnTensorDescriptor_t *dyDesc, - const void *dy, - const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, - const cudnnFilterDescriptor_t wDesc, - const void *w, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t cxDesc, - const void *cx, - const cudnnTensorDescriptor_t *dxDesc, - void *dx, - const cudnnTensorDescriptor_t dhxDesc, - void *dhx, - const cudnnTensorDescriptor_t dcxDesc, - void *dcx, - const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - void *workspace, - size_t workSpaceSizeInBytes, - void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, void *, const float, const int, int *, 
cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y, + const cudnnTensorDescriptor_t *dyDesc, const void *dy, + const cudnnTensorDescriptor_t dhyDesc, const void *dhy, + const cudnnTensorDescriptor_t dcyDesc, const void *dcy, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t cxDesc, const void *cx, + const cudnnTensorDescriptor_t *dxDesc, void *dx, + const cudnnTensorDescriptor_t dhxDesc, void *dhx, + const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnAlgorithmPerformance_t *perfResults, void *workspace, + size_t workSpaceSizeInBytes, void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, + dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, + dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, + requestedAlgoCount, returnedAlgoCount, perfResults, workspace, + workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNBackwardWeightsAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); +cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, int *); + static auto func_ptr = + LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, rnnDesc, count); } -cudnnStatus_t CUDNNWINAPI -cudnnFindRNNBackwardWeightsAlgorithmEx(cudnnHandle_t handle, - const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, - const cudnnTensorDescriptor_t *xDesc, - const void *x, - const cudnnTensorDescriptor_t hxDesc, - const void *hx, - const cudnnTensorDescriptor_t *yDesc, - const void *y, - 
const float findIntensity, - const int requestedAlgoCount, - int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, - const void *workspace, - size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, - void *dw, - const void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnRNNDescriptor_t, const int, const cudnnTensorDescriptor_t *, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t *, const void *, const float, const int, int *, cudnnAlgorithmPerformance_t *, const void *, size_t, const cudnnFilterDescriptor_t, void *, const void *, size_t); - static auto func_ptr = LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); +cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx( + cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, + const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, + const cudnnTensorDescriptor_t hxDesc, const void *hx, + const cudnnTensorDescriptor_t *yDesc, const void *y, + const float findIntensity, const int requestedAlgoCount, + int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults, + const void *workspace, size_t workSpaceSizeInBytes, + const cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace, + size_t reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnRNNDescriptor_t, const int, + const cudnnTensorDescriptor_t *, const void *, + const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t *, const void *, const float, const int, + int *, cudnnAlgorithmPerformance_t *, const void *, size_t, + const cudnnFilterDescriptor_t, void *, const void *, size_t); + static auto func_ptr = + LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, findIntensity, requestedAlgoCount, returnedAlgoCount, perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, reserveSpaceSizeInBytes); + return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, + findIntensity, requestedAlgoCount, returnedAlgoCount, + perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSeqDataDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateSeqDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(seqDataDesc); @@ -2578,47 +2657,43 @@ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnSeqDataDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroySeqDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(seqDataDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc, - cudnnDataType_t dataType, - int nbDims, - const int dimA[], - const cudnnSeqDataAxis_t axes[], - size_t seqLengthArraySize, - const int seqLengthArray[], - void *paddingFill) { - using FuncPtr = cudnnStatus_t 
(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t, cudnnDataType_t, int, const int [], const cudnnSeqDataAxis_t [], size_t, const int [], void *); +cudnnStatus_t CUDNNWINAPI cudnnSetSeqDataDescriptor( + cudnnSeqDataDescriptor_t seqDataDesc, cudnnDataType_t dataType, int nbDims, + const int dimA[], const cudnnSeqDataAxis_t axes[], + size_t seqLengthArraySize, const int seqLengthArray[], void *paddingFill) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnSeqDataDescriptor_t, cudnnDataType_t, int, const int[], + const cudnnSeqDataAxis_t[], size_t, const int[], void *); static auto func_ptr = LoadSymbol("cudnnSetSeqDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(seqDataDesc, dataType, nbDims, dimA, axes, seqLengthArraySize, seqLengthArray, paddingFill); + return func_ptr(seqDataDesc, dataType, nbDims, dimA, axes, seqLengthArraySize, + seqLengthArray, paddingFill); } -cudnnStatus_t CUDNNWINAPI -cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc, - cudnnDataType_t *dataType, - int *nbDims, - int nbDimsRequested, - int dimA[], - cudnnSeqDataAxis_t axes[], - size_t *seqLengthArraySize, - size_t seqLengthSizeRequested, - int seqLengthArray[], - void *paddingFill) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnSeqDataDescriptor_t, cudnnDataType_t *, int *, int, int [], cudnnSeqDataAxis_t [], size_t *, size_t, int [], void *); +cudnnStatus_t CUDNNWINAPI cudnnGetSeqDataDescriptor( + const cudnnSeqDataDescriptor_t seqDataDesc, cudnnDataType_t *dataType, + int *nbDims, int nbDimsRequested, int dimA[], cudnnSeqDataAxis_t axes[], + size_t *seqLengthArraySize, size_t seqLengthSizeRequested, + int seqLengthArray[], void *paddingFill) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnSeqDataDescriptor_t, cudnnDataType_t *, int *, int, int[], + cudnnSeqDataAxis_t[], size_t *, size_t, int[], void *); static auto func_ptr = LoadSymbol("cudnnGetSeqDataDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(seqDataDesc, dataType, nbDims, nbDimsRequested, dimA, axes, seqLengthArraySize, seqLengthSizeRequested, seqLengthArray, paddingFill); + return func_ptr(seqDataDesc, dataType, nbDims, nbDimsRequested, dimA, axes, + seqLengthArraySize, seqLengthSizeRequested, seqLengthArray, + paddingFill); } cudnnStatus_t CUDNNWINAPI cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAttnDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAttnDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateAttnDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(attnDesc); @@ -2626,217 +2701,198 @@ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc) { cudnnStatus_t CUDNNWINAPI cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAttnDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAttnDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyAttnDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(attnDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, - cudnnAttnQueryMap_t queryMap, - int nHeads, - double smScaler, - cudnnDataType_t dataType, - cudnnDataType_t computePrec, - cudnnMathType_t mathType, - cudnnDropoutDescriptor_t attnDropoutDesc, - cudnnDropoutDescriptor_t postDropoutDesc, - int qSize, - int kSize, - int vSize, - int qProjSize, - int kProjSize, - int 
vProjSize, - int oProjSize, - int qoMaxSeqLength, - int kvMaxSeqLength, - int maxBatchSize, - int maxBeamSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAttnDescriptor_t, cudnnAttnQueryMap_t, int, double, cudnnDataType_t, cudnnDataType_t, cudnnMathType_t, cudnnDropoutDescriptor_t, cudnnDropoutDescriptor_t, int, int, int, int, int, int, int, int, int, int, int); +cudnnStatus_t CUDNNWINAPI cudnnSetAttnDescriptor( + cudnnAttnDescriptor_t attnDesc, cudnnAttnQueryMap_t queryMap, int nHeads, + double smScaler, cudnnDataType_t dataType, cudnnDataType_t computePrec, + cudnnMathType_t mathType, cudnnDropoutDescriptor_t attnDropoutDesc, + cudnnDropoutDescriptor_t postDropoutDesc, int qSize, int kSize, int vSize, + int qProjSize, int kProjSize, int vProjSize, int oProjSize, + int qoMaxSeqLength, int kvMaxSeqLength, int maxBatchSize, int maxBeamSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnAttnDescriptor_t, cudnnAttnQueryMap_t, int, double, cudnnDataType_t, + cudnnDataType_t, cudnnMathType_t, cudnnDropoutDescriptor_t, + cudnnDropoutDescriptor_t, int, int, int, int, int, int, int, int, int, + int, int); static auto func_ptr = LoadSymbol("cudnnSetAttnDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(attnDesc, queryMap, nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize); + return func_ptr(attnDesc, queryMap, nHeads, smScaler, dataType, computePrec, + mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, + vSize, qProjSize, kProjSize, vProjSize, oProjSize, + qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, - cudnnAttnQueryMap_t *queryMap, - int *nHeads, - double *smScaler, - cudnnDataType_t *dataType, - cudnnDataType_t *computePrec, - cudnnMathType_t *mathType, - cudnnDropoutDescriptor_t *attnDropoutDesc, - cudnnDropoutDescriptor_t *postDropoutDesc, - int *qSize, - int *kSize, - int *vSize, - int *qProjSize, - int *kProjSize, - int *vProjSize, - int *oProjSize, - int *qoMaxSeqLength, - int *kvMaxSeqLength, - int *maxBatchSize, - int *maxBeamSize) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAttnDescriptor_t, cudnnAttnQueryMap_t *, int *, double *, cudnnDataType_t *, cudnnDataType_t *, cudnnMathType_t *, cudnnDropoutDescriptor_t *, cudnnDropoutDescriptor_t *, int *, int *, int *, int *, int *, int *, int *, int *, int *, int *, int *); +cudnnStatus_t CUDNNWINAPI cudnnGetAttnDescriptor( + cudnnAttnDescriptor_t attnDesc, cudnnAttnQueryMap_t *queryMap, int *nHeads, + double *smScaler, cudnnDataType_t *dataType, cudnnDataType_t *computePrec, + cudnnMathType_t *mathType, cudnnDropoutDescriptor_t *attnDropoutDesc, + cudnnDropoutDescriptor_t *postDropoutDesc, int *qSize, int *kSize, + int *vSize, int *qProjSize, int *kProjSize, int *vProjSize, int *oProjSize, + int *qoMaxSeqLength, int *kvMaxSeqLength, int *maxBatchSize, + int *maxBeamSize) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnAttnDescriptor_t, cudnnAttnQueryMap_t *, int *, double *, + cudnnDataType_t *, cudnnDataType_t *, cudnnMathType_t *, + cudnnDropoutDescriptor_t *, cudnnDropoutDescriptor_t *, int *, int *, + int *, int *, int *, int *, int *, int *, int *, int *, int *); static auto func_ptr = LoadSymbol("cudnnGetAttnDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(attnDesc, queryMap, 
nHeads, smScaler, dataType, computePrec, mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, vSize, qProjSize, kProjSize, vProjSize, oProjSize, qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize); + return func_ptr(attnDesc, queryMap, nHeads, smScaler, dataType, computePrec, + mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, + vSize, qProjSize, kProjSize, vProjSize, oProjSize, + qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize); } -cudnnStatus_t CUDNNWINAPI -cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle, - const cudnnAttnDescriptor_t attnDesc, - size_t *weightSizeInBytes, - size_t *workSpaceSizeInBytes, - size_t *reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, size_t *, size_t *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetMultiHeadAttnBuffers( + cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, + size_t *weightSizeInBytes, size_t *workSpaceSizeInBytes, + size_t *reserveSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnAttnDescriptor_t, size_t *, size_t *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetMultiHeadAttnBuffers"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, weightSizeInBytes, workSpaceSizeInBytes, reserveSpaceSizeInBytes); + return func_ptr(handle, attnDesc, weightSizeInBytes, workSpaceSizeInBytes, + reserveSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle, - const cudnnAttnDescriptor_t attnDesc, - cudnnMultiHeadAttnWeightKind_t wKind, - size_t weightSizeInBytes, - const void *w, - cudnnTensorDescriptor_t wDesc, - void **wAddr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, cudnnMultiHeadAttnWeightKind_t, size_t, const void *, cudnnTensorDescriptor_t, void **); +cudnnStatus_t CUDNNWINAPI cudnnGetMultiHeadAttnWeights( + cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, + cudnnMultiHeadAttnWeightKind_t wKind, size_t weightSizeInBytes, + const void *w, cudnnTensorDescriptor_t wDesc, void **wAddr) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnAttnDescriptor_t, + cudnnMultiHeadAttnWeightKind_t, size_t, const void *, + cudnnTensorDescriptor_t, void **); static auto func_ptr = LoadSymbol("cudnnGetMultiHeadAttnWeights"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, attnDesc, wKind, weightSizeInBytes, w, wDesc, wAddr); } -cudnnStatus_t CUDNNWINAPI -cudnnMultiHeadAttnForward(cudnnHandle_t handle, - const cudnnAttnDescriptor_t attnDesc, - int currIdx, - const int *loWinIdx, - const int *hiWinIdx, - const int *seqLengthArrayQRO, - const int *seqLengthArrayKV, - const cudnnSeqDataDescriptor_t qDesc, - const void *queries, - const void *residuals, - const cudnnSeqDataDescriptor_t kDesc, - const void *keys, - const cudnnSeqDataDescriptor_t vDesc, - const void *values, - const cudnnSeqDataDescriptor_t oDesc, - void *out, - size_t weightSizeInBytes, - const void *w, - size_t workSpaceSizeInBytes, - void *workSpace, - size_t reserveSpaceSizeInBytes, - void *reserveSpace) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, int, const int *, const int *, const int *, const int *, const cudnnSeqDataDescriptor_t, const void *, const void *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, void *, size_t, const void 
*, size_t, void *, size_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnForward( + cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, int currIdx, + const int *loWinIdx, const int *hiWinIdx, const int *seqLengthArrayQRO, + const int *seqLengthArrayKV, const cudnnSeqDataDescriptor_t qDesc, + const void *queries, const void *residuals, + const cudnnSeqDataDescriptor_t kDesc, const void *keys, + const cudnnSeqDataDescriptor_t vDesc, const void *values, + const cudnnSeqDataDescriptor_t oDesc, void *out, size_t weightSizeInBytes, + const void *w, size_t workSpaceSizeInBytes, void *workSpace, + size_t reserveSpaceSizeInBytes, void *reserveSpace) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnAttnDescriptor_t, int, const int *, const int *, + const int *, const int *, const cudnnSeqDataDescriptor_t, const void *, + const void *, const cudnnSeqDataDescriptor_t, const void *, + const cudnnSeqDataDescriptor_t, const void *, + const cudnnSeqDataDescriptor_t, void *, size_t, const void *, size_t, + void *, size_t, void *); static auto func_ptr = LoadSymbol("cudnnMultiHeadAttnForward"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, currIdx, loWinIdx, hiWinIdx, seqLengthArrayQRO, seqLengthArrayKV, qDesc, queries, residuals, kDesc, keys, vDesc, values, oDesc, out, weightSizeInBytes, w, workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes, reserveSpace); + return func_ptr(handle, attnDesc, currIdx, loWinIdx, hiWinIdx, + seqLengthArrayQRO, seqLengthArrayKV, qDesc, queries, + residuals, kDesc, keys, vDesc, values, oDesc, out, + weightSizeInBytes, w, workSpaceSizeInBytes, workSpace, + reserveSpaceSizeInBytes, reserveSpace); } -cudnnStatus_t CUDNNWINAPI -cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle, - const cudnnAttnDescriptor_t attnDesc, - const int *loWinIdx, - const int *hiWinIdx, - const int *seqLengthArrayDQDO, - const int *seqLengthArrayDKDV, - const cudnnSeqDataDescriptor_t doDesc, - const void *dout, - const cudnnSeqDataDescriptor_t dqDesc, - void *dqueries, - const void *queries, - const cudnnSeqDataDescriptor_t dkDesc, - void *dkeys, - const void *keys, - const cudnnSeqDataDescriptor_t dvDesc, - void *dvalues, - const void *values, - size_t weightSizeInBytes, - const void *w, - size_t workSpaceSizeInBytes, - void *workSpace, - size_t reserveSpaceSizeInBytes, - void *reserveSpace) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, const int *, const int *, const int *, const int *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, void *, const void *, const cudnnSeqDataDescriptor_t, void *, const void *, const cudnnSeqDataDescriptor_t, void *, const void *, size_t, const void *, size_t, void *, size_t, void *); +cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnBackwardData( + cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, + const int *loWinIdx, const int *hiWinIdx, const int *seqLengthArrayDQDO, + const int *seqLengthArrayDKDV, const cudnnSeqDataDescriptor_t doDesc, + const void *dout, const cudnnSeqDataDescriptor_t dqDesc, void *dqueries, + const void *queries, const cudnnSeqDataDescriptor_t dkDesc, void *dkeys, + const void *keys, const cudnnSeqDataDescriptor_t dvDesc, void *dvalues, + const void *values, size_t weightSizeInBytes, const void *w, + size_t workSpaceSizeInBytes, void *workSpace, + size_t reserveSpaceSizeInBytes, void *reserveSpace) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const 
cudnnAttnDescriptor_t, const int *, const int *, + const int *, const int *, const cudnnSeqDataDescriptor_t, const void *, + const cudnnSeqDataDescriptor_t, void *, const void *, + const cudnnSeqDataDescriptor_t, void *, const void *, + const cudnnSeqDataDescriptor_t, void *, const void *, size_t, + const void *, size_t, void *, size_t, void *); static auto func_ptr = LoadSymbol("cudnnMultiHeadAttnBackwardData"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, loWinIdx, hiWinIdx, seqLengthArrayDQDO, seqLengthArrayDKDV, doDesc, dout, dqDesc, dqueries, queries, dkDesc, dkeys, keys, dvDesc, dvalues, values, weightSizeInBytes, w, workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes, reserveSpace); + return func_ptr(handle, attnDesc, loWinIdx, hiWinIdx, seqLengthArrayDQDO, + seqLengthArrayDKDV, doDesc, dout, dqDesc, dqueries, queries, + dkDesc, dkeys, keys, dvDesc, dvalues, values, + weightSizeInBytes, w, workSpaceSizeInBytes, workSpace, + reserveSpaceSizeInBytes, reserveSpace); } -cudnnStatus_t CUDNNWINAPI -cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle, - const cudnnAttnDescriptor_t attnDesc, - cudnnWgradMode_t addGrad, - const cudnnSeqDataDescriptor_t qDesc, - const void *queries, - const cudnnSeqDataDescriptor_t kDesc, - const void *keys, - const cudnnSeqDataDescriptor_t vDesc, - const void *values, - const cudnnSeqDataDescriptor_t doDesc, - const void *dout, - size_t weightSizeInBytes, - const void *w, - void *dw, - size_t workSpaceSizeInBytes, - void *workSpace, - size_t reserveSpaceSizeInBytes, - void *reserveSpace) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnAttnDescriptor_t, cudnnWgradMode_t, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, const void *, const cudnnSeqDataDescriptor_t, const void *, size_t, const void *, void *, size_t, void *, size_t, void *); - static auto func_ptr = LoadSymbol("cudnnMultiHeadAttnBackwardWeights"); +cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnBackwardWeights( + cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, + cudnnWgradMode_t addGrad, const cudnnSeqDataDescriptor_t qDesc, + const void *queries, const cudnnSeqDataDescriptor_t kDesc, const void *keys, + const cudnnSeqDataDescriptor_t vDesc, const void *values, + const cudnnSeqDataDescriptor_t doDesc, const void *dout, + size_t weightSizeInBytes, const void *w, void *dw, + size_t workSpaceSizeInBytes, void *workSpace, + size_t reserveSpaceSizeInBytes, void *reserveSpace) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnAttnDescriptor_t, cudnnWgradMode_t, + const cudnnSeqDataDescriptor_t, const void *, + const cudnnSeqDataDescriptor_t, const void *, + const cudnnSeqDataDescriptor_t, const void *, + const cudnnSeqDataDescriptor_t, const void *, size_t, const void *, + void *, size_t, void *, size_t, void *); + static auto func_ptr = + LoadSymbol("cudnnMultiHeadAttnBackwardWeights"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, addGrad, qDesc, queries, kDesc, keys, vDesc, values, doDesc, dout, weightSizeInBytes, w, dw, workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes, reserveSpace); + return func_ptr(handle, attnDesc, addGrad, qDesc, queries, kDesc, keys, vDesc, + values, doDesc, dout, weightSizeInBytes, w, dw, + workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes, + reserveSpace); } cudnnStatus_t CUDNNWINAPI cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t 
*ctcLossDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor( + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); } -cudnnStatus_t CUDNNWINAPI -cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, - cudnnDataType_t compType, - cudnnLossNormalizationMode_t normMode, - cudnnNanPropagation_t gradMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t, cudnnNanPropagation_t); +cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptorEx( + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType, + cudnnLossNormalizationMode_t normMode, cudnnNanPropagation_t gradMode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t, + cudnnNanPropagation_t); static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType, normMode, gradMode); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor( + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, - cudnnDataType_t *compType, - cudnnLossNormalizationMode_t *normMode, - cudnnNanPropagation_t *gradMode) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *, cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptorEx( + cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType, + cudnnLossNormalizationMode_t *normMode, cudnnNanPropagation_t *gradMode) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnCTCLossDescriptor_t, cudnnDataType_t *, + cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptorEx"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc, compType, normMode, gradMode); @@ -2844,82 +2900,102 @@ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnStatus_t CUDNNWINAPI cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI 
*)(cudnnCTCLossDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyCTCLossDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(ctcLossDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnCTCLoss( +cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( cudnnHandle_t handle, const cudnnTensorDescriptor_t - probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the - mini batch size, A is the alphabet size) */ - const void *probs, /* probabilities after softmax, in GPU memory */ - const int *labels, /* labels, in CPU memory */ - const int *labelLengths, /* the length of each label, in CPU memory */ - const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - void *costs, /* the returned costs of CTC, in GPU memory */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */ - const void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */ + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the timing steps, N is the + mini batch size, A is the alphabet size) */ + const void *probs, /* probabilities after softmax, in GPU memory */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + void *costs, /* the returned costs of CTC, in GPU memory */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the dimensions are + T,N,A */ + const void *gradients, /* the returned CTC gradients, in GPU memory, to + compute costs only, set it to NULL */ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ cudnnCTCLossDescriptor_t ctcLossDesc, - void *workspace, /* pointer to the workspace, in GPU memory */ + void *workspace, /* pointer to the workspace, in GPU memory */ size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, const int *, const int *, void *, const cudnnTensorDescriptor_t, const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int *, + const int *, const int *, void *, const cudnnTensorDescriptor_t, + const void *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, + size_t); static auto func_ptr = LoadSymbol("cudnnCTCLoss"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, workSpaceSizeInBytes); + return func_ptr(handle, probsDesc, probs, labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, algo, ctcLossDesc, workspace, + workSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCTCLossWorkspaceSize( +cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize( cudnnHandle_t handle, - const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the - timing steps, N is the mini batch size, A is the alphabet size) */ - const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the - dimensions are T,N,A. 
To compute costs - only, set it to NULL */ - const int *labels, /* labels, in CPU memory */ - const int *labelLengths, /* the length of each label, in CPU memory */ - const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const int *, const int *, const int *, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); + const cudnnTensorDescriptor_t + probsDesc, /* Tensor descriptor for probabilities, the dimensions are + T,N,A (T is the + timing steps, N is the mini batch size, A is the alphabet + size) */ + const cudnnTensorDescriptor_t + gradientsDesc, /* Tensor descriptor for gradients, the + dimensions are T,N,A. To compute costs + only, set it to NULL */ + const int *labels, /* labels, in CPU memory */ + const int *labelLengths, /* the length of each label, in CPU memory */ + const int *inputLengths, /* the lengths of timing steps in each batch, in + CPU memory */ + cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ + cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const int *, const int *, const int *, + cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, inputLengths, algo, ctcLossDesc, sizeInBytes); + return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, + inputLengths, algo, ctcLossDesc, sizeInBytes); } cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t); +cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor( + cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, + cudnnAlgorithm_t); static auto func_ptr = LoadSymbol("cudnnSetAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc, algorithm); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithm_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor( + const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, + cudnnAlgorithm_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc, algorithm); } -cudnnStatus_t CUDNNWINAPI 
-cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor( + const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, + cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnCopyAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(src, dest); @@ -2927,236 +3003,255 @@ cudnnCopyAlgorithmDescriptor(const cudnnAlgorithmDescriptor_t src, cudnnAlgorith cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmDescriptor"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); +cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance( + cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, numberToCreate); } -cudnnStatus_t CUDNNWINAPI -cudnnSetAlgorithmPerformance(cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t algoDesc, - cudnnStatus_t status, - float time, - size_t memory) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t, cudnnStatus_t, float, size_t); +cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance( + cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc, + cudnnStatus_t status, float time, size_t memory) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, + cudnnAlgorithmDescriptor_t, + cudnnStatus_t, float, size_t); static auto func_ptr = LoadSymbol("cudnnSetAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, algoDesc, status, time, memory); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmPerformance(const cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t *algoDesc, - cudnnStatus_t *status, - float *time, - size_t *memory) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, cudnnStatus_t *, float *, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance( + const cudnnAlgorithmPerformance_t algoPerf, + cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time, + size_t *memory) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, + cudnnStatus_t *, float *, size_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, algoDesc, status, time, memory); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroyAlgorithmPerformance(cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { - using FuncPtr = cudnnStatus_t 
(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); - static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmPerformance"); +cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance( + cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); + static auto func_ptr = + LoadSymbol("cudnnDestroyAlgorithmPerformance"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(algoPerf, numberToDestroy); } -cudnnStatus_t CUDNNWINAPI -cudnnGetAlgorithmSpaceSize(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, size_t *algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize( + cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, + size_t *algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); static auto func_ptr = LoadSymbol("cudnnGetAlgorithmSpaceSize"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoDesc, algoSpaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnSaveAlgorithm(cudnnHandle_t handle, - cudnnAlgorithmDescriptor_t algoDesc, - void *algoSpace, - size_t algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); +cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, + void *algoSpace, size_t algoSpaceSizeInBytes) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); static auto func_ptr = LoadSymbol("cudnnSaveAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes); } -cudnnStatus_t CUDNNWINAPI -cudnnRestoreAlgorithm(cudnnHandle_t handle, - void *algoSpace, - size_t algoSpaceSizeInBytes, - cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, cudnnAlgorithmDescriptor_t); +cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm( + cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes, + cudnnAlgorithmDescriptor_t algoDesc) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, + cudnnAlgorithmDescriptor_t); static auto func_ptr = LoadSymbol("cudnnRestoreAlgorithm"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc); } -cudnnStatus_t CUDNNWINAPI -cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); +cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata, + cudnnCallback_t fptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); static auto func_ptr = LoadSymbol("cudnnSetCallback"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(mask, udata, fptr); } -cudnnStatus_t CUDNNWINAPI -cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); +cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata, + cudnnCallback_t *fptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); static auto func_ptr = LoadSymbol("cudnnGetCallback"); if (!func_ptr) 
return GetSymbolNotFoundError(); return func_ptr(mask, udata, fptr); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t *, cudnnFusedOps_t); - static auto func_ptr = LoadSymbol("cudnnCreateFusedOpsConstParamPack"); +cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsConstParamPack( + cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t *, + cudnnFusedOps_t); + static auto func_ptr = + LoadSymbol("cudnnCreateFusedOpsConstParamPack"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(constPack, ops); } cudnnStatus_t CUDNNWINAPI cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t); - static auto func_ptr = LoadSymbol("cudnnDestroyFusedOpsConstParamPack"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyFusedOpsConstParamPack"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(constPack); } -cudnnStatus_t CUDNNWINAPI -cudnnSetFusedOpsConstParamPackAttribute(cudnnFusedOpsConstParamPack_t constPack, - cudnnFusedOpsConstParamLabel_t paramLabel, - const void *param) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t, cudnnFusedOpsConstParamLabel_t, const void *); - static auto func_ptr = LoadSymbol("cudnnSetFusedOpsConstParamPackAttribute"); +cudnnStatus_t CUDNNWINAPI cudnnSetFusedOpsConstParamPackAttribute( + cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, const void *param) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t, + cudnnFusedOpsConstParamLabel_t, + const void *); + static auto func_ptr = + LoadSymbol("cudnnSetFusedOpsConstParamPackAttribute"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(constPack, paramLabel, param); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFusedOpsConstParamPackAttribute(const cudnnFusedOpsConstParamPack_t constPack, - cudnnFusedOpsConstParamLabel_t paramLabel, - void *param, - int *isNULL) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFusedOpsConstParamPack_t, cudnnFusedOpsConstParamLabel_t, void *, int *); - static auto func_ptr = LoadSymbol("cudnnGetFusedOpsConstParamPackAttribute"); +cudnnStatus_t CUDNNWINAPI cudnnGetFusedOpsConstParamPackAttribute( + const cudnnFusedOpsConstParamPack_t constPack, + cudnnFusedOpsConstParamLabel_t paramLabel, void *param, int *isNULL) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + const cudnnFusedOpsConstParamPack_t, cudnnFusedOpsConstParamLabel_t, + void *, int *); + static auto func_ptr = + LoadSymbol("cudnnGetFusedOpsConstParamPackAttribute"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(constPack, paramLabel, param, isNULL); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t *, cudnnFusedOps_t); - static auto func_ptr = LoadSymbol("cudnnCreateFusedOpsVariantParamPack"); +cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsVariantParamPack( + cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + 
cudnnFusedOpsVariantParamPack_t *, cudnnFusedOps_t); + static auto func_ptr = + LoadSymbol("cudnnCreateFusedOpsVariantParamPack"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(varPack, ops); } cudnnStatus_t CUDNNWINAPI cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t); - static auto func_ptr = LoadSymbol("cudnnDestroyFusedOpsVariantParamPack"); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t); + static auto func_ptr = + LoadSymbol("cudnnDestroyFusedOpsVariantParamPack"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(varPack); } -cudnnStatus_t CUDNNWINAPI -cudnnSetFusedOpsVariantParamPackAttribute(cudnnFusedOpsVariantParamPack_t varPack, - cudnnFusedOpsVariantParamLabel_t paramLabel, - void *ptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t, cudnnFusedOpsVariantParamLabel_t, void *); - static auto func_ptr = LoadSymbol("cudnnSetFusedOpsVariantParamPackAttribute"); +cudnnStatus_t CUDNNWINAPI cudnnSetFusedOpsVariantParamPackAttribute( + cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, void *ptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t, + cudnnFusedOpsVariantParamLabel_t, void *); + static auto func_ptr = + LoadSymbol("cudnnSetFusedOpsVariantParamPackAttribute"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(varPack, paramLabel, ptr); } -cudnnStatus_t CUDNNWINAPI -cudnnGetFusedOpsVariantParamPackAttribute(const cudnnFusedOpsVariantParamPack_t varPack, - cudnnFusedOpsVariantParamLabel_t paramLabel, - void *ptr) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(const cudnnFusedOpsVariantParamPack_t, cudnnFusedOpsVariantParamLabel_t, void *); - static auto func_ptr = LoadSymbol("cudnnGetFusedOpsVariantParamPackAttribute"); +cudnnStatus_t CUDNNWINAPI cudnnGetFusedOpsVariantParamPackAttribute( + const cudnnFusedOpsVariantParamPack_t varPack, + cudnnFusedOpsVariantParamLabel_t paramLabel, void *ptr) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(const cudnnFusedOpsVariantParamPack_t, + cudnnFusedOpsVariantParamLabel_t, void *); + static auto func_ptr = + LoadSymbol("cudnnGetFusedOpsVariantParamPackAttribute"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(varPack, paramLabel, ptr); } -cudnnStatus_t CUDNNWINAPI -cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, cudnnFusedOps_t ops) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsPlan_t *, cudnnFusedOps_t); +cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, + cudnnFusedOps_t ops) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsPlan_t *, cudnnFusedOps_t); static auto func_ptr = LoadSymbol("cudnnCreateFusedOpsPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(plan, ops); } -cudnnStatus_t CUDNNWINAPI -cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnFusedOpsPlan_t); +cudnnStatus_t CUDNNWINAPI cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsPlan_t); static auto func_ptr = LoadSymbol("cudnnDestroyFusedOpsPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(plan); } cudnnStatus_t CUDNNWINAPI -cudnnMakeFusedOpsPlan(cudnnHandle_t handle, - cudnnFusedOpsPlan_t plan, +cudnnMakeFusedOpsPlan(cudnnHandle_t handle, 
cudnnFusedOpsPlan_t plan, const cudnnFusedOpsConstParamPack_t constPack, size_t *workspaceSizeInBytes) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnFusedOpsPlan_t, const cudnnFusedOpsConstParamPack_t, size_t *); + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnFusedOpsPlan_t, const cudnnFusedOpsConstParamPack_t, + size_t *); static auto func_ptr = LoadSymbol("cudnnMakeFusedOpsPlan"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, plan, constPack, workspaceSizeInBytes); } cudnnStatus_t CUDNNWINAPI -cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, cudnnFusedOpsVariantParamPack_t varPack) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, const cudnnFusedOpsPlan_t, cudnnFusedOpsVariantParamPack_t); +cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, + cudnnFusedOpsVariantParamPack_t varPack) { + using FuncPtr = + cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnFusedOpsPlan_t, + cudnnFusedOpsVariantParamPack_t); static auto func_ptr = LoadSymbol("cudnnFusedOpsExecute"); if (!func_ptr) return GetSymbolNotFoundError(); return func_ptr(handle, plan, varPack); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor_v6(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - const int hiddenSize, - const int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnRNNAlgo_t algo, - cudnnDataType_t mathPrec) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6( + cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, + const int numLayers, cudnnDropoutDescriptor_t dropoutDesc, + cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, + cudnnRNNMode_t mode, cudnnRNNAlgo_t algo, cudnnDataType_t mathPrec) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, + cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, + cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v6"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, algo, mathPrec); + return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, + inputMode, direction, mode, algo, mathPrec); } -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNDescriptor_v5(cudnnRNNDescriptor_t rnnDesc, - int hiddenSize, - int numLayers, - cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, - cudnnDirectionMode_t direction, - cudnnRNNMode_t mode, - cudnnDataType_t mathPrec) { - using FuncPtr = cudnnStatus_t (CUDNNWINAPI *)(cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, cudnnRNNMode_t, cudnnDataType_t); +cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v5( + cudnnRNNDescriptor_t rnnDesc, int hiddenSize, int numLayers, + cudnnDropoutDescriptor_t dropoutDesc, cudnnRNNInputMode_t inputMode, + cudnnDirectionMode_t direction, cudnnRNNMode_t mode, + cudnnDataType_t mathPrec) { + using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( + cudnnRNNDescriptor_t, int, int, cudnnDropoutDescriptor_t, + cudnnRNNInputMode_t, 
cudnnDirectionMode_t, cudnnRNNMode_t, + cudnnDataType_t); static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v5"); if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, direction, mode, mathPrec); + return func_ptr(rnnDesc, hiddenSize, numLayers, dropoutDesc, inputMode, + direction, mode, mathPrec); } } // extern "C" diff --git a/tensorflow/stream_executor/cuda/cusparse_10_1.inc b/tensorflow/stream_executor/cuda/cusparse_10_1.inc index 3b7f3815829..e94aa081b8c 100644 --- a/tensorflow/stream_executor/cuda/cusparse_10_1.inc +++ b/tensorflow/stream_executor/cuda/cusparse_10_1.inc @@ -8225,6 +8225,6 @@ cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM_bufferSize( bufferSize); } -#endif // _WIN32 +#endif // _WIN32 } // extern "C" diff --git a/tensorflow/stream_executor/cuda/cusparse_10_2.inc b/tensorflow/stream_executor/cuda/cusparse_10_2.inc index 3b7f3815829..e94aa081b8c 100644 --- a/tensorflow/stream_executor/cuda/cusparse_10_2.inc +++ b/tensorflow/stream_executor/cuda/cusparse_10_2.inc @@ -8225,6 +8225,6 @@ cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM_bufferSize( bufferSize); } -#endif // _WIN32 +#endif // _WIN32 } // extern "C" From 7b495df2da45bce9fdecf72c13879063d0e5284d Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Tue, 26 May 2020 23:15:21 -0700 Subject: [PATCH 1186/1533] Add CUDA 11 stub files. PiperOrigin-RevId: 313335318 Change-Id: I6e7717b580540fa2b039057702115fd25c3c54ad --- .../stream_executor/cuda/cublas_11_0.inc | 5023 +++++++++++++ tensorflow/stream_executor/cuda/cuda_11_0.inc | 2430 ++++++ .../cuda/cuda_runtime_11_0.inc | 1974 +++++ .../cuda/cusolver_dense_11_0.inc | 4686 ++++++++++++ .../stream_executor/cuda/cusparse_11_0.inc | 6584 +++++++++++++++++ 5 files changed, 20697 insertions(+) create mode 100644 tensorflow/stream_executor/cuda/cublas_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cuda_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cusparse_11_0.inc diff --git a/tensorflow/stream_executor/cuda/cublas_11_0.inc b/tensorflow/stream_executor/cuda/cublas_11_0.inc new file mode 100644 index 00000000000..c30b2cf8f68 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cublas_11_0.inc @@ -0,0 +1,5023 @@ +// Auto-generated, do not edit. 
+ +extern "C" { + +cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t *handle) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t *); + static auto func_ptr = LoadSymbol("cublasCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t); + static auto func_ptr = LoadSymbol("cublasDestroy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, + int *version) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int *); + static auto func_ptr = LoadSymbol("cublasGetVersion_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, version); +} + +cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(libraryPropertyType, int *); + static auto func_ptr = LoadSymbol("cublasGetProperty"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(type, value); +} + +size_t CUBLASWINAPI cublasGetCudartVersion(void) { + using FuncPtr = size_t(CUBLASWINAPI *)(); + static auto func_ptr = LoadSymbol("cublasGetCudartVersion"); + if (!func_ptr) LogFatalSymbolNotFound("cublasGetCudartVersion"); + return func_ptr(); +} + +cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasSetStream_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cudaStream_t *); + static auto func_ptr = LoadSymbol("cublasGetStream_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, + cublasPointerMode_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t *); + static auto func_ptr = LoadSymbol("cublasGetPointerMode_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, + cublasPointerMode_t mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasPointerMode_t); + static auto func_ptr = LoadSymbol("cublasSetPointerMode_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, + cublasAtomicsMode_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t *); + static auto func_ptr = LoadSymbol("cublasGetAtomicsMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, + cublasAtomicsMode_t mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasAtomicsMode_t); + static auto func_ptr = LoadSymbol("cublasSetAtomicsMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI 
cublasGetMathMode(cublasHandle_t handle, + cublasMath_t *mode) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t *); + static auto func_ptr = LoadSymbol("cublasGetMathMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, + cublasMath_t mode) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasMath_t); + static auto func_ptr = LoadSymbol("cublasSetMathMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, int logToStdOut, + int logToStdErr, + const char *logFileName) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const char *); + static auto func_ptr = LoadSymbol("cublasLoggerConfigure"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(logIsOn, logToStdOut, logToStdErr, logFileName); +} + +cublasStatus_t CUBLASWINAPI +cublasSetLoggerCallback(cublasLogCallback userCallback) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback); + static auto func_ptr = LoadSymbol("cublasSetLoggerCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(userCallback); +} + +cublasStatus_t CUBLASWINAPI +cublasGetLoggerCallback(cublasLogCallback *userCallback) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasLogCallback *); + static auto func_ptr = LoadSymbol("cublasGetLoggerCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(userCallback); +} + +cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void *x, + int incx, void *devicePtr, + int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int); + static auto func_ptr = LoadSymbol("cublasSetVector"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(n, elemSize, x, incx, devicePtr, incy); +} + +cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void *x, + int incx, void *y, int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, void *, int); + static auto func_ptr = LoadSymbol("cublasGetVector"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(n, elemSize, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *, + int, void *, int); + static auto func_ptr = LoadSymbol("cublasSetMatrix"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rows, cols, elemSize, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, + const void *A, int lda, void *B, + int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, int, const void *, + int, void *, int); + static auto func_ptr = LoadSymbol("cublasGetMatrix"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rows, cols, elemSize, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(int n, int elemSize, + const void *hostPtr, int incx, + void *devicePtr, int incy, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, + void *, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasSetVectorAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + 
return func_ptr(n, elemSize, hostPtr, incx, devicePtr, incy, stream); +} + +cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(int n, int elemSize, + const void *devicePtr, + int incx, void *hostPtr, + int incy, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, const void *, int, + void *, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasGetVectorAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(n, elemSize, devicePtr, incx, hostPtr, incy, stream); +} + +cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync(int rows, int cols, + int elemSize, const void *A, + int lda, void *B, int ldb, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + int, int, int, const void *, int, void *, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasSetMatrixAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream); +} + +cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync(int rows, int cols, + int elemSize, const void *A, + int lda, void *B, int ldb, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + int, int, int, const void *, int, void *, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasGetMatrixAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(rows, cols, elemSize, A, lda, B, ldb, stream); +} + +void CUBLASWINAPI cublasXerbla(const char *srName, int info) { + using FuncPtr = void(CUBLASWINAPI *)(const char *, int); + static auto func_ptr = LoadSymbol("cublasXerbla"); + if (!func_ptr) LogFatalSymbolNotFound("cublasXerbla"); + return func_ptr(srName, info); +} + +cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, void *result, + cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, void *, + cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasNrm2Ex"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, result, resultType, executionType); +} + +cublasStatus_t CUBLASWINAPI cublasSnrm2_v2(cublasHandle_t handle, int n, + const float *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSnrm2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDnrm2_v2(cublasHandle_t handle, int n, + const double *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDnrm2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasScnrm2_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, float *); + static auto func_ptr = LoadSymbol("cublasScnrm2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDznrm2_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + double *result) { + using FuncPtr = 
cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, double *); + static auto func_ptr = LoadSymbol("cublasDznrm2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, const void *y, + cudaDataType yType, int incy, + void *result, cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, const void *, + cudaDataType, int, void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasDotEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, + executionType); +} + +cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, const void *y, + cudaDataType yType, int incy, + void *result, cudaDataType resultType, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, const void *, + cudaDataType, int, void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasDotcEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, result, resultType, + executionType); +} + +cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, int n, + const float *x, int incx, + const float *y, int incy, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, int, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSdot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, int n, + const double *x, int incx, + const double *y, int incy, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, int, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDdot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + const cuComplex *y, int incy, + cuComplex *result) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + int, const cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCdotu_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + const cuComplex *y, int incy, + cuComplex *result) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + int, const cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCdotc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, 
int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZdotu_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZdotc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, result); +} + +cublasStatus_t CUBLASWINAPI +cublasScalEx(cublasHandle_t handle, int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, void *x, cudaDataType xType, int incx, + cudaDataType executionType) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, void *, cudaDataType, + int, cudaDataType); + static auto func_ptr = LoadSymbol("cublasScalEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, alphaType, x, xType, incx, executionType); +} + +cublasStatus_t CUBLASWINAPI +cublasSscal_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI +cublasDscal_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI +cublasCscal_v2(cublasHandle_t handle, int n, + const cuComplex *alpha, /* host or device pointer */ + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI +cublasCsscal_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI +cublasZscal_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI +cublasZdscal_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + 
cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZdscal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasAxpyEx( + cublasHandle_t handle, int n, + const void *alpha, /* host or device pointer */ + cudaDataType alphaType, const void *x, cudaDataType xType, int incx, + void *y, cudaDataType yType, int incy, cudaDataType executiontype) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, const void *, + cudaDataType, int, void *, cudaDataType, int, cudaDataType); + static auto func_ptr = LoadSymbol("cublasAxpyEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, alphaType, x, xType, incx, y, yType, incy, + executiontype); +} + +cublasStatus_t CUBLASWINAPI +cublasSaxpy_v2(cublasHandle_t handle, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSaxpy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasDaxpy_v2(cublasHandle_t handle, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDaxpy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasCaxpy_v2(cublasHandle_t handle, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *y, int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCaxpy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZaxpy_v2( + cublasHandle_t handle, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, + int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZaxpy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, alpha, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCopyEx(cublasHandle_t handle, int n, + const void *x, cudaDataType xType, + int incx, void *y, cudaDataType yType, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, void *, + cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCopyEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy); +} + +cublasStatus_t CUBLASWINAPI cublasScopy_v2(cublasHandle_t handle, int n, + const float *x, int incx, float *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + 
cublasHandle_t, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasScopy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDcopy_v2(cublasHandle_t handle, int n, + const double *x, int incx, double *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDcopy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCcopy_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCcopy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZcopy_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZcopy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSswap_v2(cublasHandle_t handle, int n, + float *x, int incx, float *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, + int, float *, int); + static auto func_ptr = LoadSymbol("cublasSswap_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasDswap_v2(cublasHandle_t handle, int n, + double *x, int incx, double *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, double *, + int, double *, int); + static auto func_ptr = LoadSymbol("cublasDswap_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasCswap_v2(cublasHandle_t handle, int n, + cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCswap_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZswap_v2(cublasHandle_t handle, int n, + cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZswap_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSwapEx(cublasHandle_t handle, int n, void *x, + cudaDataType xType, int incx, void *y, + cudaDataType yType, int incy) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, void *, cudaDataType, + int, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasSwapEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy); +} + +cublasStatus_t CUBLASWINAPI cublasIsamax_v2(cublasHandle_t handle, 
int n, + const float *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, int *); + static auto func_ptr = LoadSymbol("cublasIsamax_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIdamax_v2(cublasHandle_t handle, int n, + const double *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, int *); + static auto func_ptr = LoadSymbol("cublasIdamax_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIcamax_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cublasIcamax_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIzamax_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cublasIzamax_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIamaxEx( + cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx, + int *result /* host or device pointer */ +) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, int *); + static auto func_ptr = LoadSymbol("cublasIamaxEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIsamin_v2(cublasHandle_t handle, int n, + const float *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, int *); + static auto func_ptr = LoadSymbol("cublasIsamin_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIdamin_v2(cublasHandle_t handle, int n, + const double *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, int *); + static auto func_ptr = LoadSymbol("cublasIdamin_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIcamin_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cublasIcamin_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIzamin_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + int *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cublasIzamin_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasIaminEx( + cublasHandle_t handle, int n, const void *x, 
cudaDataType xType, int incx, + int *result /* host or device pointer */ +) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, int *); + static auto func_ptr = LoadSymbol("cublasIaminEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasAsumEx( + cublasHandle_t handle, int n, const void *x, cudaDataType xType, int incx, + void *result, cudaDataType resultType, /* host or device pointer */ + cudaDataType executiontype) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const void *, cudaDataType, int, void *, + cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasAsumEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, result, resultType, executiontype); +} + +cublasStatus_t CUBLASWINAPI cublasSasum_v2(cublasHandle_t handle, int n, + const float *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSasum_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDasum_v2(cublasHandle_t handle, int n, + const double *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, + const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDasum_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasScasum_v2(cublasHandle_t handle, int n, + const cuComplex *x, int incx, + float *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *, int, float *); + static auto func_ptr = LoadSymbol("cublasScasum_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI cublasDzasum_v2(cublasHandle_t handle, int n, + const cuDoubleComplex *x, int incx, + double *result) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *, int, double *); + static auto func_ptr = LoadSymbol("cublasDzasum_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, result); +} + +cublasStatus_t CUBLASWINAPI +cublasSrot_v2(cublasHandle_t handle, int n, float *x, int incx, float *y, + int incy, const float *c, /* host or device pointer */ + const float *s) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, float *, int, float *, + int, const float *, const float *); + static auto func_ptr = LoadSymbol("cublasSrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI +cublasDrot_v2(cublasHandle_t handle, int n, double *x, int incx, double *y, + int incy, const double *c, /* host or device pointer */ + const double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, double *, int, double *, int, const double *, + const double *); + static auto func_ptr = LoadSymbol("cublasDrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasCrot_v2( + cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y, + int 
incy, const float *c, /* host or device pointer */ + const cuComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, + const cuComplex *); + static auto func_ptr = LoadSymbol("cublasCrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasCsrot_v2( + cublasHandle_t handle, int n, cuComplex *x, int incx, cuComplex *y, + int incy, const float *c, /* host or device pointer */ + const float *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *, int, cuComplex *, int, const float *, + const float *); + static auto func_ptr = LoadSymbol("cublasCsrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasZrot_v2( + cublasHandle_t handle, int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */ + const cuDoubleComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, + const double *, const cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasZdrot_v2( + cublasHandle_t handle, int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, const double *c, /* host or device pointer */ + const double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, + const double *, const double *); + static auto func_ptr = LoadSymbol("cublasZdrot_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, c, s); +} + +cublasStatus_t CUBLASWINAPI +cublasRotEx(cublasHandle_t handle, int n, void *x, cudaDataType xType, int incx, + void *y, cudaDataType yType, int incy, + const void *c, /* host or device pointer */ + const void *s, cudaDataType csType, cudaDataType executiontype) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int, + const void *, const void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasRotEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, c, s, csType, + executiontype); +} + +cublasStatus_t CUBLASWINAPI +cublasSrotg_v2(cublasHandle_t handle, float *a, /* host or device pointer */ + float *b, /* host or device pointer */ + float *c, /* host or device pointer */ + float *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, float *, + float *, float *, float *); + static auto func_ptr = LoadSymbol("cublasSrotg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, c, s); +} + +cublasStatus_t CUBLASWINAPI +cublasDrotg_v2(cublasHandle_t handle, double *a, /* host or device pointer */ + double *b, /* host or device pointer */ + double *c, /* host or device pointer */ + double *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, double *, + double *, double *, double *); + static auto func_ptr = LoadSymbol("cublasDrotg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, c, s); +} + +cublasStatus_t CUBLASWINAPI 
+cublasCrotg_v2(cublasHandle_t handle, cuComplex *a, /* host or device pointer */ + cuComplex *b, /* host or device pointer */ + float *c, /* host or device pointer */ + cuComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cuComplex *, cuComplex *, float *, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCrotg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasZrotg_v2( + cublasHandle_t handle, cuDoubleComplex *a, /* host or device pointer */ + cuDoubleComplex *b, /* host or device pointer */ + double *c, /* host or device pointer */ + cuDoubleComplex *s) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cuDoubleComplex *, cuDoubleComplex *, double *, + cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZrotg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, c, s); +} + +cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle, + void *a, /* host or device pointer */ + void *b, /* host or device pointer */ + cudaDataType abType, + void *c, /* host or device pointer */ + void *s, /* host or device pointer */ + cudaDataType csType, + cudaDataType executiontype) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, void *, void *, + cudaDataType, void *, void *, + cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasRotgEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, a, b, abType, c, s, csType, executiontype); +} + +cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, int n, + float *x, int incx, float *y, + int incy, const float *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, float *, int, float *, int, const float *); + static auto func_ptr = LoadSymbol("cublasSrotm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, param); +} + +cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, int n, + double *x, int incx, double *y, + int incy, const double *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, double *, int, double *, int, const double *); + static auto func_ptr = LoadSymbol("cublasDrotm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, incx, y, incy, param); +} + +cublasStatus_t CUBLASWINAPI +cublasRotmEx(cublasHandle_t handle, int n, void *x, cudaDataType xType, + int incx, void *y, cudaDataType yType, int incy, + const void *param, /* host or device pointer */ + cudaDataType paramType, cudaDataType executiontype) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, void *, cudaDataType, int, void *, cudaDataType, int, + const void *, cudaDataType, cudaDataType); + static auto func_ptr = LoadSymbol("cublasRotmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, x, xType, incx, y, yType, incy, param, paramType, + executiontype); +} + +cublasStatus_t CUBLASWINAPI +cublasSrotmg_v2(cublasHandle_t handle, float *d1, /* host or device pointer */ + float *d2, /* host or device pointer */ + float *x1, /* host or device pointer */ + const float *y1, /* host or device pointer */ + float *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, float *, float *, float *, const float *, float *); + static auto func_ptr = LoadSymbol("cublasSrotmg_v2"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, d1, d2, x1, y1, param); +} + +cublasStatus_t CUBLASWINAPI +cublasDrotmg_v2(cublasHandle_t handle, double *d1, /* host or device pointer */ + double *d2, /* host or device pointer */ + double *x1, /* host or device pointer */ + const double *y1, /* host or device pointer */ + double *param) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, double *, double *, double *, const double *, double *); + static auto func_ptr = LoadSymbol("cublasDrotmg_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, d1, d2, x1, y1, param); +} + +cublasStatus_t CUBLASWINAPI +cublasRotmgEx(cublasHandle_t handle, void *d1, /* host or device pointer */ + cudaDataType d1Type, void *d2, /* host or device pointer */ + cudaDataType d2Type, void *x1, /* host or device pointer */ + cudaDataType x1Type, const void *y1, /* host or device pointer */ + cudaDataType y1Type, void *param, /* host or device pointer */ + cudaDataType paramType, cudaDataType executiontype) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, void *, cudaDataType, void *, cudaDataType, void *, + cudaDataType, const void *, cudaDataType, void *, cudaDataType, + cudaDataType); + static auto func_ptr = LoadSymbol("cublasRotmgEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, d1, d1Type, d2, d2Type, x1, x1Type, y1, y1Type, param, + paramType, executiontype); +} + +cublasStatus_t CUBLASWINAPI +cublasSgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const float *, const float *, + int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSgemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasDgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const double *, + const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDgemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasCgemv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZgemv_v2( + 
cublasHandle_t handle, cublasOperation_t trans, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasSgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int kl, int ku, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const float *, + const float *, int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSgbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); +} + +cublasStatus_t CUBLASWINAPI +cublasDgbmv_v2(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int kl, int ku, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const double *, + const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDgbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); +} + +cublasStatus_t CUBLASWINAPI cublasCgbmv_v2( + cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, + int ku, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); +} + +cublasStatus_t CUBLASWINAPI cublasZgbmv_v2( + cublasHandle_t handle, cublasOperation_t trans, int m, int n, int kl, + int ku, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cublasZgbmv_v2"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, + incy); +} + +cublasStatus_t CUBLASWINAPI cublasStrmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtrmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const float *A, int lda, float *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle, + cublasFillMode_t 
uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *AP, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasStpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const double *AP, double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDtpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtpmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStrsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *A, int lda, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, 
+ int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtrsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const float *AP, float *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasStpsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const double *AP, double *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDtpsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtpsv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + cublasDiagType_t diag, int n, const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtpsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t 
handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtpsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, AP, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const float *A, int lda, float *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStbsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const double *A, int lda, double *x, + int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtbsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuComplex *A, int lda, + cuComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtbsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, cublasDiagType_t, + int, int, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtbsv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, diag, n, k, A, lda, x, incx); +} + +cublasStatus_t CUBLASWINAPI +cublasSsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsymv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasDsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double 
*alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsymv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasCsymv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsymv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZsymv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsymv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasChemv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZhemv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhemv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasSsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const float *alpha, /* host or device pointer */ + const float 
*A, int lda, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const float *, const float *, + int, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasDsbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const double *, + const double *, int, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasChbmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, const cuComplex *, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasZhbmv_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhbmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasSspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *AP, const float *x, int incx, + const float *beta, /* host or device pointer */ + float *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, + const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSspmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasDspmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *AP, const double *x, int incx, + const double *beta, /* host or device pointer */ + double *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + 
cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDspmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasChpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *AP, const cuComplex *x, int incx, + const cuComplex *beta, /* host or device pointer */ + cuComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, + int); + static auto func_ptr = LoadSymbol("cublasChpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI +cublasZhpmv_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *AP, const cuDoubleComplex *x, int incx, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *y, int incy) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhpmv_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +cublasStatus_t CUBLASWINAPI cublasSger_v2( + cublasHandle_t handle, int m, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float *y, int incy, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const float *, const float *, int, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSger_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasDger_v2( + cublasHandle_t handle, int m, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const double *, const double *, int, + const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDger_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasCgeru_v2(cublasHandle_t handle, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgeru_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasCgerc_v2(cublasHandle_t handle, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, 
int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgerc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasZgeru_v2(cublasHandle_t handle, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgeru_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasZgerc_v2(cublasHandle_t handle, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgerc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasSsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasDsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasCsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasZsyr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex *, int); + static 
auto func_ptr = LoadSymbol("cublasZsyr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasCher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, + int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasZher_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, + const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZher_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasSspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + float *); + static auto func_ptr = LoadSymbol("cublasSspr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, AP); +} + +cublasStatus_t CUBLASWINAPI +cublasDspr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, double *); + static auto func_ptr = LoadSymbol("cublasDspr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, AP); +} + +cublasStatus_t CUBLASWINAPI +cublasChpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const cuComplex *x, int incx, cuComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const cuComplex *, + int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasChpr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, AP); +} + +cublasStatus_t CUBLASWINAPI +cublasZhpr_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, + const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZhpr_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, AP); +} + +cublasStatus_t CUBLASWINAPI cublasSsyr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float *y, int incy, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + 
cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasDsyr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasCsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasZsyr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasCher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasZher2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZher2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, A, lda); +} + +cublasStatus_t CUBLASWINAPI +cublasSspr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const float *alpha, /* host or device pointer */ + const float *x, int incx, const float 
*y, int incy, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, const float *, int, + const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSspr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); +} + +cublasStatus_t CUBLASWINAPI cublasDspr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const double *alpha, /* host or device pointer */ + const double *x, int incx, const double *y, int incy, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, const double *, + int, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDspr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); +} + +cublasStatus_t CUBLASWINAPI cublasChpr2_v2( + cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *x, int incx, const cuComplex *y, int incy, cuComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasChpr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); +} + +cublasStatus_t CUBLASWINAPI +cublasZhpr2_v2(cublasHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *x, int incx, const cuDoubleComplex *y, + int incy, cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZhpr2_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, alpha, x, incx, y, incy, AP); +} + +cublasStatus_t CUBLASWINAPI cublasSgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); + static auto func_ptr = LoadSymbol("cublasSgemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); + static auto func_ptr = LoadSymbol("cublasDgemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, 
n, k, alpha, A, lda, B, ldb, beta, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm3m( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgemm3m"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm3mEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, const void *A, + cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, + const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const cuComplex *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCgemm3mEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZgemm_v2( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cublasZgemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI +cublasZgemm3m(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, + int ldb, const cuDoubleComplex *beta, /* host or device 
pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cublasZgemm3m"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSgemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, const void *B, + cudaDataType Btype, int ldb, const float *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const float *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasSgemmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasGemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const void *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, const void *B, + cudaDataType Btype, int ldb, const void *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc, cublasComputeType_t computeType, + cublasGemmAlgo_t algo) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const void *, const void *, cudaDataType, int, const void *, cudaDataType, + int, const void *, void *, cudaDataType, int, cublasComputeType_t, + cublasGemmAlgo_t); + static auto func_ptr = LoadSymbol("cublasGemmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc, computeType, algo); +} + +cublasStatus_t CUBLASWINAPI cublasCgemmEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, const void *A, + cudaDataType Atype, int lda, const void *B, cudaDataType Btype, int ldb, + const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const void *, cudaDataType, int, const void *, + cudaDataType, int, const cuComplex *, void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCgemmEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, + Btype, ldb, beta, C, Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasUint8gemmBias( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + cublasOperation_t transc, int m, int n, int k, const unsigned char *A, + int A_bias, int lda, const unsigned char *B, int B_bias, int ldb, + unsigned char *C, int C_bias, int ldc, int C_mult, int C_shift) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, 
cublasOperation_t, + int, int, int, const unsigned char *, int, int, const unsigned char *, + int, int, unsigned char *, int, int, int, int); + static auto func_ptr = LoadSymbol("cublasUint8gemmBias"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, transc, m, n, k, A, A_bias, lda, B, + B_bias, ldb, C, C_bias, ldc, C_mult, C_shift); +} + +cublasStatus_t CUBLASWINAPI cublasSsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyrk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyrk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, cuComplex *, + int); + static auto func_ptr = LoadSymbol("cublasCsyrk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZsyrk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyrk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyrkEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, + const cuComplex *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const 
cuComplex *, const void *, cudaDataType, int, const cuComplex *, + void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCsyrkEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, const void *A, cudaDataType Atype, + int lda, const cuComplex *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const void *, cudaDataType, int, const cuComplex *, + void *, cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCsyrk3mEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCherk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const cuComplex *, int, const float *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCherk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZherk_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const cuDoubleComplex *, int, const double *, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZherk_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCherkEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, + const float *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const void *, cudaDataType, int, const float *, void *, + cudaDataType, int); + static auto func_ptr = LoadSymbol("cublasCherkEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCherk3mEx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, const void *A, cudaDataType Atype, + int lda, const float *beta, void *C, cudaDataType Ctype, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const void *, cudaDataType, int, const float *, void *, + cudaDataType, int); + static 
auto func_ptr = LoadSymbol("cublasCherk3mEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, Atype, lda, beta, C, + Ctype, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyr2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cublasZsyr2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCher2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI 
*)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const float *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZher2k_v2( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZher2k_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); + static auto func_ptr = LoadSymbol("cublasSsyrkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); + static auto func_ptr = LoadSymbol("cublasDsyrkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyrkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZsyrkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const 
cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cublasZsyrkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCherkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const float *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const float *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCherkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZherkx( + cublasHandle_t handle, cublasFillMode_t uplo, cublasOperation_t trans, + int n, int k, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const double *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const double *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZherkx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, + const float *beta, /* host or device pointer */ + float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const float *, const float *, int, const float *, int, const float *, + float *, int); + static auto func_ptr = LoadSymbol("cublasSsymm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, + const double *beta, /* host or device pointer */ + double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const double *, const double *, int, const double *, int, const double *, + double *, int); + static auto func_ptr = LoadSymbol("cublasDsymm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCsymm_v2( + 
cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsymm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZsymm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cublasZsymm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasChemm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, int, + const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZhemm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, int m, + int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cublasZhemm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasStrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, float *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const float *, const float *, int, float *, + int); + static auto 
func_ptr = LoadSymbol("cublasStrsm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasDtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, double *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const double *, const double *, int, double *, + int); + static auto func_ptr = LoadSymbol("cublasDtrsm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasCtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, cuComplex *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrsm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasZtrsm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrsm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb); +} + +cublasStatus_t CUBLASWINAPI cublasStrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *B, int ldb, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const float *, const float *, int, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrmm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *B, int ldb, double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const double *, const double *, int, + const double *, int, double *, 
int); + static auto func_ptr = LoadSymbol("cublasDtrmm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, const cuComplex *B, int ldb, cuComplex *C, + int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuComplex *, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrmm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZtrmm_v2( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrmm_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSgemmBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const float *const Aarray[], int lda, const float *const Barray[], int ldb, + const float *beta, /* host or device pointer */ + float *const Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const float *const[], int, const float *const[], int, + const float *, float *const[], int, int); + static auto func_ptr = LoadSymbol("cublasSgemmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasDgemmBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, /* host or device pointer */ + const double *const Aarray[], int lda, const double *const Barray[], + int ldb, const double *beta, /* host or device pointer */ + double *const Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const double *, const double *const[], int, const double *const[], int, + const double *, double *const[], int, int); + static auto func_ptr = LoadSymbol("cublasDgemmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCgemmBatched( + cublasHandle_t 
handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[], + int ldb, const cuComplex *beta, /* host or device pointer */ + cuComplex *const Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *const[], int, + const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int, + int); + static auto func_ptr = LoadSymbol("cublasCgemmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *const Aarray[], int lda, const cuComplex *const Barray[], + int ldb, const cuComplex *beta, /* host or device pointer */ + cuComplex *const Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *const[], int, + const cuComplex *const[], int, const cuComplex *, cuComplex *const[], int, + int); + static auto func_ptr = LoadSymbol("cublasCgemm3mBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI +cublasZgemmBatched(cublasHandle_t handle, cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *const Aarray[], int lda, + const cuDoubleComplex *const Barray[], int ldb, + const cuDoubleComplex *beta, /* host or device pointer */ + cuDoubleComplex *const Carray[], int ldc, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *const[], int, + const cuDoubleComplex *const[], int, const cuDoubleComplex *, + cuDoubleComplex *const[], int, int); + static auto func_ptr = LoadSymbol("cublasZgemmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, lda, Barray, + ldb, beta, Carray, ldc, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const void *alpha, /* host or device pointer */ + const void *const Aarray[], cudaDataType Atype, int lda, + const void *const Barray[], cudaDataType Btype, int ldb, + const void *beta, /* host or device pointer */ + void *const Carray[], cudaDataType Ctype, int ldc, int batchCount, + cublasComputeType_t computeType, cublasGemmAlgo_t algo) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const void *, const void *const[], cudaDataType, int, const void *const[], + cudaDataType, int, const void *, void *const[], cudaDataType, int, int, + cublasComputeType_t, cublasGemmAlgo_t); + static auto func_ptr = LoadSymbol("cublasGemmBatchedEx"); + if (!func_ptr) 
return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, Aarray, Atype, lda, + Barray, Btype, ldb, beta, Carray, Ctype, ldc, batchCount, + computeType, algo); +} + +cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const void *alpha, /* host or device pointer */ + const void *A, cudaDataType Atype, int lda, + long long int strideA, /* purposely signed */ + const void *B, cudaDataType Btype, int ldb, long long int strideB, + const void *beta, /* host or device pointer */ + void *C, cudaDataType Ctype, int ldc, long long int strideC, int batchCount, + cublasComputeType_t computeType, cublasGemmAlgo_t algo) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const void *, const void *, cudaDataType, int, long long, const void *, + cudaDataType, int, long long, const void *, void *, cudaDataType, int, + long long, int, cublasComputeType_t, cublasGemmAlgo_t); + static auto func_ptr = LoadSymbol("cublasGemmStridedBatchedEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, Atype, lda, + strideA, B, Btype, ldb, strideB, beta, C, Ctype, ldc, strideC, + batchCount, computeType, algo); +} + +cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const float *alpha, /* host or device pointer */ + const float *A, int lda, long long int strideA, /* purposely signed */ + const float *B, int ldb, long long int strideB, + const float *beta, /* host or device pointer */ + float *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const float *, const float *, int, long long, const float *, int, + long long, const float *, float *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasSgemmStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const double *alpha, /* host or device pointer */ + const double *A, int lda, long long int strideA, /* purposely signed */ + const double *B, int ldb, long long int strideB, + const double *beta, /* host or device pointer */ + double *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const double *, const double *, int, long long, const double *, int, + long long, const double *, double *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasDgemmStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, long long int strideA, /* purposely signed */ + 
const cuComplex *B, int ldb, long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, long long, const cuComplex *, + int, long long, const cuComplex *, cuComplex *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasCgemmStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, long long int strideA, /* purposely signed */ + const cuComplex *B, int ldb, long long int strideB, + const cuComplex *beta, /* host or device pointer */ + cuComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuComplex *, const cuComplex *, int, long long, const cuComplex *, + int, long long, const cuComplex *, cuComplex *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasCgemm3mStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + long long int strideA, /* purposely signed */ + const cuDoubleComplex *B, int ldb, long long int strideB, + const cuDoubleComplex *beta, /* host or device poi */ + cuDoubleComplex *C, int ldc, long long int strideC, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, long long, + const cuDoubleComplex *, int, long long, const cuDoubleComplex *, + cuDoubleComplex *, int, long long, int); + static auto func_ptr = LoadSymbol("cublasZgemmStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, + ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasSgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const float *alpha, /* host or device pointer */ + const float *A, int lda, const float *beta, /* host or device pointer */ + const float *B, int ldb, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const float *, const float *, int, const float *, const float *, int, + float *, int); + static auto func_ptr = LoadSymbol("cublasSgeam"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int 
n, const double *alpha, /* host or device pointer */ + const double *A, int lda, const double *beta, /* host or device pointer */ + const double *B, int ldb, double *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const double *, const double *, int, const double *, const double *, int, + double *, int); + static auto func_ptr = LoadSymbol("cublasDgeam"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const cuComplex *alpha, /* host or device pointer */ + const cuComplex *A, int lda, + const cuComplex *beta, /* host or device pointer */ + const cuComplex *B, int ldb, cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const cuComplex *, const cuComplex *, int, const cuComplex *, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgeam"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZgeam( + cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, + int m, int n, const cuDoubleComplex *alpha, /* host or device pointer */ + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, /* host or device pointer */ + const cuDoubleComplex *B, int ldb, cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, + const cuDoubleComplex *, const cuDoubleComplex *, int, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cublasZgeam"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transa, transb, m, n, alpha, A, lda, beta, B, ldb, C, + ldc); +} + +cublasStatus_t CUBLASWINAPI cublasSgetrfBatched( + cublasHandle_t handle, int n, float *const A[], /*Device pointer*/ + int lda, int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, float *const[], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasSgetrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDgetrfBatched( + cublasHandle_t handle, int n, double *const A[], /*Device pointer*/ + int lda, int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, double *const[], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasDgetrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCgetrfBatched( + cublasHandle_t handle, int n, cuComplex *const A[], /*Device pointer*/ + int lda, int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuComplex *const[], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasCgetrfBatched"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZgetrfBatched( + cublasHandle_t handle, int n, cuDoubleComplex *const A[], /*Device pointer*/ + int lda, int *P, /*Device Pointer*/ + int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, cuDoubleComplex *const[], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasZgetrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasSgetriBatched( + cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/ + int lda, const int *P, /*Device pointer*/ + float *const C[], /*Device pointer*/ + int ldc, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const float *const[], int, const int *, + float *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasSgetriBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDgetriBatched( + cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/ + int lda, const int *P, /*Device pointer*/ + double *const C[], /*Device pointer*/ + int ldc, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const double *const[], int, const int *, + double *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasDgetriBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCgetriBatched( + cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/ + int lda, const int *P, /*Device pointer*/ + cuComplex *const C[], /*Device pointer*/ + int ldc, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *const[], int, const int *, + cuComplex *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasCgetriBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasZgetriBatched(cublasHandle_t handle, int n, + const cuDoubleComplex *const A[], /*Device pointer*/ + int lda, const int *P, /*Device pointer*/ + cuDoubleComplex *const C[], /*Device pointer*/ + int ldc, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuDoubleComplex *const[], int, const int *, + cuDoubleComplex *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasZgetriBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, P, C, ldc, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasSgetrsBatched( + cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, + const float *const Aarray[], int lda, const int *devIpiv, + float *const Barray[], int ldb, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const float *const[], int, + const int *, float *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasSgetrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, 
devIpiv, Barray, ldb, + info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDgetrsBatched( + cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, + const double *const Aarray[], int lda, const int *devIpiv, + double *const Barray[], int ldb, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const double *const[], int, + const int *, double *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasDgetrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, + info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCgetrsBatched( + cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, + const cuComplex *const Aarray[], int lda, const int *devIpiv, + cuComplex *const Barray[], int ldb, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, const cuComplex *const[], + int, const int *, cuComplex *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasCgetrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, + info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZgetrsBatched( + cublasHandle_t handle, cublasOperation_t trans, int n, int nrhs, + const cuDoubleComplex *const Aarray[], int lda, const int *devIpiv, + cuDoubleComplex *const Barray[], int ldb, int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, + const cuDoubleComplex *const[], int, const int *, + cuDoubleComplex *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasZgetrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, + info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasStrsmBatched( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const float *alpha, /*Host or Device Pointer*/ + const float *const A[], int lda, float *const B[], int ldb, + int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const float *, const float *const[], int, + float *const[], int, int); + static auto func_ptr = LoadSymbol("cublasStrsmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasDtrsmBatched( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const double *alpha, /*Host or Device Pointer*/ + const double *const A[], int lda, double *const B[], int ldb, + int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const double *, const double *const[], int, + double *const[], int, int); + static auto func_ptr = LoadSymbol("cublasDtrsmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasCtrsmBatched( + cublasHandle_t 
handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuComplex *alpha, /*Host or Device Pointer*/ + const cuComplex *const A[], int lda, cuComplex *const B[], int ldb, + int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuComplex *, const cuComplex *const[], + int, cuComplex *const[], int, int); + static auto func_ptr = LoadSymbol("cublasCtrsmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasZtrsmBatched( + cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, cublasDiagType_t diag, int m, int n, + const cuDoubleComplex *alpha, /*Host or Device Pointer*/ + const cuDoubleComplex *const A[], int lda, cuDoubleComplex *const B[], + int ldb, int batchCount) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + cublasDiagType_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int); + static auto func_ptr = LoadSymbol("cublasZtrsmBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, diag, m, n, alpha, A, lda, B, ldb, + batchCount); +} + +cublasStatus_t CUBLASWINAPI cublasSmatinvBatched( + cublasHandle_t handle, int n, const float *const A[], /*Device pointer*/ + int lda, float *const Ainv[], /*Device pointer*/ + int lda_inv, int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const float *const[], + int, float *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasSmatinvBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasDmatinvBatched( + cublasHandle_t handle, int n, const double *const A[], /*Device pointer*/ + int lda, double *const Ainv[], /*Device pointer*/ + int lda_inv, int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, const double *const[], + int, double *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasDmatinvBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasCmatinvBatched( + cublasHandle_t handle, int n, const cuComplex *const A[], /*Device pointer*/ + int lda, cuComplex *const Ainv[], /*Device pointer*/ + int lda_inv, int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, const cuComplex *const[], int, cuComplex *const[], + int, int *, int); + static auto func_ptr = LoadSymbol("cublasCmatinvBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasZmatinvBatched(cublasHandle_t handle, int n, + const cuDoubleComplex *const A[], /*Device pointer*/ + int lda, cuDoubleComplex *const Ainv[], /*Device pointer*/ + int lda_inv, int *info, /*Device Pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, 
const cuDoubleComplex *const[], int, + cuDoubleComplex *const[], int, int *, int); + static auto func_ptr = LoadSymbol("cublasZmatinvBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, Ainv, lda_inv, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasSgeqrfBatched(cublasHandle_t handle, int m, int n, + float *const Aarray[], /*Device pointer*/ + int lda, float *const TauArray[], /*Device pointer*/ + int *info, int batchSize) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, float *const[], + int, float *const[], int *, int); + static auto func_ptr = LoadSymbol("cublasSgeqrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasDgeqrfBatched(cublasHandle_t handle, int m, int n, + double *const Aarray[], /*Device pointer*/ + int lda, double *const TauArray[], /*Device pointer*/ + int *info, int batchSize) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, int, int, double *const[], + int, double *const[], int *, int); + static auto func_ptr = LoadSymbol("cublasDgeqrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasCgeqrfBatched(cublasHandle_t handle, int m, int n, + cuComplex *const Aarray[], /*Device pointer*/ + int lda, cuComplex *const TauArray[], /*Device pointer*/ + int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, cuComplex *const[], int, cuComplex *const[], + int *, int); + static auto func_ptr = LoadSymbol("cublasCgeqrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched( + cublasHandle_t handle, int m, int n, + cuDoubleComplex *const Aarray[], /*Device pointer*/ + int lda, cuDoubleComplex *const TauArray[], /*Device pointer*/ + int *info, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, int, int, cuDoubleComplex *const[], int, + cuDoubleComplex *const[], int *, int); + static auto func_ptr = LoadSymbol("cublasZgeqrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Aarray, lda, TauArray, info, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasSgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int nrhs, float *const Aarray[], /*Device pointer*/ + int lda, float *const Carray[], /*Device pointer*/ + int ldc, int *info, int *devInfoArray, /*Device pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, float *const[], int, + float *const[], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasSgelsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasDgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int nrhs, double *const Aarray[], /*Device pointer*/ + int lda, double *const Carray[], /*Device pointer*/ + int ldc, int *info, int *devInfoArray, /*Device pointer*/ + int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, double 
*const[], int, + double *const[], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasDgelsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasCgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int nrhs, cuComplex *const Aarray[], /*Device pointer*/ + int lda, cuComplex *const Carray[], /*Device pointer*/ + int ldc, int *info, int *devInfoArray, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, cuComplex *const[], int, + cuComplex *const[], int, int *, int *, int); + static auto func_ptr = LoadSymbol("cublasCgelsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); +} + +cublasStatus_t CUBLASWINAPI +cublasZgelsBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, + int nrhs, cuDoubleComplex *const Aarray[], /*Device pointer*/ + int lda, cuDoubleComplex *const Carray[], /*Device pointer*/ + int ldc, int *info, int *devInfoArray, int batchSize) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasOperation_t, int, int, int, + cuDoubleComplex *const[], int, cuDoubleComplex *const[], int, int *, + int *, int); + static auto func_ptr = LoadSymbol("cublasZgelsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, m, n, nrhs, Aarray, lda, Carray, ldc, info, + devInfoArray, batchSize); +} + +cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, + cublasSideMode_t mode, int m, int n, + const float *A, int lda, const float *x, + int incx, float *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const float *, int, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSdgmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, + cublasSideMode_t mode, int m, int n, + const double *A, int lda, + const double *x, int incx, double *C, + int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const double *, int, + const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDdgmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle, + cublasSideMode_t mode, int m, int n, + const cuComplex *A, int lda, + const cuComplex *x, int incx, + cuComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCdgmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle, + cublasSideMode_t mode, int m, int n, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *C, int ldc) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasSideMode_t, int, int, const cuDoubleComplex *, int, + const 
cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZdgmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode, m, n, A, lda, x, incx, C, ldc); +} + +cublasStatus_t CUBLASWINAPI cublasStpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const float *AP, float *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasStpttr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, AP, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasDtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const double *AP, double *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cublasDtpttr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, AP, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasCtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *AP, cuComplex *A, + int lda) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, + const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtpttr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, AP, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasZtpttr(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuDoubleComplex *AP, + cuDoubleComplex *A, int lda) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtpttr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, AP, A, lda); +} + +cublasStatus_t CUBLASWINAPI cublasStrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const float *A, int lda, float *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasStrttp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, AP); +} + +cublasStatus_t CUBLASWINAPI cublasDtrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const double *A, int lda, double *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDtrttp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, AP); +} + +cublasStatus_t CUBLASWINAPI cublasCtrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuComplex *A, int lda, + cuComplex *AP) { + using FuncPtr = + cublasStatus_t(CUBLASWINAPI *)(cublasHandle_t, cublasFillMode_t, int, + const cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCtrttp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, AP); +} + +cublasStatus_t CUBLASWINAPI cublasZtrttp(cublasHandle_t handle, + cublasFillMode_t uplo, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *AP) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)( + cublasHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, + cuDoubleComplex *); + static auto func_ptr = 
LoadSymbol("cublasZtrttp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, AP); +} + +cublasStatus CUBLASWINAPI cublasInit(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); + static auto func_ptr = LoadSymbol("cublasInit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +cublasStatus CUBLASWINAPI cublasShutdown(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); + static auto func_ptr = LoadSymbol("cublasShutdown"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +cublasStatus CUBLASWINAPI cublasGetError(void) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(); + static auto func_ptr = LoadSymbol("cublasGetError"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +cublasStatus CUBLASWINAPI cublasGetVersion(int *version) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int *); + static auto func_ptr = LoadSymbol("cublasGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(version); +} + +cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void **devicePtr) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(int, int, void **); + static auto func_ptr = LoadSymbol("cublasAlloc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(n, elemSize, devicePtr); +} + +cublasStatus CUBLASWINAPI cublasFree(void *devicePtr) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(void *); + static auto func_ptr = LoadSymbol("cublasFree"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devicePtr); +} + +cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream) { + using FuncPtr = cublasStatus_t(CUBLASWINAPI *)(cudaStream_t); + static auto func_ptr = LoadSymbol("cublasSetKernelStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream); +} + +float CUBLASWINAPI cublasSnrm2(int n, const float *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int); + static auto func_ptr = LoadSymbol("cublasSnrm2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSnrm2"); + return func_ptr(n, x, incx); +} + +double CUBLASWINAPI cublasDnrm2(int n, const double *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int); + static auto func_ptr = LoadSymbol("cublasDnrm2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDnrm2"); + return func_ptr(n, x, incx); +} + +float CUBLASWINAPI cublasScnrm2(int n, const cuComplex *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasScnrm2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasScnrm2"); + return func_ptr(n, x, incx); +} + +double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasDznrm2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDznrm2"); + return func_ptr(n, x, incx); +} + +float CUBLASWINAPI cublasSdot(int n, const float *x, int incx, const float *y, + int incy) { + using FuncPtr = + float(CUBLASWINAPI *)(int, const float *, int, const float *, int); + static auto func_ptr = LoadSymbol("cublasSdot"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSdot"); + return func_ptr(n, x, incx, y, incy); +} + +double CUBLASWINAPI cublasDdot(int n, const double *x, int incx, + const double *y, int incy) { + using FuncPtr = + double(CUBLASWINAPI *)(int, const double *, 
int, const double *, int); + static auto func_ptr = LoadSymbol("cublasDdot"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDdot"); + return func_ptr(n, x, incx, y, incy); +} + +cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex *x, int incx, + const cuComplex *y, int incy) { + using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int, + const cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCdotu"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCdotu"); + return func_ptr(n, x, incx, y, incy); +} + +cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex *x, int incx, + const cuComplex *y, int incy) { + using FuncPtr = cuComplex(CUBLASWINAPI *)(int, const cuComplex *, int, + const cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCdotc"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCdotc"); + return func_ptr(n, x, incx, y, incy); +} + +cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, + int incy) { + using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)( + int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZdotu"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZdotu"); + return func_ptr(n, x, incx, y, incy); +} + +cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex *x, + int incx, const cuDoubleComplex *y, + int incy) { + using FuncPtr = cuDoubleComplex(CUBLASWINAPI *)( + int, const cuDoubleComplex *, int, const cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZdotc"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZdotc"); + return func_ptr(n, x, incx, y, incy); +} + +void CUBLASWINAPI cublasSscal(int n, float alpha, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSscal"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSscal"); + return func_ptr(n, alpha, x, incx); +} + +void CUBLASWINAPI cublasDscal(int n, double alpha, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDscal"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDscal"); + return func_ptr(n, alpha, x, incx); +} + +void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCscal"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCscal"); + return func_ptr(n, alpha, x, incx); +} + +void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex *x, + int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZscal"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZscal"); + return func_ptr(n, alpha, x, incx); +} + +void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, float, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsscal"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCsscal"); + return func_ptr(n, alpha, x, incx); +} + +void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(int, double, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZdscal"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZdscal"); + return func_ptr(n, alpha, x, incx); +} + +void 
CUBLASWINAPI cublasSaxpy(int n, float alpha, const float *x, int incx, + float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSaxpy"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSaxpy"); + return func_ptr(n, alpha, x, incx, y, incy); +} + +void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double *x, int incx, + double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDaxpy"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDaxpy"); + return func_ptr(n, alpha, x, incx, y, incy); +} + +void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex *x, + int incx, cuComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex, const cuComplex *, int, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCaxpy"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCaxpy"); + return func_ptr(n, alpha, x, incx, y, incy); +} + +void CUBLASWINAPI cublasZaxpy(int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZaxpy"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZaxpy"); + return func_ptr(n, alpha, x, incx, y, incy); +} + +void CUBLASWINAPI cublasScopy(int n, const float *x, int incx, float *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasScopy"); + if (!func_ptr) LogFatalSymbolNotFound("cublasScopy"); + return func_ptr(n, x, incx, y, incy); +} + +void CUBLASWINAPI cublasDcopy(int n, const double *x, int incx, double *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDcopy"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDcopy"); + return func_ptr(n, x, incx, y, incy); +} + +void CUBLASWINAPI cublasCcopy(int n, const cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCcopy"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCcopy"); + return func_ptr(n, x, incx, y, incy); +} + +void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZcopy"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZcopy"); + return func_ptr(n, x, incx, y, incy); +} + +void CUBLASWINAPI cublasSswap(int n, float *x, int incx, float *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSswap"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSswap"); + return func_ptr(n, x, incx, y, incy); +} + +void CUBLASWINAPI cublasDswap(int n, double *x, int incx, double *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(int, double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDswap"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDswap"); + return func_ptr(n, x, incx, y, incy); +} + +void CUBLASWINAPI cublasCswap(int n, cuComplex *x, int incx, cuComplex *y, + int incy) { + using FuncPtr = 
void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCswap"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCswap"); + return func_ptr(n, x, incx, y, incy); +} + +void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZswap"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZswap"); + return func_ptr(n, x, incx, y, incy); +} + +int CUBLASWINAPI cublasIsamax(int n, const float *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int); + static auto func_ptr = LoadSymbol("cublasIsamax"); + if (!func_ptr) LogFatalSymbolNotFound("cublasIsamax"); + return func_ptr(n, x, incx); +} + +int CUBLASWINAPI cublasIdamax(int n, const double *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int); + static auto func_ptr = LoadSymbol("cublasIdamax"); + if (!func_ptr) LogFatalSymbolNotFound("cublasIdamax"); + return func_ptr(n, x, incx); +} + +int CUBLASWINAPI cublasIcamax(int n, const cuComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasIcamax"); + if (!func_ptr) LogFatalSymbolNotFound("cublasIcamax"); + return func_ptr(n, x, incx); +} + +int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasIzamax"); + if (!func_ptr) LogFatalSymbolNotFound("cublasIzamax"); + return func_ptr(n, x, incx); +} + +int CUBLASWINAPI cublasIsamin(int n, const float *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const float *, int); + static auto func_ptr = LoadSymbol("cublasIsamin"); + if (!func_ptr) LogFatalSymbolNotFound("cublasIsamin"); + return func_ptr(n, x, incx); +} + +int CUBLASWINAPI cublasIdamin(int n, const double *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const double *, int); + static auto func_ptr = LoadSymbol("cublasIdamin"); + if (!func_ptr) LogFatalSymbolNotFound("cublasIdamin"); + return func_ptr(n, x, incx); +} + +int CUBLASWINAPI cublasIcamin(int n, const cuComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasIcamin"); + if (!func_ptr) LogFatalSymbolNotFound("cublasIcamin"); + return func_ptr(n, x, incx); +} + +int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = int(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasIzamin"); + if (!func_ptr) LogFatalSymbolNotFound("cublasIzamin"); + return func_ptr(n, x, incx); +} + +float CUBLASWINAPI cublasSasum(int n, const float *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const float *, int); + static auto func_ptr = LoadSymbol("cublasSasum"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSasum"); + return func_ptr(n, x, incx); +} + +double CUBLASWINAPI cublasDasum(int n, const double *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const double *, int); + static auto func_ptr = LoadSymbol("cublasDasum"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDasum"); + return func_ptr(n, x, incx); +} + +float CUBLASWINAPI cublasScasum(int n, const cuComplex *x, int incx) { + using FuncPtr = float(CUBLASWINAPI *)(int, const cuComplex *, int); + static auto 
func_ptr = LoadSymbol("cublasScasum"); + if (!func_ptr) LogFatalSymbolNotFound("cublasScasum"); + return func_ptr(n, x, incx); +} + +double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex *x, int incx) { + using FuncPtr = double(CUBLASWINAPI *)(int, const cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasDzasum"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDzasum"); + return func_ptr(n, x, incx); +} + +void CUBLASWINAPI cublasSrot(int n, float *x, int incx, float *y, int incy, + float sc, float ss) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float *, int, float *, int, float, float); + static auto func_ptr = LoadSymbol("cublasSrot"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSrot"); + return func_ptr(n, x, incx, y, incy, sc, ss); +} + +void CUBLASWINAPI cublasDrot(int n, double *x, int incx, double *y, int incy, + double sc, double ss) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double *, int, double *, int, double, double); + static auto func_ptr = LoadSymbol("cublasDrot"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDrot"); + return func_ptr(n, x, incx, y, incy, sc, ss); +} + +void CUBLASWINAPI cublasCrot(int n, cuComplex *x, int incx, cuComplex *y, + int incy, float c, cuComplex s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, + float, cuComplex); + static auto func_ptr = LoadSymbol("cublasCrot"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCrot"); + return func_ptr(n, x, incx, y, incy, c, s); +} + +void CUBLASWINAPI cublasZrot(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, double sc, + cuDoubleComplex cs) { + using FuncPtr = + void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, cuDoubleComplex *, int, + double, cuDoubleComplex); + static auto func_ptr = LoadSymbol("cublasZrot"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZrot"); + return func_ptr(n, x, incx, y, incy, sc, cs); +} + +void CUBLASWINAPI cublasCsrot(int n, cuComplex *x, int incx, cuComplex *y, + int incy, float c, float s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuComplex *, int, cuComplex *, int, + float, float); + static auto func_ptr = LoadSymbol("cublasCsrot"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCsrot"); + return func_ptr(n, x, incx, y, incy, c, s); +} + +void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex *x, int incx, + cuDoubleComplex *y, int incy, double c, + double s) { + using FuncPtr = void(CUBLASWINAPI *)(int, cuDoubleComplex *, int, + cuDoubleComplex *, int, double, double); + static auto func_ptr = LoadSymbol("cublasZdrot"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZdrot"); + return func_ptr(n, x, incx, y, incy, c, s); +} + +void CUBLASWINAPI cublasSrotg(float *sa, float *sb, float *sc, float *ss) { + using FuncPtr = void(CUBLASWINAPI *)(float *, float *, float *, float *); + static auto func_ptr = LoadSymbol("cublasSrotg"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSrotg"); + return func_ptr(sa, sb, sc, ss); +} + +void CUBLASWINAPI cublasDrotg(double *sa, double *sb, double *sc, double *ss) { + using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, double *); + static auto func_ptr = LoadSymbol("cublasDrotg"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDrotg"); + return func_ptr(sa, sb, sc, ss); +} + +void CUBLASWINAPI cublasCrotg(cuComplex *ca, cuComplex cb, float *sc, + cuComplex *cs) { + using FuncPtr = + void(CUBLASWINAPI *)(cuComplex *, cuComplex, float *, cuComplex *); + static auto func_ptr = LoadSymbol("cublasCrotg"); + if (!func_ptr) 
LogFatalSymbolNotFound("cublasCrotg"); + return func_ptr(ca, cb, sc, cs); +} + +void CUBLASWINAPI cublasZrotg(cuDoubleComplex *ca, cuDoubleComplex cb, + double *sc, cuDoubleComplex *cs) { + using FuncPtr = void(CUBLASWINAPI *)(cuDoubleComplex *, cuDoubleComplex, + double *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZrotg"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZrotg"); + return func_ptr(ca, cb, sc, cs); +} + +void CUBLASWINAPI cublasSrotm(int n, float *x, int incx, float *y, int incy, + const float *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(int, float *, int, float *, int, const float *); + static auto func_ptr = LoadSymbol("cublasSrotm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSrotm"); + return func_ptr(n, x, incx, y, incy, sparam); +} + +void CUBLASWINAPI cublasDrotm(int n, double *x, int incx, double *y, int incy, + const double *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(int, double *, int, double *, int, const double *); + static auto func_ptr = LoadSymbol("cublasDrotm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDrotm"); + return func_ptr(n, x, incx, y, incy, sparam); +} + +void CUBLASWINAPI cublasSrotmg(float *sd1, float *sd2, float *sx1, + const float *sy1, float *sparam) { + using FuncPtr = + void(CUBLASWINAPI *)(float *, float *, float *, const float *, float *); + static auto func_ptr = LoadSymbol("cublasSrotmg"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSrotmg"); + return func_ptr(sd1, sd2, sx1, sy1, sparam); +} + +void CUBLASWINAPI cublasDrotmg(double *sd1, double *sd2, double *sx1, + const double *sy1, double *sparam) { + using FuncPtr = void(CUBLASWINAPI *)(double *, double *, double *, + const double *, double *); + static auto func_ptr = LoadSymbol("cublasDrotmg"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDrotmg"); + return func_ptr(sd1, sd2, sx1, sy1, sparam); +} + +void CUBLASWINAPI cublasSgemv(char trans, int m, int n, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, float, const float *, int, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSgemv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSgemv"); + return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasDgemv(char trans, int m, int n, double alpha, + const double *A, int lda, const double *x, + int incx, double beta, double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, double, const double *, int, + const double *, int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDgemv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDgemv"); + return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasCgemv(char trans, int m, int n, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgemv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCgemv"); + return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasZgemv(char trans, int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + 
int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgemv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZgemv"); + return func_ptr(trans, m, n, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasSgbmv(char trans, int m, int n, int kl, int ku, + float alpha, const float *A, int lda, + const float *x, int incx, float beta, float *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, int, int, float, const float *, int, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSgbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSgbmv"); + return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasDgbmv(char trans, int m, int n, int kl, int ku, + double alpha, const double *A, int lda, + const double *x, int incx, double beta, double *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, int, int, double, const double *, + int, const double *, int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDgbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDgbmv"); + return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasCgbmv(char trans, int m, int n, int kl, int ku, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *x, int incx, cuComplex beta, + cuComplex *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCgbmv"); + return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasZgbmv(char trans, int m, int n, int kl, int ku, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZgbmv"); + return func_ptr(trans, m, n, kl, ku, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *, + int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasStrmv"); + return func_ptr(uplo, trans, diag, n, A, lda, x, incx); +} + +void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtrmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmv"); + return func_ptr(uplo, trans, diag, n, A, lda, x, incx); +} + +void CUBLASWINAPI cublasCtrmv(char uplo, char trans, char diag, int n, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, 
+ int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmv"); + return func_ptr(uplo, trans, diag, n, A, lda, x, incx); +} + +void CUBLASWINAPI cublasZtrmv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmv"); + return func_ptr(uplo, trans, diag, n, A, lda, x, incx); +} + +void CUBLASWINAPI cublasStbmv(char uplo, char trans, char diag, int n, int k, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasStbmv"); + return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); +} + +void CUBLASWINAPI cublasDtbmv(char uplo, char trans, char diag, int n, int k, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDtbmv"); + return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); +} + +void CUBLASWINAPI cublasCtbmv(char uplo, char trans, char diag, int n, int k, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCtbmv"); + return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); +} + +void CUBLASWINAPI cublasZtbmv(char uplo, char trans, char diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, + int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZtbmv"); + return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); +} + +void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, + const float *AP, float *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasStpmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasStpmv"); + return func_ptr(uplo, trans, diag, n, AP, x, incx); +} + +void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, + const double *AP, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + double *, int); + static auto func_ptr = LoadSymbol("cublasDtpmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDtpmv"); + return func_ptr(uplo, trans, diag, n, AP, x, incx); +} + +void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, + const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtpmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCtpmv"); + return func_ptr(uplo, trans, diag, n, AP, x, incx); +} + +void CUBLASWINAPI cublasZtpmv(char uplo, char trans, char diag, int n, + const 
cuDoubleComplex *AP, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtpmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZtpmv"); + return func_ptr(uplo, trans, diag, n, AP, x, incx); +} + +void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const float *, + int, float *, int); + static auto func_ptr = LoadSymbol("cublasStrsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasStrsv"); + return func_ptr(uplo, trans, diag, n, A, lda, x, incx); +} + +void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtrsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsv"); + return func_ptr(uplo, trans, diag, n, A, lda, x, incx); +} + +void CUBLASWINAPI cublasCtrsv(char uplo, char trans, char diag, int n, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtrsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsv"); + return func_ptr(uplo, trans, diag, n, A, lda, x, incx); +} + +void CUBLASWINAPI cublasZtrsv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const cuDoubleComplex *, int, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtrsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsv"); + return func_ptr(uplo, trans, diag, n, A, lda, x, incx); +} + +void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, + const float *AP, float *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cublasStpsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasStpsv"); + return func_ptr(uplo, trans, diag, n, AP, x, incx); +} + +void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, + const double *AP, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const double *, + double *, int); + static auto func_ptr = LoadSymbol("cublasDtpsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDtpsv"); + return func_ptr(uplo, trans, diag, n, AP, x, incx); +} + +void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, + const cuComplex *AP, cuComplex *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, const cuComplex *, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtpsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCtpsv"); + return func_ptr(uplo, trans, diag, n, AP, x, incx); +} + +void CUBLASWINAPI cublasZtpsv(char uplo, char trans, char diag, int n, + const cuDoubleComplex *AP, cuDoubleComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtpsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZtpsv"); + return func_ptr(uplo, trans, diag, n, AP, x, incx); +} + +void CUBLASWINAPI 
cublasStbsv(char uplo, char trans, char diag, int n, int k, + const float *A, int lda, float *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasStbsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasStbsv"); + return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); +} + +void CUBLASWINAPI cublasDtbsv(char uplo, char trans, char diag, int n, int k, + const double *A, int lda, double *x, int incx) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, int, int, + const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDtbsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDtbsv"); + return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); +} + +void CUBLASWINAPI cublasCtbsv(char uplo, char trans, char diag, int n, int k, + const cuComplex *A, int lda, cuComplex *x, + int incx) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, char, int, int, const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCtbsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCtbsv"); + return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); +} + +void CUBLASWINAPI cublasZtbsv(char uplo, char trans, char diag, int n, int k, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *x, int incx) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, int, int, const cuDoubleComplex *, + int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZtbsv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZtbsv"); + return func_ptr(uplo, trans, diag, n, k, A, lda, x, incx); +} + +void CUBLASWINAPI cublasSsymv(char uplo, int n, float alpha, const float *A, + int lda, const float *x, int incx, float beta, + float *y, int incy) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSsymv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSsymv"); + return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasDsymv(char uplo, int n, double alpha, const double *A, + int lda, const double *x, int incx, double beta, + double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, int, + const double *, int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDsymv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDsymv"); + return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasChemv(char uplo, int n, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChemv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasChemv"); + return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasZhemv(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhemv"); + if (!func_ptr) 
LogFatalSymbolNotFound("cublasZhemv"); + return func_ptr(uplo, n, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasSsbmv(char uplo, int n, int k, float alpha, + const float *A, int lda, const float *x, int incx, + float beta, float *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, float, const float *, int, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSsbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSsbmv"); + return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasDsbmv(char uplo, int n, int k, double alpha, + const double *A, int lda, const double *x, + int incx, double beta, double *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, double, const double *, int, + const double *, int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDsbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDsbmv"); + return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasChbmv(char uplo, int n, int k, cuComplex alpha, + const cuComplex *A, int lda, const cuComplex *x, + int incx, cuComplex beta, cuComplex *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasChbmv"); + return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasZhbmv(char uplo, int n, int k, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhbmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZhbmv"); + return func_ptr(uplo, n, k, alpha, A, lda, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasSspmv(char uplo, int n, float alpha, const float *AP, + const float *x, int incx, float beta, float *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSspmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSspmv"); + return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasDspmv(char uplo, int n, double alpha, const double *AP, + const double *x, int incx, double beta, double *y, + int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, const double *, + int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDspmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDspmv"); + return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasChpmv(char uplo, int n, cuComplex alpha, + const cuComplex *AP, const cuComplex *x, int incx, + cuComplex beta, cuComplex *y, int incy) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasChpmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasChpmv"); + return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasZhpmv(char uplo, int n, 
cuDoubleComplex alpha, + const cuDoubleComplex *AP, + const cuDoubleComplex *x, int incx, + cuDoubleComplex beta, cuDoubleComplex *y, + int incy) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZhpmv"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZhpmv"); + return func_ptr(uplo, n, alpha, AP, x, incx, beta, y, incy); +} + +void CUBLASWINAPI cublasSger(int m, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(int, int, float, const float *, int, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSger"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSger"); + return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasDger(int m, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(int, int, double, const double *, int, + const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDger"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDger"); + return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasCgeru(int m, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgeru"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCgeru"); + return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasCgerc(int m, int n, cuComplex alpha, const cuComplex *x, + int incx, const cuComplex *y, int incy, + cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgerc"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCgerc"); + return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasZgeru(int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgeru"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZgeru"); + return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasZgerc(int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgerc"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZgerc"); + return func_ptr(m, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float *x, + int incx, float *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, float, const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr"); + if (!func_ptr) 
LogFatalSymbolNotFound("cublasSsyr"); + return func_ptr(uplo, n, alpha, x, incx, A, lda); +} + +void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double *x, + int incx, double *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr"); + return func_ptr(uplo, n, alpha, x, incx, A, lda); +} + +void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex *x, + int incx, cuComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int, + cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCher"); + return func_ptr(uplo, n, alpha, x, incx, A, lda); +} + +void CUBLASWINAPI cublasZher(char uplo, int n, double alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZher"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZher"); + return func_ptr(uplo, n, alpha, x, incx, A, lda); +} + +void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float *x, + int incx, float *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, float, const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSspr"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSspr"); + return func_ptr(uplo, n, alpha, x, incx, AP); +} + +void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double *x, + int incx, double *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, double, const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDspr"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDspr"); + return func_ptr(uplo, n, alpha, x, incx, AP); +} + +void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex *x, + int incx, cuComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const cuComplex *, int, + cuComplex *); + static auto func_ptr = LoadSymbol("cublasChpr"); + if (!func_ptr) LogFatalSymbolNotFound("cublasChpr"); + return func_ptr(uplo, n, alpha, x, incx, AP); +} + +void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, + const cuDoubleComplex *x, int incx, + cuDoubleComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, double, const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZhpr"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr"); + return func_ptr(uplo, n, alpha, x, incx, AP); +} + +void CUBLASWINAPI cublasSsyr2(char uplo, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2"); + return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasDsyr2(char uplo, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *A, + int lda) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + const double *, int, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2"); + return func_ptr(uplo, n, 
alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasCher2(char uplo, int n, cuComplex alpha, + const cuComplex *x, int incx, const cuComplex *y, + int incy, cuComplex *A, int lda) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCher2"); + return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasZher2(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *A, int lda) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZher2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZher2"); + return func_ptr(uplo, n, alpha, x, incx, y, incy, A, lda); +} + +void CUBLASWINAPI cublasSspr2(char uplo, int n, float alpha, const float *x, + int incx, const float *y, int incy, float *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, float, const float *, int, + const float *, int, float *); + static auto func_ptr = LoadSymbol("cublasSspr2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSspr2"); + return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); +} + +void CUBLASWINAPI cublasDspr2(char uplo, int n, double alpha, const double *x, + int incx, const double *y, int incy, double *AP) { + using FuncPtr = void(CUBLASWINAPI *)(char, int, double, const double *, int, + const double *, int, double *); + static auto func_ptr = LoadSymbol("cublasDspr2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDspr2"); + return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); +} + +void CUBLASWINAPI cublasChpr2(char uplo, int n, cuComplex alpha, + const cuComplex *x, int incx, const cuComplex *y, + int incy, cuComplex *AP) { + using FuncPtr = + void(CUBLASWINAPI *)(char, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex *); + static auto func_ptr = LoadSymbol("cublasChpr2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasChpr2"); + return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); +} + +void CUBLASWINAPI cublasZhpr2(char uplo, int n, cuDoubleComplex alpha, + const cuDoubleComplex *x, int incx, + const cuDoubleComplex *y, int incy, + cuDoubleComplex *AP) { + using FuncPtr = void(CUBLASWINAPI *)( + char, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cublasZhpr2"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZhpr2"); + return func_ptr(uplo, n, alpha, x, incx, y, incy, AP); +} + +void CUBLASWINAPI cublasSgemm(char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, + const float *B, int ldb, float beta, float *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, int, float, const float *, int, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSgemm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSgemm"); + return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasDgemm(char transa, char transb, int m, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, double *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, int, double, const 
double *, + int, const double *, int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDgemm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDgemm"); + return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasCgemm(char transa, char transb, int m, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCgemm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCgemm"); + return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasZgemm(char transa, char transb, int m, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZgemm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZgemm"); + return func_ptr(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasSsyrk(char uplo, char trans, int n, int k, float alpha, + const float *A, int lda, float beta, float *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, float, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyrk"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSsyrk"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +void CUBLASWINAPI cublasDsyrk(char uplo, char trans, int n, int k, double alpha, + const double *A, int lda, double beta, double *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, double, const double *, int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyrk"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDsyrk"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +void CUBLASWINAPI cublasCsyrk(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + cuComplex beta, cuComplex *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, cuComplex, const cuComplex *, + int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyrk"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCsyrk"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +void CUBLASWINAPI cublasZsyrk(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, cuDoubleComplex, + const cuDoubleComplex *, int, + cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyrk"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZsyrk"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +void CUBLASWINAPI cublasCherk(char uplo, char trans, int n, int k, float alpha, + const cuComplex *A, int lda, float beta, + cuComplex *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const cuComplex *, int, + float, 
cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCherk"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCherk"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +void CUBLASWINAPI cublasZherk(char uplo, char trans, int n, int k, double alpha, + const cuDoubleComplex *A, int lda, double beta, + cuDoubleComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, int, int, double, + const cuDoubleComplex *, int, double, + cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZherk"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZherk"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, beta, C, ldc); +} + +void CUBLASWINAPI cublasSsyr2k(char uplo, char trans, int n, int k, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol("cublasSsyr2k"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSsyr2k"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasDsyr2k(char uplo, char trans, int n, int k, + double alpha, const double *A, int lda, + const double *B, int ldb, double beta, double *C, + int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int, + const double *, int, double, double *, int); + static auto func_ptr = LoadSymbol("cublasDsyr2k"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDsyr2k"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasCsyr2k(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCsyr2k"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCsyr2k"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasZsyr2k(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cublasZsyr2k"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZsyr2k"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasCher2k(char uplo, char trans, int n, int k, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, float beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, float, cuComplex *, int); + static auto func_ptr = LoadSymbol("cublasCher2k"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCher2k"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasZher2k(char uplo, char trans, int n, int k, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + double beta, cuDoubleComplex *C, int ldc) { + 
using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, double, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasZher2k"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZher2k"); + return func_ptr(uplo, trans, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasSsymm(char side, char uplo, int m, int n, float alpha, + const float *A, int lda, const float *B, int ldb, + float beta, float *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, float, const float *, int, + const float *, int, float, float *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasSsymm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasSsymm"); + return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasDsymm(char side, char uplo, int m, int n, double alpha, + const double *A, int lda, const double *B, + int ldb, double beta, double *C, int ldc) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, int, int, double, const double *, int, + const double *, int, double, double *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasDsymm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDsymm"); + return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasCsymm(char side, char uplo, int m, int n, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasCsymm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCsymm"); + return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasZsymm(char side, char uplo, int m, int n, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasZsymm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZsymm"); + return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasChemm(char side, char uplo, int m, int n, + cuComplex alpha, const cuComplex *A, int lda, + const cuComplex *B, int ldb, cuComplex beta, + cuComplex *C, int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuComplex, const cuComplex *, int, + const cuComplex *, int, cuComplex, cuComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasChemm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasChemm"); + return func_ptr(side, uplo, m, n, alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasZhemm(char side, char uplo, int m, int n, + cuDoubleComplex alpha, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *B, int ldb, + cuDoubleComplex beta, cuDoubleComplex *C, + int ldc) { + using FuncPtr = void(CUBLASWINAPI *)( + char, char, int, int, cuDoubleComplex, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, cuDoubleComplex, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasZhemm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZhemm"); + return func_ptr(side, uplo, m, n, 
alpha, A, lda, B, ldb, beta, C, ldc); +} + +void CUBLASWINAPI cublasStrsm(char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, + int lda, float *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrsm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasStrsm"); + return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); +} + +void CUBLASWINAPI cublasDtrsm(char side, char uplo, char transa, char diag, + int m, int n, double alpha, const double *A, + int lda, double *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double, + const double *, int, double *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrsm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDtrsm"); + return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); +} + +void CUBLASWINAPI cublasCtrsm(char side, char uplo, char transa, char diag, + int m, int n, cuComplex alpha, const cuComplex *A, + int lda, cuComplex *B, int ldb) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrsm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCtrsm"); + return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); +} + +void CUBLASWINAPI cublasZtrsm(char side, char uplo, char transa, char diag, + int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, + cuDoubleComplex, const cuDoubleComplex *, + int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrsm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZtrsm"); + return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); +} + +void CUBLASWINAPI cublasStrmm(char side, char uplo, char transa, char diag, + int m, int n, float alpha, const float *A, + int lda, float *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, float, + const float *, int, float *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasStrmm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasStrmm"); + return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); +} + +void CUBLASWINAPI cublasDtrmm(char side, char uplo, char transa, char diag, + int m, int n, double alpha, const double *A, + int lda, double *B, int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, double, + const double *, int, double *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasDtrmm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasDtrmm"); + return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); +} + +void CUBLASWINAPI cublasCtrmm(char side, char uplo, char transa, char diag, + int m, int n, cuComplex alpha, const cuComplex *A, + int lda, cuComplex *B, int ldb) { + using FuncPtr = + void(CUBLASWINAPI *)(char, char, char, char, int, int, cuComplex, + const cuComplex *, int, cuComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasCtrmm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasCtrmm"); + return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); +} + +void CUBLASWINAPI cublasZtrmm(char side, char uplo, char transa, char diag, + int m, int n, cuDoubleComplex alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, 
int ldb) { + using FuncPtr = void(CUBLASWINAPI *)(char, char, char, char, int, int, + cuDoubleComplex, const cuDoubleComplex *, + int, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cublasZtrmm"); + if (!func_ptr) LogFatalSymbolNotFound("cublasZtrmm"); + return func_ptr(side, uplo, transa, diag, m, n, alpha, A, lda, B, ldb); +} + +} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cuda_11_0.inc b/tensorflow/stream_executor/cuda/cuda_11_0.inc new file mode 100644 index 00000000000..18f3ff4cd57 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cuda_11_0.inc @@ -0,0 +1,2430 @@ +// Auto-generated, do not edit. + +extern "C" { + +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) { + using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **); + static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorString"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(error, pStr); +} + +CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr) { + using FuncPtr = CUresult(CUDAAPI *)(CUresult, const char **); + static auto func_ptr = LoadSymbol<FuncPtr>("cuGetErrorName"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(error, pStr); +} + +CUresult CUDAAPI cuInit(unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(unsigned int); + static auto func_ptr = LoadSymbol<FuncPtr>("cuInit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(Flags); +} + +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion) { + using FuncPtr = CUresult(CUDAAPI *)(int *); + static auto func_ptr = LoadSymbol<FuncPtr>("cuDriverGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(driverVersion); +} + +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) { + using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, int); + static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device, ordinal); +} + +CUresult CUDAAPI cuDeviceGetCount(int *count) { + using FuncPtr = CUresult(CUDAAPI *)(int *); + static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(count); +} + +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice); + static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetName"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(name, len, dev); +} + +CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(CUuuid *, CUdevice); + static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetUuid"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(uuid, dev); +} + +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUdevice); + static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceTotalMem_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(bytes, dev); +} + +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, + CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice_attribute, CUdevice); + static auto func_ptr = LoadSymbol<FuncPtr>("cuDeviceGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pi, attrib, dev); +} + +CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, + CUdevice dev, int flags) { + using FuncPtr = CUresult(CUDAAPI *)(void *, CUdevice, int); + static auto func_ptr = 
LoadSymbol("cuDeviceGetNvSciSyncAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(nvSciSyncAttrList, dev, flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, + CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(CUdevprop *, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetProperties"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(prop, dev); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, + int *minor, + CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceComputeCapability"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(major, minor, dev); +} + +CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, CUdevice); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxRetain"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx, dev); +} + +CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(CUdevice); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxRelease_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev); +} + +CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxSetFlags_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev, flags); +} + +CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, + int *active) { + using FuncPtr = CUresult(CUDAAPI *)(CUdevice, unsigned int *, int *); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxGetState"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev, flags, active); +} + +CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(CUdevice); + static auto func_ptr = LoadSymbol("cuDevicePrimaryCtxReset_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev); +} + +CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, + CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int, CUdevice); + static auto func_ptr = LoadSymbol("cuCtxCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx, flags, dev); +} + +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxDestroy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx); +} + +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxPushCurrent_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx); +} + +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext *); + static auto func_ptr = LoadSymbol("cuCtxPopCurrent_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx); +} + +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxSetCurrent"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx); +} + +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx) { + using FuncPtr = 
CUresult(CUDAAPI *)(CUcontext *); + static auto func_ptr = LoadSymbol("cuCtxGetCurrent"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pctx); +} + +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device) { + using FuncPtr = CUresult(CUDAAPI *)(CUdevice *); + static auto func_ptr = LoadSymbol("cuCtxGetDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device); +} + +CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags) { + using FuncPtr = CUresult(CUDAAPI *)(unsigned int *); + static auto func_ptr = LoadSymbol("cuCtxGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(flags); +} + +CUresult CUDAAPI cuCtxSynchronize(void) { + using FuncPtr = CUresult(CUDAAPI *)(); + static auto func_ptr = LoadSymbol("cuCtxSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value) { + using FuncPtr = CUresult(CUDAAPI *)(CUlimit, size_t); + static auto func_ptr = LoadSymbol("cuCtxSetLimit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(limit, value); +} + +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit) { + using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUlimit); + static auto func_ptr = LoadSymbol("cuCtxGetLimit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pvalue, limit); +} + +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache *); + static auto func_ptr = LoadSymbol("cuCtxGetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pconfig); +} + +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunc_cache); + static auto func_ptr = LoadSymbol("cuCtxSetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(config); +} + +CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) { + using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig *); + static auto func_ptr = LoadSymbol("cuCtxGetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pConfig); +} + +CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config) { + using FuncPtr = CUresult(CUDAAPI *)(CUsharedconfig); + static auto func_ptr = LoadSymbol("cuCtxSetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(config); +} + +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int *); + static auto func_ptr = LoadSymbol("cuCtxGetApiVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx, version); +} + +CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, + int *greatestPriority) { + using FuncPtr = CUresult(CUDAAPI *)(int *, int *); + static auto func_ptr = LoadSymbol("cuCtxGetStreamPriorityRange"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(leastPriority, greatestPriority); +} + +CUresult CUDAAPI cuCtxResetPersistingL2Cache(void) { + using FuncPtr = CUresult(CUDAAPI *)(); + static auto func_ptr = LoadSymbol("cuCtxResetPersistingL2Cache"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, + unsigned int flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext *, unsigned int); + static auto func_ptr = LoadSymbol("cuCtxAttach"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(pctx, flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxDetach"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ctx); +} + +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname) { + using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const char *); + static auto func_ptr = LoadSymbol("cuModuleLoad"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(module, fname); +} + +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image) { + using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *); + static auto func_ptr = LoadSymbol("cuModuleLoadData"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(module, image); +} + +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, + unsigned int numOptions, + CUjit_option *options, + void **optionValues) { + using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *, unsigned int, + CUjit_option *, void **); + static auto func_ptr = LoadSymbol("cuModuleLoadDataEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(module, image, numOptions, options, optionValues); +} + +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) { + using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, const void *); + static auto func_ptr = LoadSymbol("cuModuleLoadFatBinary"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(module, fatCubin); +} + +CUresult CUDAAPI cuModuleUnload(CUmodule hmod) { + using FuncPtr = CUresult(CUDAAPI *)(CUmodule); + static auto func_ptr = LoadSymbol("cuModuleUnload"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hmod); +} + +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, + const char *name) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction *, CUmodule, const char *); + static auto func_ptr = LoadSymbol("cuModuleGetFunction"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, hmod, name); +} + +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, + CUmodule hmod, const char *name) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUmodule, const char *); + static auto func_ptr = LoadSymbol("cuModuleGetGlobal_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr, bytes, hmod, name); +} + +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, + const char *name) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref *, CUmodule, const char *); + static auto func_ptr = LoadSymbol("cuModuleGetTexRef"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexRef, hmod, name); +} + +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, + const char *name) { + using FuncPtr = CUresult(CUDAAPI *)(CUsurfref *, CUmodule, const char *); + static auto func_ptr = LoadSymbol("cuModuleGetSurfRef"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pSurfRef, hmod, name); +} + +CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, + void **optionValues, CUlinkState *stateOut) { + using FuncPtr = + CUresult(CUDAAPI *)(unsigned int, CUjit_option *, void **, CUlinkState *); + static auto func_ptr = LoadSymbol("cuLinkCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numOptions, options, optionValues, 
stateOut); +} + +CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, + void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, + void **optionValues) { + using FuncPtr = + CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, void *, size_t, + const char *, unsigned int, CUjit_option *, void **); + static auto func_ptr = LoadSymbol("cuLinkAddData_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(state, type, data, size, name, numOptions, options, + optionValues); +} + +CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, + const char *path, unsigned int numOptions, + CUjit_option *options, void **optionValues) { + using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, CUjitInputType, const char *, + unsigned int, CUjit_option *, void **); + static auto func_ptr = LoadSymbol("cuLinkAddFile_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(state, type, path, numOptions, options, optionValues); +} + +CUresult CUDAAPI cuLinkComplete(CUlinkState state, void **cubinOut, + size_t *sizeOut) { + using FuncPtr = CUresult(CUDAAPI *)(CUlinkState, void **, size_t *); + static auto func_ptr = LoadSymbol("cuLinkComplete"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(state, cubinOut, sizeOut); +} + +CUresult CUDAAPI cuLinkDestroy(CUlinkState state) { + using FuncPtr = CUresult(CUDAAPI *)(CUlinkState); + static auto func_ptr = LoadSymbol("cuLinkDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(state); +} + +CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total) { + using FuncPtr = CUresult(CUDAAPI *)(size_t *, size_t *); + static auto func_ptr = LoadSymbol("cuMemGetInfo_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(free, total); +} + +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t); + static auto func_ptr = LoadSymbol("cuMemAlloc_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr, bytesize); +} + +CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, + size_t WidthInBytes, size_t Height, + unsigned int ElementSizeBytes) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, size_t, size_t, + unsigned int); + static auto func_ptr = LoadSymbol("cuMemAllocPitch_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes); +} + +CUresult CUDAAPI cuMemFree(CUdeviceptr dptr) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr); + static auto func_ptr = LoadSymbol("cuMemFree_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr); +} + +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, + CUdeviceptr dptr) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuMemGetAddressRange_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pbase, psize, dptr); +} + +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize) { + using FuncPtr = CUresult(CUDAAPI *)(void **, size_t); + static auto func_ptr = LoadSymbol("cuMemAllocHost_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pp, bytesize); +} + +CUresult CUDAAPI cuMemFreeHost(void *p) { + using FuncPtr = CUresult(CUDAAPI *)(void *); + static auto func_ptr = LoadSymbol("cuMemFreeHost"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(p); +} + +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, + unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(void **, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuMemHostAlloc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pp, bytesize, Flags); +} + +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, + unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, void *, unsigned int); + static auto func_ptr = LoadSymbol("cuMemHostGetDevicePointer_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pdptr, p, Flags); +} + +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p) { + using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, void *); + static auto func_ptr = LoadSymbol("cuMemHostGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pFlags, p); +} + +CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, + unsigned int flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuMemAllocManaged"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr, bytesize, flags); +} + +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) { + using FuncPtr = CUresult(CUDAAPI *)(CUdevice *, const char *); + static auto func_ptr = LoadSymbol("cuDeviceGetByPCIBusId"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dev, pciBusId); +} + +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) { + using FuncPtr = CUresult(CUDAAPI *)(char *, int, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetPCIBusId"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pciBusId, len, dev); +} + +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) { + using FuncPtr = CUresult(CUDAAPI *)(CUipcEventHandle *, CUevent); + static auto func_ptr = LoadSymbol("cuIpcGetEventHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, event); +} + +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, + CUipcEventHandle handle) { + using FuncPtr = CUresult(CUDAAPI *)(CUevent *, CUipcEventHandle); + static auto func_ptr = LoadSymbol("cuIpcOpenEventHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phEvent, handle); +} + +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) { + using FuncPtr = CUresult(CUDAAPI *)(CUipcMemHandle *, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuIpcGetMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, dptr); +} + +CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, + unsigned int Flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr *, CUipcMemHandle, unsigned int); + static auto func_ptr = LoadSymbol("cuIpcOpenMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pdptr, handle, Flags); +} + +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr); + static auto func_ptr = LoadSymbol("cuIpcCloseMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dptr); +} + +CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, + unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, unsigned int); + static auto 
func_ptr = LoadSymbol("cuMemHostRegister_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p, bytesize, Flags); +} + +CUresult CUDAAPI cuMemHostUnregister(void *p) { + using FuncPtr = CUresult(CUDAAPI *)(void *); + static auto func_ptr = LoadSymbol("cuMemHostUnregister"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p); +} + +CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemcpy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, ByteCount); +} + +CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, + CUdeviceptr srcDevice, CUcontext srcContext, + size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr, + CUcontext, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount); +} + +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, + size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyHtoD_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcHost, ByteCount); +} + +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, + size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyDtoH_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstHost, srcDevice, ByteCount); +} + +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, + size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyDtoD_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcDevice, ByteCount); +} + +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, + CUdeviceptr srcDevice, size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyDtoA_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstArray, dstOffset, srcDevice, ByteCount); +} + +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, + size_t srcOffset, size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUarray, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyAtoD_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcArray, srcOffset, ByteCount); +} + +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, + const void *srcHost, size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyHtoA_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstArray, dstOffset, srcHost, ByteCount); +} + +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, + size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyAtoH_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstHost, srcArray, srcOffset, ByteCount); +} + +CUresult CUDAAPI cuMemcpyAtoA(CUarray 
dstArray, size_t dstOffset, + CUarray srcArray, size_t srcOffset, + size_t ByteCount) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray, size_t, CUarray, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemcpyAtoA_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstArray, dstOffset, srcArray, srcOffset, ByteCount); +} + +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) { + using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *); + static auto func_ptr = LoadSymbol("cuMemcpy2D_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy); +} + +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) { + using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *); + static auto func_ptr = LoadSymbol("cuMemcpy2DUnaligned_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy); +} + +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) { + using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *); + static auto func_ptr = LoadSymbol("cuMemcpy3D_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy); +} + +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) { + using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *); + static auto func_ptr = LoadSymbol("cuMemcpy3DPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy); +} + +CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, + size_t ByteCount, CUstream hStream) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, + CUdeviceptr srcDevice, CUcontext srcContext, + size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, CUcontext, CUdeviceptr, + CUcontext, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyPeerAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstContext, srcDevice, srcContext, ByteCount, + hStream); +} + +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, + size_t ByteCount, CUstream hStream) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, const void *, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyHtoDAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcHost, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, + size_t ByteCount, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(void *, CUdeviceptr, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyDtoHAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstHost, srcDevice, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, + size_t ByteCount, CUstream hStream) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyDtoDAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, srcDevice, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, + const void *srcHost, size_t ByteCount, + CUstream hStream) { + using FuncPtr = + CUresult(CUDAAPI *)(CUarray, 
size_t, const void *, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyHtoAAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstArray, dstOffset, srcHost, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, + size_t srcOffset, size_t ByteCount, + CUstream hStream) { + using FuncPtr = + CUresult(CUDAAPI *)(void *, CUarray, size_t, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpyAtoHAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstHost, srcArray, srcOffset, ByteCount, hStream); +} + +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY2D *, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpy2DAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy, hStream); +} + +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D *, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpy3DAsync_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy, hStream); +} + +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, + CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(const CUDA_MEMCPY3D_PEER *, CUstream); + static auto func_ptr = LoadSymbol("cuMemcpy3DPeerAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCopy, hStream); +} + +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD8_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, uc, N); +} + +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, + size_t N) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD16_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, us, N); +} + +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD32_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, ui, N); +} + +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, + unsigned char uc, size_t Width, size_t Height) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD2D8_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, uc, Width, Height); +} + +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, + unsigned short us, size_t Width, size_t Height) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD2D16_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, us, Width, Height); +} + +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, + unsigned int ui, size_t Width, size_t Height) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, size_t); + static auto func_ptr = LoadSymbol("cuMemsetD2D32_v2"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, ui, Width, Height); +} + +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, + size_t N, CUstream hStream) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, unsigned char, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD8Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, uc, N, hStream); +} + +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, + size_t N, CUstream hStream) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, unsigned short, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD16Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, us, N, hStream); +} + +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, + size_t N, CUstream hStream) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, unsigned int, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD32Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, ui, N, hStream); +} + +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, + unsigned char uc, size_t Width, + size_t Height, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned char, + size_t, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD2D8Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, uc, Width, Height, hStream); +} + +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, + unsigned short us, size_t Width, + size_t Height, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned short, + size_t, size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD2D16Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, us, Width, Height, hStream); +} + +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, + unsigned int ui, size_t Width, + size_t Height, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, unsigned int, size_t, + size_t, CUstream); + static auto func_ptr = LoadSymbol("cuMemsetD2D32Async"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dstDevice, dstPitch, ui, Width, Height, hStream); +} + +CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, + const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY_DESCRIPTOR *); + static auto func_ptr = LoadSymbol("cuArrayCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, pAllocateArray); +} + +CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, + CUarray hArray) { + using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY_DESCRIPTOR *, CUarray); + static auto func_ptr = LoadSymbol("cuArrayGetDescriptor_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pArrayDescriptor, hArray); +} + +CUresult CUDAAPI cuArrayDestroy(CUarray hArray) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray); + static auto func_ptr = LoadSymbol("cuArrayDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hArray); +} + +CUresult CUDAAPI cuArray3DCreate( + CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) { + using FuncPtr = + CUresult(CUDAAPI *)(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *); + static 
auto func_ptr = LoadSymbol("cuArray3DCreate_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, pAllocateArray); +} + +CUresult CUDAAPI cuArray3DGetDescriptor( + CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) { + using FuncPtr = CUresult(CUDAAPI *)(CUDA_ARRAY3D_DESCRIPTOR *, CUarray); + static auto func_ptr = LoadSymbol("cuArray3DGetDescriptor_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pArrayDescriptor, hArray); +} + +CUresult CUDAAPI +cuMipmappedArrayCreate(CUmipmappedArray *pHandle, + const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, + unsigned int numMipmapLevels) { + using FuncPtr = CUresult(CUDAAPI *)( + CUmipmappedArray *, const CUDA_ARRAY3D_DESCRIPTOR *, unsigned int); + static auto func_ptr = LoadSymbol("cuMipmappedArrayCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHandle, pMipmappedArrayDesc, numMipmapLevels); +} + +CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, + CUmipmappedArray hMipmappedArray, + unsigned int level) { + using FuncPtr = + CUresult(CUDAAPI *)(CUarray *, CUmipmappedArray, unsigned int); + static auto func_ptr = LoadSymbol("cuMipmappedArrayGetLevel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pLevelArray, hMipmappedArray, level); +} + +CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) { + using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray); + static auto func_ptr = LoadSymbol("cuMipmappedArrayDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hMipmappedArray); +} + +CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, + size_t alignment, CUdeviceptr addr, + unsigned long long flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, size_t, size_t, + CUdeviceptr, unsigned long long); + static auto func_ptr = LoadSymbol("cuMemAddressReserve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size, alignment, addr, flags); +} + +CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemAddressFree"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size); +} + +CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, + const CUmemAllocationProp *prop, + unsigned long long flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, size_t, + const CUmemAllocationProp *, unsigned long long); + static auto func_ptr = LoadSymbol("cuMemCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, size, prop, flags); +} + +CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) { + using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle); + static auto func_ptr = LoadSymbol("cuMemRelease"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, + CUmemGenericAllocationHandle handle, + unsigned long long flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, size_t, size_t, + CUmemGenericAllocationHandle, unsigned long long); + static auto func_ptr = LoadSymbol("cuMemMap"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size, offset, handle, flags); +} + +CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t); + static 
auto func_ptr = LoadSymbol("cuMemUnmap"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size); +} + +CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, + const CUmemAccessDesc *desc, size_t count) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, size_t, const CUmemAccessDesc *, size_t); + static auto func_ptr = LoadSymbol("cuMemSetAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size, desc, count); +} + +CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, + const CUmemLocation *location, + CUdeviceptr ptr) { + using FuncPtr = CUresult(CUDAAPI *)(unsigned long long *, + const CUmemLocation *, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuMemGetAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(flags, location, ptr); +} + +CUresult CUDAAPI cuMemExportToShareableHandle( + void *shareableHandle, CUmemGenericAllocationHandle handle, + CUmemAllocationHandleType handleType, unsigned long long flags) { + using FuncPtr = + CUresult(CUDAAPI *)(void *, CUmemGenericAllocationHandle, + CUmemAllocationHandleType, unsigned long long); + static auto func_ptr = LoadSymbol("cuMemExportToShareableHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(shareableHandle, handle, handleType, flags); +} + +CUresult CUDAAPI cuMemImportFromShareableHandle( + CUmemGenericAllocationHandle *handle, void *osHandle, + CUmemAllocationHandleType shHandleType) { + using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *, + CUmemAllocationHandleType); + static auto func_ptr = LoadSymbol("cuMemImportFromShareableHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, osHandle, shHandleType); +} + +CUresult CUDAAPI cuMemGetAllocationGranularity( + size_t *granularity, const CUmemAllocationProp *prop, + CUmemAllocationGranularity_flags option) { + using FuncPtr = CUresult(CUDAAPI *)(size_t *, const CUmemAllocationProp *, + CUmemAllocationGranularity_flags); + static auto func_ptr = LoadSymbol("cuMemGetAllocationGranularity"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(granularity, prop, option); +} + +CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle( + CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle) { + using FuncPtr = + CUresult(CUDAAPI *)(CUmemAllocationProp *, CUmemGenericAllocationHandle); + static auto func_ptr = + LoadSymbol("cuMemGetAllocationPropertiesFromHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(prop, handle); +} + +CUresult CUDAAPI +cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr) { + using FuncPtr = CUresult(CUDAAPI *)(CUmemGenericAllocationHandle *, void *); + static auto func_ptr = LoadSymbol("cuMemRetainAllocationHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, addr); +} + +CUresult CUDAAPI cuPointerGetAttribute(void *data, + CUpointer_attribute attribute, + CUdeviceptr ptr) { + using FuncPtr = CUresult(CUDAAPI *)(void *, CUpointer_attribute, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuPointerGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, attribute, ptr); +} + +CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, + CUdevice dstDevice, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUdevice, CUstream); + static auto func_ptr = LoadSymbol("cuMemPrefetchAsync"); + if (!func_ptr) 
return GetSymbolNotFoundError(); + return func_ptr(devPtr, count, dstDevice, hStream); +} + +CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, + CUmem_advise advice, CUdevice device) { + using FuncPtr = + CUresult(CUDAAPI *)(CUdeviceptr, size_t, CUmem_advise, CUdevice); + static auto func_ptr = LoadSymbol("cuMemAdvise"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, count, advice, device); +} + +CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, + CUmem_range_attribute attribute, + CUdeviceptr devPtr, size_t count) { + using FuncPtr = CUresult(CUDAAPI *)(void *, size_t, CUmem_range_attribute, + CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemRangeGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, dataSize, attribute, devPtr, count); +} + +CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, + CUmem_range_attribute *attributes, + size_t numAttributes, + CUdeviceptr devPtr, size_t count) { + using FuncPtr = CUresult(CUDAAPI *)( + void **, size_t *, CUmem_range_attribute *, size_t, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuMemRangeGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count); +} + +CUresult CUDAAPI cuPointerSetAttribute(const void *value, + CUpointer_attribute attribute, + CUdeviceptr ptr) { + using FuncPtr = + CUresult(CUDAAPI *)(const void *, CUpointer_attribute, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuPointerSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(value, attribute, ptr); +} + +CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, + CUpointer_attribute *attributes, + void **data, CUdeviceptr ptr) { + using FuncPtr = CUresult(CUDAAPI *)(unsigned int, CUpointer_attribute *, + void **, CUdeviceptr); + static auto func_ptr = LoadSymbol("cuPointerGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numAttributes, attributes, data, ptr); +} + +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phStream, Flags); +} + +CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, + unsigned int flags, int priority) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream *, unsigned int, int); + static auto func_ptr = LoadSymbol("cuStreamCreateWithPriority"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phStream, flags, priority); +} + +CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, int *); + static auto func_ptr = LoadSymbol("cuStreamGetPriority"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, priority); +} + +CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int *); + static auto func_ptr = LoadSymbol("cuStreamGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, flags); +} + +CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUcontext *); + static auto func_ptr = LoadSymbol("cuStreamGetCtx"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(hStream, pctx); +} + +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, + unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUevent, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWaitEvent"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, hEvent, Flags); +} + +CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, + CUstreamCallback callback, void *userData, + unsigned int flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUstreamCallback, void *, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamAddCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, callback, userData, flags); +} + +CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, + CUstreamCaptureMode mode) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureMode); + static auto func_ptr = LoadSymbol("cuStreamBeginCapture_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, mode); +} + +CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode) { + using FuncPtr = CUresult(CUDAAPI *)(CUstreamCaptureMode *); + static auto func_ptr = + LoadSymbol("cuThreadExchangeStreamCaptureMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mode); +} + +CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUgraph *); + static auto func_ptr = LoadSymbol("cuStreamEndCapture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, phGraph); +} + +CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, + CUstreamCaptureStatus *captureStatus) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *); + static auto func_ptr = LoadSymbol("cuStreamIsCapturing"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, captureStatus); +} + +CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, + CUstreamCaptureStatus *captureStatus, + cuuint64_t *id) { + using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUstreamCaptureStatus *, cuuint64_t *); + static auto func_ptr = LoadSymbol("cuStreamGetCaptureInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, captureStatus, id); +} + +CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, + size_t length, unsigned int flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUdeviceptr, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamAttachMemAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, dptr, length, flags); +} + +CUresult CUDAAPI cuStreamQuery(CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream); + static auto func_ptr = LoadSymbol("cuStreamQuery"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream); +} + +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream); + static auto func_ptr = LoadSymbol("cuStreamSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream); +} + +CUresult CUDAAPI cuStreamDestroy(CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream); + static auto func_ptr = LoadSymbol("cuStreamDestroy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream); +} + +CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src) { + using 
FuncPtr = CUresult(CUDAAPI *)(CUstream, CUstream); + static auto func_ptr = LoadSymbol("cuStreamCopyAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src); +} + +CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, + CUstreamAttrValue *value_out) { + using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUstreamAttrID, CUstreamAttrValue *); + static auto func_ptr = LoadSymbol("cuStreamGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, attr, value_out); +} + +CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, + const CUstreamAttrValue *value) { + using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUstreamAttrID, const CUstreamAttrValue *); + static auto func_ptr = LoadSymbol("cuStreamSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, attr, value); +} + +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUevent *, unsigned int); + static auto func_ptr = LoadSymbol("cuEventCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phEvent, Flags); +} + +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUevent, CUstream); + static auto func_ptr = LoadSymbol("cuEventRecord"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hEvent, hStream); +} + +CUresult CUDAAPI cuEventQuery(CUevent hEvent) { + using FuncPtr = CUresult(CUDAAPI *)(CUevent); + static auto func_ptr = LoadSymbol("cuEventQuery"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hEvent); +} + +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent) { + using FuncPtr = CUresult(CUDAAPI *)(CUevent); + static auto func_ptr = LoadSymbol("cuEventSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hEvent); +} + +CUresult CUDAAPI cuEventDestroy(CUevent hEvent) { + using FuncPtr = CUresult(CUDAAPI *)(CUevent); + static auto func_ptr = LoadSymbol("cuEventDestroy_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hEvent); +} + +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, + CUevent hEnd) { + using FuncPtr = CUresult(CUDAAPI *)(float *, CUevent, CUevent); + static auto func_ptr = LoadSymbol("cuEventElapsedTime"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pMilliseconds, hStart, hEnd); +} + +CUresult CUDAAPI +cuImportExternalMemory(CUexternalMemory *extMem_out, + const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) { + using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory *, + const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *); + static auto func_ptr = LoadSymbol("cuImportExternalMemory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extMem_out, memHandleDesc); +} + +CUresult CUDAAPI cuExternalMemoryGetMappedBuffer( + CUdeviceptr *devPtr, CUexternalMemory extMem, + const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUexternalMemory, + const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *); + static auto func_ptr = LoadSymbol("cuExternalMemoryGetMappedBuffer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, extMem, bufferDesc); +} + +CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray( + CUmipmappedArray *mipmap, CUexternalMemory extMem, + const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) { + using FuncPtr = + 
CUresult(CUDAAPI *)(CUmipmappedArray *, CUexternalMemory, + const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *); + static auto func_ptr = + LoadSymbol("cuExternalMemoryGetMappedMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmap, extMem, mipmapDesc); +} + +CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem) { + using FuncPtr = CUresult(CUDAAPI *)(CUexternalMemory); + static auto func_ptr = LoadSymbol("cuDestroyExternalMemory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extMem); +} + +CUresult CUDAAPI cuImportExternalSemaphore( + CUexternalSemaphore *extSem_out, + const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) { + using FuncPtr = CUresult(CUDAAPI *)( + CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *); + static auto func_ptr = LoadSymbol("cuImportExternalSemaphore"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSem_out, semHandleDesc); +} + +CUresult CUDAAPI cuSignalExternalSemaphoresAsync( + const CUexternalSemaphore *extSemArray, + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, + unsigned int numExtSems, CUstream stream) { + using FuncPtr = CUresult(CUDAAPI *)( + const CUexternalSemaphore *, + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *, unsigned int, CUstream); + static auto func_ptr = LoadSymbol("cuSignalExternalSemaphoresAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSemArray, paramsArray, numExtSems, stream); +} + +CUresult CUDAAPI cuWaitExternalSemaphoresAsync( + const CUexternalSemaphore *extSemArray, + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, + unsigned int numExtSems, CUstream stream) { + using FuncPtr = CUresult(CUDAAPI *)( + const CUexternalSemaphore *, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *, + unsigned int, CUstream); + static auto func_ptr = LoadSymbol("cuWaitExternalSemaphoresAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSemArray, paramsArray, numExtSems, stream); +} + +CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem) { + using FuncPtr = CUresult(CUDAAPI *)(CUexternalSemaphore); + static auto func_ptr = LoadSymbol("cuDestroyExternalSemaphore"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSem); +} + +CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, + cuuint32_t value, unsigned int flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWaitValue32"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, addr, value, flags); +} + +CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, + cuuint64_t value, unsigned int flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWaitValue64"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, addr, value, flags); +} + +CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, + cuuint32_t value, unsigned int flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWriteValue32"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, addr, value, flags); +} + +CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, + cuuint64_t value, unsigned int flags) { + 
using FuncPtr = + CUresult(CUDAAPI *)(CUstream, CUdeviceptr, cuuint64_t, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamWriteValue64"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, addr, value, flags); +} + +CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, + CUstreamBatchMemOpParams *paramArray, + unsigned int flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, unsigned int, + CUstreamBatchMemOpParams *, unsigned int); + static auto func_ptr = LoadSymbol("cuStreamBatchMemOp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, count, paramArray, flags); +} + +CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, + CUfunction hfunc) { + using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction_attribute, CUfunction); + static auto func_ptr = LoadSymbol("cuFuncGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pi, attrib, hfunc); +} + +CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, + CUfunction_attribute attrib, int value) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunction_attribute, int); + static auto func_ptr = LoadSymbol("cuFuncSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, attrib, value); +} + +CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUfunc_cache); + static auto func_ptr = LoadSymbol("cuFuncSetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, config); +} + +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, + CUsharedconfig config) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, CUsharedconfig); + static auto func_ptr = LoadSymbol("cuFuncSetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, config); +} + +CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, + unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, CUstream hStream, + void **kernelParams, void **extra) { + using FuncPtr = CUresult(CUDAAPI *)( + CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, + unsigned int, unsigned int, unsigned int, CUstream, void **, void **); + static auto func_ptr = LoadSymbol("cuLaunchKernel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, + blockDimZ, sharedMemBytes, hStream, kernelParams, extra); +} + +CUresult CUDAAPI cuLaunchCooperativeKernel( + CUfunction f, unsigned int gridDimX, unsigned int gridDimY, + unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, + unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, + void **kernelParams) { + using FuncPtr = CUresult(CUDAAPI *)( + CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, + unsigned int, unsigned int, unsigned int, CUstream, void **); + static auto func_ptr = LoadSymbol("cuLaunchCooperativeKernel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, + blockDimZ, sharedMemBytes, hStream, kernelParams); +} + +CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice( + CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, + unsigned int flags) { + using FuncPtr = + CUresult(CUDAAPI *)(CUDA_LAUNCH_PARAMS *, unsigned int, 
unsigned int); + static auto func_ptr = + LoadSymbol("cuLaunchCooperativeKernelMultiDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(launchParamsList, numDevices, flags); +} + +CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, + void *userData) { + using FuncPtr = CUresult(CUDAAPI *)(CUstream, CUhostFn, void *); + static auto func_ptr = LoadSymbol("cuLaunchHostFunc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, fn, userData); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, + int y, int z) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, int); + static auto func_ptr = LoadSymbol("cuFuncSetBlockShape"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, x, y, z); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, + unsigned int bytes) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int); + static auto func_ptr = LoadSymbol("cuFuncSetSharedSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, bytes); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, + unsigned int numbytes) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, unsigned int); + static auto func_ptr = LoadSymbol("cuParamSetSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, numbytes); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, + unsigned int value) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, unsigned int); + static auto func_ptr = LoadSymbol("cuParamSeti"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, offset, value); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, + float value) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, float); + static auto func_ptr = LoadSymbol("cuParamSetf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, offset, value); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, + void *ptr, + unsigned int numbytes) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, void *, unsigned int); + static auto func_ptr = LoadSymbol("cuParamSetv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, offset, ptr, numbytes); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction); + static auto func_ptr = LoadSymbol("cuLaunch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(f); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, + int grid_height) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int); + static auto func_ptr = LoadSymbol("cuLaunchGrid"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(f, grid_width, grid_height); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, + int grid_width, + int grid_height, + CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, int, CUstream); + static auto func_ptr = LoadSymbol("cuLaunchGridAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(f, grid_width, grid_height, hStream); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, + int texunit, + CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUfunction, int, CUtexref); + static auto func_ptr = 
LoadSymbol("cuParamSetTexRef"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hfunc, texunit, hTexRef); +} + +CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, unsigned int); + static auto func_ptr = LoadSymbol("cuGraphCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraph, flags); +} + +CUresult CUDAAPI cuGraphAddKernelNode( + CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, + size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, + const CUDA_KERNEL_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphAddKernelNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, + nodeParams); +} + +CUresult CUDAAPI cuGraphKernelNodeGetParams( + CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphKernelNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphKernelNodeSetParams( + CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphKernelNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, + const CUgraphNode *dependencies, + size_t numDependencies, + const CUDA_MEMCPY3D *copyParams, + CUcontext ctx) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, + const CUDA_MEMCPY3D *, CUcontext); + static auto func_ptr = LoadSymbol("cuGraphAddMemcpyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, + copyParams, ctx); +} + +CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, + CUDA_MEMCPY3D *nodeParams) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_MEMCPY3D *); + static auto func_ptr = LoadSymbol("cuGraphMemcpyNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, + const CUDA_MEMCPY3D *nodeParams) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMCPY3D *); + static auto func_ptr = LoadSymbol("cuGraphMemcpyNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphAddMemsetNode( + CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, + size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, + CUcontext ctx) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, + const CUDA_MEMSET_NODE_PARAMS *, CUcontext); + static auto func_ptr = LoadSymbol("cuGraphAddMemsetNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, + memsetParams, ctx); +} + +CUresult CUDAAPI cuGraphMemsetNodeGetParams( + CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, 
CUDA_MEMSET_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphMemsetNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphMemsetNodeSetParams( + CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphMemsetNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, + const CUgraphNode *dependencies, + size_t numDependencies, + const CUDA_HOST_NODE_PARAMS *nodeParams) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t, + const CUDA_HOST_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphAddHostNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, + nodeParams); +} + +CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, + CUDA_HOST_NODE_PARAMS *nodeParams) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUDA_HOST_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphHostNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphHostNodeSetParams( + CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraphNode, const CUDA_HOST_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphHostNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, + CUgraph hGraph, + const CUgraphNode *dependencies, + size_t numDependencies, + CUgraph childGraph) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, + const CUgraphNode *, size_t, CUgraph); + static auto func_ptr = LoadSymbol("cuGraphAddChildGraphNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphNode, hGraph, dependencies, numDependencies, + childGraph); +} + +CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, + CUgraph *phGraph) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraph *); + static auto func_ptr = LoadSymbol("cuGraphChildGraphNodeGetGraph"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, phGraph); +} + +CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, + const CUgraphNode *dependencies, + size_t numDependencies) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraphNode *, CUgraph, const CUgraphNode *, size_t); + static auto func_ptr = LoadSymbol("cuGraphAddEmptyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphNode, hGraph, dependencies, numDependencies); +} + +CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraph *, CUgraph); + static auto func_ptr = LoadSymbol("cuGraphClone"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphClone, originalGraph); +} + +CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, + CUgraphNode hOriginalNode, + CUgraph hClonedGraph) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode *, CUgraphNode, CUgraph); + static auto func_ptr = LoadSymbol("cuGraphNodeFindInClone"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(phNode, hOriginalNode, hClonedGraph); +} + +CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNodeType *); + static auto func_ptr = LoadSymbol("cuGraphNodeGetType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, type); +} + +CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, + size_t *numNodes) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *); + static auto func_ptr = LoadSymbol("cuGraphGetNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph, nodes, numNodes); +} + +CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, + size_t *numRootNodes) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, size_t *); + static auto func_ptr = LoadSymbol("cuGraphGetRootNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph, rootNodes, numRootNodes); +} + +CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, + CUgraphNode *to, size_t *numEdges) { + using FuncPtr = + CUresult(CUDAAPI *)(CUgraph, CUgraphNode *, CUgraphNode *, size_t *); + static auto func_ptr = LoadSymbol("cuGraphGetEdges"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph, from, to, numEdges); +} + +CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, + CUgraphNode *dependencies, + size_t *numDependencies) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *); + static auto func_ptr = LoadSymbol("cuGraphNodeGetDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, dependencies, numDependencies); +} + +CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, + CUgraphNode *dependentNodes, + size_t *numDependentNodes) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode *, size_t *); + static auto func_ptr = LoadSymbol("cuGraphNodeGetDependentNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, dependentNodes, numDependentNodes); +} + +CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, + const CUgraphNode *to, + size_t numDependencies) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *, + const CUgraphNode *, size_t); + static auto func_ptr = LoadSymbol("cuGraphAddDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph, from, to, numDependencies); +} + +CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, + const CUgraphNode *from, + const CUgraphNode *to, + size_t numDependencies) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraph, const CUgraphNode *, + const CUgraphNode *, size_t); + static auto func_ptr = LoadSymbol("cuGraphRemoveDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph, from, to, numDependencies); +} + +CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode); + static auto func_ptr = LoadSymbol("cuGraphDestroyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode); +} + +CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, + CUgraphNode *phErrorNode, char *logBuffer, + size_t bufferSize) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec *, CUgraph, CUgraphNode *, + char *, size_t); + static auto func_ptr = LoadSymbol("cuGraphInstantiate_v2"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize); +} + +CUresult CUDAAPI +cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, + const CUDA_KERNEL_NODE_PARAMS *nodeParams) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, + const CUDA_KERNEL_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphExecKernelNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, + CUgraphNode hNode, + const CUDA_MEMCPY3D *copyParams, + CUcontext ctx) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, + const CUDA_MEMCPY3D *, CUcontext); + static auto func_ptr = LoadSymbol("cuGraphExecMemcpyNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hNode, copyParams, ctx); +} + +CUresult CUDAAPI cuGraphExecMemsetNodeSetParams( + CUgraphExec hGraphExec, CUgraphNode hNode, + const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) { + using FuncPtr = CUresult(CUDAAPI *)( + CUgraphExec, CUgraphNode, const CUDA_MEMSET_NODE_PARAMS *, CUcontext); + static auto func_ptr = LoadSymbol("cuGraphExecMemsetNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hNode, memsetParams, ctx); +} + +CUresult CUDAAPI +cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, + const CUDA_HOST_NODE_PARAMS *nodeParams) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraphNode, + const CUDA_HOST_NODE_PARAMS *); + static auto func_ptr = LoadSymbol("cuGraphExecHostNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hNode, nodeParams); +} + +CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUstream); + static auto func_ptr = LoadSymbol("cuGraphLaunch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hStream); +} + +CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec); + static auto func_ptr = LoadSymbol("cuGraphExecDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec); +} + +CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraph); + static auto func_ptr = LoadSymbol("cuGraphDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraph); +} + +CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, + CUgraphNode *hErrorNode_out, + CUgraphExecUpdateResult *updateResult_out) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphExec, CUgraph, CUgraphNode *, + CUgraphExecUpdateResult *); + static auto func_ptr = LoadSymbol("cuGraphExecUpdate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out); +} + +CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, + CUgraphNode src) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUgraphNode); + static auto func_ptr = LoadSymbol("cuGraphKernelNodeCopyAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src); +} + +CUresult CUDAAPI +cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + CUkernelNodeAttrValue *value_out) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, 
CUkernelNodeAttrID, + CUkernelNodeAttrValue *); + static auto func_ptr = LoadSymbol("cuGraphKernelNodeGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, attr, value_out); +} + +CUresult CUDAAPI +cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + const CUkernelNodeAttrValue *value) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphNode, CUkernelNodeAttrID, + const CUkernelNodeAttrValue *); + static auto func_ptr = LoadSymbol("cuGraphKernelNodeSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, attr, value); +} + +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor( + int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) { + using FuncPtr = CUresult(CUDAAPI *)(int *, CUfunction, int, size_t); + static auto func_ptr = + LoadSymbol("cuOccupancyMaxActiveBlocksPerMultiprocessor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numBlocks, func, blockSize, dynamicSMemSize); +} + +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, + unsigned int flags) { + using FuncPtr = + CUresult(CUDAAPI *)(int *, CUfunction, int, size_t, unsigned int); + static auto func_ptr = LoadSymbol( + "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags); +} + +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize( + int *minGridSize, int *blockSize, CUfunction func, + CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, + int blockSizeLimit) { + using FuncPtr = CUresult(CUDAAPI *)(int *, int *, CUfunction, + CUoccupancyB2DSize, size_t, int); + static auto func_ptr = + LoadSymbol("cuOccupancyMaxPotentialBlockSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, + dynamicSMemSize, blockSizeLimit); +} + +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags( + int *minGridSize, int *blockSize, CUfunction func, + CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, + int blockSizeLimit, unsigned int flags) { + using FuncPtr = CUresult(CUDAAPI *)( + int *, int *, CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int); + static auto func_ptr = + LoadSymbol("cuOccupancyMaxPotentialBlockSizeWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, + dynamicSMemSize, blockSizeLimit, flags); +} + +CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock( + size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize) { + using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUfunction, int, int); + static auto func_ptr = + LoadSymbol("cuOccupancyAvailableDynamicSMemPerBlock"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dynamicSmemSize, func, numBlocks, blockSize); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, + CUarray hArray, + unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray, unsigned int); + static auto func_ptr = LoadSymbol("cuTexRefSetArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, hArray, Flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray( + CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags) { + using 
FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUmipmappedArray, unsigned int); + static auto func_ptr = LoadSymbol("cuTexRefSetMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, hMipmappedArray, Flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, + CUtexref hTexRef, + CUdeviceptr dptr, + size_t bytes) { + using FuncPtr = CUresult(CUDAAPI *)(size_t *, CUtexref, CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuTexRefSetAddress_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ByteOffset, hTexRef, dptr, bytes); +} + +__CUDA_DEPRECATED CUresult CUDAAPI +cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, + CUdeviceptr dptr, size_t Pitch) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, const CUDA_ARRAY_DESCRIPTOR *, + CUdeviceptr, size_t); + static auto func_ptr = LoadSymbol("cuTexRefSetAddress2D_v3"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, desc, dptr, Pitch); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, + CUarray_format fmt, + int NumPackedComponents) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUarray_format, int); + static auto func_ptr = LoadSymbol("cuTexRefSetFormat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, fmt, NumPackedComponents); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, + int dim, + CUaddress_mode am) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, int, CUaddress_mode); + static auto func_ptr = LoadSymbol("cuTexRefSetAddressMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, dim, am); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, + CUfilter_mode fm) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode); + static auto func_ptr = LoadSymbol("cuTexRefSetFilterMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, fm); +} + +__CUDA_DEPRECATED CUresult CUDAAPI +cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, CUfilter_mode); + static auto func_ptr = LoadSymbol("cuTexRefSetMipmapFilterMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, fm); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, + float bias) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float); + static auto func_ptr = LoadSymbol("cuTexRefSetMipmapLevelBias"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, bias); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp( + CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, float, float); + static auto func_ptr = LoadSymbol("cuTexRefSetMipmapLevelClamp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp); +} + +__CUDA_DEPRECATED CUresult CUDAAPI +cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int); + static auto func_ptr = LoadSymbol("cuTexRefSetMaxAnisotropy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, maxAniso); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, + float *pBorderColor) { + using FuncPtr = CUresult(CUDAAPI 
*)(CUtexref, float *); + static auto func_ptr = LoadSymbol("cuTexRefSetBorderColor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, pBorderColor); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, + unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref, unsigned int); + static auto func_ptr = LoadSymbol("cuTexRefSetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef, Flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, + CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUdeviceptr *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetAddress_v2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pdptr, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, + CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phArray, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray( + CUmipmappedArray *phMipmappedArray, CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phMipmappedArray, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, + CUtexref hTexRef, + int dim) { + using FuncPtr = CUresult(CUDAAPI *)(CUaddress_mode *, CUtexref, int); + static auto func_ptr = LoadSymbol("cuTexRefGetAddressMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pam, hTexRef, dim); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, + CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetFilterMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pfm, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, + int *pNumChannels, + CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray_format *, int *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetFormat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pFormat, pNumChannels, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI +cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUfilter_mode *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMipmapFilterMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pfm, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI +cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMipmapLevelBias"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pbias, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI +cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, + float *pmaxMipmapLevelClamp, CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(float *, float *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMipmapLevelClamp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef); +} + +__CUDA_DEPRECATED CUresult 
CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, + CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(int *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetMaxAnisotropy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pmaxAniso, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, + CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(float *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetBorderColor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pBorderColor, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, + CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(unsigned int *, CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pFlags, hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref *); + static auto func_ptr = LoadSymbol("cuTexRefCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexref); + static auto func_ptr = LoadSymbol("cuTexRefDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hTexRef); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, + CUarray hArray, + unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUsurfref, CUarray, unsigned int); + static auto func_ptr = LoadSymbol("cuSurfRefSetArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hSurfRef, hArray, Flags); +} + +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, + CUsurfref hSurfRef) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUsurfref); + static auto func_ptr = LoadSymbol("cuSurfRefGetArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(phArray, hSurfRef); +} + +CUresult CUDAAPI +cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, + const CUDA_TEXTURE_DESC *pTexDesc, + const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexObject *, const CUDA_RESOURCE_DESC *, + const CUDA_TEXTURE_DESC *, + const CUDA_RESOURCE_VIEW_DESC *); + static auto func_ptr = LoadSymbol("cuTexObjectCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc); +} + +CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject) { + using FuncPtr = CUresult(CUDAAPI *)(CUtexObject); + static auto func_ptr = LoadSymbol("cuTexObjectDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texObject); +} + +CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, + CUtexObject texObject) { + using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUtexObject); + static auto func_ptr = LoadSymbol("cuTexObjectGetResourceDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResDesc, texObject); +} + +CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, + CUtexObject texObject) { + using FuncPtr = CUresult(CUDAAPI *)(CUDA_TEXTURE_DESC *, CUtexObject); + static auto func_ptr = LoadSymbol("cuTexObjectGetTextureDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexDesc, texObject); +} + +CUresult CUDAAPI 
cuTexObjectGetResourceViewDesc( + CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) { + using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_VIEW_DESC *, CUtexObject); + static auto func_ptr = LoadSymbol("cuTexObjectGetResourceViewDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResViewDesc, texObject); +} + +CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, + const CUDA_RESOURCE_DESC *pResDesc) { + using FuncPtr = + CUresult(CUDAAPI *)(CUsurfObject *, const CUDA_RESOURCE_DESC *); + static auto func_ptr = LoadSymbol("cuSurfObjectCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pSurfObject, pResDesc); +} + +CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject) { + using FuncPtr = CUresult(CUDAAPI *)(CUsurfObject); + static auto func_ptr = LoadSymbol("cuSurfObjectDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(surfObject); +} + +CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, + CUsurfObject surfObject) { + using FuncPtr = CUresult(CUDAAPI *)(CUDA_RESOURCE_DESC *, CUsurfObject); + static auto func_ptr = LoadSymbol("cuSurfObjectGetResourceDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResDesc, surfObject); +} + +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, + CUdevice peerDev) { + using FuncPtr = CUresult(CUDAAPI *)(int *, CUdevice, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceCanAccessPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(canAccessPeer, dev, peerDev); +} + +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, + unsigned int Flags) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext, unsigned int); + static auto func_ptr = LoadSymbol("cuCtxEnablePeerAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(peerContext, Flags); +} + +CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext) { + using FuncPtr = CUresult(CUDAAPI *)(CUcontext); + static auto func_ptr = LoadSymbol("cuCtxDisablePeerAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(peerContext); +} + +CUresult CUDAAPI cuDeviceGetP2PAttribute(int *value, + CUdevice_P2PAttribute attrib, + CUdevice srcDevice, + CUdevice dstDevice) { + using FuncPtr = + CUresult(CUDAAPI *)(int *, CUdevice_P2PAttribute, CUdevice, CUdevice); + static auto func_ptr = LoadSymbol("cuDeviceGetP2PAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(value, attrib, srcDevice, dstDevice); +} + +CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource) { + using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource); + static auto func_ptr = LoadSymbol("cuGraphicsUnregisterResource"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(resource); +} + +CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray( + CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, + unsigned int mipLevel) { + using FuncPtr = CUresult(CUDAAPI *)(CUarray *, CUgraphicsResource, + unsigned int, unsigned int); + static auto func_ptr = + LoadSymbol("cuGraphicsSubResourceGetMappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pArray, resource, arrayIndex, mipLevel); +} + +CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray( + CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) { + using FuncPtr = CUresult(CUDAAPI *)(CUmipmappedArray *, CUgraphicsResource); + static 
auto func_ptr =
+      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedMipmappedArray");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pMipmappedArray, resource);
+}
+
+CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(
+    CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
+  using FuncPtr =
+      CUresult(CUDAAPI *)(CUdeviceptr *, size_t *, CUgraphicsResource);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cuGraphicsResourceGetMappedPointer_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pDevPtr, pSize, resource);
+}
+
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource,
+                                               unsigned int flags) {
+  using FuncPtr = CUresult(CUDAAPI *)(CUgraphicsResource, unsigned int);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cuGraphicsResourceSetMapFlags_v2");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(resource, flags);
+}
+
+CUresult CUDAAPI cuGraphicsMapResources(unsigned int count,
+                                        CUgraphicsResource *resources,
+                                        CUstream hStream) {
+  using FuncPtr =
+      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsMapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, hStream);
+}
+
+CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count,
+                                          CUgraphicsResource *resources,
+                                          CUstream hStream) {
+  using FuncPtr =
+      CUresult(CUDAAPI *)(unsigned int, CUgraphicsResource *, CUstream);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGraphicsUnmapResources");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(count, resources, hStream);
+}
+
+CUresult CUDAAPI cuGetExportTable(const void **ppExportTable,
+                                  const CUuuid *pExportTableId) {
+  using FuncPtr = CUresult(CUDAAPI *)(const void **, const CUuuid *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuGetExportTable");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(ppExportTable, pExportTableId);
+}
+
+CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc) {
+  using FuncPtr = CUresult(CUDAAPI *)(CUmodule *, CUfunction);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cuFuncGetModule");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(hmod, hfunc);
+}
+
+}  // extern "C"
diff --git a/tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc b/tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc
new file mode 100644
index 00000000000..df3ada219e2
--- /dev/null
+++ b/tensorflow/stream_executor/cuda/cuda_runtime_11_0.inc
@@ -0,0 +1,1974 @@
+// Auto-generated, do not edit.
+
+extern "C" {
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceReset");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceSynchronize(void) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)();
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSynchronize");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr();
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit,
+                                                         size_t value) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetLimit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(limit, value);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetLimit");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pValue, limit);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetCacheConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pCacheConfig);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int *);
+  static auto func_ptr =
+      LoadSymbol<FuncPtr>("cudaDeviceGetStreamPriorityRange");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(leastPriority, greatestPriority);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetCacheConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(cacheConfig);
+}
+
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI
+cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetSharedMemConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pConfig);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaSharedMemConfig);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceSetSharedMemConfig");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(config);
+}
+
+extern __host__ cudaError_t CUDARTAPI
+cudaDeviceGetByPCIBusId(int *device, const char *pciBusId) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const char *);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetByPCIBusId");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(device, pciBusId);
+}
+
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId,
+                                                            int len,
+                                                            int device) {
+  using FuncPtr = cudaError_t(CUDARTAPI *)(char *, int, int);
+  static auto func_ptr = LoadSymbol<FuncPtr>("cudaDeviceGetPCIBusId");
+  if (!func_ptr) return GetSymbolNotFoundError();
+  return func_ptr(pciBusId, len, device);
+}
+
+extern __host__
cudaError_t CUDARTAPI +cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcEventHandle_t *, cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaIpcGetEventHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, event); +} + +extern __host__ cudaError_t CUDARTAPI +cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, cudaIpcEventHandle_t); + static auto func_ptr = LoadSymbol("cudaIpcOpenEventHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event, handle); +} + +extern __host__ cudaError_t CUDARTAPI +cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaIpcMemHandle_t *, void *); + static auto func_ptr = LoadSymbol("cudaIpcGetMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, devPtr); +} + +extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle( + void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(void **, cudaIpcMemHandle_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaIpcOpenMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, handle, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *); + static auto func_ptr = LoadSymbol("cudaIpcCloseMemHandle"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void) { + using FuncPtr = cudaError_t(CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaThreadExit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaThreadSynchronize(void) { + using FuncPtr = cudaError_t(CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaThreadSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaThreadSetLimit(enum cudaLimit limit, size_t value) { + using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaLimit, size_t); + static auto func_ptr = LoadSymbol("cudaThreadSetLimit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(limit, value); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit) { + using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, enum cudaLimit); + static auto func_ptr = LoadSymbol("cudaThreadGetLimit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pValue, limit); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig) { + using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache *); + static auto func_ptr = LoadSymbol("cudaThreadGetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pCacheConfig); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig) { + using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaFuncCache); + static auto func_ptr = LoadSymbol("cudaThreadSetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(cacheConfig); +} + +extern __host__ __cudart_builtin__ 
cudaError_t CUDARTAPI +cudaGetLastError(void) { + using FuncPtr = cudaError_t(CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaGetLastError"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaPeekAtLastError(void) { + using FuncPtr = cudaError_t(CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaPeekAtLastError"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __host__ __cudart_builtin__ const char *CUDARTAPI +cudaGetErrorName(cudaError_t error) { + using FuncPtr = const char *(CUDARTAPI *)(cudaError_t); + static auto func_ptr = LoadSymbol("cudaGetErrorName"); + if (!func_ptr) return "cudaGetErrorName symbol not found."; + return func_ptr(error); +} + +extern __host__ __cudart_builtin__ const char *CUDARTAPI +cudaGetErrorString(cudaError_t error) { + using FuncPtr = const char *(CUDARTAPI *)(cudaError_t); + static auto func_ptr = LoadSymbol("cudaGetErrorString"); + if (!func_ptr) return "cudaGetErrorString symbol not found."; + return func_ptr(error); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaGetDeviceCount(int *count) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int *); + static auto func_ptr = LoadSymbol("cudaGetDeviceCount"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(count); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) { + using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaDeviceProp *, int); + static auto func_ptr = LoadSymbol("cudaGetDeviceProperties"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(prop, device); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceAttr, int); + static auto func_ptr = LoadSymbol("cudaDeviceGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(value, attr, device); +} + +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes( + void *nvSciSyncAttrList, int device, int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, int); + static auto func_ptr = + LoadSymbol("cudaDeviceGetNvSciSyncAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(nvSciSyncAttrList, device, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, + int srcDevice, int dstDevice) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(int *, enum cudaDeviceP2PAttr, int, int); + static auto func_ptr = LoadSymbol("cudaDeviceGetP2PAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(value, attr, srcDevice, dstDevice); +} + +extern __host__ cudaError_t CUDARTAPI +cudaChooseDevice(int *device, const struct cudaDeviceProp *prop) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(int *, const struct cudaDeviceProp *); + static auto func_ptr = LoadSymbol("cudaChooseDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device, prop); +} + +extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int); + static auto func_ptr = LoadSymbol("cudaSetDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI 
+cudaGetDevice(int *device) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int *); + static auto func_ptr = LoadSymbol("cudaGetDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device); +} + +extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, + int len) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int); + static auto func_ptr = LoadSymbol("cudaSetValidDevices"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(device_arr, len); +} + +extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags(unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int); + static auto func_ptr = LoadSymbol("cudaSetDeviceFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags(unsigned int *flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *); + static auto func_ptr = LoadSymbol("cudaGetDeviceFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *); + static auto func_ptr = LoadSymbol("cudaStreamCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pStream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int); + static auto func_ptr = LoadSymbol("cudaStreamCreateWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pStream, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, + int priority) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t *, unsigned int, int); + static auto func_ptr = LoadSymbol("cudaStreamCreateWithPriority"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pStream, flags, priority); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamGetPriority(cudaStream_t hStream, int *priority) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, int *); + static auto func_ptr = LoadSymbol("cudaStreamGetPriority"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, priority); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, unsigned int *); + static auto func_ptr = LoadSymbol("cudaStreamGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void) { + using FuncPtr = cudaError_t(CUDARTAPI *)(); + static auto func_ptr = LoadSymbol("cudaCtxResetPersistingL2Cache"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaStreamCopyAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamGetAttribute(cudaStream_t hStream, enum cudaStreamAttrID attr, + union 
cudaStreamAttrValue *value_out) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamAttrID, + union cudaStreamAttrValue *); + static auto func_ptr = LoadSymbol("cudaStreamGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, attr, value_out); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamSetAttribute(cudaStream_t hStream, enum cudaStreamAttrID attr, + const union cudaStreamAttrValue *value) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamAttrID, + const union cudaStreamAttrValue *); + static auto func_ptr = LoadSymbol("cudaStreamSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hStream, attr, value); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamDestroy(cudaStream_t stream) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t); + static auto func_ptr = LoadSymbol("cudaStreamDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent( + cudaStream_t stream, cudaEvent_t event, unsigned int flags) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaStream_t, cudaEvent_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaStreamWaitEvent"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, event, flags); +} + +extern __host__ cudaError_t CUDARTAPI +cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, + void *userData, unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaStreamCallback_t, + void *, unsigned int); + static auto func_ptr = LoadSymbol("cudaStreamAddCallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, callback, userData, flags); +} + +extern __host__ cudaError_t CUDARTAPI +cudaStreamSynchronize(cudaStream_t stream) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t); + static auto func_ptr = LoadSymbol("cudaStreamSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t); + static auto func_ptr = LoadSymbol("cudaStreamQuery"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, + size_t length __dv(0), + unsigned int flags __dv(cudaMemAttachSingle)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaStream_t, void *, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaStreamAttachMemAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, devPtr, length, flags); +} + +extern __host__ cudaError_t CUDARTAPI +cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureMode); + static auto func_ptr = LoadSymbol("cudaStreamBeginCapture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, mode); +} + +extern __host__ cudaError_t CUDARTAPI +cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode) { + using FuncPtr = cudaError_t(CUDARTAPI *)(enum cudaStreamCaptureMode *); + static auto func_ptr = + LoadSymbol("cudaThreadExchangeStreamCaptureMode"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(mode); +} + +extern __host__ cudaError_t CUDARTAPI +cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaGraph_t *); + static auto func_ptr = LoadSymbol("cudaStreamEndCapture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, pGraph); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing( + cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaStream_t, enum cudaStreamCaptureStatus *); + static auto func_ptr = LoadSymbol("cudaStreamIsCapturing"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, pCaptureStatus); +} + +extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo( + cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus, + unsigned long long *pId) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *); + static auto func_ptr = LoadSymbol("cudaStreamGetCaptureInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, pCaptureStatus, pId); +} + +extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *); + static auto func_ptr = LoadSymbol("cudaEventCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t *, unsigned int); + static auto func_ptr = LoadSymbol("cudaEventCreateWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaEventRecord"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaEventQuery"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event); +} + +extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaEventSynchronize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaEventDestroy(cudaEvent_t event) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaEventDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(event); +} + +extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, + cudaEvent_t start, + cudaEvent_t end) { + using FuncPtr = cudaError_t(CUDARTAPI *)(float *, cudaEvent_t, cudaEvent_t); + static auto func_ptr = LoadSymbol("cudaEventElapsedTime"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ms, start, end); +} + +extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory( + cudaExternalMemory_t *extMem_out, + const struct cudaExternalMemoryHandleDesc *memHandleDesc) { + using FuncPtr = 
cudaError_t(CUDARTAPI *)( + cudaExternalMemory_t *, const struct cudaExternalMemoryHandleDesc *); + static auto func_ptr = LoadSymbol("cudaImportExternalMemory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extMem_out, memHandleDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer( + void **devPtr, cudaExternalMemory_t extMem, + const struct cudaExternalMemoryBufferDesc *bufferDesc) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(void **, cudaExternalMemory_t, + const struct cudaExternalMemoryBufferDesc *); + static auto func_ptr = + LoadSymbol("cudaExternalMemoryGetMappedBuffer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, extMem, bufferDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray( + cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem, + const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + cudaMipmappedArray_t *, cudaExternalMemory_t, + const struct cudaExternalMemoryMipmappedArrayDesc *); + static auto func_ptr = + LoadSymbol("cudaExternalMemoryGetMappedMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmap, extMem, mipmapDesc); +} + +extern __host__ cudaError_t CUDARTAPI +cudaDestroyExternalMemory(cudaExternalMemory_t extMem) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalMemory_t); + static auto func_ptr = LoadSymbol("cudaDestroyExternalMemory"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extMem); +} + +extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore( + cudaExternalSemaphore_t *extSem_out, + const struct cudaExternalSemaphoreHandleDesc *semHandleDesc) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t *, + const struct cudaExternalSemaphoreHandleDesc *); + static auto func_ptr = LoadSymbol("cudaImportExternalSemaphore"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSem_out, semHandleDesc); +} + +extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync( + const cudaExternalSemaphore_t *extSemArray, + const struct cudaExternalSemaphoreSignalParams *paramsArray, + unsigned int numExtSems, cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *, + const struct cudaExternalSemaphoreSignalParams *, + unsigned int, cudaStream_t); + static auto func_ptr = + LoadSymbol("cudaSignalExternalSemaphoresAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSemArray, paramsArray, numExtSems, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync( + const cudaExternalSemaphore_t *extSemArray, + const struct cudaExternalSemaphoreWaitParams *paramsArray, + unsigned int numExtSems, cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const cudaExternalSemaphore_t *, + const struct cudaExternalSemaphoreWaitParams *, + unsigned int, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaWaitExternalSemaphoresAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(extSemArray, paramsArray, numExtSems, stream); +} + +extern __host__ cudaError_t CUDARTAPI +cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaExternalSemaphore_t); + static auto func_ptr = LoadSymbol("cudaDestroyExternalSemaphore"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(extSem); +} + +extern __host__ cudaError_t CUDARTAPI +cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, + size_t sharedMem, cudaStream_t stream) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **, + size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaLaunchKernel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, gridDim, blockDim, args, sharedMem, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel( + const void *func, dim3 gridDim, dim3 blockDim, void **args, + size_t sharedMem, cudaStream_t stream) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, dim3, dim3, void **, + size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaLaunchCooperativeKernel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, gridDim, blockDim, args, sharedMem, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice( + struct cudaLaunchParams *launchParamsList, unsigned int numDevices, + unsigned int flags __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaLaunchParams *, + unsigned int, unsigned int); + static auto func_ptr = + LoadSymbol("cudaLaunchCooperativeKernelMultiDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(launchParamsList, numDevices, flags); +} + +extern __host__ cudaError_t CUDARTAPI +cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncCache); + static auto func_ptr = LoadSymbol("cudaFuncSetCacheConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, cacheConfig); +} + +extern __host__ cudaError_t CUDARTAPI +cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const void *, enum cudaSharedMemConfig); + static auto func_ptr = LoadSymbol("cudaFuncSetSharedMemConfig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, config); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(struct cudaFuncAttributes *, const void *); + static auto func_ptr = LoadSymbol("cudaFuncGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(attr, func); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const void *, enum cudaFuncAttribute, int); + static auto func_ptr = LoadSymbol("cudaFuncSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(func, attr, value); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaSetDoubleForDevice(double *d) { + using FuncPtr = cudaError_t(CUDARTAPI *)(double *); + static auto func_ptr = LoadSymbol("cudaSetDoubleForDevice"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(d); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaSetDoubleForHost(double *d) { + using FuncPtr = cudaError_t(CUDARTAPI *)(double *); + static auto func_ptr = LoadSymbol("cudaSetDoubleForHost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(d); +} + +extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, + cudaHostFn_t fn, + void 
*userData) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaStream_t, cudaHostFn_t, void *); + static auto func_ptr = LoadSymbol("cudaLaunchHostFunc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(stream, fn, userData); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, + int blockSize, + size_t dynamicSMemSize) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t); + static auto func_ptr = + LoadSymbol("cudaOccupancyMaxActiveBlocksPerMultiprocessor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numBlocks, func, blockSize, dynamicSMemSize); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, + const void *func, int numBlocks, + int blockSize) { + using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *, int, int); + static auto func_ptr = + LoadSymbol("cudaOccupancyAvailableDynamicSMemPerBlock"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dynamicSmemSize, func, numBlocks, blockSize); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, + const void *func, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(int *, const void *, int, size_t, unsigned int); + static auto func_ptr = LoadSymbol( + "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(numBlocks, func, blockSize, dynamicSMemSize, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged( + void **devPtr, size_t size, unsigned int flags __dv(cudaMemAttachGlobal)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaMallocManaged"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, size, flags); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaMalloc(void **devPtr, size_t size) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t); + static auto func_ptr = LoadSymbol("cudaMalloc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, size); +} + +extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t); + static auto func_ptr = LoadSymbol("cudaMallocHost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size); +} + +extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, + size_t *pitch, + size_t width, + size_t height) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t *, size_t, size_t); + static auto func_ptr = LoadSymbol("cudaMallocPitch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, pitch, width, height); +} + +extern __host__ cudaError_t CUDARTAPI cudaMallocArray( + cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, + size_t height __dv(0), unsigned int flags __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *, + const struct cudaChannelFormatDesc *, + size_t, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaMallocArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(array, desc, width, height, flags); +} + +extern __host__ 
__cudart_builtin__ cudaError_t CUDARTAPI +cudaFree(void *devPtr) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *); + static auto func_ptr = LoadSymbol("cudaFree"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr); +} + +extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *); + static auto func_ptr = LoadSymbol("cudaFreeHost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr); +} + +extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t); + static auto func_ptr = LoadSymbol("cudaFreeArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(array); +} + +extern __host__ cudaError_t CUDARTAPI +cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t); + static auto func_ptr = LoadSymbol("cudaFreeMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmappedArray); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, + unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaHostAlloc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pHost, size, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, + unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaHostRegister"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr, size, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *); + static auto func_ptr = LoadSymbol("cudaHostUnregister"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ptr); +} + +extern __host__ cudaError_t CUDARTAPI +cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **, void *, unsigned int); + static auto func_ptr = LoadSymbol("cudaHostGetDevicePointer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pDevice, pHost, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, + void *pHost) { + using FuncPtr = cudaError_t(CUDARTAPI *)(unsigned int *, void *); + static auto func_ptr = LoadSymbol("cudaHostGetFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pFlags, pHost); +} + +extern __host__ cudaError_t CUDARTAPI +cudaMalloc3D(struct cudaPitchedPtr *pitchedDevPtr, struct cudaExtent extent) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr *, struct cudaExtent); + static auto func_ptr = LoadSymbol("cudaMalloc3D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pitchedDevPtr, extent); +} + +extern __host__ cudaError_t CUDARTAPI +cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, + struct cudaExtent extent, unsigned int flags __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t *, + const struct cudaChannelFormatDesc *, + struct cudaExtent, unsigned int); + static auto func_ptr = LoadSymbol("cudaMalloc3DArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(array, desc, extent, flags); +} + +extern __host__ cudaError_t CUDARTAPI 
cudaMallocMipmappedArray( + cudaMipmappedArray_t *mipmappedArray, + const struct cudaChannelFormatDesc *desc, struct cudaExtent extent, + unsigned int numLevels, unsigned int flags __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + cudaMipmappedArray_t *, const struct cudaChannelFormatDesc *, + struct cudaExtent, unsigned int, unsigned int); + static auto func_ptr = LoadSymbol("cudaMallocMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmappedArray, desc, extent, numLevels, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel( + cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, + unsigned int level) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + cudaArray_t *, cudaMipmappedArray_const_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaGetMipmappedArrayLevel"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(levelArray, mipmappedArray, level); +} + +extern __host__ cudaError_t CUDARTAPI +cudaMemcpy3D(const struct cudaMemcpy3DParms *p) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaMemcpy3D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p); +} + +extern __host__ cudaError_t CUDARTAPI +cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *); + static auto func_ptr = LoadSymbol("cudaMemcpy3DPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync( + const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DParms *, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy3DAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync( + const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const struct cudaMemcpy3DPeerParms *, + cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy3DPeerAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(p, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, + size_t *total) { + using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaMemGetInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(free, total); +} + +extern __host__ cudaError_t CUDARTAPI +cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, + unsigned int *flags, cudaArray_t array) { + using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *, + struct cudaExtent *, unsigned int *, + cudaArray_t); + static auto func_ptr = LoadSymbol("cudaArrayGetInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(desc, extent, flags, array); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, + size_t count, + enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, + enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, count, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, + 
const void *src, + int srcDevice, + size_t count) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(void *, int, const void *, int, size_t); + static auto func_ptr = LoadSymbol("cudaMemcpyPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dstDevice, src, srcDevice, count); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, + const void *src, + size_t spitch, size_t width, + size_t height, + enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, + size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy2D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dpitch, src, spitch, width, height, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray( + cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, + size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *, + size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy2DToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray( + void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, + size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, size_t, + size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy2DFromArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray( + cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, + cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, + size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, + cudaArray_const_t, size_t, size_t, + size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpy2DArrayToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, + width, height, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol( + const void *symbol, const void *src, size_t count, size_t offset __dv(0), + enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, + size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyToSymbol"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(symbol, src, count, offset, kind); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol( + void *dst, const void *symbol, size_t count, size_t offset __dv(0), + enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t, + enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyFromSymbol"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, symbol, count, offset, kind); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaMemcpyAsync(void *dst, const void *src, size_t count, + enum 
cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, + enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, count, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI +cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, + size_t count, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, const void *, int, + size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyPeerAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dstDevice, src, srcDevice, count, stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync( + void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, + size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(void *, size_t, const void *, size_t, size_t, + size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy2DAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dpitch, src, spitch, width, height, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync( + cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, + size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, + cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, + const void *, size_t, size_t, size_t, + enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy2DToArrayAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffset, hOffset, src, spitch, width, height, kind, + stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync( + void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, + size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, + cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, cudaArray_const_t, + size_t, size_t, size_t, size_t, + enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpy2DFromArrayAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, dpitch, src, wOffset, hOffset, width, height, kind, + stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync( + const void *symbol, const void *src, size_t count, size_t offset, + enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const void *, const void *, size_t, size_t, + enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyToSymbolAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(symbol, src, count, offset, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync( + void *dst, const void *symbol, size_t count, size_t offset, + enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, const void *, size_t, size_t, + enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyFromSymbolAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, symbol, count, offset, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemset(void 
*devPtr, int value, + size_t count) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t); + static auto func_ptr = LoadSymbol("cudaMemset"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, value, count); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, + int value, size_t width, + size_t height) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t); + static auto func_ptr = LoadSymbol("cudaMemset2D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, pitch, value, width, height); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemset3D( + struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, struct cudaExtent); + static auto func_ptr = LoadSymbol("cudaMemset3D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pitchedDevPtr, value, extent); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync( + void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, int, size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemsetAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, value, count, stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, + size_t height, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, size_t, int, size_t, size_t, + cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemset2DAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, pitch, value, width, height, stream); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, + struct cudaExtent extent, cudaStream_t stream __dv(0)) { + using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaPitchedPtr, int, + struct cudaExtent, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemset3DAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pitchedDevPtr, value, extent, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, + const void *symbol) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **, const void *); + static auto func_ptr = LoadSymbol("cudaGetSymbolAddress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, symbol); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, + const void *symbol) { + using FuncPtr = cudaError_t(CUDARTAPI *)(size_t *, const void *); + static auto func_ptr = LoadSymbol("cudaGetSymbolSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(size, symbol); +} + +extern __host__ cudaError_t CUDARTAPI +cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, + cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const void *, size_t, int, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemPrefetchAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, count, dstDevice, stream); +} + +extern __host__ cudaError_t CUDARTAPI +cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, + int device) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const void *, size_t, + enum cudaMemoryAdvise, 
int); + static auto func_ptr = LoadSymbol("cudaMemAdvise"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, count, advice, device); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute( + void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, + const void *devPtr, size_t count) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + void *, size_t, enum cudaMemRangeAttribute, const void *, size_t); + static auto func_ptr = LoadSymbol("cudaMemRangeGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, dataSize, attribute, devPtr, count); +} + +extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes( + void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, + size_t numAttributes, const void *devPtr, size_t count) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(void **, size_t *, enum cudaMemRangeAttribute *, + size_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cudaMemRangeGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(data, dataSizes, attributes, numAttributes, devPtr, count); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, + const void *src, size_t count, enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + cudaArray_t, size_t, size_t, const void *, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffset, hOffset, src, count, kind); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, + size_t hOffset, size_t count, enum cudaMemcpyKind kind) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void *, cudaArray_const_t, size_t, + size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyFromArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, wOffset, hOffset, count, kind); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray( + cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, + cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, + enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, cudaArray_const_t, + size_t, size_t, size_t, enum cudaMemcpyKind); + static auto func_ptr = LoadSymbol("cudaMemcpyArrayToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, + count, kind); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync( + cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, + size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaArray_t, size_t, size_t, const void *, + size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyToArrayAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, wOffset, hOffset, src, count, kind, stream); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, + size_t hOffset, size_t count, enum cudaMemcpyKind kind, + cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI 
*)(void *, cudaArray_const_t, size_t, size_t, + size_t, enum cudaMemcpyKind, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaMemcpyFromArrayAsync"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dst, src, wOffset, hOffset, count, kind, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes( + struct cudaPointerAttributes *attributes, const void *ptr) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(struct cudaPointerAttributes *, const void *); + static auto func_ptr = LoadSymbol("cudaPointerGetAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(attributes, ptr); +} + +extern __host__ cudaError_t CUDARTAPI +cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int *, int, int); + static auto func_ptr = LoadSymbol("cudaDeviceCanAccessPeer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(canAccessPeer, device, peerDevice); +} + +extern __host__ cudaError_t CUDARTAPI +cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int, unsigned int); + static auto func_ptr = LoadSymbol("cudaDeviceEnablePeerAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(peerDevice, flags); +} + +extern __host__ cudaError_t CUDARTAPI +cudaDeviceDisablePeerAccess(int peerDevice) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int); + static auto func_ptr = LoadSymbol("cudaDeviceDisablePeerAccess"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(peerDevice); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t); + static auto func_ptr = LoadSymbol("cudaGraphicsUnregisterResource"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(resource); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags( + cudaGraphicsResource_t resource, unsigned int flags) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphicsResource_t, unsigned int); + static auto func_ptr = LoadSymbol("cudaGraphicsResourceSetMapFlags"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(resource, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources( + int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaGraphicsMapResources"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(count, resources, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources( + int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(int, cudaGraphicsResource_t *, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaGraphicsUnmapResources"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(count, resources, stream); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer( + void **devPtr, size_t *size, cudaGraphicsResource_t resource) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(void **, size_t *, cudaGraphicsResource_t); + static auto func_ptr = + LoadSymbol("cudaGraphicsResourceGetMappedPointer"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(devPtr, size, resource); +} + +extern 
__host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray( + cudaArray_t *array, cudaGraphicsResource_t resource, + unsigned int arrayIndex, unsigned int mipLevel) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + cudaArray_t *, cudaGraphicsResource_t, unsigned int, unsigned int); + static auto func_ptr = + LoadSymbol("cudaGraphicsSubResourceGetMappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(array, resource, arrayIndex, mipLevel); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphicsResourceGetMappedMipmappedArray( + cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaMipmappedArray_t *, cudaGraphicsResource_t); + static auto func_ptr = + LoadSymbol("cudaGraphicsResourceGetMappedMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(mipmappedArray, resource); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture( + size_t *offset, const struct textureReference *texref, const void *devPtr, + const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + size_t *, const struct textureReference *, const void *, + const struct cudaChannelFormatDesc *, size_t); + static auto func_ptr = LoadSymbol("cudaBindTexture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(offset, texref, devPtr, desc, size); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaBindTexture2D(size_t *offset, const struct textureReference *texref, + const void *devPtr, const struct cudaChannelFormatDesc *desc, + size_t width, size_t height, size_t pitch) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + size_t *, const struct textureReference *, const void *, + const struct cudaChannelFormatDesc *, size_t, size_t, size_t); + static auto func_ptr = LoadSymbol("cudaBindTexture2D"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(offset, texref, devPtr, desc, width, height, pitch); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray( + const struct textureReference *texref, cudaArray_const_t array, + const struct cudaChannelFormatDesc *desc) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + const struct textureReference *, cudaArray_const_t, + const struct cudaChannelFormatDesc *); + static auto func_ptr = LoadSymbol("cudaBindTextureToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texref, array, desc); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaBindTextureToMipmappedArray(const struct textureReference *texref, + cudaMipmappedArray_const_t mipmappedArray, + const struct cudaChannelFormatDesc *desc) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + const struct textureReference *, cudaMipmappedArray_const_t, + const struct cudaChannelFormatDesc *); + static auto func_ptr = LoadSymbol("cudaBindTextureToMipmappedArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texref, mipmappedArray, desc); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaUnbindTexture(const struct textureReference *texref) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const struct textureReference *); + static auto func_ptr = LoadSymbol("cudaUnbindTexture"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texref); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI +cudaGetTextureAlignmentOffset(size_t *offset, + const struct 
textureReference *texref) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(size_t *, const struct textureReference *); + static auto func_ptr = LoadSymbol("cudaGetTextureAlignmentOffset"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(offset, texref); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureReference( + const struct textureReference **texref, const void *symbol) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const struct textureReference **, const void *); + static auto func_ptr = LoadSymbol("cudaGetTextureReference"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texref, symbol); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray( + const struct surfaceReference *surfref, cudaArray_const_t array, + const struct cudaChannelFormatDesc *desc) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + const struct surfaceReference *, cudaArray_const_t, + const struct cudaChannelFormatDesc *); + static auto func_ptr = LoadSymbol("cudaBindSurfaceToArray"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(surfref, array, desc); +} + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference( + const struct surfaceReference **surfref, const void *symbol) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(const struct surfaceReference **, const void *); + static auto func_ptr = LoadSymbol("cudaGetSurfaceReference"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(surfref, symbol); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc( + struct cudaChannelFormatDesc *desc, cudaArray_const_t array) { + using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaChannelFormatDesc *, + cudaArray_const_t); + static auto func_ptr = LoadSymbol("cudaGetChannelDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(desc, array); +} + +extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject( + cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, + const struct cudaTextureDesc *pTexDesc, + const struct cudaResourceViewDesc *pResViewDesc) { + using FuncPtr = cudaError_t(CUDARTAPI *)( + cudaTextureObject_t *, const struct cudaResourceDesc *, + const struct cudaTextureDesc *, const struct cudaResourceViewDesc *); + static auto func_ptr = LoadSymbol("cudaCreateTextureObject"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexObject, pResDesc, pTexDesc, pResViewDesc); +} + +extern __host__ cudaError_t CUDARTAPI +cudaDestroyTextureObject(cudaTextureObject_t texObject) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaTextureObject_t); + static auto func_ptr = LoadSymbol("cudaDestroyTextureObject"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(texObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc( + struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaTextureObject_t); + static auto func_ptr = + LoadSymbol("cudaGetTextureObjectResourceDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResDesc, texObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc( + struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(struct cudaTextureDesc *, cudaTextureObject_t); + static auto func_ptr = LoadSymbol("cudaGetTextureObjectTextureDesc"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pTexDesc, texObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc( + struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject) { + using FuncPtr = cudaError_t(CUDARTAPI *)(struct cudaResourceViewDesc *, + cudaTextureObject_t); + static auto func_ptr = + LoadSymbol("cudaGetTextureObjectResourceViewDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResViewDesc, texObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject( + cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t *, + const struct cudaResourceDesc *); + static auto func_ptr = LoadSymbol("cudaCreateSurfaceObject"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pSurfObject, pResDesc); +} + +extern __host__ cudaError_t CUDARTAPI +cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaSurfaceObject_t); + static auto func_ptr = LoadSymbol("cudaDestroySurfaceObject"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(surfObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc( + struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(struct cudaResourceDesc *, cudaSurfaceObject_t); + static auto func_ptr = + LoadSymbol("cudaGetSurfaceObjectResourceDesc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pResDesc, surfObject); +} + +extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int *); + static auto func_ptr = LoadSymbol("cudaDriverGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(driverVersion); +} + +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI +cudaRuntimeGetVersion(int *runtimeVersion) { + using FuncPtr = cudaError_t(CUDARTAPI *)(int *); + static auto func_ptr = LoadSymbol("cudaRuntimeGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(runtimeVersion); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph, + unsigned int flags) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, unsigned int); + static auto func_ptr = LoadSymbol("cudaGraphCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraph, flags); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode( + cudaGraphNode_t *pGraphNode, cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, size_t numDependencies, + const struct cudaKernelNodeParams *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, + const cudaGraphNode_t *, size_t, + const struct cudaKernelNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphAddKernelNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, + pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams( + cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaKernelNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ 
cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams( + cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, + const struct cudaKernelNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphKernelNodeCopyAttributes(cudaGraphNode_t hSrc, cudaGraphNode_t hDst) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t); + static auto func_ptr = + LoadSymbol("cudaGraphKernelNodeCopyAttributes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hSrc, hDst); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute( + cudaGraphNode_t hNode, enum cudaKernelNodeAttrID attr, + union cudaKernelNodeAttrValue *value_out) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaKernelNodeAttrID, + union cudaKernelNodeAttrValue *); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, attr, value_out); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute( + cudaGraphNode_t hNode, enum cudaKernelNodeAttrID attr, + const union cudaKernelNodeAttrValue *value) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaKernelNodeAttrID, + const union cudaKernelNodeAttrValue *); + static auto func_ptr = LoadSymbol("cudaGraphKernelNodeSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hNode, attr, value); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode( + cudaGraphNode_t *pGraphNode, cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, size_t numDependencies, + const struct cudaMemcpy3DParms *pCopyParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, + const cudaGraphNode_t *, size_t, + const struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaGraphAddMemcpyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, + pCopyParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams( + cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaGraphMemcpyNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams( + cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, + const struct cudaMemcpy3DParms *); + static auto func_ptr = LoadSymbol("cudaGraphMemcpyNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode( + cudaGraphNode_t *pGraphNode, cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, size_t numDependencies, + const struct cudaMemsetParams *pMemsetParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, + const cudaGraphNode_t *, size_t, + const struct cudaMemsetParams *); + static auto func_ptr = LoadSymbol("cudaGraphAddMemsetNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(pGraphNode, graph, pDependencies, numDependencies, + pMemsetParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams( + cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaMemsetParams *); + static auto func_ptr = LoadSymbol("cudaGraphMemsetNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams( + cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, + const struct cudaMemsetParams *); + static auto func_ptr = LoadSymbol("cudaGraphMemsetNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode( + cudaGraphNode_t *pGraphNode, cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, size_t numDependencies, + const struct cudaHostNodeParams *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, + const cudaGraphNode_t *, size_t, + const struct cudaHostNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphAddHostNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, + pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams( + cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, struct cudaHostNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphHostNodeGetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams( + cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, + const struct cudaHostNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphHostNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, + size_t numDependencies, cudaGraph_t childGraph) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, + const cudaGraphNode_t *, size_t, cudaGraph_t); + static auto func_ptr = LoadSymbol("cudaGraphAddChildGraphNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, pDependencies, numDependencies, + childGraph); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraph_t *); + static auto func_ptr = LoadSymbol("cudaGraphChildGraphNodeGetGraph"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pGraph); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode( + cudaGraphNode_t *pGraphNode, cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, size_t numDependencies) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraph_t, + const cudaGraphNode_t *, size_t); + static auto func_ptr = LoadSymbol("cudaGraphAddEmptyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphNode, graph, 
pDependencies, numDependencies); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t *, cudaGraph_t); + static auto func_ptr = LoadSymbol("cudaGraphClone"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphClone, originalGraph); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, + cudaGraph_t clonedGraph) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t *, cudaGraphNode_t, cudaGraph_t); + static auto func_ptr = LoadSymbol("cudaGraphNodeFindInClone"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pNode, originalNode, clonedGraph); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, enum cudaGraphNodeType *); + static auto func_ptr = LoadSymbol("cudaGraphNodeGetType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pType); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph, + cudaGraphNode_t *nodes, + size_t *numNodes) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphGetNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, nodes, numNodes); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes( + cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphGetRootNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, pRootNodes, pNumRootNodes); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph, + cudaGraphNode_t *from, + cudaGraphNode_t *to, + size_t *numEdges) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, cudaGraphNode_t *, + cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphGetEdges"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, from, to, numEdges); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies( + cudaGraphNode_t node, cudaGraphNode_t *pDependencies, + size_t *pNumDependencies) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphNodeGetDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pDependencies, pNumDependencies); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes( + cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, + size_t *pNumDependentNodes) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphNode_t, cudaGraphNode_t *, size_t *); + static auto func_ptr = LoadSymbol("cudaGraphNodeGetDependentNodes"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node, pDependentNodes, pNumDependentNodes); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, + const cudaGraphNode_t *to, size_t numDependencies) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *, + const cudaGraphNode_t *, size_t); + static auto func_ptr = LoadSymbol("cudaGraphAddDependencies"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, from, to, numDependencies); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, + const cudaGraphNode_t *to, size_t numDependencies) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t, const cudaGraphNode_t *, + const cudaGraphNode_t *, size_t); + static auto func_ptr = LoadSymbol("cudaGraphRemoveDependencies"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph, from, to, numDependencies); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphDestroyNode(cudaGraphNode_t node) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphNode_t); + static auto func_ptr = LoadSymbol("cudaGraphDestroyNode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(node); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate( + cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode, + char *pLogBuffer, size_t bufferSize) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t *, cudaGraph_t, + cudaGraphNode_t *, char *, size_t); + static auto func_ptr = LoadSymbol("cudaGraphInstantiate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams( + cudaGraphExec_t hGraphExec, cudaGraphNode_t node, + const struct cudaKernelNodeParams *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, + const struct cudaKernelNodeParams *); + static auto func_ptr = + LoadSymbol("cudaGraphExecKernelNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams( + cudaGraphExec_t hGraphExec, cudaGraphNode_t node, + const struct cudaMemcpy3DParms *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, + const struct cudaMemcpy3DParms *); + static auto func_ptr = + LoadSymbol("cudaGraphExecMemcpyNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams( + cudaGraphExec_t hGraphExec, cudaGraphNode_t node, + const struct cudaMemsetParams *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, + const struct cudaMemsetParams *); + static auto func_ptr = + LoadSymbol("cudaGraphExecMemsetNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, + const struct cudaHostNodeParams *pNodeParams) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraphNode_t, + const struct cudaHostNodeParams *); + static auto func_ptr = LoadSymbol("cudaGraphExecHostNodeSetParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, node, pNodeParams); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, + cudaGraphNode_t *hErrorNode_out, + enum cudaGraphExecUpdateResult *updateResult_out) { + using FuncPtr = + cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaGraph_t, cudaGraphNode_t *, + enum cudaGraphExecUpdateResult *); + static auto 
func_ptr = LoadSymbol("cudaGraphExecUpdate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(hGraphExec, hGraph, hErrorNode_out, updateResult_out); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, + cudaStream_t stream) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cudaGraphLaunch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graphExec, stream); +} + +extern __host__ cudaError_t CUDARTAPI +cudaGraphExecDestroy(cudaGraphExec_t graphExec) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraphExec_t); + static auto func_ptr = LoadSymbol("cudaGraphExecDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graphExec); +} + +extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph) { + using FuncPtr = cudaError_t(CUDARTAPI *)(cudaGraph_t); + static auto func_ptr = LoadSymbol("cudaGraphDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(graph); +} + +extern __host__ cudaError_t CUDARTAPI cudaGetExportTable( + const void **ppExportTable, const cudaUUID_t *pExportTableId) { + using FuncPtr = cudaError_t(CUDARTAPI *)(const void **, const cudaUUID_t *); + static auto func_ptr = LoadSymbol("cudaGetExportTable"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(ppExportTable, pExportTableId); +} + +} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc b/tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc new file mode 100644 index 00000000000..c4f32c84680 --- /dev/null +++ b/tensorflow/stream_executor/cuda/cusolver_dense_11_0.inc @@ -0,0 +1,4686 @@ +// Auto-generated, do not edit. + +extern "C" { + +cusolverStatus_t CUSOLVERAPI cusolverGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(libraryPropertyType, int *); + static auto func_ptr = LoadSymbol("cusolverGetProperty"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(type, value); +} + +cusolverStatus_t CUSOLVERAPI cusolverGetVersion(int *version) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(int *); + static auto func_ptr = LoadSymbol("cusolverGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(version); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCreate(cusolverDnHandle_t *handle) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t *); + static auto func_ptr = LoadSymbol("cusolverDnCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDestroy(cusolverDnHandle_t handle) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t); + static auto func_ptr = LoadSymbol("cusolverDnDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSetStream(cusolverDnHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cusolverDnSetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGetStream(cusolverDnHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cudaStream_t *); + static auto func_ptr = LoadSymbol("cusolverDnGetStream"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSParamsCreate(cusolverDnIRSParams_t *params_ptr) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params_ptr); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSParamsDestroy(cusolverDnIRSParams_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetRefinementSolver( + cusolverDnIRSParams_t params, cusolverIRSRefinement_t refinement_solver) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, + cusolverIRSRefinement_t); + static auto func_ptr = + LoadSymbol("cusolverDnIRSParamsSetRefinementSolver"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, refinement_solver); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverMainPrecision( + cusolverDnIRSParams_t params, cusolverPrecType_t solver_main_precision) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, + cusolverPrecType_t); + static auto func_ptr = + LoadSymbol("cusolverDnIRSParamsSetSolverMainPrecision"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, solver_main_precision); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverLowestPrecision( + cusolverDnIRSParams_t params, cusolverPrecType_t solver_lowest_precision) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, + cusolverPrecType_t); + static auto func_ptr = + LoadSymbol("cusolverDnIRSParamsSetSolverLowestPrecision"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, solver_lowest_precision); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverPrecisions( + cusolverDnIRSParams_t params, cusolverPrecType_t solver_main_precision, + cusolverPrecType_t solver_lowest_precision) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnIRSParams_t, cusolverPrecType_t, cusolverPrecType_t); + static auto func_ptr = + LoadSymbol("cusolverDnIRSParamsSetSolverPrecisions"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, solver_main_precision, solver_lowest_precision); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSParamsSetTol(cusolverDnIRSParams_t params, double val) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, double); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetTol"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, val); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSParamsSetTolInner(cusolverDnIRSParams_t params, double val) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, double); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetTolInner"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, val); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetMaxIters( + cusolverDnIRSParams_t params, cusolver_int_t maxiters) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsSetMaxIters"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(params, maxiters); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetMaxItersInner( + cusolverDnIRSParams_t params, cusolver_int_t maxiters_inner) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t); + static auto func_ptr = + LoadSymbol("cusolverDnIRSParamsSetMaxItersInner"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, maxiters_inner); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsGetMaxIters( + cusolverDnIRSParams_t params, cusolver_int_t *maxiters) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSParamsGetMaxIters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, maxiters); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSParamsEnableFallback(cusolverDnIRSParams_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t); + static auto func_ptr = + LoadSymbol("cusolverDnIRSParamsEnableFallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSParamsDisableFallback(cusolverDnIRSParams_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSParams_t); + static auto func_ptr = + LoadSymbol("cusolverDnIRSParamsDisableFallback"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSInfosDestroy(cusolverDnIRSInfos_t infos) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSInfosCreate(cusolverDnIRSInfos_t *infos_ptr) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos_ptr); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetNiters( + cusolverDnIRSInfos_t infos, cusolver_int_t *niters) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosGetNiters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos, niters); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetOuterNiters( + cusolverDnIRSInfos_t infos, cusolver_int_t *outer_niters) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *); + static auto func_ptr = + LoadSymbol("cusolverDnIRSInfosGetOuterNiters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos, outer_niters); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnIRSInfosRequestResidual(cusolverDnIRSInfos_t infos) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t); + static auto func_ptr = + LoadSymbol("cusolverDnIRSInfosRequestResidual"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetResidualHistory( + cusolverDnIRSInfos_t infos, void **residual_history) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t, void **); + static auto func_ptr = + LoadSymbol("cusolverDnIRSInfosGetResidualHistory"); + if (!func_ptr) return GetSymbolNotFoundError(); + 
return func_ptr(infos, residual_history); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetMaxIters( + cusolverDnIRSInfos_t infos, cusolver_int_t *maxiters) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnIRSInfos_t, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSInfosGetMaxIters"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(infos, maxiters); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZZgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZCgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZKgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZEgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, 
cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZEgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZYgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZYgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, + cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, + cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCCgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCEgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, + cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, + cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCEgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, + cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, + cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = 
LoadSymbol("cusolverDnCKgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCYgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, + cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, + cuComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCYgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDDgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDSgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDHgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDBgesv( + cusolverDnHandle_t handle, 
cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDBgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDXgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDXgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA, + cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, + cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSSgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA, + cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, + cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSHgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSBgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA, + cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + 
cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, + cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSBgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSXgesv( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA, + cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, + cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSXgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZZgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZCgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZKgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, 
nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZEgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZEgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZYgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuDoubleComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuDoubleComplex *, + cusolver_int_t, cusolver_int_t *, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZYgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, + cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, + cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCCgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, + cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, + cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCKgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCEgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + 
cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, + cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, + cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCEgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCYgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + cuComplex *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cuComplex *, + cusolver_int_t, cusolver_int_t *, cuComplex *, cusolver_int_t, + cuComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCYgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDDgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDSgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDHgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDBgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, 
cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDBgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDXgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, + double *dA, cusolver_int_t ldda, cusolver_int_t *dipiv, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, double *, + cusolver_int_t, cusolver_int_t *, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDXgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA, + cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, + cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSSgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA, + cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, + cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSHgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSBgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA, + cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, + cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSBgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t 
CUSOLVERAPI cusolverDnSXgesv_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t n, cusolver_int_t nrhs, float *dA, + cusolver_int_t ldda, cusolver_int_t *dipiv, float *dB, cusolver_int_t lddb, + float *dX, cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, float *, + cusolver_int_t, cusolver_int_t *, float *, cusolver_int_t, float *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSXgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, nrhs, dA, ldda, dipiv, dB, lddb, dX, lddx, + dWorkspace, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZZgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZZgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZCgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZCgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZKgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZKgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZEgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t 
lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZEgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZYgels(cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnZYgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCCgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCCgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCKgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCKgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCEgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, 
cusolver_int_t, + cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCEgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCYgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnCYgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDDgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDDgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDSgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDSgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDHgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDHgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, 
dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDBgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDBgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDXgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnDXgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSSgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB, + cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, + void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSSgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSHgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB, + cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, + void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSHgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSBgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB, + cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, 
cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, + void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSBgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSXgels( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB, + cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *iter, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, + void *, size_t, cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnSXgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes, iter, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZZgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZZgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZCgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZCgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZKgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZKgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); 
+ return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZEgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZEgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZYgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuDoubleComplex *dA, cusolver_int_t ldda, + cuDoubleComplex *dB, cusolver_int_t lddb, cuDoubleComplex *dX, + cusolver_int_t lddx, void *dWorkspace, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, cuDoubleComplex *, cusolver_int_t, + cuDoubleComplex *, cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnZYgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCCgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCCgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCKgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCKgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCEgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, 
cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCEgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCYgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, cuComplex *dA, cusolver_int_t ldda, cuComplex *dB, + cusolver_int_t lddb, cuComplex *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + cuComplex *, cusolver_int_t, cuComplex *, cusolver_int_t, cuComplex *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnCYgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDDgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDDgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDSgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDSgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDHgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDHgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDBgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, 
cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDBgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDXgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, double *dA, cusolver_int_t ldda, double *dB, + cusolver_int_t lddb, double *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + double *, cusolver_int_t, double *, cusolver_int_t, double *, + cusolver_int_t, void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnDXgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSSgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB, + cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, + void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSSgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSHgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB, + cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, + void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSHgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSBgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB, + cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, + void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSBgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSXgels_bufferSize( + cusolverDnHandle_t handle, cusolver_int_t m, cusolver_int_t n, + 
cusolver_int_t nrhs, float *dA, cusolver_int_t ldda, float *dB, + cusolver_int_t lddb, float *dX, cusolver_int_t lddx, void *dWorkspace, + size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolver_int_t, cusolver_int_t, cusolver_int_t, + float *, cusolver_int_t, float *, cusolver_int_t, float *, cusolver_int_t, + void *, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSXgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nrhs, dA, ldda, dB, lddb, dX, lddx, dWorkspace, + lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv( + cusolverDnHandle_t handle, cusolverDnIRSParams_t gesv_irs_params, + cusolverDnIRSInfos_t gesv_irs_infos, cusolver_int_t n, cusolver_int_t nrhs, + void *dA, cusolver_int_t ldda, void *dB, cusolver_int_t lddb, void *dX, + cusolver_int_t lddx, void *dWorkspace, size_t lwork_bytes, + cusolver_int_t *niters, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnIRSParams_t, cusolverDnIRSInfos_t, + cusolver_int_t, cusolver_int_t, void *, cusolver_int_t, void *, + cusolver_int_t, void *, cusolver_int_t, void *, size_t, cusolver_int_t *, + cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSXgesv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, gesv_irs_params, gesv_irs_infos, n, nrhs, dA, ldda, + dB, lddb, dX, lddx, dWorkspace, lwork_bytes, niters, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv_bufferSize( + cusolverDnHandle_t handle, cusolverDnIRSParams_t params, cusolver_int_t n, + cusolver_int_t nrhs, size_t *lwork_bytes) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cusolverDnIRSParams_t, + cusolver_int_t, cusolver_int_t, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSXgesv_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, n, nrhs, lwork_bytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgels( + cusolverDnHandle_t handle, cusolverDnIRSParams_t gels_irs_params, + cusolverDnIRSInfos_t gels_irs_infos, cusolver_int_t m, cusolver_int_t n, + cusolver_int_t nrhs, void *dA, cusolver_int_t ldda, void *dB, + cusolver_int_t lddb, void *dX, cusolver_int_t lddx, void *dWorkspace, + size_t lwork_bytes, cusolver_int_t *niters, cusolver_int_t *d_info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnIRSParams_t, cusolverDnIRSInfos_t, + cusolver_int_t, cusolver_int_t, cusolver_int_t, void *, cusolver_int_t, + void *, cusolver_int_t, void *, cusolver_int_t, void *, size_t, + cusolver_int_t *, cusolver_int_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSXgels"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, gels_irs_params, gels_irs_infos, m, n, nrhs, dA, ldda, + dB, lddb, dX, lddx, dWorkspace, lwork_bytes, niters, d_info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgels_bufferSize( + cusolverDnHandle_t handle, cusolverDnIRSParams_t params, cusolver_int_t m, + cusolver_int_t n, cusolver_int_t nrhs, size_t *lwork_bytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnIRSParams_t, cusolver_int_t, cusolver_int_t, + cusolver_int_t, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnIRSXgels_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, nrhs, lwork_bytes); +} + +cusolverStatus_t 
CUSOLVERAPI +cusolverDnSpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, float *A, int lda, int *Lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnDpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, double *A, int lda, int *Lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, cuComplex *A, int lda, int *Lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZpotrf_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, cuDoubleComplex *A, int lda, int *Lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + float *A, int lda, + float *Workspace, int Lwork, + int *devInfo) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + double *A, int lda, + double *Workspace, int Lwork, + int *devInfo) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuComplex *A, int lda, + cuComplex *Workspace, int Lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuDoubleComplex *A, int lda, + cuDoubleComplex *Workspace, + int Lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, 
cublasFillMode_t, int, cuDoubleComplex *, int, + cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrs(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + int nrhs, const float *A, int lda, + float *B, int ldb, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int, + float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrs(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + int nrhs, const double *A, + int lda, double *B, int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int, + double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotrs(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + int nrhs, const cuComplex *A, + int lda, cuComplex *B, int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotrs(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + int nrhs, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, + int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrfBatched(cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, float *Aarray[], + int lda, int *infoArray, + int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, float *[], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnSpotrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrfBatched(cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, double *Aarray[], + int lda, int *infoArray, + int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, double *[], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnDpotrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotrfBatched(cusolverDnHandle_t handle, + cublasFillMode_t uplo, + int n, cuComplex *Aarray[], + int lda, int *infoArray, + int batchSize) { + using FuncPtr = + 
cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + cuComplex *[], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnCpotrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotrfBatched( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, + cuDoubleComplex *Aarray[], int lda, int *infoArray, int batchSize) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + cuDoubleComplex *[], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnZpotrfBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, Aarray, lda, infoArray, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotrsBatched( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, + int nrhs, /* only support rhs = 1*/ + float *A[], int lda, float *B[], int ldb, int *d_info, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, float *[], int, float *[], + int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnSpotrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotrsBatched( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, + int nrhs, /* only support rhs = 1*/ + double *A[], int lda, double *B[], int ldb, int *d_info, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, double *[], int, + double *[], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnDpotrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, + int nrhs, /* only support rhs = 1*/ + cuComplex *A[], int lda, cuComplex *B[], int ldb, + int *d_info, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, cuComplex *[], int, + cuComplex *[], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnCpotrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZpotrsBatched(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, + int nrhs, /* only support rhs = 1*/ + cuDoubleComplex *A[], int lda, cuDoubleComplex *B[], + int ldb, int *d_info, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, cuDoubleComplex *[], int, + cuDoubleComplex *[], int, int *, int); + static auto func_ptr = LoadSymbol("cusolverDnZpotrsBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, B, ldb, d_info, batchSize); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnSpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, float *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnDpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, double *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, cuComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZpotri_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, cuDoubleComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSpotri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + float *A, int lda, float *work, + int lwork, int *devInfo) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSpotri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDpotri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + double *A, int lda, double *work, + int lwork, int *devInfo) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDpotri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCpotri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuComplex *A, int lda, + cuComplex *work, int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCpotri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZpotri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuDoubleComplex *A, int lda, + cuDoubleComplex *work, int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, + cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZpotri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnStrtri_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag, + int n, float *A, int lda, 
int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int, + int *); + static auto func_ptr = LoadSymbol("cusolverDnStrtri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag, + int n, double *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDtrtri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag, + int n, cuComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCtrtri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag, + int n, cuDoubleComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, + cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZtrtri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnStrtri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, int n, + float *A, int lda, float *work, + int lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, float *, int, + float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnStrtri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDtrtri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, + cublasDiagType_t diag, int n, + double *A, int lda, double *work, + int lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, double *, + int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDtrtri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCtrtri( + cusolverDnHandle_t handle, cublasFillMode_t uplo, cublasDiagType_t diag, + int n, cuComplex *A, int lda, cuComplex *work, int lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, cuComplex *, + int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCtrtri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZtrtri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, + 
cublasDiagType_t diag, int n, + cuDoubleComplex *A, int lda, + cuDoubleComplex *work, int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, cublasDiagType_t, int, + cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZtrtri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, diag, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnSlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, float *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSlauum_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnDlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, double *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDlauum_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnClauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, cuComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnClauum_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZlauum_bufferSize(cusolverDnHandle_t handle, cublasFillMode_t uplo, + int n, cuDoubleComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZlauum_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSlauum(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + float *A, int lda, float *work, + int lwork, int *devInfo) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSlauum"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDlauum(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + double *A, int lda, double *work, + int lwork, int *devInfo) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDlauum"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnClauum(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuComplex *A, int lda, + cuComplex *work, int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, + int, int 
*); + static auto func_ptr = LoadSymbol("cusolverDnClauum"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZlauum(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuDoubleComplex *A, int lda, + cuDoubleComplex *work, int lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, + cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZlauum"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, work, lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf_bufferSize( + cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *Lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf_bufferSize( + cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *Lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n, + cuComplex *A, int lda, int *Lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZgetrf_bufferSize(cusolverDnHandle_t handle, int m, int n, + cuDoubleComplex *A, int lda, int *Lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf(cusolverDnHandle_t handle, int m, + int n, float *A, int lda, + float *Workspace, int *devIpiv, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, float *, int, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf(cusolverDnHandle_t handle, int m, + int n, double *A, int lda, + double *Workspace, int *devIpiv, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, double *, int, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf(cusolverDnHandle_t handle, int m, + int n, cuComplex *A, int lda, + cuComplex *Workspace, + int *devIpiv, int *devInfo) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, 
int, cuComplex *, + int, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf(cusolverDnHandle_t handle, int m, + int n, cuDoubleComplex *A, + int lda, + cuDoubleComplex *Workspace, + int *devIpiv, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *, + int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, Workspace, devIpiv, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSlaswp(cusolverDnHandle_t handle, int n, + float *A, int lda, int k1, int k2, + const int *devIpiv, int incx) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, float *, int, int, int, const int *, int); + static auto func_ptr = LoadSymbol("cusolverDnSlaswp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDlaswp(cusolverDnHandle_t handle, int n, + double *A, int lda, int k1, + int k2, const int *devIpiv, + int incx) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, double *, int, int, int, const int *, int); + static auto func_ptr = LoadSymbol("cusolverDnDlaswp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnClaswp(cusolverDnHandle_t handle, int n, + cuComplex *A, int lda, int k1, + int k2, const int *devIpiv, + int incx) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, cuComplex *, int, int, int, const int *, int); + static auto func_ptr = LoadSymbol("cusolverDnClaswp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZlaswp(cusolverDnHandle_t handle, int n, + cuDoubleComplex *A, int lda, + int k1, int k2, + const int *devIpiv, int incx) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, + cuDoubleComplex *, int, int, + int, const int *, int); + static auto func_ptr = LoadSymbol("cusolverDnZlaswp"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, k1, k2, devIpiv, incx); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgetrs(cusolverDnHandle_t handle, + cublasOperation_t trans, int n, + int nrhs, const float *A, int lda, + const int *devIpiv, float *B, + int ldb, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasOperation_t, int, int, const float *, int, + const int *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgetrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgetrs(cusolverDnHandle_t handle, + cublasOperation_t trans, int n, + int nrhs, const double *A, + int lda, const int *devIpiv, + double *B, int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasOperation_t, int, int, const double *, int, + const int *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgetrs"); + if 
(!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgetrs(cusolverDnHandle_t handle, + cublasOperation_t trans, int n, + int nrhs, const cuComplex *A, + int lda, const int *devIpiv, + cuComplex *B, int ldb, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasOperation_t, int, int, const cuComplex *, int, + const int *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgetrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgetrs( + cusolverDnHandle_t handle, cublasOperation_t trans, int n, int nrhs, + const cuDoubleComplex *A, int lda, const int *devIpiv, cuDoubleComplex *B, + int ldb, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasOperation_t, int, int, const cuDoubleComplex *, + int, const int *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgetrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, trans, n, nrhs, A, lda, devIpiv, B, ldb, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf_bufferSize( + cusolverDnHandle_t handle, int m, int n, float *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf_bufferSize( + cusolverDnHandle_t handle, int m, int n, double *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n, + cuComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZgeqrf_bufferSize(cusolverDnHandle_t handle, int m, int n, + cuDoubleComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf(cusolverDnHandle_t handle, int m, + int n, float *A, int lda, + float *TAU, float *Workspace, + int Lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, float *, int, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf(cusolverDnHandle_t handle, int m, + int n, double *A, int lda, + 
double *TAU, double *Workspace, + int Lwork, int *devInfo) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, double *, + int, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf(cusolverDnHandle_t handle, int m, + int n, cuComplex *A, int lda, + cuComplex *TAU, + cuComplex *Workspace, int Lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + cuComplex *, int, cuComplex *, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf(cusolverDnHandle_t handle, int m, + int n, cuDoubleComplex *A, + int lda, cuDoubleComplex *TAU, + cuDoubleComplex *Workspace, + int Lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, cuDoubleComplex *, int, cuDoubleComplex *, + cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, TAU, Workspace, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr_bufferSize( + cusolverDnHandle_t handle, int m, int n, int k, const float *A, int lda, + const float *tau, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int, + const float *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr_bufferSize( + cusolverDnHandle_t handle, int m, int n, int k, const double *A, int lda, + const double *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + int, const double *, int, + const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungqr_bufferSize( + cusolverDnHandle_t handle, int m, int n, int k, const cuComplex *A, int lda, + const cuComplex *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, + int, const cuComplex *, int, + const cuComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungqr_bufferSize( + cusolverDnHandle_t handle, int m, int n, int k, const cuDoubleComplex *A, + int lda, const cuDoubleComplex *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr(cusolverDnHandle_t handle, int m, + int n, int k, float *A, int lda, + const float *tau, float 
*work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, int, float *, int, const float *, float *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr(cusolverDnHandle_t handle, int m, + int n, int k, double *A, int lda, + const double *tau, double *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, int, double *, int, const double *, + double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungqr(cusolverDnHandle_t handle, int m, + int n, int k, cuComplex *A, + int lda, const cuComplex *tau, + cuComplex *work, int lwork, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, int, cuComplex *, int, const cuComplex *, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungqr( + cusolverDnHandle_t handle, int m, int n, int k, cuDoubleComplex *A, int lda, + const cuDoubleComplex *tau, cuDoubleComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, int, cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSormqr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const float *A, int lda, const float *tau, + const float *C, int ldc, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, + const float *, int, const float *, const float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSormqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDormqr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const double *A, int lda, const double *tau, + const double *C, int ldc, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, + const double *, int, const double *, const double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDormqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau, + const cuComplex *C, int ldc, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, 
cublasOperation_t, int, int, int, + const cuComplex *, int, const cuComplex *, const cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCunmqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZunmqr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSormqr( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const float *A, int lda, const float *tau, float *C, + int ldc, float *work, int lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, + const float *, int, const float *, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSormqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, + lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDormqr( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const double *A, int lda, const double *tau, double *C, + int ldc, double *work, int lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, + const double *, int, const double *, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDormqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, + lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const cuComplex *A, int lda, const cuComplex *tau, + cuComplex *C, int ldc, cuComplex *work, int lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, + const cuComplex *, int, const cuComplex *, cuComplex *, int, cuComplex *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCunmqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, + lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasOperation_t trans, + int m, int n, int k, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *tau, cuDoubleComplex *C, int ldc, + cuDoubleComplex *work, int lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasOperation_t, int, int, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, cuDoubleComplex *, + int, cuDoubleComplex *, int, int *); + 
static auto func_ptr = LoadSymbol("cusolverDnZunmqr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, trans, m, n, k, A, lda, tau, C, ldc, work, + lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf_bufferSize( + cusolverDnHandle_t handle, int n, float *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, + float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf_bufferSize( + cusolverDnHandle_t handle, int n, double *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, + double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf_bufferSize( + cusolverDnHandle_t handle, int n, cuComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf_bufferSize( + cusolverDnHandle_t handle, int n, cuDoubleComplex *A, int lda, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, A, lda, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + float *A, int lda, int *ipiv, + float *work, int lwork, + int *info) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + float *, int, int *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + double *A, int lda, int *ipiv, + double *work, int lwork, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, double *, int, int *, double *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuComplex *A, int lda, int *ipiv, + cuComplex *work, int lwork, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, int *, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuDoubleComplex *A, int lda, + int *ipiv, cuDoubleComplex *work, + int lwork, int *info) { + using FuncPtr = 
cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, int *, + cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs, + const float *A, int lda, const int *ipiv, float *B, int ldb, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int, + const int *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrs_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs, + const double *A, int lda, const int *ipiv, double *B, int ldb, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int, + const int *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrs_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytrs_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs, + const cuComplex *A, int lda, const int *ipiv, cuComplex *B, int ldb, + int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int, + const int *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytrs_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs, + const cuDoubleComplex *A, int lda, const int *ipiv, cuDoubleComplex *B, + int ldb, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, + int, const int *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytrs_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrs(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + int nrhs, const float *A, int lda, + const int *ipiv, float *B, + int ldb, float *work, int lwork, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const float *, int, + const int *, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork, + info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrs(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + int nrhs, const double *A, + int lda, const int *ipiv, + double *B, int ldb, double *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const double *, int, 
+ const int *, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork, + info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCsytrs(cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, + int nrhs, const cuComplex *A, int lda, const int *ipiv, + cuComplex *B, int ldb, cuComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const cuComplex *, int, + const int *, cuComplex *, int, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork, + info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytrs( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, int nrhs, + const cuDoubleComplex *A, int lda, const int *ipiv, cuDoubleComplex *B, + int ldb, cuDoubleComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, int, const cuDoubleComplex *, + int, const int *, cuDoubleComplex *, int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, nrhs, A, lda, ipiv, B, ldb, work, lwork, + info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytri_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, float *A, int lda, + const int *ipiv, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + float *, int, const int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytri_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, + const int *ipiv, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + double *, int, const int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytri_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A, + int lda, const int *ipiv, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + cuComplex *, int, const int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytri_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A, + int lda, const int *ipiv, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, + const int *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytri_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytri(cusolverDnHandle_t handle, + 
cublasFillMode_t uplo, int n, + float *A, int lda, + const int *ipiv, float *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const int *, + float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + double *A, int lda, + const int *ipiv, double *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, double *, int, const int *, + double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCsytri(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuComplex *A, int lda, + const int *ipiv, cuComplex *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, const int *, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCsytri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZsytri( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A, + int lda, const int *ipiv, cuDoubleComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, + const int *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZsytri"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, ipiv, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd_bufferSize( + cusolverDnHandle_t handle, int m, int n, int *Lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgebrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd_bufferSize( + cusolverDnHandle_t handle, int m, int n, int *Lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgebrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd_bufferSize( + cusolverDnHandle_t handle, int m, int n, int *Lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgebrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd_bufferSize( + cusolverDnHandle_t handle, int m, int n, int *Lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgebrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, Lwork); +} + +cusolverStatus_t CUSOLVERAPI 
cusolverDnSgebrd(cusolverDnHandle_t handle, int m, + int n, float *A, int lda, + float *D, float *E, float *TAUQ, + float *TAUP, float *Work, + int Lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, float *, int, float *, float *, float *, + float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgebrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd(cusolverDnHandle_t handle, int m, + int n, double *A, int lda, + double *D, double *E, + double *TAUQ, double *TAUP, + double *Work, int Lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, double *, int, double *, double *, double *, + double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgebrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd(cusolverDnHandle_t handle, int m, + int n, cuComplex *A, int lda, + float *D, float *E, + cuComplex *TAUQ, cuComplex *TAUP, + cuComplex *Work, int Lwork, + int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, cuComplex *, int, float *, float *, + cuComplex *, cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgebrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd( + cusolverDnHandle_t handle, int m, int n, cuDoubleComplex *A, int lda, + double *D, double *E, cuDoubleComplex *TAUQ, cuDoubleComplex *TAUP, + cuDoubleComplex *Work, int Lwork, int *devInfo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, int, int, cuDoubleComplex *, int, double *, double *, + cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgebrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, D, E, TAUQ, TAUP, Work, Lwork, devInfo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k, + const float *A, int lda, const float *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, int, int, int, const float *, int, + const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgbr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k, + const double *A, int lda, const double *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, int, int, int, const double *, int, + const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgbr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungbr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k, + const cuComplex *A, int lda, const cuComplex *tau, int 
*lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, int, int, int, const cuComplex *, + int, const cuComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungbr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungbr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, int k, + const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, int, int, int, + const cuDoubleComplex *, int, const cuDoubleComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungbr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr(cusolverDnHandle_t handle, + cublasSideMode_t side, int m, + int n, int k, float *A, int lda, + const float *tau, float *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, int, int, int, float *, int, + const float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgbr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr(cusolverDnHandle_t handle, + cublasSideMode_t side, int m, + int n, int k, double *A, int lda, + const double *tau, double *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, int, int, int, double *, int, + const double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgbr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungbr(cusolverDnHandle_t handle, + cublasSideMode_t side, int m, + int n, int k, cuComplex *A, + int lda, const cuComplex *tau, + cuComplex *work, int lwork, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuComplex *, int, + const cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungbr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZungbr(cusolverDnHandle_t handle, cublasSideMode_t side, int m, int n, + int k, cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, + cuDoubleComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, int, int, int, cuDoubleComplex *, + int, const cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungbr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, m, n, k, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A, + int lda, const float *d, const float *e, const float *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, const float *, int, + const float *, const float *, const 
float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A, + int lda, const double *d, const double *e, const double *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, const double *, int, + const double *, const double *, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChetrd_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A, + int lda, const float *d, const float *e, const cuComplex *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int, + const float *, const float *, const cuComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnChetrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *A, int lda, const double *d, const double *e, + const cuDoubleComplex *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, + const double *, const double *, const cuDoubleComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhetrd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + float *A, int lda, float *d, + float *e, float *tau, float *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, float *, int, float *, float *, + float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsytrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, double *A, int lda, + double *d, double *e, double *tau, double *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, double *, int, double *, + double *, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsytrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChetrd(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuComplex *A, int lda, float *d, + float *e, cuComplex *tau, + cuComplex *work, int lwork, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, float *, + float *, cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnChetrd"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuDoubleComplex *A, + int lda, double *d, double *e, cuDoubleComplex *tau, cuDoubleComplex *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, + double *, double *, cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhetrd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, d, e, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const float *A, + int lda, const float *tau, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, cublasFillMode_t, int, + const float *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const double *A, + int lda, const double *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, const double *, int, + const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungtr_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex *A, + int lda, const cuComplex *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, const cuComplex *, int, + const cuComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungtr_bufferSize( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, + const cuDoubleComplex *A, int lda, const cuDoubleComplex *tau, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + float *A, int lda, + const float *tau, float *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, float *, int, const float *, + float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSorgtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + double *A, int lda, + const double *tau, double *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, 
cublasFillMode_t, int, double *, int, const double *, + double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDorgtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCungtr( + cusolverDnHandle_t handle, cublasFillMode_t uplo, int n, cuComplex *A, + int lda, const cuComplex *tau, cuComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuComplex *, int, + const cuComplex *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCungtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZungtr(cusolverDnHandle_t handle, + cublasFillMode_t uplo, int n, + cuDoubleComplex *A, int lda, + const cuDoubleComplex *tau, + cuDoubleComplex *work, int lwork, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasFillMode_t, int, cuDoubleComplex *, int, + const cuDoubleComplex *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZungtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, uplo, n, A, lda, tau, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSormtr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, int m, int n, const float *A, int lda, + const float *tau, const float *C, int ldc, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + int, int, const float *, int, const float *, const float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSormtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDormtr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, int m, int n, const double *A, int lda, + const double *tau, const double *C, int ldc, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + int, int, const double *, int, const double *, const double *, int, + int *); + static auto func_ptr = LoadSymbol("cusolverDnDormtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, int m, int n, const cuComplex *A, int lda, + const cuComplex *tau, const cuComplex *C, int ldc, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + int, int, const cuComplex *, int, const cuComplex *, const cuComplex *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCunmtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr_bufferSize( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t 
trans, int m, int n, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *tau, const cuDoubleComplex *C, int ldc, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + int, int, const cuDoubleComplex *, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZunmtr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSormtr( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, int m, int n, float *A, int lda, float *tau, + float *C, int ldc, float *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + int, int, float *, int, float *, float *, int, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSormtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work, + lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDormtr( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, int m, int n, double *A, int lda, double *tau, + double *C, int ldc, double *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + int, int, double *, int, double *, double *, int, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDormtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work, + lwork, info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCunmtr(cusolverDnHandle_t handle, cublasSideMode_t side, + cublasFillMode_t uplo, cublasOperation_t trans, int m, int n, + cuComplex *A, int lda, cuComplex *tau, cuComplex *C, int ldc, + cuComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + int, int, cuComplex *, int, cuComplex *, cuComplex *, int, cuComplex *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCunmtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work, + lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr( + cusolverDnHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, + cublasOperation_t trans, int m, int n, cuDoubleComplex *A, int lda, + cuDoubleComplex *tau, cuDoubleComplex *C, int ldc, cuDoubleComplex *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cublasSideMode_t, cublasFillMode_t, cublasOperation_t, + int, int, cuDoubleComplex *, int, cuDoubleComplex *, cuDoubleComplex *, + int, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZunmtr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, side, uplo, trans, m, n, A, lda, tau, C, ldc, work, + lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd_bufferSize( + cusolverDnHandle_t handle, int m, int n, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto 
func_ptr = LoadSymbol("cusolverDnSgesvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd_bufferSize( + cusolverDnHandle_t handle, int m, int n, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgesvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd_bufferSize( + cusolverDnHandle_t handle, int m, int n, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCgesvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd_bufferSize( + cusolverDnHandle_t handle, int m, int n, int *lwork) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, int, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgesvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd( + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, + int n, float *A, int lda, float *S, float *U, int ldu, float *VT, int ldvt, + float *work, int lwork, float *rwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, signed char, signed char, int, int, float *, int, + float *, float *, int, float *, int, float *, int, float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSgesvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, + lwork, rwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd( + cusolverDnHandle_t handle, signed char jobu, signed char jobvt, int m, + int n, double *A, int lda, double *S, double *U, int ldu, double *VT, + int ldvt, double *work, int lwork, double *rwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, signed char, signed char, int, int, double *, int, + double *, double *, int, double *, int, double *, int, double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDgesvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, + lwork, rwork, info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt, + int m, int n, cuComplex *A, int lda, float *S, cuComplex *U, + int ldu, cuComplex *VT, int ldvt, cuComplex *work, int lwork, + float *rwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, signed char, signed char, int, int, cuComplex *, int, + float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, float *, + int *); + static auto func_ptr = LoadSymbol("cusolverDnCgesvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, + lwork, rwork, info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZgesvd(cusolverDnHandle_t handle, signed char jobu, signed char jobvt, + int m, int n, cuDoubleComplex *A, int lda, double *S, + cuDoubleComplex *U, int ldu, cuDoubleComplex *VT, int ldvt, + cuDoubleComplex *work, int lwork, double 
*rwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, signed char, signed char, int, int, cuDoubleComplex *, + int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int, + cuDoubleComplex *, int, double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZgesvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobu, jobvt, m, n, A, lda, S, U, ldu, VT, ldvt, work, + lwork, rwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const float *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsyevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const double *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsyevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevd_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const cuComplex *A, int lda, const float *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const cuComplex *, int, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCheevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevd_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const cuDoubleComplex *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZheevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, float *A, int lda, float *W, float *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *, + int, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsyevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, double *A, int lda, double *W, double *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *, + int, 
double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsyevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevd(cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, + cuComplex *A, int lda, float *W, + cuComplex *work, int lwork, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *, + int, float *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCheevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevd(cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, + cuDoubleComplex *A, int lda, + double *W, cuDoubleComplex *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZheevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const float *A, int lda, float vl, float vu, + int il, int iu, int *meig, const float *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, + cublasFillMode_t, int, const float *, int, float, float, int, int, int *, + const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsyevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, + lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const double *A, int lda, double vl, + double vu, int il, int iu, int *meig, const double *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, + cublasFillMode_t, int, const double *, int, double, double, int, int, + int *, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsyevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, + lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevdx_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const cuComplex *A, int lda, float vl, + float vu, int il, int iu, int *meig, const float *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, + cublasFillMode_t, int, const cuComplex *, int, float, float, int, int, + int *, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnCheevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, + lwork); +} + +cusolverStatus_t CUSOLVERAPI 
cusolverDnZheevdx_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda, double vl, + double vu, int il, int iu, int *meig, const double *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, + cublasFillMode_t, int, const cuDoubleComplex *, int, double, double, int, + int, int *, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZheevdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, + lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, float *A, int lda, float vl, float vu, int il, + int iu, int *meig, float *W, float *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, + cublasFillMode_t, int, float *, int, float, float, int, int, int *, + float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsyevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, + work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, double *A, int lda, double vl, double vu, + int il, int iu, int *meig, double *W, double *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, + cublasFillMode_t, int, double *, int, double, double, int, int, int *, + double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsyevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, + work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCheevdx(cusolverDnHandle_t handle, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, + cuComplex *A, int lda, float vl, float vu, int il, int iu, + int *meig, float *W, cuComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, + cublasFillMode_t, int, cuComplex *, int, float, float, int, int, int *, + float *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnCheevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, il, iu, meig, W, + work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cusolverEigRange_t range, + cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda, double vl, + double vu, int il, int iu, int *meig, double *W, cuDoubleComplex *work, + int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cusolverEigRange_t, + cublasFillMode_t, int, cuDoubleComplex *, int, double, double, int, int, + int *, double *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZheevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, range, uplo, n, A, lda, vl, vu, 
il, iu, meig, W, + work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, const float *A, + int lda, const float *B, int ldb, float vl, float vu, int il, int iu, + int *meig, const float *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int, const float *, int, + const float *, int, float, float, int, int, int *, const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsygvdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, + il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, const double *A, + int lda, const double *B, int ldb, double vl, double vu, int il, int iu, + int *meig, const double *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int, const double *, int, + const double *, int, double, double, int, int, int *, const double *, + int *); + static auto func_ptr = LoadSymbol("cusolverDnDsygvdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, + il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, const cuComplex *A, + int lda, const cuComplex *B, int ldb, float vl, float vu, int il, int iu, + int *meig, const float *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int, const cuComplex *, int, + const cuComplex *, int, float, float, int, int, int *, const float *, + int *); + static auto func_ptr = LoadSymbol("cusolverDnChegvdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, + il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, + const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, + double vl, double vu, int il, int iu, int *meig, const double *W, + int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, double, double, int, int, int *, + const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhegvdx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, + il, iu, meig, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, float *A, 
int lda, + float *B, int ldb, float vl, float vu, int il, int iu, int *meig, float *W, + float *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int, float *, int, float *, int, + float, float, int, int, int *, float *, float *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnSsygvdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, + il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, double *A, int lda, + double *B, int ldb, double vl, double vu, int il, int iu, int *meig, + double *W, double *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int, double *, int, double *, int, + double, double, int, int, int *, double *, double *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsygvdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, + il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, cuComplex *A, + int lda, cuComplex *B, int ldb, float vl, float vu, int il, int iu, + int *meig, float *W, cuComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int, cuComplex *, int, cuComplex *, + int, float, float, int, int, int *, float *, cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnChegvdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, + il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cusolverEigRange_t range, cublasFillMode_t uplo, int n, cuDoubleComplex *A, + int lda, cuDoubleComplex *B, int ldb, double vl, double vu, int il, int iu, + int *meig, double *W, cuDoubleComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int, cuDoubleComplex *, int, + cuDoubleComplex *, int, double, double, int, int, int *, double *, + cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhegvdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, range, uplo, n, A, lda, B, ldb, vl, vu, + il, iu, meig, W, work, lwork, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, const float *A, int lda, const float *B, + int ldb, const float *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, const float *, int, const float *, int, + const float *, 
int *); + static auto func_ptr = LoadSymbol("cusolverDnSsygvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, const double *A, int lda, const double *B, + int ldb, const double *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, const double *, int, const double *, int, + const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsygvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvd_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, const cuComplex *A, int lda, + const cuComplex *B, int ldb, const float *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int, + const float *, int *); + static auto func_ptr = LoadSymbol("cusolverDnChegvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, const double *W, int *lwork) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const double *, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhegvd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb, + float *W, float *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, float *, int, float *, int, float *, float *, int, + int *); + static auto func_ptr = LoadSymbol("cusolverDnSsygvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, + info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb, + double *W, double *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, double *, int, double *, int, double *, double *, + int, int *); + static auto func_ptr = LoadSymbol("cusolverDnDsygvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, + info); +} + +cusolverStatus_t CUSOLVERAPI 
cusolverDnChegvd( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb, + float *W, cuComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *, + cuComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnChegvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, + info); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnZhegvd(cusolverDnHandle_t handle, cusolverEigType_t itype, + cusolverEigMode_t jobz, cublasFillMode_t uplo, int n, + cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, + double *W, cuDoubleComplex *work, int lwork, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, + double *, cuDoubleComplex *, int, int *); + static auto func_ptr = LoadSymbol("cusolverDnZhegvd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, + info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCreateSyevjInfo(syevjInfo_t *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t *); + static auto func_ptr = LoadSymbol("cusolverDnCreateSyevjInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDestroySyevjInfo(syevjInfo_t info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDestroySyevjInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetTolerance(syevjInfo_t info, + double tolerance) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, double); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjSetTolerance"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, tolerance); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetMaxSweeps(syevjInfo_t info, + int max_sweeps) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjSetMaxSweeps"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, max_sweeps); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjSetSortEig(syevjInfo_t info, + int sort_eig) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjSetSortEig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, sort_eig); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetResidual( + cusolverDnHandle_t handle, syevjInfo_t info, double *residual) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, + syevjInfo_t, double *); + static auto func_ptr = LoadSymbol("cusolverDnXsyevjGetResidual"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, residual); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetSweeps( + cusolverDnHandle_t handle, syevjInfo_t info, int *executed_sweeps) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, syevjInfo_t, int *); + static auto func_ptr = 
LoadSymbol("cusolverDnXsyevjGetSweeps"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, executed_sweeps); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork, + syevjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const float *, int, const float *, int *, syevjInfo_t, int); + static auto func_ptr = + LoadSymbol("cusolverDnSsyevjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork, + syevjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const double *, int, const double *, int *, syevjInfo_t, int); + static auto func_ptr = + LoadSymbol("cusolverDnDsyevjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const cuComplex *A, int lda, const float *W, int *lwork, + syevjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const cuComplex *, int, const float *, int *, syevjInfo_t, int); + static auto func_ptr = + LoadSymbol("cusolverDnCheevjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork, + syevjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const cuDoubleComplex *, int, const double *, int *, syevjInfo_t, int); + static auto func_ptr = + LoadSymbol("cusolverDnZheevjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, float *A, int lda, float *W, float *work, int lwork, int *info, + syevjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *, + int, float *, float *, int, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnSsyevjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, + batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, double *A, int lda, double *W, double *work, int lwork, int *info, + 
syevjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *, + int, double *, double *, int, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnDsyevjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, + batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, cuComplex *A, int lda, float *W, cuComplex *work, int lwork, + int *info, syevjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *, + int, float *, cuComplex *, int, int *, syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnCheevjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, + batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work, + int lwork, int *info, syevjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *, + syevjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnZheevjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, + batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const float *A, int lda, const float *W, int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const float *, int, const float *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSsyevj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const double *A, int lda, const double *W, int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const double *, int, const double *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDsyevj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevj_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const cuComplex *A, int lda, const float *W, int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const cuComplex *, int, const float *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnCheevj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params); +} + +cusolverStatus_t 
CUSOLVERAPI cusolverDnZheevj_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, const cuDoubleComplex *A, int lda, const double *W, int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + const cuDoubleComplex *, int, const double *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZheevj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, lwork, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj(cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, + float *A, int lda, float *W, + float *work, int lwork, int *info, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, float *, + int, float *, float *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSsyevj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj(cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, + double *A, int lda, double *W, + double *work, int lwork, + int *info, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, double *, + int, double *, double *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDsyevj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCheevj(cusolverDnHandle_t handle, + cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, + cuComplex *A, int lda, float *W, + cuComplex *work, int lwork, + int *info, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, cuComplex *, + int, float *, cuComplex *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnCheevj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZheevj( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, + int n, cuDoubleComplex *A, int lda, double *W, cuDoubleComplex *work, + int lwork, int *info, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, cublasFillMode_t, int, + cuDoubleComplex *, int, double *, cuDoubleComplex *, int, int *, + syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZheevj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, uplo, n, A, lda, W, work, lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, const float *A, int lda, const float *B, + int ldb, const float *W, int *lwork, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, const float *, int, const float *, int, + const float *, int *, syevjInfo_t); + static auto func_ptr = 
LoadSymbol("cusolverDnSsygvj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork, + params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, const double *A, int lda, const double *B, + int ldb, const double *W, int *lwork, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, const double *, int, const double *, int, + const double *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDsygvj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork, + params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvj_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, const cuComplex *A, int lda, + const cuComplex *B, int ldb, const float *W, int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, const cuComplex *, int, const cuComplex *, int, + const float *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnChegvj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork, + params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj_bufferSize( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, const double *W, int *lwork, + syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, const double *, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZhegvj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, lwork, + params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, float *A, int lda, float *B, int ldb, + float *W, float *work, int lwork, int *info, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, float *, int, float *, int, float *, float *, int, + int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSsygvj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, + info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, double *A, int lda, double *B, int ldb, + double *W, double *work, int lwork, int *info, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, double *, int, double *, int, double *, double *, + int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDsygvj"); 
+ if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, + info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnChegvj( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, cuComplex *A, int lda, cuComplex *B, int ldb, + float *W, cuComplex *work, int lwork, int *info, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, cuComplex *, int, cuComplex *, int, float *, + cuComplex *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnChegvj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, + info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj( + cusolverDnHandle_t handle, cusolverEigType_t itype, cusolverEigMode_t jobz, + cublasFillMode_t uplo, int n, cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb, double *W, cuDoubleComplex *work, int lwork, + int *info, syevjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigType_t, cusolverEigMode_t, + cublasFillMode_t, int, cuDoubleComplex *, int, cuDoubleComplex *, int, + double *, cuDoubleComplex *, int, int *, syevjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZhegvj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, itype, jobz, uplo, n, A, lda, B, ldb, W, work, lwork, + info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCreateGesvdjInfo(gesvdjInfo_t *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t *); + static auto func_ptr = LoadSymbol("cusolverDnCreateGesvdjInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDestroyGesvdjInfo(gesvdjInfo_t info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDestroyGesvdjInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetTolerance(gesvdjInfo_t info, + double tolerance) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, double); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjSetTolerance"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, tolerance); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetMaxSweeps(gesvdjInfo_t info, + int max_sweeps) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjSetMaxSweeps"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, max_sweeps); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjSetSortEig(gesvdjInfo_t info, + int sort_svd) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjSetSortEig"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, sort_svd); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetResidual( + cusolverDnHandle_t handle, gesvdjInfo_t info, double *residual) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, + gesvdjInfo_t, double *); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjGetResidual"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, 
info, residual); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetSweeps( + cusolverDnHandle_t handle, gesvdjInfo_t info, int *executed_sweeps) { + using FuncPtr = + cusolverStatus_t(CUSOLVERAPI *)(cusolverDnHandle_t, gesvdjInfo_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnXgesvdjGetSweeps"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, executed_sweeps); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, + const float *A, int lda, const float *S, const float *U, int ldu, + const float *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, const float *, int, + const float *, const float *, int, const float *, int, int *, + gesvdjInfo_t, int); + static auto func_ptr = + LoadSymbol("cusolverDnSgesvdjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params, + batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, + const double *A, int lda, const double *S, const double *U, int ldu, + const double *V, int ldv, int *lwork, gesvdjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, const double *, int, + const double *, const double *, int, const double *, int, int *, + gesvdjInfo_t, int); + static auto func_ptr = + LoadSymbol("cusolverDnDgesvdjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params, + batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, + const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu, + const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuComplex *, int, + const float *, const cuComplex *, int, const cuComplex *, int, int *, + gesvdjInfo_t, int); + static auto func_ptr = + LoadSymbol("cusolverDnCgesvdjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params, + batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, + const cuDoubleComplex *A, int lda, const double *S, + const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv, + int *lwork, gesvdjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, const cuDoubleComplex *, + int, const double *, const cuDoubleComplex *, int, + const cuDoubleComplex *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = + LoadSymbol("cusolverDnZgesvdjBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, lwork, params, + batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, float *A, + int lda, float *S, float *U, int ldu, float *V, 
int ldv, float *work, + int lwork, int *info, gesvdjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, float *, int, float *, + float *, int, float *, int, float *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, + info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, double *A, + int lda, double *S, double *U, int ldu, double *V, int ldv, double *work, + int lwork, int *info, gesvdjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, double *, int, double *, + double *, int, double *, int, double *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, + info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, + cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V, + int ldv, cuComplex *work, int lwork, int *info, gesvdjInfo_t params, + int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, cuComplex *, int, + float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *, + gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, + info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int m, int n, + cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu, + cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info, + gesvdjInfo_t params, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, cuDoubleComplex *, int, + double *, cuDoubleComplex *, int, cuDoubleComplex *, int, + cuDoubleComplex *, int, int *, gesvdjInfo_t, int); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdjBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, m, n, A, lda, S, U, ldu, V, ldv, work, lwork, + info, params, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const float *A, int lda, const float *S, const float *U, int ldu, + const float *V, int ldv, int *lwork, gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int, + const float *, const float *, int, const float *, int, int *, + gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, + params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const double *A, int 
lda, const double *S, const double *U, int ldu, + const double *V, int ldv, int *lwork, gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int, + const double *, const double *, int, const double *, int, int *, + gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, + params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const cuComplex *A, int lda, const float *S, const cuComplex *U, int ldu, + const cuComplex *V, int ldv, int *lwork, gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *, + int, const float *, const cuComplex *, int, const cuComplex *, int, int *, + gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, + params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + const cuDoubleComplex *A, int lda, const double *S, + const cuDoubleComplex *U, int ldu, const cuDoubleComplex *V, int ldv, + int *lwork, gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, + const cuDoubleComplex *, int, const double *, const cuDoubleComplex *, + int, const cuDoubleComplex *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdj_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, lwork, + params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + float *A, int lda, float *S, float *U, int ldu, float *V, int ldv, + float *work, int lwork, int *info, gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, float *, int, + float *, float *, int, float *, int, float *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, + lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + double *A, int lda, double *S, double *U, int ldu, double *V, int ldv, + double *work, int lwork, int *info, gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, double *, int, + double *, double *, int, double *, int, double *, int, int *, + gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, + lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + cuComplex *A, int lda, float *S, cuComplex *U, int ldu, cuComplex *V, + int ldv, 
cuComplex *work, int lwork, int *info, gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuComplex *, int, + float *, cuComplex *, int, cuComplex *, int, cuComplex *, int, int *, + gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, + lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int econ, int m, int n, + cuDoubleComplex *A, int lda, double *S, cuDoubleComplex *U, int ldu, + cuDoubleComplex *V, int ldv, cuDoubleComplex *work, int lwork, int *info, + gesvdjInfo_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, cuDoubleComplex *, + int, double *, cuDoubleComplex *, int, cuDoubleComplex *, int, + cuDoubleComplex *, int, int *, gesvdjInfo_t); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdj"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, econ, m, n, A, lda, S, U, ldu, V, ldv, work, + lwork, info, params); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, + const float *d_A, int lda, long long int strideA, const float *d_S, + long long int strideS, const float *d_U, int ldu, long long int strideU, + const float *d_V, int ldv, long long int strideV, int *lwork, + int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int, + long long, const float *, long long, const float *, int, long long, + const float *, int, long long, int *, int); + static auto func_ptr = + LoadSymbol("cusolverDnSgesvdaStridedBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, + d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, + const double *d_A, int lda, long long int strideA, const double *d_S, + long long int strideS, const double *d_U, int ldu, long long int strideU, + const double *d_V, int ldv, long long int strideV, int *lwork, + int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int, + long long, const double *, long long, const double *, int, long long, + const double *, int, long long, int *, int); + static auto func_ptr = + LoadSymbol("cusolverDnDgesvdaStridedBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, + d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, + const cuComplex *d_A, int lda, long long int strideA, const float *d_S, + long long int strideS, const cuComplex *d_U, int ldu, long long int strideU, + const cuComplex *d_V, int ldv, long long int strideV, int *lwork, + int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const 
cuComplex *, + int, long long, const float *, long long, const cuComplex *, int, + long long, const cuComplex *, int, long long, int *, int); + static auto func_ptr = + LoadSymbol("cusolverDnCgesvdaStridedBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, + d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched_bufferSize( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, + const cuDoubleComplex *d_A, int lda, long long int strideA, + const double *d_S, long long int strideS, const cuDoubleComplex *d_U, + int ldu, long long int strideU, const cuDoubleComplex *d_V, int ldv, + long long int strideV, int *lwork, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, + const cuDoubleComplex *, int, long long, const double *, long long, + const cuDoubleComplex *, int, long long, const cuDoubleComplex *, int, + long long, int *, int); + static auto func_ptr = + LoadSymbol("cusolverDnZgesvdaStridedBatched_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, + d_U, ldu, strideU, d_V, ldv, strideV, lwork, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, + const float *d_A, int lda, long long int strideA, float *d_S, + long long int strideS, float *d_U, int ldu, long long int strideU, + float *d_V, int ldv, long long int strideV, float *d_work, int lwork, + int *d_info, double *h_R_nrmF, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const float *, int, + long long, float *, long long, float *, int, long long, float *, int, + long long, float *, int, int *, double *, int); + static auto func_ptr = LoadSymbol("cusolverDnSgesvdaStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, + d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info, + h_R_nrmF, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, + const double *d_A, int lda, long long int strideA, double *d_S, + long long int strideS, double *d_U, int ldu, long long int strideU, + double *d_V, int ldv, long long int strideV, double *d_work, int lwork, + int *d_info, double *h_R_nrmF, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const double *, int, + long long, double *, long long, double *, int, long long, double *, int, + long long, double *, int, int *, double *, int); + static auto func_ptr = LoadSymbol("cusolverDnDgesvdaStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, + d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info, + h_R_nrmF, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, + const cuComplex *d_A, int lda, long long int strideA, float *d_S, + long long int strideS, cuComplex *d_U, int ldu, long long int strideU, + cuComplex 
*d_V, int ldv, long long int strideV, cuComplex *d_work, + int lwork, int *d_info, double *h_R_nrmF, int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, const cuComplex *, + int, long long, float *, long long, cuComplex *, int, long long, + cuComplex *, int, long long, cuComplex *, int, int *, double *, int); + static auto func_ptr = LoadSymbol("cusolverDnCgesvdaStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, + d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info, + h_R_nrmF, batchSize); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched( + cusolverDnHandle_t handle, cusolverEigMode_t jobz, int rank, int m, int n, + const cuDoubleComplex *d_A, int lda, long long int strideA, double *d_S, + long long int strideS, cuDoubleComplex *d_U, int ldu, long long int strideU, + cuDoubleComplex *d_V, int ldv, long long int strideV, + cuDoubleComplex *d_work, int lwork, int *d_info, double *h_R_nrmF, + int batchSize) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverEigMode_t, int, int, int, + const cuDoubleComplex *, int, long long, double *, long long, + cuDoubleComplex *, int, long long, cuDoubleComplex *, int, long long, + cuDoubleComplex *, int, int *, double *, int); + static auto func_ptr = LoadSymbol("cusolverDnZgesvdaStridedBatched"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, jobz, rank, m, n, d_A, lda, strideA, d_S, strideS, + d_U, ldu, strideU, d_V, ldv, strideV, d_work, lwork, d_info, + h_R_nrmF, batchSize); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnCreateParams(cusolverDnParams_t *params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnParams_t *); + static auto func_ptr = LoadSymbol("cusolverDnCreateParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnDestroyParams(cusolverDnParams_t params) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)(cusolverDnParams_t); + static auto func_ptr = LoadSymbol("cusolverDnDestroyParams"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnSetAdvOptions(cusolverDnParams_t params, + cusolverDnFunction_t function, cusolverAlgMode_t algo) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnParams_t, cusolverDnFunction_t, cusolverAlgMode_t); + static auto func_ptr = LoadSymbol("cusolverDnSetAdvOptions"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(params, function, algo); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnPotrf_bufferSize( + cusolverDnHandle_t handle, cusolverDnParams_t params, cublasFillMode_t uplo, + int64_t n, cudaDataType dataTypeA, const void *A, int64_t lda, + cudaDataType computeType, size_t *workspaceInBytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t, + cudaDataType, const void *, int64_t, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnPotrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, uplo, n, dataTypeA, A, lda, computeType, + workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnPotrf(cusolverDnHandle_t handle, cusolverDnParams_t params, + cublasFillMode_t uplo, int64_t n, cudaDataType dataTypeA, + void *A, 
int64_t lda, cudaDataType computeType, void *pBuffer, + size_t workspaceInBytes, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t, + cudaDataType, void *, int64_t, cudaDataType, void *, size_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnPotrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, uplo, n, dataTypeA, A, lda, computeType, + pBuffer, workspaceInBytes, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnPotrs( + cusolverDnHandle_t handle, cusolverDnParams_t params, cublasFillMode_t uplo, + int64_t n, int64_t nrhs, cudaDataType dataTypeA, const void *A, int64_t lda, + cudaDataType dataTypeB, void *B, int64_t ldb, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, cublasFillMode_t, int64_t, + int64_t, cudaDataType, const void *, int64_t, cudaDataType, void *, + int64_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnPotrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, uplo, n, nrhs, dataTypeA, A, lda, dataTypeB, + B, ldb, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGeqrf_bufferSize( + cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, int64_t n, + cudaDataType dataTypeA, const void *A, int64_t lda, + cudaDataType dataTypeTau, const void *tau, cudaDataType computeType, + size_t *workspaceInBytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType, + const void *, int64_t, cudaDataType, const void *, cudaDataType, + size_t *); + static auto func_ptr = LoadSymbol("cusolverDnGeqrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, dataTypeA, A, lda, dataTypeTau, tau, + computeType, workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnGeqrf(cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, + int64_t n, cudaDataType dataTypeA, void *A, int64_t lda, + cudaDataType dataTypeTau, void *tau, cudaDataType computeType, + void *pBuffer, size_t workspaceInBytes, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType, + void *, int64_t, cudaDataType, void *, cudaDataType, void *, size_t, + int *); + static auto func_ptr = LoadSymbol("cusolverDnGeqrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, dataTypeA, A, lda, dataTypeTau, tau, + computeType, pBuffer, workspaceInBytes, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGetrf_bufferSize( + cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, int64_t n, + cudaDataType dataTypeA, const void *A, int64_t lda, + cudaDataType computeType, size_t *workspaceInBytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType, + const void *, int64_t, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnGetrf_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, dataTypeA, A, lda, computeType, + workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnGetrf(cusolverDnHandle_t handle, cusolverDnParams_t params, int64_t m, + int64_t n, cudaDataType dataTypeA, void *A, int64_t lda, + int64_t *ipiv, cudaDataType computeType, void *pBuffer, + size_t workspaceInBytes, int *info) 
{ + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, int64_t, int64_t, cudaDataType, + void *, int64_t, int64_t *, cudaDataType, void *, size_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnGetrf"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, m, n, dataTypeA, A, lda, ipiv, computeType, + pBuffer, workspaceInBytes, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnGetrs( + cusolverDnHandle_t handle, cusolverDnParams_t params, + cublasOperation_t trans, int64_t n, int64_t nrhs, cudaDataType dataTypeA, + const void *A, int64_t lda, const int64_t *ipiv, cudaDataType dataTypeB, + void *B, int64_t ldb, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, cublasOperation_t, int64_t, + int64_t, cudaDataType, const void *, int64_t, const int64_t *, + cudaDataType, void *, int64_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnGetrs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, trans, n, nrhs, dataTypeA, A, lda, ipiv, + dataTypeB, B, ldb, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSyevd_bufferSize( + cusolverDnHandle_t handle, cusolverDnParams_t params, + cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, + cudaDataType dataTypeA, const void *A, int64_t lda, cudaDataType dataTypeW, + const void *W, cudaDataType computeType, size_t *workspaceInBytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, + cublasFillMode_t, int64_t, cudaDataType, const void *, int64_t, + cudaDataType, const void *, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSyevd_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, + W, computeType, workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI +cusolverDnSyevd(cusolverDnHandle_t handle, cusolverDnParams_t params, + cusolverEigMode_t jobz, cublasFillMode_t uplo, int64_t n, + cudaDataType dataTypeA, void *A, int64_t lda, + cudaDataType dataTypeW, void *W, cudaDataType computeType, + void *pBuffer, size_t workspaceInBytes, int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, + cublasFillMode_t, int64_t, cudaDataType, void *, int64_t, cudaDataType, + void *, cudaDataType, void *, size_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnSyevd"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, jobz, uplo, n, dataTypeA, A, lda, dataTypeW, + W, computeType, pBuffer, workspaceInBytes, info); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSyevdx_bufferSize( + cusolverDnHandle_t handle, cusolverDnParams_t params, + cusolverEigMode_t jobz, cusolverEigRange_t range, cublasFillMode_t uplo, + int64_t n, cudaDataType dataTypeA, const void *A, int64_t lda, void *vl, + void *vu, int64_t il, int64_t iu, int64_t *h_meig, cudaDataType dataTypeW, + const void *W, cudaDataType computeType, size_t *workspaceInBytes) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int64_t, cudaDataType, const void *, + int64_t, void *, void *, int64_t, int64_t, int64_t *, cudaDataType, + const void *, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusolverDnSyevdx_bufferSize"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, params, jobz, range, uplo, n, dataTypeA, A, lda, vl, + vu, il, iu, h_meig, dataTypeW, W, computeType, + workspaceInBytes); +} + +cusolverStatus_t CUSOLVERAPI cusolverDnSyevdx( + cusolverDnHandle_t handle, cusolverDnParams_t params, + cusolverEigMode_t jobz, cusolverEigRange_t range, cublasFillMode_t uplo, + int64_t n, cudaDataType dataTypeA, void *A, int64_t lda, void *vl, void *vu, + int64_t il, int64_t iu, int64_t *meig64, cudaDataType dataTypeW, void *W, + cudaDataType computeType, void *pBuffer, size_t workspaceInBytes, + int *info) { + using FuncPtr = cusolverStatus_t(CUSOLVERAPI *)( + cusolverDnHandle_t, cusolverDnParams_t, cusolverEigMode_t, + cusolverEigRange_t, cublasFillMode_t, int64_t, cudaDataType, void *, + int64_t, void *, void *, int64_t, int64_t, int64_t *, cudaDataType, + void *, cudaDataType, void *, size_t, int *); + static auto func_ptr = LoadSymbol("cusolverDnSyevdx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, params, jobz, range, uplo, n, dataTypeA, A, lda, vl, + vu, il, iu, meig64, dataTypeW, W, computeType, pBuffer, + workspaceInBytes, info); +} + +} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cusparse_11_0.inc b/tensorflow/stream_executor/cuda/cusparse_11_0.inc new file mode 100644 index 00000000000..31eb65c24ec --- /dev/null +++ b/tensorflow/stream_executor/cuda/cusparse_11_0.inc @@ -0,0 +1,6584 @@ +// Auto-generated, do not edit. + +#define CUSPARSE_DEPRECATED(new_func) + +extern "C" { + +cusparseStatus_t CUSPARSEAPI cusparseCreate(cusparseHandle_t *handle) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t *); + static auto func_ptr = LoadSymbol("cusparseCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroy(cusparseHandle_t handle) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t); + static auto func_ptr = LoadSymbol("cusparseDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle); +} + +cusparseStatus_t CUSPARSEAPI cusparseGetVersion(cusparseHandle_t handle, + int *version) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int *); + static auto func_ptr = LoadSymbol("cusparseGetVersion"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, version); +} + +cusparseStatus_t CUSPARSEAPI cusparseGetProperty(libraryPropertyType type, + int *value) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(libraryPropertyType, int *); + static auto func_ptr = LoadSymbol("cusparseGetProperty"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(type, value); +} + +const char *CUSPARSEAPI cusparseGetErrorName(cusparseStatus_t status) { + using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t); + static auto func_ptr = LoadSymbol("cusparseGetErrorName"); + if (!func_ptr) return "cusparseGetErrorName symbol not found."; + return func_ptr(status); +} + +const char *CUSPARSEAPI cusparseGetErrorString(cusparseStatus_t status) { + using FuncPtr = const char *(CUSPARSEAPI *)(cusparseStatus_t); + static auto func_ptr = LoadSymbol("cusparseGetErrorString"); + if (!func_ptr) return "cusparseGetErrorString symbol not found."; + return func_ptr(status); +} + +cusparseStatus_t CUSPARSEAPI cusparseSetStream(cusparseHandle_t handle, + cudaStream_t streamId) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t); + static auto func_ptr = 
LoadSymbol("cusparseSetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cusparseStatus_t CUSPARSEAPI cusparseGetStream(cusparseHandle_t handle, + cudaStream_t *streamId) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cudaStream_t *); + static auto func_ptr = LoadSymbol("cusparseGetStream"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, streamId); +} + +cusparseStatus_t CUSPARSEAPI +cusparseGetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t *mode) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, + cusparsePointerMode_t *); + static auto func_ptr = LoadSymbol("cusparseGetPointerMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetPointerMode(cusparseHandle_t handle, cusparsePointerMode_t mode) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, cusparsePointerMode_t); + static auto func_ptr = LoadSymbol("cusparseSetPointerMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mode); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateMatDescr(cusparseMatDescr_t *descrA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t *); + static auto func_ptr = LoadSymbol("cusparseCreateMatDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyMatDescr(cusparseMatDescr_t descrA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroyMatDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCopyMatDescr(cusparseMatDescr_t dest, const cusparseMatDescr_t src) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, + const cusparseMatDescr_t); + static auto func_ptr = LoadSymbol("cusparseCopyMatDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dest, src); +} + +cusparseStatus_t CUSPARSEAPI cusparseSetMatType(cusparseMatDescr_t descrA, + cusparseMatrixType_t type) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseMatrixType_t); + static auto func_ptr = LoadSymbol("cusparseSetMatType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA, type); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatFillMode(cusparseMatDescr_t descrA, cusparseFillMode_t fillMode) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseFillMode_t); + static auto func_ptr = LoadSymbol("cusparseSetMatFillMode"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA, fillMode); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSetMatDiagType(cusparseMatDescr_t descrA, cusparseDiagType_t diagType) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseDiagType_t); + static auto func_ptr = LoadSymbol("cusparseSetMatDiagType"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA, diagType); +} + +cusparseStatus_t CUSPARSEAPI cusparseSetMatIndexBase(cusparseMatDescr_t descrA, + cusparseIndexBase_t base) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseMatDescr_t, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSetMatIndexBase"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descrA, base); +} 
+ +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsv2Info(csrsv2Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsrsv2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsv2Info(csrsv2Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsv2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsrsv2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsric02Info(csric02Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsric02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsric02Info(csric02Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csric02Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsric02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateBsric02Info(bsric02Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateBsric02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyBsric02Info(bsric02Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsric02Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyBsric02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrilu02Info(csrilu02Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsrilu02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrilu02Info(csrilu02Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrilu02Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsrilu02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateBsrilu02Info(bsrilu02Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateBsrilu02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrilu02Info(bsrilu02Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrilu02Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyBsrilu02Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateBsrsv2Info(bsrsv2Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateBsrsv2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsv2Info(bsrsv2Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsv2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyBsrsv2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI 
cusparseCreateBsrsm2Info(bsrsm2Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateBsrsm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyBsrsm2Info(bsrsm2Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(bsrsm2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyBsrsm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsru2csrInfo(csru2csrInfo_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsru2csrInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsru2csrInfo(csru2csrInfo_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csru2csrInfo_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsru2csrInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateColorInfo(cusparseColorInfo_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t *); + static auto func_ptr = LoadSymbol("cusparseCreateColorInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyColorInfo(cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseDestroyColorInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseSetColorAlgs(cusparseColorInfo_t info, + cusparseColorAlg_t alg) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t, cusparseColorAlg_t); + static auto func_ptr = LoadSymbol("cusparseSetColorAlgs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, alg); +} + +cusparseStatus_t CUSPARSEAPI cusparseGetColorAlgs(cusparseColorInfo_t info, + cusparseColorAlg_t *alg) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseColorInfo_t, + cusparseColorAlg_t *); + static auto func_ptr = LoadSymbol("cusparseGetColorAlgs"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info, alg); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreatePruneInfo(pruneInfo_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t *); + static auto func_ptr = LoadSymbol("cusparseCreatePruneInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyPruneInfo(pruneInfo_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(pruneInfo_t); + static auto func_ptr = LoadSymbol("cusparseDestroyPruneInfo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseSaxpyi(cusparseHandle_t handle, int nnz, + const float *alpha, + const float *xVal, const int *xInd, + float *y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const float *, const float *, const int *, float *, + cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSaxpyi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI 
cusparseDaxpyi(cusparseHandle_t handle, int nnz, + const double *alpha, + const double *xVal, const int *xInd, + double *y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const double *, const double *, const int *, + double *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDaxpyi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseCaxpyi(cusparseHandle_t handle, int nnz, + const cuComplex *alpha, + const cuComplex *xVal, + const int *xInd, cuComplex *y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuComplex *, const cuComplex *, const int *, + cuComplex *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseCaxpyi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseZaxpyi(cusparseHandle_t handle, int nnz, + const cuDoubleComplex *alpha, + const cuDoubleComplex *xVal, + const int *xInd, cuDoubleComplex *y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, + const int *, cuDoubleComplex *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseZaxpyi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, alpha, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgthr(cusparseHandle_t handle, int nnz, + const float *y, float *xVal, + const int *xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const float *, float *, const int *, + cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSgthr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgthr(cusparseHandle_t handle, int nnz, + const double *y, double *xVal, + const int *xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const double *, double *, const int *, + cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDgthr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgthr(cusparseHandle_t handle, int nnz, + const cuComplex *y, cuComplex *xVal, + const int *xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuComplex *, cuComplex *, const int *, + cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseCgthr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgthr(cusparseHandle_t handle, int nnz, + const cuDoubleComplex *y, + cuDoubleComplex *xVal, + const int *xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuDoubleComplex *, cuDoubleComplex *, + const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseZgthr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI 
cusparseSgthrz(cusparseHandle_t handle, int nnz, + float *y, float *xVal, + const int *xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, float *, float *, + const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSgthrz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgthrz(cusparseHandle_t handle, int nnz, + double *y, double *xVal, + const int *xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, double *, double *, + const int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDgthrz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgthrz(cusparseHandle_t handle, int nnz, + cuComplex *y, cuComplex *xVal, + const int *xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cuComplex *, cuComplex *, const int *, + cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseCgthrz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgthrz(cusparseHandle_t handle, int nnz, + cuDoubleComplex *y, + cuDoubleComplex *xVal, + const int *xInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cuDoubleComplex *, cuDoubleComplex *, const int *, + cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseZgthrz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, y, xVal, xInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseSsctr(cusparseHandle_t handle, int nnz, + const float *xVal, const int *xInd, + float *y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, + const float *, const int *, + float *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSsctr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseDsctr(cusparseHandle_t handle, int nnz, + const double *xVal, const int *xInd, + double *y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const double *, const int *, double *, + cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDsctr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseCsctr(cusparseHandle_t handle, int nnz, + const cuComplex *xVal, + const int *xInd, cuComplex *y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuComplex *, const int *, cuComplex *, + cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseCsctr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseZsctr(cusparseHandle_t handle, int nnz, + const cuDoubleComplex *xVal, + const int *xInd, cuDoubleComplex *y, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuDoubleComplex *, const int *, + 
cuDoubleComplex *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseZsctr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseSroti(cusparseHandle_t handle, int nnz, + float *xVal, const int *xInd, + float *y, const float *c, + const float *s, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, float *, const int *, float *, const float *, + const float *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseSroti"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseDroti(cusparseHandle_t handle, int nnz, + double *xVal, const int *xInd, + double *y, const double *c, + const double *s, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, double *, const int *, double *, const double *, + const double *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseDroti"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, nnz, xVal, xInd, y, c, s, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m, + int n, const float *alpha, const float *A, int lda, int nnz, + const float *xVal, const int *xInd, const float *beta, float *y, + cusparseIndexBase_t idxBase, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const float *, + const float *, int, int, const float *, const int *, const float *, + float *, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseSgemvi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y, + idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA, + int m, int n, int nnz, int *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseSgemvi_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, nnz, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgemvi(cusparseHandle_t handle, cusparseOperation_t transA, int m, + int n, const double *alpha, const double *A, int lda, int nnz, + const double *xVal, const int *xInd, const double *beta, + double *y, cusparseIndexBase_t idxBase, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const double *, + const double *, int, int, const double *, const int *, const double *, + double *, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseDgemvi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y, + idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA, + int m, int n, int nnz, int *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseDgemvi_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + 
return func_ptr(handle, transA, m, n, nnz, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgemvi( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, + const cuComplex *alpha, const cuComplex *A, int lda, int nnz, + const cuComplex *xVal, const int *xInd, const cuComplex *beta, cuComplex *y, + cusparseIndexBase_t idxBase, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *, + const cuComplex *, int, int, const cuComplex *, const int *, + const cuComplex *, cuComplex *, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseCgemvi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y, + idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA, + int m, int n, int nnz, int *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseCgemvi_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, nnz, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgemvi( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int n, + const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, int nnz, + const cuDoubleComplex *xVal, const int *xInd, const cuDoubleComplex *beta, + cuDoubleComplex *y, cusparseIndexBase_t idxBase, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, int, const cuDoubleComplex *, const int *, + const cuDoubleComplex *, cuDoubleComplex *, cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseZgemvi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, alpha, A, lda, nnz, xVal, xInd, beta, y, + idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgemvi_bufferSize(cusparseHandle_t handle, cusparseOperation_t transA, + int m, int n, int nnz, int *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseZgemvi_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, n, nnz, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx_bufferSize( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const void *alpha, cudaDataType alphatype, + const cusparseMatDescr_t descrA, const void *csrValA, + cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA, + const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype, + void *y, cudaDataType ytype, cudaDataType executiontype, + size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int, + const void *, cudaDataType, const cusparseMatDescr_t, const void *, + cudaDataType, const int *, const int *, const void *, cudaDataType, + const void *, cudaDataType, void *, cudaDataType, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusparseCsrmvEx_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alg, transA, 
m, n, nnz, alpha, alphatype, descrA, + csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta, + betatype, y, ytype, executiontype, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCsrmvEx( + cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, + int m, int n, int nnz, const void *alpha, cudaDataType alphatype, + const cusparseMatDescr_t descrA, const void *csrValA, + cudaDataType csrValAtype, const int *csrRowPtrA, const int *csrColIndA, + const void *x, cudaDataType xtype, const void *beta, cudaDataType betatype, + void *y, cudaDataType ytype, cudaDataType executiontype, void *buffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseAlgMode_t, cusparseOperation_t, int, int, int, + const void *, cudaDataType, const cusparseMatDescr_t, const void *, + cudaDataType, const int *, const int *, const void *, cudaDataType, + const void *, cudaDataType, void *, cudaDataType, cudaDataType, void *); + static auto func_ptr = LoadSymbol("cusparseCsrmvEx"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, alg, transA, m, n, nnz, alpha, alphatype, descrA, + csrValA, csrValAtype, csrRowPtrA, csrColIndA, x, xtype, beta, + betatype, y, ytype, executiontype, buffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrmv( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nb, int nnzb, const float *alpha, + const cusparseMatDescr_t descrA, const float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + const float *x, const float *beta, float *y) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, + const float *, const cusparseMatDescr_t, const float *, const int *, + const int *, int, const float *, const float *, float *); + static auto func_ptr = LoadSymbol("cusparseSbsrmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, + x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrmv( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nb, int nnzb, const double *alpha, + const cusparseMatDescr_t descrA, const double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + const double *x, const double *beta, double *y) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, + const double *, const cusparseMatDescr_t, const double *, const int *, + const int *, int, const double *, const double *, double *); + static auto func_ptr = LoadSymbol("cusparseDbsrmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, + x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCbsrmv(cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nb, int nnzb, + const cuComplex *alpha, const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, int blockDim, const cuComplex *x, + const cuComplex *beta, cuComplex *y) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, + 
const cuComplex *, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, int, const cuComplex *, const cuComplex *, + cuComplex *); + static auto func_ptr = LoadSymbol("cusparseCbsrmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, + x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrmv( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nb, int nnzb, + const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x, + const cuDoubleComplex *beta, cuDoubleComplex *y) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, + const cuDoubleComplex *, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, + const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cusparseZbsrmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockDim, + x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int sizeOfMask, int mb, int nb, + int nnzb, const float *alpha, const cusparseMatDescr_t descrA, + const float *bsrSortedValA, const int *bsrSortedMaskPtrA, + const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA, + const int *bsrSortedColIndA, int blockDim, const float *x, + const float *beta, float *y) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, + int, const float *, const cusparseMatDescr_t, const float *, const int *, + const int *, const int *, const int *, int, const float *, const float *, + float *); + static auto func_ptr = LoadSymbol("cusparseSbsrxmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA, + bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDbsrxmv(cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int sizeOfMask, int mb, int nb, + int nnzb, const double *alpha, const cusparseMatDescr_t descrA, + const double *bsrSortedValA, const int *bsrSortedMaskPtrA, + const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA, + const int *bsrSortedColIndA, int blockDim, const double *x, + const double *beta, double *y) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, + int, const double *, const cusparseMatDescr_t, const double *, + const int *, const int *, const int *, const int *, int, const double *, + const double *, double *); + static auto func_ptr = LoadSymbol("cusparseDbsrxmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA, + bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrxmv( + cusparseHandle_t handle, 
cusparseDirection_t dirA, + cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb, + const cuComplex *alpha, const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, const int *bsrSortedMaskPtrA, + const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA, + const int *bsrSortedColIndA, int blockDim, const cuComplex *x, + const cuComplex *beta, cuComplex *y) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, + int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, const int *, const int *, int, + const cuComplex *, const cuComplex *, cuComplex *); + static auto func_ptr = LoadSymbol("cusparseCbsrxmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA, + bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrxmv( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int sizeOfMask, int mb, int nb, int nnzb, + const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedValA, const int *bsrSortedMaskPtrA, + const int *bsrSortedRowPtrA, const int *bsrSortedEndPtrA, + const int *bsrSortedColIndA, int blockDim, const cuDoubleComplex *x, + const cuDoubleComplex *beta, cuDoubleComplex *y) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, int, + int, const cuDoubleComplex *, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, const int *, + const int *, int, const cuDoubleComplex *, const cuDoubleComplex *, + cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cusparseZbsrxmv"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, sizeOfMask, mb, nb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedMaskPtrA, bsrSortedRowPtrA, + bsrSortedEndPtrA, bsrSortedColIndA, blockDim, x, beta, y); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle, + csrsv2Info_t info, + int *position) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXcsrsv2_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSize( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseScsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSize( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + int *pBufferSizeInBytes) { + 
using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDcsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSize( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCcsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSize( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, csrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZcsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_bufferSizeExt( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, csrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_bufferSizeExt( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, csrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_bufferSizeExt( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t 
descrA, cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, csrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_bufferSizeExt( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, csrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_analysis( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, csrsv2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_analysis( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, csrsv2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_analysis( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, csrsv2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrsv2_analysis"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_analysis( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, csrsv2Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, csrsv2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrsv2_solve( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const float *alpha, const cusparseMatDescr_t descrA, + const float *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrsv2Info_t info, const float *f, float *x, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const float *, + const cusparseMatDescr_t, const float *, const int *, const int *, + csrsv2Info_t, const float *, float *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsv2_solve( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const double *alpha, const cusparseMatDescr_t descrA, + const double *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrsv2Info_t info, const double *f, double *x, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const double *, + const cusparseMatDescr_t, const double *, const int *, const int *, + csrsv2Info_t, const double *, double *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsv2_solve( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cuComplex *alpha, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrsv2Info_t info, const cuComplex *f, + cuComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cuComplex *, + const cusparseMatDescr_t, const cuComplex *, const int *, const int *, + csrsv2Info_t, const cuComplex *, cuComplex *, cusparseSolvePolicy_t, + void *); + static auto func_ptr = LoadSymbol("cusparseCcsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, alpha, descrA, 
csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsv2_solve( + cusparseHandle_t handle, cusparseOperation_t transA, int m, int nnz, + const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrsv2Info_t info, const cuDoubleComplex *f, + cuDoubleComplex *x, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, int, int, const cuDoubleComplex *, + const cusparseMatDescr_t, const cuDoubleComplex *, const int *, + const int *, csrsv2Info_t, const cuDoubleComplex *, cuDoubleComplex *, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, transA, m, nnz, alpha, descrA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, info, f, x, policy, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle, + bsrsv2Info_t info, + int *position) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXbsrsv2_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, float *, const int *, const int *, int, + bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseSbsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, double *, const int *, const int *, int, + bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDbsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const 
cusparseMatDescr_t, cuComplex *, const int *, const int *, int, + bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCbsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, + int, bsrsv2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZbsrsv2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize, + bsrsv2Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, float *, const int *, const int *, int, + bsrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSbsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info, + pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize, + bsrsv2Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, double *, const int *, const int *, int, + bsrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDbsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info, + pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize, + bsrsv2Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, cuComplex *, const int *, const int *, int, + bsrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCbsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info, + pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockSize, + bsrsv2Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, cuDoubleComplex *, const int *, const int *, + int, bsrsv2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZbsrsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockSize, info, + pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, const float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, const float *, const int *, const int *, int, + bsrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, const double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, const double *, const int *, const int *, int, + bsrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, const cuComplex *, const int *, const int *, + int, bsrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + 
bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cusparseMatDescr_t, const cuDoubleComplex *, const int *, + const int *, int, bsrsv2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrsv2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, policy, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsv2_solve( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, const float *alpha, + const cusparseMatDescr_t descrA, const float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, const float *f, float *x, cusparseSolvePolicy_t policy, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const float *, const cusparseMatDescr_t, const float *, const int *, + const int *, int, bsrsv2Info_t, const float *, float *, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x, + policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsv2_solve( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, const double *alpha, + const cusparseMatDescr_t descrA, const double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, const double *f, double *x, cusparseSolvePolicy_t policy, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const double *, const cusparseMatDescr_t, const double *, const int *, + const int *, int, bsrsv2Info_t, const double *, double *, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x, + policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsv2_solve( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, const cuComplex *alpha, + const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, const cuComplex *f, cuComplex *x, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cuComplex *, const cusparseMatDescr_t, const cuComplex *, + const int *, const 
int *, int, bsrsv2Info_t, const cuComplex *, + cuComplex *, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x, + policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsv2_solve( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, int mb, int nnzb, const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + bsrsv2Info_t info, const cuDoubleComplex *f, cuDoubleComplex *x, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, int, int, + const cuDoubleComplex *, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, bsrsv2Info_t, + const cuDoubleComplex *, cuDoubleComplex *, cusparseSolvePolicy_t, + void *); + static auto func_ptr = LoadSymbol("cusparseZbsrsv2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, mb, nnzb, alpha, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, blockDim, info, f, x, + policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrmm( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, + int kb, int nnzb, const float *alpha, const cusparseMatDescr_t descrA, + const float *bsrSortedValA, const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, const int blockSize, const float *B, + const int ldb, const float *beta, float *C, int ldc) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, int, const float *, + const cusparseMatDescr_t, const float *, const int *, const int *, + const int, const float *, const int, const float *, float *, int); + static auto func_ptr = LoadSymbol("cusparseSbsrmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, + B, ldb, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrmm( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, + int kb, int nnzb, const double *alpha, const cusparseMatDescr_t descrA, + const double *bsrSortedValA, const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, const int blockSize, const double *B, + const int ldb, const double *beta, double *C, int ldc) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, int, const double *, + const cusparseMatDescr_t, const double *, const int *, const int *, + const int, const double *, const int, const double *, double *, int); + static auto func_ptr = LoadSymbol("cusparseDbsrmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, + B, ldb, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrmm( + cusparseHandle_t handle, 
cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, + int kb, int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedValA, const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, const int blockSize, const cuComplex *B, + const int ldb, const cuComplex *beta, cuComplex *C, int ldc) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, int, const cuComplex *, + const cusparseMatDescr_t, const cuComplex *, const int *, const int *, + const int, const cuComplex *, const int, const cuComplex *, cuComplex *, + int); + static auto func_ptr = LoadSymbol("cusparseCbsrmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, + B, ldb, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrmm( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, + int kb, int nnzb, const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, + const int blockSize, const cuDoubleComplex *B, const int ldb, + const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, int, const cuDoubleComplex *, + const cusparseMatDescr_t, const cuDoubleComplex *, const int *, + const int *, const int, const cuDoubleComplex *, const int, + const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cusparseZbsrmm"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, kb, nnzb, alpha, descrA, + bsrSortedValA, bsrSortedRowPtrA, bsrSortedColIndA, blockSize, + B, ldb, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgemmi( + cusparseHandle_t handle, int m, int n, int k, int nnz, const float *alpha, + const float *A, int lda, const float *cscValB, const int *cscColPtrB, + const int *cscRowIndB, const float *beta, float *C, int ldc) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, int, const float *, const float *, int, + const float *, const int *, const int *, const float *, float *, int); + static auto func_ptr = LoadSymbol("cusparseSgemmi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, + cscRowIndB, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgemmi( + cusparseHandle_t handle, int m, int n, int k, int nnz, const double *alpha, + const double *A, int lda, const double *cscValB, const int *cscColPtrB, + const int *cscRowIndB, const double *beta, double *C, int ldc) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, int, const double *, const double *, int, + const double *, const int *, const int *, const double *, double *, int); + static auto func_ptr = LoadSymbol("cusparseDgemmi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, + cscRowIndB, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgemmi( + cusparseHandle_t handle, int m, int n, 
int k, int nnz, + const cuComplex *alpha, const cuComplex *A, int lda, + const cuComplex *cscValB, const int *cscColPtrB, const int *cscRowIndB, + const cuComplex *beta, cuComplex *C, int ldc) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, int, const cuComplex *, + const cuComplex *, int, const cuComplex *, const int *, const int *, + const cuComplex *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cusparseCgemmi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, + cscRowIndB, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZgemmi(cusparseHandle_t handle, int m, int n, int k, int nnz, + const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, + const cuDoubleComplex *cscValB, const int *cscColPtrB, + const int *cscRowIndB, const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, int, const cuDoubleComplex *, const int *, + const int *, const cuDoubleComplex *, cuDoubleComplex *, int); + static auto func_ptr = LoadSymbol("cusparseZgemmi"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, nnz, alpha, A, lda, cscValB, cscColPtrB, + cscRowIndB, beta, C, ldc); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrsm2Info(csrsm2Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsrsm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrsm2Info(csrsm2Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrsm2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsrsm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsrsm2_zeroPivot(cusparseHandle_t handle, + csrsm2Info_t info, + int *position) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXcsrsm2_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_bufferSizeExt( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *B, + int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const float *, const cusparseMatDescr_t, const float *, const int *, + const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t, + size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_bufferSizeExt( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, 
const double *alpha, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *B, + int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const double *, const cusparseMatDescr_t, const double *, + const int *, const int *, const double *, int, csrsm2Info_t, + cusparseSolvePolicy_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_bufferSizeExt( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, + const cuComplex *alpha, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info, + cusparseSolvePolicy_t policy, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, const cuComplex *, int, csrsm2Info_t, + cusparseSolvePolicy_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_bufferSizeExt( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, + const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb, + csrsm2Info_t info, cusparseSolvePolicy_t policy, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const cuDoubleComplex *, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, + const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t, + size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_analysis( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, const float *alpha, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *B, + int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const float *, const cusparseMatDescr_t, const float *, const int *, + 
const int *, const float *, int, csrsm2Info_t, cusparseSolvePolicy_t, + void *); + static auto func_ptr = LoadSymbol("cusparseScsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_analysis( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, const double *B, + int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const double *, const cusparseMatDescr_t, const double *, + const int *, const int *, const double *, int, csrsm2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_analysis( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, + const cuComplex *alpha, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cuComplex *B, int ldb, csrsm2Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, const cuComplex *, int, csrsm2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_analysis( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, + const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cuDoubleComplex *B, int ldb, + csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const cuDoubleComplex *, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, + const cuDoubleComplex *, int, csrsm2Info_t, cusparseSolvePolicy_t, + void *); + static auto func_ptr = LoadSymbol("cusparseZcsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrsm2_solve( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t 
transB, int m, int nrhs, int nnz, const float *alpha, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, float *B, int ldb, + csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const float *, const cusparseMatDescr_t, const float *, const int *, + const int *, float *, int, csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrsm2_solve( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, const double *alpha, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, double *B, + int ldb, csrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const double *, const cusparseMatDescr_t, const double *, + const int *, const int *, double *, int, csrsm2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrsm2_solve( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, + const cuComplex *alpha, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, cuComplex *B, int ldb, csrsm2Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const cuComplex *, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, cuComplex *, int, csrsm2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrsm2_solve( + cusparseHandle_t handle, int algo, cusparseOperation_t transA, + cusparseOperation_t transB, int m, int nrhs, int nnz, + const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, cuDoubleComplex *B, int ldb, csrsm2Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, cusparseOperation_t, cusparseOperation_t, int, int, + int, const cuDoubleComplex *, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, int, + csrsm2Info_t, cusparseSolvePolicy_t, void *); + static auto 
func_ptr = LoadSymbol("cusparseZcsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, transA, transB, m, nrhs, nnz, alpha, descrA, + csrSortedValA, csrSortedRowPtrA, csrSortedColIndA, B, ldb, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle, + bsrsm2Info_t info, + int *position) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXbsrsm2_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *, + const int *, const int *, int, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseSbsrsm2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *, + const int *, const int *, int, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDbsrsm2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, const int *, int, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCbsrsm2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const 
cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZbsrsm2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, float *, + const int *, const int *, int, bsrsm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSbsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, double *, + const int *, const int *, int, bsrsm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDbsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transB, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, const int *, int, bsrsm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCbsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, 
cusparseOperation_t transB, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZbsrsm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transB, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, const float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, int, bsrsm2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, const double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, int, bsrsm2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrsm2Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, int, bsrsm2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, + bsrSortedVal, 
bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, bsrsm2Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrsm2_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrsm2_solve( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const float *alpha, const cusparseMatDescr_t descrA, + const float *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info, + const float *B, int ldb, float *X, int ldx, cusparseSolvePolicy_t policy, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const float *, + const cusparseMatDescr_t, const float *, const int *, const int *, int, + bsrsm2Info_t, const float *, int, float *, int, cusparseSolvePolicy_t, + void *); + static auto func_ptr = LoadSymbol("cusparseSbsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, B, ldb, X, ldx, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrsm2_solve( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const double *alpha, const cusparseMatDescr_t descrA, + const double *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info, + const double *B, int ldb, double *X, int ldx, cusparseSolvePolicy_t policy, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const double *, + const cusparseMatDescr_t, const double *, const int *, const int *, int, + bsrsm2Info_t, const double *, int, double *, int, cusparseSolvePolicy_t, + void *); + static auto func_ptr = LoadSymbol("cusparseDbsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, B, ldb, X, ldx, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrsm2_solve( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cuComplex *alpha, const cusparseMatDescr_t descrA, + const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int 
*bsrSortedColInd, int blockSize, bsrsm2Info_t info, + const cuComplex *B, int ldb, cuComplex *X, int ldx, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cuComplex *, + const cusparseMatDescr_t, const cuComplex *, const int *, const int *, + int, bsrsm2Info_t, const cuComplex *, int, cuComplex *, int, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, B, ldb, X, ldx, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrsm2_solve( + cusparseHandle_t handle, cusparseDirection_t dirA, + cusparseOperation_t transA, cusparseOperation_t transXY, int mb, int n, + int nnzb, const cuDoubleComplex *alpha, const cusparseMatDescr_t descrA, + const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int blockSize, bsrsm2Info_t info, + const cuDoubleComplex *B, int ldb, cuDoubleComplex *X, int ldx, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, cusparseOperation_t, + cusparseOperation_t, int, int, int, const cuDoubleComplex *, + const cusparseMatDescr_t, const cuDoubleComplex *, const int *, + const int *, int, bsrsm2Info_t, const cuDoubleComplex *, int, + cuDoubleComplex *, int, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrsm2_solve"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, transA, transXY, mb, n, nnzb, alpha, descrA, + bsrSortedVal, bsrSortedRowPtr, bsrSortedColInd, blockSize, + info, B, ldb, X, ldx, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_numericBoost( + cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol, + float *boost_val) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, csrilu02Info_t, int, double *, float *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_numericBoost( + cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol, + double *boost_val) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, csrilu02Info_t, int, double *, double *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_numericBoost( + cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol, + cuComplex *boost_val) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, csrilu02Info_t, int, double *, cuComplex *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_numericBoost( + cusparseHandle_t handle, csrilu02Info_t info, int enable_boost, double *tol, + cuDoubleComplex *boost_val) { + using FuncPtr = 
cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, csrilu02Info_t, int, double *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsrilu02_zeroPivot( + cusparseHandle_t handle, csrilu02Info_t info, int *position) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXcsrilu02_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSize( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + float *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, + const int *, const int *, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSize( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + double *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, + const int *, const int *, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSize( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, const int *, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSize( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, + const int *, const int *, csrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_bufferSizeExt( + cusparseHandle_t handle, int m, int nnz, const 
cusparseMatDescr_t descrA, + float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd, + csrilu02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, + const int *, const int *, csrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, + csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_bufferSizeExt( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + double *csrSortedVal, const int *csrSortedRowPtr, + const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, + const int *, const int *, csrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, + csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_bufferSizeExt( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuComplex *csrSortedVal, const int *csrSortedRowPtr, + const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, const int *, csrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, + csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_bufferSizeExt( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr, + const int *csrSortedColInd, csrilu02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, + const int *, const int *, csrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, + csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02_analysis( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const float *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02_analysis( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const double *csrSortedValA, const int *csrSortedRowPtrA, + const int 
*csrSortedColIndA, csrilu02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02_analysis( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02_analysis( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, csrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrilu02( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + float *csrSortedValA_valM, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, + const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrilu02( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + double *csrSortedValA_valM, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, + const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrilu02( + cusparseHandle_t handle, int m, int nnz, const 
cusparseMatDescr_t descrA, + cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrilu02( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csrilu02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, + const int *, const int *, csrilu02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_numericBoost( + cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol, + float *boost_val) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, bsrilu02Info_t, int, double *, float *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_numericBoost( + cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol, + double *boost_val) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, bsrilu02Info_t, int, double *, double *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_numericBoost( + cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol, + cuComplex *boost_val) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, bsrilu02Info_t, int, double *, cuComplex *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_numericBoost( + cusparseHandle_t handle, bsrilu02Info_t info, int enable_boost, double *tol, + cuDoubleComplex *boost_val) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, bsrilu02Info_t, int, double *, cuDoubleComplex *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02_numericBoost"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, enable_boost, tol, boost_val); +} + +cusparseStatus_t CUSPARSEAPI cusparseXbsrilu02_zeroPivot( + cusparseHandle_t handle, bsrilu02Info_t info, int *position) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXbsrilu02_zeroPivot"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, int, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, int, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, int, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrilu02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, 
cusparseDirection_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, int, bsrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrilu02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, int, bsrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrilu02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, int, bsrilu02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsrilu02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, + size_t *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, int, bsrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI 
cusparseDbsrilu02_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, int, bsrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, int, bsrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsrilu02( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, int, bsrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsrilu02( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = 
cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, int, bsrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsrilu02( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, int, bsrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsrilu02( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsrilu02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsrilu02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsrilu02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsric02_zeroPivot(cusparseHandle_t handle, + csric02Info_t info, + int *position) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXcsric02_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSize( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + float *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, + const int *, const int *, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseScsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSize( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + double *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, + const int *, const int *, csric02Info_t, int *); + static 
auto func_ptr = LoadSymbol("cusparseDcsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSize( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, const int *, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCcsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSize( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, + const int *, const int *, csric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZcsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsric02_bufferSizeExt( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + float *csrSortedVal, const int *csrSortedRowPtr, const int *csrSortedColInd, + csric02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, + const int *, const int *, csric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, + csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02_bufferSizeExt( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + double *csrSortedVal, const int *csrSortedRowPtr, + const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, + const int *, const int *, csric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, + csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsric02_bufferSizeExt( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuComplex *csrSortedVal, const int *csrSortedRowPtr, + const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, const int *, csric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, 
nnz, descrA, csrSortedVal, csrSortedRowPtr, + csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02_bufferSizeExt( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedVal, const int *csrSortedRowPtr, + const int *csrSortedColInd, csric02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, + const int *, const int *, csric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedVal, csrSortedRowPtr, + csrSortedColInd, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsric02_analysis( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const float *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02_analysis( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const double *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsric02_analysis( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02_analysis( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, csric02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsric02_analysis"); + 
if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsric02( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + float *csrSortedValA_valM, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, float *, + const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsric02( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + double *csrSortedValA_valM, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, double *, + const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsric02( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuComplex *csrSortedValA_valM, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsric02( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + cuDoubleComplex *csrSortedValA_valM, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, csric02Info_t info, + cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, cuDoubleComplex *, + const int *, const int *, csric02Info_t, cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA_valM, csrSortedRowPtrA, + csrSortedColIndA, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseXbsric02_zeroPivot(cusparseHandle_t handle, + bsric02Info_t info, + int *position) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseXbsric02_zeroPivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, info, position); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + 
const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, int, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseSbsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, int, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseDbsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, int, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseCbsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, int *); + static auto func_ptr = LoadSymbol("cusparseZbsric02_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsric02_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsric02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, int, bsric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSbsric02_bufferSizeExt"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsric02_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsric02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, int, bsric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDbsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsric02_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsric02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, int, bsric02Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCbsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsric02_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockSize, + bsric02Info_t info, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, + size_t *); + static auto func_ptr = LoadSymbol("cusparseZbsric02_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockSize, info, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsric02_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, const float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, int, bsric02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pInputBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsric02_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, const double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int 
blockDim, + bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, int, bsric02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseDbsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pInputBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsric02_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, const cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, int, bsric02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pInputBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsric02_analysis( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, cusparseSolvePolicy_t policy, void *pInputBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsric02_analysis"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pInputBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsric02( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, float *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + float *, const int *, const int *, int, bsric02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseSbsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsric02( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, double *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + double *, const int *, const int *, int, bsric02Info_t, + cusparseSolvePolicy_t, void 
*); + static auto func_ptr = LoadSymbol("cusparseDbsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsric02( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuComplex *, const int *, const int *, int, bsric02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseCbsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsric02( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nnzb, + const cusparseMatDescr_t descrA, cuDoubleComplex *bsrSortedVal, + const int *bsrSortedRowPtr, const int *bsrSortedColInd, int blockDim, + bsric02Info_t info, cusparseSolvePolicy_t policy, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, const int *, int, bsric02Info_t, + cusparseSolvePolicy_t, void *); + static auto func_ptr = LoadSymbol("cusparseZbsric02"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nnzb, descrA, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, blockDim, info, policy, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const float *dl, const float *d, + const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, const float *, const float *, + const float *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const double *dl, const double *d, + const double *du, const double *B, int ldb, size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, const double *, + const double *, const double *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const cuComplex *dl, + const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb, + size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, + const cuComplex *, const cuComplex *, int, size_t *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, 
bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl, + const cuDoubleComplex *d, const cuDoubleComplex *du, + const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, + int, size_t *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2(cusparseHandle_t handle, int m, + int n, const float *dl, + const float *d, const float *du, + float *B, int ldb, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, const float *, const float *, + float *, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2(cusparseHandle_t handle, int m, + int n, const double *dl, + const double *d, const double *du, + double *B, int ldb, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, const double *, + const double *, double *, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2(cusparseHandle_t handle, int m, + int n, const cuComplex *dl, + const cuComplex *d, + const cuComplex *du, cuComplex *B, + int ldb, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, + const cuComplex *, cuComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2(cusparseHandle_t handle, int m, + int n, const cuDoubleComplex *dl, + const cuDoubleComplex *d, + const cuDoubleComplex *du, + cuDoubleComplex *B, int ldb, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int, + void *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const float *dl, const float *d, + const float *du, const float *B, int ldb, size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, const float *, const float *, + const float *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseSgtsv2_nopivot_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const double *dl, const double *d, + const double *du, const double *B, int 
ldb, size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, const double *, + const double *, const double *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseDgtsv2_nopivot_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const cuComplex *dl, + const cuComplex *d, const cuComplex *du, const cuComplex *B, int ldb, + size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, + const cuComplex *, const cuComplex *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseCgtsv2_nopivot_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl, + const cuDoubleComplex *d, const cuDoubleComplex *du, + const cuDoubleComplex *B, int ldb, size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, + int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseZgtsv2_nopivot_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2_nopivot( + cusparseHandle_t handle, int m, int n, const float *dl, const float *d, + const float *du, float *B, int ldb, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, const float *, const float *, + float *, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2_nopivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2_nopivot( + cusparseHandle_t handle, int m, int n, const double *dl, const double *d, + const double *du, double *B, int ldb, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, const double *, + const double *, double *, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2_nopivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2_nopivot( + cusparseHandle_t handle, int m, int n, const cuComplex *dl, + const cuComplex *d, const cuComplex *du, cuComplex *B, int ldb, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, + const cuComplex *, cuComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2_nopivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2_nopivot( + cusparseHandle_t handle, int m, int n, const cuDoubleComplex *dl, + const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *B, + int ldb, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, 
int, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, cuDoubleComplex *, int, + void *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2_nopivot"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, dl, d, du, B, ldb, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch_bufferSizeExt( + cusparseHandle_t handle, int m, const float *dl, const float *d, + const float *du, const float *x, int batchCount, int batchStride, + size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const float *, const float *, const float *, + const float *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseSgtsv2StridedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, + bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgtsv2StridedBatch_bufferSizeExt( + cusparseHandle_t handle, int m, const double *dl, const double *d, + const double *du, const double *x, int batchCount, int batchStride, + size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const double *, const double *, const double *, + const double *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseDgtsv2StridedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, + bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch_bufferSizeExt( + cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d, + const cuComplex *du, const cuComplex *x, int batchCount, int batchStride, + size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuComplex *, const cuComplex *, + const cuComplex *, const cuComplex *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseCgtsv2StridedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, + bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch_bufferSizeExt( + cusparseHandle_t handle, int m, const cuDoubleComplex *dl, + const cuDoubleComplex *d, const cuDoubleComplex *du, + const cuDoubleComplex *x, int batchCount, int batchStride, + size_t *bufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseZgtsv2StridedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, + bufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgtsv2StridedBatch( + cusparseHandle_t handle, int m, const float *dl, const float *d, + const float *du, float *x, int batchCount, int batchStride, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const float *, const float *, const float *, + float *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgtsv2StridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDgtsv2StridedBatch(cusparseHandle_t 
handle, int m, const double *dl, + const double *d, const double *du, double *x, + int batchCount, int batchStride, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const double *, const double *, const double *, + double *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgtsv2StridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgtsv2StridedBatch( + cusparseHandle_t handle, int m, const cuComplex *dl, const cuComplex *d, + const cuComplex *du, cuComplex *x, int batchCount, int batchStride, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuComplex *, const cuComplex *, + const cuComplex *, cuComplex *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgtsv2StridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgtsv2StridedBatch( + cusparseHandle_t handle, int m, const cuDoubleComplex *dl, + const cuDoubleComplex *d, const cuDoubleComplex *du, cuDoubleComplex *x, + int batchCount, int batchStride, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cuDoubleComplex *, const cuDoubleComplex *, + const cuDoubleComplex *, cuDoubleComplex *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgtsv2StridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, dl, d, du, x, batchCount, batchStride, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch_bufferSizeExt( + cusparseHandle_t handle, int algo, int m, const float *dl, const float *d, + const float *du, const float *x, int batchCount, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, const float *, const float *, + const float *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseSgtsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch_bufferSizeExt( + cusparseHandle_t handle, int algo, int m, const double *dl, const double *d, + const double *du, const double *x, int batchCount, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, const double *, + const double *, const double *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseDgtsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch_bufferSizeExt( + cusparseHandle_t handle, int algo, int m, const cuComplex *dl, + const cuComplex *d, const cuComplex *du, const cuComplex *x, int batchCount, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, + const cuComplex *, const cuComplex *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseCgtsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, 
m, dl, d, du, x, batchCount, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch_bufferSizeExt( + cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *dl, + const cuDoubleComplex *d, const cuDoubleComplex *du, + const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, + int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseZgtsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgtsvInterleavedBatch( + cusparseHandle_t handle, int algo, int m, float *dl, float *d, float *du, + float *x, int batchCount, void *pBuffer) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int, float *, + float *, float *, float *, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgtsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgtsvInterleavedBatch( + cusparseHandle_t handle, int algo, int m, double *dl, double *d, double *du, + double *x, int batchCount, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int, + double *, double *, double *, + double *, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgtsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgtsvInterleavedBatch( + cusparseHandle_t handle, int algo, int m, cuComplex *dl, cuComplex *d, + cuComplex *du, cuComplex *x, int batchCount, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *, + cuComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgtsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgtsvInterleavedBatch( + cusparseHandle_t handle, int algo, int m, cuDoubleComplex *dl, + cuDoubleComplex *d, cuDoubleComplex *du, cuDoubleComplex *x, int batchCount, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *, + cuDoubleComplex *, cuDoubleComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgtsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, dl, d, du, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch_bufferSizeExt( + cusparseHandle_t handle, int algo, int m, const float *ds, const float *dl, + const float *d, const float *du, const float *dw, const float *x, + int batchCount, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, const float *, const float *, + const float *, const float *, const float *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseSgpsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, 
dl, d, du, dw, x, batchCount, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch_bufferSizeExt( + cusparseHandle_t handle, int algo, int m, const double *ds, + const double *dl, const double *d, const double *du, const double *dw, + const double *x, int batchCount, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, const double *, + const double *, const double *, const double *, const double *, int, + size_t *); + static auto func_ptr = + LoadSymbol("cusparseDgpsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch_bufferSizeExt( + cusparseHandle_t handle, int algo, int m, const cuComplex *ds, + const cuComplex *dl, const cuComplex *d, const cuComplex *du, + const cuComplex *dw, const cuComplex *x, int batchCount, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuComplex *, const cuComplex *, + const cuComplex *, const cuComplex *, const cuComplex *, + const cuComplex *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseCgpsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch_bufferSizeExt( + cusparseHandle_t handle, int algo, int m, const cuDoubleComplex *ds, + const cuDoubleComplex *dl, const cuDoubleComplex *d, + const cuDoubleComplex *du, const cuDoubleComplex *dw, + const cuDoubleComplex *x, int batchCount, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, const cuDoubleComplex *, + const cuDoubleComplex *, const cuDoubleComplex *, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseZgpsvInterleavedBatch_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgpsvInterleavedBatch( + cusparseHandle_t handle, int algo, int m, float *ds, float *dl, float *d, + float *du, float *dw, float *x, int batchCount, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, float *, float *, float *, float *, float *, + float *, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgpsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgpsvInterleavedBatch( + cusparseHandle_t handle, int algo, int m, double *ds, double *dl, double *d, + double *du, double *dw, double *x, int batchCount, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, double *, double *, double *, double *, + double *, double *, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgpsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgpsvInterleavedBatch( + cusparseHandle_t handle, int algo, int 
m, cuComplex *ds, cuComplex *dl, + cuComplex *d, cuComplex *du, cuComplex *dw, cuComplex *x, int batchCount, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, cuComplex *, cuComplex *, cuComplex *, + cuComplex *, cuComplex *, cuComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseCgpsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgpsvInterleavedBatch( + cusparseHandle_t handle, int algo, int m, cuDoubleComplex *ds, + cuDoubleComplex *dl, cuDoubleComplex *d, cuDoubleComplex *du, + cuDoubleComplex *dw, cuDoubleComplex *x, int batchCount, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, cuDoubleComplex *, cuDoubleComplex *, + cuDoubleComplex *, cuDoubleComplex *, cuDoubleComplex *, + cuDoubleComplex *, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgpsvInterleavedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, algo, m, ds, dl, d, du, dw, x, batchCount, pBuffer); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseCreateCsrgemm2Info(csrgemm2Info_t *info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t *); + static auto func_ptr = LoadSymbol("cusparseCreateCsrgemm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseDestroyCsrgemm2Info(csrgemm2Info_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(csrgemm2Info_t); + static auto func_ptr = LoadSymbol("cusparseDestroyCsrgemm2Info"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(info); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int k, const float *alpha, + const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta, + const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD, + const int *csrSortedColIndD, csrgemm2Info_t info, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t, + int, const int *, const int *, const cusparseMatDescr_t, int, const int *, + const int *, const float *, const cusparseMatDescr_t, int, const int *, + const int *, csrgemm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrgemm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA, + csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, + csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD, + csrSortedColIndD, info, pBufferSizeInBytes); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int k, const double *alpha, + const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, + const double *beta, const cusparseMatDescr_t descrD, int nnzD, + const int 
*csrSortedRowPtrD, const int *csrSortedColIndD, + csrgemm2Info_t info, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t, + int, const int *, const int *, const cusparseMatDescr_t, int, const int *, + const int *, const double *, const cusparseMatDescr_t, int, const int *, + const int *, csrgemm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrgemm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA, + csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, + csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD, + csrSortedColIndD, info, pBufferSizeInBytes); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha, + const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, + const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD, + const int *csrSortedRowPtrD, const int *csrSortedColIndD, + csrgemm2Info_t info, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuComplex *, + const cusparseMatDescr_t, int, const int *, const int *, + const cusparseMatDescr_t, int, const int *, const int *, + const cuComplex *, const cusparseMatDescr_t, int, const int *, + const int *, csrgemm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrgemm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA, + csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, + csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD, + csrSortedColIndD, info, pBufferSizeInBytes); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, + const cuDoubleComplex *beta, const cusparseMatDescr_t descrD, int nnzD, + const int *csrSortedRowPtrD, const int *csrSortedColIndD, + csrgemm2Info_t info, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuDoubleComplex *, + const cusparseMatDescr_t, int, const int *, const int *, + const cusparseMatDescr_t, int, const int *, const int *, + const cuDoubleComplex *, const cusparseMatDescr_t, int, const int *, + const int *, csrgemm2Info_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrgemm2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedRowPtrA, + csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, + csrSortedColIndB, beta, descrD, nnzD, csrSortedRowPtrD, + csrSortedColIndD, info, pBufferSizeInBytes); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseXcsrgemm2Nnz( + cusparseHandle_t handle, int m, int n, int k, + const cusparseMatDescr_t descrA, int nnzA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const 
cusparseMatDescr_t descrB, int nnzB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, + const cusparseMatDescr_t descrD, int nnzD, const int *csrSortedRowPtrD, + const int *csrSortedColIndD, const cusparseMatDescr_t descrC, + int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, const csrgemm2Info_t info, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, int, + const int *, const int *, const cusparseMatDescr_t, int, const int *, + const int *, const cusparseMatDescr_t, int, const int *, const int *, + const cusparseMatDescr_t, int *, int *, const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseXcsrgemm2Nnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, descrA, nnzA, csrSortedRowPtrA, + csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, + csrSortedColIndB, descrD, nnzD, csrSortedRowPtrD, + csrSortedColIndD, descrC, csrSortedRowPtrC, + nnzTotalDevHostPtr, info, pBuffer); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseScsrgemm2( + cusparseHandle_t handle, int m, int n, int k, const float *alpha, + const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, const float *beta, + const cusparseMatDescr_t descrD, int nnzD, const float *csrSortedValD, + const int *csrSortedRowPtrD, const int *csrSortedColIndD, + const cusparseMatDescr_t descrC, float *csrSortedValC, + const int *csrSortedRowPtrC, int *csrSortedColIndC, + const csrgemm2Info_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const float *, const cusparseMatDescr_t, + int, const float *, const int *, const int *, const cusparseMatDescr_t, + int, const float *, const int *, const int *, const float *, + const cusparseMatDescr_t, int, const float *, const int *, const int *, + const cusparseMatDescr_t, float *, const int *, int *, + const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsrgemm2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta, + descrD, nnzD, csrSortedValD, csrSortedRowPtrD, + csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC, + csrSortedColIndC, info, pBuffer); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseDcsrgemm2( + cusparseHandle_t handle, int m, int n, int k, const double *alpha, + const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cusparseMatDescr_t descrB, int nnzB, const double *csrSortedValB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, + const double *beta, const cusparseMatDescr_t descrD, int nnzD, + const double *csrSortedValD, const int *csrSortedRowPtrD, + const int *csrSortedColIndD, const cusparseMatDescr_t descrC, + double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC, + const csrgemm2Info_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const double *, const cusparseMatDescr_t, + int, const double *, const int *, const int *, const 
cusparseMatDescr_t, + int, const double *, const int *, const int *, const double *, + const cusparseMatDescr_t, int, const double *, const int *, const int *, + const cusparseMatDescr_t, double *, const int *, int *, + const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrgemm2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta, + descrD, nnzD, csrSortedValD, csrSortedRowPtrD, + csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC, + csrSortedColIndC, info, pBuffer); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseCcsrgemm2( + cusparseHandle_t handle, int m, int n, int k, const cuComplex *alpha, + const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cusparseMatDescr_t descrB, int nnzB, const cuComplex *csrSortedValB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, + const cuComplex *beta, const cusparseMatDescr_t descrD, int nnzD, + const cuComplex *csrSortedValD, const int *csrSortedRowPtrD, + const int *csrSortedColIndD, const cusparseMatDescr_t descrC, + cuComplex *csrSortedValC, const int *csrSortedRowPtrC, + int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuComplex *, + const cusparseMatDescr_t, int, const cuComplex *, const int *, + const int *, const cusparseMatDescr_t, int, const cuComplex *, + const int *, const int *, const cuComplex *, const cusparseMatDescr_t, + int, const cuComplex *, const int *, const int *, + const cusparseMatDescr_t, cuComplex *, const int *, int *, + const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrgemm2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta, + descrD, nnzD, csrSortedValD, csrSortedRowPtrD, + csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC, + csrSortedColIndC, info, pBuffer); +} + +CUSPARSE_DEPRECATED(cusparseSpGEMM) +cusparseStatus_t CUSPARSEAPI cusparseZcsrgemm2( + cusparseHandle_t handle, int m, int n, int k, const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, int nnzA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cusparseMatDescr_t descrB, int nnzB, + const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB, + const int *csrSortedColIndB, const cuDoubleComplex *beta, + const cusparseMatDescr_t descrD, int nnzD, + const cuDoubleComplex *csrSortedValD, const int *csrSortedRowPtrD, + const int *csrSortedColIndD, const cusparseMatDescr_t descrC, + cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC, + int *csrSortedColIndC, const csrgemm2Info_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuDoubleComplex *, + const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, + const int *, const cusparseMatDescr_t, int, const cuDoubleComplex *, + const int *, const int *, const cuDoubleComplex *, + const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, + const int *, const cusparseMatDescr_t, 
cuDoubleComplex *, const int *, + int *, const csrgemm2Info_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrgemm2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, k, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, beta, + descrD, nnzD, csrSortedValD, csrSortedRowPtrD, + csrSortedColIndD, descrC, csrSortedValC, csrSortedRowPtrC, + csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const float *alpha, + const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta, + const cusparseMatDescr_t descrB, int nnzB, const float *csrSortedValB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, + const cusparseMatDescr_t descrC, const float *csrSortedValC, + const int *csrSortedRowPtrC, const int *csrSortedColIndC, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int, + const float *, const int *, const int *, const float *, + const cusparseMatDescr_t, int, const float *, const int *, const int *, + const cusparseMatDescr_t, const float *, const int *, const int *, + size_t *); + static auto func_ptr = LoadSymbol("cusparseScsrgeam2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const double *alpha, + const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const double *beta, const cusparseMatDescr_t descrB, int nnzB, + const double *csrSortedValB, const int *csrSortedRowPtrB, + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, + const double *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int, + const double *, const int *, const int *, const double *, + const cusparseMatDescr_t, int, const double *, const int *, const int *, + const cusparseMatDescr_t, const double *, const int *, const int *, + size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsrgeam2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const cuComplex *alpha, + const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB, + const cuComplex *csrSortedValB, const int *csrSortedRowPtrB, + const int *csrSortedColIndB, const 
cusparseMatDescr_t descrC, + const cuComplex *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t, + int, const cuComplex *, const int *, const int *, const cuComplex *, + const cusparseMatDescr_t, int, const cuComplex *, const int *, + const int *, const cusparseMatDescr_t, const cuComplex *, const int *, + const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsrgeam2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, int nnzA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cuDoubleComplex *beta, + const cusparseMatDescr_t descrB, int nnzB, + const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB, + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, + const cuDoubleComplex *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuDoubleComplex *, + const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, + const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int, + const cuDoubleComplex *, const int *, const int *, + const cusparseMatDescr_t, const cuDoubleComplex *, const int *, + const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsrgeam2_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsrgeam2Nnz( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + int nnzA, const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cusparseMatDescr_t descrB, int nnzB, const int *csrSortedRowPtrB, + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, + int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *workspace) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, int, const int *, + const int *, const cusparseMatDescr_t, int, const int *, const int *, + const cusparseMatDescr_t, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcsrgeam2Nnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, nnzA, csrSortedRowPtrA, + csrSortedColIndA, descrB, nnzB, csrSortedRowPtrB, + csrSortedColIndB, descrC, csrSortedRowPtrC, + nnzTotalDevHostPtr, workspace); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrgeam2( + cusparseHandle_t handle, int m, int n, const float *alpha, + const cusparseMatDescr_t descrA, int nnzA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, const float *beta, + const cusparseMatDescr_t 
descrB, int nnzB, const float *csrSortedValB, + const int *csrSortedRowPtrB, const int *csrSortedColIndB, + const cusparseMatDescr_t descrC, float *csrSortedValC, + int *csrSortedRowPtrC, int *csrSortedColIndC, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, const cusparseMatDescr_t, int, + const float *, const int *, const int *, const float *, + const cusparseMatDescr_t, int, const float *, const int *, const int *, + const cusparseMatDescr_t, float *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseScsrgeam2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrgeam2( + cusparseHandle_t handle, int m, int n, const double *alpha, + const cusparseMatDescr_t descrA, int nnzA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const double *beta, const cusparseMatDescr_t descrB, int nnzB, + const double *csrSortedValB, const int *csrSortedRowPtrB, + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, + double *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, const cusparseMatDescr_t, int, + const double *, const int *, const int *, const double *, + const cusparseMatDescr_t, int, const double *, const int *, const int *, + const cusparseMatDescr_t, double *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDcsrgeam2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrgeam2( + cusparseHandle_t handle, int m, int n, const cuComplex *alpha, + const cusparseMatDescr_t descrA, int nnzA, const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cuComplex *beta, const cusparseMatDescr_t descrB, int nnzB, + const cuComplex *csrSortedValB, const int *csrSortedRowPtrB, + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, + cuComplex *csrSortedValC, int *csrSortedRowPtrC, int *csrSortedColIndC, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuComplex *, const cusparseMatDescr_t, + int, const cuComplex *, const int *, const int *, const cuComplex *, + const cusparseMatDescr_t, int, const cuComplex *, const int *, + const int *, const cusparseMatDescr_t, cuComplex *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseCcsrgeam2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsrgeam2( + cusparseHandle_t handle, int m, int n, const cuDoubleComplex *alpha, + const cusparseMatDescr_t descrA, int nnzA, + const cuDoubleComplex 
*csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cuDoubleComplex *beta, + const cusparseMatDescr_t descrB, int nnzB, + const cuDoubleComplex *csrSortedValB, const int *csrSortedRowPtrB, + const int *csrSortedColIndB, const cusparseMatDescr_t descrC, + cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC, + int *csrSortedColIndC, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cuDoubleComplex *, + const cusparseMatDescr_t, int, const cuDoubleComplex *, const int *, + const int *, const cuDoubleComplex *, const cusparseMatDescr_t, int, + const cuDoubleComplex *, const int *, const int *, + const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseZcsrgeam2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, alpha, descrA, nnzA, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA, beta, descrB, nnzB, + csrSortedValB, csrSortedRowPtrB, csrSortedColIndB, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsrcolor( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const float *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const float *fractionToColor, int *ncolors, + int *coloring, int *reordering, const cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, const float *, int *, int *, int *, + const cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseScsrcolor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, fractionToColor, ncolors, coloring, + reordering, info); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsrcolor( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const double *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const double *fractionToColor, int *ncolors, + int *coloring, int *reordering, const cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, const double *, int *, int *, int *, + const cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseDcsrcolor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, fractionToColor, ncolors, coloring, + reordering, info); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsrcolor( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const float *fractionToColor, int *ncolors, + int *coloring, int *reordering, const cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, const float *, int *, int *, int *, + const cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseCcsrcolor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, fractionToColor, ncolors, coloring, + reordering, info); +} + 
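All of the wrappers in this hunk are thin trampolines built on the same lazy-binding idea: the cuSPARSE entry point is resolved at most once, cached in a function-local static, every call is forwarded unchanged, and a "symbol not found" status is returned instead of failing at load time when the library is absent. A self-contained sketch of that idea using plain dlopen/dlsym follows; it is illustrative only, the library name "libexample.so" and the symbol "foo" are placeholders, and the real wrappers above rely on the LoadSymbol and GetSymbolNotFoundError helpers defined earlier in this stub file.

#include <dlfcn.h>
#include <cstdio>

using FooFn = int (*)(int);

// Trampoline for a hypothetical symbol "foo" exported by libexample.so.
int Foo(int x) {
  // Resolved on first use only; later calls reuse the cached pointer.
  static FooFn fn = []() -> FooFn {
    void* lib = dlopen("libexample.so", RTLD_LAZY);
    return lib ? reinterpret_cast<FooFn>(dlsym(lib, "foo")) : nullptr;
  }();
  if (!fn) {
    // Stand-in for GetSymbolNotFoundError() in the wrappers above.
    std::fprintf(stderr, "foo: symbol not found\n");
    return -1;
  }
  return fn(x);
}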
+cusparseStatus_t CUSPARSEAPI cusparseZcsrcolor( + cusparseHandle_t handle, int m, int nnz, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const double *fractionToColor, int *ncolors, + int *coloring, int *reordering, const cusparseColorInfo_t info) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, const double *, int *, + int *, int *, const cusparseColorInfo_t); + static auto func_ptr = LoadSymbol("cusparseZcsrcolor"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, nnz, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, fractionToColor, ncolors, coloring, + reordering, info); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const float *A, int lda, + int *nnzPerRowCol, int *nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const float *, int, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSnnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, + nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const double *A, int lda, + int *nnzPerRowCol, int *nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const double *, int, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDnnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, + nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuComplex *A, int lda, + int *nnzPerRowCol, int *nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuComplex *, int, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCnnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, + nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseZnnz(cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuDoubleComplex *A, int lda, + int *nnzPerRowCol, int *nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, int, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZnnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, A, lda, nnzPerRowCol, + nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI cusparseSnnz_compress( + cusparseHandle_t handle, int m, const cusparseMatDescr_t descr, + const float *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow, + int *nnzC, float tol) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cusparseMatDescr_t, const float *, + const int *, int *, int *, 
float); + static auto func_ptr = LoadSymbol("cusparseSnnz_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow, + nnzC, tol); +} + +cusparseStatus_t CUSPARSEAPI cusparseDnnz_compress( + cusparseHandle_t handle, int m, const cusparseMatDescr_t descr, + const double *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow, + int *nnzC, double tol) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cusparseMatDescr_t, const double *, + const int *, int *, int *, double); + static auto func_ptr = LoadSymbol("cusparseDnnz_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow, + nnzC, tol); +} + +cusparseStatus_t CUSPARSEAPI cusparseCnnz_compress( + cusparseHandle_t handle, int m, const cusparseMatDescr_t descr, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, int *nnzPerRow, + int *nnzC, cuComplex tol) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cusparseMatDescr_t, const cuComplex *, + const int *, int *, int *, cuComplex); + static auto func_ptr = LoadSymbol("cusparseCnnz_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow, + nnzC, tol); +} + +cusparseStatus_t CUSPARSEAPI cusparseZnnz_compress( + cusparseHandle_t handle, int m, const cusparseMatDescr_t descr, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + int *nnzPerRow, int *nnzC, cuDoubleComplex tol) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, const cusparseMatDescr_t, const cuDoubleComplex *, + const int *, int *, int *, cuDoubleComplex); + static auto func_ptr = LoadSymbol("cusparseZnnz_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, descr, csrSortedValA, csrSortedRowPtrA, nnzPerRow, + nnzC, tol); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsr2csr_compress( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const float *csrSortedValA, const int *csrSortedColIndA, + const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow, + float *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC, + float tol) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, int, const int *, float *, int *, int *, float); + static auto func_ptr = LoadSymbol("cusparseScsr2csr_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA, + csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC, + csrSortedColIndC, csrSortedRowPtrC, tol); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2csr_compress( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const double *csrSortedValA, const int *csrSortedColIndA, + const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow, + double *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC, + double tol) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, int, const int *, double *, int *, int *, + double); + static auto func_ptr = LoadSymbol("cusparseDcsr2csr_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, 
descrA, csrSortedValA, csrSortedColIndA, + csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC, + csrSortedColIndC, csrSortedRowPtrC, tol); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2csr_compress( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedColIndA, + const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow, + cuComplex *csrSortedValC, int *csrSortedColIndC, int *csrSortedRowPtrC, + cuComplex tol) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, int, const int *, cuComplex *, int *, int *, + cuComplex); + static auto func_ptr = LoadSymbol("cusparseCcsr2csr_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA, + csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC, + csrSortedColIndC, csrSortedRowPtrC, tol); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2csr_compress( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedColIndA, + const int *csrSortedRowPtrA, int nnzA, const int *nnzPerRow, + cuDoubleComplex *csrSortedValC, int *csrSortedColIndC, + int *csrSortedRowPtrC, cuDoubleComplex tol) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, const int *, + cuDoubleComplex *, int *, int *, cuDoubleComplex); + static auto func_ptr = LoadSymbol("cusparseZcsr2csr_compress"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedColIndA, + csrSortedRowPtrA, nnzA, nnzPerRow, csrSortedValC, + csrSortedColIndC, csrSortedRowPtrC, tol); +} + +cusparseStatus_t CUSPARSEAPI cusparseSdense2csr( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const float *A, int lda, const int *nnzPerRow, float *csrSortedValA, + int *csrSortedRowPtrA, int *csrSortedColIndA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int, + const int *, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSdense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA); +} + +cusparseStatus_t CUSPARSEAPI cusparseDdense2csr( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const double *A, int lda, const int *nnzPerRow, double *csrSortedValA, + int *csrSortedRowPtrA, int *csrSortedColIndA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int, + const int *, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDdense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA); +} + +cusparseStatus_t CUSPARSEAPI cusparseCdense2csr( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuComplex *A, int lda, const int *nnzPerRow, cuComplex *csrSortedValA, + int *csrSortedRowPtrA, int *csrSortedColIndA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, + int, 
const int *, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCdense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA); +} + +cusparseStatus_t CUSPARSEAPI cusparseZdense2csr( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuDoubleComplex *A, int lda, const int *nnzPerRow, + cuDoubleComplex *csrSortedValA, int *csrSortedRowPtrA, + int *csrSortedColIndA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *, + int *); + static auto func_ptr = LoadSymbol("cusparseZdense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerRow, csrSortedValA, + csrSortedRowPtrA, csrSortedColIndA); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsr2dense( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const float *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, float *A, int lda) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, float *, int); + static auto func_ptr = LoadSymbol("cusparseScsr2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2dense( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const double *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, double *A, int lda) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, double *, int); + static auto func_ptr = LoadSymbol("cusparseDcsr2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2dense( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, cuComplex *A, int lda) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cusparseCcsr2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2dense( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuDoubleComplex *csrSortedValA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, cuDoubleComplex *A, int lda) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cusparseZcsr2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI cusparseSdense2csc( + cusparseHandle_t handle, 
int m, int n, const cusparseMatDescr_t descrA, + const float *A, int lda, const int *nnzPerCol, float *cscSortedValA, + int *cscSortedRowIndA, int *cscSortedColPtrA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, int, + const int *, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSdense2csc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA, + cscSortedRowIndA, cscSortedColPtrA); +} + +cusparseStatus_t CUSPARSEAPI cusparseDdense2csc( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const double *A, int lda, const int *nnzPerCol, double *cscSortedValA, + int *cscSortedRowIndA, int *cscSortedColPtrA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, int, + const int *, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDdense2csc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA, + cscSortedRowIndA, cscSortedColPtrA); +} + +cusparseStatus_t CUSPARSEAPI cusparseCdense2csc( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuComplex *A, int lda, const int *nnzPerCol, cuComplex *cscSortedValA, + int *cscSortedRowIndA, int *cscSortedColPtrA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, + int, const int *, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCdense2csc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA, + cscSortedRowIndA, cscSortedColPtrA); +} + +cusparseStatus_t CUSPARSEAPI cusparseZdense2csc( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuDoubleComplex *A, int lda, const int *nnzPerCol, + cuDoubleComplex *cscSortedValA, int *cscSortedRowIndA, + int *cscSortedColPtrA) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, int, const int *, cuDoubleComplex *, int *, + int *); + static auto func_ptr = LoadSymbol("cusparseZdense2csc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, A, lda, nnzPerCol, cscSortedValA, + cscSortedRowIndA, cscSortedColPtrA); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsc2dense( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const float *cscSortedValA, const int *cscSortedRowIndA, + const int *cscSortedColPtrA, float *A, int lda) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, float *, int); + static auto func_ptr = LoadSymbol("cusparseScsc2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA, + cscSortedColPtrA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsc2dense( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const double *cscSortedValA, const int *cscSortedRowIndA, + const int *cscSortedColPtrA, double *A, int lda) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, double *, 
int); + static auto func_ptr = LoadSymbol("cusparseDcsc2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA, + cscSortedColPtrA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsc2dense( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuComplex *cscSortedValA, const int *cscSortedRowIndA, + const int *cscSortedColPtrA, cuComplex *A, int lda) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, const cuComplex *, + const int *, const int *, cuComplex *, int); + static auto func_ptr = LoadSymbol("cusparseCcsc2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA, + cscSortedColPtrA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsc2dense( + cusparseHandle_t handle, int m, int n, const cusparseMatDescr_t descrA, + const cuDoubleComplex *cscSortedValA, const int *cscSortedRowIndA, + const int *cscSortedColPtrA, cuDoubleComplex *A, int lda) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, cuDoubleComplex *, + int); + static auto func_ptr = LoadSymbol("cusparseZcsc2dense"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, descrA, cscSortedValA, cscSortedRowIndA, + cscSortedColPtrA, A, lda); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcoo2csr(cusparseHandle_t handle, + const int *cooRowInd, int nnz, + int m, int *csrSortedRowPtr, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseXcoo2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, cooRowInd, nnz, m, csrSortedRowPtr, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsr2coo(cusparseHandle_t handle, + const int *csrSortedRowPtr, + int nnz, int m, int *cooRowInd, + cusparseIndexBase_t idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, const int *, int, int, int *, cusparseIndexBase_t); + static auto func_ptr = LoadSymbol("cusparseXcsr2coo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, csrSortedRowPtr, nnz, m, cooRowInd, idxBase); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsr2bsrNnz( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, int blockDim, const cusparseMatDescr_t descrC, + int *bsrSortedRowPtrC, int *nnzTotalDevHostPtr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const int *, const int *, int, const cusparseMatDescr_t, int *, int *); + static auto func_ptr = LoadSymbol("cusparseXcsr2bsrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA, + csrSortedColIndA, blockDim, descrC, bsrSortedRowPtrC, + nnzTotalDevHostPtr); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsr2bsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim, + const cusparseMatDescr_t descrC, float *bsrSortedValC, + int 
*bsrSortedRowPtrC, int *bsrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, int, const cusparseMatDescr_t, + float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseScsr2bsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, blockDim, descrC, bsrSortedValC, + bsrSortedRowPtrC, bsrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2bsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim, + const cusparseMatDescr_t descrC, double *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, int, const cusparseMatDescr_t, + double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDcsr2bsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, blockDim, descrC, bsrSortedValC, + bsrSortedRowPtrC, bsrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2bsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim, + const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, int, + const cusparseMatDescr_t, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCcsr2bsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, blockDim, descrC, bsrSortedValC, + bsrSortedRowPtrC, bsrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2bsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int blockDim, + const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, + const cusparseMatDescr_t, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZcsr2bsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, blockDim, descrC, bsrSortedValC, + bsrSortedRowPtrC, bsrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseSbsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + const cusparseMatDescr_t descrC, float *csrSortedValC, + int 
*csrSortedRowPtrC, int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, int, const cusparseMatDescr_t, + float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSbsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, + bsrSortedColIndA, blockDim, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseDbsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + const cusparseMatDescr_t descrC, double *csrSortedValC, + int *csrSortedRowPtrC, int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, int, const cusparseMatDescr_t, + double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDbsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, + bsrSortedColIndA, blockDim, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseCbsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + const cusparseMatDescr_t descrC, cuComplex *csrSortedValC, + int *csrSortedRowPtrC, int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, int, + const cusparseMatDescr_t, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCbsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, + bsrSortedColIndA, blockDim, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseZbsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int blockDim, + const cusparseMatDescr_t descrC, cuDoubleComplex *csrSortedValC, + int *csrSortedRowPtrC, int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, + const cusparseMatDescr_t, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZbsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, + bsrSortedColIndA, blockDim, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSize( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const float *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI 
*)( + cusparseHandle_t, int, int, int, const float *, const int *, const int *, + int, int, int *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsc_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSize( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const double *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const double *, const int *, const int *, + int, int, int *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsc_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSize( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuComplex *, const int *, + const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsc_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSize( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *, + const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsc_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc_bufferSizeExt( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const float *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const float *, const int *, const int *, + int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseSgebsr2gebsc_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc_bufferSizeExt( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const double *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const double *, const int *, const int *, + int, int, size_t *); + static auto func_ptr = + 
LoadSymbol("cusparseDgebsr2gebsc_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc_bufferSizeExt( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuComplex *, const int *, + const int *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseCgebsr2gebsc_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc_bufferSizeExt( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *, + const int *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseZgebsr2gebsc_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsc( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const float *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, float *bscVal, + int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const float *, const int *, const int *, + int, int, float *, int *, int *, cusparseAction_t, cusparseIndexBase_t, + void *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd, + bscColPtr, copyValues, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsc( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const double *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + double *bscVal, int *bscRowInd, int *bscColPtr, cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const double *, const int *, const int *, + int, int, double *, int *, int *, cusparseAction_t, cusparseIndexBase_t, + void *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd, + bscColPtr, copyValues, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsc( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const cuComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, 
int colBlockDim, + cuComplex *bscVal, int *bscRowInd, int *bscColPtr, + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuComplex *, const int *, + const int *, int, int, cuComplex *, int *, int *, cusparseAction_t, + cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd, + bscColPtr, copyValues, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsc( + cusparseHandle_t handle, int mb, int nb, int nnzb, + const cuDoubleComplex *bsrSortedVal, const int *bsrSortedRowPtr, + const int *bsrSortedColInd, int rowBlockDim, int colBlockDim, + cuDoubleComplex *bscVal, int *bscRowInd, int *bscColPtr, + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cuDoubleComplex *, const int *, + const int *, int, int, cuDoubleComplex *, int *, int *, cusparseAction_t, + cusparseIndexBase_t, void *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsc"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, mb, nb, nnzb, bsrSortedVal, bsrSortedRowPtr, + bsrSortedColInd, rowBlockDim, colBlockDim, bscVal, bscRowInd, + bscColPtr, copyValues, idxBase, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseXgebsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, int rowBlockDim, int colBlockDim, + const cusparseMatDescr_t descrC, int *csrSortedRowPtrC, + int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const int *, const int *, int, int, const cusparseMatDescr_t, int *, + int *); + static auto func_ptr = LoadSymbol("cusparseXgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedRowPtrA, + bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, + csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim, + int colBlockDim, const cusparseMatDescr_t descrC, float *csrSortedValC, + int *csrSortedRowPtrC, int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, int, int, + const cusparseMatDescr_t, float *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, + bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim, 
+ int colBlockDim, const cusparseMatDescr_t descrC, double *csrSortedValC, + int *csrSortedRowPtrC, int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, int, int, + const cusparseMatDescr_t, double *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, + bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim, + int colBlockDim, const cusparseMatDescr_t descrC, cuComplex *csrSortedValC, + int *csrSortedRowPtrC, int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, int, int, + const cusparseMatDescr_t, cuComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, + bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2csr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDim, + int colBlockDim, const cusparseMatDescr_t descrC, + cuDoubleComplex *csrSortedValC, int *csrSortedRowPtrC, + int *csrSortedColIndC) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, int, + const cusparseMatDescr_t, cuDoubleComplex *, int *, int *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, descrA, bsrSortedValA, bsrSortedRowPtrA, + bsrSortedColIndA, rowBlockDim, colBlockDim, descrC, + csrSortedValC, csrSortedRowPtrC, csrSortedColIndC); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim, + int colBlockDim, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseScsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, rowBlockDim, colBlockDim, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int 
*csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim, + int colBlockDim, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseDcsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, rowBlockDim, colBlockDim, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim, + int colBlockDim, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseCcsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, rowBlockDim, colBlockDim, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim, + int colBlockDim, int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseZcsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, rowBlockDim, colBlockDim, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim, + int colBlockDim, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseScsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim, + int colBlockDim, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseDcsr2gebsr_bufferSizeExt"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim, + int colBlockDim, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseCcsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, int rowBlockDim, + int colBlockDim, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseZcsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, rowBlockDim, colBlockDim, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsr2gebsrNnz( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const int *csrSortedRowPtrA, + const int *csrSortedColIndA, const cusparseMatDescr_t descrC, + int *bsrSortedRowPtrC, int rowBlockDim, int colBlockDim, + int *nnzTotalDevHostPtr, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const int *, const int *, const cusparseMatDescr_t, int *, int, int, + int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcsr2gebsrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedRowPtrA, + csrSortedColIndA, descrC, bsrSortedRowPtrC, rowBlockDim, + colBlockDim, nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsr2gebsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, float *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim, + int colBlockDim, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const float *, const int *, const int *, const cusparseMatDescr_t, + float *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseScsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC, + bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer); +} + 
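The csr2gebsr wrappers above follow cuSPARSE's usual query-then-convert workflow: first ask for the scratch-buffer size, then compute the block-row pointer and the number of nonzero blocks, and only then run the conversion itself. A rough usage sketch of that three-phase sequence follows; it is not part of the patch, error checking is omitted, the default host pointer mode is assumed for nnzb, and the device arrays d_csrVal / d_csrRowPtr / d_csrColInd are assumed to already hold a valid m x n CSR matrix.

#include <cuda_runtime.h>
#include <cusparse.h>

// Sketch: convert a CSR matrix to general BSR via the cusparse*csr2gebsr*
// entry points wrapped above.
void CsrToGebsrSketch(cusparseHandle_t handle, cusparseMatDescr_t descrA,
                      cusparseMatDescr_t descrC, int m, int n,
                      const float* d_csrVal, const int* d_csrRowPtr,
                      const int* d_csrColInd, int rowBlockDim,
                      int colBlockDim) {
  cusparseDirection_t dir = CUSPARSE_DIRECTION_ROW;
  int mb = (m + rowBlockDim - 1) / rowBlockDim;  // number of block rows

  // Phase 1: query the scratch-buffer size and allocate it.
  int bufferSize = 0;
  cusparseScsr2gebsr_bufferSize(handle, dir, m, n, descrA, d_csrVal,
                                d_csrRowPtr, d_csrColInd, rowBlockDim,
                                colBlockDim, &bufferSize);
  void* d_buffer = nullptr;
  cudaMalloc(&d_buffer, bufferSize);

  // Phase 2: block-row pointer and number of nonzero blocks.
  int* d_bsrRowPtr = nullptr;
  cudaMalloc(&d_bsrRowPtr, sizeof(int) * (mb + 1));
  int nnzb = 0;
  cusparseXcsr2gebsrNnz(handle, dir, m, n, descrA, d_csrRowPtr, d_csrColInd,
                        descrC, d_bsrRowPtr, rowBlockDim, colBlockDim, &nnzb,
                        d_buffer);

  // Phase 3: allocate the BSR value/column arrays and convert.
  float* d_bsrVal = nullptr;
  int* d_bsrColInd = nullptr;
  cudaMalloc(&d_bsrVal, sizeof(float) * nnzb * rowBlockDim * colBlockDim);
  cudaMalloc(&d_bsrColInd, sizeof(int) * nnzb);
  cusparseScsr2gebsr(handle, dir, m, n, descrA, d_csrVal, d_csrRowPtr,
                     d_csrColInd, descrC, d_bsrVal, d_bsrRowPtr, d_bsrColInd,
                     rowBlockDim, colBlockDim, d_buffer);

  cudaFree(d_buffer);
  // d_bsrRowPtr / d_bsrVal / d_bsrColInd now hold the GEBSR matrix.
}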
+cusparseStatus_t CUSPARSEAPI cusparseDcsr2gebsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, double *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim, + int colBlockDim, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const double *, const int *, const int *, const cusparseMatDescr_t, + double *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseDcsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC, + bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2gebsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim, + int colBlockDim, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuComplex *, const int *, const int *, const cusparseMatDescr_t, + cuComplex *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseCcsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC, + bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2gebsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int m, int n, + const cusparseMatDescr_t descrA, const cuDoubleComplex *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const cusparseMatDescr_t descrC, cuDoubleComplex *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDim, + int colBlockDim, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, const cusparseMatDescr_t, + const cuDoubleComplex *, const int *, const int *, + const cusparseMatDescr_t, cuDoubleComplex *, int *, int *, int, int, + void *); + static auto func_ptr = LoadSymbol("cusparseZcsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, m, n, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, descrC, bsrSortedValC, bsrSortedRowPtrC, + bsrSortedColIndC, rowBlockDim, colBlockDim, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, int rowBlockDimC, int colBlockDimC, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const float *, const int *, const int *, int, + int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsr_bufferSize"); 
+ if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, int rowBlockDimC, int colBlockDimC, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const double *, const int *, const int *, int, + int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, int rowBlockDimC, int colBlockDimC, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const cuComplex *, const int *, const int *, + int, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSize( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, int rowBlockDimC, int colBlockDimC, + int *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const cuDoubleComplex *, const int *, + const int *, int, int, int, int, int *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsr_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const float *, const int *, const int *, int, + int, int, int, size_t *); + static auto func_ptr = + 
LoadSymbol("cusparseSgebsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const double *, const int *, const int *, int, + int, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseDgebsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const cuComplex *, const int *, const int *, + int, int, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseCgebsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr_bufferSizeExt( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, int rowBlockDimC, int colBlockDimC, size_t *pBufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const cuDoubleComplex *, const int *, + const int *, int, int, int, int, size_t *); + static auto func_ptr = + LoadSymbol("cusparseZgebsr2gebsr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, rowBlockDimC, colBlockDimC, pBufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseXgebsr2gebsrNnz( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const int *bsrSortedRowPtrA, + const int *bsrSortedColIndA, int rowBlockDimA, int colBlockDimA, + const cusparseMatDescr_t descrC, int *bsrSortedRowPtrC, int rowBlockDimC, + int colBlockDimC, int *nnzTotalDevHostPtr, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const int *, const int *, int, int, + const 
cusparseMatDescr_t, int *, int, int, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXgebsr2gebsrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedRowPtrA, + bsrSortedColIndA, rowBlockDimA, colBlockDimA, descrC, + bsrSortedRowPtrC, rowBlockDimC, colBlockDimC, + nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSgebsr2gebsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const float *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, const cusparseMatDescr_t descrC, float *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC, + int colBlockDimC, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const float *, const int *, const int *, int, + int, const cusparseMatDescr_t, float *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseSgebsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC, + bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDgebsr2gebsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const double *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, const cusparseMatDescr_t descrC, double *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC, + int colBlockDimC, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const double *, const int *, const int *, int, + int, const cusparseMatDescr_t, double *, int *, int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseDgebsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC, + bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCgebsr2gebsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const cuComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, const cusparseMatDescr_t descrC, cuComplex *bsrSortedValC, + int *bsrSortedRowPtrC, int *bsrSortedColIndC, int rowBlockDimC, + int colBlockDimC, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const cuComplex *, const int *, const int *, + int, int, const cusparseMatDescr_t, cuComplex *, int *, int *, int, int, + void *); + static auto func_ptr = LoadSymbol("cusparseCgebsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC, + bsrSortedColIndC, rowBlockDimC, 
colBlockDimC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZgebsr2gebsr( + cusparseHandle_t handle, cusparseDirection_t dirA, int mb, int nb, int nnzb, + const cusparseMatDescr_t descrA, const cuDoubleComplex *bsrSortedValA, + const int *bsrSortedRowPtrA, const int *bsrSortedColIndA, int rowBlockDimA, + int colBlockDimA, const cusparseMatDescr_t descrC, + cuDoubleComplex *bsrSortedValC, int *bsrSortedRowPtrC, + int *bsrSortedColIndC, int rowBlockDimC, int colBlockDimC, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseDirection_t, int, int, int, + const cusparseMatDescr_t, const cuDoubleComplex *, const int *, + const int *, int, int, const cusparseMatDescr_t, cuDoubleComplex *, int *, + int *, int, int, void *); + static auto func_ptr = LoadSymbol("cusparseZgebsr2gebsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, dirA, mb, nb, nnzb, descrA, bsrSortedValA, + bsrSortedRowPtrA, bsrSortedColIndA, rowBlockDimA, + colBlockDimA, descrC, bsrSortedValC, bsrSortedRowPtrC, + bsrSortedColIndC, rowBlockDimC, colBlockDimC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateIdentityPermutation(cusparseHandle_t handle, int n, int *p) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseHandle_t, int, int *); + static auto func_ptr = + LoadSymbol("cusparseCreateIdentityPermutation"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, n, p); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcoosort_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnz, const int *cooRowsA, + const int *cooColsA, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseXcoosort_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcoosortByRow(cusparseHandle_t handle, + int m, int n, int nnz, + int *cooRowsA, int *cooColsA, + int *P, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, int *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcoosortByRow"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcoosortByColumn(cusparseHandle_t handle, + int m, int n, int nnz, + int *cooRowsA, + int *cooColsA, int *P, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, int *, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcoosortByColumn"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, cooRowsA, cooColsA, P, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsrsort_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnz, const int *csrRowPtrA, + const int *csrColIndA, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseXcsrsort_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrRowPtrA, csrColIndA, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcsrsort(cusparseHandle_t handle, int m, + int n, int nnz, + 
const cusparseMatDescr_t descrA, + const int *csrRowPtrA, + int *csrColIndA, int *P, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *, + int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcsrsort"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrRowPtrA, csrColIndA, P, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcscsort_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnz, const int *cscColPtrA, + const int *cscRowIndA, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const int *, const int *, size_t *); + static auto func_ptr = LoadSymbol("cusparseXcscsort_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, cscColPtrA, cscRowIndA, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseXcscsort(cusparseHandle_t handle, int m, + int n, int nnz, + const cusparseMatDescr_t descrA, + const int *cscColPtrA, + int *cscRowIndA, int *P, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const int *, + int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseXcscsort"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, cscColPtrA, cscRowIndA, P, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsru2csr_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnz, float *csrVal, + const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, float *, const int *, int *, + csru2csrInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseScsru2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnz, double *csrVal, + const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, double *, const int *, int *, + csru2csrInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseDcsru2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnz, cuComplex *csrVal, + const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, cuComplex *, const int *, int *, + csru2csrInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCcsru2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnz, cuDoubleComplex *csrVal, + const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, + size_t *pBufferSizeInBytes) { + using FuncPtr = 
cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, cuDoubleComplex *, const int *, int *, + csru2csrInfo_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseZcsru2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, info, + pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsru2csr( + cusparseHandle_t handle, int m, int n, int nnz, + const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr, + int *csrColInd, csru2csrInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *, + const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsru2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsru2csr( + cusparseHandle_t handle, int m, int n, int nnz, + const cusparseMatDescr_t descrA, double *csrVal, const int *csrRowPtr, + int *csrColInd, csru2csrInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *, + const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsru2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsru2csr( + cusparseHandle_t handle, int m, int n, int nnz, + const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr, + int *csrColInd, csru2csrInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsru2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsru2csr( + cusparseHandle_t handle, int m, int n, int nnz, + const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal, + const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsru2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseScsr2csru( + cusparseHandle_t handle, int m, int n, int nnz, + const cusparseMatDescr_t descrA, float *csrVal, const int *csrRowPtr, + int *csrColInd, csru2csrInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, float *, + const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseScsr2csru"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDcsr2csru( + cusparseHandle_t handle, int m, int n, int nnz, + const cusparseMatDescr_t descrA, double *csrVal, const int 
*csrRowPtr, + int *csrColInd, csru2csrInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, double *, + const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseDcsr2csru"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCcsr2csru( + cusparseHandle_t handle, int m, int n, int nnz, + const cusparseMatDescr_t descrA, cuComplex *csrVal, const int *csrRowPtr, + int *csrColInd, csru2csrInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, cuComplex *, + const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseCcsr2csru"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseZcsr2csru( + cusparseHandle_t handle, int m, int n, int nnz, + const cusparseMatDescr_t descrA, cuDoubleComplex *csrVal, + const int *csrRowPtr, int *csrColInd, csru2csrInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, + cuDoubleComplex *, const int *, int *, csru2csrInfo_t, void *); + static auto func_ptr = LoadSymbol("cusparseZcsr2csru"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, descrA, csrVal, csrRowPtr, csrColInd, info, + pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const float *A, int lda, + const float *threshold, const cusparseMatDescr_t descrC, + const float *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, int, const float *, + const cusparseMatDescr_t, const float *, const int *, const int *, + size_t *); + static auto func_ptr = + LoadSymbol("cusparseSpruneDense2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const double *A, int lda, + const double *threshold, const cusparseMatDescr_t descrC, + const double *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, int, const double *, + const cusparseMatDescr_t, const double *, const int *, const int *, + size_t *); + static auto func_ptr = + LoadSymbol("cusparseDpruneDense2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnz( + cusparseHandle_t handle, int m, int n, const float *A, int lda, + const float *threshold, const cusparseMatDescr_t descrC, int *csrRowPtrC, + int *nnzTotalDevHostPtr, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, 
int, int, const float *, int, const float *, + const cusparseMatDescr_t, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneDense2csrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrRowPtrC, + nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnz( + cusparseHandle_t handle, int m, int n, const double *A, int lda, + const double *threshold, const cusparseMatDescr_t descrC, + int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, int, const double *, + const cusparseMatDescr_t, int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneDense2csrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedRowPtrC, + nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csr( + cusparseHandle_t handle, int m, int n, const float *A, int lda, + const float *threshold, const cusparseMatDescr_t descrC, + float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, int, const float *, + const cusparseMatDescr_t, float *, const int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneDense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csr( + cusparseHandle_t handle, int m, int n, const double *A, int lda, + const double *threshold, const cusparseMatDescr_t descrC, + double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, int, const double *, + const cusparseMatDescr_t, double *, const int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneDense2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, threshold, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const float *threshold, const cusparseMatDescr_t descrC, + const float *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, const float *, const cusparseMatDescr_t, + const float *, const int *, const int *, size_t *); + static auto func_ptr = + LoadSymbol("cusparseSpruneCsr2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, threshold, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + 
const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const double *threshold, const cusparseMatDescr_t descrC, + const double *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, const double *, const cusparseMatDescr_t, + const double *, const int *, const int *, size_t *); + static auto func_ptr = + LoadSymbol("cusparseDpruneCsr2csr_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, threshold, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnz( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const float *threshold, const cusparseMatDescr_t descrC, + int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, const float *, const cusparseMatDescr_t, int *, + int *, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneCsr2csrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, threshold, descrC, csrSortedRowPtrC, + nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnz( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const double *threshold, const cusparseMatDescr_t descrC, + int *csrSortedRowPtrC, int *nnzTotalDevHostPtr, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, const double *, const cusparseMatDescr_t, int *, + int *, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneCsr2csrNnz"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, threshold, descrC, csrSortedRowPtrC, + nnzTotalDevHostPtr, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csr( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const float *threshold, const cusparseMatDescr_t descrC, + float *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, const float *, const cusparseMatDescr_t, + float *, const int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseSpruneCsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, threshold, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csr( + cusparseHandle_t handle, int m, 
int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, + const double *threshold, const cusparseMatDescr_t descrC, + double *csrSortedValC, const int *csrSortedRowPtrC, int *csrSortedColIndC, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, const double *, const cusparseMatDescr_t, + double *, const int *, int *, void *); + static auto func_ptr = LoadSymbol("cusparseDpruneCsr2csr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, threshold, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const float *A, int lda, + float percentage, const cusparseMatDescr_t descrC, + const float *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, int, float, + const cusparseMatDescr_t, const float *, const int *, const int *, + pruneInfo_t, size_t *); + static auto func_ptr = + LoadSymbol("cusparseSpruneDense2csrByPercentage_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, int m, int n, const double *A, int lda, + float percentage, const cusparseMatDescr_t descrC, + const double *csrSortedValC, const int *csrSortedRowPtrC, + const int *csrSortedColIndC, pruneInfo_t info, size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, int, float, + const cusparseMatDescr_t, const double *, const int *, const int *, + pruneInfo_t, size_t *); + static auto func_ptr = + LoadSymbol("cusparseDpruneDense2csrByPercentage_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, int m, int n, const float *A, int lda, + float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC, + int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, int, float, + const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *); + static auto func_ptr = + LoadSymbol("cusparseSpruneDense2csrNnzByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC, + nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrNnzByPercentage( + cusparseHandle_t handle, int m, int n, const double *A, int lda, + float percentage, const cusparseMatDescr_t descrC, int *csrRowPtrC, + int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, int, 
float, + const cusparseMatDescr_t, int *, int *, pruneInfo_t, void *); + static auto func_ptr = + LoadSymbol("cusparseDpruneDense2csrNnzByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrRowPtrC, + nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneDense2csrByPercentage( + cusparseHandle_t handle, int m, int n, const float *A, int lda, + float percentage, const cusparseMatDescr_t descrC, float *csrSortedValC, + const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const float *, int, float, + const cusparseMatDescr_t, float *, const int *, int *, pruneInfo_t, + void *); + static auto func_ptr = + LoadSymbol("cusparseSpruneDense2csrByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneDense2csrByPercentage( + cusparseHandle_t handle, int m, int n, const double *A, int lda, + float percentage, const cusparseMatDescr_t descrC, double *csrSortedValC, + const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, const double *, int, float, + const cusparseMatDescr_t, double *, const int *, int *, pruneInfo_t, + void *); + static auto func_ptr = + LoadSymbol("cusparseDpruneDense2csrByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, A, lda, percentage, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage, + const cusparseMatDescr_t descrC, const float *csrSortedValC, + const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, float, const cusparseMatDescr_t, const float *, + const int *, const int *, pruneInfo_t, size_t *); + static auto func_ptr = + LoadSymbol("cusparseSpruneCsr2csrByPercentage_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, percentage, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage_bufferSizeExt( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage, + const cusparseMatDescr_t descrC, const double *csrSortedValC, + const int *csrSortedRowPtrC, const int *csrSortedColIndC, pruneInfo_t info, + size_t *pBufferSizeInBytes) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, float, const cusparseMatDescr_t, const double *, + const int *, const int *, 
pruneInfo_t, size_t *); + static auto func_ptr = + LoadSymbol("cusparseDpruneCsr2csrByPercentage_bufferSizeExt"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, percentage, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, info, pBufferSizeInBytes); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage, + const cusparseMatDescr_t descrC, int *csrSortedRowPtrC, + int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, float, const cusparseMatDescr_t, int *, int *, + pruneInfo_t, void *); + static auto func_ptr = + LoadSymbol("cusparseSpruneCsr2csrNnzByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, percentage, descrC, csrSortedRowPtrC, + nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrNnzByPercentage( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage, + const cusparseMatDescr_t descrC, int *csrSortedRowPtrC, + int *nnzTotalDevHostPtr, pruneInfo_t info, void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, float, const cusparseMatDescr_t, int *, int *, + pruneInfo_t, void *); + static auto func_ptr = + LoadSymbol("cusparseDpruneCsr2csrNnzByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, percentage, descrC, csrSortedRowPtrC, + nnzTotalDevHostPtr, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpruneCsr2csrByPercentage( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const float *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage, + const cusparseMatDescr_t descrC, float *csrSortedValC, + const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info, + void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const float *, + const int *, const int *, float, const cusparseMatDescr_t, float *, + const int *, int *, pruneInfo_t, void *); + static auto func_ptr = + LoadSymbol("cusparseSpruneCsr2csrByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, percentage, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseDpruneCsr2csrByPercentage( + cusparseHandle_t handle, int m, int n, int nnzA, + const cusparseMatDescr_t descrA, const double *csrSortedValA, + const int *csrSortedRowPtrA, const int *csrSortedColIndA, float percentage, + const cusparseMatDescr_t descrC, double *csrSortedValC, + const int *csrSortedRowPtrC, int *csrSortedColIndC, pruneInfo_t info, + 
void *pBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const cusparseMatDescr_t, const double *, + const int *, const int *, float, const cusparseMatDescr_t, double *, + const int *, int *, pruneInfo_t, void *); + static auto func_ptr = + LoadSymbol("cusparseDpruneCsr2csrByPercentage"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnzA, descrA, csrSortedValA, csrSortedRowPtrA, + csrSortedColIndA, percentage, descrC, csrSortedValC, + csrSortedRowPtrC, csrSortedColIndC, info, pBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2( + cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, + const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr, + int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, void *buffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const void *, const int *, const int *, + void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t, + cusparseCsr2CscAlg_t, void *); + static auto func_ptr = LoadSymbol("cusparseCsr2cscEx2"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, + cscColPtr, cscRowInd, valType, copyValues, idxBase, alg, + buffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseCsr2cscEx2_bufferSize( + cusparseHandle_t handle, int m, int n, int nnz, const void *csrVal, + const int *csrRowPtr, const int *csrColInd, void *cscVal, int *cscColPtr, + int *cscRowInd, cudaDataType valType, cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, size_t *bufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, int, int, int, const void *, const int *, const int *, + void *, int *, int *, cudaDataType, cusparseAction_t, cusparseIndexBase_t, + cusparseCsr2CscAlg_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseCsr2cscEx2_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, m, n, nnz, csrVal, csrRowPtr, csrColInd, cscVal, + cscColPtr, cscRowInd, valType, copyValues, idxBase, alg, + bufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateSpVec(cusparseSpVecDescr_t *spVecDescr, int64_t size, int64_t nnz, + void *indices, void *values, cusparseIndexType_t idxType, + cusparseIndexBase_t idxBase, cudaDataType valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpVecDescr_t *, int64_t, int64_t, void *, void *, + cusparseIndexType_t, cusparseIndexBase_t, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateSpVec"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase, + valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroySpVec(cusparseSpVecDescr_t spVecDescr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroySpVec"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpVecGet(cusparseSpVecDescr_t spVecDescr, + int64_t *size, int64_t *nnz, + void **indices, void **values, + cusparseIndexType_t *idxType, + cusparseIndexBase_t *idxBase, + cudaDataType *valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpVecDescr_t, int64_t *, int64_t *, void **, void **, + 
cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseSpVecGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, size, nnz, indices, values, idxType, idxBase, + valueType); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpVecGetIndexBase( + cusparseSpVecDescr_t spVecDescr, cusparseIndexBase_t *idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, + cusparseIndexBase_t *); + static auto func_ptr = LoadSymbol("cusparseSpVecGetIndexBase"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecGetValues(cusparseSpVecDescr_t spVecDescr, void **values) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, void **); + static auto func_ptr = LoadSymbol("cusparseSpVecGetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr, void *values) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpVecDescr_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpVecSetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spVecDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCreateDnVec(cusparseDnVecDescr_t *dnVecDescr, int64_t size, + void *values, cudaDataType valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseDnVecDescr_t *, int64_t, void *, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateDnVec"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr, size, values, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyDnVec(cusparseDnVecDescr_t dnVecDescr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroyDnVec"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr); +} + +cusparseStatus_t CUSPARSEAPI cusparseDnVecGet(cusparseDnVecDescr_t dnVecDescr, + int64_t *size, void **values, + cudaDataType *valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseDnVecDescr_t, int64_t *, void **, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseDnVecGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr, size, values, valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnVecGetValues(cusparseDnVecDescr_t dnVecDescr, void **values) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t, void **); + static auto func_ptr = LoadSymbol("cusparseDnVecGetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr, void *values) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnVecDescr_t, void *); + static auto func_ptr = LoadSymbol("cusparseDnVecSetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnVecDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroySpMat(cusparseSpMatDescr_t spMatDescr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroySpMat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpMatGetFormat( + cusparseSpMatDescr_t 
spMatDescr, cusparseFormat_t *format) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, cusparseFormat_t *); + static auto func_ptr = LoadSymbol("cusparseSpMatGetFormat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, format); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpMatGetIndexBase( + cusparseSpMatDescr_t spMatDescr, cusparseIndexBase_t *idxBase) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, + cusparseIndexBase_t *); + static auto func_ptr = LoadSymbol("cusparseSpMatGetIndexBase"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, idxBase); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetValues(cusparseSpMatDescr_t spMatDescr, void **values) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void **); + static auto func_ptr = LoadSymbol("cusparseSpMatGetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr, void *values) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpMatSetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetSize(cusparseSpMatDescr_t spMatDescr, int64_t *rows, + int64_t *cols, int64_t *nnz) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *); + static auto func_ptr = LoadSymbol("cusparseSpMatGetSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatSetStridedBatch(cusparseSpMatDescr_t spMatDescr, int batchCount) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int); + static auto func_ptr = LoadSymbol("cusparseSpMatSetStridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, batchCount); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpMatGetStridedBatch(cusparseSpMatDescr_t spMatDescr, int *batchCount) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, int *); + static auto func_ptr = LoadSymbol("cusparseSpMatGetStridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, batchCount); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateCsr( + cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz, + void *csrRowOffsets, void *csrColInd, void *csrValues, + cusparseIndexType_t csrRowOffsetsType, cusparseIndexType_t csrColIndType, + cusparseIndexBase_t idxBase, cudaDataType valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *, + cusparseIndexType_t, cusparseIndexType_t, cusparseIndexBase_t, + cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateCsr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd, + csrValues, csrRowOffsetsType, csrColIndType, idxBase, + valueType); +} + +cusparseStatus_t CUSPARSEAPI cusparseCsrGet( + cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols, int64_t *nnz, + void **csrRowOffsets, void **csrColInd, void **csrValues, + cusparseIndexType_t *csrRowOffsetsType, cusparseIndexType_t *csrColIndType, + cusparseIndexBase_t 
*idxBase, cudaDataType *valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **, + void **, cusparseIndexType_t *, cusparseIndexType_t *, + cusparseIndexBase_t *, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseCsrGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, csrRowOffsets, csrColInd, + csrValues, csrRowOffsetsType, csrColIndType, idxBase, + valueType); +} + +cusparseStatus_t CUSPARSEAPI +cusparseCsrSetPointers(cusparseSpMatDescr_t spMatDescr, void *csrRowOffsets, + void *csrColInd, void *csrValues) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpMatDescr_t, void *, + void *, void *); + static auto func_ptr = LoadSymbol("cusparseCsrSetPointers"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, csrRowOffsets, csrColInd, csrValues); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateCoo(cusparseSpMatDescr_t *spMatDescr, + int64_t rows, int64_t cols, + int64_t nnz, void *cooRowInd, + void *cooColInd, void *cooValues, + cusparseIndexType_t cooIdxType, + cusparseIndexBase_t idxBase, + cudaDataType valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, void *, + cusparseIndexType_t, cusparseIndexBase_t, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateCoo"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues, + cooIdxType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateCooAoS( + cusparseSpMatDescr_t *spMatDescr, int64_t rows, int64_t cols, int64_t nnz, + void *cooInd, void *cooValues, cusparseIndexType_t cooIdxType, + cusparseIndexBase_t idxBase, cudaDataType valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpMatDescr_t *, int64_t, int64_t, int64_t, void *, void *, + cusparseIndexType_t, cusparseIndexBase_t, cudaDataType); + static auto func_ptr = LoadSymbol("cusparseCreateCooAoS"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, cooIdxType, + idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI cusparseCooGet( + cusparseSpMatDescr_t spMatDescr, int64_t *rows, int64_t *cols, int64_t *nnz, + void **cooRowInd, // COO row indices + void **cooColInd, // COO column indices + void **cooValues, // COO values + cusparseIndexType_t *idxType, cusparseIndexBase_t *idxBase, + cudaDataType *valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **, + void **, cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *); + static auto func_ptr = LoadSymbol("cusparseCooGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, cooRowInd, cooColInd, cooValues, + idxType, idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI cusparseCooAoSGet(cusparseSpMatDescr_t spMatDescr, + int64_t *rows, int64_t *cols, + int64_t *nnz, + void **cooInd, // COO indices + void **cooValues, // COO values + cusparseIndexType_t *idxType, + cusparseIndexBase_t *idxBase, + cudaDataType *valueType) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseSpMatDescr_t, int64_t *, int64_t *, int64_t *, void **, void **, + cusparseIndexType_t *, cusparseIndexBase_t *, cudaDataType *); + static auto func_ptr = 
LoadSymbol("cusparseCooAoSGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(spMatDescr, rows, cols, nnz, cooInd, cooValues, idxType, + idxBase, valueType); +} + +cusparseStatus_t CUSPARSEAPI cusparseCreateDnMat( + cusparseDnMatDescr_t *dnMatDescr, int64_t rows, int64_t cols, int64_t ld, + void *values, cudaDataType valueType, cusparseOrder_t order) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseDnMatDescr_t *, int64_t, int64_t, int64_t, void *, cudaDataType, + cusparseOrder_t); + static auto func_ptr = LoadSymbol("cusparseCreateDnMat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, rows, cols, ld, values, valueType, order); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDestroyDnMat(cusparseDnMatDescr_t dnMatDescr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t); + static auto func_ptr = LoadSymbol("cusparseDestroyDnMat"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr); +} + +cusparseStatus_t CUSPARSEAPI cusparseDnMatGet(cusparseDnMatDescr_t dnMatDescr, + int64_t *rows, int64_t *cols, + int64_t *ld, void **values, + cudaDataType *type, + cusparseOrder_t *order) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseDnMatDescr_t, int64_t *, int64_t *, int64_t *, void **, + cudaDataType *, cusparseOrder_t *); + static auto func_ptr = LoadSymbol("cusparseDnMatGet"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, rows, cols, ld, values, type, order); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatGetValues(cusparseDnMatDescr_t dnMatDescr, void **values) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, void **); + static auto func_ptr = LoadSymbol("cusparseDnMatGetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, values); +} + +cusparseStatus_t CUSPARSEAPI +cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr, void *values) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, void *); + static auto func_ptr = LoadSymbol("cusparseDnMatSetValues"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, values); +} + +cusparseStatus_t CUSPARSEAPI cusparseDnMatSetStridedBatch( + cusparseDnMatDescr_t dnMatDescr, int batchCount, int64_t batchStride) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, int, int64_t); + static auto func_ptr = LoadSymbol("cusparseDnMatSetStridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, batchCount, batchStride); +} + +cusparseStatus_t CUSPARSEAPI cusparseDnMatGetStridedBatch( + cusparseDnMatDescr_t dnMatDescr, int *batchCount, int64_t *batchStride) { + using FuncPtr = + cusparseStatus_t(CUSPARSEAPI *)(cusparseDnMatDescr_t, int *, int64_t *); + static auto func_ptr = LoadSymbol("cusparseDnMatGetStridedBatch"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(dnMatDescr, batchCount, batchStride); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpVV_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opX, cusparseSpVecDescr_t vecX, + cusparseDnVecDescr_t vecY, const void *result, cudaDataType computeType, + size_t *bufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseSpVecDescr_t, + cusparseDnVecDescr_t, const void *, cudaDataType, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpVV_bufferSize"); + if (!func_ptr) return 
GetSymbolNotFoundError(); + return func_ptr(handle, opX, vecX, vecY, result, computeType, bufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpVV(cusparseHandle_t handle, cusparseOperation_t opX, + cusparseSpVecDescr_t vecX, cusparseDnVecDescr_t vecY, void *result, + cudaDataType computeType, void *externalBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseSpVecDescr_t, + cusparseDnVecDescr_t, void *, cudaDataType, void *); + static auto func_ptr = LoadSymbol("cusparseSpVV"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opX, vecX, vecY, result, computeType, externalBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpMV( + cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha, + cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX, const void *beta, + cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpMVAlg_t alg, + void *externalBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, + cusparseDnVecDescr_t, const void *, cusparseDnVecDescr_t, cudaDataType, + cusparseSpMVAlg_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpMV"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg, + externalBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpMV_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, const void *alpha, + cusparseSpMatDescr_t matA, cusparseDnVecDescr_t vecX, const void *beta, + cusparseDnVecDescr_t vecY, cudaDataType computeType, cusparseSpMVAlg_t alg, + size_t *bufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, const void *, cusparseSpMatDescr_t, + cusparseDnVecDescr_t, const void *, cusparseDnVecDescr_t, cudaDataType, + cusparseSpMVAlg_t, size_t *); + static auto func_ptr = LoadSymbol("cusparseSpMV_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, alpha, matA, vecX, beta, vecY, computeType, alg, + bufferSize); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpMM( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void *alpha, cusparseSpMatDescr_t matA, cusparseDnMatDescr_t matB, + const void *beta, cusparseDnMatDescr_t matC, cudaDataType computeType, + cusparseSpMMAlg_t alg, void *externalBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, + cusparseSpMatDescr_t, cusparseDnMatDescr_t, const void *, + cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, void *); + static auto func_ptr = LoadSymbol("cusparseSpMM"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, + alg, externalBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpMM_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void *alpha, cusparseSpMatDescr_t matA, cusparseDnMatDescr_t matB, + const void *beta, cusparseDnMatDescr_t matC, cudaDataType computeType, + cusparseSpMMAlg_t alg, size_t *bufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, + cusparseSpMatDescr_t, cusparseDnMatDescr_t, const void *, + cusparseDnMatDescr_t, cudaDataType, cusparseSpMMAlg_t, size_t *); + static auto func_ptr = 
LoadSymbol("cusparseSpMM_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, + alg, bufferSize); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_createDescr(cusparseSpGEMMDescr_t *descr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpGEMMDescr_t *); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_createDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descr); +} + +cusparseStatus_t CUSPARSEAPI +cusparseSpGEMM_destroyDescr(cusparseSpGEMMDescr_t descr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)(cusparseSpGEMMDescr_t); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_destroyDescr"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(descr); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_workEstimation( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void *alpha, cusparseSpMatDescr_t matA, cusparseSpMatDescr_t matB, + const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType, + cusparseSpGEMMAlg_t alg, cusparseSpGEMMDescr_t spgemmDescr, + size_t *bufferSize1, void *externalBuffer1) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, + cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *, + cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t, + cusparseSpGEMMDescr_t, size_t *, void *); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_workEstimation"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, + alg, spgemmDescr, bufferSize1, externalBuffer1); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_compute( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void *alpha, cusparseSpMatDescr_t matA, cusparseSpMatDescr_t matB, + const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType, + cusparseSpGEMMAlg_t alg, cusparseSpGEMMDescr_t spgemmDescr, + size_t *bufferSize2, void *externalBuffer2) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, + cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *, + cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t, + cusparseSpGEMMDescr_t, size_t *, void *); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_compute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, + alg, spgemmDescr, bufferSize2, externalBuffer2); +} + +cusparseStatus_t CUSPARSEAPI cusparseSpGEMM_copy( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void *alpha, cusparseSpMatDescr_t matA, cusparseSpMatDescr_t matB, + const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType, + cusparseSpGEMMAlg_t alg, cusparseSpGEMMDescr_t spgemmDescr) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, + cusparseSpMatDescr_t, cusparseSpMatDescr_t, const void *, + cusparseSpMatDescr_t, cudaDataType, cusparseSpGEMMAlg_t, + cusparseSpGEMMDescr_t); + static auto func_ptr = LoadSymbol("cusparseSpGEMM_copy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, + alg, spgemmDescr); +} + +cusparseStatus_t CUSPARSEAPI 
cusparseConstrainedGeMM( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void *alpha, cusparseDnMatDescr_t matA, cusparseDnMatDescr_t matB, + const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType, + void *externalBuffer) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, + cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *, + cusparseSpMatDescr_t, cudaDataType, void *); + static auto func_ptr = LoadSymbol("cusparseConstrainedGeMM"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, + externalBuffer); +} + +cusparseStatus_t CUSPARSEAPI cusparseConstrainedGeMM_bufferSize( + cusparseHandle_t handle, cusparseOperation_t opA, cusparseOperation_t opB, + const void *alpha, cusparseDnMatDescr_t matA, cusparseDnMatDescr_t matB, + const void *beta, cusparseSpMatDescr_t matC, cudaDataType computeType, + size_t *bufferSize) { + using FuncPtr = cusparseStatus_t(CUSPARSEAPI *)( + cusparseHandle_t, cusparseOperation_t, cusparseOperation_t, const void *, + cusparseDnMatDescr_t, cusparseDnMatDescr_t, const void *, + cusparseSpMatDescr_t, cudaDataType, size_t *); + static auto func_ptr = + LoadSymbol("cusparseConstrainedGeMM_bufferSize"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(handle, opA, opB, alpha, matA, matB, beta, matC, computeType, + bufferSize); +} + +} // extern "C" From aacff53fdd5cf5a567fb8fd5258c0894ee7d71e7 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Tue, 26 May 2020 23:22:09 -0700 Subject: [PATCH 1187/1533] Add missing dependency PiperOrigin-RevId: 313335824 Change-Id: Ic7c4c08fda21d7582ad4e70c1c23bd58067941f7 --- tensorflow/lite/experimental/delegates/coreml/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/experimental/delegates/coreml/BUILD b/tensorflow/lite/experimental/delegates/coreml/BUILD index c04aba65aa0..193f2e0223b 100644 --- a/tensorflow/lite/experimental/delegates/coreml/BUILD +++ b/tensorflow/lite/experimental/delegates/coreml/BUILD @@ -56,6 +56,7 @@ objc_library( "//tensorflow/lite:kernel_api", "//tensorflow/lite:minimal_logging", "//tensorflow/lite/c:common", + "//tensorflow/lite/delegates:utils", "//tensorflow/lite/experimental/delegates/coreml/builders:op_builder", ], ) From d8f3bd71ff01516f04e7bde9bd1a3f9f43119640 Mon Sep 17 00:00:00 2001 From: Sam Kaufman Date: Wed, 27 May 2020 00:03:51 -0700 Subject: [PATCH 1188/1533] Add missing `override` to mutable_dimensions to HloMapInstruction. PiperOrigin-RevId: 313339316 Change-Id: Ieaf34037be0b4312859eee9386e594c8ded277e1 --- tensorflow/compiler/xla/service/hlo_instructions.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 7f06c801e38..d84322c0977 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -706,7 +706,7 @@ class HloMapInstruction : public HloInstruction { // Returns the dimension sizes or numbers associated with this instruction. 
  const std::vector<int64>& dimensions() const override { return dimensions_; }
   int64 dimensions(int64 index) const override { return dimensions()[index]; }
-  std::vector<int64>* mutable_dimensions() { return &dimensions_; }
+  std::vector<int64>* mutable_dimensions() override { return &dimensions_; }
 
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;

From 2b0a611ac461e803da6548d401643e15dfa395bf Mon Sep 17 00:00:00 2001
From: Sam Kaufman
Date: Wed, 27 May 2020 00:05:04 -0700
Subject: [PATCH 1189/1533] Add mutable_padding_config to HloPadInstruction.

PiperOrigin-RevId: 313339474
Change-Id: If64214314f272c378ca49b15ac171863c04bdcd3
---
 tensorflow/compiler/xla/service/hlo_instruction.cc | 4 ++++
 tensorflow/compiler/xla/service/hlo_instruction.h | 1 +
 tensorflow/compiler/xla/service/hlo_instructions.h | 1 +
 3 files changed, 6 insertions(+)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 0aadd21d0a1..c02100debc3 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -3995,6 +3995,10 @@ const PaddingConfig& HloInstruction::padding_config() const {
   return Cast<HloPadInstruction>(this)->padding_config();
 }
 
+PaddingConfig* HloInstruction::mutable_padding_config() {
+  return Cast<HloPadInstruction>(this)->mutable_padding_config();
+}
+
 int64 HloInstruction::slice_sizes(int64 dimension) const {
   return Cast<HloDynamicSliceInstruction>(this)->slice_sizes(dimension);
 }
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index c6cfda8e505..7a5d506b681 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -1817,6 +1817,7 @@ class HloInstruction {
 
   // Delegates to HloPadInstruction::padding_config.
   const PaddingConfig& padding_config() const;
+  PaddingConfig* mutable_padding_config();
 
   // Delegates to HloDynamicSliceInstruction::slice_sizes.
   int64 slice_sizes(int64 dimension) const;
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index d84322c0977..6da01dc088e 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -1409,6 +1409,7 @@ class HloPadInstruction : public HloInstruction {
                              const PaddingConfig& padding_config);
   // Returns the padding configuration for a pad node.
   const PaddingConfig& padding_config() const { return padding_config_; }
+  PaddingConfig* mutable_padding_config() { return &padding_config_; }
   // Returns the padding value.
   const HloInstruction* padding_value() const { return operand(1); }
   HloInstruction* mutable_padding_value() { return mutable_operand(1); }

From 7cb09cd4203f519c4ca302b8858826c8a56eb139 Mon Sep 17 00:00:00 2001
From: Yifei Feng
Date: Wed, 27 May 2020 01:02:25 -0700
Subject: [PATCH 1190/1533] Fix infinite symlink expansion detected with bazel in Docker for gpu_pip_on_cpu/nightly build.
PiperOrigin-RevId: 313345563 Change-Id: I2244eba3d1abb373de5b402c1b20fd1fea172984 --- tensorflow/tools/ci_build/builds/docker_cpu_pip.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh b/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh index 3bb8d8b7afa..cf0036fb98f 100755 --- a/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh +++ b/tensorflow/tools/ci_build/builds/docker_cpu_pip.sh @@ -40,7 +40,7 @@ yes "" | python configure.py PIP_TEST_ROOT=pip_test_root mkdir -p ${PIP_TEST_ROOT} ln -s $(pwd)/tensorflow ${PIP_TEST_ROOT}/tensorflow -bazel test --define=no_tensorflow_py_deps=true \ +bazel --output_base=/tmp test --define=no_tensorflow_py_deps=true \ --test_lang_filters=py \ --build_tests_only \ -k \ From f50cb17d92635a6470342acba4b0b7c5b56df57c Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Wed, 27 May 2020 01:12:01 -0700 Subject: [PATCH 1191/1533] Add op sanity checks for RNN ops in TFLite - LSTMOp - UnidirectionalSequenceLSTMOp - UnidirectionalSequenceRNNOp PiperOrigin-RevId: 313346702 Change-Id: I668585e637d0513f97ff46ac9fd655a814d76303 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 138 +++++++++++------- .../mlir/lite/tests/flatbuffer2mlir/lstm.mlir | 6 +- .../mlir/lite/tests/mlir2flatbuffer/lstm.mlir | 20 +-- .../unidirectional_sequence_lstm.mlir | 52 +++---- .../unidirectional_sequence_rnn.mlir | 8 +- tensorflow/compiler/mlir/lite/tests/ops.mlir | 48 +++--- .../lite/tests/split-merged-operands.mlir | 36 ++--- 7 files changed, 173 insertions(+), 135 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index fb93bec5b56..923efdbaf9d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -254,6 +254,14 @@ class TFL_TFOperandTypesWithSameBits : Or<[CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isa()">, CPred<"getElementTypeOrSelf($_op.getOperand(" # j # ")).isUnsignedInteger(" # num # ")">]>]>; +class TFL_OperandIsNoneOrHasRank : + PredOpTrait<"operand " # n # " is " # m # "-D", + Or<[ + CPred<"$_op.getOperand(" # n # ").getType().isa()">, + TFL_OperandIsUnrankedPred, + CPred<"$_op.getOperand(" # n # + ").getType().cast().getRank() == " # m>]>>; + class TFL_OperandIsNoneOrHasRankAtMost : PredOpTrait<"operand " # n # " is at most " # m # "-D", Or<[ @@ -3539,6 +3547,19 @@ def TFL_LSTMOp : LstmOptionalPeepholeWeightConstraint, LstmProjectionWeightBiasConstraint, LstmResultConstraint, + TFL_OperandHasRank<2, 2>, // input_to_forget_weights + TFL_OperandHasRank<3, 2>, // input_to_cell_weights + TFL_OperandIsNoneOrHasRank<5, 2>, // recurrent_to_input_weights + TFL_OperandHasRank<6, 2>, // recurrent_to_forget_weights + TFL_OperandHasRank<7, 2>, // recurrent_to_cell_weights + TFL_OperandIsNoneOrHasRank<9, 1>, // cell_to_input_weights + TFL_OperandIsNoneOrHasRank<10, 1>, // cell_to_forget_weights + TFL_OperandIsNoneOrHasRank<11, 1>, // cell_to_output_weights + TFL_OperandHasRank<13, 1>, // forget_gate_bias + TFL_OperandHasRank<14, 1>, // cell_gate_bias + TFL_OperandHasRank<15, 1>, // output_gate_bias + TFL_OperandIsNoneOrHasRank<16, 2>, // projection_weights + TFL_OperandIsNoneOrHasRank<17, 1>, // projection_bias TFL_StatefulOp]> { let summary = "The full lstm operator"; @@ -3565,23 +3586,23 @@ Ba et al. 
'Layer Normalization' ins TFL_TensorOf<[F32, QI8]>:$input, // Weights - TFL_TensorOfOrNone<[F32, I8, QI8]>:$input_to_input_weights, - TFL_TensorOf<[F32, I8, QI8]>:$input_to_forget_weights, - TFL_TensorOf<[F32, I8, QI8]>:$input_to_cell_weights, - TFL_TensorOf<[F32, I8, QI8]>:$input_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$input_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_forget_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_cell_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_output_weights, // Recurrent weights - TFL_TensorOfOrNone<[F32, I8, QI8]>:$recurrent_to_input_weights, - TFL_TensorOf<[F32, I8, QI8]>:$recurrent_to_forget_weights, - TFL_TensorOf<[F32, I8, QI8]>:$recurrent_to_cell_weights, - TFL_TensorOf<[F32, I8, QI8]>:$recurrent_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$recurrent_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_forget_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_cell_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_output_weights, // Cell weights - TFL_TensorOfOrNone<[F32, I8, QI16]>:$cell_to_input_weights, + TFL_TensorOfOrNone<[F32, QI8, QI16]>:$cell_to_input_weights, // Optional input - TFL_TensorOfOrNone<[F32, I8, QI16]>:$cell_to_forget_weights, + TFL_TensorOfOrNone<[F32, QI8, QI16]>:$cell_to_forget_weights, // Optional input - TFL_TensorOfOrNone<[F32, I8, QI16]>:$cell_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8, QI16]>:$cell_to_output_weights, // Bias TFL_TensorOfOrNone<[F32, QI32]>:$input_gate_bias, @@ -3590,7 +3611,7 @@ Ba et al. 'Layer Normalization' TFL_TensorOf<[F32, QI32]>:$output_gate_bias, // Projection weight and bias - TFL_TensorOfOrNone<[F32, I8, QI8]>:$projection_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$projection_weights, // Optional input TFL_TensorOfOrNone<[F32, QI32]>:$projection_bias, @@ -3606,8 +3627,8 @@ Ba et al. 'Layer Normalization' // Attributes TFL_AFAttr:$fused_activation_function, - DefaultValuedAttr:$cell_clip, - DefaultValuedAttr:$proj_clip, + Confined, [TFL_FloatNonNegative]>:$cell_clip, + Confined, [TFL_FloatNonNegative]>:$proj_clip, // Since this op is the FULL kernel only, constrain it. 
Confined< DefaultValuedAttr, @@ -3647,6 +3668,24 @@ def TFL_UnidirectionalSequenceLSTMOp : LstmOptionalPeepholeWeightConstraint, LstmProjectionWeightBiasConstraint, LstmResultConstraint, + TFL_OperandHasRankAtLeast<0, 2>, // input + TFL_OperandIsNoneOrHasRank<1, 2>, // input_to_input_weights + TFL_OperandHasRank<2, 2>, // input_to_forget_weights + TFL_OperandHasRank<3, 2>, // input_to_cell_weights + TFL_OperandHasRank<4, 2>, // input_to_output_weights + TFL_OperandIsNoneOrHasRank<5, 2>, // recurrent_to_input_weights + TFL_OperandHasRank<6, 2>, // recurrent_to_forget_weights + TFL_OperandHasRank<7, 2>, // recurrent_to_cell_weights + TFL_OperandHasRank<8, 2>, // recurrent_to_output_weights + TFL_OperandIsNoneOrHasRank<9, 1>, // cell_to_input_weights + TFL_OperandIsNoneOrHasRank<10, 1>, // cell_to_forget_weights + TFL_OperandIsNoneOrHasRank<11, 1>, // cell_to_output_weights + TFL_OperandIsNoneOrHasRank<12, 1>, // input_gate_bias + TFL_OperandHasRank<13, 1>, // forget_gate_bias + TFL_OperandHasRank<14, 1>, // cell_gate_bias + TFL_OperandHasRank<15, 1>, // output_gate_bias + TFL_OperandIsNoneOrHasRank<16, 2>, // projection_weights + TFL_OperandIsNoneOrHasRank<17, 2>, // projection_bias TFL_StatefulOp]> { let summary = "Unidirectional sequence lstm operator"; @@ -3662,35 +3701,35 @@ def TFL_UnidirectionalSequenceLSTMOp : }]; let arguments = ( - ins TFL_TensorOf<[F32, I8]>:$input, + ins TFL_FpTensor:$input, // Weights - TFL_TensorOfOrNone<[F32, I8]>:$input_to_input_weights, - TFL_TensorOf<[F32, I8]>:$input_to_forget_weights, - TFL_TensorOf<[F32, I8]>:$input_to_cell_weights, - TFL_TensorOf<[F32, I8]>:$input_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$input_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_forget_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_cell_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_output_weights, // Recurrent weights - TFL_TensorOfOrNone<[F32, I8]>:$recurrent_to_input_weights, - TFL_TensorOf<[F32, I8]>:$recurrent_to_forget_weights, - TFL_TensorOf<[F32, I8]>:$recurrent_to_cell_weights, - TFL_TensorOf<[F32, I8]>:$recurrent_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$recurrent_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_forget_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_cell_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_output_weights, // Cell weights - TFL_TensorOfOrNone<[F32, I8]>:$cell_to_input_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$cell_to_input_weights, // Optional input - TFL_TensorOfOrNone<[F32, I8]>:$cell_to_forget_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$cell_to_forget_weights, // Optional input - TFL_TensorOfOrNone<[F32, I8]>:$cell_to_output_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$cell_to_output_weights, // Bias TFL_TensorOfOrNone<[F32]>:$input_gate_bias, - TFL_TensorOf<[F32]>:$forget_gate_bias, - TFL_TensorOf<[F32]>:$cell_bias, - TFL_TensorOf<[F32]>:$output_gate_bias, + TFL_FpTensor:$forget_gate_bias, + TFL_FpTensor:$cell_bias, + TFL_FpTensor:$output_gate_bias, // Projection weight and bias - TFL_TensorOfOrNone<[F32, I8]>:$projection_weights, + TFL_TensorOfOrNone<[F32, QI8]>:$projection_weights, // Optional input TFL_TensorOfOrNone<[F32]>:$projection_bias, @@ -3699,19 +3738,19 @@ def TFL_UnidirectionalSequenceLSTMOp : TFL_StatefulTensor:$input_cell_state, // Layer norm coefficients - TFL_TensorOfOrNone<[F32, I8]>:$input_layer_norm_coefficients, - TFL_TensorOfOrNone<[F32, I8]>:$forget_layer_norm_coefficients, - TFL_TensorOfOrNone<[F32, I8]>:$cell_layer_norm_coefficients, - TFL_TensorOfOrNone<[F32, 
I8]>:$output_layer_norm_coefficients, + TFL_TensorOfOrNone<[F32, QI8]>:$input_layer_norm_coefficients, + TFL_TensorOfOrNone<[F32, QI8]>:$forget_layer_norm_coefficients, + TFL_TensorOfOrNone<[F32, QI8]>:$cell_layer_norm_coefficients, + TFL_TensorOfOrNone<[F32, QI8]>:$output_layer_norm_coefficients, // Attributes TFL_AFAttr:$fused_activation_function, - DefaultValuedAttr:$cell_clip, - DefaultValuedAttr:$proj_clip, + Confined, [TFL_FloatNonNegative]>:$cell_clip, + Confined, [TFL_FloatNonNegative]>:$proj_clip, BoolAttr:$time_major ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QI8]>:$output); let hasOptions = 1; @@ -3908,15 +3947,14 @@ def TFL_BidirectionalSequenceLSTMOp : }]; } -def RnnResultConstraint : PredOpTrait< - "the input and result tensor elemental types must be same", - TCresVTEtIsSameAsOp<0, 0>>; - // UnidirectionalSequenceRNN op. -def TFL_UnidirectionalSequenceRNNOp : - TFL_Op<"unidirectional_sequence_rnn", - [RnnResultConstraint, TFL_StatefulOp]> { - +def TFL_UnidirectionalSequenceRNNOp : TFL_Op<"unidirectional_sequence_rnn", [ + TFL_OperandHasRank<4, 2>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + PredOpTrait<"input and constant value operands must have same element type", + TFL_TCopVTEtAreSameAt<1, 2>>, + TFL_StatefulOp]> { let summary = "Unidirectional sequence rnn operator"; let description = [{ @@ -3933,16 +3971,16 @@ def TFL_UnidirectionalSequenceRNNOp : }]; let arguments = ( - ins TFL_TensorOf<[F32, I8]>:$input, + ins TFL_FpTensor:$input, // Weights - TFL_TensorOf<[F32, I8]>:$input_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$input_to_input_weights, // Recurrent weights - TFL_TensorOf<[F32, I8]>:$recurrent_to_input_weights, + TFL_TensorOf<[F32, QI8]>:$recurrent_to_input_weights, // Bias - TFL_TensorOf<[F32]>:$input_gate_bias, + TFL_FpTensor:$input_gate_bias, // Hidden state. 
TFL_StatefulTensor:$hidden_state, @@ -3952,7 +3990,7 @@ def TFL_UnidirectionalSequenceRNNOp : TFL_AFAttr:$fused_activation_function ); - let results = (outs TFL_TensorOf<[F32, I8]>:$output); + let results = (outs TFL_FpTensor:$output); let hasOptions = 1; diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir index 0dd8ddc4c91..d793ea2d62f 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/lstm.mlir @@ -1,15 +1,15 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_translate --tflite-flatbuffer-to-mlir - -o - | FileCheck --dump-input-on-failure %s // Ensure lstm roundtrip exactly -func @main(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4x4xf32>, %arg10: tensor<4x4xf32>, %arg11: tensor<4x4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<1x4xf32>, %arg14: tensor<1x4xf32>, %arg15: tensor<1x4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<1x4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>) -> tensor<1x4xf32> { +func @main(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>) -> tensor<1x4xf32> { %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") - %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, 
tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> return %24 : tensor<1x4xf32> // CHECK-LABEL: main // seperate lines since there is no region for this op. third_party/tensorflow/compiler/mlir/lite/ir/tfl_ops.td: 3252 // CHECK: %[[RES0:.*]] = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg22, %arg23, %arg18, %arg19, %arg20, %arg21) ( { -// CHECK: }) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> +// CHECK: }) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> // CHECK: return %[[RES0]] } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir index e278572cd1e..ef78f993cc4 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir @@ -1,6 +1,6 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s -func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> { +func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { @@ -72,21 +72,21 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 10, // CHECK-NEXT: name: "arg9", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 11, // CHECK-NEXT: name: "arg10", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 12, // CHECK-NEXT: 
name: "arg11", // CHECK-NEXT: quantization: { @@ -100,21 +100,21 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 14, // CHECK-NEXT: name: "arg13", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 15, // CHECK-NEXT: name: "arg14", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 16, // CHECK-NEXT: name: "arg15", // CHECK-NEXT: quantization: { @@ -128,7 +128,7 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 1, 4 ], +// CHECK-NEXT: shape: [ 4 ], // CHECK-NEXT: buffer: 18, // CHECK-NEXT: name: "arg17", // CHECK-NEXT: quantization: { @@ -261,9 +261,9 @@ func @main(tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, t // CHECK-EMPTY: -^bb0(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4x4xf32>, %arg10: tensor<4x4xf32>, %arg11: tensor<4x4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<1x4xf32>, %arg14: tensor<1x4xf32>, %arg15: tensor<1x4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<1x4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>): +^bb0(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>): %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") - %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, 
tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> return %24 : tensor<1x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir index 8e579421b0b..d9bba58b7d7 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir @@ -1,6 +1,6 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s -func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { +func @main(tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> { // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { @@ -9,63 +9,63 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-NEXT: } ], // CHECK-NEXT: subgraphs: [ { // CHECK-NEXT: tensors: [ { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 1, // CHECK-NEXT: name: "arg0", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 2, // CHECK-NEXT: name: "arg1", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 3, // CHECK-NEXT: name: "arg2", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 4, // CHECK-NEXT: name: "arg3", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 5, // CHECK-NEXT: name: "arg4", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 6, // CHECK-NEXT: name: "arg5", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 7, // CHECK-NEXT: name: "arg6", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 8, // CHECK-NEXT: name: "arg7", 
// CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 9, // CHECK-NEXT: name: "arg8", // CHECK-NEXT: quantization: { @@ -121,63 +121,63 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 17, // CHECK-NEXT: name: "arg16", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 18, // CHECK-NEXT: name: "arg17", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 19, // CHECK-NEXT: name: "arg18", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 20, // CHECK-NEXT: name: "arg19", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 21, // CHECK-NEXT: name: "arg20", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 22, // CHECK-NEXT: name: "arg21", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: name: "Const", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: }, // CHECK-NEXT: is_variable: true // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: name: "Const1", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: }, // CHECK-NEXT: is_variable: true // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: buffer: 25, // CHECK-NEXT: name: "tfl.unidirectional_sequence_lstm", // CHECK-NEXT: quantization: { @@ -244,9 +244,9 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { @@ -259,9 +259,9 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, t // CHECK-NEXT: } // CHECK-EMPTY: -^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x 
f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg18: tensor<4 x f32>, %arg19: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>): - %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %2 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %0, %1, %arg18, %arg19, %arg20, %arg21) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %2 : tensor<4xf32> +^bb0(%arg0: tensor<4x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4x4xf32>, %arg18: tensor<4x4xf32>, %arg19: tensor<4x4xf32>, %arg20: tensor<4x4xf32>, %arg21: tensor<4x4xf32>): + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %2 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %0, %1, %arg18, %arg19, %arg20, %arg21) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %2 : tensor<4x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir index 7ba24bd5c51..f2b99bcd0df 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_rnn.mlir @@ -37,7 +37,7 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-EMPTY: // CHECK-NEXT: } // CHECK-NEXT: }, { -// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: shape: [ 4, 4 ], // CHECK-NEXT: name: "Const", // CHECK-NEXT: quantization: { // CHECK-EMPTY: @@ -76,7 +76,7 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { -// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] +// CHECK-NEXT: data: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ] // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { @@ -90,7 +90,7 @@ func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) - // CHECK-EMPTY: ^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>): - %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %1 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %0) {fused_activation_function = "TANH", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %1 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %0) {fused_activation_function = "TANH", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>) -> tensor<4xf32> return %1 : tensor<4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 981f08d277e..3451f28380b 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -581,36 +581,36 @@ func @testLogisticWithWrongInputType(tensor) -> tensor { // ----- // CHECK-LABEL: testUnidirectionalSequenceRnn -func @testUnidirectionalSequenceRnn(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor +func @testUnidirectionalSequenceRnn(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_rnn"(%arg0, %arg1, %arg2, %arg3, %arg4) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } // ----- // CHECK-LABEL: testUnidirectionalSequenceLstmWithoutProjection -func @testUnidirectionalSequenceLstmWithoutProjection(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: none, %arg17: none, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, 
%arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor +func @testUnidirectionalSequenceLstmWithoutProjection(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: none, %arg17: none, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, none, none, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } // ----- // CHECK-LABEL: testUnidirectionalSequenceLstm -func @testUnidirectionalSequenceLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor +func @testUnidirectionalSequenceLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: 
tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {fused_activation_function = "NONE", time_major = false} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } // ----- // CHECK-LABEL: testUnidirectionalSequenceLstmWithNoneTypeAndOverrideAttr -func @testUnidirectionalSequenceLstmWithNoneTypeAndOverrideAttr(%arg0: tensor, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { - // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor +func @testUnidirectionalSequenceLstmWithNoneTypeAndOverrideAttr(%arg0: tensor, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { + // CHECK: "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, 
tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", time_major = false} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -663,10 +663,10 @@ func @testLstmQuantizedType(%arg0: tensor<1x528x!quant.uniform, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { +func @testLstm(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) - // CHECK-NEXT: {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK-NEXT: {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -689,10 +689,10 @@ func @testQuantizedBasicLstm(%arg0: tensor<1x384x!quant.uniform, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, 
%arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { +func @testLstmWithNoneTypeAndOverrideAttr(%arg0: tensor, %arg1: none, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor, %arg14: tensor, %arg15: tensor, %arg16: tensor, %arg17: tensor, %arg18: tensor, %arg19: tensor, %arg20: tensor, %arg21: tensor, %arg22: tensor, %arg23: tensor) -> tensor { // CHECK: "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) - // CHECK-NEXT: {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor - %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + // CHECK-NEXT: {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor + %0 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %arg18, %arg19, %arg20, %arg21, %arg22, %arg23) ({}) {cell_clip = 1.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor, none, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) -> tensor return %0 : tensor } @@ -707,11 +707,11 @@ func @testLstmWithInvalidNoneType(%arg0: tensor, %arg1: tensor // ----- -// test invalid input dimension, the first input operand for lstm op should be at least 2D tensor. +// test invalid input dimension, the third input operand for lstm op should be 2-D tensor. 
func @testLstmWithInvalidInputDimension(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg18: tensor<4 x f32>, %arg19: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>) -> tensor<4 x f32> { %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - // expected-error @+1 {{'tfl.lstm' op the first input operand should have more than 2 dimensions.}} + // expected-error @+1 {{'tfl.lstm' op failed to verify that operand 2 is 2-D}} %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> return %24 : tensor<4xf32> @@ -720,22 +720,22 @@ func @testLstmWithInvalidInputDimension(%arg0: tensor<4 x f32>, %arg1: tensor<4 // ----- // 'input_to_output_weights' input for lstm op has unmatched rank with `input`. 
-func @testLstmWithInvalidInputsRankMatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4x2xf32>, %arg2: tensor<4x2xf32>, %arg3: tensor<4x2xf32>, %arg4: tensor<4x2xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4x4xf32>, %arg10: tensor<4x4xf32>, %arg11: tensor<4x4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<1x4xf32>, %arg14: tensor<1x4xf32>, %arg15: tensor<1x4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<1x4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>) -> tensor<1x4xf32> { +func @testLstmWithInvalidInputsRankMatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4x2xf32>, %arg2: tensor<4x2xf32>, %arg3: tensor<4x2xf32>, %arg4: tensor<4x2xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4xf32>, %arg18: tensor<4xf32>, %arg19: tensor<4xf32>, %arg20: tensor<4xf32>, %arg21: tensor<4xf32>) -> tensor<1x4xf32> { %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") // expected-error @+1 {{'tfl.lstm' op inputs don't match with the dimensions.}} - %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<1x4xf32> return %24 : tensor<1x4xf32> } // ----- // Coefficient inputs of LSTM op don't match the dimension with input operand `input_to_output_weights`. 
-func @testLstmWithInvalidInputsRankMatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4x4xf32>, %arg10: tensor<4x4xf32>, %arg11: tensor<4x4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<1x4xf32>, %arg14: tensor<1x4xf32>, %arg15: tensor<1x4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<1x4xf32>, %arg18: tensor<3xf32>, %arg19: tensor<3xf32>, %arg20: tensor<3xf32>, %arg21: tensor<3xf32>) -> tensor<1x4xf32> { +func @testLstmWithInvalidInputsRankMatch(%arg0: tensor<1x4xf32>, %arg1: tensor<4x4xf32>, %arg2: tensor<4x4xf32>, %arg3: tensor<4x4xf32>, %arg4: tensor<4x4xf32>, %arg5: tensor<4x4xf32>, %arg6: tensor<4x4xf32>, %arg7: tensor<4x4xf32>, %arg8: tensor<4x4xf32>, %arg9: tensor<4xf32>, %arg10: tensor<4xf32>, %arg11: tensor<4xf32>, %arg12: tensor<1x4xf32>, %arg13: tensor<4xf32>, %arg14: tensor<4xf32>, %arg15: tensor<4xf32>, %arg16: tensor<4x4xf32>, %arg17: tensor<4xf32>, %arg18: tensor<3xf32>, %arg19: tensor<3xf32>, %arg20: tensor<3xf32>, %arg21: tensor<3xf32>) -> tensor<1x4xf32> { %cst0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") %cst1 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<1x4xf32>} : () -> tensor<1x4xf32> loc("Const") // expected-error @+1 {{'tfl.lstm' op coefficient inputs have more than 2 dimensions or don't match the dimension with input operand `input_to_output_weights`.}} - %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<4x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>) -> tensor<1x4xf32> + %24 = "tfl.lstm"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9, %arg10, %arg11, %arg12, %arg13, %arg14, %arg15, %arg16, %arg17, %cst0, %cst1, %arg18, %arg19, %arg20, %arg21) ({}) {cell_clip = 0.000000e+00 : f32, fused_activation_function = "NONE", kernel_type = "FULL", proj_clip = 0.000000e+00 : f32} : (tensor<1x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<1x4xf32>, tensor<1x4xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>, tensor<3xf32>) -> tensor<1x4xf32> return %24 : tensor<1x4xf32> } diff --git a/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir b/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir index d2d0e43e0e9..c5c9ee645f4 100644 --- a/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir +++ b/tensorflow/compiler/mlir/lite/tests/split-merged-operands.mlir @@ -1,27 +1,27 @@ // RUN: tf-opt -tfl-split-merged-operands %s | FileCheck %s -func @testSingleLstm(%arg0: tensor<4 x f32>) -> tensor<4xf32> { +func @testSingleLstm(%arg0: 
tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { // CHECK-LABEL: testSingleLstm - // CHECK: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[LSTM:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[LSTM:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> - %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %1 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %1 : tensor<4xf32> + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %1 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %1 : tensor<4x4xf32> } 
-func @testMultipleLstms(%arg0: tensor<4 x f32>) -> tensor<4xf32> { +func @testMultipleLstms(%arg0: tensor<4x4xf32>, %arg1: tensor<4xf32>) -> tensor<4x4xf32> { // CHECK-LABEL: testMultipleLstms - // CHECK: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[LSTM_1:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - // CHECK: %[[CST_2:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[CST_3:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4xf32>} : () -> tensor<4xf32> - // CHECK: %[[LSTM_2:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%[[LSTM_1]], %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %[[CST_2]], %[[CST_3]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + // CHECK: %[[CST_0:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[CST_1:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[LSTM_1:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %[[CST_0]], %[[CST_1]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + // CHECK: %[[CST_2:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[CST_3:.*]] = "tfl.pseudo_const"() {value = dense<0.000000e+00> : tensor<4x4xf32>} : () -> tensor<4x4xf32> + // CHECK: %[[LSTM_2:[a-z0-9]*]] = "tfl.unidirectional_sequence_lstm"(%[[LSTM_1]], %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %[[CST_2]], %[[CST_3]], %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} 
: (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> - %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32> loc("Const") - %1 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - %2 = "tfl.unidirectional_sequence_lstm"(%1, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> - return %2 : tensor<4xf32> + %0 = "tfl.pseudo_const" () {value = dense<0.0> : tensor<4x4xf32>} : () -> tensor<4x4xf32> loc("Const") + %1 = "tfl.unidirectional_sequence_lstm"(%arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + %2 = "tfl.unidirectional_sequence_lstm"(%1, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg0, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg1, %arg0, %arg0, %0, %0, %arg0, %arg0, %arg0, %arg0) {fused_activation_function = "NONE", time_major = true} : (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + return %2 : tensor<4x4xf32> } From 698bcf78c4ace7645b447aa49711da9fe8bdbd71 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 27 May 2020 02:02:43 -0700 Subject: [PATCH 1192/1533] compat: Update forward compatibility horizon to 2020-05-27 PiperOrigin-RevId: 313351216 Change-Id: I4097df617d325ccdceae667a9e0f921087a24f0e --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 927256bc55d..53545c58a2d 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 26) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 27) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 1ac5f274538957e1e3d5d457143cfda99418b9e9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 02:02:46 -0700 Subject: [PATCH 1193/1533] Update GraphDef version to 414. PiperOrigin-RevId: 313351224 Change-Id: Idf6c068e73325b0ca50236e785786981193f36bd --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index b02f78a9dc3..7131d1f7227 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 413 // Updated: 2020/5/26 +#define TF_GRAPH_DEF_VERSION 414 // Updated: 2020/5/27 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 9084090e8d785536b9eabb45d2adbac42466b683 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 02:06:21 -0700 Subject: [PATCH 1194/1533] Fix tensorflow::errors:* calls, which use StrCat instead of StrFormat PiperOrigin-RevId: 313351641 Change-Id: I4ac63354e4dd845cad9c2e720a3ac8d3ed2c0dab --- .../compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index 0effcdc5e4e..a1401323e89 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -254,7 +254,7 @@ Status DumpOpGraphToFile(mlir::ModuleOp module, const std::string& filename) { std::string error_message; auto output = mlir::openOutputFile(filename, &error_message); if (!error_message.empty()) { - return errors::InvalidArgument("Failed to open file in %s.", filename); + return errors::InvalidArgument("Failed to open file in ", filename); } mlir::PassManager pm(module.getContext()); pm.addPass(mlir::createPrintOpGraphPass(output->os())); From 7738c1818eaecd58a2eb822fdcd2fb4463bacc1b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 02:22:10 -0700 Subject: [PATCH 1195/1533] Roll forward API for automatic fallback on delegation failure. 
PiperOrigin-RevId: 313353207 Change-Id: I0f7824ecc5421a179c10a6de4fc5192e9815abb7 --- tensorflow/lite/BUILD | 4 +- tensorflow/lite/core/subgraph.cc | 7 ++ tensorflow/lite/core/subgraph.h | 6 ++ tensorflow/lite/delegates/BUILD | 11 +++ tensorflow/lite/delegates/delegate_test.cc | 73 ++++++++++++++++++- .../lite/delegates/interpreter_utils.cc | 65 +++++++++++++++++ tensorflow/lite/delegates/interpreter_utils.h | 52 +++++++++++++ tensorflow/lite/delegates/utils.h | 2 + tensorflow/lite/interpreter.cc | 4 + tensorflow/lite/interpreter.h | 13 ++++ 10 files changed, 234 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/delegates/interpreter_utils.cc create mode 100644 tensorflow/lite/delegates/interpreter_utils.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index ef25f03562f..6477c0491f9 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -376,7 +376,9 @@ cc_test( cc_test( name = "interpreter_test", size = "small", - srcs = ["interpreter_test.cc"], + srcs = [ + "interpreter_test.cc", + ], features = ["-dynamic_link_test_srcs"], # see go/dynamic_link_test_srcs tags = [ "tflite_not_portable_ios", # TODO(b/117786830) diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 7f4e0e286ea..81710df128b 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -533,6 +533,11 @@ void Subgraph::SetCancellationFunction(void* data, check_cancelled_func_ = check_cancelled_func; } +bool Subgraph::IsCancelled() { + return (check_cancelled_func_ != nullptr) && + (*check_cancelled_func_)(cancellation_data_); +} + void Subgraph::ReserveNodes(int count) { nodes_and_registration_.reserve(count); } @@ -1316,6 +1321,8 @@ TfLiteStatus Subgraph::RemoveAllDelegates() { return kTfLiteOk; } +bool Subgraph::HasDelegates() { return !delegates_applied_.empty(); } + TfLiteStatus Subgraph::EnsureMemoryAllocations() { if (memory_planner_) { state_ = kStateUninvokable; diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 0b0c1e31e89..d6067daaa6a 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -553,6 +553,9 @@ class Subgraph { // afterwards. TfLiteStatus RemoveAllDelegates(); + // Returns true if the subgraph has delegates applied. + bool HasDelegates(); + // Cleanups up data reserved for the given node. Does not remove the {node, // registration} pair from nodes_and_registrations_. void CleanupNode(int node_index); @@ -578,6 +581,9 @@ class Subgraph { // Ensures the memory required is planned and allocated. TfLiteStatus EnsureMemoryAllocations(); + // Returns true if cancellation function returns true. + bool IsCancelled(); + // The state of the Interpreter. enum State { // The interpreter isn't ready to be invoked. 
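Usage sketch (illustrative, not part of the patch): the entry point being rolled forward is InterpreterUtils::InvokeWithCPUFallback, declared in the interpreter_utils.h added further down in this change. A minimal caller might look like the following; the model path and the delegate construction are assumptions made only for the example.

#include <memory>
#include "tensorflow/lite/delegates/interpreter_utils.h"
#include "tensorflow/lite/interpreter.h"
#include "tensorflow/lite/kernels/register.h"
#include "tensorflow/lite/model.h"

TfLiteStatus RunWithFallback(TfLiteDelegate* delegate) {
  // "model.tflite" is a placeholder path for this sketch.
  auto model = tflite::FlatBufferModel::BuildFromFile("model.tflite");
  tflite::ops::builtin::BuiltinOpResolver resolver;
  std::unique_ptr<tflite::Interpreter> interpreter;
  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
  interpreter->ModifyGraphWithDelegate(delegate);
  interpreter->AllocateTensors();
  // ... fill input tensors here ...
  TfLiteStatus status =
      tflite::delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter.get());
  // kTfLiteOk: the delegated graph ran; outputs are valid.
  // kTfLiteDelegateError: the delegate failed, all delegates were undone, and a
  //   CPU re-run produced valid outputs (the behavior exercised by the new tests).
  // kTfLiteError: unexpected failure; outputs are not valid.
  return status;
}
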
diff --git a/tensorflow/lite/delegates/BUILD b/tensorflow/lite/delegates/BUILD index 619c4d75130..8a05298d01a 100644 --- a/tensorflow/lite/delegates/BUILD +++ b/tensorflow/lite/delegates/BUILD @@ -32,6 +32,16 @@ cc_library( ], ) +cc_library( + name = "interpreter_utils", + srcs = ["interpreter_utils.cc"], + hdrs = ["interpreter_utils.h"], + copts = tflite_copts(), + deps = [ + "//tensorflow/lite:framework", + ], +) + cc_test( name = "utils_test", srcs = ["utils_test.cc"], @@ -53,6 +63,7 @@ cc_test( "tflite_not_portable_ios", # TODO(b/117786830) ], deps = [ + ":interpreter_utils", "//tensorflow/lite:framework", "//tensorflow/lite:version", "//tensorflow/lite/core/api", diff --git a/tensorflow/lite/delegates/delegate_test.cc b/tensorflow/lite/delegates/delegate_test.cc index 566cc644d3e..1efe6e44d54 100644 --- a/tensorflow/lite/delegates/delegate_test.cc +++ b/tensorflow/lite/delegates/delegate_test.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "tensorflow/lite/delegates/interpreter_utils.h" #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -261,8 +262,10 @@ class TestDelegate : public ::testing::Test { for (int i = 0; i < num; i++) { out->data.f[i] = a0->data.f[i] + a1->data.f[i]; } - // Make the data stale so that CopyFromBufferHandle can be invoked - out->data_is_stale = true; + if (out->buffer_handle != kTfLiteNullBufferHandle) { + // Make the data stale so that CopyFromBufferHandle can be invoked + out->data_is_stale = true; + } return kTfLiteOk; }; if (fail_delegate_node_invoke_) { @@ -397,6 +400,34 @@ TEST_F(TestDelegate, DelegateNodeInvokeFailure) { } } +TEST_F(TestDelegate, DelegateNodeInvokeFailureFallback) { + delegate_ = std::unique_ptr(new SimpleDelegate( + {0, 1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, + 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + // Delegation modified execution plan. + ASSERT_EQ(interpreter_->execution_plan().size(), 1); + + std::vector input = {1.0f, 2.0f, 3.0f}; + std::vector expected_output = {2.0f, 4.0f, 6.0f}; + constexpr int kOutputTensorIndex = 3; + + memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); + memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); + EXPECT_EQ( + delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), + kTfLiteDelegateError); + // Delegation removed, returning to original execution plan. + ASSERT_EQ(interpreter_->execution_plan().size(), 3); + // Check outputs. + TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; + } +} + TEST_F(TestDelegate, SecondDelegationPrepareFailure) { // First delegate only supports nodes 1, 2. Gets applied successfully. // This delegate should support dynamic tensors, otherwise the second won't be @@ -713,6 +744,44 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { } } +TEST_F(TestDelegate, TestFallbackWithMultipleDelegates) { + // First delegate only supports node 0. + // This delegate should support dynamic tensors, otherwise the second won't be + // applied. + delegate_ = std::unique_ptr( + new SimpleDelegate({0}, kTfLiteDelegateFlagsAllowDynamicTensors)); + // Second delegate supports nodes 1 & 2, and makes the graph immutable. 
+ delegate2_ = std::unique_ptr(new SimpleDelegate( + {1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, + 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); + // Pre-delegation execution plan should have three nodes. + ASSERT_EQ(interpreter_->execution_plan().size(), 3); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate2_->get_tf_lite_delegate()), + kTfLiteOk); + // Should be two delegates nodes. + ASSERT_EQ(interpreter_->execution_plan().size(), 2); + + std::vector input = {1.0f, 2.0f, 3.0f}; + std::vector expected_output = {2.0f, 4.0f, 6.0f}; + constexpr int kOutputTensorIndex = 2; + TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); + + memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); + memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); + EXPECT_EQ( + delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), + kTfLiteDelegateError); + // All delegates should be undone. + EXPECT_EQ(interpreter_->execution_plan().size(), 3); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; + } +} + TEST_F(TestDelegate, ReleaseNonPersistentMemoryWithDelegates) { // First delegate only supports node 0. // This delegate should support dynamic tensors, otherwise the second won't be diff --git a/tensorflow/lite/delegates/interpreter_utils.cc b/tensorflow/lite/delegates/interpreter_utils.cc new file mode 100644 index 00000000000..89955b23361 --- /dev/null +++ b/tensorflow/lite/delegates/interpreter_utils.cc @@ -0,0 +1,65 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/interpreter_utils.h" + +namespace tflite { +namespace delegates { +TfLiteStatus InterpreterUtils::InvokeWithCPUFallback(Interpreter* interpreter) { + TfLiteStatus status = interpreter->Invoke(); + if (status == kTfLiteOk || interpreter->IsCancelled() || + !interpreter->HasDelegates()) { + return status; + } + // Retry without delegation. + // TODO(b/138706191): retry only if error is due to delegation. + TF_LITE_REPORT_ERROR( + interpreter->error_reporter(), + "Invoke() failed in the presence of delegation. Retrying without."); + + // Copy input data to a buffer. + // Input data is safe since Subgraph::PrepareOpsAndTensors() passes + // preserve_inputs=true to ArenaPlanner. 
+ std::vector buf; + size_t input_size = 0; + + for (auto i : interpreter->inputs()) { + TF_LITE_ENSURE_STATUS(interpreter->EnsureTensorDataIsReadable(i)); + TfLiteTensor* t = interpreter->tensor(i); + input_size += t->bytes; + } + buf.reserve(input_size); + for (auto i : interpreter->inputs()) { + TfLiteTensor* t = interpreter->tensor(i); + buf.insert(buf.end(), t->data.raw, t->data.raw + t->bytes); + } + + TF_LITE_ENSURE_STATUS(interpreter->RemoveAllDelegates()); + + // Copy inputs from buffer. + auto bufp = buf.begin(); + for (auto i : interpreter->inputs()) { + TfLiteTensor* t = interpreter->tensor(i); + std::copy(bufp, bufp + t->bytes, t->data.raw); + bufp += t->bytes; + } + + // Invoke again. + TF_LITE_ENSURE_STATUS(interpreter->Invoke()); + return kTfLiteDelegateError; +} + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/delegates/interpreter_utils.h b/tensorflow/lite/delegates/interpreter_utils.h new file mode 100644 index 00000000000..f736c2db1f4 --- /dev/null +++ b/tensorflow/lite/delegates/interpreter_utils.h @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ +#define TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ + +#include "tensorflow/lite/interpreter.h" + +// Utility functions and classes for using delegates. + +namespace tflite { +namespace delegates { +#if !TFLITE_EXPERIMENTAL_RUNTIME_EAGER +class InterpreterUtils { + public: + /// Invokes an interpreter with automatic fallback from delegation to CPU. + /// + /// If using the delegate fails, the delegate is automatically undone and an + /// attempt made to return the interpreter to an invokable state. + /// + /// Allowing the fallback is suitable only if both of the following hold: + /// - The caller is known not to cache pointers to tensor data across Invoke() + /// calls. + /// - The model is not stateful (no variables, no LSTMs) or the state isn't + /// needed between batches. + /// + /// Returns one of the following three status codes: + /// 1. kTfLiteOk: Success. Output is valid. + /// 2. kTfLiteDelegateError: Delegate error but fallback succeeded. Output is + /// valid. + /// NOTE: This undoes all delegates previously applied to the Interpreter. + /// 3. kTfLiteError: Unexpected/runtime failure. Output is invalid. + /// WARNING: This is an experimental API and subject to change. + static TfLiteStatus InvokeWithCPUFallback(Interpreter* interpreter); +}; +#endif // !TFLITE_EXPERIMENTAL_RUNTIME_EAGER +} // namespace delegates +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ diff --git a/tensorflow/lite/delegates/utils.h b/tensorflow/lite/delegates/utils.h index 2238ba681e6..11ad9990426 100644 --- a/tensorflow/lite/delegates/utils.h +++ b/tensorflow/lite/delegates/utils.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_DELEGATES_UTILS_H_ #define TENSORFLOW_LITE_DELEGATES_UTILS_H_ +// Utility functions and classes for implementing delegates. + #include #include #include diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index c8ccf671d60..167254a2a62 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -310,6 +310,8 @@ void Interpreter::SetCancellationFunction(void* data, } } +bool Interpreter::IsCancelled() { return primary_subgraph().IsCancelled(); } + TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { TfLiteStatus status = kTfLiteOk; for (auto& subgraph : subgraphs_) { @@ -340,6 +342,8 @@ TfLiteStatus Interpreter::RemoveAllDelegates() { return kTfLiteOk; } +bool Interpreter::HasDelegates() { return primary_subgraph().HasDelegates(); } + TfLiteStatus Interpreter::SetBufferHandle(int tensor_index, TfLiteBufferHandle buffer_handle, TfLiteDelegate* delegate) { diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index 5278bc85eec..0e01ce44e0c 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -42,6 +42,9 @@ namespace tflite { class InterpreterTest; class TestDelegate; +namespace delegates { +class InterpreterUtils; // Class for friend declarations. +} // namespace delegates namespace impl { @@ -529,6 +532,7 @@ class Interpreter { friend class InterpreterBuilder; friend class tflite::InterpreterTest; friend class tflite::TestDelegate; + friend class tflite::delegates::InterpreterUtils; /// Set the value of an external context. static void SetExternalContext(struct TfLiteContext* context, @@ -542,6 +546,15 @@ class Interpreter { // afterwards. TfLiteStatus RemoveAllDelegates(); + // Returns true if delegates have been applied. + bool HasDelegates(); + + // Returns true if cancellation function returns true. + bool IsCancelled(); + + // Get the error reporter associated with this interpreter. + ErrorReporter* error_reporter() { return error_reporter_; } + // A pure C data structure used to communicate with the pure C plugin // interface. To avoid copying tensor metadata, this is also the definitive // structure to store tensors. From f0ef163443b301ca913e967be566d8401c1bbf7a Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 27 May 2020 02:56:50 -0700 Subject: [PATCH 1196/1533] Add an MLIR tracing implementation to the C unified API This is plumbing just enough to pass all the unit-tests. The conversion to the function library is quite inefficient, but it isn't clear if we want to optimize this or just focus on TFRT moving forward. 
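For orientation, a sketch of the flow the new MlirFunctionContext implements when the unified API traces a function with the "mlir" engine. It is illustrative only: these classes live in an anonymous namespace inside the new .cc file, so this is not standalone compilable code, and the function name, op type, and dtype are assumptions made for the example.

TF_Status* s = TF_NewStatus();
auto* ctx = new MlirFunctionContext("my_add_fn");     // creates a ModuleOp holding an empty FuncOp
AbstractTensor* a = ctx->AddParameter(TF_FLOAT, s);   // adds an unranked f32 block argument
AbstractTensor* b = ctx->AddParameter(TF_FLOAT, s);
AbstractOp* add = ctx->CreateOperation();             // returns an MlirAbstractOp
add->SetOpType("AddV2", s);                           // recorded as the MLIR op name "tf.AddV2"
add->SetOpName("my_add", s);
OutputList outs;
AbstractTensor* inputs[] = {a, b};
ctx->ExecuteOperation(add, 2, inputs, &outs, s);      // result types are inferred from the OpDef
AbstractFunction* fn = ctx->Finalize(&outs, s);       // emits the return op and sets the FunctionType
TF_Function* tf_fn = fn->GetTfFunction(s);            // functional-to-executor lowering, then export
                                                      // via ConvertMlirFunctionToFunctionLibraryDef
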
PiperOrigin-RevId: 313356850 Change-Id: I83815317d4958786d0103168b5d88498f89511ed --- tensorflow/c/BUILD | 1 + tensorflow/c/eager/BUILD | 19 + .../c_api_unified_experimental_internal.h | 6 +- .../eager/c_api_unified_experimental_test.cc | 3 +- tensorflow/compiler/mlir/tensorflow/BUILD | 3 + tensorflow/compiler/mlir/tensorflow/c/BUILD | 55 ++ .../c/c_api_unified_experimental_mlir.cc | 493 ++++++++++++++++++ ..._unified_experimental_mlir_registration.cc | 31 ++ .../tensorflow/translate/export_graphdef.cc | 18 + .../tensorflow/translate/export_graphdef.h | 7 + 10 files changed, 632 insertions(+), 4 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/c/BUILD create mode 100644 tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc create mode 100644 tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir_registration.cc diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index e2781afc3e5..12021a294e8 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -216,6 +216,7 @@ tf_cuda_library( ], visibility = [ "//tensorflow/c:__subpackages__", + "//tensorflow/compiler/mlir/tensorflow/c:__subpackages__", ], deps = select({ "//tensorflow:android": [ diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index eb3035cc3d7..b8429646960 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -144,6 +144,24 @@ cc_library( ], ) +cc_library( + name = "c_api_unified_internal", + hdrs = [ + "c_api_unified_experimental_internal.h", + ], + visibility = [ + "//tensorflow:internal", + ], + deps = [ + ":c_api", + ":c_api_experimental", + "//tensorflow/c:c_api_internal", + "//tensorflow/c:tf_status", + "//tensorflow/core/platform:casts", + "//tensorflow/core/platform:types", + ], +) + cc_library( name = "tensor_handle_interface", hdrs = ["tensor_handle_interface.h"], @@ -514,6 +532,7 @@ tf_cuda_cc_test( "//tensorflow/c:c_api", "//tensorflow/c:c_test_util", "//tensorflow/cc/profiler", + "//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", diff --git a/tensorflow/c/eager/c_api_unified_experimental_internal.h b/tensorflow/c/eager/c_api_unified_experimental_internal.h index 49212a230ee..8fc696f0f2f 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_internal.h +++ b/tensorflow/c/eager/c_api_unified_experimental_internal.h @@ -58,7 +58,7 @@ T* dyncast(S source) { // GraphContext and vice-versa). class AbstractTensor { protected: - enum AbstractTensorKind { kGraphTensor, kEagerTensor, kMLIRTensor }; + enum AbstractTensorKind { kMlirTensor, kGraphTensor, kEagerTensor }; explicit AbstractTensor(AbstractTensorKind kind) : kind_(kind) {} public: @@ -101,7 +101,7 @@ class AbstractFunction { // on a given context, with the same or different input tensors. class AbstractOp { protected: - enum AbstractOpKind { kGraphOp, kEagerOp }; + enum AbstractOpKind { kMlirOp, kGraphOp, kEagerOp }; explicit AbstractOp(AbstractOpKind kind) : kind_(kind) {} public: @@ -129,7 +129,7 @@ class AbstractOp { // eager implementation or to a graph implementation. 
struct ExecutionContext { protected: - enum ExecutionContextKind { kGraphContext, kEagerContext }; + enum ExecutionContextKind { kMlirContext, kGraphContext, kEagerContext }; explicit ExecutionContext(ExecutionContextKind kind) : k(kind) {} public: diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 9776b4d13ed..24d170f2f99 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -477,7 +477,8 @@ TEST_P(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { TF_DeleteExecutionContext(eager_execution_ctx); } -INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, ::testing::Values("graphdef")); +INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, + ::testing::Values("graphdef", "mlir")); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index de0af94f0cb..5110ea7fbf5 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -788,6 +788,9 @@ cc_library( name = "convert_type", srcs = ["utils/convert_type.cc"], hdrs = ["utils/convert_type.h"], + visibility = [ + "//visibility:public", + ], deps = [ ":tensorflow_types", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/mlir/tensorflow/c/BUILD b/tensorflow/compiler/mlir/tensorflow/c/BUILD new file mode 100644 index 00000000000..3a503685fc6 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/c/BUILD @@ -0,0 +1,55 @@ +load( + "//tensorflow:tensorflow.bzl", + "tf_copts", + "tf_cuda_library", + "tfe_xla_copts", +) + +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + packages = ["//tensorflow/..."], +) + +tf_cuda_library( + name = "mlir_c_api", + srcs = [ + "c_api_unified_experimental_mlir.cc", + ], + copts = tf_copts() + tfe_xla_copts(), + deps = [ + "//tensorflow/c:c_api", + "//tensorflow/c:tf_status_helper", + "//tensorflow/c:tf_status_internal", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_internal", + "//tensorflow/c/eager:c_api_unified_internal", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:casts", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "mlir_c_api_registration", + srcs = ["c_api_unified_experimental_mlir_registration.cc"], + deps = [ + ":mlir_c_api", + "//tensorflow/c/eager:c_api_unified_internal", + ], + alwayslink = 1, +) diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc new file mode 100644 index 00000000000..0e8b7fedd9b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc @@ -0,0 +1,493 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" +#include "tensorflow/c/tf_status.h" +#include "tensorflow/c/tf_status_helper.h" +#include "tensorflow/c/tf_status_internal.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" + +namespace mlir { +namespace TF { +using tensorflow::internal::AbstractFunction; +using tensorflow::internal::AbstractOp; +using tensorflow::internal::AbstractTensor; +using tensorflow::internal::dyncast; +using tensorflow::internal::ExecutionContext; +using tensorflow::internal::OutputList; + +namespace { + +static void RegisterDialects() { + static bool init_once = []() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + return true; + }(); + (void)init_once; +} + +Status ConvertDataTypeToTensor(tensorflow::DataType dtype, Builder builder, + Type* type) { + Status s = tensorflow::ConvertDataType(dtype, builder, type); + if (s.ok()) *type = UnrankedTensorType::get(*type); + return s; +} + +class MlirTensor : public AbstractTensor { + public: + explicit MlirTensor(Value value) : AbstractTensor(kKind), value_(value) {} + + Value getValue() { return value_; } + + static constexpr AbstractTensorKind kKind = kMlirTensor; + + private: + Value value_; +}; + +class MlirAbstractOp : public AbstractOp { + public: + explicit MlirAbstractOp(MLIRContext* context) + : AbstractOp(kKind), context_(context) {} + + void SetOpType(const char* op_type, TF_Status* s) override; + + void SetAttrType(const char* attr_name, TF_DataType dtype, + TF_Status* s) 
override; + + void SetOpName(const char* const op_name, TF_Status* s) override; + + MLIRContext* GetContext() { return context_; } + + Type AddRef(Type type, TF_Status* s); + + OperationState* Create(ArrayRef operands, TF_Status* s); + + static constexpr AbstractOpKind kKind = kMlirOp; + + private: + MLIRContext* context_; + llvm::StringMap attrs_; + std::unique_ptr state_; + const char* op_name_ = nullptr; +}; + +// MlirFunction is a thin wrapper over a FuncOp. +class MlirFunction : public AbstractFunction { + public: + explicit MlirFunction(std::unique_ptr context, + OwningModuleRef module, FuncOp func) + : AbstractFunction(kKind), + context_(std::move(context)), + module_(std::move(module)), + func_(func) {} + + TF_Function* GetTfFunction(TF_Status* s) override; + + static constexpr AbstractFunctionKind kKind = kGraphFunc; + + private: + std::unique_ptr context_; + OwningModuleRef module_; + FuncOp func_; +}; + +class MlirFunctionContext : public ExecutionContext { + public: + explicit MlirFunctionContext(const char* name) + : ExecutionContext(kKind), + context_(std::make_unique()), + builder_(context_.get()) { + // TODO(aminim) figure out the location story here + module_ = ModuleOp::create(builder_.getUnknownLoc()); + func_ = FuncOp::create(builder_.getUnknownLoc(), name, + builder_.getFunctionType(llvm::None, llvm::None)); + module_->push_back(func_); + builder_ = OpBuilder::atBlockBegin(func_.addEntryBlock()); + } + + AbstractOp* CreateOperation() override { + return new MlirAbstractOp(context_.get()); + } + + void ExecuteOperation(AbstractOp* abstract_op, int num_inputs, + AbstractTensor* const* inputs, OutputList* o, + TF_Status* s) override; + + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override; + + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override; + + void RegisterFunction(AbstractFunction* func, TF_Status* s) override { + s->status = tensorflow::errors::Unimplemented( + "Registering graph functions has not been implemented yet."); + } + + static constexpr ExecutionContextKind kKind = kMlirContext; + + private: + std::unique_ptr context_; + OpBuilder builder_; + FuncOp func_; + OwningModuleRef module_; +}; + +void MlirAbstractOp::SetOpType(const char* op_type, TF_Status* s) { + if (state_) { + s->status = tensorflow::errors::FailedPrecondition( + "SetOpType called on already built op."); + return; + } + std::string name = "tf."; + name += op_type; + // TODO(aminim) figure out the location story here + state_ = std::make_unique(UnknownLoc::get(context_), name); +} + +void MlirAbstractOp::SetAttrType(const char* attr_name, TF_DataType dtype, + TF_Status* s) { + if (!state_) { + s->status = tensorflow::errors::FailedPrecondition( + "op_type must be specified before specifying attrs."); + return; + } + Type mlir_type; + Builder builder(context_); + s->status = ConvertDataTypeToTensor(static_cast(dtype), + builder, &mlir_type); + if (!s->status.ok()) return; + attrs_[attr_name] = TypeAttr::get(mlir_type); +} + +void MlirAbstractOp::SetOpName(const char* const op_name, TF_Status* s) { + // TODO(aminim): should we use a location? 
+ if (op_name_) { + s->status = tensorflow::errors::FailedPrecondition( + "SetOpName called on already built op."); + return; + } + op_name_ = op_name; +} + +Type MlirAbstractOp::AddRef(Type type, TF_Status* s) { + Type elt_type = getElementTypeOrSelf(type); + if (elt_type.isa()) { + s->status = tensorflow::errors::InvalidArgument( + "Requested reference to a reference type"); + return nullptr; + } + elt_type = TensorFlowRefType::get(elt_type); + if (RankedTensorType tensor_type = type.dyn_cast()) { + return RankedTensorType::get(tensor_type.getShape(), elt_type); + } + return UnrankedTensorType::get(elt_type); +} + +OperationState* MlirAbstractOp::Create(ArrayRef operands, TF_Status* s) { + state_->operands = llvm::to_vector<4>(operands); + const tensorflow::OpDef* op_def; + auto node_name = state_->name.getStringRef().drop_front( + TensorFlowDialect::getDialectNamespace().size() + 1); + s->status = + tensorflow::OpRegistry::Global()->LookUpOpDef(node_name.str(), &op_def); + if (!s->status.ok()) return nullptr; + Builder builder(context_); + // Process operands according to the op_def and infer derived attributes. + int current_operand = 0; + for (const tensorflow::OpDef::ArgDef& input_arg : op_def->input_arg()) { + if (!input_arg.number_attr().empty()) { + // TODO(b/156122856): we don't support variadic operands. + s->status = tensorflow::errors::Unimplemented( + "Unsupported 'number_attr' for '", input_arg.number_attr(), "'"); + return nullptr; + } else if (!input_arg.type_list_attr().empty()) { + s->status = tensorflow::errors::InvalidArgument( + "Unsupported 'type_list_attr' for '", input_arg.number_attr(), "'"); + return nullptr; + } + if (current_operand >= operands.size()) { + s->status = tensorflow::errors::InvalidArgument("Missing operand for '", + input_arg.name(), "'"); + return nullptr; + } + Type expected_type; + if (input_arg.type() != tensorflow::DT_INVALID) { + s->status = + ConvertDataTypeToTensor(input_arg.type(), builder, &expected_type); + if (!s->status.ok()) return nullptr; + if (input_arg.is_ref()) expected_type = AddRef(expected_type, s); + if (!s->status.ok()) return nullptr; + } else { + expected_type = operands[current_operand].getType(); + } + if (!input_arg.type_attr().empty()) { + attrs_[input_arg.type_attr()] = TypeAttr::get(expected_type); + } + ++current_operand; + } + + for (const tensorflow::OpDef::ArgDef& output_arg : op_def->output_arg()) { + int original_size = state_->types.size(); + if (!output_arg.number_attr().empty()) { + // Same type repeated "repeats" times. + Attribute repeats_attr = attrs_[output_arg.number_attr()]; + if (!repeats_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Missing attribute '", output_arg.number_attr(), + "' required for output list '", output_arg.name(), "'"); + return nullptr; + } + if (!repeats_attr.isa()) { + s->status = tensorflow::errors::InvalidArgument( + "Attribute '", output_arg.number_attr(), + "' required for output list '", output_arg.name(), + "' isn't an integer"); + return nullptr; + } + int64_t repeats = repeats_attr.cast().getInt(); + + if (!output_arg.type_attr().empty()) { + // Same type repeated "repeats" times. 
+ Attribute attr = attrs_[output_arg.type_attr()]; + if (!attr) { + s->status = tensorflow::errors::InvalidArgument( + "Missing attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), "'"); + return nullptr; + } + TypeAttr type_attr = attr.dyn_cast(); + if (!type_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Attribute '", output_arg.type_attr(), "' required for output '", + output_arg.name(), "' isn't a type attribute"); + return nullptr; + } + for (int i = 0; i < repeats; ++i) + state_->types.push_back(type_attr.getType()); + } else if (output_arg.type() != tensorflow::DT_INVALID) { + for (int i = 0; i < repeats; ++i) { + Type type; + s->status = + ConvertDataTypeToTensor(output_arg.type(), builder, &type); + if (!s->status.ok()) return nullptr; + state_->types.push_back(type); + } + } else { + s->status = tensorflow::errors::InvalidArgument( + "Missing type or type_attr field in ", + output_arg.ShortDebugString()); + return nullptr; + } + } else if (!output_arg.type_attr().empty()) { + Attribute attr = attrs_[output_arg.type_attr()]; + if (!attr) { + s->status = tensorflow::errors::InvalidArgument( + "Missing attribute '", output_arg.type_attr(), + "' required for output '", output_arg.name(), "'"); + return nullptr; + } + TypeAttr type_attr = attr.dyn_cast(); + if (!type_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Attribute '", output_arg.type_attr(), "' required for output '", + output_arg.name(), "' isn't a type attribute"); + return nullptr; + } + state_->types.push_back(type_attr.getValue()); + } else if (!output_arg.type_list_attr().empty()) { + // This is pointing to an attribute which is an array of types. + Attribute attr = attrs_[output_arg.type_list_attr()]; + if (!attr) { + s->status = tensorflow::errors::InvalidArgument( + "Missing attribute '", output_arg.type_list_attr(), + "' required for output '", output_arg.name(), "'"); + return nullptr; + } + ArrayAttr array_attr = attr.dyn_cast(); + if (!array_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Attribute '", output_arg.type_list_attr(), + "' required for output '", output_arg.name(), + "' isn't an array attribute"); + return nullptr; + } + for (Attribute attr : array_attr) { + TypeAttr type_attr = attr.dyn_cast(); + if (!type_attr) { + s->status = tensorflow::errors::InvalidArgument( + "Array Attribute '", output_arg.type_list_attr(), + "' required for output '", output_arg.name(), + "' has a non-Type element"); + return nullptr; + } + state_->types.push_back(type_attr.getValue()); + } + } else if (output_arg.type() != tensorflow::DT_INVALID) { + Type type; + Builder builder(context_); + s->status = ConvertDataTypeToTensor(output_arg.type(), builder, &type); + if (!s->status.ok()) return nullptr; + state_->types.push_back(type); + } else { + s->status = tensorflow::errors::InvalidArgument( + "No type fields in ", output_arg.ShortDebugString()); + if (!s->status.ok()) return nullptr; + } + if (output_arg.is_ref()) { + // For all types that were added by this function call, make them refs. 
+ for (Type& type : llvm::make_range(&state_->types[original_size], + state_->types.end())) { + type = AddRef(type, s); + if (!s->status.ok()) return nullptr; + } + } + } + return state_.get(); +} + +TF_Function* MlirFunction::GetTfFunction(TF_Status* s) { + PassManager pm(func_.getContext()); + pm.addNestedPass(CreateFunctionalToExecutorDialectConversionPass()); + pm.addNestedPass(CreateBreakUpIslandsPass()); + + // In case of failure, the `diag_handler` converts MLIR errors emitted to + // the MLIRContext into a tensorflow::Status. + StatusScopedDiagnosticHandler diag_handler(func_.getContext()); + LogicalResult result = pm.run(func_.getParentOfType()); + (void)result; + s->status = diag_handler.ConsumeStatus(); + if (!s->status.ok()) return nullptr; + + tensorflow::GraphExportConfig configs; + std::unique_ptr tf_function(new TF_Function); + s->status = ConvertMlirFunctionToFunctionLibraryDef(func_, configs, + &tf_function->fdef); + return tf_function.release(); +} + +void MlirFunctionContext::ExecuteOperation(AbstractOp* abstract_op, + int num_inputs, + AbstractTensor* const* inputs, + OutputList* o, TF_Status* s) { + auto* mlir_op = dyncast(abstract_op); + if (mlir_op == nullptr) { + s->status = tensorflow::errors::InvalidArgument( + "Unable to cast AbstractOp to TF_GraphOp."); + return; + } + SmallVector operands; + for (int i = 0; i < num_inputs; ++i) { + auto* operand = dyncast(inputs[i]); + if (!operand) { + s->status = tensorflow::errors::InvalidArgument( + "Capturing eager tensors is not supported yet."); + return; + } + if (operand->getValue().getContext() != context_.get()) { + s->status = tensorflow::errors::InvalidArgument( + "Capturing tensors from other context is not supported."); + return; + } + operands.push_back(operand->getValue()); + } + OperationState* state = mlir_op->Create(operands, s); + if (!s->status.ok() || !state) return; + Operation* op = builder_.createOperation(*state); + int num_results = op->getNumResults(); + o->outputs.clear(); + o->outputs.reserve(num_results); + for (Value result : op->getResults()) + o->outputs.push_back(new MlirTensor(result)); +} + +AbstractTensor* MlirFunctionContext::AddParameter(TF_DataType dtype, + TF_Status* s) { + Type type; + s->status = ConvertDataTypeToTensor(static_cast(dtype), + builder_, &type); + if (!s->status.ok()) return nullptr; + return new MlirTensor(func_.getBody().front().addArgument(type)); +} + +AbstractFunction* MlirFunctionContext::Finalize(OutputList* outputs, + TF_Status* s) { + Block& body = func_.getBody().front(); + SmallVector ret_operands; + for (AbstractTensor* output : outputs->outputs) { + auto* operand = dyncast(output); + if (!operand) { + s->status = tensorflow::errors::InvalidArgument( + "Capturing eager tensors is not supported yet."); + return nullptr; + } + if (operand->getValue().getContext() != context_.get()) { + s->status = tensorflow::errors::InvalidArgument( + "Capturing tensors from other context is not supported."); + return nullptr; + } + ret_operands.push_back(operand->getValue()); + } + builder_.create(func_.getLoc(), ret_operands); + + auto arg_types = llvm::to_vector<8>(body.getArgumentTypes()); + auto result_types = + llvm::to_vector<8>(body.getTerminator()->getOperandTypes()); + func_.setType(FunctionType::get(arg_types, result_types, func_.getContext())); + return new MlirFunction(std::move(context_), std::move(module_), func_); +} + +extern "C" { +ExecutionContext* MlirTracingFactory(const char* fn_name, TF_Status* s) { + RegisterDialects(); + return new 
MlirFunctionContext(fn_name); +} +} + +} // end anonymous namespace +} // end namespace TF +} // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir_registration.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir_registration.cc new file mode 100644 index 00000000000..778f4b777a3 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir_registration.cc @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/c_api_unified_experimental_internal.h" + +using tensorflow::internal::ExecutionContext; + +extern "C" { +ExecutionContext* MlirTracingFactory(const char* fn_name, TF_Status* s); +} + +namespace { +// Register the tracing implemented in this file as the default tracing engine. +static bool register_tracing = [] { + RegisterTracingEngineFactory("mlir", MlirTracingFactory); + return true; +}(); + +} // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 75fcede8fbb..2bf55922d4b 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -782,4 +782,22 @@ StatusOr> ConvertMlirToGraphdef( return graphdef; } +stream_executor::port::Status ConvertMlirFunctionToFunctionLibraryDef( + mlir::FuncOp func, const GraphExportConfig& configs, + FunctionDef* function_def) { + Dialect* tf_dialect = func.getContext()->getRegisteredDialect("tf"); + FunctionDefLibrary flib; + TF_RETURN_IF_ERROR( + Exporter::ConvertLibFunction(configs, tf_dialect, func, &flib)); + for (auto& func_def : flib.function()) { + if (func_def.signature().name() == func.getName()) { + *function_def = func_def; + return Status::OK(); + } + } + return errors::InvalidArgument( + "Function couldn't be found in the FunctionDefLibrary after converting " + "from MLIR"); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h index 2d522f6031e..a5aebd16146 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "absl/container/flat_hash_set.h" #include "llvm/ADT/StringRef.h" +#include "mlir/IR/Function.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Module.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project @@ -50,6 +51,12 @@ stream_executor::port::Status ConvertMlirToGraph( stream_executor::port::Status ConvertMlirToGraph( mlir::ModuleOp module, const GraphExportConfig& configs, std::unique_ptr* graph, FunctionLibraryDefinition* flib_def); + +// Converts an MLIR function and adds it to a FunctionLibraryDefinition. +stream_executor::port::Status ConvertMlirFunctionToFunctionLibraryDef( + mlir::FuncOp func, const GraphExportConfig& configs, + FunctionDef* function_def); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSLATE_EXPORT_GRAPHDEF_H_ From 9b7b8f16f38ad06ef0efde4168fad2c482626a4a Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 27 May 2020 03:23:33 -0700 Subject: [PATCH 1197/1533] Support compiling for a separate set of virtual and real CUDA compute architectures. We currently use the following setup to select which compute architectures to compile for: - ./configure allows specifying a set of CUDA compute architectures to compile for, e.g. '5.2,6.0'. - .tf_configure.bazelrc maps this to an environment variable (TF_CUDA_COMPUTE_CAPABILITIES=5.2,6.0) - cuda_configure.bzl turns this into compiler flags (copts) for clang, which the crosstool maps to nvcc if needed. - The kernels are always compiled to both the virtual (ptx) and the real (sass) architecture. This change adds support for specifying just real (sm_xy) or both virtual and real (compute_xy) compute architectures in TF_CUDA_COMPUTE_CAPABILITIES. ./configure is left unchanged, the old 'x.y' strings are mapped to 'compute_xy' in cuda_configure.bzl. PiperOrigin-RevId: 313359468 Change-Id: I96c5b8b0a02b2ce62df27df7cc5272ddd42217aa --- .../core/kernels/cubin_headers/build_defs.bzl | 2 + .../crosstool_wrapper_driver_is_not_gcc.tpl | 8 ++- .../windows/msvc_wrapper_for_nvcc.py.tpl | 14 +++- third_party/gpus/cuda_configure.bzl | 70 +++++++++++-------- third_party/nccl/build_defs.bzl.tpl | 1 + 5 files changed, 60 insertions(+), 35 deletions(-) diff --git a/tensorflow/core/kernels/cubin_headers/build_defs.bzl b/tensorflow/core/kernels/cubin_headers/build_defs.bzl index 14f47601f06..f9dac50591a 100644 --- a/tensorflow/core/kernels/cubin_headers/build_defs.bzl +++ b/tensorflow/core/kernels/cubin_headers/build_defs.bzl @@ -22,6 +22,8 @@ def _gen_kernel_image_hdr_impl(ctx): cubins = [] images = [] for arch in ctx.attr.gpu_archs: + # TODO(b/152737872): 'compute_' should generate both SASS and PTX. 
+ arch = arch.replace("compute_", "sm_") filename = "%s.%s.cubin" % (name, arch) cubin = ctx.actions.declare_file(filename) ctx.actions.run( diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl index 303339e77f7..9cc06ef99f5 100755 --- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl +++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl @@ -221,8 +221,12 @@ def InvokeNvcc(argv, log=False): nvccopts = '-D_FORCE_INLINES ' for capability in GetOptionValue(argv, "--cuda-gpu-arch"): capability = capability[len('sm_'):] - nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % ( - capability, capability, capability) + nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s\" ' % (capability, + capability) + for capability in GetOptionValue(argv, '--cuda-include-ptx'): + capability = capability[len('sm_'):] + nvccopts += r'-gencode=arch=compute_%s,\"code=compute_%s\" ' % (capability, + capability) nvccopts += nvcc_compiler_options nvccopts += undefines nvccopts += defines diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl index de6512e3088..c00e7077b59 100644 --- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -138,10 +138,18 @@ def InvokeNvcc(argv, log=False): nvccopts = ['-D_FORCE_INLINES'] compute_capabilities, argv = GetOptionValue(argv, "--cuda-gpu-arch") for capability in compute_capabilities: - print(capability) capability = capability[len('sm_'):] - nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % ( - capability, capability, capability)] + nvccopts += [ + r'-gencode=arch=compute_%s,"code=sm_%s"' % (capability, capability) + ] + compute_capabilities, argv = GetOptionValue(argv, '--cuda-include-ptx') + for capability in compute_capabilities: + capability = capability[len('sm_'):] + nvccopts += [ + r'-gencode=arch=compute_%s,"code=compute_%s"' % (capability, capability) + ] + _, argv = GetOptionValue(argv, '--no-cuda-include-ptx') + nvccopts += nvcc_compiler_options nvccopts += undefines nvccopts += defines diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 91eb0444b7c..35e86d8d77b 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -66,8 +66,6 @@ _TF_CUDA_CONFIG_REPO = "TF_CUDA_CONFIG_REPO" _TF_DOWNLOAD_CLANG = "TF_DOWNLOAD_CLANG" _PYTHON_BIN_PATH = "PYTHON_BIN_PATH" -_DEFAULT_CUDA_COMPUTE_CAPABILITIES = ["3.5", "5.2"] - def to_list_of_strings(elements): """Convert the list of ["a", "b", "c"] into '"a", "b", "c"'. @@ -410,18 +408,40 @@ _NVCC_VERSION_PREFIX = "Cuda compilation tools, release " _DEFINE_CUDNN_MAJOR = "#define CUDNN_MAJOR" def compute_capabilities(repository_ctx): - """Returns a list of strings representing cuda compute capabilities.""" - capabilities_str = get_host_environ(repository_ctx, _TF_CUDA_COMPUTE_CAPABILITIES) - if capabilities_str == None: - return _DEFAULT_CUDA_COMPUTE_CAPABILITIES - capabilities = capabilities_str.split(",") - for capability in capabilities: - # Workaround for Skylark's lack of support for regex. This check should - # be equivalent to checking: - # if re.match("[0-9]+.[0-9]+", capability) == None: + """Returns a list of strings representing cuda compute capabilities. 
+ + Args: + repository_ctx: the repo rule's context. + Returns: list of cuda architectures to compile for. 'compute_xy' refers to + both PTX and SASS, 'sm_xy' refers to SASS only. + """ + capabilities = get_host_environ( + repository_ctx, + _TF_CUDA_COMPUTE_CAPABILITIES, + "compute_35,compute_52", + ).split(",") + + # Map old 'x.y' capabilities to 'compute_xy'. + for i, capability in enumerate(capabilities): parts = capability.split(".") - if len(parts) != 2 or not parts[0].isdigit() or not parts[1].isdigit(): + if len(parts) != 2: + continue + capabilities[i] = "compute_%s%s" % (parts[0], parts[1]) + + # Make list unique + capabilities = dict(zip(capabilities, capabilities)).keys() + + # Validate capabilities. + for capability in capabilities: + if not capability.startswith(("compute_", "sm_")): auto_configure_fail("Invalid compute capability: %s" % capability) + for prefix in ["compute_", "sm_"]: + if not capability.startswith(prefix): + continue + if len(capability) == len(prefix) + 2 and capability[-2:].isdigit(): + continue + auto_configure_fail("Invalid compute capability: %s" % capability) + return capabilities def lib_name(base_name, cpu_value, version = None, static = False): @@ -849,22 +869,15 @@ def _tf_sysroot(repository_ctx): return get_host_environ(repository_ctx, _TF_SYSROOT, "") def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): - capability_flags = [ - "--cuda-gpu-arch=sm_" + cap.replace(".", "") - for cap in compute_capabilities - ] + capability_flags = ["--no-cuda-include-ptx=all"] + for capability in compute_capabilities: + if capability.startswith("compute_"): + capability = capability.replace("compute_", "sm_") + capability_flags.append("--cuda-include-ptx=%s" % capability) + capability_flags.append("--cuda-gpu-arch=%s" % capability) + return str(capability_flags) -def _compute_cuda_gpu_architectures(repository_ctx, compute_capabilities): - gpu_architectures = [ - "sm_" + capability.replace(".", "") - for capability in compute_capabilities - ] - - # Make the list unique. - gpu_architectures = dict(zip(gpu_architectures, gpu_architectures)).keys() - return str(gpu_architectures) - def _tpl_path(repository_ctx, filename): return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename)) @@ -996,10 +1009,7 @@ def _create_local_cuda_repository(repository_ctx): repository_ctx, cuda_config.compute_capabilities, ), - "%{cuda_gpu_architectures}": _compute_cuda_gpu_architectures( - repository_ctx, - cuda_config.compute_capabilities, - ), + "%{cuda_gpu_architectures}": str(cuda_config.compute_capabilities), }, ) diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl index 7585949ea92..9268af7c890 100644 --- a/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/nccl/build_defs.bzl.tpl @@ -84,6 +84,7 @@ def _device_link_impl(ctx): cubins = [] images = [] for arch in ctx.attr.gpu_archs: + arch = arch.replace("compute_", "sm_") # PTX is JIT-linked at runtime. cubin = ctx.actions.declare_file("%s_%s.cubin" % (name, arch)) register_h = ctx.actions.declare_file("%s_register_%s.h" % (name, arch)) ctx.actions.run( From 463c3055ecd3bba92d7e1da3ebe48e7e8394a0c1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 05:48:22 -0700 Subject: [PATCH 1198/1533] An implementation of a multithreaded runtime for receiving outfeed data from devices and pushing it back to Python. 
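As a minimal usage sketch of the new C++ API (illustrative only, mirroring the tests added in outfeed_receiver_test.cc in this change; it assumes a `std::shared_ptr<PjRtClient>` named `client`, an `XlaBuilder` named `builder`, and an `XlaOp` named `data` already exist):

    // Callback invoked on the callback thread for each received outfeed.
    xla::OutfeedReceiver::Callback callback =
        [](xla::Device* device, std::shared_ptr<xla::PjRtClient> client,
           uint32_t consumer_id, std::shared_ptr<xla::Literal> literal) {
          LOG(INFO) << "Outfeed from " << device->DebugString()
                    << " consumer=" << consumer_id
                    << " shape=" << literal->shape().ToString();
        };
    // One listener thread per device plus one callback thread start on Start().
    xla::OutfeedReceiver receiver(callback, {client},
                                  /*max_callback_queue_size_bytes=*/256 * 1024 * 1024);
    receiver.Start();

    // While building a computation, thread the outfeed through a token.
    xla::XlaOp token = xla::CreateToken(&builder);
    token = receiver.AddOutfeedToBuilder(&builder, token, /*consumer_id=*/5, {data})
                .ValueOrDie();

Each consumer ID is bound to a single outfeed shape; reusing an ID with a different shape, or using the reserved ID 0, is rejected, which is what lets the receiver demultiplex the single per-device outfeed channel.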
Design and implementation notes in outfeed_received.cc PiperOrigin-RevId: 313373094 Change-Id: I4278d6ebf4e204b0e91c536d1de8c3f49dca6a34 --- tensorflow/compiler/xla/python/BUILD | 59 +++ .../compiler/xla/python/outfeed_receiver.cc | 492 ++++++++++++++++++ .../compiler/xla/python/outfeed_receiver.h | 77 +++ .../xla/python/outfeed_receiver_py.cc | 156 ++++++ .../compiler/xla/python/outfeed_receiver_py.h | 27 + .../xla/python/outfeed_receiver_test.cc | 258 +++++++++ tensorflow/compiler/xla/python/xla.cc | 2 + 7 files changed, 1071 insertions(+) create mode 100644 tensorflow/compiler/xla/python/outfeed_receiver.cc create mode 100644 tensorflow/compiler/xla/python/outfeed_receiver.h create mode 100644 tensorflow/compiler/xla/python/outfeed_receiver_py.cc create mode 100644 tensorflow/compiler/xla/python/outfeed_receiver_py.h create mode 100644 tensorflow/compiler/xla/python/outfeed_receiver_test.cc diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 3dcdc46040a..10737489331 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -1,5 +1,6 @@ load("//tensorflow/core/platform:build_config.bzl", "pyx_library") load("//tensorflow/compiler/xla:xla.bzl", "xla_py_test_deps") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "pybind_extension") @@ -212,6 +213,63 @@ cc_library( ], ) +cc_library( + name = "outfeed_receiver", + srcs = ["outfeed_receiver.cc"], + hdrs = ["outfeed_receiver.h"], + deps = [ + "//tensorflow/compiler/xla:literal", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/client:xla_computation", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/core/profiler/lib:traceme", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings:str_format", + ], +) + +tf_cc_test( + name = "cpu_outfeed_receiver_test", + size = "small", + srcs = ["outfeed_receiver_test.cc"], + deps = [ + ":outfeed_receiver", + "//tensorflow/compiler/jit:xla_cpu_jit", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/client:executable_build_options", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/pjrt:cpu_device", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/synchronization", + ], +) + +cc_library( + name = "outfeed_receiver_py", + srcs = ["outfeed_receiver_py.cc"], + hdrs = ["outfeed_receiver_py.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + ], + features = ["-use_header_modules"], + deps = [ + ":outfeed_receiver", + ":types", + "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/compiler/xla/pjrt:pjrt_client", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/synchronization", + "@pybind11", + ], +) + config_setting( name = "enable_gpu", values = {"define": "xla_python_enable_gpu=true"}, @@ -233,6 +291,7 @@ pybind_extension( ":dlpack", ":ops", ":python_ref_manager", + ":outfeed_receiver_py", ":types", "@com_google_absl//absl/base", "@com_google_absl//absl/hash", diff --git a/tensorflow/compiler/xla/python/outfeed_receiver.cc b/tensorflow/compiler/xla/python/outfeed_receiver.cc new file mode 100644 index 00000000000..0be4167c397 --- /dev/null +++ 
b/tensorflow/compiler/xla/python/outfeed_receiver.cc @@ -0,0 +1,492 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/outfeed_receiver.h" + +#include + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_format.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/profiler/lib/traceme.h" + +// Implementation notes: +// +// Startup: +// ------- +// +// The startup is initiated by a call from Python to StartOutfeedReceiver, +// which starts N threads for listening to the N devices and for enqueueing +// the received data into a callback queue. There is one additional callback +// thread for dequeing the data and invoking the Python callback. +// +// Framing protocol +// ---------------- +// +// The outfeed mechanism has a single channel and the receiver must know +// exactly the shape and number of outfeed operations issued by the compiled +// code. This makes it hard to use outfeed in conditionals and loops and +// especially when outfeeding different-shaped data. +// +// To address this, when we compile the code we capture the shape of the +// data being outfed, and we generate a consumer ID (uint32_t) that is unique +// across the lifetime of the program to: the Python callable to callback to, +// the shape of the arguments, the keyword arguments to pass to the callable. +// Each outfeed payload is preceeded by a header (of shape u32[2]) with a +// special first value and the consumer ID. We maintain a registry of shapes +// by consumer ID. When receiving we lookup the shape by consumer ID, and then +// we read the payload. +// +// Back pressure: +// -------------- +// +// We maintain a sum of the bytes from all the data waiting in the callback +// queue. The listening threads will wait for the sum to drop below a +// configurable threshold, default 256Mb. While the listening thread is waiting, +// on CPU and GPU the next outfeed operation from the device will block. On +// TPU there is a buffer, but eventually the TPU will also block. +// +// Shutdown: +// --------- +// +// The shutdown is initiated automatically when the last reference to the +// outfeed receiver object is dropped, and the Python garbage collector invokes +// the destructor. +// +// The shutdown sequence is implemented as follows: +// * we enqueue on all devices a computation that outfeeds a special header +// with customer ID kOutfeedCidShutdown. +// * when each listening threads gets the shutdown header, it decrements +// a counter of listening threads, and if the counter reaches 0, it +// enqueues a special shutdown callback. 
+// * when the callback thread gets the shutdown callback marker, it terminates. +// * the shutdown code waits until all threads terminate. +// +// Since we currently keep the shape registry in the OutfeedReceiver, it is +// not safe to replace the OutfeedReceiver instance during the lifetime of +// the JAX program, or else previously cached jitted computations may refer +// to previously cached shapes. This can be solved, but for now we disallow +// replacing the OutfeedReceiver, and do not provide a Shutdown API to the +// Python program. + +namespace xla { + +// The header contains: +// 0. kOutfeedHeaderStart +// 1. consumer id +int constexpr kOutfeedHeaderWords = 2; +uint32_t constexpr kOutfeedHeaderStart = 271828; +// Special consumer IDs, without outfeed payload. +uint32_t constexpr kOutfeedCidShutdown = 0; + +// A Device and its PjRtClient. +struct DeviceWithClient { + Device* device; + std::shared_ptr client; +}; + +// Encapsulates data received from a device outfeed. +class OutfeedData { + public: + OutfeedData(DeviceWithClient device_client, uint32_t consumer_id, Shape shape) + : device_client_(device_client), + consumer_id_(consumer_id), + shape_(shape), + literal_(nullptr), + literal_size_bytes_(0) {} + + DeviceWithClient device_client() { return device_client_; } + uint32_t consumer_id() const { return consumer_id_; } + Shape shape() const { return shape_; } + std::unique_ptr literal() { + CHECK(literal_); + return std::move(literal_); + } + + void SetLiteral(std::unique_ptr literal); + + ssize_t literal_size_bytes() const { return literal_size_bytes_; } + + std::string DebugString() const; + + private: + DeviceWithClient device_client_; + uint32_t consumer_id_; + Shape shape_; + std::unique_ptr literal_; + ssize_t literal_size_bytes_; +}; + +void OutfeedData::SetLiteral(std::unique_ptr literal) { + literal_ = std::move(literal); + shape_ = literal_->shape(); + int total_size_bytes = 0; + ShapeUtil::ForEachSubshape( + shape_, [&](const Shape& literal_subshape, const ShapeIndex& index) { + if (!literal_subshape.IsTuple()) { + total_size_bytes += ShapeUtil::ByteSizeOf(literal_subshape, 8); + } + }); + literal_size_bytes_ = total_size_bytes; +} + +std::string OutfeedData::DebugString() const { + return absl::StrFormat("dev=%s; cons=%d; shape=%s", + device_client_.device->DebugString(), consumer_id_, + shape_.ToString()); +} + +class OutfeedReceiverImpl { + public: + OutfeedReceiverImpl(OutfeedReceiver::Callback callback, + std::vector> clients, + ssize_t max_callback_queue_size_bytes); + + OutfeedReceiverImpl(const OutfeedReceiverImpl&) = delete; + OutfeedReceiverImpl& operator=(const OutfeedReceiverImpl&) = delete; + + // Blocks until all data has been received from devices and all data + // in the queue has been passed to Python. + ~OutfeedReceiverImpl(); + + void Start(); + + StatusOr AddOutfeedToBuilder(XlaBuilder* builder, XlaOp token, + uint32_t consumer_id, + std::vector arrays); + + private: + bool CallbackQueueNotEmpty() const TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return !callback_queue_.empty(); + } + + bool CallbackQueueHasSpace() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return callback_queue_size_bytes_ < max_callback_queue_size_bytes_; + } + + bool ShutdownDone() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return (num_working_callback_threads_ == 0 && num_listening_threads_ == 0); + } + + void CallbackThreadLoop(); + void DeviceListenerThreadLoop(int device_idx); + + // Enqueues to a device an outfeed operation with a shutdown consumer ID. 
+ Status SendShutdownOutfeedHeader(int device_idx); + + // Receives a raw Literal from a device outfeed. + StatusOr> ReceiveRawFromOutfeed(const Device* device, + const Shape& shape); + + // Enqueues received data in the callbaback queue. + void EnqueueReceivedData(std::unique_ptr received) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + // Shuts down the threads. See implementation notes at top of file. + // It is not safe to restart an OutfeedReceiver after shutting down one. + void Shutdown(); + + OutfeedReceiver::Callback callback_; + // The devices on which we are listening, with their clients. + std::vector devices_; + // Maximum bytes capacity of the callback queue. + uint64_t max_callback_queue_size_bytes_; + + absl::Mutex mu_; + // Registered shapes by consumer id. + // The shape registry must be alive as long as the program exists. + // Right now we tell the user to never restart after Shutdown. + absl::flat_hash_map shape_registry_ TF_GUARDED_BY(mu_); + // How many bytes of Literal are in the callback queue. + uint64_t callback_queue_size_bytes_ TF_GUARDED_BY(mu_); + // Threads listening. + int num_listening_threads_ TF_GUARDED_BY(mu_); + bool shutdown_started_ TF_GUARDED_BY(mu_); + + // How many callback threads are still working. Used for shutdown. + int num_working_callback_threads_ TF_GUARDED_BY(mu_); + + std::queue> callback_queue_ TF_GUARDED_BY(mu_); + // The threadpool must come last to ensure the queue exists + // when the pool destructor is called. + std::unique_ptr threads_; +}; + +OutfeedReceiverImpl::OutfeedReceiverImpl( + OutfeedReceiver::Callback callback, + std::vector> clients, + ssize_t max_callback_queue_size_bytes) { + callback_ = callback; + max_callback_queue_size_bytes_ = max_callback_queue_size_bytes; + for (const auto& client : clients) { + for (const auto& device : client->devices()) { + devices_.push_back(DeviceWithClient{device.get(), client}); + } + } + CHECK_GT(devices_.size(), 0); + + callback_queue_size_bytes_ = 0; + num_listening_threads_ = 0; + num_working_callback_threads_ = 0; + shutdown_started_ = false; +} + +void OutfeedReceiverImpl::Start() { + { + absl::MutexLock lock(&mu_); + CHECK(!shutdown_started_); + } + int num_threads = 1 + devices_.size(); + threads_ = absl::make_unique( + tensorflow::Env::Default(), "outfeed_receiver", num_threads); + threads_->Schedule([this]() { CallbackThreadLoop(); }); + for (int device_idx = 0; device_idx < devices_.size(); ++device_idx) { + threads_->Schedule( + [this, device_idx]() { DeviceListenerThreadLoop(device_idx); }); + } +} + +void OutfeedReceiverImpl::Shutdown() { + VLOG(2) << "Shutdown start"; + { + absl::MutexLock lock(&mu_); + CHECK(!shutdown_started_); + shutdown_started_ = true; + } + for (int device_idx = 0; device_idx < devices_.size(); ++device_idx) { + CHECK(SendShutdownOutfeedHeader(device_idx).ok()); + } + VLOG(2) << "Shutdown waiting for listening and callback threads to stop"; + absl::MutexLock lock(&mu_); + mu_.Await(absl::Condition(this, &OutfeedReceiverImpl::ShutdownDone)); + VLOG(2) << "Shutdown done"; +} + +OutfeedReceiverImpl::~OutfeedReceiverImpl() { + VLOG(2) << "~OutfeedReceiverImpl"; + Shutdown(); +} + +void OutfeedReceiverImpl::DeviceListenerThreadLoop(int device_idx) { + { + absl::MutexLock lock(&mu_); + ++num_listening_threads_; + } + DeviceWithClient device_client = devices_[device_idx]; + while (true) { + Shape header_shape = ShapeUtil::MakeShape(U32, {kOutfeedHeaderWords}); + std::unique_ptr header = + ReceiveRawFromOutfeed(device_client.device, header_shape).ValueOrDie(); + 
absl::Span header_data = header->data(); + CHECK_EQ(header_data.size(), kOutfeedHeaderWords); + CHECK_EQ(header_data[0], kOutfeedHeaderStart); + uint32_t consumer_id = header_data[1]; + Shape shape; + { + absl::MutexLock lock(&mu_); + auto registered_shape = shape_registry_.find(consumer_id); + if (registered_shape == shape_registry_.end()) { + LOG(FATAL) + << "[" << device_client.device->DebugString() + << "] Cannot find registered shape for consumer ID " << consumer_id + << ". Perhaps the code was compiled with a different instance " + << "of OutfeedReceiver."; + } + shape = registered_shape->second; + } + auto received = + absl::make_unique(device_client, consumer_id, shape); + VLOG(2) << "Listener received header " << received->DebugString(); + if (consumer_id == kOutfeedCidShutdown) { + VLOG(2) << "[" << device_client.device->DebugString() + << "] Listener received shutdown header"; + absl::MutexLock lock(&mu_); + --num_listening_threads_; + if (num_listening_threads_ == 0) { + VLOG(2) << "Last listener shutdown; enqueue shutdown callback"; + EnqueueReceivedData(std::move(received)); + } + return; + } + std::unique_ptr data = + ReceiveRawFromOutfeed(device_client.device, shape).ValueOrDie(); + received->SetLiteral(std::move(data)); + absl::MutexLock lock(&mu_); + EnqueueReceivedData(std::move(received)); + } +} + +void OutfeedReceiverImpl::EnqueueReceivedData( + std::unique_ptr received) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + mu_.Await(absl::Condition(this, &OutfeedReceiverImpl::CallbackQueueHasSpace)); + ssize_t literal_size_bytes = received->literal_size_bytes(); + callback_queue_size_bytes_ += literal_size_bytes; + VLOG(2) << "Listener enqueues data " << received->DebugString() << " of size " + << literal_size_bytes << " bytes; " << (1 + callback_queue_.size()) + << " callbacks in queue of total size " << callback_queue_size_bytes_ + << " bytes.\n"; + callback_queue_.push(std::move(received)); +} + +StatusOr> OutfeedReceiverImpl::ReceiveRawFromOutfeed( + const Device* device, const Shape& shape) { + std::shared_ptr literal_shared; + + TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, + device->GetLocalDeviceState()); + TF_ASSIGN_OR_RETURN(Literal literal, + local_device->client()->TransferFromOutfeedLocal( + shape, local_device->device_ordinal())); + + return absl::make_unique(std::move(literal)); +} + +void OutfeedReceiverImpl::CallbackThreadLoop() { + { + absl::MutexLock lock(&mu_); + num_working_callback_threads_++; + CHECK_EQ(num_working_callback_threads_, 1); + } + while (true) { + std::unique_ptr received; + { + absl::MutexLock lock(&mu_); + mu_.Await( + absl::Condition(this, &OutfeedReceiverImpl::CallbackQueueNotEmpty)); + received = std::move(callback_queue_.front()); + callback_queue_.pop(); + callback_queue_size_bytes_ -= received->literal_size_bytes(); + VLOG(2) << "Dequeued callback for " << received->DebugString() << "; " + << callback_queue_.size() << " callbacks in queue of total size " + << callback_queue_size_bytes_ << " bytes.\n"; + } + if (received->consumer_id() == kOutfeedCidShutdown) { + VLOG(2) << "Callback loop received shutdown signal"; + { + absl::MutexLock lock(&mu_); + CHECK(callback_queue_.empty()); + CHECK_EQ(callback_queue_size_bytes_, 0); + --num_working_callback_threads_; + } + VLOG(2) << "Callback loop done"; + return; + } + { + tensorflow::profiler::TraceMe traceme("OutfeedReceiver::Callback"); + DeviceWithClient device_client = received->device_client(); + callback_(device_client.device, std::move(device_client.client), + 
received->consumer_id(), received->literal()); + } + } +} + +Status OutfeedReceiverImpl::SendShutdownOutfeedHeader(int device_idx) { + const Device* device = devices_[device_idx].device; + constexpr int consumer_id = kOutfeedCidShutdown; + VLOG(2) << "[" << device->DebugString() + << "] SendSpecialHeader cons=" << consumer_id; + XlaBuilder builder( + absl::StrFormat("special_outfeed_header_%d_%d", consumer_id, device_idx)); + XlaOp send = + AddOutfeedToBuilder(&builder, CreateToken(&builder), consumer_id, {}) + .ValueOrDie(); + XlaComputation computation = builder.Build(send).ValueOrDie(); + + CompileOptions compile_options; + compile_options.executable_build_options.set_num_replicas(1); + compile_options.executable_build_options.set_num_partitions(1); + DeviceAssignment device_assignment(1, 1); + device_assignment(0, 0) = device->id(); + compile_options.executable_build_options.set_device_assignment( + device_assignment); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, + PjRtExecutable::Compile(computation, devices_[device_idx].client.get(), + std::move(compile_options))); + ExecuteOptions execute_options; + TF_ASSIGN_OR_RETURN(std::vector> output_buffers, + executable->Execute({}, execute_options)); + return Status::OK(); +} + +StatusOr OutfeedReceiverImpl::AddOutfeedToBuilder( + XlaBuilder* builder, XlaOp token, uint32_t consumer_id, + std::vector arrays) { + XlaOp data = Tuple(builder, std::move(arrays)); + Shape shape_with_layout = builder->GetShape(data).ValueOrDie(); + ShapeUtil::ForEachMutableSubshape( + &shape_with_layout, [](Shape* subshape, const ShapeIndex&) { + if (!subshape->has_layout()) { + LayoutUtil::SetToDefaultLayout(subshape); + } + }); + VLOG(2) << "RegisterShape cons=" << consumer_id + << "; shape=" << shape_with_layout.ToString(); + { + absl::MutexLock lock(&mu_); + auto found = shape_registry_.find(consumer_id); + if (found != shape_registry_.end()) { + if (!ShapeUtil::Equal(shape_with_layout, found->second)) { + return InvalidArgument( + "Shape %s does not match previous shape %s used " + "for consumer id %d", + shape_with_layout.DebugString(), found->second.DebugString(), + consumer_id); + } + } else { + shape_registry_.insert({consumer_id, shape_with_layout}); + } + } + + std::vector header{kOutfeedHeaderStart, consumer_id}; + XlaOp header_op = ConstantR1(builder, header); + token = OutfeedWithToken( + header_op, token, ShapeUtil::MakeShape(U32, {kOutfeedHeaderWords}), ""); + if (consumer_id != kOutfeedCidShutdown) { + token = OutfeedWithToken(data, token, shape_with_layout, ""); + } + return token; +} + +OutfeedReceiver::OutfeedReceiver( + Callback callback, std::vector> clients, + ssize_t max_callback_queue_size_bytes) { + p_impl_ = absl::make_unique( + callback, std::move(clients), max_callback_queue_size_bytes); +} + +OutfeedReceiver::~OutfeedReceiver() {} + +void OutfeedReceiver::Start() { p_impl_->Start(); } + +StatusOr OutfeedReceiver::AddOutfeedToBuilder( + XlaBuilder* builder, XlaOp token, uint32_t consumer_id, + std::vector arrays) { + if (consumer_id == kOutfeedCidShutdown) { + return InvalidArgument("Consumer ID cannot be a reserved value: %d", + consumer_id); + } + return p_impl_->AddOutfeedToBuilder(builder, token, consumer_id, arrays); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/outfeed_receiver.h b/tensorflow/compiler/xla/python/outfeed_receiver.h new file mode 100644 index 00000000000..a0fdfcd36f0 --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver.h @@ -0,0 +1,77 @@ +/* Copyright 2020 The 
TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_H_ + +#include + +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +class OutfeedReceiverImpl; + +// Implements a multithreaded receiver of outfeeds from devices. +class OutfeedReceiver { + public: + // A callback takes: device, client (for the device), consumer id, received. + // The client pointer should be alive while the device is used. + using Callback = std::function, + uint32_t, std::shared_ptr)>; + + // Constructs the receiver for the given clients and callback function. + // + // Args: + // callback: a function to be called when an outfeed is ready for + // processing. + // clients: the clients for whose devices to listen. + // max_callback_queue_size_bytes: the maximum number of bytes for all + // received outfeeds queued to be processed. When this limit is reached + // we pause receiving outfeeds from devices. + OutfeedReceiver(Callback callback, + std::vector> clients, + ssize_t max_callback_queue_size_bytes); + + OutfeedReceiver(const OutfeedReceiver&) = delete; + OutfeedReceiver& operator=(const OutfeedReceiver&) = delete; + + // Blocks until all data has been received from devices and all data + // in the queue has been passed to Python. + ~OutfeedReceiver(); + + // Starts the listener threads and the callback thread. + void Start(); + + // Adds to the computation builder the outfeed of the arrays. + // Has the side-effect of registering the sent shape for the consumer_id. + // Returns error status if the outfeed shape is different than the + // previously used shape for the same consumer_id or the consumer id is + // invalid. + StatusOr AddOutfeedToBuilder(XlaBuilder* builder, XlaOp token, + uint32_t consumer_id, + std::vector arrays); + + private: + std::unique_ptr p_impl_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_H_ diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_py.cc b/tensorflow/compiler/xla/python/outfeed_receiver_py.cc new file mode 100644 index 00000000000..a6256cfe86c --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver_py.cc @@ -0,0 +1,156 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/outfeed_receiver_py.h" + +#include + +#include "absl/memory/memory.h" +#include "absl/synchronization/mutex.h" +#include "pybind11/functional.h" +#include "pybind11/pybind11.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/python/outfeed_receiver.h" +#include "tensorflow/compiler/xla/python/types.h" + +namespace xla { + +namespace py = pybind11; + +namespace { + +// A wrapper for OutfeedReceiver for use from Python, useful for ensuring +// that the GIL is released before destroying the OutfeedReceiver. +class OutfeedReceiverForPython { + public: + // A callback to Python takes: consumer id, received literal. + using CallbackToPython = + std::function, uint32_t, pybind11::object)>; + + OutfeedReceiverForPython(CallbackToPython callback_python, + std::vector> clients, + ssize_t max_callback_queue_size_bytes) { + callback_python_ = callback_python; + outfeed_receiver_shutting_down_ = false; + OutfeedReceiver::Callback callback = + [this](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr literal) { + this->Callback(device, client, consumer_id, literal); + }; + outfeed_receiver_ = absl::make_unique( + callback, std::move(clients), max_callback_queue_size_bytes); + } + OutfeedReceiverForPython(const OutfeedReceiverForPython&) = delete; + OutfeedReceiverForPython& operator=(const OutfeedReceiverForPython&) = delete; + + ~OutfeedReceiverForPython() { + // This destructor is called from the Python GC. Release it for the duration + // of the destruction, including the destruction of the OutfeedReceiver, + // when we may actually have to wait for threads to end. During this time + // we do not callback to Python (sometimes we get an exception + // "std::runtime_error: scoped_acquire::dec_ref(): thread state must + // be current!""). + { + absl::MutexLock lock(&mu_); + outfeed_receiver_shutting_down_ = true; + } + py::gil_scoped_release gil_release; + outfeed_receiver_ = nullptr; // Shutdown the outfeed receiver. + } + + void Start() { outfeed_receiver_->Start(); } + + StatusOr AddOutfeed(XlaBuilder* builder, XlaOp token, + uint32_t consumer_id, std::vector arrays) { + return outfeed_receiver_->AddOutfeedToBuilder(builder, token, consumer_id, + arrays); + } + + void Callback(Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr literal) { + { + absl::MutexLock lock(&mu_); + if (outfeed_receiver_shutting_down_) { + VLOG(2) << "Ignoring unsafe callback to Python during shutdown"; + return; + } + } + py::gil_scoped_acquire gil_acquire; // Need GIL also for LiteralToPython + py::object literal_python = + LiteralToPython(std::move(literal)).ValueOrDie(); + // The callback_ should handle all exceptions in user-code. If we get + // an exception here, it is a bug in the callback and we should stop. 
+ callback_python_(WrapWithClient(std::move(client), device), + consumer_id, std::move(literal_python)); + } + + private: + CallbackToPython callback_python_; + absl::Mutex mu_; + bool outfeed_receiver_shutting_down_ TF_GUARDED_BY(mu_); + std::unique_ptr outfeed_receiver_; +}; + +} // namespace + +void BuildOutfeedReceiverSubmodule(py::module* m) { + py::module outfeed_receiver = + m->def_submodule("outfeed_receiver", "Outfeed receiver"); + outfeed_receiver.def( + "start", + [](OutfeedReceiverForPython::CallbackToPython callback_to_python, + std::vector> clients, + ssize_t max_callback_queue_size_bytes) + -> std::unique_ptr { + auto server = absl::make_unique( + callback_to_python, clients, max_callback_queue_size_bytes); + server->Start(); + return server; + }, + py::arg("callback_to_python"), py::arg("backends"), + py::arg("max_queue_size_bytes") = 256 * 1024 * 1024, + R"(Starts a multithreaded outfeed receiver. + + There is one thread for each of the specified devices. When Python + drops the last reference to the returned object, the receiver is shut + down. The destructor will block until all data is received from + devices. + + Args: + * callback_to_python: a Python callback to call, with + and the data received. + * backends: the list of backends to listen on. + * max_queue_size_bytes: an optional integer to bound the maximum size + of arrays in the callback queue. When this limit is reached the + device listener pauses. + )", + py::call_guard()); + + py::class_ outfeed_receiver_class( + outfeed_receiver, "OutfeedReceiverForPython"); + + outfeed_receiver_class.def( + "add_outfeed", &OutfeedReceiverForPython::AddOutfeed, py::arg("builder"), + py::arg("token"), py::arg("consumer_id"), py::arg("arrays"), + R"(Adds an outfeed into the given computation builder. + + Has the side-effect of registering the sent shape along with the consumer + ID. Returns error if the outfeed shape is not compatible with previously + used shape for the same consumer ID.)", + py::call_guard()); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_py.h b/tensorflow/compiler/xla/python/outfeed_receiver_py.h new file mode 100644 index 00000000000..6b1a712327a --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver_py.h @@ -0,0 +1,27 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_PY_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_PY_H_ + +#include "pybind11/pybind11.h" + +namespace xla { + +void BuildOutfeedReceiverSubmodule(pybind11::module* m); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_OUTFEED_RECEIVER_PY_H_ diff --git a/tensorflow/compiler/xla/python/outfeed_receiver_test.cc b/tensorflow/compiler/xla/python/outfeed_receiver_test.cc new file mode 100644 index 00000000000..ea84b4e18d6 --- /dev/null +++ b/tensorflow/compiler/xla/python/outfeed_receiver_test.cc @@ -0,0 +1,258 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/python/outfeed_receiver.h" + +#include + +#include "absl/synchronization/mutex.h" +#include "tensorflow/compiler/xla/client/executable_build_options.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/pjrt/cpu_device.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/test.h" + +namespace xla { + +namespace { + +Status CompileAndExecute(XlaBuilder* builder, XlaOp root, int device_id, + PjRtClient* client) { + XlaComputation computation = builder->Build(root).ValueOrDie(); + + CompileOptions compile_options; + compile_options.executable_build_options.set_num_replicas(1); + compile_options.executable_build_options.set_num_partitions(1); + DeviceAssignment device_assignment(1, 1); + device_assignment(0, 0) = device_id; + compile_options.executable_build_options.set_device_assignment( + device_assignment); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr executable, + PjRtExecutable::Compile(computation, client, std::move(compile_options))); + ExecuteOptions execute_options; + TF_ASSIGN_OR_RETURN(std::vector> output_buffers, + executable->Execute({}, execute_options)); + return Status::OK(); +} + +// Accumulates the received data. 
+class Accumulator { + public: + struct Data { + uint32_t consumer_id; + std::shared_ptr data; + }; + + void Receive(uint32_t consumer_id, std::shared_ptr data) { + absl::MutexLock lock(&mutex_); + received_.push_back(Data{consumer_id, data}); + } + + std::vector received() { + absl::MutexLock lock(&mutex_); + return received_; + } + + private: + absl::Mutex mutex_; + std::vector received_ TF_GUARDED_BY(mutex_); +}; + +TEST(OutfeedReceiverTest, ReceiveOutfeedSimple) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder("execute_test_outfeed"); + constexpr int consumer_id0 = 5; + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data = Iota(&builder, shape0, 0); + XlaOp send = outfeed_receiver + ->AddOutfeedToBuilder(&builder, CreateToken(&builder), + consumer_id0, {data}) + .ValueOrDie(); + EXPECT_TRUE(CompileAndExecute(&builder, send, 0, cpu_client.get()).ok()); + + // Shutdown the receiver, to force it to wait to deliver the callbacks. + outfeed_receiver = nullptr; + std::vector received = receiver->received(); + EXPECT_EQ(1, received.size()); + EXPECT_EQ(consumer_id0, received[0].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape0}), received[0].data->shape()); +} + +TEST(OutfeedReceiverTest, ReceiveOutfeedTwoComputations) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder0("execute_test_outfeed_0"); + constexpr int consumer_id0 = 5; + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data0 = Iota(&builder0, shape0, 0); + XlaOp send0 = outfeed_receiver + ->AddOutfeedToBuilder(&builder0, CreateToken(&builder0), + consumer_id0, {data0}) + .ValueOrDie(); + EXPECT_TRUE(CompileAndExecute(&builder0, send0, 0, cpu_client.get()).ok()); + + XlaBuilder builder1("execute_test_outfeed_1"); + constexpr int consumer_id1 = 6; + const Shape shape1 = ShapeUtil::MakeShape(U32, {128}); + XlaOp data1 = Iota(&builder1, shape1, 0); + XlaOp send1 = outfeed_receiver + ->AddOutfeedToBuilder(&builder1, CreateToken(&builder1), + consumer_id1, {data1}) + .ValueOrDie(); + EXPECT_TRUE(CompileAndExecute(&builder1, send1, 0, cpu_client.get()).ok()); + + // Shutdown the receiver, to force it to wait to deliver the callbacks. 
+ outfeed_receiver = nullptr; + std::vector received = receiver->received(); + EXPECT_EQ(2, received.size()); + EXPECT_EQ(consumer_id0, received[0].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape0}), received[0].data->shape()); + EXPECT_EQ(consumer_id1, received[1].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape1}), received[1].data->shape()); +} + +TEST(OutfeedReceiverTest, ReceiveOutfeedTwoOutfeed) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder("execute_test_outfeed"); + constexpr int consumer_id0 = 5; + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data0 = Iota(&builder, shape0, 0); + XlaOp send0 = outfeed_receiver + ->AddOutfeedToBuilder(&builder, CreateToken(&builder), + consumer_id0, {data0}) + .ValueOrDie(); + + constexpr int consumer_id1 = 6; + const Shape shape1 = ShapeUtil::MakeShape(U32, {128}); + XlaOp data1 = Iota(&builder, shape1, 0); + XlaOp send1 = + outfeed_receiver + ->AddOutfeedToBuilder(&builder, send0, consumer_id1, {data1}) + .ValueOrDie(); + EXPECT_TRUE(CompileAndExecute(&builder, send1, 0, cpu_client.get()).ok()); + + // Shutdown the receiver, to force it to wait to deliver the callbacks. + outfeed_receiver = nullptr; + std::vector received = receiver->received(); + EXPECT_EQ(2, received.size()); + EXPECT_EQ(consumer_id0, received[0].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape0}), received[0].data->shape()); + EXPECT_EQ(consumer_id1, received[1].consumer_id); + EXPECT_EQ(ShapeUtil::MakeTupleShape({shape1}), received[1].data->shape()); +} + +TEST(OutfeedReceiverTest, DifferentShapeForConsumerIdError) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder("execute_test_outfeed"); + constexpr int consumer_id0 = 5; + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data0 = Iota(&builder, shape0, 0); + XlaOp send0 = outfeed_receiver + ->AddOutfeedToBuilder(&builder, CreateToken(&builder), + consumer_id0, {data0}) + .ValueOrDie(); + + const Shape shape1 = ShapeUtil::MakeShape(U32, {128}); + XlaOp data1 = Iota(&builder, shape1, 0); + // A different shape for the same consumer ID. 
+ StatusOr send1 = outfeed_receiver->AddOutfeedToBuilder( + &builder, send0, consumer_id0, {data1}); + EXPECT_FALSE(send1.ok()); + EXPECT_THAT(send1.status().ToString(), + testing::HasSubstr("does not match previous shape element_type")); +} + +TEST(OutfeedReceiverTest, InvalidConsumerIdError) { + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr cpu_client, + GetCpuClient(true)); + std::vector> clients{cpu_client}; + + auto receiver = absl::make_unique(); + OutfeedReceiver::Callback callback = + [&receiver](Device* device, std::shared_ptr client, + uint32_t consumer_id, std::shared_ptr data) { + receiver->Receive(consumer_id, data); + }; + auto outfeed_receiver = + std::make_shared(callback, clients, 128); + outfeed_receiver->Start(); + + XlaBuilder builder("execute_test_outfeed"); + const Shape shape0 = ShapeUtil::MakeShape(U32, {16}); + XlaOp data0 = Iota(&builder, shape0, 0); + StatusOr send0 = outfeed_receiver->AddOutfeedToBuilder( + &builder, CreateToken(&builder), 0, {data0}); + + EXPECT_FALSE(send0.ok()); + EXPECT_THAT(send0.status().ToString(), + testing::HasSubstr("Consumer ID cannot be a reserved value")); +} + +} // namespace + +} // namespace xla diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index fb7d7df58f7..0b6824e83e9 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -43,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/xla/python/bfloat16.h" #include "tensorflow/compiler/xla/python/dlpack.h" #include "tensorflow/compiler/xla/python/ops.h" +#include "tensorflow/compiler/xla/python/outfeed_receiver_py.h" #include "tensorflow/compiler/xla/python/python_ref_manager.h" #include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" @@ -1165,6 +1166,7 @@ PYBIND11_MODULE(xla_extension, m) { BuildOpsSubmodule(&m); BuildProfilerSubmodule(&m); + BuildOutfeedReceiverSubmodule(&m); py::class_> From aa37eea584f6364c5e1bb97c40db53d529aa7e36 Mon Sep 17 00:00:00 2001 From: Jake Tae Date: Wed, 27 May 2020 22:12:18 +0900 Subject: [PATCH 1199/1533] fix: remove unnecessary docstring --- .../python/keras/applications/mobilenet_v2.py | 25 ------------------- 1 file changed, 25 deletions(-) diff --git a/tensorflow/python/keras/applications/mobilenet_v2.py b/tensorflow/python/keras/applications/mobilenet_v2.py index 87fe82ef43e..dda1ed64215 100644 --- a/tensorflow/python/keras/applications/mobilenet_v2.py +++ b/tensorflow/python/keras/applications/mobilenet_v2.py @@ -500,37 +500,12 @@ def _make_divisible(v, divisor, min_value=None): @keras_export('keras.applications.mobilenet_v2.preprocess_input') def preprocess_input(x, data_format=None): - """Preprocesses a numpy array encoding a batch of images. - - Arguments - x: A 4D numpy array consists of RGB values within [0, 255]. - - Returns - Preprocessed array. - - Raises - ValueError: In case of unknown `data_format` argument. - """ return imagenet_utils.preprocess_input( x, data_format=data_format, mode='tf') @keras_export('keras.applications.mobilenet_v2.decode_predictions') def decode_predictions(preds, top=5): - """Decodes the prediction result from the model. - - Arguments - preds: Numpy tensor encoding a batch of predictions. - top: Integer, how many top-guesses to return. - - Returns - A list of lists of top class prediction tuples - `(class_name, class_description, score)`. - One list of tuples per sample in batch input. 
- - Raises - ValueError: In case of invalid shape of the `preds` array (must be 2D). - """ return imagenet_utils.decode_predictions(preds, top=top) From b1a712d40d67a7a9f88d6e2f5f5fe28fa10c7f1e Mon Sep 17 00:00:00 2001 From: Thomas Joerg Date: Wed, 27 May 2020 07:59:16 -0700 Subject: [PATCH 1200/1533] Remove the xla_gpu_unsafe_fallback_to_driver_on_ptxas_error flag. PiperOrigin-RevId: 313389132 Change-Id: Ic97116d9b471e96822ee28032ce0ddef5616a4f0 --- tensorflow/compiler/xla/debug_options_flags.cc | 10 ---------- .../compiler/xla/service/gpu/nvptx_compiler.cc | 17 ++++++----------- tensorflow/compiler/xla/xla.proto | 10 ++++------ 3 files changed, 10 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 4152982bf4c..cad73b593a2 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -73,7 +73,6 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_enable_xprof_traceme(true); // TODO(b/155295372): disable ptxas fallback by default. opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(true); - opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(false); return opts; } @@ -567,15 +566,6 @@ static void AllocateFlags() { "that falling back to the driver can have drawbacks like using more " "memory and/or other bugs during compilation, so we recommend setting " "this flag to false.")); - flag_objects->push_back(tensorflow::Flag( - "xla_gpu_unsafe_fallback_to_driver_on_ptxas_error", - bool_setter_for( - &DebugOptions::set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error), - flag_values->xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(), - "If true, XLA GPU falls back to the driver if there is an error when " - "running ptxas. Note that falling back to the driver can have drawbacks " - "like using more memory and/or other bugs during compilation, so we " - "recommend setting this flag to false.")); ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects); } diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 7ff8d40b440..b0b214832ea 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -411,17 +411,12 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( " Use at your own risk though, it has known drawbacks like " "increased memory consumption."; } else { - LOG(ERROR) << "Error during compilation of ptx to sass: " - << maybe_cubin.status(); - CHECK(hlo_module_config.debug_options() - .xla_gpu_unsafe_fallback_to_driver_on_ptxas_error()) - << "There was an error when trying to compile ptx into sass " - "code. Up until May 14 2020, XLA silently ignored such " - "errors and fell back to the GPU driver. This is likely to " - "trigger subtle runtime issues and is hence discouraged. 
" - "If you want to temporarily restore this behavior use the " - "flag --xla_gpu_unsafe_fallback_to_driver_on_ptxas_error " - "and file a bug in b/components/366096."; + LOG(FATAL) << "ptxas returned an error during compilation of ptx " + "to sass: '" + << maybe_cubin.status() << "' " + << "If the error message indicates that a file could " + "not be written, please verify that sufficient " + "filesystem space is provided."; } // We're going to use the driver to JIT our PTX->SASS, so warn if diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 9374b1fca6a..6595bcbe292 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -287,18 +287,16 @@ message DebugOptions { // memory, or have bugs. bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found = 138; - // It is usually preferable to not fallback to the driver; it can consume more - // memory, or have bugs. - bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_error = 139; - // Next id: 141 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. map xla_backend_extra_options = 500; - reserved 5, 117, 133; // were xla_hlo_dump_as_graphdef, xla_dump_to, and - // xla_gpu_use_horizontal_fusion + reserved 5, 117, 133, + 139; // were xla_hlo_dump_as_graphdef, xla_dump_to, + // xla_gpu_use_horizontal_fusion, and + // xla_gpu_unsafe_fallback_to_driver_on_ptxas_error } // These settings control how XLA compiles and/or runs code. Not all settings From b6e69da1f90f059d23a72d51af37346ffa109b9f Mon Sep 17 00:00:00 2001 From: stevensa Date: Wed, 27 May 2020 17:31:35 +0200 Subject: [PATCH 1201/1533] Tidy: consistent formatting of patched pattern. --- .../compiler/mlir/lite/transforms/optimize_patterns.td | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 6797d39ccdf..e8f1c9c2cf3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -33,11 +33,10 @@ def HasOneUse : Constraint>; class HasRankAtMost : Constraint< CPred<"$0.getType().cast().getRank() <= " # n>>; - // Checks value is not produce by a TLF_QUant with // different quantization attribute - def NotFromQuantOpDifferentQuant : Constraint>; + //===----------------------------------------------------------------------===// // Ternary ops patterns. //===----------------------------------------------------------------------===// @@ -169,7 +168,10 @@ foreach BinaryOp = [TFL_DivOp, TFL_MulOp] in // This pattern applies when the same quantize/dequantize have been used twice // with the same scale. We want to remove the redundancy. // TODO(fengliuai): move this to the sanity check of pre-quantize pass. -def eliminate_dq_q_pairs : Pat<(TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), (replaceWithValue $in), [(NotFromQuantOpDifferentQuant $in, $qt)]>; +def eliminate_dq_q_pairs : Pat< + (TFL_QuantizeOp (TFL_DequantizeOp $in), $qt), + (replaceWithValue $in), + [(NotFromQuantOpDifferentQuant $in, $qt)]>; // Constraint that makes sure both operands are the same operands. From b97bfb5d69eac209e45a765cba7ee4ab0d5333a0 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 27 May 2020 08:45:22 -0700 Subject: [PATCH 1202/1533] Prevent local directory traversal when GCS has a name starting with `/`. 
PiperOrigin-RevId: 313396192 Change-Id: If18872476818bc9b2ad2340b22b4275140fbb000 --- tensorflow/python/lib/io/file_io.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py index 7c484c825d3..c904cba08f8 100644 --- a/tensorflow/python/lib/io/file_io.py +++ b/tensorflow/python/lib/io/file_io.py @@ -736,6 +736,15 @@ def walk_v2(top, topdown=True, onerror=None): `(dirname, [subdirname, subdirname, ...], [filename, filename, ...])`. Each item is a string. """ + + def _make_full_path(parent, item): + # Since `os.path.join` discards paths before one that starts with the path + # separator (https://docs.python.org/3/library/os.path.html#os.path.join), + # we have to manually handle that case as `/` is a valid character on GCS. + if item[0] == os.sep: + return "".join([os.path.join(parent, ""), item]) + return os.path.join(parent, item) + top = compat.as_str_any(top) try: listing = list_directory(top) @@ -748,7 +757,7 @@ def walk_v2(top, topdown=True, onerror=None): files = [] subdirs = [] for item in listing: - full_path = os.path.join(top, item) + full_path = _make_full_path(top, item) if is_directory(full_path): subdirs.append(item) else: @@ -760,7 +769,8 @@ def walk_v2(top, topdown=True, onerror=None): yield here for subdir in subdirs: - for subitem in walk_v2(os.path.join(top, subdir), topdown, onerror=onerror): + for subitem in walk_v2( + _make_full_path(top, subdir), topdown, onerror=onerror): yield subitem if not topdown: From 96ba1c3609b8a0210bdc72c2ba339cf81831f998 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Wed, 27 May 2020 09:31:24 -0700 Subject: [PATCH 1203/1533] [XLA] BufferValue::Color now type aliases int64. PiperOrigin-RevId: 313404227 Change-Id: I2d393d426865c61ff210f10e3d9b8402a1813cf1 --- .../compiler/xla/service/buffer_assignment.cc | 26 +++++++------------ .../compiler/xla/service/buffer_assignment.h | 3 +-- .../compiler/xla/service/buffer_value.cc | 2 +- .../compiler/xla/service/buffer_value.h | 5 ++-- tensorflow/compiler/xla/service/hlo_value.cc | 3 +-- .../compiler/xla/service/logical_buffer.cc | 2 +- 6 files changed, 16 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index 67cdb081a91..6cd58b86f0c 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -261,7 +261,7 @@ void BufferAllocation::AddAssignment(const HloValue& buffer, int64 offset, Shape* shape = ShapeUtil::GetMutableSubshape( position.instruction->mutable_shape(), position.index); if (shape->has_layout()) { - shape->mutable_layout()->set_memory_space(buffer.color().value()); + shape->mutable_layout()->set_memory_space(buffer.color()); } } } @@ -272,7 +272,7 @@ BufferAllocationProto BufferAllocation::ToProto() const { proto.set_size(size_); proto.set_is_thread_local(is_thread_local_); proto.set_is_tuple(is_tuple_); - proto.set_color(color_.value()); + proto.set_color(color_); if (is_entry_computation_parameter_) { proto.set_is_entry_computation_parameter(true); for (int64 idx : param_shape_index()) { @@ -336,8 +336,8 @@ static const HloInstruction* GetOutputInstruction( string BufferAllocation::ToString() const { string output; StrAppendFormat(&output, "allocation %d: %p, size %d", index_, this, size()); - if (color().value() != 0) { - StrAppend(&output, ", color ", color().value()); + if (color() != 0) { + StrAppend(&output, ", 
color ", color()); } if (is_entry_computation_parameter()) { const HloInstruction* param = GetEntryParameterInstruction(*this); @@ -607,9 +607,7 @@ void BufferAssignment::AddAssignment(BufferAllocation* allocation, // BufferAllocation. void BufferAssignment::CombineTempAllocations() { VLOG(1) << "CombineTempAllocations()"; - flat_hash_map - combined_allocation_map; + flat_hash_map combined_allocation_map; // Move all temp allocations into a single run at the end of the allocations // vector. @@ -1059,8 +1057,8 @@ Status BufferAssigner::MergeInplaceOpBuffers(BufferAssignment* assignment) { // The instruction or operand color is excluded because it was assigned by // memory_space_assignment. - if (excluded_colors.contains(instruction_buffer.color().value()) || - excluded_colors.contains(operand_buffer.color().value())) { + if (excluded_colors.contains(instruction_buffer.color()) || + excluded_colors.contains(operand_buffer.color())) { continue; } @@ -1353,13 +1351,10 @@ Status BufferAssigner::AssignBuffersForComputations( return Status::OK(); } -flat_hash_map, - LogicalBuffer::Color::Hasher> +flat_hash_map> BufferAssigner::SplitBuffersByColor( const flat_hash_set& buffers) { - flat_hash_map, - LogicalBuffer::Color::Hasher> - color_map; + flat_hash_map> color_map; for (auto buffer : buffers) { color_map[buffer->color()].insert(buffer); } @@ -1374,8 +1369,7 @@ Status BufferAssigner::AssignPresetBuffers( } // Create an allocation for each preset color. - absl::flat_hash_map + absl::flat_hash_map preset_allocations; for (auto& color_and_info : preset_assignments_->assignment_informations()) { LogicalBuffer::Color color(color_and_info.first); diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 2a02d3776ce..50a4750601b 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -673,8 +673,7 @@ class BufferAssigner { // Split a set of buffers into several sets, each of which contains buffers // colored with the same color. absl::flat_hash_map, - LogicalBuffer::Color::Hasher> + absl::flat_hash_set> SplitBuffersByColor(const absl::flat_hash_set& buffers); // If true, allocate buffers for constant instructions. diff --git a/tensorflow/compiler/xla/service/buffer_value.cc b/tensorflow/compiler/xla/service/buffer_value.cc index b1abba20689..58e8086f5e9 100644 --- a/tensorflow/compiler/xla/service/buffer_value.cc +++ b/tensorflow/compiler/xla/service/buffer_value.cc @@ -59,7 +59,7 @@ LogicalBufferProto BufferValue::ToProto(const SizeFunction& size_fn) const { ToLocationProto(*instruction(), index()); proto.mutable_defined_at()->Swap(&proto_location); if (has_color()) { - proto.set_color(color().value()); + proto.set_color(color()); } return proto; } diff --git a/tensorflow/compiler/xla/service/buffer_value.h b/tensorflow/compiler/xla/service/buffer_value.h index 44cd7b5ebbd..bd2a09e4aaf 100644 --- a/tensorflow/compiler/xla/service/buffer_value.h +++ b/tensorflow/compiler/xla/service/buffer_value.h @@ -25,7 +25,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/gtl/int_type.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -86,7 +85,7 @@ namespace xla { class BufferValue { public: - TF_LIB_GTL_DEFINE_INT_TYPE(Color, int64); + using Color = int64; // Id is a unique identifier for the BufferValue to facilitate efficient // collections of BufferValues with stable iteration order. @@ -154,7 +153,7 @@ class BufferValue { static LogicalBufferProto::Location ToLocationProto( const HloInstruction& instruction, const ShapeIndex& index); - const Color kInvalidColor = Color(-1); + const Color kInvalidColor = -1; protected: BufferValue(HloInstruction* instruction, const ShapeIndex& index, Id id); diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index acc077ab12d..e57c8a83b23 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -91,8 +91,7 @@ string HloValue::ToShortString() const { return absl::StrFormat( "<%d %s%s%s%s>", id(), instruction()->name(), instruction()->shape().IsTuple() ? index().ToString() : "", - is_phi() ? " (phi)" : "", - has_color() ? StrCat(" @", color().value()) : ""); + is_phi() ? " (phi)" : "", has_color() ? StrCat(" @", color()) : ""); } string HloValue::ToString(int indent) const { diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc index e1f56727bd2..d937d53d550 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.cc +++ b/tensorflow/compiler/xla/service/logical_buffer.cc @@ -34,7 +34,7 @@ LogicalBuffer::~LogicalBuffer() {} string LogicalBuffer::ToString() const { string color_string; if (has_color()) { - color_string = absl::StrCat(" @", color().value()); + color_string = absl::StrCat(" @", color()); } return absl::StrCat(instruction_->name(), "[", absl::StrJoin(index_, ","), "](#", id(), color_string, ")"); From b847ff9b3067a101296d1d857358b5bdeefd2342 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Wed, 27 May 2020 09:37:50 -0700 Subject: [PATCH 1204/1533] Throw relevant exceptions based on status when copying Eager tensors. Instead of blindly throwing a RuntimeError, throw a registered OpError exception based on the status when executing EagerTensor `.numpy()`. 
PiperOrigin-RevId: 313405387 Change-Id: I6ee8e804f96c9baf0c1af77a958bb1f4b26e614b --- tensorflow/python/eager/BUILD | 8 ++++++++ tensorflow/python/eager/pywrap_tensor.cc | 13 ++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index adc30eab5e1..a44d8a493c1 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -2,6 +2,8 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_py_test") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") load( @@ -28,6 +30,10 @@ cc_library( "pywrap_tensor_conversion.h", "pywrap_tfe.h", ], + copts = ["-fexceptions"], + features = [ + "-use_header_modules", # Required for pybind11 + ], visibility = [ "//learning/deepmind/courier:__subpackages__", "//tensorflow:internal", @@ -54,6 +60,7 @@ cc_library( "//tensorflow/python:ndarray_tensor", "//tensorflow/python:ndarray_tensor_bridge", "//tensorflow/python:numpy_lib", + "//tensorflow/python:py_exception_registry", "//tensorflow/python:py_seq_tensor", "//tensorflow/python:safe_ptr", "//third_party/py/numpy:headers", @@ -63,6 +70,7 @@ cc_library( "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:variant", + "@pybind11", ], ) diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index b209ddb6162..031545531f1 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -21,9 +21,11 @@ limitations under the License. #include #include "structmember.h" // NOLINT // For PyMemberDef +#include "pybind11/pybind11.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/tf_status.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -32,6 +34,7 @@ limitations under the License. #include "tensorflow/python/lib/core/ndarray_tensor.h" #include "tensorflow/python/lib/core/ndarray_tensor_bridge.h" #include "tensorflow/python/lib/core/numpy.h" +#include "tensorflow/python/lib/core/py_exception_registry.h" #include "tensorflow/python/lib/core/py_seq_tensor.h" #include "tensorflow/python/lib/core/safe_ptr.h" @@ -300,7 +303,15 @@ TFE_TensorHandle* ConvertToEagerTensorUncached(TFE_Context* ctx, strstr(device_name, "/device:CPU:0") != nullptr) { handle = make_safe(TFE_TensorHandleCopyToDevice(handle.get(), ctx, device_name, status.get())); - if (MaybeRaiseExceptionFromTFStatus(status.get(), PyExc_RuntimeError)) { + const TF_Code code = TF_GetCode(status.get()); + if (code != TF_OK) { + // Instead of raising a generic RuntimeError, raise an exception type + // based on the status error code. + PyObject* exception = PyExceptionRegistry::Lookup(code); + PyErr_SetObject(exception, + pybind11::make_tuple(pybind11::none(), pybind11::none(), + TF_Message(status.get())) + .ptr()); return nullptr; } } From 5db729c9d63a224625fd1f396a7c45500145d73a Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 27 May 2020 09:44:03 -0700 Subject: [PATCH 1205/1533] Eliminate tf.IfRegion non-condition inputs The then and else regions can reference their inputs directly without having to wire them through the IfRegion op inputs. 
This will allow a more direct representation of how these values are used within these regions PiperOrigin-RevId: 313406455 Change-Id: I0756f659c9dec4ef348c38f358bf294b3d004ae3 --- .../compiler/mlir/tensorflow/ir/tf_ops.td | 2 - .../mlir/tensorflow/tests/tf-ops.mlir | 65 +++++++------------ 2 files changed, 25 insertions(+), 42 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 1b8f9eb4bb6..7f31c274a09 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -237,7 +237,6 @@ cond: A Tensor. If the tensor is a scalar of non-boolean type, the True and zero means False; if the scalar is a string, non-empty means True and empty means False. If the tensor is not a scalar, being empty means False and being non-empty means True. -input: A list of input tensors. then_branch: A region that computes the outputs of the op if cond = true. It returns a list of tensors using tf.yield (as the terminator). The types of these returned tensors is same as that of the else_branch @@ -248,7 +247,6 @@ else_branch: A region that computes the outputs of the op if cond = false. let arguments = (ins TF_Tensor:$cond, - Variadic:$input, DefaultValuedAttr:$output_shapes, diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index c0d1a914788..2e00dd6a517 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -865,13 +865,14 @@ func @testInvalidYieldOp(%arg0: f32) -> () { // Test valid tf.IfRegion operation // CHECK-LABEL: func @testValidIfRegionOp func @testValidIfRegionOp(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %neg = "tf.Neg"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () }, { - %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + %e = "tf.Acos"(%neg) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%e) : (tensor<2xf32>) -> () - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -881,7 +882,7 @@ func @testValidIfRegionOp(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf // Test valid tf.IfRegion operation with multiple results // CHECK-LABEL: func @testValidIfRegionOpWithMultipleResults func @testValidIfRegionOpWithMultipleResults(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { - %0, %1, %2 = "tf.IfRegion"(%arg0, %arg1) ({ + %0, %1, %2 = "tf.IfRegion"(%arg0) ({ %t0 = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> %t1 = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> %t2 = "tf.Acosh"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> @@ -891,7 +892,7 @@ func @testValidIfRegionOpWithMultipleResults(%arg0: tensor, %arg1: tensor<2x %e1 = "tf.Relu"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> %e2 = "tf.Sin"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%e0, %e1, %e2) : (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> () - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) + }) { is_stateless = false} : (tensor) -> (tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) %3 = "tf.Add"(%0, %1) : (tensor<2xf32>, tensor<2xf32>) -> tensor<2xf32> %4 = "tf.Add"(%2, %3) : (tensor<2xf32>, tensor<2xf32>) -> 
tensor<2xf32> @@ -903,42 +904,26 @@ func @testValidIfRegionOpWithMultipleResults(%arg0: tensor, %arg1: tensor<2x // Test invalid type for operand #0 for tf.IfRegion operation func @testInvalidIfRegionOpType0(%arg0: f32, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{operand #0 must be tensor of tf.dtype values}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () }, { %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%e) : (tensor<2xf32>) -> () - }) { is_stateless = false} : (f32, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (f32) -> tensor<2xf32> return %0 : tensor<2xf32> } // ----- -// Test invalid type for operand #1 for tf.IfRegion operation -func @testInvalidIfRegionOpType1(%arg0: tensor, %arg1: f32) -> f32 { - // expected-error @+1 {{operand #1 must be tensor of tf.dtype values}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ - %t = addf %arg1, %arg1 : f32 - "tf.Yield"(%t) : (f32) -> () - }, { - %e = mulf %arg1, %arg1 : f32 - "tf.Yield"(%e) : (f32) -> () - }) { is_stateless = false} : (tensor, f32) -> f32 - - return %0 : f32 -} - -// ----- - // tf.IfRegion operation should have 2 regions func @testInvalidIfRegionOp1Region(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{op expected 2 regions}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -947,7 +932,7 @@ func @testInvalidIfRegionOp1Region(%arg0: tensor, %arg1: tensor<2xf32>) -> t func @testInvalidIfRegionOpNoRegions(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{op expected 2 regions}} - %0 = "tf.IfRegion"(%arg0, %arg1) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + %0 = "tf.IfRegion"(%arg0) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -956,7 +941,7 @@ func @testInvalidIfRegionOpNoRegions(%arg0: tensor, %arg1: tensor<2xf32>) -> func @testInvalidIfRegionOp3Regions(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{op expected 2 regions}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () }, { @@ -965,7 +950,7 @@ func @testInvalidIfRegionOp3Regions(%arg0: tensor, %arg1: tensor<2xf32>) -> }, { %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%e) : (tensor<2xf32>) -> () - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -976,12 +961,12 @@ func @testInvalidIfRegionOp3Regions(%arg0: tensor, %arg1: tensor<2xf32>) -> func @testIfRegionThenTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+2 {{'tf.IfRegion' op expects regions to end with 'tf.Yield'}} // expected-note @+1 {{in custom textual format, the absence of terminator implies 'tf.Yield'}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> }, { %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%e) : (tensor<2xf32>) -> () - }) { is_stateless = 
false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -991,12 +976,12 @@ func @testIfRegionThenTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> ten func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+2 {{'tf.IfRegion' op expects regions to end with 'tf.Yield'}} // expected-note @+1 {{in custom textual format, the absence of terminator implies 'tf.Yield'}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () }, { %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -1006,13 +991,13 @@ func @testIfRegionElseTerminator(%arg0: tensor, %arg1: tensor<2xf32>) -> ten // tf.Region yield number of results should match op number of results func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{then region should have 1 result}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t, %t) : (tensor<2xf32>, tensor<2xf32>) -> () }, { %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%e) : (tensor<2xf32>) -> () - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -1021,13 +1006,13 @@ func @testIfRegionThenResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> te func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{else region should have 1 result}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Abs"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () }, { %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%e, %e) : (tensor<2xf32>, tensor<2xf32>) -> () - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -1037,12 +1022,12 @@ func @testIfRegionElseResultCount(%arg0: tensor, %arg1: tensor<2xf32>) -> te // tf.IfRegion yield types should match op result types func @testIfRegionOpYieldMismatchThen(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{then result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ "tf.Yield"(%arg0) : (tensor) -> () }, { %e = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%e) : (tensor<2xf32>) -> () - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } @@ -1051,12 +1036,12 @@ func @testIfRegionOpYieldMismatchThen(%arg0: tensor, %arg1: tensor<2xf32>) - func @testIfRegionOpYieldMismatchElse(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{else result type tensor is incompatible with tf.IfRegion result type tensor<2xf32> at index 0}} - %0 = "tf.IfRegion"(%arg0, %arg1) ({ + %0 = "tf.IfRegion"(%arg0) ({ %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> "tf.Yield"(%t) : (tensor<2xf32>) -> () }, { 
"tf.Yield"(%arg0) : (tensor) -> () - }) { is_stateless = false} : (tensor, tensor<2xf32>) -> tensor<2xf32> + }) { is_stateless = false} : (tensor) -> tensor<2xf32> return %0 : tensor<2xf32> } From 68ededda0302592c2ba19b1dd1e9c619b4759759 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Wed, 27 May 2020 09:47:53 -0700 Subject: [PATCH 1206/1533] Add missing symbols_pybind.txt to fix windows pip build. PiperOrigin-RevId: 313407286 Change-Id: I84fedf5e5f9d09a4bc52e7d19a7a45a5d442e917 --- tensorflow/tools/def_file_filter/symbols_pybind.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/def_file_filter/symbols_pybind.txt b/tensorflow/tools/def_file_filter/symbols_pybind.txt index ed8747a73f0..0c75b70f5dd 100644 --- a/tensorflow/tools/def_file_filter/symbols_pybind.txt +++ b/tensorflow/tools/def_file_filter/symbols_pybind.txt @@ -93,10 +93,11 @@ tensorflow::DoQuantizeTrainingOnSerializedGraphDef tensorflow::SessionState::kTensorHandleResourceTypeName [server_lib] # server_lib +tensorflow::data::GrpcDataServerBase::Join tensorflow::data::GrpcDataServerBase::Start tensorflow::data::GrpcDataServerBase::Stop tensorflow::data::GrpcDataServerBase::BoundPort -tensorflow::data::MasterGrpcDataServer::NumTasks +tensorflow::data::MasterGrpcDataServer::NumWorkers tensorflow::data::NewMasterServer tensorflow::data::NewWorkerServer From a5fef39a3864ea6684127a8ffea7d36588edd540 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 27 May 2020 09:50:04 -0700 Subject: [PATCH 1207/1533] [XLA:CPU] Wire up C64/C128 matmul to Eigen This is much faster than a naive loop. Also add some more testing now that we can support it in the evaluator. PiperOrigin-RevId: 313407740 Change-Id: I692de60af47e86a269ab4d121e97d2b472b7a8e3 --- .../compiler/xla/service/cpu/cpu_runtime.cc | 8 ++++++ .../compiler/xla/service/cpu/cpu_runtime.h | 4 +++ .../xla/service/cpu/dot_op_emitter.cc | 27 ++++++++++++++----- .../xla/service/cpu/runtime_matmul.cc | 16 +++++++++++ .../compiler/xla/service/cpu/runtime_matmul.h | 14 ++++++++++ .../cpu/runtime_single_threaded_matmul.cc | 18 +++++++++++++ .../cpu/runtime_single_threaded_matmul.h | 16 +++++++++++ .../xla/service/cpu/simple_orc_jit.cc | 4 +++ .../compiler/xla/service/hlo_evaluator.cc | 14 ++++++++++ .../compiler/xla/service/hlo_evaluator.h | 6 +++++ .../compiler/xla/tests/dot_operation_test.cc | 4 +++ 11 files changed, 125 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index bd949aa24c7..7abf5da0b64 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -67,6 +67,10 @@ extern const char* const kEigenMatMulF32SymbolName = "__xla_cpu_runtime_EigenMatMulF32"; extern const char* const kEigenMatMulF64SymbolName = "__xla_cpu_runtime_EigenMatMulF64"; +extern const char* const kEigenMatMulC64SymbolName = + "__xla_cpu_runtime_EigenMatMulC64"; +extern const char* const kEigenMatMulC128SymbolName = + "__xla_cpu_runtime_EigenMatMulC128"; extern const char* const kEigenMatMulS32SymbolName = "__xla_cpu_runtime_EigenMatMulS32"; extern const char* const kMKLConvF32SymbolName = "__xla_cpu_runtime_MKLConvF32"; @@ -91,6 +95,10 @@ extern const char* const kEigenSingleThreadedMatMulF32SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF32"; extern const char* const kEigenSingleThreadedMatMulF64SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF64"; +extern const char* const 
kEigenSingleThreadedMatMulC64SymbolName = + "__xla_cpu_runtime_EigenSingleThreadedMatMulC64"; +extern const char* const kEigenSingleThreadedMatMulC128SymbolName = + "__xla_cpu_runtime_EigenSingleThreadedMatMulC128"; extern const char* const kEigenSingleThreadedMatMulS32SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulS32"; extern const char* const kEigenSingleThreadedConvF16SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index 14ea5448eef..492ce3f68b2 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -46,6 +46,8 @@ namespace runtime { extern const char* const kEigenMatMulF16SymbolName; extern const char* const kEigenMatMulF32SymbolName; extern const char* const kEigenMatMulF64SymbolName; +extern const char* const kEigenMatMulC64SymbolName; +extern const char* const kEigenMatMulC128SymbolName; extern const char* const kEigenMatMulS32SymbolName; extern const char* const kMKLConvF32SymbolName; extern const char* const kMKLMatMulF32SymbolName; @@ -59,6 +61,8 @@ extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; +extern const char* const kEigenSingleThreadedMatMulC64SymbolName; +extern const char* const kEigenSingleThreadedMatMulC128SymbolName; extern const char* const kEigenSingleThreadedMatMulS32SymbolName; extern const char* const kEigenSingleThreadedConvF16SymbolName; extern const char* const kEigenSingleThreadedConvF32SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index e1ad14600d7..9e75c1b9ac5 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -657,6 +657,8 @@ Status DotOpEmitter::EmitCallToRuntime() { bool multi_threaded = ShouldUseMultiThreadedEigen(hlo_module_config_); bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn(); PrimitiveType type = target_array_.GetShape().element_type(); + llvm::Function* function = b_->GetInsertBlock()->getParent(); + llvm::Module* module = function->getParent(); llvm::Type* float_type; const char* fn_name; switch (type) { @@ -684,6 +686,18 @@ Status DotOpEmitter::EmitCallToRuntime() { : runtime::kEigenSingleThreadedMatMulF64SymbolName); float_type = b_->getDoubleTy(); break; + case C64: + fn_name = multi_threaded + ? runtime::kEigenMatMulC64SymbolName + : runtime::kEigenSingleThreadedMatMulC64SymbolName; + float_type = llvm_ir::PrimitiveTypeToIrType(C64, module); + break; + case C128: + fn_name = multi_threaded + ? runtime::kEigenMatMulC128SymbolName + : runtime::kEigenSingleThreadedMatMulC128SymbolName; + float_type = llvm_ir::PrimitiveTypeToIrType(C128, module); + break; case S32: fn_name = multi_threaded ? 
runtime::kEigenMatMulS32SymbolName @@ -705,9 +719,6 @@ Status DotOpEmitter::EmitCallToRuntime() { int64_type, int64_type, int64_type, int32_type, int32_type}, /*isVarArg=*/false); - llvm::Function* function = b_->GetInsertBlock()->getParent(); - llvm::Module* module = function->getParent(); - llvm::FunctionCallee matmul_func = module->getOrInsertFunction(fn_name, matmul_type); if (auto* fn = llvm::dyn_cast(matmul_func.getCallee())) { @@ -853,9 +864,11 @@ bool AreGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, << output_shape.DebugString(); switch (output_shape.element_type()) { - case F64: - case F32: case F16: + case F32: + case F64: + case C64: + case C128: case S32: return IsRank2(lhs_shape) && IsRank2(rhs_shape) && IsRank2(output_shape); default: @@ -904,7 +917,9 @@ bool CanEmitTiledLlvmIrGemm( return false; } - if (dot_info.result_shape.element_type() == F16) { + if (dot_info.result_shape.element_type() == F16 || + dot_info.result_shape.element_type() == C64 || + dot_info.result_shape.element_type() == C128) { // TODO(sanjoy): This is probably easy to fix, but I want to keep the CL // adding this comment NFC. return false; diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc index 7d6c4942b69..35db15fed2c 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.cc @@ -114,6 +114,22 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulF64( transpose_rhs); } +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulC64( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, int32 transpose_rhs) { + MatMulDispatch>(run_options_ptr, out, lhs, rhs, m, n, k, + transpose_lhs, transpose_rhs); +} + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulC128( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, int32 transpose_rhs) { + MatMulDispatch>(run_options_ptr, out, lhs, rhs, m, n, k, + transpose_lhs, transpose_rhs); +} + TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenMatMulS32( const void* run_options_ptr, int32* out, int32* lhs, int32* rhs, int64 m, int64 n, int64 k, int32 transpose_lhs, int32 transpose_rhs) { diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul.h b/tensorflow/compiler/xla/service/cpu/runtime_matmul.h index 1280d04d01f..11dfc5c1d80 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_matmul.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_MATMUL_H_ +#include + #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/platform/types.h" @@ -44,6 +46,18 @@ extern void __xla_cpu_runtime_EigenMatMulF64( tensorflow::int64 k, tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs); +extern void __xla_cpu_runtime_EigenMatMulC64( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs); + +extern void __xla_cpu_runtime_EigenMatMulC128( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, tensorflow::int64 m, + tensorflow::int64 n, tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs); + extern void __xla_cpu_runtime_EigenMatMulS32( const void* /* xla::ExecutableRunOptions* */ run_options_ptr, tensorflow::int32* out, tensorflow::int32* lhs, tensorflow::int32* rhs, diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc index e395bc7426c..c7601f939c7 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.cc @@ -112,6 +112,24 @@ __xla_cpu_runtime_EigenSingleThreadedMatMulF64(const void* run_options_ptr, transpose_lhs, transpose_rhs); } +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulC64( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, int32 transpose_rhs) { + SingleThreadedMatMulDispatch>( + run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); +} + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void +__xla_cpu_runtime_EigenSingleThreadedMatMulC128( + const void* run_options_ptr, std::complex* out, + std::complex* lhs, std::complex* rhs, int64 m, int64 n, + int64 k, int32 transpose_lhs, int32 transpose_rhs) { + SingleThreadedMatMulDispatch>( + run_options_ptr, out, lhs, rhs, m, n, k, transpose_lhs, transpose_rhs); +} + TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedMatMulS32(const void* run_options_ptr, int32* out, int32* lhs, diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h index eb695910729..61fe224d420 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_MATMUL_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_MATMUL_H_ +#include + #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/platform/types.h" @@ -44,6 +46,20 @@ extern void __xla_cpu_runtime_EigenSingleThreadedMatMulF64( tensorflow::int64 k, tensorflow::int32 transpose_lhs, tensorflow::int32 transpose_rhs); +extern void __xla_cpu_runtime_EigenSingleThreadedMatMulC64( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, + std::complex* out, std::complex* lhs, + std::complex* rhs, tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs); + +extern void __xla_cpu_runtime_EigenSingleThreadedMatMulC128( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, + std::complex* out, std::complex* lhs, + std::complex* rhs, tensorflow::int64 m, tensorflow::int64 n, + tensorflow::int64 k, tensorflow::int32 transpose_lhs, + tensorflow::int32 transpose_rhs); + extern void __xla_cpu_runtime_EigenSingleThreadedMatMulS32( const void* /* xla::ExecutableRunOptions* */ run_options_ptr, tensorflow::int32* out, tensorflow::int32* lhs, tensorflow::int32* rhs, diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index 395eb31c13f..4cc9e373b3e 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -246,6 +246,8 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulF64); + REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulC64); + REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulC128); REGISTER_CPU_RUNTIME_SYMBOL(EigenMatMulS32); REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(MKLMatMulF64); @@ -257,6 +259,8 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulC64); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulC128); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulS32); REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin); REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index 106ebb7be0e..02443ff3c3c 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -2556,6 +2556,20 @@ std::unique_ptr> HloEvaluator::MatmulArray2D( lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulF64); } +std::unique_ptr>> HloEvaluator::MatmulArray2D( + const Array2D>& lhs, + const Array2D>& rhs) { + return MatmulArray2DImpl>( + lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulC64); +} + +std::unique_ptr>> HloEvaluator::MatmulArray2D( + const Array2D>& lhs, + const Array2D>& rhs) { + return MatmulArray2DImpl>( + lhs, rhs, __xla_cpu_runtime_EigenSingleThreadedMatMulC128); +} + std::unique_ptr> HloEvaluator::MatmulArray2D( const Array2D& lhs, const Array2D& rhs) { return MatmulArray2DImpl( diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index 803004225d2..dcd4129adcd 100644 --- 
a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -164,6 +164,12 @@ class HloEvaluator : public DfsHloVisitorWithDefault { const Array2D& lhs, const Array2D& rhs); static std::unique_ptr> MatmulArray2D( const Array2D& lhs, const Array2D& rhs); + static std::unique_ptr>> MatmulArray2D( + const Array2D>& lhs, + const Array2D>& rhs); + static std::unique_ptr>> MatmulArray2D( + const Array2D>& lhs, + const Array2D>& rhs); static std::unique_ptr> MatmulArray2D( const Array2D& lhs, const Array2D& rhs); diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 6d64cb0a510..26cb25acbfe 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -416,6 +416,10 @@ XLA_TEST_P(ParametricDotTest, TestF16) { TestImpl(); } #endif XLA_TEST_P(ParametricDotTest, TestF32) { TestImpl(); } XLA_TEST_P(ParametricDotTest, TestF64) { TestImpl(); } +XLA_TEST_P(ParametricDotTest, TestC64) { TestImpl>(); } +#ifndef XLA_BACKEND_DOES_NOT_SUPPORT_COMPLEX128 +XLA_TEST_P(ParametricDotTest, TestC128) { TestImpl>(); } +#endif XLA_TEST_P(ParametricDotTest, TestS32) { TestImpl(); } INSTANTIATE_TEST_CASE_P(DotTests, ParametricDotTest, From e0eb14595bd838e3329716849884672a7ccd08e4 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 27 May 2020 09:57:11 -0700 Subject: [PATCH 1208/1533] Fix mlir_c_api BUILD deps PiperOrigin-RevId: 313409176 Change-Id: I48c8c2baf5a9476e188961f00fabcd76b04ff623 --- tensorflow/compiler/mlir/tensorflow/c/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/c/BUILD b/tensorflow/compiler/mlir/tensorflow/c/BUILD index 3a503685fc6..9528874f419 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/c/BUILD @@ -35,7 +35,7 @@ tf_cuda_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/platform:casts", + "//tensorflow/core/platform:errors", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", From f809169da0a9efe902bff7e23b03150e13d5e5d5 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 27 May 2020 10:08:06 -0700 Subject: [PATCH 1209/1533] Fix tensorflow::errors:* calls, which use StrCat instead of StrFormat PiperOrigin-RevId: 313411517 Change-Id: Ibe854b2cf53fa3e74664a69916a0a88788e9cc28 --- tensorflow/lite/tools/signature/signature_def_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/signature/signature_def_util.cc b/tensorflow/lite/tools/signature/signature_def_util.cc index e44fe98b3cc..60ec27e4d22 100644 --- a/tensorflow/lite/tools/signature/signature_def_util.cc +++ b/tensorflow/lite/tools/signature/signature_def_util.cc @@ -132,7 +132,7 @@ Status GetSignatureDefMap(const Model* model, SerializedSignatureDefMap signature_defs; auto status = ReadSignatureDefMap(model, metadata, &signature_defs); if (status != tensorflow::Status::OK()) { - return tensorflow::errors::Internal("Error reading signature def map: %s", + return tensorflow::errors::Internal("Error reading signature def map: ", status.error_message()); } for (const auto& entry : signature_defs) { From 1c2527fd1134753431796831dcf225ce0846862d Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 27 May 2020 17:17:39 +0000 Subject: [PATCH 1210/1533] [ROCm] Fix for ROCm CSB breakage - 200527 The following commit introduces a new unit-test which fails on ROCm. https://github.com/tensorflow/tensorflow/commit/dbef0933ebe4d3d85be73e88cfe5f83cac0ae1d6 I think that this unit-test is for checking the reduced memory usage of the gradient checkpointing method. The sub-test `test_does_not_raise_oom_exception` fails on ROCm, because on the ROCm platform the scratch space required for doing backward convolution pushes the total memory allocation just beyond the 1GB limit imposed by the testcase. This fix moves up the threshold by 128MB (from 1024 MB to 1152 MB). This still presevers the intent of the unit-test, i.e. the `test_raises_oom_exception` continues to raise the exception, while also allowing the `test_does_not_raise_oom_exception` sub-test to pass on the ROCm platform. --- .../python/keras/integration_test/gradient_checkpoint_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py index 9d9e0a062b3..100f3ca2022 100644 --- a/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py +++ b/tensorflow/python/keras/integration_test/gradient_checkpoint_test.py @@ -75,7 +75,7 @@ def _limit_gpu_memory(): if gpus: tf.config.experimental.set_virtual_device_configuration( gpus[0], - [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024)]) + [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1152)]) return True return False From a67ee929f5aa2e16478d10e3287a248f34078cb5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 10:21:19 -0700 Subject: [PATCH 1211/1533] add a tensorflow::batch_util::CopyContiguousSlices utility function for slicing out a contiguous pieces of tensors along the batch dimension and copying them to another tensor. 
PiperOrigin-RevId: 313414257 Change-Id: I2530c58ed53ad8e92e5f976f2dd1728296d12185 --- tensorflow/core/framework/BUILD | 1 + tensorflow/core/framework/batch_util_test.cc | 61 ++++++++++++++++ tensorflow/core/framework/tensor.h | 5 ++ tensorflow/core/util/batch_util.cc | 73 ++++++++++++++++++++ tensorflow/core/util/batch_util.h | 11 +++ 5 files changed, 151 insertions(+) create mode 100644 tensorflow/core/framework/batch_util_test.cc diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD index 0e923bd1236..52f15dcb5c2 100644 --- a/tensorflow/core/framework/BUILD +++ b/tensorflow/core/framework/BUILD @@ -1008,6 +1008,7 @@ tf_cc_tests( srcs = [ "allocator_test.cc", "attr_value_util_test.cc", + "batch_util_test.cc", "bfloat16_test.cc", "cancellation_test.cc", "common_shape_fns_test.cc", diff --git a/tensorflow/core/framework/batch_util_test.cc b/tensorflow/core/framework/batch_util_test.cc new file mode 100644 index 00000000000..4e98371bda7 --- /dev/null +++ b/tensorflow/core/framework/batch_util_test.cc @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +TEST(CopyContiguousSlicesTest, CompatibleShape) { + Tensor src(DT_FLOAT, {7, 1, 2}); + Tensor dst(DT_FLOAT, {9, 2, 1}); + auto s = batch_util::CopyContiguousSlices( + src, /*src_offset=*/2, /*dst_offset=*/0, /*num_slices=*/5, &dst); + ASSERT_EQ(error::OK, s.code()); +} + +TEST(CopyContiguousSlicesTest, SourceOffsetOutOfRange) { + Tensor src(DT_FLOAT, {7, 1, 2}); + Tensor dst(DT_FLOAT, {9, 2, 1}); + auto s = batch_util::CopyContiguousSlices( + src, /*src_offset=*/7, /*dst_offset=*/0, /*num_slices=*/5, &dst); + ASSERT_EQ(error::FAILED_PRECONDITION, s.code()); +} + +TEST(CopyContiguousSlicesTest, DstOffsetOutOfRange) { + Tensor src(DT_FLOAT, {7, 1, 2}); + Tensor dst(DT_FLOAT, {9, 2, 1}); + auto s = batch_util::CopyContiguousSlices( + src, /*src_offset=*/0, /*dst_offset=*/0, /*num_slices=*/8, &dst); + ASSERT_EQ(error::FAILED_PRECONDITION, s.code()); +} + +TEST(CopyContiguousSlicesTest, CheckDstWithExpectedValues) { + auto src = test::AsTensor({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, + TensorShape({5, 2})); + Tensor dst(DT_FLOAT, {9, 2, 1}); + auto s = batch_util::CopyContiguousSlices( + src, /*src_offset=*/1, /*dst_offset=*/5, /*num_slices=*/3, &dst); + ASSERT_EQ(error::OK, s.code()); + test::ExpectTensorEqual( + test::AsTensor({2, 3, 4, 5, 6, 7}, TensorShape({3, 2, 1})), + dst.Slice(5, 8)); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h index 744a14e007e..28eab3ab1e0 100644 --- a/tensorflow/core/framework/tensor.h +++ b/tensorflow/core/framework/tensor.h @@ -53,6 +53,8 @@ namespace 
batch_util { Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index); Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index); Status MaybeMoveSliceToElement(Tensor* parent, Tensor* element, int64 index); +Status CopyContiguousSlices(const Tensor& src, int64 src_offset, + int64 dst_offset, int64 num_slices, Tensor* dst); } // namespace batch_util /// @ingroup core @@ -679,6 +681,9 @@ class Tensor { friend Status batch_util::MaybeMoveSliceToElement( Tensor* parent, Tensor* element, int64 index); // For access to base(). + friend Status batch_util::CopyContiguousSlices( + const Tensor& src, int64 src_offset, int64 dst_offset, int64 num_slices, + Tensor* dst); // For access to base(). bool CanUseDMA() const; diff --git a/tensorflow/core/util/batch_util.cc b/tensorflow/core/util/batch_util.cc index 0aff6b00f1c..b88c365ced0 100644 --- a/tensorflow/core/util/batch_util.cc +++ b/tensorflow/core/util/batch_util.cc @@ -216,6 +216,79 @@ Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index) { } } +Status CopyContiguousSlices(const Tensor& src, int64 src_offset, + int64 dst_offset, int64 num_slices, Tensor* dst) { + if (src.dtype() != dst->dtype()) { + return errors::FailedPrecondition( + "CopyContiguousSlices cannot perform copy: src and dst have different " + "dtypes. Source dtype: ", + src.dtype(), " dstination dtype: ", dst->dtype(), "."); + } + if (src.dims() < 1) { + return errors::FailedPrecondition( + "CopyContiguousSlices cannot perform copy: src has to be a tensor with " + "rank >= 1. Source shape: ", + src.shape().DebugString()); + } + + if (dst->dims() < 1) { + return errors::FailedPrecondition( + "CopyContiguousSlices cannot perform copy: dst has to be a tensor " + "with rank >= 1. Dest shape: ", + dst->shape().DebugString()); + } + + const int64 src_dim0 = src.dim_size(0); + const int64 dst_dim0 = dst->dim_size(0); + int64 src_chip_size = 1; + int64 dst_chip_size = 1; + for (int i = 1; i < src.dims(); ++i) { + src_chip_size *= src.dim_size(i); + } + for (int i = 1; i < dst->dims(); ++i) { + dst_chip_size *= dst->dim_size(i); + } + + if (src_chip_size != dst_chip_size) { + return errors::FailedPrecondition( + "CopyContiguousSlices cannot perform copy: source and dst shapes are" + "not compatible. Source shape: ", + src.shape().DebugString(), ", dst shape: ", dst->shape().DebugString()); + } + + if (src_chip_size == 0 && dst_chip_size == 0) { + return Status::OK(); + } + + if (src_offset < 0 || src_offset + num_slices > src_dim0 || dst_offset < 0 || + dst_offset + num_slices > dst_dim0) { + return errors::FailedPrecondition( + "CopyContiguousSlices cannot perform copy: index out of range. " + "src_offset: ", + src_offset, ", num_slices: ", num_slices, ", src_dim0: ", src_dim0, + ", dst_offset: ", dst_offset, ", dst_dim0: ", dst_dim0, "."); + } + +#define HANDLE_TYPE(T) \ + case DataTypeToEnum::value: { \ + const T* src_p = src.base() + (src_chip_size * src_offset); \ + T* dst_p = dst->base() + (dst_chip_size * dst_offset); \ + HandleSliceToElement(src_p, dst_p, src_chip_size * num_slices); \ + return Status::OK(); \ + } + + switch (src.dtype()) { + TF_CALL_ALL_TYPES(HANDLE_TYPE); + TF_CALL_QUANTIZED_TYPES(HANDLE_TYPE); + TF_CALL_uint32(HANDLE_TYPE); + TF_CALL_uint64(HANDLE_TYPE); +#undef HANDLE_TYPE + default: + return errors::Unimplemented("CopyContiguousSlices unhandled data type: ", + src.dtype()); + } +} + // Copies the index^th slice of parent (in the 0th dimension) into element. 
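For a quick mental model of the CopyContiguousSlices semantics added above, here is a minimal NumPy sketch (illustrative only; the function and variable names below are assumptions for this sketch, not part of the patch):

    import numpy as np

    def copy_contiguous_slices(src, src_offset, dst_offset, num_slices, dst):
        # Everything past dimension 0 is flattened into a "chip"; the chip
        # sizes of src and dst must match, exactly as the C++ above checks.
        src_chip = int(np.prod(src.shape[1:]))
        dst_chip = int(np.prod(dst.shape[1:]))
        assert src_chip == dst_chip, "incompatible shapes past dim 0"
        flat_src = src.reshape(-1)
        flat_dst = dst.reshape(-1)  # a view, so writes land in dst (dst must be contiguous)
        n = num_slices * src_chip
        flat_dst[dst_offset * dst_chip : dst_offset * dst_chip + n] = (
            flat_src[src_offset * src_chip : src_offset * src_chip + n])

    # Mirrors the CheckDstWithExpectedValues test above: copy 3 slices of a
    # (5, 2) source starting at slice 1 into a (9, 2, 1) destination at slice 5.
    src = np.arange(10, dtype=np.float32).reshape(5, 2)
    dst = np.zeros((9, 2, 1), dtype=np.float32)
    copy_contiguous_slices(src, 1, 5, 3, dst)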
// // NOTE(mrry): The implementation may be able to optimize the copy to a move. diff --git a/tensorflow/core/util/batch_util.h b/tensorflow/core/util/batch_util.h index eee0309fbc4..d44d82ec0a2 100644 --- a/tensorflow/core/util/batch_util.h +++ b/tensorflow/core/util/batch_util.h @@ -32,6 +32,17 @@ Status CopyElementToSlice(Tensor element, Tensor* parent, int64 index); // Copies the index^th slice of parent (in the 0th dimension) into element. Status CopySliceToElement(const Tensor& parent, Tensor* element, int64 index); +// Copies 'num_slices' contiguous slices from 'src' tensor starting from index +// 'src_offset' into target tensor 'dst', and places them into slices +// starting from 'dst_offset'. +// +// This function requires 'src' and 'dst' to have compatible shapes. That is it +// requires cum_prod(src.shape[1:] == cum_prod(dst->shape[1:]). For example if +// source is of shape [x, 2, 1] and dst is a tensor of shape [y, 1, 2], this +// function can still proceed successfully. +Status CopyContiguousSlices(const Tensor& src, int64 src_offset, + int64 dst_offset, int64 num_slices, Tensor* dst); + // Copies the index^th slice of parent (in the 0th dimension) into element. // // NOTE(mrry): The implementation may be able to optimize the copy to a move. From 624390fc196b50c344be697b4923db33e5420e4d Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 27 May 2020 10:23:22 -0700 Subject: [PATCH 1212/1533] Use INFO logging for device deprecation, for easier filtering. PiperOrigin-RevId: 313414703 Change-Id: I2d496e7ae3381b469d7da8eda55a8886a9936a24 --- tensorflow/compiler/jit/xla_device.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index abb42aa1815..7842513331d 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -395,12 +395,11 @@ static void ShowXlaDeviceDeprecationWarning( if (absl::StrContains(compilation_device_name, "CPU") || absl::StrContains(compilation_device_name, "GPU")) { absl::call_once(once, [] { - LOG(WARNING) - << "XLA_GPU and XLA_CPU devices are deprecated and will be " - "removed in subsequent releases. Instead, use either " - "@tf.function(experimental_compile=True) for must-compile " - "semantics, or run with TF_XLA_FLAGS=--tf_xla_auto_jit=2 " - "for auto-clustering best-effort compilation."; + LOG(INFO) << "XLA_GPU and XLA_CPU devices are deprecated and will be " + "removed in subsequent releases. Instead, use either " + "@tf.function(experimental_compile=True) for must-compile " + "semantics, or run with TF_XLA_FLAGS=--tf_xla_auto_jit=2 " + "for auto-clustering best-effort compilation."; }); } } From bbf6da560a77f42e296b2ef2105e5cd478e366fe Mon Sep 17 00:00:00 2001 From: Robert David Date: Wed, 27 May 2020 10:27:57 -0700 Subject: [PATCH 1213/1533] Google specific refactoring. 
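The deprecation message in the previous patch points users at two replacements for the XLA_CPU/XLA_GPU devices. A minimal, hedged sketch of what that migration can look like (the function below is a made-up example, not code from this patch series):

    import tensorflow as tf

    @tf.function(experimental_compile=True)  # must-compile semantics
    def scaled_sum(x, y):
        return tf.reduce_sum(x * y)

    # Best-effort auto-clustering for the whole process instead:
    #   TF_XLA_FLAGS=--tf_xla_auto_jit=2 python train.py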
PiperOrigin-RevId: 313415673 Change-Id: I9f4948710229d154871afc5ef25aaf6799ddc380 --- tensorflow/lite/delegates/gpu/cl/api.cc | 3 ++- tensorflow/lite/delegates/gpu/cl/api.h | 3 ++- tensorflow/lite/delegates/gpu/cl/egl_sync.h | 1 + tensorflow/lite/delegates/gpu/cl/gl_interop.h | 5 +++-- tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h | 4 ++-- tensorflow/lite/delegates/gpu/gl_delegate.cc | 5 +++-- tensorflow/lite/delegates/gpu/gl_delegate.h | 2 +- 7 files changed, 14 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/api.cc b/tensorflow/lite/delegates/gpu/cl/api.cc index 475eed4dccc..e82f67392e8 100644 --- a/tensorflow/lite/delegates/gpu/cl/api.cc +++ b/tensorflow/lite/delegates/gpu/cl/api.cc @@ -15,10 +15,11 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/api.h" +#include + #include #include -#include #include "absl/memory/memory.h" #include "absl/types/span.h" #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" diff --git a/tensorflow/lite/delegates/gpu/cl/api.h b/tensorflow/lite/delegates/gpu/cl/api.h index 9d3f9f7214c..bddf7de3363 100644 --- a/tensorflow/lite/delegates/gpu/cl/api.h +++ b/tensorflow/lite/delegates/gpu/cl/api.h @@ -16,10 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_API_H_ +#include + #include #include -#include #include "absl/types/span.h" #include "tensorflow/lite/delegates/gpu/api.h" #include "tensorflow/lite/delegates/gpu/common/model.h" diff --git a/tensorflow/lite/delegates/gpu/cl/egl_sync.h b/tensorflow/lite/delegates/gpu/cl/egl_sync.h index d0943a797ee..dbea2436d73 100644 --- a/tensorflow/lite/delegates/gpu/cl/egl_sync.h +++ b/tensorflow/lite/delegates/gpu/cl/egl_sync.h @@ -18,6 +18,7 @@ limitations under the License. #include #include + #include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { diff --git a/tensorflow/lite/delegates/gpu/cl/gl_interop.h b/tensorflow/lite/delegates/gpu/cl/gl_interop.h index 1ca0181e8e5..aac769b9682 100644 --- a/tensorflow/lite/delegates/gpu/cl/gl_interop.h +++ b/tensorflow/lite/delegates/gpu/cl/gl_interop.h @@ -16,10 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_GL_INTEROP_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_GL_INTEROP_H_ -#include - #include #include + +#include + #include "tensorflow/lite/delegates/gpu/cl/cl_command_queue.h" #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" diff --git a/tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h b/tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h index c3df1f7a426..1a9fb73e6ab 100644 --- a/tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h +++ b/tensorflow/lite/delegates/gpu/cl/gpu_api_delegate.h @@ -16,10 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_API_DELEGATE_H_ -#include - #include #include +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/delegates/gpu/delegate.h" diff --git a/tensorflow/lite/delegates/gpu/gl_delegate.cc b/tensorflow/lite/delegates/gpu/gl_delegate.cc index f6b2067d90c..f18c665a15d 100644 --- a/tensorflow/lite/delegates/gpu/gl_delegate.cc +++ b/tensorflow/lite/delegates/gpu/gl_delegate.cc @@ -15,6 +15,9 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/gl_delegate.h" +#include +#include + #include #include #include @@ -22,8 +25,6 @@ limitations under the License. #include #include -#include -#include #include "absl/types/span.h" #include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/common.h" diff --git a/tensorflow/lite/delegates/gpu/gl_delegate.h b/tensorflow/lite/delegates/gpu/gl_delegate.h index bfc15fb120e..fa8eec2ad6b 100644 --- a/tensorflow/lite/delegates/gpu/gl_delegate.h +++ b/tensorflow/lite/delegates/gpu/gl_delegate.h @@ -16,9 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_GL_DELEGATE_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_GL_DELEGATE_H_ +#include #include -#include #include "absl/base/macros.h" #include "tensorflow/lite/c/common.h" From 1d0dfbde011a30174e63d3916175a201dc3b271b Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Wed, 27 May 2020 10:45:40 -0700 Subject: [PATCH 1214/1533] [TF:TRT] Rewrite cast-to-fp32 operations to support TensorRT conversion. When the requested precision is fp16, split a cast-to-fp32 operation into cast-to-fp16 then cast-to-fp32, so that the new cast-to-fp32 can be added to a tensorrt network. Enhance _VerifyConnections in TfTrtIntegrationTestBase to allow the split of a cast operation into a chain of two cast operations. Add test cases. PiperOrigin-RevId: 313419304 Change-Id: I43ed3f886c65552cfc0cc0b436b8ba02e759d038 --- tensorflow/compiler/tf2tensorrt/BUILD | 1 + .../tf2tensorrt/convert/convert_graph.cc | 78 ++++++++++++++++++- .../tf2tensorrt/convert/convert_nodes.cc | 4 +- .../compiler/tf2tensorrt/convert/utils.h | 2 + tensorflow/python/compiler/tensorrt/BUILD | 1 + .../compiler/tensorrt/test/cast_test.py | 56 +++++++++++++ .../test/tf_trt_integration_test_base.py | 17 +++- 7 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 tensorflow/python/compiler/tensorrt/test/cast_test.py diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index 356798c19bd..3d3eab51268 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -403,6 +403,7 @@ tf_cuda_library( "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/core/grappler/optimizers:meta_optimizer", "//tensorflow/stream_executor/lib", + "//tensorflow/tools/graph_transforms:transform_utils", ] + if_tensorrt([":tensorrt_lib"]) + tf_custom_op_library_additional_deps(), alwayslink = 1, ) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index aed422a5627..414d27477bc 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -50,6 +50,7 @@ limitations under the License. 
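The commit message above describes the cast rewrite at a high level. As a hedged NumPy illustration of the transformation (not TensorFlow code), splitting cast(x, fp32) into cast(cast(x, fp16), fp32) looks like this:

    import numpy as np

    x = np.array([1, 2, 3], dtype=np.int32)
    direct    = x.astype(np.float32)                      # original graph: Cast(x, DT_FLOAT)
    rewritten = x.astype(np.float16).astype(np.float32)   # after rewrite: Cast(Cast(x, DT_HALF), DT_FLOAT)

The two results agree for small-magnitude values; the fp16 hop can lose precision for large ones, so this is only an illustration of the graph transformation, not an exact numerical equivalence.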
#include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT #include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/tools/graph_transforms/transform_utils.h" #if GOOGLE_CUDA #if GOOGLE_TENSORRT @@ -555,6 +556,51 @@ int64 GetNextGraphSequenceNumber() { return graph_sequence_num++; } +constexpr char kCastInputTypeAttrName[] = "SrcT"; + +// Transforms node = cast(x, fp32) where datatype(x) != fp16 to: +// castToFp16 = cast(x, fp16) +// node = cast(castToFp16, fp32) +// +Status MaybeRewriteCastToFp32(GraphDef* graph_def, NodeDef* node_def) { + if (node_def->op() != "Cast") { + return Status::OK(); + } + + DataTypeVector input_types; + DataTypeVector output_types; + TF_RETURN_IF_ERROR( + graph_transforms::GetInOutTypes(*node_def, &input_types, &output_types)); + + if (input_types.size() != 1 || output_types.size() != 1) { + return errors::Internal("Bad cast operation"); + } + + if (input_types[0] == DT_HALF || output_types[0] != DT_FLOAT) { + return Status::OK(); + } + + VLOG(2) << "Rewriting cast to FP32 " << node_def->DebugString(); + + NodeDef* castToFp16 = graph_def->add_node(); + for (auto attr_value : node_def->attr()) { + (*castToFp16->mutable_attr())[attr_value.first] = attr_value.second; + } + castToFp16->set_name(node_def->name() + "_split"); + castToFp16->set_op("Cast"); + castToFp16->set_device(node_def->device()); + castToFp16->add_input(node_def->input(0)); + (*castToFp16->mutable_attr())[kCastOutputTypeAttrName].set_type(DT_HALF); + + node_def->set_input(0, castToFp16->name() + ":0"); + (*node_def->mutable_attr())[kCastInputTypeAttrName].set_type(DT_HALF); + + VLOG(2) << castToFp16->DebugString(); + VLOG(2) << node_def->DebugString(); + + return Status::OK(); +} + } // namespace Status RegisterGraphToFunctionLibrary(const GraphDef& segment_graph_def, @@ -640,10 +686,38 @@ Status ConvertAfterShapes(const ConversionParams& params) { "Calibration with FP32 or FP16 is not supported."); } - grappler::GraphProperties static_graph_properties(*params.grappler_item); + // Make a copy of the input_graph_def because grappler doesn't allow changes + // to the input_graph_def and GraphProperties only accepts GraphDef, but not + // Graph, as inputs. + // + // If the overhead of copying the input_graph_def becomes a concern, we can + // avoid the copy by (1) enhancing the GraphPropertiers representation to + // allow adding shape properties for newly created graph nodes and (2) rewrite + // the GraphDef transformation to Graph transformation. + GraphDef modified_graph_def = params.grappler_item->graph; + // When precision_mode is FP16, transform cast(x, fp32) to + // cast(cast(x, fp16), fp32). This creates cast(fp16, f32) that can be + // included in the TRTEngineOp as an TensorRT Identity layer for performance: + // . Avoid cast(fp32, fp16) in the TRT engine implementation for fp16 + // precision. + // . Changing the input to the TRTEngine from fp32 to fp16 may reduce data + // moving from the host to the GPU. + if (params.precision_mode == TrtPrecisionMode::FP16) { + for (int i = 0; i < modified_graph_def.node_size(); i++) { + NodeDef* node_def = modified_graph_def.mutable_node(i); + TF_RETURN_IF_ERROR(MaybeRewriteCastToFp32(&modified_graph_def, node_def)); + } + } + + // Construct a GrapplerItem using the modified graph_def and the input + // grappler_item. 
+ grappler::GrapplerItem grappler_item = + params.grappler_item->WithGraph(std::move(modified_graph_def)); + const GraphDef& graph_def = grappler_item.graph; + + grappler::GraphProperties static_graph_properties(grappler_item); TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); - const GraphDef& graph_def = params.grappler_item->graph; // Convert graphdef to graph. FunctionLibraryDefinition flib(OpRegistry::Global(), graph_def.library()); Graph graph(flib); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 619c62f7c0e..8ca7c4cdf8f 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -4683,7 +4683,9 @@ Status ConvertCast(OpConverterParams* params) { } DataType output_type; - TF_RETURN_IF_ERROR(GetOutputTfType(*params, &output_type)); + TF_RETURN_IF_ERROR(GetNodeDefTfType(params->node_def, &output_type, + kCastOutputTypeAttrName)); + if (output_type != DataType::DT_FLOAT) { return unsupport_cast_error(); } diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 59eeb420134..43697573bbd 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -29,6 +29,8 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +static constexpr char kCastOutputTypeAttrName[] = "DstT"; + class IONamePrefixes { public: static constexpr const char* const kInputPHName = "TensorRTInputPH_"; diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD index 192ba71cebd..2b26dd42818 100644 --- a/tensorflow/python/compiler/tensorrt/BUILD +++ b/tensorflow/python/compiler/tensorrt/BUILD @@ -122,6 +122,7 @@ cuda_py_tests( "test/batch_matmul_test.py", "test/biasadd_matmul_test.py", "test/binary_tensor_weight_broadcast_test.py", + "test/cast_test.py", "test/combined_nms_test.py", "test/concatenation_test.py", "test/const_broadcast_test.py", diff --git a/tensorflow/python/compiler/tensorrt/test/cast_test.py b/tensorflow/python/compiler/tensorrt/test/cast_test.py new file mode 100644 index 00000000000..381aa5b93c2 --- /dev/null +++ b/tensorflow/python/compiler/tensorrt/test/cast_test.py @@ -0,0 +1,56 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Test conversion of graphs involving INT32 tensors and operations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.compiler.tensorrt.test import tf_trt_integration_test_base as trt_test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class CastInt32ToFp32Test(trt_test.TfTrtIntegrationTestBase): + """Tests cast to FP32 are splitted in FP16 mode.""" + + def _ConstOp(self, shape, dtype): + return constant_op.constant(np.random.randn(*shape), dtype=dtype) + + def GraphFn(self, x): + b_f = self._ConstOp((1, 10), dtypes.float32) + x_f = math_ops.cast(x, dtypes.float32) + x_f = math_ops.mul(x_f, b_f) + b_f = self._ConstOp((1, 10), dtypes.float32) + x_f = math_ops.add(x_f, b_f) + return array_ops.identity(x_f, name="output_0") + + def GetParams(self): + return self.BuildParams(self.GraphFn, dtypes.int32, [[1, 10]], [[1, 10]]) + + def ExpectedEnginesToBuild(self, run_params): + """Returns the expected engines to build.""" + if run_params.precision_mode == "FP16": + return {"TRTEngineOp_0": ["Cast", "Add", "Mul"]} + else: + return {"TRTEngineOp_0": ["Add", "Mul"]} + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py index 773061d57a7..8b93750fde4 100644 --- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py @@ -595,17 +595,32 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): if k not in removed_const_nodes } - # Compute the actual mapping from each node to its input nodes. + # Compute the actual mapping from each node to its input nodes. If a cast + # op doesn't exist in the original graph, we replace the use of the cast op + # with the input of the op. This allows the verification to handle the case + # where the TF-TRT bridge splits a cast op into a chain of two cast ops. + new_cast_op_name_to_node_map = { + node.name: node + for node in converted_gdef.node + if (node.name not in old_to_new_node_map and node.op == "Cast") + } actual_input_map = {} for node in converted_gdef.node: name_str = node.name + # Only nodes from the original graph or TRTEngineOp nodes are added as + # keys to the map. if node.op == "TRTEngineOp": name_str = self._RemoveGraphSequenceNumber(name_str) + elif name_str not in old_to_new_node_map: + continue actual_input_map[name_str] = set() input_set = actual_input_map[name_str] for inp in node.input: (prefix, node_name) = _InputName(inp) node_name = self._MayRemoveGraphSequenceNumber(node_name) + if node_name in new_cast_op_name_to_node_map: + (prefix, node_name) = _InputName( + new_cast_op_name_to_node_map[node_name].input[0]) input_set.add(prefix + node_name) self.assertEqual( From 5e6cb6e3241cf44b96351a451d2abeee32449bba Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 10:48:43 -0700 Subject: [PATCH 1215/1533] Fix for scalar tensors with BytesRequiredForTensor. 
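The fix that follows treats a missing shape as a scalar with one element. A quick hedged illustration of why that is the right element count (plain NumPy, not code from the patch):

    import numpy as np

    print(int(np.prod(())))      # 1 -> a rank-0 (scalar) tensor holds one element
    print(int(np.prod((2, 3))))  # 6 -> a [2, 3] tensor holds six elements

So when flatbuffer_tensor.shape() is null, leaving element_count at 1 yields the byte size of a single element of the tensor's type.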
PiperOrigin-RevId: 313419954 Change-Id: Id6ce0fec1a1640d332bd55694ed23dc6b9da58da --- tensorflow/lite/micro/memory_helpers.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/memory_helpers.cc b/tensorflow/lite/micro/memory_helpers.cc index c1b761bf088..05105f83ff3 100644 --- a/tensorflow/lite/micro/memory_helpers.cc +++ b/tensorflow/lite/micro/memory_helpers.cc @@ -83,8 +83,12 @@ TfLiteStatus BytesRequiredForTensor(const tflite::Tensor& flatbuffer_tensor, size_t* bytes, size_t* type_size, ErrorReporter* error_reporter) { int element_count = 1; - for (size_t n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) { - element_count *= flatbuffer_tensor.shape()->Get(n); + // If flatbuffer_tensor.shape == nullptr, then flatbuffer_tensor is a scalar + // so has 1 element. + if (flatbuffer_tensor.shape() != nullptr) { + for (size_t n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) { + element_count *= flatbuffer_tensor.shape()->Get(n); + } } TfLiteType tf_lite_type; From 19e210f87c7e28528dd063c77b565b598d31e1a1 Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 27 May 2020 10:53:42 -0700 Subject: [PATCH 1216/1533] sgemm fix. --- tensorflow/core/kernels/mkl_matmul_op.cc | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc index 3eccf97f53c..7e76e7fd6ca 100644 --- a/tensorflow/core/kernels/mkl_matmul_op.cc +++ b/tensorflow/core/kernels/mkl_matmul_op.cc @@ -25,7 +25,11 @@ limitations under the License. #if defined(INTEL_MKL) +#ifdef ENABLE_MKLDNN_V1 #include "mkldnn.hpp" +#else +#include "mkl_cblas.h" +#endif // ENABLE_MKLDNN_V1 #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -58,11 +62,11 @@ class MklMatMulOp : public OpKernel { dim_pair[0].first = transpose_a_ ? 0 : 1; dim_pair[0].second = transpose_b_ ? 1 : 0; - OP_REQUIRES( - ctx, a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second), - errors::InvalidArgument( - "Matrix size-incompatible: In[0]: ", a.shape().DebugString(), - ", In[1]: ", b.shape().DebugString())); + OP_REQUIRES(ctx, + a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second), + errors::InvalidArgument("Matrix size-incompatible: In[0]: ", + a.shape().DebugString(), ", In[1]: ", + b.shape().DebugString())); int a_dim_remaining = 1 - dim_pair[0].first; int b_dim_remaining = 1 - dim_pair[0].second; TensorShape out_shape( @@ -151,11 +155,18 @@ class MklMatMulOp : public OpKernel { // 1.0 and 0.0 respectively. const float alpha = 1.0f; const float beta = 0.0f; +#ifdef ENABLE_MKLDNN_V1 char char_transa = transa ? 'T' : 'N'; char char_transb = transb ? 'T' : 'N'; VLOG(2) << "MKL DNN SGEMM CALLED"; dnnl_sgemm(char_transa, char_transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); +#else + // TODO(intel-tf): Remove this after TF2.3 fork. + cblas_sgemm(CblasRowMajor, transa ? CblasTrans : CblasNoTrans, + transb ? CblasTrans : CblasNoTrans, m, n, k, alpha, a, lda, b, + ldb, beta, c, ldc); +#endif // ENABLE_MKLDNN_V1 } #ifdef ENABLE_INTEL_MKL_BFLOAT16 From 12f571f5aff9f9a1bfc2a2845f1c499efb807a5c Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 27 May 2020 10:52:00 -0700 Subject: [PATCH 1217/1533] [Docs] Document some known TF/XLA limitations. 
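The known-issues text added below notes that XLA currently ignores TF seeds for stateful random ops, while stateless ops are unaffected. A minimal sketch of the stateless alternative (illustrative only; the shape and seed value are arbitrary):

    import tensorflow as tf

    @tf.function(experimental_compile=True)
    def sample(seed):
        # Stateless RNG: the output is a pure function of `seed`, so compiling
        # with XLA does not change its behaviour, unlike tf.random.normal.
        return tf.random.stateless_normal([4], seed=seed)

    print(sample(tf.constant([1, 2], dtype=tf.int32)))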
PiperOrigin-RevId: 313420872 Change-Id: I506c3115a807b64b7245a855ca01b55fd006b960 --- tensorflow/compiler/xla/g3doc/index.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/g3doc/index.md b/tensorflow/compiler/xla/g3doc/index.md index b7868fedb8b..60bde306266 100644 --- a/tensorflow/compiler/xla/g3doc/index.md +++ b/tensorflow/compiler/xla/g3doc/index.md @@ -174,9 +174,33 @@ When filing bugs, attach the contents of the `/tmp/generated` directory If possible, try to isolate a bug to a single XLA program by using the -[`replay_computation`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/tools/replay_computation.cc) +[`replay_computation`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/tools/run_hlo_module_main.cc) and iteratively running it on generated programs. +## Known Issues + +Compilation with XLA can greatly improve the performance of your programs, but +the TensorFlow interop has a number of known sharp corners. + +### TensorArray TF/XLA Interconversion + +The problem manifests itself as an error message +`Support for TensorList crossing the XLA/TF boundary is not implemented`. + +XLA supports `tf.TensorArray`. However, the _interconversion_ between TF and +XLA representations is not implemented yet. +This error often arises when the `TensorArray` is used inside the compiled +block, but the derivative is taken outside. + +Workaround: compile the outermost scope which is taking the derivative. + +### Random Number Generation + +XLA currently ignores TF seeds to random operations. This affects stateful TF +random operations, such as `tf.random.normal`, or `tf.nn.dropout`. XLA will +behave as if the compilation was seeded with a new unique seed at each run. This +limitation does not apply to stateless random ops. + ## XLA Frontends Apart from TensorFlow, XLA programs can be generated by: From 076bbc5edfe655299b006b3c1b2c6281d330d638 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 11:02:15 -0700 Subject: [PATCH 1218/1533] Fix for getting shape of a 0-dimensional tensor. PiperOrigin-RevId: 313423211 Change-Id: I99fb793bdac2b3aba59c045cebe15b1c35c43c97 --- tensorflow/lite/micro/micro_allocator.cc | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index b67e158980d..c4f7f859e99 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -46,6 +46,10 @@ struct AllocationInfo { // requirement for SIMD extensions. constexpr int kBufferAlignment = 16; +// Static instance of a zero-length int to pass as tensor dims for a flatbuffer +// Tensor with no shape. +constexpr TfLiteIntArray kZeroLengthIntArray = {0, {}}; + class MicroBuiltinDataAllocator : public BuiltinDataAllocator { public: explicit MicroBuiltinDataAllocator(SimpleMemoryAllocator* memory_allocator) @@ -311,11 +315,17 @@ TfLiteStatus InitializeTfLiteTensorFromFlatbuffer( flatbuffer_tensor, &result->bytes, &type_size, error_reporter)); // TFLM doesn't allow reshaping the tensor which requires dynamic memory - // allocation so it is safe to drop the const qualifier. In the future, if we - // really want to update the tensor shape, we can always pass in a new + // allocation so it is safe to drop the const qualifier. 
In the future, if + // we really want to update the tensor shape, we can always pass in a new // TfLiteIntArray - especially we have to do so if the dimension is changed. - result->dims = const_cast( - reinterpret_cast(flatbuffer_tensor.shape())); + if (flatbuffer_tensor.shape() == nullptr) { + // flatbuffer_tensor.shape() can return a nullptr in the case of a scalar + // tensor. + result->dims = const_cast(&kZeroLengthIntArray); + } else { + result->dims = const_cast( + reinterpret_cast(flatbuffer_tensor.shape())); + } // Copy the quantization information from the serialized data. const auto* src_quantization = flatbuffer_tensor.quantization(); From 14da8c0f32bcfef47b39fba06ebfa5444e7b01fe Mon Sep 17 00:00:00 2001 From: Alan Chiao Date: Wed, 27 May 2020 11:06:25 -0700 Subject: [PATCH 1219/1533] Remove dead link to "quantization". What it pointed to previously (TFMOT post-training docs) didn't provide additional useful information beyond this paragraph itself. For more on "what quantization is", the available information is available as people need it (when they use the different forms of quantization tools) PiperOrigin-RevId: 313424121 Change-Id: Idd1014d9fcdd3ea415ee07f3630d52a96f714f39 --- tensorflow/lite/g3doc/performance/model_optimization.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/model_optimization.md b/tensorflow/lite/g3doc/performance/model_optimization.md index c66b06f9b59..c45aacbb0c8 100644 --- a/tensorflow/lite/g3doc/performance/model_optimization.md +++ b/tensorflow/lite/g3doc/performance/model_optimization.md @@ -79,10 +79,9 @@ with TensorFlow Lite. ### Quantization -[Quantization](https://www.tensorflow.org/model_optimization/guide/quantization) -works by reducing the precision of the numbers used to represent a model's -parameters, which by default are 32-bit floating point numbers. This results in -a smaller model size and faster computation. +Quantization works by reducing the precision of the numbers used to represent a +model's parameters, which by default are 32-bit floating point numbers. This +results in a smaller model size and faster computation. The following types of quantization are available in TensorFlow Lite: From 42ca7c206316978a8efff46c09bc2a495a88c4bc Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Wed, 27 May 2020 16:22:38 +0000 Subject: [PATCH 1220/1533] in resolution of [Wsign-compare] warning id 1 --- tensorflow/core/platform/protobuf.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/platform/protobuf.cc b/tensorflow/core/platform/protobuf.cc index 1912ab11e62..a9a467e2813 100644 --- a/tensorflow/core/platform/protobuf.cc +++ b/tensorflow/core/platform/protobuf.cc @@ -23,7 +23,7 @@ const char* kProtobufUint64Typename = "::tensorflow::protobuf_uint64"; TStringOutputStream::TStringOutputStream(tstring* target) : target_(target) {} bool TStringOutputStream::Next(void** data, int* size) { - int old_size = target_->size(); + size_t old_size = target_->size(); // Grow the string. if (old_size < target_->capacity()) { @@ -32,7 +32,7 @@ bool TStringOutputStream::Next(void** data, int* size) { target_->resize_uninitialized(target_->capacity()); } else { // Size has reached capacity, try to double the size. 
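The quantization paragraph in the model_optimization.md hunk above can be made concrete with the most common path, post-training dynamic-range quantization via the TFLite converter. A hedged sketch (the SavedModel and output paths are placeholders):

    import tensorflow as tf

    converter = tf.lite.TFLiteConverter.from_saved_model("/tmp/my_saved_model")
    converter.optimizations = [tf.lite.Optimize.DEFAULT]  # enable quantization
    tflite_quant_model = converter.convert()
    with open("/tmp/model_quant.tflite", "wb") as f:
        f.write(tflite_quant_model)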
- if (old_size > std::numeric_limits::max() / 2) { + if (old_size > std::numeric_limits::max() / 2) { // Can not double the size otherwise it is going to cause integer // overflow in the expression below: old_size * 2 "; return false; @@ -41,7 +41,7 @@ bool TStringOutputStream::Next(void** data, int* size) { // kMinimumSize. target_->resize_uninitialized( std::max(old_size * 2, - kMinimumSize + 0)); // "+ 0" works around GCC4 weirdness. + (size_t)kMinimumSize + 0)); // "+ 0" works around GCC4 weirdness. } *data = target_->data() + old_size; From 56479bb19a00894664d24abb0d0cb5f211415c95 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 27 May 2020 11:16:17 -0700 Subject: [PATCH 1221/1533] [XLA:SPMD] Use a shape size function that does not check layout PiperOrigin-RevId: 313426122 Change-Id: Icf474ce9e10368ce22c30afc32ff0b2d2e480f91 --- .../xla/service/spmd/spmd_partitioner.cc | 34 +++++++++---------- .../xla/service/spmd/spmd_partitioner_util.cc | 5 +++ .../xla/service/spmd/spmd_partitioner_util.h | 4 +++ 3 files changed, 25 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index 8eee452328e..eb0a9c330c3 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -81,11 +81,10 @@ void SpmdLogger::RegisterLogEntry(HloInstruction* hlo, string report = hlo->ToString(); int64 max_value = -1; for (HloInstruction* inst : group) { - if (inst->shape().IsTuple()) { + if (!inst->shape().IsArray()) { continue; } - max_value = - std::max(max_value, ShapeUtil::ByteSizeOf(inst->shape(), 4)); + max_value = std::max(max_value, ShapeSizeInBytes(inst->shape())); absl::StrAppend(&report, " * ", inst->ToString(), "\n"); } entries_.push_back(std::make_pair(max_value, report)); @@ -149,14 +148,14 @@ template const auto add_report = [&](std::vector* insts) { std::sort(insts->begin(), insts->end(), [](const HloInstruction* inst0, const HloInstruction* inst1) { - return ShapeUtil::ByteSizeOf(inst0->shape()) > - ShapeUtil::ByteSizeOf(inst1->shape()); + return ShapeSizeInBytes(inst0->shape()) > + ShapeSizeInBytes(inst1->shape()); }); for (int64 i = 0; i < std::min(report_instruction_count, insts->size()); ++i) { absl::StrAppend(&report, " ", tensorflow::strings::HumanReadableNumBytes( - ShapeUtil::ByteSizeOf((*insts)[i]->shape())), + ShapeSizeInBytes((*insts)[i]->shape())), " : ", (*insts)[i]->ToString(), "\n"); } }; @@ -1180,8 +1179,8 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( operand, scatter_dims_to_operand_dims, slice_size, num_partitions_) && - ShapeUtil::ByteSizeOf(updates.base_shape()) < - ShapeUtil::ByteSizeOf(scatter->shape())) { + ShapeSizeInBytes(updates.base_shape()) < + ShapeSizeInBytes(scatter->shape())) { // Operand is sharded on trivial slice dims (update slice size 1). We can // adjust the indices on each partition by subtracting the offsets. 
Then // we execute a scatter on full updated indices, and out-of-bound accesses @@ -1968,8 +1967,8 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( operand, start_index_map, gather->gather_slice_sizes(), num_partitions_) && - ShapeUtil::ByteSizeOf(gather->shape()) < - ShapeUtil::ByteSizeOf(gather->operand(0)->shape())) { + ShapeSizeInBytes(gather->shape()) < + ShapeSizeInBytes(gather->operand(0)->shape())) { indices = indices.Reshard(HloSharding::Replicate()); // Now the operand is partitioned in trivial slice dimensions, and the // indices are replicated. We execute a gather on partitioned operand, @@ -2762,8 +2761,7 @@ Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( auto zero = b_.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::Zero(hlo->shape().element_type()))); - if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < - ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (ShapeSizeInBytes(lhs.base_shape()) < ShapeSizeInBytes(rhs.base_shape())) { if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { return DefaultAction(hlo); } @@ -3005,8 +3003,8 @@ Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { }; auto zero = b_.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::Zero(hlo->shape().element_type()))); - if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < - ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (ShapeSizeInBytes(lhs.base_shape()) < + ShapeSizeInBytes(rhs.base_shape())) { if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { return DefaultAction(hlo); } @@ -3731,7 +3729,7 @@ Status SpmdPartitioningVisitor::HandleDotHelper( }; if (output_lhs_non_contracting_partitions == num_partitions_ && output_sharding_transposed_to_match_lhs == lhs_sharding && - ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()) >= + ShapeSizeInBytes(hlo->operand(1)->shape()) >= options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { if (rhs_contracting_partitions == num_partitions_) { return emit_windowed_dot_general(0, 1, true, false); @@ -3745,7 +3743,7 @@ Status SpmdPartitioningVisitor::HandleDotHelper( } if (output_rhs_non_contracting_partitions == num_partitions_ && output_sharding_transposed_to_match_rhs == rhs_sharding && - ShapeUtil::ByteSizeOf(hlo->operand(0)->shape()) >= + ShapeSizeInBytes(hlo->operand(0)->shape()) >= options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { if (lhs_contracting_partitions == num_partitions_) { return emit_windowed_dot_general(1, 0, true, false); @@ -3775,8 +3773,8 @@ Status SpmdPartitioningVisitor::HandleDotHelper( LiteralUtil::Zero(hlo->shape().element_type()))); // Pad both sides with zero, since NaN at one side cannot be masked by zero // on the other side. 
- if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < - ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (ShapeSizeInBytes(lhs.base_shape()) < + ShapeSizeInBytes(rhs.base_shape())) { lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); rhs = rhs.PadWithValue(zero); diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index 207f854cd9f..57617b59ffb 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -104,6 +104,11 @@ Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding) { return sharding.TileShape(shape); } +int64 ShapeSizeInBytes(const Shape& shape) { + return ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type()) * + ShapeUtil::ElementsIn(shape); +} + Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, const HloSharding& sharding, int64 partition_id) { diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h index f96b23d7073..440f0e78112 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -57,6 +57,10 @@ bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding); // target sharding. Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding); +// Similar to ShapeUtil::ByteSizeOf(), but does not check it has dense layout +// since this can be before layout assignment. +int64 ShapeSizeInBytes(const Shape& shape); + // Returns the shard shape for a partition without padding due to uneven // sharding. Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, From a5622fee575a238ecae9b70ff079d2f7a2f903a6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 11:18:31 -0700 Subject: [PATCH 1222/1533] Fix bugs in quantized PRELU operations. 
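The fix that follows corrects the two requantization multipliers used by the quantized PReLU kernel. A hedged float reference of what PReLU computes, with the corrected multiplier formulas as they appear in the diff (NumPy sketch; the names are for illustration only):

    import numpy as np

    def prelu_reference(x, alpha):
        # Identity for non-negative inputs, alpha * x for negative inputs.
        return np.where(x >= 0.0, x, alpha * x)

    # Corrected requantization multipliers for the quantized kernel:
    #   multiplier_1 = input_scale / output_scale                  # x >= 0 branch
    #   multiplier_2 = input_scale * alpha_scale / output_scale    # x <  0 branch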
PiperOrigin-RevId: 313426576 Change-Id: Ifb53ef0add80b5793e428fbfacbbea779bc9ae63 --- tensorflow/lite/micro/kernels/prelu.cc | 5 +++-- tensorflow/lite/micro/kernels/prelu_test.cc | 22 ++++++++++----------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/micro/kernels/prelu.cc b/tensorflow/lite/micro/kernels/prelu.cc index 801181abba4..921aa208ea2 100644 --- a/tensorflow/lite/micro/kernels/prelu.cc +++ b/tensorflow/lite/micro/kernels/prelu.cc @@ -68,8 +68,9 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { int output_shift_1 = 0; int32_t output_multiplier_2 = 0; int output_shift_2 = 0; - if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) { - double real_multiplier_1 = static_cast(input->params.scale) * + if (output->type == kTfLiteInt8 || output->type == kTfLiteUInt8 || + output->type == kTfLiteInt16) { + double real_multiplier_1 = static_cast(input->params.scale) / static_cast(output->params.scale); double real_multiplier_2 = static_cast(input->params.scale) * static_cast(alpha->params.scale) / diff --git a/tensorflow/lite/micro/kernels/prelu_test.cc b/tensorflow/lite/micro/kernels/prelu_test.cc index 66c0a609e8a..4199ae69689 100644 --- a/tensorflow/lite/micro/kernels/prelu_test.cc +++ b/tensorflow/lite/micro/kernels/prelu_test.cc @@ -156,14 +156,14 @@ TF_LITE_MICRO_TESTS_BEGIN TF_LITE_MICRO_TEST(FloatPreluActivationsOpTest) { const int output_dims_count = 12; float output_data[output_dims_count]; - tflite::testing::TestPreluFloat({1, 2, 2, 3}, // input shape + tflite::testing::TestPreluFloat({3, 2, 2, 3}, // input shape { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 1.0f, 1.0f, 1.0f, // Row 1, Column 2 -1.0f, -1.0f, -1.0f, // Row 2, Column 1 -2.0f, -2.0f, -2.0f, // Row 1, Column 2 }, - {1, 1, 1, 3}, // alpha shape + {3, 1, 1, 3}, // alpha shape {0.0f, 1.0f, 2.0f}, // alpha values { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 @@ -171,26 +171,26 @@ TF_LITE_MICRO_TEST(FloatPreluActivationsOpTest) { 0.0f, -1.0f, -2.0f, // Row 2, Column 1 0.0f, -2.0f, -4.0f, // Row 1, Column 2 }, - {1, 2, 2, 3}, // output shape + {3, 2, 2, 3}, // output shape output_data); } TF_LITE_MICRO_TEST(QuantizedUint8PreluActivationsOpTest) { using tflite::testing::F2Q; - const float kMin = -1; - const float kMax = 127.f / 128.f; + const float kMin = -4; + const float kMax = 127.f / 32.f; const float kAlphaMin = -0.5f; const float kAlphaMax = 0.5f; const int output_dims_count = 12; uint8_t output_data[output_dims_count]; tflite::testing::TestPreluQuantized( - {1, 2, 2, 3}, // input shape + {3, 2, 2, 3}, // input shape {F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(-1.0f, kMin, kMax), F2Q(-1.0f, kMin, kMax), F2Q(-1.0f, kMin, kMax), F2Q(-0.25f, kMin, kMax), F2Q(-0.25f, kMin, kMax), F2Q(-0.25f, kMin, kMax)}, - kMin, kMax, {1, 1, 1, 3}, // alpha shape + kMin, kMax, {3, 1, 1, 3}, // alpha shape {F2Q(0.0f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(-0.5f, kMin, kMax)}, kMin, kMax, {F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), @@ -198,7 +198,7 @@ TF_LITE_MICRO_TEST(QuantizedUint8PreluActivationsOpTest) { F2Q(0.0f, kMin, kMax), F2Q(-0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(-0.125f, kMin, kMax), F2Q(0.125f, kMin, kMax)}, - {1, 2, 2, 3}, // output shape + {3, 2, 2, 3}, // output shape kMin, kMax, output_data); } @@ -211,13 +211,13 @@ TF_LITE_MICRO_TEST(QuantizedInt8PreluActivationsOpTest) { const int output_dims_count = 12; int8_t 
output_data[output_dims_count]; tflite::testing::TestPreluQuantized( - {1, 2, 2, 3}, // input shape + {3, 2, 2, 3}, // input shape {F2QS(0.0f, kMin, kMax), F2QS(0.0f, kMin, kMax), F2QS(0.0f, kMin, kMax), F2QS(0.5f, kMin, kMax), F2QS(0.5f, kMin, kMax), F2QS(0.5f, kMin, kMax), F2QS(-1.0f, kMin, kMax), F2QS(-1.0f, kMin, kMax), F2QS(-1.0f, kMin, kMax), F2QS(-0.25f, kMin, kMax), F2QS(-0.25f, kMin, kMax), F2QS(-0.25f, kMin, kMax)}, - kMin, kMax, {1, 1, 1, 3}, // alpha shape + kMin, kMax, {3, 1, 1, 3}, // alpha shape {F2QS(0.0f, kMin, kMax), F2QS(0.5f, kMin, kMax), F2QS(-0.5f, kMin, kMax)}, kMin, kMax, {F2QS(0.0f, kMin, kMax), F2QS(0.0f, kMin, kMax), F2QS(0.0f, kMin, kMax), @@ -225,7 +225,7 @@ TF_LITE_MICRO_TEST(QuantizedInt8PreluActivationsOpTest) { F2QS(0.0f, kMin, kMax), F2QS(-0.5f, kMin, kMax), F2QS(0.5f, kMin, kMax), F2QS(0.0f, kMin, kMax), F2QS(-0.125f, kMin, kMax), F2QS(0.125f, kMin, kMax)}, - {1, 2, 2, 3}, // output shape + {3, 2, 2, 3}, // output shape kMin, kMax, output_data); } TF_LITE_MICRO_TESTS_END From dc18758c270de25d5b37a55d4b41af1157dbe625 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Wed, 27 May 2020 11:20:16 -0700 Subject: [PATCH 1223/1533] Roll forward "Add a show_fusion_subcomputations command to interactive_graphviz" with fix PiperOrigin-RevId: 313426932 Change-Id: Ia2366ee899d7bd0d69448144d1c18164d5801753 --- .../compiler/xla/service/hlo_graph_dumper.cc | 25 ++++++----- .../compiler/xla/service/hlo_graph_dumper.h | 14 ++++-- .../xla/tools/interactive_graphviz.cc | 44 +++++++++++++------ 3 files changed, 56 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 3930898d665..ad21efa13c9 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -312,12 +312,13 @@ optional MatchTrivialComputation(const HloComputation* computation) { class HloDotDumper { public: HloDotDumper(const HloComputation* computation, absl::string_view label, - const DebugOptions& debug_options, bool show_backend_config, + const DebugOptions& debug_options, + HloRenderOptions hlo_render_options, const HloExecutionProfile* profile, NodeFilter filter) : computation_(computation), label_(label), debug_options_(debug_options), - show_backend_config_(show_backend_config), + hlo_render_options_(hlo_render_options), profile_(profile), filter_(std::move(filter)) {} @@ -384,7 +385,7 @@ class HloDotDumper { const HloComputation* computation_; // never null const string label_; // overall name for the graph const DebugOptions& debug_options_; - const bool show_backend_config_; + const HloRenderOptions hlo_render_options_; const HloExecutionProfile* profile_; // may be null const NodeFilter filter_; @@ -565,7 +566,8 @@ bool HloDotDumper::ShouldShowFusionSubcomputation(const HloInstruction* instr) { bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) { if (subcomp->IsFusionComputation()) { const HloInstruction* fusion = subcomp->FusionInstruction(); - if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion)) { + if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion) || + !hlo_render_options_.show_fusion_subcomputations) { return false; } } @@ -1133,7 +1135,8 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { string HloDotDumper::GetInstructionNodeBackendConfig( const HloInstruction* instr) { - if (!show_backend_config_ || instr->raw_backend_config_string().empty()) { 
+ if (!hlo_render_options_.show_backend_config || + instr->raw_backend_config_string().empty()) { return ""; } @@ -1604,14 +1607,14 @@ StatusOr RenderGraph(const HloComputation& computation, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile, - bool show_backend_config) { + HloRenderOptions hlo_render_options) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return Unavailable("Can't render as URL; no URL renderer was registered."); } string rendered_dot = - HloDotDumper(&computation, label, debug_options, show_backend_config, + HloDotDumper(&computation, label, debug_options, hlo_render_options, hlo_execution_profile, NodeFilter()) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1619,7 +1622,7 @@ StatusOr RenderGraph(const HloComputation& computation, StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - bool show_backend_config, + HloRenderOptions hlo_render_options, const absl::flat_hash_set& boundary) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { @@ -1632,7 +1635,7 @@ StatusOr RenderNeighborhoodAround( string rendered_dot = HloDotDumper(node.parent(), label, node.GetModule()->config().debug_options(), - show_backend_config, /*profile=*/nullptr, + hlo_render_options, /*profile=*/nullptr, MakeNodeRadiusAroundFilter(&node, radius, boundary)) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1641,7 +1644,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - bool show_backend_config) { + HloRenderOptions hlo_render_options) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return FailedPrecondition( @@ -1663,7 +1666,7 @@ StatusOr RenderAllPathsFromTo(const HloInstruction& from, "NODES***
<br/><br/>
"); } string rendered_dot = - HloDotDumper(from.parent(), label, debug_options, show_backend_config, + HloDotDumper(from.parent(), label, debug_options, hlo_render_options, /*profile=*/nullptr, filter) .Dump(); return WrapDotInFormat(rendered_dot, format); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h index 324ac67a6dd..528de77e4e6 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h @@ -50,6 +50,14 @@ enum class RenderedGraphFormat { kUrl, }; +struct HloRenderOptions { + // Include the backend config string in the rendered graph. + bool show_backend_config = false; + + // Include the fusion subcomputations in the rendered graph. + bool show_fusion_subcomputations = true; +}; + // Renders an HLO module as a human-readable visual graph. // // Note that this only works well for relatively small graphs (no more than a @@ -61,7 +69,7 @@ StatusOr RenderGraph( const HloComputation& computation, absl::string_view label, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile = nullptr, - bool show_backend_config = false); + HloRenderOptions hlo_render_options = {}); // Like RenderGraph, but renders only nodes "near" the given node in the graph. // @@ -73,7 +81,7 @@ StatusOr RenderGraph( // will be omitted even if they are within the radius. StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - bool show_backend_config = false, + HloRenderOptions hlo_render_options = {}, const absl::flat_hash_set& boundary = {}); // Renders nodes on any of the paths from `from` to `to`. If there are more @@ -82,7 +90,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - bool show_backend_config = false); + HloRenderOptions hlo_render_options = {}); // Registers a function which implements RenderedGraphFormat::kUrl. // diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc index 4f8a6b43314..b6c62beff74 100644 --- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc +++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc @@ -112,8 +112,7 @@ constexpr int64 kDefaultMaxNumNodesInAllPaths = 100; using absl::EqualsIgnoreCase; -// A global control for whether backend configuration display is enabled. -bool show_backend_config = true; +HloRenderOptions hlo_render_options; HloInstruction* FindInstruction(const HloModule& module, string node_name) { if (absl::StartsWith(node_name, "%")) { @@ -160,6 +159,8 @@ void DoHelpCommand() { Renders all nodes in . backend_config [on|off] Controls whether backend operation configuration information is printed. + show_fusion_subcomputations [on|off] + Controls whether fusion subcomputations are shown. list [name|op_name|op_type] Lists all instructions whose name, metadata op_name, or metadata op_type contains as a substring. @@ -182,15 +183,32 @@ void DoHelpCommand() { // Turn metadata-printing on or off. 
void DoBackendConfigCommand(const std::vector& tokens) { if (tokens.size() == 2 && tokens[1] == "on") { - show_backend_config = true; + hlo_render_options.show_backend_config = true; } else if (tokens.size() == 2 && tokens[1] == "off") { - show_backend_config = false; + hlo_render_options.show_backend_config = false; } else if (tokens.size() != 1) { std::cerr << "(Illegal backend_config value. Use either 'on' or 'off'.)" << std::endl; } std::cout << "Backend configuration display " - << (show_backend_config ? "ON" : "OFF") << std::endl; + << (hlo_render_options.show_backend_config ? "ON" : "OFF") + << std::endl; +} + +// Turn fusion computation display on or off. +void DoShowFusionSubcomputationsCommand(const std::vector& tokens) { + if (tokens.size() == 2 && tokens[1] == "on") { + hlo_render_options.show_fusion_subcomputations = true; + } else if (tokens.size() == 2 && tokens[1] == "off") { + hlo_render_options.show_fusion_subcomputations = false; + } else if (tokens.size() != 1) { + std::cerr << "(Illegal show_fusion_subcomputations value. Use either " + "'on' or 'off'.)" + << std::endl; + } + std::cout << "Fusion subcomputations display " + << (hlo_render_options.show_fusion_subcomputations ? "ON" : "OFF") + << std::endl; } // List all computations in the module. @@ -373,7 +391,7 @@ void DoExtractCommand(const HloModule& module, auto extracted_module = ExtractModule(instr, height); std::cout << extracted_module->ToString( HloPrintOptions::ShortParsable().set_print_backend_config( - show_backend_config)) + hlo_render_options.show_backend_config)) << std::endl; } @@ -517,7 +535,7 @@ void DoAllPathsCommand(const Options& opts, const HloModule& module, } RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderAllPathsFromTo(*from, *to, max_nodes, format, - /*show_backend_config=*/show_backend_config); + hlo_render_options); }); } @@ -582,15 +600,13 @@ void DoPlotCommand(const Options& opts, const HloModule& module, RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderGraph(*comp, /*label=*/"", comp->parent()->config().debug_options(), format, - /*hlo_execution_profile=*/nullptr, - /*show_backend_config=*/show_backend_config); + /*hlo_execution_profile=*/nullptr, hlo_render_options); }); } else { RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { - return RenderNeighborhoodAround( - *instr, graph_width, format, - /*show_backend_config=*/show_backend_config, - /*boundary=*/boundary); + return RenderNeighborhoodAround(*instr, graph_width, format, + hlo_render_options, + /*boundary=*/boundary); }); } } @@ -617,6 +633,8 @@ void InteractiveDumpGraphs(const Options& opts, const HloModule& module) { DoHelpCommand(); } else if (tokens[0] == "backend_config") { DoBackendConfigCommand(tokens); + } else if (tokens[0] == "show_fusion_subcomputations") { + DoShowFusionSubcomputationsCommand(tokens); } else if (tokens[0] == "list") { if (tokens.size() > 1 && tokens[1] == "computations") { DoListComputationsCommand(module, tokens); From b266b468252baa1b2a8348c86ffec071fc90fa95 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 27 May 2020 11:26:31 -0700 Subject: [PATCH 1224/1533] [XLA:GPU] Use the generic implementation for elemental reduce The generic version used in fusions didn't support variadic reduction on GPU (it did on CPU), so tie up some loose ends and use the generic version. 
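"Variadic reduction" here means a reduce whose accumulator is a tuple of values rather than a single scalar. A hedged NumPy sketch of the idea, expressing argmax as a two-element reduction (not code from this patch):

    import numpy as np

    def argmax_as_variadic_reduce(values):
        # The accumulator is the pair (best value, best index); each step
        # combines the pair with the next (value, index) operand.
        best_val, best_idx = -np.inf, -1
        for i, v in enumerate(np.asarray(values, dtype=np.float64)):
            if v > best_val:
                best_val, best_idx = v, i
        return best_val, best_idx

    print(argmax_as_variadic_reduce([3.0, 7.0, 5.0]))  # (7.0, 1)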
PiperOrigin-RevId: 313428251 Change-Id: Ide547280b0fcf04a99a51b721d8ca860c9da6305 --- .../xla/service/gpu/elemental_ir_emitter.h | 9 +- .../compiler/xla/service/gpu/ir_emitter.cc | 136 +++--------------- .../compiler/xla/service/gpu/ir_emitter.h | 3 +- 3 files changed, 25 insertions(+), 123 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index 3c4e9f7c1e6..a3056b1ddad 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -40,7 +40,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { public: // A NestedComputer computes an element of the output of the given computation // given a Span of its input elements. - using NestedComputer = std::function( + using NestedComputer = std::function>( const HloComputation&, absl::Span)>; GpuElementalIrEmitter(const HloModuleConfig& hlo_module_config, @@ -91,12 +91,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { StatusOr> EmitThreadLocalCall( const HloComputation& callee, absl::Span parameters, absl::string_view) override { - // TODO(b/118332391): Supported variadic return values. - auto result = compute_nested_(callee, parameters); - if (!result.ok()) { - return result.status(); - } - return std::vector{result.ValueOrDie()}; + return compute_nested_(callee, parameters); } llvm::Value* EmitThreadId() override; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 744cd7b56bf..aa8a6215cc7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -698,115 +698,6 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) { return Status::OK(); } -Status IrEmitter::HandleReduce(HloInstruction* instr) { - const HloReduceInstruction* reduce = Cast(instr); - const Shape& out_shape = reduce->shape(); - bool returns_tuple = !out_shape.IsArray(); - int accumulators_count = 1; - if (returns_tuple) { - CHECK(out_shape.IsTuple()); - accumulators_count = out_shape.tuple_shapes_size(); - } - - auto arg = reduce->operand(0); - absl::Span dimensions(reduce->dimensions()); - HloComputation* function = reduce->to_apply(); - return EmitTargetElementLoop( - *reduce, - [=](const llvm_ir::IrArray::Index& index) -> StatusOr { - std::vector accumulator_addrs; - std::vector accumulator_types; - - // Initialize accumulators with initial values. - for (int i = 0; i < accumulators_count; i++) { - auto init_value = reduce->init_values()[i]; - const Shape& element_shape = - returns_tuple ? out_shape.tuple_shapes(i) : out_shape; - PrimitiveType accumulator_type = element_shape.element_type(); - llvm::Type* accumulator_llvm_type = - llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); - llvm::AllocaInst* accumulator_addr = Alloca(accumulator_llvm_type); - Store(Load(GetBasePointer(*init_value)), accumulator_addr); - accumulator_addrs.push_back(accumulator_addr); - accumulator_types.push_back(accumulator_llvm_type); - } - - // The enclosing loops go over all the target elements. Now we have to - // compute the actual target element. For this, we build a new loop nest - // to iterate over all the reduction dimensions in the argument. - // AddLoopsForShapeOnDimensions will return an Index where induction - // Value*s are placed for each dimension in dimensions, and all the rest - // are nullptrs. 
- llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions, - "reduction_dim"); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - // Build a full index for the input argument, using reduced_dims_index - // as the base. In reduced_dims_index only the reduction dimensions are - // filled in. We fill in the rest of the dimensions with induction - // Value*s taken from 'index' which iterates over the target array. - // See the high-level description in the XLA documentation for details. - llvm_ir::IrArray::Index::const_iterator it = index.begin(); - - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = *it++; - } - } - CHECK(index.end() == it); - - // Apply the reduction function to the loaded value. - llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), - b_.getInt64Ty()); - std::vector reduction_operands(accumulator_addrs.begin(), - accumulator_addrs.end()); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* input_address = - GetIrArray(*reduce->operand(i), *reduce) - .EmitArrayElementAddress(input_index, &b_); - reduction_operands.push_back(input_address); - } - - llvm::Value* ret_argument; - if (!returns_tuple) { - CHECK_EQ(accumulator_addrs.size(), 1); - ret_argument = accumulator_addrs[0]; - } else { - const Shape& return_shape = function->root_instruction()->shape(); - - llvm::Type* return_value_buffer_type = - llvm_ir::ShapeToIrType(return_shape, module_); - ret_argument = Alloca(return_value_buffer_type); - llvm_ir::IrArray tuple_array(ret_argument, return_shape); - EmitTuple(tuple_array, accumulator_addrs, &b_); - } - - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *function, reduction_operands, ret_argument)); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - - if (!returns_tuple) { - CHECK_EQ(accumulator_addrs.size(), 1); - return Load(accumulator_addrs[0]); - } else { - // Emit a struct for the LoopEmitter dealing with multi-output - // fusion. - llvm::Value* returned_structure = llvm::UndefValue::get( - llvm::StructType::get(b_.getContext(), accumulator_types)); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* accumulator_value = Load(accumulator_addrs[i]); - returned_structure = - b_.CreateInsertValue(returned_structure, accumulator_value, i); - } - return returned_structure; - } - }); -} - Status IrEmitter::HandleFusion(HloInstruction* fusion) { // kFusion for library calls should be handled by // IrEmitterUnnested::HandleFusion. 
@@ -866,22 +757,39 @@ Status IrEmitter::HandleBatchNormGrad(HloInstruction*) { "to a cudnn CustomCall using CudnnBatchNormRewriter."); } -StatusOr IrEmitter::ComputeNestedElement( +StatusOr> IrEmitter::ComputeNestedElement( const HloComputation& computation, absl::Span parameter_elements) { + const Shape& return_shape = computation.root_instruction()->shape(); llvm::Value* return_buffer = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType( - computation.root_instruction()->shape().element_type(), module_), - "return_buffer", &b_); + llvm_ir::ShapeToIrType(return_shape, module_), "return_buffer", &b_); std::vector parameter_buffers; for (llvm::Value* parameter_element : parameter_elements) { parameter_buffers.push_back(llvm_ir::EmitAllocaAtFunctionEntry( parameter_element->getType(), "parameter_buffer", &b_)); Store(parameter_element, parameter_buffers.back()); } + + std::vector allocas_for_returned_scalars; + if (!return_shape.IsTuple()) { + allocas_for_returned_scalars.push_back(return_buffer); + } else { + allocas_for_returned_scalars = + llvm_ir::EmitTupleAllocasAtFunctionEntry(return_shape, &b_); + llvm_ir::IrArray tuple_array(return_buffer, return_shape); + + EmitTuple(tuple_array, allocas_for_returned_scalars, &b_); + } + TF_RETURN_IF_ERROR(EmitCallToNestedComputation(computation, parameter_buffers, return_buffer)); - return Load(return_buffer); + + std::vector returned_scalars; + returned_scalars.reserve(allocas_for_returned_scalars.size()); + for (llvm::Value* addr : allocas_for_returned_scalars) { + returned_scalars.push_back(Load(addr)); + } + return returned_scalars; } std::vector IrEmitter::ConstructIrArrayForOutputs( diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index e0fe454dcfe..93712961ea2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -89,7 +89,6 @@ class IrEmitter : public DfsHloVisitorWithDefault, Status HandleRecv(HloInstruction* recv) override; Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleParameter(HloInstruction* parameter) override; - Status HandleReduce(HloInstruction* reduce) override; Status HandleTuple(HloInstruction* tuple) override; Status HandleScatter(HloInstruction* scatter) override; Status HandleSelect(HloInstruction* select) override; @@ -213,7 +212,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, const llvm_ir::IrArray::Index& compare_keys_index, const llvm_ir::IrArray& keys_array); - StatusOr ComputeNestedElement( + StatusOr> ComputeNestedElement( const HloComputation& computation, absl::Span parameter_elements); From 7cedc771d48db5d18048696a8083be59cc3e1437 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Wed, 27 May 2020 11:26:43 -0700 Subject: [PATCH 1225/1533] Internal testing change for coverage tests PiperOrigin-RevId: 313428290 Change-Id: I2a9d6684ddb26e71ea5be4de4c1ed7c1b11cb0d8 --- tensorflow/lite/build_def.bzl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index f6cdb981328..fd51ad0a4aa 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -680,6 +680,9 @@ def gen_model_coverage_test(src, model_name, data, failure_type, tags, size = "m if failure_type[i] != "none": args.append("--failure_type=%s" % failure_type[i]) i = i + 1 + + # Avoid coverage timeouts for large/enormous tests. 
+ coverage_tags = ["nozapfhahn"] if size in ["large", "enormous"] else [] native.py_test( name = "model_coverage_test_%s_%s" % (model_name, target_op_sets.lower().replace(",", "_")), srcs = [src], @@ -696,7 +699,7 @@ def gen_model_coverage_test(src, model_name, data, failure_type, tags, size = "m "no_gpu", # Executing with TF GPU configurations is redundant. "no_oss", "no_windows", - ] + tags, + ] + tags + coverage_tags, deps = [ "//tensorflow/lite/testing/model_coverage:model_coverage_lib", "//tensorflow/lite/python:lite", From 90b80fba1ade0222713b8a33af00858190532075 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 27 May 2020 11:54:19 -0700 Subject: [PATCH 1226/1533] [TF/XLA] On compilation failure, do not overflow the max size of the bad status by a huge list of function inputs PiperOrigin-RevId: 313433935 Change-Id: Iaff5c61ce01c6eac7894bed4edd76a396f846151 --- tensorflow/compiler/jit/xla_kernel_creator_util.cc | 2 +- tensorflow/core/framework/node_def_util.cc | 6 +++++- tensorflow/core/framework/node_def_util.h | 7 ++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/jit/xla_kernel_creator_util.cc b/tensorflow/compiler/jit/xla_kernel_creator_util.cc index 99046c0bd76..3cc68f2a1a4 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator_util.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator_util.cc @@ -91,7 +91,7 @@ Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, } string message = absl::StrCat( "Function invoked by the following node is not compilable: ", - SummarizeNodeDef(node_def), ".\n"); + SummarizeNodeDef(node_def, /*max_inputs_in_summary=*/10), ".\n"); absl::StrAppend(&message, "Uncompilable nodes:"); for (const auto& node_info : uncompilable_node_info) { string node_message = diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc index 86911c7310a..0a26ceca66f 100644 --- a/tensorflow/core/framework/node_def_util.cc +++ b/tensorflow/core/framework/node_def_util.cc @@ -100,7 +100,7 @@ string AttrSlice::DebugString() const { return absl::StrJoin(attr_key_vals, ", "); } -string SummarizeNodeDef(const NodeDef& node_def) { +string SummarizeNodeDef(const NodeDef& node_def, int max_inputs_in_summary) { string ret = strings::StrCat(errors::FormatNodeNameForError(node_def.name()), " = ", node_def.op(), "["); strings::StrAppend(&ret, SummarizeAttrsHelper(node_def, node_def.device())); @@ -111,6 +111,10 @@ string SummarizeNodeDef(const NodeDef& node_def) { for (const string& input : node_def.input()) { if (!first) strings::StrAppend(&ret, ", "); first = false; + if (max_inputs_in_summary-- == 0) { + strings::StrAppend(&ret, "..."); + break; + } strings::StrAppend(&ret, input); } strings::StrAppend(&ret, ")"); diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h index db3f2570a92..d937a8e51e1 100644 --- a/tensorflow/core/framework/node_def_util.h +++ b/tensorflow/core/framework/node_def_util.h @@ -58,7 +58,12 @@ extern const char* const kColocationGroupPrefix; // Produce a human-readable version of a Node or NodeDef that is more concise // than a text-format proto. -string SummarizeNodeDef(const NodeDef& node_def); +// +// The parameter `max_inputs_in_summary` specifies how many inputs at most to +// serialize in the output (in order not to get a string which is overly large). +// The value `-1` specifies that all inputs will be shown. 
+string SummarizeNodeDef(const NodeDef& node_def, + int max_inputs_in_summary = -1); string SummarizeAttrs(const NodeDef& node_def); string SummarizeAttrsHelper(AttrSlice attrs, StringPiece device); From a8a4d3d73c247f96d577695a660cc7aa6a61240a Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Wed, 27 May 2020 11:57:02 -0700 Subject: [PATCH 1227/1533] Stop explicitly passing a `name` arg to `slice` from `slice_helper`. (while leaving in the name_scopes). This makes the slice operator more dispatch-friendly, because `slice` dispatches while `slice_helper` doesn't. This means the `dispatched` operation no longer has a name locked in and is less likely to run into issues where multiple ops are created w/ the same name. PiperOrigin-RevId: 313434525 Change-Id: I9c7e87b4685a08de7c57a4703b7489e76eb48b76 --- tensorflow/python/ops/array_ops.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index a641633b1f5..118c2cfca55 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -983,7 +983,7 @@ def _slice_helper(tensor, slice_spec, var=None): with ops.name_scope( None, "strided_slice", [tensor] + begin + end + strides, - skip_on_eager=False) as name: + skip_on_eager=False): if begin: packed_begin, packed_end, packed_strides = (stack(begin), stack(end), stack(strides)) @@ -1009,8 +1009,7 @@ def _slice_helper(tensor, slice_spec, var=None): shrink_axis_mask=shrink_axis_mask, new_axis_mask=new_axis_mask, ellipsis_mask=ellipsis_mask, - var=var, - name=name) + var=var) # pylint: disable=undefined-variable,protected-access,redefined-outer-name @@ -1194,7 +1193,7 @@ def strided_slice(input_, if var is None: raise ValueError("Sliced assignment is only supported for variables") else: - if name is None: + if name is None and parent_name: name = parent_name + "_assign" return var._strided_slice_assign( From 6f7765b1bdaa3ed34958585311e69cfc53137405 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Wed, 27 May 2020 12:47:20 -0700 Subject: [PATCH 1228/1533] Reduce Layer.__call__ overhead by 5-10% Only retrieve mask arg if Layer.call or Layer.compute_mask need it. Skips checking for implicit masks entirely otherwise. PiperOrigin-RevId: 313444769 Change-Id: Ife930d4c299dce6463836e0e238d236b7582b2ee --- tensorflow/python/keras/engine/base_layer.py | 48 ++++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index b986f9a405e..4a43b0526f6 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -850,11 +850,11 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # setting the `_keras_mask` attribute on the inputs to a Layer. Masks passed # explicitly take priority. mask_arg_passed_by_framework = False - input_masks = self._collect_input_masks(inputs, input_list, args, kwargs) - if (self._expects_mask_arg and input_masks is not None and - not self._call_arg_was_passed('mask', args, kwargs)): - mask_arg_passed_by_framework = True + input_masks, mask_is_implicit = self._get_input_masks( + inputs, input_list, args, kwargs) + if self._expects_mask_arg and mask_is_implicit: kwargs['mask'] = input_masks + mask_arg_passed_by_framework = True # If `training` argument is None or not explicitly passed, # propagate `training` value from this layer's calling layer. 
@@ -2312,20 +2312,26 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # Do not track masks for `TensorFlowOpLayer` construction. output._keras_mask._keras_history_checked = True - def _collect_input_masks(self, inputs, input_list, args, kwargs): - """Checks if `mask` argument was passed, else gathers mask from inputs.""" - if self._call_arg_was_passed('mask', args, kwargs): - return self._get_call_arg_value('mask', args, kwargs) - - if not self._should_compute_mask: - return None - - input_masks = [getattr(t, '_keras_mask', None) for t in input_list] - if all(mask is None for mask in input_masks): - return None - - # Only do expensive `nest` operation when masking is actually being used. - return nest.pack_sequence_as(inputs, input_masks) + def _get_input_masks(self, inputs, input_list, args, kwargs): + if (not self._expects_mask_arg and not self.supports_masking and + not self._compute_mask_overridden): + # Input masks only need to be retrieved if they are needed for `call` + # or `compute_mask`. + input_masks = None + implicit_mask = False + elif self._call_arg_was_passed('mask', args, kwargs): + input_masks = self._get_call_arg_value('mask', args, kwargs) + implicit_mask = False + else: + input_masks = [getattr(t, '_keras_mask', None) for t in input_list] + if all(mask is None for mask in input_masks): + input_masks = None + implicit_mask = False + else: + # Only do expensive `nest` op when masking is actually being used. + input_masks = nest.pack_sequence_as(inputs, input_masks) + implicit_mask = True + return input_masks, implicit_mask def _call_arg_was_passed(self, arg_name, args, kwargs, inputs_in_args=False): # Performance optimization: do no work in most common case. @@ -2751,12 +2757,6 @@ class Layer(module.Module, version_utils.LayerVersionSelector): def _call_accepts_kwargs(self): return self._call_full_argspec.varkw is not None - @property - @tracking.cached_per_instance - def _should_compute_mask(self): - return ('mask' in self._call_fn_args or - getattr(self, 'compute_mask', None) is not None) - @property def _eager_losses(self): # A list of loss values containing activity regularizers and losses From 426869b50f88fb3efccaa3553ec86ba1eedb6aa6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 12:53:50 -0700 Subject: [PATCH 1229/1533] patch cl/312773551 PR #38585: Fix invalid shape issue in random.uniform Imported from GitHub PR https://github.com/tensorflow/tensorflow/pull/38585 Note: This PR is a resubmission from #34399 This PR tries to address the issue raised in #34363 where invalid shape passed to minval/maxval (expected to be 0-D) does not raise an error. The issue was that in most of the scenarios the shape was checked inside the C++ kernel ops. However, in one condition math_ops.add was used which will implicitly do broadcast when necessarily. This results in maxval/minval's shape getting carried. This PR adds the shape check before math_ops.add, to make sure the shape is guaranteed. This PR fixes #34363. Signed-off-by: Yong Tang yong.tang.github@outlook.com Copybara import of the project: -- 1c480a2175ed7d8a86210882bfbb0ed45f0730d6 by Yong Tang : Fix invalid shape issue in random.uniform This PR tries to address the issue raised in 34363 where invalid shape passed to minval/maxval (expected to be 0-D) does not raise an error. The issue was that in most of the scenarios the shape was checked inside the C++ kernel ops. However, in one condition math_ops.add was used which will implicitly do broadcast when necessarily. 
This results in maxval/minval's shape getting carried. This PR adds the shape check before math_ops.add, to make sure the shape is guaranteed. This PR fixes 34363. Signed-off-by: Yong Tang -- 81dca0016c1efbfc99d7f22e2ac6d26e0c5099b5 by Yong Tang : Add test case for invalid shape issue in random.uniform Signed-off-by: Yong Tang -- be3dee4337f45883326536bc2fad7539cd1a2244 by Yong Tang : Use explicit broadcast_to to prevent shape overflow Signed-off-by: Yong Tang RELNOTES=n/a PiperOrigin-RevId: 313446121 Change-Id: I34b076d79c13a7db040bf46aa5b2f2b43075c55f --- tensorflow/python/kernel_tests/random/random_ops_test.py | 8 ++++++++ tensorflow/python/ops/random_ops.py | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py index 73c8bd09db0..1d3fdec3179 100644 --- a/tensorflow/python/kernel_tests/random/random_ops_test.py +++ b/tensorflow/python/kernel_tests/random/random_ops_test.py @@ -23,6 +23,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.eager import context from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.framework import test_util @@ -415,6 +416,13 @@ class RandomUniformTest(RandomOpTestCommon): use_gpu=use_gpu, graph_seed=965) + def testUniformWithInvalidMaxMindShape(self): + # Test case for GitHub issue 34363. + with self.assertRaises( + (errors.InvalidArgumentError, errors.UnknownError, ValueError)): + array = array_ops.zeros(shape=(1,)) + random_ops.random_uniform(shape=(), minval=array) + class RandomShapeTest(test.TestCase): diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py index 1af91ed0dd3..30d8e45b2c4 100644 --- a/tensorflow/python/ops/random_ops.py +++ b/tensorflow/python/ops/random_ops.py @@ -304,6 +304,12 @@ def random_uniform(shape, if not maxval_is_one: result = math_ops.multiply(result, maxval) else: + # Use explicit "broadcast_to" so that any shape incompatibility + # are returned with InvalidArgument error. + # This prevent "slient broadcast" that may cause the shape of + # result "overflow" when minval or maxval is larger than expected shape + maxval = array_ops.broadcast_to(maxval, shape) + minval = array_ops.broadcast_to(minval, shape) result = math_ops.add(result * (maxval - minval), minval, name=name) # TODO(b/132092188): C++ shape inference inside functional ops does not # cross FuncGraph boundaries since that information is only available in From 49a5378fb1f9154ea499007e35c23ab8d4605753 Mon Sep 17 00:00:00 2001 From: RJ Skerry-Ryan Date: Wed, 27 May 2020 12:58:04 -0700 Subject: [PATCH 1230/1533] Expand composites in tf.test.TestCase.evaluate. This is helpful to allow CompositeTensors to be used with tf.test.TestCase.evaluate. 
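The mechanism relied on below is `nest.map_structure` with `expand_composites=True`: the composite is flattened into its component tensors, each component is converted with `.numpy()`, and the structure is packed back. A small sketch of that behavior (illustrative only; it assumes eager TF2 and imports the same internal `nest` module the diff uses):

import tensorflow as tf
from tensorflow.python.util import nest

rt = tf.ragged.constant([[1, 2], [3]])
# Flatten the RaggedTensor into its component tensors, convert each component
# with .numpy(), then repack the composite structure.
value = nest.map_structure(lambda t: t.numpy(), rt, expand_composites=True)
print(type(value), value)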
PiperOrigin-RevId: 313446907 Change-Id: I74ab70647b674689f36ea5738b57d44d02ec9685 --- tensorflow/python/framework/test_util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 4981e1b68fd..4451d90a490 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -2128,7 +2128,9 @@ class TensorFlowTestCase(googletest.TestCase): values=tensor.values.numpy(), indices=tensor.indices.numpy(), dense_shape=tensor.dense_shape.numpy()) - return tensor.numpy() + # Convert tensors and composite tensors to numpy arrays. + return nest.map_structure(lambda t: t.numpy(), tensor, + expand_composites=True) except AttributeError as e: six.raise_from(ValueError("Unsupported type %s." % type(tensor)), e) From f6bf10607fc0bd00e94704e1ae20f06f34b81df3 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Wed, 27 May 2020 13:33:45 -0700 Subject: [PATCH 1231/1533] [tf.data] Fix a bug in prefetch dataset serialization logic. PiperOrigin-RevId: 313453820 Change-Id: I573d4288fbb10b7491778ce4edf24241f5e35fa1 --- tensorflow/core/kernels/data/prefetch_dataset_op.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc index 0c3fe43d6c3..0230bcd146d 100644 --- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc +++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc @@ -100,9 +100,13 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size)); AttrValue slack_period_attr; b->BuildAttrValue(slack_period_, &slack_period_attr); - TF_RETURN_IF_ERROR(b->AddDataset( - this, {input_graph_node, buffer_size}, - {std::make_pair(kSlackPeriod, slack_period_attr)}, output)); + AttrValue legacy_autotune_attr; + b->BuildAttrValue(legacy_autotune_, &legacy_autotune_attr); + TF_RETURN_IF_ERROR( + b->AddDataset(this, {input_graph_node, buffer_size}, + {std::make_pair(kSlackPeriod, slack_period_attr), + std::make_pair(kLegacyAutotune, legacy_autotune_attr)}, + output)); return Status::OK(); } From f44bb87843cababb97401e93ae9d91e6a296c293 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Wed, 27 May 2020 13:38:35 -0700 Subject: [PATCH 1232/1533] Added new argument types to Arguments. New types - buffer and image2d. Added interface for generic gpu object. 
PiperOrigin-RevId: 313454794 Change-Id: Iba0a1a6e90237f50af2576cdf3622b2a170cf854 --- tensorflow/lite/delegates/gpu/cl/BUILD | 12 ++ tensorflow/lite/delegates/gpu/cl/arguments.cc | 132 +++++++++++++++++- tensorflow/lite/delegates/gpu/cl/arguments.h | 27 +++- tensorflow/lite/delegates/gpu/cl/gpu_object.h | 121 ++++++++++++++++ .../delegates/gpu/cl/kernels/transpose.cc | 7 +- 5 files changed, 290 insertions(+), 9 deletions(-) create mode 100644 tensorflow/lite/delegates/gpu/cl/gpu_object.h diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index c149479ae4c..95b20bc6e81 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -43,8 +43,10 @@ cc_library( srcs = ["arguments.cc"], hdrs = ["arguments.h"], deps = [ + ":gpu_object", ":opencl_wrapper", ":util", + "//tensorflow/lite/delegates/gpu/common:access_type", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:types", "//tensorflow/lite/delegates/gpu/common:util", @@ -305,6 +307,16 @@ cc_library( ], ) +cc_library( + name = "gpu_object", + hdrs = ["gpu_object.h"], + deps = [ + ":opencl_wrapper", + "//tensorflow/lite/delegates/gpu/common:data_type", + "//tensorflow/lite/delegates/gpu/common:status", + ], +) + cc_library( name = "inference_context", srcs = ["inference_context.cc"], diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc index 26d9fc778b3..bdfae935f28 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -23,10 +23,14 @@ namespace tflite { namespace gpu { namespace cl { namespace { +bool IsWordSymbol(char symbol) { + return absl::ascii_isalnum(symbol) || symbol == '_'; +} + std::string GetNextWord(const std::string& code, size_t first_position) { size_t pos = first_position; char t = code[pos]; - while (absl::ascii_isalnum(t) || t == '_') { + while (IsWordSymbol(t)) { pos++; t = code[pos]; } @@ -38,13 +42,19 @@ Arguments::Arguments(Arguments&& args) : int_values_(std::move(args.int_values_)), shared_int4s_data_(std::move(args.shared_int4s_data_)), float_values_(std::move(args.float_values_)), - shared_float4s_data_(std::move(args.shared_float4s_data_)) {} + shared_float4s_data_(std::move(args.shared_float4s_data_)), + buffers_(std::move(args.buffers_)), + images2d_(std::move(args.images2d_)), + objects_(std::move(args.objects_)) {} Arguments& Arguments::operator=(Arguments&& args) { if (this != &args) { int_values_ = std::move(args.int_values_); shared_int4s_data_ = std::move(args.shared_int4s_data_); float_values_ = std::move(args.float_values_); shared_float4s_data_ = std::move(args.shared_float4s_data_); + buffers_ = std::move(args.buffers_); + images2d_ = std::move(args.images2d_); + objects_ = std::move(args.objects_); } return *this; } @@ -55,11 +65,40 @@ void Arguments::AddFloat(const std::string& name, float value) { void Arguments::AddInt(const std::string& name, int value) { int_values_[name].value = value; } +void Arguments::AddBuffer(const std::string& name, + const GPUBufferDescriptor& desc) { + buffers_[name] = desc; +} +void Arguments::AddImage2D(const std::string& name, + const GPUImage2DDescriptor& desc) { + images2d_[name] = desc; +} + +void Arguments::AddObject(const std::string& name, GPUObjectPtr&& object) { + objects_[name] = {AccessType::READ, std::move(object)}; +} + +void Arguments::AddGPUResources(const std::string& name, + const GPUResources& resources) { + for (const auto& 
r : resources.ints) { + AddInt(absl::StrCat(name, "_", r)); + } + for (const auto& r : resources.floats) { + AddFloat(absl::StrCat(name, "_", r)); + } + for (const auto& r : resources.buffers) { + AddBuffer(absl::StrCat(name, "_", r.first), r.second); + } + for (const auto& r : resources.images2d) { + AddImage2D(absl::StrCat(name, "_", r.first), r.second); + } +} absl::Status Arguments::SetInt(const std::string& name, int value) { auto ii = int_values_.find(name); if (ii == int_values_.end()) { - return absl::NotFoundError(absl::StrCat("No argument with name - ", name)); + return absl::NotFoundError( + absl::StrCat("No int argument with name - ", name)); } ii->second.value = value; if (ii->second.active) { @@ -71,7 +110,8 @@ absl::Status Arguments::SetInt(const std::string& name, int value) { absl::Status Arguments::SetFloat(const std::string& name, float value) { auto fi = float_values_.find(name); if (fi == float_values_.end()) { - return absl::NotFoundError(absl::StrCat("No argument with name - ", name)); + return absl::NotFoundError( + absl::StrCat("No float argument with name - ", name)); } fi->second.value = value; if (fi->second.active) { @@ -80,8 +120,60 @@ absl::Status Arguments::SetFloat(const std::string& name, float value) { return absl::OkStatus(); } +absl::Status Arguments::SetImage2D(const std::string& name, cl_mem memory) { + auto ti = images2d_.find(name); + if (ti == images2d_.end()) { + return absl::NotFoundError( + absl::StrCat("No image2D argument with name - ", name)); + } + ti->second.memory = memory; + return absl::OkStatus(); +} + +absl::Status Arguments::SetBuffer(const std::string& name, cl_mem memory) { + auto it = buffers_.find(name); + if (it == buffers_.end()) { + return absl::NotFoundError( + absl::StrCat("No buffer argument with name - ", name)); + } + it->second.memory = memory; + return absl::OkStatus(); +} + +absl::Status Arguments::SetGPUResources( + const std::string& name, const GPUResourcesWithValue& resources) { + for (const auto& r : resources.ints) { + RETURN_IF_ERROR(SetInt(absl::StrCat(name, "_", r.first), r.second)); + } + for (const auto& r : resources.floats) { + RETURN_IF_ERROR(SetFloat(absl::StrCat(name, "_", r.first), r.second)); + } + for (const auto& r : resources.buffers) { + RETURN_IF_ERROR(SetBuffer(absl::StrCat(name, "_", r.first), r.second)); + } + for (const auto& r : resources.images2d) { + RETURN_IF_ERROR(SetImage2D(absl::StrCat(name, "_", r.first), r.second)); + } + return absl::OkStatus(); +} + +absl::Status Arguments::TransformToCLCode(std::string* code) { + RETURN_IF_ERROR(AddObjectArgs()); + ResolveArgsPass(code); + return absl::OkStatus(); +} + std::string Arguments::GetListOfArgs() { std::string result; + for (auto& t : buffers_) { + const std::string type_name = + t.second.data_type == DataType::FLOAT32 ? 
"float" : "half"; + absl::StrAppend(&result, ",\n __global ", type_name, t.second.element_size, + "* ", t.first); + } + for (auto& t : images2d_) { + absl::StrAppend(&result, ",\n __read_only image2d_t ", t.first); + } for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) { absl::StrAppend(&result, ",\n int4 shared_int4_", i); } @@ -92,6 +184,26 @@ std::string Arguments::GetListOfArgs() { } absl::Status Arguments::Bind(cl_kernel kernel, int offset) { + for (auto& t : buffers_) { + const int error_code = + clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); + if (error_code != CL_SUCCESS) { + return absl::UnknownError(absl::StrCat( + "Failed to set kernel arguments - ", CLErrorCodeToString(error_code), + "(at index - ", offset, ")")); + } + offset++; + } + for (auto& t : images2d_) { + const int error_code = + clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); + if (error_code != CL_SUCCESS) { + return absl::UnknownError(absl::StrCat( + "Failed to set kernel arguments - ", CLErrorCodeToString(error_code), + "(at index - ", offset, ")")); + } + offset++; + } for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) { const int error_code = clSetKernelArg(kernel, offset, sizeof(int32_t) * 4, &shared_int4s_data_[i * 4]); @@ -148,8 +260,8 @@ std::string Arguments::AddActiveArgument(const std::string& arg_name) { } void Arguments::ResolveArgsPass(std::string* code) { - std::string result; constexpr char kPrefix[] = "args."; + std::string result; size_t position = 0; size_t next_position = code->find(kPrefix); while (next_position != std::string::npos) { @@ -168,6 +280,16 @@ void Arguments::ResolveArgsPass(std::string* code) { shared_float4s_data_.resize(shared_float4s_aligned_size); } +absl::Status Arguments::AddObjectArgs() { + for (auto& t : objects_) { + AddGPUResources(t.first, + t.second.obj_ptr->GetGPUDescriptor()->GetGPUResources()); + RETURN_IF_ERROR( + SetGPUResources(t.first, t.second.obj_ptr->GetGPUResources())); + } + return absl::OkStatus(); +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.h b/tensorflow/lite/delegates/gpu/cl/arguments.h index 274532d0199..f1059e77c93 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.h +++ b/tensorflow/lite/delegates/gpu/cl/arguments.h @@ -20,8 +20,10 @@ limitations under the License. 
#include #include +#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" #include "tensorflow/lite/delegates/gpu/cl/util.h" +#include "tensorflow/lite/delegates/gpu/common/access_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/common/types.h" #include "tensorflow/lite/delegates/gpu/common/util.h" @@ -35,15 +37,21 @@ class Arguments { Arguments() = default; void AddFloat(const std::string& name, float value = 0.0f); void AddInt(const std::string& name, int value = 0); + void AddBuffer(const std::string& name, const GPUBufferDescriptor& desc); + void AddImage2D(const std::string& name, const GPUImage2DDescriptor& desc); + + void AddObject(const std::string& name, GPUObjectPtr&& object); absl::Status SetInt(const std::string& name, int value); absl::Status SetFloat(const std::string& name, float value); + absl::Status SetImage2D(const std::string& name, cl_mem memory); + absl::Status SetBuffer(const std::string& name, cl_mem memory); std::string GetListOfArgs(); absl::Status Bind(cl_kernel kernel, int offset); - void ResolveArgsPass(std::string* code); + absl::Status TransformToCLCode(std::string* code); // Move only Arguments(Arguments&& args); @@ -53,6 +61,14 @@ class Arguments { private: std::string AddActiveArgument(const std::string& arg_name); + void AddGPUResources(const std::string& name, const GPUResources& resources); + + absl::Status SetGPUResources(const std::string& name, + const GPUResourcesWithValue& resources); + + absl::Status AddObjectArgs(); + + void ResolveArgsPass(std::string* code); struct IntValue { int value; @@ -79,6 +95,15 @@ class Arguments { }; std::map float_values_; std::vector shared_float4s_data_; + + std::map buffers_; + std::map images2d_; + + struct ObjectArg { + AccessType access_type; + GPUObjectPtr obj_ptr; + }; + std::map objects_; }; } // namespace cl diff --git a/tensorflow/lite/delegates/gpu/cl/gpu_object.h b/tensorflow/lite/delegates/gpu/cl/gpu_object.h new file mode 100644 index 00000000000..5cc045c6fc7 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/gpu_object.h @@ -0,0 +1,121 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_OBJECT_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_OBJECT_H_ + +#include +#include +#include +#include + +#include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" + +namespace tflite { +namespace gpu { +namespace cl { + +struct GPUImage2DDescriptor { + DataType data_type; + cl_mem memory; +}; + +struct GPUBufferDescriptor { + DataType data_type; + int element_size; + cl_mem memory; +}; + +struct GPUResources { + std::vector ints; + std::vector floats; + std::vector> buffers; + std::vector> images2d; + + std::vector GetNames() const { + std::vector names = ints; + names.insert(names.end(), floats.begin(), floats.end()); + for (const auto& obj : buffers) { + names.push_back(obj.first); + } + for (const auto& obj : images2d) { + names.push_back(obj.first); + } + return names; + } +}; + +struct GPUResourcesWithValue { + std::vector> ints; + std::vector> floats; + std::vector> buffers; + std::vector> images2d; +}; + +class GPUObjectDescriptor { + public: + GPUObjectDescriptor() = default; + GPUObjectDescriptor(const GPUObjectDescriptor& obj_desc) + : state_vars_(obj_desc.state_vars_) {} + GPUObjectDescriptor& operator=(const GPUObjectDescriptor& obj_desc) { + if (this != &obj_desc) { + state_vars_ = obj_desc.state_vars_; + } + return *this; + } + virtual ~GPUObjectDescriptor() = default; + + void SetStateVar(const std::string& key, const std::string& value) const { + state_vars_[key] = value; + } + + virtual std::string PerformConstExpr(const std::string& const_expr) const { + return ""; + } + + virtual absl::Status PerformSelector(const std::string& selector, + const std::vector& args, + std::string* result) const { + *result = ""; + return absl::OkStatus(); + } + virtual GPUResources GetGPUResources() const { return GPUResources(); } + + protected: + mutable std::map state_vars_; +}; + +class GPUObject { + public: + GPUObject() = default; + // Move only + GPUObject(GPUObject&& obj_desc) = default; + GPUObject& operator=(GPUObject&& obj_desc) = default; + GPUObject(const GPUObject&) = delete; + GPUObject& operator=(const GPUObject&) = delete; + virtual ~GPUObject() = default; + virtual const GPUObjectDescriptor* GetGPUDescriptor() const = 0; + virtual GPUResourcesWithValue GetGPUResources() const = 0; +}; + +using GPUObjectPtr = std::unique_ptr; + +} // namespace cl +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_GPU_OBJECT_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc index fc3efe32c3b..7a1d454b571 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc @@ -115,8 +115,7 @@ std::string GetTransposeCode( c += PostProcess(linked_operations, context); c += " " + dst_tensor.WriteWHSB("result", "X", "Y", "Z", batch_id); c += "}\n"; - args->ResolveArgsPass(&c); - return absl::Substitute(c, args->GetListOfArgs()); + return c; } } // namespace @@ -139,8 +138,10 @@ Transpose& Transpose::operator=(Transpose&& operation) { } absl::Status Transpose::Compile(const CreationContext& creation_context) { - const auto code = + std::string code = GetTransposeCode(definition_, attr_, linked_operations_, &args_); + RETURN_IF_ERROR(args_.TransformToCLCode(&code)); + code = 
absl::Substitute(code, args_.GetListOfArgs()); return creation_context.cache->GetOrCreateCLKernel( code, "main_function", *creation_context.context, *creation_context.device, &kernel_); From 45800a6ac32cd928c7263479362152c28fea2a45 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 27 May 2020 13:55:55 -0700 Subject: [PATCH 1233/1533] Update cpuinfo dependency Pull commits for improved NEON DotProduct detection PiperOrigin-RevId: 313458099 Change-Id: If116c7fba6a9b9f9e1e75e29fe1dd3330c80bf1c --- third_party/cpuinfo/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/third_party/cpuinfo/workspace.bzl b/third_party/cpuinfo/workspace.bzl index e7aff433892..ed5b8aea41a 100644 --- a/third_party/cpuinfo/workspace.bzl +++ b/third_party/cpuinfo/workspace.bzl @@ -5,11 +5,11 @@ load("//third_party:repo.bzl", "third_party_http_archive") def repo(): third_party_http_archive( name = "cpuinfo", - strip_prefix = "cpuinfo-19b9316c71e4e45b170a664bf62ddefd7ac9feb5", - sha256 = "e0a485c072de957668eb324c49d726dc0fd736cfb9436b334325f20d93085003", + strip_prefix = "cpuinfo-5cefcd6293e6881754c2c53f99e95b159d2d8aa5", + sha256 = "8ea076bcc4ff73cdff520ece01b776d2a778ced60956f5eb88697a78e22c389d", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/pytorch/cpuinfo/archive/19b9316c71e4e45b170a664bf62ddefd7ac9feb5.zip", - "https://github.com/pytorch/cpuinfo/archive/19b9316c71e4e45b170a664bf62ddefd7ac9feb5.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/pytorch/cpuinfo/archive/5cefcd6293e6881754c2c53f99e95b159d2d8aa5.zip", + "https://github.com/pytorch/cpuinfo/archive/5cefcd6293e6881754c2c53f99e95b159d2d8aa5.zip", ], build_file = "//third_party/cpuinfo:BUILD.bazel", ) From 4bf3e4f2219cd1f1a56034862015477648993f56 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 14:08:04 -0700 Subject: [PATCH 1234/1533] Enable linear algebra gradient tests in eager mode. 
PiperOrigin-RevId: 313460478 Change-Id: Ibee41e0883e962eca09c9ebd011fade49f21c029 --- .../python/kernel_tests/linalg_grad_test.py | 94 ++++++++----------- 1 file changed, 41 insertions(+), 53 deletions(-) diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py index 3e0676b0746..36e58bee829 100644 --- a/tensorflow/python/kernel_tests/linalg_grad_test.py +++ b/tensorflow/python/kernel_tests/linalg_grad_test.py @@ -23,7 +23,7 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import gradient_checker_v2 from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops @@ -61,35 +61,32 @@ class MatrixUnaryFunctorGradientTest(test_lib.TestCase): def _GetMatrixUnaryFunctorGradientTest(functor_, dtype_, shape_, **kwargs_): - @test_util.run_v1_only('b/120545219') + @test_util.enable_control_flow_v2 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def Test(self): - with self.session(use_gpu=True): + + def RandomInput(): np.random.seed(1) - a_np = np.random.uniform( + return np.random.uniform( low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) - a = constant_op.constant(a_np) - if functor_.__name__ == 'matrix_square_root': - # Square the input matrix to ensure that its matrix square root exists - a = math_ops.matmul(a, a) - a_np = self.evaluate(a) - b = functor_(a, **kwargs_) - # Optimal stepsize for central difference is O(epsilon^{1/3}). - epsilon = np.finfo(dtype_).eps - delta = epsilon**(1.0 / 3.0) - # tolerance obtained by looking at actual differences using - # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build - tol = 1e-6 if dtype_ == np.float64 else 0.05 + if functor_.__name__ == 'matrix_square_root': + # Square the input matrix to ensure that its matrix square root exists + f = lambda x: functor_(math_ops.matmul(x, x), **kwargs_) + else: + f = functor_ - theoretical, numerical = gradient_checker.compute_gradient( - a, - a.get_shape().as_list(), - b, - b.get_shape().as_list(), - x_init_value=a_np, - delta=delta) - self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) + # Optimal stepsize for central difference is O(epsilon^{1/3}). + epsilon = np.finfo(dtype_).eps + delta = epsilon**(1.0 / 3.0) + # tolerance obtained by looking at actual differences using + # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build + tol = 1e-6 if dtype_ == np.float64 else 0.05 + + theoretical, numerical = gradient_checker_v2.compute_gradient( + f, [RandomInput()], delta=delta) + self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) return Test @@ -104,42 +101,33 @@ def _GetMatrixBinaryFunctorGradientTest(functor_, float32_tol_fudge=1.0, **kwargs_): - @test_util.run_v1_only('b/120545219') + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def Test(self): - # TODO(rmlarsen): Debug illegal address bug on CUDA and re-enable - # GPU test for matrix_solve. 
- use_gpu = False if functor_ == linalg_ops.matrix_solve else True - with self.session(use_gpu=use_gpu): + def RandomInput(): np.random.seed(1) - a_np = np.random.uniform( + return np.random.uniform( low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) - a = constant_op.constant(a_np) - b_np = np.random.uniform( - low=-1.0, high=1.0, - size=np.prod(shape_)).reshape(shape_).astype(dtype_) - b = constant_op.constant(b_np) - c = functor_(a, b, **kwargs_) + fixed = RandomInput() - # Optimal stepsize for central difference is O(epsilon^{1/3}). - epsilon = np.finfo(dtype_).eps - delta = epsilon**(1.0 / 3.0) - # tolerance obtained by looking at actual differences using - # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build - tol = 1e-6 if dtype_ == np.float64 else float32_tol_fudge * 0.05 - # The gradients for a and b may be of very different magnitudes, - # so to not get spurious failures we test them separately. - for factor, factor_init in [a, a_np], [b, b_np]: - theoretical, numerical = gradient_checker.compute_gradient( - factor, - factor.get_shape().as_list(), - c, - c.get_shape().as_list(), - x_init_value=factor_init, - delta=delta) - self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) + # Optimal stepsize for central difference is O(epsilon^{1/3}). + epsilon = np.finfo(dtype_).eps + delta = epsilon**(1.0 / 3.0) + # tolerance obtained by looking at actual differences using + # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build + tol = 1e-6 if dtype_ == np.float64 else float32_tol_fudge * 0.05 + + # check gradient w.r.t. left argument. + theoretical, numerical = gradient_checker_v2.compute_gradient( + lambda x: functor_(x, fixed, **kwargs_), [RandomInput()], delta=delta) + self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) + + # check gradient w.r.t. right argument. 
+ theoretical, numerical = gradient_checker_v2.compute_gradient( + lambda y: functor_(fixed, y, **kwargs_), [RandomInput()], delta=delta) + self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol) return Test From f9786be2080f2bf0d6563b3335200a010df4fc17 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Wed, 27 May 2020 14:08:37 -0700 Subject: [PATCH 1235/1533] Add eager microbenchmarks for Layer.__call__, name_scope, nest.map_structure, and nest.pack_sequence_as PiperOrigin-RevId: 313460594 Change-Id: Ib4aaa659f18631a4dea623a5833320bf5f4aa482 --- tensorflow/python/eager/benchmarks_test.py | 43 ++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 223b62ededa..b6a504eb291 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -52,6 +52,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util +from tensorflow.python.keras.engine import base_layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops @@ -61,6 +62,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect @@ -1356,6 +1358,47 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): def benchmarkTenResourceReadsInCondInInnerFunc(self): self._benchmarkResourceReadsInCondInInnerFunc(10) + def benchmark_tf_name_scope(self): + + def fn(): + with ops.name_scope_v2("name"): + pass + + self._run(fn, 10000) + + def benchmark_tf_nest_map_structure(self): + nested = {"a": [1, 2, 3], "b": (4, 5, 6)} + + def fn(): + nest.map_structure(lambda x: x, nested) + + self._run(fn, 10000) + + def benchmark_tf_nest_pack_sequence_as(self): + nested = {"a": [1, 2, 3], "b": (4, 5, 6)} + flat = nest.flatten(nested) + + def fn(): + nest.pack_sequence_as(nested, flat) + + self._run(fn, 10000) + + # TODO(b/157587712): Move to keras when benchmarks are setup. + def benchmark_tf_keras_layer_call(self): + + class OnlyOverheadLayer(base_layer.Layer): + + def call(self, x): + return x + + layer = OnlyOverheadLayer() + x = ops.convert_to_tensor([[1.]]) + + def fn(): + layer(x) + + self._run(fn, 10000) + if __name__ == "__main__": test.main() From df1131c395c638b878d75f9ff9a588bd75eab5e4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 14:09:52 -0700 Subject: [PATCH 1236/1533] Add BackupAndRestore callback to help model back up periodically and recover from unexpected failure, e.g. preemption. Remove backup and restore function from ModelCheckpointCallback in multi-worker mode. 
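A hedged usage sketch of the resulting split of responsibilities (the paths below are placeholders, not taken from this patch): `BackupAndRestore` now owns the temporary fault-tolerance backups, while `ModelCheckpoint` only writes the checkpoints the user asks for, so the two callbacks are combined explicitly:

import numpy as np
import tensorflow as tf

model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
model.compile(optimizer='sgd', loss='mse')

callbacks = [
    # Per-epoch backups used only to recover from an unexpected failure.
    tf.keras.callbacks.experimental.BackupAndRestore(backup_dir='/tmp/backup'),
    # User-requested checkpoints; no longer responsible for fault tolerance.
    tf.keras.callbacks.ModelCheckpoint(filepath='/tmp/ckpt/weights.{epoch:02d}'),
]
model.fit(np.arange(100).reshape(5, 20), np.zeros(5), epochs=3,
          batch_size=1, callbacks=callbacks, verbose=0)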
PiperOrigin-RevId: 313460848 Change-Id: I7dab1b8cec7ee694a32496eb45596662221c4d05 --- tensorflow/python/keras/BUILD | 2 +- tensorflow/python/keras/callbacks.py | 196 +++++++--- tensorflow/python/keras/distribute/BUILD | 35 +- .../multi_worker_callback_tf2_test.py | 80 +++- .../multi_worker_fault_tolerance_test.py | 346 ------------------ .../distribute/multi_worker_training_state.py | 233 ------------ .../keras/distribute/worker_training_state.py | 153 ++++++++ ..._test.py => worker_training_state_test.py} | 19 +- .../tools/api/generator/api_init_files.bzl | 1 + .../tools/api/generator/api_init_files_v1.bzl | 1 + ...cks.experimental.-backup-and-restore.pbtxt | 82 +++++ ...sorflow.keras.callbacks.experimental.pbtxt | 7 + .../v2/tensorflow.keras.callbacks.pbtxt | 4 + 13 files changed, 477 insertions(+), 682 deletions(-) delete mode 100644 tensorflow/python/keras/distribute/multi_worker_fault_tolerance_test.py delete mode 100644 tensorflow/python/keras/distribute/multi_worker_training_state.py create mode 100644 tensorflow/python/keras/distribute/worker_training_state.py rename tensorflow/python/keras/distribute/{multi_worker_training_state_test.py => worker_training_state_test.py} (78%) create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.experimental.-backup-and-restore.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.experimental.pbtxt diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index 78e360c8354..67fa0c18ebd 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -140,7 +140,7 @@ py_library( deps = [ ":backend", "//tensorflow/python/distribute:distributed_file_utils", - "//tensorflow/python/keras/distribute:multi_worker_training_state", + "//tensorflow/python/keras/distribute:worker_training_state", "//tensorflow/python/keras/protobuf:projector_config_proto_py", "//tensorflow/python/keras/utils:engine_utils", "//tensorflow/python/keras/utils:mode_keys", diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index db326ea32f0..e00559e7bc1 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -33,11 +33,14 @@ import numpy as np import six from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.distribute import collective_all_reduce_strategy +from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distributed_file_utils -from tensorflow.python.distribute import multi_worker_util +from tensorflow.python.distribute import mirrored_strategy +from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.keras import backend as K -from tensorflow.python.keras.distribute import multi_worker_training_state as training_state +from tensorflow.python.keras.distribute import worker_training_state from tensorflow.python.keras.utils import generic_utils from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras.utils import version_utils @@ -1192,51 +1195,19 @@ class ModelCheckpoint(Callback): self.save_weights_only = True def on_train_begin(self, logs=None): - # pylint: disable=protected-access - if self.model._in_multi_worker_mode(): - # MultiWorkerTrainingState is used to manage the training state needed - # for preemption-recovery of a worker in multi-worker training. 
- self.model._training_state = ( - training_state.MultiWorkerTrainingState(self.model, self.filepath)) - self._training_state = self.model._training_state - if self._training_state.restore(): - # If the training state needs to be and is successfully restored, - # it is recovering from a previous failure (or preemption). In such - # case, do not load the weights from user specified file path. - return - - # If this is not multi worker training, restoring is not needed, or - # restoring failed, check if it should load weights on restart. if self.load_weights_on_restart: - if (not self.model._in_multi_worker_mode() or - multi_worker_util.should_load_checkpoint()): - filepath_to_load = ( - self._get_most_recently_modified_file_matching_pattern( - self.filepath)) - if (filepath_to_load is not None and - training_state.checkpoint_exists(filepath_to_load)): - try: - # `filepath` may contain placeholders such as `{epoch:02d}`, and - # thus it attempts to load the most recently modified file with file - # name matching the pattern. - self.model.load_weights(filepath_to_load) - except (IOError, ValueError) as e: - raise ValueError('Error loading file from {}. Reason: {}'.format( - filepath_to_load, e)) - - def on_train_end(self, logs=None): - # pylint: disable=protected-access - if self.model._in_multi_worker_mode(): - if self.model.stop_training or getattr( - self.model, '_successful_loop_finish', False): - # In multi-worker training, on successful exit of training, delete the - # training state backup file that was saved for the purpose of worker - # recovery. - self._training_state.delete_backup() - # Restore the training state so the model is ready for next (possible) - # multi worker training. - del self._training_state - self.model._training_state = None + filepath_to_load = ( + self._get_most_recently_modified_file_matching_pattern(self.filepath)) + if (filepath_to_load is not None and + self._checkpoint_exists(filepath_to_load)): + try: + # `filepath` may contain placeholders such as `{epoch:02d}`, and + # thus it attempts to load the most recently modified file with file + # name matching the pattern. + self.model.load_weights(filepath_to_load) + except (IOError, ValueError) as e: + raise ValueError('Error loading file from {}. Reason: {}'.format( + filepath_to_load, e)) def on_train_batch_end(self, batch, logs=None): if self._should_save_on_batch(batch): @@ -1249,17 +1220,7 @@ class ModelCheckpoint(Callback): self.epochs_since_last_save += 1 # pylint: disable=protected-access if self.save_freq == 'epoch': - if self.model._in_multi_worker_mode(): - # Exclude training state variables in user-requested checkpoint file. - with self._training_state.untrack_vars(): - self._save_model(epoch=epoch, logs=logs) - else: - self._save_model(epoch=epoch, logs=logs) - if self.model._in_multi_worker_mode(): - # For multi-worker training, back up the weights and current training - # state for possible future recovery. - # TODO(rchao): Call `back_up` at finer period such as N steps. 
- self._training_state.back_up(epoch) + self._save_model(epoch=epoch, logs=logs) def _should_save_on_batch(self, batch): """Handles batch-level saving logic, supports steps_per_execution.""" @@ -1353,6 +1314,14 @@ class ModelCheckpoint(Callback): distributed_file_utils.remove_temp_dir_with_filepath( self._write_filepath, self.model.distribute_strategy) + def _checkpoint_exists(self, filepath): + """Returns whether the checkpoint `filepath` refers to exists.""" + if filepath.endswith('.h5'): + return file_io.file_exists(filepath) + tf_saved_model_exists = file_io.file_exists(filepath) + tf_weights_only_checkpoint_exists = file_io.file_exists(filepath + '.index') + return tf_saved_model_exists or tf_weights_only_checkpoint_exists + def _get_most_recently_modified_file_matching_pattern(self, pattern): """Returns the most recently modified filepath matching pattern. @@ -1445,6 +1414,119 @@ class ModelCheckpoint(Callback): return file_path_with_largest_file_name +@keras_export('keras.callbacks.experimental.BackupAndRestore', v1=[]) +class BackupAndRestore(Callback): + """Callback to back up and restore the training state. + + `BackupAndRestore` callback is intended to recover from interruptions that + happened in the middle of a model.fit execution by backing up the + training states in a temporary checkpoint file (based on TF CheckpointManager) + at the end of each epoch. If training restarted before completion, the + training state and model are restored to the most recently saved state at the + beginning of a new model.fit() run. + Note that user is responsible to bring jobs back up. + This callback is important for the backup and restore mechanism for fault + tolerance purpose. And the model to be restored from an previous checkpoint is + expected to be the same as the one used to back up. If user changes arguments + passed to compile or fit, the checkpoint saved for fault tolerance can become + invalid. + + Note: + 1. This callback is not compatible with disabling eager execution. + 2. A checkpoint is saved at the end of each epoch, when restoring we'll redo + any partial work from an unfinished epoch in which the training got restarted + (so the work done before a interruption doesn't affect the final model state). + 3. This works for both single worker and multi-worker mode, only + MirroredStrategy and MultiWorkerMirroredStrategy are supported for now. + + Example: + + >>> class InterruptingCallback(tf.keras.callbacks.Callback): + ... def on_epoch_begin(self, epoch, logs=None): + ... if epoch == 4: + ... raise RuntimeError('Interrupting!') + >>> callback = tf.keras.callbacks.experimental.BackupAndRestore( + ... backup_dir="/tmp") + >>> model = tf.keras.models.Sequential([tf.keras.layers.Dense(10)]) + >>> model.compile(tf.keras.optimizers.SGD(), loss='mse') + >>> try: + ... model.fit(np.arange(100).reshape(5, 20), np.zeros(5), epochs=10, + ... batch_size=1, callbacks=[callback, InterruptingCallback()], + ... verbose=0) + ... except: + ... pass + >>> history = model.fit(np.arange(100).reshape(5, 20), np.zeros(5), epochs=10, + ... batch_size=1, callbacks=[callback], verbose=0) + >>> # Only 6 more epochs are run, since first trainning got interrupted at + >>> # zero-indexed epoch 4, second training will continue from 4 to 9. + >>> len(history.history['loss']) + 6 + + Arguments: + backup_dir: String, path to save the model file. This is the directory in + which the system stores temporary files to recover the model from jobs + terminated unexpectedly. 
The directory cannot be reused elsewhere to + store other checkpoints, e.g. by BackupAndRestore callback of another + training, or by another callback (ModelCheckpoint) of the same training. + """ + + def __init__(self, backup_dir): + super(BackupAndRestore, self).__init__() + self.backup_dir = backup_dir + self._supports_tf_logs = True + self._supported_strategies = ( + distribute_lib._DefaultDistributionStrategy, + mirrored_strategy.MirroredStrategy, + collective_all_reduce_strategy.CollectiveAllReduceStrategy) + + if not context.executing_eagerly(): + if ops.inside_function(): + raise ValueError('This Callback\'s method contains Python state and ' + 'should be called outside of `tf.function`s.') + else: # Legacy graph mode: + raise ValueError( + 'BackupAndRestore only supports eager mode. In graph ' + 'mode, consider using ModelCheckpoint to manually save ' + 'and restore weights with `model.load_weights()` and by ' + 'providing `initial_epoch` in `model.fit()` for fault tolerance.') + + # Only the chief worker writes model checkpoints, but all workers + # restore checkpoint at on_train_begin(). + self._chief_worker_only = False + + def set_model(self, model): + self.model = model + + def on_train_begin(self, logs=None): + # TrainingState is used to manage the training state needed for + # failure-recovery of a worker in training. + # pylint: disable=protected-access + + if not isinstance(self.model.distribute_strategy, + self._supported_strategies): + raise NotImplementedError( + 'Currently only support empty strategy, MirroredStrategy and ' + 'MultiWorkerMirroredStrategy.') + self.model._training_state = ( + worker_training_state.WorkerTrainingState(self.model, self.backup_dir)) + self._training_state = self.model._training_state + self._training_state.restore() + + def on_train_end(self, logs=None): + # pylint: disable=protected-access + # On exit of training, delete the training state backup file that was saved + # for the purpose of worker recovery. + self._training_state.delete_backup() + + # Clean up the training state. + del self._training_state + del self.model._training_state + + def on_epoch_end(self, epoch, logs=None): + # Back up the model and current epoch for possible future recovery. + self._training_state.back_up(epoch) + + @keras_export('keras.callbacks.EarlyStopping') class EarlyStopping(Callback): """Stop training when a monitored metric has stopped improving. 
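For orientation, the new `BackupAndRestore` callback above is meant to be used alongside a regular `ModelCheckpoint`: `ModelCheckpoint` keeps the user-visible checkpoints, while `BackupAndRestore` maintains a separate CheckpointManager-based backup (including the `_ckpt_saved_epoch` counter) purely for fault tolerance. The sketch below shows the intended wiring under `MultiWorkerMirroredStrategy`, mirroring the multi_worker_callback_tf2_test.py change later in this patch; the model, data, epoch count, and the `/tmp/train/...` paths are illustrative placeholders rather than part of the patch, and a real multi-worker run additionally needs a `TF_CONFIG` set on each worker.

import numpy as np
import tensorflow as tf

# Without a TF_CONFIG this strategy falls back to a single local worker, which
# BackupAndRestore also supports; with a TF_CONFIG it coordinates all workers.
strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()

with strategy.scope():
  model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
  model.compile(optimizer='sgd', loss='mse')

x = np.random.random((64, 20)).astype('float32')
y = np.random.random((64, 10)).astype('float32')

callbacks = [
    # User-requested checkpoints; in multi-worker training only the chief
    # worker keeps these on disk.
    tf.keras.callbacks.ModelCheckpoint(filepath='/tmp/train/ckpt'),
    # Fault-tolerance backup; per the docstring, this directory must not be
    # reused for any other checkpoints.
    tf.keras.callbacks.experimental.BackupAndRestore(
        backup_dir='/tmp/train/backup'),
]

# If a worker is preempted and the job is brought back up, re-running this
# same fit() call restores the model and resumes from the epoch recorded in
# the backup; on a successful run the backup is deleted in on_train_end().
model.fit(x, y, epochs=5, batch_size=8, callbacks=callbacks)

Keeping the fault-tolerance state in its own `tf.train.Checkpoint`/`CheckpointManager` pair (see worker_training_state.py below) is also what lets the user's checkpoints stay free of the internal `_ckpt_saved_epoch` variable, so the `untrack_vars()` workaround from the deleted multi_worker_training_state.py is no longer needed.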
diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index 6a39ebc5007..ef5302b45a5 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -44,7 +44,6 @@ py_library( "//tensorflow/python/keras:losses", "//tensorflow/python/keras:optimizers", "//tensorflow/python/keras:regularizers", - "//tensorflow/python/keras/distribute:multi_worker_training_state", "//tensorflow/python/keras/mixed_precision/experimental:autocast_variable", "//tensorflow/python/keras/mixed_precision/experimental:policy", "//tensorflow/python/keras/saving", @@ -56,23 +55,20 @@ py_library( ) py_library( - name = "multi_worker_training_state", + name = "worker_training_state", srcs = [ - "multi_worker_training_state.py", + "worker_training_state.py", ], srcs_version = "PY2AND3", - deps = [ - "//tensorflow/python/distribute:multi_worker_util", - ], ) cuda_py_test( - name = "multi_worker_training_state_test", - srcs = ["multi_worker_training_state_test.py"], + name = "worker_training_state_test", + srcs = ["worker_training_state_test.py"], shard_count = 4, deps = [ ":multi_worker_testing_utils", - ":multi_worker_training_state", + ":worker_training_state", "//tensorflow/python:platform", "//tensorflow/python/distribute:combinations", "//tensorflow/python/distribute:multi_worker_test_base", @@ -375,27 +371,6 @@ py_test( ], ) -cuda_py_test( - name = "multi_worker_fault_tolerance_test", - srcs = ["multi_worker_fault_tolerance_test.py"], - shard_count = 14, - # TODO(b/132384649): Enable once fixed. - tags = [ - "no_oss", - ], - deps = [ - ":distribute", - ":multi_worker_testing_utils", - "//tensorflow/python:platform", - "//tensorflow/python/distribute:collective_all_reduce_strategy", - "//tensorflow/python/distribute:combinations", - "//tensorflow/python/distribute:distribute_config", - "//tensorflow/python/distribute:distribute_coordinator", - "//tensorflow/python/distribute:multi_worker_test_base", - "//tensorflow/python/keras", - ], -) - py_library( name = "multi_worker_testing_utils", srcs = [ diff --git a/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py b/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py index 7ea385e0b04..8daa46f6ea3 100644 --- a/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py +++ b/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py @@ -28,11 +28,19 @@ from tensorflow.python.distribute import multi_process_runner from tensorflow.python.distribute import multi_worker_test_base as test_base from tensorflow.python.keras import callbacks from tensorflow.python.keras.distribute import multi_worker_testing_utils -from tensorflow.python.keras.distribute import multi_worker_training_state as training_state from tensorflow.python.lib.io import file_io from tensorflow.python.platform import test +def checkpoint_exists(filepath): + """Returns whether the checkpoint `filepath` refers to exists.""" + if filepath.endswith('.h5'): + return file_io.file_exists(filepath) + tf_saved_model_exists = file_io.file_exists(filepath) + tf_weights_only_checkpoint_exists = file_io.file_exists(filepath + '.index') + return tf_saved_model_exists or tf_weights_only_checkpoint_exists + + def _model_setup(test_obj, file_format): """Set up a MNIST Keras model for testing purposes. 
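The `checkpoint_exists` helper added to this test (like the equivalent `_checkpoint_exists` method on `ModelCheckpoint` above) has to probe two different on-disk layouts: an `.h5` path is a single HDF5 file, while any other path may be either a whole-model SavedModel at that exact path or a TF-format weights checkpoint, whose on-disk marker is the `.index` file written next to the data shards. A small illustration of the layouts involved, assuming placeholder paths under `/tmp` that are not part of the patch:

import tensorflow as tf

tf.io.gfile.makedirs('/tmp/ckpt_demo')
model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(4,))])
model.compile(optimizer='sgd', loss='mse')

# TF-format weights: writes 'w.index' plus data shards such as
# 'w.data-00000-of-00001', so existence is detected via the path + '.index'.
model.save_weights('/tmp/ckpt_demo/w')

# HDF5 weights: one self-contained file, detected directly by its name.
model.save_weights('/tmp/ckpt_demo/w.h5')

# Whole-model SavedModel (ModelCheckpoint's default when save_weights_only is
# False and the filepath has no '.h5' suffix): a directory at the given path,
# also detected directly by its name.
model.save('/tmp/ckpt_demo/saved_model')

print(tf.io.gfile.exists('/tmp/ckpt_demo/w.index'))      # True
print(tf.io.gfile.exists('/tmp/ckpt_demo/w.h5'))         # True
print(tf.io.gfile.exists('/tmp/ckpt_demo/saved_model'))  # True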
@@ -89,7 +97,7 @@ class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): (test_base.get_task_type(), test_base.get_task_index(), extension)) # The saving_filepath shouldn't exist at the beginning (as it's unique). - test_obj.assertFalse(training_state.checkpoint_exists(saving_filepath)) + test_obj.assertFalse(checkpoint_exists(saving_filepath)) model.fit( x=train_ds, @@ -104,15 +112,14 @@ class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): # If it's chief, the model should be saved; if not, the model shouldn't. test_obj.assertEqual( - training_state.checkpoint_exists(saving_filepath), - test_base.is_chief()) + checkpoint_exists(saving_filepath), test_base.is_chief()) # If it's chief, the model should be saved (`write_filepath` should # simply return `saving_filepath`); if not, i.e. for non-chief workers, # the temporary path generated by `write_filepath` should no longer # contain the checkpoint that has been deleted. test_obj.assertEqual( - training_state.checkpoint_exists( + checkpoint_exists( distributed_file_utils.write_filepath( saving_filepath, model._distribution_strategy)), test_base.is_chief()) @@ -148,6 +155,69 @@ class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): cluster_spec=test_base.create_cluster_spec(num_workers=2), args=(self, saving_filepath)) + @combinations.generate(combinations.combine(mode=['eager'])) + def test_backupandrestore_checkpoint_works_with_interruption(self, mode): + + class InterruptingCallback(callbacks.Callback): + + def on_epoch_begin(self, epoch, logs=None): + if epoch == 2: + raise RuntimeError('Interrupting!') + + class AssertCallback(callbacks.Callback): + + def on_epoch_begin(self, epoch, logs=None): + # the interruption happened on epoch 2 as specified in + # InterruptingCallback, so the initial epoch after restart will begin + # at 2. + assert epoch > 1 + + def proc_model_checkpoint_works_with_same_file_path(test_obj, + saving_filepath): + model, _, train_ds, steps = _model_setup(test_obj, file_format='') + num_epoch = 4 + + # The saving_filepath shouldn't exist at the beginning (as it's unique). + test_obj.assertFalse(file_io.file_exists(saving_filepath)) + bar_dir = os.path.join(os.path.dirname(saving_filepath), 'backup') + + try: + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[ + callbacks.ModelCheckpoint(filepath=saving_filepath), + callbacks.BackupAndRestore(backup_dir=bar_dir), + InterruptingCallback() + ]) + except RuntimeError as e: + if 'Interrupting!' 
not in str(e): + raise + + backup_filepath = os.path.join(bar_dir, 'checkpoint') + test_obj.assertTrue(file_io.file_exists(backup_filepath)) + test_obj.assertTrue(file_io.file_exists(saving_filepath)) + + model.fit( + x=train_ds, + epochs=num_epoch, + steps_per_epoch=steps, + callbacks=[ + callbacks.ModelCheckpoint(filepath=saving_filepath), + callbacks.BackupAndRestore(backup_dir=bar_dir), + AssertCallback() + ]) + test_obj.assertFalse(file_io.file_exists(backup_filepath)) + test_obj.assertTrue(file_io.file_exists(saving_filepath)) + + saving_filepath = os.path.join(self.get_temp_dir(), 'checkpoint') + + multi_process_runner.run( + proc_model_checkpoint_works_with_same_file_path, + cluster_spec=test_base.create_cluster_spec(num_workers=2), + args=(self, saving_filepath)) + @combinations.generate(combinations.combine(mode=['eager'])) def test_tensorboard_saves_on_chief_but_not_otherwise(self, mode): diff --git a/tensorflow/python/keras/distribute/multi_worker_fault_tolerance_test.py b/tensorflow/python/keras/distribute/multi_worker_fault_tolerance_test.py deleted file mode 100644 index fa58d2479ac..00000000000 --- a/tensorflow/python/keras/distribute/multi_worker_fault_tolerance_test.py +++ /dev/null @@ -1,346 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests Keras multi worker fault tolerance.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import json -import os -import signal -import sys -import tempfile -import threading - -from absl.testing import parameterized -from tensorflow.python.distribute import collective_all_reduce_strategy as collective_strategy -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import distribute_coordinator as dc -from tensorflow.python.distribute import mirrored_strategy -from tensorflow.python.distribute import multi_worker_test_base as test_base -from tensorflow.python.keras import backend as K -from tensorflow.python.keras import callbacks -from tensorflow.python.keras.distribute import multi_worker_testing_utils -from tensorflow.python.keras.distribute import multi_worker_training_state as training_state -from tensorflow.python.platform import test - - -def get_strategy_object(strategy_cls): - if strategy_cls == mirrored_strategy.MirroredStrategy: - return strategy_cls(mirrored_strategy.all_local_devices()) - else: - # CollectiveAllReduceStrategy and ParameterServerStrategy. 
- return strategy_cls() - - -class KerasMultiWorkerFaultToleranceTest(test_base.IndependentWorkerTestBase, - parameterized.TestCase): - - class PreemptionAtBatchBoundarySimulatingCallback(callbacks.Callback): - """Callback to simulate preemption at batch boundary.""" - - def on_epoch_begin(self, epoch, logs=None): - self._current_epoch = epoch - - def on_batch_begin(self, batch, logs=None): - if self._current_epoch == 1 and batch == 1 and not test_base.is_chief(): - # Simulate preemption at the start of second batch of second epoch. - raise RuntimeError('Preemption!') - - def on_batch_end(self, batch, logs=None): - assert self._current_epoch < 1 or batch < 1 - - def on_epoch_end(self, epoch, logs=None): - assert epoch < 1 - - # TODO(rchao): Add tests for checking 0th and 2nd epoch boundary. - class PreemptionAtEpochBoundarySimulatingCallback(callbacks.Callback): - """Callback to simulate preemption at epoch boundary.""" - - def on_epoch_begin(self, epoch, logs=None): - if epoch == 1 and not test_base.is_chief(): - # Simulate preemption at the start of second epoch. - raise RuntimeError('Preemption!') - - def on_epoch_end(self, epoch, logs=None): - assert epoch < 1 - - @combinations.generate( - combinations.combine( - # Eager runtime unfortunately cannot be tested with multi-threading. - # TODO(rchao): Add test to use multi-process for eager mode after - # b/132095481 is resolved. - mode=['graph'], - strategy_cls=[collective_strategy.CollectiveAllReduceStrategy], - required_gpus=[0, 1], - file_format=['h5', 'tf'], - preemption_callback=[ - PreemptionAtEpochBoundarySimulatingCallback, - PreemptionAtBatchBoundarySimulatingCallback - ], - # FT should work regardless of `ModelCheckpoint`'s parameters. - save_weights_only=[True, False], - load_weights_on_restart=[True, False], - )) - def testFaultToleranceInSyncStrategy(self, strategy_cls, file_format, - preemption_callback, save_weights_only, - load_weights_on_restart): - """Test fault-tolerance with multi-threading using sync dist-strat. - - This test simulates multi-worker training that is interrupted by a - preemption, by having two threads, each of which represents a chief and a - non-chief worker, where the non-chief raises an error in the middle of - training loop. Upon excepting the error, a new thread with a new cluster - spec is created to simulate the recovered non-chief worker. Meanwhile, the - chief worker cannot proceed and hangs since the non-chief worker has - crashed. To simulate a restart of the chief, a new thread has been prepared - to run to take over chief with the help of a condition variable. It is - expected that after the restart of both chief and non-chief workers, the - training continues from the epoch they previously failed at. The test - concludes by verifying the preemption-interrupted training can finish with - the same loss and accuracy had the preemption not occurred. - - TODO(rchao): Add test to check preemption on chief (possibly using multi - processes). - - TODO(rchao): Add test to check fault-tolerance with multiple `model.fit()`. - - Arguments: - strategy_cls: The strategy class to use. - file_format: `h5` or `tf`. - preemption_callback: The callback to simulate preemption. - save_weights_only: The argument for `model.fit()`'s `save_weights_only`. - load_weights_on_restart: The argument for `model.fit()`'s - `load_weights_on_restart`. 
- """ - - def _independent_worker_fn(*args, **kwargs): # pylint: disable=unused-argument - with test.mock.patch.object(dc, '_run_std_server', - self._make_mock_run_std_server()): - # `before_restart` is True for the threads that represent the original - # chief and non-chief worker, and False for threads that represent the - # restarted chief and non-chief workers. - before_restart = kwargs['before_restart'] - - # Model building under strategy scope. Following is the code we expect - # the user runs on every worker. - strategy = get_strategy_object(strategy_cls) - batch_size = 64 - steps = 3 - train_ds, _ = multi_worker_testing_utils.mnist_synthetic_dataset( - batch_size, steps) - - with strategy.scope(): - model = multi_worker_testing_utils.get_mnist_model((28, 28, 1)) - - # Function to start a new thread. This will be called twice in the - # following code: one represents the restart of the non-chief, and one - # represents the restart of the chief as a result of the restart of the - # non-chief (so the training can continue in sync). - def start_new_thread(new_chief): - new_thread_tf_config = json.loads(os.environ['TF_CONFIG']) - - # Update the ports in new chief and new worker threads. - new_thread_tf_config['cluster']['worker'] = kwargs['reserved_ports'] - - # Since both new chief and new worker threads are started from the - # worker thread, we need to overwrite the tf config task index. - new_thread_tf_config['task']['index'] = 0 if new_chief else 1 - return self._run_task_in_thread( - task_fn=_independent_worker_fn, - cluster_spec=None, - task_type=None, - task_id=None, - tf_config=new_thread_tf_config, - before_restart=False, - new_chief=new_chief) - - try: - - class CkptSavedEpochAssertingCallback(callbacks.Callback): - - def __init__(self, test_obj): - super(CkptSavedEpochAssertingCallback, self).__init__() - self.test_obj = test_obj - - def on_epoch_begin(self, epoch, logs=None): - # `_ckpt_saved_epoch` attribute is set at the end of every epoch. - self.test_obj.assertEqual( - K.eval(self.model._ckpt_saved_epoch) == - training_state.CKPT_SAVED_EPOCH_UNUSED_VALUE, epoch == 0) - - callbacks_list = [ - callbacks.ModelCheckpoint( - filepath=saving_filepath, - save_weights_only=save_weights_only, - load_weights_on_restart=load_weights_on_restart), - CkptSavedEpochAssertingCallback(self) - ] - if before_restart: - callbacks_list.append(preemption_callback()) - - self.assertFalse(hasattr(model, training_state.CKPT_SAVED_EPOCH)) - history = model.fit( - x=train_ds, - epochs=num_epoch, - steps_per_epoch=steps, - callbacks=callbacks_list) - self.assertFalse(hasattr(model, training_state.CKPT_SAVED_EPOCH)) - - # `history` of the training result is collected to be compared against - # each other. It is expected that the training results (loss and - # accuracy`) are the same with or without preemption. - self._histories.append(history.history) - - except RuntimeError: - # pylint: disable=g-assert-in-except - self.assertTrue(before_restart) - # Reset the barrier so the new threads simulating recovery can - # continue. - self._barrier._counter = 0 - self._barrier._flag = False - - # At this point we block the original non-chief thread, and - # start the new threads that simulate the restarted chief and - # non-chief, joining the threads and return. - new_chief_thread = start_new_thread(new_chief=True) - new_worker_thread = start_new_thread(new_chief=False) - self.join_independent_workers([new_chief_thread, new_worker_thread]) - return - - # Successful end of a `fit()` call. 
- with self._lock: - self._successful_thread_ends += 1 - self.assertFalse(before_restart) - - # Common parameters - num_workers = 2 - num_epoch = 3 - # History list storing the results for preemption and no preemption cases. - self._histories = [] - # Lock required to prevent race condition between two threads. - self._lock = threading.Lock() - strategy = get_strategy_object(strategy_cls) - - def handler(signum, frame): - del signum, frame - # `session.run()` within `model.fit()` can time out. Skipping it as it - # doesn't represent the failure of this test. - self.skipTest('Skipping test due to `session.run()` timeout.') - - signal.signal(signal.SIGALRM, handler) - # Alarming within 5 min before the test timeouts and fails. - signal.alarm(240) - - def get_saving_dir_and_filepath(): - saving_dir = tempfile.mkdtemp(prefix=self.get_temp_dir()) - saving_filepath = os.path.join(saving_dir, 'checkpoint.' + file_format) - return saving_dir, saving_filepath - - # Case 1: Training for `num_epoch` without preemptions. - cluster_spec = test_base.create_cluster_spec(num_workers=num_workers) - self._barrier = dc._Barrier(2) - self._successful_thread_ends = 0 - # Get a new temporary filepath to save the checkpoint to. - saving_dir, saving_filepath = get_saving_dir_and_filepath() - threads = self.run_multiple_tasks_in_threads( - _independent_worker_fn, - cluster_spec, - # Pass `saving_filepath` from the parent thread to ensure every worker - # has the same filepath to save. - saving_filepath=saving_filepath, - before_restart=False, - new_chief=False) - threads_to_join = [] - if strategy.extended.experimental_between_graph: - for ts in threads.values(): - threads_to_join.extend(ts) - else: - threads_to_join = [threads['worker'][0]] - self.join_independent_workers(threads_to_join) - - # `self.test_skipped_reason` could be set when a non-main thread attempts - # to skip the test. - # `multi_worker_test_base.skip_if_grpc_server_cant_be_started()` is an - # example of where this can be set. Since raising `SkipTest` in a non-main - # thread doesn't actually skip the test, we check if the test should be - # skipped here once we have joined the threads. - if getattr(self, 'test_skipped_reason', None) is not None: - self.skipTest(self.test_skipped_reason) - - self.assertTrue( - training_state.remove_checkpoint_if_exists(saving_dir, saving_filepath)) - self.assertEqual(self._successful_thread_ends, 2) - - # Case 2: Training for `num_epoch` epoch with preemptions. - # The preemption is simulated at both epoch boundary and batch boundary. - cluster_spec = test_base.create_cluster_spec(num_workers=num_workers) - self._barrier = dc._Barrier(2) - # Ports reserved for new threads simulating recovery. - reserved_ports = [ - 'localhost:%s' % test_base.pick_unused_port() - for _ in range(num_workers) - ] - self._successful_thread_ends = 0 - # Get a new temporary filepath to save the checkpoint to. - saving_dir, saving_filepath = get_saving_dir_and_filepath() - threads = self.run_multiple_tasks_in_threads( - _independent_worker_fn, - cluster_spec, - # Pass `saving_filepath` from the parent thread to ensure every worker - # has the same filepath to save. - saving_filepath=saving_filepath, - reserved_ports=reserved_ports, - before_restart=True, - new_chief=False) - threads_to_join = [] - if strategy.extended.experimental_between_graph: - # Only join the non-chief thread since the first thread for chief will - # eventually hang and be ignored. 
- threads_to_join = [threads['worker'][1]] - else: - threads_to_join = [threads['worker'][0]] - self.join_independent_workers(threads_to_join) - if getattr(self, 'test_skipped_reason', None) is not None: - self.skipTest(self.test_skipped_reason) - - self.assertTrue( - training_state.remove_checkpoint_if_exists(saving_dir, saving_filepath)) - self.assertEqual(self._successful_thread_ends, 2) - - def assert_all_elements_are_identical(list_to_check): - first_item = list_to_check[0] - for item in list_to_check[1:]: - self.assertAllClose(first_item, item, rtol=2e-5, atol=1e-5) - - # Important: the results from preemption interrupted and non-interrupted - # cases should give the same final results. - assert_all_elements_are_identical( - [history['acc'][-1] for history in self._histories]) - assert_all_elements_are_identical( - [history['loss'][-1] for history in self._histories]) - # The length of `self._histories` would be num_workers * num_runs (3). - self.assertLen(self._histories, 4) - - # Results from case 1 should have 3 full epochs. - self.assertLen(self._histories[0]['acc'], 3) - # Results from case 2 should only have 2 full epochs because it restarted at - # epoch 1. - self.assertLen(self._histories[-1]['acc'], 2) - - -if __name__ == '__main__': - with test.mock.patch.object(sys, 'exit', os._exit): - test.main() diff --git a/tensorflow/python/keras/distribute/multi_worker_training_state.py b/tensorflow/python/keras/distribute/multi_worker_training_state.py deleted file mode 100644 index d967cf8b1d2..00000000000 --- a/tensorflow/python/keras/distribute/multi_worker_training_state.py +++ /dev/null @@ -1,233 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Training state management in multi-worker distributed training.""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import contextlib -import os -import uuid -from tensorflow.python.distribute import multi_worker_util -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.keras import backend as K -from tensorflow.python.keras.utils import mode_keys -from tensorflow.python.lib.io import file_io -from tensorflow.python.ops import variables -from tensorflow.python.training.tracking import tracking - -# Constant for `tf.keras.Model` attribute to store the epoch at which the most -# recently saved checkpoint was saved. 
-CKPT_SAVED_EPOCH = '_ckpt_saved_epoch' - -CKPT_SAVED_EPOCH_UNUSED_VALUE = -1 - - -def checkpoint_exists(filepath): - """Returns whether the checkpoint `filepath` refers to exists.""" - if filepath.endswith('.h5'): - return file_io.file_exists(filepath) - tf_saved_model_exists = file_io.file_exists(filepath) - tf_weights_only_checkpoint_exists = file_io.file_exists(filepath + '.index') - return tf_saved_model_exists or tf_weights_only_checkpoint_exists - - -def remove_checkpoint_if_exists(ckpt_dir, filepath): - """Removes the checkpoint if it exists and returns whether it has removed.""" - if checkpoint_exists(filepath): - _remove_dir(ckpt_dir) - return True - return False - - -def _remove_dir(dir_to_remove): - file_io.delete_recursively(dir_to_remove) - - -def _get_backup_filepath(original_filepath): - backup_dir = os.path.join(os.path.dirname(original_filepath), 'backup') - return backup_dir, os.path.join(backup_dir, 'training_state') - - -def _get_temp_filepath(original_filepath): - temp_dir = os.path.join( - os.path.dirname(original_filepath), 'temp_training_states', - str(uuid.uuid4())) - return temp_dir, os.path.join(temp_dir, 'training_state') - - -class MultiWorkerTrainingState(object): - """Training state management class in multi-worker distributed training. - - In multi-worker training, model weights and epoch information are saved - periodically for fault-tolerance, also known as preemption-recovery purpose. - This class provides apis for backing up and restoring the training state. - """ - - def __init__(self, model, original_filepath): - self._model = model - - # The directory and filepath that store the training state backup file. - self._backup_dir, self._backup_filepath = _get_backup_filepath( - original_filepath) - - # For those who should not checkpoint (e.g. non-chief worker in sync - # training), create a temporary directory to write to (that will be - # removed later). - if not multi_worker_util.should_save_checkpoint(): - self._temp_dir, self._temp_filepath = _get_temp_filepath( - original_filepath) - - # The epoch at which the checkpoint is saved. Used for fault-tolerance. - # GPU device only has int64 dtype registered VarHandleOp. - self._ckpt_saved_epoch = variables.Variable( - initial_value=constant_op.constant( - CKPT_SAVED_EPOCH_UNUSED_VALUE, dtype=dtypes.int64), - name='ckpt_saved_epoch') - - # Variable initialization. - K.set_value(self._ckpt_saved_epoch, CKPT_SAVED_EPOCH_UNUSED_VALUE) - - # Calling `AutoTrackable.__setattr__` to avoid getting added as a weight of - # model (which is done in `Layer.__setattr__`), which breaks saving/loading - # in hdf5 format. Once becomes an attr of `model`, _ckpt_saved_epoch gets - # tracked and will be included in the checkpoint file when backing up. - tracking.AutoTrackable.__setattr__(self._model, CKPT_SAVED_EPOCH, - self._ckpt_saved_epoch) - - def back_up(self, epoch): - """Back up the current state of training into a checkpoint file. - - Arguments: - epoch: The current epoch information to be saved. - """ - # pylint: disable=protected-access - self._assert_in_multi_worker_mode() - - # Update `_ckpt_saved_epoch`. - K.set_value(self._ckpt_saved_epoch, epoch) - - # If this is multi-worker training, and this worker should not - # save checkpoint, we replace the filepath with a dummy filepath so - # it writes to a file that will be removed at the end of _save_model() - # call. This is because the SyncOnReadVariable needs to be synced across - # all the workers in order to be read, and all workers need to initiate - # that. 
- if multi_worker_util.should_save_checkpoint(): - save_filepath = self._backup_filepath - else: - save_filepath = self._temp_filepath - - # Save the weights plus CKPT_SAVED_EPOCH variable. - self._model.save_weights(save_filepath, overwrite=True) - - if not multi_worker_util.should_save_checkpoint(): - # Remove the file in multi-worker training where this worker should - # not checkpoint. It is a dummy file previously saved for sync distributed - # training. - _remove_dir(self._temp_dir) - - def restore(self): - """Restore the training state from the backed up checkpoint file. - - Returns: - True if the training state is successfully restored. False if the training - state doesn't need to be restored, or error occurred so it can't. - """ - self._assert_in_multi_worker_mode() - if not multi_worker_util.should_load_checkpoint(): - # For multi-worker training, it should not restore a model in certain - # worker setting (e.g. non-chief worker in ParameterServerStrategy). - return False - if file_io.file_exists(self._backup_dir): - try: - # Load the weights plus CKPT_SAVED_EPOCH variable. - self._model.load_weights(self._backup_filepath) - return True - - except (IOError, ValueError) as e: - raise ValueError('Error loading file from {}. Reason: {}'.format( - self._backup_filepath, e)) - return False - - def delete_backup(self): - """Delete the backup directories. - - Delete the backup directories which should not exist after `fit()` - successfully finishes. - """ - self._assert_in_multi_worker_mode() - # Model may not have such attr if there was a failure before the attr was - # added to the model - if hasattr(self._model, CKPT_SAVED_EPOCH): - tracking.AutoTrackable.__delattr__(self._model, CKPT_SAVED_EPOCH) - if multi_worker_util.should_save_checkpoint(): - _remove_dir(self._backup_dir) - else: - assert not file_io.file_exists(self._temp_dir) - - def maybe_load_initial_epoch_from_ckpt(self, initial_epoch, mode): - """Maybe load initial epoch from ckpt considering possible worker recovery. - - When `_ckpt_saved_epoch` attribute exists and is not - `CKPT_SAVED_EPOCH_UNUSED_VALUE`, this is under multi-worker training setting - and indicates the worker is recovering from previous failure. In this case, - infer `initial_epoch` from `self._ckpt_saved_epoch` to continue previous - unfinished training from certain epoch. - - Arguments: - initial_epoch: The original initial_epoch user passes in in `fit()`. - mode: The mode for running `model.fit()`. - - Returns: - If the training is recovering from previous failure under multi-worker - training setting, return the epoch the training is supposed to continue - at. Otherwise, return the `initial_epoch` the user passes in. - """ - self._assert_in_multi_worker_mode() - - # TODO(rchao): Add recovery for validation case - # (when mode == ModeKeys.TEST). - epoch = K.eval(self._ckpt_saved_epoch) - if mode == mode_keys.ModeKeys.TRAIN and epoch >= 0: - # The most recently saved epoch is one epoch prior to the epoch it - # failed at, so return the value of 'self._ckpt_saved_epoch' plus one. - return epoch + 1 - return initial_epoch - - @contextlib.contextmanager - def untrack_vars(self): - """Provides a scope within which training state variables are untracked. - - Regular checkpoint file saved by `ModelCheckpoint` callback that the user - requests should not contain training state variables such as - `CKPT_SAVED_EPOCH`, or the epoch the checkpoint is most recently saved at. - - Yields: - None. 
- """ - tracking.AutoTrackable.__delattr__(self._model, CKPT_SAVED_EPOCH) - yield - tracking.AutoTrackable.__setattr__(self._model, CKPT_SAVED_EPOCH, - self._ckpt_saved_epoch) - - def _assert_in_multi_worker_mode(self): - # pylint: disable=protected-access - if not self._model._in_multi_worker_mode(): - raise ValueError('MultiWorkerTrainingState is only supposed to be used ' - 'in multi-worker training. This indicates some error ' - 'that needs to be fixed. Please submit a bug issue to ' - 'tf.keras team.') diff --git a/tensorflow/python/keras/distribute/worker_training_state.py b/tensorflow/python/keras/distribute/worker_training_state.py new file mode 100644 index 00000000000..06cf46b3333 --- /dev/null +++ b/tensorflow/python/keras/distribute/worker_training_state.py @@ -0,0 +1,153 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Training state management.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.python.distribute import distributed_file_utils +from tensorflow.python.distribute import multi_worker_util +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.keras import backend as K +from tensorflow.python.keras.utils import mode_keys +from tensorflow.python.lib.io import file_io +from tensorflow.python.ops import variables +from tensorflow.python.training import checkpoint_management +from tensorflow.python.training.tracking import util as trackable_util + +# Constant for `tf.keras.Model` attribute to store the epoch at which the most +# recently saved checkpoint was saved. +CKPT_SAVED_EPOCH = '_ckpt_saved_epoch' + +CKPT_SAVED_EPOCH_UNUSED_VALUE = -1 + + +class WorkerTrainingState(object): + """Training state management class. + + This class provides apis for backing up and restoring the training state. + This allows model and epoch information to be saved periodically and restore + for fault-tolerance, also known as preemption-recovery purpose. + """ + + def __init__(self, model, checkpoint_dir): + self._model = model + + # The epoch at which the checkpoint is saved. Used for fault-tolerance. + # GPU device only has int64 dtype registered VarHandleOp. + self._ckpt_saved_epoch = variables.Variable( + initial_value=constant_op.constant( + CKPT_SAVED_EPOCH_UNUSED_VALUE, dtype=dtypes.int64), + name='ckpt_saved_epoch') + + # Variable initialization. + K.set_value(self._ckpt_saved_epoch, CKPT_SAVED_EPOCH_UNUSED_VALUE) + + # _ckpt_saved_epoch gets tracked and is included in the checkpoint file + # when backing up. + checkpoint = trackable_util.Checkpoint( + model=self._model, ckpt_saved_epoch=self._ckpt_saved_epoch) + + # If this is single-worker training, checkpoint_dir are the same for + # write_checkpoint_manager and read_checkpoint_manager. 
+ # + # If this is multi-worker training, and this worker should not + # save checkpoint, we replace the write_checkpoint_manager's checkpoint_dir + # with a temp filepath, so it writes to a file that will be removed at the + # end of back_up() call. This is necessary because the SyncOnReadVariable + # needs to be synced across all the workers in order to be read, and all + # workers need to perform `save()`. + # But all workers should restore from the same checkpoint_dir as passed in + # read_checkpoint_manager. + self.write_checkpoint_dir = distributed_file_utils.write_dirpath( + checkpoint_dir, self._model.distribute_strategy) + self.write_checkpoint_manager = checkpoint_management.CheckpointManager( + checkpoint, directory=self.write_checkpoint_dir, max_to_keep=1) + if self.write_checkpoint_dir == checkpoint_dir: + self.read_checkpoint_manager = self.write_checkpoint_manager + else: + self.read_checkpoint_manager = checkpoint_management.CheckpointManager( + checkpoint, directory=checkpoint_dir, max_to_keep=1) + + def back_up(self, epoch): + """Back up the current state of training into a checkpoint file. + + Arguments: + epoch: The current epoch information to be saved. + """ + K.set_value(self._ckpt_saved_epoch, epoch) + # Save the model plus CKPT_SAVED_EPOCH variable. + if self.write_checkpoint_manager.save(): + distributed_file_utils.remove_temp_dirpath( + self.write_checkpoint_manager.directory, + self._model.distribute_strategy) + + def restore(self): + """Restore the training state from the backed up checkpoint file. + + Returns: + True if the training state is successfully restored. False if the training + state doesn't need to be restored, or error occurred so it can't. + """ + # For multi-worker training, it should not restore a model in certain + # worker setting (e.g. non-chief worker in ParameterServerStrategy). + # pylint: disable=protected-access + if self._model._in_multi_worker_mode( + ) and not multi_worker_util.should_load_checkpoint(): + return + self.read_checkpoint_manager.restore_or_initialize() + + def delete_backup(self): + """Delete the backup directories. + + Delete the backup directories which should not exist after `fit()` + successfully finishes. + """ + # pylint: disable=protected-access + for pathname in file_io.get_matching_files( + self.write_checkpoint_manager._prefix + '*'): + file_io.delete_recursively(pathname) + for pathname in file_io.get_matching_files( + os.path.join(self.write_checkpoint_manager.directory, 'checkpoint')): + file_io.delete_recursively(pathname) + + def maybe_load_initial_epoch_from_ckpt(self, initial_epoch, mode): + """Maybe load initial epoch from ckpt considering possible worker recovery. + + When `_ckpt_saved_epoch` attribute exists and is not + `CKPT_SAVED_EPOCH_UNUSED_VALUE`, this is under multi-worker training setting + and indicates the worker is recovering from previous failure. In this case, + infer `initial_epoch` from `self._ckpt_saved_epoch` to continue previous + unfinished training from certain epoch. + + Arguments: + initial_epoch: The original initial_epoch user passes in in `fit()`. + mode: The mode for running `model.fit()`. + + Returns: + If the training is recovering from previous failure under multi-worker + training setting, return the epoch the training is supposed to continue + at. Otherwise, return the `initial_epoch` the user passes in. 
+ """ + + epoch = K.eval(self._ckpt_saved_epoch) + if mode == mode_keys.ModeKeys.TRAIN and epoch >= 0: + # The most recently saved epoch is one epoch prior to the epoch it + # failed at, so return the value of 'self._ckpt_saved_epoch' plus one. + return epoch + 1 + return initial_epoch diff --git a/tensorflow/python/keras/distribute/multi_worker_training_state_test.py b/tensorflow/python/keras/distribute/worker_training_state_test.py similarity index 78% rename from tensorflow/python/keras/distribute/multi_worker_training_state_test.py rename to tensorflow/python/keras/distribute/worker_training_state_test.py index 984db20b3b9..80a3deaa914 100644 --- a/tensorflow/python/keras/distribute/multi_worker_training_state_test.py +++ b/tensorflow/python/keras/distribute/worker_training_state_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests of `multi_worker_training_state.py` utilities.""" +"""Tests of `worker_training_state.py` utilities.""" from __future__ import absolute_import from __future__ import division @@ -26,12 +26,12 @@ from tensorflow.python.distribute import multi_worker_test_base as test_base from tensorflow.python.framework.errors_impl import NotFoundError from tensorflow.python.keras import callbacks from tensorflow.python.keras.distribute import multi_worker_testing_utils -from tensorflow.python.keras.distribute import multi_worker_training_state as training_state +from tensorflow.python.lib.io import file_io from tensorflow.python.platform import test -class MultiWorkerTrainingStateTest(test_base.IndependentWorkerTestBase, - parameterized.TestCase): +class ModelCheckpointTest(test_base.IndependentWorkerTestBase, + parameterized.TestCase): @combinations.generate( combinations.combine( @@ -48,7 +48,7 @@ class MultiWorkerTrainingStateTest(test_base.IndependentWorkerTestBase, callbacks.ModelCheckpoint( filepath=saving_filepath, save_weights_only=save_weights_only) ] - self.assertFalse(training_state.checkpoint_exists(saving_filepath)) + self.assertFalse(file_io.file_exists(saving_filepath)) try: model.fit( @@ -56,11 +56,10 @@ class MultiWorkerTrainingStateTest(test_base.IndependentWorkerTestBase, except NotFoundError as e: if 'Failed to create a NewWriteableFile' in e.message: self.skipTest('b/138941852, path not found error in Windows py35.') - - self.assertTrue(training_state.checkpoint_exists(saving_filepath)) - self.assertTrue( - training_state.remove_checkpoint_if_exists(saving_dir, saving_filepath)) - self.assertFalse(training_state.checkpoint_exists(saving_filepath)) + tf_saved_model_exists = file_io.file_exists(saving_filepath) + tf_weights_only_checkpoint_exists = file_io.file_exists(saving_filepath + + '.index') + self.assertTrue(tf_saved_model_exists or tf_weights_only_checkpoint_exists) if __name__ == '__main__': diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl index 03120fb8dc4..7d27d21188f 100644 --- a/tensorflow/python/tools/api/generator/api_init_files.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files.bzl @@ -99,6 +99,7 @@ KERAS_API_INIT_FILES = [ "keras/applications/xception/__init__.py", "keras/backend/__init__.py", "keras/callbacks/__init__.py", + "keras/callbacks/experimental/__init__.py", "keras/constraints/__init__.py", "keras/datasets/__init__.py", "keras/datasets/boston_housing/__init__.py", diff 
--git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl index a8154c6f35c..c0756a173c9 100644 --- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl @@ -119,6 +119,7 @@ KERAS_API_INIT_FILES_V1 = [ "keras/applications/xception/__init__.py", "keras/backend/__init__.py", "keras/callbacks/__init__.py", + "keras/callbacks/experimental/__init__.py", "keras/constraints/__init__.py", "keras/datasets/__init__.py", "keras/datasets/boston_housing/__init__.py", diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.experimental.-backup-and-restore.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.experimental.-backup-and-restore.pbtxt new file mode 100644 index 00000000000..4b0ab2536ae --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.experimental.-backup-and-restore.pbtxt @@ -0,0 +1,82 @@ +path: "tensorflow.keras.callbacks.experimental.BackupAndRestore" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'backup_dir\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "on_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_epoch_begin" + argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_epoch_end" + argspec: "args=[\'self\', \'epoch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_predict_end" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_test_end" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_batch_begin" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_batch_end" + argspec: "args=[\'self\', \'batch\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_begin" + argspec: "args=[\'self\', \'logs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "on_train_end" + argspec: "args=[\'self\', \'logs\'], 
varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "set_model" + argspec: "args=[\'self\', \'model\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_params" + argspec: "args=[\'self\', \'params\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.experimental.pbtxt new file mode 100644 index 00000000000..670df243e9c --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.experimental.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.keras.callbacks.experimental" +tf_module { + member { + name: "BackupAndRestore" + mtype: "" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.pbtxt index 31716a24407..cd1d0d940a3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.callbacks.pbtxt @@ -56,4 +56,8 @@ tf_module { name: "TerminateOnNaN" mtype: "" } + member { + name: "experimental" + mtype: "" + } } From 1c1d4b619a3ee0a45d26edacdee591648e911314 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Wed, 27 May 2020 14:10:51 -0700 Subject: [PATCH 1237/1533] Uniformize the handling of undefined simple and composite names in control flow. PiperOrigin-RevId: 313461038 Change-Id: Ic70f11291dfa6da52073ec4cacecda883a4d126c --- .../autograph/converters/control_flow.py | 46 ++++++++++++------- .../autograph/converters/control_flow_test.py | 26 +++++------ .../python/autograph/operators/__init__.py | 1 + .../python/autograph/operators/variables.py | 25 ++++++++++ 4 files changed, 67 insertions(+), 31 deletions(-) diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py index 673781e47dd..b54770cbd28 100644 --- a/tensorflow/python/autograph/converters/control_flow.py +++ b/tensorflow/python/autograph/converters/control_flow.py @@ -72,31 +72,43 @@ class ControlFlowTransformer(converter.Base): return results def _create_state_functions( - self, loop_vars, nonlocal_declarations, getter_name, setter_name): - if loop_vars: - template = """ - def getter_name(): - return state_vars, - def setter_name(vars_): - nonlocal_declarations - state_vars, = vars_ - """ - return templates.replace( - template, - nonlocal_declarations=nonlocal_declarations, - getter_name=getter_name, - setter_name=setter_name, - state_vars=tuple(loop_vars)) - else: + self, block_vars, nonlocal_declarations, getter_name, setter_name): + if not block_vars: template = """ def getter_name(): return () - def setter_name(loop_vars): + def setter_name(block_vars): pass """ return templates.replace( template, getter_name=getter_name, setter_name=setter_name) + guarded_block_vars = [] + for v in block_vars: + if v.is_simple(): + guarded_block_vars.append(v) + else: + guarded_block_vars.append( + templates.replace_as_expression( + 'ag__.ldu(lambda: var_, name)', + var_=v, + name=gast.Constant(str(v), kind=None))) + + template = """ + def getter_name(): + return guarded_state_vars, + def setter_name(vars_): + nonlocal_declarations + state_vars, = vars_ + """ + return templates.replace( + template, + nonlocal_declarations=nonlocal_declarations, + getter_name=getter_name, + guarded_state_vars=guarded_block_vars, + setter_name=setter_name, + state_vars=tuple(block_vars)) + def _create_loop_options(self, 
node): if not anno.hasanno(node, anno.Basic.DIRECTIVES): return gast.Dict([], []) diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py index 935e2cec4b8..f0681128698 100644 --- a/tensorflow/python/autograph/converters/control_flow_test.py +++ b/tensorflow/python/autograph/converters/control_flow_test.py @@ -189,9 +189,9 @@ class WhileStatementTest(ControlFlowTestBase): symbols={'TestClass': TestClass}) with self.converted( test_fn, control_flow, {'TestClass': TestClass}) as result: - # TODO(b/128519776): Better error message. - with self.assertRaisesRegex(AttributeError, 'subattr'): - result.test_fn(constant_op.constant(0), constant_op.constant(5)) + with self.assertRaisesRegex( + ValueError, "'tc.subattr' must be defined before the loop"): + result.test_fn(constant_op.constant(0), 0) def test_composite_state_slice_initialized_in_loop(self): @@ -209,9 +209,9 @@ class WhileStatementTest(ControlFlowTestBase): self.assertTransformedResult(test_fn, (0, constant_op.constant(10)), {'subkey': 14}) with self.converted(test_fn, control_flow, {}) as result: - # TODO(b/128519776): Better error message. - with self.assertRaisesRegex(KeyError, 'subkey'): - result.test_fn(constant_op.constant(0), constant_op.constant(5)) + with self.assertRaisesRegex( + ValueError, r"'d\[k\]' must be defined before the loop"): + result.test_fn(constant_op.constant(0), 0) def test_composite_state_literal_slice_initialized_in_loop(self): @@ -228,9 +228,9 @@ class WhileStatementTest(ControlFlowTestBase): self.assertTransformedResult(test_fn, (0, constant_op.constant(10)), {'subkey': 14}) with self.converted(test_fn, control_flow, {}) as result: - # TODO(b/128519776): Better error message. - with self.assertRaisesRegex(KeyError, 'subkey'): - result.test_fn(constant_op.constant(0), constant_op.constant(5)) + with self.assertRaisesRegex( + ValueError, r"'d\['subkey'\]' must be defined before the loop"): + result.test_fn(constant_op.constant(0), 0) def test_composite_state_slice_aliased_to_local(self): @@ -245,7 +245,7 @@ class WhileStatementTest(ControlFlowTestBase): self.assertTransformedResult(test_fn, (0, constant_op.constant(10)), {'subkey': 11}) with self.converted(test_fn, control_flow, {}) as result: - # TODO(b/128519776): Better error message. + # TODO(b/136999953): Better error message. # Note that this error happens at execution time. with self.assertRaises(errors.InaccessibleTensorError): graph_fn = def_function.function(result.test_fn, autograph=False) @@ -671,11 +671,9 @@ class ForStatementTest(ControlFlowTestBase): symbols={'TestClass': TestClass}) with self.converted( test_fn, control_flow, {'TestClass': TestClass}) as result: - # TODO(b/128519776): Better error message. 
with self.assertRaisesRegex( - AttributeError, '\'TestClass\' object has no attribute \'x\''): - result.test_fn( - constant_op.constant(list(range(5))), constant_op.constant(5)) + ValueError, "'tc.x' must be defined before the loop"): + result.test_fn(constant_op.constant(list(range(5))), 0) def test_tuple_unpacking(self): def test_fn(x_list): diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py index 8ac4e1d8bb3..a42dcf326c3 100644 --- a/tensorflow/python/autograph/operators/__init__.py +++ b/tensorflow/python/autograph/operators/__init__.py @@ -62,5 +62,6 @@ from tensorflow.python.autograph.operators.slices import get_item from tensorflow.python.autograph.operators.slices import GetItemOpts from tensorflow.python.autograph.operators.slices import set_item from tensorflow.python.autograph.operators.variables import ld +from tensorflow.python.autograph.operators.variables import ldu from tensorflow.python.autograph.operators.variables import Undefined from tensorflow.python.autograph.operators.variables import UndefinedReturnValue diff --git a/tensorflow/python/autograph/operators/variables.py b/tensorflow/python/autograph/operators/variables.py index 150f64e1758..c3bedc3fecf 100644 --- a/tensorflow/python/autograph/operators/variables.py +++ b/tensorflow/python/autograph/operators/variables.py @@ -26,6 +26,31 @@ def ld(v): return v +def ldu(load_v, name): + """Load variable operator that returns Undefined when failing to evaluate. + + Note: the name ("load or return undefined") is abbreviated to minimize + the amount of clutter in generated code. + + This variant of `ld` is useful when loading symbols that may be undefined at + runtime, such as composite symbols, and whether they are defined or not cannot + be determined statically. For example `d['a']` is undefined when `d` is an + empty dict. + + Args: + load_v: Lambda that executes the actual read. + name: Human-readable name of the symbol being read. + Returns: + Either the value of the symbol, or Undefined, if the symbol is not fully + defined. + """ + try: + # TODO(mdan): Use locals()/globals() here. + return load_v() + except (KeyError, AttributeError, NameError): + return Undefined(name) + + class Undefined(object): """Represents an undefined symbol in Python. From 67a155f782aa86511bff264b89f3a8b8012c3c00 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 14:10:58 -0700 Subject: [PATCH 1238/1533] Add bfloat16 support for SparseSegmentMean*/SparseSegmentSqrtN* PiperOrigin-RevId: 313461080 Change-Id: Ibf4ed3b6531231e797358940e584471f2c682848 --- .../core/kernels/segment_reduction_ops_impl.h | 110 +++++++++++++----- .../kernels/segment_reduction_ops_impl_5.cc | 2 + tensorflow/core/ops/math_ops.cc | 8 +- 3 files changed, 90 insertions(+), 30 deletions(-) diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl.h b/tensorflow/core/kernels/segment_reduction_ops_impl.h index 8954dcd4681..6c3fad668ae 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_impl.h +++ b/tensorflow/core/kernels/segment_reduction_ops_impl.h @@ -508,6 +508,12 @@ class SparseSegmentReductionOpBase : public OpKernel { errors::InvalidArgument("segment ids must be >= 0")); auto output_flat = output->flat_outer_dims(); + Tensor temp; + if (input.dtype() == DT_BFLOAT16) { + temp = tensorflow::Tensor(DT_FLOAT, output_shape); + } + auto temp_flat = temp.flat_outer_dims(); + int64 start = 0, end = 1; // Index from which the output is not initialized. 
SegmentId uninitialized_index = 0; @@ -546,8 +552,9 @@ class SparseSegmentReductionOpBase : public OpKernel { } auto out = output_flat.template chip<0>(out_index); - const int bad_offset = - Reduce(input_flat, indices_vec, start, end - start, out); + auto temp = temp_flat.template chip<0>(out_index); + const int bad_offset = Reduce(input_flat, indices_vec, start, + end - start, out, temp); OP_REQUIRES(context, bad_offset < 0, errors::InvalidArgument( "Bad: indices[", start + bad_offset, @@ -572,40 +579,89 @@ class SparseSegmentReductionOpBase : public OpKernel { } private: - int64 Reduce(const typename TTypes::ConstMatrix& input_flat, - const typename TTypes::ConstVec& indices_vec, int64 start, - int64 num, - Eigen::TensorChippingOp<0, typename TTypes::Matrix> out) { + template + using EnableIfBfloat16 = + typename std::enable_if::value, int>::type; + template + using EnableIfNotBfloat16 = + typename std::enable_if::value, int>::type; + + template = 0> + EIGEN_ALWAYS_INLINE auto fetch_val( + const typename TTypes::ConstMatrix& input_flat, Tindex index) { + return input_flat.template chip<0>(index); + } + + template = 0> + EIGEN_ALWAYS_INLINE auto fetch_val( + const typename TTypes::ConstMatrix& input_flat, Tindex index) { + return input_flat.template chip<0>(index).template cast(); + } + + template + EIGEN_ALWAYS_INLINE Tout get_scaling_factor(int64 num) { + Tout m(1); + if (is_mean_ && (num < 10)) { + m = Tout(num); + } + if (is_sqrtn_ && (num < 10)) { + m = Tout(sqrt(num)); + } + return Tout(1) / m; + } + + template = 0> + int64 Reduce( + const typename TTypes::ConstMatrix& input_flat, + const typename TTypes::ConstVec& indices_vec, int64 start, + int64 num, Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> temp) { + return ReduceImpl(input_flat, indices_vec, start, num, + out, get_scaling_factor(num)); + } + + template = 0> + int64 Reduce( + const typename TTypes::ConstMatrix& input_flat, + const typename TTypes::ConstVec& indices_vec, int64 start, + int64 num, Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + Eigen::TensorChippingOp<0, typename TTypes::Matrix> temp) { + int64 res = + ReduceImpl(input_flat, indices_vec, start, num, + temp, get_scaling_factor(num)); + out = temp.template cast(); + return res; + } + + template + int64 ReduceImpl( + const typename TTypes::ConstMatrix& input_flat, + const typename TTypes::ConstVec& indices_vec, int64 start, + int64 num, Eigen::TensorChippingOp<0, typename TTypes::Matrix> out, + const Tout scaling_factor) { #define INDEX(n, i) \ const auto index##n = indices_vec(start + (i)); \ if (!FastBoundsCheck(index##n, input_flat.dimension(0))) return (i); -#define L(n) input_flat.template chip<0>(index##n) +#define L(n) fetch_val(input_flat, index##n) if (num == 1) { INDEX(0, 0); out = L(0); } else { - int64 r = num % 8; - T m(1); - if (is_mean_ && (num < 10)) { - m = T(num); - } - if (is_sqrtn_ && (num < 10)) { - m = T(sqrt(num)); - } + int64 r = num & 7; switch (r) { case 2: { INDEX(0, 0); INDEX(1, 1); - out = (L(0) + L(1)) / m; + out = (L(0) + L(1)) * scaling_factor; break; } case 3: { INDEX(0, 0); INDEX(1, 1); INDEX(2, 2); - out = (L(0) + L(1) + L(2)) / m; + out = (L(0) + L(1) + L(2)) * scaling_factor; break; } case 4: { @@ -613,7 +669,7 @@ class SparseSegmentReductionOpBase : public OpKernel { INDEX(1, 1); INDEX(2, 2); INDEX(3, 3); - out = (L(0) + L(1) + L(2) + L(3)) / m; + out = (L(0) + L(1) + L(2) + L(3)) * scaling_factor; break; } case 5: { @@ -622,7 +678,7 @@ class 
SparseSegmentReductionOpBase : public OpKernel { INDEX(2, 2); INDEX(3, 3); INDEX(4, 4); - out = (L(0) + L(1) + L(2) + L(3) + L(4)) / m; + out = (L(0) + L(1) + L(2) + L(3) + L(4)) * scaling_factor; break; } case 6: { @@ -632,7 +688,7 @@ class SparseSegmentReductionOpBase : public OpKernel { INDEX(3, 3); INDEX(4, 4); INDEX(5, 5); - out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) / m; + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5)) * scaling_factor; break; } case 7: { @@ -643,7 +699,8 @@ class SparseSegmentReductionOpBase : public OpKernel { INDEX(4, 4); INDEX(5, 5); INDEX(6, 6); - out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) / m; + out = + (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6)) * scaling_factor; break; } case 0: { @@ -655,7 +712,8 @@ class SparseSegmentReductionOpBase : public OpKernel { INDEX(5, 5); INDEX(6, 6); INDEX(7, 7); - out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7)) / m; + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7)) * + scaling_factor; r = 8; break; } @@ -669,8 +727,8 @@ class SparseSegmentReductionOpBase : public OpKernel { INDEX(6, 6); INDEX(7, 7); INDEX(8, 8); - out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) / - m; + out = (L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7) + L(8)) * + scaling_factor; r = 9; break; } @@ -687,10 +745,10 @@ class SparseSegmentReductionOpBase : public OpKernel { out += L(0) + L(1) + L(2) + L(3) + L(4) + L(5) + L(6) + L(7); } if (is_mean_ && num >= 10) { - out = out / static_cast(num); + out = out / static_cast(num); } if (is_sqrtn_ && num >= 10) { - out = out / static_cast(sqrt(num)); + out = out / static_cast(sqrt(num)); } } diff --git a/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc b/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc index fee0f818c5e..03a448e52b3 100644 --- a/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc +++ b/tensorflow/core/kernels/segment_reduction_ops_impl_5.cc @@ -64,6 +64,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE); segment_ids_type>); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(float); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(double); +REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(bfloat16); #undef REGISTER_CPU_SPARSE_KERNELS #define REGISTER_CPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \ @@ -85,6 +86,7 @@ REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(double); CPUDevice, type, index_type, segment_ids_type>); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(float); REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(double); +REGISTER_CPU_SPARSE_KERNELS_FOR_EACH_INDEX_TYPE(bfloat16); #undef REGISTER_CPU_SPARSE_KERNELS #define REGISTER_CPU_SPARSE_KERNELS(type, index_type, segment_ids_type) \ diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 972d6e27b75..dfc2463915c 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1337,7 +1337,7 @@ REGISTER_OP("SparseSegmentMean") .Input("indices: Tidx") .Input("segment_ids: Tsegmentids") .Output("output: T") - .Attr("T: {float, double}") + .Attr("T: {bfloat16, float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") .SetShapeFn(SparseSegmentReductionShapeFn); @@ -1348,7 +1348,7 @@ REGISTER_OP("SparseSegmentMeanWithNumSegments") .Input("segment_ids: Tsegmentids") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: {float, double}") + .Attr("T: {bfloat16, float, double}") .Attr("Tidx: {int32, int64} 
= DT_INT32") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") @@ -1370,7 +1370,7 @@ REGISTER_OP("SparseSegmentSqrtN") .Input("indices: Tidx") .Input("segment_ids: Tsegmentids") .Output("output: T") - .Attr("T: {float, double}") + .Attr("T: {bfloat16, float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") .SetShapeFn(SparseSegmentReductionShapeFn); @@ -1381,7 +1381,7 @@ REGISTER_OP("SparseSegmentSqrtNWithNumSegments") .Input("segment_ids: Tsegmentids") .Input("num_segments: Tnumsegments") .Output("output: T") - .Attr("T: {float, double}") + .Attr("T: {bfloat16, float, double}") .Attr("Tidx: {int32, int64} = DT_INT32") .Attr("Tnumsegments: {int32,int64} = DT_INT32") .Attr("Tsegmentids: {int32, int64} = DT_INT32") From 3446f153cbe9230244467fcee044bfe597e14b8e Mon Sep 17 00:00:00 2001 From: Revan Sopher Date: Wed, 27 May 2020 14:11:01 -0700 Subject: [PATCH 1239/1533] Support Google-internal TPU resolution in strategy combinations. PiperOrigin-RevId: 313461094 Change-Id: I42c3c87a7f8c41d6b11b6de408ddfc44fd8bbda0 --- .../custom_training_loop_gradient_test.py | 1 - .../distribute/strategy_combinations.py | 29 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/distribute/custom_training_loop_gradient_test.py b/tensorflow/python/distribute/custom_training_loop_gradient_test.py index ebf5d440c3e..c2ce2caccd0 100644 --- a/tensorflow/python/distribute/custom_training_loop_gradient_test.py +++ b/tensorflow/python/distribute/custom_training_loop_gradient_test.py @@ -111,7 +111,6 @@ class GradientTapeTest(test.TestCase, parameterized.TestCase, return grads return distribution.experimental_local_results( distribution.run(train_step, args=(x,))) - dist_dataset = distribution.experimental_distribute_dataset(dataset) results = [] for x in dist_dataset: diff --git a/tensorflow/python/distribute/strategy_combinations.py b/tensorflow/python/distribute/strategy_combinations.py index e69c8c7f129..ca7a0d6d1fc 100644 --- a/tensorflow/python/distribute/strategy_combinations.py +++ b/tensorflow/python/distribute/strategy_combinations.py @@ -47,7 +47,6 @@ from tensorflow.python.training import ftrl from tensorflow.python.training import gradient_descent from tensorflow.python.training import rmsprop - FLAGS = flags.FLAGS _did_connect_to_cluster = False @@ -58,16 +57,26 @@ def _get_tpu_strategy_creator(steps_per_run, use_single_core=False, **kwargs): def _create_tpu_strategy(): global _did_connect_to_cluster - # These flags will be defined by tpu_test_wrapper.py. - resolver = tpu_cluster_resolver.TPUClusterResolver( - tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "", - zone=hasattr(FLAGS, "zone") and FLAGS.zone or None, - project=hasattr(FLAGS, "project") and FLAGS.project or None, - ) + try: + # Attempt to locally discover the TPU. This will fail for Cloud TPU, in + # which case we fall back to the values passed as flags. + resolver = tpu_cluster_resolver.TPUClusterResolver() + did_automatically_resolve = True + except ValueError: + did_automatically_resolve = False + + # These flags will be defined by tpu_test_wrapper.py. + resolver = tpu_cluster_resolver.TPUClusterResolver( + tpu=hasattr(FLAGS, "tpu") and FLAGS.tpu or "", + zone=hasattr(FLAGS, "zone") and FLAGS.zone or None, + project=hasattr(FLAGS, "project") and FLAGS.project or None, + ) + # Only connect once per process, rather than per test method. 
- if hasattr(FLAGS, "tpu") and FLAGS.tpu and not _did_connect_to_cluster: - remote.connect_to_cluster(resolver) - _did_connect_to_cluster = True + if getattr(FLAGS, "tpu", "") or did_automatically_resolve: + if not _did_connect_to_cluster: + remote.connect_to_cluster(resolver) + _did_connect_to_cluster = True topology = tpu_strategy_util.initialize_tpu_system(resolver) device_assignment = None From 59cebfb7113760158920c1b03a3d1852a7e062a9 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Wed, 27 May 2020 14:24:45 -0700 Subject: [PATCH 1240/1533] Add dill and tblib to RBE image This is needed for tf.distribute multi worker tests. Those tests use multi processes to simulate multiple workers, and need to serialize test arguments and results. pickle is too limited. PiperOrigin-RevId: 313463711 Change-Id: Id01c997bf1b2f2cc98b90357759ecf1c2c0bd7c4 --- tensorflow/tools/ci_build/install/install_pip_packages.sh | 5 +++++ third_party/toolchains/preconfig/generate/containers.bzl | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index c52b94e0f1f..3009213d43a 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -147,3 +147,8 @@ pip3 install --upgrade argparse # tree pip2 install dm-tree pip3 install dm-tree + +# tf.distribute multi worker tests require the following: +# Those tests are Python3 only. +pip3 install --upgrade dill +pip3 install --upgrade tblib diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index 8e6f48df99e..05b233232e3 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -2,14 +2,14 @@ container_digests = { "ubuntu16.04": "sha256:b90dcf2f35f3354909f4491bdf019c110b4b4d95ef0395ebf178bc5d523a4208", "centos6": "sha256:d09c12fb26fbbe8398b4973260c75172eb67d509dae9d6f4ad54279b7d6b0494", - "ubuntu16.04-manylinux2010": "sha256:d5b056506e14eb216b6e27988814617a09dea77ec1ab46972072038f9df3e728", + "ubuntu16.04-manylinux2010": "sha256:5d855d2e9905c3824d71129fbf29696eb18d2237c5d152ab8d23f6882b83f115", "cuda10.0-cudnn7-ubuntu14.04": "sha256:d433e1221f802dac393bc8652fabcc63aa46896cd920bb888ae0e2002fe6b756", "cuda10.0-cudnn7-centos7": "sha256:a453b7147a60928a8345689eae48916a746b3578b5e831bfa151f0529d469c88", "cuda10.0-cudnn7-centos6": "sha256:a1909ba09c703340ee0074ce63dd94fe8fea48035a25264677907a609e2375e0", "cuda10.1-cudnn7-centos6": "sha256:454b899657e87893ee5e68dc0f87df59b6a0a7418ae09cafcc3dd65ac71feca9", "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432", - "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:1e4e888f14a3d5b127151f7970487613a46ca957babe0432786627c78c0b1a36", - "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:13aa5e700bb609521cd4365d4152d7d8f4118cae7ce174ce7d54cc529e21766a", + "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:5e6d21c8ef226316eb6df5e2e6015244c16a8e5d936b52a09820442d2f8a919f", + "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:4dd708781c17a9e8d641c6ad05cc6e235e7147ff70f7b4a2ff6b31af43be4540", "rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe", "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12", } From 
778973620a53a2426127b21dbe9167493a86c5fd Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Wed, 27 May 2020 14:26:11 -0700 Subject: [PATCH 1241/1533] Don't grab `learning_phase` to pass to functional model construction when in v2 & the global learning phase has not been set, because it isn't needed. PiperOrigin-RevId: 313463994 Change-Id: Ice070617d0a82451aaa9f1ea164404263a213c8e --- tensorflow/python/keras/engine/base_layer.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 4a43b0526f6..0eb6954d2a8 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -873,11 +873,6 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # Priority 3a: `learning_phase()` has been set. elif backend.global_learning_phase_is_set(): training_value = backend.learning_phase() - # Priority 3b: Pass the `learning_phase()` if in the Keras FuncGraph. - elif build_graph: - with backend.get_graph().as_default(): - if base_layer_utils.is_in_keras_graph(): - training_value = backend.learning_phase() if self._expects_training_arg and training_value is not None: # Force the training_value to be bool type which matches to the contract @@ -1491,7 +1486,9 @@ class Layer(module.Module, version_utils.LayerVersionSelector): self._metrics.append(metric_obj) else: from tensorflow.python.keras import metrics as metrics_mod # pylint:disable=g-import-not-at-top - metric_obj = metrics_mod.Mean(name=name, dtype=value.dtype) + # Build the metric object with the value's dtype if it defines one + metric_obj = metrics_mod.Mean( + name=name, dtype=getattr(value, 'dtype', None)) self._metrics.append(metric_obj) if should_update_state: From 8dd818b0f60d0cc2cbd432994d3d82f84f2982b1 Mon Sep 17 00:00:00 2001 From: Chenkai Kuang Date: Wed, 27 May 2020 14:28:30 -0700 Subject: [PATCH 1242/1533] Add gpu doc_test that covers all modules under tensorflow.python.distribute excepts tpu_strategy. PiperOrigin-RevId: 313464425 Change-Id: I7f691e66f76d1404b1c5763eb85182e598c85703 --- tensorflow/tools/docs/BUILD | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD index c0442a5986d..1f2dd5d31d2 100644 --- a/tensorflow/tools/docs/BUILD +++ b/tensorflow/tools/docs/BUILD @@ -11,6 +11,8 @@ package( exports_files(["LICENSE"]) +tpu_module = "tpu.,distribute.tpu_strategy" + py_library( name = "tf_doctest_lib", srcs = ["tf_doctest_lib.py"], @@ -23,7 +25,7 @@ py_library( py_test( name = "tf_doctest", srcs = ["tf_doctest.py"], - args = ["--module_prefix_skip=tpu.,distribute.tpu_strategy"], + args = ["--module_prefix_skip=" + tpu_module], python_version = "PY3", tags = [ "no_oss_py2", @@ -45,7 +47,7 @@ py_test( tpu_py_test( name = "tf_doctest_tpu", srcs = ["tf_doctest.py"], - args = ["--module=tpu.,distribute.tpu_strategy"], + args = ["--module=" + tpu_module], disable_experimental = True, disable_v3 = True, main = "tf_doctest.py", @@ -64,6 +66,32 @@ tpu_py_test( ], ) +py_test( + name = "tf_doctest_gpu", + srcs = ["tf_doctest.py"], + args = [ + "--module=distribute.", + "--module_prefix_skip=" + tpu_module, + ], + main = "tf_doctest.py", + python_version = "PY3", + tags = [ + "no_oss_py2", + "no_pip", + "no_rocm", + "no_windows", # numpy prints differently on windows. 
+ "noasan", + "nomsan", + "notsan", + "requires-gpu-nvidia", + ], + deps = [ + ":tf_doctest_lib", + "//tensorflow:tensorflow_py", + "//third_party/py/numpy", + ], +) + py_test( name = "tf_doctest_test", srcs = ["tf_doctest_test.py"], From 4eae0941b70bab5a3d00ce8e077e1ffa32416e4e Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Wed, 27 May 2020 14:31:00 -0700 Subject: [PATCH 1243/1533] Reduce Layer.__call__ overhead by ~3% Reduces method call invocation overhead since these methods are regularly used and reducing Python method call overhead is meaningful here Improvements: - Faster get_default_graph (~3x faster) - Check `if self.stack` rather than `if len(self.stack) >= 1' - Replace expensive calls to `super` and `_GetGlobalDefaultGraph()` with explicit logic. - Faster name_scope (~15% faster) - Remove redundant `name is not None` check. - One str concat operation instead of two (for nested scopes) - Move enter_eager_name_scope logic directly to __enter__ - Use `name[-] == '/'` instead of `name.endswith('/')`, faster for 1 char - Use `ctx.scope_name = old_name` rather than more expensive `setattr(ctx, 'scope_name', old_name)` PiperOrigin-RevId: 313464894 Change-Id: I9721b8af66f1d08c6abf5ebd92d14f15c9a9e8cc --- tensorflow/python/framework/ops.py | 71 ++++++++++++++---------------- 1 file changed, 34 insertions(+), 37 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 5b6dac5be34..b68d613e045 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -5353,7 +5353,7 @@ class _DefaultStack(threading.local): self.stack = [] def get_default(self): - return self.stack[-1] if len(self.stack) >= 1 else None + return self.stack[-1] if self.stack else None def reset(self): self.stack = [] @@ -5541,10 +5541,13 @@ class _DefaultGraphStack(_DefaultStack): # pylint: disable=protected-access def get_default(self): """Override that returns a global default if the stack is empty.""" - ret = super(_DefaultGraphStack, self).get_default() - if ret is None: - ret = self._GetGlobalDefaultGraph() - return ret + if self.stack: + return self.stack[-1] + elif self._global_default_graph: + return self._global_default_graph + else: + self._global_default_graph = Graph() + return self._global_default_graph def _GetGlobalDefaultGraph(self): if self._global_default_graph is None: @@ -6535,24 +6538,6 @@ class name_scope_v1(object): # pylint: disable=invalid-name return self._name_scope.__exit__(*exc_info) -def enter_eager_name_scope(ctx, name): - """Updates the eager context to enter the given name scope.""" - old_name = ctx.scope_name - if not name: - scope_name = "" - else: - if name.endswith("/"): - # A trailing slash breaks out of nested name scopes, indicating a - # fully specified scope name, for compatibility with Graph.name_scope. - scope_name = name - else: - scope_name = name + "/" - if old_name: - scope_name = old_name + scope_name - ctx.scope_name = scope_name - return scope_name, old_name - - @tf_export("name_scope", v1=[]) class name_scope_v2(object): """A context manager for use when defining a Python op. @@ -6575,9 +6560,9 @@ class name_scope_v2(object): When executed, the Tensors `a`, `b`, `c`, will have names `MyOp/a`, `MyOp/b`, and `MyOp/c`. - If the scope name already exists, the name will be made unique by appending - `_n`. For example, calling `my_op` the second time will generate `MyOp_1/a`, - etc. + Inside a `tf.function`, if the scope name already exists, the name will be + made unique by appending `_n`. 
For example, calling `my_op` the second time + will generate `MyOp_1/a`, etc. """ def __init__(self, name): @@ -6587,9 +6572,9 @@ class name_scope_v2(object): name: The prefix to use on all names created within the name scope. Raises: - ValueError: If name is None, or not a string. + ValueError: If name is not a string. """ - if name is None or not isinstance(name, six.string_types): + if not isinstance(name, six.string_types): raise ValueError("name for name_scope must be a string.") self._name = name self._exit_fns = [] @@ -6603,16 +6588,29 @@ class name_scope_v2(object): Returns: The scope name. - - Raises: - ValueError: if neither `name` nor `default_name` is provided - but `values` are. """ ctx = context.context() if ctx.executing_eagerly(): - scope_name, old_scope_name = enter_eager_name_scope(ctx, self._name) - self._exit_fns.append( - lambda *a: setattr(ctx, "scope_name", old_scope_name)) + # Names are not auto-incremented in eager mode. + # A trailing slash breaks out of nested name scopes, indicating a + # fully specified scope name, for compatibility with Graph.name_scope. + # This also prevents auto-incrementing. + old_name = ctx.scope_name + name = self._name + if not name: + scope_name = "" + elif name[-1] == "/": + scope_name = name + elif old_name: + scope_name = old_name + name + "/" + else: + scope_name = name + "/" + ctx.scope_name = scope_name + + def _restore_name_scope(*_): + ctx.scope_name = old_name + + self._exit_fns.append(_restore_name_scope) else: scope = get_default_graph().name_scope(self._name) scope_name = scope.__enter__() @@ -6620,8 +6618,7 @@ class name_scope_v2(object): return scope_name def __exit__(self, type_arg, value_arg, traceback_arg): - exit_fn = self._exit_fns.pop() - exit_fn(type_arg, value_arg, traceback_arg) + self._exit_fns.pop()(type_arg, value_arg, traceback_arg) return False # False values do not suppress exceptions From 0ef1057c2d0850f3380b90d64f1daccce82f0a7c Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 27 May 2020 14:33:20 -0700 Subject: [PATCH 1244/1533] [XLA:SPMD] Fix all-gather transpose when there are more than one sharded dimensions. PiperOrigin-RevId: 313465338 Change-Id: I9c8a2763dea5dbbf1c40e114c8b0b2f25aa9c941 --- tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index eb0a9c330c3..068442ad5c7 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -4605,8 +4605,8 @@ HloInstruction* SpmdPartitioner::AllGatherShards(SpmdBuilder* b, xpose_permutation[i] = i + tiled_dims.size() - split_dims_added; } else { xpose_permutation[i] = split_dims_added; + xpose_permutation[i + 1] = i + tiled_dims.size() - split_dims_added; split_dims_added++; - xpose_permutation[i + 1] = i + tiled_dims.size(); i++; } } From 0101b7cad5016eef7234fe9ba603322719243aee Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 27 May 2020 14:48:52 -0700 Subject: [PATCH 1245/1533] minor bug fixes for unit tests. 
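
A quick sketch of the eager-mode `tf.name_scope` semantics documented in PATCH 1243 above, using only the public API: outside a `tf.function`, scope names are plain string prefixes that are never auto-incremented, and a trailing slash denotes a fully specified scope. This is an illustration of the documented behavior, not part of the patch itself:

    import tensorflow as tf

    with tf.name_scope("outer") as outer:
      print(outer)       # "outer/"
      with tf.name_scope("inner") as inner:
        print(inner)     # "outer/inner/"  (nested scopes concatenate)
      with tf.name_scope("absolute/") as absolute:
        # A trailing slash is treated as a fully specified scope name and
        # breaks out of the enclosing "outer/" scope.
        print(absolute)  # "absolute/"
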
--- tensorflow/core/util/mkl_threadpool.h | 3 ++- third_party/mkl_dnn/mkldnn_v1.BUILD | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/util/mkl_threadpool.h b/tensorflow/core/util/mkl_threadpool.h index da4b516d3b8..493f7732b8f 100644 --- a/tensorflow/core/util/mkl_threadpool.h +++ b/tensorflow/core/util/mkl_threadpool.h @@ -24,7 +24,6 @@ limitations under the License. #include #include #include - #include "mkldnn.hpp" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/threadpool.h" @@ -114,6 +113,7 @@ class MklDnnThreadPoolWrapper { return instance_; } MklDnnThreadPool* CreateThreadPoolPtr(OpKernelContext* ctx) { + mutex_lock l(m_); if (threadpool_map_.empty() || threadpool_map_.find(ctx->device()) == threadpool_map_.end()) { auto tp_iface = new MklDnnThreadPool(ctx); @@ -126,6 +126,7 @@ class MklDnnThreadPoolWrapper { } private: + mutex m_; std::unordered_map threadpool_map_; MklDnnThreadPoolWrapper() {} MklDnnThreadPoolWrapper(const MklDnnThreadPoolWrapper&) = delete; diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD index 313f81c8108..47a7efecda3 100644 --- a/third_party/mkl_dnn/mkldnn_v1.BUILD +++ b/third_party/mkl_dnn/mkldnn_v1.BUILD @@ -86,6 +86,9 @@ cc_library( ]) + if_mkl_v1_open_source_only([ "-UUSE_MKL", "-UUSE_CBLAS", + ]) + if_mkldnn_threadpool([ + "-UUSE_MKL", + "-UUSE_CBLAS", ]) + select({ "@org_tensorflow//tensorflow:linux_x86_64": [ "-fopenmp", # only works with gcc From f82acdd576fe067788aa35a2095f919c4ff9bd2a Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Wed, 27 May 2020 14:37:12 -0700 Subject: [PATCH 1246/1533] [tf.data] Refactoring of tf.data static optimizations. After this CL, Grappler optimizations are applied to tf.data-based tf.functions during function instantiation as opposed to during input pipeline graph construction. This makes the application of Grappler optimizations consistent between tf.data and non-tf.data tf.functions. 
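
To make the terminology above concrete: a "tf.data function" is simply a user-defined function handed to a tf.data transformation such as `map`. The pipeline below is a minimal, hypothetical example; the optimization described above happens inside TensorFlow when `preprocess` is instantiated and is not visible in user code:

    import tensorflow as tf

    def preprocess(x):
      # A typical user-defined tf.data function; its graph is the unit that
      # Grappler now optimizes at instantiation time.
      return tf.cast(x, tf.float32) / 255.0

    ds = tf.data.Dataset.range(8).map(preprocess).batch(4)
    for batch in ds:
      print(batch.numpy())
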
PiperOrigin-RevId: 313466120 Change-Id: I43a6f9bdedc12baad4aca344b462735b066f58e0 --- .../core/grappler/optimizers/data/BUILD | 8 --- .../optimizers/data/meta_optimizer.cc | 53 +++---------------- tensorflow/core/kernels/data/BUILD | 8 ++- .../core/kernels/data/captured_function.cc | 27 ++++++++++ 4 files changed, 39 insertions(+), 57 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD index bab28d44686..a927afc5b30 100644 --- a/tensorflow/core/grappler/optimizers/data/BUILD +++ b/tensorflow/core/grappler/optimizers/data/BUILD @@ -603,16 +603,8 @@ cc_library( "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "//tensorflow/core/grappler/clusters:cluster", - "//tensorflow/core/grappler/optimizers:arithmetic_optimizer", - "//tensorflow/core/grappler/optimizers:common_subgraph_elimination", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", - "//tensorflow/core/grappler/optimizers:dependency_optimizer", - "//tensorflow/core/grappler/optimizers:function_optimizer", - "//tensorflow/core/grappler/optimizers:loop_optimizer", - "//tensorflow/core/grappler/optimizers:model_pruner", - "//tensorflow/core/grappler/optimizers:remapper", - "//tensorflow/core/grappler/optimizers:shape_optimizer", "//tensorflow/core/grappler/utils:functions", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core:framework", diff --git a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc index 3591cd525ac..5804c3ee01a 100644 --- a/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/data/meta_optimizer.cc @@ -21,15 +21,7 @@ limitations under the License. #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/grappler/clusters/cluster.h" #include "tensorflow/core/grappler/grappler_item.h" -#include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h" -#include "tensorflow/core/grappler/optimizers/common_subgraph_elimination.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" -#include "tensorflow/core/grappler/optimizers/dependency_optimizer.h" -#include "tensorflow/core/grappler/optimizers/function_optimizer.h" -#include "tensorflow/core/grappler/optimizers/loop_optimizer.h" -#include "tensorflow/core/grappler/optimizers/model_pruner.h" -#include "tensorflow/core/grappler/optimizers/remapper.h" -#include "tensorflow/core/grappler/optimizers/shape_optimizer.h" #include "tensorflow/core/grappler/utils/functions.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/util/ptr_util.h" @@ -60,14 +52,6 @@ constexpr std::array kTFDataOptimizations = { "slack", "inject_prefetch"}; -// Standard grappler optimizations, in the order we want to perform them. -// The order matches the order in the generic meta optimizer. -constexpr std::array kGrapplerOptimizations = { - "pruning", "function", "common_subgraph_elimination", - "shape", "arithmetic", "layout_optimizer", - "remapper", "loop", "dependency", -}; - // Parses a list of string optimizer configurations into a map from // optimizer name -> rewriter config for that optimizer. 
Status ToConfigMap( @@ -118,11 +102,6 @@ Status TFDataMetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, ApplyOptimization(optimization, cluster, &optimized_item)); } - for (const auto& optimization : kGrapplerOptimizations) { - TF_RETURN_IF_ERROR( - ApplyOptimization(optimization, cluster, &optimized_item)); - } - // Store the final result of all the optimizations in `output`. output->Swap(&optimized_item.graph); @@ -132,16 +111,17 @@ Status TFDataMetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, .ReachableDefinitions(*output); const auto producer = output->versions().producer(); bool optimized_functions = false; - for (const FunctionDef& func : output->library().function()) { + for (const auto& name : flib.ListFunctionNames()) { + auto* func = flib.Find(name); // Skip non tf.data functions. - if (!func.attr().contains(data::kTFDataFunction)) continue; - VLOG(3) << "Optimize function: function=" << func.signature().name(); + if (!func->attr().contains(data::kTFDataFunction)) continue; + VLOG(3) << "Optimize function: function=" << func->signature().name(); optimized_functions = true; // Make a GrapplerItem from a FunctionDef. GrapplerFunctionItem func_item; TF_RETURN_IF_ERROR( - MakeGrapplerFunctionItem(func, flib, producer, &func_item)); + MakeGrapplerFunctionItem(*func, flib, producer, &func_item)); GraphDef optimized_func_graph; TF_RETURN_IF_ERROR(Optimize(cluster, func_item, &optimized_func_graph)); @@ -162,7 +142,7 @@ Status TFDataMetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item, // Replace optimized function with a new FunctionDef. TF_RETURN_IF_ERROR( - flib.ReplaceFunction(func.signature().name(), optimized_func)); + flib.ReplaceFunction(func->signature().name(), optimized_func)); } if (optimized_functions) { *output->mutable_library() = flib.ToProto(); @@ -221,27 +201,6 @@ Status TFDataMetaOptimizer::Init( } } - // Enable a subset of grappler optimization that are enabled by default. - // - // Layout optimizations are excluded because they assume that ops without - // explicit device assignment will be placed on GPU (if available) but that's - // not the case for operations within tf.data functions. - // - // TODO(b/120437209): Re-enable constant folding. - // - // TODO(jsimsa): Make the set of generic Grappler optimization applied to - // tf.data functions configurable. 
- enabled_optimizers_["pruning"] = MakeUnique(); - enabled_optimizers_["shape"] = MakeUnique(); - enabled_optimizers_["remapping"] = MakeUnique(RewriterConfig::ON); - enabled_optimizers_["common_subgraph_elimination"] = - MakeUnique(); - enabled_optimizers_["arithmetic"] = MakeUnique(); - enabled_optimizers_["dependency"] = MakeUnique(); - enabled_optimizers_["loop"] = MakeUnique(); - enabled_optimizers_["function"] = MakeUnique( - RewriterConfig::ON, /*lower_control_flow=*/true); - return Status::OK(); } diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index d088abc00e6..6d0351202df 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -3,6 +3,7 @@ load( "//tensorflow:tensorflow.bzl", + "if_not_mobile", "tf_cc_test", "tf_kernel_library", ) @@ -150,6 +151,7 @@ cc_library( ":dataset_utils", ":single_threaded_executor", ":stats_utils", + "@com_google_absl//absl/time", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", @@ -158,8 +160,10 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/kernels:variable_ops", "//tensorflow/core/profiler/lib:traceme", - "@com_google_absl//absl/time", - ], + ] + if_not_mobile([ + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/optimizers:meta_optimizer", + ]), ) cc_library( diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index dd64475d7d6..d79cb25ec8b 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -35,6 +35,11 @@ limitations under the License. #include "tensorflow/core/platform/notification.h" #include "tensorflow/core/profiler/lib/traceme.h" +#if !defined(IS_MOBILE_PLATFORM) +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/optimizers/meta_optimizer.h" +#endif // !IS_MOBILE_PLATFORM + namespace tensorflow { namespace data { namespace { @@ -612,6 +617,28 @@ Status CapturedFunction::Instantiate( for (size_t i = 0; i < fdef->signature().output_arg_size(); ++i) { inst_opts.output_devices.push_back(inst_opts.target); } + +#if !defined(IS_MOBILE_PLATFORM) + grappler::GrapplerItem::OptimizationOptions optimization_options; + optimization_options.allow_pruning_stateful_and_dataset_ops = false; + ConfigProto config_proto = inst_opts.config_proto; + // Layout optimizations are excluded because they assume that ops without + // explicit device assignment will be placed on GPU (if available) but + // that's not the case for operations within tf.data functions. + config_proto.mutable_graph_options() + ->mutable_rewrite_options() + ->set_layout_optimizer(RewriterConfig::OFF); + // TODO(b/120437209): Re-enable constant folding. 
+ config_proto.mutable_graph_options() + ->mutable_rewrite_options() + ->set_constant_folding(RewriterConfig::OFF); + inst_opts.optimize_graph_fn = + std::bind(tensorflow::grappler::OptimizeGraph, std::placeholders::_1, + std::placeholders::_2, std::placeholders::_3, + std::placeholders::_4, std::placeholders::_5, + std::move(config_proto), fdef->signature().name(), + std::move(optimization_options), std::placeholders::_6); +#endif // !IS_MOBILE_PLATFORM } FunctionLibraryRuntime::Handle f_handle; From b3925cc9b62775904734eb789da98bd3d0a9dd0b Mon Sep 17 00:00:00 2001 From: Yanhui Liang Date: Wed, 27 May 2020 14:42:39 -0700 Subject: [PATCH 1247/1533] Initialize benchmark/ directory to hold all Keras benchmarks. PiperOrigin-RevId: 313467179 Change-Id: I06f3505182f67eab4866e1cbfa6043588a6eecc1 --- tensorflow/python/keras/benchmark/README.md | 3 +++ tensorflow/python/keras/benchmark/__init__.py | 18 ++++++++++++++++++ 2 files changed, 21 insertions(+) create mode 100644 tensorflow/python/keras/benchmark/README.md create mode 100644 tensorflow/python/keras/benchmark/__init__.py diff --git a/tensorflow/python/keras/benchmark/README.md b/tensorflow/python/keras/benchmark/README.md new file mode 100644 index 00000000000..17e458b3a77 --- /dev/null +++ b/tensorflow/python/keras/benchmark/README.md @@ -0,0 +1,3 @@ +# Keras Benchmark + +This package contains benchmarks on Keras models and components. diff --git a/tensorflow/python/keras/benchmark/__init__.py b/tensorflow/python/keras/benchmark/__init__.py new file mode 100644 index 00000000000..a70e59e1834 --- /dev/null +++ b/tensorflow/python/keras/benchmark/__init__.py @@ -0,0 +1,18 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras Benchmark.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function From 3bb7d689d301cbbb5cd752f1ea3f19c7bdbc99df Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Wed, 27 May 2020 14:57:12 -0700 Subject: [PATCH 1248/1533] Populate the memory access breakdown information in OpMetrics. 
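
Referring back to the new Keras benchmark package introduced in PATCH 1247 above: that package is created empty, so the following is only a hypothetical sketch of the kind of benchmark it is meant to hold, assuming the conventional `tf.test.Benchmark` base class (the class name, layer, shapes, and iteration count are all illustrative):

    import time
    import tensorflow as tf

    class DenseLayerBenchmark(tf.test.Benchmark):
      """Hypothetical micro-benchmark for a Keras layer call."""

      def benchmark_dense_call(self):
        layer = tf.keras.layers.Dense(64)
        x = tf.random.normal((32, 128))
        layer(x)  # Build the layer outside the timed loop.
        start = time.time()
        for _ in range(100):
          layer(x)
        self.report_benchmark(iters=100,
                              wall_time=(time.time() - start) / 100)
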
PiperOrigin-RevId: 313469967 Change-Id: Ib8783f7a4a81cbda79a32410440866946972a1f2 --- tensorflow/core/profiler/utils/op_utils.cc | 15 ++++++++------- tensorflow/core/profiler/utils/op_utils.h | 6 +++++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/profiler/utils/op_utils.cc b/tensorflow/core/profiler/utils/op_utils.cc index 921e0617902..75789bc1071 100644 --- a/tensorflow/core/profiler/utils/op_utils.cc +++ b/tensorflow/core/profiler/utils/op_utils.cc @@ -63,13 +63,11 @@ void HostOpMetricsDbBuilder::UpdateHostInfeedEnqInfo( start_timestamp_ps_diff); } -void DeviceOpMetricsDbBuilder::EnterOp(uint64 program_id, - absl::string_view name, - absl::string_view category, - absl::string_view provenance, - bool is_eager, uint64 occurrences, - uint64 time_ps, uint64 children_time_ps, - int64 flops, int64 bytes_accessed) { +void DeviceOpMetricsDbBuilder::EnterOp( + uint64 program_id, absl::string_view name, absl::string_view category, + absl::string_view provenance, bool is_eager, uint64 occurrences, + uint64 time_ps, uint64 children_time_ps, int64 flops, int64 bytes_accessed, + const std::vector& memory_accessed_breakdown) { uint64 self_time_ps = time_ps - children_time_ps; DCHECK_GE(time_ps, self_time_ps); OpMetrics* op_metrics = LookupOrInsertNewOpMetrics(program_id, name); @@ -89,6 +87,9 @@ void DeviceOpMetricsDbBuilder::EnterOp(uint64 program_id, op_metrics->bytes_accessed() + GetCappedPerf(bytes_accessed * occurrences, self_time_ps, peak_hbm_bw_giga_bytes_per_second_ / 1000)); + for (const auto& memory_accessed : memory_accessed_breakdown) { + *op_metrics->add_memory_accessed_breakdown() = memory_accessed; + } db()->set_total_op_time_ps(db()->total_op_time_ps() + self_time_ps); } diff --git a/tensorflow/core/profiler/utils/op_utils.h b/tensorflow/core/profiler/utils/op_utils.h index f94328d1b8d..9c9762853b8 100644 --- a/tensorflow/core/profiler/utils/op_utils.h +++ b/tensorflow/core/profiler/utils/op_utils.h @@ -69,10 +69,14 @@ class DeviceOpMetricsDbBuilder : public OpMetricsDbBuilder { // picoseconds. // flops = the number of floating-point operations computed. // bytes_accessed = the sum of bytes read and bytes written by this OP. + // memory_accessed_breakdown = the breakdown of memory accessed by operation + // type and memory space. void EnterOp(uint64 program_id, absl::string_view name, absl::string_view category, absl::string_view provenance, bool is_eager, uint64 occurrences, uint64 time_ps, - uint64 children_time_ps, int64 flops, int64 bytes_accessed); + uint64 children_time_ps, int64 flops, int64 bytes_accessed, + const std::vector& + memory_accessed_breakdown = {}); protected: // Peak performance of a TensorCore or a GPU in TFLOP/s. From 43862c55a34292ba25c0539c27c2aec9e3232947 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 14:58:34 -0700 Subject: [PATCH 1249/1533] Enable ROI v2 ops: TransformTensorBilinear, TransformLandmarks, ROI2TransformMatrix. 
PiperOrigin-RevId: 313470232 Change-Id: Id581304ec9313070369c68bffe2fa12690e45c0b --- tensorflow/lite/delegates/gpu/common/model_builder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index daedc277869..061c65095eb 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2453,15 +2453,15 @@ class TransformLandmarksV2OperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddOutputs(node)); std::string op_name = "transform_landmarks_v2"; node->operation.type = op_name; - BHWC output_shape; + + auto output_value = graph->FindOutputs(node->id)[0]; + output_value->tensor.shape = graph->FindInputs(node->id)[0]->tensor.shape; + BHWC output_shape = output_value->tensor.shape; RETURN_IF_ERROR( ParseCustomAttributes(op_name, tflite_node->custom_initial_data, tflite_node->custom_initial_data_size, &(node->operation.attributes), &output_shape)); - auto output_value = graph->FindOutputs(node->id)[0]; - - output_value->tensor.shape = graph->FindInputs(node->id)[0]->tensor.shape; return absl::OkStatus(); } From d807bfcb6460b8cf4d48137c2c16be43826d816d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 15:00:03 -0700 Subject: [PATCH 1250/1533] Stop explicitly passing a `name` arg to `slice` from `slice_helper`. (while leaving in the name_scopes). This makes the slice operator more dispatch-friendly, because `slice` dispatches while `slice_helper` doesn't. This means the `dispatched` operation no longer has a name locked in and is less likely to run into issues where multiple ops are created w/ the same name. PiperOrigin-RevId: 313470510 Change-Id: Id26a5713258124d8a02ad4938f444f3f8103e528 --- tensorflow/python/ops/array_ops.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 118c2cfca55..a641633b1f5 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -983,7 +983,7 @@ def _slice_helper(tensor, slice_spec, var=None): with ops.name_scope( None, "strided_slice", [tensor] + begin + end + strides, - skip_on_eager=False): + skip_on_eager=False) as name: if begin: packed_begin, packed_end, packed_strides = (stack(begin), stack(end), stack(strides)) @@ -1009,7 +1009,8 @@ def _slice_helper(tensor, slice_spec, var=None): shrink_axis_mask=shrink_axis_mask, new_axis_mask=new_axis_mask, ellipsis_mask=ellipsis_mask, - var=var) + var=var, + name=name) # pylint: disable=undefined-variable,protected-access,redefined-outer-name @@ -1193,7 +1194,7 @@ def strided_slice(input_, if var is None: raise ValueError("Sliced assignment is only supported for variables") else: - if name is None and parent_name: + if name is None: name = parent_name + "_assign" return var._strided_slice_assign( From ad8a4e1bdaf172a6490552c0bb072207fbaf08c0 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 27 May 2020 15:06:00 -0700 Subject: [PATCH 1251/1533] [XLA:SPMD] Halo exchange beyond direct neighbors PiperOrigin-RevId: 313471820 Change-Id: Ie0c4fa412dff534ebae462726dd880c2e7093d40 --- .../xla/service/spmd/spmd_partitioner_test.cc | 37 ++++++++++++ .../xla/service/spmd/spmd_partitioner_util.cc | 59 ++++++++++--------- 2 files changed, 67 insertions(+), 29 deletions(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc 
b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 55d7dc43785..e766695385b 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -649,6 +649,43 @@ ENTRY entry { op::ReduceWindow(masked, op::Constant()))); } +TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideHaloBeyondNeighbor) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + param = f32[9,2] parameter(0), sharding={devices=[5,1]0,1,2,3,4} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[5,2]{1,0} reduce-window(param, constant.1), + window={size=4x1 stride=2x1 pad=3_0x0_0}, to_apply=sum, + sharding={devices=[5,1]0,1,2,3,4} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/5)); + VLOG(1) << module->ToString(); + auto halo0 = AllOf(op::Shape("f32[1,2]"), + op::CollectivePermute(op::Slice(op::Parameter(0)))); + auto halo1 = + AllOf(op::Shape("f32[2,2]"), op::CollectivePermute(op::Parameter(0))); + auto pre_mask = + AllOf(op::Shape("f32[4,2]"), + op::Slice(AllOf(op::Shape("f32[5,2]"), + op::Concatenate(halo0, halo1, op::Parameter(0))))); + auto masked = + op::Select(op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply())), + op::Broadcast(op::Constant())), + pre_mask, op::Broadcast(op::Constant())); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideUnequalHalo) { const char* const hlo_string = R"( HloModule module diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index 57617b59ffb..8db2ca84a05 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" +#include + #include "absl/types/optional.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -23,6 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -407,33 +410,30 @@ absl::optional ExchangeHalo( std::vector concat_pieces; int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); - if (max_left_halo_size > input_shard_size) { - VLOG(1) << "ExchangeHalo failed: halo is beyond the left neighbor."; - return absl::nullopt; - } - if (max_left_halo_size > 0) { + for (int64 i = CeilOfRatio(max_left_halo_size, input_shard_size) - 1; i >= 0; + --i) { std::vector> source_target_pairs; target.tile_assignment().Each( [&](absl::Span indices, int64 device) { - if (indices[dim] > 0) { + if (indices[dim] > i) { std::vector source_indices(indices.begin(), indices.end()); - source_indices[dim] -= 1; + source_indices[dim] -= i + 1; source_target_pairs.emplace_back( target.tile_assignment()(source_indices), device); } }); + int64 halo_size = + std::min(max_left_halo_size - input_shard_size * i, input_shard_size); auto halo_shape = hlo->shape(); auto source_halo_slice = hlo; - if (max_left_halo_size != hlo->shape().dimensions(dim)) { - halo_shape.set_dimensions(dim, max_left_halo_size); + if (halo_size != hlo->shape().dimensions(dim)) { + halo_shape.set_dimensions(dim, halo_size); std::vector halo_start_indices(halo_shape.rank(), 0); - halo_start_indices[dim] = - hlo->shape().dimensions(dim) - max_left_halo_size; + halo_start_indices[dim] = hlo->shape().dimensions(dim) - halo_size; std::vector halo_slice_strides(halo_shape.rank(), 1); - - source_halo_slice = b->AddInstruction( - hlo->CreateSlice(halo_shape, hlo, halo_start_indices, - hlo->shape().dimensions(), halo_slice_strides)); + source_halo_slice = b->AddInstruction(HloInstruction::CreateSlice( + halo_shape, hlo, halo_start_indices, hlo->shape().dimensions(), + halo_slice_strides)); } auto left_halo = collective_ops_creator.create_cross_partition_collective_permute( @@ -446,29 +446,30 @@ absl::optional ExchangeHalo( // Right halo. 
int64 max_right_halo_size = right_halo_size_function.MaxInRange(0, shard_count - 1); - if (max_right_halo_size > input_shard_size) { - VLOG(1) << "ExchangeHalo failed: halo is beyond the right neighbor."; - return absl::nullopt; - } - if (max_right_halo_size > 0) { + for (int64 i = 0; i < CeilOfRatio(max_right_halo_size, input_shard_size); + ++i) { std::vector> source_target_pairs; target.tile_assignment().Each( [&](absl::Span indices, int64 device) { - if (indices[dim] > 0) { + if (indices[dim] > i) { std::vector target_indices(indices.begin(), indices.end()); - target_indices[dim] -= 1; + target_indices[dim] -= i + 1; source_target_pairs.emplace_back( device, target.tile_assignment()(target_indices)); } }); + int64 halo_size = + std::min(max_right_halo_size - input_shard_size * i, input_shard_size); auto halo_shape = hlo->shape(); - halo_shape.set_dimensions(dim, max_right_halo_size); - std::vector halo_start_indices(halo_shape.rank(), 0); - std::vector halo_slice_strides(halo_shape.rank(), 1); - - auto source_halo_slice = b->AddInstruction( - hlo->CreateSlice(halo_shape, hlo, halo_start_indices, - halo_shape.dimensions(), halo_slice_strides)); + HloInstruction* source_halo_slice = hlo; + if (halo_size != halo_shape.dimensions(dim)) { + halo_shape.set_dimensions(dim, halo_size); + std::vector halo_start_indices(halo_shape.rank(), 0); + std::vector halo_slice_strides(halo_shape.rank(), 1); + source_halo_slice = b->AddInstruction(HloInstruction::CreateSlice( + halo_shape, hlo, halo_start_indices, halo_shape.dimensions(), + halo_slice_strides)); + } auto right_halo = collective_ops_creator.create_cross_partition_collective_permute( b, source_halo_slice, source_target_pairs, (*next_channel_id)++); From 75fc53e381ab874eed2eafb13bfdd42dcfdcd440 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Wed, 27 May 2020 15:11:06 -0700 Subject: [PATCH 1252/1533] Introduce op attribute 'enable_large_batch_splitting' and keep default behavior consistent. Plug 'enable_large_batch_splitting' into queue options, so Queue can split input within its 'Schedule' method. PiperOrigin-RevId: 313472810 Change-Id: I72753d3fc31d887d77d2015280b7b1628ba2c0aa --- .../base_api/api_def_BatchFunction.pbtxt | 7 ++++++ tensorflow/core/kernels/batch_kernels.cc | 23 +++++++++++++++---- .../batching_util/shared_batch_scheduler.h | 4 ++++ tensorflow/core/ops/batch_ops.cc | 19 +++++++++++++++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 2 +- 6 files changed, 50 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt index 09eff6177b1..ae5942b3617 100644 --- a/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt @@ -84,6 +84,13 @@ END name: "Tout" description: <& allowed_batch_sizes, FunctionLibraryRuntime::Handle fhandle, + bool enable_large_batch_splitting, std::unique_ptr* resource) { std::unique_ptr new_resource(new BatchResource); @@ -286,6 +287,10 @@ class BatchResource : public ResourceBase { new_resource->batcher_queue_options_.batch_timeout_micros = batch_timeout_micros; + // Support for splitting large batch is still in progress. 
+ new_resource->batcher_queue_options_.enable_large_batch_splitting = + enable_large_batch_splitting; + new_resource->allowed_batch_sizes_ = allowed_batch_sizes; new_resource->fhandle_ = fhandle; @@ -786,6 +791,13 @@ class BatchFunctionKernel : public AsyncOpKernel { OP_REQUIRES_OK(c, c->GetAttr("f", &func)); OP_REQUIRES_OK( c, lib->Instantiate(func.name(), AttrSlice(&func.attr()), &fhandle_)); + + if (c->HasAttr("enable_large_batch_splitting")) { + OP_REQUIRES_OK(c, c->GetAttr("enable_large_batch_splitting", + &enable_large_batch_splitting_)); + } else { + enable_large_batch_splitting_ = false; + } } bool IsExpensive() override { return false; } @@ -794,10 +806,10 @@ class BatchFunctionKernel : public AsyncOpKernel { BatchResource* br; std::function creator = [this](BatchResource** r) { std::unique_ptr new_resource; - TF_RETURN_IF_ERROR( - BatchResource::Create(num_batch_threads_, max_batch_size_, - batch_timeout_micros_, max_enqueued_batches_, - allowed_batch_sizes_, fhandle_, &new_resource)); + TF_RETURN_IF_ERROR(BatchResource::Create( + num_batch_threads_, max_batch_size_, batch_timeout_micros_, + max_enqueued_batches_, allowed_batch_sizes_, fhandle_, + enable_large_batch_splitting_, &new_resource)); *r = new_resource.release(); return Status::OK(); }; @@ -844,6 +856,7 @@ class BatchFunctionKernel : public AsyncOpKernel { int32 max_enqueued_batches_; std::vector allowed_batch_sizes_; FunctionLibraryRuntime::Handle fhandle_; + bool enable_large_batch_splitting_; }; REGISTER_KERNEL_BUILDER(Name("BatchFunction").Device(DEVICE_CPU), @@ -876,7 +889,7 @@ class BatchKernel : public AsyncOpKernel { std::unique_ptr new_resource; TF_RETURN_IF_ERROR(BatchResource::Create( num_batch_threads_, max_batch_size_, batch_timeout_micros_, - max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle, + max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle, false, &new_resource)); *r = new_resource.release(); return Status::OK(); diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h index c44de023ced..66bdff933d8 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h @@ -160,6 +160,10 @@ class SharedBatchScheduler // See the class documentation above for guidelines on how to tune this // parameter. size_t max_enqueued_batches = 10; + + // If true, queue implementation would split one input batch task into + // subtasks and fit them into different batches. + bool enable_large_batch_splitting = false; }; Status AddQueue(const QueueOptions& options, std::function>)> diff --git a/tensorflow/core/ops/batch_ops.cc b/tensorflow/core/ops/batch_ops.cc index ba7faeb5e8a..cfa4049938d 100644 --- a/tensorflow/core/ops/batch_ops.cc +++ b/tensorflow/core/ops/batch_ops.cc @@ -25,6 +25,19 @@ REGISTER_OP("BatchFunction") .Output("out_tensors: Tout") .Attr("f: func") .Attr("num_batch_threads: int") + // 'max_batch_size' denotes the maximum batch size acceptable, i.e., inputs + // with larger batch size are simply invalidated. + // By default, 'max_batch_size' must be equal to max value of + // 'allowed_batch_sizes'. + // By setting 'enable_large_batch_splitting' (attribute below) to true, + // 'max_batch_size' can be greater than or equal to max value of + // 'allowed_batch_sizes', in other words, + // 1) input with size > 'max_batch_size' is still invalidated. 
+ // 2) input with + // a) size <= 'max_batch_size' + // b) size > max value of 'allowed_batch_sizes' + // will automatically be split into multiple batches (with batch size in + // 'allowed_batch_sizes'), executed, and re-composed (as final output). .Attr("max_batch_size: int") .Attr("batch_timeout_micros: int") .Attr("max_enqueued_batches: int = 10") @@ -35,6 +48,12 @@ REGISTER_OP("BatchFunction") .Attr("Tin: list(type)") .Attr("Tcaptured: list(type) >= 0") .Attr("Tout: list(type)") + // If 'enable_large_batch_splitting' is true, for input batches exceeding + // the largest value in "allowed_batch_sizes", allow the batch to be split + // into multiple batches with batch size within "allowed_batch_sizes". + // NOTE: Support for `enable_large_batch_splitting == true` is still + // developed in progress. + .Attr("enable_large_batch_splitting: bool = false") // TODO(apassos): Fix this shape inference function. It requires shape // inference of function calls. .SetShapeFn(shape_inference::UnknownShape); diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 37a95cc88d1..a8efb9e59b5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -342,7 +342,7 @@ tf_module { } member_method { name: "BatchFunction" - argspec: "args=[\'in_tensors\', \'captured_tensors\', \'f\', \'num_batch_threads\', \'max_batch_size\', \'batch_timeout_micros\', \'Tout\', \'max_enqueued_batches\', \'allowed_batch_sizes\', \'container\', \'shared_name\', \'batching_queue\', \'name\'], varargs=None, keywords=None, defaults=[\'10\', \'[]\', \'\', \'\', \'\', \'None\'], " + argspec: "args=[\'in_tensors\', \'captured_tensors\', \'f\', \'num_batch_threads\', \'max_batch_size\', \'batch_timeout_micros\', \'Tout\', \'max_enqueued_batches\', \'allowed_batch_sizes\', \'container\', \'shared_name\', \'batching_queue\', \'enable_large_batch_splitting\', \'name\'], varargs=None, keywords=None, defaults=[\'10\', \'[]\', \'\', \'\', \'\', \'False\', \'None\'], " } member_method { name: "BatchIFFT" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 37a95cc88d1..a8efb9e59b5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -342,7 +342,7 @@ tf_module { } member_method { name: "BatchFunction" - argspec: "args=[\'in_tensors\', \'captured_tensors\', \'f\', \'num_batch_threads\', \'max_batch_size\', \'batch_timeout_micros\', \'Tout\', \'max_enqueued_batches\', \'allowed_batch_sizes\', \'container\', \'shared_name\', \'batching_queue\', \'name\'], varargs=None, keywords=None, defaults=[\'10\', \'[]\', \'\', \'\', \'\', \'None\'], " + argspec: "args=[\'in_tensors\', \'captured_tensors\', \'f\', \'num_batch_threads\', \'max_batch_size\', \'batch_timeout_micros\', \'Tout\', \'max_enqueued_batches\', \'allowed_batch_sizes\', \'container\', \'shared_name\', \'batching_queue\', \'enable_large_batch_splitting\', \'name\'], varargs=None, keywords=None, defaults=[\'10\', \'[]\', \'\', \'\', \'\', \'False\', \'None\'], " } member_method { name: "BatchIFFT" From 8b1f77197a800fb86c32e252b38b472acca1b25f Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 27 May 2020 15:14:59 -0700 Subject: [PATCH 1253/1533] Internal change PiperOrigin-RevId: 313473478 Change-Id: I53882ee616693d8f3006870786c153e470b4b939 --- tensorflow/python/kernel_tests/random/random_ops_test.py | 8 -------- tensorflow/python/ops/random_ops.py | 6 ------ 2 files changed, 14 deletions(-) diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py index 1d3fdec3179..73c8bd09db0 100644 --- a/tensorflow/python/kernel_tests/random/random_ops_test.py +++ b/tensorflow/python/kernel_tests/random/random_ops_test.py @@ -23,7 +23,6 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.python.eager import context from tensorflow.python.framework import dtypes -from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.framework import test_util @@ -416,13 +415,6 @@ class RandomUniformTest(RandomOpTestCommon): use_gpu=use_gpu, graph_seed=965) - def testUniformWithInvalidMaxMindShape(self): - # Test case for GitHub issue 34363. - with self.assertRaises( - (errors.InvalidArgumentError, errors.UnknownError, ValueError)): - array = array_ops.zeros(shape=(1,)) - random_ops.random_uniform(shape=(), minval=array) - class RandomShapeTest(test.TestCase): diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py index 30d8e45b2c4..1af91ed0dd3 100644 --- a/tensorflow/python/ops/random_ops.py +++ b/tensorflow/python/ops/random_ops.py @@ -304,12 +304,6 @@ def random_uniform(shape, if not maxval_is_one: result = math_ops.multiply(result, maxval) else: - # Use explicit "broadcast_to" so that any shape incompatibility - # are returned with InvalidArgument error. - # This prevent "slient broadcast" that may cause the shape of - # result "overflow" when minval or maxval is larger than expected shape - maxval = array_ops.broadcast_to(maxval, shape) - minval = array_ops.broadcast_to(minval, shape) result = math_ops.add(result * (maxval - minval), minval, name=name) # TODO(b/132092188): C++ shape inference inside functional ops does not # cross FuncGraph boundaries since that information is only available in From 844497338b6f9f6f334a39993cd0657347944963 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 15:18:47 -0700 Subject: [PATCH 1254/1533] Update ops-related pbtxt files. 
PiperOrigin-RevId: 313474152 Change-Id: Ib47e52d03ea1d9a6da1e1672dc9e7d761105fb73 --- .../compat/ops_history_v2/BatchFunction.pbtxt | 91 +++++++++++++++++++ .../ops_history_v2/SparseSegmentMean.pbtxt | 56 ++++++++++++ .../SparseSegmentMeanWithNumSegments.pbtxt | 73 +++++++++++++++ .../ops_history_v2/SparseSegmentSqrtN.pbtxt | 56 ++++++++++++ .../SparseSegmentSqrtNWithNumSegments.pbtxt | 73 +++++++++++++++ tensorflow/core/ops/ops.pbtxt | 11 +++ 6 files changed, 360 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history_v2/BatchFunction.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/BatchFunction.pbtxt index daf3c4692b8..1a3cb96431d 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/BatchFunction.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/BatchFunction.pbtxt @@ -82,3 +82,94 @@ op { minimum: 1 } } +op { + name: "BatchFunction" + input_arg { + name: "in_tensors" + type_list_attr: "Tin" + } + input_arg { + name: "captured_tensors" + type_list_attr: "Tcaptured" + } + output_arg { + name: "out_tensors" + type_list_attr: "Tout" + } + attr { + name: "f" + type: "func" + } + attr { + name: "num_batch_threads" + type: "int" + } + attr { + name: "max_batch_size" + type: "int" + } + attr { + name: "batch_timeout_micros" + type: "int" + } + attr { + name: "max_enqueued_batches" + type: "int" + default_value { + i: 10 + } + } + attr { + name: "allowed_batch_sizes" + type: "list(int)" + default_value { + list { + } + } + } + attr { + name: "container" + type: "string" + default_value { + s: "" + } + } + attr { + name: "shared_name" + type: "string" + default_value { + s: "" + } + } + attr { + name: "batching_queue" + type: "string" + default_value { + s: "" + } + } + attr { + name: "Tin" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "Tcaptured" + type: "list(type)" + has_minimum: true + } + attr { + name: "Tout" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "enable_large_batch_splitting" + type: "bool" + default_value { + b: false + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt index 526c2c25c04..31b04288b75 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMean.pbtxt @@ -53,3 +53,59 @@ op { } } } +op { + name: "SparseSegmentMean" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tidx" + } + input_arg { + name: "segment_ids" + type_attr: "Tsegmentids" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tsegmentids" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt index b9984f8df25..1fe616e3552 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentMeanWithNumSegments.pbtxt @@ -70,3 +70,76 @@ op { } } } +op { + name: 
"SparseSegmentMeanWithNumSegments" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tidx" + } + input_arg { + name: "segment_ids" + type_attr: "Tsegmentids" + } + input_arg { + name: "num_segments" + type_attr: "Tnumsegments" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tnumsegments" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tsegmentids" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt index 17562d4f333..581fad0b5d8 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtN.pbtxt @@ -53,3 +53,59 @@ op { } } } +op { + name: "SparseSegmentSqrtN" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tidx" + } + input_arg { + name: "segment_ids" + type_attr: "Tsegmentids" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tsegmentids" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt index 1f24446a587..1ac5edbd39a 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseSegmentSqrtNWithNumSegments.pbtxt @@ -70,3 +70,76 @@ op { } } } +op { + name: "SparseSegmentSqrtNWithNumSegments" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "indices" + type_attr: "Tidx" + } + input_arg { + name: "segment_ids" + type_attr: "Tsegmentids" + } + input_arg { + name: "num_segments" + type_attr: "Tnumsegments" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BFLOAT16 + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tnumsegments" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "Tsegmentids" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index e2f2e3d00fa..d2a14590bc5 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ 
b/tensorflow/core/ops/ops.pbtxt @@ -3553,6 +3553,13 @@ op { has_minimum: true minimum: 1 } + attr { + name: "enable_large_batch_splitting" + type: "bool" + default_value { + b: false + } + } } op { name: "BatchIFFT" @@ -46097,6 +46104,7 @@ op { type: "type" allowed_values { list { + type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46215,6 +46223,7 @@ op { type: "type" allowed_values { list { + type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46283,6 +46292,7 @@ op { type: "type" allowed_values { list { + type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } @@ -46401,6 +46411,7 @@ op { type: "type" allowed_values { list { + type: DT_BFLOAT16 type: DT_FLOAT type: DT_DOUBLE } From a3ebe7d1aae995ee5f914d9b23e6d7bdaaa0c045 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 15:23:34 -0700 Subject: [PATCH 1255/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/5b4cd2d4c423 Update for simple return types that can be inferred now. PiperOrigin-RevId: 313475038 Change-Id: Ieedea5d120e1f4e00ff6ec21d3718370b78a885c --- third_party/mlir/BUILD | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index a57088432e2..df875ebb62d 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -2895,6 +2895,14 @@ gentbl( "-gen-op-defs", "include/mlir/Dialect/OpenMP/OpenMPOps.cpp.inc", ), + ( + "-gen-enum-decls", + "include/mlir/Dialect/OpenMP/OpenMPOpsEnums.h.inc", + ), + ( + "-gen-enum-defs", + "include/mlir/Dialect/OpenMP/OpenMPOpsEnums.cpp.inc", + ), ( "-gen-dialect-decls", "include/mlir/Dialect/OpenMP/OpenMPOpsDialect.h.inc", @@ -2926,6 +2934,7 @@ cc_library( deps = [ ":IR", ":OpenMPOpsIncGen", + ":StandardOps", "@llvm-project//llvm:support", ], ) From cbc64487756332683ca2d5ef3a0e70ca27ac41dd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 15:31:23 -0700 Subject: [PATCH 1256/1533] Register uint8, int16, and uint16 for strided slice on GPU, such that it supports the same types as on CPU. PiperOrigin-RevId: 313476317 Change-Id: Ica1723c1f9ea0e71990d7647da723c5dc38a2cc4 --- .../kernels/strided_slice_op_gpu_int.cu.cc | 4 +-- .../python/kernel_tests/slice_op_test.py | 29 ++++++++++--------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc b/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc index d86c7b2fe23..02dd9259a76 100644 --- a/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc +++ b/tensorflow/core/kernels/strided_slice_op_gpu_int.cu.cc @@ -21,9 +21,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/strided_slice_op_gpu_impl.h" namespace tensorflow { -TF_CALL_int8(DEFINE_GPU_KERNELS); -TF_CALL_int32(DEFINE_GPU_KERNELS); -TF_CALL_int64(DEFINE_GPU_KERNELS); +TF_CALL_INTEGRAL_TYPES(DEFINE_GPU_KERNELS); } // end namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py index 39d88896416..15a340ef270 100644 --- a/tensorflow/python/kernel_tests/slice_op_test.py +++ b/tensorflow/python/kernel_tests/slice_op_test.py @@ -235,19 +235,22 @@ class SliceTest(test.TestCase): self.assertAllEqual(slice_val, inp[x, 0:y]) def testSimple(self): - with self.session(use_gpu=True) as sess: - inp = np.random.rand(4, 4).astype("f") - a = constant_op.constant( - [float(x) for x in inp.ravel(order="C")], - shape=[4, 4], - dtype=dtypes.float32) - slice_t = array_ops.slice(a, [0, 0], [2, 2]) - slice2_t = a[:2, :2] - slice_val, slice2_val = self.evaluate([slice_t, slice2_t]) - self.assertAllEqual(slice_val, inp[:2, :2]) - self.assertAllEqual(slice2_val, inp[:2, :2]) - self.assertEqual(slice_val.shape, slice_t.get_shape()) - self.assertEqual(slice2_val.shape, slice2_t.get_shape()) + with test_util.use_gpu(): + for dtype in [ + np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool, + np.float16, np.float32, np.float64, np.complex64, np.complex128,]: + inp = np.random.rand(4, 4).astype(dtype) + a = constant_op.constant( + [float(x) for x in inp.ravel(order="C")], + shape=[4, 4], + dtype=dtypes.float32) + slice_t = array_ops.slice(a, [0, 0], [2, 2]) + slice2_t = a[:2, :2] + slice_val, slice2_val = self.evaluate([slice_t, slice2_t]) + self.assertAllEqual(slice_val, inp[:2, :2]) + self.assertAllEqual(slice2_val, inp[:2, :2]) + self.assertEqual(slice_val.shape, slice_t.get_shape()) + self.assertEqual(slice2_val.shape, slice2_t.get_shape()) @test_util.run_deprecated_v1 def testComplex(self): From 66ae3a8ccc258a7bdaf395c380d004b5df1f0142 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Wed, 27 May 2020 15:36:25 -0700 Subject: [PATCH 1257/1533] Fix ImportError: cannot import name 'dense_features'. This module was added to tf-estimator-nightly. Change the nightly build to depend on tf-estimator-nightly instead of tensorflow-estimator. PiperOrigin-RevId: 313477121 Change-Id: Ia5eab6b0542789ba41dc691118a22c7e64b13258 --- .../ci_build/release/ubuntu_16/gpu_pip_on_cpu/build.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_pip_on_cpu/build.sh b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_pip_on_cpu/build.sh index 34ef1974916..a90a3e5a212 100755 --- a/tensorflow/tools/ci_build/release/ubuntu_16/gpu_pip_on_cpu/build.sh +++ b/tensorflow/tools/ci_build/release/ubuntu_16/gpu_pip_on_cpu/build.sh @@ -45,10 +45,15 @@ bazel build --config=opt \ --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain \ tensorflow/tools/pip_package:build_pip_package +# Set TF nightly flag so we get the proper version of estimator +if [[ "$IS_NIGHTLY" == 1 ]]; then + NIGHTLY_FLAG="--nightly_flag" +fi + PIP_WHL_DIR=whl mkdir -p ${PIP_WHL_DIR} PIP_WHL_DIR=$(readlink -f ${PIP_WHL_DIR}) # Get absolute path -bazel-bin/tensorflow/tools/pip_package/build_pip_package "${PIP_WHL_DIR}" +bazel-bin/tensorflow/tools/pip_package/build_pip_package "${PIP_WHL_DIR}" "${NIGHTLY_FLAG}" WHL_PATH=$(ls "${PIP_WHL_DIR}"/*.whl) cp "${WHL_PATH}" "$(pwd)"/. 
From 0da0bdd8095842a37f9d383e11629393057975f2 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Wed, 27 May 2020 16:02:07 -0700 Subject: [PATCH 1258/1533] Add redirects for old training script locations PiperOrigin-RevId: 313481629 Change-Id: I34b2b3c4e85be8d3de313ae7b5e6aab875d43a13 --- .../lite/micro/examples/hello_world/create_sine_model.ipynb | 1 + .../lite/micro/examples/micro_speech/train_speech_model.ipynb | 1 + 2 files changed, 2 insertions(+) create mode 100644 tensorflow/lite/micro/examples/hello_world/create_sine_model.ipynb create mode 100644 tensorflow/lite/micro/examples/micro_speech/train_speech_model.ipynb diff --git a/tensorflow/lite/micro/examples/hello_world/create_sine_model.ipynb b/tensorflow/lite/micro/examples/hello_world/create_sine_model.ipynb new file mode 100644 index 00000000000..2cedb4d3fc9 --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/create_sine_model.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Redirect","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyOIPo/+nZRJ3iJImegJmzTI"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"RmcjrlLpRHKk","colab_type":"code","colab":{}},"source":["This Colab notebook has been moved to https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/hello_world/train/train_hello_world_model.ipynb"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"b6Hnl5__RKOH","colab_type":"text"},"source":[""]}]} \ No newline at end of file diff --git a/tensorflow/lite/micro/examples/micro_speech/train_speech_model.ipynb b/tensorflow/lite/micro/examples/micro_speech/train_speech_model.ipynb new file mode 100644 index 00000000000..ec68bf28cda --- /dev/null +++ b/tensorflow/lite/micro/examples/micro_speech/train_speech_model.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Redirect","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyOIPo/+nZRJ3iJImegJmzTI"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"RmcjrlLpRHKk","colab_type":"code","colab":{}},"source":["This Colab notebook has been moved to https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"b6Hnl5__RKOH","colab_type":"text"},"source":[""]}]} \ No newline at end of file From 35f41be4525db93c35a84290cddd8a141ff61b18 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Wed, 27 May 2020 16:06:33 -0700 Subject: [PATCH 1259/1533] Extend PRelu op in TFLite to accept the same shape inputs. 
PiperOrigin-RevId: 313482424 Change-Id: I0b11b1a1c2b906f5de89361e94353ae55a0bbcd0 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 15 +-- tensorflow/lite/kernels/activations.cc | 50 ++++++-- tensorflow/lite/kernels/activations_test.cc | 111 ++++++++++++++++-- .../lite/kernels/internal/reference/prelu.h | 31 +++++ 4 files changed, 174 insertions(+), 33 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 923efdbaf9d..edb533c9442 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -2297,26 +2297,17 @@ def TFL_PReluOp : TFL_Op<"prelu", [ NoSideEffect, ResultsBroadcastableShape, TFL_GpuTargetOp, - TFL_OperandHasRankAtMost<0, 4>, - TFL_OperandHasRankAtMost<1, 4>, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, BinaryOpSameElementTypeConstraint, PredOpTrait<"input and output must have the same element type", - TFL_TCresVTEtIsSameAsOp<0, 0>>, - PredOpTrait<"'alpha' should have one less rank than 'input'.", - Or<[TFL_OperandIsUnrankedPred<0>, - TFL_OperandIsUnrankedPred<1>, - CPred<"$_op.getOperand(0).getType().cast().getRank() == " - "$_op.getOperand(1).getType().cast().getRank() " - "+ 1">]>>]> { + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Parameterized Relu operator"; let description = [{ Parameterized Relu operator x -> x >= 0 ? x : (alpha * x) where alpha is a trainable tensor. - alpha should have one less rank than the input as it doesn't have the batch - dimension, and the other dimensions either should be the same size as input - or size 1, where it is broadcasted in the second case. + input and alpha should be the same size as input or be broadcastable. }]; let arguments = ( diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 47146771b50..fc0c461a7c1 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -88,6 +88,7 @@ struct PreluOpData : public OpData { int32_t output_shift_1 = 0; int32_t output_multiplier_2 = 0; int32_t output_shift_2 = 0; + bool requires_broadcast; }; struct HardSwishData { @@ -693,6 +694,7 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { &data->output_shift_2); } + data->requires_broadcast = !HaveSameShapes(input, alpha); // PRelu (parameteric Relu) shares the same alpha value on "shared axis". // This means it's always required to "broadcast" alpha values in PRelu. 
TfLiteIntArray* output_size = nullptr; @@ -1161,11 +1163,19 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { const PreluOpData* data = reinterpret_cast(node->user_data); switch (input->type) { case kTfLiteFloat32: { - reference_ops::BroadcastBinaryFunction4DSlow( - GetTensorShape(input), GetTensorData(input), - GetTensorShape(alpha), GetTensorData(alpha), - GetTensorShape(output), GetTensorData(output), - ApplyPrelu); + if (data->requires_broadcast) { + reference_ops::BroadcastBinaryFunction4DSlow( + GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output), + ApplyPrelu); + } else { + reference_ops::BinaryFunction( + GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output), + ApplyPrelu); + } return kTfLiteOk; } break; case kTfLiteUInt8: { @@ -1177,10 +1187,17 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { op_params.output_shift_1 = data->output_shift_1; op_params.output_multiplier_2 = data->output_multiplier_2; op_params.output_shift_2 = data->output_shift_2; - reference_ops::BroadcastPrelu4DSlow( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(alpha), GetTensorData(alpha), - GetTensorShape(output), GetTensorData(output)); + if (data->requires_broadcast) { + reference_ops::BroadcastPrelu4DSlow( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output)); + } else { + reference_ops::Prelu( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output)); + } return kTfLiteOk; } break; case kTfLiteInt8: { @@ -1192,10 +1209,17 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { op_params.output_shift_1 = data->output_shift_1; op_params.output_multiplier_2 = data->output_multiplier_2; op_params.output_shift_2 = data->output_shift_2; - reference_ops::BroadcastPrelu4DSlow( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(alpha), GetTensorData(alpha), - GetTensorShape(output), GetTensorData(output)); + if (data->requires_broadcast) { + reference_ops::BroadcastPrelu4DSlow( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output)); + } else { + reference_ops::Prelu( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output)); + } return kTfLiteOk; } break; default: diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc index 9f6fb932d34..5a679147469 100644 --- a/tensorflow/lite/kernels/activations_test.cc +++ b/tensorflow/lite/kernels/activations_test.cc @@ -2060,7 +2060,7 @@ TEST(FloatActivationsOpTest, PRelu) { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 1.0f, 1.0f, 1.0f, // Row 1, Column 2 -1.0f, -1.0f, -1.0f, // Row 2, Column 1 - -2.0f, -2.0f, -2.0f, // Row 1, Column 2 + -2.0f, -2.0f, -2.0f, // Row 2, Column 2 }); m.SetAlpha({0.0f, 1.0f, 2.0f}); m.Invoke(); @@ -2068,7 +2068,32 @@ TEST(FloatActivationsOpTest, PRelu) { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 1.0f, 1.0f, 1.0f, // Row 1, Column 2 0.0f, -1.0f, -2.0f, // Row 2, Column 1 - 0.0f, -2.0f, -4.0f, // Row 1, Column 2 + 0.0f, -2.0f, -4.0f, // Row 2, Column 2 + 
})); +} + +TEST(FloatActivationsOpTest, PReluSameShapes) { + FloatPReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}}, + {TensorType_FLOAT32, {1, 2, 2, 3}}); + + m.SetInput({ + 0.0f, 0.0f, 0.0f, // Row 1, Column 1 + 1.0f, 1.0f, 1.0f, // Row 1, Column 2 + -1.0f, -1.0f, -1.0f, // Row 2, Column 1 + -2.0f, -2.0f, -2.0f, // Row 2, Column 2 + }); + m.SetAlpha({ + 0.0f, 1.0f, 2.0f, // Row 1, Column 1 + 0.0f, 1.0f, 2.0f, // Row 1, Column 2 + 0.0f, 1.0f, 2.0f, // Row 2, Column 1 + 0.0f, 1.0f, 2.0f, // Row 2, Column 2 + }); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 0.0f, 0.0f, 0.0f, // Row 1, Column 1 + 1.0f, 1.0f, 1.0f, // Row 1, Column 2 + 0.0f, -1.0f, -2.0f, // Row 2, Column 1 + 0.0f, -2.0f, -4.0f, // Row 2, Column 2 })); } @@ -2081,7 +2106,7 @@ TEST(QuantizedActivationsOpTest, PRelu) { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 0.5f, 0.5f, 0.5f, // Row 1, Column 2 -1.0f, -1.0f, -1.0f, // Row 2, Column 1 - -0.25f, -0.25f, -0.25f, // Row 1, Column 2 + -0.25f, -0.25f, -0.25f, // Row 2, Column 2 }); m.SetAlpha({0.0f, 0.5f, -0.5f}); m.Invoke(); @@ -2091,14 +2116,49 @@ TEST(QuantizedActivationsOpTest, PRelu) { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 0.5f, 0.5f, 0.5f, // Row 1, Column 2 0.0f, -0.5f, 0.5f, // Row 2, Column 1 - 0.0f, -0.125f, 0.125f, // Row 1, Column 2 + 0.0f, -0.125f, 0.125f, // Row 2, Column 2 }, kQuantizedTolerance))); EXPECT_THAT(m.GetOutput(), ElementsAreArray({ 128, 128, 128, // Row 1, Column 1 192, 192, 192, // Row 1, Column 2 128, 64, 192, // Row 2, Column 1 - 128, 112, 144, // Row 1, Column 2 + 128, 112, 144, // Row 2, Column 2 + })); +} + +TEST(QuantizedActivationsOpTest, PReluSameShapes) { + const float kMin = -1; + const float kMax = 127.f / 128.f; + QuantizedPReluOpModel m({TensorType_UINT8, {1, 2, 2, 3}, kMin, kMax}, + {TensorType_UINT8, {1, 2, 2, 3}, kMin, kMax}); + m.SetInput({ + 0.0f, 0.0f, 0.0f, // Row 1, Column 1 + 0.5f, 0.5f, 0.5f, // Row 1, Column 2 + -1.0f, -1.0f, -1.0f, // Row 2, Column 1 + -0.25f, -0.25f, -0.25f, // Row 2, Column 2 + }); + m.SetAlpha({ + 0.0f, 0.5f, -0.5f, // Row 1, Column 1 + 0.0f, 0.5f, -0.5f, // Row 1, Column 2 + 0.0f, 0.5f, -0.5f, // Row 2, Column 1 + 0.0f, 0.5f, -0.5f, // Row 2, Column 2 + }); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + { + 0.0f, 0.0f, 0.0f, // Row 1, Column 1 + 0.5f, 0.5f, 0.5f, // Row 1, Column 2 + 0.0f, -0.5f, 0.5f, // Row 2, Column 1 + 0.0f, -0.125f, 0.125f, // Row 2, Column 2 + }, + kQuantizedTolerance))); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 128, 128, 128, // Row 1, Column 1 + 192, 192, 192, // Row 1, Column 2 + 128, 64, 192, // Row 2, Column 1 + 128, 112, 144, // Row 2, Column 2 })); } @@ -2111,7 +2171,7 @@ TEST(QuantizedActivationsOpTest, PReluInt8) { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 0.5f, 0.5f, 0.5f, // Row 1, Column 2 -1.0f, -1.0f, -1.0f, // Row 2, Column 1 - -0.25f, -0.25f, -0.25f, // Row 1, Column 2 + -0.25f, -0.25f, -0.25f, // Row 2, Column 2 }); m.SetAlpha({0.0f, 0.5f, -0.5f}); m.Invoke(); @@ -2121,14 +2181,49 @@ TEST(QuantizedActivationsOpTest, PReluInt8) { 0.0f, 0.0f, 0.0f, // Row 1, Column 1 0.5f, 0.5f, 0.5f, // Row 1, Column 2 0.0f, -0.5f, 0.5f, // Row 2, Column 1 - 0.0f, -0.125f, 0.125f, // Row 1, Column 2 + 0.0f, -0.125f, 0.125f, // Row 2, Column 2 }, kQuantizedTolerance))); EXPECT_THAT(m.GetOutput(), ElementsAreArray({ 0, 0, 0, // Row 1, Column 1 64, 64, 64, // Row 1, Column 2 0, -64, 64, // Row 2, Column 1 - 0, -16, 16, // Row 1, Column 2 + 0, -16, 16, // Row 2, Column 2 + })); +} + +TEST(QuantizedActivationsOpTest, 
PReluInt8SameShapes) { + const float kMin = -1; + const float kMax = 127.f / 128.f; + QuantizedPReluOpModel m({TensorType_INT8, {1, 2, 2, 3}, kMin, kMax}, + {TensorType_INT8, {1, 1, 3}, kMin, kMax}); + m.SetInput({ + 0.0f, 0.0f, 0.0f, // Row 1, Column 1 + 0.5f, 0.5f, 0.5f, // Row 1, Column 2 + -1.0f, -1.0f, -1.0f, // Row 2, Column 1 + -0.25f, -0.25f, -0.25f, // Row 2, Column 2 + }); + m.SetAlpha({ + 0.0f, 0.5f, -0.5f, // Row 1, Column 1 + 0.0f, 0.5f, -0.5f, // Row 1, Column 2 + 0.0f, 0.5f, -0.5f, // Row 2, Column 1 + 0.0f, 0.5f, -0.5f, // Row 2, Column 2 + }); + m.Invoke(); + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear( + { + 0.0f, 0.0f, 0.0f, // Row 1, Column 1 + 0.5f, 0.5f, 0.5f, // Row 1, Column 2 + 0.0f, -0.5f, 0.5f, // Row 2, Column 1 + 0.0f, -0.125f, 0.125f, // Row 2, Column 2 + }, + kQuantizedTolerance))); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({ + 0, 0, 0, // Row 1, Column 1 + 64, 64, 64, // Row 1, Column 2 + 0, -64, 64, // Row 2, Column 1 + 0, -16, 16, // Row 2, Column 2 })); } diff --git a/tensorflow/lite/kernels/internal/reference/prelu.h b/tensorflow/lite/kernels/internal/reference/prelu.h index 50d9ad24dd9..4633cb9599a 100644 --- a/tensorflow/lite/kernels/internal/reference/prelu.h +++ b/tensorflow/lite/kernels/internal/reference/prelu.h @@ -72,6 +72,37 @@ inline void BroadcastPrelu4DSlow( } } +template +inline void Prelu(const PreluParams& params, const RuntimeShape& input_shape, + const T* input_data, const RuntimeShape& alpha_shape, + const T* alpha_data, const RuntimeShape& output_shape, + T* output_data) { + const int32 quantized_min = std::numeric_limits::min(); + const int32 quantized_max = std::numeric_limits::max(); + + const int flat_size = + MatchingElementsSize(input_shape, alpha_shape, output_shape); + for (int i = 0; i < flat_size; ++i) { + const int32 input_value = params.input_offset + input_data[i]; + int32 output_value; + if (input_value >= 0) { + output_value = MultiplyByQuantizedMultiplier( + input_value, params.output_multiplier_1, params.output_shift_1); + } else { + const int32 alpha_value = params.alpha_offset + alpha_data[i]; + + output_value = MultiplyByQuantizedMultiplier(input_value * alpha_value, + params.output_multiplier_2, + params.output_shift_2); + } + output_value += params.output_offset; + + const int32 clamped_output = + std::min(quantized_max, std::max(quantized_min, output_value)); + output_data[i] = static_cast(clamped_output); + } +} + } // namespace reference_ops } // namespace tflite From 01e089369f081cd306f8d77489e55d890c73c9db Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 27 May 2020 16:10:31 -0700 Subject: [PATCH 1260/1533] Export Discretization keras preprocessing layer. 
PiperOrigin-RevId: 313483178 Change-Id: Ia01fa00be7849eb3ca15791606f4c409f8ec8817 --- tensorflow/python/keras/layers/__init__.py | 1 + .../layers/preprocessing/discretization.py | 53 ++--- .../discretization_distribution_test.py | 3 +- .../preprocessing/discretization_test.py | 43 +--- .../python/keras/layers/serialization.py | 3 +- ...mental.preprocessing.-discretization.pbtxt | 218 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + ...mental.preprocessing.-discretization.pbtxt | 218 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + 9 files changed, 471 insertions(+), 76 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py index e0f087b2453..36e58ef6552 100644 --- a/tensorflow/python/keras/layers/__init__.py +++ b/tensorflow/python/keras/layers/__init__.py @@ -64,6 +64,7 @@ else: from tensorflow.python.keras.layers.preprocessing.text_vectorization import TextVectorization as TextVectorizationV2 TextVectorizationV1 = TextVectorization from tensorflow.python.keras.layers.preprocessing.category_crossing import CategoryCrossing +from tensorflow.python.keras.layers.preprocessing.discretization import Discretization from tensorflow.python.keras.layers.preprocessing.hashing import Hashing # Advanced activations. diff --git a/tensorflow/python/keras/layers/preprocessing/discretization.py b/tensorflow/python/keras/layers/preprocessing/discretization.py index 3052cfb4369..d621410146c 100644 --- a/tensorflow/python/keras/layers/preprocessing/discretization.py +++ b/tensorflow/python/keras/layers/preprocessing/discretization.py @@ -19,70 +19,59 @@ from __future__ import print_function from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_tensor - -INTEGER = "int" -BINARY = "binary" +from tensorflow.python.util.tf_export import keras_export +@keras_export("keras.layers.experimental.preprocessing.Discretization") class Discretization(Layer): """Buckets data into discrete ranges. This layer will place each element of its input data into one of several - contiguous ranges and output either an integer index or a one-hot vector - indicating which range each element was placed in. - - What happens in `adapt()`: The dataset is examined and sliced. + contiguous ranges and output an integer index indicating which range each + element was placed in. Input shape: Any `tf.Tensor` or `tf.RaggedTensor` of dimension 2 or higher. Output shape: - The same as the input shape if `output_mode` is 'int', or - `[output_shape, num_buckets]` if `output_mode` is 'binary'. + Same as input shape. Attributes: bins: Optional boundary specification. Bins include the left boundary and exclude the right boundary, so `bins=[0., 1., 2.]` generates bins `(-inf, 0.)`, `[0., 1.)`, `[1., 2.)`, and `[2., +inf)`. - output_mode: One of 'int', 'binary'. Defaults to 'int'. 
Examples: Bucketize float values based on provided buckets. >>> input = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) - >>> layer = Discretization(bins=[0., 1., 2.]) + >>> layer = tf.keras.layers.experimental.preprocessing.Discretization( + ... bins=[0., 1., 2.]) >>> layer(input) """ - def __init__(self, bins, output_mode=INTEGER, **kwargs): + def __init__(self, bins, **kwargs): super(Discretization, self).__init__(**kwargs) self.bins = bins - self.output_mode = output_mode def get_config(self): config = { "bins": self.bins, - "output_mode": self.output_mode, } base_config = super(Discretization, self).get_config() return dict(list(base_config.items()) + list(config.items())) def compute_output_shape(self, input_shape): - if self.output_mode == INTEGER: - return input_shape - else: - return tensor_shape.TensorShape([dim for dim in input_shape] + - [len(self.bins)]) + return input_shape def compute_output_signature(self, input_spec): output_shape = self.compute_output_shape(input_spec.shape.as_list()) @@ -99,26 +88,14 @@ class Discretization(Layer): # Ragged map_flat_values doesn't touch the non-values tensors in the # ragged composite tensor. If this op is the only op a Keras model, # this can cause errors in Graph mode, so wrap the tensor in an identity. - integer_buckets = array_ops.identity(integer_buckets) + return array_ops.identity(integer_buckets) elif isinstance(inputs, sparse_tensor.SparseTensor): integer_buckets = math_ops._bucketize( # pylint: disable=protected-access inputs.values, boundaries=self.bins) + return sparse_tensor.SparseTensor( + indices=array_ops.identity(inputs.indices), + values=integer_buckets, + dense_shape=array_ops.identity(inputs.dense_shape)) else: - integer_buckets = math_ops._bucketize(inputs, boundaries=self.bins) # pylint: disable=protected-access - - if self.output_mode == INTEGER: - if isinstance(inputs, sparse_tensor.SparseTensor): - return sparse_tensor.SparseTensor( - indices=array_ops.identity(inputs.indices), - values=integer_buckets, - dense_shape=array_ops.identity(inputs.dense_shape)) - return integer_buckets - else: - if isinstance(inputs, sparse_tensor.SparseTensor): - raise ValueError("`output_mode=binary` is not supported for " - "sparse input") - # The 'bins' array is the set of boundaries between the bins. We actually - # have 'len(bins)+1' outputs. - # TODO(momernick): This will change when we have the ability to adapt(). 
- return array_ops.one_hot(integer_buckets, depth=len(self.bins) + 1) + return math_ops._bucketize(inputs, boundaries=self.bins) # pylint: disable=protected-access diff --git a/tensorflow/python/keras/layers/preprocessing/discretization_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/discretization_distribution_test.py index 7da40b88920..aaeef8ea868 100644 --- a/tensorflow/python/keras/layers/preprocessing/discretization_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/discretization_distribution_test.py @@ -45,8 +45,7 @@ class DiscretizationDistributionTest( with distribution.scope(): input_data = keras.Input(shape=(None,)) - layer = discretization.Discretization( - bins=[0., 1., 2.], output_mode=discretization.INTEGER) + layer = discretization.Discretization(bins=[0., 1., 2.]) bucket_data = layer(input_data) self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) diff --git a/tensorflow/python/keras/layers/preprocessing/discretization_test.py b/tensorflow/python/keras/layers/preprocessing/discretization_test.py index 110bccd55e1..54acf267066 100644 --- a/tensorflow/python/keras/layers/preprocessing/discretization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/discretization_test.py @@ -32,29 +32,8 @@ from tensorflow.python.platform import test @keras_parameterized.run_all_keras_modes -class CategoricalEncodingInputTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_bucketize_with_explicit_buckets_one_hot(self): - input_array = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) - - # pyformat: disable - expected_output = [[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0]], - [[0, 1, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0]]] - # pyformat: enable - num_buckets = 4 - expected_output_shape = [None, None, num_buckets] - - input_data = keras.Input(shape=(None,)) - layer = discretization.Discretization( - bins=[0., 1., 2.], output_mode=discretization.BINARY) - bucket_data = layer(input_data) - self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) - - model = keras.Model(inputs=input_data, outputs=bucket_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) +class DiscretizationTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): def test_bucketize_with_explicit_buckets_integer(self): input_array = np.array([[-1.5, 1.0, 3.4, .5], [0.0, 3.0, 1.3, 0.0]]) @@ -63,8 +42,7 @@ class CategoricalEncodingInputTest( expected_output_shape = [None, None] input_data = keras.Input(shape=(None,)) - layer = discretization.Discretization( - bins=[0., 1., 2.], output_mode=discretization.INTEGER) + layer = discretization.Discretization(bins=[0., 1., 2.]) bucket_data = layer(input_data) self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) @@ -79,8 +57,7 @@ class CategoricalEncodingInputTest( expected_output_shape = [None, None] input_data = keras.Input(shape=(None,), dtype=dtypes.int64) - layer = discretization.Discretization( - bins=[-.5, 0.5, 1.5], output_mode=discretization.INTEGER) + layer = discretization.Discretization(bins=[-.5, 0.5, 1.5]) bucket_data = layer(input_data) self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) @@ -94,8 +71,7 @@ class CategoricalEncodingInputTest( indices=indices, values=[-1.5, 1.0, 3.4], dense_shape=[2, 3]) expected_output = [0, 2, 3] input_data = keras.Input(shape=(3,), dtype=dtypes.float32, 
sparse=True) - layer = discretization.Discretization( - bins=[-.5, 0.5, 1.5], output_mode=discretization.INTEGER) + layer = discretization.Discretization(bins=[-.5, 0.5, 1.5]) bucket_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=bucket_data) @@ -111,8 +87,7 @@ class CategoricalEncodingInputTest( expected_output_shape = [None, None] input_data = keras.Input(shape=(None,), ragged=True) - layer = discretization.Discretization( - bins=[0., 1., 2.], output_mode=discretization.INTEGER) + layer = discretization.Discretization(bins=[0., 1., 2.]) bucket_data = layer(input_data) self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) @@ -128,8 +103,7 @@ class CategoricalEncodingInputTest( expected_output_shape = [None, None] input_data = keras.Input(shape=(None,), ragged=True, dtype=dtypes.int64) - layer = discretization.Discretization( - bins=[-.5, 0.5, 1.5], output_mode=discretization.INTEGER) + layer = discretization.Discretization(bins=[-.5, 0.5, 1.5]) bucket_data = layer(input_data) self.assertAllEqual(expected_output_shape, bucket_data.shape.as_list()) @@ -143,8 +117,7 @@ class CategoricalEncodingInputTest( indices=indices, values=[-1, 1, 3], dense_shape=[2, 3]) expected_output = [0, 2, 3] input_data = keras.Input(shape=(3,), dtype=dtypes.int32, sparse=True) - layer = discretization.Discretization( - bins=[-.5, 0.5, 1.5], output_mode=discretization.INTEGER) + layer = discretization.Discretization(bins=[-.5, 0.5, 1.5]) bucket_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=bucket_data) diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 992ff562755..6b58a08a4bf 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -48,6 +48,7 @@ from tensorflow.python.keras.layers import wrappers from tensorflow.python.keras.layers.preprocessing import category_crossing from tensorflow.python.keras.layers.preprocessing import category_encoding from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 +from tensorflow.python.keras.layers.preprocessing import discretization from tensorflow.python.keras.layers.preprocessing import hashing from tensorflow.python.keras.layers.preprocessing import image_preprocessing from tensorflow.python.keras.layers.preprocessing import normalization as preprocessing_normalization @@ -64,7 +65,7 @@ ALL_MODULES = (base_layer, input_layer, advanced_activations, convolutional, embeddings, einsum_dense, local, merge, noise, normalization, pooling, image_preprocessing, preprocessing_normalization_v1, preprocessing_text_vectorization_v1, recurrent, wrappers, - hashing, category_crossing, category_encoding_v1) + hashing, category_crossing, category_encoding_v1, discretization) ALL_V2_MODULES = (rnn_cell_wrapper_v2, normalization_v2, recurrent_v2, preprocessing_normalization, preprocessing_text_vectorization, category_encoding) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt new file mode 100644 index 00000000000..82ce2303a76 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt @@ -0,0 +1,218 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.Discretization" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + 
is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'bins\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + 
argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index a922b143910..4a0522dc08f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "CenterCrop" mtype: "" } + member { + name: "Discretization" + mtype: "" + } member { name: "Hashing" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt new file mode 100644 index 00000000000..82ce2303a76 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-discretization.pbtxt @@ -0,0 +1,218 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.Discretization" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: 
"output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'bins\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, 
defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index a922b143910..4a0522dc08f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "CenterCrop" mtype: "" } + member { + name: "Discretization" + mtype: "" + } member { name: "Hashing" mtype: "" From 24972b90bdf20d90fb86fa24f9cbbcae4f0b4638 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Wed, 27 May 2020 16:13:37 -0700 Subject: [PATCH 1261/1533] Force install and upgrade estimator nightly package. PiperOrigin-RevId: 313483729 Change-Id: I0ff9bdce47331201f5f3c5b3d054f8ed22bb50ad --- tensorflow/tools/ci_build/release/common_win.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/release/common_win.bat b/tensorflow/tools/ci_build/release/common_win.bat index 464782dcefd..03217ce7e56 100644 --- a/tensorflow/tools/ci_build/release/common_win.bat +++ b/tensorflow/tools/ci_build/release/common_win.bat @@ -28,7 +28,7 @@ SET PATH=%PATH%;C:\%PYTHON_DIRECTORY% %PIP_EXE% install setuptools --upgrade %PIP_EXE% install future>=0.17.1 --no-deps -%PIP_EXE% install --ignore-installed tf-estimator-nightly --no-deps +%PIP_EXE% install --ignore-installed --force-reinstall --upgrade tf-estimator-nightly --no-deps %PIP_EXE% install tb-nightly --no-deps %PIP_EXE% install numpy --upgrade --no-deps %PIP_EXE% install opt_einsum --upgrade From 1de60c367814d70b6d75ab5867e14aee829d142c Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 27 May 2020 16:18:04 -0700 Subject: [PATCH 1262/1533] [XLA] Correct ScopedShapedBuffer documentation PiperOrigin-RevId: 313484485 Change-Id: Ie896610eee3ba007d1d0965911b3974efaef3225 --- tensorflow/compiler/xla/service/shaped_buffer.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index b7a67b4e66e..995b0ece7cd 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -137,9 +137,8 @@ class ShapedBuffer { std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer); -// ShapedBuffer derived class which allocates all internal buffers on -// construction and deallocates the memory when the object is -// destructed. +// ScopedShapedBuffer takes allocated buffers as inputs, and deallocates on +// destruction. This class represents an owning wrapper around `ShapedBuffer`. // // TODO(timshen): Remove inheritance between ScopedShapedBuffer and // ShapedBuffer. 
There should never be a need to consider a ScopedShapedBuffer From fa24338edcd3dd505b0a47aaaceb0aea89c138a5 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Wed, 27 May 2020 16:21:20 -0700 Subject: [PATCH 1263/1533] Remove checks from keras base_layer tests that try to check the graph on the output keras tensor (because this is checking implementation details that are going to change) PiperOrigin-RevId: 313485038 Change-Id: Id28df75c3345863caae8f793b7250cfb07126fe0 --- .../python/keras/engine/base_layer_test.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py index 82c60eb34c8..ca138d79020 100644 --- a/tensorflow/python/keras/engine/base_layer_test.py +++ b/tensorflow/python/keras/engine/base_layer_test.py @@ -622,26 +622,22 @@ class SymbolicSupportTest(keras_parameterized.TestCase): def test_using_symbolic_tensors_with_tf_ops(self): # Single-input. x = input_layer.Input((3,)) - y = math_ops.square(x) - self.assertEqual(y.graph, backend.get_graph()) + math_ops.square(x) # Multi-inputs. x1, x2 = input_layer.Input((3,)), input_layer.Input((3,)) - y = array_ops.concat([x1, x2], axis=1) - self.assertEqual(y.graph, backend.get_graph()) + array_ops.concat([x1, x2], axis=1) # Mixing Keras symbolic tensors and graph tensors from the same graph works. with backend.get_graph().as_default(): x1 = input_layer.Input((3,)) x2 = input_layer.Input((3,)) - y = math_ops.matmul(x1, x2) - self.assertEqual(y.graph, backend.get_graph()) + math_ops.matmul(x1, x2) # Creating same op type (matmul) multiple times in the Keras graph works. x1 = input_layer.Input((3,)) x2 = input_layer.Input((3,)) - y = math_ops.matmul(x1, x2) - self.assertEqual(y.graph, backend.get_graph()) + math_ops.matmul(x1, x2) def test_mixing_eager_and_graph_tensors(self): with ops.Graph().as_default(): @@ -663,7 +659,7 @@ class SymbolicSupportTest(keras_parameterized.TestCase): x1 = input_layer.Input((3,)) x2 = array_ops.ones((3, 3)) y = math_ops.matmul(x1, x2) - self.assertEqual(y.graph, backend.get_graph()) + fn = backend.function(inputs=[x1], outputs=[y]) x_val = np.random.random((3, 3)) y_val = np.ones((3, 3)) @@ -676,7 +672,7 @@ class SymbolicSupportTest(keras_parameterized.TestCase): x1 = input_layer.Input((3,)) x2 = np.ones((3, 3), dtype='float32') y = math_ops.matmul(x1, x2) - self.assertEqual(y.graph, backend.get_graph()) + fn = backend.function(inputs=[x1], outputs=[y]) x_val = np.random.random((3, 3)) y_val = np.ones((3, 3)) From b7ca93ad917ae84c47d4aee26a03d284f99032b5 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 27 May 2020 16:59:20 -0700 Subject: [PATCH 1264/1533] Rename experimental version of snapshot to legacy_snapshot in preparation for opensourcing of new snapshot implementation PiperOrigin-RevId: 313491515 Change-Id: I85cbc67ddb44a48f7a31e2858ea7c33b06ff0f1e --- .../benchmarks/snapshot_dataset_benchmark.py | 3 +- .../snapshot_dataset_serialization_test.py | 2 +- .../kernel_tests/snapshot_test.py | 97 +++++++++++-------- .../python/data/experimental/ops/snapshot.py | 34 +++---- 4 files changed, 76 insertions(+), 60 deletions(-) diff --git a/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py b/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py index 88230153181..1a0e0a101e2 100644 --- a/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py +++ 
b/tensorflow/python/data/experimental/benchmarks/snapshot_dataset_benchmark.py @@ -51,7 +51,8 @@ class SnapshotDatasetBenchmark(benchmark_base.DatasetBenchmarkBase): dataset = dataset.map( lambda x: gen_array_ops.broadcast_to(x, [50, 50, 3])) dataset = dataset.repeat(num_elems) - dataset = dataset.apply(snapshot.snapshot(tmp_dir, compression=compression)) + dataset = dataset.apply( + snapshot.legacy_snapshot(tmp_dir, compression=compression)) return dataset diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/snapshot_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/snapshot_dataset_serialization_test.py index 53261d4b298..d54e50ebfa3 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/snapshot_dataset_serialization_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/snapshot_dataset_serialization_test.py @@ -46,7 +46,7 @@ class SnapshotDatasetSerializationTest( os.mkdir(self.snapshot_dir) dataset = dataset_ops.Dataset.range(1000) dataset = dataset.apply( - snapshot.snapshot( + snapshot.legacy_snapshot( self.snapshot_dir, num_writer_threads=num_threads, writer_buffer_size=2 * num_threads, diff --git a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py index 535cf884dc6..6169a1752fc 100644 --- a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py @@ -97,11 +97,11 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = dataset_ops.Dataset.range(1000) - dataset = dataset.apply(snapshot.snapshot(tmpdir)) + dataset = dataset.apply(snapshot.legacy_snapshot(tmpdir)) self.assertDatasetProduces(dataset, list(range(1000))) dataset = dataset_ops.Dataset.range(1001) - dataset = dataset.apply(snapshot.snapshot(tmpdir)) + dataset = dataset.apply(snapshot.legacy_snapshot(tmpdir)) self.assertDatasetProduces(dataset, list(range(1001))) self.assertSnapshotDirectoryContains(tmpdir, 2, 1, 1) @@ -111,11 +111,11 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset1 = dataset_ops.Dataset.range(1000) - dataset1 = dataset1.apply(snapshot.snapshot(tmpdir)) + dataset1 = dataset1.apply(snapshot.legacy_snapshot(tmpdir)) next1 = self.getNext(dataset1) dataset2 = dataset_ops.Dataset.range(1000) - dataset2 = dataset2.apply(snapshot.snapshot(tmpdir)) + dataset2 = dataset2.apply(snapshot.legacy_snapshot(tmpdir)) next2 = self.getNext(dataset2) for i in range(0, 1000): @@ -132,11 +132,11 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, # We create two iterators but call getNext on only one. 
dataset1 = dataset_ops.Dataset.range(1000) - dataset1 = dataset1.apply(snapshot.snapshot(tmpdir)) + dataset1 = dataset1.apply(snapshot.legacy_snapshot(tmpdir)) next1 = self.getNext(dataset1) dataset2 = dataset_ops.Dataset.range(1001) - dataset2 = dataset2.apply(snapshot.snapshot(tmpdir)) + dataset2 = dataset2.apply(snapshot.legacy_snapshot(tmpdir)) _ = self.getNext(dataset2) for _ in range(1000): @@ -156,7 +156,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = dataset_ops.Dataset.range(1000) - dataset = dataset.apply(snapshot.snapshot(tmpdir, compression=compression)) + dataset = dataset.apply( + snapshot.legacy_snapshot(tmpdir, compression=compression)) self.assertDatasetProduces(dataset, list(range(1000))) self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1) @@ -172,7 +173,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = dataset_ops.Dataset.range(10) - dataset = dataset.apply(snapshot.snapshot(tmpdir, compression=compression)) + dataset = dataset.apply( + snapshot.legacy_snapshot(tmpdir, compression=compression)) dataset = dataset.repeat(10) self.assertDatasetProduces(dataset, list(range(10)) * 10) @@ -194,7 +196,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, return (x, string_ops.as_string(x), string_ops.as_string(2 * x), 2 * x) dataset = dataset.map(map_fn) - dataset = dataset.apply(snapshot.snapshot(tmpdir, compression=compression)) + dataset = dataset.apply( + snapshot.legacy_snapshot(tmpdir, compression=compression)) dataset = dataset.repeat(10) expected = [] @@ -210,7 +213,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset = dataset_ops.Dataset.range(10) dataset = dataset.apply( - snapshot.snapshot(tmpdir, snapshot_name="my_custom_snapshot")) + snapshot.legacy_snapshot(tmpdir, snapshot_name="my_custom_snapshot")) dataset = dataset.repeat(10) self.assertDatasetProduces(dataset, list(range(10)) * 10) @@ -226,7 +229,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = dataset_ops.Dataset.range(10) - dataset = dataset.apply(snapshot.snapshot(tmpdir, mode="passthrough")) + dataset = dataset.apply( + snapshot.legacy_snapshot(tmpdir, mode="passthrough")) dataset = dataset.repeat(10) self.assertDatasetProduces(dataset, list(range(10)) * 10) @@ -237,7 +241,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = dataset_ops.Dataset.range(10) - dataset = dataset.apply(snapshot.snapshot(tmpdir, mode="write")) + dataset = dataset.apply(snapshot.legacy_snapshot(tmpdir, mode="write")) dataset = dataset.repeat(10) self.assertDatasetProduces(dataset, list(range(10)) * 10) @@ -251,7 +255,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, # We write a copy of the snapshot first. dataset = dataset_ops.Dataset.range(10) dataset = dataset.apply( - snapshot.snapshot( + snapshot.legacy_snapshot( tmpdir, mode="write", snapshot_name="my_custom_snapshot")) self.assertDatasetProduces(dataset, list(range(10))) @@ -264,7 +268,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, # longer exists after we moved, we force it to read from the run we specify. 
dataset = dataset_ops.Dataset.range(10) dataset = dataset.apply( - snapshot.snapshot( + snapshot.legacy_snapshot( tmpdir, mode="read", snapshot_name="my_custom_snapshot_2")) self.assertDatasetProduces(dataset, list(range(10))) @@ -276,7 +280,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = dataset_ops.Dataset.range(10) with self.assertRaises(errors.NotFoundError): - dataset = dataset.apply(snapshot.snapshot(tmpdir, mode="read")) + dataset = dataset.apply(snapshot.legacy_snapshot(tmpdir, mode="read")) get_next = self.getNext(dataset) self.evaluate(get_next()) @@ -286,7 +290,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset = dataset_ops.Dataset.range(10) with self.assertRaises(errors.NotFoundError): dataset = dataset.apply( - snapshot.snapshot( + snapshot.legacy_snapshot( tmpdir, mode="read", snapshot_name="my_nonexistent_snapshot")) get_next = self.getNext(dataset) self.evaluate(get_next()) @@ -310,15 +314,16 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = core_readers._TFRecordDataset(filenames) - dataset = dataset.apply(snapshot.snapshot(tmpdir, compression=compression)) + dataset = dataset.apply( + snapshot.legacy_snapshot(tmpdir, compression=compression)) self.assertDatasetProduces(dataset, expected) # remove the original files and try to read the data back only from snapshot self.removeTFRecords() dataset2 = core_readers._TFRecordDataset(filenames) - dataset2 = dataset2.apply(snapshot.snapshot( - tmpdir, compression=compression)) + dataset2 = dataset2.apply( + snapshot.legacy_snapshot(tmpdir, compression=compression)) self.assertDatasetProduces(dataset2, expected) @combinations.generate(test_base.default_test_combinations()) @@ -334,7 +339,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = core_readers._TFRecordDataset(filenames) - dataset = dataset.apply(snapshot.snapshot(tmpdir, shard_size_bytes=100)) + dataset = dataset.apply( + snapshot.legacy_snapshot(tmpdir, shard_size_bytes=100)) self.assertDatasetProduces(dataset, expected) # remove the original files and try to read the data back only from snapshot @@ -342,7 +348,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset2 = core_readers._TFRecordDataset(filenames) dataset2 = dataset2.apply( - snapshot.snapshot(tmpdir, shard_size_bytes=100, shuffle_on_read=True)) + snapshot.legacy_snapshot( + tmpdir, shard_size_bytes=100, shuffle_on_read=True)) next2 = self.getNext(dataset2) res1 = self.evaluate(next2()) @@ -357,7 +364,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, # make sure all the elements are still there dataset3 = core_readers._TFRecordDataset(filenames) dataset3 = dataset3.apply( - snapshot.snapshot(tmpdir, shard_size_bytes=100, shuffle_on_read=True)) + snapshot.legacy_snapshot( + tmpdir, shard_size_bytes=100, shuffle_on_read=True)) self.assertDatasetProduces(dataset3, expected, assert_items_equal=True) @combinations.generate(test_base.default_test_combinations()) @@ -373,7 +381,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = core_readers._TFRecordDataset(filenames) - dataset = dataset.apply(snapshot.snapshot(tmpdir, shard_size_bytes=10)) + dataset = dataset.apply( + snapshot.legacy_snapshot(tmpdir, shard_size_bytes=10)) 
self.assertDatasetProduces(dataset, expected) # remove the original files and try to read the data back only from snapshot @@ -381,14 +390,20 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset2 = core_readers._TFRecordDataset(filenames) dataset2 = dataset2.apply( - snapshot.snapshot(tmpdir, shard_size_bytes=10, shuffle_on_read=True, - shuffle_seed=123456)) + snapshot.legacy_snapshot( + tmpdir, + shard_size_bytes=10, + shuffle_on_read=True, + shuffle_seed=123456)) next2 = self.getNext(dataset2) dataset3 = core_readers._TFRecordDataset(filenames) dataset3 = dataset3.apply( - snapshot.snapshot(tmpdir, shard_size_bytes=10, - shuffle_on_read=True, shuffle_seed=123456)) + snapshot.legacy_snapshot( + tmpdir, + shard_size_bytes=10, + shuffle_on_read=True, + shuffle_seed=123456)) next3 = self.getNext(dataset3) # make sure that the items are read back in the same order for both datasets @@ -417,7 +432,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = core_readers._TFRecordDataset(filenames) dataset = dataset.apply( - snapshot.snapshot( + snapshot.legacy_snapshot( tmpdir, shard_size_bytes=1024 * 1024, num_reader_threads=2, @@ -431,7 +446,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset2 = core_readers._TFRecordDataset(filenames) dataset2 = dataset2.apply( - snapshot.snapshot( + snapshot.legacy_snapshot( tmpdir, shard_size_bytes=1024 * 1024, num_reader_threads=2, @@ -464,7 +479,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = core_readers._TFRecordDataset(filenames) dataset = dataset.apply( - snapshot.snapshot( + snapshot.legacy_snapshot( tmpdir, compression=compression, num_writer_threads=threads, @@ -477,7 +492,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset2 = core_readers._TFRecordDataset(filenames) dataset2 = dataset2.apply( - snapshot.snapshot(tmpdir, compression=compression)) + snapshot.legacy_snapshot(tmpdir, compression=compression)) self.assertDatasetProduces(dataset2, expected, assert_items_equal=True) @combinations.generate(test_base.default_test_combinations()) @@ -489,7 +504,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset3 = dataset_ops.Dataset.range(200, 300) dataset = dataset1.concatenate(dataset2).concatenate(dataset3) - dataset = dataset.apply(snapshot.snapshot(tmpdir)) + dataset = dataset.apply(snapshot.legacy_snapshot(tmpdir)) self.assertDatasetProduces(dataset, list(range(300))) dataset4 = dataset_ops.Dataset.range(200, 300) @@ -497,7 +512,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset6 = dataset_ops.Dataset.range(0, 100) dataset = dataset6.concatenate(dataset5).concatenate(dataset4) - dataset = dataset.apply(snapshot.snapshot(tmpdir)) + dataset = dataset.apply(snapshot.legacy_snapshot(tmpdir)) self.assertDatasetProduces(dataset, list(range(300))) self.assertSnapshotDirectoryContains(tmpdir, 1, 1, 1) @@ -508,7 +523,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset1 = dataset_ops.Dataset.range(1000) dataset1 = dataset1.apply( - snapshot.snapshot(tmpdir, pending_snapshot_expiry_seconds=1)) + snapshot.legacy_snapshot(tmpdir, pending_snapshot_expiry_seconds=1)) next1 = self.getNext(dataset1) # Don't finish reading dataset1, so it is never finalized @@ -524,7 +539,7 @@ class 
SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, # snapshot has expired. dataset2 = dataset_ops.Dataset.range(1000) dataset2 = dataset2.apply( - snapshot.snapshot(tmpdir, pending_snapshot_expiry_seconds=1)) + snapshot.legacy_snapshot(tmpdir, pending_snapshot_expiry_seconds=1)) next2 = self.getNext(dataset2) for _ in range(500): @@ -537,7 +552,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset1 = dataset_ops.Dataset.range(1000) dataset1 = dataset1.apply( - snapshot.snapshot(tmpdir, shard_size_bytes=10000)) + snapshot.legacy_snapshot(tmpdir, shard_size_bytes=10000)) next1 = self.getNext(dataset1) for _ in range(1000): @@ -547,7 +562,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, # Create second snapshot with a different shard_size_bytes dataset2 = dataset_ops.Dataset.range(1000) dataset2 = dataset1.apply( - snapshot.snapshot(tmpdir, shard_size_bytes=20000)) + snapshot.legacy_snapshot(tmpdir, shard_size_bytes=20000)) next2 = self.getNext(dataset2) for _ in range(1000): @@ -568,7 +583,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, dataset = dataset.map(lambda x: gen_array_ops.broadcast_to(x, [1024, 1024])) dataset = dataset.repeat(10) dataset = dataset.apply( - snapshot.snapshot( + snapshot.legacy_snapshot( tmpdir, shard_size_bytes=10 * 1024 * 1024, compression=compression)) next_fn = self.getNext(dataset) @@ -593,14 +608,14 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, tmpdir = self.snapshot_dir dataset = core_readers._TFRecordDataset(filenames) - dataset = dataset.apply(snapshot.snapshot(tmpdir)) + dataset = dataset.apply(snapshot.legacy_snapshot(tmpdir)) self.assertDatasetProduces(dataset, expected) # remove the original files and try to read the data back only from snapshot self.removeTFRecords() dataset2 = core_readers._TFRecordDataset(filenames) - dataset2 = dataset2.apply(snapshot.snapshot(tmpdir)) + dataset2 = dataset2.apply(snapshot.legacy_snapshot(tmpdir)) self.assertDatasetProduces(dataset2, expected) expected_after = [ @@ -610,7 +625,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, ] dataset3 = core_readers._TFRecordDataset(filenames) - dataset3 = dataset3.apply(snapshot.snapshot(tmpdir)) + dataset3 = dataset3.apply(snapshot.legacy_snapshot(tmpdir)) dataset3 = dataset3.map(lambda x: string_ops.substr_v2(x, 2, 1000)) self.assertDatasetProduces(dataset3, expected_after) diff --git a/tensorflow/python/data/experimental/ops/snapshot.py b/tensorflow/python/data/experimental/ops/snapshot.py index 942aec712c3..490455fcbc3 100644 --- a/tensorflow/python/data/experimental/ops/snapshot.py +++ b/tensorflow/python/data/experimental/ops/snapshot.py @@ -29,7 +29,7 @@ COMPRESSION_SNAPPY = "SNAPPY" COMPRESSION_NONE = None -class _SnapshotDataset(dataset_ops.UnaryUnchangedStructureDataset): +class _LegacySnapshotDataset(dataset_ops.UnaryUnchangedStructureDataset): """A Dataset that captures a snapshot or reads from a snapshot.""" def __init__(self, @@ -96,23 +96,23 @@ class _SnapshotDataset(dataset_ops.UnaryUnchangedStructureDataset): snapshot_name=self._snapshot_name, **self._flat_structure) - super(_SnapshotDataset, self).__init__(input_dataset, variant_tensor) + super(_LegacySnapshotDataset, self).__init__(input_dataset, variant_tensor) -def snapshot(path, - compression=None, - reader_path_prefix=None, - writer_path_prefix=None, - shard_size_bytes=None, - 
pending_snapshot_expiry_seconds=None, - num_reader_threads=None, - reader_buffer_size=None, - num_writer_threads=None, - writer_buffer_size=None, - shuffle_on_read=None, - shuffle_seed=None, - mode=None, - snapshot_name=None): +def legacy_snapshot(path, + compression=None, + reader_path_prefix=None, + writer_path_prefix=None, + shard_size_bytes=None, + pending_snapshot_expiry_seconds=None, + num_reader_threads=None, + reader_buffer_size=None, + num_writer_threads=None, + writer_buffer_size=None, + shuffle_on_read=None, + shuffle_seed=None, + mode=None, + snapshot_name=None): """Writes to/reads from a snapshot of a dataset. This function attempts to determine whether a valid snapshot exists at the @@ -168,7 +168,7 @@ def snapshot(path, """ def _apply_fn(dataset): - return _SnapshotDataset( + return _LegacySnapshotDataset( input_dataset=dataset, path=path, compression=compression, From 7e333c6fbe6aea8904c81094e9f8b5b60d7fc5d1 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Wed, 27 May 2020 17:07:46 -0700 Subject: [PATCH 1265/1533] Read from a sharded checkpoint in parallel with multiple threads. PiperOrigin-RevId: 313492933 Change-Id: Ie0bf7322ae0cdf9d980bb75146b3632ad523763c --- .../core/util/tensor_bundle/tensor_bundle.cc | 148 +++++++++++------- .../core/util/tensor_bundle/tensor_bundle.h | 3 + 2 files changed, 92 insertions(+), 59 deletions(-) diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc index e1234d330fc..ad9ee2a7c0f 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/hash/crc32c.h" #include "tensorflow/core/lib/io/path.h" @@ -41,6 +42,8 @@ limitations under the License. #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/blocking_counter.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/saved_tensor_slice_util.h" #include "tensorflow/core/util/tensor_bundle/byte_swap.h" @@ -1021,79 +1024,106 @@ Status BundleReader::GetSliceValue(StringPiece full_tensor_key, " to restore in slice_spec: ", slice_spec.DebugString()); } - // The union of the slices in "details" covers "slice_spec". Performs the - // copies from each. - BundleEntryProto stored_slice_entry = full_tensor_entry; + BlockingCounter counter(static_cast(details.size())); + auto runner = [this, &details](std::function fn) { + if (details.size() > 1) { + // If there are multiple slices to read, perform the read in parallel + // using multiple threads. + env_->SchedClosure(fn); + } else { + fn(); + } + }; + for (const auto& slice_tag_pair : details) { - // Seeks for the stored slice. - const TensorSlice& stored_slice = slice_tag_pair.first; + runner([this, &slice_spec, &full_shape, &slice_tag_pair, &full_tensor_entry, + &full_tensor_key_string, &counter, val]() { + // The union of the slices in "details" covers "slice_spec". Performs the + // copies from each. + BundleEntryProto stored_slice_entry = full_tensor_entry; + // Seeks for the stored slice. 
+ const TensorSlice& stored_slice = slice_tag_pair.first; - // We already have the entry for the full tensor, so don't query again if - // the slice is full. - if (!stored_slice.IsFull()) { - const string encoded_stored_slice_name = - checkpoint::EncodeTensorNameSlice(full_tensor_key_string, - stored_slice); - status_ = - GetBundleEntryProto(encoded_stored_slice_name, &stored_slice_entry); - if (!status_.ok()) return status_; - } + // We already have the entry for the full tensor, so don't query again if + // the slice is full. + if (!stored_slice.IsFull()) { + const string encoded_stored_slice_name = + checkpoint::EncodeTensorNameSlice(full_tensor_key_string, + stored_slice); + mutex_lock l(mu_); + // `GetBundleEntryProto` will access `iter_`, so protecting it with a + // mutex lock. + status_ = + GetBundleEntryProto(encoded_stored_slice_name, &stored_slice_entry); + if (!status_.ok()) return; + } - // TODO(zongheng): should we take an OpKernelContext, so that we can call - // allocate_temp()? Note that without major refactorings to Saver, it's - // hard for the caller of the tensor bundle module to allocate these - // precisely-shaped scratch storage. + auto cleanup = gtl::MakeCleanup([&counter] { counter.DecrementCount(); }); - // Optimization for the common case: the stored slice can be directly - // copied to the destination without additional slicing. This is true when - // either the slices are equal or when they are both full slices having the - // same shape. - TensorShape stored_slice_shape(stored_slice_entry.shape()); - if (stored_slice == slice_spec || - (stored_slice_shape == val->shape() && - IsFullSlice(stored_slice, stored_slice_shape) && - IsFullSlice(slice_spec, stored_slice_shape))) { - VLOG(1) << "Optimized for common case: directly copying into " - "pre-allocated buffer; spec: " - << slice_spec.DebugString(); - status_ = GetValue(stored_slice_entry, val); - return status_; - } + // TODO(zongheng): should we take an OpKernelContext, so that we can + // call allocate_temp()? Note that without major refactorings to + // Saver, it's hard for the caller of the tensor bundle module to + // allocate these precisely-shaped scratch storage. - Tensor stored_slice_tensor(stored_slice_entry.dtype(), stored_slice_shape); - status_ = GetValue(stored_slice_entry, &stored_slice_tensor); - if (!status_.ok()) return status_; + // Optimization for the common case: the stored slice can be directly + // copied to the destination without additional slicing. This is true + // when either the slices are equal or when they are both full slices + // having the same shape. + TensorShape stored_slice_shape(stored_slice_entry.shape()); + if (stored_slice == slice_spec || + (stored_slice_shape == val->shape() && + IsFullSlice(stored_slice, stored_slice_shape) && + IsFullSlice(slice_spec, stored_slice_shape))) { + VLOG(1) << "Optimized for common case: directly copying into " + "pre-allocated buffer; spec: " + << slice_spec.DebugString(); + status_ = GetValue(stored_slice_entry, val); + return; + } - // Copies the intersection over. - const DataType common_dtype = full_tensor_entry.dtype(); - switch (common_dtype) { + Tensor stored_slice_tensor(stored_slice_entry.dtype(), + stored_slice_shape); + status_ = GetValue(stored_slice_entry, &stored_slice_tensor); + if (!status_.ok()) return; + + // Copies the intersection over. + mutex_lock l(mu_); + // `CopyDataFromTensorSliceToTensorSlice` will write to `val`, so + // protecting it with a mutex lock. 
+      const DataType common_dtype = full_tensor_entry.dtype();
+      switch (common_dtype) {
 #define HANDLE_COPY(T)                                                   \
   case DataTypeToEnum<T>::value:                                         \
     CHECK(CopyDataFromTensorSliceToTensorSlice(                          \
         full_shape, stored_slice, slice_spec,                            \
         stored_slice_tensor.flat<T>().data(), val->flat<T>().data()));   \
     break;
-
-    HANDLE_COPY(float)
-    HANDLE_COPY(double)
-    HANDLE_COPY(int32)
-    HANDLE_COPY(uint8)
-    HANDLE_COPY(int16)
-    HANDLE_COPY(int8)
-    HANDLE_COPY(complex64)
-    HANDLE_COPY(complex128)
-    HANDLE_COPY(int64)
-    HANDLE_COPY(bool)
-    HANDLE_COPY(qint32)
-    HANDLE_COPY(quint8)
-    HANDLE_COPY(qint8)
-    HANDLE_COPY(bfloat16)
-    default:
-      return errors::InvalidArgument("Dtype ", DataTypeString(common_dtype),
-                                     " not supported.");
-  }
+        HANDLE_COPY(float)
+        HANDLE_COPY(double)
+        HANDLE_COPY(int32)
+        HANDLE_COPY(uint8)
+        HANDLE_COPY(int16)
+        HANDLE_COPY(int8)
+        HANDLE_COPY(complex64)
+        HANDLE_COPY(complex128)
+        HANDLE_COPY(int64)
+        HANDLE_COPY(bool)
+        HANDLE_COPY(qint32)
+        HANDLE_COPY(quint8)
+        HANDLE_COPY(qint8)
+        HANDLE_COPY(bfloat16)
+        default:
+          status_ = errors::InvalidArgument(
+              "Dtype ", DataTypeString(common_dtype), " not supported.");
+          if (!status_.ok()) return;
+      }
 #undef HANDLE_COPY
+    });
   }
+
+  counter.Wait();
+  TF_RETURN_IF_ERROR(status_);
+  return Status::OK();
 }
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
index c441000e47d..24a9c488cbb 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.h
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h
@@ -306,6 +306,9 @@ class BundleReader {
   // differs from that of the current system's processor architecture.
   bool need_to_swap_bytes_;
 
+  // Protect internal states when accessing from multiple threads.
+  mutable mutex mu_;
+
   friend class TensorBundleAlignmentTest;  // For testing data alignment.
 
   TF_DISALLOW_COPY_AND_ASSIGN(BundleReader);

From b5552d7b1810c738343deec5c2b1bf93cc27b785 Mon Sep 17 00:00:00 2001
From: Khanh LeViet
Date: Wed, 27 May 2020 17:40:14 -0700
Subject: [PATCH 1266/1533] Add snippets to save TF Lite model as a file

PiperOrigin-RevId: 313497415
Change-Id: If693d071960859341ef58770cf4ab0f939c20a4b
---
 tensorflow/lite/g3doc/convert/python_api.md | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tensorflow/lite/g3doc/convert/python_api.md b/tensorflow/lite/g3doc/convert/python_api.md
index ba86eac25fd..44b58fb0759 100644
--- a/tensorflow/lite/g3doc/convert/python_api.md
+++ b/tensorflow/lite/g3doc/convert/python_api.md
@@ -42,7 +42,7 @@ root.v1 = tf.Variable(3.)
 root.v2 = tf.Variable(2.)
 root.f = tf.function(lambda x: root.v1 * root.v2 * x)
 
-# Save the model.
+# Save the model in SavedModel format.
 export_dir = "/tmp/test_saved_model"
 input_data = tf.constant(1., shape=[1, 1])
 to_save = root.f.get_concrete_function(input_data)
@@ -51,6 +51,10 @@ tf.saved_model.save(root, export_dir, to_save)
 # Convert the model.
 converter = tf.lite.TFLiteConverter.from_saved_model(export_dir)
 tflite_model = converter.convert()
+
+# Save the TF Lite model.
+with tf.gfile.GFile('model.tflite', 'wb') as f:
+  f.write(tflite_model)
 ```
 
 This API does not have the option of specifying the input shape of any input
@@ -87,6 +91,10 @@ model.fit(x, y, epochs=50)
 # Convert the model.
 converter = tf.lite.TFLiteConverter.from_keras_model(model)
 tflite_model = converter.convert()
+
+# Save the TF Lite model.
+with tf.gfile.GFile('model.tflite', 'wb') as f: + f.write(tflite_model) ``` ### Converting a concrete function @@ -115,6 +123,10 @@ concrete_func = root.f.get_concrete_function(input_data) # functions is under development. converter = tf.lite.TFLiteConverter.from_concrete_functions([concrete_func]) tflite_model = converter.convert() + +# Save the TF Lite model. +with tf.gfile.GFile('model.tflite', 'wb') as f: + f.write(tflite_model) ``` ### End-to-end MobileNet conversion From fb86acf839304067afa5a79f6786092d0561f2f8 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Wed, 27 May 2020 18:39:55 -0700 Subject: [PATCH 1267/1533] Update markup to provide clickable redirect URLs The previous version wasn't using a link markup, so the URL wasn't clickable PiperOrigin-RevId: 313504621 Change-Id: Id27cd9822310fea290d7fba5dbb5536c1904da62 --- .../lite/micro/examples/hello_world/create_sine_model.ipynb | 2 +- .../lite/micro/examples/micro_speech/train_speech_model.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/examples/hello_world/create_sine_model.ipynb b/tensorflow/lite/micro/examples/hello_world/create_sine_model.ipynb index 2cedb4d3fc9..b34c0bb3fb8 100644 --- a/tensorflow/lite/micro/examples/hello_world/create_sine_model.ipynb +++ b/tensorflow/lite/micro/examples/hello_world/create_sine_model.ipynb @@ -1 +1 @@ -{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Redirect","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyOIPo/+nZRJ3iJImegJmzTI"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"RmcjrlLpRHKk","colab_type":"code","colab":{}},"source":["This Colab notebook has been moved to https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/hello_world/train/train_hello_world_model.ipynb"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"b6Hnl5__RKOH","colab_type":"text"},"source":[""]}]} \ No newline at end of file +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Redirect","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyO1u6oks1qPVEQNnHFD3Cyo"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"86C-FMxpdZxv","colab_type":"text"},"source":["This Colab notebook has been moved to [https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/hello_world/train/train_hello_world_model.ipynb](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/hello_world/train/train_hello_world_model.ipynb)\n"]}]} \ No newline at end of file diff --git a/tensorflow/lite/micro/examples/micro_speech/train_speech_model.ipynb b/tensorflow/lite/micro/examples/micro_speech/train_speech_model.ipynb index ec68bf28cda..fb56dc3a4d9 100644 --- a/tensorflow/lite/micro/examples/micro_speech/train_speech_model.ipynb +++ b/tensorflow/lite/micro/examples/micro_speech/train_speech_model.ipynb @@ -1 +1 @@ -{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Redirect","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyOIPo/+nZRJ3iJImegJmzTI"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"code","metadata":{"id":"RmcjrlLpRHKk","colab_type":"code","colab":{}},"source":["This Colab notebook has been moved to 
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"b6Hnl5__RKOH","colab_type":"text"},"source":[""]}]} \ No newline at end of file +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"Redirect","provenance":[],"collapsed_sections":[],"authorship_tag":"ABX9TyO1u6oks1qPVEQNnHFD3Cyo"},"kernelspec":{"name":"python3","display_name":"Python 3"}},"cells":[{"cell_type":"markdown","metadata":{"id":"86C-FMxpdZxv","colab_type":"text"},"source":["This Colab notebook has been moved to [https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb)\n"]}]} \ No newline at end of file From a7048d89a11f7f7ef6234ca0d01b341b1e5780f7 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 27 May 2020 18:50:31 -0700 Subject: [PATCH 1268/1533] Support models with FP16 weights in XNNPACK delegate PiperOrigin-RevId: 313505742 Change-Id: Id21f7528741073e93a7132d529c3cd79957a73fb --- tensorflow/lite/delegates/xnnpack/BUILD | 6 + tensorflow/lite/delegates/xnnpack/add_test.cc | 29 ++ .../xnnpack/binary_elementwise_tester.cc | 137 +++++--- .../xnnpack/binary_elementwise_tester.h | 8 + .../lite/delegates/xnnpack/conv_2d_test.cc | 241 ++++++++++---- .../xnnpack/depthwise_conv_2d_test.cc | 31 ++ .../xnnpack/depthwise_conv_2d_tester.cc | 211 ++++++++---- .../xnnpack/depthwise_conv_2d_tester.h | 8 + .../delegates/xnnpack/fully_connected_test.cc | 23 ++ .../xnnpack/fully_connected_tester.cc | 180 ++++++---- .../xnnpack/fully_connected_tester.h | 8 + tensorflow/lite/delegates/xnnpack/mul_test.cc | 29 ++ .../delegates/xnnpack/xnnpack_delegate.cc | 314 +++++++++++++++--- 13 files changed, 942 insertions(+), 283 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index 1cdba72b615..df70a314308 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -24,6 +24,7 @@ cc_library( "//tensorflow/lite:util", "//tensorflow/lite/c:common", "//tensorflow/lite/schema:schema_fbs", + "@FP16", "@XNNPACK", ], ) @@ -39,6 +40,7 @@ cc_library( "//tensorflow/lite:util", "//tensorflow/lite/c:common", "//tensorflow/lite/schema:schema_fbs", + "@FP16", "@XNNPACK", ], ) @@ -56,6 +58,7 @@ cc_library( "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", + "@FP16", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -72,6 +75,7 @@ cc_library( "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", + "@FP16", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -88,6 +92,7 @@ cc_library( "//tensorflow/lite/c:common", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", + "@FP16", "@com_google_googletest//:gtest", "@flatbuffers", ], @@ -215,6 +220,7 @@ cc_test( "//tensorflow/lite:schema_fbs_version", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/schema:schema_fbs", + "@FP16", "@com_google_googletest//:gtest", "@flatbuffers", ], diff --git a/tensorflow/lite/delegates/xnnpack/add_test.cc b/tensorflow/lite/delegates/xnnpack/add_test.cc index dd2857e01ce..6bc8f8d6bca 100644 --- a/tensorflow/lite/delegates/xnnpack/add_test.cc +++ 
b/tensorflow/lite/delegates/xnnpack/add_test.cc @@ -679,6 +679,35 @@ TEST(Add, 2DByStatic0D) { .Test(BuiltinOperator_ADD, xnnpack_delegate.get()); } +TEST(Add, FP16Weights) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + BinaryElementwiseTester() + .Input1Shape({batch, height, width, channels}) + .Input2Shape({batch, height, width, channels}) + .Input1Static(true) + .FP16Weights() + .Test(BuiltinOperator_ADD, xnnpack_delegate.get()); + + BinaryElementwiseTester() + .Input1Shape({batch, height, width, channels}) + .Input2Shape({batch, height, width, channels}) + .Input2Static(true) + .FP16Weights() + .Test(BuiltinOperator_ADD, xnnpack_delegate.get()); +} + TEST(Add, ReluActivation) { std::unique_ptr xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), diff --git a/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.cc b/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.cc index e846cbeffe3..ad5b197d6fa 100644 --- a/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.cc +++ b/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include +#include #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/register.h" @@ -62,6 +63,9 @@ void BinaryElementwiseTester::Test(tflite::BuiltinOperator binary_op, if (Input1Static()) { ASSERT_FALSE(Input2Static()); } + if (FP16Weights()) { + ASSERT_TRUE(Input1Static() || Input2Static()); + } std::random_device random_device; auto rng = std::mt19937(random_device()); @@ -180,8 +184,12 @@ std::vector BinaryElementwiseTester::CreateTfLiteModel( auto input2_rng = std::bind(input2_distribution, std::ref(rng)); flatbuffers::FlatBufferBuilder builder; - flatbuffers::Offset operator_code = - CreateOperatorCode(builder, binary_op); + std::vector> operator_codes{ + {CreateOperatorCode(builder, binary_op)}}; + if (FP16Weights()) { + operator_codes.emplace_back( + CreateOperatorCode(builder, BuiltinOperator_DEQUANTIZE)); + } std::vector> buffers{{ CreateBuffer(builder, builder.CreateVector({})), @@ -189,43 +197,89 @@ std::vector BinaryElementwiseTester::CreateTfLiteModel( int32_t input1_buffer = 0; if (Input1Static()) { - std::vector input1_data(ComputeSize(Input1Shape())); - std::generate(input1_data.begin(), input1_data.end(), input1_rng); + if (FP16Weights()) { + std::vector input1_data(ComputeSize(Input1Shape())); + std::generate(input1_data.begin(), input1_data.end(), + std::bind(fp16_ieee_from_fp32_value, input1_rng)); - input1_buffer = buffers.size(); - buffers.push_back(CreateBuffer( - builder, builder.CreateVector( - reinterpret_cast(input1_data.data()), - sizeof(float) * input1_data.size()))); + buffers.push_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(input1_data.data()), + sizeof(uint16_t) * input1_data.size()))); + } else { + std::vector input1_data(ComputeSize(Input1Shape())); + std::generate(input1_data.begin(), input1_data.end(), input1_rng); + + input1_buffer = buffers.size(); + buffers.push_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(input1_data.data()), + 
sizeof(float) * input1_data.size()))); + } } int32_t input2_buffer = 0; if (Input2Static()) { - std::vector input2_data(ComputeSize(Input2Shape())); - std::generate(input2_data.begin(), input2_data.end(), input2_rng); + if (FP16Weights()) { + std::vector input2_data(ComputeSize(Input2Shape())); + std::generate(input2_data.begin(), input2_data.end(), + std::bind(fp16_ieee_from_fp32_value, input1_rng)); - input2_buffer = buffers.size(); - buffers.push_back(CreateBuffer( - builder, builder.CreateVector( - reinterpret_cast(input2_data.data()), - sizeof(float) * input2_data.size()))); + buffers.push_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(input2_data.data()), + sizeof(uint16_t) * input2_data.size()))); + } else { + std::vector input2_data(ComputeSize(Input2Shape())); + std::generate(input2_data.begin(), input2_data.end(), input2_rng); + + input2_buffer = buffers.size(); + buffers.push_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(input2_data.data()), + sizeof(float) * input2_data.size()))); + } } const std::vector output_shape = OutputShape(); - const std::array, 3> tensors{{ - CreateTensor(builder, - builder.CreateVector(Input1Shape().data(), - Input1Shape().size()), - TensorType_FLOAT32, input1_buffer), - CreateTensor(builder, - builder.CreateVector(Input2Shape().data(), - Input2Shape().size()), - TensorType_FLOAT32, input2_buffer), - CreateTensor(builder, - builder.CreateVector(output_shape.data(), - output_shape.size()), - TensorType_FLOAT32), - }}; + std::vector> tensors; + std::vector> operators; + if (FP16Weights() && Input1Static()) { + tensors.emplace_back( + CreateTensor(builder, + builder.CreateVector(Input1Shape().data(), + Input1Shape().size()), + TensorType_FLOAT16, 1)); + } + if (FP16Weights() && Input2Static()) { + tensors.emplace_back( + CreateTensor(builder, + builder.CreateVector(Input2Shape().data(), + Input2Shape().size()), + TensorType_FLOAT16, 1)); + } + if (FP16Weights()) { + const std::array dequantize_inputs{{0}}; + const std::array dequantize_outputs{{Input1Static() ? 
1 : 2}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_inputs.data(), + dequantize_inputs.size()), + builder.CreateVector(dequantize_outputs.data(), + dequantize_outputs.size()))); + } + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(Input1Shape().data(), Input1Shape().size()), + TensorType_FLOAT32, input1_buffer)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(Input2Shape().data(), Input2Shape().size()), + TensorType_FLOAT32, input2_buffer)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(output_shape.data(), output_shape.size()), + TensorType_FLOAT32)); tflite::BuiltinOptions builtin_options_type = tflite::BuiltinOptions_NONE; flatbuffers::Offset builtin_options = 0; @@ -250,35 +304,40 @@ std::vector BinaryElementwiseTester::CreateTfLiteModel( EXPECT_EQ(Activation(), ActivationFunctionType_NONE); } - const std::array op_inputs{{0, 1}}; - const std::array op_outputs{{2}}; - flatbuffers::Offset op = CreateOperator( + const std::array op_inputs{ + {static_cast(tensors.size()) - 3, + static_cast(tensors.size()) - 2}}; + const std::array op_outputs{ + {static_cast(tensors.size()) - 1}}; + operators.emplace_back(CreateOperator( builder, /*opcode_index=*/0, builder.CreateVector(op_inputs.data(), op_inputs.size()), builder.CreateVector(op_outputs.data(), op_outputs.size()), - builtin_options_type, builtin_options); + builtin_options_type, builtin_options)); std::vector subgraph_inputs; if (!Input1Static()) { - subgraph_inputs.push_back(0); + subgraph_inputs.push_back(tensors.size() - 3); } if (!Input2Static()) { - subgraph_inputs.push_back(1); + subgraph_inputs.push_back(tensors.size() - 2); } - const std::array subgraph_outputs{{2}}; + const std::array subgraph_outputs{ + {static_cast(tensors.size()) - 1}}; flatbuffers::Offset subgraph = CreateSubGraph( builder, builder.CreateVector(tensors.data(), tensors.size()), builder.CreateVector(subgraph_inputs.data(), subgraph_inputs.size()), builder.CreateVector(subgraph_outputs.data(), subgraph_outputs.size()), - builder.CreateVector(&op, 1)); + builder.CreateVector(operators.data(), operators.size())); flatbuffers::Offset description = builder.CreateString("Binary operator model"); flatbuffers::Offset model_buffer = CreateModel( - builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&operator_code, 1), + builder, TFLITE_SCHEMA_VERSION, + builder.CreateVector(operator_codes.data(), operator_codes.size()), builder.CreateVector(&subgraph, 1), description, builder.CreateVector(buffers.data(), buffers.size())); diff --git a/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.h b/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.h index 15c99c3148d..a0c2440f59a 100644 --- a/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.h +++ b/tensorflow/lite/delegates/xnnpack/binary_elementwise_tester.h @@ -74,6 +74,13 @@ class BinaryElementwiseTester { inline bool Input2Static() const { return input2_static_; } + inline BinaryElementwiseTester& FP16Weights() { + fp16_weights_ = true; + return *this; + } + + inline bool FP16Weights() const { return fp16_weights_; } + inline BinaryElementwiseTester& ReluActivation() { activation_ = ::tflite::ActivationFunctionType_RELU; return *this; @@ -114,6 +121,7 @@ class BinaryElementwiseTester { std::vector input2_shape_; bool input1_static_ = false; bool input2_static_ = false; + bool fp16_weights_ = false; ::tflite::ActivationFunctionType activation_ = 
::tflite::ActivationFunctionType_NONE; }; diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc index 95a358d1b9c..a8c6a1956bc 100644 --- a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc @@ -13,12 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include #include #include +#include #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" #include "tensorflow/lite/interpreter.h" @@ -146,6 +148,13 @@ class Conv2DTester { int32_t DilationWidth() const { return dilation_width_; } + inline Conv2DTester& FP16Weights() { + fp16_weights_ = true; + return *this; + } + + inline bool FP16Weights() const { return fp16_weights_; } + Conv2DTester& SamePadding(bool same_padding) { same_padding_ = same_padding; return *this; @@ -154,11 +163,7 @@ class Conv2DTester { bool SamePadding() const { return same_padding_; } void Test(TfLiteDelegate* delegate) const { - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(), rng); - - std::vector buffer = CreateTfLiteModel(std::ref(f32rng)); + std::vector buffer = CreateTfLiteModel(); const Model* model = GetModel(buffer.data()); std::unique_ptr delegate_interpreter; @@ -187,6 +192,10 @@ class Conv2DTester { ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), kTfLiteOk); + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto f32rng = std::bind(std::uniform_real_distribution(), rng); + float* default_input_data = default_interpreter->typed_tensor( default_interpreter->inputs()[0]); std::generate(default_input_data, @@ -219,82 +228,149 @@ class Conv2DTester { } private: - std::vector CreateTfLiteModel(std::function f32rng) const { + std::vector CreateTfLiteModel() const { + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto f32rng = std::bind(std::uniform_real_distribution(), rng); + flatbuffers::FlatBufferBuilder builder; - flatbuffers::Offset operator_code = - CreateOperatorCode(builder, BuiltinOperator_CONV_2D, 0); + std::vector> operator_codes{ + {CreateOperatorCode(builder, BuiltinOperator_CONV_2D, 0)}}; + std::vector> operators; + std::vector> buffers{ + {CreateBuffer(builder, builder.CreateVector({}))}}; + + if (FP16Weights()) { + operator_codes.emplace_back( + CreateOperatorCode(builder, BuiltinOperator_DEQUANTIZE)); + + auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng); + + std::vector filter_data(OutputChannels() * KernelHeight() * + KernelWidth() * InputChannels()); + std::vector bias_data(OutputChannels()); + + std::generate(filter_data.begin(), filter_data.end(), f16rng); + std::generate(bias_data.begin(), bias_data.end(), f16rng); + + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(filter_data.data()), + sizeof(uint16_t) * filter_data.size()))); + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(bias_data.data()), + sizeof(uint16_t) * bias_data.size()))); + + const std::array dequantize_filter_inputs{{0}}; + const std::array dequantize_filter_outputs{{3}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + 
builder.CreateVector(dequantize_filter_inputs.data(), + dequantize_filter_inputs.size()), + builder.CreateVector(dequantize_filter_outputs.data(), + dequantize_filter_outputs.size()))); + const std::array dequantize_bias_inputs{{1}}; + const std::array dequantize_bias_outputs{{4}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_bias_inputs.data(), + dequantize_bias_inputs.size()), + builder.CreateVector(dequantize_bias_outputs.data(), + dequantize_bias_outputs.size()))); + } else { + std::vector filter_data(OutputChannels() * KernelHeight() * + KernelWidth() * InputChannels()); + std::vector bias_data(OutputChannels()); + + std::generate(filter_data.begin(), filter_data.end(), f32rng); + std::generate(bias_data.begin(), bias_data.end(), f32rng); + + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(filter_data.data()), + sizeof(float) * filter_data.size()))); + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(bias_data.data()), + sizeof(float) * bias_data.size()))); + } + + const std::array input_shape{ + {BatchSize(), InputHeight(), InputWidth(), InputChannels()}}; + const std::array output_shape{ + {BatchSize(), OutputHeight(), OutputWidth(), OutputChannels()}}; + const std::array filter_shape{ + {OutputChannels(), KernelHeight(), KernelWidth(), InputChannels()}}; + const std::array bias_shape{{OutputChannels()}}; + + std::vector> tensors; + if (FP16Weights()) { + tensors.emplace_back( + CreateTensor(builder, + builder.CreateVector(filter_shape.data(), + filter_shape.size()), + TensorType_FLOAT16, /*buffer=*/1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(bias_shape.data(), bias_shape.size()), + TensorType_FLOAT16, /*buffer=*/2)); + } + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(input_shape.data(), input_shape.size()), + TensorType_FLOAT32)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(filter_shape.data(), filter_shape.size()), + TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 0 : 1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(bias_shape.data(), bias_shape.size()), + TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 0 : 2)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(output_shape.data(), output_shape.size()), + TensorType_FLOAT32)); + + const std::array op_inputs{ + {static_cast(tensors.size()) - 4, + static_cast(tensors.size()) - 3, + static_cast(tensors.size()) - 2}}; + const std::array op_outputs{ + {static_cast(tensors.size()) - 1}}; flatbuffers::Offset conv2d_options = CreateConv2DOptions( builder, SamePadding() ? 
tflite::Padding_SAME : tflite::Padding_VALID, StrideWidth(), StrideHeight(), ActivationFunctionType_NONE, DilationWidth(), DilationHeight()); - std::vector filter_data(OutputChannels() * KernelHeight() * - KernelWidth() * InputChannels()); - std::vector bias_data(OutputChannels()); + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/0, + builder.CreateVector(op_inputs.data(), op_inputs.size()), + builder.CreateVector(op_outputs.data(), op_outputs.size()), + BuiltinOptions_Conv2DOptions, conv2d_options.Union())); - std::generate(filter_data.begin(), filter_data.end(), f32rng); - std::generate(bias_data.begin(), bias_data.end(), f32rng); - - flatbuffers::Offset buffers[3] = { - CreateBuffer(builder, builder.CreateVector({})), - CreateBuffer(builder, - builder.CreateVector( - reinterpret_cast(filter_data.data()), - sizeof(float) * filter_data.size())), - CreateBuffer(builder, - builder.CreateVector( - reinterpret_cast(bias_data.data()), - sizeof(float) * bias_data.size())), - }; - - const int32_t input_shape[4] = {BatchSize(), InputHeight(), InputWidth(), - InputChannels()}; - const int32_t output_shape[4] = {BatchSize(), OutputHeight(), OutputWidth(), - OutputChannels()}; - const int32_t filter_shape[4] = {OutputChannels(), KernelHeight(), - KernelWidth(), InputChannels()}; - const int32_t bias_shape[1] = {OutputChannels()}; - - flatbuffers::Offset tensors[4] = { - CreateTensor(builder, builder.CreateVector(input_shape, 4), - TensorType_FLOAT32, /*buffer=*/0, - builder.CreateString("X")), - CreateTensor(builder, builder.CreateVector(filter_shape, 4), - TensorType_FLOAT32, /*buffer=*/1, - builder.CreateString("W")), - CreateTensor(builder, builder.CreateVector(bias_shape, 1), - TensorType_FLOAT32, /*buffer=*/2, - builder.CreateString("b")), - CreateTensor(builder, builder.CreateVector(output_shape, 4), - TensorType_FLOAT32, /*buffer=*/0, - builder.CreateString("Y")), - }; - - const int32_t op_inputs[3] = {0, 1, 2}; - const int32_t op_outputs[1] = {3}; - - flatbuffers::Offset op = - CreateOperator(builder, /*opcode_index=*/0, - builder.CreateVector(op_inputs, 3), - builder.CreateVector(op_outputs, 1), - BuiltinOptions_Conv2DOptions, conv2d_options.Union()); - - int32_t subgraph_inputs[1] = {0}; - int32_t subgraph_outputs[1] = {3}; - flatbuffers::Offset subgraph = - CreateSubGraph(builder, builder.CreateVector(tensors, 4), - builder.CreateVector(subgraph_inputs, 1), - builder.CreateVector(subgraph_outputs, 1), - builder.CreateVector(&op, 1), /*name=*/0); + const std::array subgraph_inputs{ + {static_cast(tensors.size()) - 4}}; + const std::array subgraph_outputs{ + {static_cast(tensors.size()) - 1}}; + flatbuffers::Offset subgraph = CreateSubGraph( + builder, builder.CreateVector(tensors.data(), tensors.size()), + builder.CreateVector(subgraph_inputs.data(), + subgraph_inputs.size()), + builder.CreateVector(subgraph_outputs.data(), + subgraph_outputs.size()), + builder.CreateVector(operators.data(), operators.size())); flatbuffers::Offset description = builder.CreateString("Conv2D model"); flatbuffers::Offset model_buffer = CreateModel( - builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&operator_code, 1), + builder, TFLITE_SCHEMA_VERSION, + builder.CreateVector(operator_codes.data(), operator_codes.size()), builder.CreateVector(&subgraph, 1), description, - builder.CreateVector(buffers, 3)); + builder.CreateVector(buffers.data(), buffers.size())); builder.Finish(model_buffer); @@ -313,6 +389,7 @@ class Conv2DTester { int32_t stride_width_ = 1; int32_t dilation_height_ = 1; 
int32_t dilation_width_ = 1; + bool fp16_weights_ = false; bool same_padding_ = true; }; @@ -506,5 +583,35 @@ TEST(Conv2D, DilationWithValidPadding) { .Test(xnnpack_delegate.get()); } +TEST(Conv2D, FP16Weights) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto input_rng = + std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); + auto kernel_rng = + std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); + auto stride_rng = + std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + + Conv2DTester() + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(kernel_rng()) + .KernelWidth(kernel_rng()) + .StrideHeight(stride_rng()) + .StrideWidth(stride_rng()) + .SamePadding(true) + .FP16Weights() + .Test(xnnpack_delegate.get()); +} + } // namespace xnnpack } // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc index fd82e4fd83f..c9d274cbe01 100644 --- a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_test.cc @@ -371,6 +371,37 @@ TEST(DepthwiseConv2D, DepthMultiplier) { .Test(xnnpack_delegate.get()); } +TEST(DepthwiseConv2D, FP16Weights) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); + auto kernel_rng = + std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); + auto stride_rng = + std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(3, 32), std::ref(rng)); + + DepthwiseConv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .KernelHeight(kernel_rng()) + .KernelWidth(kernel_rng()) + .StrideHeight(stride_rng()) + .StrideWidth(stride_rng()) + .FP16Weights() + .Test(xnnpack_delegate.get()); +} + TEST(DepthwiseConv2D, ReluActivation) { std::unique_ptr xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), diff --git a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.cc b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.cc index b6d1dfec69b..9b6749e42f6 100644 --- a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.cc +++ b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include #include +#include #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/register.h" @@ -107,56 +108,110 @@ void DepthwiseConv2DTester::Test(TfLiteDelegate* delegate) const { } std::vector DepthwiseConv2DTester::CreateTfLiteModel() const { - flatbuffers::FlatBufferBuilder builder; - flatbuffers::Offset operator_code = - CreateOperatorCode(builder, BuiltinOperator_DEPTHWISE_CONV_2D); - - flatbuffers::Offset depthwise_conv2d_options = - CreateDepthwiseConv2DOptions( - builder, Padding(), StrideWidth(), StrideHeight(), DepthMultiplier(), - Activation(), DilationWidth(), DilationHeight()); - - std::vector filter_data(KernelHeight() * KernelWidth() * - OutputChannels()); - std::vector bias_data(OutputChannels()); - std::random_device random_device; auto rng = std::mt19937(random_device()); auto range_rng = std::bind( std::uniform_real_distribution(-25.0f, 25.0f), std::ref(rng)); - for (int32_t ic = 0; ic < InputChannels(); ic++) { - // Use the same range of all-positive or all-negative values to generate - // all pixels within the same batch index & channel, but different ranges - // for different channels or batches. This ensures that no catastrophic - // cancellation occur, but test covers both positive and negative inputs. - const float range = range_rng(); - auto value_rng = - std::bind(std::uniform_real_distribution(std::min(range, 0.0f), - std::max(range, 0.0f)), - std::ref(rng)); - for (int32_t m = 0; m < DepthMultiplier(); m++) { - const int32_t oc = ic * DepthMultiplier() + m; - bias_data[oc] = value_rng(); - for (int32_t y = 0; y < KernelHeight(); y++) { - for (int32_t x = 0; x < KernelWidth(); x++) { - const int32_t index = (y * KernelWidth() + x) * OutputChannels() + oc; - filter_data[index] = value_rng(); + + flatbuffers::FlatBufferBuilder builder; + std::vector> operator_codes{ + {CreateOperatorCode(builder, BuiltinOperator_DEPTHWISE_CONV_2D)}}; + std::vector> operators; + std::vector> buffers{ + {CreateBuffer(builder, builder.CreateVector({}))}}; + + if (FP16Weights()) { + operator_codes.emplace_back( + CreateOperatorCode(builder, BuiltinOperator_DEQUANTIZE)); + + std::vector filter_data(KernelHeight() * KernelWidth() * + OutputChannels()); + std::vector bias_data(OutputChannels()); + for (int32_t ic = 0; ic < InputChannels(); ic++) { + // Use the same range of all-positive or all-negative values to generate + // all pixels within the same batch index & channel, but different ranges + // for different channels or batches. This ensures that no catastrophic + // cancellation occur, but test covers both positive and negative inputs. 
+ const float range = range_rng(); + auto value_rng = + std::bind(fp16_ieee_from_fp32_value, + std::bind(std::uniform_real_distribution( + std::min(range, 0.0f), std::max(range, 0.0f)), + std::ref(rng))); + for (int32_t m = 0; m < DepthMultiplier(); m++) { + const int32_t oc = ic * DepthMultiplier() + m; + bias_data[oc] = value_rng(); + for (int32_t y = 0; y < KernelHeight(); y++) { + for (int32_t x = 0; x < KernelWidth(); x++) { + const int32_t index = + (y * KernelWidth() + x) * OutputChannels() + oc; + filter_data[index] = value_rng(); + } } } } - } - const std::array, 3> buffers{{ - CreateBuffer(builder, builder.CreateVector({})), - CreateBuffer(builder, - builder.CreateVector( - reinterpret_cast(filter_data.data()), - sizeof(float) * filter_data.size())), - CreateBuffer(builder, - builder.CreateVector( - reinterpret_cast(bias_data.data()), - sizeof(float) * bias_data.size())), - }}; + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(filter_data.data()), + sizeof(uint16_t) * filter_data.size()))); + buffers.emplace_back(CreateBuffer( + builder, + builder.CreateVector(reinterpret_cast(bias_data.data()), + sizeof(uint16_t) * bias_data.size()))); + + const std::array dequantize_filter_inputs{{0}}; + const std::array dequantize_filter_outputs{{3}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_filter_inputs.data(), + dequantize_filter_inputs.size()), + builder.CreateVector(dequantize_filter_outputs.data(), + dequantize_filter_outputs.size()))); + const std::array dequantize_bias_inputs{{1}}; + const std::array dequantize_bias_outputs{{4}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_bias_inputs.data(), + dequantize_bias_inputs.size()), + builder.CreateVector(dequantize_bias_outputs.data(), + dequantize_bias_outputs.size()))); + } else { + std::vector filter_data(KernelHeight() * KernelWidth() * + OutputChannels()); + std::vector bias_data(OutputChannels()); + for (int32_t ic = 0; ic < InputChannels(); ic++) { + // Use the same range of all-positive or all-negative values to generate + // all pixels within the same batch index & channel, but different ranges + // for different channels or batches. This ensures that no catastrophic + // cancellation occur, but test covers both positive and negative inputs. 
+ const float range = range_rng(); + auto value_rng = + std::bind(std::uniform_real_distribution( + std::min(range, 0.0f), std::max(range, 0.0f)), + std::ref(rng)); + for (int32_t m = 0; m < DepthMultiplier(); m++) { + const int32_t oc = ic * DepthMultiplier() + m; + bias_data[oc] = value_rng(); + for (int32_t y = 0; y < KernelHeight(); y++) { + for (int32_t x = 0; x < KernelWidth(); x++) { + const int32_t index = + (y * KernelWidth() + x) * OutputChannels() + oc; + filter_data[index] = value_rng(); + } + } + } + } + + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(filter_data.data()), + sizeof(float) * filter_data.size()))); + buffers.emplace_back(CreateBuffer( + builder, + builder.CreateVector(reinterpret_cast(bias_data.data()), + sizeof(float) * bias_data.size()))); + } const std::array input_shape{ {BatchSize(), InputHeight(), InputWidth(), InputChannels()}}; @@ -166,49 +221,69 @@ std::vector DepthwiseConv2DTester::CreateTfLiteModel() const { {1, KernelHeight(), KernelWidth(), OutputChannels()}}; const std::array bias_shape{{OutputChannels()}}; - const std::array, 4> tensors{{ - CreateTensor( - builder, - builder.CreateVector(input_shape.data(), input_shape.size()), - TensorType_FLOAT32), - CreateTensor(builder, - builder.CreateVector(filter_shape.data(), - filter_shape.size()), - TensorType_FLOAT32, /*buffer=*/1), - CreateTensor( - builder, - builder.CreateVector(bias_shape.data(), bias_shape.size()), - TensorType_FLOAT32, /*buffer=*/2), - CreateTensor(builder, - builder.CreateVector(output_shape.data(), - output_shape.size()), - TensorType_FLOAT32), - }}; + std::vector> tensors; + if (FP16Weights()) { + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(filter_shape.data(), filter_shape.size()), + TensorType_FLOAT16, /*buffer=*/1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(bias_shape.data(), bias_shape.size()), + TensorType_FLOAT16, /*buffer=*/2)); + } + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(input_shape.data(), input_shape.size()), + TensorType_FLOAT32)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(filter_shape.data(), filter_shape.size()), + TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 0 : 1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(bias_shape.data(), bias_shape.size()), + TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 
0 : 2)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(output_shape.data(), output_shape.size()), + TensorType_FLOAT32)); - const std::array op_inputs{{0, 1, 2}}; - const std::array op_outputs{{3}}; + const std::array op_inputs{ + {static_cast(tensors.size()) - 4, + static_cast(tensors.size()) - 3, + static_cast(tensors.size()) - 2}}; + const std::array op_outputs{ + {static_cast(tensors.size()) - 1}}; - flatbuffers::Offset op = CreateOperator( + flatbuffers::Offset depthwise_conv2d_options = + CreateDepthwiseConv2DOptions( + builder, Padding(), StrideWidth(), StrideHeight(), DepthMultiplier(), + Activation(), DilationWidth(), DilationHeight()); + operators.emplace_back(CreateOperator( builder, /*opcode_index=*/0, builder.CreateVector(op_inputs.data(), op_inputs.size()), builder.CreateVector(op_outputs.data(), op_outputs.size()), - BuiltinOptions_DepthwiseConv2DOptions, depthwise_conv2d_options.Union()); + BuiltinOptions_DepthwiseConv2DOptions, depthwise_conv2d_options.Union())); - const std::array subgraph_inputs{{0}}; - const std::array subgraph_outputs{{3}}; + const std::array subgraph_inputs{ + {static_cast(tensors.size()) - 4}}; + const std::array subgraph_outputs{ + {static_cast(tensors.size()) - 1}}; flatbuffers::Offset subgraph = CreateSubGraph( builder, builder.CreateVector(tensors.data(), tensors.size()), builder.CreateVector(subgraph_inputs.data(), subgraph_inputs.size()), builder.CreateVector(subgraph_outputs.data(), subgraph_outputs.size()), - builder.CreateVector(&op, 1)); + builder.CreateVector(operators.data(), operators.size())); flatbuffers::Offset description = builder.CreateString("DepthwiseConv2D model"); flatbuffers::Offset model_buffer = CreateModel( - builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&operator_code, 1), + builder, TFLITE_SCHEMA_VERSION, + builder.CreateVector(operator_codes.data(), operator_codes.size()), builder.CreateVector(&subgraph, 1), description, builder.CreateVector(buffers.data(), buffers.size())); diff --git a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h index 16dc5920229..102c66af340 100644 --- a/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h +++ b/tensorflow/lite/delegates/xnnpack/depthwise_conv_2d_tester.h @@ -152,6 +152,13 @@ class DepthwiseConv2DTester { return (KernelWidth() - 1) * DilationWidth() + 1; } + inline DepthwiseConv2DTester& FP16Weights() { + fp16_weights_ = true; + return *this; + } + + inline bool FP16Weights() const { return fp16_weights_; } + inline DepthwiseConv2DTester& SamePadding() { padding_ = ::tflite::Padding_SAME; return *this; @@ -209,6 +216,7 @@ class DepthwiseConv2DTester { int32_t stride_width_ = 1; int32_t dilation_height_ = 1; int32_t dilation_width_ = 1; + bool fp16_weights_ = false; ::tflite::Padding padding_ = ::tflite::Padding_VALID; ::tflite::ActivationFunctionType activation_ = ::tflite::ActivationFunctionType_NONE; diff --git a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc index a801ce141ed..0dffd1dee19 100644 --- a/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc +++ b/tensorflow/lite/delegates/xnnpack/fully_connected_test.cc @@ -228,6 +228,29 @@ TEST(FullyConnected, 4DKeepDims) { .Test(xnnpack_delegate.get()); } +TEST(FullyConnected, FP16Weights) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto 
rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + auto channels_rng = + std::bind(std::uniform_int_distribution(2, 9), std::ref(rng)); + const auto batch = batch_rng(); + const auto input_channels = channels_rng(); + const auto output_channels = channels_rng(); + + FullyConnectedTester() + .InputShape({batch, input_channels}) + .InputChannels(input_channels) + .OutputChannels(output_channels) + .FP16Weights() + .Test(xnnpack_delegate.get()); +} + TEST(FullyConnected, ReluActivation) { std::unique_ptr xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), diff --git a/tensorflow/lite/delegates/xnnpack/fully_connected_tester.cc b/tensorflow/lite/delegates/xnnpack/fully_connected_tester.cc index 05716bf18fb..8962b8ba7ba 100644 --- a/tensorflow/lite/delegates/xnnpack/fully_connected_tester.cc +++ b/tensorflow/lite/delegates/xnnpack/fully_connected_tester.cc @@ -23,6 +23,7 @@ limitations under the License. #include #include +#include #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/interpreter.h" #include "tensorflow/lite/kernels/register.h" @@ -109,98 +110,165 @@ void FullyConnectedTester::Test(TfLiteDelegate* delegate) const { std::vector FullyConnectedTester::CreateTfLiteModel() const { std::random_device random_device; auto rng = std::mt19937(random_device()); - auto range_rng = std::bind( std::uniform_real_distribution(-25.0f, 25.0f), std::ref(rng)); flatbuffers::FlatBufferBuilder builder; - flatbuffers::Offset operator_code = - CreateOperatorCode(builder, BuiltinOperator_FULLY_CONNECTED); + std::vector> operator_codes{ + {CreateOperatorCode(builder, BuiltinOperator_FULLY_CONNECTED)}}; + std::vector> operators; + std::vector> buffers{ + {CreateBuffer(builder, builder.CreateVector({}))}}; - std::vector filter_data(InputChannels() * OutputChannels()); - std::vector bias_data(OutputChannels()); + if (FP16Weights()) { + operator_codes.emplace_back( + CreateOperatorCode(builder, BuiltinOperator_DEQUANTIZE)); - for (int32_t oc = 0; oc < OutputChannels(); oc++) { - // Use the same range of all-positive or all-negative values to generate - // all filter & bias weights within the same channel, but different ranges - // for different output channels. This ensures that no catastrophic - // cancellation occur, but test covers both positive and negative inputs. - const float range = range_rng(); - auto value_rng = - std::bind(std::uniform_real_distribution(std::min(range, 0.0f), - std::max(range, 0.0f)), - std::ref(rng)); + std::vector filter_data(InputChannels() * OutputChannels()); + std::vector bias_data(OutputChannels()); - bias_data[oc] = value_rng(); - for (int32_t ic = 0; ic < InputChannels(); ic++) { - filter_data[oc * InputChannels() + ic] = value_rng(); + for (int32_t oc = 0; oc < OutputChannels(); oc++) { + // Use the same range of all-positive or all-negative values to generate + // all filter & bias weights within the same channel, but different ranges + // for different output channels. This ensures that no catastrophic + // cancellation occur, but test covers both positive and negative inputs. 
+ const float range = range_rng(); + auto value_rng = + std::bind(fp16_ieee_from_fp32_value, + std::bind(std::uniform_real_distribution( + std::min(range, 0.0f), std::max(range, 0.0f)), + std::ref(rng))); + + bias_data[oc] = value_rng(); + for (int32_t ic = 0; ic < InputChannels(); ic++) { + filter_data[oc * InputChannels() + ic] = value_rng(); + } } - } - std::array, 3> buffers{{ - CreateBuffer(builder, builder.CreateVector({})), - CreateBuffer(builder, - builder.CreateVector( - reinterpret_cast(filter_data.data()), - sizeof(float) * filter_data.size())), - CreateBuffer(builder, - builder.CreateVector( - reinterpret_cast(bias_data.data()), - sizeof(float) * bias_data.size())), - }}; + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(filter_data.data()), + sizeof(uint16_t) * filter_data.size()))); + buffers.emplace_back(CreateBuffer( + builder, + builder.CreateVector(reinterpret_cast(bias_data.data()), + sizeof(uint16_t) * bias_data.size()))); + + const std::array dequantize_filter_inputs{{0}}; + const std::array dequantize_filter_outputs{{3}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_filter_inputs.data(), + dequantize_filter_inputs.size()), + builder.CreateVector(dequantize_filter_outputs.data(), + dequantize_filter_outputs.size()))); + const std::array dequantize_bias_inputs{{1}}; + const std::array dequantize_bias_outputs{{4}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_bias_inputs.data(), + dequantize_bias_inputs.size()), + builder.CreateVector(dequantize_bias_outputs.data(), + dequantize_bias_outputs.size()))); + } else { + std::vector filter_data(InputChannels() * OutputChannels()); + std::vector bias_data(OutputChannels()); + + for (int32_t oc = 0; oc < OutputChannels(); oc++) { + // Use the same range of all-positive or all-negative values to generate + // all filter & bias weights within the same channel, but different ranges + // for different output channels. This ensures that no catastrophic + // cancellation occur, but test covers both positive and negative inputs. 
+ const float range = range_rng(); + auto value_rng = + std::bind(std::uniform_real_distribution( + std::min(range, 0.0f), std::max(range, 0.0f)), + std::ref(rng)); + + bias_data[oc] = value_rng(); + for (int32_t ic = 0; ic < InputChannels(); ic++) { + filter_data[oc * InputChannels() + ic] = value_rng(); + } + } + + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(filter_data.data()), + sizeof(float) * filter_data.size()))); + buffers.emplace_back(CreateBuffer( + builder, + builder.CreateVector(reinterpret_cast(bias_data.data()), + sizeof(float) * bias_data.size()))); + } const std::array filter_shape( {OutputChannels(), InputChannels()}); const std::array bias_shape({OutputChannels()}); const std::vector output_shape = OutputShape(); - const std::array, 4> tensors{{ - CreateTensor(builder, - builder.CreateVector(InputShape().data(), - InputShape().size()), - TensorType_FLOAT32), - CreateTensor(builder, - builder.CreateVector(filter_shape.data(), - filter_shape.size()), - TensorType_FLOAT32, /*buffer=*/1), - CreateTensor( - builder, - builder.CreateVector(bias_shape.data(), bias_shape.size()), - TensorType_FLOAT32, /*buffer=*/2), - CreateTensor(builder, - builder.CreateVector(output_shape.data(), - output_shape.size()), - TensorType_FLOAT32), - }}; + std::vector> tensors; + if (FP16Weights()) { + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(filter_shape.data(), filter_shape.size()), + TensorType_FLOAT16, /*buffer=*/1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(bias_shape.data(), bias_shape.size()), + TensorType_FLOAT16, /*buffer=*/2)); + } + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(InputShape().data(), InputShape().size()), + TensorType_FLOAT32)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(filter_shape.data(), filter_shape.size()), + TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 0 : 1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(bias_shape.data(), bias_shape.size()), + TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 
0 : 2)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(output_shape.data(), output_shape.size()), + TensorType_FLOAT32)); flatbuffers::Offset fully_connected_options = CreateFullyConnectedOptions(builder, Activation(), FullyConnectedOptionsWeightsFormat_DEFAULT, KeepDims()); - const std::array op_inputs{{0, 1, 2}}; - const std::array op_outputs{{3}}; - flatbuffers::Offset op = CreateOperator( + const std::array op_inputs{ + {static_cast(tensors.size()) - 4, + static_cast(tensors.size()) - 3, + static_cast(tensors.size()) - 2}}; + const std::array op_outputs{ + {static_cast(tensors.size()) - 1}}; + operators.emplace_back(CreateOperator( builder, /*opcode_index=*/0, builder.CreateVector(op_inputs.data(), op_inputs.size()), builder.CreateVector(op_outputs.data(), op_outputs.size()), - BuiltinOptions_FullyConnectedOptions, fully_connected_options.Union()); + BuiltinOptions_FullyConnectedOptions, fully_connected_options.Union())); - const std::array subgraph_inputs{{0}}; - const std::array subgraph_outputs{{3}}; + const std::array subgraph_inputs{ + {static_cast(tensors.size()) - 4}}; + const std::array subgraph_outputs{ + {static_cast(tensors.size()) - 1}}; flatbuffers::Offset subgraph = CreateSubGraph( builder, builder.CreateVector(tensors.data(), tensors.size()), builder.CreateVector(subgraph_inputs.data(), subgraph_inputs.size()), builder.CreateVector(subgraph_outputs.data(), subgraph_outputs.size()), - builder.CreateVector(&op, 1)); + builder.CreateVector(operators.data(), operators.size())); flatbuffers::Offset description = builder.CreateString("Fully Connected model"); flatbuffers::Offset model_buffer = CreateModel( - builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&operator_code, 1), + builder, TFLITE_SCHEMA_VERSION, + builder.CreateVector(operator_codes.data(), operator_codes.size()), builder.CreateVector(&subgraph, 1), description, builder.CreateVector(buffers.data(), buffers.size())); diff --git a/tensorflow/lite/delegates/xnnpack/fully_connected_tester.h b/tensorflow/lite/delegates/xnnpack/fully_connected_tester.h index cf1d5513d46..6350bc8d739 100644 --- a/tensorflow/lite/delegates/xnnpack/fully_connected_tester.h +++ b/tensorflow/lite/delegates/xnnpack/fully_connected_tester.h @@ -71,6 +71,13 @@ class FullyConnectedTester { inline bool KeepDims() const { return keep_dims_; } + inline FullyConnectedTester& FP16Weights() { + fp16_weights_ = true; + return *this; + } + + inline bool FP16Weights() const { return fp16_weights_; } + inline FullyConnectedTester& ReluActivation() { activation_ = ::tflite::ActivationFunctionType_RELU; return *this; @@ -102,6 +109,7 @@ class FullyConnectedTester { int32_t input_channels_ = 1; int32_t output_channels_ = 1; bool keep_dims_ = false; + bool fp16_weights_ = false; ::tflite::ActivationFunctionType activation_ = ::tflite::ActivationFunctionType_NONE; }; diff --git a/tensorflow/lite/delegates/xnnpack/mul_test.cc b/tensorflow/lite/delegates/xnnpack/mul_test.cc index 6c0475e2b64..2dbb2663b80 100644 --- a/tensorflow/lite/delegates/xnnpack/mul_test.cc +++ b/tensorflow/lite/delegates/xnnpack/mul_test.cc @@ -679,6 +679,35 @@ TEST(Mul, 2DByStatic0D) { .Test(BuiltinOperator_MUL, xnnpack_delegate.get()); } +TEST(Mul, FP16Weights) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto shape_rng = + std::bind(std::uniform_int_distribution(2, 5), std::ref(rng)); + const auto batch = 
shape_rng(); + const auto height = shape_rng(); + const auto width = shape_rng(); + const auto channels = shape_rng(); + + BinaryElementwiseTester() + .Input1Shape({batch, height, width, channels}) + .Input2Shape({batch, height, width, channels}) + .Input1Static(true) + .FP16Weights() + .Test(BuiltinOperator_MUL, xnnpack_delegate.get()); + + BinaryElementwiseTester() + .Input1Shape({batch, height, width, channels}) + .Input2Shape({batch, height, width, channels}) + .Input2Static(true) + .FP16Weights() + .Test(BuiltinOperator_MUL, xnnpack_delegate.get()); +} + TEST(Mul, ReluActivation) { std::unique_ptr xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index 2beaa16255d..32fcbee4c22 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -22,10 +22,12 @@ limitations under the License. #include #include #include +#include #include #include #include +#include #include #include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/builtin_op_data.h" @@ -39,6 +41,8 @@ namespace { TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate); class Delegate { + friend class Subgraph; + public: explicit Delegate(const TfLiteXNNPackDelegateOptions* options) { #if !defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__) @@ -49,9 +53,10 @@ class Delegate { #endif } + TfLiteIntArray* PrepareOpsToDelegate(TfLiteContext* context); TfLiteDelegate* tflite_delegate() { return &delegate_; } - pthreadpool_t threadpool() { + pthreadpool_t threadpool() const { #if defined(__EMSCRIPTEN__) && !defined(__EMSCRIPTEN_PTHREADS__) return nullptr; #else @@ -69,6 +74,17 @@ class Delegate { kTfLiteDelegateFlagsNone, // .flags }; + // Unpacked data for quasi-static tensors, i.e. tensors produced by + // dequantizing or unpacking static buffers. + std::vector static_unpacked_data_; + // Mapping from a tensor index for a quasi-static tensor to the offset to + // its unpacked data within static_unpacked_data_. + std::unordered_map static_unpacked_data_map_; + // Set of indices of nodes which unpack static data, e.g. Dequantize + // operators which convert FP16 static weights to FP32. These nodes are simply + // ignored in the delegate implementation, because their outputs are + // pre-unpacked in DelegatePrepare. + std::unordered_set static_unpack_nodes_; #if !defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__) // Thread pool with smart-pointer for lifetime management. std::unique_ptr threadpool_{ @@ -80,7 +96,7 @@ class Subgraph { public: static Subgraph* Create(TfLiteContext* context, const TfLiteDelegateParams* params, - pthreadpool_t threadpool) { + const Delegate* delegate) { // Convert subgraph inputs and outputs to hash sets for faster lookup. const std::unordered_set inputs( ¶ms->input_tensors->data[0], @@ -113,11 +129,17 @@ class Subgraph { // filtered out and removed later. std::vector tensors(context->tensors_size, -1); for (int i = 0; i < params->nodes_to_replace->size; i++) { + const int node_index = params->nodes_to_replace->data[i]; + if (delegate->static_unpack_nodes_.count(node_index)) { + // The node unpacks static input and can be skipped because its input + // was pre-unpacked in DelegatePrepare. 
+ continue; + } + TfLiteNode* node = nullptr; TfLiteRegistration* registration = nullptr; - if (context->GetNodeAndRegistration(context, - params->nodes_to_replace->data[i], - &node, ®istration) != kTfLiteOk) { + if (context->GetNodeAndRegistration(context, node_index, &node, + ®istration) != kTfLiteOk) { return nullptr; } @@ -164,6 +186,12 @@ class Subgraph { const void* data = nullptr; if (context->tensors[t].allocation_type == kTfLiteMmapRo) { data = context->tensors[t].data.raw_const; + } else { + // Check for quasi-static data. + const auto it = delegate->static_unpacked_data_map_.find(t); + if (it != delegate->static_unpacked_data_map_.end()) { + data = delegate->static_unpacked_data_.data() + it->second; + } } if (inputs.count(t) != 0) { flags |= XNN_VALUE_FLAG_EXTERNAL_INPUT; @@ -189,25 +217,38 @@ class Subgraph { } } + // Create a set of quasi-static tensors for VisitNode function + std::unordered_set quasi_static_tensors; + for (const std::pair& entry : + delegate->static_unpacked_data_map_) { + quasi_static_tensors.insert(entry.first); + } + // Create XNNPACK nodes for TFLite delegate nodes for (int i = 0; i < params->nodes_to_replace->size; i++) { + const int node_index = params->nodes_to_replace->data[i]; + if (delegate->static_unpack_nodes_.count(node_index)) { + // The node unpacks static input and can be skipped because its input + // was pre-unpacked in DelegatePrepare. + continue; + } + TfLiteNode* node = nullptr; TfLiteRegistration* registration = nullptr; - if (context->GetNodeAndRegistration(context, - params->nodes_to_replace->data[i], - &node, ®istration) != kTfLiteOk) { + if (context->GetNodeAndRegistration(context, node_index, &node, + ®istration) != kTfLiteOk) { return nullptr; } - if (VisitNode(subgraph.get(), context, registration, node, i, - xnnpack_tensors) != kTfLiteOk) { + if (VisitNode(subgraph.get(), context, registration, node, node_index, + quasi_static_tensors, xnnpack_tensors) != kTfLiteOk) { return nullptr; } } xnn_runtime_t runtime_ptr = nullptr; - status = xnn_create_runtime_v2(subgraph.get(), threadpool, /*flags=*/0, - &runtime_ptr); + status = xnn_create_runtime_v2(subgraph.get(), delegate->threadpool(), + /*flags=*/0, &runtime_ptr); if (status != xnn_status_success) { TF_LITE_KERNEL_LOG(context, "failed to create XNNPACK runtime"); return nullptr; @@ -707,10 +748,11 @@ class Subgraph { return kTfLiteOk; } - static TfLiteStatus VisitNode(xnn_subgraph_t subgraph, TfLiteContext* context, - TfLiteRegistration* registration, - TfLiteNode* node, int node_index, - const std::vector& xnnpack_tensors) { + static TfLiteStatus VisitNode( + xnn_subgraph_t subgraph, TfLiteContext* context, + TfLiteRegistration* registration, TfLiteNode* node, int node_index, + const std::unordered_set& quasi_static_tensors, + const std::vector& xnnpack_tensors) { // TFLite context used for logging purposes. When we create a new node // (subgraph is non-null), logging context is the same as context, and error // messages are passed to TFLite. 
When we detect supported operations @@ -738,7 +780,8 @@ class Subgraph { static_cast(node->builtin_data); return VisitConv2DNode(subgraph, logging_context, node_index, node, - context->tensors, conv_params, xnnpack_tensors); + context->tensors, conv_params, + quasi_static_tensors, xnnpack_tensors); } case kTfLiteBuiltinDepthwiseConv2d: { const TfLiteDepthwiseConvParams* dwconv_params = @@ -746,7 +789,7 @@ class Subgraph { return VisitDepthwiseConv2DNode(subgraph, logging_context, node_index, node, context->tensors, dwconv_params, - xnnpack_tensors); + quasi_static_tensors, xnnpack_tensors); } case kTfLiteBuiltinFullyConnected: { const TfLiteFullyConnectedParams* fc_params = @@ -754,7 +797,7 @@ class Subgraph { return VisitFullyConnectedNode(subgraph, logging_context, node_index, node, context->tensors, fc_params, - xnnpack_tensors); + quasi_static_tensors, xnnpack_tensors); } case kTfLiteBuiltinHardSwish: return VisitHardSwishNode(subgraph, logging_context, node_index, node, @@ -782,7 +825,8 @@ class Subgraph { context->tensors, xnnpack_tensors); case kTfLiteBuiltinPrelu: return VisitPreluNode(subgraph, logging_context, node_index, node, - context->tensors, xnnpack_tensors); + context->tensors, quasi_static_tensors, + xnnpack_tensors); case kTfLiteBuiltinRelu: return VisitReluNode( subgraph, logging_context, node_index, node, context->tensors, 0.0f, @@ -810,7 +854,7 @@ class Subgraph { return VisitMediaPipeDeconvolutionNode( subgraph, context, node_index, node, context->tensors, - &deconv_params, xnnpack_tensors); + &deconv_params, quasi_static_tensors, xnnpack_tensors); } else if (strcmp(registration->custom_name, "MaxPoolingWithArgmax2D") == 0) { TfLitePoolParams pool_params = {kTfLitePaddingUnknown}; @@ -948,6 +992,7 @@ class Subgraph { xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index, TfLiteNode* node, const TfLiteTensor* tensors, const TfLiteConvParams* conv_params, + const std::unordered_set& quasi_static_tensors, const std::vector& xnnpack_tensors) { TF_LITE_ENSURE_STATUS( CheckConvolutionParams(logging_context, conv_params, node_index)); @@ -968,16 +1013,20 @@ class Subgraph { logging_context, filter_tensor, node->inputs->data[1], node_index)); TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, filter_tensor, 4, node->inputs->data[1])); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, filter_tensor, node->inputs->data[1], node_index)); + if (quasi_static_tensors.count(node->inputs->data[1]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, filter_tensor, node->inputs->data[1], node_index)); + } const TfLiteTensor& bias_tensor = tensors[node->inputs->data[2]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( logging_context, filter_tensor, node->inputs->data[2], node_index)); TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, bias_tensor, 1, node->inputs->data[2])); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, bias_tensor, node->inputs->data[2], node_index)); + if (quasi_static_tensors.count(node->inputs->data[2]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, bias_tensor, node->inputs->data[2], node_index)); + } const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( @@ -1034,6 +1083,7 @@ class Subgraph { xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index, TfLiteNode* node, const TfLiteTensor* tensors, const TfLiteDepthwiseConvParams* dwconv_params, + const 
std::unordered_set& quasi_static_tensors, const std::vector& xnnpack_tensors) { TF_LITE_ENSURE_STATUS( CheckNumInputsAndOutputs(logging_context, node, 3, 1, node_index)); @@ -1051,16 +1101,20 @@ class Subgraph { logging_context, filter_tensor, node->inputs->data[1], node_index)); TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, filter_tensor, 4, node->inputs->data[1])); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, filter_tensor, node->inputs->data[1], node_index)); + if (quasi_static_tensors.count(node->inputs->data[1]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, filter_tensor, node->inputs->data[1], node_index)); + } const TfLiteTensor& bias_tensor = tensors[node->inputs->data[2]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( logging_context, filter_tensor, node->inputs->data[2], node_index)); TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, bias_tensor, 1, node->inputs->data[2])); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, bias_tensor, node->inputs->data[2], node_index)); + if (quasi_static_tensors.count(node->inputs->data[2]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, bias_tensor, node->inputs->data[2], node_index)); + } const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( @@ -1123,6 +1177,7 @@ class Subgraph { xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index, TfLiteNode* node, const TfLiteTensor* tensors, const TfLiteFullyConnectedParams* fc_params, + const std::unordered_set& quasi_static_tensors, const std::vector& xnnpack_tensors) { TF_LITE_ENSURE_STATUS( CheckFullyConnectedParams(logging_context, fc_params, node_index)); @@ -1141,16 +1196,20 @@ class Subgraph { logging_context, filter_tensor, node->inputs->data[1], node_index)); TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, filter_tensor, 2, node->inputs->data[1])); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, filter_tensor, node->inputs->data[1], node_index)); + if (quasi_static_tensors.count(node->inputs->data[1]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, filter_tensor, node->inputs->data[1], node_index)); + } const TfLiteTensor& bias_tensor = tensors[node->inputs->data[2]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( logging_context, filter_tensor, node->inputs->data[2], node_index)); TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, bias_tensor, 1, node->inputs->data[2])); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, bias_tensor, node->inputs->data[2], node_index)); + if (quasi_static_tensors.count(node->inputs->data[2]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, bias_tensor, node->inputs->data[2], node_index)); + } const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( @@ -1387,6 +1446,7 @@ class Subgraph { xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index, TfLiteNode* node, const TfLiteTensor* tensors, const TfLiteTransposeConvParams* deconv_params, + const std::unordered_set& quasi_static_tensors, const std::vector& xnnpack_tensors) { TF_LITE_ENSURE_STATUS( CheckNumInputsAndOutputs(logging_context, node, 3, 1, node_index)); @@ -1404,16 +1464,20 @@ class Subgraph { logging_context, filter_tensor, node->inputs->data[1], node_index)); 
TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, filter_tensor, 4, node->inputs->data[1])); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, filter_tensor, node->inputs->data[1], node_index)); + if (quasi_static_tensors.count(node->inputs->data[1]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, filter_tensor, node->inputs->data[1], node_index)); + } const TfLiteTensor& bias_tensor = tensors[node->inputs->data[2]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( logging_context, filter_tensor, node->inputs->data[2], node_index)); TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, bias_tensor, 1, node->inputs->data[2])); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, bias_tensor, node->inputs->data[2], node_index)); + if (quasi_static_tensors.count(node->inputs->data[2]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, bias_tensor, node->inputs->data[2], node_index)); + } const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( @@ -1735,6 +1799,7 @@ class Subgraph { static TfLiteStatus VisitPreluNode( xnn_subgraph_t subgraph, TfLiteContext* logging_context, int node_index, TfLiteNode* node, const TfLiteTensor* tensors, + const std::unordered_set& quasi_static_tensors, const std::vector& xnnpack_tensors) { TF_LITE_ENSURE_STATUS( CheckNumInputsAndOutputs(logging_context, node, 2, 1, node_index)); @@ -1752,8 +1817,10 @@ class Subgraph { logging_context, slope_tensor, node->inputs->data[1], node_index)); TF_LITE_ENSURE_STATUS(CheckSlopeTensorShape( logging_context, slope_tensor, node->inputs->data[1], node_index)); - TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( - logging_context, slope_tensor, node->inputs->data[1], node_index)); + if (quasi_static_tensors.count(node->inputs->data[1]) == 0) { + TF_LITE_ENSURE_STATUS(CheckTensorStaticAllocation( + logging_context, slope_tensor, node->inputs->data[1], node_index)); + } const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]]; TF_LITE_ENSURE_STATUS(CheckTensorFloatType( @@ -1869,15 +1936,29 @@ class Subgraph { bool first_run_{true}; }; -TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { +TfLiteIntArray* Delegate::PrepareOpsToDelegate(TfLiteContext* context) { + // Clear previous data, in case the delegate is reused without re-creation. + static_unpacked_data_map_.clear(); + static_unpacked_data_.clear(); + static_unpack_nodes_.clear(); + TfLiteIntArray* execution_plan = nullptr; if (context->GetExecutionPlan(context, &execution_plan) != kTfLiteOk) { TF_LITE_KERNEL_LOG(context, "Unable to get graph execution plan."); return nullptr; } - TfLiteIntArray* nodes_to_replace = TfLiteIntArrayCreate(execution_plan->size); - nodes_to_replace->size = 0; + // Mapping for quasi-static (unpacked from static) tensor index to the node + // index that produced it. + std::unordered_map quasi_static_tensors_producers; + // Set of all quasi-static tensors in the execution plan. + std::unordered_set quasi_static_tensors; + // Set of quasi-static tensors consumed by the delegated nodes. + std::unordered_set quasi_static_tensors_to_unpack; + + TfLiteIntArray* nodes_to_delegate = + TfLiteIntArrayCreate(execution_plan->size); + nodes_to_delegate->size = 0; for (int i = 0; i < execution_plan->size; ++i) { const int node_index = execution_plan->data[i]; @@ -1892,15 +1973,142 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { continue; // Soft error (skip this node). 
} + if (registration->builtin_code == kTfLiteBuiltinDequantize && + node->inputs->size == 1 && node->outputs->size == 1) { + const TfLiteTensor& input_tensor = + context->tensors[node->inputs->data[0]]; + const TfLiteTensor& output_tensor = + context->tensors[node->outputs->data[0]]; + if (input_tensor.allocation_type == kTfLiteMmapRo && + input_tensor.type == kTfLiteFloat16 && + output_tensor.type == kTfLiteFloat32) { + static_unpack_nodes_.insert(i); + quasi_static_tensors_producers[node->outputs->data[0]] = i; + quasi_static_tensors.insert(node->outputs->data[0]); + + // Skip this node for now. If output of the node is consumed only by + // delegated nodes, it will be added to nodes_to_delegate in the end. + continue; + } + } + if (Subgraph::VisitNode(/*subgraph=*/nullptr, context, registration, node, - node_index, std::vector()) != kTfLiteOk) { + node_index, quasi_static_tensors, + std::vector()) != kTfLiteOk) { + // If a non-delegated node consumes output of a node that unpacks static + // data, that node shouldn't be delegated. + for (int j = 0; j < node->inputs->size; j++) { + const auto it = + quasi_static_tensors_producers.find(node->inputs->data[j]); + if (it != quasi_static_tensors_producers.end()) { + static_unpack_nodes_.erase(it->second); + } + } + // Non-delegatable node is not an error. continue; } - nodes_to_replace->data[nodes_to_replace->size++] = node_index; + for (int j = 0; j < node->inputs->size; j++) { + if (quasi_static_tensors.count(node->inputs->data[j]) != 0) { + quasi_static_tensors_to_unpack.insert(node->inputs->data[j]); + } + } + + nodes_to_delegate->data[nodes_to_delegate->size++] = node_index; } + // Unpack static data of all tensors + for (int t : quasi_static_tensors_to_unpack) { + const int producer_index = quasi_static_tensors_producers[t]; + // Check if TFLite nodes can be delegated to XNNPACK + TfLiteNode* node = nullptr; + TfLiteRegistration* registration = nullptr; + if (context->GetNodeAndRegistration(context, producer_index, &node, + ®istration) != kTfLiteOk) { + TF_LITE_KERNEL_LOG(context, + "Unable to get node and registration for node %d.", + producer_index); + TfLiteIntArrayFree(nodes_to_delegate); + return nullptr; // Hard error. + } + + if (node->inputs->size != 1) { + TF_LITE_KERNEL_LOG(context, "unexpected number of inputs (%d) in node %d", + node->inputs->size, producer_index); + TfLiteIntArrayFree(nodes_to_delegate); + return nullptr; // Hard error. + } + + if (node->outputs->size != 1) { + TF_LITE_KERNEL_LOG(context, + "unexpected number of outputs (%d) in node %d", + node->outputs->size, producer_index); + TfLiteIntArrayFree(nodes_to_delegate); + return nullptr; // Hard error. + } + + const TfLiteTensor& input_tensor = context->tensors[node->inputs->data[0]]; + if (input_tensor.allocation_type != kTfLiteMmapRo) { + TF_LITE_KERNEL_LOG(context, + "unexpected allocation type in tensor %d in node %d", + node->inputs->data[0], producer_index); + TfLiteIntArrayFree(nodes_to_delegate); + return nullptr; // Hard error. + } + + const TfLiteTensor& output_tensor = context->tensors[t]; + if (output_tensor.type != kTfLiteFloat32) { + TF_LITE_KERNEL_LOG(context, + "unexpected datatype (%s) in tensor %d in node %d", + TfLiteTypeGetName(output_tensor.type), + node->outputs->data[0], producer_index); + TfLiteIntArrayFree(nodes_to_delegate); + return nullptr; // Hard error. 
+ } + const size_t tensor_elements = output_tensor.bytes / sizeof(float); + + // Align to XNN_EXTRA_BYTES bytes + while (static_unpacked_data_.size() % XNN_EXTRA_BYTES != 0) { + static_unpacked_data_.push_back(0); + } + const size_t tensor_offset = static_unpacked_data_.size(); + static_unpacked_data_.resize(tensor_offset + context->tensors[t].bytes); + + float* unpacked_data = + reinterpret_cast(static_unpacked_data_.data() + tensor_offset); + switch (input_tensor.type) { + case kTfLiteFloat16: { + const uint16_t* packed_data = + static_cast(input_tensor.data.data); + for (size_t i = 0; i < tensor_elements; i++) { + unpacked_data[i] = fp16_ieee_to_fp32_value(packed_data[i]); + } + break; + } + default: + TF_LITE_KERNEL_LOG(context, + "unexpected datatype (%s) in tensor %d in node %d", + TfLiteTypeGetName(output_tensor.type), + node->outputs->data[0], producer_index); + TfLiteIntArrayFree(nodes_to_delegate); + return nullptr; // Hard error. + } + + static_unpacked_data_map_[t] = tensor_offset; + } + + // Add nodes that unpack static data consumed by delegated nodes. + // Note: this is done purely to avoid the overhead of running these nodes + // again in TFLite interpreter which would allocate memory for their outputs. + // We mark them as delegated, but the delegate would simply ignore these nodes + // as the static weights are already unpacked. + for (int node_index : static_unpack_nodes_) { + nodes_to_delegate->data[nodes_to_delegate->size++] = node_index; + } + std::sort(&nodes_to_delegate->data[0], + &nodes_to_delegate->data[nodes_to_delegate->size]); + #ifdef XNNPACK_DELEGATE_TEST_MODE // In the test mode build (used by unit tests), XNNPACK delegate claims to // support all operators in the execution plan to disable fallback to the @@ -1908,24 +2116,22 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { // not supported by the delegate, they will cause a failure in // ::tflite::Interpreter::ModifyGraphWithDelegate, to be caught in the unit // tests. 
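// For illustration, a minimal standalone sketch of the binary16 -> binary32
// expansion that fp16_ieee_to_fp32_value() performs in the unpacking loop
// above. It only assumes IEEE 754 half-precision semantics; it is not the
// fp16 library's implementation, and HalfToFloatSketch is a made-up name
// used solely for this example.
#include <cstdint>
#include <cstring>

static float HalfToFloatSketch(uint16_t h) {
  const uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  const uint32_t exp = (h >> 10) & 0x1Fu;
  const uint32_t frac = h & 0x3FFu;
  uint32_t bits;
  if (exp == 0) {
    if (frac == 0) {
      bits = sign;  // signed zero
    } else {
      // Subnormal half: shift until the implicit leading one appears,
      // lowering the float exponent by the number of shifts.
      uint32_t m = frac;
      int shift = 0;
      while ((m & 0x400u) == 0) {
        m <<= 1;
        ++shift;
      }
      bits = sign | (static_cast<uint32_t>(113 - shift) << 23) |
             ((m & 0x3FFu) << 13);
    }
  } else if (exp == 0x1Fu) {
    bits = sign | 0x7F800000u | (frac << 13);  // infinity or NaN
  } else {
    bits = sign | ((exp + 112u) << 23) | (frac << 13);  // normal number
  }
  float out;
  std::memcpy(&out, &bits, sizeof(out));  // bit-exact reinterpretation
  return out;
}
// For example, HalfToFloatSketch(0x3C00) == 1.0f and
// HalfToFloatSketch(0xC000) == -2.0f.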
- nodes_to_replace->size = execution_plan->size; + nodes_to_delegate->size = execution_plan->size; std::copy(&execution_plan->data[0], &execution_plan->data[execution_plan->size], - &nodes_to_replace->data[0]); + &nodes_to_delegate->data[0]); #endif - return nodes_to_replace; + return nodes_to_delegate; } void* SubgraphInit(TfLiteContext* context, const char* buffer, size_t length) { const TfLiteDelegateParams* params = reinterpret_cast(buffer); - pthreadpool_t threadpool = - static_cast<::tflite::xnnpack::Delegate*>(params->delegate->data_) - ->threadpool(); - - return static_cast(Subgraph::Create(context, params, threadpool)); + return static_cast(Subgraph::Create( + context, params, + static_cast<::tflite::xnnpack::Delegate*>(params->delegate->data_))); } TfLiteStatus SubgraphPrepare(TfLiteContext* context, TfLiteNode* node) { @@ -1962,7 +2168,9 @@ const TfLiteRegistration kSubgraphRegistration = { }; TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { - TfLiteIntArray* ops_to_replace = GetOpsToReplace(context); + TfLiteIntArray* ops_to_replace = + static_cast<::tflite::xnnpack::Delegate*>(delegate->data_) + ->PrepareOpsToDelegate(context); const TfLiteStatus status = context->ReplaceNodeSubsetsWithDelegateKernels( context, kSubgraphRegistration, ops_to_replace, delegate); TfLiteIntArrayFree(ops_to_replace); From 102bf84e2612a8183fbd2da3d1937c8a11039e70 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Wed, 27 May 2020 18:51:08 -0700 Subject: [PATCH 1269/1533] Reland "Disable multi-threaded Conv optimizations w/ non-const filters" The non-ruy, multi-threaded conv implementation performs a filter repack that is cached. This is only correct if the filter itself is constant. Disable this path if the filter is non-const. Fixes #31205. PiperOrigin-RevId: 313505801 Change-Id: Ia1e3aaa32770b9628f04dd823d24781d028f2ba1 --- tensorflow/lite/kernels/conv.cc | 7 +- tensorflow/lite/kernels/conv_test.cc | 103 +++++++++++++++++++++++++-- 2 files changed, 101 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc index 403adc725eb..21ee5f806a8 100644 --- a/tensorflow/lite/kernels/conv.cc +++ b/tensorflow/lite/kernels/conv.cc @@ -370,12 +370,15 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, } } - // The multi-threaded kernel supports neither dilation nor hybrid kernels. + // The multi-threaded kernel supports neither dilation nor hybrid kernels, and + // is incompatible with mutable input filters that might change between evals. data->supports_multithreaded_kernel = (kernel_type == kMultithreadOptimized) && (context->recommended_num_threads != 1) && !is_hybrid && (params->dilation_width_factor == 1) && - (params->dilation_height_factor == 1); + (params->dilation_height_factor == 1) && + (filter->allocation_type != kTfLiteArenaRw) && + !IsDynamicTensor(filter); TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired( context, node, is_hybrid, data->is_hybrid_per_channel, kernel_type)); diff --git a/tensorflow/lite/kernels/conv_test.cc b/tensorflow/lite/kernels/conv_test.cc index 8569809df75..a2201835195 100644 --- a/tensorflow/lite/kernels/conv_test.cc +++ b/tensorflow/lite/kernels/conv_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #include +#include #include #include "absl/memory/memory.h" @@ -39,6 +40,7 @@ namespace { using ::testing::ElementsAreArray; +template class BaseConvolutionOpModel : public SingleOpModel { public: BaseConvolutionOpModel( @@ -47,9 +49,15 @@ class BaseConvolutionOpModel : public SingleOpModel { int stride_height = 2, enum Padding padding = Padding_VALID, enum ActivationFunctionType activation = ActivationFunctionType_NONE, int dilation_width_factor = 1, int dilation_height_factor = 1, - int num_threads = -1) { + int num_threads = -1, + std::initializer_list filter_data = {}) { input_ = AddInput(input); - filter_ = AddInput(filter); + + if (filter_data.size()) { + filter_ = AddConstInput(filter, filter_data); + } else { + filter_ = AddInput(filter); + } int bias_size = GetShape(filter_)[0]; if (input.type == TensorType_FLOAT32) { @@ -115,7 +123,7 @@ class BaseConvolutionOpModel : public SingleOpModel { int output_; }; -class ConvolutionOpModel : public BaseConvolutionOpModel { +class ConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -553,6 +561,85 @@ TEST_P(ConvolutionOpTest, HandCalculatedFloat32) { 234, 261, 121})); } } + + // Change the filter to ensure non-const filter behavior is correct. + m.SetFilter({2, 4, 7, 2, 5, 8, 3, 6, 9}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 313, 359, + 181, 187, 239, 267, 128})); +} + +// TODO(b/157263074): Ideally using a const filter would be a parameterization +// of the test, so we ensure full test coverage with all the different +// types and backends. +TEST_P(ConvolutionOpTest, HandCalculatedFloat32WithConstFilter) { + const int depth = 1; + const int image_width = 4; + const int image_height = 3; + const int image_batch_count = 1; + const int filter_size = 3; + const int filter_count = 1; + const int stride_width = 1; + const int stride_height = 1; + const Padding padding = Padding_SAME; + // The filter matrix is: + // | 1 | 4 | 7 | + // | 2 | 5 | 8 | + // | 3 | 6 | 9 | + const std::initializer_list filter_data = {1, 4, 7, 2, 5, 8, 3, 6, 9}; + ConvolutionOpModel m( + GetRegistration(), + {TensorType_FLOAT32, + {image_batch_count, image_height, image_width, depth}}, + {TensorType_FLOAT32, {depth, filter_size, filter_size, filter_count}}, + {TensorType_FLOAT32, {}}, stride_width, stride_height, padding, + ActivationFunctionType_NONE, + /*dilation_width_factor=*/1, + /*dilation_height_factor=*/1, + /*num_threads=*/-1, filter_data); + + // The image matrix is: + // | 1 | 2 | 3 | 4 | + // | 5 | 6 | 7 | 8 | + // | 9 | 10 | 11 | 12 | + m.SetInput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); + // No bias for this test. + m.SetBias({0}); + + m.Invoke(); + // We're sliding the 3x3 filter across the 3x4 image, with accesses outside + // the input set to zero because we're using the 'SAME' padding mode. 
+ // The calculations behind the expected output are: + // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105 + // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150 + // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183 + // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95 + // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235 + // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312 + // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357 + // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178 + // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187 + // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234 + // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261 + // (1*7)+(4*11)+(7*0)+(2*8)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121 + // This means we should end up with this matrix: + // | 105 | 150 | 183 | 95 | + // | 235 | 312 | 357 | 178 | + // | 187 | 234 | 261 | 121 | + EXPECT_THAT(m.GetOutput(), ElementsAreArray({105, 150, 183, 95, 235, 312, 357, + 178, 187, 234, 261, 121})); + + // Add an additional test for the multi-threaded case, ensuring stability + // under different thread counts. + if (GetParam() == "MultithreadedOptimized") { + for (int i = 1; i < 4; ++i) { + m.SetNumThreads(i); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({105, 150, 183, 95, 235, 312, 357, 178, 187, + 234, 261, 121})); + } + } } TEST_P(ConvolutionOpTest, HandCalculatedWithBiasFloat32) { @@ -766,7 +853,7 @@ TEST_P(ConvolutionOpTest, SimpleTestFloatWithDilation) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); } -class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { +class QuantizedConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -986,7 +1073,7 @@ TEST_P(ConvolutionOpTest, SimpleTestQuantizedWithDilation) { ElementsAreArray({5, 5, 5, 5, 5, 5, 5, 5, 5})); } -class HybridConvolutionOpModel : public BaseConvolutionOpModel { +class HybridConvolutionOpModel : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -1325,7 +1412,8 @@ TEST_P(ConvolutionOpTest, DISABLED_PointwiseMultifilterHybrid) { 0.0474))); } -class PerChannelQuantizedConvolutionOpModel : public BaseConvolutionOpModel { +class PerChannelQuantizedConvolutionOpModel + : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; @@ -1442,7 +1530,8 @@ TEST_P(ConvolutionOpTest, SimplePerChannelTest) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({61, 127, -115, -93})); } -class HybridPerChannelConvolutionOpModel : public BaseConvolutionOpModel { +class HybridPerChannelConvolutionOpModel + : public BaseConvolutionOpModel { public: using BaseConvolutionOpModel::BaseConvolutionOpModel; From ea97139d4d00fd71e0fcb52504ed7cca3c445555 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 27 May 2020 18:57:52 -0700 Subject: [PATCH 1270/1533] Read from a sharded checkpoint in parallel with multiple threads. 
PiperOrigin-RevId: 313506462 Change-Id: I1cef16cdaa9e03fd3161727614007429221089e0 --- .../core/util/tensor_bundle/tensor_bundle.cc | 148 +++++++----------- .../core/util/tensor_bundle/tensor_bundle.h | 3 - 2 files changed, 59 insertions(+), 92 deletions(-) diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc index ad9ee2a7c0f..e1234d330fc 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/core/lib/bfloat16/bfloat16.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/hash/crc32c.h" #include "tensorflow/core/lib/io/path.h" @@ -42,8 +41,6 @@ limitations under the License. #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/blocking_counter.h" -#include "tensorflow/core/platform/status.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/saved_tensor_slice_util.h" #include "tensorflow/core/util/tensor_bundle/byte_swap.h" @@ -1024,106 +1021,79 @@ Status BundleReader::GetSliceValue(StringPiece full_tensor_key, " to restore in slice_spec: ", slice_spec.DebugString()); } - BlockingCounter counter(static_cast(details.size())); - auto runner = [this, &details](std::function fn) { - if (details.size() > 1) { - // If there are multiple slices to read, perform the read in parallel - // using multiple threads. - env_->SchedClosure(fn); - } else { - fn(); - } - }; - + // The union of the slices in "details" covers "slice_spec". Performs the + // copies from each. + BundleEntryProto stored_slice_entry = full_tensor_entry; for (const auto& slice_tag_pair : details) { - runner([this, &slice_spec, &full_shape, &slice_tag_pair, &full_tensor_entry, - &full_tensor_key_string, &counter, val]() { - // The union of the slices in "details" covers "slice_spec". Performs the - // copies from each. - BundleEntryProto stored_slice_entry = full_tensor_entry; - // Seeks for the stored slice. - const TensorSlice& stored_slice = slice_tag_pair.first; + // Seeks for the stored slice. + const TensorSlice& stored_slice = slice_tag_pair.first; - // We already have the entry for the full tensor, so don't query again if - // the slice is full. - if (!stored_slice.IsFull()) { - const string encoded_stored_slice_name = - checkpoint::EncodeTensorNameSlice(full_tensor_key_string, - stored_slice); - mutex_lock l(mu_); - // `GetBundleEntryProto` will access `iter_`, so protecting it with a - // mutex lock. - status_ = - GetBundleEntryProto(encoded_stored_slice_name, &stored_slice_entry); - if (!status_.ok()) return; - } + // We already have the entry for the full tensor, so don't query again if + // the slice is full. + if (!stored_slice.IsFull()) { + const string encoded_stored_slice_name = + checkpoint::EncodeTensorNameSlice(full_tensor_key_string, + stored_slice); + status_ = + GetBundleEntryProto(encoded_stored_slice_name, &stored_slice_entry); + if (!status_.ok()) return status_; + } - auto cleanup = gtl::MakeCleanup([&counter] { counter.DecrementCount(); }); + // TODO(zongheng): should we take an OpKernelContext, so that we can call + // allocate_temp()? 
Note that without major refactorings to Saver, it's + // hard for the caller of the tensor bundle module to allocate these + // precisely-shaped scratch storage. - // TODO(zongheng): should we take an OpKernelContext, so that we can - // call allocate_temp()? Note that without major refactorings to - // Saver, it's hard for the caller of the tensor bundle module to - // allocate these precisely-shaped scratch storage. + // Optimization for the common case: the stored slice can be directly + // copied to the destination without additional slicing. This is true when + // either the slices are equal or when they are both full slices having the + // same shape. + TensorShape stored_slice_shape(stored_slice_entry.shape()); + if (stored_slice == slice_spec || + (stored_slice_shape == val->shape() && + IsFullSlice(stored_slice, stored_slice_shape) && + IsFullSlice(slice_spec, stored_slice_shape))) { + VLOG(1) << "Optimized for common case: directly copying into " + "pre-allocated buffer; spec: " + << slice_spec.DebugString(); + status_ = GetValue(stored_slice_entry, val); + return status_; + } - // Optimization for the common case: the stored slice can be directly - // copied to the destination without additional slicing. This is true - // when either the slices are equal or when they are both full slices - // having the same shape. - TensorShape stored_slice_shape(stored_slice_entry.shape()); - if (stored_slice == slice_spec || - (stored_slice_shape == val->shape() && - IsFullSlice(stored_slice, stored_slice_shape) && - IsFullSlice(slice_spec, stored_slice_shape))) { - VLOG(1) << "Optimized for common case: directly copying into " - "pre-allocated buffer; spec: " - << slice_spec.DebugString(); - status_ = GetValue(stored_slice_entry, val); - return; - } + Tensor stored_slice_tensor(stored_slice_entry.dtype(), stored_slice_shape); + status_ = GetValue(stored_slice_entry, &stored_slice_tensor); + if (!status_.ok()) return status_; - Tensor stored_slice_tensor(stored_slice_entry.dtype(), - stored_slice_shape); - status_ = GetValue(stored_slice_entry, &stored_slice_tensor); - if (!status_.ok()) return; - - // Copies the intersection over. - mutex_lock l(mu_); - // `CopyDataFromTensorSliceToTensorSlice` will write to `val`, so - // protecting it with a mutex lock. - const DataType common_dtype = full_tensor_entry.dtype(); - switch (common_dtype) { + // Copies the intersection over. 
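// For illustration, a standalone 1-D sketch of what copying the intersection
// between a stored slice and a requested slice means. Slice1D and
// CopyIntersection1D are made-up names, not TensorFlow APIs; the code below
// performs the real N-D version via CopyDataFromTensorSliceToTensorSlice.
#include <algorithm>
#include <cstdint>

struct Slice1D {
  int64_t start;
  int64_t length;
};

// Copies stored_data into wanted_data only where the two slices overlap,
// with each buffer indexed relative to its own slice origin.
void CopyIntersection1D(const Slice1D& stored, const float* stored_data,
                        const Slice1D& wanted, float* wanted_data) {
  const int64_t lo = std::max(stored.start, wanted.start);
  const int64_t hi = std::min(stored.start + stored.length,
                              wanted.start + wanted.length);
  for (int64_t i = lo; i < hi; ++i) {
    wanted_data[i - wanted.start] = stored_data[i - stored.start];
  }
}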
+ const DataType common_dtype = full_tensor_entry.dtype(); + switch (common_dtype) { #define HANDLE_COPY(T) \ case DataTypeToEnum::value: \ CHECK(CopyDataFromTensorSliceToTensorSlice( \ full_shape, stored_slice, slice_spec, \ stored_slice_tensor.flat().data(), val->flat().data())); \ break; - HANDLE_COPY(float) - HANDLE_COPY(double) - HANDLE_COPY(int32) - HANDLE_COPY(uint8) - HANDLE_COPY(int16) - HANDLE_COPY(int8) - HANDLE_COPY(complex64) - HANDLE_COPY(complex128) - HANDLE_COPY(int64) - HANDLE_COPY(bool) - HANDLE_COPY(qint32) - HANDLE_COPY(quint8) - HANDLE_COPY(qint8) - HANDLE_COPY(bfloat16) - default: - status_ = errors::InvalidArgument( - "Dtype ", DataTypeString(common_dtype), " not supported."); - if (!status_.ok()) return; - } + + HANDLE_COPY(float) + HANDLE_COPY(double) + HANDLE_COPY(int32) + HANDLE_COPY(uint8) + HANDLE_COPY(int16) + HANDLE_COPY(int8) + HANDLE_COPY(complex64) + HANDLE_COPY(complex128) + HANDLE_COPY(int64) + HANDLE_COPY(bool) + HANDLE_COPY(qint32) + HANDLE_COPY(quint8) + HANDLE_COPY(qint8) + HANDLE_COPY(bfloat16) + default: + return errors::InvalidArgument("Dtype ", DataTypeString(common_dtype), + " not supported."); + } #undef HANDLE_COPY - }); } - - counter.Wait(); - TF_RETURN_IF_ERROR(status_); - return Status::OK(); } diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.h b/tensorflow/core/util/tensor_bundle/tensor_bundle.h index 24a9c488cbb..c441000e47d 100644 --- a/tensorflow/core/util/tensor_bundle/tensor_bundle.h +++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.h @@ -306,9 +306,6 @@ class BundleReader { // differs from that of the current system's processor architecture. bool need_to_swap_bytes_; - // Protect internal states when accessing from multiple threads. - mutable mutex mu_; - friend class TensorBundleAlignmentTest; // For testing data alignment. TF_DISALLOW_COPY_AND_ASSIGN(BundleReader); From 8dabb7fc055a5da75cf87d8fd019102746c431da Mon Sep 17 00:00:00 2001 From: Robert David Date: Wed, 27 May 2020 19:15:15 -0700 Subject: [PATCH 1271/1533] External_delegate_provider depends on libdl (except on Windows), so add the necessary linkopts. PiperOrigin-RevId: 313508426 Change-Id: I5513f518d98197b90d2c73e4960d7ff517ab93cd --- tensorflow/lite/tools/delegates/BUILD | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/tools/delegates/BUILD b/tensorflow/lite/tools/delegates/BUILD index d2eac9d7348..9f2108d07d6 100644 --- a/tensorflow/lite/tools/delegates/BUILD +++ b/tensorflow/lite/tools/delegates/BUILD @@ -143,6 +143,10 @@ cc_library( name = "external_delegate_provider", srcs = ["external_delegate_provider.cc"], copts = tflite_copts(), + linkopts = select({ + "//tensorflow:windows": [], + "//conditions:default": ["-ldl"], + }), linkstatic = True, visibility = ["//visibility:public"], deps = [ From 4707ae6dc0b71253ab1dc3ee9f3d07d54a84a338 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Wed, 27 May 2020 20:03:41 -0700 Subject: [PATCH 1272/1533] Reduce Layer.__call__ overhead by ~10-15%. Improvements: - Use DType objects rather than strings where possible, to avoid expensive dtypes.as_dtype calls. 
- Better ordering of type checks in tf_utils.is_symbolic_tensor for common cases - Avoid a double attr read in _auto_cast_variable_read_dtype PiperOrigin-RevId: 313512845 Change-Id: I9107bce81bcd6e65732215b3774ffd5a582b24f0 --- tensorflow/python/framework/ops.py | 7 ++-- tensorflow/python/keras/engine/base_layer.py | 32 +++++++++++++------ .../python/keras/engine/base_layer_utils.py | 2 +- .../python/keras/engine/base_layer_v1.py | 12 +++++-- .../experimental/autocast_variable_test.py | 3 +- .../keras/saving/saved_model/save_impl.py | 3 +- tensorflow/python/keras/utils/tf_utils.py | 23 ++++++------- 7 files changed, 52 insertions(+), 30 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index b68d613e045..f83f65152c1 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -5133,14 +5133,13 @@ class Graph(object): Returns: The dtype that instances of `AutoCastVariable` will be casted to. """ - if not hasattr(self._thread_local, "_auto_cast_variable_read_dtype"): + dtype = getattr(self._thread_local, "_auto_cast_variable_read_dtype", None) + if dtype is None: self._thread_local._auto_cast_variable_read_dtype = None # pylint: disable=protected-access - return self._thread_local._auto_cast_variable_read_dtype # pylint: disable=protected-access + return dtype @_auto_cast_variable_read_dtype.setter def _auto_cast_variable_read_dtype(self, dtype): - if dtype: - dtype = dtypes.as_dtype(dtype) self._thread_local._auto_cast_variable_read_dtype = dtype # pylint: disable=protected-access @tf_contextlib.contextmanager diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 0eb6954d2a8..9958f70ed55 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -88,6 +88,11 @@ from tensorflow.tools.docs import doc_controls # Prefix that is added to the TF op layer names. _TF_OP_LAYER_NAME_PREFIX = 'tf_op_layer_' +# TODO(mdan): Should we have a single generic type for types that can be passed +# to tf.cast? +_AUTOCAST_TYPES = (ops.Tensor, sparse_tensor.SparseTensor, + ragged_tensor.RaggedTensor) + _keras_layers_gauge = monitoring.BoolGauge('/tensorflow/api/keras/layers', 'keras layers usage', 'method') _keras_model_gauge = monitoring.BoolGauge( @@ -931,7 +936,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): try: with base_layer_utils.autocast_context_manager( - self._compute_dtype): + self._compute_dtype_object): # Add auto_control_deps in V2 when they are not already added by # a `tf.function`. if (ops.executing_eagerly_outside_functions() and @@ -993,7 +998,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): self._maybe_build(inputs) cast_inputs = self._maybe_cast_inputs(inputs, input_list) with base_layer_utils.autocast_context_manager( - self._compute_dtype): + self._compute_dtype_object): outputs = self.call(cast_inputs, *args, **kwargs) self._handle_activity_regularization(inputs, outputs) self._set_mask_metadata(inputs, outputs, input_masks, build_graph) @@ -2115,6 +2120,15 @@ class Layer(module.Module, version_utils.LayerVersionSelector): self._dtype_defaulted_to_floatx = (not dtype and policy.policy_defaults_to_floatx()) + # Performance optimization: cache the compute dtype as a Dtype object or + # None, so that str to Dtype conversion doesn't happen in Layer.__call__. + # TODO(b/157486353): Investigate returning DTypes in Policy. 
+ if self._dtype_policy.compute_dtype: + self._compute_dtype_object = dtypes.as_dtype( + self._dtype_policy.compute_dtype) + else: + self._compute_dtype_object = None + # TODO(reedwm): Expose this property? @property def _compute_dtype(self): @@ -2142,22 +2156,20 @@ class Layer(module.Module, version_utils.LayerVersionSelector): Returns: `inputs`, but tensors may have been casted to self._compute_dtype """ - compute_dtype = self._compute_dtype + compute_dtype_object = self._compute_dtype_object should_autocast = ( - self._autocast and compute_dtype and - dtypes.as_dtype(compute_dtype).is_floating) + self._autocast and compute_dtype_object and + compute_dtype_object.is_floating) if (should_autocast and - any(self._should_cast_single_input(x) for x in input_list)): + any(map(self._should_cast_single_input, input_list))): # Only perform expensive `nest` operation when needed. return nest.map_structure(self._cast_single_input, inputs) else: return inputs def _should_cast_single_input(self, x): - cast_types = (ops.Tensor, sparse_tensor.SparseTensor, - ragged_tensor.RaggedTensor) - return (isinstance(x, cast_types) and x.dtype.is_floating and + return (isinstance(x, _AUTOCAST_TYPES) and x.dtype.is_floating and x.dtype.base_dtype.name != self._compute_dtype) def _cast_single_input(self, x): @@ -2165,7 +2177,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): if self._should_cast_single_input(x): if self._dtype_defaulted_to_floatx: self._warn_about_input_casting(x.dtype.base_dtype) - return math_ops.cast(x, self._compute_dtype) + return math_ops.cast(x, self._compute_dtype_object) else: return x diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index 7e4e0e5da4a..6d25995e4c2 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -493,7 +493,7 @@ def autocast_context_manager(dtype): Returns: A context manager to automatically cast AutoCastVariables. """ - if dtype and not dtypes.as_dtype(dtype).is_floating: + if dtype and not dtype.is_floating: dtype = None return ops.get_default_graph()._enable_auto_casting_variables(dtype) # pylint: disable=protected-access diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py index 80e0b4be2f1..4c0826be4dc 100644 --- a/tensorflow/python/keras/engine/base_layer_v1.py +++ b/tensorflow/python/keras/engine/base_layer_v1.py @@ -768,7 +768,7 @@ class Layer(base_layer.Layer): if not self.dynamic: try: with base_layer_utils.autocast_context_manager( - self._compute_dtype): + self._compute_dtype_object): outputs = call_fn(cast_inputs, *args, **kwargs) except errors.OperatorNotAllowedInGraphError as e: @@ -813,7 +813,7 @@ class Layer(base_layer.Layer): self._maybe_build(inputs) cast_inputs = self._maybe_cast_inputs(inputs) with base_layer_utils.autocast_context_manager( - self._compute_dtype): + self._compute_dtype_object): outputs = self.call(cast_inputs, *args, **kwargs) self._handle_activity_regularization(inputs, outputs) self._set_mask_metadata(inputs, outputs, input_masks) @@ -1749,6 +1749,14 @@ class Layer(base_layer.Layer): self._dtype_defaulted_to_floatx = (not dtype and policy.policy_defaults_to_floatx()) + # Performance optimization: cache the compute dtype as a Dtype object or + # None, so that str to Dtype conversion doesn't happen in Layer.__call__. 
+ if self._dtype_policy.compute_dtype: + self._compute_dtype_object = dtypes.as_dtype( + self._dtype_policy.compute_dtype) + else: + self._compute_dtype_object = None + # TODO(reedwm): Expose this property? @property def _compute_dtype(self): diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index 41591d3edbd..78041973cc1 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -146,7 +146,8 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertEqual(x.true_dtype, dtypes.float32) self.assertIsInstance(x.true_dtype, dtypes.DType) - with ops.get_default_graph()._enable_auto_casting_variables('float16'): + dtype = dtypes.float16 + with ops.get_default_graph()._enable_auto_casting_variables(dtype): self.assertEqual(x.dtype, dtypes.float16) self.assertIsInstance(x.dtype, dtypes.DType) self.assertEqual(x.true_dtype, dtypes.float32) diff --git a/tensorflow/python/keras/saving/saved_model/save_impl.py b/tensorflow/python/keras/saving/saved_model/save_impl.py index f3be6d0a595..7802470c523 100644 --- a/tensorflow/python/keras/saving/saved_model/save_impl.py +++ b/tensorflow/python/keras/saving/saved_model/save_impl.py @@ -521,7 +521,8 @@ def layer_call_wrapper(call_collection, method): with base_layer_utils.call_context().enter( layer, inputs=inputs, build_graph=False, training=training, saving=True): - with base_layer_utils.autocast_context_manager(layer._compute_dtype): # pylint: disable=protected-access + with base_layer_utils.autocast_context_manager( + layer._compute_dtype_object): # pylint: disable=protected-access ret = method(*args, **kwargs) _restore_layer_losses(original_losses) return ret diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py index 220df9c7f8a..b87ca1623b0 100644 --- a/tensorflow/python/keras/utils/tf_utils.py +++ b/tensorflow/python/keras/utils/tf_utils.py @@ -328,7 +328,7 @@ def shape_type_conversion(fn): def are_all_symbolic_tensors(tensors): - return all(is_symbolic_tensor(tensor) for tensor in tensors) + return all(map(is_symbolic_tensor, tensors)) _user_convertible_tensor_types = set() @@ -346,9 +346,12 @@ def is_symbolic_tensor(tensor): Returns: True for symbolic tensors, False for eager tensors. """ - if isinstance(tensor, tuple(_user_convertible_tensor_types)): - tensor = ops.convert_to_tensor_or_composite(tensor) - if isinstance(tensor, variables.Variable): + if isinstance(tensor, ops.Tensor): + return hasattr(tensor, 'graph') + elif isinstance(tensor, composite_tensor.CompositeTensor): + component_tensors = nest.flatten(tensor, expand_composites=True) + return any(hasattr(t, 'graph') for t in component_tensors) + elif isinstance(tensor, variables.Variable): # Variables that are output of a Keras Layer in Functional API mode # should be considered symbolic. # TODO(omalleyt): We need a better way to check this in order to @@ -356,12 +359,11 @@ def is_symbolic_tensor(tensor): # return Variables as outputs. 
return (getattr(tensor, '_keras_history', False) or not context.executing_eagerly()) - if isinstance(tensor, composite_tensor.CompositeTensor): - component_tensors = nest.flatten(tensor, expand_composites=True) - return any(hasattr(t, 'graph') for t in component_tensors) - if isinstance(tensor, ops.Tensor): - return hasattr(tensor, 'graph') - return False + elif isinstance(tensor, tuple(_user_convertible_tensor_types)): + tensor = ops.convert_to_tensor_or_composite(tensor) + return is_symbolic_tensor(tensor) + else: + return False def register_symbolic_tensor_type(cls): @@ -526,4 +528,3 @@ def to_numpy_or_python_type(tensors): return t # Don't turn ragged or sparse tensors to NumPy. return nest.map_structure(_to_single_numpy_or_python_type, tensors) - From d794d8fa3339b57d5fdd2a828b5c4fada5f850bf Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Wed, 27 May 2020 20:14:23 -0700 Subject: [PATCH 1273/1533] internal code change PiperOrigin-RevId: 313514006 Change-Id: I2c708bcff4e52797c203a7c7f4637346e7df2125 --- tensorflow/lite/experimental/delegates/coreml/BUILD | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/delegates/coreml/BUILD b/tensorflow/lite/experimental/delegates/coreml/BUILD index 193f2e0223b..2985cd3a315 100644 --- a/tensorflow/lite/experimental/delegates/coreml/BUILD +++ b/tensorflow/lite/experimental/delegates/coreml/BUILD @@ -1,4 +1,4 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== + package(default_visibility = [ "//visibility:public", ]) From 0e1d9dcd8078121f3501a0f32b5fcf64efd41c19 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 27 May 2020 20:25:33 -0700 Subject: [PATCH 1274/1533] [XLA] [NFC] Refactor `unique_ptr` into `ShapedBuffer` std::unique_ptr wrapping `ShapedBuffer` is not idiomatic, cf. cl/193599773 PiperOrigin-RevId: 313515037 Change-Id: Ic29a419104f4d2cd908642217d3763cff990793b --- tensorflow/compiler/jit/xla_launch_util.cc | 10 ++++------ tensorflow/compiler/jit/xla_launch_util.h | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index e0ec990462b..8c24f182f5c 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -201,9 +201,7 @@ void XlaComputationLaunchContext::PopulateInputs( se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; // Build ShapedBuffers that point directly to the Tensor buffers. - arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1); - arg_buffers_.resize(kernel->xla_input_shapes.size()); - arg_ptrs_ = std::vector(arg_buffers_.size()); + arg_ptrs_ = std::vector(kernel->xla_input_shapes.size()); // Pass remaining parameters. 
const Tensor* t; @@ -239,11 +237,11 @@ void XlaComputationLaunchContext::PopulateInputs( << " not the same as on-host shape " << xla::ShapeUtil::HumanStringWithLayout(shape); se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); - arg_buffers_[i] = absl::make_unique( + arg_buffers_.emplace_back( /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(), client_->default_device_ordinal()); - arg_buffers_[i]->set_buffer(dmem, /*index=*/{}); - arg_ptrs_[i] = arg_buffers_[i].get(); + arg_buffers_.back().set_buffer(dmem, /*index=*/{}); + arg_ptrs_[i] = &arg_buffers_.back(); } } } diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 511e0f1451a..cf68dcb7dd6 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -165,7 +165,7 @@ class XlaComputationLaunchContext { se::DeviceMemoryAllocator* xla_allocator_; bool allocate_xla_tensors_; bool use_multiple_streams_; - std::vector> arg_buffers_; + std::deque arg_buffers_; std::vector arg_ptrs_; }; From 96c6a5af91b30cb0b77943dd396b19d12ed60322 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 27 May 2020 20:41:03 -0700 Subject: [PATCH 1275/1533] [XLA] [NFC] Drop unused constructor of ExecutionInput PiperOrigin-RevId: 313516652 Change-Id: Ic8cbde796e423395a0dea843640f27f9539c3b9e --- tensorflow/compiler/xla/service/executable.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index f1ac1fef451..5d7bd26b01e 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -63,10 +63,6 @@ class ExecutionInput { explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) {} explicit ExecutionInput(ShapeTree buffers) : buffers_(std::move(buffers)) {} - ExecutionInput(ShapeTree buffers, - std::vector owner_held_indices) - : buffers_(std::move(buffers)), - unowned_indices_(std::move(owner_held_indices)) {} ExecutionInput(ExecutionInput&&) = default; ~ExecutionInput() { From fe4609b44d6fecbc7eb1d2a0a63f4b9a9df290d0 Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Wed, 27 May 2020 20:59:13 -0700 Subject: [PATCH 1276/1533] Add profiling label to distinguish sparse FC from dense ones. PiperOrigin-RevId: 313518411 Change-Id: I401e062dd284145106a946de9231ac1431a0b92a --- .../kernels/internal/optimized/sparse_ops/fully_connected.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/lite/kernels/internal/optimized/sparse_ops/fully_connected.h b/tensorflow/lite/kernels/internal/optimized/sparse_ops/fully_connected.h index 750e63e152f..969caa57d47 100644 --- a/tensorflow/lite/kernels/internal/optimized/sparse_ops/fully_connected.h +++ b/tensorflow/lite/kernels/internal/optimized/sparse_ops/fully_connected.h @@ -15,6 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SPARSE_OPS_FULLY_CONNECTED_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_SPARSE_OPS_FULLY_CONNECTED_H_ +#include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/cppmath.h" @@ -31,6 +32,8 @@ inline void FullyConnectedSparseWeight( const RuntimeShape& weights_shape, const float* weights_data, const RuntimeShape& bias_shape, const float* bias_data, const RuntimeShape& output_shape, float* output_data) { + ruy::profiler::ScopeLabel label("FullyConnected"); + ruy::profiler::ScopeLabel inner_label("Random Sparse"); const float output_activation_min = params.float_activation_min; const float output_activation_max = params.float_activation_max; @@ -75,6 +78,8 @@ inline void FullyConnectedSparseWeight1x4( const RuntimeShape& weights_shape, const float* weights_data, const RuntimeShape& bias_shape, const float* bias_data, const RuntimeShape& output_shape, float* output_data) { + ruy::profiler::ScopeLabel label("FullyConnected"); + ruy::profiler::ScopeLabel inner_label("1x4 Block Sparse"); const float output_activation_min = params.float_activation_min; const float output_activation_max = params.float_activation_max; From dedac5053f1ca2d6a7820e330714e50d2d724cee Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 27 May 2020 21:41:37 -0700 Subject: [PATCH 1277/1533] Fix edge case bug in handling FP16 weights in XNNPACK delegate Quasi-static tensors may become subgraph outputs after partitioning; we need to explicitly exclude them from outputs and treat as static tensors. PiperOrigin-RevId: 313522428 Change-Id: I621cb575b52caa59910a281078c3c505e796880f --- .../lite/delegates/xnnpack/xnnpack_delegate.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index 32fcbee4c22..9cbdea60706 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -101,9 +101,15 @@ class Subgraph { const std::unordered_set inputs( ¶ms->input_tensors->data[0], ¶ms->input_tensors->data[params->input_tensors->size]); - const std::unordered_set outputs( - ¶ms->output_tensors->data[0], - ¶ms->output_tensors->data[params->output_tensors->size]); + std::unordered_set outputs; + for (int o = 0; o < params->output_tensors->size; o++) { + const int output_tensor_idx = params->output_tensors->data[o]; + // Exclude quasi-static tensors which may have become subgraph outputs + // after partitioning. 
+ if (delegate->static_unpacked_data_map_.count(output_tensor_idx) == 0) { + outputs.insert(output_tensor_idx); + } + } std::unordered_set externals(outputs); TfLiteIntArray* execution_plan; From 4bc4fa3cb7a15566f89f0347b3022249a2092e5c Mon Sep 17 00:00:00 2001 From: "Kim, Young Soo" Date: Thu, 28 May 2020 13:51:35 +0900 Subject: [PATCH 1278/1533] Fix a python exception issue on using Sequence object of keras with MirroredStrategy --- tensorflow/python/keras/utils/data_utils.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/utils/data_utils.py b/tensorflow/python/keras/utils/data_utils.py index 6c0122cdf72..3456db013d3 100644 --- a/tensorflow/python/keras/utils/data_utils.py +++ b/tensorflow/python/keras/utils/data_utils.py @@ -886,15 +886,18 @@ class OrderedEnqueuer(SequenceEnqueuer): `(inputs, targets)` or `(inputs, targets, sample_weights)`. """ - try: - while self.is_running(): - inputs = self.queue.get(block=True).get() - self.queue.task_done() + while self.is_running(): + try: + inputs = self.queue.get(block=True, timeout=5).get() + if self.is_running(): + self.queue.task_done() if inputs is not None: yield inputs - except Exception: # pylint: disable=broad-except - self.stop() - six.reraise(*sys.exc_info()) + except queue.Empty: + pass + except Exception: # pylint: disable=broad-except + self.stop() + six.reraise(*sys.exc_info()) def init_pool_generator(gens, random_seed=None, id_queue=None): From f1487900c65910510dd38be0135444f9e39b2a93 Mon Sep 17 00:00:00 2001 From: Fausto Morales Date: Thu, 28 May 2020 00:02:04 -0500 Subject: [PATCH 1279/1533] Format ctc_decode test correctly. --- tensorflow/python/keras/backend_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py index 2dff49e61e6..95de4f5824c 100644 --- a/tensorflow/python/keras/backend_test.py +++ b/tensorflow/python/keras/backend_test.py @@ -1771,7 +1771,10 @@ class TestCTC(test.TestCase): -3.777835 # output beam 1 ], np.float32)[np.newaxis, :] - decode_truth = [np.array([1, 0, -1, -1, -1, -1, -1]), np.array([0, 1, 0, -1, -1, -1 ,-1])] + decode_truth = [ + np.array([1, 0, -1, -1, -1, -1, -1]), + np.array([0, 1, 0, -1, -1, -1, -1]) + ] beam_width = 2 top_paths = 2 From edab186fff8bd2e11a723d9be381f2294cb5cb3a Mon Sep 17 00:00:00 2001 From: Yanhua Sun Date: Wed, 27 May 2020 22:01:01 -0700 Subject: [PATCH 1280/1533] Add reduce_logsumexp benchmark with experiement compile PiperOrigin-RevId: 313524161 Change-Id: I3e8b2124af94ddd86fcbd7898286356a5435bed9 --- tensorflow/python/eager/benchmarks_test.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index b6a504eb291..72eaa663c6c 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -852,11 +852,13 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): def _benchmark_tf_reduce_logsumexp(self, device=CPU, execution_mode=None, - defunc=False): + defunc=False, + xla_compile=False): with context.device(device): x = constant_op.constant([[1, 0.], [0., 0.]]) if defunc: - reduce_func = def_function.function(math_ops.reduce_logsumexp) + reduce_func = def_function.function( + math_ops.reduce_logsumexp, experimental_compile=xla_compile) func = lambda: reduce_func(x) else: func = lambda: math_ops.reduce_logsumexp(x) @@ -897,6 +899,16 @@ 
class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._benchmark_tf_reduce_logsumexp( device=GPU, execution_mode=context.ASYNC, defunc=True) + @test_util.disable_tfrt("reduce logsumexp not supported") + def benchmark_tf_reduce_logsumexp_GPU_defun_compile(self): + self._benchmark_tf_reduce_logsumexp( + device=GPU, defunc=True, xla_compile=True) + + @test_util.disable_tfrt("reduce logsumexp not supported") + def benchmark_tf_reduce_logsumexp_GPU_async_defun_compile(self): + self._benchmark_tf_reduce_logsumexp( + device=GPU, execution_mode=context.ASYNC, defunc=True, xla_compile=True) + def _benchmark_tf_tensordot(self, device=CPU, execution_mode=None): with context.device(device): a = array_ops.ones((2, 2)) From ad74ce74914bb221d31cc95cd843c54174a998d7 Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Wed, 27 May 2020 22:47:03 -0700 Subject: [PATCH 1281/1533] Provide more verbose error message in SetAllDimensions() Error messages will include actual tensor dimension information. PiperOrigin-RevId: 313529606 Change-Id: I88631ea2ebba796fe266a5d0ea3ea73e4e3ad3ed --- .../gpu/common/model_builder_helper.cc | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc index 4973a8179cd..9a15f940fbd 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" #include "tensorflow/lite/builtin_ops.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/context.h" @@ -220,14 +221,18 @@ absl::Status CreateVectorCopyData(const TfLiteTensor& tensor, return absl::OkStatus(); } +const std::string GetDimensionString(const TfLiteIntArray* dimensions) { + return absl::StrJoin(TfLiteIntArrayView(dimensions), "x"); +} + absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Scalar* shape) { if (dimensions->size < 0) { return absl::InvalidArgumentError("Invalid Scalar dimensions"); } for (int i = 0; i < dimensions->size; ++i) { if (dimensions->data[i] != 1) { - return absl::InvalidArgumentError( - "Dimension can not be reduced to scalar."); + return absl::InvalidArgumentError(absl::StrCat( + GetDimensionString(dimensions), " cannot be reduced to scalar.")); } } shape->v = 1; @@ -240,8 +245,8 @@ absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Linear* shape) { } for (int i = 0; i < dimensions->size - 1; ++i) { if (dimensions->data[i] != 1) { - return absl::InvalidArgumentError( - "Dimension can not be reduced to linear."); + return absl::InvalidArgumentError(absl::StrCat( + GetDimensionString(dimensions), " cannot be reduced to linear.")); } } shape->v = dimensions->data[dimensions->size - 1]; @@ -250,7 +255,9 @@ absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Linear* shape) { absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, HWC* shape) { if (dimensions->size != 4) { - return absl::InvalidArgumentError("Dimensions are not HWC"); + return absl::InvalidArgumentError( + absl::StrCat("Expected a 4D tensor of shape 1xHxWxC but got ", + GetDimensionString(dimensions))); } if (dimensions->data[0] != 1) { return absl::UnimplementedError("Batch size is not equal to 1."); @@ -263,7 +270,9 @@ absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, HWC* shape) { absl::Status 
SetAllDimensions(const TfLiteIntArray* dimensions, HW* shape) { if (dimensions->size != 2) { - return absl::InvalidArgumentError("Dimensions are not HW"); + return absl::InvalidArgumentError( + absl::StrCat("Expected a 2D tensor of shape HxW but got ", + GetDimensionString(dimensions))); } shape->h = dimensions->data[0]; shape->w = dimensions->data[1]; @@ -273,7 +282,8 @@ absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, HW* shape) { absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, OHWI* shape) { if (dimensions->size != 4) { return absl::InvalidArgumentError( - absl::StrCat("Dimensions are not OHWI: ", dimensions->size)); + absl::StrCat("Expected a 4D tensor of shape OxHxWxI but got ", + GetDimensionString(dimensions))); } shape->o = dimensions->data[0]; shape->h = dimensions->data[1]; @@ -284,7 +294,9 @@ absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, OHWI* shape) { absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, BHWC* shape) { if (dimensions->size != 4) { - return absl::InvalidArgumentError("Dimensions are not BHWC"); + return absl::InvalidArgumentError( + absl::StrCat("Expected a 4D tensor of shape BxHxWxC but got ", + GetDimensionString(dimensions))); } shape->b = dimensions->data[0]; shape->h = dimensions->data[1]; From 3e7ba8944b872e755f63e0ebeee649ad97adb9ec Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Wed, 27 May 2020 22:55:00 -0700 Subject: [PATCH 1282/1533] Explicitly specify input array `dtype` to TensorFlow's `assertAllEqual()` test util function. PiperOrigin-RevId: 313530667 Change-Id: Ia251459edaeb89318d012cf0ff54c9b43a56fe49 --- tensorflow/python/kernel_tests/slice_op_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py index 15a340ef270..b53147552c3 100644 --- a/tensorflow/python/kernel_tests/slice_op_test.py +++ b/tensorflow/python/kernel_tests/slice_op_test.py @@ -247,8 +247,8 @@ class SliceTest(test.TestCase): slice_t = array_ops.slice(a, [0, 0], [2, 2]) slice2_t = a[:2, :2] slice_val, slice2_val = self.evaluate([slice_t, slice2_t]) - self.assertAllEqual(slice_val, inp[:2, :2]) - self.assertAllEqual(slice2_val, inp[:2, :2]) + self.assertAllEqual(slice_val, np.array(inp[:2, :2], dtype=np.float32)) + self.assertAllEqual(slice2_val, np.array(inp[:2, :2], dtype=np.float32)) self.assertEqual(slice_val.shape, slice_t.get_shape()) self.assertEqual(slice2_val.shape, slice2_t.get_shape()) From 9fef2db9ff0007a7c35ba845c753874569187a02 Mon Sep 17 00:00:00 2001 From: Xinyi Wang Date: Wed, 27 May 2020 23:25:28 -0700 Subject: [PATCH 1283/1533] Implement experimental_distribute_dataset_from_function for MultiWorkerMirroredStrategy PiperOrigin-RevId: 313533630 Change-Id: I6c517116f0e20de945911516898542c1aa8736ae --- tensorflow/python/distribute/BUILD | 5 +++ .../collective_all_reduce_strategy.py | 8 +++++ .../python/distribute/strategy_common_test.py | 35 +++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 01ae1b61f6a..585af92d15c 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1679,11 +1679,16 @@ cuda_py_test( xla_enable_strict_auto_jit = True, deps = [ ":combinations", + ":multi_worker_test_base", ":reduce_util", ":strategy_combinations", + ":strategy_test_lib", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", + 
"//tensorflow/python:constant_op", "//tensorflow/python:dtypes", + "//tensorflow/python:math_ops", + "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/eager:def_function", "@absl_py//absl/testing:parameterized", ], diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index 7c7f521af98..40c60241ac0 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -418,6 +418,14 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): split_batch_by=self._num_replicas_in_sync, input_context=input_context) + def _experimental_distribute_datasets_from_function(self, dataset_fn): + input_context = self._make_input_context() + return input_lib.get_distributed_datasets_from_function( + dataset_fn=dataset_fn, + input_workers=self._input_workers, + input_contexts=[input_context], + strategy=self._container_strategy()) + def _make_dataset_iterator(self, dataset): """Distributes the dataset to each local GPU.""" input_context = self._make_input_context() diff --git a/tensorflow/python/distribute/strategy_common_test.py b/tensorflow/python/distribute/strategy_common_test.py index c277310b6a0..4ed5054af2d 100644 --- a/tensorflow/python/distribute/strategy_common_test.py +++ b/tensorflow/python/distribute/strategy_common_test.py @@ -20,12 +20,16 @@ from __future__ import print_function from absl.testing import parameterized +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import multi_worker_test_base from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.distribute import strategy_test_lib from tensorflow.python.eager import def_function from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -61,5 +65,36 @@ class StrategyReduceTest(test.TestCase, parameterized.TestCase): self.assertEqual(fn_graph().numpy(), 1.0 * strategy.num_replicas_in_sync) +class DistributedCollectiveAllReduceStrategyTest( + strategy_test_lib.DistributionTestBase, + parameterized.TestCase): + + @combinations.generate( + combinations.combine( + strategy=[strategy_combinations.multi_worker_mirrored_two_workers], + mode=['eager'])) + def testDatasetFromFunction(self, strategy): + def dataset_fn(input_context): + global_batch_size = 10 + batch_size = input_context.get_per_replica_batch_size(global_batch_size) + d = dataset_ops.DatasetV2.range(100).repeat().batch(batch_size) + return d.shard(input_context.num_input_pipelines, + input_context.input_pipeline_id) + + expected_sum_on_workers = [10, 35] + input_iterator = iter( + strategy.experimental_distribute_datasets_from_function(dataset_fn)) + + @def_function.function + def run(iterator): + return strategy.experimental_local_results(iterator.get_next()) + + result = run(input_iterator) + sum_value = math_ops.reduce_sum(result) + self.assertEqual( + sum_value.numpy(), + expected_sum_on_workers[multi_worker_test_base.get_task_index()]) + + if __name__ == '__main__': combinations.main() From 32a7d53c828a4d4656db818fe35fd96acfd6e18c Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Thu, 28 May 2020 00:08:40 -0700 Subject: [PATCH 1284/1533] Typo fix in strategy_combinations.py PiperOrigin-RevId: 
313537352 Change-Id: I31d1d628c37316d04e472cc87d54fa72ce8e26fe --- tensorflow/python/distribute/strategy_combinations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/distribute/strategy_combinations.py b/tensorflow/python/distribute/strategy_combinations.py index ca7a0d6d1fc..350b187f67f 100644 --- a/tensorflow/python/distribute/strategy_combinations.py +++ b/tensorflow/python/distribute/strategy_combinations.py @@ -159,13 +159,13 @@ central_storage_strategy_with_gpu_and_cpu = combinations.NamedDistribution( ["/gpu:0", "/cpu:0"]), required_gpus=1) multi_worker_mirrored_two_workers = combinations.NamedDistribution( - "MultiWorkerMirrroedTwoWorkers", + "MultiWorkerMirroredTwoWorkers", collective_all_reduce_strategy.CollectiveAllReduceStrategy, has_chief=False, num_workers=2, ) multi_worker_mirrored_one_chief_one_worker = combinations.NamedDistribution( - "MultiWorkerMirrroedOneChiefOneWorker", + "MultiWorkerMirroredOneChiefOneWorker", collective_all_reduce_strategy.CollectiveAllReduceStrategy, has_chief=True, num_workers=1, From 1a13e3c8234d9040c26010af7ba4193a970b52b4 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Thu, 28 May 2020 00:10:31 -0700 Subject: [PATCH 1285/1533] Re-land "Make the tf2xla "tensor list size not set" error message a bit more ergonomic" The CL was rolled back due to a flaky test. PiperOrigin-RevId: 313537542 Change-Id: I4a60e924e53178802f70cb041a384f625e278b80 --- .../compiler/tf2xla/kernels/tensor_list_ops.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index d01f094dc2e..976ff91f6ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -136,8 +136,11 @@ class TensorListReserveOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements)); OP_REQUIRES( ctx, num_elements >= 0, - errors::InvalidArgument("XLA compilation requires a fixed tensor list " - "size. Set the number of elements.")); + errors::InvalidArgument( + "XLA compilation requires a fixed tensor list size. Set the number " + "of elements. This could also happen if you're using a TensorArray " + "in a while loop that does not have its maximum_iteration set, you " + "can fix this by setting maximum_iteration to a suitable value.")); // If element shape is compile time constant and it's not "unknown rank" // shape (-1), create an initialized TensorList. Otherwise create an @@ -197,10 +200,13 @@ class EmptyTensorListOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { int64 max_num_elements; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements)); - OP_REQUIRES( - ctx, max_num_elements >= 0, - errors::InvalidArgument("XLA compilation requires a fixed tensor list " - "size. Set the max number of elements.")); + OP_REQUIRES(ctx, max_num_elements >= 0, + errors::InvalidArgument( + "XLA compilation requires a fixed tensor list size. Set " + "the max number of elements. This could also happen if " + "you're using a TensorArray in a while loop that does not " + "have its maximum_iteration set, you can fix this by " + "setting maximum_iteration to a suitable value.")); if (dtype_ != DT_VARIANT) { // We are creating a non-nested TensorList. 
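The updated error text recommends setting `maximum_iteration` on the surrounding while loop. For illustration only, a minimal sketch of what that fix can look like from the Python side (the function, tensor shapes, and the bound of 64 below are assumptions for the example, not part of this patch; `experimental_compile` is the XLA flag used elsewhere in this series):

    import tensorflow as tf

    @tf.function(experimental_compile=True)  # compile with XLA
    def running_sum(x):
      # Dynamically sized TensorArray, backed by a TensorList under the hood.
      ta = tf.TensorArray(tf.float32, size=0, dynamic_size=True)

      def body(i, acc, ta):
        acc = acc + x[i]
        return i + 1, acc, ta.write(i, acc)

      # Per the new error message, maximum_iterations provides the fixed upper
      # bound on the loop (and thus on the accumulated list) that XLA needs.
      _, _, ta = tf.while_loop(
          cond=lambda i, acc, ta: i < tf.shape(x)[0],
          body=body,
          loop_vars=(0, 0.0, ta),
          maximum_iterations=64)
      return ta.stack()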
From 83a67afb5936f8d0fc27cb91bd8128280801ed48 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 28 May 2020 00:12:32 -0700 Subject: [PATCH 1286/1533] Disable MLIR HLO buffer-assignment test MLIR is getting more defensive against misconfiguration, the "trick" used for this test is not possible anymore. PiperOrigin-RevId: 313537734 Change-Id: Ib8596960d18ddaba877443226b46af1ea799e254 --- tensorflow/compiler/mlir/xla/tests/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/mlir/xla/tests/BUILD b/tensorflow/compiler/mlir/xla/tests/BUILD index ad69383bd98..e2f747085c1 100644 --- a/tensorflow/compiler/mlir/xla/tests/BUILD +++ b/tensorflow/compiler/mlir/xla/tests/BUILD @@ -6,6 +6,7 @@ package(licenses = ["notice"]) glob_lit_tests( data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", + exclude = ["buffer-assignment.mlir"], # TODO(b/157616173) test_file_exts = ["mlir"], ) From e651638ee41e42fdf9f4bed4b062792fbb67cef1 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 28 May 2020 00:33:52 -0700 Subject: [PATCH 1287/1533] Add missing dialect registration to MLIR TF Lite lstm_utils_test This is caught by new assertions added in MLIR upstream to detect such misconfiguration. PiperOrigin-RevId: 313539845 Change-Id: I90735dca1c7e417b3e3f42fa144177522bc242a4 --- .../compiler/mlir/lite/utils/lstm_utils_test.cc | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc index 5df57de6f71..081ba7ac6e7 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc @@ -35,6 +35,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/test.h" namespace mlir { @@ -92,7 +93,9 @@ class LstmUtilsTest : public ::testing::Test { LstmUtilsTest() {} void SetUp() override { - builder_ = std::unique_ptr(new Builder(&context_)); + RegisterDialects(); + context_ = std::make_unique(); + builder_ = std::unique_ptr(new Builder(context_.get())); fused_lstm_func_ = createLstmCompositeFunc(builder_.get(), false, false); fused_lstm_func_cifg_ = createLstmCompositeFunc(builder_.get(), false, true); @@ -105,10 +108,17 @@ class LstmUtilsTest : public ::testing::Test { fused_ln_lstm_func_.erase(); builder_.reset(); } + + void RegisterDialects() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + } + FuncOp fused_lstm_func_; FuncOp fused_lstm_func_cifg_; FuncOp fused_ln_lstm_func_; - mlir::MLIRContext context_; + std::unique_ptr context_; std::unique_ptr builder_; }; From 242e60bf2198bd9a19cb4879fcd98ef74b939ad2 Mon Sep 17 00:00:00 2001 From: Vincent ABRIOU Date: Wed, 20 May 2020 17:37:07 +0200 Subject: [PATCH 1288/1533] TFLite: add EXTRA_CFLAGS variable Since commit SHA1: d28cf21aa51d12ce9c526f7baf5137bc2e2b7f7d Cross compilation of TFLite is failing. Add an EXTRA_CFLAGS variable to allow cross compilation environment to define extra CFLAGS when needed. 
Signed-off-by: Vincent ABRIOU --- tensorflow/lite/tools/make/Makefile | 2 +- tensorflow/lite/tools/pip_package/setup.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index 3635ac95167..aa96714fe3b 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -58,7 +58,7 @@ LIBS := \ # There are no rules for compiling objects for the host system (since we don't # generate things like the protobuf compiler that require that), so all of # these settings are for the target compiler. -CFLAGS := -O3 -DNDEBUG -fPIC +CFLAGS := -O3 -DNDEBUG -fPIC $(EXTRA_CFLAGS) CXXFLAGS := $(CFLAGS) --std=c++11 $(EXTRA_CXXFLAGS) LDOPTS := -L/usr/local/lib ARFLAGS := -r diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py index 2f2515145c4..98945f5aed6 100644 --- a/tensorflow/lite/tools/pip_package/setup.py +++ b/tensorflow/lite/tools/pip_package/setup.py @@ -58,7 +58,8 @@ elif TARGET == 'aarch64': os.environ['CC'] = 'aarch64-linux-gnu-gcc' MAKE_CROSS_OPTIONS = [] -for name in ['TARGET', 'TARGET_ARCH', 'CC_PREFIX', 'EXTRA_CXXFLAGS']: +for name in ['TARGET', 'TARGET_ARCH', 'CC_PREFIX', 'EXTRA_CXXFLAGS', + 'EXTRA_CFLAGS']: value = os.environ.get('TENSORFLOW_%s' % name) if value: MAKE_CROSS_OPTIONS.append('%s=%s' % (name, value)) From 17c5a9bbd67f34b25c3b5ffaba2f72a54ec0a105 Mon Sep 17 00:00:00 2001 From: Kamil Rakoczy Date: Wed, 1 Apr 2020 10:09:02 +0200 Subject: [PATCH 1289/1533] Change std::max, std::min, std::round to tflite::TfLiteMax, tflite::TfLiteMin, tflite::TfLiteRound Signed-off-by: Kamil Rakoczy Signed-off-by: Karol Gugala --- tensorflow/lite/kernels/internal/BUILD | 6 +++- tensorflow/lite/kernels/internal/cppmath.h | 2 +- tensorflow/lite/kernels/internal/max.h | 35 +++++++++++++++++++ tensorflow/lite/kernels/internal/min.h | 35 +++++++++++++++++++ .../lite/kernels/internal/reference/reduce.h | 6 ++-- .../reference/resize_nearest_neighbor.h | 3 +- .../lite/micro/kernels/activation_utils.h | 8 +++-- 7 files changed, 87 insertions(+), 8 deletions(-) create mode 100644 tensorflow/lite/kernels/internal/max.h create mode 100644 tensorflow/lite/kernels/internal/min.h diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index d6a96efdbf7..6ef54d971df 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -362,7 +362,11 @@ cc_test( cc_library( name = "cppmath", srcs = [], - hdrs = ["cppmath.h"], + hdrs = [ + "cppmath.h", + "max.h", + "min.h", + ], build_for_embedded = True, copts = tflite_copts(), ) diff --git a/tensorflow/lite/kernels/internal/cppmath.h b/tensorflow/lite/kernels/internal/cppmath.h index 611a8d2588a..205ba189d71 100644 --- a/tensorflow/lite/kernels/internal/cppmath.h +++ b/tensorflow/lite/kernels/internal/cppmath.h @@ -20,7 +20,7 @@ limitations under the License. namespace tflite { #if defined(TF_LITE_USE_GLOBAL_CMATH_FUNCTIONS) || \ - (defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) + (defined(__ANDROID__) && !defined(__NDK_MAJOR__)) || defined(ARDUINO) || defined(__ZEPHYR__) #define TF_LITE_GLOBAL_STD_PREFIX #else #define TF_LITE_GLOBAL_STD_PREFIX std diff --git a/tensorflow/lite/kernels/internal/max.h b/tensorflow/lite/kernels/internal/max.h new file mode 100644 index 00000000000..c18100272db --- /dev/null +++ b/tensorflow/lite/kernels/internal/max.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_ + +#include + +namespace tflite { + +#if defined(TF_LITE_USE_GLOBAL_MAX) || defined(__ZEPHYR__) +inline float TfLiteMax(const float& x, const float& y) { + return std::max(x, y); +} +#else +template +inline T TfLiteMax(const T& x, const T& y) { + return std::fmax(x, y); +} +#endif + +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_MAX_H_ diff --git a/tensorflow/lite/kernels/internal/min.h b/tensorflow/lite/kernels/internal/min.h new file mode 100644 index 00000000000..62035dccd89 --- /dev/null +++ b/tensorflow/lite/kernels/internal/min.h @@ -0,0 +1,35 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_ + +#include + +namespace tflite { + +#if defined(TF_LITE_USE_GLOBAL_MIN) || defined(__ZEPHYR__) +inline float TfLiteMin(const float& x, const float& y) { + return std::min(x, y); +} +#else +template +inline T TfLiteMin(const T& x, const T& y) { + return std::fmin(x, y); +} +#endif + +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_MIN_H_ diff --git a/tensorflow/lite/kernels/internal/reference/reduce.h b/tensorflow/lite/kernels/internal/reference/reduce.h index 17dfd8557ae..95de81668d3 100644 --- a/tensorflow/lite/kernels/internal/reference/reduce.h +++ b/tensorflow/lite/kernels/internal/reference/reduce.h @@ -20,6 +20,8 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/cppmath.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/internal/min.h" +#include "tensorflow/lite/kernels/internal/max.h" namespace tflite { @@ -382,10 +384,10 @@ inline bool QuantizedMeanOrSum(const T* input_data, int32 input_zero_point, float float_mean = static_cast(temp_sum[idx]) / static_cast(num_elements_in_axis); float result = - std::min(TfLiteRound(float_mean * scale + bias) + output_zero_point, + TfLiteMin(TfLiteRound(float_mean * scale + bias) + output_zero_point, static_cast(std::numeric_limits::max())); result = - std::max(result, static_cast(std::numeric_limits::min())); + TfLiteMax(result, static_cast(std::numeric_limits::min())); output_data[idx] = static_cast(result); } } diff --git a/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h b/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h index ed87863a7e5..1c71ed37c71 100644 --- a/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h +++ b/tensorflow/lite/kernels/internal/reference/resize_nearest_neighbor.h @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/internal/cppmath.h" namespace tflite { @@ -34,7 +35,7 @@ inline int32 GetNearestNeighbor(const int input_value, const int32 input_size, const float offset = half_pixel_centers ? 0.5f : 0.0f; int32 output_value = std::min( align_corners - ? static_cast(std::round((input_value + offset) * scale)) + ? static_cast(TfLiteRound((input_value + offset) * scale)) : static_cast(std::floor((input_value + offset) * scale)), input_size - 1); if (half_pixel_centers) { diff --git a/tensorflow/lite/micro/kernels/activation_utils.h b/tensorflow/lite/micro/kernels/activation_utils.h index 7525bc93b0a..a71826211c0 100644 --- a/tensorflow/lite/micro/kernels/activation_utils.h +++ b/tensorflow/lite/micro/kernels/activation_utils.h @@ -21,6 +21,8 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/kernels/internal/cppmath.h" +#include "tensorflow/lite/kernels/internal/max.h" +#include "tensorflow/lite/kernels/internal/min.h" namespace tflite { namespace ops { @@ -32,11 +34,11 @@ inline float ActivationValFloat(TfLiteFusedActivation act, float a) { case kTfLiteActNone: return a; case kTfLiteActRelu: - return std::max(0.0f, a); + return TfLiteMax(0.0f, a); case kTfLiteActRelu1: - return std::max(-1.0f, std::min(a, 1.0f)); + return TfLiteMax(-1.0f, TfLiteMin(a, 1.0f)); case kTfLiteActRelu6: - return std::max(0.0f, std::min(a, 6.0f)); + return TfLiteMax(0.0f, TfLiteMin(a, 6.0f)); case kTfLiteActTanh: return std::tanh(a); case kTfLiteActSignBit: From c15dfabf7971685f2875e4a742dcf40b36fdf5a4 Mon Sep 17 00:00:00 2001 From: Kamil Rakoczy Date: Wed, 1 Apr 2020 10:12:23 +0200 Subject: [PATCH 1290/1533] Add zephyr_riscv hello_world and magic_wand examples Signed-off-by: Kamil Rakoczy Signed-off-by: Karol Gugala --- .../hello_world/zephyr_riscv/Makefile.inc | 28 ++++++ .../hello_world/zephyr_riscv/prj.conf | 2 + .../hello_world/zephyr_riscv/src/assert.cc | 7 ++ .../magic_wand/zephyr_riscv/Makefile.inc | 31 ++++++ .../boards/litex_vexriscv.overlay | 23 +++++ .../examples/magic_wand/zephyr_riscv/prj.conf | 8 ++ .../zephyr_riscv/src/accelerometer_handler.cc | 97 +++++++++++++++++++ .../zephyr_riscv/src/accelerometer_handler.h | 29 ++++++ .../magic_wand/zephyr_riscv/src/assert.cc | 7 ++ .../micro/tools/make/download_and_extract.sh | 13 +++ .../micro/tools/make/helper_functions.inc | 8 ++ .../make/targets/litex_vexriscv_makefile.inc | 6 ++ .../templates/zephyr_cmake_project.cmake.tpl | 18 ++++ .../tools/make/third_party_downloads.inc | 3 + 14 files changed, 280 insertions(+) create mode 100644 tensorflow/lite/micro/examples/hello_world/zephyr_riscv/Makefile.inc create mode 100644 tensorflow/lite/micro/examples/hello_world/zephyr_riscv/prj.conf create mode 100644 tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc create mode 100644 tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/Makefile.inc create mode 100644 tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/boards/litex_vexriscv.overlay create mode 100644 tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/prj.conf create mode 100644 tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc create mode 100644 tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.h create mode 100644 tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc create mode 100644 tensorflow/lite/micro/tools/make/targets/litex_vexriscv_makefile.inc create mode 100644 tensorflow/lite/micro/tools/make/templates/zephyr_cmake_project.cmake.tpl diff --git a/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/Makefile.inc b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/Makefile.inc new file mode 100644 index 00000000000..292adbba732 --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/Makefile.inc @@ -0,0 +1,28 @@ +ifeq ($(TARGET), zephyr_vexriscv) + export ZEPHYR_TOOLCHAIN_VARIANT?=zephyr + export TOOLCHAIN_BASE=${ZEPHYR_SDK_INSTALL_DIR}/riscv64-zephyr-elf/riscv64-zephyr-elf + export TOOLCHAIN_VERSION=9.2.0 + PROJECT_INCLUDES += ${CURDIR} ${TOOLCHAIN_BASE}/include/c++/${TOOLCHAIN_VERSION} ${TOOLCHAIN_BASE}/include ${TOOLCHAIN_BASE}/include/c++/${TOOLCHAIN_VERSION}/riscv64-zephyr-elf/rv32i/ilp32 + ZEPHYR_HELLO_WORLD_SRCS = \ 
+tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc \ +tensorflow/lite/micro/examples/hello_world/main.cc \ +tensorflow/lite/micro/examples/hello_world/main_functions.cc \ +tensorflow/lite/micro/examples/hello_world/constants.cc \ +tensorflow/lite/micro/examples/hello_world/output_handler.cc \ +tensorflow/lite/micro/examples/hello_world/model.cc \ +prj.conf + +$(eval $(call generate_project,cmake,zephyr_cmake_project.cmake,hello_world,$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(ZEPHYR_HELLO_WORLD_SRCS) $(MICROLITE_CC_HDRS) $(THIRD_PARTY_CC_HDRS) $(HELLO_WORLD_HDRS),,$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS),$(CCFLAGS),)) + +$(PRJDIR)hello_world/cmake/CMakeLists.txt: $(PRJDIR)hello_world/cmake/zephyr_cmake_project.cmake + @sed -E 's#\%\{INCLUDE_DIRS\}\%#$(PROJECT_INCLUDES)#g' $< > $@ + +#We are skipping here copy of `zephyr` third_party repository +#To compile standalone project ZEPHYR_BASE enviroment variable should be set +hello_world_bin: generate_hello_world_cmake_project $(PRJDIR)hello_world/cmake/CMakeLists.txt + ( \ + . ${ZEPHYR_BASE}/venv-zephyr/bin/activate; \ + cmake -B${GENDIR}hello_world/build -DBOARD="litex_vexriscv" -H${PRJDIR}hello_world/cmake/ -DPython_ROOT_DIR=${ZEPHYR_BASE}/venv-zephyr/bin/; \ + make -C ${GENDIR}hello_world/build; \ + ) +endif diff --git a/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/prj.conf b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/prj.conf new file mode 100644 index 00000000000..7f34b46a0e9 --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/prj.conf @@ -0,0 +1,2 @@ +CONFIG_CPLUSPLUS=y +CONFIG_NEWLIB_LIBC=y diff --git a/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc new file mode 100644 index 00000000000..595b05c0e79 --- /dev/null +++ b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc @@ -0,0 +1,7 @@ +extern "C" { + + void __assert_func (const char *, int, const char *, const char *) { + } + +} + diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/Makefile.inc b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/Makefile.inc new file mode 100644 index 00000000000..e257e6620db --- /dev/null +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/Makefile.inc @@ -0,0 +1,31 @@ +ifeq ($(TARGET), zephyr_vexriscv) + export ZEPHYR_TOOLCHAIN_VARIANT?=zephyr + export TOOLCHAIN_BASE=${ZEPHYR_SDK_INSTALL_DIR}/riscv64-zephyr-elf/riscv64-zephyr-elf + export TOOLCHAIN_VERSION=9.2.0 + PROJECT_INCLUDES += ${CURDIR} ${TOOLCHAIN_BASE}/include/c++/${TOOLCHAIN_VERSION} ${TOOLCHAIN_BASE}/include ${TOOLCHAIN_BASE}/include/c++/${TOOLCHAIN_VERSION}/riscv64-zephyr-elf/rv32i/ilp32 + ZEPHYR_MAGIC_WAND_SRCS = \ +tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc \ +tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc \ +tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.h \ +tensorflow/lite/micro/examples/magic_wand/main.cc \ +tensorflow/lite/micro/examples/magic_wand/main_functions.cc \ +tensorflow/lite/micro/examples/magic_wand/magic_wand_model_data.cc \ +tensorflow/lite/micro/examples/magic_wand/gesture_predictor.cc \ +tensorflow/lite/micro/examples/magic_wand/output_handler.cc \ +boards/litex_vexriscv.overlay \ +prj.conf + +$(eval $(call generate_project,cmake,zephyr_cmake_project.cmake,magic_wand,$(MICROLITE_CC_SRCS) $(THIRD_PARTY_CC_SRCS) $(ZEPHYR_MAGIC_WAND_SRCS) $(MICROLITE_CC_HDRS) 
$(THIRD_PARTY_CC_HDRS) $(magic_wand_HDRS),,$(LDFLAGS) $(MICROLITE_LIBS),$(CXXFLAGS),$(CCFLAGS),)) + +$(PRJDIR)magic_wand/cmake/CMakeLists.txt: $(PRJDIR)magic_wand/cmake/zephyr_cmake_project.cmake + @sed -E 's#\%\{INCLUDE_DIRS\}\%#$(PROJECT_INCLUDES)#g' $< > $@ + +#We are skipping here copy of `zephyr` third_party repository +#To compile standalone project ZEPHYR_BASE enviroment variable should be set +magic_wand_bin: generate_magic_wand_cmake_project $(PRJDIR)magic_wand/cmake/CMakeLists.txt + ( \ + . ${ZEPHYR_BASE}/venv-zephyr/bin/activate; \ + cmake -B${GENDIR}magic_wand/build -DBOARD="litex_vexriscv" -H${PRJDIR}magic_wand/cmake/ -DPython_ROOT_DIR=${ZEPHYR_BASE}/venv-zephyr/bin/; \ + make -C ${GENDIR}magic_wand/build; \ + ) +endif diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/boards/litex_vexriscv.overlay b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/boards/litex_vexriscv.overlay new file mode 100644 index 00000000000..6f6ca9d6794 --- /dev/null +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/boards/litex_vexriscv.overlay @@ -0,0 +1,23 @@ +&i2c0 { + label = "I2C0"; + reg = <0xe0003000 0x4 0xe0003004 0x4>; + + adxl@1d { + compatible = "adi,adxl345"; + label = "accel-0"; + reg = <0x1d>; + }; + +}; + +&pwm0 { + status = "disabled"; +}; + +ð0 { + status = "disabled"; +}; + +&prbs0 { + status = "disabled"; +}; diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/prj.conf b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/prj.conf new file mode 100644 index 00000000000..ec75dbabd8e --- /dev/null +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/prj.conf @@ -0,0 +1,8 @@ +CONFIG_CPLUSPLUS=y +CONFIG_NEWLIB_LIBC=y +CONFIG_SENSOR=y +CONFIG_ADXL345=y +CONFIG_PWM=n +CONFIG_PWM_LITEX=n +CONFIG_NETWORKING=n +CONFIG_MAIN_STACK_SIZE=4096 diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc new file mode 100644 index 00000000000..70d036de93b --- /dev/null +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc @@ -0,0 +1,97 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/micro/examples/magic_wand/accelerometer_handler.h" +#include +#include +#include +#include +#include + +#define BUFLEN 300 +int begin_index = 0; +struct device *sensor = NULL; +int current_index = 0; + +float bufx[BUFLEN] = {0.0f}; +float bufy[BUFLEN] = {0.0f}; +float bufz[BUFLEN] = {0.0f}; + +bool initial = true; + +TfLiteStatus SetupAccelerometer(tflite::ErrorReporter* error_reporter) { + sensor = device_get_binding(DT_INST_0_ADI_ADXL345_LABEL); + if(sensor == NULL) { + error_reporter->Report("Failed to get accelerometer, label: %s\n", DT_INST_0_ADI_ADXL345_LABEL); + } else { + error_reporter->Report("Got accelerometer, label: %s\n", DT_INST_0_ADI_ADXL345_LABEL); + } + return kTfLiteOk; +} + +bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, float* input, + int length) { + int rc; + struct sensor_value accel[3]; + int samples_count; + + rc = sensor_sample_fetch(sensor); + if(rc < 0) { + error_reporter -> Report("Fetch failed\n"); + return false; + } + //skip if there is no data + if(!rc) { + return false; + } + + samples_count = rc; + for(int i = 0; i < samples_count; i++) { + rc = sensor_channel_get(sensor, + SENSOR_CHAN_ACCEL_XYZ, + accel); + if (rc < 0) { + error_reporter->Report("ERROR: Update failed: %d\n", rc); + return false; + } + bufx[begin_index] = (float)sensor_value_to_double(&accel[0]); + bufy[begin_index] = (float)sensor_value_to_double(&accel[1]); + bufz[begin_index] = (float)sensor_value_to_double(&accel[2]); + begin_index++; + if (begin_index >= BUFLEN) begin_index = 0; + + } + + if(initial && begin_index >= 100) { + initial = false; + } + + if (initial) { + return false; + } + + int sample = 0; + for (int i = 0; i < (length - 3); i+=3) { + int ring_index = begin_index + sample - length/3; + if(ring_index < 0) { + ring_index += BUFLEN; + } + input[i] = bufx[ring_index]; + input[i+1] = bufy[ring_index]; + input[i+2] = bufz[ring_index]; + sample++; + } + return true; +} diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.h b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.h new file mode 100644 index 00000000000..5b3fb54a4e4 --- /dev/null +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.h @@ -0,0 +1,29 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_EXAMPLES_MAGIC_WAND_ACCELEROMETER_HANDLER_H_ +#define TENSORFLOW_LITE_MICRO_EXAMPLES_MAGIC_WAND_ACCELEROMETER_HANDLER_H_ + +#define kChannelNumber 3 + +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/micro/micro_error_reporter.h" + +extern int begin_index; +extern TfLiteStatus SetupAccelerometer(tflite::ErrorReporter* error_reporter); +extern bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, + float* input, int length, bool reset_buffer); + +#endif // TENSORFLOW_LITE_MICRO_EXAMPLES_MAGIC_WAND_ACCELEROMETER_HANDLER_H_ diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc new file mode 100644 index 00000000000..595b05c0e79 --- /dev/null +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc @@ -0,0 +1,7 @@ +extern "C" { + + void __assert_func (const char *, int, const char *, const char *) { + } + +} + diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 2f602ce9d4c..c82b247ef8c 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -93,6 +93,17 @@ build_embarc_mli() { make -j 4 -C ${1}/lib/make TCF_FILE=${2} } +setup_zephyr() { + command -v virtualenv >/dev/null 2>&1 || { + echo >&2 "The required 'virtualenv' tool isn't installed. Try 'pip install virtualenv'."; exit 1; + } + virtualenv -p python3 ${1}/venv-zephyr + . ${1}/venv-zephyr/bin/activate + python ${1}/venv-zephyr/bin/pip install -r ${1}/scripts/requirements.txt + west init -m https://github.com/zephyrproject-rtos/zephyr.git + deactivate +} + # Main function handling the download, verify, extract, and patch process. 
download_and_extract() { local usage="Usage: download_and_extract URL MD5 DIR [ACTION] [ACTION_PARAM]" @@ -179,6 +190,8 @@ download_and_extract() { else build_embarc_mli ${dir} ${action_param1} fi + elif [[ ${action} == "setup_zephyr" ]]; then + setup_zephyr ${dir} elif [[ ${action} ]]; then echo "Unknown action '${action}'" exit 1 diff --git a/tensorflow/lite/micro/tools/make/helper_functions.inc b/tensorflow/lite/micro/tools/make/helper_functions.inc index 1cf9afa8794..d3403a81ef0 100644 --- a/tensorflow/lite/micro/tools/make/helper_functions.inc +++ b/tensorflow/lite/micro/tools/make/helper_functions.inc @@ -74,6 +74,14 @@ $(PRJDIR)$(3)/$(1)/%: % third_party_downloads @mkdir -p $$(dir $$@) @cp $$< $$@ +$(PRJDIR)$(3)/cmake/boards/%: tensorflow/lite/micro/examples/$(3)/zephyr_riscv/boards/% + @mkdir -p $$(dir $$@) + @cp $$< $$@ + +$(PRJDIR)$(3)/cmake/%: tensorflow/lite/micro/examples/$(3)/zephyr_riscv/% + @mkdir -p $$(dir $$@) + @cp $$< $$@ + $(PRJDIR)$(3)/$(1)/third_party/%: tensorflow/lite/micro/tools/make/downloads/% third_party_downloads @mkdir -p $$(dir $$@) @cp $$< $$@ diff --git a/tensorflow/lite/micro/tools/make/targets/litex_vexriscv_makefile.inc b/tensorflow/lite/micro/tools/make/targets/litex_vexriscv_makefile.inc new file mode 100644 index 00000000000..c9764ed2f26 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/litex_vexriscv_makefile.inc @@ -0,0 +1,6 @@ +ifeq ($(TARGET), zephyr_vexriscv) + $(eval $(call add_third_party_download,$(ZEPHYR_URL),$(ZEPHYR_MD5),zephyr,setup_zephyr)) + export ZEPHYR_SDK_INSTALL_DIR?=/opt/zephyr-sdk + export ZEPHYR_BASE?=$(realpath $(MAKEFILE_DIR)/downloads/zephyr) +endif + diff --git a/tensorflow/lite/micro/tools/make/templates/zephyr_cmake_project.cmake.tpl b/tensorflow/lite/micro/tools/make/templates/zephyr_cmake_project.cmake.tpl new file mode 100644 index 00000000000..d7bb4511f32 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/templates/zephyr_cmake_project.cmake.tpl @@ -0,0 +1,18 @@ +cmake_minimum_required(VERSION 3.13.1) +include($ENV{ZEPHYR_BASE}/cmake/app/boilerplate.cmake NO_POLICY_SCOPE) +project(tf_lite_magic_wand) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} %{CXX_FLAGS}%") +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} %{CC_FLAGS}%") +set(CMAKE_EXE_LINKER_FLAGS "%{LINKER_FLAGS}%") + +# -fno-threadsafe-statics -- disables the mutex around initialization of local static variables +target_compile_options(app PRIVATE "-fno-threadsafe-statics") + +target_sources(app PRIVATE + %{SRCS}% + ) + +target_include_directories(app PRIVATE + %{INCLUDE_DIRS}% + ) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 806501a004a..b7ba8c465a8 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -77,6 +77,9 @@ EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef" EMBARC_MLI_PRE_COMPILED_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/releases/download/Release_1.1_RC2/embARC_MLI_package.zip" EMBARC_MLI_PRE_COMPILED_MD5 := "a95ff9e0370434484f14e7e4114327f6" +ZEPHYR_URL := "https://github.com/antmicro/zephyr/archive/55e36b9.zip" +ZEPHYR_MD5 := "755622eb4812fde918a6382b65d50c3b" + XTENSA_HIFI4_URL :="https://github.com/foss-xtensa/nnlib-hifi4/raw/master/archive/xa_nnlib_04_07.zip" XTENSA_HIFI4_MD5 :="f234764928f9a42901df33a27e118c8b" From 7ae64faf2c8cffed288657f23242b577197780ec Mon Sep 17 00:00:00 2001 From: Karol Gugala Date: Thu, 7 May 2020 11:03:24 +0200 
Subject: [PATCH 1291/1533] lite: magic_wand: zephyr: report errors with macro Signed-off-by: Karol Gugala --- .../magic_wand/zephyr_riscv/src/accelerometer_handler.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc index 70d036de93b..cebac640b48 100644 --- a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/accelerometer_handler.cc @@ -34,9 +34,9 @@ bool initial = true; TfLiteStatus SetupAccelerometer(tflite::ErrorReporter* error_reporter) { sensor = device_get_binding(DT_INST_0_ADI_ADXL345_LABEL); if(sensor == NULL) { - error_reporter->Report("Failed to get accelerometer, label: %s\n", DT_INST_0_ADI_ADXL345_LABEL); + TF_LITE_REPORT_ERROR(error_reporter, "Failed to get accelerometer, label: %s\n", DT_INST_0_ADI_ADXL345_LABEL); } else { - error_reporter->Report("Got accelerometer, label: %s\n", DT_INST_0_ADI_ADXL345_LABEL); + TF_LITE_REPORT_ERROR(error_reporter, "Got accelerometer, label: %s\n", DT_INST_0_ADI_ADXL345_LABEL); } return kTfLiteOk; } @@ -49,7 +49,7 @@ bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, float* input, rc = sensor_sample_fetch(sensor); if(rc < 0) { - error_reporter -> Report("Fetch failed\n"); + TF_LITE_REPORT_ERROR(error_reporter, "Fetch failed\n"); return false; } //skip if there is no data @@ -63,7 +63,7 @@ bool ReadAccelerometer(tflite::ErrorReporter* error_reporter, float* input, SENSOR_CHAN_ACCEL_XYZ, accel); if (rc < 0) { - error_reporter->Report("ERROR: Update failed: %d\n", rc); + TF_LITE_REPORT_ERROR(error_reporter, "ERROR: Update failed: %d\n", rc); return false; } bufx[begin_index] = (float)sensor_value_to_double(&accel[0]); From 9e9bf48904489d880f58d2b9e398fba49c444491 Mon Sep 17 00:00:00 2001 From: Kamil Rakoczy Date: Thu, 14 May 2020 12:50:54 +0200 Subject: [PATCH 1292/1533] Add missing copyright headers Signed-off-by: Kamil Rakoczy --- .../examples/hello_world/zephyr_riscv/prj.conf | 14 ++++++++++++++ .../hello_world/zephyr_riscv/src/assert.cc | 15 +++++++++++++++ .../zephyr_riscv/boards/litex_vexriscv.overlay | 15 +++++++++++++++ .../examples/magic_wand/zephyr_riscv/prj.conf | 14 ++++++++++++++ .../magic_wand/zephyr_riscv/src/assert.cc | 15 +++++++++++++++ 5 files changed, 73 insertions(+) diff --git a/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/prj.conf b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/prj.conf index 7f34b46a0e9..e36145c332a 100644 --- a/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/prj.conf +++ b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/prj.conf @@ -1,2 +1,16 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== CONFIG_CPLUSPLUS=y CONFIG_NEWLIB_LIBC=y diff --git a/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc index 595b05c0e79..e99eb409125 100644 --- a/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc +++ b/tensorflow/lite/micro/examples/hello_world/zephyr_riscv/src/assert.cc @@ -1,3 +1,18 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + extern "C" { void __assert_func (const char *, int, const char *, const char *) { diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/boards/litex_vexriscv.overlay b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/boards/litex_vexriscv.overlay index 6f6ca9d6794..a75435b3ea6 100644 --- a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/boards/litex_vexriscv.overlay +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/boards/litex_vexriscv.overlay @@ -1,3 +1,18 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + &i2c0 { label = "I2C0"; reg = <0xe0003000 0x4 0xe0003004 0x4>; diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/prj.conf b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/prj.conf index ec75dbabd8e..e4152086d5f 100644 --- a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/prj.conf +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/prj.conf @@ -1,3 +1,17 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== CONFIG_CPLUSPLUS=y CONFIG_NEWLIB_LIBC=y CONFIG_SENSOR=y diff --git a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc index 595b05c0e79..e99eb409125 100644 --- a/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc +++ b/tensorflow/lite/micro/examples/magic_wand/zephyr_riscv/src/assert.cc @@ -1,3 +1,18 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + extern "C" { void __assert_func (const char *, int, const char *, const char *) { From 853c12e4b94b07f739398e94c9f7585c19569d01 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 02:02:19 -0700 Subject: [PATCH 1293/1533] Update GraphDef version to 415. PiperOrigin-RevId: 313548695 Change-Id: If7864c8b03693cfc90ccd6fece903eab76fde8e7 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 7131d1f7227..4f06edd4162 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 414 // Updated: 2020/5/27 +#define TF_GRAPH_DEF_VERSION 415 // Updated: 2020/5/28 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 2217251dfa78bf1c935452ddc682ef1cb873242a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 02:02:37 -0700 Subject: [PATCH 1294/1533] compat: Update forward compatibility horizon to 2020-05-28 PiperOrigin-RevId: 313548744 Change-Id: I813b94e1b38aa68544a50968a0088dd186675cec --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 53545c58a2d..d21f1755d94 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 27) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 28) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 35312cceb134c28c9fe1a53cb8d5c27f281c4054 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Thu, 28 May 2020 03:52:35 -0700 Subject: [PATCH 1295/1533] Remove workarounds for XLA's previous inf/nan behavior after it's been fixed. 
PiperOrigin-RevId: 313559788 Change-Id: I3d5fe3d7b7267d073ef45fe042503932d99b03cb --- tensorflow/compiler/tests/binary_ops_test.py | 5 ----- tensorflow/compiler/tests/unary_ops_test.py | 20 +++++++++---------- tensorflow/python/BUILD | 2 -- .../python/kernel_tests/numerics_test.py | 6 ------ tensorflow/python/ops/nn_test.py | 2 ++ 5 files changed, 11 insertions(+), 24 deletions(-) diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index c7be2c55de7..422695c374b 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function import itertools -import os import numpy as np @@ -1609,8 +1608,4 @@ class BinaryOpsTest(xla_test.XLATestCase): if __name__ == "__main__": - # TODO(b/130689556): XLA CPU does not honor inf/nan which causes problems - os.environ[ - "XLA_FLAGS"] = "--xla_cpu_enable_fast_math=false " + os.environ.get( - "XLA_FLAGS", "") googletest.main() diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index d0e928a5ce6..85bf89c4f9e 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -347,17 +347,15 @@ class UnaryOpsTest(xla_test.XLATestCase): expected=np.array( [1.55740772, -2.18503986, -0.14254654, 1.15782128], dtype=dtype)) - # TODO(b/130689556): Turn this on for CPU when we start honoring NaNs. - if self.device != "XLA_CPU": - self._assertOpOutputMatchesExpected( - math_ops.tanh, - np.array([[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], - [19, -19, 22, -22]], - dtype=dtype), - expected=np.array( - [[0.76159418, 0.96402758, 0.99505478, 0.99932933], - [1.0, -1.0, np.nan, 1.0], [1.0, -1.0, 1.0, -1.0]], - dtype=dtype)) + self._assertOpOutputMatchesExpected( + math_ops.tanh, + np.array([[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], + [19, -19, 22, -22]], + dtype=dtype), + expected=np.array( + [[0.76159418, 0.96402758, 0.99505478, 0.99932933], + [1.0, -1.0, np.nan, 1.0], [1.0, -1.0, 1.0, -1.0]], + dtype=dtype)) self._assertOpOutputMatchesExpected( nn_ops.log_softmax, diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 13c58c74583..2fb22a89706 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -5122,8 +5122,6 @@ cuda_py_test( srcs = ["ops/nn_test.py"], python_version = "PY3", tags = ["no_windows"], - # TODO(b/130689556): Numerical differences due to fast math on CPU. 
- xla_enable_strict_auto_jit = False, deps = [ ":array_ops", ":client_testlib", diff --git a/tensorflow/python/kernel_tests/numerics_test.py b/tensorflow/python/kernel_tests/numerics_test.py index 475badb6efe..eadb8ceff07 100644 --- a/tensorflow/python/kernel_tests/numerics_test.py +++ b/tensorflow/python/kernel_tests/numerics_test.py @@ -18,8 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os - import numpy as np from tensorflow.python.framework import constant_op @@ -133,8 +131,4 @@ class NumericsTest(test.TestCase): if __name__ == "__main__": - # TODO(b/130689556): XLA CPU does not honor inf/nan which causes problems - os.environ[ - "XLA_FLAGS"] = "--xla_cpu_enable_fast_math=false " + os.environ.get( - "XLA_FLAGS", "") test.main() diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 0088c04f909..477e0528c0d 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -1207,6 +1207,7 @@ class DataFormatVectorPermuteTest(test_lib.TestCase): y_val = self.evaluate(y) self.assertAllEqual(y_val, [4, 9]) + @test_util.disable_xla("unsupported data format") def testNHWCToWHCN(self): x_val = [7, 4, 9, 3] x = constant_op.constant(x_val) @@ -1215,6 +1216,7 @@ class DataFormatVectorPermuteTest(test_lib.TestCase): y_val = self.evaluate(y) self.assertAllEqual(y_val, [9, 4, 3, 7]) + @test_util.disable_xla("unsupported data format") def testNHWCToWHCN_Size2(self): x_val = [4, 9] x = constant_op.constant(x_val) From 9568c0ce9170337a33f0e06afcecba88a5f6d41f Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Thu, 28 May 2020 04:17:04 -0700 Subject: [PATCH 1296/1533] Stop explicitly passing a `name` arg to `slice` from `slice_helper`. (while leaving in the name_scopes). This makes the slice operator more dispatch-friendly, because `slice` dispatches while `slice_helper` doesn't. This means the `dispatched` operation no longer has a name locked in and is less likely to run into issues where multiple ops are created w/ the same name. 
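
As a minimal sketch of the pattern described above (illustrative only, not the actual TensorFlow source; the helper names here are hypothetical), the change amounts to keeping the name scope while no longer pinning the op name:

    import tensorflow as tf

    def slice_helper_before(tensor, begin, end, strides):
      with tf.name_scope("strided_slice") as name:
        # Forwarding `name` pins the op name; if dispatch re-enters this path
        # for a composite tensor, the same name can be requested again.
        return tf.strided_slice(tensor, begin, end, strides, name=name)

    def slice_helper_after(tensor, begin, end, strides):
      with tf.name_scope("strided_slice"):
        # The scope still prefixes the op name, but the dispatch-aware
        # `tf.strided_slice` now derives a unique name on its own.
        return tf.strided_slice(tensor, begin, end, strides)
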
PiperOrigin-RevId: 313562067 Change-Id: Ia55e850ed1a86d8c58f1db58357b4cc123f49714 --- tensorflow/python/ops/array_ops.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index a641633b1f5..118c2cfca55 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -983,7 +983,7 @@ def _slice_helper(tensor, slice_spec, var=None): with ops.name_scope( None, "strided_slice", [tensor] + begin + end + strides, - skip_on_eager=False) as name: + skip_on_eager=False): if begin: packed_begin, packed_end, packed_strides = (stack(begin), stack(end), stack(strides)) @@ -1009,8 +1009,7 @@ def _slice_helper(tensor, slice_spec, var=None): shrink_axis_mask=shrink_axis_mask, new_axis_mask=new_axis_mask, ellipsis_mask=ellipsis_mask, - var=var, - name=name) + var=var) # pylint: disable=undefined-variable,protected-access,redefined-outer-name @@ -1194,7 +1193,7 @@ def strided_slice(input_, if var is None: raise ValueError("Sliced assignment is only supported for variables") else: - if name is None: + if name is None and parent_name: name = parent_name + "_assign" return var._strided_slice_assign( From a2e1334b92cb6489593e9c82840a0ac84bfad6c2 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 28 May 2020 04:26:43 -0700 Subject: [PATCH 1297/1533] Bump open source llvm revision to cf86a234ba86acf0bb875e21d27833be36e08be4 PiperOrigin-RevId: 313563029 Change-Id: I3e915d75402b0fb38ef8f732274e50592ece0c3e --- tensorflow/workspace.bzl | 4 ++-- third_party/llvm/llvm.autogenerated.BUILD | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 217edee0f86..db87f9a730d 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "1108f5c737dbdab0277874a7e5b237491839c43a" - LLVM_SHA256 = "bbdaaa145a5a8eed8e6a0f06a3b9965f32b03286eddea5f50c5af2d1f3d008df" + LLVM_COMMIT = "cf86a234ba86acf0bb875e21d27833be36e08be4" + LLVM_SHA256 = "5375bdcdabd4886ab86eddfddef6e21dbc3cac9df67af7d3c44fadb527f74e25" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index 4c3d56c42a7..c16b62f635a 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -2744,6 +2744,27 @@ cc_library( ], ) +cc_library( + name = "ml_policies", + srcs = glob([ + "lib/Analysis/ML/*.c", + "lib/Analysis/ML/*.cpp", + "lib/Analysis/ML/*.inc", + "lib/Analysis/ML/*.h", + ]), + hdrs = glob([ + "include/llvm/Analysis/ML/*.h", + "include/llvm/Analysis/ML/*.def", + "include/llvm/Analysis/ML/*.inc", + ]), + copts = llvm_copts, + deps = [ + ":config", + ":core", + ":support", + ], +) + cc_library( name = "msp430_asm_parser", srcs = glob([ @@ -3227,6 +3248,7 @@ cc_library( ":inst_combine", ":instrumentation", ":ipo", + ":ml_policies", ":scalar", ":support", ":target", From 3c9dfef4696c821de6e02f3c676c0c356602894b Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Thu, 28 May 2020 04:32:13 -0700 Subject: [PATCH 1298/1533] Check shape of constant tensor for ADD GPU only handles 1x1x...xn dimiensions tensors. Do not handle random constants. PiperOrigin-RevId: 313563512 Change-Id: Ifee00ccc2138b4aa1067d476f8f73e5c8cc1e19a --- tensorflow/lite/delegates/gpu/common/model_builder.cc | 7 +++++++ .../lite/delegates/gpu/common/model_builder_helper.cc | 7 ++++++- .../lite/delegates/gpu/common/model_builder_helper.h | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 061c65095eb..29d9813379e 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -402,6 +402,13 @@ class AddOperationParser : public TFLiteOperationParser { return absl::UnimplementedError("ADD requires two input tensors."); } // TODO(eignasheva): Add shapes check. 
+ for (int i = 0; i < 2; i++) { + auto input = tflite::GetInput(context, tflite_node, i); + if (IsConstantTensor(input) && input->dims->size > 0) { + RETURN_IF_ERROR(CheckIfLinearConvertible(input->dims)); + } + } + TfLiteAddParams* tf_options = nullptr; return RetrieveBuiltinData(tflite_node, &tf_options); } diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc index 9a15f940fbd..a1705e6cf78 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc @@ -239,7 +239,7 @@ absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Scalar* shape) { return absl::OkStatus(); } -absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Linear* shape) { +absl::Status CheckIfLinearConvertible(const TfLiteIntArray* dimensions) { if (dimensions->size <= 0) { return absl::InvalidArgumentError("Dimension is empty."); } @@ -249,6 +249,11 @@ absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Linear* shape) { GetDimensionString(dimensions), " cannot be reduced to linear.")); } } + return absl::OkStatus(); +} + +absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Linear* shape) { + RETURN_IF_ERROR(CheckIfLinearConvertible(dimensions)); shape->v = dimensions->data[dimensions->size - 1]; return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h index 9caa5630037..6cbfcd9e7d6 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.h +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.h @@ -108,6 +108,8 @@ absl::Status CreateVectorCopyData(const TfLiteTensor& tensor, absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Scalar* shape); +absl::Status CheckIfLinearConvertible(const TfLiteIntArray* dimensions); + absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Linear* shape); absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, HWC* shape); From f0f84935e31bc7957d7311ceba53ba02271ad9f4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 05:26:54 -0700 Subject: [PATCH 1299/1533] Add quantized TANH operation, mostly copied from LOGISTIC. 
PiperOrigin-RevId: 313569148 Change-Id: Id7801e9afaa7cb10dc51234cba9bf4d9320a0dc5 --- tensorflow/lite/kernels/internal/BUILD | 2 + .../internal/reference/reference_ops.h | 54 +---- .../lite/kernels/internal/reference/tanh.h | 86 +++++++ tensorflow/lite/micro/kernels/BUILD | 13 ++ tensorflow/lite/micro/kernels/elementwise.cc | 17 -- tensorflow/lite/micro/kernels/tanh.cc | 128 ++++++++++ tensorflow/lite/micro/kernels/tanh_test.cc | 220 ++++++++++++++++++ tensorflow/lite/micro/tools/make/Makefile | 4 +- 8 files changed, 453 insertions(+), 71 deletions(-) create mode 100644 tensorflow/lite/kernels/internal/reference/tanh.h create mode 100644 tensorflow/lite/micro/kernels/tanh.cc create mode 100644 tensorflow/lite/micro/kernels/tanh_test.cc diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index d6a96efdbf7..51b58f92de1 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -481,6 +481,7 @@ cc_library( "reference/strided_slice.h", "reference/sub.h", "reference/svdf.h", + "reference/tanh.h", ], build_for_embedded = True, copts = tflite_copts(), @@ -551,6 +552,7 @@ cc_library( "reference/softmax.h", "reference/strided_slice.h", "reference/sub.h", + "reference/tanh.h", ], copts = tflite_copts(), deps = [ diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index 1a6c6d0d80e..e991a21e3bd 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -59,6 +59,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/reference/softmax.h" #include "tensorflow/lite/kernels/internal/reference/strided_slice.h" #include "tensorflow/lite/kernels/internal/reference/sub.h" +#include "tensorflow/lite/kernels/internal/reference/tanh.h" #include "tensorflow/lite/kernels/internal/strided_slice_logic.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/internal/types.h" @@ -1343,59 +1344,6 @@ inline void LogSoftmax(const SoftmaxParams& params, } } -inline void Tanh(const RuntimeShape& input_shape, const float* input_data, - const RuntimeShape& output_shape, float* output_data) { - const int flat_size = MatchingFlatSize(input_shape, output_shape); - - for (int i = 0; i < flat_size; i++) { - float val = input_data[i]; - float result = std::tanh(val); - output_data[i] = result; - } -} - -// Convenience version that allows, for example, generated-code calls to be -// uniform between data types. -inline void Tanh(const TanhParams&, const RuntimeShape& input_shape, - const float* input_data, const RuntimeShape& output_shape, - float* output_data) { - // Drop params: not needed. - Tanh(input_shape, input_data, output_shape, output_data); -} - -inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, - const int16* input_data, const RuntimeShape& output_shape, - int16* output_data) { - const int input_left_shift = params.input_left_shift; - // Support for shifts is limited until we have a parameterized version of - // SaturatingRoundingMultiplyByPOT(). - TFLITE_DCHECK_GE(input_left_shift, 0); - TFLITE_DCHECK_LE(input_left_shift, 1); - - const int flat_size = MatchingFlatSize(input_shape, output_shape); - - // F0 uses 0 integer bits, range [-1, 1]. - // This is the return type of math functions such as tanh, logistic, - // whose range is in [-1, 1]. 
- using F0 = gemmlowp::FixedPoint; - // F3 uses 3 integer bits, range [-8, 8], the input range expected here. - using F3 = gemmlowp::FixedPoint; - - if (input_left_shift == 0) { - for (int i = 0; i < flat_size; i++) { - F3 input = F3::FromRaw(input_data[i]); - F0 output = gemmlowp::tanh(input); - output_data[i] = output.raw(); - } - } else { - for (int i = 0; i < flat_size; i++) { - F3 input = F3::FromRaw( - gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i])); - F0 output = gemmlowp::tanh(input); - output_data[i] = output.raw(); - } - } -} inline void Dequantize(const RuntimeShape& input_shape, const Eigen::half* input_data, diff --git a/tensorflow/lite/kernels/internal/reference/tanh.h b/tensorflow/lite/kernels/internal/reference/tanh.h new file mode 100644 index 00000000000..0f31d4ddeef --- /dev/null +++ b/tensorflow/lite/kernels/internal/reference/tanh.h @@ -0,0 +1,86 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_ + +#include + +#include "fixedpoint/fixedpoint.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/cppmath.h" +#include "tensorflow/lite/kernels/internal/types.h" +#include "tensorflow/lite/kernels/op_macros.h" + +namespace tflite { +namespace reference_ops { + +inline void Tanh(const RuntimeShape& input_shape, const float* input_data, + const RuntimeShape& output_shape, float* output_data) { + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + for (int i = 0; i < flat_size; i++) { + float val = input_data[i]; + float result = std::tanh(val); + output_data[i] = result; + } +} + +// Convenience version that allows, for example, generated-code calls to be +// uniform between data types. +inline void Tanh(const TanhParams&, const RuntimeShape& input_shape, + const float* input_data, const RuntimeShape& output_shape, + float* output_data) { + // Drop params: not needed. + Tanh(input_shape, input_data, output_shape, output_data); +} + +inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, + const int16* input_data, const RuntimeShape& output_shape, + int16* output_data) { + const int input_left_shift = params.input_left_shift; + // Support for shifts is limited until we have a parameterized version of + // SaturatingRoundingMultiplyByPOT(). + TFLITE_DCHECK_GE(input_left_shift, 0); + TFLITE_DCHECK_LE(input_left_shift, 1); + + const int flat_size = MatchingFlatSize(input_shape, output_shape); + + // F0 uses 0 integer bits, range [-1, 1]. + // This is the return type of math functions such as tanh, logistic, + // whose range is in [-1, 1]. + using F0 = gemmlowp::FixedPoint; + // F3 uses 3 integer bits, range [-8, 8], the input range expected here. 
+ using F3 = gemmlowp::FixedPoint; + + if (input_left_shift == 0) { + for (int i = 0; i < flat_size; i++) { + F3 input = F3::FromRaw(input_data[i]); + F0 output = gemmlowp::tanh(input); + output_data[i] = output.raw(); + } + } else { + for (int i = 0; i < flat_size; i++) { + F3 input = F3::FromRaw( + gemmlowp::SaturatingRoundingMultiplyByPOT<1>(input_data[i])); + F0 output = gemmlowp::tanh(input); + output_data[i] = output.raw(); + } + } +} + +} // namespace reference_ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_TANH_H_ diff --git a/tensorflow/lite/micro/kernels/BUILD b/tensorflow/lite/micro/kernels/BUILD index b6c6054d604..bbb5c67d9e5 100644 --- a/tensorflow/lite/micro/kernels/BUILD +++ b/tensorflow/lite/micro/kernels/BUILD @@ -51,6 +51,7 @@ cc_library( "split.cc", "strided_slice.cc", "sub.cc", + "tanh.cc", "unpack.cc", ] + select({ "//conditions:default": [ @@ -153,6 +154,7 @@ cc_library( "strided_slice.cc", "sub.cc", "svdf.cc", + "tanh.cc", "unpack.cc", ], hdrs = ["micro_ops.h"], @@ -656,3 +658,14 @@ tflite_micro_cc_test( "//tensorflow/lite/micro/testing:micro_test", ], ) + +tflite_micro_cc_test( + name = "tanh_test", + srcs = ["tanh_test.cc"], + deps = [ + ":all_ops_resolver", + "//tensorflow/lite/c:common", + "//tensorflow/lite/micro:micro_framework", + "//tensorflow/lite/micro/testing:micro_test", + ], +) diff --git a/tensorflow/lite/micro/kernels/elementwise.cc b/tensorflow/lite/micro/kernels/elementwise.cc index 93fc4ec0d88..b69d260a826 100644 --- a/tensorflow/lite/micro/kernels/elementwise.cc +++ b/tensorflow/lite/micro/kernels/elementwise.cc @@ -106,9 +106,6 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) { return EvalLogical(context, node, [](bool v) { return !v; }); } -TfLiteStatus TANHEval(TfLiteContext* context, TfLiteNode* node) { - return EvalNumeric(context, node, std::tanh); -} } // namespace } // namespace elementwise @@ -225,20 +222,6 @@ TfLiteRegistration* Register_LOGICAL_NOT() { return &r; } -TfLiteRegistration* Register_TANH() { - static TfLiteRegistration r = { - /*init=*/nullptr, - /*free=*/nullptr, - /*prepare=*/ - elementwise::GenericPrepare, - /*invoke=*/elementwise::TANHEval, - /*profiling_string=*/nullptr, - /*builtin_code=*/0, - /*custom_name=*/nullptr, - /*version=*/0}; - return &r; -} - } // namespace micro } // namespace ops } // namespace tflite diff --git a/tensorflow/lite/micro/kernels/tanh.cc b/tensorflow/lite/micro/kernels/tanh.cc new file mode 100644 index 00000000000..9ee5b74bde4 --- /dev/null +++ b/tensorflow/lite/micro/kernels/tanh.cc @@ -0,0 +1,128 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h" + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/reference/tanh.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/op_macros.h" + +namespace tflite { +namespace ops { +namespace micro { +namespace activations { +namespace { +constexpr int kInputTensor = 0; +constexpr int kOutputTensor = 0; + +struct OpData { + int32_t input_zero_point; + int32_t input_range_radius; + int32_t input_multiplier; + int input_left_shift; +}; + +TfLiteStatus CalculateArithmeticOpData(TfLiteContext* context, TfLiteNode* node, + OpData* data) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + + TF_LITE_ENSURE_EQ(context, input->type, output->type); + if (input->type == kTfLiteInt8) { + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + + // The number if input integer bits is set to be consistent with the + // required value in reference_integer_ops::Tanh + static constexpr int kInputIntegerBits = 4; + const double input_real_multiplier = + static_cast(input->params.scale) * + static_cast(1 << (31 - kInputIntegerBits)); + + const double q = std::frexp(input_real_multiplier, &data->input_left_shift); + data->input_multiplier = static_cast(TfLiteRound(q * (1ll << 31))); + + data->input_range_radius = + CalculateInputRadius(kInputIntegerBits, data->input_left_shift, 31); + } + return kTfLiteOk; +} +} // namespace + +TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, kInputTensor); + TfLiteTensor* output = GetOutput(context, node, kOutputTensor); + OpData data; + CalculateArithmeticOpData(context, node, &data); + + if (input->type == kTfLiteFloat32) { + switch (output->type) { + case kTfLiteFloat32: { + reference_ops::Tanh(GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), + GetTensorData(output)); + return kTfLiteOk; + } + default: + TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); + return kTfLiteError; + } + } else if (input->type == kTfLiteInt8) { + switch (output->type) { + case kTfLiteInt8: { + reference_integer_ops::Tanh( + input->params.zero_point, data.input_range_radius, + data.input_multiplier, data.input_left_shift, + NumElements(input->dims), GetTensorData(input), + GetTensorData(output)); + return kTfLiteOk; + } + default: + TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); + return kTfLiteError; + } + } else { + TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); + return kTfLiteError; + } + return kTfLiteOk; +} + +} // namespace activations + +TfLiteRegistration* Register_TANH() { + static TfLiteRegistration r = {/*init=*/nullptr, + /*free=*/nullptr, + /*prepare=*/nullptr, + /*invoke=*/activations::TanhEval, + /*profiling_string=*/nullptr, + /*builtin_code=*/0, + /*custom_name=*/nullptr, + /*version=*/0}; + return &r; 
+} +} // namespace micro +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/micro/kernels/tanh_test.cc b/tensorflow/lite/micro/kernels/tanh_test.cc new file mode 100644 index 00000000000..2a367107771 --- /dev/null +++ b/tensorflow/lite/micro/kernels/tanh_test.cc @@ -0,0 +1,220 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/micro/testing/micro_test.h" +#include "tensorflow/lite/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +void TestTanhFloat(std::initializer_list input_dims_data, + std::initializer_list input_data, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, + float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_elements_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_TANH, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + const char* init_data = nullptr; + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = nullptr; + node.user_data = user_data; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_elements_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5f); + } +} + +void TestTanhInt8(std::initializer_list input_dims_data, + 
std::initializer_list input_data, float input_min, + float input_max, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, float output_min, + float output_max, int8_t* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_elements_count = ElementCount(*output_dims); + + constexpr int inputs_size = 1; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min, + input_max), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + + TfLiteContext context; + PopulateContext(tensors, tensors_size, micro_test::reporter, &context); + + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_TANH, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + const char* init_data = nullptr; + size_t init_data_size = 1; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, init_data, init_data_size); + } + int inputs_array_data[] = {1, 0}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 1}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = nullptr; + node.user_data = user_data; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_elements_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1); + } +} + +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(SimpleTestFloat) { + const int output_elements_count = 10; + float output_data[output_elements_count]; + tflite::testing::TestTanhFloat({2, 1, 5}, // Input shape. + { + 1.0, + 2.0, + 3.0, + 4.0, + 93.0, + -1.0, + -2.0, + -3.0, + -4.0, + -93.0, + }, + { + // Expected results. + 0.76159416, + 0.96402758, + 0.99505475, + 0.9993293, + 1.0, + -0.76159416, + -0.96402758, + -0.99505475, + -0.9993293, + -1.0, + }, + {2, 1, 5}, // Output shape. + output_data); +} + +TF_LITE_MICRO_TEST(SimpleTestInt8) { + using tflite::testing::F2QS; + + const float input_min = -31.75f; + const float input_max = 32.0f; + const float output_min = -1.0f; + const float output_max = (127.0f / 128.0f); + + const int output_elements_count = 10; + int8_t output_data[output_elements_count]; + tflite::testing::TestTanhInt8( + {2, 1, output_elements_count}, // Input shape. 
+ {F2QS(1.0, input_min, input_max), F2QS(2.0, input_min, input_max), + F2QS(3.0, input_min, input_max), F2QS(4.0, input_min, input_max), + F2QS(5.0, input_min, input_max), F2QS(-1.0, input_min, input_max), + F2QS(-2.0, input_min, input_max), F2QS(-3.0, input_min, input_max), + F2QS(-4.0, input_min, input_max), F2QS(-5.0, input_min, input_max)}, + input_min, input_max, // Input quantized range. + { // Expected results. + F2QS(0.76159416, output_min, output_max), + F2QS(0.96402758, output_min, output_max), + F2QS(0.99505475, output_min, output_max), + F2QS(0.9993293, output_min, output_max), + F2QS(0.9999092, output_min, output_max), + F2QS(-0.76159416, output_min, output_max), + F2QS(-0.96402758, output_min, output_max), + F2QS(-0.99505475, output_min, output_max), + F2QS(-0.9993293, output_min, output_max), + F2QS(-0.9999092, output_min, output_max)}, + {2, 1, output_elements_count}, // Output shape. + output_min, output_max, // Output quantized range. + output_data); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 13761cca28b..a94d643a3d0 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -164,6 +164,8 @@ tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h \ tensorflow/lite/kernels/internal/reference/integer_ops/logistic.h \ tensorflow/lite/kernels/internal/reference/integer_ops/l2normalization.h \ tensorflow/lite/kernels/internal/reference/integer_ops/mul.h \ +tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h \ +tensorflow/lite/kernels/internal/reference/integer_ops/tanh.h \ tensorflow/lite/kernels/internal/reference/l2normalization.h \ tensorflow/lite/kernels/internal/reference/maximum_minimum.h \ tensorflow/lite/kernels/internal/reference/mul.h \ @@ -181,7 +183,7 @@ tensorflow/lite/kernels/internal/reference/softmax.h \ tensorflow/lite/kernels/internal/reference/sub.h \ tensorflow/lite/kernels/internal/reference/logistic.h \ tensorflow/lite/kernels/internal/reference/strided_slice.h \ -tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h \ +tensorflow/lite/kernels/internal/reference/tanh.h \ tensorflow/lite/kernels/internal/cppmath.h \ tensorflow/lite/kernels/internal/strided_slice_logic.h \ tensorflow/lite/kernels/internal/tensor.h \ From 6688e1c23b0f01400a02153408294e37295c5e96 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Thu, 28 May 2020 06:14:12 -0700 Subject: [PATCH 1300/1533] 1. Add a new event type for general tflite runtime instrumentation, and allow a Profiler to choose which event type it will record. 2. An initial introduction of InterpreterDetailedStatus to detail the error code that a TFLite interpreter may have during runtime. 3. Apply the above two to instrument the overall invoke latency and status as an initial exemplar usage. 
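
To make the first point concrete: the event type values in profiler.h become powers of two, so a profiler implementation can use a simple bitmask to decide which event types it records. A minimal sketch of such a filtering profiler (an illustrative subclass written for this description, not code contained in this patch) might look like:

    #include <cstdint>
    #include <cstdio>

    #include "tensorflow/lite/core/api/profiler.h"

    // Records only the event types enabled in `event_type_mask`, relying on
    // the power-of-two EventType values introduced by this change.
    class FilteringProfiler : public tflite::Profiler {
     public:
      explicit FilteringProfiler(uint64_t event_type_mask)
          : event_type_mask_(event_type_mask) {}

      uint32_t BeginEvent(const char* tag, EventType event_type,
                          int64_t event_metadata1,
                          int64_t event_metadata2) override {
        if ((static_cast<uint64_t>(event_type) & event_type_mask_) == 0) {
          return 0;  // Event type filtered out: nothing to record.
        }
        std::printf("begin %s metadata=(%lld, %lld)\n", tag,
                    static_cast<long long>(event_metadata1),
                    static_cast<long long>(event_metadata2));
        return next_handle_++;
      }

      void EndEvent(uint32_t event_handle) override {
        if (event_handle != 0) std::printf("end handle=%u\n", event_handle);
      }

     private:
      uint64_t event_type_mask_;
      uint32_t next_handle_ = 1;
    };

    // Possible usage: keep only the new runtime-instrumentation events.
    // FilteringProfiler profiler(static_cast<uint64_t>(
    //     tflite::Profiler::EventType::GENERAL_RUNTIME_INSTRUMENTATION_EVENT));
    // interpreter->SetProfiler(&profiler);
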
PiperOrigin-RevId: 313573701 Change-Id: I8c4189c72066d7d6f4c91014ef4f30e32635c115 --- tensorflow/lite/BUILD | 1 + tensorflow/lite/core/api/profiler.h | 105 ++++++++++++++---- tensorflow/lite/core/subgraph.h | 33 +++--- tensorflow/lite/delegates/BUILD | 9 ++ tensorflow/lite/delegates/status.h | 83 ++++++++++++++ .../lite/examples/label_image/label_image.cc | 2 +- tensorflow/lite/interpreter.cc | 31 +++++- tensorflow/lite/interpreter.h | 9 +- tensorflow/lite/profiling/atrace_profiler.cc | 17 +-- tensorflow/lite/profiling/buffered_profiler.h | 36 ++++-- tensorflow/lite/profiling/noop_profiler.h | 2 +- tensorflow/lite/profiling/profile_buffer.h | 32 ++++-- .../lite/profiling/profile_buffer_test.cc | 32 +++++- .../lite/profiling/profile_summarizer.cc | 4 +- .../lite/profiling/profile_summarizer_test.cc | 12 +- tensorflow/lite/profiling/profiler_test.cc | 17 ++- 16 files changed, 335 insertions(+), 90 deletions(-) create mode 100644 tensorflow/lite/delegates/status.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 6477c0491f9..810f3ab1a2b 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -252,6 +252,7 @@ cc_library( ":version", "//tensorflow/lite/c:common", "//tensorflow/lite/core/api", + "//tensorflow/lite/delegates:status", "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/experimental/resource", "//tensorflow/lite/kernels/internal:compatibility", diff --git a/tensorflow/lite/core/api/profiler.h b/tensorflow/lite/core/api/profiler.h index 938652cf698..897efbe1438 100644 --- a/tensorflow/lite/core/api/profiler.h +++ b/tensorflow/lite/core/api/profiler.h @@ -22,34 +22,56 @@ namespace tflite { // A simple utility for enabling profiled event tracing in TensorFlow Lite. class Profiler { public: + // As certain Profiler instance might be only interested in certain event + // types, we define each event type value to allow a Profiler to use + // bitmasking bitwise operations to determine whether an event should be + // recorded or not. enum class EventType { // Default event type, the metadata field has no special significance. - DEFAULT = 0, + DEFAULT = 1, // The event is an operator invocation and the event_metadata field is the // index of operator node. - OPERATOR_INVOKE_EVENT = 1, + OPERATOR_INVOKE_EVENT = 2, // The event is an invocation for an internal operator of a TFLite delegate. // The event_metadata field is the index of operator node that's specific to // the delegate. - DELEGATE_OPERATOR_INVOKE_EVENT = 2 + DELEGATE_OPERATOR_INVOKE_EVENT = 4, + + // The event is a recording of runtime instrumentation such as the overall + // TFLite runtime status, the TFLite delegate status (if a delegate + // is applied), and the overall model inference latency etc. + // Note, the delegate status and overall status are stored as separate + // event_metadata fields. In particular, the delegate status is encoded + // as DelegateStatus::full_status(). + GENERAL_RUNTIME_INSTRUMENTATION_EVENT = 8, }; virtual ~Profiler() {} - // Signals the beginning of an event from a subgraph indexed at - // 'event_subgraph_index', returning a handle to the profile event. + // Signals the beginning of an event and returns a handle to the profile + // event. The `event_metadata1` and `event_metadata2` have different + // interpretations based on the actual Profiler instance and the `event_type`. 
+ // For example, as for the 'SubgraphAwareProfiler' defined in + // lite/core/subgraph.h, when the event_type is OPERATOR_INVOKE_EVENT, + // `event_metadata1` represents the index of a TFLite node, and + // `event_metadata2` represents the index of the subgraph that this event + // comes from. virtual uint32_t BeginEvent(const char* tag, EventType event_type, - uint32_t event_metadata, - uint32_t event_subgraph_index) = 0; - // Similar w/ the above, but the event comes from the primary subgraph that's - // indexed at 0. - virtual uint32_t BeginEvent(const char* tag, EventType event_type, - uint32_t event_metadata) { - return BeginEvent(tag, event_type, event_metadata, /*primary subgraph*/ 0); + int64_t event_metadata1, + int64_t event_metadata2) = 0; + // Similar w/ the above, but `event_metadata2` defaults to 0. + uint32_t BeginEvent(const char* tag, EventType event_type, + int64_t event_metadata) { + return BeginEvent(tag, event_type, event_metadata, /*event_metadata2*/ 0); } + // Signals an end to the specified profile event with 'event_metadata's, This + // is useful when 'event_metadata's are not available when the event begins + // or when one wants to overwrite the 'event_metadata's set at the beginning. + virtual void EndEvent(uint32_t event_handle, int64_t event_metadata1, + int64_t event_metadata2) {} // Signals an end to the specified profile event. virtual void EndEvent(uint32_t event_handle) = 0; @@ -60,15 +82,18 @@ class Profiler { // they assume the value is in "usec", if in any case subclasses // didn't put usec, then the values are not meaningful. // TODO karimnosseir: Revisit and make the function more clear. - virtual void AddEvent(const char* tag, EventType event_type, - uint32_t event_metadata, uint64_t start, uint64_t end) { - AddEvent(tag, event_type, event_metadata, start, end, - /*event_subgraph_index*/ 0); + void AddEvent(const char* tag, EventType event_type, uint64_t start, + uint64_t end, int64_t event_metadata) { + AddEvent(tag, event_type, start, end, event_metadata, + /*event_metadata2*/ 0); } - virtual void AddEvent(const char* tag, EventType event_type, - uint32_t event_metadata, uint64_t start, uint64_t end, - uint32_t event_subgraph_index) {} + virtual void AddEvent(const char* tag, EventType event_type, uint64_t start, + uint64_t end, int64_t event_metadata1, + int64_t event_metadata2) {} + + protected: + friend class ScopedProfile; }; // Adds a profile event to `profiler` that begins with the construction @@ -79,7 +104,7 @@ class ScopedProfile { public: ScopedProfile(Profiler* profiler, const char* tag, Profiler::EventType event_type = Profiler::EventType::DEFAULT, - uint32_t event_metadata = 0) + int64_t event_metadata = 0) : profiler_(profiler), event_handle_(0) { if (profiler) { event_handle_ = profiler_->BeginEvent(tag, event_type, event_metadata); @@ -92,8 +117,8 @@ class ScopedProfile { } } - private: - Profiler* const profiler_; + protected: + Profiler* profiler_; uint32_t event_handle_; }; @@ -113,6 +138,31 @@ class ScopedDelegateOperatorProfile : public ScopedProfile { static_cast(node_index)) {} }; +class ScopedRuntimeInstrumentationProfile : public ScopedProfile { + public: + ScopedRuntimeInstrumentationProfile(Profiler* profiler, const char* tag) + : ScopedProfile( + profiler, tag, + Profiler::EventType::GENERAL_RUNTIME_INSTRUMENTATION_EVENT, -1) {} + + void set_runtime_status(int64_t delegate_status, int64_t interpreter_status) { + if (profiler_) { + delegate_status_ = delegate_status; + interpreter_status_ = interpreter_status; + } + } + 
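+  // Callers are expected to call set_runtime_status() before this object
+  // goes out of scope; the destructor below forwards the stored statuses to
+  // the profiler via EndEvent().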
+ ~ScopedRuntimeInstrumentationProfile() { + if (profiler_) { + profiler_->EndEvent(event_handle_, delegate_status_, interpreter_status_); + } + } + + private: + int64_t delegate_status_; + int64_t interpreter_status_; +}; + } // namespace tflite #define TFLITE_VARNAME_UNIQ_IMPL(name, ctr) name##ctr @@ -130,4 +180,15 @@ class ScopedDelegateOperatorProfile : public ScopedProfile { tflite::ScopedDelegateOperatorProfile TFLITE_VARNAME_UNIQ( \ _profile_, __COUNTER__)((profiler), (tag), (node_index)) +#define TFLITE_ADD_RUNTIME_INSTRUMENTATION_EVENT( \ + profiler, tag, delegate_status, interpreter_status) \ + do { \ + if (!profiler) { \ + const auto handle = profiler->BeginEvent( \ + tag, Profiler::EventType::GENERAL_RUNTIME_INSTRUMENTATION_EVENT, \ + delegate_status, interpreter_status); \ + profiler->EndEvent(handle); \ + } \ + } while (false); + #endif // TENSORFLOW_LITE_CORE_API_PROFILER_H_ diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index d6067daaa6a..d9ccff35105 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_CORE_SUBGRAPH_H_ #define TENSORFLOW_LITE_CORE_SUBGRAPH_H_ +#include #include #include #include @@ -338,21 +339,16 @@ class Subgraph { class SubgraphAwareProfiler : public Profiler { public: // Constructor should be called with the non-nullptr profiler argument. - SubgraphAwareProfiler(Profiler* profiler, uint32_t subgraph_index) + SubgraphAwareProfiler(Profiler* profiler, int64_t subgraph_index) : profiler_(profiler), subgraph_index_(subgraph_index) {} ~SubgraphAwareProfiler() override {} uint32_t BeginEvent(const char* tag, EventType event_type, - uint32_t event_metadata, - uint32_t subgraph_index) override { + int64_t event_metadata1, + int64_t event_metadata2) override { if (!profiler_) return 0; - return profiler_->BeginEvent(tag, event_type, event_metadata, - subgraph_index); - } - - uint32_t BeginEvent(const char* tag, EventType event_type, - uint32_t event_metadata) override { - return BeginEvent(tag, event_type, event_metadata, subgraph_index_); + return profiler_->BeginEvent(tag, event_type, event_metadata1, + subgraph_index_); } void EndEvent(uint32_t event_handle) override { @@ -360,17 +356,24 @@ class Subgraph { profiler_->EndEvent(event_handle); } - void AddEvent(const char* tag, EventType event_type, - uint32_t event_metadata, uint64_t start, - uint64_t end) override { + void EndEvent(uint32_t event_handle, int64_t event_metadata1, + int64_t event_metadata2) override { if (!profiler_) return; - profiler_->AddEvent(tag, event_type, event_metadata, start, end); + profiler_->EndEvent(event_handle, event_metadata1, event_metadata2); + } + + void AddEvent(const char* tag, EventType event_type, uint64_t start, + uint64_t end, int64_t event_metadata1, + int64_t event_metadata2) override { + if (!profiler_) return; + profiler_->AddEvent(tag, event_type, start, end, event_metadata1, + subgraph_index_); } private: // Not own the memory. 
Profiler* const profiler_; - const uint32_t subgraph_index_; + const int64_t subgraph_index_; }; // Prevent 'context_' from accessing functions that are only available to diff --git a/tensorflow/lite/delegates/BUILD b/tensorflow/lite/delegates/BUILD index 8a05298d01a..e1f91f32c34 100644 --- a/tensorflow/lite/delegates/BUILD +++ b/tensorflow/lite/delegates/BUILD @@ -20,6 +20,15 @@ package( licenses = ["notice"], # Apache 2.0 ) +cc_library( + name = "status", + hdrs = ["status.h"], + copts = tflite_copts(), + deps = [ + "//tensorflow/lite/c:common", + ], +) + cc_library( name = "utils", srcs = ["utils.cc"], diff --git a/tensorflow/lite/delegates/status.h b/tensorflow/lite/delegates/status.h new file mode 100644 index 00000000000..e56bf7ce577 --- /dev/null +++ b/tensorflow/lite/delegates/status.h @@ -0,0 +1,83 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_DELEGATES_STATUS_H_ +#define TENSORFLOW_LITE_DELEGATES_STATUS_H_ + +#include +#include + +#include "tensorflow/lite/c/common.h" + +// This file defines data structures to represent detailed TFLite delegate +// status, e.g. NNAPI delegate application failure because of a driver issue +// etc. Such status is ONLY to be used for internal APIs. +// Note, we simply use TfLiteStatus to represent high-level status while +// delegate-specific status codes are defined with DelegateStatus. +// WARNING: This is an experimental feature that is subject to change. +namespace tflite { +namespace delegates { + +// Defines the source of the code where it is generated from. We list all TFLite +// delegates that're officially implemented and available as of April, 2020 +// (i.e. w/ 'TFLITE_' prefix to imply this). +enum class DelegateStatusSource { + NONE = 0, + TFLITE_GPU = 1, + TFLITE_NNAPI = 2, + TFLITE_HEXAGON = 3, + TFLITE_XNNPACK = 4, + TFLITE_COREML = 5, + MAX_NUM_SOURCES = std::numeric_limits::max(), +}; + +// Defines the detailed status that combines a DelegateStatusSource and a +// status int32_t code. +class DelegateStatus { + public: + DelegateStatus() : DelegateStatus(DelegateStatusSource::NONE, 0) {} + explicit DelegateStatus(int32_t code) + : DelegateStatus(DelegateStatusSource::NONE, code) {} + explicit DelegateStatus(int64_t full_status) + : DelegateStatus( + static_cast( + full_status >> 32 & + static_cast(DelegateStatusSource::MAX_NUM_SOURCES)), + static_cast(full_status & + std::numeric_limits::max())) {} + DelegateStatus(DelegateStatusSource source, int32_t code) + : source_(static_cast(source)), code_(code) {} + + // Return the detailed full status encoded as a int64_t value. 
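+  // For illustration only (hypothetical values, not an official error code):
+  // DelegateStatus(DelegateStatusSource::TFLITE_NNAPI, 3).full_status()
+  // yields (2LL << 32) | 3, i.e. the source in the upper 32 bits and the
+  // status code in the lower 32 bits.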
+ int64_t full_status() const { + return static_cast(source_) << 32 | code_; + } + + DelegateStatusSource source() const { + return static_cast(source_); + } + + int32_t code() const { return code_; } + + private: + // value of a DelegateStatusSource, like DelegateStatusSource::TFLITE_GPU + int32_t source_; + // value of a status code, like kTfLiteOk. + int32_t code_; +}; + +} // namespace delegates +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_STATUS_H_ diff --git a/tensorflow/lite/examples/label_image/label_image.cc b/tensorflow/lite/examples/label_image/label_image.cc index 364ac325967..5967c23be33 100644 --- a/tensorflow/lite/examples/label_image/label_image.cc +++ b/tensorflow/lite/examples/label_image/label_image.cc @@ -301,7 +301,7 @@ void RunInference(Settings* s) { profiler->StopProfiling(); auto profile_events = profiler->GetProfileEvents(); for (int i = 0; i < profile_events.size(); i++) { - auto subgraph_index = profile_events[i]->event_subgraph_index; + auto subgraph_index = profile_events[i]->extra_event_metadata; auto op_index = profile_events[i]->event_metadata; const auto subgraph = interpreter->subgraph(subgraph_index); const auto node_and_registration = diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index 167254a2a62..cae2ca7dde0 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/context_util.h" #include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/delegates/status.h" #include "tensorflow/lite/graph_info.h" #include "tensorflow/lite/memory_planner.h" #include "tensorflow/lite/minimal_logging.h" @@ -71,6 +72,17 @@ TfLiteQuantization GetQuantizationFromLegacy( return quantization; } +// TODO(b/153131797): We have put 'delegate_status' to 0 in the following macro +// temporarily because delegate-specific error codes are either not retrievable +// at the moment, which we will add later. +#define TF_LITE_ENSURE_STATUS_WITH_SCOPED_INSTRUMENTATION(runtime_event, a) \ + do { \ + TfLiteStatus status = (a); \ + runtime_event.set_runtime_status(/*delegate_status=*/0, \ + static_cast(status)); \ + TF_LITE_ENSURE_STATUS(status); \ + } while (0) + } // namespace Interpreter::Interpreter(ErrorReporter* error_reporter) @@ -210,11 +222,15 @@ TfLiteStatus Interpreter::ReleaseNonPersistentMemory() { } TfLiteStatus Interpreter::Invoke() { - TF_LITE_ENSURE_STATUS(primary_subgraph().Invoke()); + ScopedRuntimeInstrumentationProfile scoped_runtime_event(installed_profiler_, + "invoke"); + TF_LITE_ENSURE_STATUS_WITH_SCOPED_INSTRUMENTATION( + scoped_runtime_event, primary_subgraph().Invoke()); if (!allow_buffer_handle_output_) { for (int tensor_index : outputs()) { - TF_LITE_ENSURE_STATUS( + TF_LITE_ENSURE_STATUS_WITH_SCOPED_INSTRUMENTATION( + scoped_runtime_event, primary_subgraph().EnsureTensorDataIsReadable(tensor_index)); } } @@ -381,18 +397,21 @@ void Interpreter::SetProfiler(Profiler* profiler) { // Release resources occupied by owned_profiler_ which is replaced by // caller-owned profiler. 
owned_profiler_.reset(nullptr); - SetSubgraphProfiler(profiler); + installed_profiler_ = profiler; + SetSubgraphProfiler(); } void Interpreter::SetProfiler(std::unique_ptr profiler) { owned_profiler_ = std::move(profiler); - SetSubgraphProfiler(owned_profiler_.get()); + installed_profiler_ = owned_profiler_.get(); + SetSubgraphProfiler(); } -void Interpreter::SetSubgraphProfiler(Profiler* profiler) { +void Interpreter::SetSubgraphProfiler() { for (int subgraph_index = 0; subgraph_index < subgraphs_.size(); ++subgraph_index) { - subgraphs_[subgraph_index]->SetProfiler(profiler, subgraph_index); + subgraphs_[subgraph_index]->SetProfiler(installed_profiler_, + subgraph_index); } } diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index 0e01ce44e0c..59cab6add6d 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -540,7 +540,7 @@ class Interpreter { TfLiteExternalContext* ctx); // Sets the profiler to all subgraphs. - void SetSubgraphProfiler(Profiler* profiler); + void SetSubgraphProfiler(); // Remove delegates (for fallback behaviour). The interpreter is invokable // afterwards. @@ -559,10 +559,10 @@ class Interpreter { // interface. To avoid copying tensor metadata, this is also the definitive // structure to store tensors. // This is the primary subgraph context. - TfLiteContext* context_; + TfLiteContext* context_ = nullptr; // The error reporter delegate that tflite will forward queries errors to. - ErrorReporter* error_reporter_; + ErrorReporter* error_reporter_ = nullptr; // List of delegates that have been installed and are owned by this // interpreter instance. Useful if client delegate ownership is burdensome. @@ -574,6 +574,9 @@ class Interpreter { // Useful if client profiler ownership is burdensome. std::unique_ptr owned_profiler_; + // Points to the installed Profiler instance. + Profiler* installed_profiler_ = nullptr; + bool allow_buffer_handle_output_ = false; // List of active external contexts. diff --git a/tensorflow/lite/profiling/atrace_profiler.cc b/tensorflow/lite/profiling/atrace_profiler.cc index 2f6672d6bb7..4bdaf9d9e06 100644 --- a/tensorflow/lite/profiling/atrace_profiler.cc +++ b/tensorflow/lite/profiling/atrace_profiler.cc @@ -57,16 +57,19 @@ class ATraceProfiler : public tflite::Profiler { } uint32_t BeginEvent(const char* tag, EventType event_type, - uint32_t event_metadata, - uint32_t event_subgraph_index) override { + int64_t event_metadata1, + int64_t event_metadata2) override { if (handle_ && atrace_is_enabled_()) { // Note: When recording an OPERATOR_INVOKE_EVENT, we have recorded the op - // name as tag and node index as event_metadata. See the macro - // TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE defined in - // tensorflow/lite/core/api/profiler.h for details. - // op_name@node_index/subgraph_index + // name + // as tag, node index as event_metadata1 and subgraph index as + // event_metadata2. See the macro TFLITE_SCOPED_TAGGED_OPERATOR_PROFILE + // defined in tensorflow/lite/core/api/profiler.h for details. + // Regardless the 'event_type', we encode the perfetto event name as + // tag@event_metadata1/event_metadata2. 
In case of OPERATOR_INVOKE_EVENT, + // the perfetto event name will be op_name@node_index/subgraph_index std::string trace_event_tag = - absl::StrCat(tag, "@", event_metadata, "/", event_subgraph_index); + absl::StrCat(tag, "@", event_metadata1, "/", event_metadata2); atrace_begin_section_(trace_event_tag.c_str()); } return 0; diff --git a/tensorflow/lite/profiling/buffered_profiler.h b/tensorflow/lite/profiling/buffered_profiler.h index 2b617c92aeb..cfd96e0490a 100644 --- a/tensorflow/lite/profiling/buffered_profiler.h +++ b/tensorflow/lite/profiling/buffered_profiler.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_PROFILING_BUFFERED_PROFILER_H_ #define TENSORFLOW_LITE_PROFILING_BUFFERED_PROFILER_H_ +#include #include #include "tensorflow/lite/core/api/profiler.h" @@ -75,24 +76,33 @@ namespace profiling { class BufferedProfiler : public tflite::Profiler { public: explicit BufferedProfiler(uint32_t max_num_entries) - : buffer_(max_num_entries, false) {} + : buffer_(max_num_entries, false), + supported_event_types_(~static_cast( + EventType::GENERAL_RUNTIME_INSTRUMENTATION_EVENT)) {} uint32_t BeginEvent(const char* tag, EventType event_type, - uint32_t event_metadata, - uint32_t event_subgraph_index) override { - return buffer_.BeginEvent(tag, event_type, event_metadata, - event_subgraph_index); + int64_t event_metadata1, + int64_t event_metadata2) override { + if (!ShouldAddEvent(event_type)) return kInvalidEventHandle; + return buffer_.BeginEvent(tag, event_type, event_metadata1, + event_metadata2); } void EndEvent(uint32_t event_handle) override { buffer_.EndEvent(event_handle); } - void AddEvent(const char* tag, EventType event_type, uint32_t event_metadata, - uint64_t start, uint64_t end, - uint32_t event_subgraph_index) override { - buffer_.AddEvent(tag, event_type, event_metadata, start, end, - event_subgraph_index); + void EndEvent(uint32_t event_handle, int64_t event_metadata1, + int64_t event_metadata2) override { + buffer_.EndEvent(event_handle, &event_metadata1, &event_metadata2); + } + + void AddEvent(const char* tag, EventType event_type, uint64_t start, + uint64_t end, int64_t event_metadata1, + int64_t event_metadata2) override { + if (!ShouldAddEvent(event_type)) return; + buffer_.AddEvent(tag, event_type, start, end, event_metadata1, + event_metadata2); } void StartProfiling() { buffer_.SetEnabled(true); } @@ -107,9 +117,15 @@ class BufferedProfiler : public tflite::Profiler { return profile_events; } + protected: + bool ShouldAddEvent(EventType event_type) { + return (static_cast(event_type) & supported_event_types_) != 0; + } + private: ProfileBuffer* GetProfileBuffer() { return &buffer_; } ProfileBuffer buffer_; + const uint64_t supported_event_types_; }; } // namespace profiling diff --git a/tensorflow/lite/profiling/noop_profiler.h b/tensorflow/lite/profiling/noop_profiler.h index 27363fc6788..078d0e8ee6d 100644 --- a/tensorflow/lite/profiling/noop_profiler.h +++ b/tensorflow/lite/profiling/noop_profiler.h @@ -29,7 +29,7 @@ class NoopProfiler : public tflite::Profiler { NoopProfiler() {} explicit NoopProfiler(int max_profiling_buffer_entries) {} - uint32_t BeginEvent(const char*, EventType, uint32_t, uint32_t) override { + uint32_t BeginEvent(const char*, EventType, int64_t, int64_t) override { return 0; } void EndEvent(uint32_t) override {} diff --git a/tensorflow/lite/profiling/profile_buffer.h b/tensorflow/lite/profiling/profile_buffer.h index 67d19d02afe..3b34cf9612a 100644 --- a/tensorflow/lite/profiling/profile_buffer.h +++ 
b/tensorflow/lite/profiling/profile_buffer.h @@ -51,10 +51,11 @@ struct ProfileEvent { // The field containing the type of event. This must be one of the event types // in EventType. EventType event_type; - // Extra data describing the details of the event. - uint32_t event_metadata; - // The index of subgraph where an event came from. - uint32_t event_subgraph_index; + // Meta data associated w/ the event. + int64_t event_metadata; + // Note: if this is an OPERATOR_INVOKE_EVENT, 'extra_event_metadata' will + // represent the index of the subgraph that this event comes from. + int64_t extra_event_metadata; }; // A ring buffer of profile events. @@ -69,7 +70,7 @@ class ProfileBuffer { // buffer is disabled this has no affect. // The tag of the event should remain valid till the buffer is valid. uint32_t BeginEvent(const char* tag, ProfileEvent::EventType event_type, - uint32_t event_metadata, uint32_t event_subgraph_index) { + int64_t event_metadata1, int64_t event_metadata2) { if (!enabled_) { return kInvalidEventHandle; } @@ -81,8 +82,8 @@ class ProfileBuffer { } event_buffer_[index].tag = tag; event_buffer_[index].event_type = event_type; - event_buffer_[index].event_subgraph_index = event_subgraph_index; - event_buffer_[index].event_metadata = event_metadata; + event_buffer_[index].event_metadata = event_metadata1; + event_buffer_[index].extra_event_metadata = event_metadata2; event_buffer_[index].begin_timestamp_us = timestamp; event_buffer_[index].end_timestamp_us = 0; if (event_type != Profiler::EventType::OPERATOR_INVOKE_EVENT) { @@ -98,7 +99,8 @@ class ProfileBuffer { // Sets the end timestamp for event for the handle to current time. // If the buffer is disabled or previous event has been overwritten this // operation has not effect. - void EndEvent(uint32_t event_handle) { + void EndEvent(uint32_t event_handle, const int64_t* event_metadata1 = nullptr, + const int64_t* event_metadata2 = nullptr) { if (!enabled_ || event_handle == kInvalidEventHandle || event_handle > current_index_) { return; @@ -116,11 +118,17 @@ class ProfileBuffer { Profiler::EventType::OPERATOR_INVOKE_EVENT) { event_buffer_[event_index].end_mem_usage = memory::GetMemoryUsage(); } + if (event_metadata1) { + event_buffer_[event_index].event_metadata = *event_metadata1; + } + if (event_metadata2) { + event_buffer_[event_index].extra_event_metadata = *event_metadata2; + } } void AddEvent(const char* tag, ProfileEvent::EventType event_type, - uint32_t event_metadata, uint64_t start, uint64_t end, - uint32_t event_subgraph_index) { + uint64_t start, uint64_t end, int64_t event_metadata1, + int64_t event_metadata2) { if (!enabled_) { return; } @@ -131,8 +139,8 @@ class ProfileBuffer { } event_buffer_[index].tag = tag; event_buffer_[index].event_type = event_type; - event_buffer_[index].event_subgraph_index = event_subgraph_index; - event_buffer_[index].event_metadata = event_metadata; + event_buffer_[index].event_metadata = event_metadata1; + event_buffer_[index].extra_event_metadata = event_metadata2; event_buffer_[index].begin_timestamp_us = start; event_buffer_[index].end_timestamp_us = end; current_index_++; diff --git a/tensorflow/lite/profiling/profile_buffer_test.cc b/tensorflow/lite/profiling/profile_buffer_test.cc index 584d21255a5..ab98cbb0d13 100644 --- a/tensorflow/lite/profiling/profile_buffer_test.cc +++ b/tensorflow/lite/profiling/profile_buffer_test.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/lite/profiling/profile_buffer.h" + +#include #include #include #include #include -#include "tensorflow/lite/profiling/profile_buffer.h" #include "tensorflow/lite/testing/util.h" namespace tflite { @@ -43,7 +45,7 @@ TEST(ProfileBufferTest, AddEvent) { EXPECT_EQ(0, buffer.Size()); auto event_handle = buffer.BeginEvent("hello", ProfileEvent::EventType::DEFAULT, - /*event_metadata*/ 42, /*event_subgraph_index*/ 0); + /*event_metadata1*/ 42, /*event_metadata2*/ 0); EXPECT_GE(event_handle, 0); EXPECT_EQ(1, buffer.Size()); @@ -59,6 +61,28 @@ TEST(ProfileBufferTest, AddEvent) { EXPECT_GE(event->end_timestamp_us, event->begin_timestamp_us); } +TEST(ProfileBufferTest, EndEventWithMetadata) { + ProfileBuffer buffer(/*max_size*/ 10, /*enabled*/ true); + EXPECT_EQ(0, buffer.Size()); + auto event_handle = + buffer.BeginEvent("hello", ProfileEvent::EventType::DEFAULT, + /*event_metadata1*/ 42, /*event_metadata2*/ 0); + const int64_t kEventMetadata1 = 18; + const int64_t kEventMetadata2 = 36; + buffer.EndEvent(event_handle, &kEventMetadata1, &kEventMetadata2); + + EXPECT_GE(event_handle, 0); + EXPECT_EQ(1, buffer.Size()); + auto event = GetProfileEvents(buffer)[0]; + EXPECT_EQ(event->tag, "hello"); + EXPECT_GT(event->begin_timestamp_us, 0); + EXPECT_EQ(event->event_type, ProfileEvent::EventType::DEFAULT); + EXPECT_EQ(event->event_metadata, kEventMetadata1); + EXPECT_EQ(event->extra_event_metadata, kEventMetadata2); + EXPECT_EQ(1, buffer.Size()); + EXPECT_GE(event->end_timestamp_us, event->begin_timestamp_us); +} + TEST(ProfileBufferTest, OverFlow) { const int max_size = 4; ProfileBuffer buffer{max_size, true}; @@ -83,13 +107,13 @@ TEST(ProfileBufferTest, Enable) { EXPECT_EQ(0, buffer.Size()); auto event_handle = buffer.BeginEvent("hello", ProfileEvent::EventType::DEFAULT, - /*event_metadata*/ 42, /*event_subgraph_index*/ 0); + /*event_metadata1*/ 42, /*event_metadata2*/ 0); EXPECT_EQ(kInvalidEventHandle, event_handle); EXPECT_EQ(0, buffer.Size()); buffer.SetEnabled(true); event_handle = buffer.BeginEvent("hello", ProfileEvent::EventType::DEFAULT, - /*event_metadata*/ 42, /*event_subgraph_index*/ 0); + /*event_metadata1*/ 42, /*event_metadata2*/ 0); EXPECT_GE(event_handle, 0); EXPECT_EQ(1, buffer.Size()); } diff --git a/tensorflow/lite/profiling/profile_summarizer.cc b/tensorflow/lite/profiling/profile_summarizer.cc index acf630c93cf..2fc04f99659 100644 --- a/tensorflow/lite/profiling/profile_summarizer.cc +++ b/tensorflow/lite/profiling/profile_summarizer.cc @@ -128,7 +128,7 @@ void ProfileSummarizer::ProcessProfiles( int64_t delegate_internal_total_us = 0; for (auto event : events) { - const auto subgraph_index = event->event_subgraph_index; + const auto subgraph_index = event->extra_event_metadata; auto stats_calculator = GetStatsCalculator(subgraph_index); int64_t start_us = event->begin_timestamp_us - base_start_us; int64_t node_exec_time = @@ -174,7 +174,7 @@ void ProfileSummarizer::ProcessProfiles( const memory::MemoryUsage node_mem_usage = event->end_mem_usage - event->begin_mem_usage; std::string node_name(event->tag); - node_name += "/" + std::to_string(event->event_subgraph_index); + node_name += "/" + std::to_string(event->extra_event_metadata); stats_calculator->AddNodeStats(node_name, event->tag, node_num, start_us, node_exec_time, node_mem_usage.max_rss_kb * 1000.0); diff --git 
a/tensorflow/lite/profiling/profile_summarizer_test.cc b/tensorflow/lite/profiling/profile_summarizer_test.cc index 87e689e9985..98d26196b75 100644 --- a/tensorflow/lite/profiling/profile_summarizer_test.cc +++ b/tensorflow/lite/profiling/profile_summarizer_test.cc @@ -182,13 +182,13 @@ TEST_F(ProfileSummarizerIfOpTest, TestIfTrue) { EXPECT_EQ(2, events.size()); int event_count_of_subgraph_zero = std::count_if( events.begin(), events.end(), - [](auto event) { return event->event_subgraph_index == 0; }); + [](auto event) { return event->extra_event_metadata == 0; }); int event_count_of_subgraph_one = std::count_if( events.begin(), events.end(), - [](auto event) { return event->event_subgraph_index == 1; }); + [](auto event) { return event->extra_event_metadata == 1; }); int event_count_of_subgraph_two = std::count_if( events.begin(), events.end(), - [](auto event) { return event->event_subgraph_index == 2; }); + [](auto event) { return event->extra_event_metadata == 2; }); EXPECT_EQ(1, event_count_of_subgraph_zero); EXPECT_EQ(1, event_count_of_subgraph_one); EXPECT_EQ(0, event_count_of_subgraph_two); @@ -209,13 +209,13 @@ TEST_F(ProfileSummarizerIfOpTest, TestIfFalse) { EXPECT_EQ(2, events.size()); int event_count_of_subgraph_zero = std::count_if( events.begin(), events.end(), - [](auto event) { return event->event_subgraph_index == 0; }); + [](auto event) { return event->extra_event_metadata == 0; }); int event_count_of_subgraph_one = std::count_if( events.begin(), events.end(), - [](auto event) { return event->event_subgraph_index == 1; }); + [](auto event) { return event->extra_event_metadata == 1; }); int event_count_of_subgraph_two = std::count_if( events.begin(), events.end(), - [](auto event) { return event->event_subgraph_index == 2; }); + [](auto event) { return event->extra_event_metadata == 2; }); EXPECT_EQ(1, event_count_of_subgraph_zero); EXPECT_EQ(0, event_count_of_subgraph_one); EXPECT_EQ(1, event_count_of_subgraph_two); diff --git a/tensorflow/lite/profiling/profiler_test.cc b/tensorflow/lite/profiling/profiler_test.cc index cedb109697d..1d8455e3647 100644 --- a/tensorflow/lite/profiling/profiler_test.cc +++ b/tensorflow/lite/profiling/profiler_test.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/lite/profiling/profiler.h" + #include #include // NOLINT(build/c++11) @@ -20,7 +22,6 @@ limitations under the License. 
#include #include -#include "tensorflow/lite/profiling/profiler.h" #include "tensorflow/lite/testing/util.h" namespace tflite { @@ -55,6 +56,20 @@ TEST(ProfilerTest, NoProfilesAreCollectedWhenDisabled) { EXPECT_EQ(0, profile_events.size()); } +TEST(ProfilerTest, NoProfilesAreCollectedWhenEventTypeUnsupported) { + BufferedProfiler profiler(1024); + tflite::Profiler* p = &profiler; + p->AddEvent("Hello", + Profiler::EventType::GENERAL_RUNTIME_INSTRUMENTATION_EVENT, + /*start*/ 0, /*end*/ 1, + /*event_metadata*/ 2); + auto handler = p->BeginEvent( + "begin", Profiler::EventType::GENERAL_RUNTIME_INSTRUMENTATION_EVENT, 0); + p->EndEvent(handler); + auto profile_events = profiler.GetProfileEvents(); + EXPECT_EQ(0, profile_events.size()); +} + TEST(ProfilingTest, ProfilesAreCollected) { BufferedProfiler profiler(1024); profiler.StartProfiling(); From ea8e87c8e945c5424b130d66a4fbf5fca7e5c9cd Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Thu, 28 May 2020 06:42:39 -0700 Subject: [PATCH 1301/1533] [XLA][MLIR] Register BufferAssignmentTestDialect. It was made harder to shoot yourself in the foot when using operations that are not registered with MLIRContext. In order to use test ops in the buffer_assignment_test.mlir we have to register them first. PiperOrigin-RevId: 313576961 Change-Id: Id3c711a2d1776a6fdee272d31d932ec5010cd0c2 --- tensorflow/compiler/mlir/xla/tests/BUILD | 1 - .../mlir/xla/tests/buffer-assignment.mlir | 18 +-- .../xla/transforms/buffer_assignment_test.cc | 108 ++++++++++-------- 3 files changed, 68 insertions(+), 59 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/BUILD b/tensorflow/compiler/mlir/xla/tests/BUILD index e2f747085c1..ad69383bd98 100644 --- a/tensorflow/compiler/mlir/xla/tests/BUILD +++ b/tensorflow/compiler/mlir/xla/tests/BUILD @@ -6,7 +6,6 @@ package(licenses = ["notice"]) glob_lit_tests( data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", - exclude = ["buffer-assignment.mlir"], # TODO(b/157616173) test_file_exts = ["mlir"], ) diff --git a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir b/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir index ad007d0eb50..d6c164f8160 100644 --- a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir +++ b/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir @@ -203,12 +203,12 @@ func @moving_alloc_and_inserting_missing_dealloc(%cond : i1, %arg0 : memref<2xf3 "buffer_assignment_test.unary_lowered"(%arg0, %1) : (memref<2xf32>, memref<2xf32>) -> () br ^exit(%1 : memref<2xf32>) ^exit(%arg2: memref<2xf32>): - "bufer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "buffer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } // CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() // CHECK-NEXT: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK: "bufer_assignment_test.copy" +// CHECK: "buffer_assignment_test.copy" // CHECK-NEXT: dealloc // CHECK-NEXT: dealloc // CHECK-NEXT: return @@ -226,11 +226,11 @@ func @moving_invalid_dealloc_op_complex(%cond : i1, %arg0 : memref<2xf32>, %arg1 dealloc %1 : memref<2xf32> br ^exit(%1 : memref<2xf32>) ^exit(%arg2: memref<2xf32>): - "bufer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "buffer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } // CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK: bufer_assignment_test.copy +// CHECK: buffer_assignment_test.copy // CHECK-NEXT: dealloc // CHECK-NEXT: return @@ -240,10 +240,10 @@ func 
@moving_invalid_dealloc_op_complex(%cond : i1, %arg0 : memref<2xf32>, %arg1 func @inserting_missing_dealloc_simple(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ %0 = alloc() : memref<2xf32> "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () - "bufer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "buffer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } -// CHECK: bufer_assignment_test.copy +// CHECK: buffer_assignment_test.copy // CHECK-NEXT: dealloc // ----- @@ -253,8 +253,8 @@ func @moving_invalid_dealloc_op(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ %0 = alloc() : memref<2xf32> "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () dealloc %0 : memref<2xf32> - "bufer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "buffer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } -// CHECK: bufer_assignment_test.copy -// CHECK-NEXT: dealloc \ No newline at end of file +// CHECK: buffer_assignment_test.copy +// CHECK-NEXT: dealloc diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc index 5a0d791079c..40c115f4cbc 100644 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc +++ b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc @@ -29,60 +29,66 @@ limitations under the License. namespace mlir { namespace xla { namespace { + +/// This dialect independent unary operation has been defined only for testing +/// buffer assignment. +class BufferAssignmentTestUnaryOp + : public Op { + public: + using Op::Op; + static StringRef getOperationName() { return "buffer_assignment_test.unary"; } + static void build(OpBuilder& b, OperationState& state, Value source) { + state.addOperands(source); + } +}; + +/// This dialect independent lowered unary operation has been defined only for +/// testing buffer assignment. +class BufferAssignmentTestUnaryLoweredOp + : public Op::Impl> { + public: + using Op::Op; + static StringRef getOperationName() { + return "buffer_assignment_test.unary_lowered"; + } + static void build(OpBuilder& b, OperationState& state, Value source, + Value target) { + state.addOperands(source); + state.addOperands(target); + } +}; + +/// This dialect independent copy operation has been defined only for testing +/// NonVoidToVoidReturnOpConverter +class BufferAssignmentTestCopyOp + : public Op::Impl> { + public: + using Op::Op; + static StringRef getOperationName() { return "buffer_assignment_test.copy"; } + static void build(OpBuilder& b, OperationState& state, Value from, Value to) { + state.addOperands(from); + state.addOperands(to); + } +}; + +class BufferAssignmentTestDialect : public Dialect { + public: + explicit BufferAssignmentTestDialect(MLIRContext* context) + : Dialect(getDialectNamespace(), context) { + addOperations(); + } + static StringRef getDialectNamespace() { return "buffer_assignment_test"; } +}; + /// This pass tests two provided operation converters, /// FunctionAndBlockSignatureConverter and NonVoidToVoidReturnOpConverter, for /// Buffer Assignment. struct BufferAssignmentPreparationTestPass : mlir::PassWrapper { - /// This dialect independent unary operation has been defined only for testing - /// buffer assignment. 
- class BufferAssignmentTestUnaryOp - : public Op { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.unary"; - } - static void build(OpBuilder& b, OperationState& state, Value source) { - state.addOperands(source); - } - }; - - /// This dialect independent lowered unary operation has been defined only for - /// testing buffer assignment. - class BufferAssignmentTestUnaryLoweredOp - : public Op::Impl> { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.unary_lowered"; - } - static void build(OpBuilder& b, OperationState& state, Value source, - Value target) { - state.addOperands(source); - state.addOperands(target); - } - }; - - /// This dialect independent copy operation has been defined only for testing - /// NonVoidToVoidReturnOpConverter - class BufferAssignmentTestCopyOp - : public Op::Impl> { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.copy"; - } - static void build(OpBuilder& b, OperationState& state, Value from, - Value to) { - state.addOperands(from); - state.addOperands(to); - } - }; - /// A simple converter that legalizes a BufferAssignmentTestUnaryOp to a /// BufferAssignmentTestUnaryLoweredOp and creates buffer allocation for /// the result of the computation. @@ -151,8 +157,12 @@ struct BufferAssignmentPreparationTestPass } }; }; + } // namespace +static mlir::DialectRegistration + buffer_assignment_test_ops; + /// This pass tests helper methods such as computeAllocPosition, /// FunctionAndBlockSignatureConverter, NonVoidToVoidReturnOpConverter /// conversion patterns. Furthermore, it checks buffer-assignment pass that From cff621f1155cdd3a885dad67534814095f3eeccb Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Thu, 28 May 2020 21:59:18 +0700 Subject: [PATCH 1302/1533] Registration for GCS C API Filesystem --- .../experimental/filesystem/plugins/gcs/BUILD | 27 +++++++ .../filesystem/plugins/gcs/gcs_filesystem.cc | 72 +++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/BUILD create mode 100644 tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD new file mode 100644 index 00000000000..90ddd4a891d --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -0,0 +1,27 @@ +# Experimental gcs filesystem plugin. +load("//tensorflow:tensorflow.bzl", "get_win_copts", "tf_cc_shared_object") + +package( + licenses = ["notice"], # Apache 2.0 +) + +# Filesystem implementation for GCS environments +tf_cc_shared_object( + name = "gcs_filesystem", + framework_so = [], + linkstatic = False, + per_os_targets = 1, + visibility = ["//visibility:public"], + deps = [":gcs_filesystem_impl"], +) + +# The real implementation of the filesystem. +cc_library( + name = "gcs_filesystem_impl", + srcs = ["gcs_filesystem.cc"], + copts = get_win_copts(), + deps = [ + "//tensorflow/c:tf_status", + "//tensorflow/c/experimental/filesystem:filesystem_interface", + ], +) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc new file mode 100644 index 00000000000..ea9f59f1af3 --- /dev/null +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/gcs_filesystem.cc @@ -0,0 +1,72 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include + +#include "tensorflow/c/experimental/filesystem/filesystem_interface.h" +#include "tensorflow/c/tf_status.h" + +// Implementation of a filesystem for GCS environments. +// This filesystem will support `gs://` URI schemes. + +static void* plugin_memory_allocate(size_t size) { return calloc(1, size); } +static void plugin_memory_free(void* ptr) { free(ptr); } + +// SECTION 1. Implementation for `TF_RandomAccessFile` +// ---------------------------------------------------------------------------- +namespace tf_random_access_file { + +// TODO(vnvo2409): Implement later + +} // namespace tf_random_access_file + +// SECTION 2. Implementation for `TF_WritableFile` +// ---------------------------------------------------------------------------- +namespace tf_writable_file { + +// TODO(vnvo2409): Implement later + +} // namespace tf_writable_file + +// SECTION 3. Implementation for `TF_ReadOnlyMemoryRegion` +// ---------------------------------------------------------------------------- +namespace tf_read_only_memory_region { + +// TODO(vnvo2409): Implement later + +} // namespace tf_read_only_memory_region + +// SECTION 4. Implementation for `TF_Filesystem`, the actual filesystem +// ---------------------------------------------------------------------------- +namespace tf_gcs_filesystem { + +// TODO(vnvo2409): Implement later + +} // namespace tf_gcs_filesystem + +static void ProvideFilesystemSupportFor(TF_FilesystemPluginOps* ops, + const char* uri) { + TF_SetFilesystemVersionMetadata(ops); + ops->scheme = strdup(uri); +} + +void TF_InitPlugin(TF_FilesystemPluginInfo* info) { + info->plugin_memory_allocate = plugin_memory_allocate; + info->plugin_memory_free = plugin_memory_free; + info->num_schemes = 1; + info->ops = static_cast( + plugin_memory_allocate(info->num_schemes * sizeof(info->ops[0]))); + ProvideFilesystemSupportFor(&info->ops[0], "gs"); +} \ No newline at end of file From 290487b03ed7a9fa78af6faa2d8c19f7e5fde30e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 09:18:29 -0700 Subject: [PATCH 1303/1533] Fix for build errors with constexpr TfLiteIntArray. PiperOrigin-RevId: 313599824 Change-Id: Ia37465dd2f782e234839bdfbe991516d9fc06c40 --- tensorflow/lite/micro/micro_allocator.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index c4f7f859e99..d43f0ec076f 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -46,9 +46,14 @@ struct AllocationInfo { // requirement for SIMD extensions. constexpr int kBufferAlignment = 16; -// Static instance of a zero-length int to pass as tensor dims for a flatbuffer -// Tensor with no shape. 
-constexpr TfLiteIntArray kZeroLengthIntArray = {0, {}}; +// Instance of a zero-length int to pass as tensor dims for a flatbuffer +// Tensor with no shape. Note that the second member of a TfLiteArray is a +// flexible array member, which is not strictly valid C++. However it is +// supported by both GCC and clang, as long as the flexible array element is not +// initialized, which is ok in this case as it should never be accessed. +// Declaring this as constexpr causes build errors with clang, as it requires +// the flexible array element to be initialized. +const TfLiteIntArray kZeroLengthIntArray = {0}; class MicroBuiltinDataAllocator : public BuiltinDataAllocator { public: From 4594b8cfb5c698ae500841a16ea3893162cf20b0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 09:31:41 -0700 Subject: [PATCH 1304/1533] Fixing null pointer sanitization issue. PiperOrigin-RevId: 313602040 Change-Id: Ieaa064d4983b0a645d4eba9ac7e4580899401c04 --- tensorflow/core/util/sparse/sparse_tensor.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h index 2654d126e86..54e24da0ff5 100644 --- a/tensorflow/core/util/sparse/sparse_tensor.h +++ b/tensorflow/core/util/sparse/sparse_tensor.h @@ -474,7 +474,9 @@ inline SparseTensor SparseTensor::Concat( const int st_num_entries = st.num_entries(); // Fill in indices & values. - std::copy_n(&st.vals_.vec()(0), st_num_entries, &vals_t(offset)); + if (st_num_entries > 0) { + std::copy_n(&st.vals_.vec()(0), st_num_entries, &vals_t(offset)); + } const auto* st_ix = &st.ix_.matrix()(0, 0); auto* ix_out = &ix_t(offset, 0); From 2de551ba87cde98325a5d6a3c3b3c7d092af4376 Mon Sep 17 00:00:00 2001 From: Yanhua Sun Date: Thu, 28 May 2020 09:51:29 -0700 Subject: [PATCH 1305/1533] Add a function to dynamic choose and execute the proper implementation based on underlying device placement PiperOrigin-RevId: 313605766 Change-Id: I877b684dcef782b375df0504c0250acd9e808ce9 --- .../base_api/api_def_DeviceIndex.pbtxt | 5 + tensorflow/core/grappler/optimizers/BUILD | 1 + .../optimizers/implementation_selector.cc | 85 ++++++++- .../optimizers/implementation_selector.h | 87 +++++++++- .../implementation_selector_test.cc | 162 ++++++++++++++++++ tensorflow/core/kernels/functional_ops.cc | 32 ++++ tensorflow/core/ops/functional_ops.cc | 6 + tensorflow/python/ops/cond_v2.py | 17 +- tensorflow/python/ops/control_flow_ops.py | 57 +++++- .../python/ops/control_flow_ops_test.py | 86 ++++++++++ tensorflow/python/ops/control_flow_util_v2.py | 17 +- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 + 13 files changed, 550 insertions(+), 13 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_DeviceIndex.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_DeviceIndex.pbtxt b/tensorflow/core/api_def/base_api/api_def_DeviceIndex.pbtxt new file mode 100644 index 00000000000..9a4e5abd110 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_DeviceIndex.pbtxt @@ -0,0 +1,5 @@ +op { + graph_op_name: "DeviceIndex" + visibility: HIDDEN + summary: "Return the index of device the op runs." 
+} diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 030064e49fb..7432e2d54ea 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -1062,6 +1062,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler:utils", diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc index 9c4f74d7268..2b0a27aaa2d 100644 --- a/tensorflow/core/grappler/optimizers/implementation_selector.cc +++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc @@ -17,9 +17,12 @@ limitations under the License. #include +#include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_split.h" +#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/op_types.h" @@ -36,6 +39,11 @@ limitations under the License. namespace tensorflow { namespace grappler { +constexpr char kConstOp[] = "Const"; +constexpr char kCaseOp[] = "Case"; +constexpr char kDeviceIndexOp[] = "DeviceIndex"; + +// TODO(b/157615690): clean up function implementation swap code. // The overall idea for the function swap is like below: // ----------- ----------- // inp_1 ->| P_C | -> out_1 g_inp_1 ->| P_C | -> g_out_1 @@ -292,6 +300,74 @@ Status ImplementationSelector::MaybeOptimizeFunctionCall( return Status::OK(); } +// Finds the index of the device from the device name list. +Status FindDeviceIndex(const utils::MutableNodeView* device_index_node, + const string& device, int* index) { + DeviceNameUtils::ParsedName parsed_name; + if (!DeviceNameUtils::ParseFullName(device, &parsed_name) || + !parsed_name.has_type) { + return errors::Internal("Could not parse device name:", device); + } + const auto& device_list = + device_index_node->GetAttr("device_names")->list().s(); + auto it = absl::c_find(device_list, parsed_name.type); + if (it != device_list.end()) { + *index = it - device_list.begin(); + } else { + // Sets *index to device_list.size() because the default_fn is guaranteed to + // be the final item in the case op branching list. + *index = device_list.size(); + } + return Status::OK(); +} + +// Rewrites the device_index op to a const op with value of the index. +void RewriteDeviceIndexOp(utils::MutableNodeView* device_index_node, + int index) { + // Modifies the DeviceIndex node to be an Const op with correct device index. 
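+  // For example (illustrative values): if the caller computed index 1 -- say,
+  // the consuming Case op runs on "/device:GPU:0" and this DeviceIndex op
+  // lists {"CPU", "GPU"} -- the node becomes a Const holding the int32
+  // scalar 1.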
+ auto node = device_index_node->node(); + node->set_op(kConstOp); + node->clear_attr(); + (*node->mutable_attr())["dtype"].set_type(DT_INT32); + auto* tensor = (*node->mutable_attr())["value"].mutable_tensor(); + tensor->set_dtype(DT_INT32); + tensor->add_int_val(index); + VLOG(2) << "Node after rewriting:" << node->DebugString(); +} + +Status ImplementationSelector::SelectDeviceIndex(GraphDef* graph) const { + Status status; + VLOG(2) << "graph before rewriting device index:" << graph->DebugString(); + utils::MutableGraphView graph_view(graph, &status); + TF_RETURN_IF_ERROR(status); + const int num_nodes = graph_view.NumNodes(); + for (int k = 0; k < num_nodes; ++k) { + auto* node_view = graph_view.GetNode(k); + if (node_view->GetOp() != kDeviceIndexOp) { + continue; + } + VLOG(2) << "Found a node to rewrite the device index"; + + // Find the case node with device index node as input, rewrite the + // DeviceIndex node to have the value of the index of device type of the + // case node. + for (const auto& fanouts : node_view->GetRegularFanouts()) { + for (const auto& fanout : fanouts) { + if (fanout.node_view()->GetOp() != kCaseOp) continue; + int index; + // If any error is thrown out during device parsing, we simply skip + // and do not modify the DeviceIndexNode. + Status status = + FindDeviceIndex(node_view, fanout.node_view()->GetDevice(), &index); + if (status.ok()) { + RewriteDeviceIndexOp(node_view, index); + } + } + } + } + return Status::OK(); +} + Status ImplementationSelector::SelectImplementation(GraphDef* graph) const { if (!graph->has_library()) { VLOG(2) << "Skipping graph since it does not have function def"; @@ -307,8 +383,9 @@ Status ImplementationSelector::SelectImplementation(GraphDef* graph) const { TF_RETURN_IF_ERROR(status); const int num_nodes = graph_view.NumNodes(); - for (int k = 0; k < num_nodes; ++k) + for (int k = 0; k < num_nodes; ++k) { TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph_view.GetNode(k))); + } return Status::OK(); } @@ -326,7 +403,13 @@ Status ImplementationSelector::Optimize(Cluster* cluster, << "libraries: " << status; return errors::Aborted("Skipped Optimization"); } + *optimized_graph = item.graph; + status = SelectDeviceIndex(optimized_graph); + if (!status.ok()) { + *optimized_graph = item.graph; + VLOG(2) << "Could not rewrite device index due to error:" << status; + } return SelectImplementation(optimized_graph); } diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.h b/tensorflow/core/grappler/optimizers/implementation_selector.h index 57d19fe7046..f6962e0a10d 100644 --- a/tensorflow/core/grappler/optimizers/implementation_selector.h +++ b/tensorflow/core/grappler/optimizers/implementation_selector.h @@ -34,6 +34,28 @@ limitations under the License. namespace tensorflow { namespace grappler { +// Motivation: To achieve the same high level functionality, the underlying +// implementations sometimes are different for various devices where the +// function runs. In order to achieve the correct result and best performance, +// the proper implementation needs to be picked dynamically. +// +// Currently there are two approaches to do this. +// (1) Utilize case op and dynamacically change the branch index. +// (2) Swap function implementation, it will be deprecated. +// +// Idea for approach 1. +// This transformation rewrites the DeviceIndex op with a Const op with value +// of the index of the device the associcated Case op runs. 
+// Example: +// def plus_one_gpu(x): return x + 1.0 +// def plus_one_reference_implementation(x): return x + 1.0 +// input = tf.constant(2.0, dtype=tf.float32) +// cpu_fn = lambda:plus_one_reference_implementation(input) +// gpu_fn = lambda:plus_one_gpu(input) +// control_flow_ops.execute_fn_for_device( +// {"CPU": cpu_fn, "GPU":gpu_fn)}, default_fn=cpu_fn) +// +// Idea for approach 2. // This transformation replaces function calls by the appropriate function // definition based on properties of the runtime system. For instance, // we may choose one implementation over another if we have a GPU with @@ -58,7 +80,8 @@ namespace grappler { // z = plus_one_gpu(input) // print(sess.run(z)) // -// At runtime, we will trim either `plus_one_gpu` or + +// At runtime, we will select either `plus_one_gpu` or // `plus_one_reference_implementation` based on the availability of the GPU. // // Available annotations: @@ -106,6 +129,68 @@ class ImplementationSelector : public CustomGraphOptimizer { // gradients. Status SelectImplementation(GraphDef* graph) const; + // Rewrites the DeviceIndex op with a Const op with value of the index of the + // device the associcated Case op runs. + + // This function first looks up all the DeviceIndex ops. + // Then for each of these ops, it finds the device of the + // associated Case op that takes the DeviceIndex op as the input, and + // caculates the index of the device in the device list of DeviceIndex op. + // Lastly, it rewrites the DeviceIndex op with a Const op and sets the value + // to be the index. + // + // Example input nodes: + // node { + // name: "x" + // op: "DeviceIndex" + // device: "/device:CPU:0" + // attr { + // key: "device_names" + // value { + // list { + // s: "CPU" + // s: "TPU_REPLICATED_CORE" + // s: "GPU" + // } + // } + // } + // } + // node { + // name: "case" + // op: "Case" + // input: "x" + // device: "/device:GPU:0" + // ... + // } + // Example output nodes: + // + // name: "x" + // op: "Const" + // device: "/device:CPU:0" + // attr { + // key: "dtype" + // value { + // type: DT_INT32 + // } + // } + // attr { + // key: "value" + // value { + // tensor { + // dtype: DT_INT32 + // int_val: 2 + // } + // } + // } + // node { + // name: "case" + // op: "Case" + // input: "x" + // device: "/device:GPU:0" + // ... + // } + Status SelectDeviceIndex(GraphDef* graph) const; + std::unique_ptr lib_info_; TF_DISALLOW_COPY_AND_ASSIGN(ImplementationSelector); diff --git a/tensorflow/core/grappler/optimizers/implementation_selector_test.cc b/tensorflow/core/grappler/optimizers/implementation_selector_test.cc index 914570fcadb..2ef8bb878cc 100644 --- a/tensorflow/core/grappler/optimizers/implementation_selector_test.cc +++ b/tensorflow/core/grappler/optimizers/implementation_selector_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/tensor_testutil.h" @@ -58,6 +59,167 @@ TEST_F(ImplementationSelectorTest, NoUpdate) { EXPECT_EQ(item.graph.node_size(), output.node_size()); } +TEST_F(ImplementationSelectorTest, SelectDeviceIndex) { + using test::function::NDef; + ImplementationSelector optimizer; + GraphDef output; + GrapplerItem item; + AttrValue device_names; + device_names.mutable_list()->add_s("CPU"); + device_names.mutable_list()->add_s("GPU"); + item.graph = test::function::GDef( + {NDef("x", "DeviceIndex", {}, {{"device_names", device_names}}, + CpuDevice), + NDef("case", "Case", {"x"}, {{"T", DT_FLOAT}}, GpuDevice)}); + + TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); + + for (const NodeDef& node : output.node()) { + if (node.name() == "x") { + // Rewrite DeviceIndex op to a Const op with value of GPU index 1. + EXPECT_EQ("Const", node.op()); + EXPECT_EQ(1, node.attr().at("value").tensor().int_val(0)); + } + } +} + +TEST_F(ImplementationSelectorTest, SelectDeviceIndexMultiOps) { + using test::function::NDef; + ImplementationSelector optimizer; + GraphDef output; + GrapplerItem item; + AttrValue device_names; + device_names.mutable_list()->add_s("CPU"); + device_names.mutable_list()->add_s("TPU_REPLICATED_CORE"); + device_names.mutable_list()->add_s("GPU"); + item.graph = test::function::GDef( + {NDef("x", "DeviceIndex", {}, {{"device_names", device_names}}, + CpuDevice), + NDef("case", "Case", {"x"}, {{"T", DT_FLOAT}}, GpuDevice), + NDef("y", "DeviceIndex", {}, {{"device_names", device_names}}, + GpuDevice), + NDef("case_y", "Case", {"y"}, {{"T", DT_FLOAT}}, TpuDevice)}); + + TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); + for (const NodeDef& node : output.node()) { + if (node.name() == "x") { + // Rewrite DeviceIndex op to a Const op with value of GPU index 1. + EXPECT_EQ("Const", node.op()); + EXPECT_EQ(2, node.attr().at("value").tensor().int_val(0)); + } + if (node.name() == "y") { + // Rewrite DeviceIndex op to a Const op with value of CPU index 0. + EXPECT_EQ("Const", node.op()); + EXPECT_EQ(1, node.attr().at("value").tensor().int_val(0)); + } + } +} + +TEST_F(ImplementationSelectorTest, SelectDeviceIndexNotFound) { + using test::function::NDef; + ImplementationSelector optimizer; + GraphDef output; + GrapplerItem item; + AttrValue device_names; + device_names.mutable_list()->add_s("CPU"); + device_names.mutable_list()->add_s("GPU"); + item.graph = test::function::GDef( + {NDef("x", "DeviceIndex", {}, {{"device_names", device_names}}, + CpuDevice), + NDef("case", "Case", {"x"}, {{"T", DT_FLOAT}}, TpuDevice)}); + + TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); + + for (const NodeDef& node : output.node()) { + if (node.name() == "x") { + // Rewrite DeviceIndex op to a Const op with value of device names length. 
+ EXPECT_EQ("Const", node.op()); + EXPECT_EQ(2, node.attr().at("value").tensor().int_val(0)); + } + } +} + +TEST_F(ImplementationSelectorTest, SelectDeviceIndexError) { + using test::function::NDef; + ImplementationSelector optimizer; + GraphDef output; + GrapplerItem item; + AttrValue device_names; + device_names.mutable_list()->add_s("CPU"); + device_names.mutable_list()->add_s("GPU"); + item.graph = test::function::GDef( + {NDef("x", "DeviceIndex", {}, {{"device_names", device_names}}, + CpuDevice), + NDef("case", "Case", {"x"}, {{"T", DT_FLOAT}}, "")}); + + TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); + + for (const NodeDef& node : output.node()) { + if (node.name() == "x") { + // Device parse has error, do not rewrite the DeviceIndexNode. + EXPECT_EQ("DeviceIndex", node.op()); + } + } +} + +TEST_F(ImplementationSelectorTest, TwoTypesOfSwapImplementation) { + using test::function::NDef; + ImplementationSelector optimizer; + GraphDef output; + GrapplerItem item; + // DeviceIndex op based implementation selector. + AttrValue device_names; + device_names.mutable_list()->add_s("CPU"); + device_names.mutable_list()->add_s("TPU_REPLICATED_CORE"); + device_names.mutable_list()->add_s("GPU"); + + // Function swap based implementation selector. + auto cpu_def = test::function::XTimesTwo(); + auto* func_attr = cpu_def.mutable_attr(); + (*func_attr)["api_implements"].set_s("times_two"); + (*func_attr)["api_preferred_device"].set_s("CPU"); + + auto gpu_def = test::function::XAddX(); + auto* func2_attr = gpu_def.mutable_attr(); + (*func2_attr)["api_implements"].set_s("times_two"); + (*func2_attr)["api_preferred_device"].set_s("GPU"); + + item.graph = test::function::GDef( + {NDef("x", "DeviceIndex", {}, {{"device_names", device_names}}, + CpuDevice), + NDef("case", "Case", {"x"}, {{"T", DT_FLOAT}}, GpuDevice), + NDef("y", "DeviceIndex", {}, {{"device_names", device_names}}, + GpuDevice), + NDef("case_y", "Case", {"y"}, {{"T", DT_FLOAT}}, TpuDevice), + NDef("y1", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, GpuDevice), + NDef("z1", "Identity", {"y1"}, {{"T", DT_FLOAT}}, GpuDevice), + NDef("y2", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, CpuDevice), + NDef("z2", "Identity", {"y2"}, {{"T", DT_FLOAT}}, CpuDevice)}, + // FunctionLib + {cpu_def, gpu_def}); + + TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output)); + for (const NodeDef& node : output.node()) { + if (node.name() == "x") { + // Rewrite DeviceIndex op to a Const op with value of GPU index 1. + EXPECT_EQ("Const", node.op()); + EXPECT_EQ(2, node.attr().at("value").tensor().int_val(0)); + } + if (node.name() == "y") { + // Rewrite DeviceIndex op to a Const op with value of CPU index 0. + EXPECT_EQ("Const", node.op()); + EXPECT_EQ(1, node.attr().at("value").tensor().int_val(0)); + } + if (node.name() == "y1") { + // Make sure the implementation has been swapped to use the GPU version. + EXPECT_EQ("XAddX", node.op()); + } else if (node.name() == "y2") { + // Make sure the implementation is not changed. 
+ EXPECT_EQ("XTimesTwo", node.op()); + } + } +} + TEST_F(ImplementationSelectorTest, SwapImplementation) { using test::function::NDef; auto cpu_def = test::function::XTimesTwo(); diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc index 7f4d1144cb2..96c0a3d6bdc 100644 --- a/tensorflow/core/kernels/functional_ops.cc +++ b/tensorflow/core/kernels/functional_ops.cc @@ -924,5 +924,37 @@ class FakeParamOp : public OpKernel { REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_CPU), FakeParamOp); REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_GPU), FakeParamOp); +// DeviceIndexOP returns the current device index. +class DeviceIndexOp : public OpKernel { + public: + explicit DeviceIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("device_names", &device_names_)); + } + + void Compute(OpKernelContext* ctx) override { + Tensor* device_name_t; + OP_REQUIRES_OK(ctx, + ctx->allocate_output(0, TensorShape({}), &device_name_t)); + DeviceNameUtils::ParsedName parsed_name; + int index = device_names_.size(); + if (DeviceNameUtils::ParseFullName(ctx->device()->name(), &parsed_name) && + parsed_name.has_type) { + auto it = absl::c_find(device_names_, parsed_name.type); + if (it != device_names_.end()) { + index = it - device_names_.begin(); + } + } + device_name_t->scalar()() = index; + } + + private: + PersistentTensor value_handle_; + std::vector device_names_; +}; + +REGISTER_KERNEL_BUILDER(Name("DeviceIndex").Device(DEVICE_CPU), DeviceIndexOp); +REGISTER_KERNEL_BUILDER( + Name("DeviceIndex").Device(DEVICE_GPU).HostMemory("index"), DeviceIndexOp); + } // namespace } // namespace tensorflow diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc index 0a08925f7e1..11b10f3c504 100644 --- a/tensorflow/core/ops/functional_ops.cc +++ b/tensorflow/core/ops/functional_ops.cc @@ -299,4 +299,10 @@ REGISTER_OP("FakeParam") return Status::OK(); }); +// Returns the device index. +REGISTER_OP("DeviceIndex") + .Output("index: int32") + .Attr("device_names: list(string)") + .SetShapeFn(shape_inference::ScalarShape); + } // end namespace tensorflow diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py index dababc7615e..479d1122742 100644 --- a/tensorflow/python/ops/cond_v2.py +++ b/tensorflow/python/ops/cond_v2.py @@ -942,7 +942,10 @@ class _CondGradFuncGraph(util.CondBranchFuncGraph): return captured_tensor -def indexed_case(branch_index, branch_fns, name="indexed_case"): +def indexed_case(branch_index, + branch_fns, + name="indexed_case", + lower_using_switch_merge=None): """Like conv_v2, except emits a Case op instead of an If.""" if isinstance(branch_index, int): raise TypeError("branch_index must not be a Python int", branch_index) @@ -976,7 +979,8 @@ def indexed_case(branch_index, branch_fns, name="indexed_case"): return _build_case( branch_index, branch_graphs, [g.external_captures for g in branch_graphs], - name=scope) + name=scope, + lower_using_switch_merge=lower_using_switch_merge) @ops.RegisterGradient("Case") @@ -1064,7 +1068,11 @@ def _CaseGrad(op, *grads): # pylint: disable=invalid-name return [None] + outputs -def _build_case(branch_index, branch_graphs, branch_inputs, name=None): +def _build_case(branch_index, + branch_graphs, + branch_inputs, + name=None, + lower_using_switch_merge=None): """Creates an `Case` op from `branch_index`, branch graphs and inputs. 
Note that this modifies `branch_graphs` to make the inputs match, and to @@ -1080,6 +1088,7 @@ def _build_case(branch_index, branch_graphs, branch_inputs, name=None): branch_inputs: List of lists of Tensors to be passed to corresponding branch_graph as input. name: the name for the Case op. + lower_using_switch_merge: Lower this op using switch merge ops (optional). Returns: A list of Tensors which are the outputs of the Case op. Does not include @@ -1105,7 +1114,7 @@ def _build_case(branch_index, branch_graphs, branch_inputs, name=None): case_op, tensors = _get_op_and_outputs(tensors) if case_op is not None: - util.maybe_set_lowering_attr(case_op) + util.maybe_set_lowering_attr(case_op, lower_using_switch_merge) util.maybe_propagate_compile_time_consts_in_xla(case_op) _set_read_only_resource_inputs_attr(case_op, branch_graphs) # Prevent fetching since the variant outputs can't be fetched directly. diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 918c989432d..3398308d42e 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -43,6 +43,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_util as util from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_control_flow_ops +from tensorflow.python.ops import gen_functional_ops from tensorflow.python.ops import gen_logging_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops @@ -3283,7 +3284,11 @@ def _indexed_case_verify_and_canonicalize_args(branch_fns, default, return actions -def _indexed_case_helper(branch_fns, default, branch_index, name): +def _indexed_case_helper(branch_fns, + default, + branch_index, + name, + lower_using_switch_merge=None): """Implementation of case that emits the n-way indexed Case op. Args: @@ -3293,6 +3298,7 @@ def _indexed_case_helper(branch_fns, default, branch_index, name): branch_index: Optional int `Tensor`, which selects for the corresponding pred_fn_pair. name: A name for this operation (optional). + lower_using_switch_merge: Lower this op using switch merge ops (optional). Returns: The tensors returned by the pair whose key matched branch_index, or @@ -3314,7 +3320,10 @@ def _indexed_case_helper(branch_fns, default, branch_index, name): | math_ops.greater_equal(branch_index, len(branch_fns)), len(branch_fns) - 1, branch_index) return branch_fns[int(branch_index)]() - return cond_v2.indexed_case(branch_index, branch_fns) + return cond_v2.indexed_case( + branch_index, + branch_fns, + lower_using_switch_merge=lower_using_switch_merge) @tf_export("case", v1=[]) @@ -3607,6 +3616,50 @@ def switch_case(branch_index, return _indexed_case_helper(branch_fns, default, branch_index, name) +def execute_fn_for_device(device_branch_fns, default_fn, name="execute_fn"): + """Executes one of the provided callables based on the device placement. + + This API is used when the implementations for high level function depend on + the underlying device placement. It takes a dictionary of device type to + callables. The device type includes "CPU", "GPU", "TPU", etc. When the type of + the device where to run this op matches the key in 'device_branch_fns', + the corresponding callable is executed, falling back to 'default_fn' if none + matches. 
+ + **Example:** + ```python + def f1(): return tf.constant(1) + def f2(): return tf.constant(2) + r = tf.execute_fn_for_device({"CPU": f1, "GPU": f2}, default_fn=f1) + ``` + 'r' is evaluated as 1 when it runs on CPU, 2 running on GPU, 1 running on + any other device types. + + + Args: + device_branch_fns: a dictionary of device types to the callables. Each + callable must return a matching structure of tensors. + default_fn: fallback callable when the underlying device does not match any + key in the 'device_branch_fns'. + name: A name for this operation (optional). + + Returns: + The tensors returned by the callable identified by device type during + execution, or those returned by 'default_fn' if no key matches. + """ + + device_branch_fns_upper = {k.upper(): v for k, v in device_branch_fns.items()} + branch_fns = list(device_branch_fns_upper.values()) + devices = list(device_branch_fns_upper.keys()) + device_index = gen_functional_ops.device_index(device_names=devices) + return _indexed_case_helper( + branch_fns, + default_fn, + device_index, + name, + lower_using_switch_merge=False) + + class XLAControlFlowContext(ControlFlowContext): """Base class for XLA and TPU control flow contexts.""" diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py index 2979eb79bfd..f4459d8e34a 100644 --- a/tensorflow/python/ops/control_flow_ops_test.py +++ b/tensorflow/python/ops/control_flow_ops_test.py @@ -1211,6 +1211,92 @@ class IndexedCaseTest(test_util.TensorFlowTestCase, parameterized.TestCase): control_flow_ops.switch_case(array_ops.constant(1), branches) +class ExecuteFnForDeviceTest(test_util.TensorFlowTestCase): + + def testCommonCases(self): + + def cpu_fn(x): + return x + x + + def gpu_fn(x): + return x * x + + def flexible_fn(a): + branches = {"CPU": lambda: cpu_fn(a), "GPU": lambda: gpu_fn(a)} + return control_flow_ops.execute_fn_for_device(branches, lambda: cpu_fn(a)) + + @def_function.function + def flexible_defun(a): + return flexible_fn(a) + + def run_defun_and_tape(a): + with backprop.GradientTape() as tape: + tape.watch(a) + result = flexible_defun(a) + grad = tape.gradient(result, a) + r = flexible_fn(a) + return r, result, grad + + a = array_ops.constant(3.) + with ops.device("cpu:0"): + r, result, grad = run_defun_and_tape(a) + self.assertEqual(6., self.evaluate(r)) + self.assertEqual(6., self.evaluate(result)) + self.assertEqual([2.], self.evaluate(grad)) + + if test_util.is_gpu_available(): + with ops.device("gpu:0"): + r, result, grad = run_defun_and_tape(a) + self.assertEqual(9., self.evaluate(r)) + self.assertEqual(9., self.evaluate(result)) + self.assertEqual([6.], self.evaluate(grad)) + + # no device annotation + r, result, grad = run_defun_and_tape(a) + if test_util.is_gpu_available(): + self.assertEqual(9., self.evaluate(r)) + self.assertEqual(9., self.evaluate(result)) + self.assertEqual([6.], self.evaluate(grad)) + else: + self.assertEqual(6., self.evaluate(r)) + self.assertEqual(6., self.evaluate(result)) + self.assertEqual([2.], self.evaluate(grad)) + + def testFallBack(self): + + def default_fn(x): + return x + + def tpu_fn(x): + return x * x * x + + def flexible_fn(a): + branches = {"TPU": lambda: tpu_fn(a)} + return control_flow_ops.execute_fn_for_device( + branches, default_fn=lambda: default_fn(a)) + + @def_function.function + def flexible_defun(a): + return flexible_fn(a) + + a = array_ops.constant(3.) 
+ with ops.device("cpu:0"): + result_defun = flexible_defun(a) + result_defun = flexible_fn(a) + self.assertEqual(3., self.evaluate(result_defun)) + # execute_fn_for_device is not inside defun_function. + result = flexible_fn(a) + self.assertEqual(3., self.evaluate(result)) + + if test_util.is_gpu_available(): + with ops.device("gpu:0"): + result_defun = flexible_defun(a) + self.assertEqual(3., self.evaluate(result_defun)) + # execute_fn_for_device is not inside defun_function. + result = flexible_fn(a) + self.assertEqual(3., self.evaluate(result)) + + class CaseTest(test_util.TensorFlowTestCase): @test_util.run_deprecated_v1 diff --git a/tensorflow/python/ops/control_flow_util_v2.py b/tensorflow/python/ops/control_flow_util_v2.py index 7e87d25fe99..4fc464c545c 100644 --- a/tensorflow/python/ops/control_flow_util_v2.py +++ b/tensorflow/python/ops/control_flow_util_v2.py @@ -92,7 +92,7 @@ def unique_grad_fn_name(forward_name): return "%s_grad_%s" % (forward_name, ops.uid()) -def maybe_set_lowering_attr(op): +def maybe_set_lowering_attr(op, lower_using_switch_merge=None): """Sets the flag to enable lowering on `op` if necessary. Lowering allows cond_v2 and while_v2 to avoid some of the limitations of @@ -108,14 +108,21 @@ def maybe_set_lowering_attr(op): - When the eager execution context specifies the executor of functions to be the single threaded executor (see context.function_executor_type()). Because the single threaded executor does not support v1 control flow ops. + - When 'lower_using_switch_merge' is explicitly set to False. Args: op: An `If` or `While` Operation. + lower_using_switch_merge: Explicit value to lower or not (optional). """ - if (not _DISABLE_LOWER_USING_SWITCH_MERGE and - not control_flow_util.GraphOrParentsInXlaContext(op.graph) and - context.context().function_call_options.executor_type != - "SINGLE_THREADED_EXECUTOR"): + if lower_using_switch_merge is not None: + # pylint: disable=protected-access + op._set_attr("_lower_using_switch_merge", + attr_value_pb2.AttrValue(b=lower_using_switch_merge)) + # pylint: enable=protected-access + elif (not _DISABLE_LOWER_USING_SWITCH_MERGE and + not control_flow_util.GraphOrParentsInXlaContext(op.graph) and + context.context().function_call_options.executor_type != + "SINGLE_THREADED_EXECUTOR"): # pylint: disable=protected-access op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True)) # pylint: enable=protected-access diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index a8efb9e59b5..25ae132c775 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1140,6 +1140,10 @@ tf_module { name: "DestroyTemporaryVariable" argspec: "args=[\'ref\', \'var_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "DeviceIndex" + argspec: "args=[\'device_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "Diag" argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index a8efb9e59b5..25ae132c775 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1140,6 +1140,10 @@ tf_module { name: "DestroyTemporaryVariable" argspec: "args=[\'ref\', 
\'var_name\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "DeviceIndex" + argspec: "args=[\'device_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "Diag" argspec: "args=[\'diagonal\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " From 3883662dafd32c04cfc7a75df98a4e64782508ea Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Thu, 28 May 2020 10:00:34 -0700 Subject: [PATCH 1306/1533] fix dispatching for several keras backend methods. (The convert_to_tensor at the start will lead to a typeerror & trigger dispatching if necessary) PiperOrigin-RevId: 313607546 Change-Id: Ibcc28ce6d4d9f43b625fc204ff5670a48511fcaa --- tensorflow/python/keras/backend.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index d0c3eb03342..c8f29425edf 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -4578,6 +4578,9 @@ def categorical_crossentropy(target, output, from_logits=False, axis=-1): [0. 0. 0.] """ + target = ops.convert_to_tensor_v2(target) + output = ops.convert_to_tensor_v2(output) + target.shape.assert_is_compatible_with(output.shape) if from_logits: return nn.softmax_cross_entropy_with_logits_v2( @@ -4625,6 +4628,9 @@ def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1): Raises: ValueError: if `axis` is neither -1 nor one of the axes of `output`. """ + target = ops.convert_to_tensor_v2(target) + output = ops.convert_to_tensor_v2(output) + if not from_logits and not isinstance( output, (ops.EagerTensor, variables_module.Variable)): output = _backtrack_identity(output) @@ -4700,6 +4706,9 @@ def binary_crossentropy(target, output, from_logits=False): Returns: A tensor. """ + target = ops.convert_to_tensor_v2(target) + output = ops.convert_to_tensor_v2(output) + if from_logits: return nn.sigmoid_cross_entropy_with_logits(labels=target, logits=output) From 4a7a16b6aaa183514f61eebe63860f8f918d6cb9 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Thu, 28 May 2020 10:04:51 -0700 Subject: [PATCH 1307/1533] Update doc string to reflect the latest changes to `experimental_distribute_datasets_from_function`. PiperOrigin-RevId: 313608527 Change-Id: I060ec4b8f4e749f687450a323feaec26626b79ff --- .../python/distribute/distribute_lib.py | 56 +++++++++---------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index ecdc4fad159..fbc8923e050 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -836,46 +836,40 @@ class StrategyBase(object): where that limitation does not exist. The `dataset_fn` should take an `tf.distribute.InputContext` instance where - information about batching and input replication can be accessed: + information about batching and input replication can be accessed. - ``` - def dataset_fn(input_context): - batch_size = input_context.get_per_replica_batch_size(global_batch_size) - d = tf.data.Dataset.from_tensors([[1.]]).repeat().batch(batch_size) - return d.shard( - input_context.num_input_pipelines, input_context.input_pipeline_id) + You can also use the `element_spec` property of the distributed dataset + returned by this API to query the `tf.TypeSpec` of the elements returned + by the iterator. 
This can be used to set the `input_signature` property + of a `tf.function`. - inputs = strategy.experimental_distribute_datasets_from_function(dataset_fn) + >>> global_batch_size = 8 + >>> def dataset_fn(input_context): + ... batch_size = input_context.get_per_replica_batch_size( + ... global_batch_size) + ... d = tf.data.Dataset.from_tensors([[1.]]).repeat().batch(batch_size) + ... return d.shard( + ... input_context.num_input_pipelines, + ... input_context.input_pipeline_id) - for batch in inputs: - replica_results = strategy.run(replica_fn, args=(batch,)) - ``` + >>> strategy = tf.distribute.MirroredStrategy() + >>> ds = strategy.experimental_distribute_datasets_from_function(dataset_fn) + + >>> def train(ds): + ... @tf.function(input_signature=[ds.element_spec]) + ... def step_fn(inputs): + ... # train the model with inputs + ... return inputs + + ... for batch in ds: + ... replica_results = strategy.run(replica_fn, args=(batch,)) + >>> train(ds) IMPORTANT: The `tf.data.Dataset` returned by `dataset_fn` should have a per-replica batch size, unlike `experimental_distribute_dataset`, which uses the global batch size. This may be computed using `input_context.get_per_replica_batch_size`. - To query the `tf.TypeSpec` of the elements in the distributed dataset - returned by this API, you need to use the `element_spec` property of the - distributed iterator. This `tf.TypeSpec` can be used to set the - `input_signature` property of a `tf.function`. - - ```python - # If you want to specify `input_signature` for a `tf.function` you must - # first create the iterator. - iterator = iter(inputs) - - @tf.function(input_signature=[iterator.element_spec]) - def replica_fn_with_signature(inputs): - # train the model with inputs - return - - for _ in range(steps): - strategy.run(replica_fn_with_signature, - args=(next(iterator),)) - ``` - Args: dataset_fn: A function taking a `tf.distribute.InputContext` instance and returning a `tf.data.Dataset`. From 5c3ac7400a3d65db9be8cbdea98abe33f4bdae2a Mon Sep 17 00:00:00 2001 From: Russell Power Date: Thu, 28 May 2020 10:18:55 -0700 Subject: [PATCH 1308/1533] Add TF (CPU/GPU) collective ops to the TF dialect. PiperOrigin-RevId: 313611589 Change-Id: Ib6d68d15fb18a351a7ce044889c0be79a3b70470 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 1df8f7fd519..9f407ea774a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1318,6 +1318,126 @@ greater than `clip_value_max` are set to `clip_value_max`. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CollectiveBcastRecvOp : TF_Op<"CollectiveBcastRecv", []> { + let summary = "Receives a tensor value broadcast from another device."; + + let description = [{ + }]; + + let arguments = (ins + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I1, I32, I64]>:$data + ); + + TF_DerivedResultTypeAttr T = TF_DerivedResultTypeAttr<0>; +} + +def TF_CollectiveBcastSendOp : TF_Op<"CollectiveBcastSend", []> { + let summary = "Broadcasts a tensor value to one or more other devices."; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I1, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I1, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CollectiveGatherOp : TF_Op<"CollectiveGather", []> { + let summary = [{ +Mutually accumulates multiple tensors of identical type and shape. + }]; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CollectivePermuteOp : TF_Op<"CollectivePermute", [NoSideEffect, SameOperandsAndResultShape]> { + let summary = "An Op to permute tensors across replicated TPU instances."; + + let description = [{ +Each instance supplies its own input. + +For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing +source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: +`[D, A, B, C]`. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + I32Tensor:$source_target_pairs + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CollectiveReduceOp : TF_Op<"CollectiveReduce", [SameOperandsAndResultType]> { + let summary = [{ +Mutually reduces multiple tensors of identical type and shape. 
+ }]; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_AnyStrAttrOf<["Min", "Max", "Mul", "Add"]>:$merge_op, + TF_AnyStrAttrOf<["Id", "Div"]>:$final_op, + I64ArrayAttr:$subdiv_offsets, + DefaultValuedAttr:$wait_for, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ComplexOp : TF_Op<"Complex", [NoSideEffect, ResultsBroadcastableShape]> { let summary = "Converts two real numbers to a complex number."; From 61330290aa20eea4be69b99cef4ad84f550344a7 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Thu, 28 May 2020 10:20:11 -0700 Subject: [PATCH 1309/1533] Make quantized_input_stats optional inside the converter This is to avoid using some default protobuf field values inside the converter when the user doesn't specify the quantized_input_stats. PiperOrigin-RevId: 313611828 Change-Id: I2da39069b67aac409fe8290709712572b17a1b6e --- .../lite/python/graphdef_to_tfl_flatbuffer.cc | 4 +-- .../python/saved_model_to_tfl_flatbuffer.cc | 4 +-- .../lite/python/tf_tfl_flatbuffer_helpers.cc | 30 +++++++++++-------- .../lite/python/tf_tfl_flatbuffer_helpers.h | 15 +++++----- .../lite/quantization/quantization_config.cc | 14 ++++----- .../lite/quantization/quantization_config.h | 14 +++++---- .../mlir/lite/transforms/prepare_quantize.cc | 14 +++++---- 7 files changed, 52 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index a07b7b8dd1d..8a2faebcbe6 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -55,8 +55,8 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, std::vector node_names; std::vector node_dtypes; std::vector> node_shapes; - std::vector node_mins; - std::vector node_maxs; + std::vector> node_mins; + std::vector> node_maxs; // Populate quantization specs. TF_RETURN_IF_ERROR(internal::PopulateQuantizationSpecs( diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 51fcbb97360..ab80746f8b7 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -125,8 +125,8 @@ Status ConvertSavedModelToTFLiteFlatBuffer( std::vector node_names; std::vector node_dtypes; std::vector> node_shapes; - std::vector node_mins; - std::vector node_maxs; + std::vector> node_mins; + std::vector> node_maxs; // Populate quantization specs. 
TF_RETURN_IF_ERROR(internal::PopulateQuantizationSpecs( diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index a1401323e89..8f2c8bc362c 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -177,14 +177,13 @@ Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags) { return RegisterCustomBuiltinOps(extra_tf_opdefs); } -Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - mlir::TFL::QuantizationSpecs* quant_specs, - std::vector* node_names, - std::vector* node_dtypes, - std::vector>* node_shapes, - std::vector* node_mins, - std::vector* node_maxs) { +Status PopulateQuantizationSpecs( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, + std::vector* node_dtypes, + std::vector>* node_shapes, + std::vector>* node_mins, + std::vector>* node_maxs) { quant_specs->inference_input_type = ConvertIODataTypeToDataType(toco_flags.inference_input_type()); tensorflow::DataType inference_type = @@ -211,11 +210,16 @@ Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, flag.shape().dims().end())); // Currently, only UINT8 and INT8 require inputs stats if (inference_type == DT_QINT8 || inference_type == DT_QUINT8) { - TF_ASSIGN_OR_RETURN( - auto min_max, InputStatsToMinMax(flag.mean_value(), flag.std_value(), - inference_type)); - node_mins->push_back(min_max.first); - node_maxs->push_back(min_max.second); + if (flag.has_mean_value() && flag.has_std_value()) { + TF_ASSIGN_OR_RETURN( + auto min_max, InputStatsToMinMax(flag.mean_value(), + flag.std_value(), inference_type)); + node_mins->push_back(min_max.first); + node_maxs->push_back(min_max.second); + } else { + node_mins->push_back(llvm::None); + node_maxs->push_back(llvm::None); + } } } diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index 3ea36e5eb1d..87e73912a46 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -34,14 +34,13 @@ Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags); // Populate quantization specs (or not) given user specified ranges for each // input arrays. -Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - mlir::TFL::QuantizationSpecs* quant_specs, - std::vector* node_names, - std::vector* node_dtypes, - std::vector>* node_shapes, - std::vector* node_mins, - std::vector* node_maxs); +Status PopulateQuantizationSpecs( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, + std::vector* node_dtypes, + std::vector>* node_shapes, + std::vector>* node_mins, + std::vector>* node_maxs); // Convert imported MLIR file to TfLite flatbuffer. // This will also run relevant passes as well. 
diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index 6b897bd5608..3edd9c36760 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -45,7 +45,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, absl::string_view inference_type, QuantizationSpecs* quant_specs) { std::vector input_nodes = absl::StrSplit(node_names, ','); - std::vector node_mins; + std::vector> node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); for (int i = 0; i < node_mins_str.size(); i++) { @@ -57,7 +57,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, } } - std::vector node_maxs; + std::vector> node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); for (int i = 0; i < node_maxs_str.size(); i++) { @@ -79,11 +79,11 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, quant_specs); } -bool GetInputNodeQuantSpecs(const std::vector& node_names, - const std::vector& node_mins, - const std::vector& node_maxs, - tensorflow::DataType inference_type, - QuantizationSpecs* quant_specs) { +bool GetInputNodeQuantSpecs( + const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, QuantizationSpecs* quant_specs) { quant_specs->inference_type = inference_type; // If min/max are not specified, just return; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h index 2ffba579548..a4046553d17 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h @@ -19,6 +19,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_ +#include #include #include @@ -69,7 +70,8 @@ struct QuantizationSpecs { // arguments. They are only used when `weight_quantization` is set to false, // and the model is required to have quantization parameters, either from // quantization aware training or calibration, for the remaining tensors. - std::vector> input_ranges; + std::vector, llvm::Optional>> + input_ranges; // The default ranges can be used when a tensor doesn't have quantization // parameters and couldn't be quantized. Used only for latency tests. @@ -130,11 +132,11 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, // Gets the quantization specification for input arrays. The array names are not // stored in the spec, and will be matched by position. The min/max will be // ignored if the inference_type isn't a quantized type. Returns true if failed. 
-bool GetInputNodeQuantSpecs(const std::vector& node_names, - const std::vector& node_mins, - const std::vector& node_maxs, - tensorflow::DataType inference_type, - QuantizationSpecs* quant_specs); +bool GetInputNodeQuantSpecs( + const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, QuantizationSpecs* quant_specs); } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 87cae3dd957..702808ac892 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -109,8 +109,8 @@ class PrepareQuantizePass // Get the min and max values from the quantization specification for the // current function function and argument index. Uses default values if // the function is specified in the `quantize_whitelist`. - std::pair GetMinMaxValuesForArgument( - llvm::StringRef func_name, int index) { + std::pair, llvm::Optional> + GetMinMaxValuesForArgument(llvm::StringRef func_name, int index) { if (func_name == quant_specs_.target_func) { return quant_specs_.input_ranges[index]; } else { @@ -160,10 +160,14 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(FuncOp func) { } auto min_max = GetMinMaxValuesForArgument(func_name, i); + // The input min/max or mean/std are not specified, then skip. + if (!min_max.first.hasValue() || !min_max.second.hasValue()) return; + TypeAttr params = quant::GetQuantizedTypeAttr( - builder, input_type, builder.getF64FloatAttr(min_max.first), - builder.getF64FloatAttr(min_max.second), /*quant_dim=*/-1, num_bits, - narrow_range, is_signed); + builder, input_type, + builder.getF64FloatAttr(min_max.first.getValue()), + builder.getF64FloatAttr(min_max.second.getValue()), + /*quant_dim=*/-1, num_bits, narrow_range, is_signed); builder.setInsertionPoint(block, insertion_point); auto q_op = builder.create(loc, params.getValue(), arg); From 4d9269874862bfad7e039efe720b219ce702e29b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 10:24:28 -0700 Subject: [PATCH 1310/1533] Check rank that rank(updates) = rank(indices + params[1:]) in resource_scatter_update, to match behavior of V1 scatter_update and shape function. 
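To make the enforced shape relation concrete, the following minimal sketch (not part of this patch; it assumes TF 2.x eager execution and uses `tf.Variable.scatter_update`, which lowers to `ResourceScatterUpdate`) shows which update shapes the check accepts and rejects:

```python
import tensorflow as tf

params = tf.Variable(tf.zeros([4, 3]))   # params.shape = [4, 3]
indices = tf.constant([0, 2])            # indices.shape = [2]

# Accepted: updates.shape == indices.shape + params.shape[1:] == [2, 3].
params.scatter_update(tf.IndexedSlices(tf.ones([2, 3]), indices))

# Rejected by the new check: rank(updates) is neither 0 nor
# rank(indices) + rank(params) - 1, so this now raises InvalidArgumentError,
# matching the V1 scatter_update shape check.
# params.scatter_update(tf.IndexedSlices(tf.ones([2]), indices))
```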
PiperOrigin-RevId: 313612836 Change-Id: If7d3f1e9e34c9622b3d874157ee5db4caee6f11b --- tensorflow/core/kernels/resource_variable_ops.cc | 11 +++++++++++ tensorflow/core/ops/state_ops_test.cc | 9 +++++++++ 2 files changed, 20 insertions(+) diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index b606d411a3d..0fc1d53749f 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -887,6 +887,17 @@ class ResourceScatterUpdateOp : public OpKernel { const Tensor& indices = c->input(1); const Tensor& updates = c->input(2); + // Check that rank(updates.shape) = rank(indices.shape + params.shape[1:]) + OP_REQUIRES(c, + updates.dims() == 0 || + updates.dims() == indices.dims() + params->dims() - 1, + errors::InvalidArgument( + "Must have updates.shape = indices.shape + " + "params.shape[1:] or updates.shape = [], got ", + "updates.shape ", updates.shape().DebugString(), + ", indices.shape ", indices.shape().DebugString(), + ", params.shape ", params->shape().DebugString())); + // Check that we have enough index space const int64 N_big = indices.NumElements(); OP_REQUIRES( diff --git a/tensorflow/core/ops/state_ops_test.cc b/tensorflow/core/ops/state_ops_test.cc index 6d05dd0b96c..a0caad4a49f 100644 --- a/tensorflow/core/ops/state_ops_test.cc +++ b/tensorflow/core/ops/state_ops_test.cc @@ -58,6 +58,15 @@ TEST(StateOpsTest, ScatterUpdate_ShapeFn) { // Resolve shape on first updates dimension. INFER_OK(op, "[1,2];[3];[?,2]", "in0"); + + // Allow the update to be a scalar. + INFER_OK(op, "[1,2];[3];?", "in0"); + + // Allow a scalar index. + INFER_OK(op, "[1,2];[];[2]", "in0"); + + // Check the requirement updates.shape = indices.shape + ref.shape[1:]. + INFER_ERROR("Shapes must be equal rank, but are 1 and 0", op, "[2];[];[2]"); } TEST(StateOpsTest, TemporaryVariable_ShapeFn) { From f1e137db12c2641953a1e176b0f01d9283ab80ee Mon Sep 17 00:00:00 2001 From: Robert David Date: Thu, 28 May 2020 10:39:06 -0700 Subject: [PATCH 1311/1533] Make lite/tools/evaluation:utils depend on the NNAPI delegate only when building for Android. 
PiperOrigin-RevId: 313615943 Change-Id: Idf3b05cfea63c9578c726e5ed7b5afacd9e1a495 --- .../delegates/nnapi_delegate_provider.cc | 32 +++++++++++-------- tensorflow/lite/tools/evaluation/BUILD | 2 +- .../evaluation_delegate_provider_test.cc | 4 +++ tensorflow/lite/tools/evaluation/utils.cc | 8 ++--- tensorflow/lite/tools/evaluation/utils.h | 6 ++-- 5 files changed, 31 insertions(+), 21 deletions(-) diff --git a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc index 2fbfb791e8c..bde9c0e03e3 100644 --- a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc @@ -26,6 +26,7 @@ namespace tools { class NnapiDelegateProvider : public DelegateProvider { public: NnapiDelegateProvider() { +#if defined(__ANDROID__) default_params_.AddParam("use_nnapi", ToolParam::Create(false)); default_params_.AddParam("nnapi_execution_preference", ToolParam::Create("")); @@ -35,6 +36,7 @@ class NnapiDelegateProvider : public DelegateProvider { ToolParam::Create(false)); default_params_.AddParam("nnapi_allow_fp16", ToolParam::Create(false)); +#endif } std::vector CreateFlags(ToolParams* params) const final; @@ -49,18 +51,21 @@ REGISTER_DELEGATE_PROVIDER(NnapiDelegateProvider); std::vector NnapiDelegateProvider::CreateFlags(ToolParams* params) const { std::vector flags = { - CreateFlag("use_nnapi", params, "use nnapi delegate api"), - CreateFlag("nnapi_execution_preference", params, - "execution preference for nnapi delegate. Should " - "be one of the following: fast_single_answer, " - "sustained_speed, low_power, undefined"), - CreateFlag( - "nnapi_accelerator_name", params, - "the name of the nnapi accelerator to use (requires Android Q+)"), - CreateFlag("disable_nnapi_cpu", params, - "Disable the NNAPI CPU device"), - CreateFlag("nnapi_allow_fp16", params, - "Allow fp32 computation to be run in fp16")}; +#if defined(__ANDROID__) + CreateFlag("use_nnapi", params, "use nnapi delegate api"), + CreateFlag("nnapi_execution_preference", params, + "execution preference for nnapi delegate. 
Should " + "be one of the following: fast_single_answer, " + "sustained_speed, low_power, undefined"), + CreateFlag( + "nnapi_accelerator_name", params, + "the name of the nnapi accelerator to use (requires Android Q+)"), + CreateFlag("disable_nnapi_cpu", params, + "Disable the NNAPI CPU device"), + CreateFlag("nnapi_allow_fp16", params, + "Allow fp32 computation to be run in fp16") +#endif + }; return flags; } @@ -98,6 +103,7 @@ void NnapiDelegateProvider::LogParams(const ToolParams& params) const { TfLiteDelegatePtr NnapiDelegateProvider::CreateTfLiteDelegate( const ToolParams& params) const { TfLiteDelegatePtr delegate(nullptr, [](TfLiteDelegate*) {}); +#if defined(__ANDROID__) if (params.Get("use_nnapi")) { StatefulNnApiDelegate::Options options; std::string accelerator_name = @@ -157,7 +163,7 @@ TfLiteDelegatePtr NnapiDelegateProvider::CreateTfLiteDelegate( << params.Get("nnapi_execution_preference") << ") to be used."; } - +#endif return delegate; } diff --git a/tensorflow/lite/tools/evaluation/BUILD b/tensorflow/lite/tools/evaluation/BUILD index 1bc35211b0a..f6d2c8d8141 100644 --- a/tensorflow/lite/tools/evaluation/BUILD +++ b/tensorflow/lite/tools/evaluation/BUILD @@ -41,10 +41,10 @@ cc_library( copts = tflite_copts(), deps = [ "//tensorflow/lite/c:common", - "//tensorflow/lite/delegates/nnapi:nnapi_delegate", ] + select({ "//tensorflow:android": [ "//tensorflow/lite/delegates/gpu:delegate", + "//tensorflow/lite/delegates/nnapi:nnapi_delegate", ], "//conditions:default": [], }) + select({ diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider_test.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider_test.cc index 5d0a4dfa7d3..0282b258082 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider_test.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider_test.cc @@ -43,7 +43,11 @@ TEST(EvaluationDelegateProviderTest, CreateTfLiteDelegate) { TEST(EvaluationDelegateProviderTest, DelegateProvidersParams) { DelegateProviders providers; const auto& params = providers.GetAllParams(); +#if defined(__ANDROID__) EXPECT_TRUE(params.HasParam("use_nnapi")); +#else + EXPECT_FALSE(params.HasParam("use_nnapi")); +#endif EXPECT_TRUE(params.HasParam("use_gpu")); int argc = 3; diff --git a/tensorflow/lite/tools/evaluation/utils.cc b/tensorflow/lite/tools/evaluation/utils.cc index 33967b6f4ea..c766a932999 100644 --- a/tensorflow/lite/tools/evaluation/utils.cc +++ b/tensorflow/lite/tools/evaluation/utils.cc @@ -101,20 +101,18 @@ TfLiteDelegatePtr CreateNNAPIDelegate() { // NnApiDelegate() returns a singleton, so provide a no-op deleter. 
[](TfLiteDelegate*) {}); #else - return TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); + return CreateNullDelegate(); #endif // defined(__ANDROID__) } -TfLiteDelegatePtr CreateNNAPIDelegate(StatefulNnApiDelegate::Options options) { #if defined(__ANDROID__) +TfLiteDelegatePtr CreateNNAPIDelegate(StatefulNnApiDelegate::Options options) { return TfLiteDelegatePtr( new StatefulNnApiDelegate(options), [](TfLiteDelegate* delegate) { delete reinterpret_cast(delegate); }); -#else - return CreateNullDelegate(); -#endif // defined(__ANDROID__) } +#endif // defined(__ANDROID__) #if defined(__ANDROID__) TfLiteDelegatePtr CreateGPUDelegate(TfLiteGpuDelegateOptionsV2* options) { diff --git a/tensorflow/lite/tools/evaluation/utils.h b/tensorflow/lite/tools/evaluation/utils.h index ef2c609723e..7269d04a802 100644 --- a/tensorflow/lite/tools/evaluation/utils.h +++ b/tensorflow/lite/tools/evaluation/utils.h @@ -16,12 +16,14 @@ limitations under the License. #ifndef TENSORFLOW_LITE_TOOLS_EVALUATION_UTILS_H_ #define TENSORFLOW_LITE_TOOLS_EVALUATION_UTILS_H_ +#include #include #include #include #if defined(__ANDROID__) #include "tensorflow/lite/delegates/gpu/delegate.h" +#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" #if (defined(__arm__) || defined(__aarch64__)) #include "tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate.h" #endif @@ -33,7 +35,6 @@ limitations under the License. #endif #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" namespace tflite { namespace evaluation { @@ -61,8 +62,9 @@ inline TfLiteStatus GetSortedFileNames(const std::string& directory, } TfLiteDelegatePtr CreateNNAPIDelegate(); - +#if defined(__ANDROID__) TfLiteDelegatePtr CreateNNAPIDelegate(StatefulNnApiDelegate::Options options); +#endif TfLiteDelegatePtr CreateGPUDelegate(); #if defined(__ANDROID__) From 393e92ae5f928cdf75465fd905aa2e17650ab987 Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Thu, 28 May 2020 10:44:42 -0700 Subject: [PATCH 1312/1533] Explicitly specify input array `dtype` to TensorFlow's `assertAllEqual()` test util function. 
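As context for the test change below: `RandomHeight`/`RandomWidth` resize their input and, unless nearest-neighbor interpolation is used, return float32 even for integer inputs, so the test casts the layer output back to the input dtype before `assertAllEqual`. A minimal sketch of the same pattern (illustrative only; it assumes the layers are exported as `tf.keras.layers.experimental.preprocessing.RandomHeight` in this TF version):

```python
import numpy as np
import tensorflow as tf

input_image = np.reshape(np.arange(0, 6), (2, 3, 1)).astype("int64")
layer = tf.keras.layers.experimental.preprocessing.RandomHeight(factor=(1., 1.))

# The bilinear resize inside the layer yields float32, so cast back to the
# input dtype before comparing against integer expectations.
output_image = tf.cast(layer(input_image[np.newaxis, ...]), dtype="int64")
```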
PiperOrigin-RevId: 313616979 Change-Id: Id3aabf89bb7a05e1f338fb05b20da3a0848a0440 --- .../layers/preprocessing/image_preprocessing_test.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 14720d3541d..f00a9657039 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -29,6 +29,7 @@ from tensorflow.python.keras.layers.preprocessing import image_preprocessing from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.ops import gen_stateful_random_ops from tensorflow.python.ops import image_ops_impl as image_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import stateless_random_ops from tensorflow.python.platform import test @@ -1114,7 +1115,10 @@ class RandomHeightTest(keras_parameterized.TestCase): with tf_test_util.use_gpu(): input_image = np.reshape(np.arange(0, 6), (2, 3, 1)).astype(dtype) layer = image_preprocessing.RandomHeight(factor=(1., 1.)) - output_image = layer(np.expand_dims(input_image, axis=0)) + # Return type of RandomHeight() is float32 if `interpolation` is not + # set to `ResizeMethod.NEAREST_NEIGHBOR`; cast `layer` to desired dtype. + output_image = math_ops.cast(layer(np.expand_dims(input_image, axis=0)), + dtype=dtype) # pyformat: disable expected_output = np.asarray([ [0, 1, 2], @@ -1202,7 +1206,10 @@ class RandomWidthTest(keras_parameterized.TestCase): with tf_test_util.use_gpu(): input_image = np.reshape(np.arange(0, 6), (3, 2, 1)).astype(dtype) layer = image_preprocessing.RandomWidth(factor=(1., 1.)) - output_image = layer(np.expand_dims(input_image, axis=0)) + # Return type of RandomWidth() is float32 if `interpolation` is not + # set to `ResizeMethod.NEAREST_NEIGHBOR`; cast `layer` to desired dtype. + output_image = math_ops.cast(layer(np.expand_dims(input_image, axis=0)), + dtype=dtype) # pyformat: disable expected_output = np.asarray([ [0, 0.25, 0.75, 1], From 8d31fb4b765260a6d0bcaf825ca68766b5b798ee Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Thu, 28 May 2020 10:47:27 -0700 Subject: [PATCH 1313/1533] Refactor values.py into a utility file and a PS values file. 
PiperOrigin-RevId: 313617630 Change-Id: Ie51b0f69af65b3f85701f58190da2c7eb46e1d29 --- tensorflow/python/distribute/BUILD | 101 ++++- .../python/distribute/cross_device_ops.py | 5 +- .../distribute/parameter_server_strategy.py | 9 +- .../parameter_server_strategy_test.py | 7 +- tensorflow/python/distribute/ps_values.py | 304 +++++++++++++++ .../python/distribute/ps_values_test.py | 65 ++++ tensorflow/python/distribute/values.py | 367 +----------------- tensorflow/python/distribute/values_test.py | 32 -- tensorflow/python/distribute/values_util.py | 91 +++++ .../keras/mixed_precision/experimental/BUILD | 5 +- .../experimental/autocast_variable.py | 6 +- tensorflow/python/module/module_test.py | 7 +- 12 files changed, 590 insertions(+), 409 deletions(-) create mode 100644 tensorflow/python/distribute/ps_values.py create mode 100644 tensorflow/python/distribute/ps_values_test.py create mode 100644 tensorflow/python/distribute/values_util.py diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 585af92d15c..ed93e87088c 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -67,6 +67,7 @@ py_library( ":collective_util", ":cross_device_utils", ":device_util", + ":ps_values", ":reduce_util", ":tpu_values", ":values", @@ -78,7 +79,9 @@ py_library( "//tensorflow/python:platform", "//tensorflow/python:resource_variable_ops", "//tensorflow/python:tensor_util", + "//tensorflow/python:tf_export", "//tensorflow/python/eager:context", + "//tensorflow/python/eager:executor", "//tensorflow/tools/docs:doc_controls", "@six_archive//:six", ], @@ -315,18 +318,23 @@ py_library( visibility = ["//tensorflow:internal"], deps = [ ":cross_device_ops", + ":device_util", + ":distribute_lib", ":input_lib", ":mirrored_run", ":multi_worker_util", ":numpy_dataset", - ":reduce_util", + ":ps_values", ":values", - "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", + "//tensorflow/python:device", "//tensorflow/python:framework_ops", + "//tensorflow/python:platform", "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:tf_export", "//tensorflow/python:training", "//tensorflow/python:util", + "//tensorflow/python:variable_scope", "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", "//tensorflow/python/eager:context", ], @@ -671,12 +679,14 @@ py_library( ":device_util", ":distribute_lib", ":reduce_util", + ":values_util", "//tensorflow/python:array_ops", "//tensorflow/python:composite_tensor", "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", "//tensorflow/python:tensor_util", + "//tensorflow/python:tf_export", "//tensorflow/python:type_spec", "//tensorflow/python:util", "//tensorflow/python:variable_scope", @@ -686,6 +696,34 @@ py_library( "//tensorflow/python/training/saving:saveable_object", "//tensorflow/python/training/saving:saveable_object_util", "//tensorflow/python/training/tracking:base", + "//tensorflow/python/types", + ], +) + +py_library( + name = "ps_values", + srcs = ["ps_values.py"], + deps = [ + ":distribute_lib", + ":values", + ":values_util", + "//tensorflow/python:framework_ops", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//tensorflow/python/training/tracking:base", + "//tensorflow/python/types", + ], +) + +py_library( + name = "values_util", + srcs = ["values_util.py"], + deps = [ + ":distribute_lib", + ":reduce_util", + "//tensorflow/python:framework_ops", + "//tensorflow/python:tensor_util", 
+ "//tensorflow/python:variable_scope", ], ) @@ -1037,23 +1075,57 @@ distribute_py_test( ], deps = [ ":combinations", - ":device_util", ":distribute_lib", - ":mirrored_strategy", - ":parameter_server_strategy", ":strategy_combinations", + ":tpu_strategy", + ":tpu_values", ":values", "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", "//tensorflow/python:framework_test_lib", + "//tensorflow/python:indexed_slices", "//tensorflow/python:math_ops", + "//tensorflow/python:random_ops", + "//tensorflow/python:saver", + "//tensorflow/python:sparse_ops", + "//tensorflow/python:sparse_tensor", + "//tensorflow/python:tensor_spec", + "//tensorflow/python:tf2", "//tensorflow/python:training", + "//tensorflow/python:util", "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//tensorflow/python/distribute/cluster_resolver:tpu_cluster_resolver_py", "//tensorflow/python/eager:context", + "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:test", "//tensorflow/python/saved_model/model_utils:mode_keys", + "//tensorflow/python/tpu:tpu_lib", + "//tensorflow/python/types", + "@absl_py//absl/testing:parameterized", + ], +) + +distribute_py_test( + name = "ps_values_test", + size = "medium", + srcs = ["ps_values_test.py"], + main = "ps_values_test.py", + tags = [ + "multi_and_single_gpu", + ], + deps = [ + ":combinations", + ":ps_values", + ":strategy_combinations", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/eager:test", "@absl_py//absl/testing:parameterized", ], ) @@ -1570,10 +1642,13 @@ cuda_py_test( deps = [ ":central_storage_strategy", ":combinations", + ":device_util", + ":distribute_lib", ":multi_worker_test_base", ":multi_worker_util", ":parameter_server_strategy", - ":strategy_combinations", + ":ps_values", + ":reduce_util", ":strategy_test_lib", ":values", "//tensorflow/core:protos_all_py", @@ -1581,16 +1656,22 @@ cuda_py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:constant_op", "//tensorflow/python:control_flow_ops", + "//tensorflow/python:errors", "//tensorflow/python:framework_ops", "//tensorflow/python:gradients", - "//tensorflow/python:session", + "//tensorflow/python:math_ops", + "//tensorflow/python:partitioned_variables", + "//tensorflow/python:resource_variable_ops", "//tensorflow/python:tensor_util", - "//tensorflow/python:training", + "//tensorflow/python:training_util", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", + "//tensorflow/python/eager:backprop", "//tensorflow/python/eager:context", - "//tensorflow/python/estimator:estimator_py", - "//tensorflow/python/keras/layers", + "//tensorflow/python/estimator:run_config", + "//tensorflow/python/keras/layers:core", "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 8c8970f4aeb..aaca66833e0 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -19,15 +19,16 @@ from __future__ import division from __future__ import print_function import collections +import enum import threading -import enum import six from 
tensorflow.python.client import device_lib from tensorflow.python.distribute import collective_util from tensorflow.python.distribute import cross_device_utils from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import ps_values from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import tpu_values from tensorflow.python.distribute import values as value_lib @@ -64,7 +65,7 @@ def validate_destinations(destinations): """Validates the `destination` is one of expected types.""" if not isinstance( destinations, - (value_lib.DistributedValues, ops.Tensor, value_lib.AggregatingVariable, + (value_lib.DistributedValues, ops.Tensor, ps_values.AggregatingVariable, six.string_types, tpu_values.TPUMirroredVariable )) and not resource_variable_ops.is_resource_variable(destinations): raise ValueError("destinations must be one of a `DistributedValues` object," diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index ce2b7ceb159..142684bb3e9 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -28,6 +28,7 @@ from tensorflow.python.distribute import input_lib from tensorflow.python.distribute import mirrored_run from tensorflow.python.distribute import multi_worker_util from tensorflow.python.distribute import numpy_dataset +from tensorflow.python.distribute import ps_values from tensorflow.python.distribute import values from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver @@ -441,8 +442,8 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): # Create and wrap the variable. v = next_creator(**kwargs) - wrapped = values.AggregatingVariable( - self._container_strategy(), v, aggregation) + wrapped = ps_values.AggregatingVariable(self._container_strategy(), v, + aggregation) # Add the wrapped variable to the requested collections. 
# The handling of eager mode and the global step matches @@ -539,7 +540,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): return nest.map_structure(_select_fn, structured) def _update(self, var, fn, args, kwargs, group): - if isinstance(var, values.AggregatingVariable): + if isinstance(var, ps_values.AggregatingVariable): var = var.get() if not resource_variable_ops.is_resource_variable(var): raise ValueError( @@ -569,7 +570,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): def value_container(self, val): if (hasattr(val, "_aggregating_container") and - not isinstance(val, values.AggregatingVariable)): + not isinstance(val, ps_values.AggregatingVariable)): wrapper = val._aggregating_container() # pylint: disable=protected-access if wrapper is not None: return wrapper diff --git a/tensorflow/python/distribute/parameter_server_strategy_test.py b/tensorflow/python/distribute/parameter_server_strategy_test.py index de5cf9e90d1..d67ed72a576 100644 --- a/tensorflow/python/distribute/parameter_server_strategy_test.py +++ b/tensorflow/python/distribute/parameter_server_strategy_test.py @@ -31,6 +31,7 @@ from tensorflow.python.distribute import distribution_strategy_context as ds_con from tensorflow.python.distribute import multi_worker_test_base from tensorflow.python.distribute import multi_worker_util from tensorflow.python.distribute import parameter_server_strategy +from tensorflow.python.distribute import ps_values from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_test_lib from tensorflow.python.distribute import values @@ -796,8 +797,8 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, msg=('created_step %s type %s vs. get_step %s type %s' % (id(created_step), created_step.__class__.__name__, id(get_step), get_step.__class__.__name__))) - self.assertIs(values.AggregatingVariable, type(created_step)) - self.assertIs(values.AggregatingVariable, type(get_step)) + self.assertIs(ps_values.AggregatingVariable, type(created_step)) + self.assertIs(ps_values.AggregatingVariable, type(get_step)) self.assertIs(strategy, created_step.distribute_strategy) @combinations.generate(combinations.combine(mode=['graph'])) @@ -828,7 +829,7 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, _ = v * v v, = tape.watched_variables() w = strategy.extended.value_container(v) - self.assertIs(values.AggregatingVariable, type(w)) + self.assertIs(ps_values.AggregatingVariable, type(w)) strategy.extended.call_for_each_replica(f) diff --git a/tensorflow/python/distribute/ps_values.py b/tensorflow/python/distribute/ps_values.py new file mode 100644 index 00000000000..37cd6e12d90 --- /dev/null +++ b/tensorflow/python/distribute/ps_values.py @@ -0,0 +1,304 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Various classes representing distributed values for PS.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import weakref + +from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute import distribution_strategy_context as ds_context +from tensorflow.python.distribute import values +from tensorflow.python.distribute import values_util +from tensorflow.python.framework import ops +from tensorflow.python.ops import variable_scope as vs +from tensorflow.python.ops import variables as variables_lib +from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.types import core + + +# Variable used in PSStrategy TF 1 and CentralStorageStrategy. +class AggregatingVariable(variables_lib.Variable, core.Tensor): + """A wrapper around a variable that aggregates updates across replicas.""" + + def __init__(self, strategy, v, aggregation): + self._distribute_strategy = strategy + self._v = v + # NOTE: We don't use "_distributed_container" here because we don't want + # to trigger that code path in regroup(). + v._aggregating_container = weakref.ref(self) # pylint: disable=protected-access + self._aggregation = aggregation + + def get(self): + return self._v + + @property + def distribute_strategy(self): + return self._distribute_strategy + + def __getattr__(self, name): + return getattr(self._v, name) + + def _assign_func(self, *args, **kwargs): + with ds_context.enter_or_assert_strategy(self._distribute_strategy): + f = kwargs.pop("f") + if ds_context.in_cross_replica_context(): + if distribute_lib.get_update_replica_id() is not None: + # We are calling an assign function in an update context. + return f(self._v, *args, **kwargs) + + # We are calling an assign function in cross replica context, wrap it in + # an update call. + return self._distribute_strategy.extended.update( + self, f, args=args, kwargs=kwargs) + else: + replica_context = ds_context.get_replica_context() + assert replica_context + # We are calling an assign function in replica context. + # We reduce the value we want to assign/add/sub. More details about how + # we handle the different use cases can be found in the _reduce method. + # We call the function with the reduced value. 
+ if self._aggregation == vs.VariableAggregation.NONE: + raise ValueError( + values_util.aggregation_error_msg.format( + variable_type="AggregatingVariable")) + + def merge_fn(strategy, + value, + use_locking=False, + name=None, + read_value=True): + v = values_util.apply_aggregation(strategy, value, self._aggregation, + self) + if name and isinstance(name, values.PerReplica): + name = name.values[0] + return strategy.extended.update( + self, + f, + args=(v,), + kwargs={ + "use_locking": use_locking, + "name": name, + "read_value": read_value + }) + return replica_context.merge_call(merge_fn, args=args, kwargs=kwargs) + + def assign_sub(self, *args, **kwargs): + assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw) + return self._assign_func(f=assign_sub_fn, *args, **kwargs) + + def assign_add(self, *args, **kwargs): + assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw) + return self._assign_func(f=assign_add_fn, *args, **kwargs) + + def assign(self, *args, **kwargs): + assign_fn = lambda var, *a, **kw: var.assign(*a, **kw) + return self._assign_func(f=assign_fn, *args, **kwargs) + + @property + def initializer(self): + return self._v.initializer + + def initialized_value(self): + return self._v.initialized_value() + + @property + def initial_value(self): + return self._v.initial_value + + @property + def op(self): + return self._v.op + + def read_value(self): + return self._v.read_value() + + def eval(self, session=None): + return self._v.eval(session) + + @property + def graph(self): + return self._v.graph + + @property + def device(self): + return self._v.device + + @property + def shape(self): + return self._v.shape + + @property + def aggregation(self): + return self._aggregation + + @property + def synchronization(self): + return self._v.synchronization + + @property + def name(self): + return self._v.name + + @property + def trainable(self): + return self._v.trainable + + @property + def dtype(self): + return self._v.dtype + + # TODO(josh11b): Test saving & restoring. 
+ def _gather_saveables_for_checkpoint(self): + return {trackable.VARIABLE_VALUE_KEY: self._v} + + # pylint: disable=multiple-statements + def __add__(self, o): + return self._v + o + + def __radd__(self, o): + return o + self._v + + def __sub__(self, o): + return self._v - o + + def __rsub__(self, o): + return o - self._v + + def __mul__(self, o): + return self._v * o + + def __rmul__(self, o): + return o * self._v + + def __truediv__(self, o): + return self._v / o + + def __rtruediv__(self, o): + return o / self._v + + def __floordiv__(self, o): + return self._v // o + + def __rfloordiv__(self, o): + return o // self._v + + def __mod__(self, o): + return self._v % o + + def __rmod__(self, o): + return o % self._v + + def __lt__(self, o): + return self._v < o + + def __le__(self, o): + return self._v <= o + + def __gt__(self, o): + return self._v > o + + def __ge__(self, o): + return self._v >= o + + def __and__(self, o): + return self._v & o + + def __rand__(self, o): + return o & self._v + + def __or__(self, o): + return self._v | o + + def __ror__(self, o): + return o | self._v + + def __xor__(self, o): + return self._v ^ o + + def __rxor__(self, o): + return o ^ self._v + + def __getitem__(self, o): + return self._v[o] + + def __pow__(self, o, modulo=None): + return pow(self._v, o, modulo) + + def __rpow__(self, o): + return pow(o, self._v) + + def __invert__(self): + return ~self._v + + def __neg__(self): + return -self._v + + def __abs__(self): + return abs(self._v) + + def __div__(self, o): + try: + return self._v.__div__(o) + except AttributeError: + # See https://docs.python.org/3/library/constants.html#NotImplemented + return NotImplemented + + def __rdiv__(self, o): + try: + return self._v.__rdiv__(o) + except AttributeError: + # See https://docs.python.org/3/library/constants.html#NotImplemented + return NotImplemented + + def __matmul__(self, o): + try: + return self._v.__matmul__(o) + except AttributeError: + # See https://docs.python.org/3/library/constants.html#NotImplemented + return NotImplemented + + def __rmatmul__(self, o): + try: + return self._v.__rmatmul__(o) + except AttributeError: + # See https://docs.python.org/3/library/constants.html#NotImplemented + return NotImplemented + + def __str__(self): + return str(self._v) + + def __repr__(self): + return repr(self._v) + + def _should_act_as_resource_variable(self): + """Pass resource_variable_ops.is_resource_variable check.""" + pass + + def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): + return ops.convert_to_tensor(self.get(), dtype=dtype, name=name, + as_ref=as_ref) + + +# Register a conversion function which reads the value of the variable, +# allowing instances of the class to be used as tensors. +def _tensor_conversion_aggregate(var, dtype=None, name=None, as_ref=False): + return var._dense_var_to_tensor(dtype, name, as_ref) # pylint: disable=protected-access + + +ops.register_tensor_conversion_function(AggregatingVariable, + _tensor_conversion_aggregate) diff --git a/tensorflow/python/distribute/ps_values_test.py b/tensorflow/python/distribute/ps_values_test.py new file mode 100644 index 00000000000..b8d6b3f35a0 --- /dev/null +++ b/tensorflow/python/distribute/ps_values_test.py @@ -0,0 +1,65 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for the distributed values library.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import ps_values +from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.eager import def_function +from tensorflow.python.eager import test +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables as variables_lib + + +@combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.central_storage_strategy_with_two_gpus + ], + mode=["graph", "eager"])) +class AggregatingVariableTest(test.TestCase, parameterized.TestCase): + + def testAssignOutOfScope(self, distribution): + with distribution.scope(): + aggregating = variables_lib.Variable(1.) + self.assertIsInstance(aggregating, ps_values.AggregatingVariable) + self.evaluate(aggregating.assign(3.)) + self.assertEqual(self.evaluate(aggregating.read_value()), 3.) + self.assertEqual(self.evaluate(aggregating._v.read_value()), 3.) + + def testAssignAdd(self, distribution): + with distribution.scope(): + v = variable_scope.variable( + 1, aggregation=variables_lib.VariableAggregation.MEAN) + self.evaluate(variables_lib.global_variables_initializer()) + + @def_function.function + def assign(): + return v.assign_add(2) + + per_replica_results = self.evaluate( + distribution.experimental_local_results( + distribution.experimental_run_v2(assign))) + self.assertAllEqual([3], per_replica_results) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index d03628f4714..9c830f7081c 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -18,12 +18,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import weakref from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import values_util from tensorflow.python.eager import context from tensorflow.python.eager import tape from tensorflow.python.framework import composite_tensor @@ -43,72 +43,6 @@ from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export -# Utility functions used by the different classes below. 
-def _get_current_replica_id_as_int(): - """Returns the current replica ID as an integer, or `None`.""" - replica_context = ds_context.get_replica_context() - if replica_context: - replica_id = replica_context.replica_id_in_sync_group - if not isinstance(replica_id, int): - replica_id = tensor_util.constant_value(replica_id) - else: - replica_id = distribute_lib.get_update_replica_id() - return replica_id - - -def _assign_on_device(device, variable, tensor): - with ops.device(device): - return variable.assign(tensor) - - -def _assign_add_on_device(device, variable, tensor): - with ops.device(device): - return variable.assign_add(tensor) - - -def _assign_sub_on_device(device, variable, tensor): - with ops.device(device): - return variable.assign_sub(tensor) - - -def _assert_replica_context(strategy): - replica_context = ds_context.get_replica_context() - if not replica_context: - raise RuntimeError( - "Replica-local variables may only be assigned in a replica context.") - if replica_context.strategy is not strategy: - raise RuntimeError( - "Replica-local variables may only be assigned in a replica context.") - - -def _apply_aggregation(strategy, value, aggregation, destinations): - if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: - return strategy.extended.broadcast_to( - strategy.experimental_local_results(value)[0], - destinations=destinations) - reduce_op = reduce_util.ReduceOp.from_variable_aggregation(aggregation) - return strategy.extended.reduce_to(reduce_op, value, destinations) - - -_aggregation_error_msg = ( - "You must specify an aggregation method to update a " - "{variable_type} in Replica Context. You can do so by passing " - "an explicit value for argument `aggregation` to tf.Variable(..)." - "e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)`" - "`tf.VariableAggregation` lists the possible aggregation methods." - "This is required because {variable_type} should always be " - "kept in sync. When updating them or assigning to them in a " - "replica context, we automatically try to aggregate the values " - "before updating the variable. For this aggregation, we need to " - "know the aggregation method. " - "Another alternative is to not try to update such " - "{variable_type} in replica context, but in cross replica " - "context. You can enter cross replica context by calling " - "`tf.distribute.get_replica_context().merge_call(merge_fn, ..)`." - "Inside `merge_fn`, you can then update the {variable_type} " - "using `tf.distribute.StrategyExtended.update()`.") - - @tf_export("distribute.DistributedValues", v1=[]) class DistributedValues(object): """Base class for representing distributed values. @@ -182,7 +116,7 @@ class DistributedValues(object): def _get(self): """Returns the value for the current device or raises a ValueError.""" - replica_id = _get_current_replica_id_as_int() + replica_id = values_util.get_current_replica_id_as_int() if replica_id is None: return self._get_cross_replica() else: @@ -195,7 +129,7 @@ class DistributedValues(object): def _get_on_device_or_primary(self): """Returns value in same replica or device if possible, else the _primary.""" - replica_id = _get_current_replica_id_as_int() + replica_id = values_util.get_current_replica_id_as_int() if replica_id is None: # Try to find a value on the current device. 
current_device = device_util.canonicalize(device_util.current()) @@ -568,7 +502,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, @property def handle(self): - replica_id = _get_current_replica_id_as_int() + replica_id = values_util.get_current_replica_id_as_int() if replica_id is None: raise ValueError("`handle` is not available outside the replica context" " or a `tf.distribute.Strategy.update()` call.") @@ -774,7 +708,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return update_fn(self._values[update_replica_id], value, **kwargs) return self._update_cross_replica(update_fn, value, **kwargs) else: - _assert_replica_context(self.distribute_strategy) + values_util.assert_replica_context(self.distribute_strategy) return self._update_replica(update_fn, value, **kwargs) def _should_act_as_resource_variable(self): @@ -794,7 +728,7 @@ class _MirroredSaveable(saveable_object_util.ResourceVariableSaveable): tensor, = restored_tensors return control_flow_ops.group( tuple( - _assign_on_device(v.device, v, tensor) + values_util.assign_on_device(v.device, v, tensor) for v in self._mirrored_variable.values)) @@ -804,7 +738,8 @@ class MirroredVariable(DistributedVariable, Mirrored): def _update_replica(self, update_fn, value, **kwargs): if self.aggregation == vs.VariableAggregation.NONE: raise ValueError( - _aggregation_error_msg.format(variable_type="MirroredVariable")) + values_util.aggregation_error_msg.format( + variable_type="MirroredVariable")) def merge_fn(strategy, value, **kwargs): """Aggregate values and update all variables in cross replica context.""" @@ -824,7 +759,7 @@ class MirroredVariable(DistributedVariable, Mirrored): "cross-replica context.") assert strategy == self.distribute_strategy - v = _apply_aggregation(strategy, value, self.aggregation, self) + v = values_util.apply_aggregation(strategy, value, self.aggregation, self) return self._update_cross_replica(update_fn, v, **kwargs) return ds_context.get_replica_context().merge_call( @@ -930,7 +865,7 @@ class _SyncOnReadSaveable(saveable_object.SaveableObject): self._sync_on_read_variable.dtype) return control_flow_ops.group( tuple( - _assign_on_device(v.device, v, tensor) + values_util.assign_on_device(v.device, v, tensor) for v in self._sync_on_read_variable.values)) @@ -960,8 +895,8 @@ class SyncOnReadVariable(DistributedVariable): "SyncOnReadVariable does not support `assign_sub` in " "cross-replica context when aggregation is set to " "`tf.VariableAggregation.SUM`.") - return self._assign_on_each_device(_assign_sub_on_device, value, - read_value) + return self._assign_on_each_device(values_util.assign_sub_on_device, + value, read_value) else: return super(SyncOnReadVariable, self).assign_sub(value, use_locking, name, read_value) @@ -974,8 +909,8 @@ class SyncOnReadVariable(DistributedVariable): "SyncOnReadVariable does not support `assign_add` in " "cross-replica context when aggregation is set to " "`tf.VariableAggregation.SUM`.") - return self._assign_on_each_device(_assign_add_on_device, value, - read_value) + return self._assign_on_each_device(values_util.assign_add_on_device, + value, read_value) else: return super(SyncOnReadVariable, self).assign_add(value, use_locking, name, read_value) @@ -988,7 +923,7 @@ class SyncOnReadVariable(DistributedVariable): # when saving. 
if self._aggregation == vs.VariableAggregation.SUM: value = math_ops.cast(value / len(self._values), self.dtype) - return self._assign_on_each_device(_assign_on_device, value, + return self._assign_on_each_device(values_util.assign_on_device, value, read_value) else: return super(SyncOnReadVariable, @@ -1388,275 +1323,3 @@ def validate_colocate(v, extended): "`colocate_vars_with` must only be passed a variable created in this " "tf.distribute.Strategy.scope(), not: %r" % (v,)) _validate_colocate_extended(v, extended) - - -# Variable used in PSStrategy TF 1 and CentralStorageStrategy. -class AggregatingVariable(variables_lib.Variable, core.Tensor): - """A wrapper around a variable that aggregates updates across replicas.""" - - def __init__(self, strategy, v, aggregation): - self._distribute_strategy = strategy - self._v = v - # NOTE: We don't use "_distributed_container" here because we don't want - # to trigger that code path in regroup(). - v._aggregating_container = weakref.ref(self) # pylint: disable=protected-access - self._aggregation = aggregation - - def get(self): - return self._v - - @property - def distribute_strategy(self): - return self._distribute_strategy - - def __getattr__(self, name): - return getattr(self._v, name) - - def _assign_func(self, *args, **kwargs): - with ds_context.enter_or_assert_strategy(self._distribute_strategy): - f = kwargs.pop("f") - if ds_context.in_cross_replica_context(): - if distribute_lib.get_update_replica_id() is not None: - # We are calling an assign function in an update context. - return f(self._v, *args, **kwargs) - - # We are calling an assign function in cross replica context, wrap it in - # an update call. - return self._distribute_strategy.extended.update( - self, f, args=args, kwargs=kwargs) - else: - replica_context = ds_context.get_replica_context() - assert replica_context - # We are calling an assign function in replica context. - # We reduce the value we want to assign/add/sub. More details about how - # we handle the different use cases can be found in the _reduce method. - # We call the function with the reduced value. 
- if self._aggregation == vs.VariableAggregation.NONE: - raise ValueError( - _aggregation_error_msg.format( - variable_type="AggregatingVariable")) - - def merge_fn(strategy, - value, - use_locking=False, - name=None, - read_value=True): - v = _apply_aggregation(strategy, value, self._aggregation, self) - if name and isinstance(name, PerReplica): - name = name.values[0] - return strategy.extended.update( - self, - f, - args=(v,), - kwargs={ - "use_locking": use_locking, - "name": name, - "read_value": read_value - }) - return replica_context.merge_call(merge_fn, args=args, kwargs=kwargs) - - def assign_sub(self, *args, **kwargs): - assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw) - return self._assign_func(f=assign_sub_fn, *args, **kwargs) - - def assign_add(self, *args, **kwargs): - assign_add_fn = lambda var, *a, **kw: var.assign_add(*a, **kw) - return self._assign_func(f=assign_add_fn, *args, **kwargs) - - def assign(self, *args, **kwargs): - assign_fn = lambda var, *a, **kw: var.assign(*a, **kw) - return self._assign_func(f=assign_fn, *args, **kwargs) - - @property - def initializer(self): - return self._v.initializer - - def initialized_value(self): - return self._v.initialized_value() - - @property - def initial_value(self): - return self._v.initial_value - - @property - def op(self): - return self._v.op - - def read_value(self): - return self._v.read_value() - - def eval(self, session=None): - return self._v.eval(session) - - @property - def graph(self): - return self._v.graph - - @property - def device(self): - return self._v.device - - @property - def shape(self): - return self._v.shape - - @property - def aggregation(self): - return self._aggregation - - @property - def synchronization(self): - return self._v.synchronization - - @property - def name(self): - return self._v.name - - @property - def trainable(self): - return self._v.trainable - - @property - def dtype(self): - return self._v.dtype - - # TODO(josh11b): Test saving & restoring. 
- def _gather_saveables_for_checkpoint(self): - return {trackable.VARIABLE_VALUE_KEY: self._v} - - # pylint: disable=multiple-statements - def __add__(self, o): - return self._v + o - - def __radd__(self, o): - return o + self._v - - def __sub__(self, o): - return self._v - o - - def __rsub__(self, o): - return o - self._v - - def __mul__(self, o): - return self._v * o - - def __rmul__(self, o): - return o * self._v - - def __truediv__(self, o): - return self._v / o - - def __rtruediv__(self, o): - return o / self._v - - def __floordiv__(self, o): - return self._v // o - - def __rfloordiv__(self, o): - return o // self._v - - def __mod__(self, o): - return self._v % o - - def __rmod__(self, o): - return o % self._v - - def __lt__(self, o): - return self._v < o - - def __le__(self, o): - return self._v <= o - - def __gt__(self, o): - return self._v > o - - def __ge__(self, o): - return self._v >= o - - def __and__(self, o): - return self._v & o - - def __rand__(self, o): - return o & self._v - - def __or__(self, o): - return self._v | o - - def __ror__(self, o): - return o | self._v - - def __xor__(self, o): - return self._v ^ o - - def __rxor__(self, o): - return o ^ self._v - - def __getitem__(self, o): - return self._v[o] - - def __pow__(self, o, modulo=None): - return pow(self._v, o, modulo) - - def __rpow__(self, o): - return pow(o, self._v) - - def __invert__(self): - return ~self._v - - def __neg__(self): - return -self._v - - def __abs__(self): - return abs(self._v) - - def __div__(self, o): - try: - return self._v.__div__(o) - except AttributeError: - # See https://docs.python.org/3/library/constants.html#NotImplemented - return NotImplemented - - def __rdiv__(self, o): - try: - return self._v.__rdiv__(o) - except AttributeError: - # See https://docs.python.org/3/library/constants.html#NotImplemented - return NotImplemented - - def __matmul__(self, o): - try: - return self._v.__matmul__(o) - except AttributeError: - # See https://docs.python.org/3/library/constants.html#NotImplemented - return NotImplemented - - def __rmatmul__(self, o): - try: - return self._v.__rmatmul__(o) - except AttributeError: - # See https://docs.python.org/3/library/constants.html#NotImplemented - return NotImplemented - - def __str__(self): - return str(self._v) - - def __repr__(self): - return repr(self._v) - - def _should_act_as_resource_variable(self): - """Pass resource_variable_ops.is_resource_variable check.""" - pass - - def _dense_var_to_tensor(self, dtype=None, name=None, as_ref=False): - return ops.convert_to_tensor(self.get(), dtype=dtype, name=name, - as_ref=as_ref) - - -# Register a conversion function which reads the value of the variable, -# allowing instances of the class to be used as tensors. 
-def _tensor_conversion_aggregate(var, dtype=None, name=None, as_ref=False): - return var._dense_var_to_tensor(dtype, name, as_ref) # pylint: disable=protected-access - - -ops.register_tensor_conversion_function(AggregatingVariable, - _tensor_conversion_aggregate) diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index bbff6c631cf..8016bfe9265 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -2008,38 +2008,6 @@ class SyncOnReadScatterReplicaTest(test.TestCase, parameterized.TestCase): self.evaluate(distribution.run(v.scatter_min, args=(delta,))) -@combinations.generate( - combinations.combine( - distribution=[ - strategy_combinations.central_storage_strategy_with_two_gpus - ], - mode=["graph", "eager"])) -class AggregatingVariableTest(test.TestCase, parameterized.TestCase): - - def testAssignOutOfScope(self, distribution): - with distribution.scope(): - aggregating = variables_lib.Variable(1.) - self.assertIsInstance(aggregating, values.AggregatingVariable) - self.evaluate(aggregating.assign(3.)) - self.assertEqual(self.evaluate(aggregating.read_value()), 3.) - self.assertEqual(self.evaluate(aggregating._v.read_value()), 3.) - - def testAssignAdd(self, distribution): - with distribution.scope(): - v = variable_scope.variable( - 1, aggregation=variables_lib.VariableAggregation.MEAN) - self.evaluate(variables_lib.global_variables_initializer()) - - @def_function.function - def assign(): - return v.assign_add(2) - - per_replica_results = self.evaluate( - distribution.experimental_local_results( - distribution.experimental_run_v2(assign))) - self.assertAllEqual([3], per_replica_results) - - class MirroredTest(test.TestCase): def testAddOp(self): diff --git a/tensorflow/python/distribute/values_util.py b/tensorflow/python/distribute/values_util.py new file mode 100644 index 00000000000..c42ac9e4de1 --- /dev/null +++ b/tensorflow/python/distribute/values_util.py @@ -0,0 +1,91 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Utility functions used by values.py and ps_values.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute import distribution_strategy_context as ds_context +from tensorflow.python.distribute import reduce_util +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import variable_scope as vs + + +def get_current_replica_id_as_int(): + """Returns the current replica ID as an integer, or `None`.""" + replica_context = ds_context.get_replica_context() + if replica_context: + replica_id = replica_context.replica_id_in_sync_group + if not isinstance(replica_id, int): + replica_id = tensor_util.constant_value(replica_id) + else: + replica_id = distribute_lib.get_update_replica_id() + return replica_id + + +def assign_on_device(device, variable, tensor): + with ops.device(device): + return variable.assign(tensor) + + +def assign_add_on_device(device, variable, tensor): + with ops.device(device): + return variable.assign_add(tensor) + + +def assign_sub_on_device(device, variable, tensor): + with ops.device(device): + return variable.assign_sub(tensor) + + +def assert_replica_context(strategy): + replica_context = ds_context.get_replica_context() + if not replica_context: + raise RuntimeError( + "Replica-local variables may only be assigned in a replica context.") + if replica_context.strategy is not strategy: + raise RuntimeError( + "Replica-local variables may only be assigned in a replica context.") + + +def apply_aggregation(strategy, value, aggregation, destinations): + if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: + return strategy.extended.broadcast_to( + strategy.experimental_local_results(value)[0], + destinations=destinations) + reduce_op = reduce_util.ReduceOp.from_variable_aggregation(aggregation) + return strategy.extended.reduce_to(reduce_op, value, destinations) + + +aggregation_error_msg = ( + "You must specify an aggregation method to update a " + "{variable_type} in Replica Context. You can do so by passing " + "an explicit value for argument `aggregation` to tf.Variable(..)." + "e.g. `tf.Variable(..., aggregation=tf.VariableAggregation.SUM)`" + "`tf.VariableAggregation` lists the possible aggregation methods." + "This is required because {variable_type} should always be " + "kept in sync. When updating them or assigning to them in a " + "replica context, we automatically try to aggregate the values " + "before updating the variable. For this aggregation, we need to " + "know the aggregation method. " + "Another alternative is to not try to update such " + "{variable_type} in replica context, but in cross replica " + "context. You can enter cross replica context by calling " + "`tf.distribute.get_replica_context().merge_call(merge_fn, ..)`." 
+ "Inside `merge_fn`, you can then update the {variable_type} " + "using `tf.distribute.StrategyExtended.update()`.") diff --git a/tensorflow/python/keras/mixed_precision/experimental/BUILD b/tensorflow/python/keras/mixed_precision/experimental/BUILD index 25d05b78c3e..4b52b442754 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/BUILD +++ b/tensorflow/python/keras/mixed_precision/experimental/BUILD @@ -125,11 +125,14 @@ py_library( ], srcs_version = "PY2AND3", deps = [ - "//tensorflow/python:framework", + "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", "//tensorflow/python:resource_variable_ops", + "//tensorflow/python:variables", + "//tensorflow/python/distribute:ps_values", "//tensorflow/python/distribute:values", "//tensorflow/python/eager:context", + "//tensorflow/python/types", ], ) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index 29e5a68c854..7d0abe30581 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.distribute import ps_values as ps_distribute_values from tensorflow.python.distribute import values as distribute_values from tensorflow.python.eager import context from tensorflow.python.framework import ops @@ -437,7 +438,7 @@ def create_autocast_variable(variable): An AutoCastVariable that wraps the variable. """ if not isinstance(variable, (distribute_values.DistributedVariable, - distribute_values.AggregatingVariable)): + ps_distribute_values.AggregatingVariable)): return AutoCastVariable(variable) class AutoCastDistributedVariable(AutoCastVariable, variable.__class__): @@ -448,7 +449,8 @@ def create_autocast_variable(variable): """ def __repr__(self): - if issubclass(distribute_values.AggregatingVariable, variable.__class__): + if issubclass(ps_distribute_values.AggregatingVariable, + variable.__class__): # AggregatingVariable's __repr__ simply calls super.__repr__. So we do # the same here for consistency, which calls AutoCastVariable.__repr__. 
return super(AutoCastDistributedVariable, self).__repr__() diff --git a/tensorflow/python/module/module_test.py b/tensorflow/python/module/module_test.py index 963d3549b2b..0578823c7fb 100644 --- a/tensorflow/python/module/module_test.py +++ b/tensorflow/python/module/module_test.py @@ -26,6 +26,7 @@ from absl.testing import parameterized import six from tensorflow.python import tf2 +from tensorflow.python.distribute import ps_values from tensorflow.python.distribute import tpu_values from tensorflow.python.distribute import values as distributed_values from tensorflow.python.eager import context @@ -250,7 +251,7 @@ class VariableTrackingTest(test_util.TensorFlowTestCase): None, [variables.Variable(1.)], variables.VariableAggregation.SUM) tpu = tpu_values.TPUMirroredVariable( strategy=None, values=[variables.Variable(42.)], aggregation=None) - aggregating = distributed_values.AggregatingVariable( + aggregating = ps_values.AggregatingVariable( strategy=None, v=variables.Variable(1.), aggregation=None) m = module.Module() @@ -514,8 +515,8 @@ class FlattenTest(parameterized.TestCase, test_util.TensorFlowTestCase): m = module.Module() m.layers = {non_orderable(): None, non_orderable(): None} - with self.assertRaisesRegexp(ValueError, - "Error processing property 'layers'"): + with self.assertRaisesRegex(ValueError, + "Error processing property 'layers'"): m.variables # pylint: disable=pointless-statement From c92d4d8c661ef70b42d4a825c6bb9e0959fde076 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 10:51:13 -0700 Subject: [PATCH 1314/1533] docs: fix code formatting in Model docs for save() Committer: @miraleung PiperOrigin-RevId: 313618534 Change-Id: I8b7d803f7cb5ad6ff7adf4390e3c8d148322369a --- tensorflow/python/keras/saving/save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 43c09a62ea9..7f725d3978e 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -76,7 +76,7 @@ def save_model(model, Note that the model weights may have different scoped names after being loaded. Scoped names include the model/layer names, such as - "dense_1/kernel:0"`. It is recommended that you use the layer properties to + `"dense_1/kernel:0"`. It is recommended that you use the layer properties to access specific variables, e.g. `model.get_layer("dense_1").kernel`. _SavedModel serialization_ From 44a649cc7a00540eb256488152aaf11f54000f3c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Thu, 28 May 2020 11:05:47 -0700 Subject: [PATCH 1315/1533] Update Eigen to 8719b9c5bc1a97e62d675c02495ed72dda6fae73 to fix compiling error This PR tries to update Eigen to 8719b9c5bc1a97e62d675c02495ed72dda6fae73. The reason to update Eigen is to fix the build error for custom ops (See error below). The issue is that in Eigen there was a bug that uses `if defined(EIGEN_ARCH_PPC)` incorrectly (should be `if EIGEN_ARCH_PPC`). I have created a PR in Eigen https://gitlab.com/libeigen/eigen/-/merge_requests/131 and the PR has already been merged. This PR is a follow up in tensorflow repo to bump the Eigen to the latest version. 
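As a minimal, self-contained sketch of why `#if defined(...)` is the wrong guard here (using a hypothetical `SOME_ARCH_PPC` macro rather than Eigen's actual source; architecture macros of this style are defined on every platform, just with value 0 or 1):

```
// Hypothetical illustration, not Eigen's actual code. Architecture macros
// of this style are defined as 1 on the matching platform and 0 elsewhere.
#include <cstdio>

#define SOME_ARCH_PPC 0  // what a non-PPC build would see

int main() {
#if defined(SOME_ARCH_PPC)
  // Always compiled in, even though the value is 0: defined() only checks
  // that the macro exists, not its value. This is how a PPC-only header can
  // end up being included on non-PPC builds.
  std::printf("defined(SOME_ARCH_PPC) branch taken\n");
#endif
#if SOME_ARCH_PPC
  // Correct guard: skipped when the value is 0, taken only on PPC.
  std::printf("SOME_ARCH_PPC branch taken\n");
#endif
  return 0;
}
```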
The error before this PR, when building a custom ops: ``` Execution platform: @local_config_platform//:host Use --sandbox_debug to see verbose messages from the sandbox In file included from tensorflow_io/core/kernels/io_optimization.cc:22: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h:20: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/core/common_runtime/function_optimization_registry.h:23: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/core/common_runtime/device_set.h:23: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/core/common_runtime/device.h:35: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/core/framework/allocator.h:26: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/tensorflow/core/framework/numeric_types.h:20: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/third_party/eigen3/unsupported/Eigen/CXX11/Tensor:1: In file included from bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/unsupported/Eigen/CXX11/Tensor:14: bazel-out/darwin-fastbuild/bin/external/local_config_tf/include/unsupported/Eigen/CXX11/../../../Eigen/Core:334:10: fatal error: 'src/Core/arch/AltiVec/MatrixProduct.h' file not found ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1 error generated. Target //tensorflow_io/core:optimization failed to build INFO: Elapsed time: 4.778s, Critical Path: 4.54s ``` Signed-off-by: Yong Tang --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index db87f9a730d..5806fd28dd7 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -237,11 +237,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), - sha256 = "854eabe6817e38d7738fde6ec39c3dfc55fd5e68b2523de8cae936f391a38a69", # SHARED_EIGEN_SHA - strip_prefix = "eigen-cc86a31e20b48b0f03d714b4d1b1f50d52848d36", + sha256 = "615be1295290c13039b0c980a4a55933be26b1e06194d86c6014876fa85c7c6b", # SHARED_EIGEN_SHA + strip_prefix = "eigen-8719b9c5bc1a97e62d675c02495ed72dda6fae73", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/cc86a31e20b48b0f03d714b4d1b1f50d52848d36/eigen-cc86a31e20b48b0f03d714b4d1b1f50d52848d36.tar.gz", - "https://gitlab.com/libeigen/eigen/-/archive/cc86a31e20b48b0f03d714b4d1b1f50d52848d36/eigen-cc86a31e20b48b0f03d714b4d1b1f50d52848d36.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/8719b9c5bc1a97e62d675c02495ed72dda6fae73/eigen-8719b9c5bc1a97e62d675c02495ed72dda6fae73.tar.gz", + "https://gitlab.com/libeigen/eigen/-/archive/8719b9c5bc1a97e62d675c02495ed72dda6fae73/eigen-8719b9c5bc1a97e62d675c02495ed72dda6fae73.tar.gz", ], ) From 1dd8c48f907f802f418d9c33cc80309e0d2b3f27 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 28 May 2020 10:55:56 -0700 Subject: [PATCH 1316/1533] Remove test that does not exist PiperOrigin-RevId: 313619568 Change-Id: I0b749472f6fce149dc2121c71e219dfe665999a0 --- tensorflow/compiler/mlir/lite/BUILD | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 6eff7dbd084..1a508bdb190 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -279,22 +279,6 @@ cc_library( ], ) -tf_cc_test( - name = "tftext_utils_test", - size = "small", - srcs = ["utils/lstm_utils_test.cc"], - deps = [ - ":lstm_utils", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "@llvm-project//llvm:support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - ], -) - cc_library( name = "stateful_ops_utils", srcs = [ From b0fcb6c18c98dc8ce7177034ce099216b937e529 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 28 May 2020 11:01:00 -0700 Subject: [PATCH 1317/1533] [tf.data service] Export distribute transformation PiperOrigin-RevId: 313620607 Change-Id: I4f43f44ac7a8a9b27226b7d63aef374f3c8a178d --- tensorflow/python/data/experimental/BUILD | 1 + .../python/data/experimental/__init__.py | 9 +- .../data/experimental/ops/data_service_ops.py | 84 ++++++++++++++----- .../python/data/experimental/service/BUILD | 15 ++++ .../data/experimental/service/__init__.py | 21 +++++ .../tools/api/generator/api_init_files.bzl | 1 + .../tools/api/generator/api_init_files_v1.bzl | 1 + .../v1/tensorflow.data.experimental.pbtxt | 4 + ...tensorflow.data.experimental.service.pbtxt | 7 ++ .../v2/tensorflow.data.experimental.pbtxt | 4 + ...tensorflow.data.experimental.service.pbtxt | 7 ++ 11 files changed, 132 insertions(+), 22 deletions(-) create mode 100644 tensorflow/python/data/experimental/service/BUILD create mode 100644 tensorflow/python/data/experimental/service/__init__.py create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.data.experimental.service.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.pbtxt diff --git a/tensorflow/python/data/experimental/BUILD b/tensorflow/python/data/experimental/BUILD index 5264b4a7791..a1e45ecd17e 100644 --- a/tensorflow/python/data/experimental/BUILD +++ b/tensorflow/python/data/experimental/BUILD @@ -13,5 +13,6 @@ py_library( "//tensorflow/python:util", "//tensorflow/python/data/experimental/ops:dataset_ops", "//tensorflow/python/data/experimental/ops:iterator_ops", + "//tensorflow/python/data/experimental/service", ], ) diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py index b64ecc32e3e..badc173acec 100644 --- a/tensorflow/python/data/experimental/__init__.py +++ b/tensorflow/python/data/experimental/__init__.py @@ -53,6 +53,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview. 
@@copy_to_device @@dense_to_ragged_batch @@dense_to_sparse_batch +@@distribute @@enumerate_dataset @@from_variant @@get_next_as_optional @@ -89,6 +90,7 @@ from __future__ import division from __future__ import print_function # pylint: disable=unused-import +from tensorflow.python.data.experimental import service from tensorflow.python.data.experimental.ops.batching import dense_to_ragged_batch from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch from tensorflow.python.data.experimental.ops.batching import map_and_batch @@ -150,4 +152,9 @@ from tensorflow.python.framework.type_spec import TypeSpec as Structure # pylint: enable=unused-import from tensorflow.python.util.all_util import remove_undocumented -remove_undocumented(__name__) + +_allowed_symbols = [ + "service", +] + +remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py index 782f438c701..39790d843ba 100644 --- a/tensorflow/python/data/experimental/ops/data_service_ops.py +++ b/tensorflow/python/data/experimental/ops/data_service_ops.py @@ -23,11 +23,13 @@ import six from tensorflow.python import tf2 from tensorflow.python.data.experimental.ops import compression_ops +from tensorflow.python.data.experimental.ops.distribute_options import AutoShardPolicy from tensorflow.python.data.experimental.ops.distribute_options import ExternalStatePolicy from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import gen_experimental_dataset_ops +from tensorflow.python.util.tf_export import tf_export class ProcessingMode(object): @@ -240,11 +242,18 @@ def _distribute(processing_mode, # to limit memory usage. dataset = dataset.map( lambda x: compression_ops.uncompress(x, output_spec=uncompressed_spec)) + + # Disable autosharding for shared jobs. + if job_name: + options = dataset_ops.Options() + options.experimental_distribute.auto_shard_policy = AutoShardPolicy.OFF + dataset = dataset.with_options(options) return dataset return _apply_fn +@tf_export("data.experimental.service.distribute") def distribute(processing_mode, service, job_name=None, @@ -289,32 +298,65 @@ def distribute(processing_mode, executed locally. The `job_name` argument allows jobs to be shared across multiple - datasets. Instead of each dataset creating its own job, all datasets with the - same `job_name` will consume from the same job. A new job will - be created for each iteration of the dataset (with each repetition of - `Dataset.repeat` counting as a new iteration). The following example - demonstrates shared iteration, with the assumption that the tf.data service is - running with a single worker. + datasets. Instead of each dataset creating its own job, all + datasets with the same `job_name` will consume from the same job. A new job + will be created for each iteration of the dataset (with each repetition of + `Dataset.repeat` counting as a new iteration). 
Suppose two training workers + (in either a single client or multi-client setup) iterate over the below + dataset, and there is a single tf.data worker: ``` range5_dataset = tf.data.Dataset.range(5) - dataset1 = range5_dataset.apply(tf.data.experimental.service.distribute( - "parallel_epochs", "my_job_name", "grpc://dataservice:5000")) - dataset2 = range5_dataset.apply(tf.data.experimental.service.distribute( - "parallel_epochs", "my_job_name", "grpc://dataservice:5000")) - iter_1_1 = iter(dataset1) - iter_1_2 = iter(dataset1) - iter_2_1 = iter(dataset2) - iter_2_2 = iter(dataset2) - print(next(iter_1_1)) # Prints "0" - # iter_1_2 consumes from the same job as iter_1_1 - print(next(iter_1_2)) # Prints "1" - # iter_2_1 consumes from a new job - print(next(iter_2_1)) # Prints "0" - # iter_2_2 consumes from the same job as iter_2_1 - print(next(iter_2_2)) # Prints "1" + dataset = range5_dataset.apply(tf.data.experimental.service.distribute( + "parallel_epochs", "grpc://dataservice:5000", job_name="my_job_name")) + for iteration in range(3): + print(list(dataset)) ``` + The elements of each job will be split between the two processes, with + elements being consumed by the processes on a first-come first-served basis. + One possible result is that process 1 prints + + ``` + [0, 2, 4] + [0, 1, 3] + [1] + ``` + + and process 2 prints + + ``` + [1, 3] + [2, 4] + [0, 2, 3, 4] + ``` + + Job names must not be re-used across different training jobs within the + lifetime of the tf.data service. In general, the tf.data service is expected + to live for the duration of a single training job. + To use the tf.data service with multiple training jobs, make sure to use + different job names to avoid conflicts. For example, suppose a training job + calls `distribute` with `job_name="job"` and reads until end of input. If + another independent job connects to the same tf.data service and tries to read + from `job_name="job"`, it will immediately receive end of input, without + getting any data. + + **Keras and Distribution Strategies** + + The dataset produced by the `distribute` transformation can be passed to + Keras' `Model.fit` or Distribution Strategy's + `tf.distribute.Strategy.experimental_distribute_dataset` like any other + `tf.data.Dataset`. We recommend setting a `job_name` on the call to + `distribute` so that if there are multiple workers, they read data from the + same job. Note that the autosharding normally performed by + `experimental_distribute_dataset` will be disabled when setting a `job_name`, + since sharing the job already results in splitting data across the workers. + When using a shared job, data will be dynamically balanced across workers, so + that they reach end of input about the same time. This results in better + worker utilization than with autosharding, where each worker processes an + independent set of files, and some workers may run out of data earlier than + others. + Args: processing_mode: A string specifying the policy for how data should be processed by tf.data workers. 
Currently, the only supported value is diff --git a/tensorflow/python/data/experimental/service/BUILD b/tensorflow/python/data/experimental/service/BUILD new file mode 100644 index 00000000000..5e1d8473633 --- /dev/null +++ b/tensorflow/python/data/experimental/service/BUILD @@ -0,0 +1,15 @@ +package( + default_visibility = ["//tensorflow:internal"], + licenses = ["notice"], # Apache 2.0 +) + +exports_files(["LICENSE"]) + +py_library( + name = "service", + srcs = ["__init__.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python/data/experimental/ops:data_service_ops", + ], +) diff --git a/tensorflow/python/data/experimental/service/__init__.py b/tensorflow/python/data/experimental/service/__init__.py new file mode 100644 index 00000000000..f3c8aff2b3f --- /dev/null +++ b/tensorflow/python/data/experimental/service/__init__.py @@ -0,0 +1,21 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Experimental API for using the tf.data service.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.experimental.ops.data_service_ops import distribute diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl index 7d27d21188f..629c8a4a52f 100644 --- a/tensorflow/python/tools/api/generator/api_init_files.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files.bzl @@ -16,6 +16,7 @@ TENSORFLOW_API_INIT_FILES = [ "config/threading/__init__.py", "data/__init__.py", "data/experimental/__init__.py", + "data/experimental/service/__init__.py", "debugging/__init__.py", "debugging/experimental/__init__.py", "distribute/__init__.py", diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl index c0756a173c9..5c9f1694081 100644 --- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl +++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl @@ -16,6 +16,7 @@ TENSORFLOW_API_INIT_FILES_V1 = [ "config/threading/__init__.py", "data/__init__.py", "data/experimental/__init__.py", + "data/experimental/service/__init__.py", "debugging/__init__.py", "debugging/experimental/__init__.py", "distribute/__init__.py", diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt index 498d8cfdcbb..523129c7037 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt @@ -80,6 +80,10 @@ tf_module { name: "UNKNOWN_CARDINALITY" mtype: "" } + member { + name: "service" + mtype: "" + } member_method { name: "Counter" argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"\"], " diff --git 
a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.service.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.service.pbtxt new file mode 100644 index 00000000000..12f4f3c2b08 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.service.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.data.experimental.service" +tf_module { + member_method { + name: "distribute" + argspec: "args=[\'processing_mode\', \'service\', \'job_name\', \'max_outstanding_requests\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt index ce77b867902..2fc32b21adc 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt @@ -68,6 +68,10 @@ tf_module { name: "UNKNOWN_CARDINALITY" mtype: "" } + member { + name: "service" + mtype: "" + } member_method { name: "Counter" argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"\"], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.pbtxt new file mode 100644 index 00000000000..12f4f3c2b08 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.pbtxt @@ -0,0 +1,7 @@ +path: "tensorflow.data.experimental.service" +tf_module { + member_method { + name: "distribute" + argspec: "args=[\'processing_mode\', \'service\', \'job_name\', \'max_outstanding_requests\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + } +} From f7b9cc702ac5e45f6dc64e6a3be39cec736fa0b7 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Thu, 28 May 2020 11:07:05 -0700 Subject: [PATCH 1318/1533] Rollback of a commit that changed the generated graphdef node name when slicing via `tensor[slice]`, because too many things are dependent on it PiperOrigin-RevId: 313622141 Change-Id: Id78a33158ce6d63a4a4638430b44c0481cf32957 --- tensorflow/python/ops/array_ops.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 118c2cfca55..a641633b1f5 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -983,7 +983,7 @@ def _slice_helper(tensor, slice_spec, var=None): with ops.name_scope( None, "strided_slice", [tensor] + begin + end + strides, - skip_on_eager=False): + skip_on_eager=False) as name: if begin: packed_begin, packed_end, packed_strides = (stack(begin), stack(end), stack(strides)) @@ -1009,7 +1009,8 @@ def _slice_helper(tensor, slice_spec, var=None): shrink_axis_mask=shrink_axis_mask, new_axis_mask=new_axis_mask, ellipsis_mask=ellipsis_mask, - var=var) + var=var, + name=name) # pylint: disable=undefined-variable,protected-access,redefined-outer-name @@ -1193,7 +1194,7 @@ def strided_slice(input_, if var is None: raise ValueError("Sliced assignment is only supported for variables") else: - if name is None and parent_name: + if name is None: name = parent_name + "_assign" return var._strided_slice_assign( From 0e90836047af6ecd4a098a5f095a7ab2891e2c2a Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Thu, 28 May 2020 11:47:34 -0700 Subject: [PATCH 1319/1533] Store nvcc command line options in a file on Windows to prevent command lines that are too long. 
PiperOrigin-RevId: 313629844 Change-Id: I504fc38b802e642f05579051752116a724a17a22 --- .../crosstool/windows/msvc_wrapper_for_nvcc.py.tpl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl index c00e7077b59..73012876691 100644 --- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -167,10 +167,15 @@ def InvokeNvcc(argv, log=False): # Provide a unique dir for each compiling action to avoid conflicts. tempdir = tempfile.mkdtemp(dir = NVCC_TEMP_DIR) nvccopts += ['--keep', '--keep-dir', tempdir] - cmd = [NVCC_PATH] + nvccopts if log: - Log(cmd) - proc = subprocess.Popen(cmd, + Log([NVCC_PATH] + nvccopts) + + # Store command line options in a file to avoid hitting the character limit. + optsfile = tempfile.NamedTemporaryFile(mode='w', dir=tempdir, delete=False) + optsfile.write("\n".join(nvccopts)) + optsfile.close() + + proc = subprocess.Popen([NVCC_PATH, "--options-file", optsfile.name], stdout=sys.stdout, stderr=sys.stderr, env=os.environ.copy(), From a4fb4cb6d4440212fbd2e694bbe4d16f02708384 Mon Sep 17 00:00:00 2001 From: Abdurrahman Akkas Date: Thu, 28 May 2020 11:51:57 -0700 Subject: [PATCH 1320/1533] Legalize xla_hlo.reshape to tf.Reshape PiperOrigin-RevId: 313630645 Change-Id: Ie8d2c1f0963c6b4a61e593bb95449ef7bf8915ff --- .../mlir/tensorflow/tests/legalize_hlo.mlir | 12 ++++++++++++ .../mlir/tensorflow/transforms/legalize_hlo.cc | 12 +++++++++++- .../transforms/legalize_hlo_patterns.td | 15 ++++++++++----- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index abc12b2d89c..c7934a377b3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -687,6 +687,11 @@ func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { return %0 : tensor<1x519xf32> } +func @reshape(%arg0: tensor<4x6xf32>) -> tensor<2x2x6xf32> { + %0 = "xla_hlo.reshape"(%arg0) : (tensor<4x6xf32>) -> tensor<2x2x6xf32> + return %0 : tensor<2x2x6xf32> +} + // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py // CHECK-LABEL: func @biasAdd_NHWC( @@ -1507,3 +1512,10 @@ func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { // CHECK: return [[VAL_375]] : tensor<1x519xf32> // CHECK: } +// CHECK-LABEL: func @reshape( +// CHECK-SAME: [[VAL_372:%.*]]: tensor<4x6xf32>) -> tensor<2x2x6xf32> { +// CHECK: [[VAL_373:%.*]] = constant dense<[2, 2, 6]> : tensor<3xi64> +// CHECK: [[VAL_374:%.*]] = "tf.Reshape"([[VAL_372]], [[VAL_373]]) : (tensor<4x6xf32>, tensor<3xi64>) -> tensor<2x2x6xf32> +// CHECK: return [[VAL_374]] : tensor<2x2x6xf32> +// CHECK: } + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index 524b3e4f4b7..c0cdc3e6b8e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -94,6 +95,15 @@ static bool AreBroadcastCompatible(Value x, Value y) { y_ranked.getShape(), resultShape); } +// Returns the shape of the given value in a Constant Op. +ConstantOp ShapeToConst(PatternRewriter &rewriter, Value value) { + ArrayRef shape = value.getType().cast().getShape(); + auto attr_type = RankedTensorType::get({static_cast(shape.size())}, + rewriter.getIntegerType(64)); + auto attr = DenseElementsAttr::get(attr_type, shape); + return rewriter.create(value.getLoc(), attr_type, attr); +} + #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_legalize_hlo.inc" /// Performs the lowering to XLA dialect. @@ -107,7 +117,7 @@ void LegalizeHloToTf::runOnFunction() { ConversionTarget target(context); target.addLegalDialect(); - target.addLegalOp(); + target.addLegalOp(); if (failed(applyPartialConversion(getFunction(), target, patterns))) signalPassFailure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td index 6fd7556084d..08250ff8e8c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td @@ -21,6 +21,14 @@ include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" include "tensorflow/compiler/mlir/xla/ir/chlo_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" +// Check that two values can be broadcasted together +// TODO(jpienaar): Move somewhere more general +def AreBroadcastCompatible : Constraint, + "types must be broadcastable">; + +// Return a constant op that carries the shape of the given value. +def ShapeToConst : NativeCodeCall<"ShapeToConst($_builder, $0)">; + def : Pat<(HLO_ConstOp $value), (TF_ConstOp $value)>; //===----------------------------------------------------------------------===// @@ -30,11 +38,6 @@ def : Pat<(HLO_ConstOp $value), (TF_ConstOp $value)>; // context, getting to these ops may require some raising. //===----------------------------------------------------------------------===// -// Check that two values can be broadcasted together -// TODO(jpienaar): Move somewhere more general -def AreBroadcastCompatible : Constraint, - "types must be broadcastable">; - foreach fromToBinPair = [[HLO_AddOp, HLOClient_BroadcastAddOp, TF_AddV2Op], [HLO_DivOp, HLOClient_BroadcastDivOp, TF_DivOp], [HLO_ShiftLeftOp, HLOClient_BroadcastShiftLeftOp, TF_LeftShiftOp], @@ -112,6 +115,8 @@ def : Pat<(HLO_BroadcastOp $arg, $shape), def : Pat<(HLO_TransposeOp $arg, $permutation), (TF_TransposeOp $arg, (TF_ConstOp $permutation))>; def : Pat<(HLO_ReverseOp $op, $dims), (TF_ReverseV2Op $op, (TF_ConstOp $dims))>; +def : Pat<(HLO_ReshapeOp:$output $input), + (TF_ReshapeOp $input, (ShapeToConst $output))>; //===----------------------------------------------------------------------===// // Ternary op patterns. 
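As a rough, hand-written sketch of what the reshape legalization above produces at the TF level (not output of the pass itself): the target shape is read from the `xla_hlo.reshape` result type and materialized as a constant operand for `tf.Reshape`, which in eager Python terms behaves like:

    import tensorflow as tf

    x = tf.ones([4, 6], dtype=tf.float32)
    # ShapeToConst equivalent: the [2, 2, 6] result type of the reshape becomes
    # a constant shape tensor that feeds tf.Reshape.
    shape_const = tf.constant([2, 2, 6], dtype=tf.int64)
    y = tf.reshape(x, shape_const)
    assert y.shape == (2, 2, 6)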
From 9407005102b2c63c0b69780d671885945d0d25ae Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Thu, 28 May 2020 12:21:08 -0700 Subject: [PATCH 1321/1533] Fix bug in detecting ops in cluster when extracting head computation in TPUExtractHeadTailOutsideCompilation pass. Check for if an op is in a cluster has been updated using cluster op as an ancestor. PiperOrigin-RevId: 313636229 Change-Id: I200cccb038c48e1c7b0463d680fd6e3c5958deba --- ...extract_head_tail_outside_compilation.mlir | 112 +++++++++++------- ...u_extract_head_tail_outside_compilation.cc | 14 +-- 2 files changed, 75 insertions(+), 51 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 9af75255202..474bfb1eef1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -4,7 +4,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { // CHECK-LABEL: func @single_head_outside_compilation - func @single_head_outside_compilation(%arg0 : tensor) -> () { + func @single_head_outside_compilation(%arg0: tensor) { // CHECK: tf_device.launch // CHECK: "tf.A" // CHECK-NEXT: tf_device.return @@ -23,7 +23,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor } // CHECK-LABEL: func @ops_no_operands - func @ops_no_operands() -> () { + func @ops_no_operands() { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() // CHECK: %[[A_OUT:.*]] = "tf.A" // CHECK-NEXT: tf_device.return %[[A_OUT]] @@ -34,14 +34,36 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf.C" // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %0 = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> (tensor) - %1 = "tf.B"(%0) {}: (tensor) -> (tensor) + %0 = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor + %1 = "tf.B"(%0) {}: (tensor) -> tensor "tf.C"(%1) : (tensor) -> () tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } + // CHECK-LABEL: func @op_operand_outside_cluster + func @op_operand_outside_cluster() { + // CHECK: %[[A_OUT:.*]] = "tf.A" + %0 = "tf.A"() : () -> tensor + // CHECK-NEXT: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[B_OUT:.*]] = "tf.B" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.C"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: "tf.D" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %2 = "tf.C"(%1) {}: (tensor) -> tensor + "tf.D"(%2) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + // CHECK-LABEL: func @aliased_output func @aliased_output() -> (tensor, tensor, tensor) { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() @@ -62,16 +84,16 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // // CHECK: return %[[LAUNCH_OUT]], 
%[[CLUSTER_OUT]]#0, %[[CLUSTER_OUT]]#1 %0:3 = "tf_device.cluster"() ( { - %1 = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> (tensor) - %2 = "tf.B"(%1) {}: (tensor) -> (tensor) - %3 = "tf.C"(%2) : (tensor) -> (tensor) + %1 = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor + %2 = "tf.B"(%1) {}: (tensor) -> tensor + %3 = "tf.C"(%2) : (tensor) -> tensor tf_device.return %1, %3, %2 : tensor, tensor, tensor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor) return %0#0, %0#1, %0#2 : tensor, tensor, tensor } // CHECK-LABEL: func @all_head_computation_ops - func @all_head_computation_ops(%arg0 : tensor) -> (tensor) { + func @all_head_computation_ops(%arg0: tensor) -> tensor { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() // CHECK: %[[A_OUT:.*]] = "tf.A" // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) @@ -84,16 +106,16 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // // CHECK: return %[[LAUNCH_OUT]] %0 = "tf_device.cluster"() ( { - %1 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %2 = "tf.B"(%1) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %3 = "tf.C"(%2, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %1 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %2 = "tf.B"(%1) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %3 = "tf.C"(%2, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor tf_device.return %3 : tensor - }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor) + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor return %0 : tensor } // CHECK-LABEL: func @multiple_head_outside_compilation - func @multiple_head_outside_compilation(%arg0 : tensor) -> () { + func @multiple_head_outside_compilation(%arg0: tensor) { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() // CHECK: %[[A_OUT:.*]] = "tf.A" // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) @@ -105,8 +127,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: "tf.D"(%[[LAUNCH_OUT]]) // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () "tf.D"(%1) : (tensor) -> () tf_device.return @@ -114,8 +136,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor return } - // CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle - func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { + // CHECK-LABEL: func @no_extraction_middle_outside_compiled_ops + func @no_extraction_middle_outside_compiled_ops(%arg0: tensor) { // CHECK-NOT: tf_device.launch // CHECK: "tf_device.cluster" // CHECK-NEXT: "tf.A" @@ -123,16 +145,16 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-NEXT: "tf.C" // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %0 = 
"tf.A"(%arg0) {} : (tensor) -> (tensor) - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %0 = "tf.A"(%arg0) {} : (tensor) -> tensor + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> tensor "tf.C"(%1) : (tensor) -> () tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } - // CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted - func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { + // CHECK-LABEL: func @no_extraction_tpu_op_operands + func @no_extraction_tpu_op_operands(%arg0: tensor) { // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() // CHECK: %[[A_OUT:.*]] = "tf.A" // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) @@ -145,36 +167,38 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: "tf.E" // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %1 = "tf.B"() {} : () -> (tensor) - %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) - %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) - %4 = "tf.E"(%3) {} : (tensor) -> (tensor) + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %1 = "tf.B"() {} : () -> tensor + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> tensor + %4 = "tf.E"(%3) {} : (tensor) -> tensor tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } - // CHECK-LABEL: func @test_replicated_head_outside_compilation - func @test_replicated_head_outside_compilation(%arg0 : tensor) -> () { - // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" - // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) - // CHECK-NEXT: tf_device.return %[[D_OUT]] - // CHECK: device = "TPU_REPLICATED_HOST" + // CHECK-LABEL: func @replicated_head_outside_compilation + func @replicated_head_outside_compilation(%arg0: tensor, %arg1: tensor) { + // CHECK: tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor) // - // CHECK: "tf_device.cluster" - // CHECK: "tf.B" - // CHECK: "tf.C" - // CHECK: "tf.E" - // CHECK-NEXT: tf_device.return - tf_device.replicate() {n = 2 : i32} { + // CHECK-NEXT: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: tf_device.return %[[D_OUT]] + // CHECK: device = "TPU_REPLICATED_HOST" + // + // CHECK: "tf_device.cluster" + // CHECK: %[[B_OUT:.*]] = "tf.B" + // CHECK: "tf.C"(%[[RI]], %[[B_OUT]]) + // CHECK: "tf.E"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %1 = "tf.B"() {} : () -> (tensor) - %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) - %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) - %4 = "tf.E"(%3) {} : (tensor) -> (tensor) + %0 = "tf.A"(%ri) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %1 = "tf.B"() {} : () -> tensor + %2 = "tf.C"(%ri, %1) {_xla_outside_compilation = 
"cluster1"} : (tensor, tensor) -> tensor + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> tensor + %4 = "tf.E"(%3) {} : (tensor) -> tensor tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () tf_device.return diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 5a059ce507c..1c442345d43 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -66,6 +66,7 @@ Operation* GetOpOfValue(Value value) { // other ops in the TPU computation that cannot be extracted. llvm::SmallVector FindOutsideCompiledOpsAtHead( tf_device::ClusterOp cluster) { + Region* cluster_region = &cluster.body(); llvm::SmallSetVector head_outside_compiled_ops; auto cluster_ops = cluster.GetBody().without_terminator(); @@ -73,20 +74,19 @@ llvm::SmallVector FindOutsideCompiledOpsAtHead( if (!HasOutsideCompilationAttribute(&cluster_op)) continue; // An outside compiled op can be extracted if its operands are not from // other ops in the cluster that cannot be extracted. - auto result = cluster_op.walk([&](Operation* op) { + auto walk_result = cluster_op.walk([&](Operation* op) { for (Value operand : op->getOperands()) { Operation* operand_op = GetOpOfValue(operand); - if (operand_op->isProperAncestor(cluster) || - cluster_op.isAncestor(operand_op) || - head_outside_compiled_ops.count(operand_op)) - continue; + if (head_outside_compiled_ops.count(operand_op)) continue; - return WalkResult::interrupt(); + if (operand_op->getParentRegion() == cluster_region) + return WalkResult::interrupt(); } return WalkResult::advance(); }); - if (!result.wasInterrupted()) head_outside_compiled_ops.insert(&cluster_op); + if (!walk_result.wasInterrupted()) + head_outside_compiled_ops.insert(&cluster_op); } return head_outside_compiled_ops.takeVector(); From ede5b12292f83fa2f0873d95143f7e5aa2d30d1e Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 28 May 2020 12:22:17 -0700 Subject: [PATCH 1322/1533] Fix _cuda_set types PiperOrigin-RevId: 313636489 Change-Id: I2664ceda018961665d12e47cb5230be512e6ad05 --- third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl index a336673a307..082ed950b04 100644 --- a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl +++ b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl @@ -281,14 +281,14 @@ def _no_canonical_prefixes_group(extra_flags): def _cuda_set(cuda_path, actions): if cuda_path: - return flag_set( + return [flag_set( actions = actions, flag_groups = [ flag_group( flags = ["--cuda-path=" + cuda_path], ), ], - ) + )] else: return [] @@ -415,7 +415,7 @@ def _features(cpu, compiler, ctx): ), ] + _cuda_set( ctx.attr.cuda_path, - all_compile_actions, + all_compile_actions(), ) + [ flag_set( actions = all_compile_actions(), From 2b8a23c16946a96c7f2b80e5aa4dd930ff528eff Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Thu, 28 May 2020 12:33:19 -0700 Subject: [PATCH 1323/1533] Add `inference_input_type` and `inference_output_type` flags in TF 2.x TFLiteConverter (backward compatible with TF 1.x) to support integer (tf.int8, tf.uint8) input and output types in post training full integer quantized models. PiperOrigin-RevId: 313638712 Change-Id: If1f1f40dac79d06b56b85882f2f69b4519f78acd --- tensorflow/lite/python/lite.py | 93 ++++++++++++--- tensorflow/lite/python/lite_v2_test.py | 156 ++++++++++++++++++++++--- 2 files changed, 217 insertions(+), 32 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 53814bb0c43..af42c28172a 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -201,6 +201,11 @@ class QuantizationMode(object): self._representative_dataset is not None and self._smallest_supported_type() == constants.INT8) + def is_post_training_integer_quantize(self): + """Post training integer quantization.""" + return (self.post_training_int8_no_float() or + self.post_training_int8_allow_float()) + def training_time_int8_allow_float(self): """Training-time int8 quantize, allow float fallback.""" return (self._any_optimization_enabled() and @@ -413,7 +418,56 @@ class TFLiteConverterBase(object): class TFLiteConverterBaseV2(TFLiteConverterBase): - """Converter subclass to share functionality between V2 converters.""" + """Converter subclass to share functionality between V2 converters. + + Attributes: + allow_custom_ops: Boolean indicating whether to allow custom operations. + When False, any unknown operation is an error. When True, custom ops are + created for any op that is unknown. The developer needs to provide these + to the TensorFlow Lite runtime with a custom resolver. (default False) + optimizations: Experimental flag, subject to change. A list of optimizations + to apply when converting the model. E.g. `[Optimize.DEFAULT]` + representative_dataset: A representative dataset that can be used to + generate input and output samples for the model. The converter can use the + dataset to evaluate different optimizations. Note that this is an optional + attribute but it is necessary if INT8 is the only support builtin ops in + target ops. + target_spec: Experimental flag, subject to change. Specification of target + device. + inference_input_type: Data type of the input layer. 
Note that integer types + (tf.int8 and tf.uint8) are currently only supported for post training + integer quantization. (default tf.float32, must be in {tf.float32, + tf.int8, tf.uint8}) + inference_output_type: Data type of the output layer. Note that integer + types (tf.int8 and tf.uint8) are currently only supported for post + training integer quantization. (default tf.float32, must be in + {tf.float32, tf.int8, tf.uint8}) + experimental_new_converter: Experimental flag, subject to change. Enables + MLIR-based conversion instead of TOCO conversion. + """ + + def __init__(self): + """Constructor for TFLiteConverter.""" + super(TFLiteConverterBaseV2, self).__init__() + self.inference_input_type = constants.FLOAT + self.inference_output_type = constants.FLOAT + + def _validate_inference_input_output_types(self, quant_mode): + """Validate inference_input_type and inference_output_type flags.""" + default_types = [constants.FLOAT, None] + # We only support integer types for post training integer quantization + # as we have statistical information to quantize the input and output. + if quant_mode.is_post_training_integer_quantize(): + all_types = default_types + [constants.INT8, constants.QUANTIZED_UINT8] + if self.inference_input_type not in all_types or \ + self.inference_output_type not in all_types: + all_types_names = ["tf." + t.name for t in all_types] + raise ValueError("The inference_input_type and inference_output_type " + "must be in {}.".format(all_types_names)) + elif self.inference_input_type not in default_types or \ + self.inference_output_type not in default_types: + raise ValueError("The inference_input_type and inference_output_type " + "must be tf.float32.") def convert(self, graph_def, input_tensors, output_tensors): """Converts a TensorFlow GraphDef based on instance variables. @@ -437,6 +491,8 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): quant_mode = QuantizationMode(self.optimizations, self.target_spec, self.representative_dataset, graph_def) + self._validate_inference_input_output_types(quant_mode) + if not self._is_unknown_shapes_allowed(): # Checks dimensions in input tensor. for tensor in input_tensors: @@ -479,6 +535,9 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): "quantize_to_float16": True, }) + # Converter requires that the inference_input_type flag is set to FLOAT + converter_kwargs.update({"inference_input_type": constants.FLOAT}) + if not self.experimental_new_converter: logging.warning( "Please consider switching to use new converter by setting " @@ -498,11 +557,11 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): **converter_kwargs) if quant_mode.post_training_int8_no_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, False) + result = self._calibrate_quantize_model(result, self.inference_input_type, + self.inference_output_type, False) elif quant_mode.post_training_int8_allow_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, True) + result = self._calibrate_quantize_model(result, self.inference_input_type, + self.inference_output_type, True) if self._experimental_sparsify_model: result = _mlir_sparsify(result) @@ -758,12 +817,9 @@ class TFLiteConverterV2(TFLiteFrozenGraphConverterV2): Attributes: allow_custom_ops: Boolean indicating whether to allow custom operations. - When false any unknown operation is an error. When true, custom ops are - created for any op that is unknown. 
The developer will need to provide - these to the TensorFlow Lite runtime with a custom resolver. - (default False) - target_spec: Experimental flag, subject to change. Specification of target - device. + When False, any unknown operation is an error. When True, custom ops are + created for any op that is unknown. The developer needs to provide these + to the TensorFlow Lite runtime with a custom resolver. (default False) optimizations: Experimental flag, subject to change. A list of optimizations to apply when converting the model. E.g. `[Optimize.DEFAULT]` representative_dataset: A representative dataset that can be used to @@ -771,8 +827,19 @@ class TFLiteConverterV2(TFLiteFrozenGraphConverterV2): dataset to evaluate different optimizations. Note that this is an optional attribute but it is necessary if INT8 is the only support builtin ops in target ops. - experimental_new_converter: Experimental flag, subject to change. - Enables MLIR-based conversion instead of TOCO conversion. + target_spec: Experimental flag, subject to change. Specification of target + device. + inference_input_type: Data type of the input layer. Note that integer types + (tf.int8 and tf.uint8) are currently only supported for post training + integer quantization. (default tf.float32, must be in {tf.float32, + tf.int8, tf.uint8}) + inference_output_type: Data type of the output layer. Note that integer + types (tf.int8 and tf.uint8) are currently only supported for post + training integer quantization. (default tf.float32, must be in + {tf.float32, tf.int8, tf.uint8}) + experimental_new_converter: Experimental flag, subject to change. Enables + MLIR-based conversion instead of TOCO conversion. + Example usage: ```python diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py index 9af37df2975..fae55e99cd1 100644 --- a/tensorflow/lite/python/lite_v2_test.py +++ b/tensorflow/lite/python/lite_v2_test.py @@ -71,6 +71,27 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): actual_value = self._evaluateTFLiteModel(tflite_model, [input_data]) self.assertEqual(expected_value.numpy(), actual_value) + @parameterized.named_parameters( + ('_INT8InputOutput', lite.constants.INT8), + ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8)) + @test_util.run_v2_only + def testInvalidFloat(self, inference_input_output_type): + root = self._getSimpleVariableModel() + input_data = tf.constant(1., shape=[1]) + concrete_func = root.f.get_concrete_function(input_data) + + # Convert model. + converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func]) + # We don't support integer types as we don't have statistical information + # to quantize (only supported for post training integer quantization). 
+ with self.assertRaises(ValueError) as error: + converter.inference_input_type = inference_input_output_type + converter.inference_output_type = inference_input_output_type + converter.convert() + self.assertEqual( + 'The inference_input_type and inference_output_type ' + 'must be tf.float32.', str(error.exception)) + @test_util.run_v2_only def testScalarInput(self): root = self._getSimpleVariableModel() @@ -172,39 +193,113 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): self.assertLess(len(quantized_tflite), len(float_tflite)) @parameterized.named_parameters( - ('EnableMlirQuantizer', True), # enable mlir quantizer - ('DisableMlirQuantizer', False)) # disable mlir quantizer - def testCalibrateAndQuantizeBuiltinInt8(self, mlir_quantizer): + ('_INT8InputOutput', lite.constants.INT8), + ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8)) + @test_util.run_v2_only + def testInvalidPostTrainingDynamicRangeQuantization( + self, inference_input_output_type): + func, _ = self._getCalibrationQuantizeModel() + + # Convert float model. + converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + tflite_model = converter.convert() + self.assertTrue(tflite_model) + + # Convert quantized model. + quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + quantized_converter.optimizations = [lite.Optimize.DEFAULT] + # We don't support integer types as we don't have statistical information + # to quantize (only supported for post training integer quantization). + with self.assertRaises(ValueError) as error: + quantized_converter.inference_input_type = inference_input_output_type + quantized_converter.inference_output_type = inference_input_output_type + quantized_converter.convert() + self.assertEqual( + 'The inference_input_type and inference_output_type ' + 'must be tf.float32.', str(error.exception)) + + @parameterized.named_parameters( + ('_DefaultFLOAT32InputOutput', lite.constants.FLOAT), + ('_INT8InputOutput', lite.constants.INT8), + ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8)) + def testPostTrainingIntegerAllowFloatQuantization( + self, inference_input_output_type): func, calibration_gen = self._getCalibrationQuantizeModel() # Convert float model. - float_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) - float_tflite = float_converter.convert() - self.assertTrue(float_tflite) + converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + tflite_model = converter.convert() + self.assertTrue(tflite_model) + + # Convert quantized model. + quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + quantized_converter.optimizations = [lite.Optimize.DEFAULT] + quantized_converter.representative_dataset = calibration_gen + quantized_converter.inference_input_type = inference_input_output_type + quantized_converter.inference_output_type = inference_input_output_type + quantized_tflite_model = quantized_converter.convert() + self.assertTrue(quantized_tflite_model) + + interpreter = Interpreter(model_content=quantized_tflite_model) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + self.assertLen(input_details, 1) + self.assertEqual(inference_input_output_type.as_numpy_dtype, + input_details[0]['dtype']) + output_details = interpreter.get_output_details() + self.assertLen(output_details, 1) + self.assertEqual(inference_input_output_type.as_numpy_dtype, + output_details[0]['dtype']) + + # Ensure that the quantized tflite model is smaller. 
+ self.assertLess(len(quantized_tflite_model), len(tflite_model)) + + @parameterized.named_parameters( + ('_DefaultFLOAT32InputOutput_UseTargetTypesFlag', + lite.constants.FLOAT, False), + ('_DefaultFLOAT32InputOutput', lite.constants.FLOAT, True), + ('_INT8InputOutput', lite.constants.INT8, True), + ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8, True)) + @test_util.run_v2_only + def testPostTrainingIntegerNoFloatQuantization(self, + inference_input_output_type, + use_target_ops_flag): + func, calibration_gen = self._getCalibrationQuantizeModel() + + # Convert float model. + converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + tflite_model = converter.convert() + self.assertTrue(tflite_model) # Convert model by specifying target spec (instead of optimizations), since # when targeting an integer only backend, quantization is mandatory. quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) - quantized_converter.target_spec.supported_ops = [ - lite.OpsSet.TFLITE_BUILTINS_INT8 - ] + quantized_converter.optimizations = [lite.Optimize.DEFAULT] quantized_converter.representative_dataset = calibration_gen - quantized_converter._experimental_new_quantizer = mlir_quantizer - quantized_tflite = quantized_converter.convert() - self.assertTrue(quantized_tflite) + if use_target_ops_flag: + quantized_converter.target_spec.supported_ops = [ + lite.OpsSet.TFLITE_BUILTINS_INT8 + ] + else: + quantized_converter.target_spec.supported_types = [lite.constants.INT8] + quantized_converter.inference_input_type = inference_input_output_type + quantized_converter.inference_output_type = inference_input_output_type + quantized_tflite_model = quantized_converter.convert() + self.assertTrue(quantized_tflite_model) - # The default input and output types should be float. - interpreter = Interpreter(model_content=quantized_tflite) + interpreter = Interpreter(model_content=quantized_tflite_model) interpreter.allocate_tensors() input_details = interpreter.get_input_details() self.assertLen(input_details, 1) - self.assertEqual(np.float32, input_details[0]['dtype']) + self.assertEqual(inference_input_output_type.as_numpy_dtype, + input_details[0]['dtype']) output_details = interpreter.get_output_details() self.assertLen(output_details, 1) - self.assertEqual(np.float32, output_details[0]['dtype']) + self.assertEqual(inference_input_output_type.as_numpy_dtype, + output_details[0]['dtype']) - # Ensure that the quantized weights tflite model is smaller. - self.assertLess(len(quantized_tflite), len(float_tflite)) + # Ensure that the quantized tflite model is smaller. 
+ self.assertLess(len(quantized_tflite_model), len(tflite_model)) def testCalibrateAndQuantizeBuiltinInt16(self): func, calibration_gen = self._getCalibrationQuantizeModel() @@ -279,7 +374,7 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): return tf.keras.Sequential(QLinear(3, input_shape=(2,))) @test_util.run_v2_only - def testTrainingTimeQuantizeConversion(self): + def testTrainingTimeQuantization(self): model = self._getTrainingTimeQuantizedModel() float_converter = lite.TFLiteConverterV2.from_keras_model(model) @@ -297,6 +392,29 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): interpreter = Interpreter(model_content=quantized_tflite) self.assertEqual(np.float32, interpreter.get_input_details()[0]['dtype']) + @parameterized.named_parameters( + ('_INT8InputOutput', lite.constants.INT8), + ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8)) + def testInvalidTrainingTimeQuantization(self, inference_input_output_type): + # We currently don't support integer inference_input_type and + # inference_output_type flags for training time quantization. + + model = self._getTrainingTimeQuantizedModel() + + converter = lite.TFLiteConverterV2.from_keras_model(model) + tflite_model = converter.convert() + self.assertTrue(tflite_model) + + quantized_converter = lite.TFLiteConverterV2.from_keras_model(model) + quantized_converter.optimizations = [lite.Optimize.DEFAULT] + with self.assertRaises(ValueError) as error: + quantized_converter.inference_input_type = inference_input_output_type + quantized_converter.inference_output_type = inference_input_output_type + quantized_converter.convert() + self.assertEqual( + 'The inference_input_type and inference_output_type ' + 'must be tf.float32.', str(error.exception)) + @test_util.run_v2_only def testNewQuantizer(self): """Test the model quantized by the new converter.""" From 7769f52b9f6433c4e97ae42ba2060034e357a071 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 12:48:15 -0700 Subject: [PATCH 1324/1533] Check rank that rank(updates) = rank(indices + params[1:]) in resource_scatter_update, to match behavior of V1 scatter_update and shape function. 
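The relation referred to in the subject is `updates.shape = indices.shape + params.shape[1:]` (or a scalar update). A small hand-written example of an update that satisfies it — the variable and values are made up purely for illustration:

    import tensorflow as tf

    v = tf.Variable(tf.zeros([4, 3]))        # params.shape  = [4, 3]
    indices = tf.constant([1, 3])            # indices.shape = [2]
    updates = tf.constant([[1., 2., 3.],
                           [4., 5., 6.]])    # updates.shape = [2, 3]
                                             #               = indices.shape + params.shape[1:]
    v.scatter_update(tf.IndexedSlices(updates, indices))
    print(v.numpy())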
PiperOrigin-RevId: 313641668 Change-Id: I7ac7507b8e2e16fdca38ca5431a7b22e80b38cd7 --- tensorflow/core/kernels/resource_variable_ops.cc | 11 ----------- tensorflow/core/ops/state_ops_test.cc | 9 --------- 2 files changed, 20 deletions(-) diff --git a/tensorflow/core/kernels/resource_variable_ops.cc b/tensorflow/core/kernels/resource_variable_ops.cc index 0fc1d53749f..b606d411a3d 100644 --- a/tensorflow/core/kernels/resource_variable_ops.cc +++ b/tensorflow/core/kernels/resource_variable_ops.cc @@ -887,17 +887,6 @@ class ResourceScatterUpdateOp : public OpKernel { const Tensor& indices = c->input(1); const Tensor& updates = c->input(2); - // Check that rank(updates.shape) = rank(indices.shape + params.shape[1:]) - OP_REQUIRES(c, - updates.dims() == 0 || - updates.dims() == indices.dims() + params->dims() - 1, - errors::InvalidArgument( - "Must have updates.shape = indices.shape + " - "params.shape[1:] or updates.shape = [], got ", - "updates.shape ", updates.shape().DebugString(), - ", indices.shape ", indices.shape().DebugString(), - ", params.shape ", params->shape().DebugString())); - // Check that we have enough index space const int64 N_big = indices.NumElements(); OP_REQUIRES( diff --git a/tensorflow/core/ops/state_ops_test.cc b/tensorflow/core/ops/state_ops_test.cc index a0caad4a49f..6d05dd0b96c 100644 --- a/tensorflow/core/ops/state_ops_test.cc +++ b/tensorflow/core/ops/state_ops_test.cc @@ -58,15 +58,6 @@ TEST(StateOpsTest, ScatterUpdate_ShapeFn) { // Resolve shape on first updates dimension. INFER_OK(op, "[1,2];[3];[?,2]", "in0"); - - // Allow the update to be a scalar. - INFER_OK(op, "[1,2];[3];?", "in0"); - - // Allow a scalar index. - INFER_OK(op, "[1,2];[];[2]", "in0"); - - // Check the requirement updates.shape = indices.shape + ref.shape[1:]. - INFER_ERROR("Shapes must be equal rank, but are 1 and 0", op, "[2];[];[2]"); } TEST(StateOpsTest, TemporaryVariable_ShapeFn) { From a7fac003070570dc2cb9cba52144c81063f37d35 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 12:54:21 -0700 Subject: [PATCH 1325/1533] Update ops-related pbtxt files. PiperOrigin-RevId: 313642927 Change-Id: Ic29ce98c72aec01830a895a8f36562ac7c0c243e --- .../core/ops/compat/ops_history_v2/DeviceIndex.pbtxt | 11 +++++++++++ tensorflow/core/ops/ops.pbtxt | 11 +++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tensorflow/core/ops/compat/ops_history_v2/DeviceIndex.pbtxt diff --git a/tensorflow/core/ops/compat/ops_history_v2/DeviceIndex.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DeviceIndex.pbtxt new file mode 100644 index 00000000000..a00f6849c33 --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/DeviceIndex.pbtxt @@ -0,0 +1,11 @@ +op { + name: "DeviceIndex" + output_arg { + name: "index" + type: DT_INT32 + } + attr { + name: "device_names" + type: "list(string)" + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index d2a14590bc5..9324181d287 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -12224,6 +12224,17 @@ op { type: "string" } } +op { + name: "DeviceIndex" + output_arg { + name: "index" + type: DT_INT32 + } + attr { + name: "device_names" + type: "list(string)" + } +} op { name: "Diag" input_arg { From dbabb8aefefda066c799b4ae176475d40836b3de Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 12:54:50 -0700 Subject: [PATCH 1326/1533] Go: Update generated wrapper functions for TensorFlow ops. 
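This and the previous change both follow from the new `DeviceIndex` op, which returns the index, within `device_names`, of the device the op runs on. A sketch of calling it from Python through `tf.raw_ops` (assuming a TensorFlow build that already registers the op; the device list here is arbitrary):

    import tensorflow as tf

    # Index of the device this op is placed on within device_names.
    idx = tf.raw_ops.DeviceIndex(device_names=['CPU', 'GPU'])
    print(idx.numpy())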
PiperOrigin-RevId: 313643019 Change-Id: Icee624ee4c736833c205651d74d03648723bb7e4 --- tensorflow/go/op/wrappers.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 33eba9a734f..57924f08f83 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -11540,6 +11540,21 @@ func AssertNextDataset(scope *Scope, input_dataset tf.Output, transformations tf return op.Output(0) } +// Return the index of device the op runs. +func DeviceIndex(scope *Scope, device_names []string) (index tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"device_names": device_names} + opspec := tf.OpSpec{ + Type: "DeviceIndex", + + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0) +} + // ShardDatasetAttr is an optional argument to ShardDataset. type ShardDatasetAttr func(optionalAttr) From 07030bfcc86e3b3640551008898d55408a883216 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Thu, 28 May 2020 13:00:26 -0700 Subject: [PATCH 1327/1533] Hexagon Delegate: Update condition which enables wrong ops to be delegated. Currently activation function not supported for Add/Sub ops. PiperOrigin-RevId: 313644017 Change-Id: I7c798989faf2f427d25aca4b05c078f80eda4dfa --- tensorflow/lite/experimental/delegates/hexagon/utils.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index 80f82749e80..d283b329015 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -142,7 +142,7 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return false; const TfLiteAddParams* add_params = reinterpret_cast(node->builtin_data); - return IsActivationReluOrNone(add_params->activation); + return add_params->activation == kTfLiteActNone; } case kTfLiteBuiltinMul: { if (!InputsWithCorrectTypes( @@ -161,7 +161,7 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return false; const TfLiteSubParams* sub_params = reinterpret_cast(node->builtin_data); - return IsActivationReluOrNone(sub_params->activation); + return sub_params->activation == kTfLiteActNone; } case kTfLiteBuiltinSum: // TODO(b/139277813): Enable these when they pass unit tests. These seem From b2d128af63e4e38c53462df8d82c73aea737b070 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Thu, 28 May 2020 13:03:24 -0700 Subject: [PATCH 1328/1533] Hexagon Delegate: * Update arithmetic builder to handle const operand for uint8/int8. * Added unit-tests for Add Op * Add const tensors earlier in the flow, this to simplifies separate builders and avoid duplication of data. * Updated some GraphBuilder methods to support the above cases. 
* Removed unused param in AddCastOp PiperOrigin-RevId: 313644626 Change-Id: I2b8410387b160a356e3ac335845367a8b4392723 --- .../hexagon/builders/arithmetic_builder.cc | 6 - .../hexagon/builders/matmul_builder.cc | 5 +- .../delegates/hexagon/builders/op_builder.cc | 22 ++- .../delegates/hexagon/builders/op_builder.h | 9 +- .../delegates/hexagon/builders/tests/BUILD | 1 + .../hexagon/builders/tests/arithmetic_test.cc | 175 ++++++++++++++++++ .../hexagon/hexagon_delegate_kernel.cc | 13 ++ 7 files changed, 208 insertions(+), 23 deletions(-) create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/tests/arithmetic_test.cc diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/arithmetic_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/arithmetic_builder.cc index 9494de0da29..090d3b4a9a7 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/arithmetic_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/arithmetic_builder.cc @@ -47,12 +47,6 @@ TfLiteStatus ArithmeticOpBuilder::PopulateSubGraph( // Second input data tensor. tensor_id = inputs->data[1]; const auto& input2_tensor = context->tensors[tensor_id]; - // TODO(karimnosseir): Have this as util to generalize to all ops. - if (input2_tensor.allocation_type == kTfLiteMmapRo) { - auto* const_input_node = - graph_builder_->AddConstNodeWithData(tensor_id, input2_tensor); - graph_builder_->AddTensorWithID(tensor_id, const_input_node->GetID(), 0); - } AddInput(graph_builder_->GetHexagonTensorId(tensor_id)); TF_LITE_ENSURE_STATUS( ComputeMinAndMaxQuantValues(input2_tensor, &input2_min_, &input2_max_)); diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc index 894f98269ce..2d02cff82ad 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc @@ -91,10 +91,7 @@ TfLiteStatus AddFullyConnectedHelper(const TfLiteIntArray* inputs, matmul_and_bias_out_max = matmul_out_max; if (bias_tensor_id != -1) { const auto& bias_tensor = context->tensors[bias_tensor_id]; - auto* const_bias_node = - graph_builder->AddConstNodeWithData(bias_tensor_id, bias_tensor); float bias_min, bias_max; - graph_builder->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), 0); OpBuilder::ComputeMinAndMaxQuantValues(bias_tensor, &bias_min, &bias_max); auto* bias_min_const = graph_builder->AddConstNodeWithData( scalar_shape, reinterpret_cast(&bias_min), sizeof(bias_min)); @@ -202,7 +199,7 @@ TfLiteStatus MatMulWithConstWeightsOpBuilder::PopulateSubGraph( weights_shape_.data(), reinterpret_cast(nhcw.data()), weights_tensor.bytes); graph_builder_->AddTensorWithID(weights_tensor_id, - const_weights_node->GetID(), 0); + const_weights_node->GetID(), 0, true); ComputeMinAndMaxQuantValues(weights_tensor, &weights_min_, &weights_max_); auto* weights_min_const = graph_builder_->AddConstNodeWithData( quant_bound_shape, reinterpret_cast(&weights_min_), diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc index d851f8cf824..543bd7caf43 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc @@ -131,7 +131,8 @@ OpBuilder* GraphBuilder::AddConstNodeWithData(const int shape[], char* data, } 
OpBuilder* GraphBuilder::AddConstNodeWithData(int tensor_id, - const TfLiteTensor& tensor) { + const TfLiteTensor& tensor, + bool int8_to_uint8) { builders_.emplace_back(new OpBuilder(this, OP_Const)); const int node_id = builders_.size(); builders_.back()->SetConstNode(); @@ -147,14 +148,19 @@ OpBuilder* GraphBuilder::AddConstNodeWithData(int tensor_id, return nullptr; } AddTensorWithID(tensor_id, node_id, 0); + // Cast int8 to uint8 if requested. + // This will add cast op to uint8 and update tensor map to point + // to the casted tensor. + if (int8_to_uint8 && tensor.type == kTfLiteInt8) { + AddCastOp(context_, OP_Quantized_CastInt8ToUInt8, tensor_id); + } return builders_.back().get(); } // TODO(b/154604279): Support these casting ops in Hexagon op profiling (which // seems to key tensors on a single op, which may not be the case now). TfLiteStatus GraphBuilder::AddCastOp(TfLiteContext* context, int op_type, - int tensor_id, - OpBuilder::TensorID hexagon_input) { + int tensor_id) { // Create a new OpBuilder for casting the tensor. OpBuilder* cast_builder = CreateCastBuilder(this, op_type); builders_.emplace_back(cast_builder); @@ -188,9 +194,8 @@ TfLiteStatus GraphBuilder::AddInputTensors(const TfLiteIntArray* input_tensors, AddTensorWithID(tensor_id, input_op->GetID(), num_inputs); // If tensor is of type int8, add an op to cast it to uint8. if (tensor.type == kTfLiteInt8) { - TF_LITE_ENSURE_STATUS(AddCastOp(context, OP_Quantized_CastInt8ToUInt8, - tensor_id, - GetHexagonTensorId(tensor_id))); + TF_LITE_ENSURE_STATUS( + AddCastOp(context, OP_Quantized_CastInt8ToUInt8, tensor_id)); } ++num_inputs; } @@ -208,9 +213,8 @@ TfLiteStatus GraphBuilder::AddOutputTensors( const auto& tensor = context->tensors[tensor_id]; // If tensor is of type int8, add an op to cast it to uint8. if (tensor.type == kTfLiteInt8) { - TF_LITE_ENSURE_STATUS(AddCastOp(context, OP_Quantized_CastUInt8ToInt8, - tensor_id, - GetHexagonTensorId(tensor_id))); + TF_LITE_ENSURE_STATUS( + AddCastOp(context, OP_Quantized_CastUInt8ToInt8, tensor_id)); } hexagon_output_ids.push_back(GetHexagonTensorId(tensor_id)); } diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h index 743323c8bd3..436dd1336df 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h @@ -192,7 +192,10 @@ class GraphBuilder { OpBuilder* AddNode(int tflite_node_index = -1); // Add const node that provides the data held by 'tensor'. - OpBuilder* AddConstNodeWithData(int tensor_id, const TfLiteTensor& tensor); + // If `int8_to_uint8` is true, then the data will be casted to uint8 from + // int8. + OpBuilder* AddConstNodeWithData(int tensor_id, const TfLiteTensor& tensor, + bool int8_to_uint8 = false); // Same as above but takes shape of the tensor that will holds the data. OpBuilder* AddConstNodeWithData(const int shape[], char* data, int data_size); @@ -217,7 +220,6 @@ class GraphBuilder { // Returns tensor id inside Hexagon graph. OpBuilder::TensorID GetHexagonTensorId(int tflite_tensor_index) { if (!HasTensor(tflite_tensor_index)) { - printf("Could not find tensor id: %d\n", tflite_tensor_index); // Return invalid ID. return OpBuilder::TensorID(-1, -1); } @@ -310,8 +312,7 @@ class GraphBuilder { } // Adds a Cast op to convert a tensor from int8 to uint8 (or vice versa). 
- TfLiteStatus AddCastOp(TfLiteContext* context, int op_type, int tensor_id, - OpBuilder::TensorID hexagon_input); + TfLiteStatus AddCastOp(TfLiteContext* context, int op_type, int tensor_id); const HexagonNN* hexagon_nn_ = nullptr; TfLiteContext* context_ = nullptr; diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD index 0627d5b202d..1a496250467 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD @@ -26,6 +26,7 @@ hexagon_op_tests( srcs = [ "activations_test.cc", "arg_min_max_test.cc", + "arithmetic_test.cc", "concat_test.cc", "conv_test.cc", "l2_norm_test.cc", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/arithmetic_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/arithmetic_test.cc new file mode 100644 index 00000000000..6f9b48054be --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/arithmetic_test.cc @@ -0,0 +1,175 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include "tensorflow/lite/experimental/delegates/hexagon/builders/tests/hexagon_delegate_op_model.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +using testing::ElementsAreArray; + +class ArithmeticOpBaseModel : public SingleOpModelWithHexagon { + public: + ArithmeticOpBaseModel(const TensorData& input1, const TensorData& input2, + const TensorData& output) + : SingleOpModelWithHexagon() { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + } + ArithmeticOpBaseModel(const TensorData& input1, const TensorData& input2, + const TensorData& output, + const std::initializer_list& input1_data, + const std::initializer_list& input2_data) { + if (input1_data.size() > 0) + input1_ = AddConstInput(input1, input1_data); + else + input1_ = AddInput(input1); + if (input2_data.size() > 0) + input2_ = AddConstInput(input2, input2_data); + else + input2_ = AddInput(input2); + output_ = AddOutput(output); + } + + void InitInterpreter() { + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + + template + void SetInput1(const std::vector& data) { + QuantizeAndPopulate(input1_, data); + } + + template + void SetInput2(const std::vector& data) { + QuantizeAndPopulate(input2_, data); + } + + template + std::vector GetDequantizedOutput() { + return Dequantize(ExtractVector(output_), GetScale(output_), + GetZeroPoint(output_)); + } + + std::vector GetOutputShape() { return GetTensorShape(output_); } + + protected: + int input1_; + int input2_; + int output_; +}; + +class AddOpModel : public ArithmeticOpBaseModel { + public: + AddOpModel(const TensorData& input1, const TensorData& input2, + const 
TensorData& output) + : ArithmeticOpBaseModel(input1, input2, output) {} + AddOpModel(const TensorData& input1, const TensorData& input2, + const TensorData& output, + const std::initializer_list& input1_data, + const std::initializer_list& input2_data) + : ArithmeticOpBaseModel(input1, input2, output, input1_data, + input2_data) {} + + void InitInterpreter() { + SetBuiltinOp( + BuiltinOperator_ADD, BuiltinOptions_AddOptions, + CreateAddOptions(builder_, ActivationFunctionType_NONE).Union()); + ArithmeticOpBaseModel::InitInterpreter(); + } +}; + +template +void QuantizedTestsNoActivation() { + const float kQuantizedTolerance = 2.0 / 255.0; + std::vector> inputs1 = { + {0.1, 0.2, 0.3, 0.4}, {-0.8, 0.2, 0.4, 0.7}, {-0.8, 0.2, 0.7, 0.3}}; + std::vector> inputs2 = { + {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}}; + for (size_t i = 0; i < 1; ++i) { + AddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, + {tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, + {tensor_type, {1, 2, 2, 1}, -1.0, 1.0}); + m.InitInterpreter(); + m.SetInput1(inputs1[i]); + m.SetInput2(inputs2[i]); + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + m.ApplyDelegateAndInvoke(); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, kQuantizedTolerance))) + << "With test number " << i; + } +} + +TEST(QuantizedAddOpModel, QuantizedTestsNoActivationUInt8) { + QuantizedTestsNoActivation(); +} + +TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt8) { + QuantizedTestsNoActivation(); +} + +TEST(QuantizedAddOpModel, QuantizedTestsNoActivationUInt8_ConstInput_1) { + const float kQuantizedTolerance = 2.0 / 255.0; + AddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {110, 142, 156, 171}, {}); + m.InitInterpreter(); + m.SetInput1({0.1, 0.2, 0.3, 0.4}); + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + m.ApplyDelegateAndInvoke(); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, kQuantizedTolerance))); +} + +TEST(QuantizedAddOpModel, QuantizedTestsNoActivationUInt8_ConstInput_2) { + const float kQuantizedTolerance = 2.0 / 255.0; + AddOpModel m({TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_UINT8, {1, 2, 2, 1}, -1.0, 1.0}, {}, + {110, 142, 156, 171}); + m.InitInterpreter(); + m.SetInput2({0.1, 0.2, 0.3, 0.4}); + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + m.ApplyDelegateAndInvoke(); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, kQuantizedTolerance))); +} + +TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt8_ConstInput) { + const float kQuantizedTolerance = 2.0 / 255.0; + AddOpModel m({TensorType_INT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_INT8, {1, 2, 2, 1}, -1.0, 1.0}, + {TensorType_INT8, {1, 2, 2, 1}, -1.0, 1.0}, {}, + {110, 101, 105, 120}); + m.InitInterpreter(); + m.SetInput2({0.1, 0.2, 0.3, 0.4}); + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + m.ApplyDelegateAndInvoke(); + EXPECT_THAT( + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, kQuantizedTolerance))); +} + +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate_kernel.cc b/tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate_kernel.cc index 5786562fc6a..d28791ce393 100644 --- 
a/tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate_kernel.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate_kernel.cc @@ -300,6 +300,19 @@ TfLiteStatus HexagonDelegateKernel::BuildGraph( for (int node_index : nodes_) { TF_LITE_ENSURE_STATUS( context->GetNodeAndRegistration(context, node_index, &node, ®)); + // Const inputs needs to be added to the hexagon graph as const nodes. + // Adding them earlier here to the graph + // - Simplifies separate builders + // - Simplifies int8 vs uint8 cases, builders don't need to handle them. + for (int i = 0; i < node->inputs->size; ++i) { + const int tensor_id = node->inputs->data[i]; + if (tensor_id == -1) continue; + const auto& input_tensor = context->tensors[tensor_id]; + if (input_tensor.allocation_type == kTfLiteMmapRo) { + builder_->AddConstNodeWithData(tensor_id, input_tensor, + /*int8_to_uint8*/ true); + } + } auto* op_builder = builder_->AddNodeFromTfLiteOp(reg->builtin_code, node, node_index); TF_LITE_ENSURE_STATUS( From 9322698c3b7b389d893d619f8c2fdb57b788522a Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Thu, 28 May 2020 13:06:39 -0700 Subject: [PATCH 1329/1533] Hexagon Delegate: Add note when using App bundle for application publishing. The delegate relies on file path for setting environment variable to the library path. PiperOrigin-RevId: 313645240 Change-Id: Icdf5d78e25361a7e0809c26510ac8a091a21b1da --- tensorflow/lite/g3doc/performance/hexagon_delegate.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/performance/hexagon_delegate.md b/tensorflow/lite/g3doc/performance/hexagon_delegate.md index 0e947d1d5e1..b76b4ba9fdf 100644 --- a/tensorflow/lite/g3doc/performance/hexagon_delegate.md +++ b/tensorflow/lite/g3doc/performance/hexagon_delegate.md @@ -244,6 +244,10 @@ TfLiteHexagonTearDown(); // Needed once at end of app/DSP usage. * ARM 32-bit: `app/src/main/jniLibs/armeabi-v7a` * Put your .so in the directory that match the architecture. +Note: If you're using App Bundle for publishing your Application, you might want +to set android.bundle.enableUncompressedNativeLibs=false in the +gradle.properties file. + ## Feedback For issues, please create a @@ -255,7 +259,8 @@ ro.board.platform`). ## FAQ * Which ops are supported by the delegate? - * See the current list of [supported ops and constraints](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/delegates/hexagon/README.md) + * See the current list of + [supported ops and constraints](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/delegates/hexagon/README.md) * How can I tell that the model is using the DSP when I enable the delegate? 
* Two log messages will be printed when you enable the delegate - one to indicate if the delegate was created and another to indicate how many From 245f3908068606c7b80da3375e04c2ede5125516 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 28 May 2020 13:08:24 -0700 Subject: [PATCH 1330/1533] Update Conv2D tests for XNNPACK delegate - Extract Conv2DTester class into a separate target - Add test cases for convolution with activations and multi-threaded inference PiperOrigin-RevId: 313645557 Change-Id: Ie0d2e9ff69cbc4dee04540671061a89dddce15a6 --- tensorflow/lite/delegates/xnnpack/BUILD | 26 +- .../lite/delegates/xnnpack/conv_2d_test.cc | 676 ++++++++---------- .../lite/delegates/xnnpack/conv_2d_tester.cc | 298 ++++++++ .../lite/delegates/xnnpack/conv_2d_tester.h | 224 ++++++ 4 files changed, 830 insertions(+), 394 deletions(-) create mode 100644 tensorflow/lite/delegates/xnnpack/conv_2d_tester.cc create mode 100644 tensorflow/lite/delegates/xnnpack/conv_2d_tester.h diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index df70a314308..00c0709ccc3 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -64,6 +64,23 @@ cc_library( ], ) +cc_library( + name = "conv_2d_tester", + testonly = 1, + srcs = ["conv_2d_tester.cc"], + hdrs = ["conv_2d_tester.h"], + deps = [ + "//tensorflow/lite:framework", + "//tensorflow/lite:schema_fbs_version", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/schema:schema_fbs", + "@FP16", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + cc_library( name = "depthwise_conv_2d_tester", testonly = 1, @@ -214,15 +231,10 @@ cc_test( "//conditions:default": [], }), deps = [ + ":conv_2d_tester", ":test_main", - ":xnnpack_delegate", - "//tensorflow/lite:framework", - "//tensorflow/lite:schema_fbs_version", - "//tensorflow/lite/kernels:builtin_ops", - "//tensorflow/lite/schema:schema_fbs", - "@FP16", + ":xnnpack_delegate_test_mode", "@com_google_googletest//:gtest", - "@flatbuffers", ], ) diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc index a8c6a1956bc..816b835883b 100644 --- a/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc +++ b/tensorflow/lite/delegates/xnnpack/conv_2d_test.cc @@ -13,407 +13,95 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include #include #include +#include #include -#include #include -#include -#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/delegates/xnnpack/conv_2d_tester.h" #include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/register.h" -#include "tensorflow/lite/model.h" -#include "tensorflow/lite/schema/schema_generated.h" -#include "tensorflow/lite/version.h" namespace tflite { namespace xnnpack { -namespace { - -class Conv2DTester { - public: - Conv2DTester() = default; - Conv2DTester(const Conv2DTester&) = delete; - Conv2DTester& operator=(const Conv2DTester&) = delete; - - Conv2DTester& BatchSize(int32_t batch_size) { - EXPECT_GT(batch_size, 0); - batch_size_ = batch_size; - return *this; - } - - int32_t BatchSize() const { return batch_size_; } - - Conv2DTester& InputChannels(int32_t input_channels) { - EXPECT_GT(input_channels, 0); - input_channels_ = input_channels; - return *this; - } - - int32_t InputChannels() const { return input_channels_; } - - Conv2DTester& OutputChannels(int32_t output_channels) { - EXPECT_GT(output_channels, 0); - output_channels_ = output_channels; - return *this; - } - - int32_t OutputChannels() const { return output_channels_; } - - Conv2DTester& InputHeight(int32_t input_height) { - EXPECT_GT(input_height, 0); - input_height_ = input_height; - return *this; - } - - int32_t InputHeight() const { return input_height_; } - - Conv2DTester& InputWidth(int32_t input_width) { - EXPECT_GT(input_width, 0); - input_width_ = input_width; - return *this; - } - - int32_t InputWidth() const { return input_width_; } - - int32_t OutputWidth() const { - if (SamePadding()) { - return (InputWidth() - 1) / StrideWidth() + 1; - } else { - return (InputWidth() - (KernelWidth() - 1) * DilationWidth() - 1) / - StrideWidth() + - 1; - } - } - - int32_t OutputHeight() const { - if (SamePadding()) { - return (InputHeight() - 1) / StrideHeight() + 1; - } else { - return (InputHeight() - (KernelHeight() - 1) * DilationHeight() - 1) / - StrideHeight() + - 1; - } - } - - Conv2DTester& KernelHeight(int32_t kernel_height) { - EXPECT_GT(kernel_height, 0); - kernel_height_ = kernel_height; - return *this; - } - - int32_t KernelHeight() const { return kernel_height_; } - - Conv2DTester& KernelWidth(int32_t kernel_width) { - EXPECT_GT(kernel_width, 0); - kernel_width_ = kernel_width; - return *this; - } - - int32_t KernelWidth() const { return kernel_width_; } - - Conv2DTester& StrideHeight(int32_t stride_height) { - EXPECT_GT(stride_height, 0); - stride_height_ = stride_height; - return *this; - } - - int32_t StrideHeight() const { return stride_height_; } - - Conv2DTester& StrideWidth(int32_t stride_width) { - EXPECT_GT(stride_width, 0); - stride_width_ = stride_width; - return *this; - } - - int32_t StrideWidth() const { return stride_width_; } - - Conv2DTester& DilationHeight(int32_t dilation_height) { - EXPECT_GT(dilation_height, 0); - dilation_height_ = dilation_height; - return *this; - } - - int32_t DilationHeight() const { return dilation_height_; } - - Conv2DTester& DilationWidth(int32_t dilation_width) { - EXPECT_GT(dilation_width, 0); - dilation_width_ = dilation_width; - return *this; - } - - int32_t DilationWidth() const { return dilation_width_; } - - inline Conv2DTester& FP16Weights() { - fp16_weights_ = true; - return *this; - } - - inline bool FP16Weights() const { return 
fp16_weights_; } - - Conv2DTester& SamePadding(bool same_padding) { - same_padding_ = same_padding; - return *this; - } - - bool SamePadding() const { return same_padding_; } - - void Test(TfLiteDelegate* delegate) const { - std::vector buffer = CreateTfLiteModel(); - const Model* model = GetModel(buffer.data()); - - std::unique_ptr delegate_interpreter; - ASSERT_EQ( - InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())( - &delegate_interpreter), - kTfLiteOk); - std::unique_ptr default_interpreter; - ASSERT_EQ( - InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())( - &default_interpreter), - kTfLiteOk); - - ASSERT_TRUE(delegate_interpreter); - ASSERT_TRUE(default_interpreter); - - ASSERT_EQ(delegate_interpreter->inputs().size(), 1); - ASSERT_EQ(default_interpreter->inputs().size(), 1); - - ASSERT_EQ(delegate_interpreter->outputs().size(), 1); - ASSERT_EQ(default_interpreter->outputs().size(), 1); - - ASSERT_EQ(delegate_interpreter->AllocateTensors(), kTfLiteOk); - ASSERT_EQ(default_interpreter->AllocateTensors(), kTfLiteOk); - - ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), - kTfLiteOk); - - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(), rng); - - float* default_input_data = default_interpreter->typed_tensor( - default_interpreter->inputs()[0]); - std::generate(default_input_data, - default_input_data + BatchSize() * InputHeight() * - InputWidth() * InputChannels(), - std::ref(f32rng)); - - float* xnnpack_input_data = delegate_interpreter->typed_tensor( - delegate_interpreter->inputs()[0]); - std::copy(default_input_data, - default_input_data + - BatchSize() * InputHeight() * InputWidth() * InputChannels(), - xnnpack_input_data); - - default_interpreter->Invoke(); - delegate_interpreter->Invoke(); - - float* default_output_data = default_interpreter->typed_tensor( - default_interpreter->outputs()[0]); - float* xnnpack_output_data = delegate_interpreter->typed_tensor( - delegate_interpreter->outputs()[0]); - - for (size_t i = 0; - i < BatchSize() * OutputHeight() * OutputWidth() * OutputChannels(); - i++) { - ASSERT_NEAR(default_output_data[i], xnnpack_output_data[i], - std::numeric_limits::epsilon() * - std::max(std::abs(default_output_data[i]) * 25.0f, 1.0f)); - } - } - - private: - std::vector CreateTfLiteModel() const { - std::random_device random_device; - auto rng = std::mt19937(random_device()); - auto f32rng = std::bind(std::uniform_real_distribution(), rng); - - flatbuffers::FlatBufferBuilder builder; - std::vector> operator_codes{ - {CreateOperatorCode(builder, BuiltinOperator_CONV_2D, 0)}}; - std::vector> operators; - std::vector> buffers{ - {CreateBuffer(builder, builder.CreateVector({}))}}; - - if (FP16Weights()) { - operator_codes.emplace_back( - CreateOperatorCode(builder, BuiltinOperator_DEQUANTIZE)); - - auto f16rng = std::bind(fp16_ieee_from_fp32_value, f32rng); - - std::vector filter_data(OutputChannels() * KernelHeight() * - KernelWidth() * InputChannels()); - std::vector bias_data(OutputChannels()); - - std::generate(filter_data.begin(), filter_data.end(), f16rng); - std::generate(bias_data.begin(), bias_data.end(), f16rng); - - buffers.emplace_back(CreateBuffer( - builder, builder.CreateVector( - reinterpret_cast(filter_data.data()), - sizeof(uint16_t) * filter_data.size()))); - buffers.emplace_back(CreateBuffer( - builder, builder.CreateVector( - reinterpret_cast(bias_data.data()), - sizeof(uint16_t) * 
bias_data.size()))); - - const std::array dequantize_filter_inputs{{0}}; - const std::array dequantize_filter_outputs{{3}}; - operators.emplace_back(CreateOperator( - builder, /*opcode_index=*/1, - builder.CreateVector(dequantize_filter_inputs.data(), - dequantize_filter_inputs.size()), - builder.CreateVector(dequantize_filter_outputs.data(), - dequantize_filter_outputs.size()))); - const std::array dequantize_bias_inputs{{1}}; - const std::array dequantize_bias_outputs{{4}}; - operators.emplace_back(CreateOperator( - builder, /*opcode_index=*/1, - builder.CreateVector(dequantize_bias_inputs.data(), - dequantize_bias_inputs.size()), - builder.CreateVector(dequantize_bias_outputs.data(), - dequantize_bias_outputs.size()))); - } else { - std::vector filter_data(OutputChannels() * KernelHeight() * - KernelWidth() * InputChannels()); - std::vector bias_data(OutputChannels()); - - std::generate(filter_data.begin(), filter_data.end(), f32rng); - std::generate(bias_data.begin(), bias_data.end(), f32rng); - - buffers.emplace_back(CreateBuffer( - builder, builder.CreateVector( - reinterpret_cast(filter_data.data()), - sizeof(float) * filter_data.size()))); - buffers.emplace_back(CreateBuffer( - builder, builder.CreateVector( - reinterpret_cast(bias_data.data()), - sizeof(float) * bias_data.size()))); - } - - const std::array input_shape{ - {BatchSize(), InputHeight(), InputWidth(), InputChannels()}}; - const std::array output_shape{ - {BatchSize(), OutputHeight(), OutputWidth(), OutputChannels()}}; - const std::array filter_shape{ - {OutputChannels(), KernelHeight(), KernelWidth(), InputChannels()}}; - const std::array bias_shape{{OutputChannels()}}; - - std::vector> tensors; - if (FP16Weights()) { - tensors.emplace_back( - CreateTensor(builder, - builder.CreateVector(filter_shape.data(), - filter_shape.size()), - TensorType_FLOAT16, /*buffer=*/1)); - tensors.emplace_back(CreateTensor( - builder, - builder.CreateVector(bias_shape.data(), bias_shape.size()), - TensorType_FLOAT16, /*buffer=*/2)); - } - tensors.emplace_back(CreateTensor( - builder, - builder.CreateVector(input_shape.data(), input_shape.size()), - TensorType_FLOAT32)); - tensors.emplace_back(CreateTensor( - builder, - builder.CreateVector(filter_shape.data(), filter_shape.size()), - TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 0 : 1)); - tensors.emplace_back(CreateTensor( - builder, - builder.CreateVector(bias_shape.data(), bias_shape.size()), - TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 0 : 2)); - tensors.emplace_back(CreateTensor( - builder, - builder.CreateVector(output_shape.data(), output_shape.size()), - TensorType_FLOAT32)); - - const std::array op_inputs{ - {static_cast(tensors.size()) - 4, - static_cast(tensors.size()) - 3, - static_cast(tensors.size()) - 2}}; - const std::array op_outputs{ - {static_cast(tensors.size()) - 1}}; - - flatbuffers::Offset conv2d_options = CreateConv2DOptions( - builder, SamePadding() ? 
tflite::Padding_SAME : tflite::Padding_VALID, - StrideWidth(), StrideHeight(), ActivationFunctionType_NONE, - DilationWidth(), DilationHeight()); - - operators.emplace_back(CreateOperator( - builder, /*opcode_index=*/0, - builder.CreateVector(op_inputs.data(), op_inputs.size()), - builder.CreateVector(op_outputs.data(), op_outputs.size()), - BuiltinOptions_Conv2DOptions, conv2d_options.Union())); - - const std::array subgraph_inputs{ - {static_cast(tensors.size()) - 4}}; - const std::array subgraph_outputs{ - {static_cast(tensors.size()) - 1}}; - flatbuffers::Offset subgraph = CreateSubGraph( - builder, builder.CreateVector(tensors.data(), tensors.size()), - builder.CreateVector(subgraph_inputs.data(), - subgraph_inputs.size()), - builder.CreateVector(subgraph_outputs.data(), - subgraph_outputs.size()), - builder.CreateVector(operators.data(), operators.size())); - - flatbuffers::Offset description = - builder.CreateString("Conv2D model"); - - flatbuffers::Offset model_buffer = CreateModel( - builder, TFLITE_SCHEMA_VERSION, - builder.CreateVector(operator_codes.data(), operator_codes.size()), - builder.CreateVector(&subgraph, 1), description, - builder.CreateVector(buffers.data(), buffers.size())); - - builder.Finish(model_buffer); - - return std::vector(builder.GetBufferPointer(), - builder.GetBufferPointer() + builder.GetSize()); - } - - int32_t batch_size_ = 1; - int32_t input_channels_ = 1; - int32_t output_channels_ = 1; - int32_t input_height_ = 1; - int32_t input_width_ = 1; - int32_t kernel_height_ = 1; - int32_t kernel_width_ = 1; - int32_t stride_height_ = 1; - int32_t stride_width_ = 1; - int32_t dilation_height_ = 1; - int32_t dilation_width_ = 1; - bool fp16_weights_ = false; - bool same_padding_ = true; -}; - -} // namespace - -TEST(Conv2D, Pointwise) { +TEST(Conv2D, 1x1) { std::unique_ptr xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), TfLiteXNNPackDelegateDelete); std::random_device random_device; auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); auto input_rng = std::bind(std::uniform_int_distribution(5, 25), std::ref(rng)); auto channel_rng = - std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); Conv2DTester() + .BatchSize(batch_rng()) .InputHeight(input_rng()) .InputWidth(input_rng()) .InputChannels(channel_rng()) .OutputChannels(channel_rng()) .KernelHeight(1) .KernelWidth(1) + .ValidPadding() + .Test(xnnpack_delegate.get()); +} + +TEST(Conv2D, 3x3) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(5, 25), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); + + Conv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(3) + .KernelWidth(3) + .SamePadding() + .Test(xnnpack_delegate.get()); +} + +TEST(Conv2D, 3x3Stride2) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + 
std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(5, 25), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); + + Conv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(3) + .KernelWidth(3) + .StrideHeight(2) + .StrideWidth(2) + .SamePadding() .Test(xnnpack_delegate.get()); } @@ -424,21 +112,24 @@ TEST(Conv2D, SmallKernelWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); auto input_rng = std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); auto kernel_rng = std::bind(std::uniform_int_distribution(2, 7), std::ref(rng)); auto channel_rng = - std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); Conv2DTester() + .BatchSize(batch_rng()) .InputHeight(input_rng()) .InputWidth(input_rng()) .InputChannels(channel_rng()) .OutputChannels(channel_rng()) .KernelHeight(kernel_rng()) .KernelWidth(kernel_rng()) - .SamePadding(true) + .SamePadding() .Test(xnnpack_delegate.get()); } @@ -449,21 +140,24 @@ TEST(Conv2D, SmallKernelWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); auto input_rng = std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); auto kernel_rng = std::bind(std::uniform_int_distribution(2, 7), std::ref(rng)); auto channel_rng = - std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); Conv2DTester() + .BatchSize(batch_rng()) .InputHeight(input_rng()) .InputWidth(input_rng()) .InputChannels(channel_rng()) .OutputChannels(channel_rng()) .KernelHeight(kernel_rng()) .KernelWidth(kernel_rng()) - .SamePadding(false) + .ValidPadding() .Test(xnnpack_delegate.get()); } @@ -474,6 +168,8 @@ TEST(Conv2D, StrideWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); auto input_rng = std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); auto kernel_rng = @@ -481,9 +177,10 @@ TEST(Conv2D, StrideWithSamePadding) { auto stride_rng = std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); auto channel_rng = - std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); Conv2DTester() + .BatchSize(batch_rng()) .InputHeight(input_rng()) .InputWidth(input_rng()) .InputChannels(channel_rng()) @@ -492,7 +189,7 @@ TEST(Conv2D, StrideWithSamePadding) { .KernelWidth(kernel_rng()) .StrideHeight(stride_rng()) .StrideWidth(stride_rng()) - .SamePadding(true) + .SamePadding() .Test(xnnpack_delegate.get()); } @@ -503,6 +200,8 @@ TEST(Conv2D, StrideWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); auto input_rng = std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); auto kernel_rng = @@ -510,9 +209,10 @@ TEST(Conv2D, StrideWithValidPadding) { auto stride_rng = std::bind(std::uniform_int_distribution(2, 3), 
std::ref(rng)); auto channel_rng = - std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); Conv2DTester() + .BatchSize(batch_rng()) .InputHeight(input_rng()) .InputWidth(input_rng()) .InputChannels(channel_rng()) @@ -521,7 +221,7 @@ TEST(Conv2D, StrideWithValidPadding) { .KernelWidth(kernel_rng()) .StrideHeight(stride_rng()) .StrideWidth(stride_rng()) - .SamePadding(false) + .ValidPadding() .Test(xnnpack_delegate.get()); } @@ -532,6 +232,8 @@ TEST(Conv2D, DilationWithSamePadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); auto input_rng = std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); auto kernel_rng = @@ -539,9 +241,10 @@ TEST(Conv2D, DilationWithSamePadding) { auto dilation_rng = std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); auto channel_rng = - std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); Conv2DTester() + .BatchSize(batch_rng()) .InputHeight(input_rng()) .InputWidth(input_rng()) .InputChannels(channel_rng()) @@ -550,7 +253,7 @@ TEST(Conv2D, DilationWithSamePadding) { .KernelWidth(kernel_rng()) .DilationHeight(dilation_rng()) .DilationWidth(dilation_rng()) - .SamePadding(true) + .SamePadding() .Test(xnnpack_delegate.get()); } @@ -561,6 +264,8 @@ TEST(Conv2D, DilationWithValidPadding) { std::random_device random_device; auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); auto input_rng = std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); auto kernel_rng = @@ -568,9 +273,10 @@ TEST(Conv2D, DilationWithValidPadding) { auto dilation_rng = std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); auto channel_rng = - std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + std::bind(std::uniform_int_distribution(2, 16), std::ref(rng)); Conv2DTester() + .BatchSize(batch_rng()) .InputHeight(input_rng()) .InputWidth(input_rng()) .InputChannels(channel_rng()) @@ -579,7 +285,7 @@ TEST(Conv2D, DilationWithValidPadding) { .KernelWidth(kernel_rng()) .DilationHeight(dilation_rng()) .DilationWidth(dilation_rng()) - .SamePadding(false) + .ValidPadding() .Test(xnnpack_delegate.get()); } @@ -590,6 +296,8 @@ TEST(Conv2D, FP16Weights) { std::random_device random_device; auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); auto input_rng = std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); auto kernel_rng = @@ -600,6 +308,7 @@ TEST(Conv2D, FP16Weights) { std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); Conv2DTester() + .BatchSize(batch_rng()) .InputHeight(input_rng()) .InputWidth(input_rng()) .InputChannels(channel_rng()) @@ -608,10 +317,203 @@ TEST(Conv2D, FP16Weights) { .KernelWidth(kernel_rng()) .StrideHeight(stride_rng()) .StrideWidth(stride_rng()) - .SamePadding(true) .FP16Weights() .Test(xnnpack_delegate.get()); } +TEST(Conv2D, ReluActivation) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(10, 25), 
std::ref(rng)); + auto kernel_rng = + std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); + auto stride_rng = + std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + + Conv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(kernel_rng()) + .KernelWidth(kernel_rng()) + .StrideHeight(stride_rng()) + .StrideWidth(stride_rng()) + .ReluActivation() + .Test(xnnpack_delegate.get()); +} + +TEST(Conv2D, Relu6Activation) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); + auto kernel_rng = + std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); + auto stride_rng = + std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + + Conv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(kernel_rng()) + .KernelWidth(kernel_rng()) + .StrideHeight(stride_rng()) + .StrideWidth(stride_rng()) + .Relu6Activation() + .Test(xnnpack_delegate.get()); +} + +TEST(Conv2D, ReluMinus1To1Activation) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); + auto kernel_rng = + std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); + auto stride_rng = + std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + + Conv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(kernel_rng()) + .KernelWidth(kernel_rng()) + .StrideHeight(stride_rng()) + .StrideWidth(stride_rng()) + .ReluMinus1To1Activation() + .Test(xnnpack_delegate.get()); +} + +TEST(Conv2D, DISABLED_TanhActivation) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); + auto kernel_rng = + std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); + auto stride_rng = + std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + + Conv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(kernel_rng()) + .KernelWidth(kernel_rng()) + .StrideHeight(stride_rng()) + .StrideWidth(stride_rng()) + 
.TanhActivation() + .Test(xnnpack_delegate.get()); +} + +TEST(Conv2D, DISABLED_SignBitActivation) { + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); + auto kernel_rng = + std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); + auto stride_rng = + std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + + Conv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(kernel_rng()) + .KernelWidth(kernel_rng()) + .StrideHeight(stride_rng()) + .StrideWidth(stride_rng()) + .SignBitActivation() + .Test(xnnpack_delegate.get()); +} + +TEST(Conv2D, MultiThreading) { + TfLiteXNNPackDelegateOptions delegate_options = + TfLiteXNNPackDelegateOptionsDefault(); + delegate_options.num_threads = 2; + std::unique_ptr + xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options), + TfLiteXNNPackDelegateDelete); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto batch_rng = + std::bind(std::uniform_int_distribution(2, 4), std::ref(rng)); + auto input_rng = + std::bind(std::uniform_int_distribution(10, 25), std::ref(rng)); + auto kernel_rng = + std::bind(std::uniform_int_distribution(3, 5), std::ref(rng)); + auto stride_rng = + std::bind(std::uniform_int_distribution(2, 3), std::ref(rng)); + auto channel_rng = + std::bind(std::uniform_int_distribution(1, 16), std::ref(rng)); + + Conv2DTester() + .BatchSize(batch_rng()) + .InputHeight(input_rng()) + .InputWidth(input_rng()) + .InputChannels(channel_rng()) + .OutputChannels(channel_rng()) + .KernelHeight(kernel_rng()) + .KernelWidth(kernel_rng()) + .StrideHeight(stride_rng()) + .StrideWidth(stride_rng()) + .Test(xnnpack_delegate.get()); +} + } // namespace xnnpack } // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_tester.cc b/tensorflow/lite/delegates/xnnpack/conv_2d_tester.cc new file mode 100644 index 00000000000..476889bde51 --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/conv_2d_tester.cc @@ -0,0 +1,298 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/xnnpack/conv_2d_tester.h" + +#include +#include +#include +#include +#include + +#include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/model.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/version.h" + +namespace tflite { +namespace xnnpack { + +void Conv2DTester::Test(TfLiteDelegate* delegate) const { + std::vector buffer = CreateTfLiteModel(); + const Model* model = GetModel(buffer.data()); + + std::unique_ptr delegate_interpreter; + ASSERT_EQ( + InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())( + &delegate_interpreter), + kTfLiteOk); + std::unique_ptr default_interpreter; + ASSERT_EQ( + InterpreterBuilder(model, ::tflite::ops::builtin::BuiltinOpResolver())( + &default_interpreter), + kTfLiteOk); + + ASSERT_TRUE(delegate_interpreter); + ASSERT_TRUE(default_interpreter); + + ASSERT_EQ(delegate_interpreter->inputs().size(), 1); + ASSERT_EQ(default_interpreter->inputs().size(), 1); + + ASSERT_EQ(delegate_interpreter->outputs().size(), 1); + ASSERT_EQ(default_interpreter->outputs().size(), 1); + + ASSERT_EQ(delegate_interpreter->AllocateTensors(), kTfLiteOk); + ASSERT_EQ(default_interpreter->AllocateTensors(), kTfLiteOk); + + ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), kTfLiteOk); + + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto input_rng = + std::bind(std::uniform_real_distribution(), std::ref(rng)); + float* default_input_data = default_interpreter->typed_tensor( + default_interpreter->inputs()[0]); + std::generate(default_input_data, + default_input_data + BatchSize() * InputHeight() * + InputWidth() * InputChannels(), + input_rng); + + float* delegate_input_data = delegate_interpreter->typed_tensor( + delegate_interpreter->inputs()[0]); + std::copy(default_input_data, + default_input_data + + BatchSize() * InputHeight() * InputWidth() * InputChannels(), + delegate_input_data); + + ASSERT_EQ(default_interpreter->Invoke(), kTfLiteOk); + ASSERT_EQ(delegate_interpreter->Invoke(), kTfLiteOk); + + float* default_output_data = default_interpreter->typed_tensor( + default_interpreter->outputs()[0]); + float* delegate_output_data = delegate_interpreter->typed_tensor( + delegate_interpreter->outputs()[0]); + + for (int32_t i = 0; i < BatchSize(); i++) { + for (int32_t y = 0; y < OutputHeight(); y++) { + for (int32_t x = 0; x < OutputWidth(); x++) { + for (int32_t c = 0; c < OutputChannels(); c++) { + const int32_t index = ((i * OutputHeight() + y) * OutputWidth() + x) * + OutputChannels() + + c; + ASSERT_NEAR(default_output_data[index], delegate_output_data[index], + std::abs(default_output_data[index]) * 3.0e-6f) + << "batch " << i << " / " << BatchSize() << ", y position " << y + << " / " << OutputHeight() << ", x position " << x << " / " + << OutputWidth() << ", channel " << c << " / " + << OutputChannels(); + } + } + } + } +} + +std::vector Conv2DTester::CreateTfLiteModel() const { + std::random_device random_device; + auto rng = std::mt19937(random_device()); + auto range_rng = std::bind( + std::uniform_real_distribution(-25.0f, 25.0f), std::ref(rng)); + + flatbuffers::FlatBufferBuilder builder; + std::vector> operator_codes{ + {CreateOperatorCode(builder, BuiltinOperator_CONV_2D)}}; + std::vector> operators; + 
std::vector> buffers{ + {CreateBuffer(builder, builder.CreateVector({}))}}; + + if (FP16Weights()) { + operator_codes.emplace_back( + CreateOperatorCode(builder, BuiltinOperator_DEQUANTIZE)); + + std::vector filter_data(OutputChannels() * KernelHeight() * + KernelWidth() * InputChannels()); + std::vector bias_data(OutputChannels()); + for (int32_t oc = 0; oc < OutputChannels(); oc++) { + // Use the same range of all-positive or all-negative values to generate + // all weights within the same output channel, but different ranges for + // different output channels. This ensures that no catastrophic + // cancellation occur, but test covers both positive and negative inputs. + const float range = range_rng(); + auto value_rng = + std::bind(fp16_ieee_from_fp32_value, + std::bind(std::uniform_real_distribution( + std::min(range, 0.0f), std::max(range, 0.0f)), + std::ref(rng))); + bias_data[oc] = value_rng(); + for (int32_t ic = 0; ic < InputChannels(); ic++) { + for (int32_t y = 0; y < KernelHeight(); y++) { + for (int32_t x = 0; x < KernelWidth(); x++) { + const int32_t index = + ((oc * KernelHeight() + y) * KernelWidth() + x) * + InputChannels() + + ic; + filter_data[index] = value_rng(); + } + } + } + } + + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(filter_data.data()), + sizeof(uint16_t) * filter_data.size()))); + buffers.emplace_back(CreateBuffer( + builder, + builder.CreateVector(reinterpret_cast(bias_data.data()), + sizeof(uint16_t) * bias_data.size()))); + + const std::array dequantize_filter_inputs{{0}}; + const std::array dequantize_filter_outputs{{3}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_filter_inputs.data(), + dequantize_filter_inputs.size()), + builder.CreateVector(dequantize_filter_outputs.data(), + dequantize_filter_outputs.size()))); + const std::array dequantize_bias_inputs{{1}}; + const std::array dequantize_bias_outputs{{4}}; + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/1, + builder.CreateVector(dequantize_bias_inputs.data(), + dequantize_bias_inputs.size()), + builder.CreateVector(dequantize_bias_outputs.data(), + dequantize_bias_outputs.size()))); + } else { + std::vector filter_data(OutputChannels() * KernelHeight() * + KernelWidth() * InputChannels()); + std::vector bias_data(OutputChannels()); + for (int32_t oc = 0; oc < OutputChannels(); oc++) { + // Use the same range of all-positive or all-negative values to generate + // all weights within the same output channel, but different ranges for + // different output channels. This ensures that no catastrophic + // cancellation occur, but test covers both positive and negative inputs. 
+ const float range = range_rng(); + auto value_rng = + std::bind(std::uniform_real_distribution( + std::min(range, 0.0f), std::max(range, 0.0f)), + std::ref(rng)); + bias_data[oc] = value_rng(); + for (int32_t ic = 0; ic < InputChannels(); ic++) { + for (int32_t y = 0; y < KernelHeight(); y++) { + for (int32_t x = 0; x < KernelWidth(); x++) { + const int32_t index = + ((oc * KernelHeight() + y) * KernelWidth() + x) * + InputChannels() + + ic; + filter_data[index] = value_rng(); + } + } + } + } + + buffers.emplace_back(CreateBuffer( + builder, builder.CreateVector( + reinterpret_cast(filter_data.data()), + sizeof(float) * filter_data.size()))); + buffers.emplace_back(CreateBuffer( + builder, + builder.CreateVector(reinterpret_cast(bias_data.data()), + sizeof(float) * bias_data.size()))); + } + + const std::array input_shape{ + {BatchSize(), InputHeight(), InputWidth(), InputChannels()}}; + const std::array output_shape{ + {BatchSize(), OutputHeight(), OutputWidth(), OutputChannels()}}; + const std::array filter_shape{ + {OutputChannels(), KernelHeight(), KernelWidth(), InputChannels()}}; + const std::array bias_shape{{OutputChannels()}}; + + std::vector> tensors; + if (FP16Weights()) { + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(filter_shape.data(), filter_shape.size()), + TensorType_FLOAT16, /*buffer=*/1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(bias_shape.data(), bias_shape.size()), + TensorType_FLOAT16, /*buffer=*/2)); + } + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(input_shape.data(), input_shape.size()), + TensorType_FLOAT32)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(filter_shape.data(), filter_shape.size()), + TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 0 : 1)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(bias_shape.data(), bias_shape.size()), + TensorType_FLOAT32, /*buffer=*/FP16Weights() ? 
0 : 2)); + tensors.emplace_back(CreateTensor( + builder, + builder.CreateVector(output_shape.data(), output_shape.size()), + TensorType_FLOAT32)); + + const std::array op_inputs{ + {static_cast(tensors.size()) - 4, + static_cast(tensors.size()) - 3, + static_cast(tensors.size()) - 2}}; + const std::array op_outputs{ + {static_cast(tensors.size()) - 1}}; + + flatbuffers::Offset conv2d_options = + CreateConv2DOptions(builder, Padding(), StrideWidth(), StrideHeight(), + Activation(), DilationWidth(), DilationHeight()); + operators.emplace_back(CreateOperator( + builder, /*opcode_index=*/0, + builder.CreateVector(op_inputs.data(), op_inputs.size()), + builder.CreateVector(op_outputs.data(), op_outputs.size()), + BuiltinOptions_Conv2DOptions, conv2d_options.Union())); + + const std::array subgraph_inputs{ + {static_cast(tensors.size()) - 4}}; + const std::array subgraph_outputs{ + {static_cast(tensors.size()) - 1}}; + flatbuffers::Offset subgraph = CreateSubGraph( + builder, builder.CreateVector(tensors.data(), tensors.size()), + builder.CreateVector(subgraph_inputs.data(), + subgraph_inputs.size()), + builder.CreateVector(subgraph_outputs.data(), + subgraph_outputs.size()), + builder.CreateVector(operators.data(), operators.size())); + + flatbuffers::Offset description = + builder.CreateString("Conv2D model"); + + flatbuffers::Offset model_buffer = CreateModel( + builder, TFLITE_SCHEMA_VERSION, + builder.CreateVector(operator_codes.data(), operator_codes.size()), + builder.CreateVector(&subgraph, 1), description, + builder.CreateVector(buffers.data(), buffers.size())); + + builder.Finish(model_buffer); + + return std::vector(builder.GetBufferPointer(), + builder.GetBufferPointer() + builder.GetSize()); +} + +} // namespace xnnpack +} // namespace tflite diff --git a/tensorflow/lite/delegates/xnnpack/conv_2d_tester.h b/tensorflow/lite/delegates/xnnpack/conv_2d_tester.h new file mode 100644 index 00000000000..6b22ded3458 --- /dev/null +++ b/tensorflow/lite/delegates/xnnpack/conv_2d_tester.h @@ -0,0 +1,224 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_CONV_2D_TESTER_H_ +#define TENSORFLOW_LITE_DELEGATES_XNNPACK_CONV_2D_TESTER_H_ + +#include +#include + +#include +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +namespace xnnpack { + +class Conv2DTester { + public: + Conv2DTester() = default; + Conv2DTester(const Conv2DTester&) = delete; + Conv2DTester& operator=(const Conv2DTester&) = delete; + + inline Conv2DTester& BatchSize(int32_t batch_size) { + EXPECT_GT(batch_size, 0); + batch_size_ = batch_size; + return *this; + } + + inline int32_t BatchSize() const { return batch_size_; } + + inline Conv2DTester& InputChannels(int32_t input_channels) { + EXPECT_GT(input_channels, 0); + input_channels_ = input_channels; + return *this; + } + + inline int32_t InputChannels() const { return input_channels_; } + + inline Conv2DTester& OutputChannels(int32_t output_channels) { + EXPECT_GT(output_channels, 0); + output_channels_ = output_channels; + return *this; + } + + inline int32_t OutputChannels() const { return output_channels_; } + + inline Conv2DTester& InputHeight(int32_t input_height) { + EXPECT_GT(input_height, 0); + input_height_ = input_height; + return *this; + } + + inline int32_t InputHeight() const { return input_height_; } + + inline Conv2DTester& InputWidth(int32_t input_width) { + EXPECT_GT(input_width, 0); + input_width_ = input_width; + return *this; + } + + inline int32_t InputWidth() const { return input_width_; } + + inline int32_t OutputWidth() const { + if (Padding() == ::tflite::Padding_SAME) { + EXPECT_GE(InputWidth(), 1); + return (InputWidth() - 1) / StrideWidth() + 1; + } else { + EXPECT_GE(InputWidth(), DilatedKernelWidth()); + return 1 + (InputWidth() - DilatedKernelWidth()) / StrideWidth(); + } + } + + inline int32_t OutputHeight() const { + if (Padding() == ::tflite::Padding_SAME) { + EXPECT_GE(InputHeight(), 1); + return (InputHeight() - 1) / StrideHeight() + 1; + } else { + EXPECT_GE(InputHeight(), DilatedKernelHeight()); + return 1 + (InputHeight() - DilatedKernelHeight()) / StrideHeight(); + } + } + + inline Conv2DTester& KernelHeight(int32_t kernel_height) { + EXPECT_GT(kernel_height, 0); + kernel_height_ = kernel_height; + return *this; + } + + inline int32_t KernelHeight() const { return kernel_height_; } + + inline Conv2DTester& KernelWidth(int32_t kernel_width) { + EXPECT_GT(kernel_width, 0); + kernel_width_ = kernel_width; + return *this; + } + + inline int32_t KernelWidth() const { return kernel_width_; } + + inline Conv2DTester& StrideHeight(int32_t stride_height) { + EXPECT_GT(stride_height, 0); + stride_height_ = stride_height; + return *this; + } + + inline int32_t StrideHeight() const { return stride_height_; } + + inline Conv2DTester& StrideWidth(int32_t stride_width) { + EXPECT_GT(stride_width, 0); + stride_width_ = stride_width; + return *this; + } + + inline int32_t StrideWidth() const { return stride_width_; } + + inline Conv2DTester& DilationHeight(int32_t dilation_height) { + EXPECT_GT(dilation_height, 0); + dilation_height_ = dilation_height; + return *this; + } + + inline int32_t DilationHeight() const { return dilation_height_; } + + inline Conv2DTester& DilationWidth(int32_t dilation_width) { + EXPECT_GT(dilation_width, 0); + dilation_width_ = dilation_width; + return *this; + } + + inline int32_t DilationWidth() const { return dilation_width_; } + + inline int32_t DilatedKernelHeight() const { + 
return (KernelHeight() - 1) * DilationHeight() + 1; + } + + inline int32_t DilatedKernelWidth() const { + return (KernelWidth() - 1) * DilationWidth() + 1; + } + + inline Conv2DTester& FP16Weights() { + fp16_weights_ = true; + return *this; + } + + inline bool FP16Weights() const { return fp16_weights_; } + + inline Conv2DTester& SamePadding() { + padding_ = ::tflite::Padding_SAME; + return *this; + } + + inline Conv2DTester& ValidPadding() { + padding_ = ::tflite::Padding_VALID; + return *this; + } + + inline Conv2DTester& ReluActivation() { + activation_ = ::tflite::ActivationFunctionType_RELU; + return *this; + } + + inline Conv2DTester& Relu6Activation() { + activation_ = ::tflite::ActivationFunctionType_RELU6; + return *this; + } + + inline Conv2DTester& ReluMinus1To1Activation() { + activation_ = ::tflite::ActivationFunctionType_RELU_N1_TO_1; + return *this; + } + + inline Conv2DTester& TanhActivation() { + activation_ = ::tflite::ActivationFunctionType_TANH; + return *this; + } + + inline Conv2DTester& SignBitActivation() { + activation_ = ::tflite::ActivationFunctionType_SIGN_BIT; + return *this; + } + + void Test(TfLiteDelegate* delegate) const; + + private: + std::vector CreateTfLiteModel() const; + + inline ::tflite::Padding Padding() const { return padding_; } + + inline ::tflite::ActivationFunctionType Activation() const { + return activation_; + } + + int32_t batch_size_ = 1; + int32_t input_channels_ = 1; + int32_t output_channels_ = 1; + int32_t input_height_ = 1; + int32_t input_width_ = 1; + int32_t kernel_height_ = 1; + int32_t kernel_width_ = 1; + int32_t stride_height_ = 1; + int32_t stride_width_ = 1; + int32_t dilation_height_ = 1; + int32_t dilation_width_ = 1; + bool fp16_weights_ = false; + ::tflite::Padding padding_ = ::tflite::Padding_VALID; + ::tflite::ActivationFunctionType activation_ = + ::tflite::ActivationFunctionType_NONE; +}; + +} // namespace xnnpack +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_XNNPACK_CONV_2D_TESTER_H_ From 16597a909a21a4e9d52ad4069ed33b06bcc886ae Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Thu, 28 May 2020 13:30:47 -0700 Subject: [PATCH 1331/1533] Added mlir tests for dynamic strided slice Strided slice has no tests for dynamically shaped tensors. Included some simple examples for lowerings when the Shape dialect is ready. 
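For context, the dynamically shaped case these tests exercise corresponds to slicing a tensor whose dimensions are unknown at trace time. A minimal Python-level sketch, assuming a standard TF 2.x install (the function name and constants are illustrative only, not part of this change):

import tensorflow as tf

@tf.function(input_signature=[tf.TensorSpec([None, None], tf.float32)])
def dynamic_slice(x):
  # begin/end/strides are constants, but x has shape (None, None), so the
  # StridedSlice op sees a dynamically shaped operand -- the case covered by
  # the new legalize-tf.mlir tests below.
  return tf.strided_slice(x, begin=[0, 1], end=[3, 7], strides=[1, 3])

print(dynamic_slice(tf.ones([4, 8])))  # 3x2 slice of a concrete input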
PiperOrigin-RevId: 313649349 Change-Id: I6aca6397c9a2e1ceaab13b9ed6d72703906b5fdb --- .../compiler/mlir/xla/tests/legalize-tf.mlir | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 363e60eb341..6406e2fee48 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -2180,6 +2180,18 @@ func @simple_strided_slice(%input: tensor<4x8xf32>) -> tensor<3x2xf32> { return %output : tensor<3x2xf32> } +// CHECK-LABEL: dynamic_strided_slice +func @dynamic_strided_slice(%input: tensor) -> tensor { + %begin = "tf.Const"() {value = dense<[0, 1]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %end = "tf.Const"() {value = dense<[3, 7]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %strides = "tf.Const"() {value = dense<[1, 3]> : tensor<2xi32>} : () -> (tensor<2xi32>) + + // CHECK: "tf.StridedSlice" + %output = "tf.StridedSlice"(%input, %begin, %end, %strides) + : (tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor + return %output : tensor +} + // CHECK-LABEL: strided_slice_negative_indices func @strided_slice_negative_indices(%input: tensor<4x8xf32>) -> tensor<3x2xf32> { %begin = "tf.Const"() {value = dense<[-1, -2]> : tensor<2xi32>} : () -> (tensor<2xi32>) @@ -2199,6 +2211,18 @@ func @strided_slice_negative_indices(%input: tensor<4x8xf32>) -> tensor<3x2xf32> return %output : tensor<3x2xf32> } +// CHECK-LABEL: dynamic_strided_slice_negative_indices +func @dynamic_strided_slice_negative_indices(%input: tensor) -> tensor { + %begin = "tf.Const"() {value = dense<[-1, -2]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %end = "tf.Const"() {value = dense<[-4, -8]> : tensor<2xi32>} : () -> (tensor<2xi32>) + %strides = "tf.Const"() {value = dense<[-1, -3]> : tensor<2xi32>} : () -> (tensor<2xi32>) + + // CHECK: tf.StridedSlice + %output = "tf.StridedSlice"(%input, %begin, %end, %strides) + : (tensor, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor + return %output : tensor +} + // CHECK-LABEL: strided_slice_range_clamping func @strided_slice_range_clamping(%input: tensor<4x8xf32>) -> tensor<0x3xf32> { %begin = "tf.Const"() {value = dense<[-4, -10]> : tensor<2xi32>} : () -> (tensor<2xi32>) From 1df5cf18634c1d2bce98e6d300ed4486d8ff62b8 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Thu, 28 May 2020 13:45:49 -0700 Subject: [PATCH 1332/1533] Broadcast start, stop together in linspace to match numpy behavior. 
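The numpy behavior being matched (available in numpy >= 1.16, which is why the new test skips older versions) broadcasts start and stop against each other before inserting the num samples at the requested axis. A small illustrative example:

import numpy as np

start = np.zeros((2, 2))   # per-element start values
stop = 10.0                # scalar stop, broadcast to start's shape
out = np.linspace(start, stop, num=5, axis=0)
print(out.shape)           # (5, 2, 2): the sample dimension is inserted at axis 0

After this change, math_ops.linspace_nd applies the same broadcast (via array_ops.broadcast_dynamic_shape and broadcast_to) before expanding start and stop along the requested axis.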
PiperOrigin-RevId: 313652044 Change-Id: I7dc210ec242fc08051a2b8022306f823a76b6c53 --- tensorflow/python/ops/math_ops.py | 18 ++++++++++++----- tensorflow/python/ops/math_ops_test.py | 27 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index ed1db4f539d..9b3a211d6fe 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -167,6 +167,11 @@ def linspace_nd(start, stop, num, name=None, axis=0): num_int = array_ops.convert_to_int_tensor(num, name="num") num = cast(num_int, dtype=start.dtype) + broadcast_shape = array_ops.broadcast_dynamic_shape( + array_ops.shape(start), array_ops.shape(stop)) + start = array_ops.broadcast_to(start, broadcast_shape) + stop = array_ops.broadcast_to(stop, broadcast_shape) + expanded_start = array_ops.expand_dims(start, axis=axis) expanded_stop = array_ops.expand_dims(stop, axis=axis) @@ -175,15 +180,18 @@ def linspace_nd(start, stop, num, name=None, axis=0): axis = array_ops.where_v2(axis >= 0, axis, ndims + axis) - # to avoid having negative values in the range or zero division - # The result is sliced in the end so a correct result is returned for + # To avoid having negative values in the range or zero division + # the result is sliced in the end so a correct result is returned for # num == 1. - n_steps = gen_math_ops.maximum(num - 1., 1.) - delta = (expanded_stop - expanded_start) / n_steps + n_steps = gen_math_ops.maximum(num_int - 1, 1) + delta = (expanded_stop - expanded_start) / cast(n_steps, + expanded_stop.dtype) # If num < 0, we will throw exception in the range # otherwise use the same div for delta range_end = array_ops.where_v2(num_int > 0, n_steps, -1) - num_range = range(1., range_end, dtype=start.dtype) + # Even though range supports an output dtype, its limited + # (e.g. doesn't support half at the moment). 
+ num_range = cast(range(1, range_end, dtype=dtypes.int64), start.dtype) shape_range = range(ndims) ones_like_shape_range = array_ops.ones_like(shape_range) axis_tiled = ones_like_shape_range * axis diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 9093a06b84a..0434aadc066 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -17,6 +17,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import distutils +import itertools + import numpy as np from tensorflow.python.eager import backprop @@ -785,5 +788,29 @@ class RangeTest(test_util.TensorFlowTestCase): self.assertAllEqual(values, self.evaluate(tensor)) +@test_util.run_all_in_graph_and_eager_modes +class LinspaceTest(test_util.TensorFlowTestCase): + + def testLinspaceBroadcasts(self): + if distutils.version.LooseVersion( + np.version.version) < distutils.version.LooseVersion("1.16.0"): + self.skipTest("numpy doesn't support axes before version 1.16.0") + + shapes = [(), (2,), (2, 2)] + + for start_shape, stop_shape in itertools.product(shapes, repeat=2): + for num in [1, 2, 20]: + ndims = max(len(start_shape), len(stop_shape)) + for axis in range(-ndims, ndims): + start = np.ones(start_shape) + stop = 10 * np.ones(stop_shape) + + np_ans = np.linspace(start, stop, num, axis=axis) + tf_ans = self.evaluate( + math_ops.linspace_nd(start, stop, num, axis=axis)) + + self.assertAllClose(np_ans, tf_ans) + + if __name__ == "__main__": googletest.main() From 8ab1c251ed27950b850cbc106bbe9a798b16b79a Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Thu, 28 May 2020 13:46:00 -0700 Subject: [PATCH 1333/1533] Remove unused header This breaks TF OSS builds which targets C++14, and it doesn't seem to be used. PiperOrigin-RevId: 313652090 Change-Id: I2176a01dfc84298ad60e73d904861dc00ed76b04 --- tensorflow/compiler/mlir/lite/quantization/quantization_config.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h index a4046553d17..0e766ec52b6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h @@ -19,7 +19,6 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_ -#include #include #include From 68e13f00e14c36b95667b8c3454c1ec867df3976 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Thu, 28 May 2020 14:26:22 -0700 Subject: [PATCH 1334/1533] [XLA/GPU] Move Thunk::Initialize() calls before ExecuteThunks(). Separating the initialization from execution makes the execution easier for LHLO graph to replace. PiperOrigin-RevId: 313659758 Change-Id: I30f47cf6186ee907bfc67701be2d8c190f3b524e --- tensorflow/compiler/xla/service/gpu/gpu_executable.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 2df6b50d361..b9d1f3ef158 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -176,7 +176,6 @@ Status GpuExecutable::ExecuteThunks( // module, we won't get any data, but that's probably an OK trade-off. 
ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); - TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction()); se::Stream* stream = @@ -387,6 +386,10 @@ StatusOr GpuExecutable::ExecuteAsyncOnStream( assignment_.get(), executor->device_ordinal(), memory_allocator)); } + for (Thunk* thunk : thunk_schedule_->TotalOrder()) { + TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); + } + TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations, block_host_until_done, hlo_execution_profile)); From a5393e904644b88de2d36cf8ecb85a9fd9f69c8f Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Thu, 28 May 2020 14:29:56 -0700 Subject: [PATCH 1335/1533] Support the new CUDA compute capability options in configure. sm_35,sm_50,sm_60,compute_70 PiperOrigin-RevId: 313660333 Change-Id: I08b6ccd62fac60645147c30c434055b4e608b190 --- configure.py | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/configure.py b/configure.py index 0a5b87172c0..29c37f601fd 100644 --- a/configure.py +++ b/configure.py @@ -484,8 +484,8 @@ def check_bazel_version(min_version, max_version): stderr = open(os.devnull, 'wb') curr_version = run_shell(['bazel', '--version'], - allow_non_zero = True, - stderr = stderr) + allow_non_zero=True, + stderr=stderr) if curr_version.startswith('bazel '): curr_version = curr_version.split('bazel ')[1] @@ -1011,17 +1011,15 @@ def set_tf_cuda_compute_capabilities(environ_cp): default_cuda_compute_capabilities = native_cuda_compute_capabilities ask_cuda_compute_capabilities = ( - 'Please specify a list of comma-separated ' - 'CUDA compute capabilities you want to ' - 'build with.\nYou can find the compute ' - 'capability of your device at: ' - 'https://developer.nvidia.com/cuda-gpus.\nPlease' - ' note that each additional compute ' - 'capability significantly increases your ' - 'build time and binary size, and that ' - 'TensorFlow only supports compute ' - 'capabilities >= 3.5 [Default is: %s]: ' % - default_cuda_compute_capabilities) + 'Please specify a list of comma-separated CUDA compute capabilities ' + 'you want to build with.\nYou can find the compute capability of your ' + 'device at: https://developer.nvidia.com/cuda-gpus. Each capability ' + 'can be specified as "x.y" or "compute_xy" to include both virtual and' + ' binary GPU code, or as "sm_xy" to only include the binary ' + 'code.\nPlease note that each additional compute capability ' + 'significantly increases your build time and binary size, and that ' + 'TensorFlow only supports compute capabilities >= 3.5 [Default is: ' + '%s]: ' % default_cuda_compute_capabilities) tf_cuda_compute_capabilities = get_from_env_or_user_or_default( environ_cp, 'TF_CUDA_COMPUTE_CAPABILITIES', ask_cuda_compute_capabilities, default_cuda_compute_capabilities) @@ -1033,8 +1031,23 @@ def set_tf_cuda_compute_capabilities(environ_cp): for compute_capability in tf_cuda_compute_capabilities.split(','): m = re.match('[0-9]+.[0-9]+', compute_capability) if not m: - print('Invalid compute capability: %s' % compute_capability) - all_valid = False + # We now support sm_35,sm_50,sm_60,compute_70. 
+ sm_compute_match = re.match('(sm|compute)_?([0-9]+[0-9]+)', + compute_capability) + if not sm_compute_match: + print('Invalid compute capability: %s' % compute_capability) + all_valid = False + else: + ver = int(m.group(2)) + if ver < 30: + print( + 'ERROR: TensorFlow only supports small CUDA compute' + ' capabilities of sm_30 and higher. Please re-specify the list' + ' of compute capabilities excluding version %s.' % ver) + all_valid = False + if ver < 35: + print('WARNING: XLA does not support CUDA compute capabilities ' + 'lower than sm_35. Disable XLA when running on older GPUs.') else: ver = float(m.group(0)) if ver < 3.0: @@ -1225,7 +1238,8 @@ def is_reduced_optimize_huge_functions_available(environ_cp): only, as of 2019-11-19). TensorFlow needs this flag to massively reduce compile times, but until 16.4 is officially released, we can't depend on it. - See also https://groups.google.com/a/tensorflow.org/d/topic/build/SsW98Eo7l3o/discussion + See also + https://groups.google.com/a/tensorflow.org/d/topic/build/SsW98Eo7l3o/discussion Because it's very annoying to check this manually (to check the MSVC installed versions, you need to use the registry, and it's not clear if Bazel will be @@ -1372,7 +1386,7 @@ def main(): current_bazel_version = check_bazel_version(_TF_MIN_BAZEL_VERSION, _TF_MAX_BAZEL_VERSION) except subprocess.CalledProcessError as e: - print("Error checking bazel version: ", e.output.decode('UTF-8').strip()) + print('Error checking bazel version: ', e.output.decode('UTF-8').strip()) raise e _TF_CURRENT_BAZEL_VERSION = convert_version_to_int(current_bazel_version) From 789ebcf00b3ceb8a4a7a073a271a89ed75f9734c Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Thu, 28 May 2020 14:48:02 -0700 Subject: [PATCH 1336/1533] Add doc for extended._retrace_functions_for_each_device. PiperOrigin-RevId: 313663754 Change-Id: Ie643c0e54adb3eed6a7d20ea3328f2d49fb8047c --- tensorflow/python/distribute/distribute_lib.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index fbc8923e050..b77163cb97a 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -619,6 +619,11 @@ class StrategyBase(object): if not hasattr(extended, "_retrace_functions_for_each_device"): # pylint: disable=protected-access + # `extended._retrace_functions_for_each_device` dictates + # 1) whether all the ops created inside function will have devices + # inherited from outer stack, and + # 2) whether the same function will be retraced when it is called on + # different devices. try: extended._retrace_functions_for_each_device = ( len(extended.worker_devices) > 1) From bfd14e0aa3c3d63d7e3b8de028c377b8800f106b Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Thu, 28 May 2020 15:00:55 -0700 Subject: [PATCH 1337/1533] Add a mechanism internally in tf_export to allow grabbing symbols from their canonical names. This will be used internally inside TF for serializing to/from symbols by their exported tf api name. 
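For illustration only (this snippet is not part of the patch, and `demoA` is a made-up export name), a minimal sketch of the intended round trip, mirroring the new assertions added to tf_export_test.py below:

    from tensorflow.python.util import tf_export

    def _demo_function():  # stand-in for any symbol being exported
      pass

    # Apply the export decorator the same way the tests do.
    _demo_function = tf_export.tf_export('demoA')(_demo_function)

    # The symbol can now be recovered from its exported name, or from the
    # canonical name computed for it.
    assert tf_export.get_symbol_from_name('demoA') is _demo_function
    canonical = tf_export.get_canonical_name_for_symbol(_demo_function)
    assert tf_export.get_symbol_from_name(canonical) is _demo_function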
PiperOrigin-RevId: 313666229 Change-Id: I418d7765f7eb901eda1f3075a44b899ca4530c3e --- tensorflow/python/util/tf_export.py | 11 +++++++ tensorflow/python/util/tf_export_test.py | 41 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py index 04c96d03617..be1433a723c 100644 --- a/tensorflow/python/util/tf_export.py +++ b/tensorflow/python/util/tf_export.py @@ -94,6 +94,12 @@ class InvalidSymbolNameError(Exception): """Raised when trying to export symbol as an invalid or unallowed name.""" pass +_NAME_TO_SYMBOL_MAPPING = dict() + + +def get_symbol_from_name(name): + return _NAME_TO_SYMBOL_MAPPING.get(name) + def get_canonical_name_for_symbol( symbol, api_name=TENSORFLOW_API_NAME, @@ -333,6 +339,11 @@ class api_export(object): # pylint: disable=invalid-name _, undecorated_func = tf_decorator.unwrap(func) self.set_attr(undecorated_func, api_names_attr, self._names) self.set_attr(undecorated_func, api_names_attr_v1, self._names_v1) + + for name in self._names: + _NAME_TO_SYMBOL_MAPPING[name] = func + for name_v1 in self._names_v1: + _NAME_TO_SYMBOL_MAPPING['compat.v1.%s' % name_v1] = func return func def set_attr(self, func, api_names_attr, names): diff --git a/tensorflow/python/util/tf_export_test.py b/tensorflow/python/util/tf_export_test.py index 20625792e9b..6716560b79b 100644 --- a/tensorflow/python/util/tf_export_test.py +++ b/tensorflow/python/util/tf_export_test.py @@ -82,6 +82,33 @@ class ValidateExportTest(test.TestCase): tf_export.get_v1_names(decorated_function)) self.assertEquals(['nameA', 'nameB'], tf_export.get_v2_names(decorated_function)) + self.assertEqual(tf_export.get_symbol_from_name('nameA'), + decorated_function) + self.assertEqual(tf_export.get_symbol_from_name('nameB'), + decorated_function) + self.assertEqual( + tf_export.get_symbol_from_name( + tf_export.get_canonical_name_for_symbol(decorated_function)), + decorated_function) + + def testExportSingleFunctionV1Only(self): + export_decorator = tf_export.tf_export(v1=['nameA', 'nameB']) + decorated_function = export_decorator(_test_function) + self.assertEqual(decorated_function, _test_function) + self.assertAllEqual(('nameA', 'nameB'), decorated_function._tf_api_names_v1) + self.assertAllEqual(['nameA', 'nameB'], + tf_export.get_v1_names(decorated_function)) + self.assertEqual([], + tf_export.get_v2_names(decorated_function)) + self.assertEqual(tf_export.get_symbol_from_name('compat.v1.nameA'), + decorated_function) + self.assertEqual(tf_export.get_symbol_from_name('compat.v1.nameB'), + decorated_function) + self.assertEqual( + tf_export.get_symbol_from_name( + tf_export.get_canonical_name_for_symbol( + decorated_function, add_prefix_to_v1_names=True)), + decorated_function) def testExportMultipleFunctions(self): export_decorator1 = tf_export.tf_export('nameA', 'nameB') @@ -92,6 +119,20 @@ class ValidateExportTest(test.TestCase): self.assertEquals(decorated_function2, _test_function2) self.assertEquals(('nameA', 'nameB'), decorated_function1._tf_api_names) self.assertEquals(('nameC', 'nameD'), decorated_function2._tf_api_names) + self.assertEqual(tf_export.get_symbol_from_name('nameB'), + decorated_function1) + self.assertEqual(tf_export.get_symbol_from_name('nameD'), + decorated_function2) + self.assertEqual( + tf_export.get_symbol_from_name( + tf_export.get_canonical_name_for_symbol( + decorated_function1)), + decorated_function1) + self.assertEqual( + tf_export.get_symbol_from_name( + tf_export.get_canonical_name_for_symbol( + 
decorated_function2)), + decorated_function2) def testExportClasses(self): export_decorator_a = tf_export.tf_export('TestClassA1') From 60c828a70ec0aad85dbd4077a1c84f8f9da88615 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Thu, 28 May 2020 15:12:28 -0700 Subject: [PATCH 1338/1533] Eliminate output_types from If/IfRegion ODS specs - Also eliminate output_types attribute from several test cases - This attribute may still be present on these ops since the importer seems to generate them. - Added a test to verify that values generated on one branch of the if cannot be consumed on the other branch PiperOrigin-RevId: 313668390 Change-Id: I97bed79f52f6694ead1931a64c411686067d2800 --- .../lite/tests/lower-static-tensor-list.mlir | 2 +- .../transforms/lower_static_tensor_list.cc | 1 - .../mlir/tensorflow/ir/tf_attributes.h | 2 +- .../compiler/mlir/tensorflow/ir/tf_ops.td | 3 -- .../tests/promote_resources_to_args.mlir | 4 +-- .../tests/resource-device-inference.mlir | 2 +- .../tensorflow/tests/resource_op_lifting.mlir | 8 ++--- .../tensorflow/tests/shape_inference.mlir | 14 ++++----- .../mlir/tensorflow/tests/tf-ops.mlir | 30 +++++++++++++++++++ .../transforms/resource_op_lifting.cc | 13 +------- .../tensor_list_ops_decomposition.cc | 8 ----- 11 files changed, 47 insertions(+), 40 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir index 9b1eeab3d7c..a7fb5b1666e 100644 --- a/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir +++ b/tensorflow/compiler/mlir/lite/tests/lower-static-tensor-list.mlir @@ -292,7 +292,7 @@ func @tensorlistResize(%arg0: tensor<3x10xf32>, %arg1: tensor<1xi32>, %arg2: ten // CHECK: [[SIZE_DIFF:%.*]] = "tf.Sub"([[SIZE]], [[INPUT_SIZE]]) : (tensor, tensor) -> tensor // CHECK: [[DIFF_RES:%.*]] = "tf.Greater"([[SIZE_DIFF]], [[ZERO]]) : (tensor, tensor) -> tensor // CHECK: [[SHAPE_1:%.*]] = "tf.Shape"([[INPUT]]) : (tensor<3x10xf32>) -> tensor -// CHECK: [[RESULT:%.*]] = "tf.If"([[DIFF_RES]], [[INPUT]], [[SHAPE_1]], [[SIZE_DIFF]], [[SIZE]]) {else_branch = @cond_false, is_stateless = true, output_shapes = [], then_branch = @cond_true} : (tensor, tensor<3x10xf32>, tensor, tensor, tensor) -> tensor +// CHECK: [[RESULT:%.*]] = "tf.If"([[DIFF_RES]], [[INPUT]], [[SHAPE_1]], [[SIZE_DIFF]], [[SIZE]]) {else_branch = @cond_false, is_stateless = true, then_branch = @cond_true} : (tensor, tensor<3x10xf32>, tensor, tensor, tensor) -> tensor // CHECK: return [[RESULT]] : tensor } diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 45b8c9e5fb2..2498a732a86 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -577,7 +577,6 @@ struct ConvertTensorListResize ArrayRef({input_handle, input_shape, size_diff, size}), /*then_branch=*/rewriter.getSymbolRefAttr(then_branch_op), /*else_branch=*/rewriter.getSymbolRefAttr(else_branch_op), - /*output_shapes=*/rewriter.getArrayAttr({}), /*is_stateless=*/rewriter.getBoolAttr(true)); return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h index ba67d6cb671..1edc7356ab4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h @@ -26,7 +26,7 @@ namespace TF { namespace 
AttrKind { -// List of supported custom TensorFlow Attributes kinds, necessary for +// List of supported custom TensorFlow Attribute kinds, necessary for // isa/dyn_cast. enum Kind { FIRST_USED_TENSORFLOW_ATTR = Attribute::FIRST_TENSORFLOW_ATTR, diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 7f31c274a09..51b9dd862ac 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -188,7 +188,6 @@ else_branch: A function that takes 'inputs' and returns a list of FlatSymbolRefAttr:$then_branch, FlatSymbolRefAttr:$else_branch, - DefaultValuedAttr:$output_shapes, // Used to map StatelessIf and If op defined in TensorFlow to a common op. BoolAttr:$is_stateless @@ -248,8 +247,6 @@ else_branch: A region that computes the outputs of the op if cond = false. let arguments = (ins TF_Tensor:$cond, - DefaultValuedAttr:$output_shapes, - // Used to map StatelessIf and If op defined in TensorFlow to a common op. BoolAttr:$is_stateless ); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index 60663f4bd4a..59c93a66d12 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -145,8 +145,8 @@ func @main(%arg0: tensor) -> tensor<2xf32> attributes {tf.entry_function = { %2 = "tf.ReadVariableOp"(%1) : (tensor>>) -> tensor %3 = "tf.Less"(%2, %0) : (tensor, tensor) -> tensor %4 = "tf.If"(%3, %1, %2) {Tcond = i1, Tin = ["tfdtype$DT_RESOURCE", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], - else_branch = @cond_false, is_stateless = false, output_shapes = [#tf.shape<>], - then_branch = @cond_true} : (tensor, tensor>>, tensor) -> tensor + else_branch = @cond_false, is_stateless = false,then_branch = @cond_true} : + (tensor, tensor>>, tensor) -> tensor %5 = "tf.Identity"(%4) : (tensor) -> tensor %6 = "tf.Pack"(%2, %5) {N = 2 : i64, T = f32, axis = 0 : i64, device = ""} : (tensor, tensor) -> tensor<2xf32> return %6 : tensor<2xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir index c98e40fed05..60eded3de7e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir @@ -217,7 +217,7 @@ func @error_on_conflict_multiple_callers( // expected-error@above {{Conflicting device assignment for resource}} then_branch = @if_then_and_else, else_branch = @if_then_and_else, - output_shapes = [], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> () tf_executor.yield diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index 9e7358ab2f5..b19033ce5b5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -420,7 +420,7 @@ func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %2 = "tf_device.cluster"() ( { // CHECK: %[[IF:.*]]:2 = "tf.If"(%[[ARG0]], %[[READ0]], %[[READ1]]) %3:2 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, - output_shapes = [#tf.shape<>, #tf.shape<4>], is_stateless = false} + 
is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>, tensor<4xf32>) // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[IF]]#1, %[[IF]]#0) @@ -468,7 +468,7 @@ func @cluster_with_nested_if(%arg0: tensor) -> tensor { %2 = "tf_device.cluster"() ( { // CHECK: %[[IF:.*]] = "tf.If"(%[[ARG0]], %[[READ0]]) %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, - output_shapes = [], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[IF]], %[[IF]]) @@ -488,7 +488,7 @@ func @if_then(%arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.re // CHECK-NEXT: %[[IIF:.*]] = "tf.If"(%[[TARG0]], %[[TARG0]]) %read = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor %3 = "tf.If"(%read, %arg0) {then_branch = @inner_if_then, else_branch = @inner_if_else, - output_shapes = [], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) // CHECK-NEXT: return %[[IIF]] @@ -526,7 +526,7 @@ func @cluster_with_if(%arg0: tensor) -> tensor<4xf32> { %2 = "tf_device.cluster"() ( { // expected-error @+1 {{unsupported tf.IfOp output: resource does not alias a single input.}} %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, - output_shapes = [#tf.shape<>], is_stateless = false} + is_stateless = false} : (tensor, tensor<*x!tf.resource>>, tensor<*x!tf.resource>>) -> (tensor<*x!tf.resource>>) %4 = "tf.ReadVariableOp"(%3) : (tensor<*x!tf.resource>>) -> tensor<4xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 3cdade8da59..e3766a7d9d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -102,7 +102,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @shape_from_if_to_branch_functions func @shape_from_if_to_branch_functions(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<1x2x3xf32> { - %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @if_else_branch, is_stateless = true, name = "if", then_branch = @if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> return %0 : tensor<1x2x3xf32> } @@ -184,16 +184,16 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @invalid_function_reused_by_control_flows func @invalid_function_reused_by_control_flows(%arg0: tensor, %arg1: tensor<1x2x3xf32>) -> tensor<1x2x3xf32> { - // expected-warning @+1 {{unable to refine shape}} - %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> - // expected-warning 
@+1 {{unable to refine shape}} - %1 = "tf.If"(%arg0, %0) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", output_shapes = [#tf.shape<>], then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + // expected-warning @+1 {{unable to refine shape}} + %0 = "tf.If"(%arg0, %arg1) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> + // expected-warning @+1 {{unable to refine shape}} + %1 = "tf.If"(%arg0, %0) {Tcond = i1, Tin = ["tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], _xla_propagate_compile_time_consts = true, device = "", else_branch = @reused_if_else_branch, is_stateless = true, name = "if", then_branch = @reused_if_then_branch} : (tensor, tensor<1x2x3xf32>) -> tensor<1x2x3xf32> return %0 : tensor<1x2x3xf32> } // CHECK-LABEL: func @reused_if_then_branch // CHECK-SAME: (%arg0: tensor<*xf32>) -> tensor<*xf32> - // expected-warning @+1 {{expected control flow function reused_if_then_branch to have exactly 1 use}} + // expected-warning @+1 {{expected control flow function reused_if_then_branch to have exactly 1 use}} func @reused_if_then_branch(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: return // CHECK-SAME: tensor<*xf32> @@ -202,7 +202,7 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @reused_if_else_branch // CHECK-SAME: (%arg0: tensor<*xf32>) -> tensor<*xf32> - // expected-warning @+1 {{expected control flow function reused_if_else_branch to have exactly 1 use}} + // expected-warning @+1 {{expected control flow function reused_if_else_branch to have exactly 1 use}} func @reused_if_else_branch(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK: "tf.Identity"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> %0 = "tf.Identity"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 2e00dd6a517..20f7c5b9ba1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1048,6 +1048,36 @@ func @testIfRegionOpYieldMismatchElse(%arg0: tensor, %arg1: tensor<2xf32>) - // ----- +// value generated in one branch cannot be consumed in the other branch +func @testIfRegionElseConsumingThen(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + // expected-error @+1 {{use of undeclared SSA value name}} + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testIfRegionThenConsumingElse(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %0 = "tf.IfRegion"(%arg0) ({ + // expected-error @+1 {{does not dominate this use}} + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + // expected-note @+1 {{operand defined here}} + %t = "tf.Acos"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + // Test valid 
tf.MatrixBandPart // CHECK-LABEL: func @testValidMatrixBandPartOp func @testValidMatrixBandPartOp(%arg0: tensor<64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 82bc612b1f8..c1e5241a1f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -700,15 +700,10 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch, // Erase the resource outputs from the branches. int64_t non_resource_results = 0; llvm::SmallVector old_to_new_output_indices; - llvm::SmallVector new_output_shapes; bool output_removed = false; for (auto result : if_op.getResults()) { if (!getElementTypeOrSelf(result.getType()).isa()) { old_to_new_output_indices.push_back(non_resource_results++); - if (!if_op.output_shapes().getValue().empty()) { - new_output_shapes.push_back( - if_op.output_shapes().getValue()[result.getResultNumber()]); - } continue; } old_to_new_output_indices.push_back(-1); @@ -781,8 +776,7 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch, auto new_if = builder.create(if_op.getLoc(), then_branch.getType().getResults(), new_operands, if_op.getAttrs()); - // Prepare for AddLoadsStoresOutsideControlFlowOp() and update - // new_output_shapes. + // Prepare for AddLoadsStoresOutsideControlFlowOp() llvm::SmallDenseMap> arg_data_type_and_updated_output_index; for (const auto& entry : remaining_resource_data_types) { @@ -792,14 +786,9 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch, : new_output_it->getSecond(); arg_data_type_and_updated_output_index[entry.getFirst() + 1] = { entry.getSecond(), update_index}; - if (!if_op.output_shapes().getValue().empty() && update_index >= 0) { - new_output_shapes.push_back( - tensorflow::ConvertTypeToTensorShapeAttr(entry.getSecond())); - } } AddLoadsStoresOutsideControlFlowOp(new_if, arg_data_type_and_updated_output_index); - new_if.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); // Replace uses. for (int64_t i = 0; i < old_to_new_output_indices.size(); ++i) { if (old_to_new_output_indices[i] >= 0) { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc index 6e27823191b..b2203c890e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tensor_list_ops_decomposition.cc @@ -254,22 +254,14 @@ LogicalResult HandleIfOp(TF::IfOp if_op, ModuleOp module, if (output_buffer_to_size.empty() && arg_no_changed) return success(); // Recreate the If op. auto new_if_operands = llvm::to_vector<8>(if_op.getOperands()); - auto new_output_shapes = llvm::to_vector<8>(if_op.output_shapes().getValue()); for (int64_t i = 1; i < if_op.getNumOperands(); ++i) { auto it = buffer_to_size->find(if_op.getOperand(i)); if (it == buffer_to_size->end()) continue; new_if_operands.push_back(it->getSecond().size); - if (!new_output_shapes.empty()) { - // Size is a scalar shape. 
- tensorflow::TensorShapeProto shape_proto; - new_output_shapes.push_back(builder.getStringAttr( - tensorflow::mangling_util::MangleShape(shape_proto))); - } } auto new_if = OpBuilder(if_op).create( if_op.getLoc(), then_branch.getType().getResults(), new_if_operands, if_op.getAttrs()); - new_if.setAttr("output_shapes", builder.getArrayAttr(new_output_shapes)); for (const auto& entry : output_buffer_to_size) { (*buffer_to_size)[new_if.getResult(std::get<0>(entry))] = { new_if.getResult(std::get<1>(entry)), std::get<2>(entry)}; From 7d605fb0e27643622f1fba7b77ba5e19fd210e02 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 15:15:50 -0700 Subject: [PATCH 1339/1533] Add `inference_input_type` and `inference_output_type` flags in TF 2.x TFLiteConverter (backward compatible with TF 1.x) to support integer (tf.int8, tf.uint8) input and output types in post training full integer quantized models. PiperOrigin-RevId: 313668965 Change-Id: Iea684507f58651b34dada0285b00a82e80066aab --- tensorflow/lite/python/lite.py | 93 +++------------ tensorflow/lite/python/lite_v2_test.py | 156 +++---------------------- 2 files changed, 32 insertions(+), 217 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index af42c28172a..53814bb0c43 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -201,11 +201,6 @@ class QuantizationMode(object): self._representative_dataset is not None and self._smallest_supported_type() == constants.INT8) - def is_post_training_integer_quantize(self): - """Post training integer quantization.""" - return (self.post_training_int8_no_float() or - self.post_training_int8_allow_float()) - def training_time_int8_allow_float(self): """Training-time int8 quantize, allow float fallback.""" return (self._any_optimization_enabled() and @@ -418,56 +413,7 @@ class TFLiteConverterBase(object): class TFLiteConverterBaseV2(TFLiteConverterBase): - """Converter subclass to share functionality between V2 converters. - - Attributes: - allow_custom_ops: Boolean indicating whether to allow custom operations. - When False, any unknown operation is an error. When True, custom ops are - created for any op that is unknown. The developer needs to provide these - to the TensorFlow Lite runtime with a custom resolver. (default False) - optimizations: Experimental flag, subject to change. A list of optimizations - to apply when converting the model. E.g. `[Optimize.DEFAULT]` - representative_dataset: A representative dataset that can be used to - generate input and output samples for the model. The converter can use the - dataset to evaluate different optimizations. Note that this is an optional - attribute but it is necessary if INT8 is the only support builtin ops in - target ops. - target_spec: Experimental flag, subject to change. Specification of target - device. - inference_input_type: Data type of the input layer. Note that integer types - (tf.int8 and tf.uint8) are currently only supported for post training - integer quantization. (default tf.float32, must be in {tf.float32, - tf.int8, tf.uint8}) - inference_output_type: Data type of the output layer. Note that integer - types (tf.int8 and tf.uint8) are currently only supported for post - training integer quantization. (default tf.float32, must be in - {tf.float32, tf.int8, tf.uint8}) - experimental_new_converter: Experimental flag, subject to change. Enables - MLIR-based conversion instead of TOCO conversion. 
- """ - - def __init__(self): - """Constructor for TFLiteConverter.""" - super(TFLiteConverterBaseV2, self).__init__() - self.inference_input_type = constants.FLOAT - self.inference_output_type = constants.FLOAT - - def _validate_inference_input_output_types(self, quant_mode): - """Validate inference_input_type and inference_output_type flags.""" - default_types = [constants.FLOAT, None] - # We only support integer types for post training integer quantization - # as we have statistical information to quantize the input and output. - if quant_mode.is_post_training_integer_quantize(): - all_types = default_types + [constants.INT8, constants.QUANTIZED_UINT8] - if self.inference_input_type not in all_types or \ - self.inference_output_type not in all_types: - all_types_names = ["tf." + t.name for t in all_types] - raise ValueError("The inference_input_type and inference_output_type " - "must be in {}.".format(all_types_names)) - elif self.inference_input_type not in default_types or \ - self.inference_output_type not in default_types: - raise ValueError("The inference_input_type and inference_output_type " - "must be tf.float32.") + """Converter subclass to share functionality between V2 converters.""" def convert(self, graph_def, input_tensors, output_tensors): """Converts a TensorFlow GraphDef based on instance variables. @@ -491,8 +437,6 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): quant_mode = QuantizationMode(self.optimizations, self.target_spec, self.representative_dataset, graph_def) - self._validate_inference_input_output_types(quant_mode) - if not self._is_unknown_shapes_allowed(): # Checks dimensions in input tensor. for tensor in input_tensors: @@ -535,9 +479,6 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): "quantize_to_float16": True, }) - # Converter requires that the inference_input_type flag is set to FLOAT - converter_kwargs.update({"inference_input_type": constants.FLOAT}) - if not self.experimental_new_converter: logging.warning( "Please consider switching to use new converter by setting " @@ -557,11 +498,11 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): **converter_kwargs) if quant_mode.post_training_int8_no_float(): - result = self._calibrate_quantize_model(result, self.inference_input_type, - self.inference_output_type, False) + result = self._calibrate_quantize_model(result, constants.FLOAT, + constants.FLOAT, False) elif quant_mode.post_training_int8_allow_float(): - result = self._calibrate_quantize_model(result, self.inference_input_type, - self.inference_output_type, True) + result = self._calibrate_quantize_model(result, constants.FLOAT, + constants.FLOAT, True) if self._experimental_sparsify_model: result = _mlir_sparsify(result) @@ -817,9 +758,12 @@ class TFLiteConverterV2(TFLiteFrozenGraphConverterV2): Attributes: allow_custom_ops: Boolean indicating whether to allow custom operations. - When False, any unknown operation is an error. When True, custom ops are - created for any op that is unknown. The developer needs to provide these - to the TensorFlow Lite runtime with a custom resolver. (default False) + When false any unknown operation is an error. When true, custom ops are + created for any op that is unknown. The developer will need to provide + these to the TensorFlow Lite runtime with a custom resolver. + (default False) + target_spec: Experimental flag, subject to change. Specification of target + device. optimizations: Experimental flag, subject to change. A list of optimizations to apply when converting the model. E.g. 
`[Optimize.DEFAULT]` representative_dataset: A representative dataset that can be used to @@ -827,19 +771,8 @@ class TFLiteConverterV2(TFLiteFrozenGraphConverterV2): dataset to evaluate different optimizations. Note that this is an optional attribute but it is necessary if INT8 is the only support builtin ops in target ops. - target_spec: Experimental flag, subject to change. Specification of target - device. - inference_input_type: Data type of the input layer. Note that integer types - (tf.int8 and tf.uint8) are currently only supported for post training - integer quantization. (default tf.float32, must be in {tf.float32, - tf.int8, tf.uint8}) - inference_output_type: Data type of the output layer. Note that integer - types (tf.int8 and tf.uint8) are currently only supported for post - training integer quantization. (default tf.float32, must be in - {tf.float32, tf.int8, tf.uint8}) - experimental_new_converter: Experimental flag, subject to change. Enables - MLIR-based conversion instead of TOCO conversion. - + experimental_new_converter: Experimental flag, subject to change. + Enables MLIR-based conversion instead of TOCO conversion. Example usage: ```python diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py index fae55e99cd1..9af37df2975 100644 --- a/tensorflow/lite/python/lite_v2_test.py +++ b/tensorflow/lite/python/lite_v2_test.py @@ -71,27 +71,6 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): actual_value = self._evaluateTFLiteModel(tflite_model, [input_data]) self.assertEqual(expected_value.numpy(), actual_value) - @parameterized.named_parameters( - ('_INT8InputOutput', lite.constants.INT8), - ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8)) - @test_util.run_v2_only - def testInvalidFloat(self, inference_input_output_type): - root = self._getSimpleVariableModel() - input_data = tf.constant(1., shape=[1]) - concrete_func = root.f.get_concrete_function(input_data) - - # Convert model. - converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func]) - # We don't support integer types as we don't have statistical information - # to quantize (only supported for post training integer quantization). - with self.assertRaises(ValueError) as error: - converter.inference_input_type = inference_input_output_type - converter.inference_output_type = inference_input_output_type - converter.convert() - self.assertEqual( - 'The inference_input_type and inference_output_type ' - 'must be tf.float32.', str(error.exception)) - @test_util.run_v2_only def testScalarInput(self): root = self._getSimpleVariableModel() @@ -193,113 +172,39 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): self.assertLess(len(quantized_tflite), len(float_tflite)) @parameterized.named_parameters( - ('_INT8InputOutput', lite.constants.INT8), - ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8)) - @test_util.run_v2_only - def testInvalidPostTrainingDynamicRangeQuantization( - self, inference_input_output_type): - func, _ = self._getCalibrationQuantizeModel() - - # Convert float model. - converter = lite.TFLiteConverterV2.from_concrete_functions([func]) - tflite_model = converter.convert() - self.assertTrue(tflite_model) - - # Convert quantized model. - quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) - quantized_converter.optimizations = [lite.Optimize.DEFAULT] - # We don't support integer types as we don't have statistical information - # to quantize (only supported for post training integer quantization). 
- with self.assertRaises(ValueError) as error: - quantized_converter.inference_input_type = inference_input_output_type - quantized_converter.inference_output_type = inference_input_output_type - quantized_converter.convert() - self.assertEqual( - 'The inference_input_type and inference_output_type ' - 'must be tf.float32.', str(error.exception)) - - @parameterized.named_parameters( - ('_DefaultFLOAT32InputOutput', lite.constants.FLOAT), - ('_INT8InputOutput', lite.constants.INT8), - ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8)) - def testPostTrainingIntegerAllowFloatQuantization( - self, inference_input_output_type): + ('EnableMlirQuantizer', True), # enable mlir quantizer + ('DisableMlirQuantizer', False)) # disable mlir quantizer + def testCalibrateAndQuantizeBuiltinInt8(self, mlir_quantizer): func, calibration_gen = self._getCalibrationQuantizeModel() # Convert float model. - converter = lite.TFLiteConverterV2.from_concrete_functions([func]) - tflite_model = converter.convert() - self.assertTrue(tflite_model) - - # Convert quantized model. - quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) - quantized_converter.optimizations = [lite.Optimize.DEFAULT] - quantized_converter.representative_dataset = calibration_gen - quantized_converter.inference_input_type = inference_input_output_type - quantized_converter.inference_output_type = inference_input_output_type - quantized_tflite_model = quantized_converter.convert() - self.assertTrue(quantized_tflite_model) - - interpreter = Interpreter(model_content=quantized_tflite_model) - interpreter.allocate_tensors() - input_details = interpreter.get_input_details() - self.assertLen(input_details, 1) - self.assertEqual(inference_input_output_type.as_numpy_dtype, - input_details[0]['dtype']) - output_details = interpreter.get_output_details() - self.assertLen(output_details, 1) - self.assertEqual(inference_input_output_type.as_numpy_dtype, - output_details[0]['dtype']) - - # Ensure that the quantized tflite model is smaller. - self.assertLess(len(quantized_tflite_model), len(tflite_model)) - - @parameterized.named_parameters( - ('_DefaultFLOAT32InputOutput_UseTargetTypesFlag', - lite.constants.FLOAT, False), - ('_DefaultFLOAT32InputOutput', lite.constants.FLOAT, True), - ('_INT8InputOutput', lite.constants.INT8, True), - ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8, True)) - @test_util.run_v2_only - def testPostTrainingIntegerNoFloatQuantization(self, - inference_input_output_type, - use_target_ops_flag): - func, calibration_gen = self._getCalibrationQuantizeModel() - - # Convert float model. - converter = lite.TFLiteConverterV2.from_concrete_functions([func]) - tflite_model = converter.convert() - self.assertTrue(tflite_model) + float_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + float_tflite = float_converter.convert() + self.assertTrue(float_tflite) # Convert model by specifying target spec (instead of optimizations), since # when targeting an integer only backend, quantization is mandatory. 
quantized_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) - quantized_converter.optimizations = [lite.Optimize.DEFAULT] + quantized_converter.target_spec.supported_ops = [ + lite.OpsSet.TFLITE_BUILTINS_INT8 + ] quantized_converter.representative_dataset = calibration_gen - if use_target_ops_flag: - quantized_converter.target_spec.supported_ops = [ - lite.OpsSet.TFLITE_BUILTINS_INT8 - ] - else: - quantized_converter.target_spec.supported_types = [lite.constants.INT8] - quantized_converter.inference_input_type = inference_input_output_type - quantized_converter.inference_output_type = inference_input_output_type - quantized_tflite_model = quantized_converter.convert() - self.assertTrue(quantized_tflite_model) + quantized_converter._experimental_new_quantizer = mlir_quantizer + quantized_tflite = quantized_converter.convert() + self.assertTrue(quantized_tflite) - interpreter = Interpreter(model_content=quantized_tflite_model) + # The default input and output types should be float. + interpreter = Interpreter(model_content=quantized_tflite) interpreter.allocate_tensors() input_details = interpreter.get_input_details() self.assertLen(input_details, 1) - self.assertEqual(inference_input_output_type.as_numpy_dtype, - input_details[0]['dtype']) + self.assertEqual(np.float32, input_details[0]['dtype']) output_details = interpreter.get_output_details() self.assertLen(output_details, 1) - self.assertEqual(inference_input_output_type.as_numpy_dtype, - output_details[0]['dtype']) + self.assertEqual(np.float32, output_details[0]['dtype']) - # Ensure that the quantized tflite model is smaller. - self.assertLess(len(quantized_tflite_model), len(tflite_model)) + # Ensure that the quantized weights tflite model is smaller. + self.assertLess(len(quantized_tflite), len(float_tflite)) def testCalibrateAndQuantizeBuiltinInt16(self): func, calibration_gen = self._getCalibrationQuantizeModel() @@ -374,7 +279,7 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): return tf.keras.Sequential(QLinear(3, input_shape=(2,))) @test_util.run_v2_only - def testTrainingTimeQuantization(self): + def testTrainingTimeQuantizeConversion(self): model = self._getTrainingTimeQuantizedModel() float_converter = lite.TFLiteConverterV2.from_keras_model(model) @@ -392,29 +297,6 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): interpreter = Interpreter(model_content=quantized_tflite) self.assertEqual(np.float32, interpreter.get_input_details()[0]['dtype']) - @parameterized.named_parameters( - ('_INT8InputOutput', lite.constants.INT8), - ('_UINT8InputOutput', lite.constants.QUANTIZED_UINT8)) - def testInvalidTrainingTimeQuantization(self, inference_input_output_type): - # We currently don't support integer inference_input_type and - # inference_output_type flags for training time quantization. 
- - model = self._getTrainingTimeQuantizedModel() - - converter = lite.TFLiteConverterV2.from_keras_model(model) - tflite_model = converter.convert() - self.assertTrue(tflite_model) - - quantized_converter = lite.TFLiteConverterV2.from_keras_model(model) - quantized_converter.optimizations = [lite.Optimize.DEFAULT] - with self.assertRaises(ValueError) as error: - quantized_converter.inference_input_type = inference_input_output_type - quantized_converter.inference_output_type = inference_input_output_type - quantized_converter.convert() - self.assertEqual( - 'The inference_input_type and inference_output_type ' - 'must be tf.float32.', str(error.exception)) - @test_util.run_v2_only def testNewQuantizer(self): """Test the model quantized by the new converter.""" From 4e4a4496b7f89b7a3b56bf48257a5f6c560f8dd3 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Thu, 28 May 2020 15:17:20 -0700 Subject: [PATCH 1340/1533] [tf.data service] Export server_lib in public API. PiperOrigin-RevId: 313669209 Change-Id: Idaf84dc8360a03699e12c7d83511baa4c21d240b --- tensorflow/python/BUILD | 1 + .../python/data/experimental/service/BUILD | 41 ++++++++++- .../data/experimental/service/__init__.py | 2 + .../{ => experimental}/service/server_lib.py | 73 ++++++++++++------- .../service/server_lib_test.py | 2 +- .../service/server_lib_wrapper.cc | 0 tensorflow/python/data/kernel_tests/BUILD | 6 +- .../kernel_tests/data_service_ops_test.py | 2 +- tensorflow/python/data/service/BUILD | 44 ----------- tensorflow/python/data/service/__init__.py | 0 ....experimental.service.-master-server.pbtxt | 21 ++++++ ....experimental.service.-worker-server.pbtxt | 17 +++++ ...tensorflow.data.experimental.service.pbtxt | 8 ++ tensorflow/tools/pip_package/BUILD | 2 +- 14 files changed, 143 insertions(+), 76 deletions(-) rename tensorflow/python/data/{ => experimental}/service/server_lib.py (73%) rename tensorflow/python/data/{ => experimental}/service/server_lib_test.py (97%) rename tensorflow/python/data/{ => experimental}/service/server_lib_wrapper.cc (100%) delete mode 100644 tensorflow/python/data/service/BUILD delete mode 100644 tensorflow/python/data/service/__init__.py create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.-master-server.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.-worker-server.pbtxt diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 2fb22a89706..2b057b5db57 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -5296,6 +5296,7 @@ py_library( ":variable_scope", ":variables", "//tensorflow/core:protos_all_py", + "//tensorflow/python/data/experimental/service:server_lib", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/distribute:distribute_coordinator_context", "//tensorflow/python/distribute:distribute_lib", diff --git a/tensorflow/python/data/experimental/service/BUILD b/tensorflow/python/data/experimental/service/BUILD index 5e1d8473633..f08fef2b669 100644 --- a/tensorflow/python/data/experimental/service/BUILD +++ b/tensorflow/python/data/experimental/service/BUILD @@ -1,15 +1,54 @@ +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_py_test") + package( default_visibility = ["//tensorflow:internal"], licenses = ["notice"], # Apache 2.0 ) -exports_files(["LICENSE"]) +tf_python_pybind_extension( + name = "_pywrap_server_lib", + srcs = ["server_lib_wrapper.cc"], + module_name = 
"_pywrap_server_lib", + deps = [ + "//tensorflow/core:lib", + "//tensorflow/core/data/service:server_lib_headers_lib", + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "//third_party/python_runtime:headers", + "@com_github_grpc_grpc//:grpc++_public_hdrs", + "@pybind11", + ], +) + +py_library( + name = "server_lib", + srcs = ["server_lib.py"], + visibility = [ + "//visibility:public", + ], + deps = [ + ":_pywrap_server_lib", + ], +) + +tf_py_test( + name = "server_lib_test", + srcs = ["server_lib_test.py"], + deps = [ + ":server_lib", + "//tensorflow/python:platform_test", + ], +) py_library( name = "service", srcs = ["__init__.py"], srcs_version = "PY2AND3", deps = [ + ":server_lib", "//tensorflow/python/data/experimental/ops:data_service_ops", ], ) diff --git a/tensorflow/python/data/experimental/service/__init__.py b/tensorflow/python/data/experimental/service/__init__.py index f3c8aff2b3f..aecc07965bb 100644 --- a/tensorflow/python/data/experimental/service/__init__.py +++ b/tensorflow/python/data/experimental/service/__init__.py @@ -19,3 +19,5 @@ from __future__ import division from __future__ import print_function from tensorflow.python.data.experimental.ops.data_service_ops import distribute +from tensorflow.python.data.experimental.service.server_lib import MasterServer +from tensorflow.python.data.experimental.service.server_lib import WorkerServer diff --git a/tensorflow/python/data/service/server_lib.py b/tensorflow/python/data/experimental/service/server_lib.py similarity index 73% rename from tensorflow/python/data/service/server_lib.py rename to tensorflow/python/data/experimental/service/server_lib.py index df65508e6b2..f249af671a6 100644 --- a/tensorflow/python/data/service/server_lib.py +++ b/tensorflow/python/data/experimental/service/server_lib.py @@ -20,9 +20,11 @@ from __future__ import print_function # pylint: disable=invalid-import-order,g-bad-import-order, unused-import from tensorflow.python import pywrap_tensorflow -from tensorflow.python.data.service import _pywrap_server_lib +from tensorflow.python.data.experimental.service import _pywrap_server_lib +from tensorflow.python.util.tf_export import tf_export +@tf_export("data.experimental.service.MasterServer", v1=[]) class MasterServer(object): """An in-process tf.data service master server. @@ -30,21 +32,22 @@ class MasterServer(object): `tf.data.experimental.service.WorkerServer`s. When the workers start, they register themselves with the master. - ``` - master_server = tf.data.experimental.service.MasterServer(port=5050) - worker_server = tf.data.experimental.service.WorkerServer( - port=0, master_address="localhost:5050") - dataset = tf.data.Dataset.range(10) - dataset = dataset.apply(tf.data.experimental.service.distribute( - processing_mode="parallel_epochs", service="grpc://localhost:5050")) - ``` + >>> master = tf.data.experimental.service.MasterServer(port=0) + >>> master_address = master.target.split("://")[1] + >>> worker = tf.data.experimental.service.WorkerServer( + ... port=0, master_address=master_address) + >>> dataset = tf.data.Dataset.range(10) + >>> dataset = dataset.apply(tf.data.experimental.service.distribute( + ... processing_mode="parallel_epochs", service=master.target)) + >>> print(list(dataset.as_numpy_iterator())) + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] When starting a dedicated tf.data master process, use join() to block indefinitely after starting up the server. 
``` - master_server = tf.data.experimental.service.MasterServer(port=5050) - master_server.join() + master = tf.data.experimental.service.MasterServer(port=5050) + master.join() ``` """ @@ -72,6 +75,9 @@ class MasterServer(object): def start(self): """Starts this server. + >>> master = tf.data.experimental.service.MasterServer(port=0, start=False) + >>> master.start() + Raises: tf.errors.OpError: Or one of its subclasses if an error occurs while starting the server. @@ -84,8 +90,8 @@ class MasterServer(object): This is useful when starting a dedicated master process. ``` - master_server = tf.data.experimental.service.MasterServer(port=5050) - master_server.join() + master = tf.data.experimental.service.MasterServer(port=5050) + master.join() ``` Raises: @@ -94,6 +100,21 @@ class MasterServer(object): """ self._server.join() + @property + def target(self): + """Returns a target that can be used to connect to the server. + + >>> master = tf.data.experimental.service.MasterServer(port=0) + >>> dataset = tf.data.Dataset.range(10) + >>> dataset = dataset.apply(tf.data.experimental.service.distribute( + ... processing_mode="parallel_epochs", service=master.target)) + + The returned string will be in the form protocol://address, e.g. + "grpc://localhost:5050". + """ + return "{0}://localhost:{1}".format(self._protocol, + self._server.bound_port()) + def _stop(self): """Stops the server. @@ -119,6 +140,7 @@ class MasterServer(object): return self._server.num_workers() +@tf_export("data.experimental.service.WorkerServer", v1=[]) class WorkerServer(object): """An in-process tf.data service worker server. @@ -127,22 +149,23 @@ class WorkerServer(object): RPC. A worker is associated with a single `tf.data.experimental.service.MasterServer`. - ``` - master_server = tf.data.experimental.service.MasterServer(port=5050) - worker_server = tf.data.experimental.service.WorkerServer( - port=0, master_address="localhost:5050") - dataset = tf.data.Dataset.range(10) - dataset = dataset.apply(tf.data.experimental.service.distribute( - processing_mode="parallel_epochs", service="grpc://localhost:5050")) - ``` + >>> master = tf.data.experimental.service.MasterServer(port=0) + >>> master_address = master.target.split("://")[1] + >>> worker = tf.data.experimental.service.WorkerServer( + ... port=0, master_address=master_address) + >>> dataset = tf.data.Dataset.range(10) + >>> dataset = dataset.apply(tf.data.experimental.service.distribute( + ... processing_mode="parallel_epochs", service=master.target)) + >>> print(list(dataset.as_numpy_iterator())) + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] When starting a dedicated tf.data worker process, use join() to block indefinitely after starting up the server. 
``` - worker_server = tf.data.experimental.service.WorkerServer( - port=5050, master_address="grpc://localhost:5050") - worker_server.join() + worker = tf.data.experimental.service.WorkerServer( + port=5051, master_address="grpc://localhost:5050") + worker.join() ``` """ @@ -198,7 +221,7 @@ class WorkerServer(object): ``` worker_server = tf.data.experimental.service.WorkerServer( - port=5050, master_address="grpc://localhost:5050") + port=5051, master_address="grpc://localhost:5050") worker_server.join() ``` diff --git a/tensorflow/python/data/service/server_lib_test.py b/tensorflow/python/data/experimental/service/server_lib_test.py similarity index 97% rename from tensorflow/python/data/service/server_lib_test.py rename to tensorflow/python/data/experimental/service/server_lib_test.py index 59bb731d98e..74eb11dc59c 100644 --- a/tensorflow/python/data/service/server_lib_test.py +++ b/tensorflow/python/data/experimental/service/server_lib_test.py @@ -18,7 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.data.service import server_lib +from tensorflow.python.data.experimental.service import server_lib from tensorflow.python.platform import test diff --git a/tensorflow/python/data/service/server_lib_wrapper.cc b/tensorflow/python/data/experimental/service/server_lib_wrapper.cc similarity index 100% rename from tensorflow/python/data/service/server_lib_wrapper.cc rename to tensorflow/python/data/experimental/service/server_lib_wrapper.cc diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD index 2e01021cfd2..477f236a8ae 100644 --- a/tensorflow/python/data/kernel_tests/BUILD +++ b/tensorflow/python/data/kernel_tests/BUILD @@ -1,7 +1,7 @@ # Tests of TensorFlow kernels written using the Python API. 
-load("//tensorflow:tensorflow.bzl", "tf_py_test") -load("//tensorflow:tensorflow.bzl", "cuda_py_test") +load("//tensorflow:tensorflow.bzl", "tf_py_test") # buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "cuda_py_test") # buildifier: disable=same-origin-load package( default_visibility = ["//tensorflow:internal"], @@ -92,8 +92,8 @@ tf_py_test( "//tensorflow/python:platform_test", "//tensorflow/python/data", "//tensorflow/python/data/experimental/ops:testing", + "//tensorflow/python/data/experimental/service:server_lib", "//tensorflow/python/data/kernel_tests:test_base", - "//tensorflow/python/data/service:server_lib", ], ) diff --git a/tensorflow/python/data/kernel_tests/data_service_ops_test.py b/tensorflow/python/data/kernel_tests/data_service_ops_test.py index 726f0dc1530..d316009ce0c 100644 --- a/tensorflow/python/data/kernel_tests/data_service_ops_test.py +++ b/tensorflow/python/data/kernel_tests/data_service_ops_test.py @@ -23,9 +23,9 @@ from absl.testing import parameterized from tensorflow.python.data.experimental.ops import data_service_ops from tensorflow.python.data.experimental.ops import distribute_options +from tensorflow.python.data.experimental.service import server_lib from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.service import server_lib from tensorflow.python.eager import def_function from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes diff --git a/tensorflow/python/data/service/BUILD b/tensorflow/python/data/service/BUILD deleted file mode 100644 index 18678230205..00000000000 --- a/tensorflow/python/data/service/BUILD +++ /dev/null @@ -1,44 +0,0 @@ -load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") - -# buildifier: disable=same-origin-load -load("//tensorflow:tensorflow.bzl", "tf_py_test") - -package( - default_visibility = ["//tensorflow:internal"], - licenses = ["notice"], # Apache 2.0 -) - -tf_python_pybind_extension( - name = "_pywrap_server_lib", - srcs = ["server_lib_wrapper.cc"], - module_name = "_pywrap_server_lib", - deps = [ - "//tensorflow/core:lib", - "//tensorflow/core/data/service:server_lib_headers_lib", - "//tensorflow/python:pybind11_lib", - "//tensorflow/python:pybind11_status", - "//third_party/python_runtime:headers", - "@com_github_grpc_grpc//:grpc++_public_hdrs", - "@pybind11", - ], -) - -py_library( - name = "server_lib", - srcs = ["server_lib.py"], - visibility = [ - "//visibility:public", - ], - deps = [ - ":_pywrap_server_lib", - ], -) - -tf_py_test( - name = "server_lib_test", - srcs = ["server_lib_test.py"], - deps = [ - ":server_lib", - "//tensorflow/python:platform_test", - ], -) diff --git a/tensorflow/python/data/service/__init__.py b/tensorflow/python/data/service/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.-master-server.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.-master-server.pbtxt new file mode 100644 index 00000000000..daac7716ca8 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.-master-server.pbtxt @@ -0,0 +1,21 @@ +path: "tensorflow.data.experimental.service.MasterServer" +tf_class { + is_instance: "" + is_instance: "" + member { + name: "target" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'port\', \'protocol\', \'start\'], varargs=None, 
keywords=None, defaults=[\'None\', \'True\'], " + } + member_method { + name: "join" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "start" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.-worker-server.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.-worker-server.pbtxt new file mode 100644 index 00000000000..d0121b7edf2 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.-worker-server.pbtxt @@ -0,0 +1,17 @@ +path: "tensorflow.data.experimental.service.WorkerServer" +tf_class { + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'port\', \'master_address\', \'worker_address\', \'protocol\', \'start\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], " + } + member_method { + name: "join" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "start" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.pbtxt index 12f4f3c2b08..347dd3c74b1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.service.pbtxt @@ -1,5 +1,13 @@ path: "tensorflow.data.experimental.service" tf_module { + member { + name: "MasterServer" + mtype: "" + } + member { + name: "WorkerServer" + mtype: "" + } member_method { name: "distribute" argspec: "args=[\'processing_mode\', \'service\', \'job_name\', \'max_outstanding_requests\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 2c3734d2fc2..43bc04a1b60 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -108,7 +108,6 @@ COMMON_PIP_DEPS = [ "//tensorflow/python/autograph/pyct/common_transformers:common_transformers", "//tensorflow/python/compiler:compiler", "//tensorflow/python:cond_v2", - "//tensorflow/python/data/service:server_lib", "//tensorflow/python:distributed_framework_test_lib", "//tensorflow/python/distribute:distribute_test_lib_pip", "//tensorflow/python:loss_scale", @@ -121,6 +120,7 @@ COMMON_PIP_DEPS = [ "//tensorflow/python/data/experimental/kernel_tests:reader_dataset_ops_test_base", "//tensorflow/python/data/experimental/kernel_tests:stats_dataset_test_base", "//tensorflow/python/data/experimental/ops:testing", + "//tensorflow/python/data/experimental/service:server_lib", "//tensorflow/python/data/kernel_tests:test_base", "//tensorflow/python/debug:debug_pip", "//tensorflow/python/distribute:combinations", From f27bc524ddc9d3082e08d93f66788770fd674fc0 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 28 May 2020 15:18:31 -0700 Subject: [PATCH 1341/1533] Add benchmark for no-op convert of an EagerTensor PiperOrigin-RevId: 313669416 Change-Id: I82fe1ad6bda97a6a69c8f0d69192d762d53d2034 --- tensorflow/python/eager/benchmarks_test.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 72eaa663c6c..5d57f1d9b93 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ 
b/tensorflow/python/eager/benchmarks_test.py @@ -1231,6 +1231,14 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(fn, 10000) + def benchmark_convert_tensor(self): + value = ops.convert_to_tensor(42) + + def fn(): + return ops.convert_to_tensor(value) + + self._run(fn, 10000) + def _benchmark_convert_constant(self, value, cached): global GLOBAL_TEST_VALUE GLOBAL_TEST_VALUE = value From d29f1d6fe6d0fddcc4416dc1e95579c40935a976 Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Thu, 28 May 2020 15:19:08 -0700 Subject: [PATCH 1342/1533] Declare tests that uses distribute/combinations as Python3 I'll soon introduce PY3 only dependency to distribute/combinations to better simulate multi worker. PiperOrigin-RevId: 313669514 Change-Id: Ia986971a7fee469002614a9e272c475e9f93a250 --- .../core/platform/default/distribute.bzl | 4 ++++ .../saved_model/integration_tests/BUILD | 4 ++-- tensorflow/python/distribute/BUILD | 19 ++++++++++++++++++- tensorflow/python/keras/distribute/BUILD | 3 +++ tensorflow/tensorflow.bzl | 2 ++ 5 files changed, 29 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/platform/default/distribute.bzl b/tensorflow/core/platform/default/distribute.bzl index 58119f055c1..46a5d826a79 100644 --- a/tensorflow/core/platform/default/distribute.bzl +++ b/tensorflow/core/platform/default/distribute.bzl @@ -42,6 +42,10 @@ def distribute_py_test( disable_v3: whether tests for TPU version 3 should be generated. **kwargs: extra keyword arguments to the non-tpu test. """ + + # Default to PY3 since multi worker tests require PY3. + kwargs.setdefault("python_version", "PY3") + _ignore = (full_precision) tpu_tags = tags if (tpu_tags == None) else tpu_tags main = main if main else "%s.py" % name diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD index 4f55cfa3042..38c9c989168 100644 --- a/tensorflow/examples/saved_model/integration_tests/BUILD +++ b/tensorflow/examples/saved_model/integration_tests/BUILD @@ -1,4 +1,4 @@ -load("//tensorflow:tensorflow.bzl", "cuda_py_test") +load("//tensorflow/core/platform/default:distribute.bzl", "distribute_py_test") package( licenses = ["notice"], # Apache 2.0 @@ -48,7 +48,7 @@ py_library( ], ) -cuda_py_test( +distribute_py_test( name = "saved_model_test", srcs = [ "saved_model_test.py", diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index ed93e87088c..26027d46c98 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -416,6 +416,7 @@ py_library( tf_py_test( name = "mirrored_function_strategy_test", srcs = ["mirrored_function_strategy_test.py"], + python_version = "PY3", tags = ["no_pip"], deps = [ ":distribute_lib", @@ -447,6 +448,7 @@ cuda_py_test( srcs = [ "multi_worker_continuous_run_test.py", ], + python_version = "PY3", tags = [ "notsan", # TODO(b/151841995) ], @@ -528,6 +530,7 @@ py_library( cuda_py_test( name = "input_ops_test", srcs = ["input_ops_test.py"], + python_version = "PY3", deps = [ ":input_ops", "//tensorflow/python:client_testlib", @@ -764,6 +767,7 @@ py_library( py_test( name = "combinations_test", srcs = ["combinations_test.py"], + python_version = "PY3", tags = [ "notap", # TODO(b/153646955): flaky ], @@ -856,6 +860,7 @@ cuda_py_test( name = "checkpoint_utils_test", size = "medium", srcs = ["checkpoint_utils_test.py"], + python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -953,6 +958,7 @@ distribute_py_test( cuda_py_test( name = 
"cross_device_utils_test", srcs = ["cross_device_utils_test.py"], + python_version = "PY3", deps = [ ":combinations", ":cross_device_utils", @@ -971,6 +977,7 @@ cuda_py_test( cuda_py_test( name = "cross_device_ops_test", srcs = ["cross_device_ops_test.py"], + python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -997,6 +1004,7 @@ cuda_py_test( name = "one_device_strategy_test", srcs = ["one_device_strategy_test.py"], grpc_enabled = True, + python_version = "PY3", deps = [ ":combinations", ":strategy_combinations", @@ -1350,6 +1358,7 @@ cuda_py_test( name = "warm_starting_util_test", size = "medium", srcs = ["warm_starting_util_test.py"], + python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -1367,7 +1376,10 @@ cuda_py_test( cuda_py_test( name = "remote_mirrored_strategy_eager_test", srcs = ["remote_mirrored_strategy_eager_test.py"], - tags = ["no_oss"], # b/154743849 + python_version = "PY3", + tags = [ + "no_oss", # b/154743849 + ], deps = [ ":combinations", ":distribute_lib", @@ -1390,6 +1402,7 @@ cuda_py_test( cuda_py_test( name = "mirrored_strategy_test", srcs = ["mirrored_strategy_test.py"], + python_version = "PY3", shard_count = 5, tags = [ "multi_and_single_gpu", @@ -1421,6 +1434,7 @@ cuda_py_test( cuda_py_test( name = "mirrored_variable_test", srcs = ["mirrored_variable_test.py"], + python_version = "PY3", tags = [ "guitar", "multi_and_single_gpu", @@ -1599,6 +1613,7 @@ distribute_py_test( cuda_py_test( name = "collective_all_reduce_strategy_test", srcs = ["collective_all_reduce_strategy_test.py"], + python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -1633,6 +1648,7 @@ cuda_py_test( cuda_py_test( name = "parameter_server_strategy_test", srcs = ["parameter_server_strategy_test.py"], + python_version = "PY3", tags = [ "multi_and_single_gpu", "no_windows", # TODO(b/156428279): reenable this test once the image is updated. @@ -1751,6 +1767,7 @@ py_test( cuda_py_test( name = "strategy_common_test", srcs = ["strategy_common_test.py"], + python_version = "PY3", tags = [ "multi_and_single_gpu", # TODO(b/155301154): Enable this test on multi-gpu guitar once multi process diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index ef5302b45a5..f989d93e82e 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -65,6 +65,7 @@ py_library( cuda_py_test( name = "worker_training_state_test", srcs = ["worker_training_state_test.py"], + python_version = "PY3", shard_count = 4, deps = [ ":multi_worker_testing_utils", @@ -312,6 +313,7 @@ py_library( cuda_py_test( name = "keras_optimizer_v2_test", srcs = ["keras_optimizer_v2_test.py"], + python_version = "PY3", shard_count = 4, tags = [ "multi_and_single_gpu", @@ -325,6 +327,7 @@ cuda_py_test( cuda_py_test( name = "multi_worker_test", srcs = ["multi_worker_test.py"], + python_version = "PY3", shard_count = 32, tags = [ "multi_and_single_gpu", diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 9a780839be3..03c561f4fc1 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2158,6 +2158,8 @@ def pywrap_tensorflow_macro( # 2. When --define=no_tensorflow_py_deps=false (by default), it's a normal py_test. 
def py_test(deps = [], data = [], kernels = [], **kwargs): # Python version placeholder + if kwargs.get("python_version", None) == "PY3": + kwargs["tags"] = kwargs.get("tags", []) + ["no_oss_py2"] native.py_test( # TODO(jlebar): Ideally we'd use tcmalloc here., deps = select({ From 1e22a995276499661c50f9f122d3de498429300c Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Thu, 28 May 2020 15:30:46 -0700 Subject: [PATCH 1343/1533] Refactor the code by (1) using EventVisitor in EventNode and (2) copying the first child EventNode when creating the virtual EventNode instead creating a new XEvent. PiperOrigin-RevId: 313671525 Change-Id: I421f08025c49a9deda5231184d287535486ac13b --- .../core/profiler/utils/group_events.cc | 68 ++++++++----------- tensorflow/core/profiler/utils/group_events.h | 37 +++++----- 2 files changed, 46 insertions(+), 59 deletions(-) diff --git a/tensorflow/core/profiler/utils/group_events.cc b/tensorflow/core/profiler/utils/group_events.cc index 42961492225..c07a397ee54 100644 --- a/tensorflow/core/profiler/utils/group_events.cc +++ b/tensorflow/core/profiler/utils/group_events.cc @@ -103,16 +103,6 @@ int64 GetEventType(const XPlaneVisitor& visitor, const XEvent& event) { } } -const XStat* GetStat(const XPlaneVisitor& visitor, const XEvent& event, - int64 stat_type) { - for (const auto& stat : event.stats()) { - if (visitor.GetStatType(stat) == stat_type) { - return &stat; - } - } - return nullptr; -} - void SetGroupId(const XPlaneVisitor& visitor, int64 group_id, XEvent* event) { AddOrUpdateIntStat(*visitor.GetStatMetadataId(StatType::kGroupId), group_id, event); @@ -146,8 +136,7 @@ bool NeedsVirtualEventsForAsyncExecutor( bool HasFunctionRun(EventNode* event_node) { for (EventNode* child : event_node->GetChildren()) { - if (child->GetPlaneVisitor().GetEventType(child->GetEvent()) == - HostEventType::kFunctionRun) { + if (child->GetEventVisitor().Type() == HostEventType::kFunctionRun) { return true; } } @@ -156,8 +145,21 @@ bool HasFunctionRun(EventNode* event_node) { } // namespace +EventNode::EventNode(const XPlaneVisitor* plane, XLine* raw_line, + XEvent* raw_event) + : plane_(plane), + visitor_(plane, raw_line, raw_event), + raw_line_(raw_line), + raw_event_(raw_event) {} + +EventNode::EventNode(const EventNode& event_node) + : plane_(event_node.plane_), + visitor_(event_node.plane_, event_node.raw_line_, event_node.raw_event_), + raw_line_(event_node.raw_line_), + raw_event_(event_node.raw_event_) {} + const XStat* EventNode::GetContextStat(int64 stat_type) const { - if (const XStat* stat = GetStat(*visitor_, *event_, stat_type)) { + if (const XStat* stat = visitor_.GetStats(stat_type)) { return stat; } else if (parent_) { return parent_->GetContextStat(stat_type); @@ -168,7 +170,7 @@ const XStat* EventNode::GetContextStat(int64 stat_type) const { std::string EventNode::GetGroupName() const { std::vector name_parts; if (const XStat* graph_type_stat = GetContextStat(StatType::kGraphType)) { - XStatVisitor stat(visitor_, graph_type_stat); + XStatVisitor stat(plane_, graph_type_stat); name_parts.push_back(stat.ToString()); } int64 step_num = group_id_.value_or(0); @@ -184,7 +186,7 @@ std::string EventNode::GetGroupName() const { void EventNode::PropagateGroupId(int64 group_id) { group_id_ = group_id; - SetGroupId(*visitor_, group_id, event_); + SetGroupId(*plane_, group_id, raw_event_); for (const auto& child : children_) { // Skip if it already belongs to a group. 
Some nodes may be added multiple // times as child (e.g., sometimes async ops are executed synchronously and @@ -196,13 +198,13 @@ void EventNode::PropagateGroupId(int64 group_id) { } void EventNode::AddStepName(absl::string_view step_name) { - AddOrUpdateStrStat(*visitor_->GetStatMetadataId(StatType::kStepName), - step_name, event_); + AddOrUpdateStrStat(*plane_->GetStatMetadataId(StatType::kStepName), step_name, + raw_event_); } void EventNode::SetIsEager(bool is_eager) { - AddOrUpdateIntStat(*visitor_->GetStatMetadataId(StatType::kIsEager), - is_eager ? 1 : 0, event_); + AddOrUpdateIntStat(*plane_->GetStatMetadataId(StatType::kIsEager), + is_eager ? 1 : 0, raw_event_); } bool EventNode::IsEager() { @@ -213,14 +215,9 @@ bool EventNode::IsEager() { FindParent(HostEventType::kEagerKernelExecute) != nullptr; } -bool EventNode::IsNestedIn(EventNode* parent) { - return parent && IsNested(GetEvent(), parent->GetEvent()); -} - EventNode* EventNode::FindParent(int64 event_type) { if (parent_) { - if (GetEventType(parent_->GetPlaneVisitor(), parent_->GetEvent()) == - event_type) { + if (parent_->GetEventVisitor().Type() == event_type) { return parent_; } return parent_->FindParent(event_type); @@ -233,10 +230,11 @@ void EventForest::ConnectIntraThread(const XPlaneVisitor& visitor, for (auto& line : *plane->mutable_lines()) { std::vector parent_nodes; for (auto& event : *line.mutable_events()) { - auto cur_node = absl::make_unique(&visitor, &event); + auto cur_node = absl::make_unique(&visitor, &line, &event); while (!parent_nodes.empty()) { EventNode* parent_node = parent_nodes.back(); - if (cur_node->IsNestedIn(parent_node)) { + if (parent_node->GetEventVisitor().GetTimespan().Includes( + cur_node->GetEventVisitor().GetTimespan())) { parent_node->AddChild(cur_node.get()); break; } else { @@ -357,12 +355,8 @@ void EventForest::CreateVirtualEventsForHostTrainingLoop() { if (!iter_num) continue; EventNode*& virtual_event_node = virtual_event_node_map[step_id][iter_num]; if (!virtual_event_node) { - std::unique_ptr new_virtual_event = - CreateVirtualEvent(*step_id_stat, *iter_num_stat); - auto new_virtual_event_node = absl::make_unique( - &executor_event_node->GetPlaneVisitor(), new_virtual_event.get()); - // virtual_event_container_ keeps new_virtual_event alive. - virtual_event_container_.push_back(std::move(new_virtual_event)); + auto new_virtual_event_node = + absl::make_unique(*executor_event_node); virtual_event_node = new_virtual_event_node.get(); // event_node_map_ keeps new_virtual_event_node alive. event_node_map_[HostEventType::kHostTrainingLoopIteration].push_back( @@ -380,12 +374,8 @@ void EventForest::CreateVirtualEventsForAsyncExecutor() { for (auto& eager_kernel_execute_event_node : *eager_kernel_execute_event_node_list) { if (HasFunctionRun(eager_kernel_execute_event_node.get())) { - auto new_virtual_event = absl::make_unique(); - auto new_virtual_event_node = absl::make_unique( - &eager_kernel_execute_event_node->GetPlaneVisitor(), - new_virtual_event.get()); - // virtual_event_container_ keeps new_virtual_event alive. - virtual_event_container_.push_back(std::move(new_virtual_event)); + auto new_virtual_event_node = + absl::make_unique(*eager_kernel_execute_event_node); virtual_event_node = new_virtual_event_node.get(); // event_node_map_ keeps new_virtual_event_node alive. 
event_node_map_[HostEventType::kAsyncExecutorTraceContext].push_back( diff --git a/tensorflow/core/profiler/utils/group_events.h b/tensorflow/core/profiler/utils/group_events.h index 4b6fc58e3b8..49a7b349589 100644 --- a/tensorflow/core/profiler/utils/group_events.h +++ b/tensorflow/core/profiler/utils/group_events.h @@ -47,12 +47,10 @@ struct InterThreadConnectInfo { // pointers, a tree of EventNode is formed. class EventNode { public: - // REQUIRED: visitor and event should not be nullptr. - explicit EventNode(const XPlaneVisitor* visitor, XEvent* event) - : visitor_(visitor), event_(event) { - DCHECK(visitor); - DCHECK(event); - } + // REQUIRED: all inputs should not be nullptr. + EventNode(const XPlaneVisitor* plane, XLine* raw_line, XEvent* raw_event); + + EventNode(const EventNode& event_node); EventNode* GetParent() const { return parent_; } @@ -70,9 +68,9 @@ class EventNode { // Sets group_id for this node and its descendants. void PropagateGroupId(int64 group_id); - const XPlaneVisitor& GetPlaneVisitor() const { return *visitor_; } + const XPlaneVisitor& GetPlaneVisitor() const { return *plane_; } - const XEvent& GetEvent() const { return *event_; } + const XEventVisitor& GetEventVisitor() const { return visitor_; } const XStat* GetContextStat(int64 stat_type) const; @@ -89,8 +87,10 @@ class EventNode { EventNode* FindParent(int64 event_type); private: - const XPlaneVisitor* visitor_; - XEvent* event_; + const XPlaneVisitor* plane_; + XEventVisitor visitor_; + XLine* raw_line_; + XEvent* raw_event_; EventNode* parent_ = nullptr; std::vector children_; absl::optional group_id_; @@ -100,8 +100,6 @@ using EventNodeMap = absl::flat_hash_map>>; -using VirtualEventContainer = std::vector>; - using EventGroupNameMap = absl::flat_hash_map; // Creates a forest of EventNode by stitching events in space using the nesting @@ -141,20 +139,19 @@ class EventForest { // Sets the is_eager stat to true for the eagerly executed CPU TF op events. void MarkEagerlyExecutedCpuTfOps(); - // Create virtual events of HostEventType::kHostTrainingLoopIteration and - // event nodes for them. A virtual event is created for each iteration of the - // host training loop and connected to the - // HostEventType::kExecutorStateProcess event nodes of the iteration. + // Create virtual events of HostEventType::kHostTrainingLoopIteration. A + // virtual event is created for each iteration of the host training loop and + // connected to the HostEventType::kExecutorStateProcess events of the + // iteration. void CreateVirtualEventsForHostTrainingLoop(); - // Create virutal events of HostEventType::kAsyncExecutorTraceContext and - // event nodes for them. A virtual event is created for every FunctionRun and - // the following eager ops (e.g., for Keras callback). + // Create virutal events of HostEventType::kAsyncExecutorTraceContext. A + // virtual event is created for every FunctionRun and the following eager ops + // (e.g., for Keras callback). void CreateVirtualEventsForAsyncExecutor(); EventNodeMap event_node_map_; std::vector visitors_; - VirtualEventContainer virtual_event_container_; EventGroupNameMap event_group_name_map_; }; From b1cb3f12da11b3d30da69b554e99638fd2865ca3 Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Thu, 28 May 2020 15:32:29 -0700 Subject: [PATCH 1344/1533] Handle Array of SymbolRef attributes during tf-executor-tpu-v1-island-outlining pass. 
PiperOrigin-RevId: 313671791 Change-Id: I088c53bb45df7f1f9a6284f0dc50173c91bf1b98 --- .../case_op.mlir | 47 +++++++++++++++++++ .../executor_tpuv1_outline_tpu_island.cc | 27 ++++++++--- 2 files changed, 67 insertions(+), 7 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir new file mode 100644 index 00000000000..de6f9b42ba4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/case_op.mlir @@ -0,0 +1,47 @@ +// RUN: tf-opt %s -tf-executor-tpu-v1-island-outlining | FileCheck %s --dump-input=fail + +// CHECK: func @control_input +// CHECK-NOT: func @ +// CHECK-LABEL: module @_tpu_v1_compat_outlined +// CHECK: @_tpu_v1_compat_outlined_func0 +// CHECK: func @branch_0 +// CHECK: func @branch_1 +// CHECK: func @branch_2 +// CHECK: func @branch_3 +// CHECK: func @branch_4 +module { + func @control_input(%arg0: tensor) -> tensor { + %0 = tf_executor.graph { + %output, %control = tf_executor.island { + "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", device = "device", num_replicas = 1, topology = "topology"} : () -> () + %index = "tf.opA"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %input = "tf.opB"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + %result = "tf.Case"(%index, %input) {branches = [@branch_0, @branch_1, @branch_2, @branch_3, @branch_4]} : (tensor, tensor) -> tensor + tf_executor.yield %result : tensor + } + tf_executor.fetch %output : tensor + + } + return %0 : tensor + } + func @branch_0(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @branch_1(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @branch_2(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @branch_3(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } + func @branch_4(%arg0: tensor) -> tensor { + %0 = "tf.some_op"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> tensor + return %0 : tensor + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc index 08645333d5d..e04f6bf3daa 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/executor_tpuv1_outline_tpu_island.cc @@ -49,6 +49,16 @@ struct TPUBridgeExecutorIslandOutlining void runOnOperation() override; }; +// Move FuncOp referenced by `symbol_ref` from one symbol table to another. 
+void MoveFuncOp(FlatSymbolRefAttr &symbol_ref, SymbolTable &from, + SymbolTable &to) { + if (to.lookup(symbol_ref.getValue())) return; + FuncOp callee = from.lookup(symbol_ref.getValue()); + callee.getOperation()->getBlock()->getOperations().remove( + callee.getOperation()); + to.insert(callee); +} + void TPUBridgeExecutorIslandOutlining::runOnOperation() { MLIRContext *ctx = &getContext(); @@ -141,14 +151,17 @@ void TPUBridgeExecutorIslandOutlining::runOnOperation() { for (FuncOp func : outlined_module.getOps()) { func.walk([&](Operation *op) { for (NamedAttribute attr : op->getAttrs()) { - auto symbol_ref = attr.second.dyn_cast(); - if (!symbol_ref) continue; - if (outlined_symbol_table.lookup(symbol_ref.getValue())) + if (auto symbol_ref = attr.second.dyn_cast()) { + MoveFuncOp(symbol_ref, symbol_table, outlined_symbol_table); continue; - FuncOp callee = symbol_table.lookup(symbol_ref.getValue()); - callee.getOperation()->getBlock()->getOperations().remove( - callee.getOperation()); - outlined_symbol_table.insert(callee); + } + if (auto array_attr = attr.second.dyn_cast()) { + for (const Attribute &attribute : array_attr) { + auto symbol_ref = attribute.dyn_cast(); + if (!symbol_ref) continue; + MoveFuncOp(symbol_ref, symbol_table, outlined_symbol_table); + } + } } }); } From 5c7717429144c1fcf999b64b2da1dfb1388f486d Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Thu, 28 May 2020 15:34:54 -0700 Subject: [PATCH 1345/1533] [XLA:SPMD] More cases of reverse sharding - Improve sharding propagation to reverse the tile assignment - Use reshard (collective permute) to fix mismatch operand sharding - Use halo exchange to fix uneven partitioning PiperOrigin-RevId: 313672162 Change-Id: I0816de794a0c18a0173889ed8cd638baecf389e9 --- .../compiler/xla/service/hlo_sharding_util.cc | 18 ++++++ .../compiler/xla/service/hlo_sharding_util.h | 6 ++ .../xla/service/sharding_propagation.cc | 13 +++++ .../xla/service/spmd/spmd_partitioner.cc | 48 +++++++++++---- .../xla/service/spmd/spmd_partitioner_test.cc | 58 ++++++++++++++++++- 5 files changed, 131 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc index 129091ca06f..7fc05608800 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ -220,6 +220,24 @@ absl::optional ReshapeSharding(const Shape& source_shape, return HloSharding::Tile(new_tile_assignment); } +HloSharding ReverseSharding(const HloSharding& sharding, + absl::Span dimensions) { + if (sharding.IsTileMaximal() || dimensions.empty()) { + return sharding; + } + + Array new_tile_assignment(sharding.tile_assignment().dimensions()); + new_tile_assignment.Each([&](absl::Span indices, int64* device) { + std::vector original_indices(indices.begin(), indices.end()); + for (int64 d : dimensions) { + original_indices[d] = + new_tile_assignment.dim(d) - 1 - original_indices[d]; + } + *device = sharding.tile_assignment()(original_indices); + }); + return HloSharding::Tile(new_tile_assignment); +} + HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, absl::Span dims) { CHECK(!sharding.IsTuple() && !sharding.IsTileMaximal()); diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.h b/tensorflow/compiler/xla/service/hlo_sharding_util.h index 00d9434a34d..562f6d1420d 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_util.h +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.h @@ -70,6 +70,12 @@ 
absl::optional ReshapeSharding(const Shape& source_shape, const Shape& target_shape, const HloSharding& sharding); +// Returns the HloSharding with the tile dimensions and tile assignment +// reversed based on the specified dimension numbers. In case of a tile +// maximal sharding returns the original sharding. +HloSharding ReverseSharding(const HloSharding& sharding, + absl::Span dimensions); + // Returns a sharding tiled on unique dimension dim by reshaping the tile // assignment of the sharding argument. Only dimensions in the dims span // argument are considered for reshaping, the others are ignored. diff --git a/tensorflow/compiler/xla/service/sharding_propagation.cc b/tensorflow/compiler/xla/service/sharding_propagation.cc index bee2e04fabf..c6990e76c95 100644 --- a/tensorflow/compiler/xla/service/sharding_propagation.cc +++ b/tensorflow/compiler/xla/service/sharding_propagation.cc @@ -717,6 +717,15 @@ bool InferShardingFromOperands(HloInstruction* instruction, } return false; } + case HloOpcode::kReverse: { + if (!IsSpatiallyPartitioned(instruction->operand(0))) { + return false; + } + return MaybeImproveInstructionSharding( + hlo_sharding_util::ReverseSharding( + instruction->operand(0)->sharding(), instruction->dimensions()), + instruction); + } case HloOpcode::kDot: { auto& dot_dim_numbs = instruction->dot_dimension_numbers(); // Batch dimensions are the same for lhs and rhs on dot operations. @@ -1188,6 +1197,10 @@ absl::optional GetShardingFromUser( return user.sharding(); } } + case HloOpcode::kReverse: { + return hlo_sharding_util::ReverseSharding(user.sharding(), + user.dimensions()); + } default: { // If the user output shape is compatible with the current instruction // shape excluding element type and the current instruction is supported diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index 068442ad5c7..a0c46e0b6e7 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -2325,18 +2325,44 @@ Status SpmdPartitioningVisitor::HandleReverse(HloInstruction* hlo) { if (reverse->sharding().IsTileMaximal()) { return DefaultAction(hlo); } - if (absl::c_all_of(reverse->dimensions(), [&](int64 d) { - return reverse->sharding().tile_assignment().dim(d) == 1; - })) { - auto operand = - GetPartitionedHlo(reverse->operand(0)).Reshard(reverse->sharding()); - SetPartitionedHlo(hlo, [&] { - return b_.AddInstruction( - hlo->CloneWithNewOperands(operand.hlo()->shape(), {operand.hlo()})); - }); - return Status::OK(); + auto operand = GetPartitionedHlo(reverse->operand(0)) + .Reshard(hlo_sharding_util::ReverseSharding( + reverse->sharding(), reverse->dimensions())); + // Create a window config to halo exchange for unevenly partitioned reverse + // dimensions. 
+ Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + int64 low_padding = 0; + if (absl::c_linear_search(reverse->dimensions(), i)) { + low_padding = + RoundUpToNearest(reverse->shape().dimensions(i), + reverse->sharding().tile_assignment().dim(i)) - + reverse->shape().dimensions(i); + } + dim->set_padding_low(low_padding); + dim->set_padding_high(0); + dim->set_base_dilation(1); } - return DefaultAction(hlo); + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, operand.sharding(), + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(reshard_operand->sharded_input->shape(), + {reshard_operand->sharded_input})); + }); + return Status::OK(); } Status SpmdPartitioningVisitor::HandleWhile(HloInstruction* hlo) { diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index e766695385b..2daf3444014 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -3212,7 +3212,7 @@ ENTRY entry { op::Shape("f32[9,9]"))); } -TEST_F(SpmdPartitioningTest, TiledReverse) { +TEST_F(SpmdPartitioningTest, TiledReversePassthrough) { const char* const hlo_string = R"( HloModule module @@ -3232,6 +3232,62 @@ ENTRY entry { op::Reshape(), op::Constant())))); } +TEST_F(SpmdPartitioningTest, TiledReversePassthroughViaReversedSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[4] parameter(0), sharding={devices=[2]0,1} + ROOT reverse = f32[4] reverse(param), dimensions={0}, + sharding={devices=[2]1,0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2]"), op::Reverse(op::Parameter(0)))); +} + +TEST_F(SpmdPartitioningTest, TiledReverseSwapShards) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[4] parameter(0), sharding={devices=[2]0,1} + ROOT reverse = f32[4] reverse(param), dimensions={0}, + sharding={devices=[2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Shape("f32[2]"), + op::Reverse(op::CollectivePermute(op::Parameter(0))))); +} + +TEST_F(SpmdPartitioningTest, TiledReverseHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[3] parameter(0), sharding={devices=[2]0,1} + ROOT reverse = f32[3] reverse(param), dimensions={0}, + sharding={devices=[2]1,0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + auto halo_exchange_concat = + op::Concatenate(AllOf(op::Shape("f32[1]"), + 
op::CollectivePermute(op::Slice(op::Parameter(0)))), + op::Parameter(0)); + auto after_halo_exchange = op::Slice(halo_exchange_concat); + EXPECT_THAT(root, + AllOf(op::Shape("f32[2]"), op::Reverse(after_halo_exchange))); +} + TEST_F(SpmdPartitioningTest, MixWithManualPartitioning) { const char* const hlo_string = R"( HloModule module From 3e842e5ffc07df0d2f627ab9d24c69acb17b1467 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 15:43:40 -0700 Subject: [PATCH 1346/1533] Add EnqueueTPUEmbeddingSparseTensorBatch and TF_EnqueueTPUEmbeddingRaggedTensorBatch ops to TF MLIR dialect. PiperOrigin-RevId: 313673623 Change-Id: I5a0c803e55a036a40d0ef2a9469895cddec15932 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 9f407ea774a..594dfafd991 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -2670,6 +2670,76 @@ This operation creates a tensor of `shape` and `dtype`. let hasFolder = 1; } +def TF_EnqueueTPUEmbeddingRaggedTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingRaggedTensorBatch", [SameVariadicOperandSize]> { + let summary = "Eases the porting of code that uses tf.nn.embedding_lookup()."; + + let description = [{ +sample_splits[i], embedding_indices[i] and aggregation_weights[i] correspond +to the ith feature. table_ids[i] indicates which embedding table to look up ith +feature. + +The tensors at corresponding positions in two of the input lists, +embedding_indices and aggregation_weights, must have the same shape, i.e. rank 1 +with dim_size() equal to the total number of lookups into the table described by +the corresponding feature. + }]; + + let arguments = (ins + Variadic:$sample_splits, + Variadic:$embedding_indices, + Variadic:$aggregation_weights, + TF_StrTensor:$mode_override, + + DefaultValuedAttr:$device_ordinal, + DefaultValuedAttr:$combiners, + I64ArrayAttr:$table_ids, + DefaultValuedAttr:$max_sequence_lengths + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T1 = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr T2 = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T3 = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; +} + +def TF_EnqueueTPUEmbeddingSparseTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingSparseTensorBatch", [SameVariadicOperandSize]> { + let summary = [{ +Eases the porting of code that uses tf.nn.embedding_lookup_sparse(). + }]; + + let description = [{ +sample_indices[i], embedding_indices[i] and aggregation_weights[i] correspond +to the ith feature. table_ids[i] indicates which embedding table to look up ith +feature. + +The tensors at corresponding positions in the three input lists (sample_indices, +embedding_indices and aggregation_weights) must have the same shape, i.e. rank 1 +with dim_size() equal to the total number of lookups into the table described by +the corresponding feature. 
+ }]; + + let arguments = (ins + Variadic:$sample_indices, + Variadic:$embedding_indices, + Variadic:$aggregation_weights, + TF_StrTensor:$mode_override, + + DefaultValuedAttr:$device_ordinal, + DefaultValuedAttr:$combiners, + I64ArrayAttr:$table_ids, + DefaultValuedAttr:$max_sequence_lengths + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr T1 = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr T2 = TF_DerivedOperandTypeAttr<1>; + TF_DerivedOperandTypeAttr T3 = TF_DerivedOperandTypeAttr<2>; + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; +} + def TF_EqualOp : TF_Op<"Equal", [Commutative, NoSideEffect]> { let summary = "Returns the truth value of (x == y) element-wise."; From eb40237008ff46dcaf07cb652d6f9170485b5de2 Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Thu, 28 May 2020 16:15:31 -0700 Subject: [PATCH 1347/1533] Clean up linspace implementation, support num=0 and integer dtypes PiperOrigin-RevId: 313679328 Change-Id: I610a0428b790de95d04fdde9f884a1493d8329a8 --- .../kernel_tests/signal/mel_ops_test.py | 2 - tensorflow/python/ops/math_ops.py | 42 +++++++++---------- tensorflow/python/ops/math_ops_test.py | 17 ++++---- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/tensorflow/python/kernel_tests/signal/mel_ops_test.py b/tensorflow/python/kernel_tests/signal/mel_ops_test.py index 3b5b236ddd5..a7aac484074 100644 --- a/tensorflow/python/kernel_tests/signal/mel_ops_test.py +++ b/tensorflow/python/kernel_tests/signal/mel_ops_test.py @@ -169,8 +169,6 @@ class LinearToMelTest(test.TestCase, parameterized.TestCase): return with self.assertRaises(ValueError): mel_ops.linear_to_mel_weight_matrix(num_mel_bins=0) - with self.assertRaises(ValueError): - mel_ops.linear_to_mel_weight_matrix(num_spectrogram_bins=0) with self.assertRaises(ValueError): mel_ops.linear_to_mel_weight_matrix(sample_rate=0.0) with self.assertRaises(ValueError): diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 9b3a211d6fe..bf462ba8716 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -180,42 +180,38 @@ def linspace_nd(start, stop, num, name=None, axis=0): axis = array_ops.where_v2(axis >= 0, axis, ndims + axis) + # The purpose is to avoid having negative values when repeating. + num_fill = gen_math_ops.maximum(num_int - 2, 0) # To avoid having negative values in the range or zero division # the result is sliced in the end so a correct result is returned for - # num == 1. + # num == 1, and num == 0. n_steps = gen_math_ops.maximum(num_int - 1, 1) delta = (expanded_stop - expanded_start) / cast(n_steps, expanded_stop.dtype) + # Re-cast tensors as delta. + expanded_start = cast(expanded_start, delta.dtype) + expanded_stop = cast(expanded_stop, delta.dtype) # If num < 0, we will throw exception in the range # otherwise use the same div for delta - range_end = array_ops.where_v2(num_int > 0, n_steps, -1) + range_end = array_ops.where_v2(num_int >= 0, n_steps, -1) # Even though range supports an output dtype, its limited # (e.g. doesn't support half at the moment). 
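# A minimal usage sketch of what this revision enables (it mirrors the updated
# math_ops_test below and assumes linspace_nd is the implementation behind the
# public tf.linspace endpoint):
#   import numpy as np
#   from tensorflow.python.ops import math_ops
#   math_ops.linspace_nd(0.0, 10.0, 0)                  # num=0 now yields an empty tensor
#   math_ops.linspace_nd(np.int64(1), np.int64(10), 4)  # integer start/stop now supported
#   math_ops.linspace_nd(np.ones((2,)), 10 * np.ones((2,)), 3, axis=0)  # shape (3, 2)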
- num_range = cast(range(1, range_end, dtype=dtypes.int64), start.dtype) - shape_range = range(ndims) - ones_like_shape_range = array_ops.ones_like(shape_range) - axis_tiled = ones_like_shape_range * axis - # the purpose is to avoid having negative values when repeating - num_fill = gen_math_ops.maximum(num_int - 2, 0) - num_tiled = array_ops.ones_like(shape_range) * num_fill - ones = array_ops.ones_like(num_tiled) - mask = gen_math_ops.equal(axis_tiled, shape_range) - # reshape_target is [1. 1. 1. ... 1. num 1. 1. ... 1.], where the index - # of num is equal to axis - reshape_target = array_ops.where_v2(mask, num_fill, shape) - delta_expanded = array_ops.reshape(delta, shape) - delta_repeated = array_ops.broadcast_to(delta_expanded, reshape_target) - start_repeated = array_ops.broadcast_to(expanded_start, reshape_target) + desired_range = cast(range(1, range_end, dtype=dtypes.int64), delta.dtype) + mask = gen_math_ops.equal(axis, range(ndims)) + # desired_range_shape is [1. 1. 1. ... 1. num_fill 1. 1. ... 1.], where the + # index of num_fill is equal to axis. + desired_range_shape = array_ops.where_v2(mask, num_fill, 1) + desired_range = array_ops.reshape(desired_range, desired_range_shape) - expanded_shape = array_ops.where_v2(mask, num_fill, ones) - range_indices = array_ops.reshape(num_range, expanded_shape) - tiled_range_indices = array_ops.tile(range_indices, shape) - res = start_repeated + delta_repeated * tiled_range_indices + res = expanded_start + delta * desired_range + + # Add the start and endpoints to the result, and slice out the desired + # portion. all_tensors = (expanded_start, res, expanded_stop) concatenated = array_ops.concat(all_tensors, axis=axis) begin = array_ops.zeros_like(shape) - num_slice = ones_like_shape_range * num_int - size = array_ops.where_v2(mask, num_slice, shape) + size = array_ops.where_v2(mask, num_int, shape) + return array_ops.slice(concatenated, begin, size) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 0434aadc066..ebcbd58d9d6 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -798,18 +798,21 @@ class LinspaceTest(test_util.TensorFlowTestCase): shapes = [(), (2,), (2, 2)] + types = [np.float64, np.int64] + for start_shape, stop_shape in itertools.product(shapes, repeat=2): - for num in [1, 2, 20]: + for num in [0, 1, 2, 20]: ndims = max(len(start_shape), len(stop_shape)) for axis in range(-ndims, ndims): - start = np.ones(start_shape) - stop = 10 * np.ones(stop_shape) + for dtype in types: + start = np.ones(start_shape, dtype) + stop = 10 * np.ones(stop_shape, dtype) - np_ans = np.linspace(start, stop, num, axis=axis) - tf_ans = self.evaluate( - math_ops.linspace_nd(start, stop, num, axis=axis)) + np_ans = np.linspace(start, stop, num, axis=axis) + tf_ans = self.evaluate( + math_ops.linspace_nd(start, stop, num, axis=axis)) - self.assertAllClose(np_ans, tf_ans) + self.assertAllClose(np_ans, tf_ans) if __name__ == "__main__": From 2003db55b152b1d016b6e0b8941b8cd254d74b86 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Thu, 28 May 2020 16:33:23 -0700 Subject: [PATCH 1348/1533] Support weights for CategoryEncoding. 
PiperOrigin-RevId: 313682327 Change-Id: I35b38d66ce5a429cff4ca7a178f13c6649b2027b --- .../layers/preprocessing/category_encoding.py | 32 ++++++++++- .../preprocessing/category_encoding_test.py | 53 +++++++++++++++++++ ...tal.preprocessing.-category-encoding.pbtxt | 2 +- ...tal.preprocessing.-category-encoding.pbtxt | 2 +- 4 files changed, 85 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding.py b/tensorflow/python/keras/layers/preprocessing/category_encoding.py index b0a7e746074..b9460ed059c 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_encoding.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding.py @@ -71,6 +71,20 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): [0, 1, 1, 0], [0, 1, 0, 1]])> + + Examples with weighted inputs: + + >>> layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding( + ... max_tokens=4) + >>> count_weights = np.array([[.1, .2], [.1, .1], [.2, .3], [.4, .2]]) + >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]], count_weights=count_weights) + + + Attributes: max_tokens: The maximum size of the vocabulary for this layer. If None, there is no cap on the size of the vocabulary. @@ -85,6 +99,12 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): value in each token slot. sparse: Boolean. If true, returns a `SparseTensor` instead of a dense `Tensor`. Defaults to `False`. + + Call arguments: + inputs: A 2D tensor `(samples, timesteps)`. + count_weights: A 2D tensor in the same shape as `inputs` indicating the + weight for each sample value when summing up in `count` mode. Not used in + `binary` or `tfidf` mode. """ def __init__(self, @@ -242,7 +262,10 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): tfidf_data = np.resize(tfidf_data, (self._max_tokens,)) K.set_value(self.tf_idf_weights, tfidf_data) - def call(self, inputs): + def call(self, inputs, count_weights=None): + if count_weights is not None and self._output_mode != COUNT: + raise ValueError("count_weights is not used in `output_mode='tf-idf'`, " + "or `output_mode='binary'`. 
Please pass a single input.") self._called = True if self._max_tokens is None: out_depth = K.get_value(self.num_elements) @@ -264,10 +287,15 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): binary_output = (self._output_mode == BINARY) if self._sparse: return bincount_ops.sparse_bincount( - inputs, minlength=out_depth, axis=-1, binary_output=binary_output) + inputs, + weights=count_weights, + minlength=out_depth, + axis=-1, + binary_output=binary_output) else: result = bincount_ops.bincount( inputs, + weights=count_weights, minlength=out_depth, dtype=dtypes.int64, axis=-1, diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py index 08aa6d4871b..edfacf0d2b3 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding_test.py @@ -109,6 +109,32 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, output_dataset = model.predict(sparse_tensor_data, steps=1) self.assertAllEqual(expected_output, output_dataset) + def test_sparse_input_with_weights(self): + input_array = np.array([[1, 2, 3, 4], [4, 3, 1, 4]], dtype=np.int64) + weights_array = np.array([[.1, .2, .3, .4], [.2, .1, .4, .3]]) + sparse_tensor_data = sparse_ops.from_dense(input_array) + sparse_weight_data = sparse_ops.from_dense(weights_array) + + # pyformat: disable + expected_output = [[0, .1, .2, .3, .4, 0], + [0, .4, 0, .1, .5, 0]] + # pyformat: enable + max_tokens = 6 + expected_output_shape = [None, max_tokens] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + weight_data = keras.Input(shape=(None,), dtype=dtypes.float32, sparse=True) + + layer = get_layer_class()( + max_tokens=max_tokens, output_mode=category_encoding.COUNT) + int_data = layer(input_data, count_weights=weight_data) + self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) + + model = keras.Model(inputs=[input_data, weight_data], outputs=int_data) + output_dataset = model.predict([sparse_tensor_data, sparse_weight_data], + steps=1) + self.assertAllClose(expected_output, output_dataset) + def test_sparse_input_sparse_output(self): sp_inp = sparse_tensor.SparseTensor( indices=[[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]], @@ -146,6 +172,33 @@ class CategoryEncodingInputTest(keras_parameterized.TestCase, sparse_ops.sparse_tensor_to_dense(sp_output_dataset, default_value=0), output_dataset) + def test_sparse_input_sparse_output_with_weights(self): + indices = [[0, 0], [1, 1], [2, 0], [2, 1], [3, 1]] + sp_inp = sparse_tensor.SparseTensor( + indices=indices, values=[0, 2, 1, 1, 0], dense_shape=[4, 2]) + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + sp_weight = sparse_tensor.SparseTensor( + indices=indices, values=[.1, .2, .4, .3, .2], dense_shape=[4, 2]) + weight_data = keras.Input(shape=(None,), dtype=dtypes.float32, sparse=True) + + # The expected output should be (X for missing value): + # [[1, X, X, X] + # [X, X, 1, X] + # [X, 2, X, X] + # [1, X, X, X]] + expected_indices = [[0, 0], [1, 2], [2, 1], [3, 0]] + expected_values = [.1, .2, .7, .2] + max_tokens = 6 + + layer = get_layer_class()( + max_tokens=max_tokens, output_mode=category_encoding.COUNT, sparse=True) + int_data = layer(input_data, count_weights=weight_data) + + model = keras.Model(inputs=[input_data, weight_data], outputs=int_data) + sp_output_dataset = model.predict([sp_inp, sp_weight], steps=1) + 
self.assertAllClose(expected_values, sp_output_dataset.values) + self.assertAllEqual(expected_indices, sp_output_dataset.indices) + def test_ragged_input(self): input_array = ragged_factory_ops.constant([[1, 2, 3], [3, 1]]) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt index 165a6de49a8..5bd938d8fd6 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt @@ -153,7 +153,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'count_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt index 2edcfbb6487..fe0e45c3c67 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-encoding.pbtxt @@ -151,7 +151,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'count_weights\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_mask" From 77cb02204f6198aad6c09f72e1eb3a2e33927de3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 16:33:42 -0700 Subject: [PATCH 1349/1533] Add replica id attribute to TPU Embedding ops during replicate to islands pass. PiperOrigin-RevId: 313682388 Change-Id: I0e72b06b5db5c4f92b62562de523adaa01c2fa30 --- .../tensorflow/tests/replicate_to_island.mlir | 35 +++++++++++++++++++ .../transforms/replicate_to_island.cc | 27 ++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir index 8da252fc832..28c542cded1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir @@ -141,3 +141,38 @@ func @replicate_result(%arg0: tensor, %arg1: tensor) { // CHECK: %[[REPLICA_0:.*]]:2, %{{.*}} = tf_executor.island // CHECK: %[[REPLICA_1:.*]]:2, %{{.*}} = tf_executor.island // CHECK: tf_executor.fetch %[[REPLICA_0]]#0, %[[REPLICA_1]]#0, %[[REPLICA_0]]#1, %[[REPLICA_1]]#1 + + +// Tests replica id is added correctly. 
+// CHECK-LABEL: func @replica_id_attr_added +func @replica_id_attr_added(%arg0: tensor, %arg1: tensor) { + tf_executor.graph { + tf_executor.island { + tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {n = 2 : i32} { + "tf.EnqueueTPUEmbeddingSparseTensorBatch"(%arg2){table_ids = [1, 2]} : (tensor) -> () + "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg2){table_ids = [1, 2]} : (tensor) -> () + "tf.A"(%arg2) : (tensor) -> () + tf_device.return + } + tf_executor.yield + } + tf_executor.fetch + } + return +} + +// CHECK: tf_executor.island +// CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch" +// CHECK-SAME: _xla_replica_id = 0 +// CHECK: "tf.EnqueueTPUEmbeddingRaggedTensorBatch" +// CHECK-SAME: _xla_replica_id = 0 +// CHECK: "tf.A" +// CHECK-NOT: _xla_replica_id +// CHECK: tf_executor.island +// CHECK: "tf.EnqueueTPUEmbeddingSparseTensorBatch" +// CHECK-SAME: _xla_replica_id = 1 +// CHECK: "tf.EnqueueTPUEmbeddingRaggedTensorBatch" +// CHECK-SAME: _xla_replica_id = 1 +// CHECK: "tf.A" +// CHECK-NOT: _xla_replica_id +// CHECK: tf_executor.fetch diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index 2fd230005d0..15eb5593651 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -37,18 +37,37 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/logging.h" namespace mlir { namespace TFDevice { namespace { constexpr char kDeviceAttr[] = "device"; +constexpr char kReplicaIdAttr[] = "_xla_replica_id"; struct ReplicateToIslandPass : public PassWrapper { void runOnFunction() override; }; +// Returns whether op requires `_xla_replica_id` attribute. +bool RequiresReplicaIDAttribute(Operation* op) { + return llvm::isa(op) || + llvm::isa(op); +} + +// Adds integer attribute that represents replica id for replicated ops that +// require replica id attribute. +void AddReplicaIdToOpsInReplicatedRegion(OpBuilder* builder, Region* region, + const int replica_id) { + region->walk([&](Operation* replicated_op) { + if (RequiresReplicaIDAttribute(replicated_op)) + replicated_op->setAttr(kReplicaIdAttr, + builder->getI32IntegerAttr(replica_id)); + }); +} + // Creates islands per replica from `tf_device.replicate` region. If for a // `tf_device.launch` op the device is an aliased device of the // `tf_device.replicate`, the device will be remapped to an explicit device @@ -90,6 +109,14 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // Copy over replicate region into replica island. replicate_op.body().cloneInto(&replica.body(), mapping); + // TODO(b/157624749): Replace this with better abstraction to + // differentiate ops for different replicas. + // Some ops, such as XlaHostCompute op or TPU Embedding ops, require + // replica id to be added as an op attribute to be used during + // execution. Handle such ops separately and add an integer attribute + // that represents replica id. + AddReplicaIdToOpsInReplicatedRegion(builder, &replica.body(), i); + // Map aliased devices to explicit devices based on replica. 
if (has_devices) { replica.walk([&](tf_device::LaunchOp launch) { From c1cac3ce3b8cb14cec25e7ca68dad77dfe0d2217 Mon Sep 17 00:00:00 2001 From: Vo Van Nghia Date: Fri, 29 May 2020 06:54:29 +0700 Subject: [PATCH 1350/1533] Add select between windows and linux --- tensorflow/c/experimental/filesystem/plugins/gcs/BUILD | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD index 90ddd4a891d..34142fec5f7 100644 --- a/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD +++ b/tensorflow/c/experimental/filesystem/plugins/gcs/BUILD @@ -19,7 +19,10 @@ tf_cc_shared_object( cc_library( name = "gcs_filesystem_impl", srcs = ["gcs_filesystem.cc"], - copts = get_win_copts(), + copts = select({ + "//conditions:default": [], + "//tensorflow:windows": get_win_copts(), + }), deps = [ "//tensorflow/c:tf_status", "//tensorflow/c/experimental/filesystem:filesystem_interface", From f5e17f30bd165ee84e40e0d74e4ac20bf4881a6d Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Thu, 28 May 2020 16:53:26 -0700 Subject: [PATCH 1351/1533] Align downloaded models to 64 bits. PiperOrigin-RevId: 313685922 Change-Id: I99564d3b775374569d0cc7315569d7309bfc1719 --- .../lite/micro/tools/make/third_party_downloads.inc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 806501a004a..10be8dbaa34 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -62,14 +62,14 @@ RUY_MD5="2d54f058f8f7120dfc1ecee79dbf259e" CIFAR10_DATASET_URL="https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz" CIFAR10_DATASET_MD5="c32a1d4ab5d03f1284b67883e8d87530" -IMAGE_RECOGNITION_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/models/tflite/cifar_image_recognition_model_2020_4_14.zip" -IMAGE_RECOGNITION_MODEL_MD5 := "2b886156e7ef4d6e53d0f1a4bc800e56" +IMAGE_RECOGNITION_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/models/tflite/cifar_image_recognition_model_2020_05_27.zip" +IMAGE_RECOGNITION_MODEL_MD5 := "1f4607b05ac45b8a6146fb883dbc2d7b" -PERSON_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_grayscale_2019_11_21.zip" -PERSON_MODEL_MD5 := "fe2934bd0788f1dcc7af3f0a954542ab" +PERSON_MODEL_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_grayscale_2020_05_27.zip" +PERSON_MODEL_MD5 := "55b85f76e2995153e660391d4a209ef1" -PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_01_13.zip" -PERSON_MODEL_INT8_MD5 := "8a7d2c70325f53136faea6dde517b8cc" +PERSON_MODEL_INT8_URL := "https://storage.googleapis.com/download.tensorflow.org/data/tf_lite_micro_person_data_int8_grayscale_2020_05_27.zip" +PERSON_MODEL_INT8_MD5 := "a0ede2d058aa2a1d413893455dd55352" EMBARC_MLI_URL := "https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli/archive/58284867ca52d1f43b25045e8601999d7359d986.zip" EMBARC_MLI_MD5 := "2bf4982a327fdaa9d475803ce014d1ef" From 547daed2591ff84b4c9d27ae26336ab4b6d5bf06 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 28 May 2020 16:54:35 -0700 Subject: [PATCH 1352/1533] Fix HandleCopies so it no longer invokes UB when the `params` or `out` TensorMaps are empty with no backing data. PiperOrigin-RevId: 313686113 Change-Id: I4b38d7e7e8cebb40d8b7f2390f841f0b541a01e5 --- tensorflow/core/kernels/gather_functor.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/gather_functor.h b/tensorflow/core/kernels/gather_functor.h index ab8b4b2dd48..948273aa33a 100644 --- a/tensorflow/core/kernels/gather_functor.h +++ b/tensorflow/core/kernels/gather_functor.h @@ -44,8 +44,8 @@ SliceIndex HandleCopies(OpKernelContext* ctx, const SliceIndex indices_size = static_cast(indices.dimension(0)); const SliceIndex batch_size = static_cast(params.dimension(0)); const Index limit = static_cast(params.dimension(1)); - T* out_base = &out(0, 0, 0); - const T* params_base = ¶ms(0, 0, 0); + T* out_base = out.data(); + const T* params_base = params.data(); if (static_slice_elems >= 0) { // Give compiler static knowledge of the number of elements/bytes slice_elems = static_slice_elems; From 347fe6ece4535f575c0cde17812b2818d2453987 Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: Thu, 28 May 2020 17:00:12 -0700 Subject: [PATCH 1353/1533] Use the new V2 optimizer in resnet50 benchmark. PiperOrigin-RevId: 313687026 Change-Id: If51837cddc1a7e4d2ef70c064788a8f4a7728a6a --- tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py b/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py index 30e2585e842..b2e57c11e3c 100644 --- a/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py +++ b/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py @@ -355,7 +355,7 @@ class ResNet50Benchmarks(tf.test.Benchmark): (images, labels) = resnet50_test_util.random_batch( batch_size, data_format) model = resnet50.ResNet50(data_format) - optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.1) + optimizer = tf.keras.optimizers.SGD(0.1) apply_grads = apply_gradients if defun: model.call = tf.function(model.call) From 5c9dd54540fc087387b6b9c998437149ef6a60d9 Mon Sep 17 00:00:00 2001 From: Khanh LeViet Date: Thu, 28 May 2020 17:08:38 -0700 Subject: [PATCH 1354/1533] Stop using deprecated enum tf.lite.Optimize.OPTIMIZE_FOR_SIZE and some other clean up PiperOrigin-RevId: 313688439 Change-Id: Ice2c0d9aefc57681c20d11dade772251a9eab84e --- .../post_training_float16_quant.ipynb | 32 ++++--------------- .../post_training_integer_quant.ipynb | 23 +++++-------- .../performance/post_training_quant.ipynb | 19 ++++------- 3 files changed, 20 insertions(+), 54 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb b/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb index cf589a2b968..ef08902865e 100644 --- a/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb +++ b/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb @@ -61,6 +61,9 @@ " \u003ctd\u003e\n", " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca 
href=\"https://storage.googleapis.com/tensorflow_docs/tensorflow/lite/g3doc/performance/post_training_float16_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", + " \u003c/td\u003e\n", "\u003c/table\u003e" ] }, @@ -113,13 +116,7 @@ "import logging\n", "logging.getLogger(\"tensorflow\").setLevel(logging.DEBUG)\n", "\n", - "try:\n", - " # %tensorflow_version only exists in Colab.\n", - " import tensorflow.compat.v2 as tf\n", - "except Exception:\n", - " pass\n", - "tf.enable_v2_behavior()\n", - "\n", + "import tensorflow as tf\n", "from tensorflow import keras\n", "import numpy as np\n", "import pathlib" @@ -173,12 +170,12 @@ " keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation=tf.nn.relu),\n", " keras.layers.MaxPooling2D(pool_size=(2, 2)),\n", " keras.layers.Flatten(),\n", - " keras.layers.Dense(10, activation=tf.nn.softmax)\n", + " keras.layers.Dense(10)\n", "])\n", "\n", "# Train the digit classification model\n", "model.compile(optimizer='adam',\n", - " loss='sparse_categorical_crossentropy',\n", + " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", " metrics=['accuracy'])\n", "model.fit(\n", " train_images,\n", @@ -597,28 +594,11 @@ "\n", "Detailed documentation on the TFLite GPU delegate and how to use it in your application can be found [here](https://www.tensorflow.org/lite/performance/gpu_advanced?source=post_page---------------------------)" ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": {}, - "colab_type": "code", - "id": "BeUSdwKVixvk" - }, - "outputs": [], - "source": [ - "" - ] } ], "metadata": { "colab": { "collapsed_sections": [], - "last_runtime": { - "build_target": "//learning/brain/python/client:colab_notebook_py3", - "kind": "private" - }, "name": "post_training-float16-quant.ipynb", "private_outputs": true, "provenance": [], diff --git a/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb b/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb index fddee15bc1d..ad461f56d6f 100644 --- a/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb +++ b/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb @@ -61,6 +61,9 @@ " \u003ctd\u003e\n", " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tensorflow/lite/g3doc/performance/post_training_integer_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", + " \u003c/td\u003e\n", "\u003c/table\u003e" ] }, @@ -114,13 +117,7 @@ "import logging\n", "logging.getLogger(\"tensorflow\").setLevel(logging.DEBUG)\n", "\n", - "try:\n", - " # %tensorflow_version only exists in Colab.\n", - " import tensorflow.compat.v2 as tf\n", - "except Exception:\n", - " pass\n", - "tf.enable_v2_behavior()\n", - "\n", + "import tensorflow as tf\n", "from tensorflow import keras\n", "import numpy as np\n", "import pathlib" @@ -158,15 +155,15 @@ "model = keras.Sequential([\n", " keras.layers.InputLayer(input_shape=(28, 28)),\n", " keras.layers.Reshape(target_shape=(28, 28, 1)),\n", - " keras.layers.Conv2D(filters=12, 
kernel_size=(3, 3), activation=tf.nn.relu),\n", + " keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation='relu'),\n", " keras.layers.MaxPooling2D(pool_size=(2, 2)),\n", " keras.layers.Flatten(),\n", - " keras.layers.Dense(10, activation=tf.nn.softmax)\n", + " keras.layers.Dense(10)\n", "])\n", "\n", "# Train the digit classification model\n", "model.compile(optimizer='adam',\n", - " loss='sparse_categorical_crossentropy',\n", + " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", " metrics=['accuracy'])\n", "model.fit(\n", " train_images,\n", @@ -277,7 +274,7 @@ }, "outputs": [], "source": [ - "converter.optimizations = [tf.lite.Optimize.OPTIMIZE_FOR_SIZE]" + "converter.optimizations = [tf.lite.Optimize.DEFAULT]" ] }, { @@ -655,10 +652,6 @@ "metadata": { "colab": { "collapsed_sections": [], - "last_runtime": { - "build_target": "//learning/brain/python/client:colab_notebook_py3", - "kind": "private" - }, "name": "post_training_integer_quant.ipynb", "private_outputs": true, "provenance": [], diff --git a/tensorflow/lite/g3doc/performance/post_training_quant.ipynb b/tensorflow/lite/g3doc/performance/post_training_quant.ipynb index d6edb656d0e..201ccf5bdc3 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quant.ipynb +++ b/tensorflow/lite/g3doc/performance/post_training_quant.ipynb @@ -61,6 +61,9 @@ " \u003ctd\u003e\n", " \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/performance/post_training_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", " \u003c/td\u003e\n", + " \u003ctd\u003e\n", + " \u003ca href=\"https://storage.googleapis.com/tensorflow_docs/tensorflow/lite/g3doc/performance/post_training_quant.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/download_logo_32px.png\" /\u003eDownload notebook\u003c/a\u003e\n", + " \u003c/td\u003e\n", "\u003c/table\u003e" ] }, @@ -134,13 +137,7 @@ "import logging\n", "logging.getLogger(\"tensorflow\").setLevel(logging.DEBUG)\n", "\n", - "try:\n", - " # %tensorflow_version only exists in Colab.\n", - " import tensorflow.compat.v2 as tf\n", - "except Exception:\n", - " pass\n", - "tf.enable_v2_behavior()\n", - "\n", + "import tensorflow as tf\n", "from tensorflow import keras\n", "import numpy as np\n", "import pathlib" @@ -181,12 +178,12 @@ " keras.layers.Conv2D(filters=12, kernel_size=(3, 3), activation=tf.nn.relu),\n", " keras.layers.MaxPooling2D(pool_size=(2, 2)),\n", " keras.layers.Flatten(),\n", - " keras.layers.Dense(10, activation=tf.nn.softmax)\n", + " keras.layers.Dense(10)\n", "])\n", "\n", "# Train the digit classification model\n", "model.compile(optimizer='adam',\n", - " loss='sparse_categorical_crossentropy',\n", + " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", " metrics=['accuracy'])\n", "model.fit(\n", " train_images,\n", @@ -620,10 +617,6 @@ "metadata": { "colab": { "collapsed_sections": [], - "last_runtime": { - "build_target": "//learning/brain/python/client:colab_notebook_py3", - "kind": "private" - }, "name": "post_training_quant.ipynb", "private_outputs": true, "provenance": [], From 12bda81b3d998ef554c0e0a7c266bca0b4409f13 Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Thu, 28 May 2020 17:25:45 -0700 Subject: [PATCH 1355/1533] Add Buckettize, SparseCross and BoostedTreesBucketize to flex whitelist PiperOrigin-RevId: 313690852 Change-Id: I9dc21e2439d103ea3783652fb38826e0fd76aaaf --- 
tensorflow/core/kernels/BUILD | 8 ++++++++ tensorflow/core/kernels/boosted_trees/BUILD | 2 ++ tensorflow/core/kernels/boosted_trees/quantiles/BUILD | 10 ++++++++++ tensorflow/core/platform/BUILD | 1 + tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc | 5 +++++ 5 files changed, 26 insertions(+) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index c1fc17079c8..47b2fa44f57 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -6617,6 +6617,7 @@ filegroup( "batch_matmul_op_impl.h", "batch_norm_op.h", "broadcast_to_op.h", + "bucketize_op.h", "control_flow_ops.h", "conv_2d.h", "conv_3d.h", @@ -6681,6 +6682,8 @@ filegroup( "transpose_op.h", "where_op.h", "xent_op.h", + ] + [ + "//tensorflow/core/kernels/boosted_trees/quantiles:weighted_quantiles_hdrs", ], ) @@ -6810,6 +6813,7 @@ filegroup( srcs = [ "batchtospace_op.cc", "broadcast_to_op.cc", + "bucketize_op.cc", "ctc_decoder_ops.cc", "decode_bmp_op.cc", "depthtospace_op.cc", @@ -6886,6 +6890,7 @@ filegroup( "spacetobatch_functor.cc", "spacetobatch_op.cc", "spacetodepth_op.cc", + "sparse_cross_op.cc", "sparse_fill_empty_rows_op.cc", "sparse_reshape_op.cc", "sparse_to_dense_op.cc", @@ -6934,6 +6939,8 @@ filegroup( "where_op.cc", "xent_op.cc", ":android_extended_ops_headers", + ] + [ + "//tensorflow/core/kernels/boosted_trees:quantile_ops.cc", ], ) @@ -7096,6 +7103,7 @@ cc_library( deps = [ "//tensorflow/core:portable_tensorflow_lib_lite", "//tensorflow/core:protos_all_cc_impl", + "//tensorflow/core/platform:strong_hash", "//third_party/eigen3", "//third_party/fft2d:fft2d_headers", "@com_google_absl//absl/base", diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD index b1f6d9d595f..4ca40dc4177 100644 --- a/tensorflow/core/kernels/boosted_trees/BUILD +++ b/tensorflow/core/kernels/boosted_trees/BUILD @@ -19,6 +19,8 @@ package( licenses = ["notice"], # Apache 2.0 ) +exports_files(["quantile_ops.cc"]) + tf_proto_library( name = "boosted_trees_proto", srcs = [ diff --git a/tensorflow/core/kernels/boosted_trees/quantiles/BUILD b/tensorflow/core/kernels/boosted_trees/quantiles/BUILD index 1b48065d6f7..fb03e284d8d 100644 --- a/tensorflow/core/kernels/boosted_trees/quantiles/BUILD +++ b/tensorflow/core/kernels/boosted_trees/quantiles/BUILD @@ -10,6 +10,16 @@ package( exports_files(["LICENSE"]) +filegroup( + name = "weighted_quantiles_hdrs", + srcs = [ + "quantile_stream_resource.h", + "weighted_quantiles_buffer.h", + "weighted_quantiles_stream.h", + "weighted_quantiles_summary.h", + ], +) + # Quantiles cc_library( diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 7f7ca0f06cd..30734a840d1 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -675,6 +675,7 @@ cc_library( cc_library( name = "strong_hash", hdrs = ["strong_hash.h"], + visibility = ["//visibility:public"], deps = [ ":platform", ":types", diff --git a/tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc b/tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc index b38a66f5687..d9150698298 100644 --- a/tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc +++ b/tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc @@ -66,6 +66,8 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) { "BiasAdd", "BiasAddGrad", "BiasAddV1", + "BoostedTreesBucketize", + "Bucketize", "BroadcastArgs", "BroadcastGradientArgs", "BroadcastTo", @@ -386,6 +388,9 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) { 
"SparseApplyProximalAdagrad", "SparseApplyProximalGradientDescent", "SparseApplyRMSProp", + "SparseCross", + "SparseCrossHashed", + "SparseCrossV2", "SparseFillEmptyRows", "SparseFillEmptyRowsGrad", "SparseReshape", From f9fb66cdb7419d2eadf7faea995f1aacb032104b Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Thu, 28 May 2020 17:27:56 -0700 Subject: [PATCH 1356/1533] Correctly add newline character when concatenating schema generated python srcs The last sed command used for adding the import statement and the '\n' was working as intended on Linux, but was failing on macOS, due to the difference in sed behavior between GNU and BSD. Fixed the issue by using an actual newline character with $'\n', which correctly works on both platforms. Fixes https://github.com/tensorflow/tensorflow/issues/39756 PiperOrigin-RevId: 313691192 Change-Id: I96e5544c03641bae05753e8f7d1346c8aa1c0f6e --- third_party/flatbuffers/build_defs.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/flatbuffers/build_defs.bzl b/third_party/flatbuffers/build_defs.bzl index 9be627119cf..02027aa09f5 100644 --- a/third_party/flatbuffers/build_defs.bzl +++ b/third_party/flatbuffers/build_defs.bzl @@ -362,7 +362,7 @@ def _concat_flatbuffer_py_srcs_impl(ctx): "sed 's/from flatbuffers." + "/from flatbuffers.python.flatbuffers./' |" + "sed '1s/^/from flatbuffers.python " + - "import flatbuffers\\n/' > %s" + "import flatbuffers\\'$'\\n/' > %s" ) % ( ctx.attr.deps[0].files.to_list()[0].path, ctx.outputs.out.path, From 33689c48ad5e00908cd59089ef1956e1478fda78 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Thu, 28 May 2020 17:28:40 -0700 Subject: [PATCH 1357/1533] Add MicroOpResolver interface class. This will allow us to implement selective registration of the builtin parse functions without changing the OpResolver base class in TFLite. * MicroOpResolver is now an interface (matching the OpResolver name in TFLite). * MicroMutableOpResolver is the implementation of the MicroOpResolver interface that should be used by applications that do not want to use AllOpsResolver. 
PiperOrigin-RevId: 313691276 Change-Id: I0a9f51f6584326a3b3dd645cde083ba42116083d --- tensorflow/lite/micro/BUILD | 2 + .../micro/benchmarks/keyword_benchmark.cc | 2 +- .../benchmarks/person_detection_benchmark.cc | 2 +- .../image_recognition_test.cc | 2 +- .../image_recognition_experimental/main.cc | 2 +- .../examples/magic_wand/magic_wand_test.cc | 2 +- .../examples/magic_wand/main_functions.cc | 2 +- .../examples/micro_speech/main_functions.cc | 2 +- .../micro_speech/micro_speech_test.cc | 2 +- .../person_detection/main_functions.cc | 2 +- .../person_detection/person_detection_test.cc | 2 +- .../main_functions.cc | 2 +- .../person_detection_test.cc | 2 +- .../lite/micro/kernels/all_ops_resolver.h | 7 ++- tensorflow/lite/micro/memory_helpers.cc | 1 + tensorflow/lite/micro/micro_allocator.cc | 16 +++-- tensorflow/lite/micro/micro_allocator.h | 6 +- tensorflow/lite/micro/micro_interpreter.cc | 5 +- tensorflow/lite/micro/micro_interpreter.h | 6 +- .../lite/micro/micro_interpreter_test.cc | 20 ++++++- .../lite/micro/micro_mutable_op_resolver.h | 32 +++++----- .../micro/micro_mutable_op_resolver_test.cc | 36 ++++++------ tensorflow/lite/micro/micro_op_resolver.h | 58 +++++++++++++++++++ 23 files changed, 150 insertions(+), 63 deletions(-) create mode 100644 tensorflow/lite/micro/micro_op_resolver.h diff --git a/tensorflow/lite/micro/BUILD b/tensorflow/lite/micro/BUILD index 3b05aee30f4..36cc14512b6 100644 --- a/tensorflow/lite/micro/BUILD +++ b/tensorflow/lite/micro/BUILD @@ -37,6 +37,7 @@ cc_library( "micro_allocator.h", "micro_interpreter.h", "micro_mutable_op_resolver.h", + "micro_op_resolver.h", "micro_optional_debug_tools.h", "simple_memory_allocator.h", "test_helpers.h", @@ -159,6 +160,7 @@ tflite_micro_cc_test( deps = [ ":micro_framework", ":micro_utils", + "//tensorflow/lite/core/api", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/micro/testing:micro_test", ], diff --git a/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc b/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc index fd8556f752e..50401039265 100644 --- a/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc +++ b/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc @@ -84,7 +84,7 @@ class KeywordRunner { const tflite::Model* keyword_spotting_model_; tflite::MicroErrorReporter micro_reporter_; tflite::ErrorReporter* reporter_; - tflite::MicroOpResolver<6> resolver_; + tflite::MicroMutableOpResolver<6> resolver_; tflite::MicroInterpreter interpreter_; }; diff --git a/tensorflow/lite/micro/benchmarks/person_detection_benchmark.cc b/tensorflow/lite/micro/benchmarks/person_detection_benchmark.cc index 5287a9c1e23..bec12ad8642 100644 --- a/tensorflow/lite/micro/benchmarks/person_detection_benchmark.cc +++ b/tensorflow/lite/micro/benchmarks/person_detection_benchmark.cc @@ -98,7 +98,7 @@ class PersonDetectionRunner { const tflite::Model* person_detection_model_; tflite::MicroErrorReporter micro_reporter_; tflite::ErrorReporter* reporter_; - tflite::MicroOpResolver<6> resolver_; + tflite::MicroMutableOpResolver<6> resolver_; tflite::MicroInterpreter interpreter_; }; diff --git a/tensorflow/lite/micro/examples/image_recognition_experimental/image_recognition_test.cc b/tensorflow/lite/micro/examples/image_recognition_experimental/image_recognition_test.cc index fd547b433ef..ac4de118834 100644 --- a/tensorflow/lite/micro/examples/image_recognition_experimental/image_recognition_test.cc +++ b/tensorflow/lite/micro/examples/image_recognition_experimental/image_recognition_test.cc @@ -42,7 +42,7 @@ 
TF_LITE_MICRO_TEST(TestImageRecognitionInvoke) { model->version(), TFLITE_SCHEMA_VERSION); } - tflite::MicroOpResolver<4> micro_op_resolver; + tflite::MicroMutableOpResolver<4> micro_op_resolver; micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, tflite::ops::micro::Register_CONV_2D()); diff --git a/tensorflow/lite/micro/examples/image_recognition_experimental/main.cc b/tensorflow/lite/micro/examples/image_recognition_experimental/main.cc index 09c76df0379..becdbdf1bd7 100644 --- a/tensorflow/lite/micro/examples/image_recognition_experimental/main.cc +++ b/tensorflow/lite/micro/examples/image_recognition_experimental/main.cc @@ -56,7 +56,7 @@ int main(int argc, char** argv) { return 1; } - tflite::MicroOpResolver<4> micro_op_resolver; + tflite::MicroMutableOpResolver<4> micro_op_resolver; micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, tflite::ops::micro::Register_CONV_2D()); diff --git a/tensorflow/lite/micro/examples/magic_wand/magic_wand_test.cc b/tensorflow/lite/micro/examples/magic_wand/magic_wand_test.cc index ab4f41680fd..88bfad860e2 100644 --- a/tensorflow/lite/micro/examples/magic_wand/magic_wand_test.cc +++ b/tensorflow/lite/micro/examples/magic_wand/magic_wand_test.cc @@ -46,7 +46,7 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) { // An easier approach is to just use the AllOpsResolver, but this will // incur some penalty in code space for op implementations that are not // needed by this graph. - static tflite::MicroOpResolver<5> micro_op_resolver; // NOLINT + static tflite::MicroMutableOpResolver<5> micro_op_resolver; // NOLINT micro_op_resolver.AddBuiltin( tflite::BuiltinOperator_DEPTHWISE_CONV_2D, tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); diff --git a/tensorflow/lite/micro/examples/magic_wand/main_functions.cc b/tensorflow/lite/micro/examples/magic_wand/main_functions.cc index 51e6e593cd1..26c2eb44747 100644 --- a/tensorflow/lite/micro/examples/magic_wand/main_functions.cc +++ b/tensorflow/lite/micro/examples/magic_wand/main_functions.cc @@ -65,7 +65,7 @@ void setup() { // An easier approach is to just use the AllOpsResolver, but this will // incur some penalty in code space for op implementations that are not // needed by this graph. 
- static tflite::MicroOpResolver<5> micro_op_resolver; // NOLINT + static tflite::MicroMutableOpResolver<5> micro_op_resolver; // NOLINT micro_op_resolver.AddBuiltin( tflite::BuiltinOperator_DEPTHWISE_CONV_2D, tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); diff --git a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc index e5e6aa7c1f7..5369008182b 100644 --- a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc +++ b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc @@ -74,7 +74,7 @@ void setup() { // // tflite::ops::micro::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroOpResolver<4> micro_op_resolver(error_reporter); + static tflite::MicroMutableOpResolver<4> micro_op_resolver(error_reporter); if (micro_op_resolver.AddBuiltin( tflite::BuiltinOperator_DEPTHWISE_CONV_2D, tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), diff --git a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc index a6e011b1224..b1b224c9391 100644 --- a/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc +++ b/tensorflow/lite/micro/examples/micro_speech/micro_speech_test.cc @@ -48,7 +48,7 @@ TF_LITE_MICRO_TEST(TestInvoke) { // needed by this graph. // // tflite::ops::micro::AllOpsResolver resolver; - tflite::MicroOpResolver<4> micro_op_resolver; + tflite::MicroMutableOpResolver<4> micro_op_resolver; micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), tflite::MicroOpResolverAnyVersion()); diff --git a/tensorflow/lite/micro/examples/person_detection/main_functions.cc b/tensorflow/lite/micro/examples/person_detection/main_functions.cc index 0e5c6394d56..6b07f6514d5 100644 --- a/tensorflow/lite/micro/examples/person_detection/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection/main_functions.cc @@ -65,7 +65,7 @@ void setup() { // // tflite::ops::micro::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroOpResolver<3> micro_op_resolver; + static tflite::MicroMutableOpResolver<3> micro_op_resolver; micro_op_resolver.AddBuiltin( tflite::BuiltinOperator_DEPTHWISE_CONV_2D, tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); diff --git a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc index 8acb93ced17..dafed8089e3 100644 --- a/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection/person_detection_test.cc @@ -54,7 +54,7 @@ TF_LITE_MICRO_TEST(TestInvoke) { // needed by this graph. 
// // tflite::ops::micro::AllOpsResolver resolver; - tflite::MicroOpResolver<3> micro_op_resolver; + tflite::MicroMutableOpResolver<3> micro_op_resolver; micro_op_resolver.AddBuiltin( tflite::BuiltinOperator_DEPTHWISE_CONV_2D, tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc index 92d2c091f55..6f10d5c3f27 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc @@ -72,7 +72,7 @@ void setup() { // // tflite::ops::micro::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroOpResolver<12> micro_op_resolver; + static tflite::MicroMutableOpResolver<12> micro_op_resolver; micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), 1, 3); diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index c3719e559ca..ea37faa15f2 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -52,7 +52,7 @@ TF_LITE_MICRO_TEST(TestInvoke) { // An easier approach is to just use the AllOpsResolver, but this will // incur some penalty in code space for op implementations that are not // needed by this graph. - tflite::MicroOpResolver<11> micro_op_resolver; + tflite::MicroMutableOpResolver<11> micro_op_resolver; micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), 1, 3); diff --git a/tensorflow/lite/micro/kernels/all_ops_resolver.h b/tensorflow/lite/micro/kernels/all_ops_resolver.h index 26bb03230ed..5637316d0da 100644 --- a/tensorflow/lite/micro/kernels/all_ops_resolver.h +++ b/tensorflow/lite/micro/kernels/all_ops_resolver.h @@ -19,7 +19,12 @@ namespace tflite { namespace ops { namespace micro { -class AllOpsResolver : public MicroMutableOpResolver { +// The magic number in the template parameter is the maximum number of ops that +// can be added to AllOpsResolver. It can be increased if needed. And most +// applications that care about the memory footprint will want to directly use +// MicroMutableOpResolver and have an application specific template parameter. +// The examples directory has sample code for this. +class AllOpsResolver : public MicroMutableOpResolver<128> { public: AllOpsResolver(); diff --git a/tensorflow/lite/micro/memory_helpers.cc b/tensorflow/lite/micro/memory_helpers.cc index 05105f83ff3..37c78162b62 100644 --- a/tensorflow/lite/micro/memory_helpers.cc +++ b/tensorflow/lite/micro/memory_helpers.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index d43f0ec076f..35f4bdabd20 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -24,11 +24,14 @@ limitations under the License. 
#include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/core/api/tensor_utils.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/memory_planner/greedy_memory_planner.h" #include "tensorflow/lite/micro/memory_planner/memory_planner.h" +#include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/micro/simple_memory_allocator.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { @@ -431,7 +434,7 @@ MicroAllocator::MicroAllocator(TfLiteContext* context, const Model* model, } TfLiteStatus MicroAllocator::InitializeFromFlatbuffer( - const OpResolver& op_resolver, + const MicroOpResolver& op_resolver, NodeAndRegistration** node_and_registrations) { if (!active_) { return kTfLiteError; @@ -649,7 +652,7 @@ TfLiteStatus MicroAllocator::AllocateNodeAndRegistrations( } TfLiteStatus MicroAllocator::PrepareNodeAndRegistrationDataFromFlatbuffer( - const OpResolver& op_resolver, + const MicroOpResolver& op_resolver, NodeAndRegistration* node_and_registrations) { TfLiteStatus status = kTfLiteOk; auto* opcodes = model_->operator_codes(); @@ -697,9 +700,12 @@ TfLiteStatus MicroAllocator::PrepareNodeAndRegistrationDataFromFlatbuffer( custom_data = reinterpret_cast(op->custom_options()->data()); custom_data_size = op->custom_options()->size(); } else { - TF_LITE_ENSURE_STATUS(ParseOpData(op, op_type, error_reporter_, - &builtin_data_allocator, - (void**)(&builtin_data))); + MicroOpResolver::BuiltinParseFunction parser = + op_resolver.GetOpDataParser(op_type); + TFLITE_DCHECK(parser != nullptr); + TF_LITE_ENSURE_STATUS(parser(op, op_type, error_reporter_, + &builtin_data_allocator, + (void**)(&builtin_data))); } // Disregard const qualifier to workaround with existing API. diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index 1dd90c36a4d..d7f7e4c9d6c 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -21,7 +21,7 @@ limitations under the License. #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/op_resolver.h" +#include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/micro/simple_memory_allocator.h" #include "tensorflow/lite/schema/schema_generated.h" @@ -92,7 +92,7 @@ class MicroAllocator { // needs to be called before FinishTensorAllocation method. This method also // allocates any internal Op data that is required from the flatbuffer. TfLiteStatus InitializeFromFlatbuffer( - const OpResolver& op_resolver, + const MicroOpResolver& op_resolver, NodeAndRegistration** node_and_registrations); // Runs through the model and allocates all necessary input, output and @@ -145,7 +145,7 @@ class MicroAllocator { // instance). Persistent data (e.g. operator data) is allocated from the // arena. 
TfLiteStatus PrepareNodeAndRegistrationDataFromFlatbuffer( - const OpResolver& op_resolver, + const MicroOpResolver& op_resolver, NodeAndRegistration* node_and_registrations); private: diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index 6b78966020e..7e2e56e417d 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -21,9 +21,10 @@ limitations under the License. #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/core/api/tensor_utils.h" #include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_op_resolver.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { @@ -71,7 +72,7 @@ void ContextHelper::ReportOpError(struct TfLiteContext* context, } // namespace internal MicroInterpreter::MicroInterpreter(const Model* model, - const OpResolver& op_resolver, + const MicroOpResolver& op_resolver, uint8_t* tensor_arena, size_t tensor_arena_size, ErrorReporter* error_reporter) diff --git a/tensorflow/lite/micro/micro_interpreter.h b/tensorflow/lite/micro/micro_interpreter.h index 180a557668e..a0b70527905 100644 --- a/tensorflow/lite/micro/micro_interpreter.h +++ b/tensorflow/lite/micro/micro_interpreter.h @@ -21,9 +21,9 @@ limitations under the License. #include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" #include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/type_to_tflitetype.h" @@ -72,7 +72,7 @@ class MicroInterpreter { // function. // The interpreter doesn't do any deallocation of any of the pointed-to // objects, ownership remains with the caller. - MicroInterpreter(const Model* model, const OpResolver& op_resolver, + MicroInterpreter(const Model* model, const MicroOpResolver& op_resolver, uint8_t* tensor_arena, size_t tensor_arena_size, ErrorReporter* error_reporter); @@ -160,7 +160,7 @@ class MicroInterpreter { NodeAndRegistration* node_and_registrations_ = nullptr; const Model* model_; - const OpResolver& op_resolver_; + const MicroOpResolver& op_resolver_; ErrorReporter* error_reporter_; TfLiteContext context_ = {}; MicroAllocator allocator_; diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc index 36e8c009b96..2358f763bc0 100644 --- a/tensorflow/lite/micro/micro_interpreter_test.cc +++ b/tensorflow/lite/micro/micro_interpreter_test.cc @@ -17,7 +17,9 @@ limitations under the License. 
#include +#include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/micro/micro_mutable_op_resolver.h" #include "tensorflow/lite/micro/micro_optional_debug_tools.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/micro/test_helpers.h" @@ -147,7 +149,7 @@ class MockCustom { } }; -class MockOpResolver : public OpResolver { +class MockOpResolver : public MicroOpResolver { public: const TfLiteRegistration* FindOp(BuiltinOperator op, int version) const override { @@ -162,6 +164,22 @@ class MockOpResolver : public OpResolver { return nullptr; } } + + MicroOpResolver::BuiltinParseFunction GetOpDataParser( + tflite::BuiltinOperator) const override { + // TODO(b/149408647): Figure out an alternative so that we do not have any + // references to ParseOpData in the micro code and the signature for + // MicroOpResolver::BuiltinParseFunction can be changed to be different from + // ParseOpData. + return ParseOpData; + } + + TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, + TfLiteRegistration* registration, + int version) override { + // This function is currently not used in the tests. + return kTfLiteError; + } }; } // namespace diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index 6c3e9a3331e..88ec1133c9f 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -1,4 +1,4 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,14 +19,11 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/op_resolver.h" +#include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/micro/compatibility.h" +#include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/schema/schema_generated.h" -#ifndef TFLITE_REGISTRATIONS_MAX -#define TFLITE_REGISTRATIONS_MAX (128) -#endif - namespace tflite { // Op versions discussed in this file are enumerated here: @@ -34,10 +31,10 @@ namespace tflite { inline int MicroOpResolverAnyVersion() { return 0; } -template -class MicroOpResolver : public OpResolver { +template +class MicroMutableOpResolver : public MicroOpResolver { public: - explicit MicroOpResolver(ErrorReporter* error_reporter = nullptr) + explicit MicroMutableOpResolver(ErrorReporter* error_reporter = nullptr) : error_reporter_(error_reporter) {} const TfLiteRegistration* FindOp(tflite::BuiltinOperator op, @@ -68,8 +65,15 @@ class MicroOpResolver : public OpResolver { return nullptr; } + MicroOpResolver::BuiltinParseFunction GetOpDataParser( + tflite::BuiltinOperator) const override { + // TODO(b/149408647): Replace with the more selective builtin parser. 
+ return ParseOpData; + } + TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, - TfLiteRegistration* registration, int version = 1) { + TfLiteRegistration* registration, + int version = 1) override { if (registrations_len_ >= tOpCount) { if (error_reporter_) { TF_LITE_REPORT_ERROR(error_reporter_, @@ -144,14 +148,6 @@ class MicroOpResolver : public OpResolver { TF_LITE_REMOVE_VIRTUAL_DELETE }; -// TODO(b/147854028): Consider switching all uses of MicroMutableOpResolver to -// MicroOpResolver. -class MicroMutableOpResolver - : public MicroOpResolver { - private: - TF_LITE_REMOVE_VIRTUAL_DELETE -}; - }; // namespace tflite #endif // TENSORFLOW_LITE_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_ diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc index 61ab0e3bec9..6b0c9974874 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc +++ b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/micro/micro_mutable_op_resolver.h" + +#include "tensorflow/lite/micro/micro_op_resolver.h" #include "tensorflow/lite/micro/testing/micro_test.h" namespace tflite { @@ -58,7 +60,7 @@ TF_LITE_MICRO_TESTS_BEGIN TF_LITE_MICRO_TEST(TestOperations) { using tflite::BuiltinOperator_CONV_2D; using tflite::BuiltinOperator_RELU; - using tflite::MicroOpResolver; + using tflite::MicroMutableOpResolver; using tflite::OpResolver; static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, @@ -66,7 +68,7 @@ TF_LITE_MICRO_TEST(TestOperations) { // We need space for 7 operators because of 2 ops, one with 3 versions, one // with 4 versions. - MicroOpResolver<7> micro_op_resolver; + MicroMutableOpResolver<7> micro_op_resolver; TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( BuiltinOperator_CONV_2D, &r, 1, 3)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, @@ -104,32 +106,30 @@ TF_LITE_MICRO_TEST(TestOperations) { TF_LITE_MICRO_TEST(TestOpRegistrationOverflow) { using tflite::BuiltinOperator_CONV_2D; using tflite::BuiltinOperator_RELU; - using tflite::MicroOpResolver; - using tflite::OpResolver; + using tflite::MicroMutableOpResolver; static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; - MicroOpResolver<4> micro_op_resolver; + MicroMutableOpResolver<4> micro_op_resolver; // Register 7 ops, but only 4 is expected because the class is created with // that limit.. 
TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( BuiltinOperator_CONV_2D, &r, 0, 2)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddCustom("mock_custom", &r, 0, 3)); - OpResolver* resolver = µ_op_resolver; TF_LITE_MICRO_EXPECT_EQ(4, micro_op_resolver.GetRegistrationLength()); } TF_LITE_MICRO_TEST(TestZeroVersionRegistration) { - using tflite::MicroOpResolver; + using tflite::MicroMutableOpResolver; using tflite::OpResolver; static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; - MicroOpResolver<1> micro_op_resolver; + MicroMutableOpResolver<1> micro_op_resolver; micro_op_resolver.AddCustom("mock_custom", &r, tflite::MicroOpResolverAnyVersion()); @@ -157,13 +157,13 @@ TF_LITE_MICRO_TEST(TestZeroVersionRegistration) { } TF_LITE_MICRO_TEST(TestZeroModelVersion) { - using tflite::MicroOpResolver; + using tflite::MicroMutableOpResolver; using tflite::OpResolver; static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; - MicroOpResolver<2> micro_op_resolver; + MicroMutableOpResolver<2> micro_op_resolver; micro_op_resolver.AddCustom("mock_custom", &r, 1, 2); TF_LITE_MICRO_EXPECT_EQ(2, micro_op_resolver.GetRegistrationLength()); OpResolver* resolver = µ_op_resolver; @@ -196,13 +196,13 @@ TF_LITE_MICRO_TEST(TestZeroModelVersion) { TF_LITE_MICRO_TEST(TestBuiltinRegistrationErrorReporting) { using tflite::BuiltinOperator_CONV_2D; using tflite::BuiltinOperator_RELU; - using tflite::MicroOpResolver; + using tflite::MicroMutableOpResolver; static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; tflite::MockErrorReporter mock_reporter; - MicroOpResolver<1> micro_op_resolver(&mock_reporter); + MicroMutableOpResolver<1> micro_op_resolver(&mock_reporter); TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r)); @@ -215,13 +215,13 @@ TF_LITE_MICRO_TEST(TestBuiltinRegistrationErrorReporting) { TF_LITE_MICRO_TEST(TestCustomRegistrationErrorReporting) { using tflite::BuiltinOperator_CONV_2D; using tflite::BuiltinOperator_RELU; - using tflite::MicroOpResolver; + using tflite::MicroMutableOpResolver; static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; tflite::MockErrorReporter mock_reporter; - MicroOpResolver<1> micro_op_resolver(&mock_reporter); + MicroMutableOpResolver<1> micro_op_resolver(&mock_reporter); TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddCustom("mock_custom_0", &r)); @@ -234,13 +234,13 @@ TF_LITE_MICRO_TEST(TestCustomRegistrationErrorReporting) { TF_LITE_MICRO_TEST(TestBuiltinVersionRegistrationErrorReporting) { using tflite::BuiltinOperator_CONV_2D; using tflite::BuiltinOperator_RELU; - using tflite::MicroOpResolver; + using tflite::MicroMutableOpResolver; static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; tflite::MockErrorReporter mock_reporter; - MicroOpResolver<2> micro_op_resolver(&mock_reporter); + MicroMutableOpResolver<2> micro_op_resolver(&mock_reporter); TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( BuiltinOperator_CONV_2D, &r, 1, 2)); @@ -253,13 +253,13 @@ TF_LITE_MICRO_TEST(TestBuiltinVersionRegistrationErrorReporting) { 
TF_LITE_MICRO_TEST(TestCustomVersionRegistrationErrorReporting) { using tflite::BuiltinOperator_CONV_2D; using tflite::BuiltinOperator_RELU; - using tflite::MicroOpResolver; + using tflite::MicroMutableOpResolver; static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; tflite::MockErrorReporter mock_reporter; - MicroOpResolver<2> micro_op_resolver(&mock_reporter); + MicroMutableOpResolver<2> micro_op_resolver(&mock_reporter); TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, micro_op_resolver.AddCustom("mock_custom_0", &r, 1, 2)); diff --git a/tensorflow/lite/micro/micro_op_resolver.h b/tensorflow/lite/micro/micro_op_resolver.h new file mode 100644 index 00000000000..64a3c85cc78 --- /dev/null +++ b/tensorflow/lite/micro/micro_op_resolver.h @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_MICRO_MICRO_OP_RESOLVER_H_ +#define TENSORFLOW_LITE_MICRO_MICRO_OP_RESOLVER_H_ + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/core/api/op_resolver.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { + +// This is an interface for the OpResolver for TFLiteMicro. The differences from +// the TFLite OpResolver base class are to allow for finer grained registration +// of the Builtin Ops to reduce code size for TFLiteMicro. We need an interface +// class instead of directly using MicroMutableOpResolver because +// MicroMutableOpResolver is a class template with the number of registered Ops +// as the template parameter. +class MicroOpResolver : public OpResolver { + public: + // TODO(b/149408647): The op_type parameter enables a gradual transfer to + // selective registration of the parse function. It should be removed once we + // no longer need to use ParseOpData (from flatbuffer_conversions.h) as part + // of the MicroMutableOpResolver. + typedef TfLiteStatus (*BuiltinParseFunction)(const Operator* op, + BuiltinOperator op_type, + ErrorReporter* error_reporter, + BuiltinDataAllocator* allocator, + void** builtin_data); + + // Returns the operator specific parsing function for the OpData for a + // BuiltinOperator (if registered), else nullptr. 
+ virtual BuiltinParseFunction GetOpDataParser( + tflite::BuiltinOperator op) const = 0; + + virtual TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, + TfLiteRegistration* registration, + int version) = 0; + + ~MicroOpResolver() override {} +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_MICRO_OP_RESOLVER_H_ From 865127af8a5a92039509c0bb3432546855cd7a0e Mon Sep 17 00:00:00 2001 From: Terry Heo Date: Thu, 28 May 2020 18:02:30 -0700 Subject: [PATCH 1358/1533] Add a way to build TFLite PIP package with Flex support usage) CUSTOM_BAZEL_FLAGS=--define=tflite_pip_with_flex=true \ tensorflow/lite/tools/pip_package/build_pip_package_with_bazel.sh PiperOrigin-RevId: 313695767 Change-Id: I92f90ac1fc120bf9a1196bbe64ccac4dbb9e55d8 --- .../lite/python/interpreter_wrapper/BUILD | 16 +++++++++--- tensorflow/lite/tools/pip_package/README.md | 24 +++++++++++++++++ .../build_pip_package_with_bazel.sh | 26 ++++++++++++++++--- tensorflow/tools/ci_build/Dockerfile.pi | 3 +++ .../tools/ci_build/Dockerfile.pi-python3 | 3 +++ .../tools/ci_build/Dockerfile.pi-python37 | 3 +++ 6 files changed, 68 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/python/interpreter_wrapper/BUILD b/tensorflow/lite/python/interpreter_wrapper/BUILD index c1778e7b12d..b3799be7af9 100644 --- a/tensorflow/lite/python/interpreter_wrapper/BUILD +++ b/tensorflow/lite/python/interpreter_wrapper/BUILD @@ -61,6 +61,13 @@ cc_library( ], ) +config_setting( + name = "tflite_pip_with_flex", + define_values = { + "tflite_pip_with_flex": "true", + }, +) + pybind_extension( name = "_pywrap_tensorflow_interpreter_wrapper", srcs = [ @@ -71,10 +78,13 @@ pybind_extension( module_name = "_pywrap_tensorflow_interpreter_wrapper", deps = [ ":interpreter_wrapper_lib", + "@pybind11", + "//third_party/python_runtime:headers", "//tensorflow/lite:framework_lib", "//tensorflow/lite/experimental/tflite_api_dispatcher", "//tensorflow/python:pybind11_lib", - "//third_party/python_runtime:headers", - "@pybind11", - ], + ] + select({ + ":tflite_pip_with_flex": ["//tensorflow/lite/delegates/flex:delegate"], + "//conditions:default": [], + }), ) diff --git a/tensorflow/lite/tools/pip_package/README.md b/tensorflow/lite/tools/pip_package/README.md index 8a2be59b980..9622ce0c7e1 100644 --- a/tensorflow/lite/tools/pip_package/README.md +++ b/tensorflow/lite/tools/pip_package/README.md @@ -93,6 +93,30 @@ CI_DOCKER_EXTRA_PARAMS="-e CI_BUILD_PYTHON=python3 -e CROSSTOOL_PYTHON_INCLUDE_P tensorflow/lite/tools/pip_package/build_pip_package_with_bazel.sh aarch64 ``` +## Enable TF OP support (Flex delegate) + +If you want to use TF ops with Python API, you need to enable flex support. +You can build TFLite interpreter with flex ops support by providing +"--define=tflite_pip_with_flex=true" to Bazel. + +Here are some examples. + +### Native build with Flex for your workstation + +```sh +CUSTOM_BAZEL_FLAGS=--define=tflite_pip_with_flex=true \ + tensorflow/lite/tools/pip_package/build_pip_package_with_bazel.sh +``` + +### Cross build with Flex for armhf Python 3.5 + +```sh +CI_DOCKER_EXTRA_PARAMS="-e CUSTOM_BAZEL_FLAGS=--define=tflite_pip_with_flex=true \ + -e CI_BUILD_PYTHON=python3 -e CROSSTOOL_PYTHON_INCLUDE_PATH=/usr/include/python3.5" \ + tensorflow/tools/ci_build/ci_build.sh PI-PYTHON3 \ + tensorflow/lite/tools/pip_package/build_pip_package_with_bazel.sh armhf +``` + ## Usage Note, unlike tensorflow this will be installed to a tflite_runtime namespace. 
diff --git a/tensorflow/lite/tools/pip_package/build_pip_package_with_bazel.sh b/tensorflow/lite/tools/pip_package/build_pip_package_with_bazel.sh index 69afb2f6b80..4976624e340 100755 --- a/tensorflow/lite/tools/pip_package/build_pip_package_with_bazel.sh +++ b/tensorflow/lite/tools/pip_package/build_pip_package_with_bazel.sh @@ -25,6 +25,21 @@ export PACKAGE_VERSION="${TENSORFLOW_VERSION}${VERSION_SUFFIX}" BUILD_DIR="${SCRIPT_DIR}/gen/tflite_pip/${PYTHON}" TENSORFLOW_TARGET=$1 +# Fix container image for cross build. +if [ ! -z "${CI_BUILD_HOME}" ] && [ `pwd` = "/workspace" ]; then + # Fix for curl build problem in 32-bit, see https://stackoverflow.com/questions/35181744/size-of-array-curl-rule-01-is-negative + if [ "${TENSORFLOW_TARGET}" = "armhf" ]; then + sudo sed -i 's/define CURL_SIZEOF_LONG 8/define CURL_SIZEOF_LONG 4/g' /usr/include/curl/curlbuild.h + sudo sed -i 's/define CURL_SIZEOF_CURL_OFF_T 8/define CURL_SIZEOF_CURL_OFF_T 4/g' /usr/include/curl/curlbuild.h + fi + + # The system-installed OpenSSL headers get pulled in by the latest BoringSSL + # release on this configuration, so move them before we build: + if [ -d /usr/include/openssl ]; then + sudo mv /usr/include/openssl /usr/include/openssl.original + fi +fi + # Build source tree. rm -rf "${BUILD_DIR}" && mkdir -p "${BUILD_DIR}/tflite_runtime" cp -r "${TENSORFLOW_LITE_DIR}/tools/pip_package/debian" \ @@ -40,14 +55,16 @@ echo "__git_version__ = '$(git -C "${TENSORFLOW_DIR}" describe)'" >> "${BUILD_DI # Build python interpreter_wrapper. cd "${BUILD_DIR}" case "${TENSORFLOW_TARGET}" in - rpi|armhf) + armhf) BAZEL_FLAGS="--config=elinux_armhf --copt=-march=armv7-a --copt=-mfpu=neon-vfpv4 --copt=-O3 --copt=-fno-tree-pre --copt=-fpermissive + --define tensorflow_mkldnn_contraction_kernel=0 --define=raspberry_pi_with_neon=true" ;; aarch64) BAZEL_FLAGS="--config=elinux_aarch64 + --define tensorflow_mkldnn_contraction_kernel=0 --copt=-O3" ;; *) @@ -58,14 +75,15 @@ esac # include path for Python 3.x builds to work. export CROSSTOOL_PYTHON_INCLUDE_PATH -bazel build -c opt -s --config=monolithic ${BAZEL_FLAGS} //tensorflow/lite/python/interpreter_wrapper:_pywrap_tensorflow_interpreter_wrapper +bazel build -c opt -s --config=monolithic --config=noaws --config=nogcp --config=nohdfs --config=nonccl \ + ${BAZEL_FLAGS} ${CUSTOM_BAZEL_FLAGS} //tensorflow/lite/python/interpreter_wrapper:_pywrap_tensorflow_interpreter_wrapper cp "${TENSORFLOW_DIR}/bazel-bin/tensorflow/lite/python/interpreter_wrapper/_pywrap_tensorflow_interpreter_wrapper.so" \ "${BUILD_DIR}/tflite_runtime" # Build python wheel. cd "${BUILD_DIR}" case "${TENSORFLOW_TARGET}" in - rpi|armhf) + armhf) ${PYTHON} setup_with_bazel.py bdist --plat-name=linux-armv7l \ bdist_wheel --plat-name=linux-armv7l ;; @@ -111,7 +129,7 @@ EOF fi case "${TENSORFLOW_TARGET}" in - rpi|armhf) + armhf) dpkg-buildpackage -b -rfakeroot -us -uc -tc -d -a armhf ;; aarch64) diff --git a/tensorflow/tools/ci_build/Dockerfile.pi b/tensorflow/tools/ci_build/Dockerfile.pi index f772880cec1..b8b4753a1e6 100644 --- a/tensorflow/tools/ci_build/Dockerfile.pi +++ b/tensorflow/tools/ci_build/Dockerfile.pi @@ -22,3 +22,6 @@ RUN /install/install_pi_toolchain.sh # Set up the master bazelrc configuration file. 
COPY install/.bazelrc /etc/bazel.bazelrc + +# XLA is not needed for PI +ENV TF_ENABLE_XLA=0 diff --git a/tensorflow/tools/ci_build/Dockerfile.pi-python3 b/tensorflow/tools/ci_build/Dockerfile.pi-python3 index 14ebb9069f9..bcc5d13f9d5 100644 --- a/tensorflow/tools/ci_build/Dockerfile.pi-python3 +++ b/tensorflow/tools/ci_build/Dockerfile.pi-python3 @@ -21,3 +21,6 @@ RUN /install/install_pi_python3_toolchain.sh # Set up the master bazelrc configuration file. COPY install/.bazelrc /etc/bazel.bazelrc + +# XLA is not needed for PI +ENV TF_ENABLE_XLA=0 diff --git a/tensorflow/tools/ci_build/Dockerfile.pi-python37 b/tensorflow/tools/ci_build/Dockerfile.pi-python37 index 8e231102b79..2c1cd2f8942 100644 --- a/tensorflow/tools/ci_build/Dockerfile.pi-python37 +++ b/tensorflow/tools/ci_build/Dockerfile.pi-python37 @@ -21,3 +21,6 @@ RUN /install/install_pi_python37_toolchain.sh # Set up the master bazelrc configuration file. COPY install/.bazelrc /etc/bazel.bazelrc + +# XLA is not needed for PI +ENV TF_ENABLE_XLA=0 From 988f124ddb84e483ad0e4fff87df26018a73dceb Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Thu, 28 May 2020 18:06:00 -0700 Subject: [PATCH 1359/1533] Internal test change for Hexagon PiperOrigin-RevId: 313696314 Change-Id: Ie52c3bea91156e135524bc376d537610bd046461 --- .../hexagon/builders/tests/activations_test.cc | 2 ++ .../hexagon/builders/tests/concat_test.cc | 2 ++ .../delegates/hexagon/builders/tests/tests.bzl | 18 +++++++++++++++++- tensorflow/lite/special_rules.bzl | 4 ++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/activations_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/activations_test.cc index eed70619acf..45698da7a17 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/activations_test.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/activations_test.cc @@ -23,6 +23,7 @@ limitations under the License. namespace tflite { using testing::ElementsAreArray; +namespace { void GenerateUniformRandomVector(int size, float min, float max, std::minstd_rand* random_engine, std::vector* result) { @@ -44,6 +45,7 @@ void GenerateUniformRandomVector(int size, float min, float max, (*result)[i] = min + (max - min) * random_value_scaled_0_1; } } +} // namespace class ActivationOpModel : public SingleOpModelWithHexagon { public: diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/concat_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/concat_test.cc index 335586d7b13..be036323e10 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/concat_test.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/concat_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
namespace tflite { using testing::ElementsAreArray; +namespace { void GenerateUniformRandomVector(int size, float min, float max, std::minstd_rand* random_engine, std::vector* result) { @@ -41,6 +42,7 @@ void GenerateUniformRandomVector(int size, float min, float max, (*result)[i] = min + (max - min) * random_value_scaled_0_1; } } +} // namespace class QuantizedConcatenationOpModel : public SingleOpModelWithHexagon { public: diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/tests.bzl b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/tests.bzl index 79cab14128f..57ce69f5e2c 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/tests.bzl +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/tests.bzl @@ -1,9 +1,11 @@ """Rules for generating unit-tests using hexagon delegates.""" +load("//tensorflow/lite:special_rules.bzl", "tflite_hexagon_mobile_test") + def hexagon_op_tests( srcs = [], deps = []): - """Create separate unit test targets for each test file in 'srcs'. + """Create both monolithic and individual unit test targets for each test file in 'srcs'. Args: srcs: list of test files, separate target will be created for each item in the list. @@ -23,3 +25,17 @@ def hexagon_op_tests( "notap", ], ) + + all_ops_test_name = "hexagon_op_tests_all" + native.cc_test( + name = all_ops_test_name, + srcs = srcs, + deps = deps, + linkstatic = 1, + tags = [ + "no_oss", + "nobuilder", + "notap", + ], + ) + tflite_hexagon_mobile_test(all_ops_test_name) diff --git a/tensorflow/lite/special_rules.bzl b/tensorflow/lite/special_rules.bzl index c1373d3a5c2..5053eb2a16b 100644 --- a/tensorflow/lite/special_rules.bzl +++ b/tensorflow/lite/special_rules.bzl @@ -47,3 +47,7 @@ def if_nnapi(supported, not_supported = [], supported_android = None): clean_dep("//tensorflow:android"): supported_android, "//conditions:default": supported, }) + +def tflite_hexagon_mobile_test(name): + """This is a no-op outside of Google.""" + pass From 3ed7b0732e811827b5152b88f0c0cfec2c6894c4 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 28 May 2020 18:16:19 -0700 Subject: [PATCH 1360/1533] Re-enable disabled XNNPACK delegate tests with tsan PiperOrigin-RevId: 313697493 Change-Id: I2d4785f0d283e28e4655bbeea08db580671490d9 --- tensorflow/lite/delegates/xnnpack/BUILD | 3 --- tensorflow/workspace.bzl | 8 ++++---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD index 00c0709ccc3..880145e51eb 100644 --- a/tensorflow/lite/delegates/xnnpack/BUILD +++ b/tensorflow/lite/delegates/xnnpack/BUILD @@ -291,9 +291,6 @@ cc_test( "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS, "//conditions:default": [], }), - tags = [ - "notsan", # TODO(b/155404603) - ], deps = [ ":pool_2d_tester", ":test_main", diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index db87f9a730d..bf6440504f0 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "05904bb15b7a5abadc261c16e6be3ac2314d6d4384aa16349b7354d9fa8bbb4f", - strip_prefix = "XNNPACK-1e5f80293b3c0197aaf44f3adb9329401fd36ed4", + sha256 = "dfa6181e238f0ca88a641952678cd7f3e38da541d8b731ce3fea1d0eeffb6101", + strip_prefix = "XNNPACK-b2217ddb5fa74db09d9da1326902269ae18e41ad", urls = [ - 
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/1e5f80293b3c0197aaf44f3adb9329401fd36ed4.zip", - "https://github.com/google/XNNPACK/archive/1e5f80293b3c0197aaf44f3adb9329401fd36ed4.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/b2217ddb5fa74db09d9da1326902269ae18e41ad.zip", + "https://github.com/google/XNNPACK/archive/b2217ddb5fa74db09d9da1326902269ae18e41ad.zip", ], ) From 69c0447f011bd5077abcd6078502c64e701d97a8 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Thu, 28 May 2020 18:26:21 -0700 Subject: [PATCH 1361/1533] ParallelDevice: Sync executors when returning non-parallel TensorHandles, add remote tests The actual delta isn't huge; I'm moving some test utilities to a testlib since the remote tests need them. The remote tests are in a different target because they need to disable global heap checking, which I'd like to keep on for the rest of the tests. PiperOrigin-RevId: 313698670 Change-Id: I846294a748e3b007eba0472901b0e58358b8edd5 --- tensorflow/c/eager/parallel_device/BUILD | 45 +- .../eager/parallel_device/parallel_device.cc | 5 + .../parallel_device_remote_test.cc | 147 +++++++ .../parallel_device/parallel_device_test.cc | 385 +----------------- .../parallel_device_testlib.cc | 308 ++++++++++++++ .../parallel_device/parallel_device_testlib.h | 174 ++++++++ 6 files changed, 677 insertions(+), 387 deletions(-) create mode 100644 tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc create mode 100644 tensorflow/c/eager/parallel_device/parallel_device_testlib.cc create mode 100644 tensorflow/c/eager/parallel_device/parallel_device_testlib.h diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index 3b2640e14d1..6fce918aab1 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -39,9 +39,11 @@ cc_library( ], ) -tf_cc_test( - name = "parallel_device_test", - srcs = ["parallel_device_test.cc"], +cc_library( + name = "parallel_device_testlib", + testonly = 1, + srcs = ["parallel_device_testlib.cc"], + hdrs = ["parallel_device_testlib.h"], deps = [ ":parallel_device", ":parallel_device_ops", @@ -49,12 +51,49 @@ tf_cc_test( "//tensorflow/c:c_api_experimental", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "parallel_device_test", + srcs = ["parallel_device_test.cc"], + deps = [ + ":parallel_device", + ":parallel_device_ops", + ":parallel_device_testlib", + "//tensorflow/c:c_api", + "//tensorflow/c:c_api_experimental", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) +tf_cc_test( + name = "parallel_device_remote_test", + srcs = ["parallel_device_remote_test.cc"], + # TODO(b/136478427): Enable global heap checking when servers shut down + # cleanly. 
+ args = ["--heap_check=local"], + deps = [ + ":parallel_device", + ":parallel_device_ops", + ":parallel_device_testlib", + "//tensorflow/c:c_api", + "//tensorflow/c:c_api_experimental", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + ], +) + # Note: ParallelDevice-specific ops are experimental and not currently linked in # to TensorFlow by default, just used in a few tests. filegroup( diff --git a/tensorflow/c/eager/parallel_device/parallel_device.cc b/tensorflow/c/eager/parallel_device/parallel_device.cc index 27c2699c4c2..75d188d0c45 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device.cc @@ -319,6 +319,11 @@ absl::optional> ParallelDevice::Execute( std::vector outputs; outputs.reserve(t->num_tensors()); for (int i = 0; i < t->num_tensors(); ++i) { + // TODO(b/157523095): Syncing the executor here shouldn't be + // necessary. Currently async+remote is missing cross-executor + // coordination. + TFE_ExecutorWaitForAllPendingNodes(executors_[i].get(), status); + if (TF_GetCode(status) != TF_OK) return result; TensorHandlePtr this_output( TFE_TensorHandleCopySharingTensor(t->tensor(i), status)); outputs.emplace_back(std::move(this_output)); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc new file mode 100644 index 00000000000..32a4b440d25 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_remote_test.cc @@ -0,0 +1,147 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/parallel_device/parallel_device.h" +#include "tensorflow/c/eager/parallel_device/parallel_device_testlib.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/platform/test.h" + +tensorflow::ServerDef GetServerDef(const std::string& job_name, int num_tasks) { + tensorflow::ServerDef server_def; + server_def.set_protocol("grpc"); + server_def.set_job_name(job_name); + server_def.set_task_index(0); + tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); + tensorflow::JobDef* job_def = cluster_def->add_job(); + job_def->set_name(job_name); + for (int i = 0; i < num_tasks; i++) { + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->insert( + {i, tensorflow::strings::StrCat("localhost", ":", port)}); + } + return server_def; +} + +TEST(PARALLEL_DEVICE, TestRemoteBasic) { + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + tensorflow::ServerDef server_def = GetServerDef("worker", 3); + + // This server def has the task index set to 0. + std::string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TFE_ContextSetServerDef(context.get(), 0, serialized.data(), + serialized.size(), status.get()); + EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + BasicTestsForTwoDevices(context.get(), + "/job:worker/replica:0/task:1/device:CPU:0", + "/job:worker/replica:0/task:2/device:CPU:0"); + + worker_server1.release(); + worker_server2.release(); +} + +TEST(PARALLEL_DEVICE, TestAsyncCopyOff) { + std::unique_ptr opts( + TFE_NewContextOptions(), TFE_DeleteContextOptions); + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr context( + TFE_NewContext(opts.get(), status.get()), TFE_DeleteContext); + tensorflow::ServerDef server_def = GetServerDef("worker", 3); + + // This server def has the task index set to 0. 
+ std::string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TFE_ContextSetServerDef(context.get(), 0, serialized.data(), + serialized.size(), status.get()); + EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + const char* first_device = "/job:worker/replica:0/task:1/device:CPU:0"; + const char* second_device = "/job:worker/replica:0/task:2/device:CPU:0"; + const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; + std::array underlying_devices{first_device, second_device}; + RegisterParallelDevice(context.get(), device_name, underlying_devices, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + TensorHandlePtr value_one(FloatTensorHandle(3., status.get())); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TensorHandlePtr value_two(FloatTensorHandle(-2., status.get())); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + std::array in_components{value_one.get(), + value_two.get()}; + TensorHandlePtr combined_value = CreatePerDeviceValues( + context.get(), in_components, device_name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + // Loop to make synchronization failures more deterministic + for (int i = 0; i < 100; ++i) { + TensorHandlePtr multiply_result( + Multiply(context.get(), combined_value.get(), combined_value.get(), + status.get())); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + std::array out_components; + ExtractPerDeviceValues(context.get(), multiply_result.get(), + &out_components, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + ExpectScalarEq(out_components[0].get(), 9.); + ExpectScalarEq(out_components[1].get(), 4.); + } + + worker_server1.release(); + worker_server2.release(); +} diff --git a/tensorflow/c/eager/parallel_device/parallel_device_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_test.cc index fdc140407df..d9784ac9fa6 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_test.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/c_api_experimental.h" #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/parallel_device/parallel_device_testlib.h" #include "tensorflow/core/platform/test.h" // NOTE(allenl): These tests currently go through TFE_Execute and so are @@ -28,390 +29,6 @@ limitations under the License. // correspond fairly well to the implementation, but testing the C++ directly is // another option. -// Functor for making unique_ptr to TFE_TensorHandle slightly more -// ergonomic. Using decltype(TFE_DeleteTensorHandle) in the unique_ptr's second -// template argument requires passing a function pointer to -// TFE_DeleteTensorHandle when constructing the unique_ptr. 
-class TensorHandleDeleter { - public: - void operator()(TFE_TensorHandle* to_delete) { - TFE_DeleteTensorHandle(to_delete); - } -}; - -using TensorHandlePtr = std::unique_ptr; - -// A helper for performing common operations on variables. A much more -// restricted stand-in for tf.Variable in Python. -class Variable { - public: - // Construct a Variable from a resource-dtype TFE_TensorHandle and an - // indication of the dtype of the variable's value. - // - // Note that creating this resource-dtype handle can fail, so `Create` is a - // separate static method which returns a status. - Variable(TFE_TensorHandle* handle, TF_DataType type) - : handle_(handle), type_(type) {} - - // Helper for constructing a resource handle and wrapping it in a `Variable` - // object. - static Variable* Create(TFE_Context* context, TF_DataType type, - const int64_t* dims, const int num_dims, - const char* device, TF_Status* status); - // Dereferences the backing buffer for the variable. Note that since this can - // fail (it runs operations), it must be called explicitly and the resulting - // `status` checked. - void Destroy(TFE_Context* context, TF_Status* status); - - // Reads from the variable. - TensorHandlePtr Read(TFE_Context* context, TF_Status* status); - // Assigns a new value to the variable. - void Assign(TFE_Context* context, TFE_TensorHandle* value, TF_Status* status); - // Adds `value` to the existing value of the variable. - void AssignAdd(TFE_Context* context, TFE_TensorHandle* value, - TF_Status* status); - - private: - // Helper for running any single-argument assignment ops (Assign, AssignAdd, - // AssignSub, ...). - void GeneralAssignment(const char* op_name, TFE_Context* context, - TFE_TensorHandle* value, TF_Status* status); - - // The a handle for the resource-dtype tensor pointing to the variable's - // buffer. - TFE_TensorHandle* handle_; - // The dtype of the variable's buffer (input dtype for assignments, output - // dtype of read operations). - TF_DataType type_; -}; - -Variable* Variable::Create(TFE_Context* context, TF_DataType type, - const int64_t* dims, const int num_dims, - const char* device, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "VarHandleOp", status), TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op.get(), "dtype", type); - TFE_OpSetAttrShape(op.get(), "shape", dims, num_dims, status); - TFE_OpSetAttrString(op.get(), "container", "", 0); - // Use the special GUID for no buffer sharing - // - // TODO(allenl): Should we provide a better API for this? AFAIK this is the - // only reasonable way to make variables with no aliasing using the eager C - // API. - std::string no_sharing = "cd2c89b7-88b7-44c8-ad83-06c2a9158347"; - TFE_OpSetAttrString(op.get(), "shared_name", no_sharing.c_str(), - no_sharing.length()); - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_TensorHandle* var_handle = nullptr; - int num_retvals = 1; - TFE_Execute(op.get(), &var_handle, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - return new Variable(var_handle, type); -} - -void Variable::Destroy(TFE_Context* context, TF_Status* status) { - // Free the backing buffer for the variable. 
- std::unique_ptr op( - TFE_NewOp(context, "DestroyResourceOp", status), &TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpAddInput(op.get(), handle_, status); - if (TF_GetCode(status) != TF_OK) return; - const char* device = TFE_TensorHandleDeviceName(handle_, status); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return; - int num_retvals = 0; - TFE_Execute(op.get(), nullptr, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return; - // Delete the variable handle itself. - TFE_DeleteTensorHandle(handle_); -} - -TensorHandlePtr Variable::Read(TFE_Context* context, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "ReadVariableOp", status), &TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpAddInput(op.get(), handle_, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - const char* device = TFE_TensorHandleDeviceName(handle_, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrType(op.get(), "dtype", type_); - int num_retvals = 1; - TFE_TensorHandle* var_value = nullptr; - TFE_Execute(op.get(), &var_value, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - return TensorHandlePtr(var_value); -} - -void Variable::GeneralAssignment(const char* op_name, TFE_Context* context, - TFE_TensorHandle* value, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, op_name, status), &TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetAttrType(op.get(), "dtype", type_); - TFE_OpAddInput(op.get(), handle_, status); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpAddInput(op.get(), value, status); - if (TF_GetCode(status) != TF_OK) return; - const char* device = TFE_TensorHandleDeviceName(handle_, status); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetDevice(op.get(), device, status); - - int num_retvals = 0; - TFE_Execute(op.get(), nullptr, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return; -} - -void Variable::AssignAdd(TFE_Context* context, TFE_TensorHandle* value, - TF_Status* status) { - GeneralAssignment("AssignAddVariableOp", context, value, status); -} - -void Variable::Assign(TFE_Context* context, TFE_TensorHandle* value, - TF_Status* status) { - GeneralAssignment("AssignVariableOp", context, value, status); -} - -// Passed to `TF_NewTensor` to indicate how an array of floats should be -// deleted. -static void FloatDeallocator(void* data, size_t, void* arg) { - delete[] static_cast(data); -} - -// Creates a TFE_TensorHandle with value `v`. -TensorHandlePtr FloatTensorHandle(float v, TF_Status* status) { - const int num_bytes = sizeof(float); - float* values = new float[1]; - values[0] = v; - std::unique_ptr tensor( - TF_NewTensor(TF_FLOAT, nullptr, 0, values, num_bytes, &FloatDeallocator, - nullptr), - TF_DeleteTensor); - return TensorHandlePtr(TFE_NewTensorHandle(tensor.get(), status)); -} - -// Creates a rank-one TFE_TensorHandle with value `v`. 
-TensorHandlePtr VectorFloatTensorHandle(const std::vector& v, - TF_Status* status) { - const int num_bytes = v.size() * sizeof(float); - float* values = new float[v.size()]; - memcpy(values, v.data(), num_bytes); - int64_t dims = v.size(); - std::unique_ptr tensor( - TF_NewTensor(TF_FLOAT, &dims, 1 /* num_dims */, values, num_bytes, - &FloatDeallocator, nullptr), - TF_DeleteTensor); - return TensorHandlePtr(TFE_NewTensorHandle(tensor.get(), status)); -} - -// Helper to un-pack `num_replicas` TFE_TensorHandles from one parallel handle. -template -void ExtractPerDeviceValues( - TFE_Context* context, TFE_TensorHandle* input, - std::array* components, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "TPUReplicatedOutput", status), TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetAttrInt(op.get(), "num_replicas", num_replicas); - TFE_OpAddInput(op.get(), input, status); - if (TF_GetCode(status) != TF_OK) return; - const char* device = TFE_TensorHandleDeviceName(input, status); - if (TF_GetCode(status) != TF_OK) return; - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return; - - TFE_TensorHandle* result_handles[num_replicas]; - int num_retvals = num_replicas; - TFE_Execute(op.get(), result_handles, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return; - for (int i = 0; i < num_replicas; ++i) { - (*components)[i].reset(result_handles[i]); - } -} - -// Helper to pack `num_replicas` TFE_TensorHandles into one parallel handle. -template -TensorHandlePtr CreatePerDeviceValues( - TFE_Context* context, - const std::array& components, - const char* device, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "TPUReplicatedInput", status), TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetAttrInt(op.get(), "N", num_replicas); - for (int i = 0; i < num_replicas; ++i) { - TFE_OpAddInput(op.get(), components[i], status); - if (TF_GetCode(status) != TF_OK) return nullptr; - } - TFE_OpSetDevice(op.get(), device, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - - TFE_TensorHandle* result_handle; - int num_retvals = 1; - TFE_Execute(op.get(), &result_handle, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - return TensorHandlePtr(result_handle); -} - -TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first, - TFE_TensorHandle* second, TF_Status* status) { - std::unique_ptr op( - TFE_NewOp(context, "Mul", status), TFE_DeleteOp); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpAddInput(op.get(), first, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpAddInput(op.get(), second, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - const char* first_device = TFE_TensorHandleDeviceName(first, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - TFE_OpSetDevice(op.get(), first_device, status); - - TFE_TensorHandle* result_handle; - int num_retvals = 1; - TFE_Execute(op.get(), &result_handle, &num_retvals, status); - if (TF_GetCode(status) != TF_OK) return nullptr; - return TensorHandlePtr(result_handle); -} - -// Assert that `handle` is equal to `expected_value`. 
-template -void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - std::unique_ptr value_zero( - TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - EXPECT_EQ(expected_value, - *static_cast(TF_TensorData(value_zero.get()))); -} - -template -void RegisterParallelDevice( - TFE_Context* context, const char* device_name, - const std::array& underlying_devices, - TF_Status* status) { - TFE_CustomDevice device; - void* device_info; - tensorflow::eager::AllocateParallelDevice( - device_name, underlying_devices.data(), underlying_devices.size(), - &device, &device_info); - TFE_RegisterCustomDevice(context, device, device_name, device_info, status); -} - -// Create and modify a variable placed on a parallel device which composes -// `first_device` and `second_device`. -void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, - const char* second_device) { - // Register the custom device - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; - std::array underlying_devices{first_device, second_device}; - RegisterParallelDevice(context, device_name, underlying_devices, - status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - // Create a variable handle (uninitialized to start) placed on the parallel - // device. - std::function variable_deleter = [&](Variable* to_delete) { - to_delete->Destroy(context, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - delete to_delete; - }; - std::unique_ptr variable( - Variable::Create(context, TF_FLOAT, /* Scalar */ {}, 0, device_name, - status.get()), - variable_deleter); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - // Assign an initial value to the variable, implicitly mirroring it to each - // component device. - { - TensorHandlePtr initial_value = FloatTensorHandle(20., status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - variable->Assign(context, initial_value.get(), status.get()); - } - - // Read from the variable and verify that we have a parallel tensor. - { - TensorHandlePtr read = variable->Read(context, status.get()); - std::array components; - ExtractPerDeviceValues(context, read.get(), &components, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - ExpectScalarEq(components[0].get(), 20.); - ExpectScalarEq(components[1].get(), 20.); - - std::string first_device = - TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); - ASSERT_EQ(underlying_devices[0], first_device); - std::string second_device = - TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); - ASSERT_EQ(underlying_devices[1], second_device); - } - - // Add a parallel tensor with different values on each device to the variable. 
- { - TensorHandlePtr value_one(FloatTensorHandle(3., status.get())); - TensorHandlePtr value_two(FloatTensorHandle(-2., status.get())); - std::array components{value_one.get(), - value_two.get()}; - TensorHandlePtr combined_value = - CreatePerDeviceValues(context, components, device_name, status.get()); - variable->AssignAdd(context, combined_value.get(), status.get()); - } - - // Read the variable and verify that each component has the right modified - // value. - { - TensorHandlePtr read = variable->Read(context, status.get()); - std::array components; - ExtractPerDeviceValues(context, read.get(), &components, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - ExpectScalarEq(components[0].get(), 23.); - ExpectScalarEq(components[1].get(), 18.); - - std::string first_device = - TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); - ASSERT_EQ(underlying_devices[0], first_device); - std::string second_device = - TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); - ASSERT_EQ(underlying_devices[1], second_device); - } - // Compute the device ID twice and verify the result - for (int i = 0; i < 2; ++i) { - std::unique_ptr op( - TFE_NewOp(context, "DeviceID", status.get()), TFE_DeleteOp); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - TFE_OpSetDevice(op.get(), device_name, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - TFE_TensorHandle* result_handle; - int num_retvals = 1; - TFE_Execute(op.get(), &result_handle, &num_retvals, status.get()); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - std::array components; - ExtractPerDeviceValues(context, result_handle, &components, status.get()); - TFE_DeleteTensorHandle(result_handle); - ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - - ExpectScalarEq(components[0].get(), 0); - ExpectScalarEq(components[1].get(), 1); - std::string first_device = - TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); - ASSERT_EQ(underlying_devices[0], first_device); - std::string second_device = - TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); - ASSERT_EQ(underlying_devices[1], second_device); - } -} - TEST(PARALLEL_DEVICE, TestBasicCPU) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc new file mode 100644 index 00000000000..fba47865c36 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_testlib.cc @@ -0,0 +1,308 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/eager/parallel_device/parallel_device_testlib.h" + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/core/platform/test.h" + +// NOTE(allenl): These tests currently go through TFE_Execute and so are +// integration testing rather than purely testing the parallel device. They +// correspond fairly well to the implementation, but testing the C++ directly is +// another option. + + +Variable* Variable::Create(TFE_Context* context, TF_DataType type, + const int64_t* dims, const int num_dims, + const char* device, TF_Status* status) { + std::unique_ptr op( + TFE_NewOp(context, "VarHandleOp", status), TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op.get(), "dtype", type); + TFE_OpSetAttrShape(op.get(), "shape", dims, num_dims, status); + TFE_OpSetAttrString(op.get(), "container", "", 0); + // Use the special GUID for no buffer sharing + // + // TODO(allenl): Should we provide a better API for this? AFAIK this is the + // only reasonable way to make variables with no aliasing using the eager C + // API. + std::string no_sharing = "cd2c89b7-88b7-44c8-ad83-06c2a9158347"; + TFE_OpSetAttrString(op.get(), "shared_name", no_sharing.c_str(), + no_sharing.length()); + TFE_OpSetDevice(op.get(), device, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_TensorHandle* var_handle = nullptr; + int num_retvals = 1; + TFE_Execute(op.get(), &var_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + return new Variable(var_handle, type); +} + +void Variable::Destroy(TFE_Context* context, TF_Status* status) { + // Free the backing buffer for the variable. + std::unique_ptr op( + TFE_NewOp(context, "DestroyResourceOp", status), &TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpAddInput(op.get(), handle_, status); + if (TF_GetCode(status) != TF_OK) return; + const char* device = TFE_TensorHandleDeviceName(handle_, status); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpSetDevice(op.get(), device, status); + if (TF_GetCode(status) != TF_OK) return; + int num_retvals = 0; + TFE_Execute(op.get(), nullptr, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return; + // Delete the variable handle itself. 
+ TFE_DeleteTensorHandle(handle_); +} + +TensorHandlePtr Variable::Read(TFE_Context* context, TF_Status* status) { + std::unique_ptr op( + TFE_NewOp(context, "ReadVariableOp", status), &TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpAddInput(op.get(), handle_, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + const char* device = TFE_TensorHandleDeviceName(handle_, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetDevice(op.get(), device, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(op.get(), "dtype", type_); + int num_retvals = 1; + TFE_TensorHandle* var_value = nullptr; + TFE_Execute(op.get(), &var_value, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + return TensorHandlePtr(var_value); +} + +void Variable::GeneralAssignment(const char* op_name, TFE_Context* context, + TFE_TensorHandle* value, TF_Status* status) { + std::unique_ptr op( + TFE_NewOp(context, op_name, status), &TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpSetAttrType(op.get(), "dtype", type_); + TFE_OpAddInput(op.get(), handle_, status); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpAddInput(op.get(), value, status); + if (TF_GetCode(status) != TF_OK) return; + const char* device = TFE_TensorHandleDeviceName(handle_, status); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpSetDevice(op.get(), device, status); + + int num_retvals = 0; + TFE_Execute(op.get(), nullptr, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return; +} + +void Variable::AssignAdd(TFE_Context* context, TFE_TensorHandle* value, + TF_Status* status) { + GeneralAssignment("AssignAddVariableOp", context, value, status); +} + +void Variable::Assign(TFE_Context* context, TFE_TensorHandle* value, + TF_Status* status) { + GeneralAssignment("AssignVariableOp", context, value, status); +} + +// Passed to `TF_NewTensor` to indicate how an array of floats should be +// deleted. +static void FloatDeallocator(void* data, size_t, void* arg) { + delete[] static_cast(data); +} + +// Creates a TFE_TensorHandle with value `v`. +TensorHandlePtr FloatTensorHandle(float v, TF_Status* status) { + const int num_bytes = sizeof(float); + float* values = new float[1]; + values[0] = v; + std::unique_ptr tensor( + TF_NewTensor(TF_FLOAT, nullptr, 0, values, num_bytes, &FloatDeallocator, + nullptr), + TF_DeleteTensor); + return TensorHandlePtr(TFE_NewTensorHandle(tensor.get(), status)); +} + +// Creates a rank-one TFE_TensorHandle with value `v`. +TensorHandlePtr VectorFloatTensorHandle(const std::vector& v, + TF_Status* status) { + const int num_bytes = v.size() * sizeof(float); + float* values = new float[v.size()]; + memcpy(values, v.data(), num_bytes); + int64_t dims = v.size(); + std::unique_ptr tensor( + TF_NewTensor(TF_FLOAT, &dims, 1 /* num_dims */, values, num_bytes, + &FloatDeallocator, nullptr), + TF_DeleteTensor); + return TensorHandlePtr(TFE_NewTensorHandle(tensor.get(), status)); +} + +// Helper to un-pack `num_replicas` TFE_TensorHandles from one parallel handle. 
+template +void ExtractPerDeviceValues( + TFE_Context* context, TFE_TensorHandle* input, + std::array* components, TF_Status* status) { + std::unique_ptr op( + TFE_NewOp(context, "TPUReplicatedOutput", status), TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpSetAttrInt(op.get(), "num_replicas", num_replicas); + TFE_OpAddInput(op.get(), input, status); + if (TF_GetCode(status) != TF_OK) return; + const char* device = TFE_TensorHandleDeviceName(input, status); + if (TF_GetCode(status) != TF_OK) return; + TFE_OpSetDevice(op.get(), device, status); + if (TF_GetCode(status) != TF_OK) return; + + TFE_TensorHandle* result_handles[num_replicas]; + int num_retvals = num_replicas; + TFE_Execute(op.get(), result_handles, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return; + for (int i = 0; i < num_replicas; ++i) { + (*components)[i].reset(result_handles[i]); + } +} + +TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first, + TFE_TensorHandle* second, TF_Status* status) { + std::unique_ptr op( + TFE_NewOp(context, "Mul", status), TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpAddInput(op.get(), first, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpAddInput(op.get(), second, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + const char* first_device = TFE_TensorHandleDeviceName(first, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetDevice(op.get(), first_device, status); + + TFE_TensorHandle* result_handle; + int num_retvals = 1; + TFE_Execute(op.get(), &result_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + return TensorHandlePtr(result_handle); +} + +// Create and modify a variable placed on a parallel device which composes +// `first_device` and `second_device`. +void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, + const char* second_device) { + // Register the custom device + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + const char* device_name = "/job:localhost/replica:0/task:0/device:CUSTOM:0"; + std::array underlying_devices{first_device, second_device}; + RegisterParallelDevice(context, device_name, underlying_devices, + status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + // Create a variable handle (uninitialized to start) placed on the parallel + // device. + std::function variable_deleter = [&](Variable* to_delete) { + to_delete->Destroy(context, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + delete to_delete; + }; + std::unique_ptr variable( + Variable::Create(context, TF_FLOAT, /* Scalar */ {}, 0, device_name, + status.get()), + variable_deleter); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + // Assign an initial value to the variable, implicitly mirroring it to each + // component device. + { + TensorHandlePtr initial_value = FloatTensorHandle(20., status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + variable->Assign(context, initial_value.get(), status.get()); + } + + // Read from the variable and verify that we have a parallel tensor. 
+ { + TensorHandlePtr read = variable->Read(context, status.get()); + std::array components; + ExtractPerDeviceValues(context, read.get(), &components, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + ExpectScalarEq(components[0].get(), 20.); + ExpectScalarEq(components[1].get(), 20.); + + std::string first_device = + TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); + ASSERT_EQ(underlying_devices[0], first_device); + std::string second_device = + TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); + ASSERT_EQ(underlying_devices[1], second_device); + } + + // Add a parallel tensor with different values on each device to the variable. + { + TensorHandlePtr value_one(FloatTensorHandle(3., status.get())); + TensorHandlePtr value_two(FloatTensorHandle(-2., status.get())); + std::array components{value_one.get(), + value_two.get()}; + TensorHandlePtr combined_value = + CreatePerDeviceValues(context, components, device_name, status.get()); + variable->AssignAdd(context, combined_value.get(), status.get()); + } + + // Read the variable and verify that each component has the right modified + // value. + { + TensorHandlePtr read = variable->Read(context, status.get()); + std::array components; + ExtractPerDeviceValues(context, read.get(), &components, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + ExpectScalarEq(components[0].get(), 23.); + ExpectScalarEq(components[1].get(), 18.); + + std::string first_device = + TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); + ASSERT_EQ(underlying_devices[0], first_device); + std::string second_device = + TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); + ASSERT_EQ(underlying_devices[1], second_device); + } + // Compute the device ID twice and verify the result + for (int i = 0; i < 2; ++i) { + std::unique_ptr op( + TFE_NewOp(context, "DeviceID", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetDevice(op.get(), device_name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + TFE_TensorHandle* result_handle; + int num_retvals = 1; + TFE_Execute(op.get(), &result_handle, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + std::array components; + ExtractPerDeviceValues(context, result_handle, &components, status.get()); + TFE_DeleteTensorHandle(result_handle); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + ExpectScalarEq(components[0].get(), 0); + ExpectScalarEq(components[1].get(), 1); + std::string first_device = + TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); + ASSERT_EQ(underlying_devices[0], first_device); + std::string second_device = + TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); + ASSERT_EQ(underlying_devices[1], second_device); + } +} diff --git a/tensorflow/c/eager/parallel_device/parallel_device_testlib.h b/tensorflow/c/eager/parallel_device/parallel_device_testlib.h new file mode 100644 index 00000000000..fdd21087949 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_testlib.h @@ -0,0 +1,174 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_ +#define TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_ + +#include "tensorflow/c/eager/parallel_device/parallel_device.h" + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/c_api_experimental.h" +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/core/platform/test.h" + + +// Functor for making unique_ptr to TFE_TensorHandle slightly more +// ergonomic. Using decltype(TFE_DeleteTensorHandle) in the unique_ptr's second +// template argument requires passing a function pointer to +// TFE_DeleteTensorHandle when constructing the unique_ptr. +class TensorHandleDeleter { + public: + void operator()(TFE_TensorHandle* to_delete) { + TFE_DeleteTensorHandle(to_delete); + } +}; + +using TensorHandlePtr = std::unique_ptr; + +// A helper for performing common operations on variables. A much more +// restricted stand-in for tf.Variable in Python. +class Variable { + public: + // Construct a Variable from a resource-dtype TFE_TensorHandle and an + // indication of the dtype of the variable's value. + // + // Note that creating this resource-dtype handle can fail, so `Create` is a + // separate static method which returns a status. + Variable(TFE_TensorHandle* handle, TF_DataType type) + : handle_(handle), type_(type) {} + + // Helper for constructing a resource handle and wrapping it in a `Variable` + // object. + static Variable* Create(TFE_Context* context, TF_DataType type, + const int64_t* dims, const int num_dims, + const char* device, TF_Status* status); + // Dereferences the backing buffer for the variable. Note that since this can + // fail (it runs operations), it must be called explicitly and the resulting + // `status` checked. + void Destroy(TFE_Context* context, TF_Status* status); + + // Reads from the variable. + TensorHandlePtr Read(TFE_Context* context, TF_Status* status); + // Assigns a new value to the variable. + void Assign(TFE_Context* context, TFE_TensorHandle* value, TF_Status* status); + // Adds `value` to the existing value of the variable. + void AssignAdd(TFE_Context* context, TFE_TensorHandle* value, + TF_Status* status); + + private: + // Helper for running any single-argument assignment ops (Assign, AssignAdd, + // AssignSub, ...). + void GeneralAssignment(const char* op_name, TFE_Context* context, + TFE_TensorHandle* value, TF_Status* status); + + // The a handle for the resource-dtype tensor pointing to the variable's + // buffer. + TFE_TensorHandle* handle_; + // The dtype of the variable's buffer (input dtype for assignments, output + // dtype of read operations). + TF_DataType type_; +}; + +// Creates a TFE_TensorHandle with value `v`. +TensorHandlePtr FloatTensorHandle(float v, TF_Status* status); + +// Creates a rank-one TFE_TensorHandle with value `v`. +TensorHandlePtr VectorFloatTensorHandle(const std::vector& v, + TF_Status* status); + +// Helper to un-pack `num_replicas` TFE_TensorHandles from one parallel handle. 
+template +void ExtractPerDeviceValues( + TFE_Context* context, TFE_TensorHandle* input, + std::array* components, TF_Status* status); + +// Helper to pack `num_replicas` TFE_TensorHandles into one parallel handle. +template +TensorHandlePtr CreatePerDeviceValues( + TFE_Context* context, + const std::array& components, + const char* device, TF_Status* status); + +TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first, + TFE_TensorHandle* second, TF_Status* status); + +// Assert that `handle` is equal to `expected_value`. +template +void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value); + +template +void RegisterParallelDevice( + TFE_Context* context, const char* device_name, + const std::array& underlying_devices, + TF_Status* status); + +// Create and modify a variable placed on a parallel device which composes +// `first_device` and `second_device`. +void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, + const char* second_device); + +// Implementations of templated functions ****************************** + +template +TensorHandlePtr CreatePerDeviceValues( + TFE_Context* context, + const std::array& components, + const char* device, TF_Status* status) { + std::unique_ptr op( + TFE_NewOp(context, "TPUReplicatedInput", status), TFE_DeleteOp); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrInt(op.get(), "N", num_replicas); + for (int i = 0; i < num_replicas; ++i) { + TFE_OpAddInput(op.get(), components[i], status); + if (TF_GetCode(status) != TF_OK) return nullptr; + } + TFE_OpSetDevice(op.get(), device, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + + TFE_TensorHandle* result_handle; + int num_retvals = 1; + TFE_Execute(op.get(), &result_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + return TensorHandlePtr(result_handle); +} + +template +void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + std::unique_ptr value_zero( + TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + EXPECT_EQ(expected_value, + *static_cast(TF_TensorData(value_zero.get()))); +} + +template +void RegisterParallelDevice( + TFE_Context* context, const char* device_name, + const std::array& underlying_devices, + TF_Status* status) { + TFE_CustomDevice device; + void* device_info; + tensorflow::eager::AllocateParallelDevice( + device_name, underlying_devices.data(), underlying_devices.size(), + &device, &device_info); + TFE_RegisterCustomDevice(context, device, device_name, device_info, status); +} + +#endif // TENSORFLOW_C_EAGER_PARALLEL_DEVICE_PARALLEL_DEVICE_TESTLIB_H_ From a57f4d77e0b56261992e80d3ee44f5996256eead Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 18:32:54 -0700 Subject: [PATCH 1362/1533] [Profiler] Add KPI-related arguments to TraceMe. 
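
This change only registers the new "kpi_name" / "kpi_value" stat types in the
XPlane schema; the code that attaches them to a TraceMe is not shown here. As a
rough sketch (not part of this patch, assuming the existing TraceMe
name-encoding convention "Name#key=value,...#" and a hypothetical
examples_per_sec variable), a caller could report a KPI like:

  tensorflow::profiler::TraceMe trace_me([&] {
    // Encode KPI metadata into the event name; parsed out as stats later.
    return absl::StrCat("TrainStep#kpi_name=examples_per_sec,kpi_value=",
                        examples_per_sec, "#");
  });

Arguments named this way would then surface as the new kKpiName / kKpiValue
stats once the trace is converted to XPlane.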
PiperOrigin-RevId: 313699382 Change-Id: I99f9d37314ba4551c199bb5681fdc5a3e91d7f92 --- tensorflow/core/profiler/utils/xplane_schema.cc | 2 ++ tensorflow/core/profiler/utils/xplane_schema.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 3705a4786fa..39cbbf88e95 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -147,6 +147,8 @@ const StatTypeMap& GetStatTypeMap() { {"region_type", kRegionType}, {"data_type", kDataType}, {"shape", kTensorShapes}, + {"kpi_name", kKpiName}, + {"kpi_value", kKpiValue}, // XPlane semantics related. {"$pt", kProducerType}, {"$ct", kConsumerType}, diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index de8dc32a4f1..864c1c45ecb 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -139,6 +139,8 @@ enum StatType { kRegionType, kDataType, kTensorShapes, + kKpiName, + kKpiValue, // XPlane semantics related. kProducerType, kConsumerType, From b51979e15e684707854e5188e534dc10cbf0dac1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 28 May 2020 18:51:40 -0700 Subject: [PATCH 1363/1533] Minor fix in TF memory profiler. PiperOrigin-RevId: 313701414 Change-Id: Ie538947d6bc65efa25490cccb7b489d559956933 --- tensorflow/core/profiler/convert/xplane_to_memory_profile.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc b/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc index 5b2a7489241..6dfc3478b31 100644 --- a/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc +++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile.cc @@ -323,7 +323,7 @@ void InsertSpecialAllocations(int64 unmapped_allocation_bytes, int64 step_id, FillActivityMetadata( HostEventType::kMemoryAllocation, {unmapped_allocation_bytes, unmapped_allocation_bytes, 0, - "preallocated/unknown", step_id, "persist", 0, "unknown"}, + "preallocated/unknown", step_id, "persist/dynamic", 0, "unknown"}, special_allocation); active_allocs->push_back({--index, special_allocation}); } From 6115edabbc47cf42fc2f5dda7f0bdf5ab48c0482 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Thu, 28 May 2020 19:08:15 -0700 Subject: [PATCH 1364/1533] Remove the redundant tfl.quantizet/tfl.dequantize pair in the post-quantize pass This patch changes the propagation pass so a special attribute is attached to the quantize op being added by propagation. These quantize and the paired dequantize ops can be removed wihtout losing accuracy. So the post-quantize pass removes these pairs if they are not fused to quantized ops. 
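
As an illustrative sketch (the qtype parameters below are placeholders,
mirroring the syntax in the updated tests rather than an exact IR dump), a
quantize op added by propagation now carries the extra unit attribute:

  %q = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform<u8:f32, 1.0>>, volatile}
  %dq = "tfl.dequantize"(%q)

If such a %q/%dq pair is never fused into a quantized op, the post-quantize
pass erases the pair and lets consumers use %0 directly, rather than keeping a
redundant round trip through quantized storage.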
PiperOrigin-RevId: 313703390 Change-Id: I002d001147839a7f180071ae7df2a56325e60094 --- .../compiler/mlir/lite/quantization/BUILD | 1 + .../mlir/lite/quantization/lite/BUILD | 5 +- .../mlir/lite/quantization/lite/tfl_to_std.cc | 7 ++ .../lite/quantization/quantization_driver.cc | 7 ++ .../lite/quantization/quantization_utils.h | 5 ++ .../lite/tests/prepare-quantize-signed.mlir | 10 +-- .../mlir/lite/tests/prepare-quantize.mlir | 64 +++++++++---------- .../compiler/mlir/lite/tests/quantize.mlir | 8 +-- .../mlir/lite/transforms/post_quantize.cc | 22 +++++++ 9 files changed, 86 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD index 23a65a88186..91590bfbc13 100644 --- a/tensorflow/compiler/mlir/lite/quantization/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -23,6 +23,7 @@ package_group( exports_files([ "quantization_traits.h", "quantization_config.h", + "quantization_utils.h", ]) filegroup( diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index b4fddceb580..d9e478950e6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -49,14 +49,15 @@ cc_library( ], hdrs = [ "tfl_to_std.h", + "//tensorflow/compiler/mlir/lite/quantization:quantization_utils.h", ], deps = [ "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "@com_google_absl//absl/strings", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc index 8ea1709b15f..6b226fa68e7 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_to_std.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "llvm/Support/Casting.h" #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" namespace mlir { namespace TFL { @@ -47,12 +48,18 @@ void ConvertMlirQuantOpsToTFLQuantOps(FuncOp func) { auto dcast = b.create(dq.getLoc(), dq.getResult().getType(), dq.arg()); dq.getResult().replaceAllUsesWith(dcast); + if (auto extra_attr = op->getAttr(mlir::quant::kVolatileOpAttrName)) { + dcast.setAttr(mlir::quant::kVolatileOpAttrName, extra_attr); + } dq.erase(); } else if (auto q = llvm::dyn_cast(op)) { auto out_type = q.getResult().getType(); auto qcast = b.create(q.getLoc(), out_type, q.arg(), TypeAttr::get(out_type)); q.getResult().replaceAllUsesWith(qcast); + if (auto extra_attr = op->getAttr(mlir::quant::kVolatileOpAttrName)) { + qcast.setAttr(mlir::quant::kVolatileOpAttrName, extra_attr); + } q.erase(); } }); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 2964a3e79f8..89443b1ec65 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -494,6 +494,13 @@ void QuantizationDriver::QuantizeValue(Value value, QuantParams params, auto quantize = builder_.create(loc, new_type, value); auto dequantize = builder_.create( loc, expressed_type, quantize.getResult()); + + // This attribute is set to distinguish the quantize ops being added by the + // quantization pass. These ops can be removed without losing original + // program accuracy. + // TODO(fengliuai): make the attribute being part of op definition. + quantize.setAttr(kVolatileOpAttrName, builder_.getUnitAttr()); + // `original_result` has a use to `quantize`, so this will replace that use // by the result of `dequantize`. Remember to reset that use afterwards value.replaceAllUsesWith(dequantize); diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index d4512509f6b..b9ff9869232 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -42,6 +42,11 @@ limitations under the License. namespace mlir { namespace quant { +// A unit attribute can be attached to the quantize/dequantize ops which are +// added by the quantization passes. These ops can be removed erased without +// losing accuracy. 
+constexpr char kVolatileOpAttrName[] = "volatile"; + using QuantParams = quant::QuantizedType; using SignedInteger = std::pair; // bitwidth and sign using QuantParamsForResults = llvm::SmallVector; diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir index f6054f3d65d..e1f496b91f4 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir @@ -63,7 +63,7 @@ func @prepareAdd(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { return %add : tensor<2x2xf32> // CHECK: %[[cst:.*]] = constant dense<[{{\[}}0.000000e+00, 1.000000e+00], [2.000000e+00, 2.550000e+02]]> -// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<2x2x!quant.uniform>} +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<2x2x!quant.uniform>, volatile} // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: %[[add:.*]] = tfl.add %arg0, %[[dq]] // CHECK: return %[[add]] @@ -83,7 +83,7 @@ func @prepareConv2DSplat(%arg0: tensor<1x5x5x3xf32>) -> tensor<1x5x5x3xf32> { // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]] // PerTensor: %[[cst:.*]] = constant dense<1.270000e+02> : tensor<3x3x3x3xf32> -// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x3x3x3x!quant.uniform:f32, 1.000000e+00>>} +// PerTensor: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x3x3x3x!quant.uniform:f32, 1.000000e+00>>, volatile} // PerTensor: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]] } @@ -97,7 +97,7 @@ func @prepareConv2D(%arg0: tensor<1x5x5x1xf32>) -> tensor<1x5x5x3xf32> { // CHECK: %[[cst:.*]] = constant dense<[{{\[\[\[}}0.000000e+00]]], [{{\[\[}}1.270000e+02]]], [{{\[\[}}-1.270000e+02]]]]> // CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<3x1x1x1x!quant.uniform:f32:0, -// CHECK-SAME: {0.0078740157480314959,1.000000e+00,1.000000e+00}>>} +// CHECK-SAME: {0.0078740157480314959,1.000000e+00,1.000000e+00}>>, volatile} // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq]] @@ -134,12 +134,12 @@ func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112 return %fc : tensor<1x112x112x32xf32> // CHECK: %[[cst:.*]] = constant dense<1.270000e+02> : tensor<32x12xf32> -// CHECK: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>} : (tensor<32x12xf32>) -> tensor<32x12x!quant.uniform:f32, 1.000000e+00>> +// CHECK: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>, volatile} // CHECK: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<32x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<32x12xf32> // CHECK: "tfl.fully_connected"(%arg0, %[[dq]] // PerTensor: %[[cst:.*]] = constant dense<1.270000e+02> : tensor<32x12xf32> -// PerTensor: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>} : (tensor<32x12xf32>) -> tensor<32x12x!quant.uniform:f32, 1.000000e+00>> +// PerTensor: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>, volatile} // PerTensor: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<32x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<32x12xf32> // PerTensor: "tfl.fully_connected"(%arg0, %[[dq]] } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index f937d0afd4d..38f76bb4eb5 100644 --- 
a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -67,7 +67,7 @@ func @QuantizeConv2DPerChannel(%arg0: tensor<1x224x224x3x!quant.uniform // CHECK-NEXT: %[[cst:.*]] = constant dense<1.000000e+00> : tensor<32xf32> -// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>} +// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK-NEXT: %[[bias:.*]] = "tfl.dequantize"(%[[qbias]]) // CHECK-NEXT: %[[in:.*]] = "tfl.dequantize"(%arg0) // CHECK-NEXT: %[[w:.*]] = "tfl.dequantize"(%arg1) @@ -87,7 +87,7 @@ func @QuantizeConv2DPerChannels(%arg0: tensor<1x224x224x3x!quant.uniform // CHECK-NEXT: %[[cst:.*]] = constant dense<1.000000e+00> : tensor<32xf32> -// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>} +// CHECK-NEXT: %[[qbias:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK-NEXT: %[[bias:.*]] = "tfl.dequantize"(%[[qbias]]) // CHECK-NEXT: %[[in:.*]] = "tfl.dequantize"(%arg0) // CHECK-NEXT: %[[w:.*]] = "tfl.dequantize"(%arg1) @@ -107,7 +107,7 @@ func @QuantizeConv2D(tensor<1x224x224x3x!quant.uniform return %6 : tensor<1x112x112x32x!quant.uniform> // CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) // CHECK: %2 = "tfl.dequantize"(%arg0) // CHECK: %3 = "tfl.pseudo_qconst"() @@ -129,7 +129,7 @@ func @QuantizeFullyConnected(tensor<1x224x224x3x!quant.uniform> // CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) // CHECK: %2 = "tfl.dequantize"(%arg0) // CHECK: %3 = "tfl.pseudo_qconst"() @@ -151,7 +151,7 @@ func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform> // CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) // CHECK: %2 = "tfl.dequantize"(%arg0) // CHECK: %3 = "tfl.pseudo_qconst"() @@ -232,7 +232,7 @@ func @QuantizeStridedSlice(tensor<12x2x2x5x!quant.uniform>, tensor< // CHECK: %0 = "tfl.dequantize"(%arg0) // CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg3) -// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x2x2x5x!quant.uniform>} +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x2x2x5x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) // CHECK: return %3 : tensor<1x2x2x5xf32> } @@ -277,7 +277,7 @@ func @QuantizeReshape2D(tensor<1x6x6x16x!quant.uniform // CHECK: %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) // CHECK: %1 = "tfl.reshape"(%0, %{{.*}}) : (tensor<1x6x6x16xf32>, tensor<3xi32>) -> tensor<1x36x16xf32> -// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x36x16x!quant.uniform>} +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x36x16x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x36x16x!quant.uniform>) // CHECK: return %3 : tensor<1x36x16xf32> } @@ -291,7 +291,7 @@ func 
@QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) // CHECK: %0 = "tfl.dequantize"(%arg0) // CHECK: %1 = "tfl.softmax"(%0) {beta = 1.000000e+00 : f32} : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> -// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>} +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) // CHECK: return %3 : tensor<1x6x6x16xf32> } @@ -305,7 +305,7 @@ func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform> // CHECK: %0 = "tfl.dequantize"(%arg0) // CHECK: %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> -// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>} +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> // CHECK: return %3 : tensor<1x6x6x16xf32> } @@ -327,7 +327,7 @@ func @QuantizeL2Norm(%arg0: tensor<1x6x6x16x!quant.uniform>) -> ten // CHECK: %[[in:.*]] = "tfl.dequantize"(%arg0) // CHECK: %[[l2:.*]] = "tfl.l2_normalization"(%[[in]]) -// CHECK: %[[q:.*]] = "tfl.quantize"(%[[l2]]) {qtype = tensor<1x6x6x16x!quant.uniform>} +// CHECK: %[[q:.*]] = "tfl.quantize"(%[[l2]]) {qtype = tensor<1x6x6x16x!quant.uniform>, volatile} // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: return %[[dq]] : tensor<1x6x6x16xf32> } @@ -350,13 +350,13 @@ func @QuantizeConcatOperand0ToAll(tensor<1x2x!quant.uniform>, t %1 = "tfl.concatenation"(%0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> return %1 : tensor<2x2xf32> -// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %3 = "tfl.concatenation"(%2, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> -// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> +// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>, volatile} // CHECK: %5 = "tfl.dequantize"(%4) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> -// CHeCK: return %5 : tensor<2x2xf32> +// CHECK: return %5 : tensor<2x2xf32> } // CHECK-LABEL: QuantizeConcatOperand1ToAll @@ -366,11 +366,11 @@ func @QuantizeConcatOperand1ToAll(tensor<1x2xf32>, tensor<1x2x!quant.uniform, tensor<1x2xf32>) -> tensor<2x2xf32> return %1 : tensor<2x2xf32> -// CHECK: %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %0 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %2 = "tfl.dequantize"(%arg1) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %3 = "tfl.concatenation"(%1, %2) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> -// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> +// CHECK: %4 = "tfl.quantize"(%3) {qtype = tensor<2x2x!quant.uniform>, volatile} // CHECK: %5 = 
"tfl.dequantize"(%4) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> // CHECK: return %5 : tensor<2x2xf32> } @@ -382,9 +382,9 @@ func @QuantizeConcatResToAll(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2x!qu %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %1 : tensor<2x2x!quant.uniform> -// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> -// CHECK: %2 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %2 = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %4 = "tfl.concatenation"(%3, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> // CHECK: %5 = "tfl.quantize"(%4) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> @@ -399,7 +399,7 @@ func @QuantizeConcatResToAllNoRequantize(tensor<1x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %2 : tensor<2x2x!quant.uniform> -// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %0 = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %3 = "tfl.concatenation"(%2, %1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2xf32> @@ -416,7 +416,7 @@ func @QuantizeConcatResToAllRequantize(tensor<1x2xf32>, tensor<1x2xf32>) -> tens %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %[[Q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> // CHECK: %[[RQ0:.*]] = "tfl.quantize"(%[[Q0]]) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> @@ -434,7 +434,7 @@ func @QuantizeConcatResToAllRequantizeArg(tensor<1x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[Q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %[[DQ1:.*]] = "tfl.dequantize"(%[[Q1]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> // CHECK: %[[RQ0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> // CHECK: %[[DQ0:.*]] = "tfl.dequantize"(%[[RQ0]]) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> @@ 
-475,22 +475,22 @@ func @QuantizeChain(tensor<1x224x224x3x!quant.uniform> return %10 : tensor<1x36x16xf32> // CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) // CHECK: %2 = "tfl.dequantize"(%arg0) : (tensor<1x224x224x3x!quant.uniform>) // CHECK: %3 = "tfl.pseudo_qconst"() // CHECK: %4 = "tfl.dequantize"(%3) : (tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>) // CHECK: %5 = "tfl.average_pool_2d"(%2) -// CHECK: %6 = "tfl.quantize"(%5) {qtype = tensor<1x224x224x3x!quant.uniform>} +// CHECK: %6 = "tfl.quantize"(%5) {qtype = tensor<1x224x224x3x!quant.uniform>, volatile} // CHECK: %7 = "tfl.dequantize"(%6) : (tensor<1x224x224x3x!quant.uniform>) // CHECK: %8 = "tfl.conv_2d"(%7, %4, %1) // CHECK: %9 = "tfl.quantize"(%8) {qtype = tensor<1x112x112x32x!quant.uniform>} // CHECK: %10 = "tfl.dequantize"(%9) : (tensor<1x112x112x32x!quant.uniform>) // CHECK: %11 = "tfl.reshape"(%10, %{{.*}}) -// CHECK: %12 = "tfl.quantize"(%11) {qtype = tensor<1x36x16x!quant.uniform>} +// CHECK: %12 = "tfl.quantize"(%11) {qtype = tensor<1x36x16x!quant.uniform>, volatile} // CHECK: %13 = "tfl.dequantize"(%12) : (tensor<1x36x16x!quant.uniform>) // CHECK: %14 = "tfl.softmax"(%13) -// CHECK: %15 = "tfl.quantize"(%14) {qtype = tensor<1x36x16x!quant.uniform>} +// CHECK: %15 = "tfl.quantize"(%14) {qtype = tensor<1x36x16x!quant.uniform>, volatile} // CHECK: %16 = "tfl.dequantize"(%15) : (tensor<1x36x16x!quant.uniform>) // CHECK: return %16 : tensor<1x36x16xf32> } @@ -501,7 +501,7 @@ func @QuantizeConstant() -> tensor<2x3xf32> { return %cst : tensor<2x3xf32> // CHECK: %cst = constant dense{{.*}}tensor<2x3xf32> -// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<2x3x!quant.uniform>} +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<2x3x!quant.uniform>, volatile} // CHECK: %1 = "tfl.dequantize"(%0) // CHECK: return %1 : tensor<2x3xf32> } @@ -521,7 +521,7 @@ func @QuantizeZeroSplat() -> tensor<2x3xf32> { return %cst : tensor<2x3xf32> // CHECK-NEXT: %[[cst:.*]] = constant dense<0.000000e+00> : tensor<2x3xf32> -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile} } // CHECK-LABEL: QuantizeZeroScalar @@ -530,7 +530,7 @@ func @QuantizeZeroScalar() -> tensor { return %cst : tensor // CHECK-NEXT: %[[cst:.*]] = constant dense<0.000000e+00> : tensor -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile} } // CHECK-LABEL: QuantizePositiveSplat @@ -539,7 +539,7 @@ func @QuantizePositiveSplat() -> tensor<2x3xf32> { return %cst : tensor<2x3xf32> // CHECK-NEXT: %[[cst:.*]] = constant dense<2.540000e+01> : tensor<2x3xf32> -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile} } // CHECK-LABEL: QuantizePositiveScalar @@ -548,7 +548,7 @@ func @QuantizePositiveScalar() -> tensor { return %cst : tensor // CHECK-NEXT: %[[cst:.*]] = constant dense<2.540000e+00> : tensor -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile} } // CHECK-LABEL: QuantizeNegativeSplat @@ -557,7 +557,7 @@ func @QuantizeNegativeSplat() -> tensor<2x3xf32> { return %cst 
: tensor<2x3xf32> // CHECK-NEXT: %[[cst:.*]] = constant dense<-2.540000e+00> : tensor<2x3xf32> -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor<2x3x!quant.uniform>, volatile} } // CHECK-LABEL: QuantizeNegativeScalar @@ -566,7 +566,7 @@ func @QuantizeNegativeScalar() -> tensor { return %cst : tensor // CHECK-NEXT: %[[cst:.*]] = constant dense<-2.540000e+01> : tensor -// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>} +// CHECK-NEXT: "tfl.quantize"(%[[cst]]) {qtype = tensor>, volatile} } // CHECK-LABEL: QuantizeSharedBiases @@ -617,7 +617,7 @@ func @QuantizeSharedBiases2( // CHECK: %[[q:.*]] = "tfl.quantize"(%[[cst]]) // CHECK: %[[dq:.*]] = "tfl.dequantize"(%[[q]]) // CHECK: %[[cst_0:.*]] = constant dense<0.000000e+00> : tensor<32xf32> -// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<32x!quant.uniform>} +// CHECK: %[[q_0:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<32x!quant.uniform>, volatile} // CHECK: %[[dq_0:.*]] = "tfl.dequantize"(%[[q_0]]) // CHECK: %{{.*}} = tfl.add %{{.*}}, %[[dq_0]] // CHECK: %{{.*}} = "tfl.conv_2d"(%{{.*}}, %{{.*}}, %[[dq]]) diff --git a/tensorflow/compiler/mlir/lite/tests/quantize.mlir b/tensorflow/compiler/mlir/lite/tests/quantize.mlir index 54ca7f043f4..6f42ae6293d 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize.mlir @@ -195,8 +195,8 @@ func @QuantizeConcat(tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<2x2x!quant.unif %1 = "tfl.quantize"(%0) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %1 : tensor<2x2x!quant.uniform> -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>, volatile} // CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q1]], %[[q0]]) {axis = 0 : i32, fused_activation_function = "NONE"} // CHECK: return %[[cc]] : tensor<2x2x!quant.uniform> } @@ -209,8 +209,8 @@ func @QuantizeConcatRequantize(tensor<1x2x!quant.uniform>, tens %3 = "tfl.quantize"(%2) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %3 : tensor<2x2x!quant.uniform> -// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>} -// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} : (tensor<1x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[q1:.*]] = "tfl.quantize"(%arg1) {qtype = tensor<1x2x!quant.uniform>, volatile} +// CHECK: %[[q0:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x2x!quant.uniform>} // CHECK: %[[cc:.*]] = "tfl.concatenation"(%[[q0]], %[[q1]]) {axis = 0 : i32, fused_activation_function = "NONE"} // CHECK: return %[[cc]] : tensor<2x2x!quant.uniform> } diff --git a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc index 7954f72046a..9a1da0ad03d 100644 --- a/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/post_quantize.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "llvm/Support/Casting.h" #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" @@ -118,6 +119,24 @@ void RemoveQuantizationAdaptorOps(FuncOp func) { func.setType(new_func_type); } +// Remove the back-to-back quantize and dequantize ops with volatile attribute. +struct RemoveVolatileOps : public OpRewritePattern { + explicit RemoveVolatileOps(MLIRContext* context) + : OpRewritePattern(context, 1) {} + + LogicalResult matchAndRewrite(DequantizeOp op, + PatternRewriter& rewriter) const override { + auto input_op = op.input().getDefiningOp(); + if (auto q = llvm::dyn_cast_or_null(input_op)) { + if (!q.getAttr(mlir::quant::kVolatileOpAttrName)) return failure(); + + op.replaceAllUsesWith(q.input()); + return success(); + } + return failure(); + } +}; + #include "tensorflow/compiler/mlir/lite/transforms/generated_post_quantize.inc" void PostQuantizePass::runOnFunction() { @@ -131,6 +150,9 @@ void PostQuantizePass::runOnFunction() { if (!emit_quant_adaptor_ops_) { RemoveQuantizationAdaptorOps(getFunction()); } + + patterns.insert(ctx); + applyPatternsAndFoldGreedily(func, patterns); } } // namespace From fba1187edac54eedf4556e83a5828b97d4215e10 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Thu, 28 May 2020 19:49:18 -0700 Subject: [PATCH 1365/1533] Adds `custom_grad` and `vjp` to tf_numpy/extensions and trax/math. Also changes tf.custom_gradient to allow nested structures as inputs (currently it only allows a list of tensors). PiperOrigin-RevId: 313706826 Change-Id: Ia3e9fdc7323476c96f60b6dbb6b89823bc09e995 --- tensorflow/python/ops/custom_gradient.py | 35 +++++++++------- tensorflow/python/ops/gradients_test.py | 52 ++++++++++++++++++++---- 2 files changed, 65 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index 2a9194fb146..953bb252729 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -175,20 +175,23 @@ def custom_gradient(f=None): Args: f: function `f(*x)` that returns a tuple `(y, grad_fn)` where: - - `x` is a sequence of `Tensor` inputs to the function. - - `y` is a `Tensor` or sequence of `Tensor` outputs of applying - TensorFlow operations in `f` to `x`. + - `x` is a sequence of (nested structures of) `Tensor` inputs to the + function. + - `y` is a (nested structure of) `Tensor` outputs of applying TensorFlow + operations in `f` to `x`. - `grad_fn` is a function with the signature `g(*grad_ys)` which returns - a list of `Tensor`s - the derivatives of `Tensor`s in `y` with respect - to the `Tensor`s in `x`. `grad_ys` is a `Tensor` or sequence of - `Tensor`s the same size as `y` holding the initial value gradients for - each `Tensor` in `y`. In a pure mathematical sense, a vector-argument - vector-valued function `f`'s derivatives should be its Jacobian matrix - `J`. Here we are expressing the Jacobian `J` as a function `grad_fn` - which defines how `J` will transform a vector `grad_ys` when - left-multiplied with it (`grad_ys * J`). This functional representation - of a matrix is convenient to use for chain-rule calculation - (in e.g. the back-propagation algorithm). 
+ a list of `Tensor`s the same size as (flattened) `x` - the derivatives + of `Tensor`s in `y` with respect to the `Tensor`s in `x`. `grad_ys` is + a sequence of `Tensor`s the same size as (flattened) `y` holding the + initial value gradients for each `Tensor` in `y`. + + In a pure mathematical sense, a vector-argument vector-valued function + `f`'s derivatives should be its Jacobian matrix `J`. Here we are + expressing the Jacobian `J` as a function `grad_fn` which defines how + `J` will transform a vector `grad_ys` when left-multiplied with it + (`grad_ys * J`, the vector-Jacobian product, or VJP). This functional + representation of a matrix is convenient to use for chain-rule + calculation (in e.g. the back-propagation algorithm). If `f` uses `Variable`s (that are not part of the inputs), i.e. through `get_variable`, then `grad_fn` should have @@ -307,7 +310,7 @@ def _graph_mode_decorator(f, args, kwargs): "The custom_gradient decorator currently supports keywords " "arguments only when eager execution is enabled.") name = "CustomGradient-%s" % ops.uid() - args = [ops.convert_to_tensor(x) for x in args] + args = nest.map_structure(ops.convert_to_tensor, args) # Checking global and local variables attempts to ensure that no non-resource # Variables are added to the graph. @@ -318,6 +321,7 @@ def _graph_mode_decorator(f, args, kwargs): ]) with tape_lib.VariableWatcher() as variable_watcher: result, grad_fn = f(*args) + args = nest.flatten(args) after_vars = set([ v.ref() for v in current_var_scope.global_variables() + current_var_scope.local_variables() @@ -404,6 +408,7 @@ def _eager_mode_decorator(f, args, kwargs): """Implement custom gradient decorator for eager mode.""" with tape_lib.VariableWatcher() as variable_watcher: result, grad_fn = f(*args, **kwargs) + args = nest.flatten(args) all_inputs = list(args) + list(kwargs.values()) # The variables that grad_fn needs to return gradients for are the set of # variables used that are *not* part of the inputs. 
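The contract described in the updated docstring can be sketched with a short example (an illustrative snippet rather than part of the patch; it assumes a TensorFlow build that already includes this change and uses only the public tf.custom_gradient / tf.GradientTape APIs, and the function name weighted_sum is made up for the example). The decorated function receives a dict of tensors, and its grad_fn returns one gradient per tensor in the flattened input:

import tensorflow as tf

@tf.custom_gradient
def weighted_sum(inputs):
  # `inputs` is a nested structure (here, a dict with two tensors).
  y = 2.0 * inputs['a'] + 3.0 * inputs['b']

  def grad(dy):
    # One gradient per tensor in the flattened input structure.
    return [2.0 * dy, 3.0 * dy]

  return y, grad

x = {'a': tf.constant(1.0), 'b': tf.constant(4.0)}
with tf.GradientTape() as tape:
  tape.watch(x)
  y = weighted_sum(x)
print(tape.gradient(y, x))  # expected: {'a': 2.0, 'b': 3.0}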
@@ -443,7 +448,7 @@ def _eager_mode_decorator(f, args, kwargs): raise ValueError( "custom_gradient function expected to return", arg_count, "gradients but returned", len(flat_grads), "instead.") - return nest.flatten(input_grads) + variable_grads + return flat_grads + variable_grads tape_lib.record_operation(f.__name__, flat_result, recorded_inputs, actual_grad_fn) diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index a06be7af74b..9a9ce72a557 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -60,6 +60,7 @@ from tensorflow.python.ops import variables from tensorflow.python.ops.nn_ops import bias_add from tensorflow.python.platform import googletest from tensorflow.python.ops import gradient_checker_v2 +from tensorflow.python.util import nest class GradientsTest(test_util.TensorFlowTestCase, parameterized.TestCase): @@ -1039,7 +1040,7 @@ class GetDependentVariablesTest(test_util.TensorFlowTestCase): self.assertEqual(dependent_vars, [var]) -class CustomGradientTest(test_util.TensorFlowTestCase): +class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): def testCustomGradientTrivial(self): @@ -1119,7 +1120,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase): out = core_layers.dense(x, 3, use_bias=False) def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name - self.assertEqual(1, len(variables)) + self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad) return grads[0], [array_ops.ones((4, 3))] @@ -1146,7 +1147,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase): out = core_layers.dense(x, 3, use_bias=False) def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name - self.assertEqual(1, len(variables)) + self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad) return grads[0], [array_ops.ones((3, 3))] @@ -1185,7 +1186,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase): def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name del out_grad - self.assertEqual(1, len(variables)) + self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert return (array_ops.ones((3, 2)), [array_ops.ones((2, 4))]) @@ -1209,7 +1210,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase): def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name del out_grad - self.assertEqual(1, len(variables)) + self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert return (array_ops.ones((3, 2)), [array_ops.ones((2, 4))]) return out, Grad @@ -1273,7 +1274,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase): out = core_layers.dense(x, 3, use_bias=False) def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name - self.assertEqual(1, len(variables)) + self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad) return grads[0], [array_ops.ones((4, 3))] @@ -1284,7 +1285,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase): out = F(x) def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name - self.assertEqual(1, len(variables)) + self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad) return grads[0], 
[array_ops.ones((4, 3))] @@ -1303,6 +1304,43 @@ class CustomGradientTest(test_util.TensorFlowTestCase): dw = sess.run(math_ops.reduce_sum(grads[1])) self.assertEqual(12., dw) + @parameterized.named_parameters( + [(("_%s_%s" % (x_struct, y_struct)).replace(" ", "").replace("None", ""), # pylint: disable=g-complex-comprehension + x_struct, y_struct) + for y_struct in [[None, ()], (None, (), [], (None, ((), None)))] + for x_struct in [(None, ()), (((), ()), [None, None], [], (None, ()))] + ]) + @test_util.run_in_graph_and_eager_modes + def testCustomGradientStructuralInputOutput(self, x_struct, y_struct): + """Tests that custom_gradient can handle structured inputs/outputs.""" + def Zeros(x): + return nest.map_structure(lambda _: array_ops.zeros([], "float32"), x) + def GetStruct(x): + return nest.map_structure(lambda _: None, x) + + def MakeVjp(f, *x): + with backprop.GradientTape(persistent=True) as tape: + tape.watch(nest.flatten(x)) + y = f(*x) + def Vjp(dy): + return tape.gradient(y, x, output_gradients=dy) + return y, Vjp + + @custom_gradient.custom_gradient + def F(*x): + self.assertEqual(x_struct, GetStruct(x)) + def Vjp(*dy): + self.assertEqual(len(nest.flatten(y_struct)), + len(nest.flatten(dy))) + return nest.flatten(Zeros(x_struct)) + return Zeros(y_struct), Vjp + + x, dy = Zeros([x_struct, y_struct]) + y, vjp = MakeVjp(F, *x) + dx = vjp(dy) + self.assertEqual(x_struct, GetStruct(dx)) + self.assertEqual(y_struct, GetStruct(y)) + class TensorListGradientsTest(test_util.TensorFlowTestCase): From 4c674a64c8f5d6850c2907d5686c9c10bea55b33 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 28 May 2020 19:51:43 -0700 Subject: [PATCH 1366/1533] Migrate float/uint8 broadcast fivefold add to use binary broadcast fivefold. PiperOrigin-RevId: 313707010 Change-Id: I800cbc4406bad709cedecbfa0e41b4e465254f75 --- .../internal/optimized/optimized_ops.h | 402 ++++++------------ .../reference/process_broadcast_shapes.h | 4 + tensorflow/lite/kernels/sub.cc | 10 +- 3 files changed, 134 insertions(+), 282 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 746ed622632..6f478daab68 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -234,6 +234,100 @@ inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows( } #endif +template +inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const T* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const T* unswitched_input2_data, + const RuntimeShape& output_shape, + T* output_data, ElementwiseF elementwise_f, + ScalarBroadcastF scalar_broadcast_f) { + ArithmeticParams switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const ArithmeticParams& params = + use_unswitched ? unswitched_params : switched_params; + const T* input1_data = + use_unswitched ? 
unswitched_input1_data : unswitched_input2_data; + const T* input2_data = + use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. + T* output_data_ptr = output_data; + const T* input1_data_ptr = input1_data; + const T* input2_data_reset = input2_data; + // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared + // between input shapes. y3 for input 1 is always broadcast, and so the + // dimension there is 1, whereas optionally y1 might be broadcast for + // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, + // input2.shape.FlatSize = y0 * y2 * y3 * y4. + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) { + // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner + // dimension. + for (int i0 = 0; i0 < y0; ++i0) { + const T* input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + for (int i3 = 0; i3 < y3; ++i3) { + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } else { + // Special case of y4 == 1, in which the innermost loop is a single + // element and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except + // simplified for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. 
+ for (int i0 = 0; i0 < y0; ++i0) { + const T* input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + inline void AddBiasAndEvalActivationFunction(float output_activation_min, float output_activation_max, const RuntimeShape& bias_shape, @@ -2073,186 +2167,6 @@ inline void Add(const ArithmeticParams& params, output_map = output_map.cwiseMin(params.quantized_activation_max); } -inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, - const RuntimeShape& unswitched_input1_shape, - const uint8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const uint8* unswitched_input2_data, - const RuntimeShape& output_shape, - uint8* output_data) { - ruy::profiler::ScopeLabel label("BroadcastAddFivefold/8bit"); - - ArithmeticParams switched_params = unswitched_params; - switched_params.input1_offset = unswitched_params.input2_offset; - switched_params.input1_multiplier = unswitched_params.input2_multiplier; - switched_params.input1_shift = unswitched_params.input2_shift; - switched_params.input2_offset = unswitched_params.input1_offset; - switched_params.input2_multiplier = unswitched_params.input1_multiplier; - switched_params.input2_shift = unswitched_params.input1_shift; - - const bool use_unswitched = - unswitched_params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const ArithmeticParams& params = - use_unswitched ? unswitched_params : switched_params; - const uint8* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const uint8* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise add of - // sections of the arrays. - uint8* output_data_ptr = output_data; - const uint8* input1_data_ptr = input1_data; - const uint8* input2_data_reset = input2_data; - // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared - // between input shapes. y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for input 2. - // Put another way, - // input1.shape.FlatSize = y0 * y1 * y2 * y4, - // input2.shape.FlatSize = y0 * y2 * y3 * y4. - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner - // dimension. - for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - AddElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - // We have broadcast y4 of input1 data y3 times, and now move on. - input1_data_ptr += y4; - } - } - // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. 
- input2_data_reset = input2_data_ptr; - } - } else { - // Special case of y4 == 1, in which the innermost loop is a single element - // and can be combined with the next (y3) as an inner broadcast. - // - // Note that this handles the case of pure scalar broadcast when - // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar - // broadcast with batch (as y2 > 1). - // - // NOTE The process is the same as the above general case except simplified - // for y4 == 1 and the loop over y3 is contained within the - // AddScalarBroadcast function. - for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y3; - output_data_ptr += y3; - input1_data_ptr += 1; - } - } - input2_data_reset = input2_data_ptr; - } - } -} - -inline void BroadcastAddFivefold(const ArithmeticParams& params, - const RuntimeShape& unswitched_input1_shape, - const float* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const float* unswitched_input2_data, - const RuntimeShape& output_shape, - float* output_data) { - ruy::profiler::ScopeLabel label("BroadcastAddFivefold/float"); - - const bool use_unswitched = - params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const float* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const float* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise add of - // sections of the arrays. - float* output_data_ptr = output_data; - const float* input1_data_ptr = input1_data; - const float* input2_data_reset = input2_data; - // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared - // between input shapes. y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for input 2. - // Put another way, - // input1.shape.FlatSize = y0 * y1 * y2 * y4, - // input2.shape.FlatSize = y0 * y2 * y3 * y4. - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner - // dimension. - for (int i0 = 0; i0 < y0; ++i0) { - const float* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - AddElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - // We have broadcast y4 of input1 data y3 times, and now move on. - input1_data_ptr += y4; - } - } - // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. - input2_data_reset = input2_data_ptr; - } - } else { - // Special case of y4 == 1, in which the innermost loop is a single element - // and can be combined with the next (y3) as an inner broadcast. - // - // Note that this handles the case of pure scalar broadcast when - // y0 == y1 == y2 == 1. 
With low overhead it handles cases such as scalar - // broadcast with batch (as y2 > 1). - // - // NOTE The process is the same as the above general case except simplified - // for y4 == 1 and the loop over y3 is contained within the - // AddScalarBroadcast function. - for (int i0 = 0; i0 < y0; ++i0) { - const float* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - AddScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y3; - output_data_ptr += y3; - input1_data_ptr += 1; - } - } - input2_data_reset = input2_data_ptr; - } - } -} - template inline void BroadcastAddDispatch( const ArithmeticParams& params, const RuntimeShape& input1_shape, @@ -2263,8 +2177,37 @@ inline void BroadcastAddDispatch( input2_data, output_shape, output_data); } - BroadcastAddFivefold(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + BinaryBroadcastFiveFold( + params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, + static_cast(AddElementwise), + static_cast( + AddScalarBroadcast)); +} + +inline void BroadcastAddFivefold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const uint8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const uint8* unswitched_input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + BroadcastAddDispatch(unswitched_params, unswitched_input1_shape, + unswitched_input1_data, unswitched_input2_shape, + unswitched_input2_data, output_shape, output_data); +} + +inline void BroadcastAddFivefold(const ArithmeticParams& params, + const RuntimeShape& unswitched_input1_shape, + const float* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const float* unswitched_input2_data, + const RuntimeShape& output_shape, + float* output_data) { + BroadcastAddDispatch(params, unswitched_input1_shape, unswitched_input1_data, + unswitched_input2_shape, unswitched_input2_data, + output_shape, output_data); } inline void MulElementwise(int size, const ArithmeticParams& params, @@ -7979,101 +7922,6 @@ inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params, } } -template -inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params, - const RuntimeShape& unswitched_input1_shape, - const int8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const int8* unswitched_input2_data, - const RuntimeShape& output_shape, - int8* output_data, - ElementwiseF elementwise_f, - ScalarBroadcastF scalar_broadcast_f) { - ArithmeticParams switched_params = unswitched_params; - switched_params.input1_offset = unswitched_params.input2_offset; - switched_params.input1_multiplier = unswitched_params.input2_multiplier; - switched_params.input1_shift = unswitched_params.input2_shift; - switched_params.input2_offset = unswitched_params.input1_offset; - switched_params.input2_multiplier = unswitched_params.input1_multiplier; - switched_params.input2_shift = unswitched_params.input1_shift; - - const bool use_unswitched = - unswitched_params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const ArithmeticParams& params = - use_unswitched ? unswitched_params : switched_params; - const int8* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const int8* input2_data = - use_unswitched ? 
unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise add of - // sections of the arrays. - int8* output_data_ptr = output_data; - const int8* input1_data_ptr = input1_data; - const int8* input2_data_reset = input2_data; - // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared - // between input shapes. y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for - // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, - // input2.shape.FlatSize = y0 * y2 * y3 * y4. - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner - // dimension. - for (int i0 = 0; i0 < y0; ++i0) { - const int8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - // We have broadcast y4 of input1 data y3 times, and now move on. - input1_data_ptr += y4; - } - } - // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. - input2_data_reset = input2_data_ptr; - } - } else { - // Special case of y4 == 1, in which the innermost loop is a single - // element and can be combined with the next (y3) as an inner broadcast. - // - // Note that this handles the case of pure scalar broadcast when - // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar - // broadcast with batch (as y2 > 1). - // - // NOTE The process is the same as the above general case except - // simplified for y4 == 1 and the loop over y3 is contained within the - // AddScalarBroadcast function. - for (int i0 = 0; i0 < y0; ++i0) { - const int8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y3; - output_data_ptr += y3; - input1_data_ptr += 1; - } - } - input2_data_reset = input2_data_ptr; - } - } -} - template inline void BroadcastMaximumDispatch(const ArithmeticParams& params, const RuntimeShape& input1_shape, diff --git a/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h b/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h index 8e1a6c85919..40f779c5bdf 100644 --- a/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h +++ b/tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h @@ -76,6 +76,10 @@ inline bool ProcessBroadcastShapes(const RuntimeShape& shape0, BroadcastableOpCategory::kFirstInputBroadcastsFast && params->broadcast_category != BroadcastableOpCategory::kSecondInputBroadcastsFast) { + // This is unreachable because at least one else clause in the above loop + // must be reached. 
+ TFLITE_DCHECK(false); + params->broadcast_category = BroadcastableOpCategory::kNonBroadcast; return false; } diff --git a/tensorflow/lite/kernels/sub.cc b/tensorflow/lite/kernels/sub.cc index 1b04143d222..a2282a0545b 100644 --- a/tensorflow/lite/kernels/sub.cc +++ b/tensorflow/lite/kernels/sub.cc @@ -326,11 +326,11 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, TF_LITE_SUB(reference_ops, Add, uint8_t); } } else { - if (op_params.broadcast_category == - BroadcastableOpCategory::kGenericBroadcast) { - TF_LITE_SUB(optimized_ops, BroadcastAdd4DSlow, uint8_t); - } else if (need_broadcast) { - TF_LITE_SUB(optimized_ops, BroadcastAddFivefold, uint8_t); + if (need_broadcast) { + optimized_ops::BroadcastAddDispatch( + op_params, GetTensorShape(input1), GetTensorData(input1), + GetTensorShape(input2), GetTensorData(input2), + GetTensorShape(output), GetTensorData(output)); } else { TF_LITE_SUB(optimized_ops, Add, uint8_t); } From e9ad6196a699454754581f61f47d8a8572c7f21f Mon Sep 17 00:00:00 2001 From: Rachel Lim Date: Thu, 28 May 2020 20:15:47 -0700 Subject: [PATCH 1367/1533] Explicitly call RecordStop when calling group iterator for GroupByWindow dataset, since the current_group_iterator isn't correctly wired up to the output node. See b/154341936 for context. PiperOrigin-RevId: 313709228 Change-Id: I5aa398f71a46713c96aba96f4d42777edfea4fc0 --- .../data/experimental/group_by_window_dataset_op.cc | 7 +++++++ .../experimental/kernel_tests/group_by_window_test.py | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc index 462f8ce6ef7..a61ebd70141 100644 --- a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc @@ -200,8 +200,15 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { // We are currently processing a group, so try to get the // next element. bool end_of_group; + // TODO(b/154341936): Explicitly stopping and starting this iterator + // should not be necessary, but the `::Reduce` added to the prefix + // passed to `current_group_iterator_` when it was created prevents + // the model from identifying this iterator as the output of + // `current_group_iterator_`. + RecordStop(ctx); TF_RETURN_IF_ERROR(current_group_iterator_->GetNext( ctx, out_tensors, &end_of_group)); + RecordStart(ctx); if (!end_of_group) { // Produce the subelement as output. 
*end_of_sequence = false; diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py index 2495083cf63..581d8f42792 100644 --- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py @@ -331,6 +331,16 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( dataset, expected_output=[[i] for i in range(10)]) + @combinations.generate(test_base.default_test_combinations()) + def testGroupByWindowWithAutotune(self): + dataset = dataset_ops.Dataset.range(1000).apply( + grouping.group_by_window( + lambda x: x // 10, + lambda key, window: dataset_ops.Dataset.from_tensors(key), 4)) + dataset = dataset.map(lambda x: x + 1, num_parallel_calls=-1) + get_next = self.getNext(dataset) + self.evaluate(get_next()) + if __name__ == "__main__": test.main() From 618ff4c618ecd76ded8b9bdc6aa827469f9b826d Mon Sep 17 00:00:00 2001 From: Katherine Wu Date: Thu, 28 May 2020 21:02:58 -0700 Subject: [PATCH 1368/1533] Store TypeSpec in Keras input layer, and use it when tracing the model. PiperOrigin-RevId: 313714149 Change-Id: I893d7fecda2ac41568a6bc658251a4be14c2211d --- tensorflow/python/keras/engine/input_layer.py | 9 +++++ .../saving/saved_model/saved_model_test.py | 37 +++++++++++++++++++ tensorflow/python/keras/utils/tf_utils.py | 11 ++++-- 3 files changed, 54 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py index 02e43110697..8075cc3fd0f 100644 --- a/tensorflow/python/keras/engine/input_layer.py +++ b/tensorflow/python/keras/engine/input_layer.py @@ -20,7 +20,9 @@ from __future__ import division from __future__ import print_function from tensorflow.python.distribute import distribution_strategy_context +from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_spec from tensorflow.python.keras import backend from tensorflow.python.keras.distribute import distributed_training_utils from tensorflow.python.keras.engine import base_layer @@ -170,6 +172,13 @@ class InputLayer(base_layer.Layer): input_tensor._keras_mask = None node_module.Node(layer=self, outputs=input_tensor) + # Store type spec + if isinstance(input_tensor, composite_tensor.CompositeTensor): + self._type_spec = input_tensor._type_spec # pylint: disable=protected-access + else: + self._type_spec = tensor_spec.TensorSpec( + shape=input_tensor.shape, dtype=input_tensor.dtype, name=self.name) + def get_config(self): config = { 'batch_input_shape': self._batch_input_shape, diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 4ada84191dc..c6cc2f7a1d5 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -57,6 +57,7 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import variables +from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test from tensorflow.python.saved_model import load as tf_load from tensorflow.python.saved_model import save as tf_save @@ -730,6 +731,42 @@ class 
TestModelSavingAndLoadingV2(keras_parameterized.TestCase): self.assertAllClose(layer.states, loaded_layer.states) self.assertAllClose(model(input_arr), loaded(input_arr)) + def testSaveWithRaggedInputs(self): + + class EmbeddingMerger(keras.layers.Layer): + + def __init__(self, list_features, **kwargs): + super().__init__(**kwargs) + self._supports_ragged_inputs = True + self.embeddings = { + feature: keras.layers.Embedding(10, 3) for feature in list_features} + self.mean = keras.layers.Lambda( + math_ops.reduce_mean, arguments=dict(axis=1)) + + def call(self, inputs): + tensors = [self.embeddings[col](inputs[col]) for col in inputs] + tensors = [self.mean(inp) for inp in tensors] + return keras.layers.Add()(tensors) + + list_features = ['feature_1', 'feature_2'] + feature_1 = ragged_factory_ops.constant([[0.], [1, 3]]) + feature_2 = ragged_factory_ops.constant([[1., 2], [4]]) + f = {'feature_1': feature_1, + 'feature_2': feature_2} + f_inputs = { + 'feature_1': keras.Input(shape=(None,), name='feature_1', ragged=True), + 'feature_2': keras.Input(shape=(None,), name='feature_2', ragged=True)} + + out = EmbeddingMerger(list_features)(f_inputs) + model = keras.Model(f_inputs, out) + self.evaluate(variables.variables_initializer(model.variables)) + saved_model_dir = self._save_model_dir() + tf_save.save(model, saved_model_dir) + + loaded = keras_load.load(saved_model_dir) + self.evaluate(variables.variables_initializer(loaded.variables)) + self.assertAllClose(model.predict(f), loaded.predict(f)) + class TestLayerCallTracing(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py index b87ca1623b0..2c8f0f58f6c 100644 --- a/tensorflow/python/keras/utils/tf_utils.py +++ b/tensorflow/python/keras/utils/tf_utils.py @@ -481,11 +481,15 @@ def dataset_is_infinite(dataset): def get_tensor_spec(t, dynamic_batch=False, name=None): """Returns a `TensorSpec` given a single `Tensor` or `TensorSpec`.""" + # pylint: disable=protected-access if isinstance(t, type_spec.TypeSpec): spec = t elif isinstance(t, composite_tensor.CompositeTensor): # TODO(b/148821952): Should these specs have a name attr? - spec = t._type_spec # pylint: disable=protected-access + spec = t._type_spec + elif (hasattr(t, '_keras_history') and + hasattr(t._keras_history[0], '_type_spec')): + return t._keras_history[0]._type_spec elif hasattr(t, 'shape') and hasattr(t, 'dtype'): spec = tensor_spec.TensorSpec(shape=t.shape, dtype=t.dtype, name=name) else: @@ -496,11 +500,12 @@ def get_tensor_spec(t, dynamic_batch=False, name=None): dynamic_batch_spec = copy.deepcopy(spec) # RaggedTensorSpec only has a private _shape. - shape = dynamic_batch_spec._shape.as_list() # pylint: disable=protected-access + shape = dynamic_batch_spec._shape.as_list() if shape: shape[0] = None - dynamic_batch_spec._shape = tensor_shape.TensorShape(shape) # pylint: disable=protected-access + dynamic_batch_spec._shape = tensor_shape.TensorShape(shape) return dynamic_batch_spec + # pylint: enable=protected-access def to_numpy_or_python_type(tensors): From a1ae008076e14f7e445abf2605759779d2a1fb8b Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Thu, 28 May 2020 21:44:18 -0700 Subject: [PATCH 1369/1533] Generalizes the first argument in keras layers further. Now, functional models get constructed if *any* tensor in the arguments or keyword arguments has a keras history, rather than if *all* of the elements in the first argument to the layer do. 
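As an illustration (a minimal sketch against the public Keras API, assuming a build that includes this change; the MaybeAdd layer is the same one defined in the new unit test added below), a functional model can now be constructed even when the first positional argument is a plain constant, as long as some argument carries Keras history:

import numpy as np
import tensorflow as tf

class MaybeAdd(tf.keras.layers.Layer):

  def call(self, x1, x2=None):
    # x1 can be a plain constant; only x2 needs to be a symbolic Keras tensor.
    return x1 if x2 is None else x1 + x2

inp = tf.keras.Input(10)
out = MaybeAdd()(3., x2=inp)  # constant first arg, symbolic keyword arg
model = tf.keras.Model(inp, out)
model.compile('sgd', 'mse')
model.fit(x=7 * np.ones((10, 10)), y=10 * np.ones((10, 10)), batch_size=2)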
PiperOrigin-RevId: 313718130 Change-Id: I77f65f49decf45f6a2b53ab0519d6d2ac38232d3 --- tensorflow/python/keras/engine/base_layer.py | 36 ++++---- .../python/keras/engine/base_layer_utils.py | 14 ++- tensorflow/python/keras/engine/functional.py | 28 ++++-- .../python/keras/engine/functional_test.py | 87 +++++++++++++++++++ tensorflow/python/keras/engine/node.py | 15 ++-- 5 files changed, 149 insertions(+), 31 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 9958f70ed55..0630199464f 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -830,14 +830,13 @@ class Layer(module.Module, version_utils.LayerVersionSelector): in_call = call_context.in_call input_list = nest.flatten(inputs) - # We will attempt to build a TF graph if & only if all inputs are symbolic. - # This is always the case in graph mode. It can also be the case in eager - # mode when all inputs can be traced back to `keras.Input()` (when building - # models using the functional API). - # TODO(kaftan): make this not special case inputs. Instead - # build a functional api model if *any* *arg or **kwarg is symbolic, - # even if part of the data structure in that arg is not symbolic. - build_graph = tf_utils.are_all_symbolic_tensors(input_list) + # We will attempt to trace in a graph if & only if inputs are symbolic. + # This is always the case when tracing a function. It can also be the case + # when running eagerly if any input can be traced back to `keras.Input()` + # (when building models using the functional API). + build_graph = tf_utils.are_all_symbolic_tensors(input_list) or ( + any(map(tf_utils.is_symbolic_tensor, nest.flatten( + [input_list, args, kwargs]))) and context.executing_eagerly()) # Accept NumPy and scalar inputs by converting to Tensors. if any(isinstance(x, (np.ndarray, float, int)) for x in input_list): @@ -890,11 +889,14 @@ class Layer(module.Module, version_utils.LayerVersionSelector): 'training', training_value, args, kwargs) training_arg_passed_by_framework = True - # Only create Keras history if at least one tensor originates from a - # `keras.Input`. Otherwise this Layer may be being used outside the Keras - # framework. - # TODO(kaftan): make this not special case inputs - if build_graph and base_layer_utils.needs_keras_history(inputs): + # Turn inputs into TF op layers if necessary. + # This process is fragile and prone to bad interactions with inputs + # when calling nested layers with tf.functions floating around, + # and with nonsymbolic tensors. + # So, we limit it to the + # case where *all* inputs in the first arg are symbolic. + if (tf_utils.are_all_symbolic_tensors(input_list) + and base_layer_utils.needs_keras_history(inputs)): base_layer_utils.create_keras_history(inputs) with call_context.enter(self, inputs, build_graph, training_value): @@ -968,8 +970,12 @@ class Layer(module.Module, version_utils.LayerVersionSelector): raise ValueError('A layer\'s `call` method should return a ' 'Tensor or a list of Tensors, not None ' '(layer: ' + self.name + ').') - # TODO(kaftan): This should be 'any' and check all args - if base_layer_utils.have_all_keras_metadata(inputs): + # We configure connectivity metadata if all inputs in the first + # arg have keras history, or if we're actively building the + # functional api outside of any outer keras model. 
+ if base_layer_utils.have_all_keras_metadata(inputs) or ( + context.executing_eagerly() and + base_layer_utils.have_any_keras_metadata(inputs, args, kwargs)): if training_arg_passed_by_framework: args, kwargs = self._set_call_arg_value( 'training', None, args, kwargs, pop_kwarg_if_none=True) diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index 6d25995e4c2..6508d641543 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -165,6 +165,10 @@ def have_all_keras_metadata(tensors): return all(hasattr(x, '_keras_history') for x in nest.flatten(tensors)) +def have_any_keras_metadata(*tensors): + return any(hasattr(x, '_keras_history') for x in nest.flatten(tensors)) + + def generate_placeholders_from_shape(shape): return array_ops.placeholder(shape=shape, dtype=backend.floatx()) @@ -214,7 +218,10 @@ def _create_keras_history_helper(tensors, processed_ops, created_layers): for tensor in tensor_list: if getattr(tensor, '_keras_history', None) is not None: continue - op = tensor.op # The Op that created this Tensor. + try: + op = tensor.op # The Op that created this Tensor. + except AttributeError: + continue if op not in processed_ops: if op.type.startswith('Sparse'): lambda_example = """ @@ -392,7 +399,10 @@ def mark_checked(tensors): """ def _mark_checked(tensor): - tensor._keras_history_checked = True # pylint: disable=protected-access + try: + tensor._keras_history_checked = True # pylint: disable=protected-access + except AttributeError: + pass nest.map_structure(_mark_checked, tensors) diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index 4958990ad66..eec13345295 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -32,6 +32,7 @@ from tensorflow.python.keras import backend from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine import input_layer as input_layer_module +from tensorflow.python.keras.engine import node as node_module from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.saving.saved_model import network_serialization @@ -1111,19 +1112,28 @@ def reconstruct_from_config(config, custom_objects=None, created_layers=None): kwargs = {} elif len(input_data) == 4: kwargs = input_data[3] - kwargs = _deserialize_keras_tensors(kwargs, created_layers) + try: + kwargs = _deserialize_keras_tensors(kwargs, created_layers) + except IndexError: + # Happens if keras tensors in kwargs are still unprocessed + add_unprocessed_node(layer, node_data) + return else: raise ValueError('Improperly formatted model config.') - inbound_layer = created_layers[inbound_layer_name] - inbound_node_index = get_node_index(inbound_layer, inbound_node_index) + if inbound_layer_name != node_module._CONSTANT_VALUE: + inbound_layer = created_layers[inbound_layer_name] + inbound_node_index = get_node_index(inbound_layer, inbound_node_index) - if inbound_node_index is None: - add_unprocessed_node(layer, node_data) - return - inbound_node = inbound_layer._inbound_nodes[inbound_node_index] - input_tensors.append( - nest.flatten(inbound_node.outputs)[inbound_tensor_index]) + if inbound_node_index is None: + add_unprocessed_node(layer, node_data) + return + inbound_node = 
inbound_layer._inbound_nodes[inbound_node_index] + input_tensors.append( + nest.flatten(inbound_node.outputs)[inbound_tensor_index]) + else: + # We received a constant w/ no Keras history attached + input_tensors.append(inbound_tensor_index) input_tensors = nest.pack_sequence_as(node_data, input_tensors) # Call layer on its inputs, thus creating the node # and building the layer if needed. diff --git a/tensorflow/python/keras/engine/functional_test.py b/tensorflow/python/keras/engine/functional_test.py index 90fc9f2697f..e975bb85bfc 100644 --- a/tensorflow/python/keras/engine/functional_test.py +++ b/tensorflow/python/keras/engine/functional_test.py @@ -964,6 +964,43 @@ class NetworkConstructionTest(keras_parameterized.TestCase): # Check that second input was correctly added to first. self.assertEqual(history.history['loss'][0], 0.0) + @combinations.generate(combinations.keras_mode_combinations()) + def test_call_kwarg_derived_from_keras_layer_and_first_arg_is_constant(self): + + class MaybeAdd(layers.Layer): + + def call(self, x1, x2=None): + if x2 is not None: + return x1 + x2 + return x1 + + input2 = input_layer_lib.Input(10) + outputs = MaybeAdd()(3., x2=input2) + model = training_lib.Model([input2], outputs) + model.compile( + 'sgd', + 'mse', + run_eagerly=testing_utils.should_run_eagerly()) + history = model.fit( + x=7 * np.ones((10, 10)), + y=10 * np.ones((10, 10)), + batch_size=2) + # Check that second input was correctly added to first. + self.assertEqual(history.history['loss'][0], 0.0) + + model = training_lib.Model.from_config( + model.get_config(), custom_objects={'MaybeAdd': MaybeAdd}) + model.compile( + 'sgd', + 'mse', + run_eagerly=testing_utils.should_run_eagerly()) + history = model.fit( + x=7 * np.ones((10, 10)), + y=10 * np.ones((10, 10)), + batch_size=2) + # Check that second input was correctly added to first. + self.assertEqual(history.history['loss'][0], 0.0) + @combinations.generate(combinations.keras_mode_combinations()) def test_composite_call_kwarg_derived_from_keras_layer(self): @@ -1005,6 +1042,56 @@ class NetworkConstructionTest(keras_parameterized.TestCase): # Check that second input was correctly added to first. self.assertEqual(history.history['loss'][0], 0.0) + @combinations.generate(combinations.keras_mode_combinations(mode='eager')) + def test_call_some_not_all_nested_in_first_arg_derived_from_keras_layer(self): + # This functionality is unsupported in v1 graphs + + class AddAll(layers.Layer): + + def call(self, x1_x2, x3): + x1, x2 = x1_x2 + out = x1 + x2 + if x3 is not None: + for t in x3.values(): + out += t + return out + + input1 = input_layer_lib.Input(10) + input2 = input_layer_lib.Input(10) + input3 = input_layer_lib.Input(10) + + outputs = AddAll()( + [input1, 4 * array_ops.ones((1, 10))], + x3={ + 'a': input2, + 'b': input3, + 'c': 5 * array_ops.ones((1, 10)) + }) + model = training_lib.Model([input1, input2, input3], outputs) + model.compile( + 'sgd', + 'mse', + run_eagerly=testing_utils.should_run_eagerly()) + history = model.fit( + x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], + y=15 * np.ones((10, 10)), + batch_size=2) + # Check that all inputs were correctly added. 
+ self.assertEqual(history.history['loss'][0], 0.0) + + model = training_lib.Model.from_config( + model.get_config(), custom_objects={'AddAll': AddAll}) + model.compile( + 'sgd', + 'mse', + run_eagerly=testing_utils.should_run_eagerly()) + history = model.fit( + x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], + y=15 * np.ones((10, 10)), + batch_size=2) + # Check that all inputs were correctly added. + self.assertEqual(history.history['loss'][0], 0.0) + @combinations.generate(combinations.keras_mode_combinations()) def test_call_nested_arg_derived_from_keras_layer(self): diff --git a/tensorflow/python/keras/engine/node.py b/tensorflow/python/keras/engine/node.py index 708904853b2..1637e054d88 100644 --- a/tensorflow/python/keras/engine/node.py +++ b/tensorflow/python/keras/engine/node.py @@ -32,6 +32,8 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest from tensorflow.python.util import serialization +_CONSTANT_VALUE = '_CONSTANT_VALUE' + class Node(object): """A `Node` describes the connectivity between two layers. @@ -181,11 +183,14 @@ class Node(object): # `kwargs` is added to each Tensor in the first arg. This should be # changed in a future version of the serialization format. def serialize_first_arg_tensor(t): - kh = t._keras_history - node_index = kh.node_index - node_key = make_node_key(kh.layer.name, node_index) - new_node_index = node_conversion_map.get(node_key, 0) - data = [kh.layer.name, new_node_index, kh.tensor_index, kwargs] + if is_keras_tensor(t): + kh = t._keras_history + node_index = kh.node_index + node_key = make_node_key(kh.layer.name, node_index) + new_node_index = node_conversion_map.get(node_key, 0) + data = [kh.layer.name, new_node_index, kh.tensor_index, kwargs] + else: + data = [_CONSTANT_VALUE, -1, _serialize_keras_tensor(t), kwargs] return tf_utils.ListWrapper(data) data = nest.map_structure(serialize_first_arg_tensor, inputs) From 53fe8a50811880a1ca2106acf2a2fcf4415bc1e2 Mon Sep 17 00:00:00 2001 From: Kamil Rakoczy Date: Fri, 29 May 2020 09:24:35 +0200 Subject: [PATCH 1370/1533] lite: Add max.h/min.h to Makefile Signed-off-by: Kamil Rakoczy --- tensorflow/lite/micro/tools/make/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 13761cca28b..6744f17bfc9 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -183,6 +183,8 @@ tensorflow/lite/kernels/internal/reference/logistic.h \ tensorflow/lite/kernels/internal/reference/strided_slice.h \ tensorflow/lite/kernels/internal/reference/integer_ops/pooling.h \ tensorflow/lite/kernels/internal/cppmath.h \ +tensorflow/lite/kernels/internal/max.h \ +tensorflow/lite/kernels/internal/min.h \ tensorflow/lite/kernels/internal/strided_slice_logic.h \ tensorflow/lite/kernels/internal/tensor.h \ tensorflow/lite/kernels/internal/tensor_ctypes.h \ From 4de4c60972da38d09662842614ad4dcfd019a6be Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 01:02:01 -0700 Subject: [PATCH 1371/1533] Merged commit includes the following changes: 313738015 by A. Unique TensorFlower: Bump open source llvm revision to b726d071b4aa46004228fc38ee5bfd167f999bfe -- 313737890 by A. Unique TensorFlower: Automated rollback of changelist 313718130. 313733429 by A. Unique TensorFlower: Automated rollback of changelist 313729562. 313729562 by A. 
Unique TensorFlower: [TF:STATELES_RNG] clarify that the same output of stateless rng is only guaranteed for the same shape and seed. -- 313718732 by A. Unique TensorFlower: [XLA:SPMD] Handle window reversal in backprop filter conv -- 313718302 by A. Unique TensorFlower: [Core ML Delegate] Add FP16 support for Convolution -- 313718156 by A. Unique TensorFlower: Integrate LLVM at https://github.com/llvm/llvm-project/commit/b726d071b4aa -- PiperOrigin-RevId: 313738015 --- .../xla/service/spmd/spmd_partitioner.cc | 95 ++++++++++--------- .../xla/service/spmd/spmd_partitioner.h | 11 ++- .../xla/service/spmd/spmd_partitioner_test.cc | 29 ++++++ .../xla/service/spmd/spmd_partitioner_util.cc | 38 ++++++++ .../xla/service/spmd/spmd_partitioner_util.h | 7 ++ tensorflow/lite/delegates/nnapi/BUILD | 2 + .../lite/delegates/nnapi/nnapi_delegate.cc | 23 ++--- tensorflow/lite/delegates/utils.cc | 16 +++- tensorflow/lite/delegates/utils.h | 8 +- .../coreml/builders/convolution_op_builder.cc | 39 ++++++-- .../coreml/builders/dummy_op_builder.cc | 39 ++++++++ .../coreml/builders/dummy_op_builder.h | 41 ++++++++ .../delegates/coreml/builders/op_builder.cc | 6 +- .../delegates/coreml/builders/op_factory.h | 2 + .../delegates/coreml/coreml_delegate.mm | 31 ++---- tensorflow/python/keras/engine/base_layer.py | 36 +++---- .../python/keras/engine/base_layer_utils.py | 14 +-- tensorflow/python/keras/engine/functional.py | 28 ++---- .../python/keras/engine/functional_test.py | 87 ----------------- tensorflow/python/keras/engine/node.py | 15 +-- tensorflow/workspace.bzl | 4 +- third_party/mlir/BUILD | 37 ++------ third_party/mlir/test.BUILD | 4 +- 23 files changed, 333 insertions(+), 279 deletions(-) create mode 100644 tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.cc create mode 100644 tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.h diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index a0c46e0b6e7..daa3d157bdc 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -308,7 +308,8 @@ PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { return PartitionedHlo(slice, base_shape_, state_); } -PartitionedHlo PartitionedHlo::PadWithValue(HloInstruction* pad_value) const { +PartitionedHlo PartitionedHlo::PadWithValue( + HloInstruction* pad_value, absl::Span left_padded_dims) const { const HloSharding& sharding = hlo_->sharding(); const Shape& shape = hlo_->shape(); CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); @@ -327,13 +328,20 @@ PartitionedHlo PartitionedHlo::PadWithValue(HloInstruction* pad_value) const { auto index_in_full_shape = state_.b->AddInstruction(HloInstruction::CreateBinary( index_shape, HloOpcode::kAdd, iota, broadcast_start_index)); - auto valid_size = state_.b->AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(base_shape_.dimensions(dim)))); - auto broadcast_valid_size = state_.b->AddInstruction( - HloInstruction::CreateBroadcast(index_shape, valid_size, {})); + ComparisonDirection direction = ComparisonDirection::kLt; + int64 index_limit = base_shape_.dimensions(dim); + if (absl::c_linear_search(left_padded_dims, dim)) { + direction = ComparisonDirection::kGe; + index_limit = + index_shape.dimensions(dim) * sharding.tile_assignment().dim(dim) - + index_limit; + } + auto limit = state_.b->AddInstruction(HloInstruction::CreateConstant( + 
LiteralUtil::CreateR0(index_limit))); + auto broadcast_limit = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, limit, {})); return state_.b->AddInstruction(HloInstruction::CreateCompare( - mask_shape, index_in_full_shape, broadcast_valid_size, - ComparisonDirection::kLt)); + mask_shape, index_in_full_shape, broadcast_limit, direction)); }; HloInstruction* mask = nullptr; @@ -2328,39 +2336,14 @@ Status SpmdPartitioningVisitor::HandleReverse(HloInstruction* hlo) { auto operand = GetPartitionedHlo(reverse->operand(0)) .Reshard(hlo_sharding_util::ReverseSharding( reverse->sharding(), reverse->dimensions())); - // Create a window config to halo exchange for unevenly partitioned reverse - // dimensions. - Window window; - for (int64 i = 0; i < hlo->shape().rank(); ++i) { - WindowDimension* dim = window.add_dimensions(); - dim->set_size(1); - dim->set_stride(1); - dim->set_window_dilation(1); - dim->set_window_reversal(false); - int64 low_padding = 0; - if (absl::c_linear_search(reverse->dimensions(), i)) { - low_padding = - RoundUpToNearest(reverse->shape().dimensions(i), - reverse->sharding().tile_assignment().dim(i)) - - reverse->shape().dimensions(i); - } - dim->set_padding_low(low_padding); - dim->set_padding_high(0); - dim->set_base_dilation(1); - } - - auto reshard_operand = operand.ReshardAsWindowedInput( - window, operand.sharding(), - CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), - /*mask_invalid_region=*/false); - if (!reshard_operand.has_value()) { + auto left_padded_operand = + HaloExchangeToPadOnLeft(operand, reverse->dimensions()); + if (!left_padded_operand) { return DefaultAction(hlo); } - TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); SetPartitionedHlo(hlo, [&] { - return b_.AddInstruction( - hlo->CloneWithNewOperands(reshard_operand->sharded_input->shape(), - {reshard_operand->sharded_input})); + return b_.AddInstruction(hlo->CloneWithNewOperands( + left_padded_operand->shape(), {left_padded_operand})); }); return Status::OK(); } @@ -2772,10 +2755,31 @@ Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; } - auto aligned_rhs_sharding = - hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); - auto aligned_lhs_sharding = - hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + Window window = hlo->window(); + std::vector reversed_rhs_dims; + for (int64 i = 0; i < window.dimensions_size(); ++i) { + if (window.dimensions(i).window_reversal()) { + reversed_rhs_dims.push_back(dnums.kernel_spatial_dimensions(i)); + } + } + if (!reversed_rhs_dims.empty()) { + // Make the reversed dims left-padded to prepare for window reversal. + auto left_padded_rhs = HaloExchangeToPadOnLeft(rhs, reversed_rhs_dims); + if (left_padded_rhs == nullptr) { + return DefaultAction(hlo); + } + left_padded_rhs->set_sharding(rhs.sharding()); + rhs = PartitionedHlo(left_padded_rhs, rhs.base_shape(), rhs.state()); + } + // Consider window reversal when resharding RHS or LHS. Note: this will not + // reverse the data in the shard. We use window reversal to do that. 
+ auto aligned_rhs_sharding = hlo_sharding_util::ReverseSharding( + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices), + reversed_rhs_dims); + auto aligned_lhs_sharding = hlo_sharding_util::TransposeSharding( + hlo_sharding_util::ReverseSharding(rhs.sharding(), reversed_rhs_dims), + lhs_to_rhs_indices); auto unsupported_sharding = [&](const HloSharding& lhs_sharding, const HloSharding& rhs_sharding) { @@ -2792,13 +2796,14 @@ Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( return DefaultAction(hlo); } lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); - rhs = rhs.PadWithValue(zero); + rhs = rhs.PadWithValue(zero, reversed_rhs_dims); } else { if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { return DefaultAction(hlo); } lhs = lhs.PadWithValue(zero); - rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + rhs = + rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero, reversed_rhs_dims); } // Reshard LHS by exchanging halo such that each shard computes the partial @@ -2817,8 +2822,6 @@ Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( // = (LHS - RHS) * i + low_padding // * right-halo: limit(i) - (i + 1) * LHS // = [{(RHS - 1) * D + 1} - LHS] * (i + 1) + (WC - 1) * stride - low_padding - - Window window = hlo->window(); std::vector shard_counts(dnums.input_spatial_dimensions_size()); std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); @@ -2827,7 +2830,7 @@ Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); auto wd = window.dimensions(i); - if (wd.base_dilation() != 1 || wd.window_reversal()) { + if (wd.base_dilation() != 1) { return DefaultAction(hlo); } diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h index 2918cd1ef58..52e4c9021d8 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -243,8 +243,13 @@ class PartitionedHlo { // the reshard cache. PartitionedHlo Reshard(const HloSharding& target); - // Pads the garbage area of the output with the provided value. - PartitionedHlo PadWithValue(HloInstruction* pad_value) const; + // Pads the garbage area of the output with the provided value. Normally, + // unevenly partitioned dimensions are padded on the right, but this function + // allows specifying left-padded dimensions, which can be used during the + // handling of kReverse, etc. + PartitionedHlo PadWithValue( + HloInstruction* pad_value, + absl::Span left_padded_dims = {}) const; // Returns the SPMD instruction. HloInstruction* hlo() const { return hlo_; } @@ -263,6 +268,8 @@ class PartitionedHlo { const Window& window, const HloSharding& target, HloInstruction* pad_value, bool mask_invalid_region = true); + const PartitioningState& state() const { return state_; } + private: // Same as Reshard except that it does not explicitly modify the reshard // cache, although it would indirectly modify by calling Replicate(). 
diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 2daf3444014..60ad0191b89 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -1300,6 +1300,35 @@ ENTRY entry { op::Shape("f32[1,1,64,256]"))); } +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowReversal) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[5,128,64] parameter(0), sharding={devices=[2,1,1]0,1} + %rhs = f32[5,128,256] parameter(1), sharding={devices=[2,1,1]1,0} + ROOT %conv = f32[1,64,256] convolution(%lhs, %rhs), + window={size=5 rhs_reversal=1}, dim_labels=0fb_0io->0bf, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto lhs_masked = + AllOf(op::Shape("f32[3,128,64]"), op::Select(_, op::Parameter(0), _)); + auto rhs_left_padded = op::Slice(op::Concatenate( + op::CollectivePermute(op::Slice(op::Parameter(1))), op::Parameter(1))); + auto rhs_masked = + AllOf(op::Shape("f32[3,128,256]"), op::Select(_, rhs_left_padded, _)); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution(lhs_masked, rhs_masked)), + op::Shape("f32[1,64,256]"))); +} + TEST_F(SpmdPartitioningTest, DotLhsTiledRhsTiledWithReshard) { const char* const hlo_string = R"( HloModule module diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index 8db2ca84a05..10b8199a2c9 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -664,5 +664,43 @@ absl::optional ExchangeHaloAndGetValidData( return valid_slice; } +HloInstruction* HaloExchangeToPadOnLeft(PartitionedHlo& original, + absl::Span dims) { + if (original.sharding().IsTileMaximal()) { + return original.hlo(); + } + // Create a window config to halo exchange for unevenly partitioned reverse + // dimensions. 
+ Window window; + for (int64 i = 0; i < original.base_shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + int64 low_padding = 0; + if (absl::c_linear_search(dims, i)) { + low_padding = + RoundUpToNearest(original.base_shape().dimensions(i), + original.sharding().tile_assignment().dim(i)) - + original.base_shape().dimensions(i); + } + dim->set_padding_low(low_padding); + dim->set_padding_high(0); + dim->set_base_dilation(1); + } + + auto reshard_window = original.ReshardAsWindowedInput( + window, original.sharding(), + CreateZero(ShapeUtil::MakeShape(original.base_shape().element_type(), {}), + original.state().b), + /*mask_invalid_region=*/false); + if (!reshard_window.has_value()) { + return nullptr; + } + CHECK(!reshard_window->dynamic_slice_index_on_output.has_value()); + return reshard_window->sharded_input; +} + } // namespace spmd } // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h index 440f0e78112..40be73283b7 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -227,6 +227,13 @@ absl::optional ExchangeHaloAndGetValidData( const SPMDCollectiveOpsCreator& collective_ops_creator, int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region = true); +// Uses halo exchange to change from right-padding to left-padding for uneven +// tiled sharding on the given dimensions. Tiled sharding always pads uneven +// partitioned data on the right, but we need to swap it to the left for +// kReverse or kConvolution with window reversal. +HloInstruction* HaloExchangeToPadOnLeft(PartitionedHlo& original, + absl::Span dims); + } // namespace spmd } // namespace xla diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD index 11f6b188d7d..5df7b8cf427 100644 --- a/tensorflow/lite/delegates/nnapi/BUILD +++ b/tensorflow/lite/delegates/nnapi/BUILD @@ -32,6 +32,7 @@ cc_library( "//tensorflow/lite:minimal_logging", "//tensorflow/lite:util", "//tensorflow/lite/c:common", + "//tensorflow/lite/delegates:utils", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/nnapi:nnapi_implementation", "//tensorflow/lite/nnapi:nnapi_lib", @@ -68,6 +69,7 @@ cc_library( "//tensorflow/lite:minimal_logging", "//tensorflow/lite:util", "//tensorflow/lite/c:common", + "//tensorflow/lite/delegates:utils", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/nnapi:nnapi_implementation", "//tensorflow/lite/nnapi:nnapi_lib", diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index fd6703bd46a..00f3f7475bd 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -53,6 +53,7 @@ limitations under the License. 
#include "tensorflow/lite/context_util.h" #include "tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h" #include "tensorflow/lite/delegates/nnapi/quant_lstm_sup.h" +#include "tensorflow/lite/delegates/utils.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/minimal_logging.h" #include "tensorflow/lite/nnapi/nnapi_implementation.h" @@ -4178,17 +4179,6 @@ int StatefulNnApiDelegate::GetNnApiErrno() const { using ::tflite::delegate::nnapi::kMinSdkVersionForNNAPI; using ::tflite::delegate::nnapi::kMinSdkVersionForNNAPI12; -namespace { - -std::unique_ptr BuildTfLiteIntArray( - const std::vector& data) { - std::unique_ptr result( - TfLiteIntArrayCreate(data.size())); - std::copy(data.begin(), data.end(), result->data); - return result; -} -} // namespace - // static TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator( TfLiteContext* context, TfLiteDelegate* delegate, const NnApi* nnapi, @@ -4198,7 +4188,8 @@ TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator( auto* delegate_data = static_cast(delegate->data_); // The first entry in the array is the element count - auto supported_nodes_int_array = BuildTfLiteIntArray(supported_nodes); + auto supported_nodes_int_array = + delegates::BuildTfLiteIntArray(supported_nodes); TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( context, supported_nodes_int_array.get(), params_array, num_partitions)); // For each partition check if which nodes are actually supported by the @@ -4231,7 +4222,7 @@ TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator( // We changed the set of nodes to delegate this will create a different // partitioning layout. auto device_sup_nodes_int_array = - BuildTfLiteIntArray(*device_supported_nodes); + delegates::BuildTfLiteIntArray(*device_supported_nodes); TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( context, device_sup_nodes_int_array.get(), params_array, num_partitions)); @@ -4428,7 +4419,8 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, &num_partitions, ¶ms_array, nnapi_errno)); } else { nodes_to_delegate = supported_nodes; - auto supported_nodes_int_array = BuildTfLiteIntArray(supported_nodes); + auto supported_nodes_int_array = + delegates::BuildTfLiteIntArray(supported_nodes); TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( context, supported_nodes_int_array.get(), ¶ms_array, &num_partitions)); @@ -4445,7 +4437,8 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, } else { // Request TFLite to partition the graph and make kernels // for each independent node sub set a new nnapi_delegate_kernel. 
- auto nodes_to_delegate_int_array = BuildTfLiteIntArray(nodes_to_delegate); + auto nodes_to_delegate_int_array = + delegates::BuildTfLiteIntArray(nodes_to_delegate); return context->ReplaceNodeSubsetsWithDelegateKernels( context, nnapi_delegate_kernel, nodes_to_delegate_int_array.get(), delegate); diff --git a/tensorflow/lite/delegates/utils.cc b/tensorflow/lite/delegates/utils.cc index f9cf9380a31..d450cd4cea3 100644 --- a/tensorflow/lite/delegates/utils.cc +++ b/tensorflow/lite/delegates/utils.cc @@ -46,6 +46,14 @@ TfLiteStatus CreateNewTensorWithDifferentType(TfLiteContext* context, return kTfLiteOk; } +std::unique_ptr BuildTfLiteIntArray( + const std::vector& data) { + std::unique_ptr result( + TfLiteIntArrayCreate(data.size())); + std::copy(data.begin(), data.end(), result->data); + return result; +} + TfLiteStatus GraphPartitionHelper::Partition( std::set* unsupported_nodes_info) { const auto prepare_status = PrepareSupportedNodes(unsupported_nodes_info); @@ -148,12 +156,16 @@ TfLiteStatus FP16GraphPartitionHelper::Partition( } std::vector FP16GraphPartitionHelper::GetNodesOfFirstNLargestPartitions( - int n) { + int n, int min_nodes_per_partition, + std::vector* partitions) { // We first get partitions to reduce the number of nodes to be checked in // deciding which dequant ops could actually be replaced. And then we // remap input-tensor to dequant nodes' inputs and remove those // to-be-reserved dequant nodes. - auto first_nps = GetFirstNLargestPartitions(n); + auto first_nps = GetFirstNLargestPartitions(n, min_nodes_per_partition); + if (partitions != nullptr) { + *partitions = first_nps; + } std::vector ops_to_replace; for (const auto p : first_nps) { auto nodes = p->nodes_to_replace; diff --git a/tensorflow/lite/delegates/utils.h b/tensorflow/lite/delegates/utils.h index 11ad9990426..c8d2f4acfc6 100644 --- a/tensorflow/lite/delegates/utils.h +++ b/tensorflow/lite/delegates/utils.h @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include #include #include #include @@ -40,6 +41,9 @@ TfLiteStatus CreateNewTensorWithDifferentType(TfLiteContext* context, TfLiteTensor** new_tensor, int* new_tensor_index); +std::unique_ptr BuildTfLiteIntArray( + const std::vector& data); + using IsNodeSupportedFn = std::function; @@ -134,7 +138,9 @@ class FP16GraphPartitionHelper : public GraphPartitionHelper { // returned. The partition is ranked according to the number of nodes. 
// TODO(b/156707497): Add this to superclass besides // GetFirstNLargestPartitions (one that returns partitions instead of nodes) - std::vector GetNodesOfFirstNLargestPartitions(int n); + std::vector GetNodesOfFirstNLargestPartitions( + int n, int min_nodes_per_partition = 0, + std::vector* partitions = nullptr); protected: bool IsNodeSupported(TfLiteContext* context, TfLiteNode* node, diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/convolution_op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/convolution_op_builder.cc index b6a859d1dff..11b286bdd8e 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/convolution_op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/convolution_op_builder.cc @@ -167,21 +167,42 @@ void ConvolutionOpBuilder::TransposeKernelWeights() { layer_->mutable_convolution()->set_isdeconvolution(true); } - auto* coreml_weights = - layer_->mutable_convolution()->mutable_weights()->mutable_floatvalue(); - coreml_weights->Resize(NumElements(weights_), 0); + if (weights_->type == kTfLiteFloat32) { + auto* coreml_weights = + layer_->mutable_convolution()->mutable_weights()->mutable_floatvalue(); + coreml_weights->Resize(NumElements(weights_), 0); - optimized_ops::Transpose(params, tfl_shape, weights_->data.f, - coreml_shape, coreml_weights->mutable_data()); + optimized_ops::Transpose(params, tfl_shape, weights_->data.f, + coreml_shape, + coreml_weights->mutable_data()); + } else if (weights_->type == kTfLiteFloat16) { + auto* coreml_weights = layer_->mutable_convolution() + ->mutable_weights() + ->mutable_float16value(); + // float16value has type of bytes (std::string) + coreml_weights->resize(weights_->bytes, 0); + + optimized_ops::Transpose( + params, tfl_shape, reinterpret_cast(weights_->data.raw), + coreml_shape, reinterpret_cast(coreml_weights->data())); + } } void ConvolutionOpBuilder::FillCoreMLBias() { if (bias_ != nullptr) { layer_->mutable_convolution()->set_hasbias(true); - std::copy(bias_->data.f, bias_->data.f + NumElements(bias_->dims), - google::protobuf::RepeatedFieldBackInserter(layer_->mutable_convolution() - ->mutable_bias() - ->mutable_floatvalue())); + if (bias_->type == kTfLiteFloat32) { + std::copy(bias_->data.f, bias_->data.f + NumElements(bias_->dims), + google::protobuf::RepeatedFieldBackInserter(layer_->mutable_convolution() + ->mutable_bias() + ->mutable_floatvalue())); + } else if (bias_->type == kTfLiteFloat16) { + // float16value has type of bytes (std::string) + layer_->mutable_convolution() + ->mutable_bias() + ->mutable_float16value() + ->assign(bias_->data.raw, bias_->bytes); + } } } diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.cc new file mode 100644 index 00000000000..b19af8d7f5c --- /dev/null +++ b/tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.cc @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.h" + +#include "tensorflow/lite/experimental/delegates/coreml/builders/op_factory.h" + +namespace tflite { +namespace delegates { +namespace coreml { + +CoreML::Specification::NeuralNetworkLayer* DummyOpBuilder::Build() { + return nullptr; +} + +const char* DummyOpBuilder::DebugName() { return "Dummy OpBuilder"; } + +TfLiteStatus DummyOpBuilder::PopulateSubgraph(TfLiteContext* context) { + return kTfLiteOk; +} + +OpBuilder* CreateDummyOpBuilder(GraphBuilder* graph_builder) { + return new DummyOpBuilder(graph_builder); +} + +} // namespace coreml +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.h b/tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.h new file mode 100644 index 00000000000..1b151886515 --- /dev/null +++ b/tensorflow/lite/experimental/delegates/coreml/builders/dummy_op_builder.h @@ -0,0 +1,41 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_COREML_BUILDERS_DUMMY_OP_BUILDER_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_COREML_BUILDERS_DUMMY_OP_BUILDER_H_ + +#include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h" + +namespace tflite { +namespace delegates { +namespace coreml { + +// Dummy Opbuilder for nodes that are claimed but not used. ex) FP16 dequantize +class DummyOpBuilder : public OpBuilder { + public: + explicit DummyOpBuilder(GraphBuilder* graph_builder) + : OpBuilder(graph_builder) {} + CoreML::Specification::NeuralNetworkLayer* Build() override; + TfLiteStatus PopulateSubgraph(TfLiteContext* context) override; + const char* DebugName() override; +}; + +} // namespace coreml +} // namespace delegates +} // namespace tflite + +#endif // THIRD_PARTY_TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_COREML_BUILDERS_DUMMY_OP_BUILDER_H_ diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc index 46634d6970a..47a2eecb51b 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc @@ -24,7 +24,6 @@ namespace tflite { namespace delegates { namespace coreml { OpBuilder* GraphBuilder::AddBuilder(int builtin_code, const TfLiteNode* node) { - // Follow the ordering of TfLiteBuiltinOperator enum. 
switch (builtin_code) { case kTfLiteBuiltinAdd: return AddBuilder(CreateAddOpBuilder, node); @@ -36,6 +35,11 @@ OpBuilder* GraphBuilder::AddBuilder(int builtin_code, const TfLiteNode* node) { return AddBuilder(CreateConvolutionOpBuilder, node); case kTfLiteBuiltinDepthwiseConv2d: return AddBuilder(CreateDepthwiseConvolutionOpBuilder, node); + // TODO(b/141490853): Add proper dequantize OpBuilder for int8/uint8 inputs. + case kTfLiteBuiltinDequantize: + // FP16 dequantize is claimed by the delegate to prevent them from running + // on CPU, but don't need to be excuted on the Core ML delegate either. + return AddBuilder(CreateDummyOpBuilder, node); case kTfLiteBuiltinFullyConnected: return AddBuilder(CreateFullyConnectedOpBuilder, node); case kTfLiteBuiltinLogistic: diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_factory.h b/tensorflow/lite/experimental/delegates/coreml/builders/op_factory.h index 9ca133be064..bc275908d10 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_factory.h +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_factory.h @@ -44,6 +44,8 @@ OpBuilder* CreateTransposeConvolutionOpBuilder(GraphBuilder* graph_builder); OpBuilder* CreateActivationLayerBuilder(GraphBuilder* graph_builder); OpBuilder* CreateThresholdLayerBuilder(GraphBuilder* graph_builder); +// Dummy Opbuilder for nodes that are claimed but not used. ex) FP16 dequantize +OpBuilder* CreateDummyOpBuilder(GraphBuilder* graph_builder); } // namespace coreml } // namespace delegates diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm index 58728659894..9bc5e07fbbe 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm @@ -193,8 +193,7 @@ TfLiteRegistration GetCoreMlKernelRegistration() { kernel_registration.init = [](TfLiteContext* context, const char* buffer, size_t length) -> void* { const auto* params = reinterpret_cast(buffer); - const auto* coreml_options = - (reinterpret_cast(params->delegate))->params(); + const auto* coreml_options = (reinterpret_cast(params->delegate))->params(); CoreMlDelegateKernel* coreml_kernel = new CoreMlDelegateKernel(coreml_options->coreml_version); if (coreml_kernel->Init(context, params) != kTfLiteOk) { delete coreml_kernel; @@ -231,31 +230,19 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { return IsNodeSupportedByDelegate(registration, node, context, params); }; - delegates::GraphPartitionHelper helper(context, node_supported_fn); - TF_LITE_ENSURE_STATUS(helper.Partition(nullptr)); + delegates::FP16GraphPartitionHelper partition_helper(context, node_supported_fn); + TF_LITE_ENSURE_STATUS(partition_helper.Partition(nullptr)); - const auto delegate_partitions = helper.GetFirstNLargestPartitions( - params->max_delegated_partitions, params->min_nodes_per_partition); - - // To avoid creating a new TfLiteIntArray and free it later, we reserve one - // element to represent TfLiteIntArray.size which is the 1st element of - // TfLiteIntArray C struct. - std::vector supported_nodes(1); - for (const auto partition : delegate_partitions) { - auto nodes = TfLiteIntArrayView(partition->nodes_to_replace); - supported_nodes.insert(supported_nodes.end(), nodes.begin(), nodes.end()); - } - - // Set first element to the number of nodes to replace. 
- supported_nodes[0] = supported_nodes.size() - 1; + std::vector partitions; + std::vector delegated_nodes = partition_helper.GetNodesOfFirstNLargestPartitions( + params->max_delegated_partitions, params->min_nodes_per_partition, &partitions); TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, "CoreML delegate: %d nodes delegated out of %d nodes, " "with %d partitions.\n", - supported_nodes[0], helper.num_total_nodes(), delegate_partitions.size()); - + delegated_nodes.size(), partition_helper.num_total_nodes(), partitions.size()); return context->ReplaceNodeSubsetsWithDelegateKernels( - context, GetCoreMlKernelRegistration(), - reinterpret_cast(supported_nodes.data()), delegate); + context, GetCoreMlKernelRegistration(), delegates::BuildTfLiteIntArray(delegated_nodes).get(), + delegate); } TfLiteDelegate* CreateCoreMlDelegate(const TfLiteCoreMlDelegateOptions* options) { diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 0630199464f..9958f70ed55 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -830,13 +830,14 @@ class Layer(module.Module, version_utils.LayerVersionSelector): in_call = call_context.in_call input_list = nest.flatten(inputs) - # We will attempt to trace in a graph if & only if inputs are symbolic. - # This is always the case when tracing a function. It can also be the case - # when running eagerly if any input can be traced back to `keras.Input()` - # (when building models using the functional API). - build_graph = tf_utils.are_all_symbolic_tensors(input_list) or ( - any(map(tf_utils.is_symbolic_tensor, nest.flatten( - [input_list, args, kwargs]))) and context.executing_eagerly()) + # We will attempt to build a TF graph if & only if all inputs are symbolic. + # This is always the case in graph mode. It can also be the case in eager + # mode when all inputs can be traced back to `keras.Input()` (when building + # models using the functional API). + # TODO(kaftan): make this not special case inputs. Instead + # build a functional api model if *any* *arg or **kwarg is symbolic, + # even if part of the data structure in that arg is not symbolic. + build_graph = tf_utils.are_all_symbolic_tensors(input_list) # Accept NumPy and scalar inputs by converting to Tensors. if any(isinstance(x, (np.ndarray, float, int)) for x in input_list): @@ -889,14 +890,11 @@ class Layer(module.Module, version_utils.LayerVersionSelector): 'training', training_value, args, kwargs) training_arg_passed_by_framework = True - # Turn inputs into TF op layers if necessary. - # This process is fragile and prone to bad interactions with inputs - # when calling nested layers with tf.functions floating around, - # and with nonsymbolic tensors. - # So, we limit it to the - # case where *all* inputs in the first arg are symbolic. - if (tf_utils.are_all_symbolic_tensors(input_list) - and base_layer_utils.needs_keras_history(inputs)): + # Only create Keras history if at least one tensor originates from a + # `keras.Input`. Otherwise this Layer may be being used outside the Keras + # framework. 
+ # TODO(kaftan): make this not special case inputs + if build_graph and base_layer_utils.needs_keras_history(inputs): base_layer_utils.create_keras_history(inputs) with call_context.enter(self, inputs, build_graph, training_value): @@ -970,12 +968,8 @@ class Layer(module.Module, version_utils.LayerVersionSelector): raise ValueError('A layer\'s `call` method should return a ' 'Tensor or a list of Tensors, not None ' '(layer: ' + self.name + ').') - # We configure connectivity metadata if all inputs in the first - # arg have keras history, or if we're actively building the - # functional api outside of any outer keras model. - if base_layer_utils.have_all_keras_metadata(inputs) or ( - context.executing_eagerly() and - base_layer_utils.have_any_keras_metadata(inputs, args, kwargs)): + # TODO(kaftan): This should be 'any' and check all args + if base_layer_utils.have_all_keras_metadata(inputs): if training_arg_passed_by_framework: args, kwargs = self._set_call_arg_value( 'training', None, args, kwargs, pop_kwarg_if_none=True) diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index 6508d641543..6d25995e4c2 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -165,10 +165,6 @@ def have_all_keras_metadata(tensors): return all(hasattr(x, '_keras_history') for x in nest.flatten(tensors)) -def have_any_keras_metadata(*tensors): - return any(hasattr(x, '_keras_history') for x in nest.flatten(tensors)) - - def generate_placeholders_from_shape(shape): return array_ops.placeholder(shape=shape, dtype=backend.floatx()) @@ -218,10 +214,7 @@ def _create_keras_history_helper(tensors, processed_ops, created_layers): for tensor in tensor_list: if getattr(tensor, '_keras_history', None) is not None: continue - try: - op = tensor.op # The Op that created this Tensor. - except AttributeError: - continue + op = tensor.op # The Op that created this Tensor. 
if op not in processed_ops: if op.type.startswith('Sparse'): lambda_example = """ @@ -399,10 +392,7 @@ def mark_checked(tensors): """ def _mark_checked(tensor): - try: - tensor._keras_history_checked = True # pylint: disable=protected-access - except AttributeError: - pass + tensor._keras_history_checked = True # pylint: disable=protected-access nest.map_structure(_mark_checked, tensors) diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index eec13345295..4958990ad66 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -32,7 +32,6 @@ from tensorflow.python.keras import backend from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine import input_layer as input_layer_module -from tensorflow.python.keras.engine import node as node_module from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.saving.saved_model import network_serialization @@ -1112,28 +1111,19 @@ def reconstruct_from_config(config, custom_objects=None, created_layers=None): kwargs = {} elif len(input_data) == 4: kwargs = input_data[3] - try: - kwargs = _deserialize_keras_tensors(kwargs, created_layers) - except IndexError: - # Happens if keras tensors in kwargs are still unprocessed - add_unprocessed_node(layer, node_data) - return + kwargs = _deserialize_keras_tensors(kwargs, created_layers) else: raise ValueError('Improperly formatted model config.') - if inbound_layer_name != node_module._CONSTANT_VALUE: - inbound_layer = created_layers[inbound_layer_name] - inbound_node_index = get_node_index(inbound_layer, inbound_node_index) + inbound_layer = created_layers[inbound_layer_name] + inbound_node_index = get_node_index(inbound_layer, inbound_node_index) - if inbound_node_index is None: - add_unprocessed_node(layer, node_data) - return - inbound_node = inbound_layer._inbound_nodes[inbound_node_index] - input_tensors.append( - nest.flatten(inbound_node.outputs)[inbound_tensor_index]) - else: - # We received a constant w/ no Keras history attached - input_tensors.append(inbound_tensor_index) + if inbound_node_index is None: + add_unprocessed_node(layer, node_data) + return + inbound_node = inbound_layer._inbound_nodes[inbound_node_index] + input_tensors.append( + nest.flatten(inbound_node.outputs)[inbound_tensor_index]) input_tensors = nest.pack_sequence_as(node_data, input_tensors) # Call layer on its inputs, thus creating the node # and building the layer if needed. diff --git a/tensorflow/python/keras/engine/functional_test.py b/tensorflow/python/keras/engine/functional_test.py index e975bb85bfc..90fc9f2697f 100644 --- a/tensorflow/python/keras/engine/functional_test.py +++ b/tensorflow/python/keras/engine/functional_test.py @@ -964,43 +964,6 @@ class NetworkConstructionTest(keras_parameterized.TestCase): # Check that second input was correctly added to first. 
self.assertEqual(history.history['loss'][0], 0.0) - @combinations.generate(combinations.keras_mode_combinations()) - def test_call_kwarg_derived_from_keras_layer_and_first_arg_is_constant(self): - - class MaybeAdd(layers.Layer): - - def call(self, x1, x2=None): - if x2 is not None: - return x1 + x2 - return x1 - - input2 = input_layer_lib.Input(10) - outputs = MaybeAdd()(3., x2=input2) - model = training_lib.Model([input2], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=testing_utils.should_run_eagerly()) - history = model.fit( - x=7 * np.ones((10, 10)), - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - - model = training_lib.Model.from_config( - model.get_config(), custom_objects={'MaybeAdd': MaybeAdd}) - model.compile( - 'sgd', - 'mse', - run_eagerly=testing_utils.should_run_eagerly()) - history = model.fit( - x=7 * np.ones((10, 10)), - y=10 * np.ones((10, 10)), - batch_size=2) - # Check that second input was correctly added to first. - self.assertEqual(history.history['loss'][0], 0.0) - @combinations.generate(combinations.keras_mode_combinations()) def test_composite_call_kwarg_derived_from_keras_layer(self): @@ -1042,56 +1005,6 @@ class NetworkConstructionTest(keras_parameterized.TestCase): # Check that second input was correctly added to first. self.assertEqual(history.history['loss'][0], 0.0) - @combinations.generate(combinations.keras_mode_combinations(mode='eager')) - def test_call_some_not_all_nested_in_first_arg_derived_from_keras_layer(self): - # This functionality is unsupported in v1 graphs - - class AddAll(layers.Layer): - - def call(self, x1_x2, x3): - x1, x2 = x1_x2 - out = x1 + x2 - if x3 is not None: - for t in x3.values(): - out += t - return out - - input1 = input_layer_lib.Input(10) - input2 = input_layer_lib.Input(10) - input3 = input_layer_lib.Input(10) - - outputs = AddAll()( - [input1, 4 * array_ops.ones((1, 10))], - x3={ - 'a': input2, - 'b': input3, - 'c': 5 * array_ops.ones((1, 10)) - }) - model = training_lib.Model([input1, input2, input3], outputs) - model.compile( - 'sgd', - 'mse', - run_eagerly=testing_utils.should_run_eagerly()) - history = model.fit( - x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], - y=15 * np.ones((10, 10)), - batch_size=2) - # Check that all inputs were correctly added. - self.assertEqual(history.history['loss'][0], 0.0) - - model = training_lib.Model.from_config( - model.get_config(), custom_objects={'AddAll': AddAll}) - model.compile( - 'sgd', - 'mse', - run_eagerly=testing_utils.should_run_eagerly()) - history = model.fit( - x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], - y=15 * np.ones((10, 10)), - batch_size=2) - # Check that all inputs were correctly added. - self.assertEqual(history.history['loss'][0], 0.0) - @combinations.generate(combinations.keras_mode_combinations()) def test_call_nested_arg_derived_from_keras_layer(self): diff --git a/tensorflow/python/keras/engine/node.py b/tensorflow/python/keras/engine/node.py index 1637e054d88..708904853b2 100644 --- a/tensorflow/python/keras/engine/node.py +++ b/tensorflow/python/keras/engine/node.py @@ -32,8 +32,6 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest from tensorflow.python.util import serialization -_CONSTANT_VALUE = '_CONSTANT_VALUE' - class Node(object): """A `Node` describes the connectivity between two layers. 
@@ -183,14 +181,11 @@ class Node(object): # `kwargs` is added to each Tensor in the first arg. This should be # changed in a future version of the serialization format. def serialize_first_arg_tensor(t): - if is_keras_tensor(t): - kh = t._keras_history - node_index = kh.node_index - node_key = make_node_key(kh.layer.name, node_index) - new_node_index = node_conversion_map.get(node_key, 0) - data = [kh.layer.name, new_node_index, kh.tensor_index, kwargs] - else: - data = [_CONSTANT_VALUE, -1, _serialize_keras_tensor(t), kwargs] + kh = t._keras_history + node_index = kh.node_index + node_key = make_node_key(kh.layer.name, node_index) + new_node_index = node_conversion_map.get(node_key, 0) + data = [kh.layer.name, new_node_index, kh.tensor_index, kwargs] return tf_utils.ListWrapper(data) data = nest.map_structure(serialize_first_arg_tensor, inputs) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index bf6440504f0..15fd1f7a497 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "cf86a234ba86acf0bb875e21d27833be36e08be4" - LLVM_SHA256 = "5375bdcdabd4886ab86eddfddef6e21dbc3cac9df67af7d3c44fadb527f74e25" + LLVM_COMMIT = "b726d071b4aa46004228fc38ee5bfd167f999bfe" + LLVM_SHA256 = "d7e67036dc89906cb2f80df7b0b7de6344d86eddf6e98bb4d01a578242889a73" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index df875ebb62d..624f17e6aa4 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -1176,28 +1176,6 @@ cc_library( ], ) -cc_library( - name = "GPURuntimeTransforms", - srcs = [ - "lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp", - "lib/Conversion/PassDetail.h", - ], - hdrs = [ - "include/mlir/Conversion/GPUCommon/GPUCommonPass.h", - ], - includes = ["include"], - deps = [ - ":ConversionPassIncGen", - ":GPUDialect", - ":IR", - ":LLVMDialect", - ":Pass", - ":Support", - "@llvm-project//llvm:core", - "@llvm-project//llvm:support", - ], -) - gentbl( name = "GPUToNVVMGen", strip_include_prefix = "lib/Conversion/GPUToNVVM", @@ -1307,12 +1285,13 @@ cc_library( ) cc_library( - name = "GPUToCUDATransforms", + name = "GPUToGPURuntimeTransforms", srcs = [ - "lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp", + "lib/Conversion/GPUCommon/ConvertKernelFuncToBlob.cpp", + "lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp", "lib/Conversion/PassDetail.h", ], - hdrs = ["include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"], + hdrs = ["include/mlir/Conversion/GPUCommon/GPUCommonPass.h"], includes = ["include"], deps = [ ":ConversionPassIncGen", @@ -2490,7 +2469,7 @@ cc_library( includes = ["include"], deps = [ ":Analysis", - ":GPURuntimeTransforms", + ":GPUToGPURuntimeTransforms", ":GPUToNVVMTransforms", ":GPUToROCDLTransforms", ":GPUToSPIRVTransforms", @@ -2570,8 +2549,7 @@ cc_library( ":ConversionPassIncGen", ":GPUDialect", ":GPUPassIncGen", - ":GPURuntimeTransforms", - ":GPUToCUDATransforms", + ":GPUToGPURuntimeTransforms", ":GPUToNVVMTransforms", ":GPUToROCDLTransforms", ":GPUToSPIRVTransforms", @@ -2776,7 +2754,7 @@ cc_binary( ":AllPassesAndDialectsNoRegistration", ":ExecutionEngineUtils", ":GPUDialect", - ":GPURuntimeTransforms", + 
":GPUToGPURuntimeTransforms", ":GPUToNVVMTransforms", ":GPUToROCDLTransforms", ":GPUTransforms", @@ -2786,6 +2764,7 @@ cc_binary( ":MlirJitRunner", ":NVVMDialect", ":Pass", + ":TargetNVVMIR", ":Transforms", "//devtools/build/runtime:get_runfiles_dir", "//third_party/gpus/cuda:cuda_headers", diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index 24b310f076e..9b6cb28a394 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -158,7 +158,7 @@ cc_library( "@llvm-project//mlir:Analysis", "@llvm-project//mlir:EDSC", "@llvm-project//mlir:GPUDialect", - "@llvm-project//mlir:GPUToCUDATransforms", + "@llvm-project//mlir:GPUToGPURuntimeTransforms", "@llvm-project//mlir:GPUTransforms", "@llvm-project//mlir:IR", "@llvm-project//mlir:LinalgOps", @@ -167,6 +167,8 @@ cc_library( "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", + "@llvm-project//mlir:TargetNVVMIR", + "@llvm-project//mlir:TargetROCDLIR", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", "@llvm-project//mlir:VectorOps", From 74d934c259b0ca0f2da62e2027cf4d2b02d4484d Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 29 May 2020 01:51:11 -0700 Subject: [PATCH 1372/1533] Let ConditionalSimplifier handle reused branch computations correctly. TryRemoveUnusedConditionalOperands also needs to update the shape of conditional ops which use the same branch computation as a different conditional op which already had unused conditional operands removed. This is tricky because other simplifications replace branch computations, so we need to make sure to do this simplification after all other simplifications have been done. Also, clone the modified computation because there can be other callers which are not among the conditional ops that we adjust. PiperOrigin-RevId: 313742667 Change-Id: Iace37a311e23fea18177fb0e99daba627f6a3095 --- tensorflow/compiler/xla/service/BUILD | 2 + .../xla/service/conditional_simplifier.cc | 186 ++++++++++-------- .../service/conditional_simplifier_test.cc | 36 ++-- 3 files changed, 120 insertions(+), 104 deletions(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 125a42bb2f9..dfc9aae94e0 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -2189,6 +2189,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc index f60742a8c23..a606e44c5ef 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc @@ -16,11 +16,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/conditional_simplifier.h" #include +#include #include #include #include #include "absl/algorithm/container.h" +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/call_graph.h" @@ -163,94 +166,92 @@ StatusOr TryRemoveConditional(HloInstruction* conditional) { return true; } StatusOr TryRemoveUnusedConditionalOperands( - HloInstruction* conditional, - std::map>* changed_computations) { - // Avoid dealing with sharding. - if (conditional->has_sharding()) { + HloComputation* computation, + const absl::flat_hash_set& calling_conditionals) { + HloInstruction* param = computation->parameter_instruction(0); + // Do not remove from the root instruction. + if (param == computation->root_instruction()) { return false; } - std::vector> tuple_indices_to_keep( - conditional->branch_count()); - bool will_change = false; - for (int64 i = 0; i < conditional->branch_count(); ++i) { - HloComputation* computation = conditional->branch_computation(i); - if (changed_computations->count(computation) > 0) { - will_change = true; - break; - } - HloInstruction* param = computation->parameter_instruction(0); - // Do not remove the root instruction. - if (param == computation->root_instruction()) { - return false; - } - // There is nothing to be removed for non-tuple operands. - if (!param->shape().IsTuple()) { - return false; - } - for (HloInstruction* user : param->users()) { - // If the user is not a get tuple element, assume it is unsafe to remove - // elements from the tuple. - if (user->opcode() != HloOpcode::kGetTupleElement) { - return false; - } - tuple_indices_to_keep[i].insert(user->tuple_index()); - } - // If not all tuple elements are used in this conditional branch, some can - // removed from the computation. - if (tuple_indices_to_keep[i].size() != - ShapeUtil::TupleElementCount(param->shape())) { - will_change = true; - } + // There is nothing to be removed for non-tuple operands. + if (!param->shape().IsTuple()) { + return false; } - - if (!will_change) { + std::set tuple_indices_to_keep; + for (HloInstruction* user : param->users()) { + // If the user is not a get tuple element, assume it is unsafe to remove + // elements from the tuple. + if (user->opcode() != HloOpcode::kGetTupleElement) { + return false; + } + tuple_indices_to_keep.insert(user->tuple_index()); + } + // If all tuple elements are used in this conditional branch, there is nothing + // to be removed. + int64 old_tuple_element_count = ShapeUtil::TupleElementCount(param->shape()); + if (tuple_indices_to_keep.size() == old_tuple_element_count) { return false; } - for (int64 branch = 0; branch < conditional->branch_count(); ++branch) { - const Shape& old_shape = conditional->operand(branch + 1)->shape(); - int64 old_tuple_element_count = ShapeUtil::TupleElementCount(old_shape); - // Clone the computation in case it is called by another instruction. - HloComputation* computation = conditional->branch_computation(branch); - if (changed_computations - ->insert({computation, tuple_indices_to_keep[branch]}) - .second) { - HloInstruction* param = computation->parameter_instruction(0); + // Create a new tuple shape based on the indices actually used by this + // computation branch. 
+ std::vector new_tuple_shapes; + new_tuple_shapes.reserve(tuple_indices_to_keep.size()); + std::vector map(old_tuple_element_count, -1); + for (int64 i : tuple_indices_to_keep) { + map[i] = new_tuple_shapes.size(); + new_tuple_shapes.push_back(param->shape().tuple_shapes(i)); + } + Shape tuple_shape = ShapeUtil::MakeTupleShape(new_tuple_shapes); + // Clone the computation in case it is called by another non-conditional + // instruction. + HloComputation* new_computation = + computation->parent()->AddEmbeddedComputation(computation->Clone()); + param = new_computation->parameter_instruction(0); + // Reset the parameter shape of the computation. + *param->mutable_shape() = tuple_shape; - // Create a new tuple shape based on the indices actually used by this - // branch. - std::vector new_tuple_shapes; - new_tuple_shapes.reserve(tuple_indices_to_keep[branch].size()); - std::vector map(old_tuple_element_count, -1); - for (int64 i : tuple_indices_to_keep[branch]) { - map[i] = new_tuple_shapes.size(); - new_tuple_shapes.push_back(old_shape.tuple_shapes(i)); - } - Shape tuple_shape = ShapeUtil::MakeTupleShape(new_tuple_shapes); - // Reset the parameter shape of the computation. - *param->mutable_shape() = tuple_shape; + // Reroute the GTE instructions to new tuple indices. + for (HloInstruction* user : param->users()) { + user->set_tuple_index(map[user->tuple_index()]); + } - // Reroute the GTE instructions to new tuple indices. - for (HloInstruction* user : param->users()) { - user->set_tuple_index(map[user->tuple_index()]); - } + // Adjust the operand shape of all calling conditionals. + for (HloInstruction* conditional : calling_conditionals) { + // Avoid dealing with sharding. + if (conditional->has_sharding()) { + continue; } + for (int64 branch = 0; branch < conditional->branch_count(); ++branch) { + if (conditional->branch_computation(branch) != computation) { + continue; + } + conditional->set_branch_computation(branch, new_computation); + const Shape& old_shape = conditional->operand(branch + 1)->shape(); - // Reroute the operand tuple through a tuple of gte instructions of the - // original operand tuple. - const auto& to_keep = (*changed_computations)[computation]; - std::vector new_tuple_operands; - new_tuple_operands.reserve(to_keep.size()); - for (int64 i : to_keep) { - new_tuple_operands.push_back(conditional->parent()->AddInstruction( - HloInstruction::CreateGetTupleElement( - old_shape.tuple_shapes(i), - conditional->mutable_operand(branch + 1), i))); + // Reroute the operand tuple through a tuple of gte instructions of the + // original operand tuple. 
+ std::vector new_tuple_operands; + new_tuple_operands.reserve(tuple_indices_to_keep.size()); + for (int64 i : tuple_indices_to_keep) { + new_tuple_operands.push_back(conditional->parent()->AddInstruction( + HloInstruction::CreateGetTupleElement( + old_shape.tuple_shapes(i), + conditional->mutable_operand(branch + 1), i))); + } + HloInstruction* new_tuple = conditional->parent()->AddInstruction( + HloInstruction::CreateTuple(new_tuple_operands)); + TF_RETURN_IF_ERROR( + conditional->ReplaceOperandWithDifferentShape(branch + 1, new_tuple)); + CHECK(ShapeUtil::Compatible(conditional->operand(branch + 1)->shape(), + conditional->branch_computation(branch) + ->parameter_instruction(0) + ->shape())); + CHECK(ShapeUtil::Compatible( + conditional->shape(), + conditional->branch_computation(branch)->root_instruction()->shape())) + << conditional->branch_computation(branch)->ToString(); } - HloInstruction* new_tuple = conditional->parent()->AddInstruction( - HloInstruction::CreateTuple(new_tuple_operands)); - TF_RETURN_IF_ERROR( - conditional->ReplaceOperandWithDifferentShape(branch + 1, new_tuple)); } return true; } @@ -333,7 +334,7 @@ bool RemoveUnusedTupleElements(HloInstruction* conditional_op) { } // Compute old-to-new (old-to-new) indices mapping. - std::map new_to_old_mapping, old_to_new_mapping; + absl::flat_hash_map new_to_old_mapping, old_to_new_mapping; auto old_iter = used_indices.begin(); for (int new_index = 0; new_index < new_tuple_shapes_size; ++new_index) { old_iter = std::find(old_iter, used_indices.end(), true); @@ -519,7 +520,8 @@ bool MergeDuplicateTupleElements(HloInstruction* conditional) { }; bool changed = false; - std::map, int64> index_collision_table; + absl::flat_hash_map, int64> + index_collision_table; for (int i = 0; i < conditional->shape().tuple_shapes_size(); ++i) { const std::vector ith_operands_vector = vectorize_branches_root_tuple_ith_operand(i); @@ -551,16 +553,34 @@ StatusOr ConditionalSimplifier::Run(HloModule* module) { } } - std::map> changed_computations; + absl::flat_hash_set removed_conditionals; for (HloInstruction* conditional_op : conditional_ops) { changed |= MergeDuplicateTupleElements(conditional_op); changed |= RemoveUnusedTupleElements(conditional_op); changed |= ReplaceRootWithEmptyTupleIfNoUsers(conditional_op); TF_ASSIGN_OR_RETURN(bool result, TryRemoveConditional(conditional_op)); - if (!result) { - TF_ASSIGN_OR_RETURN(result, TryRemoveUnusedConditionalOperands( - conditional_op, &changed_computations)); + if (result) { + removed_conditionals.insert(conditional_op); + changed = true; } + } + // Try to remove unused conditional operands from branch computations. We need + // to be careful to adjust *all* calling conditional ops if we do that, so + // lets collect them first. 
+ absl::flat_hash_map> + calling_conditionals; + for (HloInstruction* conditional : conditional_ops) { + if (removed_conditionals.contains(conditional)) { + continue; + } + for (int64 branch = 0; branch < conditional->branch_count(); ++branch) { + calling_conditionals[conditional->branch_computation(branch)].insert( + conditional); + } + } + for (const auto& entry : calling_conditionals) { + TF_ASSIGN_OR_RETURN(bool result, TryRemoveUnusedConditionalOperands( + entry.first, entry.second)); changed |= result; } diff --git a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc index 8a7fba6a48f..ea3101fa0ed 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier_test.cc @@ -198,32 +198,26 @@ ENTRY main { c1_1 = f32[40,40] parameter(3) p = pred[] parameter(4) t = (f32[20,40], f32[40,40], f32[20,40], f32[40,40]) tuple(c0_0, c0_1, c1_0, c1_1) - ROOT result = (f32[20, 40]) conditional(p,t,t), false_computation=on_false, true_computation=on_true + call = (f32[20,40]) call(t), to_apply=on_true + ROOT result = (f32[20,40]) conditional(p,t,t), false_computation=on_false, true_computation=on_true } )"; auto status = ParseAndReturnVerifiedModule(hlo_string); TF_ASSERT_OK(status.status()); + std::unique_ptr module = status.ConsumeValueOrDie(); HloVerifier v(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - TF_ASSERT_OK(v.Run(status.ValueOrDie().get()).status()); - EXPECT_TRUE( - ConditionalSimplifier().Run(status.ValueOrDie().get()).ValueOrDie()); - TF_ASSERT_OK(v.Run(status.ValueOrDie().get()).status()); - EXPECT_EQ(status.ValueOrDie() - ->entry_computation() - ->root_instruction() - ->operand(1) - ->shape() - .tuple_shapes() - .size(), - 2); - EXPECT_EQ(status.ValueOrDie() - ->entry_computation() - ->root_instruction() - ->operand(2) - ->shape() - .tuple_shapes() - .size(), - 2); + TF_ASSERT_OK(v.Run(module.get()).status()); + EXPECT_TRUE(ConditionalSimplifier().Run(module.get()).ValueOrDie()); + TF_ASSERT_OK(v.Run(module.get()).status()); + HloInstruction* conditional = module->entry_computation()->root_instruction(); + EXPECT_TRUE(conditional != nullptr); + EXPECT_EQ(conditional->operand(1)->shape().tuple_shapes().size(), 2); + EXPECT_EQ(conditional->operand(2)->shape().tuple_shapes().size(), 2); + // For the call operation, nothing should have changed. + HloInstruction* call = FindInstruction(module.get(), "call"); + EXPECT_EQ( + call->to_apply()->parameter_instruction(0)->shape().tuple_shapes().size(), + 4); } TEST_F(ConditionalSimplifierTest, From 9e6712cc08e561655f49c32341cf6216d16e2a77 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 02:02:37 -0700 Subject: [PATCH 1373/1533] Update GraphDef version to 416. PiperOrigin-RevId: 313743772 Change-Id: I86b65f420ca54e4a5197a0470760ea8fe85ed838 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 4f06edd4162..771e2860fc1 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 415 // Updated: 2020/5/28 +#define TF_GRAPH_DEF_VERSION 416 // Updated: 2020/5/29 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). 
// From b2cc6c66a8c4ec404964009cef12f145a0ec5f19 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 02:02:38 -0700 Subject: [PATCH 1374/1533] compat: Update forward compatibility horizon to 2020-05-29 PiperOrigin-RevId: 313743773 Change-Id: Ic9b3f5f2f7df15850312ef6dafbfe66412fb06cb --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index d21f1755d94..e0446d58403 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 28) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 29) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 4d5f0144c7b534ea9a7d68d8b310d96d9745afdd Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Fri, 29 May 2020 02:31:43 -0700 Subject: [PATCH 1375/1533] Replace usage of `GetFirstNLargestPartitions` with `GetNodesOfFirstNLargestPartitions` This removes common logic to iterate through partitions, and flattening supported nodes into one vector. PiperOrigin-RevId: 313746666 Change-Id: I703bea87cac0ea0ffe25d5a8e11e052465e15f34 --- .../lite/delegates/nnapi/nnapi_delegate.cc | 11 ++--- tensorflow/lite/delegates/utils.cc | 44 ++++++++----------- tensorflow/lite/delegates/utils.h | 33 +++++++------- .../lite/delegates/utils/simple_delegate.cc | 21 +++------ .../delegates/coreml/coreml_delegate.mm | 8 ++-- .../delegates/hexagon/hexagon_delegate.cc | 21 +++------ tensorflow/lite/util.cc | 8 ++++ tensorflow/lite/util.h | 6 +++ 8 files changed, 68 insertions(+), 84 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 00f3f7475bd..bf3714a443f 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -4188,8 +4188,7 @@ TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator( auto* delegate_data = static_cast(delegate->data_); // The first entry in the array is the element count - auto supported_nodes_int_array = - delegates::BuildTfLiteIntArray(supported_nodes); + auto supported_nodes_int_array = BuildTfLiteIntArray(supported_nodes); TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( context, supported_nodes_int_array.get(), params_array, num_partitions)); // For each partition check if which nodes are actually supported by the @@ -4222,7 +4221,7 @@ TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator( // We changed the set of nodes to delegate this will create a different // partitioning layout. 
auto device_sup_nodes_int_array = - delegates::BuildTfLiteIntArray(*device_supported_nodes); + BuildTfLiteIntArray(*device_supported_nodes); TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( context, device_sup_nodes_int_array.get(), params_array, num_partitions)); @@ -4419,8 +4418,7 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, &num_partitions, ¶ms_array, nnapi_errno)); } else { nodes_to_delegate = supported_nodes; - auto supported_nodes_int_array = - delegates::BuildTfLiteIntArray(supported_nodes); + auto supported_nodes_int_array = BuildTfLiteIntArray(supported_nodes); TF_LITE_ENSURE_STATUS(context->PreviewDelegatePartitioning( context, supported_nodes_int_array.get(), ¶ms_array, &num_partitions)); @@ -4437,8 +4435,7 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, } else { // Request TFLite to partition the graph and make kernels // for each independent node sub set a new nnapi_delegate_kernel. - auto nodes_to_delegate_int_array = - delegates::BuildTfLiteIntArray(nodes_to_delegate); + auto nodes_to_delegate_int_array = BuildTfLiteIntArray(nodes_to_delegate); return context->ReplaceNodeSubsetsWithDelegateKernels( context, nnapi_delegate_kernel, nodes_to_delegate_int_array.get(), delegate); diff --git a/tensorflow/lite/delegates/utils.cc b/tensorflow/lite/delegates/utils.cc index d450cd4cea3..135b4d531f9 100644 --- a/tensorflow/lite/delegates/utils.cc +++ b/tensorflow/lite/delegates/utils.cc @@ -46,14 +46,6 @@ TfLiteStatus CreateNewTensorWithDifferentType(TfLiteContext* context, return kTfLiteOk; } -std::unique_ptr BuildTfLiteIntArray( - const std::vector& data) { - std::unique_ptr result( - TfLiteIntArrayCreate(data.size())); - std::copy(data.begin(), data.end(), result->data); - return result; -} - TfLiteStatus GraphPartitionHelper::Partition( std::set* unsupported_nodes_info) { const auto prepare_status = PrepareSupportedNodes(unsupported_nodes_info); @@ -103,6 +95,19 @@ GraphPartitionHelper::GetFirstNLargestPartitions( return results; } +std::vector GraphPartitionHelper::GetNodesOfFirstNLargestPartitionsImpl( + int n, int min_nodes_per_partition) { + auto first_n_partitions = + GetFirstNLargestPartitions(n, min_nodes_per_partition); + std::vector ops_to_replace; + for (const auto p : first_n_partitions) { + auto nodes = p->nodes_to_replace; + ops_to_replace.insert(ops_to_replace.end(), nodes->data, + nodes->data + nodes->size); + } + return ops_to_replace; +} + TfLiteStatus GraphPartitionHelper::PrepareSupportedNodes( std::set* unsupported_nodes_info) { if (!is_node_supported_fn_) return kTfLiteOk; @@ -155,23 +160,12 @@ TfLiteStatus FP16GraphPartitionHelper::Partition( return status; } -std::vector FP16GraphPartitionHelper::GetNodesOfFirstNLargestPartitions( - int n, int min_nodes_per_partition, - std::vector* partitions) { - // We first get partitions to reduce the number of nodes to be checked in - // deciding which dequant ops could actually be replaced. And then we - // remap input-tensor to dequant nodes' inputs and remove those - // to-be-reserved dequant nodes. 
- auto first_nps = GetFirstNLargestPartitions(n, min_nodes_per_partition); - if (partitions != nullptr) { - *partitions = first_nps; - } - std::vector ops_to_replace; - for (const auto p : first_nps) { - auto nodes = p->nodes_to_replace; - ops_to_replace.insert(ops_to_replace.end(), nodes->data, - nodes->data + nodes->size); - } +std::vector +FP16GraphPartitionHelper::GetNodesOfFirstNLargestPartitionsImpl( + int n, int min_nodes_per_partition) { + std::vector ops_to_replace = + GraphPartitionHelper::GetNodesOfFirstNLargestPartitionsImpl( + n, min_nodes_per_partition); RemapInputTensors(ops_to_replace); RemoveReservedDequantsFromNodes(&ops_to_replace); return ops_to_replace; diff --git a/tensorflow/lite/delegates/utils.h b/tensorflow/lite/delegates/utils.h index c8d2f4acfc6..6b498b908f9 100644 --- a/tensorflow/lite/delegates/utils.h +++ b/tensorflow/lite/delegates/utils.h @@ -20,7 +20,6 @@ limitations under the License. #include #include -#include #include #include #include @@ -41,9 +40,6 @@ TfLiteStatus CreateNewTensorWithDifferentType(TfLiteContext* context, TfLiteTensor** new_tensor, int* new_tensor_index); -std::unique_ptr BuildTfLiteIntArray( - const std::vector& data); - using IsNodeSupportedFn = std::function; @@ -76,10 +72,21 @@ class GraphPartitionHelper { // Note that partitions are ranked according to the number of nodes that // a partition has, and the returned TfLiteDelegateParams objects are *owned* // by the TfLite runtime. + // TODO(b/156707497): remove this and use GetNodesOfFirstNLargestPartitions std::vector GetFirstNLargestPartitions( int n = std::numeric_limits::max(), int min_nodes_per_partition = 0) const; + // Returns a list of node indices of all nodes from the first n largest + // partitions. If there are fewer paritions than n, all nodes will be + // returned. The partition is ranked according to the number of nodes. + std::vector GetNodesOfFirstNLargestPartitions( + int n = std::numeric_limits::max(), + int min_nodes_per_partition = 0) { + // Separated implementation that can be overrided, to preserve default value + return GetNodesOfFirstNLargestPartitionsImpl(n, min_nodes_per_partition); + } + int num_total_nodes() const { return num_total_nodes_; } int num_partitions() const { return partitions_.size(); } @@ -90,6 +97,8 @@ class GraphPartitionHelper { return is_node_supported_fn_(context, node, registration, unsupported_details); } + virtual std::vector GetNodesOfFirstNLargestPartitionsImpl( + int n, int min_nodes_per_partition); TfLiteContext* const context_ = nullptr; @@ -121,9 +130,6 @@ class GraphPartitionHelper { // While partitioning the graph, this claims DEQUANTIZE nodes (FP16->FP32) in // addition to supported nodes for the delegate, when the DEQUANTIZE node's // output is an input to the kernel that supports FP16 input. -// Noth that you have to use `GetNodesOfFirstNLargestPartitions` instead of -// superclass' `GetFirstNLargestPartitions` to do actual remapping of FP16 -// inputs. class FP16GraphPartitionHelper : public GraphPartitionHelper { public: FP16GraphPartitionHelper(TfLiteContext* context, @@ -133,20 +139,15 @@ class FP16GraphPartitionHelper : public GraphPartitionHelper { TfLiteStatus Partition( std::set* unsupported_nodes_info) override; - // Returns a list of node indices of all nodes from the first n largest - // partitions. If there are fewer paritions than n, all nodes will be - // returned. The partition is ranked according to the number of nodes. 
- // TODO(b/156707497): Add this to superclass besides - // GetFirstNLargestPartitions (one that returns partitions instead of nodes) - std::vector GetNodesOfFirstNLargestPartitions( - int n, int min_nodes_per_partition = 0, - std::vector* partitions = nullptr); - protected: bool IsNodeSupported(TfLiteContext* context, TfLiteNode* node, TfLiteRegistration* registration, int node_id, std::string* unsupported_details) override; + // This will remap input tensors by removing FP16 to FP32 dequantized tensors. + std::vector GetNodesOfFirstNLargestPartitionsImpl( + int n, int min_nodes_per_partition) override; + private: // Record 'node' if it is a dequant op (i.e. a fp16 one here) and return true. // When it's not a dequant op, remap its inputs to the inputs of the preceding diff --git a/tensorflow/lite/delegates/utils/simple_delegate.cc b/tensorflow/lite/delegates/utils/simple_delegate.cc index 51736e56d26..6ac99883cd2 100644 --- a/tensorflow/lite/delegates/utils/simple_delegate.cc +++ b/tensorflow/lite/delegates/utils/simple_delegate.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/context_util.h" #include "tensorflow/lite/delegates/utils.h" #include "tensorflow/lite/kernels/internal/compatibility.h" @@ -86,31 +87,19 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, delegates::GraphPartitionHelper helper(context, node_supported_fn); TF_LITE_ENSURE_STATUS(helper.Partition(nullptr)); - const auto delegate_partitions = helper.GetFirstNLargestPartitions(); - - // To avoid creating a new TfLiteIntArray and free it later, we reserve one - // element to represent TfLiteIntArray.size which is the 1st element of - // TfLiteIntArray C struct. - std::vector supported_nodes(1); - for (const auto partition : delegate_partitions) { - auto* nodes = partition->nodes_to_replace; - supported_nodes.insert(supported_nodes.end(), nodes->data, - nodes->data + nodes->size); - } - // Set first element to the number of nodes to replace. 
- supported_nodes[0] = supported_nodes.size() - 1; + std::vector supported_nodes = helper.GetNodesOfFirstNLargestPartitions(); TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, "%s delegate: %d nodes delegated out of %d nodes with " "%d partitions.\n", - delegate->name(), supported_nodes[0], - helper.num_total_nodes(), delegate_partitions.size()); + delegate->name(), supported_nodes.size(), + helper.num_total_nodes(), helper.num_partitions()); TfLiteRegistration delegate_kernel_registration = GetDelegateKernelRegistration(delegate); return context->ReplaceNodeSubsetsWithDelegateKernels( context, delegate_kernel_registration, - reinterpret_cast(supported_nodes.data()), base_delegate); + BuildTfLiteIntArray(supported_nodes).get(), base_delegate); } } // namespace diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm index 9bc5e07fbbe..3c6d6c57f5f 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm @@ -233,15 +233,15 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { delegates::FP16GraphPartitionHelper partition_helper(context, node_supported_fn); TF_LITE_ENSURE_STATUS(partition_helper.Partition(nullptr)); - std::vector partitions; std::vector delegated_nodes = partition_helper.GetNodesOfFirstNLargestPartitions( - params->max_delegated_partitions, params->min_nodes_per_partition, &partitions); + params->max_delegated_partitions, params->min_nodes_per_partition); TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, "CoreML delegate: %d nodes delegated out of %d nodes, " "with %d partitions.\n", - delegated_nodes.size(), partition_helper.num_total_nodes(), partitions.size()); + delegated_nodes.size(), partition_helper.num_total_nodes(), + partition_helper.num_partitions()); return context->ReplaceNodeSubsetsWithDelegateKernels( - context, GetCoreMlKernelRegistration(), delegates::BuildTfLiteIntArray(delegated_nodes).get(), + context, GetCoreMlKernelRegistration(), BuildTfLiteIntArray(delegated_nodes).get(), delegate); } diff --git a/tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate.cc b/tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate.cc index c6acc5ac947..c1b31135df0 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/hexagon_delegate.cc @@ -157,23 +157,12 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { TfLiteHexagonDelegateOptions* params = static_cast(delegate->data_); - const auto delegate_partitions = helper.GetFirstNLargestPartitions( + std::vector supported_nodes = helper.GetNodesOfFirstNLargestPartitions( params->max_delegated_partitions, params->min_nodes_per_partition); - // To avoid creating a new TfLiteIntArray and free it later, we reserve one - // element to represent TfLiteIntArray.size which is the 1st element of - // TfLiteIntArray C struct. - std::vector supported_nodes(1); - for (const auto partition : delegate_partitions) { - auto* nodes = partition->nodes_to_replace; - supported_nodes.insert(supported_nodes.end(), nodes->data, - nodes->data + nodes->size); - } - // Set first element to the number of nodes to replace. - supported_nodes[0] = supported_nodes.size() - 1; auto* hexagon_delegate = static_cast(delegate); // Make sure dynamic batch is requested on fully delegated graph only. 
- if (supported_nodes[0] != helper.num_total_nodes() && + if (supported_nodes.size() != helper.num_total_nodes() && hexagon_delegate != nullptr && hexagon_delegate->params()->enable_dynamic_batch_size) { TF_LITE_KERNEL_LOG( @@ -183,12 +172,12 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, "Hexagon delegate: %d nodes delegated out of %d nodes with " "%d partitions.\n", - supported_nodes[0], helper.num_total_nodes(), - delegate_partitions.size()); + supported_nodes.size(), helper.num_total_nodes(), + helper.num_partitions()); return context->ReplaceNodeSubsetsWithDelegateKernels( context, GetHexagonKernelRegistration(), - reinterpret_cast(supported_nodes.data()), delegate); + BuildTfLiteIntArray(supported_nodes).get(), delegate); } TfLiteDelegate* CreateDelegate(const TfLiteHexagonDelegateOptions* params) { diff --git a/tensorflow/lite/util.cc b/tensorflow/lite/util.cc index c91e50b1845..09efaa77f15 100644 --- a/tensorflow/lite/util.cc +++ b/tensorflow/lite/util.cc @@ -38,6 +38,14 @@ bool IsFlexOp(const char* custom_name) { strlen(kFlexCustomCodePrefix)) == 0; } +std::unique_ptr BuildTfLiteIntArray( + const std::vector& data) { + std::unique_ptr result( + TfLiteIntArrayCreate(data.size())); + std::copy(data.begin(), data.end(), result->data); + return result; +} + TfLiteIntArray* ConvertVectorToTfLiteIntArray(const std::vector& input) { return ConvertArrayToTfLiteIntArray(static_cast(input.size()), input.data()); diff --git a/tensorflow/lite/util.h b/tensorflow/lite/util.h index 1b68f699662..71b69c755fc 100644 --- a/tensorflow/lite/util.h +++ b/tensorflow/lite/util.h @@ -21,6 +21,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_UTIL_H_ #define TENSORFLOW_LITE_UTIL_H_ +#include #include #include @@ -60,6 +61,11 @@ struct TfLiteIntArrayDeleter { } }; +// Helper for Building TfLiteIntArray that is wrapped in a unique_ptr, +// So that it is automatically freed when it goes out of the scope. +std::unique_ptr BuildTfLiteIntArray( + const std::vector& data); + // Populates the size in bytes of a type into `bytes`. Returns kTfLiteOk for // valid types, and kTfLiteError otherwise. TfLiteStatus GetSizeOfType(TfLiteContext* context, const TfLiteType type, From 50b99695bb368cdb4beede16533ca7069c2d3cff Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Fri, 29 May 2020 03:28:25 -0700 Subject: [PATCH 1376/1533] Fix const cast error on convolution_op_builder PiperOrigin-RevId: 313752261 Change-Id: Iacc5f4673f458192ab3a09d923fac837a555d2dd --- .../delegates/coreml/builders/convolution_op_builder.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/convolution_op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/convolution_op_builder.cc index 11b286bdd8e..8dbc18722f1 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/convolution_op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/convolution_op_builder.cc @@ -184,7 +184,7 @@ void ConvolutionOpBuilder::TransposeKernelWeights() { optimized_ops::Transpose( params, tfl_shape, reinterpret_cast(weights_->data.raw), - coreml_shape, reinterpret_cast(coreml_weights->data())); + coreml_shape, reinterpret_cast(&coreml_weights->front())); } } From 09691710835e706df1b586b2940635266dd28bcc Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Fri, 29 May 2020 04:32:32 -0700 Subject: [PATCH 1377/1533] [XLA][MLIR] Remove ScalarsToDimensionsTensor op. 
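The standard dialect's tensor_from_elements op covers the same use case, so shape
computations that assembled a dimension tensor from scalar values no longer need a
custom xla_hlo op. A minimal sketch of the replacement (illustrative only; @make_shape
and the %dim operands are placeholder names, not part of this change):

  // Illustrative sketch only, not part of the patch below.
  func @make_shape(%dim0: i64, %dim1: i64) -> tensor<2xi64> {
    // Previously: "xla_hlo.scalars_to_dimension_tensor"(%dim0, %dim1)
    //     : (i64, i64) -> tensor<2xi64>
    %shape = tensor_from_elements(%dim0, %dim1) : tensor<2xi64>
    return %shape : tensor<2xi64>
  }
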
PiperOrigin-RevId: 313757742 Change-Id: I640f1fb464e09e480218fbecb130e4c9910d0a27 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 49 +------------------ tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 17 ------- .../compiler/mlir/xla/mlir_hlo_to_hlo.cc | 6 --- .../compiler/mlir/xla/tests/canonicalize.mlir | 10 ---- .../mlir/xla/tests/hlo-legalize-to-lhlo.mlir | 4 +- tensorflow/compiler/mlir/xla/tests/ops.mlir | 16 ------ .../mlir/xla/tests/unfuse_batch_norm.mlir | 4 +- .../xla/transforms/hlo_legalize_to_lhlo.cc | 2 +- .../mlir/xla/transforms/unfuse_batch_norm.cc | 4 +- 9 files changed, 8 insertions(+), 104 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index d20f1713eba..a4bdc5c212c 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -567,48 +567,6 @@ OpFoldResult BroadcastInDimOp::fold(ArrayRef) { return getOperand(); } -//===----------------------------------------------------------------------===// -// ScalarsToDimensionTensorOp -//===----------------------------------------------------------------------===// - -namespace { - -// Canonicalizes the pattern of the form -// -// %2 = "xla_hlo.scalars_to_dimension_tensor"(%0, %1) -// : (i32, i32) -> tensor<2xi32> -// %3 = extract_element %2[%c0] : tensor<2xi32> -// -// to just %0. -struct ExtractElementFromScalarsToDimensionTensor - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(ExtractElementOp extract, - PatternRewriter& rewriter) const override { - if (extract.indices().size() != 1) return failure(); - - if (auto scalars_to_tensor = dyn_cast_or_null( - extract.aggregate().getDefiningOp())) { - APInt index; - if (!matchPattern(*extract.indices().begin(), m_ConstantInt(&index))) { - return failure(); - } - rewriter.replaceOp(extract, - scalars_to_tensor.getOperand(index.getZExtValue())); - return success(); - } - return failure(); - } -}; - -} // namespace - -void ScalarsToDimensionTensorOp::getCanonicalizationPatterns( - OwningRewritePatternList& results, MLIRContext* context) { - results.insert(context); -} - //===----------------------------------------------------------------------===// // DynamicBroadcastInDimOp //===----------------------------------------------------------------------===// @@ -1961,11 +1919,8 @@ LogicalResult deriveShapeFromFirstOperand( loc, builder->getI64IntegerAttr(element.value()))); } } - *reifiedReturnShapes = - SmallVector{builder->create( - loc, - RankedTensorType::get({operand_type.getRank()}, shape_scalar_type), - shape_values)}; + *reifiedReturnShapes = SmallVector{ + builder->create(loc, shape_values)}; return success(); } diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 6c54e3fbf90..7f162c97ca6 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -789,23 +789,6 @@ def HLO_BroadcastInDimOp : HLO_Op<"broadcast_in_dim", let hasCustomHLOConverter = 1; } -def HLO_ScalarsToDimensionTensorOp : HLO_Op<"scalars_to_dimension_tensor", - [SameOperandsElementType, NoSideEffect]> { - string summary = "Converts a sequence of scalars into a 1d tensor."; - - string description = [{ - This is a useful operation that is currently missing in Standard. Used to - compute shape arguments to dynamic operations. 
- }]; - - let arguments = (ins Variadic:$scalars); - let results = (outs HLO_DimensionTensor); - - // Cannot be exported to legacy formats. - let hasCustomHLOConverter = 1; - let hasCanonicalizer = 1; -} - def HLO_DynamicBroadcastInDimOp : HLO_Op<"dynamic_broadcast_in_dim", [NoSideEffect]> { string summary = "Broadcast a tensor into the given dynamic shape by adding dimensions."; diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 8150d719f3e..6479871e46b 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -602,12 +602,6 @@ LogicalResult ExportXlaOp(BroadcastInDimOp op, OpLoweringContext ctx) { return success(); } -LogicalResult ExportXlaOp(ScalarsToDimensionTensorOp op, - OpLoweringContext ctx) { - // This op has no expression in the legacy export format. - return failure(); -} - LogicalResult ExportXlaOp(DynamicBroadcastInDimOp op, OpLoweringContext ctx) { // This op has no expression in the legacy export format. return failure(); diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index afe3e1b73a5..483dfb0c34f 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -297,16 +297,6 @@ func @unary_einsum(%arg0: tensor<2x3xf32>) -> tensor<2x2xf32> { return %0 : tensor<2x2xf32> } -// CHECK-LABEL: @extract_scalars_to_tensor -// CHECK-SAME: %[[ARG0:.*]]: i32, %[[ARG1:.*]]: i32 -func @extract_scalars_to_tensor(%arg0: i32, %arg1: i32) -> i32 { - %0 = "xla_hlo.scalars_to_dimension_tensor"(%arg0, %arg1) : (i32, i32) -> tensor<2xi32> - %1 = constant 0 : index - %2 = extract_element %0[%1] : tensor<2xi32> - // CHECK: return %[[ARG0]] - return %2 : i32 -} - // CHECK-LABEL: func @fold_copy // CHECK-SAME: [[ARG:%[a-zA-Z0-9]+]] func @fold_copy(%arg : tensor<1x4xf32>) -> tensor<1x4xf32> { diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 68f6d172afc..f4b9fa206f2 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -363,7 +363,7 @@ func @add_dyn(%lhs: tensor, %rhs: tensor) { // CHECK: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 // CHECK: %[[DIM1:.*]] = dim %arg0, 1 : memref // CHECK: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // CHECK: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[IC0]], %[[IC1]]) : (i64, i64) -> tensor<2xi64> + // CHECK: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> // CHECK: %[[C0:.*]] = constant 0 : index // CHECK: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<2xi64> // CHECK: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index @@ -385,7 +385,7 @@ func @tanh_dyn(%arg0: tensor) { // CHECK: %[[IC0:.*]] = index_cast %[[DIM0]] : index to i64 // CHECK: %[[DIM1:.*]] = dim %arg0, 1 : memref // CHECK: %[[IC1:.*]] = index_cast %[[DIM1]] : index to i64 - // CHECK: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[IC0]], %[[IC1]]) : (i64, i64) -> tensor<2xi64> + // CHECK: %[[SHAPE:.*]] = tensor_from_elements(%[[IC0]], %[[IC1]]) : tensor<2xi64> // CHECK: %[[C0:.*]] = constant 0 : index // CHECK: %[[EE0:.*]] = extract_element %[[SHAPE]][%[[C0]]] : tensor<2xi64> // CHECK: %[[ICS0:.*]] = index_cast %[[EE0]] : i64 to index diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir 
b/tensorflow/compiler/mlir/xla/tests/ops.mlir index e6ae074f922..0a69ee93aee 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -545,22 +545,6 @@ func @rng_uniform_invalid_type(%mu: tensor>, %sigma: tensor) - // ----- -// CHECK-LABEL: @scalars_to_dimension_tensor -func @scalars_to_dimension_tensor(%arg0: i32, %arg1: i32) -> tensor<2xi32> { - %0 = "xla_hlo.scalars_to_dimension_tensor"(%arg0, %arg1) : (i32, i32) -> tensor<2xi32> - return %0 : tensor<2xi32> -} - -// ----- - -// CHECK-LABEL: @scalars_to_dimension_tensor_index -func @scalars_to_dimension_tensor_index(%arg0: index, %arg1: index) -> tensor<2xindex> { - %0 = "xla_hlo.scalars_to_dimension_tensor"(%arg0, %arg1) : (index, index) -> tensor<2xindex> - return %0 : tensor<2xindex> -} - -// ----- - // CHECK-LABEL: func @select func @select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> diff --git a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir index 7a54de73db7..3d8646e7fb9 100644 --- a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir +++ b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir @@ -106,7 +106,7 @@ func @batchNormInference_dynamic_shape( -> tensor { // CHECK-DAG: %[[EPS:.+]] = xla_hlo.constant dense<1.000000e-03> : tensor // CHECK-DAG: %[[DIM:.+]] = dim %[[VARIANCE]], 0 : tensor - // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM]]) : (index) -> tensor<1xindex> + // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = tensor_from_elements(%[[DIM]]) : tensor<1xindex> // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK-DAG: %[[VARIANCE_EPS:.+]] = xla_hlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor // CHECK-DAG: %[[STDDEV:.+]] = "xla_hlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor) -> tensor @@ -114,7 +114,7 @@ func @batchNormInference_dynamic_shape( // CHECK-DAG: %[[INPUT_DIM_1:.+]] = dim %[[X]], 1 : tensor // CHECK-DAG: %[[INPUT_DIM_2:.+]] = dim %[[X]], 2 : tensor // CHECK-DAG: %[[INPUT_DIM_3:.+]] = dim %[[X]], 3 : tensor - // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]]) : (index, index, index, index) -> tensor<4xindex> + // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = tensor_from_elements(%[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]]) : tensor<4xindex> // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 5851bad4565..422c5f34608 100644 --- 
a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -322,7 +322,7 @@ struct HloLegalizeToLhlo target.addIllegalOp(); target.addIllegalOp(); target.addLegalOp(); - target.addLegalOp(); + target.addLegalOp(); target.addIllegalDialect(); target.addDynamicallyLegalOp([&](FuncOp op) { auto inputs = op.getType().getInputs(); diff --git a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc index 98eb404e4d4..eead03404cb 100644 --- a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc @@ -60,9 +60,7 @@ Value CalculateShapeValue(Location loc, Value operand, for (int64_t i = 0; i < rank; ++i) { shape_values.push_back(rewriter.create(loc, operand, i)); } - Type shape_element_type = shape_values.front().getType(); - return rewriter.create( - loc, RankedTensorType::get({rank}, shape_element_type), shape_values); + return rewriter.create(loc, shape_values); } Value MaterializeEpsilon(Operation* op, FloatAttr epsilon_attr, From 2d4f1920d2d00775196d81073f92ad6079ee7f1a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 05:10:42 -0700 Subject: [PATCH 1378/1533] [XLA] Transform unranked HLO All applications of a unary element-wise operation on a given tensor are independent from each other. We use this to realize these operations on a flattened, hence ranked, tensor when its shape is unknown at compile time. With only one dynamic reshape operation before and one after the targeted operation we can generate efficient code for the core of the operation. This CL realizes the transformation for `xla_hlo.sqrt` and others will follow analogously. 
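For `xla_hlo.sqrt` on an unranked operand the generated IR looks roughly as follows
(condensed, illustrative sketch mirroring the CHECK pattern of the new test; the
function and value names are placeholders, and the test file added below is the
authoritative version):

  // Illustrative sketch; names are placeholders.
  func @sqrt_unranked(%a: tensor<*xf32>) -> tensor<*xf32> {
    // Flatten the operand into a ranked 1-d tensor with the same element count.
    %shape = shape.shape_of %a : tensor<*xf32>
    %num_elements = shape.num_elements %shape
    %n = shape.size_to_index %num_elements
    %flat_shape = "xla_hlo.scalars_to_dimension_tensor"(%n) : (index) -> tensor<1xindex>
    %flat_a = "xla_hlo.dynamic_reshape"(%a, %flat_shape)
        : (tensor<*xf32>, tensor<1xindex>) -> tensor<?xf32>
    // Apply the element-wise operation on the ranked tensor.
    %flat_b = "xla_hlo.sqrt"(%flat_a) : (tensor<?xf32>) -> tensor<?xf32>
    // Restore the original (dynamic) shape.
    %extents = "shape.to_extent_tensor"(%shape) : (!shape.shape) -> tensor<?xindex>
    %b = "xla_hlo.dynamic_reshape"(%flat_b, %extents)
        : (tensor<?xf32>, tensor<?xindex>) -> tensor<*xf32>
    return %b : tensor<*xf32>
  }
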
PiperOrigin-RevId: 313761871 Change-Id: I3566c2c36f1322a5833ebe49309255ba191c1bd1 --- tensorflow/compiler/mlir/xla/BUILD | 62 ++++----- .../xla/tests/xla-transform-unranked-hlo.mlir | 66 ++++++++++ .../compiler/mlir/xla/transforms/passes.h | 3 + .../compiler/mlir/xla/transforms/rewriters.h | 9 ++ .../transforms/xla_transform_unranked_hlo.cc | 124 ++++++++++++++++++ 5 files changed, 229 insertions(+), 35 deletions(-) create mode 100644 tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir create mode 100644 tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 179a637ec7b..36a3afcbd81 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -50,9 +50,7 @@ gentbl( ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "ir/chlo_ops.td", - td_srcs = [ - ":hlo_ops_td_files", - ], + td_srcs = [":hlo_ops_td_files"], ) gentbl( @@ -112,16 +110,11 @@ gentbl( gentbl( name = "xla_canonicalize_inc_gen", tbl_outs = [ - ( - "-gen-rewriters", - "transforms/generated_canonicalize.inc", - ), + ("-gen-rewriters", "transforms/generated_canonicalize.inc"), ], tblgen = "@llvm-project//mlir:mlir-tblgen", td_file = "transforms/canonicalize.td", - td_srcs = [ - ":hlo_ops_td_files", - ], + td_srcs = [":hlo_ops_td_files"], ) cc_library( @@ -157,9 +150,7 @@ cc_library( cc_library( name = "xla_legalize_tf_with_tf2xla", - srcs = [ - "transforms/legalize_tf_with_tf2xla.cc", - ], + srcs = ["transforms/legalize_tf_with_tf2xla.cc"], deps = [ ":hlo", ":mlir_hlo_builder", @@ -195,9 +186,7 @@ cc_library( cc_library( name = "xla_sink_constants_to_control_flow", - srcs = [ - "transforms/sink_constants_to_control_flow.cc", - ], + srcs = ["transforms/sink_constants_to_control_flow.cc"], deps = [ ":hlo", "//tensorflow/compiler/mlir/tensorflow", @@ -285,6 +274,21 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_transform_unranked_hlo", + srcs = ["transforms/xla_transform_unranked_hlo.cc"], + deps = [ + ":hlo", + "@com_google_absl//absl/memory", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "lhlo_legalize_to_gpu", srcs = ["transforms/lhlo_legalize_to_gpu.cc"], @@ -400,9 +404,7 @@ gentbl( cc_library( name = "xla_legalize_control_flow", - srcs = [ - "transforms/legalize_control_flow.cc", - ], + srcs = ["transforms/legalize_control_flow.cc"], deps = [ ":hlo", "@llvm-project//llvm:support", @@ -489,9 +491,7 @@ cc_library( cc_library( name = "xla_materialize_broadcasts", - srcs = [ - "transforms/materialize_broadcasts.cc", - ], + srcs = ["transforms/materialize_broadcasts.cc"], deps = [ ":hlo", "@llvm-project//mlir:IR", @@ -502,9 +502,7 @@ cc_library( cc_library( name = "xla_unfuse_batch_norm", - srcs = [ - "transforms/unfuse_batch_norm.cc", - ], + srcs = ["transforms/unfuse_batch_norm.cc"], deps = [ ":hlo", "@llvm-project//llvm:support", @@ -516,9 +514,7 @@ cc_library( cc_library( name = "chlo_legalize_to_hlo", - srcs = [ - "transforms/chlo_legalize_to_hlo.cc", - ], + srcs = ["transforms/chlo_legalize_to_hlo.cc"], deps = [ ":hlo", "@llvm-project//mlir:IR", @@ -593,12 +589,8 @@ cc_library( cc_library( name = "mlir_hlo_builder", - srcs = [ - "ir/mlir_hlo_builder.cc", - ], - hdrs = [ - "ir/mlir_hlo_builder.h", - ], + srcs = ["ir/mlir_hlo_builder.cc"], + hdrs = ["ir/mlir_hlo_builder.h"], deps = [ 
":attribute_importer", ":hlo", @@ -890,9 +882,9 @@ cc_library( ":xla_legalize_to_linalg", ":xla_legalize_to_standard", ":xla_lower", - ":xla_materialize_broadcasts", ":xla_sink_constants_to_control_flow", ":xla_test_passes", + ":xla_transform_unranked_hlo", ], ) diff --git a/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir b/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir new file mode 100644 index 00000000000..fdd273c13d3 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir @@ -0,0 +1,66 @@ +// RUN: xla-opt -transform-unranked-hlo -split-input-file %s | FileCheck --dump-input=fail %s + +// Check the validity of expected IR. +// CHECK-LABEL: @sqr_transform_result +func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { + + // Flatten operand shape. + %shape = shape.shape_of %a : tensor<*xf32> + %num_elements = shape.num_elements %shape + %num_elements_as_index = shape.size_to_index %num_elements + %flat_shape = "xla_hlo.scalars_to_dimension_tensor"(%num_elements_as_index) + : (index) -> tensor<1xi64> + %flat_a = "xla_hlo.dynamic_reshape"(%a, %flat_shape) + : (tensor<*xf32>, tensor<1xi64>) -> tensor + + // Apply operation. + %flat_b = "xla_hlo.sqrt"(%flat_a) : (tensor) -> tensor + + // Restore original shape. + %shape_as_extent_tensor = "shape.to_extent_tensor"(%shape) + : (!shape.shape) -> tensor + %b = "xla_hlo.dynamic_reshape"(%flat_b, %shape_as_extent_tensor) + : (tensor, tensor) -> tensor<*xf32> + + return %b : tensor<*xf32> +} + +// ----- +// Check transformation of unranked code. +// CHECK-LABEL: @sqrt +// CHECK-SAME: (%[[A:.*]]: tensor<*xf32>) +func @sqrt(%a: tensor<*xf32>) -> tensor<*xf32> { + // CHECK-NEXT: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> + // CHECK-NEXT: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] + // CHECK-NEXT: %[[NUM_ELEMENTS_AS_INDEX:.*]] = shape.size_to_index %[[NUM_ELEMENTS]] + // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[NUM_ELEMENTS_AS_INDEX]]) : (index) -> tensor<1xindex> + // CHECK-NEXT: %[[FLAT_A:.*]] = "xla_hlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor + // CHECK-NEXT: %[[FLAT_B:.*]] = "xla_hlo.sqrt"(%[[FLAT_A]]) : (tensor) -> tensor + // CHECK-NEXT: %[[SHAPE_AS_EXTENT_TENSOR:.*]] = "shape.to_extent_tensor"(%[[SHAPE]]) : (!shape.shape) -> tensor + // CHECK-NEXT: %[[B:.*]] = "xla_hlo.dynamic_reshape"(%[[FLAT_B]], %[[SHAPE_AS_EXTENT_TENSOR]]) : (tensor, tensor) -> tensor<*xf32> + // CHECK-NEXT: return %[[B]] : tensor<*xf32> + %b = "xla_hlo.sqrt"(%a) : (tensor<*xf32>) -> tensor<*xf32> + return %b : tensor<*xf32> +} + +// ----- +// Not transformed when ranked. +// CHECK-LABEL: @sqrt_ranked +// CHECK-SAME: (%[[A:.*]]: tensor<3x?xf32>) +func @sqrt_ranked(%a: tensor<3x?xf32>) -> tensor<3x?xf32> { + // CHECK-NEXT: %[[B:.*]] = "xla_hlo.sqrt"(%[[A]]) : (tensor<3x?xf32>) -> tensor<3x?xf32> + // CHECK-NEXT: return %[[B]] : tensor<3x?xf32> + %b = "xla_hlo.sqrt"(%a) : (tensor<3x?xf32>) -> tensor<3x?xf32> + return %b : tensor<3x?xf32> +} + +// ----- +// Not transformed when statically shaped. 
+// CHECK-LABEL: @sqrt_static +// CHECK-SAME: (%[[A:.*]]: tensor<2x3xf32>) +func @sqrt_static(%a: tensor<2x3xf32>) -> tensor<2x3xf32> { + // CHECK-NEXT: %[[B:.*]] = "xla_hlo.sqrt"(%[[A]]) : (tensor<2x3xf32>) -> tensor<2x3xf32> + // CHECK-NEXT: return %[[B]] : tensor<2x3xf32> + %b = "xla_hlo.sqrt"(%a) : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %b : tensor<2x3xf32> +} diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index a1dd6c5ce1e..e3dd5380d7c 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -66,6 +66,9 @@ std::unique_ptr> createLegalizeToLhloPass(); // Lowers from HLO dialect to Linalg dialect. std::unique_ptr> createLegalizeHloToLinalgPass(); +// Transforms unranked HLO operations to ranked ones where possible. +std::unique_ptr> createTransformUnrankedHloPass(); + // Sinks constants implicitly captured in control flow regions. This is // necessary to export to XLA. std::unique_ptr> createSinkConstantsToControlFlowPass(); diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index 9cde6f84474..3e86820d7be 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -55,6 +55,15 @@ void SetupMaterializeBroadcastsLegality(MLIRContext *context, void PopulateMaterializeBroadcastsPatterns(MLIRContext *context, OwningRewritePatternList *patterns); +// Sets up legality definitions for element-wise operations on ranked tensors. +void SetupTransformUnrankedHloLegality(MLIRContext *context, + ConversionTarget *conversionTarget); + +// Populates a collection of rewrite patterns to realize element-wise operations +// on ranked tensors where possible. +void PopulateTransformUnrankedHloPatterns(MLIRContext *context, + OwningRewritePatternList *patterns); + // Populate a collection of conversion patterns for un-fusing // batch_norm_inference and batch_norm_training into constituent HLO ops. // TODO(laurenzo): Implement un-fusing of batch_norm_training. diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc new file mode 100644 index 00000000000..0a8dab1c866 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc @@ -0,0 +1,124 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +==============================================================================*/ + +#include "absl/memory/memory.h" +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla_hlo { +namespace { + +template +inline void AddLegalOpOnRankedTensor(ConversionTarget *conversionTarget) { + conversionTarget->addDynamicallyLegalOp([](OpTy op) { + return op.getOperand().getType().template cast().hasRank(); + }); +} + +template +struct UnaryElementwiseOpConversion : public OpRewritePattern { + explicit UnaryElementwiseOpConversion(MLIRContext *context) + : OpRewritePattern(context) {} + + LogicalResult matchAndRewrite(OpTy op, + PatternRewriter &rewriter) const override { + // Don't apply conversion to ops with statically shaped operands. + Value operand = op.getOperand(); + auto operandTy = operand.getType().dyn_cast(); + if (operandTy.hasRank()) return failure(); + + // Generate IR to flatten the operand. + auto loc = op.getLoc(); + Value shape = rewriter.create(loc, operand); + Value numElements = rewriter.create( + loc, rewriter.getType(), shape); + Value numElementsAsIndex = rewriter.create( + loc, rewriter.getIndexType(), numElements); + auto dimTensorTy = RankedTensorType::get({1}, rewriter.getIndexType()); + Value flatShapeAsDimTensor = + rewriter.create( + loc, dimTensorTy, numElementsAsIndex); + auto flatTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, + operandTy.getElementType()); + Value flatOperand = rewriter.create( + loc, flatTensorTy, operand, flatShapeAsDimTensor); + + // Generate IR for the actual operation. + Value flatResult = rewriter.create(loc, flatTensorTy, flatOperand); + + // Generate IR to restore the original shape. + auto extentTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, + rewriter.getIndexType()); + Value shapeAsExtentTensor = + rewriter.create(loc, extentTensorTy, shape); + Value result = rewriter.create( + loc, operandTy, flatResult, shapeAsExtentTensor); + rewriter.replaceOp(op, result); + + return success(); + } +}; + +struct TransformUnrankedHloPass + : public PassWrapper { + void runOnFunction() override { + ConversionTarget conversionTarget(getContext()); + OwningRewritePatternList conversionPatterns; + SetupTransformUnrankedHloLegality(&getContext(), &conversionTarget); + PopulateTransformUnrankedHloPatterns(&getContext(), &conversionPatterns); + + if (failed(applyPartialConversion(getFunction(), conversionTarget, + conversionPatterns))) + return signalPassFailure(); + } +}; + +} // namespace + +void SetupTransformUnrankedHloLegality(MLIRContext *context, + ConversionTarget *conversionTarget) { + conversionTarget->addLegalDialect(); + + // Targeted operations are only legal when they operate on ranked tensors. 
+ AddLegalOpOnRankedTensor(conversionTarget); +} + +void PopulateTransformUnrankedHloPatterns(MLIRContext *context, + OwningRewritePatternList *patterns) { + patterns->insert>(context); +} + +std::unique_ptr> createTransformUnrankedHloPass() { + return absl::make_unique(); +} + +static PassRegistration transform_unranked_hlo_pass( + "transform-unranked-hlo", + "Realize element-wise operations on ranked tensors where possible"); + +} // namespace xla_hlo +} // namespace mlir From baafcdd4f5a2e0267ed04f82475b421d38b3aa65 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 05:23:28 -0700 Subject: [PATCH 1379/1533] Fix tf.function from SavedModel: don't ignore args after a RaggedTensor when selecting the concrete function to run. In the fixed code, the next `elif` is cleaned up for readability as well. PiperOrigin-RevId: 313763097 Change-Id: I4b2ec78f3c73f2581458fcd1ab353d7a3ae57ddc --- .../saved_model/function_deserialization.py | 12 +++++++---- tensorflow/python/saved_model/load_test.py | 20 +++++++++++++------ 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py index b36a1f27456..dccb222c26e 100644 --- a/tensorflow/python/saved_model/function_deserialization.py +++ b/tensorflow/python/saved_model/function_deserialization.py @@ -107,10 +107,14 @@ def _concrete_function_callable_with(function, inputs, allow_conversion): if not expected.shape.is_compatible_with(arg.shape): return False elif isinstance(expected, type_spec.TypeSpec): - return expected.is_compatible_with(arg) - elif (_is_tensor(arg) and - id(arg) != id(expected)) or (not _is_tensor(arg) and arg != expected): - return False + if not expected.is_compatible_with(arg): + return False + elif _is_tensor(arg): + if id(arg) != id(expected): + return False + else: + if arg != expected: + return False return True diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py index 462f6f50f11..2144682e21b 100644 --- a/tensorflow/python/saved_model/load_test.py +++ b/tensorflow/python/saved_model/load_test.py @@ -1823,11 +1823,15 @@ class LoadTest(test.TestCase, parameterized.TestCase): def test_ragged(self, cycles): - @def_function.function(input_signature=[ - ragged_tensor.RaggedTensorSpec(shape=[None, None], dtype=dtypes.int32) - ]) - def f(x): - return x + 1 + @def_function.function + def f(x, c=1): + """Returns Tensor x incremented by Python constant c.""" + return math_ops.add(x, c) + + for c in (1, 2, 3): + _ = f.get_concrete_function( + ragged_tensor.RaggedTensorSpec([None, None], dtype=dtypes.int32), + c) obj = tracking.AutoTrackable() obj.f = f @@ -1835,10 +1839,14 @@ class LoadTest(test.TestCase, parameterized.TestCase): imported1 = cycle(obj, cycles, signatures={}) rt = ragged_factory_ops.constant([[1, 2], [3]]) self.assertAllEqual(imported1.f(rt), [[2, 3], [4]]) + self.assertAllEqual(imported1.f(rt, 2), [[3, 4], [5]]) + self.assertAllEqual(imported1.f(rt, 3), [[4, 5], [6]]) imported2 = cycle(obj, cycles) rt = ragged_factory_ops.constant([[1, 2], [3]]) - self.assertAllEqual(imported2.f(rt), [[2, 3], [4]]) + self.assertAllEqual(imported2.f(rt, 1), [[2, 3], [4]]) + self.assertAllEqual(imported2.f(rt, 2), [[3, 4], [5]]) + self.assertAllEqual(imported2.f(rt, 3), [[4, 5], [6]]) @keras_parameterized.run_all_keras_modes(always_skip_v1=True) From 1ab54ed35d910f44a652d0a00a1aab93384cbeb5 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Fri, 
29 May 2020 05:59:37 -0700 Subject: [PATCH 1380/1533] [XLA][MLIR] Update `xla_transform_unranked_hlo` to use `TensorFromElementsOp`. PiperOrigin-RevId: 313766425 Change-Id: Ia3c138f01370ced0bf7ce8d71373547f2b837c76 --- .../mlir/xla/tests/xla-transform-unranked-hlo.mlir | 7 +++---- .../mlir/xla/transforms/xla_transform_unranked_hlo.cc | 4 +--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir b/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir index fdd273c13d3..8b26a5e4121 100644 --- a/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/xla-transform-unranked-hlo.mlir @@ -8,10 +8,9 @@ func @sqr_transform_result(%a: tensor<*xf32>) -> tensor<*xf32> { %shape = shape.shape_of %a : tensor<*xf32> %num_elements = shape.num_elements %shape %num_elements_as_index = shape.size_to_index %num_elements - %flat_shape = "xla_hlo.scalars_to_dimension_tensor"(%num_elements_as_index) - : (index) -> tensor<1xi64> + %flat_shape = tensor_from_elements(%num_elements_as_index) : tensor<1xindex> %flat_a = "xla_hlo.dynamic_reshape"(%a, %flat_shape) - : (tensor<*xf32>, tensor<1xi64>) -> tensor + : (tensor<*xf32>, tensor<1xindex>) -> tensor // Apply operation. %flat_b = "xla_hlo.sqrt"(%flat_a) : (tensor) -> tensor @@ -33,7 +32,7 @@ func @sqrt(%a: tensor<*xf32>) -> tensor<*xf32> { // CHECK-NEXT: %[[SHAPE:.*]] = shape.shape_of %[[A]] : tensor<*xf32> // CHECK-NEXT: %[[NUM_ELEMENTS:.*]] = shape.num_elements %[[SHAPE]] // CHECK-NEXT: %[[NUM_ELEMENTS_AS_INDEX:.*]] = shape.size_to_index %[[NUM_ELEMENTS]] - // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[NUM_ELEMENTS_AS_INDEX]]) : (index) -> tensor<1xindex> + // CHECK-NEXT: %[[FLAT_SHAPE:.*]] = tensor_from_elements(%[[NUM_ELEMENTS_AS_INDEX]]) : tensor<1xindex> // CHECK-NEXT: %[[FLAT_A:.*]] = "xla_hlo.dynamic_reshape"(%[[A]], %[[FLAT_SHAPE]]) : (tensor<*xf32>, tensor<1xindex>) -> tensor // CHECK-NEXT: %[[FLAT_B:.*]] = "xla_hlo.sqrt"(%[[FLAT_A]]) : (tensor) -> tensor // CHECK-NEXT: %[[SHAPE_AS_EXTENT_TENSOR:.*]] = "shape.to_extent_tensor"(%[[SHAPE]]) : (!shape.shape) -> tensor diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc index 0a8dab1c866..b2afc7c1026 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_transform_unranked_hlo.cc @@ -57,10 +57,8 @@ struct UnaryElementwiseOpConversion : public OpRewritePattern { loc, rewriter.getType(), shape); Value numElementsAsIndex = rewriter.create( loc, rewriter.getIndexType(), numElements); - auto dimTensorTy = RankedTensorType::get({1}, rewriter.getIndexType()); Value flatShapeAsDimTensor = - rewriter.create( - loc, dimTensorTy, numElementsAsIndex); + rewriter.create(loc, numElementsAsIndex); auto flatTensorTy = RankedTensorType::get({ShapedType::kDynamicSize}, operandTy.getElementType()); Value flatOperand = rewriter.create( From ea3af1e6953eba2787d52a7b6ac1eb97c7e853e7 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Fri, 29 May 2020 06:19:59 -0700 Subject: [PATCH 1381/1533] [XLA][MLIR] Remove old BufferAssignment code. 
PiperOrigin-RevId: 313768788 Change-Id: I41b201701d67467d50bdce2c0925521bf953dd10 --- tensorflow/compiler/mlir/xla/BUILD | 38 -- .../mlir/xla/tests/buffer-assignment.mlir | 260 ---------- .../mlir/xla/transforms/buffer_assignment.cc | 485 ------------------ .../mlir/xla/transforms/buffer_assignment.h | 140 ----- .../xla/transforms/buffer_assignment_test.cc | 180 ------- 5 files changed, 1103 deletions(-) delete mode 100644 tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir delete mode 100644 tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc delete mode 100644 tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h delete mode 100644 tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 36a3afcbd81..d2bae14c9c3 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -353,42 +353,6 @@ cc_library( alwayslink = 1, ) -cc_library( - name = "buffer_assignment", - srcs = ["transforms/buffer_assignment.cc"], - hdrs = ["transforms/buffer_assignment.h"], - deps = [ - "@com_google_absl//absl/memory", - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - ], - alwayslink = 1, -) - -cc_library( - name = "buffer_assignment_test", - srcs = ["transforms/buffer_assignment_test.cc"], - hdrs = [ - "transforms/buffer_assignment.h", - "transforms/passes.h", - ], - deps = [ - "@com_google_absl//absl/memory", - "@llvm-project//llvm:support", - "@llvm-project//mlir:Analysis", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - ], - alwayslink = 1, -) - gentbl( name = "xla_legalize_to_standard_inc_gen", tbl_outs = [ @@ -863,8 +827,6 @@ cc_library( "//tensorflow/compiler/mlir:__subpackages__", ], deps = [ - ":buffer_assignment", - ":buffer_assignment_test", ":chlo_legalize_to_hlo", ":hlo", ":hlo_legalize_to_lhlo", diff --git a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir b/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir deleted file mode 100644 index d6c164f8160..00000000000 --- a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir +++ /dev/null @@ -1,260 +0,0 @@ -// RUN: tf-opt -test-buffer-assignment -allow-unregistered-dialect -split-input-file %s | FileCheck %s -dump-input-on-failure - -// CHECK-LABEL: func @func_signature_conversion -func @func_signature_conversion(%arg0: tensor<4x8xf32>) { - return -} -// CHECK: ({{.*}}: memref<4x8xf32>) { - -// ----- - -// CHECK-LABEL: func @non_void_to_void_return_op_converter -func @non_void_to_void_return_op_converter(%arg0: tensor<4x8xf32>) -> tensor<4x8xf32> { - return %arg0 : tensor<4x8xf32> -} -// CHECK: (%[[ARG0:.*]]: [[TYPE:.*]]<[[RANK:.*]]>, %[[RESULT:.*]]: [[TYPE]]<[[RANK]]>) { -// CHECK-NEXT: "buffer_assignment_test.copy"(%[[ARG0]], %[[RESULT]]) -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @func_and_block_signature_conversion -func @func_and_block_signature_conversion(%arg0 : tensor<2xf32>, %cond : i1, %arg1: tensor<4x4xf32>) -> tensor<4x4xf32>{ - cond_br %cond, ^bb1, ^bb2 - ^bb1: - br ^exit(%arg0 : tensor<2xf32>) - ^bb2: - br ^exit(%arg0 : tensor<2xf32>) - ^exit(%arg2: tensor<2xf32>): - return %arg1 : tensor<4x4xf32> -} -// CHECK: (%[[ARG0:.*]]: [[ARG0_TYPE:.*]], %[[COND:.*]]: i1, %[[ARG1:.*]]: 
[[ARG1_TYPE:.*]], %[[RESULT:.*]]: [[RESULT_TYPE:.*]]) { -// CHECK: br ^[[EXIT_BLOCK:.*]](%[[ARG0]] : [[ARG0_TYPE]]) -// CHECK: br ^[[EXIT_BLOCK]](%[[ARG0]] : [[ARG0_TYPE]]) -// CHECK: ^[[EXIT_BLOCK]](%{{.*}}: [[ARG0_TYPE]]) -// CHECK-NEXT: "buffer_assignment_test.copy"(%[[ARG1]], %[[RESULT]]) -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @condBranch -func @condBranch(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - cond_br %cond, ^bb1, ^bb2 - ^bb1: - br ^exit(%arg0 : tensor<2xf32>) - ^bb2: - %1 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - br ^exit(%1 : tensor<2xf32>) - ^exit(%arg1: tensor<2xf32>): - return %arg1 : tensor<2xf32> - -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: cond_br -// CHECK: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @emptyUsesValue -func @emptyUsesValue(%arg0: memref<4xf32>) { - %0 = alloc() : memref<4xf32> - return -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @criticalEdge -func @criticalEdge(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - cond_br %cond, ^bb1, ^exit(%arg0 : tensor<2xf32>) - ^bb1: - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - br ^exit(%0 : tensor<2xf32>) - ^exit(%arg1: tensor<2xf32>): - return %arg1 : tensor<2xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: cond_br -// CHECK: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @invCriticalEdge -func @invCriticalEdge(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1, ^exit(%arg0 : tensor<2xf32>) - ^bb1: - br ^exit(%0 : tensor<2xf32>) - ^exit(%arg1: tensor<2xf32>): - return %arg1 : tensor<2xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @ifElse -func @ifElse(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), - ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) - ^bb1(%arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>): - br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) - ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): - br ^exit(%arg3, %arg4 : tensor<2xf32>, tensor<2xf32>) - ^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - %1 = "buffer_assignment_test.unary"(%arg5) : (tensor<2xf32>) -> tensor<2xf32> - return %1 : tensor<2xf32> -} -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK-NEXT: dealloc %[[FIRST_ALLOC]] -// CHECK-NEXT: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[SECOND_ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @ifElseNoUsers -func @ifElseNoUsers(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), - ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) - ^bb1(%arg1 : 
tensor<2xf32>, %arg2 : tensor<2xf32>): - br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) - ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): - br ^exit(%arg3, %arg4 : tensor<2xf32>, tensor<2xf32>) - ^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - return %arg0 : tensor<2xf32> -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @ifElseNested -func @ifElseNested(%cond : i1, %arg0 : tensor<2xf32>) -> tensor<2xf32>{ - %0 = "buffer_assignment_test.unary"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> - cond_br %cond, ^bb1(%arg0, %0: tensor<2xf32>, tensor<2xf32>), - ^bb2(%0, %arg0: tensor<2xf32>, tensor<2xf32>) - ^bb1(%arg1 : tensor<2xf32>, %arg2 : tensor<2xf32>): - br ^exit(%arg1, %arg2 : tensor<2xf32>, tensor<2xf32>) - ^bb2(%arg3 : tensor<2xf32>, %arg4 : tensor<2xf32>): - cond_br %cond, ^bb3(%arg3 : tensor<2xf32>), ^bb4(%arg4 : tensor<2xf32>) - ^bb3(%arg7 : tensor<2xf32>): - br ^exit(%arg7, %arg3 : tensor<2xf32>, tensor<2xf32>) - ^bb4(%arg8 : tensor<2xf32>): - br ^exit(%arg3, %arg8 : tensor<2xf32>, tensor<2xf32>) - ^exit(%arg5 : tensor<2xf32>, %arg6 : tensor<2xf32>): - %1 = "buffer_assignment_test.unary"(%arg5) : (tensor<2xf32>) -> tensor<2xf32> - return %1 : tensor<2xf32> -} -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK-NEXT: dealloc %[[FIRST_ALLOC]] -// CHECK-NEXT: "buffer_assignment_test.copy -// CHECK-NEXT: dealloc %[[SECOND_ALLOC]] -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @redundantOperations -func @redundantOperations(%arg0: tensor<4xf32>) { - %1 = "buffer_assignment_test.unary"(%arg0) : (tensor<4xf32>) -> tensor<4xf32> - %2 = "buffer_assignment_test.unary"(%1) : (tensor<4xf32>) -> tensor<4xf32> - return -} -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK-NEXT: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK-NEXT: "buffer_assignment_test.unary_lowered" -// CHECK-NEXT: dealloc -// CHECK-NEXT: dealloc -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @moving_alloc_and_inserting_missing_dealloc -func @moving_alloc_and_inserting_missing_dealloc(%cond : i1, %arg0 : memref<2xf32>, %arg1: memref<2xf32>){ - cond_br %cond, ^bb1, ^bb2 - ^bb1: - %0 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () - br ^exit(%0 : memref<2xf32>) - ^bb2: - - %1 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %1) : (memref<2xf32>, memref<2xf32>) -> () - br ^exit(%1 : memref<2xf32>) - ^exit(%arg2: memref<2xf32>): - "buffer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -// CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() -// CHECK-NEXT: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK: "buffer_assignment_test.copy" -// CHECK-NEXT: dealloc -// CHECK-NEXT: dealloc -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @moving_invalid_dealloc_op_complex -func @moving_invalid_dealloc_op_complex(%cond : i1, %arg0 : memref<2xf32>, %arg1: memref<2xf32>){ - cond_br %cond, ^bb1, ^bb2 - ^bb1: - br ^exit(%arg0 : memref<2xf32>) - ^bb2: - %1 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %1) : (memref<2xf32>, memref<2xf32>) -> () - dealloc %1 : memref<2xf32> - 
br ^exit(%1 : memref<2xf32>) - ^exit(%arg2: memref<2xf32>): - "buffer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -// CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK: buffer_assignment_test.copy -// CHECK-NEXT: dealloc -// CHECK-NEXT: return - -// ----- - -// CHECK-LABEL: func @inserting_missing_dealloc_simple -func @inserting_missing_dealloc_simple(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ - %0 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () - "buffer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -// CHECK: buffer_assignment_test.copy -// CHECK-NEXT: dealloc - -// ----- - -// CHECK-LABEL: func @moving_invalid_dealloc_op -func @moving_invalid_dealloc_op(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ - %0 = alloc() : memref<2xf32> - "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () - dealloc %0 : memref<2xf32> - "buffer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () - return -} -// CHECK: buffer_assignment_test.copy -// CHECK-NEXT: dealloc diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc deleted file mode 100644 index 640b9b84622..00000000000 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.cc +++ /dev/null @@ -1,485 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements logic for computing proper alloc and dealloc positions. -// The main class is the BufferAssignment class that realizes this analysis. -// In order to put allocations and deallocations at safe positions, it is -// significantly important to put them into the proper blocks. However, the -// liveness analysis does not pay attention to aliases, which can occur due to -// branches (and their associated block arguments) in general. For this purpose, -// BufferAssignment firstly finds all possible aliases for a single value (using -// the BufferAssignmentAliasAnalysis class). Consider the following example: -// -// ^bb0(%arg0): -// cond_br %cond, ^bb1, ^bb2 -// ^bb1: -// br ^exit(%arg0) -// ^bb2: -// %new_value = ... -// br ^exit(%new_value) -// ^exit(%arg1): -// return %arg1; -// -// Using liveness information on its own would cause us to place the allocs and -// deallocs in the wrong block. This is due to the fact that %new_value will not -// be liveOut of its block. Instead, we have to place the alloc for %new_value -// in bb0 and its associated dealloc in exit. Using the class -// BufferAssignmentAliasAnalysis, we will find out that %new_value has a -// potential alias %arg1. In order to find the dealloc position we have to find -// all potential aliases, iterate over their uses and find the common -// post-dominator block. 
In this block we can safely be sure that %new_value -// will die and can use liveness information to determine the exact operation -// after which we have to insert the dealloc. Finding the alloc position is -// highly similar and non- obvious. Again, we have to consider all potential -// aliases and find the common dominator block to place the alloc. -// -// TODO(dfki): -// The current implementation does not support loops. The only thing that -// is currently missing is a high-level loop analysis that allows us to move -// allocs and deallocs outside of the loop blocks. - -#include "tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h" - -#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project -#include "mlir/IR/Function.h" // TF:llvm-project -#include "mlir/IR/Operation.h" // TF:llvm-project -#include "mlir/Pass/Pass.h" // TF:llvm-project -#include "absl/memory/memory.h" - -namespace mlir { -namespace xla { -namespace { - -//===----------------------------------------------------------------------===// -// BufferAssignmentAliasAnalysis -//===----------------------------------------------------------------------===// - -/// A straight-forward alias analysis which ensures that all aliases of all -/// values will be determined. This is a requirement for the BufferAssignment -/// class since you need to determine safe positions to place alloc and -/// deallocs. -class BufferAssignmentAliasAnalysis { - public: - using ValueSetT = SmallPtrSet; - - public: - /// Constructs a new alias analysis using the op provided. - BufferAssignmentAliasAnalysis(Operation* op) { build(op->getRegions()); } - - /// Finds all immediate and indirect aliases this value could potentially - /// have. Note that the resulting set will also contain the value provided as - /// it is an alias of itself. - ValueSetT resolve(Value value) const { - ValueSetT result; - resolveRecursive(value, result); - return result; - } - - private: - /// Recursively determines alias information for the given value. It stores - /// all newly found potential aliases in the given result set. - void resolveRecursive(Value value, ValueSetT& result) const { - if (!result.insert(value).second) { - return; - } - auto it = aliases.find(value); - if (it == aliases.end()) return; - for (auto alias : it->second) { - resolveRecursive(alias, result); - } - } - - /// This function constructs a mapping from values to its immediate aliases. - /// It iterates over all blocks, gets their predecessors, determines the - /// values that will be passed to the corresponding block arguments and - /// inserts them into map. - void build(MutableArrayRef regions) { - for (Region& region : regions) { - for (Block& block : region) { - // Iterate over all predecessor and get the mapped values to their - // corresponding block arguments values. - for (auto pred : block.getPredecessors()) { - // Determine the current successor index of the current predecessor. - unsigned successorIndex = std::distance( - pred->getSuccessors().begin(), - llvm::find_if(pred->getSuccessors(), [&](Block* successor) { - return successor == █ - })); - // Get the terminator and the values that will be passed to our block. - if (auto branchInterface = - dyn_cast(pred->getTerminator())) { - // Query the branch op interace to get the successor operands. - auto successorOps = - branchInterface.getSuccessorOperands(successorIndex); - if (successorOps.hasValue()) { - // Build the actual mapping of values to their immediate aliases. 
- for (auto arg : block.getArguments()) { - Value predecessorArgValue = - successorOps.getValue()[arg.getArgNumber()]; - aliases[predecessorArgValue].insert(arg); - } - } - } - } - } - } - } - - /// Maps values to all immediate aliases this value can have. - llvm::DenseMap aliases; -}; - -//===----------------------------------------------------------------------===// -// BufferAssignmentPositions -//===----------------------------------------------------------------------===// - -/// Stores proper alloc and dealloc positions to place dialect-specific alloc -/// and dealloc operations. -struct BufferAssignmentPositions { - public: - BufferAssignmentPositions() - : allocPosition(nullptr), deallocPosition(nullptr) {} - - /// Creates a new positions tuple including alloc and dealloc positions. - BufferAssignmentPositions(Operation* allocPosition, - Operation* deallocPosition) - : allocPosition(allocPosition), deallocPosition(deallocPosition) {} - - /// Returns the alloc position before which the alloc operation has to be - /// inserted. - Operation* getAllocPosition() const { return allocPosition; } - - /// Returns the dealloc position after which the dealloc operation has to be - /// inserted. - Operation* getDeallocPosition() const { return deallocPosition; } - - private: - Operation* allocPosition; - Operation* deallocPosition; -}; - -//===----------------------------------------------------------------------===// -// BufferAssignmentAnalysis -//===----------------------------------------------------------------------===// - -// The main buffer assignment analysis used to place allocs and deallocs. -class BufferAssignmentAnalysis { - public: - using DeallocSetT = SmallPtrSet; - - public: - BufferAssignmentAnalysis(Operation* op) - : operation(op), - liveness(op), - dominators(op), - postDominators(op), - aliases(op) {} - - /// Computes the actual positions to place allocs and deallocs for the given - /// value. - BufferAssignmentPositions computeAllocAndDeallocPositions(Value value) const { - if (value.use_empty()) { - return BufferAssignmentPositions(value.getDefiningOp(), - value.getDefiningOp()); - } - // Get all possible aliases - auto possibleValues = aliases.resolve(value); - return BufferAssignmentPositions(getAllocPosition(value, possibleValues), - getDeallocPosition(value, possibleValues)); - } - - /// Finds all associated dealloc nodes for the alloc nodes using alias - /// information. - DeallocSetT findAssociatedDeallocs(AllocOp alloc) const { - DeallocSetT result; - auto possibleValues = aliases.resolve(alloc); - for (auto alias : possibleValues) { - for (auto user : alias.getUsers()) { - if (isa(user)) result.insert(user); - } - } - return result; - } - - /// Dumps the buffer assignment information to the given stream. - void print(raw_ostream& os) const { - os << "// ---- Buffer Assignment -----\n"; - - for (Region& region : operation->getRegions()) - for (Block& block : region) - for (Operation& operation : block) - for (Value result : operation.getResults()) { - BufferAssignmentPositions positions = - computeAllocAndDeallocPositions(result); - os << "Positions for "; - result.print(os); - os << "\n Alloc: "; - positions.getAllocPosition()->print(os); - os << "\n Dealloc: "; - positions.getDeallocPosition()->print(os); - os << "\n"; - } - } - - private: - /// Finds a proper placement block to store alloc/dealloc node according to - /// the algorithm described at the top of the file. It supports dominator and - /// post-dominator analyses via template arguments. 
- template - Block* findPlacementBlock(Value value, const AliasesT& aliases, - const DominatorT& doms) const { - assert(!value.isa() && "Cannot place a block argument"); - // Start with the current block the value is defined in. - Block* dom = value.getDefiningOp()->getBlock(); - // Iterate over all aliases and their uses to find a safe placement block - // according to the given dominator information. - for (auto alias : aliases) { - for (auto user : alias.getUsers()) { - // Move upwards in the dominator tree to find an appropriate - // dominator block that takes the current use into account. - dom = doms.findNearestCommonDominator(dom, user->getBlock()); - } - } - return dom; - } - - /// Finds a proper alloc positions according to the algorithm described at the - /// top of the file. - template - Operation* getAllocPosition(Value value, const AliasesT& aliases) const { - // Determine the actual block to place the alloc and get liveness - // information. - auto placementBlock = findPlacementBlock(value, aliases, dominators); - auto livenessInfo = liveness.getLiveness(placementBlock); - - // We have to ensure that the alloc will be before the first use of all - // aliases of the given value. We first assume that there are no uses in the - // placementBlock and that we can safely place the alloc before the - // terminator at the end of the block. - Operation* startOperation = placementBlock->getTerminator(); - // Iterate over all aliases and ensure that the startOperation will point to - // the first operation of all potential aliases in the placementBlock. - for (auto alias : aliases) { - auto aliasStartOperation = livenessInfo->getStartOperation(alias); - // Check whether the aliasStartOperation lies in the desired block and - // whether it is before the current startOperation. If yes, this will be - // the new startOperation. - if (aliasStartOperation->getBlock() == placementBlock && - aliasStartOperation->isBeforeInBlock(startOperation)) { - startOperation = aliasStartOperation; - } - } - // startOperation is the first operation before which we can safely store - // the alloc taking all potential aliases into account. - return startOperation; - } - - /// Finds a proper dealloc positions according to the algorithm described at - /// the top of the file. - template - Operation* getDeallocPosition(Value value, const AliasesT& aliases) const { - // Determine the actual block to place the dealloc and get liveness - // information. - auto placementBlock = findPlacementBlock(value, aliases, postDominators); - auto livenessInfo = liveness.getLiveness(placementBlock); - - // We have to ensure that the dealloc will be after the last use of all - // aliases of the given value. We first assume that there are no uses in the - // placementBlock and that we can safely place the dealloc at the beginning. - Operation* endOperation = &placementBlock->front(); - // Iterate over all aliases and ensure that the endOperation will point to - // the last operation of all potential aliases in the placementBlock. - for (auto alias : aliases) { - auto aliasEndOperation = - livenessInfo->getEndOperation(alias, endOperation); - // Check whether the aliasEndOperation lies in the desired block and - // whether it is behind the current endOperation. If yes, this will be the - // new endOperation. 
- if (aliasEndOperation->getBlock() == placementBlock && - endOperation->isBeforeInBlock(aliasEndOperation)) { - endOperation = aliasEndOperation; - } - } - // endOperation is the last operation behind which we can safely store the - // dealloc taking all potential aliases into account. - return endOperation; - } - - /// The operation this transformation was constructed from. - Operation* operation; - - /// The underlying liveness analysis to compute fine grained information about - /// alloc and dealloc positions. - Liveness liveness; - - /// The dominator analysis to place allocs in the appropriate blocks. - DominanceInfo dominators; - - /// The post dominator analysis to place deallocs in the appropriate blocks. - PostDominanceInfo postDominators; - - /// The internal alias analysis to ensure that allocs and deallocs take all - /// their potential aliases into account. - BufferAssignmentAliasAnalysis aliases; -}; - -//===----------------------------------------------------------------------===// -// BufferAssignmentPass -//===----------------------------------------------------------------------===// - -/// The actual buffer assignment pass that moves alloc and dealloc nodes into -/// the right positions. It uses the algorithm described at the top of the file. -// TODO(dfki): create a templated version that allows to match dialect-specific -// alloc/dealloc nodes and to insert dialect-specific dealloc node. -struct BufferAssignmentPass - : mlir::PassWrapper { - void runOnFunction() override { - // Get required analysis information first. - auto& analysis = getAnalysis(); - - // Compute an initial placement of all nodes. - llvm::SmallDenseMap placements; - getFunction().walk([&](AllocOp alloc) { - placements[alloc] = analysis.computeAllocAndDeallocPositions(alloc); - }); - - // Move alloc (and dealloc - if any) nodes into the right places - // and insert dealloc nodes if necessary. - getFunction().walk([&](AllocOp alloc) { - // Find already associated dealloc nodes. - auto deallocs = analysis.findAssociatedDeallocs(alloc); - assert(deallocs.size() < 2 && - "Not supported number of associated dealloc operations"); - - // Move alloc node to the right place. - BufferAssignmentPositions& positions = placements[alloc]; - Operation* allocOperation = alloc.getOperation(); - allocOperation->moveBefore(positions.getAllocPosition()); - - // If there is an existing dealloc, move it to the right place. - if (deallocs.size()) { - Operation* nextOp = positions.getDeallocPosition()->getNextNode(); - assert(nextOp && "Invalid Dealloc operation position"); - (*deallocs.begin())->moveBefore(nextOp); - } else { - // If there is no dealloc node, insert one in the right place. - OpBuilder builder(alloc); - builder.setInsertionPointAfter(positions.getDeallocPosition()); - builder.create(allocOperation->getLoc(), alloc); - } - }); - }; -}; - -} // namespace - -//===----------------------------------------------------------------------===// -// BufferAssignmentPlacer -//===----------------------------------------------------------------------===// - -/// Creates a new assignment placer. -BufferAssignmentPlacer::BufferAssignmentPlacer(Operation* op) - : operation(op), dominators(op) {} - -/// Computes the actual position to place allocs for the given value. 
-OpBuilder::InsertPoint BufferAssignmentPlacer::computeAllocPosition( - Value value) { - Operation* insertOp = value.getDefiningOp(); - assert(insertOp && "There is not a defining operation for the input value"); - OpBuilder opBuilder(insertOp); - return opBuilder.saveInsertionPoint(); -} - -//===----------------------------------------------------------------------===// -// FunctionAndBlockSignatureConverter -//===----------------------------------------------------------------------===// - -// Performs the actual signature rewriting step. -LogicalResult FunctionAndBlockSignatureConverter::matchAndRewrite( - FuncOp funcOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const { - auto toMemrefConverter = [&](Type t) -> Type { - if (auto tensorType = t.dyn_cast()) { - return MemRefType::get(tensorType.getShape(), - tensorType.getElementType()); - } - return t; - }; - // Converting tensor-type function arguments to memref-type. - auto funcType = funcOp.getType(); - TypeConverter::SignatureConversion conversion(funcType.getNumInputs()); - for (auto argType : llvm::enumerate(funcType.getInputs())) { - conversion.addInputs(argType.index(), toMemrefConverter(argType.value())); - } - for (auto resType : funcType.getResults()) { - conversion.addInputs(toMemrefConverter(resType)); - } - rewriter.updateRootInPlace(funcOp, [&] { - funcOp.setType( - rewriter.getFunctionType(conversion.getConvertedTypes(), llvm::None)); - rewriter.applySignatureConversion(&funcOp.getBody(), conversion); - }); - // Converting tensor-type block arugments of all blocks inside the - // function region to memref-type except for the entry block. - for (auto& block : funcOp.getBlocks()) { - if (block.isEntryBlock()) continue; - for (int i = 0, e = block.getNumArguments(); i < e; ++i) { - auto oldArg = block.getArgument(i); - auto newArg = - block.insertArgument(i, toMemrefConverter(oldArg.getType())); - oldArg.replaceAllUsesWith(newArg); - block.eraseArgument(i + 1); - } - } - return success(); -} - -/// A helper method to make the functions, whose all block argument types are -/// Memref or non-shaped type, legal. BufferAssignmentPlacer expects all -/// function and block argument types are in Memref or non-shaped type. Using -/// this helper method and additionally, FunctionAndBlockSignatureConverter as a -/// pattern conversion make sure that the type of block arguments are compatible -/// with using BufferAssignmentPlacer. 
-void FunctionAndBlockSignatureConverter::addDynamicallyLegalFuncOp( - ConversionTarget& target) { - auto isLegalBlockArg = [](BlockArgument arg) -> bool { - auto type = arg.getType(); - return type.isa() || !type.isa(); - }; - target.addDynamicallyLegalOp([&](FuncOp funcOp) { - bool legality = true; - for (auto& block2 : funcOp.getBlocks()) { - legality &= llvm::all_of(block2.getArguments(), isLegalBlockArg); - if (!legality) break; - } - return legality; - }); -} - -//===----------------------------------------------------------------------===// -// Buffer assignment pass registrations -//===----------------------------------------------------------------------===// - -std::unique_ptr> createBufferAssignmentPass() { - return absl::make_unique(); -} - -static PassRegistration buffer_assignment_pass( - "buffer-assignment", - "Executes buffer assignment pass to automatically move alloc and dealloc " - "operations into their proper positions"); - -} // namespace xla -} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h deleted file mode 100644 index ced5769b44c..00000000000 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_BUFFER_ASSIGNMENT_H_ -#define TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_BUFFER_ASSIGNMENT_H_ - -#include "mlir/Analysis/Liveness.h" -#include "mlir/IR/Builders.h" // TF:llvm-project -#include "mlir/IR/Dominance.h" -#include "mlir/IR/Operation.h" // TF:llvm-project -#include "mlir/Support/LLVM.h" -#include "mlir/Transforms/DialectConversion.h" // TF:llvm-project - -namespace mlir { -namespace xla { - -/// Prepares a buffer assignment phase. It can place (user-defined) alloc -/// nodes. This simplifies the integration of the actual buffer-assignment -/// pass. Sample usage: -/// BufferAssignmentPlacer baHelper(regionOp); -/// -> determine alloc positions -/// auto allocPosition = baHelper.computeAllocPosition(value); -/// -> place alloc -/// allocBuilder.setInsertionPoint(positions.getAllocPosition()); -/// -/// alternatively: -/// -> place alloc -/// baHelper.insertAlloc(...); -/// Note: this class is intended to be used during legalization. In order -/// to move alloc and dealloc nodes into the right places you can use the -/// createBufferAssignmentPass() function. -class BufferAssignmentPlacer { - public: - /// Creates a new assignment builder. - explicit BufferAssignmentPlacer(Operation* op); - - /// Returns the operation this analysis was constructed from. - Operation* getOperation() const { return operation; } - - /// Computes the actual position to place allocs for the given value. 
- OpBuilder::InsertPoint computeAllocPosition(Value value); - - private: - /// The operation this analysis was constructed from. - Operation* operation; - - /// The dominator analysis to place allocs in the appropriate blocks. - DominanceInfo dominators; -}; - -/// Helper conversion pattern that encapsulates a BufferAssignmentPlacer -/// instance. -template -class BufferAssignmentOpConversionPattern - : public OpConversionPattern { - public: - explicit BufferAssignmentOpConversionPattern( - MLIRContext* context_, - xla::BufferAssignmentPlacer* bufferAssignment_ = nullptr, - PatternBenefit benefit_ = 1) - : OpConversionPattern(context_, benefit_), - bufferAssignment(bufferAssignment_) {} - - protected: - xla::BufferAssignmentPlacer* bufferAssignment; -}; - -// Converts only the tensor-type function and block arguments to memref-type. -class FunctionAndBlockSignatureConverter - : public BufferAssignmentOpConversionPattern { - public: - using BufferAssignmentOpConversionPattern< - FuncOp>::BufferAssignmentOpConversionPattern; - - // Adding functions whose arguments are memref type to the set of legal - // operations. - static void addDynamicallyLegalFuncOp(ConversionTarget& target); - - // Performs the actual signature rewriting step. - LogicalResult matchAndRewrite( - FuncOp funcOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final; -}; - -// This pattern converter transforms a non-void ReturnOpSourceTy into a void -// return of type ReturnOpTargetTy. It uses a copy operation of type CopyOpTy to -// copy the results to the output buffer. -template -class NonVoidToVoidReturnOpConverter - : public BufferAssignmentOpConversionPattern { - public: - using BufferAssignmentOpConversionPattern< - ReturnOpSourceTy>::BufferAssignmentOpConversionPattern; - - // Performs the actual return-op conversion step. - LogicalResult matchAndRewrite( - ReturnOpSourceTy returnOp, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - auto numReturnValues = returnOp.getNumOperands(); - auto funcOp = returnOp.template getParentOfType(); - auto numFuncArgs = funcOp.getNumArguments(); - auto loc = returnOp.getLoc(); - - // Find the corresponding output buffer for each operand. - for (auto operand : llvm::enumerate(operands)) { - auto returnArgNumber = numFuncArgs - numReturnValues + operand.index(); - auto dstBuffer = funcOp.getArgument(returnArgNumber); - if (dstBuffer == operand.value()) { - continue; - } - - // Insert the copy operation to copy before the return. - rewriter.setInsertionPoint( - returnOp.getOperation()->getBlock()->getTerminator()); - rewriter.create(loc, operand.value(), - funcOp.getArgument(returnArgNumber)); - } - // Insert the new target return operation. - rewriter.replaceOpWithNewOp(returnOp); - return success(); - } -}; - -} // namespace xla -} // namespace mlir - -#endif // TENSORFLOW_COMPILER_MLIR_XLA_TRANSFORMS_BUFFER_ASSIGNMENT_H_ diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc deleted file mode 100644 index 40c115f4cbc..00000000000 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements logic for testing buffer assignment including its -// utility converters. - -#include "tensorflow/compiler/mlir/xla/transforms/buffer_assignment.h" - -#include "mlir/Dialect/StandardOps/IR/Ops.h" // TF:llvm-project -#include "mlir/IR/Function.h" // TF:llvm-project -#include "mlir/IR/Operation.h" // TF:llvm-project -#include "mlir/Pass/Pass.h" // TF:llvm-project -#include "mlir/Pass/PassManager.h" // TF:llvm-project -#include "absl/memory/memory.h" -#include "tensorflow/compiler/mlir/xla/transforms/passes.h" - -namespace mlir { -namespace xla { -namespace { - -/// This dialect independent unary operation has been defined only for testing -/// buffer assignment. -class BufferAssignmentTestUnaryOp - : public Op { - public: - using Op::Op; - static StringRef getOperationName() { return "buffer_assignment_test.unary"; } - static void build(OpBuilder& b, OperationState& state, Value source) { - state.addOperands(source); - } -}; - -/// This dialect independent lowered unary operation has been defined only for -/// testing buffer assignment. -class BufferAssignmentTestUnaryLoweredOp - : public Op::Impl> { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.unary_lowered"; - } - static void build(OpBuilder& b, OperationState& state, Value source, - Value target) { - state.addOperands(source); - state.addOperands(target); - } -}; - -/// This dialect independent copy operation has been defined only for testing -/// NonVoidToVoidReturnOpConverter -class BufferAssignmentTestCopyOp - : public Op::Impl> { - public: - using Op::Op; - static StringRef getOperationName() { return "buffer_assignment_test.copy"; } - static void build(OpBuilder& b, OperationState& state, Value from, Value to) { - state.addOperands(from); - state.addOperands(to); - } -}; - -class BufferAssignmentTestDialect : public Dialect { - public: - explicit BufferAssignmentTestDialect(MLIRContext* context) - : Dialect(getDialectNamespace(), context) { - addOperations(); - } - static StringRef getDialectNamespace() { return "buffer_assignment_test"; } -}; - -/// This pass tests two provided operation converters, -/// FunctionAndBlockSignatureConverter and NonVoidToVoidReturnOpConverter, for -/// Buffer Assignment. -struct BufferAssignmentPreparationTestPass - : mlir::PassWrapper { - /// A simple converter that legalizes a BufferAssignmentTestUnaryOp to a - /// BufferAssignmentTestUnaryLoweredOp and creates buffer allocation for - /// the result of the computation. - class TestUnaryOpConverter : public BufferAssignmentOpConversionPattern< - BufferAssignmentTestUnaryOp> { - public: - using BufferAssignmentOpConversionPattern< - BufferAssignmentTestUnaryOp>::BufferAssignmentOpConversionPattern; - - // Performs the actual legalization conversion step. - LogicalResult matchAndRewrite( - BufferAssignmentTestUnaryOp op, ArrayRef operands, - ConversionPatternRewriter& rewriter) const final { - // Create a new buffer allocation using the current BufferAssignmentPlacer - // instance. 
- auto result = op.getResult(); - auto result_type = result.getType().dyn_cast(); - auto memref_type = - MemRefType::get(result_type.getShape(), result_type.getElementType()); - rewriter.restoreInsertionPoint( - bufferAssignment->computeAllocPosition(result)); - auto alloc = rewriter.create(op.getLoc(), memref_type); - - // Create the lowered operation and replace the old operation with a - // reference to the allocated buffer. - rewriter.create(op.getLoc(), - operands[0], alloc); - rewriter.replaceOp(op, {alloc}); - return success(); - } - }; - - void runOnFunction() override { - OwningRewritePatternList patterns; - auto funcOp = getOperation(); - auto context = funcOp.getContext(); - ConversionTarget target(*context); - BufferAssignmentPlacer bufferAssignmentPlacer(funcOp); - - // Specifying the legal and illegal operations. - context->allowUnregisteredDialects(true); - target.addIllegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - target.addLegalOp(); - // TODO(dfki): ReturnOp can also be changed to TestReturnOp like - // BufferAssignmentTestCopyOp. - target.addDynamicallyLegalOp( - [](ReturnOp returnOp) { return returnOp.getNumOperands() == 0; }); - FunctionAndBlockSignatureConverter::addDynamicallyLegalFuncOp(target); - - // Adding patterns for testing this pass. - // clang-format off - patterns.insert< - FunctionAndBlockSignatureConverter, - TestUnaryOpConverter, - NonVoidToVoidReturnOpConverter - - >(context, &bufferAssignmentPlacer); - // clang-format on - - if (failed(applyPartialConversion(funcOp, target, patterns, nullptr))) { - funcOp.emitOpError() - << "Failed to apply buffer assignment preparation steps"; - } - }; -}; - -} // namespace - -static mlir::DialectRegistration - buffer_assignment_test_ops; - -/// This pass tests helper methods such as computeAllocPosition, -/// FunctionAndBlockSignatureConverter, NonVoidToVoidReturnOpConverter -/// conversion patterns. Furthermore, it checks buffer-assignment pass that -/// moves existing Alloc and Dealloc operations to their proper positions, and -/// insert missing Dealloc operations. -static PassPipelineRegistration<> buffer_assignment_test_pass( - "test-buffer-assignment", - "Tests buffer assignment helper methods and buffer assignment pass.", - [](mlir::OpPassManager& pm) { - pm.addPass(absl::make_unique()); - pm.addPass(createBufferAssignmentPass()); - }); - -} // namespace xla -} // namespace mlir From 052243a12c7cd66be621747df4f73b52743d4bb2 Mon Sep 17 00:00:00 2001 From: Thomas Joerg Date: Fri, 29 May 2020 06:32:24 -0700 Subject: [PATCH 1382/1533] Improve logging when ptxas cannot be found. PiperOrigin-RevId: 313770396 Change-Id: I9dbbd70cb567fd173219c6744e3ca879fedeafc6 --- .../compiler/xla/debug_options_flags.cc | 3 +-- .../xla/service/gpu/nvptx_compiler.cc | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index cad73b593a2..958629c5fa6 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -71,8 +71,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_force_host_platform_device_count(1); opts.set_xla_gpu_deterministic_reductions(false); opts.set_xla_cpu_enable_xprof_traceme(true); - // TODO(b/155295372): disable ptxas fallback by default. 
- opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(true); + opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false); return opts; } diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index b0b214832ea..eefa4661d37 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -385,6 +385,19 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( } else { if (maybe_cubin.status().code() == tensorflow::error::Code::NOT_FOUND) { + if (!hlo_module_config.debug_options() + .xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found()) { + PrintCantFindCudaMessage( + "Can't find ptxas binary in ${CUDA_DIR}/bin. Custom ptxas " + "location can be specified using $PATH.", + hlo_module_config); + LOG(FATAL) + << "Can't find ptxas binary. You can pass the flag " + "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found " + "to use the GPU driver for compiling ptx instead. However " + "this option is discouraged and can lead to increased " + "memory concumptions and other subtle runtime issues."; + } // Missing ptxas is expected in some environments where CUDA SDK // binaries are not available. We don't want to spam logs with // identical warnings in this case. @@ -402,14 +415,6 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( "using $PATH.", hlo_module_config); } - CHECK(hlo_module_config.debug_options() - .xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found()) - << "There was an error when trying to compile ptx into sass " - "code. If you want to try falling back to the GPU driver to " - "jit compile ptx, you can use the flag " - "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found." - " Use at your own risk though, it has known drawbacks like " - "increased memory consumption."; } else { LOG(FATAL) << "ptxas returned an error during compilation of ptx " "to sass: '" From 224492192533d83f2c2d03a769a6f69b0426f29a Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Fri, 29 May 2020 09:34:47 -0700 Subject: [PATCH 1383/1533] Reduce 1-Layer Functional.__call__ overhead by ~10%. Moves Model._set_save_spec to Layer. This allows Layer.__call__ to avoid a hasattr check and also Model.__call__ to avoid an expensive call to a method wrapped in trackable.no_automatic_dependency_tracking. This should also allow SavedModel to use this spec in place of build_input_shape in the future. PiperOrigin-RevId: 313795786 Change-Id: Id7b23f98911468ed3e11261ac60989685de47aa1 --- tensorflow/python/keras/engine/base_layer.py | 19 ++++++++++++++++++- tensorflow/python/keras/engine/training.py | 8 -------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 9958f70ed55..4a33e8f4e20 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -316,6 +316,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # TODO(kathywu): Move this to Layer._set_save_spec once cl/290121460 is # submitted. self._build_input_shape = None + self._saved_model_inputs_spec = None # Provides information about which inputs are compatible with the layer. 
self._input_spec = None self.supports_masking = False @@ -1002,7 +1003,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): outputs = self.call(cast_inputs, *args, **kwargs) self._handle_activity_regularization(inputs, outputs) self._set_mask_metadata(inputs, outputs, input_masks, build_graph) - if hasattr(self, '_set_save_spec'): + if self._saved_model_inputs_spec is None: self._set_save_spec(cast_inputs) return outputs @@ -2809,6 +2810,22 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # SavedModel properties. Please see keras/saving/saved_model for details. + @trackable.no_automatic_dependency_tracking + def _set_save_spec(self, inputs): + if self._saved_model_inputs_spec is not None: + return # Already set. + + self._saved_model_inputs_spec = nest.map_structure(tf_utils.get_tensor_spec, + inputs) + + def _get_save_spec(self, dynamic_batch=True): + if self._saved_model_inputs_spec is None: + return None + + return nest.map_structure( + lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), + self._saved_model_inputs_spec) + @property def _trackable_saved_model_saver(self): return layer_serialization.LayerSavedModelSaver(self) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index d8c95b2a972..6c6d9ee897b 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -2366,14 +2366,6 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): self._saved_model_inputs_spec = specs - def _get_save_spec(self, dynamic_batch=True): - if self._saved_model_inputs_spec is None: - return None - - return nest.map_structure( - lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), - self._saved_model_inputs_spec) - def _assert_weights_created(self): """Asserts that all the weights for the model have been created. From a475c198ecce4980812fc5c6067d76e2167746be Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Fri, 29 May 2020 09:35:46 -0700 Subject: [PATCH 1384/1533] Added TensorLinearDescriptor (GPUObjectDescriptor subclass) Implemented GPUObject interface for LinearStorage. Added selector resolve pass for arguments. Used linear storage as gpu argument in Winograd4x4To36. 
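For illustration only, a minimal sketch of the selector syntax the new resolve pass rewrites, mirroring the toy TestDescriptor object from the arguments_test.cc added below (the descriptor type and the "object" name are placeholders from that test, not part of the public API):

    // Sketch, not part of the patch; assumes the TestDescriptor defined in
    // arguments_test.cc and the tflite::gpu::cl namespace.
    #include "absl/memory/memory.h"
    #include "tensorflow/lite/delegates/gpu/cl/arguments.h"

    absl::Status IllustrateSelectorResolve() {
      TestDescriptor descriptor;
      Arguments args;
      args.AddObjectRef("object", absl::make_unique<TestDescriptor>(descriptor));
      std::string code = "value = args.object.Read(id);";
      // ResolveSelectorsPass expands the selector through the descriptor:
      // the call becomes "value = object_buffer[id];" and
      // "__global float4* object_buffer" shows up in GetListOfArgs().
      return args.TransformToCLCode(&code);
    }

The new arguments_test.cc below exercises exactly this path.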
PiperOrigin-RevId: 313795928 Change-Id: I6f03d06fc6464cd8f5b93814ad16f23cf59b4e27 --- tensorflow/lite/delegates/gpu/cl/BUILD | 18 +++ tensorflow/lite/delegates/gpu/cl/arguments.cc | 151 +++++++++++++++++- tensorflow/lite/delegates/gpu/cl/arguments.h | 20 +++ .../lite/delegates/gpu/cl/arguments_test.cc | 96 +++++++++++ tensorflow/lite/delegates/gpu/cl/gpu_object.h | 2 + .../lite/delegates/gpu/cl/kernels/BUILD | 2 + .../lite/delegates/gpu/cl/kernels/winograd.cc | 84 +++++----- .../lite/delegates/gpu/cl/kernels/winograd.h | 3 +- .../lite/delegates/gpu/cl/linear_storage.cc | 76 ++++++++- .../lite/delegates/gpu/cl/linear_storage.h | 36 ++++- 10 files changed, 440 insertions(+), 48 deletions(-) create mode 100644 tensorflow/lite/delegates/gpu/cl/arguments_test.cc diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 95b20bc6e81..d37f666f8a6 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -54,6 +54,23 @@ cc_library( ], ) +cc_test( + name = "arguments_test", + srcs = ["arguments_test.cc"], + linkstatic = True, + tags = tf_gpu_tests_tags() + [ + "linux", + "local", + ], + deps = [ + ":arguments", + ":gpu_object", + "//tensorflow/lite/delegates/gpu/common:data_type", + "@com_google_absl//absl/strings", + "@com_google_googletest//:gtest_main", + ], +) + cc_library( name = "buffer", srcs = ["buffer.cc"], @@ -354,6 +371,7 @@ cc_library( hdrs = ["linear_storage.h"], deps = [ ":buffer", + ":gpu_object", ":opencl_wrapper", ":tensor_type", ":texture2d", diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc index bdfae935f28..7b28ee215da 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -17,6 +17,8 @@ limitations under the License. #include "absl/strings/ascii.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" +#include "absl/strings/str_split.h" #include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { @@ -36,6 +38,55 @@ std::string GetNextWord(const std::string& code, size_t first_position) { } return code.substr(first_position, pos - first_position); } + +size_t FindEnclosingBracket(const std::string& text, size_t first_pos, + char bracket) { + const std::map brackets = { + {'(', ')'}, + {'{', '}'}, + {'[', ']'}, + }; + char b_open = bracket; + auto it = brackets.find(b_open); + if (it == brackets.end()) { + return -1; + } + char b_close = it->second; + size_t pos = first_pos; + int opened = 1; + int closed = 0; + while (opened != closed && pos < text.size()) { + if (text[pos] == b_open) { + opened++; + } else if (text[pos] == b_close) { + closed++; + } + pos++; + } + if (opened == closed) { + return pos; + } else { + return -1; + } +} + +void ReplaceAllWords(const std::string& old_word, const std::string& new_word, + std::string* str) { + size_t position = str->find(old_word); + while (position != std::string::npos) { + char prev = position == 0 ? '.' : (*str)[position - 1]; + char next = position + old_word.size() < str->size() + ? 
(*str)[position + old_word.size()] + : '.'; + if (IsWordSymbol(prev) || IsWordSymbol(next)) { + position = str->find(old_word, position + 1); + continue; + } + str->replace(position, old_word.size(), new_word); + position = str->find(old_word, position + new_word.size()); + } +} + } // namespace Arguments::Arguments(Arguments&& args) @@ -45,6 +96,7 @@ Arguments::Arguments(Arguments&& args) shared_float4s_data_(std::move(args.shared_float4s_data_)), buffers_(std::move(args.buffers_)), images2d_(std::move(args.images2d_)), + object_refs_(std::move(args.object_refs_)), objects_(std::move(args.objects_)) {} Arguments& Arguments::operator=(Arguments&& args) { if (this != &args) { @@ -54,6 +106,7 @@ Arguments& Arguments::operator=(Arguments&& args) { shared_float4s_data_ = std::move(args.shared_float4s_data_); buffers_ = std::move(args.buffers_); images2d_ = std::move(args.images2d_); + object_refs_ = std::move(args.object_refs_); objects_ = std::move(args.objects_); } return *this; @@ -74,6 +127,11 @@ void Arguments::AddImage2D(const std::string& name, images2d_[name] = desc; } +void Arguments::AddObjectRef(const std::string& name, + GPUObjectDescriptorPtr&& descriptor_ptr) { + object_refs_[name] = {AccessType::READ, std::move(descriptor_ptr)}; +} + void Arguments::AddObject(const std::string& name, GPUObjectPtr&& object) { objects_[name] = {AccessType::READ, std::move(object)}; } @@ -159,6 +217,7 @@ absl::Status Arguments::SetGPUResources( absl::Status Arguments::TransformToCLCode(std::string* code) { RETURN_IF_ERROR(AddObjectArgs()); + RETURN_IF_ERROR(ResolveSelectorsPass(code)); ResolveArgsPass(code); return absl::OkStatus(); } @@ -260,18 +319,17 @@ std::string Arguments::AddActiveArgument(const std::string& arg_name) { } void Arguments::ResolveArgsPass(std::string* code) { - constexpr char kPrefix[] = "args."; std::string result; size_t position = 0; - size_t next_position = code->find(kPrefix); + size_t next_position = code->find(kArgsPrefix); while (next_position != std::string::npos) { size_t arg_pos = next_position; - next_position += strlen(kPrefix); + next_position += strlen(kArgsPrefix); std::string object_name = GetNextWord(*code, next_position); std::string new_name = AddActiveArgument(object_name); - code->replace(arg_pos, object_name.size() + strlen(kPrefix), new_name); + code->replace(arg_pos, object_name.size() + strlen(kArgsPrefix), new_name); position = arg_pos + new_name.size(); - next_position = code->find(kPrefix, position); + next_position = code->find(kArgsPrefix, position); } int shared_int4s_aligned_size = AlignByN(shared_int4s_data_.size(), 4); @@ -280,6 +338,86 @@ void Arguments::ResolveArgsPass(std::string* code) { shared_float4s_data_.resize(shared_float4s_aligned_size); } +void Arguments::ResolveObjectNames(const std::string& object_name, + const std::vector& member_names, + std::string* code) { + for (const auto& member_name : member_names) { + const std::string new_name = "args." 
+ object_name + "_" + member_name; + ReplaceAllWords(member_name, new_name, code); + } +} + +absl::Status Arguments::ResolveSelector(const std::string& object_name, + const std::string& selector, + const std::vector& args, + std::string* result) { + const GPUObjectDescriptor* desc_ptr; + AccessType access_type; + if (auto it = object_refs_.find(object_name); it != object_refs_.end()) { + desc_ptr = it->second.descriptor.get(); + access_type = it->second.access_type; + } else if (auto it = objects_.find(object_name); it != objects_.end()) { + desc_ptr = it->second.obj_ptr->GetGPUDescriptor(); + access_type = it->second.access_type; + } else { + return absl::NotFoundError( + absl::StrCat("No object with name - ", object_name)); + } + RETURN_IF_ERROR(desc_ptr->PerformSelector(selector, args, result)); + auto names = desc_ptr->GetGPUResources().GetNames(); + ResolveObjectNames(object_name, names, result); + return absl::OkStatus(); +} + +absl::Status Arguments::ResolveSelectorsPass(std::string* code) { + std::string result; + size_t position = 0; + size_t next_position = code->find(kArgsPrefix); + while (next_position != std::string::npos) { + size_t arg_pos = next_position; + next_position += strlen(kArgsPrefix); + std::string object_name = GetNextWord(*code, next_position); + char next = (*code)[next_position + object_name.size()]; + if (next == '.') { + next_position += object_name.size() + 1; + std::string selector_name = GetNextWord(*code, next_position); + next_position += selector_name.size(); + next = (*code)[next_position]; + if (next != '(') { + return absl::NotFoundError( + absl::StrCat("Expected ( after function ", selector_name, " call")); + } + next_position += 1; + size_t bracket_pos = FindEnclosingBracket(*code, next_position, '('); + if (bracket_pos == -1) { + return absl::NotFoundError( + absl::StrCat("Not found enclosing bracket for function ", + selector_name, " call")); + } + std::string str_args = + code->substr(next_position, bracket_pos - next_position - 1); + std::vector words = absl::StrSplit(str_args, ','); + std::vector args; + args.reserve(words.size()); + for (const auto& word : words) { + absl::string_view arg = absl::StripAsciiWhitespace(word); + if (!arg.empty()) { + args.push_back(std::string(arg)); + } + } + std::string patch; + RETURN_IF_ERROR( + ResolveSelector(object_name, selector_name, args, &patch)); + code->replace(arg_pos, bracket_pos - arg_pos, patch); + position = arg_pos + patch.size(); + } else { + position = arg_pos + strlen(kArgsPrefix); + } + next_position = code->find(kArgsPrefix, position); + } + return absl::OkStatus(); +} + absl::Status Arguments::AddObjectArgs() { for (auto& t : objects_) { AddGPUResources(t.first, @@ -287,6 +425,9 @@ absl::Status Arguments::AddObjectArgs() { RETURN_IF_ERROR( SetGPUResources(t.first, t.second.obj_ptr->GetGPUResources())); } + for (auto& t : object_refs_) { + AddGPUResources(t.first, t.second.descriptor->GetGPUResources()); + } return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.h b/tensorflow/lite/delegates/gpu/cl/arguments.h index f1059e77c93..65c114b2cf6 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.h +++ b/tensorflow/lite/delegates/gpu/cl/arguments.h @@ -40,6 +40,8 @@ class Arguments { void AddBuffer(const std::string& name, const GPUBufferDescriptor& desc); void AddImage2D(const std::string& name, const GPUImage2DDescriptor& desc); + void AddObjectRef(const std::string& name, + GPUObjectDescriptorPtr&& descriptor_ptr); void AddObject(const std::string& name, 
GPUObjectPtr&& object); absl::Status SetInt(const std::string& name, int value); @@ -69,6 +71,18 @@ class Arguments { absl::Status AddObjectArgs(); void ResolveArgsPass(std::string* code); + absl::Status ResolveSelectorsPass(std::string* code); + + absl::Status ResolveSelector(const std::string& object_name, + const std::string& selector, + const std::vector& args, + std::string* result); + + void ResolveObjectNames(const std::string& object_name, + const std::vector& member_names, + std::string* code); + + static constexpr char kArgsPrefix[] = "args."; struct IntValue { int value; @@ -99,6 +113,12 @@ class Arguments { std::map buffers_; std::map images2d_; + struct ObjectRefArg { + AccessType access_type; + GPUObjectDescriptorPtr descriptor; + }; + std::map object_refs_; + struct ObjectArg { AccessType access_type; GPUObjectPtr obj_ptr; diff --git a/tensorflow/lite/delegates/gpu/cl/arguments_test.cc b/tensorflow/lite/delegates/gpu/cl/arguments_test.cc new file mode 100644 index 00000000000..1a4c9fc9c00 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/cl/arguments_test.cc @@ -0,0 +1,96 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" + +#include + +#include +#include +#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" + +namespace tflite { +namespace gpu { +namespace cl { +namespace { +struct TestDescriptor : public GPUObjectDescriptor { + absl::Status PerformSelector(const std::string& selector, + const std::vector& args, + std::string* result) const override { + if (selector == "Length") { + *result = "length"; + return absl::OkStatus(); + } else if (selector == "Read") { + if (args.size() != 1) { + return absl::NotFoundError( + absl::StrCat("TestDescriptor Read require one argument, but ", + args.size(), " was passed")); + } + *result = absl::StrCat("buffer[", args[0], "]"); + return absl::OkStatus(); + } else { + return absl::NotFoundError(absl::StrCat( + "TestDescriptor don't have selector with name - ", selector)); + } + } + + GPUResources GetGPUResources() const override { + GPUResources resources; + resources.ints.push_back("length"); + GPUBufferDescriptor desc; + desc.data_type = DataType::FLOAT32; + desc.element_size = 4; + resources.buffers.push_back({"buffer", desc}); + return resources; + } +}; +} // namespace + +TEST(ArgumentsTest, TestSelectorResolve) { + TestDescriptor descriptor; + Arguments args; + args.AddObjectRef("object", absl::make_unique(descriptor)); + std::string sample_code = R"( + if (a < 3) { + value = args.object.Read(id); + } +)"; + const std::string expected_result = R"( + if (a < 3) { + value = object_buffer[id]; + } +)"; + ASSERT_OK(args.TransformToCLCode(&sample_code)); + EXPECT_EQ(sample_code, expected_result); + + std::string cl_arguments = args.GetListOfArgs(); + EXPECT_TRUE(cl_arguments.find("__global float4* object_buffer") != + std::string::npos); +} + 
+TEST(ArgumentsTest, TestNoSelector) { + TestDescriptor descriptor; + Arguments args; + args.AddObjectRef("object", absl::make_unique(descriptor)); + std::string sample_code = R"( + if (a < 3) { + value = args.object.Write(id); + } +)"; + EXPECT_FALSE(args.TransformToCLCode(&sample_code).ok()); +} + +} // namespace cl +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/gpu_object.h b/tensorflow/lite/delegates/gpu/cl/gpu_object.h index 5cc045c6fc7..23d1f210459 100644 --- a/tensorflow/lite/delegates/gpu/cl/gpu_object.h +++ b/tensorflow/lite/delegates/gpu/cl/gpu_object.h @@ -99,6 +99,8 @@ class GPUObjectDescriptor { mutable std::map state_vars_; }; +using GPUObjectDescriptorPtr = std::unique_ptr; + class GPUObject { public: GPUObject() = default; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index b5510b3e8df..24a62e5a82f 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -1385,6 +1385,7 @@ cc_library( ":gpu_operation", ":util", ":work_group_picking", + "//tensorflow/lite/delegates/gpu/cl:arguments", "//tensorflow/lite/delegates/gpu/cl:cl_device", "//tensorflow/lite/delegates/gpu/cl:cl_kernel", "//tensorflow/lite/delegates/gpu/cl:linear_storage", @@ -1395,6 +1396,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:shape", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/common:winograd_util", + "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index 6219952b9bf..e3c9306b80c 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include "absl/strings/str_format.h" +#include "absl/strings/substitute.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/work_group_picking.h" @@ -34,8 +35,9 @@ namespace cl { namespace { std::string GetWinograd4x4To36Code( - const OperationDef& op_def, const LinearStorage& bt_arr, - const std::vector& linked_operations) { + const OperationDef& op_def, + const std::vector& linked_operations, + Arguments* args) { TensorCodeGenerator src_tensor( "src_data", WHSBPoint{"src_size.x", "src_size.y", "src_size.z", "src_size.w"}, @@ -78,31 +80,31 @@ std::string GetWinograd4x4To36Code( } c += "};\n"; + args->AddInt("padding_x"); + args->AddInt("padding_y"); + args->AddInt("tiles_total"); + args->AddInt("tiles_x"); + c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ) + ",\n"; - c += bt_arr.GetDeclaration(); + c += src_tensor.GetDeclaration(AccessType::READ); c += GetArgsDeclaration(linked_operations); c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; c += " int4 src_size, \n"; - c += " int4 dst_size, \n"; - c += " int2 padding, \n"; - c += " int tiles_total, \n"; - c += " int tiles_x \n"; - c += ") {\n"; + c += " int4 dst_size"; + c += "$0) {\n"; c += " int DST_X = get_global_id(0);\n"; c += " int DST_Y = get_global_id(1);\n"; c += " int DST_Z = get_global_id(2);\n"; - c += " if (DST_X >= tiles_total || DST_Y >= 6 || DST_Z >= dst_size.z) {\n"; + c += " if (DST_X >= args.tiles_total || DST_Y >= 6 || DST_Z >= dst_size.z) " + "{\n"; c += " return; \n"; c += " }\n"; - c += " int tile_x = (DST_X % tiles_x) * 4;\n"; - c += " int tile_y = (DST_X / tiles_x) * 4;\n"; + c += " int tile_x = (DST_X % args.tiles_x) * 4;\n"; + c += " int tile_y = (DST_X / args.tiles_x) * 4;\n"; c += " ACCUM_FLT4 I0, I1, I2, I3, I4, I5;\n"; c += " ACCUM_FLT bt_ar[6];\n"; - c += " ACCUM_FLT4 t0 = TO_ACCUM_TYPE(" + - bt_arr.ReadLinearFLT4("DST_Y * 2 + 0") + ");\n"; - c += " ACCUM_FLT4 t1 = TO_ACCUM_TYPE(" + - bt_arr.ReadLinearFLT4("DST_Y * 2 + 1") + ");\n"; + c += " ACCUM_FLT4 t0 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 0));\n"; + c += " ACCUM_FLT4 t1 = TO_ACCUM_TYPE(args.bt.Read(DST_Y * 2 + 1));\n"; c += " DST_Y *= 6;\n"; c += " bt_ar[0] = t0.x;\n"; c += " bt_ar[1] = t0.y;\n"; @@ -121,15 +123,16 @@ std::string GetWinograd4x4To36Code( " * m" + xs + "_x;\n"; } else { c += " ACCUM_FLT4 " + src + " = " + - src_tensor.ReadAsTypeWHSB(accum_type, "tile_x + padding.x + " + xs, - "yc", "DST_Z", batch_id) + + src_tensor.ReadAsTypeWHSB(accum_type, + "tile_x + args.padding_x + " + xs, "yc", + "DST_Z", batch_id) + ";\n"; } }; if (is_buffer || is_image_buffer) { for (int x = 0; x < 6; ++x) { const std::string xs = std::to_string(x); - c += " int xc" + xs + " = tile_x + padding.x + " + xs + ";\n"; + c += " int xc" + xs + " = tile_x + args.padding_x + " + xs + ";\n"; c += " ACCUM_FLT m" + xs + "_x = (ACCUM_FLT)(xc" + xs + " >= 0 && xc" + xs + " < src_size.x);\n"; c += " bool inx" + xs + " = (xc" + xs + " >= 0 && xc" + xs + @@ -144,7 +147,7 @@ std::string GetWinograd4x4To36Code( } } c += " {\n"; - c += " int yc = tile_y + padding.y;\n"; + c += " int yc = tile_y + args.padding_y;\n"; if (is_buffer || is_image_buffer) { c += " bool iny = (yc >= 0 && yc < src_size.y);\n"; c += " int offset = select(0, yc * src_size.x, iny);\n"; @@ -162,7 +165,7 @@ std::string GetWinograd4x4To36Code( for (int y = 1; y < 6; ++y) { const std::string ys = std::to_string(y); c += " {\n"; - c += " int 
yc = tile_y + padding.y + (" + ys + ");\n"; + c += " int yc = tile_y + args.padding_y + (" + ys + ");\n"; if (is_buffer || is_image_buffer) { c += " bool iny = (yc >= 0 && yc < src_size.y);\n"; c += " int offset = select(0, yc * src_size.x, iny);\n"; @@ -223,7 +226,6 @@ std::string GetWinograd4x4To36Code( c += " DST_Y++;\n"; c += " }\n"; c += "}\n"; - // std::cout << c << std::endl; return c; } @@ -366,15 +368,15 @@ std::string GetWinograd36To4x4Code( Winograd4x4To36::Winograd4x4To36(Winograd4x4To36&& operation) : GPUOperation(std::move(operation)), - bt_(std::move(operation.bt_)), padding_(operation.padding_), + args_(std::move(operation.args_)), kernel_(std::move(operation.kernel_)), work_group_size_(operation.work_group_size_) {} Winograd4x4To36& Winograd4x4To36::operator=(Winograd4x4To36&& operation) { if (this != &operation) { - bt_ = std::move(operation.bt_); std::swap(padding_, operation.padding_); + args_ = std::move(operation.args_); kernel_ = std::move(operation.kernel_); std::swap(work_group_size_, operation.work_group_size_); GPUOperation::operator=(std::move(operation)); @@ -392,8 +394,10 @@ absl::Status Winograd4x4To36::Compile(const CreationContext& creation_context) { options.push_back(CompilerOptions::POWERVR_FP16); } RETURN_IF_ERROR(UploadBt(creation_context.context)); - const auto code = - GetWinograd4x4To36Code(definition_, bt_, linked_operations_); + std::string code = + GetWinograd4x4To36Code(definition_, linked_operations_, &args_); + RETURN_IF_ERROR(args_.TransformToCLCode(&code)); + code = absl::Substitute(code, args_.GetListOfArgs()); RETURN_IF_ERROR(creation_context.cache->GetOrCreateCLKernel( code, "main_function", options, *creation_context.context, *creation_context.device, &kernel_)); @@ -418,7 +422,11 @@ absl::Status Winograd4x4To36::UploadBt(CLContext* context) { create_info.storage_type = LinearStorageType::TEXTURE_2D; create_info.data_type = definition_.GetDataType(); create_info.name = "bt_arr"; - return CreateLinearStorage(create_info, bt_aligned, context, &bt_); + + LinearStorage lt; + RETURN_IF_ERROR(CreateLinearStorage(create_info, bt_aligned, context, <)); + args_.AddObject("bt", absl::make_unique(std::move(lt))); + return absl::OkStatus(); } int3 Winograd4x4To36::SelectBestWorkGroup() { @@ -429,22 +437,22 @@ int3 Winograd4x4To36::SelectBestWorkGroup() { } absl::Status Winograd4x4To36::BindArguments() { - kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(bt_.GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); - RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); const int tiles_x = DivideRoundUp( src_[0]->Width() + padding_.prepended.w + padding_.appended.w - 2, 4); const int tiles_y = DivideRoundUp( src_[0]->Height() + padding_.prepended.h + padding_.appended.h - 2, 4); const int tiles_total = tiles_x * tiles_y; - RETURN_IF_ERROR( - kernel_.SetBytesAuto(int2(-padding_.prepended.w, -padding_.prepended.h))); - RETURN_IF_ERROR(kernel_.SetBytesAuto(tiles_total)); - RETURN_IF_ERROR(kernel_.SetBytesAuto(tiles_x)); + RETURN_IF_ERROR(args_.SetInt("padding_x", -padding_.prepended.w)); + RETURN_IF_ERROR(args_.SetInt("padding_y", -padding_.prepended.h)); + RETURN_IF_ERROR(args_.SetInt("tiles_total", tiles_total)); + RETURN_IF_ERROR(args_.SetInt("tiles_x", tiles_x)); + kernel_.ResetBindingCounter(); + 
RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); + RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); + RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); + RETURN_IF_ERROR(kernel_.SetBytesAuto(src_[0]->GetWHSB())); + RETURN_IF_ERROR(kernel_.SetBytesAuto(dst_[0]->GetWHSB())); + RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter())); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h index c6a88773af3..02e3c268b28 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_WINOGRAD_H_ #define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_WINOGRAD_H_ +#include "tensorflow/lite/delegates/gpu/cl/arguments.h" #include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" #include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" @@ -59,9 +60,9 @@ class Winograd4x4To36 : public GPUOperation { absl::Status BindArguments(); int3 GetGridSize() const; - LinearStorage bt_; Padding2D padding_; + Arguments args_; CLKernel kernel_; int3 work_group_size_ = int3(128, 1, 1); }; diff --git a/tensorflow/lite/delegates/gpu/cl/linear_storage.cc b/tensorflow/lite/delegates/gpu/cl/linear_storage.cc index 4fb21d0ec6a..ecf0e087427 100644 --- a/tensorflow/lite/delegates/gpu/cl/linear_storage.cc +++ b/tensorflow/lite/delegates/gpu/cl/linear_storage.cc @@ -15,24 +15,79 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" +#include "absl/strings/str_cat.h" +#include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { namespace gpu { namespace cl { +GPUResources TensorLinearDescriptor::GetGPUResources() const { + GPUResources resources; + resources.ints.push_back("length"); + if (storage_type == LinearStorageType::BUFFER) { + GPUBufferDescriptor desc; + desc.data_type = element_type; + desc.element_size = 4; + resources.buffers.push_back({"buffer", desc}); + } else { + GPUImage2DDescriptor desc; + desc.data_type = element_type; + resources.images2d.push_back({"tex2d", desc}); + } + return resources; +} + +absl::Status TensorLinearDescriptor::PerformSelector( + const std::string& selector, const std::vector& args, + std::string* result) const { + if (selector == "Length") { + *result = "length"; + return absl::OkStatus(); + } else if (selector == "Read") { + return PerformReadSelector(args, result); + } else { + return absl::NotFoundError(absl::StrCat( + "TensorLinearDescriptor don't have selector with name - ", selector)); + } +} + +absl::Status TensorLinearDescriptor::PerformReadSelector( + const std::vector& args, std::string* result) const { + if (args.size() != 1) { + return absl::NotFoundError( + absl::StrCat("TensorLinearDescriptor Read require one argument, but ", + args.size(), " was passed")); + } + if (storage_type == LinearStorageType::BUFFER) { + *result = absl::StrCat("buffer[", args[0], "]"); + return absl::OkStatus(); + } else { + const std::string read = + element_type == DataType::FLOAT16 ? 
"read_imageh" : "read_imagef"; + *result = absl::StrCat(read, "(tex2d, smp_none, (int2)(", args[0], ", 0))"); + return absl::OkStatus(); + } +} + LinearStorage::LinearStorage(int depth, LinearStorageType storage_type, DataType data_type) - : depth_(depth), storage_type_(storage_type), data_type_(data_type) {} + : depth_(depth), storage_type_(storage_type), data_type_(data_type) { + desc_.storage_type = storage_type; + desc_.element_type = data_type; +} LinearStorage::LinearStorage(LinearStorage&& storage) - : texture_storage_(std::move(storage.texture_storage_)), + : GPUObject(std::move(storage)), + texture_storage_(std::move(storage.texture_storage_)), buffer_storage_(std::move(storage.buffer_storage_)), memory_(storage.memory_), depth_(storage.depth_), name_(std::move(storage.name_)), storage_type_(storage.storage_type_), - data_type_(storage.data_type_) { + data_type_(storage.data_type_), + desc_(storage.desc_) { storage.memory_ = nullptr; } @@ -45,6 +100,8 @@ LinearStorage& LinearStorage::operator=(LinearStorage&& storage) { name_ = std::move(storage.name_); std::swap(storage_type_, storage.storage_type_); std::swap(data_type_, storage.data_type_); + desc_ = storage.desc_; + GPUObject::operator=(std::move(storage)); } return *this; } @@ -66,6 +123,19 @@ std::string LinearStorage::GetDeclaration() const { } } +GPUResourcesWithValue LinearStorage::GetGPUResources() const { + GPUResourcesWithValue resources; + resources.ints.push_back({"length", depth_}); + + if (storage_type_ == LinearStorageType::BUFFER) { + resources.buffers.push_back({"buffer", memory_}); + } else { + resources.images2d.push_back({"tex2d", memory_}); + } + + return resources; +} + LinearStorageType DeduceLinearStorageType( TensorStorageType tensor_storage_type) { if (tensor_storage_type == TensorStorageType::BUFFER) { diff --git a/tensorflow/lite/delegates/gpu/cl/linear_storage.h b/tensorflow/lite/delegates/gpu/cl/linear_storage.h index f461b08ebec..a31094b4a47 100644 --- a/tensorflow/lite/delegates/gpu/cl/linear_storage.h +++ b/tensorflow/lite/delegates/gpu/cl/linear_storage.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/types/span.h" #include "tensorflow/lite/delegates/gpu/cl/buffer.h" +#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/cl/texture2d.h" @@ -36,6 +37,33 @@ namespace cl { enum class LinearStorageType { BUFFER, TEXTURE_2D }; +struct TensorLinearDescriptor : public GPUObjectDescriptor { + LinearStorageType storage_type; + DataType element_type; // FLOAT32 or FLOAT16 + + TensorLinearDescriptor() = default; + TensorLinearDescriptor(const TensorLinearDescriptor& desc) + : GPUObjectDescriptor(desc), + storage_type(desc.storage_type), + element_type(desc.element_type) {} + TensorLinearDescriptor& operator=(const TensorLinearDescriptor& desc) { + if (this != &desc) { + storage_type = desc.storage_type; + element_type = desc.element_type; + GPUObjectDescriptor::operator=(desc); + } + return *this; + } + + absl::Status PerformSelector(const std::string& selector, + const std::vector& args, + std::string* result) const override; + + GPUResources GetGPUResources() const override; + absl::Status PerformReadSelector(const std::vector& args, + std::string* result) const; +}; + struct LinearStorageCreateInfo { LinearStorageType storage_type; DataType data_type; @@ -48,7 +76,7 @@ LinearStorageType DeduceLinearStorageType( // Represent GPU 1D-array of FLT4(float4/half4) values // Can use inside texture2d or buffer -class LinearStorage { +class LinearStorage : public GPUObject { public: LinearStorage() {} @@ -63,6 +91,11 @@ class LinearStorage { std::string ReadLinearFLT4(const std::string& z_coord) const; std::string GetDeclaration() const; + const GPUObjectDescriptor* GetGPUDescriptor() const override { + return &desc_; + } + GPUResourcesWithValue GetGPUResources() const override; + private: friend absl::Status CreateTextureLinearStorage(int size, DataType data_type, void* data, CLContext* context, @@ -81,6 +114,7 @@ class LinearStorage { std::string name_; LinearStorageType storage_type_; DataType data_type_; + TensorLinearDescriptor desc_; }; absl::Status CreateBufferLinearStorage(int size, DataType data_type, void* data, From 57749ce64fa2e7626b5b4ed9650a4b5c48956afd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 10:01:29 -0700 Subject: [PATCH 1385/1533] Add Int8 -> Int8 requantize operation. 
PiperOrigin-RevId: 313800318 Change-Id: I04650f59f8551b482648a7468fb1b2773a64b415 --- tensorflow/lite/micro/kernels/quantize.cc | 25 +++++++- .../lite/micro/kernels/quantize_test.cc | 57 +++++++++++++++---- 2 files changed, 68 insertions(+), 14 deletions(-) diff --git a/tensorflow/lite/micro/kernels/quantize.cc b/tensorflow/lite/micro/kernels/quantize.cc index b5bba83beb8..b58a1cb368e 100644 --- a/tensorflow/lite/micro/kernels/quantize.cc +++ b/tensorflow/lite/micro/kernels/quantize.cc @@ -63,12 +63,14 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, affine_quantization->scale); TF_LITE_ENSURE(context, affine_quantization->scale->size == 1); - TF_LITE_ENSURE(context, - input->type == kTfLiteFloat32 || input->type == kTfLiteInt16); + TF_LITE_ENSURE(context, input->type == kTfLiteFloat32 || + input->type == kTfLiteInt16 || + input->type == kTfLiteInt8); TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 || output->type == kTfLiteInt8); - if (input->type == kTfLiteInt16 && output->type == kTfLiteInt8) { + if ((input->type == kTfLiteInt16 || input->type == kTfLiteInt8) && + output->type == kTfLiteInt8) { double effective_scale = static_cast(input->params.scale / output->params.scale); @@ -122,6 +124,23 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTypeGetName(output->type)); return kTfLiteError; } + } else if (input->type == kTfLiteInt8) { + // Int8 to Int8 requantization, required if the input and output tensors + // have different scales and/or zero points. + size_t size = ElementCount(*input->dims); + switch (output->type) { + case kTfLiteInt8: + reference_ops::Requantize( + GetTensorData(input), size, data->output_multiplier, + data->output_shift, input->params.zero_point, + output->params.zero_point, GetTensorData(output)); + break; + default: + TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", + TfLiteTypeGetName(input->type), + TfLiteTypeGetName(output->type)); + return kTfLiteError; + } } else { TF_LITE_KERNEL_LOG(context, "Input %s, output %s not supported.", TfLiteTypeGetName(input->type), diff --git a/tensorflow/lite/micro/kernels/quantize_test.cc b/tensorflow/lite/micro/kernels/quantize_test.cc index 359abbd73db..0364fbc57ec 100644 --- a/tensorflow/lite/micro/kernels/quantize_test.cc +++ b/tensorflow/lite/micro/kernels/quantize_test.cc @@ -110,13 +110,13 @@ void TestQuantizeFloat(const int* input_dims_data, const float* input_data, scale, zero_point, output_dims_count, output_data); } -template -void TestQuantizeInt16(const int* input_dims_data, const float* input_data, - int16_t* input_quantized, const float input_scale, - const int input_zero_point, const int* output_dims_data, - const float* golden, T* golden_quantized, - const float output_scale, const int output_zero_point, - T* output_data) { +template +void TestRequantize(const int* input_dims_data, const float* input_data, + InputType* input_quantized, const float input_scale, + const int input_zero_point, const int* output_dims_data, + const float* golden, OutputType* golden_quantized, + const float output_scale, const int output_zero_point, + OutputType* output_data) { TfLiteIntArray* input_dims = IntArrayFromInts(input_dims_data); TfLiteIntArray* output_dims = IntArrayFromInts(output_dims_data); const int output_dims_count = ElementCount(*output_dims); @@ -212,11 +212,46 @@ TF_LITE_MICRO_TEST(QuantizeOpTestInt16toInt8) { const float output_scale = 0.5; const int output_zero_point = 0; int8_t output_quantized[length]; + int8_t 
values_quantized[length]; int16_t input_quantized[length]; - tflite::testing::TestQuantizeInt16(dims, values, input_quantized, input_scale, - input_zero_point, dims, values, - output_quantized, output_scale, - output_zero_point, output_quantized); + tflite::testing::TestRequantize(dims, values, input_quantized, input_scale, + input_zero_point, dims, values, + values_quantized, output_scale, + output_zero_point, output_quantized); +} + +TF_LITE_MICRO_TEST(QuantizeOpTestInt8toInt8) { + const int length = 10; + const int dims[] = {2, 2, 5}; + const float values[] = {-64, -62, -60, -58, -56, 54, 56, 58, 60, 62}; + const float input_scale = 2.f; + const int input_zero_point = 0; + const float output_scale = 0.5; + const int output_zero_point = 32; + int8_t output_quantized[length]; + int8_t values_quantized[length]; + int8_t input_quantized[length]; + tflite::testing::TestRequantize(dims, values, input_quantized, input_scale, + input_zero_point, dims, values, + values_quantized, output_scale, + output_zero_point, output_quantized); +} + +TF_LITE_MICRO_TEST(QuantizeOpTestInt8toInt8NoZeroPoint) { + const int length = 10; + const int dims[] = {2, 2, 5}; + const float values[] = {-32, -31, -30, -29, -28, 27, 28, 29, 30, 31}; + const float input_scale = 1.f; + const int input_zero_point = 0; + const float output_scale = 0.5; + const int output_zero_point = 0; + int8_t output_quantized[length]; + int8_t values_quantized[length]; + int8_t input_quantized[length]; + tflite::testing::TestRequantize(dims, values, input_quantized, input_scale, + input_zero_point, dims, values, + values_quantized, output_scale, + output_zero_point, output_quantized); } TF_LITE_MICRO_TESTS_END From c3769e5ed3ebf91addeb1647a75c8b2853cf763b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 10:08:24 -0700 Subject: [PATCH 1386/1533] Export StringLookup and IntegerLookup layers. 
PiperOrigin-RevId: 313801697 Change-Id: Ib159a0b7fe36e6d9d00a7e2d6bc6fbeb3c76af10 --- tensorflow/python/keras/layers/__init__.py | 12 + .../layers/preprocessing/integer_lookup.py | 43 +++- .../layers/preprocessing/integer_lookup_v1.py | 2 + .../layers/preprocessing/string_lookup.py | 40 ++- .../preprocessing/string_lookup_test.py | 20 +- .../layers/preprocessing/string_lookup_v1.py | 2 + .../python/keras/layers/serialization.py | 10 +- ...essing.-integer-lookup.__metaclass__.pbtxt | 14 + ...mental.preprocessing.-integer-lookup.pbtxt | 240 ++++++++++++++++++ ...cessing.-string-lookup.__metaclass__.pbtxt | 14 + ...imental.preprocessing.-string-lookup.pbtxt | 240 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 8 + ...essing.-integer-lookup.__metaclass__.pbtxt | 14 + ...mental.preprocessing.-integer-lookup.pbtxt | 237 +++++++++++++++++ ...cessing.-string-lookup.__metaclass__.pbtxt | 14 + ...imental.preprocessing.-string-lookup.pbtxt | 237 +++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 8 + 17 files changed, 1133 insertions(+), 22 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.__metaclass__.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py index 36e58ef6552..8ce1c7d8224 100644 --- a/tensorflow/python/keras/layers/__init__.py +++ b/tensorflow/python/keras/layers/__init__.py @@ -47,19 +47,31 @@ if tf2.enabled(): from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding from tensorflow.python.keras.layers.preprocessing.category_encoding_v1 import CategoryEncoding as CategoryEncodingV1 CategoryEncodingV2 = CategoryEncoding + from tensorflow.python.keras.layers.preprocessing.integer_lookup import IntegerLookup + from tensorflow.python.keras.layers.preprocessing.integer_lookup_v1 import IntegerLookup as IntegerLookupV1 + IntegerLookupV2 = IntegerLookup from tensorflow.python.keras.layers.preprocessing.normalization import Normalization from tensorflow.python.keras.layers.preprocessing.normalization_v1 import Normalization as NormalizationV1 NormalizationV2 = Normalization + from tensorflow.python.keras.layers.preprocessing.string_lookup import StringLookup + from tensorflow.python.keras.layers.preprocessing.string_lookup_v1 import StringLookup as StringLookupV1 + StringLookupV2 = StringLookup from tensorflow.python.keras.layers.preprocessing.text_vectorization import TextVectorization from tensorflow.python.keras.layers.preprocessing.text_vectorization_v1 import TextVectorization as TextVectorizationV1 
TextVectorizationV2 = TextVectorization else: + from tensorflow.python.keras.layers.preprocessing.integer_lookup_v1 import IntegerLookup + from tensorflow.python.keras.layers.preprocessing.integer_lookup import IntegerLookup as IntegerLookupV2 + IntegerLookupV1 = IntegerLookup from tensorflow.python.keras.layers.preprocessing.category_encoding_v1 import CategoryEncoding from tensorflow.python.keras.layers.preprocessing.category_encoding import CategoryEncoding as CategoryEncodingV2 CategoryEncodingV1 = CategoryEncoding from tensorflow.python.keras.layers.preprocessing.normalization_v1 import Normalization from tensorflow.python.keras.layers.preprocessing.normalization import Normalization as NormalizationV2 NormalizationV1 = Normalization + from tensorflow.python.keras.layers.preprocessing.string_lookup_v1 import StringLookup + from tensorflow.python.keras.layers.preprocessing.string_lookup import StringLookup as StringLookupV2 + StringLookupV1 = StringLookup from tensorflow.python.keras.layers.preprocessing.text_vectorization_v1 import TextVectorization from tensorflow.python.keras.layers.preprocessing.text_vectorization import TextVectorization as TextVectorizationV2 TextVectorizationV1 = TextVectorization diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py index 6f497983408..3512b9988c1 100644 --- a/tensorflow/python/keras/layers/preprocessing/integer_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py @@ -20,8 +20,10 @@ from __future__ import print_function from tensorflow.python.framework import dtypes from tensorflow.python.keras.layers.preprocessing import index_lookup from tensorflow.python.keras.layers.preprocessing import table_utils +from tensorflow.python.util.tf_export import keras_export +@keras_export("keras.layers.experimental.preprocessing.IntegerLookup", v1=[]) class IntegerLookup(index_lookup.IndexLookup): """Maps integers from a vocabulary to integer indices. @@ -39,18 +41,18 @@ class IntegerLookup(index_lookup.IndexLookup): Attributes: max_values: The maximum size of the vocabulary for this layer. If None, there is no cap on the size of the vocabulary. Note that this vocabulary - includes the OOV and mask tokens, so the effective number of tokens is - (max_tokens - num_oov_tokens - (1 if mask_token else 0)) + includes the OOV and mask values, so the effective number of values is + (max_values - num_oov_values - (1 if mask_token else 0)) num_oov_indices: The number of out-of-vocabulary values to use; defaults to - 1. If this value is more than 1, OOV inputs are hashed to determine their - OOV value; if this value is 0, passing an OOV input will result in a '-1' - being returned for that value in the output tensor. (Note that, because - the value is -1 and not 0, this will allow you to effectively drop OOV - values from categorical encodings.) + 1. If this value is more than 1, OOV inputs are modulated to determine + their OOV value; if this value is 0, passing an OOV input will result in + a '-1' being returned for that value in the output tensor. (Note that, + because the value is -1 and not 0, this will allow you to effectively drop + OOV values from categorical encodings.) mask_value: A value that represents masked inputs, and which is mapped to index 0. Defaults to 0. If set to None, no mask term will be added and the - OOV tokens, if any, will be indexed from (0...num_oov_tokens) instead of - (1...num_oov_tokens+1). 
+ OOV values, if any, will be indexed from (0...num_oov_values) instead of + (1...num_oov_values+1). oov_value: The value representing an out-of-vocabulary value. Defaults to -1. vocabulary: An optional list of values, or a path to a text file containing @@ -87,7 +89,7 @@ class IntegerLookup(index_lookup.IndexLookup): [0, -1, 42, 1138, 1000, 36, 12] Note how the mask value 0 and the OOV value -1 have been added to the - vocabulary. The remaining tokens are sorted by frequency (1138, which has + vocabulary. The remaining values are sorted by frequency (1138, which has 2 occurrences, is first) then by inverse sort order. >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) @@ -99,6 +101,27 @@ class IntegerLookup(index_lookup.IndexLookup): [2, 4, 5]])> + Lookups with multiple OOV tokens. + + This example demonstrates how to use a lookup layer with multiple OOV tokens. + When a layer is created with more than one OOV token, any OOV values are + hashed into the number of OOV buckets, distributing OOV values in a + deterministic fashion across the set. + + >>> vocab = [12, 36, 1138, 42] + >>> data = tf.constant([[12, 1138, 42], [37, 1000, 36]]) + >>> layer = IntegerLookup(vocabulary=vocab, num_oov_indices=2) + >>> layer(data) + + + Note that the output for OOV value 37 is 2, while the output for OOV value + 1000 is 1. The in-vocab terms have their output index increased by 1 from + earlier examples (12 maps to 3, etc) in order to make space for the extra OOV + value. + + Inverse lookup This example demonstrates how to map indices to values using this layer. (You diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py index ec326f4d78b..2a86e9d56b0 100644 --- a/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py @@ -19,7 +19,9 @@ from __future__ import print_function from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 from tensorflow.python.keras.layers.preprocessing import integer_lookup +from tensorflow.python.util.tf_export import keras_export +@keras_export(v1=["keras.layers.experimental.preprocessing.IntegerLookup"]) class IntegerLookup(integer_lookup.IntegerLookup, index_lookup_v1.IndexLookup): pass diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup.py b/tensorflow/python/keras/layers/preprocessing/string_lookup.py index a420de8678a..d772f57aa4d 100644 --- a/tensorflow/python/keras/layers/preprocessing/string_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup.py @@ -20,8 +20,10 @@ from __future__ import print_function from tensorflow.python.framework import dtypes from tensorflow.python.keras.layers.preprocessing import index_lookup from tensorflow.python.keras.layers.preprocessing import table_utils +from tensorflow.python.util.tf_export import keras_export +@keras_export("keras.layers.experimental.preprocessing.StringLookup", v1=[]) class StringLookup(index_lookup.IndexLookup): """Maps strings from a vocabulary to integer indices. @@ -52,7 +54,7 @@ class StringLookup(index_lookup.IndexLookup): will be added and the OOV tokens, if any, will be indexed from (0...num_oov_indices) instead of (1...num_oov_indices+1). oov_token: The token representing an out-of-vocabulary value. Defaults to - "[OOV]". + "[UNK]". vocabulary: An optional list of vocabulary terms, or a path to a text file containing a vocabulary to load into this layer. 
The file should contain one token per line. If the list or file contains the same token multiple @@ -85,9 +87,9 @@ class StringLookup(index_lookup.IndexLookup): >>> layer = StringLookup() >>> layer.adapt(data) >>> layer.get_vocabulary() - ['', '[OOV]', 'd', 'z', 'c', 'b', 'a'] + ['', '[UNK]', 'd', 'z', 'c', 'b', 'a'] - Note how the mask token '' and the OOV token [OOV] have been added to the + Note how the mask token '' and the OOV token [UNK] have been added to the vocabulary. The remaining tokens are sorted by frequency ('d', which has 2 occurrences, is first) then by inverse sort order. @@ -99,6 +101,25 @@ class StringLookup(index_lookup.IndexLookup): array([[6, 4, 2], [2, 3, 5]])> + Lookups with multiple OOV tokens. + + This example demonstrates how to use a lookup layer with multiple OOV tokens. + When a layer is created with more than one OOV token, any OOV values are + hashed into the number of OOV buckets, distributing OOV values in a + deterministic fashion across the set. + + >>> vocab = ["a", "b", "c", "d"] + >>> data = tf.constant([["a", "c", "d"], ["m", "z", "b"]]) + >>> layer = StringLookup(vocabulary=vocab, num_oov_indices=2) + >>> layer(data) + + + Note that the output for OOV value 'm' is 1, while the output for OOV value + 'z' is 2. The in-vocab terms have their output index increased by 1 from + earlier examples (a maps to 3, etc) in order to make space for the extra OOV + value. Inverse lookup @@ -112,7 +133,7 @@ class StringLookup(index_lookup.IndexLookup): >>> layer(data) + [b'd', b'[UNK]', b'b']], dtype=object)> Note that the integer 5, which is out of the vocabulary space, returns an OOV token. @@ -131,9 +152,9 @@ class StringLookup(index_lookup.IndexLookup): >>> i_layer(int_data) + [b'd', b'[UNK]', b'b']], dtype=object)> - In this example, the input value 'z' resulted in an output of '[OOV]', since + In this example, the input value 'z' resulted in an output of '[UNK]', since 1000 was not in the vocabulary - it got represented as an OOV, and all OOV values are returned as '[OOV}' in the inverse layer. 
Also, note that for the inverse to work, you must have already set the forward layer vocabulary @@ -144,9 +165,9 @@ class StringLookup(index_lookup.IndexLookup): max_tokens=None, num_oov_indices=1, mask_token="", - oov_token="[OOV]", + oov_token="[UNK]", vocabulary=None, - encoding="utf-8", + encoding=None, invert=False, **kwargs): allowed_dtypes = [dtypes.string] @@ -158,6 +179,9 @@ class StringLookup(index_lookup.IndexLookup): if "dtype" not in kwargs: kwargs["dtype"] = dtypes.string + if encoding is None: + encoding = "utf-8" + if vocabulary is not None: if isinstance(vocabulary, str): vocabulary = table_utils.get_vocabulary_from_file(vocabulary, encoding) diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py index 0b9081d815c..2b45b59fcf4 100644 --- a/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py @@ -36,6 +36,7 @@ from tensorflow.python.keras.layers.preprocessing import string_lookup from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import gfile from tensorflow.python.platform import test @@ -155,7 +156,7 @@ class StringLookupVocabularyTest(keras_parameterized.TestCase, def test_get_vocab_returns_str(self): vocab_data = ["earth", "wind", "and", "fire"] - expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"] + expected_vocab = ["", "[UNK]", "earth", "wind", "and", "fire"] layer = get_layer_class()(vocabulary=vocab_data) layer_vocab = layer.get_vocabulary() self.assertAllEqual(expected_vocab, layer_vocab) @@ -205,7 +206,7 @@ class StringLookupVocabularyTest(keras_parameterized.TestCase, input_array = np.array([["earth", "wind", "and", "fire"], ["fire", "and", "earth", "michigan"]]) expected_output = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "[OOV]"]]) + ["fire", "and", "earth", "[UNK]"]]) input_data = keras.Input(shape=(None,), dtype=dtypes.string) layer = get_layer_class()(vocabulary=vocab_data) @@ -217,6 +218,21 @@ class StringLookupVocabularyTest(keras_parameterized.TestCase, output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) + def test_ragged_string_input_multi_bucket(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = ragged_factory_ops.constant([["earth", "wind", "fire"], + ["fire", "and", "earth", + "ohio"]]) + expected_output = [[3, 4, 6], [6, 5, 3, 2]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) + layer = get_layer_class()(num_oov_indices=2) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + @keras_parameterized.run_all_keras_modes(always_skip_eager=True) class StringLookupSaveableTest(keras_parameterized.TestCase, diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py index 0d4c70de655..3b5d0679372 100644 --- a/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py @@ -19,7 +19,9 @@ 
from __future__ import print_function from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 from tensorflow.python.keras.layers.preprocessing import string_lookup +from tensorflow.python.util.tf_export import keras_export +@keras_export(v1=["keras.layers.experimental.preprocessing.StringLookup"]) class StringLookup(string_lookup.StringLookup, index_lookup_v1.IndexLookup): pass diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 6b58a08a4bf..d990f2075c8 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -51,8 +51,12 @@ from tensorflow.python.keras.layers.preprocessing import category_encoding_v1 from tensorflow.python.keras.layers.preprocessing import discretization from tensorflow.python.keras.layers.preprocessing import hashing from tensorflow.python.keras.layers.preprocessing import image_preprocessing +from tensorflow.python.keras.layers.preprocessing import integer_lookup as preprocessing_integer_lookup +from tensorflow.python.keras.layers.preprocessing import integer_lookup_v1 as preprocessing_integer_lookup_v1 from tensorflow.python.keras.layers.preprocessing import normalization as preprocessing_normalization from tensorflow.python.keras.layers.preprocessing import normalization_v1 as preprocessing_normalization_v1 +from tensorflow.python.keras.layers.preprocessing import string_lookup as preprocessing_string_lookup +from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 as preprocessing_string_lookup_v1 from tensorflow.python.keras.layers.preprocessing import text_vectorization as preprocessing_text_vectorization from tensorflow.python.keras.layers.preprocessing import text_vectorization_v1 as preprocessing_text_vectorization_v1 from tensorflow.python.keras.utils import generic_utils @@ -63,11 +67,13 @@ from tensorflow.python.util.tf_export import keras_export ALL_MODULES = (base_layer, input_layer, advanced_activations, convolutional, convolutional_recurrent, core, cudnn_recurrent, dense_attention, embeddings, einsum_dense, local, merge, noise, normalization, - pooling, image_preprocessing, preprocessing_normalization_v1, + pooling, image_preprocessing, preprocessing_integer_lookup_v1, + preprocessing_normalization_v1, preprocessing_string_lookup_v1, preprocessing_text_vectorization_v1, recurrent, wrappers, hashing, category_crossing, category_encoding_v1, discretization) ALL_V2_MODULES = (rnn_cell_wrapper_v2, normalization_v2, recurrent_v2, - preprocessing_normalization, preprocessing_text_vectorization, + preprocessing_integer_lookup, preprocessing_normalization, + preprocessing_string_lookup, preprocessing_text_vectorization, category_encoding) # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. 
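For orientation, a minimal usage sketch of what the newly exported public paths enable (illustrative only, not part of the patch; the vocabulary and input values below are invented, and the doctest style mirrors the layer docstrings above):

>>> import tensorflow as tf
>>> # StringLookup is now reachable under the experimental preprocessing namespace.
>>> vocab = ["a", "b", "c", "d"]
>>> data = tf.constant([["a", "c", "d"], ["d", "z", "b"]])
>>> layer = tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=vocab)
>>> layer(data)  # "z" is out of vocabulary and maps to the [UNK] index (1)
>>> # IntegerLookup works the same way for integer vocabularies; with
>>> # num_oov_indices=2, OOV values are distributed across two OOV buckets.
>>> int_layer = tf.keras.layers.experimental.preprocessing.IntegerLookup(
...     vocabulary=[12, 36, 1138, 42], num_oov_indices=2)
>>> int_layer(tf.constant([[12, 1138, 42], [37, 1000, 36]]))

The golden API files added below record exactly these new public symbols for the v1 and v2 API surfaces.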
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.__metaclass__.pbtxt new file mode 100644 index 00000000000..409509cd4d2 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.IntegerLookup.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt new file mode 100644 index 00000000000..c0052764039 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt @@ -0,0 +1,240 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.IntegerLookup" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_values\', \'num_oov_indices\', \'mask_value\', \'oov_value\', \'vocabulary\', \'invert\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'0\', \'-1\', \'None\', \'False\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', 
\'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_vocabulary" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_vocabulary" + argspec: "args=[\'self\', \'vocab\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "vocab_size" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" 
+ argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.__metaclass__.pbtxt new file mode 100644 index 00000000000..4cb57350380 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.StringLookup.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt new file mode 100644 index 00000000000..1deb932cb4e --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt @@ -0,0 +1,240 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.StringLookup" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_tokens\', \'num_oov_indices\', \'mask_token\', \'oov_token\', \'vocabulary\', \'encoding\', \'invert\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'\', \'[UNK]\', \'None\', \'None\', \'False\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, 
defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_vocabulary" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_vocabulary" + argspec: "args=[\'self\', \'vocab\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "vocab_size" + argspec: 
"args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index 4a0522dc08f..94f6e3ba990 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -20,6 +20,10 @@ tf_module { name: "Hashing" mtype: "" } + member { + name: "IntegerLookup" + mtype: "" + } member { name: "Normalization" mtype: "" @@ -68,6 +72,10 @@ tf_module { name: "Resizing" mtype: "" } + member { + name: "StringLookup" + mtype: "" + } member { name: "TextVectorization" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.__metaclass__.pbtxt new file mode 100644 index 00000000000..409509cd4d2 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.IntegerLookup.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt new file mode 100644 index 00000000000..dedb0f3073f --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-integer-lookup.pbtxt @@ -0,0 +1,237 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.IntegerLookup" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + 
mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_values\', \'num_oov_indices\', \'mask_value\', \'oov_value\', \'vocabulary\', \'invert\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'0\', \'-1\', \'None\', \'False\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: 
"args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_vocabulary" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_vocabulary" + argspec: "args=[\'self\', \'vocab\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "vocab_size" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.__metaclass__.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.__metaclass__.pbtxt new file mode 100644 index 00000000000..4cb57350380 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.__metaclass__.pbtxt @@ -0,0 +1,14 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.StringLookup.__metaclass__" +tf_class { + is_instance: "" + member_method { + name: "__init__" + } + member_method { + name: "mro" + } + member_method { + name: "register" + argspec: "args=[\'cls\', \'subclass\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt new file mode 100644 index 00000000000..b419e779c48 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-string-lookup.pbtxt @@ -0,0 +1,237 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.StringLookup" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } 
+ member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'max_tokens\', \'num_oov_indices\', \'mask_token\', \'oov_token\', \'vocabulary\', \'encoding\', \'invert\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'1\', \'\', \'[UNK]\', \'None\', \'None\', \'False\'], " + } + member_method { + name: "adapt" + argspec: "args=[\'self\', \'data\', \'reset_state\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + 
member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_vocabulary" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_vocabulary" + argspec: "args=[\'self\', \'vocab\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "vocab_size" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index 4a0522dc08f..94f6e3ba990 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -20,6 +20,10 @@ tf_module { name: "Hashing" mtype: "" } + member { + name: "IntegerLookup" + mtype: "" + } member { name: "Normalization" mtype: "" @@ -68,6 +72,10 @@ tf_module { name: "Resizing" mtype: "" } + member { + name: "StringLookup" + mtype: "" + } member { name: "TextVectorization" mtype: "" From 838341539843b1b972f9a8bbb3afa2d2288a6c63 Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Fri, 29 May 2020 10:08:56 -0700 Subject: [PATCH 1387/1533] TFLite GPU: Expand how an HWC tensor is read. 
https://github.com/tensorflow/tensorflow/issues/39749 PiperOrigin-RevId: 313801850 Change-Id: I6017483d960abbc67572806f943c0b41cb6b5410 --- .../gpu/common/model_builder_helper.cc | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc index a1705e6cf78..a0f7db25210 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder_helper.cc @@ -259,18 +259,25 @@ absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, Linear* shape) { } absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, HWC* shape) { - if (dimensions->size != 4) { - return absl::InvalidArgumentError( - absl::StrCat("Expected a 4D tensor of shape 1xHxWxC but got ", - GetDimensionString(dimensions))); + if (dimensions->size == 3) { + shape->h = dimensions->data[0]; + shape->w = dimensions->data[1]; + shape->c = dimensions->data[2]; + return absl::OkStatus(); } - if (dimensions->data[0] != 1) { - return absl::UnimplementedError("Batch size is not equal to 1."); + if (dimensions->size == 4) { + if (dimensions->data[0] != 1) { + return absl::UnimplementedError("Batch size is not equal to 1."); + } + shape->h = dimensions->data[1]; + shape->w = dimensions->data[2]; + shape->c = dimensions->data[3]; + return absl::OkStatus(); } - shape->h = dimensions->data[1]; - shape->w = dimensions->data[2]; - shape->c = dimensions->data[3]; - return absl::OkStatus(); + return absl::InvalidArgumentError( + absl::StrCat("Expected a 3D tensor of shape HxWxC or a 4D tensor of " + "shape 1xHxWxC but got ", + GetDimensionString(dimensions))); } absl::Status SetAllDimensions(const TfLiteIntArray* dimensions, HW* shape) { From f36a1a090e469c632448722186a26a416fda4596 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 29 May 2020 10:11:51 -0700 Subject: [PATCH 1388/1533] Lower level of python TraceMe to 1 PiperOrigin-RevId: 313802412 Change-Id: Ia7d3e0866f13676b6562f3290ca8385a1a37c312 --- .../python/profiler/internal/traceme_wrapper.h | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/profiler/internal/traceme_wrapper.h b/tensorflow/python/profiler/internal/traceme_wrapper.h index c074e909640..6e61d9cdd8f 100644 --- a/tensorflow/python/profiler/internal/traceme_wrapper.h +++ b/tensorflow/python/profiler/internal/traceme_wrapper.h @@ -34,13 +34,15 @@ class TraceMeWrapper { // pybind11::str and pybind11::kwargs are taken by const reference to avoid // python reference-counting overhead. TraceMeWrapper(const pybind11::str& name, const pybind11::kwargs& kwargs) - : traceme_([&]() { - std::string name_and_metadata(name); - if (!kwargs.empty()) { - AppendMetadata(&name_and_metadata, kwargs); - } - return name_and_metadata; - }) {} + : traceme_( + [&]() { + std::string name_and_metadata(name); + if (!kwargs.empty()) { + AppendMetadata(&name_and_metadata, kwargs); + } + return name_and_metadata; + }, + /*level=*/1) {} // pybind11::kwargs is taken by const reference to avoid python // reference-counting overhead. From 55987dbb42b8f066154759785e240c4fe8b825a9 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Fri, 29 May 2020 10:44:36 -0700 Subject: [PATCH 1389/1533] Adds `custom_grad` and `vjp` to tf_numpy/extensions and trax/math. Also changes tf.custom_gradient to allow nested structures as inputs (currently it only allows a list of tensors). 
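For illustration only (this snippet is not part of the patch), a minimal sketch of the calling convention the revised docstring describes: `f` takes plain `Tensor` inputs and `grad_fn` returns one gradient per input. The `log1pexp` function below is a hypothetical example, not code touched by this commit.

import tensorflow as tf

@tf.custom_gradient
def log1pexp(x):
  # The forward value may overflow for large x, but the custom gradient stays finite.
  e = tf.exp(x)
  def grad(dy):
    # One upstream gradient (dy) in, one gradient per input Tensor out.
    return dy * (1 - 1 / (1 + e))
  return tf.math.log(1 + e), grad

x = tf.constant(100.0)
with tf.GradientTape() as tape:
  tape.watch(x)
  y = log1pexp(x)
print(tape.gradient(y, x))  # 1.0; the naive gradient would evaluate to nan here.

After this change the documented contract covers sequences of Tensors rather than arbitrary nested structures.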
PiperOrigin-RevId: 313808985 Change-Id: Ibf58547ec938c31324156d45b49771ceeafd10ab --- tensorflow/python/ops/custom_gradient.py | 35 +++++++--------- tensorflow/python/ops/gradients_test.py | 52 ++++-------------------- 2 files changed, 22 insertions(+), 65 deletions(-) diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py index 953bb252729..2a9194fb146 100644 --- a/tensorflow/python/ops/custom_gradient.py +++ b/tensorflow/python/ops/custom_gradient.py @@ -175,23 +175,20 @@ def custom_gradient(f=None): Args: f: function `f(*x)` that returns a tuple `(y, grad_fn)` where: - - `x` is a sequence of (nested structures of) `Tensor` inputs to the - function. - - `y` is a (nested structure of) `Tensor` outputs of applying TensorFlow - operations in `f` to `x`. + - `x` is a sequence of `Tensor` inputs to the function. + - `y` is a `Tensor` or sequence of `Tensor` outputs of applying + TensorFlow operations in `f` to `x`. - `grad_fn` is a function with the signature `g(*grad_ys)` which returns - a list of `Tensor`s the same size as (flattened) `x` - the derivatives - of `Tensor`s in `y` with respect to the `Tensor`s in `x`. `grad_ys` is - a sequence of `Tensor`s the same size as (flattened) `y` holding the - initial value gradients for each `Tensor` in `y`. - - In a pure mathematical sense, a vector-argument vector-valued function - `f`'s derivatives should be its Jacobian matrix `J`. Here we are - expressing the Jacobian `J` as a function `grad_fn` which defines how - `J` will transform a vector `grad_ys` when left-multiplied with it - (`grad_ys * J`, the vector-Jacobian product, or VJP). This functional - representation of a matrix is convenient to use for chain-rule - calculation (in e.g. the back-propagation algorithm). + a list of `Tensor`s - the derivatives of `Tensor`s in `y` with respect + to the `Tensor`s in `x`. `grad_ys` is a `Tensor` or sequence of + `Tensor`s the same size as `y` holding the initial value gradients for + each `Tensor` in `y`. In a pure mathematical sense, a vector-argument + vector-valued function `f`'s derivatives should be its Jacobian matrix + `J`. Here we are expressing the Jacobian `J` as a function `grad_fn` + which defines how `J` will transform a vector `grad_ys` when + left-multiplied with it (`grad_ys * J`). This functional representation + of a matrix is convenient to use for chain-rule calculation + (in e.g. the back-propagation algorithm). If `f` uses `Variable`s (that are not part of the inputs), i.e. through `get_variable`, then `grad_fn` should have @@ -310,7 +307,7 @@ def _graph_mode_decorator(f, args, kwargs): "The custom_gradient decorator currently supports keywords " "arguments only when eager execution is enabled.") name = "CustomGradient-%s" % ops.uid() - args = nest.map_structure(ops.convert_to_tensor, args) + args = [ops.convert_to_tensor(x) for x in args] # Checking global and local variables attempts to ensure that no non-resource # Variables are added to the graph. 
@@ -321,7 +318,6 @@ def _graph_mode_decorator(f, args, kwargs): ]) with tape_lib.VariableWatcher() as variable_watcher: result, grad_fn = f(*args) - args = nest.flatten(args) after_vars = set([ v.ref() for v in current_var_scope.global_variables() + current_var_scope.local_variables() @@ -408,7 +404,6 @@ def _eager_mode_decorator(f, args, kwargs): """Implement custom gradient decorator for eager mode.""" with tape_lib.VariableWatcher() as variable_watcher: result, grad_fn = f(*args, **kwargs) - args = nest.flatten(args) all_inputs = list(args) + list(kwargs.values()) # The variables that grad_fn needs to return gradients for are the set of # variables used that are *not* part of the inputs. @@ -448,7 +443,7 @@ def _eager_mode_decorator(f, args, kwargs): raise ValueError( "custom_gradient function expected to return", arg_count, "gradients but returned", len(flat_grads), "instead.") - return flat_grads + variable_grads + return nest.flatten(input_grads) + variable_grads tape_lib.record_operation(f.__name__, flat_result, recorded_inputs, actual_grad_fn) diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py index 9a9ce72a557..a06be7af74b 100644 --- a/tensorflow/python/ops/gradients_test.py +++ b/tensorflow/python/ops/gradients_test.py @@ -60,7 +60,6 @@ from tensorflow.python.ops import variables from tensorflow.python.ops.nn_ops import bias_add from tensorflow.python.platform import googletest from tensorflow.python.ops import gradient_checker_v2 -from tensorflow.python.util import nest class GradientsTest(test_util.TensorFlowTestCase, parameterized.TestCase): @@ -1040,7 +1039,7 @@ class GetDependentVariablesTest(test_util.TensorFlowTestCase): self.assertEqual(dependent_vars, [var]) -class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): +class CustomGradientTest(test_util.TensorFlowTestCase): def testCustomGradientTrivial(self): @@ -1120,7 +1119,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): out = core_layers.dense(x, 3, use_bias=False) def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name - self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert + self.assertEqual(1, len(variables)) grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad) return grads[0], [array_ops.ones((4, 3))] @@ -1147,7 +1146,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): out = core_layers.dense(x, 3, use_bias=False) def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name - self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert + self.assertEqual(1, len(variables)) grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad) return grads[0], [array_ops.ones((3, 3))] @@ -1186,7 +1185,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name del out_grad - self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert + self.assertEqual(1, len(variables)) return (array_ops.ones((3, 2)), [array_ops.ones((2, 4))]) @@ -1210,7 +1209,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name del out_grad - self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert + self.assertEqual(1, len(variables)) return (array_ops.ones((3, 2)), [array_ops.ones((2, 4))]) return out, Grad @@ 
-1274,7 +1273,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): out = core_layers.dense(x, 3, use_bias=False) def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name - self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert + self.assertEqual(1, len(variables)) grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad) return grads[0], [array_ops.ones((4, 3))] @@ -1285,7 +1284,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): out = F(x) def Grad(out_grad, variables=None): # pylint: disable=redefined-outer-name - self.assertEqual(1, len(variables)) # pylint: disable=g-generic-assert + self.assertEqual(1, len(variables)) grads = gradients.gradients(out, [x, variables[0]], grad_ys=out_grad) return grads[0], [array_ops.ones((4, 3))] @@ -1304,43 +1303,6 @@ class CustomGradientTest(test_util.TensorFlowTestCase, parameterized.TestCase): dw = sess.run(math_ops.reduce_sum(grads[1])) self.assertEqual(12., dw) - @parameterized.named_parameters( - [(("_%s_%s" % (x_struct, y_struct)).replace(" ", "").replace("None", ""), # pylint: disable=g-complex-comprehension - x_struct, y_struct) - for y_struct in [[None, ()], (None, (), [], (None, ((), None)))] - for x_struct in [(None, ()), (((), ()), [None, None], [], (None, ()))] - ]) - @test_util.run_in_graph_and_eager_modes - def testCustomGradientStructuralInputOutput(self, x_struct, y_struct): - """Tests that custom_gradient can handle structured inputs/outputs.""" - def Zeros(x): - return nest.map_structure(lambda _: array_ops.zeros([], "float32"), x) - def GetStruct(x): - return nest.map_structure(lambda _: None, x) - - def MakeVjp(f, *x): - with backprop.GradientTape(persistent=True) as tape: - tape.watch(nest.flatten(x)) - y = f(*x) - def Vjp(dy): - return tape.gradient(y, x, output_gradients=dy) - return y, Vjp - - @custom_gradient.custom_gradient - def F(*x): - self.assertEqual(x_struct, GetStruct(x)) - def Vjp(*dy): - self.assertEqual(len(nest.flatten(y_struct)), - len(nest.flatten(dy))) - return nest.flatten(Zeros(x_struct)) - return Zeros(y_struct), Vjp - - x, dy = Zeros([x_struct, y_struct]) - y, vjp = MakeVjp(F, *x) - dx = vjp(dy) - self.assertEqual(x_struct, GetStruct(dx)) - self.assertEqual(y_struct, GetStruct(y)) - class TensorListGradientsTest(test_util.TensorFlowTestCase): From e3e7bd4bf36c0d352695b4dd2d901225d5e9358b Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Fri, 29 May 2020 10:45:00 -0700 Subject: [PATCH 1390/1533] Don't use the cuDNN GPU kernel for LSTMs when inputs are RaggedTensors. PiperOrigin-RevId: 313809073 Change-Id: I368307d458f8b23c320b9c4df31c81d18e7ab43d --- tensorflow/python/keras/layers/recurrent_v2.py | 8 ++++++-- tensorflow/python/keras/layers/recurrent_v2_test.py | 12 ++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py index a9d5ef8587c..adefb689a1f 100644 --- a/tensorflow/python/keras/layers/recurrent_v2.py +++ b/tensorflow/python/keras/layers/recurrent_v2.py @@ -413,7 +413,9 @@ class GRU(recurrent.DropoutRNNCellMixin, recurrent.GRU): input_shape = K.int_shape(inputs) timesteps = input_shape[0] if self.time_major else input_shape[1] - if not self._could_use_gpu_kernel: + # TODO(b/156447398) Investigate why the cuDNN kernel kernel fails with + # ragged inputs. 
+ if is_ragged_input or not self._could_use_gpu_kernel: kwargs = {'training': training} self._maybe_reset_cell_dropout_mask(self.cell) @@ -1109,7 +1111,9 @@ class LSTM(recurrent.DropoutRNNCellMixin, recurrent.LSTM): input_shape = K.int_shape(inputs) timesteps = input_shape[0] if self.time_major else input_shape[1] - if not self._could_use_gpu_kernel: + # TODO(b/156447398) Investigate why the cuDNN kernel kernel fails with + # ragged inputs. + if is_ragged_input or not self._could_use_gpu_kernel: # Fall back to use the normal LSTM. kwargs = {'training': training} self._maybe_reset_cell_dropout_mask(self.cell) diff --git a/tensorflow/python/keras/layers/recurrent_v2_test.py b/tensorflow/python/keras/layers/recurrent_v2_test.py index 4cb964b4bc4..ec70761c8a8 100644 --- a/tensorflow/python/keras/layers/recurrent_v2_test.py +++ b/tensorflow/python/keras/layers/recurrent_v2_test.py @@ -30,7 +30,9 @@ from tensorflow.python.eager import context from tensorflow.python.framework import test_util from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.layers import embeddings from tensorflow.python.keras.layers import recurrent_v2 as rnn_v2 +from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test @@ -113,6 +115,16 @@ class RNNV2Test(keras_parameterized.TestCase): model = keras.Model(inputs=inputs, outputs=outputs, name='3_layer') model.save(os.path.join(self.get_temp_dir(), 'model'), save_format='tf') + @parameterized.parameters([rnn_v2.LSTM, rnn_v2.GRU]) + def test_ragged(self, layer): + vocab_size = 100 + inputs = ragged_factory_ops.constant( + np.random.RandomState(0).randint(0, vocab_size, [128, 25])) + embedder = embeddings.Embedding(input_dim=vocab_size, output_dim=16) + embedded_inputs = embedder(inputs) + lstm = layer(32) + lstm(embedded_inputs) + if __name__ == '__main__': test.main() From b1e4822b73315ee851000ab5f97ccb2255b8a1c3 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Fri, 29 May 2020 10:50:11 -0700 Subject: [PATCH 1391/1533] Encoding layer should return float PiperOrigin-RevId: 313810155 Change-Id: Id4cdd9456951f3b5a6206383a95fabf6b92ea947 --- .../layers/preprocessing/category_encoding.py | 15 ++++++++------- .../layers/preprocessing/text_vectorization.py | 2 +- .../preprocessing/text_vectorization_test.py | 6 +++--- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/category_encoding.py b/tensorflow/python/keras/layers/preprocessing/category_encoding.py index b9460ed059c..a0b7d275e35 100644 --- a/tensorflow/python/keras/layers/preprocessing/category_encoding.py +++ b/tensorflow/python/keras/layers/preprocessing/category_encoding.py @@ -65,11 +65,11 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): >>> layer = tf.keras.layers.experimental.preprocessing.CategoryEncoding( ... 
max_tokens=4) >>> layer([[0, 1], [0, 0], [1, 2], [3, 1]]) - + Examples with weighted inputs: @@ -286,18 +286,19 @@ class CategoryEncoding(base_preprocessing_layer.CombinerPreprocessingLayer): binary_output = (self._output_mode == BINARY) if self._sparse: - return bincount_ops.sparse_bincount( + result = bincount_ops.sparse_bincount( inputs, weights=count_weights, minlength=out_depth, axis=-1, binary_output=binary_output) + return math_ops.cast(result, K.floatx()) else: result = bincount_ops.bincount( inputs, weights=count_weights, minlength=out_depth, - dtype=dtypes.int64, + dtype=K.floatx(), axis=-1, binary_output=binary_output) result.set_shape(tensor_shape.TensorShape((None, out_depth))) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index c80f998fe46..bff7969477c 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -341,7 +341,7 @@ class TextVectorization(CombinerPreprocessingLayer): def compute_output_signature(self, input_spec): output_shape = self.compute_output_shape(input_spec.shape.as_list()) - output_dtype = K.floatx() if self._output_mode == TFIDF else dtypes.int64 + output_dtype = dtypes.int64 if self._output_mode == INT else K.floatx() return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype) def adapt(self, data, reset_state=True): diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index 508f222eac7..c641b2b71c9 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -269,10 +269,10 @@ class TextVectorizationLayerTest(keras_parameterized.TestCase, def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, use_dataset, expected_output): cls = get_layer_class() - if kwargs.get("output_mode") == text_vectorization.TFIDF: - expected_output_dtype = dtypes.float32 - else: + if kwargs.get("output_mode") == text_vectorization.INT: expected_output_dtype = dtypes.int64 + else: + expected_output_dtype = dtypes.float32 input_shape = input_data.shape if use_dataset: From dae0485fed2c341261a00af498afc9723cfd69cf Mon Sep 17 00:00:00 2001 From: Sung Jin Hwang Date: Fri, 29 May 2020 10:51:52 -0700 Subject: [PATCH 1392/1533] Pass static shapes for TensorArray used inside tf.map_fn(). If fn_output_signature argument contains TensorSpec then extract the static shape information and pass them to TensorArray used to collect output slices. This is required for TF Lite conversion of models using tf.map_fn(), because the conversion rejects TensorArray/List with unknown element shapes. PiperOrigin-RevId: 313810480 Change-Id: I1d1e381b7212b3e691d55ff43e84789a7b9ecd9a --- tensorflow/python/ops/map_fn.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 40f8edfcdd1..96810805c18 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -453,13 +453,14 @@ def map_fn(fn, # Prepare result tensor array. # TODO(edloper): Should we set infer_shape=False for composite tensors? 
- result_batchable_dtype = _result_flat_signature_to_batchable_dtype( - result_flat_signature) - result_batchable_ta = [ - tensor_array_ops.TensorArray( - dtype=dt, size=n, dynamic_size=False, infer_shape=infer_shape) - for dt in result_batchable_dtype - ] + result_batchable_tensor_spec = ( + _result_flat_signature_to_batchable_tensor_spec(result_flat_signature)) + result_batchable_ta = [] + for spec in result_batchable_tensor_spec: + result_batchable_ta.append( + tensor_array_ops.TensorArray( + dtype=spec.dtype, size=n, dynamic_size=False, + infer_shape=infer_shape, element_shape=spec.shape)) def compute(i, tas): """The loop body of map_fn. @@ -538,15 +539,14 @@ def _most_general_compatible_type(spec): return spec -def _result_flat_signature_to_batchable_dtype(result_flat_signature): - """Converts result_flat_signature -> result_batchable_dtype.""" - components = [] +def _result_flat_signature_to_batchable_tensor_spec(result_flat_signature): + """Converts result_flat_signature -> result_batchable_tensor_specs.""" + tensor_specs = [] for spec in result_flat_signature: if not isinstance(spec, type_spec.BatchableTypeSpec): raise TypeError("map_fn can not generate %s outputs" % (spec,)) - # pylint: disable=protected-access - components.extend([s.dtype for s in spec._flat_tensor_specs]) - return components + tensor_specs.extend(spec._flat_tensor_specs) # pylint: disable=protected-access + return tensor_specs def _elems_flat_to_batchable(elems_flat): From 58f1e310194bbff267a5042061628b68c7fc5207 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 29 May 2020 11:06:53 -0700 Subject: [PATCH 1393/1533] Fix c_api_remote_test tsan flakiness. PiperOrigin-RevId: 313813747 Change-Id: I428eaa271adcb0cca3236edd0c52232dda9719a6 --- tensorflow/c/eager/BUILD | 31 +- tensorflow/c/eager/c_api_cluster_test.cc | 20 - tensorflow/c/eager/c_api_distributed_test.cc | 506 ++++++++++++++++++ tensorflow/c/eager/c_api_remote_test.cc | 488 ----------------- tensorflow/c/eager/c_api_test_util.cc | 22 + tensorflow/c/eager/c_api_test_util.h | 8 + .../rpc/eager/grpc_eager_client.cc | 2 +- 7 files changed, 567 insertions(+), 510 deletions(-) create mode 100644 tensorflow/c/eager/c_api_distributed_test.cc diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index b8429646960..407acfe1ca9 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -370,7 +370,6 @@ tf_cuda_cc_test( extra_copts = tfe_xla_copts(), tags = [ "noasan", # leaks gRPC server instances - "notsan", # b/157098283 ], deps = [ ":c_api", @@ -392,6 +391,36 @@ tf_cuda_cc_test( ], ) +tf_cuda_cc_test( + name = "c_api_distributed_test", + size = "small", + srcs = [ + "c_api_distributed_test.cc", + ], + # TODO(b/136478427): Figure out how to correctly shut the server down + args = ["--heap_check=local"], + extra_copts = tfe_xla_copts(), + tags = ["noasan"], # leaks gRPC server instances + deps = [ + ":c_api", + ":c_api_experimental", + ":c_api_internal", + ":c_api_test_util", + ":tfe_tensorhandle_internal", + "//tensorflow/c:c_test_util", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime:function_optimization_registry", + "//tensorflow/core/common_runtime/eager:eager_operation", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "@com_google_absl//absl/strings", + ], +) + tf_cuda_cc_test( name = "c_api_cluster_test", size = "small", diff --git 
a/tensorflow/c/eager/c_api_cluster_test.cc b/tensorflow/c/eager/c_api_cluster_test.cc index f8c702d592a..7a604950a63 100644 --- a/tensorflow/c/eager/c_api_cluster_test.cc +++ b/tensorflow/c/eager/c_api_cluster_test.cc @@ -30,26 +30,6 @@ namespace { using ::tensorflow::string; -tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { - tensorflow::ServerDef server_def; - server_def.set_protocol("grpc"); - server_def.set_job_name(job_name); - server_def.set_task_index(0); - tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); - tensorflow::JobDef* job_def = cluster_def->add_job(); - job_def->set_name(job_name); - for (int i = 0; i < num_tasks; i++) { - int port = tensorflow::testing::PickUnusedPortOrDie(); - job_def->mutable_tasks()->insert( - {i, tensorflow::strings::StrCat("localhost", ":", port)}); - } - return server_def; -} - -tensorflow::ServerDef GetServerDef(int num_tasks) { - return GetServerDef("localhost", num_tasks); -} - void ReplaceTaskInServerDef(tensorflow::ServerDef* server_def, int task_index) { tensorflow::JobDef* job_def = server_def->mutable_cluster()->mutable_job(0); int port = tensorflow::testing::PickUnusedPortOrDie(); diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc new file mode 100644 index 00000000000..65f8d3cc646 --- /dev/null +++ b/tensorflow/c/eager/c_api_distributed_test.cc @@ -0,0 +1,506 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/c/eager/c_api_internal.h" +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/function_optimization_registry.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/cluster.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" + +namespace { + +using ::tensorflow::string; + +// Add the values of three variables on three different tasks. 
+string AddVariablesFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'AddVariablesFunction'" + " input_arg {" + " name: 'var'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'sum'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:0/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'read1'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:1/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'read2'" + " op: 'ReadVariableOp'" + " input: 'var'" + " device: '/job:localhost/replica:0/task:2/device:CPU:0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add1'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read1:value:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add2'" + " op: 'Add'" + " input: 'add1:z:0'" + " input: 'read2:value:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'sum'" + " value: 'add2:z:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "VarIsInitializedOp", status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(op, var_handle, status); + TFE_TensorHandle* is_initialized[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(op, &is_initialized[0], &num_retvals, status); + CHECK_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(is_initialized[0], status); + bool initialized = false; + memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); + EXPECT_EQ(initialized, true); + TF_DeleteTensor(t); + TFE_DeleteTensorHandle(is_initialized[0]); + TFE_DeleteOp(op); + delete status; +} + +void TestFunctionWithPackedInput(const bool remote) { + tensorflow::ServerDef server_def = GetServerDef(3); + + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(/*enable=*/true)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + const char task0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + // Create one variable per task. + TFE_TensorHandle* h0 = TestVariable(ctx, 1.0, task0_name); + TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task1_name); + TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task2_name); + + // Add a sync point in order to make sure that variables have been initialized + // before the function execution starts. + // TODO(b/155789951): Remove once b/155789951 is fixed. + VarIsInitialized(ctx, h1); + VarIsInitialized(ctx, h2); + + // Pack 3 variable handles into one TFE_TensorHandle. + int num_replicas = 3; + std::vector handles = {h0, h1, h2}; + TFE_TensorHandle* packed_handle = + TFE_CreatePackedTensorHandle(ctx, handles.data(), &num_replicas, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + EXPECT_EQ(TFE_TensorHandleDataType(packed_handle), TF_RESOURCE); + EXPECT_EQ(TFE_TensorHandleNumDims(packed_handle, status), 0); + EXPECT_EQ(TFE_TensorHandleNumElements(packed_handle, status), 1); + + const string composite_device_name = + "/job:localhost/replica:0/task:0/device:COMPOSITE:0"; + EXPECT_EQ(TFE_TensorHandleDeviceName(packed_handle, status), + composite_device_name); + EXPECT_EQ(TFE_TensorHandleBackingDeviceName(packed_handle, status), + composite_device_name); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + // Register and run a function which returns the sum of 3 variables. 
+ const string function_def = AddVariablesFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "AddVariablesFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, packed_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + if (remote) { + TFE_OpSetDevice(func, task1_name, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } + + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(packed_handle); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + EXPECT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(sum, 6.0); + + TFE_DeleteTensorHandle(h0); + TFE_DeleteTensorHandle(h1); + TFE_DeleteTensorHandle(h2); + + TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); + TFE_ExecutorWaitForAllPendingNodes(executor, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_DeleteExecutor(executor); + TFE_ContextRemoveFunction(ctx, "AddVariablesFunction", status); + TFE_DeleteContext(ctx); + + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, TestLocalFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/false); +} + +TEST(CAPI, TestRemoteFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/true); +} + +string VariableAddFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'VariableAddFunction'" + " input_arg {" + " name: 'var0'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'var0_value'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read0:value:0'" + " device: '/job:localhost/task:1/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'identity'" + " op: 'Identity'" + " input: 'add:z:0'" + " device: '/job:localhost/task:0/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'var0_value'" + " value: 'identity:output:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +class FunctionErrorInjectionPass : public tensorflow::FunctionOptimizationPass { + public: + FunctionErrorInjectionPass(string error_node, string error_device) + : error_node_(error_node), error_device_(error_device) {} + tensorflow::Status Run(const tensorflow::DeviceSet& device_set, + const tensorflow::ConfigProto& config_proto, + std::unique_ptr* graph, + tensorflow::FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) override { + // Inject failure to function instantiation if finding a node that 
contains + // the given node name (error_node_) and requested device (error_device_). + for (const auto node : graph->get()->nodes()) { + if (node->name().find(error_node_) != string::npos && + node->requested_device() == error_device_) { + return tensorflow::errors::Internal("Injected graph pass error."); + } + } + return tensorflow::Status::OK(); + } + + private: + const string error_node_; + const string error_device_; +}; + +void TestDistributedFunctionCancellation(bool inject_error) { + tensorflow::ServerDef server_def = GetServerDef(3); + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + const char dev2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + if (inject_error) { + // Inject a function optimization pass failure when it sees the 'read0' op + // having a requested device `dev2_name`. During execution: + // * task:0 processes the main function `VariableAddFunction` and places + // the read0 op on task:2 + // * task:0 partitions the main function with a subgraph containing read0 + // sent to task:2 + // * task:2 graph pass reports an error when it sees read0 with dev2_name + tensorflow::function_optimization_registration:: + FunctionOptimizationPassRegistration register_test_pass( + std::make_unique("read0", dev2_name)); + } + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); + EXPECT_NE(var_handle, nullptr); + + const string function_def = VariableAddFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "VariableAddFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, var_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + + if (inject_error) { + ASSERT_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status); + } else { + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + ASSERT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + ASSERT_EQ(sum, 4.0); + } + + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(var_handle); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // 
TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, DistributedFunctionNoError) { + TestDistributedFunctionCancellation(false); +} + +TEST(CAPI, DistributedFunctionCancelledOnError) { + TestDistributedFunctionCancellation(true); +} + +void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, + TFE_DEVICE_PLACEMENT_EXPLICIT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // Use large matrices so that RPCs don't return before we get a chance + // to call TFE_DeleteContext. + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle100x100(ctx); + TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle100x100(ctx); + const char remote_device_name[] = + "/job:localhost/replica:0/task:1/device:CPU:0"; + auto* h0_task1 = + TFE_TensorHandleCopyToDevice(h0_task0, ctx, remote_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + auto* h1_task1 = + TFE_TensorHandleCopyToDevice(h1_task0, ctx, remote_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_Op* matmul = MatMulOp(ctx, h0_task1, h1_task1); + TFE_OpSetDevice(matmul, remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); + + TFE_DeleteTensorHandle(h0_task0); + TFE_DeleteTensorHandle(h1_task0); + TFE_DeleteTensorHandle(h0_task1); + TFE_DeleteTensorHandle(h1_task1); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(matmul); + + TFE_DeleteContext(ctx); + + // TODO(b/136478427): Figure out how to correctly shut the server down. 
+ worker_server.release(); +} + +TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPC) { + TestRemoteExecuteDeleteContextWithOutstandingRPC(false); +} + +TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPCAsync) { + TestRemoteExecuteDeleteContextWithOutstandingRPC(true); +} +} // namespace diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 93d830d2c90..94c32cf3f30 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -35,26 +35,6 @@ namespace { using ::tensorflow::string; -tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { - tensorflow::ServerDef server_def; - server_def.set_protocol("grpc"); - server_def.set_job_name(job_name); - server_def.set_task_index(0); - tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); - tensorflow::JobDef* job_def = cluster_def->add_job(); - job_def->set_name(job_name); - for (int i = 0; i < num_tasks; i++) { - int port = tensorflow::testing::PickUnusedPortOrDie(); - job_def->mutable_tasks()->insert( - {i, tensorflow::strings::StrCat("localhost:", port)}); - } - return server_def; -} - -tensorflow::ServerDef GetServerDef(int num_tasks) { - return GetServerDef("localhost", num_tasks); -} - void TestRemoteExecute(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); @@ -356,472 +336,4 @@ TEST(CAPI, RemoteExecuteSilentCopiesLocalAsyncFuncOrdering) { /*heavy_load_on_streaming_rpc=*/true); } -// Add the values of three variables on three different tasks. -string AddVariablesFunction() { - tensorflow::FunctionDef def; - CHECK(tensorflow::protobuf::TextFormat::ParseFromString( - " signature {" - " name: 'AddVariablesFunction'" - " input_arg {" - " name: 'var'" - " type: DT_RESOURCE" - " }" - " output_arg {" - " name: 'sum'" - " type: DT_FLOAT" - " }" - " }" - " node_def {" - " name: 'read0'" - " op: 'ReadVariableOp'" - " input: 'var'" - " device: '/job:localhost/replica:0/task:0/device:CPU:0'" - " attr {" - " key: 'dtype'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " node_def {" - " name: 'read1'" - " op: 'ReadVariableOp'" - " input: 'var'" - " device: '/job:localhost/replica:0/task:1/device:CPU:0'" - " attr {" - " key: 'dtype'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " node_def {" - " name: 'read2'" - " op: 'ReadVariableOp'" - " input: 'var'" - " device: '/job:localhost/replica:0/task:2/device:CPU:0'" - " attr {" - " key: 'dtype'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " node_def {" - " name: 'add1'" - " op: 'Add'" - " input: 'read0:value:0'" - " input: 'read1:value:0'" - " attr {" - " key: 'T'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " node_def {" - " name: 'add2'" - " op: 'Add'" - " input: 'add1:z:0'" - " input: 'read2:value:0'" - " attr {" - " key: 'T'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " ret {" - " key: 'sum'" - " value: 'add2:z:0'" - " }", - &def)); - return def.SerializeAsString(); -} - -void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { - TF_Status* status = TF_NewStatus(); - TFE_Op* op = TFE_NewOp(ctx, "VarIsInitializedOp", status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_OpAddInput(op, var_handle, status); - TFE_TensorHandle* is_initialized[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(op, &is_initialized[0], &num_retvals, status); - CHECK_EQ(1, num_retvals); - TF_Tensor* t = TFE_TensorHandleResolve(is_initialized[0], status); - bool initialized = false; - 
memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); - EXPECT_EQ(initialized, true); - TF_DeleteTensor(t); - TFE_DeleteTensorHandle(is_initialized[0]); - TFE_DeleteOp(op); - delete status; -} - -void TestFunctionWithPackedInput(const bool remote) { - tensorflow::ServerDef server_def = GetServerDef(3); - - // This server def has the task index set to 0. - string serialized = server_def.SerializeAsString(); - - server_def.set_task_index(1); - std::unique_ptr worker_server1; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server1) - .ok()); - ASSERT_TRUE(worker_server1->Start().ok()); - - server_def.set_task_index(2); - std::unique_ptr worker_server2; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server2) - .ok()); - ASSERT_TRUE(worker_server2->Start().ok()); - - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(/*enable=*/true)); - TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); - TFE_Context* ctx = TFE_NewContext(opts, status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - const char task0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; - const char task1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; - const char task2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; - - // Create one variable per task. - TFE_TensorHandle* h0 = TestVariable(ctx, 1.0, task0_name); - TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task1_name); - TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task2_name); - - // Add a sync point in order to make sure that variables have been initialized - // before the function execution starts. - // TODO(b/155789951): Remove once b/155789951 is fixed. - VarIsInitialized(ctx, h1); - VarIsInitialized(ctx, h2); - - // Pack 3 variable handles into one TFE_TensorHandle. - int num_replicas = 3; - std::vector handles = {h0, h1, h2}; - TFE_TensorHandle* packed_handle = - TFE_CreatePackedTensorHandle(ctx, handles.data(), &num_replicas, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - EXPECT_EQ(TFE_TensorHandleDataType(packed_handle), TF_RESOURCE); - EXPECT_EQ(TFE_TensorHandleNumDims(packed_handle, status), 0); - EXPECT_EQ(TFE_TensorHandleNumElements(packed_handle, status), 1); - - const string composite_device_name = - "/job:localhost/replica:0/task:0/device:COMPOSITE:0"; - EXPECT_EQ(TFE_TensorHandleDeviceName(packed_handle, status), - composite_device_name); - EXPECT_EQ(TFE_TensorHandleBackingDeviceName(packed_handle, status), - composite_device_name); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - // Register and run a function which returns the sum of 3 variables. 
- const string function_def = AddVariablesFunction(); - TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), - status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - TFE_Op* func = TFE_NewOp(ctx, "AddVariablesFunction", status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_OpAddInput(func, packed_handle, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - if (remote) { - TFE_OpSetDevice(func, task1_name, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - } - - TFE_TensorHandle* retvals[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(func, &retvals[0], &num_retvals, status); - EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - ASSERT_EQ(1, num_retvals); - TFE_DeleteOp(func); - TFE_DeleteTensorHandle(packed_handle); - TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_DeleteTensorHandle(retvals[0]); - float sum = 0; - EXPECT_EQ(sizeof(sum), TF_TensorByteSize(t)); - memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - EXPECT_EQ(sum, 6.0); - - TFE_DeleteTensorHandle(h0); - TFE_DeleteTensorHandle(h1); - TFE_DeleteTensorHandle(h2); - - TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx); - TFE_ExecutorWaitForAllPendingNodes(executor, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_DeleteExecutor(executor); - TFE_ContextRemoveFunction(ctx, "AddVariablesFunction", status); - TFE_DeleteContext(ctx); - - TF_DeleteStatus(status); - - // TODO(b/136478427): Figure out how to correctly shut the server down. - worker_server1.release(); - worker_server2.release(); -} - -TEST(CAPI, TestLocalFunctionWithPackedInput) { - TestFunctionWithPackedInput(/*remote=*/false); -} - -TEST(CAPI, TestRemoteFunctionWithPackedInput) { - TestFunctionWithPackedInput(/*remote=*/true); -} - -string VariableAddFunction() { - tensorflow::FunctionDef def; - CHECK(tensorflow::protobuf::TextFormat::ParseFromString( - " signature {" - " name: 'VariableAddFunction'" - " input_arg {" - " name: 'var0'" - " type: DT_RESOURCE" - " }" - " output_arg {" - " name: 'var0_value'" - " type: DT_FLOAT" - " }" - " }" - " node_def {" - " name: 'read0'" - " op: 'ReadVariableOp'" - " input: 'var0'" - " attr {" - " key: 'dtype'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " node_def {" - " name: 'add'" - " op: 'Add'" - " input: 'read0:value:0'" - " input: 'read0:value:0'" - " device: '/job:localhost/task:1/device:CPU:0'" - " attr {" - " key: 'T'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " node_def {" - " name: 'identity'" - " op: 'Identity'" - " input: 'add:z:0'" - " device: '/job:localhost/task:0/device:CPU:0'" - " attr {" - " key: 'T'" - " value {" - " type: DT_FLOAT" - " }" - " }" - " }" - " ret {" - " key: 'var0_value'" - " value: 'identity:output:0'" - " }", - &def)); - return def.SerializeAsString(); -} - -class FunctionErrorInjectionPass : public tensorflow::FunctionOptimizationPass { - public: - FunctionErrorInjectionPass(string error_node, string error_device) - : error_node_(error_node), error_device_(error_device) {} - tensorflow::Status Run(const tensorflow::DeviceSet& device_set, - const tensorflow::ConfigProto& config_proto, - std::unique_ptr* graph, - tensorflow::FunctionLibraryDefinition* flib_def, - std::vector* control_ret_node_names, - bool* control_rets_updated) override { - // Inject failure to function instantiation if finding a node that 
contains - // the given node name (error_node_) and requested device (error_device_). - for (const auto node : graph->get()->nodes()) { - if (node->name().find(error_node_) != string::npos && - node->requested_device() == error_device_) { - return tensorflow::errors::Internal("Injected graph pass error."); - } - } - return tensorflow::Status::OK(); - } - - private: - const string error_node_; - const string error_device_; -}; - -void TestDistributedFunctionCancellation(bool inject_error) { - tensorflow::ServerDef server_def = GetServerDef(3); - // This server def has the task index set to 0. - string serialized = server_def.SerializeAsString(); - - server_def.set_task_index(1); - std::unique_ptr worker_server1; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server1) - .ok()); - ASSERT_TRUE(worker_server1->Start().ok()); - server_def.set_task_index(2); - std::unique_ptr worker_server2; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server2) - .ok()); - ASSERT_TRUE(worker_server2->Start().ok()); - const char dev2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; - - if (inject_error) { - // Inject a function optimization pass failure when it sees the 'read0' op - // having a requested device `dev2_name`. During execution: - // * task:0 processes the main function `VariableAddFunction` and places - // the read0 op on task:2 - // * task:0 partitions the main function with a subgraph containing read0 - // sent to task:2 - // * task:2 graph pass reports an error when it sees read0 with dev2_name - tensorflow::function_optimization_registration:: - FunctionOptimizationPassRegistration register_test_pass( - std::make_unique("read0", dev2_name)); - } - - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); - TFE_Context* ctx = TFE_NewContext(opts, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); - EXPECT_NE(var_handle, nullptr); - - const string function_def = VariableAddFunction(); - TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), - status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - - TFE_Op* func = TFE_NewOp(ctx, "VariableAddFunction", status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_OpAddInput(func, var_handle, status); - ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); - TFE_TensorHandle* retvals[1] = {nullptr}; - int num_retvals = 1; - TFE_Execute(func, &retvals[0], &num_retvals, status); - - if (inject_error) { - ASSERT_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status); - } else { - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - ASSERT_EQ(1, num_retvals); - TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteTensorHandle(retvals[0]); - float sum = 0; - ASSERT_EQ(sizeof(sum), TF_TensorByteSize(t)); - memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); - TF_DeleteTensor(t); - ASSERT_EQ(sum, 4.0); - } - - TFE_DeleteOp(func); - TFE_DeleteTensorHandle(var_handle); - TFE_DeleteContext(ctx); - TF_DeleteStatus(status); - - // 
TODO(b/136478427): Figure out how to correctly shut the server down. - worker_server1.release(); - worker_server2.release(); -} - -TEST(CAPI, DistributedFunctionNoError) { - TestDistributedFunctionCancellation(false); -} - -TEST(CAPI, DistributedFunctionCancelledOnError) { - TestDistributedFunctionCancellation(true); -} - -void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { - tensorflow::ServerDef server_def = GetServerDef(2); - - // This server def has the task index set to 0. - string serialized = server_def.SerializeAsString(); - - server_def.set_task_index(1); - - std::unique_ptr worker_server; - ASSERT_TRUE(tensorflow::GrpcServer::Create( - server_def, tensorflow::Env::Default(), &worker_server) - .ok()); - ASSERT_TRUE(worker_server->Start().ok()); - - TF_Status* status = TF_NewStatus(); - TFE_ContextOptions* opts = TFE_NewContextOptions(); - TFE_ContextOptionsSetAsync(opts, static_cast(async)); - TFE_ContextOptionsSetDevicePlacementPolicy(opts, - TFE_DEVICE_PLACEMENT_EXPLICIT); - TFE_Context* ctx = TFE_NewContext(opts, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_DeleteContextOptions(opts); - - TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - // Use large matrices so that RPCs don't return before we get a chance - // to call TFE_DeleteContext. - TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle100x100(ctx); - TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle100x100(ctx); - const char remote_device_name[] = - "/job:localhost/replica:0/task:1/device:CPU:0"; - auto* h0_task1 = - TFE_TensorHandleCopyToDevice(h0_task0, ctx, remote_device_name, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - auto* h1_task1 = - TFE_TensorHandleCopyToDevice(h1_task0, ctx, remote_device_name, status); - ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_Op* matmul = MatMulOp(ctx, h0_task1, h1_task1); - TFE_OpSetDevice(matmul, remote_device_name, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - - TFE_TensorHandle* retvals[1]; - int num_retvals = 1; - TFE_Execute(matmul, &retvals[0], &num_retvals, status); - EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_DeleteStatus(status); - - TFE_DeleteTensorHandle(h0_task0); - TFE_DeleteTensorHandle(h1_task0); - TFE_DeleteTensorHandle(h0_task1); - TFE_DeleteTensorHandle(h1_task1); - TFE_DeleteTensorHandle(retvals[0]); - - TFE_DeleteOp(matmul); - - TFE_DeleteContext(ctx); - - // TODO(b/136478427): Figure out how to correctly shut the server down. - worker_server.release(); -} - -TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPC) { - TestRemoteExecuteDeleteContextWithOutstandingRPC(false); -} - -TEST(CAPI, RemoteExecuteDeleteContextWithOutstandingRPCAsync) { - TestRemoteExecuteDeleteContextWithOutstandingRPC(true); -} } // namespace diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 29b624b8537..4b5ad8f50f7 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -18,7 +18,9 @@ limitations under the License. 
#include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/eager/c_api_experimental.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/strcat.h" #include "tensorflow/core/platform/test.h" +#include "tensorflow/core/protobuf/cluster.pb.h" using tensorflow::string; @@ -296,3 +298,23 @@ bool GetDeviceName(TFE_Context* ctx, string* device_name, TF_DeleteDeviceList(devices); return false; } + +tensorflow::ServerDef GetServerDef(const string& job_name, int num_tasks) { + tensorflow::ServerDef server_def; + server_def.set_protocol("grpc"); + server_def.set_job_name(job_name); + server_def.set_task_index(0); + tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); + tensorflow::JobDef* job_def = cluster_def->add_job(); + job_def->set_name(job_name); + for (int i = 0; i < num_tasks; i++) { + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->insert( + {i, tensorflow::strings::StrCat("localhost:", port)}); + } + return server_def; +} + +tensorflow::ServerDef GetServerDef(int num_tasks) { + return GetServerDef("localhost", num_tasks); +} diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index 4c43f8d5833..fcf62223f14 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" // Return a tensor handle containing a float scalar TFE_TensorHandle* TestScalarTensorHandle(TFE_Context* ctx, float value); @@ -72,4 +73,11 @@ TFE_Op* MinOp(TFE_Context* ctx, TFE_TensorHandle* input, bool GetDeviceName(TFE_Context* ctx, tensorflow::string* device_name, const char* device_type); +// Create a ServerDef with the given `job_name` and add `num_tasks` tasks in it. +tensorflow::ServerDef GetServerDef(const tensorflow::string& job_name, + int num_tasks); + +// Create a ServerDef with job name "localhost" and add `num_tasks` tasks in it. +tensorflow::ServerDef GetServerDef(int num_tasks); + #endif // TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index 4e3da8b00e0..425b25e2386 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -181,7 +181,7 @@ class GrpcEagerClient : public EagerClient { StatusCallback done) override { StatusCallback done_wrapped = callback_wrapper(std::move(done)); if (EnableStreaming()) { - tf_shared_lock l(mu_); + mutex_lock l(mu_); auto it = enqueue_dispatchers_.find(request->context_id()); if (it == enqueue_dispatchers_.end()) { auto it_and_bool = enqueue_dispatchers_.emplace( From 356121e56376d4c872848a710fefa4dc3e1f3b6a Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 29 May 2020 11:09:12 -0700 Subject: [PATCH 1394/1533] Server-side cancellation support for distributed function execution. 1. Thread the RPC cancel signal through the eager service RunComponentFunction calls; 2. Always pass the cancellation manager to the underlying executor (instead of only passing when `is_eager` is true, i.e., pure eager ops). With this we do not need to cancel the rendezvous from the process FLR; instead the ExecutorState takes care of it when op fails. 3. 
Do not mark all statuses as derived when aborting rendezvous or triggering cancellation. This usually results in the original errors buried as one of the derived errors. PiperOrigin-RevId: 313814162 Change-Id: Ia866f5f522a0b1aa54e9dce7b9cc0bcf7682136a --- .../core/common_runtime/eager/execute.cc | 2 +- .../common_runtime/eager/kernel_and_device.cc | 1 - tensorflow/core/common_runtime/executor.cc | 12 +- .../process_function_library_runtime.cc | 23 +- .../base_rendezvous_mgr.cc | 14 +- .../eager/eager_service_impl.cc | 12 +- .../eager/eager_service_impl.h | 3 +- .../eager/eager_service_impl_test.cc | 204 +++++++++++++++++- .../rpc/eager/grpc_eager_service_impl.h | 12 +- tensorflow/core/framework/op_kernel.h | 5 - tensorflow/core/kernels/sendrecv_ops.cc | 7 +- 11 files changed, 232 insertions(+), 63 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index f23b0fa7877..e7d75517c15 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -1459,7 +1459,7 @@ void EagerLocalExecuteAsync(EagerOperation* op, TensorHandle** retvals, EagerKernelExecuteAsync( &ctx, op->Inputs(), op->remote_func_params(), std::move(kernel), graph_collector, op->GetCancellationManager(), retvals, num_outputs, - [op, num_outputs, &retvals, done = std::move(done)](const Status& s) { + [op, num_outputs, retvals, done = std::move(done)](const Status& s) { op->Clear(); // Since the operation failed, we need to Unref any outputs if they were // allocated. diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index bf7c083f24b..1e69f9d1767 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -236,7 +236,6 @@ Status KernelAndDeviceOp::Run( std::vector* outputs, CancellationManager* cancellation_manager, const absl::optional& remote_func_params) { OpKernelContext::Params params; - params.is_eager = true; params.device = device_; params.frame_iter = FrameAndIter(0, 0); params.inputs = inputs.GetTensorValues(); diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 447a9e0ae77..af011ac95d8 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -56,6 +56,7 @@ limitations under the License. #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/context.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" @@ -66,6 +67,7 @@ limitations under the License. #include "tensorflow/core/profiler/lib/annotated_traceme.h" #include "tensorflow/core/profiler/lib/scoped_annotation.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/util/tensor_slice_reader_cache.h" namespace tensorflow { @@ -1054,10 +1056,12 @@ bool ExecutorState::NodeDone( // aborting all other execution in the step. abort_run = true; - // If execution has been cancelled, mark any new errors as being - // derived. This ensures any errors triggered by cancellation are marked - // as derived. 
- if (cancellation_manager_ && cancellation_manager_->IsCancelled()) { + // If execution has been cancelled, mark cancelled or aborted errors as + // being derived. Note that the original node that fails might also + // trigger cancellation, and here we make sure the original error is + // exposed to users and not buried as a derived error. + if (cancellation_manager_ && cancellation_manager_->IsCancelled() && + (errors::IsCancelled(s) || errors::IsAborted(s))) { status_ = StatusGroup::MakeDerived(s); } else { status_ = s; diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index 364750b6679..c1a76e529d8 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -1055,29 +1055,8 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( local_cm = std::make_shared(); cm = local_cm.get(); } - auto token = cm->get_cancellation_token(); - const auto cancelled_error = errors::Cancelled( - "ProcessFunctionLibraryRuntime::RunMultiDevice was cancelled."); - const bool already_cancelled = !cm->RegisterCallback( - token, - [rendez = opts.rendezvous, n_func = data->glue_.size(), cancelled_error] { - // Abort rendezvous only if there are more than one component functions - // to avoid reporting cancellation error directly to PartitionedCallOps - // that launch a single component function. - if (rendez && n_func > 1) { - rendez->StartAbort(cancelled_error); - } - }); - if (already_cancelled) { - done(cancelled_error); - return; - } - auto* refcounted_done = new ReffedStatusCallback( - [cm, token, local_cm, done = std::move(done)](const Status& s) { - cm->TryDeregisterCallback(token); - done(s); - }); + auto* refcounted_done = new ReffedStatusCallback(std::move(done)); for (int i = 0; i < data->glue_.size(); ++i) { refcounted_done->Ref(); } diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc index 8b458021910..7849e094cb9 100644 --- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" @@ -373,11 +374,14 @@ void BaseRemoteRendezvous::RecvLocalAsyncInternal(const ParsedKey& parsed, void BaseRemoteRendezvous::StartAbort(const Status& s) { CHECK(!s.ok()); - // Use a "derived" status as the status for the rendezvous. Derived - // status messages are ignored when aggregating errors across devices: this - // allows us to prefer our original status message over any cancellation - // related errors. - Status derived_status = StatusGroup::MakeDerived(s); + // If the status passed in is a cancelled or aborted error, mark it as + // "derived" for the rendezvous. Derived status messages are ignored when + // aggregating errors across devices: this allows us to prefer our original + // status message over any cancellation related errors. 
+ Status derived_status = s; + if (errors::IsCancelled(s) || errors::IsAborted(s)) { + derived_status = StatusGroup::MakeDerived(s); + } local_->StartAbort(derived_status); { diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index 5327cbb6480..f1e70d53757 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -411,7 +411,7 @@ Status EagerServiceImpl::CreateMasterContext( } void EagerServiceImpl::RunComponentFunction( - const RunComponentFunctionRequest* request, + CallOptions* call_opts, const RunComponentFunctionRequest* request, RunComponentFunctionResponse* response, StatusCallback done) { ServerContext* context = nullptr; Status s = GetServerContext(request->context_id(), &context); @@ -451,11 +451,17 @@ void EagerServiceImpl::RunComponentFunction( VLOG(3) << "ServerContext: Calling EagerLocalExecuteAsync for op " << operation.id(); + auto cm = std::make_shared(); + op->SetCancellationManager(cm.get()); + call_opts->SetCancelCallback([cm] { cm->StartCancel(); }); + context->Ref(); EagerLocalExecuteAsync( op, retvals->data(), num_retvals, - [op, op_id = operation.id(), num_retvals, retvals, response, - eager_context, context, done = std::move(done)](const Status& status) { + [op, op_id = operation.id(), num_retvals, retvals, cm, call_opts, + response, eager_context, context, + done = std::move(done)](const Status& status) { + call_opts->ClearCancelCallback(); auto wrapped_done = [&](const Status& status) { context->Unref(); done(status); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h index 1e4d36ccf9f..09db3883a15 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h @@ -96,7 +96,8 @@ class EagerServiceImpl { Status WaitQueueDone(const WaitQueueDoneRequest* request, WaitQueueDoneResponse* response); - void RunComponentFunction(const RunComponentFunctionRequest* request, + void RunComponentFunction(CallOptions* call_opts, + const RunComponentFunctionRequest* request, RunComponentFunctionResponse* response, StatusCallback done); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 3c537d99a3a..cf430d78617 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -15,10 +15,6 @@ limitations under the License. #include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h" -#include - -#include - #include "absl/types/optional.h" #include "absl/types/variant.h" #include "tensorflow/c/c_api_internal.h" @@ -39,6 +35,7 @@ limitations under the License. 
#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/eager_service.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/protobuf/remote_tensor_handle.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" @@ -94,7 +91,7 @@ class FakeEagerClient : public EagerClient { const RunComponentFunctionRequest* request, RunComponentFunctionResponse* response, StatusCallback done) override { - impl_->RunComponentFunction(request, response, std::move(done)); + impl_->RunComponentFunction(call_opts, request, response, std::move(done)); } void StreamingEnqueueAsync(const EnqueueRequest* request, @@ -177,14 +174,11 @@ void SetTensorProto(TensorProto* tensor_proto) { TF_DeleteTensor(t); } -void AddOperationToEnqueueRequest( - int64 id, const string& name, +void BuildOperation( + Operation* operation, int64 id, const string& name, const std::vector>>& inputs, - const std::unordered_map& attrs, const string& device, - EnqueueRequest* request) { - auto* operation = request->add_queue()->mutable_operation(); - + const std::unordered_map& attrs, const string& device) { operation->set_id(id); operation->set_name(name); operation->set_device(device); @@ -209,6 +203,28 @@ void AddOperationToEnqueueRequest( } } +void AddOperationToEnqueueRequest( + int64 id, const string& name, + const std::vector>>& + inputs, + const std::unordered_map& attrs, const string& device, + EnqueueRequest* request) { + auto* operation = request->add_queue()->mutable_operation(); + BuildOperation(operation, id, name, inputs, attrs, device); +} + +void AddOperationToRunComponentFunctionRequest( + int64 id, const string& name, + const std::vector>>& + inputs, + const std::unordered_map& attrs, const string& device, + RunComponentFunctionRequest* request) { + auto* operation = request->mutable_operation(); + operation->set_is_function(true); + operation->set_is_component_function(true); + BuildOperation(operation, id, name, inputs, attrs, device); +} + tensorflow::NodeDef MatMulFunctionNodeDef() { tensorflow::NodeDef def; CHECK(tensorflow::protobuf::TextFormat::ParseFromString( @@ -299,6 +315,69 @@ tensorflow::FunctionDef MatMulNestedFunction() { return def; } +tensorflow::FunctionDef SingleRecvNodeFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'SingleRecvNodeFunction'" + " input_arg {" + " name: 'a'" + " type: DT_FLOAT" + " }" + " output_arg {" + " name: 'recv_tensor'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'recv_node'" + " op: '_Recv'" + " device: '/job:localhost/replica:0/task:0/device:CPU:0'" + " attr {" + " key: 'client_terminated'" + " value {" + " b: true" + " }" + " }" + " attr {" + " key: 'recv_device'" + " value {" + " s: '/job:localhost/replica:0/task:0/device:CPU:0'" + " }" + " }" + " attr {" + " key: 'send_device'" + " value {" + " s: '/job:localhost/replica:0/task:0/device:CPU:0'" + " }" + " }" + " attr {" + " key: 'send_device_incarnation'" + " value {" + " i: 1" + " }" + " }" + " attr {" + " key: 'tensor_name'" + " value {" + " s: 't0'" + " }" + " }" + " attr {" + " key: 'tensor_type'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'recv_tensor'" + " value: 'recv_node:tensor:0'" + " }", + &def)); + return def; +} + // Test creates a context and attempts to execute some ops. 
TEST_F(EagerServiceImplTest, BasicTest) { TestEagerServiceImpl eager_service_impl(&worker_env_); @@ -462,6 +541,97 @@ class EagerServiceImplFunctionTest : public EagerServiceImplTest { TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request, &close_context_response)); } + + // Creates a context and attempts to execute a component function. + void TestComponentFunction(const RegisterFunctionOp& register_op, + const string& function_name, + const bool test_cancel) { + TestEagerServiceImpl eager_service_impl(&worker_env_); + uint64 context_id = random::New64(); + + // Create context. + CreateContextRequest request; + request.mutable_server_def()->set_job_name("localhost"); + request.mutable_server_def()->set_task_index(0); + request.set_context_id(context_id); + CreateContextResponse response; + TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response)); + + // Register function. + EnqueueRequest enqueue_request; + enqueue_request.set_context_id(context_id); + *enqueue_request.add_queue()->mutable_register_function() = register_op; + EnqueueResponse enqueue_response; + TF_ASSERT_OK( + eager_service_impl.Enqueue(&enqueue_request, &enqueue_response)); + + // First run an op to generate input for function. + EnqueueRequest remote_enqueue_request; + remote_enqueue_request.set_context_id(context_id); + EnqueueResponse remote_enqueue_response; + + std::unordered_map const_attrs; + AttrValue val; + val.set_type(tensorflow::DataType::DT_FLOAT); + const_attrs.insert({"dtype", val}); + val.Clear(); + SetTensorProto(val.mutable_tensor()); + const_attrs.insert({"value", val}); + AddOperationToEnqueueRequest(1, "Const", {}, const_attrs, + "/job:localhost/replica:0/task:0/device:CPU:0", + &remote_enqueue_request); + TF_ASSERT_OK(eager_service_impl.Enqueue(&remote_enqueue_request, + &remote_enqueue_response)); + + // Run function with input from the previous op. + RunComponentFunctionRequest run_comp_func_request; + run_comp_func_request.set_context_id(context_id); + RunComponentFunctionResponse run_comp_func_response; + AddOperationToRunComponentFunctionRequest( + 2, function_name, {std::make_pair(1, 0)}, + std::unordered_map(), + "/job:localhost/replica:0/task:0/device:CPU:0", &run_comp_func_request); + + CallOptions call_opts; + Notification n; + Status status; + eager_service_impl.RunComponentFunction(&call_opts, &run_comp_func_request, + &run_comp_func_response, + [&status, &n](const Status& s) { + status.Update(s); + n.Notify(); + }); + if (test_cancel) { + call_opts.StartCancel(); + } + n.WaitForNotification(); + if (test_cancel) { + EXPECT_TRUE(errors::IsCancelled(status)) << status.error_message(); + } else { + TF_ASSERT_OK(status); + // Retrieve the output. 
+ const tensorflow::Tensor* t = nullptr; + tensorflow::TensorHandle* tensor_handle; + TF_ASSERT_OK(eager_service_impl.GetTensorHandle( + context_id, RemoteTensorHandleInternal(2, 0), &tensor_handle)); + TF_ASSERT_OK(tensor_handle->Tensor(&t)); + + auto actual = t->flat(); + EXPECT_EQ(4, actual.size()); + + EXPECT_EQ(7, actual(0)); + EXPECT_EQ(10, actual(1)); + EXPECT_EQ(15, actual(2)); + EXPECT_EQ(22, actual(3)); + } + + CloseContextRequest close_context_request; + close_context_request.set_context_id(context_id); + close_context_request.set_context_view_id(0); + CloseContextResponse close_context_response; + TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request, + &close_context_response)); + } }; TEST_F(EagerServiceImplFunctionTest, BasicFunctionTest) { @@ -483,6 +653,18 @@ TEST_F(EagerServiceImplFunctionTest, NestedFunctionTest) { TestFunction(register_op, "MatMulNestedFunction"); } +TEST_F(EagerServiceImplFunctionTest, ComponentFunctionTest) { + RegisterFunctionOp register_op; + *register_op.mutable_function_def() = MatMulFunction(); + TestComponentFunction(register_op, "MatMulFunction", false); +} + +TEST_F(EagerServiceImplFunctionTest, ComponentFunctionCancellationTest) { + RegisterFunctionOp register_op; + *register_op.mutable_function_def() = SingleRecvNodeFunction(); + TestComponentFunction(register_op, "SingleRecvNodeFunction", true); +} + class FunctionWithRemoteInputsTest : public EagerServiceImplTest { public: FunctionWithRemoteInputsTest() diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h index d95589704b1..924112e0d96 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h @@ -76,9 +76,13 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface { EagerCall* call) { env_->compute_pool->Schedule([this, call]() { - local_impl_.RunComponentFunction( - &call->request, &call->response, - [call](const Status& s) { call->SendResponse(ToGrpcStatus(s)); }); + auto call_opts = std::make_shared(); + call->SetCancelCallback([call_opts]() { call_opts->StartCancel(); }); + local_impl_.RunComponentFunction(call_opts.get(), &call->request, + &call->response, + [call, call_opts](const Status& s) { + call->SendResponse(ToGrpcStatus(s)); + }); }); Call:: @@ -86,7 +90,7 @@ class GrpcEagerServiceImpl : public AsyncServiceInterface { &service_, cq_.get(), &grpc::EagerService::AsyncService::RequestRunComponentFunction, &GrpcEagerServiceImpl::RunComponentFunctionHandler, - /*supports_cancel=*/false); + /*supports_cancel=*/true); } // Called when a new request has been received as part of a StreamingEnqueue diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h index 55de3d9fa03..a4ada3303d3 100644 --- a/tensorflow/core/framework/op_kernel.h +++ b/tensorflow/core/framework/op_kernel.h @@ -597,9 +597,6 @@ class OpKernelContext { // The step being executed. int64 step_id = 0; - // True if the op is created by eager runtime. - bool is_eager = false; - // The op kernel being computed. OpKernel* op_kernel = nullptr; @@ -718,8 +715,6 @@ class OpKernelContext { int64 step_id() const { return params_->step_id; } - bool is_eager() const { return params_->is_eager; } - const OpKernel& op_kernel() const { return *params_->op_kernel; } // Input/output signature. 
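For reference, a minimal sketch (not part of the patch above) of the cancellation pattern this change threads through RunComponentFunction: the RPC layer's CallOptions cancel callback drives a per-call CancellationManager, and long-running work registers callbacks with that manager so it can be torn down when the client cancels. Only APIs that appear in the change are used here; the function name and the registered callback body are hypothetical placeholders.

#include <memory>

#include "tensorflow/core/distributed_runtime/call_options.h"
#include "tensorflow/core/framework/cancellation.h"

void RunWithServerSideCancellation(tensorflow::CallOptions* call_opts) {
  auto cm = std::make_shared<tensorflow::CancellationManager>();

  // A client-side cancel on the RPC propagates to the per-call manager.
  call_opts->SetCancelCallback([cm] { cm->StartCancel(); });

  // Work that should be interruptible registers a callback with the manager;
  // RegisterCallback returns false if cancellation has already started.
  auto token = cm->get_cancellation_token();
  const bool already_cancelled = !cm->RegisterCallback(
      token, [] { /* hypothetical: abort pending executor/rendezvous work */ });
  if (already_cancelled) {
    call_opts->ClearCancelCallback();
    return;
  }

  // ... execute the component function with `cm` attached to the op ...

  cm->TryDeregisterCallback(token);
  call_opts->ClearCancelCallback();
}
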
diff --git a/tensorflow/core/kernels/sendrecv_ops.cc b/tensorflow/core/kernels/sendrecv_ops.cc index 3d94fe1b6a5..f82d4645f13 100644 --- a/tensorflow/core/kernels/sendrecv_ops.cc +++ b/tensorflow/core/kernels/sendrecv_ops.cc @@ -193,12 +193,7 @@ void RecvOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { Rendezvous::Args args; args.device_context = ctx->op_device_context(); args.alloc_attrs = ctx->output_alloc_attr(0); - if (ctx->is_eager()) { - // NOTE(fishx): Only set cancellation_manager in eager mode. Because in - // Tensorflow 1.x, session (or graph_mgr) will abort the underlying - // rendezvous if it encounters any error. - args.cancellation_manager = ctx->cancellation_manager(); - } + args.cancellation_manager = ctx->cancellation_manager(); FrameAndIter frame_iter = GetFrameAndIter(ctx, hostmem_sendrecv_); if (frame_iter == FrameAndIter(0, 0)) { From 27ac0a900e3adc2e72d2f0e8bffaf1c362c2012c Mon Sep 17 00:00:00 2001 From: Sreeni Kesavarapu Date: Fri, 29 May 2020 11:11:23 -0700 Subject: [PATCH 1395/1533] Implement GPU device priority for virtual devices. - This adds a field in virtual device options to specify priority for virtual devices. - When the priority is specified, it will be used to create the cuda streams for the virtual device with the given priority. - This is backwards compatible with no priorities specified. When no priorities specified, the current implementation of creating a stream without any priority will continue while any non-zero priorities specified will be used to create streams with that priority. PiperOrigin-RevId: 313814564 Change-Id: Ie580ea3f8c7fc7248e40e1ae4c898f0a029fba35 --- tensorflow/core/common_runtime/gpu/BUILD | 10 +- .../core/common_runtime/gpu/gpu_device.cc | 161 ++++++++++++++++-- .../core/common_runtime/gpu/gpu_device.h | 6 + .../common_runtime/gpu/gpu_device_test.cc | 93 +++++++++- tensorflow/core/protobuf/config.proto | 12 ++ tensorflow/python/eager/context.py | 24 ++- .../stream_executor/cuda/cuda_driver.cc | 14 +- tensorflow/stream_executor/gpu/gpu_driver.h | 3 +- tensorflow/stream_executor/gpu/gpu_stream.cc | 3 +- tensorflow/stream_executor/gpu/gpu_stream.h | 3 + .../stream_executor/rocm/rocm_driver.cc | 8 +- .../golden/v1/tensorflow.-g-p-u-options.pbtxt | 6 + ...config.-logical-device-configuration.pbtxt | 4 + ...mental.-virtual-device-configuration.pbtxt | 4 + ...config.-logical-device-configuration.pbtxt | 4 + ...mental.-virtual-device-configuration.pbtxt | 4 + 16 files changed, 331 insertions(+), 28 deletions(-) diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD index 07919117051..8665f76f5e6 100644 --- a/tensorflow/core/common_runtime/gpu/BUILD +++ b/tensorflow/core/common_runtime/gpu/BUILD @@ -28,6 +28,10 @@ load( "if_static", "tf_cuda_tests_tags", ) +load( + "//tensorflow/stream_executor:build_defs.bzl", + "if_gpu_is_configured", +) package( default_visibility = [ @@ -151,6 +155,7 @@ tf_cuda_library( ":gpu_id_impl", ":gpu_init_impl", ":gpu_lib", + "//third_party/eigen3", "//tensorflow/core:core_cpu_impl", "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", @@ -162,8 +167,9 @@ tf_cuda_library( "//tensorflow/core:stream_executor", "//tensorflow/core/profiler/lib:annotated_traceme", "//tensorflow/core/profiler/lib:scoped_annotation", - "//third_party/eigen3", - ], + ] + if_gpu_is_configured([ + "//tensorflow/stream_executor/cuda:cuda_platform", + ]), alwayslink = 1, ) diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc 
b/tensorflow/core/common_runtime/gpu/gpu_device.cc index cf2e7043cae..416ce29d9a7 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -74,6 +74,7 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/stream_executor_util.h" +#include "tensorflow/stream_executor/gpu/gpu_stream.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #if !defined(PLATFORM_GOOGLE) @@ -244,16 +245,18 @@ class BaseGPUDevice::StreamGroupFactory { StreamGroup* group = &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)]; if (!group->compute) { - group->compute = new se::Stream(executor); + int priority = GetPriority(tf_gpu_id.value(), options); + group->priority = priority; + group->compute = GetStream(executor, priority); group->compute->Init(); VLOG(2) << "Created stream[" << stream_group_within_gpu - << "] = " << group->compute; + << "] = " << group->compute << " with priority: " << priority; #if TENSORFLOW_USE_ROCM // ROCm streams are lightweight and will not necessarily trigger device // queue init until they are first used. For optimal performance, // compute and nccl streams must be immediate siblings. - group->nccl = new se::Stream(executor); + group->nccl = GetStream(executor, priority); group->nccl->Init(); VLOG(2) << "Created nccl_stream[" << stream_group_within_gpu << "] = " << group->nccl; @@ -263,12 +266,12 @@ class BaseGPUDevice::StreamGroupFactory { group->nccl->ThenWaitFor(group->compute); #endif - group->host_to_device = new se::Stream(executor); + group->host_to_device = GetStream(executor, priority); group->host_to_device->Init(); VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu << "] = " << group->host_to_device; - group->device_to_host = new se::Stream(executor); + group->device_to_host = GetStream(executor, priority); group->device_to_host->Init(); VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu << "] = " << group->device_to_host; @@ -283,7 +286,7 @@ class BaseGPUDevice::StreamGroupFactory { num_d2d_streams = 1; } for (int i = 0; i < num_d2d_streams; ++i) { - se::Stream* stream = new se::Stream(executor); + se::Stream* stream = GetStream(executor, priority); stream->Init(); group->device_to_device.push_back(stream); VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu @@ -300,7 +303,70 @@ class BaseGPUDevice::StreamGroupFactory { return *instance; } + // Helper method for unit tests to reset the streams. Never use in production. + void TestOnlyReset() { + mutex_lock guard(lock_); + for (auto& item : streams_) { + auto& stream = item.second; + if (stream.compute) { + delete stream.compute; + stream.compute = nullptr; + } +#if TENSORFLOW_USE_ROCM + if (stream.nccl) { + delete stream.nccl; + stream.nccl = nullptr; + } +#endif + if (stream.host_to_device) { + delete stream.host_to_device; + stream.host_to_device = nullptr; + } + if (stream.device_to_host) { + delete stream.device_to_host; + stream.device_to_host = nullptr; + } + while (!stream.device_to_device.empty()) { + auto back = stream.device_to_device.back(); + if (back) { + delete back; + } + stream.device_to_device.pop_back(); + } + } + streams_.clear(); + } + private: + // Returns priority for the given virtual GPU id from the session options. + // Returns 0 if no virtual devices are specified. 
+ int GetPriority(int tf_gpu_id, const GPUOptions& options) { + int id = tf_gpu_id; + int i = 0; + int priority = 0; + while (i < options.experimental().virtual_devices_size()) { + const int size = + options.experimental().virtual_devices().Get(i).priority_size(); + if (id >= size) { + id -= size; + } else { + priority = + options.experimental().virtual_devices().Get(i).priority().Get(id); + break; + } + i++; + } + return priority; + } + + // Returns a Stream with the underlying GPUStream with the given priority. + se::Stream* GetStream(se::StreamExecutor* executor, int priority) { + auto stream = new se::Stream(executor); + static_cast(stream->implementation()) + ->SetPriority(priority); + return stream; + } + mutex lock_; using key_type = std::tuple; std::map streams_; @@ -752,7 +818,8 @@ Status ParseVisibleDeviceList(const string& visible_device_list, Status VerifyVirtualDeviceSettings( const size_t num_gpus_to_use, const GPUOptions& gpu_options, const std::vector& visible_gpu_order, - const std::vector& valid_platform_gpu_ids) { + const std::vector& valid_platform_gpu_ids, + const std::map>& supported_priority_ranges) { const auto& virtual_devices = gpu_options.experimental().virtual_devices(); CHECK(!virtual_devices.empty()); if (gpu_options.per_process_gpu_memory_fraction() > 0) { @@ -781,6 +848,63 @@ Status VerifyVirtualDeviceSettings( " #valid GPUs: ", valid_platform_gpu_ids.size(), " virtual_devices.size(): ", virtual_devices.size()); } +#if GOOGLE_CUDA + // Check memory_limt_mb and priority sizes match if priority is non-empty. + bool priority_exists = !virtual_devices.Get(0).priority().empty(); + for (int i = 0; i < virtual_devices.size(); ++i) { + const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb(); + const auto& priority = virtual_devices.Get(i).priority(); + // If the priority is empty in the first one then treat this as having no + // priority set in any of the virtual devices for backward compatibility. + // Either it's set for all or none. + if (!priority_exists) { + if (!priority.empty()) { + return errors::InvalidArgument( + "Priority must be set for all virtual devices or none. But the " + "priority is specified for ", + i, + " while previous devices didn't " + "have any set."); + } + } + if (priority_exists && memory_limit_mb.size() != priority.size()) { + return errors::InvalidArgument( + "Number of virtual device priorities specified doesn't " + "match with number of memory_limit_mb specified for GPU# ", + i, " memory_limit_mb size: ", memory_limit_mb.size(), + " and priority size: ", priority.size()); + } + const int gpu_id = valid_platform_gpu_ids[i].value(); + auto it = supported_priority_ranges.find(gpu_id); + if (it == supported_priority_ranges.end()) { + return errors::Internal( + "Failed to find supported priority range for GPU" + " device ", + gpu_id); + } + const std::pair& priority_range = it->second; + for (int p : priority) { + if (p > priority_range.first || p < priority_range.second) { + return errors::InvalidArgument( + "Priority ", p, + " is outside the range of supported priorities " + "[", + priority_range.second, ",", priority_range.first, + "] for virtual device ", i, " on GPU# ", gpu_id); + } + } + } +#elif TENSORFLOW_USE_ROCM + for (int i = 0; i < virtual_devices.size(); ++i) { + if (!virtual_devices.Get(i).priority().empty()) { + return errors::InvalidArgument( + "Priority is supported only on Nvidia GPUs." 
+ " However, priority is set for virtual device ", + i, ", which corresponds to a non Nvidia GPU"); + } + } +#endif + return Status::OK(); } @@ -1003,6 +1127,7 @@ Status BaseGPUDeviceFactory::CreateDevices( if (num_gpus_to_use > valid_platform_gpu_ids.size()) { num_gpus_to_use = valid_platform_gpu_ids.size(); } + std::map> supported_priority_ranges; if (!valid_platform_gpu_ids.empty()) { // Save the original device. int original_device = 0; @@ -1036,6 +1161,18 @@ Status BaseGPUDeviceFactory::CreateDevices( platform_gpu_id.value(), " failed. Status: ", cudaGetErrorString(err)); } + int priority_low, priority_high; + cudaDeviceGetStreamPriorityRange(&priority_low, &priority_high); + if (err != cudaSuccess) { + return errors::Internal( + "cudaDeviceGetStreamPriorityRange() on GPU:", original_device, + " failed. Status: ", cudaGetErrorString(err)); + } + VLOG(1) << "Cuda stream priority range on GPU(" << original_device + << "): " << priority_high << "," << priority_low; + supported_priority_ranges.insert( + std::make_pair(platform_gpu_id.value(), + std::make_pair(priority_low, priority_high))); #elif TENSORFLOW_USE_ROCM err = hipSetDevice(platform_gpu_id.value()); if (err != hipSuccess) { @@ -1107,9 +1244,9 @@ Status BaseGPUDeviceFactory::CreateDevices( const auto& virtual_devices = gpu_options.experimental().virtual_devices(); if (!virtual_devices.empty()) { - TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(num_gpus_to_use, gpu_options, - visible_gpu_order, - valid_platform_gpu_ids)); + TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( + num_gpus_to_use, gpu_options, visible_gpu_order, valid_platform_gpu_ids, + supported_priority_ranges)); // We've verified that num_gpus_to_use >= virtual_devices.size(). num_gpus_to_use = virtual_devices.size(); CHECK(gpu_options.visible_device_list().empty() || @@ -1704,6 +1841,10 @@ int BaseGPUDevice::PendingKernels() { return 0; } +void BaseGPUDevice::TestOnlyReset() { + StreamGroupFactory::Global().TestOnlyReset(); +} + uint64 GPUKernelTracker::MaybeQueue(OpKernelContext* ctx) { mutex_lock l(mu_); ++ops_since_last_; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 3646c59cec1..32c7738d916 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -114,6 +114,11 @@ class BaseGPUDevice : public LocalDevice { // the compute stream and are not yet known to have completed. int PendingKernels(); + int priority() const { return stream_->priority; } + + // Helper method for unit tests to reset the streams. Never use in production. 
+ static void TestOnlyReset(); + protected: Allocator* gpu_allocator_; // not owned Allocator* cpu_allocator_; // not owned @@ -131,6 +136,7 @@ class BaseGPUDevice : public LocalDevice { se::Stream* host_to_device = nullptr; se::Stream* device_to_host = nullptr; gtl::InlinedVector device_to_device; + int priority = 0; }; class StreamGroupFactory; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc index 0d66324a8e5..26312d35af6 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc @@ -61,13 +61,17 @@ void ExpectErrorMessageSubstr(const Status& s, StringPiece substr) { class GPUDeviceTest : public ::testing::Test { public: - void TearDown() override { GPUProcessState::singleton()->TestOnlyReset(); } + void TearDown() override { + BaseGPUDevice::TestOnlyReset(); + GPUProcessState::singleton()->TestOnlyReset(); + } protected: static SessionOptions MakeSessionOptions( const string& visible_device_list = "", double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1, - const std::vector>& memory_limit_mb = {}) { + const std::vector>& memory_limit_mb = {}, + const std::vector>& priority = {}) { SessionOptions options; ConfigProto* config = &options.config; (*config->mutable_device_count())["GPU"] = gpu_device_count; @@ -75,12 +79,17 @@ class GPUDeviceTest : public ::testing::Test { gpu_options->set_visible_device_list(visible_device_list); gpu_options->set_per_process_gpu_memory_fraction( per_process_gpu_memory_fraction); - for (const auto& v : memory_limit_mb) { + for (int i = 0; i < memory_limit_mb.size(); ++i) { auto virtual_devices = gpu_options->mutable_experimental()->add_virtual_devices(); - for (float mb : v) { + for (float mb : memory_limit_mb[i]) { virtual_devices->add_memory_limit_mb(mb); } + if (i < priority.size()) { + for (int p : priority[i]) { + virtual_devices->add_priority(p); + } + } } return options; } @@ -193,6 +202,7 @@ TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) { opts, kDeviceNamePrefix, &devices)); EXPECT_EQ(1, devices.size()); EXPECT_GE(devices[0]->attributes().memory_limit(), 0); + EXPECT_EQ(0, static_cast(devices[0].get())->priority()); } TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) { @@ -204,25 +214,67 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) { opts, kDeviceNamePrefix, &devices)); EXPECT_EQ(1, devices.size()); EXPECT_GE(devices[0]->attributes().memory_limit(), 0); + EXPECT_EQ(0, static_cast(devices[0].get())->priority()); } -TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) { +TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndNoPriority) { SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}}); std::vector> devices; TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices)); EXPECT_EQ(1, devices.size()); EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); + EXPECT_EQ(0, static_cast(devices[0].get())->priority()); +} + +TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) { + { + // Priority outside the range (-1, 0). 
+ SessionOptions opts = + MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-2, 0}}); + std::vector> devices; + Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + opts, kDeviceNamePrefix, &devices); + EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); + ExpectErrorMessageSubstr( + status, + "Priority -2 is outside the range of supported priorities [-1,0] for" + " virtual device 0 on GPU# 0"); + } + { + // Priority outside the range (-1, 0). + SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}}); + std::vector> devices; + Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + opts, kDeviceNamePrefix, &devices); + EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); + ExpectErrorMessageSubstr( + status, + "Priority 1 is outside the range of supported priorities [-1,0] for" + " virtual device 0 on GPU# 0"); + } +} + +TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndPriority) { + SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}}, {{-1}}); + std::vector> devices; + TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( + opts, kDeviceNamePrefix, &devices)); + EXPECT_EQ(1, devices.size()); + EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); + EXPECT_EQ(-1, static_cast(devices[0].get())->priority()); } TEST_F(GPUDeviceTest, MultipleVirtualDevices) { - SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}); + SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, -1}}); std::vector> devices; TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices)); EXPECT_EQ(2, devices.size()); EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit()); + EXPECT_EQ(0, static_cast(devices[0].get())->priority()); + EXPECT_EQ(-1, static_cast(devices[1].get())->priority()); ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size()); ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size()); EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id()); @@ -237,6 +289,35 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) { devices[1]->attributes().locality().links().link(0).strength()); } +TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) { + { + // Multile virtual devices with fewer priorities. + SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1}}); + std::vector> devices; + Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( + opts, kDeviceNamePrefix, &devices); + EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); + ExpectErrorMessageSubstr( + status, + "Number of virtual device priorities specified doesn't " + "match with number of memory_limit_mb specified for GPU# 0" + " memory_limit_mb size: 2 and priority size: 1"); + } + { + // Multile virtual devices with matching priority. + SessionOptions opts = + MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 0}}); + std::vector> devices; + TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( + opts, kDeviceNamePrefix, &devices)); + EXPECT_EQ(2, devices.size()); + EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); + EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit()); + EXPECT_EQ(-1, static_cast(devices[0].get())->priority()); + EXPECT_EQ(0, static_cast(devices[1].get())->priority()); + } +} + // Enabling unified memory on pre-Pascal GPUs results in an initialization // error. 
TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) { diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto index 7973e002762..3d20d004d46 100644 --- a/tensorflow/core/protobuf/config.proto +++ b/tensorflow/core/protobuf/config.proto @@ -110,6 +110,18 @@ message GPUOptions { // For the concept of "visible" and "virtual" GPU, see the comments for // "visible_device_list" above for more information. repeated float memory_limit_mb = 1; + + // Priority values to use with the virtual devices. Use the cuda function + // cudaDeviceGetStreamPriorityRange to query for valid range of values for + // priority. + // + // On a P4000 GPU with cuda 10.1, the priority range reported was 0 for + // least priority and -1 for greatest priority. + // + // If this field is not specified, then the virtual devices will be + // created with the default. If this field has values set, then the size + // of this must match with the above memory_limit_mb. + repeated int32 priority = 2; } // The multi virtual device settings. If empty (not set), it will create diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 604a960afd5..aa760583800 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -263,7 +263,8 @@ class LogicalDevice( @tf_export("config.LogicalDeviceConfiguration", "config.experimental.VirtualDeviceConfiguration") class LogicalDeviceConfiguration( - collections.namedtuple("LogicalDeviceConfiguration", ["memory_limit"])): + collections.namedtuple("LogicalDeviceConfiguration", + ["memory_limit", "experimental_priority"])): """Configuration class for a logical devices. The class specifies the parameters to configure a `tf.config.PhysicalDevice` @@ -276,10 +277,15 @@ class LogicalDeviceConfiguration( Fields: memory_limit: (optional) Maximum memory (in MB) to allocate on the virtual device. Currently only supported for GPUs. + experimental_priority: (optional) Priority to assign to a virtual device. + Lower values have higher priorities and 0 is the default. + Within a physical GPU, the GPU scheduler will prioritize ops on virtual + devices with higher priority. Currently only supported for Nvidia GPUs. """ - def __new__(cls, memory_limit=None): - return super(LogicalDeviceConfiguration, cls).__new__(cls, memory_limit) + def __new__(cls, memory_limit=None, experimental_priority=None): + return super(LogicalDeviceConfiguration, + cls).__new__(cls, memory_limit, experimental_priority) @tf_export("config.PhysicalDevice") @@ -1019,12 +1025,19 @@ class Context(object): if self._virtual_device_map: vdevs = self._virtual_device_map.get(dev, []) device_limits = [] + priority = [] for virt_dev in vdevs: device_limits.append(virt_dev.memory_limit) + if virt_dev.experimental_priority is not None: + priority.append(virt_dev.experimental_priority) + # If priority is specified, it must be specified for all virtual + # devices. 
+ if priority and len(device_limits) != len(priority): + raise ValueError("priority must be specified for all virtual devices") virtual_devices.append( config_pb2.GPUOptions.Experimental.VirtualDevices( - memory_limit_mb=device_limits)) + memory_limit_mb=device_limits, priority=priority)) # Only compute growth if virtual devices have not been configured and we # have GPUs @@ -1394,6 +1407,9 @@ class Context(object): if vdev.memory_limit is not None: raise ValueError("Setting memory limit on CPU virtual devices is " "currently not supported") + if vdev.experimental_priority is not None: + raise ValueError("Setting experimental_priority on CPU virtual " + " devices is currently not supported") elif dev.device_type == "GPU": for vdev in virtual_devices: if vdev.memory_limit is None: diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index 210c5436fad..3d011123d87 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -713,13 +713,21 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { absl::StrCat("failed to get device for context: ", ToString(result))); } -/* static */ bool GpuDriver::CreateStream(GpuContext* context, - CUstream* stream) { +/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream, + int priority) { // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess // up synchronization with respect to memsets and any other things that have // to occur on the default stream? ScopedActivateContext activated{context}; - CUresult res = cuStreamCreate(stream, 0); + CUresult res; + // If the priority is 0, then use the previous api to create the stream with + // the default priority for backward compatibility. Probably there is no + // difference in using the new api call but leaving it as is for now. + if (priority == 0) { + res = cuStreamCreate(stream, 0); + } else { + res = cuStreamCreateWithPriority(stream, 0, priority); + } if (res != CUDA_SUCCESS) { LOG(ERROR) << "could not allocate CUDA stream for context " << context->context() << ": " << ToString(res); diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h index f72c9a129cf..25b90be1bd2 100644 --- a/tensorflow/stream_executor/gpu/gpu_driver.h +++ b/tensorflow/stream_executor/gpu/gpu_driver.h @@ -71,7 +71,8 @@ class GpuDriver { // cuStreamCreate. // stream is an outparam owned by the caller, must not be null. // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4 - static bool CreateStream(GpuContext* context, GpuStreamHandle* stream); + static bool CreateStream(GpuContext* context, GpuStreamHandle* stream, + int priority = 0); // Destroys a CUDA stream associated with the given context. 
// stream is owned by the caller, must not be null, and *stream is set to null diff --git a/tensorflow/stream_executor/gpu/gpu_stream.cc b/tensorflow/stream_executor/gpu/gpu_stream.cc index 887522cf3ae..9899bbb04a3 100644 --- a/tensorflow/stream_executor/gpu/gpu_stream.cc +++ b/tensorflow/stream_executor/gpu/gpu_stream.cc @@ -23,7 +23,8 @@ namespace stream_executor { namespace gpu { bool GpuStream::Init() { - if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) { + if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_, + priority_)) { return false; } return GpuDriver::InitEvent(parent_->gpu_context(), &completed_event_, diff --git a/tensorflow/stream_executor/gpu/gpu_stream.h b/tensorflow/stream_executor/gpu/gpu_stream.h index db0eec53b9c..e58ac72dd56 100644 --- a/tensorflow/stream_executor/gpu/gpu_stream.h +++ b/tensorflow/stream_executor/gpu/gpu_stream.h @@ -48,6 +48,8 @@ class GpuStream : public internal::StreamInterface { // Explicitly initialize the CUDA resources associated with this stream, used // by StreamExecutor::AllocateStream(). bool Init(); + void SetPriority(int priority) { priority_ = priority; } + int priority() const { return priority_; } // Explicitly destroy the CUDA resources associated with this stream, used by // StreamExecutor::DeallocateStream(). @@ -78,6 +80,7 @@ class GpuStream : public internal::StreamInterface { private: GpuExecutor* parent_; // Executor that spawned this stream. GpuStreamHandle gpu_stream_; // Wrapped CUDA stream handle. + int priority_ = 0; // Event that indicates this stream has completed. GpuEventHandle completed_event_ = nullptr; diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc index 210e9d7a9fa..5a8154f1df8 100644 --- a/tensorflow/stream_executor/rocm/rocm_driver.cc +++ b/tensorflow/stream_executor/rocm/rocm_driver.cc @@ -558,7 +558,13 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } /* static */ bool GpuDriver::CreateStream(GpuContext* context, - GpuStreamHandle* stream) { + GpuStreamHandle* stream, + int priority) { + if (priority != 0) { + LOG(ERROR) << "ROCM stream doesn't support priority. " + << " Should be set to 0 but given: " << priority; + return false; + } ScopedActivateContext activated{context}; hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags( stream, hipStreamDefault); // switch to hipStreamNonBlocking? 
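Taken together, the new `priority` proto field, the `experimental_priority` argument on `LogicalDeviceConfiguration`, and the stream-priority plumbing above amount to a user-facing configuration along the following lines. This is a minimal sketch, not part of the patch; it assumes a CUDA build whose reported priority range is [-1, 0], as in the proto comment, and the existing `tf.config` virtual-device APIs.

    import tensorflow as tf

    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
      # Split the first physical GPU into two virtual devices. Priority -1 is
      # higher than the default 0, so streams created for the second device
      # are favored by the GPU scheduler.
      tf.config.set_logical_device_configuration(
          gpus[0],
          [tf.config.LogicalDeviceConfiguration(
              memory_limit=1024, experimental_priority=0),
           tf.config.LogicalDeviceConfiguration(
              memory_limit=1024, experimental_priority=-1)])

On a ROCm build the same request would fail at stream creation, since `GpuDriver::CreateStream` above rejects any non-zero priority.
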
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt index 79c33f7e304..62bc83a3a69 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt @@ -116,6 +116,12 @@ tf_proto { label: LABEL_REPEATED type: TYPE_FLOAT } + field { + name: "priority" + number: 2 + label: LABEL_REPEATED + type: TYPE_INT32 + } } } } diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt index 3f6c6e636a1..49750b0af85 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt index 25b6b6e216e..9b2a7d846b9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt index 3f6c6e636a1..49750b0af85 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt index 25b6b6e216e..9b2a7d846b9 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt @@ -3,6 +3,10 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" + member { + name: "experimental_priority" + mtype: "" + } member { name: "memory_limit" mtype: "" From 8738f2d4c0e2352bdc562eef5b5de91c1ef0883c Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 29 May 2020 11:13:36 -0700 Subject: [PATCH 1396/1533] Bump the ruy repository reference. 
PiperOrigin-RevId: 313815027 Change-Id: Iee056840531149ec6735295b6384c32afb07672a --- .../lite/micro/tools/make/third_party_downloads.inc | 4 ++-- tensorflow/lite/tools/make/download_dependencies.sh | 4 ++-- tensorflow/tools/pip_package/BUILD | 2 ++ third_party/ruy/workspace.bzl | 8 ++++---- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index 10be8dbaa34..e4bb4f424ac 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -56,8 +56,8 @@ SIFIVE_FE310_LIB_MD5 := "06ee24c4956f8e21670ab3395861fe64" KISSFFT_URL="https://github.com/mborgerding/kissfft/archive/v130.zip" KISSFFT_MD5="438ba1fef5783cc5f5f201395cc477ca" -RUY_URL="https://github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip" -RUY_MD5="2d54f058f8f7120dfc1ecee79dbf259e" +RUY_URL="https://github.com/google/ruy/archive/51b518e755dd3da37a79d16972b76d3baedac22d.zip" +RUY_MD5="e9c9870741554afcc7a20c360254b31c" CIFAR10_DATASET_URL="https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz" CIFAR10_DATASET_MD5="c32a1d4ab5d03f1284b67883e8d87530" diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh index a7840f6dcd0..293eb762938 100755 --- a/tensorflow/lite/tools/make/download_dependencies.sh +++ b/tensorflow/lite/tools/make/download_dependencies.sh @@ -37,8 +37,8 @@ EIGEN_URL="$(grep -o 'https.*gitlab.com/libeigen/eigen/-/archive/.*tar\.gz' "${B EIGEN_SHA="$(eval echo $(grep '# SHARED_EIGEN_SHA' "${BZL_FILE_PATH}" | grep -o '\".*\"'))" GEMMLOWP_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GEMMLOWP_SHA="$(eval echo $(grep '# SHARED_GEMMLOWP_SHA' "${BZL_FILE_PATH}" | grep -o '\".*\"'))" -RUY_URL="https://github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip" -RUY_SHA="b21524de00c63b3d5683b42557f78452e791cf77fddb2e63f9bcba1f7bd99093" +RUY_URL="https://github.com/google/ruy/archive/51b518e755dd3da37a79d16972b76d3baedac22d.zip" +RUY_SHA="d5e703913c9e8f0196d83cc4113ecaae4bcae52181f05836890f16aad402fea4" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" GOOGLETEST_SHA="58a6f4277ca2bc8565222b3bbd58a177609e9c488e8a72649359ba51450db7d8" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 43bc04a1b60..36e20408c53 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -214,6 +214,8 @@ filegroup( "@sobol_data//:LICENSE", "@termcolor_archive//:COPYING.txt", "@zlib//:zlib.h", + "@clog//:LICENSE", + "@cpuinfo//:LICENSE", ] + select({ "//tensorflow:android": [], "//tensorflow:ios": [], diff --git a/third_party/ruy/workspace.bzl b/third_party/ruy/workspace.bzl index c4ed692df4d..5d653e47d06 100644 --- a/third_party/ruy/workspace.bzl +++ b/third_party/ruy/workspace.bzl @@ -5,11 +5,11 @@ load("//third_party:repo.bzl", "third_party_http_archive") def repo(): third_party_http_archive( name = "ruy", - sha256 = "b21524de00c63b3d5683b42557f78452e791cf77fddb2e63f9bcba1f7bd99093", - strip_prefix = "ruy-1b313682ef8b8fc8ed08719c610d1c3503b016bf", + sha256 = "d5e703913c9e8f0196d83cc4113ecaae4bcae52181f05836890f16aad402fea4", + strip_prefix = 
"ruy-51b518e755dd3da37a79d16972b76d3baedac22d", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip", - "https://github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/ruy/archive/51b518e755dd3da37a79d16972b76d3baedac22d.zip", + "https://github.com/google/ruy/archive/51b518e755dd3da37a79d16972b76d3baedac22d.zip", ], build_file = "//third_party/ruy:BUILD", ) From 45b2bbc9a32362b36de8cc14e96b5b886c04c179 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 11:37:25 -0700 Subject: [PATCH 1397/1533] [TF/MLIR] Adds legalization from xla_hlo.dot to tf.MatMul. PiperOrigin-RevId: 313819546 Change-Id: I88ed2c455e80cf66bd6d09404b5ae6b3383f186a --- .../mlir/tensorflow/tests/legalize_hlo.mlir | 51 ++++++++++++++++ .../tensorflow/transforms/legalize_hlo.cc | 59 ++++++++++++++++++- .../transforms/legalize_hlo_patterns.td | 6 ++ 3 files changed, 115 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index c7934a377b3..198227bf5dd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -690,6 +690,27 @@ func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { func @reshape(%arg0: tensor<4x6xf32>) -> tensor<2x2x6xf32> { %0 = "xla_hlo.reshape"(%arg0) : (tensor<4x6xf32>) -> tensor<2x2x6xf32> return %0 : tensor<2x2x6xf32> + +} + +func @convert_dot_1d_2d(%arg0: tensor<256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1xf32> { + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<256xf32>, tensor<256x1xf32>) -> tensor<1xf32> + return %0 : tensor<1xf32> +} + +func @convert_dot_2d_1d(%arg0: tensor<1x256xf32>, %arg1: tensor<256xf32>) -> tensor<1xf32> { + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x256xf32>, tensor<256xf32>) -> tensor<1xf32> + return %0 : tensor<1xf32> +} + +func @convert_dot_1d_1d(%arg0: tensor<256xf32>, %arg1: tensor<256xf32>) -> tensor { + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<256xf32>, tensor<256xf32>) -> tensor + return %0 : tensor +} + +func @convert_dot_2d_2d(%arg0: tensor<1x256xf32>, %arg1: tensor<256x1xf32>) -> tensor<1x1xf32> { + %0 = "xla_hlo.dot"(%arg0, %arg1) {precision_config = ["DEFAULT", "DEFAULT"]} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> + return %0 : tensor<1x1xf32> } // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py @@ -1519,3 +1540,33 @@ func @reshape(%arg0: tensor<4x6xf32>) -> tensor<2x2x6xf32> { // CHECK: return [[VAL_374]] : tensor<2x2x6xf32> // CHECK: } +// CHECK-LABEL: func @convert_dot_1d_2d( +// CHECK-SAME: [[VAL_376:%.*]]: tensor<256xf32>, [[VAL_377:%.*]]: tensor<256x1xf32>) -> tensor<1xf32> { +// CHECK: [[VAL_378:%.*]] = "tf.Reshape"([[VAL_376]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: [[VAL_379:%.*]] = "tf.MatMul"([[VAL_378]], [[VAL_377]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: [[VAL_380:%.*]] = "tf.Reshape"([[VAL_379]], {{.*}}) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return [[VAL_380]] : tensor<1xf32> +// CHECK: } + +// CHECK-LABEL: func 
@convert_dot_2d_1d( +// CHECK-SAME: [[VAL_381:%.*]]: tensor<1x256xf32>, [[VAL_382:%.*]]: tensor<256xf32>) -> tensor<1xf32> { +// CHECK: [[VAL_383:%.*]] = "tf.Reshape"([[VAL_382]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: [[VAL_384:%.*]] = "tf.MatMul"([[VAL_381]], [[VAL_383]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> +// CHECK: [[VAL_385:%.*]] = "tf.Reshape"([[VAL_384]], {{.*}}) : (tensor<1x1xf32>, tensor<1xi64>) -> tensor<1xf32> +// CHECK: return [[VAL_385]] : tensor<1xf32> +// CHECK: } + +// CHECK-LABEL: func @convert_dot_1d_1d( +// CHECK-SAME: [[VAL_386:%.*]]: tensor<256xf32>, [[VAL_387:%.*]]: tensor<256xf32>) -> tensor { +// CHECK-DAG: [[VAL_388:%.*]] = "tf.Reshape"([[VAL_386]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK-DAG: [[VAL_389:%.*]] = "tf.Reshape"([[VAL_387]], {{.*}}) : (tensor<256xf32>, tensor<2xi64>) -> tensor<1x256xf32> +// CHECK: [[VAL_390:%.*]] = "tf.MatMul"([[VAL_388]], [[VAL_389]]) {transpose_a = false, transpose_b = true} : (tensor<1x256xf32>, tensor<1x256xf32>) -> tensor<1x1xf32> +// CHECK: [[VAL_391:%.*]] = "tf.Reshape"([[VAL_390]], {{.*}}) : (tensor<1x1xf32>, tensor<0xi64>) -> tensor +// CHECK: return [[VAL_391]] : tensor +// CHECK: } + +// CHECK-LABEL: func @convert_dot_2d_2d( +// CHECK-SAME: [[VAL_392:%.*]]: tensor<1x256xf32>, [[VAL_393:%.*]]: tensor<256x1xf32>) -> tensor<1x1xf32> { +// CHECK: [[VAL_394:%.*]] = "tf.MatMul"([[VAL_392]], [[VAL_393]]) {transpose_a = false, transpose_b = false} : (tensor<1x256xf32>, tensor<256x1xf32>) -> tensor<1x1xf32> +// CHECK: return [[VAL_394]] : tensor<1x1xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index c0cdc3e6b8e..d635d605607 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -25,6 +26,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -104,6 +106,59 @@ ConstantOp ShapeToConst(PatternRewriter &rewriter, Value value) { return rewriter.create(value.getLoc(), attr_type, attr); } +// Converts xla_hlo.dot to tf.MatMul. Reshape ops will be inserted when +// necessary. +Value ConvertDotOp(PatternRewriter &rewriter, Operation *old_op) { + auto dot_op = cast(old_op); + const mlir::Location loc = dot_op.getLoc(); + // Normalizes a ShapedType to 2d if the ShapedType is less than 2d by + // inserting dummy 1-element dimensions in the begining. Does nothing if the + // old shape is already 2d or higher. This is necessary because tf.MatMul + // requires input tensors to be at least 2d. 
+ const auto normalize_rank = [](ShapedType type) -> ShapedType { + if (type.getRank() >= 2) { + return type; + } + + const int rank = type.getRank(); + llvm::SmallVector shape_2d(type.getShape().begin(), + type.getShape().end()); + for (int i = 0; i < 2 - rank; ++i) { + shape_2d.insert(shape_2d.begin(), 1); + } + return RankedTensorType::get(shape_2d, type.getElementType()); + }; + + // Reshapes a tensor value to 2d if it is 1d or scalar. Otherwise does + // nothing. + const auto reshape_to_2d = [&rewriter, &loc, + &normalize_rank](mlir::Value input) { + const auto input_type = input.getType().cast(); + if (input_type.getRank() >= 2) { + return input; + } + + auto reshape = rewriter.create( + loc, normalize_rank(input_type), input); + return reshape.getResult(); + }; + + // Reshapes both operand to be 2d for tf.MatMul op. + auto a = reshape_to_2d(dot_op.lhs()); + auto b = reshape_to_2d(dot_op.rhs()); + // Operand `b` needs to be transposed if it is 1d. This is because dot op will + // contract on the only dimension if rhs is 1d. + auto b_old_type = dot_op.rhs().getType().cast(); + BoolAttr transpose_b = rewriter.getBoolAttr(b_old_type.getRank() == 1); + auto output_type = dot_op.getResult().getType().cast(); + auto matmul = rewriter.create( + loc, normalize_rank(output_type), a, b, + /*transpose_a=*/rewriter.getBoolAttr(false), transpose_b); + auto reshape = + rewriter.create(loc, output_type, matmul.product()); + return reshape.getResult(); +} + #include "tensorflow/compiler/mlir/tensorflow/transforms/generated_legalize_hlo.inc" /// Performs the lowering to XLA dialect. @@ -118,8 +173,10 @@ void LegalizeHloToTf::runOnFunction() { ConversionTarget target(context); target.addLegalDialect(); target.addLegalOp(); - if (failed(applyPartialConversion(getFunction(), target, patterns))) + if (failed(applyPartialConversion(getFunction(), target, patterns))) { + getFunction().emitError("xla_hlo to TF legalization failed."); signalPassFailure(); + } } static PassRegistration pass( diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td index 08250ff8e8c..e4b6a28d65f 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td @@ -155,3 +155,9 @@ foreach pair = [[TF_GreaterEqualOp, HLO_COMPARISON_DIRECTION_GE], [(AreBroadcastCompatible $l, $r)]>; def : Pat<(HLO_CompareOp $l, $r, pair[1]), (pair[0] $l, $r)>; } + +def ConvertDotOp : NativeCodeCall<"ConvertDotOp($_builder, " + "$0.getDefiningOp())">; +def : Pat<(HLO_DotOp:$old_value AnyStaticShapeTensor:$lhs, + AnyStaticShapeTensor:$rhs, $precision_config), + (ConvertDotOp $old_value)>; From d9b280b1cc80eccc12b8030f1f78041c86109a34 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 29 May 2020 11:37:49 -0700 Subject: [PATCH 1398/1533] Verify that IfRegion regions do not have any arguments PiperOrigin-RevId: 313819636 Change-Id: Ief6f8945bc3b6329b7a82c10f42f28500c187129 --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 4 +++ .../mlir/tensorflow/tests/tf-ops.mlir | 35 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 389be0d3b2b..c9d61abe507 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -1937,6 +1937,10 @@ static LogicalResult Verify(IfRegionOp op) { return 
failure(); if (failed(VerifyRegionResults(op, op.else_branch(), "else"))) return failure(); + if (op.then_branch().front().getNumArguments() != 0) + return op.emitOpError() << "then region cannot have any arguments"; + if (op.else_branch().front().getNumArguments() != 0) + return op.emitOpError() << "else region cannot have any arguments"; return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 20f7c5b9ba1..8692104e772 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1078,6 +1078,41 @@ func @testIfRegionThenConsumingElse(%arg0: tensor, %arg1: tensor<2xf32>) -> // ----- +// The regions for IfRegion themselves cannot have any arguments +func @testInvalidIfRegionThenArg(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %neg = "tf.Neg"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + // expected-error @+1 {{then region cannot have any arguments}} + %0 = "tf.IfRegion"(%arg0) ({ + ^bb(%arg_bb: tensor<2xf32>): + %t = "tf.Abs"(%arg_bb) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + %e = "tf.Acos"(%neg) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + +func @testInvalidIfRegionElseArg(%arg0: tensor, %arg1: tensor<2xf32>) -> tensor<2xf32> { + %neg = "tf.Neg"(%arg1) : (tensor<2xf32>) -> tensor<2xf32> + // expected-error @+1 {{else region cannot have any arguments}} + %0 = "tf.IfRegion"(%arg0) ({ + %t = "tf.Abs"(%neg) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%t) : (tensor<2xf32>) -> () + }, { + ^bb(%arg_bb: tensor<2xf32>): + %e = "tf.Acos"(%arg_bb) : (tensor<2xf32>) -> tensor<2xf32> + "tf.Yield"(%e) : (tensor<2xf32>) -> () + }) { is_stateless = false} : (tensor) -> tensor<2xf32> + + return %0 : tensor<2xf32> +} + +// ----- + // Test valid tf.MatrixBandPart // CHECK-LABEL: func @testValidMatrixBandPartOp func @testValidMatrixBandPartOp(%arg0: tensor<64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { From be2bd986588d621622e195451b38275b6b818d87 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Fri, 29 May 2020 11:45:09 -0700 Subject: [PATCH 1399/1533] Generalize functional model serialization to support cases where elements in the first argument of a layer call are not produced from keras inputs. This will be needed to support generalizing functional model construction to cases where only elements outside of the first call argument have keras history. 
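For illustration only (a sketch, not part of this change), the kind of call this serialization format needs to capture is a layer invoked with a plain constant alongside a Keras tensor in its first argument:

    import tensorflow as tf

    inp = tf.keras.Input(shape=(4,))
    # The second list element has no Keras history; on serialization it is
    # recorded with the `_CONSTANT_VALUE` marker introduced below rather than
    # as a (layer_name, node_index, tensor_index) reference.
    out = tf.keras.layers.Add()([inp, tf.constant([[1.0, 2.0, 3.0, 4.0]])])
    model = tf.keras.Model(inp, out)
    config = model.get_config()

Whether this exact model can be constructed depends on the follow-up generalization of functional model construction mentioned above, so the snippet shows the target behavior rather than something guaranteed at this commit.
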
PiperOrigin-RevId: 313820929 Change-Id: Ie9ef0cbc6b8caab189534faf09de897c361f5c08 --- tensorflow/python/keras/engine/functional.py | 28 +++++++++++++------- tensorflow/python/keras/engine/node.py | 19 +++++++++---- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index 4958990ad66..eec13345295 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -32,6 +32,7 @@ from tensorflow.python.keras import backend from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine import input_layer as input_layer_module +from tensorflow.python.keras.engine import node as node_module from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.saving.saved_model import network_serialization @@ -1111,19 +1112,28 @@ def reconstruct_from_config(config, custom_objects=None, created_layers=None): kwargs = {} elif len(input_data) == 4: kwargs = input_data[3] - kwargs = _deserialize_keras_tensors(kwargs, created_layers) + try: + kwargs = _deserialize_keras_tensors(kwargs, created_layers) + except IndexError: + # Happens if keras tensors in kwargs are still unprocessed + add_unprocessed_node(layer, node_data) + return else: raise ValueError('Improperly formatted model config.') - inbound_layer = created_layers[inbound_layer_name] - inbound_node_index = get_node_index(inbound_layer, inbound_node_index) + if inbound_layer_name != node_module._CONSTANT_VALUE: + inbound_layer = created_layers[inbound_layer_name] + inbound_node_index = get_node_index(inbound_layer, inbound_node_index) - if inbound_node_index is None: - add_unprocessed_node(layer, node_data) - return - inbound_node = inbound_layer._inbound_nodes[inbound_node_index] - input_tensors.append( - nest.flatten(inbound_node.outputs)[inbound_tensor_index]) + if inbound_node_index is None: + add_unprocessed_node(layer, node_data) + return + inbound_node = inbound_layer._inbound_nodes[inbound_node_index] + input_tensors.append( + nest.flatten(inbound_node.outputs)[inbound_tensor_index]) + else: + # We received a constant w/ no Keras history attached + input_tensors.append(inbound_tensor_index) input_tensors = nest.pack_sequence_as(node_data, input_tensors) # Call layer on its inputs, thus creating the node # and building the layer if needed. diff --git a/tensorflow/python/keras/engine/node.py b/tensorflow/python/keras/engine/node.py index 708904853b2..e8d98389bd7 100644 --- a/tensorflow/python/keras/engine/node.py +++ b/tensorflow/python/keras/engine/node.py @@ -32,6 +32,8 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest from tensorflow.python.util import serialization +_CONSTANT_VALUE = '_CONSTANT_VALUE' + class Node(object): """A `Node` describes the connectivity between two layers. @@ -181,11 +183,18 @@ class Node(object): # `kwargs` is added to each Tensor in the first arg. This should be # changed in a future version of the serialization format. 
def serialize_first_arg_tensor(t): - kh = t._keras_history - node_index = kh.node_index - node_key = make_node_key(kh.layer.name, node_index) - new_node_index = node_conversion_map.get(node_key, 0) - data = [kh.layer.name, new_node_index, kh.tensor_index, kwargs] + if is_keras_tensor(t): + kh = t._keras_history + node_index = kh.node_index + node_key = make_node_key(kh.layer.name, node_index) + new_node_index = node_conversion_map.get(node_key, 0) + data = [kh.layer.name, new_node_index, kh.tensor_index, kwargs] + else: + # If an element in the first call argument did not originate as a + # keras tensor and is a constant value, we save it using the format + # ['_CONSTANT_VALUE', -1, serializaed_tensor_or_python_constant] + # (potentially including serialized kwargs in an optional 4th argument + data = [_CONSTANT_VALUE, -1, _serialize_keras_tensor(t), kwargs] return tf_utils.ListWrapper(data) data = nest.map_structure(serialize_first_arg_tensor, inputs) From 539be67dbe0ad78df6c8de90c34b7148e3779d05 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Fri, 29 May 2020 11:47:09 -0700 Subject: [PATCH 1400/1533] Support the semantic stats ($pt, $p, $ct, $c, $r and $a) in the grouping code. PiperOrigin-RevId: 313821271 Change-Id: I9dbc10cbcc0f81a96f3e3956242392a44a24445b --- .../core/profiler/utils/group_events.cc | 110 +++++++++++++++--- tensorflow/core/profiler/utils/group_events.h | 45 ++++++- .../core/profiler/utils/group_events_test.cc | 78 +++++++++++++ .../core/profiler/utils/xplane_schema.cc | 2 + .../core/profiler/utils/xplane_schema.h | 2 + 5 files changed, 215 insertions(+), 22 deletions(-) diff --git a/tensorflow/core/profiler/utils/group_events.cc b/tensorflow/core/profiler/utils/group_events.cc index c07a397ee54..001fb78b073 100644 --- a/tensorflow/core/profiler/utils/group_events.cc +++ b/tensorflow/core/profiler/utils/group_events.cc @@ -108,6 +108,30 @@ void SetGroupId(const XPlaneVisitor& visitor, int64 group_id, XEvent* event) { event); } +void SetContextGroup(EventNode* event, ContextGroupMap* context_groups) { + auto producer = event->GetProducerContext(); + if (producer.has_value()) { + ((*context_groups)[producer->type][producer->id]).producer = event; + } + auto consumer = event->GetConsumerContext(); + if (consumer.has_value()) { + ((*context_groups)[consumer->type][consumer->id]) + .consumers.push_back(event); + } +} + +void ConnectContextGroups(const ContextGroupMap& context_groups) { + for (auto& type_id_group : context_groups) { + for (auto& id_group : type_id_group.second) { + const ContextGroup& group = id_group.second; + EventNode* parent = group.producer; + for (EventNode* child : group.consumers) { + parent->AddChild(child); + } + } + } +} + using VirtualEventNodeMap = absl::flat_hash_map>; @@ -143,6 +167,15 @@ bool HasFunctionRun(EventNode* event_node) { return false; } +void ProcessRootEvent(int64 group_id, EventNode* root_event, + EventGroupNameMap* event_group_name_map) { + root_event->PropagateGroupId(group_id); + std::string group_name = root_event->GetGroupName(); + // TODO(jihochoi): change event name instead. 
+ root_event->AddStepName(group_name); + event_group_name_map->emplace(group_id, std::move(group_name)); +} + } // namespace EventNode::EventNode(const XPlaneVisitor* plane, XLine* raw_line, @@ -150,13 +183,47 @@ EventNode::EventNode(const XPlaneVisitor* plane, XLine* raw_line, : plane_(plane), visitor_(plane, raw_line, raw_event), raw_line_(raw_line), - raw_event_(raw_event) {} + raw_event_(raw_event) { + absl::optional producer_type; + absl::optional producer_id; + absl::optional consumer_type; + absl::optional consumer_id; + visitor_.ForEachStat([&](const XStatVisitor& stat) { + if (!stat.Type().has_value()) return; + switch (*stat.Type()) { + case StatType::kProducerType: + producer_type = stat.IntValue(); + break; + case StatType::kProducerId: + producer_id = stat.IntValue(); + break; + case StatType::kConsumerType: + consumer_type = stat.IntValue(); + break; + case StatType::kConsumerId: + consumer_id = stat.IntValue(); + break; + case StatType::kIsRoot: + is_root_ = stat.IntValue(); + break; + case StatType::kIsAsync: + is_async_ = stat.IntValue(); + break; + default: + break; + } + }); + if (producer_type.has_value() && producer_id.has_value()) { + producer_context_ = {*producer_type, *producer_id}; + } + if (consumer_type.has_value() && consumer_id.has_value()) { + consumer_context_ = {*consumer_type, *consumer_id}; + } +} EventNode::EventNode(const EventNode& event_node) - : plane_(event_node.plane_), - visitor_(event_node.plane_, event_node.raw_line_, event_node.raw_event_), - raw_line_(event_node.raw_line_), - raw_event_(event_node.raw_event_) {} + : EventNode(event_node.plane_, event_node.raw_line_, + event_node.raw_event_) {} const XStat* EventNode::GetContextStat(int64 stat_type) const { if (const XStat* stat = visitor_.GetStats(stat_type)) { @@ -226,11 +293,18 @@ EventNode* EventNode::FindParent(int64 event_type) { } void EventForest::ConnectIntraThread(const XPlaneVisitor& visitor, - XPlane* plane) { + XPlane* plane, + ContextGroupMap* context_groups) { for (auto& line : *plane->mutable_lines()) { std::vector parent_nodes; for (auto& event : *line.mutable_events()) { auto cur_node = absl::make_unique(&visitor, &line, &event); + // Update `context_groups` for `ConnectInterThread`. + SetContextGroup(cur_node.get(), context_groups); + // Update `root_events_` for `CreateEventGroup`. + if (cur_node->IsRoot()) root_events_.push_back(cur_node.get()); + // Async events are ignored when processing the nesting relationship. + if (cur_node->IsAsync()) continue; while (!parent_nodes.empty()) { EventNode* parent_node = parent_nodes.back(); if (parent_node->GetEventVisitor().GetTimespan().Includes( @@ -299,21 +373,19 @@ void EventForest::ConnectInterThread( void EventForest::CreateEventGroup( const std::vector& root_event_types) { int64 next_group_id = 0; + for (EventNode* root_event : root_events_) { + ProcessRootEvent(next_group_id++, root_event, &event_group_name_map_); + } for (int64 root_event_type : root_event_types) { - if (auto root_event_node_list = - gtl::FindOrNull(event_node_map_, root_event_type)) { - for (const auto& root_event_node : *root_event_node_list) { + if (auto root_events = gtl::FindOrNull(event_node_map_, root_event_type)) { + for (const auto& root_event : *root_events) { // Skip if it already belongs to a group. - if (root_event_node->GetGroupId()) continue; - int64 group_id = next_group_id++; - root_event_node->PropagateGroupId(group_id); - std::string group_name = root_event_node->GetGroupName(); - // TODO(jihochoi): change event name instead. 
- root_event_node->AddStepName(group_name); - event_group_name_map_[group_id] = std::move(group_name); + if (root_event->GetGroupId()) continue; + ProcessRootEvent(next_group_id++, root_event.get(), + &event_group_name_map_); } // Only use the first root event type found. - if (!root_event_node_list->empty()) break; + if (!root_events->empty()) break; } } } @@ -392,13 +464,15 @@ EventForest::EventForest( const std::vector& root_event_types, const std::function visitor_factory, XSpace* space) { + ContextGroupMap context_groups; visitors_.reserve(space->planes_size()); for (auto& plane : *space->mutable_planes()) { CreateStatMetadata(&plane); visitors_.push_back(visitor_factory(&plane)); - ConnectIntraThread(visitors_.back(), &plane); + ConnectIntraThread(visitors_.back(), &plane, &context_groups); } ConnectInterThread(connect_info_list); + ConnectContextGroups(context_groups); if (NeedsVirtualEventsForHostTrainingLoop(root_event_types)) { CreateVirtualEventsForHostTrainingLoop(); } diff --git a/tensorflow/core/profiler/utils/group_events.h b/tensorflow/core/profiler/utils/group_events.h index 49a7b349589..544ae39c1ed 100644 --- a/tensorflow/core/profiler/utils/group_events.h +++ b/tensorflow/core/profiler/utils/group_events.h @@ -43,6 +43,12 @@ struct InterThreadConnectInfo { std::vector child_stat_types; }; +struct ContextInfo { + ContextInfo(int type, uint64 id) : type(type), id(id) {} + int type; + uint64 id; +}; + // A wrapper for XEvent with parent and children pointers. Through these // pointers, a tree of EventNode is formed. class EventNode { @@ -86,6 +92,18 @@ class EventNode { // Returns the closest parent of the given event type. EventNode* FindParent(int64 event_type); + absl::optional GetProducerContext() const { + return producer_context_; + } + + absl::optional GetConsumerContext() const { + return consumer_context_; + } + + bool IsRoot() const { return is_root_; } + + bool IsAsync() const { return is_async_; } + private: const XPlaneVisitor* plane_; XEventVisitor visitor_; @@ -94,6 +112,10 @@ class EventNode { EventNode* parent_ = nullptr; std::vector children_; absl::optional group_id_; + absl::optional producer_context_; + absl::optional consumer_context_; + bool is_root_ = false; + bool is_async_ = false; }; using EventNodeMap = @@ -102,9 +124,22 @@ using EventNodeMap = using EventGroupNameMap = absl::flat_hash_map; -// Creates a forest of EventNode by stitching events in space using the nesting -// relationship within the same thread and connect_info_list across threads, and -// groups them by the root events. +using EventList = std::vector; + +struct ContextGroup { + EventNode* producer = nullptr; + std::vector consumers; +}; + +using ContextGroupMap = absl::flat_hash_map< + int /*context_type*/, + absl::flat_hash_map>; + +// EventForest augments the input XSpace with the trace context. The trace +// context is created by stitching XEvents (1) using the nesting relationship +// within the same thread and (2) comparing the semantic arguments or using +// connect_info_list across threads. It also groups the events by the root +// events specified in root_event_types or marked by the semantic argument. class EventForest { public: EventForest(const std::vector& connect_info_list, @@ -121,7 +156,8 @@ class EventForest { private: // Creates an EventNode for each event in event_node_map and connect events // according to the nesting relationship within the thread. 
- void ConnectIntraThread(const XPlaneVisitor& visitor, XPlane* plane); + void ConnectIntraThread(const XPlaneVisitor& visitor, XPlane* plane, + ContextGroupMap* context_groups); // Connects events across threads according to connect_info_list. void ConnectInterThread( @@ -153,6 +189,7 @@ class EventForest { EventNodeMap event_node_map_; std::vector visitors_; EventGroupNameMap event_group_name_map_; + EventList root_events_; }; std::vector CreateInterThreadConnectInfoList(); diff --git a/tensorflow/core/profiler/utils/group_events_test.cc b/tensorflow/core/profiler/utils/group_events_test.cc index 11996ba4068..70db7c2b8e6 100644 --- a/tensorflow/core/profiler/utils/group_events_test.cc +++ b/tensorflow/core/profiler/utils/group_events_test.cc @@ -237,6 +237,84 @@ TEST(GroupEventsTest, FunctionOpTest) { EXPECT_EQ(gpu_kernel.stats(2).int64_value(), 0); } +TEST(GroupEventsTest, SemanticArgTest) { + constexpr int64 kStepNum = 100; + constexpr int kContextType = 123; + constexpr uint64 kContextId = 456; + + XSpace raw_space; + XPlane* raw_plane = raw_space.add_planes(); + XPlaneBuilder plane(raw_plane); + plane.ReserveLines(2); + auto root_producer = plane.GetOrCreateLine(0); + CreateXEvent(&plane, &root_producer, HostEventType::kTraceContext, 0, 100, + {{StatType::kIsRoot, 1}, {StatType::kStepNum, kStepNum}}); + CreateXEvent(&plane, &root_producer, HostEventType::kFunctionRun, 10, 90, + {{StatType::kProducerType, kContextType}, + {StatType::kProducerId, kContextId}}); + auto consumer = plane.GetOrCreateLine(1); + CreateXEvent(&plane, &consumer, HostEventType::kExecutorStateProcess, 20, 80, + {{StatType::kConsumerType, kContextType}, + {StatType::kConsumerId, kContextId}}); + + GroupTfEvents(&raw_space, /*event_group_name_map=*/nullptr); + int num_events = 0; + CreateTfXPlaneVisitor(raw_plane).ForEachLine( + [&](const tensorflow::profiler::XLineVisitor& line) { + num_events += line.NumEvents(); + line.ForEachEvent( + [&](const tensorflow::profiler::XEventVisitor& event) { + absl::optional group_id; + event.ForEachStat( + [&](const tensorflow::profiler::XStatVisitor& stat) { + if (stat.Type() == StatType::kGroupId) { + group_id = stat.IntValue(); + } + }); + EXPECT_TRUE(group_id.has_value()); + EXPECT_EQ(*group_id, 0); + }); + }); + EXPECT_EQ(num_events, 3); +} + +TEST(GroupEventsTest, AsyncEventTest) { + constexpr absl::string_view kParent = "parent"; + constexpr absl::string_view kAsync = "async"; + constexpr absl::string_view kChild = "child"; + + XSpace raw_space; + XPlane* raw_plane = raw_space.add_planes(); + XPlaneBuilder plane(raw_plane); + plane.ReserveLines(1); + auto line = plane.GetOrCreateLine(0); + CreateXEvent(&plane, &line, kParent, 0, 100, {{StatType::kIsRoot, 1}}); + CreateXEvent(&plane, &line, kAsync, 10, 200, {{StatType::kIsAsync, 1}}); + CreateXEvent(&plane, &line, kChild, 20, 80); + + GroupTfEvents(&raw_space, /*event_group_name_map=*/nullptr); + CreateTfXPlaneVisitor(raw_plane).ForEachLine( + [&](const tensorflow::profiler::XLineVisitor& line) { + EXPECT_EQ(line.NumEvents(), 3); + line.ForEachEvent( + [&](const tensorflow::profiler::XEventVisitor& event) { + absl::optional group_id; + event.ForEachStat( + [&](const tensorflow::profiler::XStatVisitor& stat) { + if (stat.Type() == StatType::kGroupId) { + group_id = stat.IntValue(); + } + }); + if (event.Name() == kAsync) { + EXPECT_FALSE(group_id.has_value()); + } else { + EXPECT_TRUE(group_id.has_value()); + EXPECT_EQ(*group_id, 0); + } + }); + }); +} + } // namespace } // namespace profiler } // namespace tensorflow diff 
--git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 39cbbf88e95..2c474917110 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -154,6 +154,8 @@ const StatTypeMap& GetStatTypeMap() { {"$ct", kConsumerType}, {"$p", kProducerId}, {"$c", kConsumerId}, + {"$r", kIsRoot}, + {"$a", kIsAsync}, // Device trace arguments. {"device_id", kDeviceId}, {"context_id", kContextId}, diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index 864c1c45ecb..82b95b0cdd8 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -146,6 +146,8 @@ enum StatType { kConsumerType, kProducerId, kConsumerId, + kIsRoot, + kIsAsync, // Device trace arguments. kDeviceId, kContextId, From 90a005a4e9d87ec9e95bfb64deba552cebf2dde4 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Fri, 29 May 2020 11:49:15 -0700 Subject: [PATCH 1401/1533] Fix complex & integer types in reduce operations For tf.math.reduce_variance and tf.math.reduce_std we need to take the absolute value before computing the square of the difference. This can be skipped for non-complex values. For complex values we achieve this by simply multiplying with the conjugate. Further, we explicitly disallow integer types in these APIs since they lead to imprecise calculations. PiperOrigin-RevId: 313821668 Change-Id: I5eef00dcb1ecfcea0dd67af0cdc1f9785732a8ae --- tensorflow/python/ops/math_ops.py | 63 ++++++++++++++++---------- tensorflow/python/ops/math_ops_test.py | 44 ++++++++++++++++-- 2 files changed, 79 insertions(+), 28 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index bf462ba8716..3a19bfa6a9a 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -2250,15 +2250,16 @@ def reduce_variance(input_tensor, axis=None, keepdims=False, name=None): For example: - ```python - x = tf.constant([[1., 2.], [3., 4.]]) - tf.reduce_variance(x) # 1.25 - tf.reduce_variance(x, 0) # [1., 1.] - tf.reduce_variance(x, 1) # [0.25, 0.25] - ``` + >>> x = tf.constant([[1., 2.], [3., 4.]]) + >>> tf.math.reduce_variance(x) + + >>> tf.math.reduce_variance(x, 0) + + >>> tf.math.reduce_variance(x, 1) + Args: - input_tensor: The tensor to reduce. Should have numeric type. + input_tensor: The tensor to reduce. Should have real or complex type. axis: The dimensions to reduce. If `None` (the default), reduces all dimensions. Must be in the range `[-rank(input_tensor), rank(input_tensor))`. @@ -2266,21 +2267,32 @@ def reduce_variance(input_tensor, axis=None, keepdims=False, name=None): name: A name scope for the associated operations (optional). Returns: - The reduced tensor, of the same dtype as the input_tensor. + The reduced tensor, of the same dtype as the input_tensor. Note, for + `complex64` or `complex128` input, the returned `Tensor` will be of type + `float32` or `float64`, respectively. @compatibility(numpy) Equivalent to np.var - Please note that `np.var` has a `dtype` parameter that could be used to - specify the output type. By default this is `dtype=float64`. On the other - hand, `tf.reduce_variance` has an aggressive type inference from - `input_tensor`, + Please note `np.var` has a `dtype` parameter that could be used to specify the + output type. By default this is `dtype=float64`. 
On the other hand, + `tf.math.reduce_variance` has aggressive type inference from `input_tensor`. @end_compatibility """ name = name if name else "reduce_variance" with ops.name_scope(name): means = reduce_mean(input_tensor, axis=axis, keepdims=True) - squared_deviations = gen_math_ops.square(input_tensor - means) + if means.dtype.is_integer: + raise TypeError("Input must be either real or complex") + diff = input_tensor - means + if diff.dtype.is_complex: + # For complex values we need to take the absolute value before squaring. + # This is achieved by multiplying with the conjugate. + real_dtype = diff.dtype.real_dtype + squared_deviations = gen_math_ops.real( + gen_math_ops.mul(gen_math_ops.conj(diff), diff), Tout=real_dtype) + else: + squared_deviations = gen_math_ops.square(diff) return reduce_mean(squared_deviations, axis=axis, keepdims=keepdims) @@ -2299,15 +2311,16 @@ def reduce_std(input_tensor, axis=None, keepdims=False, name=None): For example: - ```python - x = tf.constant([[1., 2.], [3., 4.]]) - tf.reduce_std(x) # 1.1180339887498949 - tf.reduce_std(x, 0) # [1., 1.] - tf.reduce_std(x, 1) # [0.5, 0.5] - ``` + >>> x = tf.constant([[1., 2.], [3., 4.]]) + >>> tf.math.reduce_std(x) + + >>> tf.math.reduce_std(x, 0) + + >>> tf.math.reduce_std(x, 1) + Args: - input_tensor: The tensor to reduce. Should have numeric type. + input_tensor: The tensor to reduce. Should have real or complex type. axis: The dimensions to reduce. If `None` (the default), reduces all dimensions. Must be in the range `[-rank(input_tensor), rank(input_tensor))`. @@ -2315,14 +2328,16 @@ def reduce_std(input_tensor, axis=None, keepdims=False, name=None): name: A name scope for the associated operations (optional). Returns: - The reduced tensor, of the same dtype as the input_tensor. + The reduced tensor, of the same dtype as the input_tensor. Note, for + `complex64` or `complex128` input, the returned `Tensor` will be of type + `float32` or `float64`, respectively. @compatibility(numpy) Equivalent to np.std - Please note that `np.std` has a `dtype` parameter that could be used to - specify the output type. By default this is `dtype=float64`. On the other - hand, `tf.reduce_std` has an aggressive type inference from `input_tensor`, + Please note `np.std` has a `dtype` parameter that could be used to specify the + output type. By default this is `dtype=float64`. On the other hand, + `tf.math.reduce_std` has aggressive type inference from `input_tensor`. 
@end_compatibility """ name = name if name else "reduce_std" diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index ebcbd58d9d6..85a5afc6c16 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -75,8 +75,26 @@ class ReduceTest(test_util.TensorFlowTestCase): self.assertAllClose( self.evaluate(math_ops.reduce_variance(x, axis=0)), [0, 0, 0]) - x = np.array([[0, 2, 1, 1], [1, 2, 0, 1]], "float32") - self.assertAllClose(self.evaluate(math_ops.reduce_variance(x)), 0.5) + x = [[1, 2, 1, 1], [1, 1, 0, 1]] + with self.assertRaisesRegexp(TypeError, "must be either real or complex"): + math_ops.reduce_variance(x) + + x = [[1., 2., 1., 1.], [1., 1., 0., 1.]] + self.assertEqual(self.evaluate(math_ops.reduce_variance(x)), 0.25) + x_np = np.array(x) + self.assertEqual(np.var(x_np), 0.25) + self.assertEqual(self.evaluate(math_ops.reduce_variance(x_np)), 0.25) + + def testReduceVarComplex(self): + # Ensure that complex values are handled to be consistent with numpy + complex_ys = [([0 - 1j, 0 + 1j], dtypes.float64), + (np.array([0 - 1j, 0 + 1j], "complex64"), dtypes.float32), + (np.array([0 - 1j, 0 + 1j], "complex128"), dtypes.float64)] + for y, dtype in complex_ys: + y_result = math_ops.reduce_variance(y) + self.assertEqual(np.var(y), 1.0) + self.assertEqual(self.evaluate(y_result), 1.0) + self.assertEqual(y_result.dtype, dtype) def testReduceStd(self): x = np.array([[0, 0, 0], [0, 0, 0]], "float32") @@ -84,8 +102,26 @@ class ReduceTest(test_util.TensorFlowTestCase): self.assertAllClose( self.evaluate(math_ops.reduce_std(x, axis=0)), [0, 0, 0]) - x = np.array([[1, 2, 1, 1], [1, 1, 0, 1]], "float32") - self.assertAllClose(self.evaluate(math_ops.reduce_std(x)), 0.5) + x = [[1, 2, 1, 1], [1, 1, 0, 1]] + with self.assertRaisesRegexp(TypeError, "must be either real or complex"): + math_ops.reduce_std(x) + + x = [[1., 2., 1., 1.], [1., 1., 0., 1.]] + self.assertEqual(self.evaluate(math_ops.reduce_std(x)), 0.5) + x_np = np.array(x) + self.assertEqual(np.std(x_np), 0.5) + self.assertEqual(self.evaluate(math_ops.reduce_std(x_np)), 0.5) + + def testReduceStdComplex(self): + # Ensure that complex values are handled to be consistent with numpy + complex_ys = [([0 - 1j, 0 + 1j], dtypes.float64), + (np.array([0 - 1j, 0 + 1j], "complex64"), dtypes.float32), + (np.array([0 - 1j, 0 + 1j], "complex128"), dtypes.float64)] + for y, dtype in complex_ys: + y_result = math_ops.reduce_std(y) + self.assertEqual(np.std(y), 1.0) + self.assertEqual(self.evaluate(y_result), 1.0) + self.assertEqual(y_result.dtype, dtype) @test_util.run_all_in_graph_and_eager_modes From 0ccc0ed961a2d80e4325f78c59be81e063fb1a9f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 11:50:52 -0700 Subject: [PATCH 1402/1533] Fix op wrapper code to correctly intercept CompositeTensors when doing op layer wrapping. 
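For illustration (a sketch, not part of this change): the check being reworked here fires when a raw TensorFlow op produces a sparse or ragged tensor from a Keras input, and the supported pattern, which the new error message also suggests, is to wrap the op in a Lambda layer. Shapes and names below are chosen only for the example.

    import tensorflow as tf

    inputs = tf.keras.Input(shape=(64,), sparse=True)
    weights = tf.Variable(tf.random.normal([64, 8]))
    # Automatic op wrapping does not cover sparse/ragged outputs, so the
    # sparse matmul goes through an explicit Lambda layer instead.
    outputs = tf.keras.layers.Lambda(
        lambda x: tf.sparse.sparse_dense_matmul(x, weights))(inputs)
    model = tf.keras.Model(inputs, outputs)
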
PiperOrigin-RevId: 313821947 Change-Id: I8e37ab0b38b1f861ca36ba3017e0068ed961e206 --- .../python/keras/engine/base_layer_utils.py | 20 ++++++------ .../keras/engine/base_layer_utils_test.py | 31 +++++++++++++++++++ .../python/keras/engine/training_test.py | 3 +- 3 files changed, 42 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index 6d25995e4c2..d7bd3d5d372 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -25,6 +25,7 @@ from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.keras import backend @@ -214,18 +215,17 @@ def _create_keras_history_helper(tensors, processed_ops, created_layers): for tensor in tensor_list: if getattr(tensor, '_keras_history', None) is not None: continue + if sparse_tensor.is_sparse(tensor) or ragged_tensor.is_ragged(tensor): + example = """ + weights_mult = lambda x: tf.sparse.sparse_dense_matmul(x, weights) + output = tf.keras.layers.Lambda(weights_mult)(input) + """ + raise ValueError('Tensorflow ops that generate ragged or sparse tensor ' + 'outputs are currently not supported by Keras automatic ' + 'op wrapping. Please wrap these ops in a Lambda layer: ' + '\n\n```\n{example}\n```\n'.format(example=example)) op = tensor.op # The Op that created this Tensor. if op not in processed_ops: - if op.type.startswith('Sparse'): - lambda_example = """ - weights_mult = lambda x: tf.sparse.sparse_dense_matmul(x, weights) - output = tf.keras.layers.Lambda(weights_mult)(input) - """ - raise ValueError( - 'Sparse ops are not supported with functional models with built-in ' - 'layer wrapping. Please wrap the sparse ops in a Lambda layer like' - ': \n{lambda_example}\n'.format(lambda_example=lambda_example)) - # Recursively set `_keras_history`. 
op_inputs = list(op.inputs) constants = {} diff --git a/tensorflow/python/keras/engine/base_layer_utils_test.py b/tensorflow/python/keras/engine/base_layer_utils_test.py index 21f539d89c5..d27a4cd3297 100644 --- a/tensorflow/python/keras/engine/base_layer_utils_test.py +++ b/tensorflow/python/keras/engine/base_layer_utils_test.py @@ -16,12 +16,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np + +from tensorflow.python import keras from tensorflow.python.framework import dtypes from tensorflow.python.keras import backend from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.ops import lookup_ops +from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -66,5 +70,32 @@ class TrackableWeightHandlerTest(keras_parameterized.TestCase): _ = backend.batch_get_value(table_handler.get_tensors()) +@combinations.generate(combinations.combine(mode=['graph', 'eager'])) +class OpLayerTest(keras_parameterized.TestCase): + + def test_tensor_op_layer(self): + int_values = keras.Input(shape=(2,), dtype=dtypes.int32) + float_values = math_ops.cast(int_values, dtypes.float32) + model = keras.Model(int_values, float_values) + model.compile(loss='mse') + + input_data = np.array([[1, 2], [3, 4]], dtype=np.int32) + expected = [[1.0, 2.0], [3.0, 4.0]] + output = model.predict(input_data) + self.assertAllClose(expected, output) + + def test_ragged_op_layer(self): + with self.assertRaisesRegexp(ValueError, 'Keras automatic op wrapping'): + int_values = keras.Input(shape=(None,), dtype=dtypes.int32, ragged=True) + float_values = math_ops.cast(int_values, dtypes.float32) + _ = keras.Model(int_values, float_values) + + def test_sparse_op_layer(self): + with self.assertRaisesRegexp(ValueError, 'Keras automatic op wrapping'): + int_values = keras.Input(shape=(None,), dtype=dtypes.int32, sparse=True) + float_values = math_ops.cast(int_values, dtypes.float32) + _ = keras.Model(int_values, float_values) + + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index c1c498b207b..e1180b5234b 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -1531,8 +1531,7 @@ class TestExceptionsAndWarnings(keras_parameterized.TestCase): output = sparse_ops.sparse_minimum(inputs, inputs) with self.assertRaisesRegexp( ValueError, - 'Sparse ops are not supported with functional models with built-in ' - 'layer wrapping' + 'not supported by Keras automatic op wrapping' ): training_module.Model([inputs], output) From 13645c7fd2a7c151b82a9bfa144b0e682122ed7b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 11:53:08 -0700 Subject: [PATCH 1403/1533] use static linked cupti internally. 
PiperOrigin-RevId: 313822331 Change-Id: Ie2c1e4a2d92ed9f92e2dcd8d3bc354546ab4663f --- tensorflow/core/profiler/internal/gpu/BUILD | 12 ++++++------ tensorflow/stream_executor/build_defs.bzl | 3 +++ tensorflow/stream_executor/cuda/cupti_stub.cc | 4 ++++ 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index c6fe4d77031..670080573b2 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -15,6 +15,10 @@ load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", ) +load( + "//tensorflow/stream_executor:build_defs.bzl", + "tf_additional_cupti_deps", +) package( default_visibility = ["//tensorflow:internal"], @@ -91,9 +95,7 @@ tf_cuda_library( deps = [ "//tensorflow/core:lib", "//tensorflow/core:platform_base", - ] + if_cuda_is_configured_compat([ - "//tensorflow/stream_executor/cuda:cupti_stub", - ]), + ] + tf_additional_cupti_deps(), ) tf_cuda_library( @@ -104,9 +106,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = [ ":cupti_interface", - ] + if_cuda_is_configured_compat([ - "//tensorflow/stream_executor/cuda:cupti_stub", - ]), + ] + tf_additional_cupti_deps(), ) tf_cuda_library( diff --git a/tensorflow/stream_executor/build_defs.bzl b/tensorflow/stream_executor/build_defs.bzl index 3cb24f8468f..830f526dbd7 100644 --- a/tensorflow/stream_executor/build_defs.bzl +++ b/tensorflow/stream_executor/build_defs.bzl @@ -10,6 +10,9 @@ def tf_additional_cuda_platform_deps(): def tf_additional_cuda_driver_deps(): return [":cuda_stub"] +def tf_additional_cupti_deps(): + return ["//tensorflow/stream_executor/cuda:cupti_stub"] + def tf_additional_cudnn_plugin_deps(): return [] diff --git a/tensorflow/stream_executor/cuda/cupti_stub.cc b/tensorflow/stream_executor/cuda/cupti_stub.cc index 130c3f96e44..feea09e002a 100644 --- a/tensorflow/stream_executor/cuda/cupti_stub.cc +++ b/tensorflow/stream_executor/cuda/cupti_stub.cc @@ -23,12 +23,16 @@ limitations under the License. namespace { // Returns DSO handle or null if loading the DSO fails. void* GetDsoHandle() { +#if defined(PLATFORM_GOOGLE) && (CUDA_VERSION > 10000) + return nullptr; +#else static auto handle = []() -> void* { auto handle_or = stream_executor::internal::DsoLoader::GetCuptiDsoHandle(); if (!handle_or.ok()) return nullptr; return handle_or.ValueOrDie(); }(); return handle; +#endif } template From 9c21803bcd892b799210a6e243768ee5421f4066 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Fri, 29 May 2020 12:13:22 -0700 Subject: [PATCH 1404/1533] Add unit tests for TPUEmbedding API. 
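The new tests exercise the mid-level TPUEmbedding API roughly as follows; this is a minimal CPU-only sketch assembled from the same constructs the test files use (table and feature names, sizes, and lookup ids are illustrative):

    import tensorflow as tf
    from tensorflow.python.ops import init_ops_v2
    from tensorflow.python.tpu import tpu_embedding_v2
    from tensorflow.python.tpu import tpu_embedding_v2_utils

    table = tpu_embedding_v2_utils.TableConfig(
        vocabulary_size=8, dim=4, initializer=init_ops_v2.Constant(0.1),
        combiner='sum', name='video')
    feature_config = (
        tpu_embedding_v2_utils.FeatureConfig(table=table, name='watched'),)

    mid_level = tpu_embedding_v2.TPUEmbedding(
        feature_config=feature_config,
        batch_size=2,
        optimizer=tpu_embedding_v2_utils.SGD(learning_rate=0.1))

    # On CPU the embedding tables are plain variables and can be queried
    # through cpu_embedding_lookup, which is what the *_cpu_test below checks.
    features = (tf.constant([0, 3], dtype=tf.int32),)
    activations = tpu_embedding_v2.cpu_embedding_lookup(
        features, weights=None, tables=mid_level.embedding_tables,
        feature_config=feature_config)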
PiperOrigin-RevId: 313826075 Change-Id: Ib1a37509944106bb5bee9f006e2ad632cd835136 --- tensorflow/python/tpu/BUILD | 60 + .../python/tpu/tpu_embedding_v2_cpu_test.py | 239 +++ .../python/tpu/tpu_embedding_v2_test.py | 1397 +++++++++++++++++ .../python/tpu/tpu_embedding_v2_test_lib.py | 96 ++ 4 files changed, 1792 insertions(+) create mode 100644 tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py create mode 100644 tensorflow/python/tpu/tpu_embedding_v2_test.py create mode 100644 tensorflow/python/tpu/tpu_embedding_v2_test_lib.py diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD index d398396ec2a..ea1317536a4 100644 --- a/tensorflow/python/tpu/BUILD +++ b/tensorflow/python/tpu/BUILD @@ -468,6 +468,66 @@ py_library( ], ) +py_library( + name = "tpu_embedding_v2_test_lib", + srcs = ["tpu_embedding_v2_test_lib.py"], + srcs_version = "PY2AND3", + deps = [ + ":tpu_embedding_v2_utils", + "//tensorflow/python:client_testlib", + "//tensorflow/python:init_ops_v2", + ], +) + +tpu_py_test( + name = "tpu_embedding_v2_test", + srcs = [ + "tpu_embedding_v2_test.py", + ], + disable_experimental = True, + python_version = "PY3", + shard_count = 4, + srcs_version = "PY2AND3", + deps = [ + ":tpu_embedding", + ":tpu_embedding_v2", + ":tpu_embedding_v2_test_lib", + ":tpu_strategy_util", + "//tensorflow/python:init_ops_v2", + "//tensorflow/python:tensor_spec", + "//tensorflow/python:variables", + "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/distribute:distribute_lib", + "//tensorflow/python/distribute:tpu_strategy", + "//tensorflow/python/distribute/cluster_resolver:tpu_cluster_resolver_py", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/eager:remote", + "//tensorflow/python/ops/ragged:ragged_tensor", + "//tensorflow/python/saved_model", + "//tensorflow/python/training/tracking:util", + "//third_party/py/numpy", + ], +) + +tf_py_test( + name = "tpu_embedding_v2_cpu_test", + srcs = [ + "tpu_embedding_v2_cpu_test.py", + ], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":tpu_embedding_v2", + ":tpu_embedding_v2_test_lib", + "//tensorflow/python:init_ops_v2", + "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/ops/ragged:ragged_tensor", + "//third_party/py/numpy", + ], +) + tf_proto_library( name = "tensor_tracer_proto", srcs = ["tensor_tracer.proto"], diff --git a/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py b/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py new file mode 100644 index 00000000000..a06f48187c9 --- /dev/null +++ b/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py @@ -0,0 +1,239 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for TPU Embeddings mid level API on CPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.compat import v2_compat +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import test +from tensorflow.python.tpu import tpu_embedding_v2 +from tensorflow.python.tpu import tpu_embedding_v2_test_lib +from tensorflow.python.tpu import tpu_embedding_v2_utils +from tensorflow.python.util import nest + + +class CPUEmbeddingTest(tpu_embedding_v2_test_lib.EmbeddingTestBase): + + def setUp(self): + super(CPUEmbeddingTest, self).setUp() + self._create_initial_data() + + def _create_mid_level(self): + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) + return tpu_embedding_v2.TPUEmbedding( + feature_config=self.feature_config, + batch_size=self.batch_size, + optimizer=optimizer) + + def _get_dense_tensors(self, dtype=dtypes.int32): + feature0 = constant_op.constant(self.feature_watched_values, dtype=dtype) + feature1 = constant_op.constant(self.feature_favorited_values, dtype=dtype) + feature2 = constant_op.constant(self.feature_friends_values, dtype=dtype) + return (feature0, feature1, feature2) + + def test_cpu_dense_lookup(self): + mid_level = self._create_mid_level() + features = self._get_dense_tensors() + results = tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=None, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + all_lookups = [] + for feature, config in zip(nest.flatten(features), self.feature_config): + table = mid_level.embedding_tables[config.table].numpy() + all_lookups.append(table[feature.numpy()]) + self.assertAllClose(results, nest.pack_sequence_as(results, all_lookups)) + + def test_cpu_dense_lookup_with_weights(self): + mid_level = self._create_mid_level() + features = self._get_dense_tensors() + weights = self._get_dense_tensors(dtype=dtypes.float32) + + with self.assertRaisesRegex( + ValueError, 'Weight specified for .*, but input is dense.'): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + + def _get_sparse_tensors(self, dtype=dtypes.int32): + feature0 = sparse_tensor.SparseTensor( + indices=self.feature_watched_indices, + values=constant_op.constant(self.feature_watched_values, dtype=dtype), + dense_shape=[self.data_batch_size, 2]) + feature1 = sparse_tensor.SparseTensor( + indices=self.feature_favorited_indices, + values=constant_op.constant(self.feature_favorited_values, dtype=dtype), + dense_shape=[self.data_batch_size, 2]) + feature2 = sparse_tensor.SparseTensor( + indices=self.feature_friends_indices, + values=constant_op.constant(self.feature_friends_values, dtype=dtype), + dense_shape=[self.data_batch_size, 3]) + return (feature0, feature1, feature2) + + def test_cpu_sparse_lookup(self): + mid_level = self._create_mid_level() + features = self._get_sparse_tensors() + results = tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=None, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + reduced = [] + for feature, config in zip(nest.flatten(features), self.feature_config): + table = 
mid_level.embedding_tables[config.table].numpy() + all_lookups = table[feature.values.numpy()] + # With row starts we can use reduceat in numpy. Get row starts from the + # ragged tensor API. + ragged = ragged_tensor.RaggedTensor.from_sparse(feature) + row_starts = ragged.row_starts().numpy() + reduced.append(np.add.reduceat(all_lookups, row_starts)) + if config.table.combiner == 'mean': + # for mean, divide by the row lengths. + reduced[-1] /= np.expand_dims(ragged.row_lengths().numpy(), axis=1) + self.assertAllClose(results, nest.pack_sequence_as(results, reduced)) + + def test_cpu_sparse_lookup_with_weights(self): + mid_level = self._create_mid_level() + features = self._get_sparse_tensors() + weights = self._get_sparse_tensors(dtype=dtypes.float32) + results = tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + weighted_sum = [] + for feature, weight, config in zip(nest.flatten(features), + nest.flatten(weights), + self.feature_config): + table = mid_level.embedding_tables[config.table].numpy() + # Expand dims here needed to broadcast this multiplication properly. + weight = np.expand_dims(weight.values.numpy(), axis=1) + all_lookups = table[feature.values.numpy()] * weight + # With row starts we can use reduceat in numpy. Get row starts from the + # ragged tensor API. + row_starts = ragged_tensor.RaggedTensor.from_sparse(feature).row_starts() + row_starts = row_starts.numpy() + weighted_sum.append(np.add.reduceat(all_lookups, row_starts)) + if config.table.combiner == 'mean': + weighted_sum[-1] /= np.add.reduceat(weight, row_starts) + self.assertAllClose(results, nest.pack_sequence_as(results, + weighted_sum)) + + def test_cpu_sparse_lookup_with_non_sparse_weights(self): + mid_level = self._create_mid_level() + features = self._get_sparse_tensors() + weights = self._get_dense_tensors(dtype=dtypes.float32) + with self.assertRaisesRegex( + ValueError, 'but it does not match type of the input which is'): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + + def _get_ragged_tensors(self, dtype=dtypes.int32): + feature0 = ragged_tensor.RaggedTensor.from_row_lengths( + values=constant_op.constant(self.feature_watched_values, dtype=dtype), + row_lengths=self.feature_watched_row_lengths) + feature1 = ragged_tensor.RaggedTensor.from_row_lengths( + values=constant_op.constant(self.feature_favorited_values, dtype=dtype), + row_lengths=self.feature_favorited_row_lengths) + feature2 = ragged_tensor.RaggedTensor.from_row_lengths( + values=constant_op.constant(self.feature_friends_values, dtype=dtype), + row_lengths=self.feature_friends_row_lengths) + return (feature0, feature1, feature2) + + def test_cpu_ragged_lookup_with_weights(self): + mid_level = self._create_mid_level() + features = self._get_ragged_tensors() + weights = self._get_ragged_tensors(dtype=dtypes.float32) + results = tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + weighted_sum = [] + for feature, weight, config in zip(nest.flatten(features), + nest.flatten(weights), + self.feature_config): + table = mid_level.embedding_tables[config.table].numpy() + # Expand dims here needed to broadcast this multiplication properly. 
+ weight = np.expand_dims(weight.values.numpy(), axis=1) + all_lookups = table[feature.values.numpy()] * weight + row_starts = feature.row_starts().numpy() + weighted_sum.append(np.add.reduceat(all_lookups, row_starts)) + if config.table.combiner == 'mean': + weighted_sum[-1] /= np.add.reduceat(weight, row_starts) + self.assertAllClose(results, nest.pack_sequence_as(results, + weighted_sum)) + + def test_cpu_invalid_structure_for_features(self): + mid_level = self._create_mid_level() + # Remove one element of the tuple, self.feature_config has 3 so we need to + # pass 3. + features = tuple(self._get_sparse_tensors()[:2]) + with self.assertRaises(ValueError): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=None, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + + def test_cpu_invalid_structure_for_weights(self): + mid_level = self._create_mid_level() + features = self._get_sparse_tensors() + # Remove one element of the tuple, self.feature_config has 3 so we need to + # pass 3 (or None). + weights = tuple(self._get_dense_tensors(dtype=dtypes.float32)[:2]) + with self.assertRaises(ValueError): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + + def test_cpu_sequence_lookup(self): + feature_config = ( + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_video, name='watched', max_sequence_length=2),) + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) + mid_level = tpu_embedding_v2.TPUEmbedding( + feature_config=feature_config, + batch_size=self.batch_size, + optimizer=optimizer) + features = tuple(self._get_sparse_tensors()[:1]) + with self.assertRaisesRegex( + ValueError, 'Sequence features unsupported at this time.'): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=None, + tables=mid_level.embedding_tables, + feature_config=feature_config) + + +if __name__ == '__main__': + v2_compat.enable_v2_behavior() + test.main() diff --git a/tensorflow/python/tpu/tpu_embedding_v2_test.py b/tensorflow/python/tpu/tpu_embedding_v2_test.py new file mode 100644 index 00000000000..5bfbdcb1c8a --- /dev/null +++ b/tensorflow/python/tpu/tpu_embedding_v2_test.py @@ -0,0 +1,1397 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for TPU Embeddings mid level API on TPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import itertools +import os + +from absl import flags +from absl.testing import parameterized +import numpy as np + +from tensorflow.python.compat import v2_compat +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import distribution_strategy_context +from tensorflow.python.distribute import tpu_strategy +from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver +from tensorflow.python.eager import backprop +from tensorflow.python.eager import def_function +from tensorflow.python.eager import remote +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_spec +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import init_ops_v2 +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables as tf_variables +from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import test +from tensorflow.python.saved_model import load +from tensorflow.python.saved_model import save +from tensorflow.python.tpu import tpu_embedding +from tensorflow.python.tpu import tpu_embedding_v2 +from tensorflow.python.tpu import tpu_embedding_v2_test_lib +from tensorflow.python.tpu import tpu_embedding_v2_utils +from tensorflow.python.tpu import tpu_strategy_util +from tensorflow.python.training import checkpoint_utils +from tensorflow.python.training.tracking import util +from tensorflow.python.util import nest + + +FLAGS = flags.FLAGS +flags.DEFINE_string('tpu', '', 'Name of TPU to connect to.') +flags.DEFINE_string('project', None, 'Name of GCP project with TPU.') +flags.DEFINE_string('zone', None, 'Name of GCP zone with TPU.') +flags.DEFINE_string('model_dir', os.environ.get('TEST_TMPDIR'), + 'A temporary directory.') + + +class TPUEmbeddingCheckpointTest(parameterized.TestCase, test.TestCase): + + def setUp(self): + super(TPUEmbeddingCheckpointTest, self).setUp() + self.resolver = tpu_cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project) + remote.connect_to_cluster(self.resolver) + tpu_strategy_util.initialize_tpu_system(self.resolver) + self.strategy = tpu_strategy.TPUStrategy(self.resolver) + self.num_rows = self.strategy.num_replicas_in_sync + + # These tests use two mid level API objects, initialized with different + # values. These have the same sizes. 
+ with self.strategy.scope(): + self.first_mid_level_contents = np.ones((self.num_rows, 4)) + self.first_mid_level_optimizer = tpu_embedding_v2_utils.SGD( + learning_rate=0.1) + self.first_mid_level = self.build_mid_level( + self.first_mid_level_contents, self.first_mid_level_optimizer) + + self.second_mid_level_contents = np.ones((self.num_rows, 4)) * 2 + self.second_mid_level_optimizer = tpu_embedding_v2_utils.SGD( + learning_rate=0.1) + self.second_mid_level = self.build_mid_level( + self.second_mid_level_contents, self.second_mid_level_optimizer, + initialize_tpu_embedding=False) + + self.cpu_mid_level_optimizer = tpu_embedding_v2_utils.SGD( + learning_rate=0.1) + self.cpu_mid_level = self.build_mid_level( + self.second_mid_level_contents, self.cpu_mid_level_optimizer) + + def tearDown(self): + tpu_strategy_util.shutdown_tpu_system(self.resolver) + super(TPUEmbeddingCheckpointTest, self).tearDown() + + def test_checkpoint_save_retrieves(self): + # Ensure that the variables from the first model are loaded. + self.first_mid_level._load_variables() + + self.assertAllClose( + self.first_mid_level_contents, + self.make_checkpoint_and_get_embedding('before_load', + self.first_mid_level), + msg='Checkpoint should contain values from the first api object.') + + self.second_mid_level._load_variables() + + # When we load the variables from the second mid level API object to the TPU + # we expect that checkpointing the first mid level API object will now + # retrieve the values from the TPU which are now different from the current + # variables in the first mid level. + self.assertAllClose( + self.second_mid_level_contents, + self.make_checkpoint_and_get_embedding('after_load', + self.first_mid_level), + msg='Checkpoint should contain values from the second api object.') + + def test_checkpoint_restore_loads(self): + + def get_values(mid): + return ops.convert_to_tensor( + mid._variables['table']['parameters'].variables[0]) + + self.first_mid_level._load_variables() + + first_checkpoint = util.Checkpoint(model=self.first_mid_level) + first_checkpoint.save(_get_tmpdir('restore', 'save')) + + # Checkpoint now has values from first_mid_level. See first assert in + # test_checkpoint_save_retrieves. + + self.second_mid_level._load_variables() + + self.assertAllClose( + self.second_mid_level_contents, + get_values(self.second_mid_level), + msg='Second mid level api should contain its initial values.', + ) + + # We restore the checkpoint of our first model into our second model. + # This should load the first mid level API object onto the TPU. + second_checkpoint = util.Checkpoint(model=self.second_mid_level) + second_checkpoint.restore(_get_tmpdir('restore', 'save-1')) + + # Call retrieve here as a way to check what the TPU contains contains. + # Calling the retrieve ops directly might make for a cleaner separation of + # test and module, though. + self.second_mid_level._retrieve_variables() + + self.assertAllClose( + self.first_mid_level_contents, + get_values(self.second_mid_level), + msg='Second mid level api should have retrieved the first model values.' 
+ ) + + def build_mid_level(self, embedding_values, optimizer, + initialize_tpu_embedding=True): + """Creates an embedding api object initialized to embedding_values.""" + initializer = init_ops_v2.Constant(embedding_values) + + table = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=self.num_rows, dim=4, initializer=initializer, + combiner='sum', name='table') + feature_config = (tpu_embedding_v2_utils.FeatureConfig( + table=table, name='feature'),) + + # batch_size here does not matter as we aren't traininig in any of these + # tests. + return tpu_embedding_v2.TPUEmbedding( + feature_config, 64, optimizer, + initialize_tpu_embedding=initialize_tpu_embedding) + + def make_checkpoint_and_get_embedding(self, name, model): + """Saves model to checkpoint name, retrieves embedding variables.""" + checkpoint = util.Checkpoint(model=model) + checkpoint.save(_get_tmpdir(name, 'save')) + + # Get the name of the parameters variable which should be the only + # [self.num_rows, 4] shaped tensor in the checkpoint. Note that we do this + # as the key can change. + variables = checkpoint_utils.list_variables(_get_tmpdir(name)) + variables = [name for name, size in variables if size == [self.num_rows, 4]] + if len(variables) != 1: + raise RuntimeError('Found {} copies of the parameter variable in the ' + 'checkpoint. Exactly one copy exported.'.format( + len(variables))) + return checkpoint_utils.load_variable(_get_tmpdir(name), variables[0]) + + def test_model_export_cpu(self): + self.first_mid_level._load_variables() + + tpu_checkpoint = util.Checkpoint(model=self.first_mid_level) + tpu_checkpoint.save(_get_tmpdir('export_cpu', 'save')) + + # We restore the checkpoint of our tpu mid level onto our cpu mid level. + cpu_checkpoint = util.Checkpoint(model=self.cpu_mid_level) + cpu_checkpoint.restore(_get_tmpdir('export_cpu', 'save-1')) + + @def_function.function + def serve_tensors(features): + features = tpu_embedding_v2.cpu_embedding_lookup( + features, None, self.cpu_mid_level.embedding_tables, + self.cpu_mid_level._feature_config) + return features[0] + + signatures = { + 'serving_default': + serve_tensors.get_concrete_function( + (tensor_spec.TensorSpec( + shape=(2,), dtype=dtypes.int32, name='feature'),))} + save.save(self.cpu_mid_level, + export_dir=_get_tmpdir('export_cpu', 'exported_model'), + signatures=signatures) + + imported = load.load(_get_tmpdir('export_cpu', 'exported_model')) + predict_fn = imported.signatures['serving_default'] + + input_feature_value = np.array([1, 0]) + input_batch = (constant_op.constant(input_feature_value, + dtype=dtypes.int32),) + prediction = predict_fn(*input_batch)['output_0'] + self.assertAllClose(prediction.numpy(), + self.first_mid_level_contents[input_feature_value]) + + @parameterized.parameters(tpu_embedding_v2_utils.SGD, + tpu_embedding_v2_utils.Adagrad, + tpu_embedding_v2_utils.Adam) + def test_check_checkpoint_variable_names_are_same_on_cpu_and_tpu(self, + optimizer): + # Reinitialize the TPU so that we can re-initialize the embeddings with the + # given optimizer. 
+ tpu_strategy_util.initialize_tpu_system(self.resolver) + optimizer = optimizer(learning_rate=0.1) + + with self.strategy.scope(): + tpu_mid_level = self.build_mid_level( + self.first_mid_level_contents, optimizer) + + tpu_checkpoint = util.Checkpoint(model=tpu_mid_level) + tpu_checkpoint.save(_get_tmpdir('save-tpu', 'save')) + tpu_variables = checkpoint_utils.list_variables(_get_tmpdir('save-tpu')) + + cpu_mid_level = self.build_mid_level( + self.first_mid_level_contents, optimizer) + + cpu_checkpoint = util.Checkpoint(model=cpu_mid_level) + cpu_checkpoint.save(_get_tmpdir('save-cpu', 'save')) + cpu_variables = checkpoint_utils.list_variables(_get_tmpdir('save-cpu')) + + self.assertAllEqual(tpu_variables, cpu_variables) + + +class TPUEmbeddingTest(parameterized.TestCase, + tpu_embedding_v2_test_lib.EmbeddingTestBase): + + def setUp(self): + super(TPUEmbeddingTest, self).setUp() + self._create_initial_data() + self.resolver = None + + def tearDown(self): + if self.resolver: + tpu_strategy_util.shutdown_tpu_system(self.resolver) + super(TPUEmbeddingTest, self).tearDown() + + def test_tables_with_same_name(self): + with self.assertRaisesRegex( + ValueError, 'Multiple tables with name table found.'): + with self._get_strategy().scope(): + tpu_embedding_v2.TPUEmbedding( + (tpu_embedding_v2_utils.FeatureConfig( + table=tpu_embedding_v2_utils.TableConfig( + name='table', + vocabulary_size=4, + dim=2, + initializer=self.initializer,), + name='watched'), + tpu_embedding_v2_utils.FeatureConfig( + table=tpu_embedding_v2_utils.TableConfig( + name='table', + vocabulary_size=4, + dim=2, + initializer=self.initializer), + name='favorited')), + self.batch_size, + tpu_embedding_v2_utils.SGD(learning_rate=0.1)) + + def test_unsupported_optimizer(self): + with self.assertRaisesRegex( + ValueError, 'is an unsupported optimizer class.'): + with self._get_strategy().scope(): + tpu_embedding_v2.TPUEmbedding( + self.feature_config, self.batch_size, + tpu_embedding.AdagradParameters(learning_rate=0.1)) + + def test_pass_non_tensor_to_apply_gradients(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + @def_function.function + def test_apply(): + mid_level_api.apply_gradients((1, 2, 3)) + + with self.assertRaisesRegex(ValueError, 'Expected Tensor.'): + strategy.run(test_apply) + + def test_pass_different_structure_to_apply_gradients(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + @def_function.function + def test_apply(): + # This should be a tuple as feature_config is a tuple of 3 configs. 
+ mid_level_api.apply_gradients([1, 2, 3]) + + with self.assertRaisesRegex( + TypeError, + 'The two structures don\'t have the same nested structure.'): + strategy.run(test_apply) + + def test_pass_none_to_apply_gradients(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + dataset = self._create_sparse_dataset(strategy) + data = next(iter(strategy.experimental_distribute_dataset(dataset))) + + @def_function.function + def embedding_and_set_gradients(data): + mid_level_api.enqueue(data) + def tpu_fn(): + results = mid_level_api.dequeue() + mid_level_api.apply_gradients((None, None, + array_ops.ones_like(results[2]))) + return results + return strategy.run(tpu_fn) + + @def_function.function + def embedding_only(data): + mid_level_api.enqueue(data, training=False) + def tpu_fn(): + return mid_level_api.dequeue() + return strategy.run(tpu_fn) + + first = self._get_replica_numpy( + embedding_and_set_gradients(data), strategy, 0) + second = self._get_replica_numpy(embedding_only(data), strategy, 0) + + # First two features should be the same as None gradient was applied. + # Third feature had gradient of 1 passed in from each core. + # Each core received the same ids per core and returned the following batch: + # [ row 3, row 0 + row 1 + row 2 ] + # so gradient update was (learning rate = 0.1): + # row 0: -1/3*0.1 + # row 1: -1/3*0.1 + # row 2: -1/3*0.1 + # row 3: -1*0.1 + # There is a factor of num_replicas because each replica gave an update. + + num_replicas = strategy.num_replicas_in_sync + update = ([[0.0]], [[0.0]], + [[0.1 * num_replicas], [0.1 / 3 * num_replicas]]) + golden = tuple([feature-np.array(up) for feature, up in zip(first, update)]) + + self.assertAllClose(golden, second) + + def _get_strategy(self): + self.resolver = tpu_cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project) + remote.connect_to_cluster(self.resolver) + tpu_strategy_util.initialize_tpu_system(self.resolver) + return tpu_strategy.TPUStrategy(self.resolver) + + def test_dequeue_on_cpu(self): + mid_level_api = self._create_mid_level() + with self.assertRaises(RuntimeError): + mid_level_api.dequeue() + + def test_enqueue_on_cpu(self): + mid_level_api = self._create_mid_level() + features = { + 'watched': sparse_tensor.SparseTensor( + indices=self.feature_watched_indices, + values=self.feature_watched_values, + dense_shape=[2, 2])} + with self.assertRaises(RuntimeError): + mid_level_api.enqueue(features) + + def test_apply_gradients_on_cpu(self): + mid_level_api = self._create_mid_level() + with self.assertRaises(RuntimeError): + mid_level_api.enqueue(None) + + def test_get_embedding_tables_on_cpu(self): + mid_level_api = self._create_mid_level() + self.assertEqual( + set(mid_level_api.embedding_tables.keys()), + set([self.table_video, self.table_user])) + + def test_get_embedding_tables_on_tpu(self): + with self._get_strategy().scope(): + mid_level_api = self._create_mid_level() + with self.assertRaises(RuntimeError): + mid_level_api.embedding_tables() + + def test_enqueue_weight_for_dense_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + input_fn = self._create_dense_input_fn(strategy, include_weights=True) + dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist_iter = iter(dist) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + features, weights = next(dist_iter) + mid_level_api.enqueue(features, weights=weights, training=False) + 
return strategy.run(step) + + with self.assertRaisesRegex(ValueError, 'Weight specified for dense input'): + test_fn() + + def test_enqueue_wrong_weight_type_for_sparse_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy) + ragged = self._create_ragged_dataset(strategy, include_weights=True) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + features = next(sparse_iter) + _, weights = next(ragged_iter) + mid_level_api.enqueue(features, weights=weights, training=False) + return strategy.run(step) + + with self.assertRaisesRegex( + ValueError, 'which does not match type input which is SparseTensor.'): + test_fn() + + def test_enqueue_wrong_weight_type_for_ragged_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy, include_weights=True) + ragged = self._create_ragged_dataset(strategy) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + _, weights = next(sparse_iter) + features = next(ragged_iter) + mid_level_api.enqueue(features, weights=weights, training=False) + return strategy.run(step) + + with self.assertRaisesRegex( + ValueError, 'which does not match type input which is RaggedTensor.'): + test_fn() + + def test_enqueue_sparse_and_ragged(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy) + ragged = self._create_ragged_dataset(strategy) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + sparse_features = next(sparse_iter) + ragged_features = next(ragged_iter) + features = (sparse_features[0], ragged_features[1], sparse_features[2]) + mid_level_api.enqueue(features, training=False) + return strategy.run(step) + + with self.assertRaisesRegex( + ValueError, 'Found both SparseTensors and RaggedTensors'): + test_fn() + + def test_enqueue_incorrect_structure_for_features(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + features = next(sparse_iter) + features = (features[0],) + mid_level_api.enqueue(features, training=False) + return strategy.run(step) + + # The error here is raised from nest.assert_same_structure + with self.assertRaises(ValueError): + test_fn() + + def test_enqueue_incorrect_structure_for_weights(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy, include_weights=True) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + features, weights = next(sparse_iter) + weights = (weights[0],) + mid_level_api.enqueue(features, weights=weights, 
training=False) + return strategy.run(step) + + # The error here is raised from nest.assert_same_structure + with self.assertRaises(ValueError): + test_fn() + + def test_enqueue_ragged_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy) + ragged = self._create_ragged_dataset(strategy) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + + @def_function.function + def test_fn(): + def get_activations(): + return mid_level_api.dequeue() + + sparse_features = next(sparse_iter) + ragged_features = next(ragged_iter) + mid_level_api.enqueue(sparse_features, training=False) + sparse_activations = strategy.run(get_activations) + mid_level_api.enqueue(ragged_features, training=False) + ragged_activations = strategy.run(get_activations) + return sparse_activations, ragged_activations + + sparse_activations, ragged_activations = test_fn() + + # Extact per core numpy arrays and check that both sparse and ragged have + # the same results. + sparse0 = self._get_replica_numpy(sparse_activations, strategy, 0) + ragged0 = self._get_replica_numpy(ragged_activations, strategy, 0) + self.assertAllClose(sparse0, ragged0) + + @parameterized.parameters(True, False) + def test_enqueue_with_weights(self, ragged): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + weight = 0.5 + if ragged: + dataset = self._create_ragged_dataset(strategy, include_weights=True, + weight=weight) + else: + dataset = self._create_sparse_dataset(strategy, include_weights=True, + weight=weight) + + dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + + @def_function.function + def enqueue_and_get(features, weights): + def get_activations(): + return mid_level_api.dequeue() + mid_level_api.enqueue(features, weights=weights, training=False) + return strategy.run(get_activations) + + features, weights = next(dataset_iter) + # Replace the weight for the second feature by None to test. + weights = (weights[0], None, weights[2]) + + no_weights_activations = enqueue_and_get(features, weights=None) + weights_activations = enqueue_and_get(features, weights=weights) + + # Extact per core numpy arrays. + no_weights0 = self._get_replica_numpy(no_weights_activations, strategy, 0) + weights0 = self._get_replica_numpy(weights_activations, strategy, 0) + # videos table has sum combiner and users table has mean combiner. + # i.e. users table lookups isn't affected by the weights as all the weights + # are the same. + # Tuple entry 0 and 1 are the watched and favorited features from the videos + # table and entry 2 is the friends feature from the users table. + # Note that None was passed as a weight for entry 1 so weight should have no + # effect. 
+ weight = (0.5, 1.0, 1.0) + golden = tuple([no_weight * w for no_weight, w in zip(no_weights0, weight)]) + + self.assertAllClose(golden, weights0) + + def test_enqueue_with_outside_compilation(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + dataset = self._create_sparse_dataset(strategy) + dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + + @def_function.function + def enqueue_with_outside_compilation(data): + def get_activations(features): + mid_level_api.enqueue(features, training=False) + return mid_level_api.dequeue() + return strategy.run(get_activations, args=(data,)) + + @def_function.function + def enqueue_without_outside_compilation(data): + def get_activations(): + return mid_level_api.dequeue() + mid_level_api.enqueue(data, training=False) + return strategy.run(get_activations) + + features = next(dataset_iter) + + activations_oc = enqueue_with_outside_compilation(features) + activations = enqueue_without_outside_compilation(features) + + # Extact per core numpy arrays. + activations_oc0 = self._get_replica_numpy(activations_oc, strategy, 0) + activations0 = self._get_replica_numpy(activations, strategy, 0) + + self.assertAllClose(activations_oc0, activations0) + + def test_enqueue_with_outside_compilation_in_control_flow(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + dataset = self._create_sparse_dataset(strategy) + dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + + # This is one way to force the enqueue in some control flow. @tf.functions + # aren't inlined in the calling tf.function. An alternative would be to + # place the enqueue in a switch_v2 or something similar. + @def_function.function + def enqueue_fn(features): + mid_level_api.enqueue(features, training=False) + + @def_function.function + def enqueue_with_outside_compilation(): + def get_activations(features): + enqueue_fn(features) + return mid_level_api.dequeue() + return strategy.run(get_activations, args=(next(dataset_iter),)) + + with self.assertRaisesRegex( + RuntimeError, + 'does not match graph which contains TPUReplicateContext'): + enqueue_with_outside_compilation() + + def test_enqueue_with_outside_compilation_non_direct_input(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + dataset = self._create_sparse_dataset(strategy) + dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + + @def_function.function + def enqueue_with_outside_compilation(): + def get_activations(features): + # This inserts a mul operation on the TPU to trigger the direct input + # error. + features = (features[0]*2, features[1]*2, features[2]*2) + mid_level_api.enqueue(features, training=False) + return mid_level_api.dequeue() + return strategy.run(get_activations, args=(next(dataset_iter),)) + + with self.assertRaisesRegex( + ValueError, 'which does not have the `_tpu_input_identity` attr'): + enqueue_with_outside_compilation() + + def test_enqueue_with_outside_compilation_auto_mode(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + dataset = self._create_sparse_dataset(strategy) + dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + + @def_function.function + def enqueue_with_no_gradient_apply(data): + def get_activations(features): + # Note the lack of setting training=False, so training defaults to true + # here even though we don't have apply gradients. 
+ # We detect the correct mode based on which ops exist that share the + # same 'name'. + mid_level_api.enqueue(features, name='call1') + return mid_level_api.dequeue(name='call1') + return strategy.run(get_activations, args=(data,)) + + @def_function.function + def enqueue_with_gradient_apply(data): + def get_activations(features): + mid_level_api.enqueue(features, name='call2') + activations = mid_level_api.dequeue(name='call2') + # Apply an all ones gradient + gradients = nest.map_structure(array_ops.ones_like, activations) + mid_level_api.apply_gradients(gradients, name='call2') + return activations + return strategy.run(get_activations, args=(data,)) + + data = next(dataset_iter) + before_gradient_apply = enqueue_with_gradient_apply(data) + after_gradient_apply = enqueue_with_no_gradient_apply(data) + before_gradient_apply0 = self._get_replica_numpy(before_gradient_apply, + strategy, 0) + after_gradient_apply0 = self._get_replica_numpy(after_gradient_apply, + strategy, 0) + + num_replicas = strategy.num_replicas_in_sync + # We are passing a gradient of 1 for all lookups, optimizer is SGD with a + # learning rate of 0.1. Feature 0 and 1 are looked up with a sum combiner + # with the following ids: + # Feature 0: [0, 0, 1], [0, 1, 1], ... repeated over num_replicas + # Feature 1: [0, 1, 1], [0, 0, 1], ... repeated over num_replicas + # i.e. Row 0 and 1 were looked up 3*num_replicas times over all cores and as + # the gradient is 1, the accumulated gradient is 3*num_replicas for each + # position in row 0 and 1 in table. + # + # See comments in test_pass_none_to_apply_gradients for the update to + # Feature 2 and its table. + # The *2 in the next tests are because those rows have 2 lookups vs + # the 1 lookup in the other row. + update = ([[0.3 * num_replicas], [0.3 * num_replicas * 2]], + [[0.3 * num_replicas * 2], [0.3 * num_replicas]], + [[0.1 * num_replicas], [0.1 / 3 * num_replicas]]) + golden = tuple([before - np.array(up) for before, up in + zip(before_gradient_apply0, update)]) + + self.assertAllClose(golden, after_gradient_apply0) + + def _create_strategy_and_mid_level(self, optimizer_name): + strategy = self._get_strategy() + + with strategy.scope(): + if optimizer_name == 'sgd': + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) + elif optimizer_name == 'adagrad': + optimizer = tpu_embedding_v2_utils.Adagrad(learning_rate=0.1) + elif optimizer_name == 'adam': + optimizer = tpu_embedding_v2_utils.Adam(learning_rate=0.1) + else: + raise ValueError('optimizer is not recognized: ', optimizer_name) + mid_level_api = self._create_mid_level(optimizer=optimizer) + + return strategy, mid_level_api, optimizer + + @parameterized.parameters( + *itertools.product( + ['sgd', 'adagrad', 'adam'], + [True, False])) + def test_embedding(self, optimizer_name, training): + strategy, mid_level_api, optimizer = ( + self._create_strategy_and_mid_level(optimizer_name)) + + dataset = self._create_sparse_dataset(strategy) + dist = strategy.experimental_distribute_dataset(dataset) + dist_iter = iter(dist) + + @def_function.function + def test_fn(): + + def step(): + """Create and run computation that returns the embedding activations.""" + if not training: + activations = mid_level_api.dequeue() + total_loss = _get_total_loss_tensor(activations) + ret_val = [total_loss] + list(activations) + return ret_val + else: + with backprop.GradientTape() as tape: + activations = mid_level_api.dequeue() + tape.watch(activations) + total_loss = _get_total_loss_tensor(activations) + loss_per_replica = 
total_loss / strategy.num_replicas_in_sync + gradients = tape.gradient(loss_per_replica, activations) + mid_level_api.apply_gradients(gradients) + ret_val = [total_loss] + list(activations) + return ret_val + + mid_level_api.enqueue(next(dist_iter), training=training) + result = strategy.run(step) + return result + + # Run model. + shard_out_val = test_fn() + + # Retrieve TPU weights to CPU. + mid_level_api._retrieve_variables() + + # Compute sparse tensors for global batch. + input_data = next(iter(self._create_sparse_dataset(strategy))) + + # Check results. + self._check_results(strategy, shard_out_val, training, input_data, + mid_level_api._variables, + optimizer) + + def _create_mid_level(self, optimizer=None): + # Create `TPUEmbedding` object. + if optimizer is None: + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) + + num_replicas = ( + distribution_strategy_context.get_strategy().num_replicas_in_sync) + return tpu_embedding_v2.TPUEmbedding( + feature_config=self.feature_config, + batch_size=self.batch_size * num_replicas, + optimizer=optimizer) + + def _create_sparse_dataset(self, strategy, include_weights=False, weight=0.5): + # Create dataset for enqueue operation + sparse_features = ( + sparse_tensor.SparseTensor( + indices=self.feature_watched_indices, + values=self.feature_watched_values, + dense_shape=[self.data_batch_size, 2]), + sparse_tensor.SparseTensor( + indices=self.feature_favorited_indices, + values=self.feature_favorited_values, + dense_shape=[self.data_batch_size, 2]), + sparse_tensor.SparseTensor( + indices=self.feature_friends_indices, + values=self.feature_friends_values, + dense_shape=[self.data_batch_size, 3])) + if include_weights: + weights = [] + for sparse in sparse_features: + values = ( + array_ops.ones_like(sparse.values, dtype=dtypes.float32) * weight) + weights.append(sparse_tensor.SparseTensor( + indices=sparse.indices, + values=values, + dense_shape=sparse.dense_shape)) + sparse_features = (sparse_features, tuple(weights)) + + dataset = dataset_ops.DatasetV2.from_tensors(sparse_features) + + # Data is batched to self.data_batch_size, rebatch to global batch size. + return dataset.unbatch().repeat().batch( + self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True) + + def _create_ragged_dataset(self, strategy, include_weights=False, weight=0.5): + # Create dataset for enqueue operation + ragged_features = ( + ragged_tensor.RaggedTensor.from_row_lengths( + row_lengths=self.feature_watched_row_lengths, + values=self.feature_watched_values), + ragged_tensor.RaggedTensor.from_row_lengths( + row_lengths=self.feature_favorited_row_lengths, + values=self.feature_favorited_values), + ragged_tensor.RaggedTensor.from_row_lengths( + row_lengths=self.feature_friends_row_lengths, + values=self.feature_friends_values)) + if include_weights: + weights = [] + for ragged in ragged_features: + weights.append(ragged.with_values( + array_ops.ones_like(ragged.values, dtype=dtypes.float32) * weight)) + ragged_features = (ragged_features, tuple(weights)) + + dataset = dataset_ops.DatasetV2.from_tensors(ragged_features) + + # Data is batched to self.data_batch_size, rebatch to global batch size. 
+ return dataset.unbatch().repeat().batch( + self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True) + + def _create_dense_input_fn(self, strategy, include_weights=False, weight=0.5): + + def input_fn(ctx): + del ctx + features = ( + constant_op.constant(self.feature_watched_values[-2:], + dtype=dtypes.int32), + constant_op.constant(self.feature_favorited_values[-2:], + dtype=dtypes.int32), + constant_op.constant(self.feature_friends_values[-2:], + dtype=dtypes.int32)) + if include_weights: + weights = [array_ops.ones_like(t, dtype=dtypes.float32) * weight + for t in features] + features = (features, tuple(weights)) + return dataset_ops.DatasetV2.from_tensors(features).repeat() + + return input_fn + + def _check_results(self, strategy, shard_out_val, training, input_data, + table_to_variable, optimizer): + num_replicas = strategy.num_replicas_in_sync + + # Unpack the values `strategy.run()` returns. + loss = _unpack(strategy, shard_out_val[0]) + activation_watched = _unpack(strategy, shard_out_val[1]) + activation_favorited = _unpack(strategy, shard_out_val[2]) + activation_friends = _unpack(strategy, shard_out_val[3]) + + # Core 0: + # Calculate the values of embedding activations. + activation_watched_gold0 = np.array([[0, 1, 2, 3], [4, 6, 8, 10]]) + activation_favorited_gold0 = np.array([[4, 6, 8, 10], [4, 5, 6, 7]]) + # Second row of `activation_friends_gold0` is the mean of the following. + # row 0: 0 1 + # row 1: 2 3 + # row 2: 4 5 + activation_friends_gold0 = np.array([[6, 7], [2, 3]]) + + loss_gold0 = _compute_loss(activation_watched_gold0, + activation_favorited_gold0, + activation_friends_gold0) + + # Add on values from other cores: + # Activations for watched are an alternating sequence of + # activation_watched_gold0 and activation_favorited_gold0. + # For favorited it is the same but in the opposite order. + activation_watched_gold = np.concatenate( + (np.concatenate((np.expand_dims(activation_watched_gold0, axis=0),) * + (num_replicas // 2)), + np.concatenate((np.expand_dims(activation_favorited_gold0, axis=0),) * + (num_replicas // 2))), + axis=1).reshape([self.batch_size * num_replicas, 4]) + activation_favorited_gold = np.concatenate( + (activation_watched_gold[self.batch_size:,], + activation_watched_gold[0:self.batch_size,])) + activation_friends_gold = np.concatenate( + (activation_friends_gold0,) * num_replicas) + + loss_gold = [loss_gold0] * num_replicas + + # Test values. + self.assertAllClose(activation_watched_gold, activation_watched) + self.assertAllClose(activation_favorited_gold, activation_favorited) + self.assertAllClose(activation_friends_gold, activation_friends) + + self.assertAllClose(loss_gold, loss) + + embedding_table_video_before = np.copy( + np.reshape(self.embedding_values, [8, 4])) + embedding_table_user_before = np.copy( + np.reshape(self.embedding_values, [16, 2])) + + global_batch_size = self.batch_size * num_replicas + if training: + gradient_wrt_watched_gold = (2 * activation_watched_gold / + global_batch_size) + gradient_wrt_favorited_gold = (2 * activation_favorited_gold / + global_batch_size) + gradient_wrt_friends_gold = (2 * activation_friends_gold / + global_batch_size) + + # Calculate gradients wrt embedding tables. 
+ gradients_wrt_user = ( + _compute_gradients_wrt_embedding_table( + global_batch_size, gradient_wrt_friends_gold, + embedding_table_user_before, input_data[2].indices.numpy(), + input_data[2].values.numpy(), self.table_user.combiner)) + gradients_wrt_video = ( + _compute_gradients_wrt_embedding_table( + global_batch_size, gradient_wrt_favorited_gold, + embedding_table_video_before, input_data[1].indices.numpy(), + input_data[1].values.numpy(), self.table_video.combiner) + + _compute_gradients_wrt_embedding_table( + global_batch_size, gradient_wrt_watched_gold, + embedding_table_video_before, input_data[0].indices.numpy(), + input_data[0].values.numpy(), self.table_video.combiner)) + + self._check_embedding_and_slot_variables(embedding_table_user_before, + gradients_wrt_user, + embedding_table_video_before, + gradients_wrt_video, + optimizer, + table_to_variable) + + def _check_embedding_and_slot_variables(self, embedding_table_user_before, + gradients_wrt_user, + embedding_table_video_before, + gradients_wrt_video, + optimizer, + table_to_variable): + if isinstance(optimizer, tpu_embedding_v2_utils.SGD): + check_fn = self._check_embedding_and_slot_variables_for_sgd + elif isinstance(optimizer, tpu_embedding_v2_utils.Adagrad): + check_fn = self._check_embedding_and_slot_variables_for_adagrad + elif isinstance(optimizer, tpu_embedding_v2_utils.Adam): + check_fn = self._check_embedding_and_slot_variables_for_adam + else: + raise ValueError('optimizer is not recognized: ', type(optimizer)) + check_fn(embedding_table_user_before, gradients_wrt_user, + optimizer, table_to_variable[self.table_user.name]) + check_fn(embedding_table_video_before, gradients_wrt_video, + optimizer, table_to_variable[self.table_video.name]) + + def _check_embedding_and_slot_variables_for_sgd(self, embedding_table_before, + gradients, + optimizer, + variables): + embedding_table = np.copy(embedding_table_before) + embedding_table -= optimizer.learning_rate * np.sum(gradients, axis=0) + self.assertAllClose(_get_variable(variables['parameters']).numpy(), + embedding_table) + + def _check_embedding_and_slot_variables_for_adagrad(self, + embedding_table_before, + gradients, + optimizer, + variable): + embedding_table = np.copy(embedding_table_before) + accumulator = ( + optimizer.initial_accumulator_value + np.sum(gradients, axis=0)**2) + embedding_table -= ( + optimizer.learning_rate * np.sum(gradients, axis=0) / + np.sqrt(accumulator)) + self.assertAllClose(_get_variable(variable['parameters']).numpy(), + embedding_table) + self.assertAllClose(_get_variable(variable['accumulators']).numpy(), + accumulator) + + def _check_embedding_and_slot_variables_for_adam(self, embedding_table_before, + gradients, + optimizer, + variable): + embedding_table = np.copy(embedding_table_before) + g = np.sum(gradients, axis=0) + v = g**2 * (1 - optimizer.beta_2) + m = g * (1 - optimizer.beta_1) + epsilon = optimizer.epsilon + # TPU Embeddings don't have the LR decay factor for Adam. 
+ lr_modifier = 1 + embedding_table -= ( + m * optimizer.learning_rate * lr_modifier / (np.sqrt(v) + epsilon)) + self.assertAllClose(_get_variable(variable['parameters']).numpy(), + embedding_table, rtol=1e-4) + self.assertAllClose(_get_variable(variable['momenta']).numpy(), + m, rtol=1e-4) + self.assertAllClose(_get_variable(variable['velocities']).numpy(), + v, rtol=1e-4) + + def _get_replica_numpy(self, structured, strategy, replica_id): + def select_replica(x): + x = strategy.experimental_local_results(x) + if len(x) == 1: + return x.numpy() + return x[replica_id].numpy() + return nest.map_structure(select_replica, structured) + + def test_dense_lookup(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + input_fn = self._create_dense_input_fn(strategy) + dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist_iter = iter(dist) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + mid_level_api.enqueue(next(dist_iter), training=False) + return strategy.run(step) + + # Run model. + shard0 = self._get_replica_numpy(test_fn(), strategy, 0) + + # embedding_values is a linear list, so we reshape to match the correct + # shape of the corresponding table before performing the lookup. + numpy_videos = np.reshape(self.embedding_values, (8, 4)) + numpy_users = np.reshape(self.embedding_values, (16, 2)) + golden = ((numpy_videos[self.feature_watched_values[-2:]], + numpy_videos[self.feature_favorited_values[-2:]], + numpy_users[self.feature_friends_values[-2:]])) + self.assertAllClose(shard0, golden) + + def test_variable_learning_rate(self): + num_steps = 10 + num_steps_float = float(num_steps) + starting_lr = 1.0 + ending_lr = 0.5 + + strategy = self._get_strategy() + num_replicas = strategy.num_replicas_in_sync + + # Create model with Keras. + with strategy.scope(): + step_counter = tf_variables.Variable(0.0, dtypes.float32) + + def lr_function(): + return gen_math_ops.maximum( + ending_lr, + starting_lr + ((ending_lr - starting_lr) * step_counter) / + num_steps_float) + + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=lr_function) + table_config = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=num_replicas, + dim=4, + initializer=init_ops_v2.Constant(np.zeros((num_replicas, 4))), + combiner='sum', name='table') + mid_level_api = tpu_embedding_v2.TPUEmbedding( + feature_config={ + 'feature': tpu_embedding_v2_utils.FeatureConfig( + table=table_config, name='feature')}, + batch_size=num_replicas, + optimizer=optimizer) + + feature = {'feature': constant_op.constant([0], dtype=dtypes.int32)} + + def input_fn(ctx): + del ctx + return dataset_ops.DatasetV2.from_tensors(feature).repeat() + dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist_iter = iter(dist) + + @def_function.function + def test_fn(): + def step(): + with backprop.GradientTape() as tape: + activations = mid_level_api.dequeue() + tape.watch(activations) + result = math_ops.reduce_sum(activations['feature']) + loss = result / num_replicas + grads = tape.gradient(loss, activations) + mid_level_api.apply_gradients(grads) + return activations['feature'] + + mid_level_api.enqueue(next(dist_iter), training=True) + return strategy.run(step) + + # Run model. + results = [] + for _ in range(num_steps): + result = test_fn() + results.append(_unpack(strategy, result)) + step_counter.assign_add(1.0) + + # Table is 2 elements wide, per-replica batch size of 1, with id 0. 
+ # Loss for the gradient is the sum of the entries divided by the number of
+ # replicas. Thus the per replica gradient is 1/#of replicas for row 0 and no
+ # other updates. The reduced gradient is therefore 1.
+ # Learning rate schedule over num_steps steps:
+ # 1.0 0.95 0.9 0.85 0.8 ...
+ # Since we use SGD and the gradient is one, the first row of the table is
+ # [0, 0] [-1.0, -1.0] [-1.95, -1.95] [-2.85, -2.85] ... (the negative
+ # partial sums of the above).
+
+ learning_rates = [starting_lr - (starting_lr - ending_lr) / num_steps * j
+ for j in range(num_steps)]
+ cumsum = [sum(learning_rates[0:j]) for j in range(num_steps)]
+ goldens = [[[-cumsum[i]] * table_config.dim] * num_replicas
+ for i in range(10)]
+ self.assertAllClose(results, goldens)
+
+ @parameterized.parameters([True, False])
+ def test_optimizer_with_slot_creation_fn(self, use_tpu):
+ def slot_creation_fn(table, slot_names):
+ slots = {}
+ for slot in slot_names:
+ slots[slot] = tf_variables.Variable(
+ name='{}_{}'.format(table.name, slot),
+ initial_value=functools.partial(
+ init_ops_v2.Zeros(), shape=table.shape, dtype=dtypes.float32),
+ trainable=False)
+ return slots
+ optimizer = tpu_embedding_v2_utils.Adagrad(
+ learning_rate=0.1,
+ slot_variable_creation_fn=slot_creation_fn)
+ if use_tpu:
+ strategy = self._get_strategy()
+ else:
+ strategy = distribution_strategy_context.get_strategy()
+ num_replicas = strategy.num_replicas_in_sync
+ with strategy.scope():
+ mid_level = tpu_embedding_v2.TPUEmbedding(
+ feature_config=self.feature_config,
+ batch_size=self.batch_size * num_replicas,
+ optimizer=optimizer)
+ video_accumulator = mid_level._variables['video']['accumulators']
+ user_accumulator = mid_level._variables['user']['accumulators']
+ if use_tpu:
+ # To check the table contents (ensure that it is zero rather than the
+ # normal initial accumulator value specified in the optimizer config),
+ # we need to select the underlying table variable on TPU.
+ # We only have one shard on Forge.
+ video_accumulator = video_accumulator.variables[0]
+ user_accumulator = user_accumulator.variables[0]
+
+ self.assertAllClose(video_accumulator.numpy(),
+ np.zeros((self.table_video.vocabulary_size,
+ self.table_video.dim)))
+ self.assertAllClose(user_accumulator.numpy(),
+ np.zeros((self.table_user.vocabulary_size,
+ self.table_user.dim)))
+
+ def test_optimizer_with_slot_creation_fn_non_partial(self):
+ def slot_creation_fn(table, slot_names):
+ slots = {}
+ for slot in slot_names:
+ # Note that we don't pass functools.partial here, so on TPU we can't
+ # extract the shape. We expect the error below.
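+ # Because initial_value is a concrete tensor rather than a callable wrapped
+ # in functools.partial, the mid level API cannot recover the initializer
+ # function for the TPU variable, which triggers the
+ # 'Unable to extract initializer function' error asserted below.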
+ slots[slot] = tf_variables.Variable(
+ name='{}_{}'.format(table.name, slot),
+ initial_value=init_ops_v2.Zeros()(shape=table.shape,
+ dtype=dtypes.float32),
+ trainable=False)
+ return slots
+ optimizer = tpu_embedding_v2_utils.Adagrad(
+ learning_rate=0.1,
+ slot_variable_creation_fn=slot_creation_fn)
+ strategy = self._get_strategy()
+ num_replicas = strategy.num_replicas_in_sync
+ with strategy.scope():
+ with self.assertRaisesRegex(ValueError,
+ 'Unable to extract initializer function'):
+ tpu_embedding_v2.TPUEmbedding(
+ feature_config=self.feature_config,
+ batch_size=self.batch_size*num_replicas,
+ optimizer=optimizer)
+
+ def test_sequence_embeddings(self):
+ feature_config = (
+ tpu_embedding_v2_utils.FeatureConfig(
+ table=self.table_video, name='watched',
+ max_sequence_length=2),
+ tpu_embedding_v2_utils.FeatureConfig(
+ table=self.table_video, name='favorited',
+ max_sequence_length=2),
+ tpu_embedding_v2_utils.FeatureConfig(
+ table=self.table_user, name='friends',
+ max_sequence_length=3))
+ optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
+ strategy = self._get_strategy()
+ num_replicas = strategy.num_replicas_in_sync
+ with strategy.scope():
+ mid_level = tpu_embedding_v2.TPUEmbedding(
+ feature_config=feature_config,
+ batch_size=self.batch_size * num_replicas,
+ optimizer=optimizer)
+
+ dataset = self._create_sparse_dataset(strategy)
+ data = next(iter(strategy.experimental_distribute_dataset(dataset)))
+
+ @def_function.function
+ def embedding_and_set_gradients(data):
+ def tpu_fn():
+ activations = mid_level.dequeue()
+ mid_level.apply_gradients(nest.map_structure(array_ops.ones_like,
+ activations))
+ return activations
+ mid_level.enqueue(data)
+ return strategy.run(tpu_fn)
+
+ @def_function.function
+ def embedding_only(data):
+ def tpu_fn():
+ return mid_level.dequeue()
+ mid_level.enqueue(data)
+ return strategy.run(tpu_fn)
+
+ # Only check core 0.
+ before_update = self._get_replica_numpy(
+ embedding_and_set_gradients(data), strategy, 0)
+ after_update = self._get_replica_numpy(embedding_only(data), strategy, 0)
+
+ # For videos table, row 0 and row 1 are looked up 3*num_replicas times as
+ # they occur 3 times per replica (considering the features 0 and 1 which are
+ # both looked up in the videos table).
+ # Feature 0 has ids [0, 0, 1], [0, 1, 1], ... repeated over num_replicas
+ # Feature 1 has ids [0, 1, 1], [0, 0, 1], ... repeated over num_replicas
+ # This means that both rows 0 and 1 get a -0.1*3*num_replicas update
+ # For users table, each row is looked up twice:
+ # Feature 2 has ids [3, 0, 1, 2], .. repeated over num_replicas
+ # This means that we get a -0.1*num_replicas update to the third feature.
+
+ # In general this means that after the update, if we lookup feature 0 and 1
+ # the values will be 0.3*num_replicas lower per entry and for feature 2 they
+ # will be 0.1*num_replicas lower.
+ # The one issue is that these lookups contain padding values.
+ # For core 0, we get the first 2 elements of the 4 element batch.
+ # For feature 0, the indices are [[0, 0], [1, 0], [1, 1]] with max sequence
+ # length of 2, which means that [0, 1] will be 0s.
+ # For feature 1, the indices are [[0, 0], [0, 1], [1, 0]] with max sequence
+ # length of 2, which means that [1, 1] will be 0s.
+ # For feature 2, the indices are [[0, 0], [1, 0], [1, 1], [1, 2]] with max
+ # sequence length of 3, which means that [0, 1], [0, 2] will be 0s.
+ # The following masks represent that so that we only apply the above updates + # to the non-padding rows: + masks = ( + np.array([[[1], [0]], [[1], [1]]]), + np.array([[[1], [1]], [[1], [0]]]), + np.array([[[1], [0], [0]], [[1], [1], [1]]])) + + per_row_update = (0.3 * num_replicas, + 0.3 * num_replicas, + 0.1 * num_replicas) + golden = tuple([before - update * mask for before, update, mask in + zip(before_update, per_row_update, masks)]) + self.assertAllClose(golden, after_update) + + +def _compute_gradients_wrt_embedding_table(batch_size, + gradient_wrt_activation, + embedding_table, + feature_indices, + feature_values, + combiner, + max_sequence_length=0): + """Compute gradients wrt embedding_table. + + Args: + batch_size: `int`, batch size. + gradient_wrt_activation: `np.array` with shape `batch_size` by + embedding `dimension`. + embedding_table: `np.array` with shape `vocabulary_size` by embedding + `dimension`. + feature_indices: `indices` as used to construct `SparseTensor`. + feature_values: `values` as used to construct `SparseTensor`. + combiner: `String`, 'mean' or 'sum'. + max_sequence_length: If non-zero, a sequence feature with the given length. + + Returns: + Gradients wrt `embedding_table`, an `np.array`s with shape + `batch_size` by `vocabulary_size` by + embedding `dimension`. + + Raises: + ValueError: if `combiner` is not one of 'mean' or 'sum'. + """ + if combiner not in ('mean', 'sum'): + raise ValueError('`combiner` must be mean or sum; got {}.'.format(combiner)) + grads = [] + for i in range(batch_size): + grad = np.zeros_like(embedding_table) + count = 0 + for (batch_i, seq_index), vocabulary_id in zip(feature_indices, + feature_values): + if batch_i == i: + count += 1 + if max_sequence_length > 0: + if seq_index < max_sequence_length: + grad[vocabulary_id, :] += gradient_wrt_activation[i, seq_index, :] + else: + grad[vocabulary_id, :] += gradient_wrt_activation[i, :] + if combiner == 'mean' and not max_sequence_length: + grad = grad / count + grads.append(grad) + return np.stack(grads) + + +def _unpack(strategy, per_replica_output): + per_replica_output = strategy.experimental_local_results(per_replica_output) + per_replica_output = array_ops.concat(per_replica_output, axis=0).numpy() + return per_replica_output + + +def _get_total_loss_tensor(activations): + losses = [] + for activation in activations: + losses.append( + math_ops.reduce_mean( + math_ops.reduce_sum( + gen_math_ops.squared_difference(activation, 0), 1))) + total_loss = array_ops.expand_dims_v2(sum(losses), 0) + return total_loss + + +def _compute_loss(activation_watched, activation_favorited, activation_friends): + watched_loss = np.mean(np.sum(activation_watched**2, axis=1)) + if len(activation_favorited.shape) == 2: + favorited_loss = np.mean(np.sum(activation_favorited**2, axis=1)) + else: + favorited_loss = np.mean(np.sum(activation_favorited**2, axis=(1, 2))) + if len(activation_friends.shape) == 2: + friends_loss = np.mean(np.sum(activation_friends**2, axis=1)) + else: + friends_loss = np.mean(np.sum(activation_friends**2, axis=(1, 2))) + loss = watched_loss + favorited_loss + friends_loss + return loss + + +def _get_tmpdir(name, subdir=''): + segments = [FLAGS.model_dir, name] + ([subdir] if subdir else []) + return os.path.join(*segments) + + +def _get_variable(variable): + if isinstance(variable, tpu_embedding_v2.TPUShardedVariable): + return variable.variables[0] + return variable + + +if __name__ == '__main__': + v2_compat.enable_v2_behavior() + test.main() diff --git 
a/tensorflow/python/tpu/tpu_embedding_v2_test_lib.py b/tensorflow/python/tpu/tpu_embedding_v2_test_lib.py new file mode 100644 index 00000000000..b6fbdea6cb4 --- /dev/null +++ b/tensorflow/python/tpu/tpu_embedding_v2_test_lib.py @@ -0,0 +1,96 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Library module for TPU Embedding mid level API test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.ops import init_ops_v2 +from tensorflow.python.platform import test +from tensorflow.python.tpu import tpu_embedding_v2_utils + + +class EmbeddingTestBase(test.TestCase): + """Base embedding test class for use on CPU and TPU.""" + + def _create_initial_data(self): + """Create the common test data used by both TPU and CPU.""" + + self.embedding_values = np.array(list(range(32)), dtype=np.float64) + self.initializer = init_ops_v2.Constant(self.embedding_values) + # Embedding for video initialized to + # 0 1 2 3 + # 4 5 6 7 + # ... + self.table_video = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=8, + dim=4, + initializer=self.initializer, + combiner='sum', + name='video') + # Embedding for user initialized to + # 0 1 + # 2 3 + # 4 5 + # 6 7 + # ... + self.table_user = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=16, + dim=2, + initializer=self.initializer, + combiner='mean', + name='user') + self.feature_config = ( + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_video, name='watched'), + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_video, name='favorited'), + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_user, name='friends')) + + self.batch_size = 2 + self.data_batch_size = 4 + + # One (global) batch of inputs + # sparse tensor for watched: + # row 0: 0 + # row 1: 0, 1 + # row 2: 0, 1 + # row 3: 1 + self.feature_watched_indices = [[0, 0], [1, 0], [1, 1], + [2, 0], [2, 1], [3, 0]] + self.feature_watched_values = [0, 0, 1, 0, 1, 1] + self.feature_watched_row_lengths = [1, 2, 2, 1] + # sparse tensor for favorited: + # row 0: 0, 1 + # row 1: 1 + # row 2: 0 + # row 3: 0, 1 + self.feature_favorited_indices = [[0, 0], [0, 1], [1, 0], + [2, 0], [3, 0], [3, 1]] + self.feature_favorited_values = [0, 1, 1, 0, 0, 1] + self.feature_favorited_row_lengths = [2, 1, 1, 2] + # sparse tensor for friends: + # row 0: 3 + # row 1: 0, 1, 2 + # row 2: 3 + # row 3: 0, 1, 2 + self.feature_friends_indices = [[0, 0], [1, 0], [1, 1], [1, 2], + [2, 0], [3, 0], [3, 1], [3, 2]] + self.feature_friends_values = [3, 0, 1, 2, 3, 0, 1, 2] + self.feature_friends_row_lengths = [1, 3, 1, 3] From 0e6795029eb2a0e102cf4ad80dc0eb01207f3772 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 12:20:42 -0700 Subject: [PATCH 1405/1533] Add RBE multipython Dockerfile for ubuntu 18.04. 
PiperOrigin-RevId: 313827428 Change-Id: Ia9863d7bbec978813af894ef70b93c8915025ea0 --- ...dnn7-ubuntu18.04-manylinux2010-multipython | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython new file mode 100644 index 00000000000..2aab5678590 --- /dev/null +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython @@ -0,0 +1,91 @@ +# Dockerfile to build a manylinux 2010 compliant cross-compiler. +# +# Builds a devtoolset gcc/libstdc++ that targets manylinux 2010 compatible +# glibc (2.12) and system libstdc++ (4.4). +# +# To push a new version, run: +# $ docker build -f Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython \ +# --tag "gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython" . +# $ docker push gcr.io/tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython + +FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 as devtoolset + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get install -y \ + cpio \ + file \ + flex \ + g++ \ + make \ + rpm2cpio \ + unar \ + wget \ + && \ + rm -rf /var/lib/apt/lists/* + +ADD devtoolset/fixlinks.sh fixlinks.sh +ADD devtoolset/build_devtoolset.sh build_devtoolset.sh +ADD devtoolset/rpm-patch.sh rpm-patch.sh + +# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-7 in /dt7. +RUN /build_devtoolset.sh devtoolset-7 /dt7 +# Set up a sysroot for glibc 2.12 / libstdc++ 4.4 / devtoolset-8 in /dt8. +RUN /build_devtoolset.sh devtoolset-8 /dt8 + +# TODO(klimek): Split up into two different docker images. +FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 +COPY --from=devtoolset /dt7 /dt7 +COPY --from=devtoolset /dt8 /dt8 + +# Install TensorRT. +RUN apt-get update && apt-get install -y \ + libnvinfer-dev=6.0.1-1+cuda10.1 \ + libnvinfer6=6.0.1-1+cuda10.1 \ + libnvinfer-plugin-dev=6.0.1-1+cuda10.1 \ + libnvinfer-plugin6=6.0.1-1+cuda10.1 \ + && \ + rm -rf /var/lib/apt/lists/* + +# Copy and run the install scripts. 
+ARG DEBIAN_FRONTEND=noninteractive
+
+COPY install/install_bootstrap_deb_packages.sh /install/
+RUN /install/install_bootstrap_deb_packages.sh
+
+COPY install/install_deb_packages.sh /install/
+RUN /install/install_deb_packages.sh
+
+# Install additional packages needed for this image:
+# - dependencies to build Python from source
+# - patchelf, as it is required by auditwheel
+RUN apt-get update && apt-get install -y \
+ libbz2-dev \
+ libffi-dev \
+ libgdbm-dev \
+ libncurses5-dev \
+ libnss3-dev \
+ libreadline-dev \
+ patchelf \
+ && \
+ rm -rf /var/lib/apt/lists/*
+
+COPY install/install_bazel.sh /install/
+RUN /install/install_bazel.sh
+
+COPY install/build_and_install_python.sh /install/
+RUN /install/build_and_install_python.sh "2.7.17" "--enable-unicode=ucs4"
+RUN /install/build_and_install_python.sh "3.5.9"
+RUN /install/build_and_install_python.sh "3.6.9"
+RUN /install/build_and_install_python.sh "3.7.7"
+RUN /install/build_and_install_python.sh "3.8.2"
+
+COPY install/install_pip_packages_by_version.sh /install/
+RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip2.7"
+RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.8"
+RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.5"
+RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.6"
+RUN /install/install_pip_packages_by_version.sh "/usr/local/bin/pip3.7"
+
+ENV CLANG_VERSION="r42cab985fd95ba4f3f290e7bb26b93805edb447d"
+COPY install/install_latest_clang.sh /install/
+RUN /install/install_latest_clang.sh
From 9cd1555bd297d8140534a69f5d5a8aa59ca1b056 Mon Sep 17 00:00:00 2001
From: Revan Sopher
Date: Fri, 29 May 2020 13:03:26 -0700
Subject: [PATCH 1406/1533] Disable saved_model_test on Cloud TPU.

PiperOrigin-RevId: 313835836
Change-Id: Idbc66938bad01cf41dd042c0e05551cd79db8717
---
 tensorflow/examples/saved_model/integration_tests/BUILD | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/examples/saved_model/integration_tests/BUILD b/tensorflow/examples/saved_model/integration_tests/BUILD
index 38c9c989168..0bfbee1ee2a 100644
--- a/tensorflow/examples/saved_model/integration_tests/BUILD
+++ b/tensorflow/examples/saved_model/integration_tests/BUILD
@@ -60,6 +60,9 @@ distribute_py_test(
 "nomsan", # forge input size exceeded
 "notsan", # forge input size exceeded
 ],
+ tpu_tags = [
+ "no_oss", # Test infra collision (b/157754990)
+ ],
 deps = [
 ":distribution_strategy_utils",
 ":integration_scripts",
From 261921358e6891eb87ca495f597f158f4aa15093 Mon Sep 17 00:00:00 2001
From: Ken Franko
Date: Fri, 29 May 2020 13:28:06 -0700
Subject: [PATCH 1407/1533] Handle output from OutsideCompiled parallel_execute regions.

Adds ops to send/recv data from host -> device when there are outputs from the
OutsideCompiled cluster. _TPUCompileMlir placeholder ops are also added to be
replaced later because host side comm ops require the program_key as input.
This handles the case where a result from the OutsideCompiled cluster was
originally returned from the TPU cluster.
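A minimal user-level sketch of the case this handles (illustrative only: it
assumes the `tf.tpu.outside_compilation` API and a configured TPUStrategy named
`strategy`; `step` and `host_fn` are hypothetical names):

    import tensorflow as tf

    @tf.function
    def step(x):
      y = x * 2.0  # Runs on the TPU device.

      def host_fn(t):
        # Runs on the host (e.g. string ops the TPU cannot compile); its
        # result is an output of the outside compiled cluster.
        return tf.strings.to_number(tf.strings.as_string(t)) + 1.0

      z = tf.tpu.outside_compilation(host_fn, y)
      return z + 1.0  # The host output is consumed again by device code.

    # Executed per replica, e.g. strategy.run(step, args=(tf.constant(1.0),)).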
PiperOrigin-RevId: 313840240 Change-Id: I2af37282309dd0998f0c15c0954a855b7bc0ac63 --- .../tpu_extract_outside_compilation.mlir | 308 ++++++++++++++---- .../tpu_extract_outside_compilation.cc | 89 ++++- 2 files changed, 311 insertions(+), 86 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir index 9396e1fb88a..0c4de285b16 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_outside_compilation.mlir @@ -96,13 +96,13 @@ func @nodep_multiple_outside_compilation() -> () { // CHECK-LABEL: func @single_tpu_return_single_outside_compilation func @single_tpu_return_single_outside_compilation(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" - // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster" - // CHECK: tf_device.return - // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] - // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]] = "tf_device.cluster" + // CHECK: tf_device.return + // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] + // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { "tf.A"() : () -> () @@ -121,13 +121,13 @@ func @single_tpu_return_single_outside_compilation(%arg0: tensor) -> tens // CHECK-LABEL: func @multiple_tpu_return_single_outside_compilation func @multiple_tpu_return_single_outside_compilation(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:4 = tf_device.replicate - // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute" - // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]]:2 = "tf_device.cluster" - // CHECK: tf_device.return - // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] - // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] + // CHECK: %[[REPLICATE:[0-9]*]]:4 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]]:2 = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[TPU_CLUSTER_OUTPUT:[0-9]*]]:2 = "tf_device.cluster" + // CHECK: tf_device.return + // CHECK: tf_device.return %[[TPU_CLUSTER_OUTPUT]] + // CHECK: tf_device.return %[[PARALLEL_EXECUTE_OUTPUT]] %1:4 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2, %3 = "tf_device.cluster"() ( { %4 = "tf.A"() : () -> tensor @@ -146,17 +146,17 @@ func @multiple_tpu_return_single_outside_compilation(%arg0: tensor) -> te // CHECK-LABEL: func @single_outside_compiled_input_single_outside_compilation func @single_outside_compiled_input_single_outside_compilation(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" - // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" - // CHECK: 
%[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" - // CHECK: "tf.B"(%[[RECV_OUTPUT]]) - // CHECK: "tf_device.cluster" - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.B"(%[[RECV_OUTPUT]]) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -170,22 +170,192 @@ func @single_outside_compiled_input_single_outside_compilation(%arg0: tensor } +// Tests extraction of a single outside compiled cluster with single host->device output. + +// CHECK-LABEL: func @single_outside_compiled_output_single_outside_compilation +func @single_outside_compiled_output_single_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"() + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"() + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.C"(%[[HOST_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> (tensor) + %5 = "tf.C"(%4) : (tensor) -> tensor + tf_device.return %5 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a single outside compiled cluster host output returned by TPU cluster. 
+ +// CHECK-LABEL: func @return_host_output_outside_compilation +func @return_host_output_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: tf_device.return %[[HOST_OUTPUT]] + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %5 = "tf.C"(%3) : (tensor) -> (tensor) + tf_device.return %4 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a single outside compiled cluster with single input/output. + +// CHECK-LABEL: func @single_outside_compiled_input_output_single_outside_compilation +func @single_outside_compiled_input_output_single_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT]]) + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[HOST_OUTPUT:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.C"(%[[HOST_OUTPUT]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %5 = "tf.C"(%4) : (tensor) -> tensor + tf_device.return %5 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + + +// Tests extraction of a single outside compiled cluster with multiple input/output. 
+ +// CHECK-LABEL: func @multiple_outside_compiled_input_output_single_outside_compilation +func @multiple_outside_compiled_input_output_single_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK: %[[B_OUTPUT:[0-9]*]]:2 = "tf.C"(%[[RECV_OUTPUT]]#0, %[[RECV_OUTPUT]]#1) + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]]#0, %[[B_OUTPUT]]#1, %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: %[[HOST_OUTPUT:[0-9]*]]:2 = "tf._HostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.D"(%[[HOST_OUTPUT]]#0) + // CHECK: "tf.E"(%[[HOST_OUTPUT]]#1) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"() : () -> (tensor) + %5, %6 = "tf.C"(%3, %4) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor, tensor) + %7 = "tf.D"(%5) : (tensor) -> tensor + %8 = "tf.E"(%6) : (tensor) -> tensor + tf_device.return %8 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + +// Tests extraction of a multiple outside compiled clusters with input/output. + +// CHECK-LABEL: func @outside_compiled_input_output_multiple_outside_compilation +func @outside_compiled_input_output_multiple_outside_compilation(%arg0: tensor) -> tensor { + %0 = "tf.A"(%arg0) : (tensor) -> tensor + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT2:[a-z_0-9]*]], %[[PROGRAM_OUTPUT2:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT2]]) + // CHECK: %[[D_OUTPUT:[0-9]*]] = "tf.D"(%[[RECV_OUTPUT2]]) + // CHECK: "tf._XlaSendFromHost"(%[[D_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT1:[a-z_0-9]*]], %[[PROGRAM_OUTPUT1:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT1]]) + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B"(%[[RECV_OUTPUT1]]) + // CHECK: "tf._XlaSendFromHost"(%[[B_OUTPUT]], %[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[HOST_OUTPUT1:[0-9]*]] = "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C"(%[[HOST_OUTPUT1]]) + // CHECK: %[[HOST_OUTPUT2:[0-9]*]] = "tf._HostComputeMlir"(%[[C_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK: "tf.E"(%[[HOST_OUTPUT2]]) + %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { + %2 = "tf_device.cluster"() ( { + %3 = "tf.A"() : () -> (tensor) + %4 = "tf.B"(%3) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %5 = 
"tf.C"(%4) : (tensor) -> (tensor) + %6 = "tf.D"(%5) {_xla_outside_compilation = "cluster2"} : (tensor) -> (tensor) + %7 = "tf.E"(%6) : (tensor) -> tensor + tf_device.return %7 : tensor + }) {cluster_attr = "cluster_attr"} : () -> tensor + tf_device.return %2 : tensor + } + + return %1 : tensor +} + // Tests extraction of a single outside compiled cluster with arg input and single device->host input. // CHECK-LABEL: func @mixed_input_single_outside_compilation func @mixed_input_single_outside_compilation(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" - // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" - // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" - // CHECK: "tf.B"(%arg0, %[[RECV_OUTPUT]]) - // CHECK: "tf_device.cluster" - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.B"(%arg0, %[[RECV_OUTPUT]]) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -204,25 +374,25 @@ func @mixed_input_single_outside_compilation(%arg0: tensor) -> tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" - // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT_2:[a-z_0-9]*]], %[[PROGRAM_OUTPUT_2:[a-z_0-9]*]] = "tf._TPUCompileMlir" - // CHECK: %[[RECV_OUTPUT_2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_2]]) - // CHECK-SAME: key = "host_compute_channel_cluster2" - // CHECK: "tf.D"(%[[RECV_OUTPUT_2]]) - // CHECK: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT_1:[a-z_0-9]*]], %[[PROGRAM_OUTPUT_1:[a-z_0-9]*]] = "tf._TPUCompileMlir" - // CHECK: %[[RECV_OUTPUT_1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_1]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" - // CHECK: "tf.B"(%[[RECV_OUTPUT_1]]) - // CHECK: "tf_device.cluster" - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" - // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" - // CHECK: "tf._HostComputeMlir"(%[[C_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT_2:[a-z_0-9]*]], %[[PROGRAM_OUTPUT_2:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT_2:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_2]]) + // 
CHECK-SAME: key = "host_compute_channel_cluster2" + // CHECK: "tf.D"(%[[RECV_OUTPUT_2]]) + // CHECK: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT_1:[a-z_0-9]*]], %[[PROGRAM_OUTPUT_1:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT_1:[0-9]*]] = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT_1]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.B"(%[[RECV_OUTPUT_1]]) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[C_OUTPUT:[0-9]*]] = "tf.C" + // CHECK: "tf._HostComputeMlir"(%[[C_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster2" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -242,19 +412,19 @@ func @single_outside_compiled_input_multiple_outside_compilation(%arg0: tensor) -> tensor { %0 = "tf.A"(%arg0) : (tensor) -> tensor - // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate - // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" - // CHECK-NEXT: "tf_device.launch" - // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" - // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" - // CHECK: "tf.C"(%[[RECV_OUTPUT]]#0) - // CHECK: "tf.D"(%[[RECV_OUTPUT]]#1, %[[RECV_OUTPUT]]#0) - // CHECK: "tf_device.cluster" - // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" - // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" - // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) - // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate + // CHECK: %[[PARALLEL_EXECUTE_OUTPUT:[0-9]*]] = "tf_device.parallel_execute" + // CHECK-NEXT: "tf_device.launch" + // CHECK: %[[STATUS_OUTPUT:[a-z_0-9]*]], %[[PROGRAM_OUTPUT:[a-z_0-9]*]] = "tf._TPUCompileMlir" + // CHECK: %[[RECV_OUTPUT:[0-9]*]]:2 = "tf._XlaRecvAtHost"(%[[PROGRAM_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" + // CHECK: "tf.C"(%[[RECV_OUTPUT]]#0) + // CHECK: "tf.D"(%[[RECV_OUTPUT]]#1, %[[RECV_OUTPUT]]#0) + // CHECK: "tf_device.cluster" + // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" + // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" + // CHECK: "tf._HostComputeMlir"(%[[A_OUTPUT]], %[[B_OUTPUT]]) + // CHECK-SAME: key = "host_compute_channel_cluster1" %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { %2 = "tf_device.cluster"() ( { %3 = "tf.A"() : () -> (tensor) @@ -269,5 +439,3 @@ func @multiple_outside_compiled_inputs_single_outside_compilation(%arg0: tensor< return %1 : tensor } - -// TODO(b/154363171): Add test cases for when output of outside compilation is returned by parallel_execute. diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc index 58b3bf8bf7d..93e5cc22c30 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_outside_compilation.cc @@ -141,6 +141,64 @@ llvm::SmallSetVector GetExternalOperands( return external_values; } +// Extracts all externally used outputs of `cluster_ops`. 
+llvm::SmallVector GetExternalOutputs( + const llvm::SmallVector& cluster_ops) { + llvm::SmallSetVector external_outputs; + + for (Operation* op : cluster_ops) { + for (Operation* user : op->getUsers()) { + bool is_external = llvm::none_of(cluster_ops, [&](Operation* cluster_op) { + return user == cluster_op; + }); + if (!is_external) continue; + for (Value v : user->getOperands()) { + if (v.getDefiningOp() == op) external_outputs.insert(v); + } + } + } + + return external_outputs.takeVector(); +} + +// Sets the insertion point on `builder` for HostCompute op. Sets insertion +// point to the first op in `cluster_ops` that has one of `external_inputs` +// as an operand. If there are no external_inputs, set insertion point to first +// cluster_op. +void SetHostComputeInsertion( + OpBuilder* builder, const llvm::SmallVector& cluster_ops, + const llvm::SmallSetVector& external_inputs) { + if (external_inputs.empty()) builder->setInsertionPoint(cluster_ops.front()); + for (const auto& cluster_op : cluster_ops) { + for (Value v : cluster_op->getOperands()) { + if (external_inputs.count(v)) { + builder->setInsertionPoint(cluster_op); + return; + } + } + } +} + +// Creates the HostCompute with `inputs` and `outputs` +// using `communication_key`. +TF::_HostComputeMlirOp CreateHostCompute( + OpBuilder* builder, tf_device::ClusterOp tpu_cluster, + const llvm::SmallVector& cluster_ops, + const llvm::SmallSetVector& inputs, llvm::ArrayRef outputs, + const std::string& communication_key) { + llvm::SmallVector device_output_types; + for (const auto& output : outputs) + device_output_types.push_back(output.getType()); + SetHostComputeInsertion(builder, cluster_ops, inputs); + auto host_compute = builder->create( + tpu_cluster.getLoc(), device_output_types, inputs.getArrayRef(), + llvm::ArrayRef{}); + host_compute.setAttr(kAncestorsAttr, builder->getArrayAttr({})); + host_compute.setAttr(kShapesAttr, builder->getArrayAttr({})); + host_compute.setAttr(kKeyAttr, builder->getStringAttr(communication_key)); + return host_compute; +} + void MoveOutsideCompiledOps( tf_device::ClusterOp tpu_cluster, llvm::StringRef outside_cluster_name, tf_device::LaunchOp host_launch_op, @@ -185,19 +243,25 @@ void MoveOutsideCompiledOps( builder.getStringAttr(communication_key), builder.getIntegerAttr(builder.getIntegerType(64), 0)); - // TODO(b/156006200): Handle host->device outputs. 
- builder.setInsertionPoint(cluster_ops.front()); - auto host_compute = builder.create( - tpu_cluster.getLoc(), llvm::ArrayRef{}, - external_inputs.getArrayRef(), llvm::ArrayRef{}); - host_compute.setAttr(kAncestorsAttr, builder.getArrayAttr({})); - host_compute.setAttr(kShapesAttr, builder.getArrayAttr({})); - host_compute.setAttr(kKeyAttr, builder.getStringAttr(communication_key)); + auto host_compute = + CreateHostCompute(&builder, tpu_cluster, cluster_ops, external_inputs, + external_outputs, communication_key); MoveOutsideClusterOpsToLaunchOp(host_launch_op, cluster_ops); + builder.setInsertionPoint(host_launch_op.GetBody().getTerminator()); + builder.create( + tpu_cluster.getLoc(), external_outputs, + /*dynamic_key=*/compile_op.getResult(1), + builder.getStringAttr(communication_key), + /*device_ordinal=*/builder.getIntegerAttr(builder.getIntegerType(64), 0)); + for (auto result : llvm::zip(external_inputs, recv_at_host.getResults())) mlir::replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), host_launch_op.body()); + + for (auto result : llvm::zip(external_outputs, host_compute.getResults())) + mlir::replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), + tpu_cluster.body()); } // Creates a `parallel_execute` op in place of launch with 'clusters` and @@ -223,17 +287,13 @@ void CreateParallelExecuteFromOutsideClusters( // Determine if there are any inputs that are provided out of cluster. auto external_inputs = GetExternalOperands(cluster_ops); - llvm::SmallVector external_outputs; - // TODO(b/156006200): Compute the external outputs. + auto external_outputs = GetExternalOutputs(cluster_ops); MoveOutsideCompiledOps(tpu_cluster, cluster.value().getFirst(), host_launch_op, cluster_ops, external_inputs, external_outputs); builder.setInsertionPointToEnd(&outside_block); - // TODO(b/154363171): Handle returns from OutsideCompiled parallel_execute - // regions either through communication with TPU parallel_execute regions - // or modifying parallel_execute returns. builder.create(tpu_cluster.getLoc(), ArrayRef{}); } @@ -248,9 +308,6 @@ void CreateParallelExecuteFromOutsideClusters( parallel_execute_tpu_block.getTerminator()); PropagateParallelExecuteReturnToReplicate(parallel_execute_op); - // TODO(b/154363171): Handle returns from OutsideCompiled parallel_execute - // regions either through communication with TPU parallel_execute regions - // or modifying parallel_execute returns. } void TPUExtractOutsideCompilation::runOnFunction() { From 3b2674dce3c5b4be01176646b573d3281d78bed0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 13:40:42 -0700 Subject: [PATCH 1408/1533] Fix typo in tf.where documentation. PiperOrigin-RevId: 313842387 Change-Id: I255dfad74a2ddc80373504569b07c39636d90cf1 --- tensorflow/python/ops/array_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index a641633b1f5..a2088fcdc48 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -4400,7 +4400,7 @@ def where_v2(condition, x=None, y=None, name=None): The `condition` tensor acts as a mask that chooses whether the corresponding element / row in the output should be taken from `x` - (if the elemment in `condition is True) or `y` (if it is false). + (if the element in `condition is True) or `y` (if it is false). 
>>> tf.where([True, False, False, True], [1,2,3,4], [100,200,300,400]) Date: Fri, 29 May 2020 13:44:24 -0700 Subject: [PATCH 1409/1533] Add special "recording" SimpleMemoryAllocator class to help with logging tail allocations. This new helper class will enable TFLM to log and record where the allocations in the shared arena are going. A future change will use this new class in a special "recording" MicroAllocator subclass. All these logging mechanisms will be opt-in by code. PiperOrigin-RevId: 313843072 Change-Id: I3fc9205e475e89b4a3795c3cc79c31d2166da2c8 --- tensorflow/lite/micro/BUILD | 28 +++++ tensorflow/lite/micro/micro_allocator.cc | 2 +- tensorflow/lite/micro/micro_allocator_test.cc | 21 ++-- .../recording_simple_memory_allocator.cc | 65 ++++++++++ .../micro/recording_simple_memory_allocator.h | 59 +++++++++ .../recording_simple_memory_allocator_test.cc | 119 ++++++++++++++++++ .../lite/micro/simple_memory_allocator.cc | 22 ++-- .../lite/micro/simple_memory_allocator.h | 20 +-- tensorflow/lite/micro/testing/test_utils.cc | 4 +- 9 files changed, 313 insertions(+), 27 deletions(-) create mode 100644 tensorflow/lite/micro/recording_simple_memory_allocator.cc create mode 100644 tensorflow/lite/micro/recording_simple_memory_allocator.h create mode 100644 tensorflow/lite/micro/recording_simple_memory_allocator_test.cc diff --git a/tensorflow/lite/micro/BUILD b/tensorflow/lite/micro/BUILD index 36cc14512b6..ae4ba2fefd6 100644 --- a/tensorflow/lite/micro/BUILD +++ b/tensorflow/lite/micro/BUILD @@ -131,6 +131,22 @@ cc_library( ], ) +cc_library( + name = "recording_simple_memory_allocator", + srcs = [ + "recording_simple_memory_allocator.cc", + ], + hdrs = [ + "recording_simple_memory_allocator.h", + ], + build_for_embedded = True, + copts = micro_copts(), + deps = [ + ":micro_compatibility", + ":micro_framework", + ], +) + tflite_micro_cc_test( name = "micro_error_reporter_test", srcs = [ @@ -177,6 +193,18 @@ tflite_micro_cc_test( ], ) +tflite_micro_cc_test( + name = "recording_simple_memory_allocator_test", + srcs = [ + "recording_simple_memory_allocator_test.cc", + ], + deps = [ + ":micro_framework", + ":recording_simple_memory_allocator", + "//tensorflow/lite/micro/testing:micro_test", + ], +) + tflite_micro_cc_test( name = "micro_allocator_test", srcs = [ diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 35f4bdabd20..ad26483ec3c 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -417,7 +417,7 @@ MicroAllocator::MicroAllocator(TfLiteContext* context, const Model* model, // Creates a root memory allocator managing the arena. The allocator itself // also locates in the arena buffer. This allocator doesn't need to be // destructed as it's the root allocator. 
- memory_allocator_ = CreateInPlaceSimpleMemoryAllocator( + memory_allocator_ = SimpleMemoryAllocator::Create( error_reporter, aligned_arena, aligned_arena_size); TfLiteStatus status = InitGraphAndContextTensorData(); diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index b34b2dc2866..013c1cefabf 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -68,8 +68,9 @@ TF_LITE_MICRO_TEST(TestInitializeRuntimeTensor) { TfLiteContext context; constexpr size_t arena_size = 1024; uint8_t arena[arena_size]; - tflite::SimpleMemoryAllocator simple_allocator(micro_test::reporter, arena, - arena_size); + tflite::SimpleMemoryAllocator* simple_allocator = + tflite::SimpleMemoryAllocator::Create(micro_test::reporter, arena, + arena_size); const tflite::Tensor* tensor = tflite::testing::Create1dFlatbufferTensor(100); const flatbuffers::Vector>* buffers = @@ -78,7 +79,7 @@ TF_LITE_MICRO_TEST(TestInitializeRuntimeTensor) { TfLiteTensor allocated_tensor; TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, tflite::internal::InitializeTfLiteTensorFromFlatbuffer( - &simple_allocator, *tensor, buffers, micro_test::reporter, + simple_allocator, *tensor, buffers, micro_test::reporter, &allocated_tensor)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, allocated_tensor.type); TF_LITE_MICRO_EXPECT_EQ(1, allocated_tensor.dims->size); @@ -93,8 +94,9 @@ TF_LITE_MICRO_TEST(TestInitializeQuantizedTensor) { TfLiteContext context; constexpr size_t arena_size = 1024; uint8_t arena[arena_size]; - tflite::SimpleMemoryAllocator simple_allocator(micro_test::reporter, arena, - arena_size); + tflite::SimpleMemoryAllocator* simple_allocator = + tflite::SimpleMemoryAllocator::Create(micro_test::reporter, arena, + arena_size); const tflite::Tensor* tensor = tflite::testing::CreateQuantizedFlatbufferTensor(100); @@ -104,7 +106,7 @@ TF_LITE_MICRO_TEST(TestInitializeQuantizedTensor) { TfLiteTensor allocated_tensor; TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, tflite::internal::InitializeTfLiteTensorFromFlatbuffer( - &simple_allocator, *tensor, buffers, micro_test::reporter, + simple_allocator, *tensor, buffers, micro_test::reporter, &allocated_tensor)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, allocated_tensor.type); TF_LITE_MICRO_EXPECT_EQ(1, allocated_tensor.dims->size); @@ -119,8 +121,9 @@ TF_LITE_MICRO_TEST(TestMissingQuantization) { TfLiteContext context; constexpr size_t arena_size = 1024; uint8_t arena[arena_size]; - tflite::SimpleMemoryAllocator simple_allocator(micro_test::reporter, arena, - arena_size); + tflite::SimpleMemoryAllocator* simple_allocator = + tflite::SimpleMemoryAllocator::Create(micro_test::reporter, arena, + arena_size); const tflite::Tensor* tensor = tflite::testing::CreateMissingQuantizationFlatbufferTensor(100); @@ -130,7 +133,7 @@ TF_LITE_MICRO_TEST(TestMissingQuantization) { TfLiteTensor allocated_tensor; TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, tflite::internal::InitializeTfLiteTensorFromFlatbuffer( - &simple_allocator, *tensor, buffers, micro_test::reporter, + simple_allocator, *tensor, buffers, micro_test::reporter, &allocated_tensor)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, allocated_tensor.type); TF_LITE_MICRO_EXPECT_EQ(1, allocated_tensor.dims->size); diff --git a/tensorflow/lite/micro/recording_simple_memory_allocator.cc b/tensorflow/lite/micro/recording_simple_memory_allocator.cc new file mode 100644 index 00000000000..934fa260e30 --- /dev/null +++ b/tensorflow/lite/micro/recording_simple_memory_allocator.cc @@ -0,0 +1,65 @@ 
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/recording_simple_memory_allocator.h" + +namespace tflite { + +RecordingSimpleMemoryAllocator::RecordingSimpleMemoryAllocator( + ErrorReporter* error_reporter, uint8_t* buffer_head, size_t buffer_size) + : SimpleMemoryAllocator(error_reporter, buffer_head, buffer_size), + requested_bytes_(0), + used_bytes_(0), + alloc_count_(0) {} + +RecordingSimpleMemoryAllocator::~RecordingSimpleMemoryAllocator() {} + +size_t RecordingSimpleMemoryAllocator::GetRequestedBytes() const { + return requested_bytes_; +} + +size_t RecordingSimpleMemoryAllocator::GetUsedBytes() const { + return used_bytes_; +} + +size_t RecordingSimpleMemoryAllocator::GetAllocatedCount() const { + return alloc_count_; +} + +uint8_t* RecordingSimpleMemoryAllocator::AllocateFromHead(size_t size, + size_t alignment) { + const uint8_t* previous_head = GetHead(); + uint8_t* result = SimpleMemoryAllocator::AllocateFromHead(size, alignment); + if (result != nullptr) { + used_bytes_ += GetHead() - previous_head; + requested_bytes_ += size; + alloc_count_++; + } + return result; +} + +uint8_t* RecordingSimpleMemoryAllocator::AllocateFromTail(size_t size, + size_t alignment) { + const uint8_t* previous_tail = GetTail(); + uint8_t* result = SimpleMemoryAllocator::AllocateFromTail(size, alignment); + if (result != nullptr) { + used_bytes_ += previous_tail - GetTail(); + requested_bytes_ += size; + alloc_count_++; + } + return result; +} + +} // namespace tflite diff --git a/tensorflow/lite/micro/recording_simple_memory_allocator.h b/tensorflow/lite/micro/recording_simple_memory_allocator.h new file mode 100644 index 00000000000..77edadb35be --- /dev/null +++ b/tensorflow/lite/micro/recording_simple_memory_allocator.h @@ -0,0 +1,59 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_MICRO_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ +#define TENSORFLOW_LITE_MICRO_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ + +#include "tensorflow/lite/micro/compatibility.h" +#include "tensorflow/lite/micro/simple_memory_allocator.h" + +namespace tflite { + +// Utility class used to log allocations of a SimpleMemoryAllocator. Should only +// be used in debug/evaluation settings or unit tests to evaluate allocation +// usage. 
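+//
+// Illustrative usage (buffer names here are hypothetical):
+//   RecordingSimpleMemoryAllocator allocator(error_reporter, arena, arena_size);
+//   // ... perform head/tail allocations through `allocator` ...
+//   size_t requested = allocator.GetRequestedBytes();  // bytes asked for
+//   size_t used = allocator.GetUsedBytes();  // includes alignment padding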
+class RecordingSimpleMemoryAllocator : public SimpleMemoryAllocator { + public: + RecordingSimpleMemoryAllocator(ErrorReporter* error_reporter, + uint8_t* buffer_head, size_t buffer_size); + // TODO(b/157615197): Cleanup constructors/destructor and use factory + // functions. + ~RecordingSimpleMemoryAllocator() override; + + // Returns the number of bytes requested from the head or tail. + size_t GetRequestedBytes() const; + + // Returns the number of bytes actually allocated from the head or tail. This + // value will be >= to the number of requested bytes due to padding and + // alignment. + size_t GetUsedBytes() const; + + // Returns the number of alloc calls from the head or tail. + size_t GetAllocatedCount() const; + + uint8_t* AllocateFromHead(size_t size, size_t alignment) override; + uint8_t* AllocateFromTail(size_t size, size_t alignment) override; + + private: + size_t requested_bytes_; + size_t used_bytes_; + size_t alloc_count_; + + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_MICRO_RECORDING_SIMPLE_MEMORY_ALLOCATOR_H_ diff --git a/tensorflow/lite/micro/recording_simple_memory_allocator_test.cc b/tensorflow/lite/micro/recording_simple_memory_allocator_test.cc new file mode 100644 index 00000000000..8fc4745a70e --- /dev/null +++ b/tensorflow/lite/micro/recording_simple_memory_allocator_test.cc @@ -0,0 +1,119 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "tensorflow/lite/micro/recording_simple_memory_allocator.h"
+
+#include 
+
+#include "tensorflow/lite/micro/test_helpers.h"
+#include "tensorflow/lite/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestRecordsTailAllocations) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::RecordingSimpleMemoryAllocator allocator(micro_test::reporter, arena,
+                                                   arena_size);
+
+  uint8_t* result = allocator.AllocateFromTail(/*size=*/10, /*alignment=*/1);
+  TF_LITE_MICRO_EXPECT_NE(result, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetUsedBytes(), 10);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetRequestedBytes(), 10);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetAllocatedCount(), 1);
+
+  result = allocator.AllocateFromTail(/*size=*/20, /*alignment=*/1);
+  TF_LITE_MICRO_EXPECT_NE(result, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetUsedBytes(), 30);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetRequestedBytes(), 30);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetAllocatedCount(), 2);
+}
+
+TF_LITE_MICRO_TEST(TestRecordsMisalignedTailAllocations) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::RecordingSimpleMemoryAllocator allocator(micro_test::reporter, arena,
+                                                   arena_size);
+
+  uint8_t* result = allocator.AllocateFromTail(/*size=*/10, /*alignment=*/12);
+  TF_LITE_MICRO_EXPECT_NE(result, nullptr);
+  // Validate used bytes in an 8 byte range that can include alignment of 12:
+  TF_LITE_MICRO_EXPECT_GE(allocator.GetUsedBytes(), 10);
+  TF_LITE_MICRO_EXPECT_LE(allocator.GetUsedBytes(), 20);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetRequestedBytes(), 10);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetAllocatedCount(), 1);
+}
+
+TF_LITE_MICRO_TEST(TestDoesNotRecordFailedTailAllocations) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::RecordingSimpleMemoryAllocator allocator(micro_test::reporter, arena,
+                                                   arena_size);
+
+  uint8_t* result = allocator.AllocateFromTail(/*size=*/2048, /*alignment=*/1);
+  TF_LITE_MICRO_EXPECT_EQ(result, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetUsedBytes(), 0);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetRequestedBytes(), 0);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetAllocatedCount(), 0);
+}
+
+TF_LITE_MICRO_TEST(TestRecordsHeadAllocations) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::RecordingSimpleMemoryAllocator allocator(micro_test::reporter, arena,
+                                                   arena_size);
+
+  uint8_t* result = allocator.AllocateFromHead(/*size=*/5, /*alignment=*/1);
+  TF_LITE_MICRO_EXPECT_NE(result, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetUsedBytes(), 5);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetRequestedBytes(), 5);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetAllocatedCount(), 1);
+
+  result = allocator.AllocateFromTail(/*size=*/15, /*alignment=*/1);
+  TF_LITE_MICRO_EXPECT_NE(result, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetUsedBytes(), 20);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetRequestedBytes(), 20);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetAllocatedCount(), 2);
+}
+
+TF_LITE_MICRO_TEST(TestRecordsMisalignedHeadAllocations) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::RecordingSimpleMemoryAllocator allocator(micro_test::reporter, arena,
+                                                   arena_size);
+
+  uint8_t* result = allocator.AllocateFromHead(/*size=*/10, /*alignment=*/12);
+  TF_LITE_MICRO_EXPECT_NE(result, nullptr);
+  // Validate used bytes in an 8 byte range that can include alignment of 12:
+  TF_LITE_MICRO_EXPECT_GE(allocator.GetUsedBytes(), 10);
+  TF_LITE_MICRO_EXPECT_LE(allocator.GetUsedBytes(), 20);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetRequestedBytes(), 10);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetAllocatedCount(), 1);
+}
+
+TF_LITE_MICRO_TEST(TestDoesNotRecordFailedHeadAllocations) {
+  constexpr size_t arena_size = 1024;
+  uint8_t arena[arena_size];
+  tflite::RecordingSimpleMemoryAllocator allocator(micro_test::reporter, arena,
+                                                   arena_size);
+
+  uint8_t* result = allocator.AllocateFromHead(/*size=*/2048, /*alignment=*/1);
+  TF_LITE_MICRO_EXPECT_EQ(result, nullptr);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetUsedBytes(), 0);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetRequestedBytes(), 0);
+  TF_LITE_MICRO_EXPECT_EQ(allocator.GetAllocatedCount(), 0);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/micro/simple_memory_allocator.cc b/tensorflow/lite/micro/simple_memory_allocator.cc
index d55e7e87640..65d60161e9a 100644
--- a/tensorflow/lite/micro/simple_memory_allocator.cc
+++ b/tensorflow/lite/micro/simple_memory_allocator.cc
@@ -37,17 +37,23 @@ SimpleMemoryAllocator::SimpleMemoryAllocator(ErrorReporter* error_reporter,
                                              size_t buffer_size)
     : SimpleMemoryAllocator(error_reporter, buffer, buffer + buffer_size) {}
 
-SimpleMemoryAllocator* CreateInPlaceSimpleMemoryAllocator(
-    ErrorReporter* error_reporter, uint8_t* buffer, size_t buffer_size) {
+/* static */
+SimpleMemoryAllocator* SimpleMemoryAllocator::Create(
+    ErrorReporter* error_reporter, uint8_t* buffer_head, size_t buffer_size) {
   SimpleMemoryAllocator tmp =
-      SimpleMemoryAllocator(error_reporter, buffer, buffer_size);
-  SimpleMemoryAllocator* in_place_allocator =
-      reinterpret_cast<SimpleMemoryAllocator*>(tmp.AllocateFromTail(
-          sizeof(SimpleMemoryAllocator), alignof(SimpleMemoryAllocator)));
-  *in_place_allocator = tmp;
-  return in_place_allocator;
+      SimpleMemoryAllocator(error_reporter, buffer_head, buffer_size);
+
+  // Allocate enough bytes from the buffer to create a SimpleMemoryAllocator.
+  // The new instance will use the current adjusted tail buffer from the tmp
+  // allocator instance.
+  uint8_t* allocator_buffer = tmp.AllocateFromTail(
+      sizeof(SimpleMemoryAllocator), alignof(SimpleMemoryAllocator));
+  return new (allocator_buffer)
+      SimpleMemoryAllocator(error_reporter, tmp.head_, tmp.tail_);
 }
 
+SimpleMemoryAllocator::~SimpleMemoryAllocator() {}
+
 uint8_t* SimpleMemoryAllocator::AllocateFromHead(size_t size,
                                                  size_t alignment) {
   uint8_t* const aligned_result = AlignPointerUp(head_, alignment);
diff --git a/tensorflow/lite/micro/simple_memory_allocator.h b/tensorflow/lite/micro/simple_memory_allocator.h
index 5be260f9ed2..426ced032f6 100644
--- a/tensorflow/lite/micro/simple_memory_allocator.h
+++ b/tensorflow/lite/micro/simple_memory_allocator.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include 
 
 #include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/micro/compatibility.h"
 
 namespace tflite {
 
@@ -28,17 +29,25 @@ namespace tflite {
 // This makes it pretty wasteful, so we should use a more intelligent method.
 class SimpleMemoryAllocator {
  public:
+  // TODO(b/157615197): Cleanup constructors/destructor and use factory
+  // functions.
   SimpleMemoryAllocator(ErrorReporter* error_reporter, uint8_t* buffer_head,
                         uint8_t* buffer_tail);
   SimpleMemoryAllocator(ErrorReporter* error_reporter, uint8_t* buffer,
                         size_t buffer_size);
+  virtual ~SimpleMemoryAllocator();
+
+  // Creates a new SimpleMemoryAllocator from a given buffer head and size.
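+  // The returned allocator is placement-constructed at the tail of the given
+  // buffer, so no heap allocation takes place and the usable arena shrinks by
+  // (at least) sizeof(SimpleMemoryAllocator).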
+ static SimpleMemoryAllocator* Create(ErrorReporter* error_reporter, + uint8_t* buffer_head, + size_t buffer_size); // Allocates memory starting at the head of the arena (lowest address and // moving upwards). - uint8_t* AllocateFromHead(size_t size, size_t alignment); + virtual uint8_t* AllocateFromHead(size_t size, size_t alignment); // Allocates memory starting at the tail of the arena (highest address and // moving downwards). - uint8_t* AllocateFromTail(size_t size, size_t alignment); + virtual uint8_t* AllocateFromTail(size_t size, size_t alignment); uint8_t* GetHead() const; uint8_t* GetTail() const; @@ -57,12 +66,9 @@ class SimpleMemoryAllocator { uint8_t* buffer_tail_; uint8_t* head_; uint8_t* tail_; -}; -// Allocate a SimpleMemoryAllocator from the buffer and then return the pointer -// to this allocator. -SimpleMemoryAllocator* CreateInPlaceSimpleMemoryAllocator( - ErrorReporter* error_reporter, uint8_t* buffer, size_t buffer_size); + TF_LITE_REMOVE_VIRTUAL_DELETE +}; } // namespace tflite diff --git a/tensorflow/lite/micro/testing/test_utils.cc b/tensorflow/lite/micro/testing/test_utils.cc index 62621db40d3..7d7a5554b10 100644 --- a/tensorflow/lite/micro/testing/test_utils.cc +++ b/tensorflow/lite/micro/testing/test_utils.cc @@ -119,8 +119,8 @@ int32_t F2Q32(float value, float scale) { // TODO(b/141330728): Move this method elsewhere as part clean up. void PopulateContext(TfLiteTensor* tensors, int tensors_size, ErrorReporter* error_reporter, TfLiteContext* context) { - simple_memory_allocator_ = CreateInPlaceSimpleMemoryAllocator( - error_reporter, raw_arena_, kArenaSize); + simple_memory_allocator_ = + SimpleMemoryAllocator::Create(error_reporter, raw_arena_, kArenaSize); TFLITE_DCHECK(simple_memory_allocator_ != nullptr); scratch_buffer_count_ = 0; From efa880137ed3e00d7b1178abc95ec58f2108749a Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Fri, 29 May 2020 13:47:23 -0700 Subject: [PATCH 1410/1533] Move linspace tests to their own file. PiperOrigin-RevId: 313843608 Change-Id: Ifdae2ac60a721795151124a6a2ee643cb0e527ec --- tensorflow/python/BUILD | 15 +++++ .../python/ops/math_ops_linspace_test.py | 60 +++++++++++++++++++ tensorflow/python/ops/math_ops_test.py | 30 ---------- 3 files changed, 75 insertions(+), 30 deletions(-) create mode 100644 tensorflow/python/ops/math_ops_linspace_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 2b057b5db57..d93c2314954 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -5079,6 +5079,21 @@ cuda_py_test( ], ) +cuda_py_test( + name = "math_ops_linspace_test", + size = "medium", + srcs = ["ops/math_ops_linspace_test.py"], + python_version = "PY3", + tags = ["no_windows_gpu"], + deps = [ + ":framework_for_generated_wrappers", + ":framework_test_lib", + ":math_ops", + ":platform_test", + "//third_party/py/numpy", + ], +) + cuda_py_test( name = "nn_batchnorm_test", size = "medium", diff --git a/tensorflow/python/ops/math_ops_linspace_test.py b/tensorflow/python/ops/math_ops_linspace_test.py new file mode 100644 index 00000000000..f56b1c9284d --- /dev/null +++ b/tensorflow/python/ops/math_ops_linspace_test.py @@ -0,0 +1,60 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for tensorflow.ops.math_ops.linspace.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Using distutils.version.LooseVersion was resulting in an error, so importing +# directly. +from distutils.version import LooseVersion # pylint: disable=g-importing-member +import itertools + +import numpy as np + +from tensorflow.python.framework import test_util +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import googletest + + +@test_util.run_all_in_graph_and_eager_modes +class LinspaceTest(test_util.TensorFlowTestCase): + + def testLinspaceBroadcasts(self): + if LooseVersion(np.version.version) < LooseVersion("1.16.0"): + self.skipTest("numpy doesn't support axes before version 1.16.0") + + shapes = [(), (2,), (2, 2)] + + types = [np.float64, np.int64] + + for start_shape, stop_shape in itertools.product(shapes, repeat=2): + for num in [0, 1, 2, 20]: + ndims = max(len(start_shape), len(stop_shape)) + for axis in range(-ndims, ndims): + for dtype in types: + start = np.ones(start_shape, dtype) + stop = 10 * np.ones(stop_shape, dtype) + + np_ans = np.linspace(start, stop, num, axis=axis) + tf_ans = self.evaluate( + math_ops.linspace_nd(start, stop, num, axis=axis)) + + self.assertAllClose(np_ans, tf_ans) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 85a5afc6c16..afa1dbdbaf7 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -17,9 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import distutils -import itertools - import numpy as np from tensorflow.python.eager import backprop @@ -824,32 +821,5 @@ class RangeTest(test_util.TensorFlowTestCase): self.assertAllEqual(values, self.evaluate(tensor)) -@test_util.run_all_in_graph_and_eager_modes -class LinspaceTest(test_util.TensorFlowTestCase): - - def testLinspaceBroadcasts(self): - if distutils.version.LooseVersion( - np.version.version) < distutils.version.LooseVersion("1.16.0"): - self.skipTest("numpy doesn't support axes before version 1.16.0") - - shapes = [(), (2,), (2, 2)] - - types = [np.float64, np.int64] - - for start_shape, stop_shape in itertools.product(shapes, repeat=2): - for num in [0, 1, 2, 20]: - ndims = max(len(start_shape), len(stop_shape)) - for axis in range(-ndims, ndims): - for dtype in types: - start = np.ones(start_shape, dtype) - stop = 10 * np.ones(stop_shape, dtype) - - np_ans = np.linspace(start, stop, num, axis=axis) - tf_ans = self.evaluate( - math_ops.linspace_nd(start, stop, num, axis=axis)) - - self.assertAllClose(np_ans, tf_ans) - - if __name__ == "__main__": googletest.main() From 8c63be3940e1e83f2c7e1b6ebfe63d823dbc0f22 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 29 May 2020 14:26:22 -0700 Subject: [PATCH 1411/1533] Internal change PiperOrigin-RevId: 313850352 Change-Id: I89584b0bcb4409eb74d21e31fb0eb68844186707 --- tensorflow/core/common_runtime/gpu/BUILD | 10 +- .../core/common_runtime/gpu/gpu_device.cc | 161 ++---------------- .../core/common_runtime/gpu/gpu_device.h | 6 - .../common_runtime/gpu/gpu_device_test.cc | 93 +--------- tensorflow/core/protobuf/config.proto | 12 -- tensorflow/python/eager/context.py | 24 +-- .../stream_executor/cuda/cuda_driver.cc | 14 +- tensorflow/stream_executor/gpu/gpu_driver.h | 3 +- tensorflow/stream_executor/gpu/gpu_stream.cc | 3 +- tensorflow/stream_executor/gpu/gpu_stream.h | 3 - .../stream_executor/rocm/rocm_driver.cc | 8 +- .../golden/v1/tensorflow.-g-p-u-options.pbtxt | 6 - ...config.-logical-device-configuration.pbtxt | 4 - ...mental.-virtual-device-configuration.pbtxt | 4 - ...config.-logical-device-configuration.pbtxt | 4 - ...mental.-virtual-device-configuration.pbtxt | 4 - 16 files changed, 28 insertions(+), 331 deletions(-) diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD index 8665f76f5e6..07919117051 100644 --- a/tensorflow/core/common_runtime/gpu/BUILD +++ b/tensorflow/core/common_runtime/gpu/BUILD @@ -28,10 +28,6 @@ load( "if_static", "tf_cuda_tests_tags", ) -load( - "//tensorflow/stream_executor:build_defs.bzl", - "if_gpu_is_configured", -) package( default_visibility = [ @@ -155,7 +151,6 @@ tf_cuda_library( ":gpu_id_impl", ":gpu_init_impl", ":gpu_lib", - "//third_party/eigen3", "//tensorflow/core:core_cpu_impl", "//tensorflow/core:core_cpu_lib", "//tensorflow/core:framework", @@ -167,9 +162,8 @@ tf_cuda_library( "//tensorflow/core:stream_executor", "//tensorflow/core/profiler/lib:annotated_traceme", "//tensorflow/core/profiler/lib:scoped_annotation", - ] + if_gpu_is_configured([ - "//tensorflow/stream_executor/cuda:cuda_platform", - ]), + "//third_party/eigen3", + ], alwayslink = 1, ) diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index 416ce29d9a7..cf2e7043cae 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -74,7 +74,6 @@ limitations under the License. #include "tensorflow/core/util/device_name_utils.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/core/util/stream_executor_util.h" -#include "tensorflow/stream_executor/gpu/gpu_stream.h" #include "tensorflow/stream_executor/platform/dso_loader.h" #if !defined(PLATFORM_GOOGLE) @@ -245,18 +244,16 @@ class BaseGPUDevice::StreamGroupFactory { StreamGroup* group = &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)]; if (!group->compute) { - int priority = GetPriority(tf_gpu_id.value(), options); - group->priority = priority; - group->compute = GetStream(executor, priority); + group->compute = new se::Stream(executor); group->compute->Init(); VLOG(2) << "Created stream[" << stream_group_within_gpu - << "] = " << group->compute << " with priority: " << priority; + << "] = " << group->compute; #if TENSORFLOW_USE_ROCM // ROCm streams are lightweight and will not necessarily trigger device // queue init until they are first used. For optimal performance, // compute and nccl streams must be immediate siblings. 
- group->nccl = GetStream(executor, priority); + group->nccl = new se::Stream(executor); group->nccl->Init(); VLOG(2) << "Created nccl_stream[" << stream_group_within_gpu << "] = " << group->nccl; @@ -266,12 +263,12 @@ class BaseGPUDevice::StreamGroupFactory { group->nccl->ThenWaitFor(group->compute); #endif - group->host_to_device = GetStream(executor, priority); + group->host_to_device = new se::Stream(executor); group->host_to_device->Init(); VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu << "] = " << group->host_to_device; - group->device_to_host = GetStream(executor, priority); + group->device_to_host = new se::Stream(executor); group->device_to_host->Init(); VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu << "] = " << group->device_to_host; @@ -286,7 +283,7 @@ class BaseGPUDevice::StreamGroupFactory { num_d2d_streams = 1; } for (int i = 0; i < num_d2d_streams; ++i) { - se::Stream* stream = GetStream(executor, priority); + se::Stream* stream = new se::Stream(executor); stream->Init(); group->device_to_device.push_back(stream); VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu @@ -303,70 +300,7 @@ class BaseGPUDevice::StreamGroupFactory { return *instance; } - // Helper method for unit tests to reset the streams. Never use in production. - void TestOnlyReset() { - mutex_lock guard(lock_); - for (auto& item : streams_) { - auto& stream = item.second; - if (stream.compute) { - delete stream.compute; - stream.compute = nullptr; - } -#if TENSORFLOW_USE_ROCM - if (stream.nccl) { - delete stream.nccl; - stream.nccl = nullptr; - } -#endif - if (stream.host_to_device) { - delete stream.host_to_device; - stream.host_to_device = nullptr; - } - if (stream.device_to_host) { - delete stream.device_to_host; - stream.device_to_host = nullptr; - } - while (!stream.device_to_device.empty()) { - auto back = stream.device_to_device.back(); - if (back) { - delete back; - } - stream.device_to_device.pop_back(); - } - } - streams_.clear(); - } - private: - // Returns priority for the given virtual GPU id from the session options. - // Returns 0 if no virtual devices are specified. - int GetPriority(int tf_gpu_id, const GPUOptions& options) { - int id = tf_gpu_id; - int i = 0; - int priority = 0; - while (i < options.experimental().virtual_devices_size()) { - const int size = - options.experimental().virtual_devices().Get(i).priority_size(); - if (id >= size) { - id -= size; - } else { - priority = - options.experimental().virtual_devices().Get(i).priority().Get(id); - break; - } - i++; - } - return priority; - } - - // Returns a Stream with the underlying GPUStream with the given priority. 
- se::Stream* GetStream(se::StreamExecutor* executor, int priority) { - auto stream = new se::Stream(executor); - static_cast(stream->implementation()) - ->SetPriority(priority); - return stream; - } - mutex lock_; using key_type = std::tuple; std::map streams_; @@ -818,8 +752,7 @@ Status ParseVisibleDeviceList(const string& visible_device_list, Status VerifyVirtualDeviceSettings( const size_t num_gpus_to_use, const GPUOptions& gpu_options, const std::vector& visible_gpu_order, - const std::vector& valid_platform_gpu_ids, - const std::map>& supported_priority_ranges) { + const std::vector& valid_platform_gpu_ids) { const auto& virtual_devices = gpu_options.experimental().virtual_devices(); CHECK(!virtual_devices.empty()); if (gpu_options.per_process_gpu_memory_fraction() > 0) { @@ -848,63 +781,6 @@ Status VerifyVirtualDeviceSettings( " #valid GPUs: ", valid_platform_gpu_ids.size(), " virtual_devices.size(): ", virtual_devices.size()); } -#if GOOGLE_CUDA - // Check memory_limt_mb and priority sizes match if priority is non-empty. - bool priority_exists = !virtual_devices.Get(0).priority().empty(); - for (int i = 0; i < virtual_devices.size(); ++i) { - const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb(); - const auto& priority = virtual_devices.Get(i).priority(); - // If the priority is empty in the first one then treat this as having no - // priority set in any of the virtual devices for backward compatibility. - // Either it's set for all or none. - if (!priority_exists) { - if (!priority.empty()) { - return errors::InvalidArgument( - "Priority must be set for all virtual devices or none. But the " - "priority is specified for ", - i, - " while previous devices didn't " - "have any set."); - } - } - if (priority_exists && memory_limit_mb.size() != priority.size()) { - return errors::InvalidArgument( - "Number of virtual device priorities specified doesn't " - "match with number of memory_limit_mb specified for GPU# ", - i, " memory_limit_mb size: ", memory_limit_mb.size(), - " and priority size: ", priority.size()); - } - const int gpu_id = valid_platform_gpu_ids[i].value(); - auto it = supported_priority_ranges.find(gpu_id); - if (it == supported_priority_ranges.end()) { - return errors::Internal( - "Failed to find supported priority range for GPU" - " device ", - gpu_id); - } - const std::pair& priority_range = it->second; - for (int p : priority) { - if (p > priority_range.first || p < priority_range.second) { - return errors::InvalidArgument( - "Priority ", p, - " is outside the range of supported priorities " - "[", - priority_range.second, ",", priority_range.first, - "] for virtual device ", i, " on GPU# ", gpu_id); - } - } - } -#elif TENSORFLOW_USE_ROCM - for (int i = 0; i < virtual_devices.size(); ++i) { - if (!virtual_devices.Get(i).priority().empty()) { - return errors::InvalidArgument( - "Priority is supported only on Nvidia GPUs." - " However, priority is set for virtual device ", - i, ", which corresponds to a non Nvidia GPU"); - } - } -#endif - return Status::OK(); } @@ -1127,7 +1003,6 @@ Status BaseGPUDeviceFactory::CreateDevices( if (num_gpus_to_use > valid_platform_gpu_ids.size()) { num_gpus_to_use = valid_platform_gpu_ids.size(); } - std::map> supported_priority_ranges; if (!valid_platform_gpu_ids.empty()) { // Save the original device. int original_device = 0; @@ -1161,18 +1036,6 @@ Status BaseGPUDeviceFactory::CreateDevices( platform_gpu_id.value(), " failed. 
Status: ", cudaGetErrorString(err)); } - int priority_low, priority_high; - cudaDeviceGetStreamPriorityRange(&priority_low, &priority_high); - if (err != cudaSuccess) { - return errors::Internal( - "cudaDeviceGetStreamPriorityRange() on GPU:", original_device, - " failed. Status: ", cudaGetErrorString(err)); - } - VLOG(1) << "Cuda stream priority range on GPU(" << original_device - << "): " << priority_high << "," << priority_low; - supported_priority_ranges.insert( - std::make_pair(platform_gpu_id.value(), - std::make_pair(priority_low, priority_high))); #elif TENSORFLOW_USE_ROCM err = hipSetDevice(platform_gpu_id.value()); if (err != hipSuccess) { @@ -1244,9 +1107,9 @@ Status BaseGPUDeviceFactory::CreateDevices( const auto& virtual_devices = gpu_options.experimental().virtual_devices(); if (!virtual_devices.empty()) { - TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings( - num_gpus_to_use, gpu_options, visible_gpu_order, valid_platform_gpu_ids, - supported_priority_ranges)); + TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(num_gpus_to_use, gpu_options, + visible_gpu_order, + valid_platform_gpu_ids)); // We've verified that num_gpus_to_use >= virtual_devices.size(). num_gpus_to_use = virtual_devices.size(); CHECK(gpu_options.visible_device_list().empty() || @@ -1841,10 +1704,6 @@ int BaseGPUDevice::PendingKernels() { return 0; } -void BaseGPUDevice::TestOnlyReset() { - StreamGroupFactory::Global().TestOnlyReset(); -} - uint64 GPUKernelTracker::MaybeQueue(OpKernelContext* ctx) { mutex_lock l(mu_); ++ops_since_last_; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 32c7738d916..3646c59cec1 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -114,11 +114,6 @@ class BaseGPUDevice : public LocalDevice { // the compute stream and are not yet known to have completed. int PendingKernels(); - int priority() const { return stream_->priority; } - - // Helper method for unit tests to reset the streams. Never use in production. 
- static void TestOnlyReset(); - protected: Allocator* gpu_allocator_; // not owned Allocator* cpu_allocator_; // not owned @@ -136,7 +131,6 @@ class BaseGPUDevice : public LocalDevice { se::Stream* host_to_device = nullptr; se::Stream* device_to_host = nullptr; gtl::InlinedVector device_to_device; - int priority = 0; }; class StreamGroupFactory; diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc index 26312d35af6..0d66324a8e5 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc @@ -61,17 +61,13 @@ void ExpectErrorMessageSubstr(const Status& s, StringPiece substr) { class GPUDeviceTest : public ::testing::Test { public: - void TearDown() override { - BaseGPUDevice::TestOnlyReset(); - GPUProcessState::singleton()->TestOnlyReset(); - } + void TearDown() override { GPUProcessState::singleton()->TestOnlyReset(); } protected: static SessionOptions MakeSessionOptions( const string& visible_device_list = "", double per_process_gpu_memory_fraction = 0, int gpu_device_count = 1, - const std::vector>& memory_limit_mb = {}, - const std::vector>& priority = {}) { + const std::vector>& memory_limit_mb = {}) { SessionOptions options; ConfigProto* config = &options.config; (*config->mutable_device_count())["GPU"] = gpu_device_count; @@ -79,17 +75,12 @@ class GPUDeviceTest : public ::testing::Test { gpu_options->set_visible_device_list(visible_device_list); gpu_options->set_per_process_gpu_memory_fraction( per_process_gpu_memory_fraction); - for (int i = 0; i < memory_limit_mb.size(); ++i) { + for (const auto& v : memory_limit_mb) { auto virtual_devices = gpu_options->mutable_experimental()->add_virtual_devices(); - for (float mb : memory_limit_mb[i]) { + for (float mb : v) { virtual_devices->add_memory_limit_mb(mb); } - if (i < priority.size()) { - for (int p : priority[i]) { - virtual_devices->add_priority(p); - } - } } return options; } @@ -202,7 +193,6 @@ TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) { opts, kDeviceNamePrefix, &devices)); EXPECT_EQ(1, devices.size()); EXPECT_GE(devices[0]->attributes().memory_limit(), 0); - EXPECT_EQ(0, static_cast(devices[0].get())->priority()); } TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) { @@ -214,67 +204,25 @@ TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) { opts, kDeviceNamePrefix, &devices)); EXPECT_EQ(1, devices.size()); EXPECT_GE(devices[0]->attributes().memory_limit(), 0); - EXPECT_EQ(0, static_cast(devices[0].get())->priority()); } -TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndNoPriority) { +TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) { SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}}); std::vector> devices; TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices)); EXPECT_EQ(1, devices.size()); EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); - EXPECT_EQ(0, static_cast(devices[0].get())->priority()); -} - -TEST_F(GPUDeviceTest, SingleVirtualDeviceWithInvalidPriority) { - { - // Priority outside the range (-1, 0). 
- SessionOptions opts = - MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-2, 0}}); - std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( - opts, kDeviceNamePrefix, &devices); - EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); - ExpectErrorMessageSubstr( - status, - "Priority -2 is outside the range of supported priorities [-1,0] for" - " virtual device 0 on GPU# 0"); - } - { - // Priority outside the range (-1, 0). - SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, 1}}); - std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( - opts, kDeviceNamePrefix, &devices); - EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); - ExpectErrorMessageSubstr( - status, - "Priority 1 is outside the range of supported priorities [-1,0] for" - " virtual device 0 on GPU# 0"); - } -} - -TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimitAndPriority) { - SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}}, {{-1}}); - std::vector> devices; - TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( - opts, kDeviceNamePrefix, &devices)); - EXPECT_EQ(1, devices.size()); - EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); - EXPECT_EQ(-1, static_cast(devices[0].get())->priority()); } TEST_F(GPUDeviceTest, MultipleVirtualDevices) { - SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{0, -1}}); + SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}); std::vector> devices; TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( opts, kDeviceNamePrefix, &devices)); EXPECT_EQ(2, devices.size()); EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit()); - EXPECT_EQ(0, static_cast(devices[0].get())->priority()); - EXPECT_EQ(-1, static_cast(devices[1].get())->priority()); ASSERT_EQ(1, devices[0]->attributes().locality().links().link_size()); ASSERT_EQ(1, devices[1]->attributes().locality().links().link_size()); EXPECT_EQ(1, devices[0]->attributes().locality().links().link(0).device_id()); @@ -289,35 +237,6 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) { devices[1]->attributes().locality().links().link(0).strength()); } -TEST_F(GPUDeviceTest, MultipleVirtualDevicesWithPriority) { - { - // Multile virtual devices with fewer priorities. - SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1}}); - std::vector> devices; - Status status = DeviceFactory::GetFactory("GPU")->CreateDevices( - opts, kDeviceNamePrefix, &devices); - EXPECT_EQ(status.code(), error::INVALID_ARGUMENT); - ExpectErrorMessageSubstr( - status, - "Number of virtual device priorities specified doesn't " - "match with number of memory_limit_mb specified for GPU# 0" - " memory_limit_mb size: 2 and priority size: 1"); - } - { - // Multile virtual devices with matching priority. - SessionOptions opts = - MakeSessionOptions("0", 0, 1, {{123, 456}}, {{-1, 0}}); - std::vector> devices; - TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices( - opts, kDeviceNamePrefix, &devices)); - EXPECT_EQ(2, devices.size()); - EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit()); - EXPECT_EQ(456 << 20, devices[1]->attributes().memory_limit()); - EXPECT_EQ(-1, static_cast(devices[0].get())->priority()); - EXPECT_EQ(0, static_cast(devices[1].get())->priority()); - } -} - // Enabling unified memory on pre-Pascal GPUs results in an initialization // error. 
TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) { diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto index 3d20d004d46..7973e002762 100644 --- a/tensorflow/core/protobuf/config.proto +++ b/tensorflow/core/protobuf/config.proto @@ -110,18 +110,6 @@ message GPUOptions { // For the concept of "visible" and "virtual" GPU, see the comments for // "visible_device_list" above for more information. repeated float memory_limit_mb = 1; - - // Priority values to use with the virtual devices. Use the cuda function - // cudaDeviceGetStreamPriorityRange to query for valid range of values for - // priority. - // - // On a P4000 GPU with cuda 10.1, the priority range reported was 0 for - // least priority and -1 for greatest priority. - // - // If this field is not specified, then the virtual devices will be - // created with the default. If this field has values set, then the size - // of this must match with the above memory_limit_mb. - repeated int32 priority = 2; } // The multi virtual device settings. If empty (not set), it will create diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index aa760583800..604a960afd5 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -263,8 +263,7 @@ class LogicalDevice( @tf_export("config.LogicalDeviceConfiguration", "config.experimental.VirtualDeviceConfiguration") class LogicalDeviceConfiguration( - collections.namedtuple("LogicalDeviceConfiguration", - ["memory_limit", "experimental_priority"])): + collections.namedtuple("LogicalDeviceConfiguration", ["memory_limit"])): """Configuration class for a logical devices. The class specifies the parameters to configure a `tf.config.PhysicalDevice` @@ -277,15 +276,10 @@ class LogicalDeviceConfiguration( Fields: memory_limit: (optional) Maximum memory (in MB) to allocate on the virtual device. Currently only supported for GPUs. - experimental_priority: (optional) Priority to assign to a virtual device. - Lower values have higher priorities and 0 is the default. - Within a physical GPU, the GPU scheduler will prioritize ops on virtual - devices with higher priority. Currently only supported for Nvidia GPUs. """ - def __new__(cls, memory_limit=None, experimental_priority=None): - return super(LogicalDeviceConfiguration, - cls).__new__(cls, memory_limit, experimental_priority) + def __new__(cls, memory_limit=None): + return super(LogicalDeviceConfiguration, cls).__new__(cls, memory_limit) @tf_export("config.PhysicalDevice") @@ -1025,19 +1019,12 @@ class Context(object): if self._virtual_device_map: vdevs = self._virtual_device_map.get(dev, []) device_limits = [] - priority = [] for virt_dev in vdevs: device_limits.append(virt_dev.memory_limit) - if virt_dev.experimental_priority is not None: - priority.append(virt_dev.experimental_priority) - # If priority is specified, it must be specified for all virtual - # devices. 
- if priority and len(device_limits) != len(priority): - raise ValueError("priority must be specified for all virtual devices") virtual_devices.append( config_pb2.GPUOptions.Experimental.VirtualDevices( - memory_limit_mb=device_limits, priority=priority)) + memory_limit_mb=device_limits)) # Only compute growth if virtual devices have not been configured and we # have GPUs @@ -1407,9 +1394,6 @@ class Context(object): if vdev.memory_limit is not None: raise ValueError("Setting memory limit on CPU virtual devices is " "currently not supported") - if vdev.experimental_priority is not None: - raise ValueError("Setting experimental_priority on CPU virtual " - " devices is currently not supported") elif dev.device_type == "GPU": for vdev in virtual_devices: if vdev.memory_limit is None: diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc index 3d011123d87..210c5436fad 100644 --- a/tensorflow/stream_executor/cuda/cuda_driver.cc +++ b/tensorflow/stream_executor/cuda/cuda_driver.cc @@ -713,21 +713,13 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { absl::StrCat("failed to get device for context: ", ToString(result))); } -/* static */ bool GpuDriver::CreateStream(GpuContext* context, CUstream* stream, - int priority) { +/* static */ bool GpuDriver::CreateStream(GpuContext* context, + CUstream* stream) { // TODO(leary) can we switch this to CU_STREAM_NON_BLOCKING or will that mess // up synchronization with respect to memsets and any other things that have // to occur on the default stream? ScopedActivateContext activated{context}; - CUresult res; - // If the priority is 0, then use the previous api to create the stream with - // the default priority for backward compatibility. Probably there is no - // difference in using the new api call but leaving it as is for now. - if (priority == 0) { - res = cuStreamCreate(stream, 0); - } else { - res = cuStreamCreateWithPriority(stream, 0, priority); - } + CUresult res = cuStreamCreate(stream, 0); if (res != CUDA_SUCCESS) { LOG(ERROR) << "could not allocate CUDA stream for context " << context->context() << ": " << ToString(res); diff --git a/tensorflow/stream_executor/gpu/gpu_driver.h b/tensorflow/stream_executor/gpu/gpu_driver.h index 25b90be1bd2..f72c9a129cf 100644 --- a/tensorflow/stream_executor/gpu/gpu_driver.h +++ b/tensorflow/stream_executor/gpu/gpu_driver.h @@ -71,8 +71,7 @@ class GpuDriver { // cuStreamCreate. // stream is an outparam owned by the caller, must not be null. // http://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__STREAM.html#group__CUDA__STREAM_1ga581f0c5833e21ded8b5a56594e243f4 - static bool CreateStream(GpuContext* context, GpuStreamHandle* stream, - int priority = 0); + static bool CreateStream(GpuContext* context, GpuStreamHandle* stream); // Destroys a CUDA stream associated with the given context. 
// stream is owned by the caller, must not be null, and *stream is set to null diff --git a/tensorflow/stream_executor/gpu/gpu_stream.cc b/tensorflow/stream_executor/gpu/gpu_stream.cc index 9899bbb04a3..887522cf3ae 100644 --- a/tensorflow/stream_executor/gpu/gpu_stream.cc +++ b/tensorflow/stream_executor/gpu/gpu_stream.cc @@ -23,8 +23,7 @@ namespace stream_executor { namespace gpu { bool GpuStream::Init() { - if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_, - priority_)) { + if (!GpuDriver::CreateStream(parent_->gpu_context(), &gpu_stream_)) { return false; } return GpuDriver::InitEvent(parent_->gpu_context(), &completed_event_, diff --git a/tensorflow/stream_executor/gpu/gpu_stream.h b/tensorflow/stream_executor/gpu/gpu_stream.h index e58ac72dd56..db0eec53b9c 100644 --- a/tensorflow/stream_executor/gpu/gpu_stream.h +++ b/tensorflow/stream_executor/gpu/gpu_stream.h @@ -48,8 +48,6 @@ class GpuStream : public internal::StreamInterface { // Explicitly initialize the CUDA resources associated with this stream, used // by StreamExecutor::AllocateStream(). bool Init(); - void SetPriority(int priority) { priority_ = priority; } - int priority() const { return priority_; } // Explicitly destroy the CUDA resources associated with this stream, used by // StreamExecutor::DeallocateStream(). @@ -80,7 +78,6 @@ class GpuStream : public internal::StreamInterface { private: GpuExecutor* parent_; // Executor that spawned this stream. GpuStreamHandle gpu_stream_; // Wrapped CUDA stream handle. - int priority_ = 0; // Event that indicates this stream has completed. GpuEventHandle completed_event_ = nullptr; diff --git a/tensorflow/stream_executor/rocm/rocm_driver.cc b/tensorflow/stream_executor/rocm/rocm_driver.cc index 5a8154f1df8..210e9d7a9fa 100644 --- a/tensorflow/stream_executor/rocm/rocm_driver.cc +++ b/tensorflow/stream_executor/rocm/rocm_driver.cc @@ -558,13 +558,7 @@ GpuDriver::ContextGetSharedMemConfig(GpuContext* context) { } /* static */ bool GpuDriver::CreateStream(GpuContext* context, - GpuStreamHandle* stream, - int priority) { - if (priority != 0) { - LOG(ERROR) << "ROCM stream doesn't support priority. " - << " Should be set to 0 but given: " << priority; - return false; - } + GpuStreamHandle* stream) { ScopedActivateContext activated{context}; hipError_t res = tensorflow::wrap::hipStreamCreateWithFlags( stream, hipStreamDefault); // switch to hipStreamNonBlocking? 
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt index 62bc83a3a69..79c33f7e304 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-g-p-u-options.pbtxt @@ -116,12 +116,6 @@ tf_proto { label: LABEL_REPEATED type: TYPE_FLOAT } - field { - name: "priority" - number: 2 - label: LABEL_REPEATED - type: TYPE_INT32 - } } } } diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt index 49750b0af85..3f6c6e636a1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.config.-logical-device-configuration.pbtxt @@ -3,10 +3,6 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - member { - name: "experimental_priority" - mtype: "" - } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt index 9b2a7d846b9..25b6b6e216e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.config.experimental.-virtual-device-configuration.pbtxt @@ -3,10 +3,6 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - member { - name: "experimental_priority" - mtype: "" - } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt index 49750b0af85..3f6c6e636a1 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.config.-logical-device-configuration.pbtxt @@ -3,10 +3,6 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - member { - name: "experimental_priority" - mtype: "" - } member { name: "memory_limit" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt index 9b2a7d846b9..25b6b6e216e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.config.experimental.-virtual-device-configuration.pbtxt @@ -3,10 +3,6 @@ tf_class { is_instance: "" is_instance: "" is_instance: "" - member { - name: "experimental_priority" - mtype: "" - } member { name: "memory_limit" mtype: "" From 8b3f0347e74bc69206f5ee36ccccd61ce6fec26f Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Fri, 29 May 2020 14:45:01 -0700 Subject: [PATCH 1412/1533] Register the semantic stats as internal. 
PiperOrigin-RevId: 313853594 Change-Id: I4e5ced627e8705cae77230671f70065e3ed25191 --- tensorflow/core/profiler/utils/BUILD | 1 + tensorflow/core/profiler/utils/xplane_schema.cc | 10 ++++++++++ tensorflow/core/profiler/utils/xplane_schema.h | 5 +---- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index ca20236d63b..0bc9374b7ba 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -178,6 +178,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", ], diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 2c474917110..8a4538197f3 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/core/lib/gtl/map_util.h" @@ -233,5 +234,14 @@ absl::optional FindStatType(absl::string_view stat_name) { return absl::nullopt; } +bool IsInternalStat(absl::optional stat_type) { + static const auto* const kInternalStats = new absl::flat_hash_set{ + StatType::kKernelDetails, StatType::kLevel0, + StatType::kProducerType, StatType::kProducerId, + StatType::kConsumerType, StatType::kConsumerId, + StatType::kIsRoot, StatType::kIsAsync}; + return stat_type.has_value() && kInternalStats->contains(*stat_type); +} + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index 82b95b0cdd8..d0944bf88e1 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -209,10 +209,7 @@ inline bool IsStatType(StatType stat_type, absl::string_view stat_name) { absl::optional FindStatType(absl::string_view stat_name); // Returns true if the given stat shouldn't be shown in the trace viewer. -inline bool IsInternalStat(absl::optional stat_type) { - return stat_type == StatType::kKernelDetails || - stat_type == StatType::kLevel0; -} +bool IsInternalStat(absl::optional stat_type); // Support for flow events: // This class enables encoding/decoding the flow id and direction, stored as From a5bd187cce379a14d8cbf2a0387778b821dc714b Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Fri, 29 May 2020 14:51:10 -0700 Subject: [PATCH 1413/1533] Add xla_hlo.dynamic_iota for non-static cases of iota Existing xla_hlo.iota does not cover all use cases. Added an xla_hlo.iota operation that supports a dynamially shaped output. This matches the behavior for dynamic_broadcast_in_dim. 
PiperOrigin-RevId: 313854741 Change-Id: Idf8361984d48e30eac9fb22ef3b54b178d925f0d --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 28 +++++++++++++++++++ tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 19 +++++++++++++ .../compiler/mlir/xla/mlir_hlo_to_hlo.cc | 5 ++++ .../compiler/mlir/xla/tests/canonicalize.mlir | 8 ++++++ 4 files changed, 60 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index a4bdc5c212c..22f1ffe1d3a 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -262,6 +262,34 @@ static LogicalResult Verify(IotaOp op) { return success(); } +//===----------------------------------------------------------------------===// +// DynamicIotaOp +//===----------------------------------------------------------------------===// + +namespace { + +struct DynamicIotaIsStatic : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(DynamicIotaOp iota, + PatternRewriter& rewriter) const override { + auto result_ty = iota.getType().cast(); + if (!result_ty.hasStaticShape()) { + return failure(); + } + + rewriter.replaceOpWithNewOp(iota, result_ty, iota.iota_dimension()); + return success(); + } +}; + +} // namespace + +void DynamicIotaOp::getCanonicalizationPatterns( + OwningRewritePatternList& results, MLIRContext* context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // AbsOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 7f162c97ca6..be8c86a3a0b 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -79,6 +79,25 @@ def HLO_IotaOp : HLO_Op<"iota", [NoSideEffect]>, BASE_HLO_IotaOp { let hasCustomHLOConverter = 1; } +def HLO_DynamicIotaOp: HLO_Op<"dynamic_iota", [NoSideEffect]> { + let summary = "Create linear increasing values from 0 to length -1."; + let description = [{ + Produces an HLO Tensor of the specified shape, with an incremental set of + values along the specified dimension starting at 0. + + Requires: + - The output length of the tensor result. + }]; + + let arguments = (ins HLO_DimensionTensor:$output_shape, I64Attr:$iota_dimension); + let results = (outs HLO_Tensor:$result); + + let hasCanonicalizer = 1; + // Cannot be exported to legacy formats. + let hasCustomHLOConverter = 1; +} + + def HLO_CreateTokenOp : HLO_Op<"create_token", [NoSideEffect]> { string summary = "Create Token operator"; diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 6479871e46b..3c670ef0c6e 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -607,6 +607,11 @@ LogicalResult ExportXlaOp(DynamicBroadcastInDimOp op, OpLoweringContext ctx) { return failure(); } +LogicalResult ExportXlaOp(DynamicIotaOp op, OpLoweringContext ctx) { + // This op has no expression in the legacy export format. + return failure(); +} + LogicalResult ExportXlaOp(DynamicReshapeOp op, OpLoweringContext ctx) { // This op has no expression in the legacy export format. 
return failure(); diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index 483dfb0c34f..ed9f1661616 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -281,6 +281,14 @@ func @complex_collapse_fold(%arg0: tensor<4xcomplex>) -> tensor<4xcomplex> } +// CHECK-LABEL: @dynamic_iota_is_static +func @dynamic_iota_is_static(%arg0 : tensor<1xindex>) -> tensor<4xi32> { + // CHECK: [[RESULT:%.*]] = "xla_hlo.iota" + // CHECK: return [[RESULT]] + %0 = "xla_hlo.dynamic_iota"(%arg0) {iota_dimension = 0 : i64} : (tensor<1xindex>) -> tensor<4xi32> + return %0 : tensor<4xi32> +} + // CHECK-LABEL: @iota_not_lowered_to_constant func @iota_not_lowered_to_constant() -> tensor<4xi32> { // CHECK: [[RESULT:%.*]] = "xla_hlo.iota" From a6a3a48679f2e43047ae764478026171217bd35e Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Fri, 29 May 2020 15:11:27 -0700 Subject: [PATCH 1414/1533] Added missing resource types to arguments. Image2DArray/Image3D/ImageBuffer. PiperOrigin-RevId: 313858546 Change-Id: I5a83491728c7f6709994464186725649ad81e3c7 --- tensorflow/lite/delegates/gpu/cl/arguments.cc | 148 ++++++++++++++++-- tensorflow/lite/delegates/gpu/cl/arguments.h | 12 ++ tensorflow/lite/delegates/gpu/cl/gpu_object.h | 30 ++++ .../delegates/gpu/cl/kernels/transpose.cc | 2 +- .../lite/delegates/gpu/cl/kernels/winograd.cc | 2 +- 5 files changed, 183 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc index 7b28ee215da..908780d6caf 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -87,6 +87,13 @@ void ReplaceAllWords(const std::string& old_word, const std::string& new_word, } } +void AppendArgument(const std::string& arg, std::string* args) { + if (!args->empty()) { + absl::StrAppend(args, ",\n "); + } + absl::StrAppend(args, arg); +} + } // namespace Arguments::Arguments(Arguments&& args) @@ -96,6 +103,9 @@ Arguments::Arguments(Arguments&& args) shared_float4s_data_(std::move(args.shared_float4s_data_)), buffers_(std::move(args.buffers_)), images2d_(std::move(args.images2d_)), + image2d_arrays_(std::move(args.image2d_arrays_)), + images3d_(std::move(args.images3d_)), + image_buffers_(std::move(args.image_buffers_)), object_refs_(std::move(args.object_refs_)), objects_(std::move(args.objects_)) {} Arguments& Arguments::operator=(Arguments&& args) { @@ -106,6 +116,9 @@ Arguments& Arguments::operator=(Arguments&& args) { shared_float4s_data_ = std::move(args.shared_float4s_data_); buffers_ = std::move(args.buffers_); images2d_ = std::move(args.images2d_); + image2d_arrays_ = std::move(args.image2d_arrays_); + images3d_ = std::move(args.images3d_); + image_buffers_ = std::move(args.image_buffers_); object_refs_ = std::move(args.object_refs_); objects_ = std::move(args.objects_); } @@ -127,6 +140,21 @@ void Arguments::AddImage2D(const std::string& name, images2d_[name] = desc; } +void Arguments::AddImage2DArray(const std::string& name, + const GPUImage2DArrayDescriptor& desc) { + image2d_arrays_[name] = desc; +} + +void Arguments::AddImage3D(const std::string& name, + const GPUImage3DDescriptor& desc) { + images3d_[name] = desc; +} + +void Arguments::AddImageBuffer(const std::string& name, + const GPUImageBufferDescriptor& desc) { + image_buffers_[name] = desc; +} + void Arguments::AddObjectRef(const std::string& name, 
GPUObjectDescriptorPtr&& descriptor_ptr) { object_refs_[name] = {AccessType::READ, std::move(descriptor_ptr)}; @@ -150,6 +178,15 @@ void Arguments::AddGPUResources(const std::string& name, for (const auto& r : resources.images2d) { AddImage2D(absl::StrCat(name, "_", r.first), r.second); } + for (const auto& r : resources.image2d_arrays) { + AddImage2DArray(absl::StrCat(name, "_", r.first), r.second); + } + for (const auto& r : resources.images3d) { + AddImage3D(absl::StrCat(name, "_", r.first), r.second); + } + for (const auto& r : resources.image_buffers) { + AddImageBuffer(absl::StrCat(name, "_", r.first), r.second); + } } absl::Status Arguments::SetInt(const std::string& name, int value) { @@ -179,12 +216,12 @@ absl::Status Arguments::SetFloat(const std::string& name, float value) { } absl::Status Arguments::SetImage2D(const std::string& name, cl_mem memory) { - auto ti = images2d_.find(name); - if (ti == images2d_.end()) { + auto it = images2d_.find(name); + if (it == images2d_.end()) { return absl::NotFoundError( absl::StrCat("No image2D argument with name - ", name)); } - ti->second.memory = memory; + it->second.memory = memory; return absl::OkStatus(); } @@ -198,6 +235,47 @@ absl::Status Arguments::SetBuffer(const std::string& name, cl_mem memory) { return absl::OkStatus(); } +absl::Status Arguments::SetImage2DArray(const std::string& name, + cl_mem memory) { + auto it = image2d_arrays_.find(name); + if (it == image2d_arrays_.end()) { + return absl::NotFoundError( + absl::StrCat("No image2D array argument with name - ", name)); + } + it->second.memory = memory; + return absl::OkStatus(); +} + +absl::Status Arguments::SetImage3D(const std::string& name, cl_mem memory) { + auto it = images3d_.find(name); + if (it == images3d_.end()) { + return absl::NotFoundError( + absl::StrCat("No image3D argument with name - ", name)); + } + it->second.memory = memory; + return absl::OkStatus(); +} + +absl::Status Arguments::SetImageBuffer(const std::string& name, cl_mem memory) { + auto it = image_buffers_.find(name); + if (it == image_buffers_.end()) { + return absl::NotFoundError( + absl::StrCat("No image buffer argument with name - ", name)); + } + it->second.memory = memory; + return absl::OkStatus(); +} + +absl::Status Arguments::SetObjectRef(const std::string& name, + const GPUObject* object) { + auto it = object_refs_.find(name); + if (it == object_refs_.end()) { + return absl::NotFoundError( + absl::StrCat("No object ref with name - ", name)); + } + return SetGPUResources(name, object->GetGPUResources()); +} + absl::Status Arguments::SetGPUResources( const std::string& name, const GPUResourcesWithValue& resources) { for (const auto& r : resources.ints) { @@ -212,6 +290,16 @@ absl::Status Arguments::SetGPUResources( for (const auto& r : resources.images2d) { RETURN_IF_ERROR(SetImage2D(absl::StrCat(name, "_", r.first), r.second)); } + for (const auto& r : resources.image2d_arrays) { + RETURN_IF_ERROR( + SetImage2DArray(absl::StrCat(name, "_", r.first), r.second)); + } + for (const auto& r : resources.images3d) { + RETURN_IF_ERROR(SetImage3D(absl::StrCat(name, "_", r.first), r.second)); + } + for (const auto& r : resources.image_buffers) { + RETURN_IF_ERROR(SetImageBuffer(absl::StrCat(name, "_", r.first), r.second)); + } return absl::OkStatus(); } @@ -227,17 +315,29 @@ std::string Arguments::GetListOfArgs() { for (auto& t : buffers_) { const std::string type_name = t.second.data_type == DataType::FLOAT32 ? 
"float" : "half"; - absl::StrAppend(&result, ",\n __global ", type_name, t.second.element_size, - "* ", t.first); + AppendArgument(absl::StrCat("__global ", type_name, t.second.element_size, + "* ", t.first), + &result); + } + for (auto& t : image_buffers_) { + AppendArgument(absl::StrCat("__read_only image1d_buffer_t ", t.first), + &result); } for (auto& t : images2d_) { - absl::StrAppend(&result, ",\n __read_only image2d_t ", t.first); + AppendArgument(absl::StrCat("__read_only image2d_t ", t.first), &result); + } + for (auto& t : image2d_arrays_) { + AppendArgument(absl::StrCat("__read_only image2d_array_t ", t.first), + &result); + } + for (auto& t : images3d_) { + AppendArgument(absl::StrCat("__read_only image3d_t ", t.first), &result); } for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) { - absl::StrAppend(&result, ",\n int4 shared_int4_", i); + AppendArgument(absl::StrCat("int4 shared_int4_", i), &result); } for (int i = 0; i < shared_float4s_data_.size() / 4; ++i) { - absl::StrAppend(&result, ",\n float4 shared_float4_", i); + AppendArgument(absl::StrCat("float4 shared_float4_", i), &result); } return result; } @@ -253,6 +353,16 @@ absl::Status Arguments::Bind(cl_kernel kernel, int offset) { } offset++; } + for (auto& t : image_buffers_) { + const int error_code = + clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); + if (error_code != CL_SUCCESS) { + return absl::UnknownError(absl::StrCat( + "Failed to set kernel arguments - ", CLErrorCodeToString(error_code), + "(at index - ", offset, ")")); + } + offset++; + } for (auto& t : images2d_) { const int error_code = clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); @@ -263,6 +373,26 @@ absl::Status Arguments::Bind(cl_kernel kernel, int offset) { } offset++; } + for (auto& t : image2d_arrays_) { + const int error_code = + clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); + if (error_code != CL_SUCCESS) { + return absl::UnknownError(absl::StrCat( + "Failed to set kernel arguments - ", CLErrorCodeToString(error_code), + "(at index - ", offset, ")")); + } + offset++; + } + for (auto& t : images3d_) { + const int error_code = + clSetKernelArg(kernel, offset, sizeof(cl_mem), &t.second.memory); + if (error_code != CL_SUCCESS) { + return absl::UnknownError(absl::StrCat( + "Failed to set kernel arguments - ", CLErrorCodeToString(error_code), + "(at index - ", offset, ")")); + } + offset++; + } for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) { const int error_code = clSetKernelArg(kernel, offset, sizeof(int32_t) * 4, &shared_int4s_data_[i * 4]); @@ -342,7 +472,7 @@ void Arguments::ResolveObjectNames(const std::string& object_name, const std::vector& member_names, std::string* code) { for (const auto& member_name : member_names) { - const std::string new_name = "args." 
+ object_name + "_" + member_name; + const std::string new_name = kArgsPrefix + object_name + "_" + member_name; ReplaceAllWords(member_name, new_name, code); } } diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.h b/tensorflow/lite/delegates/gpu/cl/arguments.h index 65c114b2cf6..bad308d9da5 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.h +++ b/tensorflow/lite/delegates/gpu/cl/arguments.h @@ -39,6 +39,11 @@ class Arguments { void AddInt(const std::string& name, int value = 0); void AddBuffer(const std::string& name, const GPUBufferDescriptor& desc); void AddImage2D(const std::string& name, const GPUImage2DDescriptor& desc); + void AddImage2DArray(const std::string& name, + const GPUImage2DArrayDescriptor& desc); + void AddImage3D(const std::string& name, const GPUImage3DDescriptor& desc); + void AddImageBuffer(const std::string& name, + const GPUImageBufferDescriptor& desc); void AddObjectRef(const std::string& name, GPUObjectDescriptorPtr&& descriptor_ptr); @@ -48,6 +53,10 @@ class Arguments { absl::Status SetFloat(const std::string& name, float value); absl::Status SetImage2D(const std::string& name, cl_mem memory); absl::Status SetBuffer(const std::string& name, cl_mem memory); + absl::Status SetImage2DArray(const std::string& name, cl_mem memory); + absl::Status SetImage3D(const std::string& name, cl_mem memory); + absl::Status SetImageBuffer(const std::string& name, cl_mem memory); + absl::Status SetObjectRef(const std::string& name, const GPUObject* object); std::string GetListOfArgs(); @@ -112,6 +121,9 @@ class Arguments { std::map buffers_; std::map images2d_; + std::map image2d_arrays_; + std::map images3d_; + std::map image_buffers_; struct ObjectRefArg { AccessType access_type; diff --git a/tensorflow/lite/delegates/gpu/cl/gpu_object.h b/tensorflow/lite/delegates/gpu/cl/gpu_object.h index 23d1f210459..43eba5b1481 100644 --- a/tensorflow/lite/delegates/gpu/cl/gpu_object.h +++ b/tensorflow/lite/delegates/gpu/cl/gpu_object.h @@ -34,6 +34,21 @@ struct GPUImage2DDescriptor { cl_mem memory; }; +struct GPUImage3DDescriptor { + DataType data_type; + cl_mem memory; +}; + +struct GPUImage2DArrayDescriptor { + DataType data_type; + cl_mem memory; +}; + +struct GPUImageBufferDescriptor { + DataType data_type; + cl_mem memory; +}; + struct GPUBufferDescriptor { DataType data_type; int element_size; @@ -45,6 +60,9 @@ struct GPUResources { std::vector floats; std::vector> buffers; std::vector> images2d; + std::vector> image2d_arrays; + std::vector> images3d; + std::vector> image_buffers; std::vector GetNames() const { std::vector names = ints; @@ -55,6 +73,15 @@ struct GPUResources { for (const auto& obj : images2d) { names.push_back(obj.first); } + for (const auto& obj : image2d_arrays) { + names.push_back(obj.first); + } + for (const auto& obj : images3d) { + names.push_back(obj.first); + } + for (const auto& obj : image_buffers) { + names.push_back(obj.first); + } return names; } }; @@ -64,6 +91,9 @@ struct GPUResourcesWithValue { std::vector> floats; std::vector> buffers; std::vector> images2d; + std::vector> image2d_arrays; + std::vector> images3d; + std::vector> image_buffers; }; class GPUObjectDescriptor { diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc index 7a1d454b571..4102e9d02a1 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc @@ -55,7 +55,7 @@ std::string GetTransposeCode( c += "__kernel void main_function(\n"; c 
+= src_tensor.GetDeclaration(AccessType::READ); c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE); + c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n "; c += "$0) {\n"; if (op_def.IsBatchSupported()) { c += " int linear_id = get_global_id(0);\n"; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index e3c9306b80c..82688931755 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -90,7 +90,7 @@ std::string GetWinograd4x4To36Code( c += GetArgsDeclaration(linked_operations); c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n"; c += " int4 src_size, \n"; - c += " int4 dst_size"; + c += " int4 dst_size,\n "; c += "$0) {\n"; c += " int DST_X = get_global_id(0);\n"; c += " int DST_Y = get_global_id(1);\n"; From 677c1960415d77eba6cd075a8d8fae994a0c730f Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Fri, 29 May 2020 15:12:20 -0700 Subject: [PATCH 1415/1533] Fix inlined function logging PiperOrigin-RevId: 313858692 Change-Id: I8823363003eef3a9bf0f7f66322537f2dc3fc8de --- .../common_runtime/inline_function_utils.cc | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/inline_function_utils.cc b/tensorflow/core/common_runtime/inline_function_utils.cc index a074942629d..5a07573a430 100644 --- a/tensorflow/core/common_runtime/inline_function_utils.cc +++ b/tensorflow/core/common_runtime/inline_function_utils.cc @@ -21,6 +21,7 @@ limitations under the License. #include "absl/algorithm/container.h" #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/function_utils.h" #include "tensorflow/core/common_runtime/graph_constructor.h" @@ -545,6 +546,19 @@ Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g, return node; }; + // ------------------------------------------------------------------------ // + // Helper function to get an input/output argument name by index. For + // functions instantiated from SymbolicGradien corresponding FunctionDef is + // empty, and argument name is unknown. + + auto arg_name = [&](auto& args, size_t i) -> absl::string_view { + if (i < args.size()) { + return args[i].name(); + } else { + return ""; + } + }; + // ------------------------------------------------------------------------ // // Input edges. For data edges coming into "caller", we first compute the // : for the i-th input in "inputs". @@ -663,11 +677,12 @@ Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g, // // The added identity nodes depend on "input_control_node". 
VLOG(4) << "Add input Identity nodes for each function argument:"; + for (std::size_t i = 0; i < fbody->arg_nodes.size(); ++i) { Node* arg = node_map[fbody->arg_nodes[i]->id()]; Node* n = input_identity("input", inputs[i], i); VLOG(4) << " [index " << i << "] " - << fbody->fdef.signature().input_arg(i).name() << " as " + << arg_name(fbody->fdef.signature().input_arg(), i) << " as " << n->name() << " (input: " << inputs[i].name() << ", requested_device: " << n->requested_device() << ")"; @@ -704,6 +719,7 @@ Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g, // If `keep_node_fetchable` is `true` we always add an output control node, to // guarantee that executing a fetchable node will execute all side-effects. VLOG(4) << "Add output Identity nodes for each function output argument:"; + std::vector outputs(caller->num_outputs()); for (std::size_t i = 0; i < fbody->ret_nodes.size(); ++i) { Node* ret = node_map[fbody->ret_nodes[i]->id()]; @@ -718,7 +734,7 @@ Status InlineFunctionBody(const FunctionLibraryDefinition& flib_def, Graph* g, Node* n = output_identity("output", data, i); outputs[i] = n; VLOG(4) << " [index " << i << "] " - << fbody->fdef.signature().output_arg(i).name() << " as " + << arg_name(fbody->fdef.signature().output_arg(), i) << " as " << n->name() << " (ret: " << data.node->name() << ":" << data.index << ", requested_device: " << n->requested_device() << ")"; for (const Edge* e : ret->in_edges()) { From 5a62ab6215361d665e9655966b074de1af71f3c7 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 29 May 2020 15:15:45 -0700 Subject: [PATCH 1416/1533] Update TPUExtractHeadTailOutsideCompilation pass in preparation for tail extraction (NFC). This simplifies and updates some test cases, and extract some reused logic used by tail extraction. PiperOrigin-RevId: 313859255 Change-Id: I35bb385c0a76aae54cc7836db8a8f549cd9b86ff --- ...extract_head_tail_outside_compilation.mlir | 188 +++++++---------- ...u_extract_head_tail_outside_compilation.cc | 194 ++++++++++-------- 2 files changed, 184 insertions(+), 198 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 474bfb1eef1..dd31b7d06ef 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -3,15 +3,16 @@ // Tests extraction of a outside compiled ops at head of TPU computation. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { - // CHECK-LABEL: func @single_head_outside_compilation - func @single_head_outside_compilation(%arg0: tensor) { - // CHECK: tf_device.launch - // CHECK: "tf.A" + // CHECK-LABEL: func @head_single_outside_compiled_op + func @head_single_outside_compiled_op(%arg0: tensor) { + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.A" // CHECK-NEXT: tf_device.return - // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: "tf_device.cluster" - // CHECK: "tf.C" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () @@ -22,54 +23,54 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor return } - // CHECK-LABEL: func @ops_no_operands - func @ops_no_operands() { - // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK-LABEL: func @head_single_outside_compiled_op_no_operands + func @head_single_outside_compiled_op_no_operands() { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" // CHECK-NEXT: tf_device.return %[[A_OUT]] - // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: "tf_device.cluster" // CHECK-NEXT: "tf.B"(%[[LAUNCH_OUT]]) // CHECK-NEXT: "tf.C" // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %0 = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor - %1 = "tf.B"(%0) {}: (tensor) -> tensor - "tf.C"(%1) : (tensor) -> () + %a = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor + %b = "tf.B"(%a) : (tensor) -> tensor + "tf.C"(%b) : (tensor) -> () tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } - // CHECK-LABEL: func @op_operand_outside_cluster - func @op_operand_outside_cluster() { + // CHECK-LABEL: func @head_operand_op_outside_cluster + func @head_operand_op_outside_cluster() { // CHECK: %[[A_OUT:.*]] = "tf.A" - %0 = "tf.A"() : () -> tensor - // CHECK-NEXT: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[B_OUT:.*]] = "tf.B" + %a = "tf.A"() : () -> tensor + // CHECK-NEXT: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" // CHECK-NEXT: tf_device.return %[[B_OUT]] - // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: "tf_device.cluster" // CHECK-NEXT: "tf.C"(%[[LAUNCH_OUT]]) // CHECK-NEXT: "tf.D" // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - %2 = "tf.C"(%1) {}: (tensor) -> tensor - "tf.D"(%2) : (tensor) -> () + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %c = "tf.C"(%b) : (tensor) -> tensor + "tf.D"(%c) : (tensor) -> () tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } - // CHECK-LABEL: func @aliased_output - func @aliased_output() -> (tensor, tensor, tensor) { - // CHECK: 
%[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK-LABEL: func @head_aliased_output + func @head_aliased_output() -> (tensor, tensor, tensor) { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" // CHECK-NEXT: tf_device.return %[[A_OUT]] - // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[LAUNCH_OUT]]) @@ -81,124 +82,75 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK-DAG: padding_map = [] // CHECK-DAG: topology = "" // CHECK-DAG: device_assignment = [] - // - // CHECK: return %[[LAUNCH_OUT]], %[[CLUSTER_OUT]]#0, %[[CLUSTER_OUT]]#1 - %0:3 = "tf_device.cluster"() ( { - %1 = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor - %2 = "tf.B"(%1) {}: (tensor) -> tensor - %3 = "tf.C"(%2) : (tensor) -> tensor - tf_device.return %1, %3, %2 : tensor, tensor, tensor + %cluster:3 = "tf_device.cluster"() ( { + %a = "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> tensor + %b = "tf.B"(%a) : (tensor) -> tensor + %c = "tf.C"(%b) : (tensor) -> tensor + tf_device.return %a, %c, %b : tensor, tensor, tensor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor) - return %0#0, %0#1, %0#2 : tensor, tensor, tensor + // CHECK: return %[[LAUNCH_OUT]], %[[CLUSTER_OUT]]#0, %[[CLUSTER_OUT]]#1 + return %cluster#0, %cluster#1, %cluster#2 : tensor, tensor, tensor } - // CHECK-LABEL: func @all_head_computation_ops - func @all_head_computation_ops(%arg0: tensor) -> tensor { - // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" - // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) - // CHECK: %[[C_OUT:.*]] = "tf.C"(%[[B_OUT]], %arg0) + // CHECK-LABEL: func @head_all_cluster_op + func @head_all_cluster_op(%arg0: tensor) -> tensor { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[B_OUT]], %arg0) // CHECK-NEXT: tf_device.return %[[C_OUT]] - // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: "tf_device.cluster" // CHECK-NEXT: tf_device.return - // - // CHECK: return %[[LAUNCH_OUT]] - %0 = "tf_device.cluster"() ( { - %1 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - %2 = "tf.B"(%1) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - %3 = "tf.C"(%2, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor - tf_device.return %3 : tensor + %cluster = "tf_device.cluster"() ( { + %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %c = "tf.C"(%b, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + tf_device.return %c : tensor }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor - return %0 : tensor + // CHECK: return %[[LAUNCH_OUT]] + return %cluster : tensor } - // CHECK-LABEL: func @multiple_head_outside_compilation - func @multiple_head_outside_compilation(%arg0: tensor) { - // CHECK: 
%[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" - // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) - // CHECK: "tf.C" - // CHECK-NEXT: tf_device.return %[[B_OUT]] - // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" - // - // CHECK: "tf_device.cluster" - // CHECK: "tf.D"(%[[LAUNCH_OUT]]) - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () - "tf.D"(%1) : (tensor) -> () - tf_device.return - }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () - return - } - - // CHECK-LABEL: func @no_extraction_middle_outside_compiled_ops - func @no_extraction_middle_outside_compiled_ops(%arg0: tensor) { - // CHECK-NOT: tf_device.launch - // CHECK: "tf_device.cluster" - // CHECK-NEXT: "tf.A" - // CHECK-NEXT: "tf.B" + // CHECK-LABEL: func @head_multiple_outside_compiled_ops + func @head_multiple_outside_compiled_ops(%arg0: tensor) { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) // CHECK-NEXT: "tf.C" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {} : (tensor) -> tensor - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> tensor - "tf.C"(%1) : (tensor) -> () - tf_device.return - }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () - return - } - - // CHECK-LABEL: func @no_extraction_tpu_op_operands - func @no_extraction_tpu_op_operands(%arg0: tensor) { - // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" - // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) - // CHECK-NEXT: tf_device.return %[[D_OUT]] - // CHECK: device = "/job:worker/replica:0/task:0/device:CPU:0" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" // // CHECK: "tf_device.cluster" - // CHECK: "tf.B" - // CHECK: "tf.C" - // CHECK: "tf.E" + // CHECK-NEXT: "tf.D"(%[[LAUNCH_OUT]]) // CHECK-NEXT: tf_device.return "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - %1 = "tf.B"() {} : () -> tensor - %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor - %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> tensor - %4 = "tf.E"(%3) {} : (tensor) -> tensor + %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + "tf.C"(%b, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.D"(%b) : (tensor) -> () tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () return } - // CHECK-LABEL: func @replicated_head_outside_compilation - func @replicated_head_outside_compilation(%arg0: tensor, %arg1: tensor) { + // CHECK-LABEL: func @head_replicated_outside_compilation + func @head_replicated_outside_compilation(%arg0: tensor, %arg1: tensor) { // CHECK: tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor) // // CHECK-NEXT: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - 
// CHECK: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) - // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) - // CHECK-NEXT: tf_device.return %[[D_OUT]] - // CHECK: device = "TPU_REPLICATED_HOST" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: device = "TPU_REPLICATED_HOST" // // CHECK: "tf_device.cluster" - // CHECK: %[[B_OUT:.*]] = "tf.B" - // CHECK: "tf.C"(%[[RI]], %[[B_OUT]]) - // CHECK: "tf.E"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: "tf.B"(%[[LAUNCH_OUT]]) // CHECK-NEXT: tf_device.return tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { "tf_device.cluster"() ( { - %0 = "tf.A"(%ri) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor - %1 = "tf.B"() {} : () -> tensor - %2 = "tf.C"(%ri, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor - %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> tensor - %4 = "tf.E"(%3) {} : (tensor) -> tensor + %a = "tf.A"(%ri) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + "tf.B"(%a) : (tensor) -> () tf_device.return }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () tf_device.return diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 1c442345d43..95183e04223 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include #include +#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -53,6 +54,8 @@ bool HasOutsideCompilationAttribute(Operation* op) { return op->getAttrOfType(kXlaOutsideCompilationAttr) != nullptr; } +// Finds op that created a given value. If the value is a BlockArgument, this +// returns the owner of the Block. Operation* GetOpOfValue(Value value) { if (auto block_arg = value.dyn_cast()) return block_arg.getOwner()->getParentOp(); @@ -60,36 +63,52 @@ Operation* GetOpOfValue(Value value) { return value.getDefiningOp(); } -// Returns a set of ops that are outside compiled and can be extracted to before -// the TPU computation. These ops are either connected to the inputs of the TPU -// computation or other ops that can be extracted, and have no dependencies with -// other ops in the TPU computation that cannot be extracted. -llvm::SmallVector FindOutsideCompiledOpsAtHead( - tf_device::ClusterOp cluster) { - Region* cluster_region = &cluster.body(); - llvm::SmallSetVector head_outside_compiled_ops; +// Checks if `op` is nested in `block`. +bool OpInBlock(Operation* op, Block* block) { + Block* op_block = op->getBlock(); + while (op_block) { + if (op_block == block) return true; + if (auto* parent_op = op_block->getParentOp()) { + op_block = parent_op->getBlock(); + } else { + break; + } + } + return false; +} - auto cluster_ops = cluster.GetBody().without_terminator(); - for (Operation& cluster_op : cluster_ops) { - if (!HasOutsideCompilationAttribute(&cluster_op)) continue; - // An outside compiled op can be extracted if its operands are not from - // other ops in the cluster that cannot be extracted. 
- auto walk_result = cluster_op.walk([&](Operation* op) { - for (Value operand : op->getOperands()) { - Operation* operand_op = GetOpOfValue(operand); - if (head_outside_compiled_ops.count(operand_op)) continue; - - if (operand_op->getParentRegion() == cluster_region) - return WalkResult::interrupt(); +// Wraps block in a Launch. External uses of ops in the block will be return +// values of the Launch and remapped to the Launch results. +tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder, Operation* op, + Block* launch_block, + llvm::StringRef host_device) { + // Find results and result types of ops in block that needs to returned. + llvm::SmallVector launch_results; + llvm::SmallVector launch_result_types; + for (Operation& head_outside_compiled_op : *launch_block) { + for (Value result : head_outside_compiled_op.getResults()) { + bool has_external_uses = false; + for (Operation* user : result.getUsers()) { + if (OpInBlock(user, launch_block)) continue; + has_external_uses = true; + break; } - return WalkResult::advance(); - }); - - if (!walk_result.wasInterrupted()) - head_outside_compiled_ops.insert(&cluster_op); + if (has_external_uses) { + launch_results.push_back(result); + launch_result_types.push_back(result.getType()); + } + } } - return head_outside_compiled_ops.takeVector(); + builder->setInsertionPoint(op); + auto launch = builder->create( + op->getLoc(), builder->getStringAttr(host_device), launch_result_types); + launch.body().push_back(launch_block); + + builder->setInsertionPointToEnd(&launch.GetBody()); + builder->create(op->getLoc(), launch_results); + + return launch; } // Parses TPU compilation and execution devices from a TPU cluster and returns @@ -150,57 +169,60 @@ LogicalResult GetHostDeviceForHeadTailComputation( return success(); } +// Returns a set of ops that are outside compiled and can be extracted to before +// the TPU computation. These ops are either connected to the inputs of the TPU +// computation or other ops that can be extracted, and have no operands from +// other ops in the TPU computation that cannot be extracted. +llvm::SmallVector FindOutsideCompiledOpsAtHead( + tf_device::ClusterOp cluster) { + Region* cluster_region = &cluster.body(); + llvm::SmallSetVector head_outside_compiled_ops; + + auto walk_operands = [&](Operation* op) { + for (Value operand : op->getOperands()) { + Operation* operand_op = GetOpOfValue(operand); + if (head_outside_compiled_ops.count(operand_op)) continue; + + if (operand_op->getParentRegion() == cluster_region) + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }; + auto cluster_ops = cluster.GetBody().without_terminator(); + for (Operation& cluster_op : cluster_ops) { + if (!HasOutsideCompilationAttribute(&cluster_op)) continue; + // An outside compiled op can be extracted if its operands are not from + // other ops in the cluster that cannot be extracted. + auto walk_result = cluster_op.walk(walk_operands); + + if (!walk_result.wasInterrupted()) + head_outside_compiled_ops.insert(&cluster_op); + } + + return head_outside_compiled_ops.takeVector(); +} + // Moves head outside compiled ops into its own `tf_device.LaunchOp` -// computation. -tf_device::LaunchOp CreateHeadComputation( - OpBuilder* builder, tf_device::ClusterOp cluster, - llvm::ArrayRef head_outside_compiled_ops, - llvm::StringRef host_device) { +// computation before the cluster. 
+void CreateHeadComputation(OpBuilder* builder, tf_device::ClusterOp cluster, + llvm::ArrayRef head_outside_compiled_ops, + llvm::StringRef host_device) { Block* launch_block = new Block; for (Operation* head_outside_compiled_op : head_outside_compiled_ops) head_outside_compiled_op->moveBefore(launch_block, launch_block->end()); - // Find results of ops in head computation that needs to returned. - llvm::SmallVector launch_results; - llvm::SmallVector launch_result_types; - for (Operation& head_outside_compiled_op : *launch_block) { - for (Value result : head_outside_compiled_op.getResults()) { - bool has_uses_in_cluster = false; - for (Operation* user : result.getUsers()) { - if (user->getParentRegion() && - cluster.body().isAncestor(user->getParentRegion())) { - has_uses_in_cluster = true; - break; - } - } - if (has_uses_in_cluster) { - launch_results.push_back(result); - launch_result_types.push_back(result.getType()); - } - } - } + tf_device::LaunchOp launch = + CreateLaunchForBlock(builder, cluster, launch_block, host_device); - builder->setInsertionPoint(cluster); - auto launch = builder->create( - cluster.getLoc(), builder->getStringAttr(host_device), - launch_result_types); - launch.body().push_back(launch_block); - - builder->setInsertionPointToEnd(&launch.GetBody()); - builder->create(cluster.getLoc(), launch_results); - - for (auto result : llvm::zip(launch_results, launch.getResults())) + for (auto result : llvm::zip(launch.GetBody().getTerminator()->getOperands(), + launch.getResults())) replaceAllUsesInRegionWith(std::get<0>(result), std::get<1>(result), cluster.body()); - - return launch; } -// Removes aliased outputs in cluster from head computation after head -// computation has been extracted. -void RemoveHeadComputationAliasedOutputs(OpBuilder* builder, - tf_device::LaunchOp head_computation, - tf_device::ClusterOp cluster) { +// Removes aliased outputs in cluster from ops outside of cluster. +void RemoveClusterAliasedOutputs(OpBuilder* builder, + tf_device::ClusterOp cluster) { llvm::SmallVector used_old_cluster_results; llvm::SmallVector new_cluster_results; llvm::SmallVector new_cluster_result_types; @@ -208,12 +230,13 @@ void RemoveHeadComputationAliasedOutputs(OpBuilder* builder, for (auto result : llvm::zip(cluster_terminator->getOperands(), cluster.getResults())) { Value cluster_terminator_operand = std::get<0>(result); - if (cluster_terminator_operand.getDefiningOp() == head_computation) { - std::get<1>(result).replaceAllUsesWith(cluster_terminator_operand); - } else { + if (cluster.getOperation()->isProperAncestor( + cluster_terminator_operand.getDefiningOp())) { new_cluster_results.push_back(cluster_terminator_operand); new_cluster_result_types.push_back(cluster_terminator_operand.getType()); used_old_cluster_results.push_back(std::get<1>(result)); + } else { + std::get<1>(result).replaceAllUsesWith(cluster_terminator_operand); } } @@ -233,6 +256,26 @@ void RemoveHeadComputationAliasedOutputs(OpBuilder* builder, cluster.erase(); } +// Extracts and move outside compiled ops that have no dependencies in the +// cluster to before the cluster. 
+mlir::LogicalResult LiftHeadOutsideCompiledOps( + OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, + tf_device::ClusterOp cluster) { + llvm::SmallVector head_outside_compiled_ops = + FindOutsideCompiledOpsAtHead(cluster); + if (head_outside_compiled_ops.empty()) return success(); + std::string host_device; + if (failed( + GetHostDeviceForHeadTailComputation(devices, cluster, &host_device))) + return failure(); + + CreateHeadComputation(builder, cluster, head_outside_compiled_ops, + host_device); + + RemoveClusterAliasedOutputs(builder, cluster); + return success(); +} + struct TPUExtractHeadTailOutsideCompilation : public PassWrapper> { @@ -252,18 +295,9 @@ void TPUExtractHeadTailOutsideCompilation::runOnOperation() { [&](tf_device::ClusterOp cluster) { clusters.push_back(cluster); }); for (tf_device::ClusterOp cluster : clusters) { - llvm::SmallVector head_outside_compiled_ops = - FindOutsideCompiledOpsAtHead(cluster); - if (head_outside_compiled_ops.empty()) continue; - std::string host_device; - if (failed(GetHostDeviceForHeadTailComputation(devices, cluster, - &host_device))) + if (failed(LiftHeadOutsideCompiledOps(&builder, devices, cluster))) return signalPassFailure(); - tf_device::LaunchOp head_computation = CreateHeadComputation( - &builder, cluster, head_outside_compiled_ops, host_device); - RemoveHeadComputationAliasedOutputs(&builder, head_computation, cluster); - // TODO(b/157160906): Implement tail outside compiled op extraction. } } From f90c649e28a736ef7f1f9349f72b2d8d1afaa906 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 29 May 2020 15:16:01 -0700 Subject: [PATCH 1417/1533] Uniformly import and export _Arg node/FunctionDef arg attributes. In a Function Graph (Graph generated from a Function/FunctionDef), it is possible to have other attributes on the generated _Arg nodes. These attributes are either modeled as fields in FunctionDef ('_resource_arg_unique_id' attributes are stored as FunctionDef::map resource_arg_unique_id) or explicitly in FunctionDef::map arg_attr. When converting a FunctionDef to a Graph (in import), these attributes are added to generated _Arg node attributes. Some of these attributes should be preserved for downstream users. Currently only '_resource_arg_unique_id' is being imported with special handling. This change unifies and imports any _Arg attribute that is not a shape inference based attribute or _Arg op def attribute. On export, attributes of the 'tf' dialect ('tf.' prefix) are added back. For the main function Graph, the attributes are simply added back to the generated _Arg node. For other functions, as a FunctionDef is created instead, '_resource_arg_unique_id' is handled differently, specifically adding it's content to FunctionDef::map resource_arg_unique_id while all other attribute are added to FunctionDef::map arg_attr. 
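A rough, non-authoritative sketch of the behavior this enables (the module and
argument names below are assumed for illustration only, not taken from the
change): an attribute such as '_user_specified_name' on an _Arg node is now
imported as a 'tf.'-prefixed MLIR argument attribute and exported back into
FunctionDef arg_attr.

    import tensorflow as tf

    class Module(tf.Module):

      # Hypothetical example function; only the attribute round-trip matters.
      @tf.function(input_signature=[tf.TensorSpec([], tf.float32)])
      def f(self, x):
        return x + 1.0

    # On import, the MLIR function generated for `f` is expected to carry an
    # argument attribute along the lines of:
    #   %arg0: tensor<f32> {tf._user_specified_name = "x"}
    # and on export that attribute is written back to FunctionDef arg_attr
    # (with '_resource_arg_unique_id' going to resource_arg_unique_id instead).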
PiperOrigin-RevId: 313859301 Change-Id: I3bb37bb63cc4d401d628c08989900524d0db0572 --- .../analysis/side_effect_analysis.cc | 2 +- .../graphdef2mlir/graph-as-function.pbtxt | 2 +- .../graph-function-resource-args.pbtxt | 6 +- .../mlir2graphdef/function-resource-args.mlir | 4 +- .../tests/side-effect-analysis-test.mlir | 6 +- .../tensorflow/tests/tf_saved_model/basic.py | 2 +- .../tests/tf_saved_model/call_to_exported.py | 4 +- .../tensorflow/tests/tf_saved_model/keras.py | 2 +- .../tests/tf_saved_model/structured_input.py | 34 ++++---- .../tensorflow/translate/export_graphdef.cc | 50 +++++++----- .../mlir/tensorflow/translate/import_model.cc | 79 +++++++++---------- 11 files changed, 98 insertions(+), 93 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index ff1620347f7..f7b88317cd4 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -49,7 +49,7 @@ namespace TF { namespace { constexpr int64_t kUnknownResourceId = -1; -constexpr char kResourceArgUniqueIdAttr[] = "tf.resource_arg_unique_id"; +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; // Returns if a VarHandleOp is anonymous, which means it always creates a new // variable. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt index d26585edb03..03640e24aac 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -13,7 +13,7 @@ # CHECK: %[[ISLAND_2:.*]], %[[ISLAND_2_control:.*]] = tf_executor.island wraps "tf.StatefulPartitionedCall" # CHECK-SAME: f = @[[FUNC:[a-z0-9]*]] # CHECK: tf_executor.fetch %[[ISLAND_1]], %[[ISLAND_2]] : tensor<*xf32>, tensor<*xf32> -# CHECK: func @[[FUNC]](%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> +# CHECK: func @[[FUNC]](%arg0: tensor<*xf32> {tf._user_specified_name = "inputs"}, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> node { name: "args_0" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt index 0e6e561225d..eb358d52b26 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt @@ -81,7 +81,7 @@ library { } # Check that the `resource_arg_unique_id` for each argument is propagated to the -# `tf.resource_arg_unique_id` argument attribute of the function +# `tf._resource_arg_unique_id` argument attribute of the function # @test_func_name0. 
# CHECK: func @main @@ -92,8 +92,8 @@ library { # CHECK: tf_executor.fetch # CHECK: return # CHECK: func @test_func_name0 -# CHECK-SAME: tf.resource_arg_unique_id = 0 -# CHECK-SAME: tf.resource_arg_unique_id = 0 +# CHECK-SAME: tf._resource_arg_unique_id = 0 +# CHECK-SAME: tf._resource_arg_unique_id = 0 # CHECK: tf_executor.graph # CHECK: tf_executor.fetch # CHECK: return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir index 680e26f5cbb..44824ea1424 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir @@ -8,14 +8,14 @@ func @main() -> tensor<*x!tf.resource> attributes {tf.entry_function = {inputs = } return %0 : tensor<*x!tf.resource> } -func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf.resource_arg_unique_id = 0 : i64}, %arg1: tensor<*x!tf.resource> {tf.resource_arg_unique_id = 0 : i64}) -> tensor<*x!tf.resource> attributes {tf._disable_call_shape_inference = true} { +func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf._resource_arg_unique_id = 0 : i64}, %arg1: tensor<*x!tf.resource> {tf._resource_arg_unique_id = 0 : i64}) -> tensor<*x!tf.resource> attributes {tf._disable_call_shape_inference = true} { %0 = tf_executor.graph { tf_executor.fetch %arg0 : tensor<*x!tf.resource> } return %0 : tensor<*x!tf.resource> } -// Check that the `tf.resource_arg_unique_id` argument attributes of +// Check that the `tf._resource_arg_unique_id` argument attributes of // test_func_name0 are propagated to the function's arg_attr and // resource_arg_unique_id. diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir index c36771c0576..965b3b10843 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir @@ -786,9 +786,9 @@ func @tf_registry_ops( // CHECK-LABEL: func @arguments_with_unique_ids func @arguments_with_unique_ids( // expected-remark@above {{ID: 9}} - %arg0: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 0 : i64}, - %arg1: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 0 : i64}, - %arg2: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 33 : i64}) { + %arg0: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 0 : i64}, + %arg1: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 0 : i64}, + %arg2: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 33 : i64}) { tf_executor.graph { // expected-remark@above {{ID: 7}} // expected-remark@above {{Successors: {8}}} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py index 78c18a17d4a..b337224e680 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py @@ -45,7 +45,7 @@ class TestModule(tf.Module): # CHECK: "tf_saved_model.global_tensor"() {is_mutable, sym_name = "[[VAR:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = ["v42"], type = tensor, value = dense<4.200000e+01> : tensor} : () -> () # CHECK: "tf_saved_model.global_tensor"() {sym_name = "[[CONST:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = [], type = tensor, value = dense<4.300000e+01> : tensor} : () -> () 
# CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor>> {tf_saved_model.bound_input = @[[VAR]]}, # CHECK-SAME: %arg2: tensor>> {tf_saved_model.bound_input = @[[CONST]]}) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = []}) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py index 658cc37a22f..694942f4b00 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py @@ -45,7 +45,7 @@ class TestModule(tf.Module): # modify signatures interprocedurally). # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, @@ -54,7 +54,7 @@ class TestModule(tf.Module): # CHECK: "tf.StatefulPartitionedCall"{{.*}}f = @[[CALLEE_INTERNAL:[a-zA-Z_0-9]+]] # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py index a95909b61ef..ffb5c024bbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py @@ -39,7 +39,7 @@ class TestModule(tf.Module): super(TestModule, self).__init__() self.model = mnist_model() - # CHECK: func {{@[a-zA-Z_0-9]+}}(%arg0: tensor<1x28x28x1xf32> {tf_saved_model.index_path = [0]} + # CHECK: func {{@[a-zA-Z_0-9]+}}(%arg0: tensor<1x28x28x1xf32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0]} # CHECK: attributes {{.*}} tf_saved_model.exported_names = ["my_predict"] @tf.function(input_signature=[ tf.TensorSpec([1, 28, 28, 1], tf.float32), diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py index 095fddbda96..43591d12183 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py @@ -36,8 +36,8 @@ class TestModule(tf.Module): # The outer layer of the index path indexes into the arguments. 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [1]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "y", tf_saved_model.index_path = [1]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0000_function_arity"] @tf.function(input_signature=[ tf.TensorSpec([1], tf.float32), @@ -49,8 +49,8 @@ class TestModule(tf.Module): # Check index paths for lists. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0, 0]}, - # CHECK-SAME: %arg1: tensor {tf_saved_model.index_path = [0, 1]}) + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "l", tf_saved_model.index_path = [0, 0]}, + # CHECK-SAME: %arg1: tensor {tf._user_specified_name = "l", tf_saved_model.index_path = [0, 1]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0001_list_2_elements"] @tf.function(input_signature=[[ tf.TensorSpec([], tf.float32), @@ -63,8 +63,8 @@ class TestModule(tf.Module): # Keys are linearized in sorted order, matching `tf.nest.flatten`. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0002_dict_2_keys"] @tf.function(input_signature=[{ 'x': tf.TensorSpec([1], tf.float32), @@ -77,8 +77,8 @@ class TestModule(tf.Module): # The index path should be insensitive to the key order. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0003_dict_2_keys_out_of_order"] @tf.function(input_signature=[{ 'y': tf.TensorSpec([2], tf.float32), @@ -90,12 +90,12 @@ class TestModule(tf.Module): # Slightly stronger stress test of multiple dict keys. 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "a"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "b"]}, - # CHECK-SAME: %arg2: tensor<3xf32> {tf_saved_model.index_path = [0, "c"]}, - # CHECK-SAME: %arg3: tensor<4xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg4: tensor<5xf32> {tf_saved_model.index_path = [0, "y"]}, - # CHECK-SAME: %arg5: tensor<6xf32> {tf_saved_model.index_path = [0, "z"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "a"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "b"]}, + # CHECK-SAME: %arg2: tensor<3xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "c"]}, + # CHECK-SAME: %arg3: tensor<4xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg4: tensor<5xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}, + # CHECK-SAME: %arg5: tensor<6xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "z"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0004_dict_many_keys"] @tf.function(input_signature=[{ 'x': tf.TensorSpec([4], tf.float32), @@ -112,9 +112,9 @@ class TestModule(tf.Module): # Note that list elements can have heterogenous types. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x", 0]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "x", 1]}, - # CHECK-SAME: %arg2: tensor<3xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x", 0]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x", 1]}, + # CHECK-SAME: %arg2: tensor<3xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0005_more_complex_recursive_structure"] @tf.function(input_signature=[{ 'x': [tf.TensorSpec([1], tf.float32), diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 2bf55922d4b..8e51f8c9a25 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -70,7 +70,6 @@ using llvm::isa; using mlir::BlockArgument; using mlir::Dialect; using mlir::Operation; -using mlir::OperationState; using mlir::Value; using stream_executor::port::StatusOr; @@ -79,6 +78,9 @@ namespace { constexpr char kInvalidExecutorGraphMsg[] = "Functions must be of a single Graph with single op Islands: "; +constexpr char kDeviceAttr[] = "tf.device"; +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; + bool IsLegalChar(char c, bool first_char) { if (isalpha(c)) return true; if (isdigit(c)) return true; @@ -267,17 +269,14 @@ StatusOr> Exporter::GetArgumentNode( (*node_def->mutable_attr())["index"] = index_attr; if (auto device_attr = - func.getArgAttrOfType(index, "tf.device")) { + func.getArgAttrOfType(index, kDeviceAttr)) *node_def->mutable_device() = device_attr.getValue().str(); - } - if (auto resource_arg_unique_id_attr = - func.getArgAttrOfType( - index, "tf.resource_arg_unique_id")) { - AttrValue unique_id_attr; - unique_id_attr.set_i(resource_arg_unique_id_attr.getInt()); 
- (*node_def->mutable_attr())["_resource_arg_unique_id"] = unique_id_attr; - } + llvm::ArrayRef func_arg_i_attrs = + func.getArgAttrs(index); + absl::flat_hash_set attrs_to_ignore = {kDeviceAttr}; + TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, + node_def->mutable_attr())); return node_def; } @@ -682,14 +681,6 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, if (auto attr = function.getAttrOfType(stateful_string)) { func_def.mutable_signature()->set_is_stateful(true); } - for (int64 i = 0; i < function.getNumArguments(); ++i) { - if (auto resource_arg_unique_id_attr = - function.getArgAttrOfType( - i, "tf.resource_arg_unique_id")) { - (*func_def.mutable_resource_arg_unique_id())[i] = - resource_arg_unique_id_attr.getInt(); - } - } // Ignore the gradient and is_stateful attribute on the function as they have // been handled above. @@ -699,7 +690,28 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, function.getDialectAttrs()); TF_RETURN_IF_ERROR( ConvertAttributes(funcAttrs, attrs_to_ignore, func_def.mutable_attr())); - (*flib->add_function()) = func_def; + + for (int i = 0, e = function.getNumArguments(); i < e; ++i) { + if (auto resource_arg_unique_id_attr = + function.getArgAttrOfType( + i, kResourceArgUniqueIdAttr)) { + (*func_def.mutable_resource_arg_unique_id())[i] = + resource_arg_unique_id_attr.getInt(); + } + + llvm::ArrayRef func_arg_i_attrs = + function.getArgAttrs(i); + if (func_arg_i_attrs.empty()) continue; + absl::flat_hash_set attrs_to_ignore = { + kDeviceAttr, kResourceArgUniqueIdAttr}; + FunctionDef::ArgAttrs func_def_arg_i_attrs; + TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, + func_def_arg_i_attrs.mutable_attr())); + if (func_def_arg_i_attrs.attr().empty()) continue; + (*func_def.mutable_arg_attr())[i] = std::move(func_def_arg_i_attrs); + } + + (*flib->add_function()) = std::move(func_def); return Status::OK(); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index bd63a3b224f..24a1d40a8bb 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -42,6 +42,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project @@ -132,6 +133,13 @@ bool IsOutputShapesAttribute(const AttrValue& attr_value, attr_value.value_case() == AttrValue::kList; } +bool IsResourceOutputShapesAttribute(const AttrValue& attr_value, + llvm::StringRef attr_name) { + if (attr_name == "_handle_dtypes" || attr_name == "_handle_shapes") + return attr_value.value_case() == AttrValue::kList; + return false; +} + // This class is used to generate new MLIR function name strings that are both // unique in the TF function library `flib_` and unique among the name strings // generated by the class object during its lifetime. @@ -195,15 +203,11 @@ class ImporterBase { StatusOr InferLibFunctionType(const FunctionBody& fbody); // Extracts arg and ret nodes from FunctionBody. - // `resource_arg_unique_ids` will be filled with the unique IDs of resource - // variables, as a list of {index, ID} pairs. 
void GetArgsAndRetsFromFunctionBody( const FunctionBody& fbody, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes, - absl::InlinedVector* control_ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids); + absl::InlinedVector* control_ret_nodes); // Prepares converting the graph to an MLIR module. This step removes the // backedges of the graph, orders the nodes and infers the shapes. @@ -217,8 +221,7 @@ class ImporterBase { const absl::InlinedVector& ret_nodes, const absl::InlinedVector& control_ret_nodes, llvm::ArrayRef attrs, - const absl::InlinedVector, 4>& - resource_arg_unique_ids); + bool function_graph); // Finds out the function definition for the given function name from the // graph and converts it to a function of the module. This method is called @@ -1192,9 +1195,7 @@ StatusOr ImporterBase::ConvertAttributeValue( void ImporterBase::GetArgsAndRetsFromFunctionBody( const FunctionBody& fbody, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes, - absl::InlinedVector* control_ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids) { + absl::InlinedVector* control_ret_nodes) { arg_nodes->reserve(fbody.arg_nodes.size()); ret_nodes->reserve(fbody.ret_nodes.size()); for (auto arg : fbody.arg_nodes) { @@ -1203,9 +1204,6 @@ void ImporterBase::GetArgsAndRetsFromFunctionBody( for (auto ret : fbody.ret_nodes) { ret_nodes->emplace_back(ret, 0); } - for (const auto& entry : fbody.fdef.resource_arg_unique_id()) { - resource_arg_unique_ids->push_back(entry); - } *control_ret_nodes = fbody.control_ret_nodes; } @@ -1300,14 +1298,13 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; absl::InlinedVector control_ret_nodes; - absl::InlinedVector, 4> resource_arg_unique_ids; GetArgsAndRetsFromFunctionBody(*fbody, &arg_nodes, &ret_nodes, - &control_ret_nodes, &resource_arg_unique_ids); + &control_ret_nodes); TF_RETURN_IF_ERROR(child_importer.Convert( mlir_func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, llvm::makeArrayRef(attributes.begin(), attributes.end()), - resource_arg_unique_ids)); + /*function_graph=*/true)); return Status::OK(); } @@ -1409,9 +1406,7 @@ Status ImporterBase::Convert( const absl::InlinedVector& arg_nodes, const absl::InlinedVector& ret_nodes, const absl::InlinedVector& control_ret_nodes, - llvm::ArrayRef attrs, - const absl::InlinedVector, 4>& - resource_arg_unique_ids) { + llvm::ArrayRef attrs, bool function_graph) { // TODO(b/122040776): Uses debug info for FunctionDef. auto function = mlir::FuncOp::create(mlir::UnknownLoc::get(context_), func_name, func_type, attrs); @@ -1437,10 +1432,6 @@ Status ImporterBase::Convert( TF_RETURN_IF_ERROR(ConvertFunctionArgAndRets(function, graph, func_type.getInputs(), arg_nodes, ret_nodes, control_ret_nodes)); - for (const auto& entry : resource_arg_unique_ids) { - function.setArgAttr(entry.first, "tf.resource_arg_unique_id", - builder_.getI64IntegerAttr(entry.second)); - } // TODO(jpienaar): Update post removing shape_refinier_. if (!specs_.enable_shape_inference) { @@ -1499,6 +1490,22 @@ Status ImporterBase::ConvertFunctionArgAndRets( i, "tf.device", builder_.getStringAttr(arg_node.node->requested_device())); + if (arg_node.node->IsArg()) { + for (const auto& arg_node_attr : arg_node.node->attrs()) { + const auto& key = arg_node_attr.first; + // Only import attributes starting with an underscore. 
+ if (key.empty() || key[0] != '_') continue; + // Ignore shape inference attributes as shape information is already + // populated in the result type. + if (IsOutputShapesAttribute(arg_node_attr.second, key) || + IsResourceOutputShapesAttribute(arg_node_attr.second, key)) + continue; + TF_ASSIGN_OR_RETURN(auto converted_attr, + ConvertAttributeValue(arg_node_attr.second)); + func.setArgAttr(i, llvm::formatv("tf.{0}", key).str(), converted_attr); + } + } + island->dropAllReferences(); island->erase(); } @@ -2108,14 +2115,10 @@ class GraphDefImporter : public ImporterBase { // output nodes, for function graphs. Arguments and return values are // determined by node op type. Type and shape information of the function are // inferred by the shape refiner in ImporterBase. - // `resource_arg_unique_ids` will be filled with the unique IDs of resource - // variables, as a list of {index, ID} pairs. StatusOr GetArgsRetsAndTypesFromFunctionGraph( mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids); + absl::InlinedVector* ret_nodes); // Finds the graph's target nodes/function's control ret nodes based on // supplied node names in `control_outputs`. If `control_outputs` are not @@ -2143,7 +2146,6 @@ StatusOr GraphDefImporter::Convert( absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; absl::InlinedVector control_ret_nodes; - absl::InlinedVector, 4> resource_arg_unique_ids; llvm::SmallVector attrs; if (specs.graph_as_function) { if (specs.prune_unused_nodes || !specs.inputs.empty() || @@ -2152,10 +2154,9 @@ StatusOr GraphDefImporter::Convert( "Pruning of graph is currently unsupported when the main graph is " "converted to a function."); - TF_ASSIGN_OR_RETURN( - func_type, - importer.GetArgsRetsAndTypesFromFunctionGraph( - context, &arg_nodes, &ret_nodes, &resource_arg_unique_ids)); + TF_ASSIGN_OR_RETURN(func_type, + importer.GetArgsRetsAndTypesFromFunctionGraph( + context, &arg_nodes, &ret_nodes)); TF_RETURN_IF_ERROR(importer.GetControlRetsFromGraph(specs.control_outputs, &control_ret_nodes)); @@ -2223,7 +2224,7 @@ StatusOr GraphDefImporter::Convert( TF_RETURN_IF_ERROR(importer.ImporterBase::Convert( func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, attrs, - resource_arg_unique_ids)); + specs.graph_as_function)); return module; } @@ -2340,9 +2341,7 @@ StatusOr GraphDefImporter::InferMainFunctionType( StatusOr GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids) { + absl::InlinedVector* ret_nodes) { auto add_node = [](Node* node, absl::InlinedVector* nodes) { auto* attr = node->attrs().Find("index"); if (!attr) @@ -2383,12 +2382,6 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( TF_ASSIGN_OR_RETURN(auto type, InferOutputType(*arg_node.node, /*idx=*/0, builder)); arg_types.push_back(type); - tensorflow::int64 resource_arg_unique_id; - if (TryGetNodeAttr(arg_node.node->attrs(), "_resource_arg_unique_id", - &resource_arg_unique_id)) { - resource_arg_unique_ids->emplace_back(arg_node_and_idx.index(), - resource_arg_unique_id); - } } llvm::SmallVector ret_types; From 85396efcd31fd77fe264f9088d7bd8c92abc7b60 Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Fri, 29 May 2020 15:25:59 -0700 Subject: [PATCH 1418/1533] Make tf.If work with ConcreteFunction. 
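This teaches the `If` wrapper in functional_ops.py to accept branch functions
created with `tf.function` (passed as concrete functions) in addition to
`Defun`-created `_DefinedFunction`s; for a concrete function the output dtypes
are read from `output_dtypes` rather than from the `Defun` signature. A minimal
usage sketch, mirroring the new test added below (`functional_ops` is an
internal module, shown here only for illustration):

    import tensorflow as tf
    from tensorflow.python.ops import functional_ops

    @tf.function(input_signature=[tf.TensorSpec((), tf.float32)])
    def then_branch(x):
      return x + 1.

    @tf.function(input_signature=[tf.TensorSpec((), tf.float32)])
    def else_branch(x):
      return x - 1.

    # cond=False selects the else branch, so this evaluates to [9.0].
    result = functional_ops.If(False, [10.],
                               then_branch.get_concrete_function(),
                               else_branch.get_concrete_function())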
PiperOrigin-RevId: 313860966 Change-Id: I1fccdaf06802511a7020a4045751cdd6b6821687 --- tensorflow/python/BUILD | 15 +++++ tensorflow/python/ops/functional_ops.py | 11 ++-- tensorflow/python/ops/functional_ops_test.py | 69 ++++++++++++++++++++ 3 files changed, 90 insertions(+), 5 deletions(-) create mode 100644 tensorflow/python/ops/functional_ops_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index d93c2314954..246dc34eed8 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -2214,6 +2214,21 @@ py_library( ], ) +py_test( + name = "ops/functional_ops_test", + srcs = ["ops/functional_ops_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":client_testlib", + ":dtypes", + ":function", + ":functional_ops", + ":tensor_spec", + "//tensorflow/python/eager:def_function", + ], +) + cuda_py_test( name = "function_test", size = "medium", diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index 37b41a55eb9..6e285d6681d 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -838,12 +838,13 @@ def If(cond, inputs, then_branch, else_branch, name=None): or else_branch(inputs). """ # pylint: disable=protected-access + if isinstance(then_branch, function._DefinedFunction): + tlist = [_.type for _ in then_branch.definition.signature.output_arg] + else: + # We assume that `then_branch` is a ConcreteFunction here. + tlist = nest.flatten(then_branch.output_dtypes) return gen_functional_ops._if( - cond, - inputs, [_.type for _ in then_branch.definition.signature.output_arg], - then_branch, - else_branch, - name=name) + cond, inputs, tlist, then_branch, else_branch, name=name) def Gradient(inputs, f, name=None): diff --git a/tensorflow/python/ops/functional_ops_test.py b/tensorflow/python/ops/functional_ops_test.py new file mode 100644 index 00000000000..7e3bc631c44 --- /dev/null +++ b/tensorflow/python/ops/functional_ops_test.py @@ -0,0 +1,69 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for functional operations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.eager import def_function +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import function +from tensorflow.python.framework import tensor_spec +from tensorflow.python.framework import test_util +from tensorflow.python.ops import functional_ops +from tensorflow.python.platform import test + + +class FunctionalOpsTest(test.TestCase): + + @test_util.deprecated_graph_mode_only + def testIfWithDefun(self): + + @function.Defun(dtypes.float32) + def Then(x): + return x + 1 + + @function.Defun(dtypes.float32) + def Else(x): + return x - 1 + + with self.cached_session(): + inputs = [10.] 
+ result = self.evaluate(functional_ops.If(False, inputs, Then, Else)) + self.assertEqual([9.0], result) + + def testIfWithFunction(self): + + @def_function.function( + input_signature=[tensor_spec.TensorSpec((), dtypes.float32)]) + def Then(x): + return x + 1 + + @def_function.function( + input_signature=[tensor_spec.TensorSpec((), dtypes.float32)]) + def Else(x): + return x - 1 + + with self.cached_session(): + inputs = [10.] + result = self.evaluate( + functional_ops.If(False, inputs, Then.get_concrete_function(), + Else.get_concrete_function())) + self.assertEqual([9.0], result) + + +if __name__ == '__main__': + test.main() From 66529c35a76cc775605e904ab21b1663b0e7db8e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 15:31:08 -0700 Subject: [PATCH 1419/1533] Add timeout to collective ops to detect deadlocks. The timeout is set as an argument to a collective op. When non zero value, a completion timeout is set to detect staleness. If a timeout goes off, the execution is aborted through a DEADLINE_EXCEEDED error. PiperOrigin-RevId: 313861868 Change-Id: I7fee45736608ad7fbcc9dd980db2fd302c9cb4df --- .../base_collective_executor.cc | 60 ++++++++++-- tensorflow/core/framework/collective.h | 2 + tensorflow/core/kernels/collective_ops.cc | 15 ++- tensorflow/core/ops/collective_ops.cc | 4 + tensorflow/python/ops/collective_ops.py | 62 +++++++++--- tensorflow/python/ops/collective_ops_test.py | 94 +++++++++++++++++-- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 8 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 8 +- 8 files changed, 217 insertions(+), 36 deletions(-) diff --git a/tensorflow/core/common_runtime/base_collective_executor.cc b/tensorflow/core/common_runtime/base_collective_executor.cc index 7a614a8d224..12e30da2773 100644 --- a/tensorflow/core/common_runtime/base_collective_executor.cc +++ b/tensorflow/core/common_runtime/base_collective_executor.cc @@ -221,23 +221,42 @@ void BaseCollectiveExecutor::ExecuteAsync(OpKernelContext* ctx, const CollectiveParams& col_params, const string& exec_key, StatusCallback done) { + const auto is_callback_called = std::make_shared>(false); + // On any individual collective Op failure we need to abort the // BufRendezvous so that other Ops in the instance don't hang // waiting for transmissions that will never happen. Do so after a // delay so that the original error status is more likely to // propagate up, and peers are unlikely to re-create the purged // BufRendezvous by late-arriving requests. - StatusCallback done_safe = [this, done](const Status& s) { - if (!s.ok()) { - Ref(); // Ensure this lasts until the closure executes. - SchedNonBlockingClosureAfter(1000000, [this, s] { - remote_access_->buf_rendezvous()->StartAbort(s); - Unref(); - }); + StatusCallback done_safe = [this, done, is_callback_called](const Status& s) { + auto should_call_callback = !is_callback_called->exchange(true); + if (should_call_callback) { + if (!s.ok()) { + Ref(); // Ensure this lasts until the closure executes. + SchedNonBlockingClosureAfter(1000000, [this, s] { + remote_access_->buf_rendezvous()->StartAbort(s); + Unref(); + }); + } + done(s); } - done(s); }; + auto timeout_microseconds = static_cast( + col_params.instance.impl_details.timeout_seconds * 1'000'000); + if (timeout_microseconds > 0) { + // TODO(xldrx): Share the timeout watchdog thread among collectives. 
+ SchedNonBlockingClosureAfter( + timeout_microseconds, [is_callback_called, done_safe] { + if (!is_callback_called->load()) { + auto status = Status(error::DEADLINE_EXCEEDED, + "Collective has timed out during execution."); + done_safe(status); + } + }); + } + Tensor* output = ctx->mutable_output(0); const Tensor* input = (col_params.instance.type == REDUCTION_COLLECTIVE || col_params.instance.type == GATHER_COLLECTIVE || @@ -284,7 +303,30 @@ void BaseCollectiveExecutor::CompleteParamsAsync( const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr, StatusCallback done) { cp->instance.gpu_ring_order = *gpu_ring_order_; - cem_->GetParamResolver()->CompleteParamsAsync(device, cp, cancel_mgr, done); + const auto is_callback_called = std::make_shared>(false); + auto done_with_timeout = done; + auto timeout_microseconds = + static_cast(cp->instance.impl_details.timeout_seconds * 1'000'000); + if (timeout_microseconds > 0) { + // TODO(xldrx): Share the timeout watchdog thread among collectives. + SchedNonBlockingClosureAfter( + timeout_microseconds, [is_callback_called, done] { + if (!is_callback_called->load()) { + auto status = + Status(error::DEADLINE_EXCEEDED, + "Collective has timed out waiting for other workers."); + done(status); + } + }); + done_with_timeout = [is_callback_called, done](const Status& s) { + auto should_call_callback = !is_callback_called->exchange(true); + if (should_call_callback) { + done(s); + } + }; + } + cem_->GetParamResolver()->CompleteParamsAsync(device, cp, cancel_mgr, + done_with_timeout); } Status BaseCollectiveExecutor::CreateCollective( diff --git a/tensorflow/core/framework/collective.h b/tensorflow/core/framework/collective.h index 50f425eeaca..3726fde9809 100644 --- a/tensorflow/core/framework/collective.h +++ b/tensorflow/core/framework/collective.h @@ -84,6 +84,8 @@ struct CollImplDetails { dependencies; // collective instances on which this node depends string communication_hint; // user-supplied hint for implementation choice, // e.g. ring or nccl + float timeout_seconds; // If non zero, set a completion timeout for the + // collective op to detect staleness. }; // Data common to all members of a collective instance. 
diff --git a/tensorflow/core/kernels/collective_ops.cc b/tensorflow/core/kernels/collective_ops.cc index b457f7e29d4..4951d0895c6 100644 --- a/tensorflow/core/kernels/collective_ops.cc +++ b/tensorflow/core/kernels/collective_ops.cc @@ -85,6 +85,9 @@ class CollectiveGatherOpKernel : public CollectiveOpKernel { OP_REQUIRES_OK( c, c->GetAttr("communication_hint", &col_params_.instance.impl_details.communication_hint)); + OP_REQUIRES_OK( + c, c->GetAttr("timeout_seconds", + &col_params_.instance.impl_details.timeout_seconds)); const NodeDef& real_node = c->def(); col_params_.name = strings::StrCat(real_node.name(), ": Gather"); col_params_.group.device_type = c->device_type(); @@ -176,10 +179,14 @@ class CollectiveReduceOpKernel : public CollectiveOpKernel { OP_REQUIRES_OK( c, c->GetAttr("communication_hint", &col_params_.instance.impl_details.communication_hint)); + OP_REQUIRES_OK( + c, c->GetAttr("timeout_seconds", + &col_params_.instance.impl_details.timeout_seconds)); VLOG(2) << "CollectiveReduce instance " << col_params_.instance.instance_key << " merge_op " << merge_op_name << " final_op " << final_op_name << " communication_hint " - << col_params_.instance.impl_details.communication_hint; + << col_params_.instance.impl_details.communication_hint + << " timeout " << col_params_.instance.impl_details.timeout_seconds; const NodeDef& real_node = c->def(); col_params_.name = strings::StrCat(real_node.name(), ": Reduce(", @@ -284,6 +291,9 @@ class CollectiveBcastSendOpKernel : public CollectiveOpKernel { OP_REQUIRES_OK( c, c->GetAttr("communication_hint", &col_params_.instance.impl_details.communication_hint)); + OP_REQUIRES_OK( + c, c->GetAttr("timeout_seconds", + &col_params_.instance.impl_details.timeout_seconds)); col_params_.is_source = true; col_params_.instance.impl_details.subdiv_offsets = {0}; @@ -363,6 +373,9 @@ class CollectiveBcastRecvOpKernel : public CollectiveOpKernel { OP_REQUIRES_OK( c, c->GetAttr("communication_hint", &col_params_.instance.impl_details.communication_hint)); + OP_REQUIRES_OK( + c, c->GetAttr("timeout_seconds", + &col_params_.instance.impl_details.timeout_seconds)); col_params_.is_source = false; col_params_.instance.impl_details.subdiv_offsets = {0}; diff --git a/tensorflow/core/ops/collective_ops.cc b/tensorflow/core/ops/collective_ops.cc index 492c7488b2c..23d09ff61ec 100644 --- a/tensorflow/core/ops/collective_ops.cc +++ b/tensorflow/core/ops/collective_ops.cc @@ -31,6 +31,7 @@ REGISTER_OP("CollectiveReduce") .Attr("subdiv_offsets: list(int)") .Attr("wait_for: list(int) = []") .Attr("communication_hint: string = 'auto'") + .Attr("timeout_seconds: float = 0") .SetIsStateful() .SetShapeFn(shape_inference::UnchangedShape); @@ -43,6 +44,7 @@ REGISTER_OP("CollectiveGather") .Attr("instance_key: int") .Attr("shape: shape") .Attr("communication_hint: string = 'auto'") + .Attr("timeout_seconds: float = 0") .SetIsStateful() .SetShapeFn([](shape_inference::InferenceContext* c) { // Scalar input is not supported. 
@@ -86,6 +88,7 @@ REGISTER_OP("CollectiveBcastSend") .Attr("instance_key: int") .Attr("shape: shape") .Attr("communication_hint: string = 'auto'") + .Attr("timeout_seconds: float = 0") .SetIsStateful() .SetShapeFn(shape_inference::ExplicitShape); @@ -97,6 +100,7 @@ REGISTER_OP("CollectiveBcastRecv") .Attr("instance_key: int") .Attr("shape: shape") .Attr("communication_hint: string = 'auto'") + .Attr("timeout_seconds: float = 0") .SetIsStateful() .SetShapeFn(shape_inference::ExplicitShape); diff --git a/tensorflow/python/ops/collective_ops.py b/tensorflow/python/ops/collective_ops.py index f34d6631783..7ae2fec262e 100644 --- a/tensorflow/python/ops/collective_ops.py +++ b/tensorflow/python/ops/collective_ops.py @@ -20,8 +20,15 @@ from __future__ import print_function from tensorflow.python.ops import gen_collective_ops -def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op, - subdiv_offsets=(0,), communication_hint='auto'): +def all_reduce(t, + group_size, + group_key, + instance_key, + merge_op, + final_op, + subdiv_offsets=(0,), + communication_hint='auto', + timeout=0): """Reduces tensors collectively, across devices. Args: @@ -40,6 +47,9 @@ def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op, communication_hint: preferred collective communication. The implementation may fall back to another mechanism. Options include `auto`, `ring`, and `nccl`. + timeout: If set to a non zero, set a completion timeout to detect staleness. + If the timer goes off, a DeadlineExceededError is raised. + The timeout value in seconds. This feature is experimental. Returns: An Op implementing the distributed reduction. @@ -57,11 +67,16 @@ def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op, merge_op=merge_op, final_op=final_op, subdiv_offsets=subdiv_offsets, - communication_hint=communication_hint.lower()) + communication_hint=communication_hint.lower(), + timeout_seconds=timeout) -def all_gather(t, group_size, group_key, instance_key, - communication_hint='auto'): +def all_gather(t, + group_size, + group_key, + instance_key, + communication_hint='auto', + timeout=0): """Accumulates tensors collectively, across devices, along first dimension. Args: @@ -73,6 +88,9 @@ def all_gather(t, group_size, group_key, instance_key, communication_hint: preferred collective communication. The implementation may fall back to another mechanism. Options include `auto`, `ring`, and `nccl`. + timeout: If set to a non zero, set a completion timeout to detect staleness. + If the timer goes off, a DeadlineExceededError is raised. + The timeout value in seconds. This feature is experimental. Returns: An Op implementing the distributed operation. @@ -88,11 +106,18 @@ def all_gather(t, group_size, group_key, instance_key, group_size=group_size, group_key=group_key, instance_key=instance_key, - communication_hint=communication_hint.lower()) + communication_hint=communication_hint.lower(), + timeout_seconds=timeout) -def broadcast_send(t, shape, dtype, group_size, group_key, instance_key, - communication_hint='auto'): +def broadcast_send(t, + shape, + dtype, + group_size, + group_key, + instance_key, + communication_hint='auto', + timeout=0): """Broadcasts one tensor to a group of others, across devices. Args: @@ -107,6 +132,9 @@ def broadcast_send(t, shape, dtype, group_size, group_key, instance_key, communication_hint: preferred collective communication. The implementation may fall back to another mechanism. Options include `auto`, `ring`, and `nccl`. 
+ timeout: If set to a non zero, set a completion timeout to detect staleness. + If the timer goes off, a DeadlineExceededError is raised. + The timeout value in seconds. This feature is experimental. Returns: An Op implementing the distributed broadcast send. @@ -139,11 +167,17 @@ def broadcast_send(t, shape, dtype, group_size, group_key, instance_key, group_size=group_size, group_key=group_key, instance_key=instance_key, - communication_hint=communication_hint.lower()) + communication_hint=communication_hint.lower(), + timeout_seconds=timeout) -def broadcast_recv(shape, dtype, group_size, group_key, instance_key, - communication_hint='auto'): +def broadcast_recv(shape, + dtype, + group_size, + group_key, + instance_key, + communication_hint='auto', + timeout=0): """Receives a broadcasts tensor, across devices. Args: @@ -157,6 +191,9 @@ def broadcast_recv(shape, dtype, group_size, group_key, instance_key, communication_hint: preferred collective communication. The implementation may fall back to another mechanism. Options include `auto`, `ring`, and `nccl`. + timeout: If set to a non zero, set a completion timeout to detect staleness. + If the timer goes off, a DeadlineExceededError is raised. + The timeout value in seconds. This feature is experimental. Returns: An Op implementing the broadcast receive. @@ -173,4 +210,5 @@ def broadcast_recv(shape, dtype, group_size, group_key, instance_key, group_size=group_size, group_key=group_key, instance_key=instance_key, - communication_hint=communication_hint.lower()) + communication_hint=communication_hint.lower(), + timeout_seconds=timeout) diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py index b9b533147fc..47c25fcafc0 100644 --- a/tensorflow/python/ops/collective_ops_test.py +++ b/tensorflow/python/ops/collective_ops_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import time + from tensorflow.core.protobuf import config_pb2 from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.eager import context @@ -40,11 +42,21 @@ from tensorflow.python.platform import tf_logging as logging class CollectiveOpTest(test.TestCase): - def _testCollectiveReduce(self, inputs, expected, set_graph_key, - communication_hint='auto', fp16=False, - instance_key=1, merge_op='Add', final_op='Div'): + def _testCollectiveReduce(self, + inputs, + expected, + set_graph_key, + communication_hint='auto', + fp16=False, + instance_key=1, + merge_op='Add', + final_op='Div', + timeout=0, + reported_group_size=None): group_key = 1 group_size = len(inputs) + if reported_group_size is None: + reported_group_size = group_size device_type = 'CPU' config = config_pb2.ConfigProto(device_count={device_type: group_size}) devices = ['/{}:{}'.format(device_type, i) for i in range(group_size)] @@ -55,9 +67,16 @@ class CollectiveOpTest(test.TestCase): with ops.device(devices[i]): tensor = constant_op.constant(inputs[i], dtype=( dtypes.float16 if fp16 else dtypes.float32)) - colred.append(collective_ops.all_reduce( - tensor, group_size, group_key, instance_key, merge_op, final_op, - communication_hint=communication_hint)) + colred.append( + collective_ops.all_reduce( + tensor, + reported_group_size, + group_key, + instance_key, + merge_op, + final_op, + communication_hint=communication_hint, + timeout=timeout)) run_options = config_pb2.RunOptions() if set_graph_key: run_options.experimental.collective_graph_key = 1 @@ -117,6 
+136,69 @@ class CollectiveOpTest(test.TestCase): [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3], [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2]) + @test_util.run_deprecated_v1 + def testCollectiveTimeoutV1(self): + timeout = 4.5 + kwargs = dict( + inputs=[[i + j + 0.1 for i in range(8)] for j in range(3)], + expected=[1 + i + 0.1 for i in range(8)], + set_graph_key=True, + timeout=timeout) + + self._testCollectiveReduce(**kwargs) + + start_time = time.time() + with self.assertRaisesRegex( + errors.DeadlineExceededError, + 'Collective has timed out waiting for other workers'): + self._testCollectiveReduce( + reported_group_size=len(kwargs['inputs']) + 1, **kwargs) + elapsed = time.time() - start_time + self.assertAllGreaterEqual(elapsed, timeout) + + @test_util.run_v2_only + def testCollectiveTimeoutV2(self): + context._reset_context() + timeout = 4.5 + cpus = config.list_physical_devices('CPU') + self.assertEqual(len(cpus), 1) + config.set_logical_device_configuration(cpus[0], [ + context.LogicalDeviceConfiguration(), + context.LogicalDeviceConfiguration() + ]) + context.ensure_initialized() + + @def_function.function + def run_all_reduce(group_size, reported_group_size=None): + group_key = 20 + instance_key = 30 + tensor = [1, 2, 3, 4] + results = [] + if reported_group_size is None: + reported_group_size = group_size + for i in range(group_size): + with ops.device('/CPU:{}'.format(i)): + input_data = constant_op.constant(tensor) + collective_op = collective_ops.all_reduce( + input_data, + group_size=reported_group_size, + group_key=group_key, + instance_key=instance_key, + merge_op='Add', + final_op='Id', + timeout=timeout) + results.append(collective_op) + return results + + run_all_reduce(2, 2) + + start_time = time.time() + with self.assertRaisesRegex(errors.DeadlineExceededError, + 'Collective has timed out during execution'): + run_all_reduce(1, 2) + elapsed = time.time() - start_time + self.assertAllGreaterEqual(elapsed, timeout) + @test_util.run_deprecated_v1 def testNcclHintFallbackToRingReduce(self): """Tests that setting `communication_hint=nccl` works on non-GPU builds.""" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 25ae132c775..19bf27e1cde 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -702,15 +702,15 @@ tf_module { } member_method { name: "CollectiveBcastRecv" - argspec: "args=[\'T\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], " + argspec: "args=[\'T\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'timeout_seconds\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'0\', \'None\'], " } member_method { name: "CollectiveBcastSend" - argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], " + argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'timeout_seconds\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'0\', \'None\'], " } member_method { name: "CollectiveGather" - argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], " + 
argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'timeout_seconds\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'0\', \'None\'], " } member_method { name: "CollectivePermute" @@ -718,7 +718,7 @@ tf_module { } member_method { name: "CollectiveReduce" - argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\', \'communication_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'auto\', \'None\'], " + argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\', \'communication_hint\', \'timeout_seconds\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'auto\', \'0\', \'None\'], " } member_method { name: "CombinedNonMaxSuppression" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 25ae132c775..19bf27e1cde 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -702,15 +702,15 @@ tf_module { } member_method { name: "CollectiveBcastRecv" - argspec: "args=[\'T\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], " + argspec: "args=[\'T\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'timeout_seconds\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'0\', \'None\'], " } member_method { name: "CollectiveBcastSend" - argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], " + argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'timeout_seconds\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'0\', \'None\'], " } member_method { name: "CollectiveGather" - argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'None\'], " + argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'shape\', \'communication_hint\', \'timeout_seconds\', \'name\'], varargs=None, keywords=None, defaults=[\'auto\', \'0\', \'None\'], " } member_method { name: "CollectivePermute" @@ -718,7 +718,7 @@ tf_module { } member_method { name: "CollectiveReduce" - argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\', \'communication_hint\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'auto\', \'None\'], " + argspec: "args=[\'input\', \'group_size\', \'group_key\', \'instance_key\', \'merge_op\', \'final_op\', \'subdiv_offsets\', \'wait_for\', \'communication_hint\', \'timeout_seconds\', \'name\'], varargs=None, keywords=None, defaults=[\'[]\', \'auto\', \'0\', \'None\'], " } member_method { name: "CombinedNonMaxSuppression" From 80671523fcb8f9c377b2ab893bb7dcc7a4446386 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 29 May 2020 15:34:43 -0700 Subject: [PATCH 1420/1533] [tf.data] Remove misleading documentation. 
PiperOrigin-RevId: 313862461 Change-Id: I19720b5a90c251f45ab5bc4d90028481b8964f20 --- tensorflow/python/data/ops/dataset_ops.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index bac69341a79..929fda8f8fd 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -396,8 +396,7 @@ class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): def __iter__(self): """Creates an `Iterator` for enumerating the elements of this dataset. - The returned iterator implements the Python iterator protocol and therefore - can only be used in eager mode. + The returned iterator implements the Python iterator protocol. Returns: An `Iterator` over the elements of this dataset. From 02dc6f8dce33e6320b0041379c27e0c4fbf80554 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 29 May 2020 15:57:38 -0700 Subject: [PATCH 1421/1533] Bump the ruy repository reference. PiperOrigin-RevId: 313866050 Change-Id: I6a3c97d6f4e74c6078eb3bcc1607e51fc1f4d784 --- .../lite/micro/tools/make/third_party_downloads.inc | 4 ++-- tensorflow/lite/tools/make/download_dependencies.sh | 4 ++-- tensorflow/tools/pip_package/BUILD | 2 -- third_party/ruy/workspace.bzl | 8 ++++---- 4 files changed, 8 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/micro/tools/make/third_party_downloads.inc index ef3176f2617..1543abc63f8 100644 --- a/tensorflow/lite/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/micro/tools/make/third_party_downloads.inc @@ -56,8 +56,8 @@ SIFIVE_FE310_LIB_MD5 := "06ee24c4956f8e21670ab3395861fe64" KISSFFT_URL="https://github.com/mborgerding/kissfft/archive/v130.zip" KISSFFT_MD5="438ba1fef5783cc5f5f201395cc477ca" -RUY_URL="https://github.com/google/ruy/archive/51b518e755dd3da37a79d16972b76d3baedac22d.zip" -RUY_MD5="e9c9870741554afcc7a20c360254b31c" +RUY_URL="https://github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip" +RUY_MD5="2d54f058f8f7120dfc1ecee79dbf259e" CIFAR10_DATASET_URL="https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz" CIFAR10_DATASET_MD5="c32a1d4ab5d03f1284b67883e8d87530" diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh index 293eb762938..a7840f6dcd0 100755 --- a/tensorflow/lite/tools/make/download_dependencies.sh +++ b/tensorflow/lite/tools/make/download_dependencies.sh @@ -37,8 +37,8 @@ EIGEN_URL="$(grep -o 'https.*gitlab.com/libeigen/eigen/-/archive/.*tar\.gz' "${B EIGEN_SHA="$(eval echo $(grep '# SHARED_EIGEN_SHA' "${BZL_FILE_PATH}" | grep -o '\".*\"'))" GEMMLOWP_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GEMMLOWP_SHA="$(eval echo $(grep '# SHARED_GEMMLOWP_SHA' "${BZL_FILE_PATH}" | grep -o '\".*\"'))" -RUY_URL="https://github.com/google/ruy/archive/51b518e755dd3da37a79d16972b76d3baedac22d.zip" -RUY_SHA="d5e703913c9e8f0196d83cc4113ecaae4bcae52181f05836890f16aad402fea4" +RUY_URL="https://github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip" +RUY_SHA="b21524de00c63b3d5683b42557f78452e791cf77fddb2e63f9bcba1f7bd99093" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" GOOGLETEST_SHA="58a6f4277ca2bc8565222b3bbd58a177609e9c488e8a72649359ba51450db7d8" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' 
"${BZL_FILE_PATH}" | head -n1)" diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 36e20408c53..43bc04a1b60 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -214,8 +214,6 @@ filegroup( "@sobol_data//:LICENSE", "@termcolor_archive//:COPYING.txt", "@zlib//:zlib.h", - "@clog//:LICENSE", - "@cpuinfo//:LICENSE", ] + select({ "//tensorflow:android": [], "//tensorflow:ios": [], diff --git a/third_party/ruy/workspace.bzl b/third_party/ruy/workspace.bzl index 5d653e47d06..c4ed692df4d 100644 --- a/third_party/ruy/workspace.bzl +++ b/third_party/ruy/workspace.bzl @@ -5,11 +5,11 @@ load("//third_party:repo.bzl", "third_party_http_archive") def repo(): third_party_http_archive( name = "ruy", - sha256 = "d5e703913c9e8f0196d83cc4113ecaae4bcae52181f05836890f16aad402fea4", - strip_prefix = "ruy-51b518e755dd3da37a79d16972b76d3baedac22d", + sha256 = "b21524de00c63b3d5683b42557f78452e791cf77fddb2e63f9bcba1f7bd99093", + strip_prefix = "ruy-1b313682ef8b8fc8ed08719c610d1c3503b016bf", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/ruy/archive/51b518e755dd3da37a79d16972b76d3baedac22d.zip", - "https://github.com/google/ruy/archive/51b518e755dd3da37a79d16972b76d3baedac22d.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip", + "https://github.com/google/ruy/archive/1b313682ef8b8fc8ed08719c610d1c3503b016bf.zip", ], build_file = "//third_party/ruy:BUILD", ) From 8be4d61574f29568c8699708d88945b441bfd317 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 29 May 2020 16:08:52 -0700 Subject: [PATCH 1422/1533] [tf.data] Explicitly colocate prefetch dataset op with its input as this collocation only happens automatically in graph mode. PiperOrigin-RevId: 313867950 Change-Id: I88962b96f208b6d9019e0a117715f74efc8fdc67 --- .../kernel_tests/prefetch_to_device_test.py | 12 ++++++++++++ tensorflow/python/data/ops/dataset_ops.py | 16 ++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py index 1641243edcc..611fbab4b8b 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py @@ -201,6 +201,18 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) + @combinations.generate(test_base.eager_only_combinations()) + def testPrefetchToDevicePlacement(self): + if not test_util.is_gpu_available(): + self.skipTest("No GPU available") + + host_dataset = dataset_ops.Dataset.range(10) + device_dataset = host_dataset.apply( + prefetching_ops.prefetch_to_device("/gpu:0")) + + self.assertEqual(device_dataset._variant_tensor.device, + "/job:localhost/replica:0/task:0/device:GPU:0") + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 929fda8f8fd..cf69915ecc0 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -4347,14 +4347,18 @@ class PrefetchDataset(UnaryUnchangedStructureDataset): """ self._input_dataset = input_dataset if buffer_size is None: - buffer_size = -1 # This is the sentinel for auto-tuning. 
+ buffer_size = AUTOTUNE self._buffer_size = ops.convert_to_tensor( buffer_size, dtype=dtypes.int64, name="buffer_size") - variant_tensor = gen_dataset_ops.prefetch_dataset( - input_dataset._variant_tensor, # pylint: disable=protected-access - buffer_size=self._buffer_size, - slack_period=slack_period, - **self._flat_structure) + # pylint: disable=protected-access + # We colocate the prefetch dataset with its input as this collocation only + # happens automatically in graph mode. + with ops.device(input_dataset._variant_tensor.device): + variant_tensor = gen_dataset_ops.prefetch_dataset( + input_dataset._variant_tensor, + buffer_size=self._buffer_size, + slack_period=slack_period, + **self._flat_structure) super(PrefetchDataset, self).__init__(input_dataset, variant_tensor) From d99affc72b541e0eaef6ab0c0d433c695163eec1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 16:43:26 -0700 Subject: [PATCH 1423/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 313873341 Change-Id: I4ac3ffcf5fc5ed5b1444fc92b2d87988724c310e --- tensorflow/go/op/wrappers.go | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 57924f08f83..b5a8308e1a0 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -19606,6 +19606,14 @@ func CollectiveBcastSendCommunicationHint(value string) CollectiveBcastSendAttr } } +// CollectiveBcastSendTimeoutSeconds sets the optional timeout_seconds attribute to value. +// If not specified, defaults to 0 +func CollectiveBcastSendTimeoutSeconds(value float32) CollectiveBcastSendAttr { + return func(m optionalAttr) { + m["timeout_seconds"] = value + } +} + // Broadcasts a tensor value to one or more other devices. func CollectiveBcastSend(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape, optional ...CollectiveBcastSendAttr) (data tf.Output) { if scope.Err() != nil { @@ -32032,6 +32040,14 @@ func CollectiveBcastRecvCommunicationHint(value string) CollectiveBcastRecvAttr } } +// CollectiveBcastRecvTimeoutSeconds sets the optional timeout_seconds attribute to value. +// If not specified, defaults to 0 +func CollectiveBcastRecvTimeoutSeconds(value float32) CollectiveBcastRecvAttr { + return func(m optionalAttr) { + m["timeout_seconds"] = value + } +} + // Receives a tensor value broadcast from another device. func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape, optional ...CollectiveBcastRecvAttr) (data tf.Output) { if scope.Err() != nil { @@ -38224,6 +38240,14 @@ func CollectiveGatherCommunicationHint(value string) CollectiveGatherAttr { } } +// CollectiveGatherTimeoutSeconds sets the optional timeout_seconds attribute to value. +// If not specified, defaults to 0 +func CollectiveGatherTimeoutSeconds(value float32) CollectiveGatherAttr { + return func(m optionalAttr) { + m["timeout_seconds"] = value + } +} + // Mutually accumulates multiple tensors of identical type and shape. func CollectiveGather(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, shape tf.Shape, optional ...CollectiveGatherAttr) (data tf.Output) { if scope.Err() != nil { @@ -42948,6 +42972,14 @@ func CollectiveReduceCommunicationHint(value string) CollectiveReduceAttr { } } +// CollectiveReduceTimeoutSeconds sets the optional timeout_seconds attribute to value. 
+// If not specified, defaults to 0 +func CollectiveReduceTimeoutSeconds(value float32) CollectiveReduceAttr { + return func(m optionalAttr) { + m["timeout_seconds"] = value + } +} + // Mutually reduces multiple tensors of identical type and shape. func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64, optional ...CollectiveReduceAttr) (data tf.Output) { if scope.Err() != nil { From bacd18849e106231deb0f2a8eec967a312320dc7 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 29 May 2020 16:44:16 -0700 Subject: [PATCH 1424/1533] Restore `with_select_tf_ops` build flag. This remains useful for testing and development. Restore the ability to inject support for TF ops in TFLite using `--define=with_select_tf_ops=true`. See also issue #34277. PiperOrigin-RevId: 313873470 Change-Id: I6b68cd863efc17f5ae0667c0d2c9d68958d6e4ad --- tensorflow/lite/BUILD | 9 +++++++++ tensorflow/lite/build_def.bzl | 24 ++++++++++++++++++++++- tensorflow/lite/g3doc/guide/ops_select.md | 7 +++++-- 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 810f3ab1a2b..19d45423988 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -56,6 +56,15 @@ config_setting( visibility = ["//visibility:public"], ) +# Enables inclusion of select TensorFlow kernels via the TFLite Flex delegate +# when building TFLite shared libraries. +# WARNING: This build flag is experimental and subject to change. +config_setting( + name = "with_select_tf_ops", + define_values = {"with_select_tf_ops": "true"}, + visibility = ["//visibility:public"], +) + TFLITE_DEFAULT_COPTS = if_not_windows([ "-Wall", "-Wno-comment", diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index fd51ad0a4aa..9850cbedd9a 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -110,6 +110,26 @@ def tflite_jni_linkopts(): """Defines linker flags to reduce size of TFLite binary with JNI.""" return tflite_jni_linkopts_unstripped() + tflite_symbol_opts() +def maybe_flex_deps(deps): + """Returns necessary flex deps when with_select_tf_ops build flag is used + + Args: + deps: The source deps for the target (to avoid deps duplication). + Returns: + A list of additional flex deps required, based on the build flags used. + If with_select_tf_ops is not true, this will be an empty list. + """ + + # Filter redundant flex deps if already provided. 
+ flex_dep = clean_dep("//tensorflow/lite/delegates/flex:delegate") + if type(deps) == type([]) and flex_dep in deps: + return [] + + return select({ + clean_dep("//tensorflow/lite:with_select_tf_ops"): [flex_dep], + "//conditions:default": [], + }) + def tflite_jni_binary( name, copts = tflite_copts(), @@ -139,7 +159,7 @@ def tflite_jni_binary( copts = copts, linkshared = linkshared, linkstatic = linkstatic, - deps = deps + [linkscript, exported_symbols], + deps = deps + [linkscript, exported_symbols] + maybe_flex_deps(deps), srcs = srcs, tags = tags, linkopts = linkopts, @@ -149,6 +169,7 @@ def tflite_jni_binary( def tflite_cc_shared_object( name, copts = tflite_copts(), + deps = [], linkopts = [], linkstatic = 1, per_os_targets = False, @@ -160,6 +181,7 @@ def tflite_cc_shared_object( linkstatic = linkstatic, linkopts = linkopts + tflite_jni_linkopts(), framework_so = [], + deps = deps + maybe_flex_deps(deps), per_os_targets = per_os_targets, **kwargs ) diff --git a/tensorflow/lite/g3doc/guide/ops_select.md b/tensorflow/lite/g3doc/guide/ops_select.md index 8a9109cf54c..3a867cdc619 100644 --- a/tensorflow/lite/g3doc/guide/ops_select.md +++ b/tensorflow/lite/g3doc/guide/ops_select.md @@ -212,8 +212,11 @@ TensorFlow ops library can be included and enabled as follows: * Enable monolithic builds if necessary by adding the `--config=monolithic` build flag. -* Add the TensorFlow ops delegate library dependency to the build - dependencies: `tensorflow/lite/delegates/flex:delegate`. +* Do one of the following: + * Include the `--define=with_select_tf_ops=true` build flag in the `bazel + build` invocation when building the TensorFlow Lite binary. + * Add the TensorFlow ops delegate library dependency to the build + dependencies: `tensorflow/lite/delegates/flex:delegate`. Note that the necessary `TfLiteDelegate` will be installed automatically when creating the interpreter at runtime as long as the delegate is linked into the From 3fbd5ac42ee7fab9b85af79056db047b8e28d2fd Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Fri, 29 May 2020 16:47:03 -0700 Subject: [PATCH 1425/1533] fix some linter errors for slurm_cluster_resolver. PiperOrigin-RevId: 313873815 Change-Id: I15ae65bb27af2ee9d60b3629c91c0234fbc8943f --- .../slurm_cluster_resolver.py | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py index 3b9f8a259dd..94c036963a3 100644 --- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py @@ -19,8 +19,8 @@ from __future__ import division from __future__ import print_function import os -import subprocess import re +import subprocess from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url @@ -29,7 +29,7 @@ from tensorflow.python.util.tf_export import tf_export def expand_hostlist(hostlist): - """Create a list of hosts out of a SLURM hostlist + """Create a list of hosts out of a SLURM hostlist. 
The order of nodes is preserved and no deduplication is done Input: 'n[1-2],m5,o[3-4,6,7-9]') @@ -37,7 +37,7 @@ def expand_hostlist(hostlist): """ def split_hostlist(hostlist): - """Split hostlist at commas outside of range expressions ('[3-5]')""" + """Split hostlist at commas outside of range expressions ('[3-5]').""" in_brackets = False cur_host = '' for c in hostlist: @@ -57,7 +57,7 @@ def expand_hostlist(hostlist): yield cur_host def expand_range_expression(range_exp): - """Expand a range expression like '3-5' to values 3,4,5""" + """Expand a range expression like '3-5' to values 3,4,5.""" for part in range_exp.split(','): sub_range = part.split('-') if len(sub_range) == 1: @@ -87,7 +87,7 @@ def expand_hostlist(hostlist): def expand_tasks_per_node(tasks_per_node): - """Expand the tasks per node expression from SLURM + """Expands the tasks per node expression from SLURM. The order is preserved so it can be matched to the hostlist Input: '3(x2),2,1' @@ -108,7 +108,7 @@ def expand_tasks_per_node(tasks_per_node): def _get_slurm_var(name): - """Get the SLURM variable from the environment + """Gets the SLURM variable from the environment. Args: name: Name of the step variable @@ -126,8 +126,8 @@ def _get_slurm_var(name): 'Not running inside a SLURM step?' % name) -def get_num_slurm_tasks(): - """Return the number of SLURM tasks of the current job step +def _get_num_slurm_tasks(): + """Returns the number of SLURM tasks of the current job step. Returns: The number of tasks as an int @@ -136,7 +136,7 @@ def get_num_slurm_tasks(): def _get_num_nvidia_gpus(): - """Get the number of NVIDIA GPUs by using CUDA_VISIBLE_DEVICES and nvidia-smi + """Gets the number of NVIDIA GPUs by using CUDA_VISIBLE_DEVICES and nvidia-smi. Returns: Number of GPUs available on the node @@ -157,9 +157,9 @@ def _get_num_nvidia_gpus(): def get_num_gpus(): - """Return the number of GPUs visible on the current node + """Returns the number of GPUs visible on the current node. - Currently only implemented for NVIDIA GPUs + Currently only implemented for NVIDIA GPUs. """ return _get_num_nvidia_gpus() @@ -176,7 +176,6 @@ class SlurmClusterResolver(ClusterResolver): used for distributed TensorFlow. """ - def __init__(self, jobs=None, port_base=8888, @@ -276,19 +275,19 @@ class SlurmClusterResolver(ClusterResolver): sum(self._jobs.values()), num_tasks)) def _resolve_own_rank(self): - """Return the rank of the current task in range [0, num_tasks)""" + """Returns the rank of the current task in range [0, num_tasks).""" return int(_get_slurm_var('PROCID')) def _resolve_num_tasks(self): - """Return the number of tasks for the current job step""" - return get_num_slurm_tasks() + """Returns the number of tasks for the current job step.""" + return _get_num_slurm_tasks() def _resolve_hostlist(self): - """Return a list of hostnames for nodes running the current job step""" + """Returns a list of hostnames for nodes running the current job step.""" return expand_hostlist(_get_slurm_var('STEP_NODELIST')) def _resolve_task_configuration(self): - """Create a mapping of hostnames to the number of tasks allocated on it + """Creates a mapping of hostnames to the number of tasks allocated on it. Reads the SLURM environment to determine the nodes involved in the current job step and number of tasks running on each node. 
@@ -352,7 +351,7 @@ class SlurmClusterResolver(ClusterResolver): cluster_rank_offset_start = cluster_rank_offset_end - if self._auto_set_gpu is True: + if self._auto_set_gpu: os.environ['CUDA_VISIBLE_DEVICES'] = self._gpu_allocation[self._rank] return ClusterSpec(self._cluster_allocation) From 481aec85d669dad80f8d8e0c53e03c45f956959d Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Fri, 29 May 2020 16:51:24 -0700 Subject: [PATCH 1426/1533] Fix for reported Hexagon compilation error PiperOrigin-RevId: 313874395 Change-Id: If71d9b1aa6aedff436aa8be59efac490ba163ed0 --- tensorflow/lite/c/common.h | 6 ++++-- .../lite/tools/benchmark/experimental/c/c_api_types.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index ab150e87d93..d95c1431041 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -86,8 +86,9 @@ typedef struct TfLiteIntArray { int size; // gcc 6.1+ have a bug where flexible members aren't properly handled // https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \ - __GNUC_MINOR__ >= 1 +#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \ + __GNUC_MINOR__ >= 1) || \ + defined(HEXAGON) int data[0]; #else int data[]; @@ -125,6 +126,7 @@ typedef struct TfLiteFloatArray { int size; // gcc 6.1+ have a bug where flexible members aren't properly handled // https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c +// This also applies to the toolchain used for Qualcomm Hexagon DSPs. #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \ __GNUC_MINOR__ >= 1 float data[0]; diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h index ab150e87d93..d95c1431041 100644 --- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h +++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h @@ -86,8 +86,9 @@ typedef struct TfLiteIntArray { int size; // gcc 6.1+ have a bug where flexible members aren't properly handled // https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c -#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \ - __GNUC_MINOR__ >= 1 +#if (!defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \ + __GNUC_MINOR__ >= 1) || \ + defined(HEXAGON) int data[0]; #else int data[]; @@ -125,6 +126,7 @@ typedef struct TfLiteFloatArray { int size; // gcc 6.1+ have a bug where flexible members aren't properly handled // https://github.com/google/re2/commit/b94b7cd42e9f02673cd748c1ac1d16db4052514c +// This also applies to the toolchain used for Qualcomm Hexagon DSPs. #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ == 6 && \ __GNUC_MINOR__ >= 1 float data[0]; From 00f027a374c71bd1a1976e8ac8f0d9c333c5e172 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 29 May 2020 17:12:33 -0700 Subject: [PATCH 1427/1533] Update TPUExtractHeadTailOutsideCompilation pass to support tail outside compiled computation extraction. This extends the pass to lift ops to after the cluster if they are outside compiled and have no dependencies, other than other ops that are to be lifted. 
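Concretely, an outside-compiled op at the end of a cluster whose only
dependencies are other ops being lifted, e.g. (condensed from the new tests
below)

    "tf_device.cluster"() ( {
      %a = "tf.A"() : () -> tensor<i32>
      "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor<i32>) -> ()
      tf_device.return
    }) {...}

is now rewritten so that %a is returned from the cluster and "tf.B" runs in a
separate host-side tf_device.launch placed after the cluster, mirroring the
existing head extraction of outside compiled ops at the start of a cluster.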
PiperOrigin-RevId: 313877620 Change-Id: Ia5d068d74383206dc6ffaed06f429b7b93767271 --- ...extract_head_tail_outside_compilation.mlir | 228 ++++++++++++++++++ ...u_extract_head_tail_outside_compilation.cc | 217 ++++++++++++++--- 2 files changed, 406 insertions(+), 39 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index dd31b7d06ef..d5fb821b5e6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -157,4 +157,232 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor } return } + + // CHECK-LABEL: func @tail_single_outside_compiled_op + func @tail_single_outside_compiled_op() { + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: "tf_device.launch" + // CHECK-NEXT: "tf.B"(%[[CLUSTER_OUT]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + "tf_device.cluster"() ( { + %a = "tf.A"() : () -> tensor + "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.C"() : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @tail_single_outside_compiled_op_user + func @tail_single_outside_compiled_op_user() -> tensor { + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[CLUSTER_OUT]]) + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + %cluster = "tf_device.cluster"() ( { + %a = "tf.A"() : () -> tensor + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + "tf.C"() : () -> () + tf_device.return %b : tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor + // CHECK: return %[[LAUNCH_OUT]] + return %cluster : tensor + } + + // CHECK-LABEL: func @tail_multiple_outside_compiled_ops + func @tail_multiple_outside_compiled_ops(%arg0: tensor) { + // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" + // CHECK-NEXT: tf_device.return %[[B_OUT]], %[[A_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: "tf_device.launch" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%arg0, %[[CLUSTER_OUT]]#1) + // CHECK-NEXT: "tf.D"(%[[C_OUT]], 
%arg0, %[[CLUSTER_OUT]]#0) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + "tf_device.cluster"() ( { + %a = "tf.A"() : () -> tensor + %b = "tf.B"(%arg0) : (tensor) -> tensor + %c = "tf.C"(%arg0, %a) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + "tf.D"(%c, %arg0, %b) {_xla_outside_compilation = "cluster1"} : (tensor, tensor, tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @tail_aliased_output + func @tail_aliased_output() -> (tensor, tensor, tensor, tensor, tensor) { + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A" + %a = "tf.A"() : () -> tensor + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" + %b = "tf.B"() : () -> tensor + // CHECK: %[[CLUSTER_OUT:.*]]:2 = "tf_device.cluster" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C" + // CHECK-NEXT: %[[E_OUT:.*]] = "tf.E" + // CHECK-NEXT: tf_device.return %[[C_OUT]], %[[E_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[D_OUT:.*]] = "tf.D"(%[[CLUSTER_OUT]]#0, %[[A_OUT]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + %cluster:5 = "tf_device.cluster"() ( { + %c = "tf.C"() : () -> tensor + %d = "tf.D"(%c, %a) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %e = "tf.E"() : () -> tensor + tf_device.return %a, %b, %c, %d, %e : tensor, tensor, tensor, tensor, tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> (tensor, tensor, tensor, tensor, tensor) + // CHECK: return %[[A_OUT]], %[[B_OUT]], %[[CLUSTER_OUT]]#0, %[[LAUNCH_OUT]], %[[CLUSTER_OUT]]#1 + return %cluster#0, %cluster#1, %cluster#2, %cluster#3, %cluster#4 : tensor, tensor, tensor, tensor, tensor + } + + // CHECK-LABEL: func @tail_replicated_outside_compilation + func @tail_replicated_outside_compilation(%arg0: tensor, %arg1: tensor) { + // CHECK: tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor) + // + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK-NEXT: "tf_device.launch"() + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[CLUSTER_OUT]], %[[RI]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { + "tf_device.cluster"() ( { + %a = "tf.A"(%ri) : (tensor) -> tensor + %b = "tf.B"(%a, %ri) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + tf_device.return + } + return + } + + // CHECK-LABEL: func @head_tail_no_extraction_middle_outside_compiled_ops + func @head_tail_no_extraction_middle_outside_compiled_ops(%arg0: tensor) { + // CHECK-NOT: "tf_device.launch" + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // 
CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %a = "tf.A"(%arg0) : (tensor) -> tensor + %b = "tf.B"(%a) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + "tf.C"(%b) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } + + // CHECK-LABEL: func @head_tail_simple_extraction + func @head_tail_simple_extraction(%arg0: tensor) -> tensor { + // CHECK: %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%arg0) + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B"(%[[HEAD_LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK: %[[TAIL_LAUNCH_OUT:.*]] = "tf_device.launch" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[CLUSTER_OUT]]) + // CHECK-NEXT: tf_device.return %[[C_OUT]] + // CHECK-NEXT: device = "/job:worker/replica:0/task:0/device:CPU:0" + %cluster = "tf_device.cluster"() ( { + %a = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %b = "tf.B"(%a) : (tensor) -> tensor + %c = "tf.C"(%b) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + tf_device.return %c : tensor + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> tensor + // CHECK: return %[[TAIL_LAUNCH_OUT]] + return %cluster : tensor + } + + // CHECK-LABEL: func @head_tail_replicated_outside_compilation + func @head_tail_replicated_outside_compilation(%arg0: tensor, %arg1: tensor) { + // CHECK: tf_device.replicate([%arg0, %arg1] as %[[RI:.*]]: tensor) + // + // CHECK-NEXT: %[[HEAD_LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK-NEXT: %[[A_OUT:.*]] = "tf.A"(%[[RI]]) + // CHECK-NEXT: tf_device.return %[[A_OUT]] + // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + // + // CHECK: %[[CLUSTER_OUT:.*]] = "tf_device.cluster" + // CHECK-NEXT: %[[B_OUT:.*]] = "tf.B" + // CHECK-NEXT: %[[C_OUT:.*]] = "tf.C"(%[[RI]], %[[B_OUT]]) + // CHECK-NEXT: "tf.E"(%[[C_OUT]], %[[HEAD_LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return %[[C_OUT]] + // CHECK-NEXT: { + // CHECK-DAG: num_cores_per_replica = 1 + // CHECK-DAG: step_marker_location = "" + // CHECK-DAG: padding_map = [] + // CHECK-DAG: topology = "" + // CHECK-DAG: device_assignment = [] + // + // CHECK-NEXT: "tf_device.launch"() + // CHECK-NEXT: "tf.D"(%[[HEAD_LAUNCH_OUT]], %[[CLUSTER_OUT]], %[[RI]]) + // CHECK-NEXT: tf_device.return + // CHECK-NEXT: device = "TPU_REPLICATED_HOST" + tf_device.replicate([%arg0, %arg1] as %ri : tensor) {n = 2 : i32} { + "tf_device.cluster"() ( { + %a = "tf.A"(%ri) {_xla_outside_compilation = "cluster1"} : (tensor) -> tensor + %b = "tf.B"() : () -> tensor + %c = "tf.C"(%ri, %b) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> tensor + %d = "tf.D"(%a, %c, %ri) {_xla_outside_compilation = "cluster1"} : (tensor, tensor, tensor) -> tensor + %e = "tf.E"(%c, %a) : (tensor, tensor) -> tensor + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + tf_device.return + } + 
return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 95183e04223..688f21c1d52 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -78,9 +78,11 @@ bool OpInBlock(Operation* op, Block* block) { } // Wraps block in a Launch. External uses of ops in the block will be return -// values of the Launch and remapped to the Launch results. +// values of the Launch and remapped to the Launch results. If `before` is set +// to true, the Launch is created before `op`. Otherwise the Launch is created +// after `op`. tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder, Operation* op, - Block* launch_block, + bool before, Block* launch_block, llvm::StringRef host_device) { // Find results and result types of ops in block that needs to returned. llvm::SmallVector launch_results; @@ -100,7 +102,7 @@ tf_device::LaunchOp CreateLaunchForBlock(OpBuilder* builder, Operation* op, } } - builder->setInsertionPoint(op); + before ? builder->setInsertionPoint(op) : builder->setInsertionPointAfter(op); auto launch = builder->create( op->getLoc(), builder->getStringAttr(host_device), launch_result_types); launch.body().push_back(launch_block); @@ -178,22 +180,21 @@ llvm::SmallVector FindOutsideCompiledOpsAtHead( Region* cluster_region = &cluster.body(); llvm::SmallSetVector head_outside_compiled_ops; - auto walk_operands = [&](Operation* op) { - for (Value operand : op->getOperands()) { - Operation* operand_op = GetOpOfValue(operand); - if (head_outside_compiled_ops.count(operand_op)) continue; - - if (operand_op->getParentRegion() == cluster_region) - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }; auto cluster_ops = cluster.GetBody().without_terminator(); for (Operation& cluster_op : cluster_ops) { if (!HasOutsideCompilationAttribute(&cluster_op)) continue; // An outside compiled op can be extracted if its operands are not from // other ops in the cluster that cannot be extracted. - auto walk_result = cluster_op.walk(walk_operands); + auto walk_result = cluster_op.walk([&](Operation* op) { + for (Value operand : op->getOperands()) { + Operation* operand_op = GetOpOfValue(operand); + if (head_outside_compiled_ops.count(operand_op)) continue; + + if (operand_op->getParentRegion() == cluster_region) + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); if (!walk_result.wasInterrupted()) head_outside_compiled_ops.insert(&cluster_op); @@ -211,8 +212,8 @@ void CreateHeadComputation(OpBuilder* builder, tf_device::ClusterOp cluster, for (Operation* head_outside_compiled_op : head_outside_compiled_ops) head_outside_compiled_op->moveBefore(launch_block, launch_block->end()); - tf_device::LaunchOp launch = - CreateLaunchForBlock(builder, cluster, launch_block, host_device); + tf_device::LaunchOp launch = CreateLaunchForBlock( + builder, cluster, /*before=*/true, launch_block, host_device); for (auto result : llvm::zip(launch.GetBody().getTerminator()->getOperands(), launch.getResults())) @@ -220,6 +221,160 @@ void CreateHeadComputation(OpBuilder* builder, tf_device::ClusterOp cluster, cluster.body()); } +// Extracts and move outside compiled ops that have no dependencies in the +// cluster to before the cluster. 
+mlir::LogicalResult LiftHeadOutsideCompiledOps( + OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, + tf_device::ClusterOp cluster, std::string* host_device, + bool* cluster_updated) { + llvm::SmallVector head_outside_compiled_ops = + FindOutsideCompiledOpsAtHead(cluster); + if (head_outside_compiled_ops.empty()) return success(); + if (failed( + GetHostDeviceForHeadTailComputation(devices, cluster, host_device))) + return failure(); + + CreateHeadComputation(builder, cluster, head_outside_compiled_ops, + *host_device); + + *cluster_updated = true; + return success(); +} + +// Fills `tail_outside_compiled_ops` with ops that are outside compiled and +// can be extracted to after the TPU computation, and `cluster_results` with new +// results of the cluster. These ops are either connected to the output of the +// TPU computation or other ops that can be extracted, and have no results used +// by other ops in the TPU computation that cannot be extracted. +void FindOutsideCompiledOpsAtTailAndClusterResults( + tf_device::ClusterOp cluster, + llvm::SmallVectorImpl* tail_outside_compiled_ops, + llvm::SmallVectorImpl* cluster_results) { + Region* cluster_region = &cluster.body(); + llvm::SmallSetVector tail_outside_compiled_ops_set; + Operation* terminator = cluster.GetBody().getTerminator(); + llvm::SmallSetVector cluster_results_set; + cluster_results_set.insert(terminator->getOperands().begin(), + terminator->getOperands().end()); + + auto cluster_ops = llvm::reverse(cluster.GetBody().without_terminator()); + for (Operation& cluster_op : cluster_ops) { + if (!HasOutsideCompilationAttribute(&cluster_op)) continue; + + llvm::SmallVector results_to_forward; + bool can_be_extracted = + llvm::all_of(cluster_op.getUsers(), [&](Operation* op) { + return op == terminator || tail_outside_compiled_ops_set.count(op); + }); + if (!can_be_extracted) continue; + + // Collect operands of cluster op that are generated within the cluster. + // These values should be returned by the cluster. + cluster_op.walk([&](Operation* op) { + for (Value operand : op->getOperands()) { + Operation* operand_op = GetOpOfValue(operand); + if (operand_op->getParentRegion() == cluster_region) + cluster_results_set.insert(operand); + } + }); + + // Remove results of op to be extracted as there are no uses in the cluster. + for (Value result : cluster_op.getResults()) + cluster_results_set.remove(result); + tail_outside_compiled_ops_set.insert(&cluster_op); + } + + *tail_outside_compiled_ops = tail_outside_compiled_ops_set.takeVector(); + *cluster_results = cluster_results_set.takeVector(); +} + +// Moves tail outside compiled ops into its own `tf_device.LaunchOp` +// computation after the cluster. 
+void CreateTailComputation(OpBuilder* builder, tf_device::ClusterOp cluster, + llvm::ArrayRef tail_outside_compiled_ops, + llvm::StringRef host_device) { + Block* launch_block = new Block; + for (Operation* tail_outside_compiled_op : tail_outside_compiled_ops) + tail_outside_compiled_op->moveBefore(launch_block, launch_block->begin()); + + tf_device::LaunchOp launch = CreateLaunchForBlock( + builder, cluster, /*before=*/false, launch_block, host_device); + + auto operand_not_in_launch = [&](OpOperand& operand) { + return !launch.getOperation()->isProperAncestor(operand.getOwner()); + }; + for (auto result : llvm::zip(launch.GetBody().getTerminator()->getOperands(), + launch.getResults())) + std::get<0>(result).replaceUsesWithIf(std::get<1>(result), + operand_not_in_launch); +} + +// Updates cluster with updated cluster results after extracting tail outside +// compiled ops. +tf_device::ClusterOp UpdateClusterResults( + OpBuilder* builder, tf_device::ClusterOp cluster, + llvm::ArrayRef new_cluster_results) { + Operation* old_terminator = cluster.GetBody().getTerminator(); + builder->setInsertionPoint(old_terminator); + builder->create(old_terminator->getLoc(), + new_cluster_results); + old_terminator->erase(); + + builder->setInsertionPoint(cluster); + llvm::SmallVector new_cluster_result_types; + new_cluster_result_types.reserve(new_cluster_results.size()); + for (const auto& new_cluster_result : new_cluster_results) + new_cluster_result_types.push_back(new_cluster_result.getType()); + + auto new_cluster = builder->create( + cluster.getLoc(), new_cluster_result_types, + /*operands=*/llvm::ArrayRef{}, cluster.getAttrs()); + new_cluster.body().takeBody(cluster.body()); + + auto operand_not_in_cluster = [&](OpOperand& operand) { + return !new_cluster.getOperation()->isProperAncestor(operand.getOwner()); + }; + for (auto result : + llvm::zip(new_cluster.GetBody().getTerminator()->getOperands(), + new_cluster.getResults())) + std::get<0>(result).replaceUsesWithIf(std::get<1>(result), + operand_not_in_cluster); + + cluster.erase(); + return new_cluster; +} + +// Extracts and move outside compiled ops that do not create dependencies in the +// cluster to after the cluster. +mlir::LogicalResult LiftTailOutsideCompiledOps( + OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, + std::string host_device, tf_device::ClusterOp* cluster, + bool* cluster_updated) { + llvm::SmallVector tail_outside_compiled_ops; + llvm::SmallVector cluster_results; + FindOutsideCompiledOpsAtTailAndClusterResults( + *cluster, &tail_outside_compiled_ops, &cluster_results); + if (tail_outside_compiled_ops.empty()) return success(); + + if (host_device.empty()) + if (failed(GetHostDeviceForHeadTailComputation(devices, *cluster, + &host_device))) + return failure(); + + // Forward all results of cluster first. These results will be remapped once + // a new cluster is formed. + cluster->replaceAllUsesWith( + cluster->GetBody().getTerminator()->getOperands()); + + CreateTailComputation(builder, *cluster, tail_outside_compiled_ops, + host_device); + + *cluster = UpdateClusterResults(builder, *cluster, cluster_results); + + *cluster_updated = true; + return success(); +} + // Removes aliased outputs in cluster from ops outside of cluster. 
void RemoveClusterAliasedOutputs(OpBuilder* builder, tf_device::ClusterOp cluster) { @@ -256,26 +411,6 @@ void RemoveClusterAliasedOutputs(OpBuilder* builder, cluster.erase(); } -// Extracts and move outside compiled ops that have no dependencies in the -// cluster to before the cluster. -mlir::LogicalResult LiftHeadOutsideCompiledOps( - OpBuilder* builder, const mlir::TF::RuntimeDevices& devices, - tf_device::ClusterOp cluster) { - llvm::SmallVector head_outside_compiled_ops = - FindOutsideCompiledOpsAtHead(cluster); - if (head_outside_compiled_ops.empty()) return success(); - std::string host_device; - if (failed( - GetHostDeviceForHeadTailComputation(devices, cluster, &host_device))) - return failure(); - - CreateHeadComputation(builder, cluster, head_outside_compiled_ops, - host_device); - - RemoveClusterAliasedOutputs(builder, cluster); - return success(); -} - struct TPUExtractHeadTailOutsideCompilation : public PassWrapper> { @@ -295,10 +430,14 @@ void TPUExtractHeadTailOutsideCompilation::runOnOperation() { [&](tf_device::ClusterOp cluster) { clusters.push_back(cluster); }); for (tf_device::ClusterOp cluster : clusters) { - if (failed(LiftHeadOutsideCompiledOps(&builder, devices, cluster))) + std::string host_device; + bool cluster_updated = false; + if (failed(LiftHeadOutsideCompiledOps(&builder, devices, cluster, + &host_device, &cluster_updated)) || + failed(LiftTailOutsideCompiledOps(&builder, devices, host_device, + &cluster, &cluster_updated))) return signalPassFailure(); - - // TODO(b/157160906): Implement tail outside compiled op extraction. + if (cluster_updated) RemoveClusterAliasedOutputs(&builder, cluster); } } From 396105ddd5a6b5838be8de1f20172ea78c58c124 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 17:40:27 -0700 Subject: [PATCH 1428/1533] fix build due to cupti static link. PiperOrigin-RevId: 313880914 Change-Id: I4cf691e60fd28e773f124e440ebb30f054d5f2e9 --- tensorflow/core/profiler/internal/gpu/BUILD | 7 +++++++ tensorflow/stream_executor/cuda/cupti_stub.cc | 1 + 2 files changed, 8 insertions(+) diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index 670080573b2..2bb0805a592 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -98,11 +98,18 @@ tf_cuda_library( ] + tf_additional_cupti_deps(), ) +# Rationale for linkstatic: The symbols in libcupti_static.a have hidden +# visibility. The wrapper will fail to find them if it's ever built as a +# shared library. This is the same issue as b/11094727. Always linking +# the wrapper statically works around the issue. An alternative would be +# to patch libcupti_static, but it's not worth the trouble considering +# that the wrapper is about the only direct user. tf_cuda_library( name = "cupti_wrapper", srcs = if_cuda_is_configured_compat(["cupti_wrapper.cc"]), hdrs = if_cuda_is_configured_compat(["cupti_wrapper.h"]), copts = tf_copts(), + linkstatic = 1, visibility = ["//visibility:public"], deps = [ ":cupti_interface", diff --git a/tensorflow/stream_executor/cuda/cupti_stub.cc b/tensorflow/stream_executor/cuda/cupti_stub.cc index feea09e002a..313d68eddfc 100644 --- a/tensorflow/stream_executor/cuda/cupti_stub.cc +++ b/tensorflow/stream_executor/cuda/cupti_stub.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "third_party/gpus/cuda/extras/CUPTI/include/cupti.h" +#include "third_party/gpus/cuda/include/cuda.h" // IWYU pragma: no_include "perftools/gputools/executor/stream_executor.h" #include "tensorflow/stream_executor/lib/env.h" #include "tensorflow/stream_executor/platform/dso_loader.h" From ffa78a720c8b24b600e80b718e4c4f4d4678ac8c Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Fri, 29 May 2020 17:44:00 -0700 Subject: [PATCH 1429/1533] Combine the memory access breakdown when combining OpMetrics. PiperOrigin-RevId: 313881286 Change-Id: I5d8616a14df176e632ccaf43a68f5f0422eda85e --- tensorflow/core/profiler/convert/BUILD | 1 + .../convert/op_metrics_db_combiner.cc | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 390f94157c3..6c4116199ef 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -59,6 +59,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", "//tensorflow/core/profiler/utils:op_metrics_db_utils", + "@com_google_absl//absl/container:flat_hash_map", ], ) diff --git a/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc b/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc index 8229d1020b9..0e0f5b1387f 100644 --- a/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc +++ b/tensorflow/core/profiler/convert/op_metrics_db_combiner.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" @@ -22,6 +23,33 @@ namespace tensorflow { namespace profiler { namespace { +using OperationType = OpMetrics::MemoryAccessed::OperationType; + +void CombineMemoryAccessedBreakdown(const OpMetrics& src, OpMetrics* dst) { + absl::flat_hash_map, + OpMetrics_MemoryAccessed*> + dst_memory_accessed_map; + for (auto& dst_memory_accessed : *dst->mutable_memory_accessed_breakdown()) { + dst_memory_accessed_map[{dst_memory_accessed.memory_space(), + dst_memory_accessed.operation_type()}] = + &dst_memory_accessed; + } + for (const auto& src_memory_accessed : src.memory_accessed_breakdown()) { + uint64 memory_space = src_memory_accessed.memory_space(); + OperationType operation_type = src_memory_accessed.operation_type(); + auto*& dst_memory_accessed = + dst_memory_accessed_map[{memory_space, operation_type}]; + if (dst_memory_accessed == nullptr) { + dst_memory_accessed = dst->add_memory_accessed_breakdown(); + dst_memory_accessed->set_memory_space(memory_space); + dst_memory_accessed->set_operation_type(operation_type); + } + dst_memory_accessed->set_bytes_accessed( + src_memory_accessed.bytes_accessed() + + dst_memory_accessed->bytes_accessed()); + } +} + // Combines the src OpMetrics into the dst OpMetrics. 
void CombineOpMetrics(const OpMetrics& src, OpMetrics* dst) { DCHECK(dst != nullptr); @@ -42,6 +70,7 @@ void CombineOpMetrics(const OpMetrics& src, OpMetrics* dst) { dst->set_self_time_ps(src.self_time_ps() + dst->self_time_ps()); dst->set_flops(src.flops() + dst->flops()); dst->set_bytes_accessed(src.bytes_accessed() + dst->bytes_accessed()); + CombineMemoryAccessedBreakdown(src, dst); dst->set_dma_stall_ps(src.dma_stall_ps() + dst->dma_stall_ps()); } From ea8070dbbb72d558d150940634d32571de7e88c6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 17:51:45 -0700 Subject: [PATCH 1430/1533] Internal change PiperOrigin-RevId: 313882078 Change-Id: I880b13ac290a350151cb41067a73e75809802001 --- .../analysis/side_effect_analysis.cc | 2 +- .../graphdef2mlir/graph-as-function.pbtxt | 2 +- .../graph-function-resource-args.pbtxt | 6 +- .../mlir2graphdef/function-resource-args.mlir | 4 +- .../tests/side-effect-analysis-test.mlir | 6 +- .../tensorflow/tests/tf_saved_model/basic.py | 2 +- .../tests/tf_saved_model/call_to_exported.py | 4 +- .../tensorflow/tests/tf_saved_model/keras.py | 2 +- .../tests/tf_saved_model/structured_input.py | 34 ++++---- .../tensorflow/translate/export_graphdef.cc | 50 +++++------- .../mlir/tensorflow/translate/import_model.cc | 79 ++++++++++--------- 11 files changed, 93 insertions(+), 98 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index f7b88317cd4..ff1620347f7 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -49,7 +49,7 @@ namespace TF { namespace { constexpr int64_t kUnknownResourceId = -1; -constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; +constexpr char kResourceArgUniqueIdAttr[] = "tf.resource_arg_unique_id"; // Returns if a VarHandleOp is anonymous, which means it always creates a new // variable. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt index 03640e24aac..d26585edb03 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -13,7 +13,7 @@ # CHECK: %[[ISLAND_2:.*]], %[[ISLAND_2_control:.*]] = tf_executor.island wraps "tf.StatefulPartitionedCall" # CHECK-SAME: f = @[[FUNC:[a-z0-9]*]] # CHECK: tf_executor.fetch %[[ISLAND_1]], %[[ISLAND_2]] : tensor<*xf32>, tensor<*xf32> -# CHECK: func @[[FUNC]](%arg0: tensor<*xf32> {tf._user_specified_name = "inputs"}, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> +# CHECK: func @[[FUNC]](%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> node { name: "args_0" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt index eb358d52b26..0e6e561225d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt @@ -81,7 +81,7 @@ library { } # Check that the `resource_arg_unique_id` for each argument is propagated to the -# `tf._resource_arg_unique_id` argument attribute of the function +# `tf.resource_arg_unique_id` argument attribute of the function # @test_func_name0. # CHECK: func @main @@ -92,8 +92,8 @@ library { # CHECK: tf_executor.fetch # CHECK: return # CHECK: func @test_func_name0 -# CHECK-SAME: tf._resource_arg_unique_id = 0 -# CHECK-SAME: tf._resource_arg_unique_id = 0 +# CHECK-SAME: tf.resource_arg_unique_id = 0 +# CHECK-SAME: tf.resource_arg_unique_id = 0 # CHECK: tf_executor.graph # CHECK: tf_executor.fetch # CHECK: return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir index 44824ea1424..680e26f5cbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir @@ -8,14 +8,14 @@ func @main() -> tensor<*x!tf.resource> attributes {tf.entry_function = {inputs = } return %0 : tensor<*x!tf.resource> } -func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf._resource_arg_unique_id = 0 : i64}, %arg1: tensor<*x!tf.resource> {tf._resource_arg_unique_id = 0 : i64}) -> tensor<*x!tf.resource> attributes {tf._disable_call_shape_inference = true} { +func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf.resource_arg_unique_id = 0 : i64}, %arg1: tensor<*x!tf.resource> {tf.resource_arg_unique_id = 0 : i64}) -> tensor<*x!tf.resource> attributes {tf._disable_call_shape_inference = true} { %0 = tf_executor.graph { tf_executor.fetch %arg0 : tensor<*x!tf.resource> } return %0 : tensor<*x!tf.resource> } -// Check that the `tf._resource_arg_unique_id` argument attributes of +// Check that the `tf.resource_arg_unique_id` argument attributes of // test_func_name0 are propagated to the function's arg_attr and // resource_arg_unique_id. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir index 965b3b10843..c36771c0576 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir @@ -786,9 +786,9 @@ func @tf_registry_ops( // CHECK-LABEL: func @arguments_with_unique_ids func @arguments_with_unique_ids( // expected-remark@above {{ID: 9}} - %arg0: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 0 : i64}, - %arg1: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 0 : i64}, - %arg2: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 33 : i64}) { + %arg0: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 0 : i64}, + %arg1: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 0 : i64}, + %arg2: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 33 : i64}) { tf_executor.graph { // expected-remark@above {{ID: 7}} // expected-remark@above {{Successors: {8}}} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py index b337224e680..78c18a17d4a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py @@ -45,7 +45,7 @@ class TestModule(tf.Module): # CHECK: "tf_saved_model.global_tensor"() {is_mutable, sym_name = "[[VAR:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = ["v42"], type = tensor, value = dense<4.200000e+01> : tensor} : () -> () # CHECK: "tf_saved_model.global_tensor"() {sym_name = "[[CONST:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = [], type = tensor, value = dense<4.300000e+01> : tensor} : () -> () # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor>> {tf_saved_model.bound_input = @[[VAR]]}, # CHECK-SAME: %arg2: tensor>> {tf_saved_model.bound_input = @[[CONST]]}) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = []}) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py index 694942f4b00..658cc37a22f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py @@ -45,7 +45,7 @@ class TestModule(tf.Module): # modify signatures interprocedurally). 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, @@ -54,7 +54,7 @@ class TestModule(tf.Module): # CHECK: "tf.StatefulPartitionedCall"{{.*}}f = @[[CALLEE_INTERNAL:[a-zA-Z_0-9]+]] # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py index ffb5c024bbb..a95909b61ef 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py @@ -39,7 +39,7 @@ class TestModule(tf.Module): super(TestModule, self).__init__() self.model = mnist_model() - # CHECK: func {{@[a-zA-Z_0-9]+}}(%arg0: tensor<1x28x28x1xf32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0]} + # CHECK: func {{@[a-zA-Z_0-9]+}}(%arg0: tensor<1x28x28x1xf32> {tf_saved_model.index_path = [0]} # CHECK: attributes {{.*}} tf_saved_model.exported_names = ["my_predict"] @tf.function(input_signature=[ tf.TensorSpec([1, 28, 28, 1], tf.float32), diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py index 43591d12183..095fddbda96 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py @@ -36,8 +36,8 @@ class TestModule(tf.Module): # The outer layer of the index path indexes into the arguments. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "y", tf_saved_model.index_path = [1]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [1]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0000_function_arity"] @tf.function(input_signature=[ tf.TensorSpec([1], tf.float32), @@ -49,8 +49,8 @@ class TestModule(tf.Module): # Check index paths for lists. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "l", tf_saved_model.index_path = [0, 0]}, - # CHECK-SAME: %arg1: tensor {tf._user_specified_name = "l", tf_saved_model.index_path = [0, 1]}) + # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0, 0]}, + # CHECK-SAME: %arg1: tensor {tf_saved_model.index_path = [0, 1]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0001_list_2_elements"] @tf.function(input_signature=[[ tf.TensorSpec([], tf.float32), @@ -63,8 +63,8 @@ class TestModule(tf.Module): # Keys are linearized in sorted order, matching `tf.nest.flatten`. 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0002_dict_2_keys"] @tf.function(input_signature=[{ 'x': tf.TensorSpec([1], tf.float32), @@ -77,8 +77,8 @@ class TestModule(tf.Module): # The index path should be insensitive to the key order. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0003_dict_2_keys_out_of_order"] @tf.function(input_signature=[{ 'y': tf.TensorSpec([2], tf.float32), @@ -90,12 +90,12 @@ class TestModule(tf.Module): # Slightly stronger stress test of multiple dict keys. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "a"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "b"]}, - # CHECK-SAME: %arg2: tensor<3xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "c"]}, - # CHECK-SAME: %arg3: tensor<4xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg4: tensor<5xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}, - # CHECK-SAME: %arg5: tensor<6xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "z"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "a"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "b"]}, + # CHECK-SAME: %arg2: tensor<3xf32> {tf_saved_model.index_path = [0, "c"]}, + # CHECK-SAME: %arg3: tensor<4xf32> {tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg4: tensor<5xf32> {tf_saved_model.index_path = [0, "y"]}, + # CHECK-SAME: %arg5: tensor<6xf32> {tf_saved_model.index_path = [0, "z"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0004_dict_many_keys"] @tf.function(input_signature=[{ 'x': tf.TensorSpec([4], tf.float32), @@ -112,9 +112,9 @@ class TestModule(tf.Module): # Note that list elements can have heterogenous types. 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x", 0]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x", 1]}, - # CHECK-SAME: %arg2: tensor<3xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x", 0]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "x", 1]}, + # CHECK-SAME: %arg2: tensor<3xf32> {tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0005_more_complex_recursive_structure"] @tf.function(input_signature=[{ 'x': [tf.TensorSpec([1], tf.float32), diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 8e51f8c9a25..2bf55922d4b 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -70,6 +70,7 @@ using llvm::isa; using mlir::BlockArgument; using mlir::Dialect; using mlir::Operation; +using mlir::OperationState; using mlir::Value; using stream_executor::port::StatusOr; @@ -78,9 +79,6 @@ namespace { constexpr char kInvalidExecutorGraphMsg[] = "Functions must be of a single Graph with single op Islands: "; -constexpr char kDeviceAttr[] = "tf.device"; -constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; - bool IsLegalChar(char c, bool first_char) { if (isalpha(c)) return true; if (isdigit(c)) return true; @@ -269,14 +267,17 @@ StatusOr> Exporter::GetArgumentNode( (*node_def->mutable_attr())["index"] = index_attr; if (auto device_attr = - func.getArgAttrOfType(index, kDeviceAttr)) + func.getArgAttrOfType(index, "tf.device")) { *node_def->mutable_device() = device_attr.getValue().str(); + } - llvm::ArrayRef func_arg_i_attrs = - func.getArgAttrs(index); - absl::flat_hash_set attrs_to_ignore = {kDeviceAttr}; - TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, - node_def->mutable_attr())); + if (auto resource_arg_unique_id_attr = + func.getArgAttrOfType( + index, "tf.resource_arg_unique_id")) { + AttrValue unique_id_attr; + unique_id_attr.set_i(resource_arg_unique_id_attr.getInt()); + (*node_def->mutable_attr())["_resource_arg_unique_id"] = unique_id_attr; + } return node_def; } @@ -681,6 +682,14 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, if (auto attr = function.getAttrOfType(stateful_string)) { func_def.mutable_signature()->set_is_stateful(true); } + for (int64 i = 0; i < function.getNumArguments(); ++i) { + if (auto resource_arg_unique_id_attr = + function.getArgAttrOfType( + i, "tf.resource_arg_unique_id")) { + (*func_def.mutable_resource_arg_unique_id())[i] = + resource_arg_unique_id_attr.getInt(); + } + } // Ignore the gradient and is_stateful attribute on the function as they have // been handled above. 
@@ -690,28 +699,7 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, function.getDialectAttrs()); TF_RETURN_IF_ERROR( ConvertAttributes(funcAttrs, attrs_to_ignore, func_def.mutable_attr())); - - for (int i = 0, e = function.getNumArguments(); i < e; ++i) { - if (auto resource_arg_unique_id_attr = - function.getArgAttrOfType( - i, kResourceArgUniqueIdAttr)) { - (*func_def.mutable_resource_arg_unique_id())[i] = - resource_arg_unique_id_attr.getInt(); - } - - llvm::ArrayRef func_arg_i_attrs = - function.getArgAttrs(i); - if (func_arg_i_attrs.empty()) continue; - absl::flat_hash_set attrs_to_ignore = { - kDeviceAttr, kResourceArgUniqueIdAttr}; - FunctionDef::ArgAttrs func_def_arg_i_attrs; - TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, - func_def_arg_i_attrs.mutable_attr())); - if (func_def_arg_i_attrs.attr().empty()) continue; - (*func_def.mutable_arg_attr())[i] = std::move(func_def_arg_i_attrs); - } - - (*flib->add_function()) = std::move(func_def); + (*flib->add_function()) = func_def; return Status::OK(); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 24a1d40a8bb..bd63a3b224f 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -42,7 +42,6 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" -#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project @@ -133,13 +132,6 @@ bool IsOutputShapesAttribute(const AttrValue& attr_value, attr_value.value_case() == AttrValue::kList; } -bool IsResourceOutputShapesAttribute(const AttrValue& attr_value, - llvm::StringRef attr_name) { - if (attr_name == "_handle_dtypes" || attr_name == "_handle_shapes") - return attr_value.value_case() == AttrValue::kList; - return false; -} - // This class is used to generate new MLIR function name strings that are both // unique in the TF function library `flib_` and unique among the name strings // generated by the class object during its lifetime. @@ -203,11 +195,15 @@ class ImporterBase { StatusOr InferLibFunctionType(const FunctionBody& fbody); // Extracts arg and ret nodes from FunctionBody. + // `resource_arg_unique_ids` will be filled with the unique IDs of resource + // variables, as a list of {index, ID} pairs. void GetArgsAndRetsFromFunctionBody( const FunctionBody& fbody, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes, - absl::InlinedVector* control_ret_nodes); + absl::InlinedVector* control_ret_nodes, + absl::InlinedVector, 4>* + resource_arg_unique_ids); // Prepares converting the graph to an MLIR module. This step removes the // backedges of the graph, orders the nodes and infers the shapes. @@ -221,7 +217,8 @@ class ImporterBase { const absl::InlinedVector& ret_nodes, const absl::InlinedVector& control_ret_nodes, llvm::ArrayRef attrs, - bool function_graph); + const absl::InlinedVector, 4>& + resource_arg_unique_ids); // Finds out the function definition for the given function name from the // graph and converts it to a function of the module. 
This method is called @@ -1195,7 +1192,9 @@ StatusOr ImporterBase::ConvertAttributeValue( void ImporterBase::GetArgsAndRetsFromFunctionBody( const FunctionBody& fbody, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes, - absl::InlinedVector* control_ret_nodes) { + absl::InlinedVector* control_ret_nodes, + absl::InlinedVector, 4>* + resource_arg_unique_ids) { arg_nodes->reserve(fbody.arg_nodes.size()); ret_nodes->reserve(fbody.ret_nodes.size()); for (auto arg : fbody.arg_nodes) { @@ -1204,6 +1203,9 @@ void ImporterBase::GetArgsAndRetsFromFunctionBody( for (auto ret : fbody.ret_nodes) { ret_nodes->emplace_back(ret, 0); } + for (const auto& entry : fbody.fdef.resource_arg_unique_id()) { + resource_arg_unique_ids->push_back(entry); + } *control_ret_nodes = fbody.control_ret_nodes; } @@ -1298,13 +1300,14 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; absl::InlinedVector control_ret_nodes; + absl::InlinedVector, 4> resource_arg_unique_ids; GetArgsAndRetsFromFunctionBody(*fbody, &arg_nodes, &ret_nodes, - &control_ret_nodes); + &control_ret_nodes, &resource_arg_unique_ids); TF_RETURN_IF_ERROR(child_importer.Convert( mlir_func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, llvm::makeArrayRef(attributes.begin(), attributes.end()), - /*function_graph=*/true)); + resource_arg_unique_ids)); return Status::OK(); } @@ -1406,7 +1409,9 @@ Status ImporterBase::Convert( const absl::InlinedVector& arg_nodes, const absl::InlinedVector& ret_nodes, const absl::InlinedVector& control_ret_nodes, - llvm::ArrayRef attrs, bool function_graph) { + llvm::ArrayRef attrs, + const absl::InlinedVector, 4>& + resource_arg_unique_ids) { // TODO(b/122040776): Uses debug info for FunctionDef. auto function = mlir::FuncOp::create(mlir::UnknownLoc::get(context_), func_name, func_type, attrs); @@ -1432,6 +1437,10 @@ Status ImporterBase::Convert( TF_RETURN_IF_ERROR(ConvertFunctionArgAndRets(function, graph, func_type.getInputs(), arg_nodes, ret_nodes, control_ret_nodes)); + for (const auto& entry : resource_arg_unique_ids) { + function.setArgAttr(entry.first, "tf.resource_arg_unique_id", + builder_.getI64IntegerAttr(entry.second)); + } // TODO(jpienaar): Update post removing shape_refinier_. if (!specs_.enable_shape_inference) { @@ -1490,22 +1499,6 @@ Status ImporterBase::ConvertFunctionArgAndRets( i, "tf.device", builder_.getStringAttr(arg_node.node->requested_device())); - if (arg_node.node->IsArg()) { - for (const auto& arg_node_attr : arg_node.node->attrs()) { - const auto& key = arg_node_attr.first; - // Only import attributes starting with an underscore. - if (key.empty() || key[0] != '_') continue; - // Ignore shape inference attributes as shape information is already - // populated in the result type. - if (IsOutputShapesAttribute(arg_node_attr.second, key) || - IsResourceOutputShapesAttribute(arg_node_attr.second, key)) - continue; - TF_ASSIGN_OR_RETURN(auto converted_attr, - ConvertAttributeValue(arg_node_attr.second)); - func.setArgAttr(i, llvm::formatv("tf.{0}", key).str(), converted_attr); - } - } - island->dropAllReferences(); island->erase(); } @@ -2115,10 +2108,14 @@ class GraphDefImporter : public ImporterBase { // output nodes, for function graphs. Arguments and return values are // determined by node op type. Type and shape information of the function are // inferred by the shape refiner in ImporterBase. 
+ // `resource_arg_unique_ids` will be filled with the unique IDs of resource + // variables, as a list of {index, ID} pairs. StatusOr GetArgsRetsAndTypesFromFunctionGraph( mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes); + absl::InlinedVector* ret_nodes, + absl::InlinedVector, 4>* + resource_arg_unique_ids); // Finds the graph's target nodes/function's control ret nodes based on // supplied node names in `control_outputs`. If `control_outputs` are not @@ -2146,6 +2143,7 @@ StatusOr GraphDefImporter::Convert( absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; absl::InlinedVector control_ret_nodes; + absl::InlinedVector, 4> resource_arg_unique_ids; llvm::SmallVector attrs; if (specs.graph_as_function) { if (specs.prune_unused_nodes || !specs.inputs.empty() || @@ -2154,9 +2152,10 @@ StatusOr GraphDefImporter::Convert( "Pruning of graph is currently unsupported when the main graph is " "converted to a function."); - TF_ASSIGN_OR_RETURN(func_type, - importer.GetArgsRetsAndTypesFromFunctionGraph( - context, &arg_nodes, &ret_nodes)); + TF_ASSIGN_OR_RETURN( + func_type, + importer.GetArgsRetsAndTypesFromFunctionGraph( + context, &arg_nodes, &ret_nodes, &resource_arg_unique_ids)); TF_RETURN_IF_ERROR(importer.GetControlRetsFromGraph(specs.control_outputs, &control_ret_nodes)); @@ -2224,7 +2223,7 @@ StatusOr GraphDefImporter::Convert( TF_RETURN_IF_ERROR(importer.ImporterBase::Convert( func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, attrs, - specs.graph_as_function)); + resource_arg_unique_ids)); return module; } @@ -2341,7 +2340,9 @@ StatusOr GraphDefImporter::InferMainFunctionType( StatusOr GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes) { + absl::InlinedVector* ret_nodes, + absl::InlinedVector, 4>* + resource_arg_unique_ids) { auto add_node = [](Node* node, absl::InlinedVector* nodes) { auto* attr = node->attrs().Find("index"); if (!attr) @@ -2382,6 +2383,12 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( TF_ASSIGN_OR_RETURN(auto type, InferOutputType(*arg_node.node, /*idx=*/0, builder)); arg_types.push_back(type); + tensorflow::int64 resource_arg_unique_id; + if (TryGetNodeAttr(arg_node.node->attrs(), "_resource_arg_unique_id", + &resource_arg_unique_id)) { + resource_arg_unique_ids->emplace_back(arg_node_and_idx.index(), + resource_arg_unique_id); + } } llvm::SmallVector ret_types; From 3ebdec47df6c1afaa31ea6f0aa980f0f90729e87 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 29 May 2020 17:58:18 -0700 Subject: [PATCH 1431/1533] (Mostly) Remove operator version support in TfLite Micro. Only one op registration of a particular type can be added and that same op registration is returned via FindOp. The version number still exists in the API for backwards compatibility as well as compatibility with TfLite but is irrelevant for TfLiteMicro after this change. AddBuiltin/AddCustom API with min and max version params is removed. This is a step towards completely removing operator versioning from TFLiteMicro. 
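For illustration, a minimal sketch of the resulting call sites (mirroring the examples and tests updated in this change; the corresponding resolver and micro_ops headers are assumed to be included, and the resolver capacity and the CONV_2D/SOFTMAX ops are arbitrary example choices):

    // Registration: each op type is added exactly once, with no
    // min/max version arguments.
    tflite::MicroMutableOpResolver<2> micro_op_resolver;
    micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D,
                                 tflite::ops::micro::Register_CONV_2D());
    micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX,
                                 tflite::ops::micro::Register_SOFTMAX());

    // Lookup: FindOp takes only the operator; whatever registration was
    // added for that operator is returned, regardless of the version
    // recorded in the model.
    tflite::ops::micro::AllOpsResolver all_ops_resolver;
    const TfLiteRegistration* registration =
        all_ops_resolver.FindOp(tflite::BuiltinOperator_CONV_2D);
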
PiperOrigin-RevId: 313882723 Change-Id: I2bfa8dcee69226d498597b6d0dbfc67c60641a48 --- .../micro/benchmarks/keyword_benchmark.cc | 2 +- .../main_functions.cc | 15 +- .../person_detection_test.cc | 15 +- .../lite/micro/kernels/activations_test.cc | 12 +- tensorflow/lite/micro/kernels/add_test.cc | 2 +- .../lite/micro/kernels/all_ops_resolver.cc | 90 ++++---- .../kernels/arc_mli/conv_slicing_test.cc | 2 +- .../arc_mli/depthwise_conv_slicing_test.cc | 2 +- .../kernels/arc_mli/pooling_slicing_test.cc | 4 +- .../lite/micro/kernels/arg_min_max_test.cc | 4 +- tensorflow/lite/micro/kernels/ceil_test.cc | 2 +- .../lite/micro/kernels/comparisons_test.cc | 2 +- .../lite/micro/kernels/concatenation_test.cc | 4 +- tensorflow/lite/micro/kernels/conv_test.cc | 2 +- .../lite/micro/kernels/depthwise_conv_test.cc | 2 +- .../lite/micro/kernels/dequantize_test.cc | 3 +- .../lite/micro/kernels/elementwise_test.cc | 6 +- tensorflow/lite/micro/kernels/floor_test.cc | 2 +- .../micro/kernels/fully_connected_test.cc | 4 +- tensorflow/lite/micro/kernels/l2norm_test.cc | 2 +- tensorflow/lite/micro/kernels/logical_test.cc | 2 +- .../lite/micro/kernels/logistic_test.cc | 4 +- .../micro/kernels/maximum_minimum_test.cc | 6 +- tensorflow/lite/micro/kernels/mul_test.cc | 4 +- tensorflow/lite/micro/kernels/neg_test.cc | 2 +- tensorflow/lite/micro/kernels/pack_test.cc | 8 +- tensorflow/lite/micro/kernels/pad_test.cc | 4 +- tensorflow/lite/micro/kernels/pooling_test.cc | 8 +- tensorflow/lite/micro/kernels/prelu_test.cc | 4 +- .../lite/micro/kernels/quantize_test.cc | 2 +- tensorflow/lite/micro/kernels/reduce_test.cc | 2 +- tensorflow/lite/micro/kernels/reshape_test.cc | 2 +- .../kernels/resize_nearest_neighbor_test.cc | 2 +- tensorflow/lite/micro/kernels/round_test.cc | 2 +- tensorflow/lite/micro/kernels/softmax_test.cc | 6 +- tensorflow/lite/micro/kernels/split_test.cc | 8 +- .../lite/micro/kernels/strided_slice_test.cc | 2 +- tensorflow/lite/micro/kernels/sub_test.cc | 2 +- tensorflow/lite/micro/kernels/svdf_test.cc | 4 +- tensorflow/lite/micro/kernels/tanh_test.cc | 4 +- tensorflow/lite/micro/kernels/unpack_test.cc | 8 +- .../lite/micro/micro_interpreter_test.cc | 5 +- .../lite/micro/micro_mutable_op_resolver.h | 69 +++--- .../micro/micro_mutable_op_resolver_test.cc | 208 ++++-------------- tensorflow/lite/micro/micro_op_resolver.h | 67 +++++- 45 files changed, 262 insertions(+), 350 deletions(-) diff --git a/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc b/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc index 50401039265..ec4e93f9506 100644 --- a/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc +++ b/tensorflow/lite/micro/benchmarks/keyword_benchmark.cc @@ -53,7 +53,7 @@ class KeywordRunner { resolver_.AddBuiltin(tflite::BuiltinOperator_QUANTIZE, tflite::ops::micro::Register_QUANTIZE()); resolver_.AddBuiltin(tflite::BuiltinOperator_DEQUANTIZE, - tflite::ops::micro::Register_DEQUANTIZE(), 1, 2); + tflite::ops::micro::Register_DEQUANTIZE()); resolver_.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, tflite::ops::micro::Register_SOFTMAX()); interpreter_.AllocateTensors(); diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc index 6f10d5c3f27..3090356ee0d 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/main_functions.cc @@ -72,19 +72,18 @@ void setup() { // // 
tflite::ops::micro::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroMutableOpResolver<12> micro_op_resolver; - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), - 1, 3); + static tflite::MicroMutableOpResolver<5> micro_op_resolver; + micro_op_resolver.AddBuiltin( + tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D(), 1, 3); + tflite::ops::micro::Register_CONV_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D(), - 1, 2); + tflite::ops::micro::Register_AVERAGE_POOL_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, tflite::ops::micro::Register_RESHAPE()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX(), 1, 3); + tflite::ops::micro::Register_SOFTMAX()); // Build an interpreter to run the model with. // NOLINTNEXTLINE(runtime-global-variables) diff --git a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc index ea37faa15f2..ddec8951596 100644 --- a/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc +++ b/tensorflow/lite/micro/examples/person_detection_experimental/person_detection_test.cc @@ -52,19 +52,18 @@ TF_LITE_MICRO_TEST(TestInvoke) { // An easier approach is to just use the AllOpsResolver, but this will // incur some penalty in code space for op implementations that are not // needed by this graph. - tflite::MicroMutableOpResolver<11> micro_op_resolver; - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D(), - 1, 3); + tflite::MicroMutableOpResolver<5> micro_op_resolver; + micro_op_resolver.AddBuiltin( + tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_CONV_2D, - tflite::ops::micro::Register_CONV_2D(), 1, 3); + tflite::ops::micro::Register_CONV_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_AVERAGE_POOL_2D, - tflite::ops::micro::Register_AVERAGE_POOL_2D(), - 1, 2); + tflite::ops::micro::Register_AVERAGE_POOL_2D()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_RESHAPE, tflite::ops::micro::Register_RESHAPE()); micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX(), 1, 2); + tflite::ops::micro::Register_SOFTMAX()); // Build an interpreter to run the model with. 
tflite::MicroInterpreter interpreter(model, micro_op_resolver, tensor_arena, diff --git a/tensorflow/lite/micro/kernels/activations_test.cc b/tensorflow/lite/micro/kernels/activations_test.cc index 7c6a55f306c..008be1039e1 100644 --- a/tensorflow/lite/micro/kernels/activations_test.cc +++ b/tensorflow/lite/micro/kernels/activations_test.cc @@ -43,7 +43,7 @@ void TestReluFloat(const int* input_dims_data, const float* input_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_RELU, 1); + resolver.FindOp(tflite::BuiltinOperator_RELU); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; @@ -99,7 +99,7 @@ void TestRelu6Float(const int* input_dims_data, const float* input_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_RELU6, 1); + resolver.FindOp(tflite::BuiltinOperator_RELU6); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; @@ -160,7 +160,7 @@ void TestReluUint8(const int* input_dims_data, const float* input_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_RELU, 1); + resolver.FindOp(tflite::BuiltinOperator_RELU); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; @@ -225,7 +225,7 @@ void TestRelu6Uint8(const int* input_dims_data, const float* input_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_RELU6, 1); + resolver.FindOp(tflite::BuiltinOperator_RELU6); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; @@ -289,7 +289,7 @@ void TestReluInt8(const int* input_dims_data, const float* input_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_RELU, 1); + resolver.FindOp(tflite::BuiltinOperator_RELU); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; @@ -355,7 +355,7 @@ void TestRelu6Int8(const int* input_dims_data, const float* input_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_RELU6, 1); + resolver.FindOp(tflite::BuiltinOperator_RELU6); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; diff --git a/tensorflow/lite/micro/kernels/add_test.cc b/tensorflow/lite/micro/kernels/add_test.cc index 96c3aabeb8e..07dc25222b6 100644 --- a/tensorflow/lite/micro/kernels/add_test.cc +++ b/tensorflow/lite/micro/kernels/add_test.cc @@ -71,7 +71,7 @@ void ValidateAddGoldens(TfLiteTensor* tensors, int tensors_size, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(::tflite::BuiltinOperator_ADD, 1); + resolver.FindOp(::tflite::BuiltinOperator_ADD); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/all_ops_resolver.cc b/tensorflow/lite/micro/kernels/all_ops_resolver.cc index dee3cbe0664..e427511c1f2 100644 --- a/tensorflow/lite/micro/kernels/all_ops_resolver.cc +++ b/tensorflow/lite/micro/kernels/all_ops_resolver.cc @@ -18,65 +18,61 @@ namespace tflite { namespace ops { namespace micro { -// Register each supported op with: -// AddBuiltin(, , [min version], [max version]) AllOpsResolver::AllOpsResolver() { - 
AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(), 1, 4); - AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D(), 1, 2); - AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(), 1, 2); - AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC(), 1, 2); - AddBuiltin(BuiltinOperator_SVDF, Register_SVDF(), 1, 3); - AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D(), 1, 3); - AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION(), 1, 3); - AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D(), 1, - 3); - AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D(), 1, 2); + // Please keep this list of Builtin Operators in alphabetical order. AddBuiltin(BuiltinOperator_ABS, Register_ABS()); - AddBuiltin(BuiltinOperator_SIN, Register_SIN()); - AddBuiltin(BuiltinOperator_COS, Register_COS()); - AddBuiltin(BuiltinOperator_LOG, Register_LOG()); - AddBuiltin(BuiltinOperator_SQRT, Register_SQRT()); - AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT()); - AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE()); - AddBuiltin(BuiltinOperator_PRELU, Register_PRELU()); - AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR()); - AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM()); - AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM()); + AddBuiltin(BuiltinOperator_ADD, Register_ADD()); AddBuiltin(BuiltinOperator_ARG_MAX, Register_ARG_MAX()); AddBuiltin(BuiltinOperator_ARG_MIN, Register_ARG_MIN()); - AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR()); + AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D()); + AddBuiltin(BuiltinOperator_CEIL, Register_CEIL()); + AddBuiltin(BuiltinOperator_CONCATENATION, Register_CONCATENATION()); + AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D()); + AddBuiltin(BuiltinOperator_COS, Register_COS()); + AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D()); + AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE()); + AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL()); + AddBuiltin(BuiltinOperator_FLOOR, Register_FLOOR()); + AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED()); + AddBuiltin(BuiltinOperator_GREATER, Register_GREATER()); + AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL()); + AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION()); + AddBuiltin(BuiltinOperator_LESS, Register_LESS()); + AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL()); + AddBuiltin(BuiltinOperator_LOG, Register_LOG()); AddBuiltin(BuiltinOperator_LOGICAL_AND, Register_LOGICAL_AND()); AddBuiltin(BuiltinOperator_LOGICAL_NOT, Register_LOGICAL_NOT()); - AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE()); - AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL(), 1, 2); - AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL(), 1, 2); - AddBuiltin(BuiltinOperator_GREATER, Register_GREATER(), 1, 2); - AddBuiltin(BuiltinOperator_GREATER_EQUAL, Register_GREATER_EQUAL(), 1, 2); - AddBuiltin(BuiltinOperator_LESS, Register_LESS(), 1, 2); - AddBuiltin(BuiltinOperator_LESS_EQUAL, Register_LESS_EQUAL(), 1, 2); - AddBuiltin(BuiltinOperator_CEIL, Register_CEIL()); - AddBuiltin(BuiltinOperator_ROUND, Register_ROUND()); - AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE()); - AddBuiltin(BuiltinOperator_PACK, Register_PACK(), 1, 2); - AddBuiltin(BuiltinOperator_PAD, Register_PAD(), 1, 2); - AddBuiltin(BuiltinOperator_PADV2, Register_PADV2(), 1, 2); - AddBuiltin(BuiltinOperator_SPLIT, 
Register_SPLIT(), 1, 3); - AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK(), 1, 2); + AddBuiltin(BuiltinOperator_LOGICAL_OR, Register_LOGICAL_OR()); + AddBuiltin(BuiltinOperator_LOGISTIC, Register_LOGISTIC()); + AddBuiltin(BuiltinOperator_MAX_POOL_2D, Register_MAX_POOL_2D()); + AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM()); + AddBuiltin(BuiltinOperator_MEAN, Register_MEAN()); + AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM()); + AddBuiltin(BuiltinOperator_MUL, Register_MUL()); AddBuiltin(BuiltinOperator_NEG, Register_NEG()); - AddBuiltin(BuiltinOperator_ADD, Register_ADD(), 1, 2); - AddBuiltin(BuiltinOperator_MUL, Register_MUL(), 1, 3); - AddBuiltin(BuiltinOperator_SUB, Register_SUB(), 1, 2); + AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL()); + AddBuiltin(BuiltinOperator_PACK, Register_PACK()); + AddBuiltin(BuiltinOperator_PAD, Register_PAD()); + AddBuiltin(BuiltinOperator_PADV2, Register_PADV2()); + AddBuiltin(BuiltinOperator_PRELU, Register_PRELU()); AddBuiltin(BuiltinOperator_QUANTIZE, Register_QUANTIZE()); - AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(), 1, 2); AddBuiltin(BuiltinOperator_RELU, Register_RELU()); AddBuiltin(BuiltinOperator_RELU6, Register_RELU6()); - AddBuiltin(BuiltinOperator_MEAN, Register_MEAN()); + AddBuiltin(BuiltinOperator_RESHAPE, Register_RESHAPE()); AddBuiltin(BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, - Register_RESIZE_NEAREST_NEIGHBOR(), - /* min_version = */ 1, - /* max_version = */ 2); - AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION()); + Register_RESIZE_NEAREST_NEIGHBOR()); + AddBuiltin(BuiltinOperator_ROUND, Register_ROUND()); + AddBuiltin(BuiltinOperator_RSQRT, Register_RSQRT()); + AddBuiltin(BuiltinOperator_SIN, Register_SIN()); + AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX()); + AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT()); + AddBuiltin(BuiltinOperator_SQRT, Register_SQRT()); + AddBuiltin(BuiltinOperator_SQUARE, Register_SQUARE()); + AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE()); + AddBuiltin(BuiltinOperator_SUB, Register_SUB()); + AddBuiltin(BuiltinOperator_SVDF, Register_SVDF()); AddBuiltin(BuiltinOperator_TANH, Register_TANH()); + AddBuiltin(BuiltinOperator_UNPACK, Register_UNPACK()); } } // namespace micro diff --git a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc index 1accc919dd2..2c32b45bc26 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/conv_slicing_test.cc @@ -139,7 +139,7 @@ TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_CONV_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_CONV_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc index 2f528ea4e79..10f2f857c01 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/depthwise_conv_slicing_test.cc @@ -57,7 +57,7 @@ TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1); + 
resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); int input_depth = tensors[0].dims->data[3]; diff --git a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc index 7cf5c9b607e..e8667874120 100644 --- a/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc +++ b/tensorflow/lite/micro/kernels/arc_mli/pooling_slicing_test.cc @@ -62,7 +62,7 @@ void TestAveragePoolingQuantized( ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePoolParams builtin_data = {padding, stride_width, stride_height, @@ -133,7 +133,7 @@ void TestMaxPoolQuantized(const int* input_dims_data, const T* input_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePoolParams builtin_data = { diff --git a/tensorflow/lite/micro/kernels/arg_min_max_test.cc b/tensorflow/lite/micro/kernels/arg_min_max_test.cc index ea95b8adabf..2b4d22d0f4e 100644 --- a/tensorflow/lite/micro/kernels/arg_min_max_test.cc +++ b/tensorflow/lite/micro/kernels/arg_min_max_test.cc @@ -31,9 +31,9 @@ void ValidateArgMinMaxGoldens(TfLiteTensor* tensors, int tensors_size, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration; if (using_min) { - registration = resolver.FindOp(tflite::BuiltinOperator_ARG_MIN, 1); + registration = resolver.FindOp(tflite::BuiltinOperator_ARG_MIN); } else { - registration = resolver.FindOp(tflite::BuiltinOperator_ARG_MAX, 1); + registration = resolver.FindOp(tflite::BuiltinOperator_ARG_MAX); } TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/ceil_test.cc b/tensorflow/lite/micro/kernels/ceil_test.cc index ccd5c712fef..fc21b0141b5 100644 --- a/tensorflow/lite/micro/kernels/ceil_test.cc +++ b/tensorflow/lite/micro/kernels/ceil_test.cc @@ -39,7 +39,7 @@ void TestCeil(const int* input_dims_data, const float* input_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_CEIL, 1); + resolver.FindOp(tflite::BuiltinOperator_CEIL); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); int inputs_array_data[] = {1, 0}; diff --git a/tensorflow/lite/micro/kernels/comparisons_test.cc b/tensorflow/lite/micro/kernels/comparisons_test.cc index 8f7fb9263c0..731d9d778e7 100644 --- a/tensorflow/lite/micro/kernels/comparisons_test.cc +++ b/tensorflow/lite/micro/kernels/comparisons_test.cc @@ -37,7 +37,7 @@ void TestComparison(tflite::BuiltinOperator op, TfLiteTensor* tensors, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = resolver.FindOp(op, 1); + const TfLiteRegistration* registration = resolver.FindOp(op); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const int inputs_array_data[] = {2, 0, 1}; diff --git a/tensorflow/lite/micro/kernels/concatenation_test.cc b/tensorflow/lite/micro/kernels/concatenation_test.cc index d841d508c80..040cd43a400 100644 --- 
a/tensorflow/lite/micro/kernels/concatenation_test.cc +++ b/tensorflow/lite/micro/kernels/concatenation_test.cc @@ -49,7 +49,7 @@ void TestConcatenateTwoInputs(std::initializer_list input1_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_CONCATENATION, /* version */ 1); + resolver.FindOp(tflite::BuiltinOperator_CONCATENATION); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteConcatenationParams builtin_data = { @@ -111,7 +111,7 @@ void TestConcatenateQuantizedTwoInputs( ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_CONCATENATION, /* version */ 1); + resolver.FindOp(tflite::BuiltinOperator_CONCATENATION); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteConcatenationParams builtin_data = { diff --git a/tensorflow/lite/micro/kernels/conv_test.cc b/tensorflow/lite/micro/kernels/conv_test.cc index 6d5a6f55814..9b1add6d94a 100644 --- a/tensorflow/lite/micro/kernels/conv_test.cc +++ b/tensorflow/lite/micro/kernels/conv_test.cc @@ -60,7 +60,7 @@ TfLiteStatus ValidateConvGoldens(TfLiteTensor* tensors, int tensors_size, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_CONV_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_CONV_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/depthwise_conv_test.cc b/tensorflow/lite/micro/kernels/depthwise_conv_test.cc index c4a242f480e..07177452ed7 100644 --- a/tensorflow/lite/micro/kernels/depthwise_conv_test.cc +++ b/tensorflow/lite/micro/kernels/depthwise_conv_test.cc @@ -48,7 +48,7 @@ TfLiteStatus ValidateDepthwiseConvGoldens(const T* expected_output_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_DEPTHWISE_CONV_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); int input_depth = tensors[0].dims->data[3]; diff --git a/tensorflow/lite/micro/kernels/dequantize_test.cc b/tensorflow/lite/micro/kernels/dequantize_test.cc index 5831791248c..c90f745a778 100644 --- a/tensorflow/lite/micro/kernels/dequantize_test.cc +++ b/tensorflow/lite/micro/kernels/dequantize_test.cc @@ -32,9 +32,8 @@ void ValidateDequantizeGoldens(TfLiteTensor* tensors, int tensors_size, ::tflite::ops::micro::AllOpsResolver resolver; PopulateContext(tensors, tensors_size, micro_test::reporter, &context); - // Version 2 of dequantize supports int8 quantization. 
const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_DEQUANTIZE, 2); + resolver.FindOp(tflite::BuiltinOperator_DEQUANTIZE); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/elementwise_test.cc b/tensorflow/lite/micro/kernels/elementwise_test.cc index c1e807974dc..923739b5bb4 100644 --- a/tensorflow/lite/micro/kernels/elementwise_test.cc +++ b/tensorflow/lite/micro/kernels/elementwise_test.cc @@ -47,8 +47,7 @@ void TestElementwiseFloat(tflite::BuiltinOperator op, TfLiteContext context; PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = - resolver.FindOp(op, /* version= */ 1); + const TfLiteRegistration* registration = resolver.FindOp(op); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); void* user_data = nullptr; @@ -113,8 +112,7 @@ void TestElementwiseBool(tflite::BuiltinOperator op, TfLiteContext context; PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = - resolver.FindOp(op, /* version= */ 1); + const TfLiteRegistration* registration = resolver.FindOp(op); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); void* user_data = nullptr; diff --git a/tensorflow/lite/micro/kernels/floor_test.cc b/tensorflow/lite/micro/kernels/floor_test.cc index ab9cae36177..eea959d54ce 100644 --- a/tensorflow/lite/micro/kernels/floor_test.cc +++ b/tensorflow/lite/micro/kernels/floor_test.cc @@ -40,7 +40,7 @@ void TestFloor(const int* input_dims_data, const float* input_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_FLOOR, 1); + resolver.FindOp(tflite::BuiltinOperator_FLOOR); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); int inputs_array_data[] = {1, 0}; diff --git a/tensorflow/lite/micro/kernels/fully_connected_test.cc b/tensorflow/lite/micro/kernels/fully_connected_test.cc index a920ca3b132..b464d977385 100644 --- a/tensorflow/lite/micro/kernels/fully_connected_test.cc +++ b/tensorflow/lite/micro/kernels/fully_connected_test.cc @@ -55,7 +55,7 @@ TfLiteStatus TestFullyConnectedFloat( ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 1); + resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteFullyConnectedParams builtin_data = { @@ -135,7 +135,7 @@ TfLiteStatus TestFullyConnectedQuantized( ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED, 4); + resolver.FindOp(tflite::BuiltinOperator_FULLY_CONNECTED); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteFullyConnectedParams builtin_data = { diff --git a/tensorflow/lite/micro/kernels/l2norm_test.cc b/tensorflow/lite/micro/kernels/l2norm_test.cc index a4f2fff6a12..4b034b211db 100644 --- a/tensorflow/lite/micro/kernels/l2norm_test.cc +++ b/tensorflow/lite/micro/kernels/l2norm_test.cc @@ -106,7 +106,7 @@ void TestL2Normalization(const int* input_dims_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_L2_NORMALIZATION, 
1); + resolver.FindOp(tflite::BuiltinOperator_L2_NORMALIZATION); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteL2NormParams builtin_data = { diff --git a/tensorflow/lite/micro/kernels/logical_test.cc b/tensorflow/lite/micro/kernels/logical_test.cc index f6dc9608fa2..ec5503bc8f3 100644 --- a/tensorflow/lite/micro/kernels/logical_test.cc +++ b/tensorflow/lite/micro/kernels/logical_test.cc @@ -48,7 +48,7 @@ void TestLogicalOp(tflite::BuiltinOperator op, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = resolver.FindOp(op, 1); + const TfLiteRegistration* registration = resolver.FindOp(op); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteIntArray* inputs_array = IntArrayFromInitializer({2, 0, 1}); diff --git a/tensorflow/lite/micro/kernels/logistic_test.cc b/tensorflow/lite/micro/kernels/logistic_test.cc index 0cfef0704ac..6473858f9fb 100644 --- a/tensorflow/lite/micro/kernels/logistic_test.cc +++ b/tensorflow/lite/micro/kernels/logistic_test.cc @@ -45,7 +45,7 @@ void TestLogisticFloat(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_LOGISTIC, 1); + resolver.FindOp(tflite::BuiltinOperator_LOGISTIC); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; @@ -107,7 +107,7 @@ void TestLogisticInt8(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_LOGISTIC, 1); + resolver.FindOp(tflite::BuiltinOperator_LOGISTIC); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; diff --git a/tensorflow/lite/micro/kernels/maximum_minimum_test.cc b/tensorflow/lite/micro/kernels/maximum_minimum_test.cc index 17f13d4b0e9..c60ca906a5d 100644 --- a/tensorflow/lite/micro/kernels/maximum_minimum_test.cc +++ b/tensorflow/lite/micro/kernels/maximum_minimum_test.cc @@ -49,7 +49,7 @@ void TestMaxMinFloat(tflite::BuiltinOperator op, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = resolver.FindOp(op, 1); + const TfLiteRegistration* registration = resolver.FindOp(op); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteIntArray* inputs_array = IntArrayFromInitializer({2, 0, 1}); @@ -108,7 +108,7 @@ void TestMaxMinQuantized( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = resolver.FindOp(op, 1); + const TfLiteRegistration* registration = resolver.FindOp(op); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteIntArray* inputs_array = IntArrayFromInitializer({2, 0, 1}); @@ -165,7 +165,7 @@ void TestMaxMinQuantizedInt32( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; - const TfLiteRegistration* registration = resolver.FindOp(op, 1); + const TfLiteRegistration* registration = resolver.FindOp(op); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteIntArray* inputs_array = IntArrayFromInitializer({2, 0, 1}); diff --git a/tensorflow/lite/micro/kernels/mul_test.cc b/tensorflow/lite/micro/kernels/mul_test.cc index c740e57f4e2..446512a7c6c 100644 --- a/tensorflow/lite/micro/kernels/mul_test.cc +++ 
b/tensorflow/lite/micro/kernels/mul_test.cc @@ -49,7 +49,7 @@ void TestMulFloat(std::initializer_list input1_dims_data, TfLiteContext context; PopulateContext(tensors, tensors_size, micro_test::reporter, &context); const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_MUL, 1); + resolver.FindOp(tflite::BuiltinOperator_MUL); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); @@ -124,7 +124,7 @@ void TestMulQuantized(std::initializer_list input1_dims_data, TfLiteContext context; PopulateContext(tensors, tensors_size, micro_test::reporter, &context); const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_MUL, 1); + resolver.FindOp(tflite::BuiltinOperator_MUL); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/neg_test.cc b/tensorflow/lite/micro/kernels/neg_test.cc index ac2b79f8de1..9e8f0c842ea 100644 --- a/tensorflow/lite/micro/kernels/neg_test.cc +++ b/tensorflow/lite/micro/kernels/neg_test.cc @@ -43,7 +43,7 @@ void TestNegFloat(std::initializer_list input_dims_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_NEG, 1); + resolver.FindOp(tflite::BuiltinOperator_NEG); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); int inputs_array_data[] = {1, 0}; diff --git a/tensorflow/lite/micro/kernels/pack_test.cc b/tensorflow/lite/micro/kernels/pack_test.cc index b384cb78d4e..216cd615e4c 100644 --- a/tensorflow/lite/micro/kernels/pack_test.cc +++ b/tensorflow/lite/micro/kernels/pack_test.cc @@ -52,7 +52,7 @@ void TestPackTwoInputsFloat(std::initializer_list input1_dims_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_PACK, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_PACK); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePackParams builtin_data = { @@ -129,7 +129,7 @@ void TestPackThreeInputsFloat(std::initializer_list input1_dims_data, tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_PACK, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_PACK); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePackParams builtin_data = { @@ -202,7 +202,7 @@ void TestPackTwoInputsQuantized( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_PACK, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_PACK); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePackParams builtin_data = { @@ -272,7 +272,7 @@ void TestPackTwoInputsQuantized32( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_PACK, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_PACK); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePackParams builtin_data = { diff --git a/tensorflow/lite/micro/kernels/pad_test.cc b/tensorflow/lite/micro/kernels/pad_test.cc index 7248a9b2126..eac5b980837 100644 --- a/tensorflow/lite/micro/kernels/pad_test.cc +++ b/tensorflow/lite/micro/kernels/pad_test.cc @@ -32,7 +32,7 @@ TfLiteStatus 
ValidatePadGoldens(TfLiteTensor* tensors, int tensors_size, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_PAD, 1); + resolver.FindOp(tflite::BuiltinOperator_PAD); TF_LITE_ENSURE(&context, registration != nullptr); int inputs_array_data[] = {2, 0, 1}; @@ -69,7 +69,7 @@ TfLiteStatus ValidatePadV2Goldens(TfLiteTensor* tensors, int tensors_size, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_PADV2, 1); + resolver.FindOp(tflite::BuiltinOperator_PADV2); TF_LITE_ENSURE(&context, registration != nullptr); int inputs_array_data[] = {3, 0, 1, 2}; diff --git a/tensorflow/lite/micro/kernels/pooling_test.cc b/tensorflow/lite/micro/kernels/pooling_test.cc index 96dff421d53..1ac74fca644 100644 --- a/tensorflow/lite/micro/kernels/pooling_test.cc +++ b/tensorflow/lite/micro/kernels/pooling_test.cc @@ -51,7 +51,7 @@ void TestAveragePoolingFloat(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePoolParams builtin_data = {padding, stride_width, stride_height, @@ -125,7 +125,7 @@ void TestAveragePoolingQuantized( ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_AVERAGE_POOL_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePoolParams builtin_data = {padding, stride_width, stride_height, @@ -192,7 +192,7 @@ void TestMaxPoolFloat(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePoolParams builtin_data = { @@ -268,7 +268,7 @@ void TestMaxPoolQuantized(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D, 1); + resolver.FindOp(tflite::BuiltinOperator_MAX_POOL_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLitePoolParams builtin_data = { diff --git a/tensorflow/lite/micro/kernels/prelu_test.cc b/tensorflow/lite/micro/kernels/prelu_test.cc index 4199ae69689..1a41e75501d 100644 --- a/tensorflow/lite/micro/kernels/prelu_test.cc +++ b/tensorflow/lite/micro/kernels/prelu_test.cc @@ -46,7 +46,7 @@ void TestPreluFloat(std::initializer_list input_dims_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_PRELU, 1); + resolver.FindOp(tflite::BuiltinOperator_PRELU); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); size_t init_data_size = 0; @@ -113,7 +113,7 @@ void TestPreluQuantized(std::initializer_list input_dims_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - 
resolver.FindOp(tflite::BuiltinOperator_PRELU, 1); + resolver.FindOp(tflite::BuiltinOperator_PRELU); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); size_t init_data_size = 0; diff --git a/tensorflow/lite/micro/kernels/quantize_test.cc b/tensorflow/lite/micro/kernels/quantize_test.cc index 0364fbc57ec..3089357040c 100644 --- a/tensorflow/lite/micro/kernels/quantize_test.cc +++ b/tensorflow/lite/micro/kernels/quantize_test.cc @@ -35,7 +35,7 @@ void ValidateQuantizeGoldens(TfLiteTensor* tensors, int tensors_size, // Version 1 of quantize supports int8 and uint8 quantization. ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_QUANTIZE, 1); + resolver.FindOp(tflite::BuiltinOperator_QUANTIZE); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/reduce_test.cc b/tensorflow/lite/micro/kernels/reduce_test.cc index a791cdeaba6..965e45adb44 100644 --- a/tensorflow/lite/micro/kernels/reduce_test.cc +++ b/tensorflow/lite/micro/kernels/reduce_test.cc @@ -54,7 +54,7 @@ TfLiteStatus ValidateReduceGoldens(TfLiteTensor* tensors, int tensors_size, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_MEAN, 1); + resolver.FindOp(tflite::BuiltinOperator_MEAN); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/reshape_test.cc b/tensorflow/lite/micro/kernels/reshape_test.cc index b783c3c3e0f..7e7f58f0a03 100644 --- a/tensorflow/lite/micro/kernels/reshape_test.cc +++ b/tensorflow/lite/micro/kernels/reshape_test.cc @@ -62,7 +62,7 @@ void TestReshapeImpl(TfLiteTensor* input_tensor, TfLiteTensor* shape_tensor, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_RESHAPE, 1); + resolver.FindOp(tflite::BuiltinOperator_RESHAPE); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); void* user_data = nullptr; diff --git a/tensorflow/lite/micro/kernels/resize_nearest_neighbor_test.cc b/tensorflow/lite/micro/kernels/resize_nearest_neighbor_test.cc index 518eada70fb..72e7252b31b 100644 --- a/tensorflow/lite/micro/kernels/resize_nearest_neighbor_test.cc +++ b/tensorflow/lite/micro/kernels/resize_nearest_neighbor_test.cc @@ -71,7 +71,7 @@ void TestResizeNearestNeighbor(const int* input_dims_data, const T* input_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, 1); + resolver.FindOp(tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteResizeNearestNeighborParams builtin_data = { diff --git a/tensorflow/lite/micro/kernels/round_test.cc b/tensorflow/lite/micro/kernels/round_test.cc index e19ea41f2d9..10eb2759a4b 100644 --- a/tensorflow/lite/micro/kernels/round_test.cc +++ b/tensorflow/lite/micro/kernels/round_test.cc @@ -39,7 +39,7 @@ void TestRound(const int* input_dims_data, const float* input_data, PopulateContext(tensors, tensors_size, micro_test::reporter, &context); ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_ROUND, 1); + resolver.FindOp(tflite::BuiltinOperator_ROUND); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); int inputs_array_data[] = {1, 0}; diff --git a/tensorflow/lite/micro/kernels/softmax_test.cc 
b/tensorflow/lite/micro/kernels/softmax_test.cc index f229ab021f9..afd97f1f015 100644 --- a/tensorflow/lite/micro/kernels/softmax_test.cc +++ b/tensorflow/lite/micro/kernels/softmax_test.cc @@ -45,7 +45,7 @@ void TestSoftmaxFloat(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1); + resolver.FindOp(tflite::BuiltinOperator_SOFTMAX); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSoftmaxParams builtin_data = {1.0f}; @@ -111,7 +111,7 @@ void TestSoftmaxQuantized(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1); + resolver.FindOp(tflite::BuiltinOperator_SOFTMAX); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSoftmaxParams builtin_data = {1.0f}; @@ -177,7 +177,7 @@ void TestSoftmaxQuantizedSigned( ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SOFTMAX, 1); + resolver.FindOp(tflite::BuiltinOperator_SOFTMAX); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSoftmaxParams builtin_data = {1.0f}; diff --git a/tensorflow/lite/micro/kernels/split_test.cc b/tensorflow/lite/micro/kernels/split_test.cc index 59be78f2960..807929ca4d5 100644 --- a/tensorflow/lite/micro/kernels/split_test.cc +++ b/tensorflow/lite/micro/kernels/split_test.cc @@ -65,7 +65,7 @@ void TestSplitTwoOutputsFloat( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SPLIT, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_SPLIT); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSplitParams builtin_data = { @@ -168,7 +168,7 @@ void TestSplitFourOutputsFloat( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SPLIT, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_SPLIT); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSplitParams builtin_data = { @@ -266,7 +266,7 @@ void TestSplitTwoOutputsQuantized( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SPLIT, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_SPLIT); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSplitParams builtin_data = { @@ -355,7 +355,7 @@ void TestSplitTwoOutputsQuantized32( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SPLIT, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_SPLIT); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSplitParams builtin_data = { diff --git a/tensorflow/lite/micro/kernels/strided_slice_test.cc b/tensorflow/lite/micro/kernels/strided_slice_test.cc index 75732d8860d..a36a90b858e 100644 --- a/tensorflow/lite/micro/kernels/strided_slice_test.cc +++ b/tensorflow/lite/micro/kernels/strided_slice_test.cc @@ -88,7 +88,7 @@ void TestStrideSlide(std::initializer_list input_shape, ::tflite::ops::micro::AllOpsResolver resolver; const 
TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_STRIDED_SLICE, 1); + resolver.FindOp(tflite::BuiltinOperator_STRIDED_SLICE); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteStridedSliceParams builtin_data = {begin_mask, end_mask, ellipsis_mask, new_axis_mask, shrink_axis_mask}; diff --git a/tensorflow/lite/micro/kernels/sub_test.cc b/tensorflow/lite/micro/kernels/sub_test.cc index e59ac636f65..e4875c3bd1e 100644 --- a/tensorflow/lite/micro/kernels/sub_test.cc +++ b/tensorflow/lite/micro/kernels/sub_test.cc @@ -71,7 +71,7 @@ void ValidateSubGoldens(TfLiteTensor* tensors, int tensors_size, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(::tflite::BuiltinOperator_SUB, 1); + resolver.FindOp(::tflite::BuiltinOperator_SUB); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); diff --git a/tensorflow/lite/micro/kernels/svdf_test.cc b/tensorflow/lite/micro/kernels/svdf_test.cc index 179cafd152c..fead3ab2fab 100644 --- a/tensorflow/lite/micro/kernels/svdf_test.cc +++ b/tensorflow/lite/micro/kernels/svdf_test.cc @@ -175,7 +175,7 @@ void ValidateSVDFGoldens(const int batch_size, const int num_units, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SVDF, 1); + resolver.FindOp(tflite::BuiltinOperator_SVDF); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSVDFParams params; @@ -250,7 +250,7 @@ void ValidateIntegerSVDFGoldens(const int batch_size, const int num_units, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_SVDF, 1); + resolver.FindOp(tflite::BuiltinOperator_SVDF); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteSVDFParams params; diff --git a/tensorflow/lite/micro/kernels/tanh_test.cc b/tensorflow/lite/micro/kernels/tanh_test.cc index 2a367107771..517d8a9c4a8 100644 --- a/tensorflow/lite/micro/kernels/tanh_test.cc +++ b/tensorflow/lite/micro/kernels/tanh_test.cc @@ -45,7 +45,7 @@ void TestTanhFloat(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_TANH, 1); + resolver.FindOp(tflite::BuiltinOperator_TANH); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; @@ -107,7 +107,7 @@ void TestTanhInt8(std::initializer_list input_dims_data, ::tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_TANH, 1); + resolver.FindOp(tflite::BuiltinOperator_TANH); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); const char* init_data = nullptr; diff --git a/tensorflow/lite/micro/kernels/unpack_test.cc b/tensorflow/lite/micro/kernels/unpack_test.cc index 86ccab8edc0..015c92c208a 100644 --- a/tensorflow/lite/micro/kernels/unpack_test.cc +++ b/tensorflow/lite/micro/kernels/unpack_test.cc @@ -67,7 +67,7 @@ void TestUnpackThreeOutputsFloat( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_UNPACK, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_UNPACK); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteUnpackParams builtin_data = { @@ -144,7 +144,7 @@ void TestUnpackOneOutputFloat(std::initializer_list input_dims_data, PopulateContext(tensors, tensors_size, micro_test::reporter, 
&context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_UNPACK, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_UNPACK); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteUnpackParams builtin_data = { @@ -236,7 +236,7 @@ void TestUnpackThreeOutputsQuantized( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_UNPACK, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_UNPACK); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteUnpackParams builtin_data = { @@ -332,7 +332,7 @@ void TestUnpackThreeOutputsQuantized32( PopulateContext(tensors, tensors_size, micro_test::reporter, &context); tflite::ops::micro::AllOpsResolver resolver; const TfLiteRegistration* registration = - resolver.FindOp(tflite::BuiltinOperator_UNPACK, /* version= */ 1); + resolver.FindOp(tflite::BuiltinOperator_UNPACK); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TfLiteUnpackParams builtin_data = { diff --git a/tensorflow/lite/micro/micro_interpreter_test.cc b/tensorflow/lite/micro/micro_interpreter_test.cc index 2358f763bc0..aa25593c508 100644 --- a/tensorflow/lite/micro/micro_interpreter_test.cc +++ b/tensorflow/lite/micro/micro_interpreter_test.cc @@ -151,11 +151,10 @@ class MockCustom { class MockOpResolver : public MicroOpResolver { public: - const TfLiteRegistration* FindOp(BuiltinOperator op, - int version) const override { + const TfLiteRegistration* FindOp(BuiltinOperator op) const override { return nullptr; } - const TfLiteRegistration* FindOp(const char* op, int version) const override { + const TfLiteRegistration* FindOp(const char* op) const override { if (strcmp(op, "mock_custom") == 0) { return MockCustom::getRegistration(); } else if (strcmp(op, "simple_stateful_op") == 0) { diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index 88ec1133c9f..54de9e0e518 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -26,9 +26,9 @@ limitations under the License. namespace tflite { -// Op versions discussed in this file are enumerated here: -// tensorflow/lite/tools/versioning/op_version.cc - +// TODO(b/151245712) TODO(b/149408647): remove any version once we no longer +// support op versions in the API or we switch most users and AllOpsResolver to +// the new selective registration API, whichever seems more appropriate. 
inline int MicroOpResolverAnyVersion() { return 0; } template @@ -37,28 +37,23 @@ class MicroMutableOpResolver : public MicroOpResolver { explicit MicroMutableOpResolver(ErrorReporter* error_reporter = nullptr) : error_reporter_(error_reporter) {} - const TfLiteRegistration* FindOp(tflite::BuiltinOperator op, - int version) const override { + const TfLiteRegistration* FindOp(tflite::BuiltinOperator op) const override { + if (op == BuiltinOperator_CUSTOM) return nullptr; + for (unsigned int i = 0; i < registrations_len_; ++i) { const TfLiteRegistration& registration = registrations_[i]; - if ((registration.builtin_code == op) && - (registration.version == MicroOpResolverAnyVersion() || - version == MicroOpResolverAnyVersion() || - registration.version == version)) { + if (registration.builtin_code == op) { return ®istration; } } return nullptr; } - const TfLiteRegistration* FindOp(const char* op, int version) const override { + const TfLiteRegistration* FindOp(const char* op) const override { for (unsigned int i = 0; i < registrations_len_; ++i) { const TfLiteRegistration& registration = registrations_[i]; if ((registration.builtin_code == BuiltinOperator_CUSTOM) && - (strcmp(registration.custom_name, op) == 0) && - (registration.version == MicroOpResolverAnyVersion() || - version == MicroOpResolverAnyVersion() || - registration.version == version)) { + (strcmp(registration.custom_name, op) == 0)) { return ®istration; } } @@ -83,6 +78,17 @@ class MicroMutableOpResolver : public MicroOpResolver { } return kTfLiteError; } + + if (FindOp(op) != nullptr) { + if (error_reporter_ != nullptr) { + TF_LITE_REPORT_ERROR(error_reporter_, + "Registering multiple versions of the same op is " + "not supported (Op: #%d, version: %d).", + op, version); + } + return kTfLiteError; + } + TfLiteRegistration* new_registration = ®istrations_[registrations_len_]; registrations_len_ += 1; @@ -93,20 +99,9 @@ class MicroMutableOpResolver : public MicroOpResolver { return kTfLiteOk; } - TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, - TfLiteRegistration* registration, int min_version, - int max_version) { - for (int version = min_version; version <= max_version; ++version) { - TfLiteStatus add_status = AddBuiltin(op, registration, version); - if (add_status != kTfLiteOk) { - return add_status; - } - } - return kTfLiteOk; - } - TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration, int version = 1) { + printf("registrations_len_: %d\n", registrations_len_); if (registrations_len_ >= tOpCount) { if (error_reporter_) { TF_LITE_REPORT_ERROR( @@ -116,6 +111,17 @@ class MicroMutableOpResolver : public MicroOpResolver { } return kTfLiteError; } + + if (FindOp(name) != nullptr) { + if (error_reporter_ != nullptr) { + TF_LITE_REPORT_ERROR(error_reporter_, + "Registering multiple versions of the same op is " + "not supported (Op: %s, version: %d).", + name, version); + } + return kTfLiteError; + } + TfLiteRegistration* new_registration = ®istrations_[registrations_len_]; registrations_len_ += 1; @@ -127,17 +133,6 @@ class MicroMutableOpResolver : public MicroOpResolver { return kTfLiteOk; } - TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration, - int min_version, int max_version) { - for (int version = min_version; version <= max_version; ++version) { - TfLiteStatus add_status = AddCustom(name, registration, version); - if (add_status != kTfLiteOk) { - return add_status; - } - } - return kTfLiteOk; - } - unsigned int GetRegistrationLength() { return registrations_len_; } private: 
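[Editorial aside, not part of the patch: a minimal sketch of the version-less resolver API that the header above now exposes. The include paths, the Register_CONV_2D() helper, and the template capacity of 2 are assumptions taken from the surrounding diffs, not guaranteed by this change.]

#include "tensorflow/lite/micro/kernels/micro_ops.h"
#include "tensorflow/lite/micro/micro_mutable_op_resolver.h"

TfLiteStatus RegisterOpsSketch() {
  // The template capacity now counts distinct ops, not op/version pairs.
  static tflite::MicroMutableOpResolver<2> resolver;
  // The trailing version argument on AddBuiltin is optional and ignored.
  TfLiteStatus status = resolver.AddBuiltin(
      tflite::BuiltinOperator_CONV_2D, tflite::ops::micro::Register_CONV_2D());
  if (status != kTfLiteOk) return status;
  // A second AddBuiltin for CONV_2D would return kTfLiteError regardless of
  // the version passed, because duplicate registrations are now rejected.
  // Lookups drop the version parameter as well:
  const TfLiteRegistration* conv =
      resolver.FindOp(tflite::BuiltinOperator_CONV_2D);
  return (conv != nullptr) ? kTfLiteOk : kTfLiteError;
}

Per the TODO comments in the resolver headers, the version parameters that remain on AddBuiltin/AddCustom exist only to avoid breaking legacy callers and are slated for removal.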
diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc index 6b0c9974874..82f1c4bf1db 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc +++ b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc @@ -47,6 +47,8 @@ class MockErrorReporter : public ErrorReporter { bool HasBeenCalled() { return has_been_called_; } + void ResetState() { has_been_called_ = false; } + private: bool has_been_called_; TF_LITE_REMOVE_VIRTUAL_DELETE @@ -66,134 +68,52 @@ TF_LITE_MICRO_TEST(TestOperations) { static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, tflite::MockPrepare, tflite::MockInvoke}; - // We need space for 7 operators because of 2 ops, one with 3 versions, one - // with 4 versions. - MicroMutableOpResolver<7> micro_op_resolver; - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( - BuiltinOperator_CONV_2D, &r, 1, 3)); + MicroMutableOpResolver<2> micro_op_resolver; + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r, 1)); + + // Only one AddBuiltin per operator should return kTfLiteOk, regardless of + // what the version parameter is. + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddBuiltin( + BuiltinOperator_CONV_2D, &r, 1)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddBuiltin( + BuiltinOperator_CONV_2D, &r, 2)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, - micro_op_resolver.AddCustom("mock_custom", &r, 1, 4)); - OpResolver* resolver = µ_op_resolver; + micro_op_resolver.AddCustom("mock_custom", &r, 2)); + + // Only one AddCustom per operator should return kTfLiteOk, regardless of + // what the version parameter is. + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, + micro_op_resolver.AddCustom("mock_custom", &r, 2)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, + micro_op_resolver.AddCustom("mock_custom", &r, 1)); + + tflite::MicroOpResolver* resolver = µ_op_resolver; const TfLiteRegistration* registration = - resolver->FindOp(BuiltinOperator_CONV_2D, 1); + resolver->FindOp(BuiltinOperator_CONV_2D); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); - TF_LITE_MICRO_EXPECT_EQ(7, micro_op_resolver.GetRegistrationLength()); - - registration = resolver->FindOp(BuiltinOperator_CONV_2D, 10); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); - - registration = resolver->FindOp(BuiltinOperator_RELU, 0); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); - - registration = resolver->FindOp("mock_custom", 1); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); - - registration = resolver->FindOp("mock_custom", 10); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); - - registration = resolver->FindOp("nonexistent_custom", 0); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); -} - -TF_LITE_MICRO_TEST(TestOpRegistrationOverflow) { - using tflite::BuiltinOperator_CONV_2D; - using tflite::BuiltinOperator_RELU; - using tflite::MicroMutableOpResolver; - - static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, - tflite::MockPrepare, tflite::MockInvoke}; - - MicroMutableOpResolver<4> 
micro_op_resolver; - // Register 7 ops, but only 4 is expected because the class is created with - // that limit.. - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( - BuiltinOperator_CONV_2D, &r, 0, 2)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, - micro_op_resolver.AddCustom("mock_custom", &r, 0, 3)); - - TF_LITE_MICRO_EXPECT_EQ(4, micro_op_resolver.GetRegistrationLength()); -} - -TF_LITE_MICRO_TEST(TestZeroVersionRegistration) { - using tflite::MicroMutableOpResolver; - using tflite::OpResolver; - - static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, - tflite::MockPrepare, tflite::MockInvoke}; - - MicroMutableOpResolver<1> micro_op_resolver; - micro_op_resolver.AddCustom("mock_custom", &r, - tflite::MicroOpResolverAnyVersion()); - - TF_LITE_MICRO_EXPECT_EQ(1, micro_op_resolver.GetRegistrationLength()); - - OpResolver* resolver = µ_op_resolver; - - const TfLiteRegistration* registration = resolver->FindOp("mock_custom", 0); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); - - registration = resolver->FindOp("mock_custom", 1); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); - - registration = resolver->FindOp("mock_custom", 42); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); -} - -TF_LITE_MICRO_TEST(TestZeroModelVersion) { - using tflite::MicroMutableOpResolver; - using tflite::OpResolver; - - static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, - tflite::MockPrepare, tflite::MockInvoke}; - - MicroMutableOpResolver<2> micro_op_resolver; - micro_op_resolver.AddCustom("mock_custom", &r, 1, 2); TF_LITE_MICRO_EXPECT_EQ(2, micro_op_resolver.GetRegistrationLength()); - OpResolver* resolver = µ_op_resolver; - // If the Op version in the model is 0, we should always get the first - // registration. - const TfLiteRegistration* registration = resolver->FindOp("mock_custom", 0); + registration = resolver->FindOp(BuiltinOperator_RELU); + TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); + + registration = resolver->FindOp("mock_custom"); TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - TF_LITE_MICRO_EXPECT_EQ(1, registration->version); TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); - // If a non-zero version is requested, the correct version'd op should be - // returned. TODO(b/151245712): Realistically, we are better off removing - // these version checks altogether. 
- for (int i = 1; i <= 2; ++i) { - registration = resolver->FindOp("mock_custom", i); - TF_LITE_MICRO_EXPECT_NE(nullptr, registration); - TF_LITE_MICRO_EXPECT_EQ(i, registration->version); - TF_LITE_MICRO_EXPECT_EQ(nullptr, registration->init(nullptr, nullptr, 0)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(nullptr, nullptr)); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(nullptr, nullptr)); - } - - registration = resolver->FindOp("mock_custom", 42); + registration = resolver->FindOp("nonexistent_custom"); TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); } -TF_LITE_MICRO_TEST(TestBuiltinRegistrationErrorReporting) { +TF_LITE_MICRO_TEST(TestErrorReporting) { using tflite::BuiltinOperator_CONV_2D; using tflite::BuiltinOperator_RELU; using tflite::MicroMutableOpResolver; @@ -202,71 +122,31 @@ TF_LITE_MICRO_TEST(TestBuiltinRegistrationErrorReporting) { tflite::MockPrepare, tflite::MockInvoke}; tflite::MockErrorReporter mock_reporter; - MicroMutableOpResolver<1> micro_op_resolver(&mock_reporter); + MicroMutableOpResolver<2> micro_op_resolver(&mock_reporter); TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + mock_reporter.ResetState(); + TF_LITE_MICRO_EXPECT_EQ( kTfLiteOk, micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r)); TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, micro_op_resolver.AddBuiltin(BuiltinOperator_RELU, &r)); - TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); -} + mock_reporter.ResetState(); -TF_LITE_MICRO_TEST(TestCustomRegistrationErrorReporting) { - using tflite::BuiltinOperator_CONV_2D; - using tflite::BuiltinOperator_RELU; - using tflite::MicroMutableOpResolver; - - static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, - tflite::MockPrepare, tflite::MockInvoke}; - - tflite::MockErrorReporter mock_reporter; - MicroMutableOpResolver<1> micro_op_resolver(&mock_reporter); - TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddCustom("mock_custom_0", &r)); TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + mock_reporter.ResetState(); + + // Attempting to Add more operators than the class template parameter for + // MicroMutableOpResolver should result in errors. 
+ TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, micro_op_resolver.AddBuiltin(BuiltinOperator_RELU, &r)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); + mock_reporter.ResetState(); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddCustom("mock_custom_1", &r)); TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); -} - -TF_LITE_MICRO_TEST(TestBuiltinVersionRegistrationErrorReporting) { - using tflite::BuiltinOperator_CONV_2D; - using tflite::BuiltinOperator_RELU; - using tflite::MicroMutableOpResolver; - - static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, - tflite::MockPrepare, tflite::MockInvoke}; - - tflite::MockErrorReporter mock_reporter; - MicroMutableOpResolver<2> micro_op_resolver(&mock_reporter); - TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( - BuiltinOperator_CONV_2D, &r, 1, 2)); - TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); - TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddBuiltin( - BuiltinOperator_RELU, &r, 1, 2)); - TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); -} - -TF_LITE_MICRO_TEST(TestCustomVersionRegistrationErrorReporting) { - using tflite::BuiltinOperator_CONV_2D; - using tflite::BuiltinOperator_RELU; - using tflite::MicroMutableOpResolver; - - static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, - tflite::MockPrepare, tflite::MockInvoke}; - - tflite::MockErrorReporter mock_reporter; - MicroMutableOpResolver<2> micro_op_resolver(&mock_reporter); - TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteOk, micro_op_resolver.AddCustom("mock_custom_0", &r, 1, 2)); - TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); - TF_LITE_MICRO_EXPECT_EQ( - kTfLiteError, micro_op_resolver.AddCustom("mock_custom_1", &r, 1, 2)); - TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); + mock_reporter.ResetState(); } TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/micro/micro_op_resolver.h b/tensorflow/lite/micro/micro_op_resolver.h index 64a3c85cc78..68035b3256e 100644 --- a/tensorflow/lite/micro/micro_op_resolver.h +++ b/tensorflow/lite/micro/micro_op_resolver.h @@ -24,11 +24,14 @@ limitations under the License. namespace tflite { // This is an interface for the OpResolver for TFLiteMicro. The differences from -// the TFLite OpResolver base class are to allow for finer grained registration -// of the Builtin Ops to reduce code size for TFLiteMicro. We need an interface -// class instead of directly using MicroMutableOpResolver because -// MicroMutableOpResolver is a class template with the number of registered Ops -// as the template parameter. +// the TFLite OpResolver base class are to: +// * explicitly remove support for Op versions +// * allow for finer grained registration of the Builtin Ops to reduce code +// size for TFLiteMicro. +// +// We need an interface class instead of directly using MicroMutableOpResolver +// because MicroMutableOpResolver is a class template with the number of +// registered Ops as the template parameter. class MicroOpResolver : public OpResolver { public: // TODO(b/149408647): The op_type parameter enables a gradual transfer to @@ -41,15 +44,59 @@ class MicroOpResolver : public OpResolver { BuiltinDataAllocator* allocator, void** builtin_data); - // Returns the operator specific parsing function for the OpData for a - // BuiltinOperator (if registered), else nullptr. 
- virtual BuiltinParseFunction GetOpDataParser( - tflite::BuiltinOperator op) const = 0; - + // Registers a Builtin Operator with the MicroOpResolver. + // + // Note that the version parameter is ignored and only a first call for a + // given BuiltinOperator enum will be successful. i.e. if this function is + // called again for a previously added BuiltinOperator (even with a different + // version parameter), the MicroOpResolver will be unchanged and this function + // will return kTfLiteError. + // + // TODO(b/151245712): The version param is kept to avoid breaking the legacy + // API but it should be removed eventually. virtual TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration, int version) = 0; + // Registers a Custom Operator with the MicroOpResolver. + // + // Note that the version parameter is ignored and only a first call for a + // given name will be successful. i.e. if this function is called again for a + // previously added Custom Operator (even with a different version parameter), + // the MicroOpResolver will be unchanged and this function will return + // kTfLiteError. + // + // TODO(b/151245712): The version param is kept to avoid breaking the legacy + // API but it should be removed eventually. + TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration, + int version); + + // Returns the Op registration struct corresponding to the enum code from the + // flatbuffer schema. Returns nullptr if the op is not found or if op == + // BuiltinOperator_CUSTOM. + virtual const TfLiteRegistration* FindOp(BuiltinOperator op) const = 0; + + // Returns the Op registration struct corresponding to the custom operator by + // name. + virtual const TfLiteRegistration* FindOp(const char* op) const = 0; + + // This implementation exists for compatibility with the OpResolver base class + // and disregards the version parameter. + const TfLiteRegistration* FindOp(BuiltinOperator op, + int version) const final { + return FindOp(op); + } + + // This implementation exists for compatibility with the OpResolver base class + // and disregards the version parameter. + const TfLiteRegistration* FindOp(const char* op, int version) const final { + return FindOp(op); + } + + // Returns the operator specific parsing function for the OpData for a + // BuiltinOperator (if registered), else nullptr. + virtual BuiltinParseFunction GetOpDataParser(BuiltinOperator op) const = 0; + ~MicroOpResolver() override {} }; From 48d6fcc90a06fea09220e71680ba9863d94b8213 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 18:30:52 -0700 Subject: [PATCH 1432/1533] Update ops-related pbtxt files. 
PiperOrigin-RevId: 313885945 Change-Id: Ia1b7a0a7f9a0670da9726b943de03971a9da5a75 --- .../ops_history_v2/CollectiveBcastRecv.pbtxt | 52 ++++++++++++ .../ops_history_v2/CollectiveBcastSend.pbtxt | 56 ++++++++++++ .../ops_history_v2/CollectiveGather.pbtxt | 55 ++++++++++++ .../ops_history_v2/CollectiveReduce.pbtxt | 85 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 28 ++++++ 5 files changed, 276 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history_v2/CollectiveBcastRecv.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CollectiveBcastRecv.pbtxt index 55935ccca72..5ab41c40b73 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/CollectiveBcastRecv.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/CollectiveBcastRecv.pbtxt @@ -80,3 +80,55 @@ op { } is_stateful: true } +op { + name: "CollectiveBcastRecv" + output_arg { + name: "data" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BOOL + type: DT_FLOAT + type: DT_HALF + type: DT_DOUBLE + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "group_size" + type: "int" + } + attr { + name: "group_key" + type: "int" + } + attr { + name: "instance_key" + type: "int" + } + attr { + name: "shape" + type: "shape" + } + attr { + name: "communication_hint" + type: "string" + default_value { + s: "auto" + } + } + attr { + name: "timeout_seconds" + type: "float" + default_value { + f: 0 + } + } + is_stateful: true +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/CollectiveBcastSend.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CollectiveBcastSend.pbtxt index 9c67349c01f..35734c2a611 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/CollectiveBcastSend.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/CollectiveBcastSend.pbtxt @@ -88,3 +88,59 @@ op { } is_stateful: true } +op { + name: "CollectiveBcastSend" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "data" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_BOOL + type: DT_FLOAT + type: DT_HALF + type: DT_DOUBLE + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "group_size" + type: "int" + } + attr { + name: "group_key" + type: "int" + } + attr { + name: "instance_key" + type: "int" + } + attr { + name: "shape" + type: "shape" + } + attr { + name: "communication_hint" + type: "string" + default_value { + s: "auto" + } + } + attr { + name: "timeout_seconds" + type: "float" + default_value { + f: 0 + } + } + is_stateful: true +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/CollectiveGather.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CollectiveGather.pbtxt index 23326568a79..c093e30dceb 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/CollectiveGather.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/CollectiveGather.pbtxt @@ -87,3 +87,58 @@ op { } is_stateful: true } +op { + name: "CollectiveGather" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "data" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + type: DT_DOUBLE + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "group_size" + type: "int" + } + attr { + name: "group_key" + type: "int" + } + attr { + name: "instance_key" + type: "int" + } + attr { + name: "shape" + type: "shape" + } + attr { + name: "communication_hint" + type: "string" + default_value { + s: "auto" + } + } + attr { + name: "timeout_seconds" + type: "float" + 
default_value { + f: 0 + } + } + is_stateful: true +} diff --git a/tensorflow/core/ops/compat/ops_history_v2/CollectiveReduce.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CollectiveReduce.pbtxt index 25b1485e3e1..14f4b1073a0 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/CollectiveReduce.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/CollectiveReduce.pbtxt @@ -210,3 +210,88 @@ op { } is_stateful: true } +op { + name: "CollectiveReduce" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "data" + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_HALF + type: DT_DOUBLE + type: DT_INT32 + type: DT_INT64 + } + } + } + attr { + name: "group_size" + type: "int" + } + attr { + name: "group_key" + type: "int" + } + attr { + name: "instance_key" + type: "int" + } + attr { + name: "merge_op" + type: "string" + allowed_values { + list { + s: "Min" + s: "Max" + s: "Mul" + s: "Add" + } + } + } + attr { + name: "final_op" + type: "string" + allowed_values { + list { + s: "Id" + s: "Div" + } + } + } + attr { + name: "subdiv_offsets" + type: "list(int)" + } + attr { + name: "wait_for" + type: "list(int)" + default_value { + list { + } + } + } + attr { + name: "communication_hint" + type: "string" + default_value { + s: "auto" + } + } + attr { + name: "timeout_seconds" + type: "float" + default_value { + f: 0 + } + } + is_stateful: true +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 9324181d287..ea7d21a5b2b 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -7075,6 +7075,13 @@ op { s: "auto" } } + attr { + name: "timeout_seconds" + type: "float" + default_value { + f: 0 + } + } is_stateful: true } op { @@ -7124,6 +7131,13 @@ op { s: "auto" } } + attr { + name: "timeout_seconds" + type: "float" + default_value { + f: 0 + } + } is_stateful: true } op { @@ -7172,6 +7186,13 @@ op { s: "auto" } } + attr { + name: "timeout_seconds" + type: "float" + default_value { + f: 0 + } + } is_stateful: true } op { @@ -7290,6 +7311,13 @@ op { s: "auto" } } + attr { + name: "timeout_seconds" + type: "float" + default_value { + f: 0 + } + } is_stateful: true } op { From 341bb725559e8b5984242b4af323572a5fb1a110 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 18:42:57 -0700 Subject: [PATCH 1433/1533] Restore `with_select_tf_ops` build flag. This remains useful for testing and development. Restore the ability to inject support for TF ops in TFLite using `--define=with_select_tf_ops=true`. See also issue #34277. PiperOrigin-RevId: 313887137 Change-Id: Ia7c737b76705d5718895311c9694ffd91164040b --- tensorflow/lite/BUILD | 9 --------- tensorflow/lite/build_def.bzl | 24 +---------------------- tensorflow/lite/g3doc/guide/ops_select.md | 7 ++----- 3 files changed, 3 insertions(+), 37 deletions(-) diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 19d45423988..810f3ab1a2b 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -56,15 +56,6 @@ config_setting( visibility = ["//visibility:public"], ) -# Enables inclusion of select TensorFlow kernels via the TFLite Flex delegate -# when building TFLite shared libraries. -# WARNING: This build flag is experimental and subject to change. 
-config_setting( - name = "with_select_tf_ops", - define_values = {"with_select_tf_ops": "true"}, - visibility = ["//visibility:public"], -) - TFLITE_DEFAULT_COPTS = if_not_windows([ "-Wall", "-Wno-comment", diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index 9850cbedd9a..fd51ad0a4aa 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -110,26 +110,6 @@ def tflite_jni_linkopts(): """Defines linker flags to reduce size of TFLite binary with JNI.""" return tflite_jni_linkopts_unstripped() + tflite_symbol_opts() -def maybe_flex_deps(deps): - """Returns necessary flex deps when with_select_tf_ops build flag is used - - Args: - deps: The source deps for the target (to avoid deps duplication). - Returns: - A list of additional flex deps required, based on the build flags used. - If with_select_tf_ops is not true, this will be an empty list. - """ - - # Filter redundant flex deps if already provided. - flex_dep = clean_dep("//tensorflow/lite/delegates/flex:delegate") - if type(deps) == type([]) and flex_dep in deps: - return [] - - return select({ - clean_dep("//tensorflow/lite:with_select_tf_ops"): [flex_dep], - "//conditions:default": [], - }) - def tflite_jni_binary( name, copts = tflite_copts(), @@ -159,7 +139,7 @@ def tflite_jni_binary( copts = copts, linkshared = linkshared, linkstatic = linkstatic, - deps = deps + [linkscript, exported_symbols] + maybe_flex_deps(deps), + deps = deps + [linkscript, exported_symbols], srcs = srcs, tags = tags, linkopts = linkopts, @@ -169,7 +149,6 @@ def tflite_jni_binary( def tflite_cc_shared_object( name, copts = tflite_copts(), - deps = [], linkopts = [], linkstatic = 1, per_os_targets = False, @@ -181,7 +160,6 @@ def tflite_cc_shared_object( linkstatic = linkstatic, linkopts = linkopts + tflite_jni_linkopts(), framework_so = [], - deps = deps + maybe_flex_deps(deps), per_os_targets = per_os_targets, **kwargs ) diff --git a/tensorflow/lite/g3doc/guide/ops_select.md b/tensorflow/lite/g3doc/guide/ops_select.md index 3a867cdc619..8a9109cf54c 100644 --- a/tensorflow/lite/g3doc/guide/ops_select.md +++ b/tensorflow/lite/g3doc/guide/ops_select.md @@ -212,11 +212,8 @@ TensorFlow ops library can be included and enabled as follows: * Enable monolithic builds if necessary by adding the `--config=monolithic` build flag. -* Do one of the following: - * Include the `--define=with_select_tf_ops=true` build flag in the `bazel - build` invocation when building the TensorFlow Lite binary. - * Add the TensorFlow ops delegate library dependency to the build - dependencies: `tensorflow/lite/delegates/flex:delegate`. +* Add the TensorFlow ops delegate library dependency to the build + dependencies: `tensorflow/lite/delegates/flex:delegate`. Note that the necessary `TfLiteDelegate` will be installed automatically when creating the interpreter at runtime as long as the delegate is linked into the From c14a71fc41d6961d66d3d0a71a2da2a29b93e43d Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 29 May 2020 18:43:16 -0700 Subject: [PATCH 1434/1533] Uniformly import and export _Arg node/FunctionDef arg attributes. In a Function Graph (Graph generated from a Function/FunctionDef), it is possible to have other attributes on the generated _Arg nodes. These attributes are either modeled as fields in FunctionDef ('_resource_arg_unique_id' attributes are stored as FunctionDef::map resource_arg_unique_id) or explicitly in FunctionDef::map arg_attr. 
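As a rough illustration of where those two containers live in the proto (a sketch only, using the Python protobuf bindings; the argument name below is just an example):

```python
from tensorflow.core.framework import function_pb2

fdef = function_pb2.FunctionDef()
# Resource-variable aliasing info has a dedicated map keyed by argument index.
fdef.resource_arg_unique_id[0] = 0
# Every other per-argument attribute is stored in arg_attr, also keyed by
# argument index; each entry is an ArgAttrs message wrapping a name->AttrValue map.
fdef.arg_attr[0].attr["_user_specified_name"].s = b"x"
```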
When converting a FunctionDef to a Graph (in import), these attributes are added to generated _Arg node attributes. Some of these attributes should be preserved for downstream users. Currently only '_resource_arg_unique_id' is being imported with special handling. This change unifies and imports any _Arg attribute that is not a shape inference based attribute or _Arg op def attribute. On export, attributes of the 'tf' dialect ('tf.' prefix) are added back. For the main function Graph, the attributes are simply added back to the generated _Arg node. For other functions, as a FunctionDef is created instead, '_resource_arg_unique_id' is handled differently, specifically adding it's content to FunctionDef::map resource_arg_unique_id while all other attribute are added to FunctionDef::map arg_attr. PiperOrigin-RevId: 313887160 Change-Id: Ibf3b7e27c29cee435c8c188bdb8533a641ceef68 --- .../analysis/side_effect_analysis.cc | 2 +- .../graphdef2mlir/graph-as-function.pbtxt | 2 +- .../graph-function-resource-args.pbtxt | 6 +- .../mlir2graphdef/function-resource-args.mlir | 4 +- .../tests/side-effect-analysis-test.mlir | 6 +- .../tensorflow/tests/tf_saved_model/basic.py | 2 +- .../tests/tf_saved_model/call_to_exported.py | 4 +- .../tensorflow/tests/tf_saved_model/keras.py | 2 +- .../tests/tf_saved_model/structured_input.py | 34 ++++---- .../tensorflow/translate/export_graphdef.cc | 50 +++++++----- .../mlir/tensorflow/translate/import_model.cc | 79 +++++++++---------- 11 files changed, 98 insertions(+), 93 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index ff1620347f7..f7b88317cd4 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -49,7 +49,7 @@ namespace TF { namespace { constexpr int64_t kUnknownResourceId = -1; -constexpr char kResourceArgUniqueIdAttr[] = "tf.resource_arg_unique_id"; +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; // Returns if a VarHandleOp is anonymous, which means it always creates a new // variable. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt index d26585edb03..03640e24aac 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -13,7 +13,7 @@ # CHECK: %[[ISLAND_2:.*]], %[[ISLAND_2_control:.*]] = tf_executor.island wraps "tf.StatefulPartitionedCall" # CHECK-SAME: f = @[[FUNC:[a-z0-9]*]] # CHECK: tf_executor.fetch %[[ISLAND_1]], %[[ISLAND_2]] : tensor<*xf32>, tensor<*xf32> -# CHECK: func @[[FUNC]](%arg0: tensor<*xf32>, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> +# CHECK: func @[[FUNC]](%arg0: tensor<*xf32> {tf._user_specified_name = "inputs"}, %arg1: tensor<*x!tf.resource>) -> tensor<*xf32> node { name: "args_0" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt index 0e6e561225d..eb358d52b26 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-resource-args.pbtxt @@ -81,7 +81,7 @@ library { } # Check that the `resource_arg_unique_id` for each argument is propagated to the -# `tf.resource_arg_unique_id` argument attribute of the function +# `tf._resource_arg_unique_id` argument attribute of the function # @test_func_name0. # CHECK: func @main @@ -92,8 +92,8 @@ library { # CHECK: tf_executor.fetch # CHECK: return # CHECK: func @test_func_name0 -# CHECK-SAME: tf.resource_arg_unique_id = 0 -# CHECK-SAME: tf.resource_arg_unique_id = 0 +# CHECK-SAME: tf._resource_arg_unique_id = 0 +# CHECK-SAME: tf._resource_arg_unique_id = 0 # CHECK: tf_executor.graph # CHECK: tf_executor.fetch # CHECK: return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir index 680e26f5cbb..44824ea1424 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-resource-args.mlir @@ -8,14 +8,14 @@ func @main() -> tensor<*x!tf.resource> attributes {tf.entry_function = {inputs = } return %0 : tensor<*x!tf.resource> } -func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf.resource_arg_unique_id = 0 : i64}, %arg1: tensor<*x!tf.resource> {tf.resource_arg_unique_id = 0 : i64}) -> tensor<*x!tf.resource> attributes {tf._disable_call_shape_inference = true} { +func @test_func_name0(%arg0: tensor<*x!tf.resource> {tf._resource_arg_unique_id = 0 : i64}, %arg1: tensor<*x!tf.resource> {tf._resource_arg_unique_id = 0 : i64}) -> tensor<*x!tf.resource> attributes {tf._disable_call_shape_inference = true} { %0 = tf_executor.graph { tf_executor.fetch %arg0 : tensor<*x!tf.resource> } return %0 : tensor<*x!tf.resource> } -// Check that the `tf.resource_arg_unique_id` argument attributes of +// Check that the `tf._resource_arg_unique_id` argument attributes of // test_func_name0 are propagated to the function's arg_attr and // resource_arg_unique_id. 
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir index c36771c0576..965b3b10843 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir @@ -786,9 +786,9 @@ func @tf_registry_ops( // CHECK-LABEL: func @arguments_with_unique_ids func @arguments_with_unique_ids( // expected-remark@above {{ID: 9}} - %arg0: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 0 : i64}, - %arg1: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 0 : i64}, - %arg2: tensor<*x!tf.resource>> {tf.resource_arg_unique_id = 33 : i64}) { + %arg0: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 0 : i64}, + %arg1: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 0 : i64}, + %arg2: tensor<*x!tf.resource>> {tf._resource_arg_unique_id = 33 : i64}) { tf_executor.graph { // expected-remark@above {{ID: 7}} // expected-remark@above {{Successors: {8}}} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py index 78c18a17d4a..b337224e680 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/basic.py @@ -45,7 +45,7 @@ class TestModule(tf.Module): # CHECK: "tf_saved_model.global_tensor"() {is_mutable, sym_name = "[[VAR:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = ["v42"], type = tensor, value = dense<4.200000e+01> : tensor} : () -> () # CHECK: "tf_saved_model.global_tensor"() {sym_name = "[[CONST:[a-zA-Z_0-9]+]]", tf_saved_model.exported_names = [], type = tensor, value = dense<4.300000e+01> : tensor} : () -> () # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor>> {tf_saved_model.bound_input = @[[VAR]]}, # CHECK-SAME: %arg2: tensor>> {tf_saved_model.bound_input = @[[CONST]]}) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = []}) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py index 658cc37a22f..694942f4b00 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/call_to_exported.py @@ -45,7 +45,7 @@ class TestModule(tf.Module): # modify signatures interprocedurally). 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, @@ -54,7 +54,7 @@ class TestModule(tf.Module): # CHECK: "tf.StatefulPartitionedCall"{{.*}}f = @[[CALLEE_INTERNAL:[a-zA-Z_0-9]+]] # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, # CHECK-SAME: %arg1: tensor> {tf_saved_model.bound_input = {{@[a-zA-Z_0-9]+}}} # CHECK-SAME: ) -> ( # CHECK-SAME: tensor {tf_saved_model.index_path = [0]}, diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py index a95909b61ef..ffb5c024bbb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/keras.py @@ -39,7 +39,7 @@ class TestModule(tf.Module): super(TestModule, self).__init__() self.model = mnist_model() - # CHECK: func {{@[a-zA-Z_0-9]+}}(%arg0: tensor<1x28x28x1xf32> {tf_saved_model.index_path = [0]} + # CHECK: func {{@[a-zA-Z_0-9]+}}(%arg0: tensor<1x28x28x1xf32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0]} # CHECK: attributes {{.*}} tf_saved_model.exported_names = ["my_predict"] @tf.function(input_signature=[ tf.TensorSpec([1, 28, 28, 1], tf.float32), diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py index 095fddbda96..43591d12183 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/structured_input.py @@ -36,8 +36,8 @@ class TestModule(tf.Module): # The outer layer of the index path indexes into the arguments. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [1]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "x", tf_saved_model.index_path = [0]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "y", tf_saved_model.index_path = [1]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0000_function_arity"] @tf.function(input_signature=[ tf.TensorSpec([1], tf.float32), @@ -49,8 +49,8 @@ class TestModule(tf.Module): # Check index paths for lists. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor {tf_saved_model.index_path = [0, 0]}, - # CHECK-SAME: %arg1: tensor {tf_saved_model.index_path = [0, 1]}) + # CHECK-SAME: %arg0: tensor {tf._user_specified_name = "l", tf_saved_model.index_path = [0, 0]}, + # CHECK-SAME: %arg1: tensor {tf._user_specified_name = "l", tf_saved_model.index_path = [0, 1]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0001_list_2_elements"] @tf.function(input_signature=[[ tf.TensorSpec([], tf.float32), @@ -63,8 +63,8 @@ class TestModule(tf.Module): # Keys are linearized in sorted order, matching `tf.nest.flatten`. 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0002_dict_2_keys"] @tf.function(input_signature=[{ 'x': tf.TensorSpec([1], tf.float32), @@ -77,8 +77,8 @@ class TestModule(tf.Module): # The index path should be insensitive to the key order. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0003_dict_2_keys_out_of_order"] @tf.function(input_signature=[{ 'y': tf.TensorSpec([2], tf.float32), @@ -90,12 +90,12 @@ class TestModule(tf.Module): # Slightly stronger stress test of multiple dict keys. # # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "a"]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "b"]}, - # CHECK-SAME: %arg2: tensor<3xf32> {tf_saved_model.index_path = [0, "c"]}, - # CHECK-SAME: %arg3: tensor<4xf32> {tf_saved_model.index_path = [0, "x"]}, - # CHECK-SAME: %arg4: tensor<5xf32> {tf_saved_model.index_path = [0, "y"]}, - # CHECK-SAME: %arg5: tensor<6xf32> {tf_saved_model.index_path = [0, "z"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "a"]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "b"]}, + # CHECK-SAME: %arg2: tensor<3xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "c"]}, + # CHECK-SAME: %arg3: tensor<4xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x"]}, + # CHECK-SAME: %arg4: tensor<5xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}, + # CHECK-SAME: %arg5: tensor<6xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "z"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0004_dict_many_keys"] @tf.function(input_signature=[{ 'x': tf.TensorSpec([4], tf.float32), @@ -112,9 +112,9 @@ class TestModule(tf.Module): # Note that list elements can have heterogenous types. 
# # CHECK: func {{@[a-zA-Z_0-9]+}}( - # CHECK-SAME: %arg0: tensor<1xf32> {tf_saved_model.index_path = [0, "x", 0]}, - # CHECK-SAME: %arg1: tensor<2xf32> {tf_saved_model.index_path = [0, "x", 1]}, - # CHECK-SAME: %arg2: tensor<3xf32> {tf_saved_model.index_path = [0, "y"]}) + # CHECK-SAME: %arg0: tensor<1xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x", 0]}, + # CHECK-SAME: %arg1: tensor<2xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "x", 1]}, + # CHECK-SAME: %arg2: tensor<3xf32> {tf._user_specified_name = "d", tf_saved_model.index_path = [0, "y"]}) # CHECK-SAME: attributes {{.*}} tf_saved_model.exported_names = ["f0005_more_complex_recursive_structure"] @tf.function(input_signature=[{ 'x': [tf.TensorSpec([1], tf.float32), diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 2bf55922d4b..8e51f8c9a25 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -70,7 +70,6 @@ using llvm::isa; using mlir::BlockArgument; using mlir::Dialect; using mlir::Operation; -using mlir::OperationState; using mlir::Value; using stream_executor::port::StatusOr; @@ -79,6 +78,9 @@ namespace { constexpr char kInvalidExecutorGraphMsg[] = "Functions must be of a single Graph with single op Islands: "; +constexpr char kDeviceAttr[] = "tf.device"; +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; + bool IsLegalChar(char c, bool first_char) { if (isalpha(c)) return true; if (isdigit(c)) return true; @@ -267,17 +269,14 @@ StatusOr> Exporter::GetArgumentNode( (*node_def->mutable_attr())["index"] = index_attr; if (auto device_attr = - func.getArgAttrOfType(index, "tf.device")) { + func.getArgAttrOfType(index, kDeviceAttr)) *node_def->mutable_device() = device_attr.getValue().str(); - } - if (auto resource_arg_unique_id_attr = - func.getArgAttrOfType( - index, "tf.resource_arg_unique_id")) { - AttrValue unique_id_attr; - unique_id_attr.set_i(resource_arg_unique_id_attr.getInt()); - (*node_def->mutable_attr())["_resource_arg_unique_id"] = unique_id_attr; - } + llvm::ArrayRef func_arg_i_attrs = + func.getArgAttrs(index); + absl::flat_hash_set attrs_to_ignore = {kDeviceAttr}; + TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, + node_def->mutable_attr())); return node_def; } @@ -682,14 +681,6 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, if (auto attr = function.getAttrOfType(stateful_string)) { func_def.mutable_signature()->set_is_stateful(true); } - for (int64 i = 0; i < function.getNumArguments(); ++i) { - if (auto resource_arg_unique_id_attr = - function.getArgAttrOfType( - i, "tf.resource_arg_unique_id")) { - (*func_def.mutable_resource_arg_unique_id())[i] = - resource_arg_unique_id_attr.getInt(); - } - } // Ignore the gradient and is_stateful attribute on the function as they have // been handled above. 
@@ -699,7 +690,28 @@ Status Exporter::ConvertLibFunction(const GraphExportConfig& configs, function.getDialectAttrs()); TF_RETURN_IF_ERROR( ConvertAttributes(funcAttrs, attrs_to_ignore, func_def.mutable_attr())); - (*flib->add_function()) = func_def; + + for (int i = 0, e = function.getNumArguments(); i < e; ++i) { + if (auto resource_arg_unique_id_attr = + function.getArgAttrOfType( + i, kResourceArgUniqueIdAttr)) { + (*func_def.mutable_resource_arg_unique_id())[i] = + resource_arg_unique_id_attr.getInt(); + } + + llvm::ArrayRef func_arg_i_attrs = + function.getArgAttrs(i); + if (func_arg_i_attrs.empty()) continue; + absl::flat_hash_set attrs_to_ignore = { + kDeviceAttr, kResourceArgUniqueIdAttr}; + FunctionDef::ArgAttrs func_def_arg_i_attrs; + TF_RETURN_IF_ERROR(ConvertAttributes(func_arg_i_attrs, attrs_to_ignore, + func_def_arg_i_attrs.mutable_attr())); + if (func_def_arg_i_attrs.attr().empty()) continue; + (*func_def.mutable_arg_attr())[i] = std::move(func_def_arg_i_attrs); + } + + (*flib->add_function()) = std::move(func_def); return Status::OK(); } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index bd63a3b224f..24a1d40a8bb 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -42,6 +42,7 @@ limitations under the License. #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project @@ -132,6 +133,13 @@ bool IsOutputShapesAttribute(const AttrValue& attr_value, attr_value.value_case() == AttrValue::kList; } +bool IsResourceOutputShapesAttribute(const AttrValue& attr_value, + llvm::StringRef attr_name) { + if (attr_name == "_handle_dtypes" || attr_name == "_handle_shapes") + return attr_value.value_case() == AttrValue::kList; + return false; +} + // This class is used to generate new MLIR function name strings that are both // unique in the TF function library `flib_` and unique among the name strings // generated by the class object during its lifetime. @@ -195,15 +203,11 @@ class ImporterBase { StatusOr InferLibFunctionType(const FunctionBody& fbody); // Extracts arg and ret nodes from FunctionBody. - // `resource_arg_unique_ids` will be filled with the unique IDs of resource - // variables, as a list of {index, ID} pairs. void GetArgsAndRetsFromFunctionBody( const FunctionBody& fbody, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes, - absl::InlinedVector* control_ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids); + absl::InlinedVector* control_ret_nodes); // Prepares converting the graph to an MLIR module. This step removes the // backedges of the graph, orders the nodes and infers the shapes. @@ -217,8 +221,7 @@ class ImporterBase { const absl::InlinedVector& ret_nodes, const absl::InlinedVector& control_ret_nodes, llvm::ArrayRef attrs, - const absl::InlinedVector, 4>& - resource_arg_unique_ids); + bool function_graph); // Finds out the function definition for the given function name from the // graph and converts it to a function of the module. 
This method is called @@ -1192,9 +1195,7 @@ StatusOr ImporterBase::ConvertAttributeValue( void ImporterBase::GetArgsAndRetsFromFunctionBody( const FunctionBody& fbody, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes, - absl::InlinedVector* control_ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids) { + absl::InlinedVector* control_ret_nodes) { arg_nodes->reserve(fbody.arg_nodes.size()); ret_nodes->reserve(fbody.ret_nodes.size()); for (auto arg : fbody.arg_nodes) { @@ -1203,9 +1204,6 @@ void ImporterBase::GetArgsAndRetsFromFunctionBody( for (auto ret : fbody.ret_nodes) { ret_nodes->emplace_back(ret, 0); } - for (const auto& entry : fbody.fdef.resource_arg_unique_id()) { - resource_arg_unique_ids->push_back(entry); - } *control_ret_nodes = fbody.control_ret_nodes; } @@ -1300,14 +1298,13 @@ Status ImporterBase::ConvertLibFunction(llvm::StringRef func_name) { absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; absl::InlinedVector control_ret_nodes; - absl::InlinedVector, 4> resource_arg_unique_ids; GetArgsAndRetsFromFunctionBody(*fbody, &arg_nodes, &ret_nodes, - &control_ret_nodes, &resource_arg_unique_ids); + &control_ret_nodes); TF_RETURN_IF_ERROR(child_importer.Convert( mlir_func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, llvm::makeArrayRef(attributes.begin(), attributes.end()), - resource_arg_unique_ids)); + /*function_graph=*/true)); return Status::OK(); } @@ -1409,9 +1406,7 @@ Status ImporterBase::Convert( const absl::InlinedVector& arg_nodes, const absl::InlinedVector& ret_nodes, const absl::InlinedVector& control_ret_nodes, - llvm::ArrayRef attrs, - const absl::InlinedVector, 4>& - resource_arg_unique_ids) { + llvm::ArrayRef attrs, bool function_graph) { // TODO(b/122040776): Uses debug info for FunctionDef. auto function = mlir::FuncOp::create(mlir::UnknownLoc::get(context_), func_name, func_type, attrs); @@ -1437,10 +1432,6 @@ Status ImporterBase::Convert( TF_RETURN_IF_ERROR(ConvertFunctionArgAndRets(function, graph, func_type.getInputs(), arg_nodes, ret_nodes, control_ret_nodes)); - for (const auto& entry : resource_arg_unique_ids) { - function.setArgAttr(entry.first, "tf.resource_arg_unique_id", - builder_.getI64IntegerAttr(entry.second)); - } // TODO(jpienaar): Update post removing shape_refinier_. if (!specs_.enable_shape_inference) { @@ -1499,6 +1490,22 @@ Status ImporterBase::ConvertFunctionArgAndRets( i, "tf.device", builder_.getStringAttr(arg_node.node->requested_device())); + if (arg_node.node->IsArg()) { + for (const auto& arg_node_attr : arg_node.node->attrs()) { + const auto& key = arg_node_attr.first; + // Only import attributes starting with an underscore. + if (key.empty() || key[0] != '_') continue; + // Ignore shape inference attributes as shape information is already + // populated in the result type. + if (IsOutputShapesAttribute(arg_node_attr.second, key) || + IsResourceOutputShapesAttribute(arg_node_attr.second, key)) + continue; + TF_ASSIGN_OR_RETURN(auto converted_attr, + ConvertAttributeValue(arg_node_attr.second)); + func.setArgAttr(i, llvm::formatv("tf.{0}", key).str(), converted_attr); + } + } + island->dropAllReferences(); island->erase(); } @@ -2108,14 +2115,10 @@ class GraphDefImporter : public ImporterBase { // output nodes, for function graphs. Arguments and return values are // determined by node op type. Type and shape information of the function are // inferred by the shape refiner in ImporterBase. 
- // `resource_arg_unique_ids` will be filled with the unique IDs of resource - // variables, as a list of {index, ID} pairs. StatusOr GetArgsRetsAndTypesFromFunctionGraph( mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids); + absl::InlinedVector* ret_nodes); // Finds the graph's target nodes/function's control ret nodes based on // supplied node names in `control_outputs`. If `control_outputs` are not @@ -2143,7 +2146,6 @@ StatusOr GraphDefImporter::Convert( absl::InlinedVector arg_nodes; absl::InlinedVector ret_nodes; absl::InlinedVector control_ret_nodes; - absl::InlinedVector, 4> resource_arg_unique_ids; llvm::SmallVector attrs; if (specs.graph_as_function) { if (specs.prune_unused_nodes || !specs.inputs.empty() || @@ -2152,10 +2154,9 @@ StatusOr GraphDefImporter::Convert( "Pruning of graph is currently unsupported when the main graph is " "converted to a function."); - TF_ASSIGN_OR_RETURN( - func_type, - importer.GetArgsRetsAndTypesFromFunctionGraph( - context, &arg_nodes, &ret_nodes, &resource_arg_unique_ids)); + TF_ASSIGN_OR_RETURN(func_type, + importer.GetArgsRetsAndTypesFromFunctionGraph( + context, &arg_nodes, &ret_nodes)); TF_RETURN_IF_ERROR(importer.GetControlRetsFromGraph(specs.control_outputs, &control_ret_nodes)); @@ -2223,7 +2224,7 @@ StatusOr GraphDefImporter::Convert( TF_RETURN_IF_ERROR(importer.ImporterBase::Convert( func_name, func_type, arg_nodes, ret_nodes, control_ret_nodes, attrs, - resource_arg_unique_ids)); + specs.graph_as_function)); return module; } @@ -2340,9 +2341,7 @@ StatusOr GraphDefImporter::InferMainFunctionType( StatusOr GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, - absl::InlinedVector* ret_nodes, - absl::InlinedVector, 4>* - resource_arg_unique_ids) { + absl::InlinedVector* ret_nodes) { auto add_node = [](Node* node, absl::InlinedVector* nodes) { auto* attr = node->attrs().Find("index"); if (!attr) @@ -2383,12 +2382,6 @@ GraphDefImporter::GetArgsRetsAndTypesFromFunctionGraph( TF_ASSIGN_OR_RETURN(auto type, InferOutputType(*arg_node.node, /*idx=*/0, builder)); arg_types.push_back(type); - tensorflow::int64 resource_arg_unique_id; - if (TryGetNodeAttr(arg_node.node->attrs(), "_resource_arg_unique_id", - &resource_arg_unique_id)) { - resource_arg_unique_ids->emplace_back(arg_node_and_idx.index(), - resource_arg_unique_id); - } } llvm::SmallVector ret_types; From 4c6501e0f0ea7f984d5c40204ecd965bbaf63cb5 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Fri, 29 May 2020 18:54:59 -0700 Subject: [PATCH 1435/1533] migrate broadcast mul to use binary broadcast fivefold PiperOrigin-RevId: 313888158 Change-Id: I48451f178577fe8ad685a0fffcd6710d756a1ffa --- .../internal/optimized/integer_ops/mul.h | 77 +------------------ 1 file changed, 4 insertions(+), 73 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h index 0d385ec1656..15c3d291ec3 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h @@ -18,6 +18,7 @@ limitations under the License. 
#include "ruy/profiler/instrumentation.h" // from @ruy #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/mul.h" #include "tensorflow/lite/kernels/internal/types.h" @@ -236,77 +237,6 @@ inline void Mul(const ArithmeticParams& params, MulElementwise(flat_size, params, input1_data, input2_data, output_data); } -inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, - const RuntimeShape& unswitched_input1_shape, - const int8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const int8* unswitched_input2_data, - const RuntimeShape& output_shape, - int8* output_data) { - ruy::profiler::ScopeLabel label("BroadcastMulFivefoldInt8/8bit"); - - ArithmeticParams switched_params = unswitched_params; - switched_params.input1_offset = unswitched_params.input2_offset; - switched_params.input2_offset = unswitched_params.input1_offset; - - const bool use_unswitched = - unswitched_params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const ArithmeticParams& params = - use_unswitched ? unswitched_params : switched_params; - const int8* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const int8* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise Mul of - // sections of the arrays. - int8* output_data_ptr = output_data; - const int8* input1_data_ptr = input1_data; - const int8* input2_data_reset = input2_data; - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - for (int i0 = 0; i0 < y0; ++i0) { - const int8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - input1_data_ptr += y4; - } - } - input2_data_reset = input2_data_ptr; - } - } else { - for (int i0 = 0; i0 < y0; ++i0) { - const int8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - MulSimpleBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y3; - output_data_ptr += y3; - ++input1_data_ptr; - } - } - input2_data_reset = input2_data_ptr; - } - } -} - inline void BroadcastMulDispatch(const ArithmeticParams& params, const RuntimeShape& input1_shape, const int8* input1_data, @@ -320,8 +250,9 @@ inline void BroadcastMulDispatch(const ArithmeticParams& params, output_shape, output_data); } - BroadcastMulFivefold(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + optimized_ops::BinaryBroadcastFiveFold( + params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, MulElementwise, MulSimpleBroadcast); } } // namespace optimized_integer_ops From 
d017c5dfc1ba345077356ea42ba118af405167f9 Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Fri, 29 May 2020 21:59:25 -0700 Subject: [PATCH 1436/1533] Fixes `assertAllEqual()` function in framework/test_util.py such that the function has the originally intended behavior without breaking PY3 compatibility. PiperOrigin-RevId: 313901131 Change-Id: Id50755e777728e93c65a47c596cc1a39acdd0ae5 --- tensorflow/python/framework/test_util.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 1adec3d68fd..aa52bbd8726 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -2713,8 +2713,26 @@ class TensorFlowTestCase(googletest.TestCase): x, y = a, b msgs.append("not equal lhs = %r" % x) msgs.append("not equal rhs = %r" % y) - # With Python 3, we need to make sure the dtype matches between a and b. - b = b.astype(a.dtype) + + # Handle mixed string types as a result of PY2to3 migration. That is, the + # mixing between bytes (b-prefix strings, PY2 default) and unicodes + # (u-prefix strings, PY3 default). + if six.PY3: + if (a.dtype.kind != b.dtype.kind and + {a.dtype.kind, b.dtype.kind}.issubset({"U", "S", "O"})): + a_list = [] + b_list = [] + # OK to flatten `a` and `b` because they are guaranteed to have the + # same shape. + for out_list, flat_arr in [(a_list, a.flat), (b_list, b.flat)]: + for item in flat_arr: + if isinstance(item, str): + out_list.append(item.encode("utf-8")) + else: + out_list.append(item) + a = np.array(a_list) + b = np.array(b_list) + np.testing.assert_array_equal(a, b, err_msg="\n".join(msgs)) @py_func_if_in_function From da8af6adef95e985c0c0dea87e894ab92331172e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 29 May 2020 22:12:58 -0700 Subject: [PATCH 1437/1533] Fixes `assertAllEqual()` function in framework/test_util.py such that the function has the originally intended behavior without breaking PY3 compatibility. PiperOrigin-RevId: 313902146 Change-Id: I3f9337ee4b58fdeb01fc08d3f49cbec7d3022d3e --- tensorflow/python/framework/test_util.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index aa52bbd8726..1adec3d68fd 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -2713,26 +2713,8 @@ class TensorFlowTestCase(googletest.TestCase): x, y = a, b msgs.append("not equal lhs = %r" % x) msgs.append("not equal rhs = %r" % y) - - # Handle mixed string types as a result of PY2to3 migration. That is, the - # mixing between bytes (b-prefix strings, PY2 default) and unicodes - # (u-prefix strings, PY3 default). - if six.PY3: - if (a.dtype.kind != b.dtype.kind and - {a.dtype.kind, b.dtype.kind}.issubset({"U", "S", "O"})): - a_list = [] - b_list = [] - # OK to flatten `a` and `b` because they are guaranteed to have the - # same shape. - for out_list, flat_arr in [(a_list, a.flat), (b_list, b.flat)]: - for item in flat_arr: - if isinstance(item, str): - out_list.append(item.encode("utf-8")) - else: - out_list.append(item) - a = np.array(a_list) - b = np.array(b_list) - + # With Python 3, we need to make sure the dtype matches between a and b. 
+ b = b.astype(a.dtype) np.testing.assert_array_equal(a, b, err_msg="\n".join(msgs)) @py_func_if_in_function From e2dfc382e6be58fff6ee6d0969f8925e531ac998 Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Fri, 29 May 2020 22:55:37 -0700 Subject: [PATCH 1438/1533] Fixes `assertAllEqual()` function in framework/test_util.py such that the function has the originally intended behavior without breaking PY3 compatibility. PiperOrigin-RevId: 313906279 Change-Id: I267a8310e0bad813a7dbcf28f5e14b9f4cd66203 --- tensorflow/python/framework/test_util.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 1adec3d68fd..aa52bbd8726 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -2713,8 +2713,26 @@ class TensorFlowTestCase(googletest.TestCase): x, y = a, b msgs.append("not equal lhs = %r" % x) msgs.append("not equal rhs = %r" % y) - # With Python 3, we need to make sure the dtype matches between a and b. - b = b.astype(a.dtype) + + # Handle mixed string types as a result of PY2to3 migration. That is, the + # mixing between bytes (b-prefix strings, PY2 default) and unicodes + # (u-prefix strings, PY3 default). + if six.PY3: + if (a.dtype.kind != b.dtype.kind and + {a.dtype.kind, b.dtype.kind}.issubset({"U", "S", "O"})): + a_list = [] + b_list = [] + # OK to flatten `a` and `b` because they are guaranteed to have the + # same shape. + for out_list, flat_arr in [(a_list, a.flat), (b_list, b.flat)]: + for item in flat_arr: + if isinstance(item, str): + out_list.append(item.encode("utf-8")) + else: + out_list.append(item) + a = np.array(a_list) + b = np.array(b_list) + np.testing.assert_array_equal(a, b, err_msg="\n".join(msgs)) @py_func_if_in_function From 0e9ebeda5ca74732c50f60bdcd0a160f5e95c49f Mon Sep 17 00:00:00 2001 From: Yasir Modak <42785357+ymodak@users.noreply.github.com> Date: Sat, 30 May 2020 00:14:45 -0700 Subject: [PATCH 1439/1533] update python_api.md This fixes #39962 Changed `tf.gfile.GFile` with `tf.io.gfile.GFile` for the code to work in TF 2.2 The former is an alias declared in TF 1.X and fails in TF 2 due to absence. --- tensorflow/lite/g3doc/convert/python_api.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/convert/python_api.md b/tensorflow/lite/g3doc/convert/python_api.md index 44b58fb0759..36dda16a77c 100644 --- a/tensorflow/lite/g3doc/convert/python_api.md +++ b/tensorflow/lite/g3doc/convert/python_api.md @@ -93,7 +93,7 @@ converter = tf.lite.TFLiteConverter.from_keras_model(model) tflite_model = converter.convert() # Save the TF Lite model. -with tf.gfile.GFile('model.tflite', 'wb') as f: +with tf.io.gfile.GFile('model.tflite', 'wb') as f: f.write(tflite_model) ``` From 639d9373d3e7fc2235c1299641be8e8484d87d62 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Sat, 30 May 2020 01:09:36 -0700 Subject: [PATCH 1440/1533] Fix a typo in configure. 
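The mixed bytes/unicode handling restored in the `assertAllEqual()` change above can be sketched in isolation roughly as follows (a standalone approximation, not the test-framework code itself):

```python
import numpy as np

def _normalize(arr):
  # Encode any unicode strings to UTF-8 bytes so both sides end up with the
  # same string kind before comparison.
  return np.array([x.encode("utf-8") if isinstance(x, str) else x
                   for x in arr.flat])

a = np.array([u"foo", u"bar"])   # dtype kind "U" (unicode, PY3 default)
b = np.array([b"foo", b"bar"])   # dtype kind "S" (bytes, PY2 default)
if a.dtype.kind != b.dtype.kind and {a.dtype.kind, b.dtype.kind} <= {"U", "S", "O"}:
  a, b = _normalize(a), _normalize(b)
np.testing.assert_array_equal(a, b)  # passes after normalization
```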
PiperOrigin-RevId: 313914447 Change-Id: Id55394dd29a51e96e448117c7435f057fe7b92ba --- configure.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.py b/configure.py index 29c37f601fd..c2850beede6 100644 --- a/configure.py +++ b/configure.py @@ -1038,7 +1038,7 @@ def set_tf_cuda_compute_capabilities(environ_cp): print('Invalid compute capability: %s' % compute_capability) all_valid = False else: - ver = int(m.group(2)) + ver = int(sm_compute_match.group(2)) if ver < 30: print( 'ERROR: TensorFlow only supports small CUDA compute' From 6cbf4be35b2732b59091c966dfee3928bd47d627 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 30 May 2020 02:02:31 -0700 Subject: [PATCH 1441/1533] Update GraphDef version to 417. PiperOrigin-RevId: 313917104 Change-Id: I5282beab35bebf686121c82b1bc8be1756113f63 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 771e2860fc1..1548c5dbbe5 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 416 // Updated: 2020/5/29 +#define TF_GRAPH_DEF_VERSION 417 // Updated: 2020/5/30 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 64376b543244a29840a62f830a33f68a44ec54bd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 30 May 2020 02:02:31 -0700 Subject: [PATCH 1442/1533] compat: Update forward compatibility horizon to 2020-05-30 PiperOrigin-RevId: 313917105 Change-Id: I3c18c7e7badb7930dc6612a4780f3ac461ddc2a0 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index e0446d58403..e5fd2fac161 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 29) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 30) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From e9077ee0471383a00d1e0867761ecc8c752ab0c6 Mon Sep 17 00:00:00 2001 From: Natasha Kononenko Date: Sat, 30 May 2020 02:28:57 -0700 Subject: [PATCH 1443/1533] Add division, min and max constant folding Note: For integers, these operations are performed in their signed versions. 
PiperOrigin-RevId: 313918602 Change-Id: Icc1b68219351985f537ad69b45f7328f65d5d223 --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 39 ++++++++++++++ tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 3 ++ .../compiler/mlir/xla/tests/canonicalize.mlir | 54 +++++++++++++++++++ 3 files changed, 96 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 22f1ffe1d3a..c66b8f12332 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -1474,6 +1474,38 @@ static Attribute BinaryFolder(Op* op, ArrayRef attrs) { return DenseElementsAttr::get(type, values); } +template +struct divide : std::divides {}; + +template <> +struct divide { + APInt operator()(const APInt& a, const APInt& b) const { return a.sdiv(b); } +}; + +template +struct max { + T operator()(const T& a, const T& b) const { return std::max(a, b); } +}; + +template <> +struct max { + APInt operator()(const APInt& a, const APInt& b) const { + return llvm::APIntOps::smax(a, b); + } +}; + +template +struct min { + T operator()(const T& a, const T& b) const { return std::min(a, b); } +}; + +template <> +struct min { + APInt operator()(const APInt& a, const APInt& b) const { + return llvm::APIntOps::smin(a, b); + } +}; + #define BINARY_FOLDER(Op, Func) \ OpFoldResult Op::fold(ArrayRef attrs) { \ if (getElementTypeOrSelf(getType()).isa()) \ @@ -1483,9 +1515,16 @@ static Attribute BinaryFolder(Op* op, ArrayRef attrs) { return {}; \ } +// Addition, subtraction and multiplication use the std:: versions of the ops. +// Due to the other ops behaving differently in signed vs unsigned integers, +// APInts need a special implementation. Currently, it replicates signed int +// op behavior. BINARY_FOLDER(AddOp, std::plus); BINARY_FOLDER(SubOp, std::minus); BINARY_FOLDER(MulOp, std::multiplies); +BINARY_FOLDER(DivOp, divide); +BINARY_FOLDER(MaxOp, max); +BINARY_FOLDER(MinOp, min); #undef BINARY_FOLDER diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index be8c86a3a0b..97b8e1c1863 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -303,14 +303,17 @@ def HLO_ComplexOp: HLO_Op<"complex", def HLO_DivOp : HLO_BinaryElementwiseOp<"divide", [NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_DivOp { + let hasFolder = 1; } def HLO_MaxOp : HLO_BinaryElementwiseOp<"maximum", [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_MaxOp { + let hasFolder = 1; } def HLO_MinOp : HLO_BinaryElementwiseOp<"minimum", [Commutative, NoSideEffect, SameOperandsAndResultType]>, BASE_HLO_MinOp { + let hasFolder = 1; } def HLO_MulOp : HLO_BinaryElementwiseOp<"multiply", diff --git a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir index ed9f1661616..6b9ed3e463a 100644 --- a/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/xla/tests/canonicalize.mlir @@ -45,6 +45,60 @@ func @multiply_scalar_fold() -> tensor<4xi64> { return %2 : tensor<4xi64> } +// CHECK-LABEL: divide_scalar_fold +func @divide_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<7> : tensor<4xi64> + %1 = xla_hlo.constant dense<5> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<1> + %2 = "xla_hlo.divide"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: divide_fold_float +func @divide_fold_float() -> tensor<4xf64> { + %0 = 
xla_hlo.constant dense<[5.0, 66.0, 5.0, 1.0]> : tensor<4xf64> + %1 = xla_hlo.constant dense<[5.0, 3.0, 2.0, 4.0]> : tensor<4xf64> + // CHECK: xla_hlo.constant dense<[1.000000e+00, 2.200000e+01, 2.500000e+00, 2.500000e-01]> + %2 = "xla_hlo.divide"(%0, %1) : (tensor<4xf64>, tensor<4xf64>) -> (tensor<4xf64>) + return %2 : tensor<4xf64> +} + +// CHECK-LABEL: max_scalar_fold +func @max_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<7> : tensor<4xi64> + %1 = xla_hlo.constant dense<5> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<7> + %2 = "xla_hlo.maximum"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: max_fold_float +func @max_fold_float() -> tensor<4xf64> { + %0 = xla_hlo.constant dense<[5.0, 66.0, 5.0, 1.0]> : tensor<4xf64> + %1 = xla_hlo.constant dense<[5.0, 3.0, 2.0, 4.0]> : tensor<4xf64> + // CHECK: xla_hlo.constant dense<[5.000000e+00, 6.600000e+01, 5.000000e+00, 4.000000e+00]> + %2 = "xla_hlo.maximum"(%0, %1) : (tensor<4xf64>, tensor<4xf64>) -> (tensor<4xf64>) + return %2 : tensor<4xf64> +} + +// CHECK-LABEL: min_scalar_fold +func @min_scalar_fold() -> tensor<4xi64> { + %0 = xla_hlo.constant dense<7> : tensor<4xi64> + %1 = xla_hlo.constant dense<-5> : tensor<4xi64> + // CHECK: xla_hlo.constant dense<-5> + %2 = "xla_hlo.minimum"(%0, %1) : (tensor<4xi64>, tensor<4xi64>) -> (tensor<4xi64>) + return %2 : tensor<4xi64> +} + +// CHECK-LABEL: min_fold_float +func @min_fold_float() -> tensor<4xf64> { + %0 = xla_hlo.constant dense<[5.0, 66.0, 5.0, 1.0]> : tensor<4xf64> + %1 = xla_hlo.constant dense<[5.0, 3.0, 2.0, 4.0]> : tensor<4xf64> + // CHECK: xla_hlo.constant dense<[5.000000e+00, 3.000000e+00, 2.000000e+00, 1.000000e+00]> + %2 = "xla_hlo.minimum"(%0, %1) : (tensor<4xf64>, tensor<4xf64>) -> (tensor<4xf64>) + return %2 : tensor<4xf64> +} + // CHECK-LABEL: concatenate_noop func @concatenate_noop(%arg0: tensor<4xi32>) -> tensor<4xi32> { // CHECK-SAME: [[ARG:%.+]]: tensor<4xi32> From 3073d9b54aaf7b672184df33966f124c55f43427 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Sat, 30 May 2020 09:15:23 -0700 Subject: [PATCH 1444/1533] Integrate LLVM at https://github.com/llvm/llvm-project/commit/034a7b660406 PiperOrigin-RevId: 313942377 Change-Id: I4b88e41cdd557fc99669aa24a24083fead4b3c8b --- .../compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc | 2 +- third_party/mlir/BUILD | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 422c5f34608..df92681cd97 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -43,7 +43,7 @@ constexpr StringRef kTempBufferAttr = "temp"; template using BaseOpConversion = BufferAssignmentOpConversionPattern; using StdReturnOpConverter = - NoBufferOperandsReturnOpConverter; Value InsertDynamicAllocAndDealloc(Location loc, Value result, diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 624f17e6aa4..e3cec7d7104 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -312,6 +312,7 @@ cc_library( ":Transforms", ":VectorOps", ":VectorToLLVM", + ":VectorToSCF", "@llvm-project//llvm:core", "@llvm-project//llvm:support", ], @@ -1257,6 +1258,7 @@ cc_library( ":Transforms", ":VectorOps", ":VectorToLLVM", + ":VectorToSCF", "@llvm-project//llvm:support", ], ) @@ -2589,6 +2591,7 @@ cc_library( ":TransformsPassIncGen", ":VectorOps", ":VectorToLLVM", + ":VectorToSCF", ], ) @@ -3393,7 +3396,7 @@ cc_library( srcs = glob([ "lib/Conversion/VectorToSCF/*.cpp", "lib/Conversion/VectorToSCF/*.h", - ]), + ]) + ["lib/Conversion/PassDetail.h"], hdrs = glob([ "include/mlir/Conversion/VectorToSCF/*.h", ]), From 934959af19f882143a32c472e3072c6f8352d955 Mon Sep 17 00:00:00 2001 From: seo-inyoung <62606132+seo-inyoung@users.noreply.github.com> Date: Sun, 31 May 2020 12:44:24 +0900 Subject: [PATCH 1445/1533] Update tf2.py I changed the comment symbol.('''->#) Because it's a line of comment. --- tensorflow/python/tf2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/tf2.py b/tensorflow/python/tf2.py index bc713d6e28b..4c9d027221f 100644 --- a/tensorflow/python/tf2.py +++ b/tensorflow/python/tf2.py @@ -28,19 +28,19 @@ _force_enable = None def enable(): - """Enables v2 behaviors.""" + #Enables v2 behaviors. global _force_enable _force_enable = True def disable(): - """Disables v2 behaviors.""" + #Disables v2 behaviors. global _force_enable _force_enable = False def enabled(): - """Returns True iff TensorFlow 2.0 behavior should be enabled.""" + #Returns True iff TensorFlow 2.0 behavior should be enabled. if _force_enable is None: return os.getenv("TF2_BEHAVIOR", "0") != "0" From fb0356b24bfe702f893274b0a96310b82daa5310 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 31 May 2020 02:02:32 -0700 Subject: [PATCH 1446/1533] Update GraphDef version to 418. PiperOrigin-RevId: 314003583 Change-Id: I4446387bd56c7c9e58ebf0db4e829ef7757c7b25 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 1548c5dbbe5..2143654d7e4 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. 
#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 417 // Updated: 2020/5/30 +#define TF_GRAPH_DEF_VERSION 418 // Updated: 2020/5/31 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From e88b0a3c5759c77fda42ae41d09639bdf2d1cc89 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 31 May 2020 02:02:33 -0700 Subject: [PATCH 1447/1533] compat: Update forward compatibility horizon to 2020-05-31 PiperOrigin-RevId: 314003586 Change-Id: Ic7a5e751f48e3b913f3fca414a17412a19ccf13d --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index e5fd2fac161..3791f292d83 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 30) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 31) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From edbe5e189c1ec14d3a3386aa29e6118d807d9379 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Sun, 31 May 2020 03:13:36 -0700 Subject: [PATCH 1448/1533] Support Ubuntu 18.04 as GPU RBE platform. PiperOrigin-RevId: 314007642 Change-Id: Ic5a432718cf3419589b658fb7d7326ba1bbd71f0 --- .../toolchains/preconfig/generate/containers.bzl | 1 + third_party/toolchains/remote_config/configs.bzl | 13 +++++++++++++ third_party/toolchains/remote_config/containers.bzl | 7 +++++++ 3 files changed, 21 insertions(+) diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index 05b233232e3..8e7cd8d2b6c 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -10,6 +10,7 @@ container_digests = { "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432", "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:5e6d21c8ef226316eb6df5e2e6015244c16a8e5d936b52a09820442d2f8a919f", "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:4dd708781c17a9e8d641c6ad05cc6e235e7147ff70f7b4a2ff6b31af43be4540", + "cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython": "sha256:016b50adda42173f0fa70533f06c0b28c8a1057c56b2e4cd721295462a248ab7", "rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe", "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12", } diff --git a/third_party/toolchains/remote_config/configs.bzl b/third_party/toolchains/remote_config/configs.bzl index 4098e5f1580..3bbf99e0e36 100644 --- a/third_party/toolchains/remote_config/configs.bzl +++ b/third_party/toolchains/remote_config/configs.bzl @@ -39,6 +39,19 @@ def initialize_rbe_configs(): python_install_path = "/usr/local", ) + tensorflow_rbe_config( + name = "ubuntu18.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0", + compiler = "/dt7/usr/bin/gcc", + compiler_prefix = "/usr/bin", + cuda_version = "10.1", + cudnn_version = "7", + os = 
"ubuntu18.04-manylinux2010-multipython", + python_versions = ["2.7", "3.5", "3.6", "3.7", "3.8"], + tensorrt_install_path = "/usr", + tensorrt_version = "6.0", + python_install_path = "/usr/local", + ) + # TODO(klimek): Delete this once all users are migrated to a python-version # independent configuration. In the future, use # "ubuntu16.04-gcc7_manylinux2010-cuda10.1-cudnn7-tensorrt6.0" instead. diff --git a/third_party/toolchains/remote_config/containers.bzl b/third_party/toolchains/remote_config/containers.bzl index 7b948f78f56..8f6dae7f311 100644 --- a/third_party/toolchains/remote_config/containers.bzl +++ b/third_party/toolchains/remote_config/containers.bzl @@ -31,6 +31,13 @@ containers = { "digest": container_digests["cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython"], }, + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython. + "cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython": { + "registry": "gcr.io", + "repository": "tensorflow-testing/nosla-cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython", + "digest": container_digests["cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython"], + }, + # Built with //tensorflow/tools/ci_build/Dockerfile.rbe.rocm-ubuntu16.04 "rocm-ubuntu16.04": { "registry": "gcr.io", From 4e83a953c1e0eaf12e0005e18cf066f3b66da85f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 31 May 2020 11:30:55 -0700 Subject: [PATCH 1449/1533] [Grappler] Rewrite Case nodes with known output index to a simple PartitionedCall node. If possible, the partitioned call node will be inlined by the function optimizer in the next meta optimizer iteration. PiperOrigin-RevId: 314035222 Change-Id: I239bfd91ea42dab9e974d5fd53f1fa1f9eec8e4d --- .../grappler/optimizers/constant_folding.cc | 27 ++++++++++ .../grappler/optimizers/constant_folding.h | 3 ++ .../optimizers/constant_folding_test.cc | 54 +++++++++++++++++++ 3 files changed, 84 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index a0ec3714070..d0942471f13 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -2031,6 +2031,7 @@ Status ConstantFolding::SimplifyNode(bool use_shape_info, NodeDef* node, PartialConcatConstFolding(optimized_graph, properties, node)); SET_AND_RETURN_IF_MODIFIED( ConstantPushDownBiasAdd(properties, optimized_graph, node)); + SET_AND_RETURN_IF_MODIFIED(SimplifyCase(optimized_graph, node)); graph_modified_ = graph_modified_cached; return Status::OK(); @@ -2378,6 +2379,32 @@ bool ConstantFolding::SimplifyPack(GraphDef* optimized_graph, NodeDef* node) { return true; } +bool ConstantFolding::SimplifyCase(GraphDef* optimized_graph, NodeDef* node) { + if (node->op() != "Case") return false; + const NodeDef* output_idx_node = node_map_->GetNode(node->input(0)); + if (output_idx_node == nullptr || + !CheckAttrExists(*output_idx_node, "value").ok()) + return false; + Tensor output_idx_t; + if (!output_idx_t.FromProto(output_idx_node->attr().at("value").tensor())) + return false; + int output_idx = output_idx_t.scalar()(); + const auto& func_list = node->attr().at("branches").list(); + if (output_idx < 0 || output_idx >= func_list.func_size()) return false; + NodeDef call_node = *node; + call_node.set_op("PartitionedCall"); + call_node.clear_input(); + for (int i = 1; i < node->input_size(); ++i) { + call_node.add_input(node->input(i)); + } + auto* new_func = 
(*call_node.mutable_attr())["f"].mutable_func(); + *new_func = func_list.func(output_idx); + call_node.mutable_attr()->erase("branches"); + call_node.mutable_attr()->erase("output_shapes"); + *node = std::move(call_node); + return true; +} + bool ConstantFolding::MoveConstantsPastEnter(GraphDef* optimized_graph, NodeDef* node) { if (!IsEnter(*node) || node->input_size() == 0 || diff --git a/tensorflow/core/grappler/optimizers/constant_folding.h b/tensorflow/core/grappler/optimizers/constant_folding.h index 074f0c5f057..88784339816 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.h +++ b/tensorflow/core/grappler/optimizers/constant_folding.h @@ -279,6 +279,9 @@ class ConstantFolding : public GraphOptimizer { Status SimplifySlice(const GraphProperties& properties, bool use_shape_info, GraphDef* optimized_graph, NodeDef* node); + // Simplify a Case operation where the output_idx is known. + bool SimplifyCase(GraphDef* optimized_graph, NodeDef* node); + // Removes Reverse op over dimensions with size 1. Status RemoveReverse(const GraphProperties& properties, bool use_shape_info, GraphDef* optimized_graph, NodeDef* node); diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc index 7fae40501ff..7e4a698fff6 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/cc/ops/array_ops.h" #include "tensorflow/cc/ops/array_ops_internal.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/framework/types.pb.h" @@ -4091,6 +4092,59 @@ TEST_F(ConstantFoldingTest, BitcastDenormalFloats) { test::ExpectTensorEqual(tensors[0], tensors_expected[0]); } +TEST_F(ConstantFoldingTest, SimplifyCase) { + using test::function::NDef; + + // Build a graph to compute y = Case(1, x, XTimesTwo(x), NonZero(x)) + GrapplerItem item; + constexpr char kDevice[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + AttrValue branches; + auto* f = branches.mutable_list()->add_func(); + f->set_name("XTimesTwo"); + (*f->mutable_attr())["T"].set_type(DT_FLOAT); + auto* g = branches.mutable_list()->add_func(); + *g = *f; + g->set_name("NonZero"); + + const Tensor kOne = test::AsScalar(1); + item.graph = test::function::GDef( + {NDef("one", "Const", {}, {{"value", kOne}, {"dtype", DT_INT32}}, + kDevice), + NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice), + NDef("case", "Case", {"one", "x"}, + {{"Tin", DataTypeSlice{DT_FLOAT}}, + {"Tout", DataTypeSlice{DT_FLOAT}}, + {"branches", branches}}, + kDevice), + NDef("y", "Identity", {"case"}, {{"T", DT_FLOAT}}, kDevice)}, + // FunctionLib + { + test::function::XTimesTwo(), + test::function::NonZero(), + }); + VLOG(1) << "Before: " << item.graph.DebugString(); + + item.fetch = {"y"}; + const Tensor kTwo = test::AsScalar(2.0f); + auto tensors_expected = EvaluateNodes(item.graph, item.fetch, {{"x", kTwo}}); + + ConstantFolding optimizer(/*cpu_device=*/nullptr); + GraphDef optimized_graph; + TF_ASSERT_OK(optimizer.Optimize(/*cluster=*/nullptr, item, &optimized_graph)); + VLOG(1) << "After: " << optimized_graph.DebugString(); + + int pco_count = 0; + for (const auto& node : optimized_graph.node()) { + EXPECT_NE(node.op(), "Case"); + if (node.op() == 
"PartitionedCall") ++pco_count; + } + EXPECT_EQ(pco_count, 1); + + auto tensors = EvaluateNodes(optimized_graph, item.fetch, {{"x", kTwo}}); + ASSERT_EQ(tensors.size(), tensors_expected.size()); + test::ExpectTensorEqual(tensors[0], tensors_expected[0]); +} + } // namespace } // namespace grappler } // namespace tensorflow From fe018168d00721415e593886b9674e4da6f770d7 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Sun, 31 May 2020 15:32:57 -0700 Subject: [PATCH 1450/1533] Bump open source llvm revision to 034a7b6604067b0ccb36c761a5782456b76c447e PiperOrigin-RevId: 314048655 Change-Id: Id39493589057c05e029a4f797ebffd4834722aaa --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 594c0b5c93d..1ac8a49028d 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "b726d071b4aa46004228fc38ee5bfd167f999bfe" - LLVM_SHA256 = "d7e67036dc89906cb2f80df7b0b7de6344d86eddf6e98bb4d01a578242889a73" + LLVM_COMMIT = "034a7b6604067b0ccb36c761a5782456b76c447e" + LLVM_SHA256 = "87bd4dd8c2620ae6371dcfeeb6f1583918945c829cb115020ad4bc0a74a079d7" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 4fb63ecd646062d110db9bbb64598d1c7dac00f1 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Mon, 1 Jun 2020 09:07:00 +0800 Subject: [PATCH 1451/1533] make //tensorflow/lite/java:InterpreterTest build surpress the two "drepcation" warnings --- .../java/src/test/java/org/tensorflow/lite/InterpreterTest.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java index 6b6799eaad9..3daa9fe0766 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java @@ -65,6 +65,7 @@ public final class InterpreterTest { } @Test + @SuppressWarnings("deprecation") public void testInterpreterWithOptions() throws Exception { Interpreter interpreter = new Interpreter( @@ -390,6 +391,7 @@ public final class InterpreterTest { } @Test + @SuppressWarnings("deprecation") public void testTurnOnNNAPI() throws Exception { Interpreter interpreter = new Interpreter( From 777b6efed4f5f937a1817f58b4e09d63648017d5 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Mon, 1 Jun 2020 09:52:05 +0800 Subject: [PATCH 1452/1533] add allowFp16() test to NnApiDelegateTest --- .../lite/nnapi/NnApiDelegateTest.java | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/nnapi/NnApiDelegateTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/nnapi/NnApiDelegateTest.java index e3742dab9a3..45d66e24d35 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/nnapi/NnApiDelegateTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/nnapi/NnApiDelegateTest.java @@ -56,6 +56,26 @@ public final class NnApiDelegateTest { } } + @Test + public void testInterpreterWithNnApiAllowFp16() throws Exception { + Interpreter.Options options = new Interpreter.Options(); + 
NnApiDelegate.Options nnApiOptions = new NnApiDelegate.Options(); + nnApiOptions.setAllowFp16(true); + + try (NnApiDelegate delegate = new NnApiDelegate(nnApiOptions); + Interpreter interpreter = new Interpreter(MODEL_BUFFER, options.addDelegate(delegate))) { + float[] oneD = {1.23f, 6.54f, 7.81f}; + float[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD}; + float[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD}; + float[][][][] fourD = {threeD, threeD}; + float[][][][] parsedOutputs = new float[2][8][8][3]; + interpreter.run(fourD, parsedOutputs); + float[] outputOneD = parsedOutputs[0][0][0]; + float[] expected = {3.69f, 19.62f, 23.43f}; + assertThat(outputOneD).usingTolerance(0.1f).containsExactly(expected).inOrder(); + } + } + @Test public void testGetNnApiErrnoReturnsZeroIfNoNnapiCallFailed() throws Exception { Interpreter.Options options = new Interpreter.Options(); From e9505939ea5f903c066fc3792df9d01c09b2bcb4 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Sun, 31 May 2020 19:11:59 -0700 Subject: [PATCH 1453/1533] Access the rank and shape data after ensuring that the given type has enough information PiperOrigin-RevId: 314063168 Change-Id: I7b8d9116a6688fe4f174a9801e2b6a1a8edaba59 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index edb533c9442..e645f98e922 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -228,10 +228,14 @@ class TFL_OperandHasAtleastRank : class TFL_OperandRankEquals1DimOfOperand : PredOpTrait<"operand " # x # "'s rank equals operand " # y # "'s size", - CPred<"$_op.getOperand(" # x # - ").getType().cast().getRank() == " - "$_op.getOperand(" # y # - ").getType().cast().getShape()[0]">>; + Or<[TFL_OperandIsUnrankedPred, + TFL_OperandIsUnrankedPred, + CPred<"!$_op.getOperand(" # y # + ").getType().cast().hasStaticShape()">, + CPred<"$_op.getOperand(" # x # + ").getType().cast().getRank() == " + "$_op.getOperand(" # y # + ").getType().cast().getShape()[0]">]>>; class TFL_Operand0DOr1ElementTensor : PredOpTrait<"operand #" # x # " is an 0-d tensor or 1-d tensor w/ 1 element", From ea848c042b60105563024fe2a52fe2cf03db9bc0 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Sun, 31 May 2020 20:17:03 -0700 Subject: [PATCH 1454/1533] Make TraceMeEncode compile with C++14 PiperOrigin-RevId: 314068167 Change-Id: I660d181ed9dcc333d07e1c4a9acddbafda2c0bb9 --- tensorflow/core/profiler/lib/BUILD | 13 +++- tensorflow/core/profiler/lib/traceme_encode.h | 43 +++++++------ .../core/profiler/lib/traceme_encode_test.cc | 60 +++++++++++++++++++ 3 files changed, 97 insertions(+), 19 deletions(-) create mode 100644 tensorflow/core/profiler/lib/traceme_encode_test.cc diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index e80b9fc9766..6bda544055e 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -1,5 +1,5 @@ load("//tensorflow/core/platform:build_config_root.bzl", "if_static") -load("//tensorflow:tensorflow.bzl", "if_not_android", "tf_cuda_library") +load("//tensorflow:tensorflow.bzl", "if_not_android", "tf_cc_test", "tf_cuda_library") load("//tensorflow:tensorflow.bzl", "tf_pybind_cc_library_wrapper") package( @@ -113,6 +113,17 @@ cc_library( ], ) +tf_cc_test( + name = "traceme_encode_test", + srcs = ["traceme_encode_test.cc"], + deps = [ + 
":traceme_encode", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "annotated_traceme", hdrs = ["annotated_traceme.h"], diff --git a/tensorflow/core/profiler/lib/traceme_encode.h b/tensorflow/core/profiler/lib/traceme_encode.h index 2e23c6d878b..91b23740fc3 100644 --- a/tensorflow/core/profiler/lib/traceme_encode.h +++ b/tensorflow/core/profiler/lib/traceme_encode.h @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -28,6 +27,19 @@ limitations under the License. namespace tensorflow { namespace profiler { + +// An argument passed to TraceMeEncode. +struct TraceMeArg { + // This constructor is required because absl::AlphaNum is non-copyable. + template + TraceMeArg(absl::string_view k, Value v) : key(k), value(v) {} + + TF_DISALLOW_COPY_AND_ASSIGN(TraceMeArg); + + absl::string_view key; + absl::AlphaNum value; +}; + namespace traceme_internal { // Copies the contents of str to the address pointed by out. @@ -45,23 +57,21 @@ TF_ATTRIBUTE_ALWAYS_INLINE inline char* Append(char* out, // Appends args encoded as TraceMe metadata to name. TF_ATTRIBUTE_ALWAYS_INLINE inline std::string AppendArgs( - std::string name, - const std::initializer_list>& - args) { + std::string name, std::initializer_list args) { if (TF_PREDICT_TRUE(args.size() > 0)) { const auto old_size = name.size(); auto new_size = old_size + args.size() * 2 + 1; for (const auto& arg : args) { - new_size += arg.first.size() + arg.second.size(); + new_size += arg.key.size() + arg.value.size(); } name.resize(new_size); char* const begin = &name[0]; char* out = begin + old_size; *out++ = '#'; for (const auto& arg : args) { - out = Append(out, arg.first); + out = Append(out, arg.key); *out++ = '='; - out = Append(out, arg.second.Piece()); + out = Append(out, arg.value.Piece()); *out++ = ','; } *(out - 1) = '#'; @@ -92,19 +102,16 @@ TF_ATTRIBUTE_ALWAYS_INLINE inline void AppendMetadata( // TraceMe trace_me([value1]() { // return TraceMeEncode("my_trace", {{"key1", value1}, {"key2", 42}}); // }); -inline std::string TraceMeEncode( - std::string name, - std::initializer_list> args) { +TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( + std::string name, std::initializer_list args) { return traceme_internal::AppendArgs(std::move(name), args); } -inline std::string TraceMeEncode( - absl::string_view name, - std::initializer_list> args) { +TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( + absl::string_view name, std::initializer_list args) { return traceme_internal::AppendArgs(std::string(name), args); } -inline std::string TraceMeEncode( - const char* name, - std::initializer_list> args) { +TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( + const char* name, std::initializer_list args) { return traceme_internal::AppendArgs(std::string(name), args); } @@ -116,8 +123,8 @@ inline std::string TraceMeEncode( // trace_me.AppendMetadata([value1]() { // return TraceMeEncode({{"key1", value1}, {"key2", 42}}); // }); -inline std::string TraceMeEncode( - std::initializer_list> args) { +TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( + std::initializer_list args) { return traceme_internal::AppendArgs(std::string(), args); } diff --git a/tensorflow/core/profiler/lib/traceme_encode_test.cc b/tensorflow/core/profiler/lib/traceme_encode_test.cc new file mode 100644 index 00000000000..a849004ab8e --- /dev/null +++ 
b/tensorflow/core/profiler/lib/traceme_encode_test.cc @@ -0,0 +1,60 @@ +/* Copyright 2020 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/profiler/lib/traceme_encode.h" + +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace profiler { +namespace { + +TEST(TraceMeEncodeTest, NoArgTest) { + EXPECT_EQ(TraceMeEncode("Hello!", {}), "Hello!"); +} + +TEST(TraceMeEncodeTest, OneArgTest) { + EXPECT_EQ(TraceMeEncode("Hello", {{"context", "World"}}), + "Hello#context=World#"); +} + +TEST(TraceMeEncodeTest, TwoArgsTest) { + EXPECT_EQ(TraceMeEncode("Hello", {{"context", "World"}, {"request_id", 42}}), + "Hello#context=World,request_id=42#"); +} + +TEST(TraceMeEncodeTest, ThreeArgsTest) { + EXPECT_EQ(TraceMeEncode("Hello", {{"context", "World"}, + {"request_id", 42}, + {"addr", absl::Hex(0xdeadbeef)}}), + "Hello#context=World,request_id=42,addr=deadbeef#"); +} + +TEST(TraceMeEncodeTest, TemporaryStringTest) { + EXPECT_EQ(TraceMeEncode("Hello", {{std::string("context"), + absl::StrCat("World:", 2020)}}), + "Hello#context=World:2020#"); +} + +TEST(TraceMeEncodeTest, NoNameTest) { + EXPECT_EQ(TraceMeEncode({{"context", "World"}, {"request_id", 42}}), + "#context=World,request_id=42#"); +} + +} // namespace +} // namespace profiler +} // namespace tensorflow From d4042feb2d3d06b92f378b31e791dd7fdf70832d Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 1 Jun 2020 16:38:55 +1000 Subject: [PATCH 1455/1533] Re-enable __builtin_expect when building with NVCC - NVCC supports this construct as of CUDA 9.0 --- tensorflow/core/platform/macros.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h index 62e5e01b3cc..57773c54e3e 100644 --- a/tensorflow/core/platform/macros.h +++ b/tensorflow/core/platform/macros.h @@ -79,11 +79,7 @@ limitations under the License. // analysis. Giving it this information can help it optimize for the // common case in the absence of better information (ie. // -fprofile-arcs). -// -// We need to disable this for GPU builds, though, since nvcc8 and older -// don't recognize `__builtin_expect` as a builtin, and fail compilation. 
-#if (!defined(__NVCC__)) && \ - (TF_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3)) +#if TF_HAS_BUILTIN(__builtin_expect) || (defined(__GNUC__) && __GNUC__ >= 3) #define TF_PREDICT_FALSE(x) (__builtin_expect(x, 0)) #define TF_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) #else From 626bb2c4d0bdb626b406c202c2dfd167da586bc6 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Mon, 1 Jun 2020 01:14:56 -0700 Subject: [PATCH 1456/1533] Move DequantizeInputs and QuantizeOutputs into common utility PiperOrigin-RevId: 314093553 Change-Id: Ib15015f738c3e0bbaa1363ba7df9808940075d2c --- tensorflow/lite/delegates/gpu/BUILD | 1 + tensorflow/lite/delegates/gpu/common/BUILD | 24 +++ .../delegates/gpu/common/quantization_util.cc | 120 +++++++++++++++ .../delegates/gpu/common/quantization_util.h | 56 +++++++ .../gpu/common/quantization_util_test.cc | 139 ++++++++++++++++++ tensorflow/lite/delegates/gpu/delegate.cc | 71 +-------- 6 files changed, 345 insertions(+), 66 deletions(-) create mode 100644 tensorflow/lite/delegates/gpu/common/quantization_util.cc create mode 100644 tensorflow/lite/delegates/gpu/common/quantization_util.h create mode 100644 tensorflow/lite/delegates/gpu/common/quantization_util_test.cc diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index bb509610c7a..181d96b9cdf 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -244,6 +244,7 @@ cc_library( "//tensorflow/lite/delegates/gpu/common:model", "//tensorflow/lite/delegates/gpu/common:model_builder", "//tensorflow/lite/delegates/gpu/common:model_transformer", + "//tensorflow/lite/delegates/gpu/common:quantization_util", "//tensorflow/lite/delegates/gpu/common:status", "//tensorflow/lite/delegates/gpu/gl:api2", "//tensorflow/lite/kernels/internal:optimized_base", diff --git a/tensorflow/lite/delegates/gpu/common/BUILD b/tensorflow/lite/delegates/gpu/common/BUILD index b7120605902..333c54f145f 100644 --- a/tensorflow/lite/delegates/gpu/common/BUILD +++ b/tensorflow/lite/delegates/gpu/common/BUILD @@ -203,6 +203,30 @@ cc_library( ], ) +cc_library( + name = "quantization_util", + srcs = ["quantization_util.cc"], + hdrs = ["quantization_util.h"], + deps = [ + ":status", + "//tensorflow/lite:kernel_api", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels/internal:optimized_base", + "//tensorflow/lite/kernels/internal:types", + ], +) + +cc_test( + name = "quantization_util_test", + srcs = ["quantization_util_test.cc"], + deps = [ + ":quantization_util", + "//tensorflow/lite:util", + "//tensorflow/lite/micro/testing:micro_test", + "@com_google_googletest//:gtest_main", + ], +) + # TODO(impjdi): Add unit test for operations. cc_library( diff --git a/tensorflow/lite/delegates/gpu/common/quantization_util.cc b/tensorflow/lite/delegates/gpu/common/quantization_util.cc new file mode 100644 index 00000000000..9584d1d98ec --- /dev/null +++ b/tensorflow/lite/delegates/gpu/common/quantization_util.cc @@ -0,0 +1,120 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/common/quantization_util.h" + +#include "tensorflow/lite/builtin_ops.h" +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { +namespace gpu { +namespace { +void DequantizeInput(TfLiteContext* context, int input_index, + const std::unordered_map& quant_conversion_map) { + if (quant_conversion_map.find(input_index) == quant_conversion_map.end()) { + return; + } + int original_tensor_idx = quant_conversion_map.at(input_index); + const TfLiteTensor& dequantized_tflite_tensor = context->tensors[input_index]; + const TfLiteTensor& original_tflite_tensor = + context->tensors[original_tensor_idx]; + DequantizationParams op_params; + op_params.zero_point = original_tflite_tensor.params.zero_point; + op_params.scale = original_tflite_tensor.params.scale; + if (original_tflite_tensor.type == kTfLiteInt8) { + optimized_ops::Dequantize(op_params, + GetTensorShape(&original_tflite_tensor), + original_tflite_tensor.data.int8, + GetTensorShape(&original_tflite_tensor), + dequantized_tflite_tensor.data.f); + } else if (original_tflite_tensor.type == kTfLiteUInt8) { + optimized_ops::Dequantize(op_params, + GetTensorShape(&original_tflite_tensor), + original_tflite_tensor.data.uint8, + GetTensorShape(&original_tflite_tensor), + dequantized_tflite_tensor.data.f); + } +} + +void QuantizeOutput(TfLiteContext* context, int output_index, + const std::unordered_map& quant_conversion_map) { + if (quant_conversion_map.find(output_index) == quant_conversion_map.end()) { + return; + } + int original_tensor_idx = quant_conversion_map.at(output_index); + const TfLiteTensor& dequantized_tflite_tensor = + context->tensors[output_index]; + const TfLiteTensor& original_tflite_tensor = + context->tensors[original_tensor_idx]; + tflite::QuantizationParams op_params; + op_params.zero_point = original_tflite_tensor.params.zero_point; + op_params.scale = original_tflite_tensor.params.scale; + if (original_tflite_tensor.type == kTfLiteInt8) { + optimized_ops::AffineQuantize(op_params, + GetTensorShape(&original_tflite_tensor), + dequantized_tflite_tensor.data.f, + GetTensorShape(&original_tflite_tensor), + original_tflite_tensor.data.int8); + } else if (original_tflite_tensor.type == kTfLiteUInt8) { + optimized_ops::AffineQuantize(op_params, + GetTensorShape(&original_tflite_tensor), + dequantized_tflite_tensor.data.f, + GetTensorShape(&original_tflite_tensor), + original_tflite_tensor.data.uint8); + } +} +} // namespace + +absl::Status DequantizeInputs( + TfLiteContext* context, const std::vector& input_indices, + const std::unordered_map& quant_conversion_map) { + for (auto index : input_indices) { + DequantizeInput(context, static_cast(index), quant_conversion_map); + } + return absl::OkStatus(); +} + +absl::Status DequantizeInputs( + TfLiteContext* context, const std::vector& input_indices, + const std::unordered_map& quant_conversion_map) { + for (auto index : input_indices) { + DequantizeInput(context, static_cast(index), quant_conversion_map); + } + return absl::OkStatus(); +} + +absl::Status QuantizeOutputs( + TfLiteContext* context, const std::vector& output_indices, + const std::unordered_map& quant_conversion_map) { + for (auto index : output_indices) { + QuantizeOutput(context, static_cast(index), 
quant_conversion_map); + } + + return absl::OkStatus(); +} + +absl::Status QuantizeOutputs( + TfLiteContext* context, const std::vector& output_indices, + const std::unordered_map& quant_conversion_map) { + for (auto index : output_indices) { + QuantizeOutput(context, static_cast(index), quant_conversion_map); + } + + return absl::OkStatus(); +} + +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/common/quantization_util.h b/tensorflow/lite/delegates/gpu/common/quantization_util.h new file mode 100644 index 00000000000..26512531f29 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/common/quantization_util.h @@ -0,0 +1,56 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_QUANTIZATION_UTIL_H_ +#define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_QUANTIZATION_UTIL_H_ + +#include +#include + +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" + +namespace tflite { +namespace gpu { + +// Dequantizes input tensors pre-inference, leaving float tensors intact. +// input_indices contains dequantized (fp32) outputs, that are used as +// inputs to GPU delegate. +// quant_conversion_map contains bidirectional mapping between dequantized +// tensor and its original quantized one. +absl::Status DequantizeInputs( + TfLiteContext* context, const std::vector& input_indices, + const std::unordered_map& quant_conversion_map); + +absl::Status DequantizeInputs( + TfLiteContext* context, const std::vector& input_indices, + const std::unordered_map& quant_conversion_map); + +// Quantizes output tensors post-inference, leaving float tensors intact. +// output_indices contains (fp32) inputs to be quantized, which are outputs of +// GPU delegate. +// quant_conversion_map contains bidirectional mapping between dequantized +// tensor and its original quantized one. +absl::Status QuantizeOutputs( + TfLiteContext* context, const std::vector& output_indices, + const std::unordered_map& quant_conversion_map); + +absl::Status QuantizeOutputs( + TfLiteContext* context, const std::vector& output_indices, + const std::unordered_map& quant_conversion_map); +} // namespace gpu +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_GPU_COMMON_QUANTIZATION_UTIL_H_ diff --git a/tensorflow/lite/delegates/gpu/common/quantization_util_test.cc b/tensorflow/lite/delegates/gpu/common/quantization_util_test.cc new file mode 100644 index 00000000000..1ca6922dfe4 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/common/quantization_util_test.cc @@ -0,0 +1,139 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/common/quantization_util.h" + +#include +#include +#include "tensorflow/lite/micro/testing/test_utils.h" +#include "tensorflow/lite/util.h" + +using ::testing::Eq; +using ::testing::FloatNear; +using ::testing::Pointwise; + +namespace tflite { +namespace gpu { +namespace { + +std::unique_ptr BuildTfLiteIntArray( + const std::vector& data) { + std::unique_ptr result( + TfLiteIntArrayCreate(data.size())); + std::copy(data.begin(), data.end(), result->data); + return result; +} + +TEST(DequantizeInputs, Int8) { + TfLiteContext context; + auto input_dims = BuildTfLiteIntArray({1, 3, 2, 1}); + std::vector data = {-3, -2, -1, 1, 2, 3}; + std::vector dequantized_data(data.size()); + + TfLiteTensor input = tflite::testing::CreateQuantizedTensor( + data.data(), input_dims.get(), "input", + /*min=*/-12.8f, /*max=*/12.7f, /*is_variable=*/false); + TfLiteTensor dequantized_input = tflite::testing::CreateFloatTensor( + dequantized_data.data(), input_dims.get(), "input_dequant", + /*is_variable=*/true); + + std::vector tensors{input, dequantized_input}; + tflite::testing::PopulateContext(tensors.data(), tensors.size(), + /*error_reporter=*/nullptr, &context); + + std::vector input_indices = {1}; + std::unordered_map quant_conversion_map = {{1, 0}}; + + auto status = DequantizeInputs(&context, input_indices, quant_conversion_map); + EXPECT_TRUE(status.ok()); + EXPECT_THAT(dequantized_data, + Pointwise(FloatNear(1e-6), {-0.3, -0.2, -0.1, 0.1, 0.2, 0.3})); +} + +TEST(DequantizeInputs, UInt8) { + TfLiteContext context; + auto input_dims = BuildTfLiteIntArray({1, 3, 2, 1}); + std::vector data = {0, 1, 2, 3, 4, 5}; + std::vector dequantized_data(data.size()); + + TfLiteTensor input = tflite::testing::CreateQuantizedTensor( + data.data(), input_dims.get(), "input", + /*min=*/0.0f, /*max=*/25.5f, /*is_variable=*/false); + TfLiteTensor dequantized_input = tflite::testing::CreateFloatTensor( + dequantized_data.data(), input_dims.get(), "input_dequant", + /*is_variable=*/true); + + std::vector tensors{input, dequantized_input}; + tflite::testing::PopulateContext(tensors.data(), tensors.size(), + /*error_reporter=*/nullptr, &context); + + std::vector input_indices = {1}; + std::unordered_map quant_conversion_map = {{1, 0}}; + + auto status = DequantizeInputs(&context, input_indices, quant_conversion_map); + EXPECT_TRUE(status.ok()); + EXPECT_THAT(dequantized_data, + Pointwise(FloatNear(1e-6), {0.0, 0.1, 0.2, 0.3, 0.4, 0.5})); +} + +TEST(QuantizeOutputs, Int8) { + TfLiteContext context; + auto input_dims = BuildTfLiteIntArray({1, 3, 2, 1}); + std::vector data = {-0.3, -0.2, -0.1, 0.1, 0.2, 0.3}; + std::vector quantized_data(data.size()); + TfLiteTensor output = tflite::testing::CreateFloatTensor( + data.data(), input_dims.get(), "output", /*is_variable=*/false); + TfLiteTensor quantized_output = tflite::testing::CreateQuantizedTensor( + quantized_data.data(), input_dims.get(), "output_quant", + /*min=*/-12.8f, /*max=*/12.7f, /*is_variable=*/true); + + std::vector tensors{output, quantized_output}; + 
tflite::testing::PopulateContext(tensors.data(), tensors.size(), + /*error_reporter=*/nullptr, &context); + + std::vector output_indices = {0}; + std::unordered_map quant_conversion_map = {{0, 1}}; + + auto status = QuantizeOutputs(&context, output_indices, quant_conversion_map); + EXPECT_TRUE(status.ok()); + EXPECT_THAT(quantized_data, Pointwise(Eq(), {-3, -2, -1, 1, 2, 3})); +} + +TEST(QuantizeOutputs, UInt8) { + TfLiteContext context; + auto input_dims = BuildTfLiteIntArray({1, 3, 2, 1}); + std::vector data = {0.0, 0.1, 0.2, 0.3, 0.4, 0.5}; + std::vector quantized_data(data.size()); + TfLiteTensor output = tflite::testing::CreateFloatTensor( + data.data(), input_dims.get(), "output", /*is_variable=*/false); + TfLiteTensor quantized_output = tflite::testing::CreateQuantizedTensor( + quantized_data.data(), input_dims.get(), "output_quant", + /*min=*/0.0f, /*max=*/25.5f, /*is_variable=*/true); + + std::vector tensors{output, quantized_output}; + tflite::testing::PopulateContext(tensors.data(), tensors.size(), + /*error_reporter=*/nullptr, &context); + + std::vector output_indices = {0}; + std::unordered_map quant_conversion_map = {{0, 1}}; + + auto status = QuantizeOutputs(&context, output_indices, quant_conversion_map); + EXPECT_TRUE(status.ok()); + EXPECT_THAT(quantized_data, Pointwise(Eq(), {0, 1, 2, 3, 4, 5})); +} + +} // namespace +} // namespace gpu +} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/delegate.cc b/tensorflow/lite/delegates/gpu/delegate.cc index 4b6727e66e7..702dd9a8417 100644 --- a/tensorflow/lite/delegates/gpu/delegate.cc +++ b/tensorflow/lite/delegates/gpu/delegate.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/common/model.h" #include "tensorflow/lite/delegates/gpu/common/model_builder.h" #include "tensorflow/lite/delegates/gpu/common/model_transformer.h" +#include "tensorflow/lite/delegates/gpu/common/quantization_util.h" #include "tensorflow/lite/delegates/gpu/common/status.h" #include "tensorflow/lite/delegates/gpu/gl/api2.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" @@ -210,12 +211,14 @@ class DelegateKernel { const bool is_dequant_required = !quant_conversion_map_.empty(); if (is_dequant_required) { - RETURN_IF_ERROR(DequantizeInputs(context)); + RETURN_IF_ERROR( + DequantizeInputs(context, input_indices_, quant_conversion_map_)); } RETURN_IF_ERROR(SetInputsAndOutputs(context)); RETURN_IF_ERROR(runner_->Run()); if (is_dequant_required) { - RETURN_IF_ERROR(QuantizeOutputs(context)); + RETURN_IF_ERROR( + QuantizeOutputs(context, output_indices_, quant_conversion_map_)); } return absl::OkStatus(); } @@ -277,70 +280,6 @@ class DelegateKernel { return absl::OkStatus(); } - // TODO(b/150798231): Refactor these two into common utils when generalizing - // to other backends. - - // Dequantizes input tensors pre-inference, leaving float tensors intact. 
- absl::Status DequantizeInputs(TfLiteContext* context) { - for (auto index : input_indices_) { - if (quant_conversion_map_.find(index) == quant_conversion_map_.end()) { - continue; - } - int original_tensor_idx = quant_conversion_map_[index]; - const TfLiteTensor& dequantized_tflite_tensor = context->tensors[index]; - const TfLiteTensor& original_tflite_tensor = - context->tensors[original_tensor_idx]; - DequantizationParams op_params; - op_params.zero_point = original_tflite_tensor.params.zero_point; - op_params.scale = original_tflite_tensor.params.scale; - if (original_tflite_tensor.type == kTfLiteInt8) { - optimized_ops::Dequantize(op_params, - GetTensorShape(&original_tflite_tensor), - original_tflite_tensor.data.int8, - GetTensorShape(&original_tflite_tensor), - dequantized_tflite_tensor.data.f); - } else if (original_tflite_tensor.type == kTfLiteUInt8) { - optimized_ops::Dequantize(op_params, - GetTensorShape(&original_tflite_tensor), - original_tflite_tensor.data.uint8, - GetTensorShape(&original_tflite_tensor), - dequantized_tflite_tensor.data.f); - } - } - return absl::OkStatus(); - } - - // Quantizes output tensors post-inference, leaving float tensors intact. - absl::Status QuantizeOutputs(TfLiteContext* context) { - for (auto index : output_indices_) { - if (quant_conversion_map_.find(index) == quant_conversion_map_.end()) { - continue; - } - int original_tensor_idx = quant_conversion_map_[index]; - const TfLiteTensor& dequantized_tflite_tensor = context->tensors[index]; - const TfLiteTensor& original_tflite_tensor = - context->tensors[original_tensor_idx]; - tflite::QuantizationParams op_params; - op_params.zero_point = original_tflite_tensor.params.zero_point; - op_params.scale = original_tflite_tensor.params.scale; - if (original_tflite_tensor.type == kTfLiteInt8) { - optimized_ops::AffineQuantize(op_params, - GetTensorShape(&original_tflite_tensor), - dequantized_tflite_tensor.data.f, - GetTensorShape(&original_tflite_tensor), - original_tflite_tensor.data.int8); - } else if (original_tflite_tensor.type == kTfLiteUInt8) { - optimized_ops::AffineQuantize(op_params, - GetTensorShape(&original_tflite_tensor), - dequantized_tflite_tensor.data.f, - GetTensorShape(&original_tflite_tensor), - original_tflite_tensor.data.uint8); - } - } - - return absl::OkStatus(); - } - absl::Status InitializeOpenClApi(GraphFloat32* graph, std::unique_ptr* builder, bool* graph_is_destroyed) { From a76afbcc8ab2fab741fd4c4e68af8b5a471fe7b1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 02:02:31 -0700 Subject: [PATCH 1457/1533] Update GraphDef version to 419. PiperOrigin-RevId: 314097101 Change-Id: I9e7fb69f6ac764f4623dfee58b54bc6f3430bd69 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 2143654d7e4..94536ab04ce 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 418 // Updated: 2020/5/31 +#define TF_GRAPH_DEF_VERSION 419 // Updated: 2020/6/1 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 67cd77f1ebe75c76423a007822f574e551c7339d Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 1 Jun 2020 02:02:31 -0700 Subject: [PATCH 1458/1533] compat: Update forward compatibility horizon to 2020-06-01 PiperOrigin-RevId: 314097102 Change-Id: Ie1b78d1043975397bbce5d50372905e260412bc5 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 3791f292d83..79375ba327a 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 31) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 1) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 42e08300a053cced03292b430d51a1273bb642f0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 02:31:03 -0700 Subject: [PATCH 1459/1533] Fixed error catching over h5py versions. PiperOrigin-RevId: 314099586 Change-Id: I4ee2ad47e79399808bd54a441d043f99cd8cddd2 --- tensorflow/python/keras/callbacks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index c4f29797c24..712a48e7188 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -1287,7 +1287,7 @@ class ModelCheckpoint(Callback): self._maybe_remove_file() except IOError as e: # `e.errno` appears to be `None` so checking the content of `e.args[0]`. - if 'is a directory' in six.ensure_str(e.args[0]): + if 'is a directory' in six.ensure_str(e.args[0]).lower(): raise IOError('Please specify a non-directory filepath for ' 'ModelCheckpoint. 
Filepath used is an existing ' 'directory: {}'.format(filepath)) From add80cd47acfa2335b260b8ab877e4dc5cff499b Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Mon, 1 Jun 2020 02:32:18 -0700 Subject: [PATCH 1460/1533] Make flex:delegate_data, init_tensorflow.h and init_tensorflow.cc visible for selective registration uses PiperOrigin-RevId: 314099714 Change-Id: I2da62c0817475cfd268608f2ff32e8ed1a531310 --- tensorflow/lite/delegates/flex/BUILD | 1 + tensorflow/lite/testing/BUILD | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 98314fdc1b8..8fc59c2c132 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -129,6 +129,7 @@ cc_library( name = "delegate_data", srcs = ["delegate_data.cc"], hdrs = ["delegate_data.h"], + visibility = ["//visibility:public"], deps = [ ":buffer_map", "@com_google_absl//absl/memory", diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD index 379230b3a4b..6452d511acc 100644 --- a/tensorflow/lite/testing/BUILD +++ b/tensorflow/lite/testing/BUILD @@ -25,6 +25,8 @@ package( exports_files([ "generated_examples_zip_test.cc", "tflite_diff_example_test.cc", + "init_tensorflow.h", + "init_tensorflow.cc", ]) [gen_zip_test( From 80768cb23a3a4314c52af0b48a6bcf23ca541e19 Mon Sep 17 00:00:00 2001 From: Stefano Galarraga Date: Mon, 1 Jun 2020 03:28:31 -0700 Subject: [PATCH 1461/1533] Accounts added dequantization operations into NNAPI model size Before this CL application would crash when using NNAPI with target accelerator specified with model containing Conv2d or FullyConnected or LSTM nodes with quantized weights. The NNAPI models generated by the NNAPI Delegate could contain extra Dequantize operations. The crash is caused by the buffer passed to ANeuralNetworksModel_getSupportedOperationsForDevices being too small since those extra Dequantize operations were not accounted. PiperOrigin-RevId: 314104459 Change-Id: I4784e62780c4bc44800f7c1bb3a1bb18b961d212 --- .../lite/delegates/nnapi/nnapi_delegate.cc | 57 ++++---- .../nnapi_delegate_device_selection_test.cc | 123 +++++++++++++++++- .../delegates/nnapi/nnapi_delegate_kernel.h | 8 +- 3 files changed, 155 insertions(+), 33 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index bf3714a443f..96b58d74120 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -784,19 +784,36 @@ class NNAPIOpBuilder { return kTfLiteOk; } + // Adds the operation to the model and maps the operation to the originating + // TFLite one. + TfLiteStatus AddOperationToModel(ANeuralNetworksOperationType type, + uint32_t input_count, const uint32_t* inputs, + uint32_t output_count, + const uint32_t* outputs, + int lite_node_index) { + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context_, + nnapi_->ANeuralNetworksModel_addOperation( + nn_model_, type, input_count, inputs, output_count, outputs), + "adding operation", nnapi_errno_); + nnapi_to_tflite_op_mapping_->push_back(lite_node_index); + return kTfLiteOk; + } + // Adds a Dequantize operator and replaces the input tensor index with the // dequantized version. If the dequantized version of the operator already // exists then it is not added again. 
- TfLiteStatus AddDequantize(int nn_input_index, int lite_index, - TfLiteType dequantized_type) { - const int ann_index = operand_mapping_->lite_index_to_ann(lite_index); + TfLiteStatus AddDequantize(int nn_input_index, int lite_tensor_index, + TfLiteType dequantized_type, int lite_node_index) { + const int ann_index = + operand_mapping_->lite_index_to_ann(lite_tensor_index); int dequantized_ann_index = dequantize_mapping_->DequantizedAnnIndex(ann_index, dequantized_type); if (dequantized_ann_index == -1) { // The dequantized version does not exist yet, it has to be added: a new // Dequantize operation is added, yielding a new tensor. - const TfLiteTensor& tensor = context_->tensors[lite_index]; + const TfLiteTensor& tensor = context_->tensors[lite_tensor_index]; ANeuralNetworksOperandType operand_type{ ANEURALNETWORKS_TENSOR_FLOAT32, static_cast(tensor.dims->size), @@ -811,12 +828,11 @@ class NNAPIOpBuilder { const uint32_t dequantize_input[1] = {static_cast(ann_index)}; const uint32_t dequantize_output[1] = { static_cast(dequantized_ann_index)}; - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context_, - nnapi_->ANeuralNetworksModel_addOperation( - nn_model_, ANEURALNETWORKS_DEQUANTIZE, 1, dequantize_input, 1, - dequantize_output), - "adding operation", nnapi_errno_); + TF_LITE_ENSURE_OK( + context_, AddOperationToModel(ANEURALNETWORKS_DEQUANTIZE, + /*input_count=*/1, dequantize_input, + /*output_count=*/1, dequantize_output, + lite_node_index)); dequantize_mapping_->Add(ann_index, dequantized_type, dequantized_ann_index); } @@ -832,15 +848,12 @@ class NNAPIOpBuilder { TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type, int lite_node_index) { // Actually add a NN API operation - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context_, - nnapi_->ANeuralNetworksModel_addOperation( - nn_model_, type, static_cast(augmented_inputs_.size()), - augmented_inputs_.data(), - static_cast(augmented_outputs_.size()), - augmented_outputs_.data()), - "adding operation", nnapi_errno_); - nnapi_to_tflite_op_mapping_->push_back(lite_node_index); + TF_LITE_ENSURE_OK(context_, + AddOperationToModel( + type, static_cast(augmented_inputs_.size()), + augmented_inputs_.data(), + static_cast(augmented_outputs_.size()), + augmented_outputs_.data(), lite_node_index)); augmented_inputs_.clear(); augmented_outputs_.clear(); return kTfLiteOk; @@ -3563,7 +3576,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, void NNAPIDelegateKernel::AddDequantizeOperatorsWhereNeeded( const TfLiteContext* context, int builtin_code, const TfLiteNode* node, - NNAPIOpBuilder* builder, int* nnapi_errno) { + int tflite_node_index, NNAPIOpBuilder* builder, int* nnapi_errno) { // Depending on the operator and the input data format, Dequantize // operators may need to be added. For example when the input is // floating-point but weights are quantized then the weights will first be @@ -3611,7 +3624,7 @@ void NNAPIDelegateKernel::AddDequantizeOperatorsWhereNeeded( // Insert Dequantize operator if it hasn't been done already and change // the node's input accordingly. - builder->AddDequantize(i, node->inputs->data[i], type); + builder->AddDequantize(i, node->inputs->data[i], type, tflite_node_index); } } @@ -3925,7 +3938,7 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, // Dequantize operators may have to be added in case inputs are to be // floating-point. 
AddDequantizeOperatorsWhereNeeded(context, reg->builtin_code, node, - &builder, nnapi_errno); + node_index, &builder, nnapi_errno); builder.FinalizeAddOperation(nn_op_type, node_index); } diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc index c89e7fd14aa..9a0d13af87a 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include #include #include #include @@ -568,12 +569,9 @@ TEST_F(UnsupportedOperationOnDeviceTest, // This is a model with two ops: // -// input1 ----> -// ADD -- -// input2 --> | -// --> -// SUB --> output -// input3 ----------------> +// input1 ----> HARD_SWISH ----> +// ADD --> output +// input2 ----------------------> // class HardSwishAddOpsAcceleratedModel : public MultiOpModel, public AcceleratedModel { @@ -714,6 +712,119 @@ TEST_F(TfLiteOpMappedToMultipleNnApiOps, AllConstitutentOpsSupported) { ASSERT_EQ(m.CountOpsExecutedByCpuKernel(), 0); } +class QuantizedWeightsConvolutionOpModel : public SingleOpModel, + public AcceleratedModel { + public: + QuantizedWeightsConvolutionOpModel( + const NnApi* nnapi, std::string accelerator_name, const TensorData& input, + const TensorData& filter, const TensorData& output, int stride_width = 2, + int stride_height = 2, enum Padding padding = Padding_VALID, + enum ActivationFunctionType activation = ActivationFunctionType_NONE, + int dilation_width_factor = 1, int dilation_height_factor = 1, + int num_threads = -1, std::initializer_list filter_data = {}) + : SingleOpModel(), AcceleratedModel(nnapi, accelerator_name) { + auto* delegate = GetDelegate(); + this->SetApplyDelegate([delegate](Interpreter* interpreter) { + interpreter->ModifyGraphWithDelegate(delegate); + }); + + input_ = AddInput(input); + + if (filter_data.size()) { + filter_ = AddConstInput(filter, filter_data); + } else { + filter_ = AddInput(filter); + } + + int bias_size = GetShape(filter_)[0]; + + bias_ = AddInput({TensorType_FLOAT32, {bias_size}}); + + output_ = AddOutput(output); + + SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions, + CreateConv2DOptions( + builder_, padding, stride_width, stride_height, activation, + dilation_width_factor, dilation_height_factor) + .Union()); + + BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)}, + num_threads); + } + + void SetInput(std::initializer_list data) { + PopulateTensor(input_, data); + } + + void SetFilter(std::initializer_list data) { + QuantizeAndPopulate(filter_, data); + } + + void SetBias(std::initializer_list data) { + PopulateTensor(input_, data); + } + + std::vector GetOutput() { return ExtractVector(output_); } + std::vector GetDequantizedOutput() { + return Dequantize(ExtractVector(output_), + GetScale(output_), GetZeroPoint(output_)); + } + + protected: + int input_; + int filter_; + int bias_; + int output_; +}; + +int quantized_conv2d_model_added_nnapi_ops_count = 0; +TEST_F(TfLiteOpMappedToMultipleNnApiOps, + AddedDequantizationsAreAccountedInModelOps) { + nnapi_mock_->ModelCreateReturns<0>(); + nnapi_mock_->StubGetSupportedOperationsForDevicesWith( + [](const ANeuralNetworksModel* model, + const ANeuralNetworksDevice* const* devices, uint32_t numDevices, + bool* supportedOps) -> int { + std::fill(supportedOps, + supportedOps + quantized_conv2d_model_added_nnapi_ops_count, + true); 
+ return ANEURALNETWORKS_NO_ERROR; + }); + nnapi_mock_->StubAddOperationWith( + [](ANeuralNetworksModel* model, ANeuralNetworksOperationType type, + uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, + const uint32_t* outputs) -> int { + ++quantized_conv2d_model_added_nnapi_ops_count; + return ANEURALNETWORKS_NO_ERROR; + }); + + QuantizedWeightsConvolutionOpModel m( + nnapi_mock_->GetNnApi(), + /*accelerator_name=*/"test-device", {TensorType_FLOAT32, {2, 2, 4, 1}}, + {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, {TensorType_FLOAT32, {}}); + m.SetInput({ + // First batch + 1, 1, 1, 1, // row = 1 + 2, 2, 2, 2, // row = 2 + // Second batch + 1, 2, 3, 4, // row = 1 + 1, 2, 3, 4, // row = 2 + }); + m.SetFilter({ + 1, 2, 3, 4, // first 2x2 filter + -1, 1, -1, 1, // second 2x2 filter + -1, -1, 1, 1, // third 2x2 filter + }); + m.SetBias({1, 2, 3}); + + EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 0); + // When delegating quantized Conv2D, for each quantized inputs a + // dequantize operation is added to the model. + // In our case 1 Dequantize op for the weights is expected generating + // a 2 ops model. + EXPECT_EQ(quantized_conv2d_model_added_nnapi_ops_count, 2); +} + // Model with a chain of no-op (add with zero operations) // interleaved with no-op custom nodes. class LongIdentityModel : public MultiOpModel, public AcceleratedModel { diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index af93d9650c9..26822c011e3 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -341,11 +341,9 @@ class NNAPIDelegateKernel { std::vector nnapi_to_tflite_op_mapping_; - void AddDequantizeOperatorsWhereNeeded(const TfLiteContext* context, - int builtin_code, - const TfLiteNode* node, - NNAPIOpBuilder* builder, - int* nnapi_errno); + void AddDequantizeOperatorsWhereNeeded( + const TfLiteContext* context, int builtin_code, const TfLiteNode* node, + int tflite_node_index, NNAPIOpBuilder* builder, int* nnapi_errno); TfLiteStatus AddOpsAndTensors(TfLiteContext* context, int* nnapi_errno); From de32c75f2f0b9c298d858180fc19fa8881bfab41 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 03:46:20 -0700 Subject: [PATCH 1462/1533] Internal change PiperOrigin-RevId: 314105811 Change-Id: I704529b3c56df993de13b350ec4205857962eedb --- .../lite/delegates/nnapi/nnapi_delegate.cc | 57 ++++---- .../nnapi_delegate_device_selection_test.cc | 123 +----------------- .../delegates/nnapi/nnapi_delegate_kernel.h | 8 +- 3 files changed, 33 insertions(+), 155 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 96b58d74120..bf3714a443f 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -784,36 +784,19 @@ class NNAPIOpBuilder { return kTfLiteOk; } - // Adds the operation to the model and maps the operation to the originating - // TFLite one. 
- TfLiteStatus AddOperationToModel(ANeuralNetworksOperationType type, - uint32_t input_count, const uint32_t* inputs, - uint32_t output_count, - const uint32_t* outputs, - int lite_node_index) { - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context_, - nnapi_->ANeuralNetworksModel_addOperation( - nn_model_, type, input_count, inputs, output_count, outputs), - "adding operation", nnapi_errno_); - nnapi_to_tflite_op_mapping_->push_back(lite_node_index); - return kTfLiteOk; - } - // Adds a Dequantize operator and replaces the input tensor index with the // dequantized version. If the dequantized version of the operator already // exists then it is not added again. - TfLiteStatus AddDequantize(int nn_input_index, int lite_tensor_index, - TfLiteType dequantized_type, int lite_node_index) { - const int ann_index = - operand_mapping_->lite_index_to_ann(lite_tensor_index); + TfLiteStatus AddDequantize(int nn_input_index, int lite_index, + TfLiteType dequantized_type) { + const int ann_index = operand_mapping_->lite_index_to_ann(lite_index); int dequantized_ann_index = dequantize_mapping_->DequantizedAnnIndex(ann_index, dequantized_type); if (dequantized_ann_index == -1) { // The dequantized version does not exist yet, it has to be added: a new // Dequantize operation is added, yielding a new tensor. - const TfLiteTensor& tensor = context_->tensors[lite_tensor_index]; + const TfLiteTensor& tensor = context_->tensors[lite_index]; ANeuralNetworksOperandType operand_type{ ANEURALNETWORKS_TENSOR_FLOAT32, static_cast(tensor.dims->size), @@ -828,11 +811,12 @@ class NNAPIOpBuilder { const uint32_t dequantize_input[1] = {static_cast(ann_index)}; const uint32_t dequantize_output[1] = { static_cast(dequantized_ann_index)}; - TF_LITE_ENSURE_OK( - context_, AddOperationToModel(ANEURALNETWORKS_DEQUANTIZE, - /*input_count=*/1, dequantize_input, - /*output_count=*/1, dequantize_output, - lite_node_index)); + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context_, + nnapi_->ANeuralNetworksModel_addOperation( + nn_model_, ANEURALNETWORKS_DEQUANTIZE, 1, dequantize_input, 1, + dequantize_output), + "adding operation", nnapi_errno_); dequantize_mapping_->Add(ann_index, dequantized_type, dequantized_ann_index); } @@ -848,12 +832,15 @@ class NNAPIOpBuilder { TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type, int lite_node_index) { // Actually add a NN API operation - TF_LITE_ENSURE_OK(context_, - AddOperationToModel( - type, static_cast(augmented_inputs_.size()), - augmented_inputs_.data(), - static_cast(augmented_outputs_.size()), - augmented_outputs_.data(), lite_node_index)); + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context_, + nnapi_->ANeuralNetworksModel_addOperation( + nn_model_, type, static_cast(augmented_inputs_.size()), + augmented_inputs_.data(), + static_cast(augmented_outputs_.size()), + augmented_outputs_.data()), + "adding operation", nnapi_errno_); + nnapi_to_tflite_op_mapping_->push_back(lite_node_index); augmented_inputs_.clear(); augmented_outputs_.clear(); return kTfLiteOk; @@ -3576,7 +3563,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, void NNAPIDelegateKernel::AddDequantizeOperatorsWhereNeeded( const TfLiteContext* context, int builtin_code, const TfLiteNode* node, - int tflite_node_index, NNAPIOpBuilder* builder, int* nnapi_errno) { + NNAPIOpBuilder* builder, int* nnapi_errno) { // Depending on the operator and the input data format, Dequantize // operators may need to be added. 
For example when the input is // floating-point but weights are quantized then the weights will first be @@ -3624,7 +3611,7 @@ void NNAPIDelegateKernel::AddDequantizeOperatorsWhereNeeded( // Insert Dequantize operator if it hasn't been done already and change // the node's input accordingly. - builder->AddDequantize(i, node->inputs->data[i], type, tflite_node_index); + builder->AddDequantize(i, node->inputs->data[i], type); } } @@ -3938,7 +3925,7 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, // Dequantize operators may have to be added in case inputs are to be // floating-point. AddDequantizeOperatorsWhereNeeded(context, reg->builtin_code, node, - node_index, &builder, nnapi_errno); + &builder, nnapi_errno); builder.FinalizeAddOperation(nn_op_type, node_index); } diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc index 9a0d13af87a..c89e7fd14aa 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_device_selection_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include -#include #include #include #include @@ -569,9 +568,12 @@ TEST_F(UnsupportedOperationOnDeviceTest, // This is a model with two ops: // -// input1 ----> HARD_SWISH ----> -// ADD --> output -// input2 ----------------------> +// input1 ----> +// ADD -- +// input2 --> | +// --> +// SUB --> output +// input3 ----------------> // class HardSwishAddOpsAcceleratedModel : public MultiOpModel, public AcceleratedModel { @@ -712,119 +714,6 @@ TEST_F(TfLiteOpMappedToMultipleNnApiOps, AllConstitutentOpsSupported) { ASSERT_EQ(m.CountOpsExecutedByCpuKernel(), 0); } -class QuantizedWeightsConvolutionOpModel : public SingleOpModel, - public AcceleratedModel { - public: - QuantizedWeightsConvolutionOpModel( - const NnApi* nnapi, std::string accelerator_name, const TensorData& input, - const TensorData& filter, const TensorData& output, int stride_width = 2, - int stride_height = 2, enum Padding padding = Padding_VALID, - enum ActivationFunctionType activation = ActivationFunctionType_NONE, - int dilation_width_factor = 1, int dilation_height_factor = 1, - int num_threads = -1, std::initializer_list filter_data = {}) - : SingleOpModel(), AcceleratedModel(nnapi, accelerator_name) { - auto* delegate = GetDelegate(); - this->SetApplyDelegate([delegate](Interpreter* interpreter) { - interpreter->ModifyGraphWithDelegate(delegate); - }); - - input_ = AddInput(input); - - if (filter_data.size()) { - filter_ = AddConstInput(filter, filter_data); - } else { - filter_ = AddInput(filter); - } - - int bias_size = GetShape(filter_)[0]; - - bias_ = AddInput({TensorType_FLOAT32, {bias_size}}); - - output_ = AddOutput(output); - - SetBuiltinOp(BuiltinOperator_CONV_2D, BuiltinOptions_Conv2DOptions, - CreateConv2DOptions( - builder_, padding, stride_width, stride_height, activation, - dilation_width_factor, dilation_height_factor) - .Union()); - - BuildInterpreter({GetShape(input_), GetShape(filter_), GetShape(bias_)}, - num_threads); - } - - void SetInput(std::initializer_list data) { - PopulateTensor(input_, data); - } - - void SetFilter(std::initializer_list data) { - QuantizeAndPopulate(filter_, data); - } - - void SetBias(std::initializer_list data) { - PopulateTensor(input_, data); - } - - std::vector GetOutput() { return ExtractVector(output_); } - std::vector GetDequantizedOutput() { - return 
Dequantize(ExtractVector(output_), - GetScale(output_), GetZeroPoint(output_)); - } - - protected: - int input_; - int filter_; - int bias_; - int output_; -}; - -int quantized_conv2d_model_added_nnapi_ops_count = 0; -TEST_F(TfLiteOpMappedToMultipleNnApiOps, - AddedDequantizationsAreAccountedInModelOps) { - nnapi_mock_->ModelCreateReturns<0>(); - nnapi_mock_->StubGetSupportedOperationsForDevicesWith( - [](const ANeuralNetworksModel* model, - const ANeuralNetworksDevice* const* devices, uint32_t numDevices, - bool* supportedOps) -> int { - std::fill(supportedOps, - supportedOps + quantized_conv2d_model_added_nnapi_ops_count, - true); - return ANEURALNETWORKS_NO_ERROR; - }); - nnapi_mock_->StubAddOperationWith( - [](ANeuralNetworksModel* model, ANeuralNetworksOperationType type, - uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, - const uint32_t* outputs) -> int { - ++quantized_conv2d_model_added_nnapi_ops_count; - return ANEURALNETWORKS_NO_ERROR; - }); - - QuantizedWeightsConvolutionOpModel m( - nnapi_mock_->GetNnApi(), - /*accelerator_name=*/"test-device", {TensorType_FLOAT32, {2, 2, 4, 1}}, - {TensorType_UINT8, {3, 2, 2, 1}, -63.5, 64}, {TensorType_FLOAT32, {}}); - m.SetInput({ - // First batch - 1, 1, 1, 1, // row = 1 - 2, 2, 2, 2, // row = 2 - // Second batch - 1, 2, 3, 4, // row = 1 - 1, 2, 3, 4, // row = 2 - }); - m.SetFilter({ - 1, 2, 3, 4, // first 2x2 filter - -1, 1, -1, 1, // second 2x2 filter - -1, -1, 1, 1, // third 2x2 filter - }); - m.SetBias({1, 2, 3}); - - EXPECT_EQ(m.CountOpsExecutedByCpuKernel(), 0); - // When delegating quantized Conv2D, for each quantized inputs a - // dequantize operation is added to the model. - // In our case 1 Dequantize op for the weights is expected generating - // a 2 ops model. - EXPECT_EQ(quantized_conv2d_model_added_nnapi_ops_count, 2); -} - // Model with a chain of no-op (add with zero operations) // interleaved with no-op custom nodes. 
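 // (An ADD whose addend is all zeros passes its input through unchanged,
 // which is what makes the adds in this chain no-ops.)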
class LongIdentityModel : public MultiOpModel, public AcceleratedModel { diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 26822c011e3..af93d9650c9 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -341,9 +341,11 @@ class NNAPIDelegateKernel { std::vector nnapi_to_tflite_op_mapping_; - void AddDequantizeOperatorsWhereNeeded( - const TfLiteContext* context, int builtin_code, const TfLiteNode* node, - int tflite_node_index, NNAPIOpBuilder* builder, int* nnapi_errno); + void AddDequantizeOperatorsWhereNeeded(const TfLiteContext* context, + int builtin_code, + const TfLiteNode* node, + NNAPIOpBuilder* builder, + int* nnapi_errno); TfLiteStatus AddOpsAndTensors(TfLiteContext* context, int* nnapi_errno); From a26381f3dc49cfe7ef4bdc05652fc71b62f932f1 Mon Sep 17 00:00:00 2001 From: Lev Proleev Date: Mon, 1 Jun 2020 05:49:15 -0700 Subject: [PATCH 1463/1533] Add NNAPI delegate support for Fill PiperOrigin-RevId: 314116176 Change-Id: I8216b776ec823ff9c0696410091291f6b2d3a385 --- .../delegates/nnapi/acceleration_test_list.cc | 5 + .../lite/delegates/nnapi/nnapi_delegate.cc | 93 +++++++++++++ tensorflow/lite/kernels/BUILD | 1 + tensorflow/lite/kernels/fill_test.cc | 128 +++++++++++------- tensorflow/lite/kernels/test_util.h | 2 + 5 files changed, 179 insertions(+), 50 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc index 46a6a720d1e..7800e984c7f 100644 --- a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc +++ b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc @@ -173,6 +173,11 @@ ExpOpTest/FloatTest,29 # Only constant tensors models ExpandDimsOpTest/.+/1,29 +# fill_test +FillOpTest/FillOpTest/FillInt32/0,30 +FillOpTest/FillOpTest/FillFloat/0,30 +FillOpTest/FillOpTest/FillFloatInt32Dims/0,30 + # floor_test FloorOpTest/.+ diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index bf3714a443f..bdcb72848f6 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -2343,6 +2343,50 @@ bool NNAPIDelegateKernel::Validate( ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI13, &val_ctx); } break; + case kTfLiteBuiltinFill: { + ExpectOpVersion(version, 1, &val_ctx); + ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI13, + &val_ctx); + const auto& dims_tensor = context->tensors[node->inputs->data[0]]; + Expect(IsConstantTensor(&dims_tensor), + NNAPIValidationFailureType::kUnsupportedInputType, + "NNAPI doesn't support dynamic dimensions tensor.", &val_ctx); + EXPECT_INPUT_TYPE_IN(dims_tensor.type, kTfLiteInt32, kTfLiteInt64); + if (IsConstantTensor(&dims_tensor)) { + Expect(dims_tensor.dims->data[0] != 0, + NNAPIValidationFailureType::kUnsupportedOperandValue, + "NNAPI doesn't support generating scalars from FILL", &val_ctx); + if (dims_tensor.type == kTfLiteInt64) { + bool fit_in_int32 = + std::all_of(dims_tensor.data.i64, + dims_tensor.data.i64 + dims_tensor.dims->data[0], + [](int64_t dim) { + return std::numeric_limits::min() <= dim && + dim <= std::numeric_limits::max(); + }); + Expect(fit_in_int32, + NNAPIValidationFailureType::kUnsupportedOperandValue, + "NNAPI only supports int32 dimensions tensor. 
If the " + "dimensions type is int64 and they are constant we can " + "convert them to int32 if the value isn't too large.", + &val_ctx); + } + } + const auto& value_tensor = context->tensors[node->inputs->data[1]]; + EXPECT_INPUT_TYPE_IN(value_tensor.type, kTfLiteFloat32, kTfLiteInt32, + kTfLiteInt64); + if (value_tensor.type == kTfLiteInt64) { + Expect( + IsConstantTensor(&value_tensor) && + *value_tensor.data.i64 <= std::numeric_limits::max() && + *value_tensor.data.i64 >= std::numeric_limits::min(), + NNAPIValidationFailureType::kUnsupportedInputType, + "NNAPI only supports int32 input. If the input type is int64 and " + "constant we can convert it to int32 if the value isn't too " + "large.", + &val_ctx); + } + } break; default: // All other operators are not mapped. AddValidationFailure(NNAPIValidationFailureType::kUnsupportedOperator, @@ -3127,6 +3171,9 @@ TfLiteStatus NNAPIDelegateKernel::Map( mapping_args.builder->AddScalarFloat32Operand(1.0); *nn_op_type = ANEURALNETWORKS_ELU; } break; + case kTfLiteBuiltinFill: { + *nn_op_type = ANEURALNETWORKS_FILL; + } break; default: // All other operators are not mapped. return kTfLiteError; @@ -3882,6 +3929,52 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index, hybrid_op, input_tensor_flags)); } + } else if (reg->builtin_code == kTfLiteBuiltinFill) { + if (input_pos == 0) { + const int dims_id = node->inputs->data[0]; + const TfLiteTensor& dims_tensor = context->tensors[dims_id]; + switch (dims_tensor.type) { + case kTfLiteInt32: + TF_LITE_ENSURE_STATUS( + builder.AddTensorInput(input_index, hybrid_op)); + break; + case kTfLiteInt64: { + // We made sure that dimensions are constant and fit into int32 in + // Map(), so we can safely create a new tensor with casted values. + const int dims_size = dims_tensor.dims->data[0]; + std::vector dims_int32(dims_size); + std::copy(dims_tensor.data.i64, dims_tensor.data.i64 + dims_size, + dims_int32.begin()); + int new_tensor_index = -1; + builder.AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_INT32, kTfLiteInt32, dims_tensor.dims, + dims_int32, dims_tensor.params, &new_tensor_index); + } break; + default: + return kTfLiteError; + } + } else { + const int value_id = node->inputs->data[1]; + const TfLiteTensor& value_tensor = context->tensors[value_id]; + switch (value_tensor.type) { + case kTfLiteFloat32: + TF_LITE_ENSURE_STATUS( + builder.AddScalarFloat32Operand(*value_tensor.data.f)); + break; + case kTfLiteInt32: + TF_LITE_ENSURE_STATUS( + builder.AddScalarInt32Operand(*value_tensor.data.i32)); + break; + case kTfLiteInt64: + // Map() function already makes sure int64 input is constant and + // fits into int32. 
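+            // (The kTfLiteBuiltinFill validation above only accepts a constant
+            // int64 value that fits in int32, so the narrowing cast below
+            // cannot overflow.)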
+ TF_LITE_ENSURE_STATUS(builder.AddScalarInt32Operand( + static_cast(*value_tensor.data.i64))); + break; + default: + return kTfLiteError; + } + } } else { TF_LITE_ENSURE_STATUS( builder.AddTensorInput(input_index, hybrid_op, input_tensor_flags)); diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 657b5d89a85..eb62a338a45 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -2014,6 +2014,7 @@ cc_test( name = "fill_test", size = "small", srcs = ["fill_test.cc"], + tags = ["tflite_nnapi"], deps = [ ":builtin_ops", ":test_main", diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc index 4ab013bb357..0717a31b9d7 100644 --- a/tensorflow/lite/kernels/fill_test.cc +++ b/tensorflow/lite/kernels/fill_test.cc @@ -24,87 +24,115 @@ namespace { using ::testing::ElementsAreArray; using ::testing::IsEmpty; +enum class TestType { + kConst = 0, + kDynamic = 1, +}; + +template class FillOpModel : public SingleOpModel { public: - explicit FillOpModel(const TensorData& input1, const TensorData& input2) { - input1_ = AddInput(input1); - input2_ = AddInput(input2); - output_ = AddOutput(input1); + explicit FillOpModel(TensorType dims_tensor_type, + std::initializer_list dims_shape, + std::initializer_list dims_data, + value_type value, TestType input_tensor_types) { + if (input_tensor_types == TestType::kDynamic) { + dims_ = AddInput(dims_tensor_type); + value_ = AddInput(GetTensorType()); + } else { + dims_ = AddConstInput(dims_tensor_type, dims_data, dims_shape); + value_ = AddConstInput(GetTensorType(), {value}, {}); + } + output_ = AddOutput(GetTensorType()); SetBuiltinOp(BuiltinOperator_FILL, BuiltinOptions_FillOptions, CreateFillOptions(builder_).Union()); - BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + BuildInterpreter({dims_shape, {}}); + + if (input_tensor_types == TestType::kDynamic) { + if (dims_data.size() > 0) { + PopulateTensor(dims_, dims_data); + } + PopulateTensor(value_, {value}); + } } - int input1() { return input1_; } - int input2() { return input2_; } - int output() { return output_; } + std::vector GetOutput() { + return ExtractVector(output_); + } + std::vector GetOutputShape() { return GetTensorShape(output_); } protected: - int input1_; - int input2_; + int dims_; + int value_; int output_; }; -TEST(FillOpModel, FillInt32) { - FillOpModel m({TensorType_INT32, {2}}, {TensorType_INT32}); - m.PopulateTensor(m.input1(), {2, 3}); - m.PopulateTensor(m.input2(), {-11}); +class FillOpTest : public ::testing::TestWithParam {}; + +TEST_P(FillOpTest, FillInt32) { + FillOpModel m(TensorType_INT32, {2}, {2, 3}, -11, + GetParam()); m.Invoke(); - EXPECT_THAT(m.ExtractVector(m.output()), - ElementsAreArray({-11, -11, -11, -11, -11, -11})); - EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-11, -11, -11, -11, -11, -11})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 3})); } -TEST(FillOpModel, FillInt64) { - FillOpModel m({TensorType_INT32, {2}}, {TensorType_INT64}); - m.PopulateTensor(m.input1(), {2, 4}); - m.PopulateTensor(m.input2(), {1LL << 45}); +TEST_P(FillOpTest, FillInt64) { + FillOpModel m(TensorType_INT64, {2}, {2, 4}, 1LL << 45, + GetParam()); m.Invoke(); - EXPECT_THAT(m.ExtractVector(m.output()), + EXPECT_THAT(m.GetOutput(), ElementsAreArray({1LL << 45, 1LL << 45, 1LL << 45, 1LL << 45, 1LL << 45, 1LL << 45, 1LL << 45, 1LL << 45})); - EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 4})); + 
EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 4})); } -TEST(FillOpModel, FillFloat) { - FillOpModel m({TensorType_INT64, {3}}, {TensorType_FLOAT32}); - m.PopulateTensor(m.input1(), {2, 2, 2}); - m.PopulateTensor(m.input2(), {4.0}); +TEST_P(FillOpTest, FillFloat) { + FillOpModel m(TensorType_INT64, {3}, {2, 2, 2}, 4.0, + GetParam()); m.Invoke(); - EXPECT_THAT(m.ExtractVector(m.output()), + EXPECT_THAT(m.GetOutput(), ElementsAreArray({4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0})); - EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 2, 2})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2})); } -TEST(FillOpModel, FillOutputScalar) { - FillOpModel m({TensorType_INT64, {0}}, {TensorType_FLOAT32}); - m.PopulateTensor(m.input2(), {4.0}); +TEST_P(FillOpTest, FillFloatInt32Dims) { + FillOpModel m(TensorType_INT32, {3}, {2, 2, 2}, 4.0, + GetParam()); m.Invoke(); - EXPECT_THAT(m.ExtractVector(m.output()), ElementsAreArray({4.0})); - EXPECT_THAT(m.GetTensorShape(m.output()), IsEmpty()); + EXPECT_THAT(m.GetOutput(), + ElementsAreArray({4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2})); } -TEST(FillOpModel, FillBool) { - FillOpModel m({TensorType_INT64, {3}}, {TensorType_BOOL}); - m.PopulateTensor(m.input1(), {2, 2, 2}); - m.PopulateTensor(m.input2(), {true}); +TEST_P(FillOpTest, FillOutputScalar) { + FillOpModel m(TensorType_INT64, {0}, {}, 4.0, GetParam()); m.Invoke(); - EXPECT_THAT( - m.ExtractVector(m.output()), - ElementsAreArray({true, true, true, true, true, true, true, true})); - EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 2, 2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({4.0})); + EXPECT_THAT(m.GetOutputShape(), IsEmpty()); } -TEST(FillOpModel, FillString) { - FillOpModel m({TensorType_INT64, {3}}, {TensorType_STRING}); - m.PopulateTensor(m.input1(), {2, 2, 2}); - m.PopulateTensor(m.input2(), {"AB"}); +TEST_P(FillOpTest, FillBool) { + FillOpModel m(TensorType_INT64, {3}, {2, 2, 2}, true, + GetParam()); m.Invoke(); - EXPECT_THAT( - m.ExtractVector(m.output()), - ElementsAreArray({"AB", "AB", "AB", "AB", "AB", "AB", "AB", "AB"})); - EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 2, 2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({true, true, true, true, true, + true, true, true})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2})); } +TEST(FillOpTest, FillString) { + FillOpModel m(TensorType_INT64, {3}, {2, 2, 2}, "AB", + TestType::kDynamic); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({"AB", "AB", "AB", "AB", "AB", + "AB", "AB", "AB"})); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 2})); +} + +INSTANTIATE_TEST_SUITE_P(FillOpTest, FillOpTest, + ::testing::Values(TestType::kConst, + TestType::kDynamic)); + } // namespace } // namespace tflite diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h index 90a4df56c57..ac1ad5d9025 100644 --- a/tensorflow/lite/kernels/test_util.h +++ b/tensorflow/lite/kernels/test_util.h @@ -17,6 +17,7 @@ limitations under the License. 
#include #include +#include #include #include @@ -812,6 +813,7 @@ TensorType GetTensorType() { if (std::is_same::value) return TensorType_INT64; if (std::is_same::value) return TensorType_UINT8; if (std::is_same::value) return TensorType_STRING; + if (std::is_same::value) return TensorType_BOOL; return TensorType_MIN; // default value } From b3c62ded48b3ffe9076c227ed9e204f18bd9c85a Mon Sep 17 00:00:00 2001 From: tg-at-google Date: Mon, 1 Jun 2020 11:34:34 -0400 Subject: [PATCH 1464/1533] in resolution of [Wsign-compare] warning id 7 I reexpress the array limit of `handled_signals` and the index `i` in terms of size_t to resolve build warning: `tensorflow/core/platform/default/stacktrace_handler.cc: tensorflow/core/platform/default/stacktrace_handler.cc: In function 'void tensorflow::testing::InstallStacktraceHandler()': tensorflow/core/platform/default/stacktrace_handler.cc:103:21: warning: comparison of integer expressions of different signedness: 'int' and 'long unsigned int' [-Wsign-compare] 103 | for (int i = 0; i < sizeof(handled_signals) / sizeof(int); i++) { | ~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~` --- tensorflow/core/platform/default/stacktrace_handler.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/platform/default/stacktrace_handler.cc b/tensorflow/core/platform/default/stacktrace_handler.cc index 72907ecb526..cd38d944857 100644 --- a/tensorflow/core/platform/default/stacktrace_handler.cc +++ b/tensorflow/core/platform/default/stacktrace_handler.cc @@ -99,8 +99,9 @@ static void StacktraceHandler(int sig, siginfo_t *si, void *v) { void InstallStacktraceHandler() { int handled_signals[] = {SIGSEGV, SIGABRT, SIGBUS, SIGILL, SIGFPE}; - - for (int i = 0; i < sizeof(handled_signals) / sizeof(int); i++) { + + size_t array_limit = sizeof(handled_signals) / sizeof(int); + for (size_t i = 0; i < array_limit; i++) { int sig = handled_signals[i]; struct sigaction sa; struct sigaction osa; From dc08ad80c8e65ac3e245035213a5cef861206aa8 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 1 Jun 2020 09:35:07 -0700 Subject: [PATCH 1465/1533] Add unit tests for TPUEmbedding API. 
PiperOrigin-RevId: 314144719 Change-Id: I1f70980270d2484da7f31dd04b0c27087ee15d1a --- tensorflow/python/tpu/BUILD | 60 - .../python/tpu/tpu_embedding_v2_cpu_test.py | 239 --- .../python/tpu/tpu_embedding_v2_test.py | 1397 ----------------- .../python/tpu/tpu_embedding_v2_test_lib.py | 96 -- 4 files changed, 1792 deletions(-) delete mode 100644 tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py delete mode 100644 tensorflow/python/tpu/tpu_embedding_v2_test.py delete mode 100644 tensorflow/python/tpu/tpu_embedding_v2_test_lib.py diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD index ea1317536a4..d398396ec2a 100644 --- a/tensorflow/python/tpu/BUILD +++ b/tensorflow/python/tpu/BUILD @@ -468,66 +468,6 @@ py_library( ], ) -py_library( - name = "tpu_embedding_v2_test_lib", - srcs = ["tpu_embedding_v2_test_lib.py"], - srcs_version = "PY2AND3", - deps = [ - ":tpu_embedding_v2_utils", - "//tensorflow/python:client_testlib", - "//tensorflow/python:init_ops_v2", - ], -) - -tpu_py_test( - name = "tpu_embedding_v2_test", - srcs = [ - "tpu_embedding_v2_test.py", - ], - disable_experimental = True, - python_version = "PY3", - shard_count = 4, - srcs_version = "PY2AND3", - deps = [ - ":tpu_embedding", - ":tpu_embedding_v2", - ":tpu_embedding_v2_test_lib", - ":tpu_strategy_util", - "//tensorflow/python:init_ops_v2", - "//tensorflow/python:tensor_spec", - "//tensorflow/python:variables", - "//tensorflow/python/compat:v2_compat", - "//tensorflow/python/data/ops:dataset_ops", - "//tensorflow/python/distribute:distribute_lib", - "//tensorflow/python/distribute:tpu_strategy", - "//tensorflow/python/distribute/cluster_resolver:tpu_cluster_resolver_py", - "//tensorflow/python/eager:backprop", - "//tensorflow/python/eager:def_function", - "//tensorflow/python/eager:remote", - "//tensorflow/python/ops/ragged:ragged_tensor", - "//tensorflow/python/saved_model", - "//tensorflow/python/training/tracking:util", - "//third_party/py/numpy", - ], -) - -tf_py_test( - name = "tpu_embedding_v2_cpu_test", - srcs = [ - "tpu_embedding_v2_cpu_test.py", - ], - python_version = "PY3", - srcs_version = "PY2AND3", - deps = [ - ":tpu_embedding_v2", - ":tpu_embedding_v2_test_lib", - "//tensorflow/python:init_ops_v2", - "//tensorflow/python/compat:v2_compat", - "//tensorflow/python/ops/ragged:ragged_tensor", - "//third_party/py/numpy", - ], -) - tf_proto_library( name = "tensor_tracer_proto", srcs = ["tensor_tracer.proto"], diff --git a/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py b/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py deleted file mode 100644 index a06f48187c9..00000000000 --- a/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py +++ /dev/null @@ -1,239 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for TPU Embeddings mid level API on CPU.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.python.compat import v2_compat -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import sparse_tensor -from tensorflow.python.ops.ragged import ragged_tensor -from tensorflow.python.platform import test -from tensorflow.python.tpu import tpu_embedding_v2 -from tensorflow.python.tpu import tpu_embedding_v2_test_lib -from tensorflow.python.tpu import tpu_embedding_v2_utils -from tensorflow.python.util import nest - - -class CPUEmbeddingTest(tpu_embedding_v2_test_lib.EmbeddingTestBase): - - def setUp(self): - super(CPUEmbeddingTest, self).setUp() - self._create_initial_data() - - def _create_mid_level(self): - optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) - return tpu_embedding_v2.TPUEmbedding( - feature_config=self.feature_config, - batch_size=self.batch_size, - optimizer=optimizer) - - def _get_dense_tensors(self, dtype=dtypes.int32): - feature0 = constant_op.constant(self.feature_watched_values, dtype=dtype) - feature1 = constant_op.constant(self.feature_favorited_values, dtype=dtype) - feature2 = constant_op.constant(self.feature_friends_values, dtype=dtype) - return (feature0, feature1, feature2) - - def test_cpu_dense_lookup(self): - mid_level = self._create_mid_level() - features = self._get_dense_tensors() - results = tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=None, - tables=mid_level.embedding_tables, - feature_config=self.feature_config) - all_lookups = [] - for feature, config in zip(nest.flatten(features), self.feature_config): - table = mid_level.embedding_tables[config.table].numpy() - all_lookups.append(table[feature.numpy()]) - self.assertAllClose(results, nest.pack_sequence_as(results, all_lookups)) - - def test_cpu_dense_lookup_with_weights(self): - mid_level = self._create_mid_level() - features = self._get_dense_tensors() - weights = self._get_dense_tensors(dtype=dtypes.float32) - - with self.assertRaisesRegex( - ValueError, 'Weight specified for .*, but input is dense.'): - tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=weights, - tables=mid_level.embedding_tables, - feature_config=self.feature_config) - - def _get_sparse_tensors(self, dtype=dtypes.int32): - feature0 = sparse_tensor.SparseTensor( - indices=self.feature_watched_indices, - values=constant_op.constant(self.feature_watched_values, dtype=dtype), - dense_shape=[self.data_batch_size, 2]) - feature1 = sparse_tensor.SparseTensor( - indices=self.feature_favorited_indices, - values=constant_op.constant(self.feature_favorited_values, dtype=dtype), - dense_shape=[self.data_batch_size, 2]) - feature2 = sparse_tensor.SparseTensor( - indices=self.feature_friends_indices, - values=constant_op.constant(self.feature_friends_values, dtype=dtype), - dense_shape=[self.data_batch_size, 3]) - return (feature0, feature1, feature2) - - def test_cpu_sparse_lookup(self): - mid_level = self._create_mid_level() - features = self._get_sparse_tensors() - results = tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=None, - tables=mid_level.embedding_tables, - feature_config=self.feature_config) - reduced = [] - for feature, config in zip(nest.flatten(features), self.feature_config): - table = 
mid_level.embedding_tables[config.table].numpy() - all_lookups = table[feature.values.numpy()] - # With row starts we can use reduceat in numpy. Get row starts from the - # ragged tensor API. - ragged = ragged_tensor.RaggedTensor.from_sparse(feature) - row_starts = ragged.row_starts().numpy() - reduced.append(np.add.reduceat(all_lookups, row_starts)) - if config.table.combiner == 'mean': - # for mean, divide by the row lengths. - reduced[-1] /= np.expand_dims(ragged.row_lengths().numpy(), axis=1) - self.assertAllClose(results, nest.pack_sequence_as(results, reduced)) - - def test_cpu_sparse_lookup_with_weights(self): - mid_level = self._create_mid_level() - features = self._get_sparse_tensors() - weights = self._get_sparse_tensors(dtype=dtypes.float32) - results = tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=weights, - tables=mid_level.embedding_tables, - feature_config=self.feature_config) - weighted_sum = [] - for feature, weight, config in zip(nest.flatten(features), - nest.flatten(weights), - self.feature_config): - table = mid_level.embedding_tables[config.table].numpy() - # Expand dims here needed to broadcast this multiplication properly. - weight = np.expand_dims(weight.values.numpy(), axis=1) - all_lookups = table[feature.values.numpy()] * weight - # With row starts we can use reduceat in numpy. Get row starts from the - # ragged tensor API. - row_starts = ragged_tensor.RaggedTensor.from_sparse(feature).row_starts() - row_starts = row_starts.numpy() - weighted_sum.append(np.add.reduceat(all_lookups, row_starts)) - if config.table.combiner == 'mean': - weighted_sum[-1] /= np.add.reduceat(weight, row_starts) - self.assertAllClose(results, nest.pack_sequence_as(results, - weighted_sum)) - - def test_cpu_sparse_lookup_with_non_sparse_weights(self): - mid_level = self._create_mid_level() - features = self._get_sparse_tensors() - weights = self._get_dense_tensors(dtype=dtypes.float32) - with self.assertRaisesRegex( - ValueError, 'but it does not match type of the input which is'): - tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=weights, - tables=mid_level.embedding_tables, - feature_config=self.feature_config) - - def _get_ragged_tensors(self, dtype=dtypes.int32): - feature0 = ragged_tensor.RaggedTensor.from_row_lengths( - values=constant_op.constant(self.feature_watched_values, dtype=dtype), - row_lengths=self.feature_watched_row_lengths) - feature1 = ragged_tensor.RaggedTensor.from_row_lengths( - values=constant_op.constant(self.feature_favorited_values, dtype=dtype), - row_lengths=self.feature_favorited_row_lengths) - feature2 = ragged_tensor.RaggedTensor.from_row_lengths( - values=constant_op.constant(self.feature_friends_values, dtype=dtype), - row_lengths=self.feature_friends_row_lengths) - return (feature0, feature1, feature2) - - def test_cpu_ragged_lookup_with_weights(self): - mid_level = self._create_mid_level() - features = self._get_ragged_tensors() - weights = self._get_ragged_tensors(dtype=dtypes.float32) - results = tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=weights, - tables=mid_level.embedding_tables, - feature_config=self.feature_config) - weighted_sum = [] - for feature, weight, config in zip(nest.flatten(features), - nest.flatten(weights), - self.feature_config): - table = mid_level.embedding_tables[config.table].numpy() - # Expand dims here needed to broadcast this multiplication properly. 
- weight = np.expand_dims(weight.values.numpy(), axis=1) - all_lookups = table[feature.values.numpy()] * weight - row_starts = feature.row_starts().numpy() - weighted_sum.append(np.add.reduceat(all_lookups, row_starts)) - if config.table.combiner == 'mean': - weighted_sum[-1] /= np.add.reduceat(weight, row_starts) - self.assertAllClose(results, nest.pack_sequence_as(results, - weighted_sum)) - - def test_cpu_invalid_structure_for_features(self): - mid_level = self._create_mid_level() - # Remove one element of the tuple, self.feature_config has 3 so we need to - # pass 3. - features = tuple(self._get_sparse_tensors()[:2]) - with self.assertRaises(ValueError): - tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=None, - tables=mid_level.embedding_tables, - feature_config=self.feature_config) - - def test_cpu_invalid_structure_for_weights(self): - mid_level = self._create_mid_level() - features = self._get_sparse_tensors() - # Remove one element of the tuple, self.feature_config has 3 so we need to - # pass 3 (or None). - weights = tuple(self._get_dense_tensors(dtype=dtypes.float32)[:2]) - with self.assertRaises(ValueError): - tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=weights, - tables=mid_level.embedding_tables, - feature_config=self.feature_config) - - def test_cpu_sequence_lookup(self): - feature_config = ( - tpu_embedding_v2_utils.FeatureConfig( - table=self.table_video, name='watched', max_sequence_length=2),) - optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) - mid_level = tpu_embedding_v2.TPUEmbedding( - feature_config=feature_config, - batch_size=self.batch_size, - optimizer=optimizer) - features = tuple(self._get_sparse_tensors()[:1]) - with self.assertRaisesRegex( - ValueError, 'Sequence features unsupported at this time.'): - tpu_embedding_v2.cpu_embedding_lookup( - features, - weights=None, - tables=mid_level.embedding_tables, - feature_config=feature_config) - - -if __name__ == '__main__': - v2_compat.enable_v2_behavior() - test.main() diff --git a/tensorflow/python/tpu/tpu_embedding_v2_test.py b/tensorflow/python/tpu/tpu_embedding_v2_test.py deleted file mode 100644 index 5bfbdcb1c8a..00000000000 --- a/tensorflow/python/tpu/tpu_embedding_v2_test.py +++ /dev/null @@ -1,1397 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for TPU Embeddings mid level API on TPU.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import functools -import itertools -import os - -from absl import flags -from absl.testing import parameterized -import numpy as np - -from tensorflow.python.compat import v2_compat -from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import distribution_strategy_context -from tensorflow.python.distribute import tpu_strategy -from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver -from tensorflow.python.eager import backprop -from tensorflow.python.eager import def_function -from tensorflow.python.eager import remote -from tensorflow.python.framework import constant_op -from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops -from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import tensor_spec -from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_math_ops -from tensorflow.python.ops import init_ops_v2 -from tensorflow.python.ops import math_ops -from tensorflow.python.ops import variables as tf_variables -from tensorflow.python.ops.ragged import ragged_tensor -from tensorflow.python.platform import test -from tensorflow.python.saved_model import load -from tensorflow.python.saved_model import save -from tensorflow.python.tpu import tpu_embedding -from tensorflow.python.tpu import tpu_embedding_v2 -from tensorflow.python.tpu import tpu_embedding_v2_test_lib -from tensorflow.python.tpu import tpu_embedding_v2_utils -from tensorflow.python.tpu import tpu_strategy_util -from tensorflow.python.training import checkpoint_utils -from tensorflow.python.training.tracking import util -from tensorflow.python.util import nest - - -FLAGS = flags.FLAGS -flags.DEFINE_string('tpu', '', 'Name of TPU to connect to.') -flags.DEFINE_string('project', None, 'Name of GCP project with TPU.') -flags.DEFINE_string('zone', None, 'Name of GCP zone with TPU.') -flags.DEFINE_string('model_dir', os.environ.get('TEST_TMPDIR'), - 'A temporary directory.') - - -class TPUEmbeddingCheckpointTest(parameterized.TestCase, test.TestCase): - - def setUp(self): - super(TPUEmbeddingCheckpointTest, self).setUp() - self.resolver = tpu_cluster_resolver.TPUClusterResolver( - tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project) - remote.connect_to_cluster(self.resolver) - tpu_strategy_util.initialize_tpu_system(self.resolver) - self.strategy = tpu_strategy.TPUStrategy(self.resolver) - self.num_rows = self.strategy.num_replicas_in_sync - - # These tests use two mid level API objects, initialized with different - # values. These have the same sizes. 
- with self.strategy.scope(): - self.first_mid_level_contents = np.ones((self.num_rows, 4)) - self.first_mid_level_optimizer = tpu_embedding_v2_utils.SGD( - learning_rate=0.1) - self.first_mid_level = self.build_mid_level( - self.first_mid_level_contents, self.first_mid_level_optimizer) - - self.second_mid_level_contents = np.ones((self.num_rows, 4)) * 2 - self.second_mid_level_optimizer = tpu_embedding_v2_utils.SGD( - learning_rate=0.1) - self.second_mid_level = self.build_mid_level( - self.second_mid_level_contents, self.second_mid_level_optimizer, - initialize_tpu_embedding=False) - - self.cpu_mid_level_optimizer = tpu_embedding_v2_utils.SGD( - learning_rate=0.1) - self.cpu_mid_level = self.build_mid_level( - self.second_mid_level_contents, self.cpu_mid_level_optimizer) - - def tearDown(self): - tpu_strategy_util.shutdown_tpu_system(self.resolver) - super(TPUEmbeddingCheckpointTest, self).tearDown() - - def test_checkpoint_save_retrieves(self): - # Ensure that the variables from the first model are loaded. - self.first_mid_level._load_variables() - - self.assertAllClose( - self.first_mid_level_contents, - self.make_checkpoint_and_get_embedding('before_load', - self.first_mid_level), - msg='Checkpoint should contain values from the first api object.') - - self.second_mid_level._load_variables() - - # When we load the variables from the second mid level API object to the TPU - # we expect that checkpointing the first mid level API object will now - # retrieve the values from the TPU which are now different from the current - # variables in the first mid level. - self.assertAllClose( - self.second_mid_level_contents, - self.make_checkpoint_and_get_embedding('after_load', - self.first_mid_level), - msg='Checkpoint should contain values from the second api object.') - - def test_checkpoint_restore_loads(self): - - def get_values(mid): - return ops.convert_to_tensor( - mid._variables['table']['parameters'].variables[0]) - - self.first_mid_level._load_variables() - - first_checkpoint = util.Checkpoint(model=self.first_mid_level) - first_checkpoint.save(_get_tmpdir('restore', 'save')) - - # Checkpoint now has values from first_mid_level. See first assert in - # test_checkpoint_save_retrieves. - - self.second_mid_level._load_variables() - - self.assertAllClose( - self.second_mid_level_contents, - get_values(self.second_mid_level), - msg='Second mid level api should contain its initial values.', - ) - - # We restore the checkpoint of our first model into our second model. - # This should load the first mid level API object onto the TPU. - second_checkpoint = util.Checkpoint(model=self.second_mid_level) - second_checkpoint.restore(_get_tmpdir('restore', 'save-1')) - - # Call retrieve here as a way to check what the TPU contains contains. - # Calling the retrieve ops directly might make for a cleaner separation of - # test and module, though. - self.second_mid_level._retrieve_variables() - - self.assertAllClose( - self.first_mid_level_contents, - get_values(self.second_mid_level), - msg='Second mid level api should have retrieved the first model values.' 
- ) - - def build_mid_level(self, embedding_values, optimizer, - initialize_tpu_embedding=True): - """Creates an embedding api object initialized to embedding_values.""" - initializer = init_ops_v2.Constant(embedding_values) - - table = tpu_embedding_v2_utils.TableConfig( - vocabulary_size=self.num_rows, dim=4, initializer=initializer, - combiner='sum', name='table') - feature_config = (tpu_embedding_v2_utils.FeatureConfig( - table=table, name='feature'),) - - # batch_size here does not matter as we aren't traininig in any of these - # tests. - return tpu_embedding_v2.TPUEmbedding( - feature_config, 64, optimizer, - initialize_tpu_embedding=initialize_tpu_embedding) - - def make_checkpoint_and_get_embedding(self, name, model): - """Saves model to checkpoint name, retrieves embedding variables.""" - checkpoint = util.Checkpoint(model=model) - checkpoint.save(_get_tmpdir(name, 'save')) - - # Get the name of the parameters variable which should be the only - # [self.num_rows, 4] shaped tensor in the checkpoint. Note that we do this - # as the key can change. - variables = checkpoint_utils.list_variables(_get_tmpdir(name)) - variables = [name for name, size in variables if size == [self.num_rows, 4]] - if len(variables) != 1: - raise RuntimeError('Found {} copies of the parameter variable in the ' - 'checkpoint. Exactly one copy exported.'.format( - len(variables))) - return checkpoint_utils.load_variable(_get_tmpdir(name), variables[0]) - - def test_model_export_cpu(self): - self.first_mid_level._load_variables() - - tpu_checkpoint = util.Checkpoint(model=self.first_mid_level) - tpu_checkpoint.save(_get_tmpdir('export_cpu', 'save')) - - # We restore the checkpoint of our tpu mid level onto our cpu mid level. - cpu_checkpoint = util.Checkpoint(model=self.cpu_mid_level) - cpu_checkpoint.restore(_get_tmpdir('export_cpu', 'save-1')) - - @def_function.function - def serve_tensors(features): - features = tpu_embedding_v2.cpu_embedding_lookup( - features, None, self.cpu_mid_level.embedding_tables, - self.cpu_mid_level._feature_config) - return features[0] - - signatures = { - 'serving_default': - serve_tensors.get_concrete_function( - (tensor_spec.TensorSpec( - shape=(2,), dtype=dtypes.int32, name='feature'),))} - save.save(self.cpu_mid_level, - export_dir=_get_tmpdir('export_cpu', 'exported_model'), - signatures=signatures) - - imported = load.load(_get_tmpdir('export_cpu', 'exported_model')) - predict_fn = imported.signatures['serving_default'] - - input_feature_value = np.array([1, 0]) - input_batch = (constant_op.constant(input_feature_value, - dtype=dtypes.int32),) - prediction = predict_fn(*input_batch)['output_0'] - self.assertAllClose(prediction.numpy(), - self.first_mid_level_contents[input_feature_value]) - - @parameterized.parameters(tpu_embedding_v2_utils.SGD, - tpu_embedding_v2_utils.Adagrad, - tpu_embedding_v2_utils.Adam) - def test_check_checkpoint_variable_names_are_same_on_cpu_and_tpu(self, - optimizer): - # Reinitialize the TPU so that we can re-initialize the embeddings with the - # given optimizer. 
- tpu_strategy_util.initialize_tpu_system(self.resolver) - optimizer = optimizer(learning_rate=0.1) - - with self.strategy.scope(): - tpu_mid_level = self.build_mid_level( - self.first_mid_level_contents, optimizer) - - tpu_checkpoint = util.Checkpoint(model=tpu_mid_level) - tpu_checkpoint.save(_get_tmpdir('save-tpu', 'save')) - tpu_variables = checkpoint_utils.list_variables(_get_tmpdir('save-tpu')) - - cpu_mid_level = self.build_mid_level( - self.first_mid_level_contents, optimizer) - - cpu_checkpoint = util.Checkpoint(model=cpu_mid_level) - cpu_checkpoint.save(_get_tmpdir('save-cpu', 'save')) - cpu_variables = checkpoint_utils.list_variables(_get_tmpdir('save-cpu')) - - self.assertAllEqual(tpu_variables, cpu_variables) - - -class TPUEmbeddingTest(parameterized.TestCase, - tpu_embedding_v2_test_lib.EmbeddingTestBase): - - def setUp(self): - super(TPUEmbeddingTest, self).setUp() - self._create_initial_data() - self.resolver = None - - def tearDown(self): - if self.resolver: - tpu_strategy_util.shutdown_tpu_system(self.resolver) - super(TPUEmbeddingTest, self).tearDown() - - def test_tables_with_same_name(self): - with self.assertRaisesRegex( - ValueError, 'Multiple tables with name table found.'): - with self._get_strategy().scope(): - tpu_embedding_v2.TPUEmbedding( - (tpu_embedding_v2_utils.FeatureConfig( - table=tpu_embedding_v2_utils.TableConfig( - name='table', - vocabulary_size=4, - dim=2, - initializer=self.initializer,), - name='watched'), - tpu_embedding_v2_utils.FeatureConfig( - table=tpu_embedding_v2_utils.TableConfig( - name='table', - vocabulary_size=4, - dim=2, - initializer=self.initializer), - name='favorited')), - self.batch_size, - tpu_embedding_v2_utils.SGD(learning_rate=0.1)) - - def test_unsupported_optimizer(self): - with self.assertRaisesRegex( - ValueError, 'is an unsupported optimizer class.'): - with self._get_strategy().scope(): - tpu_embedding_v2.TPUEmbedding( - self.feature_config, self.batch_size, - tpu_embedding.AdagradParameters(learning_rate=0.1)) - - def test_pass_non_tensor_to_apply_gradients(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - @def_function.function - def test_apply(): - mid_level_api.apply_gradients((1, 2, 3)) - - with self.assertRaisesRegex(ValueError, 'Expected Tensor.'): - strategy.run(test_apply) - - def test_pass_different_structure_to_apply_gradients(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - @def_function.function - def test_apply(): - # This should be a tuple as feature_config is a tuple of 3 configs. 
- mid_level_api.apply_gradients([1, 2, 3]) - - with self.assertRaisesRegex( - TypeError, - 'The two structures don\'t have the same nested structure.'): - strategy.run(test_apply) - - def test_pass_none_to_apply_gradients(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - dataset = self._create_sparse_dataset(strategy) - data = next(iter(strategy.experimental_distribute_dataset(dataset))) - - @def_function.function - def embedding_and_set_gradients(data): - mid_level_api.enqueue(data) - def tpu_fn(): - results = mid_level_api.dequeue() - mid_level_api.apply_gradients((None, None, - array_ops.ones_like(results[2]))) - return results - return strategy.run(tpu_fn) - - @def_function.function - def embedding_only(data): - mid_level_api.enqueue(data, training=False) - def tpu_fn(): - return mid_level_api.dequeue() - return strategy.run(tpu_fn) - - first = self._get_replica_numpy( - embedding_and_set_gradients(data), strategy, 0) - second = self._get_replica_numpy(embedding_only(data), strategy, 0) - - # First two features should be the same as None gradient was applied. - # Third feature had gradient of 1 passed in from each core. - # Each core received the same ids per core and returned the following batch: - # [ row 3, row 0 + row 1 + row 2 ] - # so gradient update was (learning rate = 0.1): - # row 0: -1/3*0.1 - # row 1: -1/3*0.1 - # row 2: -1/3*0.1 - # row 3: -1*0.1 - # There is a factor of num_replicas because each replica gave an update. - - num_replicas = strategy.num_replicas_in_sync - update = ([[0.0]], [[0.0]], - [[0.1 * num_replicas], [0.1 / 3 * num_replicas]]) - golden = tuple([feature-np.array(up) for feature, up in zip(first, update)]) - - self.assertAllClose(golden, second) - - def _get_strategy(self): - self.resolver = tpu_cluster_resolver.TPUClusterResolver( - tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project) - remote.connect_to_cluster(self.resolver) - tpu_strategy_util.initialize_tpu_system(self.resolver) - return tpu_strategy.TPUStrategy(self.resolver) - - def test_dequeue_on_cpu(self): - mid_level_api = self._create_mid_level() - with self.assertRaises(RuntimeError): - mid_level_api.dequeue() - - def test_enqueue_on_cpu(self): - mid_level_api = self._create_mid_level() - features = { - 'watched': sparse_tensor.SparseTensor( - indices=self.feature_watched_indices, - values=self.feature_watched_values, - dense_shape=[2, 2])} - with self.assertRaises(RuntimeError): - mid_level_api.enqueue(features) - - def test_apply_gradients_on_cpu(self): - mid_level_api = self._create_mid_level() - with self.assertRaises(RuntimeError): - mid_level_api.enqueue(None) - - def test_get_embedding_tables_on_cpu(self): - mid_level_api = self._create_mid_level() - self.assertEqual( - set(mid_level_api.embedding_tables.keys()), - set([self.table_video, self.table_user])) - - def test_get_embedding_tables_on_tpu(self): - with self._get_strategy().scope(): - mid_level_api = self._create_mid_level() - with self.assertRaises(RuntimeError): - mid_level_api.embedding_tables() - - def test_enqueue_weight_for_dense_tensor(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - input_fn = self._create_dense_input_fn(strategy, include_weights=True) - dist = strategy.experimental_distribute_datasets_from_function(input_fn) - dist_iter = iter(dist) - - @def_function.function - def test_fn(): - def step(): - return mid_level_api.dequeue() - - features, weights = next(dist_iter) - mid_level_api.enqueue(features, weights=weights, training=False) - 
return strategy.run(step) - - with self.assertRaisesRegex(ValueError, 'Weight specified for dense input'): - test_fn() - - def test_enqueue_wrong_weight_type_for_sparse_tensor(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - sparse = self._create_sparse_dataset(strategy) - ragged = self._create_ragged_dataset(strategy, include_weights=True) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) - - @def_function.function - def test_fn(): - def step(): - return mid_level_api.dequeue() - - features = next(sparse_iter) - _, weights = next(ragged_iter) - mid_level_api.enqueue(features, weights=weights, training=False) - return strategy.run(step) - - with self.assertRaisesRegex( - ValueError, 'which does not match type input which is SparseTensor.'): - test_fn() - - def test_enqueue_wrong_weight_type_for_ragged_tensor(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - sparse = self._create_sparse_dataset(strategy, include_weights=True) - ragged = self._create_ragged_dataset(strategy) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) - - @def_function.function - def test_fn(): - def step(): - return mid_level_api.dequeue() - - _, weights = next(sparse_iter) - features = next(ragged_iter) - mid_level_api.enqueue(features, weights=weights, training=False) - return strategy.run(step) - - with self.assertRaisesRegex( - ValueError, 'which does not match type input which is RaggedTensor.'): - test_fn() - - def test_enqueue_sparse_and_ragged(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - sparse = self._create_sparse_dataset(strategy) - ragged = self._create_ragged_dataset(strategy) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) - - @def_function.function - def test_fn(): - def step(): - return mid_level_api.dequeue() - - sparse_features = next(sparse_iter) - ragged_features = next(ragged_iter) - features = (sparse_features[0], ragged_features[1], sparse_features[2]) - mid_level_api.enqueue(features, training=False) - return strategy.run(step) - - with self.assertRaisesRegex( - ValueError, 'Found both SparseTensors and RaggedTensors'): - test_fn() - - def test_enqueue_incorrect_structure_for_features(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - sparse = self._create_sparse_dataset(strategy) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - - @def_function.function - def test_fn(): - def step(): - return mid_level_api.dequeue() - - features = next(sparse_iter) - features = (features[0],) - mid_level_api.enqueue(features, training=False) - return strategy.run(step) - - # The error here is raised from nest.assert_same_structure - with self.assertRaises(ValueError): - test_fn() - - def test_enqueue_incorrect_structure_for_weights(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - sparse = self._create_sparse_dataset(strategy, include_weights=True) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - - @def_function.function - def test_fn(): - def step(): - return mid_level_api.dequeue() - - features, weights = next(sparse_iter) - weights = (weights[0],) - mid_level_api.enqueue(features, weights=weights, 
training=False) - return strategy.run(step) - - # The error here is raised from nest.assert_same_structure - with self.assertRaises(ValueError): - test_fn() - - def test_enqueue_ragged_tensor(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - sparse = self._create_sparse_dataset(strategy) - ragged = self._create_ragged_dataset(strategy) - sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) - ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) - - @def_function.function - def test_fn(): - def get_activations(): - return mid_level_api.dequeue() - - sparse_features = next(sparse_iter) - ragged_features = next(ragged_iter) - mid_level_api.enqueue(sparse_features, training=False) - sparse_activations = strategy.run(get_activations) - mid_level_api.enqueue(ragged_features, training=False) - ragged_activations = strategy.run(get_activations) - return sparse_activations, ragged_activations - - sparse_activations, ragged_activations = test_fn() - - # Extact per core numpy arrays and check that both sparse and ragged have - # the same results. - sparse0 = self._get_replica_numpy(sparse_activations, strategy, 0) - ragged0 = self._get_replica_numpy(ragged_activations, strategy, 0) - self.assertAllClose(sparse0, ragged0) - - @parameterized.parameters(True, False) - def test_enqueue_with_weights(self, ragged): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - weight = 0.5 - if ragged: - dataset = self._create_ragged_dataset(strategy, include_weights=True, - weight=weight) - else: - dataset = self._create_sparse_dataset(strategy, include_weights=True, - weight=weight) - - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) - - @def_function.function - def enqueue_and_get(features, weights): - def get_activations(): - return mid_level_api.dequeue() - mid_level_api.enqueue(features, weights=weights, training=False) - return strategy.run(get_activations) - - features, weights = next(dataset_iter) - # Replace the weight for the second feature by None to test. - weights = (weights[0], None, weights[2]) - - no_weights_activations = enqueue_and_get(features, weights=None) - weights_activations = enqueue_and_get(features, weights=weights) - - # Extact per core numpy arrays. - no_weights0 = self._get_replica_numpy(no_weights_activations, strategy, 0) - weights0 = self._get_replica_numpy(weights_activations, strategy, 0) - # videos table has sum combiner and users table has mean combiner. - # i.e. users table lookups isn't affected by the weights as all the weights - # are the same. - # Tuple entry 0 and 1 are the watched and favorited features from the videos - # table and entry 2 is the friends feature from the users table. - # Note that None was passed as a weight for entry 1 so weight should have no - # effect. 
- weight = (0.5, 1.0, 1.0) - golden = tuple([no_weight * w for no_weight, w in zip(no_weights0, weight)]) - - self.assertAllClose(golden, weights0) - - def test_enqueue_with_outside_compilation(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - dataset = self._create_sparse_dataset(strategy) - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) - - @def_function.function - def enqueue_with_outside_compilation(data): - def get_activations(features): - mid_level_api.enqueue(features, training=False) - return mid_level_api.dequeue() - return strategy.run(get_activations, args=(data,)) - - @def_function.function - def enqueue_without_outside_compilation(data): - def get_activations(): - return mid_level_api.dequeue() - mid_level_api.enqueue(data, training=False) - return strategy.run(get_activations) - - features = next(dataset_iter) - - activations_oc = enqueue_with_outside_compilation(features) - activations = enqueue_without_outside_compilation(features) - - # Extact per core numpy arrays. - activations_oc0 = self._get_replica_numpy(activations_oc, strategy, 0) - activations0 = self._get_replica_numpy(activations, strategy, 0) - - self.assertAllClose(activations_oc0, activations0) - - def test_enqueue_with_outside_compilation_in_control_flow(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - dataset = self._create_sparse_dataset(strategy) - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) - - # This is one way to force the enqueue in some control flow. @tf.functions - # aren't inlined in the calling tf.function. An alternative would be to - # place the enqueue in a switch_v2 or something similar. - @def_function.function - def enqueue_fn(features): - mid_level_api.enqueue(features, training=False) - - @def_function.function - def enqueue_with_outside_compilation(): - def get_activations(features): - enqueue_fn(features) - return mid_level_api.dequeue() - return strategy.run(get_activations, args=(next(dataset_iter),)) - - with self.assertRaisesRegex( - RuntimeError, - 'does not match graph which contains TPUReplicateContext'): - enqueue_with_outside_compilation() - - def test_enqueue_with_outside_compilation_non_direct_input(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - dataset = self._create_sparse_dataset(strategy) - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) - - @def_function.function - def enqueue_with_outside_compilation(): - def get_activations(features): - # This inserts a mul operation on the TPU to trigger the direct input - # error. - features = (features[0]*2, features[1]*2, features[2]*2) - mid_level_api.enqueue(features, training=False) - return mid_level_api.dequeue() - return strategy.run(get_activations, args=(next(dataset_iter),)) - - with self.assertRaisesRegex( - ValueError, 'which does not have the `_tpu_input_identity` attr'): - enqueue_with_outside_compilation() - - def test_enqueue_with_outside_compilation_auto_mode(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - dataset = self._create_sparse_dataset(strategy) - dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) - - @def_function.function - def enqueue_with_no_gradient_apply(data): - def get_activations(features): - # Note the lack of setting training=False, so training defaults to true - # here even though we don't have apply gradients. 
- # We detect the correct mode based on which ops exist that share the - # same 'name'. - mid_level_api.enqueue(features, name='call1') - return mid_level_api.dequeue(name='call1') - return strategy.run(get_activations, args=(data,)) - - @def_function.function - def enqueue_with_gradient_apply(data): - def get_activations(features): - mid_level_api.enqueue(features, name='call2') - activations = mid_level_api.dequeue(name='call2') - # Apply an all ones gradient - gradients = nest.map_structure(array_ops.ones_like, activations) - mid_level_api.apply_gradients(gradients, name='call2') - return activations - return strategy.run(get_activations, args=(data,)) - - data = next(dataset_iter) - before_gradient_apply = enqueue_with_gradient_apply(data) - after_gradient_apply = enqueue_with_no_gradient_apply(data) - before_gradient_apply0 = self._get_replica_numpy(before_gradient_apply, - strategy, 0) - after_gradient_apply0 = self._get_replica_numpy(after_gradient_apply, - strategy, 0) - - num_replicas = strategy.num_replicas_in_sync - # We are passing a gradient of 1 for all lookups, optimizer is SGD with a - # learning rate of 0.1. Feature 0 and 1 are looked up with a sum combiner - # with the following ids: - # Feature 0: [0, 0, 1], [0, 1, 1], ... repeated over num_replicas - # Feature 1: [0, 1, 1], [0, 0, 1], ... repeated over num_replicas - # i.e. Row 0 and 1 were looked up 3*num_replicas times over all cores and as - # the gradient is 1, the accumulated gradient is 3*num_replicas for each - # position in row 0 and 1 in table. - # - # See comments in test_pass_none_to_apply_gradients for the update to - # Feature 2 and its table. - # The *2 in the next tests are because those rows have 2 lookups vs - # the 1 lookup in the other row. - update = ([[0.3 * num_replicas], [0.3 * num_replicas * 2]], - [[0.3 * num_replicas * 2], [0.3 * num_replicas]], - [[0.1 * num_replicas], [0.1 / 3 * num_replicas]]) - golden = tuple([before - np.array(up) for before, up in - zip(before_gradient_apply0, update)]) - - self.assertAllClose(golden, after_gradient_apply0) - - def _create_strategy_and_mid_level(self, optimizer_name): - strategy = self._get_strategy() - - with strategy.scope(): - if optimizer_name == 'sgd': - optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) - elif optimizer_name == 'adagrad': - optimizer = tpu_embedding_v2_utils.Adagrad(learning_rate=0.1) - elif optimizer_name == 'adam': - optimizer = tpu_embedding_v2_utils.Adam(learning_rate=0.1) - else: - raise ValueError('optimizer is not recognized: ', optimizer_name) - mid_level_api = self._create_mid_level(optimizer=optimizer) - - return strategy, mid_level_api, optimizer - - @parameterized.parameters( - *itertools.product( - ['sgd', 'adagrad', 'adam'], - [True, False])) - def test_embedding(self, optimizer_name, training): - strategy, mid_level_api, optimizer = ( - self._create_strategy_and_mid_level(optimizer_name)) - - dataset = self._create_sparse_dataset(strategy) - dist = strategy.experimental_distribute_dataset(dataset) - dist_iter = iter(dist) - - @def_function.function - def test_fn(): - - def step(): - """Create and run computation that returns the embedding activations.""" - if not training: - activations = mid_level_api.dequeue() - total_loss = _get_total_loss_tensor(activations) - ret_val = [total_loss] + list(activations) - return ret_val - else: - with backprop.GradientTape() as tape: - activations = mid_level_api.dequeue() - tape.watch(activations) - total_loss = _get_total_loss_tensor(activations) - loss_per_replica = 
total_loss / strategy.num_replicas_in_sync - gradients = tape.gradient(loss_per_replica, activations) - mid_level_api.apply_gradients(gradients) - ret_val = [total_loss] + list(activations) - return ret_val - - mid_level_api.enqueue(next(dist_iter), training=training) - result = strategy.run(step) - return result - - # Run model. - shard_out_val = test_fn() - - # Retrieve TPU weights to CPU. - mid_level_api._retrieve_variables() - - # Compute sparse tensors for global batch. - input_data = next(iter(self._create_sparse_dataset(strategy))) - - # Check results. - self._check_results(strategy, shard_out_val, training, input_data, - mid_level_api._variables, - optimizer) - - def _create_mid_level(self, optimizer=None): - # Create `TPUEmbedding` object. - if optimizer is None: - optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) - - num_replicas = ( - distribution_strategy_context.get_strategy().num_replicas_in_sync) - return tpu_embedding_v2.TPUEmbedding( - feature_config=self.feature_config, - batch_size=self.batch_size * num_replicas, - optimizer=optimizer) - - def _create_sparse_dataset(self, strategy, include_weights=False, weight=0.5): - # Create dataset for enqueue operation - sparse_features = ( - sparse_tensor.SparseTensor( - indices=self.feature_watched_indices, - values=self.feature_watched_values, - dense_shape=[self.data_batch_size, 2]), - sparse_tensor.SparseTensor( - indices=self.feature_favorited_indices, - values=self.feature_favorited_values, - dense_shape=[self.data_batch_size, 2]), - sparse_tensor.SparseTensor( - indices=self.feature_friends_indices, - values=self.feature_friends_values, - dense_shape=[self.data_batch_size, 3])) - if include_weights: - weights = [] - for sparse in sparse_features: - values = ( - array_ops.ones_like(sparse.values, dtype=dtypes.float32) * weight) - weights.append(sparse_tensor.SparseTensor( - indices=sparse.indices, - values=values, - dense_shape=sparse.dense_shape)) - sparse_features = (sparse_features, tuple(weights)) - - dataset = dataset_ops.DatasetV2.from_tensors(sparse_features) - - # Data is batched to self.data_batch_size, rebatch to global batch size. - return dataset.unbatch().repeat().batch( - self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True) - - def _create_ragged_dataset(self, strategy, include_weights=False, weight=0.5): - # Create dataset for enqueue operation - ragged_features = ( - ragged_tensor.RaggedTensor.from_row_lengths( - row_lengths=self.feature_watched_row_lengths, - values=self.feature_watched_values), - ragged_tensor.RaggedTensor.from_row_lengths( - row_lengths=self.feature_favorited_row_lengths, - values=self.feature_favorited_values), - ragged_tensor.RaggedTensor.from_row_lengths( - row_lengths=self.feature_friends_row_lengths, - values=self.feature_friends_values)) - if include_weights: - weights = [] - for ragged in ragged_features: - weights.append(ragged.with_values( - array_ops.ones_like(ragged.values, dtype=dtypes.float32) * weight)) - ragged_features = (ragged_features, tuple(weights)) - - dataset = dataset_ops.DatasetV2.from_tensors(ragged_features) - - # Data is batched to self.data_batch_size, rebatch to global batch size. 
- return dataset.unbatch().repeat().batch( - self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True) - - def _create_dense_input_fn(self, strategy, include_weights=False, weight=0.5): - - def input_fn(ctx): - del ctx - features = ( - constant_op.constant(self.feature_watched_values[-2:], - dtype=dtypes.int32), - constant_op.constant(self.feature_favorited_values[-2:], - dtype=dtypes.int32), - constant_op.constant(self.feature_friends_values[-2:], - dtype=dtypes.int32)) - if include_weights: - weights = [array_ops.ones_like(t, dtype=dtypes.float32) * weight - for t in features] - features = (features, tuple(weights)) - return dataset_ops.DatasetV2.from_tensors(features).repeat() - - return input_fn - - def _check_results(self, strategy, shard_out_val, training, input_data, - table_to_variable, optimizer): - num_replicas = strategy.num_replicas_in_sync - - # Unpack the values `strategy.run()` returns. - loss = _unpack(strategy, shard_out_val[0]) - activation_watched = _unpack(strategy, shard_out_val[1]) - activation_favorited = _unpack(strategy, shard_out_val[2]) - activation_friends = _unpack(strategy, shard_out_val[3]) - - # Core 0: - # Calculate the values of embedding activations. - activation_watched_gold0 = np.array([[0, 1, 2, 3], [4, 6, 8, 10]]) - activation_favorited_gold0 = np.array([[4, 6, 8, 10], [4, 5, 6, 7]]) - # Second row of `activation_friends_gold0` is the mean of the following. - # row 0: 0 1 - # row 1: 2 3 - # row 2: 4 5 - activation_friends_gold0 = np.array([[6, 7], [2, 3]]) - - loss_gold0 = _compute_loss(activation_watched_gold0, - activation_favorited_gold0, - activation_friends_gold0) - - # Add on values from other cores: - # Activations for watched are an alternating sequence of - # activation_watched_gold0 and activation_favorited_gold0. - # For favorited it is the same but in the opposite order. - activation_watched_gold = np.concatenate( - (np.concatenate((np.expand_dims(activation_watched_gold0, axis=0),) * - (num_replicas // 2)), - np.concatenate((np.expand_dims(activation_favorited_gold0, axis=0),) * - (num_replicas // 2))), - axis=1).reshape([self.batch_size * num_replicas, 4]) - activation_favorited_gold = np.concatenate( - (activation_watched_gold[self.batch_size:,], - activation_watched_gold[0:self.batch_size,])) - activation_friends_gold = np.concatenate( - (activation_friends_gold0,) * num_replicas) - - loss_gold = [loss_gold0] * num_replicas - - # Test values. - self.assertAllClose(activation_watched_gold, activation_watched) - self.assertAllClose(activation_favorited_gold, activation_favorited) - self.assertAllClose(activation_friends_gold, activation_friends) - - self.assertAllClose(loss_gold, loss) - - embedding_table_video_before = np.copy( - np.reshape(self.embedding_values, [8, 4])) - embedding_table_user_before = np.copy( - np.reshape(self.embedding_values, [16, 2])) - - global_batch_size = self.batch_size * num_replicas - if training: - gradient_wrt_watched_gold = (2 * activation_watched_gold / - global_batch_size) - gradient_wrt_favorited_gold = (2 * activation_favorited_gold / - global_batch_size) - gradient_wrt_friends_gold = (2 * activation_friends_gold / - global_batch_size) - - # Calculate gradients wrt embedding tables. 
- gradients_wrt_user = ( - _compute_gradients_wrt_embedding_table( - global_batch_size, gradient_wrt_friends_gold, - embedding_table_user_before, input_data[2].indices.numpy(), - input_data[2].values.numpy(), self.table_user.combiner)) - gradients_wrt_video = ( - _compute_gradients_wrt_embedding_table( - global_batch_size, gradient_wrt_favorited_gold, - embedding_table_video_before, input_data[1].indices.numpy(), - input_data[1].values.numpy(), self.table_video.combiner) + - _compute_gradients_wrt_embedding_table( - global_batch_size, gradient_wrt_watched_gold, - embedding_table_video_before, input_data[0].indices.numpy(), - input_data[0].values.numpy(), self.table_video.combiner)) - - self._check_embedding_and_slot_variables(embedding_table_user_before, - gradients_wrt_user, - embedding_table_video_before, - gradients_wrt_video, - optimizer, - table_to_variable) - - def _check_embedding_and_slot_variables(self, embedding_table_user_before, - gradients_wrt_user, - embedding_table_video_before, - gradients_wrt_video, - optimizer, - table_to_variable): - if isinstance(optimizer, tpu_embedding_v2_utils.SGD): - check_fn = self._check_embedding_and_slot_variables_for_sgd - elif isinstance(optimizer, tpu_embedding_v2_utils.Adagrad): - check_fn = self._check_embedding_and_slot_variables_for_adagrad - elif isinstance(optimizer, tpu_embedding_v2_utils.Adam): - check_fn = self._check_embedding_and_slot_variables_for_adam - else: - raise ValueError('optimizer is not recognized: ', type(optimizer)) - check_fn(embedding_table_user_before, gradients_wrt_user, - optimizer, table_to_variable[self.table_user.name]) - check_fn(embedding_table_video_before, gradients_wrt_video, - optimizer, table_to_variable[self.table_video.name]) - - def _check_embedding_and_slot_variables_for_sgd(self, embedding_table_before, - gradients, - optimizer, - variables): - embedding_table = np.copy(embedding_table_before) - embedding_table -= optimizer.learning_rate * np.sum(gradients, axis=0) - self.assertAllClose(_get_variable(variables['parameters']).numpy(), - embedding_table) - - def _check_embedding_and_slot_variables_for_adagrad(self, - embedding_table_before, - gradients, - optimizer, - variable): - embedding_table = np.copy(embedding_table_before) - accumulator = ( - optimizer.initial_accumulator_value + np.sum(gradients, axis=0)**2) - embedding_table -= ( - optimizer.learning_rate * np.sum(gradients, axis=0) / - np.sqrt(accumulator)) - self.assertAllClose(_get_variable(variable['parameters']).numpy(), - embedding_table) - self.assertAllClose(_get_variable(variable['accumulators']).numpy(), - accumulator) - - def _check_embedding_and_slot_variables_for_adam(self, embedding_table_before, - gradients, - optimizer, - variable): - embedding_table = np.copy(embedding_table_before) - g = np.sum(gradients, axis=0) - v = g**2 * (1 - optimizer.beta_2) - m = g * (1 - optimizer.beta_1) - epsilon = optimizer.epsilon - # TPU Embeddings don't have the LR decay factor for Adam. 
- lr_modifier = 1 - embedding_table -= ( - m * optimizer.learning_rate * lr_modifier / (np.sqrt(v) + epsilon)) - self.assertAllClose(_get_variable(variable['parameters']).numpy(), - embedding_table, rtol=1e-4) - self.assertAllClose(_get_variable(variable['momenta']).numpy(), - m, rtol=1e-4) - self.assertAllClose(_get_variable(variable['velocities']).numpy(), - v, rtol=1e-4) - - def _get_replica_numpy(self, structured, strategy, replica_id): - def select_replica(x): - x = strategy.experimental_local_results(x) - if len(x) == 1: - return x.numpy() - return x[replica_id].numpy() - return nest.map_structure(select_replica, structured) - - def test_dense_lookup(self): - strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') - - input_fn = self._create_dense_input_fn(strategy) - dist = strategy.experimental_distribute_datasets_from_function(input_fn) - dist_iter = iter(dist) - - @def_function.function - def test_fn(): - def step(): - return mid_level_api.dequeue() - - mid_level_api.enqueue(next(dist_iter), training=False) - return strategy.run(step) - - # Run model. - shard0 = self._get_replica_numpy(test_fn(), strategy, 0) - - # embedding_values is a linear list, so we reshape to match the correct - # shape of the corresponding table before performing the lookup. - numpy_videos = np.reshape(self.embedding_values, (8, 4)) - numpy_users = np.reshape(self.embedding_values, (16, 2)) - golden = ((numpy_videos[self.feature_watched_values[-2:]], - numpy_videos[self.feature_favorited_values[-2:]], - numpy_users[self.feature_friends_values[-2:]])) - self.assertAllClose(shard0, golden) - - def test_variable_learning_rate(self): - num_steps = 10 - num_steps_float = float(num_steps) - starting_lr = 1.0 - ending_lr = 0.5 - - strategy = self._get_strategy() - num_replicas = strategy.num_replicas_in_sync - - # Create model with Keras. - with strategy.scope(): - step_counter = tf_variables.Variable(0.0, dtypes.float32) - - def lr_function(): - return gen_math_ops.maximum( - ending_lr, - starting_lr + ((ending_lr - starting_lr) * step_counter) / - num_steps_float) - - optimizer = tpu_embedding_v2_utils.SGD(learning_rate=lr_function) - table_config = tpu_embedding_v2_utils.TableConfig( - vocabulary_size=num_replicas, - dim=4, - initializer=init_ops_v2.Constant(np.zeros((num_replicas, 4))), - combiner='sum', name='table') - mid_level_api = tpu_embedding_v2.TPUEmbedding( - feature_config={ - 'feature': tpu_embedding_v2_utils.FeatureConfig( - table=table_config, name='feature')}, - batch_size=num_replicas, - optimizer=optimizer) - - feature = {'feature': constant_op.constant([0], dtype=dtypes.int32)} - - def input_fn(ctx): - del ctx - return dataset_ops.DatasetV2.from_tensors(feature).repeat() - dist = strategy.experimental_distribute_datasets_from_function(input_fn) - dist_iter = iter(dist) - - @def_function.function - def test_fn(): - def step(): - with backprop.GradientTape() as tape: - activations = mid_level_api.dequeue() - tape.watch(activations) - result = math_ops.reduce_sum(activations['feature']) - loss = result / num_replicas - grads = tape.gradient(loss, activations) - mid_level_api.apply_gradients(grads) - return activations['feature'] - - mid_level_api.enqueue(next(dist_iter), training=True) - return strategy.run(step) - - # Run model. - results = [] - for _ in range(num_steps): - result = test_fn() - results.append(_unpack(strategy, result)) - step_counter.assign_add(1.0) - - # Table is 2 elements wide, per-replica batch size of 1, with id 0. 
- # Loss for the gradient is the sum of the entries divided by the number of - # replicas. Thus the per replica gradient is 1/#of replicas for row 0 and no - # other updates. The reduced gradient is therefore 1. - # Learning rate schedule over num_steps steps: - # 1.0 0.95 0.9 0.85 0.8 ... - # Since use SGD and the gradient is one, the first row of the table is - # [0, 0] [-1.0, -1.0] [-1.95, -1.95] [-2.85, -2.85] ... (the negative - # partial sums of the above). - - learning_rates = [starting_lr - (starting_lr - ending_lr) / num_steps * j - for j in range(num_steps)] - cumsum = [sum(learning_rates[0:j]) for j in range(num_steps)] - goldens = [[[-cumsum[i]] * table_config.dim] * num_replicas - for i in range(10)] - self.assertAllClose(results, goldens) - - @parameterized.parameters([True, False]) - def test_optimizer_with_slot_creation_fn(self, use_tpu): - def slot_creation_fn(table, slot_names): - slots = {} - for slot in slot_names: - slots[slot] = tf_variables.Variable( - name='{}_{}'.format(table.name, slot), - initial_value=functools.partial( - init_ops_v2.Zeros(), shape=table.shape, dtype=dtypes.float32), - trainable=False) - return slots - optimizer = tpu_embedding_v2_utils.Adagrad( - learning_rate=0.1, - slot_variable_creation_fn=slot_creation_fn) - if use_tpu: - strategy = self._get_strategy() - else: - strategy = distribution_strategy_context.get_strategy() - num_replicas = strategy.num_replicas_in_sync - with strategy.scope(): - mid_level = tpu_embedding_v2.TPUEmbedding( - feature_config=self.feature_config, - batch_size=self.batch_size * num_replicas, - optimizer=optimizer) - video_accumulator = mid_level._variables['video']['accumulators'] - user_accumulator = mid_level._variables['user']['accumulators'] - if use_tpu: - # To check the table contents (ensure that it is zero rather than the - # normal initial accumulator value specified to in the optimizer config), - # we need to select the underlying table variable on TPU. - # We only have one shard on Forge. - video_accumulator = video_accumulator.variables[0] - user_accumulator = user_accumulator.variables[0] - - self.assertAllClose(video_accumulator.numpy(), - np.zeros((self.table_video.vocabulary_size, - self.table_video.dim))) - self.assertAllClose(user_accumulator.numpy(), - np.zeros((self.table_user.vocabulary_size, - self.table_user.dim))) - - def test_optimizer_with_slot_creation_fn_non_partial(self): - def slot_creation_fn(table, slot_names): - slots = {} - for slot in slot_names: - # Note that we don't pass functools.partial here, so on TPU we can't - # extract the shape. We expect the error below. 
- slots[slot] = tf_variables.Variable( - name='{}_{}'.format(table.name, slot), - initial_value=init_ops_v2.Zeros()(shape=table.shape, - dtype=dtypes.float32), - trainable=False) - return slots - optimizer = tpu_embedding_v2_utils.Adagrad( - learning_rate=0.1, - slot_variable_creation_fn=slot_creation_fn) - strategy = self._get_strategy() - num_replicas = strategy.num_replicas_in_sync - with strategy.scope(): - with self.assertRaisesRegex(ValueError, - 'Unable to extract initializer function'): - tpu_embedding_v2.TPUEmbedding( - feature_config=self.feature_config, - batch_size=self.batch_size*num_replicas, - optimizer=optimizer) - - def test_sequence_embeddings(self): - feature_config = ( - tpu_embedding_v2_utils.FeatureConfig( - table=self.table_video, name='watched', - max_sequence_length=2), - tpu_embedding_v2_utils.FeatureConfig( - table=self.table_video, name='favorited', - max_sequence_length=2), - tpu_embedding_v2_utils.FeatureConfig( - table=self.table_user, name='friends', - max_sequence_length=3)) - optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) - strategy = self._get_strategy() - num_replicas = strategy.num_replicas_in_sync - with strategy.scope(): - mid_level = tpu_embedding_v2.TPUEmbedding( - feature_config=feature_config, - batch_size=self.batch_size * num_replicas, - optimizer=optimizer) - - dataset = self._create_sparse_dataset(strategy) - data = next(iter(strategy.experimental_distribute_dataset(dataset))) - - @def_function.function - def embedding_and_set_gradients(data): - def tpu_fn(): - activations = mid_level.dequeue() - mid_level.apply_gradients(nest.map_structure(array_ops.ones_like, - activations)) - return activations - mid_level.enqueue(data) - return strategy.run(tpu_fn) - - @def_function.function - def embedding_only(data): - def tpu_fn(): - return mid_level.dequeue() - mid_level.enqueue(data) - return strategy.run(tpu_fn) - - # Only check core 0. - before_update = self._get_replica_numpy( - embedding_and_set_gradients(data), strategy, 0) - after_update = self._get_replica_numpy(embedding_only(data), strategy, 0) - - # For videos table, row 0 and row 1 are looked up 3*num_replicas times as - # they occur 3 times per replica (considering the features 0 and 1 which are - # both looked up in the videos table). - # Feature 0 has ids [0, 0, 1], [0, 1, 1], ... repeated over num_replicas - # Feature 1 has ids [0, 1, 1], [0, 0, 1], ... repeated over num_replicas - # This means that both rows 0 and 1 get a -0.1*3*num_replicas update - # For users table, each row is looked up twice: - # Feature 2 has ids [3, 0, 1, 2], .. repeated over num_replicas - # This means that we get a -0.1*num_replicas update to the third feature. - - # In general this means that after the update, if we lookup feature 0 and 1 - # the values will be 0.3*num_replicas lower per entry and for feature 2 they - # will be 0.1*num_replicas lower. - # The one issue that that these lookups contain padding values. - # For core 0, we get the first 2 elements of the 4 element batch. - # For feature 0, the indices are [[0, 0], [1, 0], [1, 1]] with max sequence - # length of 2, which means that [0, 1] will be 0s. - # For feature 1, the indices are [[0, 0], [0, 1], [1, 0]] with max sequence - # length of 2, which means that [1, 1] will be 0s. - # For feature 2, the indices are [[0, 0], [1, 0], [1, 1], [1, 2]] with max - # sequence length of 3, which means that [0, 1], [0, 2] will be 0s. 
- # The following masks represent that so that we only apply the above updates - # to the non-padding rows: - masks = ( - np.array([[[1], [0]], [[1], [1]]]), - np.array([[[1], [1]], [[1], [0]]]), - np.array([[[1], [0], [0]], [[1], [1], [1]]])) - - per_row_update = (0.3 * num_replicas, - 0.3 * num_replicas, - 0.1 * num_replicas) - golden = tuple([before - update * mask for before, update, mask in - zip(before_update, per_row_update, masks)]) - self.assertAllClose(golden, after_update) - - -def _compute_gradients_wrt_embedding_table(batch_size, - gradient_wrt_activation, - embedding_table, - feature_indices, - feature_values, - combiner, - max_sequence_length=0): - """Compute gradients wrt embedding_table. - - Args: - batch_size: `int`, batch size. - gradient_wrt_activation: `np.array` with shape `batch_size` by - embedding `dimension`. - embedding_table: `np.array` with shape `vocabulary_size` by embedding - `dimension`. - feature_indices: `indices` as used to construct `SparseTensor`. - feature_values: `values` as used to construct `SparseTensor`. - combiner: `String`, 'mean' or 'sum'. - max_sequence_length: If non-zero, a sequence feature with the given length. - - Returns: - Gradients wrt `embedding_table`, an `np.array`s with shape - `batch_size` by `vocabulary_size` by - embedding `dimension`. - - Raises: - ValueError: if `combiner` is not one of 'mean' or 'sum'. - """ - if combiner not in ('mean', 'sum'): - raise ValueError('`combiner` must be mean or sum; got {}.'.format(combiner)) - grads = [] - for i in range(batch_size): - grad = np.zeros_like(embedding_table) - count = 0 - for (batch_i, seq_index), vocabulary_id in zip(feature_indices, - feature_values): - if batch_i == i: - count += 1 - if max_sequence_length > 0: - if seq_index < max_sequence_length: - grad[vocabulary_id, :] += gradient_wrt_activation[i, seq_index, :] - else: - grad[vocabulary_id, :] += gradient_wrt_activation[i, :] - if combiner == 'mean' and not max_sequence_length: - grad = grad / count - grads.append(grad) - return np.stack(grads) - - -def _unpack(strategy, per_replica_output): - per_replica_output = strategy.experimental_local_results(per_replica_output) - per_replica_output = array_ops.concat(per_replica_output, axis=0).numpy() - return per_replica_output - - -def _get_total_loss_tensor(activations): - losses = [] - for activation in activations: - losses.append( - math_ops.reduce_mean( - math_ops.reduce_sum( - gen_math_ops.squared_difference(activation, 0), 1))) - total_loss = array_ops.expand_dims_v2(sum(losses), 0) - return total_loss - - -def _compute_loss(activation_watched, activation_favorited, activation_friends): - watched_loss = np.mean(np.sum(activation_watched**2, axis=1)) - if len(activation_favorited.shape) == 2: - favorited_loss = np.mean(np.sum(activation_favorited**2, axis=1)) - else: - favorited_loss = np.mean(np.sum(activation_favorited**2, axis=(1, 2))) - if len(activation_friends.shape) == 2: - friends_loss = np.mean(np.sum(activation_friends**2, axis=1)) - else: - friends_loss = np.mean(np.sum(activation_friends**2, axis=(1, 2))) - loss = watched_loss + favorited_loss + friends_loss - return loss - - -def _get_tmpdir(name, subdir=''): - segments = [FLAGS.model_dir, name] + ([subdir] if subdir else []) - return os.path.join(*segments) - - -def _get_variable(variable): - if isinstance(variable, tpu_embedding_v2.TPUShardedVariable): - return variable.variables[0] - return variable - - -if __name__ == '__main__': - v2_compat.enable_v2_behavior() - test.main() diff --git 
a/tensorflow/python/tpu/tpu_embedding_v2_test_lib.py b/tensorflow/python/tpu/tpu_embedding_v2_test_lib.py deleted file mode 100644 index b6fbdea6cb4..00000000000 --- a/tensorflow/python/tpu/tpu_embedding_v2_test_lib.py +++ /dev/null @@ -1,96 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Library module for TPU Embedding mid level API test.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -from tensorflow.python.ops import init_ops_v2 -from tensorflow.python.platform import test -from tensorflow.python.tpu import tpu_embedding_v2_utils - - -class EmbeddingTestBase(test.TestCase): - """Base embedding test class for use on CPU and TPU.""" - - def _create_initial_data(self): - """Create the common test data used by both TPU and CPU.""" - - self.embedding_values = np.array(list(range(32)), dtype=np.float64) - self.initializer = init_ops_v2.Constant(self.embedding_values) - # Embedding for video initialized to - # 0 1 2 3 - # 4 5 6 7 - # ... - self.table_video = tpu_embedding_v2_utils.TableConfig( - vocabulary_size=8, - dim=4, - initializer=self.initializer, - combiner='sum', - name='video') - # Embedding for user initialized to - # 0 1 - # 2 3 - # 4 5 - # 6 7 - # ... - self.table_user = tpu_embedding_v2_utils.TableConfig( - vocabulary_size=16, - dim=2, - initializer=self.initializer, - combiner='mean', - name='user') - self.feature_config = ( - tpu_embedding_v2_utils.FeatureConfig( - table=self.table_video, name='watched'), - tpu_embedding_v2_utils.FeatureConfig( - table=self.table_video, name='favorited'), - tpu_embedding_v2_utils.FeatureConfig( - table=self.table_user, name='friends')) - - self.batch_size = 2 - self.data_batch_size = 4 - - # One (global) batch of inputs - # sparse tensor for watched: - # row 0: 0 - # row 1: 0, 1 - # row 2: 0, 1 - # row 3: 1 - self.feature_watched_indices = [[0, 0], [1, 0], [1, 1], - [2, 0], [2, 1], [3, 0]] - self.feature_watched_values = [0, 0, 1, 0, 1, 1] - self.feature_watched_row_lengths = [1, 2, 2, 1] - # sparse tensor for favorited: - # row 0: 0, 1 - # row 1: 1 - # row 2: 0 - # row 3: 0, 1 - self.feature_favorited_indices = [[0, 0], [0, 1], [1, 0], - [2, 0], [3, 0], [3, 1]] - self.feature_favorited_values = [0, 1, 1, 0, 0, 1] - self.feature_favorited_row_lengths = [2, 1, 1, 2] - # sparse tensor for friends: - # row 0: 3 - # row 1: 0, 1, 2 - # row 2: 3 - # row 3: 0, 1, 2 - self.feature_friends_indices = [[0, 0], [1, 0], [1, 1], [1, 2], - [2, 0], [3, 0], [3, 1], [3, 2]] - self.feature_friends_values = [3, 0, 1, 2, 3, 0, 1, 2] - self.feature_friends_row_lengths = [1, 3, 1, 3] From 373000aab086ce4515771bd8209449a523b00d58 Mon Sep 17 00:00:00 2001 From: Russell Power Date: Mon, 1 Jun 2020 10:02:15 -0700 Subject: [PATCH 1466/1533] Remove NoSideEffect trait from CollectivePermute op. 
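For readers unfamiliar with the op, here is a minimal pure-Python sketch (not part of this patch) of the permutation semantics that the `CollectivePermute` definition below documents: each `(source, target)` pair routes the source replica's input to the target replica's output slot.

```python
# Hypothetical helper illustrating CollectivePermute's routing; the real op
# runs across replicated TPU cores rather than over a Python list.
def collective_permute(per_replica_inputs, source_target_pairs):
  outputs = [None] * len(per_replica_inputs)
  for source, target in source_target_pairs:
    outputs[target] = per_replica_inputs[source]
  return outputs

# Matches the example in the op description: [A, B, C, D] -> [D, A, B, C].
assert collective_permute(['A', 'B', 'C', 'D'],
                          [[0, 1], [1, 2], [2, 3], [3, 0]]) == ['D', 'A', 'B', 'C']
```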
This is a TPU collective and must be kept ordered. PiperOrigin-RevId: 314149408 Change-Id: I6f460ee87ae3aab2ae6c12c0bbf9367a8c0836c6 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 23 ------------------ .../compiler/mlir/tensorflow/ir/tf_ops.td | 24 +++++++++++++++++++ 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 594dfafd991..59443ce3547 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1387,29 +1387,6 @@ Mutually accumulates multiple tensors of identical type and shape. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CollectivePermuteOp : TF_Op<"CollectivePermute", [NoSideEffect, SameOperandsAndResultShape]> { - let summary = "An Op to permute tensors across replicated TPU instances."; - - let description = [{ -Each instance supplies its own input. - -For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing -source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: -`[D, A, B, C]`. - }]; - - let arguments = (ins - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, - I32Tensor:$source_target_pairs - ); - - let results = (outs - TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; -} - def TF_CollectiveReduceOp : TF_Op<"CollectiveReduce", [SameOperandsAndResultType]> { let summary = [{ Mutually reduces multiple tensors of identical type and shape. diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 51b9dd862ac..5f3a1a5be35 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -99,6 +99,30 @@ def TF_ConstOp : TF_Op<"Const", [ConstantLike, NoSideEffect, }]; } +def TF_CollectivePermuteOp : TF_Op<"CollectivePermute", []> { + let summary = "An Op to permute tensors across replicated TPU instances."; + + let description = [{ +Each instance supplies its own input. + +For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing +source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: +`[D, A, B, C]`. + }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + I32Tensor:$source_target_pairs + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + + def TF_DataFormatVecPermuteOp : TF_Op<"DataFormatVecPermute", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Permute input tensor from `src_format` to `dst_format`"; From 3f91d43368e5855414aacf80cc0615a4edf1975b Mon Sep 17 00:00:00 2001 From: Robert David Date: Mon, 1 Jun 2020 10:13:27 -0700 Subject: [PATCH 1467/1533] Use static_cast instead of reinterpret_cast for casting from void* to another pointer. 
PiperOrigin-RevId: 314151811 Change-Id: I5960290346478a3926a1a5d517263cde77d97674 --- .../delegates/gpu/common/model_builder.cc | 63 +++++++++---------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 29d9813379e..49678a1a947 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -82,7 +82,7 @@ inline void DequantizeConstantTensor(const TfLiteTensor& tensor, const T* source_data, float* dequantized_data) { TfLiteAffineQuantization* quant_params = - reinterpret_cast(tensor.quantization.params); + static_cast(tensor.quantization.params); if (quant_params->scale->size > 1) { // Tensor is per-channel quantized. PerChannelDequantizationParams op_params; @@ -234,8 +234,7 @@ absl::Status GetFullyConnectedAttributes(int weights_tensor_id, template absl::Status RetrieveBuiltinData(const TfLiteNode* tflite_node, ParamsT** tf_options) { - const auto* params = - reinterpret_cast(tflite_node->builtin_data); + const auto* params = static_cast(tflite_node->builtin_data); if (!params) { return absl::InternalError("Unable to retrieve builtin_data."); } @@ -247,7 +246,7 @@ template absl::Status RetrieveCustomInitialData(const TfLiteNode* tflite_node, ParamsType** tf_options) { const auto* params = - reinterpret_cast(tflite_node->custom_initial_data); + static_cast(tflite_node->custom_initial_data); if (!params) { return absl::InternalError("Unable to retrieve custom_initial_data."); } @@ -427,7 +426,7 @@ class AddOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(ParseInputsWithConstTensor(node, reader, &attr.param)); node->operation.attributes = std::move(attr); const auto* tf_options = - reinterpret_cast(tflite_node->builtin_data); + static_cast(tflite_node->builtin_data); if (!tf_options) { return absl::InternalError("Missing tflite params"); } @@ -503,7 +502,7 @@ class ConcatenationOperationParser : public TFLiteOperationParser { break; } } - const auto* tf_options = reinterpret_cast( + const auto* tf_options = static_cast( tflite_node->builtin_data); if (!tf_options) { return absl::InternalError("Missing tflite params"); @@ -606,7 +605,7 @@ class Conv2DOperationParser : public TFLiteOperationParser { reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional const auto* tf_options = - reinterpret_cast(tflite_node->builtin_data); + static_cast(tflite_node->builtin_data); if (!tf_options) { return absl::InternalError("Missing tflite params"); } @@ -643,7 +642,7 @@ class Convolution2DTransposeBiasParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddInput(node, 0)); RETURN_IF_ERROR(reader->AddOutputs(node)); - const auto* params = reinterpret_cast( + const auto* params = static_cast( tflite_node->custom_initial_data); ConvolutionTransposedAttributes attr; attr.stride = @@ -878,16 +877,16 @@ class ElementwiseOperationParser : public TFLiteOperationParser { TfLiteFusedActivation activation = kTfLiteActNone; switch (operation_type_) { case OperationType::SUB: { - const auto* tf_options = reinterpret_cast( - tflite_node->builtin_data); + const auto* tf_options = + static_cast(tflite_node->builtin_data); if (tf_options != nullptr) { activation = tf_options->activation; } break; } case OperationType::DIV: { - const auto* tf_options = reinterpret_cast( - tflite_node->builtin_data); + const auto* tf_options = + static_cast(tflite_node->builtin_data); if (tf_options != 
nullptr) { activation = tf_options->activation; } @@ -1006,9 +1005,8 @@ class FullyConnectedOperationParser : public TFLiteOperationParser { Node* node = graph->NewNode(); RETURN_IF_ERROR(reader->AddInput(node, 0)); - const auto* tf_options = - reinterpret_cast( - tflite_node->builtin_data); + const auto* tf_options = static_cast( + tflite_node->builtin_data); if (tf_options->weights_format != kTfLiteFullyConnectedWeightsFormatDefault) { return absl::UnimplementedError( @@ -1118,7 +1116,7 @@ class LSTMOperationParser : public TFLiteOperationParser { } const auto* params = - reinterpret_cast(tflite_node->builtin_data); + static_cast(tflite_node->builtin_data); if (!params) { return absl::InternalError("Missing tflite params"); } @@ -1257,7 +1255,7 @@ class MulOperationParser : public TFLiteOperationParser { } const auto* tf_options = - reinterpret_cast(tflite_node->builtin_data); + static_cast(tflite_node->builtin_data); if (!tf_options) { return absl::InternalError("Missing TfLiteMulParams"); } @@ -1343,7 +1341,7 @@ class PadOperationParser : public TFLiteOperationParser { const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { if (mirror_pad_) { - auto* tf_options = reinterpret_cast( + auto* tf_options = static_cast( tflite_node->builtin_data); if (tf_options->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect) { @@ -1449,11 +1447,11 @@ class Pooling2DOperationParser : public TFLiteOperationParser { // is MaxPoolingWithArgmax2D. There is no way to read // tflite_node->builtin_code, so, simply check whether custom data is // available. - auto* tf_options = reinterpret_cast( - tflite_node->custom_initial_data); + auto* tf_options = + static_cast(tflite_node->custom_initial_data); if (!tf_options) { tf_options = - reinterpret_cast(tflite_node->builtin_data); + static_cast(tflite_node->builtin_data); } if (!tf_options) { return absl::InternalError("Missing tflite params"); @@ -1642,8 +1640,7 @@ class Resize2DOperationParser : public TFLiteOperationParser { template absl::Status GetAlignCornersValueForType(const TfLiteNode* tflite_node, bool* align_corners) { - const auto* tf_options = - reinterpret_cast(tflite_node->builtin_data); + const auto* tf_options = static_cast(tflite_node->builtin_data); if (!tf_options) { return absl::InternalError("Missing tflite params"); } @@ -1654,8 +1651,8 @@ class Resize2DOperationParser : public TFLiteOperationParser { absl::Status GetHalfPixelCentersValue(const TfLiteNode* tflite_node, bool* half_pixel_centers) { if (sampling_type_ == SamplingType::BILINEAR) { - const auto* tf_options = reinterpret_cast( - tflite_node->builtin_data); + const auto* tf_options = + static_cast(tflite_node->builtin_data); if (!tf_options) { return absl::InternalError( "Missing tflite params for ResizeBilinear op"); @@ -1816,7 +1813,7 @@ class SoftmaxOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddOutputs(node)); const auto* tf_options = - reinterpret_cast(tflite_node->builtin_data); + static_cast(tflite_node->builtin_data); if (!tf_options) { return absl::InternalError("Missing tflite params"); } @@ -1863,8 +1860,8 @@ class SpaceToDepthOperationParser : public TFLiteOperationParser { node->operation.type = ToString(OperationType::SPACE_TO_DEPTH); RETURN_IF_ERROR(reader->AddInput(node, 0)); RETURN_IF_ERROR(reader->AddOutputs(node)); - const auto* tf_options = reinterpret_cast( - tflite_node->builtin_data); + const auto* tf_options = + static_cast(tflite_node->builtin_data); SpaceToDepthAttributes attr; attr.block_size = 
tf_options->block_size; node->operation.attributes = attr; @@ -1904,8 +1901,8 @@ class StridedSliceOperationParser : public TFLiteOperationParser { "Slicing is supported for 3 or 4 dimensional tensors only."); } - const auto* tf_options = reinterpret_cast( - tflite_node->builtin_data); + const auto* tf_options = + static_cast(tflite_node->builtin_data); auto out_shape = graph->FindOutputs(node->id)[0]->tensor.shape; if (!tf_options) { return absl::InternalError("Missing tflite params"); @@ -2080,7 +2077,7 @@ class TransposeConvOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(graph->AddConsumer(node->id, input->id)); RETURN_IF_ERROR(reader->AddOutputs(node)); - const auto* tf_options = reinterpret_cast( + const auto* tf_options = static_cast( tflite_node->builtin_data); if (!tf_options) { return absl::InternalError("Missing tflite options."); @@ -2163,8 +2160,8 @@ class Unpooling2DOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddOutputs(node)); auto input_shape = graph->FindInputs(node->id)[0]->tensor.shape; MaxUnpooling2DAttributes attr; - const auto* tf_options = reinterpret_cast( - tflite_node->custom_initial_data); + const auto* tf_options = + static_cast(tflite_node->custom_initial_data); if (!tf_options) { return absl::InternalError("Missing tflite params"); } From cbac31d59ded64caa23a00481bb33d1104ab8822 Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool Date: Mon, 1 Jun 2020 10:14:16 -0700 Subject: [PATCH 1468/1533] Add a bfloat16 sum reducer that uses float32 accumulators. Fix existing tests. The majority of the changes are from PR #38630 ([Intel MKL] Enable BF16 Softmax/SoftmaxGrad) which was reverted because of test failures. PiperOrigin-RevId: 314152011 Change-Id: Ib50e1ae90016c05a6fc62b8d21ce7b3f34d28833 --- tensorflow/core/kernels/mkl_tmp_bf16_ops.cc | 4 +++- tensorflow/core/kernels/reduction_ops.h | 24 +++++++++++++++++++- tensorflow/core/kernels/softmax_op.cc | 1 + tensorflow/core/ops/nn_grad.cc | 2 +- tensorflow/python/ops/math_ops_test.py | 10 +++++++++ tensorflow/python/ops/nn_grad_test.py | 25 +++++++++++++++++++++ tensorflow/python/ops/nn_test.py | 12 ++++++++++ 7 files changed, 75 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc b/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc index 9b2d09fb827..ed5fec677e8 100644 --- a/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc +++ b/tensorflow/core/kernels/mkl_tmp_bf16_ops.cc @@ -58,7 +58,9 @@ namespace tensorflow { REGISTER_KERNEL_BUILDER( \ Name("_FusedMatMul").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); \ REGISTER_KERNEL_BUILDER( \ - Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); + Name("BatchMatMulV2").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Softmax").Device(DEVICE_CPU).TypeConstraint("T"), NoOp); TF_CALL_bfloat16(REGISTER_CPU); #undef REGISTER_CPU diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h index 46d8051fff1..d1066d6556b 100644 --- a/tensorflow/core/kernels/reduction_ops.h +++ b/tensorflow/core/kernels/reduction_ops.h @@ -18,7 +18,6 @@ limitations under the License. // Functor definitions for Reduction ops, must be compilable by nvcc. -#include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_types.h" @@ -58,6 +57,29 @@ struct ReduceEigenImpl { } }; +// Specialization for BF16 Reducer to fix accuracy. 
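As a reading aid, a minimal Python sketch (not part of the patch) of what the bfloat16 `SumReducer` specialization in this hunk does: upcast the input to float32, reduce in float32, then cast the reduced value back to bfloat16. This is also the reference that the new `testReduceExtendType` check compares against.

```python
import numpy as np
import tensorflow as tf

x32 = np.random.randn(1000, 1000).astype(np.float32)
x16 = tf.cast(x32, tf.bfloat16)

# The specialization's behavior: accumulate in float32, cast back to bfloat16.
reference = tf.cast(tf.reduce_sum(tf.cast(x16, tf.float32)), tf.bfloat16)

# With the specialization registered, reducing the bfloat16 tensor directly
# should track the float32-accumulated reference closely.
result = tf.reduce_sum(x16)
np.testing.assert_allclose(tf.cast(result, tf.float32).numpy(),
                           tf.cast(reference, tf.float32).numpy(), rtol=1e-2)
```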
+// TODO: All BF16 reducers should have specializations to fix accuracy. +#define CASTING_SPECIALIZATION(Reducer, ScalarType, IntermediateType) \ + template \ + struct ReduceEigenImpl> { \ + void operator()(const Device& d, OUT_T out, IN_T in, \ + const ReductionAxes& reduction_axes, \ + const Reducer& reducer) { \ + static_assert(std::is_same::value, \ + ""); \ + Reducer intermediate_reducer; \ + auto in_as_intermediate = in.template cast(); \ + out.device(d) = \ + in_as_intermediate.reduce(reduction_axes, intermediate_reducer) \ + .template cast(); \ + } \ + }; + +CASTING_SPECIALIZATION(Eigen::internal::SumReducer, bfloat16, float); +#undef CASTING_SPECIALIZATION + template struct ReduceEigenImpl("T"), \ SoftmaxOp); +TF_CALL_bfloat16(REGISTER_CPU); TF_CALL_half(REGISTER_CPU); TF_CALL_float(REGISTER_CPU); TF_CALL_double(REGISTER_CPU); diff --git a/tensorflow/core/ops/nn_grad.cc b/tensorflow/core/ops/nn_grad.cc index 7beaf57c10b..30c0c96c6c8 100644 --- a/tensorflow/core/ops/nn_grad.cc +++ b/tensorflow/core/ops/nn_grad.cc @@ -31,7 +31,7 @@ Status SoftmaxGrad(const AttrSlice& attrs, FunctionDef* g) { // Ret val defs {"grad_x: T"}, // Attr defs - {{"T: {float, double}"}}, + {{"T: {float, double, bfloat16}"}}, // Nodes // Based on _SoftmaxGrad in nn_grad.py. { diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index afa1dbdbaf7..ca34a0012f1 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -44,6 +44,16 @@ class ReduceTest(test_util.TensorFlowTestCase): y_tf = self.evaluate(math_ops.reduce_sum(x)) self.assertEqual(y_tf, 21) + def testReduceExtendType(self): + in_f32 = np.random.randn(1000, 1000).astype(np.float32) + in_bf16 = math_ops.cast(in_f32, dtypes.bfloat16) + + out_f32 = self.evaluate(math_ops.reduce_sum(in_f32)) + out_bf16 = self.evaluate(math_ops.reduce_sum(in_bf16)) + expected = math_ops.cast(out_f32, dtypes.bfloat16) + + self.assertAllClose(out_bf16, expected, 1e-3) + def testReduceExplicitAxes(self): x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32) with test_util.device(use_gpu=True): diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py index 9da56cb7200..490451c16c9 100644 --- a/tensorflow/python/ops/nn_grad_test.py +++ b/tensorflow/python/ops/nn_grad_test.py @@ -20,12 +20,14 @@ from __future__ import print_function import numpy as np +from tensorflow.python.eager import backprop from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import gradient_checker +from tensorflow.python.ops import gradient_checker_v2 from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import nn_grad # pylint: disable=unused-import from tensorflow.python.ops import nn_impl @@ -33,6 +35,29 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test +class SoftmaxOpTest(test.TestCase): + + # This test is for bfloat16, but the type has a problem with compute_gradient. + # TODO(penporn): Change the data type back to bfloat16 once b/157773623 is + # fixed. (compute_gradient internally converts bfloat16 to float32 for + # calculation anyway.) 
+ def testSoftmaxGradGradExtendType(self): + with self.cached_session(): + + def f(x): + assert x.dtype == dtypes.float32 + with backprop.GradientTape() as tape: + tape.watch(x) + y = nn_ops.softmax(x) + return tape.gradient(y, x) + + x = constant_op.constant([[-2, -1, 1, 3], [5, 7, 8, 9]], + dtype=dtypes.float32) + error = gradient_checker_v2.max_error( + *gradient_checker_v2.compute_gradient(f, [x])) + self.assertLess(error, 1e-4) + + class Relu6OpTest(test.TestCase): @test_util.run_deprecated_v1 diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py index 911eca9fbae..e672018bcf6 100644 --- a/tensorflow/python/ops/nn_test.py +++ b/tensorflow/python/ops/nn_test.py @@ -130,6 +130,18 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase): self.assertAllClose(x_neg_axis_tf, y_pos_axis_tf, eps) self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps) + def testSoftmaxExtendType(self): + x_shape = [5, 10] + x_np = np.random.randn(*x_shape).astype(np.float32) + + x_f32_tf = constant_op.constant(x_np) + x_bf16_tf = math_ops.cast(x_f32_tf, dtypes.bfloat16) + y_f32_tf = self.evaluate(nn_ops.softmax(x_f32_tf)) + y_bf16_tf = self.evaluate(nn_ops.softmax(x_bf16_tf)) + expected = math_ops.cast(y_f32_tf, dtypes.bfloat16) + tol = x_shape[1] * 1e-3 + self.assertAllClose(y_bf16_tf, expected, rtol=tol, atol=tol) + @parameterized.parameters(((5, 10),), ((2, 3, 4),)) @test_util.run_deprecated_v1 def testGradient(self, x_shape): From dd10f45e258f0837ad4d0c16a054f5aa016698d7 Mon Sep 17 00:00:00 2001 From: Blake Hechtman Date: Mon, 1 Jun 2020 10:32:40 -0700 Subject: [PATCH 1469/1533] [TF:STATELES_RNG] clarify that the same output of stateless rng is only guaranteed for the same shape and seed. PiperOrigin-RevId: 314155951 Change-Id: I799dd8a3ec95bb5f7df73fcc36067674263525f5 --- tensorflow/python/ops/stateless_random_ops.py | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py index 0ae29ba0219..25fefcc514c 100644 --- a/tensorflow/python/ops/stateless_random_ops.py +++ b/tensorflow/python/ops/stateless_random_ops.py @@ -124,9 +124,9 @@ def stateless_random_uniform(shape, """Outputs deterministic pseudorandom values from a uniform distribution. This is a stateless version of `tf.random.uniform`: if run twice with the - same seeds, it will produce the same pseudorandom numbers. The output is - consistent across multiple runs on the same hardware (and between CPU - and GPU), but may change between versions of TensorFlow or on non-CPU/GPU + same seeds and shapes, it will produce the same pseudorandom numbers. The + output is consistent across multiple runs on the same hardware (and between + CPU and GPU), but may change between versions of TensorFlow or on non-CPU/GPU hardware. The generated values follow a uniform distribution in the range @@ -222,10 +222,10 @@ def stateless_random_binomial(shape, probability of success parameters. This is a stateless version of `tf.random.Generator.binomial`: if run twice - with the same seeds, it will produce the same pseudorandom numbers. The - output is consistent across multiple runs on the same hardware (and between - CPU and GPU), but may change between versions of TensorFlow or on non-CPU/GPU - hardware. + with the same seeds and shapes, it will produce the same pseudorandom numbers. 
+ The output is consistent across multiple runs on the same hardware (and + between CPU and GPU), but may change between versions of TensorFlow or on + non-CPU/GPU hardware. Example: @@ -292,9 +292,10 @@ def stateless_random_gamma(shape, (`alpha`) and inverse scale (`beta`) parameters. This is a stateless version of `tf.random.gamma`: if run twice with the same - seeds, it will produce the same pseudorandom numbers. The output is consistent - across multiple runs on the same hardware (and between CPU and GPU), but may - change between versions of TensorFlow or on non-CPU/GPU hardware. + seeds and shapes, it will produce the same pseudorandom numbers. The output is + consistent across multiple runs on the same hardware (and between CPU and + GPU), + but may change between versions of TensorFlow or on non-CPU/GPU hardware. A slight difference exists in the interpretation of the `shape` parameter between `stateless_gamma` and `gamma`: in `gamma`, the `shape` is always @@ -390,9 +391,9 @@ def stateless_random_poisson(shape, parameter. This is a stateless version of `tf.random.poisson`: if run twice with the same - seeds, it will produce the same pseudorandom numbers. The output is consistent - across multiple runs on the same hardware, but may change between versions of - TensorFlow or on non-CPU/GPU hardware. + seeds and shapes, it will produce the same pseudorandom numbers. The output is + consistent across multiple runs on the same hardware, but may change between + versions of TensorFlow or on non-CPU/GPU hardware. A slight difference exists in the interpretation of the `shape` parameter between `stateless_poisson` and `poisson`: in `poisson`, the `shape` is always @@ -451,9 +452,9 @@ def stateless_random_normal(shape, """Outputs deterministic pseudorandom values from a normal distribution. This is a stateless version of `tf.random.normal`: if run twice with the - same seeds, it will produce the same pseudorandom numbers. The output is - consistent across multiple runs on the same hardware (and between CPU - and GPU), but may change between versions of TensorFlow or on non-CPU/GPU + same seeds and shapes, it will produce the same pseudorandom numbers. The + output is consistent across multiple runs on the same hardware (and between + CPU and GPU), but may change between versions of TensorFlow or on non-CPU/GPU hardware. Args: @@ -492,10 +493,9 @@ def stateless_truncated_normal(shape, """Outputs deterministic pseudorandom values, truncated normally distributed. This is a stateless version of `tf.random.truncated_normal`: if run twice with - the - same seeds, it will produce the same pseudorandom numbers. The output is - consistent across multiple runs on the same hardware (and between CPU - and GPU), but may change between versions of TensorFlow or on non-CPU/GPU + the same seeds and shapes, it will produce the same pseudorandom numbers. The + output is consistent across multiple runs on the same hardware (and between + CPU and GPU), but may change between versions of TensorFlow or on non-CPU/GPU hardware. The generated values follow a normal distribution with specified mean and @@ -540,9 +540,9 @@ def stateless_multinomial(logits, """Draws deterministic pseudorandom samples from a multinomial distribution. This is a stateless version of `tf.random.categorical`: if run twice with the - same seeds, it will produce the same pseudorandom numbers. 
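For illustration, a minimal eager-mode sketch (not part of this patch) of the guarantee these docstrings now state: a stateless RNG op is deterministic only for a fixed seed and shape, so changing the shape gives no relationship between the outputs.

```python
import tensorflow as tf

seed = [1, 2]  # arbitrary per-op seed

# Same seed, same shape: identical values on every run.
a = tf.random.stateless_uniform([2, 3], seed=seed)
b = tf.random.stateless_uniform([2, 3], seed=seed)
tf.debugging.assert_equal(a, b)

# Same seed, different shape: c[:2, :] is not guaranteed to equal `a`,
# which is exactly the caveat the reworded docstrings add.
c = tf.random.stateless_uniform([4, 3], seed=seed)
```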
The output is - consistent across multiple runs on the same hardware (and between CPU - and GPU), but may change between versions of TensorFlow or on non-CPU/GPU + same seeds and shapes, it will produce the same pseudorandom numbers. The + output is consistent across multiple runs on the same hardware (and between + CPU and GPU), but may change between versions of TensorFlow or on non-CPU/GPU hardware. Example: @@ -581,11 +581,12 @@ def stateless_categorical(logits, """Draws deterministic pseudorandom samples from a categorical distribution. This is a stateless version of `tf.categorical`: if run twice with the - same seeds, it will produce the same pseudorandom numbers. The output is - consistent across multiple runs on the same hardware (and between CPU - and GPU), but may change between versions of TensorFlow or on non-CPU/GPU + same seeds and shapes, it will produce the same pseudorandom numbers. The + output is consistent across multiple runs on the same hardware (and between + CPU and GPU), but may change between versions of TensorFlow or on non-CPU/GPU hardware. + Example: ```python From 38e7a4457d8afd1a9960e079c6857ca32a73c2ef Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Mon, 1 Jun 2020 10:51:58 -0700 Subject: [PATCH 1470/1533] Consolidate all the quantization configurations and validations This patch makes the following changes to the tflite converter: - All the quantization related converter and quantizer configurations are consolidated in the QuantizationMode class - Converter v1 and v2 shared the same interface to set the quantization flags - The restriction "couldn't use int8 inference type when representative data is specified" is removed since the we workflow should be able to partially quantize the model by the converter and then post-training quantization by the quantizer (MLIR-based). PiperOrigin-RevId: 314160448 Change-Id: Ic212cddd89cfdd33048592d2aaf8e9b2f9f8999f --- tensorflow/lite/python/lite.py | 172 +++++++++++++--------------- tensorflow/lite/python/lite_test.py | 29 +++++ 2 files changed, 110 insertions(+), 91 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 53814bb0c43..a68077296dd 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -204,7 +204,8 @@ class QuantizationMode(object): def training_time_int8_allow_float(self): """Training-time int8 quantize, allow float fallback.""" return (self._any_optimization_enabled() and - self._contains_training_quant_op()) + not self.post_training_dynamic_range_int8() and + not self.post_training_fp16()) def post_training_dynamic_range_int8(self): """Post training int8 const, on-the-fly int8 quantize of dynamic tensors.""" @@ -212,7 +213,7 @@ class QuantizationMode(object): # int8 quantization and training time quantization was not done. return (self._any_optimization_enabled() and self._representative_dataset is None and - not self._contains_training_quant_op() and + not self.contains_training_quant_op() and self._smallest_supported_type() == constants.INT8) def post_training_fp16(self): @@ -228,6 +229,66 @@ class QuantizationMode(object): self.post_training_dynamic_range_int8() or self.post_training_fp16()) + def converter_flags(self, inference_ty=None, inference_input_ty=None): + """Flags to the converter.""" + if (self.post_training_int8_no_float() or + self.post_training_int8_allow_float()): + # The inference_input_type is for the quantizer, then we need to keep the + # converter inference_iput_type to float. 
+ inference_input_ty = constants.FLOAT + + if self.training_time_int8_allow_float(): + return { + "inference_type": inference_ty if inference_ty else constants.INT8, + "inference_input_type": + inference_input_ty if inference_input_ty else constants.FLOAT, + "post_training_quantize": False, # disable dynamic range quantization + "quantize_to_float16": False # disable float16 quantization + } + elif self.post_training_dynamic_range_int8(): + return { + "inference_type": constants.FLOAT, + "inference_input_type": constants.FLOAT, + "post_training_quantize": True, # enable dynamic range quantization + "quantize_to_float16": False # disable float16 quantization + } + elif self.post_training_fp16(): + return { + "inference_type": constants.FLOAT, + "inference_input_type": constants.FLOAT, + "post_training_quantize": True, + "quantize_to_float16": True # enable float16 quantization + } + else: + # Note this might still trigger (uint8) quantization to be compatible with + # TOCO. + return { + "inference_type": inference_ty if inference_ty else constants.FLOAT, + "inference_input_type": inference_input_ty, + "post_training_quantize": False, # enable dynamic range quantization + "quantize_to_float16": False # disable float16 quantization + } + + def quantizer_flags(self, input_ty=None, output_ty=None): + """Default flags to the TFMOT quantizer.""" + + inference_input_type = input_ty if input_ty else constants.FLOAT + inference_output_type = output_ty if output_ty else constants.FLOAT + if self.post_training_int8_no_float(): + return True, { + "inference_input_type": inference_input_type, + "inference_output_type": inference_output_type, + "allow_float": False + } + elif self.post_training_int8_allow_float(): + return True, { + "inference_input_type": inference_input_type, + "inference_output_type": inference_output_type, + "allow_float": True + } + else: + return False, None + # Below are helpers for the above functions. def _validate_int8_required(self): @@ -271,7 +332,7 @@ class QuantizationMode(object): # The default smallest supported type is INT8. 
return constants.INT8 - def _contains_training_quant_op(self): + def contains_training_quant_op(self): """Checks if the graph contains any training-time quantization ops.""" training_quant_ops = frozenset({ "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxVarsPerChannel", @@ -361,8 +422,6 @@ class TFLiteConverterBase(object): args = { "input_format": constants.TENSORFLOW_GRAPHDEF, "allow_custom_ops": self.allow_custom_ops, - "post_training_quantize": False, - "quantize_to_float16": False, "debug_info": self._debug_info, "target_ops": self.target_spec.supported_ops, "enable_mlir_converter": self.experimental_new_converter, @@ -462,22 +521,7 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): graph_def) converter_kwargs = self._get_base_converter_args() - - if quant_mode.training_time_int8_allow_float(): - converter_kwargs.update({ - "inference_type": constants.INT8, - "inference_input_type": constants.FLOAT, - }) - - if quant_mode.post_training_dynamic_range_int8(): - converter_kwargs.update({ - "post_training_quantize": True, - }) - elif quant_mode.post_training_fp16(): - converter_kwargs.update({ - "post_training_quantize": True, - "quantize_to_float16": True, - }) + converter_kwargs.update(quant_mode.converter_flags()) if not self.experimental_new_converter: logging.warning( @@ -497,12 +541,9 @@ class TFLiteConverterBaseV2(TFLiteConverterBase): output_tensors=output_tensors, **converter_kwargs) - if quant_mode.post_training_int8_no_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, False) - elif quant_mode.post_training_int8_allow_float(): - result = self._calibrate_quantize_model(result, constants.FLOAT, - constants.FLOAT, True) + calibrate_and_quantize, flags = quant_mode.quantizer_flags() + if calibrate_and_quantize: + result = self._calibrate_quantize_model(result, **flags) if self._experimental_sparsify_model: result = _mlir_sparsify(result) @@ -1046,7 +1087,7 @@ class TFLiteConverterBaseV1(TFLiteConverterBase): return self.target_spec.supported_ops return object.__getattribute__(self, name) - def _validate_quantized_input_stats(self, converter_kwargs): + def _validate_quantized_input_stats(self, converter_kwargs, calibrate): """Ensure quantized_input_stats provided if required.""" quantized_types = frozenset({constants.INT8, constants.QUANTIZED_UINT8}) @@ -1054,7 +1095,7 @@ class TFLiteConverterBaseV1(TFLiteConverterBase): requires_quantized_input_stats = ( (converter_kwargs["inference_type"] in quantized_types or converter_kwargs["inference_input_type"] in quantized_types) and - not converter_kwargs["post_training_quantize"]) + not calibrate) if (requires_quantized_input_stats and not converter_kwargs["quantized_input_stats"]): @@ -1110,50 +1151,10 @@ class TFLiteConverterBaseV1(TFLiteConverterBase): else: quantized_stats = None - toco_inference_input_type = self.inference_input_type - inference_input_type = self.inference_input_type - inference_output_type = self.inference_output_type - post_training_optimize = ( - quant_mode.post_training_int8_no_float() or - quant_mode.post_training_int8_allow_float() or - quant_mode.post_training_dynamic_range_int8() or - quant_mode.post_training_fp16()) - if post_training_optimize: - # Post training optimizations require that TOCO outputs a float model. - if self.inference_type != constants.FLOAT: - raise ValueError( - "`optimizations` require that `inference_type` is set to float.") - toco_inference_input_type = constants.FLOAT - # Set up default values. 
- if inference_input_type is None: - inference_input_type = constants.FLOAT - if inference_output_type is None: - inference_output_type = constants.FLOAT - - weight_only_quantize = ( - quant_mode.post_training_dynamic_range_int8() or - quant_mode.post_training_fp16()) - if weight_only_quantize: - # Currently, weight only quantization requires float inputs and outputs. - if (inference_input_type != constants.FLOAT or - inference_output_type != constants.FLOAT): - raise ValueError( - "Provide an inference_input_type and inference_output_type of type " - "tf.float32.") - - if not post_training_optimize and self.inference_output_type is not None: - raise ValueError( - "inference_output_type is currently not supported if optimizations " - "are not enabled.") - optimized_graph = self._graph_def if not self.saved_model_dir: - # if it is not uint8 or int8 with post-training quantization, it is not - # quantization aware training, then graph optimization is applied. - # Graph optimization is disabled for quantization aware training. - if (self.inference_type != constants.QUANTIZED_UINT8 or - (self.inference_type == constants.INT8 and - (post_training_optimize or weight_only_quantize))): + # Disable grappler constant folding if there are training quant ops. + if not quant_mode.contains_training_quant_op(): try: # TODO(b/150163103): Merge `disabling lower using switch merge' calls. # Grappler will also try to lower while loop into switch merge @@ -1174,20 +1175,10 @@ class TFLiteConverterBaseV1(TFLiteConverterBase): self._debug_info = _get_debug_info(self._debug_info_func, optimized_graph) converter_kwargs = self._get_base_converter_args() - - if quant_mode.post_training_dynamic_range_int8(): - converter_kwargs.update({ - "post_training_quantize": True, - }) - elif quant_mode.post_training_fp16(): - converter_kwargs.update({ - "post_training_quantize": True, - "quantize_to_float16": True, - }) - + converter_kwargs.update( + quant_mode.converter_flags(self.inference_type, + self.inference_input_type)) converter_kwargs.update({ - "inference_type": self.inference_type, - "inference_input_type": toco_inference_input_type, "output_format": self.output_format, "quantized_input_stats": quantized_stats, "default_ranges_stats": self.default_ranges_stats, @@ -1211,7 +1202,10 @@ class TFLiteConverterBaseV1(TFLiteConverterBase): "please file a bug. You can opt-out " "by setting experimental_new_converter=False") - self._validate_quantized_input_stats(converter_kwargs) + calibrate_quantize, flags = quant_mode.quantizer_flags( + self.inference_input_type, self.inference_output_type) + + self._validate_quantized_input_stats(converter_kwargs, calibrate_quantize) # Converts model. 
if self._has_valid_tensors(): @@ -1227,12 +1221,8 @@ class TFLiteConverterBaseV1(TFLiteConverterBase): output_arrays=self._output_arrays, **converter_kwargs) - if quant_mode.post_training_int8_no_float(): - result = self._calibrate_quantize_model(result, inference_input_type, - inference_output_type, False) - elif quant_mode.post_training_int8_allow_float(): - result = self._calibrate_quantize_model(result, inference_input_type, - inference_output_type, True) + if calibrate_quantize: + result = self._calibrate_quantize_model(result, **flags) if self._experimental_sparsify_model: result = _mlir_sparsify(result) diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 1bcb2ce0ee4..9611bda2594 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -1149,6 +1149,35 @@ class FromSessionTest(TestModels, parameterized.TestCase): } quantized_converter.convert() + def testTrainingTimeAndPostTrainingCalibrateAndQuantize(self): + with ops.Graph().as_default(): + inp, output, calibration_gen = self._getCalibrationQuantizeModel() + sess = session.Session() + + # Convert float model. + float_converter = lite.TFLiteConverter.from_session(sess, [inp], [output]) + float_tflite = float_converter.convert() + self.assertTrue(float_tflite) + + converter = lite.TFLiteConverter.from_session(sess, [inp], [output]) + + # extra flags to trigger training time quantization conversion + converter.inference_type = lite_constants.INT8 + converter.inference_input_type = lite_constants.FLOAT + converter.inference_output_type = lite_constants.FLOAT + input_arrays = converter.get_input_arrays() + converter.quantized_input_stats = { + input_arrays[0]: (0., 1.) + } + # trigger post-training quantization + converter.optimizations = [lite.Optimize.DEFAULT] + converter.representative_dataset = calibration_gen + converter._experimental_new_quantizer = True + quantized_tflite = converter.convert() + self.assertTrue(quantized_tflite) + + self.assertLess(len(quantized_tflite), len(float_tflite)) + def testFloatTocoConverter(self): """Tests deprecated test TocoConverter.""" with ops.Graph().as_default(): From 59589939b3d01988e22cce93d4ccfeceb5da01e5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 11:20:42 -0700 Subject: [PATCH 1471/1533] Fix a typo in documentation of metadata schema. PiperOrigin-RevId: 314166955 Change-Id: I76c7a5ed3412a3b40938fe19256d7715fab5d27d --- .../lite/experimental/support/metadata/metadata_schema.fbs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs index a2812e1b6e3..2883a91bd0c 100644 --- a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs +++ b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs @@ -287,7 +287,7 @@ table Content { // dimension 1/2: the heatmap image. // dimension 3: 17 body parts of a person. // Even though the last axis is body part, the real content of this tensor is - // the heatmap. "range" should be [min=2; max=3]. + // the heatmap. "range" should be [min=1; max=2]. // // Case 3: The tensor contains multiple different objects. (Not supported by // Content at this point). 
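As background for the quantization-mode consolidation in PATCH 1470 above: the new converter_flags()/quantizer_flags() helpers classify the same user-facing workflow that the added testTrainingTimeAndPostTrainingCalibrateAndQuantize case drives through the v1 API. Below is a minimal sketch of the equivalent v2 flow; saved_model_dir and representative_dataset are hypothetical placeholders for illustration, not part of the patch.

    import tensorflow as tf

    # saved_model_dir / representative_dataset: illustrative placeholders only.
    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
    # Optimize.DEFAULT together with a representative dataset selects the
    # post-training int8 calibrate-and-quantize path, which QuantizationMode
    # now reports through quantizer_flags() (float fallback allowed by default).
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.representative_dataset = representative_dataset
    tflite_model = converter.convert()

    # Omitting the representative dataset instead lands in the dynamic-range
    # branch, which converter_flags() maps to post_training_quantize=True.
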
From 4fb21eb662e234805a89642644e94ddabf4d3a9c Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Mon, 1 Jun 2020 11:24:04 -0700 Subject: [PATCH 1472/1533] [tf.data service] Always link server lib This addresses an issue where server_lib could be left out of pywrap_tensorflow.so, causing errors like "undefined symbol: _ZN10tensorflow4data18GrpcDataServerBase9BoundPortEv" PiperOrigin-RevId: 314167693 Change-Id: Iead424c4fa5bdcaa3595415444acebc170b35851 --- tensorflow/core/data/service/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/data/service/BUILD b/tensorflow/core/data/service/BUILD index b76f93c454e..6b2341d2b6d 100644 --- a/tensorflow/core/data/service/BUILD +++ b/tensorflow/core/data/service/BUILD @@ -244,6 +244,7 @@ cc_library( name = "server_lib", srcs = ["server_lib.cc"], hdrs = ["server_lib.h"], + linkstatic = True, visibility = [ "//visibility:public", ], @@ -257,6 +258,7 @@ cc_library( "//tensorflow/core:tensorflow", tf_grpc_cc_dependency(), ], + alwayslink = 1, ) cc_library( From e7c9422d7142fdd9abd6bce4a6ce83c8b1b5e85c Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 1 Jun 2020 11:26:42 -0700 Subject: [PATCH 1473/1533] Provide XPlaneBuilder interface for serializing protobuf PiperOrigin-RevId: 314168274 Change-Id: I9c12f9dfd22982f6c2e7c8f0b132e930688baecd --- .../internal/cpu/metadata_collector.cc | 7 +++--- .../core/profiler/utils/tfstreamz_utils.cc | 12 +++------ .../core/profiler/utils/xplane_builder.h | 25 ++++++++----------- 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc index 58da20ae3c5..da922d4b18b 100644 --- a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc +++ b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc @@ -68,12 +68,11 @@ class MetadataCollector : public ProfilerInterface { XPlane* plane = GetOrCreatePlane(space, kMetadataPlane); plane->set_id(kMetadataPlaneId); XPlaneBuilder xplane(plane); + const XStatMetadata& hlo_proto_stat = + *xplane.GetOrCreateStatMetadata(kHloProto); for (auto& p : debug_info_) { - std::string hlo_proto; - p.hlo_proto->SerializeToString(&hlo_proto); + xplane.AddStatValue(hlo_proto_stat, *p.hlo_proto); p.hlo_proto.reset(); - xplane.AddStatValue(*xplane.GetOrCreateStatMetadata(kHloProto), - std::move(hlo_proto), /*is_bytes=*/true); } debug_info_.clear(); } diff --git a/tensorflow/core/profiler/utils/tfstreamz_utils.cc b/tensorflow/core/profiler/utils/tfstreamz_utils.cc index f4cbaa84100..230efb6a055 100644 --- a/tensorflow/core/profiler/utils/tfstreamz_utils.cc +++ b/tensorflow/core/profiler/utils/tfstreamz_utils.cc @@ -53,7 +53,7 @@ std::string ConstructXStatName(absl::string_view name, })); } -std::string SerializePercentile(const monitoring::Percentiles& percentiles) { +tfstreamz::Percentiles ToProto(const monitoring::Percentiles& percentiles) { tfstreamz::Percentiles output; output.set_unit_of_measure( static_cast(percentiles.unit_of_measure)); @@ -71,7 +71,7 @@ std::string SerializePercentile(const monitoring::Percentiles& percentiles) { percentile_point->set_percentile(pp.percentile); percentile_point->set_value(pp.value); } - return output.SerializeAsString(); + return output; } } // namespace @@ -113,14 +113,10 @@ Status SerializeToXPlane(const std::vector& snapshots, point->string_value)); break; case monitoring::ValueType::kHistogram: - xevent.AddStatValue(*metadata, - point->histogram_value.SerializeAsString(), - /*is_bytes=*/true); 
+ xevent.AddStatValue(*metadata, point->histogram_value); break; case monitoring::ValueType::kPercentiles: - xevent.AddStatValue(*metadata, - SerializePercentile(point->percentiles_value), - /*is_bytes=*/true); + xevent.AddStatValue(*metadata, ToProto(point->percentiles_value)); break; } } diff --git a/tensorflow/core/profiler/utils/xplane_builder.h b/tensorflow/core/profiler/utils/xplane_builder.h index b0d743a0caf..d948964bc2e 100644 --- a/tensorflow/core/profiler/utils/xplane_builder.h +++ b/tensorflow/core/profiler/utils/xplane_builder.h @@ -24,6 +24,7 @@ limitations under the License. #include "absl/strings/numbers.h" #include "absl/strings/string_view.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/time_utils.h" @@ -55,26 +56,20 @@ class XStatsBuilder { void AddStatValue(const XStatMetadata& metadata, double value) { AddStat(metadata)->set_double_value(value); } - void AddStatValue(const XStatMetadata& metadata, absl::string_view value, - bool is_bytes = false) { - if (is_bytes) { - AddStat(metadata)->set_bytes_value(std::string(value)); - } else { - AddStat(metadata)->set_str_value(std::string(value)); - } + void AddStatValue(const XStatMetadata& metadata, absl::string_view value) { + AddStat(metadata)->set_str_value(std::string(value)); } - void AddStatValue(const XStatMetadata& metadata, std::string&& value, - bool is_bytes = false) { - if (is_bytes) { - AddStat(metadata)->set_bytes_value(std::move(value)); - } else { - AddStat(metadata)->set_str_value(std::move(value)); - } + void AddStatValue(const XStatMetadata& metadata, std::string&& value) { + AddStat(metadata)->set_str_value(std::move(value)); } - void AddStatValue(const XStatMetadata& key, const XStatMetadata& value) { AddStat(key)->set_ref_value(value.id()); } + void AddStatValue(const XStatMetadata& metadata, + const protobuf::MessageLite& proto) { + auto* bytes = AddStat(metadata)->mutable_bytes_value(); + proto.SerializeToString(bytes); + } void AddStat(const XStatMetadata& key, const XStat& stat, const XPlane& src); From ecb473d27ba875cdf46aa1630afd1edb9c65423b Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Mon, 1 Jun 2020 11:36:44 -0700 Subject: [PATCH 1474/1533] Respect the quantization parameters from QAT when applying post-training quantization PiperOrigin-RevId: 314170250 Change-Id: I516de53a8b1b5156e7f5df02688e04d280436cb5 --- .../mlir/lite/quantization/quantization_utils.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index 3d50f280d0f..32f68aaae5f 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project @@ -436,6 +437,16 @@ bool RemoveRedundantStatsOps(mlir::FuncOp func, llvm::SmallVector all_stats_ops; llvm::DenseSet redundant_stats_ops; + // Step 0: remove the quant::StatisticsOp which are used by the tfl.quantize + // op in case it overrides the information from training FakeQuant ops. + func.walk([&](quant::QuantizeCastOp q) { + auto input_op = q.arg().getDefiningOp(); + if (auto stats = llvm::dyn_cast_or_null(input_op)) { + q.setOperand(stats.arg()); + if (stats.use_empty()) stats.erase(); + } + }); + // Step 1: forward pass: propagate any value scales which are not produces // by `SameOperandsAndResultsScale`. Additionally, remove the value scales // which are produced by the `restricted_output_params`. From cf84b28440fed40c6a522cbdc1c50c9fc449c4ba Mon Sep 17 00:00:00 2001 From: Robert David Date: Mon, 1 Jun 2020 11:43:13 -0700 Subject: [PATCH 1475/1533] Fix const-ness of the out parameters of RetrieveBuiltinData and RetrieveCustomInitialData. Use the returned status to determine if the data is available instead of checking for nullptr. PiperOrigin-RevId: 314171480 Change-Id: I0819531a8f6dfdceafbfa9fe63539514d8120e03 --- .../delegates/gpu/common/model_builder.cc | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 49678a1a947..29c819f7800 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -233,24 +233,21 @@ absl::Status GetFullyConnectedAttributes(int weights_tensor_id, template absl::Status RetrieveBuiltinData(const TfLiteNode* tflite_node, - ParamsT** tf_options) { - const auto* params = static_cast(tflite_node->builtin_data); - if (!params) { + const ParamsT** tf_options) { + *tf_options = static_cast(tflite_node->builtin_data); + if (!*tf_options) { return absl::InternalError("Unable to retrieve builtin_data."); } - *tf_options = const_cast(params); return absl::OkStatus(); } -template +template absl::Status RetrieveCustomInitialData(const TfLiteNode* tflite_node, - ParamsType** tf_options) { - const auto* params = - static_cast(tflite_node->custom_initial_data); - if (!params) { + const ParamsT** tf_options) { + *tf_options = static_cast(tflite_node->custom_initial_data); + if (!*tf_options) { return absl::InternalError("Unable to retrieve custom_initial_data."); } - *tf_options = const_cast(params); return absl::OkStatus(); } @@ -408,7 +405,7 @@ class AddOperationParser : public TFLiteOperationParser { } } - TfLiteAddParams* tf_options = nullptr; + const TfLiteAddParams* tf_options; return RetrieveBuiltinData(tflite_node, &tf_options); } @@ -447,7 +444,7 @@ class ConcatenationOperationParser : public TFLiteOperationParser { // RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, idx)); // } // TODO(eignasheva): add axis checking. 
- TfLiteConcatenationParams* tf_options = nullptr; + const TfLiteConcatenationParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); return absl::OkStatus(); } @@ -579,7 +576,7 @@ class Conv2DOperationParser : public TFLiteOperationParser { if (runtime_inputs == 1) { RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); } - TfLiteConvParams* tf_options = nullptr; + const TfLiteConvParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckStridesAndDilation( tf_options->stride_height, tf_options->stride_width, @@ -627,7 +624,7 @@ class Convolution2DTransposeBiasParser : public TFLiteOperationParser { const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - TfLiteTransposeConvParams* tf_options = nullptr; + const TfLiteTransposeConvParams* tf_options; RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); RETURN_IF_ERROR( CheckStrides(tf_options->stride_height, tf_options->stride_width)); @@ -668,7 +665,7 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, /*outputs=*/1)); RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - TfLiteDepthwiseConvParams* tf_options; + const TfLiteDepthwiseConvParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckStridesAndDilation( tf_options->stride_height, tf_options->stride_width, @@ -723,7 +720,7 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { DepthwiseConvolution2DAttributes attr; RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional - TfLiteDepthwiseConvParams* tf_options; + const TfLiteDepthwiseConvParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); attr.strides = ToHW(tf_options->stride_height, tf_options->stride_width); attr.dilations = HW(std::max(1, tf_options->dilation_height_factor), @@ -920,15 +917,15 @@ class ElementwiseOperationParser : public TFLiteOperationParser { absl::Status GetActivation(const TfLiteNode* tflite_node, TfLiteFusedActivation* activation) const { if (operation_type_ == OperationType::DIV) { - TfLiteDivParams* tf_options; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - *activation = tf_options ? tf_options->activation : kTfLiteActNone; + const TfLiteDivParams* tf_options; + auto status = RetrieveBuiltinData(tflite_node, &tf_options); + *activation = status.ok() ? tf_options->activation : kTfLiteActNone; return absl::OkStatus(); } if (operation_type_ == OperationType::SUB) { - TfLiteSubParams* tf_options; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - *activation = tf_options ? tf_options->activation : kTfLiteActNone; + const TfLiteSubParams* tf_options; + auto status = RetrieveBuiltinData(tflite_node, &tf_options); + *activation = status.ok() ? 
tf_options->activation : kTfLiteActNone; return absl::OkStatus(); } @@ -988,7 +985,7 @@ class FullyConnectedOperationParser : public TFLiteOperationParser { const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 4)); - TfLiteFullyConnectedParams* tf_options = nullptr; + const TfLiteFullyConnectedParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); if (tf_options->weights_format != kTfLiteFullyConnectedWeightsFormatDefault) { @@ -1099,7 +1096,7 @@ class LSTMOperationParser : public TFLiteOperationParser { // RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, // /*runtime_inputs=*/5, // /*outputs=*/4)); - TfLiteLSTMParams* tf_options = nullptr; + const TfLiteLSTMParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckParameters(tf_options)); return absl::OkStatus(); @@ -1193,7 +1190,7 @@ class MulOperationParser : public TFLiteOperationParser { if (tflite_node->inputs->size != 2) { return absl::UnimplementedError("MUL requires two input tensors."); } - TfLiteMulParams* tf_options; + const TfLiteMulParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); return IsActivationSupported(tf_options->activation); } @@ -1409,7 +1406,7 @@ class Pooling2DOperationParser : public TFLiteOperationParser { const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); - TfLitePoolParams* tf_options = nullptr; + const TfLitePoolParams* tf_options; auto status = RetrieveCustomInitialData(tflite_node, &tf_options); if (status.ok()) { // custom case with indices as a second output RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, @@ -1537,9 +1534,9 @@ class ReLUOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddInput(node, 0)); ReLUAttributes attr; - TfLiteLeakyReluParams* tf_options = nullptr; - RetrieveBuiltinData(tflite_node, &tf_options).IgnoreError(); - attr.alpha = tf_options ? tf_options->alpha : 0; + const TfLiteLeakyReluParams* tf_options; + auto status = RetrieveBuiltinData(tflite_node, &tf_options); + attr.alpha = status.ok() ? tf_options->alpha : 0; attr.clip = clip_; node->operation.attributes = attr; return reader->AddOutputs(node); @@ -1795,7 +1792,7 @@ class SoftmaxOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, /*outputs=*/1)); - TfLiteSoftmaxParams* tf_options = nullptr; + const TfLiteSoftmaxParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); if (tf_options->beta != 1) { // TODO(eignasheva): figure out, what's wrong with softmax. @@ -1840,7 +1837,7 @@ class SpaceToDepthOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/1, /*outputs=*/1)); // TODO(impjdi): Dims check. 
- TfLiteSpaceToDepthParams* s2d_params = nullptr; + const TfLiteSpaceToDepthParams* s2d_params; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &s2d_params)); if (s2d_params->block_size == 1) { return absl::InvalidArgumentError( @@ -1875,7 +1872,7 @@ class StridedSliceOperationParser : public TFLiteOperationParser { const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); - TfLiteStridedSliceParams* tf_options = nullptr; + const TfLiteStridedSliceParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckOptionsSupport(tf_options)); return absl::OkStatus(); @@ -2057,7 +2054,7 @@ class TransposeConvOperationParser : public TFLiteOperationParser { const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - TfLiteTransposeConvParams* tf_options = nullptr; + const TfLiteTransposeConvParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR( CheckStrides(tf_options->stride_height, tf_options->stride_width)); @@ -2140,9 +2137,9 @@ class Unpooling2DOperationParser : public TFLiteOperationParser { absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { - TfLitePoolParams* tf_options = nullptr; RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*runtime_inputs=*/2, /*outputs=*/1)); + const TfLitePoolParams* tf_options; RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckKernelsAndStrides( tf_options->filter_height, tf_options->filter_width, From 0d3ec4d99b43ba35e01bdd10b11e0d5b7a657721 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 1 Jun 2020 11:53:15 -0700 Subject: [PATCH 1476/1533] TensorDescriptor implements GPUObjectDescriptor interface. Tensor implements GPUObject interface. Added basic Read selectors. 
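In practice a kernel now declares the tensor once with args.AddObjectRef("src_tensor", ...) and reads it through generated selectors such as args.src_tensor.Read(s_x, s_y, Z), or args.src_tensor.SetBatchRef(...) followed by the same Read when a batch axis is present; TensorDescriptor::PerformSelector expands these into the storage-specific addressing (buffer index, image2d / image2d_array / image3d coordinates, or image buffer), as the updated transpose kernel below shows.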
PiperOrigin-RevId: 314173448 Change-Id: I3a9a594cfac343c231929ee6516fe2096c8e927f --- tensorflow/lite/delegates/gpu/cl/BUILD | 3 + .../delegates/gpu/cl/kernels/transpose.cc | 36 +- tensorflow/lite/delegates/gpu/cl/tensor.cc | 35 ++ tensorflow/lite/delegates/gpu/cl/tensor.h | 8 +- .../lite/delegates/gpu/cl/tensor_type.cc | 318 ++++++++++++++++++ .../lite/delegates/gpu/cl/tensor_type.h | 35 +- 6 files changed, 411 insertions(+), 24 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index d37f666f8a6..463fb0aecc5 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -457,6 +457,7 @@ cc_library( ":cl_device", ":cl_image_format", ":cl_memory", + ":gpu_object", ":tensor_type", ":util", "//tensorflow/lite/delegates/gpu/common:data_type", @@ -493,8 +494,10 @@ cc_library( srcs = ["tensor_type.cc"], hdrs = ["tensor_type.h"], deps = [ + ":gpu_object", "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:shape", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc index 4102e9d02a1..03c6e22924e 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc @@ -31,19 +31,13 @@ std::string GetTransposeCode( const OperationDef& op_def, const TransposeAttributes& attr, const std::vector& linked_operations, Arguments* args) { - TensorCodeGenerator src_tensor("src_data", - WHSBPoint{"args.src_width", "args.src_height", - "args.src_slices", "args.src_batch"}, - op_def.src_tensors[0]); TensorCodeGenerator dst_tensor("dst_data", WHSBPoint{"args.dst_width", "args.dst_height", "args.dst_slices", "args.dst_batch"}, op_def.dst_tensors[0]); - args->AddInt("src_width"); - args->AddInt("src_height"); - args->AddInt("src_slices"); - args->AddInt("src_batch"); + args->AddObjectRef( + "src_tensor", absl::make_unique(op_def.src_tensors[0])); args->AddInt("dst_width"); args->AddInt("dst_height"); args->AddInt("dst_slices"); @@ -53,9 +47,8 @@ std::string GetTransposeCode( const std::string batch_id = op_def.IsBatchSupported() ? "B" : ""; std::string c = GetCommonDefines(op_def.precision); c += "__kernel void main_function(\n"; - c += src_tensor.GetDeclaration(AccessType::READ); + c += dst_tensor.GetDeclaration(AccessType::WRITE); c += GetArgsDeclaration(linked_operations); - c += dst_tensor.GetDeclaration(AccessType::WRITE) + ",\n "; c += "$0) {\n"; if (op_def.IsBatchSupported()) { c += " int linear_id = get_global_id(0);\n"; @@ -82,10 +75,12 @@ std::string GetTransposeCode( remap[attr.perm.c] = 3; if (attr.perm.c == 3) { // optimized reading when no channels permutation const std::string bhw[] = {"B", "Y", "X"}; - std::string src_b = op_def.IsBatchSupported() ? 
bhw[remap[0]] : ""; + if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) { + c += " args.src_tensor.SetBatchRef(" + bhw[remap[0]] + ");\n"; + } c += " int s_y = " + bhw[remap[1]] + ";\n"; c += " int s_x = " + bhw[remap[2]] + ";\n"; - c += " FLT4 t =" + src_tensor.ReadWHSB("s_x", "s_y", "Z", src_b) + ";\n"; + c += " FLT4 t = args.src_tensor.Read(s_x, s_y, Z);\n"; c += " temps[0] = t.x;\n"; c += " temps[1] = t.y;\n"; c += " temps[2] = t.z;\n"; @@ -93,16 +88,17 @@ std::string GetTransposeCode( } else { c += " for (int i = 0; i < 4; ++i) {\n"; c += " int dst_channel = Z * 4 + i;\n"; - c += " if (dst_channel < args.dst_channels) {;\n"; + c += " if (dst_channel < args.dst_channels) {\n"; const std::string bhwc[] = {"B", "Y", "X", "dst_channel"}; - std::string src_b = op_def.IsBatchSupported() ? bhwc[remap[0]] : ""; + if (op_def.src_tensors[0].HasAxis(Axis::BATCH)) { + c += " args.src_tensor.SetBatchRef(" + bhwc[remap[0]] + ");\n"; + } c += " int s_y = " + bhwc[remap[1]] + ";\n"; c += " int s_x = " + bhwc[remap[2]] + ";\n"; c += " int s_c = " + bhwc[remap[3]] + ";\n"; c += " int s_z = s_c / 4;\n"; c += " int src_sub_ch = s_c % 4;\n"; - c += " FLT4 t =" + src_tensor.ReadWHSB("s_x", "s_y", "s_z", src_b) + - ";\n"; + c += " FLT4 t = args.src_tensor.Read(s_x, s_y, s_z);\n"; c += " FLT t_ar[4] = {t.x, t.y, t.z, t.w};\n"; c += " temps[i] = t_ar[src_sub_ch];\n"; c += " }\n"; @@ -148,19 +144,15 @@ absl::Status Transpose::Compile(const CreationContext& creation_context) { } absl::Status Transpose::BindArguments() { - RETURN_IF_ERROR(args_.SetInt("src_width", src_[0]->Width())); - RETURN_IF_ERROR(args_.SetInt("src_height", src_[0]->Height())); - RETURN_IF_ERROR(args_.SetInt("src_slices", src_[0]->Slices())); - RETURN_IF_ERROR(args_.SetInt("src_batch", src_[0]->Batch())); + RETURN_IF_ERROR(args_.SetObjectRef("src_tensor", src_[0])); RETURN_IF_ERROR(args_.SetInt("dst_width", dst_[0]->Width())); RETURN_IF_ERROR(args_.SetInt("dst_height", dst_[0]->Height())); RETURN_IF_ERROR(args_.SetInt("dst_slices", dst_[0]->Slices())); RETURN_IF_ERROR(args_.SetInt("dst_batch", dst_[0]->Batch())); RETURN_IF_ERROR(args_.SetInt("dst_channels", dst_[0]->Channels())); kernel_.ResetBindingCounter(); - RETURN_IF_ERROR(kernel_.SetMemoryAuto(src_[0]->GetMemoryPtr())); - RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); RETURN_IF_ERROR(kernel_.SetMemoryAuto(dst_[0]->GetMemoryPtrForWriting())); + RETURN_IF_ERROR(BindArgs(&kernel_, linked_operations_)); RETURN_IF_ERROR(args_.Bind(kernel_.kernel(), kernel_.GetBindingCounter())); return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.cc b/tensorflow/lite/delegates/gpu/cl/tensor.cc index 4a52508af0e..3381266bcba 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor.cc @@ -142,6 +142,41 @@ void Tensor::Release() { } } +GPUResourcesWithValue Tensor::GetGPUResources() const { + GPUResourcesWithValue resources; + if (descriptor_.HasAxis(Axis::WIDTH)) { + resources.ints.push_back({"width", Width()}); + } + if (descriptor_.HasAxis(Axis::HEIGHT)) { + resources.ints.push_back({"height", Height()}); + } + if (descriptor_.HasAxis(Axis::CHANNELS)) { + resources.ints.push_back({"slices", Slices()}); + resources.ints.push_back({"channels", Channels()}); + } + if (descriptor_.HasAxis(Axis::BATCH)) { + resources.ints.push_back({"batch", Batch()}); + } + if (descriptor_.HasAxis(Axis::DEPTH)) { + resources.ints.push_back({"depth", Depth()}); + } + + if (descriptor_.storage_type == TensorStorageType::BUFFER) { + 
resources.buffers.push_back({"buffer", memory_}); + } else if (descriptor_.storage_type == TensorStorageType::TEXTURE_2D || + descriptor_.storage_type == TensorStorageType::SINGLE_TEXTURE_2D) { + resources.images2d.push_back({"image2d", memory_}); + } else if (descriptor_.storage_type == TensorStorageType::TEXTURE_ARRAY) { + resources.image2d_arrays.push_back({"image2d_array", memory_}); + } else if (descriptor_.storage_type == TensorStorageType::TEXTURE_3D) { + resources.images3d.push_back({"image3d", memory_}); + } else if (descriptor_.storage_type == TensorStorageType::IMAGE_BUFFER) { + resources.image_buffers.push_back({"image_buffer", image_buffer_memory_}); + } + + return resources; +} + int3 Tensor::GetFullTensorRegion() const { switch (descriptor_.storage_type) { case TensorStorageType::BUFFER: diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.h b/tensorflow/lite/delegates/gpu/cl/tensor.h index cb7d4263a5c..8ded8f2f041 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.h +++ b/tensorflow/lite/delegates/gpu/cl/tensor.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/cl_context.h" #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/cl_memory.h" +#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/cl/util.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" @@ -36,7 +37,7 @@ namespace tflite { namespace gpu { namespace cl { -class Tensor { +class Tensor : public GPUObject { public: Tensor() : memory_(nullptr), image_buffer_memory_(nullptr), memory_owner_(true) {} @@ -57,6 +58,11 @@ class Tensor { virtual ~Tensor() { Release(); } + const GPUObjectDescriptor* GetGPUDescriptor() const override { + return &descriptor_; + } + GPUResourcesWithValue GetGPUResources() const override; + int Width() const { return shape_.w; } int Height() const { return shape_.h; } int Depth() const { return shape_.d; } diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc index 900bf6e620d..d1841b14350 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc @@ -15,9 +15,128 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/substitute.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" + namespace tflite { namespace gpu { namespace cl { +namespace { +std::string GetGlobalAddressNoDeclarationWHS(const std::string& x, + const std::string& y, + const std::string& s, + TensorStorageType storage_type) { + switch (storage_type) { + case TensorStorageType::BUFFER: + case TensorStorageType::IMAGE_BUFFER: + return absl::Substitute("((($2) * height + ($1)) * width + ($0))", x, y, + s); + case TensorStorageType::TEXTURE_2D: + return absl::Substitute("(int2)(($0), ($1) * slices + ($2))", x, y, s); + case TensorStorageType::SINGLE_TEXTURE_2D: + return absl::StrCat("(int2)(", x, ", ", y, ")"); + case TensorStorageType::TEXTURE_ARRAY: + case TensorStorageType::TEXTURE_3D: + return absl::StrCat("(int4)(", x, ", ", y, ", ", s, ", 0)"); + case TensorStorageType::UNKNOWN: + return "error"; + } +} + +std::string GetGlobalAddressNoDeclarationWHSB(const std::string& x, + const std::string& y, + const std::string& s, + const std::string& b, + TensorStorageType storage_type) { + switch (storage_type) { + case TensorStorageType::BUFFER: + case TensorStorageType::IMAGE_BUFFER: + return absl::Substitute( + "(((($3) * height + $2) * width + ($1)) * batch + ($0))", b, x, y, s); + case TensorStorageType::TEXTURE_2D: + return absl::Substitute( + "(int2)(($0) * batch + ($1), ($2) * slices + ($3))", x, b, y, s); + case TensorStorageType::SINGLE_TEXTURE_2D: + return absl::Substitute("(int2)(($0) * batch + ($1), ($2))", x, b, y); + case TensorStorageType::TEXTURE_ARRAY: + case TensorStorageType::TEXTURE_3D: + return absl::Substitute("(int4)(($0) * batch + ($1), ($2), ($3), 0)", x, + b, y, s); + case TensorStorageType::UNKNOWN: + return "error"; + default: + return "error"; + } +} + +std::string GetGlobalAddressNoDeclarationWHDS(const std::string& x, + const std::string& y, + const std::string& z, + const std::string& s, + TensorStorageType storage_type) { + switch (storage_type) { + case TensorStorageType::BUFFER: + case TensorStorageType::IMAGE_BUFFER: + return absl::Substitute( + "(((($3) * slices + ($2)) * height + ($1)) * width + ($0))", x, y, s, + z); + case TensorStorageType::TEXTURE_2D: + return absl::Substitute( + "(int2)(($0) * depth + ($1), ($2) * slices + ($3))", x, z, y, s); + case TensorStorageType::SINGLE_TEXTURE_2D: + return absl::Substitute("(int2)(($0) * depth + ($1), ($2))", x, z, y); + case TensorStorageType::TEXTURE_ARRAY: + case TensorStorageType::TEXTURE_3D: + return absl::Substitute("(int4)(($0), ($1), ($2) * slices + ($3), 0)", x, + y, z, s); + case TensorStorageType::UNKNOWN: + return "error"; + } +} + +std::string GetGlobalAddressNoDeclarationWHDSB(const std::string& x, + const std::string& y, + const std::string& z, + const std::string& s, + const std::string& b, + TensorStorageType storage_type) { + switch (storage_type) { + case TensorStorageType::BUFFER: + case TensorStorageType::IMAGE_BUFFER: + return absl::Substitute( + "((((($4) * slices + ($3)) * height + $2) * width + ($1)) * batch + " + "($0))", + b, x, y, s, z); + case TensorStorageType::TEXTURE_2D: + return absl::Substitute( + "(int2)((($0) * batch + ($1)) * depth + ($2), ($3) * slices + ($4))", + x, b, z, y, s); + case TensorStorageType::SINGLE_TEXTURE_2D: + return absl::Substitute( + "(int2)((($0) * batch + ($1)) * depth + ($2), ($3))", x, b, z, y); + case TensorStorageType::TEXTURE_ARRAY: + case TensorStorageType::TEXTURE_3D: + 
return absl::Substitute( + "(int4)(($0) * batch + ($1), ($2), ($3) * slices + ($4), 0)", x, b, y, + z, s); + case TensorStorageType::UNKNOWN: + return "error"; + default: + return "error"; + } +} + +std::string GetReadImageFromDataType(DataType data_type) { + if (data_type == DataType::FLOAT32) { + return "read_imagef"; + } else if (data_type == DataType::FLOAT16) { + return "read_imageh"; + } else { + return "error"; + } +} +} // namespace std::string ToString(TensorStorageType type) { switch (type) { @@ -38,6 +157,205 @@ std::string ToString(TensorStorageType type) { } } +GPUResources TensorDescriptor::GetGPUResources() const { + GPUResources resources; + if (HasAxis(Axis::WIDTH)) { + resources.ints.push_back("width"); + } + if (HasAxis(Axis::HEIGHT)) { + resources.ints.push_back("height"); + } + if (HasAxis(Axis::CHANNELS)) { + resources.ints.push_back("slices"); + resources.ints.push_back("channels"); + } + if (HasAxis(Axis::BATCH)) { + resources.ints.push_back("batch"); + } + if (HasAxis(Axis::DEPTH)) { + resources.ints.push_back("depth"); + } + if (storage_type == TensorStorageType::BUFFER) { + GPUBufferDescriptor desc; + desc.data_type = data_type; + desc.element_size = 4; + resources.buffers.push_back({"buffer", desc}); + } else if (storage_type == TensorStorageType::SINGLE_TEXTURE_2D || + storage_type == TensorStorageType::TEXTURE_2D) { + GPUImage2DDescriptor desc; + desc.data_type = data_type; + resources.images2d.push_back({"image2d", desc}); + } else if (storage_type == TensorStorageType::TEXTURE_ARRAY) { + GPUImage2DArrayDescriptor desc; + desc.data_type = data_type; + resources.image2d_arrays.push_back({"image2d_array", desc}); + } else if (storage_type == TensorStorageType::TEXTURE_3D) { + GPUImage3DDescriptor desc; + desc.data_type = data_type; + resources.images3d.push_back({"image3d", desc}); + } else if (storage_type == TensorStorageType::IMAGE_BUFFER) { + GPUImageBufferDescriptor desc; + desc.data_type = data_type; + resources.image_buffers.push_back({"image_buffer", desc}); + } + return resources; +} + +absl::Status TensorDescriptor::PerformSelector( + const std::string& selector, const std::vector& args, + std::string* result) const { + if (selector == "Width") { + *result = "width"; + return absl::OkStatus(); + } else if (selector == "Height") { + *result = "height"; + return absl::OkStatus(); + } else if (selector == "Slices") { + *result = "slices"; + return absl::OkStatus(); + } else if (selector == "Channels") { + *result = "channels"; + return absl::OkStatus(); + } else if (selector == "Batch") { + *result = "batch"; + return absl::OkStatus(); + } else if (selector == "Depth") { + *result = "depth"; + return absl::OkStatus(); + } else if (selector == "SetBatchRef") { + if (args.size() != 1) { + return absl::InvalidArgumentError( + "Unsupported arguments in SetBatchRef selector"); + } + state_vars_["batch_id"] = args[0]; + *result = ""; + return absl::OkStatus(); + } else if (selector == "Read") { + return PerformReadSelector(args, result); + } else { + return absl::NotFoundError(absl::StrCat( + "TensorLinearDescriptor don't have selector with name - ", selector)); + } +} + +absl::Status TensorDescriptor::PerformReadSelector( + const std::vector& args, std::string* result) const { + std::string xc; + std::string yc; + std::string zc; + std::string sc; + std::string bc; + bool parsed = ParseCoordsFromArgs(args, 0, &xc, &yc, &zc, &sc, &bc); + if (args.size() < 2 || !parsed) { + return absl::NotFoundError("Unrecognized Read selector"); + } + + if (layout == 
Layout::HWC) { + *result = Read(GetGlobalAddressNoDeclarationWHS(xc, yc, sc, storage_type)); + return absl::OkStatus(); + } else if (layout == Layout::BHWC) { + *result = + Read(GetGlobalAddressNoDeclarationWHSB(xc, yc, sc, bc, storage_type)); + return absl::OkStatus(); + } else if (layout == Layout::HWDC) { + *result = + Read(GetGlobalAddressNoDeclarationWHDS(xc, yc, zc, sc, storage_type)); + return absl::OkStatus(); + } else if (layout == Layout::BHWDC) { + *result = Read( + GetGlobalAddressNoDeclarationWHDSB(xc, yc, zc, sc, bc, storage_type)); + return absl::OkStatus(); + } else { + return absl::NotFoundError("Unsupported layout"); + } +} + +std::string TensorDescriptor::Read(const std::string& global_address) const { + std::string image_type; + if (storage_type == TensorStorageType::TEXTURE_2D || + storage_type == TensorStorageType::SINGLE_TEXTURE_2D) { + image_type = "image2d"; + } else if (storage_type == TensorStorageType::TEXTURE_3D) { + image_type = "image3d"; + } else if (storage_type == TensorStorageType::TEXTURE_ARRAY) { + image_type = "image2d_array"; + } + switch (storage_type) { + case TensorStorageType::BUFFER: + return absl::StrCat("buffer[", global_address, "]"); + case TensorStorageType::TEXTURE_2D: + case TensorStorageType::TEXTURE_3D: + case TensorStorageType::SINGLE_TEXTURE_2D: + case TensorStorageType::TEXTURE_ARRAY: + return absl::StrCat(GetReadImageFromDataType(data_type), "(", image_type, + ", smp_none, ", global_address, ")"); + case TensorStorageType::IMAGE_BUFFER: + return absl::StrCat(GetReadImageFromDataType(data_type), + "(image_buffer, ", global_address, ")"); + case TensorStorageType::UNKNOWN: + return ""; + } +} + +bool TensorDescriptor::HasAxis(Axis axis) const { + if (axis == Axis::WIDTH || axis == Axis::HEIGHT || axis == Axis::CHANNELS) { + return true; + } + if (axis == Axis::BATCH && + (layout == Layout::BHWC || layout == Layout::BHWDC)) { + return true; + } + if (axis == Axis::DEPTH && + (layout == Layout::HWDC || layout == Layout::BHWDC)) { + return true; + } + return false; +} + +bool TensorDescriptor::ParseCoordsFromArgs(const std::vector& args, + int offset, std::string* xc, + std::string* yc, std::string* zc, + std::string* sc, + std::string* bc) const { + if (HasAxis(Axis::WIDTH)) { + if (offset >= args.size()) return false; + *xc = args[offset++]; + } + if (HasAxis(Axis::HEIGHT)) { + if (offset >= args.size()) return false; + *yc = args[offset++]; + } + if (HasAxis(Axis::DEPTH)) { + if (offset >= args.size()) return false; + *zc = args[offset++]; + } + if (HasAxis(Axis::CHANNELS)) { + if (offset >= args.size()) { + auto it = state_vars_.find("slice_id"); + if (it == state_vars_.end()) { + return false; + } else { + *sc = it->second; + } + } else { + *sc = args[offset++]; + } + } + if (HasAxis(Axis::BATCH)) { + if (offset >= args.size()) { + auto it = state_vars_.find("batch_id"); + if (it == state_vars_.end()) { + return false; + } else { + *bc = it->second; + } + } else { + *bc = args[offset++]; + } + } + return true; +} + } // namespace cl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.h b/tensorflow/lite/delegates/gpu/cl/tensor_type.h index 9d98d38900f..c00f8d191a8 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.h +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.h @@ -19,6 +19,7 @@ limitations under the License. 
#include #include +#include "tensorflow/lite/delegates/gpu/cl/gpu_object.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" @@ -36,10 +37,24 @@ enum class TensorStorageType { SINGLE_TEXTURE_2D }; -struct TensorDescriptor { +struct TensorDescriptor : public GPUObjectDescriptor { TensorDescriptor() = default; TensorDescriptor(DataType dt, TensorStorageType st, Layout l) : data_type(dt), storage_type(st), layout(l) {} + TensorDescriptor(const TensorDescriptor& desc) + : GPUObjectDescriptor(desc), + data_type(desc.data_type), + storage_type(desc.storage_type), + layout(desc.layout) {} + TensorDescriptor& operator=(const TensorDescriptor& desc) { + if (this != &desc) { + data_type = desc.data_type; + storage_type = desc.storage_type; + layout = desc.layout; + GPUObjectDescriptor::operator=(desc); + } + return *this; + } bool operator==(const TensorDescriptor& d) const { return data_type == d.data_type && storage_type == d.storage_type && @@ -48,12 +63,30 @@ struct TensorDescriptor { bool operator!=(const TensorDescriptor& d) const { return !(*this == d); } + absl::Status PerformSelector(const std::string& selector, + const std::vector& args, + std::string* result) const override; + + GPUResources GetGPUResources() const override; + + bool HasAxis(Axis axis) const; + DataType data_type = DataType::UNKNOWN; TensorStorageType storage_type = TensorStorageType::UNKNOWN; // This field describes logical layout, actual(physical) GPU layout can be // totally different. Layout layout = Layout::UNKNOWN; // Supported layouts is HWC, BHWC, HWDC, BHWDC + + private: + absl::Status PerformReadSelector(const std::vector& args, + std::string* result) const; + + std::string Read(const std::string& global_address) const; + + bool ParseCoordsFromArgs(const std::vector& args, int offset, + std::string* xc, std::string* yc, std::string* zc, + std::string* sc, std::string* bc) const; }; std::string ToString(TensorStorageType type); From f3930469e4a9806ede6909c9e16f6e0acdbe66c8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 11:56:13 -0700 Subject: [PATCH 1477/1533] Pass the function definitions while constructing the graph. The graph constructor looks for ops with the function names, if they are not passed to ConvertGraphDefToGraph. PiperOrigin-RevId: 314174002 Change-Id: Ia764287b383d648ac72c00ab3806aa5756b485e1 --- .../tests/tf_saved_model/defun_export.py | 63 +++++++++++++++++++ .../mlir/tensorflow/translate/import_model.cc | 3 + 2 files changed, 66 insertions(+) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/defun_export.py diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/defun_export.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/defun_export.py new file mode 100644 index 00000000000..8bd128898a0 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/defun_export.py @@ -0,0 +1,63 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# RUN: %p/defun_export | FileCheck %s + +# pylint: disable=missing-docstring,line-too-long +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v1 as tf +from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common_v1 +from tensorflow.python.framework import function + + +@function.Defun(tf.float32, tf.float32) +def plus(a, b): + return a + b + + +def test_defun(): + x = tf.constant([[1.0], [1.0], [1.0]]) + y = tf.constant([[2.0], [2.0], [2.0]]) + + # Verify that the function defined using function.Defun + # has a corresponding tf.LegacyCall op. + # CHECK: func {{@[a-zA-Z_0-9]+}}( + # CHECK-SAME: [[ARG0:%.*]]: tensor<3x1xf32> {tf_saved_model.index_path = ["y"]}, + # CHECK-SAME: [[ARG1:%.*]]: tensor<3x1xf32> {tf_saved_model.index_path = ["x"]} + # + # CHECK-NEXT: [[R0:%.*]] = "tf.LegacyCall"([[ARG1]], [[ARG0]]) + z = plus(x, y) + + tensor_info_x = tf.compat.v1.saved_model.utils.build_tensor_info(x) + tensor_info_y = tf.compat.v1.saved_model.utils.build_tensor_info(y) + tensor_info_z = tf.compat.v1.saved_model.utils.build_tensor_info(z) + + return { + 'key': (tf.compat.v1.saved_model.signature_def_utils.build_signature_def( + inputs={ + 'x': tensor_info_x, + 'y': tensor_info_y + }, + outputs={'z': tensor_info_z}, + method_name='test_function')) + } + + +if __name__ == '__main__': + common_v1.set_tf_options() + common_v1.do_test(test_defun()) diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 24a1d40a8bb..3aa700d3718 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -3334,6 +3334,9 @@ Status SavedModelSignatureDefImporter::ConvertSignature( graphdef, &sub_graph_def, /*terminal_nodes=*/{specs.outputs.begin(), specs.outputs.end()})); + // Set the function library definitions in the pruned graphdef. + *sub_graph_def.mutable_library() = flib_def.ToProto(); + // Convert sub-graphdef to sub-graph. GraphConstructorOptions options; options.allow_internal_ops = true; From 1a430ba06be31ee01d7f0b6b17cfde982f1f38ef Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 11:58:35 -0700 Subject: [PATCH 1478/1533] Support partition Sort in TopK when input partitioned at sort dimension. 
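As an illustrative (made-up) sizing: with the sort dimension holding 16000 elements sharded over 4 partitions and the surrounding slice keeping k = 40, each partition sorts its 4000-element shard (values padded with the type's minimal value, indices with the maximal value), slices its local top 40, and the 4 * 40 = 160 candidates are resharded to replicated and sorted once more, so the original GetTupleElement/Slice users still select the global top 40 unchanged.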
PiperOrigin-RevId: 314174499 Change-Id: I8fbac47edf5a2691c5a51aacda885b0300b53247 --- tensorflow/compiler/xla/service/spmd/BUILD | 1 + .../xla/service/spmd/spmd_partitioner.cc | 130 ++++-- .../xla/service/spmd/spmd_partitioner_test.cc | 379 ++++++++++++++++++ .../xla/service/spmd/spmd_partitioner_util.cc | 168 ++++++++ .../xla/service/spmd/spmd_partitioner_util.h | 28 ++ 5 files changed, 681 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD index 280af2246bb..4433078472d 100644 --- a/tensorflow/compiler/xla/service/spmd/BUILD +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -43,6 +43,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_pass_pipeline", "//tensorflow/compiler/xla/service:hlo_query", "//tensorflow/compiler/xla/service:hlo_sharding_util", + "//tensorflow/compiler/xla/service:pattern_matcher", "//tensorflow/compiler/xla/service:shape_inference", "//tensorflow/compiler/xla/service:tuple_simplifier", "//tensorflow/core/platform:numbers", diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index daa3d157bdc..122812b78e3 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -1282,6 +1282,106 @@ Status SpmdPartitioningVisitor::HandleSlice(HloInstruction* hlo) { Status SpmdPartitioningVisitor::HandleSort(HloInstruction* hlo) { HloSharding sharding = hlo->sharding(); + // Special handling for sort in TopK when first operand partitioined at + // sort dimension. + auto k = GetKValueInTopKWhenPartitionSortDim(hlo); + if (k.has_value()) { + // When the first operand partitioned at sort dimension: + // 1. Partition sort computation to different partitions; + // 2. Slice TopK value and index from different partitions; + // 3. Gather and replicate value and index from different partitions, + // the shape of replicated value and index will be + // [batch_size, ..., partition_count * k, ...]; + // 4. Final sort uses replicated value and index from different partitions + // as input. + // GetTupleElement and Slice after the non-partitoned sort won't change + // at this point, as HandleGetTupleElement and HandleSlice will update them. + HloSortInstruction* sort = DynCast(hlo); + const int64 sort_dim = sort->sort_dimension(); + auto input = hlo->operand(0); + auto index = hlo->operand(1); + const HloSharding& input_sharding = input->sharding(); + const int64 partition_count = + input_sharding.tile_assignment().dim(sort_dim); + const int64 input_size = input->shape().dimensions(sort_dim); + const int64 per_partition_size = CeilOfRatio(input_size, partition_count); + const auto element_type = input->shape().element_type(); + const auto index_type = index->shape().element_type(); + + // Partition and pad input and index. + // Pad input with minimal value. + auto partitioned_input = GetPartitionedHlo(input).PadWithValue( + CreateFirstWithType(element_type, &b_)); + // Pad index with max value. + auto partitioned_index = + GetPartitionedHlo(index) + .Reshard(input_sharding) + .PadWithValue(CreateLastWithType(index_type, &b_)); + + // Each partition needs to do TopK separately, thus the base shape + // becomes the padded shape. 
+    std::vector<int64> replicated_dimensions(
+        input->shape().dimensions().begin(), input->shape().dimensions().end());
+    replicated_dimensions[sort_dim] = per_partition_size * partition_count;
+    const Shape replicated_shape = ShapeUtil::MakeTupleShape(
+        {ShapeUtil::MakeShape(element_type, replicated_dimensions),
+         ShapeUtil::MakeShape(index_type, replicated_dimensions)});
+
+    // Partition original topk to different shards.
+    auto topk_sharding =
+        input_sharding.GetTupleSharding(replicated_shape).ValueOrDie();
+    auto shard_shape = MakePartitionedShape(replicated_shape, topk_sharding);
+    auto topk = b_.AddInstruction(hlo->CloneWithNewOperands(
+        shard_shape, {partitioned_input.hlo(), partitioned_index.hlo()}));
+
+    // Get value from first sort.
+    HloInstruction* value_gte =
+        b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+            topk->shape().tuple_shapes(0), topk, 0));
+    HloInstruction* index_gte =
+        b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+            topk->shape().tuple_shapes(1), topk, 1));
+
+    // Slice top K value from the first partitioned sort.
+    replicated_dimensions[sort_dim] = k.value() * partition_count;
+    auto slice_input = SliceFirstK(value_gte, &b_, sort_dim, k.value());
+    slice_input->set_sharding(input_sharding);
+    PartitionedHlo partitioned_slice_input(
+        slice_input, ShapeUtil::MakeShape(element_type, replicated_dimensions),
+        MakePartitioningState());
+    // Reshard value to be replicated.
+    auto replicated_slice_input =
+        partitioned_slice_input.Reshard(HloSharding::Replicate()).hlo();
+
+    // Slice top K index from the first partitioned sort.
+    auto slice_index = SliceFirstK(index_gte, &b_, sort_dim, k.value());
+    slice_index->set_sharding(input_sharding);
+    PartitionedHlo partitioned_slice_index(
+        slice_index, ShapeUtil::MakeShape(index_type, replicated_dimensions),
+        MakePartitioningState());
+    // Reshard index to be replicated.
+    auto replicated_slice_index =
+        partitioned_slice_index.Reshard(HloSharding::Replicate()).hlo();
+
+    // Creates replicated sort to do TopK, the input is value and index pairs
+    // from all the partitions.
+    const Shape final_topk_shape = ShapeUtil::MakeTupleShape(
+        {ShapeUtil::MakeShape(element_type, replicated_dimensions),
+         ShapeUtil::MakeShape(index_type, replicated_dimensions)});
+    auto final_sort = b_.AddInstruction(HloInstruction::CreateSort(
+        final_topk_shape, sort_dim,
+        {replicated_slice_input, replicated_slice_index}, sort->to_apply(),
+        sort->is_stable()));
+    final_sort->set_sharding(HloSharding::Replicate()
+                                 .GetTupleSharding(final_sort->shape())
+                                 .ValueOrDie());
+    PartitionedHlo replicated_sort(final_sort, final_topk_shape,
+                                   MakePartitioningState());
+    SetPartitionedHlo(hlo, replicated_sort.Reshard(hlo->sharding()));
+
+    return Status::OK();
+  }
+
   if (hlo->shape().IsTuple()) {
     // Check that all elements are sharded in the same way.
     if (hlo->shape().tuple_shapes_size() == 0) {
@@ -1373,16 +1473,8 @@ Status SpmdPartitioningVisitor::HandleCustomCall(HloInstruction* hlo) {
   auto input = hlo->operand(0);
   const auto element_type = input->shape().element_type();
 
-  // Pad input with minimal value.
-  auto min_value = b_.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::MinValue(element_type)));
-  // TODO(wangtao): add test to see if -NaN < -Inf in BF16.
- if (element_type == F32) { - auto float_pad_value = std::numeric_limits::quiet_NaN(); - min_value = b_.AddInstruction(HloInstruction::CreateConstant( - LiteralUtil::CreateR0(-float_pad_value))); - } - auto partitioned_input = GetPartitionedHlo(input).PadWithValue(min_value); + auto partitioned_input = GetPartitionedHlo(input).PadWithValue( + CreateFirstWithType(element_type, &b_)); // Each partition needs to do TopK separately, thus the base shape // becomes [batch_size, k * shard_count]. @@ -1476,24 +1568,12 @@ Status SpmdPartitioningVisitor::HandleCustomCall(HloInstruction* hlo) { b_.AddInstruction(HloInstruction::CreateGetTupleElement( replicated_sort.hlo()->shape().tuple_shapes(1), replicated_sort.hlo(), 1)); - const Shape& hlo_shape = sort_value_gte->shape(); - auto hlo_dims = hlo_shape.dimensions(); - std::vector start_indices(hlo_shape.dimensions_size(), 0); - std::vector limit_indices(hlo_dims.begin(), hlo_dims.end()); - std::vector strides(hlo_shape.dimensions_size(), sort_dim); - limit_indices[sort_dim] = k; - auto output_shape = hlo_shape; - output_shape.set_dimensions(sort_dim, k); // Slice value from final sort. HloInstruction* slice_sort_value = - b_.AddInstruction(HloInstruction::CreateSlice( - output_shape, sort_value_gte, start_indices, limit_indices, strides)); + SliceFirstK(sort_value_gte, &b_, sort_dim, k); // Slice index from final sort. - auto index_output_shape = sort_index_gte->shape(); - index_output_shape.set_dimensions(sort_dim, k); - HloInstruction* slice_index_value = b_.AddInstruction( - HloInstruction::CreateSlice(index_output_shape, sort_index_gte, - start_indices, limit_indices, strides)); + HloInstruction* slice_index_value = + SliceFirstK(sort_index_gte, &b_, sort_dim, k); auto create_tuple = b_.AddInstruction( HloInstruction::CreateTuple({slice_sort_value, slice_index_value})); create_tuple->set_sharding(HloSharding::Replicate()); diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 60ad0191b89..26450d9968d 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -1947,6 +1947,385 @@ ENTRY %cluster_2013453984438090939__.47 EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 4000); } +TEST_F(SpmdPartitioningTest, PartitionSortInTopK) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.9: bf16[], p.0.rhs.10: bf16[], p.1.lhs.11: + s32[], p.1.rhs.12: s32[]) -> pred[] { + %p.1.lhs.11 = s32[] parameter(2) + %p.1.rhs.12 = s32[] parameter(3) + %p.0.lhs.9 = bf16[] parameter(0) + %convert.13 = f32[] convert(bf16[] %p.0.lhs.9) + %bitcast-convert.16 = s32[] bitcast-convert(f32[] %convert.13) + %constant.20 = s32[] constant(0) + %compare.21 = pred[] compare(s32[] %bitcast-convert.16, s32[] %constant.20), + direction=LT + %constant.15 = u32[] constant(2147483647) + %bitcast-convert.17 = u32[] bitcast-convert(f32[] %convert.13) + %subtract.18 = u32[] subtract(u32[] %constant.15, u32[] %bitcast-convert.17) + %bitcast-convert.19 = s32[] bitcast-convert(u32[] %subtract.18) + %select.22 = s32[] select(pred[] %compare.21, s32[] %bitcast-convert.19, s32[] + %bitcast-convert.16) + %p.0.rhs.10 = bf16[] parameter(1) + %convert.14 = f32[] convert(bf16[] %p.0.rhs.10) + %bitcast-convert.24 = s32[] bitcast-convert(f32[] %convert.14) + %constant.28 = s32[] constant(0) + %compare.29 = pred[] compare(s32[] %bitcast-convert.24, s32[] %constant.28), + direction=LT + 
%constant.23 = u32[] constant(2147483647) + %bitcast-convert.25 = u32[] bitcast-convert(f32[] %convert.14) + %subtract.26 = u32[] subtract(u32[] %constant.23, u32[] %bitcast-convert.25) + %bitcast-convert.27 = s32[] bitcast-convert(u32[] %subtract.26) + %select.30 = s32[] select(pred[] %compare.29, s32[] %bitcast-convert.27, s32[] + %bitcast-convert.24) + ROOT %compare.31 = pred[] compare(s32[] %select.22, s32[] %select.30), + direction=GT +} + +ENTRY entry + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %iota.7 = s32[2,209664] iota(), iota_dimension=1, + metadata={op_type="TopKV2" op_name="TopKV2"} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %iota.7), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[2,2000] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[2,2000] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[2,2000], s32[2,2000]) + tuple(bf16[2,2000] %slice.34, s32[2,2000] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 104832); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 104832); + auto final_sort = FindInstruction(module.get(), "sort.1"); + EXPECT_EQ(final_sort->operand(0)->shape().dimensions(1), 4000); + EXPECT_EQ(final_sort->operand(1)->shape().dimensions(1), 4000); +} + +TEST_F(SpmdPartitioningTest, PartitionSortInTopKWhenComparisonWithSelect) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.2566: bf16[], + p.0.rhs.2567: bf16[], p.1.lhs.2586: s32[], p.1.rhs.2587: s32[]) -> pred[] { + %p.0.lhs.2566 = bf16[] parameter(0) + %convert.164 = f32[] convert(bf16[] %p.0.lhs.2566) + %bitcast-convert.48 = s32[] bitcast-convert(f32[] %convert.164) + %constant.285 = s32[] constant(0) + %compare.84 = pred[] compare(s32[] %bitcast-convert.48, s32[] %constant.285), + direction=LT + %constant.286 = u32[] constant(2147483647) + %bitcast-convert.49 = u32[] bitcast-convert(f32[] %convert.164) + %subtract.84 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.49) + %bitcast-convert.50 = s32[] bitcast-convert(u32[] %subtract.84) + %select.40 = s32[] select(pred[] %compare.84, s32[] %bitcast-convert.50, + s32[] %bitcast-convert.48) + %p.0.rhs.2567 = bf16[] parameter(1) + %convert.165 = f32[] convert(bf16[] %p.0.rhs.2567) + %bitcast-convert.51 = s32[] bitcast-convert(f32[] %convert.165) + %compare.85 = pred[] compare(s32[] %bitcast-convert.51, s32[] %constant.285), + direction=LT + %bitcast-convert.52 = u32[] bitcast-convert(f32[] %convert.165) + %subtract.85 = u32[] subtract(u32[] %constant.286, u32[] 
%bitcast-convert.52) + %bitcast-convert.53 = s32[] bitcast-convert(u32[] %subtract.85) + %select.41 = s32[] select(pred[] %compare.85, s32[] %bitcast-convert.53, + s32[] %bitcast-convert.51) + %compare.86 = pred[] compare(s32[] %select.40, s32[] %select.41), direction=GT + %compare.1645 = pred[] compare(s32[] %select.41, s32[] %select.40), direction=GT + %compare.1646 = pred[] compare(pred[] %compare.86, pred[] %compare.1645), + direction=EQ + %p.1.lhs.2586 = s32[] parameter(2) + %p.1.rhs.2587 = s32[] parameter(3) + %compare.1647 = pred[] compare(s32[] %p.1.lhs.2586, s32[] %p.1.rhs.2587), + direction=LT + ROOT %select.1054 = pred[] select(pred[] %compare.1646, pred[] %compare.1647, + pred[] %compare.86) +} + +ENTRY entry + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %iota.7 = s32[2,209664] iota(), iota_dimension=1, + metadata={op_type="TopKV2" op_name="TopKV2"} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %iota.7), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[2,2000] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[2,2000] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[2,2000], s32[2,2000]) + tuple(bf16[2,2000] %slice.34, s32[2,2000] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 104832); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 104832); + auto final_sort = FindInstruction(module.get(), "sort.1"); + EXPECT_EQ(final_sort->operand(0)->shape().dimensions(1), 4000); + EXPECT_EQ(final_sort->operand(1)->shape().dimensions(1), 4000); +} + +TEST_F(SpmdPartitioningTest, NoPartitionSortInTopKWhenSecondOperandIsNotIota) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.2566: bf16[], + p.0.rhs.2567: bf16[], p.1.lhs.2586: s32[], p.1.rhs.2587: s32[]) -> pred[] { + %p.0.lhs.2566 = bf16[] parameter(0) + %convert.164 = f32[] convert(bf16[] %p.0.lhs.2566) + %bitcast-convert.48 = s32[] bitcast-convert(f32[] %convert.164) + %constant.285 = s32[] constant(0) + %compare.84 = pred[] compare(s32[] %bitcast-convert.48, s32[] %constant.285), + direction=LT + %constant.286 = u32[] constant(2147483647) + %bitcast-convert.49 = u32[] bitcast-convert(f32[] %convert.164) + %subtract.84 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.49) + %bitcast-convert.50 = s32[] bitcast-convert(u32[] %subtract.84) + %select.40 = s32[] select(pred[] %compare.84, s32[] %bitcast-convert.50, + s32[] %bitcast-convert.48) + %p.0.rhs.2567 = bf16[] parameter(1) + %convert.165 = f32[] convert(bf16[] %p.0.rhs.2567) + 
%bitcast-convert.51 = s32[] bitcast-convert(f32[] %convert.165) + %compare.85 = pred[] compare(s32[] %bitcast-convert.51, s32[] %constant.285), + direction=LT + %bitcast-convert.52 = u32[] bitcast-convert(f32[] %convert.165) + %subtract.85 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.52) + %bitcast-convert.53 = s32[] bitcast-convert(u32[] %subtract.85) + %select.41 = s32[] select(pred[] %compare.85, s32[] %bitcast-convert.53, + s32[] %bitcast-convert.51) + %compare.86 = pred[] compare(s32[] %select.40, s32[] %select.41), direction=GT + %compare.1645 = pred[] compare(s32[] %select.41, s32[] %select.40), direction=GT + %compare.1646 = pred[] compare(pred[] %compare.86, pred[] %compare.1645), + direction=EQ + %p.1.lhs.2586 = s32[] parameter(2) + %p.1.rhs.2587 = s32[] parameter(3) + %compare.1647 = pred[] compare(s32[] %p.1.lhs.2586, s32[] %p.1.rhs.2587), + direction=LT + ROOT %select.1054 = pred[] select(pred[] %compare.1646, pred[] %compare.1647, + pred[] %compare.86) +} + +ENTRY entry { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %arg_tuple.2 = s32[2,209664] parameter(1) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %arg_tuple.2), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[2,2000] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[2,2000] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[2,2000], s32[2,2000]) + tuple(bf16[2,2000] %slice.34, s32[2,2000] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + std::cout << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 209664); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 209664); +} + +TEST_F(SpmdPartitioningTest, NoPartitionSortInTopKWhenNoPartitionInSortDim) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.2566: bf16[], + p.0.rhs.2567: bf16[], p.1.lhs.2586: s32[], p.1.rhs.2587: s32[]) -> pred[] { + %p.0.lhs.2566 = bf16[] parameter(0) + %convert.164 = f32[] convert(bf16[] %p.0.lhs.2566) + %bitcast-convert.48 = s32[] bitcast-convert(f32[] %convert.164) + %constant.285 = s32[] constant(0) + %compare.84 = pred[] compare(s32[] %bitcast-convert.48, s32[] %constant.285), + direction=LT + %constant.286 = u32[] constant(2147483647) + %bitcast-convert.49 = u32[] bitcast-convert(f32[] %convert.164) + %subtract.84 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.49) + %bitcast-convert.50 = s32[] bitcast-convert(u32[] %subtract.84) + %select.40 = s32[] select(pred[] %compare.84, s32[] %bitcast-convert.50, + s32[] %bitcast-convert.48) + %p.0.rhs.2567 = bf16[] parameter(1) + %convert.165 = f32[] convert(bf16[] %p.0.rhs.2567) + %bitcast-convert.51 = 
s32[] bitcast-convert(f32[] %convert.165) + %compare.85 = pred[] compare(s32[] %bitcast-convert.51, s32[] %constant.285), + direction=LT + %bitcast-convert.52 = u32[] bitcast-convert(f32[] %convert.165) + %subtract.85 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.52) + %bitcast-convert.53 = s32[] bitcast-convert(u32[] %subtract.85) + %select.41 = s32[] select(pred[] %compare.85, s32[] %bitcast-convert.53, + s32[] %bitcast-convert.51) + %compare.86 = pred[] compare(s32[] %select.40, s32[] %select.41), direction=GT + %compare.1645 = pred[] compare(s32[] %select.41, s32[] %select.40), direction=GT + %compare.1646 = pred[] compare(pred[] %compare.86, pred[] %compare.1645), + direction=EQ + %p.1.lhs.2586 = s32[] parameter(2) + %p.1.rhs.2587 = s32[] parameter(3) + %compare.1647 = pred[] compare(s32[] %p.1.lhs.2586, s32[] %p.1.rhs.2587), + direction=LT + ROOT %select.1054 = pred[] select(pred[] %compare.1646, pred[] %compare.1647, + pred[] %compare.86) +} + +ENTRY entry + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[2,1]0,1} + %iota.7 = s32[2,209664] iota(), iota_dimension=1, + metadata={op_type="TopKV2" op_name="TopKV2"} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %iota.7), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[2,2000] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[2,2000] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:2], [0:2000]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[2,2000], s32[2,2000]) + tuple(bf16[2,2000] %slice.34, s32[2,2000] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + std::cout << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 209664); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 209664); +} + +TEST_F(SpmdPartitioningTest, NoPartitionSortInTopKWhenSliceInOtherDim) { + const char* const hlo_string = R"( +HloModule module + +%compare-greater-than.8 (p.0.lhs.2566: bf16[], + p.0.rhs.2567: bf16[], p.1.lhs.2586: s32[], p.1.rhs.2587: s32[]) -> pred[] { + %p.0.lhs.2566 = bf16[] parameter(0) + %convert.164 = f32[] convert(bf16[] %p.0.lhs.2566) + %bitcast-convert.48 = s32[] bitcast-convert(f32[] %convert.164) + %constant.285 = s32[] constant(0) + %compare.84 = pred[] compare(s32[] %bitcast-convert.48, s32[] %constant.285), + direction=LT + %constant.286 = u32[] constant(2147483647) + %bitcast-convert.49 = u32[] bitcast-convert(f32[] %convert.164) + %subtract.84 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.49) + %bitcast-convert.50 = s32[] bitcast-convert(u32[] %subtract.84) + %select.40 = s32[] select(pred[] %compare.84, s32[] %bitcast-convert.50, + s32[] %bitcast-convert.48) + %p.0.rhs.2567 = bf16[] parameter(1) + 
%convert.165 = f32[] convert(bf16[] %p.0.rhs.2567) + %bitcast-convert.51 = s32[] bitcast-convert(f32[] %convert.165) + %compare.85 = pred[] compare(s32[] %bitcast-convert.51, s32[] %constant.285), + direction=LT + %bitcast-convert.52 = u32[] bitcast-convert(f32[] %convert.165) + %subtract.85 = u32[] subtract(u32[] %constant.286, u32[] %bitcast-convert.52) + %bitcast-convert.53 = s32[] bitcast-convert(u32[] %subtract.85) + %select.41 = s32[] select(pred[] %compare.85, s32[] %bitcast-convert.53, + s32[] %bitcast-convert.51) + %compare.86 = pred[] compare(s32[] %select.40, s32[] %select.41), direction=GT + %compare.1645 = pred[] compare(s32[] %select.41, s32[] %select.40), direction=GT + %compare.1646 = pred[] compare(pred[] %compare.86, pred[] %compare.1645), + direction=EQ + %p.1.lhs.2586 = s32[] parameter(2) + %p.1.rhs.2587 = s32[] parameter(3) + %compare.1647 = pred[] compare(s32[] %p.1.lhs.2586, s32[] %p.1.rhs.2587), + direction=LT + ROOT %select.1054 = pred[] select(pred[] %compare.1646, pred[] %compare.1647, + pred[] %compare.86) +} + +ENTRY entry { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %iota.7 = s32[2,209664] iota(), iota_dimension=1, + metadata={op_type="TopKV2" op_name="TopKV2"} + %sort.32 = (bf16[2,209664], s32[2,209664]) + sort(bf16[2,209664] %copy.arg_tuple.1, s32[2,209664] %iota.7), + dimensions={1}, is_stable=true, to_apply=%compare-greater-than.8, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.33 = bf16[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=0, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.34 = bf16[1,209664] slice(bf16[2,209664] + %get-tuple-element.33), slice={[0:1], [0:209664]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + %get-tuple-element.35 = s32[2,209664] + get-tuple-element((bf16[2,209664], s32[2,209664]) %sort.32), + index=1, metadata={op_type="TopKV2" op_name="TopKV2"} + %slice.36 = s32[1,209664] slice(s32[2,209664] + %get-tuple-element.35), slice={[0:1], [0:209664]}, + metadata={op_type="TopKV2" op_name="TopKV2"} + ROOT %tuple.46 = (bf16[1,209664], s32[1,209664]) + tuple(bf16[1,209664] %slice.34, s32[1,209664] + %slice.36), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 209664); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 209664); +} + TEST_F(SpmdPartitioningTest, ShardableTranspose) { const char* const hlo_string = R"( HloModule module diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index 10b8199a2c9..df7597628af 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -19,10 +19,13 @@ limitations under the License. 
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
@@ -702,5 +705,170 @@ HloInstruction* HaloExchangeToPadOnLeft(PartitionedHlo& original,
   return reshard_window->sharded_input;
 }
 
+bool IsNanSafeGt(HloComputation* comp) {
+  namespace m = match;
+  auto match_bitcast_f32 = [](int64 parameter_number) {
+    auto param = m::Parameter(parameter_number)
+                     .WithShape(m::Shape().WithElementType(F32));
+    auto param_s32 =
+        m::BitcastConvert(param).WithShape(m::Shape().WithElementType(S32));
+    auto param_u32 =
+        m::BitcastConvert(param).WithShape(m::Shape().WithElementType(U32));
+    return m::Select(
+        m::Lt(param_s32, m::ConstantScalar(0)),
+        m::BitcastConvert(
+            m::Subtract(m::ConstantScalar(std::numeric_limits<int32>::max()),
+                        param_u32))
+            .WithShape(m::Shape().WithElementType(S32)),
+        param_s32);
+  };
+  auto match_bitcast_bf16 = [](int64 parameter_number) {
+    auto param = m::Convert(m::Parameter(parameter_number)
+                                .WithShape(m::Shape().WithElementType(BF16)))
+                     .WithShape(m::Shape().WithElementType(F32));
+    auto param_s32 =
+        m::BitcastConvert(param).WithShape(m::Shape().WithElementType(S32));
+    auto param_u32 =
+        m::BitcastConvert(param).WithShape(m::Shape().WithElementType(U32));
+    return m::Select(
+        m::Lt(param_s32, m::ConstantScalar(0)),
+        m::BitcastConvert(
+            m::Subtract(m::ConstantScalar(std::numeric_limits<int32>::max()),
+                        param_u32))
+            .WithShape(m::Shape().WithElementType(S32)),
+        param_s32);
+  };
+  // If the root instruction is a kSelect, the comparator also compares
+  // indices when the values are equal.
+  if (comp->root_instruction()->opcode() == HloOpcode::kSelect) {
+    return Match(comp->root_instruction()->operand(2),
+                 m::Gt(match_bitcast_f32(0), match_bitcast_f32(1))) ||
+           Match(comp->root_instruction()->operand(2),
+                 m::Gt(match_bitcast_bf16(0), match_bitcast_bf16(1)));
+  }
+  return Match(comp->root_instruction(),
+               m::Gt(match_bitcast_f32(0), match_bitcast_f32(1))) ||
+         Match(comp->root_instruction(),
+               m::Gt(match_bitcast_bf16(0), match_bitcast_bf16(1)));
+}
+
+absl::optional<int64> GetKValueInTopKWhenPartitionSortDim(HloInstruction* hlo) {
+  HloSortInstruction* sort = DynCast<HloSortInstruction>(hlo);
+  if (sort == nullptr || sort->operand_count() != 2) {
+    return absl::nullopt;
+  }
+  if (!IsNanSafeGt(sort->to_apply())) {
+    return absl::nullopt;
+  }
+  HloInstruction* data = sort->mutable_operand(0);
+  HloIotaInstruction* iota =
+      DynCast<HloIotaInstruction>(sort->mutable_operand(1));
+  const PrimitiveType element_type = data->shape().element_type();
+  if (iota == nullptr || iota->shape().element_type() != S32 ||
+      iota->opcode() != HloOpcode::kIota ||
+      iota->iota_dimension() != sort->sort_dimension()) {
+    return absl::nullopt;
+  }
+
+  const int64 sort_dim = sort->sort_dimension();
+
+  if (element_type != F32 && element_type != BF16 && element_type != S32 &&
+      element_type != U32) {
+    return absl::nullopt;
+  }
+
+  bool supported = true;
+  absl::optional<int64> k;
+  for (HloInstruction* gte : sort->users()) {
+    if (gte->opcode() != HloOpcode::kGetTupleElement) {
+      supported = false;
+      break;
+    }
+
+    const HloInstruction* slice = gte->users()[0];
+    if (slice->opcode() != HloOpcode::kSlice) {
+      // A non-slice user means we are not doing a TopK.
+      supported = false;
+      break;
+    }
+    if (absl::c_any_of(slice->slice_starts(), [](int x) { return x != 0; }) ||
+        absl::c_any_of(slice->slice_strides(), [](int x) { return x != 1; })) {
+      // Strided slices or slices not starting at the beginning aren't
+      // supported.
+      supported = false;
+      break;
+    }
+    for (int64 dim = 0; dim < data->shape().dimensions_size(); dim++) {
+      if (dim == sort_dim) {
+        continue;
+      }
+      if (slice->slice_limits(dim) !=
+          slice->operand(0)->shape().dimensions(dim)) {
+        // Slicing along the other dimensions isn't supported.
+        supported = false;
+        break;
+      }
+    }
+    if (!k.has_value()) {
+      k = slice->slice_limits(sort_dim);
+    } else if (k != slice->slice_limits(sort_dim)) {
+      // Different k for the different operands isn't supported.
+      supported = false;
+      break;
+    }
+  }
+  if (k == absl::nullopt || !supported) {
+    return absl::nullopt;
+  }
+
+  // Only support when the sort dimension is sharded.
+  if (!data->has_sharding()) {
+    return absl::nullopt;
+  }
+  const HloSharding& sharding = sort->operand(0)->sharding();
+
+  if (sharding.IsTileMaximal()) {
+    return absl::nullopt;
+  }
+
+  // Check if partitioned at the sort dimension.
+  for (int64 dim : sort->dimensions()) {
+    if (sharding.tile_assignment().dim(dim) > 1) {
+      if (dim != sort_dim) {
+        return absl::nullopt;
+      }
+    }
+  }
+
+  // Checks if the per-partition size is larger than k.
+  const int64 shard_count = sharding.tile_assignment().dim(sort_dim);
+
+  if (shard_count <= 1) {
+    return absl::nullopt;
+  }
+
+  const int64 input_size = hlo->operand(0)->shape().dimensions(sort_dim);
+  const int64 per_partition_size = CeilOfRatio(input_size, shard_count);
+
+  if (k.value() >= per_partition_size) {
+    return absl::nullopt;
+  }
+
+  return k;
+}
+
+// Slice first k elements from sort_dim.
+HloInstruction* SliceFirstK(HloInstruction* hlo, SpmdBuilder* builder,
+                            int64 slice_dim, int64 k) {
+  const Shape& hlo_shape = hlo->shape();
+  auto hlo_dims = hlo_shape.dimensions();
+  std::vector<int64> start_indices(hlo_shape.dimensions_size(), 0);
+  std::vector<int64> limit_indices(hlo_dims.begin(), hlo_dims.end());
+  std::vector<int64> strides(hlo_shape.dimensions_size(), 1);
+  limit_indices[slice_dim] = k;
+  auto output_shape = hlo_shape;
+  output_shape.set_dimensions(slice_dim, k);
+  return builder->AddInstruction(HloInstruction::CreateSlice(
+      output_shape, hlo, start_indices, limit_indices, strides));
+}
+
 }  // namespace spmd
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h
index 40be73283b7..5f245667970 100644
--- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h
+++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h
@@ -45,6 +45,24 @@ HloInstruction* CreateR0WithType(PrimitiveType type, NativeT value,
   return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal)));
 }
 
+inline HloInstruction* CreateFirstWithType(PrimitiveType type, SpmdBuilder* b) {
+  if (type == F32) {
+    auto float_pad_value = std::numeric_limits<float>::quiet_NaN();
+    return CreateR0WithType(type, -float_pad_value, b);
+  }
+  auto literal = LiteralUtil::MinValue(type);
+  return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal)));
+}
+
+inline HloInstruction* CreateLastWithType(PrimitiveType type, SpmdBuilder* b) {
+  if (type == F32) {
+    auto float_pad_value = std::numeric_limits<float>::quiet_NaN();
+    return CreateR0WithType(type, float_pad_value, b);
+  }
+  auto literal = LiteralUtil::MaxValue(type);
+  return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal)));
+}
+
 // Create a binary add computation of the given type and add to the module.
 HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module);
 
@@ -234,6 +252,16 @@ absl::optional<HloInstruction*> ExchangeHaloAndGetValidData(
 HloInstruction* HaloExchangeToPadOnLeft(PartitionedHlo& original,
                                         absl::Span<const int64> dims);
 
+// Check if the computation is a GT comparison that is safe for NaNs.
+bool IsNanSafeGt(HloComputation* computation);
+
+// Return k in TopK when the input value is partitioned in the sort dimension.
+absl::optional<int64> GetKValueInTopKWhenPartitionSortDim(HloInstruction* hlo);
+
+// Slices the first k elements at the slice dimension.
+HloInstruction* SliceFirstK(HloInstruction* hlo, SpmdBuilder* builder,
+                            int64 slice_dim, int64 k);
+
 }  // namespace spmd
 }  // namespace xla
 
From 25213f58c433d3712931b4071d2498bb67f8c2ca Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Mon, 1 Jun 2020 12:15:28 -0700
Subject: [PATCH 1479/1533] Fixes: GitHub Issue #39976

PiperOrigin-RevId: 314177901
Change-Id: I5ce76f1d5cabb2f70e24dd095ebe5d3c412cc016
---
 .../python/keras/layers/preprocessing/image_preprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py
index e4b92e44e69..92de25a46e6 100644
--- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py
+++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py
@@ -1066,8 +1066,8 @@ class RandomContrast(Layer):
     else:
       self.lower = self.upper = factor
     if self.lower < 0. or self.upper < 0.
or self.lower > 1.: - raise ValueError('Factor cannot have negative values, ' - 'got {}'.format(factor)) + raise ValueError('Factor cannot have negative values or greater than 1.0,' + ' got {}'.format(factor)) self.seed = seed self.input_spec = InputSpec(ndim=4) super(RandomContrast, self).__init__(name=name, **kwargs) From 5498a3cb1226f9ba03ee5802162006178c46caaf Mon Sep 17 00:00:00 2001 From: Anna R Date: Mon, 1 Jun 2020 12:15:28 -0700 Subject: [PATCH 1480/1533] First step towards reducing the size of python/__init__.py. API generation has been relying on python/__init__.py to look for modules with @tf_export decorators. Now, modules_with_exports.py should be used to specify modules to scan instead. PiperOrigin-RevId: 314177907 Change-Id: I13460b5db55d4e56dd810106086ecb2aeca69a6b --- tensorflow/python/BUILD | 15 ++ tensorflow/python/__init__.py | 73 +----- tensorflow/python/modules_with_exports.py | 78 +++++++ .../python/tools/api/generator/api_gen.bzl | 11 +- .../tools/api/generator/create_python_api.py | 209 ++++++++++-------- .../tools/api/tests/api_compatibility_test.py | 24 +- tensorflow/tools/common/traverse.py | 13 +- 7 files changed, 249 insertions(+), 174 deletions(-) create mode 100644 tensorflow/python/modules_with_exports.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 246dc34eed8..682d15712b7 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -239,6 +239,21 @@ py_library( ], ) +# This target should only be used for API generation. +py_library( + name = "modules_with_exports", + srcs = ["modules_with_exports.py"], + srcs_version = "PY2AND3", + visibility = [ + "//tensorflow:__pkg__", + "//tensorflow/python/tools/api/generator:__pkg__", + "//third_party/py/tensorflow_core:__subpackages__", + ], + deps = [ + ":no_contrib", + ], +) + # TODO(gunan): Investigate making this action hermetic so we do not need # to run it locally. tf_py_build_info_genrule( diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py index 781ef33f744..9b42742058c 100644 --- a/tensorflow/python/__init__.py +++ b/tensorflow/python/__init__.py @@ -30,51 +30,14 @@ import importlib import sys import traceback -# TODO(drpng): write up instructions for editing this file in a doc and point to -# the doc instead. -# If you want to edit this file to expose modules in public tensorflow API, you -# need to follow these steps: -# 1. Consult with tensorflow team and get approval for adding a new API to the -# public interface. -# 2. Document the module in the gen_docs_combined.py. -# 3. Import the module in the main tensorflow namespace by adding an import -# statement in this file. -# 4. Sanitize the entry point by making sure that your module does not expose -# transitively imported modules used for implementation, such as os, sys. +# We aim to keep this file minimal and ideally remove completely. +# If you are adding a new file with @tf_export decorators, +# import it in modules_with_exports.py instead. 
# go/tf-wildcard-import # pylint: disable=wildcard-import,g-bad-import-order,g-import-not-at-top -import numpy as np - -from tensorflow.python import pywrap_tensorflow - -# Protocol buffers -from tensorflow.core.framework.graph_pb2 import * -from tensorflow.core.framework.node_def_pb2 import * -from tensorflow.core.framework.summary_pb2 import * -from tensorflow.core.framework.attr_value_pb2 import * -from tensorflow.core.protobuf.meta_graph_pb2 import TensorInfo -from tensorflow.core.protobuf.meta_graph_pb2 import MetaGraphDef -from tensorflow.core.protobuf.config_pb2 import * -from tensorflow.core.protobuf.tensorflow_server_pb2 import * -from tensorflow.core.util.event_pb2 import * - -# Framework -from tensorflow.python.framework.framework_lib import * # pylint: disable=redefined-builtin -from tensorflow.python.framework.versions import * -from tensorflow.python.framework import config -from tensorflow.python.framework import errors -from tensorflow.python.framework import graph_util - -# Session -from tensorflow.python.client.client_lib import * - -# Ops -from tensorflow.python.ops.standard_ops import * - -# Namespaces -from tensorflow.python.ops import initializers_ns as initializers +from tensorflow.python.eager import context # pylint: enable=wildcard-import @@ -152,8 +115,8 @@ from tensorflow.python.framework.ops import enable_eager_execution # Check whether TF2_BEHAVIOR is turned on. from tensorflow.python.eager import monitoring as _monitoring from tensorflow.python import tf2 as _tf2 -_tf2_gauge = _monitoring.BoolGauge('/tensorflow/api/tf2_enable', - 'Environment variable TF2_BEHAVIOR is set".') +_tf2_gauge = _monitoring.BoolGauge( + '/tensorflow/api/tf2_enable', 'Environment variable TF2_BEHAVIOR is set".') _tf2_gauge.get_cell().set(_tf2.enabled()) # Necessary for the symbols in this module to be taken into account by @@ -186,30 +149,6 @@ nn.bidirectional_dynamic_rnn = rnn.bidirectional_dynamic_rnn nn.static_state_saving_rnn = rnn.static_state_saving_rnn nn.rnn_cell = rnn_cell -# Export protos -# pylint: disable=undefined-variable -tf_export(v1=['AttrValue'])(AttrValue) -tf_export(v1=['ConfigProto'])(ConfigProto) -tf_export(v1=['Event', 'summary.Event'])(Event) -tf_export(v1=['GPUOptions'])(GPUOptions) -tf_export(v1=['GraphDef'])(GraphDef) -tf_export(v1=['GraphOptions'])(GraphOptions) -tf_export(v1=['HistogramProto'])(HistogramProto) -tf_export(v1=['LogMessage'])(LogMessage) -tf_export(v1=['MetaGraphDef'])(MetaGraphDef) -tf_export(v1=['NameAttrList'])(NameAttrList) -tf_export(v1=['NodeDef'])(NodeDef) -tf_export(v1=['OptimizerOptions'])(OptimizerOptions) -tf_export(v1=['RunMetadata'])(RunMetadata) -tf_export(v1=['RunOptions'])(RunOptions) -tf_export(v1=['SessionLog', 'summary.SessionLog'])(SessionLog) -tf_export(v1=['Summary', 'summary.Summary'])(Summary) -tf_export(v1=['summary.SummaryDescription'])(SummaryDescription) -tf_export(v1=['SummaryMetadata'])(SummaryMetadata) -tf_export(v1=['summary.TaggedRunMetadata'])(TaggedRunMetadata) -tf_export(v1=['TensorInfo'])(TensorInfo) -# pylint: enable=undefined-variable - # Special dunders that we choose to export: _exported_dunders = set([ '__version__', diff --git a/tensorflow/python/modules_with_exports.py b/tensorflow/python/modules_with_exports.py new file mode 100644 index 00000000000..40ef715aacf --- /dev/null +++ b/tensorflow/python/modules_with_exports.py @@ -0,0 +1,78 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Imports modules that should be scanned during API generation. + +This file should eventually contain everything we need to scan looking for +tf_export decorators. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# go/tf-wildcard-import +# pylint: disable=wildcard-import,g-bad-import-order,g-import-not-at-top +# pylint: disable=unused-import,g-importing-member + +# Protocol buffers +from tensorflow.core.framework.graph_pb2 import * +from tensorflow.core.framework.node_def_pb2 import * +from tensorflow.core.framework.summary_pb2 import * +from tensorflow.core.framework.attr_value_pb2 import * +from tensorflow.core.protobuf.meta_graph_pb2 import TensorInfo +from tensorflow.core.protobuf.meta_graph_pb2 import MetaGraphDef +from tensorflow.core.protobuf.config_pb2 import * +from tensorflow.core.util.event_pb2 import * + +# Framework +from tensorflow.python.framework.framework_lib import * # pylint: disable=redefined-builtin +from tensorflow.python.framework.versions import * +from tensorflow.python.framework import config +from tensorflow.python.framework import errors +from tensorflow.python.framework import graph_util + +# Session +from tensorflow.python.client.client_lib import * + +# Ops +from tensorflow.python.ops.standard_ops import * + +# Namespaces +from tensorflow.python.ops import initializers_ns as initializers + +from tensorflow.python.util.tf_export import tf_export + +# Export protos +# pylint: disable=undefined-variable +tf_export(v1=['AttrValue'])(AttrValue) +tf_export(v1=['ConfigProto'])(ConfigProto) +tf_export(v1=['Event', 'summary.Event'])(Event) +tf_export(v1=['GPUOptions'])(GPUOptions) +tf_export(v1=['GraphDef'])(GraphDef) +tf_export(v1=['GraphOptions'])(GraphOptions) +tf_export(v1=['HistogramProto'])(HistogramProto) +tf_export(v1=['LogMessage'])(LogMessage) +tf_export(v1=['MetaGraphDef'])(MetaGraphDef) +tf_export(v1=['NameAttrList'])(NameAttrList) +tf_export(v1=['NodeDef'])(NodeDef) +tf_export(v1=['OptimizerOptions'])(OptimizerOptions) +tf_export(v1=['RunMetadata'])(RunMetadata) +tf_export(v1=['RunOptions'])(RunOptions) +tf_export(v1=['SessionLog', 'summary.SessionLog'])(SessionLog) +tf_export(v1=['Summary', 'summary.Summary'])(Summary) +tf_export(v1=['summary.SummaryDescription'])(SummaryDescription) +tf_export(v1=['SummaryMetadata'])(SummaryMetadata) +tf_export(v1=['summary.TaggedRunMetadata'])(TaggedRunMetadata) +tf_export(v1=['TensorInfo'])(TensorInfo) +# pylint: enable=undefined-variable diff --git a/tensorflow/python/tools/api/generator/api_gen.bzl b/tensorflow/python/tools/api/generator/api_gen.bzl index 3c7f7ac3900..45355c90aaa 100644 --- a/tensorflow/python/tools/api/generator/api_gen.bzl +++ b/tensorflow/python/tools/api/generator/api_gen.bzl @@ -42,8 +42,15 @@ def gen_api_init_files( api_version = 2, compat_api_versions = [], compat_init_templates = [], - packages 
= ["tensorflow.python", "tensorflow.lite.python.lite"], - package_deps = ["//tensorflow/python:no_contrib"], + packages = [ + "tensorflow.python", + "tensorflow.lite.python.lite", + "tensorflow.python.modules_with_exports", + ], + package_deps = [ + "//tensorflow/python:no_contrib", + "//tensorflow/python:modules_with_exports", + ], output_package = "tensorflow", output_dir = "", root_file_name = "__init__.py"): diff --git a/tensorflow/python/tools/api/generator/create_python_api.py b/tensorflow/python/tools/api/generator/create_python_api.py index 42119aeae34..cc2f8ebfb28 100644 --- a/tensorflow/python/tools/api/generator/create_python_api.py +++ b/tensorflow/python/tools/api/generator/create_python_api.py @@ -12,8 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================= -"""Generates and prints out imports and constants for new TensorFlow python api. -""" +"""Generates and prints out imports and constants for new TensorFlow python api.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -85,9 +84,9 @@ def get_canonical_import(import_set): ordering. Args: - import_set: (set) Imports providing the same symbol. This is a set of - tuples in the form (import, priority). We want to pick an import - with highest priority. + import_set: (set) Imports providing the same symbol. This is a set of tuples + in the form (import, priority). We want to pick an import with highest + priority. Returns: A module name to import @@ -106,9 +105,11 @@ def get_canonical_import(import_set): class _ModuleInitCodeBuilder(object): """Builds a map from module name to imports included in that module.""" - def __init__( - self, output_package, api_version, lazy_loading=_LAZY_LOADING, - use_relative_imports=False): + def __init__(self, + output_package, + api_version, + lazy_loading=_LAZY_LOADING, + use_relative_imports=False): self._output_package = output_package # Maps API module to API symbol name to set of tuples of the form # (module name, priority). @@ -127,16 +128,13 @@ class _ModuleInitCodeBuilder(object): def _check_already_imported(self, symbol_id, api_name): if (api_name in self._dest_import_to_id and - symbol_id != self._dest_import_to_id[api_name] and - symbol_id != -1): + symbol_id != self._dest_import_to_id[api_name] and symbol_id != -1): raise SymbolExposedTwiceError( - 'Trying to export multiple symbols with same name: %s.' % - api_name) + 'Trying to export multiple symbols with same name: %s.' % api_name) self._dest_import_to_id[api_name] = symbol_id - def add_import( - self, symbol, source_module_name, source_name, dest_module_name, - dest_name): + def add_import(self, symbol, source_module_name, source_name, + dest_module_name, dest_name): """Adds this import to module_imports. Args: @@ -150,6 +148,10 @@ class _ModuleInitCodeBuilder(object): SymbolExposedTwiceError: Raised when an import with the same dest_name has already been added to dest_module_name. """ + # modules_with_exports.py is only used during API generation and + # won't be available when actually importing tensorflow. + if source_module_name.endswith('python.modules_with_exports'): + source_module_name = symbol.__module__ import_str = self.format_import(source_module_name, source_name, dest_name) # Check if we are trying to expose two different symbols with same name. 
@@ -191,7 +193,7 @@ class _ModuleInitCodeBuilder(object): for submodule_index in range(len(module_split)): if submodule_index > 0: - submodule = module_split[submodule_index-1] + submodule = module_split[submodule_index - 1] parent_module += '.' + submodule if parent_module else submodule import_from = self._output_package if self._lazy_loading: @@ -264,8 +266,8 @@ __all__.extend([_s for _s in _names_with_underscore]) if not dest_module.startswith(_COMPAT_MODULE_PREFIX): deprecation = 'True' # Workaround to make sure not load lite from lite/__init__.py - if (not dest_module and 'lite' in self._module_imports - and self._lazy_loading): + if (not dest_module and 'lite' in self._module_imports and + self._lazy_loading): has_lite = 'True' if self._lazy_loading: public_apis_name = '_PUBLIC_APIS' @@ -311,8 +313,8 @@ __all__.extend([_s for _s in _names_with_underscore]) self._module_imports[from_dest_module].copy()) -def add_nested_compat_imports( - module_builder, compat_api_versions, output_package): +def add_nested_compat_imports(module_builder, compat_api_versions, + output_package): """Adds compat.vN.compat.vK modules to module builder. To avoid circular imports, we want to add __init__.py files under @@ -334,8 +336,8 @@ def add_nested_compat_imports( subcompat_module = _SUBCOMPAT_MODULE_TEMPLATE % (v, sv) compat_module = _COMPAT_MODULE_TEMPLATE % sv module_builder.copy_imports(compat_module, subcompat_module) - module_builder.copy_imports( - '%s.compat' % compat_module, '%s.compat' % subcompat_module) + module_builder.copy_imports('%s.compat' % compat_module, + '%s.compat' % subcompat_module) # Prefixes of modules under compatibility packages, for e.g. "compat.v1.". compat_prefixes = tuple( @@ -400,14 +402,13 @@ def _join_modules(module1, module2): return '%s.%s' % (module1, module2) -def add_imports_for_symbol( - module_code_builder, - symbol, - source_module_name, - source_name, - api_name, - api_version, - output_module_prefix=''): +def add_imports_for_symbol(module_code_builder, + symbol, + source_module_name, + source_name, + api_name, + api_version, + output_module_prefix=''): """Add imports for the given symbol to `module_code_builder`. Args: @@ -432,8 +433,8 @@ def add_imports_for_symbol( for export in exports: dest_module, dest_name = _get_name_and_module(export) dest_module = _join_modules(output_module_prefix, dest_module) - module_code_builder.add_import( - None, source_module_name, name, dest_module, dest_name) + module_code_builder.add_import(None, source_module_name, name, + dest_module, dest_name) # If symbol has _tf_api_names attribute, then add import for it. if (hasattr(symbol, '__dict__') and names_attr in symbol.__dict__): @@ -442,8 +443,8 @@ def add_imports_for_symbol( for export in getattr(symbol, names_attr): # pylint: disable=protected-access dest_module, dest_name = _get_name_and_module(export) dest_module = _join_modules(output_module_prefix, dest_module) - module_code_builder.add_import( - symbol, source_module_name, source_name, dest_module, dest_name) + module_code_builder.add_import(symbol, source_module_name, source_name, + dest_module, dest_name) def get_api_init_text(packages, @@ -466,8 +467,8 @@ def get_api_init_text(packages, directory. lazy_loading: Boolean flag. If True, a lazy loading `__init__.py` file is produced and if `False`, static imports are used. - use_relative_imports: True if we should use relative imports when - importing submodules. + use_relative_imports: True if we should use relative imports when importing + submodules. 
Returns: A dictionary where @@ -477,8 +478,10 @@ def get_api_init_text(packages, """ if compat_api_versions is None: compat_api_versions = [] - module_code_builder = _ModuleInitCodeBuilder( - output_package, api_version, lazy_loading, use_relative_imports) + module_code_builder = _ModuleInitCodeBuilder(output_package, api_version, + lazy_loading, + use_relative_imports) + # Traverse over everything imported above. Specifically, # we want to traverse over TensorFlow Python modules. @@ -496,24 +499,23 @@ def get_api_init_text(packages, continue for module_contents_name in dir(module): - if (module.__name__ + '.' + module_contents_name - in _SYMBOLS_TO_SKIP_EXPLICITLY): + if (module.__name__ + '.' + + module_contents_name in _SYMBOLS_TO_SKIP_EXPLICITLY): continue attr = getattr(module, module_contents_name) _, attr = tf_decorator.unwrap(attr) - add_imports_for_symbol( - module_code_builder, attr, module.__name__, module_contents_name, - api_name, api_version) + add_imports_for_symbol(module_code_builder, attr, module.__name__, + module_contents_name, api_name, api_version) for compat_api_version in compat_api_versions: - add_imports_for_symbol( - module_code_builder, attr, module.__name__, module_contents_name, - api_name, compat_api_version, - _COMPAT_MODULE_TEMPLATE % compat_api_version) + add_imports_for_symbol(module_code_builder, attr, module.__name__, + module_contents_name, api_name, + compat_api_version, + _COMPAT_MODULE_TEMPLATE % compat_api_version) if compat_api_versions: - add_nested_compat_imports( - module_code_builder, compat_api_versions, output_package) + add_nested_compat_imports(module_code_builder, compat_api_versions, + output_package) return module_code_builder.build() @@ -545,8 +547,8 @@ def get_module_docstring(module_name, package, api_name): 4. Returns a default docstring. Args: - module_name: module name relative to tensorflow - (excluding 'tensorflow.' prefix) to get a docstring for. + module_name: module name relative to tensorflow (excluding 'tensorflow.' + prefix) to get a docstring for. package: Base python package containing python with target tf_export decorators. api_name: API you want to generate (e.g. `tensorflow` or `estimator`). @@ -581,31 +583,37 @@ def get_module_docstring(module_name, package, api_name): return 'Public API for tf.%s namespace.' % module_name -def create_api_files(output_files, packages, root_init_template, output_dir, - output_package, api_name, api_version, - compat_api_versions, compat_init_templates, - lazy_loading=_LAZY_LOADING, use_relative_imports=False): +def create_api_files(output_files, + packages, + root_init_template, + output_dir, + output_package, + api_name, + api_version, + compat_api_versions, + compat_init_templates, + lazy_loading=_LAZY_LOADING, + use_relative_imports=False): """Creates __init__.py files for the Python API. Args: output_files: List of __init__.py file paths to create. packages: Base python packages containing python with target tf_export decorators. - root_init_template: Template for top-level __init__.py file. - "# API IMPORTS PLACEHOLDER" comment in the template file will be replaced - with imports. + root_init_template: Template for top-level __init__.py file. "# API IMPORTS + PLACEHOLDER" comment in the template file will be replaced with imports. output_dir: output API root directory. output_package: Base output package where generated API will be added. api_name: API you want to generate (e.g. `tensorflow` or `estimator`). api_version: API version to generate (`v1` or `v2`). 
compat_api_versions: Additional API versions to generate in compat/ subdirectory. - compat_init_templates: List of templates for top level compat init files - in the same order as compat_api_versions. + compat_init_templates: List of templates for top level compat init files in + the same order as compat_api_versions. lazy_loading: Boolean flag. If True, a lazy loading `__init__.py` file is produced and if `False`, static imports are used. - use_relative_imports: True if we should use relative imports when - import submodules. + use_relative_imports: True if we should use relative imports when import + submodules. Raises: ValueError: if output_files list is missing a required file. @@ -645,8 +653,7 @@ def create_api_files(output_files, packages, root_init_template, output_dir, for module, text in module_text_map.items(): # Make sure genrule output file list is in sync with API exports. if module not in module_name_to_file_path: - module_file_path = '"%s/__init__.py"' % ( - module.replace('.', '/')) + module_file_path = '"%s/__init__.py"' % (module.replace('.', '/')) missing_output_files.append(module_file_path) continue @@ -664,8 +671,9 @@ def create_api_files(output_files, packages, root_init_template, output_dir, contents = contents.replace('# API IMPORTS PLACEHOLDER', text) else: contents = ( - _GENERATED_FILE_HEADER % get_module_docstring( - module, packages[0], api_name) + text + _GENERATED_FILE_FOOTER) + _GENERATED_FILE_HEADER % + get_module_docstring(module, packages[0], api_name) + text + + _GENERATED_FILE_FOOTER) if module in deprecation_footer_map: if '# WRAPPER_PLACEHOLDER' in contents: contents = contents.replace('# WRAPPER_PLACEHOLDER', @@ -680,14 +688,17 @@ def create_api_files(output_files, packages, root_init_template, output_dir, """Missing outputs for genrule:\n%s. Be sure to add these targets to tensorflow/python/tools/api/generator/api_init_files_v1.bzl and tensorflow/python/tools/api/generator/api_init_files.bzl (tensorflow repo), or -tensorflow_estimator/python/estimator/api/api_gen.bzl (estimator repo)""" - % ',\n'.join(sorted(missing_output_files))) +tensorflow_estimator/python/estimator/api/api_gen.bzl (estimator repo)""" % + ',\n'.join(sorted(missing_output_files))) def main(): parser = argparse.ArgumentParser() parser.add_argument( - 'outputs', metavar='O', type=str, nargs='+', + 'outputs', + metavar='O', + type=str, + nargs='+', help='If a single file is passed in, then we we assume it contains a ' 'semicolon-separated list of Python files that we expect this script to ' 'output. If multiple files are passed in, then we assume output files ' @@ -699,46 +710,66 @@ def main(): help='Base packages that import modules containing the target tf_export ' 'decorators.') parser.add_argument( - '--root_init_template', default='', type=str, + '--root_init_template', + default='', + type=str, help='Template for top level __init__.py file. ' - '"#API IMPORTS PLACEHOLDER" comment will be replaced with imports.') + '"#API IMPORTS PLACEHOLDER" comment will be replaced with imports.') parser.add_argument( - '--apidir', type=str, required=True, + '--apidir', + type=str, + required=True, help='Directory where generated output files are placed. ' - 'gendir should be a prefix of apidir. Also, apidir ' - 'should be a prefix of every directory in outputs.') + 'gendir should be a prefix of apidir. 
Also, apidir ' + 'should be a prefix of every directory in outputs.') parser.add_argument( - '--apiname', required=True, type=str, + '--apiname', + required=True, + type=str, choices=API_ATTRS.keys(), help='The API you want to generate.') parser.add_argument( - '--apiversion', default=2, type=int, + '--apiversion', + default=2, + type=int, choices=_API_VERSIONS, help='The API version you want to generate.') parser.add_argument( - '--compat_apiversions', default=[], type=int, action='append', + '--compat_apiversions', + default=[], + type=int, + action='append', help='Additional versions to generate in compat/ subdirectory. ' - 'If set to 0, then no additional version would be generated.') + 'If set to 0, then no additional version would be generated.') parser.add_argument( - '--compat_init_templates', default=[], type=str, action='append', + '--compat_init_templates', + default=[], + type=str, + action='append', help='Templates for top-level __init__ files under compat modules. ' - 'The list of init file templates must be in the same order as ' - 'list of versions passed with compat_apiversions.') + 'The list of init file templates must be in the same order as ' + 'list of versions passed with compat_apiversions.') parser.add_argument( - '--output_package', default='tensorflow', type=str, + '--output_package', + default='tensorflow', + type=str, help='Root output package.') parser.add_argument( - '--loading', default='default', type=str, + '--loading', + default='default', + type=str, choices=['lazy', 'static', 'default'], help='Controls how the generated __init__.py file loads the exported ' - 'symbols. \'lazy\' means the symbols are loaded when first used. ' - '\'static\' means all exported symbols are loaded in the ' - '__init__.py file. \'default\' uses the value of the ' - '_LAZY_LOADING constant in create_python_api.py.') + 'symbols. \'lazy\' means the symbols are loaded when first used. ' + '\'static\' means all exported symbols are loaded in the ' + '__init__.py file. \'default\' uses the value of the ' + '_LAZY_LOADING constant in create_python_api.py.') parser.add_argument( - '--use_relative_imports', default=False, type=bool, + '--use_relative_imports', + default=False, + type=bool, help='Whether to import submodules using relative imports or absolute ' - 'imports') + 'imports') args = parser.parse_args() if len(args.outputs) == 1: diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py index 1732a285855..0098003f0c8 100644 --- a/tensorflow/tools/api/tests/api_compatibility_test.py +++ b/tensorflow/tools/api/tests/api_compatibility_test.py @@ -34,7 +34,6 @@ import re import sys import six -from six.moves import range import tensorflow as tf from google.protobuf import message @@ -87,9 +86,8 @@ _API_GOLDEN_FOLDER_V2 = None def _InitPathConstants(): global _API_GOLDEN_FOLDER_V1 global _API_GOLDEN_FOLDER_V2 - root_golden_path_v2 = os.path.join( - resource_loader.get_data_files_path(), '..', 'golden', 'v2', - 'tensorflow.pbtxt') + root_golden_path_v2 = os.path.join(resource_loader.get_data_files_path(), + '..', 'golden', 'v2', 'tensorflow.pbtxt') if FLAGS.update_goldens: root_golden_path_v2 = os.path.realpath(root_golden_path_v2) @@ -106,7 +104,6 @@ _UPDATE_WARNING_FILE = resource_loader.get_path_to_datafile( _NON_CORE_PACKAGES = ['estimator'] - # TODO(annarev): remove this once we test with newer version of # estimator that actually has compat v1 version. 
if not hasattr(tf.compat.v1, 'estimator'): @@ -282,9 +279,6 @@ class ApiCompatibilityTest(test.TestCase): diff_count = len(diffs) logging.error(self._test_readme_message) logging.error('%d differences found between API and golden.', diff_count) - messages = verbose_diffs if verbose else diffs - for i in range(diff_count): - print('Issue %d\t: %s' % (i + 1, messages[i]), file=sys.stderr) if update_goldens: # Write files if requested. @@ -394,11 +388,12 @@ class ApiCompatibilityTest(test.TestCase): resource_loader.get_root_dir_with_all_resources(), _KeyToFilePath('*', api_version)) omit_golden_symbols_map = {} - if (api_version == 2 and FLAGS.only_test_core_api - and not _TENSORBOARD_AVAILABLE): + if (api_version == 2 and FLAGS.only_test_core_api and + not _TENSORBOARD_AVAILABLE): # In TF 2.0 these summary symbols are imported from TensorBoard. omit_golden_symbols_map['tensorflow.summary'] = [ - 'audio', 'histogram', 'image', 'scalar', 'text'] + 'audio', 'histogram', 'image', 'scalar', 'text' + ] self._checkBackwardsCompatibility( tf, @@ -418,7 +413,9 @@ class ApiCompatibilityTest(test.TestCase): resource_loader.get_root_dir_with_all_resources(), _KeyToFilePath('*', api_version)) self._checkBackwardsCompatibility( - tf.compat.v1, golden_file_pattern, api_version, + tf.compat.v1, + golden_file_pattern, + api_version, additional_private_map={ 'tf': ['pywrap_tensorflow'], 'tf.compat': ['v1', 'v2'], @@ -434,7 +431,8 @@ class ApiCompatibilityTest(test.TestCase): if FLAGS.only_test_core_api and not _TENSORBOARD_AVAILABLE: # In TF 2.0 these summary symbols are imported from TensorBoard. omit_golden_symbols_map['tensorflow.summary'] = [ - 'audio', 'histogram', 'image', 'scalar', 'text'] + 'audio', 'histogram', 'image', 'scalar', 'text' + ] self._checkBackwardsCompatibility( tf.compat.v2, golden_file_pattern, diff --git a/tensorflow/tools/common/traverse.py b/tensorflow/tools/common/traverse.py index 5efce450dcb..1d9c98277b5 100644 --- a/tensorflow/tools/common/traverse.py +++ b/tensorflow/tools/common/traverse.py @@ -46,9 +46,16 @@ def _traverse_internal(root, visit, stack, path): children.append(enum_member) children = sorted(children) except ImportError: - # On some Python installations, some modules do not support enumerating - # members (six in particular), leading to import errors. - children = [] + # Children could be missing for one of two reasons: + # 1. On some Python installations, some modules do not support enumerating + # members (six in particular), leading to import errors. + # 2. Children are lazy-loaded. + try: + children = [] + for child_name in root.__all__: + children.append((child_name, getattr(root, child_name))) + except AttributeError: + children = [] new_stack = stack + [root] visit(path, root, children) From bbe22cf4dd6471ecb619543509afe81c96584f0d Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 1 Jun 2020 12:17:42 -0700 Subject: [PATCH 1481/1533] Fix failing test in group_conv_gpu. 
PiperOrigin-RevId: 314178390 Change-Id: Icbd3300c0ede37b712c69661b5eb7fbbbaa4e7db --- tensorflow/python/keras/layers/convolutional_test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional_test.py b/tensorflow/python/keras/layers/convolutional_test.py index 02bbdc7810c..f25eeb66664 100644 --- a/tensorflow/python/keras/layers/convolutional_test.py +++ b/tensorflow/python/keras/layers/convolutional_test.py @@ -26,11 +26,11 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import nn from tensorflow.python.ops import random_ops -from tensorflow.python.keras import keras_parameterized -from tensorflow.python.keras import testing_utils from tensorflow.python.platform import test @@ -359,6 +359,7 @@ class Conv3DTest(keras_parameterized.TestCase): input_data=input_data) +@keras_parameterized.run_all_keras_modes(always_skip_v1=True) class GroupedConvTest(keras_parameterized.TestCase): @parameterized.named_parameters( From 6f22fa93766d18f45e2baac140bded0bc3e5e681 Mon Sep 17 00:00:00 2001 From: Anna R Date: Mon, 1 Jun 2020 12:18:22 -0700 Subject: [PATCH 1482/1533] Removing device-related code that doesn't seem to be used. Usage was removed by the following commits: https://github.com/tensorflow/tensorflow/commit/223c8bdf8963fef00cd9a1ec0fa10a3fb47fda1e, https://github.com/tensorflow/tensorflow/commit/04c23099c2aa745a70f91519f9182de4f2a15358#diff-3780f0ef44936240abc76c4c42541532. 
PiperOrigin-RevId: 314178530 Change-Id: I7a2502d691610a6cd44a9752e9f48e4798071f13 --- tensorflow/core/common_runtime/gpu/BUILD | 28 ---- .../core/common_runtime/gpu/gpu_device.cc | 1 - .../common_runtime/gpu/gpu_stream_util.cc | 112 ------------- .../core/common_runtime/gpu/gpu_stream_util.h | 45 ------ .../gpu/gpu_stream_util_test.cc | 148 ------------------ tensorflow/core/framework/device_base.h | 3 - 6 files changed, 337 deletions(-) delete mode 100644 tensorflow/core/common_runtime/gpu/gpu_stream_util.cc delete mode 100644 tensorflow/core/common_runtime/gpu/gpu_stream_util.h delete mode 100644 tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc diff --git a/tensorflow/core/common_runtime/gpu/BUILD b/tensorflow/core/common_runtime/gpu/BUILD index 07919117051..3744eb967d3 100644 --- a/tensorflow/core/common_runtime/gpu/BUILD +++ b/tensorflow/core/common_runtime/gpu/BUILD @@ -121,7 +121,6 @@ filegroup( "gpu_managed_allocator.h", "gpu_mem_allocator.h", "gpu_process_state.h", - "gpu_stream_util.h", "gpu_util.h", "//tensorflow/core/common_runtime:gpu_runtime_headers", ], @@ -137,7 +136,6 @@ tf_cuda_library( "gpu_device_factory.cc", "gpu_managed_allocator.cc", "gpu_process_state.cc", - "gpu_stream_util.cc", "gpu_util.cc", "gpu_util_platform_specific.cc", ], @@ -400,29 +398,3 @@ tf_cc_test_gpu( "//tensorflow/core/kernels:ops_util", ], ) - -tf_cc_test_gpu( - name = "gpu_stream_util_test", - size = "small", - srcs = ["gpu_stream_util_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - tags = tf_cuda_tests_tags() + ["nomac"], - deps = [ - ":gpu_runtime", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:sendrecv_ops", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", - "//tensorflow/core/common_runtime:core_cpu", - "//tensorflow/core/common_runtime:core_cpu_internal", - "//tensorflow/core/common_runtime:direct_session_internal", - "//tensorflow/core/kernels:matmul_op", - "//tensorflow/core/kernels:ops_util", - ], -) diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc index cf2e7043cae..e11b079b7ec 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc @@ -42,7 +42,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" #include "tensorflow/core/common_runtime/gpu/gpu_init.h" #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" -#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h" #include "tensorflow/core/common_runtime/gpu/gpu_util.h" #include "tensorflow/core/common_runtime/gpu_device_context.h" #include "tensorflow/core/common_runtime/local_device.h" diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc deleted file mode 100644 index de715d140a1..00000000000 --- a/tensorflow/core/common_runtime/gpu/gpu_stream_util.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h" - -#include -#include -#include -#include - -#include "tensorflow/core/graph/algorithm.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/strcat.h" - -namespace tensorflow { -namespace gpu_stream_util { - -Status AssignStreams(const Graph* graph, const AssignStreamsOpts& opts, - std::unordered_map* node_to_stream_id) { - VLOG(1) << "AssignStreams"; - Status status; - - // Sanity check arguments. - if (graph == nullptr) - status.Update(errors::InvalidArgument("Bad graph argument supplied.")); - if (node_to_stream_id == nullptr) { - status.Update( - errors::InvalidArgument("Bad node_to_stream_id argument supplied.")); - } - if ((opts.max_streams < 1) || (opts.send_stream >= opts.max_streams) || - (opts.recv_stream >= opts.max_streams) || - (opts.const_stream >= opts.max_streams) || - (opts.compute_stream >= opts.max_streams)) { - status.Update(errors::InvalidArgument("Bad graph argument supplied.")); - } - TF_RETURN_IF_ERROR(status); - - // Topologically sort the nodes. - std::vector order; - GetReversePostOrder(*graph, &order); - if (VLOG_IS_ON(2)) { - for (Node* n : order) { - const int node_id = n->id(); - VLOG(2) << "Node " << node_id << " " << n->type_string() << " " - << n->name() << " " << n->in_edges().size() << " inputs"; - for (const Edge* e : n->in_edges()) { - VLOG(2) << " Edge from " << e->src()->id() << " " << e->src()->name() - << " fanout " << e->src()->out_edges().size(); - } - } - } - // We perform stream assignment assuming a large number of - // stream IDs and then map these down to the required number of streams - // using simple round-robin. - // Stream Assignment strategy: - // 1. Nodes with zero inputs are always be executed on a - // fresh stream. - // 2. Try to execute a node on the same stream as one of its - // inputs to avoid inter-stream dependencies. - // 3. If any input comes from a node with a large fanout then - // perhaps an indication that it is shared between parallel - // streams of work. We choose a new stream here so that all consumers - // of the tensor are likely to run in parallel. - int highest_stream_id = -1; - for (Node* n : order) { - VLOG(3) << "Inspecting node " << n->DebugString(); - const int node_id = n->id(); - const string& op = n->type_string(); - - // Determine a suitable stream to use. - int stream_id = highest_stream_id + 1; - for (const Edge* e : n->in_edges()) { - const size_t fanout = e->src()->out_edges().size(); - if (fanout == 1) { - stream_id = (*node_to_stream_id)[e->src()->id()]; - break; - } - } - // Override stream for specific op types. 
- if (op == "_Send") { - if (opts.send_stream >= 0) stream_id = opts.send_stream; - } else if (op == "_Recv") { - if (opts.recv_stream >= 0) stream_id = opts.recv_stream; - } else if (op == "Const") { - if (opts.const_stream >= 0) stream_id = opts.const_stream; - } else { - if (opts.compute_stream >= 0) stream_id = opts.compute_stream; - } - - (*node_to_stream_id)[node_id] = stream_id % opts.max_streams; - highest_stream_id = std::max(stream_id, highest_stream_id); - } - VLOG(1) << "Identified " << highest_stream_id << " candidate streams for " - << order.size() << " nodes."; - - return Status::OK(); -} - -} // namespace gpu_stream_util -} // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util.h b/tensorflow/core/common_runtime/gpu/gpu_stream_util.h deleted file mode 100644 index c61ada96efe..00000000000 --- a/tensorflow/core/common_runtime/gpu/gpu_stream_util.h +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_ -#define TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_ - -#include - -#include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/lib/core/status.h" - -namespace tensorflow { -namespace gpu_stream_util { - -struct AssignStreamsOpts { - int32 max_streams = 1; - // The following options specify a stream to use for specific op - // types. The value -1 allows ops to be assigned to any stream. - int32 send_stream = -1; - int32 recv_stream = -1; - int32 const_stream = -1; - int32 compute_stream = -1; -}; - -// Given the input graph, assigns every node in the graph with a -// stream_id that should be used. -Status AssignStreams(const Graph* graph, const AssignStreamsOpts& opts, - std::unordered_map* node_to_stream_id); - -} // namespace gpu_stream_util -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_COMMON_RUNTIME_GPU_GPU_STREAM_UTIL_H_ diff --git a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc b/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc deleted file mode 100644 index 2500425359c..00000000000 --- a/tensorflow/core/common_runtime/gpu/gpu_stream_util_test.cc +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h" - -#include "tensorflow/cc/ops/sendrecv_ops.h" -#include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/graph/graph_def_builder.h" -#include "tensorflow/core/graph/node_builder.h" -#include "tensorflow/core/kernels/ops_testutil.h" -#include "tensorflow/core/kernels/ops_util.h" -#include "tensorflow/core/lib/core/status_test_util.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace { - -class GpuStreamUtilTest : public OpsTestBase { - protected: -}; - -TEST_F(GpuStreamUtilTest, BogusOpts) { - auto root = Scope::NewRootScope().ExitOnError(); - Graph g(OpRegistry::Global()); - TF_ASSERT_OK(root.ToGraph(&g)); - std::unordered_map node_to_stream_id; - gpu_stream_util::AssignStreamsOpts opts; - Status status; - status = gpu_stream_util::AssignStreams(nullptr, opts, &node_to_stream_id); - EXPECT_FALSE(status.ok()); - status = gpu_stream_util::AssignStreams(&g, opts, nullptr); - EXPECT_FALSE(status.ok()); - opts.max_streams = 0; - status = gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id); - EXPECT_FALSE(status.ok()); - opts.max_streams = 1; - opts.compute_stream = 5; - status = gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id); - EXPECT_FALSE(status.ok()); -} - -TEST_F(GpuStreamUtilTest, EmptyGraph) { - auto root = Scope::NewRootScope().ExitOnError(); - Graph g(OpRegistry::Global()); - TF_ASSERT_OK(root.ToGraph(&g)); - std::unordered_map node_to_stream_id; - gpu_stream_util::AssignStreamsOpts opts; - TF_ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id)); - EXPECT_EQ(2, node_to_stream_id.size()); // _SOURCE and _SINK -} - -TEST_F(GpuStreamUtilTest, SimpleGraphOneStream) { - auto root = Scope::DisabledShapeInferenceScope().ExitOnError(); - ops::MatMul(root, {}, {}); - Graph g(OpRegistry::Global()); - TF_ASSERT_OK(root.ToGraph(&g)); - - std::unordered_map node_to_stream_id; - gpu_stream_util::AssignStreamsOpts opts; - TF_ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id)); - - // There should be 5 nodes assigned. - EXPECT_EQ(5, node_to_stream_id.size()); - - // All of them should have stream 0. - for (const auto& it : node_to_stream_id) { - EXPECT_EQ(0, it.second); - } -} - -TEST_F(GpuStreamUtilTest, SimpleGraphManyStreams) { - auto root = Scope::DisabledShapeInferenceScope().ExitOnError(); - ops::MatMul(root, {}, {}); - Graph g(OpRegistry::Global()); - TF_ASSERT_OK(root.ToGraph(&g)); - - std::unordered_map node_to_stream_id; - gpu_stream_util::AssignStreamsOpts opts; - opts.max_streams = 3; - TF_ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id)); - - // There should be 5 nodes assigned. - EXPECT_EQ(5, node_to_stream_id.size()); - - // All of them should have a stream in the range [0..max_streams). 
- for (const auto& it : node_to_stream_id) { - EXPECT_GE(it.second, 0); - EXPECT_LT(it.second, opts.max_streams); - } -} - -TEST_F(GpuStreamUtilTest, StreamOverrides) { - auto root = Scope::DisabledShapeInferenceScope().ExitOnError(); - ops::_Recv(root.WithOpName("input"), DT_FLOAT, "input", "/cpu:0", 0, - "/device:GPU:0"); - Output n = ops::MatMul(root, {}, {}); - ops::_Send(root.WithOpName("output"), n, "output", "/device:GPU:0", 0, - "/cpu:0"); - Graph g(OpRegistry::Global()); - TF_ASSERT_OK(root.ToGraph(&g)); - - // Perform stream assignment using a large number of streams, but with - // op types constrained to specific streams. - std::unordered_map node_to_stream_id; - gpu_stream_util::AssignStreamsOpts opts; - opts.max_streams = 100; - opts.const_stream = 90; - opts.send_stream = 91; - opts.recv_stream = 92; - opts.compute_stream = 93; - TF_ASSERT_OK(gpu_stream_util::AssignStreams(&g, opts, &node_to_stream_id)); - - // There should be 7 nodes assigned. - EXPECT_EQ(7, node_to_stream_id.size()); // including _SOURCE and _SINK - - // Nodes should be assigned to streams by op type. - for (const auto& it : node_to_stream_id) { - Node* n = g.FindNodeId(it.first); - const string& op = n->type_string(); - const int stream = it.second; - if (op == "Const") { - EXPECT_EQ(stream, 90); - } else if (op == "_Send") { - EXPECT_EQ(stream, 91); - } else if (op == "_Recv") { - EXPECT_EQ(stream, 92); - } else { // Compute. - EXPECT_EQ(stream, 93); - } - } -} - -} // namespace -} // namespace tensorflow diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h index 81ddf8df98d..3415c7f23fc 100644 --- a/tensorflow/core/framework/device_base.h +++ b/tensorflow/core/framework/device_base.h @@ -119,9 +119,6 @@ class DeviceContext : public core::RefCounted { } }; -// map[i] is the DeviceContext* for the node with id i, if i < map.size(). -typedef std::vector DeviceContextMap; - class DeviceBase { public: explicit DeviceBase(Env* env) : env_(env) {} From 21896de7bc18e5ac0fb876c86e05adda0669e547 Mon Sep 17 00:00:00 2001 From: Robert David Date: Mon, 1 Jun 2020 12:44:39 -0700 Subject: [PATCH 1483/1533] Use RetrieveBuiltinData and RetrieveCustomInitialData instead of directly accessing builtin_data and custom_initial_data. 
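An illustrative sketch of the before/after pattern this change applies throughout model_builder.cc (TfLiteAddParams stands in for whichever params struct a given parser reads; the static_cast template argument is spelled out here for readability):

    // Before: cast the raw pointer and null-check it by hand.
    const auto* tf_options =
        static_cast<const TfLiteAddParams*>(tflite_node->builtin_data);
    if (!tf_options) {
      return absl::InternalError("Missing tflite params");
    }

    // After: the helper performs the cast and reports the error.
    const TfLiteAddParams* tf_options;
    RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options));

RetrieveCustomInitialData follows the same shape for ops that carry their parameters in custom_initial_data (e.g. the transposed-convolution and max-unpooling parsers below).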
PiperOrigin-RevId: 314183617 Change-Id: I8494bf3fe28fc8e3eeba21611beccc48354eac96 --- .../delegates/gpu/common/model_builder.cc | 132 +++++++----------- 1 file changed, 49 insertions(+), 83 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 29c819f7800..4501ec0f0e0 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -422,11 +422,8 @@ class AddOperationParser : public TFLiteOperationParser { AddAttributes attr; RETURN_IF_ERROR(ParseInputsWithConstTensor(node, reader, &attr.param)); node->operation.attributes = std::move(attr); - const auto* tf_options = - static_cast(tflite_node->builtin_data); - if (!tf_options) { - return absl::InternalError("Missing tflite params"); - } + const TfLiteAddParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); return MaybeFuseActivationToTheSingleOutput(tf_options->activation, graph, node); } @@ -499,11 +496,8 @@ class ConcatenationOperationParser : public TFLiteOperationParser { break; } } - const auto* tf_options = static_cast( - tflite_node->builtin_data); - if (!tf_options) { - return absl::InternalError("Missing tflite params"); - } + const TfLiteConcatenationParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(MaybeFuseActivationToTheSingleOutput(tf_options->activation, graph, node)); node->operation.attributes = attr; @@ -601,11 +595,8 @@ class Conv2DOperationParser : public TFLiteOperationParser { } reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional - const auto* tf_options = - static_cast(tflite_node->builtin_data); - if (!tf_options) { - return absl::InternalError("Missing tflite params"); - } + const TfLiteConvParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); attr.strides = ToHW(tf_options->stride_height, tf_options->stride_width); attr.dilations = HW(tf_options->dilation_height_factor, tf_options->dilation_width_factor); @@ -639,17 +630,19 @@ class Convolution2DTransposeBiasParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddInput(node, 0)); RETURN_IF_ERROR(reader->AddOutputs(node)); - const auto* params = static_cast( - tflite_node->custom_initial_data); + const TfLiteTransposeConvParams* tf_options; + auto status = RetrieveCustomInitialData(tflite_node, &tf_options); + ConvolutionTransposedAttributes attr; - attr.stride = - params ? HW(params->stride_height, params->stride_width) : HW(1, 1); + attr.stride = status.ok() + ? HW(tf_options->stride_height, tf_options->stride_width) + : HW(1, 1); RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional - UpdatePadding(params->padding, graph->FindInputs(node->id)[0]->tensor.shape, - &attr); + UpdatePadding(status.ok() ? 
tf_options->padding : kTfLitePaddingUnknown, + graph->FindInputs(node->id)[0]->tensor.shape, &attr); node->operation.attributes = std::move(attr); return absl::OkStatus(); @@ -874,17 +867,15 @@ class ElementwiseOperationParser : public TFLiteOperationParser { TfLiteFusedActivation activation = kTfLiteActNone; switch (operation_type_) { case OperationType::SUB: { - const auto* tf_options = - static_cast(tflite_node->builtin_data); - if (tf_options != nullptr) { + const TfLiteSubParams* tf_options; + if (RetrieveBuiltinData(tflite_node, &tf_options).ok()) { activation = tf_options->activation; } break; } case OperationType::DIV: { - const auto* tf_options = - static_cast(tflite_node->builtin_data); - if (tf_options != nullptr) { + const TfLiteDivParams* tf_options; + if (RetrieveBuiltinData(tflite_node, &tf_options).ok()) { activation = tf_options->activation; } break; @@ -1002,8 +993,8 @@ class FullyConnectedOperationParser : public TFLiteOperationParser { Node* node = graph->NewNode(); RETURN_IF_ERROR(reader->AddInput(node, 0)); - const auto* tf_options = static_cast( - tflite_node->builtin_data); + const TfLiteFullyConnectedParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); if (tf_options->weights_format != kTfLiteFullyConnectedWeightsFormatDefault) { return absl::UnimplementedError( @@ -1112,12 +1103,9 @@ class LSTMOperationParser : public TFLiteOperationParser { return absl::InvalidArgumentError("LSTM should have 4 output tensors"); } - const auto* params = - static_cast(tflite_node->builtin_data); - if (!params) { - return absl::InternalError("Missing tflite params"); - } - RETURN_IF_ERROR(CheckParameters(params)); + const TfLiteLSTMParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + RETURN_IF_ERROR(CheckParameters(tf_options)); Node* concat_node = graph->NewNode(); concat_node->operation.type = ToString(OperationType::CONCAT); @@ -1251,11 +1239,8 @@ class MulOperationParser : public TFLiteOperationParser { constant_dims, graph, reader)); } - const auto* tf_options = - static_cast(tflite_node->builtin_data); - if (!tf_options) { - return absl::InternalError("Missing TfLiteMulParams"); - } + const TfLiteMulParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); return MaybeFuseActivationToTheSingleOutput(tf_options->activation, graph, node); } @@ -1338,8 +1323,8 @@ class PadOperationParser : public TFLiteOperationParser { const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { if (mirror_pad_) { - auto* tf_options = static_cast( - tflite_node->builtin_data); + const TfLiteMirrorPaddingParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); if (tf_options->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect) { return absl::InvalidArgumentError( @@ -1444,14 +1429,9 @@ class Pooling2DOperationParser : public TFLiteOperationParser { // is MaxPoolingWithArgmax2D. There is no way to read // tflite_node->builtin_code, so, simply check whether custom data is // available. 
- auto* tf_options = - static_cast(tflite_node->custom_initial_data); - if (!tf_options) { - tf_options = - static_cast(tflite_node->builtin_data); - } - if (!tf_options) { - return absl::InternalError("Missing tflite params"); + const TfLitePoolParams* tf_options; + if (!RetrieveCustomInitialData(tflite_node, &tf_options).ok()) { + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); } std::vector max_tensor_id{0}; @@ -1637,10 +1617,8 @@ class Resize2DOperationParser : public TFLiteOperationParser { template absl::Status GetAlignCornersValueForType(const TfLiteNode* tflite_node, bool* align_corners) { - const auto* tf_options = static_cast(tflite_node->builtin_data); - if (!tf_options) { - return absl::InternalError("Missing tflite params"); - } + const T* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); *align_corners = tf_options->align_corners; return absl::OkStatus(); } @@ -1648,12 +1626,8 @@ class Resize2DOperationParser : public TFLiteOperationParser { absl::Status GetHalfPixelCentersValue(const TfLiteNode* tflite_node, bool* half_pixel_centers) { if (sampling_type_ == SamplingType::BILINEAR) { - const auto* tf_options = - static_cast(tflite_node->builtin_data); - if (!tf_options) { - return absl::InternalError( - "Missing tflite params for ResizeBilinear op"); - } + const TfLiteResizeBilinearParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); if (tf_options->align_corners && tf_options->half_pixel_centers) { return absl::InternalError( "If half_pixel_centers is True, align_corners must be False."); @@ -1809,11 +1783,8 @@ class SoftmaxOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddInput(node, 0)); RETURN_IF_ERROR(reader->AddOutputs(node)); - const auto* tf_options = - static_cast(tflite_node->builtin_data); - if (!tf_options) { - return absl::InternalError("Missing tflite params"); - } + const TfLiteSoftmaxParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); if (tf_options->beta != 1) { // there is multiply by scalar operation fused in softmax. Make a layer // out of it before softmax. 
@@ -1857,8 +1828,8 @@ class SpaceToDepthOperationParser : public TFLiteOperationParser { node->operation.type = ToString(OperationType::SPACE_TO_DEPTH); RETURN_IF_ERROR(reader->AddInput(node, 0)); RETURN_IF_ERROR(reader->AddOutputs(node)); - const auto* tf_options = - static_cast(tflite_node->builtin_data); + const TfLiteSpaceToDepthParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); SpaceToDepthAttributes attr; attr.block_size = tf_options->block_size; node->operation.attributes = attr; @@ -1898,14 +1869,12 @@ class StridedSliceOperationParser : public TFLiteOperationParser { "Slicing is supported for 3 or 4 dimensional tensors only."); } - const auto* tf_options = - static_cast(tflite_node->builtin_data); - auto out_shape = graph->FindOutputs(node->id)[0]->tensor.shape; - if (!tf_options) { - return absl::InternalError("Missing tflite params"); - } + const TfLiteStridedSliceParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckOptionsSupport(tf_options)); + auto out_shape = graph->FindOutputs(node->id)[0]->tensor.shape; + SliceAttributes attr; if (read_without_batch) { RETURN_IF_ERROR(ReadAttribsWithoutBatch(reader, tf_options, @@ -2074,11 +2043,9 @@ class TransposeConvOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(graph->AddConsumer(node->id, input->id)); RETURN_IF_ERROR(reader->AddOutputs(node)); - const auto* tf_options = static_cast( - tflite_node->builtin_data); - if (!tf_options) { - return absl::InternalError("Missing tflite options."); - } + const TfLiteTransposeConvParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + ConvolutionTransposedAttributes attr; attr.stride = tf_options ? HW(tf_options->stride_height, tf_options->stride_width) @@ -2157,11 +2124,10 @@ class Unpooling2DOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddOutputs(node)); auto input_shape = graph->FindInputs(node->id)[0]->tensor.shape; MaxUnpooling2DAttributes attr; - const auto* tf_options = - static_cast(tflite_node->custom_initial_data); - if (!tf_options) { - return absl::InternalError("Missing tflite params"); - } + + const TfLitePoolParams* tf_options; + RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); + attr.kernel = ToHW(tf_options->filter_height, tf_options->filter_width); attr.strides = ToHW(tf_options->stride_height, tf_options->stride_width); UpdatePadding(tf_options->padding, input_shape, &attr); From 2e02ff669a047969d2aa0992a4c032871ba8ffe2 Mon Sep 17 00:00:00 2001 From: Liam Miller-Cushon Date: Mon, 1 Jun 2020 12:53:34 -0700 Subject: [PATCH 1484/1533] Explicitly disable javac -parameters metadata for Java 7 compilations PiperOrigin-RevId: 314185291 Change-Id: I8688c8dfd65680bfeffd5832d3826f013af96d9d --- tensorflow/java/build_defs.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/java/build_defs.bzl b/tensorflow/java/build_defs.bzl index 04139bac065..85256700fae 100644 --- a/tensorflow/java/build_defs.bzl +++ b/tensorflow/java/build_defs.bzl @@ -5,6 +5,7 @@ JAVA_VERSION_OPTS = [ "-source 7 -target 7", + "-XDnoparameters", ] # A more robust set of lint and errorprone checks when building From 31defe9cf14e1773184d371b5b9e9dd3e22c072d Mon Sep 17 00:00:00 2001 From: Robert David Date: Mon, 1 Jun 2020 13:06:22 -0700 Subject: [PATCH 1485/1533] Use CheckMaxSupportedOpVersion instead of CheckExactSupportedOpVersion in LSTM; the only feature missing in version 1 is the support for full LSTM, what is 
already detected through settings. PiperOrigin-RevId: 314188090 Change-Id: Ia33989f0ebf67c01cb9ca1bf1817320bd438bff6 --- .../lite/delegates/gpu/common/model_builder.cc | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 4501ec0f0e0..475f2bade9c 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -257,18 +257,7 @@ absl::Status CheckMaxSupportedOpVersion(const TfLiteRegistration* registration, if (op_version > max_version) { return absl::UnimplementedError( absl::StrCat("Max version supported: ", max_version, - ". Requested version ", op_version)); - } - return absl::OkStatus(); -} - -absl::Status CheckExactSupportedOpVersion( - const TfLiteRegistration* registration, int expected_version) { - int op_version = registration->version; - if (op_version != expected_version) { - return absl::UnimplementedError( - absl::StrCat("Only version ", expected_version, - " is supported. Requested version ", op_version)); + ". Requested version ", op_version, ".")); } return absl::OkStatus(); } @@ -1082,7 +1071,7 @@ class LSTMOperationParser : public TFLiteOperationParser { absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckExactSupportedOpVersion(registration, 2)); + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 2)); // TODO(eignasheva): Fix bad check. // RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, // /*runtime_inputs=*/5, From e9728ac5c974f6304ae2ff915bb83f33b2206bb0 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 1 Jun 2020 13:25:58 -0700 Subject: [PATCH 1486/1533] Reorganized ElementwiseTwoInput. Added version for constant linear/HWC tensors. Removed MultiplyAdd operation (replaced by ElementwiseTwoInput). 
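A minimal usage sketch, adapted from the new elementwise_test.cc cases in this patch (the tensor's template arguments are assumed to be the usual Linear/FLOAT32 pair, and creation_context / op_def are set up as in the tests):

    // Constant per-channel operand used as the second input; it is uploaded
    // to GPU memory inside CreateElementwiseTwoInput.
    tflite::gpu::Tensor<Linear, DataType::FLOAT32> linear_tensor;
    linear_tensor.shape = Linear(2);
    linear_tensor.data = {0.5f, 2.0f};

    ElementwiseTwoInput operation;
    RETURN_IF_ERROR(CreateElementwiseTwoInput(creation_context, op_def,
                                              OperationType::MAXIMUM,
                                              linear_tensor, &operation));

An HWC-shaped constant tensor uses the second overload; broadcasting over width, height and channels is derived from the constant tensor's shape.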
PiperOrigin-RevId: 314191840 Change-Id: I58cf275c821249c24a2debfe09b66d9d7f90458a --- .../lite/delegates/gpu/cl/kernels/BUILD | 42 +-- .../delegates/gpu/cl/kernels/elementwise.cc | 269 ++++++++++++------ .../delegates/gpu/cl/kernels/elementwise.h | 85 ++++-- .../gpu/cl/kernels/elementwise_test.cc | 159 ++++++++--- .../delegates/gpu/cl/kernels/gpu_operation.cc | 2 +- .../delegates/gpu/cl/kernels/multiply_add.cc | 182 ------------ .../delegates/gpu/cl/kernels/multiply_add.h | 137 --------- .../gpu/cl/kernels/multiply_add_test.cc | 187 ------------ .../lite/delegates/gpu/cl/selectors/BUILD | 1 - .../gpu/cl/selectors/operation_selector.cc | 135 +++++---- .../gpu/cl/selectors/simple_selectors.cc | 23 -- .../gpu/cl/selectors/simple_selectors.h | 10 - tensorflow/lite/delegates/gpu/cl/tensor.cc | 16 +- tensorflow/lite/delegates/gpu/cl/tensor.h | 7 + 14 files changed, 476 insertions(+), 779 deletions(-) delete mode 100644 tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.cc delete mode 100644 tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h delete mode 100644 tensorflow/lite/delegates/gpu/cl/kernels/multiply_add_test.cc diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD index 24a62e5a82f..8b54078dbf8 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/kernels/BUILD @@ -682,7 +682,9 @@ cc_library( deps = [ ":gpu_operation", ":util", + "//tensorflow/lite/delegates/gpu/cl:storage_type_util", "//tensorflow/lite/delegates/gpu/common:operations", + "//tensorflow/lite/delegates/gpu/common:status", "@com_google_absl//absl/strings", ], ) @@ -874,45 +876,6 @@ cc_test( ], ) -cc_library( - name = "multiply_add", - srcs = ["multiply_add.cc"], - hdrs = ["multiply_add.h"], - deps = [ - ":flt_type", - ":gpu_operation", - ":util", - "//tensorflow/lite/delegates/gpu/cl:cl_context", - "//tensorflow/lite/delegates/gpu/cl:linear_storage", - "//tensorflow/lite/delegates/gpu/cl:precision", - "//tensorflow/lite/delegates/gpu/common:data_type", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:shape", - "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common:tensor", - "//tensorflow/lite/delegates/gpu/common:types", - "@com_google_absl//absl/types:variant", - ], -) - -cc_test( - name = "multiply_add_test", - srcs = ["multiply_add_test.cc"], - linkstatic = True, - tags = tf_gpu_tests_tags() + [ - "linux", - "local", - ], - deps = [ - ":cl_test", - ":multiply_add", - "//tensorflow/lite/delegates/gpu/common:operations", - "//tensorflow/lite/delegates/gpu/common:status", - "//tensorflow/lite/delegates/gpu/common:tensor", - "@com_google_googletest//:gtest_main", - ], -) - cc_library( name = "padding", srcs = ["padding.cc"], @@ -1456,7 +1419,6 @@ test_suite( "fully_connected_test", "lstm_test", "max_unpooling_test", - "multiply_add_test", "padding_test", "pooling_test", "prelu_test", diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc index e435bccef03..782cf357c40 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.cc @@ -20,10 +20,48 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/substitute.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" +#include "tensorflow/lite/delegates/gpu/cl/storage_type_util.h" namespace tflite { namespace gpu { namespace cl { +namespace { +std::string GetTwoInputCode(const OperationType& op_type, + const std::string& input0, + const std::string& input1) { + std::string result; + switch (op_type) { + case OperationType::ADD: + result += "$0 += $1;\n"; + break; + case OperationType::DIV: + result += "$0 /= $1;\n"; + break; + case OperationType::MAXIMUM: + result += "$0 = max($0, $1);\n"; + break; + case OperationType::MINIMUM: + result += "$0 = min($0, $1);\n"; + break; + case OperationType::MUL: + result += "$0 *= $1;\n"; + break; + case OperationType::POW: + result += "$0 = pow($0, $1);\n"; + break; + case OperationType::SQUARED_DIFF: + result += "$0 -= $1;\n"; + result += "$0 *= $0;\n"; + break; + case OperationType::SUB: + result += "$0 -= $1;\n"; + break; + default: + return "Unknown operation type;\n"; + } + return absl::Substitute(result, input0, input1); +} +} // namespace ElementwiseOneInput::ElementwiseOneInput(ElementwiseOneInput&& operation) : ElementwiseOperation(std::move(operation)), @@ -105,13 +143,66 @@ ElementwiseOneInput CreateElementwiseOneInput(const OperationDef& definition, return operation; } +ElementwiseOneRuntimeOneScalar::ElementwiseOneRuntimeOneScalar( + ElementwiseOneRuntimeOneScalar&& operation) + : ElementwiseOperation(std::move(operation)), + link_index_(operation.link_index_), + op_type_(operation.op_type_), + scalar_parameter_(std::move(operation.scalar_parameter_)) {} + +ElementwiseOneRuntimeOneScalar& ElementwiseOneRuntimeOneScalar::operator=( + ElementwiseOneRuntimeOneScalar&& operation) { + if (this != &operation) { + link_index_ = operation.link_index_; + op_type_ = operation.op_type_; + scalar_parameter_ = operation.scalar_parameter_; + ElementwiseOperation::operator=(std::move(operation)); + } + return *this; +} + +void ElementwiseOneRuntimeOneScalar::SetLinkIndex(int index) { + link_index_ = index; + scalar_parameter_.SetName(absl::StrCat("scalar_parmeter_", index)); +} + +std::string ElementwiseOneRuntimeOneScalar::GetCoreCode( + const LinkingContext& context) const { + std::string second_var = + absl::StrCat("(FLT)(", scalar_parameter_.GetName(), ")"); + return GetTwoInputCode(op_type_, context.var_name, second_var); +} + +std::string ElementwiseOneRuntimeOneScalar::GetArgsDeclaration() const { + std::string args; + absl::StrAppend(&args, ",\n ", scalar_parameter_.GetDeclaration()); + return args; +} + +absl::Status ElementwiseOneRuntimeOneScalar::BindArguments(CLKernel* kernel) { + RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_parameter_)); + return absl::OkStatus(); +} + +ElementwiseOneRuntimeOneScalar CreateElementwiseOneRuntimeOneScalar( + const CreationContext& creation_context, const OperationDef& definition, + const OperationType& op_type, float scalar_parameter) { + const auto scalar_precision = creation_context.device->IsPowerVR() + ? 
CalculationsPrecision::F32 + : definition.precision; + ElementwiseOneRuntimeOneScalar operation( + definition, op_type, FLT(scalar_precision, scalar_parameter)); + operation.SetLinkIndex(0); + return operation; +} + ElementwiseTwoInput::ElementwiseTwoInput(ElementwiseTwoInput&& operation) : ElementwiseOperation(std::move(operation)), link_index_(operation.link_index_), op_type_(operation.op_type_), broadcast_(operation.broadcast_), - scalar_para_(operation.scalar_para_), - use_scalar_para_(operation.use_scalar_para_) {} + use_constant_tensor_(operation.use_constant_tensor_), + constant_tensor_(std::move(operation.constant_tensor_)) {} ElementwiseTwoInput& ElementwiseTwoInput::operator=( ElementwiseTwoInput&& operation) { @@ -119,8 +210,8 @@ ElementwiseTwoInput& ElementwiseTwoInput::operator=( link_index_ = operation.link_index_; op_type_ = operation.op_type_; broadcast_ = operation.broadcast_; - scalar_para_ = operation.scalar_para_; - use_scalar_para_ = operation.use_scalar_para_; + use_constant_tensor_ = operation.use_constant_tensor_; + constant_tensor_ = std::move(operation.constant_tensor_); ElementwiseOperation::operator=(std::move(operation)); } return *this; @@ -128,84 +219,51 @@ ElementwiseTwoInput& ElementwiseTwoInput::operator=( void ElementwiseTwoInput::SetLinkIndex(int index) { link_index_ = index; - if (use_scalar_para_) { - scalar_para_.SetName(absl::StrCat("scalar_para_", index)); - } } std::string ElementwiseTwoInput::GetCoreCode( const LinkingContext& context) const { std::string result; std::string second_var; - if (use_scalar_para_) { - second_var = absl::StrCat("(FLT)(", scalar_para_.GetName(), ")"); - } else { - const std::string size_name = "src_size_" + std::to_string(link_index_); - TensorCodeGenerator src_tensor( - absl::StrCat("src_data_", link_index_), - WHSPoint{size_name + ".x", size_name + ".y", size_name + ".z"}, - definition_.src_tensors[1]); - const std::string x_coord = broadcast_.width ? "0" : context.x_coord; - const std::string y_coord = broadcast_.height ? "0" : context.y_coord; - const std::string s_coord = broadcast_.channels ? "0" : context.s_coord; - second_var = "second_var_" + std::to_string(link_index_); - result = " FLT4 " + second_var + " = " + - src_tensor.ReadWHS(x_coord, y_coord, s_coord) + ";\n"; - if (broadcast_.channels) { - result += " " + second_var + ".y = " + second_var + ".x;\n"; - result += " " + second_var + ".z = " + second_var + ".x;\n"; - result += " " + second_var + ".w = " + second_var + ".x;\n"; - } + const std::string size_name = "src_size_" + std::to_string(link_index_); + TensorDescriptor descriptor = use_constant_tensor_ + ? constant_tensor_.GetDescriptor() + : definition_.src_tensors[1]; + TensorCodeGenerator src_tensor( + absl::StrCat("src_data_", link_index_), + WHSPoint{size_name + ".x", size_name + ".y", size_name + ".z"}, + descriptor); + const std::string x_coord = broadcast_.width ? "0" : context.x_coord; + const std::string y_coord = broadcast_.height ? "0" : context.y_coord; + const std::string s_coord = broadcast_.channels ? 
"0" : context.s_coord; + second_var = "second_var_" + std::to_string(link_index_); + result = " FLT4 " + second_var + " = " + + src_tensor.ReadWHS(x_coord, y_coord, s_coord) + ";\n"; + if (broadcast_.channels) { + result += " " + second_var + ".y = " + second_var + ".x;\n"; + result += " " + second_var + ".z = " + second_var + ".x;\n"; + result += " " + second_var + ".w = " + second_var + ".x;\n"; } - switch (op_type_) { - case OperationType::ADD: - result += "$0 += $1;\n"; - break; - case OperationType::DIV: - result += "$0 /= $1;\n"; - break; - case OperationType::MAXIMUM: - result += "$0 = max($0, $1);\n"; - break; - case OperationType::MINIMUM: - result += "$0 = min($0, $1);\n"; - break; - case OperationType::MUL: - result += "$0 *= $1;\n"; - break; - case OperationType::POW: - result += "$0 = pow($0, $1);\n"; - break; - case OperationType::SQUARED_DIFF: - result += "$0 -= $1;\n"; - result += "$0 *= $0;\n"; - break; - case OperationType::SUB: - result += "$0 -= $1;\n"; - break; - default: - return "Unknown operation type;\n"; - } - return absl::Substitute(result, context.var_name, second_var); + return result + GetTwoInputCode(op_type_, context.var_name, second_var); } std::string ElementwiseTwoInput::GetArgsDeclaration() const { std::string args; - if (use_scalar_para_) { - absl::StrAppend(&args, ",\n ", scalar_para_.GetDeclaration()); - } else { - absl::StrAppend(&args, ",\n", - GetTensorDeclaration(AccessType::READ, - absl::StrCat("src_data_", link_index_), - definition_.src_tensors[1])); - absl::StrAppend(&args, ",\n int4 src_size_", link_index_); - } + TensorDescriptor descriptor = use_constant_tensor_ + ? constant_tensor_.GetDescriptor() + : definition_.src_tensors[1]; + absl::StrAppend( + &args, ",\n", + GetTensorDeclaration(AccessType::READ, + absl::StrCat("src_data_", link_index_), descriptor)); + absl::StrAppend(&args, ",\n int4 src_size_", link_index_); return args; } absl::Status ElementwiseTwoInput::BindArguments(CLKernel* kernel) { - if (use_scalar_para_) { - RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_para_)); + if (use_constant_tensor_) { + RETURN_IF_ERROR(kernel->SetMemoryAuto(constant_tensor_.GetMemoryPtr())); + RETURN_IF_ERROR(kernel->SetBytesAuto(constant_tensor_.GetWBatchedHSB())); } else { RETURN_IF_ERROR(kernel->SetMemoryAuto(src_[1]->GetMemoryPtr())); RETURN_IF_ERROR(kernel->SetBytesAuto(src_[1]->GetWBatchedHSB())); @@ -213,27 +271,68 @@ absl::Status ElementwiseTwoInput::BindArguments(CLKernel* kernel) { return absl::OkStatus(); } -ElementwiseTwoInput CreateElementwiseTwoInput( +absl::Status CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, - const OperationType& op_type, const BroadcastSettings& broadcast, - const ElementwiseAttributes* attr) { - ElementwiseTwoInput operation(definition, op_type, broadcast); - if (attr) { - const float* scalar = absl::get_if(&attr->param); - if (scalar) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? 
CalculationsPrecision::F32 - : definition.precision; - operation.SetScalarPara(FLT(scalar_precision, *scalar)); - } - } - operation.SetLinkIndex(0); - return operation; + const OperationType& op_type, + const tflite::gpu::Tensor& constant_tensor, + ElementwiseTwoInput* result) { + const BHWC shape = BHWC(1, 1, 1, constant_tensor.shape.v); + TensorStorageType storage_type = + SelectBestStorageType(*creation_context.context, *creation_context.device, + shape, definition.GetPrimaryStorageType(), + definition.GetDataType(), Layout::HWC); + TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC}; + Tensor gpu_tensor; + RETURN_IF_ERROR(CreateTensor(*creation_context.context, + *creation_context.device, shape, desc, + &gpu_tensor)); + RETURN_IF_ERROR( + gpu_tensor.WriteData(creation_context.queue, constant_tensor)); + BroadcastSettings broadcast; + broadcast.width = true; + broadcast.height = true; + broadcast.channels = shape.c == 1; + *result = ElementwiseTwoInput(definition, op_type, broadcast, + std::move(gpu_tensor)); + result->SetLinkIndex(0); + return absl::OkStatus(); } -ElementwiseTwoInput CreateElementwiseTwoInput( - const OperationDef& definition, const OperationType& op_type, - const BroadcastSettings& broadcast) { +absl::Status CreateElementwiseTwoInput( + const CreationContext& creation_context, const OperationDef& definition, + const OperationType& op_type, + const tflite::gpu::Tensor& constant_tensor, + ElementwiseTwoInput* result) { + const BHWC shape = BHWC(1, constant_tensor.shape.h, constant_tensor.shape.w, + constant_tensor.shape.c); + TensorStorageType storage_type = + SelectBestStorageType(*creation_context.context, *creation_context.device, + shape, definition.GetPrimaryStorageType(), + definition.GetDataType(), Layout::HWC); + TensorDescriptor desc{definition.GetDataType(), storage_type, Layout::HWC}; + Tensor gpu_tensor; + RETURN_IF_ERROR(CreateTensor(*creation_context.context, + *creation_context.device, shape, desc, + &gpu_tensor)); + RETURN_IF_ERROR( + gpu_tensor.WriteData(creation_context.queue, constant_tensor)); + BroadcastSettings broadcast; + broadcast.width = shape.w == 1; + broadcast.height = shape.h == 1; + broadcast.channels = shape.c == 1; + *result = ElementwiseTwoInput(definition, op_type, broadcast, + std::move(gpu_tensor)); + result->SetLinkIndex(0); + return absl::OkStatus(); +} + +ElementwiseTwoInput CreateElementwiseTwoInput(const OperationDef& definition, + const OperationType& op_type, + const BHWC& shape) { + BroadcastSettings broadcast; + broadcast.width = shape.w == 1; + broadcast.height = shape.h == 1; + broadcast.channels = shape.c == 1; ElementwiseTwoInput operation(definition, op_type, broadcast); operation.SetLinkIndex(0); return operation; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h index 4c85fee6071..79467da0077 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise.h @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" #include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" namespace tflite { namespace gpu { @@ -48,23 +49,71 @@ class ElementwiseOneInput : public ElementwiseOperation { ElementwiseOneInput CreateElementwiseOneInput(const OperationDef& definition, const OperationType& op_type); +// Class for simple two input (first input is runtime tensor and second input is +// scalar argument) operations without any parameters, for example sub, div and +// etc. +class ElementwiseOneRuntimeOneScalar : public ElementwiseOperation { + public: + ElementwiseOneRuntimeOneScalar(const OperationDef& definition, + const OperationType& op_type, + FLT scalar_parameter) + : ElementwiseOperation(definition), + op_type_(op_type), + scalar_parameter_(scalar_parameter) {} + + // Move only + ElementwiseOneRuntimeOneScalar(ElementwiseOneRuntimeOneScalar&& operation); + ElementwiseOneRuntimeOneScalar& operator=( + ElementwiseOneRuntimeOneScalar&& operation); + ElementwiseOneRuntimeOneScalar(const ElementwiseOneRuntimeOneScalar&) = + delete; + ElementwiseOneRuntimeOneScalar& operator=( + const ElementwiseOneRuntimeOneScalar&) = delete; + + void SetLinkIndex(int index) override; + std::string GetCoreCode(const LinkingContext& context) const override; + std::string GetArgsDeclaration() const override; + absl::Status BindArguments(CLKernel* kernel) override; + + private: + int link_index_; + OperationType op_type_; + FLT scalar_parameter_; +}; + +ElementwiseOneRuntimeOneScalar CreateElementwiseOneRuntimeOneScalar( + const CreationContext& creation_context, const OperationDef& definition, + const OperationType& op_type, float scalar_parameter); + struct BroadcastSettings { bool width; bool height; bool channels; }; -// Class for simple two input operations without any parameters, for example +// Class for simple two input(first input is runtime tensor and second input is +// runtime or constant tensor) operations without any parameters, for example // sub, div and etc. 
class ElementwiseTwoInput : public ElementwiseOperation { public: - explicit ElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type, - const BroadcastSettings& broadcast) + ElementwiseTwoInput() = default; + ElementwiseTwoInput(const OperationDef& definition, + const OperationType& op_type, + const BroadcastSettings& broadcast) : ElementwiseOperation(definition), op_type_(op_type), broadcast_(broadcast), - use_scalar_para_(false) {} + use_constant_tensor_(false) {} + + ElementwiseTwoInput(const OperationDef& definition, + const OperationType& op_type, + const BroadcastSettings& broadcast, + Tensor&& constant_tensor) + : ElementwiseOperation(definition), + op_type_(op_type), + broadcast_(broadcast), + use_constant_tensor_(true), + constant_tensor_(std::move(constant_tensor)) {} // Move only ElementwiseTwoInput(ElementwiseTwoInput&& operation); @@ -76,30 +125,30 @@ class ElementwiseTwoInput : public ElementwiseOperation { std::string GetCoreCode(const LinkingContext& context) const override; std::string GetArgsDeclaration() const override; absl::Status BindArguments(CLKernel* kernel) override; - inline void SetScalarPara(FLT scalar) { - scalar_para_ = scalar; - use_scalar_para_ = true; - } private: int link_index_; OperationType op_type_; BroadcastSettings broadcast_; - FLT scalar_para_; - bool use_scalar_para_; + bool use_constant_tensor_; + Tensor constant_tensor_; }; -ElementwiseTwoInput CreateElementwiseTwoInput( +absl::Status CreateElementwiseTwoInput( const CreationContext& creation_context, const OperationDef& definition, - const OperationType& op_type, const BroadcastSettings& broadcast, - const ElementwiseAttributes* attr); + const OperationType& op_type, + const tflite::gpu::Tensor& constant_tensor, + ElementwiseTwoInput* result); -ElementwiseTwoInput CreateElementwiseTwoInput( - const OperationDef& definition, const OperationType& op_type, - const BroadcastSettings& broadcast); +absl::Status CreateElementwiseTwoInput( + const CreationContext& creation_context, const OperationDef& definition, + const OperationType& op_type, + const tflite::gpu::Tensor& constant_tensor, + ElementwiseTwoInput* result); ElementwiseTwoInput CreateElementwiseTwoInput(const OperationDef& definition, - const OperationType& op_type); + const OperationType& op_type, + const BHWC& shape); } // namespace cl } // namespace gpu diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc index 7a3aaecfe7f..7c3bdbe66e7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/elementwise_test.cc @@ -329,8 +329,8 @@ TEST_F(OpenCLOperationTest, Sub) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::SUB); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::SUB, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -357,8 +357,8 @@ TEST_F(OpenCLOperationTest, SquaredDiff) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::SQUARED_DIFF); + 
ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::SQUARED_DIFF, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -385,8 +385,8 @@ TEST_F(OpenCLOperationTest, Div) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::DIV); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::DIV, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -413,8 +413,8 @@ TEST_F(OpenCLOperationTest, Pow) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::POW); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::POW, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -441,8 +441,8 @@ TEST_F(OpenCLOperationTest, Add) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::ADD); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::ADD, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -452,7 +452,7 @@ TEST_F(OpenCLOperationTest, Add) { } } -TEST_F(OpenCLOperationTest, Maxiumum) { +TEST_F(OpenCLOperationTest, Maximum) { TensorFloat32 src_tensor_0, src_tensor_1; src_tensor_0.shape = BHWC(1, 2, 1, 2); src_tensor_1.shape = BHWC(1, 2, 1, 2); @@ -469,8 +469,8 @@ TEST_F(OpenCLOperationTest, Maxiumum) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::MAXIMUM); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::MAXIMUM, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -480,7 +480,7 @@ TEST_F(OpenCLOperationTest, Maxiumum) { } } -TEST_F(OpenCLOperationTest, MaxiumumWithScalar) { +TEST_F(OpenCLOperationTest, MaximumWithScalar) { TensorFloat32 src_tensor_0; src_tensor_0.shape = BHWC(1, 4, 1, 1); src_tensor_0.data = {0.0f, -6.2f, 2.0f, -3.0f}; @@ -497,9 +497,10 @@ TEST_F(OpenCLOperationTest, MaxiumumWithScalar) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - BroadcastSettings broadcast; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( - creation_context_, op_def, OperationType::MAXIMUM, broadcast, &attr); + const float* scalar = absl::get_if(&attr.param); + ElementwiseOneRuntimeOneScalar operation = + CreateElementwiseOneRuntimeOneScalar(creation_context_, op_def, + OperationType::MAXIMUM, *scalar); 
ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, BHWC(1, 4, 1, 1), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -508,6 +509,97 @@ TEST_F(OpenCLOperationTest, MaxiumumWithScalar) { } } +TEST_F(OpenCLOperationTest, MaximumWithConstantLinearTensor) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {1.0f, -6.2f, -2.0f, 3.0f}; + + ::tflite::gpu::Tensor linear_tensor; + linear_tensor.shape = Linear(2); + linear_tensor.data = {0.5f, 2.0f}; + ElementwiseAttributes attr; + attr.param = linear_tensor; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + ElementwiseTwoInput operation; + ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def, + OperationType::MAXIMUM, linear_tensor, + &operation)); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {1.0f, 2.0f, 0.5f, 3.0f})); + } + } +} + +TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensor) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {1.0f, -6.2f, -2.0f, 3.0f}; + + ::tflite::gpu::Tensor hwc_tensor; + hwc_tensor.shape = HWC(2, 1, 2); + hwc_tensor.data = {0.5f, 2.0f, 0.7f, 4.7f}; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + ElementwiseTwoInput operation; + ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def, + OperationType::MAXIMUM, hwc_tensor, + &operation)); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {1.0f, 2.0f, 0.7f, 4.7f})); + } + } +} +TEST_F(OpenCLOperationTest, MaximumWithConstantHWCTensorBroadcastChannels) { + TensorFloat32 src_tensor_0; + src_tensor_0.shape = BHWC(1, 2, 1, 2); + src_tensor_0.data = {1.0f, -6.2f, -2.0f, 3.0f}; + + ::tflite::gpu::Tensor hwc_tensor; + hwc_tensor.shape = HWC(2, 1, 1); + hwc_tensor.data = {0.5f, 2.0f}; + + for (auto storage : env_.GetSupportedStorages()) { + for (auto precision : env_.GetSupportedPrecisions()) { + const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-2f; + OperationDef op_def; + op_def.precision = precision; + auto data_type = DeduceDataTypeFromPrecision(precision); + op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); + op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); + TensorFloat32 dst_tensor; + ElementwiseTwoInput operation; + ASSERT_OK(CreateElementwiseTwoInput(creation_context_, op_def, + OperationType::MAXIMUM, hwc_tensor, + &operation)); + ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, + BHWC(1, 2, 1, 2), &dst_tensor)); + EXPECT_THAT(dst_tensor.data, + Pointwise(FloatNear(eps), {1.0f, 0.5f, 2.0f, 3.0f})); + } + } +} + TEST_F(OpenCLOperationTest, Minimum) { TensorFloat32 src_tensor_0, src_tensor_1; src_tensor_0.shape = BHWC(1, 2, 1, 2); @@ -525,8 +617,8 @@ TEST_F(OpenCLOperationTest, Minimum) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::MINIMUM); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::MINIMUM, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -553,9 +645,10 @@ TEST_F(OpenCLOperationTest, MinimumWithScalar) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - BroadcastSettings broadcast; - ElementwiseTwoInput operation = CreateElementwiseTwoInput( - creation_context_, op_def, OperationType::MINIMUM, broadcast, &attr); + const float* scalar = absl::get_if(&attr.param); + ElementwiseOneRuntimeOneScalar operation = + CreateElementwiseOneRuntimeOneScalar(creation_context_, op_def, + OperationType::MINIMUM, *scalar); ASSERT_OK(ExecuteGPUOperation(src_tensor_0, creation_context_, &operation, BHWC(1, 4, 1, 1), &dst_tensor)); EXPECT_THAT(dst_tensor.data, @@ -581,8 +674,8 @@ TEST_F(OpenCLOperationTest, Mul) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::MUL); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::MUL, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -599,11 +692,6 @@ TEST_F(OpenCLOperationTest, MulBroadcastHW) { src_tensor_0.data = {1.0f, 2.0f, 3.0f, 4.5f}; src_tensor_1.data = {0.5f, 3.0f}; - BroadcastSettings broadcast; - broadcast.width = true; - broadcast.height = true; - broadcast.channels = false; - for (auto storage : env_.GetSupportedStorages()) { for (auto precision : env_.GetSupportedPrecisions()) { const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-2f; @@ -614,8 +702,8 @@ TEST_F(OpenCLOperationTest, MulBroadcastHW) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::MUL, broadcast); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::MUL, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); @@ -632,11 +720,6 @@ TEST_F(OpenCLOperationTest, MulBroadcastChannels) { src_tensor_0.data = {1.0f, 2.0f, 3.0f, 4.5f}; src_tensor_1.data = {0.5f, 3.0f}; - BroadcastSettings broadcast; - broadcast.width = false; - broadcast.height = false; - broadcast.channels = true; - for (auto storage : env_.GetSupportedStorages()) { for (auto precision : env_.GetSupportedPrecisions()) { const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-2f; @@ -647,8 +730,8 @@ TEST_F(OpenCLOperationTest, MulBroadcastChannels) { op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); TensorFloat32 dst_tensor; - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, OperationType::MUL, broadcast); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, OperationType::MUL, src_tensor_1.shape); ASSERT_OK(ExecuteGPUOperation({src_tensor_0, src_tensor_1}, creation_context_, &operation, BHWC(1, 2, 1, 2), &dst_tensor)); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc index 9f4c9871123..3aa01981844 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.cc @@ -205,7 +205,7 @@ std::string PostProcess(const std::vector& linked_ops, const LinkingContext& context) { std::string code; for (auto linked_op : linked_ops) { - code += linked_op->GetCoreCode(context); + code += "{" + linked_op->GetCoreCode(context) + "}"; } return code; } diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.cc b/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.cc deleted file mode 100644 index 5b9b78f073c..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h" - -#include "absl/types/variant.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/util.h" -#include "tensorflow/lite/delegates/gpu/cl/precision.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/shape.h" -#include "tensorflow/lite/delegates/gpu/common/tensor.h" - -namespace tflite { -namespace gpu { -namespace cl { - -MultiplyAdd::MultiplyAdd(MultiplyAdd&& operation) - : ElementwiseOperation(std::move(operation)), - mul_vec_(std::move(operation.mul_vec_)), - add_vec_(std::move(operation.add_vec_)), - use_mul_vec_(operation.use_mul_vec_), - use_add_vec_(operation.use_add_vec_), - scalar_mul_(std::move(operation.scalar_mul_)), - scalar_add_(std::move(operation.scalar_add_)) {} - -MultiplyAdd& MultiplyAdd::operator=(MultiplyAdd&& operation) { - if (this != &operation) { - mul_vec_ = std::move(operation.mul_vec_); - add_vec_ = std::move(operation.add_vec_); - use_mul_vec_ = operation.use_mul_vec_; - use_add_vec_ = operation.use_add_vec_; - scalar_mul_ = std::move(operation.scalar_mul_); - scalar_add_ = std::move(operation.scalar_add_); - ElementwiseOperation::operator=(std::move(operation)); - } - return *this; -} - -void MultiplyAdd::SetLinkIndex(int index) { - scalar_mul_.SetName(absl::StrCat("mad_scalar_mul_", index)); - scalar_add_.SetName(absl::StrCat("mad_scalar_add_", index)); - mul_vec_.SetName(absl::StrCat("mad_mul_", index)); - add_vec_.SetName(absl::StrCat("mad_add_", index)); -} - -std::string MultiplyAdd::GetCoreCode(const LinkingContext& context) const { - std::string result = absl::StrCat(context.var_name, " = ", context.var_name); - if (use_mul_vec_) { - absl::StrAppend(&result, " * ", mul_vec_.ReadLinearFLT4(context.s_coord)); - } - if (scalar_mul_.Active()) { - absl::StrAppend(&result, " * (FLT)(", scalar_mul_.GetName(), ")"); - } - if (use_add_vec_) { - absl::StrAppend(&result, " + ", add_vec_.ReadLinearFLT4(context.s_coord)); - } - if (scalar_add_.Active()) { - absl::StrAppend(&result, " + (FLT)(", scalar_add_.GetName(), ")"); - } - return absl::StrCat(result, ";\n"); -} - -std::string MultiplyAdd::GetArgsDeclaration() const { - std::string args; - if (use_mul_vec_) { - absl::StrAppend(&args, ",\n ", mul_vec_.GetDeclaration()); - } - if (use_add_vec_) { - absl::StrAppend(&args, ",\n ", add_vec_.GetDeclaration()); - } - if (scalar_mul_.Active()) { - absl::StrAppend(&args, ",\n ", scalar_mul_.GetDeclaration()); - } - if (scalar_add_.Active()) { - absl::StrAppend(&args, ",\n ", scalar_add_.GetDeclaration()); - } - return args; -} - -absl::Status MultiplyAdd::BindArguments(CLKernel* kernel) { - if (use_mul_vec_) { - RETURN_IF_ERROR(kernel->SetMemoryAuto(mul_vec_.GetMemoryPtr())); - } - if (use_add_vec_) { - RETURN_IF_ERROR(kernel->SetMemoryAuto(add_vec_.GetMemoryPtr())); - } - if (scalar_mul_.Active()) { - RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_mul_)); - } - if (scalar_add_.Active()) { - RETURN_IF_ERROR(kernel->SetBytesAuto(scalar_add_)); - } - return absl::OkStatus(); -} - -absl::Status MultiplyAdd::UploadMul(const MultiplyAttributes& attr, - CalculationsPrecision scalar_precision, - CLContext* context) { - auto mul = - absl::get_if>(&attr.param); - auto mul_scalar = absl::get_if(&attr.param); - if (mul) { - RETURN_IF_ERROR(UploadMul(*mul, context)); - } else { - scalar_mul_ = FLT(scalar_precision, *mul_scalar); - } - return absl::OkStatus(); -} - 
-absl::Status MultiplyAdd::UploadAdd(const AddAttributes& attr, - CalculationsPrecision scalar_precision, - CLContext* context) { - auto add = - absl::get_if>(&attr.param); - auto add_scalar = absl::get_if(&attr.param); - if (add) { - RETURN_IF_ERROR(UploadAdd(*add, context)); - } else { - scalar_add_ = FLT(scalar_precision, *add_scalar); - } - return absl::OkStatus(); -} - -absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const MultiplyAttributes& attr, - MultiplyAdd* result) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; - *result = MultiplyAdd(definition); - RETURN_IF_ERROR( - result->UploadMul(attr, scalar_precision, creation_context.context)); - result->SetLinkIndex(0); - return absl::OkStatus(); -} - -absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const AddAttributes& attr, MultiplyAdd* result) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; - *result = MultiplyAdd(definition); - RETURN_IF_ERROR( - result->UploadAdd(attr, scalar_precision, creation_context.context)); - result->SetLinkIndex(0); - return absl::OkStatus(); -} - -absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const MultiplyAttributes& mul_attr, - const AddAttributes& add_attr, - MultiplyAdd* result) { - const auto scalar_precision = creation_context.device->IsPowerVR() - ? CalculationsPrecision::F32 - : definition.precision; - *result = MultiplyAdd(definition); - RETURN_IF_ERROR( - result->UploadMul(mul_attr, scalar_precision, creation_context.context)); - RETURN_IF_ERROR( - result->UploadAdd(add_attr, scalar_precision, creation_context.context)); - result->SetLinkIndex(0); - return absl::OkStatus(); -} - -} // namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h b/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h deleted file mode 100644 index 2e222f785f6..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h +++ /dev/null @@ -1,137 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_ -#define TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_ - -#include - -#include "tensorflow/lite/delegates/gpu/cl/cl_context.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/flt_type.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/gpu_operation.h" -#include "tensorflow/lite/delegates/gpu/cl/linear_storage.h" -#include "tensorflow/lite/delegates/gpu/common/data_type.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/types.h" - -namespace tflite { -namespace gpu { -namespace cl { - -class MultiplyAdd : public ElementwiseOperation { - public: - // Move only - MultiplyAdd() = default; - MultiplyAdd(MultiplyAdd&& operation); - MultiplyAdd& operator=(MultiplyAdd&& operation); - MultiplyAdd(const MultiplyAdd&) = delete; - MultiplyAdd& operator=(const MultiplyAdd&) = delete; - - absl::Status UploadMul(const MultiplyAttributes& attr, - CalculationsPrecision scalar_precision, - CLContext* context); - absl::Status UploadAdd(const AddAttributes& attr, - CalculationsPrecision scalar_precision, - CLContext* context); - - template - absl::Status UploadMul(const tflite::gpu::Tensor& mul, - CLContext* context); - - template - absl::Status UploadAdd(const tflite::gpu::Tensor& add, - CLContext* context); - - void SetLinkIndex(int index) override; - std::string GetCoreCode(const LinkingContext& context) const override; - - std::string GetArgsDeclaration() const override; - absl::Status BindArguments(CLKernel* kernel) override; - - friend absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const MultiplyAttributes& attr, - MultiplyAdd* result); - - friend absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const AddAttributes& attr, - MultiplyAdd* result); - - friend absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const MultiplyAttributes& mul_attr, - const AddAttributes& add_attr, - MultiplyAdd* result); - - private: - explicit MultiplyAdd(const OperationDef& definition) - : ElementwiseOperation(definition), - use_mul_vec_(false), - use_add_vec_(false) {} - - LinearStorage mul_vec_; - LinearStorage add_vec_; - bool use_mul_vec_; - bool use_add_vec_; - FLT scalar_mul_; - FLT scalar_add_; -}; - -absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const MultiplyAttributes& attr, - MultiplyAdd* result); - -absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const AddAttributes& attr, MultiplyAdd* result); - -absl::Status CreateMultiplyAdd(const CreationContext& creation_context, - const OperationDef& definition, - const MultiplyAttributes& mul_attr, - const AddAttributes& add_attr, - MultiplyAdd* result); - -template -absl::Status MultiplyAdd::UploadMul(const tflite::gpu::Tensor& mul, - CLContext* context) { - LinearStorageCreateInfo create_info; - create_info.storage_type = - DeduceLinearStorageType(definition_.GetPrimaryStorageType()); - create_info.data_type = definition_.GetDataType(); - RETURN_IF_ERROR(CreateLinearStorage(create_info, mul, context, &mul_vec_)); - use_mul_vec_ = true; - return absl::OkStatus(); -} - -template -absl::Status 
MultiplyAdd::UploadAdd(const tflite::gpu::Tensor& add, - CLContext* context) { - LinearStorageCreateInfo create_info; - create_info.storage_type = - DeduceLinearStorageType(definition_.GetPrimaryStorageType()); - create_info.data_type = definition_.GetDataType(); - RETURN_IF_ERROR(CreateLinearStorage(create_info, add, context, &add_vec_)); - use_add_vec_ = true; - return absl::OkStatus(); -} - -} // namespace cl -} // namespace gpu -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_GPU_CL_KERNELS_MULTIPLY_ADD_H_ diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add_test.cc b/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add_test.cc deleted file mode 100644 index 444a380c2e9..00000000000 --- a/tensorflow/lite/delegates/gpu/cl/kernels/multiply_add_test.cc +++ /dev/null @@ -1,187 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h" - -#include - -#include -#include -#include "tensorflow/lite/delegates/gpu/cl/kernels/cl_test.h" -#include "tensorflow/lite/delegates/gpu/common/operations.h" -#include "tensorflow/lite/delegates/gpu/common/status.h" -#include "tensorflow/lite/delegates/gpu/common/tensor.h" - -using ::testing::FloatNear; -using ::testing::Pointwise; - -namespace tflite { -namespace gpu { -namespace cl { -namespace { - -TEST_F(OpenCLOperationTest, MultiplyAddVectorMul) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 1, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f}; - - MultiplyAttributes attr; - ::tflite::gpu::Tensor parameters; - parameters.shape = Linear(2); - parameters.data = {0.5f, 2.0f}; - attr.param = parameters; - - for (auto storage : env_.GetSupportedStorages()) { - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); - op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); - TensorFloat32 dst_tensor; - MultiplyAdd operation; - ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation)); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 1, 2), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {0.0f, 2.0f, 1.0f, 6.0f})); - } - } -} - -TEST_F(OpenCLOperationTest, MultiplyAddVectorAdd) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 1, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f}; - - AddAttributes attr; - ::tflite::gpu::Tensor parameters; - parameters.shape = Linear(2); - parameters.data = {0.5f, 2.0f}; - attr.param = parameters; - - for (auto storage : env_.GetSupportedStorages()) { - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); - op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); - TensorFloat32 dst_tensor; - MultiplyAdd operation; - ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation)); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 1, 2), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {0.5f, 3.0f, 2.5f, 5.0f})); - } - } -} - -TEST_F(OpenCLOperationTest, MultiplyAddScalarMul) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 1, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f}; - - MultiplyAttributes attr; - attr.param = 0.5f; - - for (auto storage : env_.GetSupportedStorages()) { - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); - op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); - TensorFloat32 dst_tensor; - MultiplyAdd operation; - ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation)); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 1, 2), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {0.0f, 0.5f, 1.0f, 1.5f})); - } - } -} - -TEST_F(OpenCLOperationTest, MultiplyAddScalarAdd) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 1, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f}; - - AddAttributes attr; - attr.param = -0.5f; - - for (auto storage : env_.GetSupportedStorages()) { - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 
1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); - op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); - TensorFloat32 dst_tensor; - MultiplyAdd operation; - ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, attr, &operation)); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 1, 2), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {-0.5f, 0.5f, 1.5f, 2.5f})); - } - } -} - -TEST_F(OpenCLOperationTest, MultiplyAddVectorMad) { - TensorFloat32 src_tensor; - src_tensor.shape = BHWC(1, 2, 1, 2); - src_tensor.data = {0.0f, 1.0f, 2.0f, 3.0f}; - - MultiplyAttributes mul_attr; - ::tflite::gpu::Tensor parameters; - parameters.shape = Linear(2); - parameters.data = {0.5f, 2.0f}; - mul_attr.param = parameters; - - AddAttributes add_attr; - parameters.data = {-0.5f, 0.5f}; - add_attr.param = parameters; - - for (auto storage : env_.GetSupportedStorages()) { - for (auto precision : env_.GetSupportedPrecisions()) { - const float eps = precision == CalculationsPrecision::F32 ? 1e-6f : 1e-3f; - OperationDef op_def; - op_def.precision = precision; - auto data_type = DeduceDataTypeFromPrecision(precision); - op_def.src_tensors.push_back({data_type, storage, Layout::HWC}); - op_def.dst_tensors.push_back({data_type, storage, Layout::HWC}); - TensorFloat32 dst_tensor; - MultiplyAdd operation; - ASSERT_OK(CreateMultiplyAdd(creation_context_, op_def, mul_attr, add_attr, - &operation)); - ASSERT_OK(ExecuteGPUOperation(src_tensor, creation_context_, &operation, - BHWC(1, 2, 1, 2), &dst_tensor)); - EXPECT_THAT(dst_tensor.data, - Pointwise(FloatNear(eps), {-0.5f, 2.5f, 0.5f, 6.5f})); - } - } -} - -} // namespace -} // namespace cl -} // namespace gpu -} // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD index 9650b53937a..ff196cfaf71 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/selectors/BUILD @@ -131,7 +131,6 @@ cc_library( "//tensorflow/lite/delegates/gpu/cl/kernels:lstm", "//tensorflow/lite/delegates/gpu/cl/kernels:max_unpooling", "//tensorflow/lite/delegates/gpu/cl/kernels:mean", - "//tensorflow/lite/delegates/gpu/cl/kernels:multiply_add", "//tensorflow/lite/delegates/gpu/cl/kernels:padding", "//tensorflow/lite/delegates/gpu/cl/kernels:pooling", "//tensorflow/lite/delegates/gpu/cl/kernels:prelu", diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc index 60d06d7da89..606798db687 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/operation_selector.cc @@ -37,23 +37,6 @@ namespace tflite { namespace gpu { namespace cl { namespace { - -bool IsWidthBroadcastedForSecondInput(const std::vector& inputs) { - return inputs.size() == 2 && - inputs[0]->tensor.shape.w != inputs[1]->tensor.shape.w && - inputs[1]->tensor.shape.w == 1; -} -bool IsHeightBroadcastedForSecondInput(const std::vector& inputs) { - return inputs.size() == 2 && - inputs[0]->tensor.shape.h != inputs[1]->tensor.shape.h && - inputs[1]->tensor.shape.h == 1; -} -bool IsChannelsBroadcastedForSecondInput(const std::vector& inputs) { - return inputs.size() == 2 && - inputs[0]->tensor.shape.c != inputs[1]->tensor.shape.c && - 
inputs[1]->tensor.shape.c == 1; -} - bool IsSuitableForWinograd4x4To6x6(const Convolution2DAttributes& attr, const CLDevice& device, const BHWC& dst_shape) { @@ -155,33 +138,44 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, auto op_type = OperationTypeFromString(node.operation.type); switch (op_type) { case OperationType::ADD: { - const auto attr = - absl::any_cast(node.operation.attributes); - const auto* adds = + auto attr = absl::any_cast(node.operation.attributes); + const float* scalar = absl::get_if(&attr.param); + const auto* linear_tensor = absl::get_if>( &attr.param); - const auto* adds_scalar = absl::get_if(&attr.param); - if (adds || adds_scalar) { - return SelectBroadcastAdd(attr, creation_context, op_def, gpu_op); + if (scalar) { + ElementwiseOneRuntimeOneScalar operation = + CreateElementwiseOneRuntimeOneScalar(creation_context, op_def, + op_type, *scalar); + *gpu_op = absl::make_unique( + std::move(operation)); + return absl::OkStatus(); + } else if (linear_tensor) { + ElementwiseTwoInput operation; + RETURN_IF_ERROR(CreateElementwiseTwoInput( + creation_context, op_def, op_type, *linear_tensor, &operation)); + *gpu_op = absl::make_unique(std::move(operation)); + return absl::OkStatus(); } else { - BroadcastSettings broadcast; - broadcast.width = IsWidthBroadcastedForSecondInput(inputs); - broadcast.height = IsHeightBroadcastedForSecondInput(inputs); - broadcast.channels = IsChannelsBroadcastedForSecondInput(inputs); - if (broadcast.width || broadcast.height || broadcast.channels) { - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, op_type, broadcast); + if (inputs.size() == 2 && + (inputs[0]->tensor.shape.c == inputs[1]->tensor.shape.c || + inputs[1]->tensor.shape.c == 1)) { + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, op_type, inputs[1]->tensor.shape); *gpu_op = absl::make_unique(std::move(operation)); - } else { + return absl::OkStatus(); + } else if (inputs.size() >= 2) { auto output = outputs[0]; std::vector channels(inputs.size()); for (int i = 0; i < inputs.size(); ++i) { channels[i] = inputs[i]->tensor.shape.c; } SelectAdd(op_def, channels, output->tensor.shape.c, gpu_op); + return absl::OkStatus(); } - return absl::OkStatus(); + return absl::UnimplementedError(absl::StrCat( + "No support of ", node.operation.type, " with this parameters")); } } case OperationType::CONCAT: { @@ -277,27 +271,35 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, return SelectMean(attr, op_def, gpu_op); } case OperationType::MUL: { - if (node.operation.attributes.has_value()) { - auto attr = - absl::any_cast(node.operation.attributes); - - return SelectMultiplyScalar(attr, creation_context, op_def, gpu_op); + auto attr = absl::any_cast(node.operation.attributes); + const float* scalar = absl::get_if(&attr.param); + const auto* linear_tensor = + absl::get_if>( + &attr.param); + if (scalar) { + ElementwiseOneRuntimeOneScalar operation = + CreateElementwiseOneRuntimeOneScalar(creation_context, op_def, + op_type, *scalar); + *gpu_op = absl::make_unique( + std::move(operation)); + return absl::OkStatus(); + } else if (linear_tensor) { + ElementwiseTwoInput operation; + RETURN_IF_ERROR(CreateElementwiseTwoInput( + creation_context, op_def, op_type, *linear_tensor, &operation)); + *gpu_op = absl::make_unique(std::move(operation)); + return absl::OkStatus(); } else { if (inputs.size() == 2) { - BroadcastSettings broadcast; - broadcast.width = IsWidthBroadcastedForSecondInput(inputs); - 
broadcast.height = IsHeightBroadcastedForSecondInput(inputs); - broadcast.channels = IsChannelsBroadcastedForSecondInput(inputs); - ElementwiseTwoInput operation = - CreateElementwiseTwoInput(op_def, op_type, broadcast); + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, op_type, inputs[1]->tensor.shape); *gpu_op = absl::make_unique(std::move(operation)); return absl::OkStatus(); } else { - return absl::UnimplementedError( - "No support of multiply with more than 2 inputs"); + return absl::UnimplementedError(absl::StrCat( + "No support of ", node.operation.type, " with this parameters")); } - return absl::OkStatus(); } } case OperationType::PAD: { @@ -379,16 +381,37 @@ absl::Status GPUOperationFromNode(const CreationContext& creation_context, case OperationType::POW: case OperationType::SQUARED_DIFF: case OperationType::SUB: { - BroadcastSettings broadcast; - broadcast.width = IsWidthBroadcastedForSecondInput(inputs); - broadcast.height = IsHeightBroadcastedForSecondInput(inputs); - broadcast.channels = IsChannelsBroadcastedForSecondInput(inputs); - const ElementwiseAttributes* attr = - absl::any_cast(&node.operation.attributes); - ElementwiseTwoInput operation = CreateElementwiseTwoInput( - creation_context, op_def, op_type, broadcast, attr); - *gpu_op = absl::make_unique(std::move(operation)); - return absl::OkStatus(); + auto attr = + absl::any_cast(node.operation.attributes); + const float* scalar = absl::get_if(&attr.param); + const auto* linear_tensor = + absl::get_if>( + &attr.param); + if (scalar) { + ElementwiseOneRuntimeOneScalar operation = + CreateElementwiseOneRuntimeOneScalar(creation_context, op_def, + op_type, *scalar); + *gpu_op = absl::make_unique( + std::move(operation)); + return absl::OkStatus(); + } else if (linear_tensor) { + ElementwiseTwoInput operation; + RETURN_IF_ERROR(CreateElementwiseTwoInput( + creation_context, op_def, op_type, *linear_tensor, &operation)); + *gpu_op = absl::make_unique(std::move(operation)); + return absl::OkStatus(); + } else { + if (inputs.size() == 2) { + ElementwiseTwoInput operation = CreateElementwiseTwoInput( + op_def, op_type, inputs[1]->tensor.shape); + *gpu_op = + absl::make_unique(std::move(operation)); + return absl::OkStatus(); + } else { + return absl::UnimplementedError(absl::StrCat( + "No support of ", node.operation.type, " with this parameters")); + } + } } default: return SelectDefault(creation_context, op_def, hints, inputs, outputs, diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc index d05e2cf0364..5fc04d12822 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc +++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.cc @@ -25,7 +25,6 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/cl/kernels/lstm.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/max_unpooling.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/mean.h" -#include "tensorflow/lite/delegates/gpu/cl/kernels/multiply_add.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/padding.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/pooling.h" #include "tensorflow/lite/delegates/gpu/cl/kernels/prelu.h" @@ -158,28 +157,6 @@ absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def, return absl::OkStatus(); } -absl::Status SelectMultiplyScalar(const MultiplyAttributes& attr, - const CreationContext& creation_context, - const OperationDef& op_def, - std::unique_ptr* ptr) { - MultiplyAdd operation; - RETURN_IF_ERROR( - CreateMultiplyAdd(creation_context, op_def, attr, &operation)); - *ptr = absl::make_unique(std::move(operation)); - return absl::OkStatus(); -} - -absl::Status SelectBroadcastAdd(const AddAttributes& attr, - const CreationContext& creation_context, - const OperationDef& op_def, - std::unique_ptr* ptr) { - MultiplyAdd operation; - RETURN_IF_ERROR( - CreateMultiplyAdd(creation_context, op_def, attr, &operation)); - *ptr = absl::make_unique(std::move(operation)); - return absl::OkStatus(); -} - void SelectSoftmax(const BHWC& shape, const OperationDef& op_def, std::unique_ptr* ptr) { if (shape.w == 1 && shape.h == 1) { diff --git a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h index 401bd5df7f9..eb111ff9509 100644 --- a/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h +++ b/tensorflow/lite/delegates/gpu/cl/selectors/simple_selectors.h @@ -70,16 +70,6 @@ void SelectStridedSlice(const SliceAttributes& attr, const OperationDef& op_def, absl::Status SelectMean(const MeanAttributes& attr, const OperationDef& op_def, std::unique_ptr* ptr); -absl::Status SelectMultiplyScalar(const MultiplyAttributes& attr, - const CreationContext& creation_context, - const OperationDef& op_def, - std::unique_ptr* ptr); - -absl::Status SelectBroadcastAdd(const AddAttributes& attr, - const CreationContext& creation_context, - const OperationDef& op_def, - std::unique_ptr* ptr); - void SelectSoftmax(const BHWC& shape, const OperationDef& op_def, std::unique_ptr* ptr); diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.cc b/tensorflow/lite/delegates/gpu/cl/tensor.cc index 3381266bcba..90e247a10ec 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor.cc @@ -117,6 +117,7 @@ Tensor::Tensor(Tensor&& tensor) shape_(tensor.shape_), descriptor_(tensor.descriptor_) { tensor.memory_ = nullptr; + tensor.image_buffer_memory_ = nullptr; } Tensor& Tensor::operator=(Tensor&& tensor) { @@ -132,9 +133,10 @@ Tensor& Tensor::operator=(Tensor&& tensor) { } void Tensor::Release() { + // image_buffer_memory_ always owned by object if (image_buffer_memory_) { clReleaseMemObject(image_buffer_memory_); - memory_ = nullptr; + image_buffer_memory_ = nullptr; } if (memory_owner_ && memory_) { clReleaseMemObject(memory_); @@ -319,6 +321,18 @@ absl::Status Tensor::WriteData(CLCommandQueue* queue, return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue); } +absl::Status Tensor::WriteData( + CLCommandQueue* queue, + const tflite::gpu::Tensor& src) { + return WriteDataBHWDC(absl::MakeConstSpan(src.data), queue); +} + +absl::Status Tensor::WriteData( + CLCommandQueue* queue, + const tflite::gpu::Tensor& src) { + return 
WriteDataBHWDC(absl::MakeConstSpan(src.data), queue); +} + absl::Status Tensor::WriteData(CLCommandQueue* queue, const Tensor5DFloat32& src) { RETURN_IF_ERROR(IsValid(src.shape)); diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.h b/tensorflow/lite/delegates/gpu/cl/tensor.h index 8ded8f2f041..e80cd01a1f9 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.h +++ b/tensorflow/lite/delegates/gpu/cl/tensor.h @@ -81,6 +81,7 @@ class Tensor : public GPUObject { int4 GetWHSB() const { return int4(shape_.w, shape_.h, Slices(), shape_.b); } int4 GetWHDS() const { return int4(shape_.w, shape_.h, shape_.d, Slices()); } + TensorDescriptor GetDescriptor() const { return descriptor_; } DataType GetDataType() const { return descriptor_.data_type; } TensorStorageType GetStorageType() const { return descriptor_.storage_type; } @@ -94,6 +95,12 @@ class Tensor : public GPUObject { cl_mem GetMemoryPtrForWriting() const; absl::Status WriteData(CLCommandQueue* queue, const TensorFloat32& src); + absl::Status WriteData( + CLCommandQueue* queue, + const tflite::gpu::Tensor& src); + absl::Status WriteData( + CLCommandQueue* queue, + const tflite::gpu::Tensor& src); absl::Status WriteData(CLCommandQueue* queue, const Tensor5DFloat32& src); absl::Status ReadData(CLCommandQueue* queue, TensorFloat32* dst) const; absl::Status ReadData(CLCommandQueue* queue, Tensor5DFloat32* dst) const; From e1a3f9a9e77c6ce09752c43f61a6eaf88240fa26 Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Mon, 1 Jun 2020 13:46:19 -0700 Subject: [PATCH 1487/1533] Add unit tests for TPUEmbedding API. PiperOrigin-RevId: 314195595 Change-Id: Ib09a5aa12c2ded276324c9eb448e61a074646024 --- tensorflow/python/tpu/BUILD | 47 + .../python/tpu/tpu_embedding_v2_cpu_test.py | 302 ++++ .../python/tpu/tpu_embedding_v2_test.py | 1457 +++++++++++++++++ 3 files changed, 1806 insertions(+) create mode 100644 tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py create mode 100644 tensorflow/python/tpu/tpu_embedding_v2_test.py diff --git a/tensorflow/python/tpu/BUILD b/tensorflow/python/tpu/BUILD index d398396ec2a..422c106432e 100644 --- a/tensorflow/python/tpu/BUILD +++ b/tensorflow/python/tpu/BUILD @@ -468,6 +468,53 @@ py_library( ], ) +tpu_py_test( + name = "tpu_embedding_v2_test", + srcs = [ + "tpu_embedding_v2_test.py", + ], + disable_experimental = True, + python_version = "PY3", + shard_count = 4, + srcs_version = "PY2AND3", + deps = [ + ":tpu_embedding", + ":tpu_embedding_v2", + ":tpu_strategy_util", + "//tensorflow/python:init_ops_v2", + "//tensorflow/python:tensor_spec", + "//tensorflow/python:variables", + "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/distribute:distribute_lib", + "//tensorflow/python/distribute:tpu_strategy", + "//tensorflow/python/distribute/cluster_resolver:tpu_cluster_resolver_py", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/eager:remote", + "//tensorflow/python/ops/ragged:ragged_tensor", + "//tensorflow/python/saved_model", + "//tensorflow/python/training/tracking:util", + "//third_party/py/numpy", + ], +) + +tf_py_test( + name = "tpu_embedding_v2_cpu_test", + srcs = [ + "tpu_embedding_v2_cpu_test.py", + ], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":tpu_embedding_v2", + "//tensorflow/python:init_ops_v2", + "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/ops/ragged:ragged_tensor", + "//third_party/py/numpy", + ], +) + tf_proto_library( name = "tensor_tracer_proto", 
srcs = ["tensor_tracer.proto"], diff --git a/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py b/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py new file mode 100644 index 00000000000..3177498deba --- /dev/null +++ b/tensorflow/python/tpu/tpu_embedding_v2_cpu_test.py @@ -0,0 +1,302 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for TPU Embeddings mid level API on CPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.compat import v2_compat +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import init_ops_v2 +from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import test +from tensorflow.python.tpu import tpu_embedding_v2 +from tensorflow.python.tpu import tpu_embedding_v2_utils +from tensorflow.python.util import nest + + +class CPUEmbeddingTest(test.TestCase): + + def setUp(self): + super(CPUEmbeddingTest, self).setUp() + + self.embedding_values = np.array(list(range(32)), dtype=np.float64) + self.initializer = init_ops_v2.Constant(self.embedding_values) + # Embedding for video initialized to + # 0 1 2 3 + # 4 5 6 7 + # ... + self.table_video = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=8, + dim=4, + initializer=self.initializer, + combiner='sum', + name='video') + # Embedding for user initialized to + # 0 1 + # 2 3 + # 4 5 + # 6 7 + # ... 
+ self.table_user = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=16, + dim=2, + initializer=self.initializer, + combiner='mean', + name='user') + self.feature_config = ( + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_video, name='watched'), + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_video, name='favorited'), + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_user, name='friends')) + + self.batch_size = 2 + self.data_batch_size = 4 + + # One (global) batch of inputs + # sparse tensor for watched: + # row 0: 0 + # row 1: 0, 1 + # row 2: 0, 1 + # row 3: 1 + self.feature_watched_indices = [[0, 0], [1, 0], [1, 1], + [2, 0], [2, 1], [3, 0]] + self.feature_watched_values = [0, 0, 1, 0, 1, 1] + self.feature_watched_row_lengths = [1, 2, 2, 1] + # sparse tensor for favorited: + # row 0: 0, 1 + # row 1: 1 + # row 2: 0 + # row 3: 0, 1 + self.feature_favorited_indices = [[0, 0], [0, 1], [1, 0], + [2, 0], [3, 0], [3, 1]] + self.feature_favorited_values = [0, 1, 1, 0, 0, 1] + self.feature_favorited_row_lengths = [2, 1, 1, 2] + # sparse tensor for friends: + # row 0: 3 + # row 1: 0, 1, 2 + # row 2: 3 + # row 3: 0, 1, 2 + self.feature_friends_indices = [[0, 0], [1, 0], [1, 1], [1, 2], + [2, 0], [3, 0], [3, 1], [3, 2]] + self.feature_friends_values = [3, 0, 1, 2, 3, 0, 1, 2] + self.feature_friends_row_lengths = [1, 3, 1, 3] + + def _create_mid_level(self): + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) + return tpu_embedding_v2.TPUEmbedding( + feature_config=self.feature_config, + batch_size=self.batch_size, + optimizer=optimizer) + + def _get_dense_tensors(self, dtype=dtypes.int32): + feature0 = constant_op.constant(self.feature_watched_values, dtype=dtype) + feature1 = constant_op.constant(self.feature_favorited_values, dtype=dtype) + feature2 = constant_op.constant(self.feature_friends_values, dtype=dtype) + return (feature0, feature1, feature2) + + def test_cpu_dense_lookup(self): + mid_level = self._create_mid_level() + features = self._get_dense_tensors() + results = tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=None, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + all_lookups = [] + for feature, config in zip(nest.flatten(features), self.feature_config): + table = mid_level.embedding_tables[config.table].numpy() + all_lookups.append(table[feature.numpy()]) + self.assertAllClose(results, nest.pack_sequence_as(results, all_lookups)) + + def test_cpu_dense_lookup_with_weights(self): + mid_level = self._create_mid_level() + features = self._get_dense_tensors() + weights = self._get_dense_tensors(dtype=dtypes.float32) + + with self.assertRaisesRegex( + ValueError, 'Weight specified for .*, but input is dense.'): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + + def _get_sparse_tensors(self, dtype=dtypes.int32): + feature0 = sparse_tensor.SparseTensor( + indices=self.feature_watched_indices, + values=constant_op.constant(self.feature_watched_values, dtype=dtype), + dense_shape=[self.data_batch_size, 2]) + feature1 = sparse_tensor.SparseTensor( + indices=self.feature_favorited_indices, + values=constant_op.constant(self.feature_favorited_values, dtype=dtype), + dense_shape=[self.data_batch_size, 2]) + feature2 = sparse_tensor.SparseTensor( + indices=self.feature_friends_indices, + values=constant_op.constant(self.feature_friends_values, dtype=dtype), + dense_shape=[self.data_batch_size, 3]) + return 
(feature0, feature1, feature2) + + def test_cpu_sparse_lookup(self): + mid_level = self._create_mid_level() + features = self._get_sparse_tensors() + results = tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=None, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + reduced = [] + for feature, config in zip(nest.flatten(features), self.feature_config): + table = mid_level.embedding_tables[config.table].numpy() + all_lookups = table[feature.values.numpy()] + # With row starts we can use reduceat in numpy. Get row starts from the + # ragged tensor API. + ragged = ragged_tensor.RaggedTensor.from_sparse(feature) + row_starts = ragged.row_starts().numpy() + reduced.append(np.add.reduceat(all_lookups, row_starts)) + if config.table.combiner == 'mean': + # for mean, divide by the row lengths. + reduced[-1] /= np.expand_dims(ragged.row_lengths().numpy(), axis=1) + self.assertAllClose(results, nest.pack_sequence_as(results, reduced)) + + def test_cpu_sparse_lookup_with_weights(self): + mid_level = self._create_mid_level() + features = self._get_sparse_tensors() + weights = self._get_sparse_tensors(dtype=dtypes.float32) + results = tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + weighted_sum = [] + for feature, weight, config in zip(nest.flatten(features), + nest.flatten(weights), + self.feature_config): + table = mid_level.embedding_tables[config.table].numpy() + # Expand dims here needed to broadcast this multiplication properly. + weight = np.expand_dims(weight.values.numpy(), axis=1) + all_lookups = table[feature.values.numpy()] * weight + # With row starts we can use reduceat in numpy. Get row starts from the + # ragged tensor API. 
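+      # Illustrative example (values assumed, not taken from the test data):
+      # with row_starts == [0, 2, 3] over five lookup rows, np.add.reduceat
+      # sums rows [0:2], [2:3] and [3:5], one combined embedding per sample.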
+ row_starts = ragged_tensor.RaggedTensor.from_sparse(feature).row_starts() + row_starts = row_starts.numpy() + weighted_sum.append(np.add.reduceat(all_lookups, row_starts)) + if config.table.combiner == 'mean': + weighted_sum[-1] /= np.add.reduceat(weight, row_starts) + self.assertAllClose(results, nest.pack_sequence_as(results, + weighted_sum)) + + def test_cpu_sparse_lookup_with_non_sparse_weights(self): + mid_level = self._create_mid_level() + features = self._get_sparse_tensors() + weights = self._get_dense_tensors(dtype=dtypes.float32) + with self.assertRaisesRegex( + ValueError, 'but it does not match type of the input which is'): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + + def _get_ragged_tensors(self, dtype=dtypes.int32): + feature0 = ragged_tensor.RaggedTensor.from_row_lengths( + values=constant_op.constant(self.feature_watched_values, dtype=dtype), + row_lengths=self.feature_watched_row_lengths) + feature1 = ragged_tensor.RaggedTensor.from_row_lengths( + values=constant_op.constant(self.feature_favorited_values, dtype=dtype), + row_lengths=self.feature_favorited_row_lengths) + feature2 = ragged_tensor.RaggedTensor.from_row_lengths( + values=constant_op.constant(self.feature_friends_values, dtype=dtype), + row_lengths=self.feature_friends_row_lengths) + return (feature0, feature1, feature2) + + def test_cpu_ragged_lookup_with_weights(self): + mid_level = self._create_mid_level() + features = self._get_ragged_tensors() + weights = self._get_ragged_tensors(dtype=dtypes.float32) + results = tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + weighted_sum = [] + for feature, weight, config in zip(nest.flatten(features), + nest.flatten(weights), + self.feature_config): + table = mid_level.embedding_tables[config.table].numpy() + # Expand dims here needed to broadcast this multiplication properly. + weight = np.expand_dims(weight.values.numpy(), axis=1) + all_lookups = table[feature.values.numpy()] * weight + row_starts = feature.row_starts().numpy() + weighted_sum.append(np.add.reduceat(all_lookups, row_starts)) + if config.table.combiner == 'mean': + weighted_sum[-1] /= np.add.reduceat(weight, row_starts) + self.assertAllClose(results, nest.pack_sequence_as(results, + weighted_sum)) + + def test_cpu_invalid_structure_for_features(self): + mid_level = self._create_mid_level() + # Remove one element of the tuple, self.feature_config has 3 so we need to + # pass 3. + features = tuple(self._get_sparse_tensors()[:2]) + with self.assertRaises(ValueError): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=None, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + + def test_cpu_invalid_structure_for_weights(self): + mid_level = self._create_mid_level() + features = self._get_sparse_tensors() + # Remove one element of the tuple, self.feature_config has 3 so we need to + # pass 3 (or None). 
+ weights = tuple(self._get_dense_tensors(dtype=dtypes.float32)[:2]) + with self.assertRaises(ValueError): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=weights, + tables=mid_level.embedding_tables, + feature_config=self.feature_config) + + def test_cpu_sequence_lookup(self): + feature_config = ( + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_video, name='watched', max_sequence_length=2),) + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) + mid_level = tpu_embedding_v2.TPUEmbedding( + feature_config=feature_config, + batch_size=self.batch_size, + optimizer=optimizer) + features = tuple(self._get_sparse_tensors()[:1]) + with self.assertRaisesRegex( + ValueError, 'Sequence features unsupported at this time.'): + tpu_embedding_v2.cpu_embedding_lookup( + features, + weights=None, + tables=mid_level.embedding_tables, + feature_config=feature_config) + + +if __name__ == '__main__': + v2_compat.enable_v2_behavior() + test.main() diff --git a/tensorflow/python/tpu/tpu_embedding_v2_test.py b/tensorflow/python/tpu/tpu_embedding_v2_test.py new file mode 100644 index 00000000000..78b5c9fa3bc --- /dev/null +++ b/tensorflow/python/tpu/tpu_embedding_v2_test.py @@ -0,0 +1,1457 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for TPU Embeddings mid level API on TPU.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import itertools +import os + +from absl import flags +from absl.testing import parameterized +import numpy as np + +from tensorflow.python.compat import v2_compat +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import distribution_strategy_context +from tensorflow.python.distribute import tpu_strategy +from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver +from tensorflow.python.eager import backprop +from tensorflow.python.eager import def_function +from tensorflow.python.eager import remote +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_spec +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import init_ops_v2 +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables as tf_variables +from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import test +from tensorflow.python.saved_model import load +from tensorflow.python.saved_model import save +from tensorflow.python.tpu import tpu_embedding +from tensorflow.python.tpu import tpu_embedding_v2 +from tensorflow.python.tpu import tpu_embedding_v2_utils +from tensorflow.python.tpu import tpu_strategy_util +from tensorflow.python.training import checkpoint_utils +from tensorflow.python.training.tracking import util +from tensorflow.python.util import nest + + +FLAGS = flags.FLAGS +flags.DEFINE_string('tpu', '', 'Name of TPU to connect to.') +flags.DEFINE_string('project', None, 'Name of GCP project with TPU.') +flags.DEFINE_string('zone', None, 'Name of GCP zone with TPU.') +flags.DEFINE_string('model_dir', os.environ.get('TEST_TMPDIR'), + 'A temporary directory.') + + +class TPUEmbeddingCheckpointTest(parameterized.TestCase, test.TestCase): + + def setUp(self): + super(TPUEmbeddingCheckpointTest, self).setUp() + self.resolver = tpu_cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project) + remote.connect_to_cluster(self.resolver) + tpu_strategy_util.initialize_tpu_system(self.resolver) + self.strategy = tpu_strategy.TPUStrategy(self.resolver) + self.num_rows = self.strategy.num_replicas_in_sync + + # These tests use two mid level API objects, initialized with different + # values. These have the same sizes. 
+ with self.strategy.scope(): + self.first_mid_level_contents = np.ones((self.num_rows, 4)) + self.first_mid_level_optimizer = tpu_embedding_v2_utils.SGD( + learning_rate=0.1) + self.first_mid_level = self.build_mid_level( + self.first_mid_level_contents, self.first_mid_level_optimizer) + + self.second_mid_level_contents = np.ones((self.num_rows, 4)) * 2 + self.second_mid_level_optimizer = tpu_embedding_v2_utils.SGD( + learning_rate=0.1) + self.second_mid_level = self.build_mid_level( + self.second_mid_level_contents, self.second_mid_level_optimizer, + initialize_tpu_embedding=False) + + self.cpu_mid_level_optimizer = tpu_embedding_v2_utils.SGD( + learning_rate=0.1) + self.cpu_mid_level = self.build_mid_level( + self.second_mid_level_contents, self.cpu_mid_level_optimizer) + + def tearDown(self): + tpu_strategy_util.shutdown_tpu_system(self.resolver) + super(TPUEmbeddingCheckpointTest, self).tearDown() + + def test_checkpoint_save_retrieves(self): + # Ensure that the variables from the first model are loaded. + self.first_mid_level._load_variables() + + self.assertAllClose( + self.first_mid_level_contents, + self.make_checkpoint_and_get_embedding('before_load', + self.first_mid_level), + msg='Checkpoint should contain values from the first api object.') + + self.second_mid_level._load_variables() + + # When we load the variables from the second mid level API object to the TPU + # we expect that checkpointing the first mid level API object will now + # retrieve the values from the TPU which are now different from the current + # variables in the first mid level. + self.assertAllClose( + self.second_mid_level_contents, + self.make_checkpoint_and_get_embedding('after_load', + self.first_mid_level), + msg='Checkpoint should contain values from the second api object.') + + def test_checkpoint_restore_loads(self): + + def get_values(mid): + return ops.convert_to_tensor( + mid._variables['table']['parameters'].variables[0]) + + self.first_mid_level._load_variables() + + first_checkpoint = util.Checkpoint(model=self.first_mid_level) + first_checkpoint.save(_get_tmpdir('restore', 'save')) + + # Checkpoint now has values from first_mid_level. See first assert in + # test_checkpoint_save_retrieves. + + self.second_mid_level._load_variables() + + self.assertAllClose( + self.second_mid_level_contents, + get_values(self.second_mid_level), + msg='Second mid level api should contain its initial values.', + ) + + # We restore the checkpoint of our first model into our second model. + # This should load the first mid level API object onto the TPU. + second_checkpoint = util.Checkpoint(model=self.second_mid_level) + second_checkpoint.restore(_get_tmpdir('restore', 'save-1')) + + # Call retrieve here as a way to check what the TPU contains contains. + # Calling the retrieve ops directly might make for a cleaner separation of + # test and module, though. + self.second_mid_level._retrieve_variables() + + self.assertAllClose( + self.first_mid_level_contents, + get_values(self.second_mid_level), + msg='Second mid level api should have retrieved the first model values.' 
+    )
+
+  def build_mid_level(self, embedding_values, optimizer,
+                      initialize_tpu_embedding=True):
+    """Creates an embedding api object initialized to embedding_values."""
+    initializer = init_ops_v2.Constant(embedding_values)
+
+    table = tpu_embedding_v2_utils.TableConfig(
+        vocabulary_size=self.num_rows, dim=4, initializer=initializer,
+        combiner='sum', name='table')
+    feature_config = (tpu_embedding_v2_utils.FeatureConfig(
+        table=table, name='feature'),)
+
+    # batch_size here does not matter as we aren't training in any of these
+    # tests.
+    return tpu_embedding_v2.TPUEmbedding(
+        feature_config, 64, optimizer,
+        initialize_tpu_embedding=initialize_tpu_embedding)
+
+  def make_checkpoint_and_get_embedding(self, name, model):
+    """Saves model to checkpoint name, retrieves embedding variables."""
+    checkpoint = util.Checkpoint(model=model)
+    checkpoint.save(_get_tmpdir(name, 'save'))
+
+    # Get the name of the parameters variable which should be the only
+    # [self.num_rows, 4] shaped tensor in the checkpoint. Note that we do this
+    # as the key can change.
+    variables = checkpoint_utils.list_variables(_get_tmpdir(name))
+    variables = [name for name, size in variables if size == [self.num_rows, 4]]
+    if len(variables) != 1:
+      raise RuntimeError('Found {} copies of the parameter variable in the '
+                         'checkpoint. Exactly one copy expected.'.format(
+                             len(variables)))
+    return checkpoint_utils.load_variable(_get_tmpdir(name), variables[0])
+
+  def test_model_export_cpu(self):
+    self.first_mid_level._load_variables()
+
+    tpu_checkpoint = util.Checkpoint(model=self.first_mid_level)
+    tpu_checkpoint.save(_get_tmpdir('export_cpu', 'save'))
+
+    # We restore the checkpoint of our tpu mid level onto our cpu mid level.
+    cpu_checkpoint = util.Checkpoint(model=self.cpu_mid_level)
+    cpu_checkpoint.restore(_get_tmpdir('export_cpu', 'save-1'))
+
+    @def_function.function
+    def serve_tensors(features):
+      features = tpu_embedding_v2.cpu_embedding_lookup(
+          features, None, self.cpu_mid_level.embedding_tables,
+          self.cpu_mid_level._feature_config)
+      return features[0]
+
+    signatures = {
+        'serving_default':
+            serve_tensors.get_concrete_function(
+                (tensor_spec.TensorSpec(
+                    shape=(2,), dtype=dtypes.int32, name='feature'),))}
+    save.save(self.cpu_mid_level,
+              export_dir=_get_tmpdir('export_cpu', 'exported_model'),
+              signatures=signatures)
+
+    imported = load.load(_get_tmpdir('export_cpu', 'exported_model'))
+    predict_fn = imported.signatures['serving_default']
+
+    input_feature_value = np.array([1, 0])
+    input_batch = (constant_op.constant(input_feature_value,
+                                        dtype=dtypes.int32),)
+    prediction = predict_fn(*input_batch)['output_0']
+    self.assertAllClose(prediction.numpy(),
+                        self.first_mid_level_contents[input_feature_value])
+
+  @parameterized.parameters(tpu_embedding_v2_utils.SGD,
+                            tpu_embedding_v2_utils.Adagrad,
+                            tpu_embedding_v2_utils.Adam)
+  def test_check_checkpoint_variable_names_are_same_on_cpu_and_tpu(self,
+                                                                   optimizer):
+    # Reinitialize the TPU so that we can re-initialize the embeddings with the
+    # given optimizer.
+ tpu_strategy_util.initialize_tpu_system(self.resolver) + optimizer = optimizer(learning_rate=0.1) + + with self.strategy.scope(): + tpu_mid_level = self.build_mid_level( + self.first_mid_level_contents, optimizer) + + tpu_checkpoint = util.Checkpoint(model=tpu_mid_level) + tpu_checkpoint.save(_get_tmpdir('save-tpu', 'save')) + tpu_variables = checkpoint_utils.list_variables(_get_tmpdir('save-tpu')) + + cpu_mid_level = self.build_mid_level( + self.first_mid_level_contents, optimizer) + + cpu_checkpoint = util.Checkpoint(model=cpu_mid_level) + cpu_checkpoint.save(_get_tmpdir('save-cpu', 'save')) + cpu_variables = checkpoint_utils.list_variables(_get_tmpdir('save-cpu')) + + self.assertAllEqual(tpu_variables, cpu_variables) + + +class TPUEmbeddingTest(parameterized.TestCase, test.TestCase): + + def setUp(self): + super(TPUEmbeddingTest, self).setUp() + self.embedding_values = np.array(list(range(32)), dtype=np.float64) + self.initializer = init_ops_v2.Constant(self.embedding_values) + # Embedding for video initialized to + # 0 1 2 3 + # 4 5 6 7 + # ... + self.table_video = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=8, + dim=4, + initializer=self.initializer, + combiner='sum', + name='video') + # Embedding for user initialized to + # 0 1 + # 2 3 + # 4 5 + # 6 7 + # ... + self.table_user = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=16, + dim=2, + initializer=self.initializer, + combiner='mean', + name='user') + self.feature_config = ( + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_video, name='watched'), + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_video, name='favorited'), + tpu_embedding_v2_utils.FeatureConfig( + table=self.table_user, name='friends')) + + self.batch_size = 2 + self.data_batch_size = 4 + + # One (global) batch of inputs + # sparse tensor for watched: + # row 0: 0 + # row 1: 0, 1 + # row 2: 0, 1 + # row 3: 1 + self.feature_watched_indices = [[0, 0], [1, 0], [1, 1], + [2, 0], [2, 1], [3, 0]] + self.feature_watched_values = [0, 0, 1, 0, 1, 1] + self.feature_watched_row_lengths = [1, 2, 2, 1] + # sparse tensor for favorited: + # row 0: 0, 1 + # row 1: 1 + # row 2: 0 + # row 3: 0, 1 + self.feature_favorited_indices = [[0, 0], [0, 1], [1, 0], + [2, 0], [3, 0], [3, 1]] + self.feature_favorited_values = [0, 1, 1, 0, 0, 1] + self.feature_favorited_row_lengths = [2, 1, 1, 2] + # sparse tensor for friends: + # row 0: 3 + # row 1: 0, 1, 2 + # row 2: 3 + # row 3: 0, 1, 2 + self.feature_friends_indices = [[0, 0], [1, 0], [1, 1], [1, 2], + [2, 0], [3, 0], [3, 1], [3, 2]] + self.feature_friends_values = [3, 0, 1, 2, 3, 0, 1, 2] + self.feature_friends_row_lengths = [1, 3, 1, 3] + self.resolver = None + + def tearDown(self): + if self.resolver: + tpu_strategy_util.shutdown_tpu_system(self.resolver) + super(TPUEmbeddingTest, self).tearDown() + + def test_tables_with_same_name(self): + with self.assertRaisesRegex( + ValueError, 'Multiple tables with name table found.'): + with self._get_strategy().scope(): + tpu_embedding_v2.TPUEmbedding( + (tpu_embedding_v2_utils.FeatureConfig( + table=tpu_embedding_v2_utils.TableConfig( + name='table', + vocabulary_size=4, + dim=2, + initializer=self.initializer,), + name='watched'), + tpu_embedding_v2_utils.FeatureConfig( + table=tpu_embedding_v2_utils.TableConfig( + name='table', + vocabulary_size=4, + dim=2, + initializer=self.initializer), + name='favorited')), + self.batch_size, + tpu_embedding_v2_utils.SGD(learning_rate=0.1)) + + def test_unsupported_optimizer(self): + with self.assertRaisesRegex( + 
ValueError, 'is an unsupported optimizer class.'): + with self._get_strategy().scope(): + tpu_embedding_v2.TPUEmbedding( + self.feature_config, self.batch_size, + tpu_embedding.AdagradParameters(learning_rate=0.1)) + + def test_pass_non_tensor_to_apply_gradients(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + @def_function.function + def test_apply(): + mid_level_api.apply_gradients((1, 2, 3)) + + with self.assertRaisesRegex(ValueError, 'Expected Tensor.'): + strategy.run(test_apply) + + def test_pass_different_structure_to_apply_gradients(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + @def_function.function + def test_apply(): + # This should be a tuple as feature_config is a tuple of 3 configs. + mid_level_api.apply_gradients([1, 2, 3]) + + with self.assertRaisesRegex( + TypeError, + 'The two structures don\'t have the same nested structure.'): + strategy.run(test_apply) + + def test_pass_none_to_apply_gradients(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + dataset = self._create_sparse_dataset(strategy) + data = next(iter(strategy.experimental_distribute_dataset(dataset))) + + @def_function.function + def embedding_and_set_gradients(data): + mid_level_api.enqueue(data) + def tpu_fn(): + results = mid_level_api.dequeue() + mid_level_api.apply_gradients((None, None, + array_ops.ones_like(results[2]))) + return results + return strategy.run(tpu_fn) + + @def_function.function + def embedding_only(data): + mid_level_api.enqueue(data, training=False) + def tpu_fn(): + return mid_level_api.dequeue() + return strategy.run(tpu_fn) + + first = self._get_replica_numpy( + embedding_and_set_gradients(data), strategy, 0) + second = self._get_replica_numpy(embedding_only(data), strategy, 0) + + # First two features should be the same as None gradient was applied. + # Third feature had gradient of 1 passed in from each core. + # Each core received the same ids per core and returned the following batch: + # [ row 3, row 0 + row 1 + row 2 ] + # so gradient update was (learning rate = 0.1): + # row 0: -1/3*0.1 + # row 1: -1/3*0.1 + # row 2: -1/3*0.1 + # row 3: -1*0.1 + # There is a factor of num_replicas because each replica gave an update. 
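+    # As a rough worked example (an illustration only, not an extra assertion
+    # in this test): if num_replicas were 2, row 3 of the user table would move
+    # by -0.1*2 = -0.2 and rows 0-2 by -0.1/3*2 each, so the first activation
+    # row (a lookup of row 3) drops by 0.2 and the second (the mean of rows
+    # 0-2) drops by 0.2/3.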
+ + num_replicas = strategy.num_replicas_in_sync + update = ([[0.0]], [[0.0]], + [[0.1 * num_replicas], [0.1 / 3 * num_replicas]]) + golden = tuple([feature-np.array(up) for feature, up in zip(first, update)]) + + self.assertAllClose(golden, second) + + def _get_strategy(self): + self.resolver = tpu_cluster_resolver.TPUClusterResolver( + tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project) + remote.connect_to_cluster(self.resolver) + tpu_strategy_util.initialize_tpu_system(self.resolver) + return tpu_strategy.TPUStrategy(self.resolver) + + def test_dequeue_on_cpu(self): + mid_level_api = self._create_mid_level() + with self.assertRaises(RuntimeError): + mid_level_api.dequeue() + + def test_enqueue_on_cpu(self): + mid_level_api = self._create_mid_level() + features = { + 'watched': sparse_tensor.SparseTensor( + indices=self.feature_watched_indices, + values=self.feature_watched_values, + dense_shape=[2, 2])} + with self.assertRaises(RuntimeError): + mid_level_api.enqueue(features) + + def test_apply_gradients_on_cpu(self): + mid_level_api = self._create_mid_level() + with self.assertRaises(RuntimeError): + mid_level_api.enqueue(None) + + def test_get_embedding_tables_on_cpu(self): + mid_level_api = self._create_mid_level() + self.assertEqual( + set(mid_level_api.embedding_tables.keys()), + set([self.table_video, self.table_user])) + + def test_get_embedding_tables_on_tpu(self): + with self._get_strategy().scope(): + mid_level_api = self._create_mid_level() + with self.assertRaises(RuntimeError): + mid_level_api.embedding_tables() + + def test_enqueue_weight_for_dense_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + input_fn = self._create_dense_input_fn(strategy, include_weights=True) + dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist_iter = iter(dist) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + features, weights = next(dist_iter) + mid_level_api.enqueue(features, weights=weights, training=False) + return strategy.run(step) + + with self.assertRaisesRegex(ValueError, 'Weight specified for dense input'): + test_fn() + + def test_enqueue_wrong_weight_type_for_sparse_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy) + ragged = self._create_ragged_dataset(strategy, include_weights=True) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + features = next(sparse_iter) + _, weights = next(ragged_iter) + mid_level_api.enqueue(features, weights=weights, training=False) + return strategy.run(step) + + with self.assertRaisesRegex( + ValueError, 'which does not match type input which is SparseTensor.'): + test_fn() + + def test_enqueue_wrong_weight_type_for_ragged_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy, include_weights=True) + ragged = self._create_ragged_dataset(strategy) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + _, weights = next(sparse_iter) + features = next(ragged_iter) + mid_level_api.enqueue(features, 
weights=weights, training=False) + return strategy.run(step) + + with self.assertRaisesRegex( + ValueError, 'which does not match type input which is RaggedTensor.'): + test_fn() + + def test_enqueue_sparse_and_ragged(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy) + ragged = self._create_ragged_dataset(strategy) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + sparse_features = next(sparse_iter) + ragged_features = next(ragged_iter) + features = (sparse_features[0], ragged_features[1], sparse_features[2]) + mid_level_api.enqueue(features, training=False) + return strategy.run(step) + + with self.assertRaisesRegex( + ValueError, 'Found both SparseTensors and RaggedTensors'): + test_fn() + + def test_enqueue_incorrect_structure_for_features(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + features = next(sparse_iter) + features = (features[0],) + mid_level_api.enqueue(features, training=False) + return strategy.run(step) + + # The error here is raised from nest.assert_same_structure + with self.assertRaises(ValueError): + test_fn() + + def test_enqueue_incorrect_structure_for_weights(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy, include_weights=True) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + features, weights = next(sparse_iter) + weights = (weights[0],) + mid_level_api.enqueue(features, weights=weights, training=False) + return strategy.run(step) + + # The error here is raised from nest.assert_same_structure + with self.assertRaises(ValueError): + test_fn() + + def test_enqueue_ragged_tensor(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + sparse = self._create_sparse_dataset(strategy) + ragged = self._create_ragged_dataset(strategy) + sparse_iter = iter(strategy.experimental_distribute_dataset(sparse)) + ragged_iter = iter(strategy.experimental_distribute_dataset(ragged)) + + @def_function.function + def test_fn(): + def get_activations(): + return mid_level_api.dequeue() + + sparse_features = next(sparse_iter) + ragged_features = next(ragged_iter) + mid_level_api.enqueue(sparse_features, training=False) + sparse_activations = strategy.run(get_activations) + mid_level_api.enqueue(ragged_features, training=False) + ragged_activations = strategy.run(get_activations) + return sparse_activations, ragged_activations + + sparse_activations, ragged_activations = test_fn() + + # Extact per core numpy arrays and check that both sparse and ragged have + # the same results. 
+    sparse0 = self._get_replica_numpy(sparse_activations, strategy, 0)
+    ragged0 = self._get_replica_numpy(ragged_activations, strategy, 0)
+    self.assertAllClose(sparse0, ragged0)
+
+  @parameterized.parameters(True, False)
+  def test_enqueue_with_weights(self, ragged):
+    strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+    weight = 0.5
+    if ragged:
+      dataset = self._create_ragged_dataset(strategy, include_weights=True,
+                                            weight=weight)
+    else:
+      dataset = self._create_sparse_dataset(strategy, include_weights=True,
+                                            weight=weight)
+
+    dataset_iter = iter(strategy.experimental_distribute_dataset(dataset))
+
+    @def_function.function
+    def enqueue_and_get(features, weights):
+      def get_activations():
+        return mid_level_api.dequeue()
+      mid_level_api.enqueue(features, weights=weights, training=False)
+      return strategy.run(get_activations)
+
+    features, weights = next(dataset_iter)
+    # Replace the weight for the second feature by None to test.
+    weights = (weights[0], None, weights[2])
+
+    no_weights_activations = enqueue_and_get(features, weights=None)
+    weights_activations = enqueue_and_get(features, weights=weights)
+
+    # Extract per core numpy arrays.
+    no_weights0 = self._get_replica_numpy(no_weights_activations, strategy, 0)
+    weights0 = self._get_replica_numpy(weights_activations, strategy, 0)
+    # videos table has sum combiner and users table has mean combiner.
+    # i.e. users table lookups aren't affected by the weights as all the
+    # weights are the same.
+    # Tuple entries 0 and 1 are the watched and favorited features from the
+    # videos table and entry 2 is the friends feature from the users table.
+    # Note that None was passed as a weight for entry 1 so weight should have no
+    # effect.
+    weight = (0.5, 1.0, 1.0)
+    golden = tuple([no_weight * w for no_weight, w in zip(no_weights0, weight)])
+
+    self.assertAllClose(golden, weights0)
+
+  def test_enqueue_with_outside_compilation(self):
+    strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+    dataset = self._create_sparse_dataset(strategy)
+    dataset_iter = iter(strategy.experimental_distribute_dataset(dataset))
+
+    @def_function.function
+    def enqueue_with_outside_compilation(data):
+      def get_activations(features):
+        mid_level_api.enqueue(features, training=False)
+        return mid_level_api.dequeue()
+      return strategy.run(get_activations, args=(data,))
+
+    @def_function.function
+    def enqueue_without_outside_compilation(data):
+      def get_activations():
+        return mid_level_api.dequeue()
+      mid_level_api.enqueue(data, training=False)
+      return strategy.run(get_activations)
+
+    features = next(dataset_iter)
+
+    activations_oc = enqueue_with_outside_compilation(features)
+    activations = enqueue_without_outside_compilation(features)
+
+    # Extract per core numpy arrays.
+    activations_oc0 = self._get_replica_numpy(activations_oc, strategy, 0)
+    activations0 = self._get_replica_numpy(activations, strategy, 0)
+
+    self.assertAllClose(activations_oc0, activations0)
+
+  def test_enqueue_with_outside_compilation_in_control_flow(self):
+    strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+    dataset = self._create_sparse_dataset(strategy)
+    dataset_iter = iter(strategy.experimental_distribute_dataset(dataset))
+
+    # This is one way to force the enqueue in some control flow. @tf.functions
+    # aren't inlined in the calling tf.function. An alternative would be to
+    # place the enqueue in a switch_v2 or something similar.
+ @def_function.function + def enqueue_fn(features): + mid_level_api.enqueue(features, training=False) + + @def_function.function + def enqueue_with_outside_compilation(): + def get_activations(features): + enqueue_fn(features) + return mid_level_api.dequeue() + return strategy.run(get_activations, args=(next(dataset_iter),)) + + with self.assertRaisesRegex( + RuntimeError, + 'does not match graph which contains TPUReplicateContext'): + enqueue_with_outside_compilation() + + def test_enqueue_with_outside_compilation_non_direct_input(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + dataset = self._create_sparse_dataset(strategy) + dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + + @def_function.function + def enqueue_with_outside_compilation(): + def get_activations(features): + # This inserts a mul operation on the TPU to trigger the direct input + # error. + features = (features[0]*2, features[1]*2, features[2]*2) + mid_level_api.enqueue(features, training=False) + return mid_level_api.dequeue() + return strategy.run(get_activations, args=(next(dataset_iter),)) + + with self.assertRaisesRegex( + ValueError, 'which does not have the `_tpu_input_identity` attr'): + enqueue_with_outside_compilation() + + def test_enqueue_with_outside_compilation_auto_mode(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + dataset = self._create_sparse_dataset(strategy) + dataset_iter = iter(strategy.experimental_distribute_dataset(dataset)) + + @def_function.function + def enqueue_with_no_gradient_apply(data): + def get_activations(features): + # Note the lack of setting training=False, so training defaults to true + # here even though we don't have apply gradients. + # We detect the correct mode based on which ops exist that share the + # same 'name'. + mid_level_api.enqueue(features, name='call1') + return mid_level_api.dequeue(name='call1') + return strategy.run(get_activations, args=(data,)) + + @def_function.function + def enqueue_with_gradient_apply(data): + def get_activations(features): + mid_level_api.enqueue(features, name='call2') + activations = mid_level_api.dequeue(name='call2') + # Apply an all ones gradient + gradients = nest.map_structure(array_ops.ones_like, activations) + mid_level_api.apply_gradients(gradients, name='call2') + return activations + return strategy.run(get_activations, args=(data,)) + + data = next(dataset_iter) + before_gradient_apply = enqueue_with_gradient_apply(data) + after_gradient_apply = enqueue_with_no_gradient_apply(data) + before_gradient_apply0 = self._get_replica_numpy(before_gradient_apply, + strategy, 0) + after_gradient_apply0 = self._get_replica_numpy(after_gradient_apply, + strategy, 0) + + num_replicas = strategy.num_replicas_in_sync + # We are passing a gradient of 1 for all lookups, optimizer is SGD with a + # learning rate of 0.1. Feature 0 and 1 are looked up with a sum combiner + # with the following ids: + # Feature 0: [0, 0, 1], [0, 1, 1], ... repeated over num_replicas + # Feature 1: [0, 1, 1], [0, 0, 1], ... repeated over num_replicas + # i.e. Row 0 and 1 were looked up 3*num_replicas times over all cores and as + # the gradient is 1, the accumulated gradient is 3*num_replicas for each + # position in row 0 and 1 in table. + # + # See comments in test_pass_none_to_apply_gradients for the update to + # Feature 2 and its table. + # The *2 in the next tests are because those rows have 2 lookups vs + # the 1 lookup in the other row. 
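+    # For instance (illustration only, assuming 2 replicas): rows 0 and 1 of
+    # the videos table each accumulate a gradient of 3*2 = 6, so SGD with
+    # learning rate 0.1 lowers each of them by 0.6, and a sum-combined batch
+    # position that looks up two of those rows drops by 1.2.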
+ update = ([[0.3 * num_replicas], [0.3 * num_replicas * 2]], + [[0.3 * num_replicas * 2], [0.3 * num_replicas]], + [[0.1 * num_replicas], [0.1 / 3 * num_replicas]]) + golden = tuple([before - np.array(up) for before, up in + zip(before_gradient_apply0, update)]) + + self.assertAllClose(golden, after_gradient_apply0) + + def _create_strategy_and_mid_level(self, optimizer_name): + strategy = self._get_strategy() + + with strategy.scope(): + if optimizer_name == 'sgd': + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) + elif optimizer_name == 'adagrad': + optimizer = tpu_embedding_v2_utils.Adagrad(learning_rate=0.1) + elif optimizer_name == 'adam': + optimizer = tpu_embedding_v2_utils.Adam(learning_rate=0.1) + else: + raise ValueError('optimizer is not recognized: ', optimizer_name) + mid_level_api = self._create_mid_level(optimizer=optimizer) + + return strategy, mid_level_api, optimizer + + @parameterized.parameters( + *itertools.product( + ['sgd', 'adagrad', 'adam'], + [True, False])) + def test_embedding(self, optimizer_name, training): + strategy, mid_level_api, optimizer = ( + self._create_strategy_and_mid_level(optimizer_name)) + + dataset = self._create_sparse_dataset(strategy) + dist = strategy.experimental_distribute_dataset(dataset) + dist_iter = iter(dist) + + @def_function.function + def test_fn(): + + def step(): + """Create and run computation that returns the embedding activations.""" + if not training: + activations = mid_level_api.dequeue() + total_loss = _get_total_loss_tensor(activations) + ret_val = [total_loss] + list(activations) + return ret_val + else: + with backprop.GradientTape() as tape: + activations = mid_level_api.dequeue() + tape.watch(activations) + total_loss = _get_total_loss_tensor(activations) + loss_per_replica = total_loss / strategy.num_replicas_in_sync + gradients = tape.gradient(loss_per_replica, activations) + mid_level_api.apply_gradients(gradients) + ret_val = [total_loss] + list(activations) + return ret_val + + mid_level_api.enqueue(next(dist_iter), training=training) + result = strategy.run(step) + return result + + # Run model. + shard_out_val = test_fn() + + # Retrieve TPU weights to CPU. + mid_level_api._retrieve_variables() + + # Compute sparse tensors for global batch. + input_data = next(iter(self._create_sparse_dataset(strategy))) + + # Check results. + self._check_results(strategy, shard_out_val, training, input_data, + mid_level_api._variables, + optimizer) + + def _create_mid_level(self, optimizer=None): + # Create `TPUEmbedding` object. 
+ if optimizer is None: + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1) + + num_replicas = ( + distribution_strategy_context.get_strategy().num_replicas_in_sync) + return tpu_embedding_v2.TPUEmbedding( + feature_config=self.feature_config, + batch_size=self.batch_size * num_replicas, + optimizer=optimizer) + + def _create_sparse_dataset(self, strategy, include_weights=False, weight=0.5): + # Create dataset for enqueue operation + sparse_features = ( + sparse_tensor.SparseTensor( + indices=self.feature_watched_indices, + values=self.feature_watched_values, + dense_shape=[self.data_batch_size, 2]), + sparse_tensor.SparseTensor( + indices=self.feature_favorited_indices, + values=self.feature_favorited_values, + dense_shape=[self.data_batch_size, 2]), + sparse_tensor.SparseTensor( + indices=self.feature_friends_indices, + values=self.feature_friends_values, + dense_shape=[self.data_batch_size, 3])) + if include_weights: + weights = [] + for sparse in sparse_features: + values = ( + array_ops.ones_like(sparse.values, dtype=dtypes.float32) * weight) + weights.append(sparse_tensor.SparseTensor( + indices=sparse.indices, + values=values, + dense_shape=sparse.dense_shape)) + sparse_features = (sparse_features, tuple(weights)) + + dataset = dataset_ops.DatasetV2.from_tensors(sparse_features) + + # Data is batched to self.data_batch_size, rebatch to global batch size. + return dataset.unbatch().repeat().batch( + self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True) + + def _create_ragged_dataset(self, strategy, include_weights=False, weight=0.5): + # Create dataset for enqueue operation + ragged_features = ( + ragged_tensor.RaggedTensor.from_row_lengths( + row_lengths=self.feature_watched_row_lengths, + values=self.feature_watched_values), + ragged_tensor.RaggedTensor.from_row_lengths( + row_lengths=self.feature_favorited_row_lengths, + values=self.feature_favorited_values), + ragged_tensor.RaggedTensor.from_row_lengths( + row_lengths=self.feature_friends_row_lengths, + values=self.feature_friends_values)) + if include_weights: + weights = [] + for ragged in ragged_features: + weights.append(ragged.with_values( + array_ops.ones_like(ragged.values, dtype=dtypes.float32) * weight)) + ragged_features = (ragged_features, tuple(weights)) + + dataset = dataset_ops.DatasetV2.from_tensors(ragged_features) + + # Data is batched to self.data_batch_size, rebatch to global batch size. + return dataset.unbatch().repeat().batch( + self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True) + + def _create_dense_input_fn(self, strategy, include_weights=False, weight=0.5): + + def input_fn(ctx): + del ctx + features = ( + constant_op.constant(self.feature_watched_values[-2:], + dtype=dtypes.int32), + constant_op.constant(self.feature_favorited_values[-2:], + dtype=dtypes.int32), + constant_op.constant(self.feature_friends_values[-2:], + dtype=dtypes.int32)) + if include_weights: + weights = [array_ops.ones_like(t, dtype=dtypes.float32) * weight + for t in features] + features = (features, tuple(weights)) + return dataset_ops.DatasetV2.from_tensors(features).repeat() + + return input_fn + + def _check_results(self, strategy, shard_out_val, training, input_data, + table_to_variable, optimizer): + num_replicas = strategy.num_replicas_in_sync + + # Unpack the values `strategy.run()` returns. 
+ loss = _unpack(strategy, shard_out_val[0]) + activation_watched = _unpack(strategy, shard_out_val[1]) + activation_favorited = _unpack(strategy, shard_out_val[2]) + activation_friends = _unpack(strategy, shard_out_val[3]) + + # Core 0: + # Calculate the values of embedding activations. + activation_watched_gold0 = np.array([[0, 1, 2, 3], [4, 6, 8, 10]]) + activation_favorited_gold0 = np.array([[4, 6, 8, 10], [4, 5, 6, 7]]) + # Second row of `activation_friends_gold0` is the mean of the following. + # row 0: 0 1 + # row 1: 2 3 + # row 2: 4 5 + activation_friends_gold0 = np.array([[6, 7], [2, 3]]) + + loss_gold0 = _compute_loss(activation_watched_gold0, + activation_favorited_gold0, + activation_friends_gold0) + + # Add on values from other cores: + # Activations for watched are an alternating sequence of + # activation_watched_gold0 and activation_favorited_gold0. + # For favorited it is the same but in the opposite order. + activation_watched_gold = np.concatenate( + (np.concatenate((np.expand_dims(activation_watched_gold0, axis=0),) * + (num_replicas // 2)), + np.concatenate((np.expand_dims(activation_favorited_gold0, axis=0),) * + (num_replicas // 2))), + axis=1).reshape([self.batch_size * num_replicas, 4]) + activation_favorited_gold = np.concatenate( + (activation_watched_gold[self.batch_size:,], + activation_watched_gold[0:self.batch_size,])) + activation_friends_gold = np.concatenate( + (activation_friends_gold0,) * num_replicas) + + loss_gold = [loss_gold0] * num_replicas + + # Test values. + self.assertAllClose(activation_watched_gold, activation_watched) + self.assertAllClose(activation_favorited_gold, activation_favorited) + self.assertAllClose(activation_friends_gold, activation_friends) + + self.assertAllClose(loss_gold, loss) + + embedding_table_video_before = np.copy( + np.reshape(self.embedding_values, [8, 4])) + embedding_table_user_before = np.copy( + np.reshape(self.embedding_values, [16, 2])) + + global_batch_size = self.batch_size * num_replicas + if training: + gradient_wrt_watched_gold = (2 * activation_watched_gold / + global_batch_size) + gradient_wrt_favorited_gold = (2 * activation_favorited_gold / + global_batch_size) + gradient_wrt_friends_gold = (2 * activation_friends_gold / + global_batch_size) + + # Calculate gradients wrt embedding tables. 
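+      # _compute_gradients_wrt_embedding_table (defined below) scatters each
+      # example's activation gradient back onto the table rows it looked up
+      # (dividing by the lookup count for the 'mean' combiner) and returns one
+      # [vocabulary_size, dim] array per example; the check helpers then sum
+      # these over the batch before applying the optimizer update.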
+ gradients_wrt_user = ( + _compute_gradients_wrt_embedding_table( + global_batch_size, gradient_wrt_friends_gold, + embedding_table_user_before, input_data[2].indices.numpy(), + input_data[2].values.numpy(), self.table_user.combiner)) + gradients_wrt_video = ( + _compute_gradients_wrt_embedding_table( + global_batch_size, gradient_wrt_favorited_gold, + embedding_table_video_before, input_data[1].indices.numpy(), + input_data[1].values.numpy(), self.table_video.combiner) + + _compute_gradients_wrt_embedding_table( + global_batch_size, gradient_wrt_watched_gold, + embedding_table_video_before, input_data[0].indices.numpy(), + input_data[0].values.numpy(), self.table_video.combiner)) + + self._check_embedding_and_slot_variables(embedding_table_user_before, + gradients_wrt_user, + embedding_table_video_before, + gradients_wrt_video, + optimizer, + table_to_variable) + + def _check_embedding_and_slot_variables(self, embedding_table_user_before, + gradients_wrt_user, + embedding_table_video_before, + gradients_wrt_video, + optimizer, + table_to_variable): + if isinstance(optimizer, tpu_embedding_v2_utils.SGD): + check_fn = self._check_embedding_and_slot_variables_for_sgd + elif isinstance(optimizer, tpu_embedding_v2_utils.Adagrad): + check_fn = self._check_embedding_and_slot_variables_for_adagrad + elif isinstance(optimizer, tpu_embedding_v2_utils.Adam): + check_fn = self._check_embedding_and_slot_variables_for_adam + else: + raise ValueError('optimizer is not recognized: ', type(optimizer)) + check_fn(embedding_table_user_before, gradients_wrt_user, + optimizer, table_to_variable[self.table_user.name]) + check_fn(embedding_table_video_before, gradients_wrt_video, + optimizer, table_to_variable[self.table_video.name]) + + def _check_embedding_and_slot_variables_for_sgd(self, embedding_table_before, + gradients, + optimizer, + variables): + embedding_table = np.copy(embedding_table_before) + embedding_table -= optimizer.learning_rate * np.sum(gradients, axis=0) + self.assertAllClose(_get_variable(variables['parameters']).numpy(), + embedding_table) + + def _check_embedding_and_slot_variables_for_adagrad(self, + embedding_table_before, + gradients, + optimizer, + variable): + embedding_table = np.copy(embedding_table_before) + accumulator = ( + optimizer.initial_accumulator_value + np.sum(gradients, axis=0)**2) + embedding_table -= ( + optimizer.learning_rate * np.sum(gradients, axis=0) / + np.sqrt(accumulator)) + self.assertAllClose(_get_variable(variable['parameters']).numpy(), + embedding_table) + self.assertAllClose(_get_variable(variable['accumulators']).numpy(), + accumulator) + + def _check_embedding_and_slot_variables_for_adam(self, embedding_table_before, + gradients, + optimizer, + variable): + embedding_table = np.copy(embedding_table_before) + g = np.sum(gradients, axis=0) + v = g**2 * (1 - optimizer.beta_2) + m = g * (1 - optimizer.beta_1) + epsilon = optimizer.epsilon + # TPU Embeddings don't have the LR decay factor for Adam. 
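+    # (In standard Adam that factor would be the bias-correction term
+    # sqrt(1 - beta_2^t) / (1 - beta_1^t); reading the comment above that way
+    # is an interpretation, not something this test asserts.)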
+ lr_modifier = 1 + embedding_table -= ( + m * optimizer.learning_rate * lr_modifier / (np.sqrt(v) + epsilon)) + self.assertAllClose(_get_variable(variable['parameters']).numpy(), + embedding_table, rtol=1e-4) + self.assertAllClose(_get_variable(variable['momenta']).numpy(), + m, rtol=1e-4) + self.assertAllClose(_get_variable(variable['velocities']).numpy(), + v, rtol=1e-4) + + def _get_replica_numpy(self, structured, strategy, replica_id): + def select_replica(x): + x = strategy.experimental_local_results(x) + if len(x) == 1: + return x.numpy() + return x[replica_id].numpy() + return nest.map_structure(select_replica, structured) + + def test_dense_lookup(self): + strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd') + + input_fn = self._create_dense_input_fn(strategy) + dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist_iter = iter(dist) + + @def_function.function + def test_fn(): + def step(): + return mid_level_api.dequeue() + + mid_level_api.enqueue(next(dist_iter), training=False) + return strategy.run(step) + + # Run model. + shard0 = self._get_replica_numpy(test_fn(), strategy, 0) + + # embedding_values is a linear list, so we reshape to match the correct + # shape of the corresponding table before performing the lookup. + numpy_videos = np.reshape(self.embedding_values, (8, 4)) + numpy_users = np.reshape(self.embedding_values, (16, 2)) + golden = ((numpy_videos[self.feature_watched_values[-2:]], + numpy_videos[self.feature_favorited_values[-2:]], + numpy_users[self.feature_friends_values[-2:]])) + self.assertAllClose(shard0, golden) + + def test_variable_learning_rate(self): + num_steps = 10 + num_steps_float = float(num_steps) + starting_lr = 1.0 + ending_lr = 0.5 + + strategy = self._get_strategy() + num_replicas = strategy.num_replicas_in_sync + + # Create model with Keras. + with strategy.scope(): + step_counter = tf_variables.Variable(0.0, dtypes.float32) + + def lr_function(): + return gen_math_ops.maximum( + ending_lr, + starting_lr + ((ending_lr - starting_lr) * step_counter) / + num_steps_float) + + optimizer = tpu_embedding_v2_utils.SGD(learning_rate=lr_function) + table_config = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=num_replicas, + dim=4, + initializer=init_ops_v2.Constant(np.zeros((num_replicas, 4))), + combiner='sum', name='table') + mid_level_api = tpu_embedding_v2.TPUEmbedding( + feature_config={ + 'feature': tpu_embedding_v2_utils.FeatureConfig( + table=table_config, name='feature')}, + batch_size=num_replicas, + optimizer=optimizer) + + feature = {'feature': constant_op.constant([0], dtype=dtypes.int32)} + + def input_fn(ctx): + del ctx + return dataset_ops.DatasetV2.from_tensors(feature).repeat() + dist = strategy.experimental_distribute_datasets_from_function(input_fn) + dist_iter = iter(dist) + + @def_function.function + def test_fn(): + def step(): + with backprop.GradientTape() as tape: + activations = mid_level_api.dequeue() + tape.watch(activations) + result = math_ops.reduce_sum(activations['feature']) + loss = result / num_replicas + grads = tape.gradient(loss, activations) + mid_level_api.apply_gradients(grads) + return activations['feature'] + + mid_level_api.enqueue(next(dist_iter), training=True) + return strategy.run(step) + + # Run model. + results = [] + for _ in range(num_steps): + result = test_fn() + results.append(_unpack(strategy, result)) + step_counter.assign_add(1.0) + + # Table is 2 elements wide, per-replica batch size of 1, with id 0. 
+    # Loss for the gradient is the sum of the entries divided by the number of
+    # replicas. Thus the per replica gradient is 1/#of replicas for row 0 and no
+    # other updates. The reduced gradient is therefore 1.
+    # Learning rate schedule over num_steps steps:
+    # 1.0 0.95 0.9 0.85 0.8 ...
+    # Since we use SGD and the gradient is one, the first row of the table is
+    # [0, 0] [-1.0, -1.0] [-1.95, -1.95] [-2.85, -2.85] ... (the negative
+    # partial sums of the above).
+
+    learning_rates = [starting_lr - (starting_lr - ending_lr) / num_steps * j
+                      for j in range(num_steps)]
+    cumsum = [sum(learning_rates[0:j]) for j in range(num_steps)]
+    goldens = [[[-cumsum[i]] * table_config.dim] * num_replicas
+               for i in range(10)]
+    self.assertAllClose(results, goldens)
+
+  @parameterized.parameters([True, False])
+  def test_optimizer_with_slot_creation_fn(self, use_tpu):
+    def slot_creation_fn(table, slot_names):
+      slots = {}
+      for slot in slot_names:
+        slots[slot] = tf_variables.Variable(
+            name='{}_{}'.format(table.name, slot),
+            initial_value=functools.partial(
+                init_ops_v2.Zeros(), shape=table.shape, dtype=dtypes.float32),
+            trainable=False)
+      return slots
+    optimizer = tpu_embedding_v2_utils.Adagrad(
+        learning_rate=0.1,
+        slot_variable_creation_fn=slot_creation_fn)
+    if use_tpu:
+      strategy = self._get_strategy()
+    else:
+      strategy = distribution_strategy_context.get_strategy()
+    num_replicas = strategy.num_replicas_in_sync
+    with strategy.scope():
+      mid_level = tpu_embedding_v2.TPUEmbedding(
+          feature_config=self.feature_config,
+          batch_size=self.batch_size * num_replicas,
+          optimizer=optimizer)
+      video_accumulator = mid_level._variables['video']['accumulators']
+      user_accumulator = mid_level._variables['user']['accumulators']
+    if use_tpu:
+      # To check the table contents (ensure that it is zero rather than the
+      # normal initial accumulator value specified in the optimizer config),
+      # we need to select the underlying table variable on TPU.
+      # We only have one shard on Forge.
+      video_accumulator = video_accumulator.variables[0]
+      user_accumulator = user_accumulator.variables[0]
+
+    self.assertAllClose(video_accumulator.numpy(),
+                        np.zeros((self.table_video.vocabulary_size,
+                                  self.table_video.dim)))
+    self.assertAllClose(user_accumulator.numpy(),
+                        np.zeros((self.table_user.vocabulary_size,
+                                  self.table_user.dim)))
+
+  def test_optimizer_with_slot_creation_fn_non_partial(self):
+    def slot_creation_fn(table, slot_names):
+      slots = {}
+      for slot in slot_names:
+        # Note that we don't pass functools.partial here, so on TPU we can't
+        # extract the shape. We expect the error below.
+        slots[slot] = tf_variables.Variable(
+            name='{}_{}'.format(table.name, slot),
+            initial_value=init_ops_v2.Zeros()(shape=table.shape,
+                                              dtype=dtypes.float32),
+            trainable=False)
+      return slots
+    optimizer = tpu_embedding_v2_utils.Adagrad(
+        learning_rate=0.1,
+        slot_variable_creation_fn=slot_creation_fn)
+    strategy = self._get_strategy()
+    num_replicas = strategy.num_replicas_in_sync
+    with strategy.scope():
+      with self.assertRaisesRegex(ValueError,
+                                  'Unable to extract initializer function'):
+        tpu_embedding_v2.TPUEmbedding(
+            feature_config=self.feature_config,
+            batch_size=self.batch_size*num_replicas,
+            optimizer=optimizer)
+
+  def test_sequence_embeddings(self):
+    feature_config = (
+        tpu_embedding_v2_utils.FeatureConfig(
+            table=self.table_video, name='watched',
+            max_sequence_length=2),
+        tpu_embedding_v2_utils.FeatureConfig(
+            table=self.table_video, name='favorited',
+            max_sequence_length=2),
+        tpu_embedding_v2_utils.FeatureConfig(
+            table=self.table_user, name='friends',
+            max_sequence_length=3))
+    optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
+    strategy = self._get_strategy()
+    num_replicas = strategy.num_replicas_in_sync
+    with strategy.scope():
+      mid_level = tpu_embedding_v2.TPUEmbedding(
+          feature_config=feature_config,
+          batch_size=self.batch_size * num_replicas,
+          optimizer=optimizer)
+
+    dataset = self._create_sparse_dataset(strategy)
+    data = next(iter(strategy.experimental_distribute_dataset(dataset)))
+
+    @def_function.function
+    def embedding_and_set_gradients(data):
+      def tpu_fn():
+        activations = mid_level.dequeue()
+        mid_level.apply_gradients(nest.map_structure(array_ops.ones_like,
+                                                     activations))
+        return activations
+      mid_level.enqueue(data)
+      return strategy.run(tpu_fn)
+
+    @def_function.function
+    def embedding_only(data):
+      def tpu_fn():
+        return mid_level.dequeue()
+      mid_level.enqueue(data)
+      return strategy.run(tpu_fn)
+
+    # Only check core 0.
+    before_update = self._get_replica_numpy(
+        embedding_and_set_gradients(data), strategy, 0)
+    after_update = self._get_replica_numpy(embedding_only(data), strategy, 0)
+
+    # For videos table, row 0 and row 1 are looked up 3*num_replicas times as
+    # they occur 3 times per replica (considering the features 0 and 1 which are
+    # both looked up in the videos table).
+    # Feature 0 has ids [0, 0, 1], [0, 1, 1], ... repeated over num_replicas
+    # Feature 1 has ids [0, 1, 1], [0, 0, 1], ... repeated over num_replicas
+    # This means that both rows 0 and 1 get a -0.1*3*num_replicas update
+    # For users table, each row is looked up twice:
+    # Feature 2 has ids [3, 0, 1, 2], .. repeated over num_replicas
+    # This means that we get a -0.1*num_replicas update to the third feature.
+
+    # In general this means that after the update, if we lookup feature 0 and 1
+    # the values will be 0.3*num_replicas lower per entry and for feature 2 they
+    # will be 0.1*num_replicas lower.
+    # The one issue is that these lookups contain padding values.
+    # For core 0, we get the first 2 elements of the 4 element batch.
+    # For feature 0, the indices are [[0, 0], [1, 0], [1, 1]] with max sequence
+    # length of 2, which means that [0, 1] will be 0s.
+    # For feature 1, the indices are [[0, 0], [0, 1], [1, 0]] with max sequence
+    # length of 2, which means that [1, 1] will be 0s.
+    # For feature 2, the indices are [[0, 0], [1, 0], [1, 1], [1, 2]] with max
+    # sequence length of 3, which means that [0, 1], [0, 2] will be 0s.
+ # The following masks represent that so that we only apply the above updates + # to the non-padding rows: + masks = ( + np.array([[[1], [0]], [[1], [1]]]), + np.array([[[1], [1]], [[1], [0]]]), + np.array([[[1], [0], [0]], [[1], [1], [1]]])) + + per_row_update = (0.3 * num_replicas, + 0.3 * num_replicas, + 0.1 * num_replicas) + golden = tuple([before - update * mask for before, update, mask in + zip(before_update, per_row_update, masks)]) + self.assertAllClose(golden, after_update) + + +def _compute_gradients_wrt_embedding_table(batch_size, + gradient_wrt_activation, + embedding_table, + feature_indices, + feature_values, + combiner, + max_sequence_length=0): + """Compute gradients wrt embedding_table. + + Args: + batch_size: `int`, batch size. + gradient_wrt_activation: `np.array` with shape `batch_size` by + embedding `dimension`. + embedding_table: `np.array` with shape `vocabulary_size` by embedding + `dimension`. + feature_indices: `indices` as used to construct `SparseTensor`. + feature_values: `values` as used to construct `SparseTensor`. + combiner: `String`, 'mean' or 'sum'. + max_sequence_length: If non-zero, a sequence feature with the given length. + + Returns: + Gradients wrt `embedding_table`, an `np.array`s with shape + `batch_size` by `vocabulary_size` by + embedding `dimension`. + + Raises: + ValueError: if `combiner` is not one of 'mean' or 'sum'. + """ + if combiner not in ('mean', 'sum'): + raise ValueError('`combiner` must be mean or sum; got {}.'.format(combiner)) + grads = [] + for i in range(batch_size): + grad = np.zeros_like(embedding_table) + count = 0 + for (batch_i, seq_index), vocabulary_id in zip(feature_indices, + feature_values): + if batch_i == i: + count += 1 + if max_sequence_length > 0: + if seq_index < max_sequence_length: + grad[vocabulary_id, :] += gradient_wrt_activation[i, seq_index, :] + else: + grad[vocabulary_id, :] += gradient_wrt_activation[i, :] + if combiner == 'mean' and not max_sequence_length: + grad = grad / count + grads.append(grad) + return np.stack(grads) + + +def _unpack(strategy, per_replica_output): + per_replica_output = strategy.experimental_local_results(per_replica_output) + per_replica_output = array_ops.concat(per_replica_output, axis=0).numpy() + return per_replica_output + + +def _get_total_loss_tensor(activations): + losses = [] + for activation in activations: + losses.append( + math_ops.reduce_mean( + math_ops.reduce_sum( + gen_math_ops.squared_difference(activation, 0), 1))) + total_loss = array_ops.expand_dims_v2(sum(losses), 0) + return total_loss + + +def _compute_loss(activation_watched, activation_favorited, activation_friends): + watched_loss = np.mean(np.sum(activation_watched**2, axis=1)) + if len(activation_favorited.shape) == 2: + favorited_loss = np.mean(np.sum(activation_favorited**2, axis=1)) + else: + favorited_loss = np.mean(np.sum(activation_favorited**2, axis=(1, 2))) + if len(activation_friends.shape) == 2: + friends_loss = np.mean(np.sum(activation_friends**2, axis=1)) + else: + friends_loss = np.mean(np.sum(activation_friends**2, axis=(1, 2))) + loss = watched_loss + favorited_loss + friends_loss + return loss + + +def _get_tmpdir(name, subdir=''): + segments = [FLAGS.model_dir, name] + ([subdir] if subdir else []) + return os.path.join(*segments) + + +def _get_variable(variable): + if isinstance(variable, tpu_embedding_v2.TPUShardedVariable): + return variable.variables[0] + return variable + + +if __name__ == '__main__': + v2_compat.enable_v2_behavior() + test.main() From 
bfc25531730e9cc100b6ab22f1cff96e6aa3046a Mon Sep 17 00:00:00 2001
From: Prakalp Srivastava
Date: Mon, 1 Jun 2020 13:49:26 -0700
Subject: [PATCH 1488/1533] Add support for tf.Case op in Resource Op Lifting
 pass.

PiperOrigin-RevId: 314196123
Change-Id: I8eef7aad5b6da7cd69b821d92f940a8f08fef08c
---
 .../tensorflow/tests/resource_op_lifting.mlir |  57 ++++++++-
 .../transforms/resource_op_lifting.cc         | 120 +++++++++++-------
 2 files changed, 127 insertions(+), 50 deletions(-)

diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir
index b19033ce5b5..2353dc5a7a8 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir
@@ -406,6 +406,61 @@ func @while_cond(%arg0: tensor<*x!tf.resource<tensor<f32>>>) -> tensor<f32> {

 // -----

+// CHECK: func @cluster_with_case(%[[ARG0:.*]]: tensor<i32>) -> tensor<4xf32>
+func @cluster_with_case(%arg0: tensor<i32>) -> tensor<4xf32> {
+  // CHECK: %[[VH0:.*]] = "tf.VarHandleOp"()
+  %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource<tensor<4xf32>>>
+  // CHECK: %[[VH1:.*]] = "tf.VarHandleOp"()
+  %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource<tensor<4xf32>>>
+  // CHECK-DAG: %[[READ0:.*]] = "tf.ReadVariableOp"(%[[VH0]])
+  // CHECK-DAG: %[[READ1:.*]] = "tf.ReadVariableOp"(%[[VH1]])
+  // CHECK: %[[CLUSTER:.*]]:2 = "tf_device.cluster"()
+  %2 = "tf_device.cluster"() ( {
+    // CHECK: %[[CASE:.*]]:2 = "tf.Case"(%[[ARG0]], %[[READ0]], %[[READ1]])
+    %3:2 = "tf.Case"(%arg0, %0, %1) {branches = [@branch_0, @branch_1, @branch_2]}
+      : (tensor<i32>, tensor<*x!tf.resource<tensor<4xf32>>>, tensor<*x!tf.resource<tensor<4xf32>>>)
+      -> (tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>)
+    // CHECK-NEXT: %[[ADD:.*]] = "tf.AddV2"(%[[CASE]]#1, %[[CASE]]#0)
+    %4 = "tf.ReadVariableOp"(%3#0) : (tensor<*x!tf.resource<tensor<4xf32>>>) -> tensor<4xf32>
+    %5 = "tf.AddV2"(%4, %3#1) : (tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32>
+    // CHECK-NEXT: tf_device.return %[[ADD]], %[[CASE]]#1
+    tf_device.return %5 : tensor<4xf32>
+  // CHECK: {cluster_attr = "cluster_attr"} : () -> (tensor<4xf32>, tensor<4xf32>)
+  }) {cluster_attr = "cluster_attr"} : () -> tensor<4xf32>
+  // CHECK: "tf.AssignVariableOp"(%[[VH0]], %[[CLUSTER]]#1)
+  // CHECK: return %[[CLUSTER]]#0
+  return %2 : tensor<4xf32>
+}
+// CHECK: func @branch_0(%[[TARG0:.*]]: tensor<4xf32>, %[[TARG1:.*]]: tensor<4xf32>)
+func @branch_0(%arg0: tensor<*x!tf.resource<tensor<4xf32>>>, %arg1: tensor<*x!tf.resource<tensor<4xf32>>>)
+    -> (tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>) {
+  // CHECK-NEXT: %[[CONST:.*]] = "tf.Const"()
+  %constant = "tf.Const"() {value = dense<0.0> : tensor<4xf32>} : () -> tensor<4xf32>
+  "tf.AssignVariableOp"(%arg0, %constant) : (tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>) -> ()
+  // CHECK-NEXT: return %[[CONST]], %[[CONST]]
+  return %arg0, %constant : tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>
+}
+// CHECK: func @branch_1(%[[EARG0:.*]]: tensor<4xf32>, %[[EARG1:.*]]: tensor<4xf32>)
+func @branch_1(%arg0: tensor<*x!tf.resource<tensor<4xf32>>>, %arg1: tensor<*x!tf.resource<tensor<4xf32>>>)
+    -> (tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>) {
+  %id = "tf.Identity"(%arg1) : (tensor<*x!tf.resource<tensor<4xf32>>>) -> tensor<*x!tf.resource<tensor<4xf32>>>
+  %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource<tensor<4xf32>>>) -> tensor<4xf32>
+  "tf.AssignVariableOp"(%arg0, %read) : (tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>) -> ()
+  // CHECK-NEXT: return %[[EARG1]], %[[EARG1]]
+  return %arg0, %read : tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>
+}
+// CHECK: func @branch_2(%[[EARG0:.*]]: tensor<4xf32>, %[[EARG1:.*]]: tensor<4xf32>)
+func @branch_2(%arg0: tensor<*x!tf.resource<tensor<4xf32>>>, %arg1: tensor<*x!tf.resource<tensor<4xf32>>>)
+    -> (tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>) {
+  %id = "tf.Identity"(%arg1) : (tensor<*x!tf.resource<tensor<4xf32>>>) -> tensor<*x!tf.resource<tensor<4xf32>>>
+  %read = "tf.ReadVariableOp"(%id) : (tensor<*x!tf.resource<tensor<4xf32>>>) -> tensor<4xf32>
+  "tf.AssignVariableOp"(%arg0, %read) : (tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>) -> ()
+  // CHECK-NEXT: return %[[EARG1]], %[[EARG1]]
+  return %arg0, %read : tensor<*x!tf.resource<tensor<4xf32>>>, tensor<4xf32>
+}
+
+// -----
+
 // Tests that pass lifts resource reads from if branches.

 // CHECK: func @cluster_with_if(%[[ARG0:.*]]: tensor<i1>) -> tensor<4xf32>
@@ -524,7 +579,7 @@ func @cluster_with_if(%arg0: tensor<i1>) -> tensor<4xf32> {
   %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource<tensor<4xf32>>>
   %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v2"} : () -> tensor<*x!tf.resource<tensor<4xf32>>>
   %2 = "tf_device.cluster"() ( {
-    // expected-error @+1 {{unsupported tf.IfOp output: resource does not alias a single input.}}
+    // expected-error @+1 {{unsupported output: resource does not alias a single input}}
     %3 = "tf.If"(%arg0, %0, %1) {then_branch = @if_then, else_branch = @if_else, is_stateless = false}
       : (tensor<i1>, tensor<*x!tf.resource<tensor<4xf32>>>,
          tensor<*x!tf.resource<tensor<4xf32>>>)
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc
index c1e5241a1f0..ed7ebc25c9f 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc
@@ -668,67 +668,74 @@ LogicalResult HandleWhileLoop(TF::WhileOp while_op, FuncOp body, FuncOp cond) {
   return success();
 }

-// Lifts loads/stores from an IfOp's branches.
-LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch,
-                         FuncOp else_branch) {
+// Lifts loads/stores from an IfOp or CaseOp's branches.
+template <class CaseOrIfOp>
+LogicalResult HandleCaseOrIfOp(CaseOrIfOp op, ArrayRef<FuncOp> branches) {
   // Remove identity nodes to avoid aliasing.
-  RemoveIdentity(&then_branch.front());
-  RemoveIdentity(&else_branch.front());
+  for (auto func : branches) RemoveIdentity(&func.front());
+
   // Sanity check: branch return of resources should be aliases of inputs. If
   // so, replace the output uses with the input so that we can remove these
   // outputs.
-  for (auto entry : llvm::enumerate(
-           llvm::zip(then_branch.front().getTerminator()->getOperands(),
-                     else_branch.front().getTerminator()->getOperands()))) {
-    auto then_retval = std::get<0>(entry.value());
-    auto else_retval = std::get<1>(entry.value());
-    assert(then_retval.getType() == else_retval.getType());
-    if (!getElementTypeOrSelf(then_retval.getType()).isa<TF::ResourceType>()) {
+  for (OpResult result : op.getResults()) {
+    if (!getElementTypeOrSelf(result.getType()).isa<TF::ResourceType>())
       continue;
+    unsigned result_index = result.getResultNumber();
+    constexpr unsigned kUnassigned = -1;
+    unsigned common_aliasing_arg_num = kUnassigned;
+    for (auto func : branches) {
+      auto retval = func.front().getTerminator()->getOperand(result_index);
+      assert(result.getType() == retval.getType());
+      auto aliasing_arg = retval.dyn_cast<BlockArgument>();
+      if (common_aliasing_arg_num == kUnassigned)
+        common_aliasing_arg_num = aliasing_arg.getArgNumber();
+      if (!aliasing_arg ||
+          aliasing_arg.getArgNumber() != common_aliasing_arg_num)
+        return op.emitOpError("unsupported output: ")
+               << "resource does not alias a single input";
     }
-    auto then_aliasing_arg = then_retval.dyn_cast<BlockArgument>();
-    auto else_aliasing_arg = else_retval.dyn_cast<BlockArgument>();
-    if (!then_aliasing_arg || !else_aliasing_arg ||
-        then_aliasing_arg.getArgNumber() != else_aliasing_arg.getArgNumber()) {
-      return if_op.emitOpError("unsupported tf.IfOp output: ")
-             << "resource does not alias a single input.";
-    }
-    if_op.getResult(entry.index())
-        .replaceAllUsesWith(
-            if_op.getOperand(then_aliasing_arg.getArgNumber() + 1));
+    assert(common_aliasing_arg_num != kUnassigned);
+    result.replaceAllUsesWith(op.getOperand(common_aliasing_arg_num + 1));
   }
+
   // Erase the resource outputs from the branches.
   int64_t non_resource_results = 0;
   llvm::SmallVector<int64_t, 8> old_to_new_output_indices;
   bool output_removed = false;
-  for (auto result : if_op.getResults()) {
-    if (!getElementTypeOrSelf(result.getType()).isa<TF::ResourceType>()) {
+  for (auto result : op.getResults()) {
+    if (!getElementTypeOrSelf(result.getType())
+            .template isa<TF::ResourceType>()) {
      old_to_new_output_indices.push_back(non_resource_results++);
      continue;
    }
    old_to_new_output_indices.push_back(-1);
-    then_branch.front().getTerminator()->eraseOperand(non_resource_results);
-    else_branch.front().getTerminator()->eraseOperand(non_resource_results);
+    for (auto func : branches)
+      func.front().getTerminator()->eraseOperand(non_resource_results);
     output_removed = true;
   }

-  llvm::SmallDenseMap<int64_t, ResourceArgUseInfo> then_use_info;
-  llvm::SmallDenseMap<int64_t, ResourceArgUseInfo> else_use_info;
-  if (failed(FindResourceArgUseInfo(then_branch, &then_use_info)) ||
-      failed(FindResourceArgUseInfo(else_branch, &else_use_info))) {
+  llvm::SmallDenseMap<int64_t, ResourceArgUseInfo> resource_arg_uses;
+  if (failed(FindResourceArgUseInfo(branches.front(), &resource_arg_uses)))
     return failure();
+
+  for (auto func : branches.drop_front()) {
+    llvm::SmallDenseMap<int64_t, ResourceArgUseInfo> branch_use_info;
+    if (failed(FindResourceArgUseInfo(func, &branch_use_info)))
+      return failure();
+    // A resource is considered used as long as it is used in either branch.
+    resource_arg_uses =
+        MergeArgResourceUseInfo(resource_arg_uses, branch_use_info);
   }
-  // A resource is considered used as long as it is used in either branch.
-  auto resource_arg_uses =
-      MergeArgResourceUseInfo(then_use_info, else_use_info);
+
   if (resource_arg_uses.empty() && !output_removed) return success();
   // Remove unused resources in functions.
   llvm::SmallDenseMap<int64_t, Type> remaining_resource_data_types;
   RemoveUnusedResourceArgumentsAndForwardedRetvals(
-      resource_arg_uses, then_branch, /*old_to_new_arg_indices=*/nullptr,
+      resource_arg_uses, branches.front(), /*old_to_new_arg_indices=*/nullptr,
       &remaining_resource_data_types);
-  RemoveUnusedResourceArgumentsAndForwardedRetvals(resource_arg_uses,
-                                                   else_branch);
+  for (auto func : branches.drop_front())
+    RemoveUnusedResourceArgumentsAndForwardedRetvals(resource_arg_uses, func);
+
   // Forward resource inputs updated in any branch to the outputs of both
   // branches. First prepare the mapping from arg to new update output.
   llvm::SmallDenseMap<int64_t, int64_t> resource_arg_to_new_output;
@@ -746,10 +753,11 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch,
           new_output_index;
     }
   }
+
   // Append resource updates to the return ops: now they are just forwarded
   // input resources, but will be replaced by the data value in
   // LiftArgRetResourcesForFunction().
-  for (auto branch : {then_branch, else_branch}) {
+  for (auto branch : branches) {
     auto new_retvals =
         llvm::to_vector<4>(branch.front().getTerminator()->getOperands());
     for (const auto& entry : resource_arg_to_new_output) {
@@ -766,16 +774,17 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch,
     });
   }

-  // Recreate the if op.
-  OpBuilder builder(if_op);
+  // Recreate the op without resource operands.
+  OpBuilder builder(op);
   // Now use the filtered original operands, which will be replaced by
   // AddLoadsStoresOutsideControlFlowOp().
   auto new_operands =
-      FilterRange<Value, OperandRange>(if_op.input(), resource_arg_uses);
-  new_operands.insert(new_operands.begin(), if_op.cond());
-  auto new_if = builder.create<TF::IfOp>(if_op.getLoc(),
-                                         then_branch.getType().getResults(),
-                                         new_operands, if_op.getAttrs());
+      FilterRange<Value, OperandRange>(op.input(), resource_arg_uses);
+  new_operands.insert(new_operands.begin(), op.getOperand(0));
+  FuncOp first_func = branches.front();
+  auto new_op =
+      builder.create<CaseOrIfOp>(op.getLoc(), first_func.getType().getResults(),
+                                 new_operands, op.getAttrs());
   // Prepare for AddLoadsStoresOutsideControlFlowOp()
   llvm::SmallDenseMap<int64_t, std::pair<Type, int64_t>>
      arg_data_type_and_updated_output_index;
@@ -787,16 +796,16 @@ LogicalResult HandleIfOP(TF::IfOp if_op, FuncOp then_branch,
    arg_data_type_and_updated_output_index[entry.getFirst() + 1] = {
        entry.getSecond(), update_index};
  }
-  AddLoadsStoresOutsideControlFlowOp(new_if,
+  AddLoadsStoresOutsideControlFlowOp(new_op,
                                      arg_data_type_and_updated_output_index);
   // Replace uses.
   for (int64_t i = 0; i < old_to_new_output_indices.size(); ++i) {
     if (old_to_new_output_indices[i] >= 0) {
-      if_op.getResult(i).replaceAllUsesWith(
-          new_if.getResult(old_to_new_output_indices[i]));
+      op.getResult(i).replaceAllUsesWith(
+          new_op.getResult(old_to_new_output_indices[i]));
     }
   }
-  if_op.erase();
+  op.erase();
   return success();
 }

@@ -985,7 +994,20 @@ LogicalResult HoistForFunctionalControlFlow(
                                     lifted_partitioned_call_callees);
     HoistForFunctionalControlFlow(&else_branch.front(), module,
                                   lifted_partitioned_call_callees);
-    if (failed(HandleIfOP(if_op, then_branch, else_branch))) return failure();
+    if (failed(HandleCaseOrIfOp(if_op, {then_branch, else_branch})))
+      return failure();
+  } else if (auto case_op = llvm::dyn_cast<TF::CaseOp>(&op)) {
+    SmallVector<FuncOp, 4> branch_functions;
+    branch_functions.reserve(case_op.branches().size());
+    for (const Attribute& branch : case_op.branches()) {
+      FuncOp func =
+          module.lookupSymbol<FuncOp>(branch.cast<FlatSymbolRefAttr>());
+      // Recursively handle the nested control flow.
+ HoistForFunctionalControlFlow(&func.front(), module, + lifted_partitioned_call_callees); + branch_functions.push_back(func); + } + if (failed(HandleCaseOrIfOp(case_op, branch_functions))) return failure(); } else if (auto call_op = llvm::dyn_cast(&op)) { if (!call_op.f().isa()) { return call_op.emitOpError( From 15626c4e8d6b086d45c180e18f17483b9b46a5f4 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Mon, 1 Jun 2020 13:50:56 -0700 Subject: [PATCH 1489/1533] Update v1 only test with proper reason. Also fix existing warning wrt deprecated assertion methods. PiperOrigin-RevId: 314196442 Change-Id: Ifab24cb9519b093bcf41c39726ed5a4fe6350576 --- tensorflow/python/training/supervisor_test.py | 72 +++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/tensorflow/python/training/supervisor_test.py b/tensorflow/python/training/supervisor_test.py index fa0f89f3aa2..0529cff1697 100644 --- a/tensorflow/python/training/supervisor_test.py +++ b/tensorflow/python/training/supervisor_test.py @@ -110,7 +110,7 @@ class SupervisorTest(test.TestCase): with ops.Graph().as_default(): my_op = constant_op.constant(1.0) sv = supervisor.Supervisor(logdir=logdir) - with sv.managed_session("") as sess: + with sv.managed_session(""): for _ in xrange(10): self.evaluate(my_op) # Supervisor has been stopped. @@ -170,7 +170,7 @@ class SupervisorTest(test.TestCase): "", close_summary_writer=True, start_standard_services=False) as sess: sv.summary_computed(sess, sess.run(summ)) event_paths = sorted(glob.glob(os.path.join(logdir, "event*"))) - self.assertEquals(2, len(event_paths)) + self.assertEqual(2, len(event_paths)) # The two event files should have the same contents. for path in event_paths: # The summary iterator should report the summary once as we closed the @@ -178,7 +178,7 @@ class SupervisorTest(test.TestCase): rr = summary_iterator.summary_iterator(path) # The first event should list the file_version. ev = next(rr) - self.assertEquals("brain.Event:2", ev.file_version) + self.assertEqual("brain.Event:2", ev.file_version) # The next one has the graph and metagraph. ev = next(rr) @@ -198,7 +198,7 @@ class SupervisorTest(test.TestCase): # The next one should be a stop message if we closed cleanly. ev = next(rr) - self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status) + self.assertEqual(event_pb2.SessionLog.STOP, ev.session_log.status) # We should be done. with self.assertRaises(StopIteration): @@ -227,7 +227,7 @@ class SupervisorTest(test.TestCase): rr = _summary_iterator(logdir) # The first event should list the file_version. ev = next(rr) - self.assertEquals("brain.Event:2", ev.file_version) + self.assertEqual("brain.Event:2", ev.file_version) # The next one has the graph. ev = next(rr) @@ -360,7 +360,7 @@ class SupervisorTest(test.TestCase): # The first event should list the file_version. ev = next(rr) - self.assertEquals("brain.Event:2", ev.file_version) + self.assertEqual("brain.Event:2", ev.file_version) # The next one has the graph. ev = next(rr) @@ -385,7 +385,7 @@ class SupervisorTest(test.TestCase): # The next one should be a stop message if we closed cleanly. ev = next(rr) - self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status) + self.assertEqual(event_pb2.SessionLog.STOP, ev.session_log.status) # We should be done. 
self.assertRaises(StopIteration, lambda: next(rr)) @@ -421,7 +421,7 @@ class SupervisorTest(test.TestCase): with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"): sv.summary_computed(sess, sess.run(summ)) - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testLogdirButExplicitlyNoSummaryWriter(self): logdir = self._test_dir("explicit_no_summary_writer") with ops.Graph().as_default(): @@ -460,7 +460,7 @@ class SupervisorTest(test.TestCase): # The first event should list the file_version. ev = next(rr) - self.assertEquals("brain.Event:2", ev.file_version) + self.assertEqual("brain.Event:2", ev.file_version) # The next one has the graph. ev = next(rr) @@ -486,7 +486,7 @@ class SupervisorTest(test.TestCase): # The next one should be a stop message if we closed cleanly. ev = next(rr) - self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status) + self.assertEqual(event_pb2.SessionLog.STOP, ev.session_log.status) # We should be done. self.assertRaises(StopIteration, lambda: next(rr)) @@ -507,7 +507,7 @@ class SupervisorTest(test.TestCase): sv = supervisor.Supervisor(logdir="", session_manager=sm) sv.prepare_or_wait_for_session("") - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testInitOp(self): logdir = self._test_dir("default_init_op") with ops.Graph().as_default(): @@ -517,7 +517,7 @@ class SupervisorTest(test.TestCase): self.assertAllClose([1.0, 2.0, 3.0], sess.run(v)) sv.stop() - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testInitFn(self): logdir = self._test_dir("default_init_op") with ops.Graph().as_default(): @@ -531,7 +531,7 @@ class SupervisorTest(test.TestCase): self.assertAllClose([1.0, 2.0, 3.0], sess.run(v)) sv.stop() - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testInitOpWithFeedDict(self): logdir = self._test_dir("feed_dict_init_op") with ops.Graph().as_default(): @@ -545,7 +545,7 @@ class SupervisorTest(test.TestCase): self.assertAllClose([1.0, 2.0, 3.0], sess.run(v)) sv.stop() - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testReadyForLocalInitOp(self): server = server_lib.Server.create_local_server() logdir = self._test_dir("default_ready_for_local_init_op") @@ -588,7 +588,7 @@ class SupervisorTest(test.TestCase): sv0.stop() sv1.stop() - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testReadyForLocalInitOpRestoreFromCheckpoint(self): server = server_lib.Server.create_local_server() logdir = self._test_dir("ready_for_local_init_op_restore") @@ -660,7 +660,7 @@ class SupervisorTest(test.TestCase): # This shouldn't add a variable to the VARIABLES collection responsible # for variables that are saved/restored from checkpoints. - self.assertEquals(len(variables.global_variables()), 0) + self.assertEqual(len(variables.global_variables()), 0) # Suppress normal variable inits to make sure the local one is # initialized via local_init_op. @@ -681,7 +681,7 @@ class SupervisorTest(test.TestCase): collections=[ops.GraphKeys.LOCAL_VARIABLES]) # This shouldn't add a variable to the VARIABLES collection responsible # for variables that are saved/restored from checkpoints. 
- self.assertEquals(len(variables.global_variables()), 0) + self.assertEqual(len(variables.global_variables()), 0) # Suppress normal variable inits to make sure the local one is # initialized via local_init_op. @@ -720,7 +720,7 @@ class SupervisorTest(test.TestCase): "Variables not initialized: w"): sv.prepare_or_wait_for_session(server.target) - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testSetupFail(self): logdir = self._test_dir("setup_fail") with ops.Graph().as_default(): @@ -731,17 +731,17 @@ class SupervisorTest(test.TestCase): variables.VariableV1([1.0, 2.0, 3.0], name="v") supervisor.Supervisor(logdir=logdir, is_chief=False) - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testDefaultGlobalStep(self): logdir = self._test_dir("default_global_step") with ops.Graph().as_default(): variables.VariableV1(287, name="global_step") sv = supervisor.Supervisor(logdir=logdir) sess = sv.prepare_or_wait_for_session("") - self.assertEquals(287, sess.run(sv.global_step)) + self.assertEqual(287, sess.run(sv.global_step)) sv.stop() - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testRestoreFromMetaGraph(self): logdir = self._test_dir("restore_from_meta_graph") with ops.Graph().as_default(): @@ -756,14 +756,14 @@ class SupervisorTest(test.TestCase): self.assertIsNotNone(new_saver) sv2 = supervisor.Supervisor(logdir=logdir, saver=new_saver) sess = sv2.prepare_or_wait_for_session("") - self.assertEquals(1, sess.run("v0:0")) + self.assertEqual(1, sess.run("v0:0")) sv2.saver.save(sess, sv2.save_path) sv2.stop() # This test is based on the fact that the standard services start # right away and get to run once before sv.stop() returns. # We still sleep a bit to make the test robust. - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testStandardServicesWithoutGlobalStep(self): logdir = self._test_dir("standard_services_without_global_step") # Create a checkpoint. @@ -784,7 +784,7 @@ class SupervisorTest(test.TestCase): # There should be an event file with a version number. rr = _summary_iterator(logdir) ev = next(rr) - self.assertEquals("brain.Event:2", ev.file_version) + self.assertEqual("brain.Event:2", ev.file_version) ev = next(rr) ev_graph = graph_pb2.GraphDef() ev_graph.ParseFromString(ev.graph_def) @@ -802,7 +802,7 @@ class SupervisorTest(test.TestCase): self.assertProtoEquals("value { tag: 'v' simple_value: 1.0 }", ev.summary) ev = next(rr) - self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status) + self.assertEqual(event_pb2.SessionLog.STOP, ev.session_log.status) self.assertRaises(StopIteration, lambda: next(rr)) # There should be a checkpoint file with the variable "foo" @@ -814,7 +814,7 @@ class SupervisorTest(test.TestCase): # Same as testStandardServicesNoGlobalStep but with a global step. # We should get a summary about the step time. - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("train.Supervisor is for v1 only") def testStandardServicesWithGlobalStep(self): logdir = self._test_dir("standard_services_with_global_step") # Create a checkpoint. @@ -835,7 +835,7 @@ class SupervisorTest(test.TestCase): # There should be an event file with a version number. 
rr = _summary_iterator(logdir) ev = next(rr) - self.assertEquals("brain.Event:2", ev.file_version) + self.assertEqual("brain.Event:2", ev.file_version) ev = next(rr) ev_graph = graph_pb2.GraphDef() ev_graph.ParseFromString(ev.graph_def) @@ -849,8 +849,8 @@ class SupervisorTest(test.TestCase): ev = next(rr) # It is actually undeterministic whether SessionLog.START gets written # before the summary or the checkpoint, but this works when run 10000 times. - self.assertEquals(123, ev.step) - self.assertEquals(event_pb2.SessionLog.START, ev.session_log.status) + self.assertEqual(123, ev.step) + self.assertEqual(event_pb2.SessionLog.START, ev.session_log.status) first = next(rr) second = next(rr) # It is undeterministic whether the value gets written before the checkpoint @@ -858,17 +858,17 @@ class SupervisorTest(test.TestCase): if first.HasField("summary"): self.assertProtoEquals("""value { tag: 'global_step/sec' simple_value: 0.0 }""", first.summary) - self.assertEquals(123, second.step) - self.assertEquals(event_pb2.SessionLog.CHECKPOINT, - second.session_log.status) + self.assertEqual(123, second.step) + self.assertEqual(event_pb2.SessionLog.CHECKPOINT, + second.session_log.status) else: - self.assertEquals(123, first.step) - self.assertEquals(event_pb2.SessionLog.CHECKPOINT, - first.session_log.status) + self.assertEqual(123, first.step) + self.assertEqual(event_pb2.SessionLog.CHECKPOINT, + first.session_log.status) self.assertProtoEquals("""value { tag: 'global_step/sec' simple_value: 0.0 }""", second.summary) ev = next(rr) - self.assertEquals(event_pb2.SessionLog.STOP, ev.session_log.status) + self.assertEqual(event_pb2.SessionLog.STOP, ev.session_log.status) self.assertRaises(StopIteration, lambda: next(rr)) # There should be a checkpoint file with the variable "foo" with ops.Graph().as_default(), self.cached_session() as sess: From 9985d45d9562c0f7b47b76d0d2b3d2db93e7df97 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Mon, 1 Jun 2020 13:56:03 -0700 Subject: [PATCH 1490/1533] Adds type_spec_from_value, which can be used to get the TypeSpec for a Tensor or CompositeTensor value. PiperOrigin-RevId: 314197441 Change-Id: I20c7034fcae6e70bc994297b84f4737d1794d859 --- tensorflow/python/framework/type_spec.py | 20 ++++++++++++++++++- .../tools/api/golden/v1/tensorflow.pbtxt | 4 ++++ .../tools/api/golden/v2/tensorflow.pbtxt | 4 ++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/framework/type_spec.py b/tensorflow/python/framework/type_spec.py index e6e921e6184..4bf2ad791d7 100644 --- a/tensorflow/python/framework/type_spec.py +++ b/tensorflow/python/framework/type_spec.py @@ -503,11 +503,29 @@ class BatchableTypeSpec(TypeSpec): return tensor_list +@tf_export("type_spec_from_value") def type_spec_from_value(value): - """Returns a `TypeSpec` that represents the given `value`. + """Returns a `tf.TypeSpec` that represents the given `value`. + + Examples: + + >>> tf.type_spec_from_value(tf.constant([1, 2, 3])) + TensorSpec(shape=(3,), dtype=tf.int32, name=None) + >>> tf.type_spec_from_value(np.array([4.0, 5.0], np.float64)) + TensorSpec(shape=(2,), dtype=tf.float64, name=None) + >>> tf.type_spec_from_value(tf.ragged.constant([[1, 2], [3, 4, 5]])) + RaggedTensorSpec(TensorShape([2, None]), tf.int32, 1, tf.int64) + + >>> example_input = tf.ragged.constant([[1, 2], [3]]) + >>> @tf.function(input_signature=[tf.type_spec_from_value(example_input)]) + ... def f(x): + ... 
return tf.reduce_sum(x, axis=1) Args: value: A value that can be accepted or returned by TensorFlow APIs. + Accepted types for `value` include `tf.Tensor`, any value that can be + converted to `tf.Tensor` using `tf.convert_to_tensor`, and any subclass + of `CompositeTensor` (such as `tf.RaggedTensor`). Returns: A `TypeSpec` that is compatible with `value`. diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index 60ae8ef5be9..e6274357a49 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -2432,6 +2432,10 @@ tf_module { name: "tuple" argspec: "args=[\'tensors\', \'name\', \'control_inputs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "type_spec_from_value" + argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "unique" argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index 8928a9d3b67..468b4e36238 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -1108,6 +1108,10 @@ tf_module { name: "tuple" argspec: "args=[\'tensors\', \'control_inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "type_spec_from_value" + argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "unique" argspec: "args=[\'x\', \'out_idx\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " From 74bcf3c893d52ade234b0a9d98b71d2044b083d6 Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Mon, 1 Jun 2020 13:59:45 -0700 Subject: [PATCH 1491/1533] [XLA][MLIR] Add StaticMemRefCastOp to LHLO with lowering to LLVM. 
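A minimal sketch of the new op in use, lifted from the example in the op description added to lhlo_ops.td below; the wrapping function and its name are illustrative only:

```mlir
func @static_memref_cast_example(%buf : memref<1x5xf32>) {
  // Reinterpret a statically shaped buffer with a new offset, sizes and
  // strides; no data is copied, only the memref descriptor is rewritten.
  // The result is a rank-1 memref with shape [5], offset 2 and stride 1.
  %transformed = xla_lhlo.static_memref_cast %buf
      : memref<1x5xf32> -> memref<5xf32, offset: 2, strides: [1]>
  return
}
```

Both the operand and the result must have static shapes; the verifier added below rejects dynamic shapes on either side.
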
PiperOrigin-RevId: 314198093 Change-Id: Ib71f8d74743efb531330a0939473bc7f041679ca --- tensorflow/compiler/mlir/xla/BUILD | 22 +++++ tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc | 14 +++ tensorflow/compiler/mlir/xla/ir/lhlo_ops.h | 1 + tensorflow/compiler/mlir/xla/ir/lhlo_ops.td | 42 +++++++++ .../mlir/xla/tests/lhlo-legalize-to-llvm.mlir | 31 +++++++ .../compiler/mlir/xla/tests/lhlo_ops.mlir | 27 ++++++ .../xla/transforms/lhlo_legalize_to_llvm.cc | 87 +++++++++++++++++++ .../transforms/lhlo_legalize_to_llvm_pass.cc | 58 +++++++++++++ .../compiler/mlir/xla/transforms/rewriters.h | 10 +++ 9 files changed, 292 insertions(+) create mode 100644 tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir create mode 100644 tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc create mode 100644 tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index d2bae14c9c3..736651b5022 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -39,6 +39,7 @@ filegroup( "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", + "@llvm-project//mlir:include/mlir/Interfaces/ViewLikeInterface.td", ], ) @@ -256,6 +257,21 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "lhlo_legalize_to_llvm", + srcs = ["transforms/lhlo_legalize_to_llvm.cc"], + deps = [ + ":lhlo", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMTransforms", + "@llvm-project//mlir:StandardOps", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "xla_legalize_to_linalg", srcs = ["transforms/xla_legalize_to_linalg.cc"], @@ -491,6 +507,7 @@ cc_library( name = "xla_test_passes", srcs = [ "transforms/chlo_legalize_to_hlo_pass.cc", + "transforms/lhlo_legalize_to_llvm_pass.cc", "transforms/materialize_broadcasts_pass.cc", "transforms/test_infer_shaped_type_pass.cc", "transforms/unfuse_batch_norm_pass.cc", @@ -498,10 +515,14 @@ cc_library( deps = [ ":chlo_legalize_to_hlo", # build-cleaner: keep ":hlo", + ":lhlo", + ":lhlo_legalize_to_llvm", # build-cleaner: keep ":xla_materialize_broadcasts", # build-cleaner: keep ":xla_unfuse_batch_norm", # build-cleaner: keep "@llvm-project//mlir:IR", "@llvm-project//mlir:InferTypeOpInterface", + "@llvm-project//mlir:LLVMDialect", + "@llvm-project//mlir:LLVMTransforms", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", @@ -600,6 +621,7 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", + "@llvm-project//mlir:ViewLikeInterface", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc index 680a73e49c5..9e5fa4f6d9c 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc @@ -55,6 +55,20 @@ XlaLhloDialect::XlaLhloDialect(MLIRContext *context) >(); } +//===----------------------------------------------------------------------===// +// StaticMemRefCastOp +//===----------------------------------------------------------------------===// + +Value StaticMemRefCastOp::getViewSource() { return *getODSOperands(0).begin(); } + +static LogicalResult Verify(StaticMemRefCastOp op) { + if 
(!op.operand().getType().cast().hasStaticShape()) + return op.emitOpError("operand must have static shape"); + if (!op.getType().hasStaticShape()) + return op.emitOpError("result must have static shape"); + return success(); +} + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h index 1c4ccaae214..3827e8a7a4e 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "mlir/Interfaces/ViewLikeInterface.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index 9a2168d3088..f3b5d2d1b8a 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -20,6 +20,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/Interfaces/ViewLikeInterface.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def LHLO_Dialect : Dialect { @@ -262,6 +263,47 @@ def HLO_DynamicUpdateSliceOp: LHLO_Op<"dynamic-update-slice", []> { ); } +//===----------------------------------------------------------------------===// +// StaticMemRefCastOp +//===----------------------------------------------------------------------===// + +def HLO_StaticMemRefCastOp: Op]> { + let summary = "static memref cast operation"; + let description = [{ + Allows to modify the offset, sizes and strides of a statically shaped memref. + + Example: + ```mlir + %buf_transformed = + xla_lhlo.static_memref_cast %buf + : memref<1x5xf32> -> memref<5xf32, offset: 2, strides: [1]> + + // The result of the op is a rank-1 memref with `[5]` shape, stride 1 and + // offset 2. + ``` + }]; + + let arguments = (ins Arg:$operand); + let results = (outs Res:$result); + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, MemRefType resultType, " # + "Value operand", [{ + result.addOperands(operand); + result.types.push_back(resultType); + }]>]; + + let extraClassDeclaration = [{ + MemRefType getType() { return getResult().getType().cast(); } + }]; + + let verifier = [{ return Verify(*this); }]; + let assemblyFormat = [{ + $operand attr-dict `:` type($operand) `->` type($result) + }]; +} + //===----------------------------------------------------------------------===// // XLA Other op definitions. 
//===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir new file mode 100644 index 00000000000..0202f39afb1 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir @@ -0,0 +1,31 @@ +// RUN: xla-opt %s --test-lhlo-legalize-to-llvm -split-input-file | FileCheck %s --dump-input-on-failure + +// CHECK-LABEL: func @static_memref_cast +func @static_memref_cast(%buf : memref<10x1x5xf32>) { + %0 = xla_lhlo.static_memref_cast %buf + : memref<10x1x5xf32> -> memref<10x5xf32, offset: 2, strides: [5, 1]> + return +} +// CHECK: %[[INPUT_MEMREF_BLDR:.*]] = llvm.mlir.undef : [[DESCRIPTOR_TYPE_3D:!.*]] +// CHECK: llvm.insertvalue +// CHECK: %[[MEMREF_BLDR_0:.*]] = llvm.mlir.undef : [[DESCRIPTOR_TYPE_2D:!.*]] + +// CHECK: %[[IN_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF:.*]][0] : [[DESCRIPTOR_TYPE_3D]] +// CHECK: %[[PTR:.*]] = llvm.bitcast %[[IN_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[MEMREF_BLDR_1:.*]] = llvm.insertvalue %[[PTR]], %[[MEMREF_BLDR_0]][0] : [[DESCRIPTOR_TYPE_2D]] + +// CHECK: %[[IN_ALIGNED_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][1] : [[DESCRIPTOR_TYPE_3D]] +// CHECK: %[[ALIGNED_PTR:.*]] = llvm.bitcast %[[IN_ALIGNED_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[MEMREF_BLDR_2:.*]] = llvm.insertvalue %[[ALIGNED_PTR]], %[[MEMREF_BLDR_1]][1] : [[DESCRIPTOR_TYPE_2D]] + +// CHECK: %[[C2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_3:.*]] = llvm.insertvalue %[[C2]], %[[MEMREF_BLDR_2]][2] : [[DESCRIPTOR_TYPE_2D]] + +// CHECK: %[[C10:.*]] = llvm.mlir.constant(10 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_4:.*]] = llvm.insertvalue %[[C10]], %[[MEMREF_BLDR_3]][3, 0] : [[DESCRIPTOR_TYPE_2D]] +// CHECK: %[[C5:.*]] = llvm.mlir.constant(5 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_5:.*]] = llvm.insertvalue %[[C5]], %[[MEMREF_BLDR_4]][4, 0] : [[DESCRIPTOR_TYPE_2D]] +// CHECK: %[[C5_:.*]] = llvm.mlir.constant(5 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_6:.*]] = llvm.insertvalue %[[C5_]], %[[MEMREF_BLDR_5]][3, 1] : [[DESCRIPTOR_TYPE_2D]] +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: %[[MEMREF_BLDR_7:.*]] = llvm.insertvalue %[[C1]], %[[MEMREF_BLDR_6]][4, 1] : [[DESCRIPTOR_TYPE_2D]] diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir index d4d775731c8..cdae187b3b6 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -199,3 +199,30 @@ func @case_memref(%index: memref, %operand_1: memref, %operand_2: memr ) : (memref, memref, memref, memref, memref) -> () return } + +// ----- + +func @static_memref_cast(%in: memref<10x1xf32>) { + %out = xla_lhlo.static_memref_cast %in + : memref<10x1xf32> -> memref<10xf32, offset: 0, strides: [1]> + return +} +// CHECK-LABEL: func @static_memref_cast + +// ----- + +func @static_memref_cast_dynamic_operand(%in: memref<10x?xf32>) { + // expected-error @+1 {{operand must have static shape}} + %out = xla_lhlo.static_memref_cast %in + : memref<10x?xf32> -> memref<10x1xf32, offset: 0, strides: [10, 1]> + return +} + +// ----- + +func @static_memref_cast_dynamic_result(%in: memref<10x1xf32>) { + // expected-error @+1 {{result must have static shape}} + %out = xla_lhlo.static_memref_cast %in + : memref<10x1xf32> -> memref<10x?xf32, offset: 0, strides: [?, ?]> + return 
+} diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc new file mode 100644 index 00000000000..083365c0e8a --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -0,0 +1,87 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla_lhlo { +namespace { + +struct StaticMemRefCastOpConverter + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + auto cast_op = cast(op); + + StaticMemRefCastOpOperandAdaptor operands_adaptor(operands); + MemRefDescriptor sourceMemRef(operands_adaptor.operand()); + + MemRefType targetMemRefType = + cast_op.getResult().getType().cast(); + auto llvmTargetDescriptorTy = typeConverter.convertType(targetMemRefType) + .dyn_cast_or_null(); + if (!llvmTargetDescriptorTy || !llvmTargetDescriptorTy.isStructTy()) + return failure(); + // Create descriptor. + auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); + Type llvmTargetElementTy = desc.getElementType(); + // Set allocated ptr. + Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); + allocated = + rewriter.create(loc, llvmTargetElementTy, allocated); + desc.setAllocatedPtr(rewriter, loc, allocated); + // Set aligned ptr. + Value ptr = sourceMemRef.alignedPtr(rewriter, loc); + ptr = rewriter.create(loc, llvmTargetElementTy, ptr); + desc.setAlignedPtr(rewriter, loc, ptr); + + // Fill size and stride descriptors in memref. + auto target_sizes = targetMemRefType.getShape(); + int64_t target_offset; + llvm::SmallVector target_strides; + if (failed((getStridesAndOffset(targetMemRefType, target_strides, + target_offset)))) + return failure(); + + // Copy offset of `targetMemRef`. 
+ desc.setConstantOffset(rewriter, loc, target_offset); + for (int i = 0, e = targetMemRefType.getRank(); i < e; ++i) { + desc.setConstantSize(rewriter, loc, i, target_sizes[i]); + desc.setConstantStride(rewriter, loc, i, target_strides[i]); + } + rewriter.replaceOp(op, {desc}); + return success(); + } +}; + +} // namespace + +void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, + OwningRewritePatternList *patterns) { + patterns->insert(*converter); +} + +} // namespace xla_lhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc new file mode 100644 index 00000000000..9b809049290 --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm_pass.cc @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" // from @llvm-project +#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" + +namespace mlir { +namespace xla_lhlo { +namespace { + +class TestLhloToLLVMPass + : public ::mlir::PassWrapper> { + public: + void runOnOperation() override { + ModuleOp m = getOperation(); + + OwningRewritePatternList patterns; + LLVMTypeConverter converter(m.getContext()); + populateStdToLLVMConversionPatterns(converter, patterns); + PopulateLhloToLLVMConversionPatterns(&converter, &patterns); + + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addLegalOp(); + target.addIllegalDialect(); + + if (failed(applyFullConversion(m, target, patterns, &converter))) { + signalPassFailure(); + } + } +}; + +} // namespace + +static PassRegistration legalize_lhlo_pass( + "test-lhlo-legalize-to-llvm", "Legalize from LHLO dialect to LLVM."); + +} // namespace xla_lhlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/transforms/rewriters.h b/tensorflow/compiler/mlir/xla/transforms/rewriters.h index 3e86820d7be..59347198fe4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/xla/transforms/rewriters.h @@ -23,6 +23,8 @@ limitations under the License. 
#include "mlir/Transforms/DialectConversion.h" // from @llvm-project namespace mlir { +class LLVMTypeConverter; +class OwningRewritePatternList; class BufferAssignmentPlacer; namespace xla_hlo { @@ -72,6 +74,14 @@ void PopulateUnfuseBatchNormPatterns(MLIRContext *context, } // namespace xla_hlo +namespace xla_lhlo { + +/// Collect a set of patterns to convert from the LHLO dialect to LLVM. +void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, + OwningRewritePatternList *patterns); + +} // namespace xla_lhlo + namespace xla_chlo { // Populates a collection of conversion patterns for legalizing client-HLO to From be58eeb5b44be02e9cdf60b9dfe8b7c1b5312dd1 Mon Sep 17 00:00:00 2001 From: Michael Banfield Date: Mon, 1 Jun 2020 14:17:51 -0700 Subject: [PATCH 1492/1533] Use HTTP 1.1 for CurlHttpRequest. PiperOrigin-RevId: 314201870 Change-Id: I056e556b9b80d406f600a25fb73801960c89b400 --- tensorflow/core/platform/cloud/curl_http_request.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/core/platform/cloud/curl_http_request.cc b/tensorflow/core/platform/cloud/curl_http_request.cc index a227edb1fb0..6b91d615f6f 100644 --- a/tensorflow/core/platform/cloud/curl_http_request.cc +++ b/tensorflow/core/platform/cloud/curl_http_request.cc @@ -145,6 +145,8 @@ CurlHttpRequest::CurlHttpRequest(LibCurl* libcurl, Env* env) CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_NOSIGNAL, 1L)); // TODO(b/74351157): Enable HTTP/2. + CHECK_CURL_OK(libcurl_->curl_easy_setopt(curl_, CURLOPT_HTTP_VERSION, + CURL_HTTP_VERSION_1_1)); // Set up the progress meter. CHECK_CURL_OK( From a5d05b441a395377c46d6f86832a24622c6d0ad8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 14:18:29 -0700 Subject: [PATCH 1493/1533] Don't call GetFactory() twice. PiperOrigin-RevId: 314201989 Change-Id: I34d89ff66b927deae9740022a0e83ce8a97ebc72 --- tensorflow/core/common_runtime/session.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tensorflow/core/common_runtime/session.cc b/tensorflow/core/common_runtime/session.cc index 3817736020e..05c8432d4fe 100644 --- a/tensorflow/core/common_runtime/session.cc +++ b/tensorflow/core/common_runtime/session.cc @@ -60,18 +60,12 @@ Status Session::PRun(const string& handle, } Session* NewSession(const SessionOptions& options) { - SessionFactory* factory; - Status s = SessionFactory::GetFactory(options, &factory); - if (!s.ok()) { - LOG(ERROR) << s; - return nullptr; - } // Starts exporting metrics through a platform-specific monitoring API (if // provided). For builds using "tensorflow/core/platform/default", this is // currently a no-op. session_created->GetCell()->Set(true); Session* out_session; - s = NewSession(options, &out_session); + Status s = NewSession(options, &out_session); if (!s.ok()) { LOG(ERROR) << "Failed to create session: " << s; return nullptr; From 6c84889208f06436399527b57866071053a0592b Mon Sep 17 00:00:00 2001 From: Alexander Belyaev Date: Mon, 1 Jun 2020 14:46:04 -0700 Subject: [PATCH 1494/1533] [XLA][MLIR] Add DynamicMemRefCastOp to LHLO with lowering to LLVM. 
PiperOrigin-RevId: 314207315 Change-Id: Idb80930464e9f1b87cbafccba9cde86e28afc093 --- tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc | 16 ++++++ tensorflow/compiler/mlir/xla/ir/lhlo_ops.td | 49 ++++++++++++++++++ .../mlir/xla/tests/lhlo-legalize-to-llvm.mlir | 34 +++++++++++++ .../compiler/mlir/xla/tests/lhlo_ops.mlir | 22 ++++++++ .../xla/transforms/lhlo_legalize_to_llvm.cc | 51 ++++++++++++++++++- 5 files changed, 171 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc index 9e5fa4f6d9c..6f9b39377af 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc @@ -69,6 +69,22 @@ static LogicalResult Verify(StaticMemRefCastOp op) { return success(); } +//===----------------------------------------------------------------------===// +// DynamicMemRefCastOp +//===----------------------------------------------------------------------===// + +Value DynamicMemRefCastOp::getViewSource() { + return *getODSOperands(0).begin(); +} + +static LogicalResult Verify(DynamicMemRefCastOp op) { + // Check if `sizes` and `strides` args are compatible with the result type. + if (op.sizes().size() != op.getType().getRank()) + return op.emitOpError( + "`sizes` args count must be equal to the rank of the output memref"); + return success(); +} + #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.cc.inc" diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index f3b5d2d1b8a..d9f3648bb09 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -304,6 +304,55 @@ def HLO_StaticMemRefCastOp: Op]> { + let summary = "dynamic memref cast operation"; + let description = [{ + Change sizes and strides of a memref using the values computed in runtime. + + Example: + ```mlir + %buf_transformed = + xla_lhlo.dynamic_memref_cast %buf(%size_X, %size_Y)[%step_X, %step_Y] + : memref -> memref + // The result of the op is a type-erased memref with `[%size_X, %size_Y]` + // shape and `[%step_X, %step_Y]` strides. The offset will be inherited + // from the input. + ``` + }]; + + let arguments = (ins + Arg:$operand, + Variadic:$sizes, + Variadic:$strides + ); + let results = (outs Res:$result); + + let builders = [OpBuilder< + "OpBuilder &builder, OperationState &result, MemRefType resultType, " # + "Value operand, ValueRange sizes, ValueRange strides", [{ + result.addOperands(operand); + result.addOperands(sizes); + result.addOperands(strides); + result.types.push_back(resultType); + }]>]; + + let extraClassDeclaration = [{ + MemRefType getType() { return getResult().getType().cast(); } + }]; + + let verifier = [{ return Verify(*this); }]; + let assemblyFormat = [{ + $operand `(` $sizes `)` `[` $strides `]` attr-dict `:` type($operand) `->` + type($result) + }]; +} + //===----------------------------------------------------------------------===// // XLA Other op definitions. 
//===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir index 0202f39afb1..16aad8f7cf3 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-llvm.mlir @@ -29,3 +29,37 @@ func @static_memref_cast(%buf : memref<10x1x5xf32>) { // CHECK: %[[MEMREF_BLDR_6:.*]] = llvm.insertvalue %[[C5_]], %[[MEMREF_BLDR_5]][3, 1] : [[DESCRIPTOR_TYPE_2D]] // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 // CHECK: %[[MEMREF_BLDR_7:.*]] = llvm.insertvalue %[[C1]], %[[MEMREF_BLDR_6]][4, 1] : [[DESCRIPTOR_TYPE_2D]] + +// ----- + +// CHECK-LABEL: func @dynamic_memref_cast +func @dynamic_memref_cast(%buf : memref) { + %size_X = constant 10 : index + %size_Y = constant 50 : index + %stride_X = constant 1 : index + %stride_Y = constant 0 : index + %0 = xla_lhlo.dynamic_memref_cast %buf(%size_X, %size_Y)[%stride_X, %stride_Y] + : memref -> memref + return +} +// CHECK: %[[C10:.*]] = llvm.mlir.constant(10 : index) : !llvm.i64 +// CHECK: %[[C50:.*]] = llvm.mlir.constant(50 : index) : !llvm.i64 +// CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: %[[C0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + +// CHECK: %[[MEMREF_BLDR_0:.*]] = llvm.mlir.undef : [[DESCRIPTOR_TYPE:!.*]] + +// CHECK: %[[IN_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF:.*]][0] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[PTR:.*]] = llvm.bitcast %[[IN_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[MEMREF_BLDR_1:.*]] = llvm.insertvalue %[[PTR]], %[[MEMREF_BLDR_0]][0] : [[DESCRIPTOR_TYPE]] + +// CHECK: %[[IN_ALIGNED_PTR:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][1] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[ALIGNED_PTR:.*]] = llvm.bitcast %[[IN_ALIGNED_PTR]] : !llvm<"float*"> to !llvm<"float*"> +// CHECK: %[[MEMREF_BLDR_2:.*]] = llvm.insertvalue %[[ALIGNED_PTR]], %[[MEMREF_BLDR_1]][1] : [[DESCRIPTOR_TYPE]] + +// CHECK: %[[SRC_OFFSET:.*]] = llvm.extractvalue %[[INPUT_MEMREF]][2] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_3:.*]] = llvm.insertvalue %[[SRC_OFFSET]], %[[MEMREF_BLDR_2]][2] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_4:.*]] = llvm.insertvalue %[[C10]], %[[MEMREF_BLDR_3]][3, 0] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_5:.*]] = llvm.insertvalue %[[C1]], %[[MEMREF_BLDR_4]][4, 0] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_6:.*]] = llvm.insertvalue %[[C50]], %[[MEMREF_BLDR_5]][3, 1] : [[DESCRIPTOR_TYPE]] +// CHECK: %[[MEMREF_BLDR_7:.*]] = llvm.insertvalue %[[C0]], %[[MEMREF_BLDR_6]][4, 1] : [[DESCRIPTOR_TYPE]] diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir index cdae187b3b6..1a44428d2e9 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo_ops.mlir @@ -226,3 +226,25 @@ func @static_memref_cast_dynamic_result(%in: memref<10x1xf32>) { : memref<10x1xf32> -> memref<10x?xf32, offset: 0, strides: [?, ?]> return } + +// ----- + +func @dynamic_memref_cast(%in: memref) { + %size = constant 10 : index + %step = constant 1 : index + %out = xla_lhlo.dynamic_memref_cast %in(%size)[%step] + : memref -> memref + return +} +// CHECK-LABEL: func @dynamic_memref_cast + +// ----- + +func @dynamic_memref_cast_incompatible_result_type(%in: memref) { + // expected-error @+3 {{`sizes` args count must be equal to the rank of the output memref}} + %size = constant 
10 : index + %step = constant 1 : index + %out = xla_lhlo.dynamic_memref_cast %in(%size)[%step] + : memref -> memref + return +} diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc index 083365c0e8a..385e0859906 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_llvm.cc @@ -76,11 +76,60 @@ struct StaticMemRefCastOpConverter } }; +struct DynamicMemRefCastOpConverter + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto loc = op->getLoc(); + auto cast_op = cast(op); + + DynamicMemRefCastOpOperandAdaptor operands_adaptor(operands); + MemRefDescriptor sourceMemRef(operands_adaptor.operand()); + + MemRefType targetMemRefType = + cast_op.getResult().getType().cast(); + auto llvmTargetDescriptorTy = typeConverter.convertType(targetMemRefType) + .dyn_cast_or_null(); + if (!llvmTargetDescriptorTy || !llvmTargetDescriptorTy.isStructTy()) + return failure(); + // Create descriptor. + auto desc = MemRefDescriptor::undef(rewriter, loc, llvmTargetDescriptorTy); + Type llvmTargetElementTy = desc.getElementType(); + // Set allocated ptr. + Value allocated = sourceMemRef.allocatedPtr(rewriter, loc); + allocated = + rewriter.create(loc, llvmTargetElementTy, allocated); + desc.setAllocatedPtr(rewriter, loc, allocated); + // Set aligned ptr. + Value ptr = sourceMemRef.alignedPtr(rewriter, loc); + ptr = rewriter.create(loc, llvmTargetElementTy, ptr); + desc.setAlignedPtr(rewriter, loc, ptr); + // Copy offset of `sourceMemRef`. + desc.setOffset(rewriter, loc, sourceMemRef.offset(rewriter, loc)); + + // Fill size and stride descriptors in memref. + if (!cast_op.sizes().empty()) { + auto sizes = operands_adaptor.sizes(); + auto strides = operands_adaptor.strides(); + for (int i = 0, e = targetMemRefType.getRank(); i < e; ++i) { + desc.setSize(rewriter, loc, i, sizes[i]); + desc.setStride(rewriter, loc, i, strides[i]); + } + } + rewriter.replaceOp(op, {desc}); + return success(); + } +}; + } // namespace void PopulateLhloToLLVMConversionPatterns(LLVMTypeConverter *converter, OwningRewritePatternList *patterns) { - patterns->insert(*converter); + patterns->insert( + *converter); } } // namespace xla_lhlo From c98e2360600d65dfb1f9af16416b69819df68355 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 1 Jun 2020 14:53:54 -0700 Subject: [PATCH 1495/1533] Allow Cross-Program Prefetching the result of a Bitcast PiperOrigin-RevId: 314208916 Change-Id: Ie3882052dd9a48bafa67c72c81ec55a6016cf2d4 --- .../xla/service/memory_space_assignment.cc | 18 ++++++-- .../service/memory_space_assignment_test.cc | 46 +++++++++++++++++++ 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index e07431bf46f..e4ee79e9f4c 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -1913,6 +1913,8 @@ bool LooksLikeAnActivation(const HloInstruction* inst) { } } break; + case HloOpcode::kBitcast: + return LooksLikeAnActivation(user); default: return true; } @@ -1929,10 +1931,20 @@ bool IsCrossProgramPrefetchCandidate( !value.uses().empty() && options.size_fn(value) <= options.max_size_in_bytes && absl::c_all_of(value.uses(), [&](const HloUse& use) { - const HloInstruction* gte = + const HloInstruction* inst = use.instruction->operand(use.operand_number); - return gte->opcode() == HloOpcode::kGetTupleElement && - !LooksLikeAnActivation(gte); + + // Skip the LooksLikeAnActivation test since we're testing the + // parent GTE and its children below. + if (inst->opcode() == HloOpcode::kBitcast && + inst->operand(0)->opcode() == HloOpcode::kGetTupleElement && + inst->operand(0)->operand(0)->opcode() == + HloOpcode::kParameter) { + return true; + } + + return inst->opcode() == HloOpcode::kGetTupleElement && + !LooksLikeAnActivation(inst); }); } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index 0a76dd5f31c..23b311730f8 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -3790,6 +3790,52 @@ TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchTest) { } } +TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchBitcastTest) { + HloComputation::Builder builder(TestName()); + + constexpr int kBatch = 8; + constexpr int kFeature = 8; + constexpr int kOutput = 2; + + auto lhs_shape = ShapeUtil::MakeShape(F32, {kBatch, kFeature}); + auto rhs_shape = ShapeUtil::MakeShape(F32, {kOutput, kFeature}); + auto bitcast_shape = ShapeUtil::MakeShape(F32, {kFeature, kOutput}); + auto result_shape = ShapeUtil::MakeShape(F32, {kBatch, kOutput}); + auto tuple_shape = ShapeUtil::MakeTupleShape({lhs_shape, rhs_shape}); + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "p0")); + + auto lhs = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs_shape, param, 0)); + auto rhs = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs_shape, param, 1)); + + auto bitcast = + builder.AddInstruction(HloInstruction::CreateBitcast(bitcast_shape, rhs)); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto dot = builder.AddInstruction(HloInstruction::CreateDot( + result_shape, lhs, bitcast, dot_dnums, DefaultPrecisionConfig(2))); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {param, lhs, rhs, bitcast, dot}); + 
TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + auto cross_program_prefetches = module->CrossProgramPrefetches(); + EXPECT_EQ(cross_program_prefetches.size(), 1); + if (!cross_program_prefetches.empty()) { + EXPECT_EQ(cross_program_prefetches[0].first, 0); + EXPECT_EQ(cross_program_prefetches[0].second, ShapeIndex({1})); + } +} + TEST_P(MemorySpaceAssignmentTest, CrossProgramPrefetchNestedTupleTest) { HloComputation::Builder builder(TestName()); From 926090c4d4c2624273198dd70231017c5dc74da7 Mon Sep 17 00:00:00 2001 From: Alexandre Passos Date: Mon, 1 Jun 2020 15:17:02 -0700 Subject: [PATCH 1496/1533] More tolerant control flow grad code. It currently fails sometimes inside tf.function PiperOrigin-RevId: 314213428 Change-Id: I0b0d1ae88401016d3b617cf843192708bda3e695 --- tensorflow/python/ops/control_flow_grad.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/ops/control_flow_grad.py b/tensorflow/python/ops/control_flow_grad.py index 6a551deb5ba..ce426ea344b 100644 --- a/tensorflow/python/ops/control_flow_grad.py +++ b/tensorflow/python/ops/control_flow_grad.py @@ -210,6 +210,8 @@ def _EnterGrad(op, grad): # pylint: disable=protected-access grad_ctxt = graph._get_control_flow_context() # pylint: enable=protected-access + if grad_ctxt is None: + return grad if not grad_ctxt.back_prop: # Skip gradient computation, if the attribute `back_prop` is false. return grad From 625da41a12fb458c3b49a02212226f24c3fe88e5 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Mon, 1 Jun 2020 15:39:04 -0700 Subject: [PATCH 1497/1533] Added access type to gpu_objects/arguments. PiperOrigin-RevId: 314217440 Change-Id: I5f65b6d51da540686ed9e76e72b8cd9b479ad813 --- tensorflow/lite/delegates/gpu/cl/BUILD | 1 + tensorflow/lite/delegates/gpu/cl/arguments.cc | 48 +++++++++++++------ tensorflow/lite/delegates/gpu/cl/arguments.h | 5 +- tensorflow/lite/delegates/gpu/cl/gpu_object.h | 13 ++++- .../delegates/gpu/cl/kernels/transpose.cc | 3 +- .../lite/delegates/gpu/cl/kernels/winograd.cc | 3 +- .../lite/delegates/gpu/cl/linear_storage.cc | 8 +++- .../lite/delegates/gpu/cl/linear_storage.h | 4 +- tensorflow/lite/delegates/gpu/cl/tensor.cc | 8 +++- tensorflow/lite/delegates/gpu/cl/tensor.h | 2 +- .../lite/delegates/gpu/cl/tensor_type.cc | 21 ++++++-- .../lite/delegates/gpu/cl/tensor_type.h | 2 +- 12 files changed, 86 insertions(+), 32 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 463fb0aecc5..7ace4ebf6a9 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -329,6 +329,7 @@ cc_library( hdrs = ["gpu_object.h"], deps = [ ":opencl_wrapper", + "//tensorflow/lite/delegates/gpu/common:access_type", "//tensorflow/lite/delegates/gpu/common:data_type", "//tensorflow/lite/delegates/gpu/common:status", ], diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.cc b/tensorflow/lite/delegates/gpu/cl/arguments.cc index 908780d6caf..202482170f8 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.cc +++ b/tensorflow/lite/delegates/gpu/cl/arguments.cc @@ -94,6 +94,17 @@ void AppendArgument(const std::string& arg, std::string* args) { absl::StrAppend(args, arg); } +std::string GetImageModifier(AccessType access) { + switch (access) { + case AccessType::READ: + return "__read_only"; + case AccessType::WRITE: + return "__write_only"; + case AccessType::READ_WRITE: + return "__read_write"; + } +} + } // namespace Arguments::Arguments(Arguments&& args) @@ -155,13 +166,14 @@ 
void Arguments::AddImageBuffer(const std::string& name, image_buffers_[name] = desc; } -void Arguments::AddObjectRef(const std::string& name, +void Arguments::AddObjectRef(const std::string& name, AccessType access_type, GPUObjectDescriptorPtr&& descriptor_ptr) { - object_refs_[name] = {AccessType::READ, std::move(descriptor_ptr)}; + object_refs_[name] = {access_type, std::move(descriptor_ptr)}; } -void Arguments::AddObject(const std::string& name, GPUObjectPtr&& object) { - objects_[name] = {AccessType::READ, std::move(object)}; +void Arguments::AddObject(const std::string& name, AccessType access_type, + GPUObjectPtr&& object) { + objects_[name] = {access_type, std::move(object)}; } void Arguments::AddGPUResources(const std::string& name, @@ -273,7 +285,7 @@ absl::Status Arguments::SetObjectRef(const std::string& name, return absl::NotFoundError( absl::StrCat("No object ref with name - ", name)); } - return SetGPUResources(name, object->GetGPUResources()); + return SetGPUResources(name, object->GetGPUResources(it->second.access_type)); } absl::Status Arguments::SetGPUResources( @@ -320,18 +332,24 @@ std::string Arguments::GetListOfArgs() { &result); } for (auto& t : image_buffers_) { - AppendArgument(absl::StrCat("__read_only image1d_buffer_t ", t.first), + AppendArgument(absl::StrCat(GetImageModifier(t.second.access_type), + " image1d_buffer_t ", t.first), &result); } for (auto& t : images2d_) { - AppendArgument(absl::StrCat("__read_only image2d_t ", t.first), &result); + AppendArgument(absl::StrCat(GetImageModifier(t.second.access_type), + " image2d_t ", t.first), + &result); } for (auto& t : image2d_arrays_) { - AppendArgument(absl::StrCat("__read_only image2d_array_t ", t.first), + AppendArgument(absl::StrCat(GetImageModifier(t.second.access_type), + " image2d_array_t ", t.first), &result); } for (auto& t : images3d_) { - AppendArgument(absl::StrCat("__read_only image3d_t ", t.first), &result); + AppendArgument(absl::StrCat(GetImageModifier(t.second.access_type), + " image3d_t ", t.first), + &result); } for (int i = 0; i < shared_int4s_data_.size() / 4; ++i) { AppendArgument(absl::StrCat("int4 shared_int4_", i), &result); @@ -494,7 +512,7 @@ absl::Status Arguments::ResolveSelector(const std::string& object_name, absl::StrCat("No object with name - ", object_name)); } RETURN_IF_ERROR(desc_ptr->PerformSelector(selector, args, result)); - auto names = desc_ptr->GetGPUResources().GetNames(); + auto names = desc_ptr->GetGPUResources(access_type).GetNames(); ResolveObjectNames(object_name, names, result); return absl::OkStatus(); } @@ -551,12 +569,14 @@ absl::Status Arguments::ResolveSelectorsPass(std::string* code) { absl::Status Arguments::AddObjectArgs() { for (auto& t : objects_) { AddGPUResources(t.first, - t.second.obj_ptr->GetGPUDescriptor()->GetGPUResources()); - RETURN_IF_ERROR( - SetGPUResources(t.first, t.second.obj_ptr->GetGPUResources())); + t.second.obj_ptr->GetGPUDescriptor()->GetGPUResources( + t.second.access_type)); + RETURN_IF_ERROR(SetGPUResources( + t.first, t.second.obj_ptr->GetGPUResources(t.second.access_type))); } for (auto& t : object_refs_) { - AddGPUResources(t.first, t.second.descriptor->GetGPUResources()); + AddGPUResources(t.first, + t.second.descriptor->GetGPUResources(t.second.access_type)); } return absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/arguments.h b/tensorflow/lite/delegates/gpu/cl/arguments.h index bad308d9da5..db711bcb1d3 100644 --- a/tensorflow/lite/delegates/gpu/cl/arguments.h +++ 
b/tensorflow/lite/delegates/gpu/cl/arguments.h @@ -45,9 +45,10 @@ class Arguments { void AddImageBuffer(const std::string& name, const GPUImageBufferDescriptor& desc); - void AddObjectRef(const std::string& name, + void AddObjectRef(const std::string& name, AccessType access_type, GPUObjectDescriptorPtr&& descriptor_ptr); - void AddObject(const std::string& name, GPUObjectPtr&& object); + void AddObject(const std::string& name, AccessType access_type, + GPUObjectPtr&& object); absl::Status SetInt(const std::string& name, int value); absl::Status SetFloat(const std::string& name, float value); diff --git a/tensorflow/lite/delegates/gpu/cl/gpu_object.h b/tensorflow/lite/delegates/gpu/cl/gpu_object.h index 43eba5b1481..b936c1b01ee 100644 --- a/tensorflow/lite/delegates/gpu/cl/gpu_object.h +++ b/tensorflow/lite/delegates/gpu/cl/gpu_object.h @@ -22,6 +22,7 @@ limitations under the License. #include #include "tensorflow/lite/delegates/gpu/cl/opencl_wrapper.h" +#include "tensorflow/lite/delegates/gpu/common/access_type.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" @@ -31,26 +32,31 @@ namespace cl { struct GPUImage2DDescriptor { DataType data_type; + AccessType access_type; cl_mem memory; }; struct GPUImage3DDescriptor { DataType data_type; + AccessType access_type; cl_mem memory; }; struct GPUImage2DArrayDescriptor { DataType data_type; + AccessType access_type; cl_mem memory; }; struct GPUImageBufferDescriptor { DataType data_type; + AccessType access_type; cl_mem memory; }; struct GPUBufferDescriptor { DataType data_type; + AccessType access_type; int element_size; cl_mem memory; }; @@ -123,7 +129,9 @@ class GPUObjectDescriptor { *result = ""; return absl::OkStatus(); } - virtual GPUResources GetGPUResources() const { return GPUResources(); } + virtual GPUResources GetGPUResources(AccessType access_type) const { + return GPUResources(); + } protected: mutable std::map state_vars_; @@ -141,7 +149,8 @@ class GPUObject { GPUObject& operator=(const GPUObject&) = delete; virtual ~GPUObject() = default; virtual const GPUObjectDescriptor* GetGPUDescriptor() const = 0; - virtual GPUResourcesWithValue GetGPUResources() const = 0; + virtual GPUResourcesWithValue GetGPUResources( + AccessType access_type) const = 0; }; using GPUObjectPtr = std::unique_ptr; diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc index 03c6e22924e..4c46a88d3c4 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/transpose.cc @@ -37,7 +37,8 @@ std::string GetTransposeCode( op_def.dst_tensors[0]); args->AddObjectRef( - "src_tensor", absl::make_unique(op_def.src_tensors[0])); + "src_tensor", AccessType::READ, + absl::make_unique(op_def.src_tensors[0])); args->AddInt("dst_width"); args->AddInt("dst_height"); args->AddInt("dst_slices"); diff --git a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc index 82688931755..eeb95ebaff7 100644 --- a/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc +++ b/tensorflow/lite/delegates/gpu/cl/kernels/winograd.cc @@ -425,7 +425,8 @@ absl::Status Winograd4x4To36::UploadBt(CLContext* context) { LinearStorage lt; RETURN_IF_ERROR(CreateLinearStorage(create_info, bt_aligned, context, <)); - args_.AddObject("bt", absl::make_unique(std::move(lt))); + args_.AddObject("bt", AccessType::READ, + absl::make_unique(std::move(lt))); return 
absl::OkStatus(); } diff --git a/tensorflow/lite/delegates/gpu/cl/linear_storage.cc b/tensorflow/lite/delegates/gpu/cl/linear_storage.cc index ecf0e087427..7edf83f57ff 100644 --- a/tensorflow/lite/delegates/gpu/cl/linear_storage.cc +++ b/tensorflow/lite/delegates/gpu/cl/linear_storage.cc @@ -23,17 +23,20 @@ namespace tflite { namespace gpu { namespace cl { -GPUResources TensorLinearDescriptor::GetGPUResources() const { +GPUResources TensorLinearDescriptor::GetGPUResources( + AccessType access_type) const { GPUResources resources; resources.ints.push_back("length"); if (storage_type == LinearStorageType::BUFFER) { GPUBufferDescriptor desc; desc.data_type = element_type; + desc.access_type = access_type; desc.element_size = 4; resources.buffers.push_back({"buffer", desc}); } else { GPUImage2DDescriptor desc; desc.data_type = element_type; + desc.access_type = access_type; resources.images2d.push_back({"tex2d", desc}); } return resources; @@ -123,7 +126,8 @@ std::string LinearStorage::GetDeclaration() const { } } -GPUResourcesWithValue LinearStorage::GetGPUResources() const { +GPUResourcesWithValue LinearStorage::GetGPUResources( + AccessType access_type) const { GPUResourcesWithValue resources; resources.ints.push_back({"length", depth_}); diff --git a/tensorflow/lite/delegates/gpu/cl/linear_storage.h b/tensorflow/lite/delegates/gpu/cl/linear_storage.h index a31094b4a47..83c41e2c833 100644 --- a/tensorflow/lite/delegates/gpu/cl/linear_storage.h +++ b/tensorflow/lite/delegates/gpu/cl/linear_storage.h @@ -59,7 +59,7 @@ struct TensorLinearDescriptor : public GPUObjectDescriptor { const std::vector& args, std::string* result) const override; - GPUResources GetGPUResources() const override; + GPUResources GetGPUResources(AccessType access_type) const override; absl::Status PerformReadSelector(const std::vector& args, std::string* result) const; }; @@ -94,7 +94,7 @@ class LinearStorage : public GPUObject { const GPUObjectDescriptor* GetGPUDescriptor() const override { return &desc_; } - GPUResourcesWithValue GetGPUResources() const override; + GPUResourcesWithValue GetGPUResources(AccessType access_type) const override; private: friend absl::Status CreateTextureLinearStorage(int size, DataType data_type, diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.cc b/tensorflow/lite/delegates/gpu/cl/tensor.cc index 90e247a10ec..053514056f0 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor.cc @@ -144,7 +144,7 @@ void Tensor::Release() { } } -GPUResourcesWithValue Tensor::GetGPUResources() const { +GPUResourcesWithValue Tensor::GetGPUResources(AccessType access_type) const { GPUResourcesWithValue resources; if (descriptor_.HasAxis(Axis::WIDTH)) { resources.ints.push_back({"width", Width()}); @@ -173,7 +173,11 @@ GPUResourcesWithValue Tensor::GetGPUResources() const { } else if (descriptor_.storage_type == TensorStorageType::TEXTURE_3D) { resources.images3d.push_back({"image3d", memory_}); } else if (descriptor_.storage_type == TensorStorageType::IMAGE_BUFFER) { - resources.image_buffers.push_back({"image_buffer", image_buffer_memory_}); + if (access_type == AccessType::READ) { + resources.image_buffers.push_back({"image_buffer", image_buffer_memory_}); + } else { + resources.buffers.push_back({"buffer", memory_}); + } } return resources; diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.h b/tensorflow/lite/delegates/gpu/cl/tensor.h index e80cd01a1f9..c1b0b14709f 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor.h +++ 
b/tensorflow/lite/delegates/gpu/cl/tensor.h @@ -61,7 +61,7 @@ class Tensor : public GPUObject { const GPUObjectDescriptor* GetGPUDescriptor() const override { return &descriptor_; } - GPUResourcesWithValue GetGPUResources() const override; + GPUResourcesWithValue GetGPUResources(AccessType access_type) const override; int Width() const { return shape_.w; } int Height() const { return shape_.h; } diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc index d1841b14350..47caa7fa123 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.cc +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.cc @@ -157,7 +157,7 @@ std::string ToString(TensorStorageType type) { } } -GPUResources TensorDescriptor::GetGPUResources() const { +GPUResources TensorDescriptor::GetGPUResources(AccessType access_type) const { GPUResources resources; if (HasAxis(Axis::WIDTH)) { resources.ints.push_back("width"); @@ -178,25 +178,38 @@ GPUResources TensorDescriptor::GetGPUResources() const { if (storage_type == TensorStorageType::BUFFER) { GPUBufferDescriptor desc; desc.data_type = data_type; + desc.access_type = access_type; desc.element_size = 4; resources.buffers.push_back({"buffer", desc}); } else if (storage_type == TensorStorageType::SINGLE_TEXTURE_2D || storage_type == TensorStorageType::TEXTURE_2D) { GPUImage2DDescriptor desc; desc.data_type = data_type; + desc.access_type = access_type; resources.images2d.push_back({"image2d", desc}); } else if (storage_type == TensorStorageType::TEXTURE_ARRAY) { GPUImage2DArrayDescriptor desc; desc.data_type = data_type; + desc.access_type = access_type; resources.image2d_arrays.push_back({"image2d_array", desc}); } else if (storage_type == TensorStorageType::TEXTURE_3D) { GPUImage3DDescriptor desc; desc.data_type = data_type; + desc.access_type = access_type; resources.images3d.push_back({"image3d", desc}); } else if (storage_type == TensorStorageType::IMAGE_BUFFER) { - GPUImageBufferDescriptor desc; - desc.data_type = data_type; - resources.image_buffers.push_back({"image_buffer", desc}); + if (access_type == AccessType::READ) { + GPUImageBufferDescriptor desc; + desc.data_type = data_type; + desc.access_type = access_type; + resources.image_buffers.push_back({"image_buffer", desc}); + } else { + GPUBufferDescriptor desc; + desc.data_type = data_type; + desc.access_type = access_type; + desc.element_size = 4; + resources.buffers.push_back({"buffer", desc}); + } } return resources; } diff --git a/tensorflow/lite/delegates/gpu/cl/tensor_type.h b/tensorflow/lite/delegates/gpu/cl/tensor_type.h index c00f8d191a8..42f4f9b98e5 100644 --- a/tensorflow/lite/delegates/gpu/cl/tensor_type.h +++ b/tensorflow/lite/delegates/gpu/cl/tensor_type.h @@ -67,7 +67,7 @@ struct TensorDescriptor : public GPUObjectDescriptor { const std::vector& args, std::string* result) const override; - GPUResources GetGPUResources() const override; + GPUResources GetGPUResources(AccessType access_type) const override; bool HasAxis(Axis axis) const; From e0b56ace777a2fa66089352c2ea0004e122f4867 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Mon, 1 Jun 2020 15:40:04 -0700 Subject: [PATCH 1498/1533] Removes the special case for bfloat16 in gradient_checker_v2, which causes an infinite loop and doesn't help with anything (if the function returns bfloat16, the precision has already been lost). 
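A minimal standalone sketch of the kind of check this change enables, using the public
`tf.test.compute_gradient` wrapper (the `tf_export("test.compute_gradient")` name shown in the
diff) rather than the internal test helpers; the delta and tolerance mirror the new bfloat16
test and are otherwise illustrative, not part of the patch:

    import numpy as np
    import tensorflow as tf

    # bfloat16 has very low precision, so the finite-difference step (delta)
    # and the acceptable error must both be coarse, as in the new test.
    x2 = tf.constant(3.0, dtype=tf.bfloat16)
    theoretical, numerical = tf.test.compute_gradient(
        lambda x1: tf.math.add(x1, x2),
        [tf.constant(2.0, dtype=tf.bfloat16)],
        delta=0.1)
    # With the bfloat16 special case removed, this no longer loops forever;
    # the theoretical and numerical Jacobians should agree within a coarse bound.
    max_error = np.max(np.abs(theoretical[0] - numerical[0]))
    assert max_error < 0.07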
PiperOrigin-RevId: 314217630 Change-Id: I24ab5decb81baab183099b93b1d8c9ed65bffed3 --- tensorflow/python/ops/gradient_checker_v2.py | 11 ++--------- tensorflow/python/ops/gradient_checker_v2_test.py | 9 +++++++++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/ops/gradient_checker_v2.py b/tensorflow/python/ops/gradient_checker_v2.py index 633b5e57d95..3ca0903c80c 100644 --- a/tensorflow/python/ops/gradient_checker_v2.py +++ b/tensorflow/python/ops/gradient_checker_v2.py @@ -28,7 +28,6 @@ from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops -from tensorflow.python.ops import math_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import tf_export @@ -217,14 +216,8 @@ def _compute_numeric_jacobian(f, y_size, y_dtype, xs, param, delta): and "x_size" columns where "x_size" is the number of elements in xs[param] and "y_size" is the number of elements in the result. """ - # bfloat16 doesn't have enough bits to represent high precision numbers such - # as delta. Convert to float32 here. Since numeric_jacobian is expected to - # be the groundtruth to compare against, it shouldn't lose any information. x_shape = xs[param].shape x_dtype = xs[param].dtype - if y_dtype == dtypes.bfloat16: - f = lambda *xs: math_ops.cast(f(*xs), dtypes.float32) - y_dtype = dtypes.float32 # To compute the jacobian, we treat x and y as one-dimensional vectors x_size = _product(x_shape) * (2 if x_dtype.is_complex else 1) @@ -292,10 +285,10 @@ def _compute_gradient_list(f, xs, delta): xs_shapes = [x.shape for x in xs] f_temp = _prepare(f, xs_dtypes, xs_shapes) y = f_temp(*xs) - return zip(*[ + return tuple(zip(*[ _compute_gradient(f, y.shape, dtypes.as_dtype(y.dtype), xs, i, delta) for i in range(len(xs)) - ]) + ])) @tf_export("test.compute_gradient", v1=[]) diff --git a/tensorflow/python/ops/gradient_checker_v2_test.py b/tensorflow/python/ops/gradient_checker_v2_test.py index b77c95d8968..d59228d78d1 100644 --- a/tensorflow/python/ops/gradient_checker_v2_test.py +++ b/tensorflow/python/ops/gradient_checker_v2_test.py @@ -97,6 +97,15 @@ class GradientCheckerTest(test.TestCase): tf_logging.info("x1 error = %f", error) self.assertLess(error, 1e-4) + def testBfloat16(self): + x1 = constant_op.constant(2.0, dtype="bfloat16") + x2 = constant_op.constant(3.0, dtype="bfloat16") + # bfloat16 is very imprecise, so we use very large delta and error bar here. + error = gradient_checker.max_error(*gradient_checker.compute_gradient( + lambda x1: math_ops.add(x1, x2), [x1], delta=0.1)) + tf_logging.info("x1 error = %f", error) + self.assertLess(error, 0.07) + def testAddCustomized(self): size = (2, 3) x1 = constant_op.constant(2.0, shape=size, dtype=dtypes.float64, name="x1") From 61a0c3bccd0bf236f469203cf6e641d466e26c61 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Mon, 1 Jun 2020 15:48:07 -0700 Subject: [PATCH 1499/1533] [tf.data] Fix performance modeling bug in `group_by_window`. The use of `::Reduce` suffix would result in iterator nodes created for iterating through the contents of a group not being properly attached to the parent iterator. 
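For context, a minimal `group_by_window` pipeline (illustrative values, not part of the change)
showing where the per-group iterators come from: each call to `reduce_func` produces a nested
dataset, and the iterator created for that dataset is the child node that now gets attached to
the parent `group_by_window` iterator in the performance model:

    import tensorflow as tf

    # Each window handed to reduce_func is itself a dataset; iterating over its
    # result creates the nested iterator that this fix models correctly.
    ds = tf.data.Dataset.range(10)
    ds = ds.apply(tf.data.experimental.group_by_window(
        key_func=lambda x: x % 2,                        # group elements by parity
        reduce_func=lambda key, window: window.batch(4),  # one nested dataset per group
        window_size=4))
    for batch in ds:
        print(batch.numpy())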
PiperOrigin-RevId: 314218939 Change-Id: Ic38851d0f9740fc3731bb93050c5e386368bdec4 --- .../group_by_window_dataset_op.cc | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc index a61ebd70141..0a6df24d40a 100644 --- a/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc @@ -200,15 +200,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { // We are currently processing a group, so try to get the // next element. bool end_of_group; - // TODO(b/154341936): Explicitly stopping and starting this iterator - // should not be necessary, but the `::Reduce` added to the prefix - // passed to `current_group_iterator_` when it was created prevents - // the model from identifying this iterator as the output of - // `current_group_iterator_`. - RecordStop(ctx); TF_RETURN_IF_ERROR(current_group_iterator_->GetNext( ctx, out_tensors, &end_of_group)); - RecordStart(ctx); if (!end_of_group) { // Produce the subelement as output. *end_of_sequence = false; @@ -360,7 +353,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { TF_RETURN_IF_ERROR(writer->WriteScalar( full_name("current_iterator_not_initialized"), "")); } - + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("group_counter"), + group_counter_ - 1)); return Status::OK(); } @@ -371,7 +365,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { if (reader->Contains(full_name("end_of_input"))) end_of_input_ = true; - // Restoring groups + // Restoring groups_ if (reader->Contains(full_name("groups_size"))) { int64 size; TF_RETURN_IF_ERROR( @@ -388,7 +382,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { } } - // Restoring Windows + // Restoring window_sizes_ if (reader->Contains(full_name("window_sizes_size"))) { int64 size; TF_RETURN_IF_ERROR( @@ -404,6 +398,10 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { } } + // Group counter needs to be restored before current group iterator. + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name("group_counter"), &group_counter_)); + if (reader->Contains(full_name("current_iterator_not_initialized"))) { current_group_iterator_.reset(); } else { @@ -493,11 +491,12 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel { // Create an iterator for the dataset that was returned by `f`. return returned_dataset->MakeIterator( - ctx, this, strings::StrCat(prefix(), "::Reduce"), + ctx, this, strings::StrCat(prefix(), "[", group_counter_++, "]"), ¤t_group_iterator_); } mutex mu_; + int64 group_counter_ TF_GUARDED_BY(mu_) = 0; std::unique_ptr input_impl_ TF_GUARDED_BY(mu_); // TODO(mrry): Optimize for dense key space if appropriate. 
bool end_of_input_ TF_GUARDED_BY(mu_) = false; From 5cf536f78dc497bb6071e3d58f1b0251eb049c3e Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 1 Jun 2020 15:51:27 -0700 Subject: [PATCH 1500/1533] Rollback PR #39577: CUDNN v8 support PiperOrigin-RevId: 314219496 Change-Id: I3e36453044e5b73274e05d84f8d7f2cdb5b144ae --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 74 - tensorflow/stream_executor/cuda/cudnn_8_0.inc | 3316 ----------------- tensorflow/stream_executor/cuda/cudnn_stub.cc | 12 +- third_party/gpus/cuda_configure.bzl | 24 +- third_party/gpus/find_cuda_config.py | 24 +- .../gpus/find_cuda_config.py.gz.base64 | 2 +- 6 files changed, 18 insertions(+), 3434 deletions(-) mode change 100644 => 100755 tensorflow/stream_executor/cuda/cuda_dnn.cc delete mode 100644 tensorflow/stream_executor/cuda/cudnn_8_0.inc diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc old mode 100644 new mode 100755 index 89a840c2a60..6122877f91f --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -1278,11 +1278,7 @@ port::Status CheckAndFetchProjectionWeights( cudnnRNNMode_t mode; cudnnRNNAlgo_t algo; cudnnDataType_t data_type; -#if CUDNN_VERSION >= 8000 - RETURN_IF_CUDNN_ERROR(cudnnGetRNNDescriptor_v6( -#else RETURN_IF_CUDNN_ERROR(cudnnGetRNNDescriptor( -#endif /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc, /*hiddenSize=*/&hidden_size_v, /*numLayers=*/&num_layers_v, @@ -2428,28 +2424,6 @@ port::StatusOr GetCudnnConvolutionForwardAlgo( const CudnnFilterDescriptor& filter, const CudnnConvolutionDescriptor& conv, const CudnnTensorDescriptor& output_nd, bool specify_workspace_limit, size_t memory_limit_bytes) { -#if CUDNN_VERSION >= 8000 - const int num_requested_algos = 5; - int num_returned_algos = 0; - cudnnConvolutionFwdAlgoPerf_t perf_results[num_requested_algos]; - - RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionForwardAlgorithm_v7( - cudnn.handle(), input_nd.handle(), filter.handle(), conv.handle(), - output_nd.handle(), num_requested_algos, &num_returned_algos, - perf_results)); - - size_t mem_limit = specify_workspace_limit ? memory_limit_bytes : 0ULL; - for (int r = 0; r < num_returned_algos; r++) { - if (perf_results[r].status == CUDNN_STATUS_SUCCESS && - perf_results[r].algo != CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED && - perf_results[r].memory <= mem_limit) { - return perf_results[r].algo; - } - } - return port::Status(port::error::INTERNAL, - "cudnnGetConvolutionForwardAlgorithm_v7 returned " - "no suitable algorithms. This could be a cudnn bug."); -#else cudnnConvolutionFwdPreference_t preference = specify_workspace_limit ? 
CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE; @@ -2458,7 +2432,6 @@ port::StatusOr GetCudnnConvolutionForwardAlgo( cudnn.handle(), input_nd.handle(), filter.handle(), conv.handle(), output_nd.handle(), preference, memory_limit_bytes, &algo_to_use)); return algo_to_use; -#endif } port::StatusOr @@ -2469,29 +2442,6 @@ GetCudnnConvolutionBackwardDataAlgo(const CudnnHandle& cudnn, const CudnnTensorDescriptor& output_nd, bool specify_workspace_limit, size_t memory_limit_bytes) { -#if CUDNN_VERSION >= 8000 - const int num_requested_algos = 5; - int num_returned_algos = 0; - cudnnConvolutionBwdDataAlgoPerf_t perf_results[num_requested_algos]; - - RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionBackwardDataAlgorithm_v7( - cudnn.handle(), filter.handle(), output_nd.handle(), conv.handle(), - input_nd.handle(), num_requested_algos, &num_returned_algos, - perf_results)); - - size_t mem_limit = specify_workspace_limit ? memory_limit_bytes : 0ULL; - for (int r = 0; r < num_returned_algos; r++) { - if (perf_results[r].status == CUDNN_STATUS_SUCCESS && - perf_results[r].algo != - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED && - perf_results[r].memory <= mem_limit) { - return perf_results[r].algo; - } - } - return port::Status(port::error::INTERNAL, - "cudnnGetConvolutionBackwardDataAlgorithm_v7 returned " - "no suitable algorithms. This could be a cudnn bug."); -#else cudnnConvolutionBwdDataPreference_t preference = specify_workspace_limit ? CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT @@ -2501,7 +2451,6 @@ GetCudnnConvolutionBackwardDataAlgo(const CudnnHandle& cudnn, cudnn.handle(), filter.handle(), output_nd.handle(), conv.handle(), input_nd.handle(), preference, memory_limit_bytes, &algo_to_use)); return algo_to_use; -#endif } port::StatusOr @@ -2512,28 +2461,6 @@ GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn, const CudnnTensorDescriptor& output_nd, bool specify_workspace_limit, size_t memory_limit_bytes) { -#if CUDNN_VERSION >= 8000 - const int num_requested_algos = 5; - int num_returned_algos = 0; - cudnnConvolutionBwdFilterAlgoPerf_t perf_results[num_requested_algos]; - - RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionBackwardFilterAlgorithm_v7( - cudnn.handle(), input_nd.handle(), output_nd.handle(), conv.handle(), - filter.handle(), num_requested_algos, &num_returned_algos, perf_results)); - - size_t mem_limit = specify_workspace_limit ? memory_limit_bytes : 0ULL; - for (int r = 0; r < num_returned_algos; r++) { - if (perf_results[r].status == CUDNN_STATUS_SUCCESS && - perf_results[r].algo != - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED && - perf_results[r].memory <= mem_limit) { - return perf_results[r].algo; - } - } - return port::Status(port::error::INTERNAL, - "cudnnGetConvolutionBackwardFilterAlgorithm_v7 returned " - "no suitable algorithms. This could be a cudnn bug."); -#else cudnnConvolutionBwdFilterPreference_t preference = specify_workspace_limit ? 
CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT @@ -2543,7 +2470,6 @@ GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn, cudnn.handle(), input_nd.handle(), output_nd.handle(), conv.handle(), filter.handle(), preference, memory_limit_bytes, &algo_to_use)); return algo_to_use; -#endif } port::StatusOr> AllocateCudnnConvolutionForwardWorkspace( diff --git a/tensorflow/stream_executor/cuda/cudnn_8_0.inc b/tensorflow/stream_executor/cuda/cudnn_8_0.inc deleted file mode 100644 index 9eca12e94f3..00000000000 --- a/tensorflow/stream_executor/cuda/cudnn_8_0.inc +++ /dev/null @@ -1,3316 +0,0 @@ -// Auto-generated, do not edit. - -extern "C" { -size_t CUDNNWINAPI cudnnGetVersion(void) { - using FuncPtr = size_t(CUDNNWINAPI *)(); - static auto func_ptr = LoadSymbol("cudnnGetVersion"); - if (!func_ptr) return 0; - return func_ptr(); -} - -size_t CUDNNWINAPI cudnnGetCudartVersion(void) { - using FuncPtr = size_t(CUDNNWINAPI *)(); - static auto func_ptr = LoadSymbol("cudnnGetCudartVersion"); - if (!func_ptr) return 0; - return func_ptr(); -} - -const char *CUDNNWINAPI cudnnGetErrorString(cudnnStatus_t status) { - using FuncPtr = const char *(CUDNNWINAPI *)(cudnnStatus_t); - static auto func_ptr = LoadSymbol("cudnnGetErrorString"); - if (!func_ptr) return "cudnnGetErrorString symbol not found."; - return func_ptr(status); -} - -cudnnStatus_t CUDNNWINAPI cudnnQueryRuntimeError(cudnnHandle_t handle, - cudnnStatus_t *rstatus, - cudnnErrQueryMode_t mode, - cudnnRuntimeTag_t *tag) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnStatus_t *, cudnnErrQueryMode_t, cudnnRuntimeTag_t *); - static auto func_ptr = LoadSymbol("cudnnQueryRuntimeError"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rstatus, mode, tag); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetProperty(libraryPropertyType type, - int *value) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(libraryPropertyType, int *); - static auto func_ptr = LoadSymbol("cudnnGetProperty"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(type, value); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreate(cudnnHandle_t *handle) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t *); - static auto func_ptr = LoadSymbol("cudnnCreate"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle); -} - -cudnnStatus_t CUDNNWINAPI cudnnDestroy(cudnnHandle_t handle) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t); - static auto func_ptr = LoadSymbol("cudnnDestroy"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetStream(cudnnHandle_t handle, - cudaStream_t streamId) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t); - static auto func_ptr = LoadSymbol("cudnnSetStream"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, streamId); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetStream(cudnnHandle_t handle, - cudaStream_t *streamId) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, cudaStream_t *); - static auto func_ptr = LoadSymbol("cudnnGetStream"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, streamId); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - 
return func_ptr(tensorDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptor( - cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, - cudnnDataType_t, int, int, int, int); - static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, format, dataType, n, c, h, w); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetTensor4dDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t dataType, /* image data type */ - int n, /* number of inputs (batch size) */ - int c, /* number of input feature maps */ - int h, /* height of input section */ - int w, /* width of input section */ - int nStride, int cStride, int hStride, int wStride) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnDataType_t, - int, int, int, int, int, int, int, int); - static auto func_ptr = LoadSymbol("cudnnSetTensor4dDescriptorEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, - wStride); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetTensor4dDescriptor( - const cudnnTensorDescriptor_t tensorDesc, - cudnnDataType_t *dataType, /* image data type */ - int *n, /* number of inputs (batch size) */ - int *c, /* number of input feature maps */ - int *h, /* height of input section */ - int *w, /* width of input section */ - int *nStride, int *cStride, int *hStride, int *wStride) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnTensorDescriptor_t, cudnnDataType_t *, int *, int *, int *, - int *, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetTensor4dDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, n, c, h, w, nStride, cStride, hStride, - wStride); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptor( - cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, int nbDims, - const int dimA[], const int strideA[]) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]); - static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, dataType, nbDims, dimA, strideA); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetTensorNdDescriptorEx( - cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, - cudnnDataType_t dataType, int nbDims, const int dimA[]) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, - cudnnDataType_t, int, const int[]); - static auto func_ptr = LoadSymbol("cudnnSetTensorNdDescriptorEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, format, dataType, nbDims, dimA); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetTensorNdDescriptor( - const cudnnTensorDescriptor_t tensorDesc, int nbDimsRequested, - cudnnDataType_t *dataType, int *nbDims, int dimA[], int strideA[]) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, int, - cudnnDataType_t *, int *, int[], int[]); - static auto func_ptr = LoadSymbol("cudnnGetTensorNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - 
return func_ptr(tensorDesc, nbDimsRequested, dataType, nbDims, dimA, strideA); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetTensorSizeInBytes( - const cudnnTensorDescriptor_t tensorDesc, size_t *size) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(const cudnnTensorDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetTensorSizeInBytes"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc, size); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(tensorDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnInitTransformDest( - const cudnnTensorTransformDescriptor_t transformDesc, - const cudnnTensorDescriptor_t srcDesc, cudnnTensorDescriptor_t destDesc, - size_t *destSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnTensorTransformDescriptor_t, const cudnnTensorDescriptor_t, - cudnnTensorDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnInitTransformDest"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(transformDesc, srcDesc, destDesc, destSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreateTensorTransformDescriptor( - cudnnTensorTransformDescriptor_t *transformDesc) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t *); - static auto func_ptr = - LoadSymbol("cudnnCreateTensorTransformDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(transformDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetTensorTransformDescriptor( - cudnnTensorTransformDescriptor_t transformDesc, const uint32_t nbDims, - const cudnnTensorFormat_t destFormat, const int32_t padBeforeA[], - const int32_t padAfterA[], const uint32_t foldA[], - const cudnnFoldingDirection_t direction) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnTensorTransformDescriptor_t, const uint32_t, - const cudnnTensorFormat_t, const int32_t[], const int32_t[], - const uint32_t[], const cudnnFoldingDirection_t); - static auto func_ptr = - LoadSymbol("cudnnSetTensorTransformDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(transformDesc, nbDims, destFormat, padBeforeA, padAfterA, - foldA, direction); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetTensorTransformDescriptor( - cudnnTensorTransformDescriptor_t transformDesc, uint32_t nbDimsRequested, - cudnnTensorFormat_t *destFormat, int32_t padBeforeA[], int32_t padAfterA[], - uint32_t foldA[], cudnnFoldingDirection_t *direction) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnTensorTransformDescriptor_t, uint32_t, cudnnTensorFormat_t *, - int32_t[], int32_t[], uint32_t[], cudnnFoldingDirection_t *); - static auto func_ptr = - LoadSymbol("cudnnGetTensorTransformDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(transformDesc, nbDimsRequested, destFormat, padBeforeA, - padAfterA, foldA, direction); -} - -cudnnStatus_t CUDNNWINAPI cudnnDestroyTensorTransformDescriptor( - cudnnTensorTransformDescriptor_t transformDesc) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorTransformDescriptor_t); - static auto func_ptr = - LoadSymbol("cudnnDestroyTensorTransformDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(transformDesc); -} - -cudnnStatus_t CUDNNWINAPI 
cudnnTransformTensor( - cudnnHandle_t handle, const void *alpha, - const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, - const cudnnTensorDescriptor_t yDesc, void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, - const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnTransformTensor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, beta, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI cudnnTransformTensorEx( - cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc, - const void *alpha, const cudnnTensorDescriptor_t srcDesc, - const void *srcData, const void *beta, - const cudnnTensorDescriptor_t destDesc, void *destData) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnTransformTensorEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, - destData); -} - -cudnnStatus_t CUDNNWINAPI cudnnAddTensor(cudnnHandle_t handle, - const void *alpha, - const cudnnTensorDescriptor_t aDesc, - const void *A, const void *beta, - const cudnnTensorDescriptor_t cDesc, - void *C) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, - const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnAddTensor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, aDesc, A, beta, cDesc, C); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateOpTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(opTensorDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetOpTensorDescriptor( - cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t opTensorOp, - cudnnDataType_t opTensorCompType, cudnnNanPropagation_t opTensorNanOpt) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t, - cudnnDataType_t, cudnnNanPropagation_t); - static auto func_ptr = LoadSymbol("cudnnSetOpTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetOpTensorDescriptor( - const cudnnOpTensorDescriptor_t opTensorDesc, cudnnOpTensorOp_t *opTensorOp, - cudnnDataType_t *opTensorCompType, cudnnNanPropagation_t *opTensorNanOpt) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnOpTensorDescriptor_t, cudnnOpTensorOp_t *, cudnnDataType_t *, - cudnnNanPropagation_t *); - static auto func_ptr = LoadSymbol("cudnnGetOpTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(opTensorDesc, opTensorOp, opTensorCompType, opTensorNanOpt); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnOpTensorDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyOpTensorDescriptor"); - if (!func_ptr) return 
GetSymbolNotFoundError(); - return func_ptr(opTensorDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnOpTensor( - cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc, - const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A, - const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B, - const void *beta, const cudnnTensorDescriptor_t cDesc, void *C) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnOpTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnOpTensor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, - beta, cDesc, C); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreateReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t *reduceTensorDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t *); - static auto func_ptr = - LoadSymbol("cudnnCreateReduceTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t reduceTensorOp, cudnnDataType_t reduceTensorCompType, - cudnnNanPropagation_t reduceTensorNanOpt, - cudnnReduceTensorIndices_t reduceTensorIndices, - cudnnIndicesType_t reduceTensorIndicesType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t, cudnnDataType_t, - cudnnNanPropagation_t, cudnnReduceTensorIndices_t, cudnnIndicesType_t); - static auto func_ptr = LoadSymbol("cudnnSetReduceTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, - reduceTensorNanOpt, reduceTensorIndices, - reduceTensorIndicesType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetReduceTensorDescriptor( - const cudnnReduceTensorDescriptor_t reduceTensorDesc, - cudnnReduceTensorOp_t *reduceTensorOp, - cudnnDataType_t *reduceTensorCompType, - cudnnNanPropagation_t *reduceTensorNanOpt, - cudnnReduceTensorIndices_t *reduceTensorIndices, - cudnnIndicesType_t *reduceTensorIndicesType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnReduceTensorDescriptor_t, cudnnReduceTensorOp_t *, - cudnnDataType_t *, cudnnNanPropagation_t *, cudnnReduceTensorIndices_t *, - cudnnIndicesType_t *); - static auto func_ptr = LoadSymbol("cudnnGetReduceTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc, reduceTensorOp, reduceTensorCompType, - reduceTensorNanOpt, reduceTensorIndices, - reduceTensorIndicesType); -} - -cudnnStatus_t CUDNNWINAPI cudnnDestroyReduceTensorDescriptor( - cudnnReduceTensorDescriptor_t reduceTensorDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnReduceTensorDescriptor_t); - static auto func_ptr = - LoadSymbol("cudnnDestroyReduceTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(reduceTensorDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetReductionIndicesSize( - cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const 
cudnnReduceTensorDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetReductionIndicesSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetReductionWorkspaceSize( - cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, - const cudnnTensorDescriptor_t aDesc, const cudnnTensorDescriptor_t cDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnReduceTensorDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetReductionWorkspaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, aDesc, cDesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnReduceTensor( - cudnnHandle_t handle, const cudnnReduceTensorDescriptor_t reduceTensorDesc, - void *indices, size_t indicesSizeInBytes, void *workspace, - size_t workspaceSizeInBytes, const void *alpha, - const cudnnTensorDescriptor_t aDesc, const void *A, const void *beta, - const cudnnTensorDescriptor_t cDesc, void *C) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnReduceTensorDescriptor_t, void *, size_t, - void *, size_t, const void *, const cudnnTensorDescriptor_t, const void *, - const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnReduceTensor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, reduceTensorDesc, indices, indicesSizeInBytes, - workspace, workspaceSizeInBytes, alpha, aDesc, A, beta, cDesc, - C); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetTensor(cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, const void *valuePtr) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); - static auto func_ptr = LoadSymbol("cudnnSetTensor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, yDesc, y, valuePtr); -} - -cudnnStatus_t CUDNNWINAPI cudnnScaleTensor(cudnnHandle_t handle, - const cudnnTensorDescriptor_t yDesc, - void *y, const void *alpha) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, void *, const void *); - static auto func_ptr = LoadSymbol("cudnnScaleTensor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, yDesc, y, alpha); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateFilterDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetFilter4dDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, int k, /* number of output feature maps */ - int c, /* number of input feature maps */ - int h, /* height of each input filter */ - int w) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, - cudnnTensorFormat_t, int, int, int, int); - static auto func_ptr = LoadSymbol("cudnnSetFilter4dDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, dataType, format, k, c, h, 
w); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetFilter4dDescriptor( - const cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, int *k, /* number of output feature maps */ - int *c, /* number of input feature maps */ - int *h, /* height of each input filter */ - int *w) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnFilterDescriptor_t, cudnnDataType_t *, cudnnTensorFormat_t *, - int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetFilter4dDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, dataType, format, k, c, h, w); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetFilterNdDescriptor( - cudnnFilterDescriptor_t filterDesc, - cudnnDataType_t dataType, /* image data type */ - cudnnTensorFormat_t format, int nbDims, const int filterDimA[]) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t, cudnnDataType_t, - cudnnTensorFormat_t, int, const int[]); - static auto func_ptr = LoadSymbol("cudnnSetFilterNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, dataType, format, nbDims, filterDimA); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetFilterNdDescriptor( - const cudnnFilterDescriptor_t filterDesc, int nbDimsRequested, - cudnnDataType_t *dataType, /* image data type */ - cudnnTensorFormat_t *format, int *nbDims, int filterDimA[]) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnFilterDescriptor_t, int, cudnnDataType_t *, - cudnnTensorFormat_t *, int *, int[]); - static auto func_ptr = LoadSymbol("cudnnGetFilterNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, nbDimsRequested, dataType, format, nbDims, - filterDimA); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetFilterSizeInBytes( - const cudnnFilterDescriptor_t filterDesc, size_t *size) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(const cudnnFilterDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetFilterSizeInBytes"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc, size); -} - -cudnnStatus_t CUDNNWINAPI cudnnTransformFilter( - cudnnHandle_t handle, const cudnnTensorTransformDescriptor_t transDesc, - const void *alpha, const cudnnFilterDescriptor_t srcDesc, - const void *srcData, const void *beta, - const cudnnFilterDescriptor_t destDesc, void *destData) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, const void *, - const cudnnFilterDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnTransformFilter"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, - destData); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFilterDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyFilterDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(filterDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSoftmaxForward( - cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, - const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, - const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnSoftmaxAlgorithm_t, 
cudnnSoftmaxMode_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnSoftmaxForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, mode, alpha, xDesc, x, beta, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreatePoolingDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetPooling2dDescriptor( - cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t mode, - cudnnNanPropagation_t maxpoolingNanOpt, int windowHeight, int windowWidth, - int verticalPadding, int horizontalPadding, int verticalStride, - int horizontalStride) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnPoolingDescriptor_t, cudnnPoolingMode_t, cudnnNanPropagation_t, int, - int, int, int, int, int); - static auto func_ptr = LoadSymbol("cudnnSetPooling2dDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, - windowWidth, verticalPadding, horizontalPadding, - verticalStride, horizontalStride); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetPooling2dDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, cudnnPoolingMode_t *mode, - cudnnNanPropagation_t *maxpoolingNanOpt, int *windowHeight, - int *windowWidth, int *verticalPadding, int *horizontalPadding, - int *verticalStride, int *horizontalStride) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnPoolingDescriptor_t, cudnnPoolingMode_t *, - cudnnNanPropagation_t *, int *, int *, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetPooling2dDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, windowHeight, - windowWidth, verticalPadding, horizontalPadding, - verticalStride, horizontalStride); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetPoolingNdDescriptor( - cudnnPoolingDescriptor_t poolingDesc, const cudnnPoolingMode_t mode, - const cudnnNanPropagation_t maxpoolingNanOpt, int nbDims, - const int windowDimA[], const int paddingA[], const int strideA[]) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnPoolingDescriptor_t, const cudnnPoolingMode_t, - const cudnnNanPropagation_t, int, const int[], const int[], const int[]); - static auto func_ptr = LoadSymbol("cudnnSetPoolingNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, mode, maxpoolingNanOpt, nbDims, windowDimA, - paddingA, strideA); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetPoolingNdDescriptor( - const cudnnPoolingDescriptor_t poolingDesc, int nbDimsRequested, - cudnnPoolingMode_t *mode, cudnnNanPropagation_t *maxpoolingNanOpt, - int *nbDims, int windowDimA[], int paddingA[], int strideA[]) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnPoolingDescriptor_t, int, cudnnPoolingMode_t *, - cudnnNanPropagation_t *, int *, int[], int[], int[]); - static auto func_ptr = LoadSymbol("cudnnGetPoolingNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, nbDimsRequested, mode, maxpoolingNanOpt, nbDims, - windowDimA, paddingA, strideA); -} - -cudnnStatus_t CUDNNWINAPI -cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t 
poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int nbDims, int outputTensorDimA[]) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, - const cudnnTensorDescriptor_t, int, int[]); - static auto func_ptr = - LoadSymbol("cudnnGetPoolingNdForwardOutputDim"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, inputTensorDesc, nbDims, outputTensorDimA); -} - -cudnnStatus_t CUDNNWINAPI -cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - int *n, int *c, int *h, int *w) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnPoolingDescriptor_t, - const cudnnTensorDescriptor_t, - int *, int *, int *, int *); - static auto func_ptr = - LoadSymbol("cudnnGetPooling2dForwardOutputDim"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc, inputTensorDesc, n, c, h, w); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPoolingDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyPoolingDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(poolingDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnPoolingForward( - cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, - const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnPoolingForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateActivationDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(activationDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetActivationDescriptor( - cudnnActivationDescriptor_t activationDesc, cudnnActivationMode_t mode, - cudnnNanPropagation_t reluNanOpt, double coef) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t, - cudnnActivationMode_t, - cudnnNanPropagation_t, double); - static auto func_ptr = LoadSymbol("cudnnSetActivationDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(activationDesc, mode, reluNanOpt, coef); -} - -cudnnStatus_t CUDNNWINAPI -cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc, - cudnnActivationMode_t *mode, - cudnnNanPropagation_t *reluNanOpt, double *coef) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnActivationDescriptor_t, cudnnActivationMode_t *, - cudnnNanPropagation_t *, double *); - static auto func_ptr = LoadSymbol("cudnnGetActivationDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(activationDesc, mode, reluNanOpt, coef); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnActivationDescriptor_t); - static auto func_ptr = - 
LoadSymbol("cudnnDestroyActivationDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(activationDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnActivationForward( - cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, - const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, - const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnActivationDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnActivationForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, activationDesc, alpha, xDesc, x, beta, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateLRNDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(normDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, - unsigned lrnN, double lrnAlpha, - double lrnBeta, double lrnK) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnLRNDescriptor_t, unsigned int, double, double, double); - static auto func_ptr = LoadSymbol("cudnnSetLRNDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, - unsigned *lrnN, - double *lrnAlpha, - double *lrnBeta, double *lrnK) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnLRNDescriptor_t, unsigned int *, double *, double *, double *); - static auto func_ptr = LoadSymbol("cudnnGetLRNDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(normDesc, lrnN, lrnAlpha, lrnBeta, lrnK); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnLRNDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyLRNDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(lrnDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelForward( - cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, - const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, - const void *beta, const cudnnTensorDescriptor_t yDesc, void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, lrnMode, alpha, xDesc, x, beta, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationForward( - cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, const void *alpha, - const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - void *temp, void *temp2, const void *beta, - const cudnnTensorDescriptor_t yDesc, void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, - const 
cudnnTensorDescriptor_t, const void *, const void *, void *, void *, - const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = - LoadSymbol("cudnnDivisiveNormalizationForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, temp, temp2, - beta, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI cudnnDeriveBNTensorDescriptor( - cudnnTensorDescriptor_t derivedBnDesc, const cudnnTensorDescriptor_t xDesc, - cudnnBatchNormMode_t mode) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, - const cudnnTensorDescriptor_t, - cudnnBatchNormMode_t); - static auto func_ptr = LoadSymbol("cudnnDeriveBNTensorDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(derivedBnDesc, xDesc, mode); -} - -cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardInference( - cudnnHandle_t handle, cudnnBatchNormMode_t mode, - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, - const void *bnBias, const void *estimatedMean, - const void *estimatedVariance, double epsilon) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, - const void *, const void *, const void *, const void *, double); - static auto func_ptr = - LoadSymbol("cudnnBatchNormalizationForwardInference"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alpha, beta, xDesc, x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, - estimatedVariance, epsilon); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreateSpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t *stDesc) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t *); - static auto func_ptr = - LoadSymbol("cudnnCreateSpatialTransformerDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(stDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetSpatialTransformerNdDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc, cudnnSamplerType_t samplerType, - cudnnDataType_t dataType, const int nbDims, const int dimA[]) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnSpatialTransformerDescriptor_t, cudnnSamplerType_t, cudnnDataType_t, - const int, const int[]); - static auto func_ptr = - LoadSymbol("cudnnSetSpatialTransformerNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(stDesc, samplerType, dataType, nbDims, dimA); -} - -cudnnStatus_t CUDNNWINAPI cudnnDestroySpatialTransformerDescriptor( - cudnnSpatialTransformerDescriptor_t stDesc) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnSpatialTransformerDescriptor_t); - static auto func_ptr = - LoadSymbol("cudnnDestroySpatialTransformerDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(stDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorForward( - cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, - const void *theta, void *grid) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, 
- void *); - static auto func_ptr = - LoadSymbol("cudnnSpatialTfGridGeneratorForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, theta, grid); -} - -cudnnStatus_t CUDNNWINAPI cudnnSpatialTfSamplerForward( - cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, - const void *grid, const void *beta, cudnnTensorDescriptor_t yDesc, - void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, const void *, - cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, alpha, xDesc, x, grid, beta, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateDropoutDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(dropoutDesc); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyDropoutDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(dropoutDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnDropoutGetStatesSize(cudnnHandle_t handle, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnDropoutGetStatesSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnDropoutGetReserveSpaceSize( - cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnTensorDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnDropoutGetReserveSpaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(xdesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetDropoutDescriptor( - cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, - void *states, size_t stateSizeInBytes, unsigned long long seed) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, - float, void *, size_t, unsigned long long); - static auto func_ptr = LoadSymbol("cudnnSetDropoutDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); -} - -cudnnStatus_t CUDNNWINAPI cudnnRestoreDropoutDescriptor( - cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float dropout, - void *states, size_t stateSizeInBytes, unsigned long long seed) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, - float, void *, size_t, unsigned long long); - static auto func_ptr = LoadSymbol("cudnnRestoreDropoutDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(dropoutDesc, handle, dropout, states, stateSizeInBytes, seed); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetDropoutDescriptor( - cudnnDropoutDescriptor_t dropoutDesc, cudnnHandle_t handle, float *dropout, - void **states, unsigned long long *seed) { - using FuncPtr = - 
cudnnStatus_t(CUDNNWINAPI *)(cudnnDropoutDescriptor_t, cudnnHandle_t, - float *, void **, unsigned long long *); - static auto func_ptr = LoadSymbol("cudnnGetDropoutDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(dropoutDesc, handle, dropout, states, seed); -} - -cudnnStatus_t CUDNNWINAPI cudnnDropoutForward( - cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t xdesc, const void *x, - const cudnnTensorDescriptor_t ydesc, void *y, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnDropoutDescriptor_t, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, void *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnDropoutForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, xdesc, x, ydesc, y, reserveSpace, - reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateAlgorithmDescriptor(cudnnAlgorithmDescriptor_t *algoDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(algoDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmDescriptor( - cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t algorithm) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t, - cudnnAlgorithm_t); - static auto func_ptr = LoadSymbol("cudnnSetAlgorithmDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(algoDesc, algorithm); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmDescriptor( - const cudnnAlgorithmDescriptor_t algoDesc, cudnnAlgorithm_t *algorithm) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, - cudnnAlgorithm_t *); - static auto func_ptr = LoadSymbol("cudnnGetAlgorithmDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(algoDesc, algorithm); -} - -cudnnStatus_t CUDNNWINAPI cudnnCopyAlgorithmDescriptor( - const cudnnAlgorithmDescriptor_t src, cudnnAlgorithmDescriptor_t dest) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(const cudnnAlgorithmDescriptor_t, - cudnnAlgorithmDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnCopyAlgorithmDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(src, dest); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyAlgorithmDescriptor(cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyAlgorithmDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(algoDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreateAlgorithmPerformance( - cudnnAlgorithmPerformance_t *algoPerf, int numberToCreate) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); - static auto func_ptr = LoadSymbol("cudnnCreateAlgorithmPerformance"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(algoPerf, numberToCreate); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetAlgorithmPerformance( - cudnnAlgorithmPerformance_t algoPerf, cudnnAlgorithmDescriptor_t algoDesc, - cudnnStatus_t status, float time, size_t memory) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t, - cudnnAlgorithmDescriptor_t, - cudnnStatus_t, float, size_t); - static auto 
func_ptr = LoadSymbol("cudnnSetAlgorithmPerformance"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(algoPerf, algoDesc, status, time, memory); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmPerformance( - const cudnnAlgorithmPerformance_t algoPerf, - cudnnAlgorithmDescriptor_t *algoDesc, cudnnStatus_t *status, float *time, - size_t *memory) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnAlgorithmPerformance_t, cudnnAlgorithmDescriptor_t *, - cudnnStatus_t *, float *, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetAlgorithmPerformance"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(algoPerf, algoDesc, status, time, memory); -} - -cudnnStatus_t CUDNNWINAPI cudnnDestroyAlgorithmPerformance( - cudnnAlgorithmPerformance_t *algoPerf, int numberToDestroy) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnAlgorithmPerformance_t *, int); - static auto func_ptr = - LoadSymbol("cudnnDestroyAlgorithmPerformance"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(algoPerf, numberToDestroy); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetAlgorithmSpaceSize( - cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, - size_t *algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnAlgorithmDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetAlgorithmSpaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algoDesc, algoSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI -cudnnSaveAlgorithm(cudnnHandle_t handle, cudnnAlgorithmDescriptor_t algoDesc, - void *algoSpace, size_t algoSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnAlgorithmDescriptor_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnSaveAlgorithm"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algoDesc, algoSpace, algoSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnRestoreAlgorithm( - cudnnHandle_t handle, void *algoSpace, size_t algoSpaceSizeInBytes, - cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, void *, size_t, - cudnnAlgorithmDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnRestoreAlgorithm"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algoSpace, algoSpaceSizeInBytes, algoDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetCallback(unsigned mask, void *udata, - cudnnCallback_t fptr) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(unsigned int, void *, cudnnCallback_t); - static auto func_ptr = LoadSymbol("cudnnSetCallback"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(mask, udata, fptr); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetCallback(unsigned *mask, void **udata, - cudnnCallback_t *fptr) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(unsigned int *, void **, cudnnCallback_t *); - static auto func_ptr = LoadSymbol("cudnnGetCallback"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(mask, udata, fptr); -} - -cudnnStatus_t CUDNNWINAPI cudnnOpsInferVersionCheck(void) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(); - static auto func_ptr = LoadSymbol("cudnnOpsInferVersionCheck"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t 
*); - static auto func_ptr = - LoadSymbol("cudnnCreateConvolutionDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t); - static auto func_ptr = - LoadSymbol("cudnnDestroyConvolutionDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionMathType( - cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t mathType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, - cudnnMathType_t); - static auto func_ptr = LoadSymbol("cudnnSetConvolutionMathType"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, mathType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionMathType( - cudnnConvolutionDescriptor_t convDesc, cudnnMathType_t *mathType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, - cudnnMathType_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionMathType"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, mathType); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionGroupCount( - cudnnConvolutionDescriptor_t convDesc, int groupCount) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int); - static auto func_ptr = LoadSymbol("cudnnSetConvolutionGroupCount"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, groupCount); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionGroupCount( - cudnnConvolutionDescriptor_t convDesc, int *groupCount) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, int *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionGroupCount"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, groupCount); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionReorderType( - cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t reorderType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, - cudnnReorderType_t); - static auto func_ptr = LoadSymbol("cudnnSetConvolutionReorderType"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, reorderType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionReorderType( - cudnnConvolutionDescriptor_t convDesc, cudnnReorderType_t *reorderType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnConvolutionDescriptor_t, - cudnnReorderType_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionReorderType"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, reorderType); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolution2dDescriptor( - cudnnConvolutionDescriptor_t convDesc, int pad_h, /* zero-padding height */ - int pad_w, /* zero-padding width */ - int u, /* vertical filter stride */ - int v, /* horizontal filter stride */ - int dilation_h, /* filter dilation in the vertical dimension */ - int dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnConvolutionDescriptor_t, int, int, int, int, int, int, - cudnnConvolutionMode_t, cudnnDataType_t); - static auto func_ptr = LoadSymbol("cudnnSetConvolution2dDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - 
return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, - computeType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dDescriptor( - const cudnnConvolutionDescriptor_t convDesc, - int *pad_h, /* zero-padding height */ - int *pad_w, /* zero-padding width */ - int *u, /* vertical filter stride */ - int *v, /* horizontal filter stride */ - int *dilation_h, /* filter dilation in the vertical dimension */ - int *dilation_w, /* filter dilation in the horizontal dimension */ - cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnConvolutionDescriptor_t, int *, int *, int *, int *, int *, - int *, cudnnConvolutionMode_t *, cudnnDataType_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolution2dDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, pad_h, pad_w, u, v, dilation_h, dilation_w, mode, - computeType); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetConvolutionNdDescriptor( - cudnnConvolutionDescriptor_t convDesc, int arrayLength, /* nbDims-2 size */ - const int padA[], const int filterStrideA[], const int dilationA[], - cudnnConvolutionMode_t mode, cudnnDataType_t computeType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[], - cudnnConvolutionMode_t, cudnnDataType_t); - static auto func_ptr = LoadSymbol("cudnnSetConvolutionNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, - computeType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdDescriptor( - const cudnnConvolutionDescriptor_t convDesc, int arrayLengthRequested, - int *arrayLength, int padA[], int strideA[], int dilationA[], - cudnnConvolutionMode_t *mode, cudnnDataType_t *computeType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnConvolutionDescriptor_t, int, int *, int[], int[], int[], - cudnnConvolutionMode_t *, cudnnDataType_t *); - static auto func_ptr = LoadSymbol("cudnnGetConvolutionNdDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, arrayLengthRequested, arrayLength, padA, strideA, - dilationA, mode, computeType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolution2dForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, int *n, int *c, int *h, int *w) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, - const cudnnFilterDescriptor_t, int *, int *, int *, int *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolution2dForwardOutputDim"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, inputTensorDesc, filterDesc, n, c, h, w); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionNdForwardOutputDim( - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t inputTensorDesc, - const cudnnFilterDescriptor_t filterDesc, int nbDims, - int tensorOuputDimA[]) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, - const cudnnFilterDescriptor_t, int, int[]); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionNdForwardOutputDim"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(convDesc, inputTensorDesc, filterDesc, nbDims, - tensorOuputDimA); -} - -cudnnStatus_t 
CUDNNWINAPI -cudnnGetConvolutionForwardAlgorithmMaxCount(cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionForwardAlgorithmMaxCount"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, count); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardAlgorithm_v7( - cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, - const cudnnFilterDescriptor_t filterDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t destDesc, const int requestedAlgoCount, - int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, - const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnTensorDescriptor_t, const int, int *, - cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionForwardAlgorithm_v7"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, filterDesc, convDesc, destDesc, - requestedAlgoCount, returnedAlgoCount, perfResults); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithm( - cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, const int requestedAlgoCount, - int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, - const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnTensorDescriptor_t, const int, int *, - cudnnConvolutionFwdAlgoPerf_t *); - static auto func_ptr = - LoadSymbol("cudnnFindConvolutionForwardAlgorithm"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, - returnedAlgoCount, perfResults); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionForwardAlgorithmEx( - cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, void *y, const int requestedAlgoCount, - int *returnedAlgoCount, cudnnConvolutionFwdAlgoPerf_t *perfResults, - void *workSpace, size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, - const int, int *, cudnnConvolutionFwdAlgoPerf_t *, void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnFindConvolutionForwardAlgorithmEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, wDesc, w, convDesc, yDesc, y, - requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, - workSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI -cudnnIm2Col(cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, - const void *x, const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, void *colBuffer) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnTensorDescriptor_t, - const void *, const cudnnFilterDescriptor_t, - const cudnnConvolutionDescriptor_t, void *); - static auto func_ptr = 
LoadSymbol("cudnnIm2Col"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, wDesc, convDesc, colBuffer); -} - -cudnnStatus_t CUDNNWINAPI cudnnReorderFilterAndBias( - cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, - cudnnReorderType_t reorderType, const void *filterData, - void *reorderedFilterData, int reorderBias, const void *biasData, - void *reorderedBiasData) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnFilterDescriptor_t, cudnnReorderType_t, - const void *, void *, int, const void *, void *); - static auto func_ptr = LoadSymbol("cudnnReorderFilterAndBias"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, reorderType, filterData, - reorderedFilterData, reorderBias, biasData, - reorderedBiasData); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionForwardWorkspaceSize( - cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t yDesc, cudnnConvolutionFwdAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, - const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionForwardWorkspaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnConvolutionForward( - cudnnHandle_t handle, const void *alpha, - const cudnnTensorDescriptor_t xDesc, const void *x, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, - void *workSpace, size_t workSpaceSizeInBytes, const void *beta, - const cudnnTensorDescriptor_t yDesc, void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, - size_t, const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, - workSpaceSizeInBytes, beta, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI cudnnConvolutionBiasActivationForward( - cudnnHandle_t handle, const void *alpha1, - const cudnnTensorDescriptor_t xDesc, const void *x, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo, - void *workSpace, size_t workSpaceSizeInBytes, const void *alpha2, - const cudnnTensorDescriptor_t zDesc, const void *z, - const cudnnTensorDescriptor_t biasDesc, const void *bias, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t yDesc, void *y) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, - size_t, const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnActivationDescriptor_t, const 
cudnnTensorDescriptor_t, void *); - static auto func_ptr = - LoadSymbol("cudnnConvolutionBiasActivationForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, - workSpaceSizeInBytes, alpha2, zDesc, z, biasDesc, bias, - activationDesc, yDesc, y); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithmMaxCount( - cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithmMaxCount"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, count); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithm( - cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, const int requestedAlgoCount, - int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnFilterDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnTensorDescriptor_t, const int, int *, - cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = - LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithm"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, - returnedAlgoCount, perfResults); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardDataAlgorithmEx( - cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, void *dx, - const int requestedAlgoCount, int *returnedAlgoCount, - cudnnConvolutionBwdDataAlgoPerf_t *perfResults, void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, void *, - const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *, void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnFindConvolutionBackwardDataAlgorithmEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, w, dyDesc, dy, convDesc, dxDesc, dx, - requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, - workSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardDataAlgorithm_v7( - cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, const int requestedAlgoCount, - int *returnedAlgoCount, cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnFilterDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnTensorDescriptor_t, const int, int *, - cudnnConvolutionBwdDataAlgoPerf_t *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionBackwardDataAlgorithm_v7"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, - requestedAlgoCount, returnedAlgoCount, perfResults); -} - -cudnnStatus_t 
CUDNNWINAPI cudnnGetConvolutionBackwardDataWorkspaceSize( - cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t dxDesc, cudnnConvolutionBwdDataAlgo_t algo, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnFilterDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, size_t *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionBackwardDataWorkspaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardData( - cudnnHandle_t handle, const void *alpha, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdDataAlgo_t algo, void *workSpace, - size_t workSpaceSizeInBytes, const void *beta, - const cudnnTensorDescriptor_t dxDesc, void *dx) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, - size_t, const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardData"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetFoldedConvBackwardDataDescriptors( - const cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnTensorDescriptor_t gradDesc, - const cudnnTensorFormat_t transformFormat, - cudnnFilterDescriptor_t foldedFilterDesc, - cudnnTensorDescriptor_t paddedDiffDesc, - cudnnConvolutionDescriptor_t foldedConvDesc, - cudnnTensorDescriptor_t foldedGradDesc, - cudnnTensorTransformDescriptor_t filterFoldTransDesc, - cudnnTensorTransformDescriptor_t diffPadTransDesc, - cudnnTensorTransformDescriptor_t gradFoldTransDesc, - cudnnTensorTransformDescriptor_t gradUnfoldTransDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnHandle_t, const cudnnFilterDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnTensorFormat_t, - cudnnFilterDescriptor_t, cudnnTensorDescriptor_t, - cudnnConvolutionDescriptor_t, cudnnTensorDescriptor_t, - cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t, - cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t); - static auto func_ptr = - LoadSymbol("cudnnGetFoldedConvBackwardDataDescriptors"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, filterDesc, diffDesc, convDesc, gradDesc, - transformFormat, foldedFilterDesc, paddedDiffDesc, - foldedConvDesc, foldedGradDesc, filterFoldTransDesc, - diffPadTransDesc, gradFoldTransDesc, gradUnfoldTransDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsConstParamPack( - cudnnFusedOpsConstParamPack_t *constPack, cudnnFusedOps_t ops) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t *, - cudnnFusedOps_t); - static 
auto func_ptr = - LoadSymbol("cudnnCreateFusedOpsConstParamPack"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(constPack, ops); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyFusedOpsConstParamPack(cudnnFusedOpsConstParamPack_t constPack) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t); - static auto func_ptr = - LoadSymbol("cudnnDestroyFusedOpsConstParamPack"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(constPack); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetFusedOpsConstParamPackAttribute( - cudnnFusedOpsConstParamPack_t constPack, - cudnnFusedOpsConstParamLabel_t paramLabel, const void *param) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsConstParamPack_t, - cudnnFusedOpsConstParamLabel_t, - const void *); - static auto func_ptr = - LoadSymbol("cudnnSetFusedOpsConstParamPackAttribute"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(constPack, paramLabel, param); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetFusedOpsConstParamPackAttribute( - const cudnnFusedOpsConstParamPack_t constPack, - cudnnFusedOpsConstParamLabel_t paramLabel, void *param, int *isNULL) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnFusedOpsConstParamPack_t, cudnnFusedOpsConstParamLabel_t, - void *, int *); - static auto func_ptr = - LoadSymbol("cudnnGetFusedOpsConstParamPackAttribute"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(constPack, paramLabel, param, isNULL); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsVariantParamPack( - cudnnFusedOpsVariantParamPack_t *varPack, cudnnFusedOps_t ops) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnFusedOpsVariantParamPack_t *, cudnnFusedOps_t); - static auto func_ptr = - LoadSymbol("cudnnCreateFusedOpsVariantParamPack"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(varPack, ops); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyFusedOpsVariantParamPack(cudnnFusedOpsVariantParamPack_t varPack) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t); - static auto func_ptr = - LoadSymbol("cudnnDestroyFusedOpsVariantParamPack"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(varPack); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetFusedOpsVariantParamPackAttribute( - cudnnFusedOpsVariantParamPack_t varPack, - cudnnFusedOpsVariantParamLabel_t paramLabel, void *ptr) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsVariantParamPack_t, - cudnnFusedOpsVariantParamLabel_t, void *); - static auto func_ptr = - LoadSymbol("cudnnSetFusedOpsVariantParamPackAttribute"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(varPack, paramLabel, ptr); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetFusedOpsVariantParamPackAttribute( - const cudnnFusedOpsVariantParamPack_t varPack, - cudnnFusedOpsVariantParamLabel_t paramLabel, void *ptr) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(const cudnnFusedOpsVariantParamPack_t, - cudnnFusedOpsVariantParamLabel_t, void *); - static auto func_ptr = - LoadSymbol("cudnnGetFusedOpsVariantParamPackAttribute"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(varPack, paramLabel, ptr); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreateFusedOpsPlan(cudnnFusedOpsPlan_t *plan, - cudnnFusedOps_t ops) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsPlan_t *, cudnnFusedOps_t); - static auto func_ptr = LoadSymbol("cudnnCreateFusedOpsPlan"); - if (!func_ptr) return GetSymbolNotFoundError(); 
- return func_ptr(plan, ops); -} - -cudnnStatus_t CUDNNWINAPI cudnnDestroyFusedOpsPlan(cudnnFusedOpsPlan_t plan) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnFusedOpsPlan_t); - static auto func_ptr = LoadSymbol("cudnnDestroyFusedOpsPlan"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(plan); -} - -cudnnStatus_t CUDNNWINAPI -cudnnMakeFusedOpsPlan(cudnnHandle_t handle, cudnnFusedOpsPlan_t plan, - const cudnnFusedOpsConstParamPack_t constPack, - size_t *workspaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnFusedOpsPlan_t, const cudnnFusedOpsConstParamPack_t, - size_t *); - static auto func_ptr = LoadSymbol("cudnnMakeFusedOpsPlan"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, plan, constPack, workspaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI -cudnnFusedOpsExecute(cudnnHandle_t handle, const cudnnFusedOpsPlan_t plan, - cudnnFusedOpsVariantParamPack_t varPack) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, const cudnnFusedOpsPlan_t, - cudnnFusedOpsVariantParamPack_t); - static auto func_ptr = LoadSymbol("cudnnFusedOpsExecute"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, plan, varPack); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateRNNDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyRNNDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v8( - cudnnRNNDescriptor_t rnnDesc, cudnnRNNAlgo_t algo, cudnnRNNMode_t cellMode, - cudnnRNNBiasMode_t biasMode, cudnnDirectionMode_t dirMode, - cudnnRNNInputMode_t inputMode, cudnnDataType_t dataType, - cudnnDataType_t mathPrec, cudnnMathType_t mathType, int32_t inputSize, - int32_t hiddenSize, int32_t projSize, int32_t numLayers, - cudnnDropoutDescriptor_t dropoutDesc, uint32_t auxFlags) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnRNNDescriptor_t, cudnnRNNAlgo_t, cudnnRNNMode_t, cudnnRNNBiasMode_t, - cudnnDirectionMode_t, cudnnRNNInputMode_t, cudnnDataType_t, - cudnnDataType_t, cudnnMathType_t, int32_t, int32_t, int32_t, int32_t, - cudnnDropoutDescriptor_t, uint32_t); - static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v8"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, algo, cellMode, biasMode, dirMode, inputMode, - dataType, mathPrec, mathType, inputSize, hiddenSize, projSize, - numLayers, dropoutDesc, auxFlags); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor_v8( - cudnnRNNDescriptor_t rnnDesc, cudnnRNNAlgo_t *algo, - cudnnRNNMode_t *cellMode, cudnnRNNBiasMode_t *biasMode, - cudnnDirectionMode_t *dirMode, cudnnRNNInputMode_t *inputMode, - cudnnDataType_t *dataType, cudnnDataType_t *mathPrec, - cudnnMathType_t *mathType, int32_t *inputSize, int32_t *hiddenSize, - int32_t *projSize, int32_t *numLayers, - cudnnDropoutDescriptor_t *dropoutDesc, uint32_t *auxFlags) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnRNNDescriptor_t, cudnnRNNAlgo_t *, cudnnRNNMode_t *, - cudnnRNNBiasMode_t *, cudnnDirectionMode_t *, cudnnRNNInputMode_t *, - 
cudnnDataType_t *, cudnnDataType_t *, cudnnMathType_t *, int32_t *, - int32_t *, int32_t *, int32_t *, cudnnDropoutDescriptor_t *, uint32_t *); - static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor_v8"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, algo, cellMode, biasMode, dirMode, inputMode, - dataType, mathPrec, mathType, inputSize, hiddenSize, projSize, - numLayers, dropoutDesc, auxFlags); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDescriptor_v6( - cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, const int hiddenSize, - const int numLayers, cudnnDropoutDescriptor_t dropoutDesc, - cudnnRNNInputMode_t inputMode, cudnnDirectionMode_t direction, - cudnnRNNMode_t cellMode, cudnnRNNAlgo_t algo, cudnnDataType_t mathPrec) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int, - cudnnDropoutDescriptor_t, cudnnRNNInputMode_t, cudnnDirectionMode_t, - cudnnRNNMode_t, cudnnRNNAlgo_t, cudnnDataType_t); - static auto func_ptr = LoadSymbol("cudnnSetRNNDescriptor_v6"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, - inputMode, direction, cellMode, algo, mathPrec); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNDescriptor_v6( - cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int *hiddenSize, - int *numLayers, cudnnDropoutDescriptor_t *dropoutDesc, - cudnnRNNInputMode_t *inputMode, cudnnDirectionMode_t *direction, - cudnnRNNMode_t *cellMode, cudnnRNNAlgo_t *algo, cudnnDataType_t *mathPrec) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnRNNDescriptor_t, int *, int *, - cudnnDropoutDescriptor_t *, cudnnRNNInputMode_t *, cudnnDirectionMode_t *, - cudnnRNNMode_t *, cudnnRNNAlgo_t *, cudnnDataType_t *); - static auto func_ptr = LoadSymbol("cudnnGetRNNDescriptor_v6"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, hiddenSize, numLayers, dropoutDesc, - inputMode, direction, cellMode, algo, mathPrec); -} - -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t); - static auto func_ptr = LoadSymbol("cudnnSetRNNMatrixMathType"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, mType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNMatrixMathType( - cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnMathType_t *); - static auto func_ptr = LoadSymbol("cudnnGetRNNMatrixMathType"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, mType); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, - cudnnRNNBiasMode_t biasMode) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t); - static auto func_ptr = LoadSymbol("cudnnSetRNNBiasMode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, biasMode); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, - cudnnRNNBiasMode_t *biasMode) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, cudnnRNNBiasMode_t *); - static auto func_ptr = LoadSymbol("cudnnGetRNNBiasMode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, biasMode); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNSetClip(cudnnHandle_t handle, - 
cudnnRNNDescriptor_t rnnDesc, - cudnnRNNClipMode_t clipMode, - cudnnNanPropagation_t clipNanOpt, - double lclip, double rclip) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t, - cudnnNanPropagation_t, double, double); - static auto func_ptr = LoadSymbol("cudnnRNNSetClip"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNGetClip(cudnnHandle_t handle, - cudnnRNNDescriptor_t rnnDesc, - cudnnRNNClipMode_t *clipMode, - cudnnNanPropagation_t *clipNanOpt, - double *lclip, double *rclip) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnRNNDescriptor_t, cudnnRNNClipMode_t *, - cudnnNanPropagation_t *, double *, double *); - static auto func_ptr = LoadSymbol("cudnnRNNGetClip"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, clipMode, clipNanOpt, lclip, rclip); -} - -cudnnStatus_t CUDNNWINAPI -cudnnSetRNNProjectionLayers(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, - const int recProjSize, const int outProjSize) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnRNNDescriptor_t, const int, const int); - static auto func_ptr = LoadSymbol("cudnnSetRNNProjectionLayers"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, recProjSize, outProjSize); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNProjectionLayers( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *recProjSize, - int *outProjSize) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNProjectionLayers"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, recProjSize, outProjSize); -} - -cudnnStatus_t CUDNNWINAPI cudnnCreatePersistentRNNPlan( - cudnnRNNDescriptor_t rnnDesc, const int minibatch, - const cudnnDataType_t dataType, cudnnPersistentRNNPlan_t *plan) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, const int, - const cudnnDataType_t, - cudnnPersistentRNNPlan_t *); - static auto func_ptr = LoadSymbol("cudnnCreatePersistentRNNPlan"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, minibatch, dataType, plan); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnPersistentRNNPlan_t); - static auto func_ptr = LoadSymbol("cudnnDestroyPersistentRNNPlan"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(plan); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetPersistentRNNPlan( - cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, - cudnnPersistentRNNPlan_t); - static auto func_ptr = LoadSymbol("cudnnSetPersistentRNNPlan"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, plan); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNWorkspaceSize( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetRNNWorkspaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - 
return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI -cudnnGetRNNParamsSize(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes, - cudnnDataType_t dataType) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnTensorDescriptor_t, - size_t *, cudnnDataType_t); - static auto func_ptr = LoadSymbol("cudnnGetRNNParamsSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, sizeInBytes, dataType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerMatrixParams( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, - cudnnFilterDescriptor_t linLayerMatDesc, void **linLayerMat) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, - const void *, const int, cudnnFilterDescriptor_t, void **); - static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerMatrixParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, - linLayerMatDesc, linLayerMat); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNLinLayerBiasParams( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int pseudoLayer, const cudnnTensorDescriptor_t xDesc, - const cudnnFilterDescriptor_t wDesc, const void *w, const int linLayerID, - cudnnFilterDescriptor_t linLayerBiasDesc, void **linLayerBias) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, - const void *, const int, cudnnFilterDescriptor_t, void **); - static auto func_ptr = LoadSymbol("cudnnGetRNNLinLayerBiasParams"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, pseudoLayer, xDesc, wDesc, w, linLayerID, - linLayerBiasDesc, linLayerBias); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInference( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t cxDesc, const void *cx, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnTensorDescriptor_t *yDesc, void *y, - const cudnnTensorDescriptor_t hyDesc, void *hy, - const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnRNNForwardInference"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, - wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, - workSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, - unsigned 
paddingMode) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, unsigned int); - static auto func_ptr = LoadSymbol("cudnnSetRNNPaddingMode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, paddingMode); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, - unsigned *paddingMode) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDescriptor_t, unsigned int *); - static auto func_ptr = LoadSymbol("cudnnGetRNNPaddingMode"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDesc, paddingMode); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateRNNDataDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDataDesc); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnRNNDataDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyRNNDataDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDataDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNDataDescriptor( - cudnnRNNDataDescriptor_t rnnDataDesc, cudnnDataType_t dataType, - cudnnRNNDataLayout_t layout, int maxSeqLength, int batchSize, - int vectorSize, - const int seqLengthArray[], /* length of each sequence in the batch */ - void *paddingFill) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnRNNDataDescriptor_t, cudnnDataType_t, cudnnRNNDataLayout_t, int, int, - int, const int[], void *); - static auto func_ptr = LoadSymbol("cudnnSetRNNDataDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize, - vectorSize, seqLengthArray, paddingFill); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNDataDescriptor( - cudnnRNNDataDescriptor_t rnnDataDesc, cudnnDataType_t *dataType, - cudnnRNNDataLayout_t *layout, int *maxSeqLength, int *batchSize, - int *vectorSize, int arrayLengthRequested, int seqLengthArray[], - void *paddingFill) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnRNNDataDescriptor_t, cudnnDataType_t *, cudnnRNNDataLayout_t *, - int *, int *, int *, int, int[], void *); - static auto func_ptr = LoadSymbol("cudnnGetRNNDataDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(rnnDataDesc, dataType, layout, maxSeqLength, batchSize, - vectorSize, arrayLengthRequested, seqLengthArray, - paddingFill); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardInferenceEx( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t cxDesc, const void *cx, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnRNNDataDescriptor_t yDesc, void *y, - const cudnnTensorDescriptor_t hyDesc, void *hy, - const cudnnTensorDescriptor_t cyDesc, void *cy, - const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ - const void *keys, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ - void *cAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ - void *iAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t 
qDesc, /* reserved, should pass NULL */ - void *queries, /* reserved, should pass NULL */ - void *workSpace, size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, - const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, - const cudnnRNNDataDescriptor_t, const void *, - const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, - void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnRNNForwardInferenceEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, - yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, cAttn, - iDesc, iAttn, qDesc, queries, workSpace, - workSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetRNNAlgorithmDescriptor( - cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, - cudnnAlgorithmDescriptor_t algoDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnRNNDescriptor_t, cudnnAlgorithmDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnSetRNNAlgorithmDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, algoDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardInferenceAlgorithmMaxCount( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = - LoadSymbol("cudnnGetRNNForwardInferenceAlgorithmMaxCount"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, count); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardInferenceAlgorithmEx( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t cxDesc, const void *cx, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnTensorDescriptor_t *yDesc, void *y, - const cudnnTensorDescriptor_t hyDesc, void *hy, - const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, - const int requestedAlgoCount, int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, void *workspace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, const float, const int, - int *, cudnnAlgorithmPerformance_t *, void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnFindRNNForwardInferenceAlgorithmEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, - wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, - requestedAlgoCount, returnedAlgoCount, perfResults, workspace, - workSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI 
-cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateSeqDataDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(seqDataDesc); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnSeqDataDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroySeqDataDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(seqDataDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetSeqDataDescriptor( - cudnnSeqDataDescriptor_t seqDataDesc, cudnnDataType_t dataType, int nbDims, - const int dimA[], const cudnnSeqDataAxis_t axes[], - size_t seqLengthArraySize, const int seqLengthArray[], void *paddingFill) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnSeqDataDescriptor_t, cudnnDataType_t, int, const int[], - const cudnnSeqDataAxis_t[], size_t, const int[], void *); - static auto func_ptr = LoadSymbol("cudnnSetSeqDataDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(seqDataDesc, dataType, nbDims, dimA, axes, seqLengthArraySize, - seqLengthArray, paddingFill); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetSeqDataDescriptor( - const cudnnSeqDataDescriptor_t seqDataDesc, cudnnDataType_t *dataType, - int *nbDims, int nbDimsRequested, int dimA[], cudnnSeqDataAxis_t axes[], - size_t *seqLengthArraySize, size_t seqLengthSizeRequested, - int seqLengthArray[], void *paddingFill) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - const cudnnSeqDataDescriptor_t, cudnnDataType_t *, int *, int, int[], - cudnnSeqDataAxis_t[], size_t *, size_t, int[], void *); - static auto func_ptr = LoadSymbol("cudnnGetSeqDataDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(seqDataDesc, dataType, nbDims, nbDimsRequested, dimA, axes, - seqLengthArraySize, seqLengthSizeRequested, seqLengthArray, - paddingFill); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAttnDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateAttnDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(attnDesc); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnAttnDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyAttnDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(attnDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetAttnDescriptor( - cudnnAttnDescriptor_t attnDesc, unsigned attnMode, int nHeads, - double smScaler, cudnnDataType_t dataType, cudnnDataType_t computePrec, - cudnnMathType_t mathType, cudnnDropoutDescriptor_t attnDropoutDesc, - cudnnDropoutDescriptor_t postDropoutDesc, int qSize, int kSize, int vSize, - int qProjSize, int kProjSize, int vProjSize, int oProjSize, - int qoMaxSeqLength, int kvMaxSeqLength, int maxBatchSize, int maxBeamSize) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnAttnDescriptor_t, unsigned int, int, double, cudnnDataType_t, - cudnnDataType_t, cudnnMathType_t, cudnnDropoutDescriptor_t, - cudnnDropoutDescriptor_t, int, int, int, int, int, int, int, int, int, - int, int); - static auto func_ptr = LoadSymbol("cudnnSetAttnDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return 
func_ptr(attnDesc, attnMode, nHeads, smScaler, dataType, computePrec, - mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, - vSize, qProjSize, kProjSize, vProjSize, oProjSize, - qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetAttnDescriptor( - cudnnAttnDescriptor_t attnDesc, unsigned *attnMode, int *nHeads, - double *smScaler, cudnnDataType_t *dataType, cudnnDataType_t *computePrec, - cudnnMathType_t *mathType, cudnnDropoutDescriptor_t *attnDropoutDesc, - cudnnDropoutDescriptor_t *postDropoutDesc, int *qSize, int *kSize, - int *vSize, int *qProjSize, int *kProjSize, int *vProjSize, int *oProjSize, - int *qoMaxSeqLength, int *kvMaxSeqLength, int *maxBatchSize, - int *maxBeamSize) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnAttnDescriptor_t, unsigned int *, int *, double *, cudnnDataType_t *, - cudnnDataType_t *, cudnnMathType_t *, cudnnDropoutDescriptor_t *, - cudnnDropoutDescriptor_t *, int *, int *, int *, int *, int *, int *, - int *, int *, int *, int *, int *); - static auto func_ptr = LoadSymbol("cudnnGetAttnDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(attnDesc, attnMode, nHeads, smScaler, dataType, computePrec, - mathType, attnDropoutDesc, postDropoutDesc, qSize, kSize, - vSize, qProjSize, kProjSize, vProjSize, oProjSize, - qoMaxSeqLength, kvMaxSeqLength, maxBatchSize, maxBeamSize); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetMultiHeadAttnBuffers( - cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, - size_t *weightSizeInBytes, size_t *workSpaceSizeInBytes, - size_t *reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnAttnDescriptor_t, size_t *, size_t *, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetMultiHeadAttnBuffers"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, weightSizeInBytes, workSpaceSizeInBytes, - reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetMultiHeadAttnWeights( - cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, - cudnnMultiHeadAttnWeightKind_t wKind, size_t weightSizeInBytes, - const void *weights, cudnnTensorDescriptor_t wDesc, void **wAddr) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnAttnDescriptor_t, - cudnnMultiHeadAttnWeightKind_t, size_t, const void *, - cudnnTensorDescriptor_t, void **); - static auto func_ptr = LoadSymbol("cudnnGetMultiHeadAttnWeights"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, wKind, weightSizeInBytes, weights, wDesc, - wAddr); -} - -cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnForward( - cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, int currIdx, - const int loWinIdx[], const int hiWinIdx[], const int devSeqLengthsQO[], - const int devSeqLengthsKV[], const cudnnSeqDataDescriptor_t qDesc, - const void *queries, const void *residuals, - const cudnnSeqDataDescriptor_t kDesc, const void *keys, - const cudnnSeqDataDescriptor_t vDesc, const void *values, - const cudnnSeqDataDescriptor_t oDesc, void *out, size_t weightSizeInBytes, - const void *weights, size_t workSpaceSizeInBytes, void *workSpace, - size_t reserveSpaceSizeInBytes, void *reserveSpace) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnAttnDescriptor_t, int, const int[], const int[], - const int[], const int[], const cudnnSeqDataDescriptor_t, const void *, - const void *, const cudnnSeqDataDescriptor_t, const void *, - const 
cudnnSeqDataDescriptor_t, const void *, - const cudnnSeqDataDescriptor_t, void *, size_t, const void *, size_t, - void *, size_t, void *); - static auto func_ptr = LoadSymbol("cudnnMultiHeadAttnForward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, currIdx, loWinIdx, hiWinIdx, - devSeqLengthsQO, devSeqLengthsKV, qDesc, queries, residuals, - kDesc, keys, vDesc, values, oDesc, out, weightSizeInBytes, - weights, workSpaceSizeInBytes, workSpace, - reserveSpaceSizeInBytes, reserveSpace); -} - -cudnnStatus_t CUDNNWINAPI cudnnAdvInferVersionCheck(void) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(); - static auto func_ptr = LoadSymbol("cudnnAdvInferVersionCheck"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(); -} - -cudnnStatus_t CUDNNWINAPI cudnnSoftmaxBackward( - cudnnHandle_t handle, cudnnSoftmaxAlgorithm_t algo, cudnnSoftmaxMode_t mode, - const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, - const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, - const cudnnTensorDescriptor_t dxDesc, void *dx) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnSoftmaxAlgorithm_t, cudnnSoftmaxMode_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnSoftmaxBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, mode, alpha, yDesc, y, dyDesc, dy, beta, dxDesc, - dx); -} - -cudnnStatus_t CUDNNWINAPI cudnnPoolingBackward( - cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc, - const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, - const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, - const cudnnTensorDescriptor_t dxDesc, void *dx) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnPoolingDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnPoolingBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, - beta, dxDesc, dx); -} - -cudnnStatus_t CUDNNWINAPI cudnnActivationBackward( - cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc, - const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y, - const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, - const cudnnTensorDescriptor_t dxDesc, void *dx) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnActivationDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnActivationBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, - beta, dxDesc, dx); -} - -cudnnStatus_t CUDNNWINAPI cudnnLRNCrossChannelBackward( - cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, cudnnLRNMode_t lrnMode, - const void *alpha, const 
cudnnTensorDescriptor_t yDesc, const void *y, - const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnTensorDescriptor_t xDesc, const void *x, const void *beta, - const cudnnTensorDescriptor_t dxDesc, void *dx) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnLRNDescriptor_t, cudnnLRNMode_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnLRNCrossChannelBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, lrnMode, alpha, yDesc, y, dyDesc, dy, xDesc, - x, beta, dxDesc, dx); -} - -cudnnStatus_t CUDNNWINAPI cudnnDivisiveNormalizationBackward( - cudnnHandle_t handle, cudnnLRNDescriptor_t normDesc, - cudnnDivNormMode_t mode, const void *alpha, - const cudnnTensorDescriptor_t - xDesc, /* same desc for x, means, dy, temp, temp2 */ - const void *x, - const void *means, /* if NULL, means are assumed to be zero */ - const void *dy, void *temp, void *temp2, const void *beta, - const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */ - void *dx, /* output x differential */ - void *dMeans) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnLRNDescriptor_t, cudnnDivNormMode_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, const void *, - void *, void *, const void *, const cudnnTensorDescriptor_t, void *, - void *); - static auto func_ptr = - LoadSymbol("cudnnDivisiveNormalizationBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, normDesc, mode, alpha, xDesc, x, means, dy, temp, - temp2, beta, dXdMeansDesc, dx, dMeans); -} - -cudnnStatus_t CUDNNWINAPI -cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, - const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t zDesc, - const cudnnTensorDescriptor_t yDesc, - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, - const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, - const cudnnActivationDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol( - "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, xDesc, zDesc, yDesc, - bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationBackwardExWorkspaceSize( - cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, - const cudnnTensorDescriptor_t xDesc, const cudnnTensorDescriptor_t yDesc, - const cudnnTensorDescriptor_t dyDesc, const cudnnTensorDescriptor_t dzDesc, - const cudnnTensorDescriptor_t dxDesc, - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, - const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, - const cudnnTensorDescriptor_t, const 
cudnnTensorDescriptor_t, - const cudnnActivationDescriptor_t, size_t *); - static auto func_ptr = - LoadSymbol("cudnnGetBatchNormalizationBackwardExWorkspaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc, - dBnScaleBiasDesc, activationDesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, - const cudnnActivationDescriptor_t activationDesc, - const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, - const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, - size_t *); - static auto func_ptr = LoadSymbol( - "cudnnGetBatchNormalizationTrainingExReserveSpaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTraining( - cudnnHandle_t handle, cudnnBatchNormMode_t mode, - - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - - const cudnnTensorDescriptor_t xDesc, const void *x, /* NxCxHxW */ - const cudnnTensorDescriptor_t yDesc, void *y, /* NxCxHxW */ - - /* Shared desc for the next 6 tensors in the argument list. - Data type to be set as follows: - type = (typeOf(x) == double) ? double : float - Dimensions for this descriptor depend on normalization mode - - Spatial Normalization : tensors are expected to have dims 1xCx1x1 - (normalization is performed across NxHxW) - - Per-Activation Normalization : tensors are expected to have dims of - 1xCxHxW (normalization is performed across N) */ - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, - - /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation - */ - const void *bnScale, const void *bnBias, - - /* MUST use factor=1 in the very first call of a complete training cycle. - Use a factor=1/(1+n) at N-th call to the function to get - Cumulative Moving Average (CMA) behavior - CMA[n] = (x[1]+...+x[n])/n - Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) = - ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) = - CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */ - double exponentialAverageFactor, - - /* Used in Training phase only. - runningMean = newMean*factor + runningMean*(1-factor) */ - void *resultRunningMean, - /* Output in training mode, input in inference. Is the moving average - of variance[x] (factor is applied in the same way as for runningMean) */ - void *resultRunningVariance, - - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and - backward functions. */ - double epsilon, - - /* Optionally save intermediate results from the forward pass here - - can be reused to speed up backward pass. 
NULL if unused */ - void *resultSaveMean, void *resultSaveInvVariance) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, - const void *, const void *, double, void *, void *, double, void *, - void *); - static auto func_ptr = - LoadSymbol("cudnnBatchNormalizationForwardTraining"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr( - handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, - bnScale, bnBias, exponentialAverageFactor, resultRunningMean, - resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance); -} - -cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationForwardTrainingEx( - cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, - - const void *alpha, /* alpha[0] = result blend factor */ - const void *beta, /* beta[0] = dest layer blend factor */ - - const cudnnTensorDescriptor_t xDesc, const void *xData, - const cudnnTensorDescriptor_t zDesc, const void *zData, - const cudnnTensorDescriptor_t yDesc, void *yData, - - const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, const void *bnScale, - const void *bnBias, - - double exponentialAverageFactor, void *resultRunningMean, - void *resultRunningVariance, - - /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and - backward functions. */ - double epsilon, - - /* Optionally save intermediate results from the forward pass here - - can be reused to speed up backward pass. NULL if unused */ - void *resultSaveMean, void *resultSaveInvVariance, - - cudnnActivationDescriptor_t activationDesc, void *workspace, - size_t workSpaceSizeInBytes, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, - const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, - const void *, const void *, double, void *, void *, double, void *, - void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnBatchNormalizationForwardTrainingEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData, - yDesc, yData, bnScaleBiasMeanVarDesc, bnScale, bnBias, - exponentialAverageFactor, resultRunningMean, - resultRunningVariance, epsilon, resultSaveMean, - resultSaveInvVariance, activationDesc, workspace, - workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackward( - cudnnHandle_t handle, cudnnBatchNormMode_t mode, const void *alphaDataDiff, - const void *betaDataDiff, const void *alphaParamDiff, - const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */ - const void *x, const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnTensorDescriptor_t dxDesc, void *dx, - /* Shared tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, - const void *bnScale, /* bnBias doesn't affect backpropagation */ - /* scale and bias diff are not backpropagated below this layer */ - void *dBnScaleResult, void *dBnBiasResult, - /* Same epsilon as forward pass */ - double epsilon, - - /* Optionally cached 
intermediate results from - forward pass */ - const void *savedMean, const void *savedInvVariance) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, - const void *, const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, - const void *, void *, void *, double, const void *, const void *); - static auto func_ptr = LoadSymbol("cudnnBatchNormalizationBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, mode, alphaDataDiff, betaDataDiff, alphaParamDiff, - betaParamDiff, xDesc, x, dyDesc, dy, dxDesc, dx, - dBnScaleBiasDesc, bnScale, dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance); -} - -cudnnStatus_t CUDNNWINAPI cudnnBatchNormalizationBackwardEx( - cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, - - const void *alphaDataDiff, const void *betaDataDiff, - const void *alphaParamDiff, const void *betaParamDiff, - const cudnnTensorDescriptor_t xDesc, const void *xData, - const cudnnTensorDescriptor_t yDesc, const void *yData, - const cudnnTensorDescriptor_t dyDesc, const void *dyData, - const cudnnTensorDescriptor_t dzDesc, void *dzData, - const cudnnTensorDescriptor_t dxDesc, void *dxData, - - /* Shared tensor desc for the 4 tensors below */ - const cudnnTensorDescriptor_t dBnScaleBiasDesc, const void *bnScaleData, - const void *bnBiasData, /* needed if there is activation */ - void *dBnScaleData, void *dBnBiasData, - double epsilon, /* Same epsilon as forward pass */ - - /* Optionally cached intermediate results from - forward pass */ - const void *savedMean, const void *savedInvVariance, - cudnnActivationDescriptor_t activationDesc, void *workSpace, - size_t workSpaceSizeInBytes, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, - const void *, const void *, const void *, const cudnnTensorDescriptor_t, - const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, - void *, double, const void *, const void *, cudnnActivationDescriptor_t, - void *, size_t, void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnBatchNormalizationBackwardEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr( - handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff, - betaParamDiff, xDesc, xData, yDesc, yData, dyDesc, dyData, dzDesc, dzData, - dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData, - dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc, - workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnSpatialTfGridGeneratorBackward( - cudnnHandle_t handle, const cudnnSpatialTransformerDescriptor_t stDesc, - const void *dgrid, void *dtheta) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnSpatialTransformerDescriptor_t, const void *, - void *); - static auto func_ptr = - LoadSymbol("cudnnSpatialTfGridGeneratorBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, dgrid, dtheta); -} - -cudnnStatus_t CUDNNWINAPI 
cudnnSpatialTfSamplerBackward( - cudnnHandle_t handle, cudnnSpatialTransformerDescriptor_t stDesc, - const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x, - const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx, - const void *alphaDgrid, const cudnnTensorDescriptor_t dyDesc, - const void *dy, const void *grid, const void *betaDgrid, void *dgrid) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnSpatialTransformerDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, - const cudnnTensorDescriptor_t, void *, const void *, - const cudnnTensorDescriptor_t, const void *, const void *, const void *, - void *); - static auto func_ptr = LoadSymbol("cudnnSpatialTfSamplerBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, stDesc, alpha, xDesc, x, beta, dxDesc, dx, alphaDgrid, - dyDesc, dy, grid, betaDgrid, dgrid); -} - -cudnnStatus_t CUDNNWINAPI cudnnDropoutBackward( - cudnnHandle_t handle, const cudnnDropoutDescriptor_t dropoutDesc, - const cudnnTensorDescriptor_t dydesc, const void *dy, - const cudnnTensorDescriptor_t dxdesc, void *dx, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnDropoutDescriptor_t, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, void *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnDropoutBackward"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, dropoutDesc, dydesc, dy, dxdesc, dx, reserveSpace, - reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnOpsTrainVersionCheck(void) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(); - static auto func_ptr = LoadSymbol("cudnnOpsTrainVersionCheck"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithmMaxCount( - cudnnHandle_t handle, int *count) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnHandle_t, int *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithmMaxCount"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, count); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithm( - cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, const int requestedAlgoCount, - int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnFilterDescriptor_t, const int, int *, - cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = - LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithm"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, - returnedAlgoCount, perfResults); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindConvolutionBackwardFilterAlgorithmEx( - cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, const void *x, - const cudnnTensorDescriptor_t dyDesc, const void *y, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t dwDesc, void *dw, - const int requestedAlgoCount, int *returnedAlgoCount, - cudnnConvolutionBwdFilterAlgoPerf_t 
*perfResults, void *workSpace, - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, void *, - const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *, void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnFindConvolutionBackwardFilterAlgorithmEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, x, dyDesc, y, convDesc, dwDesc, dw, - requestedAlgoCount, returnedAlgoCount, perfResults, workSpace, - workSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterAlgorithm_v7( - cudnnHandle_t handle, const cudnnTensorDescriptor_t srcDesc, - const cudnnTensorDescriptor_t diffDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, const int requestedAlgoCount, - int *returnedAlgoCount, cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnFilterDescriptor_t, const int, int *, - cudnnConvolutionBwdFilterAlgoPerf_t *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionBackwardFilterAlgorithm_v7"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, srcDesc, diffDesc, convDesc, gradDesc, - requestedAlgoCount, returnedAlgoCount, perfResults); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetConvolutionBackwardFilterWorkspaceSize( - cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, - const cudnnTensorDescriptor_t dyDesc, - const cudnnConvolutionDescriptor_t convDesc, - const cudnnFilterDescriptor_t gradDesc, - cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, - const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); - static auto func_ptr = - LoadSymbol("cudnnGetConvolutionBackwardFilterWorkspaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, xDesc, dyDesc, convDesc, gradDesc, algo, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardFilter( - cudnnHandle_t handle, const void *alpha, - const cudnnTensorDescriptor_t xDesc, const void *x, - const cudnnTensorDescriptor_t dyDesc, const void *dy, - const cudnnConvolutionDescriptor_t convDesc, - cudnnConvolutionBwdFilterAlgo_t algo, void *workSpace, - size_t workSpaceSizeInBytes, const void *beta, - const cudnnFilterDescriptor_t dwDesc, void *dw) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, - void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardFilter"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); -} - -cudnnStatus_t CUDNNWINAPI cudnnConvolutionBackwardBias( - cudnnHandle_t handle, const void *alpha, - const cudnnTensorDescriptor_t dyDesc, const void *dy, const void *beta, - const 
cudnnTensorDescriptor_t dbDesc, void *db) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, - const void *, const cudnnTensorDescriptor_t, void *); - static auto func_ptr = LoadSymbol("cudnnConvolutionBackwardBias"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, alpha, dyDesc, dy, beta, dbDesc, db); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNTrainingReserveSize( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *xDesc, - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetRNNTrainingReserveSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTraining( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t cxDesc, const void *cx, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnTensorDescriptor_t *yDesc, void *y, - const cudnnTensorDescriptor_t hyDesc, void *hy, - const cudnnTensorDescriptor_t cyDesc, void *cy, void *workspace, - size_t workSpaceSizeInBytes, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, - size_t); - static auto func_ptr = LoadSymbol("cudnnRNNForwardTraining"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, - wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, workspace, - workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI -cudnnRNNBackwardData(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *yDesc, - const void *y, const cudnnTensorDescriptor_t *dyDesc, - const void *dy, const cudnnTensorDescriptor_t dhyDesc, - const void *dhy, const cudnnTensorDescriptor_t dcyDesc, - const void *dcy, const cudnnFilterDescriptor_t wDesc, - const void *w, const cudnnTensorDescriptor_t hxDesc, - const void *hx, const cudnnTensorDescriptor_t cxDesc, - const void *cx, const cudnnTensorDescriptor_t *dxDesc, - void *dx, const cudnnTensorDescriptor_t dhxDesc, void *dhx, - const cudnnTensorDescriptor_t dcxDesc, void *dcx, - void *workspace, size_t workSpaceSizeInBytes, - void *reserveSpace, size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void 
*, - const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, void *, size_t, void *, - size_t); - static auto func_ptr = LoadSymbol("cudnnRNNBackwardData"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, - dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, - dx, dhxDesc, dhx, dcxDesc, dcx, workspace, - workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeights( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t *yDesc, const void *y, const void *workspace, - size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, - const void *reserveSpace, size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t *, const void *, const void *, size_t, - const cudnnFilterDescriptor_t, void *, const void *, size_t); - static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeights"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, - workspace, workSpaceSizeInBytes, dwDesc, dw, reserveSpace, - reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNForwardTrainingEx( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t cxDesc, const void *cx, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnRNNDataDescriptor_t yDesc, void *y, - const cudnnTensorDescriptor_t hyDesc, void *hy, - const cudnnTensorDescriptor_t cyDesc, void *cy, - const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */ - const void *keys, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */ - void *cAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */ - void *iAttn, /* reserved, should pass NULL */ - const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */ - void *queries, /* reserved, should pass NULL */ - void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, - const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, - const cudnnRNNDataDescriptor_t, const void *, - const cudnnRNNDataDescriptor_t, void *, const cudnnRNNDataDescriptor_t, - void *, const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, - size_t); - static auto func_ptr = LoadSymbol("cudnnRNNForwardTrainingEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, cxDesc, cx, wDesc, w, - yDesc, y, hyDesc, hy, cyDesc, cy, kDesc, keys, cDesc, 
cAttn, - iDesc, iAttn, qDesc, queries, workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardDataEx( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t yDesc, const void *y, - const cudnnRNNDataDescriptor_t dyDesc, const void *dy, - const cudnnRNNDataDescriptor_t dcDesc, /* reserved, should pass NULL */ - const void *dcAttn, /* reserved, should pass NULL */ - const cudnnTensorDescriptor_t dhyDesc, const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, const void *dcy, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t cxDesc, const void *cx, - const cudnnRNNDataDescriptor_t dxDesc, void *dx, - const cudnnTensorDescriptor_t dhxDesc, void *dhx, - const cudnnTensorDescriptor_t dcxDesc, void *dcx, - const cudnnRNNDataDescriptor_t dkDesc, /* reserved, should pass NULL */ - void *dkeys, /* reserved, should pass NULL */ - void *workSpace, size_t workSpaceSizeInBytes, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, - const void *, const cudnnRNNDataDescriptor_t, const void *, - const cudnnRNNDataDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnRNNDataDescriptor_t, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, - const cudnnRNNDataDescriptor_t, void *, void *, size_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnRNNBackwardDataEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, yDesc, y, dyDesc, dy, dcDesc, dcAttn, - dhyDesc, dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, - dxDesc, dx, dhxDesc, dhx, dcxDesc, dcx, dkDesc, dkeys, - workSpace, workSpaceSizeInBytes, reserveSpace, - reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnRNNBackwardWeightsEx( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const cudnnRNNDataDescriptor_t xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnRNNDataDescriptor_t yDesc, const void *y, void *workSpace, - size_t workSpaceSizeInBytes, const cudnnFilterDescriptor_t dwDesc, void *dw, - void *reserveSpace, size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const cudnnRNNDataDescriptor_t, - const void *, const cudnnTensorDescriptor_t, const void *, - const cudnnRNNDataDescriptor_t, const void *, void *, size_t, - const cudnnFilterDescriptor_t, void *, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnRNNBackwardWeightsEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, xDesc, x, hxDesc, hx, yDesc, y, workSpace, - workSpaceSizeInBytes, dwDesc, dw, reserveSpace, - reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNForwardTrainingAlgorithmMaxCount( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = - LoadSymbol("cudnnGetRNNForwardTrainingAlgorithmMaxCount"); - if 
(!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, count); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindRNNForwardTrainingAlgorithmEx( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t cxDesc, const void *cx, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnTensorDescriptor_t *yDesc, void *y, - const cudnnTensorDescriptor_t hyDesc, void *hy, - const cudnnTensorDescriptor_t cyDesc, void *cy, const float findIntensity, - const int requestedAlgoCount, int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, void *workspace, - size_t workSpaceSizeInBytes, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, const float, const int, - int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnFindRNNForwardTrainingAlgorithmEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, cxDesc, cx, - wDesc, w, yDesc, y, hyDesc, hy, cyDesc, cy, findIntensity, - requestedAlgoCount, returnedAlgoCount, perfResults, workspace, - workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardDataAlgorithmMaxCount( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = - LoadSymbol("cudnnGetRNNBackwardDataAlgorithmMaxCount"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, count); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardDataAlgorithmEx( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *yDesc, const void *y, - const cudnnTensorDescriptor_t *dyDesc, const void *dy, - const cudnnTensorDescriptor_t dhyDesc, const void *dhy, - const cudnnTensorDescriptor_t dcyDesc, const void *dcy, - const cudnnFilterDescriptor_t wDesc, const void *w, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t cxDesc, const void *cx, - const cudnnTensorDescriptor_t *dxDesc, void *dx, - const cudnnTensorDescriptor_t dhxDesc, void *dhx, - const cudnnTensorDescriptor_t dcxDesc, void *dcx, const float findIntensity, - const int requestedAlgoCount, int *returnedAlgoCount, - cudnnAlgorithmPerformance_t *perfResults, void *workspace, - size_t workSpaceSizeInBytes, void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnFilterDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t, const void *, - 
const cudnnTensorDescriptor_t *, void *, const cudnnTensorDescriptor_t, - void *, const cudnnTensorDescriptor_t, void *, const float, const int, - int *, cudnnAlgorithmPerformance_t *, void *, size_t, void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnFindRNNBackwardDataAlgorithmEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, yDesc, y, dyDesc, dy, dhyDesc, - dhy, dcyDesc, dcy, wDesc, w, hxDesc, hx, cxDesc, cx, dxDesc, - dx, dhxDesc, dhx, dcxDesc, dcx, findIntensity, - requestedAlgoCount, returnedAlgoCount, perfResults, workspace, - workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetRNNBackwardWeightsAlgorithmMaxCount( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, int *); - static auto func_ptr = - LoadSymbol("cudnnGetRNNBackwardWeightsAlgorithmMaxCount"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, count); -} - -cudnnStatus_t CUDNNWINAPI cudnnFindRNNBackwardWeightsAlgorithmEx( - cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, - const int seqLength, const cudnnTensorDescriptor_t *xDesc, const void *x, - const cudnnTensorDescriptor_t hxDesc, const void *hx, - const cudnnTensorDescriptor_t *yDesc, const void *y, - const float findIntensity, const int requestedAlgoCount, - int *returnedAlgoCount, cudnnAlgorithmPerformance_t *perfResults, - const void *workspace, size_t workSpaceSizeInBytes, - const cudnnFilterDescriptor_t dwDesc, void *dw, const void *reserveSpace, - size_t reserveSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnRNNDescriptor_t, const int, - const cudnnTensorDescriptor_t *, const void *, - const cudnnTensorDescriptor_t, const void *, - const cudnnTensorDescriptor_t *, const void *, const float, const int, - int *, cudnnAlgorithmPerformance_t *, const void *, size_t, - const cudnnFilterDescriptor_t, void *, const void *, size_t); - static auto func_ptr = - LoadSymbol("cudnnFindRNNBackwardWeightsAlgorithmEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, rnnDesc, seqLength, xDesc, x, hxDesc, hx, yDesc, y, - findIntensity, requestedAlgoCount, returnedAlgoCount, - perfResults, workspace, workSpaceSizeInBytes, dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnBackwardData( - cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, - const int loWinIdx[], const int hiWinIdx[], const int devSeqLengthsDQDO[], - const int devSeqLengthsDKDV[], const cudnnSeqDataDescriptor_t doDesc, - const void *dout, const cudnnSeqDataDescriptor_t dqDesc, void *dqueries, - const void *queries, const cudnnSeqDataDescriptor_t dkDesc, void *dkeys, - const void *keys, const cudnnSeqDataDescriptor_t dvDesc, void *dvalues, - const void *values, size_t weightSizeInBytes, const void *weights, - size_t workSpaceSizeInBytes, void *workSpace, - size_t reserveSpaceSizeInBytes, void *reserveSpace) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnAttnDescriptor_t, const int[], const int[], - const int[], const int[], const cudnnSeqDataDescriptor_t, const void *, - const cudnnSeqDataDescriptor_t, void *, const void *, - const cudnnSeqDataDescriptor_t, void *, const void *, - const cudnnSeqDataDescriptor_t, void *, const void *, size_t, - const void *, size_t, void *, size_t, 
void *); - static auto func_ptr = LoadSymbol("cudnnMultiHeadAttnBackwardData"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, loWinIdx, hiWinIdx, devSeqLengthsDQDO, - devSeqLengthsDKDV, doDesc, dout, dqDesc, dqueries, queries, - dkDesc, dkeys, keys, dvDesc, dvalues, values, - weightSizeInBytes, weights, workSpaceSizeInBytes, workSpace, - reserveSpaceSizeInBytes, reserveSpace); -} - -cudnnStatus_t CUDNNWINAPI cudnnMultiHeadAttnBackwardWeights( - cudnnHandle_t handle, const cudnnAttnDescriptor_t attnDesc, - cudnnWgradMode_t addGrad, const cudnnSeqDataDescriptor_t qDesc, - const void *queries, const cudnnSeqDataDescriptor_t kDesc, const void *keys, - const cudnnSeqDataDescriptor_t vDesc, const void *values, - const cudnnSeqDataDescriptor_t doDesc, const void *dout, - size_t weightSizeInBytes, const void *weights, void *dweights, - size_t workSpaceSizeInBytes, void *workSpace, - size_t reserveSpaceSizeInBytes, void *reserveSpace) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnAttnDescriptor_t, cudnnWgradMode_t, - const cudnnSeqDataDescriptor_t, const void *, - const cudnnSeqDataDescriptor_t, const void *, - const cudnnSeqDataDescriptor_t, const void *, - const cudnnSeqDataDescriptor_t, const void *, size_t, const void *, - void *, size_t, void *, size_t, void *); - static auto func_ptr = - LoadSymbol("cudnnMultiHeadAttnBackwardWeights"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, attnDesc, addGrad, qDesc, queries, kDesc, keys, vDesc, - values, doDesc, dout, weightSizeInBytes, weights, dweights, - workSpaceSizeInBytes, workSpace, reserveSpaceSizeInBytes, - reserveSpace); -} - -cudnnStatus_t CUDNNWINAPI -cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t *); - static auto func_ptr = LoadSymbol("cudnnCreateCTCLossDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(ctcLossDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor( - cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t); - static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(ctcLossDesc, compType); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptorEx( - cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType, - cudnnLossNormalizationMode_t normMode, cudnnNanPropagation_t gradMode) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t, - cudnnNanPropagation_t); - static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptorEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(ctcLossDesc, compType, normMode, gradMode); -} - -cudnnStatus_t CUDNNWINAPI cudnnSetCTCLossDescriptor_v8( - cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType, - cudnnLossNormalizationMode_t normMode, cudnnNanPropagation_t gradMode, - int maxLabelLength) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnCTCLossDescriptor_t, cudnnDataType_t, cudnnLossNormalizationMode_t, - cudnnNanPropagation_t, int); - static auto func_ptr = LoadSymbol("cudnnSetCTCLossDescriptor_v8"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(ctcLossDesc, compType, normMode, gradMode, maxLabelLength); -} - -cudnnStatus_t CUDNNWINAPI 
cudnnGetCTCLossDescriptor( - cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType) { - using FuncPtr = - cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t, cudnnDataType_t *); - static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(ctcLossDesc, compType); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptorEx( - cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType, - cudnnLossNormalizationMode_t *normMode, cudnnNanPropagation_t *gradMode) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnCTCLossDescriptor_t, cudnnDataType_t *, - cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *); - static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptorEx"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(ctcLossDesc, compType, normMode, gradMode); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossDescriptor_v8( - cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType, - cudnnLossNormalizationMode_t *normMode, cudnnNanPropagation_t *gradMode, - int *maxLabelLength) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnCTCLossDescriptor_t, cudnnDataType_t *, - cudnnLossNormalizationMode_t *, cudnnNanPropagation_t *, int *); - static auto func_ptr = LoadSymbol("cudnnGetCTCLossDescriptor_v8"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(ctcLossDesc, compType, normMode, gradMode, maxLabelLength); -} - -cudnnStatus_t CUDNNWINAPI -cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(cudnnCTCLossDescriptor_t); - static auto func_ptr = LoadSymbol("cudnnDestroyCTCLossDescriptor"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(ctcLossDesc); -} - -cudnnStatus_t CUDNNWINAPI cudnnCTCLoss( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t - probsDesc, /* Tensor descriptor for probabilities, the dimensions are - T,N,A (T is the timing steps, N is the - mini batch size, A is the alphabet size) */ - const void *probs, /* probabilities after softmax, in GPU memory */ - const int hostLabels[], /* labels, in CPU memory */ - const int hostLabelLengths[], /* the length of each label, in CPU memory */ - const int hostInputLengths[], /* the lengths of timing steps in each batch, - in CPU memory */ - void *costs, /* the returned costs of CTC, in GPU memory */ - const cudnnTensorDescriptor_t - gradientsDesc, /* Tensor descriptor for gradients, the dimensions are - T,N,A */ - void *gradients, /* the returned CTC gradients, in GPU memory, to compute - costs only, set it to NULL */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - void *workspace, /* pointer to the workspace, in GPU memory */ - size_t workSpaceSizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, const void *, const int[], - const int[], const int[], void *, const cudnnTensorDescriptor_t, void *, - cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, void *, size_t); - static auto func_ptr = LoadSymbol("cudnnCTCLoss"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, probs, hostLabels, hostLabelLengths, - hostInputLengths, costs, gradientsDesc, gradients, algo, - ctcLossDesc, workspace, workSpaceSizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnCTCLoss_v8( - cudnnHandle_t handle, - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 
1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - const cudnnTensorDescriptor_t - probsDesc, /* Tensor descriptor for probabilities, the dimensions are - T,N,A (T is the timing steps, N is the - mini batch size, A is the alphabet size) */ - const void *probs, /* probabilities after softmax, in GPU memory */ - const int labels[], /* labels, in GPU memory */ - const int labelLengths[], /* the length of each label, in GPU memory */ - const int inputLengths[], /* the lengths of timing steps in each batch, in - GPU memory */ - void *costs, /* the returned costs of CTC, in GPU memory */ - const cudnnTensorDescriptor_t - gradientsDesc, /* Tensor descriptor for gradients, the dimensions are - T,N,A */ - void *gradients, /* the returned CTC gradients, in GPU memory, to compute - costs only, set it to NULL */ - size_t workSpaceSizeInBytes, /* size of the workspace */ - void *workspace) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, - const cudnnTensorDescriptor_t, const void *, const int[], const int[], - const int[], void *, const cudnnTensorDescriptor_t, void *, size_t, - void *); - static auto func_ptr = LoadSymbol("cudnnCTCLoss_v8"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, ctcLossDesc, probsDesc, probs, labels, - labelLengths, inputLengths, costs, gradientsDesc, gradients, - workSpaceSizeInBytes, workspace); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize( - cudnnHandle_t handle, - const cudnnTensorDescriptor_t - probsDesc, /* Tensor descriptor for probabilities, the dimensions are - T,N,A (T is the - timing steps, N is the mini batch size, A is the alphabet - size) */ - const cudnnTensorDescriptor_t - gradientsDesc, /* Tensor descriptor for gradients, the - dimensions are T,N,A. To compute costs - only, set it to NULL */ - const int *labels, /* labels, in CPU memory */ - const int *labelLengths, /* the length of each label, in CPU memory */ - const int *inputLengths, /* the lengths of timing steps in each batch, in - CPU memory */ - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, const cudnnTensorDescriptor_t, - const cudnnTensorDescriptor_t, const int *, const int *, const int *, - cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, probsDesc, gradientsDesc, labels, labelLengths, - inputLengths, algo, ctcLossDesc, sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnGetCTCLossWorkspaceSize_v8( - cudnnHandle_t handle, - cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */ - cudnnCTCLossDescriptor_t ctcLossDesc, - const cudnnTensorDescriptor_t - probsDesc, /* Tensor descriptor for probabilities, the dimensions are - T,N,A (T is the - timing steps, N is the mini batch size, A is the alphabet - size) */ - const cudnnTensorDescriptor_t - gradientsDesc, /* Tensor descriptor for gradients, the - dimensions are T,N,A. 
To compute costs - only, set it to NULL */ - size_t *sizeInBytes) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)( - cudnnHandle_t, cudnnCTCLossAlgo_t, cudnnCTCLossDescriptor_t, - const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, size_t *); - static auto func_ptr = LoadSymbol("cudnnGetCTCLossWorkspaceSize_v8"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(handle, algo, ctcLossDesc, probsDesc, gradientsDesc, - sizeInBytes); -} - -cudnnStatus_t CUDNNWINAPI cudnnAdvTrainVersionCheck(void) { - using FuncPtr = cudnnStatus_t(CUDNNWINAPI *)(); - static auto func_ptr = LoadSymbol("cudnnAdvTrainVersionCheck"); - if (!func_ptr) return GetSymbolNotFoundError(); - return func_ptr(); -} - -} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cudnn_stub.cc b/tensorflow/stream_executor/cuda/cudnn_stub.cc index e30f749897e..f683cecdb52 100644 --- a/tensorflow/stream_executor/cuda/cudnn_stub.cc +++ b/tensorflow/stream_executor/cuda/cudnn_stub.cc @@ -51,17 +51,15 @@ cudnnStatus_t GetSymbolNotFoundError() { return CUDNN_STATUS_INTERNAL_ERROR; } #error cuDNN version earlier than 6 is not supported. #elif CUDNN_MAJOR < 7 #include "tensorflow/stream_executor/cuda/cudnn_6_0.inc" -#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 1 +#elif CUDNN_MINOR < 1 #include "tensorflow/stream_executor/cuda/cudnn_7_0.inc" // 2 instead of 3: see https://github.com/tensorflow/tensorflow/issues/32350 -#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 2 +#elif CUDNN_MINOR < 2 #include "tensorflow/stream_executor/cuda/cudnn_7_1.inc" -#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 4 +#elif CUDNN_MINOR < 4 #include "tensorflow/stream_executor/cuda/cudnn_7_3.inc" -#elif CUDNN_MAJOR == 7 && CUDNN_MINOR < 6 +#elif CUDNN_MINOR < 6 #include "tensorflow/stream_executor/cuda/cudnn_7_4.inc" -#elif CUDNN_MAJOR == 7 -#include "tensorflow/stream_executor/cuda/cudnn_7_6.inc" #else -#include "tensorflow/stream_executor/cuda/cudnn_8_0.inc" +#include "tensorflow/stream_executor/cuda/cudnn_7_6.inc" #endif diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 62220dbd185..203434ab3f4 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -1069,31 +1069,11 @@ def _create_local_cuda_repository(repository_ctx): ], )) - if [int(x) for x in cuda_config.cudnn_version.split(".")] < [8, 0]: - cudnn_headers = ["cudnn.h"] - else: - cudnn_headers = [ - "cudnn_adv_infer.h", - "cudnn_adv_train.h", - "cudnn_cnn_infer.h", - "cudnn_cnn_train.h", - "cudnn_ops_infer.h", - "cudnn_ops_train.h", - "cudnn.h", - "cudnn_version.h", - ] - - cudnn_srcs = [] - cudnn_outs = [] - for header in cudnn_headers: - cudnn_srcs.append(cudnn_header_dir + "/" + header) - cudnn_outs.append("cudnn/include/" + header) - copy_rules.append(make_copy_files_rule( repository_ctx, name = "cudnn-include", - srcs = cudnn_srcs, - outs = cudnn_outs, + srcs = [cudnn_header_dir + "/cudnn.h"], + outs = ["cudnn/include/cudnn.h"], )) # Set up BUILD file for cuda/ diff --git a/third_party/gpus/find_cuda_config.py b/third_party/gpus/find_cuda_config.py index 091cd32d5fe..d768d4e3570 100644 --- a/third_party/gpus/find_cuda_config.py +++ b/third_party/gpus/find_cuda_config.py @@ -219,20 +219,17 @@ def _find_library(base_paths, library_name, required_version): return _find_file(base_paths, _library_paths(), filepattern) -def _find_versioned_file(base_paths, relative_paths, filepatterns, +def _find_versioned_file(base_paths, relative_paths, filepattern, required_version, get_version): """Returns first valid path to 
a file that matches the requested version.""" - if type(filepatterns) not in [list, tuple]: - filepatterns = [filepatterns] for path in _cartesian_product(base_paths, relative_paths): - for filepattern in filepatterns: - for file in glob.glob(os.path.join(path, filepattern)): - actual_version = get_version(file) - if _matches_version(actual_version, required_version): - return file, actual_version + for file in glob.glob(os.path.join(path, filepattern)): + actual_version = get_version(file) + if _matches_version(actual_version, required_version): + return file, actual_version raise _not_found_error( base_paths, relative_paths, - ", ".join(filepatterns) + " matching version '%s'" % required_version) + filepattern + " matching version '%s'" % required_version) def _find_header(base_paths, header_name, required_version, get_version): @@ -429,13 +426,12 @@ def _find_cufft_config(base_paths, required_version, cuda_version): def _find_cudnn_config(base_paths, required_version): def get_header_version(path): - version = [ + version = ( _get_header_version(path, name) - for name in ("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL")] - return ".".join(version) if version[0] else None + for name in ("CUDNN_MAJOR", "CUDNN_MINOR", "CUDNN_PATCHLEVEL")) + return ".".join(version) - header_path, header_version = _find_header(base_paths, - ("cudnn.h", "cudnn_version.h"), + header_path, header_version = _find_header(base_paths, "cudnn.h", required_version, get_header_version) cudnn_version = header_version.split(".")[0] diff --git a/third_party/gpus/find_cuda_config.py.gz.base64 b/third_party/gpus/find_cuda_config.py.gz.base64 index 981219bb10a..ae3ee37c077 100644 --- a/third_party/gpus/find_cuda_config.py.gz.base64 +++ b/third_party/gpus/find_cuda_config.py.gz.base64 @@ -1 +1 @@ 
-eJzdPGtT40iS3/0r6tRHINNGwOzGxJ5vmQsGum/Y5aAD3D23AV5vIZeNpmXJJ8kG78b+98vMqpKqSpINpnu2Y4iYHkuqzMrMyme93rDTdL7KoulDwb47PPoPNngQbCCSPM3ex+kjO1kUD2mWB+wkjtk1NsvZtchFthTjoPOm84ZdRCE0F2O2SMYiYwXAn8x5CP9TX3rsk8jyKE3Yd8Eh87GBpz553f8EDKt0wWZ8xZK0YItcAIooZ5MoFkw8hWJesChhYTqbxxFPQsEeo+KBulFIgAz2F4UivS84tObQfg5PE7Md4wURjH8PRTHvHxw8Pj4GnIgN0mx6EMuG+cHF+em7y5t3+0AwgXxMYpHnLBP/t4gyYPV+xfgc6An5PVAZ80eWZoxPMwHfihTpfcyiIkqmPZank+KRZwKwjKO8yKL7RWEJS1MHPJsNQFw8Yd7JDTu/8diPJzfnNz3A8fP54KerjwP288n19cnl4PzdDbu6ZqdXl2fng/OrS3h6z04u/8L+fH551mMCRAXdiKd5hvQDkRGKkYaO3QhhETBJJUH5XITRJAqBr2S64FPBpulSZAmww+Yim0U5DmYO5I0BSxzNooIX9KbGFHZz/EX/Op7nfciiBNTw9OPZCXR/n/FshcSwB8Gx/zEMUVikWSSIRraU2gcqlQKBKFjicpUXYhZ0OqjweZhFoGe54BnoQk6iaEOPipnbWHow4ii1Iu/AyxmqwFgUKKqERBxlmghCNJf0I3yYJpNoushIgAiXF+N0UQRE1ZwXD7nUJ8JOwAhV6mHJGiiYHjdUwYcsXUwfmEiWUZYmM5EUnSXPItRWMOXzCZgaW/I4GjsEREpIPcmclIoml4gTWUYDn4likZESMHgF4grTsVDSjEGN0fak8HAYAHYSAfGAv6KSI9nTBVIHRN0s5vM0Q82vwNBsaBj8KAnjxRhehYsfL05uuj34cXZ52WOXp6cXPRKMdFrXA3tIC/4ZEZU03XPQdFNDKnrAqJHrwfsR9jn6cDL46aZjiJBpESLl4I9mfD8Xcw6iA+BpnN5TJwEzeo/T9LPUJqk8eQcp1UolNYnc1QPPxvsowjEoIRGaL+5NMidZOkPygHpigXQj6MBQWvSihNGNllyBbNjVTWXTYzHhi7jAduBqx/1Oh4GxJounPnjFg0WeHcRpyOODcDHmPXohpSt1kajYjcdSa9j+fDcA+J9haNPHvM9KOliT2Hrgfzqs+jvt3919yNJpxmfsPYri7u7y0/nZ+Qn77w8fISzN5gt0oWyQpvHnqLi7Q/R3d3udznsQ6D0PP4NfHdNQAHXRfRRHxQod7kyYOhTnKUUVHoNFJtBwKUXYqYk3lc6ygXQYJQolK9vQAhAdqVVHDel+KedGLEracvRZGRahbaVb7GQ8jtAYeVxTVoDef+UfoCCLoj+tOp/eXd9A8NDjgq8GV1cXfz4f0GgCjDS7EgYfLKhmGLBQo5/LS7sj+er88mZwcnGhgdCeS+LwwSGOXpkw0gWMfjq71ii0IyAUA4jjV9fXAwNN+crquvNJ+9JmBQjB090LnVKASYMhPO1iSN19Cla76OLQLQvQB94p1UA5aIjEs3mxwtaLxPDTKeMhZTg8Wem2yn9BFAD9x/64ClzabYDpGlT0O8Vk9Ef17YeRQtJnQFTwd/ub9EEjUKg+C4LA/qh+VF8h0HY6kC6AU2ZRqn+luf6FHk//nse8QGL0M/ht9Qvc2DxLQ8g+yjervANpwXwVQ1jpY86DAj6e7svv++C+9nmxX6TzTpGt+jCgZJz5A3iDmCkkjw9R+NBR6eE5vXuHYahsjqkUAuRBPuePiYbDKDQSTyJcFNqTS1QVQSJppafTCWMOedQpuT/q0H9HNIDIu9j3HD5DM/CybBTlIFTwrD59kQGzlFQgMwe/C/kRpsPQzjMAH6VLfQaocr4m8IyH6XNAz3gG/ZSQM15g8qNVyOdhseDxqFRhnXXoN9QBaMnpgwg/oxwF+U/yvzrdmQmhMh0FLaN9R8UBLCpELF9qpXa7YYnKqsH6MI1Fs4wo+1JIbDpL3PhXw+U0xhY5hEMD5CWeVP8dMffvKLDfDbKFMAGC71wA5/k9xK0NEMHv1kLU/9w+iCg9Etk075fQrtz6lNRoodVSJgivWVAC2yK2QTEtDjFf0vmMjEaQ4BY8jikJ1YiuZYbZZz8rxcLWEnelXlJl7aQ4TUQgNRP+hbjtDDm4z0toIplV9iFlB2+gKoIKbA4s8tDIuspEGxE7+I5dBcQabu53Ow3qd1x7ZbUmWmrYeFbkWPP6NQPUlsuLUSx4XrzMdJUsj9kt+D1/2aU8dUn5qEMClIyF7wVed2gw1QRY584BNThkP1TS0IxMRaGjlGYFE88eS/hMaH+jFAOdgQot0D2AQ7FF2fSpWakFUg2QwhirMfgepUE6Fxqzl3kQnRPIvSHXPPYWxWT/D143yAAFAig/yqSq0QAG9NP33sg+2U7O3vp347ddj+0QoT3qqktgoH/UXNuWEgG9C6ZQps39I2PwvdIbhzDsIo94MgImx4uw8CdRlheQ3wrIvseuMMB6KEfHbPg+SlRBDi6VoCiFl4BKIKq/2zQPECz4JY0SfwLY5WhOUFASFB9JrhJ+aI5VPB7JUmBE9YHfRFUtzbbrh9JMsWxxYmZFJmqPBqO+YCQocPuefu11Mb3yDnJg/6B8CXAqkzo2cpEgxJg1kl/8WwszqMT+3Bt2KZoXWDPIUccyAxQKlHmPHf/A/GCv68mRwwiC6EVBdmzqmuxBmoGlTiq1MTVL9abUC1sHY4E1oe/xPIwirytVSmU9H5MIP55RkzL5MRFWEaGuhUhzwMdjXysADBPqrm9rZtfQzZzKcx+LXF8i6HZNZVCl5QhLR6UP9NP0OWqczfeSJvMNkO7t4ciJWLUHB0KTgJvArMe3AAZoZK+1pKrSrTJwgShU7h8AP74VS72ytvV6zoftytjlDjQEj2Ehs+RFXwy3ees59fn+To4uxwQC5XUaeeqVSbbTJkm8IUir2aL1CCu33Gzn9emLTMSy1C4nsXAuxJgKkbMfjjdSJHolsZ6a+Km9ULy5b/f2yX3sT5NF9VE8FRnPD04/fhicH6xFKNvQt9LP6cLoC3BuTfxsZB1af/9769F6aOYVPzyZYBbzezZrYFojqi1HNIfn43yD5LNXsqGfkWbloUgGVVs0V08GkFvvLtFKtruT76J6krdG7ulHVPqRCkF3KL2M2eFLkdrQQ8NtmcWaFsppuojH5FloWhKr7x05Ewe/jJm3VR+sNEkn9L/KWH1DGHVJGYyV1kN1J0K9SMYmgw0JQTsq5eEokuM6CsBjuR7gP74V8mXEMzvuOqkKfkNp8igX26uMJQhlBxawNjOZQLXVmtrsZGoip7BJQmBmuggQeTWLvKryi4YIYBAI+mZSQMEDLU5HIaeudmG9nXwvGK8QArQTE16tu/TqrYUcnO3aOs3+W5NT3/aPhhShBQT6RrLW0gGRIU+9DdRs6h4ltWc4shZVd93oGu1QHYmXGUzea+fC5aHHMNS9QLW4NKPigRe1mlOqm5
ZOqW7Fai5MPwGJNfobsMRbzKB6rFjMYzGsjRp6vlvzefgFHYHWjCixutQW/zp/0VAZG2ImWXTLptG2E07VmJbeqed03O6tFPwaldIRAmxD1UTWEIKySwXAjE7zqYNSa31Oai3THmusVE7V7PK2UFKVWT1bTS2TXWN2TvJnE77Bla1ny5IQJbIy+3RUuqYH0CnCtcwXKEWp9BDnKdonFzxzEURXWarysKqNUmI4gWTWEN7OONgZk+vXnR4csKPDw8NeScUOPcv33U5HJfuGZEuxVnS3ao6HwMGDt0n6zxqRLXDUZdnt1Koxhx32BhclcAJDMH6fLkVQSsEMDiXbTVkC8Z0VXs8plQx9SJZh2KQNRlQ8BVgm63m98J7G0hcIXHK7G7+9C+CfHvvk69/0T9cr3amu8FsnFfRcwr6ixRt2G+YBGiaWysSyfQqgXtG3zyzZSgsvSECU5IAo8CEQT8KrpUiUVMgGnoZSk3GGhMvRWuM+Gmqb+yiRpUivIqfnVLLuYHYl8cuZrSbt3WHbA9CisVhGoVHx5VC0uQUk1k1Q+48jvk+ldSGrdQccyPXKN3vB0WFwH3rKmOdFZFpzO301b+oRMFhzt0T0EoOA9l6DkyyNC6cD5cIeYHKne8ph7WrhGguBje2V/Ltyuvx/L0501zLWKMGVUYkKrHnxxOVmnFJ6AYEPrs6u/DCPptNunxbP0KTuUwA1pw1xeQBkFmiOVB9lpajjuiY2STPcEvBgpy6OMDDCQxLb7T0L2BUNQR+UCLoy6atTd3s4ZP923PThSOV+MlUxq1TvPIEomEOWiGtdtEBiSrWPteoyZzT70+K76/3ReClP8A+t9abBeX3b/qw2apIEOYd2rlK4kaxrAxtiawM29d2BrkZMU2iMoWnqTj+1EauwooVt4sgx564Dvokn14gdpsyh0Wy5eoMQ/3QSpPuY589JkZz42NezoLWVImfyEDzakSebPyfDMnMsc750w0KO0RJDKcUisHvfq3aWjP7n5E9X1x4lZ9W780t815avmE0/nAxOf6rCpc7TdD1sOknGtk/CaDz4PNomFXt1HtachKFbVdt1FrlyyjP+S1rtQUyTeBWo6XNioC1lM2v9w2HHmmh4g+u6mZAbzkrUUTKBQmTJIbHC7RX3AsZXrfGiavXYL4tcTbrRBhZcoTOHoHkq3x2kl8VWY4wc+RhLwrTfEfWQq42l+psPOWu3RVruCKKEXhK5ER2lsibeFkdtNgGPYcur57Rb79xa3BpBrvdrNY/m+qc8jYGkr+qhjoLDf6mHurm6QAfj+Cjz7UYvZTT+lfxUOS6zGZj1t+SsStq2cUJf0Wu4Emul9rVOQCIkN2DjbnEEdqN1rkC13MoZKNhXuoOMJ+PftDO4Prk8c11B9W6jIyib/kpuAMfjG7N+UpFvzPaVmFpIfK3BIzoydxNvi7GbTdaZOrXbytAJ8pVmPpkUv2krf/9+4Bp5+UrauPnqV7JlEPo3ZsqoBt+YJUshNRP4WjsGbGTGBtYWKzZarDNibLaVDSPgK014nCRfZ+ml2l71XPtzrQ9PiRiWR4+V1eEjWdzFu0/vLuQ+03ajQ3+jfuPMHE1x63nx7UzxZebjeyRqNFwmf5YGAmr6QmSvNOXWNZyKqGeY8YuWbpJErtxUPbRNS1Yt5PRY9dyzW21lMAj42kp3zjNg7rcc9m4+nFzfvKtVusbbzZVu1fjXqnRpXL6xyKiU5RsLjqWoWsl8dW1LCGVta+Fuq22tRmtrW9lyu9pWwr7KASRhGH+dgFnZ7nYBk45DlvYqn3S4pCfLENeZ4bZGiLJ5uQF+lWhGw/Q1ghkiBoma+Bt12myAi1LGY89qs4UmE9yrtLigo7LZs6q3f4Emf6qO71YKbb5cF3+slpbOv2F/U0T8Daf+OTCUiIwX0Hl6/4sI8ZBNyh7pdgZoWLCokPur6BIBtaaxyKNkqtDBd3+2iIpojpsEo5nIoVWcPqp1g0oidHbCig1qmxGPY7+SszaIctsGdvJJfpbn+9UtDTP+WdT20TF9NwOd/wg2mnl5FGVbe79cnicTkX1DMVcdkTEW1ctFK304/Hs2S5dibJ7uo9WrbFbexaH4+qQT9OBLSKnE9o0ICxVAe4FnOsoX+cllhFyD5bq9NLpLtxG4NPdVr9Z2C9dZwm7rPtGSR3icbSSSpQ//qS1M6hDU8e3Q3bLp3hhCzoQOxDVdOFBtJNa46ThZeULJOsJUvb7VrYd6zHqecRZCUWcd4RNTHkoOn8UGlzt7rDPA4BkJSzMnOM71ux56TVc5NF7NQAdrihSw7MpzS3jE5Q/fj77/fXUAZpc90pGOeSaWUbrI45VcZKatRPVLfDpMXaDCzuhAMniBFK8IgQZKUtTVbk/eCfIYyeuR8L4RdAzO6Het48brxqt+iPSv/sHtXw/YcK/7lhi7e3xbcXXwX/8OhtM0umsPl97aewCNI2Sb1LZbnQwCPxhHf5eGXIV3QxHKJnIHck/ey6WllMc8f5D3JKkbCpRGK5ehLS0TPK666LSd1FBQCmQe81D43t0dHW4zFRwbKB5qW4rrmjyOQrppJaOrwtZd+KTURbJQXS5zzG55Nl0GEOvB3cujs/iCNoau8gB/3x71h8NObXOsc9jQc+5iQc6IMeuclTN+nnX3z8aA8qxTmrU+aSOrdQLB+AyjpUdSPAFtuRzHofTt6mDsP/4px5W2PtFBzlKCxinONh5rV8t41hEr5QfpPOtiPgbX6td3k1cdOJMxarrBGRuJ7tbeHjdULWmbhCa2IkQbJB3r8N2D8SYmI5p22R+Zf3TYY0dl1veG/VjbNKO2qzzyXN18IbeqlKfUcNOKsfdS36Dg0FoJweRkg0aaN/2UOukKvMrzG7apmVS4e13aRkOtPn9RKYMcD0sp17qoCae2zt8onmonx8sEZO+Tsamp7wRoExOt3X1NIVkd1ETkrIs2CkivcL9MPOa+AZMKd9W0TTC4IPI15WLir4nFXmVqlIpaKlwrlIbFVaNjZ+Gp1ZDkVNdXNSS7i7ohuZOKzYZUThS/0JCsaXibmvq0Y5OYVGBKkpbIlCRVaHITZq+e1zZEJ3dJpTn0V9ejbVQJY7HOINBdWSm5o7mqBuZo/qidt1p+3sCaM7/WwJl5h9sGxsw51Yo2Z5Kt5EqXcE2cleVdO3eNpUYDhw3FcQOX7jVzGzh1591sehtqZeIazfNzT9+0Q1ijQszKJBmk8hkoG8vbgjyqaelSFOstduEZk0yU6nweopCcrH9plujqoiyZW894lMhey8kjIk6sengocSGMc/71PFxTXRJBN4z63k7el4camF9h6jbP52DKo1ZIMNGWN5MGeOuu8PMi84Wa68OPkJkWeBaq08Higiqe0YhuQBuNkJPRyENMkqnO/wPiKjLf \ No newline at end of file 
+eJzdPGtz4zaS3/UrcJxzmZqRaTu7ldrTrXPl2DMX7/rsKVszuS3bq4VJSGaGInUkJVu7lf9+3Q2ABEBS8iOTuOKqTEQQ3ehu9BMPvmFH2XyVx9O7kn2zt/8fbHQn2EikRZZ/SLJ7drgo77K8CNhhkrAL7FawC1GIfCmioPem94adxiF0FxFbpJHIWQnwh3Mewv/UmwH7LPIizlL2TbDHfOzgqVde/z8BwypbsBlfsTQr2aIQgCIu2CROBBMPoZiXLE5ZmM3mSczTULD7uLyjYRQSIIP9TaHIbksOvTn0n8PTxOzHeEkE499dWc6Hu7v39/cBJ2KDLJ/uJrJjsXt6cvT+7PL9DhBMIJ/SRBQFy8X/LeIcWL1dMT4HekJ+C1Qm/J5lOePTXMC7MkN67/O4jNPpgBXZpLznuQAsUVyUeXy7KC1haeqAZ7MDiIunzDu8ZCeXHvv+8PLkcgA4fjwZ/XD+acR+PLy4ODwbnby/ZOcX7Oj87PhkdHJ+Bk8f2OHZ39hfT86OB0yAqGAY8TDPkX4gMkYx0tSxSyEsAiaZJKiYizCexCHwlU4XfCrYNFuKPAV22Fzks7jAySyAvAiwJPEsLnlJLQ2mcJiDX/Sv53nexzxOQQ2PPh0fwvC3Oc9XSAy7ExzHj2CKwjLLY0E0sqXUPlCpDAhEwRKXq6IUs6DXQ4UvwjwGPSsEz0EXChJFF3pUzMLGMoAZR6mVRQ8aZ6gCkShRVCmJOM41EYRoLulH+DBLJ/F0kZMAEa4oo2xRBkTVnJd3hdQnwk7ACFXpYcUaKJieN1TBuzxbTO+YSJdxnqUzkZa9Jc9j1FYw5ZMJmBpb8iSOHAJiJaSBZE5KRZNLxIk8p4nPRbnISQkYNIG4wiwSSpoJqDHanhQeTgPATmIgHvDXVHIke7pA6oCoy8V8nuWo+TUYmg1Ngx+nYbKIoClcfH96eNkfwI/js7MBOzs6Oh2QYKTTuhjZU1ryL4iooumWg6abGlLTA0aNXI8+jHHM8cfD0Q+XPUOETIsQKQd/NOM7hZhzEB0AT5PslgYJmDF6kmVfpDZJ5Sl6SKlWKqlJ5K7ueB7toAgjUEIitFjcmmRO8myG5AH1xALpRtCDqbToRQmjG624Atmw88vapiMx4YukxH7gaqNhr8fAWNPFwxC84u6iyHeTLOTJbriI+IAapHSlLhIV20kktYbtzLcDgP8Rpja7L4asooO1iW0A/qfH6r+j4fX1xzyb5nzGPqAorq/PPp8cnxyy//74CcLSbL5AF8pGWZZ8icvra0R/ff221/sAAr3l4RfwqxFNBVAX38ZJXK7Q4c6EqUNJkVFU4QlYZAodl1KEvYZ4M+ksW0iHWaJQsrINLQDRkVr11JTuVHJuxaKkLWefVWER+ta6xQ6jKEZj5ElDWQF654V/gIIsiv606nx+f3EJwUPPCzaNzs9P/3oyotkEGGl2FQw+WFDtMGChxjhnZ/ZAsunk7HJ0eHqqgdCeK+LwwSGOmkwY6QLGPxxfaBTaERCKEcTx84uLkYGmarKG7n3WvrRdAULwdLdCpxRg0mAID9sYUrcfgtU2ujh0ywL0gfcqNVAOGiLxbF6usPciNfx0xnhIGQ5PV7qv8l8QBUD/cTyuApd2G2C6BhXDXjkZ/1m9+26skAwZEBX8034nfdAYFGrIgiCwX6of9VsItL0epAvglFmc6V9ZoX+hx9O/5wkvkRj9DH5b/QI3Ns+zELKPqmVV9CAtmK8SCCtDzHlQwAfTHfl+B9zXDi93ymzeK/PVECaUjLO4A2+QMIXk/i4O73oqPTyhtvcYhqrumEohQBEUc36fajiMQmPxIMJFqT25RFUTJNJOenq9MOGQRx2R+6MB/fdEA4i8j2PP4TV0Ay/LxnEBQgXP6tMbGTArSQUyc/D7kB9hOgz9PAPwXrrUR4Aq52sCz3iYPQb0mOcwTgU54yUmP1qFfB6WC56MKxXWWYduoQFAS47uRPgF5SjIf5L/1enOTAiV6ShoGe17Kg5gUSES2aiV2h2GpSqrBuvDNBbNMqbsSyGx6axw418Dl9MZexQQDg2Qp3hS/bfP3L/9wG4b5QthAgTfuADO8weIWxsggj+shWj+uWMQUXom8mkxrKBduQ0pqdFCa6RMEF7zoAK2RWyDYlocYr6k8xkZjSDBLXmSUBKqEV3IDHPIflSKhb0l7lq9pMraSXGWikBqJvwLcduZcnCfZ9BFMqvsQ8oOWqAqggpsDizy0Mi6qkQbETv4DlwFxBpu7vd7Lep30GiyehMtDWw8Lwusef2GAWrL5eU4Ebwon2a6SpYH7Ar8nr/sU566pHzUIQFKxtL3Aq9/YzDVBtjkzgE1OGTf1dLQjExFqaOUZgUTzwFL+Uxof6MUA52BCi0wPIBDsUXZ9JFZqQVSDZDCBKsxeB9nQTYXGrOXexCdU8i9Idc88BblZOdPXj/IAQUCKD/KpKrRBAb00/feyDHZVsHe+dfRu77HtojQAQ3VJzDQP+qubUuJgNqCKZRpc3/fmHyv8sYhTLsoYp6OgcloEZb+JM6LEvJbAdl35AoDrIdydMyGb+NUFeTgUgmKUngJqASixrvKigDBgp+yOPUngF3O5gQFJUHxkeQq4W/MuUqisSwFxlQf+G1UNdJsu36ozBTLFidm1mSi9mgwGgtmggK37+lmr4/plbdbAPu7VSPAqUzqwMhFghBj1li+8a8szKASO3Pvpk/RvMSaQc46lhmgUKDMb9nBd8wP3vY9OXMYQRC9KMmOTV2TI0gzsNRJpTamZqnRlHph7yASWBP6Hi/COPb6UqVU1vMpjfHlMXWpkh8TYR0RmlqINAc8inytADBNqLu+rZl9QzcLKs99LHJ9iaDfN5VBlZZjLB2VPtBP0+eoeTbbJU1mC5DuvcWZE4nqDw6EFgE3gVmP7wAM0MhRG0lVrVtV4AJRqNw/AH58K5Z6VW3rDZwXzytjl1vQETyGhcySF70x3OaV59TnO1sFuhwTCJTX6eSpJpNsp0+aejcgrXaL1jOs3HK7nTeXL3KRyFK7WsTCtRBjKUSufjjeSJHoVcR6auGn0aB4c1vf7pD72Jmmi/qleChzXuweffo4Otldi1D2oXeVn9OF0S/AubXws5F16P3tH61H66GdV3zxYIJZzL+1WQPTGlNtOaY1PB/XGySfg4oN/Yw0Kw9FMqj7orl6MoBcedepVrLtrWIb1ZO8NXJPP+LKj9QI+jfSy5gDPhWpDX1juC2zWNNCOcoWSUSehZYlsfrekitx8MtYeVsNwUrTbEL/q43VN4TRlJTBWGU9VHci1JNkbDLYkhB0o1IejiI57qMAPJbrAf7jWyFfRjxz4L6TquA7lCaPC/F8lbEEoezAAtZmJhOorlpTm51MTeQSNkkIzEwXAaKoV5FXdX7REgEMAkHfTAooeKDF6Sjk1NUurLdVvA2iFUKAdmLCq3WXmt5ZyMHZrq3T7L81OfXVcP+GIrSAQN9K1lo6IDIUmbeBmk3Do6TeGo6sQ9VdN7pGO9RA4mkGM+hmwmVhwDDSPUGzuLSi8o6XjZJTapsWTl1t/H
Z226hPDW7Jc/VVx/i5iz4N/zBwBu32Fwp6zaSqHqYig4pJuWMepfnSoaCzKiZlksmGJXKVybQ7mmfohspnHq0dlqGsUXYn5bIJ3+BA1rNlSYjSR5nzOZrZmHkYFOE6qnSlGrXe4epAd0nvmVsPurZR+b6V41cSw2UbM3P3tqJgKyKHqwfd3WX7e3t7g4qKLXqW7f1eT6XYhmQrsdZ0d2qOh8DBnbdJ+o+akWfgaMqy32vUQA477A1uBeCygWD8NluKoJKC6ZIrtttiM/Gdl97AKVAMfUiXYdimDUYsOgJYJqtovd2dJdL+BW50XUfvrgP4Z8A++/o3/dP3Kq+o6+rOUl5X8DuKFijkW6rvluWcKp3rLrybdXT3eo6ttNBAAqLUAkSBD4F4EF4jMaFQLjt4GkotgRkSrmZrjftoqShu41QWAIOanIFTP7qT2ZfEL2e2mnQPh313QYsisYxDo84qoFRyyzasVqDijmK+QwVtKWtkBxzI9aqWt8H+XnAbesqY52VsWnM3fQ1v6hEwWHO/QvQUg4D+XouTrIwLF+HkdhpgchdZqmnta+Ea22+t/ZX8+3KR+n9PD/XQMtYowVVRicqaefnA5RGYSnoBgY/Oj8/9sIin0/6QtqzQpG4zADUX63BRHmQWaI7UGFV9pmO5JjbNctyIv7MTFUcYILsAUsf+4FHArmgIerdC0JeZfZO6q70b9m8HbS/2b9TiD6UnZm3onaQQBYsYmE5LuS1hSnWIFeKyYLTm0uG7m+PRfClP8C+t9abBeUPb/qw+amkCOYd+rlK4kaxvAxti6wI29d2BrmdMU2jMoWnqzjiNGauxooVt4sgx574Dvokn14gdpsyp0Wy5eoMQPzsJ0m3Ci8ekSE58HOq1x8b+jLNkBx5t35PdH5NhmTmWuUq5YfvE6ImhlGIR2L3v1ec5xv9z+JfzC4+Ss7rt5AzbuvIVs+vHw9HRD3W41HmarkJNJ8nY85Mwmg8+j5+Tir04D2tPwtCtqkMyi0I55Rn/KatP/mVpsgrUojUx0JWymRX23k3PKu/f4G5qLuQxrwp1nE6gEFlySKzwUMOtgPlVO6uoWgP206JQS110bAT3xcwpaF9AdyfpabHVmCNHPsZGLJ0yRD3k6jinfudDztrvkJY7gyihp0RuREeprIm3w1GbXcBj2PIaOP3WO7cOt0aQ6/1aw6P9bLunIkuAoq/qoPaDvd/UQV2en6J/cVyU2brRSRmdfyU3Vc3LbAZW/Zp8VUXbc3zQV3QarsQ6qX2pD5AIyQvYuDv8gN1pnSdQPZ/lCxTsy7xBztPod+0LLg7Pjl1PULdt9ANV11/JC+B8vDLjJxV5ZaavxNRB4kvtHdGRtZt4O2zd7LLO0qnfs+ycIF9m5ZNJ+bs28g8fRq6NV00bTVz3/JUsHObilRk4ascrs28ppHYCX2rdgI2M28DaYdtGj3Wmjd2eZdkI+HTDtndj0vTrbMfUhvlYs3SNEu9rGAZJj9IY9SNZ3On7z+9PtdmtM7oXbMOkz0ioX2hwnbsvMGGPN7YnbbqkqdxzqUfoWlCse8iFrfp5YPd6lloj4AvVupjzHJj7Pcesy4+HF5fvG0Wq0bq5SK07/1pFKs3LK4tfSlleWQirRNVJ5ovLUkIoy1ILd1dZanVaW5bKns8rSyXsixxAGobJ6wxrdH2wslf5pIMaPVmG+DXCGcrmlUQzmqavEcwQMUjUxN+q02YH3E4yHgdWn2doMsG9SItLulqaP6r0+g00+XN93bVWaLNxXfyxelo6/4b9QxHxD1y058BQKnJewuDZ7U8ixEspGbunrxlAx5LFpTwZRZfu1W7EoojTqUIH7/3ZIi7jOZ6qi2eigF5Jdq9W/GuJ0F0DKzaoA0I8SfxaztogqgMXOMhn+Vreh1dfNZjxL6Jx5o3pbxnQfYlgo5lXVzeea+9ny5N0IvJXFHPVlRJjO7zabtKXqb9ls2wpIvM2HO075bPq2xWKLyX44C74JaRUYXslwkIF0F7gkY7ySX5yGSPXYLnuKK3u0u0ELs1tGjT6PsN1VrDPdZ9oyWO8/jUW6dKH/9ThI3Vp6ODqxj1s6X5hg5wJXSBru6Bfn/PWuOn6VXWjx7ryUzdf6d43es4GnnF3QFFnXXkTUx5KDh/FBpdncqw7s+AZCUs7JzjPzW8jDNo+fdD6KQO6iFJmgGVb3vPBKyF/+nb87R/rCyPb7J6uQMxzsYyzRZGs5PYwHQJqfvSmx9QHR9gxXeAFL5DhJzWgg5IUDbU9kN/QuI/l54Tw+xzoGJzZ71vXc9fNV/PS5d/93au/77Kbt/13xNj1/buaq93/+ncwnLbZXXsZ88o+vWdcudqktv36Jg34wST+pzTkOrwbilB1kWeHB/I7VlpKRcKLO/ldIXWjX2m0chna0nLBk3qIXtfNBgWlQOYJD4XvXV/TZTBTwbGD4qFxGLipyVEc0pdJcvq01roPJCl1kSzUH2M5YFc8ny4DiPXg7uVVU2ygI52rIsDfV/vDm5te41ircznPc75dgpwRY9a9JGf+POtbORsDyqNuNTbGpCOo1hUA4zXMlp5J8QC0FXIeb6RvVxdJ//WznFc6tEQXHysJGrceu3hsfIrFs64kKT9I9z8X8whcq988B14P4CzGqOUGZ24kuiv7YNuN6kkHHDSxNSHaIMsFJIG+e5HcxGRE0z77M/P39wZsv8r63rDvG8dd1EGTe16oL0XIQybVrS48bmKcmtRfHHBorYVgcrJBI80v41Q66Qq8zvNbDpiZVLinVLpmQ20c/6JSBjnuVVJuDNEQTmOLvlU89SGMpwnIPuJiU9PcxO8SE+27fU0hWQM0ROTsabYKSO9OP0085p6/SYW749klGNy2+JpyMfE3xGLvBbVKRW3oPU0oxhapQYGzT9RpUXLN66talD1E06Lc1cV2i6pWjJ9oUdZ6vE1Nc/2xTUwqQqVpR4hK0zpGuZmz10xwW8KUu7fSngPU3xXr5L5lb80g0N1iqbijRasW5mghqZu3RqLewpqz0NbCmfnxsw2MmYurNW3OalvFla7l2jir6rxu7lprjhYOW6rkFi7d77Nt4NRdgLPpbSmaiWs0zy8D/YkawhqXYlZlyyCVL0BZJD+z41FxS18TsVpxCM9YbaKc58sNCslJ/5dmra6+MCWT7BmPUzlqtYpExInVAO8VLoRxQb6ZkGuqKyLo05y+t1UM5b0E5teY+u0LO5j7qK0SzLjlJz0D/Fyt8Isy94Va9MOXkKKWeJ2p18Mqg0qf8Zg+HTYeIyfjsYeYJFO9/wcrgfOr \ No newline at end of file From 659b1fba99a5f14abfcc4300b9f974e14d9b6681 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 1 Jun 2020 16:01:21 -0700 Subject: [PATCH 1501/1533] Re-enable backwards compatibility test PiperOrigin-RevId: 314221282 Change-Id: 
If7b6714eef3c3a69c3bddcb468aabb16bd29ddea --- tensorflow/core/ops/compat/BUILD | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tensorflow/core/ops/compat/BUILD b/tensorflow/core/ops/compat/BUILD index d8bfd4473f7..299076d8cfd 100644 --- a/tensorflow/core/ops/compat/BUILD +++ b/tensorflow/core/ops/compat/BUILD @@ -39,10 +39,6 @@ tf_cc_test( "ops_history_v*/*.pbtxt", "ops_history.v*.pbtxt", ]), - tags = [ - "no_oss", # TODO(b/150030420): Reenable when fix lands. - "notap", # TODO(b/150030420): Reenable when fix lands. - ], deps = [ ":op_compatibility_lib", "//tensorflow/core:framework", From 25af574ff39d3562eb613c9f05effffee6e59493 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Mon, 1 Jun 2020 16:03:16 -0700 Subject: [PATCH 1502/1533] Define the fixed output range interface and fix the logistic quantization error The logistic quantization error is introduced by accidently fused the requantize op to the logistic op. To fix the issue, an interface needs to be defined for this op, so this fixed output range property can be queried by the pass. In the followup cls, this interface will be used to replace the FixedResultScale trait. PiperOrigin-RevId: 314221651 Change-Id: Iece591195ca0146b93b5c1a1b9f65c0d205eed11 --- tensorflow/compiler/mlir/lite/BUILD | 2 +- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 31 +++++++++++++++++++ .../compiler/mlir/lite/quantization/BUILD | 25 +++++++++++++++ .../mlir/lite/quantization/lite/BUILD | 1 + .../mlir/lite/quantization/quantization.td | 16 ++++++++++ .../lite/quantization/quantization_traits.h | 13 +++++--- .../lite/quantization/quantization_utils.h | 4 ++- 7 files changed, 86 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 1a508bdb190..c60c0c0edbf 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -216,13 +216,13 @@ cc_library( "ir/tfl_ops.h", "transforms/passes.h", "utils/attribute_utils.h", - "//tensorflow/compiler/mlir/lite/quantization:quantization_traits.h", "@llvm-project//mlir:include/mlir/Transforms/InliningUtils.h", ], deps = [ ":tensorflow_lite_ops_inc_gen", ":validators", "//tensorflow/compiler/mlir/lite/experimental/estimators:cost_estimators", + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/lite/schema:schema_fbs", "@llvm-project//llvm:support", diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index e645f98e922..76c342bd10a 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -1726,6 +1726,7 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ // scale = 1. 
/ (max_value + 1) FixedResultScale>, FixedResultScale>, + FixedOutputRangeInterface, TFL_GpuTargetOp]> { let summary = "Logistic operator"; @@ -1736,6 +1737,36 @@ def TFL_LogisticOp: TFL_Op<"logistic", [ let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$x); let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$y); + + let extraClassDeclaration = [{ + // FixedOutputRangeInterface: + quant::UniformQuantizedType GetFixedOutputRange( + bool is_signed, int bit_width) { + auto result_type = y().getType().cast(); + if (!result_type.getElementType().isa()) return {}; + Builder builder(result_type.getContext()); + + // Only support 8-bits + if (bit_width != 8) return {}; + IntegerType storage_type = builder.getIntegerType(bit_width); + + double scale = 1.0 / 256; + int64_t zero_point, storage_min, storage_max; + if (is_signed) { + zero_point = -128; + storage_min = -128; + storage_max = 127; + } else { + zero_point = 0; + storage_min = 0; + storage_max = 255; + } + + return quant::UniformQuantizedType::getChecked( + is_signed, storage_type, result_type.getElementType(), scale, + zero_point, storage_min, storage_max, builder.getUnknownLoc()); + } + }]; } def TFL_LogOp: TFL_Op<"log", [ diff --git a/tensorflow/compiler/mlir/lite/quantization/BUILD b/tensorflow/compiler/mlir/lite/quantization/BUILD index 91590bfbc13..57417e95ec6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/BUILD @@ -3,6 +3,10 @@ load( "//tensorflow/core/platform:build_config.bzl", "tf_proto_library", ) +load( + "//third_party/mlir:tblgen.bzl", + "gentbl", +) package( default_visibility = [ @@ -35,6 +39,25 @@ filegroup( ], ) +gentbl( + name = "quantization_interfaces_inc_gen", + tbl_outs = [ + ( + "-gen-op-interface-decls", + "quantization_interface.h.inc", + ), + ( + "-gen-op-interface-defs", + "quantization_interface.cc.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "quantization.td", + td_srcs = [ + ":quantization_td_files", + ], +) + tf_proto_library( name = "quantization_info_proto", srcs = [ @@ -72,9 +95,11 @@ cc_library( name = "quantization_lib", srcs = [ "quantization_driver.cc", + "quantization_interface.cc.inc", "quantization_utils.cc", ], hdrs = [ + "quantization_interface.h.inc", "quantization_traits.h", "quantization_utils.h", ], diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index d9e478950e6..2783297814b 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -53,6 +53,7 @@ cc_library( ], deps = [ "//tensorflow/compiler/mlir/lite:tensorflow_lite", + "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "@llvm-project//llvm:support", "@llvm-project//mlir:IR", "@llvm-project//mlir:QuantOps", diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization.td b/tensorflow/compiler/mlir/lite/quantization/quantization.td index 7bfcdb65686..c1e392bd3ad 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization.td +++ b/tensorflow/compiler/mlir/lite/quantization/quantization.td @@ -63,6 +63,22 @@ def QI32 : QuantizedType<"Uniform", [32], 1>; // https://www.tensorflow.org/lite/performance/quantization_spec //===----------------------------------------------------------------------===// +// TODO(b/157870442): replace all FixedResultScale trait +def FixedOutputRangeInterface : OpInterface< + "FixedOutputRangeInterface"> { + let 
description = [{ + Interface for defining the fixed output range. + }]; + + let methods = [ + InterfaceMethod< + [{Returns the fixed output range.}], + "UniformQuantizedType", "GetFixedOutputRange", + (ins "bool":$sign, "int":$bit_width) + >, + ]; +} + // Specify this trait if the op has a fixed output value range. class FixedResultScale : NativeOpTrait::Impl")>; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h b/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h index b59164b72e6..693f692c61a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_traits.h @@ -21,13 +21,18 @@ limitations under the License. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -namespace mlir { -namespace OpTrait { -namespace quant { - using QuantizedType = mlir::quant::QuantizedType; using UniformQuantizedType = mlir::quant::UniformQuantizedType; +namespace mlir { + +// This includes the interface class definition. It couldn't be in a namespace +// because the table gen doesn't emit the namespace when it is used. +#include "tensorflow/compiler/mlir/lite/quantization/quantization_interface.h.inc" + +namespace OpTrait { +namespace quant { + // The base class that all the quantization related OpTrait implements. template class TraitType> struct QuantizationSpecTraitBase : public TraitBase { diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index b9ff9869232..f17e44cd756 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -23,6 +23,7 @@ limitations under the License. 
#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Quant/FakeQuantSupport.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project @@ -385,7 +386,8 @@ struct FoldTrivalRequantizeOp : public OpRewritePattern { Operation* def = pre_quantized.getDefiningOp(); if (!def) return failure(); - if (def->hasTrait() || + if (llvm::isa(def) || + def->hasTrait() || def->hasTrait()) { return failure(); } From 00868972a8fd688560d083c7c1e0ac3021543a6a Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Mon, 1 Jun 2020 16:05:13 -0700 Subject: [PATCH 1503/1533] Try again add dill and tblib to RBE image PiperOrigin-RevId: 314222042 Change-Id: Ib14707381951e857e09d469345d3d244d86ddfd8 --- .../tools/ci_build/install/install_pip_packages_by_version.sh | 2 ++ third_party/toolchains/preconfig/generate/containers.bzl | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh b/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh index 81e5f2b6406..6fe5dfc3193 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages_by_version.sh @@ -57,6 +57,8 @@ PACKAGES=( "tb-nightly" "argparse" "dm-tree" + "dill" + "tblib" ) # tf.mock require the following for python2: diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index 8e7cd8d2b6c..4a3ed0b5225 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -9,8 +9,8 @@ container_digests = { "cuda10.1-cudnn7-centos6": "sha256:454b899657e87893ee5e68dc0f87df59b6a0a7418ae09cafcc3dd65ac71feca9", "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432", "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:5e6d21c8ef226316eb6df5e2e6015244c16a8e5d936b52a09820442d2f8a919f", - "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:4dd708781c17a9e8d641c6ad05cc6e235e7147ff70f7b4a2ff6b31af43be4540", - "cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython": "sha256:016b50adda42173f0fa70533f06c0b28c8a1057c56b2e4cd721295462a248ab7", + "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:3f890a951c81a201d60d0161a56ce628a90323be0c7f795550caa37f6f41a85c", + "cuda10.1-cudnn7-ubuntu18.04-manylinux2010-multipython": "sha256:bd7666d1ef49b2b2e2a64981f1c9234deeccdb0d5198b30ff4289c3dfcffedbf", "rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe", "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12", } From 24580ebdd97df15eb4e047df659adb2b8319355d Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 1 Jun 2020 16:14:25 -0700 Subject: [PATCH 1504/1533] Make TraceMeEncode compile with C++14 PiperOrigin-RevId: 314223547 Change-Id: Iecc44c509c27787127200b3b5c92d3a9f6faad9c --- tensorflow/core/profiler/lib/BUILD | 13 +--- tensorflow/core/profiler/lib/traceme_encode.h | 43 ++++++------- .../core/profiler/lib/traceme_encode_test.cc | 60 ------------------- 3 files changed, 19 insertions(+), 97 deletions(-) delete mode 100644 tensorflow/core/profiler/lib/traceme_encode_test.cc diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 6bda544055e..e80b9fc9766 
100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -1,5 +1,5 @@ load("//tensorflow/core/platform:build_config_root.bzl", "if_static") -load("//tensorflow:tensorflow.bzl", "if_not_android", "tf_cc_test", "tf_cuda_library") +load("//tensorflow:tensorflow.bzl", "if_not_android", "tf_cuda_library") load("//tensorflow:tensorflow.bzl", "tf_pybind_cc_library_wrapper") package( @@ -113,17 +113,6 @@ cc_library( ], ) -tf_cc_test( - name = "traceme_encode_test", - srcs = ["traceme_encode_test.cc"], - deps = [ - ":traceme_encode", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "@com_google_absl//absl/strings", - ], -) - cc_library( name = "annotated_traceme", hdrs = ["annotated_traceme.h"], diff --git a/tensorflow/core/profiler/lib/traceme_encode.h b/tensorflow/core/profiler/lib/traceme_encode.h index 91b23740fc3..2e23c6d878b 100644 --- a/tensorflow/core/profiler/lib/traceme_encode.h +++ b/tensorflow/core/profiler/lib/traceme_encode.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -27,19 +28,6 @@ limitations under the License. namespace tensorflow { namespace profiler { - -// An argument passed to TraceMeEncode. -struct TraceMeArg { - // This constructor is required because absl::AlphaNum is non-copyable. - template - TraceMeArg(absl::string_view k, Value v) : key(k), value(v) {} - - TF_DISALLOW_COPY_AND_ASSIGN(TraceMeArg); - - absl::string_view key; - absl::AlphaNum value; -}; - namespace traceme_internal { // Copies the contents of str to the address pointed by out. @@ -57,21 +45,23 @@ TF_ATTRIBUTE_ALWAYS_INLINE inline char* Append(char* out, // Appends args encoded as TraceMe metadata to name. TF_ATTRIBUTE_ALWAYS_INLINE inline std::string AppendArgs( - std::string name, std::initializer_list args) { + std::string name, + const std::initializer_list>& + args) { if (TF_PREDICT_TRUE(args.size() > 0)) { const auto old_size = name.size(); auto new_size = old_size + args.size() * 2 + 1; for (const auto& arg : args) { - new_size += arg.key.size() + arg.value.size(); + new_size += arg.first.size() + arg.second.size(); } name.resize(new_size); char* const begin = &name[0]; char* out = begin + old_size; *out++ = '#'; for (const auto& arg : args) { - out = Append(out, arg.key); + out = Append(out, arg.first); *out++ = '='; - out = Append(out, arg.value.Piece()); + out = Append(out, arg.second.Piece()); *out++ = ','; } *(out - 1) = '#'; @@ -102,16 +92,19 @@ TF_ATTRIBUTE_ALWAYS_INLINE inline void AppendMetadata( // TraceMe trace_me([value1]() { // return TraceMeEncode("my_trace", {{"key1", value1}, {"key2", 42}}); // }); -TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( - std::string name, std::initializer_list args) { +inline std::string TraceMeEncode( + std::string name, + std::initializer_list> args) { return traceme_internal::AppendArgs(std::move(name), args); } -TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( - absl::string_view name, std::initializer_list args) { +inline std::string TraceMeEncode( + absl::string_view name, + std::initializer_list> args) { return traceme_internal::AppendArgs(std::string(name), args); } -TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( - const char* name, std::initializer_list args) { +inline std::string TraceMeEncode( + const char* name, + std::initializer_list> args) { return traceme_internal::AppendArgs(std::string(name), args); } @@ -123,8 +116,8 @@ 
TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( // trace_me.AppendMetadata([value1]() { // return TraceMeEncode({{"key1", value1}, {"key2", 42}}); // }); -TF_ATTRIBUTE_ALWAYS_INLINE inline std::string TraceMeEncode( - std::initializer_list args) { +inline std::string TraceMeEncode( + std::initializer_list> args) { return traceme_internal::AppendArgs(std::string(), args); } diff --git a/tensorflow/core/profiler/lib/traceme_encode_test.cc b/tensorflow/core/profiler/lib/traceme_encode_test.cc deleted file mode 100644 index a849004ab8e..00000000000 --- a/tensorflow/core/profiler/lib/traceme_encode_test.cc +++ /dev/null @@ -1,60 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/core/profiler/lib/traceme_encode.h" - -#include - -#include "absl/strings/str_cat.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace profiler { -namespace { - -TEST(TraceMeEncodeTest, NoArgTest) { - EXPECT_EQ(TraceMeEncode("Hello!", {}), "Hello!"); -} - -TEST(TraceMeEncodeTest, OneArgTest) { - EXPECT_EQ(TraceMeEncode("Hello", {{"context", "World"}}), - "Hello#context=World#"); -} - -TEST(TraceMeEncodeTest, TwoArgsTest) { - EXPECT_EQ(TraceMeEncode("Hello", {{"context", "World"}, {"request_id", 42}}), - "Hello#context=World,request_id=42#"); -} - -TEST(TraceMeEncodeTest, ThreeArgsTest) { - EXPECT_EQ(TraceMeEncode("Hello", {{"context", "World"}, - {"request_id", 42}, - {"addr", absl::Hex(0xdeadbeef)}}), - "Hello#context=World,request_id=42,addr=deadbeef#"); -} - -TEST(TraceMeEncodeTest, TemporaryStringTest) { - EXPECT_EQ(TraceMeEncode("Hello", {{std::string("context"), - absl::StrCat("World:", 2020)}}), - "Hello#context=World:2020#"); -} - -TEST(TraceMeEncodeTest, NoNameTest) { - EXPECT_EQ(TraceMeEncode({{"context", "World"}, {"request_id", 42}}), - "#context=World,request_id=42#"); -} - -} // namespace -} // namespace profiler -} // namespace tensorflow From edeae9fb69432f9ac4662152e942bb65b7c49f75 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Mon, 1 Jun 2020 16:23:43 -0700 Subject: [PATCH 1505/1533] 1. Adapt flex delegate to the newly created SimpleDelegate APIs. 2. Change SimpleDelegate APIs accordingly as needed to adapt the flex delegate. a. Create a `version` virtual function. b. Add a new function acting as an initialization for TfLiteDelegate Prepare. c. Return a std::unique_ptr TfLiteDegate for the factory method of TfLiteDelegateFactory for less error-prone memory management. 
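
For reference, a minimal sketch of a third-party delegate built on these SimpleDelegate APIs after this change. Illustrative only: MyDelegate, MyDelegateKernel and CreateMyDelegate are hypothetical names, and the sketch assumes only the interface methods visible in this patch (SimpleDelegateInterface, SimpleDelegateKernelInterface and the unique_ptr-returning TfLiteDelegateFactory::Create).

  // Illustrative sketch; MyDelegate* names are hypothetical, not part of this change.
  #include <memory>
  #include <utility>

  #include "tensorflow/lite/c/common.h"
  #include "tensorflow/lite/delegates/utils/simple_delegate.h"

  namespace tflite {

  // Hypothetical kernel: runs one delegated partition.
  class MyDelegateKernel : public SimpleDelegateKernelInterface {
   public:
    TfLiteStatus Init(TfLiteContext* context,
                      const TfLiteDelegateParams* params) override {
      // A real kernel would record the nodes this partition replaces.
      return kTfLiteOk;
    }
    TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) override {
      return kTfLiteOk;
    }
    TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) override {
      return kTfLiteOk;
    }
  };

  // Hypothetical delegate: claims nodes and creates kernels for them.
  class MyDelegate : public SimpleDelegateInterface {
   public:
    const char* name() const override { return "MyDelegate"; }
    bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration,
                                   const TfLiteNode* node,
                                   TfLiteContext* context) const override {
      return false;  // A real delegate would inspect the registration here.
    }
    // Runs once when the delegate is applied to the graph (point b above).
    TfLiteStatus Initialize(TfLiteContext* context) override {
      return kTfLiteOk;
    }
    std::unique_ptr<SimpleDelegateKernelInterface>
    CreateDelegateKernelInterface() override {
      return std::unique_ptr<SimpleDelegateKernelInterface>(
          new MyDelegateKernel());
    }
  };

  // The factory wraps the interface in a TfLiteDelegate whose lifetime is
  // owned by the returned unique_ptr (point c above).
  TfLiteDelegateUniquePtr CreateMyDelegate() {
    return TfLiteDelegateFactory::Create(std::make_unique<MyDelegate>());
  }

  }  // namespace tflite

The resulting delegate is then handed to the interpreter with interpreter->ModifyGraphWithDelegate(std::move(delegate)), as the updated flex delegate code below does.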
PiperOrigin-RevId: 314225150 Change-Id: I6bf7ab48f1e72f390b49dd5c6ac7ecd11d29327a --- tensorflow/lite/c/common.h | 5 +- tensorflow/lite/delegates/flex/BUILD | 62 ++------ tensorflow/lite/delegates/flex/delegate.cc | 110 +++++++------- tensorflow/lite/delegates/flex/delegate.h | 52 +++++-- .../lite/delegates/flex/delegate_test.cc | 5 +- .../delegates/flex/java/src/main/native/BUILD | 1 + .../java/src/main/native/flex_delegate_jni.cc | 4 +- tensorflow/lite/delegates/flex/kernel.cc | 94 ++++-------- tensorflow/lite/delegates/flex/kernel.h | 24 +++- tensorflow/lite/delegates/flex/kernel_test.cc | 136 +++++------------- .../lite/delegates/utils/simple_delegate.cc | 3 +- .../lite/delegates/utils/simple_delegate.h | 38 +++-- .../delegates/utils/simple_delegate_test.cc | 37 ++--- tensorflow/lite/testing/tflite_driver.cc | 5 +- .../benchmark/experimental/c/c_api_types.h | 5 +- 15 files changed, 254 insertions(+), 327 deletions(-) diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index d95c1431041..31d8d53d563 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -744,8 +744,9 @@ typedef struct TfLiteDelegate { struct TfLiteDelegate* delegate); // Copy the data from delegate buffer handle into raw memory of the given - // 'tensor'. This cannot be null. The delegate is allowed to allocate the raw - // bytes as long as it follows the rules for kTfLiteDynamic tensors. + // 'tensor'. Note that the delegate is allowed to allocate the raw bytes as + // long as it follows the rules for kTfLiteDynamic tensors, in which case this + // cannot be null. TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context, struct TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 8fc59c2c132..7981f4b5c27 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -61,6 +61,7 @@ cc_library( deps = [ ":delegate_data", ":delegate_only_runtime", + "//tensorflow/lite/delegates/utils:simple_delegate", ] + select({ "//tensorflow:android": [ "//tensorflow/core:portable_tensorflow_lib", @@ -82,6 +83,8 @@ cc_library( name = "delegate_only_runtime", srcs = [ "delegate.cc", + "kernel.cc", + "kernel.h", ], hdrs = [ "delegate.h", @@ -90,14 +93,18 @@ cc_library( deps = [ ":buffer_map", ":delegate_data", - ":kernel", ":util", + "@flatbuffers", "@com_google_absl//absl/strings:strings", + "//tensorflow/lite/core/api", "//tensorflow/lite/c:common", "//tensorflow/lite:kernel_api", "//tensorflow/lite:minimal_logging", + "//tensorflow/lite:string", "//tensorflow/lite:string_util", "//tensorflow/lite:util", + "//tensorflow/lite/delegates/utils:simple_delegate", + "//tensorflow/lite/kernels:kernel_util", ] + select({ "//tensorflow:android": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -106,7 +113,12 @@ cc_library( "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ + "//tensorflow/core/common_runtime/eager:context", + "//tensorflow/core/common_runtime/eager:execute", + "//tensorflow/core/common_runtime/eager:tensor_handle", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:framework", ], }), alwayslink = 1, @@ -163,40 +175,6 @@ tf_cc_test( ], ) -cc_library( - name = "kernel", - srcs = ["kernel.cc"], - hdrs = ["kernel.h"], - deps = [ - ":delegate_data", - ":util", - "@flatbuffers", - "//tensorflow/lite/core/api", - "//tensorflow/lite/c:common", - "//tensorflow/lite:kernel_api", - 
"//tensorflow/lite:string", - "//tensorflow/lite/kernels:kernel_util", - ] + select({ - # TODO(b/111881878): The android_tensorflow_lib target pulls in the full - # set of core TensorFlow kernels. We may want to revisit this dependency - # to allow selective registration via build targets. - "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", - ], - "//tensorflow:ios": [ - "//tensorflow/core:portable_tensorflow_lib_lite", - ], - "//conditions:default": [ - "//tensorflow/core/common_runtime/eager:context", - "//tensorflow/core/common_runtime/eager:execute", - "//tensorflow/core/common_runtime/eager:tensor_handle", - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core:framework", - ], - }), -) - tf_cc_test( name = "kernel_test", size = "small", @@ -204,20 +182,10 @@ tf_cc_test( tags = ["no_gpu"], # GPU + flex is not officially supported. deps = [ ":delegate_data", - ":kernel", + ":delegate_only_runtime", ":test_util", "@com_google_googletest//:gtest", - ] + select({ - "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib", - ], - "//tensorflow:ios": [ - "//tensorflow/core:portable_tensorflow_lib", - ], - "//conditions:default": [ - "//tensorflow/core:tensorflow", - ], - }), + ], ) cc_library( diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc index 985b2b68afe..13ce5ff2a22 100644 --- a/tensorflow/lite/delegates/flex/delegate.cc +++ b/tensorflow/lite/delegates/flex/delegate.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/delegates/flex/delegate.h" +#include #include #include "absl/strings/str_cat.h" @@ -27,10 +28,32 @@ limitations under the License. #include "tensorflow/lite/util.h" namespace tflite { -namespace flex { -namespace delegate { -TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) { +// Corresponding weak declaration found in lite/interpreter_builder.cc. +TfLiteDelegateUniquePtr AcquireFlexDelegate() { + return tflite::FlexDelegate::Create(); +} + +TfLiteDelegateUniquePtr FlexDelegate::Create( + std::unique_ptr base_delegate) { + TFLITE_LOG_PROD_ONCE(TFLITE_LOG_INFO, + "Created TensorFlow Lite delegate for select TF ops."); + if (base_delegate == nullptr) { + base_delegate.reset(new FlexDelegate()); + } + auto flex_delegate = TfLiteDelegateFactory::Create(std::move(base_delegate)); + flex_delegate->CopyFromBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* tensor) -> TfLiteStatus { + return reinterpret_cast(delegate->data_) + ->CopyFromBufferHandle(context, buffer_handle, tensor); + }; + flex_delegate->flags |= kTfLiteDelegateFlagsAllowDynamicTensors; + return flex_delegate; +} + +TfLiteStatus FlexDelegate::Initialize(TfLiteContext* context) { // If the TensorFlow Lite thread count is explicitly configured, use it, // otherwise rely on the default TensorFlow threading behavior. 
tensorflow::SessionOptions session_options; @@ -39,47 +62,37 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) { context->recommended_num_threads); } - auto status = reinterpret_cast(delegate->data_) - ->Prepare(session_options); + auto status = delegate_data_.Prepare(session_options); if (!status.ok()) { context->ReportError(context, "Failed to initialize TensorFlow context: %s", status.error_message().c_str()); return kTfLiteError; } - // Get the nodes in the current execution plan. Interpreter owns this array. - TfLiteIntArray* plan; - TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan)); - - // Add all custom ops starting with "Flex" to list of supported nodes. - std::vector supported_nodes; - for (int node_index : TfLiteIntArrayView(plan)) { - TfLiteNode* node; - TfLiteRegistration* registration; - TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration( - context, node_index, &node, ®istration)); - - if (IsFlexOp(registration->custom_name)) { - supported_nodes.push_back(node_index); - } - } - - // Request TFLite to partition the graph and make kernels for each independent - // node sub set. - TfLiteIntArray* size_and_nodes = - ConvertVectorToTfLiteIntArray(supported_nodes); - context->ReplaceNodeSubsetsWithDelegateKernels(context, GetKernel(), - size_and_nodes, delegate); - TfLiteIntArrayFree(size_and_nodes); return kTfLiteOk; } -TfLiteStatus CopyFromBufferHandle(TfLiteContext* context, - TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* output) { - BufferMap* buffer_map = - reinterpret_cast(delegate->data_)->GetBufferMap(context); +const char* FlexDelegate::name() const { + static constexpr char kName[] = "TfLiteFlexDelegate"; + return kName; +} + +bool FlexDelegate::IsNodeSupportedByDelegate( + const TfLiteRegistration* registration, const TfLiteNode* node, + TfLiteContext* context) const { + return IsFlexOp(registration->custom_name); +} + +std::unique_ptr +FlexDelegate::CreateDelegateKernelInterface() { + return std::unique_ptr( + new tflite::flex::DelegateKernel()); +} + +TfLiteStatus FlexDelegate::CopyFromBufferHandle( + TfLiteContext* context, TfLiteBufferHandle buffer_handle, + TfLiteTensor* output) { + flex::BufferMap* buffer_map = delegate_data_.GetBufferMap(context); if (!buffer_map->HasTensor(buffer_handle)) { context->ReportError(context, "Invalid tensor index %d.", buffer_handle); @@ -122,31 +135,4 @@ TfLiteStatus CopyFromBufferHandle(TfLiteContext* context, return kTfLiteOk; } -} // namespace delegate -} // namespace flex - -// Corresponding weak declaration found in lite/model.cc. 
-std::unique_ptr -AcquireFlexDelegate() { - return std::unique_ptr( - tflite::FlexDelegate::Create().release(), [](TfLiteDelegate* delegate) { - delete reinterpret_cast(delegate); - }); -} - -std::unique_ptr FlexDelegate::Create() { - TFLITE_LOG_PROD_ONCE(TFLITE_LOG_INFO, - "Created TensorFlow Lite delegate for select TF ops."); - return std::unique_ptr(new FlexDelegate()); -} - -FlexDelegate::FlexDelegate() : TfLiteDelegate(TfLiteDelegateCreate()) { - data_ = &delegate_data_; - Prepare = &flex::delegate::Prepare; - CopyFromBufferHandle = &flex::delegate::CopyFromBufferHandle; - flags = kTfLiteDelegateFlagsAllowDynamicTensors; -} - -FlexDelegate::~FlexDelegate() {} - } // namespace tflite diff --git a/tensorflow/lite/delegates/flex/delegate.h b/tensorflow/lite/delegates/flex/delegate.h index a199ae9eda8..a760d941656 100644 --- a/tensorflow/lite/delegates/flex/delegate.h +++ b/tensorflow/lite/delegates/flex/delegate.h @@ -17,9 +17,16 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/delegates/flex/delegate_data.h" +#include "tensorflow/lite/delegates/utils/simple_delegate.h" namespace tflite { +namespace flex { +namespace testing { +class KernelTest; +} // namespace testing +} // namespace flex + // WARNING: This is an experimental interface that is subject to change. // Delegate that can be used to extract parts of a graph that are designed to be // executed by TensorFlow's runtime via Eager. @@ -33,22 +40,49 @@ namespace tflite { // ... build interpreter ... // // if (delegate) { -// interpreter->ModifyGraphWithDelegate(delegate.get()); +// interpreter->ModifyGraphWithDelegate(std::move(delegate)); // } // ... run inference ... // ... destroy interpreter ... -// ... destroy delegate ... -class FlexDelegate : public TfLiteDelegate { +class FlexDelegate : public SimpleDelegateInterface { public: + friend class flex::testing::KernelTest; + // Creates a delegate that supports TF ops. - // - // If the underyling TF Flex context creation fails, returns null. - static std::unique_ptr Create(); + static TfLiteDelegateUniquePtr Create() { + return Create(/*base_delegate*/ nullptr); + } - ~FlexDelegate(); + ~FlexDelegate() override {} - private: - FlexDelegate(); + flex::DelegateData* mutable_data() { return &delegate_data_; } + + protected: + // We sometimes have to create certain stub data to test FlexDelegate. To + // achieve this, we will make a testing flex delegate class that inherits from + // FlexDelegate to override certain things for stub data creation. Therefore, + // this function accepts a FlexDelegate instance to initiliaze it properly for + // create a testing flex delegate in some cases, and it is only used in + // testing. 
+ static TfLiteDelegateUniquePtr Create( + std::unique_ptr base_delegate); + + FlexDelegate() {} + + const char* name() const override; + + bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, + const TfLiteNode* node, + TfLiteContext* context) const override; + + TfLiteStatus Initialize(TfLiteContext* context) override; + + std::unique_ptr CreateDelegateKernelInterface() + override; + + TfLiteStatus CopyFromBufferHandle(TfLiteContext* context, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* output); flex::DelegateData delegate_data_; }; diff --git a/tensorflow/lite/delegates/flex/delegate_test.cc b/tensorflow/lite/delegates/flex/delegate_test.cc index b48fe181e1f..d574d8fabbb 100644 --- a/tensorflow/lite/delegates/flex/delegate_test.cc +++ b/tensorflow/lite/delegates/flex/delegate_test.cc @@ -26,8 +26,7 @@ using ::testing::ElementsAre; class DelegateTest : public testing::FlexModelTest { public: - DelegateTest() { - delegate_ = FlexDelegate::Create(); + DelegateTest() : delegate_(FlexDelegate::Create()) { interpreter_.reset(new Interpreter(&error_reporter_)); } @@ -44,7 +43,7 @@ class DelegateTest : public testing::FlexModelTest { } private: - std::unique_ptr delegate_; + std::unique_ptr delegate_; }; TEST_F(DelegateTest, FullGraph) { diff --git a/tensorflow/lite/delegates/flex/java/src/main/native/BUILD b/tensorflow/lite/delegates/flex/java/src/main/native/BUILD index b240e0d3825..4732b963ed2 100644 --- a/tensorflow/lite/delegates/flex/java/src/main/native/BUILD +++ b/tensorflow/lite/delegates/flex/java/src/main/native/BUILD @@ -18,6 +18,7 @@ cc_library( ], deps = [ "//tensorflow/lite/delegates/flex:delegate", + "//tensorflow/lite/delegates/utils:simple_delegate", "//tensorflow/lite/java/jni", "//tensorflow/lite/testing:init_tensorflow", ], diff --git a/tensorflow/lite/delegates/flex/java/src/main/native/flex_delegate_jni.cc b/tensorflow/lite/delegates/flex/java/src/main/native/flex_delegate_jni.cc index 957c523c9fd..fef71914d80 100644 --- a/tensorflow/lite/delegates/flex/java/src/main/native/flex_delegate_jni.cc +++ b/tensorflow/lite/delegates/flex/java/src/main/native/flex_delegate_jni.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include "tensorflow/lite/delegates/flex/delegate.h" +#include "tensorflow/lite/delegates/utils/simple_delegate.h" #include "tensorflow/lite/testing/init_tensorflow.h" #ifdef __cplusplus @@ -37,7 +38,8 @@ Java_org_tensorflow_lite_flex_FlexDelegate_nativeCreateDelegate(JNIEnv* env, JNIEXPORT void JNICALL Java_org_tensorflow_lite_flex_FlexDelegate_nativeDeleteDelegate( JNIEnv* env, jclass clazz, jlong delegate) { - delete reinterpret_cast(delegate); + tflite::TfLiteDelegateFactory::DeleteSimpleDelegate( + reinterpret_cast(delegate)); } #ifdef __cplusplus diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc index d1c21086703..b6e809647d5 100644 --- a/tensorflow/lite/delegates/flex/kernel.cc +++ b/tensorflow/lite/delegates/flex/kernel.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/context_util.h" #include "tensorflow/lite/core/api/profiler.h" +#include "tensorflow/lite/delegates/flex/delegate.h" #include "tensorflow/lite/delegates/flex/delegate_data.h" #include "tensorflow/lite/delegates/flex/util.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -49,7 +50,6 @@ limitations under the License. 
namespace tflite { namespace flex { -namespace kernel { struct OpNode; @@ -357,33 +357,29 @@ struct OpData { std::vector subgraph_outputs; }; -void* Init(TfLiteContext* context, const char* buffer, size_t length) { - auto* op_data = new OpData; +DelegateKernel::DelegateKernel() : op_data_(new OpData) {} +DelegateKernel::~DelegateKernel() {} - const TfLiteDelegateParams* params = - reinterpret_cast(buffer); - CHECK(params); - CHECK(params->delegate); - CHECK(params->delegate->data_); - op_data->eager_context = - reinterpret_cast(params->delegate->data_) - ->GetEagerContext(); - op_data->buffer_map = reinterpret_cast(params->delegate->data_) - ->GetBufferMap(context); +TfLiteStatus DelegateKernel::Init(TfLiteContext* context, + const TfLiteDelegateParams* params) { + auto* flex_delegate_data = + reinterpret_cast(params->delegate->data_)->mutable_data(); + op_data_->eager_context = flex_delegate_data->GetEagerContext(); + op_data_->buffer_map = flex_delegate_data->GetBufferMap(context); CHECK(params->output_tensors); std::set output_set; for (auto tensor_index : TfLiteIntArrayView(params->output_tensors)) { - op_data->subgraph_outputs.push_back(tensor_index); + op_data_->subgraph_outputs.push_back(tensor_index); output_set.insert(tensor_index); } CHECK(params->input_tensors); for (auto tensor_index : TfLiteIntArrayView(params->input_tensors)) { - op_data->subgraph_inputs.push_back(tensor_index); + op_data_->subgraph_inputs.push_back(tensor_index); } - op_data->nodes.reserve(params->nodes_to_replace->size); + op_data_->nodes.reserve(params->nodes_to_replace->size); CHECK(params->nodes_to_replace); tensorflow::Status status; @@ -392,8 +388,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { TfLiteRegistration* reg; context->GetNodeAndRegistration(context, node_index, &node, ®); - op_data->nodes.emplace_back(new OpNode(node->inputs, node->outputs)); - OpNode& node_data = *op_data->nodes.back(); + op_data_->nodes.emplace_back(new OpNode(node->inputs, node->outputs)); + OpNode& node_data = *op_data_->nodes.back(); node_data.set_index(node_index); node_data.set_name(""); @@ -401,16 +397,11 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { status = node_data.InitializeNodeDef(node->custom_initial_data, node->custom_initial_data_size); if (!status.ok()) break; - status = node_data.BuildEagerOp(op_data->eager_context); + status = node_data.BuildEagerOp(op_data_->eager_context); if (!status.ok()) break; } - if (ConvertStatus(context, status) != kTfLiteOk) { - // We can't return an error from this function but ConvertStatus will - // report them and we will stop processing in Prepare() if anything went - // wrong. - return op_data; - } + TF_LITE_ENSURE_STATUS(ConvertStatus(context, status)); // Given a TfLite tensor index, return the OpNode that produces it, // along with it index into that OpNodes list of outputs. @@ -418,7 +409,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { // Find out how each tensor is produced. This does not account for // tensors that are not produce by eager ops. 
- for (auto& node_data : op_data->nodes) { + for (auto& node_data : op_data_->nodes) { node_data->mutable_outputs()->InitializeGraphOutputs(output_set); for (int i = 0; i < node_data->outputs().Size(); ++i) { int output_index = node_data->outputs().TfLiteIndex(i); @@ -428,21 +419,15 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { // For each node, resolve the inputs, so we can keep pointers to the nodes // that produces them. - for (auto& node_data : op_data->nodes) { + for (auto& node_data : op_data_->nodes) { node_data->mutable_inputs()->InitializeTensorSources(tflite_tensor_sources); } - - return op_data; + return kTfLiteOk; } -void Free(TfLiteContext* context, void* buffer) { - delete reinterpret_cast(buffer); -} - -TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { - const auto* op_data = reinterpret_cast(node->user_data); +TfLiteStatus DelegateKernel::Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_MSG( - context, op_data->eager_context != nullptr, + context, op_data_->eager_context != nullptr, "Failed to initialize eager context. This often happens when a CPU " "device has not been registered, presumably because some symbols from " "tensorflow/core:core_cpu_impl were not linked into the binary."); @@ -452,8 +437,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { std::map tensor_ref_count; // Whenever we find a constant tensor, insert it in the buffer map. - BufferMap* buffer_map = op_data->buffer_map; - for (auto tensor_index : op_data->subgraph_inputs) { + BufferMap* buffer_map = op_data_->buffer_map; + for (auto tensor_index : op_data_->subgraph_inputs) { TfLiteTensor* tensor = &context->tensors[tensor_index]; if (IsConstantTensor(tensor)) { if (!buffer_map->HasTensor(tensor_index)) { @@ -469,12 +454,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // All output tensors are allocated by TensorFlow/Eager, so we // mark them as kTfLiteDynamic. - for (auto tensor_index : op_data->subgraph_outputs) { + for (auto tensor_index : op_data_->subgraph_outputs) { SetTensorToDynamic(&context->tensors[tensor_index]); ++tensor_ref_count[tensor_index]; } - for (const auto& node_data : op_data->nodes) { + for (const auto& node_data : op_data_->nodes) { if (node_data->nodedef().op().empty()) { context->ReportError(context, "Invalid NodeDef in Flex op '%s'", node_data->name().c_str()); @@ -490,7 +475,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // All tensors that are referenced exactly once are marked as "forwardable", // meaning that we will allow TensorFlow to reuse its buffer as the output of // an op. - for (auto& node_data : op_data->nodes) { + for (auto& node_data : op_data_->nodes) { for (int i = 0; i < node_data->inputs().Size(); ++i) { bool f = (tensor_ref_count[node_data->inputs().TfLiteIndex(i)] == 1); node_data->mutable_inputs()->SetForwardable(i, f); @@ -500,13 +485,12 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - auto* op_data = reinterpret_cast(node->user_data); - BufferMap* buffer_map = op_data->buffer_map; +TfLiteStatus DelegateKernel::Invoke(TfLiteContext* context, TfLiteNode* node) { + BufferMap* buffer_map = op_data_->buffer_map; // Insert a tensor in the buffer map for all inputs that are not constant. // Constants were handled in Prepare() already. 
- for (auto tensor_index : op_data->subgraph_inputs) { + for (auto tensor_index : op_data_->subgraph_inputs) { TfLiteTensor* tensor = &context->tensors[tensor_index]; if (!IsConstantTensor(tensor)) { // If this tensor is part of an earlier TF subgraph we should not add it @@ -519,7 +503,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // Execute the TensorFlow Ops sequentially. - for (auto& node_data : op_data->nodes) { + for (auto& node_data : op_data_->nodes) { TFLITE_SCOPED_DELEGATE_OPERATOR_PROFILE( reinterpret_cast(context->profiler), node_data->name().c_str(), node_data->index()); @@ -528,7 +512,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_OK(context, ConvertStatus(context, status)); } - for (auto tensor_index : op_data->subgraph_outputs) { + for (auto tensor_index : op_data_->subgraph_outputs) { if (!buffer_map->HasTensor(tensor_index)) { context->ReportError(context, "Cannot write to invalid tensor index %d", tensor_index); @@ -546,21 +530,5 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } -} // namespace kernel - -TfLiteRegistration GetKernel() { - TfLiteRegistration registration{ - &kernel::Init, - &kernel::Free, - &kernel::Prepare, - &kernel::Eval, - nullptr, // .profiling_string - kTfLiteBuiltinDelegate, // .builtin_code - "TfLiteFlexDelegate", // .custom_name - 1, // .version - }; - return registration; -} - } // namespace flex } // namespace tflite diff --git a/tensorflow/lite/delegates/flex/kernel.h b/tensorflow/lite/delegates/flex/kernel.h index 6b30a19fbc2..27cbfeadc14 100644 --- a/tensorflow/lite/delegates/flex/kernel.h +++ b/tensorflow/lite/delegates/flex/kernel.h @@ -15,18 +15,28 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_FLEX_KERNEL_H_ #define TENSORFLOW_LITE_DELEGATES_FLEX_KERNEL_H_ +#include + #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/delegates/utils/simple_delegate.h" namespace tflite { namespace flex { -// Return the registration object used to initialize and execute ops that will -// be delegated to TensorFlow's Eager runtime. This TF Lite op is created by -// the flex delegate to handle execution of a supported subgraph. The usual -// flow is that the delegate informs the interpreter of supported nodes in a -// graph, and each supported subgraph is replaced with one instance of this -// kernel. -TfLiteRegistration GetKernel(); +struct OpData; +class DelegateKernel : public SimpleDelegateKernelInterface { + public: + DelegateKernel(); + ~DelegateKernel() override; + + TfLiteStatus Init(TfLiteContext* context, + const TfLiteDelegateParams* params) override; + TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) override; + TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) override; + + private: + std::unique_ptr op_data_; +}; } // namespace flex } // namespace tflite diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc index 380dbfb4f03..f7234075c95 100644 --- a/tensorflow/lite/delegates/flex/kernel_test.cc +++ b/tensorflow/lite/delegates/flex/kernel_test.cc @@ -12,38 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/lite/delegates/flex/kernel.h" - #include #include +#include "tensorflow/lite/delegates/flex/delegate.h" #include "tensorflow/lite/delegates/flex/delegate_data.h" #include "tensorflow/lite/delegates/flex/test_util.h" namespace tflite { namespace flex { -namespace { +namespace testing { using ::testing::ContainsRegex; using ::testing::ElementsAre; using ::testing::ElementsAreArray; -TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteDelegate* delegate, - const std::vector& supported_nodes) { - TfLiteIntArray* size_and_nodes = - ConvertVectorToTfLiteIntArray(supported_nodes); - TF_LITE_ENSURE_STATUS(context->ReplaceNodeSubsetsWithDelegateKernels( - context, flex::GetKernel(), size_and_nodes, delegate)); - TfLiteIntArrayFree(size_and_nodes); - return kTfLiteOk; -} - -// There is no easy way to pass a parameter into the TfLiteDelegate's -// 'prepare' function, so we keep a global map for testing purposed. -// To avoid collisions use: GetPrepareFunction<__LINE__>(). -std::map>* GetGlobalOpLists() { - static auto* op_list = new std::map>; - return op_list; -} +// A testing flex delegate that supports every node regardless whether it's +// actually supported or not. It's only for testing certain scenarios. +class TestFlexDelegate : public FlexDelegate { + protected: + bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, + const TfLiteNode* node, + TfLiteContext* context) const override { + return true; + } +}; class KernelTest : public testing::FlexModelTest { public: @@ -51,51 +43,16 @@ class KernelTest : public testing::FlexModelTest { static constexpr int kTwos = 2; // This is the index of a tensor of 2's. static constexpr int kMaxTensors = 30; - static void SetUpTestSuite() { GetGlobalOpLists()->clear(); } + KernelTest() { interpreter_.reset(new Interpreter(&error_reporter_)); } - KernelTest() { - CHECK(delegate_data_.Prepare(tensorflow::SessionOptions{}).ok()); - interpreter_.reset(new Interpreter(&error_reporter_)); + void ApplyFlexDelegate(std::unique_ptr delegate = nullptr) { + auto flex_delegate = FlexDelegate::Create(std::move(delegate)); + auto* delegate_data = + reinterpret_cast(flex_delegate->data_)->mutable_data(); + CHECK(delegate_data->Prepare(tensorflow::SessionOptions{}).ok()); + CHECK(interpreter_->ModifyGraphWithDelegate(std::move(flex_delegate)) == + kTfLiteOk); } - - typedef TfLiteStatus (*PrepareFunction)(TfLiteContext* context, - TfLiteDelegate* delegate); - - template - PrepareFunction GetPrepareFunction() { - GetGlobalOpLists()->insert({KEY, tf_ops_}); - return [](TfLiteContext* context, TfLiteDelegate* delegate) { - return GenericPrepare(context, delegate, GetGlobalOpLists()->at(KEY)); - }; - } - - template - void ConfigureDelegate(T prepare_function) { - delegate_.data_ = &delegate_data_; - delegate_.flags = kTfLiteDelegateFlagsAllowDynamicTensors; - delegate_.FreeBufferHandle = nullptr; - delegate_.Prepare = prepare_function; - delegate_.CopyFromBufferHandle = [](TfLiteContext* context, - TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* output) { - auto* delegate_data = reinterpret_cast(delegate->data_); - auto* buffer_map = delegate_data->GetBufferMap(context); - if (!buffer_map->HasTensor(buffer_handle)) { - context->ReportError(context, "Tensor '%d' not found", buffer_handle); - return kTfLiteError; - } - tensorflow::StringPiece values = - buffer_map->GetTensor(buffer_handle).tensor_data(); - 
memcpy(output->data.raw, values.data(), values.size()); - return kTfLiteOk; - }; - CHECK(interpreter_->ModifyGraphWithDelegate(&delegate_) == kTfLiteOk); - } - - private: - DelegateData delegate_data_; - TfLiteDelegate delegate_; }; TEST_F(KernelTest, FullGraph) { @@ -108,10 +65,7 @@ TEST_F(KernelTest, FullGraph) { AddTfOp(testing::kAdd, {2, 5}, {7}); AddTfOp(testing::kMul, {6, 7}, {8}); - // Apply Delegate. - ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { - return GenericPrepare(context, delegate, {0, 1, 2, 3, 4}); - }); + ApplyFlexDelegate(); // Define inputs. SetShape(0, {2, 2, 1}); @@ -140,9 +94,7 @@ TEST_F(KernelTest, BadTensorFlowOp) { AddTensors(2, {0}, {1}, kTfLiteFloat32, {3}); AddTfOp(testing::kNonExistent, {0}, {1}); - ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { - return GenericPrepare(context, delegate, {0}); - }); + ApplyFlexDelegate(std::unique_ptr(new TestFlexDelegate())); ASSERT_NE(interpreter_->AllocateTensors(), kTfLiteOk); ASSERT_THAT(error_reporter().error_messages(), @@ -153,9 +105,7 @@ TEST_F(KernelTest, BadNumberOfOutputs) { AddTensors(3, {0}, {1, 2}, kTfLiteFloat32, {3}); AddTfOp(testing::kIdentity, {0}, {1, 2}); - ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { - return GenericPrepare(context, delegate, {0}); - }); + ApplyFlexDelegate(); SetShape(0, {2, 2, 1}); SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f}); @@ -171,9 +121,7 @@ TEST_F(KernelTest, IncompatibleNodeDef) { // Cast is a TF op, but we don't add the proper nodedef to it in AddTfOp. AddTfOp(testing::kIncompatibleNodeDef, {0}, {1}); - ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { - return GenericPrepare(context, delegate, {0}); - }); + ApplyFlexDelegate(); SetShape(0, {2, 2, 1}); SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f}); @@ -188,14 +136,14 @@ TEST_F(KernelTest, WrongSetOfNodes) { AddTfOp(testing::kUnpack, {0}, {1, 2}); AddTfLiteMulOp({1, 2}, {3}); - // Specify that testing::kMul (#1) is supported when it actually isn't. - ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { - return GenericPrepare(context, delegate, {0, 1}); - }); + // Specify that testing::kMul (#1) is supported when it actually isn't so that + // we choose to use the TestFlexDelegate that supports every node regardless + // whether it's actually supported or not. + ApplyFlexDelegate(std::unique_ptr(new TestFlexDelegate())); ASSERT_NE(interpreter_->AllocateTensors(), kTfLiteOk); ASSERT_THAT(error_reporter().error_messages(), - ContainsRegex("Invalid NodeDef in Flex op")); + ContainsRegex("Cannot convert empty data into a valid NodeDef")); } TEST_F(KernelTest, MixedGraph) { @@ -207,9 +155,7 @@ TEST_F(KernelTest, MixedGraph) { AddTfOp(testing::kAdd, {2, 5}, {7}); AddTfLiteMulOp({6, 7}, {8}); - ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { - return GenericPrepare(context, delegate, {0, 1, 2, 3}); - }); + ApplyFlexDelegate(); SetShape(0, {2, 2, 1}); SetValues(0, {1.1f, 2.2f, 3.3f, 4.4f}); @@ -251,14 +197,7 @@ TEST_F(KernelTest, SplitGraph) { // The two branches added together: AddTfOp(testing::kAdd, {9, 16}, {17}); // => 16 - ConfigureDelegate([](TfLiteContext* context, TfLiteDelegate* delegate) { - // All ops but #3 are TF ops, handled by the delegate. 
However, because #4 - // depends on the non-TF op, two subgraphs are necessary: - // TF subgraph 1: 0, 1, 2, 6, 7, 8, 9 - // TF Lite Op: 3 - // TF subgraph 2: 4, 5, 10 - return GenericPrepare(context, delegate, {0, 1, 2, 4, 5, 6, 7, 8, 9, 10}); - }); + ApplyFlexDelegate(); SetShape(0, {2, 2, 2, 1}); SetValues(0, a); @@ -291,9 +230,8 @@ class MultipleSubgraphsTest : public KernelTest { public: static constexpr int kInput = 0; - void PrepareInterpreter(PrepareFunction prepare, - const std::vector& input) { - ConfigureDelegate(prepare); + void PrepareInterpreter(const std::vector& input) { + ApplyFlexDelegate(); SetShape(kOnes, {3}); SetValues(kOnes, {1.0f, 1.0f, 1.0f}); @@ -336,7 +274,7 @@ TEST_F(MultipleSubgraphsTest, ForwardabilityIsLocal) { AddTfLiteMulOp({10, 7}, {12}); auto input = {3.0f, 4.0f, 5.0f}; - PrepareInterpreter(GetPrepareFunction<__LINE__>(), input); + PrepareInterpreter(input); ASSERT_TRUE(Invoke()); ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) { @@ -371,7 +309,7 @@ TEST_F(MultipleSubgraphsTest, DoNotRemoveInputTensors) { AddTfLiteMulOp({10, 7}, {12}); auto input = {3.0f, 4.0f, 5.0f}; - PrepareInterpreter(GetPrepareFunction<__LINE__>(), input); + PrepareInterpreter(input); ASSERT_TRUE(Invoke()); ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) { @@ -405,7 +343,7 @@ TEST_F(MultipleSubgraphsTest, DoNotForwardInputTensors) { AddTfLiteMulOp({10, 7}, {12}); auto input = {3.0f, 4.0f, 5.0f}; - PrepareInterpreter(GetPrepareFunction<__LINE__>(), input); + PrepareInterpreter(input); ASSERT_TRUE(Invoke()); ASSERT_THAT(GetValues(12), ElementsAreArray(Apply(input, [](float in) { @@ -413,7 +351,7 @@ TEST_F(MultipleSubgraphsTest, DoNotForwardInputTensors) { }))); } -} // namespace +} // namespace testing } // namespace flex } // namespace tflite diff --git a/tensorflow/lite/delegates/utils/simple_delegate.cc b/tensorflow/lite/delegates/utils/simple_delegate.cc index 6ac99883cd2..156f6a9679a 100644 --- a/tensorflow/lite/delegates/utils/simple_delegate.cc +++ b/tensorflow/lite/delegates/utils/simple_delegate.cc @@ -32,6 +32,7 @@ TfLiteRegistration GetDelegateKernelRegistration( kernel_registration.profiling_string = nullptr; kernel_registration.builtin_code = kTfLiteBuiltinDelegate; kernel_registration.custom_name = delegate->name(); + kernel_registration.version = 1; kernel_registration.free = [](TfLiteContext* context, void* buffer) -> void { delete reinterpret_cast(buffer); }; @@ -77,6 +78,7 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* base_delegate) { auto* delegate = reinterpret_cast(base_delegate->data_); + TF_LITE_ENSURE_STATUS(delegate->Initialize(context)); delegates::IsNodeSupportedFn node_supported_fn = [=](TfLiteContext* context, TfLiteNode* node, TfLiteRegistration* registration, @@ -125,5 +127,4 @@ void TfLiteDelegateFactory::DeleteSimpleDelegate(TfLiteDelegate* delegate) { delete simple_delegate; delete delegate; } - } // namespace tflite diff --git a/tensorflow/lite/delegates/utils/simple_delegate.h b/tensorflow/lite/delegates/utils/simple_delegate.h index bf35fbc47aa..54473e41901 100644 --- a/tensorflow/lite/delegates/utils/simple_delegate.h +++ b/tensorflow/lite/delegates/utils/simple_delegate.h @@ -20,8 +20,12 @@ limitations under the License. // this interface to build/prepare/invoke the delegated subgraph. 
// - SimpleDelegateInterface: // This class wraps TFLiteDelegate and users need to implement the interface and -// then Call GetFinalizedDelegate() to get TfLiteDelegate* that can be passed to -// ModifyGraphWithDelegate. +// then call TfLiteDelegateFactory::CreateSimpleDelegate(...) to get +// TfLiteDelegate* that can be passed to ModifyGraphWithDelegate and free it via +// TfLiteDelegateFactory::DeleteSimpleDelegate(...). +// or call TfLiteDelegateFactory::Create(...) to get a std::unique_ptr +// TfLiteDelegate that can also be passed to ModifyGraphWithDelegate, in which +// case TfLite interpereter takes the memory ownership of the delegate. #ifndef TENSORFLOW_LITE_DELEGATES_UTILS_SIMPLE_DELEGATE_H_ #define TENSORFLOW_LITE_DELEGATES_UTILS_SIMPLE_DELEGATE_H_ @@ -31,6 +35,9 @@ limitations under the License. namespace tflite { +using TfLiteDelegateUniquePtr = + std::unique_ptr; + // Users should inherit from this class and implement the interface below. // Each instance represents a single part of the graph (subgraph). class SimpleDelegateKernelInterface { @@ -49,6 +56,7 @@ class SimpleDelegateKernelInterface { // Actual subgraph inference should happen on this call. // Returns status, and signalling any errors. + // TODO(b/157882025): change this to Eval to be consistent w/ a TFLite kernel. virtual TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) = 0; }; @@ -58,6 +66,7 @@ class SimpleDelegateKernelInterface { // // Clients should implement the following methods: // - IsNodeSupportedByDelegate +// - Initialize // - name // - CreateDelegateKernelInterface class SimpleDelegateInterface { @@ -71,8 +80,14 @@ class SimpleDelegateInterface { const TfLiteNode* node, TfLiteContext* context) const = 0; + // Initialize the delegate before finding and replacing TfLite nodes with + // delegate kernels, for example, retrieving some TFLite settings from + // 'context'. + virtual TfLiteStatus Initialize(TfLiteContext* context) = 0; + // Returns a name that identifies the delegate. // This name is used for debugging/logging/profiling. + // TODO(b/157882025): change this to Name() virtual const char* name() const = 0; // Returns instance of an object that implements the interface @@ -84,13 +99,8 @@ class SimpleDelegateInterface { CreateDelegateKernelInterface() = 0; }; -// Factory class that provides two static methods -// CreateSimpleDelegate -// DeleteSimpleDelegate -// Which should be used to construct TfLiteDelegate from -// Simple Delegate and delete TfLiteDelegate and SimpleDelegate give -// tfLiteDelegate* created from 'CreateSimpleDelegate' method. -// Users should use these methods to Create and Destroy the delegate. +// Factory class that provides static methods to deal with SimpleDelegate +// creation and deletion. class TfLiteDelegateFactory { public: // Creates TfLiteDelegate from the provided SimpleDelegateInterface. @@ -99,9 +109,17 @@ class TfLiteDelegateFactory { std::unique_ptr simple_delegate); // Deletes 'delegate' the passed pointer must be the one returned - // from GetFinalizedDelegate. + // from CreateSimpleDelegate. // This function will destruct the SimpleDelegate object too. static void DeleteSimpleDelegate(TfLiteDelegate* delegate); + + // A convenient function wrapping the above two functions and returning a + // std::unique_ptr type for auto memory management. 
+ inline static TfLiteDelegateUniquePtr Create( + std::unique_ptr simple_delegate) { + return TfLiteDelegateUniquePtr( + CreateSimpleDelegate(std::move(simple_delegate)), DeleteSimpleDelegate); + } }; } // namespace tflite diff --git a/tensorflow/lite/delegates/utils/simple_delegate_test.cc b/tensorflow/lite/delegates/utils/simple_delegate_test.cc index fa6d528a537..42c0ace6cb7 100644 --- a/tensorflow/lite/delegates/utils/simple_delegate_test.cc +++ b/tensorflow/lite/delegates/utils/simple_delegate_test.cc @@ -72,7 +72,12 @@ class TestSimpleDelegate : public SimpleDelegateInterface { return options_.allowed_builtin_code == registration->builtin_code; } - const char* name() const override { return "TestSimpleDelegate"; } + TfLiteStatus Initialize(TfLiteContext* context) override { return kTfLiteOk; } + + const char* name() const override { + static constexpr char kName[] = "TestSimpleDelegate"; + return kName; + } std::unique_ptr CreateDelegateKernelInterface() override { @@ -113,27 +118,24 @@ class TestDelegate : public ::testing::Test { reg); } - void TearDown() override { - interpreter_.reset(); - TfLiteDelegateFactory::DeleteSimpleDelegate(delegate_); - } + void TearDown() override { interpreter_.reset(); } protected: std::unique_ptr interpreter_; - TfLiteDelegate* delegate_ = nullptr; }; TEST_F(TestDelegate, BasicDelegate) { TestSimpleDelegateOptions options; options.allowed_builtin_code = kTfLiteBuiltinAdd; - delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + auto delegate = TfLiteDelegateFactory::Create( std::make_unique(options)); - interpreter_->ModifyGraphWithDelegate(delegate_); + interpreter_->ModifyGraphWithDelegate(std::move(delegate)); ASSERT_EQ(interpreter_->execution_plan().size(), 1); int node = interpreter_->execution_plan()[0]; const auto* node_and_reg = interpreter_->node_and_registration(node); - EXPECT_EQ("TestSimpleDelegate", node_and_reg->second.custom_name); + EXPECT_STREQ("TestSimpleDelegate", node_and_reg->second.custom_name); + EXPECT_EQ(1, node_and_reg->second.version); const TfLiteDelegateParams* params = static_cast( node_and_reg->first.builtin_data); @@ -154,9 +156,9 @@ TEST_F(TestDelegate, BasicDelegate) { TEST_F(TestDelegate, NoNodesToDelegate) { TestSimpleDelegateOptions options; options.allowed_builtin_code = kTfLiteBuiltinSub; - delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + auto delegate = TfLiteDelegateFactory::Create( std::make_unique(options)); - interpreter_->ModifyGraphWithDelegate(delegate_); + interpreter_->ModifyGraphWithDelegate(std::move(delegate)); ASSERT_EQ(interpreter_->execution_plan().size(), 3); } @@ -165,19 +167,20 @@ TEST_F(TestDelegate, DelegateFailedPrepare) { TestSimpleDelegateOptions options; options.allowed_builtin_code = kTfLiteBuiltinAdd; options.error_during_prepare = true; - delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + auto delegate = TfLiteDelegateFactory::Create( std::make_unique(options)); ASSERT_EQ(kTfLiteDelegateError, - interpreter_->ModifyGraphWithDelegate(delegate_)); + interpreter_->ModifyGraphWithDelegate(std::move(delegate))); } TEST_F(TestDelegate, DelegateFailedInvoke) { TestSimpleDelegateOptions options; options.allowed_builtin_code = kTfLiteBuiltinAdd; options.error_during_invoke = true; - delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + auto delegate = TfLiteDelegateFactory::Create( std::make_unique(options)); - ASSERT_EQ(kTfLiteOk, interpreter_->ModifyGraphWithDelegate(delegate_)); + ASSERT_EQ(kTfLiteOk, + interpreter_->ModifyGraphWithDelegate(std::move(delegate))); 
ASSERT_EQ(kTfLiteError, interpreter_->Invoke()); } @@ -185,10 +188,10 @@ TEST_F(TestDelegate, DelegateFailedInit) { TestSimpleDelegateOptions options; options.allowed_builtin_code = kTfLiteBuiltinAdd; options.error_during_init = true; - delegate_ = TfLiteDelegateFactory::CreateSimpleDelegate( + auto delegate = TfLiteDelegateFactory::Create( std::make_unique(options)); ASSERT_EQ(kTfLiteDelegateError, - interpreter_->ModifyGraphWithDelegate(delegate_)); + interpreter_->ModifyGraphWithDelegate(std::move(delegate))); } } // namespace } // namespace tflite diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc index 93a1b813f4d..692155239ff 100644 --- a/tensorflow/lite/testing/tflite_driver.cc +++ b/tensorflow/lite/testing/tflite_driver.cc @@ -337,10 +337,7 @@ TfLiteDriver::TfLiteDriver(DelegateType delegate_type, bool reference_kernel) break; case DelegateType::kFlex: #if !defined(__APPLE__) - delegate_ = Interpreter::TfLiteDelegatePtr( - FlexDelegate::Create().release(), [](TfLiteDelegate* delegate) { - delete static_cast(delegate); - }); + delegate_ = FlexDelegate::Create(); #endif break; } diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h index d95c1431041..31d8d53d563 100644 --- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h +++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h @@ -744,8 +744,9 @@ typedef struct TfLiteDelegate { struct TfLiteDelegate* delegate); // Copy the data from delegate buffer handle into raw memory of the given - // 'tensor'. This cannot be null. The delegate is allowed to allocate the raw - // bytes as long as it follows the rules for kTfLiteDynamic tensors. + // 'tensor'. Note that the delegate is allowed to allocate the raw bytes as + // long as it follows the rules for kTfLiteDynamic tensors, in which case this + // cannot be null. TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context, struct TfLiteDelegate* delegate, TfLiteBufferHandle buffer_handle, From b9d99cb6f5508ae3cb672f94dc4726d769122711 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Mon, 1 Jun 2020 16:46:21 -0700 Subject: [PATCH 1506/1533] Reduce 1-layer Functional.__call__ overhead by ~10% Adds a Layer._flatten_layers(recursive=True, include_self=True) method. Uses this method for gathering unique sublayers, maintaining backwards compatible ordering. Removes unnecessary attribute caching for stateful and dynamic properties. 
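The new traversal is a single iterative walk over `_layers` that de-duplicates by object id as it goes, instead of recursively gathering all children and deduping afterwards. A minimal, self-contained sketch of the pattern (a hypothetical stand-in class, not the real Keras Layer; the actual method below also descends into tracked data structures such as dicts and lists of sublayers):

import collections


class Layer(object):
  """Illustrative stand-in for the Keras base Layer; tracks sublayers only."""

  def __init__(self, name):
    self.name = name
    self._layers = []  # Direct sublayers, in the order they were attached.

  def _flatten_layers(self, recursive=True, include_self=True):
    """Yields each sublayer exactly once, preserving discovery order."""
    if include_self:
      yield self
    seen_ids = set()
    to_visit = collections.deque(self._layers)
    while to_visit:
      layer = to_visit.popleft()
      if id(layer) in seen_ids:
        continue
      seen_ids.add(id(layer))
      yield layer
      if recursive:
        # Children go to the front of the deque, so they are visited before
        # later siblings, which keeps the backwards-compatible ordering.
        to_visit.extendleft(reversed(layer._layers))


model = Layer('model')
block = Layer('block')
block._layers.append(Layer('block/dense'))
model._layers.append(block)

assert [l.name for l in model._flatten_layers()] == [
    'model', 'block', 'block/dense']
assert [l.name for l in model._flatten_layers(
    include_self=False, recursive=False)] == ['block']

In the diff below, `Model.layers` becomes exactly the non-recursive, exclude-self variant of this walk.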
PiperOrigin-RevId: 314229252 Change-Id: I08cb80ae27861c52eae1ebed068b9a10d803e8a0 --- tensorflow/python/keras/engine/base_layer.py | 122 +++++++----------- .../python/keras/engine/base_layer_v1.py | 57 +------- tensorflow/python/keras/engine/functional.py | 5 - tensorflow/python/keras/engine/sequential.py | 8 -- tensorflow/python/keras/engine/training.py | 5 +- .../python/keras/engine/training_test.py | 59 +++++++++ tensorflow/python/keras/models.py | 5 - .../python/keras/saving/saved_model/utils.py | 2 +- 8 files changed, 112 insertions(+), 151 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 4a33e8f4e20..e817d56f619 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import copy import functools import itertools @@ -1019,31 +1020,16 @@ class Layer(module.Module, version_utils.LayerVersionSelector): return self._name @property - @trackable_layer_utils.cache_recursive_attribute('dynamic') def dynamic(self): """Whether the layer is dynamic (eager-only); set in the constructor.""" - # NOTE(taylorrobie): Currently self._dynamic is read-only. If that changes - # then this cache logic must be updated. - return self._dynamic or any(layer.dynamic - for layer in self._unique_sublayers()) - - def _unique_sublayers(self): - # Model.layers will use this as implementation, but we can't expose this - # one as the public property since it might conflict with subclass layers - # which also have user defined layers property. - self._maybe_create_attribute('_layers', []) - return list( - trackable_layer_utils.filter_empty_layer_containers(self._layers)) + return any(layer._dynamic for layer in self._flatten_layers()) @property @doc_controls.do_not_doc_inheritable - @trackable_layer_utils.cache_recursive_attribute('stateful') def stateful(self): - return self._stateful or any( - getattr(layer, 'stateful', False) for layer in self._unique_sublayers()) + return any(layer._stateful for layer in self._flatten_layers()) @stateful.setter - @trackable_layer_utils.invalidate_recursive_cache('stateful') def stateful(self, value): self._stateful = value @@ -1053,9 +1039,8 @@ class Layer(module.Module, version_utils.LayerVersionSelector): @trainable.setter def trainable(self, value): - self._trainable = value - for layer in getattr(self, '_layers', []): - layer.trainable = value + for layer in self._flatten_layers(): + layer._trainable = value @property def activity_regularizer(self): @@ -1162,7 +1147,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): @doc_controls.do_not_doc_inheritable def updates(self): collected_updates = [] - all_layers = self._gather_unique_layers() + all_layers = self._flatten_layers() with backend.get_graph().as_default(): for layer in all_layers: if not layer.trainable and not layer.stateful: @@ -1215,8 +1200,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): A list of tensors. """ collected_losses = [] - all_layers = self._gather_unique_layers() - for layer in all_layers: + for layer in self._flatten_layers(): # If any eager losses are present, we assume the model to be part of an # eager training loop (either a custom one or the one used when # `run_eagerly=True`) and so we always return just the eager losses. 
@@ -1357,12 +1341,11 @@ class Layer(module.Module, version_utils.LayerVersionSelector): def _clear_losses(self): """Used every step in eager to reset losses.""" # Set to thread local directly to avoid Layer.__setattr__ overhead. - self._thread_local._eager_losses = [] - sublayers = getattr(self, '_layers', []) - if sublayers: - sublayers = trackable_layer_utils.filter_empty_layer_containers(sublayers) - for layer in sublayers: - layer._clear_losses() + if not getattr(self, '_layers', None): # Fast path for single Layer. + self._thread_local._eager_losses = [] + else: + for layer in self._flatten_layers(): + layer._thread_local._eager_losses = [] @property def metrics(self): @@ -1382,8 +1365,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): A list of tensors. """ collected_metrics = [] - all_layers = self._gather_unique_layers() - for layer in all_layers: + for layer in self._flatten_layers(): with layer._metrics_lock: collected_metrics.extend(layer._metrics) return collected_metrics @@ -2507,22 +2489,16 @@ class Layer(module.Module, version_utils.LayerVersionSelector): Returns: A dict mapping all sublayers to their `trainable` value. """ - layers = trackable_layer_utils.filter_empty_layer_containers(self._layers) - # Keep track of each top-level layers' `trainable` as well as the - # state of all of its sublayers. trainable_state = weakref.WeakKeyDictionary() - trainable_state[self] = self.trainable - for layer in layers: - trainable_state.update(layer._get_trainable_state()) + for layer in self._flatten_layers(): + trainable_state[layer] = layer.trainable return trainable_state def _set_trainable_state(self, trainable_state): """Set `trainable` state for each sublayer.""" - layers = trackable_layer_utils.filter_empty_layer_containers(self._layers) - if self in trainable_state: - self.trainable = trainable_state[self] - for layer in layers: - layer._set_trainable_state(trainable_state) + for layer in self._flatten_layers(): + if layer in trainable_state: + layer.trainable = trainable_state[layer] @property def _obj_reference_counts(self): @@ -2582,7 +2558,6 @@ class Layer(module.Module, version_utils.LayerVersionSelector): super(tracking.AutoTrackable, self).__setattr__( '_layers', [l for l in self._layers if l is not existing_value]) - self._attribute_sentinel.invalidate_all() if isinstance(existing_value, tf_variables.Variable): super(tracking.AutoTrackable, self).__setattr__( '_trainable_weights', @@ -2591,13 +2566,6 @@ class Layer(module.Module, version_utils.LayerVersionSelector): '_non_trainable_weights', [w for w in self._non_trainable_weights if w is not existing_value]) - # Any time we change `_layers` (either by deleting the attribute or by - # reassigning it which will call __delattr__ from __setattr__) the topology - # of the subgraph of Layers may change. In that case we will need to - # recompute any attribute which depends on that subgraph. - if name == '_layers': - self._attribute_sentinel.invalidate_all() - def __setattr__(self, name, value): if (name == '_self_setattr_tracking' or not getattr(self, '_self_setattr_tracking', True) or @@ -2642,8 +2610,6 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # container types which compare equal. 
if not any((layer is value for layer in self._layers)): self._layers.append(value) - if hasattr(value, '_attribute_sentinel'): - value._attribute_sentinel.add_parent(self._attribute_sentinel) if hasattr(value, '_use_resource_variables'): # Legacy layers (V1 tf.layers) must always use # resource variables. @@ -2691,34 +2657,36 @@ class Layer(module.Module, version_utils.LayerVersionSelector): getattr(layer, attribute) for layer in nested_layers)) return [] - def _gather_unique_layers(self): - """Returns the current layer and all its children depth first deduped. + def _flatten_layers(self, recursive=True, include_self=True): + if include_self: + yield self - We are deduping after getting the layers to maintain the order. - """ - all_layers = self._gather_layers() - unique_layers, seen_layers = [], object_identity.ObjectIdentitySet() - for layer in all_layers: - if layer not in seen_layers: - unique_layers.append(layer) - # Track the Variable's identity to avoid __eq__ issues. - seen_layers.add(layer) - return unique_layers + # Only instantiate set and deque if needed. + layers_or_containers = getattr(self, '_layers', None) + if layers_or_containers: + seen_object_ids = set() + deque = collections.deque(layers_or_containers) + while deque: + layer_or_container = deque.popleft() - def _gather_layers(self): - """Returns the current layer and all its children depth first.""" - all_layers = [self] - if hasattr(self, '_layers'): - child_layers = trackable_layer_utils.filter_empty_layer_containers( - self._layers) - for child_layer in child_layers: - all_layers.extend(child_layer._gather_layers()) - return all_layers + layer_or_container_id = id(layer_or_container) + if layer_or_container_id in seen_object_ids: + continue + seen_object_ids.add(layer_or_container_id) - @property - @tracking.cached_per_instance - def _attribute_sentinel(self): - return trackable_layer_utils.AttributeSentinel() + if isinstance(layer_or_container, Layer): + yield layer_or_container + # Introspect recursively through sublayers. + if recursive: + sublayers = getattr(layer_or_container, '_layers', None) + if sublayers: + deque.extendleft(reversed(sublayers)) + elif isinstance(layer_or_container, + data_structures.TrackableDataStructure): + # Data structures are introspected even with `recursive=False`. + tracked_values = layer_or_container._values + if tracked_values: + deque.extendleft(reversed(tracked_values)) # This is a hack so that the is_layer (within # training/trackable/layer_utils.py) check doesn't get the weights attr. diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py index 4c0826be4dc..78140985b4a 100644 --- a/tensorflow/python/keras/engine/base_layer_v1.py +++ b/tensorflow/python/keras/engine/base_layer_v1.py @@ -829,22 +829,15 @@ class Layer(base_layer.Layer): return self._name @property - @trackable_layer_utils.cache_recursive_attribute('dynamic') def dynamic(self): - # NOTE(taylorrobie): Currently self._dynamic is read-only. If that changes - # then this cache logic must be updated. 
- return self._dynamic or any(layer.dynamic - for layer in self._unique_sublayers()) + return any(layer._dynamic for layer in self._flatten_layers()) @property @doc_controls.do_not_generate_docs - @trackable_layer_utils.cache_recursive_attribute('stateful') def stateful(self): - return self._stateful or any( - getattr(layer, 'stateful', False) for layer in self._unique_sublayers()) + return any(layer._stateful for layer in self._flatten_layers()) @stateful.setter - @trackable_layer_utils.invalidate_recursive_cache('stateful') def stateful(self, value): self._stateful = value @@ -916,7 +909,7 @@ class Layer(base_layer.Layer): @property def updates(self): collected_updates = [] - all_layers = self._gather_unique_layers() + all_layers = self._flatten_layers() with backend.get_graph().as_default(): for layer in all_layers: if not layer.trainable and not layer.stateful: @@ -945,7 +938,7 @@ class Layer(base_layer.Layer): A list of tensors. """ collected_losses = [] - all_layers = self._gather_unique_layers() + all_layers = self._flatten_layers() for layer in all_layers: # If any eager losses are present, we assume the model to be part of an # eager training loop (either a custom one or the one used when @@ -1075,8 +1068,7 @@ class Layer(base_layer.Layer): @property def metrics(self): collected_metrics = [] - all_layers = self._gather_unique_layers() - for layer in all_layers: + for layer in self._flatten_layers(): collected_metrics.extend(layer._metrics) return collected_metrics @@ -2187,7 +2179,6 @@ class Layer(base_layer.Layer): super(tracking.AutoTrackable, self).__setattr__( '_layers', [l for l in self._layers if l is not existing_value]) - self._attribute_sentinel.invalidate_all() if isinstance(existing_value, tf_variables.Variable): super(tracking.AutoTrackable, self).__setattr__( '_trainable_weights', @@ -2196,13 +2187,6 @@ class Layer(base_layer.Layer): '_non_trainable_weights', [w for w in self._non_trainable_weights if w is not existing_value]) - # Any time we change `_layers` (either by deleting the attribute or by - # reassigning it which will call __delattr__ from __setattr__) the topology - # of the subgraph of Layers may change. In that case we will need to - # recompute any attribute which depends on that subgraph. - if name == '_layers': - self._attribute_sentinel.invalidate_all() - def __setattr__(self, name, value): if (name == '_self_setattr_tracking' or not getattr(self, '_self_setattr_tracking', True) or @@ -2247,8 +2231,6 @@ class Layer(base_layer.Layer): # container types which compare equal. if not any((layer is value for layer in self._layers)): self._layers.append(value) - if hasattr(value, '_attribute_sentinel'): - value._attribute_sentinel.add_parent(self._attribute_sentinel) if hasattr(value, '_use_resource_variables'): # Legacy layers (V1 tf.layers) must always use # resource variables. @@ -2296,35 +2278,6 @@ class Layer(base_layer.Layer): getattr(layer, attribute) for layer in nested_layers)) return [] - def _gather_unique_layers(self): - """Returns the current layer and all its children depth first deduped. - - We are deduping after getting the layers to maintain the order. - """ - all_layers = self._gather_layers() - unique_layers, seen_layers = [], object_identity.ObjectIdentitySet() - for layer in all_layers: - if layer not in seen_layers: - unique_layers.append(layer) - # Track the Variable's identity to avoid __eq__ issues. 
- seen_layers.add(layer) - return unique_layers - - def _gather_layers(self): - """Returns the current layer and all its children depth first.""" - all_layers = [self] - if hasattr(self, '_layers'): - child_layers = trackable_layer_utils.filter_empty_layer_containers( - self._layers) - for child_layer in child_layers: - all_layers.extend(child_layer._gather_layers()) - return all_layers - - @property - @tracking.cached_per_instance - def _attribute_sentinel(self): - return trackable_layer_utils.AttributeSentinel() - # This is a hack so that the is_layer (within # training/trackable/layer_utils.py) check doesn't get the weights attr. # TODO(b/110718070): Remove when fixed. diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index eec13345295..bdc800d4d1b 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -193,7 +193,6 @@ class Functional(training_lib.Model): self._layer_call_argspecs = {} for layer in self._layers: self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call) - layer._attribute_sentinel.add_parent(self._attribute_sentinel) # Build self.input_names and self.output_names. self._set_output_names() @@ -731,10 +730,6 @@ class Functional(training_lib.Model): self._layers.append(layer) deferred_layers.append(layer) self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call) - - # This allows the added layer to broadcast mutations to the current - # layer, which is necessary to ensure cache correctness. - layer._attribute_sentinel.add_parent(self._attribute_sentinel) layer_set.add(layer) self._handle_deferred_layer_dependencies(deferred_layers) diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py index d8325b98504..a79d541c4e4 100644 --- a/tensorflow/python/keras/engine/sequential.py +++ b/tensorflow/python/keras/engine/sequential.py @@ -188,10 +188,6 @@ class Sequential(functional.Functional): ' of a layer in this model. Update the `name` argument ' 'to pass a unique name.' % (layer.name,)) - # This allows the added layer to broadcast mutations to the current - # layer, which is necessary to ensure cache correctness. - layer._attribute_sentinel.add_parent(self._attribute_sentinel) - self.built = False set_inputs = False if not self._layers: @@ -236,9 +232,6 @@ class Sequential(functional.Functional): self._handle_deferred_layer_dependencies([layer]) self._layer_call_argspecs[layer] = tf_inspect.getfullargspec(layer.call) - # Different Model types add to `._layers` in different ways, so for safety - # we do a cache invalidation to make sure the changes are reflected. 
- self._attribute_sentinel.invalidate_all() @trackable.no_automatic_dependency_tracking def pop(self): @@ -252,7 +245,6 @@ class Sequential(functional.Functional): layer = self._layers.pop() self._layer_call_argspecs.pop(layer) - self._attribute_sentinel.invalidate_all() if not self.layers: self.outputs = None self.inputs = None diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 6c6d9ee897b..adaba76e820 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -628,8 +628,7 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): if self.compiled_metrics is not None: metrics += self.compiled_metrics.metrics - all_layers = self._gather_unique_layers() - for l in all_layers: + for l in self._flatten_layers(): metrics.extend(l._metrics) # pylint: disable=protected-access return metrics @@ -2310,7 +2309,7 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): @property def layers(self): - return self._unique_sublayers() + return list(self._flatten_layers(include_self=False, recursive=False)) def get_layer(self, name=None, index=None): """Retrieves a layer based on either its name (unique) or index. diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index e1180b5234b..111833ba8b5 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -1499,6 +1499,65 @@ class TrainingTest(keras_parameterized.TestCase): new_kernel = model.get_weights()[1] self.assertNotAllEqual(old_kernel, new_kernel) + @keras_parameterized.run_all_keras_modes + def test_layer_ordering(self): + + class MyLayer(layers_module.Layer): + pass + + class MyModel(training_module.Model): + + def __init__(self, name): + super(MyModel, self).__init__(name=name) + + self.weight = variables_lib.Variable(0, name=name) + + self.direct_sublayer = MyLayer(name='direct') + self.direct_sublayer.d = {'d': MyLayer(name='direct/dict')} + + self.dict_sublayer = {'d': MyLayer(name='dict')} + self.dict_sublayer['d'].direct = MyLayer(name='dict/direct') + + model = MyModel('model') + # All sublayers, including self and recursive sublayers. + self.assertEqual(['model', 'direct', 'direct/dict', 'dict', 'dict/direct'], + [l.name for l in model._flatten_layers()]) + # Only direct sublayers, including those in data structures. + self.assertEqual(['direct', 'dict'], [l.name for l in model.layers]) + + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) + def test_trainable_state_setting(self): + + class UpdateLayer(layers_module.Layer): + + def __init__(self): + super(UpdateLayer, self).__init__() + self.v = variables_lib.Variable(0., trainable=False) + + def call(self, x): + self.add_update(lambda: self.v.assign_add(1.)) + return x * self.v + + layer = UpdateLayer() + model_with_updates = sequential.Sequential([layer]) + model_with_updates.compile( + 'sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly()) + + layer.trainable = False + model_without_updates = sequential.Sequential([layer]) + model_without_updates.compile( + 'sgd', 'mse', run_eagerly=testing_utils.should_run_eagerly()) + + x, y = np.ones((10, 1)), np.ones((10, 1)) + + self.assertEqual(self.evaluate(layer.v), 0.) + model_with_updates.fit(x, y, batch_size=10) + # assign_add called. + self.assertEqual(self.evaluate(layer.v), 1.) + model_without_updates.fit(x, y, batch_size=10) + # assign_add not called. 
+ self.assertEqual(self.evaluate(layer.v), 1.) + class TestExceptionsAndWarnings(keras_parameterized.TestCase): diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py index 9f5099e100e..37a3f01272f 100644 --- a/tensorflow/python/keras/models.py +++ b/tensorflow/python/keras/models.py @@ -505,11 +505,6 @@ def _in_place_subclassed_model_reset(model): setattr(model, name, fresh_layer) model._layers.append(fresh_layer) - # The base Layer __setattr__ will invalidate its attribute cache when - # `._layers` is assigned, but it has no way to know when the underlying list - # is mutated so we must explicitly signal the append. - model._attribute_sentinel.invalidate_all() - # Cache original model build attributes (in addition to layers) if (not hasattr(model, '_original_attributes_cache') or model._original_attributes_cache is None): diff --git a/tensorflow/python/keras/saving/saved_model/utils.py b/tensorflow/python/keras/saving/saved_model/utils.py index dedcea02a4f..bd3f0c1b626 100644 --- a/tensorflow/python/keras/saving/saved_model/utils.py +++ b/tensorflow/python/keras/saving/saved_model/utils.py @@ -79,7 +79,7 @@ def use_wrapped_call(layer, call_fn, default_training_value=None, # child layers. This causes `.losses` to only return eager losses. # pylint: disable=protected-access if context.executing_eagerly(): - for i in layer._gather_unique_layers(): + for i in layer._flatten_layers(): if i is not layer: i._eager_losses = [base_layer_utils.REVIVED_LOSS_PLACEHOLDER] # pylint: enable=protected-access From 166a78057fc8f4fa900f5faf0498837098f9860f Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Mon, 1 Jun 2020 17:24:10 -0700 Subject: [PATCH 1507/1533] [XLA:SPMD] Use halo exchange in pad op. Similar to the handling for slice, this allows pad to be partitioned on any dimension. PiperOrigin-RevId: 314235604 Change-Id: I446f0e6e4fc1c175a318897b4e50be3d86dd67e9 --- .../xla/service/spmd/spmd_partitioner.cc | 69 +++++++++++++++---- .../xla/service/spmd/spmd_partitioner_test.cc | 63 +++++++++++++++-- 2 files changed, 111 insertions(+), 21 deletions(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index 122812b78e3..635446a18a1 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -539,9 +539,11 @@ PartitionedHlo::ReshardAsWindowedInput(const Window& window, } swd->set_padding_low(max_pad_low); } else { - CHECK_EQ( - (wd.stride() * per_shard_window_counts[i]) % wd.base_dilation(), 0) - << "General base dilation not yet implemented."; + if ((wd.stride() * per_shard_window_counts[i]) % wd.base_dilation() != + 0) { + // General base dilation not yet implemented. + return absl::nullopt; + } // padding_low on all shards should equal the initially assigned // swd->padding_low(), i.e., the padding_low() on the original window. } @@ -598,7 +600,7 @@ PartitionedHlo::ReshardAsWindowedInput(const Window& window, } if (target != sharding()) { - return Replicate().ReshardAsWindowedInput(window, target, pad_value); + return Reshard(target).ReshardAsWindowedInput(window, target, pad_value); } // Halo exchange. @@ -2267,24 +2269,63 @@ Status SpmdPartitioningVisitor::HandlePad(HloInstruction* hlo) { if (hlo->sharding().IsTileMaximal()) { return DefaultAction(hlo); } + auto lhs = GetPartitionedHlo(hlo->operand(0)); + // Create a window config to represent the pad. 
+ Window window; for (int64 i = 0; i < hlo->shape().rank(); ++i) { const auto& pd = hlo->padding_config().dimensions(i); - // Right now we only support non-padded dimensions to be partitioned. - if (hlo->sharding().tile_assignment().dim(i) > 1 && - (pd.edge_padding_high() != 0 || pd.edge_padding_low() != 0 || - pd.interior_padding() != 0)) { - return DefaultAction(hlo); - } + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_padding_low(pd.edge_padding_low()); + dim->set_padding_high(pd.edge_padding_high()); + dim->set_base_dilation(pd.interior_padding() + 1); } - auto resharded_lhs = - GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + auto replicated_rhs = GetPartitionedHlo(hlo->operand(1)) .Reshard(HloSharding::Replicate()) .hlo(); + auto reshard_operand = + lhs.ReshardAsWindowedInput(window, hlo->sharding(), replicated_rhs, + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + PaddingConfig sharded_padding_config; + bool need_pad = false; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + auto dim = sharded_padding_config.add_dimensions(); + const auto& wd = reshard_operand->shard_window.dimensions(i); + dim->set_edge_padding_low(wd.padding_low()); + dim->set_edge_padding_high(wd.padding_high()); + dim->set_interior_padding(wd.base_dilation() - 1); + if (wd.padding_low() != 0 || wd.padding_high() != 0 || + wd.base_dilation() != 1) { + need_pad = true; + } + } + auto sharded_pad = reshard_operand->sharded_input; + if (need_pad) { + TF_ASSIGN_OR_RETURN(auto sharded_pad_shape, + ShapeInference::InferPadShape(sharded_pad->shape(), + replicated_rhs->shape(), + sharded_padding_config)); + sharded_pad = b_.AddInstruction(hlo->CreatePad(sharded_pad_shape, + sharded_pad, replicated_rhs, + sharded_padding_config)); + } + SetPartitionedHlo(hlo, [&]() { + if (!reshard_operand->dynamic_slice_index_on_output) { + return sharded_pad; + } auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); - return b_.AddInstruction(hlo->CloneWithNewOperands( - shard_shape, {resharded_lhs, replicated_rhs})); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_pad, + *reshard_operand->dynamic_slice_index_on_output, + shard_shape.dimensions())); }); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 26450d9968d..1f0b1d06c1f 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -1782,10 +1782,9 @@ TEST_F(SpmdPartitioningTest, PadAlongNonPartitionedDimension) { HloModule module ENTRY entry { - %param0 = f32[128,14,257] parameter(0) - %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + %param0 = f32[128,14,257] parameter(0), sharding={devices=[1,1,2]0,1} %const = f32[] constant(0) - ROOT %pad = f32[128,17,257] pad(%param0.copy, %const), padding=0_0x1_2x0_0, + ROOT %pad = f32[128,17,257] pad(%param0, %const), padding=0_0x1_2x0_0, sharding={devices=[1,1,2]0,1} })"; @@ -1794,14 +1793,64 @@ ENTRY entry { VLOG(1) << module->ToString(); auto root = module->entry_computation()->root_instruction(); - auto param0 = AllOf( - op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), - op::Constant(), op::Constant(), op::Reshape())), - 
op::Shape("f32[128,14,129]")); + auto param0 = AllOf(op::Parameter(), op::Shape("f32[128,14,129]")); EXPECT_THAT(root, AllOf(op::Pad(param0, op::Constant()), op::Shape("f32[128,17,129]"))); } +TEST_F(SpmdPartitioningTest, PadAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0), sharding={devices=[1,2]0,1} + %const = f32[] constant(0) + ROOT %pad = f32[14,259] pad(%param0, %const), padding=0_0x0_2, + sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Parameter(), op::Shape("f32[14,129]")); + auto after_halo_exchange = + AllOf(op::Shape("f32[14,130]"), + op::Concatenate(param0, op::CollectivePermute(op::Slice(param0)))); + auto pad = AllOf(op::Shape("f32[14,131]"), + op::Pad(after_halo_exchange, op::Constant())); + EXPECT_THAT(root, op::DynamicSlice(pad, op::Constant(), _)); +} + +TEST_F(SpmdPartitioningTest, PadAlongPartitionedDimensionWithInteriorPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[7] parameter(0), sharding={devices=[2]0,1} + %param1 = f32[] parameter(1), sharding={replicated} + ROOT %pad = f32[22] pad(%param0, %param1), padding=2_1_2, + sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + + auto param0 = AllOf(op::Parameter(), op::Shape("f32[4]")); + auto after_halo_exchange = + AllOf(op::Shape("f32[4]"), + op::DynamicSlice( + AllOf(op::Shape("f32[5]"), + op::Concatenate(op::CollectivePermute(op::Slice(param0)), + param0)), + _)); + auto pad = op::Pad(after_halo_exchange, op::Parameter(1)); + EXPECT_THAT(root, op::DynamicSlice(pad, _)); +} + TEST_F(SpmdPartitioningTest, SliceAlongNonPartitionedDimension) { const char* const hlo_string = R"( HloModule module From 5bca62e44a309024772b77a971e32fff2ea7301d Mon Sep 17 00:00:00 2001 From: Ran Chen Date: Mon, 1 Jun 2020 17:53:57 -0700 Subject: [PATCH 1508/1533] Improve multi_process_runner This is to prepare enabling it for OSS. 
PiperOrigin-RevId: 314239700 Change-Id: I3ec9c0984185fc108388d8df2ea64bcca120099c --- tensorflow/python/distribute/BUILD | 9 +- .../python/distribute/multi_process_lib.py | 32 +- .../python/distribute/multi_process_runner.py | 532 +++++++++--------- .../distribute/multi_process_runner_test.py | 58 +- .../multi_worker_continuous_run_test.py | 2 +- tensorflow/python/keras/distribute/BUILD | 2 +- .../multi_worker_callback_tf2_test.py | 3 +- 7 files changed, 336 insertions(+), 302 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 26027d46c98..140b3089e32 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1695,11 +1695,14 @@ cuda_py_test( py_library( name = "multi_process_runner", srcs = ["multi_process_runner.py"], + srcs_version = "PY3", deps = [ ":multi_process_lib", "//tensorflow/python:client_testlib", "//tensorflow/python:tf2", "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/eager:context", + "@absl_py//absl/logging", "@six_archive//:six", ], ) @@ -1707,18 +1710,19 @@ py_library( py_library( name = "multi_process_lib", srcs = ["multi_process_lib.py"], - deps = ["@six_archive//:six"], + deps = ["//tensorflow/python:client_testlib"], ) py_test( name = "multi_process_runner_test", srcs = ["multi_process_runner_test.py"], python_version = "PY3", - shard_count = 12, deps = [ ":multi_process_runner", ":multi_worker_test_base", "//tensorflow/python/eager:test", + "@absl_py//absl/logging", + "@six_archive//:six", ], ) @@ -1726,6 +1730,7 @@ py_test( name = "multi_process_runner_no_init_test", srcs = ["multi_process_runner_no_init_test.py"], python_version = "PY3", + tags = ["no_oss"], deps = [ ":multi_process_runner", ":multi_worker_test_base", diff --git a/tensorflow/python/distribute/multi_process_lib.py b/tensorflow/python/distribute/multi_process_lib.py index f3b03ca8bc4..ae9aa494062 100644 --- a/tensorflow/python/distribute/multi_process_lib.py +++ b/tensorflow/python/distribute/multi_process_lib.py @@ -18,9 +18,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import contextlib +import multiprocessing as _multiprocessing import unittest +from tensorflow.python.platform import test + + +try: + multiprocessing = _multiprocessing.get_context('forkserver') +except ValueError: + # forkserver is not available on Windows. + multiprocessing = _multiprocessing.get_context('spawn') + class Process(object): """A process simulating a worker for testing multi-worker training.""" @@ -31,20 +40,11 @@ class Process(object): 'TODO(b/141874796): Implement OSS version of `multi_process_lib`') -def get_user_data(): - """Returns the data commonly shared by parent process and subprocesses.""" - # TODO(b/141874796): Implement OSS version of `multi_process_lib`. - pass +def test_main(): + """Main function to be called within `__main__` of a test file.""" + test.main() -@contextlib.contextmanager -def context_manager(max_subprocess_count=20, barrier_parties=0): - """No-op in OSS. 
This exists to maintain testing compatibility.""" - del max_subprocess_count, barrier_parties - yield - - -def using_context_manager(): - """Whether the context manager is being used.""" - raise unittest.SkipTest( - 'TODO(b/141874796): Implement OSS version of `multi_process_lib`') +def initialized(): + """Returns whether the module is initialized.""" + return True diff --git a/tensorflow/python/distribute/multi_process_runner.py b/tensorflow/python/distribute/multi_process_runner.py index e258d98fb8c..924e7b85b12 100644 --- a/tensorflow/python/distribute/multi_process_runner.py +++ b/tensorflow/python/distribute/multi_process_runner.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function + import collections import contextlib import json @@ -26,15 +27,19 @@ import signal import sys import threading import time + from absl import logging +import dill import six from six.moves import queue as Queue +import tblib.pickling_support from tensorflow.python import tf2 from tensorflow.python.compat import v2_compat from tensorflow.python.distribute import multi_process_lib from tensorflow.python.eager import context -from tensorflow.python.platform import test + +multiprocessing = multi_process_lib.multiprocessing # pylint: disable=g-import-not-at-top try: @@ -43,61 +48,45 @@ try: except ImportError: faulthandler = None +# For pickling traceback objects. +tblib.pickling_support.install() + # _ProcessStatusInfo contains process status information. When is_successful # attribute is True, the subprocess has ended successfully, or if False, the # exception stack trace info is stored in exc_info to pass on to parent process # to be re-raised. _ProcessStatusInfo = collections.namedtuple( - '_ProcessStatusInfo', ['task_type', 'is_successful', 'exc_info']) - -# _SubprocessInfo collects basic information of a subprocess such as task type -# and process id. -# TODO(rchao): Include task_type and task_id in subprocess info. -_SubprocessInfo = collections.namedtuple('_SubprocessInfo', ['pid']) + '_ProcessStatusInfo', + ['task_type', 'is_successful', 'exc_info', 'return_value']) # Information returned from a successful MultiProcessRunner run. MultiProcessRunnerResult = collections.namedtuple('MultiProcessRunnerResult', ['return_value', 'stdout']) -# Process status queue is used by `multi_process_runner` internally for -# communication from subprocesses to the parent process for whether it's been -# successful, and if not what the error stack trace is. -PROCESS_STATUS_QUEUE = 'process_status_queue' +TestEnvironment = collections.namedtuple('TestEnvironment', [ + 'task_type', 'task_id', 'cluster_spec', 'rpc_layer', 'grpc_fail_fast', + 'v2_enabled', 'executing_eagerly' +]) -# Return value queue is intended to be used by users of `multi_process_runner` -# for the process function to return information to the caller of -# `multi_process_runner.run()`. -RETURN_VALUE_QUEUE = 'return_value_queue' - -# Subprocess info queue stores `_SubprocessInfo` for later potential -# termination by the parent. -SUBPROCESS_INFO_QUEUE = 'subprocess_info_queue' - -# Parent-to-sub queue is used for communications from parent to subprocess. -# Currently this is only used to terminate subprocesses. +# Resources for communication between worker processes and the main process. 
+# +# `process_status_queue` is used by `multi_process_runner` internally for +# communication from subprocesses to the parent process for whether it's been +# successful, and if not what the error stack trace is. +# `parent_to_sub_queue` is used for communications from parent to subprocess. +# Currently this is only used to terminate subprocesses. # TODO(rchao): Remove this once subprocess is terminated by SIGKILL. -PARENT_TO_SUB_QUEUE = 'parent_to_sub_queue' - -# Streaming queue stores the logged and printed messages from subprocesses. -STREAMING_QUEUE = 'streaming_queue' - -# Pipes to stream stdout and stderr from subprocesses to parent process. -STREAMING_PIPE = 'streaming_pipe' - -# Barrier identifier. -BARRIER = 'barrier' - -_DEFAULT_MAX_SUBPROCESS_COUNT = 20 +# `streaming_pipe_w` is to stream stdout and stderr from subprocesses to parent +# process. +# `barrier` is a barrier for the party of all subprocesses. +Resources = collections.namedtuple('Resources', [ + 'process_status_queue', 'parent_to_sub_queue', 'streaming_pipe_w', 'barrier' +]) # Default time out sec is selected so that it's handled before the default # "medium" timeout of the test runs. _DEFAULT_TIMEOUT_SEC = 200 -# Next pipe index to be global so that pipes are not reused across multiple -# MultiProcessRunner usages. -# TODO(rchao): Investigate possibility to remove this variable. -_next_pipe_index = 0 - class MultiProcessRunner(object): """A utility class to start multiple processes to simulate a cluster. @@ -123,6 +112,7 @@ class MultiProcessRunner(object): grpc_fail_fast=None, stream_stdout=True, list_stdout=False, + use_dill_for_args=True, args=None, kwargs=None): """Creates a multi-process runner. @@ -153,6 +143,9 @@ class MultiProcessRunner(object): returned from `MultiProcessRunner.join()`. If True, the list of stdout can be retrieved via `MultiProcessRunnerResult.stdout` attribute. Defaults to False. + use_dill_for_args: Whether to use dill to pickle `args` and `kwargs`. dill + can pickle more objects, but doesn't work with types in + `multiprocessing` library like `Mutex`. args: Positional arguments to be sent to functions run on processes. kwargs: Keyword arguments to be sent to functions run on processes. @@ -166,15 +159,15 @@ class MultiProcessRunner(object): 'one chief. Current `cluster_spec` has {} chiefs.' .format(len(cluster_spec['chief']))) - assert callable(proc_func) - - if not multi_process_lib.using_context_manager(): + if not multi_process_lib.initialized(): raise RuntimeError('`multi_process_runner` is not initialized. ' 'Please call `multi_process_runner.test_main()` ' 'within `if __name__ == \'__main__\':` block ' 'in your python module to properly initialize ' '`multi_process_runner`.') + assert callable(proc_func) + self._proc_func = proc_func self._cluster_spec = cluster_spec self._rpc_layer = rpc_layer @@ -184,62 +177,86 @@ class MultiProcessRunner(object): # TODO(rchao): Revisit list_stdout argument to consider other solution. self._list_stdout = list_stdout self._dependence_on_chief = True + self._use_dill_for_args = use_dill_for_args self._args = args or () self._kwargs = kwargs or {} - self._outstanding_subprocess_count = 0 - # Child processes should have the same v2 and eager behavior. 
self._v2_enabled = tf2.enabled() self._executing_eagerly = context.executing_eagerly() + self._joined = False + self._processes = {} + self._outstanding_subprocess_count = 0 + self._reading_threads = [] + + self._manager = multiprocessing.Manager() + self._process_status_queue = self._manager.Queue() + self._parent_to_sub_queue = self._manager.Queue() + parties = sum(len(addresses) for addresses in self._cluster_spec.values()) + self._barrier = self._manager.Barrier(parties) + + # We use a queue to collect outputs from worker processes since it's thread + # safe. + self._streaming_queue = self._manager.Queue() + # This flag will be set to True once terminate_all() is called. self._all_forced_terminated = False def _continuously_readline_from_sub(self, pipe_r, task_type, task_id): """Function to continuously read lines from subprocesses.""" - reader = os.fdopen(pipe_r.fileno(), 'r') - while True: - read_line = reader.readline() - if read_line == 'EOF': - reader.close() - # The thread that runs `_continuously_readline_from_sub` stops here. - # However the threads don't exit until the test exits, so we do not - # attempt to join the threads (which leads to timeout). - # TODO(rchao): Understand why and do thread joining. - break - task_string = '[{}-{}]:'.format(task_type, task_id) - formatted_line = '{} {}'.format(task_string.ljust(14), read_line) - if self._stream_stdout: - self._print_stdout_in_parent(formatted_line, task_type, task_id) - if self._list_stdout: - self._add_stdout_in_queue(formatted_line, task_type, task_id) + with os.fdopen(pipe_r.fileno(), 'r', closefd=False) as reader: + for line in reader: + task_string = '[{}-{}]:'.format(task_type, task_id) + formatted_line = '{} {}'.format(task_string.ljust(14), line) + if self._stream_stdout: + # TODO(rchao): Use a lock here to ensure the printed lines are not + # broken. + print(formatted_line, end='', flush=True) + if self._list_stdout: + self._streaming_queue.put(formatted_line) - def _print_stdout_in_parent(self, formatted_line, task_type, task_id): - del task_type, task_id - # Flush True so the logging order from subprocesses is respected. - # TODO(rchao): Use a lock here to ensure the printed lines are not broken. - print(formatted_line, end='', flush=True) - - def _add_stdout_in_queue(self, formatted_line, task_type, task_id): - del task_type, task_id - # A queue instead of a simple list is used here due to b/150652733. 
- _resource(STREAMING_QUEUE).put(formatted_line) - - def _start_subprocess_and_reading_thread(self, proc_func, task_type, task_id, - cluster_spec, args, kwargs): + def _start_subprocess_and_reading_thread(self, + task_type, + task_id, + cluster_spec=None, + proc_func=None, + args=None, + kwargs=None): """Start a subprocess and a thread the reads lines from the subprocess.""" - global _next_pipe_index - pipe_r, pipe_w = _resource(STREAMING_PIPE)[_next_pipe_index] - _next_pipe_index += 1 - p = multi_process_lib.Process( - target=_Subprocess(), - args=(proc_func, task_type, task_id, cluster_spec, self._rpc_layer, - self._grpc_fail_fast, self._v2_enabled, self._executing_eagerly, - pipe_w) + args, - kwargs=kwargs) + test_env = TestEnvironment( + task_type=task_type, + task_id=task_id, + cluster_spec=cluster_spec or self._cluster_spec, + rpc_layer=self._rpc_layer, + grpc_fail_fast=self._grpc_fail_fast, + v2_enabled=self._v2_enabled, + executing_eagerly=self._executing_eagerly, + ) + pipe_r, pipe_w = multiprocessing.Pipe(duplex=False) + resources = Resources( + process_status_queue=self._process_status_queue, + parent_to_sub_queue=self._parent_to_sub_queue, + streaming_pipe_w=pipe_w, + barrier=self._barrier, + ) + if proc_func is None: + proc_func, args, kwargs = self._proc_func, self._args, self._kwargs + # Always use dill to pickle proc_func so that we support more callable + # types, e.g. lambda. + proc_func = dill.dumps(proc_func, dill.HIGHEST_PROTOCOL) + if self._use_dill_for_args: + args = dill.dumps(args, dill.HIGHEST_PROTOCOL) + kwargs = dill.dumps(kwargs, dill.HIGHEST_PROTOCOL) + + p = _Process( + test_env=test_env, + target=_ProcFunc(), + args=(resources, test_env, proc_func, args, kwargs, + self._use_dill_for_args)) p.start() + self._processes[(task_type, task_id)] = p self._outstanding_subprocess_count += 1 # For each subprocess, we dedicate a thread continuously reading lines @@ -248,18 +265,15 @@ class MultiProcessRunner(object): target=self._continuously_readline_from_sub, args=(pipe_r, task_type, task_id)) thread.start() + self._reading_threads.append(thread) def start(self): """Starts processes, one for each task in `cluster_spec`.""" - - global _next_pipe_index - self._starting_pipe_index = _next_pipe_index - + if self._processes: + raise ValueError('MultiProcessRunner already started.') for task_type, addresses in self._cluster_spec.items(): for task_id, _ in enumerate(addresses): - self._start_subprocess_and_reading_thread(self._proc_func, task_type, - task_id, self._cluster_spec, - self._args, self._kwargs) + self._start_subprocess_and_reading_thread(task_type, task_id) # TODO(rchao): Remove the need of using SIGALRM if possible. At this time, # without this the tests become very flaky. @@ -309,33 +323,22 @@ class MultiProcessRunner(object): as_task_type: The task type to be run in the main process. as_task_id: The task id to be run in the main process. 
""" - global _next_pipe_index - self._starting_pipe_index = _next_pipe_index - + if self._processes: + raise ValueError('MultiProcessRunner already started.') for task_type, addresses in self._cluster_spec.items(): for task_id, _ in enumerate(addresses): if not (task_type == as_task_type and task_id == as_task_id): - self._start_subprocess_and_reading_thread(self._proc_func, task_type, - task_id, self._cluster_spec, - self._args, self._kwargs) - tf_config_dict = { - 'cluster': self._cluster_spec, - 'task': { - 'type': as_task_type, - 'index': as_task_id, - }, - } - if self._rpc_layer is not None: - tf_config_dict['rpc_layer'] = self._rpc_layer - os.environ['TF_CONFIG'] = json.dumps(tf_config_dict) + self._start_subprocess_and_reading_thread(task_type, task_id) + _set_tf_config(as_task_type, as_task_id, self._cluster_spec, + self._rpc_layer) self._proc_func(*self._args, **self._kwargs) def start_single_process(self, task_type, task_id, - proc_func=None, cluster_spec=None, + proc_func=None, args=None, kwargs=None): """Starts a single process. @@ -352,19 +355,22 @@ class MultiProcessRunner(object): Args: task_type: The task type. task_id: The task id. - proc_func: The process function to be run on the newly started - process. If `None`, the function provided at `__init__` will be used. cluster_spec: The cluster spec to be used on the newly started process. If `None`, the cluster spec provided at `__init__` will be used. + proc_func: The process function to be run on the newly started + process. If specified, specify `args` and `kwargs` as well. If `None`, + the function provided at `__init__` will be used. args: Optional positional arguments to be supplied in `proc_func`. kwargs: Optional keyword arguments to be supplied in `proc_func`. """ - cluster_spec = cluster_spec or self._cluster_spec - proc_func = proc_func or self._proc_func - self._start_subprocess_and_reading_thread(proc_func, task_type, task_id, - cluster_spec, args or (), - kwargs or {}) + self._start_subprocess_and_reading_thread( + task_type, + task_id, + cluster_spec=cluster_spec, + proc_func=proc_func, + args=args or (), + kwargs=kwargs or {}) def _queue_to_list(self, queue_to_convert): """Convert `queue.Queue` to `list`.""" @@ -377,6 +383,18 @@ class MultiProcessRunner(object): break return list_to_return + def _join_or_terminate(self, task_type, task_id, process, timeout): + """Joins a process. If it times out, terminate all procsses.""" + logging.info('joining %s-%d', task_type, task_id) + process.join(timeout) + # If exitcode is None, the process aren't terminated and this is a + # timeout. + if process.exitcode is None: + # Force termination to dump worker processes stack trace. + self.terminate_all(sig=signal.SIGTERM) + raise RuntimeError('%s-%d and possibly more subprocesses timed out.' % + (task_type, task_id)) + def join(self, timeout=_DEFAULT_TIMEOUT_SEC): """Joins all the processes with timeout. @@ -395,88 +413,97 @@ class MultiProcessRunner(object): RuntimeError: if not all processes report status approximatelty within `timeout` seconds, or there's an exception propagated from any subprocess. 
""" + if self._joined: + raise ValueError("MultiProcessRunner can't be joined twice.") + self._joined = True - if not timeout: - timeout = float('inf') - start_time = time.time() - while self._outstanding_subprocess_count > 0: - while True: - try: - process_status = _resource(PROCESS_STATUS_QUEUE).get(timeout=10) - break - except Queue.Empty: - if self._all_forced_terminated: - break - if time.time() - start_time > timeout: - # Send SIGTERM signal to subprocesses to dump their current - # stack trace. - self.terminate_all(sig=signal.SIGTERM) - # If none of those did, report timeout to user. - raise RuntimeError('One or more subprocesses timed out. ' - 'Number of outstanding subprocesses ' - 'is %d.' % self._outstanding_subprocess_count) + chief = self._processes.get(('chief', 0), None) + if self._dependence_on_chief and chief: + self._join_or_terminate('chief', 0, chief, timeout) + # Give other processes a chance to exit on their own. + for p in self._processes.values(): + p.join(timeout=3) + self.terminate_all() + else: + for (task_type, task_id), p in self._processes.items(): + self._join_or_terminate(task_type, task_id, p, timeout) - if self._all_forced_terminated: - break - self._outstanding_subprocess_count -= 1 + for (task_type, task_id), p in self._processes.items(): + logging.info('%s-%d exit code: %s', task_type, task_id, p.exitcode) + + process_statuses = self._queue_to_list(self._process_status_queue) + if not self._all_forced_terminated and len( + process_statuses) != self._outstanding_subprocess_count: + raise RuntimeError( + 'missing statuses from %d subproceses.' % + (self._outstanding_subprocess_count - len(process_statuses))) + return_values = [] + for process_status in process_statuses: assert isinstance(process_status, _ProcessStatusInfo) if not process_status.is_successful: six.reraise(*process_status.exc_info) + if process_status.return_value is not None: + return_values.append(process_status.return_value) - if self._dependence_on_chief and process_status.task_type == 'chief': - self.terminate_all() - break + logging.info('Joining log reading threads.') + for thread in self._reading_threads: + thread.join() + logging.info('Joined log reading threads.') - # Giving threads some time to finish the message reading from subprocesses. - time.sleep(5) + # Clear the alarm. + signal.alarm(0) - stdout = self._queue_to_list(_resource(STREAMING_QUEUE)) - return_value = self._queue_to_list(_resource(RETURN_VALUE_QUEUE)) + stdout = self._queue_to_list(self._streaming_queue) - # Notifying the threads that are reading lines that we should stop. - for pipe_index in range(self._starting_pipe_index, _next_pipe_index): # pylint: disable=protected-access - _, pipe_w = _resource(STREAMING_PIPE)[pipe_index] - writer = os.fdopen(pipe_w.fileno(), 'w') - # Writing end of file message so the threads that's actively reading lines - # know to stop. - writer.writelines(['EOF']) - writer.close() - - return MultiProcessRunnerResult(stdout=stdout, return_value=return_value) + return MultiProcessRunnerResult(stdout=stdout, return_value=return_values) def terminate(self, task_type, task_id): """Terminates the process with `task_type` and `task_id`.""" - _resource(PARENT_TO_SUB_QUEUE).put('terminate {} {}'.format( - task_type, task_id)) + p = self._processes.get((task_type, task_id), None) + if p is None: + raise ValueError('{}-{} does not exist'.format(task_type, task_id)) + # TODO(crccw): change to use Process.terminate() as well. 
+ self._parent_to_sub_queue.put('terminate {} {}'.format(task_type, task_id)) + p.join() def terminate_all(self, sig=None): """Terminates all subprocesses.""" # Use SIGKILL as default. In systems where that's unavailable such as # windows, use SIGTERM. sig = sig or getattr(signal, 'SIGKILL', signal.SIGTERM) - subprocess_infos = [] - - while True: + for (task_type, task_id), p in self._processes.items(): try: - subprocess_info = _resource(SUBPROCESS_INFO_QUEUE).get(block=False) - subprocess_infos.append(subprocess_info) - except Queue.Empty: - break - - for subprocess_info in subprocess_infos: - logging.info('Parent process is now killing PID: %d', subprocess_info.pid) - try: - os.kill(subprocess_info.pid, sig) + os.kill(p.pid, sig) except ProcessLookupError: - # TODO(rchao): Remove subprocess info from the queue once a subprocess - # is terminated. - logging.info('PID %d does not exist.', subprocess_info.pid) - + logging.info('Attempting to kill %s-%d but it does not exist.', + task_type, task_id) self._all_forced_terminated = True -class _Subprocess(object): - """Represents an internal subprocess used in MultiProcessRunner's context.""" +class _Process(multi_process_lib.Process): + """A modified `multiprocessing.Process` that can set up environment variables.""" + + # TODO(crccw): consider moving other logics in _ProcFunc to _Process. + + def __init__(self, test_env, **kwargs): + super(_Process, self).__init__(**kwargs) + self._test_env = test_env + self._actual_run = getattr(self, 'run') + self.run = self._run_with_setenv + + def _run_with_setenv(self): + # We need to set environment variables before doing anything because + # setenv() is not thread-safe. + test_env = self._test_env + if test_env.grpc_fail_fast is not None: + os.environ['GRPC_FAIL_FAST'] = str(test_env.grpc_fail_fast) + _set_tf_config(test_env.task_type, test_env.task_id, test_env.cluster_spec, + test_env.rpc_layer) + return self._actual_run() + + +class _ProcFunc(object): + """Represents a callable to run in a subprocess.""" @contextlib.contextmanager def _runtime_mode(self, executing_eagerly): @@ -487,21 +514,12 @@ class _Subprocess(object): with context.graph_mode(): yield - def _finish_process(self, process_status_info, return_value): - """Adds data to queues before program exits.""" - # Clear the alarm. - signal.alarm(0) - - if return_value is not None: - self._add_return_data(return_value) - _resource(PROCESS_STATUS_QUEUE).put(process_status_info) - def _message_checking_func(self, task_type, task_id): """A function that regularly checks messages from parent process.""" # TODO(rchao): Remove this once parent uses SIGKILL to terminate subprocess. while True: try: - message = _resource(PARENT_TO_SUB_QUEUE).get(block=False) + message = self._resources.parent_to_sub_queue.get(block=False) # Currently the only possible message is termination. if not message.startswith('terminate'): @@ -512,62 +530,75 @@ class _Subprocess(object): else: # If the message is not targeting this process, put it back to the # queue. - _resource(PARENT_TO_SUB_QUEUE).put(message) + self._resources.parent_to_sub_queue.put(message) time.sleep(1) except Queue.Empty: time.sleep(0.1) - self._finish_process( + self._resources.process_status_queue.put( _ProcessStatusInfo( - task_type=task_type, is_successful=True, exc_info=None), None) + task_type=task_type, + is_successful=True, + exc_info=None, + return_value=None)) # `os._exit(0)` is used to more reliably terminate a subprocess. 
os._exit(0) # pylint: disable=protected-access - def __call__(self, proc_func, task_type, task_id, per_process_cluster_spec, - rpc_layer, grpc_fail_fast, v2_enabled, executing_eagerly, pipe_w, - *arg, **kwargs): + def _close_streaming(self): + """Close stdout, stderr and streaming pipe. + + We need to explicitly close them since Tensorflow may take a while to exit, + so that the reading threads in the main process can exit more quickly. + """ + sys.stdout.flush() + sys.stderr.flush() + sys.stdout.close() + sys.stderr.close() + self._resources.streaming_pipe_w.close() + + def __call__(self, resources, test_env, proc_func, args, kwargs, + use_dill_for_args): """The wrapper function that actually gets run in child process(es).""" + global _barrier + + self._resources = resources + _barrier = self._resources.barrier + proc_func = dill.loads(proc_func) + if use_dill_for_args: + args = dill.loads(args) + kwargs = dill.loads(kwargs) + if faulthandler is not None: faulthandler.enable() faulthandler.register(signal.SIGTERM, chain=True) + # All logging should go to stderr to be streamed to the main process. + logging.set_stderrthreshold(logging.DEBUG) + + # Assign sys.stdout and sys.stderr as duplicates of `streaming_pipe_w` so + # print() and logging.*() write directly to `streaming_pipe_w`. + # Unfortunately since we cannot prepend task_type and task_id information to + # the streamed logs we will need a thread per subprocess to distinguish + # where the piece of message is from. + os.dup2(resources.streaming_pipe_w.fileno(), sys.stdout.fileno()) + os.dup2(resources.streaming_pipe_w.fileno(), sys.stderr.fileno()) + pid = os.getpid() logging.info('Subprocess with PID %d (%s, %d) is now being started.', pid, - task_type, task_id) - _resource(SUBPROCESS_INFO_QUEUE).put(_SubprocessInfo(pid=pid)) - # Assign sys.stdout and sys.stderr as duplicates of `pipe_w` so print() and - # logging.*() write directly to `pipe_w`. Unfortunately since we cannot - # prepend task_type and task_id information to the streamed logs we will - # need a thread per subprocess to distinguish where the piece of message is - # from. - os.dup2(pipe_w.fileno(), sys.stdout.fileno()) - os.dup2(pipe_w.fileno(), sys.stderr.fileno()) + test_env.task_type, test_env.task_id) # The thread will be dedicated to checking messages from the parent process. threading.Thread( # pylint: disable=unexpected-keyword-arg target=self._message_checking_func, - args=(task_type, task_id), + args=(test_env.task_type, test_env.task_id), daemon=True).start() - if grpc_fail_fast is not None: - os.environ['GRPC_FAIL_FAST'] = str(grpc_fail_fast) - tf_config_dict = { - 'cluster': per_process_cluster_spec, - 'task': { - 'type': task_type, - 'index': task_id, - }, - } - if rpc_layer is not None: - tf_config_dict['rpc_layer'] = rpc_layer - os.environ['TF_CONFIG'] = json.dumps(tf_config_dict) - - if v2_enabled: + if test_env.v2_enabled: v2_compat.enable_v2_behavior() try: - with self._runtime_mode(executing_eagerly): - return_value = proc_func(*arg, **kwargs) + with self._runtime_mode(test_env.executing_eagerly): + return_value = proc_func(*args, **kwargs) is_successful = True exc_info = None @@ -587,35 +618,27 @@ class _Subprocess(object): raise finally: - self._finish_process( - _ProcessStatusInfo( - task_type=task_type, - is_successful=is_successful, - exc_info=exc_info), - return_value) - - def _add_return_data(self, data): - """Adds return data that will be returned by `join`. 
- - The function provides a way for child processes to communicate with the - parent process. Data passed to `_add_return_data` will be available in a - Python Queue.Queue that is eventually returned by `join`. - - Args: - data: data to be made available in the queue returned by `join`. - """ - # TODO(rchao): Incorporate the task type and id information in a data - # wrapper that becomes what is stored in the queue so we can tell where - # the data is from. - _resource(RETURN_VALUE_QUEUE).put(data) + info = _ProcessStatusInfo( + task_type=test_env.task_type, + is_successful=is_successful, + exc_info=exc_info, + return_value=return_value) + self._resources.process_status_queue.put(info) + self._close_streaming() -def barrier(): - return multi_process_lib.get_user_data()[BARRIER] - - -def _resource(resource_name): - return multi_process_lib.get_user_data()[resource_name] +def _set_tf_config(task_type, task_id, cluster_spec, rpc_layer=None): + """Set TF_CONFIG environment variable.""" + tf_config_dict = { + 'cluster': cluster_spec, + 'task': { + 'type': task_type, + 'index': task_id, + }, + } + if rpc_layer is not None: + tf_config_dict['rpc_layer'] = rpc_layer + os.environ['TF_CONFIG'] = json.dumps(tf_config_dict) def run(proc_func, @@ -651,16 +674,19 @@ def run(proc_func, return runner.join(timeout) -def test_main(max_subprocess_count=_DEFAULT_MAX_SUBPROCESS_COUNT, - barrier_parties=0): - """Main function to be called within `__main__` of a test file. +# This is set by MultiProcessRunner in worker processes. +_barrier = None - Args: - max_subprocess_count: Maximum number of subprocesses that will be used. User - of multi_process_runner needs to determine a number at calling this - method, and the subprocesses involved later should not exceed this number. - barrier_parties: Number of parties the barrier will be used toward. User of - multi_process_runner needs to determine a number at calling this method. - """ - with multi_process_lib.context_manager(max_subprocess_count, barrier_parties): - test.main() + +def barrier(): + if _barrier is None: + raise ValueError( + 'barrier is not defined. It is likely because you are calling barrier()' + 'in the main process. barrier() can only be called in the subprocesses.' 
+ ) + return _barrier + + +def test_main(): + """Main function to be called within `__main__` of a test file.""" + multi_process_lib.test_main() diff --git a/tensorflow/python/distribute/multi_process_runner_test.py b/tensorflow/python/distribute/multi_process_runner_test.py index 1413777d0bc..cf68ffd50d7 100644 --- a/tensorflow/python/distribute/multi_process_runner_test.py +++ b/tensorflow/python/distribute/multi_process_runner_test.py @@ -23,15 +23,13 @@ import os import threading import time from absl import logging -from six.moves import queue as Queue from tensorflow.python.distribute import multi_process_runner from tensorflow.python.distribute import multi_worker_test_base from tensorflow.python.eager import test -def proc_func_that_adds_task_type_in_return_data(test_obj, val): - test_obj.assertEqual(val, 3) +def proc_func_that_adds_task_type_in_return_data(): return multi_worker_test_base.get_task_type() @@ -51,6 +49,10 @@ def proc_func_that_return_args_and_kwargs(*args, **kwargs): return list(args) + list(kwargs.items()) +def proc_func_with_barrier(): + return multi_process_runner.barrier() + + class MultiProcessRunnerTest(test.TestCase): def _worker_idx(self): @@ -61,8 +63,7 @@ class MultiProcessRunnerTest(test.TestCase): mpr_result = multi_process_runner.run( proc_func_that_adds_task_type_in_return_data, multi_worker_test_base.create_cluster_spec( - num_workers=2, num_ps=3, has_eval=1), - args=(self, 3)) + num_workers=2, num_ps=3, has_eval=1)) job_count_dict = {'worker': 2, 'ps': 3, 'evaluator': 1} for data in mpr_result.return_value: @@ -124,36 +125,22 @@ class MultiProcessRunnerTest(test.TestCase): def test_process_that_exits(self): - def func_to_exit_in_15_sec(): - time.sleep(5) - print('foo', flush=True) - time.sleep(20) - print('bar', flush=True) + def func_to_exit_in_5_sec(): + logging.error('foo') + time.sleep(10) + logging.error('bar') mpr = multi_process_runner.MultiProcessRunner( - func_to_exit_in_15_sec, + func_to_exit_in_5_sec, multi_worker_test_base.create_cluster_spec(num_workers=1), list_stdout=True, - max_run_time=15) + max_run_time=5) mpr.start() stdout = mpr.join().stdout self.assertLen([msg for msg in stdout if 'foo' in msg], 1) self.assertLen([msg for msg in stdout if 'bar' in msg], 0) - def test_signal_doesnt_fire_after_process_exits(self): - mpr = multi_process_runner.MultiProcessRunner( - proc_func_that_does_nothing, - multi_worker_test_base.create_cluster_spec(num_workers=1), - max_run_time=10) - mpr.start() - mpr.join() - with self.assertRaisesRegexp(Queue.Empty, ''): - # If the signal was fired, another message would be added to internal - # queue, so verifying it's empty. 
- multi_process_runner._resource( - multi_process_runner.PROCESS_STATUS_QUEUE).get(block=False) - def test_termination(self): def proc_func(): @@ -192,7 +179,7 @@ class MultiProcessRunnerTest(test.TestCase): multi_worker_test_base.create_cluster_spec(num_workers=2), list_stdout=True) mpr.start() - time.sleep(5) + time.sleep(3) mpr.terminate('worker', 0) mpr.start_single_process('worker', 0) std_stream_results = mpr.join().stdout @@ -273,11 +260,14 @@ class MultiProcessRunnerTest(test.TestCase): has_chief=True, num_workers=1), list_stdout=True) - def follow_ups(): + def eval_func(): + time.sleep(1) mpr.start_single_process(task_type='evaluator', task_id=0) - threading.Thread(target=follow_ups).start() + eval_thread = threading.Thread(target=eval_func) + eval_thread.start() mpr.start_in_process_as(as_task_type='chief', as_task_id=0) + eval_thread.join() list_to_assert = mpr.join().stdout for job in ['worker', 'evaluator']: for iteration in range(5): @@ -285,5 +275,17 @@ class MultiProcessRunnerTest(test.TestCase): any('{}-0, i: {}'.format(job, iteration) in line for line in list_to_assert)) + def test_barrier(self): + multi_process_runner.run( + proc_func_with_barrier, + cluster_spec=multi_worker_test_base.create_cluster_spec( + has_chief=True, num_workers=1), + ) + + def test_barrier_called_in_main_process(self): + with self.assertRaises(ValueError): + multi_process_runner.barrier() + + if __name__ == '__main__': multi_process_runner.test_main() diff --git a/tensorflow/python/distribute/multi_worker_continuous_run_test.py b/tensorflow/python/distribute/multi_worker_continuous_run_test.py index 437255c1015..14e0564874b 100644 --- a/tensorflow/python/distribute/multi_worker_continuous_run_test.py +++ b/tensorflow/python/distribute/multi_worker_continuous_run_test.py @@ -127,4 +127,4 @@ class MultiWorkerContinuousRunTest(test.TestCase, parameterized.TestCase): if __name__ == '__main__': - multi_process_runner.test_main(barrier_parties=NUM_WORKERS) + multi_process_runner.test_main() diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index f989d93e82e..ddf274f299f 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -364,7 +364,7 @@ py_test( name = "multi_worker_callback_tf2_test", srcs = ["multi_worker_callback_tf2_test.py"], python_version = "PY3", - shard_count = 10, + shard_count = 5, deps = [ "//tensorflow/python/distribute:collective_all_reduce_strategy", "//tensorflow/python/distribute:combinations", diff --git a/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py b/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py index 8daa46f6ea3..660a8e8cb6c 100644 --- a/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py +++ b/tensorflow/python/keras/distribute/multi_worker_callback_tf2_test.py @@ -208,6 +208,7 @@ class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): callbacks.BackupAndRestore(backup_dir=bar_dir), AssertCallback() ]) + multi_process_runner.barrier() test_obj.assertFalse(file_io.file_exists(backup_filepath)) test_obj.assertTrue(file_io.file_exists(saving_filepath)) @@ -343,4 +344,4 @@ class KerasCallbackMultiProcessTest(parameterized.TestCase, test.TestCase): if __name__ == '__main__': - multi_process_runner.test_main(barrier_parties=2) + multi_process_runner.test_main() From 6be6d3b7ea72b369a9691bbfd1d0874f1127a3a3 Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Mon, 1 Jun 2020 18:23:56 -0700 Subject: [PATCH 
1509/1533] Make it clear that gen_selected_ops support multiple models PiperOrigin-RevId: 314243469 Change-Id: Id19f7926d60d340222b258a3eb53010ddea4dd89 --- tensorflow/lite/build_def.bzl | 11 +++++++--- .../lite/tools/gen_op_registration_main.cc | 21 ++++++++++++------- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index fd51ad0a4aa..285824a613f 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -634,7 +634,7 @@ def gen_selected_ops(name, model, namespace = "", **kwargs): Args: name: Name of the generated library. - model: TFLite model to interpret. + model: TFLite models to interpret, expect a list in case of multiple models. namespace: Namespace in which to put RegisterSelectedOps. **kwargs: Additional kwargs to pass to genrule. """ @@ -645,12 +645,17 @@ def gen_selected_ops(name, model, namespace = "", **kwargs): # isinstance is not supported in skylark. if type(model) != type([]): model = [model] + + input_models_args = " --input_models=%s" % ",".join( + ["$(location %s)" % f for f in model], + ) + native.genrule( name = name, srcs = model, outs = [out], - cmd = ("$(location %s) --namespace=%s --output_registration=$(location %s) --tflite_path=%s $(SRCS)") % - (tool, namespace, out, tflite_path[2:]), + cmd = ("$(location %s) --namespace=%s --output_registration=$(location %s) --tflite_path=%s %s") % + (tool, namespace, out, tflite_path[2:], input_models_args), tools = [tool], **kwargs ) diff --git a/tensorflow/lite/tools/gen_op_registration_main.cc b/tensorflow/lite/tools/gen_op_registration_main.cc index 410aaabf064..e4398663580 100644 --- a/tensorflow/lite/tools/gen_op_registration_main.cc +++ b/tensorflow/lite/tools/gen_op_registration_main.cc @@ -19,23 +19,24 @@ limitations under the License. 
#include #include +#include "absl/strings/str_split.h" #include "absl/strings/strip.h" #include "tensorflow/lite/tools/command_line_flags.h" #include "tensorflow/lite/tools/gen_op_registration.h" -const char kInputModelFlag[] = "input_model"; +const char kInputModelFlag[] = "input_models"; const char kNamespace[] = "namespace"; const char kOutputRegistrationFlag[] = "output_registration"; const char kTfLitePathFlag[] = "tflite_path"; const char kForMicro[] = "for_micro"; -void ParseFlagAndInit(int* argc, char** argv, std::string* input_model, +void ParseFlagAndInit(int* argc, char** argv, std::string* input_models, std::string* output_registration, std::string* tflite_path, std::string* namespace_flag, bool* for_micro) { std::vector flag_list = { - tflite::Flag::CreateFlag(kInputModelFlag, input_model, - "path to the tflite model"), + tflite::Flag::CreateFlag(kInputModelFlag, input_models, + "path to the tflite models, separated by comma"), tflite::Flag::CreateFlag(kOutputRegistrationFlag, output_registration, "filename for generated registration code"), tflite::Flag::CreateFlag(kTfLitePathFlag, tflite_path, @@ -144,22 +145,26 @@ void AddOpsFromModel(const std::string& input_model, } // namespace int main(int argc, char** argv) { - std::string input_model; + std::string input_models; std::string output_registration; std::string tflite_path; std::string namespace_flag; bool for_micro = false; - ParseFlagAndInit(&argc, argv, &input_model, &output_registration, + ParseFlagAndInit(&argc, argv, &input_models, &output_registration, &tflite_path, &namespace_flag, &for_micro); tflite::RegisteredOpMap builtin_ops; tflite::RegisteredOpMap custom_ops; - if (!input_model.empty()) { - AddOpsFromModel(input_model, &builtin_ops, &custom_ops); + if (!input_models.empty()) { + std::vector models = absl::StrSplit(input_models, ','); + for (const std::string& input_model : models) { + AddOpsFromModel(input_model, &builtin_ops, &custom_ops); + } } for (int i = 1; i < argc; i++) { AddOpsFromModel(argv[i], &builtin_ops, &custom_ops); } + GenerateFileContent(tflite_path, output_registration, namespace_flag, builtin_ops, custom_ops, for_micro); return 0; From 4061712bfefe6171e7eb84b4c50ab266b3bc498e Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 1 Jun 2020 18:27:40 -0700 Subject: [PATCH 1510/1533] Make XPlaneBuilder::Id const PiperOrigin-RevId: 314243937 Change-Id: Ie4794e60e484027dfdf99c7089fa58ec6811d0e2 --- tensorflow/core/profiler/utils/xplane_builder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/utils/xplane_builder.h b/tensorflow/core/profiler/utils/xplane_builder.h index d948964bc2e..d827953cc88 100644 --- a/tensorflow/core/profiler/utils/xplane_builder.h +++ b/tensorflow/core/profiler/utils/xplane_builder.h @@ -202,7 +202,7 @@ class XPlaneBuilder : public XStatsBuilder { public: explicit XPlaneBuilder(XPlane* plane); - int64 Id() { return plane_->id(); } + int64 Id() const { return plane_->id(); } void SetId(int64 id) { plane_->set_id(id); } void SetName(absl::string_view name) { plane_->set_name(std::string(name)); } From 9aca1f01b92b8da3783ad04e990b4d0dfe409ee4 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Mon, 1 Jun 2020 18:48:02 -0700 Subject: [PATCH 1511/1533] [tf.data] Move cardinality to core API. To migrate, replace `tf.data.experimental.cardinality(dataset)` with `dataset.cardinality()`. 
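A minimal sketch of the migration (the dataset construction is illustrative;
the constants are the ones exported by this change):

    import tensorflow as tf

    dataset = tf.data.Dataset.range(42).repeat()

    # Before:
    cardinality = tf.data.experimental.cardinality(dataset)
    # After:
    cardinality = dataset.cardinality()

    # Both return a scalar tf.int64 Tensor; infinite and unknown cardinalities
    # are reported through named constants, now also exposed under tf.data.
    print((cardinality == tf.data.INFINITE_CARDINALITY).numpy())  # True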
PiperOrigin-RevId: 314246452 Change-Id: I6839b1519adda790b91b5197ec30bac339ca5149 --- tensorflow/python/data/__init__.py | 2 + .../data/experimental/ops/cardinality.py | 1 + tensorflow/python/data/kernel_tests/BUILD | 24 +++ .../data/kernel_tests/cardinality_test.py | 174 ++++++++++++++++++ .../python/data/kernel_tests/len_test.py | 59 ++++++ tensorflow/python/data/ops/dataset_ops.py | 64 +++++++ .../golden/v1/tensorflow.data.-dataset.pbtxt | 4 + ...ow.data.-fixed-length-record-dataset.pbtxt | 4 + .../tensorflow.data.-t-f-record-dataset.pbtxt | 4 + .../tensorflow.data.-text-line-dataset.pbtxt | 4 + ...rflow.data.experimental.-csv-dataset.pbtxt | 4 + ...ow.data.experimental.-random-dataset.pbtxt | 4 + ...rflow.data.experimental.-sql-dataset.pbtxt | 4 + .../tools/api/golden/v1/tensorflow.data.pbtxt | 8 + .../golden/v2/tensorflow.data.-dataset.pbtxt | 4 + ...ow.data.-fixed-length-record-dataset.pbtxt | 4 + .../tensorflow.data.-t-f-record-dataset.pbtxt | 4 + .../tensorflow.data.-text-line-dataset.pbtxt | 4 + ...rflow.data.experimental.-csv-dataset.pbtxt | 4 + ...ow.data.experimental.-random-dataset.pbtxt | 4 + ...rflow.data.experimental.-sql-dataset.pbtxt | 4 + .../tools/api/golden/v2/tensorflow.data.pbtxt | 8 + 22 files changed, 396 insertions(+) create mode 100644 tensorflow/python/data/kernel_tests/cardinality_test.py create mode 100644 tensorflow/python/data/kernel_tests/len_test.py diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py index 9dea461a0f0..39cbd3de735 100644 --- a/tensorflow/python/data/__init__.py +++ b/tensorflow/python/data/__init__.py @@ -24,8 +24,10 @@ from __future__ import print_function # pylint: disable=unused-import from tensorflow.python.data import experimental from tensorflow.python.data.ops.dataset_ops import Dataset +from tensorflow.python.data.ops.dataset_ops import INFINITE as INFINITE_CARDINALITY from tensorflow.python.data.ops.dataset_ops import make_initializable_iterator from tensorflow.python.data.ops.dataset_ops import make_one_shot_iterator +from tensorflow.python.data.ops.dataset_ops import UNKNOWN as UNKNOWN_CARDINALITY from tensorflow.python.data.ops.iterator_ops import Iterator from tensorflow.python.data.ops.readers import FixedLengthRecordDataset from tensorflow.python.data.ops.readers import TextLineDataset diff --git a/tensorflow/python/data/experimental/ops/cardinality.py b/tensorflow/python/data/experimental/ops/cardinality.py index 54d30e1fba8..f1b8908fa68 100644 --- a/tensorflow/python/data/experimental/ops/cardinality.py +++ b/tensorflow/python/data/experimental/ops/cardinality.py @@ -32,6 +32,7 @@ tf_export("data.experimental.UNKNOWN_CARDINALITY").export_constant( __name__, "UNKNOWN") +# TODO(b/157691652): Deprecate this method after migrating users to the new API. @tf_export("data.experimental.cardinality") def cardinality(dataset): """Returns the cardinality of `dataset`, if known. 
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD index 477f236a8ae..eaee1184ff4 100644 --- a/tensorflow/python/data/kernel_tests/BUILD +++ b/tensorflow/python/data/kernel_tests/BUILD @@ -46,6 +46,17 @@ tf_py_test( ], ) +tf_py_test( + name = "cardinality_test", + srcs = ["cardinality_test.py"], + deps = [ + "//tensorflow/python/data/experimental/ops:cardinality", + "//tensorflow/python/data/kernel_tests:test_base", + "//tensorflow/python/data/ops:dataset_ops", + "@absl_py//absl/testing:parameterized", + ], +) + tf_py_test( name = "checkpoint_test", size = "medium", @@ -385,6 +396,19 @@ cuda_py_test( ], ) +tf_py_test( + name = "len_test", + size = "small", + srcs = ["len_test.py"], + deps = [ + ":test_base", + "//tensorflow/python:constant_op", + "//tensorflow/python:framework_combinations", + "//tensorflow/python/data/ops:dataset_ops", + "@absl_py//absl/testing:parameterized", + ], +) + tf_py_test( name = "list_files_test", size = "small", diff --git a/tensorflow/python/data/kernel_tests/cardinality_test.py b/tensorflow/python/data/kernel_tests/cardinality_test.py new file mode 100644 index 00000000000..bbc8eac6b0c --- /dev/null +++ b/tensorflow/python/data/kernel_tests/cardinality_test.py @@ -0,0 +1,174 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for `tf.data.Dataset.cardinality()`.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools + +from absl.testing import parameterized + +from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations +from tensorflow.python.platform import test + + +def _test_combinations(): + # pylint: disable=g-long-lambda + cases = [ + ("Batch1", + lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=True), 2), + ("Batch2", + lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=False), 3), + ("Batch3", + lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).batch(2), + dataset_ops.UNKNOWN), + ("Batch4", lambda: dataset_ops.Dataset.range(5).repeat().batch(2), + dataset_ops.INFINITE), + ("Cache1", lambda: dataset_ops.Dataset.range(5).cache(), 5), + ("Cache2", lambda: dataset_ops.Dataset.range(5).cache("foo"), 5), + ("Concatenate1", lambda: dataset_ops.Dataset.range(5).concatenate( + dataset_ops.Dataset.range(5)), 10), + ("Concatenate2", + lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate( + dataset_ops.Dataset.range(5)), dataset_ops.UNKNOWN), + ("Concatenate3", lambda: dataset_ops.Dataset.range(5).repeat(). 
+ concatenate(dataset_ops.Dataset.range(5)), dataset_ops.INFINITE), + ("Concatenate4", lambda: dataset_ops.Dataset.range(5).concatenate( + dataset_ops.Dataset.range(5).filter(lambda _: True)), + dataset_ops.UNKNOWN), + ("Concatenate5", + lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate( + dataset_ops.Dataset.range(5).filter(lambda _: True)), + dataset_ops.UNKNOWN), + ("Concatenate6", lambda: dataset_ops.Dataset.range(5).repeat(). + concatenate(dataset_ops.Dataset.range(5).filter(lambda _: True)), + dataset_ops.INFINITE), + ("Concatenate7", lambda: dataset_ops.Dataset.range(5).concatenate( + dataset_ops.Dataset.range(5).repeat()), dataset_ops.INFINITE), + ("Concatenate8", + lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate( + dataset_ops.Dataset.range(5).repeat()), dataset_ops.INFINITE), + ("Concatenate9", + lambda: dataset_ops.Dataset.range(5).repeat().concatenate( + dataset_ops.Dataset.range(5).repeat()), dataset_ops.INFINITE), + ("FlatMap", lambda: dataset_ops.Dataset.range(5).flat_map( + lambda _: dataset_ops.Dataset.from_tensors(0)), dataset_ops.UNKNOWN), + ("Filter", lambda: dataset_ops.Dataset.range(5).filter(lambda _: True), + dataset_ops.UNKNOWN), + ("FromTensors1", lambda: dataset_ops.Dataset.from_tensors(0), 1), + ("FromTensors2", lambda: dataset_ops.Dataset.from_tensors((0, 1)), 1), + ("FromTensorSlices1", + lambda: dataset_ops.Dataset.from_tensor_slices([0, 0, 0]), 3), + ("FromTensorSlices2", lambda: dataset_ops.Dataset.from_tensor_slices( + ([0, 0, 0], [1, 1, 1])), 3), + ("Interleave1", lambda: dataset_ops.Dataset.range(5).interleave( + lambda _: dataset_ops.Dataset.from_tensors(0), cycle_length=1), + dataset_ops.UNKNOWN), + ("Interleave2", lambda: dataset_ops.Dataset.range(5).interleave( + lambda _: dataset_ops.Dataset.from_tensors(0), + cycle_length=1, + num_parallel_calls=1), dataset_ops.UNKNOWN), + ("Map1", lambda: dataset_ops.Dataset.range(5).map(lambda x: x), 5), + ("Map2", lambda: dataset_ops.Dataset.range(5).map( + lambda x: x, num_parallel_calls=1), 5), + ("PaddedBatch1", lambda: dataset_ops.Dataset.range(5).padded_batch( + 2, [], drop_remainder=True), 2), + ("PaddedBatch2", lambda: dataset_ops.Dataset.range(5).padded_batch( + 2, [], drop_remainder=False), 3), + ("PaddedBatch3", lambda: dataset_ops.Dataset.range(5).filter( + lambda _: True).padded_batch(2, []), dataset_ops.UNKNOWN), + ("PaddedBatch4", + lambda: dataset_ops.Dataset.range(5).repeat().padded_batch(2, []), + dataset_ops.INFINITE), + ("Prefetch", lambda: dataset_ops.Dataset.range(5).prefetch(buffer_size=1), + 5), + ("Range1", lambda: dataset_ops.Dataset.range(0), 0), + ("Range2", lambda: dataset_ops.Dataset.range(5), 5), + ("Range3", lambda: dataset_ops.Dataset.range(5, 10), 5), + ("Range4", lambda: dataset_ops.Dataset.range(10, 5), 0), + ("Range5", lambda: dataset_ops.Dataset.range(5, 10, 2), 3), + ("Range6", lambda: dataset_ops.Dataset.range(10, 5, -2), 3), + ("Repeat1", lambda: dataset_ops.Dataset.range(0).repeat(0), 0), + ("Repeat2", lambda: dataset_ops.Dataset.range(1).repeat(0), 0), + ("Repeat3", lambda: dataset_ops.Dataset.range(0).repeat(5), 0), + ("Repeat4", lambda: dataset_ops.Dataset.range(1).repeat(5), 5), + ("Repeat5", lambda: dataset_ops.Dataset.range(0).repeat(), 0), + ("Repeat6", lambda: dataset_ops.Dataset.range(1).repeat(), + dataset_ops.INFINITE), + ("Shuffle", lambda: dataset_ops.Dataset.range(5).shuffle(buffer_size=1), + 5), + ("Shard1", lambda: dataset_ops.Dataset.range(5).shard(2, 0), 3), + ("Shard2", lambda: 
dataset_ops.Dataset.range(5).shard(8, 7), 0), + ("Shard3", + lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).shard(2, 0), + dataset_ops.UNKNOWN), + ("Shard4", lambda: dataset_ops.Dataset.range(5).repeat().shard(2, 0), + dataset_ops.INFINITE), + ("Skip1", lambda: dataset_ops.Dataset.range(5).skip(2), 3), + ("Skip2", lambda: dataset_ops.Dataset.range(5).skip(8), 0), + ("Skip3", + lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).skip(2), + dataset_ops.UNKNOWN), + ("Skip4", lambda: dataset_ops.Dataset.range(5).repeat().skip(2), + dataset_ops.INFINITE), + ("Take1", lambda: dataset_ops.Dataset.range(5).take(2), 2), + ("Take2", lambda: dataset_ops.Dataset.range(5).take(8), 5), + ("Take3", + lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).take(2), + dataset_ops.UNKNOWN), + ("Take4", lambda: dataset_ops.Dataset.range(5).repeat().take(2), 2), + ("Window1", lambda: dataset_ops.Dataset.range(5).window( + size=2, shift=2, drop_remainder=True), 2), + ("Window2", lambda: dataset_ops.Dataset.range(5).window( + size=2, shift=2, drop_remainder=False), 3), + ("Zip1", lambda: dataset_ops.Dataset.zip(dataset_ops.Dataset.range(5)), + 5), + ("Zip2", lambda: dataset_ops.Dataset.zip( + (dataset_ops.Dataset.range(5), dataset_ops.Dataset.range(3))), 3), + ("Zip3", lambda: dataset_ops.Dataset.zip((dataset_ops.Dataset.range( + 5), dataset_ops.Dataset.range(3).repeat())), 5), + ("Zip4", lambda: dataset_ops.Dataset.zip( + (dataset_ops.Dataset.range(5).repeat(), dataset_ops.Dataset.range(3). + repeat())), dataset_ops.INFINITE), + ("Zip5", lambda: dataset_ops.Dataset.zip( + (dataset_ops.Dataset.range(5), dataset_ops.Dataset.range(3).filter( + lambda _: True))), dataset_ops.UNKNOWN), + ] + + def reduce_fn(x, y): + name, dataset_fn, expected_result = y + return x + combinations.combine( + dataset_fn=combinations.NamedObject(name, dataset_fn), + expected_result=expected_result) + + return functools.reduce(reduce_fn, cases, []) + + +class CardinalityTest(test_base.DatasetTestBase, parameterized.TestCase): + """Tests for `tf.data.Dataset.cardinality()`.""" + + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + _test_combinations())) + def testCardinality(self, dataset_fn, expected_result): + dataset = dataset_fn() + self.assertEqual(self.evaluate(dataset.cardinality()), expected_result) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/data/kernel_tests/len_test.py b/tensorflow/python/data/kernel_tests/len_test.py new file mode 100644 index 00000000000..a22e46ed664 --- /dev/null +++ b/tensorflow/python/data/kernel_tests/len_test.py @@ -0,0 +1,59 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for `tf.data.Dataset.__len__()`.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations +from tensorflow.python.platform import test + + +class LenTest(test_base.DatasetTestBase, parameterized.TestCase): + + @combinations.generate(test_base.eager_only_combinations()) + def testKnown(self): + num_elements = 10 + ds = dataset_ops.Dataset.range(num_elements) + self.assertLen(ds, 10) + + @combinations.generate(test_base.eager_only_combinations()) + def testInfinite(self): + num_elements = 10 + ds = dataset_ops.Dataset.range(num_elements).repeat() + with self.assertRaisesRegex(TypeError, 'infinite'): + len(ds) + + @combinations.generate(test_base.eager_only_combinations()) + def testUnknown(self): + num_elements = 10 + ds = dataset_ops.Dataset.range(num_elements).filter(lambda x: True) + with self.assertRaisesRegex(TypeError, 'unknown'): + len(ds) + + @combinations.generate(test_base.graph_only_combinations()) + def testGraphMode(self): + num_elements = 10 + ds = dataset_ops.Dataset.range(num_elements) + with self.assertRaisesRegex(TypeError, 'not supported while tracing'): + len(ds) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index cf69915ecc0..372c19855af 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -94,6 +94,12 @@ ops.NotDifferentiable("ReduceDataset") AUTOTUNE = -1 tf_export("data.experimental.AUTOTUNE").export_constant(__name__, "AUTOTUNE") +# Constants representing infinite and unknown cardinalities. +INFINITE = -1 +UNKNOWN = -2 +tf_export("data.INFINITE_CARDINALITY").export_constant(__name__, "INFINITE") +tf_export("data.UNKNOWN_CARDINALITY").export_constant(__name__, "UNKNOWN") + @tf_export("data.Dataset", v1=[]) @six.add_metaclass(abc.ABCMeta) @@ -410,6 +416,36 @@ class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): raise RuntimeError("__iter__() is only supported inside of tf.function " "or when eager execution is enabled.") + def __bool__(self): + return True # Required as __len__ is defined + + __nonzero__ = __bool__ # Python 2 backward compatibility + + def __len__(self): + """Returns the length of the dataset if it is known and finite. + + This method requires that you are running in eager mode, and that the + length of the dataset is known and non-infinite. When the length may be + unknown or infinite, or if you are running in graph mode, use + `tf.data.Dataset.cardinality` instead. + + Returns: + An integer representing the length of the dataset. + + Raises: + RuntimeError: If the dataset length is unknown or infinite, or if eager + execution is not enabled. + """ + if not context.executing_eagerly(): + raise TypeError("__len__() is not supported while tracing functions. " + "Use `tf.data.Dataset.cardinality` instead.") + length = self.cardinality() + if length.numpy() == INFINITE: + raise TypeError("dataset length is infinite.") + if length.numpy() == UNKNOWN: + raise TypeError("dataset length is unknown.") + return length + @abc.abstractproperty def element_spec(self): """The type specification of an element of this dataset. 
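# A minimal sketch of the eager-mode behavior added by the hunk above (the
# dataset definitions are illustrative; see len_test.py in this change):
#
#   ds = Dataset.range(10)
#   len(ds)                          # 10
#   len(ds.repeat())                 # TypeError: dataset length is infinite.
#   len(ds.filter(lambda x: True))   # TypeError: dataset length is unknown.
#   ds.cardinality()                 # preferred when the length may be
#                                    # unknown/infinite or when tracing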
@@ -2095,6 +2131,34 @@ name=None)) """ return _OptionsDataset(self, options) + def cardinality(self): + """Returns the cardinality of the dataset, if known. + + `cardinality` may return `tf.data.INFINITE_CARDINALITY` if the dataset + contains an infinite number of elements or `tf.data.UNKNOWN_CARDINALITY` if + the analysis fails to determine the number of elements in the dataset + (e.g. when the dataset source is a file). + + >>> dataset = tf.data.Dataset.range(42) + >>> print(dataset.cardinality().numpy()) + 42 + >>> dataset = dataset.repeat() + >>> cardinality = dataset.cardinality() + >>> print((cardinality == tf.data.INFINITE_CARDINALITY).numpy()) + True + >>> dataset = dataset.filter(lambda x: True) + >>> cardinality = dataset.cardinality() + >>> print((cardinality == tf.data.UNKNOWN_CARDINALITY).numpy()) + True + + Returns: + A scalar `tf.int64` `Tensor` representing the cardinality of the dataset. + If the cardinality is infinite or unknown, `cardinality` returns the + named constants `tf.data.INFINITE_CARDINALITY` and + `tf.data.UNKNOWN_CARDINALITY` respectively. + """ + return ged_ops.dataset_cardinality(self._variant_tensor) + @tf_export(v1=["data.Dataset"]) class DatasetV1(DatasetV2): diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt index 6bf7e809604..df5f7761b07 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt @@ -41,6 +41,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt index 7af13486d3d..3488398a955 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -43,6 +43,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt index 3e586a48947..ba554656ba1 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt @@ -43,6 +43,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt 
index b838fe0f336..061ccb70f6d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt @@ -43,6 +43,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt index 9c1adbad96c..f1a99bf2b21 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt @@ -43,6 +43,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt index 905e0d6f8fe..140c1355285 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt @@ -43,6 +43,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt index 8eee750860e..a41f7cdfedf 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt @@ -43,6 +43,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt index 5013717337b..1d2af017d84 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "FixedLengthRecordDataset" mtype: "" } + member { + name: "INFINITE_CARDINALITY" + mtype: "" + } member { name: "Iterator" mtype: "" @@ -28,6 +32,10 @@ tf_module { name: "TextLineDataset" mtype: "" } + member { + name: "UNKNOWN_CARDINALITY" + mtype: "" + } member { name: "experimental" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt index acc6334055f..c52d26ec6ff 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt @@ -28,6 +28,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt index 1c5ab59020e..aa27517a73f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt @@ -30,6 +30,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt index 18c77bf4289..7864c08540c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt @@ -29,6 +29,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt index 6ebe24206ab..1aeaac23b4a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt @@ -30,6 +30,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt index 41865c9700f..087eb1a3860 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt @@ -30,6 +30,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, 
keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt index ae905aa1fea..ac00eaf018a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt @@ -30,6 +30,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt index 3f274660402..8fb4318379e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt @@ -30,6 +30,10 @@ tf_class { name: "cache" argspec: "args=[\'self\', \'filename\'], varargs=None, keywords=None, defaults=[\'\'], " } + member_method { + name: "cardinality" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } member_method { name: "concatenate" argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt index ecc61e7b138..f8e0a976a73 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt @@ -12,6 +12,10 @@ tf_module { name: "FixedLengthRecordDataset" mtype: "" } + member { + name: "INFINITE_CARDINALITY" + mtype: "" + } member { name: "Options" mtype: "" @@ -24,6 +28,10 @@ tf_module { name: "TextLineDataset" mtype: "" } + member { + name: "UNKNOWN_CARDINALITY" + mtype: "" + } member { name: "experimental" mtype: "" From 857a3bc0e1ce857bea9a1d1631a41f9d87ccff56 Mon Sep 17 00:00:00 2001 From: Jinliang Wei Date: Mon, 1 Jun 2020 18:51:16 -0700 Subject: [PATCH 1512/1533] [XLA] Adds transformation from collective-permute to the asynchronous version, and adds emitter for the asynchronous version. PiperOrigin-RevId: 314246879 Change-Id: I11865391efb6f3925b02e7b1617ba3cfedcd7701 --- .../xla/service/hlo_dataflow_analysis.cc | 37 +++++++++++++++++++ .../xla/service/hlo_dataflow_analysis.h | 4 ++ .../compiler/xla/service/hlo_verifier_test.cc | 3 +- .../xla/service/memory_space_assignment.cc | 14 +++++++ 4 files changed, 57 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index 4894e566393..d0d533e0b06 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -676,6 +676,39 @@ bool HloDataflowAnalysis::UpdateWhileValueSet(HloInstruction* xla_while) { } } +bool HloDataflowAnalysis::UpdateCollectivePermuteStartValueSet( + HloInstruction* collective_permute_start) { + CHECK_EQ(collective_permute_start->opcode(), + HloOpcode::kCollectivePermuteStart); + bool changed = false; + // CollectivePermuteStart forwards the operand value to element {0} of its + // output. 
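+  // The remaining elements of the start op's output tuple are not forwarded
+  // here; as with other non-forwarding positions, their values are defined by
+  // the collective-permute-start instruction itself.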
+ const HloValueSet& operand_value_set = + GetValueSet(collective_permute_start->operand(0)); + HloValueSet& value_set = GetValueSet(collective_permute_start, {0}); + if (value_set != operand_value_set) { + value_set = operand_value_set; + changed = true; + } + return changed; +} + +bool HloDataflowAnalysis::UpdateCollectivePermuteDoneValueSet( + HloInstruction* collective_permute_done) { + CHECK_EQ(collective_permute_done->opcode(), + HloOpcode::kCollectivePermuteDone); + bool changed = false; + // CollectivePermuteDone forwards the operand value at {0} to its output. + const HloValueSet& operand_value_set = + GetValueSet(collective_permute_done->operand(0), {1}); + HloValueSet& value_set = GetValueSet(collective_permute_done); + if (value_set != operand_value_set) { + value_set = operand_value_set; + changed = true; + } + return changed; +} + bool HloDataflowAnalysis::UpdateInstructionValueSet( HloInstruction* instruction) { // Recompute from operands. @@ -712,6 +745,10 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet( return UpdateCopyDoneValueSet(instruction); case HloOpcode::kConditional: return UpdateConditionalValueSet(instruction); + case HloOpcode::kCollectivePermuteStart: + return UpdateCollectivePermuteStartValueSet(instruction); + case HloOpcode::kCollectivePermuteDone: + return UpdateCollectivePermuteDoneValueSet(instruction); default: // Instruction does not forward HloValues (it defines all values in its // output). No update is necessary. diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index 75bcf7ea318..bec592aeb20 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -216,6 +216,10 @@ class HloDataflowAnalysis { bool UpdateTupleValueSet(HloInstruction* tuple); bool UpdateWhileValueSet(HloInstruction* xla_while); bool UpdateAddDependencyValueSet(HloInstruction* add_dependency); + bool UpdateCollectivePermuteStartValueSet( + HloInstruction* collective_permute_start); + bool UpdateCollectivePermuteDoneValueSet( + HloInstruction* collective_permute_done); // Propagates the dataflow through the module. 
In particular, it propagates // the HloValueSet from its defining instruction to the users of the diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index 294dfbf66fa..d9709c50df9 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -1201,7 +1201,8 @@ TEST_F(HloVerifierTest, CollectivePermuteDoneNoCollectivePermuteStart) { p0 = f32[2,3]{1,0:S(1)} parameter(0) p1 = f32[2,3]{1,0:S(1)} parameter(1) p2 = u32[] parameter(2) - tuple.1 = (f32[2,3], f32[2,3], u32[], u32[]) tuple(p0, p1, p2) + p3 = u32[] parameter(3) + tuple.1 = (f32[2,3], f32[2,3], u32[], u32[]) tuple(p0, p1, p2, p3) ROOT collective-permute-done.1 = f32[2,3]{1,0:S(1)} collective-permute-done(tuple.1) } )"; diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index e4ee79e9f4c..0ed72f51754 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -521,6 +521,20 @@ bool AlternateMemoryBestFitHeap::IsIntervalAllowedInAlternateMemory( return false; } } + + if ((position.instruction->opcode() == HloOpcode::kCollectivePermuteStart || + position.instruction->opcode() == HloOpcode::kCollectivePermuteDone)) { + // Disable memory space allocation for these for now. + if (position.index == ShapeIndex({0})) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a collective-permute buffer."; + return false; + } else if (position.index == ShapeIndex({1})) { + VLOG(4) << "Keeping value " << interval.buffer->ToShortString() + << " in default mem because it is a collective-permute buffer."; + return false; + } + } } return true; From 0b0fd94622a447b740328cc3982427f6bc1a0eec Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 1 Jun 2020 19:03:33 -0700 Subject: [PATCH 1513/1533] Extract :xplane_test_utils from :xplane_utils PiperOrigin-RevId: 314248706 Change-Id: Iaefcae9e880b4e157856189ce4ee017f2b23cab1 --- tensorflow/core/profiler/convert/BUILD | 7 +- .../convert/xplane_to_memory_profile_test.cc | 91 +++++++++--------- .../convert/xplane_to_op_stats_test.cc | 2 +- .../convert/xplane_to_step_events_test.cc | 2 +- .../convert/xplane_to_tf_functions_test.cc | 1 + tensorflow/core/profiler/utils/BUILD | 22 ++++- .../profiler/utils/derived_timeline_test.cc | 89 ++++++------------ .../core/profiler/utils/group_events_test.cc | 2 +- .../core/profiler/utils/xplane_test_utils.cc | 92 +++++++++++++++++++ .../core/profiler/utils/xplane_test_utils.h | 50 ++++++++++ .../core/profiler/utils/xplane_utils.cc | 84 ----------------- tensorflow/core/profiler/utils/xplane_utils.h | 34 ------- 12 files changed, 241 insertions(+), 235 deletions(-) create mode 100644 tensorflow/core/profiler/utils/xplane_test_utils.cc create mode 100644 tensorflow/core/profiler/utils/xplane_test_utils.h diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 6c4116199ef..b195000137d 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -268,7 +268,7 @@ tf_cc_test( "//tensorflow/core/profiler/utils:group_events", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_test_utils", 
"@com_google_absl//absl/strings", ], ) @@ -358,7 +358,7 @@ tf_cc_test( "//tensorflow/core/profiler/utils:group_events", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_test_utils", "@com_google_absl//absl/container:flat_hash_map", ], ) @@ -456,6 +456,7 @@ tf_cc_test( "//tensorflow/core/profiler/utils:tf_xplane_visitor", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", + "//tensorflow/core/profiler/utils:xplane_test_utils", "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/strings", @@ -498,7 +499,7 @@ tf_cc_test( "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:xplane_builder", "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_test_utils", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc b/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc index e0d87ac7567..2b6356a7aa1 100644 --- a/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" -#include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" namespace tensorflow { namespace profiler { @@ -38,53 +38,52 @@ TEST(ConvertXPlaneToMemoryProfile, OneAllocatorMultiActivitiesTest) { host_plane_builder.ReserveLines(1); auto tf_executor_thread = host_plane_builder.GetOrCreateLine(0); - CreateXEventWithIntAndStringViewMetadataValue( - &host_plane_builder, &tf_executor_thread, "MemoryAllocation", 40000, 1000, - {{StatType::kBytesReserved, 2000}, - {StatType::kBytesAllocated, 3000}, - {StatType::kBytesAvailable, 5000}, - {StatType::kPeakBytesInUse, 8500}, - {StatType::kRequestedBytes, 200}, - {StatType::kAllocationBytes, 256}, - {StatType::kAddress, 222333}, - {StatType::kStepId, -93746}, - {StatType::kDataType, 1}}, - {{StatType::kAllocatorName, "GPU_0_bfc"}, - {StatType::kTfOp, "foo/bar"}, - {StatType::kRegionType, "output"}, - {StatType::kTensorShapes, "[3, 3, 512, 512]"}}); + CreateXEvent(&host_plane_builder, &tf_executor_thread, "MemoryAllocation", + 40000, 1000, + {{StatType::kBytesReserved, 2000}, + {StatType::kBytesAllocated, 3000}, + {StatType::kBytesAvailable, 5000}, + {StatType::kPeakBytesInUse, 8500}, + {StatType::kRequestedBytes, 200}, + {StatType::kAllocationBytes, 256}, + {StatType::kAddress, 222333}, + {StatType::kStepId, -93746}, + {StatType::kDataType, 1}, + {StatType::kAllocatorName, "GPU_0_bfc"}, + {StatType::kTfOp, "foo/bar"}, + {StatType::kRegionType, "output"}, + {StatType::kTensorShapes, "[3, 3, 512, 512]"}}); - CreateXEventWithIntAndStringViewMetadataValue( - &host_plane_builder, &tf_executor_thread, "MemoryDeallocation", 50000, - 1000, - {{StatType::kBytesReserved, 2000}, - {StatType::kBytesAllocated, 2744}, - {StatType::kBytesAvailable, 5256}, - {StatType::kPeakBytesInUse, 8500}, - {StatType::kRequestedBytes, 200}, - {StatType::kAllocationBytes, 256}, - {StatType::kAddress, 222333}, 
- {StatType::kStepId, 0}, - {StatType::kDataType, 0}}, - {{StatType::kAllocatorName, "GPU_0_bfc"}, - {StatType::kRegionType, ""}, - {StatType::kTensorShapes, ""}}); + CreateXEvent(&host_plane_builder, &tf_executor_thread, "MemoryDeallocation", + 50000, 1000, + {{StatType::kBytesReserved, 2000}, + {StatType::kBytesAllocated, 2744}, + {StatType::kBytesAvailable, 5256}, + {StatType::kPeakBytesInUse, 8500}, + {StatType::kRequestedBytes, 200}, + {StatType::kAllocationBytes, 256}, + {StatType::kAddress, 222333}, + {StatType::kStepId, 0}, + {StatType::kDataType, 0}, + {StatType::kAllocatorName, "GPU_0_bfc"}, + {StatType::kRegionType, ""}, + {StatType::kTensorShapes, ""}}); - CreateXEventWithIntAndStringViewMetadataValue( - &host_plane_builder, &tf_executor_thread, "MemoryAllocation", 70000, 1000, - {{StatType::kBytesReserved, 2000}, - {StatType::kBytesAllocated, 5000}, - {StatType::kBytesAvailable, 3000}, - {StatType::kPeakBytesInUse, 9500}, - {StatType::kRequestedBytes, 300}, - {StatType::kAllocationBytes, 300}, - {StatType::kAddress, 345678}, - {StatType::kStepId, -93746}, - {StatType::kDataType, 9}}, - {{StatType::kAllocatorName, "GPU_0_bfc"}, - {StatType::kTfOp, "mul_grad/Sum"}, - {StatType::kRegionType, "temp"}, - {StatType::kTensorShapes, "[1, 2]"}}); + CreateXEvent(&host_plane_builder, &tf_executor_thread, "MemoryAllocation", + 70000, 1000, + {{StatType::kBytesReserved, 2000}, + {StatType::kBytesAllocated, 5000}, + {StatType::kBytesAvailable, 3000}, + {StatType::kPeakBytesInUse, 9500}, + {StatType::kRequestedBytes, 300}, + {StatType::kAllocationBytes, 300}, + {StatType::kAddress, 345678}, + {StatType::kStepId, -93746}, + {StatType::kDataType, 9}, + {StatType::kAllocatorName, "GPU_0_bfc"}, + {StatType::kTfOp, "mul_grad/Sum"}, + {StatType::kRegionType, "temp"}, + {StatType::kTensorShapes, "[1, 2]"}}); MemoryProfile memory_profile = ConvertXPlaneToMemoryProfile(*host_plane); EXPECT_EQ(memory_profile.memory_profile_per_allocator().size(), 1); diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc index 67901e83dd3..e4cda680a56 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc @@ -26,7 +26,7 @@ limitations under the License. #include "tensorflow/core/profiler/utils/group_events.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" -#include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc b/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc index 36e6a2c3091..9ace9eb185c 100644 --- a/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_step_events_test.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/group_events.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" -#include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc b/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc index 25e56d17418..12287217e04 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_tf_functions_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index 0bc9374b7ba..8741ac01fc5 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -192,7 +192,6 @@ cc_library( deps = [ ":timespan", ":xplane_builder", - ":xplane_schema", ":xplane_visitor", "//tensorflow/core:platform_base", "//tensorflow/core/platform:types", @@ -219,6 +218,23 @@ tf_cc_test( ], ) +cc_library( + name = "xplane_test_utils", + testonly = True, + srcs = ["xplane_test_utils.cc"], + hdrs = ["xplane_test_utils.h"], + visibility = [":friends"], + deps = [ + ":xplane_builder", + ":xplane_schema", + "//tensorflow/core/platform:types", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:variant", + ], +) + cc_library( name = "xplane_visitor", srcs = ["xplane_visitor.cc"], @@ -276,7 +292,7 @@ tf_cc_test( ":tf_xplane_visitor", ":xplane_builder", ":xplane_schema", - ":xplane_utils", + ":xplane_test_utils", ":xplane_visitor", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -343,7 +359,7 @@ tf_cc_test( ":trace_utils", ":xplane_builder", ":xplane_schema", - ":xplane_utils", + ":xplane_test_utils", ":xplane_visitor", "//tensorflow/core:lib", "//tensorflow/core:test", diff --git a/tensorflow/core/profiler/utils/derived_timeline_test.cc b/tensorflow/core/profiler/utils/derived_timeline_test.cc index 76a0188480a..c6922e7ab74 100644 --- a/tensorflow/core/profiler/utils/derived_timeline_test.cc +++ b/tensorflow/core/profiler/utils/derived_timeline_test.cc @@ -24,7 +24,7 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/trace_utils.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" -#include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { @@ -47,21 +47,12 @@ TEST(DerivedTimelineTest, HloModuleNameTest) { XPlane* plane = space.add_planes(); XPlaneBuilder plane_builder(plane); auto line_builder = plane_builder.GetOrCreateLine(0); - auto first_event = CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100); - first_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kHloModule)), - kHloModuleName); - first_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - kKernelDetails); - auto second_event = - CreateXEvent(&plane_builder, &line_builder, "op2", 200, 300); - second_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kHloModule)), - kHloModuleName); - second_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - kKernelDetails); + CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, + {{StatType::kHloModule, kHloModuleName}, + {StatType::kKernelDetails, kKernelDetails}}); + CreateXEvent(&plane_builder, &line_builder, "op2", 200, 300, + {{StatType::kHloModule, kHloModuleName}, + {StatType::kKernelDetails, kKernelDetails}}); GenerateDerivedTimeLines(event_group_name_map, &space); XPlaneVisitor plane_visitor = CreateTfXPlaneVisitor(plane); // Only the hlo module line is added and other empty lines are removed at the @@ -86,21 +77,12 @@ TEST(DerivedTimelineTest, TfOpLineTest) { XPlane* plane = space.add_planes(); XPlaneBuilder plane_builder(plane); auto line_builder = plane_builder.GetOrCreateLine(0); - auto first_event = CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100); - first_event.AddStatValue( - *plane_builder.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)), - kTfOpName); - first_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - kKernelDetails); - auto second_event = - CreateXEvent(&plane_builder, &line_builder, "op2", 200, 300); - second_event.AddStatValue( - *plane_builder.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)), - kTfOpName); - second_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - kKernelDetails); + CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, + {{StatType::kLevel0, kTfOpName}, + {StatType::kKernelDetails, kKernelDetails}}); + CreateXEvent(&plane_builder, &line_builder, "op2", 200, 300, + {{StatType::kLevel0, kTfOpName}, + {StatType::kKernelDetails, kKernelDetails}}); GenerateDerivedTimeLines(event_group_name_map, &space); XPlaneVisitor plane_visitor = CreateTfXPlaneVisitor(plane); // Only the tf op line is added and other empty lines are removed at the end. 
@@ -127,22 +109,14 @@ TEST(DerivedTimelineTest, DependencyTest) { XPlane* plane = space.add_planes(); XPlaneBuilder plane_builder(plane); auto line_builder = plane_builder.GetOrCreateLine(0); - auto first_event = CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, - {{StatType::kGroupId, 0}}); - first_event.AddStatValue( - *plane_builder.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)), - kTfOpName); - first_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - kKernelDetails); - auto second_event = CreateXEvent(&plane_builder, &line_builder, "op2", 200, - 300, {{StatType::kGroupId, 1}}); - second_event.AddStatValue( - *plane_builder.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)), - kTfOpName); - second_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - kKernelDetails); + CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, + {{StatType::kGroupId, 0}, + {StatType::kLevel0, kTfOpName}, + {StatType::kKernelDetails, kKernelDetails}}); + CreateXEvent(&plane_builder, &line_builder, "op2", 200, 300, + {{StatType::kGroupId, 1}, + {StatType::kLevel0, kTfOpName}, + {StatType::kKernelDetails, kKernelDetails}}); GenerateDerivedTimeLines(event_group_name_map, &space); XPlaneVisitor plane_visitor = CreateTfXPlaneVisitor(plane); // The step line and the TF op line are added. @@ -164,21 +138,12 @@ TEST(DerivedTimelineTest, TfOpNameScopeTest) { XPlane* plane = space.add_planes(); XPlaneBuilder plane_builder(plane); auto line_builder = plane_builder.GetOrCreateLine(0); - auto first_event = CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100); - first_event.AddStatValue( - *plane_builder.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)), - kTfOpName); - first_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - kKernelDetails); - auto second_event = - CreateXEvent(&plane_builder, &line_builder, "op2", 200, 300); - second_event.AddStatValue( - *plane_builder.GetOrCreateStatMetadata(GetStatTypeStr(StatType::kLevel0)), - kTfOpName); - second_event.AddStatValue(*plane_builder.GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - kKernelDetails); + CreateXEvent(&plane_builder, &line_builder, "op1", 0, 100, + {{StatType::kLevel0, kTfOpName}, + {StatType::kKernelDetails, kKernelDetails}}); + CreateXEvent(&plane_builder, &line_builder, "op2", 200, 300, + {{StatType::kLevel0, kTfOpName}, + {StatType::kKernelDetails, kKernelDetails}}); GenerateDerivedTimeLines(event_group_name_map, &space); XPlaneVisitor plane_visitor = CreateTfXPlaneVisitor(plane); // The TF name scope line and the TF op line are added. diff --git a/tensorflow/core/profiler/utils/group_events_test.cc b/tensorflow/core/profiler/utils/group_events_test.cc index 70db7c2b8e6..8545bc94e54 100644 --- a/tensorflow/core/profiler/utils/group_events_test.cc +++ b/tensorflow/core/profiler/utils/group_events_test.cc @@ -23,7 +23,7 @@ limitations under the License. 
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" -#include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/utils/xplane_test_utils.cc b/tensorflow/core/profiler/utils/xplane_test_utils.cc new file mode 100644 index 00000000000..cd8821f05a8 --- /dev/null +++ b/tensorflow/core/profiler/utils/xplane_test_utils.cc @@ -0,0 +1,92 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/profiler/utils/xplane_test_utils.h" + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" + +namespace tensorflow { +namespace profiler { +namespace { + +class XStatValueVisitor { + public: + XStatValueVisitor(XEventBuilder* event, const XStatMetadata* stat_metadata) + : event_(event), stat_metadata_(stat_metadata) {} + + template + void operator()(const T& value) { + event_->AddStatValue(*stat_metadata_, value); + } + + private: + XEventBuilder* event_; + const XStatMetadata* stat_metadata_; +}; + +} // namespace + +void CreateXEvent( + XPlaneBuilder* plane_builder, XLineBuilder* line_builder, + absl::string_view event_name, int64 offset_ps, int64 duration_ps, + std::initializer_list> stats) { + auto event_builder = line_builder->AddEvent( + *plane_builder->GetOrCreateEventMetadata(event_name)); + event_builder.SetOffsetPs(offset_ps); + event_builder.SetDurationPs(duration_ps); + for (const auto& stat_type_and_value : stats) { + StatType stat_type = stat_type_and_value.first; + const XStatValue& stat_value = stat_type_and_value.second; + XStatValueVisitor stat_value_visitor( + &event_builder, + plane_builder->GetOrCreateStatMetadata(GetStatTypeStr(stat_type))); + absl::visit(stat_value_visitor, stat_value); + } +} + +void CreateXEvent( + XPlaneBuilder* plane_builder, XLineBuilder* line_builder, + HostEventType event_type, int64 offset_ps, int64 duration_ps, + std::initializer_list> stats) { + CreateXEvent(plane_builder, line_builder, GetHostEventTypeStr(event_type), + offset_ps, duration_ps, stats); +} + +void CreateTfFunctionCallEvent(XPlaneBuilder* plane_builder, + XLineBuilder* line_builder, + absl::string_view function_name, int64 offset_ps, + int64 duration_ps, + absl::string_view execution_mode, + int64 tracing_count) { + if (tracing_count >= 0) { + // Adds the tracing_count stats only if tracing_count is valid. 
+ CreateXEvent(plane_builder, line_builder, function_name, offset_ps, + duration_ps, + {{StatType::kTfFunctionCall, execution_mode}, + {StatType::kTfFunctionTracingCount, tracing_count}}); + } else { + CreateXEvent(plane_builder, line_builder, function_name, offset_ps, + duration_ps, {{StatType::kTfFunctionCall, execution_mode}}); + } +} + +} // namespace profiler +} // namespace tensorflow diff --git a/tensorflow/core/profiler/utils/xplane_test_utils.h b/tensorflow/core/profiler/utils/xplane_test_utils.h new file mode 100644 index 00000000000..b42599baecd --- /dev/null +++ b/tensorflow/core/profiler/utils/xplane_test_utils.h @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_TEST_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_TEST_UTILS_H_ + +#include + +#include "absl/strings/string_view.h" +#include "absl/types/variant.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" + +namespace tensorflow { +namespace profiler { + +using XStatValue = absl::variant; + +void CreateXEvent( + XPlaneBuilder* plane_builder, XLineBuilder* line_builder, + absl::string_view event_name, int64 offset_ps, int64 duration_ps, + std::initializer_list> stats = {}); + +void CreateXEvent( + XPlaneBuilder* plane_builder, XLineBuilder* line_builder, + HostEventType event_type, int64 offset_ps, int64 duration_ps, + std::initializer_list> stats); + +void CreateTfFunctionCallEvent(XPlaneBuilder* plane_builder, + XLineBuilder* line_builder, + absl::string_view function_name, int64 offset_ps, + int64 duration_ps, + absl::string_view execution_mode, + int64 tracing_count = -1); +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_TEST_UTILS_H_ diff --git a/tensorflow/core/profiler/utils/xplane_utils.cc b/tensorflow/core/profiler/utils/xplane_utils.cc index 7f5221c5391..366711aab45 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.cc +++ b/tensorflow/core/profiler/utils/xplane_utils.cc @@ -28,7 +28,6 @@ limitations under the License. 
#include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/timespan.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" -#include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { @@ -138,71 +137,6 @@ void AddOrUpdateStrStat(int64 metadata_id, absl::string_view value, stat->set_str_value(std::string(value)); } -XEventBuilder CreateXEvent( - XPlaneBuilder* plane_builder, XLineBuilder* line_builder, - absl::string_view event_name, int64 offset_ps, int64 duration_ps, - const absl::flat_hash_map& stats) { - auto event_builder = line_builder->AddEvent( - *plane_builder->GetOrCreateEventMetadata(event_name)); - event_builder.SetOffsetPs(offset_ps); - event_builder.SetDurationPs(duration_ps); - for (const auto& stat_type_and_value : stats) { - event_builder.AddStatValue(*plane_builder->GetOrCreateStatMetadata( - GetStatTypeStr(stat_type_and_value.first)), - stat_type_and_value.second); - } - return event_builder; -} - -XEventBuilder CreateXEvent( - XPlaneBuilder* plane_builder, XLineBuilder* line_builder, - HostEventType event_type, int64 offset_ps, int64 duration_ps, - const absl::flat_hash_map& stats) { - return CreateXEvent(plane_builder, line_builder, - GetHostEventTypeStr(event_type), offset_ps, duration_ps, - stats); -} - -XEventBuilder CreateXEventWithStringViewMetadataValue( - XPlaneBuilder* plane_builder, XLineBuilder* line_builder, - absl::string_view event_name, int64 offset_ps, int64 duration_ps, - const absl::flat_hash_map& - stats) { - auto event_builder = line_builder->AddEvent( - *plane_builder->GetOrCreateEventMetadata(event_name)); - event_builder.SetOffsetPs(offset_ps); - event_builder.SetDurationPs(duration_ps); - for (const auto& stat_type_and_value : stats) { - event_builder.AddStatValue(*plane_builder->GetOrCreateStatMetadata( - GetStatTypeStr(stat_type_and_value.first)), - stat_type_and_value.second); - } - return event_builder; -} - -XEventBuilder CreateXEventWithIntAndStringViewMetadataValue( - XPlaneBuilder* plane_builder, XLineBuilder* line_builder, - absl::string_view event_name, int64 offset_ps, int64 duration_ps, - const absl::flat_hash_map& int_stats, - const absl::flat_hash_map& - str_stats) { - auto event_builder = line_builder->AddEvent( - *plane_builder->GetOrCreateEventMetadata(event_name)); - event_builder.SetOffsetPs(offset_ps); - event_builder.SetDurationPs(duration_ps); - for (const auto& stat_type_and_value : int_stats) { - event_builder.AddStatValue(*plane_builder->GetOrCreateStatMetadata( - GetStatTypeStr(stat_type_and_value.first)), - stat_type_and_value.second); - } - for (const auto& stat_type_and_value : str_stats) { - event_builder.AddStatValue(*plane_builder->GetOrCreateStatMetadata( - GetStatTypeStr(stat_type_and_value.first)), - stat_type_and_value.second); - } - return event_builder; -} - void RemovePlaneWithName(XSpace* space, absl::string_view name) { auto* planes = space->mutable_planes(); planes->erase( @@ -346,23 +280,5 @@ uint64 GetStartTimestampNs(const XPlane& plane) { return plane_timestamp; } -void CreateTfFunctionCallEvent(XPlaneBuilder* plane_builder, - XLineBuilder* line_builder, - absl::string_view function_name, int64 offset_ps, - int64 duration_ps, - absl::string_view execution_mode, - int64 tracing_count) { - XEventBuilder event_builder = CreateXEventWithStringViewMetadataValue( - plane_builder, line_builder, function_name, offset_ps, duration_ps, - {{StatType::kTfFunctionCall, execution_mode}}); - 
if (tracing_count >= 0) { - // Adds the tracing_count stats only if tracing_count is valid. - event_builder.AddStatValue( - *plane_builder->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kTfFunctionTracingCount)), - tracing_count); - } -} - } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/utils/xplane_utils.h b/tensorflow/core/profiler/utils/xplane_utils.h index 49087c49cd8..e1afb0bd56f 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.h +++ b/tensorflow/core/profiler/utils/xplane_utils.h @@ -21,8 +21,6 @@ limitations under the License. #include "absl/strings/string_view.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" -#include "tensorflow/core/profiler/utils/xplane_builder.h" -#include "tensorflow/core/profiler/utils/xplane_schema.h" namespace tensorflow { namespace profiler { @@ -47,31 +45,6 @@ void AddOrUpdateIntStat(int64 metadata_id, int64 value, void AddOrUpdateStrStat(int64 metadata_id, absl::string_view value, tensorflow::profiler::XEvent* event); -// Creates an XEvent with int64 stats. -XEventBuilder CreateXEvent( - XPlaneBuilder* plane_builder, XLineBuilder* line_builder, - absl::string_view event_name, int64 offset_ps, int64 duration_ps, - const absl::flat_hash_map& stats = {}); -XEventBuilder CreateXEvent( - XPlaneBuilder* plane_builder, XLineBuilder* line_builder, - HostEventType event_type, int64 offset_ps, int64 duration_ps, - const absl::flat_hash_map& stats); - -// Creates an XEvent with string stats. -XEventBuilder CreateXEventWithStringViewMetadataValue( - XPlaneBuilder* plane_builder, XLineBuilder* line_builder, - absl::string_view event_name, int64 offset_ps, int64 duration_ps, - const absl::flat_hash_map& - stats); - -// Creates an XEvent with int64 and string stats. -XEventBuilder CreateXEventWithIntAndStringViewMetadataValue( - XPlaneBuilder* plane_builder, XLineBuilder* line_builder, - absl::string_view event_name, int64 offset_ps, int64 duration_ps, - const absl::flat_hash_map& int_stats, - const absl::flat_hash_map& - str_stats); - void RemovePlaneWithName(XSpace* space, absl::string_view name); void RemoveEmptyPlanes(XSpace* space); void RemoveEmptyLines(XPlane* plane); @@ -103,13 +76,6 @@ void MergePlanes(const XPlane& src_plane, XPlane* dst_plane); // timestamps. If zero line exists, return 0; uint64 GetStartTimestampNs(const XPlane& plane); -// Creates a Xevent in the given plane & line for a tf-function. -void CreateTfFunctionCallEvent(XPlaneBuilder* plane_builder, - XLineBuilder* line_builder, - absl::string_view function_name, int64 offset_ps, - int64 duration_ps, - absl::string_view execution_mode, - int64 tracing_count = -1); } // namespace profiler } // namespace tensorflow From 37aaafb0c1baa7acd0607748326cc12faf556277 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 1 Jun 2020 19:20:02 -0700 Subject: [PATCH 1514/1533] [XLA:CPU] [XLA:GPU] Add support for double precision FFTs on CPU and GPU. 
PiperOrigin-RevId: 314250560 Change-Id: Ib9b4a7ea2ec2cc480db09e62bc35cfdcaf1c3b9a --- .../compiler/xla/service/cpu/ir_emitter.cc | 6 +- .../compiler/xla/service/cpu/runtime_fft.cc | 9 +- .../compiler/xla/service/cpu/runtime_fft.h | 3 +- .../xla/service/cpu/runtime_fft_impl.h | 109 +++++++++++------- .../cpu/runtime_single_threaded_fft.cc | 7 +- .../service/cpu/runtime_single_threaded_fft.h | 3 +- .../compiler/xla/service/gpu/fft_thunk.cc | 69 ++++++++++- .../compiler/xla/service/shape_inference.cc | 14 ++- .../xla/service/shape_inference_test.cc | 14 +-- 9 files changed, 165 insertions(+), 69 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 1e204afb001..998b9db132c 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1217,7 +1217,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { auto operand = fft->operand(0); TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( /*instruction=*/*fft, /*operands=*/{operand}, - /*supported_types=*/{F32, C64})); + /*supported_types=*/{F32, F64, C64, C128})); TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(operand->shape().layout())); TF_RET_CHECK(LayoutUtil::IsMonotonicWithDim0Major(fft->shape().layout())); VLOG(3) << "operand=" << ShapeUtil::HumanStringWithLayout(operand->shape()); @@ -1239,7 +1239,7 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { llvm::FunctionType* fft_type = llvm::FunctionType::get( b_.getVoidTy(), {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, - int64_type, int64_type, int64_type, int64_type}, + int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); bool multi_threaded_eigen = @@ -1258,6 +1258,8 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {GetExecutableRunOptionsArgument(), BitCast(GetEmittedValueFor(fft), int8_ptr_type), BitCast(operand_address, int8_ptr_type), b_.getInt32(fft->fft_type()), + b_.getInt32(operand->shape().element_type() == F64 || + operand->shape().element_type() == C128), b_.getInt32(fft_rank), b_.getInt64(input_batch), b_.getInt64(fft_rank > 0 ? fft_length[0] : 0), b_.getInt64(fft_rank > 1 ? 
fft_length[1] : 0), diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_fft.cc index 051120be324..0c1e9dae751 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft.cc @@ -28,13 +28,14 @@ using tensorflow::int64; TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenFft( const void* run_options_ptr, void* out, void* operand, int32 fft_type, - int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, - int64 fft_length2) { + int32 double_precision, int32 fft_rank, int64 input_batch, + int64 fft_length0, int64 fft_length1, int64 fft_length2) { const xla::ExecutableRunOptions* run_options = static_cast(run_options_ptr); XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr); tensorflow::xla::EigenFftImpl( *run_options->intra_op_thread_pool(), out, operand, - static_cast(fft_type), fft_rank, input_batch, - fft_length0, fft_length1, fft_length2); + static_cast(fft_type), + static_cast(double_precision), fft_rank, input_batch, fft_length0, + fft_length1, fft_length2); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_fft.h index f20c5aa0aa2..d95da172116 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft.h @@ -22,7 +22,8 @@ extern "C" { extern void __xla_cpu_runtime_EigenFft( const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, - void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + void* operand, tensorflow::int32 fft_type, + tensorflow::int32 double_precision, tensorflow::int32 fft_rank, tensorflow::int64 input_batch, tensorflow::int64 fft_length0, tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 04dea120a8d..124e7d589a0 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -39,8 +39,8 @@ static constexpr int kFftTypeArraySize = 4; namespace internal { // Computes either a forward or reverse complex-to-complex FFT. -template -void EigenFftC2C(const EigenDevice& device, complex64* out, complex64* operand, +template +void EigenFftC2C(const EigenDevice& device, Complex* out, Complex* operand, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { // Create the axes (which are always trailing). @@ -55,10 +55,10 @@ void EigenFftC2C(const EigenDevice& device, complex64* out, complex64* operand, for (int i = 0; i < FFTRank; i++) { dims[i + 1] = fft_shape[i]; } - const Eigen::TensorMap, + const Eigen::TensorMap, Eigen::Aligned> input(operand, dims); - Eigen::TensorMap, + Eigen::TensorMap, Eigen::Aligned> output(out, dims); output.device(device) = input.template fft(axes); @@ -66,8 +66,8 @@ void EigenFftC2C(const EigenDevice& device, complex64* out, complex64* operand, // Computes a forward real->complex FFT, slicing out redundant negative // frequencies from the innermost dimension. 
-template -void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, +template +void EigenFftR2C(const EigenDevice& device, Complex* out, Real* operand, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { const std::array fft_shape = { @@ -81,10 +81,10 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; } - const Eigen::TensorMap, + const Eigen::TensorMap, Eigen::Aligned> input(operand, in_dims); - Eigen::TensorMap, + Eigen::TensorMap, Eigen::Aligned> output(out, out_dims); @@ -92,7 +92,7 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Eigen::Tensor full_fft(in_dims); + Eigen::Tensor full_fft(in_dims); const Eigen::DSizes zero_start_indices; full_fft.device(device) = @@ -105,8 +105,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, // Computes a reverse complex->real FFT, reconstructing redundant negative // frequencies using reverse conjugate on innermost dimension after doing IFFT // on outer dimensions. -template -void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, +template +void EigenFftC2R(const EigenDevice& device, Real* out, Complex* operand, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { const std::array fft_shape = { @@ -120,10 +120,10 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; } - const Eigen::TensorMap, + const Eigen::TensorMap, Eigen::Aligned> input(operand, in_dims); - Eigen::TensorMap, + Eigen::TensorMap, Eigen::Aligned> output(out, out_dims); @@ -131,7 +131,7 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Eigen::Tensor full_fft(out_dims); + Eigen::Tensor full_fft(out_dims); // Calculate the starting point and range of the source of // negative frequency part. 
@@ -178,30 +178,59 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, - FftType fft_type, int64 input_batch, int64 fft_length0, - int64 fft_length1, int64 fft_length2) { + FftType fft_type, bool double_precision, + int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { switch (fft_type) { case FftType::FFT: - EigenFftC2C( - device, static_cast(out), - static_cast(operand), input_batch, fft_length0, - fft_length1, fft_length2); + if (double_precision) { + EigenFftC2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } else { + EigenFftC2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } break; case FftType::IFFT: - EigenFftC2C( - device, static_cast(out), - static_cast(operand), input_batch, fft_length0, - fft_length1, fft_length2); + if (double_precision) { + EigenFftC2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } else { + EigenFftC2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } break; case FftType::RFFT: - EigenFftR2C( - device, static_cast(out), static_cast(operand), - input_batch, fft_length0, fft_length1, fft_length2); + if (double_precision) { + EigenFftR2C( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } else { + EigenFftR2C( + device, static_cast(out), static_cast(operand), + input_batch, fft_length0, fft_length1, fft_length2); + } break; case FftType::IRFFT: - EigenFftC2R( - device, static_cast(out), static_cast(operand), - input_batch, fft_length0, fft_length1, fft_length2); + if (double_precision) { + EigenFftC2R( + device, static_cast(out), + static_cast(operand), input_batch, fft_length0, + fft_length1, fft_length2); + } else { + EigenFftC2R( + device, static_cast(out), static_cast(operand), + input_batch, fft_length0, fft_length1, fft_length2); + } break; default: // Unsupported FFT type @@ -213,22 +242,24 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, template void EigenFftImpl(const EigenDevice& device, void* out, void* operand, - FftType fft_type, int32 fft_rank, int64 input_batch, - int64 fft_length0, int64 fft_length1, int64 fft_length2) { + FftType fft_type, bool double_precision, int32 fft_rank, + int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { switch (fft_rank) { case 1: - internal::EigenFftWithRank<1, EigenDevice>( - device, out, operand, fft_type, input_batch, fft_length0, 0, 0); + internal::EigenFftWithRank<1, EigenDevice>(device, out, operand, fft_type, + double_precision, input_batch, + fft_length0, 0, 0); break; case 2: internal::EigenFftWithRank<2, EigenDevice>(device, out, operand, fft_type, - input_batch, fft_length0, - fft_length1, 0); + double_precision, input_batch, + fft_length0, fft_length1, 0); break; case 3: - internal::EigenFftWithRank<3, EigenDevice>(device, out, operand, fft_type, - input_batch, fft_length0, - fft_length1, fft_length2); + internal::EigenFftWithRank<3, EigenDevice>( + device, out, operand, fft_type, double_precision, input_batch, + fft_length0, fft_length1, fft_length2); break; default: // Unsupported FFT rank diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc 
b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc index d2780dd694e..9476dce5ced 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc @@ -24,10 +24,11 @@ using tensorflow::int64; TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( const void* run_options_ptr, void* out, void* operand, int32 fft_type, - int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, - int64 fft_length2) { + int32 double_precision, int32 fft_rank, int64 input_batch, + int64 fft_length0, int64 fft_length1, int64 fft_length2) { tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, static_cast(fft_type), - fft_rank, input_batch, fft_length0, fft_length1, + static_cast(double_precision), fft_rank, + input_batch, fft_length0, fft_length1, fft_length2); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h index dcd133d012c..2f0ccda2d10 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h @@ -22,7 +22,8 @@ extern "C" { extern void __xla_cpu_runtime_EigenSingleThreadedFft( const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, - void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + void* operand, tensorflow::int32 fft_type, + tensorflow::int32 double_precision, tensorflow::int32 fft_rank, tensorflow::int64 input_batch, tensorflow::int64 fft_length0, tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index 991a463f2a0..9d6be3c78ea 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -60,16 +60,18 @@ StatusOr> FftScratchAllocator::AllocateBytes( namespace { -se::fft::Type FftTypeToSeType(FftType type) { +se::fft::Type FftTypeToSeType(FftType type, bool double_precision) { switch (type) { case FftType::FFT: - return se::fft::Type::kC2CForward; + return double_precision ? se::fft::Type::kZ2ZForward + : se::fft::Type::kC2CForward; case FftType::IFFT: - return se::fft::Type::kC2CInverse; + return double_precision ? se::fft::Type::kZ2ZInverse + : se::fft::Type::kC2CInverse; case FftType::IRFFT: - return se::fft::Type::kC2R; + return double_precision ? se::fft::Type::kZ2D : se::fft::Type::kC2R; case FftType::RFFT: - return se::fft::Type::kR2C; + return double_precision ? 
se::fft::Type::kD2Z : se::fft::Type::kR2C; default: LOG(FATAL) << "unsupported fft type"; } @@ -78,12 +80,16 @@ se::fft::Type FftTypeToSeType(FftType type) { string FftTypeToString(se::fft::Type type) { switch (type) { case se::fft::Type::kC2CForward: + case se::fft::Type::kZ2ZForward: return "FFT"; case se::fft::Type::kC2CInverse: + case se::fft::Type::kZ2ZInverse: return "IFFT"; case se::fft::Type::kC2R: + case se::fft::Type::kZ2D: return "IRFFT"; case se::fft::Type::kR2C: + case se::fft::Type::kD2Z: return "RFFT"; default: LOG(FATAL) << "unknown fft type"; @@ -98,7 +104,9 @@ FftThunk::FftThunk(FftType fft_type, absl::Span fft_length, const Shape& input_shape, const Shape& output_shape, const HloInstruction* hlo) : Thunk(Kind::kFft, hlo), - fft_type_(FftTypeToSeType(fft_type)), + fft_type_( + FftTypeToSeType(fft_type, input_shape.element_type() == F64 || + input_shape.element_type() == C128)), fft_length_(fft_length.begin(), fft_length.end()), scale_factor_(1.0f), input_buffer_(input_buffer), @@ -166,6 +174,15 @@ Status FftThunk::ExecuteOnStream(const ExecuteParams& params) { stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); break; } + case se::fft::Type::kZ2ZForward: { + se::DeviceMemory input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + launch_ok = + stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); + break; + } case se::fft::Type::kC2CInverse: { se::DeviceMemory input_data( buffer_allocations.GetDeviceAddress(input_buffer_)); @@ -181,6 +198,22 @@ Status FftThunk::ExecuteOnStream(const ExecuteParams& params) { } break; } + case se::fft::Type::kZ2ZInverse: { + se::DeviceMemory input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + launch_ok = + stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); + if (launch_ok) { + launch_ok = + stream + .ThenBlasScal(ShapeUtil::ElementsIn(output_shape_), + complex128(scale_factor_), &output_data, 1) + .ok(); + } + break; + } case se::fft::Type::kR2C: { se::DeviceMemory input_data( buffer_allocations.GetDeviceAddress(input_buffer_)); @@ -190,6 +223,15 @@ Status FftThunk::ExecuteOnStream(const ExecuteParams& params) { stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); break; } + case se::fft::Type::kD2Z: { + se::DeviceMemory input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + launch_ok = + stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); + break; + } case se::fft::Type::kC2R: { se::DeviceMemory input_data( buffer_allocations.GetDeviceAddress(input_buffer_)); @@ -205,6 +247,21 @@ Status FftThunk::ExecuteOnStream(const ExecuteParams& params) { } break; } + case se::fft::Type::kZ2D: { + se::DeviceMemory input_data( + buffer_allocations.GetDeviceAddress(input_buffer_)); + se::DeviceMemory output_data( + buffer_allocations.GetDeviceAddress(output_buffer_)); + launch_ok = + stream.ThenFft(fft_plan_.get(), input_data, &output_data).ok(); + if (launch_ok) { + launch_ok = stream + .ThenBlasScal(ShapeUtil::ElementsIn(output_shape_), + scale_factor_, &output_data, 1) + .ok(); + } + break; + } default: LOG(FATAL) << "unsupported fft type"; } diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 0ea7912c95c..75a80747c1d 
100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -27,6 +27,7 @@ limitations under the License. #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -1856,7 +1857,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, switch (fft_type) { case FFT: case IFFT: - if (in.element_type() != C64) { + if (!primitive_util::IsComplexType(in.element_type())) { return InvalidArgument("%s requires complex input type, found %s.", FftType_Name(fft_type), PrimitiveType_Name(in.element_type())); @@ -1864,8 +1865,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, RET_CHECK_RANK(in); return in; case RFFT: { - if (in.element_type() != F32) { - return InvalidArgument("RFFT requires F32 input type, found %s.", + if (in.element_type() != F32 && in.element_type() != F64) { + return InvalidArgument("RFFT requires F32 or F64 input type, found %s.", PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); @@ -1880,7 +1881,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, fft_length[i]); } } - Shape result = ShapeUtil::ChangeElementType(in, C64); + Shape result = ShapeUtil::ChangeElementType( + in, in.element_type() == F32 ? C64 : C128); // Preserve the size of zero-sized dimensions. if (fft_length[fft_rank - 1] != 0) { result.set_dimensions(result.dimensions_size() - 1, @@ -1889,8 +1891,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation, return result; } case IRFFT: { - if (in.element_type() != C64) { - return InvalidArgument("IRFFT requires C64 input type, found %s.", + if (!primitive_util::IsComplexType(in.element_type())) { + return InvalidArgument("IRFFT requires complex input type, found %s.", PrimitiveType_Name(in.element_type())); } RET_CHECK_RANK(in); diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc index 448f5119546..b5ecf6e583e 100644 --- a/tensorflow/compiler/xla/service/shape_inference_test.cc +++ b/tensorflow/compiler/xla/service/shape_inference_test.cc @@ -615,8 +615,7 @@ namespace fft { static const char* unsupported_rank = "only supports ranks 1-3"; static const char* invalid_rank = "requires input of at least same rank"; static const char* requires_complex_input = "requires complex input type"; -static const char* requires_f32_input = "requires F32 input type"; -static const char* requires_c64_input = "requires C64 input type"; +static const char* requires_f32_input = "requires F32 or F64 input type"; static const char* dimensions_match = "innermost dimensions match fft_length"; static const char* innermost_dimension_matches = "innermost dimension matches fft_length/2+1"; @@ -654,7 +653,7 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestFftTypes) { Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8}); Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8}); fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input); - fft::Fail(shape_c128, type, {16, 8}, fft::requires_complex_input); + fft::Pass(shape_c128, type, {16, 8}, shape_c128); } TEST_F(ShapeInferenceTest, InferFftShapeTestIfftRanks) { @@ -672,7 +671,7 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestIfftTypes) { Shape 
shape_f32 = ShapeUtil::MakeShape(F32, {16, 8}); Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8}); fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input); - fft::Fail(shape_c128, type, {16, 8}, fft::requires_complex_input); + fft::Pass(shape_c128, type, {16, 8}, shape_c128); } TEST_F(ShapeInferenceTest, InferFftShapeTestRfftRanks) { @@ -747,9 +746,10 @@ TEST_F(ShapeInferenceTest, InferFftShapeTestIrfftDimensions) { TEST_F(ShapeInferenceTest, InferFftShapeTestIrfftTypes) { FftType type = FftType::IRFFT; Shape shape_f32 = ShapeUtil::MakeShape(F32, {16, 8}); - Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 8}); - fft::Fail(shape_f32, type, {16, 8}, fft::requires_c64_input); - fft::Fail(shape_c128, type, {16, 8}, fft::requires_c64_input); + Shape shape_c128 = ShapeUtil::MakeShape(C128, {16, 5}); + Shape shape_f64_out = ShapeUtil::MakeShape(F64, {16, 8}); + fft::Fail(shape_f32, type, {16, 8}, fft::requires_complex_input); + fft::Pass(shape_c128, type, {16, 8}, shape_f64_out); } TEST_F(ShapeInferenceTest, MapThatChangesElementType) { From 918731364a6ec2e5872bcce48ab06106f88e95f3 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 1 Jun 2020 19:50:39 -0700 Subject: [PATCH 1515/1533] Migrate uint8/float broadcast mul to use binary broadcast fivefold PiperOrigin-RevId: 314253577 Change-Id: I99e59937ecb693228cc2118fbcc4d8cd663fef03 --- .../internal/optimized/optimized_ops.h | 173 ++++-------------- 1 file changed, 33 insertions(+), 140 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 6f478daab68..59d0ba7bf6f 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -2565,144 +2565,6 @@ inline void Mul(const ArithmeticParams& params, MulElementwise(flat_size, params, input1_data, input2_data, output_data); } -inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, - const RuntimeShape& unswitched_input1_shape, - const uint8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const uint8* unswitched_input2_data, - const RuntimeShape& output_shape, - uint8* output_data) { - ruy::profiler::ScopeLabel label("BroadcastMulFivefold/8bit"); - - ArithmeticParams switched_params = unswitched_params; - switched_params.input1_offset = unswitched_params.input2_offset; - switched_params.input2_offset = unswitched_params.input1_offset; - - const bool use_unswitched = - unswitched_params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const ArithmeticParams& params = - use_unswitched ? unswitched_params : switched_params; - const uint8* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const uint8* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise Mul of - // sections of the arrays. 
- uint8* output_data_ptr = output_data; - const uint8* input1_data_ptr = input1_data; - const uint8* input2_data_reset = input2_data; - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - input1_data_ptr += y4; - } - } - input2_data_reset = input2_data_ptr; - } - } else { - for (int i0 = 0; i0 < y0; ++i0) { - const uint8* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - MulSimpleBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y3; - output_data_ptr += y3; - ++input1_data_ptr; - } - } - input2_data_reset = input2_data_ptr; - } - } -} - -inline void BroadcastMulFivefold(const ArithmeticParams& params, - const RuntimeShape& unswitched_input1_shape, - const float* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const float* unswitched_input2_data, - const RuntimeShape& output_shape, - float* output_data) { - ruy::profiler::ScopeLabel label("BroadcastMulFivefold/float"); - - const bool use_unswitched = - params.broadcast_category == - tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; - - const float* input1_data = - use_unswitched ? unswitched_input1_data : unswitched_input2_data; - const float* input2_data = - use_unswitched ? unswitched_input2_data : unswitched_input1_data; - - // Fivefold nested loops. The second input resets its position for each - // iteration of the second loop. The first input resets its position at the - // beginning of the fourth loop. The innermost loop is an elementwise Mul of - // sections of the arrays. - float* output_data_ptr = output_data; - const float* input1_data_ptr = input1_data; - const float* input2_data_reset = input2_data; - int y0 = params.broadcast_shape[0]; - int y1 = params.broadcast_shape[1]; - int y2 = params.broadcast_shape[2]; - int y3 = params.broadcast_shape[3]; - int y4 = params.broadcast_shape[4]; - if (y4 > 1) { - for (int i0 = 0; i0 < y0; ++i0) { - const float* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - for (int i3 = 0; i3 < y3; ++i3) { - MulElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y4; - output_data_ptr += y4; - } - input1_data_ptr += y4; - } - } - input2_data_reset = input2_data_ptr; - } - } else { - for (int i0 = 0; i0 < y0; ++i0) { - const float* input2_data_ptr = nullptr; - for (int i1 = 0; i1 < y1; ++i1) { - input2_data_ptr = input2_data_reset; - for (int i2 = 0; i2 < y2; ++i2) { - // The input may be switched here, but the common parameters here - // do not matter as they will not influence the float math execution. 
- MulSimpleBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); - input2_data_ptr += y3; - output_data_ptr += y3; - ++input1_data_ptr; - } - } - input2_data_reset = input2_data_ptr; - } - } -} - template inline void BroadcastMulDispatch( const ArithmeticParams& params, const RuntimeShape& input1_shape, @@ -2713,10 +2575,41 @@ inline void BroadcastMulDispatch( input2_data, output_shape, output_data); } - BroadcastMulFivefold(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + BinaryBroadcastFiveFold( + params, input1_shape, input1_data, input2_shape, input2_data, + output_shape, output_data, + static_cast(MulElementwise), + static_cast( + MulSimpleBroadcast)); } +inline void BroadcastMulFivefold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const uint8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const uint8* unswitched_input2_data, + const RuntimeShape& output_shape, + uint8* output_data) { + BroadcastMulDispatch(unswitched_params, unswitched_input1_shape, + unswitched_input1_data, unswitched_input2_shape, + unswitched_input2_data, output_shape, output_data); +} + +inline void BroadcastMulFivefold(const ArithmeticParams& params, + const RuntimeShape& unswitched_input1_shape, + const float* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const float* unswitched_input2_data, + const RuntimeShape& output_shape, + float* output_data) { + BroadcastMulDispatch(params, unswitched_input1_shape, unswitched_input1_data, + unswitched_input2_shape, unswitched_input2_data, + output_shape, output_data); +} + + + // TODO(jiawen): We can implement BroadcastDiv on buffers of arbitrary // dimensionality if the runtime code does a single loop over one dimension // that handles broadcasting as the base case. The code generator would then From df7fd4acda99caf4c272552ea7a1d84444463f97 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 19:54:35 -0700 Subject: [PATCH 1516/1533] [MLIR:TF] TPU space to depth pass. Add pass that does space to depth transformation at compile time for convolution that incur excessive TPU padding. 
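For intuition, the host-side rewrite is the standard NHWC space-to-depth rearrangement. Below is a minimal NumPy sketch (illustrative only, not part of this patch; the helper name space_to_depth_nhwc is invented here) of what the inserted tf.SpaceToDepth(block_size=2) op does to the 2x224x224x3 input used in the accompanying test:

  import numpy as np

  def space_to_depth_nhwc(x, block_size):
      # x: [N, H, W, C]; H and W are assumed divisible by block_size.
      n, h, w, c = x.shape
      x = x.reshape(n, h // block_size, block_size, w // block_size, block_size, c)
      # Move the two intra-block dimensions next to the channel dimension ...
      x = x.transpose(0, 1, 3, 2, 4, 5)
      # ... and fold them into it: C -> C * block_size**2.
      return x.reshape(n, h // block_size, w // block_size,
                       c * block_size * block_size)

  x = np.zeros((2, 224, 224, 3), dtype=np.float32)
  print(space_to_depth_nhwc(x, 2).shape)  # (2, 112, 112, 12)

On the device side, the first convolution's stride then drops from [1, 2, 2, 1] to [1, 1, 1, 1] and the 7x7x3x64 filter is padded/reshaped/transposed into 4x4x12x64, so the convolution consumes the depth-packed 2x112x112x12 input directly (see the pass comments and the MLIR test below).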
PiperOrigin-RevId: 314254011 Change-Id: Id4e9e46a954e05e17a38023bf34615773705aeba --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../tests/tpu_space_to_depth_pass.mlir | 87 +++ .../transforms/tpu_space_to_depth_pass.cc | 703 ++++++++++++++++++ 3 files changed, 791 insertions(+) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 5110ea7fbf5..05b2f891676 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -452,6 +452,7 @@ cc_library( "transforms/tpu_merge_variables_with_execute.cc", "transforms/tpu_rewrite_pass.cc", "transforms/tpu_sharding_identification_pass.cc", + "transforms/tpu_space_to_depth_pass.cc", "transforms/tpu_variable_runtime_reformatting.cc", "translate/breakup-islands.cc", "translate/control_to_executor_dialect.cc", diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir new file mode 100644 index 00000000000..aa333caa2ae --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_space_to_depth_pass.mlir @@ -0,0 +1,87 @@ +// RUN: tf-opt %s -split-input-file -tf-tpu-space-to-depth-pass | FileCheck %s --dump-input=fail + +// Tests for space to depth host and device transform. + +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:0" = {}, "/job:localhost/replica:0/task:0/device:TPU:1" = {}, "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0" = {}}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 390 : i32}} { + func @main(%arg0: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg1: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg2: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg3: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg4: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg5: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg6: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg7: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}) attributes {tf.entry_function = {control_outputs = "while", inputs = "iterator,iterator_1,iterator_2,iterator_3,while_input_6,while_input_7,while_input_8,while_input_9", outputs = ""}} { + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %3:10 = "tf.While"(%2, %1, %2, %0, %1, %arg2, %arg4, %arg5, %arg6, %arg7) {_lower_using_switch_merge = true, _num_original_outputs = 10 : i64, _read_only_resource_inputs = [], body = @while_body_2710, cond = @while_cond_2700, device = "", is_stateless = false, output_shapes = [#tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>, #tf.shape<>], parallel_iterations = 10 : i64} : (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) + return + } + // CHECK-LABEL: func @while_body_2710 + func 
@while_body_2710(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg6: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg7: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg8: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg9: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>>) attributes {tf.signature.is_stateful} { + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[INPUT:.*]] = "tf.IteratorGetNext" + %1 = "tf.IteratorGetNext"(%arg5) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor<2x224x224x3xf32> + // CHECK-DAG: %[[SPACETODEPTH0:.*]] = "tf.SpaceToDepth"([[INPUT:.*]]) {block_size = 2 : i64, data_format = "NHWC"} : (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> + %2 = "tf.AddV2"(%arg2, %arg3) {device = ""} : (tensor, tensor) -> tensor + %3 = "tf.ReadVariableOp"(%arg6) : (tensor>>) -> tensor<7x7x3x64xf32> + %4 = "tf.ReadVariableOp"(%arg8) : (tensor>>) -> tensor + %5 = "tf.ReadVariableOp"(%arg7) : (tensor>>) -> tensor + %6 = "tf.ReadVariableOp"(%arg9) : (tensor>>) -> tensor + %7:2 = "tf_device.cluster_func"(%1, %3, %5, %6) {_tpu_replicate = "while/cluster_while_body_271", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [0, 0, 0, 0], func = @_func, host_compute_core = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], num_cores_per_replica = 1 : i64, output_sharding_configuration = ["\08\01\1A\01\01\22\01\00", "\08\01\1A\01\01\22\01\00"], padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01", use_tpu = true} : (tensor<2x224x224x3xf32>, tensor<7x7x3x64xf32>, tensor, tensor) -> (tensor<7x7x3x64xf32>, tensor) + "tf.AssignVariableOp"(%arg6, %7#0) : (tensor>>, tensor<7x7x3x64xf32>) -> () + "tf.AssignVariableOp"(%arg9, %7#1) : (tensor>>, tensor) -> () + %8 = "tf.Identity"(%arg1) {device = ""} : (tensor) -> tensor + %9 = "tf.Identity"(%2) {device = ""} : (tensor) -> tensor + %10 = "tf.AddV2"(%arg0, %0) {device = ""} : (tensor, tensor) -> tensor + %11 = "tf.Identity"(%10) {device = ""} : (tensor) -> tensor + return %11, %8, %9, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8, %arg9 : tensor, tensor, tensor, tensor, tensor, tensor, tensor>>, tensor>>, tensor>>, tensor>> + } + func @while_cond_2700(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}, %arg6: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg7: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg8: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}, %arg9: tensor>> {tf.device = "/job:localhost/replica:0/task:0/device:TPU:0"}) -> tensor { + %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %1 = "tf.GreaterEqual"(%arg3, %0) {device = ""} : (tensor, tensor) -> tensor + %2 = "tf.Less"(%arg3, %0) {device = ""} : (tensor, tensor) -> tensor + %3 = "tf.Greater"(%arg2, %arg4) {device = ""} : (tensor, tensor) -> tensor + %4 = "tf.LogicalAnd"(%2, %3) {device = ""} : (tensor, tensor) -> tensor + %5 = 
"tf.Less"(%arg2, %arg4) {device = ""} : (tensor, tensor) -> tensor + %6 = "tf.LogicalAnd"(%1, %5) {device = ""} : (tensor, tensor) -> tensor + %7 = "tf.LogicalOr"(%6, %4) {device = ""} : (tensor, tensor) -> tensor + %8 = "tf.Less"(%arg0, %arg1) {device = ""} : (tensor, tensor) -> tensor + %9 = "tf.LogicalAnd"(%8, %7) {device = ""} : (tensor, tensor) -> tensor + %10 = "tf.Identity"(%9) {device = ""} : (tensor) -> tensor + return %10 : tensor + } + // CHECK-LABEL: func @_func + // CHECK-SAME: [[FUNCINPUT0:.*]]: tensor<2x112x112x12xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, [[FUNCINPUT1:%.*]]: tensor<7x7x3x64xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, [[FUNCINPUT2:%.*]]: tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, [[VAL_59:%.*]]: tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}) -> (tensor<7x7x3x64xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}) attributes {sym_visibility = "private"} { + func @_func(%arg0: tensor<2x224x224x3xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg1: tensor<7x7x3x64xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg2: tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, %arg3: tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}) -> (tensor<7x7x3x64xf32> {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}, tensor {xla_hlo.sharding = "\08\01\1A\01\01\22\01\00"}) attributes {sym_visibility = "private"} { + %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %1 = "tf.Const"() {value = dense<0> : tensor<1x1xi32>} : () -> tensor<1x1xi32> + %2 = "tf.Const"() {value = dense<[7, 7, 3, 64]> : tensor<4xi32>} : () -> tensor<4xi32> + %3 = "tf.Const"() {value = dense<[[0, 0], [3, 3], [3, 3], [0, 0]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> + %4 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %5 = "tf.Pad"(%arg0, %3) : (tensor<2x224x224x3xf32>, tensor<4x2xi32>) -> tensor<2x230x230x3xf32> + // CHECK: "tf.Conv2D" + // CHECK-SAME: strides = [1, 1, 1, 1] + // CHECK-SAME: (tensor<2x115x115x12xf32>, tensor<4x4x12x64xf32>) -> tensor<2x112x112x64xf32> + %6 = "tf.Conv2D"(%5, %arg1) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<2x230x230x3xf32>, tensor<7x7x3x64xf32>) -> tensor<2x112x112x64xf32> + // CHECK: %[[BACKPROP:.*]] = "tf.Conv2DBackpropFilter" + // CHECK-SAME: strides = [1, 1, 1, 1] + // CHECK-SAME: (tensor<2x115x115x12xf32>, tensor<4xi32>, tensor<2x112x112x64xf32>) -> tensor<4x4x12x64xf32> + %7 = "tf.Conv2DBackpropFilter"(%5, %2, %6) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} : (tensor<2x230x230x3xf32>, tensor<4xi32>, tensor<2x112x112x64xf32>) -> tensor<7x7x3x64xf32> + // CHECK: %[[CONST0:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: [4, 4, 2, 2, 3, 64] + // CHECK: %[[RESHAPE0:.*]] = "tf.Reshape"(%[[BACKPROP:.*]], %[[CONST0:.*]]) : (tensor<4x4x12x64xf32>, tensor<6xi64>) -> tensor<4x4x2x2x3x64xf32> + // CHECK: %[[CONST1:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: [0, 2, 1, 3, 4, 5] + // CHECK: %[[TRANSPOSE0:.*]] = "tf.Transpose"(%[[RESHAPE0:.*]], %[[CONST1:.*]]) : (tensor<4x4x2x2x3x64xf32>, tensor<6xi32>) -> tensor<4x2x4x2x3x64xf32> + // CHECK: %[[CONST2:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: [8, 8, 3, 64] + // CHECK: %[[RESHAPE1:.*]] = "tf.Reshape"(%[[TRANSPOSE1:.*]], %[[CONST2:.*]]) : 
(tensor<4x2x4x2x3x64xf32>, tensor<4xi64>) -> tensor<8x8x3x64xf32> + // CHECK: %[[CONST3:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: [7, 7, 3, 64] + // CHECK: %[[CONST4:.*]] = "tf.Const"() {value = dense< + // CHECK-SAME: 0 + // CHECK: %[[SLICE0:.*]] = "tf.Slice"(%[[RESHAPE1:.*]], %[[CONST4:.*]], %[[CONST3:.*]]) : (tensor<8x8x3x64xf32>, tensor<4xi64>, tensor<4xi32>) -> tensor<7x7x3x64xf32> + %8 = "tf.CrossReplicaSum"(%7, %1) : (tensor<7x7x3x64xf32>, tensor<1x1xi32>) -> tensor<7x7x3x64xf32> + %9 = "tf.Mul"(%arg2, %8) : (tensor, tensor<7x7x3x64xf32>) -> tensor<7x7x3x64xf32> + %10 = "tf.Sub"(%arg1, %9) : (tensor<7x7x3x64xf32>, tensor<7x7x3x64xf32>) -> tensor<7x7x3x64xf32> + %11 = "tf.AddV2"(%arg3, %0) : (tensor, tensor) -> tensor + return %10, %11 : tensor<7x7x3x64xf32>, tensor + } +} + +// ---- + diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc new file mode 100644 index 00000000000..7befa68f3d8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_space_to_depth_pass.cc @@ -0,0 +1,703 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/StandardTypes.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" +#include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace mlir { +namespace TFTPU { + +namespace { + +constexpr char kDeviceAttr[] = "device"; +typedef std::pair Conv2DWithBlockSize; + +// A pass that applies automatic space 
to depth transform for the first or +// frontier convolutions consume host inputs on TPU. +// This is done by adding space to depth transform op after host input and +// applying space to depth transform for the first convolution and its backprop +// filter on TPU. +// +// Example: original program: +// +// module { +// func @while_body { +// %input = "tf.IteratorGetNext"(...) {device = "/CPU:0"}: +// -> tensor<2x224x224x3xf32> +// %device_launch = "tf_device.cluster_func"(%input,...) {func = @_func,...) +// return ... +// } +// func @_func(%input: tensor<2x224x224x3xf32>, +// %filter: tensor<7x7x3x64xf32>) { +// %6 = "tf.Conv2D"(%input, %filter) {strides = [1, 2, 2, 1]}: +// (tensor<2x230x230x3xf32>, tensor<7x7x3x64xf32>) -> +// tensor<2x112x112x64xf32> +// } +// } +// +// With this pass, the program will be transformed into: +// module { +// func @while_body { +// %input = "tf.IteratorGetNext"(...) {device = "/CPU:0"} +// -> tensor<2x224x224x3xf32> +// %space_to_depth = "tf.SpaceToDepth"(%input) {block_size = 2, ...}: +// (tensor<2x224x224x3xf32>) -> tensor<2x112x112x12xf32> +// %device_launch = "tf_device.cluster_func"(%space_to_depth,...) +// {func = @_func,...) +// return ... +// } +// func @_func(%input: tensor<2x112x112x12xf32>, +// %filter: tensor<7x7x3x64xf32>) { +// %filter_transform = "tf.Pad/tf.Transpose/tf.Reshape"(%filter): +// tensor<7x7x3x64xf32>) -> tensor<4x4x12x64xf32> +// %conv = "tf.Conv2D"(%input, %filter_transfrom) {strides = [1, 1, 1, 1]}: +// (tensor<2x112x112x12xf32>, tensor<4x4x12x64xf32>) -> +// tensor<2x112x112x64xf32> +// } +// } +// +// This way, the first convolution with 3 feature dimension will be transformed +// to 12 feature dimension, which has better performance on TPU. +// +// TODO(wangtao): add a pass to check if it is profitable to space to depth +// transform and invoke the transform if it is needed. +struct TPUSpaceToDepthPass + : public PassWrapper> { + void runOnOperation() override; +}; + +// Handle padding before convolution for space to depth transform. +LogicalResult HandlePad(TF::PadOp op, int32_t kernel_size, int32_t block_size) { + auto ranked_type = op.input().getType().dyn_cast(); + if (!ranked_type) return failure(); + auto pad_input_shape = ranked_type.getShape(); + Location loc = op.getLoc(); + OpBuilder builder(op); + builder.setInsertionPoint(op); + auto padding_type = RankedTensorType::get({4, 2}, builder.getIntegerType(32)); + + // Calculate paddings. + int32_t pad_total = kernel_size - 1; + int32_t pad_beg = (pad_total / 2 + 1) / block_size; + int32_t pad_end = (pad_total / 2) / block_size; + SmallVector values = {0, 0, pad_beg, pad_end, + pad_beg, pad_end, 0, 0}; + auto paddings = DenseIntElementsAttr::get(padding_type, values); + // Update pad_op paddings. + op.setOperand(1, builder.create(loc, paddings)); + + // Set input type. + auto input = op.getOperand(0); + SmallVector transform_shape = { + pad_input_shape[0], pad_input_shape[1] / block_size, + pad_input_shape[2] / block_size, + pad_input_shape[3] * block_size * block_size}; + auto transform_result_type = + RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); + input.setType(transform_result_type); + op.setOperand(0, input); + return success(); +} + +// Handle stride for the first convolution for the transform. 
+void HandleConv2DStride(TF::Conv2DOp conv2d) { + MLIRContext* context = conv2d.getContext(); + SmallVector values = {1, 1, 1, 1}; + auto attrs = llvm::map_range(values, [context](int64_t v) -> Attribute { + return IntegerAttr::get(IntegerType::get(64, context), v); + }); + // TODO(b/157276506): change type of strides to DenseElementsAttr + auto strides = ArrayAttr::get(llvm::to_vector<4>(attrs), context); + conv2d.setAttr("strides", strides); +} + +// Transform input shape for the first convolution. +void HandleConv2DInput(TF::Conv2DOp conv2d, int64_t block_size) { + auto input = conv2d.input(); + auto input_shape = input.getType().cast().getShape(); + SmallVector transform_shape = { + input_shape[0], input_shape[1] / block_size, input_shape[2] / block_size, + input_shape[3] * block_size * block_size}; + auto transform_result_type = + RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); + input.setType(transform_result_type); +} + +// Add padding for convolution filter for space to depth transform. +TF::PadOp GetPadOpForConv2DFilter(ArrayRef filter_shape, Value filter, + OpBuilder* builder, int32_t pad_h, + int32_t pad_w) { + SmallVector values = {pad_h, 0, pad_w, 0, 0, 0, 0, 0}; + auto padding_type = + RankedTensorType::get({4, 2}, builder->getIntegerType(32)); + auto paddings = DenseIntElementsAttr::get(padding_type, values); + auto paddings_value = builder->create(filter.getLoc(), paddings); + std::vector pad_shape = {filter_shape[0] + pad_h, + filter_shape[1] + pad_w, filter_shape[2], + filter_shape[3]}; + SmallVector expand_shape(pad_shape.begin(), pad_shape.end()); + + auto expand_result_type = + RankedTensorType::get(expand_shape, getElementTypeOrSelf(filter)); + return builder->create(filter.getLoc(), expand_result_type, filter, + paddings_value); +} + +// Create reshape op for space to depth transform. +TF::ReshapeOp GetReshapeOpForConv2DFilter(ArrayRef new_shape, + Value input, OpBuilder* builder) { + auto reshape_result_type = + RankedTensorType::get(new_shape, getElementTypeOrSelf(input)); + auto reshape_type = RankedTensorType::get( + {static_cast(new_shape.size())}, builder->getIntegerType(64)); + auto reshape_sizes = DenseIntElementsAttr::get(reshape_type, new_shape); + auto reshape_value = + builder->create(input.getLoc(), reshape_sizes); + return builder->create(input.getLoc(), reshape_result_type, + input, reshape_value); +} + +// Create transpose op for shape to depth transform. +TF::TransposeOp GetTransposeOpForConv2DFilter(OpBuilder* builder, Value input) { + SmallVector permutation = {0, 2, 1, 3, 4, 5}; + auto permute_type = RankedTensorType::get({6}, builder->getIntegerType(32)); + auto permute_attr = DenseIntElementsAttr::get(permute_type, permutation); + auto permute_value = + builder->create(input.getLoc(), permute_attr); + return builder->create(input.getLoc(), input, permute_value); +} + +void HandleConv2DFilter(TF::Conv2DOp conv2d, int64_t block_size) { + // For example, if filter shape is [7, 7, 3, 64] with block_size 2, + // will apply below transforms to the filter: + // 1. Pad the filter to [8, 8, 3, 64] + // 2. Reshape to [4, 2, 4, 2, 3, 64] + // 3. Transpose to [4, 4, 2, 2, 3, 64] + // 4. Reshape to [4, 4, 12, 64] + auto filter = conv2d.filter(); + OpBuilder builder(conv2d); + builder.setInsertionPoint(conv2d); + // Book keeping filter information. 
+ auto filter_shape = filter.getType().cast().getShape(); + int64_t height = filter_shape[0]; + int64_t width = filter_shape[1]; + int64_t channel = filter_shape[2]; + int64_t out_channel = filter_shape[3]; + // Value/Op before reshape op. + Value before_reshape_value = filter; + if (height % block_size != 0 || width % block_size != 0) { + // Calculate paddings for height and width. + int32_t pad_h = block_size - height % block_size; + int32_t pad_w = block_size - width % block_size; + auto pad_op = + GetPadOpForConv2DFilter(filter_shape, filter, &builder, pad_h, pad_w); + // Update op, height and width before reshape. + before_reshape_value = pad_op; + height = height + pad_h; + width = width + pad_w; + } + + // Reshape. + SmallVector new_shape = { + height / block_size, block_size, width / block_size, + block_size, channel, out_channel}; + auto reshape_op = + GetReshapeOpForConv2DFilter(new_shape, before_reshape_value, &builder); + + // Transpose. + auto transpose_op = GetTransposeOpForConv2DFilter(&builder, reshape_op); + + // Reshape Back. + SmallVector final_shape = { + height / block_size, width / block_size, + channel * block_size * block_size, out_channel}; + auto final_reshape_op = + GetReshapeOpForConv2DFilter(final_shape, transpose_op, &builder); + // Update filter of Conv2D. + conv2d.setOperand(1, final_reshape_op); +} + +// Create slice op for filter in back prop pass. +TF::SliceOp GetSliceOpForConv2DBackPropFilter( + ArrayRef old_filter_shape, Value input, OpBuilder* builder) { + SmallVector slice_size(old_filter_shape.begin(), + old_filter_shape.end()); + auto slice_result_type = + RankedTensorType::get(slice_size, getElementTypeOrSelf(input)); + auto slice_size_op = builder->create( + input.getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get({4}, builder->getIntegerType(32)), + old_filter_shape)); + SmallVector slice_start_position = {0, 0, 0, 0}; + auto start_position_type = + RankedTensorType::get({4}, builder->getIntegerType(64)); + auto start_position = builder->create( + input.getLoc(), + DenseIntElementsAttr::get(start_position_type, slice_start_position)); + return builder->create(input.getLoc(), slice_result_type, input, + start_position, slice_size_op); +} + +// Transform Conv2DBackPropFilter for space to depth. +void HandleConv2DBackPropFilter(TF::Conv2DBackpropFilterOp backprop, + ArrayRef old_filter_shape, + ArrayRef new_filter_shape, + int64_t block_size) { + OpBuilder builder(backprop); + builder.setInsertionPoint(backprop); + + auto input = backprop.input(); + // Get new filter size from new_filter_shape. + auto new_filter_sizes = builder.create( + backprop.getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get({4}, builder.getIntegerType(32)), + new_filter_shape)); + + // Set stride to [1, 1, 1, 1]. + MLIRContext* context = backprop.getContext(); + SmallVector values = {1, 1, 1, 1}; + auto attrs = llvm::map_range(values, [context](int64_t v) -> Attribute { + return IntegerAttr::get(IntegerType::get(64, context), APInt(64, v)); + }); + auto strides = ArrayAttr::get(llvm::to_vector<4>(attrs), context); + + // new result type. + SmallVector new_shape(new_filter_shape.begin(), + new_filter_shape.end()); + auto new_result_type = + RankedTensorType::get(new_shape, getElementTypeOrSelf(input)); + + // Build new BackPropFilterOp. 
+ auto loc = backprop.getLoc(); + auto new_backprop = builder.create( + loc, new_result_type, input, new_filter_sizes, backprop.out_backprop(), + strides, backprop.use_cudnn_on_gpu(), backprop.padding(), + backprop.explicit_paddings(), backprop.data_format(), + backprop.dilations()); + + // For example, if new filter shape is [4, 4, 12, 64], old filter shape + // is [7, 7, 3, 64] with block_size 2. + // Below transforms will be applied to the filter: + // 1. Reshape to [4, 4, 2, 2, 3, 64]; + // 2. Transpose to [4, 2, 4, 2, 3, 64]; + // 3. Reshape to [8, 8, 3, 64]; + // 4. Slice to [7, 7, 3, 64]. + SmallVector first_reshape_shape = { + new_filter_shape[0], + new_filter_shape[1], + block_size, + block_size, + new_filter_shape[2] / (block_size * block_size), + new_filter_shape[3]}; + auto first_reshape_op = + GetReshapeOpForConv2DFilter(first_reshape_shape, new_backprop, &builder); + + // Transpose. + auto transpose_op = GetTransposeOpForConv2DFilter(&builder, first_reshape_op); + + // Last Reshape op. + SmallVector last_reshape_shape = { + new_filter_shape[0] * block_size, new_filter_shape[1] * block_size, + new_filter_shape[2] / (block_size * block_size), new_filter_shape[3]}; + auto final_reshape_op = + GetReshapeOpForConv2DFilter(last_reshape_shape, transpose_op, &builder); + + // create slice op. + auto slice_op = GetSliceOpForConv2DBackPropFilter(old_filter_shape, + final_reshape_op, &builder); + + // Update backprop's user with the slice op. + backprop.replaceAllUsesWith(slice_op.getResult()); +} + +// Update func arugument type to have the updated input shape. +void UpdateFuncType(FuncOp func) { + llvm::SmallVector arg_types; + arg_types.reserve(func.getNumArguments()); + for (auto arg : func.getArguments()) arg_types.emplace_back(arg.getType()); + auto terminator = func.front().getTerminator(); + SmallVector result_types(terminator->operand_type_begin(), + terminator->operand_type_end()); + func.setType(FunctionType::get(arg_types, result_types, func.getContext())); +} + +void HandleFuncOp(Operation* op) { + auto func = llvm::cast(op); + UpdateFuncType(func); +} + +// Checks if the input producer op is supported in this transform. Right now, we +// only check if it is a host tf.IteratorGetNext. +bool IsSupportedHostInputOp(Operation* op) { + TF::IteratorGetNextOp iter = llvm::dyn_cast(op); + if (!iter) return false; + auto device = op->getAttrOfType(kDeviceAttr); + if (!device) return false; + tensorflow::DeviceNameUtils::ParsedName parsed_device; + if (!tensorflow::DeviceNameUtils::ParseFullName(device.getValue().str(), + &parsed_device)) { + return false; + } + return parsed_device.type == "CPU"; +} + +// Builds a SpaceToDepthOp with the given get_layout op and input. +TF::SpaceToDepthOp BuildSpaceToDepth(tf_device::ClusterFuncOp cluster_func, + Value input, int32_t block_size, + ArrayRef input_shape) { + auto input_op = input.getDefiningOp(); + OpBuilder builder(input_op); + builder.setInsertionPointAfter(input_op); + SmallVector transform_shape = { + input_shape[0], input_shape[1] / block_size, input_shape[2] / block_size, + input_shape[3] * block_size * block_size}; + auto transform_result_type = + RankedTensorType::get(transform_shape, getElementTypeOrSelf(input)); + return builder.create(cluster_func.getLoc(), + transform_result_type, input, + APInt(64, block_size)); +} + +// Performs transformation for a non-replicated input. 
+TF::SpaceToDepthOp HandleHostInput(Value input, int64_t index, + tf_device::ClusterFuncOp cluster_func, + int32_t block_size, + ArrayRef input_shape) { + auto space_to_depth = + BuildSpaceToDepth(cluster_func, input, block_size, input_shape); + cluster_func.setOperand(index, space_to_depth); + return space_to_depth; +} + +// Performs transformation for replicated inputs. Returns true if this is a +// supported case (thus transform happened). +bool HandleHostReplicatedInputs(int64_t index, + tf_device::ClusterFuncOp cluster_func, + int64_t replicate_arg_index, + tf_device::ReplicateOp replicate, + int32_t block_size) { + // We need to know the devices to copy to. + if (!replicate.devices()) return false; + int64_t num_replicas = replicate.n().getZExtValue(); + // Gets inputs at replicate_arg_index for each replica. + auto inputs = replicate.getOperands() + .drop_front(replicate_arg_index * num_replicas) + .take_front(num_replicas); + for (auto input : inputs) { + auto input_op = input.getDefiningOp(); + if (!input_op || !IsSupportedHostInputOp(input_op)) return false; + } + for (auto entry : llvm::enumerate(inputs)) { + auto ranked_type = entry.value().getType().dyn_cast(); + if (!ranked_type) return false; + auto input_shape = ranked_type.getShape(); + auto space_to_depth = + BuildSpaceToDepth(cluster_func, entry.value(), block_size, input_shape); + replicate.setOperand(num_replicas * replicate_arg_index + entry.index(), + space_to_depth); + } + return true; +} + +// Performs transformation on a pair of execute and compile ops. The compile +// should not have other uses. +void HandleCluster(tf_device::ClusterFuncOp cluster_func, int32_t block_size, + unsigned arg_num) { + auto maybe_replicate = + llvm::dyn_cast(cluster_func.getParentOp()); + + llvm::SmallVector transform_input_indices; + for (auto input : llvm::enumerate(cluster_func.operands())) { + if (auto block_arg = input.value().dyn_cast()) { + if (block_arg.getArgNumber() != arg_num) continue; + // For a block argument, consider transforms only when it is a replicated + // input (defining ops will be outside the replicate node). + if (maybe_replicate == block_arg.getParentRegion()->getParentOp()) { + HandleHostReplicatedInputs(input.index(), cluster_func, + block_arg.getArgNumber(), maybe_replicate, + block_size); + } + } else { + // For an op output, consider transforms only when 1) there is no + // replicateion or 2) it is outside the replicate node that encloses the + // execute node. (Because if the op is inside replicate, it is probably + // not on the host.) + if (input.index() != arg_num) continue; + auto input_op = input.value().getDefiningOp(); + if (maybe_replicate && + maybe_replicate.body().isAncestor(input_op->getParentRegion())) { + continue; + } + if (!IsSupportedHostInputOp(input_op)) continue; + auto ranked_type = input.value().getType().dyn_cast(); + if (!ranked_type) continue; + auto input_shape = ranked_type.getShape(); + HandleHostInput(input.value(), input.index(), cluster_func, block_size, + input_shape); + } + } +} + +// Check if input shape of convolution is good for space to depth transform. +bool Conv2DInputShapeCanTransform(Value input) { + auto ranked_type = input.getType().dyn_cast(); + if (!ranked_type) return false; + auto input_shape = ranked_type.getShape(); + int32_t batch_size = input_shape[0]; + int32_t channel = input_shape[3]; + if (batch_size > 8 || channel > 8) { + return false; + } + return true; +} + +// Checks if a convoluton can apply SpaceToDepth transform. 
+// Only the first convolution in the graph whose batch size smaller than 8 +// and its input feature size smaller than 8 can be transformed. +Optional> GetConv2DInputArgNum(TF::Conv2DOp conv2d) { + if (conv2d.data_format() != "NHWC" || conv2d.strides().size() != 4) { + return None; + } + auto conv2d_input = conv2d.input(); + if (auto block_arg = conv2d_input.dyn_cast()) { + if (!Conv2DInputShapeCanTransform(conv2d_input)) return None; + int num_users = + std::distance(block_arg.getUsers().begin(), block_arg.getUsers().end()); + return std::make_pair(block_arg.getArgNumber(), num_users); + } + + if (auto pad_op = llvm::dyn_cast(conv2d_input.getDefiningOp())) { + auto pad_input = pad_op.input(); + if (auto block_arg = pad_input.dyn_cast()) { + if (!Conv2DInputShapeCanTransform(pad_input)) return None; + int num_users = std::distance(block_arg.getUsers().begin(), + block_arg.getUsers().end()); + return std::make_pair(block_arg.getArgNumber(), num_users); + } + } + + return None; +} + +// Apply space to depth transform for the first convolution on TPU device. +void HandleFirstConvolution(TF::Conv2DOp conv2d, int64_t block_size) { + // Check if input and filter type are RankedTensorType. + auto input_tensor_type = + conv2d.input().getType().dyn_cast(); + auto filter_tensor_type = + conv2d.filter().getType().dyn_cast(); + if (!input_tensor_type || !filter_tensor_type) return; + // Book keeping filter shape for padding and backprop filter rewrite. + auto filter_shape = filter_tensor_type.getShape(); + SmallVector old_filter_shape(filter_shape.begin(), + filter_shape.end()); + // Handles input. + auto conv2d_input = conv2d.input(); + if (auto block_arg = conv2d_input.dyn_cast()) { + // Change on device function type/shape. + HandleFuncOp(block_arg.getOwner()->getParentOp()); + } + + if (auto pad_op = dyn_cast_or_null(conv2d_input.getDefiningOp())) { + // Rewrite pad_op before Convolutioin. + if (failed(HandlePad(pad_op, filter_shape[0], block_size))) return; + auto pad_input = pad_op.input(); + if (auto block_arg = pad_input.dyn_cast()) { + // Change on device function type/shape. + HandleFuncOp(block_arg.getOwner()->getParentOp()); + } + } + + // Handle Conv2D input, stride and filter. + HandleConv2DInput(conv2d, block_size); + HandleConv2DStride(conv2d); + HandleConv2DFilter(conv2d, block_size); + + // Book keeping new filter shape for backprop filter rewrite. + // Filter shape is defined in HandleConv2DFilter, thus it is RankedTensorType. + filter_shape = conv2d.filter().getType().cast().getShape(); + SmallVector new_filter_shape(filter_shape.begin(), + filter_shape.end()); + + // Rewrite Conv2DBackPropFilter after the first convolution. + for (Operation* user : conv2d.getOperation()->getUsers()) { + if (auto backprop = dyn_cast(user)) { + HandleConv2DBackPropFilter(backprop, old_filter_shape, new_filter_shape, + block_size); + } + } +} + +// Get block size that is equal to stride from spatial dimension +// from convolution. +// Space to depth transform won't be triggered if block size <= 1. +int32_t GetConv2DBlockSize(TF::Conv2DOp conv2d) { + SmallVector strides(4, 1); + for (int i = 0; i < 3; ++i) { + strides[i] = conv2d.strides()[i].cast().getInt(); + } + + // Space to depth only supports striding at spatial dimension. + if (strides[0] != 1 || strides[3] != 1) return 1; + + // Space to depth only supports height_stride == width_stride case. 
+ if (strides[1] != strides[2]) return 1; + + return strides[1]; +} + +void TPUSpaceToDepthPass::runOnOperation() { + Optional cluster_func; + // Space to depth only supports training loop. + auto func_result = getOperation().walk([&](tf_device::ClusterFuncOp cluster) { + cluster_func = cluster; + return WalkResult::interrupt(); + }); + + // Return if there is no tf_device::ClusterFuncOp in training loop. + if (!func_result.wasInterrupted() || !cluster_func.hasValue()) { + return; + } + + // Get the function on device. + auto device_func = + getOperation().lookupSymbol(cluster_func->getFunc()); + if (!device_func) return; + + TF::Conv2DOp first_conv; + Optional> input_shape; + // A map maps block argument id to the convolutions consumes them. + llvm::SmallDenseMap> + argnum_and_convolutions; + // A map maps block argument id to the number of users. + llvm::SmallDenseMap argnum_num_users; + + // Find out the qualified convolutions and its block argument ids. + auto conv2d_result = device_func.walk([&](TF::Conv2DOp conv2d) { + Optional> arg_num_and_num_users = + GetConv2DInputArgNum(conv2d); + if (arg_num_and_num_users.hasValue()) { + // Get block size for the first convolution. + int64_t block_size = GetConv2DBlockSize(conv2d); + auto arg_num = arg_num_and_num_users.getValue().first; + auto num_users = arg_num_and_num_users.getValue().second; + argnum_and_convolutions[arg_num].emplace_back(conv2d, block_size); + argnum_num_users[arg_num] = num_users; + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (!conv2d_result.wasInterrupted()) { + return; + } + + // Iterate through block argument and its convolution users. Space to depth + // transform will be applied only if all the below conditions are satisfied: + // 1. All the users of the block argument will lead to convolutions; + // 2. block_size of for the space to depth transform for these convolutions + // are the same; + // 3. block_size of for the space to depth transform for these convolutions + // are larger than 1. + for (auto argnum_and_convolution : argnum_and_convolutions) { + auto arg_num = argnum_and_convolution.getFirst(); + auto conv2d_and_block_sizes = argnum_and_convolution.getSecond(); + // Continue if number of users of the block argment doesn't equal to number + // of transformable convolutions and there is no qualified convolution + // for transform or block size is smaller than 2. + if (argnum_num_users[arg_num] != conv2d_and_block_sizes.size() || + conv2d_and_block_sizes.empty()) { + argnum_and_convolutions.erase(arg_num); + continue; + } + int64_t block_size = conv2d_and_block_sizes[0].second; + if (block_size < 2) { + argnum_and_convolutions.erase(arg_num); + continue; + } + // Continue if not all the block sizes for space to depth transform are the + // same. + for (auto conv2d_and_block_size : conv2d_and_block_sizes) { + if (conv2d_and_block_size.second != block_size) { + argnum_and_convolutions.erase(arg_num); + break; + } + } + } + + // If there is no qualified space to depth transform. + if (argnum_and_convolutions.empty()) { + return; + } + + // Apply space to depth transform. + for (auto argnum_and_convolution : argnum_and_convolutions) { + auto conv2d_and_block_sizes = argnum_and_convolution.getSecond(); + int64_t block_size = conv2d_and_block_sizes[0].second; + // Apply space to depth transform to the input on the host. + HandleCluster(cluster_func.getValue(), block_size, + argnum_and_convolution.getFirst()); + // Transform the convolution. 
+ for (auto conv2d_and_block_size : conv2d_and_block_sizes) { + HandleFirstConvolution(conv2d_and_block_size.first, + conv2d_and_block_size.second); + } + } +} + +} // namespace + +std::unique_ptr> CreateTPUSpaceToDepthPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-tpu-space-to-depth-pass", + "Adds ops that allow TPU program enable automaic space to depth for the" + "convolution determined at JIT compile time."); + +} // namespace TFTPU +} // namespace mlir From 70387ab55b1b40608b28a9bfd574496bdf5d49b7 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Mon, 1 Jun 2020 20:19:00 -0700 Subject: [PATCH 1517/1533] Reduce Dense.__call__ overhead by ~15% - Creates a layers/ops directory to contain functional implementations of Keras layer ops (this should make it easier to XLA compile these ops, as well as potentially being useful to expose stateless versions of the layer ops) - Moves Dense op to this directory and implements a functional version of Dense.call with reduced Python overhead - Uses this op in Dense layer - Cleans up input_dim and activity_regularizer handling in Dense and Layer - Adds a microbenchmark for Dense Also fixes two small issues: - compute_dtype_object was not always correctly set in Layer - activity_regularizer can now be passed as a str to any Layer class PiperOrigin-RevId: 314256375 Change-Id: I769cef6d67aa117f6cb75dc34c0594e748284af2 --- tensorflow/python/eager/benchmarks_test.py | 15 +++- tensorflow/python/keras/engine/base_layer.py | 20 ++--- .../python/keras/engine/base_layer_test.py | 8 ++ .../python/keras/engine/base_layer_v1.py | 20 ++--- tensorflow/python/keras/layers/BUILD | 1 + tensorflow/python/keras/layers/core.py | 79 ++++++++----------- tensorflow/python/keras/layers/ops/BUILD | 24 ++++++ .../python/keras/layers/ops/__init__.py | 19 +++++ tensorflow/python/keras/layers/ops/core.py | 69 ++++++++++++++++ 9 files changed, 189 insertions(+), 66 deletions(-) create mode 100644 tensorflow/python/keras/layers/ops/BUILD create mode 100644 tensorflow/python/keras/layers/ops/__init__.py create mode 100644 tensorflow/python/keras/layers/ops/core.py diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 5d57f1d9b93..0d62a32b1fe 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -53,6 +53,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.layers import core as core_layers from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops @@ -1404,7 +1405,7 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(fn, 10000) # TODO(b/157587712): Move to keras when benchmarks are setup. - def benchmark_tf_keras_layer_call(self): + def benchmark_tf_keras_layer_call_overhead(self): class OnlyOverheadLayer(base_layer.Layer): @@ -1419,6 +1420,18 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(fn, 10000) + # TODO(b/157587712): Move to keras when benchmarks are setup. + def benchmark_tf_keras_dense_overhead(self): + + layer = core_layers.Dense(1) + x = ops.convert_to_tensor([[1.]]) + layer(x) # Warmup call to `build` layer. 
+ + def fn(): + layer(x) + + self._run(fn, 10000) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index e817d56f619..c7d25f31d73 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -293,12 +293,8 @@ class Layer(module.Module, version_utils.LayerVersionSelector): # are only applicable to input layers: do not pass these keywords # to non-input layers. allowed_kwargs = { - 'input_shape', - 'batch_input_shape', - 'batch_size', - 'weights', - 'activity_regularizer', - 'autocast' + 'input_dim', 'input_shape', 'batch_input_shape', 'batch_size', + 'weights', 'activity_regularizer', 'autocast' } # Validate optional keyword arguments. generic_utils.validate_kwargs(kwargs, allowed_kwargs) @@ -323,7 +319,8 @@ class Layer(module.Module, version_utils.LayerVersionSelector): self.supports_masking = False self._init_set_name(name) - self._activity_regularizer = kwargs.pop('activity_regularizer', None) + self._activity_regularizer = regularizers.get( + kwargs.pop('activity_regularizer', None)) self._maybe_create_attribute('_trainable_weights', []) self._maybe_create_attribute('_non_trainable_weights', []) self._updates = [] @@ -370,6 +367,9 @@ class Layer(module.Module, version_utils.LayerVersionSelector): self._dynamic = dynamic # Manage input shape information if passed. + if 'input_dim' in kwargs and 'input_shape' not in kwargs: + # Backwards compatibility: alias 'input_dim' to 'input_shape'. + kwargs['input_shape'] = (kwargs['input_dim'],) if 'input_shape' in kwargs or 'batch_input_shape' in kwargs: # In this case we will later create an input layer # to insert before the current layer @@ -530,7 +530,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): dtype = dtypes.as_dtype(dtype) if self._dtype_policy.variable_dtype is None: # The policy is "_infer", so we infer the policy from the variable dtype. - self._dtype_policy = policy.Policy(dtype.base_dtype.name) + self._set_dtype_policy(policy.Policy(dtype.base_dtype.name)) initializer = initializers.get(initializer) regularizer = regularizers.get(regularizer) constraint = constraints.get(constraint) @@ -2202,7 +2202,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): @_dtype.setter def _dtype(self, value): value = dtypes.as_dtype(value).name - self._dtype_policy = policy.Policy(value) + self._set_dtype_policy(policy.Policy(value)) def _name_scope(self): if not tf2.enabled(): @@ -2436,7 +2436,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector): except AttributeError: pass else: - self._dtype_policy = policy.Policy(dtype) + self._set_dtype_policy(policy.Policy(dtype)) input_shapes = None # Converts Tensors / CompositeTensors to TensorShapes. 
if all(hasattr(x, 'shape') for x in input_list): diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py index ca138d79020..13fb2b28bb7 100644 --- a/tensorflow/python/keras/engine/base_layer_test.py +++ b/tensorflow/python/keras/engine/base_layer_test.py @@ -616,6 +616,14 @@ class BaseLayerTest(keras_parameterized.TestCase): self.assertTrue(layer.built) self.assertEqual([None, 3], layer._build_input_shape.as_list()) + def test_activity_regularizer_string(self): + + class MyLayer(base_layer.Layer): + pass + + layer = MyLayer(activity_regularizer='l2') + self.assertIsInstance(layer.activity_regularizer, regularizers.L2) + class SymbolicSupportTest(keras_parameterized.TestCase): diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py index 78140985b4a..725334f8535 100644 --- a/tensorflow/python/keras/engine/base_layer_v1.py +++ b/tensorflow/python/keras/engine/base_layer_v1.py @@ -158,12 +158,8 @@ class Layer(base_layer.Layer): # are only applicable to input layers: do not pass these keywords # to non-input layers. allowed_kwargs = { - 'input_shape', - 'batch_input_shape', - 'batch_size', - 'weights', - 'activity_regularizer', - 'autocast' + 'input_dim', 'input_shape', 'batch_input_shape', 'batch_size', + 'weights', 'activity_regularizer', 'autocast' } # Validate optional keyword arguments. generic_utils.validate_kwargs(kwargs, allowed_kwargs) @@ -184,7 +180,8 @@ class Layer(base_layer.Layer): self.supports_masking = False self._init_set_name(name) - self._activity_regularizer = kwargs.pop('activity_regularizer', None) + self._activity_regularizer = regularizers.get( + kwargs.pop('activity_regularizer', None)) self._maybe_create_attribute('_trainable_weights', []) self._maybe_create_attribute('_non_trainable_weights', []) self._updates = [] @@ -229,6 +226,9 @@ class Layer(base_layer.Layer): self._dynamic = dynamic # Manage input shape information if passed. + if 'input_dim' in kwargs and 'input_shape' not in kwargs: + # Backwards compatibility: alias 'input_dim' to 'input_shape'. + kwargs['input_shape'] = (kwargs['input_dim'],) if 'input_shape' in kwargs or 'batch_input_shape' in kwargs: # In this case we will later create an input layer # to insert before the current layer @@ -378,7 +378,7 @@ class Layer(base_layer.Layer): dtype = dtypes.as_dtype(dtype) if self._dtype_policy.variable_dtype is None: # The policy is "_infer", so we infer the policy from the variable dtype. 
- self._dtype_policy = policy.Policy(dtype.base_dtype.name) + self._set_dtype_policy(policy.Policy(dtype.base_dtype.name)) initializer = initializers.get(initializer) regularizer = regularizers.get(regularizer) constraint = constraints.get(constraint) @@ -1835,7 +1835,7 @@ class Layer(base_layer.Layer): @_dtype.setter def _dtype(self, value): value = dtypes.as_dtype(value).name - self._dtype_policy = policy.Policy(value) + self._set_dtype_policy(policy.Policy(value)) def _name_scope(self): return self.name @@ -2068,7 +2068,7 @@ class Layer(base_layer.Layer): except AttributeError: pass else: - self._dtype_policy = policy.Policy(dtype) + self._set_dtype_policy(policy.Policy(dtype)) input_shapes = None if all(hasattr(x, 'shape') for x in input_list): input_shapes = nest.map_structure(lambda x: x.shape, inputs) diff --git a/tensorflow/python/keras/layers/BUILD b/tensorflow/python/keras/layers/BUILD index 10a9fe088ab..0b664d01b6a 100644 --- a/tensorflow/python/keras/layers/BUILD +++ b/tensorflow/python/keras/layers/BUILD @@ -141,6 +141,7 @@ py_library( "//tensorflow/python/keras:initializers", "//tensorflow/python/keras:regularizers", "//tensorflow/python/keras/engine:input_spec", + "//tensorflow/python/keras/layers/ops:core", "//tensorflow/python/keras/utils:engine_utils", "//tensorflow/python/keras/utils:generic_utils", "//tensorflow/python/keras/utils:tf_utils", diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 60834fad30b..56512d0d754 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -37,18 +37,15 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras import constraints from tensorflow.python.keras import initializers from tensorflow.python.keras import regularizers -from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_spec import InputSpec +from tensorflow.python.keras.layers.ops import core as core_ops from tensorflow.python.keras.utils import conv_utils from tensorflow.python.keras.utils import generic_utils from tensorflow.python.keras.utils import tf_utils from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn -from tensorflow.python.ops import sparse_ops -from tensorflow.python.ops import standard_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging from tensorflow.python.training.tracking import base as trackable @@ -1132,11 +1129,8 @@ class Dense(Layer): kernel_constraint=None, bias_constraint=None, **kwargs): - if 'input_shape' not in kwargs and 'input_dim' in kwargs: - kwargs['input_shape'] = (kwargs.pop('input_dim'),) - super(Dense, self).__init__( - activity_regularizer=regularizers.get(activity_regularizer), **kwargs) + activity_regularizer=activity_regularizer, **kwargs) self.units = int(units) if not isinstance(units, int) else units self.activation = activations.get(activation) @@ -1148,19 +1142,20 @@ class Dense(Layer): self.kernel_constraint = constraints.get(kernel_constraint) self.bias_constraint = constraints.get(bias_constraint) - self.supports_masking = True self.input_spec = InputSpec(min_ndim=2) + self.supports_masking = True def build(self, input_shape): dtype = dtypes.as_dtype(self.dtype or K.floatx()) if not (dtype.is_floating or dtype.is_complex): raise 
TypeError('Unable to build `Dense` layer with non-floating point ' 'dtype %s' % (dtype,)) + input_shape = tensor_shape.TensorShape(input_shape) - if tensor_shape.dimension_value(input_shape[-1]) is None: + last_dim = tensor_shape.dimension_value(input_shape[-1]) + if last_dim is None: raise ValueError('The last dimension of the inputs to `Dense` ' 'should be defined. Found `None`.') - last_dim = tensor_shape.dimension_value(input_shape[-1]) self.input_spec = InputSpec(min_ndim=2, axes={-1: last_dim}) self.kernel = self.add_weight( 'kernel', @@ -1184,27 +1179,12 @@ class Dense(Layer): self.built = True def call(self, inputs): - base_layer_utils.no_ragged_support(inputs, self.name) - rank = inputs.shape.rank - if rank is not None and rank > 2: - # Broadcasting is required for the inputs. - outputs = standard_ops.tensordot(inputs, self.kernel, [[rank - 1], [0]]) - # Reshape the output back to the original ndim of the input. - if not context.executing_eagerly(): - shape = inputs.shape.as_list() - output_shape = shape[:-1] + [self.units] - outputs.set_shape(output_shape) - else: - inputs = math_ops.cast(inputs, self._compute_dtype) - if K.is_sparse(inputs): - outputs = sparse_ops.sparse_tensor_dense_matmul(inputs, self.kernel) - else: - outputs = gen_math_ops.mat_mul(inputs, self.kernel) - if self.use_bias: - outputs = nn.bias_add(outputs, self.bias) - if self.activation is not None: - return self.activation(outputs) # pylint: disable=not-callable - return outputs + return core_ops.dense( + inputs, + self.kernel, + self.bias, + self.activation, + dtype=self._compute_dtype_object) def compute_output_shape(self, input_shape): input_shape = tensor_shape.TensorShape(input_shape) @@ -1216,21 +1196,30 @@ class Dense(Layer): return input_shape[:-1].concatenate(self.units) def get_config(self): - config = { - 'units': self.units, - 'activation': activations.serialize(self.activation), - 'use_bias': self.use_bias, - 'kernel_initializer': initializers.serialize(self.kernel_initializer), - 'bias_initializer': initializers.serialize(self.bias_initializer), - 'kernel_regularizer': regularizers.serialize(self.kernel_regularizer), - 'bias_regularizer': regularizers.serialize(self.bias_regularizer), + config = super(Dense, self).get_config() + config.update({ + 'units': + self.units, + 'activation': + activations.serialize(self.activation), + 'use_bias': + self.use_bias, + 'kernel_initializer': + initializers.serialize(self.kernel_initializer), + 'bias_initializer': + initializers.serialize(self.bias_initializer), + 'kernel_regularizer': + regularizers.serialize(self.kernel_regularizer), + 'bias_regularizer': + regularizers.serialize(self.bias_regularizer), 'activity_regularizer': regularizers.serialize(self.activity_regularizer), - 'kernel_constraint': constraints.serialize(self.kernel_constraint), - 'bias_constraint': constraints.serialize(self.bias_constraint) - } - base_config = super(Dense, self).get_config() - return dict(list(base_config.items()) + list(config.items())) + 'kernel_constraint': + constraints.serialize(self.kernel_constraint), + 'bias_constraint': + constraints.serialize(self.bias_constraint) + }) + return config @keras_export('keras.layers.ActivityRegularization') diff --git a/tensorflow/python/keras/layers/ops/BUILD b/tensorflow/python/keras/layers/ops/BUILD new file mode 100644 index 00000000000..09973c54790 --- /dev/null +++ b/tensorflow/python/keras/layers/ops/BUILD @@ -0,0 +1,24 @@ +# Description: +# Contains stateless ops for Keras layers. 
+ +package( + default_visibility = [ + "//tensorflow/python/keras/layers:__pkg__", + ], + licenses = ["notice"], # Apache 2.0 +) + +exports_files(["LICENSE"]) + +py_library( + name = "core", + srcs = ["core.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:framework_ops", + "//tensorflow/python:sparse_ops", + "//tensorflow/python:standard_ops", + "//tensorflow/python:variable_scope", + "//tensorflow/python/eager:context", + ], +) diff --git a/tensorflow/python/keras/layers/ops/__init__.py b/tensorflow/python/keras/layers/ops/__init__.py new file mode 100644 index 00000000000..27d099a4898 --- /dev/null +++ b/tensorflow/python/keras/layers/ops/__init__.py @@ -0,0 +1,19 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Stateless ops for Keras layers.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/tensorflow/python/keras/layers/ops/core.py b/tensorflow/python/keras/layers/ops/core.py new file mode 100644 index 00000000000..1a30472cba3 --- /dev/null +++ b/tensorflow/python/keras/layers/ops/core.py @@ -0,0 +1,69 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Stateless ops for core Keras layers.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.eager import context +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import gen_math_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops import standard_ops + + +# TODO(b/157913406): Expose this publicly. +def dense(inputs, kernel, bias=None, activation=None, dtype=None): + """Densely connected NN layer op. + + Arguments: + inputs: `tf.Tensor` or `tf.SparseTensor`. Inputs to operation. + kernel: `tf.Variable`. Matrix kernel. + bias: (Optional) `tf.Variable`. Bias to add to outputs. + activation: (Optional) 1-argument callable. Activation function to apply to + outputs. + dtype: (Optional) `tf.DType`. Dtype to cast `inputs` to. + + Returns: + `tf.Tensor`. Output of dense connection. 
+ """ + if dtype: + if inputs.dtype.base_dtype != dtype.base_dtype: + inputs = math_ops.cast(inputs, dtype=dtype) + + rank = inputs.shape.rank + if rank == 2 or rank is None: + if isinstance(inputs, sparse_tensor.SparseTensor): + outputs = sparse_ops.sparse_tensor_dense_matmul(inputs, kernel) + else: + outputs = gen_math_ops.mat_mul(inputs, kernel) + # Broadcast kernel to inputs. + else: + outputs = standard_ops.tensordot(inputs, kernel, [[rank - 1], [0]]) + # Reshape the output back to the original ndim of the input. + if not context.executing_eagerly(): + shape = inputs.shape.as_list() + output_shape = shape[:-1] + [kernel.shape[-1]] + outputs.set_shape(output_shape) + + if bias is not None: + outputs = nn_ops.bias_add(outputs, bias) + + if activation is not None: + outputs = activation(outputs) + + return outputs From a3f393bc95784320fd21d0871116d33dcb72982a Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 1 Jun 2020 20:34:49 -0700 Subject: [PATCH 1518/1533] Add implementation for snapshot dataset v2. PiperOrigin-RevId: 314258033 Change-Id: I6151fdc646a297090de6eeeb3254a556ae9d13bc --- .../base_api/api_def_SnapshotDatasetV2.pbtxt | 41 + .../core/kernels/data/experimental/BUILD | 7 + .../data/experimental/snapshot_dataset_op.cc | 873 +++++++++++++++++- .../data/experimental/snapshot_dataset_op.h | 83 ++ .../core/ops/experimental_dataset_ops.cc | 20 + .../python/data/experimental/__init__.py | 2 + .../data/experimental/kernel_tests/BUILD | 3 + .../kernel_tests/serialization/BUILD | 2 + .../snapshot_dataset_serialization_test.py | 91 ++ .../kernel_tests/snapshot_test.py | 291 +++++- .../python/data/experimental/ops/snapshot.py | 170 +++- .../v1/tensorflow.data.experimental.pbtxt | 4 + .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 + .../v2/tensorflow.data.experimental.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 + 15 files changed, 1592 insertions(+), 7 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_SnapshotDatasetV2.pbtxt create mode 100644 tensorflow/core/kernels/data/experimental/snapshot_dataset_op.h diff --git a/tensorflow/core/api_def/base_api/api_def_SnapshotDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_SnapshotDatasetV2.pbtxt new file mode 100644 index 00000000000..0545b343ac8 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_SnapshotDatasetV2.pbtxt @@ -0,0 +1,41 @@ +op { + graph_op_name: "SnapshotDatasetV2" + visibility: HIDDEN + in_arg { + name: "input_dataset" + description: < #include "absl/time/clock.h" @@ -60,6 +62,875 @@ limitations under the License. namespace tensorflow { namespace data { namespace experimental { + +// ==== Snapshot Implementation ==== + +namespace { + +/* The current snapshot on-disk layout is as follows: + * /user/specified/path/ + * - graphhash1/ + * - snapshot.metadata // metadata file + * - run1/ + * - 00000000.shard/ // shard index + * // new checkpoint files are created on all threads at once, either + * // when a file gets too big, or when a TF checkpoint happens. + * - 00000000.snapshot // checkpoint file 0 + * - 00000001.snapshot // checkpoint file 1 + * - ... + * - 00000001.shard/ + * - 00000000.snapshot + * - 00000001.snapshot + * - ... + * - 00000002.shard/ + * - 00000000.snapshot + * - 00000001.snapshot + * - ... + * ... + * - run2/ + * ... + * - graphhash2/ + * ... + * - graphhash3/ + * ... 
+ */ + +constexpr const char* const kShardDirectorySuffix = ".shard"; + +inline tstring HashDirectory(const tstring& path, const uint64 hash) { + return io::JoinPath(path, absl::StrFormat("%d", hash)); +} + +inline tstring RunDirectory(const tstring& hash_directory, + const uint64 run_id) { + return io::JoinPath(hash_directory, absl::StrFormat("%d", run_id)); +} + +inline tstring SnapshotShardDirectory(const tstring& run_directory, + const int64 snapshot_index) { + return io::JoinPath(run_directory, absl::StrFormat("%08d%s", snapshot_index, + kShardDirectorySuffix)); +} + +} // namespace + +class SnapshotDatasetV2Op::Dataset : public DatasetBase { + public: + Dataset(OpKernelContext* ctx, const DatasetBase* input, uint64 hash, + const std::string& path, const std::string& compression, + std::unique_ptr reader_func, + std::unique_ptr shard_func); + + ~Dataset() override; + + std::unique_ptr MakeIteratorInternal( + const string& prefix) const override; + + const DataTypeVector& output_dtypes() const override; + + const std::vector& output_shapes() const override; + + string DebugString() const override; + + int64 Cardinality() const override; + + Status CheckExternalState() const override; + + protected: + Status AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override; + + private: + const DatasetBase* input_; + const uint64 hash_; + const tstring path_; + const std::string compression_; + + std::unique_ptr reader_func_; + std::unique_ptr shard_func_; + + class Iterator; +}; + +class SnapshotDatasetV2Op::Dataset::Iterator : public DatasetIterator { + public: + static constexpr const char* const kIteratorMode = "iterator_mode"; + static constexpr const char* const kIndex = "index"; + static constexpr const char* const kGraphHashDirectory = + "graph_hash_directory"; + + explicit Iterator(const Params& params); + + Status Initialize(IteratorContext* ctx) override; + + Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) override; + + protected: + Status SaveInternal(SerializationContext* ctx, + IteratorStateWriter* writer) override; + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override; + + private: + Status InitializeIterator(IteratorContext* ctx, IteratorStateReader* reader); + + int64 index_ TF_GUARDED_BY(mu_); + std::unique_ptr iterator_ TF_GUARDED_BY(mu_); + snapshot_util::Mode mode_ TF_GUARDED_BY(mu_); + const std::string hash_dir_; + + mutex mu_; + + class Reader; + class Writer; + class Passthrough; +}; + +class SnapshotDatasetV2Op::Dataset::Iterator::Reader + : public DatasetIterator { + public: + static constexpr const char* const kIteratorName = "Reader"; + + explicit Reader(const Params& params, int64 start_index); + + ~Reader() override; + + Status Initialize(IteratorContext* ctx) override; + + Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) override; + + protected: + Status SaveInternal(SerializationContext* ctx, + IteratorStateWriter* writer) override; + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override; + + private: + const int64 start_index_; + + mutex mu_; + + std::unique_ptr input_impl_ TF_GUARDED_BY(mu_); + + DatasetBase* input_ TF_GUARDED_BY(mu_); + + std::unique_ptr instantiated_reader_func_ + TF_GUARDED_BY(mu_); +}; + +class SnapshotDatasetV2Op::Dataset::Iterator::Writer + : public DatasetIterator { + public: + static constexpr const char* const kIteratorName = 
"Writer"; + static constexpr const char* const kRunId = "run_id"; + static constexpr const char* const kCurrentCheckpointId = + "current_checkpoint_id"; + + explicit Writer(const Params& params); + + ~Writer() override; + + Status Initialize(IteratorContext* ctx) override; + + Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) override; + + protected: + Status SaveInternal(SerializationContext* ctx, + IteratorStateWriter* writer) override; + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override; + + private: + struct BufferElement { + std::vector value; + bool end_of_sequence = false; + }; + + class WriterThread { + public: + explicit WriterThread(Writer* writer_iterator, Env* env, int64 file_index, + const tstring& shard_directory, + uint64 current_checkpoint_id) { + thread_ = absl::WrapUnique(env->StartThread( + ThreadOptions(), absl::StrCat("snapshot_writer_thread_", file_index), + [this, writer_iterator, env, shard_directory, current_checkpoint_id] { + RunWriterThread(writer_iterator, env, shard_directory, + current_checkpoint_id); + })); + } + + void EnqueueTensors(const std::vector& tensors) + TF_LOCKS_EXCLUDED(deque_mu_) { + // Copy the Tensor to the deque for writing. + mutex_lock l(deque_mu_); + BufferElement be; + be.value = tensors; + deque_.push_back(std::move(be)); + } + + void DequeueTensors(BufferElement* be) TF_LOCKS_EXCLUDED(deque_mu_) { + mutex_lock l(deque_mu_); + deque_mu_.Await( + tensorflow::Condition(this, &WriterThread::DequeIsNotEmpty)); + *be = deque_.front(); + deque_.pop_front(); + } + + void StopThread() TF_LOCKS_EXCLUDED(deque_mu_) { + mutex_lock l(deque_mu_); + BufferElement be; + be.end_of_sequence = true; + deque_.push_back(std::move(be)); + } + + void RunWriterThread(Writer* writer_iterator, Env* env, + const tstring& shard_directory, + uint64 current_checkpoint_id) { + Status s = WriterThreadFn(writer_iterator, env, shard_directory, + current_checkpoint_id); + if (!s.ok()) { + mutex_lock l(writer_iterator->mu_); + writer_iterator->writer_status_ = s; + } + } + + private: + bool DequeIsNotEmpty() TF_EXCLUSIVE_LOCKS_REQUIRED(deque_mu_) { + return !deque_.empty(); + } + + Status WriterThreadFn(Writer* writer_iterator, Env* env, + const tstring& shard_directory, + uint64 current_checkpoint_id) { + std::unique_ptr writer; + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(shard_directory)); + + TF_RETURN_IF_ERROR(snapshot_util::Writer::Create( + env, + snapshot_util::GetCurrentCheckpointFile(shard_directory, + current_checkpoint_id), + writer_iterator->dataset()->compression_, kSnapshotFileFormatVersion, + writer_iterator->dataset()->output_dtypes(), &writer)); + + while (true) { + BufferElement be; + DequeueTensors(&be); + + if (be.end_of_sequence) { + TF_RETURN_IF_ERROR(writer->Close()); + break; + } + + TF_RETURN_IF_ERROR(writer->WriteTensors(be.value)); + } + return Status::OK(); + } + + // If both the writer `mu_` and this `deque_mu_` needs to be acquired, the + // writer `mu_` must be acquired first. + mutex deque_mu_; + + std::deque deque_ TF_GUARDED_BY(deque_mu_); + + // This has to be last. During destruction, we need to make sure that + // thread_ is destroyed first as the thread destructor blocks on thread + // completion. If there are other member variables after this, they may get + // destroyed first before the thread finishes, potentially causing the + // thread to access invalid memory. 
+ std::unique_ptr thread_; + }; + + Status GetShardIndex(std::vector* tensors, int64* shard_index) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + Status WriteMetadataFile(Env* env, bool finalized) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + void StopWriterThreads(bool mark_closed) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + + mutex mu_; + std::unique_ptr input_impl_ TF_GUARDED_BY(mu_); + + absl::flat_hash_map> writer_threads_ + TF_GUARDED_BY(mu_); + Status writer_status_ TF_GUARDED_BY(mu_); + bool writers_closed_ TF_GUARDED_BY(mu_); + + uint64 run_id_ TF_GUARDED_BY(mu_); + tstring run_dir_ TF_GUARDED_BY(mu_); + + // Stores the ID of the current checkpoint .snapshot file being read. See top + // of this file for the directory layout. + uint64 current_checkpoint_id_ TF_GUARDED_BY(mu_); + + std::unique_ptr instantiated_shard_func_ + TF_GUARDED_BY(mu_); +}; + +class SnapshotDatasetV2Op::Dataset::Iterator::Passthrough + : public DatasetIterator { + public: + static constexpr const char* const kIteratorName = "Passthrough"; + + explicit Passthrough(const Params& params); + + Status Initialize(IteratorContext* ctx) override; + + Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) override; + + protected: + Status SaveInternal(SerializationContext* ctx, + IteratorStateWriter* writer) override; + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override; + + private: + std::unique_ptr input_impl_; +}; + +SnapshotDatasetV2Op::Dataset::Dataset( + OpKernelContext* ctx, const DatasetBase* input, uint64 hash, + const std::string& path, const std::string& compression, + std::unique_ptr reader_func, + std::unique_ptr shard_func) + : DatasetBase(DatasetContext(ctx)), + input_(input), + hash_(hash), + path_(path), + compression_(compression), + reader_func_(std::move(reader_func)), + shard_func_(std::move(shard_func)) { + input_->Ref(); +} + +SnapshotDatasetV2Op::Dataset::~Dataset() { input_->Unref(); } + +std::unique_ptr +SnapshotDatasetV2Op::Dataset::MakeIteratorInternal(const string& prefix) const { + return absl::make_unique( + Iterator::Params{this, absl::StrCat(prefix, "::Snapshot")}); +} + +const DataTypeVector& SnapshotDatasetV2Op::Dataset::output_dtypes() const { + return input_->output_dtypes(); +} + +const std::vector& +SnapshotDatasetV2Op::Dataset::output_shapes() const { + return input_->output_shapes(); +} + +string SnapshotDatasetV2Op::Dataset::DebugString() const { + return name_utils::DatasetDebugString(kDatasetType); +} + +int64 SnapshotDatasetV2Op::Dataset::Cardinality() const { + return input_->Cardinality(); +} + +Status SnapshotDatasetV2Op::Dataset::CheckExternalState() const { + return input_->CheckExternalState(); +} + +Status SnapshotDatasetV2Op::Dataset::AsGraphDefInternal( + SerializationContext* ctx, DatasetGraphDefBuilder* b, Node** output) const { + Node* input_graph_node = nullptr; + TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); + + Node* path = nullptr; + TF_RETURN_IF_ERROR(b->AddScalar(path_, &path)); + + std::vector reader_func_other_args; + DataTypeVector reader_func_other_args_types; + TF_RETURN_IF_ERROR(reader_func_->AddToGraph(ctx, b, &reader_func_other_args, + &reader_func_other_args_types)); + + std::vector shard_func_other_args; + DataTypeVector shard_func_other_args_types; + TF_RETURN_IF_ERROR(shard_func_->AddToGraph(ctx, b, &shard_func_other_args, + &shard_func_other_args_types)); + + AttrValue compression_attr; + b->BuildAttrValue(compression_, &compression_attr); + + AttrValue 
reader_func_attr; + b->BuildAttrValue(reader_func_->func(), &reader_func_attr); + + AttrValue shard_func_attr; + b->BuildAttrValue(shard_func_->func(), &shard_func_attr); + + AttrValue reader_func_arguments_types_attr; + b->BuildAttrValue(reader_func_other_args_types, + &reader_func_arguments_types_attr); + + AttrValue shard_func_arguments_types_attr; + b->BuildAttrValue(shard_func_other_args_types, + &shard_func_arguments_types_attr); + + return b->AddDataset( + this, + /*inputs=*/ + {std::make_pair(0, input_graph_node), std::make_pair(1, path)}, + /*list_inputs=*/ + {std::make_pair(2, reader_func_other_args), + std::make_pair(3, shard_func_other_args)}, + /*attrs=*/ + {{kCompression, compression_attr}, + {kReaderFunc, reader_func_attr}, + {kShardFunc, shard_func_attr}, + {kReaderFuncTarguments, reader_func_arguments_types_attr}, + {kShardFuncTarguments, shard_func_arguments_types_attr}}, + output); +} + +SnapshotDatasetV2Op::Dataset::Iterator::Iterator(const Params& params) + : DatasetIterator(params), + index_(0), + hash_dir_(HashDirectory(dataset()->path_, dataset()->hash_)) {} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Initialize( + IteratorContext* ctx) { + return ctx->env()->RecursivelyCreateDir(hash_dir_); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::SaveInternal( + SerializationContext* ctx, IteratorStateWriter* writer) { + mutex_lock l(mu_); + if (iterator_ != nullptr) { + TF_RETURN_IF_ERROR(SaveInput(ctx, writer, iterator_)); + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name(kIteratorMode), + static_cast(mode_))); + TF_RETURN_IF_ERROR(writer->WriteScalar(full_name(kIndex), index_)); + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name(kGraphHashDirectory), hash_dir_)); + } + + return Status::OK(); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::RestoreInternal( + IteratorContext* ctx, IteratorStateReader* reader) { + mutex_lock l(mu_); + + if (reader->Contains(full_name(kIteratorMode))) { + TF_RETURN_IF_ERROR(InitializeIterator(ctx, reader)); + return RestoreInput(ctx, reader, iterator_); + } + + return Status::OK(); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::GetNextInternal( + IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) { + mutex_lock l(mu_); + if (iterator_ == nullptr) { + TF_RETURN_IF_ERROR(InitializeIterator(ctx, nullptr)); + } + // TODO(b/154341936): Explicitly stopping and starting this iterator + // should not be necessary, but the additional + // `{Reader,Writer,Passthrough}::kIteratorName` added to the prefix passed to + // `iterator_` when it was created prevents the model from identifying this + // iterator as the output of `iterator_`. + RecordStop(ctx); + Status s = iterator_->GetNext(ctx, out_tensors, end_of_sequence); + index_++; + RecordStart(ctx); + return s; +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::InitializeIterator( + IteratorContext* ctx, IteratorStateReader* reader) + TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (reader != nullptr) { + // Check whether the computed hash directory is the same. + tstring hash_dir; + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name(kGraphHashDirectory), &hash_dir)); + if (hash_dir != hash_dir_) { + return errors::DataLoss( + "Dataset has changed while restoring from the checkpoint. 
Old hash " + "directory: ", + hash_dir, "; new hash directory: ", hash_dir_); + } + + experimental::SnapshotMetadataRecord metadata; + bool file_exists; + TF_RETURN_IF_ERROR( + snapshot_util::ReadMetadataFile(hash_dir_, &metadata, &file_exists)); + if (!file_exists) { + return errors::DataLoss("Snapshot metadata file in ", hash_dir_, + " does not exist any more."); + } + + int64 iterator_mode; + TF_RETURN_IF_ERROR( + reader->ReadScalar(full_name(kIteratorMode), &iterator_mode)); + mode_ = snapshot_util::Mode(iterator_mode); + + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name(kIndex), &index_)); + } else { + experimental::SnapshotMetadataRecord metadata; + bool file_exists; + TF_RETURN_IF_ERROR( + snapshot_util::ReadMetadataFile(hash_dir_, &metadata, &file_exists)); + + // `pending_snapshot_expiry_seconds` is a legacy option where we would not + // write snapshots that we think were still on-going. We decided that this + // would not be necessary as a feature for SnapshotV2, and we would always + // write a new snapshot regardless of whether someone else is currently + // writing one. Setting this to 0 ensures that all previous snapshots + // will be ignored and we will proceed to writing. + TF_RETURN_IF_ERROR(snapshot_util::DetermineOpState( + /*mode_string=*/"", file_exists, &metadata, + /*pending_snapshot_expiry_seconds=*/0, &mode_)); + } + + switch (mode_) { + case snapshot_util::READER: + iterator_ = absl::make_unique( + Reader::Params{dataset(), + absl::StrCat(prefix(), Reader::kIteratorName)}, + index_); + break; + case snapshot_util::WRITER: + iterator_ = absl::make_unique(Writer::Params{ + dataset(), absl::StrCat(prefix(), Writer::kIteratorName)}); + break; + case snapshot_util::PASSTHROUGH: + iterator_ = absl::make_unique(Passthrough::Params{ + dataset(), absl::StrCat(prefix(), Passthrough::kIteratorName)}); + break; + } + return iterator_->Initialize(ctx); +} + +SnapshotDatasetV2Op::Dataset::Iterator::Reader::Reader(const Params& params, + int64 start_index) + : DatasetIterator(params), start_index_(start_index) {} + +SnapshotDatasetV2Op::Dataset::Iterator::Reader::~Reader() { input_->Unref(); } + +Status SnapshotDatasetV2Op::Dataset::Iterator::Reader::Initialize( + IteratorContext* ctx) { + mutex_lock l(mu_); + + TF_RETURN_IF_ERROR( + dataset()->reader_func_->Instantiate(ctx, &instantiated_reader_func_)); + + tstring hash_dir = HashDirectory(dataset()->path_, dataset()->hash_); + bool metadata_file_exists; + experimental::SnapshotMetadataRecord metadata; + TF_RETURN_IF_ERROR(snapshot_util::ReadMetadataFile(hash_dir, &metadata, + &metadata_file_exists)); + + auto run_dir = io::JoinPath(hash_dir, metadata.run_id()); + + std::vector snapshot_shard_dirs; + TF_RETURN_IF_ERROR(ctx->env()->GetMatchingPaths( + io::JoinPath(run_dir, + absl::StrFormat("%s%s", "*", kShardDirectorySuffix)), + &snapshot_shard_dirs)); + std::sort(snapshot_shard_dirs.begin(), snapshot_shard_dirs.end()); + + DatasetBase* dataset_of_snapshot_files; + TF_RETURN_IF_ERROR(snapshot_util::Reader::MakeNestedDataset( + ctx->env(), snapshot_shard_dirs, dataset()->compression_, + kSnapshotFileFormatVersion, dataset()->output_dtypes(), + dataset()->output_shapes(), start_index_, &dataset_of_snapshot_files)); + + Tensor input_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_RETURN_IF_ERROR(StoreDatasetInVariantTensor(dataset_of_snapshot_files, + &input_dataset_tensor)); + + std::vector reader_input; + std::vector reader_output; + reader_input.push_back(std::move(input_dataset_tensor)); + + 
TF_RETURN_IF_ERROR(instantiated_reader_func_->Run( + ctx, std::move(reader_input), &reader_output)); + if (reader_output.size() != 1) { + return errors::InvalidArgument( + "reader_func returns more than one argument."); + } + TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(reader_output[0], &input_)); + + // We need to take a reference here as we will use the input_ and + // its iterator. + input_->Ref(); + + return input_->MakeIterator(ctx, this, prefix(), &input_impl_); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Reader::GetNextInternal( + IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) { + mutex_lock l(mu_); + return input_impl_->GetNext(ctx, out_tensors, end_of_sequence); +} + +// We do not need to checkpoint the reader as we are rebuilding the reader +// datasets from information that is already saved by the main iterator. +Status SnapshotDatasetV2Op::Dataset::Iterator::Reader::SaveInternal( + SerializationContext* ctx, IteratorStateWriter* writer) { + return Status::OK(); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Reader::RestoreInternal( + IteratorContext* ctx, IteratorStateReader* reader) { + return Status::OK(); +} + +SnapshotDatasetV2Op::Dataset::Iterator::Writer::Writer(const Params& params) + : DatasetIterator(params), + writers_closed_(false), + run_id_(0), + current_checkpoint_id_(0) {} + +SnapshotDatasetV2Op::Dataset::Iterator::Writer::~Writer() { + mutex_lock l(mu_); + StopWriterThreads(true); +} + +void SnapshotDatasetV2Op::Dataset::Iterator::Writer::StopWriterThreads( + bool mark_closed) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (!writers_closed_) { + // Push the end of sequence signal to each of the threads to close files. + for (auto& writer_thread : writer_threads_) { + writer_thread.second->StopThread(); + } + + writer_threads_.clear(); + writers_closed_ = mark_closed; + } +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Writer::WriteMetadataFile( + Env* env, bool finalized) { + DCHECK(!run_dir_.empty()); + + experimental::SnapshotMetadataRecord metadata; + metadata.set_creation_timestamp(EnvTime::NowMicros()); + metadata.set_graph_hash(absl::StrFormat("%d", dataset()->hash_)); + metadata.set_run_id(absl::StrFormat("%d", run_id_)); + metadata.set_version(kSnapshotFileFormatVersion); + for (const auto& output_dtype : dataset()->output_dtypes()) { + metadata.add_dtype(output_dtype); + } + metadata.set_finalized(finalized); + tstring hash_directory = HashDirectory(dataset()->path_, dataset()->hash_); + + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(hash_directory)); + return snapshot_util::WriteMetadataFile(hash_directory, &metadata); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Writer::Initialize( + IteratorContext* ctx) { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR( + dataset()->shard_func_->Instantiate(ctx, &instantiated_shard_func_)); + + return dataset()->input_->MakeIterator( + ctx, this, strings::StrCat(prefix(), "::WriterIterator"), &input_impl_); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Writer::GetShardIndex( + std::vector* tensors, int64* shard_index) { + std::vector output_tensors; + + // Run the shard function + TF_RETURN_IF_ERROR( + instantiated_shard_func_->RunInstantiated(*tensors, &output_tensors)); + + if (output_tensors.size() != 1 || output_tensors[0].dtype() != DT_INT64 || + output_tensors[0].NumElements() != 1) { + return errors::InvalidArgument("`shard_func` must return a scalar int64."); + } + + // Create writable files if we see an index bigger than our current files. 
+ *shard_index = output_tensors[0].flat()(0); + return Status::OK(); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Writer::GetNextInternal( + IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) { + *end_of_sequence = false; + WriterThread* current_writer_thread; + + { + std::vector output_tensors; + mutex_lock l(mu_); + + // We initialize late here because restoring from checkpoint comes after the + // the Initialize call. We cannot initialize within Initialize() because + // we cannot determine whether we should overwrite an existing metadata + // file or not before `RestoreInternal` is potentially called. + if (run_dir_.empty()) { + run_id_ = random::New64(); + + // Creates the run directory. + run_dir_ = RunDirectory(HashDirectory(dataset()->path_, dataset()->hash_), + run_id_); + TF_RETURN_IF_ERROR(ctx->env()->RecursivelyCreateDir(run_dir_)); + TF_RETURN_IF_ERROR(WriteMetadataFile(ctx->env(), /*finalized=*/false)); + } + + // Writers have either encountered an error or are closed. + if (!writer_status_.ok() || writers_closed_) { + *end_of_sequence = true; + return writer_status_; + } + + TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, out_tensors, end_of_sequence)); + + // Finalize metadata file when we are at the end of the iterator. + if (*end_of_sequence) { + StopWriterThreads(/*mark_closed=*/true); + TF_RETURN_IF_ERROR(writer_status_); + return WriteMetadataFile(ctx->env(), /*finalized=*/true); + } + + int64 shard_index = 0; + TF_RETURN_IF_ERROR(GetShardIndex(out_tensors, &shard_index)); + + // If the index does not exist, we will start a new thread. + if (writer_threads_.count(shard_index) == 0) { + const tstring snapshot_shard_directory = + SnapshotShardDirectory(run_dir_, shard_index); + auto thread_data = std::make_unique( + this, ctx->env(), shard_index, snapshot_shard_directory, + current_checkpoint_id_); + writer_threads_.insert({shard_index, std::move(thread_data)}); + } + current_writer_thread = writer_threads_[shard_index].get(); + } + + current_writer_thread->EnqueueTensors(*out_tensors); + return Status::OK(); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Writer::SaveInternal( + SerializationContext* ctx, IteratorStateWriter* writer) { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name(kRunId), static_cast(run_id_))); + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name(kCurrentCheckpointId), + static_cast(current_checkpoint_id_))); + + StopWriterThreads(/*mark_closed=*/false); + writer_threads_.clear(); + current_checkpoint_id_++; + + return SaveInput(ctx, writer, input_impl_); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Writer::RestoreInternal( + IteratorContext* ctx, IteratorStateReader* reader) { + mutex_lock l(mu_); + int64 run_id_signed; + int64 current_checkpoint_id; + + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name(kRunId), &run_id_signed)); + TF_RETURN_IF_ERROR(reader->ReadScalar(full_name(kCurrentCheckpointId), + ¤t_checkpoint_id)); + + run_id_ = static_cast(run_id_signed); + run_dir_ = + RunDirectory(HashDirectory(dataset()->path_, dataset()->hash_), run_id_); + current_checkpoint_id_ = static_cast(current_checkpoint_id); + + return RestoreInput(ctx, reader, input_impl_); +} + +SnapshotDatasetV2Op::Dataset::Iterator::Passthrough::Passthrough( + const Params& params) + : DatasetIterator(params) {} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Passthrough::Initialize( + IteratorContext* ctx) { + return dataset()->input_->MakeIterator(ctx, this, prefix(), &input_impl_); +} + +Status 
SnapshotDatasetV2Op::Dataset::Iterator::Passthrough::GetNextInternal( + IteratorContext* ctx, std::vector* out_tensors, + bool* end_of_sequence) { + return input_impl_->GetNext(ctx, out_tensors, end_of_sequence); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Passthrough::SaveInternal( + SerializationContext* ctx, IteratorStateWriter* writer) { + return SaveInput(ctx, writer, input_impl_); +} + +Status SnapshotDatasetV2Op::Dataset::Iterator::Passthrough::RestoreInternal( + IteratorContext* ctx, IteratorStateReader* reader) { + return RestoreInput(ctx, reader, input_impl_); +} + +SnapshotDatasetV2Op::SnapshotDatasetV2Op(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx), graph_def_version_(ctx->graph_def_version()) { + FunctionMetadata::Params reader_params; + FunctionMetadata::Params shard_params; + + OP_REQUIRES_OK(ctx, ctx->GetAttr(kOutputTypes, &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr(kOutputShapes, &output_shapes_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr(kCompression, &compression_)); + + OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, kReaderFunc, reader_params, + &reader_func_metadata_)); + OP_REQUIRES_OK(ctx, FunctionMetadata::Create(ctx, kShardFunc, shard_params, + &shard_func_metadata_)); +} + +void SnapshotDatasetV2Op::MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) { + tstring path; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "path", &path)); + + // Computes the hash of the preceding items in the graph. + uint64 graph_hash; + GraphDef graph_def; + SerializationContext::Params params; + std::vector> input_list; + params.input_list = &input_list; + params.external_state_policy = + SerializationContext::ExternalStatePolicy::kIgnore; + OP_REQUIRES_OK( + ctx, AsGraphDef(ctx, input, SerializationContext(params), &graph_def)); + OP_REQUIRES_OK(ctx, HashGraph(graph_def, &graph_hash)); + + std::unique_ptr reader_func; + OP_REQUIRES_OK(ctx, + CapturedFunction::Create(ctx, reader_func_metadata_, + kReaderFuncOtherArgs, &reader_func)); + std::unique_ptr shard_func; + OP_REQUIRES_OK(ctx, + CapturedFunction::Create(ctx, shard_func_metadata_, + kShardFuncOtherArgs, &shard_func)); + + *output = new SnapshotDatasetV2Op::Dataset( + ctx, input, graph_hash, path, compression_, std::move(reader_func), + std::move(shard_func)); +} + +namespace { +REGISTER_KERNEL_BUILDER(Name("SnapshotDatasetV2").Device(DEVICE_CPU), + SnapshotDatasetV2Op); +} // namespace + +// ==== Legacy Snapshot Implementation (Deprecated) ==== + namespace { // Defaults to 10 GiB per shard. diff --git a/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.h b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.h new file mode 100644 index 00000000000..daab6d20e9b --- /dev/null +++ b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.h @@ -0,0 +1,83 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SNAPSHOT_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SNAPSHOT_DATASET_OP_H_ + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_format.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/op_requires.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/data/captured_function.h" +#include "tensorflow/core/kernels/data/dataset_utils.h" +#include "tensorflow/core/kernels/data/experimental/snapshot_util.h" +#include "tensorflow/core/kernels/data/name_utils.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/random.h" +#include "tensorflow/core/platform/thread_annotations.h" + +namespace tensorflow { +namespace data { +namespace experimental { + +const int64 kSnapshotFileFormatVersion = 1; + +class SnapshotDatasetV2Op : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "Snapshot"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + static constexpr const char* const kCompression = "compression"; + static constexpr const char* const kReaderFunc = "reader_func"; + static constexpr const char* const kShardFunc = "shard_func"; + static constexpr const char* const kReaderFuncOtherArgs = + "reader_func_other_args"; + static constexpr const char* const kShardFuncOtherArgs = + "shard_func_other_args"; + static constexpr const char* const kReaderFuncTarguments = + "Treader_func_args"; + static constexpr const char* const kShardFuncTarguments = "Tshard_func_args"; + + explicit SnapshotDatasetV2Op(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + + const int graph_def_version_; + DataTypeVector output_types_; + std::vector output_shapes_; + + std::string compression_; + + std::shared_ptr reader_func_metadata_; + std::shared_ptr shard_func_metadata_; +}; + +} // namespace experimental +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SNAPSHOT_DATASET_OP_H_ diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc index aa4bd64270a..18b35d3a172 100644 --- a/tensorflow/core/ops/experimental_dataset_ops.cc +++ b/tensorflow/core/ops/experimental_dataset_ops.cc @@ -899,6 +899,26 @@ REGISTER_OP("SnapshotDataset") return shape_inference::ScalarShape(c); }); +REGISTER_OP("SnapshotDatasetV2") + .Input("input_dataset: variant") + .Input("path: string") + .Input("reader_func_other_args: Treader_func_args") + .Input("shard_func_other_args: Tshard_func_args") + .Output("handle: variant") + .Attr("output_types: list(type) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .Attr("compression: string = ''") + .Attr("reader_func: func") + .Attr("shard_func: func") + .Attr("Treader_func_args: list(type) >= 0") + .Attr("Tshard_func_args: list(type) >= 0") + .SetShapeFn([](shape_inference::InferenceContext* c) { + 
shape_inference::ShapeHandle unused; + // `path` should be a scalar. + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + return shape_inference::ScalarShape(c); + }); + REGISTER_OP("SqlDataset") .Input("driver_name: string") .Input("data_source_name: string") diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py index badc173acec..c9d3d769c44 100644 --- a/tensorflow/python/data/experimental/__init__.py +++ b/tensorflow/python/data/experimental/__init__.py @@ -75,6 +75,7 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview. @@sample_from_datasets @@scan @@shuffle_and_repeat +@@snapshot @@take_while @@to_variant @@unbatch @@ -128,6 +129,7 @@ from tensorflow.python.data.experimental.ops.readers import SqlDataset from tensorflow.python.data.experimental.ops.resampling import rejection_resample from tensorflow.python.data.experimental.ops.scan_ops import scan from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repeat +from tensorflow.python.data.experimental.ops.snapshot import snapshot from tensorflow.python.data.experimental.ops.stats_aggregator import StatsAggregator from tensorflow.python.data.experimental.ops.stats_ops import bytes_produced_stats from tensorflow.python.data.experimental.ops.stats_ops import latency_stats diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD index 1d5abb9871b..5ab02842509 100644 --- a/tensorflow/python/data/experimental/kernel_tests/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/BUILD @@ -753,9 +753,12 @@ tf_py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:framework_test_lib", "//tensorflow/python:string_ops", + "//tensorflow/python/data/experimental/ops:readers", "//tensorflow/python/data/experimental/ops:snapshot", + "//tensorflow/python/data/kernel_tests:test_base", "//tensorflow/python/data/ops:dataset_ops", "//tensorflow/python/data/ops:readers", + "//tensorflow/python/data/util:nest", "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD index a189e591a3a..b3123d65852 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD @@ -199,7 +199,9 @@ tf_py_test( deps = [ ":dataset_serialization_test_base", "//tensorflow/python:client_testlib", + "//tensorflow/python/data/kernel_tests:test_base", "//tensorflow/python/data/ops:dataset_ops", + "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/snapshot_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/snapshot_dataset_serialization_test.py index d54e50ebfa3..d5daaacae9a 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/snapshot_dataset_serialization_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/snapshot_dataset_serialization_test.py @@ -34,6 +34,97 @@ class SnapshotDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase, parameterized.TestCase): + def _build_snapshot_dataset(self, repeat=False): + + def ds_fn(): + self._snapshot_dir = os.path.join(self.get_temp_dir(), "snapshot") + if not os.path.exists(self._snapshot_dir): + os.mkdir(self._snapshot_dir) + 
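# The ds_fn built here applies snapshot with default arguments. As a minimal
# usage sketch (not taken from the patch) of the user-facing API, the optional
# `shard_func` and `reader_func` arguments exercised in snapshot_test.py
# further below might be combined like this; the path is hypothetical, and by
# default the writer produces one shard per CPU core.
#
#   dataset = dataset_ops.Dataset.range(100)
#   dataset = dataset.apply(
#       snapshot.snapshot(
#           "/tmp/snapshot_dir",  # hypothetical snapshot directory
#           shard_func=lambda x: x % 4,  # route each element to one of 4 shards
#           reader_func=lambda ds: ds.interleave(
#               lambda d: d, cycle_length=4, num_parallel_calls=4)))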
+ dataset = dataset_ops.Dataset.range(100) + dataset = dataset.apply(snapshot.snapshot(self._snapshot_dir)) + if repeat: + dataset = dataset.repeat(2) + return dataset + + return ds_fn + + @combinations.generate(test_base.default_test_combinations()) + def testCheckpointBeforeEpochEndNoRepeat(self): + ds_fn = self._build_snapshot_dataset(repeat=False) + outputs = self.gen_outputs(ds_fn, [], 50, verify_exhausted=False) + self.assertSequenceEqual(outputs, range(50)) + outputs.extend( + self.gen_outputs(ds_fn, [], 50, ckpt_saved=True, verify_exhausted=True)) + self.assertSequenceEqual(outputs, range(100)) + + @combinations.generate(test_base.default_test_combinations()) + def testCheckpointBeforeOneEpochWithReading(self): + ds_fn = self._build_snapshot_dataset(repeat=True) + + # Generate 50 entries from iterator and save checkpoint. + outputs = self.gen_outputs(ds_fn, [], 50, verify_exhausted=False) + self.assertSequenceEqual(outputs, list(range(50))) + + # Restore from checkpoint and produce the rest of the elements from the + # iterator. + t = self.gen_outputs(ds_fn, [], 150, ckpt_saved=True, verify_exhausted=True) + outputs.extend(t) + self.assertSequenceEqual( + outputs, + list(range(50)) + list(range(50, 100)) + list(range(100))) + + @combinations.generate(test_base.default_test_combinations()) + def testCheckpointBeforeOneEpochThenRunAFewSteps(self): + ds_fn = self._build_snapshot_dataset(repeat=False) + outputs = self.gen_outputs( + ds_fn, [10], 20, verify_exhausted=False, save_checkpoint_at_end=False) + self.assertSequenceEqual(outputs, range(20)) + + outputs = outputs[:10] + outputs.extend( + self.gen_outputs(ds_fn, [], 90, ckpt_saved=True, verify_exhausted=True)) + self.assertSequenceEqual(outputs, range(100)) + + @combinations.generate(test_base.default_test_combinations()) + def testCheckpointAfterOneEpoch(self): + ds_fn = self._build_snapshot_dataset(repeat=True) + + # Generate 110 entries from iterator and save checkpoint. + outputs = self.gen_outputs(ds_fn, [], 110, verify_exhausted=False) + self.assertSequenceEqual(outputs, list(range(100)) + list(range(10))) + + # Restore from checkpoint and produce the rest of the elements from the + # iterator. + t = self.gen_outputs(ds_fn, [], 90, ckpt_saved=True, verify_exhausted=True) + outputs.extend(t) + self.assertSequenceEqual( + outputs, + list(range(100)) + list(range(10)) + list(range(10, 100))) + + @combinations.generate(test_base.default_test_combinations()) + def testCheckpointAfterOneEpochRunFewSteps(self): + ds_fn = self._build_snapshot_dataset(repeat=True) + + # Generate 120 entries from iterator and save checkpoint at 110. + outputs = self.gen_outputs( + ds_fn, [110], 120, verify_exhausted=False, save_checkpoint_at_end=False) + self.assertSequenceEqual(outputs, list(range(100)) + list(range(20))) + + # Restore from checkpoint and produce the rest of the elements from the + # iterator. 
+ outputs = outputs[:110] + t = self.gen_outputs(ds_fn, [], 90, ckpt_saved=True, verify_exhausted=True) + outputs.extend(t) + self.assertSequenceEqual( + outputs, + list(range(100)) + list(range(10)) + list(range(10, 100))) + + +class LegacySnapshotDatasetSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase, + parameterized.TestCase): + def _build_snapshot_dataset(self, num_threads=1, repeat=False, diff --git a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py index 6169a1752fc..b6fc337db61 100644 --- a/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/snapshot_test.py @@ -17,11 +17,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import multiprocessing import os import shutil import time from absl.testing import parameterized +import numpy as np from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.ops import snapshot @@ -40,6 +42,285 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, def setUp(self): super(SnapshotDatasetTest, self).setUp() + tmpdir = self.get_temp_dir() + tmpdir = os.path.join(tmpdir, "snapshot") + os.mkdir(tmpdir) + self._snapshot_dir = tmpdir + + def tearDown(self): + super(SnapshotDatasetTest, self).tearDown() + shutil.rmtree(self._snapshot_dir) + + def createTFRecords(self, num_files=10, num_records=100): + self._num_files = num_files + self._num_records = num_records + self._test_filenames = self._createFiles() + + def removeTFRecords(self): + for filename in self._test_filenames: + os.remove(filename) + self._test_filenames = [] + self._num_files = None + self._num_records = None + + def assertDatasetProducesSet(self, dataset, expected): + actual = [] + next_fn = self.getNext(dataset) + for _ in range(len(expected)): + elem = self.evaluate(next_fn()) + actual.append(elem) + self.assertCountEqual(actual, expected) + with self.assertRaises(errors.OutOfRangeError): + self.evaluate(next_fn()) + + def assertSnapshotDirectoryContains(self, directory, num_fingerprints, + num_runs_per_fingerprint, + num_snapshot_shards_per_run): + dirlist_raw = os.listdir(directory) + dirlist = [] + + # Ignore the graphdef pbtxts we write for debugging purposes. 
+ for i in range(len(dirlist_raw)): + if not dirlist_raw[i].endswith("-graph.pbtxt"): + dirlist.append(dirlist_raw[i]) + + self.assertLen(dirlist, num_fingerprints) + + for i in range(num_fingerprints): + fingerprint_dir = os.path.join(directory, dirlist[i]) + fingerprint_dir_list = sorted(os.listdir(fingerprint_dir)) + self.assertLen(fingerprint_dir_list, num_runs_per_fingerprint + 1) + self.assertEqual(fingerprint_dir_list[num_runs_per_fingerprint], + "snapshot.metadata") + + for j in range(num_runs_per_fingerprint): + run_dir = os.path.join(fingerprint_dir, fingerprint_dir_list[j]) + run_dirlist = sorted(os.listdir(run_dir)) + self.assertLen(run_dirlist, num_snapshot_shards_per_run) + + file_counter = 0 + for filename in run_dirlist: + self.assertEqual(filename, "%08d.shard" % file_counter) + file_counter += 1 + + @combinations.generate(test_base.default_test_combinations()) + def testCreateSnapshotDataset(self): + dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]) + dataset.apply(snapshot.snapshot(self._snapshot_dir)) + + @combinations.generate(test_base.default_test_combinations()) + def testReadSnapshotDatasetDefault(self): + self.createTFRecords() + filenames = self._test_filenames + expected = [ + b"Record %d of file %d" % (r, f) # pylint:disable=g-complex-comprehension + for f in range(0, 10) + for r in range(0, 100) + ] + + dataset = core_readers._TFRecordDataset(filenames) + dataset = dataset.apply(snapshot.snapshot(self._snapshot_dir)) + self.assertDatasetProduces(dataset, expected) + self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=1, + num_runs_per_fingerprint=1, + num_snapshot_shards_per_run=multiprocessing.cpu_count()) + + self.removeTFRecords() + dataset2 = core_readers._TFRecordDataset(filenames) + dataset2 = dataset2.apply(snapshot.snapshot(self._snapshot_dir)) + self.assertDatasetProduces(dataset2, expected) + + @combinations.generate(test_base.default_test_combinations()) + def testReadSnapshotDatasetCustomShardFn(self): + self.createTFRecords() + filenames = self._test_filenames + expected = [ + b"Record %d of file %d" % (r, f) # pylint:disable=g-complex-comprehension + for f in range(0, 10) + for r in range(0, 100) + ] + + dataset = core_readers._TFRecordDataset(filenames) + dataset = dataset.apply( + snapshot.snapshot(self._snapshot_dir, shard_func=lambda _: np.int64(0))) + self.assertDatasetProduces(dataset, expected) + self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=1, + num_runs_per_fingerprint=1, + num_snapshot_shards_per_run=1) + + self.removeTFRecords() + dataset2 = core_readers._TFRecordDataset(filenames) + dataset2 = dataset2.apply( + snapshot.snapshot(self._snapshot_dir, shard_func=lambda _: 0)) + self.assertDatasetProduces(dataset2, expected) + + @combinations.generate(test_base.default_test_combinations()) + def testReadSnapshotDatasetCustomReaderFn(self): + self.createTFRecords() + filenames = self._test_filenames + expected = [ + b"Record %d of file %d" % (r, f) # pylint:disable=g-complex-comprehension + for f in range(0, 10) + for r in range(0, 100) + ] + + dataset = core_readers._TFRecordDataset(filenames) + dataset = dataset.apply( + snapshot.snapshot( + self._snapshot_dir, + reader_func=( + lambda ds: ds.interleave( # pylint:disable=g-long-lambda + lambda x: x, + cycle_length=4, + num_parallel_calls=4)))) + self.assertDatasetProduces(dataset, expected) + self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=1, + num_runs_per_fingerprint=1, + 
num_snapshot_shards_per_run=multiprocessing.cpu_count()) + + self.removeTFRecords() + dataset2 = core_readers._TFRecordDataset(filenames) + dataset2 = dataset2.apply( + snapshot.snapshot( + self._snapshot_dir, + reader_func=( + lambda ds: ds.interleave( # pylint:disable=g-long-lambda + lambda x: x, + cycle_length=4, + num_parallel_calls=4)))) + self.assertDatasetProducesSet(dataset2, expected) + + @combinations.generate(test_base.default_test_combinations()) + def testSnapshotDatasetInvalidShardFn(self): + dataset = dataset_ops.Dataset.range(1000) + with self.assertRaises(TypeError): + dataset = dataset.apply( + snapshot.snapshot( + self._snapshot_dir, shard_func=lambda _: "invalid_fn")) + next_fn = self.getNext(dataset) + self.evaluate(next_fn()) + + @combinations.generate(test_base.default_test_combinations()) + def testSnapshotDatasetInvalidReaderFn(self): + dataset = dataset_ops.Dataset.range(1000) + with self.assertRaises(TypeError): + dataset = dataset.apply( + snapshot.snapshot(self._snapshot_dir, reader_func=lambda x: x + 1)) + next_fn = self.getNext(dataset) + self.evaluate(next_fn()) + + @combinations.generate(test_base.default_test_combinations()) + def testWriteSnapshotDatasetSimple(self): + dataset = dataset_ops.Dataset.range(1000) + dataset = dataset.apply(snapshot.snapshot(self._snapshot_dir)) + self.assertDatasetProduces(dataset, list(range(1000))) + self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=1, + num_runs_per_fingerprint=1, + num_snapshot_shards_per_run=multiprocessing.cpu_count()) + + @combinations.generate(test_base.default_test_combinations()) + def testWriteSnapshotDatasetMultipleFingerprints(self): + dataset1 = dataset_ops.Dataset.range(1000) + dataset1 = dataset1.apply(snapshot.snapshot(self._snapshot_dir)) + self.assertDatasetProduces(dataset1, list(range(1000))) + + dataset2 = dataset_ops.Dataset.range(2000) + dataset2 = dataset2.apply(snapshot.snapshot(self._snapshot_dir)) + self.assertDatasetProduces(dataset2, list(range(2000))) + + self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=2, + num_runs_per_fingerprint=1, + num_snapshot_shards_per_run=multiprocessing.cpu_count()) + + @combinations.generate(test_base.default_test_combinations()) + def testWriteSnapshotDatasetSameFingerprintMultipleCompleteRuns(self): + dataset1 = dataset_ops.Dataset.range(1000) + dataset1 = dataset1.apply(snapshot.snapshot(self._snapshot_dir)) + self.assertDatasetProduces(dataset1, list(range(1000))) + dataset2 = dataset_ops.Dataset.range(1000) + dataset2 = dataset2.apply(snapshot.snapshot(self._snapshot_dir)) + self.assertDatasetProduces(dataset2, list(range(1000))) + + self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=1, + num_runs_per_fingerprint=1, + num_snapshot_shards_per_run=multiprocessing.cpu_count()) + + @combinations.generate(test_base.default_test_combinations()) + def testWriteSnapshotDatasetSameFingerprintIncompleteRunRestart(self): + dataset1 = dataset_ops.Dataset.range(1000) + dataset1 = dataset1.apply(snapshot.snapshot(self._snapshot_dir)) + next1 = self.getNext(dataset1) + for i in range(500): + self.assertEqual(i, self.evaluate(next1())) + + dataset2 = dataset_ops.Dataset.range(1000) + dataset2 = dataset2.apply(snapshot.snapshot(self._snapshot_dir)) + next2 = self.getNext(dataset2) + for i in range(500): + self.assertEqual(i, self.evaluate(next2())) + + for i in range(500, 1000): + self.assertEqual(i, self.evaluate(next1())) + self.assertEqual(i, self.evaluate(next2())) + + 
self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=1, + num_runs_per_fingerprint=2, + num_snapshot_shards_per_run=multiprocessing.cpu_count()) + + @combinations.generate(test_base.default_test_combinations()) + def testWriteSnapshotCustomShardFunction(self): + dataset = dataset_ops.Dataset.range(1000) + dataset = dataset.enumerate() + dataset = dataset.apply( + snapshot.snapshot(self._snapshot_dir, shard_func=lambda i, _: i % 2)) + dataset = dataset.map(lambda _, elem: elem) + self.assertDatasetProduces(dataset, list(range(1000))) + self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=1, + num_runs_per_fingerprint=1, + num_snapshot_shards_per_run=2) + + @combinations.generate(test_base.default_test_combinations()) + def testWriteSnapshotDatasetWithTuples(self): + dataset1 = dataset_ops.Dataset.range(0, 1000) + dataset2 = dataset_ops.Dataset.range(1000, 2000) + dataset3 = dataset_ops.Dataset.range(2000, 3000) + dataset4 = dataset_ops.Dataset.range(3000, 4000) + + dataset = dataset_ops.Dataset.zip((dataset1, dataset2, dataset3, dataset4)) + dataset = dataset.apply(snapshot.snapshot(self._snapshot_dir)) + next1 = self.getNext(dataset) + for i in range(0, 1000): + self.assertEqual((i, i + 1000, i + 2000, i + 3000), + self.evaluate(next1())) + self.assertSnapshotDirectoryContains( + self._snapshot_dir, + num_fingerprints=1, + num_runs_per_fingerprint=1, + num_snapshot_shards_per_run=multiprocessing.cpu_count()) + + +class LegacySnapshotDatasetTest( + reader_dataset_ops_test_base.TFRecordDatasetTestBase, + parameterized.TestCase): + + def setUp(self): + super(LegacySnapshotDatasetTest, self).setUp() self.removeTFRecords() tmpdir = self.get_temp_dir() tmpdir = os.path.join(tmpdir, "snapshot") @@ -47,7 +328,7 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, self.snapshot_dir = tmpdir def tearDown(self): - super(SnapshotDatasetTest, self).tearDown() + super(LegacySnapshotDatasetTest, self).tearDown() shutil.rmtree(self.snapshot_dir) def removeTFRecords(self): @@ -63,8 +344,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, def makeSnapshotDirectory(self): return self.snapshot_dir - def assertSnapshotDirectoryContains( - self, directory, num_fingerprints, num_runs_per_fp, num_snapshot_files): + def assertSnapshotDirectoryContains(self, directory, num_fingerprints, + num_runs_per_fp, num_snapshot_files): dirlist_raw = os.listdir(directory) dirlist = [] @@ -465,8 +746,8 @@ class SnapshotDatasetTest(reader_dataset_ops_test_base.TFRecordDatasetTestBase, ]), combinations.combine(threads=2, size=[1, 2]) + combinations.combine(threads=8, size=[1, 4, 8])))) - def testReadSnapshotBackAfterMultiThreadedWrite( - self, compression, threads, size): + def testReadSnapshotBackAfterMultiThreadedWrite(self, compression, threads, + size): self.setUpTFRecord() filenames = self.test_filenames diff --git a/tensorflow/python/data/experimental/ops/snapshot.py b/tensorflow/python/data/experimental/ops/snapshot.py index 490455fcbc3..8a726490efc 100644 --- a/tensorflow/python/data/experimental/ops/snapshot.py +++ b/tensorflow/python/data/experimental/ops/snapshot.py @@ -17,12 +17,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import multiprocessing + from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import 
random_seed +from tensorflow.python.framework import tensor_spec from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops - +from tensorflow.python.util import deprecation +from tensorflow.python.util.tf_export import tf_export COMPRESSION_GZIP = "GZIP" COMPRESSION_SNAPPY = "SNAPPY" @@ -99,6 +103,8 @@ class _LegacySnapshotDataset(dataset_ops.UnaryUnchangedStructureDataset): super(_LegacySnapshotDataset, self).__init__(input_dataset, variant_tensor) +@deprecation.deprecated( + None, "Use `tf.data.experimental.snapshot(...)` instead.") def legacy_snapshot(path, compression=None, reader_path_prefix=None, @@ -186,3 +192,165 @@ def legacy_snapshot(path, snapshot_name=snapshot_name) return _apply_fn + + +class _SnapshotDataset(dataset_ops.UnaryUnchangedStructureDataset): + """A dataset that allows saving and re-use of already processed data.""" + + def __init__(self, + input_dataset, + path, + shard_func, + compression=None, + reader_func=None, + pending_snapshot_expiry_seconds=None, + use_legacy_function=False): + + if reader_func is None: + reader_func = lambda datasets: datasets.interleave( # pylint:disable=g-long-lambda + lambda x: x, + cycle_length=multiprocessing.cpu_count(), + num_parallel_calls=dataset_ops.AUTOTUNE) + + self._input_dataset = input_dataset + self._path = path + self._compression = compression + + self._reader_func = dataset_ops.StructuredFunctionWrapper( + reader_func, + self._transformation_name() + ".reader_func", + # Dataset of datasets of input elements + input_structure=dataset_ops.DatasetSpec( + dataset_ops.DatasetSpec(input_dataset.element_spec)), + use_legacy_function=use_legacy_function) + self._shard_func = dataset_ops.StructuredFunctionWrapper( + shard_func, + self._transformation_name() + ".shard_func", + dataset=input_dataset, + use_legacy_function=use_legacy_function) + + if ((not self._shard_func.output_structure.is_compatible_with( + tensor_spec.TensorSpec([], dtypes.int32))) and + (not self._shard_func.output_structure.is_compatible_with( + tensor_spec.TensorSpec([], dtypes.int64)))): + raise TypeError( + "shard_func must return a 0-dimension tensor containing an int.") + + variant_tensor = ged_ops.snapshot_dataset_v2( + input_dataset._variant_tensor, # pylint: disable=protected-access + path, + self._reader_func.function.captured_inputs, + self._shard_func.function.captured_inputs, + compression=compression, + reader_func=self._reader_func.function, + shard_func=self._shard_func.function, + **self._flat_structure) + super(_SnapshotDataset, self).__init__(input_dataset, variant_tensor) + + def _functions(self): + return [self._reader_func, self._shard_func] + + def _transformation_name(self): + return "Dataset.snapshot" + + +@tf_export("data.experimental.snapshot") +def snapshot(path, compression="AUTO", reader_func=None, shard_func=None): + """API to persist the output of the input dataset. + + The snapshot API allows users to transparently persist the output of their + preprocessing pipeline to disk, and materialize the pre-processed data on a + different training run. + + This API enables repeated preprocessing steps to be consolidated, and allows + re-use of already processed data, trading off disk storage and network + bandwidth for freeing up more valuable CPU resources and accelerator compute + time. + + https://github.com/tensorflow/community/blob/master/rfcs/20200107-tf-data-snapshot.md + has detailed design documentation of this feature. 
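+
+  A minimal usage sketch is shown below (the snapshot directory here is just a
+  placeholder path; any writable directory can be used):
+
+  ```python
+  dataset = ...
+  dataset = dataset.apply(
+      tf.data.experimental.snapshot("/path/to/snapshot/dir"))
+  # The first run writes the snapshot; subsequent runs of the same input
+  # pipeline read the materialized data back instead of recomputing it.
+  for elem in dataset:
+    ...
+  ```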
+ + Users can specify various options to control the behavior of snapshot, + including how snapshots are read from and written to by passing in + user-defined functions to the `reader_func` and `shard_func` parameters. + + `shard_func` is a user-specified function that maps input elements to snapshot + shards. + + Users may want to specify this function to control how snapshot files should + be written to disk. Below is an example of how a potential shard_func could + be written. + + ```python + dataset = ... + dataset = dataset.enumerate() + dataset = dataset.apply(tf.data.experimental.snapshot( + shard_func=lambda x, y: x % NUM_SHARDS, ...)) + dataset = dataset.map(lambda x, y: y) + ``` + + `reader_func` is a user-specified function that accepts a single argument: + (1) a Dataset of Datasets, each representing a "split" of elements of the + original dataset. The cardinality of the input dataset matches the + number of the shards specified in the `shard_func` (see above). The function + should return a Dataset of elements of the original dataset. + + Users may want to specify this function to control how snapshot files should be + read from disk, including the amount of shuffling and parallelism. + + Here is an example of a standard reader function a user can define. This + function enables both dataset shuffling and parallel reading of datasets: + + ```python + def user_reader_func(datasets): + # shuffle the datasets splits + datasets = datasets.shuffle(NUM_CORES) + # read datasets in parallel and interleave their elements + return datasets.interleave(lambda x: x, num_parallel_calls=AUTOTUNE) + + dataset = dataset.apply(tf.data.experimental.snapshot( + reader_func=user_reader_func)) + ``` + + By default, snapshot parallelizes reads by the number of cores available on + the system, but will not attempt to shuffle the data. + + Args: + path: Required. A directory to use for storing / loading the snapshot to / + from. + compression: Optional. The type of compression to apply to the snapshot + written to disk. Supported options are `GZIP`, `SNAPPY`, `AUTO` or None. + Defaults to AUTO, which attempts to pick an appropriate compression + algorithm for the dataset. + reader_func: Optional. A function to control how to read data from snapshot + shards. + shard_func: Optional. A function to control how to shard data when writing a + snapshot. + + Returns: + A `Dataset` transformation function, which can be passed to + `tf.data.Dataset.apply`. + """ + + def _apply_fn(dataset): + """Actual dataset transformation.""" + if shard_func is None: + dataset = dataset.enumerate() + dataset = _SnapshotDataset( + input_dataset=dataset, + path=path, + compression=compression, + reader_func=reader_func, + # This will not do the right thing where the graph is built on a + # different machine than the executor (e.g. Cloud TPUs).
+ shard_func=lambda index, _: index % multiprocessing.cpu_count()) + return dataset.map(lambda _, elem: elem) + else: + return _SnapshotDataset( + input_dataset=dataset, + path=path, + compression=compression, + reader_func=reader_func, + shard_func=shard_func) + + return _apply_fn diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt index 523129c7037..e9ec2a6e187 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt @@ -220,6 +220,10 @@ tf_module { name: "shuffle_and_repeat" argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "snapshot" + argspec: "args=[\'path\', \'compression\', \'reader_func\', \'shard_func\'], varargs=None, keywords=None, defaults=[\'AUTO\', \'None\', \'None\'], " + } member_method { name: "take_while" argspec: "args=[\'predicate\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 19bf27e1cde..3a47de87737 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -3992,6 +3992,10 @@ tf_module { name: "SnapshotDataset" argspec: "args=[\'input_dataset\', \'path\', \'output_types\', \'output_shapes\', \'compression\', \'reader_path_prefix\', \'writer_path_prefix\', \'shard_size_bytes\', \'pending_snapshot_expiry_seconds\', \'num_reader_threads\', \'reader_buffer_size\', \'num_writer_threads\', \'writer_buffer_size\', \'shuffle_on_read\', \'seed\', \'seed2\', \'mode\', \'snapshot_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'10737418240\', \'86400\', \'1\', \'1\', \'1\', \'1\', \'False\', \'0\', \'0\', \'auto\', \'\', \'None\'], " } + member_method { + name: "SnapshotDatasetV2" + argspec: "args=[\'input_dataset\', \'path\', \'reader_func_other_args\', \'shard_func_other_args\', \'output_types\', \'output_shapes\', \'reader_func\', \'shard_func\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " + } member_method { name: "SobolSample" argspec: "args=[\'dim\', \'num_results\', \'skip\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt index 2fc32b21adc..bef3f166a6d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt @@ -188,6 +188,10 @@ tf_module { name: "shuffle_and_repeat" argspec: "args=[\'buffer_size\', \'count\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } + member_method { + name: "snapshot" + argspec: "args=[\'path\', \'compression\', \'reader_func\', \'shard_func\'], varargs=None, keywords=None, defaults=[\'AUTO\', \'None\', \'None\'], " + } member_method { name: "take_while" argspec: "args=[\'predicate\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 19bf27e1cde..3a47de87737 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -3992,6 +3992,10 @@ tf_module { name: "SnapshotDataset" argspec: "args=[\'input_dataset\', \'path\', \'output_types\', \'output_shapes\', \'compression\', \'reader_path_prefix\', \'writer_path_prefix\', \'shard_size_bytes\', \'pending_snapshot_expiry_seconds\', \'num_reader_threads\', \'reader_buffer_size\', \'num_writer_threads\', \'writer_buffer_size\', \'shuffle_on_read\', \'seed\', \'seed2\', \'mode\', \'snapshot_name\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'\', \'\', \'10737418240\', \'86400\', \'1\', \'1\', \'1\', \'1\', \'False\', \'0\', \'0\', \'auto\', \'\', \'None\'], " } + member_method { + name: "SnapshotDatasetV2" + argspec: "args=[\'input_dataset\', \'path\', \'reader_func_other_args\', \'shard_func_other_args\', \'output_types\', \'output_shapes\', \'reader_func\', \'shard_func\', \'compression\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], " + } member_method { name: "SobolSample" argspec: "args=[\'dim\', \'num_results\', \'skip\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'None\'], " From 9fbf5e5d503acb486a6a7023d74a04eff9b0c7e3 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Mon, 1 Jun 2020 21:03:11 -0700 Subject: [PATCH 1519/1533] Minor fix to snapshot documentation to add snapshot path as positional argument PiperOrigin-RevId: 314260887 Change-Id: If3a771a0ba7c6454e9022477614f7bea7f80f404 --- tensorflow/python/data/experimental/ops/snapshot.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/snapshot.py b/tensorflow/python/data/experimental/ops/snapshot.py index 8a726490efc..782e9490d25 100644 --- a/tensorflow/python/data/experimental/ops/snapshot.py +++ b/tensorflow/python/data/experimental/ops/snapshot.py @@ -284,7 +284,7 @@ def snapshot(path, compression="AUTO", reader_func=None, shard_func=None): ```python dataset = ... dataset = dataset.enumerate() - dataset = dataset.apply(tf.data.experimental.snapshot( + dataset = dataset.apply(tf.data.experimental.snapshot("/path/to/snapshot/dir", shard_func=lambda x, y: x % NUM_SHARDS, ...)) dataset = dataset.map(lambda x, y: y) ``` @@ -308,7 +308,7 @@ def snapshot(path, compression="AUTO", reader_func=None, shard_func=None): # read datasets in parallel and interleave their elements return datasets.interleave(lambda x: x, num_parallel_calls=AUTOTUNE) - dataset = dataset.apply(tf.data.experimental.snapshot( + dataset = dataset.apply(tf.data.experimental.snapshot("/path/to/snapshot/dir", reader_func=user_reader_func)) ``` From cae9671a88d44fa1228bb87aafa50713c68f003f Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 1 Jun 2020 21:09:25 -0700 Subject: [PATCH 1520/1533] Add a recursive import before running doctest. This ensures all code files are checked, including those that are not implicitly imported by `import tensorflow`. This also: - Fixes some broken doctests that were hiding in the docs - Switches StructuredTensor's __repr__ to print one item per line.
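The general shape of that import step is sketched below for reference (names here are generic and purely illustrative; the concrete `recursive_import` helper added to tensorflow/tools/docs/tf_doctest.py appears in the diff further down):

```python
import importlib
import pkgutil

def import_all_submodules(root):
  """Best-effort import of every submodule under `root` (a package object)."""
  for _, name, _ in pkgutil.walk_packages(root.__path__,
                                          prefix=root.__name__ + '.'):
    try:
      importlib.import_module(name)
    except (AttributeError, ImportError):
      # Some modules cannot be imported in every build configuration; skip them.
      pass
```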
PiperOrigin-RevId: 314261531 Change-Id: I9ef7c344e40c5e03fd441105327e70a0ab4fdedc --- .../ops/structured/structured_tensor.py | 66 +++++++++++-------- .../ops/structured/structured_tensor_test.py | 30 ++++++--- .../python/training/tensorboard_logging.py | 8 ++- tensorflow/tools/docs/BUILD | 1 - tensorflow/tools/docs/tf_doctest.py | 24 +++++-- 5 files changed, 81 insertions(+), 48 deletions(-) diff --git a/tensorflow/python/ops/structured/structured_tensor.py b/tensorflow/python/ops/structured/structured_tensor.py index 2007b68a548..3c3bd03a06b 100644 --- a/tensorflow/python/ops/structured/structured_tensor.py +++ b/tensorflow/python/ops/structured/structured_tensor.py @@ -60,7 +60,6 @@ class StructuredTensor(composite_tensor.CompositeTensor): ### Examples - ```python >>> # A scalar StructuredTensor describing a single person. >>> s1 = StructuredTensor.from_pyval( ... {"age": 82, "nicknames": ["Bob", "Bobby"]}) @@ -78,7 +77,7 @@ class StructuredTensor(composite_tensor.CompositeTensor): TensorShape([3]) >>> s2[0]["age"] - ``` + ### Field Paths @@ -156,18 +155,19 @@ class StructuredTensor(composite_tensor.CompositeTensor): Examples: >>> StructuredTensor.from_fields({'x': 1, 'y': [1, 2, 3]}) - + >>> StructuredTensor.from_fields({'foo': [1, 2], 'bar': [3, 4]}, ... shape=[2]) - - + """ shape = tensor_shape.as_shape(shape) rank = shape.rank @@ -437,9 +437,15 @@ class StructuredTensor(composite_tensor.CompositeTensor): return self._fields[key[rank]].__getitem__(key[:rank] + key[rank + 1:]) def __repr__(self): - return '' % (', '.join( - '"%s": %s' % (k, v) - for k, v in sorted(self._fields.items())), self._shape) + fields = sorted(self._fields.items()) + fields = ((k, str(v).replace('\n', '\n ')) for k, v in fields) + fields = ('"{}": {}'.format(k, v) for k, v in fields) + dict_repr = ',\n '.join(fields) + return ( + '' % (dict_repr, self._shape)) #============================================================================= # Conversion @@ -506,10 +512,11 @@ class StructuredTensor(composite_tensor.CompositeTensor): >>> StructuredTensor.from_pyval( ... {'a': [1, 2, 3], 'b': [[4, 5], [6, 7]]}) - }, - shape=())> + }, + shape=())> Note that `StructuredTensor.from_pyval(pyval).to_pyval() == pyval`. @@ -639,9 +646,10 @@ class StructuredTensor(composite_tensor.CompositeTensor): ... [{'foo': 12}, {'foo': 33}, {'foo': 99}]) >>> partition = RowPartition.from_row_lengths([2, 0, 1]) >>> st.partition_outer_dimension(partition) - }, - shape=(3, None))> + }, + shape=(3, None))> Args: row_partition: A `RowPartition`. @@ -664,9 +672,10 @@ class StructuredTensor(composite_tensor.CompositeTensor): >>> st = StructuredTensor.from_pyval( ... [[{'foo': 12}, {'foo': 33}], [], [{'foo': 99}]]) >>> st.merge_dims(0, 1) - + Args: outer_axis: `int`: The first dimension in the range of dimensions to @@ -1071,16 +1080,17 @@ def _partition_outer_dimension(value, row_partition): Examples: - >>> partition = row_partition.RowPartition.from_row_lengths([2, 0, 1]) + >>> partition = RowPartition.from_row_lengths([2, 0, 1]) >>> _partition_outer_dimension(tf.constant([1, 2, 3]), partition) >>> struct_value = StructuredTensor.from_pyval( ... 
[{'x': 1}, {'x': 2}, {'x': 3}]) >>> _partition_outer_dimension(struct_value, partition) - }, - shape=(3, None))> + }, + shape=(3, None))> Args: value: Tensor, RaggedTensor, or StructuredTensor diff --git a/tensorflow/python/ops/structured/structured_tensor_test.py b/tensorflow/python/ops/structured/structured_tensor_test.py index 420705b07e7..896bfff1296 100644 --- a/tensorflow/python/ops/structured/structured_tensor_test.py +++ b/tensorflow/python/ops/structured/structured_tensor_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import textwrap + from absl.testing import parameterized import numpy as np @@ -929,17 +931,25 @@ class StructuredTensorTest(test_util.TensorFlowTestCase, def testRepr(self): st = StructuredTensor.from_pyval({"a": 5, "b": {"c": [1, 2, 3]}}) if context.executing_eagerly(): - expected = ("}, shape=())>") + expected = textwrap.dedent(""" + }, + shape=())>""")[1:] else: - expected = ("}, shape=())>") + expected = textwrap.dedent(""" + }, + shape=())>""")[1:] self.assertEqual(repr(st), expected) def testPartitionOuterDimension2DDenseField(self): diff --git a/tensorflow/python/training/tensorboard_logging.py b/tensorflow/python/training/tensorboard_logging.py index b275d2f682f..7c1a2cdb086 100644 --- a/tensorflow/python/training/tensorboard_logging.py +++ b/tensorflow/python/training/tensorboard_logging.py @@ -21,9 +21,11 @@ or that should be permanently associated with the training session. You can use this just like the logging module: ->>> tensorboard_logging.set_summary_writer(summary_writer) ->>> tensorboard_logging.info("my %s", "message") ->>> tensorboard_logging.log(tensorboard_logging.WARN, "something") +``` +tensorboard_logging.set_summary_writer(summary_writer) +tensorboard_logging.info("my %s", "message") +tensorboard_logging.log(tensorboard_logging.WARN, "something") +``` """ from __future__ import absolute_import diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD index 1f2dd5d31d2..f347cab91bb 100644 --- a/tensorflow/tools/docs/BUILD +++ b/tensorflow/tools/docs/BUILD @@ -39,7 +39,6 @@ py_test( deps = [ ":tf_doctest_lib", "//tensorflow:tensorflow_py", - "//tensorflow/python/keras/preprocessing", "//third_party/py/numpy", ], ) diff --git a/tensorflow/tools/docs/tf_doctest.py b/tensorflow/tools/docs/tf_doctest.py index fc81d33cfde..00bf3492787 100644 --- a/tensorflow/tools/docs/tf_doctest.py +++ b/tensorflow/tools/docs/tf_doctest.py @@ -19,16 +19,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import importlib import os +import pkgutil import sys from absl import flags from absl.testing import absltest import numpy as np - import tensorflow.compat.v2 as tf -from tensorflow.python.keras import preprocessing +import tensorflow.python as tf_root from tensorflow.tools.docs import tf_doctest_lib # We put doctest after absltest so that it picks up the unittest monkeypatch. @@ -37,9 +38,6 @@ import doctest # pylint: disable=g-bad-import-order tf.compat.v1.enable_v2_behavior() -# Inject keras.preprocessing files into `tf.keras.preprocessing` namespace. -tf.keras.preprocessing = preprocessing - FLAGS = flags.FLAGS flags.DEFINE_list('module', [], 'A list of specific module to run doctest on.') @@ -56,6 +54,20 @@ flags.mark_flags_as_mutual_exclusive(['list', 'file']) PACKAGE = 'tensorflow.python.' +def recursive_import(root): + """Recursively imports all the sub-modules under a root package. 
+ + Args: + root: A python package. + """ + for _, name, _ in pkgutil.walk_packages( + root.__path__, prefix=root.__name__ + '.'): + try: + importlib.import_module(name) + except (AttributeError, ImportError): + pass + + def find_modules(): """Finds all the modules in the core package imported. @@ -166,6 +178,6 @@ def load_tests(unused_loader, tests, unused_ignore): )) return tests - if __name__ == '__main__': + recursive_import(tf_root) absltest.main() From e97e7d851b56bc58acfbfc07d7ad3c29e801ded2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 21:18:07 -0700 Subject: [PATCH 1521/1533] Update ops-related pbtxt files. PiperOrigin-RevId: 314262225 Change-Id: I36b110df3f4fd3e224b2069ee0c5263e702c32e9 --- .../ops_history_v2/SnapshotDatasetV2.pbtxt | 60 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 60 +++++++++++++++++++ 2 files changed, 120 insertions(+) create mode 100644 tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt diff --git a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt new file mode 100644 index 00000000000..f0868514182 --- /dev/null +++ b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt @@ -0,0 +1,60 @@ +op { + name: "SnapshotDatasetV2" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "path" + type: DT_STRING + } + input_arg { + name: "reader_func_other_args" + type_list_attr: "Treader_func_args" + } + input_arg { + name: "shard_func_other_args" + type_list_attr: "Tshard_func_args" + } + output_arg { + name: "handle" + type: DT_VARIANT + } + attr { + name: "output_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + attr { + name: "compression" + type: "string" + default_value { + s: "" + } + } + attr { + name: "reader_func" + type: "func" + } + attr { + name: "shard_func" + type: "func" + } + attr { + name: "Treader_func_args" + type: "list(type)" + has_minimum: true + } + attr { + name: "Tshard_func_args" + type: "list(type)" + has_minimum: true + } +} diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index ea7d21a5b2b..346cc19c820 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -43279,6 +43279,66 @@ op { } } } +op { + name: "SnapshotDatasetV2" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "path" + type: DT_STRING + } + input_arg { + name: "reader_func_other_args" + type_list_attr: "Treader_func_args" + } + input_arg { + name: "shard_func_other_args" + type_list_attr: "Tshard_func_args" + } + output_arg { + name: "handle" + type: DT_VARIANT + } + attr { + name: "output_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + attr { + name: "compression" + type: "string" + default_value { + s: "" + } + } + attr { + name: "reader_func" + type: "func" + } + attr { + name: "shard_func" + type: "func" + } + attr { + name: "Treader_func_args" + type: "list(type)" + has_minimum: true + } + attr { + name: "Tshard_func_args" + type: "list(type)" + has_minimum: true + } +} op { name: "SobolSample" input_arg { From 3ab25c320300b6c2e9f78fb08db4d15334bf2603 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Mon, 1 Jun 2020 21:52:21 -0700 Subject: [PATCH 1522/1533] Fix RNN cell wrt deep copy. 
The root cause is that we have some internal cache field, which is not picklable by python. Fix https://github.com/tensorflow/tensorflow/issues/39978. PiperOrigin-RevId: 314265192 Change-Id: I66e80dea5fb65ac9c2e567f2af74510cd64ce5fc --- tensorflow/python/keras/layers/gru_test.py | 8 ++++ tensorflow/python/keras/layers/lstm_test.py | 8 ++++ tensorflow/python/keras/layers/recurrent.py | 45 +++++++++++++++---- .../python/keras/layers/simplernn_test.py | 8 ++++ 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/keras/layers/gru_test.py b/tensorflow/python/keras/layers/gru_test.py index 083d52b1caa..5b794c580a2 100644 --- a/tensorflow/python/keras/layers/gru_test.py +++ b/tensorflow/python/keras/layers/gru_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import copy + from absl.testing import parameterized import numpy as np @@ -253,6 +255,12 @@ class GRULayerGenericTest(test.TestCase): l2 = layer_class.from_config(l1.get_config()) assert l1.get_config() == l2.get_config() + def test_deep_copy_GRU(self): + cell = keras.layers.GRUCell(5) + copied_cell = copy.deepcopy(cell) + self.assertEqual(copied_cell.units, 5) + self.assertEqual(cell.get_config(), copied_cell.get_config()) + def test_regularizers_GRU(self): embedding_dim = 4 layer_class = keras.layers.GRU diff --git a/tensorflow/python/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py index c6387256e7e..5ed86dd829a 100644 --- a/tensorflow/python/keras/layers/lstm_test.py +++ b/tensorflow/python/keras/layers/lstm_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import copy + from absl.testing import parameterized import numpy as np @@ -182,6 +184,12 @@ class LSTMLayerTest(keras_parameterized.TestCase): l2 = layer_class.from_config(l1.get_config()) assert l1.get_config() == l2.get_config() + def test_deep_copy_LSTM(self): + cell = keras.layers.LSTMCell(5) + copied_cell = copy.deepcopy(cell) + self.assertEqual(copied_cell.units, 5) + self.assertEqual(cell.get_config(), copied_cell.get_config()) + def test_specify_initial_state_keras_tensor(self): num_states = 2 timesteps = 3 diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py index f11f4e88f21..0ce17c6101e 100644 --- a/tensorflow/python/keras/layers/recurrent.py +++ b/tensorflow/python/keras/layers/recurrent.py @@ -1096,18 +1096,30 @@ class DropoutRNNCellMixin(object): """ def __init__(self, *args, **kwargs): - # Note that the following two masks will be used in "graph function" mode, - # e.g. these masks are symbolic tensors. In eager mode, the `eager_*_mask` - # tensors will be generated differently than in the "graph function" case, - # and they will be cached. - # Also note that in graph mode, we still cache those masks only because the - # RNN could be created with `unroll=True`. In that case, the `cell.call()` - # function will be invoked multiple times, and we want to ensure same mask - # is used every time. + self._create_non_trackable_mask_cache() + super(DropoutRNNCellMixin, self).__init__(*args, **kwargs) + + @trackable.no_automatic_dependency_tracking + def _create_non_trackable_mask_cache(self): + """Create the cache for dropout and recurrent dropout mask. + + Note that the following two masks will be used in "graph function" mode, + e.g. these masks are symbolic tensors. 
In eager mode, the `eager_*_mask` + tensors will be generated differently than in the "graph function" case, + and they will be cached. + + Also note that in graph mode, we still cache those masks only because the + RNN could be created with `unroll=True`. In that case, the `cell.call()` + function will be invoked multiple times, and we want to ensure same mask + is used every time. + + Also the caches are created without tracking. Since they are not picklable + by python when deepcopy, we don't want layer._obj_reference_counts_dict + to track it by default. + """ self._dropout_mask_cache = K.ContextValueCache(self._create_dropout_mask) self._recurrent_dropout_mask_cache = K.ContextValueCache( self._create_recurrent_dropout_mask) - super(DropoutRNNCellMixin, self).__init__(*args, **kwargs) def reset_dropout_mask(self): """Reset the cached dropout masks if any. @@ -1187,6 +1199,21 @@ class DropoutRNNCellMixin(object): init_kwargs = dict(inputs=inputs, training=training, count=count) return self._recurrent_dropout_mask_cache.setdefault(kwargs=init_kwargs) + def __getstate__(self): + # Used for deepcopy. The caching can't be pickled by python, since it will + # contain tensor and graph. + state = super(DropoutRNNCellMixin, self).__getstate__() + state.pop('_dropout_mask_cache', None) + state.pop('_recurrent_dropout_mask_cache', None) + return state + + def __setstate__(self, state): + state['_dropout_mask_cache'] = K.ContextValueCache( + self._create_dropout_mask) + state['_recurrent_dropout_mask_cache'] = K.ContextValueCache( + self._create_recurrent_dropout_mask) + super(DropoutRNNCellMixin, self).__setstate__(state) + @keras_export('keras.layers.SimpleRNNCell') class SimpleRNNCell(DropoutRNNCellMixin, Layer): diff --git a/tensorflow/python/keras/layers/simplernn_test.py b/tensorflow/python/keras/layers/simplernn_test.py index c9935bfcfe7..b586814a345 100644 --- a/tensorflow/python/keras/layers/simplernn_test.py +++ b/tensorflow/python/keras/layers/simplernn_test.py @@ -18,6 +18,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import copy + from absl.testing import parameterized import numpy as np @@ -133,6 +135,12 @@ class SimpleRNNLayerTest(test.TestCase, parameterized.TestCase): l2 = layer_class.from_config(l1.get_config()) assert l1.get_config() == l2.get_config() + def test_deep_copy_SimpleRNN(self): + cell = keras.layers.SimpleRNNCell(5) + copied_cell = copy.deepcopy(cell) + self.assertEqual(copied_cell.units, 5) + self.assertEqual(cell.get_config(), copied_cell.get_config()) + def test_regularizers_SimpleRNN(self): embedding_dim = 4 layer_class = keras.layers.SimpleRNN From 3d552e66e5a16ecf510fd90595d2dc87ff34e99b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 1 Jun 2020 21:55:17 -0700 Subject: [PATCH 1523/1533] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 314265460 Change-Id: Ie6c7928f93c190e9ce1c331ed8afc87d170b7dc8 --- tensorflow/go/op/wrappers.go | 70 ++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index b5a8308e1a0..ec9c38bd913 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -3138,41 +3138,6 @@ func BoostedTreesBucketize(scope *Scope, float_values []tf.Output, bucket_bounda return buckets } -// Generate the bucket boundaries for each feature based on accumulated summaries. 
-// -// An op that returns a list of float tensors for a quantile stream resource. Each -// tensor is Rank 1 containing bucket boundaries for a single feature. -// -// Arguments: -// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. -// num_features: inferred int; number of features to get bucket boundaries for. -// -// Returns float; List of Rank 1 Tensors each containing the bucket boundaries for a feature. -func BoostedTreesQuantileStreamResourceGetBucketBoundaries(scope *Scope, quantile_stream_resource_handle tf.Output, num_features int64) (bucket_boundaries []tf.Output) { - if scope.Err() != nil { - return - } - attrs := map[string]interface{}{"num_features": num_features} - opspec := tf.OpSpec{ - Type: "BoostedTreesQuantileStreamResourceGetBucketBoundaries", - Input: []tf.Input{ - quantile_stream_resource_handle, - }, - Attrs: attrs, - } - op := scope.AddOperation(opspec) - if scope.Err() != nil { - return - } - var idx int - var err error - if bucket_boundaries, idx, err = makeOutputList(op, idx, "bucket_boundaries"); err != nil { - scope.UpdateErr("BoostedTreesQuantileStreamResourceGetBucketBoundaries", err) - return - } - return bucket_boundaries -} - // Returns immutable tensor from memory region. // // The current implementation memmaps the tensor from a file. @@ -10231,6 +10196,41 @@ func ExperimentalSqlDataset(scope *Scope, driver_name tf.Output, data_source_nam return op.Output(0) } +// Generate the bucket boundaries for each feature based on accumulated summaries. +// +// An op that returns a list of float tensors for a quantile stream resource. Each +// tensor is Rank 1 containing bucket boundaries for a single feature. +// +// Arguments: +// quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource. +// num_features: inferred int; number of features to get bucket boundaries for. +// +// Returns float; List of Rank 1 Tensors each containing the bucket boundaries for a feature. +func BoostedTreesQuantileStreamResourceGetBucketBoundaries(scope *Scope, quantile_stream_resource_handle tf.Output, num_features int64) (bucket_boundaries []tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{"num_features": num_features} + opspec := tf.OpSpec{ + Type: "BoostedTreesQuantileStreamResourceGetBucketBoundaries", + Input: []tf.Input{ + quantile_stream_resource_handle, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + if scope.Err() != nil { + return + } + var idx int + var err error + if bucket_boundaries, idx, err = makeOutputList(op, idx, "bucket_boundaries"); err != nil { + scope.UpdateErr("BoostedTreesQuantileStreamResourceGetBucketBoundaries", err) + return + } + return bucket_boundaries +} + // Creates a dataset that passes a sliding window over `input_dataset`. 
// // Arguments: From 696715805d137415817d0e10042ff46518074c1a Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 1 Jun 2020 22:07:04 -0700 Subject: [PATCH 1524/1533] Update dependencies for the hexagon tests PiperOrigin-RevId: 314266741 Change-Id: Ia99f3cf44667fd07ce418f38c9e89c13104a5fbd --- .../lite/experimental/delegates/hexagon/builders/tests/BUILD | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD index 1a496250467..bfa6618447e 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD @@ -17,6 +17,9 @@ cc_library( "//tensorflow/lite/experimental/delegates/hexagon:hexagon_delegate", "//tensorflow/lite/kernels:builtin_ops", "//tensorflow/lite/kernels:test_util", + "//tensorflow/lite/kernels/internal:reference_base", + "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/kernels/internal:types", "//tensorflow/lite/schema:schema_fbs", "@com_google_googletest//:gtest", ], From 3ee406e729a36144fdb2579463b1eee46c80902d Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 1 Jun 2020 22:43:23 -0700 Subject: [PATCH 1525/1533] NFC: Remove some dead code Add comments to some patterns/constraints. Add names for some patterns in TFL legalize pass. Naming follows: - If rule is direct mapping (TF_X -> TFL_X) then: "LegalizeX" Where X is the name of the Op. - Otherwise, the name should explain what it does, for example: LegalizeFakeQuantToDequantizeQuantize, which legalizes TF::FakeQuant op to TFL::Dequantize followed by TFL::Quantize. PiperOrigin-RevId: 314270215 Change-Id: I97caeab7443a1230f28ad91d60eb1ecfa5794c97 --- .../mlir/lite/transforms/legalize_patterns.td | 77 +++++++++++-------- 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 4c6a16c2233..3e9d6e488b8 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -33,9 +33,6 @@ class ExtractI32At : NativeCodeCall< "$_builder.getI32IntegerAttr($_self.cast().getValue()[" # i # "].cast().getInt())">; -// Merge the two Attributes to a ArrayAttr; -def Merge2AttrsToArray : NativeCodeCall<"$_builder.getArrayAttr({$0, $1})">; - // Use the tensor type information from $0 and convert min $1, max $2 and // numBits $3 and narrowRange $4 to a QuantizedType. def ConvertToQuantTypeFromAttrs : NativeCodeCall< @@ -61,15 +58,12 @@ def HasNotSameStaticShapes : Constraint, "op must h def CreateNoneValue : NativeCodeCall< "$_builder.create($0.getLoc(), $_builder.getNoneType(), $_builder.getUnitAttr())">; -// Checks if the value has only one user. -// TODO(karimnosseir): Move to a common place? -def HasOneUse : Constraint>; - //===----------------------------------------------------------------------===// // Nullary ops patterns. //===----------------------------------------------------------------------===// -def : Pat<(TF_ConstOp ElementsAttr:$value), (TFL_ConstOp $value)>; +def LegalizeTFConstToTFLConst: Pat<(TF_ConstOp ElementsAttr:$value), + (TFL_ConstOp $value)>; // Convert to std constant for statically shaped, non-opaque constants. 
def : Pat<(TF_ConstOp:$res NonOpaqueElementsAttr:$value), (ConstantOp $value), @@ -79,17 +73,23 @@ def : Pat<(TF_ConstOp:$res NonOpaqueElementsAttr:$value), (ConstantOp $value), // Unary ops patterns. //===----------------------------------------------------------------------===// def IsDataFormatNHWC : ConstantAttr; + +// Constraint that Attr has values [1, X, Y, 1] def IsIntList1XY1 : AttrConstraint>; + +// Constraint that values in list attribute are all ones. def IsAllOnes : AttrConstraint>; + +// Constraint that attribute is string with value either "SAME" or "VALID" def IsSameOrValid : AttrConstraint< CPred<"$_self.cast().getValue() == \"SAME\" || " # "$_self.cast().getValue() == \"VALID\"">, "'SAME' or 'VALID' paddings">; -def : Pat<(TF_AbsOp $arg), (TFL_AbsOp $arg)>; -def : Pat<(TF_AddNOp $inputs), (TFL_AddNOp $inputs)>; +def LegalizeAbs : Pat<(TF_AbsOp $arg), (TFL_AbsOp $arg)>; +def LegalizeAddN : Pat<(TF_AddNOp $inputs), (TFL_AddNOp $inputs)>; -def : Pat<(TF_AvgPoolOp $value, +def LegalizeAveragePool : Pat<(TF_AvgPoolOp $value, IsIntList1XY1:$ksize, IsIntList1XY1:$strides, $padding, @@ -102,35 +102,42 @@ def : Pat<(TF_AvgPoolOp $value, /*stride_w=*/ExtractI32At<2>:$strides, /*fused_activation_function=*/TFL_AF_None)>; -def : Pat<(TF_ArgMaxOp $input, $dim), (TFL_ArgMaxOp $input, $dim)>; -def : Pat<(TF_ArgMinOp $input, $dim), (TFL_ArgMinOp $input, $dim)>; +def LegalizeArgMax : Pat<(TF_ArgMaxOp $input, $dim), + (TFL_ArgMaxOp $input, $dim)>; +def LegalizeArgMin : Pat<(TF_ArgMinOp $input, $dim), + (TFL_ArgMinOp $input, $dim)>; -def : Pat<(TF_CeilOp $arg), (TFL_CeilOp $arg)>; +def LegalizeCeil : Pat<(TF_CeilOp $arg), (TFL_CeilOp $arg)>; -def : Pat<(TF_CosOp $arg), (TFL_CosOp $arg)>; +def LegalizeCos : Pat<(TF_CosOp $arg), (TFL_CosOp $arg)>; -def : Pat<(TF_EluOp $arg), (TFL_EluOp $arg)>; +def LegalizeElu : Pat<(TF_EluOp $arg), (TFL_EluOp $arg)>; -def : Pat<(TF_ExpandDimsOp $input, $dim), (TFL_ExpandDimsOp $input, $dim)>; +def LegalizeExpandDims : Pat<(TF_ExpandDimsOp $input, $dim), + (TFL_ExpandDimsOp $input, $dim)>; -def : Pat<(TF_FakeQuantWithMinMaxArgsOp $inputs, - $min, $max, - $num_bits, $narrow_range), - (TFL_DequantizeOp - (TFL_QuantizeOp $inputs, - (ConvertToQuantTypeFromAttrs $inputs, $min, $max, - $num_bits, $narrow_range)))>; +def LegalizeFakeQuantToDequantizeQuantize : Pat< + (TF_FakeQuantWithMinMaxArgsOp $inputs, $min, $max, $num_bits, $narrow_range), + (TFL_DequantizeOp + (TFL_QuantizeOp $inputs, + (ConvertToQuantTypeFromAttrs $inputs, $min, $max, + $num_bits, $narrow_range)))>; -def : Pat<(TF_FillOp $arg, $value), (TFL_FillOp $arg, $value)>; +def LegalizeFill : Pat<(TF_FillOp $arg, $value), (TFL_FillOp $arg, $value)>; -def : Pat<(TF_FloorOp $arg), (TFL_FloorOp $arg)>; +def LegalizeFloor : Pat<(TF_FloorOp $arg), (TFL_FloorOp $arg)>; -def : Pat<(TF_LeakyReluOp $arg, F32Attr:$a), (TFL_LeakyReluOp $arg, $a)>; -def : Pat<(TF_LogOp $arg), (TFL_LogOp $arg)>; -def : Pat<(TF_LogicalNotOp $arg), (TFL_LogicalNotOp $arg)>; -def : Pat<(TF_LogSoftmaxOp $arg), (TFL_LogSoftmaxOp $arg)>; +def LegalizeLeakyRelu : Pat<(TF_LeakyReluOp $arg, F32Attr:$a), + (TFL_LeakyReluOp $arg, $a)>; -def : Pat<(TF_MaxPoolOp $value, +def LegalizeLog : Pat<(TF_LogOp $arg), (TFL_LogOp $arg)>; + +def LegalizeNot : Pat<(TF_LogicalNotOp $arg), (TFL_LogicalNotOp $arg)>; + +def LegalizeLogSoftmax : Pat<(TF_LogSoftmaxOp $arg), (TFL_LogSoftmaxOp $arg)>; + +def LegalizeMaxPool2D : Pat< + (TF_MaxPoolOp $value, IsIntList1XY1:$ksize, IsIntList1XY1:$strides, $padding, @@ -143,8 +150,12 @@ def : Pat<(TF_MaxPoolOp $value, 
/*filter_height=*/ExtractI32At<1>:$ksize, /*fused_activation_function=*/TFL_AF_None)>; -def : Pat<(TF_MaximumOp $arg1, $arg2), (TFL_MaximumOp $arg1, $arg2)>; -def : Pat<(TF_MinimumOp $arg1, $arg2), (TFL_MinimumOp $arg1, $arg2)>; +def LegalizeMaximum : Pat<(TF_MaximumOp $arg1, $arg2), + (TFL_MaximumOp $arg1, $arg2)>; + +def LegalizeMinimum : Pat<(TF_MinimumOp $arg1, $arg2), + (TFL_MinimumOp $arg1, $arg2)>; + def : Pat<(TF_NegOp $arg), (TFL_NegOp $arg)>; def : Pat<(TF_OneHotOp $indices, $depth, $on_value, $off_value, $axis), (TFL_OneHotOp $indices, $depth, $on_value, $off_value, From e20f16bfb875a1cc273cf071da26b1fedc6b0b0e Mon Sep 17 00:00:00 2001 From: Katherine Wu Date: Mon, 1 Jun 2020 22:49:54 -0700 Subject: [PATCH 1526/1533] (Bugfix) Restore Sequential input layer from SavedModel. The Input layer is ignored in `Sequential.layers`, so it isn't saved in the SavedModel object graph. However, it is stored in the model config, so restore it from the config. PiperOrigin-RevId: 314270796 Change-Id: If1e96c8a564a087c2868b7c66869277548fa5f1a --- tensorflow/python/keras/saving/saved_model/load.py | 7 +++++-- .../python/keras/saving/saved_model/revive_test.py | 11 ++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py index 13af49e3a0d..ca8164c9407 100644 --- a/tensorflow/python/keras/saving/saved_model/load.py +++ b/tensorflow/python/keras/saving/saved_model/load.py @@ -546,8 +546,11 @@ class KerasObjectLoader(tf_load.Loader): config = json_utils.decode( self._proto.nodes[model_id].user_object.metadata)['config'] if isinstance(model, models_lib.Sequential): - if config['layers'][0]['class_name'] != 'InputLayer': - if 'batch_input_shape' in config['layers'][0]['config']: + if not isinstance(layers[0], input_layer.InputLayer): + if config['layers'][0]['class_name'] == 'InputLayer': + layers.insert(0, input_layer.InputLayer.from_config( + config['layers'][0]['config'])) + elif 'batch_input_shape' in config['layers'][0]['config']: batch_input_shape = config['layers'][0]['config']['batch_input_shape'] layers.insert(0, input_layer.InputLayer( input_shape=batch_input_shape[1:], diff --git a/tensorflow/python/keras/saving/saved_model/revive_test.py b/tensorflow/python/keras/saving/saved_model/revive_test.py index 4bd11460181..5e94597d00d 100644 --- a/tensorflow/python/keras/saving/saved_model/revive_test.py +++ b/tensorflow/python/keras/saving/saved_model/revive_test.py @@ -31,6 +31,7 @@ import numpy as np from tensorflow.python import keras from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized @@ -38,6 +39,7 @@ from tensorflow.python.keras import testing_utils from tensorflow.python.keras.saving.saved_model import load as keras_load from tensorflow.python.keras.utils import generic_utils from tensorflow.python.ops import math_ops +from tensorflow.python.ops import string_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test @@ -221,7 +223,6 @@ class TestModelRevive(keras_parameterized.TestCase): inner_model_subclassed] model = testing_utils.get_model_from_layers( layers, input_shape=input_shape) - # Run data through the Model to create save spec and weights. 
model.predict(np.ones((10, 2, 3)), batch_size=10) @@ -242,6 +243,14 @@ class TestModelRevive(keras_parameterized.TestCase): revived = keras_load.load(self.path) self._assert_revived_correctness(model, revived) + def test_revive_sequential_inputs(self): + model = keras.models.Sequential( + [keras.Input((None,), dtype=dtypes.string), + keras.layers.Lambda(string_ops.string_lower)]) + model.save(self.path, save_format='tf') + revived = keras_load.load(self.path) + self.assertEqual(dtypes.string, revived._layers[0].dtype) + if __name__ == '__main__': ops.enable_eager_execution() From 0482d18b52bdd0cda27e84962b487acce7a2e5e0 Mon Sep 17 00:00:00 2001 From: Anudhyan Boral Date: Mon, 1 Jun 2020 23:39:22 -0700 Subject: [PATCH 1527/1533] Mostly internal changes. PiperOrigin-RevId: 314275429 Change-Id: I75d2431bee12a4ebc92b5ab291451ecce805f09a --- tensorflow/compiler/tests/binary_ops_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 422695c374b..eb8883c9ccd 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -1017,7 +1017,8 @@ class BinaryOpsTest(xla_test.XLATestCase): math_ops.matmul, np.array([[3.1415926535897932]], dtype=dtype), np.array([[2.7182818284590452]], dtype=dtype), - expected=np.array([[8.5397342226735668]], dtype=dtype)) + expected=np.array([[8.5397342226735668]], dtype=dtype), + rtol=1e-14) # Edge case with a large range of exponent. Not supported by float16. if dtype != np.float16: @@ -1025,7 +1026,8 @@ class BinaryOpsTest(xla_test.XLATestCase): math_ops.matmul, np.array([[9.4039548065783000e-38]], dtype=dtype), np.array([[4.5070591730234615e37]], dtype=dtype), - expected=np.array([[4.2384180773686798]], dtype=dtype)) + expected=np.array([[4.2384180773686798]], dtype=dtype), + rtol=1e-14) # TODO(phawkins): failing on GPU, no registered kernel. def DISABLED_testSparseMatMul(self): From bc49458b14127fedeb1e1fdf73cf86a1406eb8d8 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Tue, 2 Jun 2020 00:03:54 -0700 Subject: [PATCH 1528/1533] Move the recovering tensor shape logic of DistributedIterator get_next_as_optional earlier, so it covers more use cases. PiperOrigin-RevId: 314277550 Change-Id: I2c8f3afcc791b02310b5242251f335d9da7cd5bf --- tensorflow/python/distribute/input_lib.py | 62 +++++++++++-------- .../python/distribute/input_lib_test.py | 36 +++++++++++ 2 files changed, 71 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 26bc9a087fb..a1e76ea0ecb 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -349,33 +349,6 @@ class DistributedIteratorBase(distribute_types.Iterator): results.append(result) replicas = results - # Some dimensions in `replicas` will become unknown after we conditionally - # return the real tensors or the dummy tensors. We fix the input shapes by - # using the shapes from `out_of_range_replicas` because it is calling - # get_next() inside. - flattened_replicas = nest.flatten(replicas) - for i, replica_data in enumerate(nest.flatten(out_of_range_replicas)): - for target, source in zip( - nest.flatten(flattened_replicas[i], expand_composites=True), - nest.flatten(replica_data, expand_composites=True)): - target.set_shape(source.get_shape()) - # `SparseTensor` shape is not determined by the shape of its component - # tensors. 
Rather, its shape depends on a tensor's values. - if sparse_tensor.is_sparse(replica_data) and replica_data.get_shape(): - dense_shape = replica_data.get_shape() - with ops.device(flattened_replicas[i].op.device): - # For partially defined shapes, fill in missing values from tensor. - if not dense_shape.is_fully_defined(): - dense_shape = array_ops.stack([ - flattened_replicas[i].dense_shape[j] if dim is None else dim - for j, dim in enumerate(dense_shape.as_list()) - ]) - flattened_replicas[i] = sparse_tensor.SparseTensor( - indices=flattened_replicas[i].indices, - values=flattened_replicas[i].values, - dense_shape=dense_shape) - replicas = nest.pack_sequence_as(replicas, flattened_replicas) - return values.regroup(replicas) @@ -1048,6 +1021,34 @@ def _dummy_tensor_fn(value_structure): return nest.map_structure(create_dummy_tensor, value_structure) +def _recover_shape_fn(data, value_structure): + """Recover the shape of `data` the same as shape of `value_structure`.""" + + flattened_data = nest.flatten(data) + for i, spec in enumerate(nest.flatten(value_structure)): + for target, source in zip( + nest.flatten(flattened_data[i], expand_composites=True), + nest.flatten(spec, expand_composites=True)): + target.set_shape(source.shape) + # `SparseTensor` shape is not determined by the shape of its component + # tensors. Rather, its shape depends on a tensor's values. + if isinstance(spec, sparse_tensor.SparseTensorSpec) and spec.shape: + dense_shape = spec.shape + with ops.device(flattened_data[i].op.device): + # For partially defined shapes, fill in missing values from tensor. + if not dense_shape.is_fully_defined(): + dense_shape = array_ops.stack([ + flattened_data[i].dense_shape[j] if dim is None else dim + for j, dim in enumerate(dense_shape.as_list()) + ]) + flattened_data[i] = sparse_tensor.SparseTensor( + indices=flattened_data[i].indices, + values=flattened_data[i].values, + dense_shape=dense_shape) + data = nest.pack_sequence_as(data, flattened_data) + return data + + class _SingleWorkerDatasetIteratorBase(object): """Iterator for a single `tf.data.Dataset`.""" @@ -1132,6 +1133,13 @@ class _SingleWorkerDatasetIteratorBase(object): lambda: _dummy_tensor_fn(data.value_structure), strict=True, ) + # Some dimensions in `replicas` will become unknown after we + # conditionally return the real tensors or the dummy tensors. Recover + # the shapes from `data.value_structure`. We only need to do this in + # non eager mode because we always know the runtime shape of the + # tensors in eager mode. 
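
A minimal standalone sketch (hypothetical shapes, not part of the patch) of the
situation described in the comment above: merging a real batch with a dummy
batch through a conditional erases the static batch dimension during tracing,
and set_shape() restores it from the known value structure.

import tensorflow as tf

@tf.function
def next_batch(has_value):
  real = tf.ones([8, 10])    # the real per-replica batch, statically (8, 10)
  dummy = tf.zeros([0, 10])  # stand-in returned when the iterator is exhausted
  out = tf.cond(has_value, lambda: real, lambda: dummy)
  print('merged static shape:', out.shape)     # (None, 10) while tracing
  out.set_shape([8, 10])                       # recover the lost dimension
  print('recovered static shape:', out.shape)  # (8, 10)
  return out

next_batch(tf.constant(True))

In the patch the recovered shapes come from `data.value_structure` rather than
literals, and the whole step is skipped in eager mode, where runtime shapes are
already known.
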
+ if not context.executing_eagerly(): + real_data = _recover_shape_fn(real_data, data.value_structure) result.append(real_data) # pylint: enable=cell-var-from-loop # pylint: enable=unnecessary-lambda diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index 60212f7a3b7..2114c4e6bda 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -48,6 +48,7 @@ from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops @@ -612,6 +613,41 @@ class DistributedIteratorSingleWorkerTest(DistributedIteratorTestBase, else: self.assertAllEqual(first_epoch, second_epoch) + @combinations.generate( + combinations.combine( + mode=["eager"], + distribution=[ + strategy_combinations.one_device_strategy, + strategy_combinations.mirrored_strategy_with_one_cpu, + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.tpu_strategy, + strategy_combinations.central_storage_strategy_with_two_gpus, + ])) + def testGetNextOptionalShape(self, distribution): + batch_size = 8 + dataset = dataset_ops.DatasetV2.from_tensor_slices({ + "feature": array_ops.ones([batch_size, 10]), + "label": array_ops.ones([batch_size]), + }) + dataset = dataset.batch(batch_size, drop_remainder=True) + dist_dataset = distribution.experimental_distribute_dataset(dataset) + per_replica_batch_size = batch_size // distribution.num_replicas_in_sync + + @def_function.function + def train_fn(): + for data in dist_dataset: + data = nest.map_structure(distribution.experimental_local_results, data) + feature = data["feature"] + label = data["label"] + + # Asser the shapes are still staic from all replicas. 
+ for replica_id in range(distribution.num_replicas_in_sync): + self.assertEqual([per_replica_batch_size, 10], + feature[replica_id].shape) + self.assertEqual([per_replica_batch_size], label[replica_id].shape) + + train_fn() + class DistributedIteratorTensorTypeTest(DistributedIteratorTestBase, parameterized.TestCase): From e2aa757a55fd4039b0809ed05ab92febf22d9a44 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Tue, 2 Jun 2020 00:34:47 -0700 Subject: [PATCH 1529/1533] Add op sanity checks to the following TFLite ops: SpaceToDepthOp SparseConstOp SparseQConstOp SparseToDenseOp SplitOp SplitVOp SquaredDifferenceOp SqueezeOp StridedSliceOp SubOp SumOp TanhOp TileOp TransposeConvOp TransposeOp UniqueOp UnpackOp WhereOp WhileOp YieldOp ZerosLikeOp PiperOrigin-RevId: 314280789 Change-Id: I030c487d9a6363b8f06d103786c9d42bfc7381b3 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.cc | 4 +- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 308 +++++++++++------- .../transforms/load_quantization_recipe.cc | 2 +- .../compiler/mlir/lite/transforms/optimize.cc | 7 +- 4 files changed, 190 insertions(+), 131 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 3dcfe71770b..0c384ebf9f3 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -1966,9 +1966,9 @@ OpFoldResult TransposeOp::fold(ArrayRef operands) { } static LogicalResult Verify(TransposeOp op) { - auto input_type = op.x().getType().cast(); + auto input_type = op.input().getType().cast(); auto perm_type = op.perm().getType().cast(); - auto output_type = op.y().getType().cast(); + auto output_type = op.output().getType().cast(); if (input_type.hasStaticShape() && perm_type.hasStaticShape()) { if (perm_type.getNumElements() != input_type.getRank()) { return op.emitOpError( diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 76c342bd10a..48bc68e5c95 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -161,7 +161,6 @@ class TFL_VariadicTensorOf allowedRuntimeTypes, Variadic>, TFL_RuntimeType>>; -def TFL_Uint8 : UI<8>; def TFL_Int32Or64 : SignlessIntOfWidths<[32, 64]>; def TFL_BoolTensor : TFL_TensorOf<[I1]>; @@ -294,21 +293,33 @@ class TFL_OperandHasRankRange : "getRank() <= " # y>]>>; def TFL_FloatNonNegative : AttrConstraint< - CPred<"!$_self.cast().getValue().isNegative()">, + CPred<"$_self.isa() && " + "!$_self.cast().getValue().isNegative()">, "whose value is non-negative">; def TFL_BoolTrue : AttrConstraint< - CPred<"$_self.cast().getValue()">, + CPred<"$_self.isa() && $_self.cast().getValue()">, "whose value is true">; def TFL_BoolFalse : AttrConstraint< - CPred<"!$_self.cast().getValue()">, + CPred<"$_self.isa() && !$_self.cast().getValue()">, "whose value is false">; class TFL_StringEqualsTo : AttrConstraint< CPred<"$_self.cast().getValue() == \"" # value # "\"">, "whose value equals to '" # value # "'">; +// Ensures the array attribute's size is within the given maximum size. +class TFL_ArrayMaxCount : AttrConstraint< + CPred<"$_self.isa() && $_self.cast().size() <= " # n>, + "whose size is at most " # n>; + +// Ensures the given integer attribute has the given value. 
+class TFL_IntEqualsTo : AttrConstraint< + CPred<"$_self.isa() && " + "$_self.cast().getInt() == " # n>, + "whose value is " # n>; + // This is a quantization-aware version of TCresVTEtIsSameAsOp class TFL_TCresVTEtIsSameAsOp : And<[ TCOpResIsShapedTypePred, @@ -472,7 +483,10 @@ an output element, this operation computes \\(y = |x|\\). def TFL_AddOp : TFL_Op<"add", [ TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, - ResultsBroadcastableShape, NoSideEffect, Commutative, TFL_GpuTargetOp]> { + ResultsBroadcastableShape, + NoSideEffect, + Commutative, + TFL_GpuTargetOp]> { let summary = "Addition operator"; let description = [{ @@ -540,8 +554,14 @@ retained with length 1. let customOption = "ReducerOptions"; } -def TFL_TransposeConvOp: - TFL_Op<"transpose_conv", [NoSideEffect, TFL_GpuTargetOp]> { +def TFL_TransposeConvOp: TFL_Op<"transpose_conv", [ + NoSideEffect, + TFL_OperandHasRank<0, 1>, + TFL_OperandHasRank<1, 4>, + TFL_OperandHasRank<2, 4>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 2>>, + TFL_GpuTargetOp]> { let summary = "Transpose convolution operator"; let description = [{ @@ -549,16 +569,16 @@ def TFL_TransposeConvOp: }]; let arguments = (ins - TFL_1DTensorOf<[I32]>:$output_shape, - TFL_TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$weights, - TFL_TensorOf<[F32, TFL_Uint8, QI8, QUI8]>:$input, - TFL_TensorOfOrNone<[F32, QI32, QUI32]>:$bias, + TFL_I32Tensor:$output_shape, + TFL_TensorOf<[F32, QI8, QUI8]>:$weights, + TFL_TensorOf<[F32, QI8, QUI8]>:$input, + TFL_TensorOfOrNone<[F32, QI32]>:$bias, TFL_PaddingAttr:$padding, - I32Attr:$stride_h, - I32Attr:$stride_w + Confined:$stride_h, + Confined:$stride_w ); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8]>:$output); let hasOptions = 1; @@ -600,7 +620,7 @@ def TFL_ArgMaxOp : TFL_Op<"arg_max", [NoSideEffect]> { }]; let arguments = ( - ins TFL_TensorOf<[F32, I32, I8, TFL_Uint8, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I8, UI8, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$dim ); @@ -630,7 +650,7 @@ def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { }]; let arguments = ( - ins TFL_TensorOf<[F32, I32, I8, TFL_Uint8, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, I32, I8, UI8, QI8, QUI8]>:$input, TFL_I32OrI64Tensor:$dim ); @@ -677,14 +697,14 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", let arguments = ( ins TFL_VariadicTensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, TFL_Uint8]>:$values, + [F32, I64, I32, I16, I8, QI8, QUI8, UI8]>:$values, I32Attr:$axis, TFL_AFAttr:$fused_activation_function ); let results = (outs TFL_TensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, TFL_Uint8]>:$output + [F32, I64, I32, I16, I8, QI8, QUI8, UI8]>:$output ); let hasOptions = 1; @@ -748,7 +768,8 @@ def SparsityParameterAttr : StructAttr<"SparsityParameterAttr", TFL_Dialect, [ let storageType = [{ TFL::SparsityParameterAttr }]; } -def TFL_SparseConstOp : Op { let summary = "Sparse constant pseudo op."; @@ -959,12 +980,12 @@ def TFL_GatherNdOp : TFL_Op<"gather_nd", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8, TFL_Str]>:$params, + TFL_TensorOf<[F32, I8, I64, I32, UI8, TFL_Str]>:$params, TFL_I32OrI64Tensor:$indices ); let results = (outs - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8, TFL_Str]>:$output + TFL_TensorOf<[F32, I8, I64, I32, UI8, TFL_Str]>:$output ); } @@ -983,12 +1004,12 @@ def TFL_ScatterNdOp : TFL_Op<"scatter_nd", [ let arguments = (ins TFL_TensorOf<[I32]>:$indices, - TFL_TensorOf<[F32, I8, I64, I32, 
TFL_Uint8]>:$updates, + TFL_TensorOf<[F32, I8, I64, I32, UI8]>:$updates, TFL_1DTensorOf<[I32]>:$shape ); let results = (outs - TFL_TensorOf<[F32, I8, I64, I32, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I8, I64, I32, UI8]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -1103,11 +1124,11 @@ def TFL_MatrixDiagOp : TFL_Op<"matrix_diag", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I16, I32, I64, TFL_Uint8, QUI8, QI8, TFL_Quint8]>:$diagonal + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QUI8, QI8, TFL_Quint8]>:$diagonal ); let results = (outs - TFL_TensorOf<[F32, I8, I16, I32, I64, TFL_Uint8, QUI8, QI8, TFL_Quint8]>:$output + TFL_TensorOf<[F32, I8, I16, I32, I64, UI8, QUI8, QI8, TFL_Quint8]>:$output ); let hasOptions = 0; @@ -1285,8 +1306,10 @@ def TFL_NotEqualOp : TFL_Op<"not_equal", [ } def TFL_DivOp : TFL_Op<"div", [ - // TODO(fengliuai): NoQuantizableResult is only correct for int8 - // quantization. update to handle Uint8 quantization. + // TODO(fengliuai): NoQuantizableResult is only correct for int8 + // quantization. update to handle Uint8 quantization. + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult, @@ -1299,10 +1322,10 @@ def TFL_DivOp : TFL_Op<"div", [ let arguments = ( ins TFL_TensorOf<[F32, I32, QUI8]>:$lhs, - TFL_TensorOf<[F32, I32, TFL_Uint8]>:$rhs, + TFL_TensorOf<[F32, I32, QUI8]>:$rhs, TFL_AFAttr:$fused_activation_function); - let results = (outs TFL_TensorOf<[F32, I32, TFL_Uint8]>:$output); + let results = (outs TFL_TensorOf<[F32, I32, QUI8]>:$output); let builders = [TFL_FusedBroadcastableBinaryBuilder]; @@ -1345,10 +1368,10 @@ def TFL_EmbeddingLookupOp: TFL_Op<"embedding_lookup", let arguments = (ins TFL_TensorOf<[I32]>:$lookup, - TFL_TensorOf<[F32, I8, TFL_Uint8]>:$value + TFL_TensorOf<[F32, I8, UI8]>:$value ); - let results = (outs TFL_TensorOf<[F32, I8, TFL_Uint8]>:$output); + let results = (outs TFL_TensorOf<[F32, I8, UI8]>:$output); } def TFL_EqualOp: TFL_Op<"equal", [Commutative, ResultsBroadcastableShape, @@ -1364,8 +1387,8 @@ def TFL_EqualOp: TFL_Op<"equal", [Commutative, ResultsBroadcastableShape, let arguments = ( ins - TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, TFL_Uint8, TFL_Str]>:$x, - TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, TFL_Uint8, TFL_Str]>:$y + TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, UI8, TFL_Str]>:$x, + TFL_TensorOf<[I1, F32, I32, I64, QI8, QUI8, UI8, TFL_Str]>:$y ); let results = (outs TFL_BoolTensor:$output); @@ -1445,7 +1468,7 @@ def TFL_SqueezeOp: TFL_Op<"squeeze", [NoSideEffect, Given a tensor `input`, this operation returns a tensor of the same type with all dimensions of size 1 removed. If you don't want to remove all size 1 dimensions, you can remove specific size 1 dimensions by specifying -`axis`. +`squeeze_dims`. 
For example: @@ -1464,7 +1487,7 @@ shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1] let arguments = (ins AnyTensor:$input, - DefaultValuedAttr:$squeeze_dims + Confined, [TFL_ArrayMaxCount<8>]>:$squeeze_dims ); let results = (outs @@ -1899,13 +1922,13 @@ def TFL_MeanOp : TFL_Op<"mean", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Uint8]>:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, UI8]>:$input, TFL_TensorOf<[I32, I64]>:$axis, BoolAttr:$keep_dims ); let results = (outs - TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Uint8]>:$output); + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, UI8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -1998,7 +2021,11 @@ equivalent to setting: let hasCanonicalizer = 1; } -def TFL_SumOp: TFL_Op<"sum", [NoSideEffect]> { +def TFL_SumOp: TFL_Op<"sum", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect]> { + let summary = "Sum operator"; let description = [{ @@ -2006,12 +2033,12 @@ def TFL_SumOp: TFL_Op<"sum", [NoSideEffect]> { }]; let arguments = (ins - AnyTensor:$input, + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, TFL_I32Tensor:$axes, BoolAttr:$keep_dims ); - let results = (outs AnyTensor); + let results = (outs TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -2113,12 +2140,13 @@ def TFL_MinimumOp : TFL_Op<"minimum", [ let hasOptions = 0; } -def TFL_MulOp : TFL_Op<"mul", [ResultsBroadcastableShape, - NoSideEffect, - Commutative, - BinaryOpSameElementTypeConstraint, - TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, - TFL_GpuTargetOp]> { +def TFL_MulOp : TFL_Op<"mul", [ + ResultsBroadcastableShape, + NoSideEffect, + Commutative, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, + TFL_GpuTargetOp]> { let summary = "Multiplication operator"; let description = [{ @@ -2611,7 +2639,7 @@ def TFL_SelectOp : TFL_Op<"select", [ NoSideEffect, PredOpTrait<"operands have same element type", TCopVTEtIsSameAs<1, 2>>, PredOpTrait<"operands and result have same element type", - TCresVTEtIsSameAsOp<0, 1>>]> { + TFL_TCresVTEtIsSameAsOp<0, 1>>]> { let summary = "Select operator"; let description = [{ @@ -2647,7 +2675,7 @@ def TFL_SelectV2Op : TFL_Op<"select_v2", [ TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<1, 2, 4>, PredOpTrait<"operands have same element type", TCopVTEtIsSameAs<1, 2>>, PredOpTrait<"operands and result have same element type", - TCresVTEtIsSameAsOp<0, 1>>]> { + TFL_TCresVTEtIsSameAsOp<0, 1>>]> { let summary = "SelectV2 operator"; let description = [{ @@ -2760,7 +2788,11 @@ def TFL_SquareOp: TFL_Op<"square", [ let hasFolder = 1; } -def TFL_SubOp : TFL_Op<"sub", [ResultsBroadcastableShape, NoSideEffect]> { +def TFL_SubOp : TFL_Op<"sub", [ + ResultsBroadcastableShape, + BinaryOpSameElementTypeConstraint, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 5>, + NoSideEffect]> { let summary = "Subtraction operator"; let description = [{ @@ -2768,11 +2800,11 @@ def TFL_SubOp : TFL_Op<"sub", [ResultsBroadcastableShape, NoSideEffect]> { }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs, + ins TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$lhs, + TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$rhs, TFL_AFAttr:$fused_activation_function); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, I32, QI8, QUI8, QI16]>:$output); let hasFolder = 1; @@ -2788,6 +2820,8 @@ 
def TFL_SubOp : TFL_Op<"sub", [ResultsBroadcastableShape, NoSideEffect]> { // TODO(jpienaar): Expand the kernel implementation to support all types besides // I32 and F32. def TFL_SquaredDifferenceOp : TFL_Op<"squared_difference", [ + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, + SameOperandsAndResultElementType, ResultsBroadcastableShape, NoSideEffect, NoQuantizableResult, @@ -2799,10 +2833,10 @@ def TFL_SquaredDifferenceOp : TFL_Op<"squared_difference", [ }]; let arguments = ( - ins AnyTensor:$lhs, - AnyTensor:$rhs); + ins TFL_TensorOf<[F32, I32]>:$lhs, + TFL_TensorOf<[F32, I32]>:$rhs); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[F32, I32]>:$output); let builders = [TFL_BroadcastableBinaryBuilder]; @@ -2814,6 +2848,8 @@ def TFL_SquaredDifferenceOp : TFL_Op<"squared_difference", [ def TFL_TanhOp: TFL_Op<"tanh", [ NoSideEffect, SameOperandsAndResultShape, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, // central_value = min_value / 2 + (max_value - 1) / 2 + 1 // zero_point = central_value // scale = 1. / (central_value - min_value) @@ -2826,9 +2862,9 @@ def TFL_TanhOp: TFL_Op<"tanh", [ Computes element-wise Hyperbolic tangent of input }]; - let arguments = (ins TFL_TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$x); + let arguments = (ins TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$input); - let results = (outs TFL_TensorOf<[F32, I16, I8, QI8, QUI8, QI16, QUI16, TFL_Uint8]>:$y); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16, TFL_Quint8]>:$output); // This builder doesn't work with quantized type, so it can only be used by // non-quantization tablegen patterns. Currently, it is used by the @@ -2842,9 +2878,11 @@ def TFL_TanhOp: TFL_Op<"tanh", [ ]; } -def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, - PredOpTrait<"resultant element type needs to match first operand type", - TFL_TCresVTEtIsSameAsOp<0,0>>]> { +def TFL_TileOp: TFL_Op<"tile", [ + NoSideEffect, + SameOperandsAndResultsScale, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Tile operator."; let description = [{ Constructs a tensor by tiling a given tensor. @@ -2857,11 +2895,11 @@ def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8, TFL_Str]>:$input, + TFL_TensorOf<[F32, I1, I32, I64, UI8, QUI8, TFL_Str]>:$input, TFL_I32OrI64Tensor:$multiples); let results = (outs - TFL_TensorOf<[F32, I1, I32, I64, TFL_Uint8, QUI8, TFL_Str]>:$output); + TFL_TensorOf<[F32, I1, I32, I64, UI8, QUI8, TFL_Str]>:$output); let hasOptions = 0; } @@ -2869,9 +2907,13 @@ def TFL_TileOp: TFL_Op<"tile", [NoSideEffect, SameOperandsAndResultsScale, // TODO(jpienaar): Maybe make it accept any single element tensor as `k`. // TODO(jpienaar): Check that input has one or more dimensions. 
// TODO(jpienaar): Check that k is less or equal the internal dimension -def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, +def TFL_TopKV2Op: TFL_Op<"topk_v2", [ + NoSideEffect, + TFL_OperandHasRankAtLeast<0, 1>, + TFL_OperandHasRank<1, 0>, PredOpTrait<"result and input element type match", - TCresVTEtIsSameAsOp<0,0>>, SameOperandsAndResultsScale]> { + TFL_TCresVTEtIsSameAsOp<0,0>>, + SameOperandsAndResultsScale]> { let summary = "TopK operator"; let description = [{ @@ -2881,11 +2923,11 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$input, TFL_I32Tensor:$k); let results = (outs - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QI8, QUI8]>:$values, + TFL_TensorOf<[F32, I8, I32, I64, UI8, QI8, QUI8]>:$values, TFL_I32Tensor:$indices); let builders = [OpBuilder<"OpBuilder &builder, OperationState &result, " @@ -2895,29 +2937,27 @@ def TFL_TopKV2Op: TFL_Op<"topk_v2", [NoSideEffect, TFL_OperandHasRank<1,0>, let hasOptions = 1; } -def TFL_TransposeOp : TFL_Op<"transpose", - [NoSideEffect, - TFL_OperandHasRank<1,1>, - // TODO(jpienaar): these are only true dynamically, change so that it works - // with unknowns. - // TFL_OperandRankEquals1DimOfOperand<0, 1>, - PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>>, - SameOperandsAndResultsScale, - TFL_GpuTargetOp]> { +def TFL_TransposeOp : TFL_Op<"transpose", [ + NoSideEffect, + TFL_OperandHasRankAtMost<0, 5>, + TFL_OperandHasRank<1, 1>, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + SameOperandsAndResultsScale, + TFL_GpuTargetOp]> { let summary = "Transpose operator"; let description = [{ Returns the Transpose of x }]; - let arguments = ( - ins AnyTensor:$x, + let arguments = (ins + TFL_TensorOf<[I32, F32, I8, UI8, QI8, QUI8, TFL_Quint8, I1, I64]>:$input, TFL_TensorOf<[I32]>:$perm ); let results = (outs - AnyTensor:$y + TFL_TensorOf<[I32, F32, I8, UI8, QI8, QUI8, TFL_Quint8, I1, I64]>:$output ); let verifier = [{ return Verify(*this); }]; @@ -2925,7 +2965,10 @@ def TFL_TransposeOp : TFL_Op<"transpose", let hasFolder = 1; } -def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect, SameOperandsAndResultsScale]> { +def TFL_UnpackOp : TFL_Op<"unpack", [ + NoSideEffect, + SameOperandsAndResultElementType, + SameOperandsAndResultsScale]> { let summary = "Unpacks a tensor along a dimension into multiple tensors"; let description = [{ @@ -2946,14 +2989,14 @@ def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect, SameOperandsAndResultsScale]> }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I8, I32, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I1, I8, UI8, I32, QI8, QUI8, I16, QI16]>:$input, I32Attr:$num, I32Attr:$axis ); let results = (outs - TFL_VariadicTensorOf<[F32, I1, I8, I32, QI8, QUI8]>:$outputs + TFL_VariadicTensorOf<[F32, I1, I8, UI8, I32, QI8, QUI8, I16, QI16]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -2961,16 +3004,19 @@ def TFL_UnpackOp : TFL_Op<"unpack", [NoSideEffect, SameOperandsAndResultsScale]> let hasOptions = 1; } -def TFL_ZerosLikeOp: TFL_Op<"zeros_like", [NoSideEffect]> { +def TFL_ZerosLikeOp: TFL_Op<"zeros_like", [ + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoSideEffect]> { let summary = "ZerosLike operator"; let description = [{ Returns a tensor of zeros with the same shape and type as the 
input tensor. }]; - let arguments = (ins AnyTensor:$input); + let arguments = (ins TFL_TensorOf<[I64, I32, F32]>:$input); - let results = (outs AnyTensor:$output); + let results = (outs TFL_TensorOf<[I64, I32, F32]>:$output); let hasOptions = 1; } @@ -3006,7 +3052,7 @@ def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ SameOperandsAndResultsScale, TFL_OperandHasRankRange<0, 3, 4>, PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>> + TFL_TCresVTEtIsSameAsOp<0, 0>> ]> { let summary = "SpaceToBatchNd operator"; @@ -3029,7 +3075,8 @@ def TFL_SpaceToDepthOp: TFL_Op<"space_to_depth", [ NoSideEffect, SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", - TCresVTEtIsSameAsOp<0, 0>>, + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_OperandHasRankAtMost<0, 4>, TFL_GpuTargetOp ]> { let summary = "SpaceToDepth operator"; @@ -3042,12 +3089,12 @@ def TFL_SpaceToDepthOp: TFL_Op<"space_to_depth", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$input, - I32Attr:$block_size + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$input, + Confined:$block_size ); let results = (outs - TFL_TensorOf<[F32, I8, I32, I64, TFL_Uint8, QUI8]>:$output + TFL_TensorOf<[F32, I32, I64, QI8, QUI8, TFL_Quint8]>:$output ); let hasOptions = 1; @@ -3072,12 +3119,12 @@ def TFL_DepthToSpaceOp: TFL_Op<"depth_to_space", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, TFL_Uint8, UI8, QI8, QUI8]>:$input, + TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, UI8, QI8, QUI8]>:$input, Confined:$block_size ); let results = (outs - TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, TFL_Uint8, UI8, QI8, QUI8]>:$output + TFL_TensorOf<[F32, I8, I32, I64, TFL_Quint8, UI8, QI8, QUI8]>:$output ); let hasOptions = 1; @@ -3097,12 +3144,12 @@ def TFL_SplitOp : TFL_Op<"split", [ let arguments = (ins TFL_TensorOf<[I32]>:$split_dim, - TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, + TFL_TensorOf<[F32, I16, I32, I64, I8, UI8, QI8, QUI8, QI16]>:$value, Confined:$num_splits ); let results = (outs - TFL_VariadicTensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$outputs + TFL_VariadicTensorOf<[F32, I16, I32, I64, I8, UI8, QI8, QUI8, QI16]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -3120,14 +3167,14 @@ def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect, SameOperandsAndResultsScale] }]; let arguments = (ins - TFL_TensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$value, + TFL_TensorOf<[F32, I16, I32, I64, I8, UI8, QI8, QUI8, QI16]>:$value, TFL_1DTensorOf<[I32], [I32]>:$size_splits, TFL_0DTensorOf<[I32], [I32]>:$split_dim, Confined:$num_splits ); let results = (outs - TFL_VariadicTensorOf<[F32, I16, I32, I64, QI8, QUI8, QI16]>:$outputs + TFL_VariadicTensorOf<[F32, I16, I32, I64, I8, UI8, QI8, QUI8, QI16]>:$outputs ); let verifier = [{ return Verify(*this); }]; @@ -3189,7 +3236,15 @@ def TFL_ResizeNearestNeighborOp : TFL_Op<"resize_nearest_neighbor", [ let hasOptions = 1; } -def TFL_SparseToDenseOp : TFL_Op<"sparse_to_dense", [NoSideEffect]> { +def TFL_SparseToDenseOp : TFL_Op<"sparse_to_dense", [ + NoSideEffect, + PredOpTrait<"sparse_values and dense must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 2>>, + PredOpTrait<"default_value and dense must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 3>>, + TFL_OperandHasRankAtMost<0, 2>, + TFL_OperandHasRankAtMost<1, 1>, + TFL_OperandHasRankAtMost<2, 1>]> { let summary = "Converts a sparse representation into a dense tensor."; let description = [{ @@ 
-3217,21 +3272,24 @@ are checked during execution. let arguments = (ins TFL_I32OrI64Tensor:$sparse_indices, TFL_I32OrI64Tensor:$output_shape, - TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$sparse_values, - TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$default_value + TFL_TensorOf<[I32, I64, I8, QI8, UI8, QUI8, TFL_Quint8, F32]>:$sparse_values, + TFL_TensorOf<[I32, I64, I8, QI8, UI8, QUI8, TFL_Quint8, F32]>:$default_value ); let results = (outs - TFL_TensorOf<[I32, I64, I8, TFL_Uint8, F32]>:$dense + TFL_TensorOf<[I32, I64, I8, QI8, UI8, QUI8, TFL_Quint8, F32]>:$dense ); } -def TFL_StridedSliceOp: TFL_Op<"strided_slice", - [ +def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ NoSideEffect, PredOpTrait<"input and output must have same element type", TFL_TCresVTEtIsSameAsOp<0, 0>>, SameOperandsAndResultsScale, + TFL_OperandHasRankAtMost<0, 5>, + TFL_OperandHasRank<1, 1>, + TFL_OperandHasRank<2, 1>, + TFL_OperandHasRank<3, 1>, TFL_GpuTargetOp ]> { let summary = "StridedSlice Op"; @@ -3241,20 +3299,20 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, I1, TFL_Quint8, TFL_Uint8]>:$input, - TFL_TensorOf<[I32]>:$begin, - TFL_TensorOf<[I32]>:$end, - TFL_TensorOf<[I32]>:$strides, + TFL_TensorOf<[F32, I32, I64, I8, UI8, QI8, QUI8, I1, I16, QI16, TFL_Quint8]>:$input, + TFL_I32Tensor:$begin, + TFL_I32Tensor:$end, + TFL_I32Tensor:$strides, I32Attr:$begin_mask, I32Attr:$end_mask, - I32Attr:$ellipsis_mask, - I32Attr:$new_axis_mask, + Confined]>:$ellipsis_mask, + Confined]>:$new_axis_mask, I32Attr:$shrink_axis_mask ); let results = (outs - TFL_TensorOf<[F32, I32, I64, I8, QI8, QUI8, I1, TFL_Quint8, TFL_Uint8]>:$output + TFL_TensorOf<[F32, I32, I64, I8, UI8, QI8, QUI8, I1, I16, QI16, TFL_Quint8]>:$output ); let hasOptions = 1; @@ -3269,17 +3327,16 @@ def TFL_CastOp : TFL_Op<"cast", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, TFL_Uint8, Complex>]>:$input + TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, UI8, Complex>]>:$input ); - let results = (outs TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, TFL_Uint8, Complex>]>:$output); + let results = (outs TFL_TensorOf<[F32, I1, I32, I64, TFL_Quint8, UI8, Complex>]>:$output); // TFLite's cast op does not utilize CastOptions, instead derives types // from the TfLiteTensors. let hasOptions = 0; } - def TFL_MirrorPadOp: TFL_Op<"mirror_pad", [ NoSideEffect, TFL_OperandHasRank<1, 2>, TFL_GpuTargetOp]> { let summary = "MirrorPad Operator. Pads a tensor with mirrored values."; @@ -3315,24 +3372,25 @@ def TFL_MirrorPadOp: TFL_Op<"mirror_pad", [ let hasOptions = 1; } -def TFL_UniqueOp: TFL_Op<"unique", [NoSideEffect]> { +def TFL_UniqueOp: TFL_Op<"unique", [ + TFL_OperandHasRank<0, 1>, + NoSideEffect]> { let summary = "Unique Op."; let description = [{ - This operation returns a tensor `y` containing all of the unique elements of `x` -sorted in the same order that they occur in `x`. This operation also returns a -tensor `idx` the same size as `x` that contains the index of each value of `x` -in the unique output `y`. In other words: + This operation returns a tensor `output` containing all of the unique elements +of `input` sorted in the same order that they occur in `input`. This operation +also returns a tensor `idx` the same size as `x` that contains the index of each +value of `input` in the unique output `output`. In other words: }]; let arguments = (ins - // TODO: add uint8 support after quantize support. 
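
For readers unfamiliar with the output/idx contract restated in the unique op
description above, a quick illustration using the TensorFlow op (assuming the
TFLite kernel follows the same behavior):

import tensorflow as tf

x = tf.constant([1, 1, 2, 4, 4, 4, 7, 8, 8])
output, idx = tf.unique(x)
print(output.numpy())  # expected: [1 2 4 7 8]
print(idx.numpy())     # expected: [0 0 1 2 2 2 3 4 4]
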
- TFL_TensorOf<[I8, I16, I32, I64, F32]>:$input + TFL_TensorOf<[I8, QI8, UI8, QUI8, I16, QI16, I32, I64, F32]>:$input ); let results = (outs - TFL_TensorOf<[I8, I16, I32, I64, F32]>:$output, - TFL_TensorOf<[I32, I64]>:$idx + TFL_TensorOf<[I8, QI8, UI8, QUI8, I16, QI16, I32, I64, F32]>:$output, + TFL_I32OrI64Tensor:$idx ); DerivedTFLiteTypeAttr idx_out_type = DerivedTFLiteTypeAttr<[{ @@ -3432,7 +3490,7 @@ def TFL_SparseQConstOp : Op:$output); let builders = [OpBuilder< "OpBuilder &, OperationState &state, TypeAttr qtype, " @@ -4076,7 +4134,7 @@ def TFL_NumericVerifyOp : Op>, + TFL_TCresVTEtIsSameAsOp<0, 0>>, TFL_StatefulOp]> { let summary = "Single value decomposition filter operator"; @@ -4150,8 +4208,8 @@ def TFL_YieldOp : Op { } def TFL_WhileOp : Op, - SingleBlockImplicitTerminator<"YieldOp">]> { + DeclareOpInterfaceMethods, + SingleBlockImplicitTerminator<"YieldOp">]> { let summary = [{While loop}]; let description = [{ diff --git a/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc b/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc index 307a45639c5..5cb659fa318 100644 --- a/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc +++ b/tensorflow/compiler/mlir/lite/transforms/load_quantization_recipe.cc @@ -184,7 +184,7 @@ void LoadQuantizationRecipe::LoadForLSTMOp(LSTMOp lstm, OpBuilder* builder) { auto new_cell_tanh = builder->create(loc, int16, new_cell); auto hidden_state = builder->create( - loc, int16, new_cell_tanh.y(), output_gate->getResult(0), none_af); + loc, int16, new_cell_tanh.output(), output_gate->getResult(0), none_af); auto act = builder->create( loc, int8, hidden_state.output(), lstm.projection_weights(), lstm.projection_bias(), none_af, fc_format, keep_dims); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index c1c646c6dff..30ae4b81f4f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -663,8 +663,8 @@ struct ConvertTrivialTransposeOpToReshapeOp LogicalResult matchAndRewrite(TFL::TransposeOp transpose_op, PatternRewriter &rewriter) const override { - auto input_type = transpose_op.x().getType().cast(); - auto output_type = transpose_op.y().getType().cast(); + auto input_type = transpose_op.input().getType().cast(); + auto output_type = transpose_op.output().getType().cast(); // It's possible to know if the transformation is safe only if the input // & output shapes are fully known and permutation is a constant. 
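
As background for the pattern above, a small numpy sketch (not part of the
patch): a transpose whose permutation only moves size-1 dimensions never
reorders elements in memory, so it can be replaced by a reshape once the
shapes are statically known and the permutation is a constant, which is
exactly what the check above guards.

import numpy as np

x = np.arange(6).reshape(1, 3, 1, 2)    # shape (1, 3, 1, 2)
perm = (2, 1, 0, 3)                     # only the two unit dimensions trade places
transposed = np.transpose(x, perm)      # still shape (1, 3, 1, 2)
reshaped = x.reshape(transposed.shape)  # same elements, no transpose kernel needed
assert np.array_equal(transposed, reshaped)
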
if (!input_type.hasStaticShape() || !output_type.hasStaticShape()) @@ -713,7 +713,8 @@ struct ConvertTrivialTransposeOpToReshapeOp auto new_shape = rewriter.create(loc, new_shape_attr); rewriter.replaceOpWithNewOp( - transpose_op, transpose_op.y().getType(), transpose_op.x(), new_shape); + transpose_op, transpose_op.output().getType(), transpose_op.input(), + new_shape); return success(); } From e9781e9b1656f28a2eb5ae3290cd35daeae5c3ca Mon Sep 17 00:00:00 2001 From: Thai Nguyen Date: Tue, 2 Jun 2020 00:45:48 -0700 Subject: [PATCH 1530/1533] Support TFLite in the tool to print selective registration header PiperOrigin-RevId: 314282031 Change-Id: Ie71b434c177d03e246a5cfde3d067ac695b71299 --- tensorflow/lite/kernels/test_util.h | 3 + tensorflow/lite/testdata/softplus_flex.bin | Bin 0 -> 576 bytes tensorflow/lite/tools/BUILD | 55 +++++ tensorflow/lite/tools/gen_op_registration.cc | 3 +- .../lite/tools/gen_op_registration_main.cc | 5 + tensorflow/lite/tools/list_flex_ops.cc | 128 +++++++++++ tensorflow/lite/tools/list_flex_ops.h | 55 +++++ tensorflow/lite/tools/list_flex_ops_main.cc | 50 +++++ tensorflow/lite/tools/list_flex_ops_test.cc | 203 ++++++++++++++++++ .../print_selective_registration_header.py | 10 +- ...rint_selective_registration_header_test.py | 59 +++++ .../selective_registration_header_lib.py | 76 ++++--- 12 files changed, 618 insertions(+), 29 deletions(-) create mode 100644 tensorflow/lite/testdata/softplus_flex.bin create mode 100644 tensorflow/lite/tools/list_flex_ops.cc create mode 100644 tensorflow/lite/tools/list_flex_ops.h create mode 100644 tensorflow/lite/tools/list_flex_ops_main.cc create mode 100644 tensorflow/lite/tools/list_flex_ops_test.cc diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h index ac1ad5d9025..d308dfee469 100644 --- a/tensorflow/lite/kernels/test_util.h +++ b/tensorflow/lite/kernels/test_util.h @@ -486,6 +486,9 @@ class SingleOpModel { return std::vector(v, v + tensor_size); } + // Return the TFLite model buffer, only available after BuildInterpreter. 
+ const uint8_t* GetModelBuffer() { return builder_.GetBufferPointer(); } + std::vector GetTensorShape(int index) { std::vector result; TfLiteTensor* t = interpreter_->tensor(index); diff --git a/tensorflow/lite/testdata/softplus_flex.bin b/tensorflow/lite/testdata/softplus_flex.bin new file mode 100644 index 0000000000000000000000000000000000000000..e4715bf60114c1c14697aef2c9a21ea31ee022ad GIT binary patch literal 576 zcmZ9JK}y3=5QZl;wn1tQF$<9_f`~*B4edsPyFx*zP_(X#*e5(_lF}x{BX|f;<9T`j z7lru$iH*>NFY{)Y`DfW-X!SF^F|CS>#EkV>MZ+#ZspUqD|zO_*Ok^*xly{fyd#)liyDh z)MaXVgGdJx|Ign+-Boceq!XNR{_wbE}Y>W^Nc@TzrSjGabk zbTL`?(sj)M&AyAG@S(w3*(><7wIqDrGy5~AIyXz7cjcC+>TmO|Jir~O<%4~olX;%O N?&@sN+eAIJe*u20IlBM= literal 0 HcmV?d00001 diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD index 6ae5c1dda18..c34453e0809 100644 --- a/tensorflow/lite/tools/BUILD +++ b/tensorflow/lite/tools/BUILD @@ -134,6 +134,7 @@ cc_binary( deps = [ ":command_line_flags", ":gen_op_registration", + "//tensorflow/lite:util", "@com_google_absl//absl/strings", ], ) @@ -252,6 +253,60 @@ cc_test( ], ) +cc_library( + name = "list_flex_ops", + srcs = ["list_flex_ops.cc"], + hdrs = ["list_flex_ops.h"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:tensorflow", + "//tensorflow/lite:framework", + "//tensorflow/lite:util", + "@com_google_absl//absl/strings", + "@flatbuffers", + ], +) + +# This tool list flex ops and kernels inside a TFLite file. +# It is used to generate header file for selective registration. +cc_binary( + name = "list_flex_ops_main", + srcs = ["list_flex_ops_main.cc"], + visibility = ["//visibility:public"], + deps = [ + ":list_flex_ops", + "//tensorflow/lite/tools:command_line_flags", + "@com_google_absl//absl/strings", + ], +) + +cc_test( + name = "list_flex_ops_test", + srcs = ["list_flex_ops_test.cc"], + data = [ + "//tensorflow/lite:testdata/0_subgraphs.bin", + "//tensorflow/lite:testdata/multi_add_flex.bin", + "//tensorflow/lite:testdata/softplus_flex.bin", + "//tensorflow/lite:testdata/test_model.bin", + "//tensorflow/lite:testdata/test_model_broken.bin", + ], + tags = [ + "no_oss", # Currently requires --config=monolithic, b/118895218. + "tflite_not_portable_android", + "tflite_not_portable_ios", + ], + deps = [ + ":list_flex_ops", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:protobuf", + "//tensorflow/lite/kernels:test_util", + "@com_google_googletest//:gtest", + "@flatbuffers", + ], +) + py_binary( name = "zip_files", srcs = ["zip_files.py"], diff --git a/tensorflow/lite/tools/gen_op_registration.cc b/tensorflow/lite/tools/gen_op_registration.cc index be08b6e0d31..2a95df799e8 100644 --- a/tensorflow/lite/tools/gen_op_registration.cc +++ b/tensorflow/lite/tools/gen_op_registration.cc @@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/lite/tools/gen_op_registration.h" + #include #include #include "re2/re2.h" #include "tensorflow/lite/model.h" -#include "tensorflow/lite/tools/gen_op_registration.h" namespace tflite { diff --git a/tensorflow/lite/tools/gen_op_registration_main.cc b/tensorflow/lite/tools/gen_op_registration_main.cc index e4398663580..c959945b334 100644 --- a/tensorflow/lite/tools/gen_op_registration_main.cc +++ b/tensorflow/lite/tools/gen_op_registration_main.cc @@ -23,6 +23,7 @@ limitations under the License. #include "absl/strings/strip.h" #include "tensorflow/lite/tools/command_line_flags.h" #include "tensorflow/lite/tools/gen_op_registration.h" +#include "tensorflow/lite/util.h" const char kInputModelFlag[] = "input_models"; const char kNamespace[] = "namespace"; @@ -84,6 +85,8 @@ void GenerateFileContent(const std::string& tflite_path, fout << "namespace custom {\n"; fout << "// Forward-declarations for the custom ops.\n"; for (const auto& op : custom_ops) { + // Skips Tensorflow ops, only TFLite custom ops can be registered here. + if (tflite::IsFlexOp(op.first.c_str())) continue; fout << "TfLiteRegistration* Register_" << ::tflite::NormalizeCustomOpName(op.first) << "();\n"; } @@ -115,6 +118,8 @@ void GenerateFileContent(const std::string& tflite_path, fout << ");\n"; } for (const auto& op : custom_ops) { + // Skips Tensorflow ops, only TFLite custom ops can be registered here. + if (tflite::IsFlexOp(op.first.c_str())) continue; fout << " resolver->AddCustom(\"" << op.first << "\", ::tflite::ops::custom::Register_" << ::tflite::NormalizeCustomOpName(op.first) << "()"; diff --git a/tensorflow/lite/tools/list_flex_ops.cc b/tensorflow/lite/tools/list_flex_ops.cc new file mode 100644 index 00000000000..9c80afbc90e --- /dev/null +++ b/tensorflow/lite/tools/list_flex_ops.cc @@ -0,0 +1,128 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
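
A small Python sketch of the naming convention the new tooling relies on (the
`IsFlexOp` check used above and the "Flex" prefix stripping in list_flex_ops.cc
both revolve around it): select-TF ops are stored in a TFLite model as custom
ops whose custom code is the TensorFlow op name behind a "Flex" prefix.

FLEX_PREFIX = 'Flex'

def is_flex_op(custom_code):
  return custom_code.startswith(FLEX_PREFIX)

def tf_op_name(custom_code):
  # Strip the "Flex" prefix to recover the TensorFlow op name.
  return custom_code[len(FLEX_PREFIX):]

print(is_flex_op('FlexAdd'), tf_op_name('FlexAdd'))  # expected: True Add
print(is_flex_op('MyCustomOp'))                      # expected: False
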
+==============================================================================*/ +#include "tensorflow/lite/tools/list_flex_ops.h" + +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/lite/util.h" + +namespace tflite { +namespace flex { + +std::string OpListToJSONString(const OpKernelSet& flex_ops) { + return absl::StrCat("[", + absl::StrJoin(flex_ops, ",\n", + [](std::string* out, const OpKernel& op) { + absl::StrAppend(out, "[\"", op.op_name, + "\", \"", op.kernel_name, + "\"]"); + }), + "]"); +} + +// Find the class name of the op kernel described in the node_def from the pool +// of registered ops. If no kernel class is found, return an empty string. +string FindTensorflowKernelClass(tensorflow::NodeDef* node_def) { + if (!node_def || node_def->op().empty()) { + LOG(FATAL) << "Invalid NodeDef"; + } + + const tensorflow::OpRegistrationData* op_reg_data; + auto status = + tensorflow::OpRegistry::Global()->LookUp(node_def->op(), &op_reg_data); + if (!status.ok()) { + LOG(FATAL) << "Op " << node_def->op() << " not found: " << status; + } + AddDefaultsToNodeDef(op_reg_data->op_def, node_def); + + tensorflow::DeviceNameUtils::ParsedName parsed_name; + if (!tensorflow::DeviceNameUtils::ParseFullName(node_def->device(), + &parsed_name)) { + LOG(FATAL) << "Failed to parse device from node_def: " + << node_def->ShortDebugString(); + } + string class_name; + if (!tensorflow::FindKernelDef( + tensorflow::DeviceType(parsed_name.type.c_str()), *node_def, + nullptr /* kernel_def */, &class_name) + .ok()) { + LOG(FATAL) << "Failed to find kernel class for op: " << node_def->op(); + } + return class_name; +} + +void AddFlexOpsFromModel(const tflite::Model* model, OpKernelSet* flex_ops) { + // Read flex ops. + auto* subgraphs = model->subgraphs(); + if (!subgraphs) return; + for (int subgraph_index = 0; subgraph_index < subgraphs->size(); + ++subgraph_index) { + const tflite::SubGraph* subgraph = subgraphs->Get(subgraph_index); + auto* operators = subgraph->operators(); + auto* opcodes = model->operator_codes(); + if (!operators || !opcodes) continue; + for (int i = 0; i < operators->size(); ++i) { + const tflite::Operator* op = operators->Get(i); + const tflite::OperatorCode* opcode = opcodes->Get(op->opcode_index()); + if (opcode->builtin_code() != tflite::BuiltinOperator_CUSTOM || + !tflite::IsFlexOp(opcode->custom_code()->c_str())) { + continue; + } + + // Remove the "Flex" prefix from op name. + std::string flex_op_name(opcode->custom_code()->c_str()); + std::string tf_op_name = + flex_op_name.substr(strlen(tflite::kFlexCustomCodePrefix)); + + // Read NodeDef and find the op kernel class. + if (op->custom_options_format() != + tflite::CustomOptionsFormat_FLEXBUFFERS) { + LOG(FATAL) << "Invalid CustomOptionsFormat"; + } + const flatbuffers::Vector* custom_opt_bytes = + op->custom_options(); + if (custom_opt_bytes && custom_opt_bytes->size()) { + // NOLINTNEXTLINE: It is common to use references with flatbuffer. 
+ const flexbuffers::Vector& v = + flexbuffers::GetRoot(custom_opt_bytes->data(), + custom_opt_bytes->size()) + .AsVector(); + std::string nodedef_str = v[1].AsString().str(); + tensorflow::NodeDef nodedef; + if (nodedef_str.empty() || !nodedef.ParseFromString(nodedef_str)) { + LOG(FATAL) << "Failed to parse data into a valid NodeDef"; + } + // Flex delegate only supports running flex ops with CPU. + *nodedef.mutable_device() = "/CPU:0"; + std::string kernel_class = FindTensorflowKernelClass(&nodedef); + flex_ops->insert({tf_op_name, kernel_class}); + } + } + } +} +} // namespace flex +} // namespace tflite diff --git a/tensorflow/lite/tools/list_flex_ops.h b/tensorflow/lite/tools/list_flex_ops.h new file mode 100644 index 00000000000..070da2d9b3d --- /dev/null +++ b/tensorflow/lite/tools/list_flex_ops.h @@ -0,0 +1,55 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_TOOLS_LIST_FLEX_OPS_H_ +#define TENSORFLOW_LITE_TOOLS_LIST_FLEX_OPS_H_ + +#include +#include + +#include "tensorflow/lite/model.h" + +namespace tflite { +namespace flex { + +// Store the Op and Kernel name of an op as the key of a set or map. +struct OpKernel { + std::string op_name; + std::string kernel_name; +}; + +// The comparison function for OpKernel. +struct OpKernelCompare { + bool operator()(const OpKernel& lhs, const OpKernel& rhs) const { + if (lhs.op_name == rhs.op_name) { + return lhs.kernel_name < rhs.kernel_name; + } + return lhs.op_name < rhs.op_name; + } +}; + +using OpKernelSet = std::set; + +// Find flex ops and its kernel classes inside a TFLite model and add them to +// the map flex_ops. The map stores +void AddFlexOpsFromModel(const tflite::Model* model, OpKernelSet* flex_ops); + +// Serialize the list op of to a json string. If flex_ops is empty, return an +// empty string. +std::string OpListToJSONString(const OpKernelSet& flex_ops); + +} // namespace flex +} // namespace tflite + +#endif // TENSORFLOW_LITE_TOOLS_LIST_FLEX_OPS_H_ diff --git a/tensorflow/lite/tools/list_flex_ops_main.cc b/tensorflow/lite/tools/list_flex_ops_main.cc new file mode 100644 index 00000000000..e20aed43a66 --- /dev/null +++ b/tensorflow/lite/tools/list_flex_ops_main.cc @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
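
A rough sketch of the JSON ops list that `OpListToJSONString` produces and that
the command-line tool below prints: a list of [op_name, kernel_class] pairs
(the kernel class strings here are illustrative; they are whatever
`FindTensorflowKernelClass` resolves for the node). The consuming side is plain
JSON parsing:

import json

ops_list_str = ('[["Add", "BinaryOp<CPUDevice, functor::add<float>>"],\n'
                '["Softplus", "SoftplusOp"]]')

ops = set()
for op, kernel in json.loads(ops_list_str):
  ops.add((op, kernel if kernel else None))
print(sorted(ops))
# expected: [('Add', 'BinaryOp<CPUDevice, functor::add<float>>'),
#            ('Softplus', 'SoftplusOp')]

This mirrors the parsing done later by `_get_ops_from_ops_list` in
selective_registration_header_lib.py, reached via --proto_fileformat=ops_list.
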
+==============================================================================*/ +#include +#include +#include + +#include "absl/strings/str_split.h" +#include "tensorflow/lite/tools/command_line_flags.h" +#include "tensorflow/lite/tools/list_flex_ops.h" + +const char kInputModelsFlag[] = "graphs"; + +int main(int argc, char** argv) { + std::string input_models; + std::vector flag_list = { + tflite::Flag::CreateFlag(kInputModelsFlag, &input_models, + "path to the tflite models, separated by comma.", + tflite::Flag::kRequired), + }; + tflite::Flags::Parse(&argc, const_cast(argv), flag_list); + + std::vector models = absl::StrSplit(input_models, ','); + tflite::flex::OpKernelSet flex_ops; + for (const std::string& model_file : models) { + std::ifstream fin; + fin.exceptions(std::ifstream::failbit | std::ifstream::badbit); + fin.open(model_file); + std::stringstream content; + content << fin.rdbuf(); + + // Need to store content data first, otherwise, it won't work in bazel. + std::string content_str = content.str(); + const ::tflite::Model* model = ::tflite::GetModel(content_str.data()); + tflite::flex::AddFlexOpsFromModel(model, &flex_ops); + } + std::cout << tflite::flex::OpListToJSONString(flex_ops); + return 0; +} diff --git a/tensorflow/lite/tools/list_flex_ops_test.cc b/tensorflow/lite/tools/list_flex_ops_test.cc new file mode 100644 index 00000000000..67ddc06325a --- /dev/null +++ b/tensorflow/lite/tools/list_flex_ops_test.cc @@ -0,0 +1,203 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/tools/list_flex_ops.h" + +#include + +#include +#include +#include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/lite/kernels/test_util.h" + +namespace tflite { +namespace flex { + +class FlexOpsListTest : public ::testing::Test { + protected: + FlexOpsListTest() {} + + void ReadOps(const string& model_path) { + auto model = FlatBufferModel::BuildFromFile(model_path.data()); + AddFlexOpsFromModel(model->GetModel(), &flex_ops_); + output_text_ = OpListToJSONString(flex_ops_); + } + + void ReadOps(const tflite::Model* model) { + AddFlexOpsFromModel(model, &flex_ops_); + output_text_ = OpListToJSONString(flex_ops_); + } + + std::string output_text_; + OpKernelSet flex_ops_; +}; + +TfLiteRegistration* Register_TEST() { + static TfLiteRegistration r = {nullptr, nullptr, nullptr, nullptr}; + return &r; +} + +std::vector CreateFlexCustomOptions(std::string nodedef_raw_string) { + tensorflow::NodeDef node_def; + tensorflow::protobuf::TextFormat::ParseFromString(nodedef_raw_string, + &node_def); + std::string node_def_str = node_def.SerializeAsString(); + auto flex_builder = std::make_unique(); + flex_builder->Vector([&]() { + flex_builder->String(node_def.op()); + flex_builder->String(node_def_str); + }); + flex_builder->Finish(); + return flex_builder->GetBuffer(); +} + +class FlexOpModel : public SingleOpModel { + public: + FlexOpModel(const std::string& op_name, const TensorData& input1, + const TensorData& input2, const TensorType& output, + const std::vector& custom_options) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + SetCustomOp(op_name, custom_options, Register_TEST); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + + protected: + int input1_; + int input2_; + int output_; +}; + +TEST_F(FlexOpsListTest, TestModelsNoFlex) { + ReadOps("third_party/tensorflow/lite/testdata/test_model.bin"); + EXPECT_EQ(output_text_, "[]"); +} + +TEST_F(FlexOpsListTest, TestBrokenModel) { + EXPECT_DEATH_IF_SUPPORTED( + ReadOps("third_party/tensorflow/lite/testdata/test_model_broken.bin"), + ""); +} + +TEST_F(FlexOpsListTest, TestZeroSubgraphs) { + ReadOps("third_party/tensorflow/lite/testdata/0_subgraphs.bin"); + EXPECT_EQ(output_text_, "[]"); +} + +TEST_F(FlexOpsListTest, TestFlexAdd) { + ReadOps("third_party/tensorflow/lite/testdata/multi_add_flex.bin"); + EXPECT_EQ(output_text_, + "[[\"Add\", \"BinaryOp>\"]]"); +} + +TEST_F(FlexOpsListTest, TestTwoModel) { + ReadOps("third_party/tensorflow/lite/testdata/multi_add_flex.bin"); + ReadOps("third_party/tensorflow/lite/testdata/softplus_flex.bin"); + EXPECT_EQ(output_text_, + "[[\"Add\", \"BinaryOp>\"],\n[\"Softplus\", \"SoftplusOp\"]]"); +} + +TEST_F(FlexOpsListTest, TestDuplicatedOp) { + ReadOps("third_party/tensorflow/lite/testdata/multi_add_flex.bin"); + ReadOps("third_party/tensorflow/lite/testdata/multi_add_flex.bin"); + EXPECT_EQ(output_text_, + "[[\"Add\", \"BinaryOp>\"]]"); +} + +TEST_F(FlexOpsListTest, TestInvalidCustomOptions) { + // Using a invalid custom options, expected to fail. 
+ std::vector random_custom_options(20); + FlexOpModel max_model("FlexAdd", {TensorType_FLOAT32, {3, 1, 2, 2}}, + {TensorType_FLOAT32, {3, 1, 2, 1}}, TensorType_FLOAT32, + random_custom_options); + EXPECT_DEATH_IF_SUPPORTED( + ReadOps(tflite::GetModel(max_model.GetModelBuffer())), + "Failed to parse data into a valid NodeDef"); +} + +TEST_F(FlexOpsListTest, TestOpNameEmpty) { + // NodeDef with empty opname. + std::string nodedef_raw_str = + "name: \"node_1\"" + "op: \"\"" + "input: [ \"b\", \"c\" ]" + "attr: { key: \"T\" value: { type: DT_FLOAT } }"; + std::string random_fieldname = "random string"; + FlexOpModel max_model("FlexAdd", {TensorType_FLOAT32, {3, 1, 2, 2}}, + {TensorType_FLOAT32, {3, 1, 2, 1}}, TensorType_FLOAT32, + CreateFlexCustomOptions(nodedef_raw_str)); + EXPECT_DEATH_IF_SUPPORTED( + ReadOps(tflite::GetModel(max_model.GetModelBuffer())), "Invalid NodeDef"); +} + +TEST_F(FlexOpsListTest, TestOpNotFound) { + // NodeDef with invalid opname. + std::string nodedef_raw_str = + "name: \"node_1\"" + "op: \"FlexInvalidOp\"" + "input: [ \"b\", \"c\" ]" + "attr: { key: \"T\" value: { type: DT_FLOAT } }"; + + FlexOpModel max_model("FlexAdd", {TensorType_FLOAT32, {3, 1, 2, 2}}, + {TensorType_FLOAT32, {3, 1, 2, 1}}, TensorType_FLOAT32, + CreateFlexCustomOptions(nodedef_raw_str)); + EXPECT_DEATH_IF_SUPPORTED( + ReadOps(tflite::GetModel(max_model.GetModelBuffer())), + "Op FlexInvalidOp not found"); +} + +TEST_F(FlexOpsListTest, TestKernelNotFound) { + // NodeDef with non-supported type. + std::string nodedef_raw_str = + "name: \"node_1\"" + "op: \"Add\"" + "input: [ \"b\", \"c\" ]" + "attr: { key: \"T\" value: { type: DT_BOOL } }"; + + FlexOpModel max_model("FlexAdd", {TensorType_FLOAT32, {3, 1, 2, 2}}, + {TensorType_FLOAT32, {3, 1, 2, 1}}, TensorType_FLOAT32, + CreateFlexCustomOptions(nodedef_raw_str)); + EXPECT_DEATH_IF_SUPPORTED( + ReadOps(tflite::GetModel(max_model.GetModelBuffer())), + "Failed to find kernel class for op: Add"); +} + +TEST_F(FlexOpsListTest, TestFlexAddWithSingleOpModel) { + std::string nodedef_raw_str = + "name: \"node_1\"" + "op: \"Add\"" + "input: [ \"b\", \"c\" ]" + "attr: { key: \"T\" value: { type: DT_FLOAT } }"; + + FlexOpModel max_model("FlexAdd", {TensorType_FLOAT32, {3, 1, 2, 2}}, + {TensorType_FLOAT32, {3, 1, 2, 1}}, TensorType_FLOAT32, + CreateFlexCustomOptions(nodedef_raw_str)); + ReadOps(tflite::GetModel(max_model.GetModelBuffer())); + EXPECT_EQ(output_text_, + "[[\"Add\", \"BinaryOp>\"]]"); +} +} // namespace flex +} // namespace tflite + +int main(int argc, char** argv) { + // On Linux, add: FLAGS_logtostderr = true; + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/python/tools/print_selective_registration_header.py b/tensorflow/python/tools/print_selective_registration_header.py index 0bb56878f48..6d35749ee01 100644 --- a/tensorflow/python/tools/print_selective_registration_header.py +++ b/tensorflow/python/tools/print_selective_registration_header.py @@ -46,8 +46,10 @@ FLAGS = None def main(unused_argv): graphs = FLAGS.graphs.split(',') - print(selective_registration_header_lib.get_header( - graphs, FLAGS.proto_fileformat, FLAGS.default_ops)) + print( + selective_registration_header_lib.get_header(graphs, + FLAGS.proto_fileformat, + FLAGS.default_ops)) if __name__ == '__main__': @@ -63,7 +65,9 @@ if __name__ == '__main__': '--proto_fileformat', type=str, default='rawproto', - help='Format of proto file, either textproto or rawproto.') + help='Format of proto file, either textproto, rawproto or ops_list. 
The ' + 'ops_list is the file contains the list of ops in JSON format. Ex: ' + '"[["Add", "BinaryOp>"]]".') parser.add_argument( '--default_ops', type=str, diff --git a/tensorflow/python/tools/print_selective_registration_header_test.py b/tensorflow/python/tools/print_selective_registration_header_test.py index d03084a2cfe..b06f6123ddd 100644 --- a/tensorflow/python/tools/print_selective_registration_header_test.py +++ b/tensorflow/python/tools/print_selective_registration_header_test.py @@ -93,6 +93,12 @@ class PrintOpFilegroupTest(test.TestCase): fnames.append(fname) return fnames + def WriteTextFile(self, content): + fname = os.path.join(self.get_temp_dir(), 'text.txt') + with gfile.GFile(fname, 'w') as f: + f.write(content) + return [fname] + def testGetOps(self): default_ops = 'NoOp:NoOp,_Recv:RecvOp,_Send:SendOp' graphs = [ @@ -136,6 +142,59 @@ class PrintOpFilegroupTest(test.TestCase): ], ops_and_kernels) + def testGetOpsFromList(self): + default_ops = '' + # Test with 2 different ops. + ops_list = """[["Add", "BinaryOp>"], + ["Softplus", "SoftplusOp"]]""" + ops_and_kernels = selective_registration_header_lib.get_ops_and_kernels( + 'ops_list', self.WriteTextFile(ops_list), default_ops) + self.assertListEqual([ + ('Add', 'BinaryOp>'), + ('Softplus', 'SoftplusOp'), + ], ops_and_kernels) + + # Test with a single op. + ops_list = '[["Softplus", "SoftplusOp"]]' + ops_and_kernels = selective_registration_header_lib.get_ops_and_kernels( + 'ops_list', self.WriteTextFile(ops_list), default_ops) + self.assertListEqual([ + ('Softplus', 'SoftplusOp'), + ], ops_and_kernels) + + # Test with duplicated op. + ops_list = """[["Add", "BinaryOp>"], + ["Add", "BinaryOp>"]]""" + ops_and_kernels = selective_registration_header_lib.get_ops_and_kernels( + 'ops_list', self.WriteTextFile(ops_list), default_ops) + self.assertListEqual([ + ('Add', 'BinaryOp>'), + ], ops_and_kernels) + + # Test op with no kernel. + ops_list = '[["Softplus", ""]]' + ops_and_kernels = selective_registration_header_lib.get_ops_and_kernels( + 'ops_list', self.WriteTextFile(ops_list), default_ops) + self.assertListEqual([ + ('Softplus', None), + ], ops_and_kernels) + + # Test two ops_list files. + ops_list = '[["Softplus", "SoftplusOp"]]' + ops_and_kernels = selective_registration_header_lib.get_ops_and_kernels( + 'ops_list', + self.WriteTextFile(ops_list) + self.WriteTextFile(ops_list), + default_ops) + self.assertListEqual([ + ('Softplus', 'SoftplusOp'), + ], ops_and_kernels) + + # Test empty file. + ops_list = '' + with self.assertRaises(Exception): + ops_and_kernels = selective_registration_header_lib.get_ops_and_kernels( + 'ops_list', self.WriteTextFile(ops_list), default_ops) + def testAll(self): default_ops = 'all' graphs = [ diff --git a/tensorflow/python/tools/selective_registration_header_lib.py b/tensorflow/python/tools/selective_registration_header_lib.py index da34da594e3..25f538b4f67 100644 --- a/tensorflow/python/tools/selective_registration_header_lib.py +++ b/tensorflow/python/tools/selective_registration_header_lib.py @@ -22,11 +22,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import json import os import sys from google.protobuf import text_format - from tensorflow.core.framework import graph_pb2 from tensorflow.python import _pywrap_kernel_registry from tensorflow.python.platform import gfile @@ -41,6 +41,39 @@ OPS_WITHOUT_KERNEL_WHITELIST = frozenset([ # core/common_runtime/accumulate_n_optimizer.cc. 
'AccumulateNV2' ]) +FLEX_PREFIX = b'Flex' +FLEX_PREFIX_LENGTH = len(FLEX_PREFIX) + + +def _get_ops_from_ops_list(input_file): + """Gets the ops and kernels needed from the ops list file.""" + ops = set() + ops_list_str = gfile.GFile(input_file, 'r').read() + if not ops_list_str: + raise Exception('Input file should not be empty') + ops_list = json.loads(ops_list_str) + for op, kernel in ops_list: + op_and_kernel = (op, kernel if kernel else None) + ops.add(op_and_kernel) + return ops + + +def _get_ops_from_graphdef(graph_def): + """Gets the ops and kernels needed from the tensorflow model.""" + ops = set() + for node_def in graph_def.node: + if not node_def.device: + node_def.device = '/cpu:0' + kernel_class = _pywrap_kernel_registry.TryFindKernelClass( + node_def.SerializeToString()) + op = str(node_def.op) + if kernel_class or op in OPS_WITHOUT_KERNEL_WHITELIST: + op_and_kernel = (op, str(kernel_class.decode('utf-8')) + if kernel_class else None) + ops.add(op_and_kernel) + else: + print('Warning: no kernel found for op %s' % node_def.op, file=sys.stderr) + return ops def get_ops_and_kernels(proto_fileformat, proto_files, default_ops_str): @@ -49,6 +82,11 @@ def get_ops_and_kernels(proto_fileformat, proto_files, default_ops_str): for proto_file in proto_files: tf_logging.info('Loading proto file %s', proto_file) + # Load ops list file. + if proto_fileformat == 'ops_list': + ops = ops.union(_get_ops_from_ops_list(proto_file)) + continue + # Load GraphDef. file_data = gfile.GFile(proto_file, 'rb').read() if proto_fileformat == 'rawproto': @@ -56,22 +94,7 @@ def get_ops_and_kernels(proto_fileformat, proto_files, default_ops_str): else: assert proto_fileformat == 'textproto' graph_def = text_format.Parse(file_data, graph_pb2.GraphDef()) - - # Find all ops and kernels used by the graph. - for node_def in graph_def.node: - if not node_def.device: - node_def.device = '/cpu:0' - kernel_class = _pywrap_kernel_registry.TryFindKernelClass( - node_def.SerializeToString()) - op = str(node_def.op) - if kernel_class or op in OPS_WITHOUT_KERNEL_WHITELIST: - op_and_kernel = (op, str(kernel_class.decode('utf-8')) - if kernel_class else None) - if op_and_kernel not in ops: - ops.add(op_and_kernel) - else: - print( - 'Warning: no kernel found for op %s' % node_def.op, file=sys.stderr) + ops = ops.union(_get_ops_from_graphdef(graph_def)) # Add default ops. if default_ops_str and default_ops_str != 'all': @@ -91,7 +114,7 @@ def get_header_from_ops_and_kernels(ops_and_kernels, Args: ops_and_kernels: a set of (op_name, kernel_class_name) pairs to include. include_all_ops_and_kernels: if True, ops_and_kernels is ignored and all op - kernels are included. + kernels are included. Returns: the string of the header that should be written as ops_to_register.h. @@ -112,7 +135,7 @@ def get_header_from_ops_and_kernels(ops_and_kernels, append('#define SHOULD_REGISTER_OP_KERNEL(clz) true') append('#define SHOULD_REGISTER_OP_GRADIENT true') else: - line = ''' + line = """ namespace { constexpr const char* skip(const char* x) { return (*x) ? (*x == ' ' ? 
skip(x + 1) : x) : x; @@ -138,10 +161,11 @@ def get_header_from_ops_and_kernels(ops_and_kernels, } }; } // end namespace - ''' + """ line += 'constexpr const char* kNecessaryOpKernelClasses[] = {\n' for _, kernel_class in ops_and_kernels: - if kernel_class is None: continue + if kernel_class is None: + continue line += '"%s",\n' % kernel_class line += '};' append(line) @@ -160,8 +184,8 @@ def get_header_from_ops_and_kernels(ops_and_kernels, append('#define SHOULD_REGISTER_OP(op) ShouldRegisterOp(op)') append('') - append('#define SHOULD_REGISTER_OP_GRADIENT ' + ( - 'true' if 'SymbolicGradient' in ops else 'false')) + append('#define SHOULD_REGISTER_OP_GRADIENT ' + + ('true' if 'SymbolicGradient' in ops else 'false')) append('#endif') return '\n'.join(result_list) @@ -174,11 +198,13 @@ def get_header(graphs, Args: graphs: a list of paths to GraphDef files to include. - proto_fileformat: optional format of proto file, either 'textproto' or - 'rawproto' (default). + proto_fileformat: optional format of proto file, either 'textproto', + 'rawproto' (default) or ops_list. The ops_list is the file contain the + list of ops in JSON format, Ex: "[["Transpose", "TransposeCpuOp"]]". default_ops: optional comma-separated string of operator:kernel pairs to always include implementation for. Pass 'all' to have all operators and kernels included. Default: 'NoOp:NoOp,_Recv:RecvOp,_Send:SendOp'. + Returns: the string of the header that should be written as ops_to_register.h. """ From 958990b4706580bfa25dc57dfe6ed49abee93d70 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Tue, 2 Jun 2020 01:34:05 -0700 Subject: [PATCH 1531/1533] Small changes to SimpleDelegate APIs: 1. change name() to Name(). 2. change Invoke to Eval(). PiperOrigin-RevId: 314287361 Change-Id: I7c5324a1ee7c4ea5f06987da2f55c245e2d6153e --- tensorflow/lite/delegates/flex/delegate.cc | 2 +- tensorflow/lite/delegates/flex/delegate.h | 2 +- tensorflow/lite/delegates/flex/kernel.cc | 2 +- tensorflow/lite/delegates/flex/kernel.h | 2 +- tensorflow/lite/delegates/utils/simple_delegate.cc | 6 +++--- tensorflow/lite/delegates/utils/simple_delegate.h | 6 ++---- tensorflow/lite/delegates/utils/simple_delegate_test.cc | 4 ++-- 7 files changed, 11 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc index 13ce5ff2a22..4741bddc2f5 100644 --- a/tensorflow/lite/delegates/flex/delegate.cc +++ b/tensorflow/lite/delegates/flex/delegate.cc @@ -72,7 +72,7 @@ TfLiteStatus FlexDelegate::Initialize(TfLiteContext* context) { return kTfLiteOk; } -const char* FlexDelegate::name() const { +const char* FlexDelegate::Name() const { static constexpr char kName[] = "TfLiteFlexDelegate"; return kName; } diff --git a/tensorflow/lite/delegates/flex/delegate.h b/tensorflow/lite/delegates/flex/delegate.h index a760d941656..be890a5456d 100644 --- a/tensorflow/lite/delegates/flex/delegate.h +++ b/tensorflow/lite/delegates/flex/delegate.h @@ -69,7 +69,7 @@ class FlexDelegate : public SimpleDelegateInterface { FlexDelegate() {} - const char* name() const override; + const char* Name() const override; bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, const TfLiteNode* node, diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc index b6e809647d5..e7705ecf3ce 100644 --- a/tensorflow/lite/delegates/flex/kernel.cc +++ b/tensorflow/lite/delegates/flex/kernel.cc @@ -485,7 +485,7 @@ TfLiteStatus DelegateKernel::Prepare(TfLiteContext* context, 
TfLiteNode* node) { return kTfLiteOk; } -TfLiteStatus DelegateKernel::Invoke(TfLiteContext* context, TfLiteNode* node) { +TfLiteStatus DelegateKernel::Eval(TfLiteContext* context, TfLiteNode* node) { BufferMap* buffer_map = op_data_->buffer_map; // Insert a tensor in the buffer map for all inputs that are not constant. diff --git a/tensorflow/lite/delegates/flex/kernel.h b/tensorflow/lite/delegates/flex/kernel.h index 27cbfeadc14..9a7b93e31f2 100644 --- a/tensorflow/lite/delegates/flex/kernel.h +++ b/tensorflow/lite/delegates/flex/kernel.h @@ -32,7 +32,7 @@ class DelegateKernel : public SimpleDelegateKernelInterface { TfLiteStatus Init(TfLiteContext* context, const TfLiteDelegateParams* params) override; TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) override; - TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) override; + TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) override; private: std::unique_ptr op_data_; diff --git a/tensorflow/lite/delegates/utils/simple_delegate.cc b/tensorflow/lite/delegates/utils/simple_delegate.cc index 156f6a9679a..f8c0a027cca 100644 --- a/tensorflow/lite/delegates/utils/simple_delegate.cc +++ b/tensorflow/lite/delegates/utils/simple_delegate.cc @@ -31,7 +31,7 @@ TfLiteRegistration GetDelegateKernelRegistration( TfLiteRegistration kernel_registration; kernel_registration.profiling_string = nullptr; kernel_registration.builtin_code = kTfLiteBuiltinDelegate; - kernel_registration.custom_name = delegate->name(); + kernel_registration.custom_name = delegate->Name(); kernel_registration.version = 1; kernel_registration.free = [](TfLiteContext* context, void* buffer) -> void { delete reinterpret_cast(buffer); @@ -68,7 +68,7 @@ TfLiteRegistration GetDelegateKernelRegistration( SimpleDelegateKernelInterface* delegate_kernel = reinterpret_cast(node->user_data); TFLITE_DCHECK(delegate_kernel != nullptr); - return delegate_kernel->Invoke(context, node); + return delegate_kernel->Eval(context, node); }; return kernel_registration; @@ -94,7 +94,7 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, "%s delegate: %d nodes delegated out of %d nodes with " "%d partitions.\n", - delegate->name(), supported_nodes.size(), + delegate->Name(), supported_nodes.size(), helper.num_total_nodes(), helper.num_partitions()); TfLiteRegistration delegate_kernel_registration = GetDelegateKernelRegistration(delegate); diff --git a/tensorflow/lite/delegates/utils/simple_delegate.h b/tensorflow/lite/delegates/utils/simple_delegate.h index 54473e41901..7b6be43047b 100644 --- a/tensorflow/lite/delegates/utils/simple_delegate.h +++ b/tensorflow/lite/delegates/utils/simple_delegate.h @@ -56,8 +56,7 @@ class SimpleDelegateKernelInterface { // Actual subgraph inference should happen on this call. // Returns status, and signalling any errors. - // TODO(b/157882025): change this to Eval to be consistent w/ a TFLite kernel. - virtual TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) = 0; + virtual TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) = 0; }; // Pure Interface that clients should implement. @@ -87,8 +86,7 @@ class SimpleDelegateInterface { // Returns a name that identifies the delegate. // This name is used for debugging/logging/profiling. - // TODO(b/157882025): change this to Name() - virtual const char* name() const = 0; + virtual const char* Name() const = 0; // Returns instance of an object that implements the interface // SimpleDelegateKernelInterface. 
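Taken together, the renames above mean that a delegate built on the SimpleDelegate utilities now overrides Name() and Eval() rather than name() and Invoke(). As a minimal sketch of what that looks like against the renamed interface, assuming only the simple_delegate.h header shown above, and with DummyDelegate / DummyDelegateKernel as hypothetical names (illustrative only, not part of any patch in this series):

// Sketch only: a trivial pass-through delegate written against the renamed
// SimpleDelegate API (Name()/Eval()). DummyDelegate and DummyDelegateKernel
// are hypothetical names, not part of this patch series.
#include <memory>

#include "tensorflow/lite/delegates/utils/simple_delegate.h"

namespace tflite {

class DummyDelegateKernel : public SimpleDelegateKernelInterface {
 public:
  TfLiteStatus Init(TfLiteContext* context,
                    const TfLiteDelegateParams* params) override {
    return kTfLiteOk;  // Nothing to set up for a pass-through kernel.
  }
  TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) override {
    return kTfLiteOk;
  }
  // Renamed from Invoke() by the patch above.
  TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) override {
    return kTfLiteOk;  // A real kernel would run the delegated subgraph here.
  }
};

class DummyDelegate : public SimpleDelegateInterface {
 public:
  bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration,
                                 const TfLiteNode* node,
                                 TfLiteContext* context) const override {
    return true;  // Claim every node; a real delegate would filter here.
  }
  TfLiteStatus Initialize(TfLiteContext* context) override { return kTfLiteOk; }
  // Renamed from name() by the patch above.
  const char* Name() const override { return "DummyDelegate"; }
  std::unique_ptr<SimpleDelegateKernelInterface> CreateDelegateKernelInterface()
      override {
    return std::make_unique<DummyDelegateKernel>();
  }
};

}  // namespace tflite

Assuming the TfLiteDelegateFactory helper declared in the same header is unchanged by this patch, such an object can be wrapped via TfLiteDelegateFactory::CreateSimpleDelegate(std::make_unique<DummyDelegate>()) and handed to Interpreter::ModifyGraphWithDelegate(), exactly as the test file below does with its TestSimpleDelegate.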
diff --git a/tensorflow/lite/delegates/utils/simple_delegate_test.cc b/tensorflow/lite/delegates/utils/simple_delegate_test.cc index 42c0ace6cb7..12a790fff1a 100644 --- a/tensorflow/lite/delegates/utils/simple_delegate_test.cc +++ b/tensorflow/lite/delegates/utils/simple_delegate_test.cc @@ -52,7 +52,7 @@ class TestSimpleDelegateKernel : public SimpleDelegateKernelInterface { return !options_.error_during_prepare ? kTfLiteOk : kTfLiteError; } - TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) override { + TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) override { return !options_.error_during_invoke ? kTfLiteOk : kTfLiteError; } @@ -74,7 +74,7 @@ class TestSimpleDelegate : public SimpleDelegateInterface { TfLiteStatus Initialize(TfLiteContext* context) override { return kTfLiteOk; } - const char* name() const override { + const char* Name() const override { static constexpr char kName[] = "TestSimpleDelegate"; return kName; } From da54a4e6b127a11da6cbf3dbed3592ebb4aad3f9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 Jun 2020 02:02:23 -0700 Subject: [PATCH 1532/1533] Update GraphDef version to 420. PiperOrigin-RevId: 314290238 Change-Id: I5644be9ba2ec826e772efd2c0cf667bd7990c5e4 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 94536ab04ce..c56fac42c1d 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 419 // Updated: 2020/6/1 +#define TF_GRAPH_DEF_VERSION 420 // Updated: 2020/6/2 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 424a3072f983edf787746768eb511ce0877ad8c0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 2 Jun 2020 02:02:28 -0700 Subject: [PATCH 1533/1533] compat: Update forward compatibility horizon to 2020-06-02 PiperOrigin-RevId: 314290251 Change-Id: Ibfe28f9c1d17428c3c5050bc7f2eba9d017f3bfd --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 79375ba327a..2a625496569 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 1) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 6, 2) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None
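Circling back to the selective-registration patch earlier in this series: once print_selective_registration_header.py has emitted ops_to_register.h (now from GraphDefs or, with --proto_fileformat=ops_list and the JSON file passed via --graphs, from an ops list), the header is consumed as ordinary C++. The sketch below is illustrative only and not part of any patch; it assumes a generated ops_to_register.h is on the include path, and in the real build these macros are checked inside TensorFlow's op-registration machinery rather than in hand-written user code.

// Hedged sketch of a consumer of the generated selective-registration header.
// Assumes ops_to_register.h (produced by print_selective_registration_header)
// is on the include path; nothing here is part of the patches above.
#include <iostream>

#include "ops_to_register.h"

int main() {
  // SHOULD_REGISTER_OP(op) expands to the ShouldRegisterOp() lookup emitted
  // into the header (or simply to true when everything is being registered).
  std::cout << "Add kernel kept: "
            << (SHOULD_REGISTER_OP("Add") ? "yes" : "no") << "\n";
  // SHOULD_REGISTER_OP_GRADIENT is emitted as a plain true/false literal,
  // depending on whether SymbolicGradient appeared in the collected ops.
  std::cout << "Gradient ops kept: "
            << (SHOULD_REGISTER_OP_GRADIENT ? "yes" : "no") << "\n";
  return 0;
}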